1 ;*****************************************************************************
2 ;* Copyright (C) 2013 x265 project
4 ;* Authors: Praveen Kumar Tiwari <praveen@multicorewareinc.com>
6 ;* This program is free software; you can redistribute it and/or modify
7 ;* it under the terms of the GNU General Public License as published by
8 ;* the Free Software Foundation; either version 2 of the License, or
9 ;* (at your option) any later version.
11 ;* This program is distributed in the hope that it will be useful,
12 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
13 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 ;* GNU General Public License for more details.
16 ;* You should have received a copy of the GNU General Public License
17 ;* along with this program; if not, write to the Free Software
18 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20 ;* This program is also available under a commercial proprietary license.
21 ;* For more information, contact us at license @ x265.com.
22 ;*****************************************************************************/
25 %include "x86util.asm"
33 ;-----------------------------------------------------------------------------
34 ; void pixel_add_ps_4x4(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
35 ;-----------------------------------------------------------------------------
; NOTE(review): only part of this routine is visible in this excerpt; the
; comments below describe the visible lines only.
; cglobal is not defined in this file; presumably it comes from the include
; chain starting at x86util.asm (x86inc-style operands: name, number of
; arguments, number of GPRs, number of vector registers, argument names) --
; confirm. Under that convention the arguments map in order to
; r0=dest, r1=destride, r2=src0, r3=scr1, r4=srcStride0, r5=srcStride1,
; and this variant requests 6 GPRs and 6 vector registers.
38 cglobal pixel_add_ps_4x4, 6, 6, 6, dest, destride, src0, scr1, srcStride0, srcStride1
; m1 = vector constant pw_pixel_max (defined outside this excerpt).
; Presumably the upper clamp bound for the summed pixels -- TODO confirm
; against the clip step, which is not visible here.
39 mova m1, [pw_pixel_max]
; NOTE(review): this is a second definition of pixel_add_ps_4x4. Presumably
; the two definitions are selected by a conditional-assembly guard that is
; not visible in this excerpt -- confirm. Only part of the routine is shown.
; Same argument list as the other definition (r0=dest, r1=destride, r2=src0,
; r3=scr1, r4=srcStride0, r5=srcStride1 under the x86inc convention), but
; this variant requests 8 vector registers.
68 cglobal pixel_add_ps_4x4, 6, 6, 8, dest, destride, src0, scr1, srcStride0, srcStride1
; m2 = packed bytes at src0 + srcStride0, each zero-extended to a 16-bit word.
71 pmovzxbw m2, [r2 + r4]
; m6 = same address expression, zero-extended bytes -> words.
; NOTE(review): r2 is presumably advanced between these two loads by a line
; not shown here; otherwise m6 would duplicate m2 -- confirm.
77 pmovzxbw m6, [r2 + r4]
100 ;-----------------------------------------------------------------------------
101 ; void pixel_add_ps_4x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
102 ;-----------------------------------------------------------------------------
; PIXEL_ADD_PS_W4_H4: two-parameter macro that emits pixel_add_ps_4x%2.
; Only %2 (used as the height in the symbol name) is referenced in the
; visible lines.
; NOTE(review): this excerpt shows a subset of the macro body. Two cglobal
; definitions with the same name appear; presumably a conditional-assembly
; guard (not visible) selects one of them. The loop control, arithmetic,
; stores and %endmacro are also not visible -- confirm against the full file.
103 %macro PIXEL_ADD_PS_W4_H4 2
; First variant: 6 arguments, 7 GPRs (one more than the arguments; the use of
; r6 is not visible here), 6 vector registers.
; r0=dest, r1=destride, r2=src0, r3=scr1, r4=srcStride0, r5=srcStride1.
106 cglobal pixel_add_ps_4x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1
; m1 = pw_pixel_max constant; passed as the last CLIPW2 operand below.
107 mova m1, [pw_pixel_max]
; Advance both source pointers by two rows (src0 += 2*srcStride0,
; scr1 += 2*srcStride1).
118 lea r2, [r2 + r4 * 2]
119 lea r3, [r3 + r5 * 2]
; Advance both source pointers by a further two rows.
125 lea r2, [r2 + r4 * 2]
126 lea r3, [r3 + r5 * 2]
; CLIPW2 is defined outside this file; presumably it clamps the words in m2
; and m4 to the range [m0, m1] -- TODO confirm. The initialisation of m0 is
; not visible in this excerpt.
130 CLIPW2 m2, m4, m0, m1
; Advance dest by two rows, twice (four rows in total across these two lines).
134 lea r0, [r0 + r1 * 2]
137 lea r0, [r0 + r1 * 2]
; Second variant: same arguments, but 8 vector registers.
143 cglobal pixel_add_ps_4x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1
; m2 = bytes at src0 + srcStride0, zero-extended to words.
148 pmovzxbw m2, [r2 + r4]
; Step src0 and scr1 forward by two rows.
151 lea r2, [r2 + r4 * 2]
152 lea r3, [r3 + r5 * 2]
; m6 = bytes at (advanced) src0 + srcStride0, zero-extended to words.
154 pmovzxbw m6, [r2 + r4]
; Step src0 and scr1 forward by another two rows.
158 lea r2, [r2 + r4 * 2]
159 lea r3, [r3 + r5 * 2]
; Advance dest by two rows, twice.
172 lea r0, [r0 + r1 * 2]
175 lea r0, [r0 + r1 * 2]
; Instantiate the macro with %1 = 4, %2 = 8, i.e. emit pixel_add_ps_4x8.
182 PIXEL_ADD_PS_W4_H4 4, 8
185 ;-----------------------------------------------------------------------------
186 ; void pixel_add_ps_8x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
187 ;-----------------------------------------------------------------------------
; PIXEL_ADD_PS_W8_H4: two-parameter macro that emits pixel_add_ps_8x%2.
; Only %2 (used as the height in the symbol name) is referenced in the
; visible lines.
; NOTE(review): this excerpt shows a subset of the macro body. Two cglobal
; definitions with the same name appear; presumably a conditional-assembly
; guard (not visible) selects one of them. Loop control, arithmetic, stores
; and %endmacro are not visible -- confirm against the full file.
188 %macro PIXEL_ADD_PS_W8_H4 2
; First variant: 6 arguments, 7 GPRs, 6 vector registers.
; r0=dest, r1=destride, r2=src0, r3=scr1, r4=srcStride0, r5=srcStride1.
191 cglobal pixel_add_ps_8x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1
; m5 = pw_pixel_max constant; passed as the last CLIPW2 operand below.
192 mova m5, [pw_pixel_max]
; Advance src0 and scr1 by two rows each.
203 lea r2, [r2 + r4 * 2]
204 lea r3, [r3 + r5 * 2]
; CLIPW2 is defined outside this file; presumably it clamps the words in m0
; and m2 to the range [m4, m5] -- TODO confirm. The initialisation of m4 is
; not visible in this excerpt.
208 CLIPW2 m0, m2, m4, m5
; Advance dest, src0 and scr1 by two rows each.
218 lea r0, [r0 + r1 * 2]
219 lea r2, [r2 + r4 * 2]
220 lea r3, [r3 + r5 * 2]
; Second clip with the same operands as the first.
224 CLIPW2 m0, m2, m4, m5
; Advance dest by two more rows.
228 lea r0, [r0 + r1 * 2]
; Second variant: same arguments, but 8 vector registers.
234 cglobal pixel_add_ps_8x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1
; m2 = bytes at src0 + srcStride0, zero-extended to words.
239 pmovzxbw m2, [r2 + r4]
; Step src0 and scr1 forward by two rows.
242 lea r2, [r2 + r4 * 2]
243 lea r3, [r3 + r5 * 2]
; m6 = bytes at (advanced) src0 + srcStride0, zero-extended to words.
245 pmovzxbw m6, [r2 + r4]
; Step src0 and scr1 forward by another two rows.
249 lea r2, [r2 + r4 * 2]
250 lea r3, [r3 + r5 * 2]
; Advance dest by two rows, twice.
263 lea r0, [r0 + r1 * 2]
266 lea r0, [r0 + r1 * 2]
; Instantiate for %2 = 8 and %2 = 16: emits pixel_add_ps_8x8 and
; pixel_add_ps_8x16.
273 PIXEL_ADD_PS_W8_H4 8, 8
274 PIXEL_ADD_PS_W8_H4 8, 16
277 ;-----------------------------------------------------------------------------
278 ; void pixel_add_ps_16x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
279 ;-----------------------------------------------------------------------------
; PIXEL_ADD_PS_W16_H4: two-parameter macro that emits pixel_add_ps_16x%2.
; Only %2 (used as the height in the symbol name) is referenced in the
; visible lines.
; NOTE(review): this excerpt shows a subset of the macro body. Two cglobal
; definitions with the same name appear; presumably a conditional-assembly
; guard (not visible) selects one of them. Loop control, arithmetic, most
; stores and %endmacro are not visible -- confirm against the full file.
280 %macro PIXEL_ADD_PS_W16_H4 2
; First variant: 6 arguments, 7 GPRs, 6 vector registers.
; r0=dest, r1=destride, r2=src0, r3=scr1, r4=srcStride0, r5=srcStride1.
283 cglobal pixel_add_ps_16x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1
; m5 = pw_pixel_max constant; passed as the last CLIPW2 operand below.
284 mova m5, [pw_pixel_max]
; CLIPW2 is defined outside this file; presumably it clamps the words in m0
; and m2 to the range [m4, m5] -- TODO confirm. The initialisation of m4 is
; not visible in this excerpt.
298 CLIPW2 m0, m2, m4, m5
; Unaligned loads at byte offset 16 of the next row: m2 from src0,
; m3 from scr1.
304 movu m2, [r2 + r4 + 16]
306 movu m3, [r3 + r5 + 16]
; Both next-row loads are done; advance src0 and scr1 by two rows.
307 lea r2, [r2 + r4 * 2]
308 lea r3, [r3 + r5 * 2]
312 CLIPW2 m0, m2, m4, m5
; Store m2 at byte offset 16 of the next dest row.
315 movu [r0 + r1 + 16], m2
; Advance dest by two rows.
321 lea r0, [r0 + r1 * 2]
325 CLIPW2 m0, m2, m4, m5
; Second row pair: same load / advance / clip / store pattern as above.
331 movu m2, [r2 + r4 + 16]
333 movu m3, [r3 + r5 + 16]
335 lea r2, [r2 + r4 * 2]
336 lea r3, [r3 + r5 * 2]
340 CLIPW2 m0, m2, m4, m5
343 movu [r0 + r1 + 16], m2
344 lea r0, [r0 + r1 * 2]
; Second variant: same arguments, but 8 vector registers.
350 cglobal pixel_add_ps_16x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1
; Zero-extend source bytes to words: m1 from src0 + 8 (current row),
; m4 and m5 from offsets 0 and 8 of the next row.
355 pmovzxbw m1, [r2 + 8]
356 pmovzxbw m4, [r2 + r4]
357 pmovzxbw m5, [r2 + r4 + 8]
; m7 = 16 bytes at byte offset 16 of the next scr1 row (unaligned load).
361 movu m7, [r3 + r5 + 16]
; Advance src0 and scr1 by two rows.
362 lea r2, [r2 + r4 * 2]
363 lea r3, [r3 + r5 * 2]
; Second row pair: same loads as above, relative to the advanced pointers.
376 pmovzxbw m1, [r2 + 8]
377 pmovzxbw m4, [r2 + r4]
378 pmovzxbw m5, [r2 + r4 + 8]
382 movu m7, [r3 + r5 + 16]
; Advance dest, src0 and scr1 by two rows each.
384 lea r0, [r0 + r1 * 2]
385 lea r2, [r2 + r4 * 2]
386 lea r3, [r3 + r5 * 2]
; Advance dest by two more rows.
397 lea r0, [r0 + r1 * 2]
; Instantiate for %2 = 16 and %2 = 32: emits pixel_add_ps_16x16 and
; pixel_add_ps_16x32.
404 PIXEL_ADD_PS_W16_H4 16, 16
405 PIXEL_ADD_PS_W16_H4 16, 32
408 ;-----------------------------------------------------------------------------
409 ; void pixel_add_ps_32x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
410 ;-----------------------------------------------------------------------------
; PIXEL_ADD_PS_W32_H2: two-parameter macro that emits pixel_add_ps_32x%2.
; Only %2 (used as the height in the symbol name) is referenced in the
; visible lines.
; NOTE(review): this excerpt shows a subset of the macro body. Two cglobal
; definitions with the same name appear; presumably a conditional-assembly
; guard (not visible) selects one of them. Loop control, arithmetic, most
; stores and %endmacro are not visible -- confirm against the full file.
411 %macro PIXEL_ADD_PS_W32_H2 2
; First variant: 6 arguments, 7 GPRs, 6 vector registers.
; r0=dest, r1=destride, r2=src0, r3=scr1, r4=srcStride0, r5=srcStride1.
414 cglobal pixel_add_ps_32x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1
; m5 = pw_pixel_max constant; passed as the last CLIPW2 operand below.
415 mova m5, [pw_pixel_max]
; CLIPW2 is defined outside this file; presumably it clamps the words in m0
; and m2 to the range [m4, m5] -- TODO confirm. The initialisation of m4 is
; not visible in this excerpt.
429 CLIPW2 m0, m2, m4, m5
441 CLIPW2 m0, m2, m4, m5
; Next row, byte offset 16: m2 from src0, m3 from scr1 (unaligned loads).
447 movu m2, [r2 + r4 + 16]
449 movu m3, [r3 + r5 + 16]
453 CLIPW2 m0, m2, m4, m5
; Store m2 at byte offset 16 of the next dest row.
456 movu [r0 + r1 + 16], m2
; Next row, byte offsets 32 and 48: m0/m2 from src0, m1/m3 from scr1.
458 movu m0, [r2 + r4 + 32]
459 movu m2, [r2 + r4 + 48]
460 movu m1, [r3 + r5 + 32]
461 movu m3, [r3 + r5 + 48]
; All loads for this row pair are done; advance src0 and scr1 by two rows.
463 lea r2, [r2 + r4 * 2]
464 lea r3, [r3 + r5 * 2]
468 CLIPW2 m0, m2, m4, m5
; Store m0 and m2 at byte offsets 32 and 48 of the next dest row, then
; advance dest by two rows.
470 movu [r0 + r1 + 32], m0
471 movu [r0 + r1 + 48], m2
472 lea r0, [r0 + r1 * 2]
; Second variant: same arguments, but 8 vector registers.
478 cglobal pixel_add_ps_32x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1
; Current src0 row: zero-extend bytes at offsets 8, 16 and 24 to words
; (the offset-0 load is not visible in this excerpt).
483 pmovzxbw m1, [r2 + 8]
484 pmovzxbw m2, [r2 + 16]
485 pmovzxbw m3, [r2 + 24]
; Next src0 row: zero-extend bytes at offsets 0, 8, 16 and 24 to words.
501 pmovzxbw m0, [r2 + r4]
502 pmovzxbw m1, [r2 + r4 + 8]
503 pmovzxbw m2, [r2 + r4 + 16]
504 pmovzxbw m3, [r2 + r4 + 24]
; Next scr1 row: 16-byte unaligned loads at byte offsets 16, 32 and 48.
506 movu m5, [r3 + r5 + 16]
507 movu m6, [r3 + r5 + 32]
508 movu m7, [r3 + r5 + 48]
; Advance src0 and scr1 by two rows.
510 lea r2, [r2 + r4 * 2]
511 lea r3, [r3 + r5 * 2]
; Store m2 at byte offset 16 of the next dest row, then advance dest by
; two rows.
521 movu [r0 + r1 + 16], m2
522 lea r0, [r0 + r1 * 2]
; Instantiate for %2 = 32 and %2 = 64: emits pixel_add_ps_32x32 and
; pixel_add_ps_32x64.
529 PIXEL_ADD_PS_W32_H2 32, 32
530 PIXEL_ADD_PS_W32_H2 32, 64
533 ;-----------------------------------------------------------------------------
534 ; void pixel_add_ps_64x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
535 ;-----------------------------------------------------------------------------
; PIXEL_ADD_PS_W64_H2: two-parameter macro that emits pixel_add_ps_64x%2.
; Only %2 (used as the height in the symbol name) is referenced in the
; visible lines.
; NOTE(review): this excerpt shows a subset of the macro body. Two cglobal
; definitions with the same name appear; presumably a conditional-assembly
; guard (not visible) selects one of them. Loop control, arithmetic, most
; stores and %endmacro are not visible -- confirm against the full file.
536 %macro PIXEL_ADD_PS_W64_H2 2
; First variant: 6 arguments, 7 GPRs, 6 vector registers.
; r0=dest, r1=destride, r2=src0, r3=scr1, r4=srcStride0, r5=srcStride1.
539 cglobal pixel_add_ps_64x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1
; m5 = pw_pixel_max constant; passed as the last CLIPW2 operand below.
540 mova m5, [pw_pixel_max]
; CLIPW2 is defined outside this file; presumably it clamps the words in m0
; and m2 to the range [m4, m5] -- TODO confirm. The initialisation of m4 is
; not visible in this excerpt. Four clips appear before the first visible
; next-row load.
554 CLIPW2 m0, m2, m4, m5
566 CLIPW2 m0, m2, m4, m5
578 CLIPW2 m0, m2, m4, m5
590 CLIPW2 m0, m2, m4, m5
; Next row, byte offset 16: m2 from src0, m3 from scr1 (unaligned loads).
596 movu m2, [r2 + r4 + 16]
598 movu m3, [r3 + r5 + 16]
602 CLIPW2 m0, m2, m4, m5
; Store m2 at byte offset 16 of the next dest row.
605 movu [r0 + r1 + 16], m2
; Next row, byte offsets 32 and 48: m0/m2 from src0, m1/m3 from scr1.
607 movu m0, [r2 + r4 + 32]
608 movu m2, [r2 + r4 + 48]
609 movu m1, [r3 + r5 + 32]
610 movu m3, [r3 + r5 + 48]
614 CLIPW2 m0, m2, m4, m5
; Store m0 and m2 at byte offsets 32 and 48 of the next dest row.
616 movu [r0 + r1 + 32], m0
617 movu [r0 + r1 + 48], m2
; Next row, byte offsets 64 and 80.
619 movu m0, [r2 + r4 + 64]
620 movu m2, [r2 + r4 + 80]
621 movu m1, [r3 + r5 + 64]
622 movu m3, [r3 + r5 + 80]
626 CLIPW2 m0, m2, m4, m5
628 movu [r0 + r1 + 64], m0
629 movu [r0 + r1 + 80], m2
; Next row, byte offsets 96 and 112 (last chunk pair visible for this row).
631 movu m0, [r2 + r4 + 96]
632 movu m2, [r2 + r4 + 112]
633 movu m1, [r3 + r5 + 96]
634 movu m3, [r3 + r5 + 112]
; All loads for this row pair are done; advance src0 and scr1 by two rows.
636 lea r2, [r2 + r4 * 2]
637 lea r3, [r3 + r5 * 2]
641 CLIPW2 m0, m2, m4, m5
; Store the last chunk pair of the next dest row, then advance dest by
; two rows.
643 movu [r0 + r1 + 96], m0
644 movu [r0 + r1 + 112], m2
645 lea r0, [r0 + r1 * 2]
; Second variant: same arguments, but 8 vector registers.
651 cglobal pixel_add_ps_64x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1
; Current src0 row: zero-extend bytes at offsets 8, 16 and 24 to words
; (the offset-0 load is not visible in this excerpt).
656 pmovzxbw m1, [r2 + 8]
657 pmovzxbw m2, [r2 + 16]
658 pmovzxbw m3, [r2 + 24]
; Current src0 row: zero-extend bytes at offsets 32, 40, 48 and 56 to words.
674 pmovzxbw m0, [r2 + 32]
675 pmovzxbw m1, [r2 + 40]
676 pmovzxbw m2, [r2 + 48]
677 pmovzxbw m3, [r2 + 56]
; Next src0 row: zero-extend bytes at offsets 0, 8, 16 and 24 to words.
693 pmovzxbw m0, [r2 + r4]
694 pmovzxbw m1, [r2 + r4 + 8]
695 pmovzxbw m2, [r2 + r4 + 16]
696 pmovzxbw m3, [r2 + r4 + 24]
; Next scr1 row: 16-byte unaligned loads at byte offsets 16, 32 and 48.
698 movu m5, [r3 + r5 + 16]
699 movu m6, [r3 + r5 + 32]
700 movu m7, [r3 + r5 + 48]
; Store m2 at byte offset 16 of the next dest row.
710 movu [r0 + r1 + 16], m2
; Next src0 row: zero-extend bytes at offsets 32, 40, 48 and 56 to words.
712 pmovzxbw m0, [r2 + r4 + 32]
713 pmovzxbw m1, [r2 + r4 + 40]
714 pmovzxbw m2, [r2 + r4 + 48]
715 pmovzxbw m3, [r2 + r4 + 56]
; Next scr1 row: 16-byte unaligned loads at byte offsets 64, 80, 96 and 112.
716 movu m4, [r3 + r5 + 64]
717 movu m5, [r3 + r5 + 80]
718 movu m6, [r3 + r5 + 96]
719 movu m7, [r3 + r5 + 112]
; All loads for this row pair are done; advance src0 and scr1 by two rows.
721 lea r2, [r2 + r4 * 2]
722 lea r3, [r3 + r5 * 2]
; Store m0 and m2 at byte offsets 32 and 48 of the next dest row, then
; advance dest by two rows.
731 movu [r0 + r1 + 32], m0
732 movu [r0 + r1 + 48], m2
733 lea r0, [r0 + r1 * 2]
; Instantiate for %2 = 64: emits pixel_add_ps_64x64.
740 PIXEL_ADD_PS_W64_H2 64, 64