1 ;*****************************************************************************
2 ;* pixel.asm: x86 pixel metrics
3 ;*****************************************************************************
4 ;* Copyright (C) 2003-2013 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Holger Lubitz <holger@lubitz.org>
8 ;* Laurent Aimar <fenrir@via.ecp.fr>
9 ;* Alex Izvorski <aizvorksi@gmail.com>
10 ;* Fiona Glaser <fiona@x264.com>
11 ;* Oskar Arvidsson <oskar@irock.se>
13 ;* This program is free software; you can redistribute it and/or modify
14 ;* it under the terms of the GNU General Public License as published by
15 ;* the Free Software Foundation; either version 2 of the License, or
16 ;* (at your option) any later version.
18 ;* This program is distributed in the hope that it will be useful,
19 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
20 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 ;* GNU General Public License for more details.
23 ;* You should have received a copy of the GNU General Public License
24 ;* along with this program; if not, write to the Free Software
25 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
27 ;* This program is also available under a commercial proprietary license.
28 ;* For more information, contact us at license @ x265.com.
29 ;*****************************************************************************
32 %include "x86util.asm"
; Constant tables used by the pixel metric routines.
; hmul_16p: sixteen bytes, every one equal to 1.
35 hmul_16p: times 16 db 1
; hmul_4p: the byte pattern 1,1,1,1,1,-1,1,-1 emitted twice (16 bytes).
41 hmul_4p: times 2 db 1, 1, 1, 1, 1, -1, 1, -1
; mask_10: the word pair (0, -1) repeated four times.
42 mask_10: times 4 dw 0, -1
; mask_1100: the dword pair (0, -1) repeated twice.
43 mask_1100: times 2 dd 0, -1
; transd_shuf1 / transd_shuf2: word shuffle masks produced by SHUFFLE_MASK_W
; (defined outside this file). shuf1 lists indices 0,8,2,10,4,12,6,14 and
; shuf2 lists each of those indices plus one.
46 transd_shuf1: SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14
47 transd_shuf2: SHUFFLE_MASK_W 1, 9, 3, 11, 5, 13, 7, 15
; pd_f0: four dwords, each 0xffff0000.
50 pd_f0: times 4 dd 0xffff0000
; pw_76543210: the words 0 through 7, stored in ascending order.
52 pw_76543210: dw 0, 1, 2, 3, 4, 5, 6, 7
70 ;=============================================================================
72 ;=============================================================================
76 ; just use shufps on anything post conroe
78 %elif cpuflag(ssse3) && notcpuflag(atom)
79 ; join 2x 32 bit and duplicate them
80 ; emulating shufps is faster on conroe
84 ; doesn't need to dup. sse2 does things by zero extending to words and full h_2d
; DIFF_UNPACK_SSE2: macro declared with exactly 5 parameters.
96 %macro DIFF_UNPACK_SSE2 5
; DIFF_SUMSUB_SSSE3: macro declared with exactly 5 parameters; it forwards
; all five, in the same order, to HSUMSUB (defined outside this file).
105 %macro DIFF_SUMSUB_SSSE3 5
106 HSUMSUB %1, %2, %3, %4, %5
; LOAD_DUP_2x4P: 4 parameters - destination, temporary, then two pointers.
111 %macro LOAD_DUP_2x4P 4 ; dst, tmp, 2* pointer
; LOAD_DUP_4x8P_CONROE: 8 parameters - four destinations, then four pointers.
117 %macro LOAD_DUP_4x8P_CONROE 8 ; 4*dst, 4*pointer
; LOAD_DUP_4x8P_PENRYN: 8 parameters.
; NOTE(review): presumably the same 4*dst, 4*pointer layout as the CONROE
; variant - confirm against the macro body.
124 %macro LOAD_DUP_4x8P_PENRYN 8
125 ; penryn and nehalem run punpcklqdq and movddup in different units
; LOAD_SUMSUB_8x2P: 9 parameters.
;   %1-%4  passed to LOAD_DUP_4x8P as its first four arguments
;   %5     passed as the last argument of DIFF_SUMSUB_SSSE3
;   %6-%9  passed to LOAD_DUP_4x8P as its last four arguments
; DIFF_SUMSUB_SSSE3 receives the first four parameters reordered as
; %1, %3, %2, %4.
134 %macro LOAD_SUMSUB_8x2P 9
135 LOAD_DUP_4x8P %1, %2, %3, %4, %6, %7, %8, %9
136 DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
; LOAD_SUMSUB_8x4P_SSSE3: 7 to 11 parameters; parameters 8-11 default to
; r0, r2, 0, 0 when omitted.
; Expands to two LOAD_SUMSUB_8x2P invocations that share %5, %6 and %7:
;   first  -> %1, %2 with operands [%8], [%9], [%8+r1], [%9+r3]
;   second -> %3, %4 with operands [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5]
; NOTE(review): r4/r5 are presumably 3*r1 and 3*r3 set up by the caller - confirm.
139 %macro LOAD_SUMSUB_8x4P_SSSE3 7-11 r0, r2, 0, 0
140 ; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
141 LOAD_SUMSUB_8x2P %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3]
142 LOAD_SUMSUB_8x2P %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5]
; LOAD_SUMSUB_16P_SSSE3: 7 parameters (two destinations, two temporaries,
; a multiplier constant and two pointers, per the inline note).
; It finishes with DIFF_SUMSUB_SSSE3 on %1, %3, %2, %4 with %5 as last argument.
149 %macro LOAD_SUMSUB_16P_SSSE3 7 ; 2*dst, 2*tmp, mul, 2*ptr
155 DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
; LOAD_SUMSUB_16P_SSE2: 7 parameters; here the fifth one is a mask rather
; than a multiplier (per the inline note). DEINTB and SUMSUB_BA are
; defined outside this file.
158 %macro LOAD_SUMSUB_16P_SSE2 7 ; 2*dst, 2*tmp, mask, 2*ptr
161 DEINTB %1, %2, %3, %4, %5
164 SUMSUB_BA w, %1, %2, %3
; LOAD_SUMSUB_16x4P: 10 to 13 parameters; parameters 11-13 default to
; r0, r2, none when omitted.
; Expands to four LOAD_SUMSUB_16P invocations. All four share %10, and
; their pointer operands are %11/%12 plus, in turn: nothing, r1/r3,
; 2*r1/2*r3 and r4/r5.
; %13 (the optional second temporary) is used only by the fourth invocation.
; NOTE(review): r4/r5 are presumably 3*r1 and 3*r3 set up by the caller - confirm.
167 %macro LOAD_SUMSUB_16x4P 10-13 r0, r2, none
168 ; 8x dest, 1x tmp, 1x mul, [2* ptr] [2nd tmp]
169 LOAD_SUMSUB_16P %1, %5, %2, %3, %10, %11, %12
170 LOAD_SUMSUB_16P %2, %6, %3, %4, %10, %11+r1, %12+r3
171 LOAD_SUMSUB_16P %3, %7, %4, %9, %10, %11+2*r1, %12+2*r3
172 LOAD_SUMSUB_16P %4, %8, %13, %9, %10, %11+r4, %12+r5
; LOAD_SUMSUB_16x2P_AVX2: 9 parameters.
; Each vbroadcasti128 loads 128 bits from memory and replicates them into
; both halves of the destination register:
;   m%1 <- [%6], m%3 <- [%7], m%2 <- [%8], m%4 <- [%9]
; The four registers then go to DIFF_SUMSUB_SSSE3 in the order
; %1, %3, %2, %4 with %5 as the last argument.
175 %macro LOAD_SUMSUB_16x2P_AVX2 9
176 ; 2*dst, 2*tmp, mul, 4*ptr
177 vbroadcasti128 m%1, [%6]
178 vbroadcasti128 m%3, [%7]
179 vbroadcasti128 m%2, [%8]
180 vbroadcasti128 m%4, [%9]
181 DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
; LOAD_SUMSUB_16x4P_AVX2: 7 to 11 parameters; parameters 8-11 default to
; r0, r2, 0, 0 when omitted.
; Expands to two LOAD_SUMSUB_16x2P_AVX2 invocations sharing %5, %6 and %7:
;   first  -> %1, %2 with addresses %8, %9, %8+r1, %9+r3
;   second -> %3, %4 with addresses %8+2*r1, %9+2*r3, %8+r4, %9+r5
; The addresses are passed without brackets because the callee adds them.
184 %macro LOAD_SUMSUB_16x4P_AVX2 7-11 r0, r2, 0, 0
185 ; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
186 LOAD_SUMSUB_16x2P_AVX2 %1, %2, %5, %6, %7, %8, %9, %8+r1, %9+r3
187 LOAD_SUMSUB_16x2P_AVX2 %3, %4, %5, %6, %7, %8+2*r1, %9+2*r3, %8+r4, %9+r5
; LOAD_DUP_4x16P_AVX2: 8 parameters - four destinations, then four pointers.
; After loading, each of the four destination registers has its 64-bit
; elements permuted in place by vpermq with the q0011 selector
; (selector constant defined outside this file).
194 %macro LOAD_DUP_4x16P_AVX2 8 ; 4*dst, 4*pointer
199 vpermq m%3, m%3, q0011
200 vpermq m%4, m%4, q0011
201 vpermq m%1, m%1, q0011
202 vpermq m%2, m%2, q0011
; LOAD_SUMSUB8_16x2P_AVX2: 9 parameters.
; %1-%4 and %6-%9 go to LOAD_DUP_4x16P_AVX2 (destinations, then pointers);
; the destinations are then handed to DIFF_SUMSUB_SSSE3 in the order
; %1, %3, %2, %4 with %5 as the last argument.
205 %macro LOAD_SUMSUB8_16x2P_AVX2 9
206 ; 2*dst, 2*tmp, mul, 4*ptr
207 LOAD_DUP_4x16P_AVX2 %1, %2, %3, %4, %6, %7, %8, %9
208 DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
; LOAD_SUMSUB8_16x4P_AVX2: 7 to 11 parameters; parameters 8-11 default to
; r0, r2, 0, 0 when omitted.
; Expands to two LOAD_SUMSUB8_16x2P_AVX2 invocations sharing %5, %6 and %7:
;   first  -> %1, %2 with operands [%8], [%9], [%8+r1], [%9+r3]
;   second -> %3, %4 with operands [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5]
211 %macro LOAD_SUMSUB8_16x4P_AVX2 7-11 r0, r2, 0, 0
212 ; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
213 LOAD_SUMSUB8_16x2P_AVX2 %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3]
214 LOAD_SUMSUB8_16x2P_AVX2 %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5]
221 ; in: r4=3*stride1, r5=3*stride2
222 ; in: %2 = horizontal offset
223 ; in: %3 = whether we need to increment pix1 and pix2
; SATD_4x4_MMX takes 3 parameters. The horizontal offset %2 is multiplied
; by SIZEOF_PIXEL to give the displacement `offset` used by every load.
226 %macro SATD_4x4_MMX 3
228 %assign offset %2*SIZEOF_PIXEL
; One LOAD_DIFF per row, targeting m4..m7 with m3 as the second operand.
; Row addresses are r0/r2 plus 0, r1/r3, 2*r1/2*r3 and r4/r5.
229 LOAD_DIFF m4, m3, none, [r0+ offset], [r2+ offset]
230 LOAD_DIFF m5, m3, none, [r0+ r1+offset], [r2+ r3+offset]
231 LOAD_DIFF m6, m3, none, [r0+2*r1+offset], [r2+2*r3+offset]
232 LOAD_DIFF m7, m3, none, [r0+ r4+offset], [r2+ r5+offset]
; 2-D Hadamard transform over m4..m7; m3 and the macro-local symbol %%n
; are its remaining arguments (HADAMARD4_2D is defined outside this file).
237 HADAMARD4_2D 4, 5, 6, 7, 3, %%n
242 ; in: %1 = horizontal if 0, vertical if 1
; SATD_8x4_SSE takes 8 or 9 parameters. %2-%5 name the four data registers
; and %6/%7 the registers passed as extra operands to the helpers below.
; NOTE(review): the HADAMARD4_2D_SSE line and the HADAMARD4_V sequence are
; presumably the two alternatives selected by %1 - confirm against the
; conditional directives around them.
243 %macro SATD_8x4_SSE 8-9
245 HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax
247 HADAMARD4_V %2, %3, %4, %5, %6
248 ; doing the abs first is a slight advantage
; ABSW2 is applied to the pairs (%2, %4) and (%3, %5), each in place.
249 ABSW2 m%2, m%4, m%2, m%4, m%6, m%7
250 ABSW2 m%3, m%5, m%3, m%5, m%6, m%7
; HADAMARD in "max" mode on the pair (%2, %4), then on the pair (%3, %5).
251 HADAMARD 1, max, %2, %4, %6, %7
261 HADAMARD 1, max, %3, %5, %6, %7
; SATD_8x4_1_SSE: 10-parameter variant of SATD_8x4_SSE. The lines below use
; %2-%7 exactly as SATD_8x4_SSE does; none of them references %1 or %8-%10.
; NOTE(review): the HADAMARD4_2D_SSE line and the HADAMARD4_V sequence are
; presumably alternatives selected by %1 - confirm against the conditional
; directives around them.
266 %macro SATD_8x4_1_SSE 10
268 HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax
270 HADAMARD4_V %2, %3, %4, %5, %6
271 ; doing the abs first is a slight advantage
; ABSW2 is applied to the pairs (%2, %4) and (%3, %5), each in place.
272 ABSW2 m%2, m%4, m%2, m%4, m%6, m%7
273 ABSW2 m%3, m%5, m%3, m%5, m%6, m%7
; HADAMARD in "max" mode on the pair (%2, %4), then on the pair (%3, %5).
274 HADAMARD 1, max, %2, %4, %6, %7
294 HADAMARD 1, max, %3, %5, %6, %7
; SATD_START_MMX: no parameters. Sets r4 = 3*r1 and r5 = 3*r3, the values
; the SATD row-addressing macros use to reach the fourth row.
305 %macro SATD_START_MMX 0
307 lea r4, [3*r1] ; 3*stride1
308 lea r5, [3*r3] ; 3*stride2
; SATD_END_MMX: no parameters. Its body differs between the HIGH_BIT_DEPTH
; and !HIGH_BIT_DEPTH builds.
311 %macro SATD_END_MMX 0
315 %else ; !HIGH_BIT_DEPTH
322 %endif ; HIGH_BIT_DEPTH
326 ; FIXME avoid the spilling of regs to hold 3*stride.
327 ; for small blocks on x86_32, modify pixel pointer instead.
329 ;-----------------------------------------------------------------------------
330 ; int pixel_satd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
331 ;-----------------------------------------------------------------------------
; Internal helper for a 16-wide, 4-row strip: four SATD_4x4_MMX expansions
; at horizontal offsets 0, 4, 8 and 12. None asks for a pointer increment
; (third argument 0). The first argument alternates between m2 and m1.
333 cglobal pixel_satd_16x4_internal
334 SATD_4x4_MMX m2, 0, 0
335 SATD_4x4_MMX m1, 4, 0
337 SATD_4x4_MMX m2, 8, 0
339 SATD_4x4_MMX m1, 12, 0
; Internal helper for an 8x8 block. The first pair of SATD_4x4_MMX
; expansions uses offsets 0 and 4, and the second of them requests the
; pointer increment (third argument 1). Execution then continues into the
; 8x4 entry point below.
344 cglobal pixel_satd_8x8_internal
345 SATD_4x4_MMX m2, 0, 0
346 SATD_4x4_MMX m1, 4, 1
; Secondary entry point for an 8x4 strip: offsets 0 and 4, no increment.
349 pixel_satd_8x4_internal_mmx2:
350 SATD_4x4_MMX m2, 0, 0
351 SATD_4x4_MMX m1, 4, 0
; SATD_MxN_MMX: 3 parameters - block width %1, block height %2, and the
; height %3 of the internal helper to call.
; Defines pixel_satd_%1x%2 (declared with 4,7) around calls to
; pixel_satd_%1x%3_internal_mmx2.
357 %macro SATD_MxN_MMX 3
358 cglobal pixel_satd_%1x%2, 4,7
361 call pixel_satd_%1x%3_internal_mmx2
368 call pixel_satd_%1x%3_internal_mmx2
; Instantiations: 16x16 and 16x8 use the 16x4 helper, 8x16 uses the 8x8 helper.
379 SATD_MxN_MMX 16, 16, 4
380 SATD_MxN_MMX 16, 8, 4
381 SATD_MxN_MMX 8, 16, 8
382 %endif ; HIGH_BIT_DEPTH
; Versions of the 16x16, 16x8 and 8x16 entry points assembled only when
; HIGH_BIT_DEPTH is 0. Each is declared with 4,6 and calls the matching
; MMX internal helper.
384 %if HIGH_BIT_DEPTH == 0
; 16x16: calls the 16x4 helper.
385 cglobal pixel_satd_16x16, 4,6
389 call pixel_satd_16x4_internal_mmx2
393 call pixel_satd_16x4_internal_mmx2
; 16x8: calls the 16x4 helper.
398 cglobal pixel_satd_16x8, 4,6
401 call pixel_satd_16x4_internal_mmx2
404 call pixel_satd_16x4_internal_mmx2
; 8x16: calls the 8x8 helper.
407 cglobal pixel_satd_8x16, 4,6
410 call pixel_satd_8x8_internal_mmx2
413 call pixel_satd_8x8_internal_mmx2
415 %endif ; !HIGH_BIT_DEPTH
; 8x8 entry point (declared with 4,6): delegates to the 8x8 internal helper.
417 cglobal pixel_satd_8x8, 4,6
420 call pixel_satd_8x8_internal_mmx2
; 8x4 entry point (declared with 4,6): delegates to the 8x4 internal label.
423 cglobal pixel_satd_8x4, 4,6
426 call pixel_satd_8x4_internal_mmx2
; 4x16: four SATD_4x4_MMX expansions at offset 0. The first three request
; the pointer increment and the last one does not. The first targets m0,
; the others m1.
429 cglobal pixel_satd_4x16, 4,6
431 SATD_4x4_MMX m0, 0, 1
432 SATD_4x4_MMX m1, 0, 1
434 SATD_4x4_MMX m1, 0, 1
436 SATD_4x4_MMX m1, 0, 0
; 4x8: two expansions at offset 0; only the first requests the increment.
440 cglobal pixel_satd_4x8, 4,6
442 SATD_4x4_MMX m0, 0, 1
443 SATD_4x4_MMX m1, 0, 0
; 4x4: a single expansion at offset 0 with no increment.
447 cglobal pixel_satd_4x4, 4,6
449 SATD_4x4_MMX m0, 0, 0
; SATD_START_SSE2: 2 or 3 parameters; the third defaults to 0.
; One path is taken when HIGH_BIT_DEPTH is set and %3 is non-zero, another
; when the target has SSSE3 and is not Atom.
452 %macro SATD_START_SSE2 2-3 0
454 %if HIGH_BIT_DEPTH && %3
456 %elif cpuflag(ssse3) && notcpuflag(atom)
; SATD_END_SSE2: 1 or 2 parameters.
468 %macro SATD_END_SSE2 1-2
; BACKUP_POINTERS: no parameters.
489 %macro BACKUP_POINTERS 0
; RESTORE_AND_INC_POINTERS: no parameters. Leaves r0 and r2 pointing
; 8 pixels (8*SIZEOF_PIXEL bytes) further along than a base value.
; The lea form takes the base from r6/r7; the add form advances r0/r2 in place.
; NOTE(review): presumably r6/r7 hold copies made by BACKUP_POINTERS and
; the two forms are build-specific alternatives - confirm.
499 %macro RESTORE_AND_INC_POINTERS 0
501 lea r0, [r6+8*SIZEOF_PIXEL]
502 lea r2, [r7+8*SIZEOF_PIXEL]
509 add r0, 8*SIZEOF_PIXEL
510 add r2, 8*SIZEOF_PIXEL
; SATD_4x8_SSE: 3 or 4 parameters. The load sequence differs between the
; HIGH_BIT_DEPTH and !HIGH_BIT_DEPTH builds.
; It ends by expanding either SATD_8x4_1_SSE (passing %3 and %4) or
; SATD_8x4_SSE (passing %3), in both cases on registers 0-3 with 4, 5 and 7
; as the remaining register arguments and %1 as the direction selector.
514 %macro SATD_4x8_SSE 3-4
538 %else ; !HIGH_BIT_DEPTH
579 %endif ; HIGH_BIT_DEPTH
581 SATD_8x4_1_SSE %1, 0, 1, 2, 3, 4, 5, 7, %3, %4
583 SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 7, %3
587 ;-----------------------------------------------------------------------------
588 ; int pixel_satd_8x4( uint8_t *, intptr_t, uint8_t *, intptr_t )
589 ;-----------------------------------------------------------------------------
; `vertical` is true when the target lacks SSSE3, is Atom, or the build is
; HIGH_BIT_DEPTH. It is passed as the direction selector to the SATD_8x4
; macros.
591 %define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH)
; This 4x4 version is assembled only for SSSE3 targets where `vertical` is 0
; or the build is HIGH_BIT_DEPTH.
593 %if cpuflag(ssse3) && (vertical==0 || HIGH_BIT_DEPTH)
594 cglobal pixel_satd_4x4, 4, 6, 6
; Load two rows per register: m2/m3 from r2 (stride r3), m0/m1 from r0
; (stride r1); m5 is the temporary.
597 LOAD_DUP_2x4P m2, m5, [r2], [r2+r3]
598 LOAD_DUP_2x4P m3, m5, [r2+2*r3], [r2+r5]
599 LOAD_DUP_2x4P m0, m5, [r0], [r0+r1]
600 LOAD_DUP_2x4P m1, m5, [r0+2*r1], [r0+r4]
601 DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4
; Three HADAMARD passes over registers 0-3: two in sumsub mode, then amax.
602 HADAMARD 0, sumsub, 0, 1, 2, 3
603 HADAMARD 4, sumsub, 0, 1, 2, 3
604 HADAMARD 1, amax, 0, 1, 2, 3
; 4x8 (declared with 4, 6, 8): one SATD_4x8_SSE expansion in "swap" mode.
610 cglobal pixel_satd_4x8, 4, 6, 8
615 SATD_4x8_SSE vertical, 0, swap
; 4x16 (declared with 4, 6, 8): a first SATD_4x8_SSE expansion in "swap"
; mode, then r0/r2 are advanced by 2*SIZEOF_PIXEL times their strides
; (r1/r3) and a second expansion runs in "add" mode.
620 cglobal pixel_satd_4x16, 4, 6, 8
625 SATD_4x8_SSE vertical, 0, swap
626 lea r0, [r0+r1*2*SIZEOF_PIXEL]
627 lea r2, [r2+r3*2*SIZEOF_PIXEL]
628 SATD_4x8_SSE vertical, 1, add
; Internal 8x8 helper: two identical load + SATD_8x4_SSE stages.
; Each LOAD_SUMSUB_8x4P passes 1 as its tenth argument (the "increment?"
; slot of the _SSSE3 variant) and 0 as its eleventh.
633 cglobal pixel_satd_8x8_internal
634 LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
635 SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6
; Macro-local entry label that runs only the second 8x4 stage.
636 %%pixel_satd_8x4_internal:
637 LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
638 SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6
; Internal 8x8 helper built on SATD_8x4_1_SSE. Two copies of the same
; two-stage body appear, differing only in the last two arguments of
; SATD_8x4_1_SSE: 12, 13 in the first copy and 4, 5 in the second.
; NOTE(review): the copies presumably sit in alternate branches of a
; build-time conditional (both define %%pixel_satd_8x4_internal2) - confirm.
641 cglobal pixel_satd_8x8_internal2
643 LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
644 SATD_8x4_1_SSE vertical, 0, 1, 2, 3, 4, 5, 6, 12, 13
645 %%pixel_satd_8x4_internal2:
646 LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
647 SATD_8x4_1_SSE vertical, 0, 1, 2, 3, 4, 5, 6, 12, 13
; Second copy, using registers 4 and 5 in place of 12 and 13.
649 LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
650 SATD_8x4_1_SSE vertical, 0, 1, 2, 3, 4, 5, 6, 4, 5
651 %%pixel_satd_8x4_internal2:
652 LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
653 SATD_8x4_1_SSE vertical, 0, 1, 2, 3, 4, 5, 6, 4, 5
657 ; 16x8 regresses on phenom win64, 16x16 is almost the same (too many spilled registers)
658 ; These aren't any faster on AVX systems with fast movddup (Bulldozer, Sandy Bridge)
; Assembled only for 8-bit builds on 64-bit targets (WIN64 or UNIX64)
; without AVX.
659 %if HIGH_BIT_DEPTH == 0 && (WIN64 || UNIX64) && notcpuflag(avx)
; Internal helper for a 16-wide, 4-row strip: one LOAD_SUMSUB_16x4P fills
; eight registers (0-4, 8, 5, 9), which are then processed as two groups of
; four by SATD_8x4_1_SSE with direction selector 0. Both expansions use 6 and
; 10 as register arguments and pass 12, 13 as the trailing pair.
661 cglobal pixel_satd_16x4_internal2
662 LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11
665 SATD_8x4_1_SSE 0, 0, 1, 2, 3, 6, 11, 10, 12, 13
666 SATD_8x4_1_SSE 0, 4, 8, 5, 9, 6, 3, 10, 12, 13
; 16-wide entry points built on pixel_satd_16x4_internal2. Each is declared
; with 4,6,14 and starts with SATD_START_SSE2 m10, m7. All except 16x4
; finish through the shared %%pixel_satd_16x8_internal tail inside
; pixel_satd_16x16, which performs the last two helper calls.
; NOTE(review): each helper call presumably covers 4 rows, so the total call
; count of an entry point is its height / 4 - confirm.

; 16x4: a single helper call.
669 cglobal pixel_satd_16x4, 4,6,14
670 SATD_START_SSE2 m10, m7
674 call pixel_satd_16x4_internal2
; 16x8: goes straight to the shared two-call tail.
683 cglobal pixel_satd_16x8, 4,6,14
684 SATD_START_SSE2 m10, m7
688 jmp %%pixel_satd_16x8_internal
; 16x12: one call, then the shared tail.
690 cglobal pixel_satd_16x12, 4,6,14
691 SATD_START_SSE2 m10, m7
695 call pixel_satd_16x4_internal2
696 jmp %%pixel_satd_16x8_internal
; 16x32: six calls, then the shared tail.
698 cglobal pixel_satd_16x32, 4,6,14
699 SATD_START_SSE2 m10, m7
703 call pixel_satd_16x4_internal2
704 call pixel_satd_16x4_internal2
705 call pixel_satd_16x4_internal2
706 call pixel_satd_16x4_internal2
707 call pixel_satd_16x4_internal2
708 call pixel_satd_16x4_internal2
709 jmp %%pixel_satd_16x8_internal
; 16x64: fourteen calls, then the shared tail.
711 cglobal pixel_satd_16x64, 4,6,14
712 SATD_START_SSE2 m10, m7
716 call pixel_satd_16x4_internal2
717 call pixel_satd_16x4_internal2
718 call pixel_satd_16x4_internal2
719 call pixel_satd_16x4_internal2
720 call pixel_satd_16x4_internal2
721 call pixel_satd_16x4_internal2
722 call pixel_satd_16x4_internal2
723 call pixel_satd_16x4_internal2
724 call pixel_satd_16x4_internal2
725 call pixel_satd_16x4_internal2
726 call pixel_satd_16x4_internal2
727 call pixel_satd_16x4_internal2
728 call pixel_satd_16x4_internal2
729 call pixel_satd_16x4_internal2
730 jmp %%pixel_satd_16x8_internal
; 16x16: two calls, then execution continues into the shared tail below.
732 cglobal pixel_satd_16x16, 4,6,14
733 SATD_START_SSE2 m10, m7
737 call pixel_satd_16x4_internal2
738 call pixel_satd_16x4_internal2
; Shared tail: the final two helper calls for every entry point above
; except 16x4.
739 %%pixel_satd_16x8_internal:
740 call pixel_satd_16x4_internal2
741 call pixel_satd_16x4_internal2
; 32x8 built on pixel_satd_16x4_internal2: two runs of 2 helper calls.
; NOTE(review): presumably one run per 16-pixel-wide column, with the
; pointers moved to the next column between runs - confirm.
750 cglobal pixel_satd_32x8, 4,8,14 ;if WIN64 && notcpuflag(avx)
751 SATD_START_SSE2 m10, m7
757 call pixel_satd_16x4_internal2
758 call pixel_satd_16x4_internal2
761 call pixel_satd_16x4_internal2
762 call pixel_satd_16x4_internal2
; 32x16 built on pixel_satd_16x4_internal2: two runs of 4 helper calls.
; NOTE(review): presumably one run per 16-pixel-wide column - confirm.
771 cglobal pixel_satd_32x16, 4,8,14 ;if WIN64 && notcpuflag(avx)
772 SATD_START_SSE2 m10, m7
778 call pixel_satd_16x4_internal2
779 call pixel_satd_16x4_internal2
780 call pixel_satd_16x4_internal2
781 call pixel_satd_16x4_internal2
784 call pixel_satd_16x4_internal2
785 call pixel_satd_16x4_internal2
786 call pixel_satd_16x4_internal2
787 call pixel_satd_16x4_internal2
; 32x24 built on pixel_satd_16x4_internal2: two runs of 6 helper calls.
; NOTE(review): presumably one run per 16-pixel-wide column - confirm.
796 cglobal pixel_satd_32x24, 4,8,14 ;if WIN64 && notcpuflag(avx)
797 SATD_START_SSE2 m10, m7
803 call pixel_satd_16x4_internal2
804 call pixel_satd_16x4_internal2
805 call pixel_satd_16x4_internal2
806 call pixel_satd_16x4_internal2
807 call pixel_satd_16x4_internal2
808 call pixel_satd_16x4_internal2
811 call pixel_satd_16x4_internal2
812 call pixel_satd_16x4_internal2
813 call pixel_satd_16x4_internal2
814 call pixel_satd_16x4_internal2
815 call pixel_satd_16x4_internal2
816 call pixel_satd_16x4_internal2
; 32x32 built on pixel_satd_16x4_internal2: two runs of 8 helper calls.
; NOTE(review): presumably one run per 16-pixel-wide column - confirm.
825 cglobal pixel_satd_32x32, 4,8,14 ;if WIN64 && notcpuflag(avx)
826 SATD_START_SSE2 m10, m7
832 call pixel_satd_16x4_internal2
833 call pixel_satd_16x4_internal2
834 call pixel_satd_16x4_internal2
835 call pixel_satd_16x4_internal2
836 call pixel_satd_16x4_internal2
837 call pixel_satd_16x4_internal2
838 call pixel_satd_16x4_internal2
839 call pixel_satd_16x4_internal2
842 call pixel_satd_16x4_internal2
843 call pixel_satd_16x4_internal2
844 call pixel_satd_16x4_internal2
845 call pixel_satd_16x4_internal2
846 call pixel_satd_16x4_internal2
847 call pixel_satd_16x4_internal2
848 call pixel_satd_16x4_internal2
849 call pixel_satd_16x4_internal2
; 32x64 built on pixel_satd_16x4_internal2: two runs of 16 helper calls.
; NOTE(review): presumably one run per 16-pixel-wide column - confirm.
858 cglobal pixel_satd_32x64, 4,8,14 ;if WIN64 && notcpuflag(avx)
859 SATD_START_SSE2 m10, m7
865 call pixel_satd_16x4_internal2
866 call pixel_satd_16x4_internal2
867 call pixel_satd_16x4_internal2
868 call pixel_satd_16x4_internal2
869 call pixel_satd_16x4_internal2
870 call pixel_satd_16x4_internal2
871 call pixel_satd_16x4_internal2
872 call pixel_satd_16x4_internal2
873 call pixel_satd_16x4_internal2
874 call pixel_satd_16x4_internal2
875 call pixel_satd_16x4_internal2
876 call pixel_satd_16x4_internal2
877 call pixel_satd_16x4_internal2
878 call pixel_satd_16x4_internal2
879 call pixel_satd_16x4_internal2
880 call pixel_satd_16x4_internal2
883 call pixel_satd_16x4_internal2
884 call pixel_satd_16x4_internal2
885 call pixel_satd_16x4_internal2
886 call pixel_satd_16x4_internal2
887 call pixel_satd_16x4_internal2
888 call pixel_satd_16x4_internal2
889 call pixel_satd_16x4_internal2
890 call pixel_satd_16x4_internal2
891 call pixel_satd_16x4_internal2
892 call pixel_satd_16x4_internal2
893 call pixel_satd_16x4_internal2
894 call pixel_satd_16x4_internal2
895 call pixel_satd_16x4_internal2
896 call pixel_satd_16x4_internal2
897 call pixel_satd_16x4_internal2
898 call pixel_satd_16x4_internal2
; 48x64 built on pixel_satd_16x4_internal2: three runs of 16 helper calls.
; NOTE(review): presumably one run per 16-pixel-wide column - confirm.
907 cglobal pixel_satd_48x64, 4,8,14 ;if WIN64 && notcpuflag(avx)
908 SATD_START_SSE2 m10, m7
914 call pixel_satd_16x4_internal2
915 call pixel_satd_16x4_internal2
916 call pixel_satd_16x4_internal2
917 call pixel_satd_16x4_internal2
918 call pixel_satd_16x4_internal2
919 call pixel_satd_16x4_internal2
920 call pixel_satd_16x4_internal2
921 call pixel_satd_16x4_internal2
922 call pixel_satd_16x4_internal2
923 call pixel_satd_16x4_internal2
924 call pixel_satd_16x4_internal2
925 call pixel_satd_16x4_internal2
926 call pixel_satd_16x4_internal2
927 call pixel_satd_16x4_internal2
928 call pixel_satd_16x4_internal2
929 call pixel_satd_16x4_internal2
932 call pixel_satd_16x4_internal2
933 call pixel_satd_16x4_internal2
934 call pixel_satd_16x4_internal2
935 call pixel_satd_16x4_internal2
936 call pixel_satd_16x4_internal2
937 call pixel_satd_16x4_internal2
938 call pixel_satd_16x4_internal2
939 call pixel_satd_16x4_internal2
940 call pixel_satd_16x4_internal2
941 call pixel_satd_16x4_internal2
942 call pixel_satd_16x4_internal2
943 call pixel_satd_16x4_internal2
944 call pixel_satd_16x4_internal2
945 call pixel_satd_16x4_internal2
946 call pixel_satd_16x4_internal2
947 call pixel_satd_16x4_internal2
950 call pixel_satd_16x4_internal2
951 call pixel_satd_16x4_internal2
952 call pixel_satd_16x4_internal2
953 call pixel_satd_16x4_internal2
954 call pixel_satd_16x4_internal2
955 call pixel_satd_16x4_internal2
956 call pixel_satd_16x4_internal2
957 call pixel_satd_16x4_internal2
958 call pixel_satd_16x4_internal2
959 call pixel_satd_16x4_internal2
960 call pixel_satd_16x4_internal2
961 call pixel_satd_16x4_internal2
962 call pixel_satd_16x4_internal2
963 call pixel_satd_16x4_internal2
964 call pixel_satd_16x4_internal2
965 call pixel_satd_16x4_internal2
; 64x16 built on pixel_satd_16x4_internal2: four runs of 4 helper calls.
; NOTE(review): presumably one run per 16-pixel-wide column - confirm.
974 cglobal pixel_satd_64x16, 4,8,14 ;if WIN64 && notcpuflag(avx)
975 SATD_START_SSE2 m10, m7
981 call pixel_satd_16x4_internal2
982 call pixel_satd_16x4_internal2
983 call pixel_satd_16x4_internal2
984 call pixel_satd_16x4_internal2
987 call pixel_satd_16x4_internal2
988 call pixel_satd_16x4_internal2
989 call pixel_satd_16x4_internal2
990 call pixel_satd_16x4_internal2
993 call pixel_satd_16x4_internal2
994 call pixel_satd_16x4_internal2
995 call pixel_satd_16x4_internal2
996 call pixel_satd_16x4_internal2
999 call pixel_satd_16x4_internal2
1000 call pixel_satd_16x4_internal2
1001 call pixel_satd_16x4_internal2
1002 call pixel_satd_16x4_internal2
; 64x32 built on pixel_satd_16x4_internal2: four runs of 8 helper calls.
; NOTE(review): presumably one run per 16-pixel-wide column - confirm.
1011 cglobal pixel_satd_64x32, 4,8,14 ;if WIN64 && notcpuflag(avx)
1012 SATD_START_SSE2 m10, m7
1018 call pixel_satd_16x4_internal2
1019 call pixel_satd_16x4_internal2
1020 call pixel_satd_16x4_internal2
1021 call pixel_satd_16x4_internal2
1022 call pixel_satd_16x4_internal2
1023 call pixel_satd_16x4_internal2
1024 call pixel_satd_16x4_internal2
1025 call pixel_satd_16x4_internal2
1028 call pixel_satd_16x4_internal2
1029 call pixel_satd_16x4_internal2
1030 call pixel_satd_16x4_internal2
1031 call pixel_satd_16x4_internal2
1032 call pixel_satd_16x4_internal2
1033 call pixel_satd_16x4_internal2
1034 call pixel_satd_16x4_internal2
1035 call pixel_satd_16x4_internal2
1038 call pixel_satd_16x4_internal2
1039 call pixel_satd_16x4_internal2
1040 call pixel_satd_16x4_internal2
1041 call pixel_satd_16x4_internal2
1042 call pixel_satd_16x4_internal2
1043 call pixel_satd_16x4_internal2
1044 call pixel_satd_16x4_internal2
1045 call pixel_satd_16x4_internal2
1048 call pixel_satd_16x4_internal2
1049 call pixel_satd_16x4_internal2
1050 call pixel_satd_16x4_internal2
1051 call pixel_satd_16x4_internal2
1052 call pixel_satd_16x4_internal2
1053 call pixel_satd_16x4_internal2
1054 call pixel_satd_16x4_internal2
1055 call pixel_satd_16x4_internal2
; 64x48 built on pixel_satd_16x4_internal2: four runs of 12 helper calls.
; NOTE(review): presumably one run per 16-pixel-wide column - confirm.
1065 cglobal pixel_satd_64x48, 4,8,14 ;if WIN64 && notcpuflag(avx)
1066 SATD_START_SSE2 m10, m7
1072 call pixel_satd_16x4_internal2
1073 call pixel_satd_16x4_internal2
1074 call pixel_satd_16x4_internal2
1075 call pixel_satd_16x4_internal2
1076 call pixel_satd_16x4_internal2
1077 call pixel_satd_16x4_internal2
1078 call pixel_satd_16x4_internal2
1079 call pixel_satd_16x4_internal2
1080 call pixel_satd_16x4_internal2
1081 call pixel_satd_16x4_internal2
1082 call pixel_satd_16x4_internal2
1083 call pixel_satd_16x4_internal2
1086 call pixel_satd_16x4_internal2
1087 call pixel_satd_16x4_internal2
1088 call pixel_satd_16x4_internal2
1089 call pixel_satd_16x4_internal2
1090 call pixel_satd_16x4_internal2
1091 call pixel_satd_16x4_internal2
1092 call pixel_satd_16x4_internal2
1093 call pixel_satd_16x4_internal2
1094 call pixel_satd_16x4_internal2
1095 call pixel_satd_16x4_internal2
1096 call pixel_satd_16x4_internal2
1097 call pixel_satd_16x4_internal2
1100 call pixel_satd_16x4_internal2
1101 call pixel_satd_16x4_internal2
1102 call pixel_satd_16x4_internal2
1103 call pixel_satd_16x4_internal2
1104 call pixel_satd_16x4_internal2
1105 call pixel_satd_16x4_internal2
1106 call pixel_satd_16x4_internal2
1107 call pixel_satd_16x4_internal2
1108 call pixel_satd_16x4_internal2
1109 call pixel_satd_16x4_internal2
1110 call pixel_satd_16x4_internal2
1111 call pixel_satd_16x4_internal2
1114 call pixel_satd_16x4_internal2
1115 call pixel_satd_16x4_internal2
1116 call pixel_satd_16x4_internal2
1117 call pixel_satd_16x4_internal2
1118 call pixel_satd_16x4_internal2
1119 call pixel_satd_16x4_internal2
1120 call pixel_satd_16x4_internal2
1121 call pixel_satd_16x4_internal2
1122 call pixel_satd_16x4_internal2
1123 call pixel_satd_16x4_internal2
1124 call pixel_satd_16x4_internal2
1125 call pixel_satd_16x4_internal2
; 64x64 built on pixel_satd_16x4_internal2: four runs of 16 helper calls.
; NOTE(review): presumably one run per 16-pixel-wide column - confirm.
1135 cglobal pixel_satd_64x64, 4,8,14 ;if WIN64 && notcpuflag(avx)
1136 SATD_START_SSE2 m10, m7
1142 call pixel_satd_16x4_internal2
1143 call pixel_satd_16x4_internal2
1144 call pixel_satd_16x4_internal2
1145 call pixel_satd_16x4_internal2
1146 call pixel_satd_16x4_internal2
1147 call pixel_satd_16x4_internal2
1148 call pixel_satd_16x4_internal2
1149 call pixel_satd_16x4_internal2
1150 call pixel_satd_16x4_internal2
1151 call pixel_satd_16x4_internal2
1152 call pixel_satd_16x4_internal2
1153 call pixel_satd_16x4_internal2
1154 call pixel_satd_16x4_internal2
1155 call pixel_satd_16x4_internal2
1156 call pixel_satd_16x4_internal2
1157 call pixel_satd_16x4_internal2
1160 call pixel_satd_16x4_internal2
1161 call pixel_satd_16x4_internal2
1162 call pixel_satd_16x4_internal2
1163 call pixel_satd_16x4_internal2
1164 call pixel_satd_16x4_internal2
1165 call pixel_satd_16x4_internal2
1166 call pixel_satd_16x4_internal2
1167 call pixel_satd_16x4_internal2
1168 call pixel_satd_16x4_internal2
1169 call pixel_satd_16x4_internal2
1170 call pixel_satd_16x4_internal2
1171 call pixel_satd_16x4_internal2
1172 call pixel_satd_16x4_internal2
1173 call pixel_satd_16x4_internal2
1174 call pixel_satd_16x4_internal2
1175 call pixel_satd_16x4_internal2
1178 call pixel_satd_16x4_internal2
1179 call pixel_satd_16x4_internal2
1180 call pixel_satd_16x4_internal2
1181 call pixel_satd_16x4_internal2
1182 call pixel_satd_16x4_internal2
1183 call pixel_satd_16x4_internal2
1184 call pixel_satd_16x4_internal2
1185 call pixel_satd_16x4_internal2
1186 call pixel_satd_16x4_internal2
1187 call pixel_satd_16x4_internal2
1188 call pixel_satd_16x4_internal2
1189 call pixel_satd_16x4_internal2
1190 call pixel_satd_16x4_internal2
1191 call pixel_satd_16x4_internal2
1192 call pixel_satd_16x4_internal2
1193 call pixel_satd_16x4_internal2
1196 call pixel_satd_16x4_internal2
1197 call pixel_satd_16x4_internal2
1198 call pixel_satd_16x4_internal2
1199 call pixel_satd_16x4_internal2
1200 call pixel_satd_16x4_internal2
1201 call pixel_satd_16x4_internal2
1202 call pixel_satd_16x4_internal2
1203 call pixel_satd_16x4_internal2
1204 call pixel_satd_16x4_internal2
1205 call pixel_satd_16x4_internal2
1206 call pixel_satd_16x4_internal2
1207 call pixel_satd_16x4_internal2
1208 call pixel_satd_16x4_internal2
1209 call pixel_satd_16x4_internal2
1210 call pixel_satd_16x4_internal2
1211 call pixel_satd_16x4_internal2
; 32x8 built on pixel_satd_8x8_internal2 (WIN64 + AVX variant).
; One helper call per 8-pixel-wide column. For columns 1-3, r0 and r2 are
; rebuilt from r6 and r7 plus 8, 16 and 24 pixels before the call.
; NOTE(review): r6/r7 presumably hold the original pix1/pix2 pointers saved
; before the first call - confirm.
1224 cglobal pixel_satd_32x8, 4,8,14 ;if WIN64 && cpuflag(avx)
1225 SATD_START_SSE2 m6, m7
1228 call pixel_satd_8x8_internal2
1229 lea r0, [r6 + 8*SIZEOF_PIXEL]
1230 lea r2, [r7 + 8*SIZEOF_PIXEL]
1231 call pixel_satd_8x8_internal2
1232 lea r0, [r6 + 16*SIZEOF_PIXEL]
1233 lea r2, [r7 + 16*SIZEOF_PIXEL]
1234 call pixel_satd_8x8_internal2
1235 lea r0, [r6 + 24*SIZEOF_PIXEL]
1236 lea r2, [r7 + 24*SIZEOF_PIXEL]
1237 call pixel_satd_8x8_internal2
; 32x8 built on pixel_satd_8x8_internal2 (!WIN64 variant, declared with a
; stack size of 0-gprsize).
; One helper call per 8-pixel-wide column. For columns 1-3, r0 is rebuilt
; from r6 and r2 is advanced by 8, 16 and 24 pixels.
; NOTE(review): r2 is presumably reloaded with its original value (from the
; reserved stack slot) just before each `add` - confirm.
1246 cglobal pixel_satd_32x8, 4,7,8,0-gprsize ;if !WIN64
1247 SATD_START_SSE2 m6, m7
1250 call pixel_satd_8x8_internal2
1251 lea r0, [r6 + 8*SIZEOF_PIXEL]
1253 add r2, 8*SIZEOF_PIXEL
1254 call pixel_satd_8x8_internal2
1255 lea r0, [r6 + 16*SIZEOF_PIXEL]
1257 add r2, 16*SIZEOF_PIXEL
1258 call pixel_satd_8x8_internal2
1259 lea r0, [r6 + 24*SIZEOF_PIXEL]
1261 add r2, 24*SIZEOF_PIXEL
1262 call pixel_satd_8x8_internal2
; 32x16 built on pixel_satd_8x8_internal2 (WIN64 + AVX variant).
; Two helper calls per 8-pixel-wide column. For columns 1-3, r0 and r2 are
; rebuilt from r6 and r7 plus 8, 16 and 24 pixels.
; NOTE(review): r6/r7 presumably hold the original pix1/pix2 pointers - confirm.
1273 cglobal pixel_satd_32x16, 4,8,14 ;if WIN64 && cpuflag(avx)
1274 SATD_START_SSE2 m6, m7
1277 call pixel_satd_8x8_internal2
1278 call pixel_satd_8x8_internal2
1279 lea r0, [r6 + 8*SIZEOF_PIXEL]
1280 lea r2, [r7 + 8*SIZEOF_PIXEL]
1281 call pixel_satd_8x8_internal2
1282 call pixel_satd_8x8_internal2
1283 lea r0, [r6 + 16*SIZEOF_PIXEL]
1284 lea r2, [r7 + 16*SIZEOF_PIXEL]
1285 call pixel_satd_8x8_internal2
1286 call pixel_satd_8x8_internal2
1287 lea r0, [r6 + 24*SIZEOF_PIXEL]
1288 lea r2, [r7 + 24*SIZEOF_PIXEL]
1289 call pixel_satd_8x8_internal2
1290 call pixel_satd_8x8_internal2
; 32x16 built on pixel_satd_8x8_internal2 (!WIN64 variant, declared with a
; stack size of 0-gprsize).
; Two helper calls per 8-pixel-wide column. For columns 1-3, r0 is rebuilt
; from r6 and r2 is advanced by 8, 16 and 24 pixels.
; NOTE(review): r2 is presumably reloaded with its original value just before
; each `add` - confirm.
1299 cglobal pixel_satd_32x16, 4,7,8,0-gprsize ;if !WIN64
1300 SATD_START_SSE2 m6, m7
1303 call pixel_satd_8x8_internal2
1304 call pixel_satd_8x8_internal2
1305 lea r0, [r6 + 8*SIZEOF_PIXEL]
1307 add r2, 8*SIZEOF_PIXEL
1308 call pixel_satd_8x8_internal2
1309 call pixel_satd_8x8_internal2
1310 lea r0, [r6 + 16*SIZEOF_PIXEL]
1312 add r2, 16*SIZEOF_PIXEL
1313 call pixel_satd_8x8_internal2
1314 call pixel_satd_8x8_internal2
1315 lea r0, [r6 + 24*SIZEOF_PIXEL]
1317 add r2, 24*SIZEOF_PIXEL
1318 call pixel_satd_8x8_internal2
1319 call pixel_satd_8x8_internal2
; 32x24 built on pixel_satd_8x8_internal2 (WIN64 + AVX variant).
; Three helper calls per 8-pixel-wide column. For columns 1-3, r0 and r2 are
; rebuilt from r6 and r7 plus 8, 16 and 24 pixels.
; NOTE(review): r6/r7 presumably hold the original pix1/pix2 pointers - confirm.
1330 cglobal pixel_satd_32x24, 4,8,14 ;if WIN64 && cpuflag(avx)
1331 SATD_START_SSE2 m6, m7
1334 call pixel_satd_8x8_internal2
1335 call pixel_satd_8x8_internal2
1336 call pixel_satd_8x8_internal2
1337 lea r0, [r6 + 8*SIZEOF_PIXEL]
1338 lea r2, [r7 + 8*SIZEOF_PIXEL]
1339 call pixel_satd_8x8_internal2
1340 call pixel_satd_8x8_internal2
1341 call pixel_satd_8x8_internal2
1342 lea r0, [r6 + 16*SIZEOF_PIXEL]
1343 lea r2, [r7 + 16*SIZEOF_PIXEL]
1344 call pixel_satd_8x8_internal2
1345 call pixel_satd_8x8_internal2
1346 call pixel_satd_8x8_internal2
1347 lea r0, [r6 + 24*SIZEOF_PIXEL]
1348 lea r2, [r7 + 24*SIZEOF_PIXEL]
1349 call pixel_satd_8x8_internal2
1350 call pixel_satd_8x8_internal2
1351 call pixel_satd_8x8_internal2
; 32x24 built on pixel_satd_8x8_internal2 (!WIN64 variant, declared with a
; stack size of 0-gprsize).
; Three helper calls per 8-pixel-wide column. For columns 1-3, r0 is rebuilt
; from r6 and r2 is advanced by 8, 16 and 24 pixels.
; NOTE(review): r2 is presumably reloaded with its original value just before
; each `add` - confirm.
1360 cglobal pixel_satd_32x24, 4,7,8,0-gprsize ;if !WIN64
1361 SATD_START_SSE2 m6, m7
1364 call pixel_satd_8x8_internal2
1365 call pixel_satd_8x8_internal2
1366 call pixel_satd_8x8_internal2
1367 lea r0, [r6 + 8*SIZEOF_PIXEL]
1369 add r2, 8*SIZEOF_PIXEL
1370 call pixel_satd_8x8_internal2
1371 call pixel_satd_8x8_internal2
1372 call pixel_satd_8x8_internal2
1373 lea r0, [r6 + 16*SIZEOF_PIXEL]
1375 add r2, 16*SIZEOF_PIXEL
1376 call pixel_satd_8x8_internal2
1377 call pixel_satd_8x8_internal2
1378 call pixel_satd_8x8_internal2
1379 lea r0, [r6 + 24*SIZEOF_PIXEL]
1381 add r2, 24*SIZEOF_PIXEL
1382 call pixel_satd_8x8_internal2
1383 call pixel_satd_8x8_internal2
1384 call pixel_satd_8x8_internal2
; 32x32 built on pixel_satd_8x8_internal2 (WIN64 + AVX variant).
; Four helper calls per 8-pixel-wide column. For columns 1-3, r0 and r2 are
; rebuilt from r6 and r7 plus 8, 16 and 24 pixels.
; NOTE(review): r6/r7 presumably hold the original pix1/pix2 pointers - confirm.
1395 cglobal pixel_satd_32x32, 4,8,14 ;if WIN64 && cpuflag(avx)
1396 SATD_START_SSE2 m6, m7
1399 call pixel_satd_8x8_internal2
1400 call pixel_satd_8x8_internal2
1401 call pixel_satd_8x8_internal2
1402 call pixel_satd_8x8_internal2
1403 lea r0, [r6 + 8*SIZEOF_PIXEL]
1404 lea r2, [r7 + 8*SIZEOF_PIXEL]
1405 call pixel_satd_8x8_internal2
1406 call pixel_satd_8x8_internal2
1407 call pixel_satd_8x8_internal2
1408 call pixel_satd_8x8_internal2
1409 lea r0, [r6 + 16*SIZEOF_PIXEL]
1410 lea r2, [r7 + 16*SIZEOF_PIXEL]
1411 call pixel_satd_8x8_internal2
1412 call pixel_satd_8x8_internal2
1413 call pixel_satd_8x8_internal2
1414 call pixel_satd_8x8_internal2
1415 lea r0, [r6 + 24*SIZEOF_PIXEL]
1416 lea r2, [r7 + 24*SIZEOF_PIXEL]
1417 call pixel_satd_8x8_internal2
1418 call pixel_satd_8x8_internal2
1419 call pixel_satd_8x8_internal2
1420 call pixel_satd_8x8_internal2
; 32x32 built on pixel_satd_8x8_internal2 (!WIN64 variant, declared with a
; stack size of 0-gprsize).
; Four helper calls per 8-pixel-wide column. For columns 1-3, r0 is rebuilt
; from r6 and r2 is advanced by 8, 16 and 24 pixels.
; NOTE(review): r2 is presumably reloaded with its original value just before
; each `add` - confirm.
1429 cglobal pixel_satd_32x32, 4,7,8,0-gprsize ;if !WIN64
1430 SATD_START_SSE2 m6, m7
1433 call pixel_satd_8x8_internal2
1434 call pixel_satd_8x8_internal2
1435 call pixel_satd_8x8_internal2
1436 call pixel_satd_8x8_internal2
1437 lea r0, [r6 + 8*SIZEOF_PIXEL]
1439 add r2, 8*SIZEOF_PIXEL
1440 call pixel_satd_8x8_internal2
1441 call pixel_satd_8x8_internal2
1442 call pixel_satd_8x8_internal2
1443 call pixel_satd_8x8_internal2
1444 lea r0, [r6 + 16*SIZEOF_PIXEL]
1446 add r2, 16*SIZEOF_PIXEL
1447 call pixel_satd_8x8_internal2
1448 call pixel_satd_8x8_internal2
1449 call pixel_satd_8x8_internal2
1450 call pixel_satd_8x8_internal2
1451 lea r0, [r6 + 24*SIZEOF_PIXEL]
1453 add r2, 24*SIZEOF_PIXEL
1454 call pixel_satd_8x8_internal2
1455 call pixel_satd_8x8_internal2
1456 call pixel_satd_8x8_internal2
1457 call pixel_satd_8x8_internal2
; 32x64 built on pixel_satd_8x8_internal2 (WIN64 + AVX variant).
; Eight helper calls per 8-pixel-wide column. For columns 1-3, r0 and r2 are
; rebuilt from r6 and r7 plus 8, 16 and 24 pixels.
; NOTE(review): r6/r7 presumably hold the original pix1/pix2 pointers - confirm.
1468 cglobal pixel_satd_32x64, 4,8,14 ;if WIN64 && cpuflag(avx)
1469 SATD_START_SSE2 m6, m7
1472 call pixel_satd_8x8_internal2
1473 call pixel_satd_8x8_internal2
1474 call pixel_satd_8x8_internal2
1475 call pixel_satd_8x8_internal2
1476 call pixel_satd_8x8_internal2
1477 call pixel_satd_8x8_internal2
1478 call pixel_satd_8x8_internal2
1479 call pixel_satd_8x8_internal2
1480 lea r0, [r6 + 8*SIZEOF_PIXEL]
1481 lea r2, [r7 + 8*SIZEOF_PIXEL]
1482 call pixel_satd_8x8_internal2
1483 call pixel_satd_8x8_internal2
1484 call pixel_satd_8x8_internal2
1485 call pixel_satd_8x8_internal2
1486 call pixel_satd_8x8_internal2
1487 call pixel_satd_8x8_internal2
1488 call pixel_satd_8x8_internal2
1489 call pixel_satd_8x8_internal2
1490 lea r0, [r6 + 16*SIZEOF_PIXEL]
1491 lea r2, [r7 + 16*SIZEOF_PIXEL]
1492 call pixel_satd_8x8_internal2
1493 call pixel_satd_8x8_internal2
1494 call pixel_satd_8x8_internal2
1495 call pixel_satd_8x8_internal2
1496 call pixel_satd_8x8_internal2
1497 call pixel_satd_8x8_internal2
1498 call pixel_satd_8x8_internal2
1499 call pixel_satd_8x8_internal2
1500 lea r0, [r6 + 24*SIZEOF_PIXEL]
1501 lea r2, [r7 + 24*SIZEOF_PIXEL]
1502 call pixel_satd_8x8_internal2
1503 call pixel_satd_8x8_internal2
1504 call pixel_satd_8x8_internal2
1505 call pixel_satd_8x8_internal2
1506 call pixel_satd_8x8_internal2
1507 call pixel_satd_8x8_internal2
1508 call pixel_satd_8x8_internal2
1509 call pixel_satd_8x8_internal2
; 32x64 built on pixel_satd_8x8_internal2 (!WIN64 variant, declared with a
; stack size of 0-gprsize).
; Eight helper calls per 8-pixel-wide column. For columns 1-3, r0 is rebuilt
; from r6 and r2 is advanced by 8, 16 and 24 pixels.
; NOTE(review): r2 is presumably reloaded with its original value just before
; each `add` - confirm.
1518 cglobal pixel_satd_32x64, 4,7,8,0-gprsize ;if !WIN64
1519 SATD_START_SSE2 m6, m7
1522 call pixel_satd_8x8_internal2
1523 call pixel_satd_8x8_internal2
1524 call pixel_satd_8x8_internal2
1525 call pixel_satd_8x8_internal2
1526 call pixel_satd_8x8_internal2
1527 call pixel_satd_8x8_internal2
1528 call pixel_satd_8x8_internal2
1529 call pixel_satd_8x8_internal2
1530 lea r0, [r6 + 8*SIZEOF_PIXEL]
1532 add r2, 8*SIZEOF_PIXEL
1533 call pixel_satd_8x8_internal2
1534 call pixel_satd_8x8_internal2
1535 call pixel_satd_8x8_internal2
1536 call pixel_satd_8x8_internal2
1537 call pixel_satd_8x8_internal2
1538 call pixel_satd_8x8_internal2
1539 call pixel_satd_8x8_internal2
1540 call pixel_satd_8x8_internal2
1541 lea r0, [r6 + 16*SIZEOF_PIXEL]
1543 add r2, 16*SIZEOF_PIXEL
1544 call pixel_satd_8x8_internal2
1545 call pixel_satd_8x8_internal2
1546 call pixel_satd_8x8_internal2
1547 call pixel_satd_8x8_internal2
1548 call pixel_satd_8x8_internal2
1549 call pixel_satd_8x8_internal2
1550 call pixel_satd_8x8_internal2
1551 call pixel_satd_8x8_internal2
1552 lea r0, [r6 + 24*SIZEOF_PIXEL]
1554 add r2, 24*SIZEOF_PIXEL
1555 call pixel_satd_8x8_internal2
1556 call pixel_satd_8x8_internal2
1557 call pixel_satd_8x8_internal2
1558 call pixel_satd_8x8_internal2
1559 call pixel_satd_8x8_internal2
1560 call pixel_satd_8x8_internal2
1561 call pixel_satd_8x8_internal2
1562 call pixel_satd_8x8_internal2
; 48x64 built on pixel_satd_8x8_internal2 (WIN64 + AVX variant).
; Eight helper calls per 8-pixel-wide column, six columns in all. For
; columns 1-5, r0 and r2 are rebuilt from r6 and r7 plus 8, 16, 24, 32 and
; 40 pixels.
; NOTE(review): r6/r7 presumably hold the original pix1/pix2 pointers - confirm.
1573 cglobal pixel_satd_48x64, 4,8,14 ;if WIN64 && cpuflag(avx)
1574 SATD_START_SSE2 m6, m7
1577 call pixel_satd_8x8_internal2
1578 call pixel_satd_8x8_internal2
1579 call pixel_satd_8x8_internal2
1580 call pixel_satd_8x8_internal2
1581 call pixel_satd_8x8_internal2
1582 call pixel_satd_8x8_internal2
1583 call pixel_satd_8x8_internal2
1584 call pixel_satd_8x8_internal2
1585 lea r0, [r6 + 8*SIZEOF_PIXEL]
1586 lea r2, [r7 + 8*SIZEOF_PIXEL]
1587 call pixel_satd_8x8_internal2
1588 call pixel_satd_8x8_internal2
1589 call pixel_satd_8x8_internal2
1590 call pixel_satd_8x8_internal2
1591 call pixel_satd_8x8_internal2
1592 call pixel_satd_8x8_internal2
1593 call pixel_satd_8x8_internal2
1594 call pixel_satd_8x8_internal2
1595 lea r0, [r6 + 16*SIZEOF_PIXEL]
1596 lea r2, [r7 + 16*SIZEOF_PIXEL]
1597 call pixel_satd_8x8_internal2
1598 call pixel_satd_8x8_internal2
1599 call pixel_satd_8x8_internal2
1600 call pixel_satd_8x8_internal2
1601 call pixel_satd_8x8_internal2
1602 call pixel_satd_8x8_internal2
1603 call pixel_satd_8x8_internal2
1604 call pixel_satd_8x8_internal2
1605 lea r0, [r6 + 24*SIZEOF_PIXEL]
1606 lea r2, [r7 + 24*SIZEOF_PIXEL]
1607 call pixel_satd_8x8_internal2
1608 call pixel_satd_8x8_internal2
1609 call pixel_satd_8x8_internal2
1610 call pixel_satd_8x8_internal2
1611 call pixel_satd_8x8_internal2
1612 call pixel_satd_8x8_internal2
1613 call pixel_satd_8x8_internal2
1614 call pixel_satd_8x8_internal2
1615 lea r0, [r6 + 32*SIZEOF_PIXEL]
1616 lea r2, [r7 + 32*SIZEOF_PIXEL]
1617 call pixel_satd_8x8_internal2
1618 call pixel_satd_8x8_internal2
1619 call pixel_satd_8x8_internal2
1620 call pixel_satd_8x8_internal2
1621 call pixel_satd_8x8_internal2
1622 call pixel_satd_8x8_internal2
1623 call pixel_satd_8x8_internal2
1624 call pixel_satd_8x8_internal2
1625 lea r0, [r6 + 40*SIZEOF_PIXEL]
1626 lea r2, [r7 + 40*SIZEOF_PIXEL]
1627 call pixel_satd_8x8_internal2
1628 call pixel_satd_8x8_internal2
1629 call pixel_satd_8x8_internal2
1630 call pixel_satd_8x8_internal2
1631 call pixel_satd_8x8_internal2
1632 call pixel_satd_8x8_internal2
1633 call pixel_satd_8x8_internal2
1634 call pixel_satd_8x8_internal2
; pixel_satd_48x64, variant tagged "!WIN64" by the author. It is declared
; with 4,7,8,0-gprsize where the WIN64 variant uses 4,8,14.
; Visible structure: six column strips at offsets 0, 8, 16, 24, 32 and 40
; pixels, eight calls to pixel_satd_8x8_internal2 per strip.
; r0 is rebuilt from r6 for every strip; r2 is only adjusted with an add.
; NOTE(review): the embedded numbering skips one line between each lea/add
; pair (e.g. 1656), so the instruction that gives r2 its base value before
; the add is not visible here -- presumably a reload of a saved copy; confirm
; in the full file.
1643 cglobal pixel_satd_48x64, 4,7,8,0-gprsize ;if !WIN64
1644 SATD_START_SSE2 m6, m7
; strip at column offset 0
1647 call pixel_satd_8x8_internal2
1648 call pixel_satd_8x8_internal2
1649 call pixel_satd_8x8_internal2
1650 call pixel_satd_8x8_internal2
1651 call pixel_satd_8x8_internal2
1652 call pixel_satd_8x8_internal2
1653 call pixel_satd_8x8_internal2
1654 call pixel_satd_8x8_internal2
; strip at column offset 8
1655 lea r0, [r6 + 8*SIZEOF_PIXEL]
1657 add r2,8*SIZEOF_PIXEL
1658 call pixel_satd_8x8_internal2
1659 call pixel_satd_8x8_internal2
1660 call pixel_satd_8x8_internal2
1661 call pixel_satd_8x8_internal2
1662 call pixel_satd_8x8_internal2
1663 call pixel_satd_8x8_internal2
1664 call pixel_satd_8x8_internal2
1665 call pixel_satd_8x8_internal2
; strip at column offset 16
1666 lea r0, [r6 + 16*SIZEOF_PIXEL]
1668 add r2,16*SIZEOF_PIXEL
1669 call pixel_satd_8x8_internal2
1670 call pixel_satd_8x8_internal2
1671 call pixel_satd_8x8_internal2
1672 call pixel_satd_8x8_internal2
1673 call pixel_satd_8x8_internal2
1674 call pixel_satd_8x8_internal2
1675 call pixel_satd_8x8_internal2
1676 call pixel_satd_8x8_internal2
; strip at column offset 24
1677 lea r0, [r6 + 24*SIZEOF_PIXEL]
1679 add r2,24*SIZEOF_PIXEL
1680 call pixel_satd_8x8_internal2
1681 call pixel_satd_8x8_internal2
1682 call pixel_satd_8x8_internal2
1683 call pixel_satd_8x8_internal2
1684 call pixel_satd_8x8_internal2
1685 call pixel_satd_8x8_internal2
1686 call pixel_satd_8x8_internal2
1687 call pixel_satd_8x8_internal2
; strip at column offset 32
1688 lea r0, [r6 + 32*SIZEOF_PIXEL]
1690 add r2,32*SIZEOF_PIXEL
1691 call pixel_satd_8x8_internal2
1692 call pixel_satd_8x8_internal2
1693 call pixel_satd_8x8_internal2
1694 call pixel_satd_8x8_internal2
1695 call pixel_satd_8x8_internal2
1696 call pixel_satd_8x8_internal2
1697 call pixel_satd_8x8_internal2
1698 call pixel_satd_8x8_internal2
; strip at column offset 40
1699 lea r0, [r6 + 40*SIZEOF_PIXEL]
1701 add r2,40*SIZEOF_PIXEL
1702 call pixel_satd_8x8_internal2
1703 call pixel_satd_8x8_internal2
1704 call pixel_satd_8x8_internal2
1705 call pixel_satd_8x8_internal2
1706 call pixel_satd_8x8_internal2
1707 call pixel_satd_8x8_internal2
1708 call pixel_satd_8x8_internal2
1709 call pixel_satd_8x8_internal2
; pixel_satd_64x16, variant tagged "WIN64 && cpuflag(avx)" by the author.
; Visible structure: eight column strips at offsets 0, 8, ..., 56 pixels,
; two calls to pixel_satd_8x8_internal2 per strip (16 calls in total).
; r0/r2 are rebuilt from r6/r7 plus the strip offset before each later strip.
; NOTE(review): the setup of r6/r7 and the final reduction are not visible in
; this excerpt -- presumably r6/r7 keep the entry values of r0/r2; confirm.
1721 cglobal pixel_satd_64x16, 4,8,14 ;if WIN64 && cpuflag(avx)
1722 SATD_START_SSE2 m6, m7
; strip at column offset 0
1725 call pixel_satd_8x8_internal2
1726 call pixel_satd_8x8_internal2
; strip at column offset 8
1727 lea r0, [r6 + 8*SIZEOF_PIXEL]
1728 lea r2, [r7 + 8*SIZEOF_PIXEL]
1729 call pixel_satd_8x8_internal2
1730 call pixel_satd_8x8_internal2
; strip at column offset 16
1731 lea r0, [r6 + 16*SIZEOF_PIXEL]
1732 lea r2, [r7 + 16*SIZEOF_PIXEL]
1733 call pixel_satd_8x8_internal2
1734 call pixel_satd_8x8_internal2
; strip at column offset 24
1735 lea r0, [r6 + 24*SIZEOF_PIXEL]
1736 lea r2, [r7 + 24*SIZEOF_PIXEL]
1737 call pixel_satd_8x8_internal2
1738 call pixel_satd_8x8_internal2
; strip at column offset 32
1739 lea r0, [r6 + 32*SIZEOF_PIXEL]
1740 lea r2, [r7 + 32*SIZEOF_PIXEL]
1741 call pixel_satd_8x8_internal2
1742 call pixel_satd_8x8_internal2
; strip at column offset 40
1743 lea r0, [r6 + 40*SIZEOF_PIXEL]
1744 lea r2, [r7 + 40*SIZEOF_PIXEL]
1745 call pixel_satd_8x8_internal2
1746 call pixel_satd_8x8_internal2
; strip at column offset 48
1747 lea r0, [r6 + 48*SIZEOF_PIXEL]
1748 lea r2, [r7 + 48*SIZEOF_PIXEL]
1749 call pixel_satd_8x8_internal2
1750 call pixel_satd_8x8_internal2
; strip at column offset 56
1751 lea r0, [r6 + 56*SIZEOF_PIXEL]
1752 lea r2, [r7 + 56*SIZEOF_PIXEL]
1753 call pixel_satd_8x8_internal2
1754 call pixel_satd_8x8_internal2
; pixel_satd_64x16, variant tagged "!WIN64" by the author (declared with
; 4,7,8,0-gprsize).
; Visible structure: eight column strips at offsets 0, 8, ..., 56 pixels,
; two calls to pixel_satd_8x8_internal2 per strip.
; r0 is rebuilt from r6 for every strip; r2 is only adjusted with an add.
; NOTE(review): the embedded numbering skips one line between each lea/add
; pair (e.g. 1770); the instruction giving r2 its base value is not visible
; here -- presumably a reload of a saved copy; confirm in the full file.
1763 cglobal pixel_satd_64x16, 4,7,8,0-gprsize ;if !WIN64
1764 SATD_START_SSE2 m6, m7
; strip at column offset 0
1767 call pixel_satd_8x8_internal2
1768 call pixel_satd_8x8_internal2
; strip at column offset 8
1769 lea r0, [r6 + 8*SIZEOF_PIXEL]
1771 add r2,8*SIZEOF_PIXEL
1772 call pixel_satd_8x8_internal2
1773 call pixel_satd_8x8_internal2
; strip at column offset 16
1774 lea r0, [r6 + 16*SIZEOF_PIXEL]
1776 add r2,16*SIZEOF_PIXEL
1777 call pixel_satd_8x8_internal2
1778 call pixel_satd_8x8_internal2
; strip at column offset 24
1779 lea r0, [r6 + 24*SIZEOF_PIXEL]
1781 add r2,24*SIZEOF_PIXEL
1782 call pixel_satd_8x8_internal2
1783 call pixel_satd_8x8_internal2
; strip at column offset 32
1784 lea r0, [r6 + 32*SIZEOF_PIXEL]
1786 add r2,32*SIZEOF_PIXEL
1787 call pixel_satd_8x8_internal2
1788 call pixel_satd_8x8_internal2
; strip at column offset 40
1789 lea r0, [r6 + 40*SIZEOF_PIXEL]
1791 add r2,40*SIZEOF_PIXEL
1792 call pixel_satd_8x8_internal2
1793 call pixel_satd_8x8_internal2
; strip at column offset 48
1794 lea r0, [r6 + 48*SIZEOF_PIXEL]
1796 add r2,48*SIZEOF_PIXEL
1797 call pixel_satd_8x8_internal2
1798 call pixel_satd_8x8_internal2
; strip at column offset 56
1799 lea r0, [r6 + 56*SIZEOF_PIXEL]
1801 add r2,56*SIZEOF_PIXEL
1802 call pixel_satd_8x8_internal2
1803 call pixel_satd_8x8_internal2
; pixel_satd_64x32, variant tagged "WIN64 && cpuflag(avx)" by the author.
; Visible structure: eight column strips at offsets 0, 8, ..., 56 pixels,
; four calls to pixel_satd_8x8_internal2 per strip (32 calls in total).
; r0/r2 are rebuilt from r6/r7 plus the strip offset before each later strip.
; NOTE(review): the setup of r6/r7 and the final reduction are not visible in
; this excerpt -- presumably r6/r7 keep the entry values of r0/r2; confirm.
1814 cglobal pixel_satd_64x32, 4,8,14 ;if WIN64 && cpuflag(avx)
1815 SATD_START_SSE2 m6, m7
; strip at column offset 0
1818 call pixel_satd_8x8_internal2
1819 call pixel_satd_8x8_internal2
1820 call pixel_satd_8x8_internal2
1821 call pixel_satd_8x8_internal2
; strip at column offset 8
1822 lea r0, [r6 + 8*SIZEOF_PIXEL]
1823 lea r2, [r7 + 8*SIZEOF_PIXEL]
1824 call pixel_satd_8x8_internal2
1825 call pixel_satd_8x8_internal2
1826 call pixel_satd_8x8_internal2
1827 call pixel_satd_8x8_internal2
; strip at column offset 16
1828 lea r0, [r6 + 16*SIZEOF_PIXEL]
1829 lea r2, [r7 + 16*SIZEOF_PIXEL]
1830 call pixel_satd_8x8_internal2
1831 call pixel_satd_8x8_internal2
1832 call pixel_satd_8x8_internal2
1833 call pixel_satd_8x8_internal2
; strip at column offset 24
1834 lea r0, [r6 + 24*SIZEOF_PIXEL]
1835 lea r2, [r7 + 24*SIZEOF_PIXEL]
1836 call pixel_satd_8x8_internal2
1837 call pixel_satd_8x8_internal2
1838 call pixel_satd_8x8_internal2
1839 call pixel_satd_8x8_internal2
; strip at column offset 32
1840 lea r0, [r6 + 32*SIZEOF_PIXEL]
1841 lea r2, [r7 + 32*SIZEOF_PIXEL]
1842 call pixel_satd_8x8_internal2
1843 call pixel_satd_8x8_internal2
1844 call pixel_satd_8x8_internal2
1845 call pixel_satd_8x8_internal2
; strip at column offset 40
1846 lea r0, [r6 + 40*SIZEOF_PIXEL]
1847 lea r2, [r7 + 40*SIZEOF_PIXEL]
1848 call pixel_satd_8x8_internal2
1849 call pixel_satd_8x8_internal2
1850 call pixel_satd_8x8_internal2
1851 call pixel_satd_8x8_internal2
; strip at column offset 48
1852 lea r0, [r6 + 48*SIZEOF_PIXEL]
1853 lea r2, [r7 + 48*SIZEOF_PIXEL]
1854 call pixel_satd_8x8_internal2
1855 call pixel_satd_8x8_internal2
1856 call pixel_satd_8x8_internal2
1857 call pixel_satd_8x8_internal2
; strip at column offset 56
1858 lea r0, [r6 + 56*SIZEOF_PIXEL]
1859 lea r2, [r7 + 56*SIZEOF_PIXEL]
1860 call pixel_satd_8x8_internal2
1861 call pixel_satd_8x8_internal2
1862 call pixel_satd_8x8_internal2
1863 call pixel_satd_8x8_internal2
; pixel_satd_64x32, variant tagged "!WIN64" by the author (declared with
; 4,7,8,0-gprsize).
; Visible structure: eight column strips at offsets 0, 8, ..., 56 pixels,
; four calls to pixel_satd_8x8_internal2 per strip.
; r0 is rebuilt from r6 for every strip; r2 is only adjusted with an add.
; NOTE(review): the embedded numbering skips one line between each lea/add
; pair (e.g. 1881); the instruction giving r2 its base value is not visible
; here -- presumably a reload of a saved copy; confirm in the full file.
1872 cglobal pixel_satd_64x32, 4,7,8,0-gprsize ;if !WIN64
1873 SATD_START_SSE2 m6, m7
; strip at column offset 0
1876 call pixel_satd_8x8_internal2
1877 call pixel_satd_8x8_internal2
1878 call pixel_satd_8x8_internal2
1879 call pixel_satd_8x8_internal2
; strip at column offset 8
1880 lea r0, [r6 + 8*SIZEOF_PIXEL]
1882 add r2, 8*SIZEOF_PIXEL
1883 call pixel_satd_8x8_internal2
1884 call pixel_satd_8x8_internal2
1885 call pixel_satd_8x8_internal2
1886 call pixel_satd_8x8_internal2
; strip at column offset 16
1887 lea r0, [r6 + 16*SIZEOF_PIXEL]
1889 add r2, 16*SIZEOF_PIXEL
1890 call pixel_satd_8x8_internal2
1891 call pixel_satd_8x8_internal2
1892 call pixel_satd_8x8_internal2
1893 call pixel_satd_8x8_internal2
; strip at column offset 24
1894 lea r0, [r6 + 24*SIZEOF_PIXEL]
1896 add r2, 24*SIZEOF_PIXEL
1897 call pixel_satd_8x8_internal2
1898 call pixel_satd_8x8_internal2
1899 call pixel_satd_8x8_internal2
1900 call pixel_satd_8x8_internal2
; strip at column offset 32
1901 lea r0, [r6 + 32*SIZEOF_PIXEL]
1903 add r2, 32*SIZEOF_PIXEL
1904 call pixel_satd_8x8_internal2
1905 call pixel_satd_8x8_internal2
1906 call pixel_satd_8x8_internal2
1907 call pixel_satd_8x8_internal2
; strip at column offset 40
1908 lea r0, [r6 + 40*SIZEOF_PIXEL]
1910 add r2, 40*SIZEOF_PIXEL
1911 call pixel_satd_8x8_internal2
1912 call pixel_satd_8x8_internal2
1913 call pixel_satd_8x8_internal2
1914 call pixel_satd_8x8_internal2
; strip at column offset 48
1915 lea r0, [r6 + 48*SIZEOF_PIXEL]
1917 add r2, 48*SIZEOF_PIXEL
1918 call pixel_satd_8x8_internal2
1919 call pixel_satd_8x8_internal2
1920 call pixel_satd_8x8_internal2
1921 call pixel_satd_8x8_internal2
; strip at column offset 56
1922 lea r0, [r6 + 56*SIZEOF_PIXEL]
1924 add r2, 56*SIZEOF_PIXEL
1925 call pixel_satd_8x8_internal2
1926 call pixel_satd_8x8_internal2
1927 call pixel_satd_8x8_internal2
1928 call pixel_satd_8x8_internal2
; pixel_satd_64x48, variant tagged "WIN64 && cpuflag(avx)" by the author.
; Visible structure: eight column strips at offsets 0, 8, ..., 56 pixels,
; six calls to pixel_satd_8x8_internal2 per strip (48 calls in total).
; r0/r2 are rebuilt from r6/r7 plus the strip offset before each later strip.
; NOTE(review): the setup of r6/r7 and the final reduction are not visible in
; this excerpt -- presumably r6/r7 keep the entry values of r0/r2; confirm.
1939 cglobal pixel_satd_64x48, 4,8,14 ;if WIN64 && cpuflag(avx)
1940 SATD_START_SSE2 m6, m7
; strip at column offset 0
1943 call pixel_satd_8x8_internal2
1944 call pixel_satd_8x8_internal2
1945 call pixel_satd_8x8_internal2
1946 call pixel_satd_8x8_internal2
1947 call pixel_satd_8x8_internal2
1948 call pixel_satd_8x8_internal2
; strip at column offset 8
1949 lea r0, [r6 + 8*SIZEOF_PIXEL]
1950 lea r2, [r7 + 8*SIZEOF_PIXEL]
1951 call pixel_satd_8x8_internal2
1952 call pixel_satd_8x8_internal2
1953 call pixel_satd_8x8_internal2
1954 call pixel_satd_8x8_internal2
1955 call pixel_satd_8x8_internal2
1956 call pixel_satd_8x8_internal2
; strip at column offset 16
1957 lea r0, [r6 + 16*SIZEOF_PIXEL]
1958 lea r2, [r7 + 16*SIZEOF_PIXEL]
1959 call pixel_satd_8x8_internal2
1960 call pixel_satd_8x8_internal2
1961 call pixel_satd_8x8_internal2
1962 call pixel_satd_8x8_internal2
1963 call pixel_satd_8x8_internal2
1964 call pixel_satd_8x8_internal2
; strip at column offset 24
1965 lea r0, [r6 + 24*SIZEOF_PIXEL]
1966 lea r2, [r7 + 24*SIZEOF_PIXEL]
1967 call pixel_satd_8x8_internal2
1968 call pixel_satd_8x8_internal2
1969 call pixel_satd_8x8_internal2
1970 call pixel_satd_8x8_internal2
1971 call pixel_satd_8x8_internal2
1972 call pixel_satd_8x8_internal2
; strip at column offset 32
1973 lea r0, [r6 + 32*SIZEOF_PIXEL]
1974 lea r2, [r7 + 32*SIZEOF_PIXEL]
1975 call pixel_satd_8x8_internal2
1976 call pixel_satd_8x8_internal2
1977 call pixel_satd_8x8_internal2
1978 call pixel_satd_8x8_internal2
1979 call pixel_satd_8x8_internal2
1980 call pixel_satd_8x8_internal2
; strip at column offset 40
1981 lea r0, [r6 + 40*SIZEOF_PIXEL]
1982 lea r2, [r7 + 40*SIZEOF_PIXEL]
1983 call pixel_satd_8x8_internal2
1984 call pixel_satd_8x8_internal2
1985 call pixel_satd_8x8_internal2
1986 call pixel_satd_8x8_internal2
1987 call pixel_satd_8x8_internal2
1988 call pixel_satd_8x8_internal2
; strip at column offset 48
1989 lea r0, [r6 + 48*SIZEOF_PIXEL]
1990 lea r2, [r7 + 48*SIZEOF_PIXEL]
1991 call pixel_satd_8x8_internal2
1992 call pixel_satd_8x8_internal2
1993 call pixel_satd_8x8_internal2
1994 call pixel_satd_8x8_internal2
1995 call pixel_satd_8x8_internal2
1996 call pixel_satd_8x8_internal2
; strip at column offset 56
1997 lea r0, [r6 + 56*SIZEOF_PIXEL]
1998 lea r2, [r7 + 56*SIZEOF_PIXEL]
1999 call pixel_satd_8x8_internal2
2000 call pixel_satd_8x8_internal2
2001 call pixel_satd_8x8_internal2
2002 call pixel_satd_8x8_internal2
2003 call pixel_satd_8x8_internal2
2004 call pixel_satd_8x8_internal2
; pixel_satd_64x48, variant tagged "!WIN64" by the author (declared with
; 4,7,8,0-gprsize).
; Visible structure: eight column strips at offsets 0, 8, ..., 56 pixels,
; six calls to pixel_satd_8x8_internal2 per strip.
; r0 is rebuilt from r6 for every strip; r2 is only adjusted with an add.
; NOTE(review): the embedded numbering skips one line between each lea/add
; pair (e.g. 2024); the instruction giving r2 its base value is not visible
; here -- presumably a reload of a saved copy; confirm in the full file.
2013 cglobal pixel_satd_64x48, 4,7,8,0-gprsize ;if !WIN64
2014 SATD_START_SSE2 m6, m7
; strip at column offset 0
2017 call pixel_satd_8x8_internal2
2018 call pixel_satd_8x8_internal2
2019 call pixel_satd_8x8_internal2
2020 call pixel_satd_8x8_internal2
2021 call pixel_satd_8x8_internal2
2022 call pixel_satd_8x8_internal2
; strip at column offset 8
2023 lea r0, [r6 + 8*SIZEOF_PIXEL]
2025 add r2, 8*SIZEOF_PIXEL
2026 call pixel_satd_8x8_internal2
2027 call pixel_satd_8x8_internal2
2028 call pixel_satd_8x8_internal2
2029 call pixel_satd_8x8_internal2
2030 call pixel_satd_8x8_internal2
2031 call pixel_satd_8x8_internal2
; strip at column offset 16
2032 lea r0, [r6 + 16*SIZEOF_PIXEL]
2034 add r2, 16*SIZEOF_PIXEL
2035 call pixel_satd_8x8_internal2
2036 call pixel_satd_8x8_internal2
2037 call pixel_satd_8x8_internal2
2038 call pixel_satd_8x8_internal2
2039 call pixel_satd_8x8_internal2
2040 call pixel_satd_8x8_internal2
; strip at column offset 24
2041 lea r0, [r6 + 24*SIZEOF_PIXEL]
2043 add r2, 24*SIZEOF_PIXEL
2044 call pixel_satd_8x8_internal2
2045 call pixel_satd_8x8_internal2
2046 call pixel_satd_8x8_internal2
2047 call pixel_satd_8x8_internal2
2048 call pixel_satd_8x8_internal2
2049 call pixel_satd_8x8_internal2
; strip at column offset 32
2050 lea r0, [r6 + 32*SIZEOF_PIXEL]
2052 add r2, 32*SIZEOF_PIXEL
2053 call pixel_satd_8x8_internal2
2054 call pixel_satd_8x8_internal2
2055 call pixel_satd_8x8_internal2
2056 call pixel_satd_8x8_internal2
2057 call pixel_satd_8x8_internal2
2058 call pixel_satd_8x8_internal2
; strip at column offset 40
2059 lea r0, [r6 + 40*SIZEOF_PIXEL]
2061 add r2, 40*SIZEOF_PIXEL
2062 call pixel_satd_8x8_internal2
2063 call pixel_satd_8x8_internal2
2064 call pixel_satd_8x8_internal2
2065 call pixel_satd_8x8_internal2
2066 call pixel_satd_8x8_internal2
2067 call pixel_satd_8x8_internal2
; strip at column offset 48
2068 lea r0, [r6 + 48*SIZEOF_PIXEL]
2070 add r2, 48*SIZEOF_PIXEL
2071 call pixel_satd_8x8_internal2
2072 call pixel_satd_8x8_internal2
2073 call pixel_satd_8x8_internal2
2074 call pixel_satd_8x8_internal2
2075 call pixel_satd_8x8_internal2
2076 call pixel_satd_8x8_internal2
; strip at column offset 56
2077 lea r0, [r6 + 56*SIZEOF_PIXEL]
2079 add r2, 56*SIZEOF_PIXEL
2080 call pixel_satd_8x8_internal2
2081 call pixel_satd_8x8_internal2
2082 call pixel_satd_8x8_internal2
2083 call pixel_satd_8x8_internal2
2084 call pixel_satd_8x8_internal2
2085 call pixel_satd_8x8_internal2
; pixel_satd_64x64, variant tagged "WIN64 && cpuflag(avx)" by the author.
; Visible structure: eight column strips at offsets 0, 8, ..., 56 pixels,
; eight calls to pixel_satd_8x8_internal2 per strip (64 calls in total).
; r0/r2 are rebuilt from r6/r7 plus the strip offset before each later strip.
; NOTE(review): the setup of r6/r7 and the final reduction are not visible in
; this excerpt -- presumably r6/r7 keep the entry values of r0/r2; confirm.
2096 cglobal pixel_satd_64x64, 4,8,14 ;if WIN64 && cpuflag(avx)
2097 SATD_START_SSE2 m6, m7
; strip at column offset 0
2100 call pixel_satd_8x8_internal2
2101 call pixel_satd_8x8_internal2
2102 call pixel_satd_8x8_internal2
2103 call pixel_satd_8x8_internal2
2104 call pixel_satd_8x8_internal2
2105 call pixel_satd_8x8_internal2
2106 call pixel_satd_8x8_internal2
2107 call pixel_satd_8x8_internal2
; strip at column offset 8
2108 lea r0, [r6 + 8*SIZEOF_PIXEL]
2109 lea r2, [r7 + 8*SIZEOF_PIXEL]
2110 call pixel_satd_8x8_internal2
2111 call pixel_satd_8x8_internal2
2112 call pixel_satd_8x8_internal2
2113 call pixel_satd_8x8_internal2
2114 call pixel_satd_8x8_internal2
2115 call pixel_satd_8x8_internal2
2116 call pixel_satd_8x8_internal2
2117 call pixel_satd_8x8_internal2
; strip at column offset 16
2118 lea r0, [r6 + 16*SIZEOF_PIXEL]
2119 lea r2, [r7 + 16*SIZEOF_PIXEL]
2120 call pixel_satd_8x8_internal2
2121 call pixel_satd_8x8_internal2
2122 call pixel_satd_8x8_internal2
2123 call pixel_satd_8x8_internal2
2124 call pixel_satd_8x8_internal2
2125 call pixel_satd_8x8_internal2
2126 call pixel_satd_8x8_internal2
2127 call pixel_satd_8x8_internal2
; strip at column offset 24
2128 lea r0, [r6 + 24*SIZEOF_PIXEL]
2129 lea r2, [r7 + 24*SIZEOF_PIXEL]
2130 call pixel_satd_8x8_internal2
2131 call pixel_satd_8x8_internal2
2132 call pixel_satd_8x8_internal2
2133 call pixel_satd_8x8_internal2
2134 call pixel_satd_8x8_internal2
2135 call pixel_satd_8x8_internal2
2136 call pixel_satd_8x8_internal2
2137 call pixel_satd_8x8_internal2
; strip at column offset 32
2138 lea r0, [r6 + 32*SIZEOF_PIXEL]
2139 lea r2, [r7 + 32*SIZEOF_PIXEL]
2140 call pixel_satd_8x8_internal2
2141 call pixel_satd_8x8_internal2
2142 call pixel_satd_8x8_internal2
2143 call pixel_satd_8x8_internal2
2144 call pixel_satd_8x8_internal2
2145 call pixel_satd_8x8_internal2
2146 call pixel_satd_8x8_internal2
2147 call pixel_satd_8x8_internal2
; strip at column offset 40
2148 lea r0, [r6 + 40*SIZEOF_PIXEL]
2149 lea r2, [r7 + 40*SIZEOF_PIXEL]
2150 call pixel_satd_8x8_internal2
2151 call pixel_satd_8x8_internal2
2152 call pixel_satd_8x8_internal2
2153 call pixel_satd_8x8_internal2
2154 call pixel_satd_8x8_internal2
2155 call pixel_satd_8x8_internal2
2156 call pixel_satd_8x8_internal2
2157 call pixel_satd_8x8_internal2
; strip at column offset 48
2158 lea r0, [r6 + 48*SIZEOF_PIXEL]
2159 lea r2, [r7 + 48*SIZEOF_PIXEL]
2160 call pixel_satd_8x8_internal2
2161 call pixel_satd_8x8_internal2
2162 call pixel_satd_8x8_internal2
2163 call pixel_satd_8x8_internal2
2164 call pixel_satd_8x8_internal2
2165 call pixel_satd_8x8_internal2
2166 call pixel_satd_8x8_internal2
2167 call pixel_satd_8x8_internal2
; strip at column offset 56
2168 lea r0, [r6 + 56*SIZEOF_PIXEL]
2169 lea r2, [r7 + 56*SIZEOF_PIXEL]
2170 call pixel_satd_8x8_internal2
2171 call pixel_satd_8x8_internal2
2172 call pixel_satd_8x8_internal2
2173 call pixel_satd_8x8_internal2
2174 call pixel_satd_8x8_internal2
2175 call pixel_satd_8x8_internal2
2176 call pixel_satd_8x8_internal2
2177 call pixel_satd_8x8_internal2
; pixel_satd_64x64, variant tagged "!WIN64" by the author (declared with
; 4,7,8,0-gprsize).
; Visible structure: eight column strips at offsets 0, 8, ..., 56 pixels,
; eight calls to pixel_satd_8x8_internal2 per strip.
; r0 is rebuilt from r6 for every strip; r2 is only adjusted with an add.
; NOTE(review): the embedded numbering skips one line between each lea/add
; pair (e.g. 2199); the instruction giving r2 its base value is not visible
; here -- presumably a reload of a saved copy; confirm in the full file.
2186 cglobal pixel_satd_64x64, 4,7,8,0-gprsize ;if !WIN64
2187 SATD_START_SSE2 m6, m7
; strip at column offset 0
2190 call pixel_satd_8x8_internal2
2191 call pixel_satd_8x8_internal2
2192 call pixel_satd_8x8_internal2
2193 call pixel_satd_8x8_internal2
2194 call pixel_satd_8x8_internal2
2195 call pixel_satd_8x8_internal2
2196 call pixel_satd_8x8_internal2
2197 call pixel_satd_8x8_internal2
; strip at column offset 8
2198 lea r0, [r6 + 8*SIZEOF_PIXEL]
2200 add r2, 8*SIZEOF_PIXEL
2201 call pixel_satd_8x8_internal2
2202 call pixel_satd_8x8_internal2
2203 call pixel_satd_8x8_internal2
2204 call pixel_satd_8x8_internal2
2205 call pixel_satd_8x8_internal2
2206 call pixel_satd_8x8_internal2
2207 call pixel_satd_8x8_internal2
2208 call pixel_satd_8x8_internal2
; strip at column offset 16
2209 lea r0, [r6 + 16*SIZEOF_PIXEL]
2211 add r2, 16*SIZEOF_PIXEL
2212 call pixel_satd_8x8_internal2
2213 call pixel_satd_8x8_internal2
2214 call pixel_satd_8x8_internal2
2215 call pixel_satd_8x8_internal2
2216 call pixel_satd_8x8_internal2
2217 call pixel_satd_8x8_internal2
2218 call pixel_satd_8x8_internal2
2219 call pixel_satd_8x8_internal2
; strip at column offset 24
2220 lea r0, [r6 + 24*SIZEOF_PIXEL]
2222 add r2, 24*SIZEOF_PIXEL
2223 call pixel_satd_8x8_internal2
2224 call pixel_satd_8x8_internal2
2225 call pixel_satd_8x8_internal2
2226 call pixel_satd_8x8_internal2
2227 call pixel_satd_8x8_internal2
2228 call pixel_satd_8x8_internal2
2229 call pixel_satd_8x8_internal2
2230 call pixel_satd_8x8_internal2
; strip at column offset 32
2231 lea r0, [r6 + 32*SIZEOF_PIXEL]
2233 add r2, 32*SIZEOF_PIXEL
2234 call pixel_satd_8x8_internal2
2235 call pixel_satd_8x8_internal2
2236 call pixel_satd_8x8_internal2
2237 call pixel_satd_8x8_internal2
2238 call pixel_satd_8x8_internal2
2239 call pixel_satd_8x8_internal2
2240 call pixel_satd_8x8_internal2
2241 call pixel_satd_8x8_internal2
; strip at column offset 40
2242 lea r0, [r6 + 40*SIZEOF_PIXEL]
2244 add r2, 40*SIZEOF_PIXEL
2245 call pixel_satd_8x8_internal2
2246 call pixel_satd_8x8_internal2
2247 call pixel_satd_8x8_internal2
2248 call pixel_satd_8x8_internal2
2249 call pixel_satd_8x8_internal2
2250 call pixel_satd_8x8_internal2
2251 call pixel_satd_8x8_internal2
2252 call pixel_satd_8x8_internal2
; strip at column offset 48
2253 lea r0, [r6 + 48*SIZEOF_PIXEL]
2255 add r2, 48*SIZEOF_PIXEL
2256 call pixel_satd_8x8_internal2
2257 call pixel_satd_8x8_internal2
2258 call pixel_satd_8x8_internal2
2259 call pixel_satd_8x8_internal2
2260 call pixel_satd_8x8_internal2
2261 call pixel_satd_8x8_internal2
2262 call pixel_satd_8x8_internal2
2263 call pixel_satd_8x8_internal2
; strip at column offset 56
2264 lea r0, [r6 + 56*SIZEOF_PIXEL]
2266 add r2, 56*SIZEOF_PIXEL
2267 call pixel_satd_8x8_internal2
2268 call pixel_satd_8x8_internal2
2269 call pixel_satd_8x8_internal2
2270 call pixel_satd_8x8_internal2
2271 call pixel_satd_8x8_internal2
2272 call pixel_satd_8x8_internal2
2273 call pixel_satd_8x8_internal2
2274 call pixel_satd_8x8_internal2
; pixel_satd_16x4: one %%pixel_satd_8x4_internal2 call on each side of a
; RESTORE_AND_INC_POINTERS invocation.
; Two alternative cglobal declarations are listed; they differ only in the
; third count (14 vs 8).
; NOTE(review): the conditional that selects one declaration is not visible
; in this excerpt. RESTORE_AND_INC_POINTERS is defined elsewhere --
; presumably it rewinds r0/r2 and steps them to the next 8 columns; confirm.
2285 cglobal pixel_satd_16x4, 4,6,14
2287 cglobal pixel_satd_16x4, 4,6,8
2289 SATD_START_SSE2 m6, m7
2291 call %%pixel_satd_8x4_internal2
2292 RESTORE_AND_INC_POINTERS
2293 call %%pixel_satd_8x4_internal2
; pixel_satd_16x8: one pixel_satd_8x8_internal2 call on each side of a
; RESTORE_AND_INC_POINTERS invocation.
; Two alternative cglobal declarations are listed; they differ only in the
; third count (14 vs 8).
; NOTE(review): the conditional that selects one declaration is not visible
; in this excerpt. RESTORE_AND_INC_POINTERS is defined elsewhere --
; presumably it rewinds r0/r2 and steps them to the next 8 columns; confirm.
2303 cglobal pixel_satd_16x8, 4,6,14
2305 cglobal pixel_satd_16x8, 4,6,8
2307 SATD_START_SSE2 m6, m7
2309 call pixel_satd_8x8_internal2
2310 RESTORE_AND_INC_POINTERS
2311 call pixel_satd_8x8_internal2
; pixel_satd_16x12: on each side of RESTORE_AND_INC_POINTERS, one
; pixel_satd_8x8_internal2 call followed by one %%pixel_satd_8x4_internal2
; call.
; SATD_START_SSE2 is invoked with a third argument (1) here; its meaning is
; defined by that macro elsewhere in the file.
; NOTE(review): two alternative cglobal declarations are listed (third count
; 14 vs 8); the conditional selecting one is not visible in this excerpt.
2321 cglobal pixel_satd_16x12, 4,6,14
2323 cglobal pixel_satd_16x12, 4,6,8
2325 SATD_START_SSE2 m6, m7, 1
2327 call pixel_satd_8x8_internal2
2328 call %%pixel_satd_8x4_internal2
2329 RESTORE_AND_INC_POINTERS
2330 call pixel_satd_8x8_internal2
2331 call %%pixel_satd_8x4_internal2
; pixel_satd_16x16: two pixel_satd_8x8_internal2 calls on each side of a
; RESTORE_AND_INC_POINTERS invocation (four calls in total).
; SATD_START_SSE2 is invoked with a third argument (1) here; its meaning is
; defined by that macro elsewhere in the file.
; NOTE(review): two alternative cglobal declarations are listed (third count
; 14 vs 8); the conditional selecting one is not visible in this excerpt.
2341 cglobal pixel_satd_16x16, 4,6,14
2343 cglobal pixel_satd_16x16, 4,6,8
2345 SATD_START_SSE2 m6, m7, 1
2347 call pixel_satd_8x8_internal2
2348 call pixel_satd_8x8_internal2
2349 RESTORE_AND_INC_POINTERS
2350 call pixel_satd_8x8_internal2
2351 call pixel_satd_8x8_internal2
; pixel_satd_16x32: four pixel_satd_8x8_internal2 calls on each side of a
; RESTORE_AND_INC_POINTERS invocation (eight calls in total).
; SATD_START_SSE2 is invoked with a third argument (1) here; its meaning is
; defined by that macro elsewhere in the file.
; NOTE(review): two alternative cglobal declarations are listed (third count
; 14 vs 8); the conditional selecting one is not visible in this excerpt.
2361 cglobal pixel_satd_16x32, 4,6,14
2363 cglobal pixel_satd_16x32, 4,6,8
2365 SATD_START_SSE2 m6, m7, 1
2367 call pixel_satd_8x8_internal2
2368 call pixel_satd_8x8_internal2
2369 call pixel_satd_8x8_internal2
2370 call pixel_satd_8x8_internal2
2371 RESTORE_AND_INC_POINTERS
2372 call pixel_satd_8x8_internal2
2373 call pixel_satd_8x8_internal2
2374 call pixel_satd_8x8_internal2
2375 call pixel_satd_8x8_internal2
; pixel_satd_16x64: eight pixel_satd_8x8_internal2 calls on each side of a
; RESTORE_AND_INC_POINTERS invocation (sixteen calls in total).
; SATD_START_SSE2 is invoked with a third argument (1) here; its meaning is
; defined by that macro elsewhere in the file.
; NOTE(review): two alternative cglobal declarations are listed (third count
; 14 vs 8); the conditional selecting one is not visible in this excerpt.
2385 cglobal pixel_satd_16x64, 4,6,14
2387 cglobal pixel_satd_16x64, 4,6,8
2389 SATD_START_SSE2 m6, m7, 1
2391 call pixel_satd_8x8_internal2
2392 call pixel_satd_8x8_internal2
2393 call pixel_satd_8x8_internal2
2394 call pixel_satd_8x8_internal2
2395 call pixel_satd_8x8_internal2
2396 call pixel_satd_8x8_internal2
2397 call pixel_satd_8x8_internal2
2398 call pixel_satd_8x8_internal2
2399 RESTORE_AND_INC_POINTERS
2400 call pixel_satd_8x8_internal2
2401 call pixel_satd_8x8_internal2
2402 call pixel_satd_8x8_internal2
2403 call pixel_satd_8x8_internal2
2404 call pixel_satd_8x8_internal2
2405 call pixel_satd_8x8_internal2
2406 call pixel_satd_8x8_internal2
2407 call pixel_satd_8x8_internal2
; pixel_satd_12x16 (variant declared 4,8,8, using the "vertical, n, 4, 5"
; form of SATD_4x8_SSE): three 4-pixel column strips at offsets 0, 4 and 8.
; Each strip invokes SATD_4x8_SSE twice, with r0/r2 advanced by
; r1*2*SIZEOF_PIXEL / r3*2*SIZEOF_PIXEL in between.
; Only the very first invocation passes 0 as its second argument; every
; later one passes 1 (presumably "start" vs "accumulate" -- confirm against
; the macro definition).
; NOTE(review): the setup of r6/r7 is not visible in this excerpt.
2419 cglobal pixel_satd_12x16, 4,8,8
; strip at column offset 0
2424 SATD_4x8_SSE vertical, 0, 4, 5
2425 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
2426 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
2427 SATD_4x8_SSE vertical, 1, 4, 5
; strip at column offset 4
2428 lea r0, [r6 + 4*SIZEOF_PIXEL]
2429 lea r2, [r7 + 4*SIZEOF_PIXEL]
2430 SATD_4x8_SSE vertical, 1, 4, 5
2431 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
2432 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
2433 SATD_4x8_SSE vertical, 1, 4, 5
; strip at column offset 8
2434 lea r0, [r6 + 8*SIZEOF_PIXEL]
2435 lea r2, [r7 + 8*SIZEOF_PIXEL]
2436 SATD_4x8_SSE vertical, 1, 4, 5
2437 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
2438 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
2439 SATD_4x8_SSE vertical, 1, 4, 5
; pixel_satd_12x16 (variant declared 4,7,8,0-gprsize, using the
; "vertical, n, 4, 5" form of SATD_4x8_SSE): three 4-pixel column strips at
; offsets 0, 4 and 8, two SATD_4x8_SSE invocations per strip.
; r0 is rebuilt from r6 for each later strip; r2 is only adjusted with an add.
; NOTE(review): the embedded numbering skips one line between each lea/add
; pair (e.g. 2458); the instruction giving r2 its base value is not visible
; here -- presumably a reload of a saved copy; confirm in the full file.
2448 cglobal pixel_satd_12x16, 4,7,8,0-gprsize
; strip at column offset 0
2453 SATD_4x8_SSE vertical, 0, 4, 5
2454 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
2455 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
2456 SATD_4x8_SSE vertical, 1, 4, 5
; strip at column offset 4
2457 lea r0, [r6 + 4*SIZEOF_PIXEL]
2459 add r2, 4*SIZEOF_PIXEL
2460 SATD_4x8_SSE vertical, 1, 4, 5
2461 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
2462 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
2463 SATD_4x8_SSE vertical, 1, 4, 5
; strip at column offset 8
2464 lea r0, [r6 + 8*SIZEOF_PIXEL]
2466 add r2, 8*SIZEOF_PIXEL
2467 SATD_4x8_SSE vertical, 1, 4, 5
2468 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
2469 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
2470 SATD_4x8_SSE vertical, 1, 4, 5
2479 %else ;HIGH_BIT_DEPTH
; pixel_satd_12x16 (variant declared 4,8,8, using the "swap"/"add" form of
; SATD_4x8_SSE): three 4-pixel column strips at offsets 0, 4 and 8, two
; SATD_4x8_SSE invocations per strip with r0/r2 advanced by
; r1*2*SIZEOF_PIXEL / r3*2*SIZEOF_PIXEL in between.
; The first invocation passes "0, swap"; all later ones pass "1, add".
; NOTE(review): the setup of r6/r7 is not visible in this excerpt.
2481 cglobal pixel_satd_12x16, 4,8,8
; strip at column offset 0
2488 SATD_4x8_SSE vertical, 0, swap
2489 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
2490 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
2491 SATD_4x8_SSE vertical, 1, add
; strip at column offset 4
2492 lea r0, [r6 + 4*SIZEOF_PIXEL]
2493 lea r2, [r7 + 4*SIZEOF_PIXEL]
2494 SATD_4x8_SSE vertical, 1, add
2495 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
2496 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
2497 SATD_4x8_SSE vertical, 1, add
; strip at column offset 8
2498 lea r0, [r6 + 8*SIZEOF_PIXEL]
2499 lea r2, [r7 + 8*SIZEOF_PIXEL]
2500 SATD_4x8_SSE vertical, 1, add
2501 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
2502 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
2503 SATD_4x8_SSE vertical, 1, add
; pixel_satd_12x16 (variant declared 4,7,8,0-gprsize, using the
; "swap"/"add" form of SATD_4x8_SSE): three 4-pixel column strips at offsets
; 0, 4 and 8, two SATD_4x8_SSE invocations per strip.
; r0 is rebuilt from r6 for each later strip; r2 is only adjusted with an add.
; NOTE(review): the embedded numbering skips one line between each lea/add
; pair (e.g. 2520); the instruction giving r2 its base value is not visible
; here -- presumably a reload of a saved copy; confirm in the full file.
2508 cglobal pixel_satd_12x16, 4,7,8,0-gprsize
; strip at column offset 0
2515 SATD_4x8_SSE vertical, 0, swap
2516 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
2517 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
2518 SATD_4x8_SSE vertical, 1, add
; strip at column offset 4
2519 lea r0, [r6 + 4*SIZEOF_PIXEL]
2521 add r2, 4*SIZEOF_PIXEL
2522 SATD_4x8_SSE vertical, 1, add
2523 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
2524 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
2525 SATD_4x8_SSE vertical, 1, add
; strip at column offset 8
2526 lea r0, [r6 + 8*SIZEOF_PIXEL]
2528 add r2, 8*SIZEOF_PIXEL
2529 SATD_4x8_SSE vertical, 1, add
2530 lea r0, [r0 + r1*2*SIZEOF_PIXEL]
2531 lea r2, [r2 + r3*2*SIZEOF_PIXEL]
2532 SATD_4x8_SSE vertical, 1, add
; pixel_satd_24x32 (variant declared 4,8,14): three column strips at offsets
; 0, 8 and 16 pixels, four calls to pixel_satd_8x8_internal2 per strip
; (12 calls in total). r0/r2 are rebuilt from r6/r7 before each later strip.
; NOTE(review): the setup of r6/r7 and the final reduction are not visible in
; this excerpt -- presumably r6/r7 keep the entry values of r0/r2; confirm.
2540 cglobal pixel_satd_24x32, 4,8,14
2541 SATD_START_SSE2 m6, m7
; strip at column offset 0
2544 call pixel_satd_8x8_internal2
2545 call pixel_satd_8x8_internal2
2546 call pixel_satd_8x8_internal2
2547 call pixel_satd_8x8_internal2
; strip at column offset 8
2548 lea r0, [r6 + 8*SIZEOF_PIXEL]
2549 lea r2, [r7 + 8*SIZEOF_PIXEL]
2550 call pixel_satd_8x8_internal2
2551 call pixel_satd_8x8_internal2
2552 call pixel_satd_8x8_internal2
2553 call pixel_satd_8x8_internal2
; strip at column offset 16
2554 lea r0, [r6 + 16*SIZEOF_PIXEL]
2555 lea r2, [r7 + 16*SIZEOF_PIXEL]
2556 call pixel_satd_8x8_internal2
2557 call pixel_satd_8x8_internal2
2558 call pixel_satd_8x8_internal2
2559 call pixel_satd_8x8_internal2
; pixel_satd_24x32 (variant declared 4,7,8,0-gprsize): three column strips at
; offsets 0, 8 and 16 pixels, four calls to pixel_satd_8x8_internal2 per
; strip. r0 is rebuilt from r6 for each later strip; r2 is only adjusted with
; an add.
; NOTE(review): the embedded numbering skips one line between each lea/add
; pair (e.g. 2577); the instruction giving r2 its base value is not visible
; here -- presumably a reload of a saved copy; confirm in the full file.
2568 cglobal pixel_satd_24x32, 4,7,8,0-gprsize
2569 SATD_START_SSE2 m6, m7
; strip at column offset 0
2572 call pixel_satd_8x8_internal2
2573 call pixel_satd_8x8_internal2
2574 call pixel_satd_8x8_internal2
2575 call pixel_satd_8x8_internal2
; strip at column offset 8
2576 lea r0, [r6 + 8*SIZEOF_PIXEL]
2578 add r2, 8*SIZEOF_PIXEL
2579 call pixel_satd_8x8_internal2
2580 call pixel_satd_8x8_internal2
2581 call pixel_satd_8x8_internal2
2582 call pixel_satd_8x8_internal2
; strip at column offset 16
2583 lea r0, [r6 + 16*SIZEOF_PIXEL]
2585 add r2, 16*SIZEOF_PIXEL
2586 call pixel_satd_8x8_internal2
2587 call pixel_satd_8x8_internal2
2588 call pixel_satd_8x8_internal2
2589 call pixel_satd_8x8_internal2
; pixel_satd_8x32: a single column handled by four consecutive calls to
; pixel_satd_8x8_internal2, with no pointer adjustment between them in the
; visible code.
; NOTE(review): two alternative cglobal declarations are listed (third count
; 14 vs 8); the conditional selecting one is not visible in this excerpt.
2600 cglobal pixel_satd_8x32, 4,6,14
2602 cglobal pixel_satd_8x32, 4,6,8
2604 SATD_START_SSE2 m6, m7
2608 call pixel_satd_8x8_internal2
2609 call pixel_satd_8x8_internal2
2610 call pixel_satd_8x8_internal2
2611 call pixel_satd_8x8_internal2
; pixel_satd_8x16: a single column handled by two consecutive calls to
; pixel_satd_8x8_internal2.
; NOTE(review): two alternative cglobal declarations are listed (third count
; 14 vs 8); the conditional selecting one is not visible in this excerpt.
2621 cglobal pixel_satd_8x16, 4,6,14
2623 cglobal pixel_satd_8x16, 4,6,8
2625 SATD_START_SSE2 m6, m7
2626 call pixel_satd_8x8_internal2
2627 call pixel_satd_8x8_internal2
; pixel_satd_8x8: a single call into the 8x8 helper. Note that this entry
; point calls pixel_satd_8x8_internal, whereas the larger sizes in this
; section call pixel_satd_8x8_internal2.
; NOTE(review): the epilogue that produces the return value is not visible
; in this excerpt.
2636 cglobal pixel_satd_8x8, 4,6,8
2637 SATD_START_SSE2 m6, m7
2638 call pixel_satd_8x8_internal
; pixel_satd_8x4: a single call to the macro-local helper
; %%pixel_satd_8x4_internal2.
; NOTE(review): two alternative cglobal declarations are listed (third count
; 14 vs 8); the conditional selecting one is not visible in this excerpt.
2642 cglobal pixel_satd_8x4, 4,6,14
2644 cglobal pixel_satd_8x4, 4,6,8
2646 SATD_START_SSE2 m6, m7
2647 call %%pixel_satd_8x4_internal2
2649 %endmacro ; SATDS_SSE2
2652 ;=============================================================================
2654 ;=============================================================================
2669 %endif ; HIGH_BIT_DEPTH
2673 call pixel_sa8d_8x8_internal
2678 %endif ; HIGH_BIT_DEPTH
2685 call pixel_sa8d_8x8_internal ; pix[0]
2686 add r2, 8*SIZEOF_PIXEL
2687 add r0, 8*SIZEOF_PIXEL
2692 call pixel_sa8d_8x8_internal ; pix[8]
2696 call pixel_sa8d_8x8_internal ; pix[8*stride+8]
2697 sub r2, 8*SIZEOF_PIXEL
2698 sub r0, 8*SIZEOF_PIXEL
2700 call pixel_sa8d_8x8_internal ; pix[8*stride]
2703 %if HIGH_BIT_DEPTH == 0
2713 %if HIGH_BIT_DEPTH == 0
2719 add r4d, dword [esp+36]
2720 mov dword [esp+36], r4d
2724 ; sse2 doesn't seem to like the horizontal way of doing things
; `vertical` expands to an expression that is true when the ssse3 cpuflag is
; not set, when the atom cpuflag is set, or when HIGH_BIT_DEPTH is non-zero.
2725 %define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH)
2728 ;-----------------------------------------------------------------------------
2729 ; int pixel_sa8d_8x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
2730 ;-----------------------------------------------------------------------------
; Internal helper behind the sa8d entry points. The visible body issues two
; LOAD_SUMSUB_8x4P invocations -- the first addressing through r0/r2, the
; second through r6/r7 -- followed by an 8x8 2-D Hadamard macro.
; NOTE(review): both HADAMARD8_2D (with amax) and HADAMARD8_2D_HMUL are
; listed; the conditional that selects one of them and the setup of r6/r7
; are not visible in this excerpt -- confirm in the full file.
2731 cglobal pixel_sa8d_8x8_internal
2734 LOAD_SUMSUB_8x4P 0, 1, 2, 8, 5, 6, 7, r0, r2
2735 LOAD_SUMSUB_8x4P 4, 5, 3, 9, 11, 6, 7, r6, r7
2737 HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
2739 HADAMARD8_2D_HMUL 0, 1, 2, 8, 4, 5, 3, 9, 6, 11
; pixel_sa8d_8x8 (declared 4,8,12): public entry point that calls
; pixel_sa8d_8x8_internal once.
; NOTE(review): the setup before the call and the reduction after it are not
; visible in this excerpt; the trailing %endif closes a HIGH_BIT_DEPTH
; conditional whose opening line is also not shown.
2747 cglobal pixel_sa8d_8x8, 4,8,12
2754 call pixel_sa8d_8x8_internal
2759 %endif ; HIGH_BIT_DEPTH
; pixel_sa8d_16x16 (declared 4,8,12): four calls to pixel_sa8d_8x8_internal,
; one per 8x8 quadrant, in the order labelled by the author: pix[0], pix[8],
; pix[8*stride+8], pix[8*stride].
; r0/r2 step right by 8 pixels after the first call and back left by 8 pixels
; before the last one.
; NOTE(review): the step down to the second row of quadrants and the
; accumulation between calls are not visible in this excerpt.
2765 cglobal pixel_sa8d_16x16, 4,8,12
2772 call pixel_sa8d_8x8_internal ; pix[0]
2773 add r2, 8*SIZEOF_PIXEL
2774 add r0, 8*SIZEOF_PIXEL
2779 call pixel_sa8d_8x8_internal ; pix[8]
2783 call pixel_sa8d_8x8_internal ; pix[8*stride+8]
2784 sub r2, 8*SIZEOF_PIXEL
2785 sub r0, 8*SIZEOF_PIXEL
2787 call pixel_sa8d_8x8_internal ; pix[8*stride]
2790 %if HIGH_BIT_DEPTH == 0
; Declarations of pixel_sa8d_8x16, 8x32, 16x8, 16x32 and 16x64, all with the
; same cglobal counts (4,8,13).
; NOTE(review): almost none of their bodies is visible in this excerpt. The
; only instructions shown belong to pixel_sa8d_16x8 and step r0/r2 right by
; 8 pixels (8*SIZEOF_PIXEL).
2798 cglobal pixel_sa8d_8x16, 4,8,13
2813 cglobal pixel_sa8d_8x32, 4,8,13
2834 cglobal pixel_sa8d_16x8, 4,8,13
2843 add r0, 8*SIZEOF_PIXEL
2844 add r2, 8*SIZEOF_PIXEL
2849 cglobal pixel_sa8d_16x32, 4,8,13
2864 cglobal pixel_sa8d_16x64, 4,8,13
; pixel_sa8d_24x32 (declared 4,8,13).
; Only the horizontal pointer steps are visible here: r0 and r2 always move
; together by 8 pixels (8*SIZEOF_PIXEL) -- two steps right, two steps left,
; then the same pattern once more.
; NOTE(review): the per-block work and the vertical steps between these
; moves are not visible in this excerpt.
2885 cglobal pixel_sa8d_24x32, 4,8,13
2894 add r0, 8*SIZEOF_PIXEL
2895 add r2, 8*SIZEOF_PIXEL
2897 add r0, 8*SIZEOF_PIXEL
2898 add r2, 8*SIZEOF_PIXEL
2903 sub r0, 8*SIZEOF_PIXEL
2904 sub r2, 8*SIZEOF_PIXEL
2906 sub r0, 8*SIZEOF_PIXEL
2907 sub r2, 8*SIZEOF_PIXEL
2912 add r0, 8*SIZEOF_PIXEL
2913 add r2, 8*SIZEOF_PIXEL
2915 add r0, 8*SIZEOF_PIXEL
2916 add r2, 8*SIZEOF_PIXEL
2921 sub r0, 8*SIZEOF_PIXEL
2922 sub r2, 8*SIZEOF_PIXEL
2924 sub r0, 8*SIZEOF_PIXEL
2925 sub r2, 8*SIZEOF_PIXEL
; pixel_sa8d_32x8 (declared 4,8,13).
; Only the horizontal pointer steps are visible here: r0 and r2 move right
; together by 8 pixels (8*SIZEOF_PIXEL), three times.
; NOTE(review): the per-block work between these moves is not visible in
; this excerpt.
2930 cglobal pixel_sa8d_32x8, 4,8,13
2939 add r0, 8*SIZEOF_PIXEL
2940 add r2, 8*SIZEOF_PIXEL
2942 add r0, 8*SIZEOF_PIXEL
2943 add r2, 8*SIZEOF_PIXEL
2945 add r0, 8*SIZEOF_PIXEL
2946 add r2, 8*SIZEOF_PIXEL
; pixel_sa8d_32x16 (declared 4,8,13).
; Only one horizontal pointer step is visible here: r2 and r0 move right
; together by 16 pixels (16*SIZEOF_PIXEL).
; NOTE(review): the per-block work around this move is not visible in this
; excerpt.
2951 cglobal pixel_sa8d_32x16, 4,8,13
2964 add r2, 16*SIZEOF_PIXEL
2965 add r0, 16*SIZEOF_PIXEL
; pixel_sa8d_32x24 (declared 4,8,13).
; Only the horizontal pointer steps are visible here: r0 and r2 always move
; together by 8 pixels (8*SIZEOF_PIXEL) -- three steps right, three steps
; left, then three steps right again.
; NOTE(review): the per-block work and the vertical steps between these
; moves are not visible in this excerpt.
2972 cglobal pixel_sa8d_32x24, 4,8,13
2981 add r0, 8*SIZEOF_PIXEL
2982 add r2, 8*SIZEOF_PIXEL
2984 add r0, 8*SIZEOF_PIXEL
2985 add r2, 8*SIZEOF_PIXEL
2987 add r0, 8*SIZEOF_PIXEL
2988 add r2, 8*SIZEOF_PIXEL
2993 sub r0, 8*SIZEOF_PIXEL
2994 sub r2, 8*SIZEOF_PIXEL
2996 sub r0, 8*SIZEOF_PIXEL
2997 sub r2, 8*SIZEOF_PIXEL
2999 sub r0, 8*SIZEOF_PIXEL
3000 sub r2, 8*SIZEOF_PIXEL
3005 add r0, 8*SIZEOF_PIXEL
3006 add r2, 8*SIZEOF_PIXEL
3008 add r0, 8*SIZEOF_PIXEL
3009 add r2, 8*SIZEOF_PIXEL
3011 add r0, 8*SIZEOF_PIXEL
3012 add r2, 8*SIZEOF_PIXEL
; pixel_sa8d_32x32 (declared 4,8,13).
; Only the horizontal pointer steps are visible here: r2 and r0 move right
; together by 16 pixels (16*SIZEOF_PIXEL) and later back left by the same
; amount.
; NOTE(review): the per-block work and the vertical step between these moves
; are not visible in this excerpt.
3017 cglobal pixel_sa8d_32x32, 4,8,13
3030 add r2, 16*SIZEOF_PIXEL
3031 add r0, 16*SIZEOF_PIXEL
3042 sub r2, 16*SIZEOF_PIXEL
3043 sub r0, 16*SIZEOF_PIXEL
; pixel_sa8d_32x64 (declared 4,8,13).
; Only the horizontal pointer steps are visible here: r2 and r0 move together
; by 16 pixels (16*SIZEOF_PIXEL) -- right, left, right, left.
; NOTE(review): the per-block work and the vertical steps between these
; moves are not visible in this excerpt.
3050 cglobal pixel_sa8d_32x64, 4,8,13
3063 add r2, 16*SIZEOF_PIXEL
3064 add r0, 16*SIZEOF_PIXEL
3075 sub r2, 16*SIZEOF_PIXEL
3076 sub r0, 16*SIZEOF_PIXEL
3087 add r2, 16*SIZEOF_PIXEL
3088 add r0, 16*SIZEOF_PIXEL
3099 sub r2, 16*SIZEOF_PIXEL
3100 sub r0, 16*SIZEOF_PIXEL
; pixel_sa8d_48x64 (declared 4,8,13).
; Only the horizontal pointer steps are visible here: r2 and r0 move together
; by 16 pixels (16*SIZEOF_PIXEL) -- two steps right, two steps left, then the
; same pattern once more.
; NOTE(review): the per-block work and the vertical steps between these
; moves are not visible in this excerpt.
3107 cglobal pixel_sa8d_48x64, 4,8,13
3120 add r2, 16*SIZEOF_PIXEL
3121 add r0, 16*SIZEOF_PIXEL
3129 add r2, 16*SIZEOF_PIXEL
3130 add r0, 16*SIZEOF_PIXEL
3141 sub r2, 16*SIZEOF_PIXEL
3142 sub r0, 16*SIZEOF_PIXEL
3150 sub r2, 16*SIZEOF_PIXEL
3151 sub r0, 16*SIZEOF_PIXEL
3162 add r2, 16*SIZEOF_PIXEL
3163 add r0, 16*SIZEOF_PIXEL
3171 add r2, 16*SIZEOF_PIXEL
3172 add r0, 16*SIZEOF_PIXEL
3183 sub r2, 16*SIZEOF_PIXEL
3184 sub r0, 16*SIZEOF_PIXEL
3192 sub r2, 16*SIZEOF_PIXEL
3193 sub r0, 16*SIZEOF_PIXEL
; pixel_sa8d_64x16 (declared 4,8,13).
; Only the horizontal pointer steps are visible here: r2 and r0 move right
; together by 16 pixels (16*SIZEOF_PIXEL), three times.
; NOTE(review): the per-block work between these moves is not visible in
; this excerpt.
3200 cglobal pixel_sa8d_64x16, 4,8,13
3213 add r2, 16*SIZEOF_PIXEL
3214 add r0, 16*SIZEOF_PIXEL
3222 add r2, 16*SIZEOF_PIXEL
3223 add r0, 16*SIZEOF_PIXEL
3231 add r2, 16*SIZEOF_PIXEL
3232 add r0, 16*SIZEOF_PIXEL
; pixel_sa8d_64x32 (declared 4,8,13).
; Only the horizontal pointer steps are visible here: r2 and r0 move together
; by 16 pixels (16*SIZEOF_PIXEL) -- three steps right, then three steps left.
; NOTE(review): the per-block work and the vertical step between these moves
; are not visible in this excerpt.
3239 cglobal pixel_sa8d_64x32, 4,8,13
3252 add r2, 16*SIZEOF_PIXEL
3253 add r0, 16*SIZEOF_PIXEL
3261 add r2, 16*SIZEOF_PIXEL
3262 add r0, 16*SIZEOF_PIXEL
3270 add r2, 16*SIZEOF_PIXEL
3271 add r0, 16*SIZEOF_PIXEL
3282 sub r2, 16*SIZEOF_PIXEL
3283 sub r0, 16*SIZEOF_PIXEL
3291 sub r2, 16*SIZEOF_PIXEL
3292 sub r0, 16*SIZEOF_PIXEL
3300 sub r2, 16*SIZEOF_PIXEL
3301 sub r0, 16*SIZEOF_PIXEL
; pixel_sa8d_64x48 (declared 4,8,13).
; Only the horizontal pointer steps are visible here: r2 and r0 move together
; by 16 pixels (16*SIZEOF_PIXEL) -- three steps right, three steps left, then
; three steps right again.
; NOTE(review): the per-block work and the vertical steps between these
; moves are not visible in this excerpt.
3308 cglobal pixel_sa8d_64x48, 4,8,13
3321 add r2, 16*SIZEOF_PIXEL
3322 add r0, 16*SIZEOF_PIXEL
3330 add r2, 16*SIZEOF_PIXEL
3331 add r0, 16*SIZEOF_PIXEL
3339 add r2, 16*SIZEOF_PIXEL
3340 add r0, 16*SIZEOF_PIXEL
3351 sub r2, 16*SIZEOF_PIXEL
3352 sub r0, 16*SIZEOF_PIXEL
3360 sub r2, 16*SIZEOF_PIXEL
3361 sub r0, 16*SIZEOF_PIXEL
3369 sub r2, 16*SIZEOF_PIXEL
3370 sub r0, 16*SIZEOF_PIXEL
3381 add r2, 16*SIZEOF_PIXEL
3382 add r0, 16*SIZEOF_PIXEL
3390 add r2, 16*SIZEOF_PIXEL
3391 add r0, 16*SIZEOF_PIXEL
3399 add r2, 16*SIZEOF_PIXEL
3400 add r0, 16*SIZEOF_PIXEL
; pixel_sa8d_64x64 (declared 4,8,13).
; Only the horizontal pointer steps are visible here: r2 and r0 move together
; by 16 pixels (16*SIZEOF_PIXEL) -- three steps right, three steps left, and
; then the same pattern once more.
; NOTE(review): the per-block work and the vertical steps between these
; moves are not visible in this excerpt.
3407 cglobal pixel_sa8d_64x64, 4,8,13
3420 add r2, 16*SIZEOF_PIXEL
3421 add r0, 16*SIZEOF_PIXEL
3429 add r2, 16*SIZEOF_PIXEL
3430 add r0, 16*SIZEOF_PIXEL
3438 add r2, 16*SIZEOF_PIXEL
3439 add r0, 16*SIZEOF_PIXEL
3450 sub r2, 16*SIZEOF_PIXEL
3451 sub r0, 16*SIZEOF_PIXEL
3459 sub r2, 16*SIZEOF_PIXEL
3460 sub r0, 16*SIZEOF_PIXEL
3468 sub r2, 16*SIZEOF_PIXEL
3469 sub r0, 16*SIZEOF_PIXEL
3480 add r2, 16*SIZEOF_PIXEL
3481 add r0, 16*SIZEOF_PIXEL
3489 add r2, 16*SIZEOF_PIXEL
3490 add r0, 16*SIZEOF_PIXEL
3498 add r2, 16*SIZEOF_PIXEL
3499 add r0, 16*SIZEOF_PIXEL
3510 sub r2, 16*SIZEOF_PIXEL
3511 sub r0, 16*SIZEOF_PIXEL
3519 sub r2, 16*SIZEOF_PIXEL
3520 sub r0, 16*SIZEOF_PIXEL
3528 sub r2, 16*SIZEOF_PIXEL
3529 sub r0, 16*SIZEOF_PIXEL
; pixel_sa8d_8x8_internal, the variant that names its spill slots relative
; to esp: spill0, spill1 and spill2 alias [esp+4], [esp+20] and [esp+36].
; Two alternative bodies are listed back to back:
;   - LOAD_DIFF_8x4P + HADAMARD4_2D, finished with HADAMARD2_2D (amax);
;   - LOAD_SUMSUB_8x4P + HADAMARD4_V, finished with SUMSUB_BADC and HADAMARD
;     (sumsub, then amax).
; NOTE(review): the %if/%else separating the two bodies (closed by the
; "sse2/non-sse2" %endif below), the stores/loads that use the spill slots
; and the pointer advances between loads are not visible in this excerpt.
3538 cglobal pixel_sa8d_8x8_internal
3539 %define spill0 [esp+4]
3540 %define spill1 [esp+20]
3541 %define spill2 [esp+36]
3543 LOAD_DIFF_8x4P 0, 1, 2, 3, 4, 5, 6, r0, r2, 1
3544 HADAMARD4_2D 0, 1, 2, 3, 4
3546 LOAD_DIFF_8x4P 4, 5, 6, 7, 3, 3, 2, r0, r2, 1
3547 HADAMARD4_2D 4, 5, 6, 7, 3
3548 HADAMARD2_2D 0, 4, 1, 5, 3, qdq, amax
3551 HADAMARD2_2D 2, 6, 3, 7, 5, qdq, amax
3554 LOAD_SUMSUB_8x4P 0, 1, 2, 3, 5, 6, 7, r0, r2, 1
3555 ; could do first HADAMARD4_V here to save spilling later
3556 ; surprisingly, not a win on conroe or even p4
3561 LOAD_SUMSUB_8x4P 4, 5, 6, 7, 2, 3, 1, r0, r2, 1
3562 HADAMARD4_V 4, 5, 6, 7, 3
3568 HADAMARD4_V 0, 1, 2, 3, 7
3569 SUMSUB_BADC w, 0, 4, 1, 5, 7
3570 HADAMARD 2, sumsub, 0, 4, 7, 6
3571 HADAMARD 2, sumsub, 1, 5, 7, 6
3572 HADAMARD 1, amax, 0, 4, 7, 6
3573 HADAMARD 1, amax, 1, 5, 7, 6
3577 SUMSUB_BADC w, 2, 6, 3, 7, 4
3578 HADAMARD 2, sumsub, 2, 6, 4, 5
3579 HADAMARD 2, sumsub, 3, 7, 4, 5
3580 HADAMARD 1, amax, 2, 6, 4, 5
3581 HADAMARD 1, amax, 3, 7, 4, 5
3582 %endif ; sse2/non-sse2
3587 %endif ; ifndef mmx2
3589 cglobal pixel_sa8d_8x8_internal2
3590 %define spill0 [esp+4]
3591 LOAD_DIFF_8x4P 0, 1, 2, 3, 4, 5, 6, r0, r2, 1
3592 HADAMARD4_2D 0, 1, 2, 3, 4
3594 LOAD_DIFF_8x4P 4, 5, 6, 7, 3, 3, 2, r0, r2, 1
3595 HADAMARD4_2D 4, 5, 6, 7, 3
3596 HADAMARD2_2D 0, 4, 1, 5, 3, qdq, amax
3599 HADAMARD2_2D 2, 6, 3, 7, 5, qdq, amax
3605 cglobal pixel_sa8d_8x8, 4,7
3612 call pixel_sa8d_8x8_internal
3617 %endif ; HIGH_BIT_DEPTH
3624 cglobal pixel_sa8d_16x16, 4,7
3631 call pixel_sa8d_8x8_internal
3640 call pixel_sa8d_8x8_internal
3643 add r0, 8*SIZEOF_PIXEL
3644 add r2, 8*SIZEOF_PIXEL
3647 call pixel_sa8d_8x8_internal
3654 mova [esp+64-mmsize], m0
3655 call pixel_sa8d_8x8_internal
3658 %else ; !HIGH_BIT_DEPTH
3659 paddusw m0, [esp+64-mmsize]
3676 %endif ; HIGH_BIT_DEPTH
3683 cglobal pixel_sa8d_8x16, 4,7,8
3691 call pixel_sa8d_8x8_internal2
3696 mov dword [esp+36], r4d
3703 call pixel_sa8d_8x8_internal2
3708 add r4d, dword [esp+36]
3713 cglobal pixel_sa8d_8x32, 4,7,8
3721 call pixel_sa8d_8x8_internal2
3726 mov dword [esp+36], r4d
3733 call pixel_sa8d_8x8_internal2
3738 add r4d, dword [esp+36]
3739 mov dword [esp+36], r4d
3748 call pixel_sa8d_8x8_internal2
3753 add r4d, dword [esp+36]
3754 mov dword [esp+36], r4d
3765 call pixel_sa8d_8x8_internal2
3770 add r4d, dword [esp+36]
3775 cglobal pixel_sa8d_16x8, 4,7,8
3783 call pixel_sa8d_8x8_internal2
3788 mov dword [esp+36], r4d
3792 add r0, 8*SIZEOF_PIXEL
3793 add r2, 8*SIZEOF_PIXEL
3795 call pixel_sa8d_8x8_internal2
3800 add r4d, dword [esp+36]
3805 cglobal pixel_sa8d_16x32, 4,7,8
3813 call pixel_sa8d_8x8_internal2
3818 call pixel_sa8d_8x8_internal2
3824 add r0, 8*SIZEOF_PIXEL
3825 add r2, 8*SIZEOF_PIXEL
3826 call pixel_sa8d_8x8_internal2
3829 call pixel_sa8d_8x8_internal2
3831 %if HIGH_BIT_DEPTH == 0
3837 mov dword [esp+36], r4d
3846 call pixel_sa8d_8x8_internal2
3851 call pixel_sa8d_8x8_internal2
3861 add r0, 8*SIZEOF_PIXEL
3862 add r2, 8*SIZEOF_PIXEL
3863 call pixel_sa8d_8x8_internal2
3866 call pixel_sa8d_8x8_internal2
3868 %if HIGH_BIT_DEPTH == 0
3874 add r4d, dword [esp+36]
3879 cglobal pixel_sa8d_16x64, 4,7,8
3887 call pixel_sa8d_8x8_internal2
3892 call pixel_sa8d_8x8_internal2
3898 add r0, 8*SIZEOF_PIXEL
3899 add r2, 8*SIZEOF_PIXEL
3900 call pixel_sa8d_8x8_internal2
3903 call pixel_sa8d_8x8_internal2
3905 %if HIGH_BIT_DEPTH == 0
3911 mov dword [esp+36], r4d
3923 call pixel_sa8d_8x8_internal2
3928 call pixel_sa8d_8x8_internal2
3934 add r0, 8*SIZEOF_PIXEL
3935 add r2, 8*SIZEOF_PIXEL
3936 call pixel_sa8d_8x8_internal2
3938 mova [esp+64-mmsize], m0
3939 call pixel_sa8d_8x8_internal2
3952 call pixel_sa8d_8x8_internal2
3957 call pixel_sa8d_8x8_internal2
3963 add r0, 8*SIZEOF_PIXEL
3964 add r2, 8*SIZEOF_PIXEL
3965 call pixel_sa8d_8x8_internal2
3967 mova [esp+64-mmsize], m0
3968 call pixel_sa8d_8x8_internal2
3981 call pixel_sa8d_8x8_internal2
3986 call pixel_sa8d_8x8_internal2
3992 add r0, 8*SIZEOF_PIXEL
3993 add r2, 8*SIZEOF_PIXEL
3994 call pixel_sa8d_8x8_internal2
3996 mova [esp+64-mmsize], m0
3997 call pixel_sa8d_8x8_internal2
3999 %if HIGH_BIT_DEPTH == 0
4005 add r4d, dword [esp+36]
4010 cglobal pixel_sa8d_24x32, 4,7,8
4018 call pixel_sa8d_8x8_internal2
4023 mov dword [esp+36], r4d
4027 add r0, 8*SIZEOF_PIXEL
4028 add r2, 8*SIZEOF_PIXEL
4030 call pixel_sa8d_8x8_internal2
4035 add r4d, dword [esp+36]
4036 mov dword [esp+36], r4d
4040 add r0, 16*SIZEOF_PIXEL
4041 add r2, 16*SIZEOF_PIXEL
4043 call pixel_sa8d_8x8_internal2
4048 add r4d, dword [esp+36]
4049 mov dword [esp+36], r4d
4058 call pixel_sa8d_8x8_internal2
4063 add r4d, dword [esp+36]
4064 mov dword [esp+36], r4d
4068 add r0, 8*SIZEOF_PIXEL
4069 add r2, 8*SIZEOF_PIXEL
4071 call pixel_sa8d_8x8_internal2
4076 add r4d, dword [esp+36]
4077 mov dword [esp+36], r4d
4081 add r0, 16*SIZEOF_PIXEL
4082 add r2, 16*SIZEOF_PIXEL
4084 call pixel_sa8d_8x8_internal2
4089 add r4d, dword [esp+36]
4090 mov dword [esp+36], r4d
4099 call pixel_sa8d_8x8_internal2
4104 add r4d, dword [esp+36]
4105 mov dword [esp+36], r4d
4109 add r0, 8*SIZEOF_PIXEL
4110 add r2, 8*SIZEOF_PIXEL
4112 call pixel_sa8d_8x8_internal2
4117 add r4d, dword [esp+36]
4118 mov dword [esp+36], r4d
4122 add r0, 16*SIZEOF_PIXEL
4123 add r2, 16*SIZEOF_PIXEL
4125 call pixel_sa8d_8x8_internal2
4130 add r4d, dword [esp+36]
4131 mov dword [esp+36], r4d
4140 call pixel_sa8d_8x8_internal2
4145 add r4d, dword [esp+36]
4146 mov dword [esp+36], r4d
4150 add r0, 8*SIZEOF_PIXEL
4151 add r2, 8*SIZEOF_PIXEL
4153 call pixel_sa8d_8x8_internal2
4158 add r4d, dword [esp+36]
4159 mov dword [esp+36], r4d
4163 add r0, 16*SIZEOF_PIXEL
4164 add r2, 16*SIZEOF_PIXEL
4166 call pixel_sa8d_8x8_internal2
4171 add r4d, dword [esp+36]
4176 cglobal pixel_sa8d_32x8, 4,7,8
4184 call pixel_sa8d_8x8_internal2
4189 mov dword [esp+36], r4d
4193 add r0, 8*SIZEOF_PIXEL
4194 add r2, 8*SIZEOF_PIXEL
4196 call pixel_sa8d_8x8_internal2
4201 add r4d, dword [esp+36]
4202 mov dword [esp+36], r4d
4206 add r0, 16*SIZEOF_PIXEL
4207 add r2, 16*SIZEOF_PIXEL
4209 call pixel_sa8d_8x8_internal2
4214 add r4d, dword [esp+36]
4215 mov dword [esp+36], r4d
4219 add r0, 24*SIZEOF_PIXEL
4220 add r2, 24*SIZEOF_PIXEL
4222 call pixel_sa8d_8x8_internal2
4227 add r4d, dword [esp+36]
4232 cglobal pixel_sa8d_32x16, 4,7,8
4240 call pixel_sa8d_8x8_internal2
4245 call pixel_sa8d_8x8_internal2
4251 add r0, 8*SIZEOF_PIXEL
4252 add r2, 8*SIZEOF_PIXEL
4253 call pixel_sa8d_8x8_internal2
4256 call pixel_sa8d_8x8_internal2
4258 %if HIGH_BIT_DEPTH == 0
4264 mov dword [esp+36], r4d
4268 add r0, 16*SIZEOF_PIXEL
4269 add r2, 16*SIZEOF_PIXEL
4271 call pixel_sa8d_8x8_internal2
4276 call pixel_sa8d_8x8_internal2
4282 add r0, 24*SIZEOF_PIXEL
4283 add r2, 24*SIZEOF_PIXEL
4284 call pixel_sa8d_8x8_internal2
4286 mova [esp+64-mmsize], m0
4287 call pixel_sa8d_8x8_internal2
4289 %if HIGH_BIT_DEPTH == 0
4295 add r4d, dword [esp+36]
4300 cglobal pixel_sa8d_32x24, 4,7,8
4308 call pixel_sa8d_8x8_internal2
4313 mov dword [esp+36], r4d
4317 add r0, 8*SIZEOF_PIXEL
4318 add r2, 8*SIZEOF_PIXEL
4320 call pixel_sa8d_8x8_internal2
4325 add r4d, dword [esp+36]
4326 mov dword [esp+36], r4d
4330 add r0, 16*SIZEOF_PIXEL
4331 add r2, 16*SIZEOF_PIXEL
4333 call pixel_sa8d_8x8_internal2
4338 add r4d, dword [esp+36]
4339 mov dword [esp+36], r4d
4343 add r0, 24*SIZEOF_PIXEL
4344 add r2, 24*SIZEOF_PIXEL
4346 call pixel_sa8d_8x8_internal2
4351 add r4d, dword [esp+36]
4352 mov dword [esp+36], r4d
4361 call pixel_sa8d_8x8_internal2
4366 add r4d, dword [esp+36]
4367 mov dword [esp+36], r4d
4371 add r0, 8*SIZEOF_PIXEL
4372 add r2, 8*SIZEOF_PIXEL
4374 call pixel_sa8d_8x8_internal2
4379 add r4d, dword [esp+36]
4380 mov dword [esp+36], r4d
4384 add r0, 16*SIZEOF_PIXEL
4385 add r2, 16*SIZEOF_PIXEL
4387 call pixel_sa8d_8x8_internal2
4392 add r4d, dword [esp+36]
4393 mov dword [esp+36], r4d
4397 add r0, 24*SIZEOF_PIXEL
4398 add r2, 24*SIZEOF_PIXEL
4400 call pixel_sa8d_8x8_internal2
4405 add r4d, dword [esp+36]
4406 mov dword [esp+36], r4d
4415 call pixel_sa8d_8x8_internal2
4420 add r4d, dword [esp+36]
4421 mov dword [esp+36], r4d
4425 add r0, 8*SIZEOF_PIXEL
4426 add r2, 8*SIZEOF_PIXEL
4428 call pixel_sa8d_8x8_internal2
4433 add r4d, dword [esp+36]
4434 mov dword [esp+36], r4d
4438 add r0, 16*SIZEOF_PIXEL
4439 add r2, 16*SIZEOF_PIXEL
4441 call pixel_sa8d_8x8_internal2
4446 add r4d, dword [esp+36]
4447 mov dword [esp+36], r4d
4451 add r0, 24*SIZEOF_PIXEL
4452 add r2, 24*SIZEOF_PIXEL
4454 call pixel_sa8d_8x8_internal2
4459 add r4d, dword [esp+36]
4464 cglobal pixel_sa8d_32x32, 4,7,8
4472 call pixel_sa8d_8x8_internal2
4477 call pixel_sa8d_8x8_internal2
4483 add r0, 8*SIZEOF_PIXEL
4484 add r2, 8*SIZEOF_PIXEL
4485 call pixel_sa8d_8x8_internal2
4488 call pixel_sa8d_8x8_internal2
4490 %if HIGH_BIT_DEPTH == 0
4496 mov dword [esp+36], r4d
4500 add r0, 16*SIZEOF_PIXEL
4501 add r2, 16*SIZEOF_PIXEL
4503 call pixel_sa8d_8x8_internal2
4508 call pixel_sa8d_8x8_internal2
4514 add r0, 24*SIZEOF_PIXEL
4515 add r2, 24*SIZEOF_PIXEL
4516 call pixel_sa8d_8x8_internal2
4518 mova [esp+64-mmsize], m0
4519 call pixel_sa8d_8x8_internal2
4529 call pixel_sa8d_8x8_internal2
4534 call pixel_sa8d_8x8_internal2
4544 add r0, 8*SIZEOF_PIXEL
4545 add r2, 8*SIZEOF_PIXEL
4546 call pixel_sa8d_8x8_internal2
4548 mova [esp+64-mmsize], m0
4549 call pixel_sa8d_8x8_internal2
4558 add r0, 16*SIZEOF_PIXEL
4559 add r2, 16*SIZEOF_PIXEL
4561 call pixel_sa8d_8x8_internal2
4566 call pixel_sa8d_8x8_internal2
4576 add r0, 24*SIZEOF_PIXEL
4577 add r2, 24*SIZEOF_PIXEL
4578 call pixel_sa8d_8x8_internal2
4580 mova [esp+64-mmsize], m0
4581 call pixel_sa8d_8x8_internal2
4583 %if HIGH_BIT_DEPTH == 0
4589 add r4d, dword [esp+36]
4594 cglobal pixel_sa8d_32x64, 4,7,8
4602 call pixel_sa8d_8x8_internal2
4607 call pixel_sa8d_8x8_internal2
4613 add r0, 8*SIZEOF_PIXEL
4614 add r2, 8*SIZEOF_PIXEL
4615 call pixel_sa8d_8x8_internal2
4618 call pixel_sa8d_8x8_internal2
4620 %if HIGH_BIT_DEPTH == 0
4626 mov dword [esp+36], r4d
4630 add r0, 16*SIZEOF_PIXEL
4631 add r2, 16*SIZEOF_PIXEL
4633 call pixel_sa8d_8x8_internal2
4638 call pixel_sa8d_8x8_internal2
4644 add r0, 24*SIZEOF_PIXEL
4645 add r2, 24*SIZEOF_PIXEL
4646 call pixel_sa8d_8x8_internal2
4648 mova [esp+64-mmsize], m0
4649 call pixel_sa8d_8x8_internal2
4662 call pixel_sa8d_8x8_internal2
4667 call pixel_sa8d_8x8_internal2
4673 add r0, 8*SIZEOF_PIXEL
4674 add r2, 8*SIZEOF_PIXEL
4675 call pixel_sa8d_8x8_internal2
4677 mova [esp+64-mmsize], m0
4678 call pixel_sa8d_8x8_internal2
4683 add r0, 16*SIZEOF_PIXEL
4684 add r2, 16*SIZEOF_PIXEL
4686 call pixel_sa8d_8x8_internal2
4691 call pixel_sa8d_8x8_internal2
4697 add r0, 24*SIZEOF_PIXEL
4698 add r2, 24*SIZEOF_PIXEL
4699 call pixel_sa8d_8x8_internal2
4701 mova [esp+64-mmsize], m0
4702 call pixel_sa8d_8x8_internal2
4715 call pixel_sa8d_8x8_internal2
4720 call pixel_sa8d_8x8_internal2
4726 add r0, 8*SIZEOF_PIXEL
4727 add r2, 8*SIZEOF_PIXEL
4728 call pixel_sa8d_8x8_internal2
4730 mova [esp+64-mmsize], m0
4731 call pixel_sa8d_8x8_internal2
4736 add r0, 16*SIZEOF_PIXEL
4737 add r2, 16*SIZEOF_PIXEL
4739 call pixel_sa8d_8x8_internal2
4744 call pixel_sa8d_8x8_internal2
4750 add r0, 24*SIZEOF_PIXEL
4751 add r2, 24*SIZEOF_PIXEL
4752 call pixel_sa8d_8x8_internal2
4754 mova [esp+64-mmsize], m0
4755 call pixel_sa8d_8x8_internal2
4768 call pixel_sa8d_8x8_internal2
4773 call pixel_sa8d_8x8_internal2
4779 add r0, 8*SIZEOF_PIXEL
4780 add r2, 8*SIZEOF_PIXEL
4781 call pixel_sa8d_8x8_internal2
4783 mova [esp+64-mmsize], m0
4784 call pixel_sa8d_8x8_internal2
4789 add r0, 16*SIZEOF_PIXEL
4790 add r2, 16*SIZEOF_PIXEL
4792 call pixel_sa8d_8x8_internal2
4797 call pixel_sa8d_8x8_internal2
4803 add r0, 24*SIZEOF_PIXEL
4804 add r2, 24*SIZEOF_PIXEL
4805 call pixel_sa8d_8x8_internal2
4807 mova [esp+64-mmsize], m0
4808 call pixel_sa8d_8x8_internal2
4810 %if HIGH_BIT_DEPTH == 0
4816 add r4d, dword [esp+36]
4821 cglobal pixel_sa8d_48x64, 4,7,8
4829 call pixel_sa8d_8x8_internal2
4834 call pixel_sa8d_8x8_internal2
4840 add r0, 8*SIZEOF_PIXEL
4841 add r2, 8*SIZEOF_PIXEL
4842 call pixel_sa8d_8x8_internal2
4845 call pixel_sa8d_8x8_internal2
4847 %if HIGH_BIT_DEPTH == 0
4853 mov dword [esp+36], r4d
4857 add r0, 16*SIZEOF_PIXEL
4858 add r2, 16*SIZEOF_PIXEL
4860 call pixel_sa8d_8x8_internal2
4865 call pixel_sa8d_8x8_internal2
4871 add r0, 24*SIZEOF_PIXEL
4872 add r2, 24*SIZEOF_PIXEL
4873 call pixel_sa8d_8x8_internal2
4875 mova [esp+64-mmsize], m0
4876 call pixel_sa8d_8x8_internal2
4881 add r0, 32*SIZEOF_PIXEL
4882 add r2, 32*SIZEOF_PIXEL
4884 call pixel_sa8d_8x8_internal2
4889 call pixel_sa8d_8x8_internal2
4895 add r0, 40*SIZEOF_PIXEL
4896 add r2, 40*SIZEOF_PIXEL
4897 call pixel_sa8d_8x8_internal2
4899 mova [esp+64-mmsize], m0
4900 call pixel_sa8d_8x8_internal2
4913 call pixel_sa8d_8x8_internal2
4918 call pixel_sa8d_8x8_internal2
4924 add r0, 8*SIZEOF_PIXEL
4925 add r2, 8*SIZEOF_PIXEL
4926 call pixel_sa8d_8x8_internal2
4928 mova [esp+64-mmsize], m0
4929 call pixel_sa8d_8x8_internal2
4934 add r0, 16*SIZEOF_PIXEL
4935 add r2, 16*SIZEOF_PIXEL
4937 call pixel_sa8d_8x8_internal2
4942 call pixel_sa8d_8x8_internal2
4948 add r0, 24*SIZEOF_PIXEL
4949 add r2, 24*SIZEOF_PIXEL
4950 call pixel_sa8d_8x8_internal2
4952 mova [esp+64-mmsize], m0
4953 call pixel_sa8d_8x8_internal2
4958 add r0, 32*SIZEOF_PIXEL
4959 add r2, 32*SIZEOF_PIXEL
4961 call pixel_sa8d_8x8_internal2
4966 call pixel_sa8d_8x8_internal2
4972 add r0, 40*SIZEOF_PIXEL
4973 add r2, 40*SIZEOF_PIXEL
4974 call pixel_sa8d_8x8_internal2
4976 mova [esp+64-mmsize], m0
4977 call pixel_sa8d_8x8_internal2
4990 call pixel_sa8d_8x8_internal2
4995 call pixel_sa8d_8x8_internal2
5001 add r0, 8*SIZEOF_PIXEL
5002 add r2, 8*SIZEOF_PIXEL
5003 call pixel_sa8d_8x8_internal2
5005 mova [esp+64-mmsize], m0
5006 call pixel_sa8d_8x8_internal2
5011 add r0, 16*SIZEOF_PIXEL
5012 add r2, 16*SIZEOF_PIXEL
5014 call pixel_sa8d_8x8_internal2
5019 call pixel_sa8d_8x8_internal2
5025 add r0, 24*SIZEOF_PIXEL
5026 add r2, 24*SIZEOF_PIXEL
5027 call pixel_sa8d_8x8_internal2
5029 mova [esp+64-mmsize], m0
5030 call pixel_sa8d_8x8_internal2
5035 add r0, 32*SIZEOF_PIXEL
5036 add r2, 32*SIZEOF_PIXEL
5038 call pixel_sa8d_8x8_internal2
5043 call pixel_sa8d_8x8_internal2
5049 add r0, 40*SIZEOF_PIXEL
5050 add r2, 40*SIZEOF_PIXEL
5051 call pixel_sa8d_8x8_internal2
5053 mova [esp+64-mmsize], m0
5054 call pixel_sa8d_8x8_internal2
5067 call pixel_sa8d_8x8_internal2
5072 call pixel_sa8d_8x8_internal2
5078 add r0, 8*SIZEOF_PIXEL
5079 add r2, 8*SIZEOF_PIXEL
5080 call pixel_sa8d_8x8_internal2
5082 mova [esp+64-mmsize], m0
5083 call pixel_sa8d_8x8_internal2
5088 add r0, 16*SIZEOF_PIXEL
5089 add r2, 16*SIZEOF_PIXEL
5091 call pixel_sa8d_8x8_internal2
5096 call pixel_sa8d_8x8_internal2
5102 add r0, 24*SIZEOF_PIXEL
5103 add r2, 24*SIZEOF_PIXEL
5104 call pixel_sa8d_8x8_internal2
5106 mova [esp+64-mmsize], m0
5107 call pixel_sa8d_8x8_internal2
5112 add r0, 32*SIZEOF_PIXEL
5113 add r2, 32*SIZEOF_PIXEL
5115 call pixel_sa8d_8x8_internal2
5120 call pixel_sa8d_8x8_internal2
5126 add r0, 40*SIZEOF_PIXEL
5127 add r2, 40*SIZEOF_PIXEL
5128 call pixel_sa8d_8x8_internal2
5130 mova [esp+64-mmsize], m0
5131 call pixel_sa8d_8x8_internal2
5133 %if HIGH_BIT_DEPTH == 0
5139 add r4d, dword [esp+36]
5144 cglobal pixel_sa8d_64x16, 4,7,8
5152 call pixel_sa8d_8x8_internal2
5157 call pixel_sa8d_8x8_internal2
5163 add r0, 8*SIZEOF_PIXEL
5164 add r2, 8*SIZEOF_PIXEL
5165 call pixel_sa8d_8x8_internal2
5168 call pixel_sa8d_8x8_internal2
5170 %if HIGH_BIT_DEPTH == 0
5176 mov dword [esp+36], r4d
5180 add r0, 16*SIZEOF_PIXEL
5181 add r2, 16*SIZEOF_PIXEL
5183 call pixel_sa8d_8x8_internal2
5188 call pixel_sa8d_8x8_internal2
5194 add r0, 24*SIZEOF_PIXEL
5195 add r2, 24*SIZEOF_PIXEL
5196 call pixel_sa8d_8x8_internal2
5198 mova [esp+64-mmsize], m0
5199 call pixel_sa8d_8x8_internal2
5204 add r0, 32*SIZEOF_PIXEL
5205 add r2, 32*SIZEOF_PIXEL
5207 call pixel_sa8d_8x8_internal2
5212 call pixel_sa8d_8x8_internal2
5218 add r0, 40*SIZEOF_PIXEL
5219 add r2, 40*SIZEOF_PIXEL
5220 call pixel_sa8d_8x8_internal2
5222 mova [esp+64-mmsize], m0
5223 call pixel_sa8d_8x8_internal2
5228 add r0, 48*SIZEOF_PIXEL
5229 add r2, 48*SIZEOF_PIXEL
5231 call pixel_sa8d_8x8_internal2
5236 call pixel_sa8d_8x8_internal2
5242 add r0, 56*SIZEOF_PIXEL
5243 add r2, 56*SIZEOF_PIXEL
5244 call pixel_sa8d_8x8_internal2
5246 mova [esp+64-mmsize], m0
5247 call pixel_sa8d_8x8_internal2
5249 %if HIGH_BIT_DEPTH == 0
5255 add r4d, dword [esp+36]
5260 cglobal pixel_sa8d_64x32, 4,7,8
5268 call pixel_sa8d_8x8_internal2
5273 call pixel_sa8d_8x8_internal2
5279 add r0, 8*SIZEOF_PIXEL
5280 add r2, 8*SIZEOF_PIXEL
5281 call pixel_sa8d_8x8_internal2
5284 call pixel_sa8d_8x8_internal2
5286 %if HIGH_BIT_DEPTH == 0
5292 mov dword [esp+36], r4d
5296 add r0, 16*SIZEOF_PIXEL
5297 add r2, 16*SIZEOF_PIXEL
5299 call pixel_sa8d_8x8_internal2
5304 call pixel_sa8d_8x8_internal2
5310 add r0, 24*SIZEOF_PIXEL
5311 add r2, 24*SIZEOF_PIXEL
5312 call pixel_sa8d_8x8_internal2
5314 mova [esp+64-mmsize], m0
5315 call pixel_sa8d_8x8_internal2
5320 add r0, 32*SIZEOF_PIXEL
5321 add r2, 32*SIZEOF_PIXEL
5323 call pixel_sa8d_8x8_internal2
5328 call pixel_sa8d_8x8_internal2
5334 add r0, 40*SIZEOF_PIXEL
5335 add r2, 40*SIZEOF_PIXEL
5336 call pixel_sa8d_8x8_internal2
5338 mova [esp+64-mmsize], m0
5339 call pixel_sa8d_8x8_internal2
5344 add r0, 48*SIZEOF_PIXEL
5345 add r2, 48*SIZEOF_PIXEL
5347 call pixel_sa8d_8x8_internal2
5352 call pixel_sa8d_8x8_internal2
5358 add r0, 56*SIZEOF_PIXEL
5359 add r2, 56*SIZEOF_PIXEL
5360 call pixel_sa8d_8x8_internal2
5362 mova [esp+64-mmsize], m0
5363 call pixel_sa8d_8x8_internal2
5376 call pixel_sa8d_8x8_internal2
5381 call pixel_sa8d_8x8_internal2
5387 add r0, 8*SIZEOF_PIXEL
5388 add r2, 8*SIZEOF_PIXEL
5389 call pixel_sa8d_8x8_internal2
5391 mova [esp+64-mmsize], m0
5392 call pixel_sa8d_8x8_internal2
5397 add r0, 16*SIZEOF_PIXEL
5398 add r2, 16*SIZEOF_PIXEL
5400 call pixel_sa8d_8x8_internal2
5405 call pixel_sa8d_8x8_internal2
5411 add r0, 24*SIZEOF_PIXEL
5412 add r2, 24*SIZEOF_PIXEL
5413 call pixel_sa8d_8x8_internal2
5415 mova [esp+64-mmsize], m0
5416 call pixel_sa8d_8x8_internal2
5421 add r0, 32*SIZEOF_PIXEL
5422 add r2, 32*SIZEOF_PIXEL
5424 call pixel_sa8d_8x8_internal2
5429 call pixel_sa8d_8x8_internal2
5435 add r0, 40*SIZEOF_PIXEL
5436 add r2, 40*SIZEOF_PIXEL
5437 call pixel_sa8d_8x8_internal2
5439 mova [esp+64-mmsize], m0
5440 call pixel_sa8d_8x8_internal2
5445 add r0, 48*SIZEOF_PIXEL
5446 add r2, 48*SIZEOF_PIXEL
5448 call pixel_sa8d_8x8_internal2
5453 call pixel_sa8d_8x8_internal2
5459 add r0, 56*SIZEOF_PIXEL
5460 add r2, 56*SIZEOF_PIXEL
5461 call pixel_sa8d_8x8_internal2
5463 mova [esp+64-mmsize], m0
5464 call pixel_sa8d_8x8_internal2
5466 %if HIGH_BIT_DEPTH == 0
5472 add r4d, dword [esp+36]
5477 cglobal pixel_sa8d_64x48, 4,7,8
5485 call pixel_sa8d_8x8_internal2
5490 call pixel_sa8d_8x8_internal2
5496 add r0, 8*SIZEOF_PIXEL
5497 add r2, 8*SIZEOF_PIXEL
5498 call pixel_sa8d_8x8_internal2
5501 call pixel_sa8d_8x8_internal2
5503 %if HIGH_BIT_DEPTH == 0
5509 mov dword [esp+36], r4d
5513 add r0, 16*SIZEOF_PIXEL
5514 add r2, 16*SIZEOF_PIXEL
5516 call pixel_sa8d_8x8_internal2
5521 call pixel_sa8d_8x8_internal2
5527 add r0, 24*SIZEOF_PIXEL
5528 add r2, 24*SIZEOF_PIXEL
5529 call pixel_sa8d_8x8_internal2
5531 mova [esp+64-mmsize], m0
5532 call pixel_sa8d_8x8_internal2
5537 add r0, 32*SIZEOF_PIXEL
5538 add r2, 32*SIZEOF_PIXEL
5540 call pixel_sa8d_8x8_internal2
5545 call pixel_sa8d_8x8_internal2
5551 add r0, 40*SIZEOF_PIXEL
5552 add r2, 40*SIZEOF_PIXEL
5553 call pixel_sa8d_8x8_internal2
5555 mova [esp+64-mmsize], m0
5556 call pixel_sa8d_8x8_internal2
5561 add r0, 48*SIZEOF_PIXEL
5562 add r2, 48*SIZEOF_PIXEL
5564 call pixel_sa8d_8x8_internal2
5569 call pixel_sa8d_8x8_internal2
5575 add r0, 56*SIZEOF_PIXEL
5576 add r2, 56*SIZEOF_PIXEL
5577 call pixel_sa8d_8x8_internal2
5579 mova [esp+64-mmsize], m0
5580 call pixel_sa8d_8x8_internal2
5593 call pixel_sa8d_8x8_internal2
5598 call pixel_sa8d_8x8_internal2
5604 add r0, 8*SIZEOF_PIXEL
5605 add r2, 8*SIZEOF_PIXEL
5606 call pixel_sa8d_8x8_internal2
5608 mova [esp+64-mmsize], m0
5609 call pixel_sa8d_8x8_internal2
5614 add r0, 16*SIZEOF_PIXEL
5615 add r2, 16*SIZEOF_PIXEL
5617 call pixel_sa8d_8x8_internal2
5622 call pixel_sa8d_8x8_internal2
5628 add r0, 24*SIZEOF_PIXEL
5629 add r2, 24*SIZEOF_PIXEL
5630 call pixel_sa8d_8x8_internal2
5632 mova [esp+64-mmsize], m0
5633 call pixel_sa8d_8x8_internal2
5638 add r0, 32*SIZEOF_PIXEL
5639 add r2, 32*SIZEOF_PIXEL
5641 call pixel_sa8d_8x8_internal2
5646 call pixel_sa8d_8x8_internal2
5652 add r0, 40*SIZEOF_PIXEL
5653 add r2, 40*SIZEOF_PIXEL
5654 call pixel_sa8d_8x8_internal2
5656 mova [esp+64-mmsize], m0
5657 call pixel_sa8d_8x8_internal2
5662 add r0, 48*SIZEOF_PIXEL
5663 add r2, 48*SIZEOF_PIXEL
5665 call pixel_sa8d_8x8_internal2
5670 call pixel_sa8d_8x8_internal2
5676 add r0, 56*SIZEOF_PIXEL
5677 add r2, 56*SIZEOF_PIXEL
5678 call pixel_sa8d_8x8_internal2
5680 mova [esp+64-mmsize], m0
5681 call pixel_sa8d_8x8_internal2
5694 call pixel_sa8d_8x8_internal2
5699 call pixel_sa8d_8x8_internal2
5705 add r0, 8*SIZEOF_PIXEL
5706 add r2, 8*SIZEOF_PIXEL
5707 call pixel_sa8d_8x8_internal2
5709 mova [esp+64-mmsize], m0
5710 call pixel_sa8d_8x8_internal2
5715 add r0, 16*SIZEOF_PIXEL
5716 add r2, 16*SIZEOF_PIXEL
5718 call pixel_sa8d_8x8_internal2
5723 call pixel_sa8d_8x8_internal2
5729 add r0, 24*SIZEOF_PIXEL
5730 add r2, 24*SIZEOF_PIXEL
5731 call pixel_sa8d_8x8_internal2
5733 mova [esp+64-mmsize], m0
5734 call pixel_sa8d_8x8_internal2
5739 add r0, 32*SIZEOF_PIXEL
5740 add r2, 32*SIZEOF_PIXEL
5742 call pixel_sa8d_8x8_internal2
5747 call pixel_sa8d_8x8_internal2
5753 add r0, 40*SIZEOF_PIXEL
5754 add r2, 40*SIZEOF_PIXEL
5755 call pixel_sa8d_8x8_internal2
5757 mova [esp+64-mmsize], m0
5758 call pixel_sa8d_8x8_internal2
5763 add r0, 48*SIZEOF_PIXEL
5764 add r2, 48*SIZEOF_PIXEL
5766 call pixel_sa8d_8x8_internal2
5771 call pixel_sa8d_8x8_internal2
5777 add r0, 56*SIZEOF_PIXEL
5778 add r2, 56*SIZEOF_PIXEL
5779 call pixel_sa8d_8x8_internal2
5781 mova [esp+64-mmsize], m0
5782 call pixel_sa8d_8x8_internal2
5784 %if HIGH_BIT_DEPTH == 0
5790 add r4d, dword [esp+36]
5795 cglobal pixel_sa8d_64x64, 4,7,8
5803 call pixel_sa8d_8x8_internal2
5808 call pixel_sa8d_8x8_internal2
5814 add r0, 8*SIZEOF_PIXEL
5815 add r2, 8*SIZEOF_PIXEL
5816 call pixel_sa8d_8x8_internal2
5819 call pixel_sa8d_8x8_internal2
5821 %if HIGH_BIT_DEPTH == 0
5827 mov dword [esp+36], r4d
5831 add r0, 16*SIZEOF_PIXEL
5832 add r2, 16*SIZEOF_PIXEL
5834 call pixel_sa8d_8x8_internal2
5839 call pixel_sa8d_8x8_internal2
5845 add r0, 24*SIZEOF_PIXEL
5846 add r2, 24*SIZEOF_PIXEL
5847 call pixel_sa8d_8x8_internal2
5849 mova [esp+64-mmsize], m0
5850 call pixel_sa8d_8x8_internal2
5855 add r0, 32*SIZEOF_PIXEL
5856 add r2, 32*SIZEOF_PIXEL
5858 call pixel_sa8d_8x8_internal2
5863 call pixel_sa8d_8x8_internal2
5869 add r0, 40*SIZEOF_PIXEL
5870 add r2, 40*SIZEOF_PIXEL
5871 call pixel_sa8d_8x8_internal2
5873 mova [esp+64-mmsize], m0
5874 call pixel_sa8d_8x8_internal2
5879 add r0, 48*SIZEOF_PIXEL
5880 add r2, 48*SIZEOF_PIXEL
5882 call pixel_sa8d_8x8_internal2
5887 call pixel_sa8d_8x8_internal2
5893 add r0, 56*SIZEOF_PIXEL
5894 add r2, 56*SIZEOF_PIXEL
5895 call pixel_sa8d_8x8_internal2
5897 mova [esp+64-mmsize], m0
5898 call pixel_sa8d_8x8_internal2
5911 call pixel_sa8d_8x8_internal2
5916 call pixel_sa8d_8x8_internal2
5922 add r0, 8*SIZEOF_PIXEL
5923 add r2, 8*SIZEOF_PIXEL
5924 call pixel_sa8d_8x8_internal2
5926 mova [esp+64-mmsize], m0
5927 call pixel_sa8d_8x8_internal2
5932 add r0, 16*SIZEOF_PIXEL
5933 add r2, 16*SIZEOF_PIXEL
5935 call pixel_sa8d_8x8_internal2
5940 call pixel_sa8d_8x8_internal2
5946 add r0, 24*SIZEOF_PIXEL
5947 add r2, 24*SIZEOF_PIXEL
5948 call pixel_sa8d_8x8_internal2
5950 mova [esp+64-mmsize], m0
5951 call pixel_sa8d_8x8_internal2
5956 add r0, 32*SIZEOF_PIXEL
5957 add r2, 32*SIZEOF_PIXEL
5959 call pixel_sa8d_8x8_internal2
5964 call pixel_sa8d_8x8_internal2
5970 add r0, 40*SIZEOF_PIXEL
5971 add r2, 40*SIZEOF_PIXEL
5972 call pixel_sa8d_8x8_internal2
5974 mova [esp+64-mmsize], m0
5975 call pixel_sa8d_8x8_internal2
5980 add r0, 48*SIZEOF_PIXEL
5981 add r2, 48*SIZEOF_PIXEL
5983 call pixel_sa8d_8x8_internal2
5988 call pixel_sa8d_8x8_internal2
5994 add r0, 56*SIZEOF_PIXEL
5995 add r2, 56*SIZEOF_PIXEL
5996 call pixel_sa8d_8x8_internal2
5998 mova [esp+64-mmsize], m0
5999 call pixel_sa8d_8x8_internal2
6012 call pixel_sa8d_8x8_internal2
6017 call pixel_sa8d_8x8_internal2
6023 add r0, 8*SIZEOF_PIXEL
6024 add r2, 8*SIZEOF_PIXEL
6025 call pixel_sa8d_8x8_internal2
6027 mova [esp+64-mmsize], m0
6028 call pixel_sa8d_8x8_internal2
6033 add r0, 16*SIZEOF_PIXEL
6034 add r2, 16*SIZEOF_PIXEL
6036 call pixel_sa8d_8x8_internal2
6041 call pixel_sa8d_8x8_internal2
6047 add r0, 24*SIZEOF_PIXEL
6048 add r2, 24*SIZEOF_PIXEL
6049 call pixel_sa8d_8x8_internal2
6051 mova [esp+64-mmsize], m0
6052 call pixel_sa8d_8x8_internal2
6057 add r0, 32*SIZEOF_PIXEL
6058 add r2, 32*SIZEOF_PIXEL
6060 call pixel_sa8d_8x8_internal2
6065 call pixel_sa8d_8x8_internal2
6071 add r0, 40*SIZEOF_PIXEL
6072 add r2, 40*SIZEOF_PIXEL
6073 call pixel_sa8d_8x8_internal2
6075 mova [esp+64-mmsize], m0
6076 call pixel_sa8d_8x8_internal2
6081 add r0, 48*SIZEOF_PIXEL
6082 add r2, 48*SIZEOF_PIXEL
6084 call pixel_sa8d_8x8_internal2
6089 call pixel_sa8d_8x8_internal2
6095 add r0, 56*SIZEOF_PIXEL
6096 add r2, 56*SIZEOF_PIXEL
6097 call pixel_sa8d_8x8_internal2
6099 mova [esp+64-mmsize], m0
6100 call pixel_sa8d_8x8_internal2
6113 call pixel_sa8d_8x8_internal2
6118 call pixel_sa8d_8x8_internal2
6124 add r0, 8*SIZEOF_PIXEL
6125 add r2, 8*SIZEOF_PIXEL
6126 call pixel_sa8d_8x8_internal2
6128 mova [esp+64-mmsize], m0
6129 call pixel_sa8d_8x8_internal2
6134 add r0, 16*SIZEOF_PIXEL
6135 add r2, 16*SIZEOF_PIXEL
6137 call pixel_sa8d_8x8_internal2
6142 call pixel_sa8d_8x8_internal2
6148 add r0, 24*SIZEOF_PIXEL
6149 add r2, 24*SIZEOF_PIXEL
6150 call pixel_sa8d_8x8_internal2
6152 mova [esp+64-mmsize], m0
6153 call pixel_sa8d_8x8_internal2
6158 add r0, 32*SIZEOF_PIXEL
6159 add r2, 32*SIZEOF_PIXEL
6161 call pixel_sa8d_8x8_internal2
6166 call pixel_sa8d_8x8_internal2
6172 add r0, 40*SIZEOF_PIXEL
6173 add r2, 40*SIZEOF_PIXEL
6174 call pixel_sa8d_8x8_internal2
6176 mova [esp+64-mmsize], m0
6177 call pixel_sa8d_8x8_internal2
6182 add r0, 48*SIZEOF_PIXEL
6183 add r2, 48*SIZEOF_PIXEL
6185 call pixel_sa8d_8x8_internal2
6190 call pixel_sa8d_8x8_internal2
6196 add r0, 56*SIZEOF_PIXEL
6197 add r2, 56*SIZEOF_PIXEL
6198 call pixel_sa8d_8x8_internal2
6200 mova [esp+64-mmsize], m0
6201 call pixel_sa8d_8x8_internal2
6203 %if HIGH_BIT_DEPTH == 0
6209 add r4d, dword [esp+36]
6213 %endif ; !ARCH_X86_64
6216 ;=============================================================================
6218 ;=============================================================================
6219 %define TRANS TRANS_SSE2
6220 %define DIFFOP DIFF_UNPACK_SSE2
6221 %define LOAD_SUMSUB_8x4P LOAD_DIFF_8x4P
6222 %define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSE2
6223 %define movdqa movaps ; doesn't hurt pre-nehalem, might as well save size
6224 %define movdqu movups
6225 %define punpcklqdq movlhps
6230 %if HIGH_BIT_DEPTH == 0
6236 %define DIFFOP DIFF_SUMSUB_SSSE3
6237 %define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE
6238 %if HIGH_BIT_DEPTH == 0
6239 %define LOAD_SUMSUB_8x4P LOAD_SUMSUB_8x4P_SSSE3
6240 %define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSSE3
6245 %undef movdqa ; nehalem doesn't like movaps
6246 %undef movdqu ; movups
6247 %undef punpcklqdq ; or movlhps
6249 %define TRANS TRANS_SSE4
6250 %define LOAD_DUP_4x8P LOAD_DUP_4x8P_PENRYN
6255 ; Sandy/Ivy Bridge and Bulldozer do movddup in the load unit, so
6256 ; it's effectively free.
6257 %define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE
6262 %define TRANS TRANS_XOP
6268 %if HIGH_BIT_DEPTH == 0
6269 %define LOAD_SUMSUB_8x4P LOAD_SUMSUB8_16x4P_AVX2
6270 %define LOAD_DUP_4x8P LOAD_DUP_4x16P_AVX2
6271 %define TRANS TRANS_SSE4
6273 %macro LOAD_SUMSUB_8x8P_AVX2 7 ; args: 4*dst, 2*tmp, mul
6278 vinserti128 m%1, m%1, [r0+4*r1], 1 ; upper 128-bit lane of m%1 <- 16 bytes at [r0+4*r1]
6279 vinserti128 m%3, m%3, [r2+4*r3], 1 ; same for the second source: [r2+4*r3] into upper lane of m%3
6280 vinserti128 m%2, m%2, [r0+r4], 1 ; NOTE(review): r4 presumably holds a precomputed stride multiple for r0 - confirm in SATD_START_AVX2
6281 vinserti128 m%4, m%4, [r2+r5], 1 ; NOTE(review): r5 presumably plays the same role for r2 - confirm
6286 DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %7 ; first pair of row groups, using the mul operand (%7)
6294 vinserti128 m%3, m%3, [r0+4*r1], 1 ; second half: same load pattern, now targeting registers %3..%6
6295 vinserti128 m%5, m%5, [r2+4*r3], 1
6296 vinserti128 m%4, m%4, [r0+r4], 1
6297 vinserti128 m%6, m%6, [r2+r5], 1
6302 DIFF_SUMSUB_SSSE3 %3, %5, %4, %6, %7 ; second pair of row groups; %5/%6 are the tmp registers
6305 %macro SATD_START_AVX2 2-3 0
6319 %define TRANS TRANS_SSE4
6321 cglobal pixel_satd_16x8_internal
6322 LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 1
6323 SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6
6324 LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 0
6325 SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6
6328 cglobal pixel_satd_16x16, 4,6,8
6329 SATD_START_AVX2 m6, m7
6330 call pixel_satd_16x8_internal
6333 pixel_satd_16x8_internal:
6334 call pixel_satd_16x8_internal
6335 vextracti128 xm0, m6, 1
6340 cglobal pixel_satd_16x8, 4,6,8
6341 SATD_START_AVX2 m6, m7
6342 jmp pixel_satd_16x8_internal
6344 cglobal pixel_satd_8x8_internal
6345 LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7
6346 SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6
6349 cglobal pixel_satd_8x16, 4,6,8
6350 SATD_START_AVX2 m6, m7, 1
6351 call pixel_satd_8x8_internal
6356 call pixel_satd_8x8_internal
6357 vextracti128 xm0, m6, 1
6362 cglobal pixel_satd_8x8, 4,6,8
6363 SATD_START_AVX2 m6, m7, 1
6364 call pixel_satd_8x8_internal
6365 vextracti128 xm0, m6, 1
6370 cglobal pixel_sa8d_8x8_internal ; internal helper (no arg/register counts declared); reached via `call` from pixel_sa8d_8x8
6371 LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7 ; dst = m0..m3, tmp = m4/m5, mul = m7
6372 HADAMARD4_V 0, 1, 2, 3, 4 ; presumably the vertical 4-point Hadamard stage over m0..m3 (tmp m4) - see x86util.asm
6373 HADAMARD 8, sumsub, 0, 1, 4, 5 ; sumsub stages on the (m0,m1) and (m2,m3) pairs; the first
6374 HADAMARD 8, sumsub, 2, 3, 4, 5 ; argument (8, 2, 1) selects the stage distance - TODO confirm
6375 HADAMARD 2, sumsub, 0, 1, 4, 5 ; against the HADAMARD macro definition in x86util.asm
6376 HADAMARD 2, sumsub, 2, 3, 4, 5
6377 HADAMARD 1, amax, 0, 1, 4, 5 ; final stage uses the `amax` variant instead of `sumsub`
6378 HADAMARD 1, amax, 2, 3, 4, 5
6383 cglobal pixel_sa8d_8x8, 4,6,8
6384 SATD_START_AVX2 m6, m7, 1
6385 call pixel_sa8d_8x8_internal
6386 vextracti128 xm1, m6, 1
6393 %endif ; HIGH_BIT_DEPTH
6395 ; Input 16bpp, Output 8bpp (exported below under the name downShift_16)
6396 ;------------------------------------------------------------------------------------------------------------------------
6397 ;void planecopy_sp(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
6398 ;------------------------------------------------------------------------------------------------------------------------
6400 cglobal downShift_16, 7,7,3 ; NOTE(review): prototype above lists 8 parameters (mask) but only 7 args are declared here - confirm mask is unused
6401 movd m0, r6d ; m0 = shift (taken from r6d, the 7th argument slot)
6407 movu m1, [r0 + r6 * 2] ; r6 is used as an index here, scaled by 2 for the 16-bit source samples
6408 movu m2, [r0 + r6 * 2 + 16] ; second unaligned load, 16 bytes further along the same row
6424 ; process the last row of every frame separately [handles a width that is not a multiple of 16]
6489 ; Input 8bpp, Output 16bpp (exported below under the name upShift_8)
6490 ;---------------------------------------------------------------------------------------------------------------------
6491 ;void planecopy_cp(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift)
6492 ;---------------------------------------------------------------------------------------------------------------------
6494 cglobal upShift_8, 7,7,3
6496 movd m2, r6d ; m2 = shift (taken from r6d, the 7th argument slot)
6503 pmovzxbw m0,[r0 + r6] ; zero-extend source bytes at index r6 to 16-bit words
6504 pmovzxbw m1,[r0 + r6 + 8] ; next 8 source bytes, likewise widened
6507 movu [r2 + r6 * 2], m0 ; store to dst; index scaled by 2 because each output sample is 16 bits
6508 movu [r2 + r6 * 2 + 16], m1 ; second store, 16 bytes further along the destination row
6520 ; process the last row of every frame separately [handles a width that is not a multiple of 16]
6524 pmovzxbw m1,[r0 + 8]
6564 movzx r3d, byte [r0]
6567 movzx r3d, byte [r0 + 1]
6577 movzx r3d, byte [r0]