;*****************************************************************************
;* pixel.asm: x86 pixel metrics
;*****************************************************************************
;* Copyright (C) 2003-2013 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <holger@lubitz.org>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*          Alex Izvorski <aizvorksi@gmail.com>
;*          Fiona Glaser <fiona@x264.com>
;*          Oskar Arvidsson <oskar@irock.se>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at license @ x265.com.
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA 32
hmul_16p:  times 16 db 1
           times 8 db 1, -1
hmul_8p:   times 8 db 1
           times 4 db 1, -1
           times 8 db 1
           times 4 db 1, -1
hmul_4p:   times 2 db 1, 1, 1, 1, 1, -1, 1, -1
mask_10:   times 4 dw 0, -1
mask_1100: times 2 dd 0, -1

ALIGN 32
transd_shuf1: SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14
transd_shuf2: SHUFFLE_MASK_W 1, 9, 3, 11, 5, 13, 7, 15

sw_f0: dq 0xfff0, 0
pd_f0: times 4 dd 0xffff0000

pw_76543210: dw 0, 1, 2, 3, 4, 5, 6, 7

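;-----------------------------------------------------------------------------
; The hmul_* constants are +/-1 multiplier patterns for pmaddubsw, which
; multiplies unsigned pixels by these signed bytes and sums adjacent pairs
; into words: a {1,1} pair yields a+b, a {1,-1} pair yields a-b, so the first
; horizontal sum/diff stage of the Hadamard transform comes free with the
; load. A minimal C model of that instruction (illustrative only; the
; instruction's saturation never triggers with +/-1 multipliers):
;
;   #include <stdint.h>
;   static void pmaddubsw_model(const uint8_t src[16], const int8_t mul[16],
;                               int16_t dst[8])
;   {
;       for (int i = 0; i < 8; i++)
;           dst[i] = (int16_t)(src[2*i] * mul[2*i] + src[2*i+1] * mul[2*i+1]);
;   }
;-----------------------------------------------------------------------------
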
SECTION .text

cextern pb_0
cextern pb_1
cextern pw_1
cextern pw_8
cextern pw_16
cextern pw_32
cextern pw_00ff
cextern pw_ppppmmmm
cextern pw_ppmmppmm
cextern pw_pmpmpmpm
cextern pw_pmmpzzzz
cextern pd_1
cextern popcnt_table

;=============================================================================
; SATD
;=============================================================================

%macro JDUP 2
%if cpuflag(sse4)
    ; just use shufps on anything post conroe
    shufps %1, %2, 0
%elif cpuflag(ssse3) && notcpuflag(atom)
    ; join 2x 32 bit and duplicate them
    ; emulating shufps is faster on conroe
    punpcklqdq %1, %2
    movsldup %1, %1
%else
    ; doesn't need to dup. sse2 does things by zero extending to words and full h_2d
    punpckldq %1, %2
%endif
%endmacro
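
;-----------------------------------------------------------------------------
; JDUP joins two 32-bit loads (one 4-pixel row each) and, on ssse3+,
; duplicates each across the register so a single 8-wide transform can cover
; two 4-wide blocks. A C sketch of the dword lane layout (hypothetical
; helper, not project code):
;
;   #include <stdint.h>
;   static void jdup_model(uint32_t dst[4], uint32_t x, uint32_t y)
;   {
;       /* shufps/movsldup path: { x, x, y, y }; sse2 keeps { x, y, 0, 0 } */
;       dst[0] = x; dst[1] = x; dst[2] = y; dst[3] = y;
;   }
;-----------------------------------------------------------------------------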

%macro HSUMSUB 5
    pmaddubsw m%2, m%5
    pmaddubsw m%1, m%5
    pmaddubsw m%4, m%5
    pmaddubsw m%3, m%5
%endmacro

%macro DIFF_UNPACK_SSE2 5
    punpcklbw m%1, m%5
    punpcklbw m%2, m%5
    punpcklbw m%3, m%5
    punpcklbw m%4, m%5
    psubw m%1, m%2
    psubw m%3, m%4
%endmacro

%macro DIFF_SUMSUB_SSSE3 5
    HSUMSUB %1, %2, %3, %4, %5
    psubw m%1, m%2
    psubw m%3, m%4
%endmacro

%macro LOAD_DUP_2x4P 4 ; dst, tmp, 2* pointer
    movd %1, %3
    movd %2, %4
    JDUP %1, %2
%endmacro

%macro LOAD_DUP_4x8P_CONROE 8 ; 4*dst, 4*pointer
    movddup m%3, %6
    movddup m%4, %8
    movddup m%1, %5
    movddup m%2, %7
%endmacro

%macro LOAD_DUP_4x8P_PENRYN 8
    ; penryn and nehalem run punpcklqdq and movddup in different units
    movh m%3, %6
    movh m%4, %8
    punpcklqdq m%3, m%3
    movddup m%1, %5
    punpcklqdq m%4, m%4
    movddup m%2, %7
%endmacro

%macro LOAD_SUMSUB_8x2P 9
    LOAD_DUP_4x8P %1, %2, %3, %4, %6, %7, %8, %9
    DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
%endmacro

%macro LOAD_SUMSUB_8x4P_SSSE3 7-11 r0, r2, 0, 0
; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
    LOAD_SUMSUB_8x2P %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3]
    LOAD_SUMSUB_8x2P %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5]
%if %10
    lea %8, [%8+4*r1]
    lea %9, [%9+4*r3]
%endif
%endmacro

%macro LOAD_SUMSUB_16P_SSSE3 7 ; 2*dst, 2*tmp, mul, 2*ptr
    movddup m%1, [%7]
    movddup m%2, [%7+8]
    mova m%4, [%6]
    movddup m%3, m%4
    punpckhqdq m%4, m%4
    DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
%endmacro

%macro LOAD_SUMSUB_16P_SSE2 7 ; 2*dst, 2*tmp, mask, 2*ptr
    movu m%4, [%7]
    mova m%2, [%6]
    DEINTB %1, %2, %3, %4, %5
    psubw m%1, m%3
    psubw m%2, m%4
    SUMSUB_BA w, %1, %2, %3
%endmacro

%macro LOAD_SUMSUB_16x4P 10-13 r0, r2, none
; 8x dest, 1x tmp, 1x mul, [2* ptr] [2nd tmp]
    LOAD_SUMSUB_16P %1, %5, %2, %3, %10, %11, %12
    LOAD_SUMSUB_16P %2, %6, %3, %4, %10, %11+r1, %12+r3
    LOAD_SUMSUB_16P %3, %7, %4, %9, %10, %11+2*r1, %12+2*r3
    LOAD_SUMSUB_16P %4, %8, %13, %9, %10, %11+r4, %12+r5
%endmacro

%macro LOAD_SUMSUB_16x2P_AVX2 9
; 2*dst, 2*tmp, mul, 4*ptr
    vbroadcasti128 m%1, [%6]
    vbroadcasti128 m%3, [%7]
    vbroadcasti128 m%2, [%8]
    vbroadcasti128 m%4, [%9]
    DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
%endmacro
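
;-----------------------------------------------------------------------------
; vbroadcasti128 copies one 16-byte row into both 128-bit lanes of a ymm
; register. With the 32-byte hmul_16p pattern (all ones in the low lane,
; alternating +/-1 in the high lane, as loaded by SATD_START_SSE2 when
; mmsize==32), a single pmaddubsw then yields the horizontal sums of that
; row in one lane and the differences in the other. Lane model in C
; (illustrative sketch under that assumption):
;
;   #include <stdint.h>
;   static void bcast_sumsub_model(const uint8_t row[16], int16_t out[16])
;   {
;       for (int i = 0; i < 8; i++) {
;           out[i]     = row[2*i] + row[2*i+1];  /* low lane:  {1, 1} */
;           out[i + 8] = row[2*i] - row[2*i+1];  /* high lane: {1,-1} */
;       }
;   }
;-----------------------------------------------------------------------------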

%macro LOAD_SUMSUB_16x4P_AVX2 7-11 r0, r2, 0, 0
; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
    LOAD_SUMSUB_16x2P_AVX2 %1, %2, %5, %6, %7, %8, %9, %8+r1, %9+r3
    LOAD_SUMSUB_16x2P_AVX2 %3, %4, %5, %6, %7, %8+2*r1, %9+2*r3, %8+r4, %9+r5
%if %10
    lea %8, [%8+4*r1]
    lea %9, [%9+4*r3]
%endif
%endmacro

%macro LOAD_DUP_4x16P_AVX2 8 ; 4*dst, 4*pointer
    mova xm%3, %6
    mova xm%4, %8
    mova xm%1, %5
    mova xm%2, %7
    vpermq m%3, m%3, q0011
    vpermq m%4, m%4, q0011
    vpermq m%1, m%1, q0011
    vpermq m%2, m%2, q0011
%endmacro

%macro LOAD_SUMSUB8_16x2P_AVX2 9
; 2*dst, 2*tmp, mul, 4*ptr
    LOAD_DUP_4x16P_AVX2 %1, %2, %3, %4, %6, %7, %8, %9
    DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
%endmacro

%macro LOAD_SUMSUB8_16x4P_AVX2 7-11 r0, r2, 0, 0
; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
    LOAD_SUMSUB8_16x2P_AVX2 %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3]
    LOAD_SUMSUB8_16x2P_AVX2 %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5]
%if %10
    lea %8, [%8+4*r1]
    lea %9, [%9+4*r3]
%endif
%endmacro

; in: r4=3*stride1, r5=3*stride2
; in: %2 = horizontal offset
; in: %3 = whether we need to increment pix1 and pix2
; clobber: m3..m7
; out: %1 = satd
%macro SATD_4x4_MMX 3
    %xdefine %%n n%1
    %assign offset %2*SIZEOF_PIXEL
    LOAD_DIFF m4, m3, none, [r0+     offset], [r2+     offset]
    LOAD_DIFF m5, m3, none, [r0+  r1+offset], [r2+  r3+offset]
    LOAD_DIFF m6, m3, none, [r0+2*r1+offset], [r2+2*r3+offset]
    LOAD_DIFF m7, m3, none, [r0+  r4+offset], [r2+  r5+offset]
%if %3
    lea r0, [r0+4*r1]
    lea r2, [r2+4*r3]
%endif
    HADAMARD4_2D 4, 5, 6, 7, 3, %%n
    paddw m4, m6
    SWAP %%n, 4
%endmacro
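
;-----------------------------------------------------------------------------
; What the 4x4 kernel computes, as a scalar C reference (8-bit path; a sketch
; for clarity only, the names are hypothetical): the sum of absolute values
; of the 2D Hadamard transform of the pixel differences, halved. The asm
; folds the final butterfly into a max of absolute values via
; |a+b| + |a-b| = 2*max(|a|,|b|), which absorbs the halving.
;
;   #include <stdint.h>
;   #include <stdlib.h>
;   static int satd_4x4_ref(const uint8_t *pix1, intptr_t stride1,
;                           const uint8_t *pix2, intptr_t stride2)
;   {
;       int d[4][4], sum = 0;
;       for (int i = 0; i < 4; i++)
;           for (int j = 0; j < 4; j++)
;               d[i][j] = pix1[i*stride1 + j] - pix2[i*stride2 + j];
;       for (int i = 0; i < 4; i++) {   /* horizontal butterflies */
;           int a0 = d[i][0] + d[i][1], a1 = d[i][0] - d[i][1];
;           int a2 = d[i][2] + d[i][3], a3 = d[i][2] - d[i][3];
;           d[i][0] = a0 + a2; d[i][2] = a0 - a2;
;           d[i][1] = a1 + a3; d[i][3] = a1 - a3;
;       }
;       for (int j = 0; j < 4; j++) {   /* vertical butterflies + abs */
;           int a0 = d[0][j] + d[1][j], a1 = d[0][j] - d[1][j];
;           int a2 = d[2][j] + d[3][j], a3 = d[2][j] - d[3][j];
;           sum += abs(a0 + a2) + abs(a0 - a2) + abs(a1 + a3) + abs(a1 - a3);
;       }
;       return sum >> 1;
;   }
;-----------------------------------------------------------------------------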

; in: %1 = horizontal if 0, vertical if 1
%macro SATD_8x4_SSE 8-9
%if %1
    HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax
%else
    HADAMARD4_V %2, %3, %4, %5, %6
    ; doing the abs first is a slight advantage
    ABSW2 m%2, m%4, m%2, m%4, m%6, m%7
    ABSW2 m%3, m%5, m%3, m%5, m%6, m%7
    HADAMARD 1, max, %2, %4, %6, %7
%endif
%ifnidn %9, swap
    paddw m%8, m%2
%else
    SWAP %8, %2
%endif
%if %1
    paddw m%8, m%4
%else
    HADAMARD 1, max, %3, %5, %6, %7
    paddw m%8, m%3
%endif
%endmacro

%macro SATD_8x4_1_SSE 10
%if %1
    HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax
%else
    HADAMARD4_V %2, %3, %4, %5, %6
    ; doing the abs first is a slight advantage
    ABSW2 m%2, m%4, m%2, m%4, m%6, m%7
    ABSW2 m%3, m%5, m%3, m%5, m%6, m%7
    HADAMARD 1, max, %2, %4, %6, %7
%endif

    pxor m%10, m%10
    mova m%9, m%2
    punpcklwd m%9, m%10
    paddd m%8, m%9
    mova m%9, m%2
    punpckhwd m%9, m%10
    paddd m%8, m%9

%if %1
    pxor m%10, m%10
    mova m%9, m%4
    punpcklwd m%9, m%10
    paddd m%8, m%9
    mova m%9, m%4
    punpckhwd m%9, m%10
    paddd m%8, m%9
%else
    HADAMARD 1, max, %3, %5, %6, %7
    pxor m%10, m%10
    mova m%9, m%3
    punpcklwd m%9, m%10
    paddd m%8, m%9
    mova m%9, m%3
    punpckhwd m%9, m%10
    paddd m%8, m%9
%endif
%endmacro
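
;-----------------------------------------------------------------------------
; SATD_8x4_1_SSE widens before accumulating: each 16-bit partial SATD is
; zero-extended against a zeroed register (punpcklwd/punpckhwd) and added
; with paddd, since summing many 8x4 results in 16-bit lanes could exceed
; 65535. The lane arithmetic, modeled in C (hypothetical helper):
;
;   #include <stdint.h>
;   static void accum_widen(uint32_t acc[4], const uint16_t part[8])
;   {
;       for (int i = 0; i < 4; i++)
;           acc[i] += (uint32_t)part[i] + part[i + 4];
;   }
;-----------------------------------------------------------------------------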

%macro SATD_START_MMX 0
    FIX_STRIDES r1, r3
    lea r4, [3*r1] ; 3*stride1
    lea r5, [3*r3] ; 3*stride2
%endmacro

%macro SATD_END_MMX 0
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
    movd eax, m0
%else ; !HIGH_BIT_DEPTH
    pshufw m1, m0, q1032
    paddw m0, m1
    pshufw m1, m0, q2301
    paddw m0, m1
    movd eax, m0
    and eax, 0xffff
%endif ; HIGH_BIT_DEPTH
    RET
%endmacro
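
;-----------------------------------------------------------------------------
; The 8-bit ending folds the four word lanes of m0 in halves (q1032 swaps the
; dword pair, q2301 the words within it) and masks the result to 16 bits.
; Equivalent C (a sketch, not project code):
;
;   #include <stdint.h>
;   static int satd_end_model(const uint16_t v[4])
;   {
;       return (uint16_t)(v[0] + v[1] + v[2] + v[3]);
;   }
;-----------------------------------------------------------------------------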

; FIXME avoid the spilling of regs to hold 3*stride.
; for small blocks on x86_32, modify pixel pointer instead.

;-----------------------------------------------------------------------------
; int pixel_satd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
INIT_MMX mmx2
cglobal pixel_satd_16x4_internal
    SATD_4x4_MMX m2,  0, 0
    SATD_4x4_MMX m1,  4, 0
    paddw m0, m2
    SATD_4x4_MMX m2,  8, 0
    paddw m0, m1
    SATD_4x4_MMX m1, 12, 0
    paddw m0, m2
    paddw m0, m1
    ret

cglobal pixel_satd_8x8_internal
    SATD_4x4_MMX m2, 0, 0
    SATD_4x4_MMX m1, 4, 1
    paddw m0, m2
    paddw m0, m1
pixel_satd_8x4_internal_mmx2:
    SATD_4x4_MMX m2, 0, 0
    SATD_4x4_MMX m1, 4, 0
    paddw m0, m2
    paddw m0, m1
    ret

%if HIGH_BIT_DEPTH
%macro SATD_MxN_MMX 3
cglobal pixel_satd_%1x%2, 4,7
    SATD_START_MMX
    pxor m0, m0
    call pixel_satd_%1x%3_internal_mmx2
    HADDUW m0, m1
    movd r6d, m0
%rep %2/%3-1
    pxor m0, m0
    lea r0, [r0+4*r1]
    lea r2, [r2+4*r3]
    call pixel_satd_%1x%3_internal_mmx2
    movd m2, r4
    HADDUW m0, m1
    movd r4, m0
    add r6, r4
    movd r4, m2
%endrep
    movifnidn eax, r6d
    RET
%endmacro

SATD_MxN_MMX 16, 16, 4
SATD_MxN_MMX 16,  8, 4
SATD_MxN_MMX  8, 16, 8
%endif ; HIGH_BIT_DEPTH
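
;-----------------------------------------------------------------------------
; At HIGH_BIT_DEPTH the word accumulator m0 is flushed through HADDUW into a
; GPR after every %1x%3 call: 10-bit differences are 4x larger than 8-bit
; ones, so the 16-bit lanes would overflow if whole blocks were summed there.
; The accumulation pattern, sketched in C with a hypothetical per-step kernel:
;
;   #include <stdint.h>
;   int satd_step(const uint16_t*, intptr_t, const uint16_t*, intptr_t);
;   static int satd_MxN(const uint16_t *p1, intptr_t s1,
;                       const uint16_t *p2, intptr_t s2, int nsteps)
;   {
;       int total = 0;                      /* GPR-sized running sum (r6) */
;       for (int i = 0; i < nsteps; i++) {
;           total += satd_step(p1, s1, p2, s2);
;           p1 += 4 * s1;                   /* lea r0, [r0+4*r1] */
;           p2 += 4 * s2;
;       }
;       return total;
;   }
;-----------------------------------------------------------------------------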

%if HIGH_BIT_DEPTH == 0
cglobal pixel_satd_16x16, 4,6
    SATD_START_MMX
    pxor m0, m0
%rep 3
    call pixel_satd_16x4_internal_mmx2
    lea r0, [r0+4*r1]
    lea r2, [r2+4*r3]
%endrep
    call pixel_satd_16x4_internal_mmx2
    HADDUW m0, m1
    movd eax, m0
    RET

cglobal pixel_satd_16x8, 4,6
    SATD_START_MMX
    pxor m0, m0
    call pixel_satd_16x4_internal_mmx2
    lea r0, [r0+4*r1]
    lea r2, [r2+4*r3]
    call pixel_satd_16x4_internal_mmx2
    SATD_END_MMX

cglobal pixel_satd_8x16, 4,6
    SATD_START_MMX
    pxor m0, m0
    call pixel_satd_8x8_internal_mmx2
    lea r0, [r0+4*r1]
    lea r2, [r2+4*r3]
    call pixel_satd_8x8_internal_mmx2
    SATD_END_MMX
%endif ; !HIGH_BIT_DEPTH

cglobal pixel_satd_8x8, 4,6
    SATD_START_MMX
    pxor m0, m0
    call pixel_satd_8x8_internal_mmx2
    SATD_END_MMX

cglobal pixel_satd_8x4, 4,6
    SATD_START_MMX
    pxor m0, m0
    call pixel_satd_8x4_internal_mmx2
    SATD_END_MMX

cglobal pixel_satd_4x16, 4,6
    SATD_START_MMX
    SATD_4x4_MMX m0, 0, 1
    SATD_4x4_MMX m1, 0, 1
    paddw m0, m1
    SATD_4x4_MMX m1, 0, 1
    paddw m0, m1
    SATD_4x4_MMX m1, 0, 0
    paddw m0, m1
    SATD_END_MMX

cglobal pixel_satd_4x8, 4,6
    SATD_START_MMX
    SATD_4x4_MMX m0, 0, 1
    SATD_4x4_MMX m1, 0, 0
    paddw m0, m1
    SATD_END_MMX

cglobal pixel_satd_4x4, 4,6
    SATD_START_MMX
    SATD_4x4_MMX m0, 0, 0
    SATD_END_MMX

%macro SATD_START_SSE2 2-3 0
    FIX_STRIDES r1, r3
%if HIGH_BIT_DEPTH && %3
    pxor %2, %2
%elif cpuflag(ssse3) && notcpuflag(atom)
%if mmsize==32
    mova %2, [hmul_16p]
%else
    mova %2, [hmul_8p]
%endif
%endif
    lea r4, [3*r1]
    lea r5, [3*r3]
    pxor %1, %1
%endmacro

%macro SATD_END_SSE2 1-2
%if HIGH_BIT_DEPTH
    HADDUW %1, xm0
%if %0 == 2
    paddd %1, %2
%endif
%else
    HADDW %1, xm7
%endif
    movd eax, %1
    RET
%endmacro

%macro SATD_ACCUM 3
%if HIGH_BIT_DEPTH
    HADDUW %1, %2
    paddd %3, %1
    pxor %1, %1
%endif
%endmacro

%macro BACKUP_POINTERS 0
%if ARCH_X86_64
%if WIN64
    PUSH r7
%endif
    mov r6, r0
    mov r7, r2
%endif
%endmacro

%macro RESTORE_AND_INC_POINTERS 0
%if ARCH_X86_64
    lea r0, [r6+8*SIZEOF_PIXEL]
    lea r2, [r7+8*SIZEOF_PIXEL]
%if WIN64
    POP r7
%endif
%else
    mov r0, r0mp
    mov r2, r2mp
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
%endif
%endmacro
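
;-----------------------------------------------------------------------------
; Wide blocks are processed as 8-pixel-wide column strips: the base pointers
; are saved (in r6/r7 on x86-64, reloaded from the stack args r0mp/r2mp on
; x86-32) and advanced by 8 pixels between strips. The same strategy in C,
; with a hypothetical 8-wide kernel:
;
;   #include <stdint.h>
;   int satd_8xH(const uint8_t*, intptr_t, const uint8_t*, intptr_t, int);
;   static int satd_16xH(const uint8_t *p1, intptr_t s1,
;                        const uint8_t *p2, intptr_t s2, int h)
;   {
;       return satd_8xH(p1,     s1, p2,     s2, h)
;            + satd_8xH(p1 + 8, s1, p2 + 8, s2, h);
;   }
;-----------------------------------------------------------------------------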

%macro SATD_4x8_SSE 3-4
%if HIGH_BIT_DEPTH
    movh m0, [r0+0*r1]
    movh m4, [r2+0*r3]
    movh m1, [r0+1*r1]
    movh m5, [r2+1*r3]
    movhps m0, [r0+4*r1]
    movhps m4, [r2+4*r3]
    movh m2, [r0+2*r1]
    movh m6, [r2+2*r3]
    psubw m0, m4
    movh m3, [r0+r4]
    movh m4, [r2+r5]
    lea r0, [r0+4*r1]
    lea r2, [r2+4*r3]
    movhps m1, [r0+1*r1]
    movhps m5, [r2+1*r3]
    movhps m2, [r0+2*r1]
    movhps m6, [r2+2*r3]
    psubw m1, m5
    movhps m3, [r0+r4]
    movhps m4, [r2+r5]
    psubw m2, m6
    psubw m3, m4
%else ; !HIGH_BIT_DEPTH
    movd m4, [r2]
    movd m5, [r2+r3]
    movd m6, [r2+2*r3]
    add r2, r5
    movd m0, [r0]
    movd m1, [r0+r1]
    movd m2, [r0+2*r1]
    add r0, r4
    movd m3, [r2+r3]
    JDUP m4, m3
    movd m3, [r0+r1]
    JDUP m0, m3
    movd m3, [r2+2*r3]
    JDUP m5, m3
    movd m3, [r0+2*r1]
    JDUP m1, m3
%if %1==0 && %2==1
    mova m3, [hmul_4p]
    DIFFOP 0, 4, 1, 5, 3
%else
    DIFFOP 0, 4, 1, 5, 7
%endif
    movd m5, [r2]
    add r2, r5
    movd m3, [r0]
    add r0, r4
    movd m4, [r2]
    JDUP m6, m4
    movd m4, [r0]
    JDUP m2, m4
    movd m4, [r2+r3]
    JDUP m5, m4
    movd m4, [r0+r1]
    JDUP m3, m4
%if %1==0 && %2==1
    mova m4, [hmul_4p]
    DIFFOP 2, 6, 3, 5, 4
%else
    DIFFOP 2, 6, 3, 5, 7
%endif
%endif ; HIGH_BIT_DEPTH
%if %0 == 4
    SATD_8x4_1_SSE %1, 0, 1, 2, 3, 4, 5, 7, %3, %4
%else
    SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 7, %3
%endif
%endmacro

;-----------------------------------------------------------------------------
; int pixel_satd_8x4( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
%macro SATDS_SSE2 0
%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH)

%if cpuflag(ssse3) && (vertical==0 || HIGH_BIT_DEPTH)
cglobal pixel_satd_4x4, 4, 6, 6
    SATD_START_MMX
    mova m4, [hmul_4p]
    LOAD_DUP_2x4P m2, m5, [r2], [r2+r3]
    LOAD_DUP_2x4P m3, m5, [r2+2*r3], [r2+r5]
    LOAD_DUP_2x4P m0, m5, [r0], [r0+r1]
    LOAD_DUP_2x4P m1, m5, [r0+2*r1], [r0+r4]
    DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4
    HADAMARD 0, sumsub, 0, 1, 2, 3
    HADAMARD 4, sumsub, 0, 1, 2, 3
    HADAMARD 1, amax, 0, 1, 2, 3
    HADDW m0, m1
    movd eax, m0
    RET
%endif

cglobal pixel_satd_4x8, 4, 6, 8
    SATD_START_MMX
%if vertical==0
    mova m7, [hmul_4p]
%endif
    SATD_4x8_SSE vertical, 0, swap
    HADDW m7, m1
    movd eax, m7
    RET

cglobal pixel_satd_4x16, 4, 6, 8
    SATD_START_MMX
%if vertical==0
    mova m7, [hmul_4p]
%endif
    SATD_4x8_SSE vertical, 0, swap
    lea r0, [r0+r1*2*SIZEOF_PIXEL]
    lea r2, [r2+r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, add
    HADDW m7, m1
    movd eax, m7
    RET

cglobal pixel_satd_8x8_internal
    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
    SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6
%%pixel_satd_8x4_internal:
    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
    SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6
    ret

cglobal pixel_satd_8x8_internal2
%if WIN64
    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
    SATD_8x4_1_SSE vertical, 0, 1, 2, 3, 4, 5, 6, 12, 13
%%pixel_satd_8x4_internal2:
    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
    SATD_8x4_1_SSE vertical, 0, 1, 2, 3, 4, 5, 6, 12, 13
%else
    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
    SATD_8x4_1_SSE vertical, 0, 1, 2, 3, 4, 5, 6, 4, 5
%%pixel_satd_8x4_internal2:
    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
    SATD_8x4_1_SSE vertical, 0, 1, 2, 3, 4, 5, 6, 4, 5
%endif
    ret

; 16x8 regresses on phenom win64, 16x16 is almost the same (too many spilled registers)
; These aren't any faster on AVX systems with fast movddup (Bulldozer, Sandy Bridge)
%if HIGH_BIT_DEPTH == 0 && (WIN64 || UNIX64) && notcpuflag(avx)

cglobal pixel_satd_16x4_internal2
    LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11
    lea r2, [r2+4*r3]
    lea r0, [r0+4*r1]
    SATD_8x4_1_SSE 0, 0, 1, 2, 3, 6, 11, 10, 12, 13
    SATD_8x4_1_SSE 0, 4, 8, 5, 9, 6, 3, 10, 12, 13
    ret

cglobal pixel_satd_16x4, 4,6,14
    SATD_START_SSE2 m10, m7
%if vertical
    mova m7, [pw_00ff]
%endif
    call pixel_satd_16x4_internal2
    pxor m9, m9
    movhlps m9, m10
    paddd m10, m9
    pshufd m9, m10, 1
    paddd m10, m9
    movd eax, m10
    RET
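
;-----------------------------------------------------------------------------
; This epilogue recurs after every block size below: it horizontally adds the
; four dword lanes of the accumulator (high half onto low, then lane 1 onto
; lane 0). In C (a sketch):
;
;   #include <stdint.h>
;   static int32_t hsum_d(const int32_t v[4])
;   {
;       return (v[0] + v[2]) + (v[1] + v[3]); /* movhlps+paddd, pshufd+paddd */
;   }
;-----------------------------------------------------------------------------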

cglobal pixel_satd_16x8, 4,6,14
    SATD_START_SSE2 m10, m7
%if vertical
    mova m7, [pw_00ff]
%endif
    jmp %%pixel_satd_16x8_internal

cglobal pixel_satd_16x12, 4,6,14
    SATD_START_SSE2 m10, m7
%if vertical
    mova m7, [pw_00ff]
%endif
    call pixel_satd_16x4_internal2
    jmp %%pixel_satd_16x8_internal

cglobal pixel_satd_16x32, 4,6,14
    SATD_START_SSE2 m10, m7
%if vertical
    mova m7, [pw_00ff]
%endif
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    jmp %%pixel_satd_16x8_internal

cglobal pixel_satd_16x64, 4,6,14
    SATD_START_SSE2 m10, m7
%if vertical
    mova m7, [pw_00ff]
%endif
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    jmp %%pixel_satd_16x8_internal

cglobal pixel_satd_16x16, 4,6,14
    SATD_START_SSE2 m10, m7
%if vertical
    mova m7, [pw_00ff]
%endif
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
%%pixel_satd_16x8_internal:
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    pxor m9, m9
    movhlps m9, m10
    paddd m10, m9
    pshufd m9, m10, 1
    paddd m10, m9
    movd eax, m10
    RET

cglobal pixel_satd_32x8, 4,8,14 ;if WIN64 && notcpuflag(avx)
    SATD_START_SSE2 m10, m7
    mov r6, r0
    mov r7, r2
%if vertical
    mova m7, [pw_00ff]
%endif
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    lea r0, [r6 + 16]
    lea r2, [r7 + 16]
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    pxor m9, m9
    movhlps m9, m10
    paddd m10, m9
    pshufd m9, m10, 1
    paddd m10, m9
    movd eax, m10
    RET

cglobal pixel_satd_32x16, 4,8,14 ;if WIN64 && notcpuflag(avx)
    SATD_START_SSE2 m10, m7
    mov r6, r0
    mov r7, r2
%if vertical
    mova m7, [pw_00ff]
%endif
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    lea r0, [r6 + 16]
    lea r2, [r7 + 16]
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    pxor m9, m9
    movhlps m9, m10
    paddd m10, m9
    pshufd m9, m10, 1
    paddd m10, m9
    movd eax, m10
    RET

cglobal pixel_satd_32x24, 4,8,14 ;if WIN64 && notcpuflag(avx)
    SATD_START_SSE2 m10, m7
    mov r6, r0
    mov r7, r2
%if vertical
    mova m7, [pw_00ff]
%endif
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    lea r0, [r6 + 16]
    lea r2, [r7 + 16]
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    pxor m9, m9
    movhlps m9, m10
    paddd m10, m9
    pshufd m9, m10, 1
    paddd m10, m9
    movd eax, m10
    RET

cglobal pixel_satd_32x32, 4,8,14 ;if WIN64 && notcpuflag(avx)
    SATD_START_SSE2 m10, m7
    mov r6, r0
    mov r7, r2
%if vertical
    mova m7, [pw_00ff]
%endif
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    lea r0, [r6 + 16]
    lea r2, [r7 + 16]
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    pxor m9, m9
    movhlps m9, m10
    paddd m10, m9
    pshufd m9, m10, 1
    paddd m10, m9
    movd eax, m10
    RET

cglobal pixel_satd_32x64, 4,8,14 ;if WIN64 && notcpuflag(avx)
    SATD_START_SSE2 m10, m7
    mov r6, r0
    mov r7, r2
%if vertical
    mova m7, [pw_00ff]
%endif
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    lea r0, [r6 + 16]
    lea r2, [r7 + 16]
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    pxor m9, m9
    movhlps m9, m10
    paddd m10, m9
    pshufd m9, m10, 1
    paddd m10, m9
    movd eax, m10
    RET

cglobal pixel_satd_48x64, 4,8,14 ;if WIN64 && notcpuflag(avx)
    SATD_START_SSE2 m10, m7
    mov r6, r0
    mov r7, r2
%if vertical
    mova m7, [pw_00ff]
%endif
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    lea r0, [r6 + 16]
    lea r2, [r7 + 16]
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    lea r0, [r6 + 32]
    lea r2, [r7 + 32]
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    pxor m9, m9
    movhlps m9, m10
    paddd m10, m9
    pshufd m9, m10, 1
    paddd m10, m9
    movd eax, m10
    RET

cglobal pixel_satd_64x16, 4,8,14 ;if WIN64 && notcpuflag(avx)
    SATD_START_SSE2 m10, m7
    mov r6, r0
    mov r7, r2
%if vertical
    mova m7, [pw_00ff]
%endif
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    lea r0, [r6 + 16]
    lea r2, [r7 + 16]
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    lea r0, [r6 + 32]
    lea r2, [r7 + 32]
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    lea r0, [r6 + 48]
    lea r2, [r7 + 48]
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    pxor m9, m9
    movhlps m9, m10
    paddd m10, m9
    pshufd m9, m10, 1
    paddd m10, m9
    movd eax, m10
    RET

cglobal pixel_satd_64x32, 4,8,14 ;if WIN64 && notcpuflag(avx)
    SATD_START_SSE2 m10, m7
    mov r6, r0
    mov r7, r2
%if vertical
    mova m7, [pw_00ff]
%endif
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    lea r0, [r6 + 16]
    lea r2, [r7 + 16]
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    lea r0, [r6 + 32]
    lea r2, [r7 + 32]
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    lea r0, [r6 + 48]
    lea r2, [r7 + 48]
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2

    pxor m9, m9
    movhlps m9, m10
    paddd m10, m9
    pshufd m9, m10, 1
    paddd m10, m9
    movd eax, m10
    RET

cglobal pixel_satd_64x48, 4,8,14 ;if WIN64 && notcpuflag(avx)
    SATD_START_SSE2 m10, m7
    mov r6, r0
    mov r7, r2
%if vertical
    mova m7, [pw_00ff]
%endif
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    lea r0, [r6 + 16]
    lea r2, [r7 + 16]
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    lea r0, [r6 + 32]
    lea r2, [r7 + 32]
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    lea r0, [r6 + 48]
    lea r2, [r7 + 48]
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2

    pxor m9, m9
    movhlps m9, m10
    paddd m10, m9
    pshufd m9, m10, 1
    paddd m10, m9
    movd eax, m10
    RET

cglobal pixel_satd_64x64, 4,8,14 ;if WIN64 && notcpuflag(avx)
    SATD_START_SSE2 m10, m7
    mov r6, r0
    mov r7, r2
%if vertical
    mova m7, [pw_00ff]
%endif
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    lea r0, [r6 + 16]
    lea r2, [r7 + 16]
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    lea r0, [r6 + 32]
    lea r2, [r7 + 32]
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    lea r0, [r6 + 48]
    lea r2, [r7 + 48]
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2

    pxor m9, m9
    movhlps m9, m10
    paddd m10, m9
    pshufd m9, m10, 1
    paddd m10, m9
    movd eax, m10
    RET

%else

%if WIN64
cglobal pixel_satd_32x8, 4,8,14 ;if WIN64 && cpuflag(avx)
    SATD_START_SSE2 m6, m7
    mov r6, r0
    mov r7, r2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 8*SIZEOF_PIXEL]
    lea r2, [r7 + 8*SIZEOF_PIXEL]
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 16*SIZEOF_PIXEL]
    lea r2, [r7 + 16*SIZEOF_PIXEL]
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 24*SIZEOF_PIXEL]
    lea r2, [r7 + 24*SIZEOF_PIXEL]
    call pixel_satd_8x8_internal2
    pxor m7, m7
    movhlps m7, m6
    paddd m6, m7
    pshufd m7, m6, 1
    paddd m6, m7
    movd eax, m6
    RET
%else
cglobal pixel_satd_32x8, 4,7,8,0-gprsize ;if !WIN64
    SATD_START_SSE2 m6, m7
    mov r6, r0
    mov [rsp], r2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 8*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 8*SIZEOF_PIXEL
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 16*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 16*SIZEOF_PIXEL
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 24*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 24*SIZEOF_PIXEL
    call pixel_satd_8x8_internal2
    pxor m7, m7
    movhlps m7, m6
    paddd m6, m7
    pshufd m7, m6, 1
    paddd m6, m7
    movd eax, m6
    RET
%endif
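
;-----------------------------------------------------------------------------
; In this AVX group the WIN64 variants keep the saved source pointer in r7,
; while the non-WIN64 variants request only seven GPRs and instead spill it
; to the stack slot reserved by the 0-gprsize stack parameter, reloading and
; re-offsetting it ([rsp] plus N*SIZEOF_PIXEL) before each 8-wide column
; strip.
;-----------------------------------------------------------------------------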
1271
1272%if WIN64
1273cglobal pixel_satd_32x16, 4,8,14 ;if WIN64 && cpuflag(avx)
1274 SATD_START_SSE2 m6, m7
1275 mov r6, r0
1276 mov r7, r2
1277 call pixel_satd_8x8_internal2
1278 call pixel_satd_8x8_internal2
1279 lea r0, [r6 + 8*SIZEOF_PIXEL]
1280 lea r2, [r7 + 8*SIZEOF_PIXEL]
1281 call pixel_satd_8x8_internal2
1282 call pixel_satd_8x8_internal2
1283 lea r0, [r6 + 16*SIZEOF_PIXEL]
1284 lea r2, [r7 + 16*SIZEOF_PIXEL]
1285 call pixel_satd_8x8_internal2
1286 call pixel_satd_8x8_internal2
1287 lea r0, [r6 + 24*SIZEOF_PIXEL]
1288 lea r2, [r7 + 24*SIZEOF_PIXEL]
1289 call pixel_satd_8x8_internal2
1290 call pixel_satd_8x8_internal2
1291 pxor m7, m7
1292 movhlps m7, m6
1293 paddd m6, m7
1294 pshufd m7, m6, 1
1295 paddd m6, m7
1296 movd eax, m6
1297 RET
1298%else
1299cglobal pixel_satd_32x16, 4,7,8,0-gprsize ;if !WIN64
1300 SATD_START_SSE2 m6, m7
1301 mov r6, r0
1302 mov [rsp], r2
1303 call pixel_satd_8x8_internal2
1304 call pixel_satd_8x8_internal2
1305 lea r0, [r6 + 8*SIZEOF_PIXEL]
1306 mov r2, [rsp]
1307 add r2, 8*SIZEOF_PIXEL
1308 call pixel_satd_8x8_internal2
1309 call pixel_satd_8x8_internal2
1310 lea r0, [r6 + 16*SIZEOF_PIXEL]
1311 mov r2, [rsp]
1312 add r2, 16*SIZEOF_PIXEL
1313 call pixel_satd_8x8_internal2
1314 call pixel_satd_8x8_internal2
1315 lea r0, [r6 + 24*SIZEOF_PIXEL]
1316 mov r2, [rsp]
1317 add r2, 24*SIZEOF_PIXEL
1318 call pixel_satd_8x8_internal2
1319 call pixel_satd_8x8_internal2
1320 pxor m7, m7
1321 movhlps m7, m6
1322 paddd m6, m7
1323 pshufd m7, m6, 1
1324 paddd m6, m7
1325 movd eax, m6
1326 RET
1327%endif
1328
1329%if WIN64
1330cglobal pixel_satd_32x24, 4,8,14 ;if WIN64 && cpuflag(avx)
1331 SATD_START_SSE2 m6, m7
1332 mov r6, r0
1333 mov r7, r2
1334 call pixel_satd_8x8_internal2
1335 call pixel_satd_8x8_internal2
1336 call pixel_satd_8x8_internal2
1337 lea r0, [r6 + 8*SIZEOF_PIXEL]
1338 lea r2, [r7 + 8*SIZEOF_PIXEL]
1339 call pixel_satd_8x8_internal2
1340 call pixel_satd_8x8_internal2
1341 call pixel_satd_8x8_internal2
1342 lea r0, [r6 + 16*SIZEOF_PIXEL]
1343 lea r2, [r7 + 16*SIZEOF_PIXEL]
1344 call pixel_satd_8x8_internal2
1345 call pixel_satd_8x8_internal2
1346 call pixel_satd_8x8_internal2
1347 lea r0, [r6 + 24*SIZEOF_PIXEL]
1348 lea r2, [r7 + 24*SIZEOF_PIXEL]
1349 call pixel_satd_8x8_internal2
1350 call pixel_satd_8x8_internal2
1351 call pixel_satd_8x8_internal2
1352 pxor m7, m7
1353 movhlps m7, m6
1354 paddd m6, m7
1355 pshufd m7, m6, 1
1356 paddd m6, m7
1357 movd eax, m6
1358 RET
1359%else
1360cglobal pixel_satd_32x24, 4,7,8,0-gprsize ;if !WIN64
1361 SATD_START_SSE2 m6, m7
1362 mov r6, r0
1363 mov [rsp], r2
1364 call pixel_satd_8x8_internal2
1365 call pixel_satd_8x8_internal2
1366 call pixel_satd_8x8_internal2
1367 lea r0, [r6 + 8*SIZEOF_PIXEL]
1368 mov r2, [rsp]
1369 add r2, 8*SIZEOF_PIXEL
1370 call pixel_satd_8x8_internal2
1371 call pixel_satd_8x8_internal2
1372 call pixel_satd_8x8_internal2
1373 lea r0, [r6 + 16*SIZEOF_PIXEL]
1374 mov r2, [rsp]
1375 add r2, 16*SIZEOF_PIXEL
1376 call pixel_satd_8x8_internal2
1377 call pixel_satd_8x8_internal2
1378 call pixel_satd_8x8_internal2
1379 lea r0, [r6 + 24*SIZEOF_PIXEL]
1380 mov r2, [rsp]
1381 add r2, 24*SIZEOF_PIXEL
1382 call pixel_satd_8x8_internal2
1383 call pixel_satd_8x8_internal2
1384 call pixel_satd_8x8_internal2
1385 pxor m7, m7
1386 movhlps m7, m6
1387 paddd m6, m7
1388 pshufd m7, m6, 1
1389 paddd m6, m7
1390 movd eax, m6
1391 RET
1392%endif
1393
1394%if WIN64
1395cglobal pixel_satd_32x32, 4,8,14 ;if WIN64 && cpuflag(avx)
1396 SATD_START_SSE2 m6, m7
1397 mov r6, r0
1398 mov r7, r2
1399 call pixel_satd_8x8_internal2
1400 call pixel_satd_8x8_internal2
1401 call pixel_satd_8x8_internal2
1402 call pixel_satd_8x8_internal2
1403 lea r0, [r6 + 8*SIZEOF_PIXEL]
1404 lea r2, [r7 + 8*SIZEOF_PIXEL]
1405 call pixel_satd_8x8_internal2
1406 call pixel_satd_8x8_internal2
1407 call pixel_satd_8x8_internal2
1408 call pixel_satd_8x8_internal2
1409 lea r0, [r6 + 16*SIZEOF_PIXEL]
1410 lea r2, [r7 + 16*SIZEOF_PIXEL]
1411 call pixel_satd_8x8_internal2
1412 call pixel_satd_8x8_internal2
1413 call pixel_satd_8x8_internal2
1414 call pixel_satd_8x8_internal2
1415 lea r0, [r6 + 24*SIZEOF_PIXEL]
1416 lea r2, [r7 + 24*SIZEOF_PIXEL]
1417 call pixel_satd_8x8_internal2
1418 call pixel_satd_8x8_internal2
1419 call pixel_satd_8x8_internal2
1420 call pixel_satd_8x8_internal2
1421 pxor m7, m7
1422 movhlps m7, m6
1423 paddd m6, m7
1424 pshufd m7, m6, 1
1425 paddd m6, m7
1426 movd eax, m6
1427 RET
1428%else
1429cglobal pixel_satd_32x32, 4,7,8,0-gprsize ;if !WIN64
1430 SATD_START_SSE2 m6, m7
1431 mov r6, r0
1432 mov [rsp], r2
1433 call pixel_satd_8x8_internal2
1434 call pixel_satd_8x8_internal2
1435 call pixel_satd_8x8_internal2
1436 call pixel_satd_8x8_internal2
1437 lea r0, [r6 + 8*SIZEOF_PIXEL]
1438 mov r2, [rsp]
1439 add r2, 8*SIZEOF_PIXEL
1440 call pixel_satd_8x8_internal2
1441 call pixel_satd_8x8_internal2
1442 call pixel_satd_8x8_internal2
1443 call pixel_satd_8x8_internal2
1444 lea r0, [r6 + 16*SIZEOF_PIXEL]
1445 mov r2, [rsp]
1446 add r2, 16*SIZEOF_PIXEL
1447 call pixel_satd_8x8_internal2
1448 call pixel_satd_8x8_internal2
1449 call pixel_satd_8x8_internal2
1450 call pixel_satd_8x8_internal2
1451 lea r0, [r6 + 24*SIZEOF_PIXEL]
1452 mov r2, [rsp]
1453 add r2, 24*SIZEOF_PIXEL
1454 call pixel_satd_8x8_internal2
1455 call pixel_satd_8x8_internal2
1456 call pixel_satd_8x8_internal2
1457 call pixel_satd_8x8_internal2
1458 pxor m7, m7
1459 movhlps m7, m6
1460 paddd m6, m7
1461 pshufd m7, m6, 1
1462 paddd m6, m7
1463 movd eax, m6
1464 RET
1465%endif
1466
1467%if WIN64
1468cglobal pixel_satd_32x64, 4,8,14 ;if WIN64 && cpuflag(avx)
1469 SATD_START_SSE2 m6, m7
1470 mov r6, r0
1471 mov r7, r2
1472 call pixel_satd_8x8_internal2
1473 call pixel_satd_8x8_internal2
1474 call pixel_satd_8x8_internal2
1475 call pixel_satd_8x8_internal2
1476 call pixel_satd_8x8_internal2
1477 call pixel_satd_8x8_internal2
1478 call pixel_satd_8x8_internal2
1479 call pixel_satd_8x8_internal2
1480 lea r0, [r6 + 8*SIZEOF_PIXEL]
1481 lea r2, [r7 + 8*SIZEOF_PIXEL]
1482 call pixel_satd_8x8_internal2
1483 call pixel_satd_8x8_internal2
1484 call pixel_satd_8x8_internal2
1485 call pixel_satd_8x8_internal2
1486 call pixel_satd_8x8_internal2
1487 call pixel_satd_8x8_internal2
1488 call pixel_satd_8x8_internal2
1489 call pixel_satd_8x8_internal2
1490 lea r0, [r6 + 16*SIZEOF_PIXEL]
1491 lea r2, [r7 + 16*SIZEOF_PIXEL]
1492 call pixel_satd_8x8_internal2
1493 call pixel_satd_8x8_internal2
1494 call pixel_satd_8x8_internal2
1495 call pixel_satd_8x8_internal2
1496 call pixel_satd_8x8_internal2
1497 call pixel_satd_8x8_internal2
1498 call pixel_satd_8x8_internal2
1499 call pixel_satd_8x8_internal2
1500 lea r0, [r6 + 24*SIZEOF_PIXEL]
1501 lea r2, [r7 + 24*SIZEOF_PIXEL]
1502 call pixel_satd_8x8_internal2
1503 call pixel_satd_8x8_internal2
1504 call pixel_satd_8x8_internal2
1505 call pixel_satd_8x8_internal2
1506 call pixel_satd_8x8_internal2
1507 call pixel_satd_8x8_internal2
1508 call pixel_satd_8x8_internal2
1509 call pixel_satd_8x8_internal2
1510 pxor m7, m7
1511 movhlps m7, m6
1512 paddd m6, m7
1513 pshufd m7, m6, 1
1514 paddd m6, m7
1515 movd eax, m6
1516 RET
1517%else
1518cglobal pixel_satd_32x64, 4,7,8,0-gprsize ;if !WIN64
1519 SATD_START_SSE2 m6, m7
1520 mov r6, r0
1521 mov [rsp], r2
1522 call pixel_satd_8x8_internal2
1523 call pixel_satd_8x8_internal2
1524 call pixel_satd_8x8_internal2
1525 call pixel_satd_8x8_internal2
1526 call pixel_satd_8x8_internal2
1527 call pixel_satd_8x8_internal2
1528 call pixel_satd_8x8_internal2
1529 call pixel_satd_8x8_internal2
1530 lea r0, [r6 + 8*SIZEOF_PIXEL]
1531 mov r2, [rsp]
1532 add r2, 8*SIZEOF_PIXEL
1533 call pixel_satd_8x8_internal2
1534 call pixel_satd_8x8_internal2
1535 call pixel_satd_8x8_internal2
1536 call pixel_satd_8x8_internal2
1537 call pixel_satd_8x8_internal2
1538 call pixel_satd_8x8_internal2
1539 call pixel_satd_8x8_internal2
1540 call pixel_satd_8x8_internal2
1541 lea r0, [r6 + 16*SIZEOF_PIXEL]
1542 mov r2, [rsp]
1543 add r2, 16*SIZEOF_PIXEL
1544 call pixel_satd_8x8_internal2
1545 call pixel_satd_8x8_internal2
1546 call pixel_satd_8x8_internal2
1547 call pixel_satd_8x8_internal2
1548 call pixel_satd_8x8_internal2
1549 call pixel_satd_8x8_internal2
1550 call pixel_satd_8x8_internal2
1551 call pixel_satd_8x8_internal2
1552 lea r0, [r6 + 24*SIZEOF_PIXEL]
1553 mov r2, [rsp]
1554 add r2, 24*SIZEOF_PIXEL
1555 call pixel_satd_8x8_internal2
1556 call pixel_satd_8x8_internal2
1557 call pixel_satd_8x8_internal2
1558 call pixel_satd_8x8_internal2
1559 call pixel_satd_8x8_internal2
1560 call pixel_satd_8x8_internal2
1561 call pixel_satd_8x8_internal2
1562 call pixel_satd_8x8_internal2
1563 pxor m7, m7
1564 movhlps m7, m6
1565 paddd m6, m7
1566 pshufd m7, m6, 1
1567 paddd m6, m7
1568 movd eax, m6
1569 RET
1570%endif
1571
1572%if WIN64
1573cglobal pixel_satd_48x64, 4,8,14 ;if WIN64 && cpuflag(avx)
1574 SATD_START_SSE2 m6, m7
1575 mov r6, r0
1576 mov r7, r2
1577 call pixel_satd_8x8_internal2
1578 call pixel_satd_8x8_internal2
1579 call pixel_satd_8x8_internal2
1580 call pixel_satd_8x8_internal2
1581 call pixel_satd_8x8_internal2
1582 call pixel_satd_8x8_internal2
1583 call pixel_satd_8x8_internal2
1584 call pixel_satd_8x8_internal2
1585 lea r0, [r6 + 8*SIZEOF_PIXEL]
1586 lea r2, [r7 + 8*SIZEOF_PIXEL]
1587 call pixel_satd_8x8_internal2
1588 call pixel_satd_8x8_internal2
1589 call pixel_satd_8x8_internal2
1590 call pixel_satd_8x8_internal2
1591 call pixel_satd_8x8_internal2
1592 call pixel_satd_8x8_internal2
1593 call pixel_satd_8x8_internal2
1594 call pixel_satd_8x8_internal2
1595 lea r0, [r6 + 16*SIZEOF_PIXEL]
1596 lea r2, [r7 + 16*SIZEOF_PIXEL]
1597 call pixel_satd_8x8_internal2
1598 call pixel_satd_8x8_internal2
1599 call pixel_satd_8x8_internal2
1600 call pixel_satd_8x8_internal2
1601 call pixel_satd_8x8_internal2
1602 call pixel_satd_8x8_internal2
1603 call pixel_satd_8x8_internal2
1604 call pixel_satd_8x8_internal2
1605 lea r0, [r6 + 24*SIZEOF_PIXEL]
1606 lea r2, [r7 + 24*SIZEOF_PIXEL]
1607 call pixel_satd_8x8_internal2
1608 call pixel_satd_8x8_internal2
1609 call pixel_satd_8x8_internal2
1610 call pixel_satd_8x8_internal2
1611 call pixel_satd_8x8_internal2
1612 call pixel_satd_8x8_internal2
1613 call pixel_satd_8x8_internal2
1614 call pixel_satd_8x8_internal2
1615 lea r0, [r6 + 32*SIZEOF_PIXEL]
1616 lea r2, [r7 + 32*SIZEOF_PIXEL]
1617 call pixel_satd_8x8_internal2
1618 call pixel_satd_8x8_internal2
1619 call pixel_satd_8x8_internal2
1620 call pixel_satd_8x8_internal2
1621 call pixel_satd_8x8_internal2
1622 call pixel_satd_8x8_internal2
1623 call pixel_satd_8x8_internal2
1624 call pixel_satd_8x8_internal2
1625 lea r0, [r6 + 40*SIZEOF_PIXEL]
1626 lea r2, [r7 + 40*SIZEOF_PIXEL]
1627 call pixel_satd_8x8_internal2
1628 call pixel_satd_8x8_internal2
1629 call pixel_satd_8x8_internal2
1630 call pixel_satd_8x8_internal2
1631 call pixel_satd_8x8_internal2
1632 call pixel_satd_8x8_internal2
1633 call pixel_satd_8x8_internal2
1634 call pixel_satd_8x8_internal2
1635 pxor m7, m7
1636 movhlps m7, m6
1637 paddd m6, m7
1638 pshufd m7, m6, 1
1639 paddd m6, m7
1640 movd eax, m6
1641 RET
1642%else
1643cglobal pixel_satd_48x64, 4,7,8,0-gprsize ;if !WIN64
1644 SATD_START_SSE2 m6, m7
1645 mov r6, r0
1646 mov [rsp], r2
1647 call pixel_satd_8x8_internal2
1648 call pixel_satd_8x8_internal2
1649 call pixel_satd_8x8_internal2
1650 call pixel_satd_8x8_internal2
1651 call pixel_satd_8x8_internal2
1652 call pixel_satd_8x8_internal2
1653 call pixel_satd_8x8_internal2
1654 call pixel_satd_8x8_internal2
1655 lea r0, [r6 + 8*SIZEOF_PIXEL]
1656 mov r2, [rsp]
1657 add r2,8*SIZEOF_PIXEL
1658 call pixel_satd_8x8_internal2
1659 call pixel_satd_8x8_internal2
1660 call pixel_satd_8x8_internal2
1661 call pixel_satd_8x8_internal2
1662 call pixel_satd_8x8_internal2
1663 call pixel_satd_8x8_internal2
1664 call pixel_satd_8x8_internal2
1665 call pixel_satd_8x8_internal2
1666 lea r0, [r6 + 16*SIZEOF_PIXEL]
1667 mov r2, [rsp]
1668 add r2,16*SIZEOF_PIXEL
1669 call pixel_satd_8x8_internal2
1670 call pixel_satd_8x8_internal2
1671 call pixel_satd_8x8_internal2
1672 call pixel_satd_8x8_internal2
1673 call pixel_satd_8x8_internal2
1674 call pixel_satd_8x8_internal2
1675 call pixel_satd_8x8_internal2
1676 call pixel_satd_8x8_internal2
1677 lea r0, [r6 + 24*SIZEOF_PIXEL]
1678 mov r2, [rsp]
1679 add r2,24*SIZEOF_PIXEL
1680 call pixel_satd_8x8_internal2
1681 call pixel_satd_8x8_internal2
1682 call pixel_satd_8x8_internal2
1683 call pixel_satd_8x8_internal2
1684 call pixel_satd_8x8_internal2
1685 call pixel_satd_8x8_internal2
1686 call pixel_satd_8x8_internal2
1687 call pixel_satd_8x8_internal2
1688 lea r0, [r6 + 32*SIZEOF_PIXEL]
1689 mov r2, [rsp]
1690 add r2,32*SIZEOF_PIXEL
1691 call pixel_satd_8x8_internal2
1692 call pixel_satd_8x8_internal2
1693 call pixel_satd_8x8_internal2
1694 call pixel_satd_8x8_internal2
1695 call pixel_satd_8x8_internal2
1696 call pixel_satd_8x8_internal2
1697 call pixel_satd_8x8_internal2
1698 call pixel_satd_8x8_internal2
1699 lea r0, [r6 + 40*SIZEOF_PIXEL]
1700 mov r2, [rsp]
1701 add r2,40*SIZEOF_PIXEL
1702 call pixel_satd_8x8_internal2
1703 call pixel_satd_8x8_internal2
1704 call pixel_satd_8x8_internal2
1705 call pixel_satd_8x8_internal2
1706 call pixel_satd_8x8_internal2
1707 call pixel_satd_8x8_internal2
1708 call pixel_satd_8x8_internal2
1709 call pixel_satd_8x8_internal2
1710 pxor m7, m7
1711 movhlps m7, m6
1712 paddd m6, m7
1713 pshufd m7, m6, 1
1714 paddd m6, m7
1715 movd eax, m6
1716 RET
1717%endif
1718
1719
1720%if WIN64
1721cglobal pixel_satd_64x16, 4,8,14 ;if WIN64 && cpuflag(avx)
1722 SATD_START_SSE2 m6, m7
1723 mov r6, r0
1724 mov r7, r2
1725 call pixel_satd_8x8_internal2
1726 call pixel_satd_8x8_internal2
1727 lea r0, [r6 + 8*SIZEOF_PIXEL]
1728 lea r2, [r7 + 8*SIZEOF_PIXEL]
1729 call pixel_satd_8x8_internal2
1730 call pixel_satd_8x8_internal2
1731 lea r0, [r6 + 16*SIZEOF_PIXEL]
1732 lea r2, [r7 + 16*SIZEOF_PIXEL]
1733 call pixel_satd_8x8_internal2
1734 call pixel_satd_8x8_internal2
1735 lea r0, [r6 + 24*SIZEOF_PIXEL]
1736 lea r2, [r7 + 24*SIZEOF_PIXEL]
1737 call pixel_satd_8x8_internal2
1738 call pixel_satd_8x8_internal2
1739 lea r0, [r6 + 32*SIZEOF_PIXEL]
1740 lea r2, [r7 + 32*SIZEOF_PIXEL]
1741 call pixel_satd_8x8_internal2
1742 call pixel_satd_8x8_internal2
1743 lea r0, [r6 + 40*SIZEOF_PIXEL]
1744 lea r2, [r7 + 40*SIZEOF_PIXEL]
1745 call pixel_satd_8x8_internal2
1746 call pixel_satd_8x8_internal2
1747 lea r0, [r6 + 48*SIZEOF_PIXEL]
1748 lea r2, [r7 + 48*SIZEOF_PIXEL]
1749 call pixel_satd_8x8_internal2
1750 call pixel_satd_8x8_internal2
1751 lea r0, [r6 + 56*SIZEOF_PIXEL]
1752 lea r2, [r7 + 56*SIZEOF_PIXEL]
1753 call pixel_satd_8x8_internal2
1754 call pixel_satd_8x8_internal2
1755 pxor m7, m7
1756 movhlps m7, m6
1757 paddd m6, m7
1758 pshufd m7, m6, 1
1759 paddd m6, m7
1760 movd eax, m6
1761 RET
1762%else
1763cglobal pixel_satd_64x16, 4,7,8,0-gprsize ;if !WIN64
1764 SATD_START_SSE2 m6, m7
1765 mov r6, r0
1766 mov [rsp], r2
1767 call pixel_satd_8x8_internal2
1768 call pixel_satd_8x8_internal2
1769 lea r0, [r6 + 8*SIZEOF_PIXEL]
1770 mov r2, [rsp]
1771 add r2,8*SIZEOF_PIXEL
1772 call pixel_satd_8x8_internal2
1773 call pixel_satd_8x8_internal2
1774 lea r0, [r6 + 16*SIZEOF_PIXEL]
1775 mov r2, [rsp]
1776 add r2,16*SIZEOF_PIXEL
1777 call pixel_satd_8x8_internal2
1778 call pixel_satd_8x8_internal2
1779 lea r0, [r6 + 24*SIZEOF_PIXEL]
1780 mov r2, [rsp]
1781 add r2,24*SIZEOF_PIXEL
1782 call pixel_satd_8x8_internal2
1783 call pixel_satd_8x8_internal2
1784 lea r0, [r6 + 32*SIZEOF_PIXEL]
1785 mov r2, [rsp]
1786 add r2,32*SIZEOF_PIXEL
1787 call pixel_satd_8x8_internal2
1788 call pixel_satd_8x8_internal2
1789 lea r0, [r6 + 40*SIZEOF_PIXEL]
1790 mov r2, [rsp]
1791 add r2,40*SIZEOF_PIXEL
1792 call pixel_satd_8x8_internal2
1793 call pixel_satd_8x8_internal2
1794 lea r0, [r6 + 48*SIZEOF_PIXEL]
1795 mov r2, [rsp]
1796 add r2,48*SIZEOF_PIXEL
1797 call pixel_satd_8x8_internal2
1798 call pixel_satd_8x8_internal2
1799 lea r0, [r6 + 56*SIZEOF_PIXEL]
1800 mov r2, [rsp]
1801 add r2,56*SIZEOF_PIXEL
1802 call pixel_satd_8x8_internal2
1803 call pixel_satd_8x8_internal2
1804 pxor m7, m7
1805 movhlps m7, m6
1806 paddd m6, m7
1807 pshufd m7, m6, 1
1808 paddd m6, m7
1809 movd eax, m6
1810 RET
1811%endif
1812
1813%if WIN64
1814cglobal pixel_satd_64x32, 4,8,14 ;if WIN64 && cpuflag(avx)
1815 SATD_START_SSE2 m6, m7
1816 mov r6, r0
1817 mov r7, r2
1818 call pixel_satd_8x8_internal2
1819 call pixel_satd_8x8_internal2
1820 call pixel_satd_8x8_internal2
1821 call pixel_satd_8x8_internal2
1822 lea r0, [r6 + 8*SIZEOF_PIXEL]
1823 lea r2, [r7 + 8*SIZEOF_PIXEL]
1824 call pixel_satd_8x8_internal2
1825 call pixel_satd_8x8_internal2
1826 call pixel_satd_8x8_internal2
1827 call pixel_satd_8x8_internal2
1828 lea r0, [r6 + 16*SIZEOF_PIXEL]
1829 lea r2, [r7 + 16*SIZEOF_PIXEL]
1830 call pixel_satd_8x8_internal2
1831 call pixel_satd_8x8_internal2
1832 call pixel_satd_8x8_internal2
1833 call pixel_satd_8x8_internal2
1834 lea r0, [r6 + 24*SIZEOF_PIXEL]
1835 lea r2, [r7 + 24*SIZEOF_PIXEL]
1836 call pixel_satd_8x8_internal2
1837 call pixel_satd_8x8_internal2
1838 call pixel_satd_8x8_internal2
1839 call pixel_satd_8x8_internal2
1840 lea r0, [r6 + 32*SIZEOF_PIXEL]
1841 lea r2, [r7 + 32*SIZEOF_PIXEL]
1842 call pixel_satd_8x8_internal2
1843 call pixel_satd_8x8_internal2
1844 call pixel_satd_8x8_internal2
1845 call pixel_satd_8x8_internal2
1846 lea r0, [r6 + 40*SIZEOF_PIXEL]
1847 lea r2, [r7 + 40*SIZEOF_PIXEL]
1848 call pixel_satd_8x8_internal2
1849 call pixel_satd_8x8_internal2
1850 call pixel_satd_8x8_internal2
1851 call pixel_satd_8x8_internal2
1852 lea r0, [r6 + 48*SIZEOF_PIXEL]
1853 lea r2, [r7 + 48*SIZEOF_PIXEL]
1854 call pixel_satd_8x8_internal2
1855 call pixel_satd_8x8_internal2
1856 call pixel_satd_8x8_internal2
1857 call pixel_satd_8x8_internal2
1858 lea r0, [r6 + 56*SIZEOF_PIXEL]
1859 lea r2, [r7 + 56*SIZEOF_PIXEL]
1860 call pixel_satd_8x8_internal2
1861 call pixel_satd_8x8_internal2
1862 call pixel_satd_8x8_internal2
1863 call pixel_satd_8x8_internal2
1864 pxor m7, m7
1865 movhlps m7, m6
1866 paddd m6, m7
1867 pshufd m7, m6, 1
1868 paddd m6, m7
1869 movd eax, m6
1870 RET
1871%else
1872cglobal pixel_satd_64x32, 4,7,8,0-gprsize ;if !WIN64
1873 SATD_START_SSE2 m6, m7
1874 mov r6, r0
1875 mov [rsp], r2
1876 call pixel_satd_8x8_internal2
1877 call pixel_satd_8x8_internal2
1878 call pixel_satd_8x8_internal2
1879 call pixel_satd_8x8_internal2
1880 lea r0, [r6 + 8*SIZEOF_PIXEL]
1881 mov r2, [rsp]
1882 add r2, 8*SIZEOF_PIXEL
1883 call pixel_satd_8x8_internal2
1884 call pixel_satd_8x8_internal2
1885 call pixel_satd_8x8_internal2
1886 call pixel_satd_8x8_internal2
1887 lea r0, [r6 + 16*SIZEOF_PIXEL]
1888 mov r2, [rsp]
1889 add r2, 16*SIZEOF_PIXEL
1890 call pixel_satd_8x8_internal2
1891 call pixel_satd_8x8_internal2
1892 call pixel_satd_8x8_internal2
1893 call pixel_satd_8x8_internal2
1894 lea r0, [r6 + 24*SIZEOF_PIXEL]
1895 mov r2, [rsp]
1896 add r2, 24*SIZEOF_PIXEL
1897 call pixel_satd_8x8_internal2
1898 call pixel_satd_8x8_internal2
1899 call pixel_satd_8x8_internal2
1900 call pixel_satd_8x8_internal2
1901 lea r0, [r6 + 32*SIZEOF_PIXEL]
1902 mov r2, [rsp]
1903 add r2, 32*SIZEOF_PIXEL
1904 call pixel_satd_8x8_internal2
1905 call pixel_satd_8x8_internal2
1906 call pixel_satd_8x8_internal2
1907 call pixel_satd_8x8_internal2
1908 lea r0, [r6 + 40*SIZEOF_PIXEL]
1909 mov r2, [rsp]
1910 add r2, 40*SIZEOF_PIXEL
1911 call pixel_satd_8x8_internal2
1912 call pixel_satd_8x8_internal2
1913 call pixel_satd_8x8_internal2
1914 call pixel_satd_8x8_internal2
1915 lea r0, [r6 + 48*SIZEOF_PIXEL]
1916 mov r2, [rsp]
1917 add r2, 48*SIZEOF_PIXEL
1918 call pixel_satd_8x8_internal2
1919 call pixel_satd_8x8_internal2
1920 call pixel_satd_8x8_internal2
1921 call pixel_satd_8x8_internal2
1922 lea r0, [r6 + 56*SIZEOF_PIXEL]
1923 mov r2, [rsp]
1924 add r2, 56*SIZEOF_PIXEL
1925 call pixel_satd_8x8_internal2
1926 call pixel_satd_8x8_internal2
1927 call pixel_satd_8x8_internal2
1928 call pixel_satd_8x8_internal2
1929 pxor m7, m7
1930 movhlps m7, m6
1931 paddd m6, m7
1932 pshufd m7, m6, 1
1933 paddd m6, m7
1934 movd eax, m6
1935 RET
1936%endif
1937
%if WIN64
cglobal pixel_satd_64x48, 4,8,14 ;if WIN64 && cpuflag(avx)
    SATD_START_SSE2 m6, m7
    mov r6, r0
    mov r7, r2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 8*SIZEOF_PIXEL]
    lea r2, [r7 + 8*SIZEOF_PIXEL]
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 16*SIZEOF_PIXEL]
    lea r2, [r7 + 16*SIZEOF_PIXEL]
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 24*SIZEOF_PIXEL]
    lea r2, [r7 + 24*SIZEOF_PIXEL]
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 32*SIZEOF_PIXEL]
    lea r2, [r7 + 32*SIZEOF_PIXEL]
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 40*SIZEOF_PIXEL]
    lea r2, [r7 + 40*SIZEOF_PIXEL]
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 48*SIZEOF_PIXEL]
    lea r2, [r7 + 48*SIZEOF_PIXEL]
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 56*SIZEOF_PIXEL]
    lea r2, [r7 + 56*SIZEOF_PIXEL]
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    pxor m8, m8
    movhlps m8, m6
    paddd m6, m8
    pshufd m8, m6, 1
    paddd m6, m8
    movd eax, m6
    RET
%else
cglobal pixel_satd_64x48, 4,7,8,0-gprsize ;if !WIN64
    SATD_START_SSE2 m6, m7
    mov r6, r0
    mov [rsp], r2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 8*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 8*SIZEOF_PIXEL
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 16*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 16*SIZEOF_PIXEL
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 24*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 24*SIZEOF_PIXEL
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 32*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 32*SIZEOF_PIXEL
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 40*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 40*SIZEOF_PIXEL
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 48*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 48*SIZEOF_PIXEL
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 56*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 56*SIZEOF_PIXEL
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    pxor m7, m7
    movhlps m7, m6
    paddd m6, m7
    pshufd m7, m6, 1
    paddd m6, m7
    movd eax, m6
    RET
%endif

%if WIN64
cglobal pixel_satd_64x64, 4,8,14 ;if WIN64 && cpuflag(avx)
    SATD_START_SSE2 m6, m7
    mov r6, r0
    mov r7, r2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 8*SIZEOF_PIXEL]
    lea r2, [r7 + 8*SIZEOF_PIXEL]
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 16*SIZEOF_PIXEL]
    lea r2, [r7 + 16*SIZEOF_PIXEL]
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 24*SIZEOF_PIXEL]
    lea r2, [r7 + 24*SIZEOF_PIXEL]
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 32*SIZEOF_PIXEL]
    lea r2, [r7 + 32*SIZEOF_PIXEL]
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 40*SIZEOF_PIXEL]
    lea r2, [r7 + 40*SIZEOF_PIXEL]
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 48*SIZEOF_PIXEL]
    lea r2, [r7 + 48*SIZEOF_PIXEL]
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 56*SIZEOF_PIXEL]
    lea r2, [r7 + 56*SIZEOF_PIXEL]
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    pxor m8, m8
    movhlps m8, m6
    paddd m6, m8
    pshufd m8, m6, 1
    paddd m6, m8
    movd eax, m6
    RET
%else
cglobal pixel_satd_64x64, 4,7,8,0-gprsize ;if !WIN64
    SATD_START_SSE2 m6, m7
    mov r6, r0
    mov [rsp], r2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 8*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 8*SIZEOF_PIXEL
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 16*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 16*SIZEOF_PIXEL
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 24*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 24*SIZEOF_PIXEL
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 32*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 32*SIZEOF_PIXEL
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 40*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 40*SIZEOF_PIXEL
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 48*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 48*SIZEOF_PIXEL
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 56*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 56*SIZEOF_PIXEL
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    pxor m7, m7
    movhlps m7, m6
    paddd m6, m7
    pshufd m7, m6, 1
    paddd m6, m7
    movd eax, m6
    RET
%endif

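;-----------------------------------------------------------------------------
; 16-wide SATD: BACKUP_POINTERS stashes the r0/r2 plane pointers and
; RESTORE_AND_INC_POINTERS restores them advanced to the second 8-pixel
; column, so both halves share the 8x8/8x4 internals. The %%-prefixed
; labels are local to the enclosing macro expansion.
;-----------------------------------------------------------------------------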
%if WIN64
cglobal pixel_satd_16x4, 4,6,14
%else
cglobal pixel_satd_16x4, 4,6,8
%endif
    SATD_START_SSE2 m6, m7
    BACKUP_POINTERS
    call %%pixel_satd_8x4_internal2
    RESTORE_AND_INC_POINTERS
    call %%pixel_satd_8x4_internal2
    pxor m7, m7
    movhlps m7, m6
    paddd m6, m7
    pshufd m7, m6, 1
    paddd m6, m7
    movd eax, m6
    RET

%if WIN64
cglobal pixel_satd_16x8, 4,6,14
%else
cglobal pixel_satd_16x8, 4,6,8
%endif
    SATD_START_SSE2 m6, m7
    BACKUP_POINTERS
    call pixel_satd_8x8_internal2
    RESTORE_AND_INC_POINTERS
    call pixel_satd_8x8_internal2
    pxor m7, m7
    movhlps m7, m6
    paddd m6, m7
    pshufd m7, m6, 1
    paddd m6, m7
    movd eax, m6
    RET

%if WIN64
cglobal pixel_satd_16x12, 4,6,14
%else
cglobal pixel_satd_16x12, 4,6,8
%endif
    SATD_START_SSE2 m6, m7, 1
    BACKUP_POINTERS
    call pixel_satd_8x8_internal2
    call %%pixel_satd_8x4_internal2
    RESTORE_AND_INC_POINTERS
    call pixel_satd_8x8_internal2
    call %%pixel_satd_8x4_internal2
    pxor m7, m7
    movhlps m7, m6
    paddd m6, m7
    pshufd m7, m6, 1
    paddd m6, m7
    movd eax, m6
    RET

%if WIN64
cglobal pixel_satd_16x16, 4,6,14
%else
cglobal pixel_satd_16x16, 4,6,8
%endif
    SATD_START_SSE2 m6, m7, 1
    BACKUP_POINTERS
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    RESTORE_AND_INC_POINTERS
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    pxor m7, m7
    movhlps m7, m6
    paddd m6, m7
    pshufd m7, m6, 1
    paddd m6, m7
    movd eax, m6
    RET

%if WIN64
cglobal pixel_satd_16x32, 4,6,14
%else
cglobal pixel_satd_16x32, 4,6,8
%endif
    SATD_START_SSE2 m6, m7, 1
    BACKUP_POINTERS
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    RESTORE_AND_INC_POINTERS
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    pxor m7, m7
    movhlps m7, m6
    paddd m6, m7
    pshufd m7, m6, 1
    paddd m6, m7
    movd eax, m6
    RET

%if WIN64
cglobal pixel_satd_16x64, 4,6,14
%else
cglobal pixel_satd_16x64, 4,6,8
%endif
    SATD_START_SSE2 m6, m7, 1
    BACKUP_POINTERS
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    RESTORE_AND_INC_POINTERS
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    pxor m7, m7
    movhlps m7, m6
    paddd m6, m7
    pshufd m7, m6, 1
    paddd m6, m7
    movd eax, m6
    RET
%endif

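;-----------------------------------------------------------------------------
; 12x16 is scored as three 4-pixel-wide columns, two 4x8 SATD blocks per
; column; r6 and r7 (or the [rsp] spill) keep the column base pointers.
;-----------------------------------------------------------------------------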
%if HIGH_BIT_DEPTH
%if WIN64
cglobal pixel_satd_12x16, 4,8,8
    SATD_START_MMX
    mov r6, r0
    mov r7, r2
    pxor m7, m7
    SATD_4x8_SSE vertical, 0, 4, 5
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, 4, 5
    lea r0, [r6 + 4*SIZEOF_PIXEL]
    lea r2, [r7 + 4*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, 4, 5
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, 4, 5
    lea r0, [r6 + 8*SIZEOF_PIXEL]
    lea r2, [r7 + 8*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, 4, 5
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, 4, 5
    pxor m1, m1
    movhlps m1, m7
    paddd m7, m1
    pshufd m1, m7, 1
    paddd m7, m1
    movd eax, m7
    RET
%else
cglobal pixel_satd_12x16, 4,7,8,0-gprsize
    SATD_START_MMX
    mov r6, r0
    mov [rsp], r2
    pxor m7, m7
    SATD_4x8_SSE vertical, 0, 4, 5
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, 4, 5
    lea r0, [r6 + 4*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 4*SIZEOF_PIXEL
    SATD_4x8_SSE vertical, 1, 4, 5
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, 4, 5
    lea r0, [r6 + 8*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 8*SIZEOF_PIXEL
    SATD_4x8_SSE vertical, 1, 4, 5
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, 4, 5
    pxor m1, m1
    movhlps m1, m7
    paddd m7, m1
    pshufd m1, m7, 1
    paddd m7, m1
    movd eax, m7
    RET
%endif
%else ;HIGH_BIT_DEPTH
%if WIN64
cglobal pixel_satd_12x16, 4,8,8
    SATD_START_MMX
    mov r6, r0
    mov r7, r2
%if vertical==0
    mova m7, [hmul_4p]
%endif
    SATD_4x8_SSE vertical, 0, swap
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, add
    lea r0, [r6 + 4*SIZEOF_PIXEL]
    lea r2, [r7 + 4*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, add
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, add
    lea r0, [r6 + 8*SIZEOF_PIXEL]
    lea r2, [r7 + 8*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, add
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, add
    HADDW m7, m1
    movd eax, m7
    RET
%else
cglobal pixel_satd_12x16, 4,7,8,0-gprsize
    SATD_START_MMX
    mov r6, r0
    mov [rsp], r2
%if vertical==0
    mova m7, [hmul_4p]
%endif
    SATD_4x8_SSE vertical, 0, swap
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, add
    lea r0, [r6 + 4*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 4*SIZEOF_PIXEL
    SATD_4x8_SSE vertical, 1, add
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, add
    lea r0, [r6 + 8*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 8*SIZEOF_PIXEL
    SATD_4x8_SSE vertical, 1, add
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, add
    HADDW m7, m1
    movd eax, m7
    RET
%endif
%endif

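;-----------------------------------------------------------------------------
; 24x32 = three 8-wide columns, four 8x8 blocks down each column.
;-----------------------------------------------------------------------------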
%if WIN64
cglobal pixel_satd_24x32, 4,8,14
    SATD_START_SSE2 m6, m7
    mov r6, r0
    mov r7, r2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 8*SIZEOF_PIXEL]
    lea r2, [r7 + 8*SIZEOF_PIXEL]
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 16*SIZEOF_PIXEL]
    lea r2, [r7 + 16*SIZEOF_PIXEL]
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    pxor m7, m7
    movhlps m7, m6
    paddd m6, m7
    pshufd m7, m6, 1
    paddd m6, m7
    movd eax, m6
    RET
%else
cglobal pixel_satd_24x32, 4,7,8,0-gprsize
    SATD_START_SSE2 m6, m7
    mov r6, r0
    mov [rsp], r2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 8*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 8*SIZEOF_PIXEL
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 16*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 16*SIZEOF_PIXEL
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    pxor m7, m7
    movhlps m7, m6
    paddd m6, m7
    pshufd m7, m6, 1
    paddd m6, m7
    movd eax, m6
    RET
%endif ;WIN64

%if WIN64
cglobal pixel_satd_8x32, 4,6,14
%else
cglobal pixel_satd_8x32, 4,6,8
%endif
    SATD_START_SSE2 m6, m7
%if vertical
    mova m7, [pw_00ff]
%endif
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    pxor m7, m7
    movhlps m7, m6
    paddd m6, m7
    pshufd m7, m6, 1
    paddd m6, m7
    movd eax, m6
    RET

%if WIN64
cglobal pixel_satd_8x16, 4,6,14
%else
cglobal pixel_satd_8x16, 4,6,8
%endif
    SATD_START_SSE2 m6, m7
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    pxor m7, m7
    movhlps m7, m6
    paddd m6, m7
    pshufd m7, m6, 1
    paddd m6, m7
    movd eax, m6
    RET

cglobal pixel_satd_8x8, 4,6,8
    SATD_START_SSE2 m6, m7
    call pixel_satd_8x8_internal
    SATD_END_SSE2 m6

%if WIN64
cglobal pixel_satd_8x4, 4,6,14
%else
cglobal pixel_satd_8x4, 4,6,8
%endif
    SATD_START_SSE2 m6, m7
    call %%pixel_satd_8x4_internal2
    SATD_END_SSE2 m6
%endmacro ; SATDS_SSE2


;=============================================================================
; SA8D
;=============================================================================

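; SA8D scores a residual block with an 8x8 (rather than 4x4) Hadamard
; transform. SA8D_INTER merges the running total with the newest 8x8 result:
; dword adds once HIGH_BIT_DEPTH has already widened the sums, saturating
; word adds otherwise; the total lives in m10 on x86-64, [esp+48] on x86-32.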
%macro SA8D_INTER 0
%if ARCH_X86_64
    %define lh m10
    %define rh m0
%else
    %define lh m0
    %define rh [esp+48]
%endif
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
    paddd lh, rh
%else
    paddusw lh, rh
%endif ; HIGH_BIT_DEPTH
%endmacro

%macro SA8D_8x8 0
    call pixel_sa8d_8x8_internal
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%else
    HADDW m0, m1
%endif ; HIGH_BIT_DEPTH
    paddd m0, [pd_1]
    psrld m0, 1
    paddd m12, m0
%endmacro
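; SA8D_8x8 scores one block and accumulates into m12; the raw coefficient
; sum is halved with rounding, i.e. (x + 1) >> 1.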

%macro SA8D_16x16 0
    call pixel_sa8d_8x8_internal ; pix[0]
    add r2, 8*SIZEOF_PIXEL
    add r0, 8*SIZEOF_PIXEL
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova m10, m0
    call pixel_sa8d_8x8_internal ; pix[8]
    lea r2, [r2+8*r3]
    lea r0, [r0+8*r1]
    SA8D_INTER
    call pixel_sa8d_8x8_internal ; pix[8*stride+8]
    sub r2, 8*SIZEOF_PIXEL
    sub r0, 8*SIZEOF_PIXEL
    SA8D_INTER
    call pixel_sa8d_8x8_internal ; pix[8*stride]
    SA8D_INTER
    SWAP 0, 10
%if HIGH_BIT_DEPTH == 0
    HADDUW m0, m1
%endif
    paddd m0, [pd_1]
    psrld m0, 1
    paddd m12, m0
%endmacro
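; SA8D_16x16 visits the four 8x8 sub-blocks in a Z pattern - pix[0], pix[8],
; pix[8*stride+8], pix[8*stride] - leaving r0/r2 at the left column, 8 rows
; down, ready for the caller's next step.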

%macro AVG_16x16 0
    SA8D_INTER
%if HIGH_BIT_DEPTH == 0
    HADDUW m0, m1
%endif
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov dword [esp+36], r4d
%endmacro
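; AVG_16x16 is the x86-32 counterpart of the tail of SA8D_16x16: it folds
; the halves with SA8D_INTER, rounds with (x + 1) >> 1 and accumulates the
; scalar in the [esp+36] slot instead of an xmm register.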

%macro SA8D 0
; sse2 doesn't seem to like the horizontal way of doing things
%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH)

%if ARCH_X86_64
;-----------------------------------------------------------------------------
; int pixel_sa8d_8x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
cglobal pixel_sa8d_8x8_internal
    lea r6, [r0+4*r1]
    lea r7, [r2+4*r3]
    LOAD_SUMSUB_8x4P 0, 1, 2, 8, 5, 6, 7, r0, r2
    LOAD_SUMSUB_8x4P 4, 5, 3, 9, 11, 6, 7, r6, r7
%if vertical
    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
%else ; non-sse2
    HADAMARD8_2D_HMUL 0, 1, 2, 8, 4, 5, 3, 9, 6, 11
%endif
    paddw m0, m1
    paddw m0, m2
    paddw m0, m8
    SAVE_MM_PERMUTATION
    ret

cglobal pixel_sa8d_8x8, 4,8,12
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    call pixel_sa8d_8x8_internal
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%else
    HADDW m0, m1
%endif ; HIGH_BIT_DEPTH
    movd eax, m0
    add eax, 1
    shr eax, 1
    RET

cglobal pixel_sa8d_16x16, 4,8,12
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    call pixel_sa8d_8x8_internal ; pix[0]
    add r2, 8*SIZEOF_PIXEL
    add r0, 8*SIZEOF_PIXEL
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova m10, m0
    call pixel_sa8d_8x8_internal ; pix[8]
    lea r2, [r2+8*r3]
    lea r0, [r0+8*r1]
    SA8D_INTER
    call pixel_sa8d_8x8_internal ; pix[8*stride+8]
    sub r2, 8*SIZEOF_PIXEL
    sub r0, 8*SIZEOF_PIXEL
    SA8D_INTER
    call pixel_sa8d_8x8_internal ; pix[8*stride]
    SA8D_INTER
    SWAP 0, 10
%if HIGH_BIT_DEPTH == 0
    HADDUW m0, m1
%endif
    movd eax, m0
    add eax, 1
    shr eax, 1
    RET

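; The remaining x86-64 block sizes tile SA8D_8x8/SA8D_16x16 across the
; block: 8 pixels sideways via add/sub 8*SIZEOF_PIXEL, 8 rows down via
; lea rN, [rN+8*stride], with the running total kept in m12.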
cglobal pixel_sa8d_8x16, 4,8,13
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
    pxor m12, m12
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    SA8D_8x8
    lea r0, [r0 + 8*r1]
    lea r2, [r2 + 8*r3]
    SA8D_8x8
    movd eax, m12
    RET

cglobal pixel_sa8d_8x32, 4,8,13
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
    pxor m12, m12
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    SA8D_8x8
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    SA8D_8x8
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    SA8D_8x8
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    SA8D_8x8
    movd eax, m12
    RET

cglobal pixel_sa8d_16x8, 4,8,13
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
    pxor m12, m12
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    SA8D_8x8
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    SA8D_8x8
    movd eax, m12
    RET

cglobal pixel_sa8d_16x32, 4,8,13
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
    pxor m12, m12
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    SA8D_16x16
    lea r0, [r0+8*r1]
    lea r2, [r2+8*r3]
    SA8D_16x16
    movd eax, m12
    RET

cglobal pixel_sa8d_16x64, 4,8,13
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
    pxor m12, m12
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    SA8D_16x16
    lea r0, [r0+8*r1]
    lea r2, [r2+8*r3]
    SA8D_16x16
    lea r0, [r0+8*r1]
    lea r2, [r2+8*r3]
    SA8D_16x16
    lea r0, [r0+8*r1]
    lea r2, [r2+8*r3]
    SA8D_16x16
    movd eax, m12
    RET

cglobal pixel_sa8d_24x32, 4,8,13
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
    pxor m12, m12
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    SA8D_8x8
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    SA8D_8x8
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    SA8D_8x8
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    SA8D_8x8
    sub r0, 8*SIZEOF_PIXEL
    sub r2, 8*SIZEOF_PIXEL
    SA8D_8x8
    sub r0, 8*SIZEOF_PIXEL
    sub r2, 8*SIZEOF_PIXEL
    SA8D_8x8
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    SA8D_8x8
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    SA8D_8x8
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    SA8D_8x8
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    SA8D_8x8
    sub r0, 8*SIZEOF_PIXEL
    sub r2, 8*SIZEOF_PIXEL
    SA8D_8x8
    sub r0, 8*SIZEOF_PIXEL
    sub r2, 8*SIZEOF_PIXEL
    SA8D_8x8
    movd eax, m12
    RET

cglobal pixel_sa8d_32x8, 4,8,13
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
    pxor m12, m12
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    SA8D_8x8
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    SA8D_8x8
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    SA8D_8x8
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    SA8D_8x8
    movd eax, m12
    RET

cglobal pixel_sa8d_32x16, 4,8,13
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
    pxor m12, m12
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    add r2, 16*SIZEOF_PIXEL
    add r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    movd eax, m12
    RET
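; Between horizontally adjacent 16x16 tiles, r4/r5 are briefly repurposed
; as 8*stride to rewind the row pointers, then reloaded with 3*stride
; (which the 8x8 internal expects) before the next call.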

cglobal pixel_sa8d_32x24, 4,8,13
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
    pxor m12, m12
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    SA8D_8x8
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    SA8D_8x8
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    SA8D_8x8
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    SA8D_8x8
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    SA8D_8x8
    sub r0, 8*SIZEOF_PIXEL
    sub r2, 8*SIZEOF_PIXEL
    SA8D_8x8
    sub r0, 8*SIZEOF_PIXEL
    sub r2, 8*SIZEOF_PIXEL
    SA8D_8x8
    sub r0, 8*SIZEOF_PIXEL
    sub r2, 8*SIZEOF_PIXEL
    SA8D_8x8
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    SA8D_8x8
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    SA8D_8x8
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    SA8D_8x8
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    SA8D_8x8
    movd eax, m12
    RET

cglobal pixel_sa8d_32x32, 4,8,13
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
    pxor m12, m12
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    add r2, 16*SIZEOF_PIXEL
    add r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r0, [r0+8*r1]
    lea r2, [r2+8*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    sub r2, 16*SIZEOF_PIXEL
    sub r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    movd eax, m12
    RET

cglobal pixel_sa8d_32x64, 4,8,13
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
    pxor m12, m12
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    add r2, 16*SIZEOF_PIXEL
    add r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r0, [r0+8*r1]
    lea r2, [r2+8*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    sub r2, 16*SIZEOF_PIXEL
    sub r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r0, [r0+8*r1]
    lea r2, [r2+8*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    add r2, 16*SIZEOF_PIXEL
    add r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r0, [r0+8*r1]
    lea r2, [r2+8*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    sub r2, 16*SIZEOF_PIXEL
    sub r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    movd eax, m12
    RET

cglobal pixel_sa8d_48x64, 4,8,13
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
    pxor m12, m12
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    add r2, 16*SIZEOF_PIXEL
    add r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    add r2, 16*SIZEOF_PIXEL
    add r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r0, [r0+8*r1]
    lea r2, [r2+8*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    sub r2, 16*SIZEOF_PIXEL
    sub r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    sub r2, 16*SIZEOF_PIXEL
    sub r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r0, [r0+8*r1]
    lea r2, [r2+8*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    add r2, 16*SIZEOF_PIXEL
    add r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    add r2, 16*SIZEOF_PIXEL
    add r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r0, [r0+8*r1]
    lea r2, [r2+8*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    sub r2, 16*SIZEOF_PIXEL
    sub r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    sub r2, 16*SIZEOF_PIXEL
    sub r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    movd eax, m12
    RET

cglobal pixel_sa8d_64x16, 4,8,13
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
    pxor m12, m12
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    add r2, 16*SIZEOF_PIXEL
    add r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    add r2, 16*SIZEOF_PIXEL
    add r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    add r2, 16*SIZEOF_PIXEL
    add r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    movd eax, m12
    RET

cglobal pixel_sa8d_64x32, 4,8,13
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
    pxor m12, m12
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    add r2, 16*SIZEOF_PIXEL
    add r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    add r2, 16*SIZEOF_PIXEL
    add r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    add r2, 16*SIZEOF_PIXEL
    add r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r0, [r0+8*r1]
    lea r2, [r2+8*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    sub r2, 16*SIZEOF_PIXEL
    sub r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    sub r2, 16*SIZEOF_PIXEL
    sub r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    sub r2, 16*SIZEOF_PIXEL
    sub r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    movd eax, m12
    RET

cglobal pixel_sa8d_64x48, 4,8,13
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
    pxor m12, m12
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    add r2, 16*SIZEOF_PIXEL
    add r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    add r2, 16*SIZEOF_PIXEL
    add r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    add r2, 16*SIZEOF_PIXEL
    add r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r0, [r0+8*r1]
    lea r2, [r2+8*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    sub r2, 16*SIZEOF_PIXEL
    sub r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    sub r2, 16*SIZEOF_PIXEL
    sub r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    sub r2, 16*SIZEOF_PIXEL
    sub r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r0, [r0+8*r1]
    lea r2, [r2+8*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    add r2, 16*SIZEOF_PIXEL
    add r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    add r2, 16*SIZEOF_PIXEL
    add r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    add r2, 16*SIZEOF_PIXEL
    add r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    movd eax, m12
    RET

cglobal pixel_sa8d_64x64, 4,8,13
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
    pxor m12, m12
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    add r2, 16*SIZEOF_PIXEL
    add r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    add r2, 16*SIZEOF_PIXEL
    add r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    add r2, 16*SIZEOF_PIXEL
    add r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r0, [r0+8*r1]
    lea r2, [r2+8*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    sub r2, 16*SIZEOF_PIXEL
    sub r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    sub r2, 16*SIZEOF_PIXEL
    sub r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    sub r2, 16*SIZEOF_PIXEL
    sub r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r0, [r0+8*r1]
    lea r2, [r2+8*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    add r2, 16*SIZEOF_PIXEL
    add r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    add r2, 16*SIZEOF_PIXEL
    add r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    add r2, 16*SIZEOF_PIXEL
    add r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r0, [r0+8*r1]
    lea r2, [r2+8*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    sub r2, 16*SIZEOF_PIXEL
    sub r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    sub r2, 16*SIZEOF_PIXEL
    sub r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    sub r2, 16*SIZEOF_PIXEL
    sub r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    movd eax, m12
    RET

%else ; ARCH_X86_32
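; x86-32: with too few registers to keep block pointers live, the wrappers
; below realign esp, reserve a spill area, and reload the original r0/r2
; arguments from the caller's frame at [r6+20]/[r6+28] (r6 holds the old esp).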
%if mmsize == 16
cglobal pixel_sa8d_8x8_internal
    %define spill0 [esp+4]
    %define spill1 [esp+20]
    %define spill2 [esp+36]
%if vertical
    LOAD_DIFF_8x4P 0, 1, 2, 3, 4, 5, 6, r0, r2, 1
    HADAMARD4_2D 0, 1, 2, 3, 4
    movdqa spill0, m3
    LOAD_DIFF_8x4P 4, 5, 6, 7, 3, 3, 2, r0, r2, 1
    HADAMARD4_2D 4, 5, 6, 7, 3
    HADAMARD2_2D 0, 4, 1, 5, 3, qdq, amax
    movdqa m3, spill0
    paddw m0, m1
    HADAMARD2_2D 2, 6, 3, 7, 5, qdq, amax
%else ; non-sse2
    mova m7, [hmul_8p]
    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 5, 6, 7, r0, r2, 1
    ; could do first HADAMARD4_V here to save spilling later
    ; surprisingly, not a win on conroe or even p4
    mova spill0, m2
    mova spill1, m3
    mova spill2, m1
    SWAP 1, 7
    LOAD_SUMSUB_8x4P 4, 5, 6, 7, 2, 3, 1, r0, r2, 1
    HADAMARD4_V 4, 5, 6, 7, 3
    mova m1, spill2
    mova m2, spill0
    mova m3, spill1
    mova spill0, m6
    mova spill1, m7
    HADAMARD4_V 0, 1, 2, 3, 7
    SUMSUB_BADC w, 0, 4, 1, 5, 7
    HADAMARD 2, sumsub, 0, 4, 7, 6
    HADAMARD 2, sumsub, 1, 5, 7, 6
    HADAMARD 1, amax, 0, 4, 7, 6
    HADAMARD 1, amax, 1, 5, 7, 6
    mova m6, spill0
    mova m7, spill1
    paddw m0, m1
    SUMSUB_BADC w, 2, 6, 3, 7, 4
    HADAMARD 2, sumsub, 2, 6, 4, 5
    HADAMARD 2, sumsub, 3, 7, 4, 5
    HADAMARD 1, amax, 2, 6, 4, 5
    HADAMARD 1, amax, 3, 7, 4, 5
%endif ; sse2/non-sse2
    paddw m0, m2
    paddw m0, m3
    SAVE_MM_PERMUTATION
    ret
%endif ; mmsize == 16

cglobal pixel_sa8d_8x8_internal2
    %define spill0 [esp+4]
    LOAD_DIFF_8x4P 0, 1, 2, 3, 4, 5, 6, r0, r2, 1
    HADAMARD4_2D 0, 1, 2, 3, 4
    movdqa spill0, m3
    LOAD_DIFF_8x4P 4, 5, 6, 7, 3, 3, 2, r0, r2, 1
    HADAMARD4_2D 4, 5, 6, 7, 3
    HADAMARD2_2D 0, 4, 1, 5, 3, qdq, amax
    movdqa m3, spill0
    paddw m0, m1
    HADAMARD2_2D 2, 6, 3, 7, 5, qdq, amax
    paddw m0, m2
    paddw m0, m3
    SAVE_MM_PERMUTATION
    ret
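; pixel_sa8d_8x8_internal2 is the plain SSE2 (vertical) transform with a
; single stack spill; the 32-bit size-specific wrappers below all call it.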

cglobal pixel_sa8d_8x8, 4,7
    FIX_STRIDES r1, r3
    mov r6, esp
    and esp, ~15
    sub esp, 48
    lea r4, [3*r1]
    lea r5, [3*r3]
    call pixel_sa8d_8x8_internal
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%else
    HADDW m0, m1
%endif ; HIGH_BIT_DEPTH
    movd eax, m0
    add eax, 1
    shr eax, 1
    mov esp, r6
    RET

cglobal pixel_sa8d_16x16, 4,7
    FIX_STRIDES r1, r3
    mov r6, esp
    and esp, ~15
    sub esp, 64
    lea r4, [3*r1]
    lea r5, [3*r3]
    call pixel_sa8d_8x8_internal
%if mmsize == 8
    lea r0, [r0+4*r1]
    lea r2, [r2+4*r3]
%endif
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    SA8D_INTER
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal
%if mmsize == 8
    lea r0, [r0+4*r1]
    lea r2, [r2+4*r3]
%else
    SA8D_INTER
%endif
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal
%if HIGH_BIT_DEPTH
    SA8D_INTER
%else ; !HIGH_BIT_DEPTH
    paddusw m0, [esp+64-mmsize]
%if mmsize == 16
    HADDUW m0, m1
%else
    mova m2, [esp+48]
    pxor m7, m7
    mova m1, m0
    mova m3, m2
    punpcklwd m0, m7
    punpckhwd m1, m7
    punpcklwd m2, m7
    punpckhwd m3, m7
    paddd m0, m1
    paddd m2, m3
    paddd m0, m2
    HADDD m0, m1
%endif
%endif ; HIGH_BIT_DEPTH
    movd eax, m0
    add eax, 1
    shr eax, 1
    mov esp, r6
    RET

cglobal pixel_sa8d_8x16, 4,7,8
    FIX_STRIDES r1, r3
    mov r6, esp
    and esp, ~15
    sub esp, 64

    lea r4, [r1 + 2*r1]
    lea r5, [r3 + 2*r3]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    mov dword [esp+36], r4d

    mov r0, [r6+20]
    mov r2, [r6+28]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov eax, r4d
    mov esp, r6
    RET
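; Each 8x8 result is reduced to a scalar (HADDUW, then (x + 1) >> 1) and
; summed in the [esp+36] slot; the taller and wider sizes below repeat
; this per block.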

cglobal pixel_sa8d_8x32, 4,7,8
    FIX_STRIDES r1, r3
    mov r6, esp
    and esp, ~15
    sub esp, 64

    lea r4, [r1 + 2*r1]
    lea r5, [r3 + 2*r3]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    mov dword [esp+36], r4d

    mov r0, [r6+20]
    mov r2, [r6+28]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov dword [esp+36], r4d

    mov r0, [r6+20]
    mov r2, [r6+28]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov dword [esp+36], r4d

    mov r0, [r6+20]
    mov r2, [r6+28]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov eax, r4d
    mov esp, r6
    RET

cglobal pixel_sa8d_16x8, 4,7,8
    FIX_STRIDES r1, r3
    mov r6, esp
    and esp, ~15
    sub esp, 64

    lea r4, [r1 + 2*r1]
    lea r5, [r3 + 2*r3]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    mov dword [esp+36], r4d

    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov eax, r4d
    mov esp, r6
    RET

cglobal pixel_sa8d_16x32, 4,7,8
    FIX_STRIDES r1, r3
    mov r6, esp
    and esp, ~15
    sub esp, 64

    lea r4, [r1 + 2*r1]
    lea r5, [r3 + 2*r3]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [rsp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0

    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
%if HIGH_BIT_DEPTH == 0
    HADDUW m0, m1
%endif
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    mov dword [esp+36], r4d

    mov r0, [r6+20]
    mov r2, [r6+28]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0

    mov r0, [r6+20]
    mov r2, [r6+28]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
%if HIGH_BIT_DEPTH == 0
    HADDUW m0, m1
%endif
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov eax, r4d
    mov esp, r6
    RET

cglobal pixel_sa8d_16x64, 4,7,8
    FIX_STRIDES r1, r3
    mov r6, esp
    and esp, ~15
    sub esp, 64

    lea r4, [r1 + 2*r1]
    lea r5, [r3 + 2*r3]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [rsp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0

    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
%if HIGH_BIT_DEPTH == 0
    HADDUW m0, m1
%endif
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    mov dword [esp+36], r4d

    mov r0, [r6+20]
    mov r2, [r6+28]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    mov [r6+20], r0
    mov [r6+28], r2

    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0

    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16

    mov r0, [r6+20]
    mov r2, [r6+28]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    mov [r6+20], r0
    mov [r6+28], r2

    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0

    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16

    mov r0, [r6+20]
    mov r2, [r6+28]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    mov [r6+20], r0
    mov [r6+28], r2

    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0

    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
%if HIGH_BIT_DEPTH == 0
    HADDUW m0, m1
%endif
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov eax, r4d
    mov esp, r6
    RET

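; For the taller sizes the row base is advanced in place: after stepping
; down a band of rows the updated r0/r2 are stored back to [r6+20]/[r6+28],
; so every following column restarts from the new row origin.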
cglobal pixel_sa8d_24x32, 4,7,8
    FIX_STRIDES r1, r3
    mov r6, esp
    and esp, ~15
    sub esp, 64

    lea r4, [r1 + 2*r1]
    lea r5, [r3 + 2*r3]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    mov dword [esp+36], r4d

    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov dword [esp+36], r4d

    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 16*SIZEOF_PIXEL
    add r2, 16*SIZEOF_PIXEL
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov dword [esp+36], r4d

    mov r0, [r6+20]
    mov r2, [r6+28]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    mov [r6+20], r0
    mov [r6+28], r2
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov dword [esp+36], r4d

    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov dword [esp+36], r4d

    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 16*SIZEOF_PIXEL
    add r2, 16*SIZEOF_PIXEL
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov dword [esp+36], r4d

    mov r0, [r6+20]
    mov r2, [r6+28]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    mov [r6+20], r0
    mov [r6+28], r2
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov dword [esp+36], r4d

    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov dword [esp+36], r4d

    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 16*SIZEOF_PIXEL
    add r2, 16*SIZEOF_PIXEL
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov dword [esp+36], r4d

    mov r0, [r6+20]
    mov r2, [r6+28]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    mov [r6+20], r0
    mov [r6+28], r2
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov dword [esp+36], r4d

    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov dword [esp+36], r4d

    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 16*SIZEOF_PIXEL
    add r2, 16*SIZEOF_PIXEL
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov eax, r4d
    mov esp, r6
    RET

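; 32-wide sizes: four 8-pixel columns per row band; depending on bit depth
; the per-block results are merged either in the vector domain (SA8D_INTER)
; or through the scalar [esp+36] accumulator.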
4176cglobal pixel_sa8d_32x8, 4,7,8
4177 FIX_STRIDES r1, r3
4178 mov r6, esp
4179 and esp, ~15
4180 sub esp, 64
4181
4182 lea r4, [r1 + 2*r1]
4183 lea r5, [r3 + 2*r3]
4184 call pixel_sa8d_8x8_internal2
4185 HADDUW m0, m1
4186 movd r4d, m0
4187 add r4d, 1
4188 shr r4d, 1
4189 mov dword [esp+36], r4d
4190
4191 mov r0, [r6+20]
4192 mov r2, [r6+28]
4193 add r0, 8*SIZEOF_PIXEL
4194 add r2, 8*SIZEOF_PIXEL
4195 lea r4, [r1 + 2*r1]
4196 call pixel_sa8d_8x8_internal2
4197 HADDUW m0, m1
4198 movd r4d, m0
4199 add r4d, 1
4200 shr r4d, 1
4201 add r4d, dword [esp+36]
4202 mov dword [esp+36], r4d
4203
4204 mov r0, [r6+20]
4205 mov r2, [r6+28]
4206 add r0, 16*SIZEOF_PIXEL
4207 add r2, 16*SIZEOF_PIXEL
4208 lea r4, [r1 + 2*r1]
4209 call pixel_sa8d_8x8_internal2
4210 HADDUW m0, m1
4211 movd r4d, m0
4212 add r4d, 1
4213 shr r4d, 1
4214 add r4d, dword [esp+36]
4215 mov dword [esp+36], r4d
4216
4217 mov r0, [r6+20]
4218 mov r2, [r6+28]
4219 add r0, 24*SIZEOF_PIXEL
4220 add r2, 24*SIZEOF_PIXEL
4221 lea r4, [r1 + 2*r1]
4222 call pixel_sa8d_8x8_internal2
4223 HADDUW m0, m1
4224 movd r4d, m0
4225 add r4d, 1
4226 shr r4d, 1
4227 add r4d, dword [esp+36]
4228 mov eax, r4d
4229 mov esp, r6
4230 RET
4231
4232cglobal pixel_sa8d_32x16, 4,7,8
4233 FIX_STRIDES r1, r3
4234 mov r6, esp
4235 and esp, ~15
4236 sub esp, 64
4237
4238 lea r4, [r1 + 2*r1]
4239 lea r5, [r3 + 2*r3]
4240 call pixel_sa8d_8x8_internal2
4241%if HIGH_BIT_DEPTH
4242 HADDUW m0, m1
4243%endif
4244 mova [rsp+48], m0
4245 call pixel_sa8d_8x8_internal2
4246 SA8D_INTER
4247 mova [esp+48], m0
4248
4249 mov r0, [r6+20]
4250 mov r2, [r6+28]
4251 add r0, 8*SIZEOF_PIXEL
4252 add r2, 8*SIZEOF_PIXEL
4253 call pixel_sa8d_8x8_internal2
4254 SA8D_INTER
4255 mova [esp+48], m0
4256 call pixel_sa8d_8x8_internal2
4257 SA8D_INTER
4258%if HIGH_BIT_DEPTH == 0
4259 HADDUW m0, m1
4260%endif
4261 movd r4d, m0
4262 add r4d, 1
4263 shr r4d, 1
4264 mov dword [esp+36], r4d
4265
4266 mov r0, [r6+20]
4267 mov r2, [r6+28]
4268 add r0, 16*SIZEOF_PIXEL
4269 add r2, 16*SIZEOF_PIXEL
4270 lea r4, [r1 + 2*r1]
4271 call pixel_sa8d_8x8_internal2
4272%if HIGH_BIT_DEPTH
4273 HADDUW m0, m1
4274%endif
4275 mova [esp+48], m0
4276 call pixel_sa8d_8x8_internal2
4277 SA8D_INTER
4278 mova [esp+48], m0
4279
4280 mov r0, [r6+20]
4281 mov r2, [r6+28]
4282 add r0, 24*SIZEOF_PIXEL
4283 add r2, 24*SIZEOF_PIXEL
4284 call pixel_sa8d_8x8_internal2
4285 SA8D_INTER
4286 mova [esp+64-mmsize], m0
4287 call pixel_sa8d_8x8_internal2
4288 SA8D_INTER
4289%if HIGH_BIT_DEPTH == 0
4290 HADDUW m0, m1
4291%endif
4292 movd r4d, m0
4293 add r4d, 1
4294 shr r4d, 1
4295 add r4d, dword [esp+36]
4296 mov eax, r4d
4297 mov esp, r6
4298 RET
4299
4300cglobal pixel_sa8d_32x24, 4,7,8
4301 FIX_STRIDES r1, r3
4302 mov r6, esp
4303 and esp, ~15
4304 sub esp, 64
4305
4306 lea r4, [r1 + 2*r1]
4307 lea r5, [r3 + 2*r3]
4308 call pixel_sa8d_8x8_internal2
4309 HADDUW m0, m1
4310 movd r4d, m0
4311 add r4d, 1
4312 shr r4d, 1
4313 mov dword [esp+36], r4d
4314
4315 mov r0, [r6+20]
4316 mov r2, [r6+28]
4317 add r0, 8*SIZEOF_PIXEL
4318 add r2, 8*SIZEOF_PIXEL
4319 lea r4, [r1 + 2*r1]
4320 call pixel_sa8d_8x8_internal2
4321 HADDUW m0, m1
4322 movd r4d, m0
4323 add r4d, 1
4324 shr r4d, 1
4325 add r4d, dword [esp+36]
4326 mov dword [esp+36], r4d
4327
4328 mov r0, [r6+20]
4329 mov r2, [r6+28]
4330 add r0, 16*SIZEOF_PIXEL
4331 add r2, 16*SIZEOF_PIXEL
4332 lea r4, [r1 + 2*r1]
4333 call pixel_sa8d_8x8_internal2
4334 HADDUW m0, m1
4335 movd r4d, m0
4336 add r4d, 1
4337 shr r4d, 1
4338 add r4d, dword [esp+36]
4339 mov dword [esp+36], r4d
4340
4341 mov r0, [r6+20]
4342 mov r2, [r6+28]
4343 add r0, 24*SIZEOF_PIXEL
4344 add r2, 24*SIZEOF_PIXEL
4345 lea r4, [r1 + 2*r1]
4346 call pixel_sa8d_8x8_internal2
4347 HADDUW m0, m1
4348 movd r4d, m0
4349 add r4d, 1
4350 shr r4d, 1
4351 add r4d, dword [esp+36]
4352 mov dword [esp+36], r4d
4353
4354 mov r0, [r6+20]
4355 mov r2, [r6+28]
4356 lea r0, [r0 + r1*8]
4357 lea r2, [r2 + r3*8]
4358 mov [r6+20], r0
4359 mov [r6+28], r2
4360 lea r4, [r1 + 2*r1]
4361 call pixel_sa8d_8x8_internal2
4362 HADDUW m0, m1
4363 movd r4d, m0
4364 add r4d, 1
4365 shr r4d, 1
4366 add r4d, dword [esp+36]
4367 mov dword [esp+36], r4d
4368
4369 mov r0, [r6+20]
4370 mov r2, [r6+28]
4371 add r0, 8*SIZEOF_PIXEL
4372 add r2, 8*SIZEOF_PIXEL
4373 lea r4, [r1 + 2*r1]
4374 call pixel_sa8d_8x8_internal2
4375 HADDUW m0, m1
4376 movd r4d, m0
4377 add r4d, 1
4378 shr r4d, 1
4379 add r4d, dword [esp+36]
4380 mov dword [esp+36], r4d
4381
4382 mov r0, [r6+20]
4383 mov r2, [r6+28]
4384 add r0, 16*SIZEOF_PIXEL
4385 add r2, 16*SIZEOF_PIXEL
4386 lea r4, [r1 + 2*r1]
4387 call pixel_sa8d_8x8_internal2
4388 HADDUW m0, m1
4389 movd r4d, m0
4390 add r4d, 1
4391 shr r4d, 1
4392 add r4d, dword [esp+36]
4393 mov dword [esp+36], r4d
4394
4395 mov r0, [r6+20]
4396 mov r2, [r6+28]
4397 add r0, 24*SIZEOF_PIXEL
4398 add r2, 24*SIZEOF_PIXEL
4399 lea r4, [r1 + 2*r1]
4400 call pixel_sa8d_8x8_internal2
4401 HADDUW m0, m1
4402 movd r4d, m0
4403 add r4d, 1
4404 shr r4d, 1
4405 add r4d, dword [esp+36]
4406 mov dword [esp+36], r4d
4407
4408 mov r0, [r6+20]
4409 mov r2, [r6+28]
4410 lea r0, [r0 + r1*8]
4411 lea r2, [r2 + r3*8]
4412 mov [r6+20], r0
4413 mov [r6+28], r2
4414 lea r4, [r1 + 2*r1]
4415 call pixel_sa8d_8x8_internal2
4416 HADDUW m0, m1
4417 movd r4d, m0
4418 add r4d, 1
4419 shr r4d, 1
4420 add r4d, dword [esp+36]
4421 mov dword [esp+36], r4d
4422
4423 mov r0, [r6+20]
4424 mov r2, [r6+28]
4425 add r0, 8*SIZEOF_PIXEL
4426 add r2, 8*SIZEOF_PIXEL
4427 lea r4, [r1 + 2*r1]
4428 call pixel_sa8d_8x8_internal2
4429 HADDUW m0, m1
4430 movd r4d, m0
4431 add r4d, 1
4432 shr r4d, 1
4433 add r4d, dword [esp+36]
4434 mov dword [esp+36], r4d
4435
4436 mov r0, [r6+20]
4437 mov r2, [r6+28]
4438 add r0, 16*SIZEOF_PIXEL
4439 add r2, 16*SIZEOF_PIXEL
4440 lea r4, [r1 + 2*r1]
4441 call pixel_sa8d_8x8_internal2
4442 HADDUW m0, m1
4443 movd r4d, m0
4444 add r4d, 1
4445 shr r4d, 1
4446 add r4d, dword [esp+36]
4447 mov dword [esp+36], r4d
4448
4449 mov r0, [r6+20]
4450 mov r2, [r6+28]
4451 add r0, 24*SIZEOF_PIXEL
4452 add r2, 24*SIZEOF_PIXEL
4453 lea r4, [r1 + 2*r1]
4454 call pixel_sa8d_8x8_internal2
4455 HADDUW m0, m1
4456 movd r4d, m0
4457 add r4d, 1
4458 shr r4d, 1
4459 add r4d, dword [esp+36]
4460 mov eax, r4d
4461 mov esp, r6
4462 RET
4463
4464cglobal pixel_sa8d_32x32, 4,7,8
4465 FIX_STRIDES r1, r3
4466 mov r6, esp
4467 and esp, ~15
4468 sub esp, 64
4469
4470 lea r4, [r1 + 2*r1]
4471 lea r5, [r3 + 2*r3]
4472 call pixel_sa8d_8x8_internal2
4473%if HIGH_BIT_DEPTH
4474 HADDUW m0, m1
4475%endif
4476 mova [rsp+48], m0
4477 call pixel_sa8d_8x8_internal2
4478 SA8D_INTER
4479 mova [esp+48], m0
4480
4481 mov r0, [r6+20]
4482 mov r2, [r6+28]
4483 add r0, 8*SIZEOF_PIXEL
4484 add r2, 8*SIZEOF_PIXEL
4485 call pixel_sa8d_8x8_internal2
4486 SA8D_INTER
4487 mova [esp+48], m0
4488 call pixel_sa8d_8x8_internal2
4489 SA8D_INTER
4490%if HIGH_BIT_DEPTH == 0
4491 HADDUW m0, m1
4492%endif
4493 movd r4d, m0
4494 add r4d, 1
4495 shr r4d, 1
4496 mov dword [esp+36], r4d
4497
4498 mov r0, [r6+20]
4499 mov r2, [r6+28]
4500 add r0, 16*SIZEOF_PIXEL
4501 add r2, 16*SIZEOF_PIXEL
4502 lea r4, [r1 + 2*r1]
4503 call pixel_sa8d_8x8_internal2
4504%if HIGH_BIT_DEPTH
4505 HADDUW m0, m1
4506%endif
4507 mova [esp+48], m0
4508 call pixel_sa8d_8x8_internal2
4509 SA8D_INTER
4510 mova [esp+48], m0
4511
4512 mov r0, [r6+20]
4513 mov r2, [r6+28]
4514 add r0, 24*SIZEOF_PIXEL
4515 add r2, 24*SIZEOF_PIXEL
4516 call pixel_sa8d_8x8_internal2
4517 SA8D_INTER
4518 mova [esp+64-mmsize], m0
4519 call pixel_sa8d_8x8_internal2
4520 AVG_16x16
4521
4522 mov r0, [r6+20]
4523 mov r2, [r6+28]
4524 lea r0, [r0 + r1*8]
4525 lea r2, [r2 + r3*8]
4526 lea r0, [r0 + r1*8]
4527 lea r2, [r2 + r3*8]
4528 lea r4, [r1 + 2*r1]
4529 call pixel_sa8d_8x8_internal2
4530%if HIGH_BIT_DEPTH
4531 HADDUW m0, m1
4532%endif
4533 mova [esp+48], m0
4534 call pixel_sa8d_8x8_internal2
4535 SA8D_INTER
4536 mova [esp+48], m0
4537
4538 mov r0, [r6+20]
4539 mov r2, [r6+28]
4540 lea r0, [r0 + r1*8]
4541 lea r2, [r2 + r3*8]
4542 lea r0, [r0 + r1*8]
4543 lea r2, [r2 + r3*8]
4544 add r0, 8*SIZEOF_PIXEL
4545 add r2, 8*SIZEOF_PIXEL
4546 call pixel_sa8d_8x8_internal2
4547 SA8D_INTER
4548 mova [esp+64-mmsize], m0
4549 call pixel_sa8d_8x8_internal2
4550 AVG_16x16
4551
4552 mov r0, [r6+20]
4553 mov r2, [r6+28]
4554 lea r0, [r0 + r1*8]
4555 lea r2, [r2 + r3*8]
4556 lea r0, [r0 + r1*8]
4557 lea r2, [r2 + r3*8]
4558 add r0, 16*SIZEOF_PIXEL
4559 add r2, 16*SIZEOF_PIXEL
4560 lea r4, [r1 + 2*r1]
4561 call pixel_sa8d_8x8_internal2
4562%if HIGH_BIT_DEPTH
4563 HADDUW m0, m1
4564%endif
4565 mova [esp+48], m0
4566 call pixel_sa8d_8x8_internal2
4567 SA8D_INTER
4568 mova [esp+48], m0
4569
4570 mov r0, [r6+20]
4571 mov r2, [r6+28]
4572 lea r0, [r0 + r1*8]
4573 lea r2, [r2 + r3*8]
4574 lea r0, [r0 + r1*8]
4575 lea r2, [r2 + r3*8]
4576 add r0, 24*SIZEOF_PIXEL
4577 add r2, 24*SIZEOF_PIXEL
4578 call pixel_sa8d_8x8_internal2
4579 SA8D_INTER
4580 mova [esp+64-mmsize], m0
4581 call pixel_sa8d_8x8_internal2
4582 SA8D_INTER
4583%if HIGH_BIT_DEPTH == 0
4584 HADDUW m0, m1
4585%endif
4586 movd r4d, m0
4587 add r4d, 1
4588 shr r4d, 1
4589 add r4d, dword [esp+36]
4590 mov eax, r4d
4591 mov esp, r6
4592 RET
4593
4594cglobal pixel_sa8d_32x64, 4,7,8
4595 FIX_STRIDES r1, r3
4596 mov r6, esp
4597 and esp, ~15
4598 sub esp, 64
4599
4600 lea r4, [r1 + 2*r1]
4601 lea r5, [r3 + 2*r3]
4602 call pixel_sa8d_8x8_internal2
4603%if HIGH_BIT_DEPTH
4604 HADDUW m0, m1
4605%endif
    mova [esp+48], m0
4607 call pixel_sa8d_8x8_internal2
4608 SA8D_INTER
4609 mova [esp+48], m0
4610
4611 mov r0, [r6+20]
4612 mov r2, [r6+28]
4613 add r0, 8*SIZEOF_PIXEL
4614 add r2, 8*SIZEOF_PIXEL
4615 call pixel_sa8d_8x8_internal2
4616 SA8D_INTER
4617 mova [esp+48], m0
4618 call pixel_sa8d_8x8_internal2
4619 SA8D_INTER
4620%if HIGH_BIT_DEPTH == 0
4621 HADDUW m0, m1
4622%endif
4623 movd r4d, m0
4624 add r4d, 1
4625 shr r4d, 1
4626 mov dword [esp+36], r4d
4627
4628 mov r0, [r6+20]
4629 mov r2, [r6+28]
4630 add r0, 16*SIZEOF_PIXEL
4631 add r2, 16*SIZEOF_PIXEL
4632 lea r4, [r1 + 2*r1]
4633 call pixel_sa8d_8x8_internal2
4634%if HIGH_BIT_DEPTH
4635 HADDUW m0, m1
4636%endif
4637 mova [esp+48], m0
4638 call pixel_sa8d_8x8_internal2
4639 SA8D_INTER
4640 mova [esp+48], m0
4641
4642 mov r0, [r6+20]
4643 mov r2, [r6+28]
4644 add r0, 24*SIZEOF_PIXEL
4645 add r2, 24*SIZEOF_PIXEL
4646 call pixel_sa8d_8x8_internal2
4647 SA8D_INTER
4648 mova [esp+64-mmsize], m0
4649 call pixel_sa8d_8x8_internal2
4650 AVG_16x16
4651
4652 mov r0, [r6+20]
4653 mov r2, [r6+28]
4654 lea r0, [r0 + r1*8]
4655 lea r2, [r2 + r3*8]
4656 lea r0, [r0 + r1*8]
4657 lea r2, [r2 + r3*8]
4658 mov [r6+20], r0
4659 mov [r6+28], r2
4660
4661 lea r4, [r1 + 2*r1]
4662 call pixel_sa8d_8x8_internal2
4663%if HIGH_BIT_DEPTH
4664 HADDUW m0, m1
4665%endif
4666 mova [esp+48], m0
4667 call pixel_sa8d_8x8_internal2
4668 SA8D_INTER
4669 mova [esp+48], m0
4670
4671 mov r0, [r6+20]
4672 mov r2, [r6+28]
4673 add r0, 8*SIZEOF_PIXEL
4674 add r2, 8*SIZEOF_PIXEL
4675 call pixel_sa8d_8x8_internal2
4676 SA8D_INTER
4677 mova [esp+64-mmsize], m0
4678 call pixel_sa8d_8x8_internal2
4679 AVG_16x16
4680
4681 mov r0, [r6+20]
4682 mov r2, [r6+28]
4683 add r0, 16*SIZEOF_PIXEL
4684 add r2, 16*SIZEOF_PIXEL
4685 lea r4, [r1 + 2*r1]
4686 call pixel_sa8d_8x8_internal2
4687%if HIGH_BIT_DEPTH
4688 HADDUW m0, m1
4689%endif
4690 mova [esp+48], m0
4691 call pixel_sa8d_8x8_internal2
4692 SA8D_INTER
4693 mova [esp+48], m0
4694
4695 mov r0, [r6+20]
4696 mov r2, [r6+28]
4697 add r0, 24*SIZEOF_PIXEL
4698 add r2, 24*SIZEOF_PIXEL
4699 call pixel_sa8d_8x8_internal2
4700 SA8D_INTER
4701 mova [esp+64-mmsize], m0
4702 call pixel_sa8d_8x8_internal2
4703 AVG_16x16
4704
4705 mov r0, [r6+20]
4706 mov r2, [r6+28]
4707 lea r0, [r0 + r1*8]
4708 lea r2, [r2 + r3*8]
4709 lea r0, [r0 + r1*8]
4710 lea r2, [r2 + r3*8]
4711 mov [r6+20], r0
4712 mov [r6+28], r2
4713
4714 lea r4, [r1 + 2*r1]
4715 call pixel_sa8d_8x8_internal2
4716%if HIGH_BIT_DEPTH
4717 HADDUW m0, m1
4718%endif
4719 mova [esp+48], m0
4720 call pixel_sa8d_8x8_internal2
4721 SA8D_INTER
4722 mova [esp+48], m0
4723
4724 mov r0, [r6+20]
4725 mov r2, [r6+28]
4726 add r0, 8*SIZEOF_PIXEL
4727 add r2, 8*SIZEOF_PIXEL
4728 call pixel_sa8d_8x8_internal2
4729 SA8D_INTER
4730 mova [esp+64-mmsize], m0
4731 call pixel_sa8d_8x8_internal2
4732 AVG_16x16
4733
4734 mov r0, [r6+20]
4735 mov r2, [r6+28]
4736 add r0, 16*SIZEOF_PIXEL
4737 add r2, 16*SIZEOF_PIXEL
4738 lea r4, [r1 + 2*r1]
4739 call pixel_sa8d_8x8_internal2
4740%if HIGH_BIT_DEPTH
4741 HADDUW m0, m1
4742%endif
4743 mova [esp+48], m0
4744 call pixel_sa8d_8x8_internal2
4745 SA8D_INTER
4746 mova [esp+48], m0
4747
4748 mov r0, [r6+20]
4749 mov r2, [r6+28]
4750 add r0, 24*SIZEOF_PIXEL
4751 add r2, 24*SIZEOF_PIXEL
4752 call pixel_sa8d_8x8_internal2
4753 SA8D_INTER
4754 mova [esp+64-mmsize], m0
4755 call pixel_sa8d_8x8_internal2
4756 AVG_16x16
4757
4758 mov r0, [r6+20]
4759 mov r2, [r6+28]
4760 lea r0, [r0 + r1*8]
4761 lea r2, [r2 + r3*8]
4762 lea r0, [r0 + r1*8]
4763 lea r2, [r2 + r3*8]
4764 mov [r6+20], r0
4765 mov [r6+28], r2
4766
4767 lea r4, [r1 + 2*r1]
4768 call pixel_sa8d_8x8_internal2
4769%if HIGH_BIT_DEPTH
4770 HADDUW m0, m1
4771%endif
4772 mova [esp+48], m0
4773 call pixel_sa8d_8x8_internal2
4774 SA8D_INTER
4775 mova [esp+48], m0
4776
4777 mov r0, [r6+20]
4778 mov r2, [r6+28]
4779 add r0, 8*SIZEOF_PIXEL
4780 add r2, 8*SIZEOF_PIXEL
4781 call pixel_sa8d_8x8_internal2
4782 SA8D_INTER
4783 mova [esp+64-mmsize], m0
4784 call pixel_sa8d_8x8_internal2
4785 AVG_16x16
4786
4787 mov r0, [r6+20]
4788 mov r2, [r6+28]
4789 add r0, 16*SIZEOF_PIXEL
4790 add r2, 16*SIZEOF_PIXEL
4791 lea r4, [r1 + 2*r1]
4792 call pixel_sa8d_8x8_internal2
4793%if HIGH_BIT_DEPTH
4794 HADDUW m0, m1
4795%endif
4796 mova [esp+48], m0
4797 call pixel_sa8d_8x8_internal2
4798 SA8D_INTER
4799 mova [esp+48], m0
4800
4801 mov r0, [r6+20]
4802 mov r2, [r6+28]
4803 add r0, 24*SIZEOF_PIXEL
4804 add r2, 24*SIZEOF_PIXEL
4805 call pixel_sa8d_8x8_internal2
4806 SA8D_INTER
4807 mova [esp+64-mmsize], m0
4808 call pixel_sa8d_8x8_internal2
4809 SA8D_INTER
4810%if HIGH_BIT_DEPTH == 0
4811 HADDUW m0, m1
4812%endif
4813 movd r4d, m0
4814 add r4d, 1
4815 shr r4d, 1
4816 add r4d, dword [esp+36]
4817 mov eax, r4d
4818 mov esp, r6
4819 RET
4820
4821cglobal pixel_sa8d_48x64, 4,7,8
4822 FIX_STRIDES r1, r3
4823 mov r6, esp
4824 and esp, ~15
4825 sub esp, 64
4826
4827 lea r4, [r1 + 2*r1]
4828 lea r5, [r3 + 2*r3]
4829 call pixel_sa8d_8x8_internal2
4830%if HIGH_BIT_DEPTH
4831 HADDUW m0, m1
4832%endif
    mova [esp+48], m0
4834 call pixel_sa8d_8x8_internal2
4835 SA8D_INTER
4836 mova [esp+48], m0
4837
4838 mov r0, [r6+20]
4839 mov r2, [r6+28]
4840 add r0, 8*SIZEOF_PIXEL
4841 add r2, 8*SIZEOF_PIXEL
4842 call pixel_sa8d_8x8_internal2
4843 SA8D_INTER
4844 mova [esp+48], m0
4845 call pixel_sa8d_8x8_internal2
4846 SA8D_INTER
4847%if HIGH_BIT_DEPTH == 0
4848 HADDUW m0, m1
4849%endif
4850 movd r4d, m0
4851 add r4d, 1
4852 shr r4d, 1
4853 mov dword [esp+36], r4d
4854
4855 mov r0, [r6+20]
4856 mov r2, [r6+28]
4857 add r0, 16*SIZEOF_PIXEL
4858 add r2, 16*SIZEOF_PIXEL
4859 lea r4, [r1 + 2*r1]
4860 call pixel_sa8d_8x8_internal2
4861%if HIGH_BIT_DEPTH
4862 HADDUW m0, m1
4863%endif
4864 mova [esp+48], m0
4865 call pixel_sa8d_8x8_internal2
4866 SA8D_INTER
4867 mova [esp+48], m0
4868
4869 mov r0, [r6+20]
4870 mov r2, [r6+28]
4871 add r0, 24*SIZEOF_PIXEL
4872 add r2, 24*SIZEOF_PIXEL
4873 call pixel_sa8d_8x8_internal2
4874 SA8D_INTER
4875 mova [esp+64-mmsize], m0
4876 call pixel_sa8d_8x8_internal2
4877 AVG_16x16
4878
4879 mov r0, [r6+20]
4880 mov r2, [r6+28]
4881 add r0, 32*SIZEOF_PIXEL
4882 add r2, 32*SIZEOF_PIXEL
4883 lea r4, [r1 + 2*r1]
4884 call pixel_sa8d_8x8_internal2
4885%if HIGH_BIT_DEPTH
4886 HADDUW m0, m1
4887%endif
4888 mova [esp+48], m0
4889 call pixel_sa8d_8x8_internal2
4890 SA8D_INTER
4891 mova [esp+48], m0
4892
4893 mov r0, [r6+20]
4894 mov r2, [r6+28]
4895 add r0, 40*SIZEOF_PIXEL
4896 add r2, 40*SIZEOF_PIXEL
4897 call pixel_sa8d_8x8_internal2
4898 SA8D_INTER
4899 mova [esp+64-mmsize], m0
4900 call pixel_sa8d_8x8_internal2
4901 AVG_16x16
4902
4903 mov r0, [r6+20]
4904 mov r2, [r6+28]
4905 lea r0, [r0 + r1*8]
4906 lea r2, [r2 + r3*8]
4907 lea r0, [r0 + r1*8]
4908 lea r2, [r2 + r3*8]
4909 mov [r6+20], r0
4910 mov [r6+28], r2
4911
4912 lea r4, [r1 + 2*r1]
4913 call pixel_sa8d_8x8_internal2
4914%if HIGH_BIT_DEPTH
4915 HADDUW m0, m1
4916%endif
4917 mova [esp+48], m0
4918 call pixel_sa8d_8x8_internal2
4919 SA8D_INTER
4920 mova [esp+48], m0
4921
4922 mov r0, [r6+20]
4923 mov r2, [r6+28]
4924 add r0, 8*SIZEOF_PIXEL
4925 add r2, 8*SIZEOF_PIXEL
4926 call pixel_sa8d_8x8_internal2
4927 SA8D_INTER
4928 mova [esp+64-mmsize], m0
4929 call pixel_sa8d_8x8_internal2
4930 AVG_16x16
4931
4932 mov r0, [r6+20]
4933 mov r2, [r6+28]
4934 add r0, 16*SIZEOF_PIXEL
4935 add r2, 16*SIZEOF_PIXEL
4936 lea r4, [r1 + 2*r1]
4937 call pixel_sa8d_8x8_internal2
4938%if HIGH_BIT_DEPTH
4939 HADDUW m0, m1
4940%endif
4941 mova [esp+48], m0
4942 call pixel_sa8d_8x8_internal2
4943 SA8D_INTER
4944 mova [esp+48], m0
4945
4946 mov r0, [r6+20]
4947 mov r2, [r6+28]
4948 add r0, 24*SIZEOF_PIXEL
4949 add r2, 24*SIZEOF_PIXEL
4950 call pixel_sa8d_8x8_internal2
4951 SA8D_INTER
4952 mova [esp+64-mmsize], m0
4953 call pixel_sa8d_8x8_internal2
4954 AVG_16x16
4955
4956 mov r0, [r6+20]
4957 mov r2, [r6+28]
4958 add r0, 32*SIZEOF_PIXEL
4959 add r2, 32*SIZEOF_PIXEL
4960 lea r4, [r1 + 2*r1]
4961 call pixel_sa8d_8x8_internal2
4962%if HIGH_BIT_DEPTH
4963 HADDUW m0, m1
4964%endif
4965 mova [esp+48], m0
4966 call pixel_sa8d_8x8_internal2
4967 SA8D_INTER
4968 mova [esp+48], m0
4969
4970 mov r0, [r6+20]
4971 mov r2, [r6+28]
4972 add r0, 40*SIZEOF_PIXEL
4973 add r2, 40*SIZEOF_PIXEL
4974 call pixel_sa8d_8x8_internal2
4975 SA8D_INTER
4976 mova [esp+64-mmsize], m0
4977 call pixel_sa8d_8x8_internal2
4978 AVG_16x16
4979
4980 mov r0, [r6+20]
4981 mov r2, [r6+28]
4982 lea r0, [r0 + r1*8]
4983 lea r2, [r2 + r3*8]
4984 lea r0, [r0 + r1*8]
4985 lea r2, [r2 + r3*8]
4986 mov [r6+20], r0
4987 mov [r6+28], r2
4988
4989 lea r4, [r1 + 2*r1]
4990 call pixel_sa8d_8x8_internal2
4991%if HIGH_BIT_DEPTH
4992 HADDUW m0, m1
4993%endif
4994 mova [esp+48], m0
4995 call pixel_sa8d_8x8_internal2
4996 SA8D_INTER
4997 mova [esp+48], m0
4998
4999 mov r0, [r6+20]
5000 mov r2, [r6+28]
5001 add r0, 8*SIZEOF_PIXEL
5002 add r2, 8*SIZEOF_PIXEL
5003 call pixel_sa8d_8x8_internal2
5004 SA8D_INTER
5005 mova [esp+64-mmsize], m0
5006 call pixel_sa8d_8x8_internal2
5007 AVG_16x16
5008
5009 mov r0, [r6+20]
5010 mov r2, [r6+28]
5011 add r0, 16*SIZEOF_PIXEL
5012 add r2, 16*SIZEOF_PIXEL
5013 lea r4, [r1 + 2*r1]
5014 call pixel_sa8d_8x8_internal2
5015%if HIGH_BIT_DEPTH
5016 HADDUW m0, m1
5017%endif
5018 mova [esp+48], m0
5019 call pixel_sa8d_8x8_internal2
5020 SA8D_INTER
5021 mova [esp+48], m0
5022
5023 mov r0, [r6+20]
5024 mov r2, [r6+28]
5025 add r0, 24*SIZEOF_PIXEL
5026 add r2, 24*SIZEOF_PIXEL
5027 call pixel_sa8d_8x8_internal2
5028 SA8D_INTER
5029 mova [esp+64-mmsize], m0
5030 call pixel_sa8d_8x8_internal2
5031 AVG_16x16
5032
5033 mov r0, [r6+20]
5034 mov r2, [r6+28]
5035 add r0, 32*SIZEOF_PIXEL
5036 add r2, 32*SIZEOF_PIXEL
5037 lea r4, [r1 + 2*r1]
5038 call pixel_sa8d_8x8_internal2
5039%if HIGH_BIT_DEPTH
5040 HADDUW m0, m1
5041%endif
5042 mova [esp+48], m0
5043 call pixel_sa8d_8x8_internal2
5044 SA8D_INTER
5045 mova [esp+48], m0
5046
5047 mov r0, [r6+20]
5048 mov r2, [r6+28]
5049 add r0, 40*SIZEOF_PIXEL
5050 add r2, 40*SIZEOF_PIXEL
5051 call pixel_sa8d_8x8_internal2
5052 SA8D_INTER
5053 mova [esp+64-mmsize], m0
5054 call pixel_sa8d_8x8_internal2
5055 AVG_16x16
5056
5057 mov r0, [r6+20]
5058 mov r2, [r6+28]
5059 lea r0, [r0 + r1*8]
5060 lea r2, [r2 + r3*8]
5061 lea r0, [r0 + r1*8]
5062 lea r2, [r2 + r3*8]
5063 mov [r6+20], r0
5064 mov [r6+28], r2
5065
5066 lea r4, [r1 + 2*r1]
5067 call pixel_sa8d_8x8_internal2
5068%if HIGH_BIT_DEPTH
5069 HADDUW m0, m1
5070%endif
5071 mova [esp+48], m0
5072 call pixel_sa8d_8x8_internal2
5073 SA8D_INTER
5074 mova [esp+48], m0
5075
5076 mov r0, [r6+20]
5077 mov r2, [r6+28]
5078 add r0, 8*SIZEOF_PIXEL
5079 add r2, 8*SIZEOF_PIXEL
5080 call pixel_sa8d_8x8_internal2
5081 SA8D_INTER
5082 mova [esp+64-mmsize], m0
5083 call pixel_sa8d_8x8_internal2
5084 AVG_16x16
5085
5086 mov r0, [r6+20]
5087 mov r2, [r6+28]
5088 add r0, 16*SIZEOF_PIXEL
5089 add r2, 16*SIZEOF_PIXEL
5090 lea r4, [r1 + 2*r1]
5091 call pixel_sa8d_8x8_internal2
5092%if HIGH_BIT_DEPTH
5093 HADDUW m0, m1
5094%endif
5095 mova [esp+48], m0
5096 call pixel_sa8d_8x8_internal2
5097 SA8D_INTER
5098 mova [esp+48], m0
5099
5100 mov r0, [r6+20]
5101 mov r2, [r6+28]
5102 add r0, 24*SIZEOF_PIXEL
5103 add r2, 24*SIZEOF_PIXEL
5104 call pixel_sa8d_8x8_internal2
5105 SA8D_INTER
5106 mova [esp+64-mmsize], m0
5107 call pixel_sa8d_8x8_internal2
5108 AVG_16x16
5109
5110 mov r0, [r6+20]
5111 mov r2, [r6+28]
5112 add r0, 32*SIZEOF_PIXEL
5113 add r2, 32*SIZEOF_PIXEL
5114 lea r4, [r1 + 2*r1]
5115 call pixel_sa8d_8x8_internal2
5116%if HIGH_BIT_DEPTH
5117 HADDUW m0, m1
5118%endif
5119 mova [esp+48], m0
5120 call pixel_sa8d_8x8_internal2
5121 SA8D_INTER
5122 mova [esp+48], m0
5123
5124 mov r0, [r6+20]
5125 mov r2, [r6+28]
5126 add r0, 40*SIZEOF_PIXEL
5127 add r2, 40*SIZEOF_PIXEL
5128 call pixel_sa8d_8x8_internal2
5129 SA8D_INTER
5130 mova [esp+64-mmsize], m0
5131 call pixel_sa8d_8x8_internal2
5132 SA8D_INTER
5133%if HIGH_BIT_DEPTH == 0
5134 HADDUW m0, m1
5135%endif
5136 movd r4d, m0
5137 add r4d, 1
5138 shr r4d, 1
5139 add r4d, dword [esp+36]
5140 mov eax, r4d
5141 mov esp, r6
5142 RET
5143
5144cglobal pixel_sa8d_64x16, 4,7,8
5145 FIX_STRIDES r1, r3
5146 mov r6, esp
5147 and esp, ~15
5148 sub esp, 64
5149
5150 lea r4, [r1 + 2*r1]
5151 lea r5, [r3 + 2*r3]
5152 call pixel_sa8d_8x8_internal2
5153%if HIGH_BIT_DEPTH
5154 HADDUW m0, m1
5155%endif
    mova [esp+48], m0
5157 call pixel_sa8d_8x8_internal2
5158 SA8D_INTER
5159 mova [esp+48], m0
5160
5161 mov r0, [r6+20]
5162 mov r2, [r6+28]
5163 add r0, 8*SIZEOF_PIXEL
5164 add r2, 8*SIZEOF_PIXEL
5165 call pixel_sa8d_8x8_internal2
5166 SA8D_INTER
5167 mova [esp+48], m0
5168 call pixel_sa8d_8x8_internal2
5169 SA8D_INTER
5170%if HIGH_BIT_DEPTH == 0
5171 HADDUW m0, m1
5172%endif
5173 movd r4d, m0
5174 add r4d, 1
5175 shr r4d, 1
5176 mov dword [esp+36], r4d
5177
5178 mov r0, [r6+20]
5179 mov r2, [r6+28]
5180 add r0, 16*SIZEOF_PIXEL
5181 add r2, 16*SIZEOF_PIXEL
5182 lea r4, [r1 + 2*r1]
5183 call pixel_sa8d_8x8_internal2
5184%if HIGH_BIT_DEPTH
5185 HADDUW m0, m1
5186%endif
5187 mova [esp+48], m0
5188 call pixel_sa8d_8x8_internal2
5189 SA8D_INTER
5190 mova [esp+48], m0
5191
5192 mov r0, [r6+20]
5193 mov r2, [r6+28]
5194 add r0, 24*SIZEOF_PIXEL
5195 add r2, 24*SIZEOF_PIXEL
5196 call pixel_sa8d_8x8_internal2
5197 SA8D_INTER
5198 mova [esp+64-mmsize], m0
5199 call pixel_sa8d_8x8_internal2
5200 AVG_16x16
5201
5202 mov r0, [r6+20]
5203 mov r2, [r6+28]
5204 add r0, 32*SIZEOF_PIXEL
5205 add r2, 32*SIZEOF_PIXEL
5206 lea r4, [r1 + 2*r1]
5207 call pixel_sa8d_8x8_internal2
5208%if HIGH_BIT_DEPTH
5209 HADDUW m0, m1
5210%endif
5211 mova [esp+48], m0
5212 call pixel_sa8d_8x8_internal2
5213 SA8D_INTER
5214 mova [esp+48], m0
5215
5216 mov r0, [r6+20]
5217 mov r2, [r6+28]
5218 add r0, 40*SIZEOF_PIXEL
5219 add r2, 40*SIZEOF_PIXEL
5220 call pixel_sa8d_8x8_internal2
5221 SA8D_INTER
5222 mova [esp+64-mmsize], m0
5223 call pixel_sa8d_8x8_internal2
5224 AVG_16x16
5225
5226 mov r0, [r6+20]
5227 mov r2, [r6+28]
5228 add r0, 48*SIZEOF_PIXEL
5229 add r2, 48*SIZEOF_PIXEL
5230 lea r4, [r1 + 2*r1]
5231 call pixel_sa8d_8x8_internal2
5232%if HIGH_BIT_DEPTH
5233 HADDUW m0, m1
5234%endif
5235 mova [esp+48], m0
5236 call pixel_sa8d_8x8_internal2
5237 SA8D_INTER
5238 mova [esp+48], m0
5239
5240 mov r0, [r6+20]
5241 mov r2, [r6+28]
5242 add r0, 56*SIZEOF_PIXEL
5243 add r2, 56*SIZEOF_PIXEL
5244 call pixel_sa8d_8x8_internal2
5245 SA8D_INTER
5246 mova [esp+64-mmsize], m0
5247 call pixel_sa8d_8x8_internal2
5248 SA8D_INTER
5249%if HIGH_BIT_DEPTH == 0
5250 HADDUW m0, m1
5251%endif
5252 movd r4d, m0
5253 add r4d, 1
5254 shr r4d, 1
5255 add r4d, dword [esp+36]
5256 mov eax, r4d
5257 mov esp, r6
5258 RET
5259
5260cglobal pixel_sa8d_64x32, 4,7,8
5261 FIX_STRIDES r1, r3
5262 mov r6, esp
5263 and esp, ~15
5264 sub esp, 64
5265
5266 lea r4, [r1 + 2*r1]
5267 lea r5, [r3 + 2*r3]
5268 call pixel_sa8d_8x8_internal2
5269%if HIGH_BIT_DEPTH
5270 HADDUW m0, m1
5271%endif
    mova [esp+48], m0
5273 call pixel_sa8d_8x8_internal2
5274 SA8D_INTER
5275 mova [esp+48], m0
5276
5277 mov r0, [r6+20]
5278 mov r2, [r6+28]
5279 add r0, 8*SIZEOF_PIXEL
5280 add r2, 8*SIZEOF_PIXEL
5281 call pixel_sa8d_8x8_internal2
5282 SA8D_INTER
5283 mova [esp+48], m0
5284 call pixel_sa8d_8x8_internal2
5285 SA8D_INTER
5286%if HIGH_BIT_DEPTH == 0
5287 HADDUW m0, m1
5288%endif
5289 movd r4d, m0
5290 add r4d, 1
5291 shr r4d, 1
5292 mov dword [esp+36], r4d
5293
5294 mov r0, [r6+20]
5295 mov r2, [r6+28]
5296 add r0, 16*SIZEOF_PIXEL
5297 add r2, 16*SIZEOF_PIXEL
5298 lea r4, [r1 + 2*r1]
5299 call pixel_sa8d_8x8_internal2
5300%if HIGH_BIT_DEPTH
5301 HADDUW m0, m1
5302%endif
5303 mova [esp+48], m0
5304 call pixel_sa8d_8x8_internal2
5305 SA8D_INTER
5306 mova [esp+48], m0
5307
5308 mov r0, [r6+20]
5309 mov r2, [r6+28]
5310 add r0, 24*SIZEOF_PIXEL
5311 add r2, 24*SIZEOF_PIXEL
5312 call pixel_sa8d_8x8_internal2
5313 SA8D_INTER
5314 mova [esp+64-mmsize], m0
5315 call pixel_sa8d_8x8_internal2
5316 AVG_16x16
5317
5318 mov r0, [r6+20]
5319 mov r2, [r6+28]
5320 add r0, 32*SIZEOF_PIXEL
5321 add r2, 32*SIZEOF_PIXEL
5322 lea r4, [r1 + 2*r1]
5323 call pixel_sa8d_8x8_internal2
5324%if HIGH_BIT_DEPTH
5325 HADDUW m0, m1
5326%endif
5327 mova [esp+48], m0
5328 call pixel_sa8d_8x8_internal2
5329 SA8D_INTER
5330 mova [esp+48], m0
5331
5332 mov r0, [r6+20]
5333 mov r2, [r6+28]
5334 add r0, 40*SIZEOF_PIXEL
5335 add r2, 40*SIZEOF_PIXEL
5336 call pixel_sa8d_8x8_internal2
5337 SA8D_INTER
5338 mova [esp+64-mmsize], m0
5339 call pixel_sa8d_8x8_internal2
5340 AVG_16x16
5341
5342 mov r0, [r6+20]
5343 mov r2, [r6+28]
5344 add r0, 48*SIZEOF_PIXEL
5345 add r2, 48*SIZEOF_PIXEL
5346 lea r4, [r1 + 2*r1]
5347 call pixel_sa8d_8x8_internal2
5348%if HIGH_BIT_DEPTH
5349 HADDUW m0, m1
5350%endif
5351 mova [esp+48], m0
5352 call pixel_sa8d_8x8_internal2
5353 SA8D_INTER
5354 mova [esp+48], m0
5355
5356 mov r0, [r6+20]
5357 mov r2, [r6+28]
5358 add r0, 56*SIZEOF_PIXEL
5359 add r2, 56*SIZEOF_PIXEL
5360 call pixel_sa8d_8x8_internal2
5361 SA8D_INTER
5362 mova [esp+64-mmsize], m0
5363 call pixel_sa8d_8x8_internal2
5364 AVG_16x16
5365
5366 mov r0, [r6+20]
5367 mov r2, [r6+28]
5368 lea r0, [r0 + r1*8]
5369 lea r2, [r2 + r3*8]
5370 lea r0, [r0 + r1*8]
5371 lea r2, [r2 + r3*8]
5372 mov [r6+20], r0
5373 mov [r6+28], r2
5374
5375 lea r4, [r1 + 2*r1]
5376 call pixel_sa8d_8x8_internal2
5377%if HIGH_BIT_DEPTH
5378 HADDUW m0, m1
5379%endif
5380 mova [esp+48], m0
5381 call pixel_sa8d_8x8_internal2
5382 SA8D_INTER
5383 mova [esp+48], m0
5384
5385 mov r0, [r6+20]
5386 mov r2, [r6+28]
5387 add r0, 8*SIZEOF_PIXEL
5388 add r2, 8*SIZEOF_PIXEL
5389 call pixel_sa8d_8x8_internal2
5390 SA8D_INTER
5391 mova [esp+64-mmsize], m0
5392 call pixel_sa8d_8x8_internal2
5393 AVG_16x16
5394
5395 mov r0, [r6+20]
5396 mov r2, [r6+28]
5397 add r0, 16*SIZEOF_PIXEL
5398 add r2, 16*SIZEOF_PIXEL
5399 lea r4, [r1 + 2*r1]
5400 call pixel_sa8d_8x8_internal2
5401%if HIGH_BIT_DEPTH
5402 HADDUW m0, m1
5403%endif
5404 mova [esp+48], m0
5405 call pixel_sa8d_8x8_internal2
5406 SA8D_INTER
5407 mova [esp+48], m0
5408
5409 mov r0, [r6+20]
5410 mov r2, [r6+28]
5411 add r0, 24*SIZEOF_PIXEL
5412 add r2, 24*SIZEOF_PIXEL
5413 call pixel_sa8d_8x8_internal2
5414 SA8D_INTER
5415 mova [esp+64-mmsize], m0
5416 call pixel_sa8d_8x8_internal2
5417 AVG_16x16
5418
5419 mov r0, [r6+20]
5420 mov r2, [r6+28]
5421 add r0, 32*SIZEOF_PIXEL
5422 add r2, 32*SIZEOF_PIXEL
5423 lea r4, [r1 + 2*r1]
5424 call pixel_sa8d_8x8_internal2
5425%if HIGH_BIT_DEPTH
5426 HADDUW m0, m1
5427%endif
5428 mova [esp+48], m0
5429 call pixel_sa8d_8x8_internal2
5430 SA8D_INTER
5431 mova [esp+48], m0
5432
5433 mov r0, [r6+20]
5434 mov r2, [r6+28]
5435 add r0, 40*SIZEOF_PIXEL
5436 add r2, 40*SIZEOF_PIXEL
5437 call pixel_sa8d_8x8_internal2
5438 SA8D_INTER
5439 mova [esp+64-mmsize], m0
5440 call pixel_sa8d_8x8_internal2
5441 AVG_16x16
5442
5443 mov r0, [r6+20]
5444 mov r2, [r6+28]
5445 add r0, 48*SIZEOF_PIXEL
5446 add r2, 48*SIZEOF_PIXEL
5447 lea r4, [r1 + 2*r1]
5448 call pixel_sa8d_8x8_internal2
5449%if HIGH_BIT_DEPTH
5450 HADDUW m0, m1
5451%endif
5452 mova [esp+48], m0
5453 call pixel_sa8d_8x8_internal2
5454 SA8D_INTER
5455 mova [esp+48], m0
5456
5457 mov r0, [r6+20]
5458 mov r2, [r6+28]
5459 add r0, 56*SIZEOF_PIXEL
5460 add r2, 56*SIZEOF_PIXEL
5461 call pixel_sa8d_8x8_internal2
5462 SA8D_INTER
5463 mova [esp+64-mmsize], m0
5464 call pixel_sa8d_8x8_internal2
5465 SA8D_INTER
5466%if HIGH_BIT_DEPTH == 0
5467 HADDUW m0, m1
5468%endif
5469 movd r4d, m0
5470 add r4d, 1
5471 shr r4d, 1
5472 add r4d, dword [esp+36]
5473 mov eax, r4d
5474 mov esp, r6
5475 RET
5476
5477cglobal pixel_sa8d_64x48, 4,7,8
5478 FIX_STRIDES r1, r3
5479 mov r6, esp
5480 and esp, ~15
5481 sub esp, 64
5482
5483 lea r4, [r1 + 2*r1]
5484 lea r5, [r3 + 2*r3]
5485 call pixel_sa8d_8x8_internal2
5486%if HIGH_BIT_DEPTH
5487 HADDUW m0, m1
5488%endif
    mova [esp+48], m0
5490 call pixel_sa8d_8x8_internal2
5491 SA8D_INTER
5492 mova [esp+48], m0
5493
5494 mov r0, [r6+20]
5495 mov r2, [r6+28]
5496 add r0, 8*SIZEOF_PIXEL
5497 add r2, 8*SIZEOF_PIXEL
5498 call pixel_sa8d_8x8_internal2
5499 SA8D_INTER
5500 mova [esp+48], m0
5501 call pixel_sa8d_8x8_internal2
5502 SA8D_INTER
5503%if HIGH_BIT_DEPTH == 0
5504 HADDUW m0, m1
5505%endif
5506 movd r4d, m0
5507 add r4d, 1
5508 shr r4d, 1
5509 mov dword [esp+36], r4d
5510
5511 mov r0, [r6+20]
5512 mov r2, [r6+28]
5513 add r0, 16*SIZEOF_PIXEL
5514 add r2, 16*SIZEOF_PIXEL
5515 lea r4, [r1 + 2*r1]
5516 call pixel_sa8d_8x8_internal2
5517%if HIGH_BIT_DEPTH
5518 HADDUW m0, m1
5519%endif
5520 mova [esp+48], m0
5521 call pixel_sa8d_8x8_internal2
5522 SA8D_INTER
5523 mova [esp+48], m0
5524
5525 mov r0, [r6+20]
5526 mov r2, [r6+28]
5527 add r0, 24*SIZEOF_PIXEL
5528 add r2, 24*SIZEOF_PIXEL
5529 call pixel_sa8d_8x8_internal2
5530 SA8D_INTER
5531 mova [esp+64-mmsize], m0
5532 call pixel_sa8d_8x8_internal2
5533 AVG_16x16
5534
5535 mov r0, [r6+20]
5536 mov r2, [r6+28]
5537 add r0, 32*SIZEOF_PIXEL
5538 add r2, 32*SIZEOF_PIXEL
5539 lea r4, [r1 + 2*r1]
5540 call pixel_sa8d_8x8_internal2
5541%if HIGH_BIT_DEPTH
5542 HADDUW m0, m1
5543%endif
5544 mova [esp+48], m0
5545 call pixel_sa8d_8x8_internal2
5546 SA8D_INTER
5547 mova [esp+48], m0
5548
5549 mov r0, [r6+20]
5550 mov r2, [r6+28]
5551 add r0, 40*SIZEOF_PIXEL
5552 add r2, 40*SIZEOF_PIXEL
5553 call pixel_sa8d_8x8_internal2
5554 SA8D_INTER
5555 mova [esp+64-mmsize], m0
5556 call pixel_sa8d_8x8_internal2
5557 AVG_16x16
5558
5559 mov r0, [r6+20]
5560 mov r2, [r6+28]
5561 add r0, 48*SIZEOF_PIXEL
5562 add r2, 48*SIZEOF_PIXEL
5563 lea r4, [r1 + 2*r1]
5564 call pixel_sa8d_8x8_internal2
5565%if HIGH_BIT_DEPTH
5566 HADDUW m0, m1
5567%endif
5568 mova [esp+48], m0
5569 call pixel_sa8d_8x8_internal2
5570 SA8D_INTER
5571 mova [esp+48], m0
5572
5573 mov r0, [r6+20]
5574 mov r2, [r6+28]
5575 add r0, 56*SIZEOF_PIXEL
5576 add r2, 56*SIZEOF_PIXEL
5577 call pixel_sa8d_8x8_internal2
5578 SA8D_INTER
5579 mova [esp+64-mmsize], m0
5580 call pixel_sa8d_8x8_internal2
5581 AVG_16x16
5582
5583 mov r0, [r6+20]
5584 mov r2, [r6+28]
5585 lea r0, [r0 + r1*8]
5586 lea r2, [r2 + r3*8]
5587 lea r0, [r0 + r1*8]
5588 lea r2, [r2 + r3*8]
5589 mov [r6+20], r0
5590 mov [r6+28], r2
5591
5592 lea r4, [r1 + 2*r1]
5593 call pixel_sa8d_8x8_internal2
5594%if HIGH_BIT_DEPTH
5595 HADDUW m0, m1
5596%endif
5597 mova [esp+48], m0
5598 call pixel_sa8d_8x8_internal2
5599 SA8D_INTER
5600 mova [esp+48], m0
5601
5602 mov r0, [r6+20]
5603 mov r2, [r6+28]
5604 add r0, 8*SIZEOF_PIXEL
5605 add r2, 8*SIZEOF_PIXEL
5606 call pixel_sa8d_8x8_internal2
5607 SA8D_INTER
5608 mova [esp+64-mmsize], m0
5609 call pixel_sa8d_8x8_internal2
5610 AVG_16x16
5611
5612 mov r0, [r6+20]
5613 mov r2, [r6+28]
5614 add r0, 16*SIZEOF_PIXEL
5615 add r2, 16*SIZEOF_PIXEL
5616 lea r4, [r1 + 2*r1]
5617 call pixel_sa8d_8x8_internal2
5618%if HIGH_BIT_DEPTH
5619 HADDUW m0, m1
5620%endif
5621 mova [esp+48], m0
5622 call pixel_sa8d_8x8_internal2
5623 SA8D_INTER
5624 mova [esp+48], m0
5625
5626 mov r0, [r6+20]
5627 mov r2, [r6+28]
5628 add r0, 24*SIZEOF_PIXEL
5629 add r2, 24*SIZEOF_PIXEL
5630 call pixel_sa8d_8x8_internal2
5631 SA8D_INTER
5632 mova [esp+64-mmsize], m0
5633 call pixel_sa8d_8x8_internal2
5634 AVG_16x16
5635
5636 mov r0, [r6+20]
5637 mov r2, [r6+28]
5638 add r0, 32*SIZEOF_PIXEL
5639 add r2, 32*SIZEOF_PIXEL
5640 lea r4, [r1 + 2*r1]
5641 call pixel_sa8d_8x8_internal2
5642%if HIGH_BIT_DEPTH
5643 HADDUW m0, m1
5644%endif
5645 mova [esp+48], m0
5646 call pixel_sa8d_8x8_internal2
5647 SA8D_INTER
5648 mova [esp+48], m0
5649
5650 mov r0, [r6+20]
5651 mov r2, [r6+28]
5652 add r0, 40*SIZEOF_PIXEL
5653 add r2, 40*SIZEOF_PIXEL
5654 call pixel_sa8d_8x8_internal2
5655 SA8D_INTER
5656 mova [esp+64-mmsize], m0
5657 call pixel_sa8d_8x8_internal2
5658 AVG_16x16
5659
5660 mov r0, [r6+20]
5661 mov r2, [r6+28]
5662 add r0, 48*SIZEOF_PIXEL
5663 add r2, 48*SIZEOF_PIXEL
5664 lea r4, [r1 + 2*r1]
5665 call pixel_sa8d_8x8_internal2
5666%if HIGH_BIT_DEPTH
5667 HADDUW m0, m1
5668%endif
5669 mova [esp+48], m0
5670 call pixel_sa8d_8x8_internal2
5671 SA8D_INTER
5672 mova [esp+48], m0
5673
5674 mov r0, [r6+20]
5675 mov r2, [r6+28]
5676 add r0, 56*SIZEOF_PIXEL
5677 add r2, 56*SIZEOF_PIXEL
5678 call pixel_sa8d_8x8_internal2
5679 SA8D_INTER
5680 mova [esp+64-mmsize], m0
5681 call pixel_sa8d_8x8_internal2
5682 AVG_16x16
5683
5684 mov r0, [r6+20]
5685 mov r2, [r6+28]
5686 lea r0, [r0 + r1*8]
5687 lea r2, [r2 + r3*8]
5688 lea r0, [r0 + r1*8]
5689 lea r2, [r2 + r3*8]
5690 mov [r6+20], r0
5691 mov [r6+28], r2
5692
5693 lea r4, [r1 + 2*r1]
5694 call pixel_sa8d_8x8_internal2
5695%if HIGH_BIT_DEPTH
5696 HADDUW m0, m1
5697%endif
5698 mova [esp+48], m0
5699 call pixel_sa8d_8x8_internal2
5700 SA8D_INTER
5701 mova [esp+48], m0
5702
5703 mov r0, [r6+20]
5704 mov r2, [r6+28]
5705 add r0, 8*SIZEOF_PIXEL
5706 add r2, 8*SIZEOF_PIXEL
5707 call pixel_sa8d_8x8_internal2
5708 SA8D_INTER
5709 mova [esp+64-mmsize], m0
5710 call pixel_sa8d_8x8_internal2
5711 AVG_16x16
5712
5713 mov r0, [r6+20]
5714 mov r2, [r6+28]
5715 add r0, 16*SIZEOF_PIXEL
5716 add r2, 16*SIZEOF_PIXEL
5717 lea r4, [r1 + 2*r1]
5718 call pixel_sa8d_8x8_internal2
5719%if HIGH_BIT_DEPTH
5720 HADDUW m0, m1
5721%endif
5722 mova [esp+48], m0
5723 call pixel_sa8d_8x8_internal2
5724 SA8D_INTER
5725 mova [esp+48], m0
5726
5727 mov r0, [r6+20]
5728 mov r2, [r6+28]
5729 add r0, 24*SIZEOF_PIXEL
5730 add r2, 24*SIZEOF_PIXEL
5731 call pixel_sa8d_8x8_internal2
5732 SA8D_INTER
5733 mova [esp+64-mmsize], m0
5734 call pixel_sa8d_8x8_internal2
5735 AVG_16x16
5736
5737 mov r0, [r6+20]
5738 mov r2, [r6+28]
5739 add r0, 32*SIZEOF_PIXEL
5740 add r2, 32*SIZEOF_PIXEL
5741 lea r4, [r1 + 2*r1]
5742 call pixel_sa8d_8x8_internal2
5743%if HIGH_BIT_DEPTH
5744 HADDUW m0, m1
5745%endif
5746 mova [esp+48], m0
5747 call pixel_sa8d_8x8_internal2
5748 SA8D_INTER
5749 mova [esp+48], m0
5750
5751 mov r0, [r6+20]
5752 mov r2, [r6+28]
5753 add r0, 40*SIZEOF_PIXEL
5754 add r2, 40*SIZEOF_PIXEL
5755 call pixel_sa8d_8x8_internal2
5756 SA8D_INTER
5757 mova [esp+64-mmsize], m0
5758 call pixel_sa8d_8x8_internal2
5759 AVG_16x16
5760
5761 mov r0, [r6+20]
5762 mov r2, [r6+28]
5763 add r0, 48*SIZEOF_PIXEL
5764 add r2, 48*SIZEOF_PIXEL
5765 lea r4, [r1 + 2*r1]
5766 call pixel_sa8d_8x8_internal2
5767%if HIGH_BIT_DEPTH
5768 HADDUW m0, m1
5769%endif
5770 mova [esp+48], m0
5771 call pixel_sa8d_8x8_internal2
5772 SA8D_INTER
5773 mova [esp+48], m0
5774
5775 mov r0, [r6+20]
5776 mov r2, [r6+28]
5777 add r0, 56*SIZEOF_PIXEL
5778 add r2, 56*SIZEOF_PIXEL
5779 call pixel_sa8d_8x8_internal2
5780 SA8D_INTER
5781 mova [esp+64-mmsize], m0
5782 call pixel_sa8d_8x8_internal2
5783 SA8D_INTER
5784%if HIGH_BIT_DEPTH == 0
5785 HADDUW m0, m1
5786%endif
5787 movd r4d, m0
5788 add r4d, 1
5789 shr r4d, 1
5790 add r4d, dword [esp+36]
5791 mov eax, r4d
5792 mov esp, r6
5793 RET
5794
5795cglobal pixel_sa8d_64x64, 4,7,8
5796 FIX_STRIDES r1, r3
5797 mov r6, esp
5798 and esp, ~15
5799 sub esp, 64
5800
5801 lea r4, [r1 + 2*r1]
5802 lea r5, [r3 + 2*r3]
5803 call pixel_sa8d_8x8_internal2
5804%if HIGH_BIT_DEPTH
5805 HADDUW m0, m1
5806%endif
    mova [esp+48], m0
5808 call pixel_sa8d_8x8_internal2
5809 SA8D_INTER
5810 mova [esp+48], m0
5811
5812 mov r0, [r6+20]
5813 mov r2, [r6+28]
5814 add r0, 8*SIZEOF_PIXEL
5815 add r2, 8*SIZEOF_PIXEL
5816 call pixel_sa8d_8x8_internal2
5817 SA8D_INTER
5818 mova [esp+48], m0
5819 call pixel_sa8d_8x8_internal2
5820 SA8D_INTER
5821%if HIGH_BIT_DEPTH == 0
5822 HADDUW m0, m1
5823%endif
5824 movd r4d, m0
5825 add r4d, 1
5826 shr r4d, 1
5827 mov dword [esp+36], r4d
5828
5829 mov r0, [r6+20]
5830 mov r2, [r6+28]
5831 add r0, 16*SIZEOF_PIXEL
5832 add r2, 16*SIZEOF_PIXEL
5833 lea r4, [r1 + 2*r1]
5834 call pixel_sa8d_8x8_internal2
5835%if HIGH_BIT_DEPTH
5836 HADDUW m0, m1
5837%endif
5838 mova [esp+48], m0
5839 call pixel_sa8d_8x8_internal2
5840 SA8D_INTER
5841 mova [esp+48], m0
5842
5843 mov r0, [r6+20]
5844 mov r2, [r6+28]
5845 add r0, 24*SIZEOF_PIXEL
5846 add r2, 24*SIZEOF_PIXEL
5847 call pixel_sa8d_8x8_internal2
5848 SA8D_INTER
5849 mova [esp+64-mmsize], m0
5850 call pixel_sa8d_8x8_internal2
5851 AVG_16x16
5852
5853 mov r0, [r6+20]
5854 mov r2, [r6+28]
5855 add r0, 32*SIZEOF_PIXEL
5856 add r2, 32*SIZEOF_PIXEL
5857 lea r4, [r1 + 2*r1]
5858 call pixel_sa8d_8x8_internal2
5859%if HIGH_BIT_DEPTH
5860 HADDUW m0, m1
5861%endif
5862 mova [esp+48], m0
5863 call pixel_sa8d_8x8_internal2
5864 SA8D_INTER
5865 mova [esp+48], m0
5866
5867 mov r0, [r6+20]
5868 mov r2, [r6+28]
5869 add r0, 40*SIZEOF_PIXEL
5870 add r2, 40*SIZEOF_PIXEL
5871 call pixel_sa8d_8x8_internal2
5872 SA8D_INTER
5873 mova [esp+64-mmsize], m0
5874 call pixel_sa8d_8x8_internal2
5875 AVG_16x16
5876
5877 mov r0, [r6+20]
5878 mov r2, [r6+28]
5879 add r0, 48*SIZEOF_PIXEL
5880 add r2, 48*SIZEOF_PIXEL
5881 lea r4, [r1 + 2*r1]
5882 call pixel_sa8d_8x8_internal2
5883%if HIGH_BIT_DEPTH
5884 HADDUW m0, m1
5885%endif
5886 mova [esp+48], m0
5887 call pixel_sa8d_8x8_internal2
5888 SA8D_INTER
5889 mova [esp+48], m0
5890
5891 mov r0, [r6+20]
5892 mov r2, [r6+28]
5893 add r0, 56*SIZEOF_PIXEL
5894 add r2, 56*SIZEOF_PIXEL
5895 call pixel_sa8d_8x8_internal2
5896 SA8D_INTER
5897 mova [esp+64-mmsize], m0
5898 call pixel_sa8d_8x8_internal2
5899 AVG_16x16
5900
5901 mov r0, [r6+20]
5902 mov r2, [r6+28]
5903 lea r0, [r0 + r1*8]
5904 lea r2, [r2 + r3*8]
5905 lea r0, [r0 + r1*8]
5906 lea r2, [r2 + r3*8]
5907 mov [r6+20], r0
5908 mov [r6+28], r2
5909
5910 lea r4, [r1 + 2*r1]
5911 call pixel_sa8d_8x8_internal2
5912%if HIGH_BIT_DEPTH
5913 HADDUW m0, m1
5914%endif
5915 mova [esp+48], m0
5916 call pixel_sa8d_8x8_internal2
5917 SA8D_INTER
5918 mova [esp+48], m0
5919
5920 mov r0, [r6+20]
5921 mov r2, [r6+28]
5922 add r0, 8*SIZEOF_PIXEL
5923 add r2, 8*SIZEOF_PIXEL
5924 call pixel_sa8d_8x8_internal2
5925 SA8D_INTER
5926 mova [esp+64-mmsize], m0
5927 call pixel_sa8d_8x8_internal2
5928 AVG_16x16
5929
5930 mov r0, [r6+20]
5931 mov r2, [r6+28]
5932 add r0, 16*SIZEOF_PIXEL
5933 add r2, 16*SIZEOF_PIXEL
5934 lea r4, [r1 + 2*r1]
5935 call pixel_sa8d_8x8_internal2
5936%if HIGH_BIT_DEPTH
5937 HADDUW m0, m1
5938%endif
5939 mova [esp+48], m0
5940 call pixel_sa8d_8x8_internal2
5941 SA8D_INTER
5942 mova [esp+48], m0
5943
5944 mov r0, [r6+20]
5945 mov r2, [r6+28]
5946 add r0, 24*SIZEOF_PIXEL
5947 add r2, 24*SIZEOF_PIXEL
5948 call pixel_sa8d_8x8_internal2
5949 SA8D_INTER
5950 mova [esp+64-mmsize], m0
5951 call pixel_sa8d_8x8_internal2
5952 AVG_16x16
5953
5954 mov r0, [r6+20]
5955 mov r2, [r6+28]
5956 add r0, 32*SIZEOF_PIXEL
5957 add r2, 32*SIZEOF_PIXEL
5958 lea r4, [r1 + 2*r1]
5959 call pixel_sa8d_8x8_internal2
5960%if HIGH_BIT_DEPTH
5961 HADDUW m0, m1
5962%endif
5963 mova [esp+48], m0
5964 call pixel_sa8d_8x8_internal2
5965 SA8D_INTER
5966 mova [esp+48], m0
5967
5968 mov r0, [r6+20]
5969 mov r2, [r6+28]
5970 add r0, 40*SIZEOF_PIXEL
5971 add r2, 40*SIZEOF_PIXEL
5972 call pixel_sa8d_8x8_internal2
5973 SA8D_INTER
5974 mova [esp+64-mmsize], m0
5975 call pixel_sa8d_8x8_internal2
5976 AVG_16x16
5977
5978 mov r0, [r6+20]
5979 mov r2, [r6+28]
5980 add r0, 48*SIZEOF_PIXEL
5981 add r2, 48*SIZEOF_PIXEL
5982 lea r4, [r1 + 2*r1]
5983 call pixel_sa8d_8x8_internal2
5984%if HIGH_BIT_DEPTH
5985 HADDUW m0, m1
5986%endif
5987 mova [esp+48], m0
5988 call pixel_sa8d_8x8_internal2
5989 SA8D_INTER
5990 mova [esp+48], m0
5991
5992 mov r0, [r6+20]
5993 mov r2, [r6+28]
5994 add r0, 56*SIZEOF_PIXEL
5995 add r2, 56*SIZEOF_PIXEL
5996 call pixel_sa8d_8x8_internal2
5997 SA8D_INTER
5998 mova [esp+64-mmsize], m0
5999 call pixel_sa8d_8x8_internal2
6000 AVG_16x16
6001
6002 mov r0, [r6+20]
6003 mov r2, [r6+28]
6004 lea r0, [r0 + r1*8]
6005 lea r2, [r2 + r3*8]
6006 lea r0, [r0 + r1*8]
6007 lea r2, [r2 + r3*8]
6008 mov [r6+20], r0
6009 mov [r6+28], r2
6010
6011 lea r4, [r1 + 2*r1]
6012 call pixel_sa8d_8x8_internal2
6013%if HIGH_BIT_DEPTH
6014 HADDUW m0, m1
6015%endif
6016 mova [esp+48], m0
6017 call pixel_sa8d_8x8_internal2
6018 SA8D_INTER
6019 mova [esp+48], m0
6020
6021 mov r0, [r6+20]
6022 mov r2, [r6+28]
6023 add r0, 8*SIZEOF_PIXEL
6024 add r2, 8*SIZEOF_PIXEL
6025 call pixel_sa8d_8x8_internal2
6026 SA8D_INTER
6027 mova [esp+64-mmsize], m0
6028 call pixel_sa8d_8x8_internal2
6029 AVG_16x16
6030
6031 mov r0, [r6+20]
6032 mov r2, [r6+28]
6033 add r0, 16*SIZEOF_PIXEL
6034 add r2, 16*SIZEOF_PIXEL
6035 lea r4, [r1 + 2*r1]
6036 call pixel_sa8d_8x8_internal2
6037%if HIGH_BIT_DEPTH
6038 HADDUW m0, m1
6039%endif
6040 mova [esp+48], m0
6041 call pixel_sa8d_8x8_internal2
6042 SA8D_INTER
6043 mova [esp+48], m0
6044
6045 mov r0, [r6+20]
6046 mov r2, [r6+28]
6047 add r0, 24*SIZEOF_PIXEL
6048 add r2, 24*SIZEOF_PIXEL
6049 call pixel_sa8d_8x8_internal2
6050 SA8D_INTER
6051 mova [esp+64-mmsize], m0
6052 call pixel_sa8d_8x8_internal2
6053 AVG_16x16
6054
6055 mov r0, [r6+20]
6056 mov r2, [r6+28]
6057 add r0, 32*SIZEOF_PIXEL
6058 add r2, 32*SIZEOF_PIXEL
6059 lea r4, [r1 + 2*r1]
6060 call pixel_sa8d_8x8_internal2
6061%if HIGH_BIT_DEPTH
6062 HADDUW m0, m1
6063%endif
6064 mova [esp+48], m0
6065 call pixel_sa8d_8x8_internal2
6066 SA8D_INTER
6067 mova [esp+48], m0
6068
6069 mov r0, [r6+20]
6070 mov r2, [r6+28]
6071 add r0, 40*SIZEOF_PIXEL
6072 add r2, 40*SIZEOF_PIXEL
6073 call pixel_sa8d_8x8_internal2
6074 SA8D_INTER
6075 mova [esp+64-mmsize], m0
6076 call pixel_sa8d_8x8_internal2
6077 AVG_16x16
6078
6079 mov r0, [r6+20]
6080 mov r2, [r6+28]
6081 add r0, 48*SIZEOF_PIXEL
6082 add r2, 48*SIZEOF_PIXEL
6083 lea r4, [r1 + 2*r1]
6084 call pixel_sa8d_8x8_internal2
6085%if HIGH_BIT_DEPTH
6086 HADDUW m0, m1
6087%endif
6088 mova [esp+48], m0
6089 call pixel_sa8d_8x8_internal2
6090 SA8D_INTER
6091 mova [esp+48], m0
6092
6093 mov r0, [r6+20]
6094 mov r2, [r6+28]
6095 add r0, 56*SIZEOF_PIXEL
6096 add r2, 56*SIZEOF_PIXEL
6097 call pixel_sa8d_8x8_internal2
6098 SA8D_INTER
6099 mova [esp+64-mmsize], m0
6100 call pixel_sa8d_8x8_internal2
6101 AVG_16x16
6102
6103 mov r0, [r6+20]
6104 mov r2, [r6+28]
6105 lea r0, [r0 + r1*8]
6106 lea r2, [r2 + r3*8]
6107 lea r0, [r0 + r1*8]
6108 lea r2, [r2 + r3*8]
6109 mov [r6+20], r0
6110 mov [r6+28], r2
6111
6112 lea r4, [r1 + 2*r1]
6113 call pixel_sa8d_8x8_internal2
6114%if HIGH_BIT_DEPTH
6115 HADDUW m0, m1
6116%endif
6117 mova [esp+48], m0
6118 call pixel_sa8d_8x8_internal2
6119 SA8D_INTER
6120 mova [esp+48], m0
6121
6122 mov r0, [r6+20]
6123 mov r2, [r6+28]
6124 add r0, 8*SIZEOF_PIXEL
6125 add r2, 8*SIZEOF_PIXEL
6126 call pixel_sa8d_8x8_internal2
6127 SA8D_INTER
6128 mova [esp+64-mmsize], m0
6129 call pixel_sa8d_8x8_internal2
6130 AVG_16x16
6131
6132 mov r0, [r6+20]
6133 mov r2, [r6+28]
6134 add r0, 16*SIZEOF_PIXEL
6135 add r2, 16*SIZEOF_PIXEL
6136 lea r4, [r1 + 2*r1]
6137 call pixel_sa8d_8x8_internal2
6138%if HIGH_BIT_DEPTH
6139 HADDUW m0, m1
6140%endif
6141 mova [esp+48], m0
6142 call pixel_sa8d_8x8_internal2
6143 SA8D_INTER
6144 mova [esp+48], m0
6145
6146 mov r0, [r6+20]
6147 mov r2, [r6+28]
6148 add r0, 24*SIZEOF_PIXEL
6149 add r2, 24*SIZEOF_PIXEL
6150 call pixel_sa8d_8x8_internal2
6151 SA8D_INTER
6152 mova [esp+64-mmsize], m0
6153 call pixel_sa8d_8x8_internal2
6154 AVG_16x16
6155
6156 mov r0, [r6+20]
6157 mov r2, [r6+28]
6158 add r0, 32*SIZEOF_PIXEL
6159 add r2, 32*SIZEOF_PIXEL
6160 lea r4, [r1 + 2*r1]
6161 call pixel_sa8d_8x8_internal2
6162%if HIGH_BIT_DEPTH
6163 HADDUW m0, m1
6164%endif
6165 mova [esp+48], m0
6166 call pixel_sa8d_8x8_internal2
6167 SA8D_INTER
6168 mova [esp+48], m0
6169
6170 mov r0, [r6+20]
6171 mov r2, [r6+28]
6172 add r0, 40*SIZEOF_PIXEL
6173 add r2, 40*SIZEOF_PIXEL
6174 call pixel_sa8d_8x8_internal2
6175 SA8D_INTER
6176 mova [esp+64-mmsize], m0
6177 call pixel_sa8d_8x8_internal2
6178 AVG_16x16
6179
6180 mov r0, [r6+20]
6181 mov r2, [r6+28]
6182 add r0, 48*SIZEOF_PIXEL
6183 add r2, 48*SIZEOF_PIXEL
6184 lea r4, [r1 + 2*r1]
6185 call pixel_sa8d_8x8_internal2
6186%if HIGH_BIT_DEPTH
6187 HADDUW m0, m1
6188%endif
6189 mova [esp+48], m0
6190 call pixel_sa8d_8x8_internal2
6191 SA8D_INTER
6192 mova [esp+48], m0
6193
6194 mov r0, [r6+20]
6195 mov r2, [r6+28]
6196 add r0, 56*SIZEOF_PIXEL
6197 add r2, 56*SIZEOF_PIXEL
6198 call pixel_sa8d_8x8_internal2
6199 SA8D_INTER
6200 mova [esp+64-mmsize], m0
6201 call pixel_sa8d_8x8_internal2
6202 SA8D_INTER
6203%if HIGH_BIT_DEPTH == 0
6204 HADDUW m0, m1
6205%endif
6206 movd r4d, m0
6207 add r4d, 1
6208 shr r4d, 1
6209 add r4d, dword [esp+36]
6210 mov eax, r4d
6211 mov esp, r6
6212 RET
6213%endif ; !ARCH_X86_64
6214%endmacro ; SA8D
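
; The 32-bit sa8d_NxM wrappers above all follow the same tiling scheme: walk
; the block in 8x8 tiles, accumulate the per-tile transform totals with
; SA8D_INTER/AVG_16x16, and fold the (x+1)>>1 rounding into the running sum
; kept at [esp+36].  Roughly, in C (a sketch of the tiling only, with a
; hypothetical sa8d_8x8() standing in for pixel_sa8d_8x8_internal2 plus its
; normalization):
;
;   int sa8d_NxM(pixel *pix1, intptr_t i1, pixel *pix2, intptr_t i2)
;   {
;       int sum = 0;
;       for (int y = 0; y < M; y += 8)
;           for (int x = 0; x < N; x += 8)
;               sum += sa8d_8x8(pix1 + y * i1 + x, i1, pix2 + y * i2 + x, i2);
;       return sum;
;   }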
6215
;=============================================================================
; SATD/SA8D instantiations (SSE2 through AVX2)
;=============================================================================
6219%define TRANS TRANS_SSE2
6220%define DIFFOP DIFF_UNPACK_SSE2
6221%define LOAD_SUMSUB_8x4P LOAD_DIFF_8x4P
6222%define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSE2
6223%define movdqa movaps ; doesn't hurt pre-nehalem, might as well save size
6224%define movdqu movups
6225%define punpcklqdq movlhps
6226INIT_XMM sse2
6227SA8D
6228SATDS_SSE2
6229
6230%if HIGH_BIT_DEPTH == 0
6231INIT_XMM ssse3,atom
6232SATDS_SSE2
6233SA8D
6234%endif
6235
6236%define DIFFOP DIFF_SUMSUB_SSSE3
6237%define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE
6238%if HIGH_BIT_DEPTH == 0
6239%define LOAD_SUMSUB_8x4P LOAD_SUMSUB_8x4P_SSSE3
6240%define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSSE3
6241%endif
6242INIT_XMM ssse3
6243SATDS_SSE2
6244SA8D
6245%undef movdqa ; nehalem doesn't like movaps
6246%undef movdqu ; movups
6247%undef punpcklqdq ; or movlhps
6248
6249%define TRANS TRANS_SSE4
6250%define LOAD_DUP_4x8P LOAD_DUP_4x8P_PENRYN
6251INIT_XMM sse4
6252SATDS_SSE2
6253SA8D
6254
6255; Sandy/Ivy Bridge and Bulldozer do movddup in the load unit, so
6256; it's effectively free.
6257%define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE
6258INIT_XMM avx
6259SATDS_SSE2
6260SA8D
6261
6262%define TRANS TRANS_XOP
6263INIT_XMM xop
6264SATDS_SSE2
6265SA8D
6266
6267
6268%if HIGH_BIT_DEPTH == 0
6269%define LOAD_SUMSUB_8x4P LOAD_SUMSUB8_16x4P_AVX2
6270%define LOAD_DUP_4x8P LOAD_DUP_4x16P_AVX2
6271%define TRANS TRANS_SSE4
6272
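; Each ymm register packs two rows of an 8-pixel-wide block: row n in the low
; 128-bit lane and row n+4 in the high lane.  punpcklqdq then duplicates the
; eight source bytes within each lane so that pmaddubsw against the +1/-1
; pattern in hmul_8p yields the horizontal sums and differences in one step.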
%macro LOAD_SUMSUB_8x8P_AVX2 7 ; 4*dst, 2*tmp, mul
6274 movq xm%1, [r0]
6275 movq xm%3, [r2]
6276 movq xm%2, [r0+r1]
6277 movq xm%4, [r2+r3]
6278 vinserti128 m%1, m%1, [r0+4*r1], 1
6279 vinserti128 m%3, m%3, [r2+4*r3], 1
6280 vinserti128 m%2, m%2, [r0+r4], 1
6281 vinserti128 m%4, m%4, [r2+r5], 1
6282 punpcklqdq m%1, m%1
6283 punpcklqdq m%3, m%3
6284 punpcklqdq m%2, m%2
6285 punpcklqdq m%4, m%4
6286 DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %7
6287 lea r0, [r0+2*r1]
6288 lea r2, [r2+2*r3]
6289
6290 movq xm%3, [r0]
6291 movq xm%5, [r2]
6292 movq xm%4, [r0+r1]
6293 movq xm%6, [r2+r3]
6294 vinserti128 m%3, m%3, [r0+4*r1], 1
6295 vinserti128 m%5, m%5, [r2+4*r3], 1
6296 vinserti128 m%4, m%4, [r0+r4], 1
6297 vinserti128 m%6, m%6, [r2+r5], 1
6298 punpcklqdq m%3, m%3
6299 punpcklqdq m%5, m%5
6300 punpcklqdq m%4, m%4
6301 punpcklqdq m%6, m%6
6302 DIFF_SUMSUB_SSSE3 %3, %5, %4, %6, %7
6303%endmacro
6304
6305%macro SATD_START_AVX2 2-3 0
6306 FIX_STRIDES r1, r3
6307%if %3
6308 mova %2, [hmul_8p]
6309 lea r4, [5*r1]
6310 lea r5, [5*r3]
6311%else
6312 mova %2, [hmul_16p]
6313 lea r4, [3*r1]
6314 lea r5, [3*r3]
6315%endif
6316 pxor %1, %1
6317%endmacro
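
; With the two-rows-per-register packing above, the 8-wide path needs row n+5
; alongside row n+1, hence the 5*stride offsets in r4/r5; the 16-wide path
; keeps the conventional 3*stride row stepping.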
6318
6319%define TRANS TRANS_SSE4
6320INIT_YMM avx2
6321cglobal pixel_satd_16x8_internal
6322 LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 1
6323 SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6
6324 LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 0
6325 SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6
6326 ret
6327
6328cglobal pixel_satd_16x16, 4,6,8
6329 SATD_START_AVX2 m6, m7
6330 call pixel_satd_16x8_internal
6331 lea r0, [r0+4*r1]
6332 lea r2, [r2+4*r3]
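; bare, unsuffixed label: pixel_satd_16x8 tail-jumps here so that the last
; 16x8 half and the horizontal reduction are shared with pixel_satd_16x16;
; the call below still resolves to the cpu-suffixed cglobal internal via
; x86inc's call macro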
6333pixel_satd_16x8_internal:
6334 call pixel_satd_16x8_internal
6335 vextracti128 xm0, m6, 1
6336 paddw xm0, xm6
6337 SATD_END_SSE2 xm0
6338 RET
6339
6340cglobal pixel_satd_16x8, 4,6,8
6341 SATD_START_AVX2 m6, m7
6342 jmp pixel_satd_16x8_internal
6343
6344cglobal pixel_satd_8x8_internal
6345 LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7
6346 SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6
6347 ret
6348
6349cglobal pixel_satd_8x16, 4,6,8
6350 SATD_START_AVX2 m6, m7, 1
6351 call pixel_satd_8x8_internal
6352 lea r0, [r0+2*r1]
6353 lea r2, [r2+2*r3]
6354 lea r0, [r0+4*r1]
6355 lea r2, [r2+4*r3]
6356 call pixel_satd_8x8_internal
6357 vextracti128 xm0, m6, 1
6358 paddw xm0, xm6
6359 SATD_END_SSE2 xm0
6360 RET
6361
6362cglobal pixel_satd_8x8, 4,6,8
6363 SATD_START_AVX2 m6, m7, 1
6364 call pixel_satd_8x8_internal
6365 vextracti128 xm0, m6, 1
6366 paddw xm0, xm6
6367 SATD_END_SSE2 xm0
6368 RET
6369
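; one 8x8 sa8d per call: HADAMARD4_V plus the sumsub stages build up the 2-D
; Hadamard transform on the packed row pairs; the final amax stages use
; |a+b| + |a-b| == 2*max(|a|, |b|) instead of a last butterfly, leaving a
; doubled total in m6 -- hence the (x+1)>>1 in the callers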
6370cglobal pixel_sa8d_8x8_internal
6371 LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7
6372 HADAMARD4_V 0, 1, 2, 3, 4
6373 HADAMARD 8, sumsub, 0, 1, 4, 5
6374 HADAMARD 8, sumsub, 2, 3, 4, 5
6375 HADAMARD 2, sumsub, 0, 1, 4, 5
6376 HADAMARD 2, sumsub, 2, 3, 4, 5
6377 HADAMARD 1, amax, 0, 1, 4, 5
6378 HADAMARD 1, amax, 2, 3, 4, 5
6379 paddw m6, m0
6380 paddw m6, m2
6381 ret
6382
6383cglobal pixel_sa8d_8x8, 4,6,8
6384 SATD_START_AVX2 m6, m7, 1
6385 call pixel_sa8d_8x8_internal
6386 vextracti128 xm1, m6, 1
6387 paddw xm6, xm1
6388 HADDW xm6, xm1
6389 movd eax, xm6
6390 add eax, 1
6391 shr eax, 1
6392 RET
6393%endif ; HIGH_BIT_DEPTH
6394
6395; Input 16bpp, Output 8bpp
6396;------------------------------------------------------------------------------------------------------------------------
6397;void planecopy_sp(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
6398;------------------------------------------------------------------------------------------------------------------------
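; A scalar C sketch of what this path computes (planecopy_sp_c is a
; hypothetical name; `pixel` is uint8_t for the 8bpp output).  The `mask`
; argument is never applied here: for in-range inputs the unsigned
; saturation of packuswb makes it redundant.
;
;   void planecopy_sp_c(uint16_t *src, intptr_t srcStride,
;                       uint8_t *dst, intptr_t dstStride,
;                       int width, int height, int shift, uint16_t mask)
;   {
;       for (int y = 0; y < height; y++)
;           for (int x = 0; x < width; x++)
;           {
;               int v = src[y * srcStride + x] >> shift;
;               dst[y * dstStride + x] = (uint8_t)(v > 255 ? 255 : v);
;           }
;   }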
6399INIT_XMM sse2
6400cglobal downShift_16, 7,7,3
6401 movd m0, r6d ; m0 = shift
6402 add r1, r1
6403 dec r5d
6404.loopH:
6405 xor r6, r6
6406.loopW:
6407 movu m1, [r0 + r6 * 2]
6408 movu m2, [r0 + r6 * 2 + 16]
6409 psrlw m1, m0
6410 psrlw m2, m0
6411 packuswb m1, m2
6412 movu [r2 + r6], m1
6413
6414 add r6, 16
6415 cmp r6d, r4d
6416 jl .loopW
6417
6418 ; move to next row
6419 add r0, r1
6420 add r2, r3
6421 dec r5d
6422 jnz .loopH
6423
; process the last row of the frame (handles widths that are not a multiple of 16)
6425
6426.loop16:
6427 movu m1, [r0]
6428 movu m2, [r0 + 16]
6429 psrlw m1, m0
6430 psrlw m2, m0
6431 packuswb m1, m2
6432 movu [r2], m1
6433
6434 add r0, 2 * mmsize
6435 add r2, mmsize
6436 sub r4d, 16
6437 jz .end
6438 cmp r4d, 15
6439 jg .loop16
6440
6441 cmp r4d, 8
6442 jl .process4
6443 movu m1, [r0]
6444 psrlw m1, m0
6445 packuswb m1, m1
6446 movh [r2], m1
6447
6448 add r0, mmsize
6449 add r2, 8
6450 sub r4d, 8
6451 jz .end
6452
6453.process4:
6454 cmp r4d, 4
6455 jl .process2
6456 movh m1,[r0]
6457 psrlw m1, m0
6458 packuswb m1, m1
6459 movd [r2], m1
6460
6461 add r0, 8
6462 add r2, 4
6463 sub r4d, 4
6464 jz .end
6465
6466.process2:
6467 cmp r4d, 2
6468 jl .process1
6469 movd m1, [r0]
6470 psrlw m1, m0
6471 packuswb m1, m1
6472 movd r6, m1
6473 mov [r2], r6w
6474
6475 add r0, 4
6476 add r2, 2
6477 sub r4d, 2
6478 jz .end
6479
6480.process1:
6481 movd m1, [r0]
6482 psrlw m1, m0
6483 packuswb m1, m1
6484 movd r3, m1
6485 mov [r2], r3b
6486.end:
6487 RET
6488
6489; Input 8bpp, Output 16bpp
6490;---------------------------------------------------------------------------------------------------------------------
6491;void planecopy_cp(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift)
6492;---------------------------------------------------------------------------------------------------------------------
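; A scalar C sketch of what this path computes (planecopy_cp_c is a
; hypothetical name; `pixel` is uint16_t for the 16bpp output):
;
;   void planecopy_cp_c(uint8_t *src, intptr_t srcStride,
;                       uint16_t *dst, intptr_t dstStride,
;                       int width, int height, int shift)
;   {
;       for (int y = 0; y < height; y++)
;           for (int x = 0; x < width; x++)
;               dst[y * dstStride + x] = (uint16_t)(src[y * srcStride + x] << shift);
;   }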
6493INIT_XMM sse4
6494cglobal upShift_8, 7,7,3
6495
    movd        m2, r6d            ; m2 = shift
6497 add r3, r3
6498 dec r5d
6499
6500.loopH:
6501 xor r6, r6
6502.loopW:
6503 pmovzxbw m0,[r0 + r6]
6504 pmovzxbw m1,[r0 + r6 + 8]
6505 psllw m0, m2
6506 psllw m1, m2
6507 movu [r2 + r6 * 2], m0
6508 movu [r2 + r6 * 2 + 16], m1
6509
6510 add r6, 16
6511 cmp r6d, r4d
6512 jl .loopW
6513
6514 ; move to next row
6515 add r0, r1
6516 add r2, r3
6517 dec r5d
6518 jnz .loopH
6519
; process the last row of the frame (handles widths that are not a multiple of 16)
6521
6522.loop16:
6523 pmovzxbw m0,[r0]
6524 pmovzxbw m1,[r0 + 8]
6525 psllw m0, m2
6526 psllw m1, m2
6527 movu [r2], m0
6528 movu [r2 + 16], m1
6529
6530 add r0, mmsize
6531 add r2, 2 * mmsize
6532 sub r4d, 16
6533 jz .end
6534 cmp r4d, 15
6535 jg .loop16
6536
6537 cmp r4d, 8
6538 jl .process4
6539 pmovzxbw m0,[r0]
6540 psllw m0, m2
6541 movu [r2], m0
6542
6543 add r0, 8
6544 add r2, mmsize
6545 sub r4d, 8
6546 jz .end
6547
6548.process4:
6549 cmp r4d, 4
6550 jl .process2
6551 movd m0,[r0]
6552 pmovzxbw m0,m0
6553 psllw m0, m2
6554 movh [r2], m0
6555
6556 add r0, 4
6557 add r2, 8
6558 sub r4d, 4
6559 jz .end
6560
.process2:
    cmp         r4d, 2
    jl          .process1
    movzx       r3d, word [r0]     ; two source bytes
    movd        m0, r3d
    pmovzxbw    m0, m0
    psllw       m0, m2             ; apply the shift argument to the tail pixels as well
    movd        r3d, m0
    mov         [r2], r3w
    shr         r3d, 16
    mov         [r2 + 2], r3w

    add         r0, 2
    add         r2, 4
    sub         r4d, 2
    jz          .end

.process1:
    movzx       r3d, byte [r0]
    movd        m0, r3d
    psllw       m0, m2             ; apply the shift argument to the tail pixel as well
    movd        r3d, m0
    mov         [r2], r3w
6580.end:
6581 RET