;*****************************************************************************
;* sad16-a.asm: x86 high depth sad functions
;*****************************************************************************
;* Copyright (C) 2010-2013 x264 project
;*
;* Authors: Oskar Arvidsson <oskar@irock.se>
;*          Henrik Gramner <henrik@gramner.com>
;*          Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at license @ x265.com.
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION .text

cextern pw_1

;=============================================================================
; SAD MMX
;=============================================================================

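; Each SAD_INC_* macro below processes one or two rows of a block of the
; stated width (16-bit pixels), accumulating absolute word differences in m0;
; the SAD_MMX wrapper reduces m0 with HADDW/HADDUW once all rows are done.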
%macro SAD_INC_1x16P_MMX 0
    movu    m1, [r0+ 0]
    movu    m2, [r0+ 8]
    movu    m3, [r0+16]
    movu    m4, [r0+24]
    psubw   m1, [r2+ 0]
    psubw   m2, [r2+ 8]
    psubw   m3, [r2+16]
    psubw   m4, [r2+24]
    ABSW2   m1, m2, m1, m2, m5, m6
    ABSW2   m3, m4, m3, m4, m7, m5
    lea     r0, [r0+2*r1]
    lea     r2, [r2+2*r3]
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m0, m3
%endmacro

%macro SAD_INC_2x8P_MMX 0
    movu    m1, [r0+0]
    movu    m2, [r0+8]
    movu    m3, [r0+2*r1+0]
    movu    m4, [r0+2*r1+8]
    psubw   m1, [r2+0]
    psubw   m2, [r2+8]
    psubw   m3, [r2+2*r3+0]
    psubw   m4, [r2+2*r3+8]
    ABSW2   m1, m2, m1, m2, m5, m6
    ABSW2   m3, m4, m3, m4, m7, m5
    lea     r0, [r0+4*r1]
    lea     r2, [r2+4*r3]
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m0, m3
%endmacro

%macro SAD_INC_2x4P_MMX 0
    movu    m1, [r0]
    movu    m2, [r0+2*r1]
    psubw   m1, [r2]
    psubw   m2, [r2+2*r3]
    ABSW2   m1, m2, m1, m2, m3, m4
    lea     r0, [r0+4*r1]
    lea     r2, [r2+4*r3]
    paddw   m0, m1
    paddw   m0, m2
%endmacro

;-----------------------------------------------------------------------------
; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t )
;-----------------------------------------------------------------------------
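; For reference, the scalar operation implemented by every pixel_sad_NxM
; variant in this file is roughly the following (a minimal C sketch; the
; names `pix1`/`pix2` are illustrative, strides are counted in 16-bit pixels):
;
;     int sad_NxM(const uint16_t *pix1, intptr_t stride1,
;                 const uint16_t *pix2, intptr_t stride2)
;     {
;         int sum = 0;
;         for (int y = 0; y < M; y++, pix1 += stride1, pix2 += stride2)
;             for (int x = 0; x < N; x++)
;                 sum += abs(pix1[x] - pix2[x]);
;         return sum;
;     }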
%macro SAD_MMX 3
cglobal pixel_sad_%1x%2, 4,5-(%2&4/4)
    pxor    m0, m0
%if %2 == 4
    SAD_INC_%3x%1P_MMX
    SAD_INC_%3x%1P_MMX
%else
    mov     r4d, %2/%3
.loop:
    SAD_INC_%3x%1P_MMX
    dec     r4d
    jg      .loop
%endif
%if %1*%2 == 256
    HADDUW  m0, m1
%else
    HADDW   m0, m1
%endif
    movd    eax, m0
    RET
%endmacro

INIT_MMX mmx2
SAD_MMX 16, 16, 1
SAD_MMX 16, 8, 1
SAD_MMX 8, 16, 2
SAD_MMX 8, 8, 2
SAD_MMX 8, 4, 2
SAD_MMX 4, 8, 2
SAD_MMX 4, 4, 2
SAD_MMX 4, 16, 2
INIT_MMX ssse3
SAD_MMX 4, 8, 2
SAD_MMX 4, 4, 2

;=============================================================================
; SAD XMM
;=============================================================================

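; Each SAD_1xN macro below handles one row of N pixels.  Because a 16-bit
; accumulator could overflow for the larger blocks, the absolute differences
; are widened to 32 bits with pmaddwd against pw_1 and summed in m0 as dwords.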
%macro SAD_1x32 0
    movu    m1, [r2+ 0]
    movu    m2, [r2+16]
    movu    m3, [r2+32]
    movu    m4, [r2+48]
    psubw   m1, [r0+0]
    psubw   m2, [r0+16]
    psubw   m3, [r0+32]
    psubw   m4, [r0+48]
    ABSW2   m1, m2, m1, m2, m5, m6
    pmaddwd m1, [pw_1]
    pmaddwd m2, [pw_1]
    lea     r0, [r0+2*r1]
    lea     r2, [r2+2*r3]
    ABSW2   m3, m4, m3, m4, m7, m5
    pmaddwd m3, [pw_1]
    pmaddwd m4, [pw_1]
    paddd   m1, m2
    paddd   m3, m4
    paddd   m0, m1
    paddd   m0, m3
%endmacro

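; SAD_1x24: the first 16 pixels take the common ABSW2/pmaddwd path; the
; remaining 8-pixel chunk computes its absolute value inline as max(x, -x)
; (pxor/psubw/pmaxsw) before being widened and added to the accumulator.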
%macro SAD_1x24 0
    movu    m1, [r2+ 0]
    movu    m2, [r2+16]
    movu    m3, [r2+32]
    psubw   m1, [r0+0]
    psubw   m2, [r0+16]
    psubw   m3, [r0+32]
    ABSW2   m1, m2, m1, m2, m4, m6
    pmaddwd m1, [pw_1]
    pmaddwd m2, [pw_1]
    lea     r0, [r0+2*r1]
    lea     r2, [r2+2*r3]
    pxor    m4, m4
    psubw   m4, m3
    pmaxsw  m3, m4
    pmaddwd m3, [pw_1]
    paddd   m1, m2
    paddd   m0, m1
    paddd   m0, m3
%endmacro

%macro SAD_1x48 0
    movu    m1, [r2+ 0]
    movu    m2, [r2+16]
    movu    m3, [r2+32]
    movu    m4, [r2+48]
    psubw   m1, [r0+0]
    psubw   m2, [r0+16]
    psubw   m3, [r0+32]
    psubw   m4, [r0+48]
    ABSW2   m1, m2, m1, m2, m5, m6
    pmaddwd m1, [pw_1]
    pmaddwd m2, [pw_1]
    ABSW2   m3, m4, m3, m4, m7, m5
    pmaddwd m3, [pw_1]
    pmaddwd m4, [pw_1]
    paddd   m1, m2
    paddd   m3, m4
    paddd   m0, m1
    paddd   m0, m3
    movu    m1, [r2+64]
    movu    m2, [r2+80]
    psubw   m1, [r0+64]
    psubw   m2, [r0+80]
    ABSW2   m1, m2, m1, m2, m3, m4
    pmaddwd m1, [pw_1]
    pmaddwd m2, [pw_1]
    lea     r0, [r0+2*r1]
    lea     r2, [r2+2*r3]
    paddd   m0, m1
    paddd   m0, m2
%endmacro

%macro SAD_1x64 0
    movu    m1, [r2+ 0]
    movu    m2, [r2+16]
    movu    m3, [r2+32]
    movu    m4, [r2+48]
    psubw   m1, [r0+0]
    psubw   m2, [r0+16]
    psubw   m3, [r0+32]
    psubw   m4, [r0+48]
    ABSW2   m1, m2, m1, m2, m5, m6
    pmaddwd m1, [pw_1]
    pmaddwd m2, [pw_1]
    ABSW2   m3, m4, m3, m4, m7, m5
    pmaddwd m3, [pw_1]
    pmaddwd m4, [pw_1]
    paddd   m1, m2
    paddd   m3, m4
    paddd   m0, m1
    paddd   m0, m3
    movu    m1, [r2+64]
    movu    m2, [r2+80]
    movu    m3, [r2+96]
    movu    m4, [r2+112]
    psubw   m1, [r0+64]
    psubw   m2, [r0+80]
    psubw   m3, [r0+96]
    psubw   m4, [r0+112]
    ABSW2   m1, m2, m1, m2, m5, m6
    pmaddwd m1, [pw_1]
    pmaddwd m2, [pw_1]
    ABSW2   m3, m4, m3, m4, m7, m5
    pmaddwd m3, [pw_1]
    pmaddwd m4, [pw_1]
    paddd   m1, m2
    paddd   m3, m4
    paddd   m0, m1
    paddd   m0, m3
    lea     r0, [r0+2*r1]
    lea     r2, [r2+2*r3]
%endmacro

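; SAD_1x12: a 12-pixel (24-byte) row is split into one full 16-byte load and
; one 8-byte movh load for the remaining 4 pixels.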
%macro SAD_1x12 0
    movu    m1, [r2+0]
    movh    m2, [r2+16]
    psubw   m1, [r0+0]
    movh    m3, [r0+16]
    psubw   m2, m3
    ABSW2   m1, m2, m1, m2, m4, m6
    pmaddwd m1, [pw_1]
    pmaddwd m2, [pw_1]
    lea     r0, [r0+2*r1]
    lea     r2, [r2+2*r3]
    paddd   m1, m2
    paddd   m0, m1
%endmacro

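; SAD_INC_2ROW processes two rows per invocation.  The width is resolved at
; assembly time: blocks wider than one register (2*%1 > mmsize) use two loads
; per row, otherwise a single register per row is enough.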
%macro SAD_INC_2ROW 1
%if 2*%1 > mmsize
    movu    m1, [r2+ 0]
    movu    m2, [r2+16]
    movu    m3, [r2+2*r3+ 0]
    movu    m4, [r2+2*r3+16]
    psubw   m1, [r0+ 0]
    psubw   m2, [r0+16]
    psubw   m3, [r0+2*r1+ 0]
    psubw   m4, [r0+2*r1+16]
    ABSW2   m1, m2, m1, m2, m5, m6
    lea     r0, [r0+4*r1]
    lea     r2, [r2+4*r3]
    ABSW2   m3, m4, m3, m4, m7, m5
    paddw   m1, m2
    paddw   m3, m4
    paddw   m3, m1
    pmaddwd m3, [pw_1]
    paddd   m0, m3
%else
    movu    m1, [r2]
    movu    m2, [r2+2*r3]
    psubw   m1, [r0]
    psubw   m2, [r0+2*r1]
    ABSW2   m1, m2, m1, m2, m3, m4
    lea     r0, [r0+4*r1]
    lea     r2, [r2+4*r3]
    paddw   m2, m1
    pmaddwd m2, [pw_1]
    paddd   m0, m2
%endif
%endmacro

;-----------------------------------------------------------------------------
; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t )
;-----------------------------------------------------------------------------
%macro SAD 2
cglobal pixel_sad_%1x%2, 4,5-(%2&4/4),8*(%1/mmsize)
    pxor    m0, m0
%if %2 == 4
    SAD_INC_2ROW %1
    SAD_INC_2ROW %1
%else
    mov     r4d, %2/2
.loop:
    SAD_INC_2ROW %1
    dec     r4d
    jg      .loop
%endif

    HADDD   m0, m1
    movd    eax, xm0
    RET
%endmacro

INIT_XMM sse2
SAD 16, 4
SAD 16, 8
SAD 16, 12
SAD 16, 16
SAD 16, 32
SAD 16, 64

INIT_XMM sse2
SAD 8, 4
SAD 8, 8
SAD 8, 16
SAD 8, 32

;------------------------------------------------------------------
; int pixel_sad_32xN( uint16_t *, intptr_t, uint16_t *, intptr_t )
;------------------------------------------------------------------
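; The wide-block wrappers below (32, 64, 48, 24 and 12 pixels wide) all share
; the same shape: zero m0, unroll four SAD_1xW rows per loop iteration, loop
; height/4 times, then do the final HADDD reduction.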
%macro SAD_32 2
cglobal pixel_sad_%1x%2, 4,5,8
    pxor    m0, m0
    mov     r4d, %2/4
.loop:
    SAD_1x32
    SAD_1x32
    SAD_1x32
    SAD_1x32
    dec     r4d
    jnz     .loop

    HADDD   m0, m1
    movd    eax, xm0
    RET
%endmacro

INIT_XMM sse2
SAD_32 32, 8
SAD_32 32, 16
SAD_32 32, 24
SAD_32 32, 32
SAD_32 32, 64

;------------------------------------------------------------------
; int pixel_sad_64xN( uint16_t *, intptr_t, uint16_t *, intptr_t )
;------------------------------------------------------------------
%macro SAD_64 2
cglobal pixel_sad_%1x%2, 4,5,8
    pxor    m0, m0
    mov     r4d, %2/4
.loop:
    SAD_1x64
    SAD_1x64
    SAD_1x64
    SAD_1x64
    dec     r4d
    jnz     .loop

    HADDD   m0, m1
    movd    eax, xmm0
    RET
%endmacro

INIT_XMM sse2
SAD_64 64, 16
SAD_64 64, 32
SAD_64 64, 48
SAD_64 64, 64

;------------------------------------------------------------------
; int pixel_sad_48xN( uint16_t *, intptr_t, uint16_t *, intptr_t )
;------------------------------------------------------------------
%macro SAD_48 2
cglobal pixel_sad_%1x%2, 4,5,8
    pxor    m0, m0
    mov     r4d, %2/4
.loop:
    SAD_1x48
    SAD_1x48
    SAD_1x48
    SAD_1x48
    dec     r4d
    jnz     .loop

    HADDD   m0, m1
    movd    eax, xmm0
    RET
%endmacro

INIT_XMM sse2
SAD_48 48, 64

;------------------------------------------------------------------
; int pixel_sad_24xN( uint16_t *, intptr_t, uint16_t *, intptr_t )
;------------------------------------------------------------------
%macro SAD_24 2
cglobal pixel_sad_%1x%2, 4,5,8
    pxor    m0, m0
    mov     r4d, %2/4
.loop:
    SAD_1x24
    SAD_1x24
    SAD_1x24
    SAD_1x24
    dec     r4d
    jnz     .loop

    HADDD   m0, m1
    movd    eax, xmm0
    RET
%endmacro

INIT_XMM sse2
SAD_24 24, 32

;------------------------------------------------------------------
; int pixel_sad_12xN( uint16_t *, intptr_t, uint16_t *, intptr_t )
;------------------------------------------------------------------
%macro SAD_12 2
cglobal pixel_sad_%1x%2, 4,5,8
    pxor    m0, m0
    mov     r4d, %2/4
.loop:
    SAD_1x12
    SAD_1x12
    SAD_1x12
    SAD_1x12
    dec     r4d
    jnz     .loop

    HADDD   m0, m1
    movd    eax, xmm0
    RET
%endmacro

INIT_XMM sse2
SAD_12 12, 16


;=============================================================================
; SAD x3/x4
;=============================================================================

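; The x3/x4 kernels compare one encoded block (r0, fixed FENC_STRIDE) against
; three or four reference blocks that share a single stride, keeping one dword
; accumulator per reference (m0-m2 or m0-m3) and storing the three or four
; sums to the caller's scores array at the end.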
%macro SAD_X3_INC_P 0
    add     r0, 4*FENC_STRIDE
    lea     r1, [r1+4*r4]
    lea     r2, [r2+4*r4]
    lea     r3, [r3+4*r4]
%endmacro

%macro SAD_X3_ONE_START 0
    mova    m3, [r0]
    movu    m0, [r1]
    movu    m1, [r2]
    movu    m2, [r3]
    psubw   m0, m3
    psubw   m1, m3
    psubw   m2, m3
    ABSW2   m0, m1, m0, m1, m4, m5
    ABSW    m2, m2, m6
    pmaddwd m0, [pw_1]
    pmaddwd m1, [pw_1]
    pmaddwd m2, [pw_1]
%endmacro

%macro SAD_X3_ONE 2
    mova    m6, [r0+%1]
    movu    m3, [r1+%2]
    movu    m4, [r2+%2]
    movu    m5, [r3+%2]
    psubw   m3, m6
    psubw   m4, m6
    psubw   m5, m6
    ABSW2   m3, m4, m3, m4, m7, m6
    ABSW    m5, m5, m6
    pmaddwd m3, [pw_1]
    pmaddwd m4, [pw_1]
    pmaddwd m5, [pw_1]
    paddd   m0, m3
    paddd   m1, m4
    paddd   m2, m5
%endmacro

%macro SAD_X3_END 2
%if mmsize == 8 && %1*%2 == 256
    HADDUW  m0, m3
    HADDUW  m1, m4
    HADDUW  m2, m5
%else
    HADDD   m0, m3
    HADDD   m1, m4
    HADDD   m2, m5
%endif
%if UNIX64
    movd    [r5+0], xm0
    movd    [r5+4], xm1
    movd    [r5+8], xm2
%else
    mov     r0, r5mp
    movd    [r0+0], xm0
    movd    [r0+4], xm1
    movd    [r0+8], xm2
%endif
    RET
%endmacro

%macro SAD_X4_INC_P 0
    add     r0, 4*FENC_STRIDE
    lea     r1, [r1+4*r5]
    lea     r2, [r2+4*r5]
    lea     r3, [r3+4*r5]
    lea     r4, [r4+4*r5]
%endmacro

%macro SAD_X4_ONE_START 0
    mova    m4, [r0]
    movu    m0, [r1]
    movu    m1, [r2]
    movu    m2, [r3]
    movu    m3, [r4]
    psubw   m0, m4
    psubw   m1, m4
    psubw   m2, m4
    psubw   m3, m4
    ABSW2   m0, m1, m0, m1, m5, m6
    ABSW2   m2, m3, m2, m3, m4, m7
    pmaddwd m0, [pw_1]
    pmaddwd m1, [pw_1]
    pmaddwd m2, [pw_1]
    pmaddwd m3, [pw_1]
%endmacro

%macro SAD_X4_ONE 2
    mova    m4, [r0+%1]
    movu    m5, [r1+%2]
    movu    m6, [r2+%2]
%if num_mmregs > 8
    movu    m7, [r3+%2]
    movu    m8, [r4+%2]
    psubw   m5, m4
    psubw   m6, m4
    psubw   m7, m4
    psubw   m8, m4
    ABSW2   m5, m6, m5, m6, m9, m10
    ABSW2   m7, m8, m7, m8, m9, m10
    pmaddwd m5, [pw_1]
    pmaddwd m6, [pw_1]
    pmaddwd m7, [pw_1]
    pmaddwd m8, [pw_1]
    paddd   m0, m5
    paddd   m1, m6
    paddd   m2, m7
    paddd   m3, m8
%elif cpuflag(ssse3)
    movu    m7, [r3+%2]
    psubw   m5, m4
    psubw   m6, m4
    psubw   m7, m4
    movu    m4, [r4+%2]
    pabsw   m5, m5
    psubw   m4, [r0+%1]
    pabsw   m6, m6
    pabsw   m7, m7
    pabsw   m4, m4
    pmaddwd m5, [pw_1]
    pmaddwd m6, [pw_1]
    pmaddwd m7, [pw_1]
    pmaddwd m4, [pw_1]
    paddd   m0, m5
    paddd   m1, m6
    paddd   m2, m7
    paddd   m3, m4
%else ; num_mmregs == 8 && !ssse3
    psubw   m5, m4
    psubw   m6, m4
    ABSW    m5, m5, m7
    ABSW    m6, m6, m7
    pmaddwd m5, [pw_1]
    pmaddwd m6, [pw_1]
    paddd   m0, m5
    paddd   m1, m6
    movu    m5, [r3+%2]
    movu    m6, [r4+%2]
    psubw   m5, m4
    psubw   m6, m4
    ABSW2   m5, m6, m5, m6, m7, m4
    pmaddwd m5, [pw_1]
    pmaddwd m6, [pw_1]
    paddd   m2, m5
    paddd   m3, m6
%endif
%endmacro

%macro SAD_X4_END 2
%if mmsize == 8 && %1*%2 == 256
    HADDUW  m0, m4
    HADDUW  m1, m5
    HADDUW  m2, m6
    HADDUW  m3, m7
%else
    HADDD   m0, m4
    HADDD   m1, m5
    HADDD   m2, m6
    HADDD   m3, m7
%endif
    mov     r0, r6mp
    movd    [r0+ 0], xm0
    movd    [r0+ 4], xm1
    movd    [r0+ 8], xm2
    movd    [r0+12], xm3
    RET
%endmacro

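; SAD_X_2xNP emits the body for two rows: %4 chunks of mmsize bytes per row,
; starting at chunk index %3, with %2 naming the reference-stride register.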
%macro SAD_X_2xNP 4
    %assign x %3
%rep %4
    SAD_X%1_ONE x*mmsize, x*mmsize
    SAD_X%1_ONE 2*FENC_STRIDE+x*mmsize, 2*%2+x*mmsize
    %assign x x+1
%endrep
%endmacro

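; pixel_vsad(pixel, stride, height) sums the absolute differences between
; vertically adjacent rows over a 16-pixel-wide column.  A minimal C sketch
; of the scalar equivalent (illustrative names, stride in pixels):
;
;     int vsad(const uint16_t *pix, intptr_t stride, int height)
;     {
;         int sum = 0;
;         for (int y = 0; y < height - 1; y++)
;             for (int x = 0; x < 16; x++)
;                 sum += abs(pix[y*stride + x] - pix[(y+1)*stride + x]);
;         return sum;
;     }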
%macro PIXEL_VSAD 0
cglobal pixel_vsad, 3,3,8
    mova    m0, [r0]
    mova    m1, [r0+16]
    mova    m2, [r0+2*r1]
    mova    m3, [r0+2*r1+16]
    lea     r0, [r0+4*r1]
    psubw   m0, m2
    psubw   m1, m3
    ABSW2   m0, m1, m0, m1, m4, m5
    paddw   m0, m1
    sub     r2d, 2
    je      .end
.loop:
    mova    m4, [r0]
    mova    m5, [r0+16]
    mova    m6, [r0+2*r1]
    mova    m7, [r0+2*r1+16]
    lea     r0, [r0+4*r1]
    psubw   m2, m4
    psubw   m3, m5
    psubw   m4, m6
    psubw   m5, m7
    ABSW    m2, m2, m1
    ABSW    m3, m3, m1
    ABSW    m4, m4, m1
    ABSW    m5, m5, m1
    paddw   m0, m2
    paddw   m0, m3
    paddw   m0, m4
    paddw   m0, m5
    mova    m2, m6
    mova    m3, m7
    sub     r2d, 2
    jg      .loop
.end:
%if BIT_DEPTH == 9
    HADDW   m0, m1 ; max sum: 62(pixel diffs)*511(pixel_max)=31682
%else
    HADDUW  m0, m1 ; max sum: 62(pixel diffs)*1023(pixel_max)=63426
%endif
    movd    eax, m0
    RET
%endmacro
INIT_XMM sse2
PIXEL_VSAD
INIT_XMM ssse3
PIXEL_VSAD
INIT_XMM xop
PIXEL_VSAD

INIT_YMM avx2
cglobal pixel_vsad, 3,3
    mova    m0, [r0]
    mova    m1, [r0+2*r1]
    lea     r0, [r0+4*r1]
    psubw   m0, m1
    pabsw   m0, m0
    sub     r2d, 2
    je      .end
.loop:
    mova    m2, [r0]
    mova    m3, [r0+2*r1]
    lea     r0, [r0+4*r1]
    psubw   m1, m2
    psubw   m2, m3
    pabsw   m1, m1
    pabsw   m2, m2
    paddw   m0, m1
    paddw   m0, m2
    mova    m1, m3
    sub     r2d, 2
    jg      .loop
.end:
%if BIT_DEPTH == 9
    HADDW   m0, m1
%else
    HADDUW  m0, m1
%endif
    movd    eax, xm0
    RET

;-----------------------------------------------------------------------------
; void pixel_sad_xN_WxH( uint16_t *fenc, uint16_t *pix0, uint16_t *pix1,
;                        uint16_t *pix2, intptr_t i_stride, int scores[3] )
;-----------------------------------------------------------------------------
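; For reference, pixel_sad_x3 computes the SADs of one encoded block against
; three reference candidates in a single call (x4 adds a fourth reference and
; a fourth score).  A minimal C sketch with illustrative names; fenc uses the
; fixed FENC_STRIDE, the references share i_stride (both counted in pixels):
;
;     void sad_x3_WxH(const uint16_t *fenc, const uint16_t *pix0,
;                     const uint16_t *pix1, const uint16_t *pix2,
;                     intptr_t i_stride, int scores[3])
;     {
;         const uint16_t *pix[3] = { pix0, pix1, pix2 };
;         for (int i = 0; i < 3; i++) {
;             int sum = 0;
;             for (int y = 0; y < H; y++)
;                 for (int x = 0; x < W; x++)
;                     sum += abs(fenc[y*FENC_STRIDE + x] - pix[i][y*i_stride + x]);
;             scores[i] = sum;
;         }
;     }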
%macro SAD_X 3
cglobal pixel_sad_x%1_%2x%3, 6,7,XMM_REGS
    %assign regnum %1+1
    %xdefine STRIDE r %+ regnum
    mov     r6, %3/2-1
    SAD_X%1_ONE_START
    SAD_X%1_ONE 2*FENC_STRIDE, 2*STRIDE
    SAD_X_2xNP %1, STRIDE, 1, %2/(mmsize/2)-1
.loop:
    SAD_X%1_INC_P
    SAD_X_2xNP %1, STRIDE, 0, %2/(mmsize/2)
    dec     r6
    jg      .loop
%if %1 == 4
    mov     r6, r6m
%endif
    SAD_X%1_END %2, %3
%endmacro

INIT_MMX mmx2
%define XMM_REGS 0
SAD_X 3, 16, 16
SAD_X 3, 16, 8
SAD_X 3, 12, 16
SAD_X 3, 8, 16
SAD_X 3, 8, 8
SAD_X 3, 8, 4
SAD_X 3, 4, 16
SAD_X 3, 4, 8
SAD_X 3, 4, 4
SAD_X 4, 16, 16
SAD_X 4, 16, 8
SAD_X 4, 12, 16
SAD_X 4, 8, 16
SAD_X 4, 8, 8
SAD_X 4, 8, 4
SAD_X 4, 4, 16
SAD_X 4, 4, 8
SAD_X 4, 4, 4
INIT_MMX ssse3
SAD_X 3, 4, 8
SAD_X 3, 4, 4
SAD_X 4, 4, 8
SAD_X 4, 4, 4
INIT_XMM ssse3
%define XMM_REGS 7
SAD_X 3, 16, 16
SAD_X 3, 16, 8
SAD_X 3, 8, 16
SAD_X 3, 8, 8
SAD_X 3, 8, 4
%define XMM_REGS 9
SAD_X 4, 16, 16
SAD_X 4, 16, 8
SAD_X 4, 8, 16
SAD_X 4, 8, 8
SAD_X 4, 8, 4
INIT_XMM sse2
%define XMM_REGS 8
SAD_X 3, 64, 64
SAD_X 3, 64, 48
SAD_X 3, 64, 32
SAD_X 3, 64, 16
SAD_X 3, 48, 64
SAD_X 3, 32, 64
SAD_X 3, 32, 32
SAD_X 3, 32, 24
SAD_X 3, 32, 16
SAD_X 3, 32, 8
SAD_X 3, 24, 32
SAD_X 3, 16, 64
SAD_X 3, 16, 32
SAD_X 3, 16, 16
SAD_X 3, 16, 12
SAD_X 3, 16, 8
SAD_X 3, 16, 4
SAD_X 3, 8, 32
SAD_X 3, 8, 16
SAD_X 3, 8, 8
SAD_X 3, 8, 4
%define XMM_REGS 11
SAD_X 4, 64, 64
SAD_X 4, 64, 48
SAD_X 4, 64, 32
SAD_X 4, 64, 16
SAD_X 4, 48, 64
SAD_X 4, 32, 64
SAD_X 4, 32, 32
SAD_X 4, 32, 24
SAD_X 4, 32, 16
SAD_X 4, 32, 8
SAD_X 4, 24, 32
SAD_X 4, 16, 64
SAD_X 4, 16, 32
SAD_X 4, 16, 16
SAD_X 4, 16, 12
SAD_X 4, 16, 8
SAD_X 4, 16, 4
SAD_X 4, 8, 32
SAD_X 4, 8, 16
SAD_X 4, 8, 8
SAD_X 4, 8, 4
INIT_YMM avx2
%define XMM_REGS 7
SAD_X 3, 16, 16
SAD_X 3, 16, 8
%define XMM_REGS 9
SAD_X 4, 16, 16
SAD_X 4, 16, 8