1;*****************************************************************************
2;* sad-a.asm: x86 sad functions
3;*****************************************************************************
4;* Copyright (C) 2003-2013 x264 project
5;*
6;* Authors: Loren Merritt <lorenm@u.washington.edu>
7;* Fiona Glaser <fiona@x264.com>
8;* Laurent Aimar <fenrir@via.ecp.fr>
9;* Alex Izvorski <aizvorksi@gmail.com>
10;*
11;* This program is free software; you can redistribute it and/or modify
12;* it under the terms of the GNU General Public License as published by
13;* the Free Software Foundation; either version 2 of the License, or
14;* (at your option) any later version.
15;*
16;* This program is distributed in the hope that it will be useful,
17;* but WITHOUT ANY WARRANTY; without even the implied warranty of
18;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19;* GNU General Public License for more details.
20;*
21;* You should have received a copy of the GNU General Public License
22;* along with this program; if not, write to the Free Software
23;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
24;*
25;* This program is also available under a commercial proprietary license.
26;* For more information, contact us at license @ x265.com.
27;*****************************************************************************
28
29%include "x86inc.asm"
30%include "x86util.asm"
31
32SECTION_RODATA 32
33
34MSK: db 255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0
35pb_shuf8x8c2: times 2 db 0,0,0,0,8,8,8,8,-1,-1,-1,-1,-1,-1,-1,-1
36hpred_shuf: db 0,0,2,2,8,8,10,10,1,1,3,3,9,9,11,11
37
38SECTION .text
39
40cextern pb_3
41cextern pb_shuf8x8c
42cextern pw_8
43cextern sw_64
44
45;=============================================================================
46; SAD MMX
47;=============================================================================
48
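; All of the SAD kernels in this file implement the same contract: the sum of
; absolute byte differences between an encoder block and a reference block,
; each addressed with its own stride. A plain-C sketch of that contract
; (illustrative only, not part of the build; names are placeholders):
;
;   int sad_WxH(const uint8_t *pix1, intptr_t stride1,
;               const uint8_t *pix2, intptr_t stride2)
;   {
;       int sum = 0;
;       for (int y = 0; y < H; y++, pix1 += stride1, pix2 += stride2)
;           for (int x = 0; x < W; x++)
;               sum += abs(pix1[x] - pix2[x]);
;       return sum;
;   }
;
; The MMX macros below do this 8 (or 4) bytes at a time: psadbw sums the
; absolute differences of each 8-byte group into one word, and the running
; total stays in mm0 until the final movd eax, mm0.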
49%macro SAD_INC_2x16P 0
50 movq mm1, [r0]
51 movq mm2, [r0+8]
52 movq mm3, [r0+r1]
53 movq mm4, [r0+r1+8]
54 psadbw mm1, [r2]
55 psadbw mm2, [r2+8]
56 psadbw mm3, [r2+r3]
57 psadbw mm4, [r2+r3+8]
58 lea r0, [r0+2*r1]
59 paddw mm1, mm2
60 paddw mm3, mm4
61 lea r2, [r2+2*r3]
62 paddw mm0, mm1
63 paddw mm0, mm3
64%endmacro
65
66%macro SAD_INC_2x8P 0
67 movq mm1, [r0]
68 movq mm2, [r0+r1]
69 psadbw mm1, [r2]
70 psadbw mm2, [r2+r3]
71 lea r0, [r0+2*r1]
72 paddw mm0, mm1
73 paddw mm0, mm2
74 lea r2, [r2+2*r3]
75%endmacro
76
77%macro SAD_INC_2x4P 0
78 movd mm1, [r0]
79 movd mm2, [r2]
80 punpckldq mm1, [r0+r1]
81 punpckldq mm2, [r2+r3]
82 psadbw mm1, mm2
83 paddw mm0, mm1
84 lea r0, [r0+2*r1]
85 lea r2, [r2+2*r3]
86%endmacro
87
88;-----------------------------------------------------------------------------
89; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
90;-----------------------------------------------------------------------------
91%macro SAD 2
92cglobal pixel_sad_%1x%2_mmx2, 4,4
93 pxor mm0, mm0
94%rep %2/2
95 SAD_INC_2x%1P
96%endrep
97 movd eax, mm0
98 RET
99%endmacro
100
101SAD 16, 16
102SAD 16, 8
103SAD 8, 16
104SAD 8, 8
105SAD 8, 4
106SAD 4, 16
107SAD 4, 8
108SAD 4, 4
109
110
111
112;=============================================================================
113; SAD XMM
114;=============================================================================
115
116%macro SAD_END_SSE2 0
117 movhlps m1, m0
118 paddw m0, m1
119 movd eax, m0
120 RET
121%endmacro
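; SAD_END_SSE2: after the row loops, each 64-bit half of m0 holds an
; independent psadbw partial, so movhlps folds the high half onto the low one
; and movd returns the result (paddw is sufficient here because even a 16x16
; SAD is at most 16*16*255 = 65280, which still fits in 16 bits). An
; equivalent SSE2-intrinsics sketch (illustrative assumption, not the
; shipped code):
;
;   __m128i hi  = _mm_unpackhi_epi64(acc, acc);   // move high partial down
;   __m128i sum = _mm_add_epi32(acc, hi);         // fold the two partials
;   return _mm_cvtsi128_si32(sum);                // final SAD in a GPR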
122
123%macro PROCESS_SAD_12x4 0
124 movu m1, [r2]
125 movu m2, [r0]
126 pand m1, m4
127 pand m2, m4
128 psadbw m1, m2
129 paddd m0, m1
130 lea r2, [r2 + r3]
131 lea r0, [r0 + r1]
132 movu m1, [r2]
133 movu m2, [r0]
134 pand m1, m4
135 pand m2, m4
136 psadbw m1, m2
137 paddd m0, m1
138 lea r2, [r2 + r3]
139 lea r0, [r0 + r1]
140 movu m1, [r2]
141 movu m2, [r0]
142 pand m1, m4
143 pand m2, m4
144 psadbw m1, m2
145 paddd m0, m1
146 lea r2, [r2 + r3]
147 lea r0, [r0 + r1]
148 movu m1, [r2]
149 movu m2, [r0]
150 pand m1, m4
151 pand m2, m4
152 psadbw m1, m2
153 paddd m0, m1
154%endmacro
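; PROCESS_SAD_12x4 handles the 12-pixel width by loading full 16-byte rows
; and masking both sides with MSK (defined in SECTION_RODATA above): the last
; four bytes of each row become zero in both inputs, so they contribute
; nothing to the psadbw result. Per-row sketch in SSE2 intrinsics
; (illustrative only; variable names are placeholders):
;
;   __m128i a = _mm_and_si128(_mm_loadu_si128((const __m128i *)ref),  mask);
;   __m128i b = _mm_and_si128(_mm_loadu_si128((const __m128i *)fenc), mask);
;   acc = _mm_add_epi32(acc, _mm_sad_epu8(a, b));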
155
156%macro PROCESS_SAD_16x4 0
157 movu m1, [r2]
158 movu m2, [r2 + r3]
159 psadbw m1, [r0]
160 psadbw m2, [r0 + r1]
161 paddd m1, m2
162 paddd m0, m1
163 lea r2, [r2 + 2 * r3]
164 lea r0, [r0 + 2 * r1]
165 movu m1, [r2]
166 movu m2, [r2 + r3]
167 psadbw m1, [r0]
168 psadbw m2, [r0 + r1]
169 paddd m1, m2
170 paddd m0, m1
171 lea r2, [r2 + 2 * r3]
172 lea r0, [r0 + 2 * r1]
173%endmacro
174
175%macro PROCESS_SAD_24x4 0
176 movu m1, [r2]
177 movq m2, [r2 + 16]
178 lea r2, [r2 + r3]
179 movu m3, [r2]
180 movq m4, [r2 + 16]
181 psadbw m1, [r0]
182 psadbw m3, [r0 + r1]
183 paddd m0, m1
184 paddd m0, m3
185 movq m1, [r0 + 16]
186 lea r0, [r0 + r1]
187 movq m3, [r0 + 16]
188 punpcklqdq m2, m4
189 punpcklqdq m1, m3
190 psadbw m2, m1
191 paddd m0, m2
192 lea r2, [r2 + r3]
193 lea r0, [r0 + r1]
194
195 movu m1, [r2]
196 movq m2, [r2 + 16]
197 lea r2, [r2 + r3]
198 movu m3, [r2]
199 movq m4, [r2 + 16]
200 psadbw m1, [r0]
201 psadbw m3, [r0 + r1]
202 paddd m0, m1
203 paddd m0, m3
204 movq m1, [r0 + 16]
205 lea r0, [r0 + r1]
206 movq m3, [r0 + 16]
207 punpcklqdq m2, m4
208 punpcklqdq m1, m3
209 psadbw m2, m1
210 paddd m0, m2
211%endmacro
212
213%macro PROCESS_SAD_32x4 0
214 movu m1, [r2]
215 movu m2, [r2 + 16]
216 psadbw m1, [r0]
217 psadbw m2, [r0 + 16]
218 paddd m1, m2
219 paddd m0, m1
220 lea r2, [r2 + r3]
221 lea r0, [r0 + r1]
222 movu m1, [r2]
223 movu m2, [r2 + 16]
224 psadbw m1, [r0]
225 psadbw m2, [r0 + 16]
226 paddd m1, m2
227 paddd m0, m1
228 lea r2, [r2 + r3]
229 lea r0, [r0 + r1]
230 movu m1, [r2]
231 movu m2, [r2 + 16]
232 psadbw m1, [r0]
233 psadbw m2, [r0 + 16]
234 paddd m1, m2
235 paddd m0, m1
236 lea r2, [r2 + r3]
237 lea r0, [r0 + r1]
238 movu m1, [r2]
239 movu m2, [r2 + 16]
240 psadbw m1, [r0]
241 psadbw m2, [r0 + 16]
242 paddd m1, m2
243 paddd m0, m1
244 lea r2, [r2 + r3]
245 lea r0, [r0 + r1]
246%endmacro
247
248%macro PROCESS_SAD_48x4 0
249 movu m1, [r2]
250 movu m2, [r2 + 16]
251 movu m3, [r2 + 32]
252 psadbw m1, [r0]
253 psadbw m2, [r0 + 16]
254 psadbw m3, [r0 + 32]
255 paddd m1, m2
256 paddd m0, m1
257 paddd m0, m3
258 lea r2, [r2 + r3]
259 lea r0, [r0 + r1]
260
261 movu m1, [r2]
262 movu m2, [r2 + 16]
263 movu m3, [r2 + 32]
264 psadbw m1, [r0]
265 psadbw m2, [r0 + 16]
266 psadbw m3, [r0 + 32]
267 paddd m1, m2
268 paddd m0, m1
269 paddd m0, m3
270 lea r2, [r2 + r3]
271 lea r0, [r0 + r1]
272
273 movu m1, [r2]
274 movu m2, [r2 + 16]
275 movu m3, [r2 + 32]
276 psadbw m1, [r0]
277 psadbw m2, [r0 + 16]
278 psadbw m3, [r0 + 32]
279 paddd m1, m2
280 paddd m0, m1
281 paddd m0, m3
282 lea r2, [r2 + r3]
283 lea r0, [r0 + r1]
284
285 movu m1, [r2]
286 movu m2, [r2 + 16]
287 movu m3, [r2 + 32]
288 psadbw m1, [r0]
289 psadbw m2, [r0 + 16]
290 psadbw m3, [r0 + 32]
291 paddd m1, m2
292 paddd m0, m1
293 paddd m0, m3
294%endmacro
295
296%macro PROCESS_SAD_8x4 0
297 movq m1, [r2]
298 movq m2, [r2 + r3]
299 lea r2, [r2 + 2 * r3]
300 movq m3, [r0]
301 movq m4, [r0 + r1]
302 lea r0, [r0 + 2 * r1]
303 punpcklqdq m1, m2
304 punpcklqdq m3, m4
305 psadbw m1, m3
306 paddd m0, m1
307 movq m1, [r2]
308 movq m2, [r2 + r3]
309 lea r2, [r2 + 2 * r3]
310 movq m3, [r0]
311 movq m4, [r0 + r1]
312 lea r0, [r0 + 2 * r1]
313 punpcklqdq m1, m2
314 punpcklqdq m3, m4
315 psadbw m1, m3
316 paddd m0, m1
317%endmacro
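; PROCESS_SAD_8x4 packs two 8-byte rows into a single XMM register with
; punpcklqdq so that one psadbw covers two rows at once. One iteration in
; SSE2 intrinsics (illustrative assumption; pointer names are placeholders):
;
;   __m128i f = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)fenc),
;                                  _mm_loadl_epi64((const __m128i *)(fenc + fstride)));
;   __m128i r = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)ref),
;                                  _mm_loadl_epi64((const __m128i *)(ref + rstride)));
;   acc = _mm_add_epi32(acc, _mm_sad_epu8(f, r));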
318
319%macro PROCESS_SAD_64x4 0
320 movu m1, [r2]
321 movu m2, [r2 + 16]
322 movu m3, [r2 + 32]
323 movu m4, [r2 + 48]
324 psadbw m1, [r0]
325 psadbw m2, [r0 + 16]
326 psadbw m3, [r0 + 32]
327 psadbw m4, [r0 + 48]
328 paddd m1, m2
329 paddd m3, m4
330 paddd m0, m1
331 paddd m0, m3
332 lea r2, [r2 + r3]
333 lea r0, [r0 + r1]
334
335 movu m1, [r2]
336 movu m2, [r2 + 16]
337 movu m3, [r2 + 32]
338 movu m4, [r2 + 48]
339 psadbw m1, [r0]
340 psadbw m2, [r0 + 16]
341 psadbw m3, [r0 + 32]
342 psadbw m4, [r0 + 48]
343 paddd m1, m2
344 paddd m3, m4
345 paddd m0, m1
346 paddd m0, m3
347 lea r2, [r2 + r3]
348 lea r0, [r0 + r1]
349
350 movu m1, [r2]
351 movu m2, [r2 + 16]
352 movu m3, [r2 + 32]
353 movu m4, [r2 + 48]
354 psadbw m1, [r0]
355 psadbw m2, [r0 + 16]
356 psadbw m3, [r0 + 32]
357 psadbw m4, [r0 + 48]
358 paddd m1, m2
359 paddd m3, m4
360 paddd m0, m1
361 paddd m0, m3
362 lea r2, [r2 + r3]
363 lea r0, [r0 + r1]
364
365 movu m1, [r2]
366 movu m2, [r2 + 16]
367 movu m3, [r2 + 32]
368 movu m4, [r2 + 48]
369 psadbw m1, [r0]
370 psadbw m2, [r0 + 16]
371 psadbw m3, [r0 + 32]
372 psadbw m4, [r0 + 48]
373 paddd m1, m2
374 paddd m3, m4
375 paddd m0, m1
376 paddd m0, m3
377 lea r2, [r2 + r3]
378 lea r0, [r0 + r1]
379%endmacro
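; The PROCESS_SAD_{16,24,32,48,64}x4 macros above all reduce to the same
; inner step, tiling each row into 16-byte psadbw operations (width 24 pairs
; one 16-byte load with two 8-byte loads joined by punpcklqdq). Per tile, in
; SSE2 intrinsics (illustrative sketch only):
;
;   acc = _mm_add_epi32(acc, _mm_sad_epu8(
;             _mm_loadu_si128((const __m128i *)(ref  + x)),
;             _mm_loadu_si128((const __m128i *)(fenc + x))));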
380
381%macro SAD_W16 0
382;-----------------------------------------------------------------------------
383; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
384;-----------------------------------------------------------------------------
385cglobal pixel_sad_16x16, 4,4,8
386 movu m0, [r2]
387 movu m1, [r2+r3]
388 lea r2, [r2+2*r3]
389 movu m2, [r2]
390 movu m3, [r2+r3]
391 lea r2, [r2+2*r3]
392 psadbw m0, [r0]
393 psadbw m1, [r0+r1]
394 lea r0, [r0+2*r1]
395 movu m4, [r2]
396 paddw m0, m1
397 psadbw m2, [r0]
398 psadbw m3, [r0+r1]
399 lea r0, [r0+2*r1]
400 movu m5, [r2+r3]
401 lea r2, [r2+2*r3]
402 paddw m2, m3
403 movu m6, [r2]
404 movu m7, [r2+r3]
405 lea r2, [r2+2*r3]
406 paddw m0, m2
407 psadbw m4, [r0]
408 psadbw m5, [r0+r1]
409 lea r0, [r0+2*r1]
410 movu m1, [r2]
411 paddw m4, m5
412 psadbw m6, [r0]
413 psadbw m7, [r0+r1]
414 lea r0, [r0+2*r1]
415 movu m2, [r2+r3]
416 lea r2, [r2+2*r3]
417 paddw m6, m7
418 movu m3, [r2]
419 paddw m0, m4
420 movu m4, [r2+r3]
421 lea r2, [r2+2*r3]
422 paddw m0, m6
423 psadbw m1, [r0]
424 psadbw m2, [r0+r1]
425 lea r0, [r0+2*r1]
426 movu m5, [r2]
427 paddw m1, m2
428 psadbw m3, [r0]
429 psadbw m4, [r0+r1]
430 lea r0, [r0+2*r1]
431 movu m6, [r2+r3]
432 lea r2, [r2+2*r3]
433 paddw m3, m4
434 movu m7, [r2]
435 paddw m0, m1
436 movu m1, [r2+r3]
437 paddw m0, m3
438 psadbw m5, [r0]
439 psadbw m6, [r0+r1]
440 lea r0, [r0+2*r1]
441 paddw m5, m6
442 psadbw m7, [r0]
443 psadbw m1, [r0+r1]
444 paddw m7, m1
445 paddw m0, m5
446 paddw m0, m7
447 SAD_END_SSE2
448
449;-----------------------------------------------------------------------------
450; int pixel_sad_16x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
451;-----------------------------------------------------------------------------
452cglobal pixel_sad_16x8, 4,4
453 movu m0, [r2]
454 movu m2, [r2+r3]
455 lea r2, [r2+2*r3]
456 movu m3, [r2]
457 movu m4, [r2+r3]
458 psadbw m0, [r0]
459 psadbw m2, [r0+r1]
460 lea r0, [r0+2*r1]
461 psadbw m3, [r0]
462 psadbw m4, [r0+r1]
463 lea r0, [r0+2*r1]
464 lea r2, [r2+2*r3]
465 paddw m0, m2
466 paddw m3, m4
467 paddw m0, m3
468 movu m1, [r2]
469 movu m2, [r2+r3]
470 lea r2, [r2+2*r3]
471 movu m3, [r2]
472 movu m4, [r2+r3]
473 psadbw m1, [r0]
474 psadbw m2, [r0+r1]
475 lea r0, [r0+2*r1]
476 psadbw m3, [r0]
477 psadbw m4, [r0+r1]
478 lea r0, [r0+2*r1]
479 lea r2, [r2+2*r3]
480 paddw m1, m2
481 paddw m3, m4
482 paddw m0, m1
483 paddw m0, m3
484 SAD_END_SSE2
485
486;-----------------------------------------------------------------------------
487; int pixel_sad_16x12( uint8_t *, intptr_t, uint8_t *, intptr_t )
488;-----------------------------------------------------------------------------
489cglobal pixel_sad_16x12, 4,4,3
490 pxor m0, m0
491
492 PROCESS_SAD_16x4
493 PROCESS_SAD_16x4
494 PROCESS_SAD_16x4
495
496 movhlps m1, m0
497 paddd m0, m1
498 movd eax, m0
499 RET
500
501;-----------------------------------------------------------------------------
502; int pixel_sad_16x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
503;-----------------------------------------------------------------------------
504cglobal pixel_sad_16x32, 4,5,3
505 pxor m0, m0
506 mov r4d, 4
507.loop:
508 PROCESS_SAD_16x4
509 PROCESS_SAD_16x4
510 dec r4d
511 jnz .loop
512
513 movhlps m1, m0
514 paddd m0, m1
515 movd eax, m0
516 RET
517
518;-----------------------------------------------------------------------------
519; int pixel_sad_16x64( uint8_t *, intptr_t, uint8_t *, intptr_t )
520;-----------------------------------------------------------------------------
521cglobal pixel_sad_16x64, 4,5,3
522 pxor m0, m0
523 mov r4d, 8
524.loop:
525 PROCESS_SAD_16x4
526 PROCESS_SAD_16x4
527 dec r4d
528 jnz .loop
529
530 movhlps m1, m0
531 paddd m0, m1
532 movd eax, m0
533 RET
534
535;-----------------------------------------------------------------------------
536; int pixel_sad_16x4( uint8_t *, intptr_t, uint8_t *, intptr_t )
537;-----------------------------------------------------------------------------
538cglobal pixel_sad_16x4, 4,4,3
539
540 movu m0, [r2]
541 movu m1, [r2 + r3]
542 psadbw m0, [r0]
543 psadbw m1, [r0 + r1]
544 paddd m0, m1
545 lea r2, [r2 + 2 * r3]
546 lea r0, [r0 + 2 * r1]
547 movu m1, [r2]
548 movu m2, [r2 + r3]
549 psadbw m1, [r0]
550 psadbw m2, [r0 + r1]
551 paddd m1, m2
552 paddd m0, m1
553
554 movhlps m1, m0
555 paddd m0, m1
556 movd eax, m0
557 RET
558
559;-----------------------------------------------------------------------------
560; int pixel_sad_32x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
561;-----------------------------------------------------------------------------
562cglobal pixel_sad_32x8, 4,4,3
563 pxor m0, m0
564
565 PROCESS_SAD_32x4
566 PROCESS_SAD_32x4
567
568 movhlps m1, m0
569 paddd m0, m1
570 movd eax, m0
571 RET
572
573;-----------------------------------------------------------------------------
574; int pixel_sad_32x24( uint8_t *, intptr_t, uint8_t *, intptr_t )
575;-----------------------------------------------------------------------------
576cglobal pixel_sad_32x24, 4,5,3
577 pxor m0, m0
578 mov r4d, 3
579.loop:
580 PROCESS_SAD_32x4
581 PROCESS_SAD_32x4
582 dec r4d
583 jnz .loop
584
585 movhlps m1, m0
586 paddd m0, m1
587 movd eax, m0
588 RET
589
590;-----------------------------------------------------------------------------
591; int pixel_sad_32x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
592;-----------------------------------------------------------------------------
593cglobal pixel_sad_32x32, 4,5,3
594 pxor m0, m0
595 mov r4d, 4
596.loop:
597 PROCESS_SAD_32x4
598 PROCESS_SAD_32x4
599 dec r4d
600 jnz .loop
601
602 movhlps m1, m0
603 paddd m0, m1
604 movd eax, m0
605 RET
606
607;-----------------------------------------------------------------------------
608; int pixel_sad_32x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
609;-----------------------------------------------------------------------------
610cglobal pixel_sad_32x16, 4,4,3
611 pxor m0, m0
612
613 PROCESS_SAD_32x4
614 PROCESS_SAD_32x4
615 PROCESS_SAD_32x4
616 PROCESS_SAD_32x4
617
618 movhlps m1, m0
619 paddd m0, m1
620 movd eax, m0
621 RET
622
623;-----------------------------------------------------------------------------
624; int pixel_sad_32x64( uint8_t *, intptr_t, uint8_t *, intptr_t )
625;-----------------------------------------------------------------------------
626cglobal pixel_sad_32x64, 4,5,3
627 pxor m0, m0
628 mov r4d, 8
629.loop:
630 PROCESS_SAD_32x4
631 PROCESS_SAD_32x4
632 dec r4d
633 jnz .loop
634
635 movhlps m1, m0
636 paddd m0, m1
637 movd eax, m0
638 RET
639
640;-----------------------------------------------------------------------------
641; int pixel_sad_8x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
642;-----------------------------------------------------------------------------
643cglobal pixel_sad_8x32, 4,5,3
644 pxor m0, m0
645 mov r4d, 4
646.loop:
647 PROCESS_SAD_8x4
648 PROCESS_SAD_8x4
649 dec r4d
650 jnz .loop
651
652 movhlps m1, m0
653 paddd m0, m1
654 movd eax, m0
655 RET
656
657;-----------------------------------------------------------------------------
658; int pixel_sad_64x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
659;-----------------------------------------------------------------------------
660cglobal pixel_sad_64x16, 4,4,5
661 pxor m0, m0
662
663 PROCESS_SAD_64x4
664 PROCESS_SAD_64x4
665 PROCESS_SAD_64x4
666 PROCESS_SAD_64x4
667
668 movhlps m1, m0
669 paddd m0, m1
670 movd eax, m0
671 RET
672
673;-----------------------------------------------------------------------------
674; int pixel_sad_64x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
675;-----------------------------------------------------------------------------
676cglobal pixel_sad_64x32, 4,5,5
677 pxor m0, m0
678 mov r4, 4
679
680.loop:
681 PROCESS_SAD_64x4
682 PROCESS_SAD_64x4
683
684 dec r4
685 jnz .loop
686
687 movhlps m1, m0
688 paddd m0, m1
689 movd eax, m0
690 RET
691
692;-----------------------------------------------------------------------------
693; int pixel_sad_64x48( uint8_t *, intptr_t, uint8_t *, intptr_t )
694;-----------------------------------------------------------------------------
695cglobal pixel_sad_64x48, 4,5,5
696 pxor m0, m0
697 mov r4, 6
698
699.loop:
700 PROCESS_SAD_64x4
701 PROCESS_SAD_64x4
702 dec r4d
703 jnz .loop
704
705 movhlps m1, m0
706 paddd m0, m1
707 movd eax, m0
708 RET
709
710;-----------------------------------------------------------------------------
711; int pixel_sad_64x64( uint8_t *, intptr_t, uint8_t *, intptr_t )
712;-----------------------------------------------------------------------------
713cglobal pixel_sad_64x64, 4,5,5
714 pxor m0, m0
715 mov r4, 8
716
717.loop:
718 PROCESS_SAD_64x4
719 PROCESS_SAD_64x4
720 dec r4
721 jnz .loop
722
723 movhlps m1, m0
724 paddd m0, m1
725 movd eax, m0
726 RET
727
728;-----------------------------------------------------------------------------
729; int pixel_sad_48x64( uint8_t *, intptr_t, uint8_t *, intptr_t )
730;-----------------------------------------------------------------------------
731cglobal pixel_sad_48x64, 4,5,5
732 pxor m0, m0
733 mov r4, 64
734
735.loop:
736 PROCESS_SAD_48x4
737 lea r2, [r2 + r3]
738 lea r0, [r0 + r1]
739
740 PROCESS_SAD_48x4
741 lea r2, [r2 + r3]
742 lea r0, [r0 + r1]
743
744 sub r4, 8
745 cmp r4, 8
746
747    jnz .loop
748 PROCESS_SAD_48x4
749 lea r2, [r2 + r3]
750 lea r0, [r0 + r1]
751 PROCESS_SAD_48x4
752
753 movhlps m1, m0
754 paddd m0, m1
755 movd eax, m0
756 RET
757
758;-----------------------------------------------------------------------------
759; int pixel_sad_24x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
760;-----------------------------------------------------------------------------
761cglobal pixel_sad_24x32, 4,5,4
762 pxor m0, m0
763 mov r4, 32
764
765.loop:
766 PROCESS_SAD_24x4
767 lea r2, [r2 + r3]
768 lea r0, [r0 + r1]
769 PROCESS_SAD_24x4
770 lea r2, [r2 + r3]
771 lea r0, [r0 + r1]
772 sub r4, 8
773 cmp r4, 8
774    jnz .loop
775 PROCESS_SAD_24x4
776 lea r2, [r2 + r3]
777 lea r0, [r0 + r1]
778 PROCESS_SAD_24x4
779
780 movhlps m1, m0
781 paddd m0, m1
782 movd eax, m0
783 RET
784
785;-----------------------------------------------------------------------------
786; int pixel_sad_12x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
787;-----------------------------------------------------------------------------
788cglobal pixel_sad_12x16, 4,4,4
789 mova m4, [MSK]
790 pxor m0, m0
791
792 PROCESS_SAD_12x4
793 lea r2, [r2 + r3]
794 lea r0, [r0 + r1]
795 PROCESS_SAD_12x4
796 lea r2, [r2 + r3]
797 lea r0, [r0 + r1]
798 PROCESS_SAD_12x4
799 lea r2, [r2 + r3]
800 lea r0, [r0 + r1]
801 PROCESS_SAD_12x4
802
803 movhlps m1, m0
804 paddd m0, m1
805 movd eax, m0
806 RET
807
808%endmacro
809
810INIT_XMM sse2
811SAD_W16
812INIT_XMM sse3
813SAD_W16
814INIT_XMM sse2, aligned
815SAD_W16
816
817%macro SAD_INC_4x8P_SSE 1
818 movq m1, [r0]
819 movq m2, [r0+r1]
820 lea r0, [r0+2*r1]
821 movq m3, [r2]
822 movq m4, [r2+r3]
823 lea r2, [r2+2*r3]
824 movhps m1, [r0]
825 movhps m2, [r0+r1]
826 movhps m3, [r2]
827 movhps m4, [r2+r3]
828 lea r0, [r0+2*r1]
829 psadbw m1, m3
830 psadbw m2, m4
831 lea r2, [r2+2*r3]
832 ACCUM paddw, 0, 1, %1
833 paddw m0, m2
834%endmacro
835
836INIT_XMM
837; Even on Nehalem, no sizes other than 8x16 benefit from this method.
838cglobal pixel_sad_8x16_sse2, 4,4
839 SAD_INC_4x8P_SSE 0
840 SAD_INC_4x8P_SSE 1
841 SAD_INC_4x8P_SSE 1
842 SAD_INC_4x8P_SSE 1
843 SAD_END_SSE2
844 RET
845
846;=============================================================================
847; SAD x3/x4 MMX
848;=============================================================================
849
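; The x3/x4 kernels score one encoder block (stored at the fixed FENC_STRIDE)
; against three or four reference candidates that share a single stride, and
; write every result in one pass so each fenc load is reused. A plain-C
; sketch of the x3 contract (illustrative only; sad_WxH as sketched above):
;
;   void sad_x3_WxH(const uint8_t *fenc, const uint8_t *pix0,
;                   const uint8_t *pix1, const uint8_t *pix2,
;                   intptr_t stride, int scores[3])
;   {
;       scores[0] = sad_WxH(fenc, FENC_STRIDE, pix0, stride);
;       scores[1] = sad_WxH(fenc, FENC_STRIDE, pix1, stride);
;       scores[2] = sad_WxH(fenc, FENC_STRIDE, pix2, stride);
;   }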
850%macro SAD_X3_START_1x8P 0
851 movq mm3, [r0]
852 movq mm0, [r1]
853 movq mm1, [r2]
854 movq mm2, [r3]
855 psadbw mm0, mm3
856 psadbw mm1, mm3
857 psadbw mm2, mm3
858%endmacro
859
860%macro SAD_X3_1x8P 2
861 movq mm3, [r0+%1]
862 movq mm4, [r1+%2]
863 movq mm5, [r2+%2]
864 movq mm6, [r3+%2]
865 psadbw mm4, mm3
866 psadbw mm5, mm3
867 psadbw mm6, mm3
868 paddw mm0, mm4
869 paddw mm1, mm5
870 paddw mm2, mm6
871%endmacro
872
873%macro SAD_X3_START_2x4P 3
874 movd mm3, [r0]
875 movd %1, [r1]
876 movd %2, [r2]
877 movd %3, [r3]
878 punpckldq mm3, [r0+FENC_STRIDE]
879 punpckldq %1, [r1+r4]
880 punpckldq %2, [r2+r4]
881 punpckldq %3, [r3+r4]
882 psadbw %1, mm3
883 psadbw %2, mm3
884 psadbw %3, mm3
885%endmacro
886
887%macro SAD_X3_2x16P 1
888%if %1
889 SAD_X3_START_1x8P
890%else
891 SAD_X3_1x8P 0, 0
892%endif
893 SAD_X3_1x8P 8, 8
894 SAD_X3_1x8P FENC_STRIDE, r4
895 SAD_X3_1x8P FENC_STRIDE+8, r4+8
896 add r0, 2*FENC_STRIDE
897 lea r1, [r1+2*r4]
898 lea r2, [r2+2*r4]
899 lea r3, [r3+2*r4]
900%endmacro
901
902%macro SAD_X3_2x8P 1
903%if %1
904 SAD_X3_START_1x8P
905%else
906 SAD_X3_1x8P 0, 0
907%endif
908 SAD_X3_1x8P FENC_STRIDE, r4
909 add r0, 2*FENC_STRIDE
910 lea r1, [r1+2*r4]
911 lea r2, [r2+2*r4]
912 lea r3, [r3+2*r4]
913%endmacro
914
915%macro SAD_X3_2x4P 1
916%if %1
917 SAD_X3_START_2x4P mm0, mm1, mm2
918%else
919 SAD_X3_START_2x4P mm4, mm5, mm6
920 paddw mm0, mm4
921 paddw mm1, mm5
922 paddw mm2, mm6
923%endif
924 add r0, 2*FENC_STRIDE
925 lea r1, [r1+2*r4]
926 lea r2, [r2+2*r4]
927 lea r3, [r3+2*r4]
928%endmacro
929
930%macro SAD_X4_START_1x8P 0
931 movq mm7, [r0]
932 movq mm0, [r1]
933 movq mm1, [r2]
934 movq mm2, [r3]
935 movq mm3, [r4]
936 psadbw mm0, mm7
937 psadbw mm1, mm7
938 psadbw mm2, mm7
939 psadbw mm3, mm7
940%endmacro
941
942%macro SAD_X4_1x8P 2
943 movq mm7, [r0+%1]
944 movq mm4, [r1+%2]
945 movq mm5, [r2+%2]
946 movq mm6, [r3+%2]
947 psadbw mm4, mm7
948 psadbw mm5, mm7
949 psadbw mm6, mm7
950 psadbw mm7, [r4+%2]
951 paddw mm0, mm4
952 paddw mm1, mm5
953 paddw mm2, mm6
954 paddw mm3, mm7
955%endmacro
956
957%macro SAD_X4_START_2x4P 0
958 movd mm7, [r0]
959 movd mm0, [r1]
960 movd mm1, [r2]
961 movd mm2, [r3]
962 movd mm3, [r4]
963 punpckldq mm7, [r0+FENC_STRIDE]
964 punpckldq mm0, [r1+r5]
965 punpckldq mm1, [r2+r5]
966 punpckldq mm2, [r3+r5]
967 punpckldq mm3, [r4+r5]
968 psadbw mm0, mm7
969 psadbw mm1, mm7
970 psadbw mm2, mm7
971 psadbw mm3, mm7
972%endmacro
973
974%macro SAD_X4_INC_2x4P 0
975 movd mm7, [r0]
976 movd mm4, [r1]
977 movd mm5, [r2]
978 punpckldq mm7, [r0+FENC_STRIDE]
979 punpckldq mm4, [r1+r5]
980 punpckldq mm5, [r2+r5]
981 psadbw mm4, mm7
982 psadbw mm5, mm7
983 paddw mm0, mm4
984 paddw mm1, mm5
985 movd mm4, [r3]
986 movd mm5, [r4]
987 punpckldq mm4, [r3+r5]
988 punpckldq mm5, [r4+r5]
989 psadbw mm4, mm7
990 psadbw mm5, mm7
991 paddw mm2, mm4
992 paddw mm3, mm5
993%endmacro
994
995%macro SAD_X4_2x16P 1
996%if %1
997 SAD_X4_START_1x8P
998%else
999 SAD_X4_1x8P 0, 0
1000%endif
1001 SAD_X4_1x8P 8, 8
1002 SAD_X4_1x8P FENC_STRIDE, r5
1003 SAD_X4_1x8P FENC_STRIDE+8, r5+8
1004 add r0, 2*FENC_STRIDE
1005 lea r1, [r1+2*r5]
1006 lea r2, [r2+2*r5]
1007 lea r3, [r3+2*r5]
1008 lea r4, [r4+2*r5]
1009%endmacro
1010
1011%macro SAD_X4_2x8P 1
1012%if %1
1013 SAD_X4_START_1x8P
1014%else
1015 SAD_X4_1x8P 0, 0
1016%endif
1017 SAD_X4_1x8P FENC_STRIDE, r5
1018 add r0, 2*FENC_STRIDE
1019 lea r1, [r1+2*r5]
1020 lea r2, [r2+2*r5]
1021 lea r3, [r3+2*r5]
1022 lea r4, [r4+2*r5]
1023%endmacro
1024
1025%macro SAD_X4_2x4P 1
1026%if %1
1027 SAD_X4_START_2x4P
1028%else
1029 SAD_X4_INC_2x4P
1030%endif
1031 add r0, 2*FENC_STRIDE
1032 lea r1, [r1+2*r5]
1033 lea r2, [r2+2*r5]
1034 lea r3, [r3+2*r5]
1035 lea r4, [r4+2*r5]
1036%endmacro
1037
1038%macro SAD_X3_END 0
1039%if UNIX64
1040 movd [r5+0], mm0
1041 movd [r5+4], mm1
1042 movd [r5+8], mm2
1043%else
1044 mov r0, r5mp
1045 movd [r0+0], mm0
1046 movd [r0+4], mm1
1047 movd [r0+8], mm2
1048%endif
1049 RET
1050%endmacro
1051
1052%macro SAD_X4_END 0
1053 mov r0, r6mp
1054 movd [r0+0], mm0
1055 movd [r0+4], mm1
1056 movd [r0+8], mm2
1057 movd [r0+12], mm3
1058 RET
1059%endmacro
1060
1061%macro SAD_X3_12x4 0
1062 mova m3, [r0]
1063 movu m5, [r1]
1064 pand m3, m4
1065 pand m5, m4
1066 psadbw m5, m3
1067 paddd m0, m5
1068 movu m5, [r2]
1069 pand m5, m4
1070 psadbw m5, m3
1071 paddd m1, m5
1072 movu m5, [r3]
1073 pand m5, m4
1074 psadbw m5, m3
1075 paddd m2, m5
1076 mova m3, [r0 + FENC_STRIDE]
1077 movu m5, [r1 + r4]
1078 pand m3, m4
1079 pand m5, m4
1080 psadbw m5, m3
1081 paddd m0, m5
1082 movu m5, [r2 + r4]
1083 pand m5, m4
1084 psadbw m5, m3
1085 paddd m1, m5
1086 movu m5, [r3 + r4]
1087 pand m5, m4
1088 psadbw m5, m3
1089 paddd m2, m5
1090 mova m3, [r0 + FENC_STRIDE * 2]
1091 movu m5, [r1 + r4 * 2]
1092 pand m3, m4
1093 pand m5, m4
1094 psadbw m5, m3
1095 paddd m0, m5
1096 movu m5, [r2 + r4 * 2]
1097 pand m5, m4
1098 psadbw m5, m3
1099 paddd m1, m5
1100 movu m5, [r3 + r4 * 2]
1101 pand m5, m4
1102 psadbw m5, m3
1103 paddd m2, m5
1104 lea r1, [r1 + r4 * 2]
1105 lea r2, [r2 + r4 * 2]
1106 lea r3, [r3 + r4 * 2]
1107 mova m3, [r0 + FENC_STRIDE + FENC_STRIDE * 2]
1108 movu m5, [r1 + r4]
1109 pand m3, m4
1110 pand m5, m4
1111 psadbw m5, m3
1112 paddd m0, m5
1113 movu m5, [r2 + r4]
1114 pand m5, m4
1115 psadbw m5, m3
1116 paddd m1, m5
1117 movu m5, [r3 + r4]
1118 pand m5, m4
1119 psadbw m5, m3
1120 paddd m2, m5
1121 lea r0, [r0 + FENC_STRIDE * 4]
1122 lea r1, [r1 + r4 * 2]
1123 lea r2, [r2 + r4 * 2]
1124 lea r3, [r3 + r4 * 2]
1125%endmacro
1126
1127%macro SAD_X4_12x4 0
1128 mova m4, [r0]
1129 movu m5, [r1]
1130 pand m4, m6
1131 pand m5, m6
1132 psadbw m5, m4
1133 paddd m0, m5
1134 movu m5, [r2]
1135 pand m5, m6
1136 psadbw m5, m4
1137 paddd m1, m5
1138 movu m5, [r3]
1139 pand m5, m6
1140 psadbw m5, m4
1141 paddd m2, m5
1142 movu m5, [r4]
1143 pand m5, m6
1144 psadbw m5, m4
1145 paddd m3, m5
1146 mova m4, [r0 + FENC_STRIDE]
1147 movu m5, [r1 + r5]
1148 pand m4, m6
1149 pand m5, m6
1150 psadbw m5, m4
1151 paddd m0, m5
1152 movu m5, [r2 + r5]
1153 pand m5, m6
1154 psadbw m5, m4
1155 paddd m1, m5
1156 movu m5, [r3 + r5]
1157 pand m5, m6
1158 psadbw m5, m4
1159 paddd m2, m5
1160 movu m5, [r4 + r5]
1161 pand m5, m6
1162 psadbw m5, m4
1163 paddd m3, m5
1164 mova m4, [r0 + FENC_STRIDE * 2]
1165 movu m5, [r1 + r5 * 2]
1166 pand m4, m6
1167 pand m5, m6
1168 psadbw m5, m4
1169 paddd m0, m5
1170 movu m5, [r2 + r5 * 2]
1171 pand m5, m6
1172 psadbw m5, m4
1173 paddd m1, m5
1174 movu m5, [r3 + r5 * 2]
1175 pand m5, m6
1176 psadbw m5, m4
1177 paddd m2, m5
1178 movu m5, [r4 + r5 * 2]
1179 pand m5, m6
1180 psadbw m5, m4
1181 paddd m3, m5
1182 lea r1, [r1 + r5 * 2]
1183 lea r2, [r2 + r5 * 2]
1184 lea r3, [r3 + r5 * 2]
1185 lea r4, [r4 + r5 * 2]
1186 mova m4, [r0 + FENC_STRIDE + FENC_STRIDE * 2]
1187 movu m5, [r1 + r5]
1188 pand m4, m6
1189 pand m5, m6
1190 psadbw m5, m4
1191 paddd m0, m5
1192 movu m5, [r2 + r5]
1193 pand m5, m6
1194 psadbw m5, m4
1195 paddd m1, m5
1196 movu m5, [r3 + r5]
1197 pand m5, m6
1198 psadbw m5, m4
1199 paddd m2, m5
1200 movu m5, [r4 + r5]
1201 pand m5, m6
1202 psadbw m5, m4
1203 paddd m3, m5
1204 lea r0, [r0 + FENC_STRIDE * 4]
1205 lea r1, [r1 + r5 * 2]
1206 lea r2, [r2 + r5 * 2]
1207 lea r3, [r3 + r5 * 2]
1208 lea r4, [r4 + r5 * 2]
1209%endmacro
1210
1211%macro SAD_X3_24x4 0
1212 mova m3, [r0]
1213 mova m4, [r0 + 16]
1214 movu m5, [r1]
1215 movu m6, [r1 + 16]
1216 psadbw m5, m3
1217 psadbw m6, m4
1218 pshufd m6, m6, 84
1219 paddd m5, m6
1220 paddd m0, m5
1221 movu m5, [r2]
1222 movu m6, [r2 + 16]
1223 psadbw m5, m3
1224 psadbw m6, m4
1225 pshufd m6, m6, 84
1226 paddd m5, m6
1227 paddd m1, m5
1228 movu m5, [r3]
1229 movu m6, [r3 + 16]
1230 psadbw m5, m3
1231 psadbw m6, m4
1232 pshufd m6, m6, 84
1233 paddd m5, m6
1234 paddd m2, m5
1235
1236 mova m3, [r0 + FENC_STRIDE]
1237 mova m4, [r0 + 16 + FENC_STRIDE]
1238 movu m5, [r1 + r4]
1239 movu m6, [r1 + 16 + r4]
1240 psadbw m5, m3
1241 psadbw m6, m4
1242 pshufd m6, m6, 84
1243 paddd m5, m6
1244 paddd m0, m5
1245 movu m5, [r2 + r4]
1246 movu m6, [r2 + 16 + r4]
1247 psadbw m5, m3
1248 psadbw m6, m4
1249 pshufd m6, m6, 84
1250 paddd m5, m6
1251 paddd m1, m5
1252 movu m5, [r3 + r4]
1253 movu m6, [r3 + 16 + r4]
1254 psadbw m5, m3
1255 psadbw m6, m4
1256 pshufd m6, m6, 84
1257 paddd m5, m6
1258 paddd m2, m5
1259
1260 mova m3, [r0 + FENC_STRIDE * 2]
1261 mova m4, [r0 + 16 + FENC_STRIDE * 2]
1262 movu m5, [r1 + r4 * 2]
1263 movu m6, [r1 + 16 + r4 * 2]
1264 psadbw m5, m3
1265 psadbw m6, m4
1266 pshufd m6, m6, 84
1267 paddd m5, m6
1268 paddd m0, m5
1269 movu m5, [r2 + r4 * 2]
1270 movu m6, [r2 + 16 + r4 * 2]
1271 psadbw m5, m3
1272 psadbw m6, m4
1273 pshufd m6, m6, 84
1274 paddd m5, m6
1275 paddd m1, m5
1276 movu m5, [r3 + r4 * 2]
1277 movu m6, [r3 + 16 + r4 * 2]
1278 psadbw m5, m3
1279 psadbw m6, m4
1280 pshufd m6, m6, 84
1281 paddd m5, m6
1282 paddd m2, m5
1283 lea r0, [r0 + FENC_STRIDE * 2]
1284 lea r1, [r1 + r4 * 2]
1285 lea r2, [r2 + r4 * 2]
1286 lea r3, [r3 + r4 * 2]
1287
1288 mova m3, [r0 + FENC_STRIDE]
1289 mova m4, [r0 + 16 + FENC_STRIDE]
1290 movu m5, [r1 + r4]
1291 movu m6, [r1 + 16 + r4]
1292 psadbw m5, m3
1293 psadbw m6, m4
1294 pshufd m6, m6, 84
1295 paddd m5, m6
1296 paddd m0, m5
1297 movu m5, [r2 + r4]
1298 movu m6, [r2 + 16 + r4]
1299 psadbw m5, m3
1300 psadbw m6, m4
1301 pshufd m6, m6, 84
1302 paddd m5, m6
1303 paddd m1, m5
1304 movu m5, [r3 + r4]
1305 movu m6, [r3 + 16 + r4]
1306 psadbw m5, m3
1307 psadbw m6, m4
1308 pshufd m6, m6, 84
1309 paddd m5, m6
1310 paddd m2, m5
1311 lea r0, [r0 + FENC_STRIDE * 2]
1312 lea r1, [r1 + r4 * 2]
1313 lea r2, [r2 + r4 * 2]
1314 lea r3, [r3 + r4 * 2]
1315%endmacro
1316
1317%macro SAD_X4_24x4 0
1318 mova m4, [r0]
1319 mova m5, [r0 + 16]
1320 movu m6, [r1]
1321 movu m7, [r1 + 16]
1322 psadbw m6, m4
1323 psadbw m7, m5
1324 pshufd m7, m7, 84
1325 paddd m6, m7
1326 paddd m0, m6
1327 movu m6, [r2]
1328 movu m7, [r2 + 16]
1329 psadbw m6, m4
1330 psadbw m7, m5
1331 pshufd m7, m7, 84
1332 paddd m6, m7
1333 paddd m1, m6
1334 movu m6, [r3]
1335 movu m7, [r3 + 16]
1336 psadbw m6, m4
1337 psadbw m7, m5
1338 pshufd m7, m7, 84
1339 paddd m6, m7
1340 paddd m2, m6
1341 movu m6, [r4]
1342 movu m7, [r4 + 16]
1343 psadbw m6, m4
1344 psadbw m7, m5
1345 pshufd m7, m7, 84
1346 paddd m6, m7
1347 paddd m3, m6
1348
1349 mova m4, [r0 + FENC_STRIDE]
1350 mova m5, [r0 + 16 + FENC_STRIDE]
1351 movu m6, [r1 + r5]
1352 movu m7, [r1 + 16 + r5]
1353 psadbw m6, m4
1354 psadbw m7, m5
1355 pshufd m7, m7, 84
1356 paddd m6, m7
1357 paddd m0, m6
1358 movu m6, [r2 + r5]
1359 movu m7, [r2 + 16 + r5]
1360 psadbw m6, m4
1361 psadbw m7, m5
1362 pshufd m7, m7, 84
1363 paddd m6, m7
1364 paddd m1, m6
1365 movu m6, [r3 + r5]
1366 movu m7, [r3 + 16 + r5]
1367 psadbw m6, m4
1368 psadbw m7, m5
1369 pshufd m7, m7, 84
1370 paddd m6, m7
1371 paddd m2, m6
1372 movu m6, [r4 + r5]
1373 movu m7, [r4 + 16 + r5]
1374 psadbw m6, m4
1375 psadbw m7, m5
1376 pshufd m7, m7, 84
1377 paddd m6, m7
1378 paddd m3, m6
1379
1380 mova m4, [r0 + FENC_STRIDE * 2]
1381 mova m5, [r0 + 16 + FENC_STRIDE * 2]
1382 movu m6, [r1 + r5 * 2]
1383 movu m7, [r1 + 16 + r5 * 2]
1384 psadbw m6, m4
1385 psadbw m7, m5
1386 pshufd m7, m7, 84
1387 paddd m6, m7
1388 paddd m0, m6
1389 movu m6, [r2 + r5 * 2]
1390 movu m7, [r2 + 16 + r5 * 2]
1391 psadbw m6, m4
1392 psadbw m7, m5
1393 pshufd m7, m7, 84
1394 paddd m6, m7
1395 paddd m1, m6
1396 movu m6, [r3 + r5 * 2]
1397 movu m7, [r3 + 16 + r5 * 2]
1398 psadbw m6, m4
1399 psadbw m7, m5
1400 pshufd m7, m7, 84
1401 paddd m6, m7
1402 paddd m2, m6
1403 movu m6, [r4 + r5 * 2]
1404 movu m7, [r4 + 16 + r5 * 2]
1405 psadbw m6, m4
1406 psadbw m7, m5
1407 pshufd m7, m7, 84
1408 paddd m6, m7
1409 paddd m3, m6
1410 lea r0, [r0 + FENC_STRIDE * 2]
1411 lea r1, [r1 + r5 * 2]
1412 lea r2, [r2 + r5 * 2]
1413 lea r3, [r3 + r5 * 2]
1414 lea r4, [r4 + r5 * 2]
1415 mova m4, [r0 + FENC_STRIDE]
1416 mova m5, [r0 + 16 + FENC_STRIDE]
1417 movu m6, [r1 + r5]
1418 movu m7, [r1 + 16 + r5]
1419 psadbw m6, m4
1420 psadbw m7, m5
1421 pshufd m7, m7, 84
1422 paddd m6, m7
1423 paddd m0, m6
1424 movu m6, [r2 + r5]
1425 movu m7, [r2 + 16 + r5]
1426 psadbw m6, m4
1427 psadbw m7, m5
1428 pshufd m7, m7, 84
1429 paddd m6, m7
1430 paddd m1, m6
1431 movu m6, [r3 + r5]
1432 movu m7, [r3 + 16 + r5]
1433 psadbw m6, m4
1434 psadbw m7, m5
1435 pshufd m7, m7, 84
1436 paddd m6, m7
1437 paddd m2, m6
1438 movu m6, [r4 + r5]
1439 movu m7, [r4 + 16 + r5]
1440 psadbw m6, m4
1441 psadbw m7, m5
1442 pshufd m7, m7, 84
1443 paddd m6, m7
1444 paddd m3, m6
1445 lea r0, [r0 + FENC_STRIDE * 2]
1446 lea r1, [r1 + r5 * 2]
1447 lea r2, [r2 + r5 * 2]
1448 lea r3, [r3 + r5 * 2]
1449 lea r4, [r4 + r5 * 2]
1450%endmacro
1451
1452%macro SAD_X3_32x4 0
1453 mova m3, [r0]
1454 mova m4, [r0 + 16]
1455 movu m5, [r1]
1456 movu m6, [r1 + 16]
1457 psadbw m5, m3
1458 psadbw m6, m4
1459 paddd m5, m6
1460 paddd m0, m5
1461 movu m5, [r2]
1462 movu m6, [r2 + 16]
1463 psadbw m5, m3
1464 psadbw m6, m4
1465 paddd m5, m6
1466 paddd m1, m5
1467 movu m5, [r3]
1468 movu m6, [r3 + 16]
1469 psadbw m5, m3
1470 psadbw m6, m4
1471 paddd m5, m6
1472 paddd m2, m5
1473 lea r0, [r0 + FENC_STRIDE]
1474 lea r1, [r1 + r4]
1475 lea r2, [r2 + r4]
1476 lea r3, [r3 + r4]
1477 mova m3, [r0]
1478 mova m4, [r0 + 16]
1479 movu m5, [r1]
1480 movu m6, [r1 + 16]
1481 psadbw m5, m3
1482 psadbw m6, m4
1483 paddd m5, m6
1484 paddd m0, m5
1485 movu m5, [r2]
1486 movu m6, [r2 + 16]
1487 psadbw m5, m3
1488 psadbw m6, m4
1489 paddd m5, m6
1490 paddd m1, m5
1491 movu m5, [r3]
1492 movu m6, [r3 + 16]
1493 psadbw m5, m3
1494 psadbw m6, m4
1495 paddd m5, m6
1496 paddd m2, m5
1497 lea r0, [r0 + FENC_STRIDE]
1498 lea r1, [r1 + r4]
1499 lea r2, [r2 + r4]
1500 lea r3, [r3 + r4]
1501 mova m3, [r0]
1502 mova m4, [r0 + 16]
1503 movu m5, [r1]
1504 movu m6, [r1 + 16]
1505 psadbw m5, m3
1506 psadbw m6, m4
1507 paddd m5, m6
1508 paddd m0, m5
1509 movu m5, [r2]
1510 movu m6, [r2 + 16]
1511 psadbw m5, m3
1512 psadbw m6, m4
1513 paddd m5, m6
1514 paddd m1, m5
1515 movu m5, [r3]
1516 movu m6, [r3 + 16]
1517 psadbw m5, m3
1518 psadbw m6, m4
1519 paddd m5, m6
1520 paddd m2, m5
1521 lea r0, [r0 + FENC_STRIDE]
1522 lea r1, [r1 + r4]
1523 lea r2, [r2 + r4]
1524 lea r3, [r3 + r4]
1525 mova m3, [r0]
1526 mova m4, [r0 + 16]
1527 movu m5, [r1]
1528 movu m6, [r1 + 16]
1529 psadbw m5, m3
1530 psadbw m6, m4
1531 paddd m5, m6
1532 paddd m0, m5
1533 movu m5, [r2]
1534 movu m6, [r2 + 16]
1535 psadbw m5, m3
1536 psadbw m6, m4
1537 paddd m5, m6
1538 paddd m1, m5
1539 movu m5, [r3]
1540 movu m6, [r3 + 16]
1541 psadbw m5, m3
1542 psadbw m6, m4
1543 paddd m5, m6
1544 paddd m2, m5
1545 lea r0, [r0 + FENC_STRIDE]
1546 lea r1, [r1 + r4]
1547 lea r2, [r2 + r4]
1548 lea r3, [r3 + r4]
1549%endmacro
1550
1551%macro SAD_X4_32x4 0
1552 mova m4, [r0]
1553 mova m5, [r0 + 16]
1554 movu m6, [r1]
1555 movu m7, [r1 + 16]
1556 psadbw m6, m4
1557 psadbw m7, m5
1558 paddd m6, m7
1559 paddd m0, m6
1560 movu m6, [r2]
1561 movu m7, [r2 + 16]
1562 psadbw m6, m4
1563 psadbw m7, m5
1564 paddd m6, m7
1565 paddd m1, m6
1566 movu m6, [r3]
1567 movu m7, [r3 + 16]
1568 psadbw m6, m4
1569 psadbw m7, m5
1570 paddd m6, m7
1571 paddd m2, m6
1572 movu m6, [r4]
1573 movu m7, [r4 + 16]
1574 psadbw m6, m4
1575 psadbw m7, m5
1576 paddd m6, m7
1577 paddd m3, m6
1578 lea r0, [r0 + FENC_STRIDE]
1579 lea r1, [r1 + r5]
1580 lea r2, [r2 + r5]
1581 lea r3, [r3 + r5]
1582 lea r4, [r4 + r5]
1583 mova m4, [r0]
1584 mova m5, [r0 + 16]
1585 movu m6, [r1]
1586 movu m7, [r1 + 16]
1587 psadbw m6, m4
1588 psadbw m7, m5
1589 paddd m6, m7
1590 paddd m0, m6
1591 movu m6, [r2]
1592 movu m7, [r2 + 16]
1593 psadbw m6, m4
1594 psadbw m7, m5
1595 paddd m6, m7
1596 paddd m1, m6
1597 movu m6, [r3]
1598 movu m7, [r3 + 16]
1599 psadbw m6, m4
1600 psadbw m7, m5
1601 paddd m6, m7
1602 paddd m2, m6
1603 movu m6, [r4]
1604 movu m7, [r4 + 16]
1605 psadbw m6, m4
1606 psadbw m7, m5
1607 paddd m6, m7
1608 paddd m3, m6
1609 lea r0, [r0 + FENC_STRIDE]
1610 lea r1, [r1 + r5]
1611 lea r2, [r2 + r5]
1612 lea r3, [r3 + r5]
1613 lea r4, [r4 + r5]
1614 mova m4, [r0]
1615 mova m5, [r0 + 16]
1616 movu m6, [r1]
1617 movu m7, [r1 + 16]
1618 psadbw m6, m4
1619 psadbw m7, m5
1620 paddd m6, m7
1621 paddd m0, m6
1622 movu m6, [r2]
1623 movu m7, [r2 + 16]
1624 psadbw m6, m4
1625 psadbw m7, m5
1626 paddd m6, m7
1627 paddd m1, m6
1628 movu m6, [r3]
1629 movu m7, [r3 + 16]
1630 psadbw m6, m4
1631 psadbw m7, m5
1632 paddd m6, m7
1633 paddd m2, m6
1634 movu m6, [r4]
1635 movu m7, [r4 + 16]
1636 psadbw m6, m4
1637 psadbw m7, m5
1638 paddd m6, m7
1639 paddd m3, m6
1640 lea r0, [r0 + FENC_STRIDE]
1641 lea r1, [r1 + r5]
1642 lea r2, [r2 + r5]
1643 lea r3, [r3 + r5]
1644 lea r4, [r4 + r5]
1645 mova m4, [r0]
1646 mova m5, [r0 + 16]
1647 movu m6, [r1]
1648 movu m7, [r1 + 16]
1649 psadbw m6, m4
1650 psadbw m7, m5
1651 paddd m6, m7
1652 paddd m0, m6
1653 movu m6, [r2]
1654 movu m7, [r2 + 16]
1655 psadbw m6, m4
1656 psadbw m7, m5
1657 paddd m6, m7
1658 paddd m1, m6
1659 movu m6, [r3]
1660 movu m7, [r3 + 16]
1661 psadbw m6, m4
1662 psadbw m7, m5
1663 paddd m6, m7
1664 paddd m2, m6
1665 movu m6, [r4]
1666 movu m7, [r4 + 16]
1667 psadbw m6, m4
1668 psadbw m7, m5
1669 paddd m6, m7
1670 paddd m3, m6
1671 lea r0, [r0 + FENC_STRIDE]
1672 lea r1, [r1 + r5]
1673 lea r2, [r2 + r5]
1674 lea r3, [r3 + r5]
1675 lea r4, [r4 + r5]
1676%endmacro
1677
1678%macro SAD_X3_48x4 0
1679 mova m3, [r0]
1680 mova m4, [r0 + 16]
1681 mova m5, [r0 + 32]
1682 movu m6, [r1]
1683 psadbw m6, m3
1684 paddd m0, m6
1685 movu m6, [r1 + 16]
1686 psadbw m6, m4
1687 paddd m0, m6
1688 movu m6, [r1 + 32]
1689 psadbw m6, m5
1690 paddd m0, m6
1691 movu m6, [r2]
1692 psadbw m6, m3
1693 paddd m1, m6
1694 movu m6, [r2 + 16]
1695 psadbw m6, m4
1696 paddd m1, m6
1697 movu m6, [r2 + 32]
1698 psadbw m6, m5
1699 paddd m1, m6
1700 movu m6, [r3]
1701 psadbw m6, m3
1702 paddd m2, m6
1703 movu m6, [r3 + 16]
1704 psadbw m6, m4
1705 paddd m2, m6
1706 movu m6, [r3 + 32]
1707 psadbw m6, m5
1708 paddd m2, m6
1709
1710 mova m3, [r0 + FENC_STRIDE]
1711 mova m4, [r0 + 16 + FENC_STRIDE]
1712 mova m5, [r0 + 32 + FENC_STRIDE]
1713 movu m6, [r1 + r4]
1714 psadbw m6, m3
1715 paddd m0, m6
1716 movu m6, [r1 + 16 + r4]
1717 psadbw m6, m4
1718 paddd m0, m6
1719 movu m6, [r1 + 32 + r4]
1720 psadbw m6, m5
1721 paddd m0, m6
1722 movu m6, [r2 + r4]
1723 psadbw m6, m3
1724 paddd m1, m6
1725 movu m6, [r2 + 16 + r4]
1726 psadbw m6, m4
1727 paddd m1, m6
1728 movu m6, [r2 + 32 + r4]
1729 psadbw m6, m5
1730 paddd m1, m6
1731 movu m6, [r3 + r4]
1732 psadbw m6, m3
1733 paddd m2, m6
1734 movu m6, [r3 + 16 + r4]
1735 psadbw m6, m4
1736 paddd m2, m6
1737 movu m6, [r3 + 32 + r4]
1738 psadbw m6, m5
1739 paddd m2, m6
1740
1741 mova m3, [r0 + FENC_STRIDE * 2]
1742 mova m4, [r0 + 16 + FENC_STRIDE * 2]
1743 mova m5, [r0 + 32 + FENC_STRIDE * 2]
1744 movu m6, [r1 + r4 * 2]
1745 psadbw m6, m3
1746 paddd m0, m6
1747 movu m6, [r1 + 16 + r4 * 2]
1748 psadbw m6, m4
1749 paddd m0, m6
1750 movu m6, [r1 + 32 + r4 * 2]
1751 psadbw m6, m5
1752 paddd m0, m6
1753 movu m6, [r2 + r4 * 2]
1754 psadbw m6, m3
1755 paddd m1, m6
1756 movu m6, [r2 + 16 + r4 * 2]
1757 psadbw m6, m4
1758 paddd m1, m6
1759 movu m6, [r2 + 32 + r4 * 2]
1760 psadbw m6, m5
1761 paddd m1, m6
1762 movu m6, [r3 + r4 * 2]
1763 psadbw m6, m3
1764 paddd m2, m6
1765 movu m6, [r3 + 16 + r4 * 2]
1766 psadbw m6, m4
1767 paddd m2, m6
1768 movu m6, [r3 + 32 + r4 * 2]
1769 psadbw m6, m5
1770 paddd m2, m6
1771
1772 lea r0, [r0 + FENC_STRIDE * 2]
1773 lea r1, [r1 + r4 * 2]
1774 lea r2, [r2 + r4 * 2]
1775 lea r3, [r3 + r4 * 2]
1776 mova m3, [r0 + FENC_STRIDE]
1777 mova m4, [r0 + 16 + FENC_STRIDE]
1778 mova m5, [r0 + 32 + FENC_STRIDE]
1779 movu m6, [r1 + r4]
1780 psadbw m6, m3
1781 paddd m0, m6
1782 movu m6, [r1 + 16 + r4]
1783 psadbw m6, m4
1784 paddd m0, m6
1785 movu m6, [r1 + 32 + r4]
1786 psadbw m6, m5
1787 paddd m0, m6
1788 movu m6, [r2 + r4]
1789 psadbw m6, m3
1790 paddd m1, m6
1791 movu m6, [r2 + 16 + r4]
1792 psadbw m6, m4
1793 paddd m1, m6
1794 movu m6, [r2 + 32 + r4]
1795 psadbw m6, m5
1796 paddd m1, m6
1797 movu m6, [r3 + r4]
1798 psadbw m6, m3
1799 paddd m2, m6
1800 movu m6, [r3 + 16 + r4]
1801 psadbw m6, m4
1802 paddd m2, m6
1803 movu m6, [r3 + 32 + r4]
1804 psadbw m6, m5
1805 paddd m2, m6
1806 lea r0, [r0 + FENC_STRIDE * 2]
1807 lea r1, [r1 + r4 * 2]
1808 lea r2, [r2 + r4 * 2]
1809 lea r3, [r3 + r4 * 2]
1810%endmacro
1811
1812%macro SAD_X4_48x4 0
1813 mova m4, [r0]
1814 mova m5, [r0 + 16]
1815 mova m6, [r0 + 32]
1816 movu m7, [r1]
1817 psadbw m7, m4
1818 paddd m0, m7
1819 movu m7, [r1 + 16]
1820 psadbw m7, m5
1821 paddd m0, m7
1822 movu m7, [r1 + 32]
1823 psadbw m7, m6
1824 paddd m0, m7
1825 movu m7, [r2]
1826 psadbw m7, m4
1827 paddd m1, m7
1828 movu m7, [r2 + 16]
1829 psadbw m7, m5
1830 paddd m1, m7
1831 movu m7, [r2 + 32]
1832 psadbw m7, m6
1833 paddd m1, m7
1834 movu m7, [r3]
1835 psadbw m7, m4
1836 paddd m2, m7
1837 movu m7, [r3 + 16]
1838 psadbw m7, m5
1839 paddd m2, m7
1840 movu m7, [r3 + 32]
1841 psadbw m7, m6
1842 paddd m2, m7
1843 movu m7, [r4]
1844 psadbw m7, m4
1845 paddd m3, m7
1846 movu m7, [r4 + 16]
1847 psadbw m7, m5
1848 paddd m3, m7
1849 movu m7, [r4 + 32]
1850 psadbw m7, m6
1851 paddd m3, m7
1852
1853 mova m4, [r0 + FENC_STRIDE]
1854 mova m5, [r0 + 16 + FENC_STRIDE]
1855 mova m6, [r0 + 32 + FENC_STRIDE]
1856 movu m7, [r1 + r5]
1857 psadbw m7, m4
1858 paddd m0, m7
1859 movu m7, [r1 + 16 + r5]
1860 psadbw m7, m5
1861 paddd m0, m7
1862 movu m7, [r1 + 32 + r5]
1863 psadbw m7, m6
1864 paddd m0, m7
1865 movu m7, [r2 + r5]
1866 psadbw m7, m4
1867 paddd m1, m7
1868 movu m7, [r2 + 16 + r5]
1869 psadbw m7, m5
1870 paddd m1, m7
1871 movu m7, [r2 + 32 + r5]
1872 psadbw m7, m6
1873 paddd m1, m7
1874 movu m7, [r3 + r5]
1875 psadbw m7, m4
1876 paddd m2, m7
1877 movu m7, [r3 + 16 + r5]
1878 psadbw m7, m5
1879 paddd m2, m7
1880 movu m7, [r3 + 32 + r5]
1881 psadbw m7, m6
1882 paddd m2, m7
1883 movu m7, [r4 + r5]
1884 psadbw m7, m4
1885 paddd m3, m7
1886 movu m7, [r4 + 16 + r5]
1887 psadbw m7, m5
1888 paddd m3, m7
1889 movu m7, [r4 + 32 + r5]
1890 psadbw m7, m6
1891 paddd m3, m7
1892
1893 mova m4, [r0 + FENC_STRIDE * 2]
1894 mova m5, [r0 + 16 + FENC_STRIDE * 2]
1895 mova m6, [r0 + 32 + FENC_STRIDE * 2]
1896 movu m7, [r1 + r5 * 2]
1897 psadbw m7, m4
1898 paddd m0, m7
1899 movu m7, [r1 + 16 + r5 * 2]
1900 psadbw m7, m5
1901 paddd m0, m7
1902 movu m7, [r1 + 32 + r5 * 2]
1903 psadbw m7, m6
1904 paddd m0, m7
1905 movu m7, [r2 + r5 * 2]
1906 psadbw m7, m4
1907 paddd m1, m7
1908 movu m7, [r2 + 16 + r5 * 2]
1909 psadbw m7, m5
1910 paddd m1, m7
1911 movu m7, [r2 + 32 + r5 * 2]
1912 psadbw m7, m6
1913 paddd m1, m7
1914 movu m7, [r3 + r5 * 2]
1915 psadbw m7, m4
1916 paddd m2, m7
1917 movu m7, [r3 + 16 + r5 * 2]
1918 psadbw m7, m5
1919 paddd m2, m7
1920 movu m7, [r3 + 32 + r5 * 2]
1921 psadbw m7, m6
1922 paddd m2, m7
1923 movu m7, [r4 + r5 * 2]
1924 psadbw m7, m4
1925 paddd m3, m7
1926 movu m7, [r4 + 16 + r5 * 2]
1927 psadbw m7, m5
1928 paddd m3, m7
1929 movu m7, [r4 + 32 + r5 * 2]
1930 psadbw m7, m6
1931 paddd m3, m7
1932
1933 lea r0, [r0 + FENC_STRIDE * 2]
1934 lea r1, [r1 + r5 * 2]
1935 lea r2, [r2 + r5 * 2]
1936 lea r3, [r3 + r5 * 2]
1937 lea r4, [r4 + r5 * 2]
1938 mova m4, [r0 + FENC_STRIDE]
1939 mova m5, [r0 + 16 + FENC_STRIDE]
1940 mova m6, [r0 + 32 + FENC_STRIDE]
1941 movu m7, [r1 + r5]
1942 psadbw m7, m4
1943 paddd m0, m7
1944 movu m7, [r1 + 16 + r5]
1945 psadbw m7, m5
1946 paddd m0, m7
1947 movu m7, [r1 + 32 + r5]
1948 psadbw m7, m6
1949 paddd m0, m7
1950 movu m7, [r2 + r5]
1951 psadbw m7, m4
1952 paddd m1, m7
1953 movu m7, [r2 + 16 + r5]
1954 psadbw m7, m5
1955 paddd m1, m7
1956 movu m7, [r2 + 32 + r5]
1957 psadbw m7, m6
1958 paddd m1, m7
1959 movu m7, [r3 + r5]
1960 psadbw m7, m4
1961 paddd m2, m7
1962 movu m7, [r3 + 16 + r5]
1963 psadbw m7, m5
1964 paddd m2, m7
1965 movu m7, [r3 + 32 + r5]
1966 psadbw m7, m6
1967 paddd m2, m7
1968 movu m7, [r4 + r5]
1969 psadbw m7, m4
1970 paddd m3, m7
1971 movu m7, [r4 + 16 + r5]
1972 psadbw m7, m5
1973 paddd m3, m7
1974 movu m7, [r4 + 32 + r5]
1975 psadbw m7, m6
1976 paddd m3, m7
1977 lea r0, [r0 + FENC_STRIDE * 2]
1978 lea r1, [r1 + r5 * 2]
1979 lea r2, [r2 + r5 * 2]
1980 lea r3, [r3 + r5 * 2]
1981 lea r4, [r4 + r5 * 2]
1982%endmacro
1983
1984%macro SAD_X3_64x4 0
1985 mova m3, [r0]
1986 mova m4, [r0 + 16]
1987 movu m5, [r1]
1988 psadbw m5, m3
1989 paddd m0, m5
1990 movu m5, [r1 + 16]
1991 psadbw m5, m4
1992 paddd m0, m5
1993 movu m5, [r2]
1994 psadbw m5, m3
1995 paddd m1, m5
1996 movu m5, [r2 + 16]
1997 psadbw m5, m4
1998 paddd m1, m5
1999 movu m5, [r3]
2000 psadbw m5, m3
2001 paddd m2, m5
2002 movu m5, [r3 + 16]
2003 psadbw m5, m4
2004 paddd m2, m5
2005 mova m3, [r0 + 32]
2006 mova m4, [r0 + 48]
2007 movu m5, [r1 + 32]
2008 psadbw m5, m3
2009 paddd m0, m5
2010 movu m5, [r1 + 48]
2011 psadbw m5, m4
2012 paddd m0, m5
2013 movu m5, [r2 + 32]
2014 psadbw m5, m3
2015 paddd m1, m5
2016 movu m5, [r2 + 48]
2017 psadbw m5, m4
2018 paddd m1, m5
2019 movu m5, [r3 + 32]
2020 psadbw m5, m3
2021 paddd m2, m5
2022 movu m5, [r3 + 48]
2023 psadbw m5, m4
2024 paddd m2, m5
2025
2026 mova m3, [r0 + FENC_STRIDE]
2027 mova m4, [r0 + 16 + FENC_STRIDE]
2028 movu m5, [r1 + r4]
2029 psadbw m5, m3
2030 paddd m0, m5
2031 movu m5, [r1 + 16 + r4]
2032 psadbw m5, m4
2033 paddd m0, m5
2034 movu m5, [r2 + r4]
2035 psadbw m5, m3
2036 paddd m1, m5
2037 movu m5, [r2 + 16 + r4]
2038 psadbw m5, m4
2039 paddd m1, m5
2040 movu m5, [r3 + r4]
2041 psadbw m5, m3
2042 paddd m2, m5
2043 movu m5, [r3 + 16 + r4]
2044 psadbw m5, m4
2045 paddd m2, m5
2046 mova m3, [r0 + 32 + FENC_STRIDE]
2047 mova m4, [r0 + 48 + FENC_STRIDE]
2048 movu m5, [r1 + 32 + r4]
2049 psadbw m5, m3
2050 paddd m0, m5
2051 movu m5, [r1 + 48 + r4]
2052 psadbw m5, m4
2053 paddd m0, m5
2054 movu m5, [r2 + 32 + r4]
2055 psadbw m5, m3
2056 paddd m1, m5
2057 movu m5, [r2 + 48 + r4]
2058 psadbw m5, m4
2059 paddd m1, m5
2060 movu m5, [r3 + 32 + r4]
2061 psadbw m5, m3
2062 paddd m2, m5
2063 movu m5, [r3 + 48 + r4]
2064 psadbw m5, m4
2065 paddd m2, m5
2066
2067 mova m3, [r0 + FENC_STRIDE * 2]
2068 mova m4, [r0 + 16 + FENC_STRIDE * 2]
2069 movu m5, [r1 + r4 * 2]
2070 psadbw m5, m3
2071 paddd m0, m5
2072 movu m5, [r1 + 16 + r4 * 2]
2073 psadbw m5, m4
2074 paddd m0, m5
2075 movu m5, [r2 + r4 * 2]
2076 psadbw m5, m3
2077 paddd m1, m5
2078 movu m5, [r2 + 16 + r4 * 2]
2079 psadbw m5, m4
2080 paddd m1, m5
2081 movu m5, [r3 + r4 * 2]
2082 psadbw m5, m3
2083 paddd m2, m5
2084 movu m5, [r3 + 16 + r4 * 2]
2085 psadbw m5, m4
2086 paddd m2, m5
2087 mova m3, [r0 + 32 + FENC_STRIDE * 2]
2088 mova m4, [r0 + 48 + FENC_STRIDE * 2]
2089 movu m5, [r1 + 32 + r4 * 2]
2090 psadbw m5, m3
2091 paddd m0, m5
2092 movu m5, [r1 + 48 + r4 * 2]
2093 psadbw m5, m4
2094 paddd m0, m5
2095 movu m5, [r2 + 32 + r4 * 2]
2096 psadbw m5, m3
2097 paddd m1, m5
2098 movu m5, [r2 + 48 + r4 * 2]
2099 psadbw m5, m4
2100 paddd m1, m5
2101 movu m5, [r3 + 32 + r4 * 2]
2102 psadbw m5, m3
2103 paddd m2, m5
2104 movu m5, [r3 + 48 + r4 * 2]
2105 psadbw m5, m4
2106 paddd m2, m5
2107
2108 lea r0, [r0 + FENC_STRIDE * 2]
2109 lea r1, [r1 + r4 * 2]
2110 lea r2, [r2 + r4 * 2]
2111 lea r3, [r3 + r4 * 2]
2112 mova m3, [r0 + FENC_STRIDE]
2113 mova m4, [r0 + 16 + FENC_STRIDE]
2114 movu m5, [r1 + r4]
2115 psadbw m5, m3
2116 paddd m0, m5
2117 movu m5, [r1 + 16 + r4]
2118 psadbw m5, m4
2119 paddd m0, m5
2120 movu m5, [r2 + r4]
2121 psadbw m5, m3
2122 paddd m1, m5
2123 movu m5, [r2 + 16 + r4]
2124 psadbw m5, m4
2125 paddd m1, m5
2126 movu m5, [r3 + r4]
2127 psadbw m5, m3
2128 paddd m2, m5
2129 movu m5, [r3 + 16 + r4]
2130 psadbw m5, m4
2131 paddd m2, m5
2132 mova m3, [r0 + 32 + FENC_STRIDE]
2133 mova m4, [r0 + 48 + FENC_STRIDE]
2134 movu m5, [r1 + 32 + r4]
2135 psadbw m5, m3
2136 paddd m0, m5
2137 movu m5, [r1 + 48 + r4]
2138 psadbw m5, m4
2139 paddd m0, m5
2140 movu m5, [r2 + 32 + r4]
2141 psadbw m5, m3
2142 paddd m1, m5
2143 movu m5, [r2 + 48 + r4]
2144 psadbw m5, m4
2145 paddd m1, m5
2146 movu m5, [r3 + 32 + r4]
2147 psadbw m5, m3
2148 paddd m2, m5
2149 movu m5, [r3 + 48 + r4]
2150 psadbw m5, m4
2151 paddd m2, m5
2152 lea r0, [r0 + FENC_STRIDE * 2]
2153 lea r1, [r1 + r4 * 2]
2154 lea r2, [r2 + r4 * 2]
2155 lea r3, [r3 + r4 * 2]
2156%endmacro
2157
2158%macro SAD_X4_64x4 0
2159 mova m4, [r0]
2160 mova m5, [r0 + 16]
2161 movu m6, [r1]
2162 psadbw m6, m4
2163 paddd m0, m6
2164 movu m6, [r1 + 16]
2165 psadbw m6, m5
2166 paddd m0, m6
2167 movu m6, [r2]
2168 psadbw m6, m4
2169 paddd m1, m6
2170 movu m6, [r2 + 16]
2171 psadbw m6, m5
2172 paddd m1, m6
2173 movu m6, [r3]
2174 psadbw m6, m4
2175 paddd m2, m6
2176 movu m6, [r3 + 16]
2177 psadbw m6, m5
2178 paddd m2, m6
2179 movu m6, [r4]
2180 psadbw m6, m4
2181 paddd m3, m6
2182 movu m6, [r4 + 16]
2183 psadbw m6, m5
2184 paddd m3, m6
2185 mova m4, [r0 + 32]
2186 mova m5, [r0 + 48]
2187 movu m6, [r1 + 32]
2188 psadbw m6, m4
2189 paddd m0, m6
2190 movu m6, [r1 + 48]
2191 psadbw m6, m5
2192 paddd m0, m6
2193 movu m6, [r2 + 32]
2194 psadbw m6, m4
2195 paddd m1, m6
2196 movu m6, [r2 + 48]
2197 psadbw m6, m5
2198 paddd m1, m6
2199 movu m6, [r3 + 32]
2200 psadbw m6, m4
2201 paddd m2, m6
2202 movu m6, [r3 + 48]
2203 psadbw m6, m5
2204 paddd m2, m6
2205 movu m6, [r4 + 32]
2206 psadbw m6, m4
2207 paddd m3, m6
2208 movu m6, [r4 + 48]
2209 psadbw m6, m5
2210 paddd m3, m6
2211
2212 mova m4, [r0 + FENC_STRIDE]
2213 mova m5, [r0 + 16 + FENC_STRIDE]
2214 movu m6, [r1 + r5]
2215 psadbw m6, m4
2216 paddd m0, m6
2217 movu m6, [r1 + 16 + r5]
2218 psadbw m6, m5
2219 paddd m0, m6
2220 movu m6, [r2 + r5]
2221 psadbw m6, m4
2222 paddd m1, m6
2223 movu m6, [r2 + 16 + r5]
2224 psadbw m6, m5
2225 paddd m1, m6
2226 movu m6, [r3 + r5]
2227 psadbw m6, m4
2228 paddd m2, m6
2229 movu m6, [r3 + 16 + r5]
2230 psadbw m6, m5
2231 paddd m2, m6
2232 movu m6, [r4 + r5]
2233 psadbw m6, m4
2234 paddd m3, m6
2235 movu m6, [r4 + 16 + r5]
2236 psadbw m6, m5
2237 paddd m3, m6
2238 mova m4, [r0 + 32 + FENC_STRIDE]
2239 mova m5, [r0 + 48 + FENC_STRIDE]
2240 movu m6, [r1 + 32 + r5]
2241 psadbw m6, m4
2242 paddd m0, m6
2243 movu m6, [r1 + 48 + r5]
2244 psadbw m6, m5
2245 paddd m0, m6
2246 movu m6, [r2 + 32 + r5]
2247 psadbw m6, m4
2248 paddd m1, m6
2249 movu m6, [r2 + 48 + r5]
2250 psadbw m6, m5
2251 paddd m1, m6
2252 movu m6, [r3 + 32 + r5]
2253 psadbw m6, m4
2254 paddd m2, m6
2255 movu m6, [r3 + 48 + r5]
2256 psadbw m6, m5
2257 paddd m2, m6
2258 movu m6, [r4 + 32 + r5]
2259 psadbw m6, m4
2260 paddd m3, m6
2261 movu m6, [r4 + 48 + r5]
2262 psadbw m6, m5
2263 paddd m3, m6
2264
2265 mova m4, [r0 + FENC_STRIDE * 2]
2266 mova m5, [r0 + 16 + FENC_STRIDE * 2]
2267 movu m6, [r1 + r5 * 2]
2268 psadbw m6, m4
2269 paddd m0, m6
2270 movu m6, [r1 + 16 + r5 * 2]
2271 psadbw m6, m5
2272 paddd m0, m6
2273 movu m6, [r2 + r5 * 2]
2274 psadbw m6, m4
2275 paddd m1, m6
2276 movu m6, [r2 + 16 + r5 * 2]
2277 psadbw m6, m5
2278 paddd m1, m6
2279 movu m6, [r3 + r5 * 2]
2280 psadbw m6, m4
2281 paddd m2, m6
2282 movu m6, [r3 + 16 + r5 * 2]
2283 psadbw m6, m5
2284 paddd m2, m6
2285 movu m6, [r4 + r5 * 2]
2286 psadbw m6, m4
2287 paddd m3, m6
2288 movu m6, [r4 + 16 + r5 * 2]
2289 psadbw m6, m5
2290 paddd m3, m6
2291 mova m4, [r0 + 32 + FENC_STRIDE * 2]
2292 mova m5, [r0 + 48 + FENC_STRIDE * 2]
2293 movu m6, [r1 + 32 + r5 * 2]
2294 psadbw m6, m4
2295 paddd m0, m6
2296 movu m6, [r1 + 48 + r5 * 2]
2297 psadbw m6, m5
2298 paddd m0, m6
2299 movu m6, [r2 + 32 + r5 * 2]
2300 psadbw m6, m4
2301 paddd m1, m6
2302 movu m6, [r2 + 48 + r5 * 2]
2303 psadbw m6, m5
2304 paddd m1, m6
2305 movu m6, [r3 + 32 + r5 * 2]
2306 psadbw m6, m4
2307 paddd m2, m6
2308 movu m6, [r3 + 48 + r5 * 2]
2309 psadbw m6, m5
2310 paddd m2, m6
2311 movu m6, [r4 + 32 + r5 * 2]
2312 psadbw m6, m4
2313 paddd m3, m6
2314 movu m6, [r4 + 48 + r5 * 2]
2315 psadbw m6, m5
2316 paddd m3, m6
2317
2318 lea r0, [r0 + FENC_STRIDE * 2]
2319 lea r1, [r1 + r5 * 2]
2320 lea r2, [r2 + r5 * 2]
2321 lea r3, [r3 + r5 * 2]
2322 lea r4, [r4 + r5 * 2]
2323 mova m4, [r0 + FENC_STRIDE]
2324 mova m5, [r0 + 16 + FENC_STRIDE]
2325 movu m6, [r1 + r5]
2326 psadbw m6, m4
2327 paddd m0, m6
2328 movu m6, [r1 + 16 + r5]
2329 psadbw m6, m5
2330 paddd m0, m6
2331 movu m6, [r2 + r5]
2332 psadbw m6, m4
2333 paddd m1, m6
2334 movu m6, [r2 + 16 + r5]
2335 psadbw m6, m5
2336 paddd m1, m6
2337 movu m6, [r3 + r5]
2338 psadbw m6, m4
2339 paddd m2, m6
2340 movu m6, [r3 + 16 + r5]
2341 psadbw m6, m5
2342 paddd m2, m6
2343 movu m6, [r4 + r5]
2344 psadbw m6, m4
2345 paddd m3, m6
2346 movu m6, [r4 + 16 + r5]
2347 psadbw m6, m5
2348 paddd m3, m6
2349 mova m4, [r0 + 32 + FENC_STRIDE]
2350 mova m5, [r0 + 48 + FENC_STRIDE]
2351 movu m6, [r1 + 32 + r5]
2352 psadbw m6, m4
2353 paddd m0, m6
2354 movu m6, [r1 + 48 + r5]
2355 psadbw m6, m5
2356 paddd m0, m6
2357 movu m6, [r2 + 32 + r5]
2358 psadbw m6, m4
2359 paddd m1, m6
2360 movu m6, [r2 + 48 + r5]
2361 psadbw m6, m5
2362 paddd m1, m6
2363 movu m6, [r3 + 32 + r5]
2364 psadbw m6, m4
2365 paddd m2, m6
2366 movu m6, [r3 + 48 + r5]
2367 psadbw m6, m5
2368 paddd m2, m6
2369 movu m6, [r4 + 32 + r5]
2370 psadbw m6, m4
2371 paddd m3, m6
2372 movu m6, [r4 + 48 + r5]
2373 psadbw m6, m5
2374 paddd m3, m6
2375 lea r0, [r0 + FENC_STRIDE * 2]
2376 lea r1, [r1 + r5 * 2]
2377 lea r2, [r2 + r5 * 2]
2378 lea r3, [r3 + r5 * 2]
2379 lea r4, [r4 + r5 * 2]
2380%endmacro
2381
2382;-----------------------------------------------------------------------------
2383; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
2384; uint8_t *pix2, intptr_t i_stride, int scores[3] )
2385;-----------------------------------------------------------------------------
2386%macro SAD_X 3
2387cglobal pixel_sad_x%1_%2x%3_mmx2, %1+2, %1+2
2388 SAD_X%1_2x%2P 1
2389%rep %3/2-1
2390 SAD_X%1_2x%2P 0
2391%endrep
2392 SAD_X%1_END
2393%endmacro
2394
2395INIT_MMX
2396SAD_X 3, 16, 16
2397SAD_X 3, 16, 8
2398SAD_X 3, 8, 16
2399SAD_X 3, 8, 8
2400SAD_X 3, 8, 4
2401SAD_X 3, 4, 16
2402SAD_X 3, 4, 8
2403SAD_X 3, 4, 4
2404SAD_X 4, 16, 16
2405SAD_X 4, 16, 8
2406SAD_X 4, 8, 16
2407SAD_X 4, 8, 8
2408SAD_X 4, 8, 4
2409SAD_X 4, 4, 16
2410SAD_X 4, 4, 8
2411SAD_X 4, 4, 4
2412
2413
2414
2415;=============================================================================
2416; SAD x3/x4 XMM
2417;=============================================================================
2418
2419%macro SAD_X3_START_1x16P_SSE2 0
2420 mova m2, [r0]
2421%if cpuflag(avx)
2422 psadbw m0, m2, [r1]
2423 psadbw m1, m2, [r2]
2424 psadbw m2, [r3]
2425%else
2426 movu m0, [r1]
2427 movu m1, [r2]
2428 movu m3, [r3]
2429 psadbw m0, m2
2430 psadbw m1, m2
2431 psadbw m2, m3
2432%endif
2433%endmacro
2434
2435%macro SAD_X3_1x16P_SSE2 2
2436 mova m3, [r0+%1]
2437%if cpuflag(avx)
2438 psadbw m4, m3, [r1+%2]
2439 psadbw m5, m3, [r2+%2]
2440 psadbw m3, [r3+%2]
2441%else
2442 movu m4, [r1+%2]
2443 movu m5, [r2+%2]
2444 movu m6, [r3+%2]
2445 psadbw m4, m3
2446 psadbw m5, m3
2447 psadbw m3, m6
2448%endif
2449 paddd m0, m4
2450 paddd m1, m5
2451 paddd m2, m3
2452%endmacro
2453
2454%if ARCH_X86_64
2455 DECLARE_REG_TMP 6
2456%else
2457 DECLARE_REG_TMP 5
2458%endif
2459
2460%macro SAD_X3_4x16P_SSE2 2
2461%if %1==0
2462 lea t0, [r4*3]
2463 SAD_X3_START_1x16P_SSE2
2464%else
2465 SAD_X3_1x16P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r4*0
2466%endif
2467 SAD_X3_1x16P_SSE2 FENC_STRIDE*(1+(%1&1)*4), r4*1
2468 SAD_X3_1x16P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r4*2
2469 SAD_X3_1x16P_SSE2 FENC_STRIDE*(3+(%1&1)*4), t0
2470%if %1 != %2-1
2471%if (%1&1) != 0
2472 add r0, 8*FENC_STRIDE
2473%endif
2474 lea r1, [r1+4*r4]
2475 lea r2, [r2+4*r4]
2476 lea r3, [r3+4*r4]
2477%endif
2478%endmacro
2479
2480%macro SAD_X3_START_2x8P_SSE2 0
2481 movq m3, [r0]
2482 movq m0, [r1]
2483 movq m1, [r2]
2484 movq m2, [r3]
2485 movhps m3, [r0+FENC_STRIDE]
2486 movhps m0, [r1+r4]
2487 movhps m1, [r2+r4]
2488 movhps m2, [r3+r4]
2489 psadbw m0, m3
2490 psadbw m1, m3
2491 psadbw m2, m3
2492%endmacro
2493
2494%macro SAD_X3_2x8P_SSE2 4
2495 movq m6, [r0+%1]
2496 movq m3, [r1+%2]
2497 movq m4, [r2+%2]
2498 movq m5, [r3+%2]
2499 movhps m6, [r0+%3]
2500 movhps m3, [r1+%4]
2501 movhps m4, [r2+%4]
2502 movhps m5, [r3+%4]
2503 psadbw m3, m6
2504 psadbw m4, m6
2505 psadbw m5, m6
2506 paddd m0, m3
2507 paddd m1, m4
2508 paddd m2, m5
2509%endmacro
2510
2511%macro SAD_X4_START_2x8P_SSE2 0
2512 movq m4, [r0]
2513 movq m0, [r1]
2514 movq m1, [r2]
2515 movq m2, [r3]
2516 movq m3, [r4]
2517 movhps m4, [r0+FENC_STRIDE]
2518 movhps m0, [r1+r5]
2519 movhps m1, [r2+r5]
2520 movhps m2, [r3+r5]
2521 movhps m3, [r4+r5]
2522 psadbw m0, m4
2523 psadbw m1, m4
2524 psadbw m2, m4
2525 psadbw m3, m4
2526%endmacro
2527
2528%macro SAD_X4_2x8P_SSE2 4
2529 movq m6, [r0+%1]
2530 movq m4, [r1+%2]
2531 movq m5, [r2+%2]
2532 movhps m6, [r0+%3]
2533 movhps m4, [r1+%4]
2534 movhps m5, [r2+%4]
2535 psadbw m4, m6
2536 psadbw m5, m6
2537 paddd m0, m4
2538 paddd m1, m5
2539 movq m4, [r3+%2]
2540 movq m5, [r4+%2]
2541 movhps m4, [r3+%4]
2542 movhps m5, [r4+%4]
2543 psadbw m4, m6
2544 psadbw m5, m6
2545 paddd m2, m4
2546 paddd m3, m5
2547%endmacro
2548
2549%macro SAD_X4_START_1x16P_SSE2 0
2550 mova m3, [r0]
2551%if cpuflag(avx)
2552 psadbw m0, m3, [r1]
2553 psadbw m1, m3, [r2]
2554 psadbw m2, m3, [r3]
2555 psadbw m3, [r4]
2556%else
2557 movu m0, [r1]
2558 movu m1, [r2]
2559 movu m2, [r3]
2560 movu m4, [r4]
2561 psadbw m0, m3
2562 psadbw m1, m3
2563 psadbw m2, m3
2564 psadbw m3, m4
2565%endif
2566%endmacro
2567
2568%macro SAD_X4_1x16P_SSE2 2
2569 mova m6, [r0+%1]
2570%if cpuflag(avx)
2571 psadbw m4, m6, [r1+%2]
2572 psadbw m5, m6, [r2+%2]
2573%else
2574 movu m4, [r1+%2]
2575 movu m5, [r2+%2]
2576 psadbw m4, m6
2577 psadbw m5, m6
2578%endif
2579 paddd m0, m4
2580 paddd m1, m5
2581%if cpuflag(avx)
2582 psadbw m4, m6, [r3+%2]
2583 psadbw m5, m6, [r4+%2]
2584%else
2585 movu m4, [r3+%2]
2586 movu m5, [r4+%2]
2587 psadbw m4, m6
2588 psadbw m5, m6
2589%endif
2590 paddd m2, m4
2591 paddd m3, m5
2592%endmacro
2593
2594%macro SAD_X4_4x16P_SSE2 2
2595%if %1==0
2596 lea r6, [r5*3]
2597 SAD_X4_START_1x16P_SSE2
2598%else
2599 SAD_X4_1x16P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r5*0
2600%endif
2601 SAD_X4_1x16P_SSE2 FENC_STRIDE*(1+(%1&1)*4), r5*1
2602 SAD_X4_1x16P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r5*2
2603 SAD_X4_1x16P_SSE2 FENC_STRIDE*(3+(%1&1)*4), r6
2604%if %1 != %2-1
2605%if (%1&1) != 0
2606 add r0, 8*FENC_STRIDE
2607%endif
2608 lea r1, [r1+4*r5]
2609 lea r2, [r2+4*r5]
2610 lea r3, [r3+4*r5]
2611 lea r4, [r4+4*r5]
2612%endif
2613%endmacro
2614
2615%macro SAD_X3_4x8P_SSE2 2
2616%if %1==0
2617 lea t0, [r4*3]
2618 SAD_X3_START_2x8P_SSE2
2619%else
2620 SAD_X3_2x8P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r4*0, FENC_STRIDE*(1+(%1&1)*4), r4*1
2621%endif
2622 SAD_X3_2x8P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r4*2, FENC_STRIDE*(3+(%1&1)*4), t0
2623%if %1 != %2-1
2624%if (%1&1) != 0
2625 add r0, 8*FENC_STRIDE
2626%endif
2627 lea r1, [r1+4*r4]
2628 lea r2, [r2+4*r4]
2629 lea r3, [r3+4*r4]
2630%endif
2631%endmacro
2632
2633%macro SAD_X4_4x8P_SSE2 2
2634%if %1==0
2635 lea r6, [r5*3]
2636 SAD_X4_START_2x8P_SSE2
2637%else
2638 SAD_X4_2x8P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r5*0, FENC_STRIDE*(1+(%1&1)*4), r5*1
2639%endif
2640 SAD_X4_2x8P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r5*2, FENC_STRIDE*(3+(%1&1)*4), r6
2641%if %1 != %2-1
2642%if (%1&1) != 0
2643 add r0, 8*FENC_STRIDE
2644%endif
2645 lea r1, [r1+4*r5]
2646 lea r2, [r2+4*r5]
2647 lea r3, [r3+4*r5]
2648 lea r4, [r4+4*r5]
2649%endif
2650%endmacro
2651
2652%macro SAD_X3_END_SSE2 1
2653 movifnidn r5, r5mp
2654 movhlps m3, m0
2655 movhlps m4, m1
2656 movhlps m5, m2
2657 paddd m0, m3
2658 paddd m1, m4
2659 paddd m2, m5
2660 movd [r5+0], m0
2661 movd [r5+4], m1
2662 movd [r5+8], m2
2663 RET
2664%endmacro
2665
2666%macro SAD_X4_END_SSE2 1
2667 mov r0, r6mp
2668 psllq m1, 32
2669 psllq m3, 32
2670 paddd m0, m1
2671 paddd m2, m3
2672 movhlps m1, m0
2673 movhlps m3, m2
2674 paddd m0, m1
2675 paddd m2, m3
2676 movq [r0+0], m0
2677 movq [r0+8], m2
2678 RET
2679%endmacro
2680
2681%macro SAD_X3_START_2x16P_AVX2 0
2682 movu m3, [r0] ; assumes FENC_STRIDE == 16
2683 movu xm0, [r1]
2684 movu xm1, [r2]
2685 movu xm2, [r3]
2686 vinserti128 m0, m0, [r1+r4], 1
2687 vinserti128 m1, m1, [r2+r4], 1
2688 vinserti128 m2, m2, [r3+r4], 1
2689 psadbw m0, m3
2690 psadbw m1, m3
2691 psadbw m2, m3
2692%endmacro
2693
2694%macro SAD_X3_2x16P_AVX2 3
2695 movu m3, [r0+%1] ; assumes FENC_STRIDE == 16
2696 movu xm4, [r1+%2]
2697 movu xm5, [r2+%2]
2698 movu xm6, [r3+%2]
2699 vinserti128 m4, m4, [r1+%3], 1
2700 vinserti128 m5, m5, [r2+%3], 1
2701 vinserti128 m6, m6, [r3+%3], 1
2702 psadbw m4, m3
2703 psadbw m5, m3
2704 psadbw m6, m3
2705 paddw m0, m4
2706 paddw m1, m5
2707 paddw m2, m6
2708%endmacro
2709
2710%macro SAD_X3_4x16P_AVX2 2
2711%if %1==0
2712 lea t0, [r4*3]
2713 SAD_X3_START_2x16P_AVX2
2714%else
2715 SAD_X3_2x16P_AVX2 FENC_STRIDE*(0+(%1&1)*4), r4*0, r4*1
2716%endif
2717 SAD_X3_2x16P_AVX2 FENC_STRIDE*(2+(%1&1)*4), r4*2, t0
2718%if %1 != %2-1
2719%if (%1&1) != 0
2720 add r0, 8*FENC_STRIDE
2721%endif
2722 lea r1, [r1+4*r4]
2723 lea r2, [r2+4*r4]
2724 lea r3, [r3+4*r4]
2725%endif
2726%endmacro
2727
2728%macro SAD_X4_START_2x16P_AVX2 0
2729 vbroadcasti128 m4, [r0]
2730 vbroadcasti128 m5, [r0+FENC_STRIDE]
2731 movu xm0, [r1]
2732 movu xm1, [r2]
2733 movu xm2, [r1+r5]
2734 movu xm3, [r2+r5]
2735 vinserti128 m0, m0, [r3], 1
2736 vinserti128 m1, m1, [r4], 1
2737 vinserti128 m2, m2, [r3+r5], 1
2738 vinserti128 m3, m3, [r4+r5], 1
2739 psadbw m0, m4
2740 psadbw m1, m4
2741 psadbw m2, m5
2742 psadbw m3, m5
2743 paddw m0, m2
2744 paddw m1, m3
2745%endmacro
2746
2747%macro SAD_X4_2x16P_AVX2 4
2748 vbroadcasti128 m6, [r0+%1]
2749 vbroadcasti128 m7, [r0+%3]
2750 movu xm2, [r1+%2]
2751 movu xm3, [r2+%2]
2752 movu xm4, [r1+%4]
2753 movu xm5, [r2+%4]
2754 vinserti128 m2, m2, [r3+%2], 1
2755 vinserti128 m3, m3, [r4+%2], 1
2756 vinserti128 m4, m4, [r3+%4], 1
2757 vinserti128 m5, m5, [r4+%4], 1
2758 psadbw m2, m6
2759 psadbw m3, m6
2760 psadbw m4, m7
2761 psadbw m5, m7
2762 paddd m0, m2
2763 paddd m1, m3
2764 paddd m0, m4
2765 paddd m1, m5
2766%endmacro
2767
2768%macro SAD_X4_4x16P_AVX2 2
2769%if %1==0
2770 lea r6, [r5*3]
2771 SAD_X4_START_2x16P_AVX2
2772%else
2773 SAD_X4_2x16P_AVX2 FENC_STRIDE*(0+(%1&1)*4), r5*0, FENC_STRIDE*(1+(%1&1)*4), r5*1
2774%endif
2775 SAD_X4_2x16P_AVX2 FENC_STRIDE*(2+(%1&1)*4), r5*2, FENC_STRIDE*(3+(%1&1)*4), r6
2776%if %1 != %2-1
2777%if (%1&1) != 0
2778 add r0, 8*FENC_STRIDE
2779%endif
2780 lea r1, [r1+4*r5]
2781 lea r2, [r2+4*r5]
2782 lea r3, [r3+4*r5]
2783 lea r4, [r4+4*r5]
2784%endif
2785%endmacro
2786
2787%macro SAD_X3_END_AVX2 0
2788 movifnidn r5, r5mp
2789 packssdw m0, m1 ; 0 0 1 1 0 0 1 1
2790 packssdw m2, m2 ; 2 2 _ _ 2 2 _ _
2791 phaddd m0, m2 ; 0 1 2 _ 0 1 2 _
2792 vextracti128 xm1, m0, 1
2793 paddd xm0, xm1 ; 0 1 2 _
2794 mova [r5], xm0
2795 RET
2796%endmacro
2797
2798%macro SAD_X4_END_AVX2 0
2799 mov r0, r6mp
2800 pshufd m0, m0, 0x8
2801 pshufd m1, m1, 0x8
2802 vextracti128 xm2, m0, 1
2803 vextracti128 xm3, m1, 1
2804 punpcklqdq xm0, xm1
2805 punpcklqdq xm2, xm3
2806 phaddd xm0, xm2 ; 0 1 2 3
2807 mova [r0], xm0
2808 RET
2809%endmacro
2810
2811;-----------------------------------------------------------------------------
2812; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
2813; uint8_t *pix2, intptr_t i_stride, int scores[3] )
2814;-----------------------------------------------------------------------------
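; Illustrative C sketch of the semantics each x3 kernel implements (not part of
; the build; W and H stand for the block size baked into the variant's name):
;   for (int i = 0; i < 3; i++) {
;       const uint8_t *pix = (i == 0) ? pix0 : (i == 1) ? pix1 : pix2;
;       int sum = 0;
;       for (int y = 0; y < H; y++)
;           for (int x = 0; x < W; x++)
;               sum += abs(fenc[y*FENC_STRIDE + x] - pix[y*i_stride + x]);
;       scores[i] = sum;
;   }
; (the x4 kernels do the same with four candidate references and scores[4])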
2815%macro SAD_X_SSE2 4
2816cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,%4
2817%assign x 0
2818%rep %3/4
2819 SAD_X%1_4x%2P_SSE2 x, %3/4
2820%assign x x+1
2821%endrep
2822%if %3 == 64
2823 SAD_X%1_END_SSE2 1
2824%else
2825 SAD_X%1_END_SSE2 0
2826%endif
2827%endmacro
2828
2829%macro SAD_X3_W12 0
2830cglobal pixel_sad_x3_12x16, 5, 7, 8
2831 mova m4, [MSK]
2832 pxor m0, m0
2833 pxor m1, m1
2834 pxor m2, m2
2835
2836 SAD_X3_12x4
2837 SAD_X3_12x4
2838 SAD_X3_12x4
2839 SAD_X3_12x4
2840 SAD_X3_END_SSE2 1
2841%endmacro
2842
2843%macro SAD_X4_W12 0
2844cglobal pixel_sad_x4_12x16, 6, 8, 8
2845 mova m6, [MSK]
2846 pxor m0, m0
2847 pxor m1, m1
2848 pxor m2, m2
2849 pxor m3, m3
2850
2851 SAD_X4_12x4
2852 SAD_X4_12x4
2853 SAD_X4_12x4
2854 SAD_X4_12x4
2855 SAD_X4_END_SSE2 1
2856%endmacro
2857
2858%macro SAD_X3_W24 0
2859cglobal pixel_sad_x3_24x32, 5, 7, 8
2860 pxor m0, m0
2861 pxor m1, m1
2862 pxor m2, m2
2863 mov r6, 32
2864
2865.loop:
2866 SAD_X3_24x4
2867 SAD_X3_24x4
2868 SAD_X3_24x4
2869 SAD_X3_24x4
2870
2871 sub r6, 16
2872 cmp r6, 0
2873 jnz .loop
2874 SAD_X3_END_SSE2 1
2875%endmacro
2876
2877%macro SAD_X4_W24 0
2878%if ARCH_X86_64 == 1
2879cglobal pixel_sad_x4_24x32, 6, 8, 8
2880%define count r7
2881%else
2882cglobal pixel_sad_x4_24x32, 6, 7, 8, 0-4
2883%define count dword [rsp]
2884%endif
2885 pxor m0, m0
2886 pxor m1, m1
2887 pxor m2, m2
2888 pxor m3, m3
2889 mov count, 32
2890
2891.loop:
2892 SAD_X4_24x4
2893 SAD_X4_24x4
2894 SAD_X4_24x4
2895 SAD_X4_24x4
2896
2897 sub count, 16
2898 jnz .loop
2899 SAD_X4_END_SSE2 1
2900
2901%endmacro
2902
2903%macro SAD_X3_W32 0
2904cglobal pixel_sad_x3_32x8, 5, 6, 8
2905 pxor m0, m0
2906 pxor m1, m1
2907 pxor m2, m2
2908
2909 SAD_X3_32x4
2910 SAD_X3_32x4
2911 SAD_X3_END_SSE2 1
2912
2913cglobal pixel_sad_x3_32x16, 5, 6, 8
2914 pxor m0, m0
2915 pxor m1, m1
2916 pxor m2, m2
2917
2918 SAD_X3_32x4
2919 SAD_X3_32x4
2920 SAD_X3_32x4
2921 SAD_X3_32x4
2922 SAD_X3_END_SSE2 1
2923
2924cglobal pixel_sad_x3_32x24, 5, 6, 8
2925 pxor m0, m0
2926 pxor m1, m1
2927 pxor m2, m2
2928
2929 SAD_X3_32x4
2930 SAD_X3_32x4
2931 SAD_X3_32x4
2932 SAD_X3_32x4
2933 SAD_X3_32x4
2934 SAD_X3_32x4
2935 SAD_X3_END_SSE2 1
2936
2937cglobal pixel_sad_x3_32x32, 5, 7, 8
2938 pxor m0, m0
2939 pxor m1, m1
2940 pxor m2, m2
2941 mov r6, 32
2942
2943.loop:
2944 SAD_X3_32x4
2945 SAD_X3_32x4
2946 SAD_X3_32x4
2947 SAD_X3_32x4
2948
2949 sub r6, 16
2950 cmp r6, 0
2951 jnz .loop
2952 SAD_X3_END_SSE2 1
2953
2954cglobal pixel_sad_x3_32x64, 5, 7, 8
2955 pxor m0, m0
2956 pxor m1, m1
2957 pxor m2, m2
2958 mov r6, 64
2959
2960.loop1:
2961 SAD_X3_32x4
2962 SAD_X3_32x4
2963 SAD_X3_32x4
2964 SAD_X3_32x4
2965
2966 sub r6, 16
2967 cmp r6, 0
2968 jnz .loop1
2969 SAD_X3_END_SSE2 1
2970%endmacro
2971
2972%macro SAD_X4_W32 0
2973cglobal pixel_sad_x4_32x8, 6, 7, 8
2974 pxor m0, m0
2975 pxor m1, m1
2976 pxor m2, m2
2977 pxor m3, m3
2978
2979 SAD_X4_32x4
2980 SAD_X4_32x4
2981 SAD_X4_END_SSE2 1
2982
2983cglobal pixel_sad_x4_32x16, 6, 7, 8
2984 pxor m0, m0
2985 pxor m1, m1
2986 pxor m2, m2
2987 pxor m3, m3
2988
2989 SAD_X4_32x4
2990 SAD_X4_32x4
2991 SAD_X4_32x4
2992 SAD_X4_32x4
2993 SAD_X4_END_SSE2 1
2994
2995cglobal pixel_sad_x4_32x24, 6, 7, 8
2996 pxor m0, m0
2997 pxor m1, m1
2998 pxor m2, m2
2999 pxor m3, m3
3000
3001 SAD_X4_32x4
3002 SAD_X4_32x4
3003 SAD_X4_32x4
3004 SAD_X4_32x4
3005 SAD_X4_32x4
3006 SAD_X4_32x4
3007 SAD_X4_END_SSE2 1
3008
3009%if ARCH_X86_64 == 1
3010cglobal pixel_sad_x4_32x32, 6, 8, 8
3011%define count r7
3012%else
3013cglobal pixel_sad_x4_32x32, 6, 7, 8, 0-4
3014%define count dword [rsp]
3015%endif
3016 pxor m0, m0
3017 pxor m1, m1
3018 pxor m2, m2
3019 pxor m3, m3
3020 mov count, 32
3021
3022.loop:
3023 SAD_X4_32x4
3024 SAD_X4_32x4
3025 SAD_X4_32x4
3026 SAD_X4_32x4
3027
3028 sub count, 16
3029 jnz .loop
3030 SAD_X4_END_SSE2 1
3031
3032%if ARCH_X86_64 == 1
3033cglobal pixel_sad_x4_32x64, 6, 8, 8
3034%define count r7
3035%else
3036cglobal pixel_sad_x4_32x64, 6, 7, 8, 0-4
3037%define count dword [rsp]
3038%endif
3039 pxor m0, m0
3040 pxor m1, m1
3041 pxor m2, m2
3042 pxor m3, m3
3043 mov count, 64
3044
3045.loop:
3046 SAD_X4_32x4
3047 SAD_X4_32x4
3048 SAD_X4_32x4
3049 SAD_X4_32x4
3050
3051 sub count, 16
3052 jnz .loop
3053 SAD_X4_END_SSE2 1
3054
3055%endmacro
3056
3057%macro SAD_X3_W48 0
3058cglobal pixel_sad_x3_48x64, 5, 7, 8
3059 pxor m0, m0
3060 pxor m1, m1
3061 pxor m2, m2
3062 mov r6, 64
3063
3064.loop:
3065 SAD_X3_48x4
3066 SAD_X3_48x4
3067 SAD_X3_48x4
3068 SAD_X3_48x4
3069
3070 sub r6, 16
3071 jnz .loop
3072 SAD_X3_END_SSE2 1
3073%endmacro
3074
3075%macro SAD_X4_W48 0
3076%if ARCH_X86_64 == 1
3077cglobal pixel_sad_x4_48x64, 6, 8, 8
3078%define count r7
3079%else
3080cglobal pixel_sad_x4_48x64, 6, 7, 8, 0-4
3081%define count dword [rsp]
3082%endif
3083 pxor m0, m0
3084 pxor m1, m1
3085 pxor m2, m2
3086 pxor m3, m3
3087 mov count, 64
3088
3089.loop:
3090 SAD_X4_48x4
3091 SAD_X4_48x4
3092 SAD_X4_48x4
3093 SAD_X4_48x4
3094
3095 sub count, 16
3096 jnz .loop
3097 SAD_X4_END_SSE2 1
3098%endmacro
3099
3100%macro SAD_X3_W64 0
3101cglobal pixel_sad_x3_64x16, 5, 7, 7
3102 pxor m0, m0
3103 pxor m1, m1
3104 pxor m2, m2
3105 mov r6, 16
3106
3107.loop:
3108 SAD_X3_64x4
3109 SAD_X3_64x4
3110
3111 sub r6, 8
3112 jnz .loop
3113 SAD_X3_END_SSE2 1
3114
3115cglobal pixel_sad_x3_64x32, 5, 7, 7
3116 pxor m0, m0
3117 pxor m1, m1
3118 pxor m2, m2
3119 mov r6, 32
3120
3121.loop:
3122 SAD_X3_64x4
3123 SAD_X3_64x4
3124
3125 sub r6, 8
3126 jnz .loop
3127 SAD_X3_END_SSE2 1
3128
3129cglobal pixel_sad_x3_64x48, 5, 7, 7
3130 pxor m0, m0
3131 pxor m1, m1
3132 pxor m2, m2
3133 mov r6, 48
3134
3135.loop:
3136 SAD_X3_64x4
3137 SAD_X3_64x4
3138
3139 sub r6, 8
3140 jnz .loop
3141 SAD_X3_END_SSE2 1
3142
3143cglobal pixel_sad_x3_64x64, 5, 7, 7
3144 pxor m0, m0
3145 pxor m1, m1
3146 pxor m2, m2
3147 mov r6, 64
3148
3149.loop:
3150 SAD_X3_64x4
3151 SAD_X3_64x4
3152
3153 sub r6, 8
3154 jnz .loop
3155 SAD_X3_END_SSE2 1
3156%endmacro
3157
3158%macro SAD_X4_W64 0
3159%if ARCH_X86_64 == 1
3160cglobal pixel_sad_x4_64x16, 6, 8, 8
3161%define count r7
3162%else
3163cglobal pixel_sad_x4_64x16, 6, 7, 8, 0-4
3164%define count dword [rsp]
3165%endif
3166 pxor m0, m0
3167 pxor m1, m1
3168 pxor m2, m2
3169 pxor m3, m3
3170 mov count, 16
3171
3172.loop:
3173 SAD_X4_64x4
3174 SAD_X4_64x4
3175
3176 sub count, 8
3177 jnz .loop
3178 SAD_X4_END_SSE2 1
3179
3180%if ARCH_X86_64 == 1
3181cglobal pixel_sad_x4_64x32, 6, 8, 8
3182%define count r7
3183%else
3184cglobal pixel_sad_x4_64x32, 6, 7, 8, 0-4
3185%define count dword [rsp]
3186%endif
3187 pxor m0, m0
3188 pxor m1, m1
3189 pxor m2, m2
3190 pxor m3, m3
3191 mov count, 32
3192
3193.loop:
3194 SAD_X4_64x4
3195 SAD_X4_64x4
3196
3197 sub count, 8
3198 jnz .loop
3199 SAD_X4_END_SSE2 1
3200
3201%if ARCH_X86_64 == 1
3202cglobal pixel_sad_x4_64x48, 6, 8, 8
3203%define count r7
3204%else
3205cglobal pixel_sad_x4_64x48, 6, 7, 8, 0-4
3206%define count dword [rsp]
3207%endif
3208 pxor m0, m0
3209 pxor m1, m1
3210 pxor m2, m2
3211 pxor m3, m3
3212 mov count, 48
3213
3214.loop:
3215 SAD_X4_64x4
3216 SAD_X4_64x4
3217
3218 sub count, 8
3219 jnz .loop
3220 SAD_X4_END_SSE2 1
3221
3222%if ARCH_X86_64 == 1
3223cglobal pixel_sad_x4_64x64, 6, 8, 8
3224%define count r7
3225%else
3226cglobal pixel_sad_x4_64x64, 6, 7, 8, 0-4
3227%define count dword [rsp]
3228%endif
3229 pxor m0, m0
3230 pxor m1, m1
3231 pxor m2, m2
3232 pxor m3, m3
3233 mov count, 64
3234
3235.loop:
3236 SAD_X4_64x4
3237 SAD_X4_64x4
3238
3239 sub count, 8
3240 jnz .loop
3241 SAD_X4_END_SSE2 1
3242%endmacro
3243
3244INIT_XMM sse2
3245SAD_X_SSE2 3, 16, 16, 7
3246SAD_X_SSE2 3, 16, 8, 7
3247SAD_X_SSE2 3, 8, 16, 7
3248SAD_X_SSE2 3, 8, 8, 7
3249SAD_X_SSE2 3, 8, 4, 7
3250SAD_X_SSE2 4, 16, 16, 7
3251SAD_X_SSE2 4, 16, 8, 7
3252SAD_X_SSE2 4, 8, 16, 7
3253SAD_X_SSE2 4, 8, 8, 7
3254SAD_X_SSE2 4, 8, 4, 7
3255
3256INIT_XMM sse3
3257SAD_X_SSE2 3, 16, 16, 7
3258SAD_X_SSE2 3, 16, 8, 7
3259SAD_X_SSE2 3, 16, 4, 7
3260SAD_X_SSE2 4, 16, 16, 7
3261SAD_X_SSE2 4, 16, 8, 7
3262SAD_X_SSE2 4, 16, 4, 7
3263
3264INIT_XMM ssse3
3265SAD_X3_W12
3266SAD_X3_W32
3267SAD_X3_W24
3268SAD_X3_W48
3269SAD_X3_W64
3270SAD_X_SSE2 3, 16, 64, 7
3271SAD_X_SSE2 3, 16, 32, 7
3272SAD_X_SSE2 3, 16, 16, 7
3273SAD_X_SSE2 3, 16, 12, 7
3274SAD_X_SSE2 3, 16, 8, 7
3275SAD_X_SSE2 3, 8, 32, 7
3276SAD_X_SSE2 3, 8, 16, 7
3277SAD_X4_W12
3278SAD_X4_W24
3279SAD_X4_W32
3280SAD_X4_W48
3281SAD_X4_W64
3282SAD_X_SSE2 4, 16, 64, 7
3283SAD_X_SSE2 4, 16, 32, 7
3284SAD_X_SSE2 4, 16, 16, 7
3285SAD_X_SSE2 4, 16, 12, 7
3286SAD_X_SSE2 4, 16, 8, 7
3287SAD_X_SSE2 4, 8, 32, 7
3288SAD_X_SSE2 4, 8, 16, 7
3289SAD_X_SSE2 4, 8, 8, 7
3290SAD_X_SSE2 4, 8, 4, 7
3291
3292INIT_XMM avx
3293SAD_X3_W12
3294SAD_X3_W32
3295SAD_X3_W24
3296SAD_X3_W48
3297SAD_X3_W64
3298SAD_X_SSE2 3, 16, 64, 7
3299SAD_X_SSE2 3, 16, 32, 6
3300SAD_X_SSE2 3, 16, 16, 6
3301SAD_X_SSE2 3, 16, 12, 6
3302SAD_X_SSE2 3, 16, 8, 6
3303SAD_X_SSE2 3, 16, 4, 6
3304SAD_X4_W12
3305SAD_X4_W24
3306SAD_X4_W32
3307SAD_X4_W48
3308SAD_X4_W64
3309SAD_X_SSE2 4, 16, 64, 7
3310SAD_X_SSE2 4, 16, 32, 7
3311SAD_X_SSE2 4, 16, 16, 7
3312SAD_X_SSE2 4, 16, 12, 7
3313SAD_X_SSE2 4, 16, 8, 7
3314SAD_X_SSE2 4, 16, 4, 7
3315
3316%macro SAD_X_AVX2 4
3317cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,%4
3318%assign x 0
3319%rep %3/4
3320 SAD_X%1_4x%2P_AVX2 x, %3/4
3321%assign x x+1
3322%endrep
3323 SAD_X%1_END_AVX2
3324%endmacro
3325
3326INIT_YMM avx2
3327SAD_X_AVX2 3, 16, 32, 7
3328SAD_X_AVX2 3, 16, 16, 7
3329SAD_X_AVX2 3, 16, 12, 7
3330SAD_X_AVX2 3, 16, 8, 7
3331SAD_X_AVX2 4, 16, 32, 8
3332SAD_X_AVX2 4, 16, 16, 8
3333SAD_X_AVX2 4, 16, 12, 8
3334SAD_X_AVX2 4, 16, 8, 8
3335
3336;=============================================================================
3337; SAD cacheline split
3338;=============================================================================
3339
3340; Core2 (Conroe) can load unaligned data just as quickly as aligned data...
3341; unless the unaligned data spans the border between 2 cachelines, in which
3342; case it's really slow. The exact numbers may differ, but all Intel CPUs prior
3343; to Nehalem have a large penalty for cacheline splits.
3344; (8-byte alignment exactly halfway between two cachelines is OK, though.)
3345; LDDQU was supposed to fix this, but it only works on Pentium 4.
3346; So in the split case we load aligned data and explicitly perform the
3347; alignment between registers, much as on architectures that only have aligned
3348; loads, except complicated by the fact that PALIGNR takes only an immediate,
3349; not a variable alignment.
3350; It is also possible to hoist the realignment to the macroblock level (keep
3351; 2 copies of the reference frame, offset by 32 bytes), but the extra memory
3352; needed for that method often makes it slower.
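; A minimal scalar sketch of the register realignment used below, assuming a
; little-endian 64-bit qword and misalignment a = (uintptr_t)p & 7 with a != 0
; (the MMX2 loops do exactly this via psrlq/psllq, keeping 8*a in mm7 and
; 64-8*a in mm6):
;   const uint64_t *q = (const uint64_t *)(p - a);      /* aligned base        */
;   uint64_t v = (q[0] >> 8*a) | (q[1] << (64 - 8*a));  /* unaligned load at p */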
3353
3354; sad 16x16 costs on Core2:
3355; good offsets: 49 cycles (50/64 of all mvs)
3356; cacheline split: 234 cycles (14/64 of all mvs; amortized: +40 cycles)
3357; page split: 3600 cycles (14/4096 of all mvs; amortized: +11.5 cycles)
3358; cache or page split with palignr: 57 cycles (amortized: +2 cycles)
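; (e.g. the cacheline-split amortization above is (234-49)*14/64 ~= +40 cycles)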
3359
3360; computed jump assumes this loop is exactly 80 bytes
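; Dispatch arithmetic (see SAD16_CACHELINE_FUNC below): the caller jumps to
; r5 = sad_w16_addr + 80*align, with 80*align computed by lea r4,[r4*5];
; shl r4d,4. sad_w16_addr = align1 - (align2 - align1), i.e. where a
; hypothetical align0 loop would start, so align values 1..15 land exactly on
; their matching loop (the SSSE3 variant uses 64-byte entries and shl r4d,6).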
3361%macro SAD16_CACHELINE_LOOP_SSE2 1 ; alignment
3362ALIGN 16
3363sad_w16_align%1_sse2:
3364 movdqa xmm1, [r2+16]
3365 movdqa xmm2, [r2+r3+16]
3366 movdqa xmm3, [r2]
3367 movdqa xmm4, [r2+r3]
3368 pslldq xmm1, 16-%1
3369 pslldq xmm2, 16-%1
3370 psrldq xmm3, %1
3371 psrldq xmm4, %1
3372 por xmm1, xmm3
3373 por xmm2, xmm4
3374 psadbw xmm1, [r0]
3375 psadbw xmm2, [r0+r1]
3376 paddw xmm0, xmm1
3377 paddw xmm0, xmm2
3378 lea r0, [r0+2*r1]
3379 lea r2, [r2+2*r3]
3380 dec r4
3381 jg sad_w16_align%1_sse2
3382 ret
3383%endmacro
3384
3385; computed jump assumes this loop is exactly 64 bytes
3386%macro SAD16_CACHELINE_LOOP_SSSE3 1 ; alignment
3387ALIGN 16
3388sad_w16_align%1_ssse3:
3389 movdqa xmm1, [r2+16]
3390 movdqa xmm2, [r2+r3+16]
3391 palignr xmm1, [r2], %1
3392 palignr xmm2, [r2+r3], %1
3393 psadbw xmm1, [r0]
3394 psadbw xmm2, [r0+r1]
3395 paddw xmm0, xmm1
3396 paddw xmm0, xmm2
3397 lea r0, [r0+2*r1]
3398 lea r2, [r2+2*r3]
3399 dec r4
3400 jg sad_w16_align%1_ssse3
3401 ret
3402%endmacro
3403
3404%macro SAD16_CACHELINE_FUNC 2 ; cpu, height
3405cglobal pixel_sad_16x%2_cache64_%1
3406 mov eax, r2m
3407 and eax, 0x37
3408 cmp eax, 0x30
3409 jle pixel_sad_16x%2_sse2
3410 PROLOGUE 4,6
3411 mov r4d, r2d
3412 and r4d, 15
3413%ifidn %1, ssse3
3414 shl r4d, 6 ; code size = 64
3415%else
3416 lea r4, [r4*5]
3417 shl r4d, 4 ; code size = 80
3418%endif
3419%define sad_w16_addr (sad_w16_align1_%1 + (sad_w16_align1_%1 - sad_w16_align2_%1))
3420%ifdef PIC
3421 lea r5, [sad_w16_addr]
3422 add r5, r4
3423%else
3424 lea r5, [sad_w16_addr + r4]
3425%endif
3426 and r2, ~15
3427 mov r4d, %2/2
3428 pxor xmm0, xmm0
3429 call r5
3430 movhlps xmm1, xmm0
3431 paddw xmm0, xmm1
3432 movd eax, xmm0
3433 RET
3434%endmacro
3435
3436%macro SAD_CACHELINE_START_MMX2 4 ; width, height, iterations, cacheline
3437 mov eax, r2m
3438 and eax, 0x17|%1|(%4>>1)
3439 cmp eax, 0x10|%1|(%4>>1)
3440 jle pixel_sad_%1x%2_mmx2
3441 and eax, 7
3442 shl eax, 3
3443 movd mm6, [sw_64]
3444 movd mm7, eax
3445 psubw mm6, mm7
3446 PROLOGUE 4,5
3447 and r2, ~7
3448 mov r4d, %3
3449 pxor mm0, mm0
3450%endmacro
3451
3452%macro SAD16_CACHELINE_FUNC_MMX2 2 ; height, cacheline
3453cglobal pixel_sad_16x%1_cache%2_mmx2
3454 SAD_CACHELINE_START_MMX2 16, %1, %1, %2
3455.loop:
3456 movq mm1, [r2]
3457 movq mm2, [r2+8]
3458 movq mm3, [r2+16]
3459 movq mm4, mm2
3460 psrlq mm1, mm7
3461 psllq mm2, mm6
3462 psllq mm3, mm6
3463 psrlq mm4, mm7
3464 por mm1, mm2
3465 por mm3, mm4
3466 psadbw mm1, [r0]
3467 psadbw mm3, [r0+8]
3468 paddw mm0, mm1
3469 paddw mm0, mm3
3470 add r2, r3
3471 add r0, r1
3472 dec r4
3473 jg .loop
3474 movd eax, mm0
3475 RET
3476%endmacro
3477
3478%macro SAD8_CACHELINE_FUNC_MMX2 2 ; height, cacheline
3479cglobal pixel_sad_8x%1_cache%2_mmx2
3480 SAD_CACHELINE_START_MMX2 8, %1, %1/2, %2
3481.loop:
3482 movq mm1, [r2+8]
3483 movq mm2, [r2+r3+8]
3484 movq mm3, [r2]
3485 movq mm4, [r2+r3]
3486 psllq mm1, mm6
3487 psllq mm2, mm6
3488 psrlq mm3, mm7
3489 psrlq mm4, mm7
3490 por mm1, mm3
3491 por mm2, mm4
3492 psadbw mm1, [r0]
3493 psadbw mm2, [r0+r1]
3494 paddw mm0, mm1
3495 paddw mm0, mm2
3496 lea r2, [r2+2*r3]
3497 lea r0, [r0+2*r1]
3498 dec r4
3499 jg .loop
3500 movd eax, mm0
3501 RET
3502%endmacro
3503
3504; sad_x3/x4_cache64: check each mv.
3505; if they're all within a cacheline, use normal sad_x3/x4.
3506; otherwise, send them individually to sad_cache64.
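; For example, with width 16 and a 64-byte cacheline, CHECK_SPLIT reduces to
; ((pix & 0x37) > 0x30): take the split path only when the 16-byte load at pix
; would straddle a cacheline boundary and pix is not 8-byte aligned.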
3507%macro CHECK_SPLIT 3 ; pix, width, cacheline
3508 mov eax, %1
3509 and eax, 0x17|%2|(%3>>1)
3510 cmp eax, 0x10|%2|(%3>>1)
3511 jg .split
3512%endmacro
3513
3514%macro SADX3_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name
3515cglobal pixel_sad_x3_%1x%2_cache%3_%6
3516 CHECK_SPLIT r1m, %1, %3
3517 CHECK_SPLIT r2m, %1, %3
3518 CHECK_SPLIT r3m, %1, %3
3519 jmp pixel_sad_x3_%1x%2_%4
3520.split:
3521%if ARCH_X86_64
3522 PROLOGUE 6,9
3523 push r3
3524 push r2
3525%if WIN64
3526 movsxd r4, r4d
3527 sub rsp, 40 ; shadow space and alignment
3528%endif
3529 mov r2, r1
3530 mov r1, FENC_STRIDE
3531 mov r3, r4
3532 mov r7, r0
3533 mov r8, r5
3534 call pixel_sad_%1x%2_cache%3_%5
3535 mov [r8], eax
3536%if WIN64
3537 mov r2, [rsp+40+0*8]
3538%else
3539 pop r2
3540%endif
3541 mov r0, r7
3542 call pixel_sad_%1x%2_cache%3_%5
3543 mov [r8+4], eax
3544%if WIN64
3545 mov r2, [rsp+40+1*8]
3546%else
3547 pop r2
3548%endif
3549 mov r0, r7
3550 call pixel_sad_%1x%2_cache%3_%5
3551 mov [r8+8], eax
3552%if WIN64
3553 add rsp, 40+2*8
3554%endif
3555 RET
3556%else
3557 push edi
3558 mov edi, [esp+28]
3559 push dword [esp+24]
3560 push dword [esp+16]
3561 push dword 16
3562 push dword [esp+20]
3563 call pixel_sad_%1x%2_cache%3_%5
3564 mov ecx, [esp+32]
3565 mov [edi], eax
3566 mov [esp+8], ecx
3567 call pixel_sad_%1x%2_cache%3_%5
3568 mov ecx, [esp+36]
3569 mov [edi+4], eax
3570 mov [esp+8], ecx
3571 call pixel_sad_%1x%2_cache%3_%5
3572 mov [edi+8], eax
3573 add esp, 16
3574 pop edi
3575 ret
3576%endif
3577%endmacro
3578
3579%macro SADX4_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name
3580cglobal pixel_sad_x4_%1x%2_cache%3_%6
3581 CHECK_SPLIT r1m, %1, %3
3582 CHECK_SPLIT r2m, %1, %3
3583 CHECK_SPLIT r3m, %1, %3
3584 CHECK_SPLIT r4m, %1, %3
3585 jmp pixel_sad_x4_%1x%2_%4
3586.split:
3587%if ARCH_X86_64
3588 PROLOGUE 6,9
3589 mov r8, r6mp
3590 push r4
3591 push r3
3592 push r2
3593%if WIN64
3594 sub rsp, 32 ; shadow space
3595%endif
3596 mov r2, r1
3597 mov r1, FENC_STRIDE
3598 mov r3, r5
3599 mov r7, r0
3600 call pixel_sad_%1x%2_cache%3_%5
3601 mov [r8], eax
3602%if WIN64
3603 mov r2, [rsp+32+0*8]
3604%else
3605 pop r2
3606%endif
3607 mov r0, r7
3608 call pixel_sad_%1x%2_cache%3_%5
3609 mov [r8+4], eax
3610%if WIN64
3611 mov r2, [rsp+32+1*8]
3612%else
3613 pop r2
3614%endif
3615 mov r0, r7
3616 call pixel_sad_%1x%2_cache%3_%5
3617 mov [r8+8], eax
3618%if WIN64
3619 mov r2, [rsp+32+2*8]
3620%else
3621 pop r2
3622%endif
3623 mov r0, r7
3624 call pixel_sad_%1x%2_cache%3_%5
3625 mov [r8+12], eax
3626%if WIN64
3627 add rsp, 32+3*8
3628%endif
3629 RET
3630%else
3631 push edi
3632 mov edi, [esp+32]
3633 push dword [esp+28]
3634 push dword [esp+16]
3635 push dword 16
3636 push dword [esp+20]
3637 call pixel_sad_%1x%2_cache%3_%5
3638 mov ecx, [esp+32]
3639 mov [edi], eax
3640 mov [esp+8], ecx
3641 call pixel_sad_%1x%2_cache%3_%5
3642 mov ecx, [esp+36]
3643 mov [edi+4], eax
3644 mov [esp+8], ecx
3645 call pixel_sad_%1x%2_cache%3_%5
3646 mov ecx, [esp+40]
3647 mov [edi+8], eax
3648 mov [esp+8], ecx
3649 call pixel_sad_%1x%2_cache%3_%5
3650 mov [edi+12], eax
3651 add esp, 16
3652 pop edi
3653 ret
3654%endif
3655%endmacro
3656
3657%macro SADX34_CACHELINE_FUNC 1+
3658 SADX3_CACHELINE_FUNC %1
3659 SADX4_CACHELINE_FUNC %1
3660%endmacro
3661
3662
3663; instantiate the aligned sads
3664
3665INIT_MMX
3666%if ARCH_X86_64 == 0
3667SAD16_CACHELINE_FUNC_MMX2 8, 32
3668SAD16_CACHELINE_FUNC_MMX2 16, 32
3669SAD8_CACHELINE_FUNC_MMX2 4, 32
3670SAD8_CACHELINE_FUNC_MMX2 8, 32
3671SAD8_CACHELINE_FUNC_MMX2 16, 32
3672SAD16_CACHELINE_FUNC_MMX2 8, 64
3673SAD16_CACHELINE_FUNC_MMX2 16, 64
3674%endif ; !ARCH_X86_64
3675SAD8_CACHELINE_FUNC_MMX2 4, 64
3676SAD8_CACHELINE_FUNC_MMX2 8, 64
3677SAD8_CACHELINE_FUNC_MMX2 16, 64
3678
3679%if ARCH_X86_64 == 0
3680SADX34_CACHELINE_FUNC 16, 16, 32, mmx2, mmx2, mmx2
3681SADX34_CACHELINE_FUNC 16, 8, 32, mmx2, mmx2, mmx2
3682SADX34_CACHELINE_FUNC 8, 16, 32, mmx2, mmx2, mmx2
3683SADX34_CACHELINE_FUNC 8, 8, 32, mmx2, mmx2, mmx2
3684SADX34_CACHELINE_FUNC 16, 16, 64, mmx2, mmx2, mmx2
3685SADX34_CACHELINE_FUNC 16, 8, 64, mmx2, mmx2, mmx2
3686%endif ; !ARCH_X86_64
3687SADX34_CACHELINE_FUNC 8, 16, 64, mmx2, mmx2, mmx2
3688SADX34_CACHELINE_FUNC 8, 8, 64, mmx2, mmx2, mmx2
3689
3690%if ARCH_X86_64 == 0
3691SAD16_CACHELINE_FUNC sse2, 8
3692SAD16_CACHELINE_FUNC sse2, 16
3693%assign i 1
3694%rep 15
3695SAD16_CACHELINE_LOOP_SSE2 i
3696%assign i i+1
3697%endrep
3698SADX34_CACHELINE_FUNC 16, 16, 64, sse2, sse2, sse2
3699SADX34_CACHELINE_FUNC 16, 8, 64, sse2, sse2, sse2
3700%endif ; !ARCH_X86_64
3701SADX34_CACHELINE_FUNC 8, 16, 64, sse2, mmx2, sse2
3702
3703SAD16_CACHELINE_FUNC ssse3, 8
3704SAD16_CACHELINE_FUNC ssse3, 16
3705%assign i 1
3706%rep 15
3707SAD16_CACHELINE_LOOP_SSSE3 i
3708%assign i i+1
3709%endrep
3710SADX34_CACHELINE_FUNC 16, 16, 64, sse2, ssse3, ssse3
3711SADX34_CACHELINE_FUNC 16, 8, 64, sse2, ssse3, ssse3
3712