;*****************************************************************************
;* SIMD-optimized motion estimation
;*****************************************************************************
;* Copyright (c) 2000, 2001 Fabrice Bellard
;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pb_1
cextern pb_80

SECTION .text

%macro DIFF_PIXELS_1 4
    movh %1, %3
    movh %2, %4
    punpcklbw %2, %1
    punpcklbw %1, %1
    psubw %1, %2
%endmacro

; %1=uint8_t *pix1, %2=uint8_t *pix2, %3=static offset, %4=stride, %5=stride*3
; %6=temporary storage location
; this macro requires $mmsize stack space (aligned) on %6 (except on SSE+x86-64)
%macro DIFF_PIXELS_8 6
    DIFF_PIXELS_1 m0, m7, [%1 +%3], [%2 +%3]
    DIFF_PIXELS_1 m1, m7, [%1+%4 +%3], [%2+%4 +%3]
    DIFF_PIXELS_1 m2, m7, [%1+%4*2+%3], [%2+%4*2+%3]
    add %1, %5
    add %2, %5
    DIFF_PIXELS_1 m3, m7, [%1 +%3], [%2 +%3]
    DIFF_PIXELS_1 m4, m7, [%1+%4 +%3], [%2+%4 +%3]
    DIFF_PIXELS_1 m5, m7, [%1+%4*2+%3], [%2+%4*2+%3]
    DIFF_PIXELS_1 m6, m7, [%1+%5 +%3], [%2+%5 +%3]
%ifdef m8
    DIFF_PIXELS_1 m7, m8, [%1+%4*4+%3], [%2+%4*4+%3]
%else
    mova [%6], m0
    DIFF_PIXELS_1 m7, m0, [%1+%4*4+%3], [%2+%4*4+%3]
    mova m0, [%6]
%endif
    sub %1, %5
    sub %2, %5
%endmacro

%macro HADAMARD8 0
    SUMSUB_BADC w, 0, 1, 2, 3
    SUMSUB_BADC w, 4, 5, 6, 7
    SUMSUB_BADC w, 0, 2, 1, 3
    SUMSUB_BADC w, 4, 6, 5, 7
    SUMSUB_BADC w, 0, 4, 1, 5
    SUMSUB_BADC w, 2, 6, 3, 7
%endmacro
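
; The three rounds of SUMSUB_BADC above are the butterfly network of an
; 8-point Hadamard transform, applied to the eight word vectors m0..m7 that
; DIFF_PIXELS_8 fills with pix1-pix2 differences.  A rough C sketch of one
; lane (illustration only; the asm may leave a difference lane with the
; opposite sign, which is irrelevant once absolute values are taken):
;
;   static void hadamard8_1d(int v[8])
;   {
;       for (int step = 1; step < 8; step <<= 1)        // 3 rounds: 1, 2, 4
;           for (int i = 0; i < 8; i += 2 * step)
;               for (int j = i; j < i + step; j++) {
;                   int a = v[j], b = v[j + step];
;                   v[j]        = a + b;                // sum ...
;                   v[j + step] = a - b;                // ... and difference
;               }
;   }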

%macro ABS1_SUM 3
    ABS1 %1, %2
    paddusw %3, %1
%endmacro

%macro ABS2_SUM 6
    ABS2 %1, %2, %3, %4
    paddusw %5, %1
    paddusw %6, %2
%endmacro

%macro ABS_SUM_8x8_64 1
    ABS2 m0, m1, m8, m9
    ABS2_SUM m2, m3, m8, m9, m0, m1
    ABS2_SUM m4, m5, m8, m9, m0, m1
    ABS2_SUM m6, m7, m8, m9, m0, m1
    paddusw m0, m1
%endmacro

%macro ABS_SUM_8x8_32 1
    mova [%1], m7
    ABS1 m0, m7
    ABS1 m1, m7
    ABS1_SUM m2, m7, m0
    ABS1_SUM m3, m7, m1
    ABS1_SUM m4, m7, m0
    ABS1_SUM m5, m7, m1
    ABS1_SUM m6, m7, m0
    mova m2, [%1]
    ABS1_SUM m2, m7, m1
    paddusw m0, m1
%endmacro

; FIXME: HSUM saturates at 64k, while an 8x8 hadamard or dct block can get up to
; about 100k on extreme inputs. But that's very unlikely to occur in natural video,
; and it's even more unlikely to not have any alternative mvs/modes with lower cost.
%macro HSUM 3
%if cpuflag(sse2)
    movhlps %2, %1
    paddusw %1, %2
    pshuflw %2, %1, 0xE
    paddusw %1, %2
    pshuflw %2, %1, 0x1
    paddusw %1, %2
    movd %3, %1
%elif cpuflag(mmxext)
    pshufw %2, %1, 0xE
    paddusw %1, %2
    pshufw %2, %1, 0x1
    paddusw %1, %2
    movd %3, %1
%elif cpuflag(mmx)
    mova %2, %1
    psrlq %1, 32
    paddusw %1, %2
    mova %2, %1
    psrlq %1, 16
    paddusw %1, %2
    movd %3, %1
%endif
%endmacro
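
; Minimal C sketch (illustration only; helper name made up) of what HSUM
; leaves in %3: a saturating horizontal add of the 16-bit lanes of %1.  The
; asm reduces pairwise, so the exact clipping points can differ from this
; sequential version, but only once the sum has already saturated:
;
;   static unsigned hsum_words(const uint16_t *v, int lanes /* 4 or 8 */)
;   {
;       unsigned sum = 0;
;       for (int i = 0; i < lanes; i++) {
;           sum += v[i];
;           if (sum > 0xFFFF)           // paddusw saturates at 64k
;               sum = 0xFFFF;
;       }
;       return sum;
;   }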

%macro STORE4 5
    mova [%1+mmsize*0], %2
    mova [%1+mmsize*1], %3
    mova [%1+mmsize*2], %4
    mova [%1+mmsize*3], %5
%endmacro

%macro LOAD4 5
    mova %2, [%1+mmsize*0]
    mova %3, [%1+mmsize*1]
    mova %4, [%1+mmsize*2]
    mova %5, [%1+mmsize*3]
%endmacro

%macro hadamard8_16_wrapper 2
cglobal hadamard8_diff, 4, 4, %1
%ifndef m8
    %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
    SUB rsp, pad
%endif
    call hadamard8x8_diff %+ SUFFIX
%ifndef m8
    ADD rsp, pad
%endif
    RET

cglobal hadamard8_diff16, 5, 6, %1
%ifndef m8
    %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
    SUB rsp, pad
%endif

    call hadamard8x8_diff %+ SUFFIX
    mov r5d, eax

    add r1, 8
    add r2, 8
    call hadamard8x8_diff %+ SUFFIX
    add r5d, eax

    cmp r4d, 16
    jne .done

    lea r1, [r1+r3*8-8]
    lea r2, [r2+r3*8-8]
    call hadamard8x8_diff %+ SUFFIX
    add r5d, eax

    add r1, 8
    add r2, 8
    call hadamard8x8_diff %+ SUFFIX
    add r5d, eax

.done:
    mov eax, r5d
%ifndef m8
    ADD rsp, pad
%endif
    RET
%endmacro

%macro HADAMARD8_DIFF 0-1
%if cpuflag(sse2)
hadamard8x8_diff %+ SUFFIX:
    lea r0, [r3*3]
    DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize
    HADAMARD8
%if ARCH_X86_64
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [rsp+gprsize], [rsp+mmsize+gprsize]
%endif
    HADAMARD8
    ABS_SUM_8x8 rsp+gprsize
    HSUM m0, m1, eax
    and eax, 0xFFFF
    ret

hadamard8_16_wrapper %1, 3
%elif cpuflag(mmx)
ALIGN 16
; int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,
;                               uint8_t *src2, ptrdiff_t stride, int h)
; r0 = void *s = unused, int h = unused (always 8)
; note how r1, r2 and r3 are not clobbered in this function, so 16x16
; can simply call this 2x2 times (and that's why we access rsp+gprsize
; everywhere, which is the rsp of the calling function)
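;
; rough C outline (illustration only; hadamard8x8() stands for one call of
; this label) of how hadamard8_16_wrapper above builds hadamard8_diff16 on
; top of the 8x8 kernel:
;
;   int hadamard8_diff16(void *s, uint8_t *src1, uint8_t *src2,
;                        ptrdiff_t stride, int h)
;   {
;       int sum = hadamard8x8(s, src1,     src2,     stride, 8)
;               + hadamard8x8(s, src1 + 8, src2 + 8, stride, 8);
;       if (h == 16) {
;           src1 += 8 * stride;
;           src2 += 8 * stride;
;           sum += hadamard8x8(s, src1,     src2,     stride, 8)
;                + hadamard8x8(s, src1 + 8, src2 + 8, stride, 8);
;       }
;       return sum;
;   }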
hadamard8x8_diff %+ SUFFIX:
    lea r0, [r3*3]

    ; first 4x8 pixels
    DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize+0x60
    HADAMARD8
    mova [rsp+gprsize+0x60], m7
    TRANSPOSE4x4W 0, 1, 2, 3, 7
    STORE4 rsp+gprsize, m0, m1, m2, m3
    mova m7, [rsp+gprsize+0x60]
    TRANSPOSE4x4W 4, 5, 6, 7, 0
    STORE4 rsp+gprsize+0x40, m4, m5, m6, m7

    ; second 4x8 pixels
    DIFF_PIXELS_8 r1, r2, 4, r3, r0, rsp+gprsize+0x60
    HADAMARD8
    mova [rsp+gprsize+0x60], m7
    TRANSPOSE4x4W 0, 1, 2, 3, 7
    STORE4 rsp+gprsize+0x20, m0, m1, m2, m3
    mova m7, [rsp+gprsize+0x60]
    TRANSPOSE4x4W 4, 5, 6, 7, 0

    LOAD4 rsp+gprsize+0x40, m0, m1, m2, m3
    HADAMARD8
    ABS_SUM_8x8_32 rsp+gprsize+0x60
    mova [rsp+gprsize+0x60], m0

    LOAD4 rsp+gprsize, m0, m1, m2, m3
    LOAD4 rsp+gprsize+0x20, m4, m5, m6, m7
    HADAMARD8
    ABS_SUM_8x8_32 rsp+gprsize
    paddusw m0, [rsp+gprsize+0x60]

    HSUM m0, m1, eax
    and rax, 0xFFFF
    ret

hadamard8_16_wrapper 0, 14
%endif
%endmacro

INIT_MMX mmx
HADAMARD8_DIFF

INIT_MMX mmxext
HADAMARD8_DIFF

INIT_XMM sse2
%if ARCH_X86_64
%define ABS_SUM_8x8 ABS_SUM_8x8_64
%else
%define ABS_SUM_8x8 ABS_SUM_8x8_32
%endif
HADAMARD8_DIFF 10

INIT_XMM ssse3
%define ABS_SUM_8x8 ABS_SUM_8x8_64
HADAMARD8_DIFF 9

; int ff_sse*_*(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
;               ptrdiff_t line_size, int h)

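; Rough C sketch (illustration only; the _ref name is made up) of the value
; these kernels return, shown for the 16-pixel-wide case:
;
;   int sse16_ref(void *v, const uint8_t *pix1, const uint8_t *pix2,
;                 ptrdiff_t line_size, int h)
;   {
;       int sum = 0;
;       for (int y = 0; y < h; y++) {
;           for (int x = 0; x < 16; x++) {
;               int d = pix1[x] - pix2[x];
;               sum  += d * d;
;           }
;           pix1 += line_size;
;           pix2 += line_size;
;       }
;       return sum;
;   }
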
%macro SUM_SQUARED_ERRORS 1
cglobal sse%1, 5,5,8, v, pix1, pix2, lsize, h
%if %1 == mmsize
    shr hd, 1
%endif
    pxor m0, m0 ; mm0 = 0
    pxor m7, m7 ; mm7 holds the sum

.next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned
    movu m1, [pix1q] ; m1 = pix1[0][0-15], [0-7] for mmx
    movu m2, [pix2q] ; m2 = pix2[0][0-15], [0-7] for mmx
%if %1 == mmsize
    movu m3, [pix1q+lsizeq] ; m3 = pix1[1][0-15], [0-7] for mmx
    movu m4, [pix2q+lsizeq] ; m4 = pix2[1][0-15], [0-7] for mmx
%else ; %1 / 2 == mmsize; mmx only
    mova m3, [pix1q+8] ; m3 = pix1[0][8-15]
    mova m4, [pix2q+8] ; m4 = pix2[0][8-15]
%endif

    ; todo: mm1-mm2, mm3-mm4
    ; algo: subtract mm1 from mm2 with saturation and vice versa
    ;       OR the result to get the absolute difference
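    ; worked example of the trick: for unsigned bytes a and b,
    ;   psubusb(a, b) | psubusb(b, a) == |a - b|
    ; because one of the two saturated differences is always 0, e.g.
    ; a = 3, b = 10:  psubusb(3, 10) = 0, psubusb(10, 3) = 7, 0|7 = 7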
    mova m5, m1
    mova m6, m3
    psubusb m1, m2
    psubusb m3, m4
    psubusb m2, m5
    psubusb m4, m6

    por m2, m1
    por m4, m3

    ; now convert to 16-bit vectors so we can square them
    mova m1, m2
    mova m3, m4

    punpckhbw m2, m0
    punpckhbw m4, m0
    punpcklbw m1, m0 ; mm1 not spread over (mm1,mm2)
    punpcklbw m3, m0 ; mm4 not spread over (mm3,mm4)

    pmaddwd m2, m2
    pmaddwd m4, m4
    pmaddwd m1, m1
    pmaddwd m3, m3

    paddd m1, m2
    paddd m3, m4
    paddd m7, m1
    paddd m7, m3

%if %1 == mmsize
    lea pix1q, [pix1q + 2*lsizeq]
    lea pix2q, [pix2q + 2*lsizeq]
%else
    add pix1q, lsizeq
    add pix2q, lsizeq
%endif
    dec hd
    jnz .next2lines

    HADDD m7, m1
    movd eax, m7 ; return value
    RET
%endmacro

INIT_MMX mmx
SUM_SQUARED_ERRORS 8

INIT_MMX mmx
SUM_SQUARED_ERRORS 16

INIT_XMM sse2
SUM_SQUARED_ERRORS 16

;-----------------------------------------------
;int ff_sum_abs_dctelem(int16_t *block)
;-----------------------------------------------
; %1 = number of xmm registers used
; %2 = number of inline loops

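; Rough C sketch (illustration only; the _ref name is made up) of the
; returned value: the sum of |coefficient| over the 64 dct elements, masked
; to 16 bits (the saturating paddusw/HSUM path may clip extreme inputs
; slightly differently):
;
;   int sum_abs_dctelem_ref(const int16_t block[64])
;   {
;       int sum = 0;
;       for (int i = 0; i < 64; i++)
;           sum += block[i] < 0 ? -block[i] : block[i];
;       return sum & 0xFFFF;
;   }
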
%macro SUM_ABS_DCTELEM 2
cglobal sum_abs_dctelem, 1, 1, %1, block
    pxor m0, m0
    pxor m1, m1
%assign %%i 0
%rep %2
    mova m2, [blockq+mmsize*(0+%%i)]
    mova m3, [blockq+mmsize*(1+%%i)]
    mova m4, [blockq+mmsize*(2+%%i)]
    mova m5, [blockq+mmsize*(3+%%i)]
    ABS1_SUM m2, m6, m0
    ABS1_SUM m3, m6, m1
    ABS1_SUM m4, m6, m0
    ABS1_SUM m5, m6, m1
%assign %%i %%i+4
%endrep
    paddusw m0, m1
    HSUM m0, m1, eax
    and eax, 0xFFFF
    RET
%endmacro

INIT_MMX mmx
SUM_ABS_DCTELEM 0, 4
INIT_MMX mmxext
SUM_ABS_DCTELEM 0, 4
INIT_XMM sse2
SUM_ABS_DCTELEM 7, 2
INIT_XMM ssse3
SUM_ABS_DCTELEM 6, 2

;------------------------------------------------------------------------------
; int ff_hf_noise*_mmx(uint8_t *pix1, ptrdiff_t lsize, int h)
;------------------------------------------------------------------------------
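; Rough C sketch of the measure accumulated below (illustration only; a
; hedged reading of the asm, shown for the 8-wide case): the sum of absolute
; second-order differences, i.e. horizontal first differences compared
; between adjacent rows:
;
;   int hf_noise8_ref(const uint8_t *pix1, ptrdiff_t lsize, int h)
;   {
;       int sum = 0;
;       for (int y = 0; y < h - 1; y++) {
;           for (int x = 0; x < 7; x++) {
;               int d0 = pix1[x]         - pix1[x + 1];
;               int d1 = pix1[x + lsize] - pix1[x + lsize + 1];
;               sum   += d0 > d1 ? d0 - d1 : d1 - d0;
;           }
;           pix1 += lsize;
;       }
;       return sum;
;   }
;
; (the 16-wide variant only differs in how the x+1 column is loaded, see
; HF_NOISE_PART1 below)
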
; %1 = 8/16. %2-5=m#
%macro HF_NOISE_PART1 5
    mova m%2, [pix1q]
%if %1 == 8
    mova m%3, m%2
    psllq m%2, 8
    psrlq m%3, 8
    psrlq m%2, 8
%else
    mova m%3, [pix1q+1]
%endif
    mova m%4, m%2
    mova m%5, m%3
    punpcklbw m%2, m7
    punpcklbw m%3, m7
    punpckhbw m%4, m7
    punpckhbw m%5, m7
    psubw m%2, m%3
    psubw m%4, m%5
%endmacro

; %1-4 = m#
%macro HF_NOISE_PART2 4
    psubw m%1, m%3
    psubw m%2, m%4
    pxor m3, m3
    pxor m1, m1
    pcmpgtw m3, m%1
    pcmpgtw m1, m%2
    pxor m%1, m3
    pxor m%2, m1
    psubw m%1, m3
    psubw m%2, m1
    paddw m%2, m%1
    paddw m6, m%2
%endmacro

; %1 = 8/16
%macro HF_NOISE 1
cglobal hf_noise%1, 3,3,0, pix1, lsize, h
    sub hd, 2
    pxor m7, m7
    pxor m6, m6
    HF_NOISE_PART1 %1, 0, 1, 2, 3
    add pix1q, lsizeq
    HF_NOISE_PART1 %1, 4, 1, 5, 3
    HF_NOISE_PART2 0, 2, 4, 5
    add pix1q, lsizeq
.loop:
    HF_NOISE_PART1 %1, 0, 1, 2, 3
    HF_NOISE_PART2 4, 5, 0, 2
    add pix1q, lsizeq
    HF_NOISE_PART1 %1, 4, 1, 5, 3
    HF_NOISE_PART2 0, 2, 4, 5
    add pix1q, lsizeq
    sub hd, 2
    jne .loop

    mova m0, m6
    punpcklwd m0, m7
    punpckhwd m6, m7
    paddd m6, m0
    mova m0, m6
    psrlq m6, 32
    paddd m0, m6
    movd eax, m0 ; eax = result of hf_noise8;
    REP_RET ; return eax;
%endmacro

INIT_MMX mmx
HF_NOISE 8
HF_NOISE 16

;---------------------------------------------------------------------------------------
;int ff_sad_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h);
;---------------------------------------------------------------------------------------
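; Rough C sketch (illustration only; the _ref name is made up, 16-wide case)
; of the plain SAD these kernels compute with psadbw:
;
;   int sad16_ref(void *v, const uint8_t *pix1, const uint8_t *pix2,
;                 ptrdiff_t stride, int h)
;   {
;       int sum = 0;
;       for (int y = 0; y < h; y++) {
;           for (int x = 0; x < 16; x++) {
;               int d = pix1[x] - pix2[x];
;               sum  += d < 0 ? -d : d;
;           }
;           pix1 += stride;
;           pix2 += stride;
;       }
;       return sum;
;   }
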
;%1 = 8/16
%macro SAD 1
cglobal sad%1, 5, 5, 3, v, pix1, pix2, stride, h
    movu m2, [pix2q]
    movu m1, [pix2q+strideq]
    psadbw m2, [pix1q]
    psadbw m1, [pix1q+strideq]
    paddw m2, m1
%if %1 != mmsize
    movu m0, [pix2q+8]
    movu m1, [pix2q+strideq+8]
    psadbw m0, [pix1q+8]
    psadbw m1, [pix1q+strideq+8]
    paddw m2, m0
    paddw m2, m1
%endif
    sub hd, 2

align 16
.loop:
    lea pix1q, [pix1q+strideq*2]
    lea pix2q, [pix2q+strideq*2]
    movu m0, [pix2q]
    movu m1, [pix2q+strideq]
    psadbw m0, [pix1q]
    psadbw m1, [pix1q+strideq]
    paddw m2, m0
    paddw m2, m1
%if %1 != mmsize
    movu m0, [pix2q+8]
    movu m1, [pix2q+strideq+8]
    psadbw m0, [pix1q+8]
    psadbw m1, [pix1q+strideq+8]
    paddw m2, m0
    paddw m2, m1
%endif
    sub hd, 2
    jg .loop
%if mmsize == 16
    movhlps m0, m2
    paddw m2, m0
%endif
    movd eax, m2
    RET
%endmacro

INIT_MMX mmxext
SAD 8
SAD 16
INIT_XMM sse2
SAD 16

;------------------------------------------------------------------------------------------
;int ff_sad_x2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h);
;------------------------------------------------------------------------------------------
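; Same SAD as above, but pix2 is interpolated at the horizontal half-pel
; position first.  Rough C sketch (illustration only; _ref name made up,
; 16-wide case); note that pavgb rounds up, i.e. (a + b + 1) >> 1:
;
;   int sad16_x2_ref(void *v, const uint8_t *pix1, const uint8_t *pix2,
;                    ptrdiff_t stride, int h)
;   {
;       int sum = 0;
;       for (int y = 0; y < h; y++) {
;           for (int x = 0; x < 16; x++) {
;               int p = (pix2[x] + pix2[x + 1] + 1) >> 1;
;               int d = pix1[x] - p;
;               sum  += d < 0 ? -d : d;
;           }
;           pix1 += stride;
;           pix2 += stride;
;       }
;       return sum;
;   }
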
;%1 = 8/16
%macro SAD_X2 1
cglobal sad%1_x2, 5, 5, 5, v, pix1, pix2, stride, h
    movu m0, [pix2q]
    movu m2, [pix2q+strideq]
%if mmsize == 16
    movu m3, [pix2q+1]
    movu m4, [pix2q+strideq+1]
    pavgb m0, m3
    pavgb m2, m4
%else
    pavgb m0, [pix2q+1]
    pavgb m2, [pix2q+strideq+1]
%endif
    psadbw m0, [pix1q]
    psadbw m2, [pix1q+strideq]
    paddw m0, m2
%if %1 != mmsize
    movu m1, [pix2q+8]
    movu m2, [pix2q+strideq+8]
    pavgb m1, [pix2q+9]
    pavgb m2, [pix2q+strideq+9]
    psadbw m1, [pix1q+8]
    psadbw m2, [pix1q+strideq+8]
    paddw m0, m1
    paddw m0, m2
%endif
    sub hd, 2

align 16
.loop:
    lea pix1q, [pix1q+2*strideq]
    lea pix2q, [pix2q+2*strideq]
    movu m1, [pix2q]
    movu m2, [pix2q+strideq]
%if mmsize == 16
    movu m3, [pix2q+1]
    movu m4, [pix2q+strideq+1]
    pavgb m1, m3
    pavgb m2, m4
%else
    pavgb m1, [pix2q+1]
    pavgb m2, [pix2q+strideq+1]
%endif
    psadbw m1, [pix1q]
    psadbw m2, [pix1q+strideq]
    paddw m0, m1
    paddw m0, m2
%if %1 != mmsize
    movu m1, [pix2q+8]
    movu m2, [pix2q+strideq+8]
    pavgb m1, [pix2q+9]
    pavgb m2, [pix2q+strideq+9]
    psadbw m1, [pix1q+8]
    psadbw m2, [pix1q+strideq+8]
    paddw m0, m1
    paddw m0, m2
%endif
    sub hd, 2
    jg .loop
%if mmsize == 16
    movhlps m1, m0
    paddw m0, m1
%endif
    movd eax, m0
    RET
%endmacro

INIT_MMX mmxext
SAD_X2 8
SAD_X2 16
INIT_XMM sse2
SAD_X2 16

;------------------------------------------------------------------------------------------
;int ff_sad_y2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h);
;------------------------------------------------------------------------------------------
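; Vertical counterpart of the sad*_x2 sketch above: the reference pixel is
; p = (pix2[x] + pix2[x + stride] + 1) >> 1 (one pavgb of two consecutive
; rows of pix2); everything else is identical.
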
;%1 = 8/16
%macro SAD_Y2 1
cglobal sad%1_y2, 5, 5, 4, v, pix1, pix2, stride, h
    movu m1, [pix2q]
    movu m0, [pix2q+strideq]
    movu m3, [pix2q+2*strideq]
    pavgb m1, m0
    pavgb m0, m3
    psadbw m1, [pix1q]
    psadbw m0, [pix1q+strideq]
    paddw m0, m1
    mova m1, m3
%if %1 != mmsize
    movu m4, [pix2q+8]
    movu m5, [pix2q+strideq+8]
    movu m6, [pix2q+2*strideq+8]
    pavgb m4, m5
    pavgb m5, m6
    psadbw m4, [pix1q+8]
    psadbw m5, [pix1q+strideq+8]
    paddw m0, m4
    paddw m0, m5
    mova m4, m6
%endif
    add pix2q, strideq
    sub hd, 2

align 16
.loop:
    lea pix1q, [pix1q+2*strideq]
    lea pix2q, [pix2q+2*strideq]
    movu m2, [pix2q]
    movu m3, [pix2q+strideq]
    pavgb m1, m2
    pavgb m2, m3
    psadbw m1, [pix1q]
    psadbw m2, [pix1q+strideq]
    paddw m0, m1
    paddw m0, m2
    mova m1, m3
%if %1 != mmsize
    movu m5, [pix2q+8]
    movu m6, [pix2q+strideq+8]
    pavgb m4, m5
    pavgb m5, m6
    psadbw m4, [pix1q+8]
    psadbw m5, [pix1q+strideq+8]
    paddw m0, m4
    paddw m0, m5
    mova m4, m6
%endif
    sub hd, 2
    jg .loop
%if mmsize == 16
    movhlps m1, m0
    paddw m0, m1
%endif
    movd eax, m0
    RET
%endmacro

INIT_MMX mmxext
SAD_Y2 8
SAD_Y2 16
INIT_XMM sse2
SAD_Y2 16

;-------------------------------------------------------------------------------------------
;int ff_sad_approx_xy2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h);
;-------------------------------------------------------------------------------------------
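; Approximate 2D (xy2) half-pel interpolation of pix2: each row is first
; averaged with its right neighbour by pavgb, then two such row averages are
; averaged again, with pb_1 subtracted (saturating) from one of them to
; compensate the upward rounding bias of cascaded pavgb.  Roughly, per pixel:
;
;   A = (pix2[x]          + pix2[x + 1]          + 1) >> 1;
;   B = (pix2[x + stride] + pix2[x + stride + 1] + 1) >> 1;
;   p = (A + (B ? B - 1 : 0) + 1) >> 1;    // one of A/B gets the -1
;
; which is close to, but not exactly, (a + b + c + d + 2) >> 2, hence
; "approx".  The SAD against pix1 is then accumulated as in the plain
; sad%1 kernels.
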
;%1 = 8/16
%macro SAD_APPROX_XY2 1
cglobal sad%1_approx_xy2, 5, 5, 7, v, pix1, pix2, stride, h
    mova m4, [pb_1]
    movu m1, [pix2q]
    movu m0, [pix2q+strideq]
    movu m3, [pix2q+2*strideq]
%if mmsize == 16
    movu m5, [pix2q+1]
    movu m6, [pix2q+strideq+1]
    movu m2, [pix2q+2*strideq+1]
    pavgb m1, m5
    pavgb m0, m6
    pavgb m3, m2
%else
    pavgb m1, [pix2q+1]
    pavgb m0, [pix2q+strideq+1]
    pavgb m3, [pix2q+2*strideq+1]
%endif
    psubusb m0, m4
    pavgb m1, m0
    pavgb m0, m3
    psadbw m1, [pix1q]
    psadbw m0, [pix1q+strideq]
    paddw m0, m1
    mova m1, m3
%if %1 != mmsize
    movu m5, [pix2q+8]
    movu m6, [pix2q+strideq+8]
    movu m7, [pix2q+2*strideq+8]
    pavgb m5, [pix2q+1+8]
    pavgb m6, [pix2q+strideq+1+8]
    pavgb m7, [pix2q+2*strideq+1+8]
    psubusb m6, m4
    pavgb m5, m6
    pavgb m6, m7
    psadbw m5, [pix1q+8]
    psadbw m6, [pix1q+strideq+8]
    paddw m0, m5
    paddw m0, m6
    mova m5, m7
%endif
    add pix2q, strideq
    sub hd, 2

align 16
.loop:
    lea pix1q, [pix1q+2*strideq]
    lea pix2q, [pix2q+2*strideq]
    movu m2, [pix2q]
    movu m3, [pix2q+strideq]
%if mmsize == 16
    movu m5, [pix2q+1]
    movu m6, [pix2q+strideq+1]
    pavgb m2, m5
    pavgb m3, m6
%else
    pavgb m2, [pix2q+1]
    pavgb m3, [pix2q+strideq+1]
%endif
    psubusb m2, m4
    pavgb m1, m2
    pavgb m2, m3
    psadbw m1, [pix1q]
    psadbw m2, [pix1q+strideq]
    paddw m0, m1
    paddw m0, m2
    mova m1, m3
%if %1 != mmsize
    movu m6, [pix2q+8]
    movu m7, [pix2q+strideq+8]
    pavgb m6, [pix2q+8+1]
    pavgb m7, [pix2q+strideq+8+1]
    psubusb m6, m4
    pavgb m5, m6
    pavgb m6, m7
    psadbw m5, [pix1q+8]
    psadbw m6, [pix1q+strideq+8]
    paddw m0, m5
    paddw m0, m6
    mova m5, m7
%endif
    sub hd, 2
    jg .loop
%if mmsize == 16
    movhlps m1, m0
    paddw m0, m1
%endif
    movd eax, m0
    RET
%endmacro

INIT_MMX mmxext
SAD_APPROX_XY2 8
SAD_APPROX_XY2 16
INIT_XMM sse2
SAD_APPROX_XY2 16

;--------------------------------------------------------------------
;int ff_vsad_intra(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
;                  ptrdiff_t line_size, int h);
;--------------------------------------------------------------------
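; Rough C sketch (illustration only; the _ref name is made up, 16-wide case):
; the SAD between each row of pix1 and the row below it; pix2 is unused:
;
;   int vsad_intra16_ref(void *v, const uint8_t *pix1, const uint8_t *pix2,
;                        ptrdiff_t line_size, int h)
;   {
;       int sum = 0;
;       for (int y = 0; y < h - 1; y++) {
;           for (int x = 0; x < 16; x++) {
;               int d = pix1[x] - pix1[x + line_size];
;               sum  += d < 0 ? -d : d;
;           }
;           pix1 += line_size;
;       }
;       return sum;
;   }
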
; %1 = 8/16
%macro VSAD_INTRA 1
cglobal vsad_intra%1, 5, 5, 3, v, pix1, pix2, lsize, h
    mova m0, [pix1q]
%if %1 == mmsize
    mova m2, [pix1q+lsizeq]
    psadbw m0, m2
%else
    mova m2, [pix1q+lsizeq]
    mova m3, [pix1q+8]
    mova m4, [pix1q+lsizeq+8]
    psadbw m0, m2
    psadbw m3, m4
    paddw m0, m3
%endif
    sub hd, 2

.loop:
    lea pix1q, [pix1q + 2*lsizeq]
%if %1 == mmsize
    mova m1, [pix1q]
    psadbw m2, m1
    paddw m0, m2
    mova m2, [pix1q+lsizeq]
    psadbw m1, m2
    paddw m0, m1
%else
    mova m1, [pix1q]
    mova m3, [pix1q+8]
    psadbw m2, m1
    psadbw m4, m3
    paddw m0, m2
    paddw m0, m4
    mova m2, [pix1q+lsizeq]
    mova m4, [pix1q+lsizeq+8]
    psadbw m1, m2
    psadbw m3, m4
    paddw m0, m1
    paddw m0, m3
%endif
    sub hd, 2
    jg .loop

%if mmsize == 16
    pshufd m1, m0, 0xe
    paddd m0, m1
%endif
    movd eax, m0
    RET
%endmacro

INIT_MMX mmxext
VSAD_INTRA 8
VSAD_INTRA 16
INIT_XMM sse2
VSAD_INTRA 16

;---------------------------------------------------------------------
;int ff_vsad_approx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
;                   ptrdiff_t line_size, int h);
;---------------------------------------------------------------------
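; Rough C sketch (illustration only; the _ref name is made up, 16-wide case).
; The per-pixel pix1-pix2 differences are taken with psubb, so they wrap to
; 8 bits before the row-to-row SAD, which is why the kernel is "approx":
;
;   int vsad16_approx_ref(void *v, const uint8_t *pix1, const uint8_t *pix2,
;                         ptrdiff_t line_size, int h)
;   {
;       int sum = 0;
;       for (int y = 0; y < h - 1; y++) {
;           for (int x = 0; x < 16; x++) {
;               int d0 = (signed char)(pix1[x] - pix2[x]);
;               int d1 = (signed char)(pix1[x + line_size] -
;                                      pix2[x + line_size]);
;               int d  = d0 - d1;
;               sum   += d < 0 ? -d : d;
;           }
;           pix1 += line_size;
;           pix2 += line_size;
;       }
;       return sum;
;   }
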
; %1 = 8/16
%macro VSAD_APPROX 1
cglobal vsad%1_approx, 5, 5, 5, v, pix1, pix2, lsize, h
    mova m1, [pb_80]
    mova m0, [pix1q]
%if %1 == mmsize ; vsad8_mmxext, vsad16_sse2
    mova m4, [pix1q+lsizeq]
%if mmsize == 16
    movu m3, [pix2q]
    movu m2, [pix2q+lsizeq]
    psubb m0, m3
    psubb m4, m2
%else
    psubb m0, [pix2q]
    psubb m4, [pix2q+lsizeq]
%endif
    pxor m0, m1
    pxor m4, m1
    psadbw m0, m4
%else ; vsad16_mmxext
    mova m3, [pix1q+8]
    psubb m0, [pix2q]
    psubb m3, [pix2q+8]
    pxor m0, m1
    pxor m3, m1
    mova m4, [pix1q+lsizeq]
    mova m5, [pix1q+lsizeq+8]
    psubb m4, [pix2q+lsizeq]
    psubb m5, [pix2q+lsizeq+8]
    pxor m4, m1
    pxor m5, m1
    psadbw m0, m4
    psadbw m3, m5
    paddw m0, m3
%endif
    sub hd, 2

.loop:
    lea pix1q, [pix1q + 2*lsizeq]
    lea pix2q, [pix2q + 2*lsizeq]
    mova m2, [pix1q]
%if %1 == mmsize ; vsad8_mmxext, vsad16_sse2
%if mmsize == 16
    movu m3, [pix2q]
    psubb m2, m3
%else
    psubb m2, [pix2q]
%endif
    pxor m2, m1
    psadbw m4, m2
    paddw m0, m4
    mova m4, [pix1q+lsizeq]
    movu m3, [pix2q+lsizeq]
    psubb m4, m3
    pxor m4, m1
    psadbw m2, m4
    paddw m0, m2
%else ; vsad16_mmxext
    mova m3, [pix1q+8]
    psubb m2, [pix2q]
    psubb m3, [pix2q+8]
    pxor m2, m1
    pxor m3, m1
    psadbw m4, m2
    psadbw m5, m3
    paddw m0, m4
    paddw m0, m5
    mova m4, [pix1q+lsizeq]
    mova m5, [pix1q+lsizeq+8]
    psubb m4, [pix2q+lsizeq]
    psubb m5, [pix2q+lsizeq+8]
    pxor m4, m1
    pxor m5, m1
    psadbw m2, m4
    psadbw m3, m5
    paddw m0, m2
    paddw m0, m3
%endif
    sub hd, 2
    jg .loop

%if mmsize == 16
    pshufd m1, m0, 0xe
    paddd m0, m1
%endif
    movd eax, m0
    RET
%endmacro

INIT_MMX mmxext
VSAD_APPROX 8
VSAD_APPROX 16
INIT_XMM sse2
VSAD_APPROX 16