;*****************************************************************************
;* SIMD-optimized motion estimation
;*****************************************************************************
;* Copyright (c) 2000, 2001 Fabrice Bellard
;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pb_1
cextern pb_80

SECTION .text

%macro DIFF_PIXELS_1 4
    movh            %1, %3
    movh            %2, %4
    punpcklbw       %2, %1
    punpcklbw       %1, %1
    psubw           %1, %2
%endmacro
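
; Roughly equivalent C for one DIFF_PIXELS_1 invocation (an illustrative
; sketch, not part of the build):
;     for (int i = 0; i < 8; i++)
;         dst[i] = src1[i] - src2[i];   // dst: int16_t[8], src1/2: uint8_t[8]
; The punpcklbw pair widens without needing a zero register: both %1 and %2
; end up with the same src1 byte in each word's high half, so psubw cancels
; it and leaves just the byte difference.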

; %1=uint8_t *pix1, %2=uint8_t *pix2, %3=static offset, %4=stride, %5=stride*3
; %6=temporary storage location
; this macro requires $mmsize stack space (aligned) on %6 (except on SSE+x86-64)
%macro DIFF_PIXELS_8 6
    DIFF_PIXELS_1   m0, m7, [%1     +%3], [%2     +%3]
    DIFF_PIXELS_1   m1, m7, [%1+%4  +%3], [%2+%4  +%3]
    DIFF_PIXELS_1   m2, m7, [%1+%4*2+%3], [%2+%4*2+%3]
    add             %1, %5
    add             %2, %5
    DIFF_PIXELS_1   m3, m7, [%1     +%3], [%2     +%3]
    DIFF_PIXELS_1   m4, m7, [%1+%4  +%3], [%2+%4  +%3]
    DIFF_PIXELS_1   m5, m7, [%1+%4*2+%3], [%2+%4*2+%3]
    DIFF_PIXELS_1   m6, m7, [%1+%5  +%3], [%2+%5  +%3]
%ifdef m8
    DIFF_PIXELS_1   m7, m8, [%1+%4*4+%3], [%2+%4*4+%3]
%else
    mova          [%6], m0
    DIFF_PIXELS_1   m7, m0, [%1+%4*4+%3], [%2+%4*4+%3]
    mova            m0, [%6]
%endif
    sub             %1, %5
    sub             %2, %5
%endmacro

%macro HADAMARD8 0
    SUMSUB_BADC w, 0, 1, 2, 3
    SUMSUB_BADC w, 4, 5, 6, 7
    SUMSUB_BADC w, 0, 2, 1, 3
    SUMSUB_BADC w, 4, 6, 5, 7
    SUMSUB_BADC w, 0, 4, 1, 5
    SUMSUB_BADC w, 2, 6, 3, 7
%endmacro
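
; HADAMARD8 is a fast 8-point Walsh-Hadamard transform across m0..m7, one
; butterfly stage per SUMSUB_BADC round (pair strides 1, 2, 4). Up to lane
; ordering it matches this C sketch (illustrative only):
;     void hadamard8(int16_t v[8]) {
;         for (int stride = 1; stride < 8; stride *= 2)
;             for (int i = 0; i < 8; i += 2 * stride)
;                 for (int j = i; j < i + stride; j++) {
;                     int16_t a = v[j], b = v[j + stride];
;                     v[j]          = a + b;
;                     v[j + stride] = a - b;
;                 }
;     }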

%macro ABS1_SUM 3
    ABS1            %1, %2
    paddusw         %3, %1
%endmacro

%macro ABS2_SUM 6
    ABS2            %1, %2, %3, %4
    paddusw         %5, %1
    paddusw         %6, %2
%endmacro

%macro ABS_SUM_8x8_64 1
    ABS2            m0, m1, m8, m9
    ABS2_SUM        m2, m3, m8, m9, m0, m1
    ABS2_SUM        m4, m5, m8, m9, m0, m1
    ABS2_SUM        m6, m7, m8, m9, m0, m1
    paddusw         m0, m1
%endmacro

%macro ABS_SUM_8x8_32 1
    mova          [%1], m7
    ABS1            m0, m7
    ABS1            m1, m7
    ABS1_SUM        m2, m7, m0
    ABS1_SUM        m3, m7, m1
    ABS1_SUM        m4, m7, m0
    ABS1_SUM        m5, m7, m1
    ABS1_SUM        m6, m7, m0
    mova            m2, [%1]
    ABS1_SUM        m2, m7, m1
    paddusw         m0, m1
%endmacro

; FIXME: HSUM saturates at 64k, while an 8x8 hadamard or dct block can get up to
; about 100k on extreme inputs. But that's very unlikely to occur in natural video,
; and it's even more unlikely to not have any alternative mvs/modes with lower cost.
%macro HSUM 3
%if cpuflag(sse2)
    movhlps         %2, %1
    paddusw         %1, %2
    pshuflw         %2, %1, 0xE
    paddusw         %1, %2
    pshuflw         %2, %1, 0x1
    paddusw         %1, %2
    movd            %3, %1
%elif cpuflag(mmxext)
    pshufw          %2, %1, 0xE
    paddusw         %1, %2
    pshufw          %2, %1, 0x1
    paddusw         %1, %2
    movd            %3, %1
%elif cpuflag(mmx)
    mova            %2, %1
    psrlq           %1, 32
    paddusw         %1, %2
    mova            %2, %1
    psrlq           %1, 16
    paddusw         %1, %2
    movd            %3, %1
%endif
%endmacro
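
; HSUM folds the 4 (mmx) or 8 (sse2) words of %1 into a single scalar in %3,
; clobbering %2. Roughly, in C (a sketch; paddusw clamps each partial sum):
;     unsigned sum = 0;
;     for (int i = 0; i < n; i++) {
;         sum += v[i];
;         if (sum > 0xFFFF) sum = 0xFFFF;   // unsigned saturation
;     }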

%macro STORE4 5
    mova [%1+mmsize*0], %2
    mova [%1+mmsize*1], %3
    mova [%1+mmsize*2], %4
    mova [%1+mmsize*3], %5
%endmacro

%macro LOAD4 5
    mova            %2, [%1+mmsize*0]
    mova            %3, [%1+mmsize*1]
    mova            %4, [%1+mmsize*2]
    mova            %5, [%1+mmsize*3]
%endmacro

%macro hadamard8_16_wrapper 2
cglobal hadamard8_diff, 4, 4, %1
%ifndef m8
    %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
    SUB             rsp, pad
%endif
    call hadamard8x8_diff %+ SUFFIX
%ifndef m8
    ADD             rsp, pad
%endif
    RET

cglobal hadamard8_diff16, 5, 6, %1
%ifndef m8
    %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
    SUB             rsp, pad
%endif

    call hadamard8x8_diff %+ SUFFIX
    mov             r5d, eax

    add             r1, 8
    add             r2, 8
    call hadamard8x8_diff %+ SUFFIX
    add             r5d, eax

    cmp             r4d, 16
    jne .done

    lea             r1, [r1+r3*8-8]
    lea             r2, [r2+r3*8-8]
    call hadamard8x8_diff %+ SUFFIX
    add             r5d, eax

    add             r1, 8
    add             r2, 8
    call hadamard8x8_diff %+ SUFFIX
    add             r5d, eax

.done:
    mov             eax, r5d
%ifndef m8
    ADD             rsp, pad
%endif
    RET
%endmacro
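
; hadamard8_diff16 tiles the 16x16 (or 16x8) comparison out of four (or two)
; 8x8 calls; in C terms (a sketch, reusing the 8x8 helper's semantics):
;     int score = hadamard8x8_diff(s, src1,     src2,     stride);
;     score    += hadamard8x8_diff(s, src1 + 8, src2 + 8, stride);
;     if (h == 16) {
;         src1 += 8 * stride; src2 += 8 * stride;
;         score += hadamard8x8_diff(s, src1,     src2,     stride);
;         score += hadamard8x8_diff(s, src1 + 8, src2 + 8, stride);
;     }
;     return score;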

%macro HADAMARD8_DIFF 0-1
%if cpuflag(sse2)
hadamard8x8_diff %+ SUFFIX:
    lea             r0, [r3*3]
    DIFF_PIXELS_8   r1, r2, 0, r3, r0, rsp+gprsize
    HADAMARD8
%if ARCH_X86_64
    TRANSPOSE8x8W    0,  1,  2,  3,  4,  5,  6,  7,  8
%else
    TRANSPOSE8x8W    0,  1,  2,  3,  4,  5,  6,  7, [rsp+gprsize], [rsp+mmsize+gprsize]
%endif
    HADAMARD8
    ABS_SUM_8x8     rsp+gprsize
    HSUM            m0, m1, eax
    and             eax, 0xFFFF
    ret

hadamard8_16_wrapper %1, 3
%elif cpuflag(mmx)
ALIGN 16
; int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,
;                               uint8_t *src2, ptrdiff_t stride, int h)
; r0 = void *s = unused, int h = unused (always 8)
; note how r1, r2 and r3 are not clobbered in this function, so 16x16
; can simply call this 2x2 times (and that's why we access rsp+gprsize
; everywhere, which is the rsp of the calling function)
hadamard8x8_diff %+ SUFFIX:
    lea             r0, [r3*3]

    ; first 4x8 pixels
    DIFF_PIXELS_8   r1, r2, 0, r3, r0, rsp+gprsize+0x60
    HADAMARD8
    mova [rsp+gprsize+0x60], m7
    TRANSPOSE4x4W    0,  1,  2,  3,  7
    STORE4          rsp+gprsize, m0, m1, m2, m3
    mova            m7, [rsp+gprsize+0x60]
    TRANSPOSE4x4W    4,  5,  6,  7,  0
    STORE4          rsp+gprsize+0x40, m4, m5, m6, m7

    ; second 4x8 pixels
    DIFF_PIXELS_8   r1, r2, 4, r3, r0, rsp+gprsize+0x60
    HADAMARD8
    mova [rsp+gprsize+0x60], m7
    TRANSPOSE4x4W    0,  1,  2,  3,  7
    STORE4          rsp+gprsize+0x20, m0, m1, m2, m3
    mova            m7, [rsp+gprsize+0x60]
    TRANSPOSE4x4W    4,  5,  6,  7,  0

    LOAD4           rsp+gprsize+0x40, m0, m1, m2, m3
    HADAMARD8
    ABS_SUM_8x8_32  rsp+gprsize+0x60
    mova [rsp+gprsize+0x60], m0

    LOAD4           rsp+gprsize     , m0, m1, m2, m3
    LOAD4           rsp+gprsize+0x20, m4, m5, m6, m7
    HADAMARD8
    ABS_SUM_8x8_32  rsp+gprsize
    paddusw         m0, [rsp+gprsize+0x60]

    HSUM            m0, m1, eax
    and             rax, 0xFFFF
    ret

hadamard8_16_wrapper 0, 14
%endif
%endmacro
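
; Taken together, hadamard8x8_diff computes a SATD-style score; roughly, in C
; (a sketch using the hadamard8/transpose helpers sketched above):
;     int16_t d[8][8];
;     for (int y = 0; y < 8; y++)
;         for (int x = 0; x < 8; x++)
;             d[y][x] = src1[y * stride + x] - src2[y * stride + x];
;     for (int y = 0; y < 8; y++) hadamard8(d[y]);   // transform one axis
;     transpose8x8(d);
;     for (int y = 0; y < 8; y++) hadamard8(d[y]);   // transform the other
;     unsigned sum = 0;
;     for (int y = 0; y < 8; y++)
;         for (int x = 0; x < 8; x++)
;             sum += abs(d[y][x]);
;     return sum & 0xFFFF;   // see the HSUM saturation FIXME above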

INIT_MMX mmx
HADAMARD8_DIFF

INIT_MMX mmxext
HADAMARD8_DIFF

INIT_XMM sse2
%if ARCH_X86_64
%define ABS_SUM_8x8 ABS_SUM_8x8_64
%else
%define ABS_SUM_8x8 ABS_SUM_8x8_32
%endif
HADAMARD8_DIFF 10

INIT_XMM ssse3
%define ABS_SUM_8x8 ABS_SUM_8x8_64
HADAMARD8_DIFF 9

; int ff_sse*_*(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
;               ptrdiff_t line_size, int h)

%macro SUM_SQUARED_ERRORS 1
cglobal sse%1, 5,5,8, v, pix1, pix2, lsize, h
%if %1 == mmsize
    shr       hd, 1
%endif
    pxor      m0, m0              ; mm0 = 0
    pxor      m7, m7              ; mm7 holds the sum

.next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned
    movu      m1, [pix1q]         ; m1 = pix1[0][0-15], [0-7] for mmx
    movu      m2, [pix2q]         ; m2 = pix2[0][0-15], [0-7] for mmx
%if %1 == mmsize
    movu      m3, [pix1q+lsizeq]  ; m3 = pix1[1][0-15], [0-7] for mmx
    movu      m4, [pix2q+lsizeq]  ; m4 = pix2[1][0-15], [0-7] for mmx
%else ; %1 / 2 == mmsize; mmx only
    mova      m3, [pix1q+8]       ; m3 = pix1[0][8-15]
    mova      m4, [pix2q+8]       ; m4 = pix2[0][8-15]
%endif

    ; todo: mm1-mm2, mm3-mm4
    ; algo: subtract mm1 from mm2 with saturation and vice versa
    ;       OR the result to get the absolute difference
    mova      m5, m1
    mova      m6, m3
    psubusb   m1, m2
    psubusb   m3, m4
    psubusb   m2, m5
    psubusb   m4, m6

    por       m2, m1
    por       m4, m3

    ; now convert to 16-bit vectors so we can square them
    mova      m1, m2
    mova      m3, m4

    punpckhbw m2, m0
    punpckhbw m4, m0
    punpcklbw m1, m0              ; mm1 now spread over (mm1, mm2)
    punpcklbw m3, m0              ; mm4 now spread over (mm3, mm4)

    pmaddwd   m2, m2
    pmaddwd   m4, m4
    pmaddwd   m1, m1
    pmaddwd   m3, m3

    paddd     m1, m2
    paddd     m3, m4
    paddd     m7, m1
    paddd     m7, m3

%if %1 == mmsize
    lea       pix1q, [pix1q + 2*lsizeq]
    lea       pix2q, [pix2q + 2*lsizeq]
%else
    add       pix1q, lsizeq
    add       pix2q, lsizeq
%endif
    dec       hd
    jnz .next2lines

    HADDD     m7, m1
    movd      eax, m7             ; return value
    RET
%endmacro
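
; In C, each call computes the sum of squared differences over a %1-wide
; block (an illustrative sketch):
;     int sse(const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t lsize,
;             int h, int w)
;     {
;         int score = 0;
;         for (int y = 0; y < h; y++) {
;             for (int x = 0; x < w; x++) {
;                 int d = pix1[x] - pix2[x];
;                 score += d * d;
;             }
;             pix1 += lsize;
;             pix2 += lsize;
;         }
;         return score;
;     }
; The asm takes |a - b| first (two psubusb + por) so the operands of pmaddwd
; are non-negative; squaring makes the lost sign irrelevant.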

INIT_MMX mmx
SUM_SQUARED_ERRORS 8

INIT_MMX mmx
SUM_SQUARED_ERRORS 16

INIT_XMM sse2
SUM_SQUARED_ERRORS 16

;-----------------------------------------------
;int ff_sum_abs_dctelem(int16_t *block)
;-----------------------------------------------
; %1 = number of xmm registers used
; %2 = number of inline loops

%macro SUM_ABS_DCTELEM 2
cglobal sum_abs_dctelem, 1, 1, %1, block
    pxor      m0, m0
    pxor      m1, m1
%assign %%i 0
%rep %2
    mova      m2, [blockq+mmsize*(0+%%i)]
    mova      m3, [blockq+mmsize*(1+%%i)]
    mova      m4, [blockq+mmsize*(2+%%i)]
    mova      m5, [blockq+mmsize*(3+%%i)]
    ABS1_SUM  m2, m6, m0
    ABS1_SUM  m3, m6, m1
    ABS1_SUM  m4, m6, m0
    ABS1_SUM  m5, m6, m1
%assign %%i %%i+4
%endrep
    paddusw   m0, m1
    HSUM      m0, m1, eax
    and       eax, 0xFFFF
    RET
%endmacro
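
; In C (a sketch; the asm accumulates with saturating adds, see HSUM):
;     int sum_abs_dctelem(const int16_t *block) {
;         unsigned sum = 0;
;         for (int i = 0; i < 64; i++)
;             sum += abs(block[i]);
;         return sum & 0xFFFF;
;     }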

INIT_MMX mmx
SUM_ABS_DCTELEM 0, 4
INIT_MMX mmxext
SUM_ABS_DCTELEM 0, 4
INIT_XMM sse2
SUM_ABS_DCTELEM 7, 2
INIT_XMM ssse3
SUM_ABS_DCTELEM 6, 2

;------------------------------------------------------------------------------
; int ff_hf_noise*_mmx(uint8_t *pix1, ptrdiff_t lsize, int h)
;------------------------------------------------------------------------------
; %1 = 8/16. %2-5=m#
%macro HF_NOISE_PART1 5
    mova      m%2, [pix1q]
%if %1 == 8
    mova      m%3, m%2
    psllq     m%2, 8
    psrlq     m%3, 8
    psrlq     m%2, 8
%else
    mova      m%3, [pix1q+1]
%endif
    mova      m%4, m%2
    mova      m%5, m%3
    punpcklbw m%2, m7
    punpcklbw m%3, m7
    punpckhbw m%4, m7
    punpckhbw m%5, m7
    psubw     m%2, m%3
    psubw     m%4, m%5
%endmacro

; %1-%4 = m#
%macro HF_NOISE_PART2 4
    psubw     m%1, m%3
    psubw     m%2, m%4
    pxor      m3, m3
    pxor      m1, m1
    pcmpgtw   m3, m%1
    pcmpgtw   m1, m%2
    pxor      m%1, m3
    pxor      m%2, m1
    psubw     m%1, m3
    psubw     m%2, m1
    paddw     m%2, m%1
    paddw     m6, m%2
%endmacro

; %1 = 8/16
%macro HF_NOISE 1
cglobal hf_noise%1, 3,3,0, pix1, lsize, h
    sub       hd, 2
    pxor      m7, m7
    pxor      m6, m6
    HF_NOISE_PART1 %1, 0, 1, 2, 3
    add       pix1q, lsizeq
    HF_NOISE_PART1 %1, 4, 1, 5, 3
    HF_NOISE_PART2     0, 2, 4, 5
    add       pix1q, lsizeq
.loop:
    HF_NOISE_PART1 %1, 0, 1, 2, 3
    HF_NOISE_PART2     4, 5, 0, 2
    add       pix1q, lsizeq
    HF_NOISE_PART1 %1, 4, 1, 5, 3
    HF_NOISE_PART2     0, 2, 4, 5
    add       pix1q, lsizeq
    sub       hd, 2
    jne .loop

    mova      m0, m6
    punpcklwd m0, m7
    punpckhwd m6, m7
    paddd     m6, m0
    mova      m0, m6
    psrlq     m6, 32
    paddd     m0, m6
    movd      eax, m0   ; eax = result of hf_noise8;
    REP_RET             ; return eax;
%endmacro
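
; In C terms the 8-wide case computes roughly (an illustrative sketch):
;     int hf_noise8(const uint8_t *pix1, ptrdiff_t lsize, int h) {
;         int score = 0;
;         for (int y = 0; y < h - 1; y++) {
;             for (int x = 0; x < 7; x++)
;                 score += abs((pix1[x] - pix1[x + 1]) -
;                              (pix1[x + lsize] - pix1[x + lsize + 1]));
;             pix1 += lsize;
;         }
;         return score;
;     }
; i.e. the vertical variation of the horizontal first differences, a cheap
; estimate of high-frequency noise; the %1 == 16 path loads [pix1q+1]
; instead of building the shifted row with psllq/psrlq.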

INIT_MMX mmx
HF_NOISE 8
HF_NOISE 16

;---------------------------------------------------------------------------------------
;int ff_sad_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h);
;---------------------------------------------------------------------------------------
;%1 = 8/16
%macro SAD 1
cglobal sad%1, 5, 5, 3, v, pix1, pix2, stride, h
    movu      m2, [pix2q]
    movu      m1, [pix2q+strideq]
    psadbw    m2, [pix1q]
    psadbw    m1, [pix1q+strideq]
    paddw     m2, m1
%if %1 != mmsize
    movu      m0, [pix2q+8]
    movu      m1, [pix2q+strideq+8]
    psadbw    m0, [pix1q+8]
    psadbw    m1, [pix1q+strideq+8]
    paddw     m2, m0
    paddw     m2, m1
%endif
    sub       hd, 2

align 16
.loop:
    lea       pix1q, [pix1q+strideq*2]
    lea       pix2q, [pix2q+strideq*2]
    movu      m0, [pix2q]
    movu      m1, [pix2q+strideq]
    psadbw    m0, [pix1q]
    psadbw    m1, [pix1q+strideq]
    paddw     m2, m0
    paddw     m2, m1
%if %1 != mmsize
    movu      m0, [pix2q+8]
    movu      m1, [pix2q+strideq+8]
    psadbw    m0, [pix1q+8]
    psadbw    m1, [pix1q+strideq+8]
    paddw     m2, m0
    paddw     m2, m1
%endif
    sub       hd, 2
    jg .loop
%if mmsize == 16
    movhlps   m0, m2
    paddw     m2, m0
%endif
    movd      eax, m2
    RET
%endmacro
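
; In C (sketch): the plain sum of absolute differences that each psadbw
; accumulates 8 or 16 bytes at a time:
;     int sad(const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride,
;             int h, int w)
;     {
;         int score = 0;
;         for (int y = 0; y < h; y++) {
;             for (int x = 0; x < w; x++)
;                 score += abs(pix1[x] - pix2[x]);
;             pix1 += stride;
;             pix2 += stride;
;         }
;         return score;
;     }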

INIT_MMX mmxext
SAD 8
SAD 16
INIT_XMM sse2
SAD 16

;------------------------------------------------------------------------------------------
;int ff_sad_x2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h);
;------------------------------------------------------------------------------------------
;%1 = 8/16
%macro SAD_X2 1
cglobal sad%1_x2, 5, 5, 5, v, pix1, pix2, stride, h
    movu      m0, [pix2q]
    movu      m2, [pix2q+strideq]
%if mmsize == 16
    movu      m3, [pix2q+1]
    movu      m4, [pix2q+strideq+1]
    pavgb     m0, m3
    pavgb     m2, m4
%else
    pavgb     m0, [pix2q+1]
    pavgb     m2, [pix2q+strideq+1]
%endif
    psadbw    m0, [pix1q]
    psadbw    m2, [pix1q+strideq]
    paddw     m0, m2
%if %1 != mmsize
    movu      m1, [pix2q+8]
    movu      m2, [pix2q+strideq+8]
    pavgb     m1, [pix2q+9]
    pavgb     m2, [pix2q+strideq+9]
    psadbw    m1, [pix1q+8]
    psadbw    m2, [pix1q+strideq+8]
    paddw     m0, m1
    paddw     m0, m2
%endif
    sub       hd, 2

align 16
.loop:
    lea       pix1q, [pix1q+2*strideq]
    lea       pix2q, [pix2q+2*strideq]
    movu      m1, [pix2q]
    movu      m2, [pix2q+strideq]
%if mmsize == 16
    movu      m3, [pix2q+1]
    movu      m4, [pix2q+strideq+1]
    pavgb     m1, m3
    pavgb     m2, m4
%else
    pavgb     m1, [pix2q+1]
    pavgb     m2, [pix2q+strideq+1]
%endif
    psadbw    m1, [pix1q]
    psadbw    m2, [pix1q+strideq]
    paddw     m0, m1
    paddw     m0, m2
%if %1 != mmsize
    movu      m1, [pix2q+8]
    movu      m2, [pix2q+strideq+8]
    pavgb     m1, [pix2q+9]
    pavgb     m2, [pix2q+strideq+9]
    psadbw    m1, [pix1q+8]
    psadbw    m2, [pix1q+strideq+8]
    paddw     m0, m1
    paddw     m0, m2
%endif
    sub       hd, 2
    jg .loop
%if mmsize == 16
    movhlps   m1, m0
    paddw     m0, m1
%endif
    movd      eax, m0
    RET
%endmacro
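
; In C (sketch): SAD against the horizontal half-pel interpolation of pix2;
; pavgb is the rounded average (a + b + 1) >> 1:
;     for (int x = 0; x < w; x++)
;         score += abs(pix1[x] - ((pix2[x] + pix2[x + 1] + 1) >> 1));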

INIT_MMX mmxext
SAD_X2 8
SAD_X2 16
INIT_XMM sse2
SAD_X2 16

;------------------------------------------------------------------------------------------
;int ff_sad_y2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h);
;------------------------------------------------------------------------------------------
;%1 = 8/16
%macro SAD_Y2 1
cglobal sad%1_y2, 5, 5, 4, v, pix1, pix2, stride, h
    movu      m1, [pix2q]
    movu      m0, [pix2q+strideq]
    movu      m3, [pix2q+2*strideq]
    pavgb     m1, m0
    pavgb     m0, m3
    psadbw    m1, [pix1q]
    psadbw    m0, [pix1q+strideq]
    paddw     m0, m1
    mova      m1, m3
%if %1 != mmsize
    movu      m4, [pix2q+8]
    movu      m5, [pix2q+strideq+8]
    movu      m6, [pix2q+2*strideq+8]
    pavgb     m4, m5
    pavgb     m5, m6
    psadbw    m4, [pix1q+8]
    psadbw    m5, [pix1q+strideq+8]
    paddw     m0, m4
    paddw     m0, m5
    mova      m4, m6
%endif
    add       pix2q, strideq
    sub       hd, 2

align 16
.loop:
    lea       pix1q, [pix1q+2*strideq]
    lea       pix2q, [pix2q+2*strideq]
    movu      m2, [pix2q]
    movu      m3, [pix2q+strideq]
    pavgb     m1, m2
    pavgb     m2, m3
    psadbw    m1, [pix1q]
    psadbw    m2, [pix1q+strideq]
    paddw     m0, m1
    paddw     m0, m2
    mova      m1, m3
%if %1 != mmsize
    movu      m5, [pix2q+8]
    movu      m6, [pix2q+strideq+8]
    pavgb     m4, m5
    pavgb     m5, m6
    psadbw    m4, [pix1q+8]
    psadbw    m5, [pix1q+strideq+8]
    paddw     m0, m4
    paddw     m0, m5
    mova      m4, m6
%endif
    sub       hd, 2
    jg .loop
%if mmsize == 16
    movhlps   m1, m0
    paddw     m0, m1
%endif
    movd      eax, m0
    RET
%endmacro
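
; In C (sketch): same idea with vertical half-pel interpolation; each row's
; data is reused for the next pair via the mova m1, m3 rotation:
;     for (int x = 0; x < w; x++)
;         score += abs(pix1[x] - ((pix2[x] + pix2[x + stride] + 1) >> 1));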

INIT_MMX mmxext
SAD_Y2 8
SAD_Y2 16
INIT_XMM sse2
SAD_Y2 16

;-------------------------------------------------------------------------------------------
;int ff_sad_approx_xy2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h);
;-------------------------------------------------------------------------------------------
;%1 = 8/16
%macro SAD_APPROX_XY2 1
cglobal sad%1_approx_xy2, 5, 5, 7, v, pix1, pix2, stride, h
    mova      m4, [pb_1]
    movu      m1, [pix2q]
    movu      m0, [pix2q+strideq]
    movu      m3, [pix2q+2*strideq]
%if mmsize == 16
    movu      m5, [pix2q+1]
    movu      m6, [pix2q+strideq+1]
    movu      m2, [pix2q+2*strideq+1]
    pavgb     m1, m5
    pavgb     m0, m6
    pavgb     m3, m2
%else
    pavgb     m1, [pix2q+1]
    pavgb     m0, [pix2q+strideq+1]
    pavgb     m3, [pix2q+2*strideq+1]
%endif
    psubusb   m0, m4
    pavgb     m1, m0
    pavgb     m0, m3
    psadbw    m1, [pix1q]
    psadbw    m0, [pix1q+strideq]
    paddw     m0, m1
    mova      m1, m3
%if %1 != mmsize
    movu      m5, [pix2q+8]
    movu      m6, [pix2q+strideq+8]
    movu      m7, [pix2q+2*strideq+8]
    pavgb     m5, [pix2q+1+8]
    pavgb     m6, [pix2q+strideq+1+8]
    pavgb     m7, [pix2q+2*strideq+1+8]
    psubusb   m6, m4
    pavgb     m5, m6
    pavgb     m6, m7
    psadbw    m5, [pix1q+8]
    psadbw    m6, [pix1q+strideq+8]
    paddw     m0, m5
    paddw     m0, m6
    mova      m5, m7
%endif
    add       pix2q, strideq
    sub       hd, 2

align 16
.loop:
    lea       pix1q, [pix1q+2*strideq]
    lea       pix2q, [pix2q+2*strideq]
    movu      m2, [pix2q]
    movu      m3, [pix2q+strideq]
%if mmsize == 16
    movu      m5, [pix2q+1]
    movu      m6, [pix2q+strideq+1]
    pavgb     m2, m5
    pavgb     m3, m6
%else
    pavgb     m2, [pix2q+1]
    pavgb     m3, [pix2q+strideq+1]
%endif
    psubusb   m2, m4
    pavgb     m1, m2
    pavgb     m2, m3
    psadbw    m1, [pix1q]
    psadbw    m2, [pix1q+strideq]
    paddw     m0, m1
    paddw     m0, m2
    mova      m1, m3
%if %1 != mmsize
    movu      m6, [pix2q+8]
    movu      m7, [pix2q+strideq+8]
    pavgb     m6, [pix2q+8+1]
    pavgb     m7, [pix2q+strideq+8+1]
    psubusb   m6, m4
    pavgb     m5, m6
    pavgb     m6, m7
    psadbw    m5, [pix1q+8]
    psadbw    m6, [pix1q+strideq+8]
    paddw     m0, m5
    paddw     m0, m6
    mova      m5, m7
%endif
    sub       hd, 2
    jg .loop
%if mmsize == 16
    movhlps   m1, m0
    paddw     m0, m1
%endif
    movd      eax, m0
    RET
%endmacro
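
; In C (sketch): an approximation of the exact xy2 half-pel reference
; (a + b + c + d + 2) >> 2, built from nested rounded averages; subtracting
; pb_1 from one of the two horizontal averages trims the upward rounding
; bias of the nested pavgb:
;     int h0  = (pix2[x]          + pix2[x + 1]          + 1) >> 1;
;     int h1  = (pix2[x + stride] + pix2[x + stride + 1] + 1) >> 1;
;     int ref = (h0 + (h1 > 0 ? h1 - 1 : 0) + 1) >> 1;  // psubusb + pavgb
;     score  += abs(pix1[x] - ref);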

INIT_MMX mmxext
SAD_APPROX_XY2 8
SAD_APPROX_XY2 16
INIT_XMM sse2
SAD_APPROX_XY2 16

;--------------------------------------------------------------------
;int ff_vsad_intra(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
;                  ptrdiff_t line_size, int h);
;--------------------------------------------------------------------
; %1 = 8/16
%macro VSAD_INTRA 1
cglobal vsad_intra%1, 5, 5, 3, v, pix1, pix2, lsize, h
    mova      m0, [pix1q]
%if %1 == mmsize
    mova      m2, [pix1q+lsizeq]
    psadbw    m0, m2
%else
    mova      m2, [pix1q+lsizeq]
    mova      m3, [pix1q+8]
    mova      m4, [pix1q+lsizeq+8]
    psadbw    m0, m2
    psadbw    m3, m4
    paddw     m0, m3
%endif
    sub       hd, 2

.loop:
    lea       pix1q, [pix1q + 2*lsizeq]
%if %1 == mmsize
    mova      m1, [pix1q]
    psadbw    m2, m1
    paddw     m0, m2
    mova      m2, [pix1q+lsizeq]
    psadbw    m1, m2
    paddw     m0, m1
%else
    mova      m1, [pix1q]
    mova      m3, [pix1q+8]
    psadbw    m2, m1
    psadbw    m4, m3
    paddw     m0, m2
    paddw     m0, m4
    mova      m2, [pix1q+lsizeq]
    mova      m4, [pix1q+lsizeq+8]
    psadbw    m1, m2
    psadbw    m3, m4
    paddw     m0, m1
    paddw     m0, m3
%endif
    sub       hd, 2
    jg .loop

%if mmsize == 16
    pshufd    m1, m0, 0xe
    paddd     m0, m1
%endif
    movd      eax, m0
    RET
%endmacro
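
; In C (sketch): the vertical gradient sum of pix1 alone (pix2 is unused by
; the intra variant):
;     int vsad_intra(const uint8_t *pix, ptrdiff_t lsize, int h, int w) {
;         int score = 0;
;         for (int y = 0; y < h - 1; y++) {
;             for (int x = 0; x < w; x++)
;                 score += abs(pix[x] - pix[x + lsize]);
;             pix += lsize;
;         }
;         return score;
;     }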

INIT_MMX mmxext
VSAD_INTRA 8
VSAD_INTRA 16
INIT_XMM sse2
VSAD_INTRA 16

;---------------------------------------------------------------------
;int ff_vsad_approx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
;                   ptrdiff_t line_size, int h);
;---------------------------------------------------------------------
; %1 = 8/16
%macro VSAD_APPROX 1
cglobal vsad%1_approx, 5, 5, 5, v, pix1, pix2, lsize, h
    mova      m1, [pb_80]
    mova      m0, [pix1q]
%if %1 == mmsize ; vsad8_mmxext, vsad16_sse2
    mova      m4, [pix1q+lsizeq]
%if mmsize == 16
    movu      m3, [pix2q]
    movu      m2, [pix2q+lsizeq]
    psubb     m0, m3
    psubb     m4, m2
%else
    psubb     m0, [pix2q]
    psubb     m4, [pix2q+lsizeq]
%endif
    pxor      m0, m1
    pxor      m4, m1
    psadbw    m0, m4
%else ; vsad16_mmxext
    mova      m3, [pix1q+8]
    psubb     m0, [pix2q]
    psubb     m3, [pix2q+8]
    pxor      m0, m1
    pxor      m3, m1
    mova      m4, [pix1q+lsizeq]
    mova      m5, [pix1q+lsizeq+8]
    psubb     m4, [pix2q+lsizeq]
    psubb     m5, [pix2q+lsizeq+8]
    pxor      m4, m1
    pxor      m5, m1
    psadbw    m0, m4
    psadbw    m3, m5
    paddw     m0, m3
%endif
    sub       hd, 2

.loop:
    lea       pix1q, [pix1q + 2*lsizeq]
    lea       pix2q, [pix2q + 2*lsizeq]
    mova      m2, [pix1q]
%if %1 == mmsize ; vsad8_mmxext, vsad16_sse2
%if mmsize == 16
    movu      m3, [pix2q]
    psubb     m2, m3
%else
    psubb     m2, [pix2q]
%endif
    pxor      m2, m1
    psadbw    m4, m2
    paddw     m0, m4
    mova      m4, [pix1q+lsizeq]
    movu      m3, [pix2q+lsizeq]
    psubb     m4, m3
    pxor      m4, m1
    psadbw    m2, m4
    paddw     m0, m2
%else ; vsad16_mmxext
    mova      m3, [pix1q+8]
    psubb     m2, [pix2q]
    psubb     m3, [pix2q+8]
    pxor      m2, m1
    pxor      m3, m1
    psadbw    m4, m2
    psadbw    m5, m3
    paddw     m0, m4
    paddw     m0, m5
    mova      m4, [pix1q+lsizeq]
    mova      m5, [pix1q+lsizeq+8]
    psubb     m4, [pix2q+lsizeq]
    psubb     m5, [pix2q+lsizeq+8]
    pxor      m4, m1
    pxor      m5, m1
    psadbw    m2, m4
    psadbw    m3, m5
    paddw     m0, m2
    paddw     m0, m3
%endif
    sub       hd, 2
    jg .loop

%if mmsize == 16
    pshufd    m1, m0, 0xe
    paddd     m0, m1
%endif
    movd      eax, m0
    RET
%endmacro
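
; In C (sketch): vertical SAD of the residual pix1 - pix2. psubb wraps at
; 8 bits (hence "approx"); xoring with 0x80 turns the signed difference into
; a biased unsigned value so psadbw of adjacent rows yields |d0 - d1|:
;     int vsad_approx(const uint8_t *pix1, const uint8_t *pix2,
;                     ptrdiff_t lsize, int h, int w)
;     {
;         int score = 0;
;         for (int y = 0; y < h - 1; y++) {
;             for (int x = 0; x < w; x++) {
;                 int d0 = (uint8_t)(pix1[x] - pix2[x]) ^ 0x80;
;                 int d1 = (uint8_t)(pix1[x + lsize] - pix2[x + lsize]) ^ 0x80;
;                 score += abs(d0 - d1);
;             }
;             pix1 += lsize;
;             pix2 += lsize;
;         }
;         return score;
;     }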

INIT_MMX mmxext
VSAD_APPROX 8
VSAD_APPROX 16
INIT_XMM sse2
VSAD_APPROX 16