; (extraction artifact removed: git-blame table header, blame commit 2ba45a60)
;*****************************************************************************
;* SIMD-optimized motion estimation
;*****************************************************************************
;* Copyright (c) 2000, 2001 Fabrice Bellard
;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

28 | %macro DIFF_PIXELS_1 4 | |
29 | movh %1, %3 | |
30 | movh %2, %4 | |
31 | punpcklbw %2, %1 | |
32 | punpcklbw %1, %1 | |
33 | psubw %1, %2 | |
34 | %endmacro | |
35 | ||
36 | ; %1=uint8_t *pix1, %2=uint8_t *pix2, %3=static offset, %4=stride, %5=stride*3 | |
37 | ; %6=temporary storage location | |
38 | ; this macro requires $mmsize stack space (aligned) on %6 (except on SSE+x86-64) | |
39 | %macro DIFF_PIXELS_8 6 | |
40 | DIFF_PIXELS_1 m0, m7, [%1 +%3], [%2 +%3] | |
41 | DIFF_PIXELS_1 m1, m7, [%1+%4 +%3], [%2+%4 +%3] | |
42 | DIFF_PIXELS_1 m2, m7, [%1+%4*2+%3], [%2+%4*2+%3] | |
43 | add %1, %5 | |
44 | add %2, %5 | |
45 | DIFF_PIXELS_1 m3, m7, [%1 +%3], [%2 +%3] | |
46 | DIFF_PIXELS_1 m4, m7, [%1+%4 +%3], [%2+%4 +%3] | |
47 | DIFF_PIXELS_1 m5, m7, [%1+%4*2+%3], [%2+%4*2+%3] | |
48 | DIFF_PIXELS_1 m6, m7, [%1+%5 +%3], [%2+%5 +%3] | |
49 | %ifdef m8 | |
50 | DIFF_PIXELS_1 m7, m8, [%1+%4*4+%3], [%2+%4*4+%3] | |
51 | %else | |
52 | mova [%6], m0 | |
53 | DIFF_PIXELS_1 m7, m0, [%1+%4*4+%3], [%2+%4*4+%3] | |
54 | mova m0, [%6] | |
55 | %endif | |
56 | sub %1, %5 | |
57 | sub %2, %5 | |
58 | %endmacro | |
59 | ||
60 | %macro HADAMARD8 0 | |
61 | SUMSUB_BADC w, 0, 1, 2, 3 | |
62 | SUMSUB_BADC w, 4, 5, 6, 7 | |
63 | SUMSUB_BADC w, 0, 2, 1, 3 | |
64 | SUMSUB_BADC w, 4, 6, 5, 7 | |
65 | SUMSUB_BADC w, 0, 4, 1, 5 | |
66 | SUMSUB_BADC w, 2, 6, 3, 7 | |
67 | %endmacro | |
68 | ||
69 | %macro ABS1_SUM 3 | |
70 | ABS1 %1, %2 | |
71 | paddusw %3, %1 | |
72 | %endmacro | |
73 | ||
74 | %macro ABS2_SUM 6 | |
75 | ABS2 %1, %2, %3, %4 | |
76 | paddusw %5, %1 | |
77 | paddusw %6, %2 | |
78 | %endmacro | |
79 | ||
80 | %macro ABS_SUM_8x8_64 1 | |
81 | ABS2 m0, m1, m8, m9 | |
82 | ABS2_SUM m2, m3, m8, m9, m0, m1 | |
83 | ABS2_SUM m4, m5, m8, m9, m0, m1 | |
84 | ABS2_SUM m6, m7, m8, m9, m0, m1 | |
85 | paddusw m0, m1 | |
86 | %endmacro | |
87 | ||
88 | %macro ABS_SUM_8x8_32 1 | |
89 | mova [%1], m7 | |
90 | ABS1 m0, m7 | |
91 | ABS1 m1, m7 | |
92 | ABS1_SUM m2, m7, m0 | |
93 | ABS1_SUM m3, m7, m1 | |
94 | ABS1_SUM m4, m7, m0 | |
95 | ABS1_SUM m5, m7, m1 | |
96 | ABS1_SUM m6, m7, m0 | |
97 | mova m2, [%1] | |
98 | ABS1_SUM m2, m7, m1 | |
99 | paddusw m0, m1 | |
100 | %endmacro | |
101 | ||
102 | ; FIXME: HSUM saturates at 64k, while an 8x8 hadamard or dct block can get up to | |
103 | ; about 100k on extreme inputs. But that's very unlikely to occur in natural video, | |
104 | ; and it's even more unlikely to not have any alternative mvs/modes with lower cost. | |
105 | %macro HSUM 3 | |
106 | %if cpuflag(sse2) | |
107 | movhlps %2, %1 | |
108 | paddusw %1, %2 | |
109 | pshuflw %2, %1, 0xE | |
110 | paddusw %1, %2 | |
111 | pshuflw %2, %1, 0x1 | |
112 | paddusw %1, %2 | |
113 | movd %3, %1 | |
114 | %elif cpuflag(mmxext) | |
115 | pshufw %2, %1, 0xE | |
116 | paddusw %1, %2 | |
117 | pshufw %2, %1, 0x1 | |
118 | paddusw %1, %2 | |
119 | movd %3, %1 | |
120 | %elif cpuflag(mmx) | |
121 | mova %2, %1 | |
122 | psrlq %1, 32 | |
123 | paddusw %1, %2 | |
124 | mova %2, %1 | |
125 | psrlq %1, 16 | |
126 | paddusw %1, %2 | |
127 | movd %3, %1 | |
128 | %endif | |
129 | %endmacro | |
130 | ||
131 | %macro STORE4 5 | |
132 | mova [%1+mmsize*0], %2 | |
133 | mova [%1+mmsize*1], %3 | |
134 | mova [%1+mmsize*2], %4 | |
135 | mova [%1+mmsize*3], %5 | |
136 | %endmacro | |
137 | ||
138 | %macro LOAD4 5 | |
139 | mova %2, [%1+mmsize*0] | |
140 | mova %3, [%1+mmsize*1] | |
141 | mova %4, [%1+mmsize*2] | |
142 | mova %5, [%1+mmsize*3] | |
143 | %endmacro | |
144 | ||
145 | %macro hadamard8_16_wrapper 2 | |
146 | cglobal hadamard8_diff, 4, 4, %1 | |
147 | %ifndef m8 | |
148 | %assign pad %2*mmsize-(4+stack_offset&(mmsize-1)) | |
149 | SUB rsp, pad | |
150 | %endif | |
151 | call hadamard8x8_diff %+ SUFFIX | |
152 | %ifndef m8 | |
153 | ADD rsp, pad | |
154 | %endif | |
155 | RET | |
156 | ||
157 | cglobal hadamard8_diff16, 5, 6, %1 | |
158 | %ifndef m8 | |
159 | %assign pad %2*mmsize-(4+stack_offset&(mmsize-1)) | |
160 | SUB rsp, pad | |
161 | %endif | |
162 | ||
163 | call hadamard8x8_diff %+ SUFFIX | |
164 | mov r5d, eax | |
165 | ||
166 | add r1, 8 | |
167 | add r2, 8 | |
168 | call hadamard8x8_diff %+ SUFFIX | |
169 | add r5d, eax | |
170 | ||
171 | cmp r4d, 16 | |
172 | jne .done | |
173 | ||
174 | lea r1, [r1+r3*8-8] | |
175 | lea r2, [r2+r3*8-8] | |
176 | call hadamard8x8_diff %+ SUFFIX | |
177 | add r5d, eax | |
178 | ||
179 | add r1, 8 | |
180 | add r2, 8 | |
181 | call hadamard8x8_diff %+ SUFFIX | |
182 | add r5d, eax | |
183 | ||
184 | .done: | |
185 | mov eax, r5d | |
186 | %ifndef m8 | |
187 | ADD rsp, pad | |
188 | %endif | |
189 | RET | |
190 | %endmacro | |
191 | ||
192 | %macro HADAMARD8_DIFF 0-1 | |
193 | %if cpuflag(sse2) | |
194 | hadamard8x8_diff %+ SUFFIX: | |
195 | lea r0, [r3*3] | |
196 | DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize | |
197 | HADAMARD8 | |
198 | %if ARCH_X86_64 | |
199 | TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 | |
200 | %else | |
201 | TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [rsp+gprsize], [rsp+mmsize+gprsize] | |
202 | %endif | |
203 | HADAMARD8 | |
204 | ABS_SUM_8x8 rsp+gprsize | |
205 | HSUM m0, m1, eax | |
206 | and eax, 0xFFFF | |
207 | ret | |
208 | ||
209 | hadamard8_16_wrapper %1, 3 | |
210 | %elif cpuflag(mmx) | |
211 | ALIGN 16 | |
212 | ; int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1, | |
213 | ; uint8_t *src2, int stride, int h) | |
214 | ; r0 = void *s = unused, int h = unused (always 8) | |
215 | ; note how r1, r2 and r3 are not clobbered in this function, so 16x16 | |
216 | ; can simply call this 2x2x (and that's why we access rsp+gprsize | |
217 | ; everywhere, which is rsp of calling func | |
218 | hadamard8x8_diff %+ SUFFIX: | |
219 | lea r0, [r3*3] | |
220 | ||
221 | ; first 4x8 pixels | |
222 | DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize+0x60 | |
223 | HADAMARD8 | |
224 | mova [rsp+gprsize+0x60], m7 | |
225 | TRANSPOSE4x4W 0, 1, 2, 3, 7 | |
226 | STORE4 rsp+gprsize, m0, m1, m2, m3 | |
227 | mova m7, [rsp+gprsize+0x60] | |
228 | TRANSPOSE4x4W 4, 5, 6, 7, 0 | |
229 | STORE4 rsp+gprsize+0x40, m4, m5, m6, m7 | |
230 | ||
231 | ; second 4x8 pixels | |
232 | DIFF_PIXELS_8 r1, r2, 4, r3, r0, rsp+gprsize+0x60 | |
233 | HADAMARD8 | |
234 | mova [rsp+gprsize+0x60], m7 | |
235 | TRANSPOSE4x4W 0, 1, 2, 3, 7 | |
236 | STORE4 rsp+gprsize+0x20, m0, m1, m2, m3 | |
237 | mova m7, [rsp+gprsize+0x60] | |
238 | TRANSPOSE4x4W 4, 5, 6, 7, 0 | |
239 | ||
240 | LOAD4 rsp+gprsize+0x40, m0, m1, m2, m3 | |
241 | HADAMARD8 | |
242 | ABS_SUM_8x8_32 rsp+gprsize+0x60 | |
243 | mova [rsp+gprsize+0x60], m0 | |
244 | ||
245 | LOAD4 rsp+gprsize , m0, m1, m2, m3 | |
246 | LOAD4 rsp+gprsize+0x20, m4, m5, m6, m7 | |
247 | HADAMARD8 | |
248 | ABS_SUM_8x8_32 rsp+gprsize | |
249 | paddusw m0, [rsp+gprsize+0x60] | |
250 | ||
251 | HSUM m0, m1, eax | |
252 | and rax, 0xFFFF | |
253 | ret | |
254 | ||
255 | hadamard8_16_wrapper 0, 14 | |
256 | %endif | |
257 | %endmacro | |
258 | ||
259 | INIT_MMX mmx | |
260 | HADAMARD8_DIFF | |
261 | ||
262 | INIT_MMX mmxext | |
263 | HADAMARD8_DIFF | |
264 | ||
265 | INIT_XMM sse2 | |
266 | %if ARCH_X86_64 | |
267 | %define ABS_SUM_8x8 ABS_SUM_8x8_64 | |
268 | %else | |
269 | %define ABS_SUM_8x8 ABS_SUM_8x8_32 | |
270 | %endif | |
271 | HADAMARD8_DIFF 10 | |
272 | ||
273 | INIT_XMM ssse3 | |
274 | %define ABS_SUM_8x8 ABS_SUM_8x8_64 | |
275 | HADAMARD8_DIFF 9 | |
276 | ||
277 | ; int ff_sse*_*(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, | |
278 | ; int line_size, int h) | |
279 | ||
280 | %macro SUM_SQUARED_ERRORS 1 | |
281 | cglobal sse%1, 5,5,8, v, pix1, pix2, lsize, h | |
282 | %if %1 == mmsize | |
283 | shr hd, 1 | |
284 | %endif | |
285 | pxor m0, m0 ; mm0 = 0 | |
286 | pxor m7, m7 ; mm7 holds the sum | |
287 | ||
288 | .next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned | |
289 | movu m1, [pix1q] ; m1 = pix1[0][0-15], [0-7] for mmx | |
290 | movu m2, [pix2q] ; m2 = pix2[0][0-15], [0-7] for mmx | |
291 | %if %1 == mmsize | |
292 | movu m3, [pix1q+lsizeq] ; m3 = pix1[1][0-15], [0-7] for mmx | |
293 | movu m4, [pix2q+lsizeq] ; m4 = pix2[1][0-15], [0-7] for mmx | |
294 | %else ; %1 / 2 == mmsize; mmx only | |
295 | mova m3, [pix1q+8] ; m3 = pix1[0][8-15] | |
296 | mova m4, [pix2q+8] ; m4 = pix2[0][8-15] | |
297 | %endif | |
298 | ||
299 | ; todo: mm1-mm2, mm3-mm4 | |
300 | ; algo: subtract mm1 from mm2 with saturation and vice versa | |
301 | ; OR the result to get the absolute difference | |
302 | mova m5, m1 | |
303 | mova m6, m3 | |
304 | psubusb m1, m2 | |
305 | psubusb m3, m4 | |
306 | psubusb m2, m5 | |
307 | psubusb m4, m6 | |
308 | ||
309 | por m2, m1 | |
310 | por m4, m3 | |
311 | ||
312 | ; now convert to 16-bit vectors so we can square them | |
313 | mova m1, m2 | |
314 | mova m3, m4 | |
315 | ||
316 | punpckhbw m2, m0 | |
317 | punpckhbw m4, m0 | |
318 | punpcklbw m1, m0 ; mm1 not spread over (mm1,mm2) | |
319 | punpcklbw m3, m0 ; mm4 not spread over (mm3,mm4) | |
320 | ||
321 | pmaddwd m2, m2 | |
322 | pmaddwd m4, m4 | |
323 | pmaddwd m1, m1 | |
324 | pmaddwd m3, m3 | |
325 | ||
326 | paddd m1, m2 | |
327 | paddd m3, m4 | |
328 | paddd m7, m1 | |
329 | paddd m7, m3 | |
330 | ||
331 | %if %1 == mmsize | |
332 | lea pix1q, [pix1q + 2*lsizeq] | |
333 | lea pix2q, [pix2q + 2*lsizeq] | |
334 | %else | |
335 | add pix1q, lsizeq | |
336 | add pix2q, lsizeq | |
337 | %endif | |
338 | dec hd | |
339 | jnz .next2lines | |
340 | ||
341 | HADDD m7, m1 | |
342 | movd eax, m7 ; return value | |
343 | RET | |
344 | %endmacro | |
345 | ||
346 | INIT_MMX mmx | |
347 | SUM_SQUARED_ERRORS 8 | |
348 | ||
349 | INIT_MMX mmx | |
350 | SUM_SQUARED_ERRORS 16 | |
351 | ||
352 | INIT_XMM sse2 | |
353 | SUM_SQUARED_ERRORS 16 | |
354 | ||
355 | ;----------------------------------------------- | |
356 | ;int ff_sum_abs_dctelem(int16_t *block) | |
357 | ;----------------------------------------------- | |
358 | ; %1 = number of xmm registers used | |
359 | ; %2 = number of inline loops | |
360 | ||
361 | %macro SUM_ABS_DCTELEM 2 | |
362 | cglobal sum_abs_dctelem, 1, 1, %1, block | |
363 | pxor m0, m0 | |
364 | pxor m1, m1 | |
365 | %assign %%i 0 | |
366 | %rep %2 | |
367 | mova m2, [blockq+mmsize*(0+%%i)] | |
368 | mova m3, [blockq+mmsize*(1+%%i)] | |
369 | mova m4, [blockq+mmsize*(2+%%i)] | |
370 | mova m5, [blockq+mmsize*(3+%%i)] | |
371 | ABS1_SUM m2, m6, m0 | |
372 | ABS1_SUM m3, m6, m1 | |
373 | ABS1_SUM m4, m6, m0 | |
374 | ABS1_SUM m5, m6, m1 | |
375 | %assign %%i %%i+4 | |
376 | %endrep | |
377 | paddusw m0, m1 | |
378 | HSUM m0, m1, eax | |
379 | and eax, 0xFFFF | |
380 | RET | |
381 | %endmacro | |
382 | ||
383 | INIT_MMX mmx | |
384 | SUM_ABS_DCTELEM 0, 4 | |
385 | INIT_MMX mmxext | |
386 | SUM_ABS_DCTELEM 0, 4 | |
387 | INIT_XMM sse2 | |
388 | SUM_ABS_DCTELEM 7, 2 | |
389 | INIT_XMM ssse3 | |
390 | SUM_ABS_DCTELEM 6, 2 | |
391 | ||
392 | ;------------------------------------------------------------------------------ | |
393 | ; int ff_hf_noise*_mmx(uint8_t *pix1, int lsize, int h) | |
394 | ;------------------------------------------------------------------------------ | |
395 | ; %1 = 8/16. %2-5=m# | |
396 | %macro HF_NOISE_PART1 5 | |
397 | mova m%2, [pix1q] | |
398 | %if %1 == 8 | |
399 | mova m%3, m%2 | |
400 | psllq m%2, 8 | |
401 | psrlq m%3, 8 | |
402 | psrlq m%2, 8 | |
403 | %else | |
404 | mova m%3, [pix1q+1] | |
405 | %endif | |
406 | mova m%4, m%2 | |
407 | mova m%5, m%3 | |
408 | punpcklbw m%2, m7 | |
409 | punpcklbw m%3, m7 | |
410 | punpckhbw m%4, m7 | |
411 | punpckhbw m%5, m7 | |
412 | psubw m%2, m%3 | |
413 | psubw m%4, m%5 | |
414 | %endmacro | |
415 | ||
416 | ; %1-2 = m# | |
417 | %macro HF_NOISE_PART2 4 | |
418 | psubw m%1, m%3 | |
419 | psubw m%2, m%4 | |
420 | pxor m3, m3 | |
421 | pxor m1, m1 | |
422 | pcmpgtw m3, m%1 | |
423 | pcmpgtw m1, m%2 | |
424 | pxor m%1, m3 | |
425 | pxor m%2, m1 | |
426 | psubw m%1, m3 | |
427 | psubw m%2, m1 | |
428 | paddw m%2, m%1 | |
429 | paddw m6, m%2 | |
430 | %endmacro | |
431 | ||
432 | ; %1 = 8/16 | |
433 | %macro HF_NOISE 1 | |
434 | cglobal hf_noise%1, 3,3,0, pix1, lsize, h | |
435 | movsxdifnidn lsizeq, lsized | |
436 | sub hd, 2 | |
437 | pxor m7, m7 | |
438 | pxor m6, m6 | |
439 | HF_NOISE_PART1 %1, 0, 1, 2, 3 | |
440 | add pix1q, lsizeq | |
441 | HF_NOISE_PART1 %1, 4, 1, 5, 3 | |
442 | HF_NOISE_PART2 0, 2, 4, 5 | |
443 | add pix1q, lsizeq | |
444 | .loop: | |
445 | HF_NOISE_PART1 %1, 0, 1, 2, 3 | |
446 | HF_NOISE_PART2 4, 5, 0, 2 | |
447 | add pix1q, lsizeq | |
448 | HF_NOISE_PART1 %1, 4, 1, 5, 3 | |
449 | HF_NOISE_PART2 0, 2, 4, 5 | |
450 | add pix1q, lsizeq | |
451 | sub hd, 2 | |
452 | jne .loop | |
453 | ||
454 | mova m0, m6 | |
455 | punpcklwd m0, m7 | |
456 | punpckhwd m6, m7 | |
457 | paddd m6, m0 | |
458 | mova m0, m6 | |
459 | psrlq m6, 32 | |
460 | paddd m0, m6 | |
461 | movd eax, m0 ; eax = result of hf_noise8; | |
462 | REP_RET ; return eax; | |
463 | %endmacro | |
464 | ||
465 | INIT_MMX mmx | |
466 | HF_NOISE 8 | |
467 | HF_NOISE 16 |