;*****************************************************************************
;* SIMD-optimized motion compensation estimation
;*****************************************************************************
;* Copyright (c) 2000, 2001 Fabrice Bellard
;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

%macro DIFF_PIXELS_1 4
    movh %1, %3
    movh %2, %4
    punpcklbw %2, %1
    punpcklbw %1, %1
    psubw %1, %2
%endmacro

; %1=uint8_t *pix1, %2=uint8_t *pix2, %3=static offset, %4=stride, %5=stride*3
; %6=temporary storage location
; this macro requires mmsize bytes of (aligned) stack space at %6 (except on SSE+x86-64)
%macro DIFF_PIXELS_8 6
    DIFF_PIXELS_1 m0, m7, [%1 +%3], [%2 +%3]
    DIFF_PIXELS_1 m1, m7, [%1+%4 +%3], [%2+%4 +%3]
    DIFF_PIXELS_1 m2, m7, [%1+%4*2+%3], [%2+%4*2+%3]
    add %1, %5
    add %2, %5
    DIFF_PIXELS_1 m3, m7, [%1 +%3], [%2 +%3]
    DIFF_PIXELS_1 m4, m7, [%1+%4 +%3], [%2+%4 +%3]
    DIFF_PIXELS_1 m5, m7, [%1+%4*2+%3], [%2+%4*2+%3]
    DIFF_PIXELS_1 m6, m7, [%1+%5 +%3], [%2+%5 +%3]
%ifdef m8
    DIFF_PIXELS_1 m7, m8, [%1+%4*4+%3], [%2+%4*4+%3]
%else
    mova [%6], m0
    DIFF_PIXELS_1 m7, m0, [%1+%4*4+%3], [%2+%4*4+%3]
    mova m0, [%6]
%endif
    sub %1, %5
    sub %2, %5
%endmacro
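
; For reference, a minimal C sketch of what DIFF_PIXELS_8 computes (an
; illustration, not part of FFmpeg's API): m0..m7 end up holding rows 0..7 of
; the byte-wise difference pix1 - pix2 at column offset %3, widened to signed
; 16-bit words (4 columns per call with MMX, 8 with SSE2).
;
;   static void diff_pixels_8_c(int16_t diff[8][8], const uint8_t *pix1,
;                               const uint8_t *pix2, ptrdiff_t stride)
;   {
;       for (int r = 0; r < 8; r++)
;           for (int c = 0; c < 8; c++)
;               diff[r][c] = pix1[r * stride + c] - pix2[r * stride + c];
;   }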

%macro HADAMARD8 0
    SUMSUB_BADC w, 0, 1, 2, 3
    SUMSUB_BADC w, 4, 5, 6, 7
    SUMSUB_BADC w, 0, 2, 1, 3
    SUMSUB_BADC w, 4, 6, 5, 7
    SUMSUB_BADC w, 0, 4, 1, 5
    SUMSUB_BADC w, 2, 6, 3, 7
%endmacro
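
; HADAMARD8 applies a 1-D 8-point Walsh-Hadamard butterfly across the eight
; registers (i.e. along one dimension of the block, all lanes in parallel).
; Below is a minimal C sketch of the scalar equivalent, assuming the usual
; butterfly formulation; output sign/ordering may differ from the register
; version, which does not change the absolute sum computed later.
;
;   static void hadamard8_1d_c(int16_t x[8])
;   {
;       for (int stage = 1; stage < 8; stage <<= 1)
;           for (int i = 0; i < 8; i += stage << 1)
;               for (int j = i; j < i + stage; j++) {
;                   int a = x[j], b = x[j + stage];
;                   x[j]         = a + b;
;                   x[j + stage] = a - b;
;               }
;   }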

%macro ABS1_SUM 3
    ABS1 %1, %2
    paddusw %3, %1
%endmacro

%macro ABS2_SUM 6
    ABS2 %1, %2, %3, %4
    paddusw %5, %1
    paddusw %6, %2
%endmacro

%macro ABS_SUM_8x8_64 1
    ABS2 m0, m1, m8, m9
    ABS2_SUM m2, m3, m8, m9, m0, m1
    ABS2_SUM m4, m5, m8, m9, m0, m1
    ABS2_SUM m6, m7, m8, m9, m0, m1
    paddusw m0, m1
%endmacro

%macro ABS_SUM_8x8_32 1
    mova [%1], m7
    ABS1 m0, m7
    ABS1 m1, m7
    ABS1_SUM m2, m7, m0
    ABS1_SUM m3, m7, m1
    ABS1_SUM m4, m7, m0
    ABS1_SUM m5, m7, m1
    ABS1_SUM m6, m7, m0
    mova m2, [%1]
    ABS1_SUM m2, m7, m1
    paddusw m0, m1
%endmacro

; FIXME: HSUM saturates at 64k, while an 8x8 hadamard or dct block can get up to
; about 100k on extreme inputs. But that's very unlikely to occur in natural video,
; and it's even more unlikely to not have any alternative mvs/modes with lower cost.
%macro HSUM 3
%if cpuflag(sse2)
    movhlps %2, %1
    paddusw %1, %2
    pshuflw %2, %1, 0xE
    paddusw %1, %2
    pshuflw %2, %1, 0x1
    paddusw %1, %2
    movd %3, %1
%elif cpuflag(mmxext)
    pshufw %2, %1, 0xE
    paddusw %1, %2
    pshufw %2, %1, 0x1
    paddusw %1, %2
    movd %3, %1
%elif cpuflag(mmx)
    mova %2, %1
    psrlq %1, 32
    paddusw %1, %2
    mova %2, %1
    psrlq %1, 16
    paddusw %1, %2
    movd %3, %1
%endif
%endmacro
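
; A minimal C model of HSUM (illustrative only): it folds the unsigned 16-bit
; lanes of %1 into one value using saturating adds (paddusw), so the total
; clamps at 0xFFFF -- hence the FIXME above. The asm reduces as a tree rather
; than the linear loop shown here, which can only change where the clamping
; happens, not the unsaturated result.
;
;   static unsigned hsum_words_c(const uint16_t *lane, int nlanes)
;   {
;       unsigned sum = 0;
;       for (int i = 0; i < nlanes; i++) {
;           sum += lane[i];
;           if (sum > 0xFFFF)   // saturate like paddusw instead of wrapping
;               sum = 0xFFFF;
;       }
;       return sum;             // nlanes is 4 for MMX, 8 for SSE2
;   }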

%macro STORE4 5
    mova [%1+mmsize*0], %2
    mova [%1+mmsize*1], %3
    mova [%1+mmsize*2], %4
    mova [%1+mmsize*3], %5
%endmacro

%macro LOAD4 5
    mova %2, [%1+mmsize*0]
    mova %3, [%1+mmsize*1]
    mova %4, [%1+mmsize*2]
    mova %5, [%1+mmsize*3]
%endmacro

%macro hadamard8_16_wrapper 2
cglobal hadamard8_diff, 4, 4, %1
%ifndef m8
    %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
    SUB rsp, pad
%endif
    call hadamard8x8_diff %+ SUFFIX
%ifndef m8
    ADD rsp, pad
%endif
    RET

cglobal hadamard8_diff16, 5, 6, %1
%ifndef m8
    %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
    SUB rsp, pad
%endif

    call hadamard8x8_diff %+ SUFFIX
    mov r5d, eax

    add r1, 8
    add r2, 8
    call hadamard8x8_diff %+ SUFFIX
    add r5d, eax

    cmp r4d, 16
    jne .done

    lea r1, [r1+r3*8-8]
    lea r2, [r2+r3*8-8]
    call hadamard8x8_diff %+ SUFFIX
    add r5d, eax

    add r1, 8
    add r2, 8
    call hadamard8x8_diff %+ SUFFIX
    add r5d, eax

.done:
    mov eax, r5d
%ifndef m8
    ADD rsp, pad
%endif
    RET
%endmacro

%macro HADAMARD8_DIFF 0-1
%if cpuflag(sse2)
hadamard8x8_diff %+ SUFFIX:
    lea r0, [r3*3]
    DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize
    HADAMARD8
%if ARCH_X86_64
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [rsp+gprsize], [rsp+mmsize+gprsize]
%endif
    HADAMARD8
    ABS_SUM_8x8 rsp+gprsize
    HSUM m0, m1, eax
    and eax, 0xFFFF
    ret

hadamard8_16_wrapper %1, 3
%elif cpuflag(mmx)
ALIGN 16
; int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,
;                              uint8_t *src2, int stride, int h)
; r0 = void *s = unused, int h = unused (always 8)
; note that r1, r2 and r3 are not clobbered in this function, so 16x16
; can simply call it four times (2x2), and that is why we access rsp+gprsize
; everywhere, which is the rsp of the calling function
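
; For reference, a minimal C sketch of the whole hadamard8_diff computation
; (illustrative only; hadamard8_1d_c is the sketch given further up, and the
; 16-bit saturation noted in the HSUM FIXME is ignored here): difference the
; two 8x8 blocks, apply the Hadamard transform along rows and columns, then
; sum the absolute values of the coefficients.
;
;   static int hadamard8_diff_c(const uint8_t *src1, const uint8_t *src2,
;                               ptrdiff_t stride)
;   {
;       int16_t d[8][8], t[8];
;       int sum = 0;
;       for (int i = 0; i < 8; i++)
;           for (int j = 0; j < 8; j++)
;               d[i][j] = src1[i * stride + j] - src2[i * stride + j];
;       for (int i = 0; i < 8; i++)          // transform rows
;           hadamard8_1d_c(d[i]);
;       for (int j = 0; j < 8; j++) {        // transform columns, accumulate
;           for (int i = 0; i < 8; i++)
;               t[i] = d[i][j];
;           hadamard8_1d_c(t);
;           for (int i = 0; i < 8; i++)
;               sum += abs(t[i]);            // abs() from <stdlib.h>
;       }
;       return sum;
;   }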
hadamard8x8_diff %+ SUFFIX:
    lea r0, [r3*3]

    ; first 4x8 pixels
    DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize+0x60
    HADAMARD8
    mova [rsp+gprsize+0x60], m7
    TRANSPOSE4x4W 0, 1, 2, 3, 7
    STORE4 rsp+gprsize, m0, m1, m2, m3
    mova m7, [rsp+gprsize+0x60]
    TRANSPOSE4x4W 4, 5, 6, 7, 0
    STORE4 rsp+gprsize+0x40, m4, m5, m6, m7

    ; second 4x8 pixels
    DIFF_PIXELS_8 r1, r2, 4, r3, r0, rsp+gprsize+0x60
    HADAMARD8
    mova [rsp+gprsize+0x60], m7
    TRANSPOSE4x4W 0, 1, 2, 3, 7
    STORE4 rsp+gprsize+0x20, m0, m1, m2, m3
    mova m7, [rsp+gprsize+0x60]
    TRANSPOSE4x4W 4, 5, 6, 7, 0

    LOAD4 rsp+gprsize+0x40, m0, m1, m2, m3
    HADAMARD8
    ABS_SUM_8x8_32 rsp+gprsize+0x60
    mova [rsp+gprsize+0x60], m0

    LOAD4 rsp+gprsize, m0, m1, m2, m3
    LOAD4 rsp+gprsize+0x20, m4, m5, m6, m7
    HADAMARD8
    ABS_SUM_8x8_32 rsp+gprsize
    paddusw m0, [rsp+gprsize+0x60]

    HSUM m0, m1, eax
    and rax, 0xFFFF
    ret

hadamard8_16_wrapper 0, 14
%endif
%endmacro

INIT_MMX mmx
HADAMARD8_DIFF

INIT_MMX mmxext
HADAMARD8_DIFF

INIT_XMM sse2
%if ARCH_X86_64
%define ABS_SUM_8x8 ABS_SUM_8x8_64
%else
%define ABS_SUM_8x8 ABS_SUM_8x8_32
%endif
HADAMARD8_DIFF 10

INIT_XMM ssse3
%define ABS_SUM_8x8 ABS_SUM_8x8_64
HADAMARD8_DIFF 9

; int ff_sse*_*(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
;               int line_size, int h)
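;
; A minimal C sketch of what these sse functions return (illustrative only):
; the sum of squared byte differences over a block of width 8 or 16 (the %1
; argument below) and height h.
;
;   static int sse_block_c(const uint8_t *pix1, const uint8_t *pix2,
;                          ptrdiff_t line_size, int w, int h)
;   {
;       int sum = 0;
;       for (int y = 0; y < h; y++) {
;           for (int x = 0; x < w; x++) {
;               int d = pix1[x] - pix2[x];
;               sum += d * d;
;           }
;           pix1 += line_size;
;           pix2 += line_size;
;       }
;       return sum;
;   }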

%macro SUM_SQUARED_ERRORS 1
cglobal sse%1, 5,5,8, v, pix1, pix2, lsize, h
%if %1 == mmsize
    shr hd, 1
%endif
    pxor m0, m0 ; mm0 = 0
    pxor m7, m7 ; mm7 holds the sum

.next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned
    movu m1, [pix1q] ; m1 = pix1[0][0-15], [0-7] for mmx
    movu m2, [pix2q] ; m2 = pix2[0][0-15], [0-7] for mmx
%if %1 == mmsize
    movu m3, [pix1q+lsizeq] ; m3 = pix1[1][0-15], [0-7] for mmx
    movu m4, [pix2q+lsizeq] ; m4 = pix2[1][0-15], [0-7] for mmx
%else ; %1 / 2 == mmsize; mmx only
    mova m3, [pix1q+8] ; m3 = pix1[0][8-15]
    mova m4, [pix2q+8] ; m4 = pix2[0][8-15]
%endif

    ; todo: mm1-mm2, mm3-mm4
    ; algo: subtract mm1 from mm2 with saturation and vice versa
    ;       OR the result to get the absolute difference
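    ;
    ; a minimal C model of that trick for one byte pair (illustrative only):
    ; unsigned saturating subtraction in both directions, then OR -- one of
    ; the two results is zero, the other is |a - b|
    ;
    ;   static uint8_t absdiff_u8(uint8_t a, uint8_t b)
    ;   {
    ;       uint8_t d1 = a > b ? a - b : 0;   // psubusb m1, m2
    ;       uint8_t d2 = b > a ? b - a : 0;   // psubusb m2, m5 (old m1)
    ;       return d1 | d2;                   // por
    ;   }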
    mova m5, m1
    mova m6, m3
    psubusb m1, m2
    psubusb m3, m4
    psubusb m2, m5
    psubusb m4, m6

    por m2, m1
    por m4, m3

    ; now convert to 16-bit vectors so we can square them
    mova m1, m2
    mova m3, m4

    punpckhbw m2, m0
    punpckhbw m4, m0
    punpcklbw m1, m0 ; m2's bytes are now spread over (m1, m2) as words
    punpcklbw m3, m0 ; m4's bytes are now spread over (m3, m4) as words

    pmaddwd m2, m2
    pmaddwd m4, m4
    pmaddwd m1, m1
    pmaddwd m3, m3

    paddd m1, m2
    paddd m3, m4
    paddd m7, m1
    paddd m7, m3

%if %1 == mmsize
    lea pix1q, [pix1q + 2*lsizeq]
    lea pix2q, [pix2q + 2*lsizeq]
%else
    add pix1q, lsizeq
    add pix2q, lsizeq
%endif
    dec hd
    jnz .next2lines

    HADDD m7, m1
    movd eax, m7 ; return value
    RET
%endmacro

INIT_MMX mmx
SUM_SQUARED_ERRORS 8

INIT_MMX mmx
SUM_SQUARED_ERRORS 16

INIT_XMM sse2
SUM_SQUARED_ERRORS 16

;-----------------------------------------------
; int ff_sum_abs_dctelem(int16_t *block)
;-----------------------------------------------
; %1 = number of xmm registers used
; %2 = number of inline loops
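;
; A minimal C sketch of what sum_abs_dctelem returns (illustrative only): the
; sum of the absolute values of the 64 coefficients of one 8x8 block. The asm
; accumulates with saturating word adds and masks the result to 16 bits, so on
; extreme inputs it clamps where this plain C model would not.
;
;   static int sum_abs_dctelem_c(const int16_t *block)
;   {
;       int sum = 0;
;       for (int i = 0; i < 64; i++)
;           sum += abs(block[i]);    // abs() from <stdlib.h>
;       return sum;
;   }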

%macro SUM_ABS_DCTELEM 2
cglobal sum_abs_dctelem, 1, 1, %1, block
    pxor m0, m0
    pxor m1, m1
%assign %%i 0
%rep %2
    mova m2, [blockq+mmsize*(0+%%i)]
    mova m3, [blockq+mmsize*(1+%%i)]
    mova m4, [blockq+mmsize*(2+%%i)]
    mova m5, [blockq+mmsize*(3+%%i)]
    ABS1_SUM m2, m6, m0
    ABS1_SUM m3, m6, m1
    ABS1_SUM m4, m6, m0
    ABS1_SUM m5, m6, m1
%assign %%i %%i+4
%endrep
    paddusw m0, m1
    HSUM m0, m1, eax
    and eax, 0xFFFF
    RET
%endmacro

INIT_MMX mmx
SUM_ABS_DCTELEM 0, 4
INIT_MMX mmxext
SUM_ABS_DCTELEM 0, 4
INIT_XMM sse2
SUM_ABS_DCTELEM 7, 2
INIT_XMM ssse3
SUM_ABS_DCTELEM 6, 2

;------------------------------------------------------------------------------
; int ff_hf_noise*_mmx(uint8_t *pix1, int lsize, int h)
;------------------------------------------------------------------------------
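; A rough C model of the quantity hf_noise8/16 accumulates (illustrative only;
; the exact column/row coverage at the block edges in the asm differs a
; little): for each pair of adjacent rows it sums the absolute change of the
; horizontal gradient, i.e. a measure of high-frequency noise.
;
;   static int hf_noise_c(const uint8_t *pix, ptrdiff_t lsize, int w, int h)
;   {
;       int sum = 0;
;       for (int y = 0; y < h - 1; y++) {
;           for (int x = 0; x < w - 1; x++) {
;               int d0 = pix[x]         - pix[x + 1];          // gradient in row y
;               int d1 = pix[x + lsize] - pix[x + lsize + 1];  // gradient in row y+1
;               sum += abs(d0 - d1);
;           }
;           pix += lsize;
;       }
;       return sum;
;   }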
; %1 = 8/16. %2-5=m#
%macro HF_NOISE_PART1 5
    mova m%2, [pix1q]
%if %1 == 8
    mova m%3, m%2
    psllq m%2, 8
    psrlq m%3, 8
    psrlq m%2, 8
%else
    mova m%3, [pix1q+1]
%endif
    mova m%4, m%2
    mova m%5, m%3
    punpcklbw m%2, m7
    punpcklbw m%3, m7
    punpckhbw m%4, m7
    punpckhbw m%5, m7
    psubw m%2, m%3
    psubw m%4, m%5
%endmacro

; %1-4 = m#
%macro HF_NOISE_PART2 4
    psubw m%1, m%3
    psubw m%2, m%4
    pxor m3, m3
    pxor m1, m1
    pcmpgtw m3, m%1
    pcmpgtw m1, m%2
    pxor m%1, m3
    pxor m%2, m1
    psubw m%1, m3
    psubw m%2, m1
    paddw m%2, m%1
    paddw m6, m%2
%endmacro

; %1 = 8/16
%macro HF_NOISE 1
cglobal hf_noise%1, 3,3,0, pix1, lsize, h
    movsxdifnidn lsizeq, lsized
    sub hd, 2
    pxor m7, m7
    pxor m6, m6
    HF_NOISE_PART1 %1, 0, 1, 2, 3
    add pix1q, lsizeq
    HF_NOISE_PART1 %1, 4, 1, 5, 3
    HF_NOISE_PART2 0, 2, 4, 5
    add pix1q, lsizeq
.loop:
    HF_NOISE_PART1 %1, 0, 1, 2, 3
    HF_NOISE_PART2 4, 5, 0, 2
    add pix1q, lsizeq
    HF_NOISE_PART1 %1, 4, 1, 5, 3
    HF_NOISE_PART2 0, 2, 4, 5
    add pix1q, lsizeq
    sub hd, 2
    jne .loop

    mova m0, m6
    punpcklwd m0, m7
    punpckhwd m6, m7
    paddd m6, m0
    mova m0, m6
    psrlq m6, 32
    paddd m0, m6
    movd eax, m0 ; eax = result of hf_noise8;
    REP_RET ; return eax;
%endmacro

INIT_MMX mmx
HF_NOISE 8
HF_NOISE 16