;******************************************************************************
;* FFT transform with SSE/3DNow optimizations
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2011 Vitor Sessak
;*
;* This algorithm (though not any of the implementation details) is
;* based on libdjbfft by D. J. Bernstein.
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; These functions are not individually interchangeable with the C versions.
; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
; in blocks as convenient to the vector size.
; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)

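; Editor's illustration of that layout (not part of the upstream comment):
; four consecutive FFTComplex values that C stores as
;     {r0,i0, r1,i1, r2,i2, r3,i3}
; are kept between passes as the two vectors
;     {r0,r1,r2,r3} and {i0,i1,i2,i3},
; so each addps/subps/mulps below works on four reals or four imaginaries at
; once (two of each with the 8-byte 3DNow registers).
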
%include "libavutil/x86/x86util.asm"

%if ARCH_X86_64
%define pointer resq
%else
%define pointer resd
%endif

SECTION_RODATA 32

struc FFTContext
    .nbits:     resd 1
    .reverse:   resd 1
    .revtab:    pointer 1
    .tmpbuf:    pointer 1
    .mdctsize:  resd 1
    .mdctbits:  resd 1
    .tcos:      pointer 1
    .tsin:      pointer 1
    .fftperm:   pointer 1
    .fftcalc:   pointer 1
    .imdctcalc: pointer 1
    .imdcthalf: pointer 1
endstruc

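; Editor's note: this struc only names the field offsets used by the code
; below; it has to stay in sync with the C FFTContext in libavcodec/fft.h,
; which is why `pointer` above switches between resq and resd per arch.
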
%define M_SQRT1_2 0.70710678118654752440
%define M_COS_PI_1_8 0.923879532511287
%define M_COS_PI_3_8 0.38268343236509

ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8
ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8

ps_root2: times 8 dd M_SQRT1_2
ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
ps_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0

perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
ps_m1p1: dd 1<<31, 0

cextern ps_neg

%assign i 16
%rep 13
cextern cos_ %+ i
%assign i i<<1
%endrep

%if ARCH_X86_64
    %define pointer dq
%else
    %define pointer dd
%endif

%macro IF0 1+
%endmacro
%macro IF1 1+
    %1
%endmacro

SECTION_TEXT

%macro T2_3DNOW 4 ; z0, z1, mem0, mem1
    mova %1, %3
    mova %2, %1
    pfadd %1, %4
    pfsub %2, %4
%endmacro

%macro T4_3DNOW 6 ; z0, z1, z2, z3, tmp0, tmp1
    mova %5, %3
    pfsub %3, %4
    pfadd %5, %4 ; {t6,t5}
    pxor %3, [ps_m1p1] ; {t8,t7}
    mova %6, %1
    movd [r0+12], %3
    punpckhdq %3, [r0+8]
    pfadd %1, %5 ; {r0,i0}
    pfsub %6, %5 ; {r2,i2}
    mova %4, %2
    pfadd %2, %3 ; {r1,i1}
    pfsub %4, %3 ; {r3,i3}
    SWAP %3, %6
%endmacro

; in:  %1 = {r0,i0,r2,i2,r4,i4,r6,i6}
;      %2 = {r1,i1,r3,i3,r5,i5,r7,i7}
;      %3, %4, %5 tmp
; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3}
;      %2 = {r4,r5,r6,r7,i4,i5,i6,i7}
%macro T8_AVX 5
    vsubps %5, %1, %2 ; v = %1 - %2
    vaddps %3, %1, %2 ; w = %1 + %2
    vmulps %2, %5, [ps_p1p1m1p1root2] ; v *= vals1
    vpermilps %2, %2, [perm1]
    vblendps %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6}
    vshufps %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5}
    vsubps %4, %5, %1 ; s = r - q
    vaddps %1, %5, %1 ; u = r + q
    vpermilps %1, %1, [perm2] ; k = {u1,u2,u3,u4,u6,u5,u7,u8}
    vshufps %5, %4, %1, 0xbb
    vshufps %3, %4, %1, 0xee
    vperm2f128 %3, %3, %5, 0x13
    vxorps %4, %4, [ps_m1m1p1m1p1m1m1m1] ; s *= {1,1,-1,-1,1,-1,-1,-1}
    vshufps %2, %1, %4, 0xdd
    vshufps %1, %1, %4, 0x88
    vperm2f128 %4, %2, %1, 0x02 ; v = {k1,k3,s1,s3,k2,k4,s2,s4}
    vperm2f128 %1, %1, %2, 0x13 ; w = {k6,k8,s6,s8,k5,k7,s5,s7}
    vsubps %5, %1, %3
    vblendps %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8}
    vsubps %2, %4, %1 ; %2 = v - w
    vaddps %1, %4, %1 ; %1 = v + w
%endmacro

; In SSE mode do one fft4 transform
; in:  %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
;
; In AVX mode do two fft4 transforms
; in:  %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7}
; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7}
%macro T4_SSE 3
    subps %3, %1, %2 ; {t3,t4,-t8,t7}
    addps %1, %1, %2 ; {t1,t2,t6,t5}
    xorps %3, %3, [ps_p1p1m1p1]
    shufps %2, %1, %3, 0xbe ; {t6,t5,t7,t8}
    shufps %1, %1, %3, 0x44 ; {t1,t2,t3,t4}
    subps %3, %1, %2 ; {r2,i2,r3,i3}
    addps %1, %1, %2 ; {r0,i0,r1,i1}
    shufps %2, %1, %3, 0xdd ; {i0,i1,i2,i3}
    shufps %1, %1, %3, 0x88 ; {r0,r1,r2,r3}
%endmacro
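; Editor's note: the constant ps_p1p1m1p1 only has the sign bit set in the
; third dword of each 128-bit lane, so the xorps above turns the difference
; vector {t3,t4,-t8,t7} into {t3,t4,t8,t7}; a single sign flip replaces what
; would otherwise need a separate negation before the second butterfly stage.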

; In SSE mode do one FFT8
; in:  %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,r5,r6,r7} %4={i4,i5,i6,i7}
;
; In AVX mode do two FFT8
; in:  %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11}
;      %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15}
; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11}
;      %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15}
%macro T8_SSE 6
    addps %6, %3, %4 ; {t1,t2,t3,t4}
    subps %3, %3, %4 ; {r5,i5,r7,i7}
    shufps %4, %3, %3, 0xb1 ; {i5,r5,i7,r7}
    mulps %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
    mulps %4, %4, [ps_root2]
    addps %3, %3, %4 ; {t8,t7,ta,t9}
    shufps %4, %6, %3, 0x9c ; {t1,t4,t7,ta}
    shufps %6, %6, %3, 0x36 ; {t3,t2,t9,t8}
    subps %3, %6, %4 ; {t6,t5,tc,tb}
    addps %6, %6, %4 ; {t1,t2,t9,ta}
    shufps %5, %6, %3, 0x8d ; {t2,ta,t6,tc}
    shufps %6, %6, %3, 0xd8 ; {t1,t9,t5,tb}
    subps %3, %1, %6 ; {r4,r5,r6,r7}
    addps %1, %1, %6 ; {r0,r1,r2,r3}
    subps %4, %2, %5 ; {i4,i5,i6,i7}
    addps %2, %2, %5 ; {i0,i1,i2,i3}
%endmacro

; scheduled for cpu-bound sizes
%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
IF%1 mova m4, Z(4)
IF%1 mova m5, Z(5)
    mova m0, %2 ; wre
    mova m1, %3 ; wim
    mulps m2, m4, m0 ; r2*wre
IF%1 mova m6, Z2(6)
    mulps m3, m5, m1 ; i2*wim
IF%1 mova m7, Z2(7)
    mulps m4, m4, m1 ; r2*wim
    mulps m5, m5, m0 ; i2*wre
    addps m2, m2, m3 ; r2*wre + i2*wim
    mulps m3, m1, m7 ; i3*wim
    subps m5, m5, m4 ; i2*wre - r2*wim
    mulps m1, m1, m6 ; r3*wim
    mulps m4, m0, m6 ; r3*wre
    mulps m0, m0, m7 ; i3*wre
    subps m4, m4, m3 ; r3*wre - i3*wim
    mova m3, Z(0)
    addps m0, m0, m1 ; i3*wre + r3*wim
    subps m1, m4, m2 ; t3
    addps m4, m4, m2 ; t5
    subps m3, m3, m4 ; r2
    addps m4, m4, Z(0) ; r0
    mova m6, Z(2)
    mova Z(4), m3
    mova Z(0), m4
    subps m3, m5, m0 ; t4
    subps m4, m6, m3 ; r3
    addps m3, m3, m6 ; r1
    mova Z2(6), m4
    mova Z(2), m3
    mova m2, Z(3)
    addps m3, m5, m0 ; t6
    subps m2, m2, m1 ; i3
    mova m7, Z(1)
    addps m1, m1, Z(3) ; i1
    mova Z2(7), m2
    mova Z(3), m1
    subps m4, m7, m3 ; i2
    addps m3, m3, m7 ; i0
    mova Z(5), m4
    mova Z(1), m3
%endmacro

; scheduled to avoid store->load aliasing
%macro PASS_BIG 1 ; (!interleave)
    mova m4, Z(4) ; r2
    mova m5, Z(5) ; i2
    mova m0, [wq] ; wre
    mova m1, [wq+o1q] ; wim
    mulps m2, m4, m0 ; r2*wre
    mova m6, Z2(6) ; r3
    mulps m3, m5, m1 ; i2*wim
    mova m7, Z2(7) ; i3
    mulps m4, m4, m1 ; r2*wim
    mulps m5, m5, m0 ; i2*wre
    addps m2, m2, m3 ; r2*wre + i2*wim
    mulps m3, m1, m7 ; i3*wim
    mulps m1, m1, m6 ; r3*wim
    subps m5, m5, m4 ; i2*wre - r2*wim
    mulps m4, m0, m6 ; r3*wre
    mulps m0, m0, m7 ; i3*wre
    subps m4, m4, m3 ; r3*wre - i3*wim
    mova m3, Z(0)
    addps m0, m0, m1 ; i3*wre + r3*wim
    subps m1, m4, m2 ; t3
    addps m4, m4, m2 ; t5
    subps m3, m3, m4 ; r2
    addps m4, m4, Z(0) ; r0
    mova m6, Z(2)
    mova Z(4), m3
    mova Z(0), m4
    subps m3, m5, m0 ; t4
    subps m4, m6, m3 ; r3
    addps m3, m3, m6 ; r1
IF%1 mova Z2(6), m4
IF%1 mova Z(2), m3
    mova m2, Z(3)
    addps m5, m5, m0 ; t6
    subps m2, m2, m1 ; i3
    mova m7, Z(1)
    addps m1, m1, Z(3) ; i1
IF%1 mova Z2(7), m2
IF%1 mova Z(3), m1
    subps m6, m7, m5 ; i2
    addps m5, m5, m7 ; i0
IF%1 mova Z(5), m6
IF%1 mova Z(1), m5
%if %1==0
    INTERL m1, m3, m7, Z, 2
    INTERL m2, m4, m0, Z2, 6

    mova m1, Z(0)
    mova m2, Z(4)

    INTERL m5, m1, m3, Z, 0
    INTERL m6, m2, m7, Z, 4
%endif
%endmacro

%macro PUNPCK 3
    mova %3, %1
    punpckldq %1, %2
    punpckhdq %3, %2
%endmacro

%define Z(x) [r0+mmsize*x]
%define Z2(x) [r0+mmsize*x]
%define ZH(x) [r0+mmsize*x+mmsize/2]

INIT_YMM avx

%if HAVE_AVX_EXTERNAL
align 16
fft8_avx:
    mova m0, Z(0)
    mova m1, Z(1)
    T8_AVX m0, m1, m2, m3, m4
    mova Z(0), m0
    mova Z(1), m1
    ret


align 16
fft16_avx:
    mova m2, Z(2)
    mova m3, Z(3)
    T4_SSE m2, m3, m7

    mova m0, Z(0)
    mova m1, Z(1)
    T8_AVX m0, m1, m4, m5, m7

    mova m4, [ps_cos16_1]
    mova m5, [ps_cos16_2]
    vmulps m6, m2, m4
    vmulps m7, m3, m5
    vaddps m7, m7, m6
    vmulps m2, m2, m5
    vmulps m3, m3, m4
    vsubps m3, m3, m2
    vblendps m2, m7, m3, 0xf0
    vperm2f128 m3, m7, m3, 0x21
    vaddps m4, m2, m3
    vsubps m2, m3, m2
    vperm2f128 m2, m2, m2, 0x01
    vsubps m3, m1, m2
    vaddps m1, m1, m2
    vsubps m5, m0, m4
    vaddps m0, m0, m4
    vextractf128 Z(0), m0, 0
    vextractf128 ZH(0), m1, 0
    vextractf128 Z(1), m0, 1
    vextractf128 ZH(1), m1, 1
    vextractf128 Z(2), m5, 0
    vextractf128 ZH(2), m3, 0
    vextractf128 Z(3), m5, 1
    vextractf128 ZH(3), m3, 1
    ret

align 16
fft32_avx:
    call fft16_avx

    mova m0, Z(4)
    mova m1, Z(5)

    T4_SSE m0, m1, m4

    mova m2, Z(6)
    mova m3, Z(7)

    T8_SSE m0, m1, m2, m3, m4, m6
    ; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11}
    ; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15}

    vperm2f128 m4, m0, m2, 0x20
    vperm2f128 m5, m1, m3, 0x20
    vperm2f128 m6, m0, m2, 0x31
    vperm2f128 m7, m1, m3, 0x31

    PASS_SMALL 0, [cos_32], [cos_32+32]

    ret

fft32_interleave_avx:
    call fft32_avx
    mov r2d, 32
.deint_loop:
    mova m2, Z(0)
    mova m3, Z(1)
    vunpcklps m0, m2, m3
    vunpckhps m1, m2, m3
    vextractf128 Z(0), m0, 0
    vextractf128 ZH(0), m1, 0
    vextractf128 Z(1), m0, 1
    vextractf128 ZH(1), m1, 1
    add r0, mmsize*2
    sub r2d, mmsize/4
    jg .deint_loop
    ret

%endif

INIT_XMM sse

align 16
fft4_avx:
fft4_sse:
    mova m0, Z(0)
    mova m1, Z(1)
    T4_SSE m0, m1, m2
    mova Z(0), m0
    mova Z(1), m1
    ret

align 16
fft8_sse:
    mova m0, Z(0)
    mova m1, Z(1)
    T4_SSE m0, m1, m2
    mova m2, Z(2)
    mova m3, Z(3)
    T8_SSE m0, m1, m2, m3, m4, m5
    mova Z(0), m0
    mova Z(1), m1
    mova Z(2), m2
    mova Z(3), m3
    ret

align 16
fft16_sse:
    mova m0, Z(0)
    mova m1, Z(1)
    T4_SSE m0, m1, m2
    mova m2, Z(2)
    mova m3, Z(3)
    T8_SSE m0, m1, m2, m3, m4, m5
    mova m4, Z(4)
    mova m5, Z(5)
    mova Z(0), m0
    mova Z(1), m1
    mova Z(2), m2
    mova Z(3), m3
    T4_SSE m4, m5, m6
    mova m6, Z2(6)
    mova m7, Z2(7)
    T4_SSE m6, m7, m0
    PASS_SMALL 0, [cos_16], [cos_16+16]
    ret


%macro FFT48_3DNOW 0
align 16
fft4 %+ SUFFIX:
    T2_3DNOW m0, m1, Z(0), Z(1)
    mova m2, Z(2)
    mova m3, Z(3)
    T4_3DNOW m0, m1, m2, m3, m4, m5
    PUNPCK m0, m1, m4
    PUNPCK m2, m3, m5
    mova Z(0), m0
    mova Z(1), m4
    mova Z(2), m2
    mova Z(3), m5
    ret

align 16
fft8 %+ SUFFIX:
    T2_3DNOW m0, m1, Z(0), Z(1)
    mova m2, Z(2)
    mova m3, Z(3)
    T4_3DNOW m0, m1, m2, m3, m4, m5
    mova Z(0), m0
    mova Z(2), m2
    T2_3DNOW m4, m5, Z(4), Z(5)
    T2_3DNOW m6, m7, Z2(6), Z2(7)
    PSWAPD m0, m5
    PSWAPD m2, m7
    pxor m0, [ps_m1p1]
    pxor m2, [ps_m1p1]
    pfsub m5, m0
    pfadd m7, m2
    pfmul m5, [ps_root2]
    pfmul m7, [ps_root2]
    T4_3DNOW m1, m3, m5, m7, m0, m2
    mova Z(5), m5
    mova Z2(7), m7
    mova m0, Z(0)
    mova m2, Z(2)
    T4_3DNOW m0, m2, m4, m6, m5, m7
    PUNPCK m0, m1, m5
    PUNPCK m2, m3, m7
    mova Z(0), m0
    mova Z(1), m5
    mova Z(2), m2
    mova Z(3), m7
    PUNPCK m4, Z(5), m5
    PUNPCK m6, Z2(7), m7
    mova Z(4), m4
    mova Z(5), m5
    mova Z2(6), m6
    mova Z2(7), m7
    ret
%endmacro

%if ARCH_X86_32
INIT_MMX 3dnowext
FFT48_3DNOW

INIT_MMX 3dnow
FFT48_3DNOW
%endif

%define Z(x) [zcq + o1q*(x&6) + mmsize*(x&1)]
%define Z2(x) [zcq + o3q + mmsize*(x&1)]
%define ZH(x) [zcq + o1q*(x&6) + mmsize*(x&1) + mmsize/2]
%define Z2H(x) [zcq + o3q + mmsize*(x&1) + mmsize/2]
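; Editor's note on the Z()/Z2() defines above (a sketch, inferred from
; DECL_PASS below): zcq points at the current butterfly group, and DECL_PASS
; sets o1q = n*8 and o3q = n*48 = 6*o1q, so Z(0..1), Z(2..3), Z(4..5) and
; Z2(6..7) pick the four quarters of the group at byte offsets 0, 2*o1q,
; 4*o1q and 6*o1q. Z2 exists because 6 is not a valid SIB scale factor, so
; the last offset has to come from the precomputed o3q rather than o1q*(x&6).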

%macro DECL_PASS 2+ ; name, payload
align 16
%1:
DEFINE_ARGS zc, w, n, o1, o3
    lea o3q, [nq*3]
    lea o1q, [nq*8]
    shl o3q, 4
.loop:
    %2
    add zcq, mmsize*2
    add wq, mmsize
    sub nd, mmsize/8
    jg .loop
    rep ret
%endmacro

%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
    lea r2, [dispatch_tab%1]
    mov r2, [r2 + (%2q-2)*gprsize]
%ifdef PIC
    lea r3, [$$]
    add r2, r3
%endif
    call r2
%endmacro ; FFT_DISPATCH
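; Editor's note: dispatch_tab%1 is the table of code pointers emitted by
; DECL_FFT below, indexed by nbits-2, so FFT_DISPATCH turns the transform
; size directly into a call to the matching fft4/fft8/.../fftN entry point.
; With PIC the table holds section-relative offsets (see SECTION_REL), hence
; the extra add of the section base $$.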

INIT_YMM avx

%if HAVE_AVX_EXTERNAL
%macro INTERL_AVX 5
    vunpckhps %3, %2, %1
    vunpcklps %2, %2, %1
    vextractf128 %4(%5), %2, 0
    vextractf128 %4 %+ H(%5), %3, 0
    vextractf128 %4(%5 + 1), %2, 1
    vextractf128 %4 %+ H(%5 + 1), %3, 1
%endmacro

%define INTERL INTERL_AVX

DECL_PASS pass_avx, PASS_BIG 1
DECL_PASS pass_interleave_avx, PASS_BIG 0

cglobal fft_calc, 2,5,8
    mov r3d, [r0 + FFTContext.nbits]
    mov r0, r1
    mov r1, r3
    FFT_DISPATCH _interleave %+ SUFFIX, r1
    REP_RET

%endif

INIT_XMM sse

%macro INTERL_SSE 5
    mova %3, %2
    unpcklps %2, %1
    unpckhps %3, %1
    mova %4(%5), %2
    mova %4(%5+1), %3
%endmacro

%define INTERL INTERL_SSE

DECL_PASS pass_sse, PASS_BIG 1
DECL_PASS pass_interleave_sse, PASS_BIG 0

%macro FFT_CALC_FUNC 0
cglobal fft_calc, 2,5,8
    mov r3d, [r0 + FFTContext.nbits]
    PUSH r1
    PUSH r3
    mov r0, r1
    mov r1, r3
    FFT_DISPATCH _interleave %+ SUFFIX, r1
    POP rcx
    POP r4
    cmp rcx, 3+(mmsize/16)
    jg .end
    mov r2, -1
    add rcx, 3
    shl r2, cl
    sub r4, r2
.loop:
%if mmsize == 8
    PSWAPD m0, [r4 + r2 + 4]
    mova [r4 + r2 + 4], m0
%else
    movaps xmm0, [r4 + r2]
    movaps xmm1, xmm0
    unpcklps xmm0, [r4 + r2 + 16]
    unpckhps xmm1, [r4 + r2 + 16]
    movaps [r4 + r2], xmm0
    movaps [r4 + r2 + 16], xmm1
%endif
    add r2, mmsize*2
    jl .loop
.end:
%if cpuflag(3dnow)
    femms
    RET
%else
    REP_RET
%endif
%endmacro
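; Editor's note: fft_calc dispatches to the _interleave variants, but the
; smallest transforms are handled entirely by the shared leaf routines, which
; leave their result in the blocked {reals...,imaginaries...} layout described
; at the top of the file; the unpcklps/unpckhps (or PSWAPD) loop above
; converts those cases back to interleaved FFTComplex order before returning.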

%if ARCH_X86_32
INIT_MMX 3dnow
FFT_CALC_FUNC
INIT_MMX 3dnowext
FFT_CALC_FUNC
%endif
INIT_XMM sse
FFT_CALC_FUNC

cglobal fft_permute, 2,7,1
    mov r4, [r0 + FFTContext.revtab]
    mov r5, [r0 + FFTContext.tmpbuf]
    mov ecx, [r0 + FFTContext.nbits]
    mov r2, 1
    shl r2, cl
    xor r0, r0
%if ARCH_X86_32
    mov r1, r1m
%endif
.loop:
    movaps xmm0, [r1 + 8*r0]
    movzx r6, word [r4 + 2*r0]
    movzx r3, word [r4 + 2*r0 + 2]
    movlps [r5 + 8*r6], xmm0
    movhps [r5 + 8*r3], xmm0
    add r0, 2
    cmp r0, r2
    jl .loop
    shl r2, 3
    add r1, r2
    add r5, r2
    neg r2
; nbits >= 2 (FFT4) and sizeof(FFTComplex)=8 => at least 32B
.loopcopy:
    movaps xmm0, [r5 + r2]
    movaps xmm1, [r5 + r2 + 16]
    movaps [r1 + r2], xmm0
    movaps [r1 + r2 + 16], xmm1
    add r2, 32
    jl .loopcopy
    REP_RET
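; Editor's sketch of the equivalent C (two complexes are moved per iteration
; above, but the effect is the same):
;     for (i = 0; i < 1 << nbits; i++)
;         tmpbuf[revtab[i]] = z[i];
;     memcpy(z, tmpbuf, (1 << nbits) * sizeof(FFTComplex));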

%macro IMDCT_CALC_FUNC 0
cglobal imdct_calc, 3,5,3
    mov r3d, [r0 + FFTContext.mdctsize]
    mov r4, [r0 + FFTContext.imdcthalf]
    add r1, r3
    PUSH r3
    PUSH r1
%if ARCH_X86_32
    push r2
    push r1
    push r0
%else
    sub rsp, 8+32*WIN64 ; allocate win64 shadow space
%endif
    call r4
%if ARCH_X86_32
    add esp, 12
%else
    add rsp, 8+32*WIN64
%endif
    POP r1
    POP r3
    lea r0, [r1 + 2*r3]
    mov r2, r3
    sub r3, mmsize
    neg r2
    mova m2, [ps_neg]
.loop:
%if mmsize == 8
    PSWAPD m0, [r1 + r3]
    PSWAPD m1, [r0 + r2]
    pxor m0, m2
%else
    mova m0, [r1 + r3]
    mova m1, [r0 + r2]
    shufps m0, m0, 0x1b
    shufps m1, m1, 0x1b
    xorps m0, m2
%endif
    mova [r0 + r3], m1
    mova [r1 + r2], m0
    sub r3, mmsize
    add r2, mmsize
    jl .loop
%if cpuflag(3dnow)
    femms
    RET
%else
    REP_RET
%endif
%endmacro
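; Editor's note: imdct_calc is built on top of imdct_half: it calls the
; context's imdcthalf pointer to produce the half-size output, then the
; vectorized loop above mirrors that half into the full window, reversing the
; sample order (shufps 0x1b / PSWAPD) and negating one side with ps_neg; see
; ff_imdct_calc_c in libavcodec/mdct_template.c for the scalar reference.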

%if ARCH_X86_32
INIT_MMX 3dnow
IMDCT_CALC_FUNC
INIT_MMX 3dnowext
IMDCT_CALC_FUNC
%endif

INIT_XMM sse
IMDCT_CALC_FUNC

%if ARCH_X86_32
INIT_MMX 3dnow
%define mulps pfmul
%define addps pfadd
%define subps pfsub
%define unpcklps punpckldq
%define unpckhps punpckhdq
DECL_PASS pass_3dnow, PASS_SMALL 1, [wq], [wq+o1q]
DECL_PASS pass_interleave_3dnow, PASS_BIG 0
%define pass_3dnowext pass_3dnow
%define pass_interleave_3dnowext pass_interleave_3dnow
%endif

%ifdef PIC
%define SECTION_REL - $$
%else
%define SECTION_REL
%endif

%macro DECL_FFT 1-2 ; nbits, suffix
%ifidn %0, 1
%xdefine fullsuffix SUFFIX
%else
%xdefine fullsuffix %2 %+ SUFFIX
%endif
%xdefine list_of_fft fft4 %+ SUFFIX SECTION_REL, fft8 %+ SUFFIX SECTION_REL
%if %1>=5
%xdefine list_of_fft list_of_fft, fft16 %+ SUFFIX SECTION_REL
%endif
%if %1>=6
%xdefine list_of_fft list_of_fft, fft32 %+ fullsuffix SECTION_REL
%endif

%assign n 1<<%1
%rep 17-%1
%assign n2 n/2
%assign n4 n/4
%xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL

align 16
fft %+ n %+ fullsuffix:
    call fft %+ n2 %+ SUFFIX
    add r0, n*4 - (n&(-2<<%1))
    call fft %+ n4 %+ SUFFIX
    add r0, n*2 - (n2&(-2<<%1))
    call fft %+ n4 %+ SUFFIX
    sub r0, n*6 + (n2&(-2<<%1))
    lea r1, [cos_ %+ n]
    mov r2d, n4/2
    jmp pass %+ fullsuffix

%assign n n*2
%endrep
%undef n

align 8
dispatch_tab %+ fullsuffix: pointer list_of_fft
%endmacro ; DECL_FFT
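; Editor's note: DECL_FFT emits one entry point per size following the
; split-radix recursion of the C code: each fftN runs fft(N/2) on the first
; half of the buffer and fft(N/4) on each of the two remaining quarters, then
; tail-jumps into the pass_* loop, which combines the three sub-transforms
; using the twiddle factors from the cos_N table.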

%if HAVE_AVX_EXTERNAL
INIT_YMM avx
DECL_FFT 6
DECL_FFT 6, _interleave
%endif
INIT_XMM sse
DECL_FFT 5
DECL_FFT 5, _interleave
%if ARCH_X86_32
INIT_MMX 3dnow
DECL_FFT 4
DECL_FFT 4, _interleave
INIT_MMX 3dnowext
DECL_FFT 4
DECL_FFT 4, _interleave
%endif

INIT_XMM sse
%undef mulps
%undef addps
%undef subps
%undef unpcklps
%undef unpckhps

%macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8
%if mmsize == 8 ; j*2+2-n4, n4-2-j*2, input+n4, tcos+n8, tsin+n8
    PSWAPD m0, [%3+%2*4]
    movq m2, [%3+%1*4-8]
    movq m3, m0
    punpckldq m0, m2
    punpckhdq m2, m3
    movd m1, [%4+%1*2-4] ; tcos[j]
    movd m3, [%4+%2*2]   ; tcos[n4-j-1]
    punpckldq m1, [%5+%1*2-4] ; tsin[j]
    punpckldq m3, [%5+%2*2]   ; tsin[n4-j-1]

    mova m4, m0
    PSWAPD m5, m1
    pfmul m0, m1
    pfmul m4, m5
    mova m6, m2
    PSWAPD m5, m3
    pfmul m2, m3
    pfmul m6, m5
%if cpuflag(3dnowext)
    pfpnacc m0, m4
    pfpnacc m2, m6
%else
    SBUTTERFLY dq, 0, 4, 1
    SBUTTERFLY dq, 2, 6, 3
    pxor m4, m7
    pxor m6, m7
    pfadd m0, m4
    pfadd m2, m6
%endif
%else
    movaps xmm0, [%3+%2*4]
    movaps xmm1, [%3+%1*4-0x10]
    movaps xmm2, xmm0
    shufps xmm0, xmm1, 0x88
    shufps xmm1, xmm2, 0x77
    movlps xmm4, [%4+%2*2]
    movlps xmm5, [%5+%2*2+0x0]
    movhps xmm4, [%4+%1*2-0x8]
    movhps xmm5, [%5+%1*2-0x8]
    movaps xmm2, xmm0
    movaps xmm3, xmm1
    mulps xmm0, xmm5
    mulps xmm1, xmm4
    mulps xmm2, xmm4
    mulps xmm3, xmm5
    subps xmm1, xmm0
    addps xmm2, xmm3
    movaps xmm0, xmm1
    unpcklps xmm1, xmm2
    unpckhps xmm0, xmm2
%endif
%endmacro

%macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
    mulps m6, %3, [%5+%1]
    mulps m7, %2, [%5+%1]
    mulps %2, %2, [%6+%1]
    mulps %3, %3, [%6+%1]
    subps %2, %2, m6
    addps %3, %3, m7
%endmacro
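; Editor's note on CMUL (operand roles inferred from the POSROTATESHUF call
; sites below): %5 is the tcos pointer and %6 the tsin pointer, so the macro
; computes the post-rotation
;     %2' = %2*tsin - %3*tcos
;     %3' = %3*tsin + %2*tcos
; on a whole vector of samples, using m6/m7 as scratch registers.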

%macro POSROTATESHUF_AVX 5 ;j, k, z+n8, tcos+n8, tsin+n8
.post:
    vmovaps ymm1, [%3+%1*2]
    vmovaps ymm0, [%3+%1*2+0x20]
    vmovaps ymm3, [%3+%2*2]
    vmovaps ymm2, [%3+%2*2+0x20]

    CMUL %1, ymm0, ymm1, %3, %4, %5
    CMUL %2, ymm2, ymm3, %3, %4, %5
    vshufps ymm1, ymm1, ymm1, 0x1b
    vshufps ymm3, ymm3, ymm3, 0x1b
    vperm2f128 ymm1, ymm1, ymm1, 0x01
    vperm2f128 ymm3, ymm3, ymm3, 0x01
    vunpcklps ymm6, ymm2, ymm1
    vunpckhps ymm4, ymm2, ymm1
    vunpcklps ymm7, ymm0, ymm3
    vunpckhps ymm5, ymm0, ymm3

    vextractf128 [%3+%1*2], ymm7, 0
    vextractf128 [%3+%1*2+0x10], ymm5, 0
    vextractf128 [%3+%1*2+0x20], ymm7, 1
    vextractf128 [%3+%1*2+0x30], ymm5, 1

    vextractf128 [%3+%2*2], ymm6, 0
    vextractf128 [%3+%2*2+0x10], ymm4, 0
    vextractf128 [%3+%2*2+0x20], ymm6, 1
    vextractf128 [%3+%2*2+0x30], ymm4, 1
    sub %2, 0x20
    add %1, 0x20
    jl .post
%endmacro

%macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
.post:
    movaps xmm1, [%3+%1*2]
    movaps xmm0, [%3+%1*2+0x10]
    CMUL %1, xmm0, xmm1, %3, %4, %5
    movaps xmm5, [%3+%2*2]
    movaps xmm4, [%3+%2*2+0x10]
    CMUL %2, xmm4, xmm5, %3, %4, %5
    shufps xmm1, xmm1, 0x1b
    shufps xmm5, xmm5, 0x1b
    movaps xmm6, xmm4
    unpckhps xmm4, xmm1
    unpcklps xmm6, xmm1
    movaps xmm2, xmm0
    unpcklps xmm0, xmm5
    unpckhps xmm2, xmm5
    movaps [%3+%2*2], xmm6
    movaps [%3+%2*2+0x10], xmm4
    movaps [%3+%1*2], xmm0
    movaps [%3+%1*2+0x10], xmm2
    sub %2, 0x10
    add %1, 0x10
    jl .post
%endmacro

%macro CMUL_3DNOW 6
    mova m6, [%1+%2*2]
    mova %3, [%1+%2*2+8]
    mova %4, m6
    mova m7, %3
    pfmul m6, [%5+%2]
    pfmul %3, [%6+%2]
    pfmul %4, [%6+%2]
    pfmul m7, [%5+%2]
    pfsub %3, m6
    pfadd %4, m7
%endmacro

%macro POSROTATESHUF_3DNOW 5 ;j, k, z+n8, tcos+n8, tsin+n8
.post:
    CMUL_3DNOW %3, %1, m0, m1, %4, %5
    CMUL_3DNOW %3, %2, m2, m3, %4, %5
    movd [%3+%1*2+ 0], m0
    movd [%3+%2*2+12], m1
    movd [%3+%2*2+ 0], m2
    movd [%3+%1*2+12], m3
    psrlq m0, 32
    psrlq m1, 32
    psrlq m2, 32
    psrlq m3, 32
    movd [%3+%1*2+ 8], m0
    movd [%3+%2*2+ 4], m1
    movd [%3+%2*2+ 8], m2
    movd [%3+%1*2+ 4], m3
    sub %2, 8
    add %1, 8
    jl .post
%endmacro

%macro DECL_IMDCT 1
cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input
%if ARCH_X86_64
%define rrevtab r7
%define rtcos   r8
%define rtsin   r9
%else
%define rrevtab r6
%define rtsin   r6
%define rtcos   r5
%endif
    mov r3d, [r0+FFTContext.mdctsize]
    add r2, r3
    shr r3, 1
    mov rtcos, [r0+FFTContext.tcos]
    mov rtsin, [r0+FFTContext.tsin]
    add rtcos, r3
    add rtsin, r3
%if ARCH_X86_64 == 0
    push rtcos
    push rtsin
%endif
    shr r3, 1
    mov rrevtab, [r0+FFTContext.revtab]
    add rrevtab, r3
%if ARCH_X86_64 == 0
    push rrevtab
%endif

%if mmsize == 8
    sub r3, 2
%else
    sub r3, 4
%endif
%if ARCH_X86_64 || mmsize == 8
    xor r4, r4
    sub r4, r3
%endif
%if notcpuflag(3dnowext) && mmsize == 8
    movd m7, [ps_neg]
%endif
.pre:
%if ARCH_X86_64 == 0
;unspill
%if mmsize != 8
    xor r4, r4
    sub r4, r3
%endif
    mov rtcos, [esp+8]
    mov rtsin, [esp+4]
%endif

    PREROTATER r4, r3, r2, rtcos, rtsin
%if mmsize == 8
    mov r6, [esp] ; rrevtab = ptr+n8
    movzx r5, word [rrevtab+r4-2] ; rrevtab[j]
    movzx r6, word [rrevtab+r3]   ; rrevtab[n4-j-1]
    mova [r1+r5*8], m0
    mova [r1+r6*8], m2
    add r4, 2
    sub r3, 2
%else
%if ARCH_X86_64
    movzx r5, word [rrevtab+r4-4]
    movzx r6, word [rrevtab+r4-2]
    movzx r10, word [rrevtab+r3]
    movzx r11, word [rrevtab+r3+2]
    movlps [r1+r5 *8], xmm0
    movhps [r1+r6 *8], xmm0
    movlps [r1+r10*8], xmm1
    movhps [r1+r11*8], xmm1
    add r4, 4
%else
    mov r6, [esp]
    movzx r5, word [r6+r4-4]
    movzx r4, word [r6+r4-2]
    movlps [r1+r5*8], xmm0
    movhps [r1+r4*8], xmm0
    movzx r5, word [r6+r3]
    movzx r4, word [r6+r3+2]
    movlps [r1+r5*8], xmm1
    movhps [r1+r4*8], xmm1
%endif
    sub r3, 4
%endif
    jns .pre

    mov r5, r0
    mov r6, r1
    mov r0, r1
    mov r1d, [r5+FFTContext.nbits]

    FFT_DISPATCH SUFFIX, r1

    mov r0d, [r5+FFTContext.mdctsize]
    add r6, r0
    shr r0, 1
%if ARCH_X86_64 == 0
%define rtcos r2
%define rtsin r3
    mov rtcos, [esp+8]
    mov rtsin, [esp+4]
%endif
    neg r0
    mov r1, -mmsize
    sub r1, r0
    %1 r0, r1, r6, rtcos, rtsin
%if ARCH_X86_64 == 0
    add esp, 12
%endif
%if mmsize == 8
    femms
%endif
    RET
%endmacro
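; Editor's summary of DECL_IMDCT/imdct_half (all three stages are visible in
; the macro above): the .pre loop applies the IMDCT pre-rotation with
; PREROTATER and scatters the rotated pairs through revtab into the output
; buffer, FFT_DISPATCH then runs the in-place FFT on that buffer, and finally
; the POSROTATESHUF* macro passed as %1 applies the post-rotation twiddles
; (tcos/tsin) and writes the samples back in the required order.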

DECL_IMDCT POSROTATESHUF

%if ARCH_X86_32
INIT_MMX 3dnow
DECL_IMDCT POSROTATESHUF_3DNOW

INIT_MMX 3dnowext
DECL_IMDCT POSROTATESHUF_3DNOW
%endif

INIT_YMM avx

%if HAVE_AVX_EXTERNAL
DECL_IMDCT POSROTATESHUF_AVX
%endif