;******************************************************************************
;* FFT transform with SSE/3DNow optimizations
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2011 Vitor Sessak
;*
;* This algorithm (though not any of the implementation details) is
;* based on libdjbfft by D. J. Bernstein.
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; These functions are not individually interchangeable with the C versions.
; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
; in blocks as convenient to the vector size.
; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)
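; For example, with SSE a block of four consecutive transform values a,b,c,d
; is stored as {a.re, b.re, c.re, d.re, a.im, b.im, c.im, d.im} instead of the
; C order {a.re, a.im, b.re, b.im, ...}.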

%include "libavutil/x86/x86util.asm"

%if ARCH_X86_64
%define pointer resq
%else
%define pointer resd
%endif

SECTION_RODATA 32

struc FFTContext
.nbits:    resd 1
.reverse:  resd 1
.revtab:   pointer 1
.tmpbuf:   pointer 1
.mdctsize: resd 1
.mdctbits: resd 1
.tcos:     pointer 1
.tsin:     pointer 1
.fftperm:  pointer 1
.fftcalc:  pointer 1
.imdctcalc:pointer 1
.imdcthalf:pointer 1
endstruc
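; (The field order and sizes above mirror the start of the C FFTContext
; struct; only the resulting byte offsets are used by the code below.)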

%define M_SQRT1_2 0.70710678118654752440
%define M_COS_PI_1_8 0.923879532511287
%define M_COS_PI_3_8 0.38268343236509

ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8
ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8

ps_root2: times 8 dd M_SQRT1_2
ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
ps_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0

perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
ps_m1p1: dd 1<<31, 0

cextern ps_neg

%assign i 16
%rep 13
cextern cos_ %+ i
%assign i i<<1
%endrep

%if ARCH_X86_64
%define pointer dq
%else
%define pointer dd
%endif

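; IF0 discards its argument, IF1 emits it verbatim; PASS_SMALL/PASS_BIG use
; IF%1 to conditionally assemble the loads and stores they are invoked with.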
%macro IF0 1+
%endmacro
%macro IF1 1+
    %1
%endmacro

SECTION_TEXT

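; T2_3DNOW: radix-2 butterfly on one packed complex value per register:
; %1 = %3 + %4, %2 = %3 - %4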
%macro T2_3DNOW 4 ; z0, z1, mem0, mem1
    mova  %1, %3
    mova  %2, %1
    pfadd %1, %4
    pfsub %2, %4
%endmacro

%macro T4_3DNOW 6 ; z0, z1, z2, z3, tmp0, tmp1
    mova  %5, %3
    pfsub %3, %4
    pfadd %5, %4 ; {t6,t5}
    pxor  %3, [ps_m1p1] ; {t8,t7}
    mova  %6, %1
    movd [r0+12], %3
    punpckhdq %3, [r0+8]
    pfadd %1, %5 ; {r0,i0}
    pfsub %6, %5 ; {r2,i2}
    mova  %4, %2
    pfadd %2, %3 ; {r1,i1}
    pfsub %4, %3 ; {r3,i3}
    SWAP  %3, %6
%endmacro

; in:  %1 = {r0,i0,r2,i2,r4,i4,r6,i6}
;      %2 = {r1,i1,r3,i3,r5,i5,r7,i7}
;      %3, %4, %5 tmp
; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3}
;      %2 = {r4,r5,r6,r7,i4,i5,i6,i7}
%macro T8_AVX 5
    vsubps     %5, %1, %2               ; v  = %1 - %2
    vaddps     %3, %1, %2               ; w  = %1 + %2
    vmulps     %2, %5, [ps_p1p1m1p1root2] ; v *= vals1
    vpermilps  %2, %2, [perm1]
    vblendps   %1, %2, %3, 0x33         ; q = {w1,w2,v4,v2,w5,w6,v7,v6}
    vshufps    %5, %3, %2, 0x4e         ; r = {w3,w4,v1,v3,w7,w8,v8,v5}
    vsubps     %4, %5, %1               ; s = r - q
    vaddps     %1, %5, %1               ; u = r + q
    vpermilps  %1, %1, [perm2]          ; k = {u1,u2,u3,u4,u6,u5,u7,u8}
    vshufps    %5, %4, %1, 0xbb
    vshufps    %3, %4, %1, 0xee
    vperm2f128 %3, %3, %5, 0x13
    vxorps     %4, %4, [ps_m1m1p1m1p1m1m1m1] ; s *= {1,1,-1,-1,1,-1,-1,-1}
    vshufps    %2, %1, %4, 0xdd
    vshufps    %1, %1, %4, 0x88
    vperm2f128 %4, %2, %1, 0x02         ; v  = {k1,k3,s1,s3,k2,k4,s2,s4}
    vperm2f128 %1, %1, %2, 0x13         ; w  = {k6,k8,s6,s8,k5,k7,s5,s7}
    vsubps     %5, %1, %3
    vblendps   %1, %5, %1, 0x55         ; w -= {0,s7,0,k7,0,s8,0,k8}
    vsubps     %2, %4, %1               ; %2 = v - w
    vaddps     %1, %4, %1               ; %1 = v + w
%endmacro

; In SSE mode do one fft4 transform
; in:  %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
;
; In AVX mode do two fft4 transforms
; in:  %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7}
; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7}
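; (For reference, each fft4 computes, up to the transform's sign convention:
;  X0 = x0+x1+x2+x3,         X2 = x0-x1+x2-x3,
;  X1 = (x0-x2) - i*(x1-x3), X3 = (x0-x2) + i*(x1-x3).)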
%macro T4_SSE 3
    subps  %3, %1, %2       ; {t3,t4,-t8,t7}
    addps  %1, %1, %2       ; {t1,t2,t6,t5}
    xorps  %3, %3, [ps_p1p1m1p1]
    shufps %2, %1, %3, 0xbe ; {t6,t5,t7,t8}
    shufps %1, %1, %3, 0x44 ; {t1,t2,t3,t4}
    subps  %3, %1, %2       ; {r2,i2,r3,i3}
    addps  %1, %1, %2       ; {r0,i0,r1,i1}
    shufps %2, %1, %3, 0xdd ; {i0,i1,i2,i3}
    shufps %1, %1, %3, 0x88 ; {r0,r1,r2,r3}
%endmacro

; In SSE mode do one FFT8
; in:  %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,r5,r6,r7} %4={i4,i5,i6,i7}
;
; In AVX mode do two FFT8
; in:  %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11}
;      %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15}
; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11}
;      %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15}
%macro T8_SSE 6
    addps  %6, %3, %4       ; {t1,t2,t3,t4}
    subps  %3, %3, %4       ; {r5,i5,r7,i7}
    shufps %4, %3, %3, 0xb1 ; {i5,r5,i7,r7}
    mulps  %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
    mulps  %4, %4, [ps_root2]
    addps  %3, %3, %4       ; {t8,t7,ta,t9}
    shufps %4, %6, %3, 0x9c ; {t1,t4,t7,ta}
    shufps %6, %6, %3, 0x36 ; {t3,t2,t9,t8}
    subps  %3, %6, %4       ; {t6,t5,tc,tb}
    addps  %6, %6, %4       ; {t1,t2,t9,ta}
    shufps %5, %6, %3, 0x8d ; {t2,ta,t6,tc}
    shufps %6, %6, %3, 0xd8 ; {t1,t9,t5,tb}
    subps  %3, %1, %6       ; {r4,r5,r6,r7}
    addps  %1, %1, %6       ; {r0,r1,r2,r3}
    subps  %4, %2, %5       ; {i4,i5,i6,i7}
    addps  %2, %2, %5       ; {i0,i1,i2,i3}
%endmacro

; scheduled for cpu-bound sizes
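; A pass combines the sub-transforms: with w = wre + i*wim it forms
;   a = z2*conj(w) = (r2*wre + i2*wim) + i*(i2*wre - r2*wim)
;   b = z3*w       = (r3*wre - i3*wim) + i*(i3*wre + r3*wim)
; and then butterflies the sums/differences of a and b against the z0/z1
; blocks (see the per-instruction t-variable comments).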
%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
IF%1 mova m4, Z(4)
IF%1 mova m5, Z(5)
    mova  m0, %2 ; wre
    mova  m1, %3 ; wim
    mulps m2, m4, m0 ; r2*wre
IF%1 mova m6, Z2(6)
    mulps m3, m5, m1 ; i2*wim
IF%1 mova m7, Z2(7)
    mulps m4, m4, m1 ; r2*wim
    mulps m5, m5, m0 ; i2*wre
    addps m2, m2, m3 ; r2*wre + i2*wim
    mulps m3, m1, m7 ; i3*wim
    subps m5, m5, m4 ; i2*wre - r2*wim
    mulps m1, m1, m6 ; r3*wim
    mulps m4, m0, m6 ; r3*wre
    mulps m0, m0, m7 ; i3*wre
    subps m4, m4, m3 ; r3*wre - i3*wim
    mova  m3, Z(0)
    addps m0, m0, m1 ; i3*wre + r3*wim
    subps m1, m4, m2 ; t3
    addps m4, m4, m2 ; t5
    subps m3, m3, m4 ; r2
    addps m4, m4, Z(0) ; r0
    mova  m6, Z(2)
    mova  Z(4), m3
    mova  Z(0), m4
    subps m3, m5, m0 ; t4
    subps m4, m6, m3 ; r3
    addps m3, m3, m6 ; r1
    mova  Z2(6), m4
    mova  Z(2), m3
    mova  m2, Z(3)
    addps m3, m5, m0 ; t6
    subps m2, m2, m1 ; i3
    mova  m7, Z(1)
    addps m1, m1, Z(3) ; i1
    mova  Z2(7), m2
    mova  Z(3), m1
    subps m4, m7, m3 ; i2
    addps m3, m3, m7 ; i0
    mova  Z(5), m4
    mova  Z(1), m3
%endmacro

; scheduled to avoid store->load aliasing
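; Same arithmetic as PASS_SMALL, but the twiddles come from [wq]/[wq+o1q] and,
; when %1 == 0, the final stores are replaced by the INTERL re-interleaving.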
%macro PASS_BIG 1 ; (!interleave)
    mova  m4, Z(4) ; r2
    mova  m5, Z(5) ; i2
    mova  m0, [wq] ; wre
    mova  m1, [wq+o1q] ; wim
    mulps m2, m4, m0 ; r2*wre
    mova  m6, Z2(6) ; r3
    mulps m3, m5, m1 ; i2*wim
    mova  m7, Z2(7) ; i3
    mulps m4, m4, m1 ; r2*wim
    mulps m5, m5, m0 ; i2*wre
    addps m2, m2, m3 ; r2*wre + i2*wim
    mulps m3, m1, m7 ; i3*wim
    mulps m1, m1, m6 ; r3*wim
    subps m5, m5, m4 ; i2*wre - r2*wim
    mulps m4, m0, m6 ; r3*wre
    mulps m0, m0, m7 ; i3*wre
    subps m4, m4, m3 ; r3*wre - i3*wim
    mova  m3, Z(0)
    addps m0, m0, m1 ; i3*wre + r3*wim
    subps m1, m4, m2 ; t3
    addps m4, m4, m2 ; t5
    subps m3, m3, m4 ; r2
    addps m4, m4, Z(0) ; r0
    mova  m6, Z(2)
    mova  Z(4), m3
    mova  Z(0), m4
    subps m3, m5, m0 ; t4
    subps m4, m6, m3 ; r3
    addps m3, m3, m6 ; r1
IF%1 mova Z2(6), m4
IF%1 mova Z(2), m3
    mova  m2, Z(3)
    addps m5, m5, m0 ; t6
    subps m2, m2, m1 ; i3
    mova  m7, Z(1)
    addps m1, m1, Z(3) ; i1
IF%1 mova Z2(7), m2
IF%1 mova Z(3), m1
    subps m6, m7, m5 ; i2
    addps m5, m5, m7 ; i0
IF%1 mova Z(5), m6
IF%1 mova Z(1), m5
%if %1==0
    INTERL m1, m3, m7, Z, 2
    INTERL m2, m4, m0, Z2, 6

    mova m1, Z(0)
    mova m2, Z(4)

    INTERL m5, m1, m3, Z, 0
    INTERL m6, m2, m7, Z, 4
%endif
%endmacro

%macro PUNPCK 3
    mova      %3, %1
    punpckldq %1, %2
    punpckhdq %3, %2
%endmacro

%define Z(x) [r0+mmsize*x]
%define Z2(x) [r0+mmsize*x]
%define ZH(x) [r0+mmsize*x+mmsize/2]

INIT_YMM avx

%if HAVE_AVX_EXTERNAL
align 16
fft8_avx:
    mova m0, Z(0)
    mova m1, Z(1)
    T8_AVX m0, m1, m2, m3, m4
    mova Z(0), m0
    mova Z(1), m1
    ret


align 16
fft16_avx:
    mova m2, Z(2)
    mova m3, Z(3)
    T4_SSE m2, m3, m7

    mova m0, Z(0)
    mova m1, Z(1)
    T8_AVX m0, m1, m4, m5, m7

    mova m4, [ps_cos16_1]
    mova m5, [ps_cos16_2]
    vmulps m6, m2, m4
    vmulps m7, m3, m5
    vaddps m7, m7, m6
    vmulps m2, m2, m5
    vmulps m3, m3, m4
    vsubps m3, m3, m2
    vblendps m2, m7, m3, 0xf0
    vperm2f128 m3, m7, m3, 0x21
    vaddps m4, m2, m3
    vsubps m2, m3, m2
    vperm2f128 m2, m2, m2, 0x01
    vsubps m3, m1, m2
    vaddps m1, m1, m2
    vsubps m5, m0, m4
    vaddps m0, m0, m4
    vextractf128 Z(0), m0, 0
    vextractf128 ZH(0), m1, 0
    vextractf128 Z(1), m0, 1
    vextractf128 ZH(1), m1, 1
    vextractf128 Z(2), m5, 0
    vextractf128 ZH(2), m3, 0
    vextractf128 Z(3), m5, 1
    vextractf128 ZH(3), m3, 1
    ret

align 16
fft32_avx:
    call fft16_avx

    mova m0, Z(4)
    mova m1, Z(5)

    T4_SSE m0, m1, m4

    mova m2, Z(6)
    mova m3, Z(7)

    T8_SSE m0, m1, m2, m3, m4, m6
    ; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11}
    ; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15}

    vperm2f128 m4, m0, m2, 0x20
    vperm2f128 m5, m1, m3, 0x20
    vperm2f128 m6, m0, m2, 0x31
    vperm2f128 m7, m1, m3, 0x31

    PASS_SMALL 0, [cos_32], [cos_32+32]

    ret

fft32_interleave_avx:
    call fft32_avx
    mov r2d, 32
.deint_loop:
    mova m2, Z(0)
    mova m3, Z(1)
    vunpcklps m0, m2, m3
    vunpckhps m1, m2, m3
    vextractf128 Z(0), m0, 0
    vextractf128 ZH(0), m1, 0
    vextractf128 Z(1), m0, 1
    vextractf128 ZH(1), m1, 1
    add r0, mmsize*2
    sub r2d, mmsize/4
    jg .deint_loop
    ret

%endif

INIT_XMM sse

align 16
fft4_avx:
fft4_sse:
    mova m0, Z(0)
    mova m1, Z(1)
    T4_SSE m0, m1, m2
    mova Z(0), m0
    mova Z(1), m1
    ret

align 16
fft8_sse:
    mova m0, Z(0)
    mova m1, Z(1)
    T4_SSE m0, m1, m2
    mova m2, Z(2)
    mova m3, Z(3)
    T8_SSE m0, m1, m2, m3, m4, m5
    mova Z(0), m0
    mova Z(1), m1
    mova Z(2), m2
    mova Z(3), m3
    ret

align 16
fft16_sse:
    mova m0, Z(0)
    mova m1, Z(1)
    T4_SSE m0, m1, m2
    mova m2, Z(2)
    mova m3, Z(3)
    T8_SSE m0, m1, m2, m3, m4, m5
    mova m4, Z(4)
    mova m5, Z(5)
    mova Z(0), m0
    mova Z(1), m1
    mova Z(2), m2
    mova Z(3), m3
    T4_SSE m4, m5, m6
    mova m6, Z2(6)
    mova m7, Z2(7)
    T4_SSE m6, m7, m0
    PASS_SMALL 0, [cos_16], [cos_16+16]
    ret


%macro FFT48_3DNOW 0
align 16
fft4 %+ SUFFIX:
    T2_3DNOW m0, m1, Z(0), Z(1)
    mova m2, Z(2)
    mova m3, Z(3)
    T4_3DNOW m0, m1, m2, m3, m4, m5
    PUNPCK m0, m1, m4
    PUNPCK m2, m3, m5
    mova Z(0), m0
    mova Z(1), m4
    mova Z(2), m2
    mova Z(3), m5
    ret

align 16
fft8 %+ SUFFIX:
    T2_3DNOW m0, m1, Z(0), Z(1)
    mova m2, Z(2)
    mova m3, Z(3)
    T4_3DNOW m0, m1, m2, m3, m4, m5
    mova Z(0), m0
    mova Z(2), m2
    T2_3DNOW m4, m5, Z(4), Z(5)
    T2_3DNOW m6, m7, Z2(6), Z2(7)
    PSWAPD m0, m5
    PSWAPD m2, m7
    pxor m0, [ps_m1p1]
    pxor m2, [ps_m1p1]
    pfsub m5, m0
    pfadd m7, m2
    pfmul m5, [ps_root2]
    pfmul m7, [ps_root2]
    T4_3DNOW m1, m3, m5, m7, m0, m2
    mova Z(5), m5
    mova Z2(7), m7
    mova m0, Z(0)
    mova m2, Z(2)
    T4_3DNOW m0, m2, m4, m6, m5, m7
    PUNPCK m0, m1, m5
    PUNPCK m2, m3, m7
    mova Z(0), m0
    mova Z(1), m5
    mova Z(2), m2
    mova Z(3), m7
    PUNPCK m4, Z(5), m5
    PUNPCK m6, Z2(7), m7
    mova Z(4), m4
    mova Z(5), m5
    mova Z2(6), m6
    mova Z2(7), m7
    ret
%endmacro

%if ARCH_X86_32
INIT_MMX 3dnowext
FFT48_3DNOW

INIT_MMX 3dnow
FFT48_3DNOW
%endif

%define Z(x) [zcq + o1q*(x&6) + mmsize*(x&1)]
%define Z2(x) [zcq + o3q + mmsize*(x&1)]
%define ZH(x) [zcq + o1q*(x&6) + mmsize*(x&1) + mmsize/2]
%define Z2H(x) [zcq + o3q + mmsize*(x&1) + mmsize/2]

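; DECL_PASS name, payload: emit one pass function. Arguments (via DEFINE_ARGS)
; are zc = data, w = twiddle table, n = loop count; o1 = n*8 and o3 = n*48 are
; the byte offsets used by the Z()/Z2()/ZH() addressing macros above.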
%macro DECL_PASS 2+ ; name, payload
align 16
%1:
DEFINE_ARGS zc, w, n, o1, o3
    lea o3q, [nq*3]
    lea o1q, [nq*8]
    shl o3q, 4
.loop:
    %2
    add zcq, mmsize*2
    add wq, mmsize
    sub nd, mmsize/8
    jg .loop
    rep ret
%endmacro

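; FFT_DISPATCH suffix, nbits: look up dispatch_tab<suffix>[nbits-2] and call it.
; With PIC the table holds section-relative entries (see SECTION_REL below), so
; the section base ($$) is added back before the call.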
%macro FFT_DISPATCH 2 ; clobbers 5 GPRs, 8 XMMs
    lea r2, [dispatch_tab%1]
    mov r2, [r2 + (%2q-2)*gprsize]
%ifdef PIC
    lea r3, [$$]
    add r2, r3
%endif
    call r2
%endmacro ; FFT_DISPATCH

INIT_YMM avx

%if HAVE_AVX_EXTERNAL
%macro INTERL_AVX 5
    vunpckhps    %3, %2, %1
    vunpcklps    %2, %2, %1
    vextractf128 %4(%5), %2, 0
    vextractf128 %4 %+ H(%5), %3, 0
    vextractf128 %4(%5 + 1), %2, 1
    vextractf128 %4 %+ H(%5 + 1), %3, 1
%endmacro

%define INTERL INTERL_AVX

DECL_PASS pass_avx, PASS_BIG 1
DECL_PASS pass_interleave_avx, PASS_BIG 0

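; fft_calc(FFTContext *s, FFTComplex *z): in-place FFT on z, dispatched on
; s->nbits to the interleaving variants so the result is returned in normal
; FFTComplex order.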
cglobal fft_calc, 2,5,8
    mov r3d, [r0 + FFTContext.nbits]
    mov r0, r1
    mov r1, r3
    FFT_DISPATCH _interleave %+ SUFFIX, r1
    REP_RET

%endif

INIT_XMM sse

%macro INTERL_SSE 5
    mova     %3, %2
    unpcklps %2, %1
    unpckhps %3, %1
    mova     %4(%5), %2
    mova     %4(%5+1), %3
%endmacro

%define INTERL INTERL_SSE

DECL_PASS pass_sse, PASS_BIG 1
DECL_PASS pass_interleave_sse, PASS_BIG 0

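; fft_calc(FFTContext *s, FFTComplex *z) for SSE/3DNow: dispatch to the
; in-place interleave FFT; for the smallest transforms, which do not go
; through an interleaving pass, the tail loop below converts the output
; back to interleaved FFTComplex order.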
%macro FFT_CALC_FUNC 0
cglobal fft_calc, 2,5,8
    mov r3d, [r0 + FFTContext.nbits]
    PUSH r1
    PUSH r3
    mov r0, r1
    mov r1, r3
    FFT_DISPATCH _interleave %+ SUFFIX, r1
    POP rcx
    POP r4
    cmp rcx, 3+(mmsize/16)
    jg .end
    mov r2, -1
    add rcx, 3
    shl r2, cl
    sub r4, r2
.loop:
%if mmsize == 8
    PSWAPD m0, [r4 + r2 + 4]
    mova [r4 + r2 + 4], m0
%else
    movaps   xmm0, [r4 + r2]
    movaps   xmm1, xmm0
    unpcklps xmm0, [r4 + r2 + 16]
    unpckhps xmm1, [r4 + r2 + 16]
    movaps   [r4 + r2], xmm0
    movaps   [r4 + r2 + 16], xmm1
%endif
    add r2, mmsize*2
    jl .loop
.end:
%if cpuflag(3dnow)
    femms
    RET
%else
    REP_RET
%endif
%endmacro

%if ARCH_X86_32
INIT_MMX 3dnow
FFT_CALC_FUNC
INIT_MMX 3dnowext
FFT_CALC_FUNC
%endif
INIT_XMM sse
FFT_CALC_FUNC

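; fft_permute(FFTContext *s, FFTComplex *z): scatter z into tmpbuf according to
; revtab, then copy the permuted buffer back into z. Two complex values are
; handled per iteration, so nbits >= 2 is assumed.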
cglobal fft_permute, 2,7,1
    mov r4, [r0 + FFTContext.revtab]
    mov r5, [r0 + FFTContext.tmpbuf]
    mov ecx, [r0 + FFTContext.nbits]
    mov r2, 1
    shl r2, cl
    xor r0, r0
%if ARCH_X86_32
    mov r1, r1m
%endif
.loop:
    movaps xmm0, [r1 + 8*r0]
    movzx  r6, word [r4 + 2*r0]
    movzx  r3, word [r4 + 2*r0 + 2]
    movlps [r5 + 8*r6], xmm0
    movhps [r5 + 8*r3], xmm0
    add r0, 2
    cmp r0, r2
    jl .loop
    shl r2, 3
    add r1, r2
    add r5, r2
    neg r2
; nbits >= 2 (FFT4) and sizeof(FFTComplex)=8 => at least 32B
.loopcopy:
    movaps xmm0, [r5 + r2]
    movaps xmm1, [r5 + r2 + 16]
    movaps [r1 + r2], xmm0
    movaps [r1 + r2 + 16], xmm1
    add r2, 32
    jl .loopcopy
    REP_RET

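; imdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input):
; run imdct_half into the middle half of the output buffer, then reconstruct
; the first and last quarters by mirroring that half with a sign flip (ps_neg).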
%macro IMDCT_CALC_FUNC 0
cglobal imdct_calc, 3,5,3
    mov r3d, [r0 + FFTContext.mdctsize]
    mov r4, [r0 + FFTContext.imdcthalf]
    add r1, r3
    PUSH r3
    PUSH r1
%if ARCH_X86_32
    push r2
    push r1
    push r0
%else
    sub rsp, 8+32*WIN64 ; allocate win64 shadow space
%endif
    call r4
%if ARCH_X86_32
    add esp, 12
%else
    add rsp, 8+32*WIN64
%endif
    POP r1
    POP r3
    lea r0, [r1 + 2*r3]
    mov r2, r3
    sub r3, mmsize
    neg r2
    mova m2, [ps_neg]
.loop:
%if mmsize == 8
    PSWAPD m0, [r1 + r3]
    PSWAPD m1, [r0 + r2]
    pxor   m0, m2
%else
    mova   m0, [r1 + r3]
    mova   m1, [r0 + r2]
    shufps m0, m0, 0x1b
    shufps m1, m1, 0x1b
    xorps  m0, m2
%endif
    mova [r0 + r3], m1
    mova [r1 + r2], m0
    sub r3, mmsize
    add r2, mmsize
    jl .loop
%if cpuflag(3dnow)
    femms
    RET
%else
    REP_RET
%endif
%endmacro

%if ARCH_X86_32
INIT_MMX 3dnow
IMDCT_CALC_FUNC
INIT_MMX 3dnowext
IMDCT_CALC_FUNC
%endif

INIT_XMM sse
IMDCT_CALC_FUNC

%if ARCH_X86_32
INIT_MMX 3dnow
%define mulps pfmul
%define addps pfadd
%define subps pfsub
%define unpcklps punpckldq
%define unpckhps punpckhdq
DECL_PASS pass_3dnow, PASS_SMALL 1, [wq], [wq+o1q]
DECL_PASS pass_interleave_3dnow, PASS_BIG 0
%define pass_3dnowext pass_3dnow
%define pass_interleave_3dnowext pass_interleave_3dnow
%endif

%ifdef PIC
%define SECTION_REL - $$
%else
%define SECTION_REL
%endif

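; DECL_FFT nbits[, suffix]: starting from the base cases above, emit one
; split-radix step per size: fftN calls fft(N/2) on the first half and
; fft(N/4) on each quarter of the second half, then tail-jumps into the pass
; routine with the cos_N twiddle table to combine them. All entry points are
; collected into dispatch_tab for FFT_DISPATCH (stored section-relative when
; building PIC).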
%macro DECL_FFT 1-2 ; nbits, suffix
%ifidn %0, 1
%xdefine fullsuffix SUFFIX
%else
%xdefine fullsuffix %2 %+ SUFFIX
%endif
%xdefine list_of_fft fft4 %+ SUFFIX SECTION_REL, fft8 %+ SUFFIX SECTION_REL
%if %1>=5
%xdefine list_of_fft list_of_fft, fft16 %+ SUFFIX SECTION_REL
%endif
%if %1>=6
%xdefine list_of_fft list_of_fft, fft32 %+ fullsuffix SECTION_REL
%endif

%assign n 1<<%1
%rep 17-%1
%assign n2 n/2
%assign n4 n/4
%xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL

align 16
fft %+ n %+ fullsuffix:
    call fft %+ n2 %+ SUFFIX
    add r0, n*4 - (n&(-2<<%1))
    call fft %+ n4 %+ SUFFIX
    add r0, n*2 - (n2&(-2<<%1))
    call fft %+ n4 %+ SUFFIX
    sub r0, n*6 + (n2&(-2<<%1))
    lea r1, [cos_ %+ n]
    mov r2d, n4/2
    jmp pass %+ fullsuffix

%assign n n*2
%endrep
%undef n

align 8
dispatch_tab %+ fullsuffix: pointer list_of_fft
%endmacro ; DECL_FFT

%if HAVE_AVX_EXTERNAL
INIT_YMM avx
DECL_FFT 6
DECL_FFT 6, _interleave
%endif
INIT_XMM sse
DECL_FFT 5
DECL_FFT 5, _interleave
%if ARCH_X86_32
INIT_MMX 3dnow
DECL_FFT 4
DECL_FFT 4, _interleave
INIT_MMX 3dnowext
DECL_FFT 4
DECL_FFT 4, _interleave
%endif

INIT_XMM sse
%undef mulps
%undef addps
%undef subps
%undef unpcklps
%undef unpckhps

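; PREROTATER: the usual MDCT-via-FFT pre-rotation. Loads two complex samples
; from each end of the reordered input, multiplies them by the tcos/tsin
; twiddles and leaves the results in m0/m2 (3DNow) or xmm0/xmm1 (SSE) for the
; caller to scatter into the FFT buffer in revtab order.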
%macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8
%if mmsize == 8 ; j*2+2-n4, n4-2-j*2, input+n4, tcos+n8, tsin+n8
    PSWAPD m0, [%3+%2*4]
    movq   m2, [%3+%1*4-8]
    movq   m3, m0
    punpckldq m0, m2
    punpckhdq m2, m3
    movd   m1, [%4+%1*2-4] ; tcos[j]
    movd   m3, [%4+%2*2]   ; tcos[n4-j-1]
    punpckldq m1, [%5+%1*2-4] ; tsin[j]
    punpckldq m3, [%5+%2*2]   ; tsin[n4-j-1]

    mova   m4, m0
    PSWAPD m5, m1
    pfmul  m0, m1
    pfmul  m4, m5
    mova   m6, m2
    PSWAPD m5, m3
    pfmul  m2, m3
    pfmul  m6, m5
%if cpuflag(3dnowext)
    pfpnacc m0, m4
    pfpnacc m2, m6
%else
    SBUTTERFLY dq, 0, 4, 1
    SBUTTERFLY dq, 2, 6, 3
    pxor   m4, m7
    pxor   m6, m7
    pfadd  m0, m4
    pfadd  m2, m6
%endif
%else
    movaps xmm0, [%3+%2*4]
    movaps xmm1, [%3+%1*4-0x10]
    movaps xmm2, xmm0
    shufps xmm0, xmm1, 0x88
    shufps xmm1, xmm2, 0x77
    movlps xmm4, [%4+%2*2]
    movlps xmm5, [%5+%2*2+0x0]
    movhps xmm4, [%4+%1*2-0x8]
    movhps xmm5, [%5+%1*2-0x8]
    movaps xmm2, xmm0
    movaps xmm3, xmm1
    mulps  xmm0, xmm5
    mulps  xmm1, xmm4
    mulps  xmm2, xmm4
    mulps  xmm3, xmm5
    subps  xmm1, xmm0
    addps  xmm2, xmm3
    movaps xmm0, xmm1
    unpcklps xmm1, xmm2
    unpckhps xmm0, xmm2
%endif
%endmacro

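; CMUL: complex rotation by the tcos/tsin twiddles, as a simultaneous update:
;   %2 <- %2*[%6+%1] - %3*[%5+%1]
;   %3 <- %3*[%6+%1] + %2*[%5+%1]
; (%5 = tcos base, %6 = tsin base, %1 = byte offset; %4 is unused; clobbers m6/m7)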
%macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
    mulps m6, %3, [%5+%1]
    mulps m7, %2, [%5+%1]
    mulps %2, %2, [%6+%1]
    mulps %3, %3, [%6+%1]
    subps %2, %2, m6
    addps %3, %3, m7
%endmacro

%macro POSROTATESHUF_AVX 5 ;j, k, z+n8, tcos+n8, tsin+n8
.post:
    vmovaps ymm1, [%3+%1*2]
    vmovaps ymm0, [%3+%1*2+0x20]
    vmovaps ymm3, [%3+%2*2]
    vmovaps ymm2, [%3+%2*2+0x20]

    CMUL %1, ymm0, ymm1, %3, %4, %5
    CMUL %2, ymm2, ymm3, %3, %4, %5
    vshufps ymm1, ymm1, ymm1, 0x1b
    vshufps ymm3, ymm3, ymm3, 0x1b
    vperm2f128 ymm1, ymm1, ymm1, 0x01
    vperm2f128 ymm3, ymm3, ymm3, 0x01
    vunpcklps ymm6, ymm2, ymm1
    vunpckhps ymm4, ymm2, ymm1
    vunpcklps ymm7, ymm0, ymm3
    vunpckhps ymm5, ymm0, ymm3

    vextractf128 [%3+%1*2],      ymm7, 0
    vextractf128 [%3+%1*2+0x10], ymm5, 0
    vextractf128 [%3+%1*2+0x20], ymm7, 1
    vextractf128 [%3+%1*2+0x30], ymm5, 1

    vextractf128 [%3+%2*2],      ymm6, 0
    vextractf128 [%3+%2*2+0x10], ymm4, 0
    vextractf128 [%3+%2*2+0x20], ymm6, 1
    vextractf128 [%3+%2*2+0x30], ymm4, 1
    sub %2, 0x20
    add %1, 0x20
    jl .post
%endmacro

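; Post-rotation: walk the FFT output from both ends, rotate each block by the
; tcos/tsin twiddles (CMUL), reverse and re-interleave real/imag, and write
; the blocks back crosswise so the IMDCT output ends up in natural order.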
%macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
.post:
    movaps xmm1, [%3+%1*2]
    movaps xmm0, [%3+%1*2+0x10]
    CMUL   %1, xmm0, xmm1, %3, %4, %5
    movaps xmm5, [%3+%2*2]
    movaps xmm4, [%3+%2*2+0x10]
    CMUL   %2, xmm4, xmm5, %3, %4, %5
    shufps xmm1, xmm1, 0x1b
    shufps xmm5, xmm5, 0x1b
    movaps xmm6, xmm4
    unpckhps xmm4, xmm1
    unpcklps xmm6, xmm1
    movaps xmm2, xmm0
    unpcklps xmm0, xmm5
    unpckhps xmm2, xmm5
    movaps [%3+%2*2],      xmm6
    movaps [%3+%2*2+0x10], xmm4
    movaps [%3+%1*2],      xmm0
    movaps [%3+%1*2+0x10], xmm2
    sub %2, 0x10
    add %1, 0x10
    jl .post
%endmacro

%macro CMUL_3DNOW 6
    mova  m6, [%1+%2*2]
    mova  %3, [%1+%2*2+8]
    mova  %4, m6
    mova  m7, %3
    pfmul m6, [%5+%2]
    pfmul %3, [%6+%2]
    pfmul %4, [%6+%2]
    pfmul m7, [%5+%2]
    pfsub %3, m6
    pfadd %4, m7
%endmacro

%macro POSROTATESHUF_3DNOW 5 ;j, k, z+n8, tcos+n8, tsin+n8
.post:
    CMUL_3DNOW %3, %1, m0, m1, %4, %5
    CMUL_3DNOW %3, %2, m2, m3, %4, %5
    movd [%3+%1*2+ 0], m0
    movd [%3+%2*2+12], m1
    movd [%3+%2*2+ 0], m2
    movd [%3+%1*2+12], m3
    psrlq m0, 32
    psrlq m1, 32
    psrlq m2, 32
    psrlq m3, 32
    movd [%3+%1*2+ 8], m0
    movd [%3+%2*2+ 4], m1
    movd [%3+%2*2+ 8], m2
    movd [%3+%1*2+ 4], m3
    sub %2, 8
    add %1, 8
    jl .post
%endmacro

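; imdct_half(FFTContext *s, FFTSample *output, const FFTSample *input):
; pre-rotate the input with tcos/tsin (PREROTATER), storing the rotated values
; into output in revtab order, run the in-place FFT via FFT_DISPATCH, then
; post-rotate and reorder with %1 (one of the POSROTATESHUF variants).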
%macro DECL_IMDCT 1
cglobal imdct_half, 3,12,8 ; FFTContext *s, FFTSample *output, const FFTSample *input
%if ARCH_X86_64
%define rrevtab r7
%define rtcos   r8
%define rtsin   r9
%else
%define rrevtab r6
%define rtsin   r6
%define rtcos   r5
%endif
    mov r3d, [r0+FFTContext.mdctsize]
    add r2, r3
    shr r3, 1
    mov rtcos, [r0+FFTContext.tcos]
    mov rtsin, [r0+FFTContext.tsin]
    add rtcos, r3
    add rtsin, r3
%if ARCH_X86_64 == 0
    push rtcos
    push rtsin
%endif
    shr r3, 1
    mov rrevtab, [r0+FFTContext.revtab]
    add rrevtab, r3
%if ARCH_X86_64 == 0
    push rrevtab
%endif

%if mmsize == 8
    sub r3, 2
%else
    sub r3, 4
%endif
%if ARCH_X86_64 || mmsize == 8
    xor r4, r4
    sub r4, r3
%endif
%if notcpuflag(3dnowext) && mmsize == 8
    movd m7, [ps_neg]
%endif
.pre:
%if ARCH_X86_64 == 0
;unspill
%if mmsize != 8
    xor r4, r4
    sub r4, r3
%endif
    mov rtcos, [esp+8]
    mov rtsin, [esp+4]
%endif

    PREROTATER r4, r3, r2, rtcos, rtsin
%if mmsize == 8
    mov   r6, [esp]                ; rrevtab = ptr+n8
    movzx r5, word [rrevtab+r4-2]  ; rrevtab[j]
    movzx r6, word [rrevtab+r3]    ; rrevtab[n4-j-1]
    mova [r1+r5*8], m0
    mova [r1+r6*8], m2
    add r4, 2
    sub r3, 2
%else
%if ARCH_X86_64
    movzx r5,  word [rrevtab+r4-4]
    movzx r6,  word [rrevtab+r4-2]
    movzx r10, word [rrevtab+r3]
    movzx r11, word [rrevtab+r3+2]
    movlps [r1+r5 *8], xmm0
    movhps [r1+r6 *8], xmm0
    movlps [r1+r10*8], xmm1
    movhps [r1+r11*8], xmm1
    add r4, 4
%else
    mov   r6, [esp]
    movzx r5, word [r6+r4-4]
    movzx r4, word [r6+r4-2]
    movlps [r1+r5*8], xmm0
    movhps [r1+r4*8], xmm0
    movzx r5, word [r6+r3]
    movzx r4, word [r6+r3+2]
    movlps [r1+r5*8], xmm1
    movhps [r1+r4*8], xmm1
%endif
    sub r3, 4
%endif
    jns .pre

    mov r5, r0
    mov r6, r1
    mov r0, r1
    mov r1d, [r5+FFTContext.nbits]

    FFT_DISPATCH SUFFIX, r1

    mov r0d, [r5+FFTContext.mdctsize]
    add r6, r0
    shr r0, 1
%if ARCH_X86_64 == 0
%define rtcos r2
%define rtsin r3
    mov rtcos, [esp+8]
    mov rtsin, [esp+4]
%endif
    neg r0
    mov r1, -mmsize
    sub r1, r0
    %1 r0, r1, r6, rtcos, rtsin
%if ARCH_X86_64 == 0
    add esp, 12
%endif
%if mmsize == 8
    femms
%endif
    RET
%endmacro

DECL_IMDCT POSROTATESHUF

%if ARCH_X86_32
INIT_MMX 3dnow
DECL_IMDCT POSROTATESHUF_3DNOW

INIT_MMX 3dnowext
DECL_IMDCT POSROTATESHUF_3DNOW
%endif

INIT_YMM avx

%if HAVE_AVX_EXTERNAL
DECL_IMDCT POSROTATESHUF_AVX
%endif