@ (git-blame table header removed — not part of the source)
/*
 * Copyright (c) 2013 RISC OS Open Ltd
 * Author: Ben Avison <bavison@riscosopen.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

@ The fftx_internal_vfp versions of the functions obey a modified AAPCS:
@ VFP is in RunFast mode, vector length 4, stride 1 throughout, and
@ all single-precision VFP registers may be corrupted on exit. The a2
@ register may not be clobbered in these functions, as it holds the
@ stored original FPSCR.
30 | function ff_fft_calc_vfp, export=1 | |
31 | ldr ip, [a1, #0] @ nbits | |
32 | mov a1, a2 | |
33 | A ldr pc, [pc, ip, lsl #2] | |
34 | A .word 0 | |
35 | A .word 0 | |
36 | A .word 0 | |
37 | T movrel a2, (fft_tab_vfp - 8) | |
38 | T ldr pc, [a2, ip, lsl #2] | |
39 | T endfunc | |
40 | T const fft_tab_vfp | |
41 | .word fft4_vfp | |
42 | .word fft8_vfp | |
43 | .word X(ff_fft16_vfp) @ this one alone is exported | |
44 | .word fft32_vfp | |
45 | .word fft64_vfp | |
46 | .word fft128_vfp | |
47 | .word fft256_vfp | |
48 | .word fft512_vfp | |
49 | .word fft1024_vfp | |
50 | .word fft2048_vfp | |
51 | .word fft4096_vfp | |
52 | .word fft8192_vfp | |
53 | .word fft16384_vfp | |
54 | .word fft32768_vfp | |
55 | .word fft65536_vfp | |
56 | A endfunc | |
57 | ||
58 | function fft4_vfp | |
59 | vldr d0, [a1, #0*2*4] @ s0,s1 = z[0] | |
60 | vldr d4, [a1, #1*2*4] @ s8,s9 = z[1] | |
61 | vldr d1, [a1, #2*2*4] @ s2,s3 = z[2] | |
62 | vldr d5, [a1, #3*2*4] @ s10,s11 = z[3] | |
63 | @ stall | |
64 | vadd.f s12, s0, s8 @ i0 | |
65 | vadd.f s13, s1, s9 @ i1 | |
66 | vadd.f s14, s2, s10 @ i2 | |
67 | vadd.f s15, s3, s11 @ i3 | |
68 | vsub.f s8, s0, s8 @ i4 | |
69 | vsub.f s9, s1, s9 @ i5 | |
70 | vsub.f s10, s2, s10 @ i6 | |
71 | vsub.f s11, s3, s11 @ i7 | |
72 | @ stall | |
73 | @ stall | |
74 | vadd.f s0, s12, s14 @ z[0].re | |
75 | vsub.f s4, s12, s14 @ z[2].re | |
76 | vadd.f s1, s13, s15 @ z[0].im | |
77 | vsub.f s5, s13, s15 @ z[2].im | |
78 | vadd.f s7, s9, s10 @ z[3].im | |
79 | vsub.f s3, s9, s10 @ z[1].im | |
80 | vadd.f s2, s8, s11 @ z[1].re | |
81 | vsub.f s6, s8, s11 @ z[3].re | |
82 | @ stall | |
83 | @ stall | |
84 | vstr d0, [a1, #0*2*4] | |
85 | vstr d2, [a1, #2*2*4] | |
86 | @ stall | |
87 | @ stall | |
88 | vstr d1, [a1, #1*2*4] | |
89 | vstr d3, [a1, #3*2*4] | |
90 | ||
91 | bx lr | |
92 | endfunc | |
93 | ||
94 | .macro macro_fft8_head | |
95 | @ FFT4 | |
96 | vldr d4, [a1, #0 * 2*4] | |
97 | vldr d6, [a1, #1 * 2*4] | |
98 | vldr d5, [a1, #2 * 2*4] | |
99 | vldr d7, [a1, #3 * 2*4] | |
100 | @ BF | |
101 | vldr d12, [a1, #4 * 2*4] | |
102 | vadd.f s16, s8, s12 @ vector op | |
103 | vldr d14, [a1, #5 * 2*4] | |
104 | vldr d13, [a1, #6 * 2*4] | |
105 | vldr d15, [a1, #7 * 2*4] | |
106 | vsub.f s20, s8, s12 @ vector op | |
107 | vadd.f s0, s16, s18 | |
108 | vsub.f s2, s16, s18 | |
109 | vadd.f s1, s17, s19 | |
110 | vsub.f s3, s17, s19 | |
111 | vadd.f s7, s21, s22 | |
112 | vsub.f s5, s21, s22 | |
113 | vadd.f s4, s20, s23 | |
114 | vsub.f s6, s20, s23 | |
115 | vsub.f s20, s24, s28 @ vector op | |
116 | vstr d0, [a1, #0 * 2*4] @ transfer s0-s7 to s24-s31 via memory | |
117 | vstr d1, [a1, #1 * 2*4] | |
118 | vldr s0, cos1pi4 | |
119 | vadd.f s16, s24, s28 @ vector op | |
120 | vstr d2, [a1, #2 * 2*4] | |
121 | vstr d3, [a1, #3 * 2*4] | |
122 | vldr d12, [a1, #0 * 2*4] | |
123 | @ TRANSFORM | |
124 | vmul.f s20, s20, s0 @ vector x scalar op | |
125 | vldr d13, [a1, #1 * 2*4] | |
126 | vldr d14, [a1, #2 * 2*4] | |
127 | vldr d15, [a1, #3 * 2*4] | |
128 | @ BUTTERFLIES | |
129 | vadd.f s0, s18, s16 | |
130 | vadd.f s1, s17, s19 | |
131 | vsub.f s2, s17, s19 | |
132 | vsub.f s3, s18, s16 | |
133 | vadd.f s4, s21, s20 | |
134 | vsub.f s5, s21, s20 | |
135 | vadd.f s6, s22, s23 | |
136 | vsub.f s7, s22, s23 | |
137 | vadd.f s8, s0, s24 @ vector op | |
138 | vstr d0, [a1, #0 * 2*4] @ transfer s0-s3 to s12-s15 via memory | |
139 | vstr d1, [a1, #1 * 2*4] | |
140 | vldr d6, [a1, #0 * 2*4] | |
141 | vldr d7, [a1, #1 * 2*4] | |
142 | vadd.f s1, s5, s6 | |
143 | vadd.f s0, s7, s4 | |
144 | vsub.f s2, s5, s6 | |
145 | vsub.f s3, s7, s4 | |
146 | vsub.f s12, s24, s12 @ vector op | |
147 | vsub.f s5, s29, s1 | |
148 | vsub.f s4, s28, s0 | |
149 | vsub.f s6, s30, s2 | |
150 | vsub.f s7, s31, s3 | |
151 | vadd.f s16, s0, s28 @ vector op | |
152 | vstr d6, [a1, #4 * 2*4] | |
153 | vstr d7, [a1, #6 * 2*4] | |
154 | vstr d4, [a1, #0 * 2*4] | |
155 | vstr d5, [a1, #2 * 2*4] | |
156 | vstr d2, [a1, #5 * 2*4] | |
157 | vstr d3, [a1, #7 * 2*4] | |
158 | .endm | |
159 | ||
160 | .macro macro_fft8_tail | |
161 | vstr d8, [a1, #1 * 2*4] | |
162 | vstr d9, [a1, #3 * 2*4] | |
163 | .endm | |
164 | ||
165 | function .Lfft8_internal_vfp | |
166 | macro_fft8_head | |
167 | macro_fft8_tail | |
168 | bx lr | |
169 | endfunc | |
170 | ||
171 | function fft8_vfp | |
172 | ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1 | |
173 | fmrx a2, FPSCR | |
174 | fmxr FPSCR, a3 | |
175 | vpush {s16-s31} | |
176 | mov ip, lr | |
177 | bl .Lfft8_internal_vfp | |
178 | vpop {s16-s31} | |
179 | fmxr FPSCR, a2 | |
180 | bx ip | |
181 | endfunc | |
182 | ||
183 | .align 3 | |
184 | cos1pi4: @ cos(1*pi/4) = sqrt(2) | |
185 | .float 0.707106769084930419921875 | |
186 | cos1pi8: @ cos(1*pi/8) = sqrt(2+sqrt(2))/2 | |
187 | .float 0.92387950420379638671875 | |
188 | cos3pi8: @ cos(2*pi/8) = sqrt(2-sqrt(2))/2 | |
189 | .float 0.3826834261417388916015625 | |
190 | ||
191 | function .Lfft16_internal_vfp | |
192 | macro_fft8_head | |
193 | @ FFT4(z+8) | |
194 | vldr d10, [a1, #8 * 2*4] | |
195 | vldr d12, [a1, #9 * 2*4] | |
196 | vldr d11, [a1, #10 * 2*4] | |
197 | vldr d13, [a1, #11 * 2*4] | |
198 | macro_fft8_tail | |
199 | vadd.f s16, s20, s24 @ vector op | |
200 | @ FFT4(z+12) | |
201 | vldr d4, [a1, #12 * 2*4] | |
202 | vldr d6, [a1, #13 * 2*4] | |
203 | vldr d5, [a1, #14 * 2*4] | |
204 | vsub.f s20, s20, s24 @ vector op | |
205 | vldr d7, [a1, #15 * 2*4] | |
206 | vadd.f s0, s16, s18 | |
207 | vsub.f s4, s16, s18 | |
208 | vadd.f s1, s17, s19 | |
209 | vsub.f s5, s17, s19 | |
210 | vadd.f s7, s21, s22 | |
211 | vsub.f s3, s21, s22 | |
212 | vadd.f s2, s20, s23 | |
213 | vsub.f s6, s20, s23 | |
214 | vadd.f s16, s8, s12 @ vector op | |
215 | vstr d0, [a1, #8 * 2*4] | |
216 | vstr d2, [a1, #10 * 2*4] | |
217 | vstr d1, [a1, #9 * 2*4] | |
218 | vsub.f s20, s8, s12 | |
219 | vstr d3, [a1, #11 * 2*4] | |
220 | @ TRANSFORM(z[2],z[6],z[10],z[14],cos1pi4,cos1pi4) | |
221 | vldr d12, [a1, #10 * 2*4] | |
222 | vadd.f s0, s16, s18 | |
223 | vadd.f s1, s17, s19 | |
224 | vsub.f s6, s16, s18 | |
225 | vsub.f s7, s17, s19 | |
226 | vsub.f s3, s21, s22 | |
227 | vadd.f s2, s20, s23 | |
228 | vadd.f s5, s21, s22 | |
229 | vsub.f s4, s20, s23 | |
230 | vstr d0, [a1, #12 * 2*4] | |
231 | vmov s0, s6 | |
232 | @ TRANSFORM(z[1],z[5],z[9],z[13],cos1pi8,cos3pi8) | |
233 | vldr d6, [a1, #9 * 2*4] | |
234 | vstr d1, [a1, #13 * 2*4] | |
235 | vldr d1, cos1pi4 @ s2 = cos1pi4, s3 = cos1pi8 | |
236 | vstr d2, [a1, #15 * 2*4] | |
237 | vldr d7, [a1, #13 * 2*4] | |
238 | vadd.f s4, s25, s24 | |
239 | vsub.f s5, s25, s24 | |
240 | vsub.f s6, s0, s7 | |
241 | vadd.f s7, s0, s7 | |
242 | vmul.f s20, s12, s3 @ vector op | |
243 | @ TRANSFORM(z[3],z[7],z[11],z[15],cos3pi8,cos1pi8) | |
244 | vldr d4, [a1, #11 * 2*4] | |
245 | vldr d5, [a1, #15 * 2*4] | |
246 | vldr s1, cos3pi8 | |
247 | vmul.f s24, s4, s2 @ vector * scalar op | |
248 | vmul.f s28, s12, s1 @ vector * scalar op | |
249 | vmul.f s12, s8, s1 @ vector * scalar op | |
250 | vadd.f s4, s20, s29 | |
251 | vsub.f s5, s21, s28 | |
252 | vsub.f s6, s22, s31 | |
253 | vadd.f s7, s23, s30 | |
254 | vmul.f s8, s8, s3 @ vector * scalar op | |
255 | vldr d8, [a1, #1 * 2*4] | |
256 | vldr d9, [a1, #5 * 2*4] | |
257 | vldr d10, [a1, #3 * 2*4] | |
258 | vldr d11, [a1, #7 * 2*4] | |
259 | vldr d14, [a1, #2 * 2*4] | |
260 | vadd.f s0, s6, s4 | |
261 | vadd.f s1, s5, s7 | |
262 | vsub.f s2, s5, s7 | |
263 | vsub.f s3, s6, s4 | |
264 | vadd.f s4, s12, s9 | |
265 | vsub.f s5, s13, s8 | |
266 | vsub.f s6, s14, s11 | |
267 | vadd.f s7, s15, s10 | |
268 | vadd.f s12, s0, s16 @ vector op | |
269 | vstr d0, [a1, #1 * 2*4] | |
270 | vstr d1, [a1, #5 * 2*4] | |
271 | vldr d4, [a1, #1 * 2*4] | |
272 | vldr d5, [a1, #5 * 2*4] | |
273 | vadd.f s0, s6, s4 | |
274 | vadd.f s1, s5, s7 | |
275 | vsub.f s2, s5, s7 | |
276 | vsub.f s3, s6, s4 | |
277 | vsub.f s8, s16, s8 @ vector op | |
278 | vstr d6, [a1, #1 * 2*4] | |
279 | vstr d7, [a1, #5 * 2*4] | |
280 | vldr d15, [a1, #6 * 2*4] | |
281 | vsub.f s4, s20, s0 | |
282 | vsub.f s5, s21, s1 | |
283 | vsub.f s6, s22, s2 | |
284 | vsub.f s7, s23, s3 | |
285 | vadd.f s20, s0, s20 @ vector op | |
286 | vstr d4, [a1, #9 * 2*4] | |
287 | @ TRANSFORM_ZERO(z[0],z[4],z[8],z[12]) | |
288 | vldr d6, [a1, #8 * 2*4] | |
289 | vstr d5, [a1, #13 * 2*4] | |
290 | vldr d7, [a1, #12 * 2*4] | |
291 | vstr d2, [a1, #11 * 2*4] | |
292 | vldr d8, [a1, #0 * 2*4] | |
293 | vstr d3, [a1, #15 * 2*4] | |
294 | vldr d9, [a1, #4 * 2*4] | |
295 | vadd.f s0, s26, s24 | |
296 | vadd.f s1, s25, s27 | |
297 | vsub.f s2, s25, s27 | |
298 | vsub.f s3, s26, s24 | |
299 | vadd.f s4, s14, s12 | |
300 | vadd.f s5, s13, s15 | |
301 | vsub.f s6, s13, s15 | |
302 | vsub.f s7, s14, s12 | |
303 | vadd.f s8, s0, s28 @ vector op | |
304 | vstr d0, [a1, #3 * 2*4] | |
305 | vstr d1, [a1, #7 * 2*4] | |
306 | vldr d6, [a1, #3 * 2*4] | |
307 | vldr d7, [a1, #7 * 2*4] | |
308 | vsub.f s0, s16, s4 | |
309 | vsub.f s1, s17, s5 | |
310 | vsub.f s2, s18, s6 | |
311 | vsub.f s3, s19, s7 | |
312 | vsub.f s12, s28, s12 @ vector op | |
313 | vadd.f s16, s4, s16 @ vector op | |
314 | vstr d10, [a1, #3 * 2*4] | |
315 | vstr d11, [a1, #7 * 2*4] | |
316 | vstr d4, [a1, #2 * 2*4] | |
317 | vstr d5, [a1, #6 * 2*4] | |
318 | vstr d0, [a1, #8 * 2*4] | |
319 | vstr d1, [a1, #12 * 2*4] | |
320 | vstr d6, [a1, #10 * 2*4] | |
321 | vstr d7, [a1, #14 * 2*4] | |
322 | vstr d8, [a1, #0 * 2*4] | |
323 | vstr d9, [a1, #4 * 2*4] | |
324 | ||
325 | bx lr | |
326 | endfunc | |
327 | ||
328 | function ff_fft16_vfp, export=1 | |
329 | ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1 | |
330 | fmrx a2, FPSCR | |
331 | fmxr FPSCR, a3 | |
332 | vpush {s16-s31} | |
333 | mov ip, lr | |
334 | bl .Lfft16_internal_vfp | |
335 | vpop {s16-s31} | |
336 | fmxr FPSCR, a2 | |
337 | bx ip | |
338 | endfunc | |
339 | ||
340 | .macro pass n, z0, z1, z2, z3 | |
341 | add v6, v5, #4*2*\n | |
342 | @ TRANSFORM_ZERO(z[0],z[o1],z[o2],z[o3]) | |
343 | @ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]) | |
344 | @ TRANSFORM(z[0],z[o1],z[o2],z[o3],wre[0],wim[0]) | |
345 | @ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]) | |
346 | vldr d8, [\z2, #8*(o2+1)] @ s16,s17 | |
347 | vldmdb v6!, {s2} | |
348 | vldr d9, [\z3, #8*(o3+1)] @ s18,s19 | |
349 | vldmia v5!, {s0,s1} @ s0 is unused | |
350 | vldr s7, [\z2, #8*o2] @ t1 | |
351 | vmul.f s20, s16, s2 @ vector * scalar | |
352 | vldr s0, [\z3, #8*o3] @ t5 | |
353 | vldr s6, [\z2, #8*o2+4] @ t2 | |
354 | vldr s3, [\z3, #8*o3+4] @ t6 | |
355 | vmul.f s16, s16, s1 @ vector * scalar | |
356 | ldr a4, =\n-1 | |
357 | 1: add \z0, \z0, #8*2 | |
358 | .if \n*4*2 >= 512 | |
359 | add \z1, \z1, #8*2 | |
360 | .endif | |
361 | .if \n*4*2 >= 256 | |
362 | add \z2, \z2, #8*2 | |
363 | .endif | |
364 | .if \n*4*2 >= 512 | |
365 | add \z3, \z3, #8*2 | |
366 | .endif | |
367 | @ up to 2 stalls (VFP vector issuing / waiting for s0) | |
368 | @ depending upon whether this is the first iteration and | |
369 | @ how many add instructions are inserted above | |
370 | vadd.f s4, s0, s7 @ t5 | |
371 | vadd.f s5, s6, s3 @ t6 | |
372 | vsub.f s6, s6, s3 @ t4 | |
373 | vsub.f s7, s0, s7 @ t3 | |
374 | vldr d6, [\z0, #8*0-8*2] @ s12,s13 | |
375 | vadd.f s0, s16, s21 @ t1 | |
376 | vldr d7, [\z1, #8*o1-8*2] @ s14,s15 | |
377 | vsub.f s1, s18, s23 @ t5 | |
378 | vadd.f s8, s4, s12 @ vector + vector | |
379 | @ stall (VFP vector issuing) | |
380 | @ stall (VFP vector issuing) | |
381 | @ stall (VFP vector issuing) | |
382 | vsub.f s4, s12, s4 | |
383 | vsub.f s5, s13, s5 | |
384 | vsub.f s6, s14, s6 | |
385 | vsub.f s7, s15, s7 | |
386 | vsub.f s2, s17, s20 @ t2 | |
387 | vadd.f s3, s19, s22 @ t6 | |
388 | vstr d4, [\z0, #8*0-8*2] @ s8,s9 | |
389 | vstr d5, [\z1, #8*o1-8*2] @ s10,s11 | |
390 | @ stall (waiting for s5) | |
391 | vstr d2, [\z2, #8*o2-8*2] @ s4,s5 | |
392 | vadd.f s4, s1, s0 @ t5 | |
393 | vstr d3, [\z3, #8*o3-8*2] @ s6,s7 | |
394 | vsub.f s7, s1, s0 @ t3 | |
395 | vadd.f s5, s2, s3 @ t6 | |
396 | vsub.f s6, s2, s3 @ t4 | |
397 | vldr d6, [\z0, #8*1-8*2] @ s12,s13 | |
398 | vldr d7, [\z1, #8*(o1+1)-8*2] @ s14,s15 | |
399 | vldr d4, [\z2, #8*o2] @ s8,s9 | |
400 | vldmdb v6!, {s2,s3} | |
401 | vldr d5, [\z3, #8*o3] @ s10,s11 | |
402 | vadd.f s20, s4, s12 @ vector + vector | |
403 | vldmia v5!, {s0,s1} | |
404 | vldr d8, [\z2, #8*(o2+1)] @ s16,s17 | |
405 | @ stall (VFP vector issuing) | |
406 | vsub.f s4, s12, s4 | |
407 | vsub.f s5, s13, s5 | |
408 | vsub.f s6, s14, s6 | |
409 | vsub.f s7, s15, s7 | |
410 | vmul.f s12, s8, s3 @ vector * scalar | |
411 | vstr d10, [\z0, #8*1-8*2] @ s20,s21 | |
412 | vldr d9, [\z3, #8*(o3+1)] @ s18,s19 | |
413 | vstr d11, [\z1, #8*(o1+1)-8*2] @ s22,s23 | |
414 | vmul.f s8, s8, s0 @ vector * scalar | |
415 | vstr d2, [\z2, #8*(o2+1)-8*2] @ s4,s5 | |
416 | @ stall (waiting for s7) | |
417 | vstr d3, [\z3, #8*(o3+1)-8*2] @ s6,s7 | |
418 | vmul.f s20, s16, s2 @ vector * scalar | |
419 | @ stall (VFP vector issuing) | |
420 | @ stall (VFP vector issuing) | |
421 | @ stall (VFP vector issuing) | |
422 | vadd.f s7, s8, s13 @ t1 | |
423 | vsub.f s6, s9, s12 @ t2 | |
424 | vsub.f s0, s10, s15 @ t5 | |
425 | vadd.f s3, s11, s14 @ t6 | |
426 | vmul.f s16, s16, s1 @ vector * scalar | |
427 | subs a4, a4, #1 | |
428 | bne 1b | |
429 | @ What remains is identical to the first two indentations of | |
430 | @ the above, but without the increment of z | |
431 | vadd.f s4, s0, s7 @ t5 | |
432 | vadd.f s5, s6, s3 @ t6 | |
433 | vsub.f s6, s6, s3 @ t4 | |
434 | vsub.f s7, s0, s7 @ t3 | |
435 | vldr d6, [\z0, #8*0] @ s12,s13 | |
436 | vadd.f s0, s16, s21 @ t1 | |
437 | vldr d7, [\z1, #8*o1] @ s14,s15 | |
438 | vsub.f s1, s18, s23 @ t5 | |
439 | vadd.f s8, s4, s12 @ vector + vector | |
440 | vsub.f s4, s12, s4 | |
441 | vsub.f s5, s13, s5 | |
442 | vsub.f s6, s14, s6 | |
443 | vsub.f s7, s15, s7 | |
444 | vsub.f s2, s17, s20 @ t2 | |
445 | vadd.f s3, s19, s22 @ t6 | |
446 | vstr d4, [\z0, #8*0] @ s8,s9 | |
447 | vstr d5, [\z1, #8*o1] @ s10,s11 | |
448 | vstr d2, [\z2, #8*o2] @ s4,s5 | |
449 | vadd.f s4, s1, s0 @ t5 | |
450 | vstr d3, [\z3, #8*o3] @ s6,s7 | |
451 | vsub.f s7, s1, s0 @ t3 | |
452 | vadd.f s5, s2, s3 @ t6 | |
453 | vsub.f s6, s2, s3 @ t4 | |
454 | vldr d6, [\z0, #8*1] @ s12,s13 | |
455 | vldr d7, [\z1, #8*(o1+1)] @ s14,s15 | |
456 | vadd.f s20, s4, s12 @ vector + vector | |
457 | vsub.f s4, s12, s4 | |
458 | vsub.f s5, s13, s5 | |
459 | vsub.f s6, s14, s6 | |
460 | vsub.f s7, s15, s7 | |
461 | vstr d10, [\z0, #8*1] @ s20,s21 | |
462 | vstr d11, [\z1, #8*(o1+1)] @ s22,s23 | |
463 | vstr d2, [\z2, #8*(o2+1)] @ s4,s5 | |
464 | vstr d3, [\z3, #8*(o3+1)] @ s6,s7 | |
465 | .endm | |
466 | ||
467 | .macro def_fft n, n2, n4 | |
468 | function .Lfft\n\()_internal_vfp | |
469 | .if \n >= 512 | |
470 | push {v1-v6,lr} | |
471 | .elseif \n >= 256 | |
472 | push {v1-v2,v5-v6,lr} | |
473 | .else | |
474 | push {v1,v5-v6,lr} | |
475 | .endif | |
476 | mov v1, a1 | |
477 | bl .Lfft\n2\()_internal_vfp | |
478 | add a1, v1, #8*(\n/4)*2 | |
479 | bl .Lfft\n4\()_internal_vfp | |
480 | movrelx v5, X(ff_cos_\n), a1 | |
481 | add a1, v1, #8*(\n/4)*3 | |
482 | bl .Lfft\n4\()_internal_vfp | |
483 | .if \n >= 512 | |
484 | .set o1, 0*(\n/4/2) | |
485 | .set o2, 0*(\n/4/2) | |
486 | .set o3, 0*(\n/4/2) | |
487 | add v2, v1, #8*2*(\n/4/2) | |
488 | add v3, v1, #8*4*(\n/4/2) | |
489 | add v4, v1, #8*6*(\n/4/2) | |
490 | pass (\n/4/2), v1, v2, v3, v4 | |
491 | pop {v1-v6,pc} | |
492 | .elseif \n >= 256 | |
493 | .set o1, 2*(\n/4/2) | |
494 | .set o2, 0*(\n/4/2) | |
495 | .set o3, 2*(\n/4/2) | |
496 | add v2, v1, #8*4*(\n/4/2) | |
497 | pass (\n/4/2), v1, v1, v2, v2 | |
498 | pop {v1-v2,v5-v6,pc} | |
499 | .else | |
500 | .set o1, 2*(\n/4/2) | |
501 | .set o2, 4*(\n/4/2) | |
502 | .set o3, 6*(\n/4/2) | |
503 | pass (\n/4/2), v1, v1, v1, v1 | |
504 | pop {v1,v5-v6,pc} | |
505 | .endif | |
506 | endfunc | |
507 | ||
508 | function fft\n\()_vfp | |
509 | ldr a3, =0x03030000 /* RunFast mode, vector length 4, stride 1 */ | |
510 | fmrx a2, FPSCR | |
511 | fmxr FPSCR, a3 | |
512 | vpush {s16-s31} | |
513 | mov ip, lr | |
514 | bl .Lfft\n\()_internal_vfp | |
515 | vpop {s16-s31} | |
516 | fmxr FPSCR, a2 | |
517 | bx ip | |
518 | endfunc | |
519 | ||
520 | .ltorg | |
521 | .endm | |
522 | ||
523 | def_fft 32, 16, 8 | |
524 | def_fft 64, 32, 16 | |
525 | def_fft 128, 64, 32 | |
526 | def_fft 256, 128, 64 | |
527 | def_fft 512, 256, 128 | |
528 | def_fft 1024, 512, 256 | |
529 | def_fft 2048, 1024, 512 | |
530 | def_fft 4096, 2048, 1024 | |
531 | def_fft 8192, 4096, 2048 | |
532 | def_fft 16384, 8192, 4096 | |
533 | def_fft 32768, 16384, 8192 | |
534 | def_fft 65536, 32768, 16384 |