2 * Copyright (c) 2013 RISC OS Open Ltd
3 * Author: Ben Avison <bavison@riscosopen.org>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 #include "libavutil/arm/asm.S"
24 @ The fftx_internal_vfp versions of the functions obey a modified AAPCS:
25 @ VFP is in RunFast mode, vector length 4, stride 1 throughout, and
26 @ all single-precision VFP registers may be corrupted on exit. The a2
27 @ register may not be clobbered in these functions, as it holds the
28 @ stored original FPSCR.
30 function ff_fft_calc_vfp, export=1
@ Public entry point: dispatch to a size-specific FFT routine via a jump
@ table indexed by the transform size exponent (the first word of the
@ context pointed to by a1).
@ NOTE(review): this excerpt is sampled from a larger file (the embedded
@ numeric prefixes are original line numbers and several lines are
@ missing), so the table below is only partially visible.
31 ldr ip, [a1, #0] @ nbits
@ ARM state: index the pc-relative jump table directly (pc reads as .+8).
33 A ldr pc, [pc, ip, lsl #2]
@ Thumb state: pc cannot be used as a base with an index register, so
@ materialise the table address instead; the -8 bias presumably mirrors
@ the ARM pc-relative offset above — TODO confirm against full source.
37 T movrel a2, (fft_tab_vfp - 8)
38 T ldr pc, [a2, ip, lsl #2]
@ Jump-table entry: the 16-point transform is the only one exported
@ with external linkage.
43 .word X(ff_fft16_vfp) @ this one alone is exported
@ 4-point FFT butterfly over in-place complex float data at a1
@ (z[0..3], re/im interleaved, 8 bytes per complex element).
@ NOTE(review): the function header for this body lies in lines not
@ visible in this excerpt.
59 vldr d0, [a1, #0*2*4] @ s0,s1 = z[0]
60 vldr d4, [a1, #1*2*4] @ s8,s9 = z[1]
61 vldr d1, [a1, #2*2*4] @ s2,s3 = z[2]
62 vldr d5, [a1, #3*2*4] @ s10,s11 = z[3]
@ Stage 1: radix-2 sums and differences of the four inputs.
64 vadd.f s12, s0, s8 @ i0
65 vadd.f s13, s1, s9 @ i1
66 vadd.f s14, s2, s10 @ i2
67 vadd.f s15, s3, s11 @ i3
68 vsub.f s8, s0, s8 @ i4
69 vsub.f s9, s1, s9 @ i5
70 vsub.f s10, s2, s10 @ i6
71 vsub.f s11, s3, s11 @ i7
@ Stage 2: combine the intermediates into the four output points
@ (results left in s0-s7; the store-back is not visible here).
74 vadd.f s0, s12, s14 @ z[0].re
75 vsub.f s4, s12, s14 @ z[2].re
76 vadd.f s1, s13, s15 @ z[0].im
77 vsub.f s5, s13, s15 @ z[2].im
78 vadd.f s7, s9, s10 @ z[3].im
79 vsub.f s3, s9, s10 @ z[1].im
80 vadd.f s2, s8, s11 @ z[1].re
81 vsub.f s6, s8, s11 @ z[3].re
94 .macro macro_fft8_head
@ First part of the 8-point FFT: load z[0..7] from a1 and run the even
@ and odd half-transforms.  Executed with FPSCR in short-vector mode,
@ so instructions marked "vector op" act on 4 consecutive s-registers.
@ NOTE(review): several intervening lines are missing from this excerpt;
@ the commentary below covers only the visible instructions.
96 vldr d4, [a1, #0 * 2*4]
97 vldr d6, [a1, #1 * 2*4]
98 vldr d5, [a1, #2 * 2*4]
99 vldr d7, [a1, #3 * 2*4]
101 vldr d12, [a1, #4 * 2*4]
102 vadd.f s16, s8, s12 @ vector op
103 vldr d14, [a1, #5 * 2*4]
104 vldr d13, [a1, #6 * 2*4]
105 vldr d15, [a1, #7 * 2*4]
106 vsub.f s20, s8, s12 @ vector op
115 vsub.f s20, s24, s28 @ vector op
@ Bounce s0-s7 through the in-place buffer to reach the upper register
@ bank (the buffer slots are about to be overwritten anyway).
116 vstr d0, [a1, #0 * 2*4] @ transfer s0-s7 to s24-s31 via memory
117 vstr d1, [a1, #1 * 2*4]
119 vadd.f s16, s24, s28 @ vector op
120 vstr d2, [a1, #2 * 2*4]
121 vstr d3, [a1, #3 * 2*4]
122 vldr d12, [a1, #0 * 2*4]
124 vmul.f s20, s20, s0 @ vector x scalar op
125 vldr d13, [a1, #1 * 2*4]
126 vldr d14, [a1, #2 * 2*4]
127 vldr d15, [a1, #3 * 2*4]
137 vadd.f s8, s0, s24 @ vector op
@ Second register-bank transfer, again routed through memory.
138 vstr d0, [a1, #0 * 2*4] @ transfer s0-s3 to s12-s15 via memory
139 vstr d1, [a1, #1 * 2*4]
140 vldr d6, [a1, #0 * 2*4]
141 vldr d7, [a1, #1 * 2*4]
146 vsub.f s12, s24, s12 @ vector op
151 vadd.f s16, s0, s28 @ vector op
@ Store six of the eight outputs; macro_fft8_tail writes the remainder.
152 vstr d6, [a1, #4 * 2*4]
153 vstr d7, [a1, #6 * 2*4]
154 vstr d4, [a1, #0 * 2*4]
155 vstr d5, [a1, #2 * 2*4]
156 vstr d2, [a1, #5 * 2*4]
157 vstr d3, [a1, #7 * 2*4]
160 .macro macro_fft8_tail
@ Completes the 8-point FFT: store the remaining outputs (z[1] and z[3])
@ left in registers by macro_fft8_head.
@ NOTE(review): any further lines of this macro, including its .endm,
@ are not visible in this excerpt.
161 vstr d8, [a1, #1 * 2*4]
162 vstr d9, [a1, #3 * 2*4]
165 function .Lfft8_internal_vfp
@ Internal 8-point FFT obeying the modified AAPCS described at the top
@ of the file (vector-mode FPSCR already set, a2 holds the saved FPSCR).
@ NOTE(review): its body is not visible in this excerpt; the two lines
@ below appear to belong to the public wrapper, which programs FPSCR
@ into vector mode before calling the internal routine.
172 ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1
177 bl .Lfft8_internal_vfp
@ Single-precision twiddle-factor constants used by the 8/16-point
@ kernels (values are the nearest-representable floats).
184 cos1pi4: @ cos(1*pi/4) = sqrt(2)/2
185 .float 0.707106769084930419921875
186 cos1pi8: @ cos(1*pi/8) = sqrt(2+sqrt(2))/2
187 .float 0.92387950420379638671875
188 cos3pi8: @ cos(3*pi/8) = sqrt(2-sqrt(2))/2
189 .float 0.3826834261417388916015625
191 function .Lfft16_internal_vfp
@ 16-point FFT over in-place complex float data at a1.  Runs with FPSCR
@ in short-vector mode (length 4), so single vadd/vsub/vmul mnemonics
@ marked "vector op" operate on 4 consecutive s-registers at once.
@ NOTE(review): this excerpt is sampled — many intervening lines are
@ missing — so commentary is limited to the visible instructions.
@ Load the upper half z[8..11] and begin its butterflies.
194 vldr d10, [a1, #8 * 2*4]
195 vldr d12, [a1, #9 * 2*4]
196 vldr d11, [a1, #10 * 2*4]
197 vldr d13, [a1, #11 * 2*4]
199 vadd.f s16, s20, s24 @ vector op
201 vldr d4, [a1, #12 * 2*4]
202 vldr d6, [a1, #13 * 2*4]
203 vldr d5, [a1, #14 * 2*4]
204 vsub.f s20, s20, s24 @ vector op
205 vldr d7, [a1, #15 * 2*4]
214 vadd.f s16, s8, s12 @ vector op
@ Spill intermediate upper-half results back to the buffer.
215 vstr d0, [a1, #8 * 2*4]
216 vstr d2, [a1, #10 * 2*4]
217 vstr d1, [a1, #9 * 2*4]
219 vstr d3, [a1, #11 * 2*4]
220 @ TRANSFORM(z[2],z[6],z[10],z[14],cos1pi4,cos1pi4)
221 vldr d12, [a1, #10 * 2*4]
230 vstr d0, [a1, #12 * 2*4]
232 @ TRANSFORM(z[1],z[5],z[9],z[13],cos1pi8,cos3pi8)
233 vldr d6, [a1, #9 * 2*4]
234 vstr d1, [a1, #13 * 2*4]
@ Load two adjacent twiddles with one doubleword access.
235 vldr d1, cos1pi4 @ s2 = cos1pi4, s3 = cos1pi8
236 vstr d2, [a1, #15 * 2*4]
237 vldr d7, [a1, #13 * 2*4]
242 vmul.f s20, s12, s3 @ vector op
243 @ TRANSFORM(z[3],z[7],z[11],z[15],cos3pi8,cos1pi8)
244 vldr d4, [a1, #11 * 2*4]
245 vldr d5, [a1, #15 * 2*4]
247 vmul.f s24, s4, s2 @ vector * scalar op
248 vmul.f s28, s12, s1 @ vector * scalar op
249 vmul.f s12, s8, s1 @ vector * scalar op
254 vmul.f s8, s8, s3 @ vector * scalar op
@ Load odd-index elements of the lower half for the combine stage.
255 vldr d8, [a1, #1 * 2*4]
256 vldr d9, [a1, #5 * 2*4]
257 vldr d10, [a1, #3 * 2*4]
258 vldr d11, [a1, #7 * 2*4]
259 vldr d14, [a1, #2 * 2*4]
268 vadd.f s12, s0, s16 @ vector op
@ Transfer results between register banks via the in-place buffer.
269 vstr d0, [a1, #1 * 2*4]
270 vstr d1, [a1, #5 * 2*4]
271 vldr d4, [a1, #1 * 2*4]
272 vldr d5, [a1, #5 * 2*4]
277 vsub.f s8, s16, s8 @ vector op
278 vstr d6, [a1, #1 * 2*4]
279 vstr d7, [a1, #5 * 2*4]
280 vldr d15, [a1, #6 * 2*4]
285 vadd.f s20, s0, s20 @ vector op
286 vstr d4, [a1, #9 * 2*4]
287 @ TRANSFORM_ZERO(z[0],z[4],z[8],z[12])
288 vldr d6, [a1, #8 * 2*4]
289 vstr d5, [a1, #13 * 2*4]
290 vldr d7, [a1, #12 * 2*4]
291 vstr d2, [a1, #11 * 2*4]
292 vldr d8, [a1, #0 * 2*4]
293 vstr d3, [a1, #15 * 2*4]
294 vldr d9, [a1, #4 * 2*4]
303 vadd.f s8, s0, s28 @ vector op
304 vstr d0, [a1, #3 * 2*4]
305 vstr d1, [a1, #7 * 2*4]
306 vldr d6, [a1, #3 * 2*4]
307 vldr d7, [a1, #7 * 2*4]
312 vsub.f s12, s28, s12 @ vector op
313 vadd.f s16, s4, s16 @ vector op
@ Final writeback of the sixteen complex outputs.
314 vstr d10, [a1, #3 * 2*4]
315 vstr d11, [a1, #7 * 2*4]
316 vstr d4, [a1, #2 * 2*4]
317 vstr d5, [a1, #6 * 2*4]
318 vstr d0, [a1, #8 * 2*4]
319 vstr d1, [a1, #12 * 2*4]
320 vstr d6, [a1, #10 * 2*4]
321 vstr d7, [a1, #14 * 2*4]
322 vstr d8, [a1, #0 * 2*4]
323 vstr d9, [a1, #4 * 2*4]
328 function ff_fft16_vfp
@ Exported 16-point FFT wrapper: program FPSCR for RunFast short-vector
@ operation, run the internal routine, then (in lines not visible in
@ this excerpt) presumably restore the caller's FPSCR — TODO confirm.
329 ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1
334 bl .Lfft16_internal_vfp
340 .macro pass n, z0, z1, z2, z3
@ One radix-4 combination pass: merges four sub-transform outputs using
@ twiddle factors read (post-incremented) through v5.  \n is the
@ butterfly count; \z0..\z3 address the four data streams.  The o1/o2/o3
@ offsets are presumably defined by surrounding conditional assembly in
@ lines not visible in this excerpt — TODO confirm against full source.
@ Runs in VFP short-vector mode; ops marked "vector" act on 4 s-regs.
342 @ TRANSFORM_ZERO(z[0],z[o1],z[o2],z[o3])
343 @ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1])
344 @ TRANSFORM(z[0],z[o1],z[o2],z[o3],wre[0],wim[0])
345 @ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1])
@ Software-pipeline prologue: preload the first butterfly's inputs and
@ start the twiddle multiplies before entering the loop.
346 vldr d8, [\z2, #8*(o2+1)] @ s16,s17
348 vldr d9, [\z3, #8*(o3+1)] @ s18,s19
349 vldmia v5!, {s0,s1} @ s0 is unused
350 vldr s7, [\z2, #8*o2] @ t1
351 vmul.f s20, s16, s2 @ vector * scalar
352 vldr s0, [\z3, #8*o3] @ t5
353 vldr s6, [\z2, #8*o2+4] @ t2
354 vldr s3, [\z3, #8*o3+4] @ t6
355 vmul.f s16, s16, s1 @ vector * scalar
@ Loop head: advance the even stream by two complex elements per pass;
@ subsequent accesses use -8*2 offsets to compensate for the early add.
357 1: add \z0, \z0, #8*2
367 @ up to 2 stalls (VFP vector issuing / waiting for s0)
368 @ depending upon whether this is the first iteration and
369 @ how many add instructions are inserted above
370 vadd.f s4, s0, s7 @ t5
371 vadd.f s5, s6, s3 @ t6
372 vsub.f s6, s6, s3 @ t4
373 vsub.f s7, s0, s7 @ t3
374 vldr d6, [\z0, #8*0-8*2] @ s12,s13
375 vadd.f s0, s16, s21 @ t1
376 vldr d7, [\z1, #8*o1-8*2] @ s14,s15
377 vsub.f s1, s18, s23 @ t5
378 vadd.f s8, s4, s12 @ vector + vector
379 @ stall (VFP vector issuing)
380 @ stall (VFP vector issuing)
381 @ stall (VFP vector issuing)
386 vsub.f s2, s17, s20 @ t2
387 vadd.f s3, s19, s22 @ t6
388 vstr d4, [\z0, #8*0-8*2] @ s8,s9
389 vstr d5, [\z1, #8*o1-8*2] @ s10,s11
390 @ stall (waiting for s5)
391 vstr d2, [\z2, #8*o2-8*2] @ s4,s5
392 vadd.f s4, s1, s0 @ t5
393 vstr d3, [\z3, #8*o3-8*2] @ s6,s7
394 vsub.f s7, s1, s0 @ t3
395 vadd.f s5, s2, s3 @ t6
396 vsub.f s6, s2, s3 @ t4
397 vldr d6, [\z0, #8*1-8*2] @ s12,s13
398 vldr d7, [\z1, #8*(o1+1)-8*2] @ s14,s15
@ Start loading the next iteration's inputs while finishing this one.
399 vldr d4, [\z2, #8*o2] @ s8,s9
401 vldr d5, [\z3, #8*o3] @ s10,s11
402 vadd.f s20, s4, s12 @ vector + vector
404 vldr d8, [\z2, #8*(o2+1)] @ s16,s17
405 @ stall (VFP vector issuing)
410 vmul.f s12, s8, s3 @ vector * scalar
411 vstr d10, [\z0, #8*1-8*2] @ s20,s21
412 vldr d9, [\z3, #8*(o3+1)] @ s18,s19
413 vstr d11, [\z1, #8*(o1+1)-8*2] @ s22,s23
414 vmul.f s8, s8, s0 @ vector * scalar
415 vstr d2, [\z2, #8*(o2+1)-8*2] @ s4,s5
416 @ stall (waiting for s7)
417 vstr d3, [\z3, #8*(o3+1)-8*2] @ s6,s7
418 vmul.f s20, s16, s2 @ vector * scalar
419 @ stall (VFP vector issuing)
420 @ stall (VFP vector issuing)
421 @ stall (VFP vector issuing)
422 vadd.f s7, s8, s13 @ t1
423 vsub.f s6, s9, s12 @ t2
424 vsub.f s0, s10, s15 @ t5
425 vadd.f s3, s11, s14 @ t6
426 vmul.f s16, s16, s1 @ vector * scalar
@ (the backward loop branch lies in lines not visible in this excerpt)
429 @ What remains is identical to the first two indentations of
430 @ the above, but without the increment of z
@ Epilogue: drain the software pipeline for the final butterfly pair.
431 vadd.f s4, s0, s7 @ t5
432 vadd.f s5, s6, s3 @ t6
433 vsub.f s6, s6, s3 @ t4
434 vsub.f s7, s0, s7 @ t3
435 vldr d6, [\z0, #8*0] @ s12,s13
436 vadd.f s0, s16, s21 @ t1
437 vldr d7, [\z1, #8*o1] @ s14,s15
438 vsub.f s1, s18, s23 @ t5
439 vadd.f s8, s4, s12 @ vector + vector
444 vsub.f s2, s17, s20 @ t2
445 vadd.f s3, s19, s22 @ t6
446 vstr d4, [\z0, #8*0] @ s8,s9
447 vstr d5, [\z1, #8*o1] @ s10,s11
448 vstr d2, [\z2, #8*o2] @ s4,s5
449 vadd.f s4, s1, s0 @ t5
450 vstr d3, [\z3, #8*o3] @ s6,s7
451 vsub.f s7, s1, s0 @ t3
452 vadd.f s5, s2, s3 @ t6
453 vsub.f s6, s2, s3 @ t4
454 vldr d6, [\z0, #8*1] @ s12,s13
455 vldr d7, [\z1, #8*(o1+1)] @ s14,s15
456 vadd.f s20, s4, s12 @ vector + vector
461 vstr d10, [\z0, #8*1] @ s20,s21
462 vstr d11, [\z1, #8*(o1+1)] @ s22,s23
463 vstr d2, [\z2, #8*(o2+1)] @ s4,s5
464 vstr d3, [\z3, #8*(o3+1)] @ s6,s7
467 .macro def_fft n, n2, n4
@ Generate the size-\n transform: one fft\n2 on the first half plus two
@ fft\n4 on the remaining quarters, followed by radix-4 "pass" combine
@ stages driven by the ff_cos_\n twiddle table.
468 function .Lfft\n\()_internal_vfp
@ v1-v2, v5-v6 are callee-saved and used as data/twiddle pointers below.
472 push {v1-v2,v5-v6,lr}
@ NOTE(review): lines are missing here — v1 is presumably set to the
@ buffer pointer (a1) before the recursive calls; TODO confirm.
477 bl .Lfft\n2\()_internal_vfp
478 add a1, v1, #8*(\n/4)*2
479 bl .Lfft\n4\()_internal_vfp
480 movrelx v5, X(ff_cos_\n), a1
481 add a1, v1, #8*(\n/4)*3
482 bl .Lfft\n4\()_internal_vfp
@ Combine stage pointer setup: the four interleaved quarter streams.
487 add v2, v1, #8*2*(\n/4/2)
488 add v3, v1, #8*4*(\n/4/2)
489 add v4, v1, #8*6*(\n/4/2)
490 pass (\n/4/2), v1, v2, v3, v4
@ NOTE(review): the alternative "pass" invocations below are presumably
@ selected by conditional assembly in lines not visible here — confirm.
496 add v2, v1, #8*4*(\n/4/2)
497 pass (\n/4/2), v1, v1, v2, v2
503 pass (\n/4/2), v1, v1, v1, v1
@ Public wrapper: enter VFP short-vector mode, run the internal routine;
@ FPSCR restore code lies in lines not visible in this excerpt.
508 function fft\n\()_vfp
509 ldr a3, =0x03030000 /* RunFast mode, vector length 4, stride 1 */
514 bl .Lfft\n\()_internal_vfp
@ Instantiate the recursive transform sizes 512..65536; each size is
@ built from one half-size and two quarter-size transforms (see the
@ def_fft macro above).  Smaller sizes are presumably instantiated in
@ lines not visible in this excerpt.
527 def_fft 512, 256, 128
528 def_fft 1024, 512, 256
529 def_fft 2048, 1024, 512
530 def_fft 4096, 2048, 1024
531 def_fft 8192, 4096, 2048
532 def_fft 16384, 8192, 4096
533 def_fft 32768, 16384, 8192
534 def_fft 65536, 32768, 16384