Imported Debian version 2.4.3~trusty1
[deb_ffmpeg.git] / ffmpeg / libavcodec / arm / fft_vfp.S
CommitLineData
2ba45a60
DM
1/*
2 * Copyright (c) 2013 RISC OS Open Ltd
3 * Author: Ben Avison <bavison@riscosopen.org>
4 *
5 * This file is part of FFmpeg.
6 *
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
22#include "libavutil/arm/asm.S"
23
24@ The fftx_internal_vfp versions of the functions obey a modified AAPCS:
25@ VFP is in RunFast mode, vector length 4, stride 1 throughout, and
26@ all single-precision VFP registers may be corrupted on exit. The a2
27@ register may not be clobbered in these functions, as it holds the
28@ stored original FPSCR.
29
30function ff_fft_calc_vfp, export=1
31 ldr ip, [a1, #0] @ nbits
32 mov a1, a2
33A ldr pc, [pc, ip, lsl #2]
34A .word 0
35A .word 0
36A .word 0
37T movrel a2, (fft_tab_vfp - 8)
38T ldr pc, [a2, ip, lsl #2]
39T endfunc
40T const fft_tab_vfp
41 .word fft4_vfp
42 .word fft8_vfp
43 .word X(ff_fft16_vfp) @ this one alone is exported
44 .word fft32_vfp
45 .word fft64_vfp
46 .word fft128_vfp
47 .word fft256_vfp
48 .word fft512_vfp
49 .word fft1024_vfp
50 .word fft2048_vfp
51 .word fft4096_vfp
52 .word fft8192_vfp
53 .word fft16384_vfp
54 .word fft32768_vfp
55 .word fft65536_vfp
56A endfunc
57
58function fft4_vfp
59 vldr d0, [a1, #0*2*4] @ s0,s1 = z[0]
60 vldr d4, [a1, #1*2*4] @ s8,s9 = z[1]
61 vldr d1, [a1, #2*2*4] @ s2,s3 = z[2]
62 vldr d5, [a1, #3*2*4] @ s10,s11 = z[3]
63 @ stall
64 vadd.f s12, s0, s8 @ i0
65 vadd.f s13, s1, s9 @ i1
66 vadd.f s14, s2, s10 @ i2
67 vadd.f s15, s3, s11 @ i3
68 vsub.f s8, s0, s8 @ i4
69 vsub.f s9, s1, s9 @ i5
70 vsub.f s10, s2, s10 @ i6
71 vsub.f s11, s3, s11 @ i7
72 @ stall
73 @ stall
74 vadd.f s0, s12, s14 @ z[0].re
75 vsub.f s4, s12, s14 @ z[2].re
76 vadd.f s1, s13, s15 @ z[0].im
77 vsub.f s5, s13, s15 @ z[2].im
78 vadd.f s7, s9, s10 @ z[3].im
79 vsub.f s3, s9, s10 @ z[1].im
80 vadd.f s2, s8, s11 @ z[1].re
81 vsub.f s6, s8, s11 @ z[3].re
82 @ stall
83 @ stall
84 vstr d0, [a1, #0*2*4]
85 vstr d2, [a1, #2*2*4]
86 @ stall
87 @ stall
88 vstr d1, [a1, #1*2*4]
89 vstr d3, [a1, #3*2*4]
90
91 bx lr
92endfunc
93
94.macro macro_fft8_head
95 @ FFT4
96 vldr d4, [a1, #0 * 2*4]
97 vldr d6, [a1, #1 * 2*4]
98 vldr d5, [a1, #2 * 2*4]
99 vldr d7, [a1, #3 * 2*4]
100 @ BF
101 vldr d12, [a1, #4 * 2*4]
102 vadd.f s16, s8, s12 @ vector op
103 vldr d14, [a1, #5 * 2*4]
104 vldr d13, [a1, #6 * 2*4]
105 vldr d15, [a1, #7 * 2*4]
106 vsub.f s20, s8, s12 @ vector op
107 vadd.f s0, s16, s18
108 vsub.f s2, s16, s18
109 vadd.f s1, s17, s19
110 vsub.f s3, s17, s19
111 vadd.f s7, s21, s22
112 vsub.f s5, s21, s22
113 vadd.f s4, s20, s23
114 vsub.f s6, s20, s23
115 vsub.f s20, s24, s28 @ vector op
116 vstr d0, [a1, #0 * 2*4] @ transfer s0-s7 to s24-s31 via memory
117 vstr d1, [a1, #1 * 2*4]
118 vldr s0, cos1pi4
119 vadd.f s16, s24, s28 @ vector op
120 vstr d2, [a1, #2 * 2*4]
121 vstr d3, [a1, #3 * 2*4]
122 vldr d12, [a1, #0 * 2*4]
123 @ TRANSFORM
124 vmul.f s20, s20, s0 @ vector x scalar op
125 vldr d13, [a1, #1 * 2*4]
126 vldr d14, [a1, #2 * 2*4]
127 vldr d15, [a1, #3 * 2*4]
128 @ BUTTERFLIES
129 vadd.f s0, s18, s16
130 vadd.f s1, s17, s19
131 vsub.f s2, s17, s19
132 vsub.f s3, s18, s16
133 vadd.f s4, s21, s20
134 vsub.f s5, s21, s20
135 vadd.f s6, s22, s23
136 vsub.f s7, s22, s23
137 vadd.f s8, s0, s24 @ vector op
138 vstr d0, [a1, #0 * 2*4] @ transfer s0-s3 to s12-s15 via memory
139 vstr d1, [a1, #1 * 2*4]
140 vldr d6, [a1, #0 * 2*4]
141 vldr d7, [a1, #1 * 2*4]
142 vadd.f s1, s5, s6
143 vadd.f s0, s7, s4
144 vsub.f s2, s5, s6
145 vsub.f s3, s7, s4
146 vsub.f s12, s24, s12 @ vector op
147 vsub.f s5, s29, s1
148 vsub.f s4, s28, s0
149 vsub.f s6, s30, s2
150 vsub.f s7, s31, s3
151 vadd.f s16, s0, s28 @ vector op
152 vstr d6, [a1, #4 * 2*4]
153 vstr d7, [a1, #6 * 2*4]
154 vstr d4, [a1, #0 * 2*4]
155 vstr d5, [a1, #2 * 2*4]
156 vstr d2, [a1, #5 * 2*4]
157 vstr d3, [a1, #7 * 2*4]
158.endm
159
160.macro macro_fft8_tail
161 vstr d8, [a1, #1 * 2*4]
162 vstr d9, [a1, #3 * 2*4]
163.endm
164
165function .Lfft8_internal_vfp
166 macro_fft8_head
167 macro_fft8_tail
168 bx lr
169endfunc
170
171function fft8_vfp
172 ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1
173 fmrx a2, FPSCR
174 fmxr FPSCR, a3
175 vpush {s16-s31}
176 mov ip, lr
177 bl .Lfft8_internal_vfp
178 vpop {s16-s31}
179 fmxr FPSCR, a2
180 bx ip
181endfunc
182
183.align 3
184cos1pi4: @ cos(1*pi/4) = sqrt(2)
185 .float 0.707106769084930419921875
186cos1pi8: @ cos(1*pi/8) = sqrt(2+sqrt(2))/2
187 .float 0.92387950420379638671875
188cos3pi8: @ cos(2*pi/8) = sqrt(2-sqrt(2))/2
189 .float 0.3826834261417388916015625
190
191function .Lfft16_internal_vfp
192 macro_fft8_head
193 @ FFT4(z+8)
194 vldr d10, [a1, #8 * 2*4]
195 vldr d12, [a1, #9 * 2*4]
196 vldr d11, [a1, #10 * 2*4]
197 vldr d13, [a1, #11 * 2*4]
198 macro_fft8_tail
199 vadd.f s16, s20, s24 @ vector op
200 @ FFT4(z+12)
201 vldr d4, [a1, #12 * 2*4]
202 vldr d6, [a1, #13 * 2*4]
203 vldr d5, [a1, #14 * 2*4]
204 vsub.f s20, s20, s24 @ vector op
205 vldr d7, [a1, #15 * 2*4]
206 vadd.f s0, s16, s18
207 vsub.f s4, s16, s18
208 vadd.f s1, s17, s19
209 vsub.f s5, s17, s19
210 vadd.f s7, s21, s22
211 vsub.f s3, s21, s22
212 vadd.f s2, s20, s23
213 vsub.f s6, s20, s23
214 vadd.f s16, s8, s12 @ vector op
215 vstr d0, [a1, #8 * 2*4]
216 vstr d2, [a1, #10 * 2*4]
217 vstr d1, [a1, #9 * 2*4]
218 vsub.f s20, s8, s12
219 vstr d3, [a1, #11 * 2*4]
220 @ TRANSFORM(z[2],z[6],z[10],z[14],cos1pi4,cos1pi4)
221 vldr d12, [a1, #10 * 2*4]
222 vadd.f s0, s16, s18
223 vadd.f s1, s17, s19
224 vsub.f s6, s16, s18
225 vsub.f s7, s17, s19
226 vsub.f s3, s21, s22
227 vadd.f s2, s20, s23
228 vadd.f s5, s21, s22
229 vsub.f s4, s20, s23
230 vstr d0, [a1, #12 * 2*4]
231 vmov s0, s6
232 @ TRANSFORM(z[1],z[5],z[9],z[13],cos1pi8,cos3pi8)
233 vldr d6, [a1, #9 * 2*4]
234 vstr d1, [a1, #13 * 2*4]
235 vldr d1, cos1pi4 @ s2 = cos1pi4, s3 = cos1pi8
236 vstr d2, [a1, #15 * 2*4]
237 vldr d7, [a1, #13 * 2*4]
238 vadd.f s4, s25, s24
239 vsub.f s5, s25, s24
240 vsub.f s6, s0, s7
241 vadd.f s7, s0, s7
242 vmul.f s20, s12, s3 @ vector op
243 @ TRANSFORM(z[3],z[7],z[11],z[15],cos3pi8,cos1pi8)
244 vldr d4, [a1, #11 * 2*4]
245 vldr d5, [a1, #15 * 2*4]
246 vldr s1, cos3pi8
247 vmul.f s24, s4, s2 @ vector * scalar op
248 vmul.f s28, s12, s1 @ vector * scalar op
249 vmul.f s12, s8, s1 @ vector * scalar op
250 vadd.f s4, s20, s29
251 vsub.f s5, s21, s28
252 vsub.f s6, s22, s31
253 vadd.f s7, s23, s30
254 vmul.f s8, s8, s3 @ vector * scalar op
255 vldr d8, [a1, #1 * 2*4]
256 vldr d9, [a1, #5 * 2*4]
257 vldr d10, [a1, #3 * 2*4]
258 vldr d11, [a1, #7 * 2*4]
259 vldr d14, [a1, #2 * 2*4]
260 vadd.f s0, s6, s4
261 vadd.f s1, s5, s7
262 vsub.f s2, s5, s7
263 vsub.f s3, s6, s4
264 vadd.f s4, s12, s9
265 vsub.f s5, s13, s8
266 vsub.f s6, s14, s11
267 vadd.f s7, s15, s10
268 vadd.f s12, s0, s16 @ vector op
269 vstr d0, [a1, #1 * 2*4]
270 vstr d1, [a1, #5 * 2*4]
271 vldr d4, [a1, #1 * 2*4]
272 vldr d5, [a1, #5 * 2*4]
273 vadd.f s0, s6, s4
274 vadd.f s1, s5, s7
275 vsub.f s2, s5, s7
276 vsub.f s3, s6, s4
277 vsub.f s8, s16, s8 @ vector op
278 vstr d6, [a1, #1 * 2*4]
279 vstr d7, [a1, #5 * 2*4]
280 vldr d15, [a1, #6 * 2*4]
281 vsub.f s4, s20, s0
282 vsub.f s5, s21, s1
283 vsub.f s6, s22, s2
284 vsub.f s7, s23, s3
285 vadd.f s20, s0, s20 @ vector op
286 vstr d4, [a1, #9 * 2*4]
287 @ TRANSFORM_ZERO(z[0],z[4],z[8],z[12])
288 vldr d6, [a1, #8 * 2*4]
289 vstr d5, [a1, #13 * 2*4]
290 vldr d7, [a1, #12 * 2*4]
291 vstr d2, [a1, #11 * 2*4]
292 vldr d8, [a1, #0 * 2*4]
293 vstr d3, [a1, #15 * 2*4]
294 vldr d9, [a1, #4 * 2*4]
295 vadd.f s0, s26, s24
296 vadd.f s1, s25, s27
297 vsub.f s2, s25, s27
298 vsub.f s3, s26, s24
299 vadd.f s4, s14, s12
300 vadd.f s5, s13, s15
301 vsub.f s6, s13, s15
302 vsub.f s7, s14, s12
303 vadd.f s8, s0, s28 @ vector op
304 vstr d0, [a1, #3 * 2*4]
305 vstr d1, [a1, #7 * 2*4]
306 vldr d6, [a1, #3 * 2*4]
307 vldr d7, [a1, #7 * 2*4]
308 vsub.f s0, s16, s4
309 vsub.f s1, s17, s5
310 vsub.f s2, s18, s6
311 vsub.f s3, s19, s7
312 vsub.f s12, s28, s12 @ vector op
313 vadd.f s16, s4, s16 @ vector op
314 vstr d10, [a1, #3 * 2*4]
315 vstr d11, [a1, #7 * 2*4]
316 vstr d4, [a1, #2 * 2*4]
317 vstr d5, [a1, #6 * 2*4]
318 vstr d0, [a1, #8 * 2*4]
319 vstr d1, [a1, #12 * 2*4]
320 vstr d6, [a1, #10 * 2*4]
321 vstr d7, [a1, #14 * 2*4]
322 vstr d8, [a1, #0 * 2*4]
323 vstr d9, [a1, #4 * 2*4]
324
325 bx lr
326endfunc
327
328function ff_fft16_vfp, export=1
329 ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1
330 fmrx a2, FPSCR
331 fmxr FPSCR, a3
332 vpush {s16-s31}
333 mov ip, lr
334 bl .Lfft16_internal_vfp
335 vpop {s16-s31}
336 fmxr FPSCR, a2
337 bx ip
338endfunc
339
340.macro pass n, z0, z1, z2, z3
341 add v6, v5, #4*2*\n
342 @ TRANSFORM_ZERO(z[0],z[o1],z[o2],z[o3])
343 @ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1])
344 @ TRANSFORM(z[0],z[o1],z[o2],z[o3],wre[0],wim[0])
345 @ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1])
346 vldr d8, [\z2, #8*(o2+1)] @ s16,s17
347 vldmdb v6!, {s2}
348 vldr d9, [\z3, #8*(o3+1)] @ s18,s19
349 vldmia v5!, {s0,s1} @ s0 is unused
350 vldr s7, [\z2, #8*o2] @ t1
351 vmul.f s20, s16, s2 @ vector * scalar
352 vldr s0, [\z3, #8*o3] @ t5
353 vldr s6, [\z2, #8*o2+4] @ t2
354 vldr s3, [\z3, #8*o3+4] @ t6
355 vmul.f s16, s16, s1 @ vector * scalar
356 ldr a4, =\n-1
3571: add \z0, \z0, #8*2
358 .if \n*4*2 >= 512
359 add \z1, \z1, #8*2
360 .endif
361 .if \n*4*2 >= 256
362 add \z2, \z2, #8*2
363 .endif
364 .if \n*4*2 >= 512
365 add \z3, \z3, #8*2
366 .endif
367 @ up to 2 stalls (VFP vector issuing / waiting for s0)
368 @ depending upon whether this is the first iteration and
369 @ how many add instructions are inserted above
370 vadd.f s4, s0, s7 @ t5
371 vadd.f s5, s6, s3 @ t6
372 vsub.f s6, s6, s3 @ t4
373 vsub.f s7, s0, s7 @ t3
374 vldr d6, [\z0, #8*0-8*2] @ s12,s13
375 vadd.f s0, s16, s21 @ t1
376 vldr d7, [\z1, #8*o1-8*2] @ s14,s15
377 vsub.f s1, s18, s23 @ t5
378 vadd.f s8, s4, s12 @ vector + vector
379 @ stall (VFP vector issuing)
380 @ stall (VFP vector issuing)
381 @ stall (VFP vector issuing)
382 vsub.f s4, s12, s4
383 vsub.f s5, s13, s5
384 vsub.f s6, s14, s6
385 vsub.f s7, s15, s7
386 vsub.f s2, s17, s20 @ t2
387 vadd.f s3, s19, s22 @ t6
388 vstr d4, [\z0, #8*0-8*2] @ s8,s9
389 vstr d5, [\z1, #8*o1-8*2] @ s10,s11
390 @ stall (waiting for s5)
391 vstr d2, [\z2, #8*o2-8*2] @ s4,s5
392 vadd.f s4, s1, s0 @ t5
393 vstr d3, [\z3, #8*o3-8*2] @ s6,s7
394 vsub.f s7, s1, s0 @ t3
395 vadd.f s5, s2, s3 @ t6
396 vsub.f s6, s2, s3 @ t4
397 vldr d6, [\z0, #8*1-8*2] @ s12,s13
398 vldr d7, [\z1, #8*(o1+1)-8*2] @ s14,s15
399 vldr d4, [\z2, #8*o2] @ s8,s9
400 vldmdb v6!, {s2,s3}
401 vldr d5, [\z3, #8*o3] @ s10,s11
402 vadd.f s20, s4, s12 @ vector + vector
403 vldmia v5!, {s0,s1}
404 vldr d8, [\z2, #8*(o2+1)] @ s16,s17
405 @ stall (VFP vector issuing)
406 vsub.f s4, s12, s4
407 vsub.f s5, s13, s5
408 vsub.f s6, s14, s6
409 vsub.f s7, s15, s7
410 vmul.f s12, s8, s3 @ vector * scalar
411 vstr d10, [\z0, #8*1-8*2] @ s20,s21
412 vldr d9, [\z3, #8*(o3+1)] @ s18,s19
413 vstr d11, [\z1, #8*(o1+1)-8*2] @ s22,s23
414 vmul.f s8, s8, s0 @ vector * scalar
415 vstr d2, [\z2, #8*(o2+1)-8*2] @ s4,s5
416 @ stall (waiting for s7)
417 vstr d3, [\z3, #8*(o3+1)-8*2] @ s6,s7
418 vmul.f s20, s16, s2 @ vector * scalar
419 @ stall (VFP vector issuing)
420 @ stall (VFP vector issuing)
421 @ stall (VFP vector issuing)
422 vadd.f s7, s8, s13 @ t1
423 vsub.f s6, s9, s12 @ t2
424 vsub.f s0, s10, s15 @ t5
425 vadd.f s3, s11, s14 @ t6
426 vmul.f s16, s16, s1 @ vector * scalar
427 subs a4, a4, #1
428 bne 1b
429 @ What remains is identical to the first two indentations of
430 @ the above, but without the increment of z
431 vadd.f s4, s0, s7 @ t5
432 vadd.f s5, s6, s3 @ t6
433 vsub.f s6, s6, s3 @ t4
434 vsub.f s7, s0, s7 @ t3
435 vldr d6, [\z0, #8*0] @ s12,s13
436 vadd.f s0, s16, s21 @ t1
437 vldr d7, [\z1, #8*o1] @ s14,s15
438 vsub.f s1, s18, s23 @ t5
439 vadd.f s8, s4, s12 @ vector + vector
440 vsub.f s4, s12, s4
441 vsub.f s5, s13, s5
442 vsub.f s6, s14, s6
443 vsub.f s7, s15, s7
444 vsub.f s2, s17, s20 @ t2
445 vadd.f s3, s19, s22 @ t6
446 vstr d4, [\z0, #8*0] @ s8,s9
447 vstr d5, [\z1, #8*o1] @ s10,s11
448 vstr d2, [\z2, #8*o2] @ s4,s5
449 vadd.f s4, s1, s0 @ t5
450 vstr d3, [\z3, #8*o3] @ s6,s7
451 vsub.f s7, s1, s0 @ t3
452 vadd.f s5, s2, s3 @ t6
453 vsub.f s6, s2, s3 @ t4
454 vldr d6, [\z0, #8*1] @ s12,s13
455 vldr d7, [\z1, #8*(o1+1)] @ s14,s15
456 vadd.f s20, s4, s12 @ vector + vector
457 vsub.f s4, s12, s4
458 vsub.f s5, s13, s5
459 vsub.f s6, s14, s6
460 vsub.f s7, s15, s7
461 vstr d10, [\z0, #8*1] @ s20,s21
462 vstr d11, [\z1, #8*(o1+1)] @ s22,s23
463 vstr d2, [\z2, #8*(o2+1)] @ s4,s5
464 vstr d3, [\z3, #8*(o3+1)] @ s6,s7
465.endm
466
467.macro def_fft n, n2, n4
468function .Lfft\n\()_internal_vfp
469 .if \n >= 512
470 push {v1-v6,lr}
471 .elseif \n >= 256
472 push {v1-v2,v5-v6,lr}
473 .else
474 push {v1,v5-v6,lr}
475 .endif
476 mov v1, a1
477 bl .Lfft\n2\()_internal_vfp
478 add a1, v1, #8*(\n/4)*2
479 bl .Lfft\n4\()_internal_vfp
480 movrelx v5, X(ff_cos_\n), a1
481 add a1, v1, #8*(\n/4)*3
482 bl .Lfft\n4\()_internal_vfp
483 .if \n >= 512
484 .set o1, 0*(\n/4/2)
485 .set o2, 0*(\n/4/2)
486 .set o3, 0*(\n/4/2)
487 add v2, v1, #8*2*(\n/4/2)
488 add v3, v1, #8*4*(\n/4/2)
489 add v4, v1, #8*6*(\n/4/2)
490 pass (\n/4/2), v1, v2, v3, v4
491 pop {v1-v6,pc}
492 .elseif \n >= 256
493 .set o1, 2*(\n/4/2)
494 .set o2, 0*(\n/4/2)
495 .set o3, 2*(\n/4/2)
496 add v2, v1, #8*4*(\n/4/2)
497 pass (\n/4/2), v1, v1, v2, v2
498 pop {v1-v2,v5-v6,pc}
499 .else
500 .set o1, 2*(\n/4/2)
501 .set o2, 4*(\n/4/2)
502 .set o3, 6*(\n/4/2)
503 pass (\n/4/2), v1, v1, v1, v1
504 pop {v1,v5-v6,pc}
505 .endif
506endfunc
507
508function fft\n\()_vfp
509 ldr a3, =0x03030000 /* RunFast mode, vector length 4, stride 1 */
510 fmrx a2, FPSCR
511 fmxr FPSCR, a3
512 vpush {s16-s31}
513 mov ip, lr
514 bl .Lfft\n\()_internal_vfp
515 vpop {s16-s31}
516 fmxr FPSCR, a2
517 bx ip
518endfunc
519
520.ltorg
521.endm
522
523 def_fft 32, 16, 8
524 def_fft 64, 32, 16
525 def_fft 128, 64, 32
526 def_fft 256, 128, 64
527 def_fft 512, 256, 128
528 def_fft 1024, 512, 256
529 def_fft 2048, 1024, 512
530 def_fft 4096, 2048, 1024
531 def_fft 8192, 4096, 2048
532 def_fft 16384, 8192, 4096
533 def_fft 32768, 16384, 8192
534 def_fft 65536, 32768, 16384