2 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 #include "libavutil/arm/asm.S"
@ bflies: butterfly step built from halving adds/subtracts and lane shuffles.
@   d0, d1 - data registers, updated in place.  On exit
@            d0 = (d0 + t) >> 1 and d1 = (d0 - t) >> 1, where t is the vector
@            assembled in r0 from the incoming d1 (see the lane comments below).
@   r0, r1 - scratch registers, overwritten.
@ Callers in this file (transform01, transform2) pass Q registers for all four.
23 .macro bflies d0, d1, r0, r1
24 vrev64.32 \r0, \d1 @ t5, t6, t1, t2
25 vhsub.s16 \r1, \d1, \r0 @ t1-t5, t2-t6, t5-t1, t6-t2
26 vhadd.s16 \r0, \d1, \r0 @ t1+t5, t2+t6, t5+t1, t6+t2
27 vext.16 \r1, \r1, \r1, #1 @ t2-t6, t5-t1, t6-t2, t1-t5
28 vtrn.32 \r0, \r1 @ t1+t5, t2+t6, t2-t6, t5-t1
30 vhsub.s16 \d1, \d0, \r0 @ d1 = (d0 - r0) >> 1, taken before d0 is overwritten
31 vhadd.s16 \d0, \d0, \r0 @ d0 = (d0 + r0) >> 1
@ transform01: multiply-accumulate d3 against two coefficient vectors, narrow
@ the result back to 16 bits, then run bflies on q0/q1.
@   q0, q1 - data registers forwarded to bflies as its d0/d1 operands.
@   d3     - input vector, replaced by the narrowed product.  In every call
@            visible in this file d3 is the upper half of the q1 argument.
@   c0, c1 - 16-bit coefficient vectors.
@   r0     - second multiplicand, paired with c1.
@            NOTE(review): nothing in the lines shown here writes r0 before it
@            is read by vmlal - confirm where it is prepared.
@   w0, w1 - wide scratch; w0 holds the 32-bit accumulator, then both are
@            reused as bflies scratch.
34 .macro transform01 q0, q1, d3, c0, c1, r0, w0, w1
36 vmull.s16 \w0, \d3, \c0 @ w0 = d3 * c0, widened to 32 bits per lane
37 vmlal.s16 \w0, \r0, \c1 @ w0 += r0 * c1
38 vshrn.s32 \d3, \w0, #15 @ d3 = (w0 >> 15) narrowed to 16 bits per lane
39 bflies \q0, \q1, \w0, \w1
@ transform2: the same multiply-accumulate-and-narrow step as transform01,
@ applied to two vectors (d1 with c0/c1 and d3 with c2/c3), followed by one
@ bflies on q0/q1.
@   d1, d3       - inputs, each replaced by its narrowed 16-bit product.
@   d0, d2       - not referenced by the visible body.
@   q0, q1       - data registers forwarded to bflies.
@   c0, c1, c2, c3 - 16-bit coefficient vectors.
@ The parameter list continues past the trailing backslash; the body also uses
@ r0, r1, w0 and w1, which are not among the names on the visible line.
@ NOTE(review): r0 and r1 are read but not written in the lines shown here.
42 .macro transform2 d0, d1, d2, d3, q0, q1, c0, c1, c2, c3, \
46 vmull.s16 \w0, \d1, \c0
47 vmlal.s16 \w0, \r0, \c1 @ w0 = d1 * c0 + r0 * c1 (32-bit lanes)
48 vmull.s16 \w1, \d3, \c2
49 vmlal.s16 \w1, \r1, \c3 @ w1 = d3 * c2 + r1 * c3 (32-bit lanes)
50 vshrn.s32 \d1, \w0, #15 @ d1 = (w0 >> 15) narrowed to 16 bits per lane
51 vshrn.s32 \d3, \w1, #15 @ d3 = (w1 >> 15) narrowed to 16 bits per lane
52 bflies \q0, \q1, \w0, \w1
@ fft4: combines d0 and d1 with halving adds/subtracts and lane shuffles.
@   d0, d1 - data registers, updated in place; the output lane order is given
@            by the comments on the final vhsub/vhadd.
@   r0, r1 - scratch registers, overwritten.
@ NOTE(review): in the lines shown here the mask written to d1 is never read
@ before the final vhsub overwrites d1, and the negated difference in r1 is
@ overwritten by vrev64.16 without being read.  A merge step that consumes
@ both appears to be absent from this excerpt - confirm against the full file.
55 .macro fft4 d0, d1, r0, r1
56 vhsub.s16 \r0, \d0, \d1 @ t3, t4, t8, t7
57 vhsub.s16 \r1, \d1, \d0 @ same difference with the opposite sign
58 vhadd.s16 \d0, \d0, \d1 @ t1, t2, t6, t5
59 vmov.i64 \d1, #0xffff00000000 @ mask: only bits 32-47 (16-bit lane 2) set
61 vrev64.16 \r1, \r0 @ t7, t8, t4, t3
62 vtrn.32 \r0, \r1 @ t3, t4, t7, t8
63 vtrn.32 \d0, \r0 @ t1, t2, t3, t4, t6, t5, t8, t7
64 vhsub.s16 \d1, \d0, \r0 @ r2, i2, r3, i1
65 vhadd.s16 \d0, \d0, \r0 @ r0, i0, r1, i3
@ fft8: runs fft4 on d0/d1, forms the halved sum and difference of d2/d3, and
@ finishes with transform01 on q0/q1.
@   d0-d3  - data registers, updated in place.
@   q0, q1 - Q-register operands forwarded to transform01; the calls visible
@            in this file pass q0/q1 together with d0-d3.
@   c0, c1 - coefficient vectors forwarded to transform01.
@   r0, r1 - scratch D registers; r0 also carries the d2/d3 halved sum into
@            transform01 as its second multiplicand.
@   w0, w1 - wide scratch forwarded to transform01.
68 .macro fft8 d0, d1, d2, d3, q0, q1, c0, c1, r0, r1, w0, w1
69 fft4 \d0, \d1, \r0, \r1
70 vtrn.32 \d0, \d1 @ z0, z2, z1, z3
71 vhadd.s16 \r0, \d2, \d3 @ t1, t2, t3, t4
72 vhsub.s16 \d3, \d2, \d3 @ z5, z7
74 transform01 \q0, \q1, \d3, \c0, \c1, \r0, \w0, \w1
@ Straight-line sequence: load 32 bytes from [r0], run one fft8 on them and
@ store d0-d3 back to the same address.  The enclosing function label is not
@ visible in this excerpt.
85 vld1.16 {d0-d3}, [r0,:128] @ d0-d3 = 32 bytes at r0, 128-bit aligned access
87 vld1.16 {d30}, [r1,:64] @ d30 = 8 bytes at r1; r1 is set outside the lines shown
@ fft8 with c0 = d31, c1 = d30, scratch d20/d21 and q8/q9.
@ NOTE(review): d31 is not written by any line shown here - confirm its source.
89 fft8 d0, d1, d2, d3, q0, q1, d31, d30, d20, d21, q8, q9
92 vst1.16 {d0-d3}, [r0,:128] @ write the 32 bytes back in place
@ Straight-line sequence working on 64 bytes at [r0], held in d0-d7 (q0-q3):
@ fft8 on the first 32 bytes, 32-bit transposes of q0/q1 and q2/q3, then
@ transform01 and transform2 across the two halves, and two 32-byte stores.
@ The enclosing function label is not visible in this excerpt.
@ NOTE(review): the first load advances r0 by 32 bytes and no visible line
@ moves it back before the stores - confirm r0 is rewound in the full file.
97 vld1.16 {d0-d3}, [r0,:128]! @ first 32 bytes; writeback advances r0 past them
98 vld1.16 {d4-d7}, [r0,:128] @ next 32 bytes
101 vld1.16 {d28-d31},[r1,:128] @ 32 bytes at r1 into d28-d31; r1 is set outside the lines shown
@ fft8 with c0 = d31, c1 = d28, scratch d20/d21 and q8/q9.
103 fft8 d0, d1, d2, d3, q0, q1, d31, d28, d20, d21, q8, q9
107 vtrn.32 q0, q1 @ z0, z4, z2, z6, z1, z5, z3, z7
108 vtrn.32 q2, q3 @ z8, z12,z10,z14,z9, z13,z11,z15
@ transform01 scales d5 (upper half of q2) with c0 = d31, c1 = d28.
111 transform01 q0, q2, d5, d31, d28, d20, q8, q9
@ transform2 on q1/q3 with coefficients d26, d30, d27, d29; its argument list
@ continues past the trailing backslash.
114 transform2 d2, d6, d3, d7, q1, q3, d26, d30, d27, d29, \
118 vst1.16 {d0-d3}, [r0,:128]!
119 vst1.16 {d4-d7}, [r0,:128]
@ fft_pass_neon: processes four Q registers of data at a time from [r4] using
@ the transform01 and transform2 macros, then stores them back.
@ Only part of the body is visible here: the setup of r4, r12, d24-d26 and
@ d31, any loop control, and the return are not shown.
@ Registers as used by the visible lines:
@   r1  - pointer read forwards 8 bytes at a time into d28
@   r2  - shifted left by 2 and added to r1 to form r3
@   r3  - pointer read into d27 (one halfword, replicated) and d29; advanced
@         by lr after each d29 load
@   r4  - data pointer, advanced by r12 after each Q-register load or store
@   lr  - holds the address coefs + 24 for the d30 load and is later used as
@         the post-index step for r3
@         NOTE(review): presumably reassigned in between by a line not shown
123 function fft_pass_neon
125 movrel lr, coefs + 24 @ address 24 bytes past the start of coefs
126 vld1.16 {d30}, [lr,:64] @ d30 = four halfwords from that address
129 add r3, r1, r2, lsl #2 @ r3 = r1 + r2 * 4
133 vld1.16 {d27[]}, [r3,:16] @ one halfword at r3 replicated to all lanes of d27
135 vld1.16 {q0}, [r4,:128], r12 @ q0-q3: four 16-byte loads, r4 += r12 after each
136 vld1.16 {q1}, [r4,:128], r12
137 vld1.16 {q2}, [r4,:128], r12
138 vld1.16 {q3}, [r4,:128], r12
139 vld1.16 {d28}, [r1,:64]! @ d28 = 8 bytes at r1, r1 advances by 8
140 vld1.16 {d29}, [r3,:64], lr @ d29 = 8 bytes at r3, r3 += lr
146 vmul.s16 d27, d27, d31 @ d27 *= d31 per 16-bit lane
@ transform01 scales d5 (upper half of q2) with c0 = d25, c1 = d27.
147 transform01 q0, q2, d5, d25, d27, d20, q8, q9
152 vld1.16 {q0}, [r4,:128], r12 @ same load pattern as above for the next q0-q3
153 vld1.16 {q1}, [r4,:128], r12
154 vld1.16 {q2}, [r4,:128], r12
155 vld1.16 {q3}, [r4,:128], r12
156 vld1.16 {d28}, [r1,:64]!
157 vld1.16 {d29}, [r3,:64], lr
@ q13 (d26/d27) is multiplied lane-wise by q15 (d30/d31) before each of the
@ two transform2 calls below.  The first call works on q0/q2, the second on
@ q1/q3, both with coefficients d24, d26, d25, d27; each argument list
@ continues past its trailing backslash.  The four stores at the end write
@ q0-q3 to [r4], advancing r4 by r12 after each.
165 vmul.s16 q13, q13, q15
166 transform2 d0, d4, d1, d5, q0, q2, d24, d26, d25, d27, \
175 vmul.s16 q13, q13, q15
176 transform2 d2, d6, d3, d7, q1, q3, d24, d26, d25, d27, \
185 vst1.16 {q0}, [r4,:128], r12
186 vst1.16 {q1}, [r4,:128], r12
187 vst1.16 {q2}, [r4,:128], r12
188 vst1.16 {q3}, [r4,:128], r12
@ Coefficients in Q15 fixed point (value * 2^15, rounded to nearest):
@   F_SQRT1_2  = sqrt(1/2)    ~ 0.70711
@   F_COS_16_1 = cos(2*pi/16) ~ 0.92388
@   F_COS_16_3 = cos(6*pi/16) ~ 0.38268
195 #define F_SQRT1_2 23170
196 #define F_COS_16_1 30274
197 #define F_COS_16_3 12540
@ Each row holds one constant as four halfwords with the sign pattern
@ +, -, -, + (8 bytes per row, 24 bytes for the three rows shown).
@ NOTE(review): the label of this table is not visible in this excerpt;
@ presumably it is the coefs symbol referenced elsewhere in the file.  A
@ reference to coefs + 24 would point just past these three rows, so a
@ further row is presumably present in the full file - confirm.
200 .short F_SQRT1_2, -F_SQRT1_2, -F_SQRT1_2, F_SQRT1_2
201 .short F_COS_16_1,-F_COS_16_1,-F_COS_16_1, F_COS_16_1
202 .short F_COS_16_3,-F_COS_16_3,-F_COS_16_3, F_COS_16_3
@ def_fft n, n2, n4: emits the function fft<n>_neon.
@ Of its body only the load of the address of ff_cos_<n>_fixed into r1 is
@ visible here; n2 and n4 are not referenced by the visible lines.
@ The empty-parentheses escape after the parameter name ends the name so
@ that the literal suffix can follow it directly.
206 .macro def_fft n, n2, n4
207 function fft\n\()_neon
217 movrelx r1, X(ff_cos_\n\()_fixed)
@ Instantiate fft<n>_neon for each listed size n, from 512 up to 65536.
@ The second and third arguments are n/2 and n/4.
227 def_fft 512, 256, 128
228 def_fft 1024, 512, 256
229 def_fft 2048, 1024, 512
230 def_fft 4096, 2048, 1024
231 def_fft 8192, 4096, 2048
232 def_fft 16384, 8192, 4096
233 def_fft 32768, 16384, 8192
234 def_fft 65536, 32768, 16384
@ ff_fft_fixed_calc_neon (exported symbol): looks up one entry of
@ fft_fixed_tab_neon.  How r2 is derived and what is done with the loaded
@ word are not visible in this excerpt.
236 function ff_fft_fixed_calc_neon, export=1
239 movrel r3, fft_fixed_tab_neon @ r3 = address of the table
240 ldr r3, [r3, r2, lsl #2] @ r3 = 32-bit word at table + r2 * 4
245 const fft_fixed_tab_neon