Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | /* |
2 | * ARM NEON optimised FFT | |
3 | * | |
4 | * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> | |
5 | * Copyright (c) 2009 Naotoshi Nojiri | |
6 | * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net> | |
7 | * | |
8 | * This algorithm (though not any of the implementation details) is | |
9 | * based on libdjbfft by D. J. Bernstein. | |
10 | * | |
11 | * This file is part of FFmpeg. | |
12 | * | |
13 | * FFmpeg is free software; you can redistribute it and/or | |
14 | * modify it under the terms of the GNU Lesser General Public | |
15 | * License as published by the Free Software Foundation; either | |
16 | * version 2.1 of the License, or (at your option) any later version. | |
17 | * | |
18 | * FFmpeg is distributed in the hope that it will be useful, | |
19 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
20 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
21 | * Lesser General Public License for more details. | |
22 | * | |
23 | * You should have received a copy of the GNU Lesser General Public | |
24 | * License along with FFmpeg; if not, write to the Free Software | |
25 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
26 | */ | |
27 | ||
28 | #include "libavutil/aarch64/asm.S" | |
29 | ||
/* 1/sqrt(2). Substituted by the C preprocessor into the mppm constant below. */
#define M_SQRT1_2       0.70710678118654752440
31 | ||
/*
 * transpose d0, d1, s0, s1
 *
 * d0 = trn1(s0, s1): even-numbered elements of s0 and s1, interleaved.
 * d1 = trn2(s0, s1): odd-numbered elements of s0 and s1, interleaved.
 *
 * Each operand carries its own arrangement specifier; every use in this
 * file passes .2d operands. d0 is written before the sources are read for
 * d1, so d0 must not be the same register as s0 or s1.
 */
.macro  transpose d0, d1, s0, s1
        trn1            \d0, \s0, \s1
        trn2            \d1, \s0, \s1
.endm
36 | ||
37 | ||
/*
 * fft4_neon: 4-point transform, computed in place.
 *
 *   x0  pointer to four complex values stored as {re, im} float pairs
 *       (32 bytes); not modified.
 *
 * Clobbers v0-v7, v16 and v17. Lane comments list lane 0 first.
 */
function fft4_neon
        ld1             {v0.2s,v1.2s,v2.2s,v3.2s}, [x0]

        // Lane-rotated copies of z[2] and z[3] for the cross term.
        ext             v16.8b, v2.8b,  v3.8b,  #4      // {i2, r3}
        ext             v17.8b, v3.8b,  v2.8b,  #4      // {i3, r2}

        // First stage: sums and differences of neighbouring inputs.
        fadd            v4.2s,  v0.2s,  v1.2s           // {r0+r1, i0+i1}
        fadd            v5.2s,  v2.2s,  v3.2s           // {r2+r3, i2+i3}
        fsub            v6.2s,  v0.2s,  v1.2s           // {r0-r1, i0-i1}
        fsub            v7.2s,  v16.2s, v17.2s          // {i2-i3, r3-r2}

        // Second stage: combine into the four outputs.
        fadd            v0.2s,  v4.2s,  v5.2s
        fadd            v1.2s,  v6.2s,  v7.2s
        fsub            v2.2s,  v4.2s,  v5.2s
        fsub            v3.2s,  v6.2s,  v7.2s

        st1             {v0.2s,v1.2s,v2.2s,v3.2s}, [x0]

        ret
endfunc
59 | ||
/*
 * fft8_neon: 8-point transform, computed in place.
 *
 *   x0  in:  pointer to eight complex values stored as {re, im} float pairs
 *       out: advanced by 32 bytes (points at element 4)
 *   x1  clobbered; holds the original x0 for the final store
 *   v28 in:  the mppm vector {-w, +w, +w, -w} with w = M_SQRT1_2, as loaded
 *            by ff_fft_calc_neon
 *
 * Clobbers v0-v7 and v16-v27.
 *
 * The instruction sequence is interleaved by hand; it is reproduced here in
 * exactly the same order, only the layout and the comments are new.
 */
function fft8_neon
        mov             x1,  x0
        ld1             {v0.2s, v1.2s, v2.2s, v3.2s},  [x0], #32   // z[0..3]
        ld1             {v16.2s,v17.2s,v18.2s,v19.2s}, [x0]        // z[4..7]
        ext             v22.8b, v2.8b,  v3.8b,  #4      // {i2, r3}
        ext             v23.8b, v3.8b,  v2.8b,  #4      // {i3, r2}
        fadd            v4.2s,  v16.2s, v17.2s          // r4+r5,i4+i5
        fadd            v5.2s,  v18.2s, v19.2s          // r6+r7,i6+i7
        fsub            v17.2s, v16.2s, v17.2s          // r4-r5,i4-i5
        fsub            v19.2s, v18.2s, v19.2s          // r6-r7,i6-i7
        rev64           v27.2s, v28.2s                  // v27 = {+w, -w}: low pair of v28 swapped
        fadd            v20.2s, v0.2s,  v1.2s           // r0+r1,i0+i1
        fadd            v21.2s, v2.2s,  v3.2s           // r2+r3,i2+i3
        fmul            v26.2s, v17.2s, v28.2s          // -a2r*w,a2i*w
        ext             v6.8b,  v4.8b,  v5.8b,  #4
        ext             v7.8b,  v5.8b,  v4.8b,  #4
        fmul            v27.2s, v19.2s, v27.2s          // a3r*w,-a3i*w
        fsub            v23.2s, v22.2s, v23.2s          // i2-i3,r3-r2
        fsub            v22.2s, v0.2s,  v1.2s           // r0-r1,i0-i1
        fmul            v24.2s, v17.2s, v28.s[1]        // a2r*w,a2i*w
        fmul            v25.2s, v19.2s, v28.s[1]        // a3r*w,a3i*w
        fadd            v0.2s,  v20.2s, v21.2s
        fsub            v2.2s,  v20.2s, v21.2s
        fadd            v1.2s,  v22.2s, v23.2s
        rev64           v26.2s, v26.2s
        rev64           v27.2s, v27.2s
        fsub            v3.2s,  v22.2s, v23.2s
        fsub            v6.2s,  v6.2s,  v7.2s
        fadd            v24.2s, v24.2s, v26.2s          // a2r+a2i,a2i-a2r   t1,t2
        fadd            v25.2s, v25.2s, v27.2s          // a3r-a3i,a3i+a3r   t5,t6
        fadd            v7.2s,  v4.2s,  v5.2s
        fsub            v18.2s, v2.2s,  v6.2s
        ext             v26.8b, v24.8b, v25.8b, #4
        ext             v27.8b, v25.8b, v24.8b, #4
        fadd            v2.2s,  v2.2s,  v6.2s
        fsub            v16.2s, v0.2s,  v7.2s
        fadd            v5.2s,  v25.2s, v24.2s
        fsub            v4.2s,  v26.2s, v27.2s
        fadd            v0.2s,  v0.2s,  v7.2s
        fsub            v17.2s, v1.2s,  v5.2s
        fsub            v19.2s, v3.2s,  v4.2s
        fadd            v3.2s,  v3.2s,  v4.2s
        fadd            v1.2s,  v1.2s,  v5.2s

        st1             {v16.2s,v17.2s,v18.2s,v19.2s}, [x0]        // elements 4..7
        st1             {v0.2s, v1.2s, v2.2s, v3.2s},  [x1]        // elements 0..3

        ret
endfunc
109 | ||
/*
 * fft16_neon: 16-point transform, computed in place.
 *
 *   x0  in:  pointer to sixteen complex values stored as {re, im} float pairs
 *       out: advanced by 128 bytes
 *   x1  clobbered; copy of the original x0 used for the stores, also
 *       advanced by 128 bytes on return
 *   x14 in:  pointer to the ff_cos_16 table (set by ff_fft_calc_neon)
 *   v28 in:  mppm vector, v29 in: pmmp vector,
 *   v30 in:  trans4_float indices, v31 in: trans8_float indices
 *            (all four loaded by ff_fft_calc_neon)
 *
 * Clobbers v0-v7 and v16-v27.
 *
 * Structure: an 8-point transform on elements 0..7 interleaved with two
 * 4-point transforms on elements 8..15, followed by the combining pass.
 * The instruction order is hand-scheduled and is reproduced unchanged;
 * only the layout and the comments are new.
 */
function fft16_neon
        mov             x1,  x0
        ld1             {v0.2s, v1.2s, v2.2s, v3.2s},  [x0], #32   // z[0..3]
        ld1             {v16.2s,v17.2s,v18.2s,v19.2s}, [x0], #32   // z[4..7]
        ext             v22.8b, v2.8b,  v3.8b,  #4
        ext             v23.8b, v3.8b,  v2.8b,  #4
        fadd            v4.2s,  v16.2s, v17.2s          // r4+r5,i4+i5
        fadd            v5.2s,  v18.2s, v19.2s          // r6+r7,i6+i7
        fsub            v17.2s, v16.2s, v17.2s          // r4-r5,i4-i5
        fsub            v19.2s, v18.2s, v19.2s          // r6-r7,i6-i7
        rev64           v27.2s, v28.2s                  // v27 = low pair of v28 with lanes swapped
        fadd            v20.2s, v0.2s,  v1.2s           // r0+r1,i0+i1
        fadd            v21.2s, v2.2s,  v3.2s           // r2+r3,i2+i3
        fmul            v26.2s, v17.2s, v28.2s          // -a2r*w,a2i*w
        ext             v6.8b,  v4.8b,  v5.8b,  #4
        ext             v7.8b,  v5.8b,  v4.8b,  #4
        fmul            v27.2s, v19.2s, v27.2s          // a3r*w,-a3i*w
        fsub            v23.2s, v22.2s, v23.2s          // i2-i3,r3-r2
        fsub            v22.2s, v0.2s,  v1.2s           // r0-r1,i0-i1
        fmul            v24.2s, v17.2s, v28.s[1]        // a2r*w,a2i*w
        fmul            v25.2s, v19.2s, v28.s[1]        // a3r*w,a3i*w
        fadd            v0.2s,  v20.2s, v21.2s
        fsub            v2.2s,  v20.2s, v21.2s
        fadd            v1.2s,  v22.2s, v23.2s
        rev64           v26.2s, v26.2s
        rev64           v27.2s, v27.2s
        fsub            v3.2s,  v22.2s, v23.2s
        fsub            v6.2s,  v6.2s,  v7.2s
        fadd            v24.2s, v24.2s, v26.2s          // a2r+a2i,a2i-a2r   t1,t2
        fadd            v25.2s, v25.2s, v27.2s          // a3r-a3i,a3i+a3r   t5,t6
        fadd            v7.2s,  v4.2s,  v5.2s
        fsub            v18.2s, v2.2s,  v6.2s
        ld1             {v20.4s,v21.4s}, [x0], #32      // z[8..11]
        ld1             {v22.4s,v23.4s}, [x0], #32      // z[12..15]
        ext             v26.8b, v24.8b, v25.8b, #4
        ext             v27.8b, v25.8b, v24.8b, #4
        fadd            v2.2s,  v2.2s,  v6.2s
        fsub            v16.2s, v0.2s,  v7.2s
        fadd            v5.2s,  v25.2s, v24.2s
        fsub            v4.2s,  v26.2s, v27.2s
        transpose       v24.2d, v25.2d, v20.2d, v22.2d
        transpose       v26.2d, v27.2d, v21.2d, v23.2d
        fadd            v0.2s,  v0.2s,  v7.2s
        fsub            v17.2s, v1.2s,  v5.2s
        fsub            v19.2s, v3.2s,  v4.2s
        fadd            v3.2s,  v3.2s,  v4.2s
        fadd            v1.2s,  v1.2s,  v5.2s
        ext             v20.16b, v21.16b, v21.16b,  #4
        ext             v21.16b, v23.16b, v23.16b,  #4

        // Pack the 8-point results two complex values per register.
        zip1            v0.2d,  v0.2d,  v1.2d           // {z[0],   z[1]}
        zip1            v1.2d,  v2.2d,  v3.2d           // {z[2],   z[3]}
        zip1            v2.2d,  v16.2d, v17.2d          // {z[o1],  z[o1+1]}
        zip1            v3.2d,  v18.2d, v19.2d          // {z[o1+2],z[o1+3]}

        // 2 x fft4
        transpose       v22.2d, v23.2d, v20.2d, v21.2d

        fadd            v4.4s,  v24.4s, v25.4s
        fadd            v5.4s,  v26.4s, v27.4s
        fsub            v6.4s,  v24.4s, v25.4s
        fsub            v7.4s,  v22.4s, v23.4s

        ld1             {v23.4s},  [x14]                // first four floats of ff_cos_16

        fadd            v24.4s, v4.4s,  v5.4s           // {z[o2+0],z[o2+1]}
        fsub            v26.4s, v4.4s,  v5.4s           // {z[o2+2],z[o2+3]}
        fadd            v25.4s, v6.4s,  v7.4s           // {z[o3+0],z[o3+1]}
        fsub            v27.4s, v6.4s,  v7.4s           // {z[o3+2],z[o3+3]}

        // fft_pass_neon_16: combining pass, first half
        rev64           v7.4s,  v25.4s
        fmul            v25.4s, v25.4s, v23.s[1]
        fmul            v7.4s,  v7.4s,  v29.4s          // apply the pmmp signs
        fmla            v25.4s, v7.4s,  v23.s[3]        // {t1a,t2a,t5a,t6a}

        zip1            v20.4s, v24.4s, v25.4s
        zip2            v21.4s, v24.4s, v25.4s
        fneg            v22.4s, v20.4s
        fadd            v4.4s,  v21.4s, v20.4s
        fsub            v6.4s,  v20.4s, v21.4s          // just the second half
        fadd            v5.4s,  v21.4s, v22.4s          // just the first half

        tbl             v4.16b, {v4.16b},        v30.16b // trans4_float
        tbl             v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float

        fsub            v20.4s, v0.4s,  v4.4s           // {z[o2],z[o2+1]}
        fadd            v16.4s, v0.4s,  v4.4s           // {z[0], z[1]}
        fsub            v22.4s, v2.4s,  v5.4s           // {z[o3],z[o3+1]}
        fadd            v18.4s, v2.4s,  v5.4s           // {z[o1],z[o1+1]}

        // combining pass, second half
        rev64           v6.4s,  v26.4s
        fmul            v26.4s, v26.4s, v23.s[2]
        rev64           v7.4s,  v27.4s
        fmul            v27.4s, v27.4s, v23.s[3]
        fmul            v6.4s,  v6.4s,  v29.4s
        fmul            v7.4s,  v7.4s,  v29.4s
        fmla            v26.4s, v6.4s,  v23.s[2]        // {t1,t2,t5,t6}
        fmla            v27.4s, v7.4s,  v23.s[1]        // {t1a,t2a,t5a,t6a}

        zip1            v24.4s, v26.4s, v27.4s
        zip2            v25.4s, v26.4s, v27.4s
        fneg            v26.4s, v24.4s
        fadd            v4.4s,  v25.4s, v24.4s
        fsub            v6.4s,  v24.4s, v25.4s          // just the second half
        fadd            v5.4s,  v25.4s, v26.4s          // just the first half

        tbl             v4.16b, {v4.16b},        v30.16b // trans4_float
        tbl             v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float

        fadd            v17.4s, v1.4s,  v4.4s           // {z[2], z[3]}
        fsub            v21.4s, v1.4s,  v4.4s           // {z[o2+2],z[o2+3]}
        fadd            v19.4s, v3.4s,  v5.4s           // {z[o1+2],z[o1+3]}
        fsub            v23.4s, v3.4s,  v5.4s           // {z[o3+2],z[o3+3]}

        // Write all sixteen results back through the saved base pointer.
        st1             {v16.4s,v17.4s}, [x1], #32
        st1             {v18.4s,v19.4s}, [x1], #32
        st1             {v20.4s,v21.4s}, [x1], #32
        st1             {v22.4s,v23.4s}, [x1], #32

        ret
endfunc
233 | ||
234 | ||
/*
 * TBL byte indices, loaded into v30 by ff_fft_calc_neon.
 * Applied to a single 16-byte register they select its 32-bit words in the
 * order 0, 2, 1, 3, i.e. the two middle words change places.
 */
const   trans4_float, align=4
        .byte            0,  1,  2,  3,   8,  9, 10, 11
        .byte            4,  5,  6,  7,  12, 13, 14, 15
endconst
241 | ||
/*
 * TBL byte indices, loaded into v31 by ff_fft_calc_neon.
 * Applied to a two-register (32-byte) table they select 32-bit words
 * 6, 0, 7, 1: the upper two words of the second register interleaved with
 * the lower two words of the first register.
 */
const   trans8_float, align=4
        .byte           24, 25, 26, 27,   0,  1,  2,  3
        .byte           28, 29, 30, 31,   4,  5,  6,  7
endconst
248 | ||
/*
 * fft_pass_neon: combining pass over four equally spaced sub-blocks of z.
 *
 *   x0  in:  pointer to z (complex values stored as {re, im} float pairs)
 *   x2  in:  n, the number of iterations; every iteration handles two
 *            complex values from each of the four sub-blocks. The first
 *            iteration is peeled and the loop body runs n - 1 times with a
 *            test at the bottom, so n must be at least 2.
 *   x4  in:  pointer to the twiddle table; read forwards as wre and, from
 *            byte offset 8*n - 4, backwards as wim
 *   x7  in:  post-index step for the wim pointer (-8, set by
 *            ff_fft_calc_neon)
 *   v29 in:  pmmp vector, v30 in: trans4_float, v31 in: trans8_float
 *
 * Sub-block byte offsets from z: o1 = 16*n, o2 = 32*n, o3 = 48*n.
 * On return x0-x3 have each advanced by 16*n bytes.
 * Clobbers x1-x6, v4-v7, v16, v17 and v20-v27.
 */
function fft_pass_neon
        sub             x6,  x2,  #1                    // n - 1, loop counter
        lsl             x5,  x2,  #3                    // 8*n bytes = 2 * n floats
        lsl             x1,  x2,  #4                    // 16*n bytes, offset of z[o1]
        add             x5,  x4,  x5                    // wim
        add             x3,  x1,  x2,  lsl #5           // 48*n bytes, offset of z[o3]
        add             x2,  x0,  x2,  lsl #5           // &z[o2]
        add             x3,  x0,  x3                    // &z[o3]
        add             x1,  x0,  x1                    // &z[o1]

        // Peeled first iteration: only the odd twiddle pair is applied.
        ld1             {v20.4s},[x2]                   // {z[o2],z[o2+1]}
        ld1             {v22.4s},[x3]                   // {z[o3],z[o3+1]}
        ld1             {v4.2s},  [x4], #8              // {wre[0],wre[1]}
        transpose       v24.2d, v25.2d, v20.2d, v22.2d
        sub             x5,  x5,  #4                    // wim--
        ld1             {v5.s}[0],  [x5], x7            // v5.s[0] = wim[-1]
        rev64           v7.4s,  v25.4s
        fmul            v25.4s, v25.4s, v4.s[1]
        ld1             {v16.4s}, [x0]                  // {z[0],z[1]}
        fmul            v7.4s,  v7.4s,  v29.4s          // apply the pmmp signs
        ld1             {v17.4s}, [x1]                  // {z[o1],z[o1+1]}
        prfm            pldl1keep, [x2, #16]
        prfm            pldl1keep, [x3, #16]
        fmla            v25.4s, v7.4s,  v5.s[0]         // {t1a,t2a,t5a,t6a}
        prfm            pldl1keep, [x0, #16]
        prfm            pldl1keep, [x1, #16]

        zip1            v20.4s, v24.4s, v25.4s
        zip2            v21.4s, v24.4s, v25.4s
        fneg            v22.4s, v20.4s
        fadd            v4.4s,  v21.4s, v20.4s
        fsub            v6.4s,  v20.4s, v21.4s          // just the second half
        fadd            v5.4s,  v21.4s, v22.4s          // just the first half

        tbl             v4.16b, {v4.16b},        v30.16b // trans4_float
        tbl             v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float

        fadd            v20.4s, v16.4s, v4.4s
        fsub            v22.4s, v16.4s, v4.4s
        fadd            v21.4s, v17.4s, v5.4s
        st1             {v20.4s}, [x0], #16             // {z[0], z[1]}
        fsub            v23.4s, v17.4s, v5.4s

        st1             {v21.4s}, [x1], #16             // {z[o1],z[o1+1]}
        st1             {v22.4s}, [x2], #16             // {z[o2],z[o2+1]}
        st1             {v23.4s}, [x3], #16             // {z[o3],z[o3+1]}

        // Remaining n - 1 iterations: both twiddle pairs are applied.
1:
        ld1             {v20.4s},[x2]                   // {z[o2],z[o2+1]}
        ld1             {v22.4s},[x3]                   // {z[o3],z[o3+1]}
        ld1             {v4.2s},  [x4], #8              // {wre[0],wre[1]}
        transpose       v26.2d, v27.2d, v20.2d, v22.2d
        ld1             {v5.2s},  [x5], x7              // {wim[-1],wim[0]}
        rev64           v6.4s,  v26.4s
        fmul            v26.4s, v26.4s, v4.s[0]
        rev64           v7.4s,  v27.4s
        fmul            v27.4s, v27.4s, v4.s[1]
        fmul            v6.4s,  v6.4s,  v29.4s
        fmul            v7.4s,  v7.4s,  v29.4s
        ld1             {v16.4s},[x0]                   // {z[0],z[1]}
        fmla            v26.4s, v6.4s,  v5.s[1]         // {t1,t2,t5,t6}
        fmla            v27.4s, v7.4s,  v5.s[0]         // {t1a,t2a,t5a,t6a}
        ld1             {v17.4s},[x1]                   // {z[o1],z[o1+1]}

        subs            x6,  x6,  #1                    // n--; flags are consumed by b.ne below

        zip1            v20.4s, v26.4s, v27.4s
        zip2            v21.4s, v26.4s, v27.4s
        fneg            v22.4s, v20.4s
        fadd            v4.4s,  v21.4s, v20.4s
        fsub            v6.4s,  v20.4s, v21.4s          // just the second half
        fadd            v5.4s,  v21.4s, v22.4s          // just the first half

        tbl             v4.16b, {v4.16b},        v30.16b // trans4_float
        tbl             v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float

        fadd            v20.4s, v16.4s, v4.4s
        fsub            v22.4s, v16.4s, v4.4s
        fadd            v21.4s, v17.4s, v5.4s
        st1             {v20.4s}, [x0], #16             // {z[0], z[1]}
        fsub            v23.4s, v17.4s, v5.4s

        st1             {v21.4s}, [x1], #16             // {z[o1],z[o1+1]}
        st1             {v22.4s}, [x2], #16             // {z[o2],z[o2+1]}
        st1             {v23.4s}, [x3], #16             // {z[o3],z[o3+1]}
        b.ne            1b

        ret
endfunc
337 | ||
/*
 * def_fft n, n2, n4
 *
 * Emits fft<n>_neon, where n2 = n/2 and n4 = n/4 in every instantiation
 * below. With x0 pointing at z (8 bytes per complex value) the generated
 * function
 *   1. calls fft<n2>_neon on z,
 *   2. calls fft<n4>_neon on z + n2 elements,
 *   3. calls fft<n4>_neon on z + n2 + n4 elements,
 *   4. restores x0 = z, loads x4 = ff_cos_<n> and x2 = n4/2, and
 *      tail-branches to fft_pass_neon.
 *
 * x28 keeps the address of the second half across the calls; it and the
 * link register are pushed on entry and popped before the tail branch, so
 * fft_pass_neon returns straight to the caller of fft<n>_neon.
 */
.macro  def_fft n, n2, n4
function fft\n\()_neon, align=6
        stp             x28, x30, [sp, #-16]!           // push x28 and the link register
        add             x28, x0,  #\n4*2*8              // x28 = z + n2 elements
        bl              fft\n2\()_neon
        mov             x0,  x28
        bl              fft\n4\()_neon
        add             x0,  x28, #\n4*1*8              // z + n2 + n4 elements
        bl              fft\n4\()_neon
        sub             x0,  x28, #\n4*2*8              // back to z
        ldp             x28, x30, [sp], #16
        movrel          x4,  X(ff_cos_\n)
        mov             x2,  #\n4>>1
        b               fft_pass_neon
endfunc
.endm

        // One function per supported size above 16, smallest first.
        def_fft    32,    16,     8
        def_fft    64,    32,    16
        def_fft   128,    64,    32
        def_fft   256,   128,    64
        def_fft   512,   256,   128
        def_fft  1024,   512,   256
        def_fft  2048,  1024,   512
        def_fft  4096,  2048,  1024
        def_fft  8192,  4096,  2048
        def_fft 16384,  8192,  4096
        def_fft 32768, 16384,  8192
        def_fft 65536, 32768, 16384
368 | ||
/*
 * ff_fft_calc_neon (exported): entry point that dispatches on transform size.
 *
 *   x0  in: pointer to a context whose first 32-bit word is read as the
 *           size exponent (labelled nbits in ff_fft_permute_neon)
 *   x1  in: pointer to the data z
 *
 * Loads the constants the workers rely on and then branches (without
 * linking) to entry nbits - 2 of fft_tab_neon, so the selected worker
 * returns directly to the caller:
 *   x0  = z                      x7  = -8 (wim step in fft_pass_neon)
 *   x14 = address of ff_cos_16   v28 = mppm          v29 = pmmp
 *   v30 = trans4_float           v31 = trans8_float
 * Also clobbers x2-x4 and x10-x13.
 */
function ff_fft_calc_neon, export=1
        prfm            pldl1keep, [x1]

        // Size index: must be read before x0 is replaced by z.
        ldr             w2,  [x0]
        mov             x0,  x1
        sub             w2,  w2,  #2

        // Constant vectors and pointers shared by all workers.
        movrel          x10, trans4_float
        movrel          x11, trans8_float
        movrel          x12, pmmp
        movrel          x13, mppm
        movrel          x14, X(ff_cos_16)
        ld1             {v30.16b}, [x10]
        ld1             {v31.16b}, [x11]
        ld1             {v29.4s},  [x12]                // pmmp
        ld1             {v28.4s},  [x13]                // mppm
        mov             x7,  #-8

        // Table entries are 64-bit offsets relative to the table itself.
        movrel          x3,  fft_tab_neon
        ldr             x4,  [x3, x2, lsl #3]
        add             x3,  x3,  x4
        br              x3
endfunc
389 | ||
/*
 * ff_fft_permute_neon (exported): reorder z through the context tables.
 *
 *   x0  in: pointer to a context; the code reads
 *             [x0]       32-bit nbits
 *             [x0, #8]   revtab pointer
 *             [x0, #16]  tmp_buf pointer
 *   x1  in: pointer to z, 1 << nbits complex values of 8 bytes each
 *
 * Pass 1 scatters z into tmp_buf: each 32-bit revtab word supplies two
 * 16-bit destination indices, the low half for the first value of a pair
 * and the high half for the second. Pass 2 copies tmp_buf back over z,
 * four values (32 bytes) per iteration.
 * Clobbers x0-x6, v0 and v1; x1 is advanced past the end of z on return.
 */
function ff_fft_permute_neon, export=1
        mov             x6,  #1
        ldr             x3,  [x0, #16]                  // tmp_buf
        ldr             w2,  [x0]                       // nbits
        ldr             x0,  [x0, #8]                   // revtab (x0 is overwritten last)
        lsl             x6,  x6,  x2                    // n = 1 << nbits, pass 1 counter
        mov             x2,  x6                         // second copy of n for pass 2
1:
        ld1             {v0.2s,v1.2s}, [x1], #16        // next two values of z
        ldr             w4,  [x0], #4                   // two packed 16-bit indices
        uxth            w5,  w4                         // index for the first value
        lsr             w4,  w4,  #16                   // index for the second value
        add             x5,  x3,  x5,  lsl #3
        add             x4,  x3,  x4,  lsl #3
        st1             {v0.2s}, [x5]
        st1             {v1.2s}, [x4]
        subs            x6,  x6,  #2
        b.gt            1b

        sub             x1,  x1,  x2,  lsl #3           // rewind x1 to the start of z
2:
        ld1             {v0.4s,v1.4s}, [x3], #32
        st1             {v0.4s,v1.4s}, [x1], #32
        subs            x2,  x2,  #4
        b.gt            2b

        ret
endfunc
418 | ||
/*
 * Dispatch table used by ff_fft_calc_neon, indexed by nbits - 2.
 * Each entry is the 64-bit distance from the start of this table to the
 * worker for 4, 8, ..., 65536 points; the dispatcher adds the table
 * address back before branching.
 */
const   fft_tab_neon
.irp    n, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536
        .quad           fft\n\()_neon - fft_tab_neon
.endr
endconst
436 | ||
/*
 * Sign pattern plus, minus, minus, plus. Loaded into v29 by
 * ff_fft_calc_neon and multiplied into the lane-swapped operands of the
 * combining passes.
 */
const   pmmp, align=4
        .float          +1.0, -1.0
        .float          -1.0, +1.0
endconst
440 | ||
/*
 * 1/sqrt(2) with the sign pattern minus, plus, plus, minus. Loaded into
 * v28 by ff_fft_calc_neon and used by the 8-point stage of fft8_neon and
 * fft16_neon.
 */
const   mppm, align=4
        .float          -M_SQRT1_2,  M_SQRT1_2
        .float           M_SQRT1_2, -M_SQRT1_2
endconst