Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | /* |
2 | * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net> | |
3 | * | |
4 | * This file is part of FFmpeg. | |
5 | * | |
6 | * FFmpeg is free software; you can redistribute it and/or | |
7 | * modify it under the terms of the GNU Lesser General Public | |
8 | * License as published by the Free Software Foundation; either | |
9 | * version 2.1 of the License, or (at your option) any later version. | |
10 | * | |
11 | * FFmpeg is distributed in the hope that it will be useful, | |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 | * Lesser General Public License for more details. | |
15 | * | |
16 | * You should have received a copy of the GNU Lesser General Public | |
17 | * License along with FFmpeg; if not, write to the Free Software | |
18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
19 | */ | |
20 | ||
21 | #include "libavutil/aarch64/asm.S" | |
22 | ||
23 | #include "asm-offsets.h" | |
24 | ||
// shuffle a, b, c, d
// Emits a 16-byte constant named shuffle_<a><b><c><d> whose bytes are the
// indices 4*a..4*a+3, 4*b..4*b+3, 4*c..4*c+3 and 4*d..4*d+3.  Used as the
// index operand of TBL (see fft5_neon) it picks the 32-bit lanes a, b, c, d
// of the source vector, in that order.
// const/endconst come from the included asm.S; align=4 is presumably a
// power-of-two exponent (16-byte alignment) -- confirm against that macro.
25 | .macro shuffle a, b, c, d | |
26 | const shuffle_\a\b\c\d, align=4 | |
27 | .byte (\a * 4), (\a * 4 + 1), (\a * 4 + 2), (\a * 4 + 3) | |
28 | .byte (\b * 4), (\b * 4 + 1), (\b * 4 + 2), (\b * 4 + 3) | |
29 | .byte (\c * 4), (\c * 4 + 1), (\c * 4 + 2), (\c * 4 + 3) | |
30 | .byte (\d * 4), (\d * 4 + 1), (\d * 4 + 2), (\d * 4 + 3) | |
31 | endconst | |
32 | .endm | |
33 | ||
// The four lane permutations used by fft5_neon.  fft_b15_calc_neon loads
// their addresses into x11 (0213), x12 (1032), x13 (2301) and x14 (3120).
34 | shuffle 0, 2, 1, 3 | |
35 | shuffle 1, 0, 3, 2 | |
36 | shuffle 2, 3, 0, 1 | |
37 | shuffle 3, 1, 2, 0 | |
38 | ||
39 | ||
// fft5_neon: 5-point complex FFT step, called three times by fft15_neon.
// In:   x1      = address of the first input element (8 bytes, two floats)
//       x2      = distance between consecutive inputs, in elements
//       v15     = the four fact5 constants (loaded by fft_b15_calc_neon)
//       x11-x14 = addresses of shuffle_0213/1032/2301/3120
//                 (loaded by fft_b15_calc_neon)
// Out:  v0.2s   = sum of the five inputs (out[0])
//       v6.4s   = first float (re, per the comments below) of the other
//                 four outputs
//       v7.4s   = second float (im) of the other four outputs
// Clobbers: x1, x2, v16-v31
40 | function fft5_neon | |
// convert the element stride to a byte stride (8 bytes per element)
41 | lsl x2, x2, #3 | |
// v24 = in[0]; lane k of v25/v26 = first/second float of in[k + 1]
42 | ld1 {v24.2s}, [x1], x2 | |
43 | ld2 {v25.s,v26.s}[0], [x1], x2 | |
44 | ld2 {v25.s,v26.s}[1], [x1], x2 | |
45 | ld2 {v25.s,v26.s}[2], [x1], x2 | |
46 | ld2 {v25.s,v26.s}[3], [x1] | |
// broadcast both floats of in[0]; v6/v7 are the accumulators for the result
47 | dup v6.4s, v24.s[0] | |
48 | dup v7.4s, v24.s[1] | |
49 | ||
// first step of the pairwise reduction that sums in[1..4] (finished below)
50 | faddp v0.4s, v25.4s, v26.4s | |
51 | // z[][0], z[][3] | |
52 | fmul v16.4s, v25.4s, v15.s[0] // rr | |
53 | fmul v17.4s, v25.4s, v15.s[1] // ri | |
54 | fmul v18.4s, v26.4s, v15.s[0] // ir | |
55 | fmul v19.4s, v26.4s, v15.s[1] // ii | |
56 | faddp v0.4s, v0.4s, v0.4s | |
57 | // z[][1], z[][2] | |
58 | fmul v20.4s, v25.4s, v15.s[2] // rr | |
59 | fmul v21.4s, v25.4s, v15.s[3] // ri | |
60 | fmul v22.4s, v26.4s, v15.s[2] // ir | |
61 | fmul v23.4s, v26.4s, v15.s[3] // ii | |
62 | fadd v0.2s, v24.2s, v0.2s // out[0] | |
63 | ||
64 | // z[0123][0], z[0123][3] | |
65 | fsub v24.4s, v16.4s, v19.4s // (c).re = rr - ii; | |
66 | fadd v27.4s, v16.4s, v19.4s // (d).re = rr + ii; | |
// v16-v19 are free from here on and are reused for the four shuffle index
// vectors; the loads are interleaved with the arithmetic (presumably to hide
// load latency)
67 | ld1 {v16.16b}, [x11] | |
68 | ld1 {v19.16b}, [x14] | |
69 | fadd v28.4s, v17.4s, v18.4s // (c).im = ri + ir; | |
70 | fsub v31.4s, v18.4s, v17.4s // (d).im = -ri + ir; | |
71 | ld1 {v17.16b}, [x12] | |
72 | // z[0123][1], z[0123][2] | |
73 | fsub v25.4s, v20.4s, v23.4s // (c).re = rr - ii; | |
74 | fadd v26.4s, v20.4s, v23.4s // (d).re = rr + ii; | |
75 | ld1 {v18.16b}, [x13] | |
76 | fadd v29.4s, v21.4s, v22.4s // (c).im = ri + ir; | |
77 | fsub v30.4s, v22.4s, v21.4s // (d).im = -ri + ir; | |
78 | ||
// permute the lanes of each product vector with its shuffle table
79 | //real | |
80 | tbl v20.16b, {v24.16b}, v16.16b | |
81 | tbl v21.16b, {v25.16b}, v17.16b | |
82 | tbl v22.16b, {v26.16b}, v18.16b | |
83 | tbl v23.16b, {v27.16b}, v19.16b | |
84 | //imag | |
85 | tbl v16.16b, {v28.16b}, v16.16b | |
86 | tbl v17.16b, {v29.16b}, v17.16b | |
87 | tbl v18.16b, {v30.16b}, v18.16b | |
88 | tbl v19.16b, {v31.16b}, v19.16b | |
89 | ||
// v6 = in[0].re + v20 + v21 + v22 + v23, v7 = in[0].im + v16 + v17 + v18 + v19
90 | fadd v6.4s, v6.4s, v20.4s | |
91 | fadd v22.4s, v22.4s, v23.4s | |
92 | fadd v7.4s, v7.4s, v16.4s | |
93 | fadd v18.4s, v18.4s, v19.4s | |
94 | ||
95 | fadd v21.4s, v21.4s, v22.4s | |
96 | fadd v17.4s, v17.4s, v18.4s | |
97 | fadd v6.4s, v6.4s, v21.4s | |
98 | fadd v7.4s, v7.4s, v17.4s | |
99 | ||
100 | ret | |
101 | endfunc | |
102 | ||
// fft15_neon: 15-point complex FFT built from three fft5_neon calls, each
// over every third input, combined with the twiddles held in v8-v14.
// In:   x0 = output pointer; 15 elements of 8 bytes are stored (120 bytes)
//       x1 = input pointer
//       x3 = input stride in elements
//       v8-v15 and x11-x14 as set up by fft_b15_calc_neon
// Out:  x0 advanced past the 15 stored elements
// Clobbers: x1, x2, x8, x9, v0-v7, v16-v31
103 | function fft15_neon | |
// x8 = input base (x1 is consumed by fft5_neon)
104 | mov x8, x1 | |
// the bl instructions below overwrite x30, so keep the return address in x9
105 | mov x9, x30 | |
106 | add x2, x3, x3, lsl #1 // 3 * stride | |
107 | ||
108 | add x1, x8, x3, lsl #3 // in + 1 * stride | |
109 | bl fft5_neon | |
// keep the first sub-FFT: v1 = its out[0], v2/v3 = its v6/v7
110 | mov v1.8b, v0.8b | |
111 | mov v2.16b, v6.16b | |
112 | mov v3.16b, v7.16b | |
113 | ||
114 | add x1, x8, x3, lsl #4 // in + 2 * stride | |
// fft5_neon shifted x2 in place, so it is recomputed before every call
115 | add x2, x3, x3, lsl #1 // 3 * stride | |
116 | bl fft5_neon | |
// v1 = { first.re, second.re, first.im, second.im } of the two out[0] values;
// v4/v5 = v6/v7 of the second sub-FFT
117 | zip1 v1.4s, v1.4s, v0.4s | |
118 | mov v4.16b, v6.16b | |
119 | mov v5.16b, v7.16b | |
120 | ||
121 | mov x1, x8 // in + 0 * stride | |
122 | add x2, x3, x3, lsl #1 // 3 * stride | |
123 | bl fft5_neon | |
124 | ||
// out[0] = sum of the three sub-FFT out[0] values (completed by the fadd below)
125 | faddp v20.4s, v1.4s, v1.4s | |
126 | ||
127 | ext v18.16b, v8.16b, v8.16b, #4 | |
128 | ext v19.16b, v9.16b, v9.16b, #4 | |
// v16/v17 start from the third sub-FFT (v6/v7) and accumulate the other two
129 | mov v16.16b, v6.16b | |
130 | mov v17.16b, v7.16b | |
131 | fadd v20.2s, v20.2s, v0.2s | |
132 | ||
133 | uzp1 v18.4s, v18.4s, v10.4s // exp[2,4,6,8].re | |
134 | uzp1 v19.4s, v19.4s, v11.4s // exp[2,4,6,8].im | |
135 | ||
136 | st1 {v20.2s}, [x0], #8 // out[0] | |
137 | ||
// complex multiply-accumulate: (v2 + i*v3) * (v8 + i*v9)
138 | fmla v16.4s, v2.4s, v8.4s | |
139 | fmls v16.4s, v3.4s, v9.4s | |
140 | ||
141 | fmla v17.4s, v2.4s, v9.4s | |
142 | fmla v17.4s, v3.4s, v8.4s | |
143 | ||
// complex multiply-accumulate: (v4 + i*v5) * (v18 + i*v19)
144 | fmla v16.4s, v4.4s, v18.4s | |
145 | fmls v16.4s, v5.4s, v19.4s | |
146 | ||
147 | fmla v17.4s, v4.4s, v19.4s | |
148 | fmla v17.4s, v5.4s, v18.4s | |
149 | ||
// interleave the two result vectors back into 8-byte elements for the store
150 | zip1 v18.4s, v16.4s, v17.4s | |
151 | zip2 v19.4s, v16.4s, v17.4s | |
152 | ||
// rearrange v1 (the two saved out[0] values) and v14 (exp[5], exp[10]) for
// the products that form out[5] and out[10]
153 | rev64 v31.4s, v14.4s | |
154 | trn1 v28.2d, v1.2d, v1.2d | |
155 | trn2 v29.2d, v1.2d, v1.2d | |
156 | zip1 v30.2d, v14.2d, v31.2d | |
157 | zip2 v31.2d, v14.2d, v31.2d | |
158 | ||
159 | st1 {v18.4s,v19.4s}, [x0], #32 // out[1-4] | |
160 | ||
161 | fmul v16.4s, v28.4s, v30.4s | |
162 | fmul v17.4s, v29.4s, v30.4s | |
163 | fmls v16.4s, v29.4s, v31.4s | |
164 | fmla v17.4s, v28.4s, v31.4s | |
165 | faddp v16.4s, v16.4s, v16.4s | |
166 | faddp v17.4s, v17.4s, v17.4s | |
167 | zip1 v18.2s, v16.2s, v17.2s | |
168 | zip2 v19.2s, v16.2s, v17.2s | |
169 | ||
// add the out[0] of the third sub-FFT: v18 = out[5], v0 = out[10]
170 | fadd v18.2s, v18.2s, v0.2s | |
171 | fadd v0.2s, v19.2s, v0.2s | |
172 | ||
173 | ext v30.16b, v12.16b, v12.16b, #4 | |
174 | ext v31.16b, v13.16b, v13.16b, #4 | |
175 | mov v16.16b, v6.16b | |
176 | mov v17.16b, v7.16b | |
177 | ||
178 | uzp1 v30.4s, v30.4s, v8.4s | |
179 | uzp1 v31.4s, v31.4s, v9.4s | |
180 | ||
181 | st1 {v18.2s}, [x0], #8 // out[5] | |
182 | ||
// same accumulation pattern as for out[1-4], with twiddles v10/v11, v30/v31
183 | fmla v16.4s, v2.4s, v10.4s | |
184 | fmls v16.4s, v3.4s, v11.4s | |
185 | ||
186 | fmla v17.4s, v2.4s, v11.4s | |
187 | fmla v17.4s, v3.4s, v10.4s | |
188 | ||
189 | fmla v16.4s, v4.4s, v30.4s | |
190 | fmls v16.4s, v5.4s, v31.4s | |
191 | ||
192 | fmla v17.4s, v4.4s, v31.4s | |
193 | fmla v17.4s, v5.4s, v30.4s | |
194 | ||
195 | zip1 v18.4s, v16.4s, v17.4s | |
196 | zip2 v19.4s, v16.4s, v17.4s | |
197 | ||
198 | ext v30.16b, v10.16b, v10.16b, #4 | |
199 | ext v31.16b, v11.16b, v11.16b, #4 | |
200 | ||
// last group: accumulate directly into v6/v7, which are not needed afterwards
201 | fmla v6.4s, v2.4s, v12.4s | |
202 | fmls v6.4s, v3.4s, v13.4s | |
203 | ||
204 | st1 {v18.4s,v19.4s}, [x0], #32 // out[6-9] | |
205 | ||
206 | uzp1 v30.4s, v30.4s, v12.4s | |
207 | uzp1 v31.4s, v31.4s, v13.4s | |
208 | ||
209 | fmla v7.4s, v2.4s, v13.4s | |
210 | fmla v7.4s, v3.4s, v12.4s | |
211 | ||
212 | st1 {v0.2s}, [x0], #8 // out[10] | |
213 | ||
214 | fmla v6.4s, v4.4s, v30.4s | |
215 | fmls v6.4s, v5.4s, v31.4s | |
216 | ||
217 | fmla v7.4s, v4.4s, v31.4s | |
218 | fmla v7.4s, v5.4s, v30.4s | |
219 | ||
220 | zip1 v18.4s, v6.4s, v7.4s | |
221 | zip2 v19.4s, v6.4s, v7.4s | |
222 | ||
223 | st1 {v18.4s,v19.4s}, [x0], #32 // out[11-14] | |
224 | ||
// return through the link register value saved on entry
225 | ret x9 | |
226 | endfunc | |
227 | ||
// fft15_pass: radix-2 combination of two half-size results.
// For every index i, with a = out[i], b = out2[i] (out2 = out + len2) and
// t = b * exptab[i] as a complex product:
//     out[i]  = a + t
//     out2[i] = a - t
// The len2 & 3 leading elements are handled one at a time, the rest four at
// a time.  Label 9 loads a group of four unconditionally, so at least four
// elements must remain after the scalar part.
// Clobbers: x0-x6, v0-v7, v16-v23, condition flags
228 | // x0: out, x1: out+len2, x2: exptab, x3: len2 | |
229 | function fft15_pass | |
// x6 = number of leading elements for the scalar code (flags used by b.eq)
230 | ands x6, x3, #3 | |
// x4/x5 are the store pointers, x0/x1 the load pointers
231 | mov x4, x0 | |
232 | mov x5, x1 | |
233 | b.eq 9f | |
// first scalar element: plain add/sub without a multiply, and its table
// entry is skipped (presumably exptab[0] == 1 + 0i -- confirm with the
// code that fills the table)
234 | ld1 {v0.2s}, [x0], #8 | |
235 | ld1 {v1.2s}, [x1], #8 | |
// x3 = element count left for the vector loop
236 | sub x3, x3, x6 | |
237 | subs x6, x6, #1 | |
238 | fadd v2.2s, v0.2s, v1.2s | |
239 | fsub v3.2s, v0.2s, v1.2s | |
240 | add x2, x2, #8 | |
241 | st1 {v2.2s}, [x4], #8 | |
242 | st1 {v3.2s}, [x5], #8 | |
243 | b.eq 9f | |
// scalar loop for the remaining leading elements
244 | 1: | |
245 | subs x6, x6, #1 | |
// s4/s5 = twiddle, s2/s3 = b, s0/s1 = a
246 | ldp s4, s5, [x2], #8 | |
247 | ldp s2, s3, [x1], #8 | |
248 | ldp s0, s1, [x0], #8 | |
249 | ||
// s6/s7 = t = b * twiddle
250 | fmul s6, s2, s4 | |
251 | fmul s7, s2, s5 | |
252 | fmls s6, s3, v5.s[0] | |
253 | fmla s7, s3, v4.s[0] | |
254 | ||
255 | fsub s2, s0, s6 | |
256 | fsub s3, s1, s7 | |
257 | fadd s0, s0, s6 | |
258 | fadd s1, s1, s7 | |
259 | ||
260 | stp s2, s3, [x5], #8 | |
261 | stp s0, s1, [x4], #8 | |
// flags still come from the subs at the top of the loop
262 | b.gt 1b | |
// vector part: preload the first group of four into the v0-v7 register set
// (v6/v7 = twiddle re/im, v2/v3 = b re/im, v0/v1 = a re/im)
263 | 9: | |
264 | ld1 {v4.4s,v5.4s}, [x2], #32 | |
265 | ld2 {v2.4s,v3.4s}, [x1], #32 | |
266 | uzp1 v6.4s, v4.4s, v5.4s | |
267 | uzp2 v7.4s, v4.4s, v5.4s | |
268 | ld2 {v0.4s,v1.4s}, [x0], #32 | |
// main loop: two groups of four per iteration.  While one register set is
// being computed and stored, the next group is loaded into the other set
// (v16-v23).
269 | 8: | |
270 | subs x3, x3, #8 | |
271 | ||
272 | fmul v4.4s, v2.4s, v6.4s | |
273 | fmul v5.4s, v2.4s, v7.4s | |
// fewer than 8 left: the group in v0-v7 is the last one, finish it at 4
274 | b.lt 4f | |
275 | ||
276 | ld1 {v18.4s,v19.4s}, [x2], #32 | |
277 | ||
278 | fmls v4.4s, v3.4s, v7.4s | |
279 | fmla v5.4s, v3.4s, v6.4s | |
280 | ||
281 | ld2 {v22.4s,v23.4s}, [x1], #32 | |
282 | ||
283 | fsub v2.4s, v0.4s, v4.4s | |
284 | fadd v0.4s, v0.4s, v4.4s | |
285 | fsub v3.4s, v1.4s, v5.4s | |
286 | fadd v1.4s, v1.4s, v5.4s | |
287 | ||
288 | uzp1 v16.4s, v18.4s, v19.4s | |
289 | uzp2 v17.4s, v18.4s, v19.4s | |
290 | ||
291 | st2 {v2.4s,v3.4s}, [x5], #32 | |
292 | st2 {v0.4s,v1.4s}, [x4], #32 | |
293 | ld2 {v20.4s,v21.4s}, [x0], #32 | |
294 | ||
295 | fmul v18.4s, v22.4s, v16.4s | |
296 | fmul v19.4s, v22.4s, v17.4s | |
// exactly 8 were left (flags from the subs above): the group in v16-v23 is
// the last one, finish it at 0 without loading further
297 | b.eq 0f | |
298 | ||
299 | ld1 {v4.4s,v5.4s}, [x2], #32 | |
300 | ||
301 | fmls v18.4s, v23.4s, v17.4s | |
302 | fmla v19.4s, v23.4s, v16.4s | |
303 | ||
304 | ld2 {v2.4s,v3.4s}, [x1], #32 | |
305 | ||
306 | fsub v22.4s, v20.4s, v18.4s | |
307 | fadd v20.4s, v20.4s, v18.4s | |
308 | fsub v23.4s, v21.4s, v19.4s | |
309 | fadd v21.4s, v21.4s, v19.4s | |
310 | ||
311 | uzp1 v6.4s, v4.4s, v5.4s | |
312 | uzp2 v7.4s, v4.4s, v5.4s | |
313 | ||
314 | st2 {v22.4s,v23.4s}, [x5], #32 | |
315 | st2 {v20.4s,v21.4s}, [x4], #32 | |
316 | ld2 {v0.4s,v1.4s}, [x0], #32 | |
317 | ||
318 | b 8b | |
// drain: finish and store the group held in the v0-v7 register set
319 | 4: | |
320 | fmls v4.4s, v3.4s, v7.4s | |
321 | fmla v5.4s, v3.4s, v6.4s | |
322 | ||
323 | fsub v2.4s, v0.4s, v4.4s | |
324 | fadd v0.4s, v0.4s, v4.4s | |
325 | fsub v3.4s, v1.4s, v5.4s | |
326 | fadd v1.4s, v1.4s, v5.4s | |
327 | ||
328 | st2 {v2.4s,v3.4s}, [x5], #32 | |
329 | st2 {v0.4s,v1.4s}, [x4], #32 | |
330 | ||
331 | ret | |
// drain: finish and store the group held in the v16-v23 register set
332 | 0: | |
333 | fmls v18.4s, v23.4s, v17.4s | |
334 | fmla v19.4s, v23.4s, v16.4s | |
335 | ||
336 | fsub v22.4s, v20.4s, v18.4s | |
337 | fadd v20.4s, v20.4s, v18.4s | |
338 | fsub v23.4s, v21.4s, v19.4s | |
339 | fadd v21.4s, v21.4s, v19.4s | |
340 | ||
341 | st2 {v22.4s,v23.4s}, [x5], #32 | |
342 | st2 {v20.4s,v21.4s}, [x4], #32 | |
343 | ||
344 | ret | |
345 | endfunc | |
346 | ||
// fft30_neon: 30-point FFT = two fft15_neon calls over alternating inputs,
// followed by fft15_pass.
// In:   x1  = output pointer
//       x2  = input pointer
//       x4  = input stride in elements
//       x10 = context (CELT_EXPTAB is read relative to it)
//       plus the vector/table state fft15_neon needs (see fft_b15_calc_neon)
// x0 and x3 are overwritten without being read.
// x20-x22 and x30 are saved in a 0x20-byte frame and restored before the
// tail call.
347 | function fft30_neon, align=6 | |
348 | sub sp, sp, #0x20 | |
349 | stp x20, x21, [sp] | |
350 | stp x22, x30, [sp, #0x10] | |
// x21 = out, x22 = in, x20 = stride; these survive the calls below
351 | mov x21, x1 | |
352 | mov x22, x2 | |
353 | mov x20, x4 | |
// first half: fft15_neon(out, in, 2 * stride)
354 | mov x0, x21 | |
355 | mov x1, x22 | |
356 | lsl x3, x20, #1 | |
357 | bl fft15_neon | |
358 | ||
// second half: fft15_neon(out + 15 elements, in + 1 * stride, 2 * stride)
359 | add x0, x21, #15*8 | |
360 | add x1, x22, x20, lsl #3 | |
361 | lsl x3, x20, #1 | |
362 | bl fft15_neon | |
363 | ||
// set up fft15_pass(out, out + 15 elements, exptab[1], 15)
364 | ldr x2, [x10, #(CELT_EXPTAB + 8)] // s->exptab[1] | |
365 | add x0, x21, #0 | |
366 | add x1, x21, #15*8 | |
367 | mov x3, #15 | |
// restore the saved registers first: fft15_pass is entered with a plain
// branch and returns directly to the caller of this function
368 | ldp x20, x21, [sp] | |
369 | ldp x22, x30, [sp, #0x10] | |
370 | add sp, sp, #0x20 | |
371 | b fft15_pass | |
372 | endfunc | |
373 | ||
// def_fft n, n2
// Defines fft<n>_neon as two calls of fft<n2>_neon (n2 is n / 2 in every
// instantiation below) over alternating inputs, followed by fft15_pass.
// Register use of the generated function:
//   x1  = output pointer
//   x2  = input pointer
//   x3  = index N of the exptab entry used for the final pass (the callees
//         receive N - 1)
//   x4  = input stride in elements (the callees receive 2 * stride)
//   x10 = context (CELT_EXPTAB is read relative to it)
// x20-x24 and x30 are saved in a 0x30-byte frame.  x24 is not otherwise used
// in this macro; it fills the second slot of the stp/ldp pair with x23.
374 | .macro def_fft n, n2 | |
375 | function fft\n\()_neon, align=6 | |
376 | sub sp, sp, #0x30 | |
377 | stp x20, x21, [sp] | |
378 | stp x22, x30, [sp, #0x10] | |
379 | stp x23, x24, [sp, #0x20] | |
// x21 = out, x22 = in, x23 = N, x20 = stride; these survive the calls below
380 | mov x21, x1 | |
381 | mov x22, x2 | |
382 | mov x23, x3 | |
383 | mov x20, x4 | |
// first half: x1/x2 are passed on unchanged, with N - 1 and 2 * stride
384 | sub x3, x3, #1 | |
385 | lsl x4, x4, #1 | |
386 | bl fft\n2\()_neon | |
387 | ||
// second half: out + n2 elements, in + 1 * stride, N - 1, 2 * stride
388 | add x1, x21, #(\n2 * 8) | |
389 | add x2, x22, x20, lsl #3 | |
390 | sub x3, x23, #1 | |
391 | lsl x4, x20, #1 | |
392 | bl fft\n2\()_neon | |
393 | ||
// set up fft15_pass(out, out + n2 elements, exptab[N], n2)
394 | add x5, x10, #CELT_EXPTAB | |
395 | mov x0, x21 | |
396 | ldr x2, [x5, x23, lsl #3] // s->exptab[N] | |
397 | add x1, x21, #(\n2 * 8) | |
398 | mov x3, #\n2 | |
// restore the saved registers first: fft15_pass is entered with a plain
// branch and returns directly to the caller of this function
399 | ldp x20, x21, [sp] | |
400 | ldp x22, x30, [sp, #0x10] | |
401 | ldp x23, x24, [sp, #0x20] | |
402 | add sp, sp, #0x30 | |
403 | b fft15_pass | |
404 | endfunc | |
405 | .endm | |
406 | ||
// Each size is built on the previous one; fft30_neon is the hand-written base.
407 | def_fft 60, 30 | |
408 | def_fft 120, 60 | |
409 | def_fft 240, 120 | |
410 | def_fft 480, 240 | |
411 | def_fft 960, 480 | |
412 | ||
// fft_b15_calc_neon: load the constants shared by all FFT sizes, then call
// the function selected by x3 through fft_tab_neon.
// In:   x0 = context (the exptab[0] pointer is read at offset CELT_EXPTAB)
//       x3 = index into fft_tab_neon
//       x1, x2, x4 are not written here and reach the selected function
//       unchanged
// State handed to the callee:
//       x10     = context
//       x11-x14 = addresses of shuffle_0213/1032/2301/3120
//       v15     = fact5
//       v8/v9, v10/v11, v12/v13 = re/im of exptab[0] entries 1-4, 6-9, 11-14
//       v14     = entries 5 and 10
// x20, x30 and d8-d15 are saved in a 0x50-byte frame and restored after the
// call.  d8-d15 are the halves of v8-v15 that AAPCS64 makes callee-saved.
413 | function fft_b15_calc_neon | |
414 | sub sp, sp, #0x50 | |
415 | ldr x8, [x0, #CELT_EXPTAB] // s->exptab[0] | |
416 | movrel x6, fact5 | |
417 | movrel x11, shuffle_0213 | |
418 | movrel x12, shuffle_1032 | |
419 | movrel x13, shuffle_2301 | |
420 | movrel x14, shuffle_3120 | |
// skip the first 8-byte table entry; the loads below start at entry 1
421 | add x8, x8, #8 | |
422 | movrel x5, fft_tab_neon | |
423 | stp x20, x30, [sp] | |
424 | stp d8, d9, [sp, #0x10] | |
425 | stp d10, d11, [sp, #0x20] | |
426 | stp d12, d13, [sp, #0x30] | |
427 | stp d14, d15, [sp, #0x40] | |
428 | ld1 {v15.4s}, [x6] | |
// entries 1-4, 5, 6-9, 10, 11-14 in memory order
429 | ld1 {v0.4s,v1.4s}, [x8], #32 | |
430 | ld1 {v6.2s}, [x8], #8 | |
431 | ld1 {v2.4s,v3.4s}, [x8], #32 | |
432 | ld1 {v7.2s}, [x8], #8 | |
433 | ld1 {v4.4s,v5.4s}, [x8], #32 | |
434 | uzp1 v8.4s, v0.4s, v1.4s // exp[ 1 - 4].re | |
435 | uzp2 v9.4s, v0.4s, v1.4s // exp[ 1 - 4].im | |
436 | uzp1 v10.4s, v2.4s, v3.4s // exp[ 6 - 9].re | |
437 | uzp2 v11.4s, v2.4s, v3.4s // exp[ 6 - 9].im | |
438 | uzp1 v12.4s, v4.4s, v5.4s // exp[11 - 14].re | |
439 | uzp2 v13.4s, v4.4s, v5.4s // exp[11 - 14].im | |
440 | zip1 v14.4s, v6.4s, v7.4s // exp[5,10].re/exp[5,10].im | |
// x5 = fft_tab_neon[x3] (8-byte entries)
441 | add x5, x5, x3, lsl #3 | |
442 | ldr x5, [x5] | |
// the FFT functions read the context through x10
443 | mov x10, x0 | |
444 | blr x5 | |
445 | ldp x20, x30, [sp] | |
446 | ldp d8, d9, [sp, #0x10] | |
447 | ldp d10, d11, [sp, #0x20] | |
448 | ldp d12, d13, [sp, #0x30] | |
449 | ldp d14, d15, [sp, #0x40] | |
450 | add sp, sp, #0x50 | |
451 | ret | |
452 | endfunc | |
453 | ||
// fft_tab_neon: addresses of the FFT entry points.  fft_b15_calc_neon
// indexes this table with x3 (8-byte entries), so entry k is the FFT of size
// 15 << k.
// NOTE(review): fft15_neon takes out/in/stride in x0/x1/x3, while the other
// entries take them in x1/x2/x4, and fft_b15_calc_neon calls with
// x0 = context.  Entry 0 therefore looks unusable through this table --
// confirm that callers never select index 0.
// NOTE(review): the entries are absolute addresses and so need load-time
// relocation in position-independent builds; check which section the const
// macro from asm.S places this table in.
454 | const fft_tab_neon | |
455 | .quad fft15_neon | |
456 | .quad fft30_neon | |
457 | .quad fft60_neon | |
458 | .quad fft120_neon | |
459 | .quad fft240_neon | |
460 | .quad fft480_neon | |
461 | .quad fft960_neon | |
462 | endconst | |
463 | ||
// ff_celt_imdct_half_neon (exported)
// Register use on entry, as read by the code below:
//   x0 = context (fields at CELT_LEN2/CELT_LEN4, CELT_TMP/CELT_TWIDDLE and
//        CELT_FFT_N are read)
//   x1 = destination buffer
//   x2 = source buffer
//   x3 = source stride in floats
//   s0 = scale applied to every output value
// Three phases:
//   1. pre-rotation: source values taken from the end (walking backwards)
//      and from the start (walking forwards) are multiplied by the twiddle
//      table and stored as interleaved pairs in the context tmp buffer
//   2. fft_b15_calc_neon on the tmp buffer, with x1 (destination) passed
//      through
//   3. post-rotation of the destination in place, from the middle outwards,
//      including the multiplication by the scale
// Frame (0x20 bytes): x21/x30 at [sp], s0 at [sp, #0x10].
// The pre-rotation loop always consumes at least two groups of four values
// (presumably len4 is a multiple of 4 and at least 8 -- confirm with the
// code that initialises the context).
464 | function ff_celt_imdct_half_neon, export=1 | |
465 | sub sp, sp, #0x20 | |
466 | stp x21, x30, [sp] | |
// the scale is needed only after the FFT call, so it is kept on the stack
467 | str s0, [sp, #0x10] | |
468 | ||
// w5 = field at CELT_LEN2, w6 = the adjacent 32-bit field
469 | ldp w5, w6, [x0, #CELT_LEN2] // CELT_LEN4 | |
// NOTE(review): this copy is overwritten by the ldp into x9/x10 below before
// it is read; the x10 used after the FFT call is set by fft_b15_calc_neon
470 | mov x10, x0 | |
// x21 = destination, preserved across the FFT call
471 | mov x21, x1 | |
472 | sub w5, w5, #1 | |
473 | lsl x7, x3, #3 // 2 * stride * sizeof(float) | |
474 | sub x8, xzr, x3, lsl #3 // -2 * stride * sizeof(float) | |
475 | mul x5, x5, x3 | |
// x9 = field at CELT_TMP, x10 = the adjacent pointer field
476 | ldp x9, x10, [x0, #CELT_TMP] // CELT_TWIDDLE | |
// x3 becomes the fft_tab_neon index for the call below
477 | ldr w3, [x0, #CELT_FFT_N] | |
// x5 = address of source[(len2 - 1) * stride], the backward read pointer
478 | add x5, x2, x5, lsl #2 | |
// keep the start of the tmp buffer; x9 is advanced by the stores
479 | mov x11, x9 | |
480 | ||
// pre-rotation prologue: load the first group of four
// (v0 = from the end, v1 = from the start, v2/v3 = twiddle re/im)
481 | sub w6, w6, #4 | |
482 | ld1 {v0.s}[0], [x5], x8 | |
483 | ld1 {v1.s}[0], [x2], x7 | |
484 | ld1 {v4.4s,v5.4s}, [x10], #32 | |
485 | ld1 {v0.s}[1], [x5], x8 | |
486 | ld1 {v1.s}[1], [x2], x7 | |
487 | uzp1 v2.4s, v4.4s, v5.4s | |
488 | ld1 {v0.s}[2], [x5], x8 | |
489 | ld1 {v1.s}[2], [x2], x7 | |
490 | uzp2 v3.4s, v4.4s, v5.4s | |
491 | ld1 {v0.s}[3], [x5], x8 | |
492 | ld1 {v1.s}[3], [x2], x7 | |
// pre-rotation loop, two groups of four per iteration; the loads of the next
// group are interleaved with the arithmetic of the current one
493 | 1: | |
494 | subs w6, w6, #4 | |
495 | ||
496 | ld1 {v20.s}[0], [x5], x8 | |
497 | ld1 {v21.s}[0], [x2], x7 | |
498 | ld1 {v4.4s,v5.4s}, [x10], #32 | |
499 | ||
// v6 = v0 * tw.re - v1 * tw.im, v7 = v0 * tw.im + v1 * tw.re
500 | fmul v6.4s, v0.4s, v2.4s | |
501 | fmul v7.4s, v0.4s, v3.4s | |
502 | ||
503 | ld1 {v20.s}[1], [x5], x8 | |
504 | ld1 {v21.s}[1], [x2], x7 | |
505 | ||
506 | fmls v6.4s, v1.4s, v3.4s | |
507 | fmla v7.4s, v1.4s, v2.4s | |
508 | ||
509 | ld1 {v20.s}[2], [x5], x8 | |
510 | ld1 {v21.s}[2], [x2], x7 | |
511 | ||
512 | uzp1 v2.4s, v4.4s, v5.4s | |
513 | uzp2 v3.4s, v4.4s, v5.4s | |
514 | ld1 {v20.s}[3], [x5], x8 | |
515 | ld1 {v21.s}[3], [x2], x7 | |
516 | ||
// interleave the two result vectors into (v6, v7) pairs for the tmp buffer
517 | zip1 v4.4s, v6.4s, v7.4s | |
518 | zip2 v5.4s, v6.4s, v7.4s | |
519 | ||
520 | fmul v6.4s, v20.4s, v2.4s | |
521 | fmul v7.4s, v20.4s, v3.4s | |
522 | ||
523 | st1 {v4.4s,v5.4s}, [x9], #32 | |
524 | ||
525 | fmls v6.4s, v21.4s, v3.4s | |
526 | fmla v7.4s, v21.4s, v2.4s | |
527 | ||
// flags still come from the subs at label 1: nothing left to load, so only
// the group in v6/v7 remains to be stored
528 | b.eq 3f | |
529 | ||
530 | subs w6, w6, #4 | |
531 | ld1 {v4.4s,v5.4s}, [x10], #32 | |
532 | ld1 {v0.s}[0], [x5], x8 | |
533 | ld1 {v1.s}[0], [x2], x7 | |
534 | uzp1 v2.4s, v4.4s, v5.4s | |
535 | ld1 {v0.s}[1], [x5], x8 | |
536 | ld1 {v1.s}[1], [x2], x7 | |
537 | uzp2 v3.4s, v4.4s, v5.4s | |
538 | ld1 {v0.s}[2], [x5], x8 | |
539 | ld1 {v1.s}[2], [x2], x7 | |
540 | zip1 v4.4s, v6.4s, v7.4s | |
541 | zip2 v5.4s, v6.4s, v7.4s | |
542 | ld1 {v0.s}[3], [x5], x8 | |
543 | ld1 {v1.s}[3], [x2], x7 | |
544 | ||
545 | st1 {v4.4s,v5.4s}, [x9], #32 | |
546 | ||
547 | b.gt 1b | |
548 | ||
// loop exit with one loaded group still in v0/v1: compute it here
549 | fmul v6.4s, v0.4s, v2.4s | |
550 | fmul v7.4s, v0.4s, v3.4s | |
551 | fmls v6.4s, v1.4s, v3.4s | |
552 | fmla v7.4s, v1.4s, v2.4s | |
// store the last computed group
553 | 3: | |
554 | zip1 v4.4s, v6.4s, v7.4s | |
555 | zip2 v5.4s, v6.4s, v7.4s | |
556 | st1 {v4.4s,v5.4s}, [x9], #32 | |
557 | ||
// FFT call: x0 = context and x1 = destination are unchanged since entry,
// x2 = start of the tmp buffer, x3 = value read from CELT_FFT_N, x4 = 1
558 | mov x2, x11 | |
559 | mov x4, #1 | |
560 | ||
561 | bl fft_b15_calc_neon | |
562 | ||
// x10 holds the context here because fft_b15_calc_neon copied x0 into it
563 | ldr w5, [x10, #CELT_LEN4] | |
564 | ldr x6, [x10, #CELT_TWIDDLE] | |
// v31.s[0] = the scale saved on entry
565 | ldr s31, [sp, #0x10] | |
566 | ||
// post-rotation pointers: x1/x3 walk upwards from the middle of the
// destination / twiddle table, x0/x2 walk downwards from 16 bytes below it;
// x10/x11 are the matching store pointers
567 | add x1, x21, x5, lsl #2 | |
568 | add x3, x6, x5, lsl #2 | |
569 | sub x0, x1, #16 | |
570 | sub x2, x3, #16 | |
571 | mov x8, #-16 | |
572 | mov x7, #16 | |
573 | mov x10, x0 | |
574 | mov x11, x1 | |
575 | ||
// w5 counts the 8-byte elements still to be loaded, four per step
576 | sub w5, w5, #4 | |
577 | ||
578 | ld1 {v0.4s}, [x0], x8 | |
579 | ld1 {v1.4s}, [x1], x7 | |
580 | ld1 {v2.4s}, [x2], x8 | |
581 | ld1 {v3.4s}, [x3], x7 | |
582 | ||
583 | uzp1 v4.4s, v0.4s, v1.4s // z[-i-2, -i-1, +i, i+1].re | |
584 | uzp2 v6.4s, v0.4s, v1.4s // z[-i-2, -i-1, +i, i+1].im | |
585 | ||
586 | uzp1 v5.4s, v2.4s, v3.4s // twiddle_exptab[-i-2, -i-1, +i, i+1].re | |
587 | uzp2 v7.4s, v2.4s, v3.4s // twiddle_exptab[-i-2, -i-1, +i, i+1].im | |
588 | ||
// v1 = im * tw.re + re * tw.im, v0 = im * tw.im - re * tw.re
// (both completed at the top of the loop)
589 | fmul v1.4s, v6.4s, v5.4s | |
590 | fmul v0.4s, v6.4s, v7.4s | |
// post-rotation loop: finishes the current group while loading the next one
591 | 2: | |
592 | subs w5, w5, #4 | |
593 | ||
594 | ld1 {v20.4s}, [x0], x8 | |
595 | ||
596 | fmla v1.4s, v4.4s, v7.4s | |
597 | fmls v0.4s, v4.4s, v5.4s | |
598 | ||
599 | ld1 {v21.4s}, [x1], x7 | |
600 | ||
// ext #8 followed by rev64 reverses the four lanes of v1
601 | ext v1.16b, v1.16b, v1.16b, #8 | |
602 | fmul v0.4s, v0.4s, v31.s[0] | |
603 | ||
604 | ld1 {v2.4s}, [x2], x8 | |
605 | ||
606 | rev64 v1.4s, v1.4s | |
607 | fmul v1.4s, v1.4s, v31.s[0] | |
608 | ||
609 | ld1 {v3.4s}, [x3], x7 | |
610 | ||
611 | zip1 v5.4s, v0.4s, v1.4s | |
612 | zip2 v7.4s, v0.4s, v1.4s | |
613 | ||
614 | uzp1 v4.4s, v20.4s, v21.4s // z[-i-2, -i-1, +i, i+1].re | |
615 | uzp2 v6.4s, v20.4s, v21.4s // z[-i-2, -i-1, +i, i+1].im | |
616 | ||
// the results overwrite the destination locations that were loaded
617 | st1 {v5.4s}, [x10], x8 | |
618 | st1 {v7.4s}, [x11], x7 | |
619 | ||
620 | uzp1 v5.4s, v2.4s, v3.4s // twiddle_exptab[-i-2, -i-1, +i, i+1].re | |
621 | uzp2 v7.4s, v2.4s, v3.4s // twiddle_exptab[-i-2, -i-1, +i, i+1].im | |
622 | ||
623 | fmul v1.4s, v6.4s, v5.4s | |
624 | fmul v0.4s, v6.4s, v7.4s | |
// flags still come from the subs at label 2
625 | b.gt 2b | |
626 | ||
// epilogue: finish and store the last loaded group
627 | fmla v1.4s, v4.4s, v7.4s | |
628 | fmls v0.4s, v4.4s, v5.4s | |
629 | ext v1.16b, v1.16b, v1.16b, #8 | |
630 | fmul v0.4s, v0.4s, v31.s[0] | |
631 | rev64 v1.4s, v1.4s | |
632 | fmul v1.4s, v1.4s, v31.s[0] | |
633 | zip1 v5.4s, v0.4s, v1.4s | |
634 | zip2 v7.4s, v0.4s, v1.4s | |
635 | st1 {v5.4s}, [x10], x8 | |
636 | st1 {v7.4s}, [x11], x7 | |
637 | ||
638 | ldp x21, x30, [sp] | |
639 | add sp, sp, #0x20 | |
640 | ret | |
641 | endfunc | |
642 | ||
// fact5: the two 5-point FFT twiddle factors as { re, im } float pairs, i.e.
// { cos(2*pi/5), sin(2*pi/5), cos(4*pi/5), sin(4*pi/5) }.
// fft_b15_calc_neon loads all four floats into v15; fft5_neon uses them as
// v15.s[0] .. v15.s[3].
643 | // [0] = exp(2 * i * pi / 5), [1] = exp(2 * i * pi * 2 / 5) | |
644 | const fact5, align=4 | |
645 | .float 0.30901699437494745, 0.95105651629515353 | |
646 | .float -0.80901699437494734, 0.58778525229247325 | |
647 | endconst | |