c2422617dde274c1999e4387ef6dc6110a4aea73
[deb_ffmpeg.git] / ffmpeg / libavcodec / aarch64 / opus_imdct_neon.S
1 /*
2 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21 #include "libavutil/aarch64/asm.S"
22
23 #include "asm-offsets.h"
24
// shuffle a, b, c, d
// Emits a 16-byte constant named shuffle_<a><b><c><d>.
// Each argument is a 32-bit lane number n; it expands to the four byte
// indices 4*n .. 4*n+3 of that lane. Used as the index operand of TBL the
// table therefore produces { src[a], src[b], src[c], src[d] } as 32-bit lanes.
25 .macro shuffle a, b, c, d
26 const shuffle_\a\b\c\d, align=4
27 .byte (\a * 4), (\a * 4 + 1), (\a * 4 + 2), (\a * 4 + 3)
28 .byte (\b * 4), (\b * 4 + 1), (\b * 4 + 2), (\b * 4 + 3)
29 .byte (\c * 4), (\c * 4 + 1), (\c * 4 + 2), (\c * 4 + 3)
30 .byte (\d * 4), (\d * 4 + 1), (\d * 4 + 2), (\d * 4 + 3)
31 endconst
32 .endm
33
// The four lane permutations whose addresses fft_b15_calc_neon places in
// x11-x14 and which fft5_neon applies with TBL.
34 shuffle 0, 2, 1, 3
35 shuffle 1, 0, 3, 2
36 shuffle 2, 3, 0, 1
37 shuffle 3, 1, 2, 0
38
39
// fft5_neon: 5-point complex transform on strided input (internal helper,
// register-based calling convention, not AAPCS64).
//
// In:  x1      pointer to the first complex input (interleaved re, im floats)
//      x2      distance between inputs in complex elements (scaled by 8 here)
//      v15     the four fact5 constants (loaded by fft_b15_calc_neon)
//      x11-x14 addresses of shuffle_0213, shuffle_1032, shuffle_2301,
//              shuffle_3120 (set up by fft_b15_calc_neon)
// Out: v0.2s   out[0] = in[0] + in[1] + in[2] + in[3] + in[4]   (re, im)
//      v6.4s   real parts of the four remaining results
//      v7.4s   imaginary parts of the four remaining results
//              (lane order is the one fft15_neon consumes)
// Clobbers: x1, x2, v16-v31.
40 function fft5_neon
41 lsl x2, x2, #3 // stride in bytes (8 bytes per complex float)
42 ld1 {v24.2s}, [x1], x2 // v24 = in[0] (re, im)
// in[1..4] are de-interleaved lane by lane: v25 = re, v26 = im
43 ld2 {v25.s,v26.s}[0], [x1], x2
44 ld2 {v25.s,v26.s}[1], [x1], x2
45 ld2 {v25.s,v26.s}[2], [x1], x2
46 ld2 {v25.s,v26.s}[3], [x1]
47 dup v6.4s, v24.s[0] // accumulators start at in[0].re ...
48 dup v7.4s, v24.s[1] // ... and in[0].im in every lane
49
// First half of the horizontal sum of in[1..4]; the second faddp below
// finishes it so that v0.2s = (sum re, sum im).
50 faddp v0.4s, v25.4s, v26.4s
51 // z[][0], z[][3]
52 fmul v16.4s, v25.4s, v15.s[0] // rr
53 fmul v17.4s, v25.4s, v15.s[1] // ri
54 fmul v18.4s, v26.4s, v15.s[0] // ir
55 fmul v19.4s, v26.4s, v15.s[1] // ii
56 faddp v0.4s, v0.4s, v0.4s
57 // z[][1], z[][2]
58 fmul v20.4s, v25.4s, v15.s[2] // rr
59 fmul v21.4s, v25.4s, v15.s[3] // ri
60 fmul v22.4s, v26.4s, v15.s[2] // ir
61 fmul v23.4s, v26.4s, v15.s[3] // ii
62 fadd v0.2s, v24.2s, v0.2s // out[0]
63
// The shuffle tables are loaded in between the arithmetic; v16-v19 are
// reused for them only after their products have been consumed.
64 // z[0123][0], z[0123][3]
65 fsub v24.4s, v16.4s, v19.4s // (c).re = rr - ii;
66 fadd v27.4s, v16.4s, v19.4s // (d).re = rr + ii;
67 ld1 {v16.16b}, [x11]
68 ld1 {v19.16b}, [x14]
69 fadd v28.4s, v17.4s, v18.4s // (c).im = ri + ir;
70 fsub v31.4s, v18.4s, v17.4s // (d).im = -ri + ir;
71 ld1 {v17.16b}, [x12]
72 // z[0123][1], z[0123][2]
73 fsub v25.4s, v20.4s, v23.4s // (c).re = rr - ii;
74 fadd v26.4s, v20.4s, v23.4s // (d).re = rr + ii;
75 ld1 {v18.16b}, [x13]
76 fadd v29.4s, v21.4s, v22.4s // (c).im = ri + ir;
77 fsub v30.4s, v22.4s, v21.4s // (d).im = -ri + ir;
78
// Permute the lanes of each product vector so that the terms belonging to
// the same output end up in the same lane.
79 //real
80 tbl v20.16b, {v24.16b}, v16.16b
81 tbl v21.16b, {v25.16b}, v17.16b
82 tbl v22.16b, {v26.16b}, v18.16b
83 tbl v23.16b, {v27.16b}, v19.16b
84 //imag
85 tbl v16.16b, {v28.16b}, v16.16b
86 tbl v17.16b, {v29.16b}, v17.16b
87 tbl v18.16b, {v30.16b}, v18.16b
88 tbl v19.16b, {v31.16b}, v19.16b
89
// Sum the four permuted terms onto in[0] (re in v6, im in v7).
90 fadd v6.4s, v6.4s, v20.4s
91 fadd v22.4s, v22.4s, v23.4s
92 fadd v7.4s, v7.4s, v16.4s
93 fadd v18.4s, v18.4s, v19.4s
94
95 fadd v21.4s, v21.4s, v22.4s
96 fadd v17.4s, v17.4s, v18.4s
97 fadd v6.4s, v6.4s, v21.4s
98 fadd v7.4s, v7.4s, v17.4s
99
100 ret
101 endfunc
102
// fft15_neon: 15-point complex transform built from three fft5_neon calls
// (internal helper, register-based calling convention).
//
// In:  x0      output pointer; 15 complex floats (120 bytes) are stored
//      x1      input pointer (interleaved re, im floats)
//      x3      input stride in complex elements
//      v8-v14  twiddles prepared by fft_b15_calc_neon:
//              v8/v9 = exp[1-4] re/im, v10/v11 = exp[6-9] re/im,
//              v12/v13 = exp[11-14] re/im, v14 = exp[5,10] re then im
//      v15, x11-x14  as required by fft5_neon
// Out: x0 advanced past the 15 stored values.
// Clobbers: x1, x2, x8, x9, v0-v7, v16-v31.
// The return address is parked in x9 across the nested calls and the
// function returns with "ret x9"; x30 is not restored.
103 function fft15_neon
104 mov x8, x1 // x8 = in
105 mov x9, x30 // keep the return address, bl overwrites x30
106 add x2, x3, x3, lsl #1 // 3 * stride
107
// 5-point transform of in[1], in[4], in[7], ...: out[0] -> v1, rest -> v2/v3
108 add x1, x8, x3, lsl #3 // in + 1 * stride
109 bl fft5_neon
110 mov v1.8b, v0.8b
111 mov v2.16b, v6.16b
112 mov v3.16b, v7.16b
113
// 5-point transform of in[2], in[5], in[8], ...: rest -> v4/v5
114 add x1, x8, x3, lsl #4 // in + 2 * stride
115 add x2, x3, x3, lsl #1 // 3 * stride
116 bl fft5_neon
// v1 = { a.re, b.re, a.im, b.im } with a/b = out[0] of the two calls above
117 zip1 v1.4s, v1.4s, v0.4s
118 mov v4.16b, v6.16b
119 mov v5.16b, v7.16b
120
// 5-point transform of in[0], in[3], in[6], ...: results stay in v0, v6, v7
121 mov x1, x8 // in + 0 * stride
122 add x2, x3, x3, lsl #1 // 3 * stride
123 bl fft5_neon
124
125 faddp v20.4s, v1.4s, v1.4s // v20.2s = a + b
126
// Rotate exp[1-4] by one lane so that uzp1 can pick every second twiddle.
127 ext v18.16b, v8.16b, v8.16b, #4
128 ext v19.16b, v9.16b, v9.16b, #4
129 mov v16.16b, v6.16b
130 mov v17.16b, v7.16b
131 fadd v20.2s, v20.2s, v0.2s // out[0] = sum of the three out[0] values
132
133 uzp1 v18.4s, v18.4s, v10.4s // exp[2,4,6,8].re
134 uzp1 v19.4s, v19.4s, v11.4s // exp[2,4,6,8].im
135
136 st1 {v20.2s}, [x0], #8 // out[0]
137
// out[1-4] = (v6,v7) + (v2,v3) * exp[1-4] + (v4,v5) * exp[2,4,6,8]
// (complex multiply-accumulate: re += a.re*e.re - a.im*e.im,
//                               im += a.re*e.im + a.im*e.re)
138 fmla v16.4s, v2.4s, v8.4s
139 fmls v16.4s, v3.4s, v9.4s
140
141 fmla v17.4s, v2.4s, v9.4s
142 fmla v17.4s, v3.4s, v8.4s
143
144 fmla v16.4s, v4.4s, v18.4s
145 fmls v16.4s, v5.4s, v19.4s
146
147 fmla v17.4s, v4.4s, v19.4s
148 fmla v17.4s, v5.4s, v18.4s
149
150 zip1 v18.4s, v16.4s, v17.4s // re-interleave re/im for storing
151 zip2 v19.4s, v16.4s, v17.4s
152
// Set up out[5] and out[10]:
//   v30 = exp[5,10,10,5].re, v31 = exp[5,10,10,5].im
//   v28 = { a.re, b.re, a.re, b.re }, v29 = { a.im, b.im, a.im, b.im }
153 rev64 v31.4s, v14.4s
154 trn1 v28.2d, v1.2d, v1.2d
155 trn2 v29.2d, v1.2d, v1.2d
156 zip1 v30.2d, v14.2d, v31.2d
157 zip2 v31.2d, v14.2d, v31.2d
158
159 st1 {v18.4s,v19.4s}, [x0], #32 // out[1-4]
160
161 fmul v16.4s, v28.4s, v30.4s
162 fmul v17.4s, v29.4s, v30.4s
163 fmls v16.4s, v29.4s, v31.4s
164 fmla v17.4s, v28.4s, v31.4s
// pairwise add: lane 0 = a*exp[5] + b*exp[10], lane 1 = a*exp[10] + b*exp[5]
165 faddp v16.4s, v16.4s, v16.4s
166 faddp v17.4s, v17.4s, v17.4s
167 zip1 v18.2s, v16.2s, v17.2s
168 zip2 v19.2s, v16.2s, v17.2s
169
170 fadd v18.2s, v18.2s, v0.2s // out[5]
171 fadd v0.2s, v19.2s, v0.2s // out[10], stored further down
172
173 ext v30.16b, v12.16b, v12.16b, #4
174 ext v31.16b, v13.16b, v13.16b, #4
175 mov v16.16b, v6.16b
176 mov v17.16b, v7.16b
177
178 uzp1 v30.4s, v30.4s, v8.4s // exp[12,14,1,3].re
179 uzp1 v31.4s, v31.4s, v9.4s // exp[12,14,1,3].im
180
181 st1 {v18.2s}, [x0], #8 // out[5]
182
// out[6-9] = (v6,v7) + (v2,v3) * exp[6-9] + (v4,v5) * exp[12,14,1,3]
183 fmla v16.4s, v2.4s, v10.4s
184 fmls v16.4s, v3.4s, v11.4s
185
186 fmla v17.4s, v2.4s, v11.4s
187 fmla v17.4s, v3.4s, v10.4s
188
189 fmla v16.4s, v4.4s, v30.4s
190 fmls v16.4s, v5.4s, v31.4s
191
192 fmla v17.4s, v4.4s, v31.4s
193 fmla v17.4s, v5.4s, v30.4s
194
195 zip1 v18.4s, v16.4s, v17.4s
196 zip2 v19.4s, v16.4s, v17.4s
197
198 ext v30.16b, v10.16b, v10.16b, #4
199 ext v31.16b, v11.16b, v11.16b, #4
200
// out[11-14] = (v6,v7) + (v2,v3) * exp[11-14] + (v4,v5) * exp[7,9,11,13];
// accumulated directly in v6/v7 since they are no longer needed afterwards.
201 fmla v6.4s, v2.4s, v12.4s
202 fmls v6.4s, v3.4s, v13.4s
203
204 st1 {v18.4s,v19.4s}, [x0], #32 // out[6-9]
205
206 uzp1 v30.4s, v30.4s, v12.4s // exp[7,9,11,13].re
207 uzp1 v31.4s, v31.4s, v13.4s // exp[7,9,11,13].im
208
209 fmla v7.4s, v2.4s, v13.4s
210 fmla v7.4s, v3.4s, v12.4s
211
212 st1 {v0.2s}, [x0], #8 // out[10]
213
214 fmla v6.4s, v4.4s, v30.4s
215 fmls v6.4s, v5.4s, v31.4s
216
217 fmla v7.4s, v4.4s, v31.4s
218 fmla v7.4s, v5.4s, v30.4s
219
220 zip1 v18.4s, v6.4s, v7.4s
221 zip2 v19.4s, v6.4s, v7.4s
222
223 st1 {v18.4s,v19.4s}, [x0], #32 // out[11-14]
224
225 ret x9 // return through the address saved at entry
226 endfunc
227
// fft15_pass: in-place butterfly pass that combines two half-size transforms.
// x0: out, x1: out+len2, x2: exptab, x3: len2
//
// For every i in [0, len2):
//     t         = x1[i] * exptab[i]        (complex multiply)
//     x1[i]     = x0[i] - t
//     x0[i]     = x0[i] + t
// All three arrays hold interleaved (re, im) floats.
//
// The first (len2 & 3) elements are processed one at a time, the rest in
// blocks of four with the loop unrolled twice. Label 9 always loads, and
// label 4 or 0 always stores, at least one block of four, so len2 minus its
// scalar head must be at least 4.
//
// Clobbers: x0-x6, v0-v7, v16-v23, flags. Reached by tail call from
// fft30_neon and the def_fft routines and returns to their caller.
229 function fft15_pass
230 ands x6, x3, #3 // x6 = number of leading scalar elements
231 mov x4, x0 // x4 = store cursor for the lower half
232 mov x5, x1 // x5 = store cursor for the upper half
233 b.eq 9f
// First element: plain add/sub, its exptab entry is skipped rather than
// multiplied in (presumably exptab[0] == 1 + 0i -- TODO confirm).
234 ld1 {v0.2s}, [x0], #8
235 ld1 {v1.2s}, [x1], #8
236 sub x3, x3, x6 // x3 = element count left for the vector loop
237 subs x6, x6, #1
238 fadd v2.2s, v0.2s, v1.2s
239 fsub v3.2s, v0.2s, v1.2s
240 add x2, x2, #8
241 st1 {v2.2s}, [x4], #8
242 st1 {v3.2s}, [x5], #8
243 b.eq 9f
// Remaining scalar elements, one complex butterfly per iteration.
244 1:
245 subs x6, x6, #1 // flags are consumed by b.gt at the end
246 ldp s4, s5, [x2], #8 // twiddle (re, im)
247 ldp s2, s3, [x1], #8 // upper-half element
248 ldp s0, s1, [x0], #8 // lower-half element
249
250 fmul s6, s2, s4
251 fmul s7, s2, s5
252 fmls s6, s3, v5.s[0] // t.re = re*tw.re - im*tw.im
253 fmla s7, s3, v4.s[0] // t.im = re*tw.im + im*tw.re
254
255 fsub s2, s0, s6
256 fsub s3, s1, s7
257 fadd s0, s0, s6
258 fadd s1, s1, s7
259
260 stp s2, s3, [x5], #8
261 stp s0, s1, [x4], #8
262 b.gt 1b
// Vector part. Preload the first block: v6/v7 = twiddle re/im,
// v2/v3 = upper-half re/im, v0/v1 = lower-half re/im.
263 9:
264 ld1 {v4.4s,v5.4s}, [x2], #32
265 ld2 {v2.4s,v3.4s}, [x1], #32
266 uzp1 v6.4s, v4.4s, v5.4s
267 uzp2 v7.4s, v4.4s, v5.4s
268 ld2 {v0.4s,v1.4s}, [x0], #32
// Each trip handles two blocks of four. x3 < 0 after the subtraction means
// only the preloaded block is left (-> 4), x3 == 0 means exactly two are
// left (-> 0). No instruction between subs and b.eq modifies the flags.
269 8:
270 subs x3, x3, #8
271
272 fmul v4.4s, v2.4s, v6.4s
273 fmul v5.4s, v2.4s, v7.4s
274 b.lt 4f
275
276 ld1 {v18.4s,v19.4s}, [x2], #32 // twiddles of the second block
277
278 fmls v4.4s, v3.4s, v7.4s
279 fmla v5.4s, v3.4s, v6.4s
280
281 ld2 {v22.4s,v23.4s}, [x1], #32 // upper half of the second block
282
283 fsub v2.4s, v0.4s, v4.4s
284 fadd v0.4s, v0.4s, v4.4s
285 fsub v3.4s, v1.4s, v5.4s
286 fadd v1.4s, v1.4s, v5.4s
287
288 uzp1 v16.4s, v18.4s, v19.4s
289 uzp2 v17.4s, v18.4s, v19.4s
290
291 st2 {v2.4s,v3.4s}, [x5], #32
292 st2 {v0.4s,v1.4s}, [x4], #32
293 ld2 {v20.4s,v21.4s}, [x0], #32 // lower half of the second block
294
295 fmul v18.4s, v22.4s, v16.4s
296 fmul v19.4s, v22.4s, v17.4s
297 b.eq 0f
298
299 ld1 {v4.4s,v5.4s}, [x2], #32 // preload the next first block
300
301 fmls v18.4s, v23.4s, v17.4s
302 fmla v19.4s, v23.4s, v16.4s
303
304 ld2 {v2.4s,v3.4s}, [x1], #32
305
306 fsub v22.4s, v20.4s, v18.4s
307 fadd v20.4s, v20.4s, v18.4s
308 fsub v23.4s, v21.4s, v19.4s
309 fadd v21.4s, v21.4s, v19.4s
310
311 uzp1 v6.4s, v4.4s, v5.4s
312 uzp2 v7.4s, v4.4s, v5.4s
313
314 st2 {v22.4s,v23.4s}, [x5], #32
315 st2 {v20.4s,v21.4s}, [x4], #32
316 ld2 {v0.4s,v1.4s}, [x0], #32
317
318 b 8b
// Tail: finish the block held in v0-v7 without loading anything further.
319 4:
320 fmls v4.4s, v3.4s, v7.4s
321 fmla v5.4s, v3.4s, v6.4s
322
323 fsub v2.4s, v0.4s, v4.4s
324 fadd v0.4s, v0.4s, v4.4s
325 fsub v3.4s, v1.4s, v5.4s
326 fadd v1.4s, v1.4s, v5.4s
327
328 st2 {v2.4s,v3.4s}, [x5], #32
329 st2 {v0.4s,v1.4s}, [x4], #32
330
331 ret
// Tail: finish the block held in v16-v23 without loading anything further.
332 0:
333 fmls v18.4s, v23.4s, v17.4s
334 fmla v19.4s, v23.4s, v16.4s
335
336 fsub v22.4s, v20.4s, v18.4s
337 fadd v20.4s, v20.4s, v18.4s
338 fsub v23.4s, v21.4s, v19.4s
339 fadd v21.4s, v21.4s, v19.4s
340
341 st2 {v22.4s,v23.4s}, [x5], #32
342 st2 {v20.4s,v21.4s}, [x4], #32
343
344 ret
345 endfunc
346
// fft30_neon: 30-point transform = two fft15_neon calls plus one fft15_pass.
//
// In:  x1   output pointer (30 complex floats)
//      x2   input pointer
//      x4   input stride in complex elements
//      x10  context pointer; s->exptab[1] is read at CELT_EXPTAB + 8
//      v8-v15, x11-x14 as required by fft15_neon
// x20-x22 and x30 are saved on the stack and restored before the tail call.
347 function fft30_neon, align=6
348 sub sp, sp, #0x20
349 stp x20, x21, [sp]
350 stp x22, x30, [sp, #0x10]
351 mov x21, x1 // x21 = out
352 mov x22, x2 // x22 = in
353 mov x20, x4 // x20 = stride
// first half: fft15(out, in, 2 * stride)
354 mov x0, x21
355 mov x1, x22
356 lsl x3, x20, #1
357 bl fft15_neon
358
// second half: fft15(out + 15, in + stride, 2 * stride)
359 add x0, x21, #15*8
360 add x1, x22, x20, lsl #3
361 lsl x3, x20, #1
362 bl fft15_neon
363
// Combine both halves: fft15_pass(out, out + 15, exptab[1], 15).
// The saved registers are restored first so that fft15_pass can return
// straight to this function's caller.
364 ldr x2, [x10, #(CELT_EXPTAB + 8)] // s->exptab[1]
365 add x0, x21, #0
366 add x1, x21, #15*8
367 mov x3, #15
368 ldp x20, x21, [sp]
369 ldp x22, x30, [sp, #0x10]
370 add sp, sp, #0x20
371 b fft15_pass
372 endfunc
373
// def_fft n, n2: define fft<n>_neon in terms of two fft<n2>_neon calls
// (n2 is n / 2 in every instantiation below) followed by one fft15_pass.
//
// Generated function, register-based calling convention:
// In:  x1   output pointer (n complex floats)
//      x2   input pointer
//      x3   N, the index of this size's table in s->exptab[]
//      x4   input stride in complex elements
//      x10  context pointer (CELT_EXPTAB is read relative to it)
// x20-x24 and x30 are saved on the stack; x24 is saved and restored as the
// second half of the x23 pair but is not otherwise used here.
374 .macro def_fft n, n2
375 function fft\n\()_neon, align=6
376 sub sp, sp, #0x30
377 stp x20, x21, [sp]
378 stp x22, x30, [sp, #0x10]
379 stp x23, x24, [sp, #0x20]
380 mov x21, x1 // x21 = out
381 mov x22, x2 // x22 = in
382 mov x23, x3 // x23 = N
383 mov x20, x4 // x20 = stride
// first half: fft<n2>(out, in, N - 1, 2 * stride); x1 and x2 pass through
384 sub x3, x3, #1
385 lsl x4, x4, #1
386 bl fft\n2\()_neon
387
// second half: fft<n2>(out + n2, in + stride, N - 1, 2 * stride)
388 add x1, x21, #(\n2 * 8)
389 add x2, x22, x20, lsl #3
390 sub x3, x23, #1
391 lsl x4, x20, #1
392 bl fft\n2\()_neon
393
// Combine both halves: fft15_pass(out, out + n2, exptab[N], n2), entered by
// tail call after the saved registers have been restored.
394 add x5, x10, #CELT_EXPTAB
395 mov x0, x21
396 ldr x2, [x5, x23, lsl #3] // s->exptab[N]
397 add x1, x21, #(\n2 * 8)
398 mov x3, #\n2
399 ldp x20, x21, [sp]
400 ldp x22, x30, [sp, #0x10]
401 ldp x23, x24, [sp, #0x20]
402 add sp, sp, #0x30
403 b fft15_pass
404 endfunc
405 .endm
406
// Each size is built on the previous one; fft30_neon is the hand-written base.
407 def_fft 60, 30
408 def_fft 120, 60
409 def_fft 240, 120
410 def_fft 480, 240
411 def_fft 960, 480
412
// fft_b15_calc_neon: set up the constant registers shared by all FFT
// routines in this file and dispatch to the one selected by x3.
//
// In:  x0   context pointer (s)
//      x3   index into fft_tab_neon
//      x1, x2, x4 are not touched here and reach the selected routine
//      unchanged (out, in and stride for fft30_neon and the def_fft routines)
// Sets up for the callee:
//      x10      = context pointer
//      x11-x14  = addresses of shuffle_0213 / _1032 / _2301 / _3120
//      v8-v13   = exp[1-4], exp[6-9], exp[11-14] split into re and im
//      v14      = exp[5,10] re followed by exp[5,10] im
//      v15      = fact5
// x10 still holds the context pointer on return; ff_celt_imdct_half_neon
// relies on that.
// d8-d15 are saved and restored because v8-v15 are used as constants; x20
// and x30 are saved alongside them.
//
// NOTE(review): entry 0 of fft_tab_neon is fft15_neon, which takes out, in
// and stride in x0, x1 and x3 rather than x1, x2 and x4. With x3 == 0 this
// dispatcher would hand it the context pointer as output pointer. Presumably
// index 0 is never requested by the callers -- confirm.
413 function fft_b15_calc_neon
414 sub sp, sp, #0x50
415 ldr x8, [x0, #CELT_EXPTAB] // s->exptab[0]
416 movrel x6, fact5
417 movrel x11, shuffle_0213
418 movrel x12, shuffle_1032
419 movrel x13, shuffle_2301
420 movrel x14, shuffle_3120
421 add x8, x8, #8 // skip exp[0]
422 movrel x5, fft_tab_neon
423 stp x20, x30, [sp]
424 stp d8, d9, [sp, #0x10]
425 stp d10, d11, [sp, #0x20]
426 stp d12, d13, [sp, #0x30]
427 stp d14, d15, [sp, #0x40]
428 ld1 {v15.4s}, [x6]
429 ld1 {v0.4s,v1.4s}, [x8], #32 // exp[1-4], interleaved re/im
430 ld1 {v6.2s}, [x8], #8 // exp[5]
431 ld1 {v2.4s,v3.4s}, [x8], #32 // exp[6-9]
432 ld1 {v7.2s}, [x8], #8 // exp[10]
433 ld1 {v4.4s,v5.4s}, [x8], #32 // exp[11-14]
434 uzp1 v8.4s, v0.4s, v1.4s // exp[ 1 - 4].re
435 uzp2 v9.4s, v0.4s, v1.4s // exp[ 1 - 4].im
436 uzp1 v10.4s, v2.4s, v3.4s // exp[ 6 - 9].re
437 uzp2 v11.4s, v2.4s, v3.4s // exp[ 6 - 9].im
438 uzp1 v12.4s, v4.4s, v5.4s // exp[11 - 14].re
439 uzp2 v13.4s, v4.4s, v5.4s // exp[11 - 14].im
440 zip1 v14.4s, v6.4s, v7.4s // exp[5,10].re/exp[5,10].im
// fft_tab_neon stores offsets relative to its own address.
441 ldr x6, [x5, x3, lsl #3]
442 add x5, x5, x6
443 mov x10, x0
444 blr x5
445 ldp x20, x30, [sp]
446 ldp d8, d9, [sp, #0x10]
447 ldp d10, d11, [sp, #0x20]
448 ldp d12, d13, [sp, #0x30]
449 ldp d14, d15, [sp, #0x40]
450 add sp, sp, #0x50
451 ret
452 endfunc
453
// Dispatch table used by fft_b15_calc_neon: entry k is the 64-bit offset of
// the routine for 15 << k points, measured from the start of this table, so
// the table contains no absolute addresses.
454 const fft_tab_neon
455 .quad fft15_neon - fft_tab_neon
456 .quad fft30_neon - fft_tab_neon
457 .quad fft60_neon - fft_tab_neon
458 .quad fft120_neon - fft_tab_neon
459 .quad fft240_neon - fft_tab_neon
460 .quad fft480_neon - fft_tab_neon
461 .quad fft960_neon - fft_tab_neon
462 endconst
463
// ff_celt_imdct_half_neon: exported entry point. Three stages:
//   1. pre-rotation: strided source samples are multiplied by the twiddle
//      table and written as complex values to the context's tmp buffer
//   2. FFT of the tmp buffer into dst via fft_b15_calc_neon
//   3. post-rotation: dst is multiplied by the twiddle table again, scaled
//      and rewritten in place, working from the middle outwards
//
// In:  x0   context pointer; the fields at CELT_LEN2/CELT_LEN4, CELT_TMP/
//           CELT_TWIDDLE (each pair loaded with one ldp) and CELT_FFT_N are read
//      x1   dst
//      x2   src
//      x3   src stride in floats
//      s0   scale applied in the post-rotation
// Stack frame (0x20 bytes): x21, x30 and the scale factor.
// Both loops handle four complex values per step, so len4 is expected to be
// a non-zero multiple of 4.
464 function ff_celt_imdct_half_neon, export=1
465 sub sp, sp, #0x20
466 stp x21, x30, [sp]
467 str s0, [sp, #0x10] // keep scale across the FFT call
468
469 ldp w5, w6, [x0, #CELT_LEN2] // w5 = len2, w6 = len4
// (a dead "mov x10, x0" used to sit here: x10 is overwritten by the ldp
// below before it is read, and fft_b15_calc_neon sets it for the code after
// the call)
471 mov x21, x1 // x21 = dst, survives the call
472 sub w5, w5, #1
473 lsl x7, x3, #3 // 2 * stride * sizeof(float)
474 sub x8, xzr, x3, lsl #3 // -2 * stride * sizeof(float)
475 mul x5, x5, x3
476 ldp x9, x10, [x0, #CELT_TMP] // x9 = tmp, x10 = twiddle table
477 ldr w3, [x0, #CELT_FFT_N] // x3 = table index for fft_b15_calc_neon
478 add x5, x2, x5, lsl #2 // x5 = &src[(len2 - 1) * stride]
479 mov x11, x9 // remember the start of tmp
480
// Pre-rotation. x5 walks src backwards, x2 forwards, both in steps of
// 2 * stride floats. With b = backward sample, f = forward sample:
//     tmp.re = b * tw.re - f * tw.im
//     tmp.im = b * tw.im + f * tw.re
// The loop is software pipelined: the loads for the next group of four are
// issued while the current group is multiplied and stored.
481 sub w6, w6, #4
482 ld1 {v0.s}[0], [x5], x8
483 ld1 {v1.s}[0], [x2], x7
484 ld1 {v4.4s,v5.4s}, [x10], #32
485 ld1 {v0.s}[1], [x5], x8
486 ld1 {v1.s}[1], [x2], x7
487 uzp1 v2.4s, v4.4s, v5.4s // tw.re
488 ld1 {v0.s}[2], [x5], x8
489 ld1 {v1.s}[2], [x2], x7
490 uzp2 v3.4s, v4.4s, v5.4s // tw.im
491 ld1 {v0.s}[3], [x5], x8
492 ld1 {v1.s}[3], [x2], x7
493 1:
494 subs w6, w6, #4 // flags consumed by b.eq 3f below
495
496 ld1 {v20.s}[0], [x5], x8
497 ld1 {v21.s}[0], [x2], x7
498 ld1 {v4.4s,v5.4s}, [x10], #32
499
500 fmul v6.4s, v0.4s, v2.4s
501 fmul v7.4s, v0.4s, v3.4s
502
503 ld1 {v20.s}[1], [x5], x8
504 ld1 {v21.s}[1], [x2], x7
505
506 fmls v6.4s, v1.4s, v3.4s
507 fmla v7.4s, v1.4s, v2.4s
508
509 ld1 {v20.s}[2], [x5], x8
510 ld1 {v21.s}[2], [x2], x7
511
512 uzp1 v2.4s, v4.4s, v5.4s
513 uzp2 v3.4s, v4.4s, v5.4s
514 ld1 {v20.s}[3], [x5], x8
515 ld1 {v21.s}[3], [x2], x7
516
517 zip1 v4.4s, v6.4s, v7.4s // interleave re/im for storing
518 zip2 v5.4s, v6.4s, v7.4s
519
520 fmul v6.4s, v20.4s, v2.4s
521 fmul v7.4s, v20.4s, v3.4s
522
523 st1 {v4.4s,v5.4s}, [x9], #32
524
525 fmls v6.4s, v21.4s, v3.4s
526 fmla v7.4s, v21.4s, v2.4s
527
528 b.eq 3f // last group is in v6/v7, store it at 3
529
530 subs w6, w6, #4 // flags consumed by b.gt 1b below
531 ld1 {v4.4s,v5.4s}, [x10], #32
532 ld1 {v0.s}[0], [x5], x8
533 ld1 {v1.s}[0], [x2], x7
534 uzp1 v2.4s, v4.4s, v5.4s
535 ld1 {v0.s}[1], [x5], x8
536 ld1 {v1.s}[1], [x2], x7
537 uzp2 v3.4s, v4.4s, v5.4s
538 ld1 {v0.s}[2], [x5], x8
539 ld1 {v1.s}[2], [x2], x7
540 zip1 v4.4s, v6.4s, v7.4s
541 zip2 v5.4s, v6.4s, v7.4s
542 ld1 {v0.s}[3], [x5], x8
543 ld1 {v1.s}[3], [x2], x7
544
545 st1 {v4.4s,v5.4s}, [x9], #32
546
547 b.gt 1b
548
// Loop left with one more group already loaded into v0-v3.
549 fmul v6.4s, v0.4s, v2.4s
550 fmul v7.4s, v0.4s, v3.4s
551 fmls v6.4s, v1.4s, v3.4s
552 fmla v7.4s, v1.4s, v2.4s
553 3:
554 zip1 v4.4s, v6.4s, v7.4s
555 zip2 v5.4s, v6.4s, v7.4s
556 st1 {v4.4s,v5.4s}, [x9], #32
557
// FFT: x0 = context, x1 = dst (both unchanged since entry), x2 = tmp,
// x3 = fft_n, x4 = stride 1.
558 mov x2, x11
559 mov x4, #1
560
561 bl fft_b15_calc_neon
562
// fft_b15_calc_neon leaves the context pointer in x10.
563 ldr w5, [x10, #CELT_LEN4]
564 ldr x6, [x10, #CELT_TWIDDLE]
565 ldr s31, [sp, #0x10] // scale
566
// Post-rotation. x1/x3 start at the middle of dst / the twiddle table and
// move up, x0/x2 start 16 bytes (two complex values) below and move down;
// x10/x11 are the matching store cursors.
567 add x1, x21, x5, lsl #2
568 add x3, x6, x5, lsl #2
569 sub x0, x1, #16
570 sub x2, x3, #16
571 mov x8, #-16
572 mov x7, #16
573 mov x10, x0
574 mov x11, x1
575
576 sub w5, w5, #4
577
578 ld1 {v0.4s}, [x0], x8
579 ld1 {v1.4s}, [x1], x7
580 ld1 {v2.4s}, [x2], x8
581 ld1 {v3.4s}, [x3], x7
582
583 uzp1 v4.4s, v0.4s, v1.4s // z[-i-2, -i-1, +i, i+1].re
584 uzp2 v6.4s, v0.4s, v1.4s // z[-i-2, -i-1, +i, i+1].im
585
586 uzp1 v5.4s, v2.4s, v3.4s // twiddle_exptab[-i-2, -i-1, +i, i+1].re
587 uzp2 v7.4s, v2.4s, v3.4s // twiddle_exptab[-i-2, -i-1, +i, i+1].im
588
589 fmul v1.4s, v6.4s, v5.4s
590 fmul v0.4s, v6.4s, v7.4s
591 2:
592 subs w5, w5, #4 // flags consumed by b.gt 2b
593
594 ld1 {v20.4s}, [x0], x8
595
596 fmla v1.4s, v4.4s, v7.4s // v1 = z.im * tw.re + z.re * tw.im
597 fmls v0.4s, v4.4s, v5.4s // v0 = z.im * tw.im - z.re * tw.re
598
599 ld1 {v21.4s}, [x1], x7
600
// ext #8 followed by rev64 reverses the order of the four lanes of v1.
601 ext v1.16b, v1.16b, v1.16b, #8
602 fmul v0.4s, v0.4s, v31.s[0]
603
604 ld1 {v2.4s}, [x2], x8
605
606 rev64 v1.4s, v1.4s
607 fmul v1.4s, v1.4s, v31.s[0]
608
609 ld1 {v3.4s}, [x3], x7
610
611 zip1 v5.4s, v0.4s, v1.4s
612 zip2 v7.4s, v0.4s, v1.4s
613
614 uzp1 v4.4s, v20.4s, v21.4s // z[-i-2, -i-1, +i, i+1].re
615 uzp2 v6.4s, v20.4s, v21.4s // z[-i-2, -i-1, +i, i+1].im
616
617 st1 {v5.4s}, [x10], x8
618 st1 {v7.4s}, [x11], x7
619
620 uzp1 v5.4s, v2.4s, v3.4s // twiddle_exptab[-i-2, -i-1, +i, i+1].re
621 uzp2 v7.4s, v2.4s, v3.4s // twiddle_exptab[-i-2, -i-1, +i, i+1].im
622
623 fmul v1.4s, v6.4s, v5.4s
624 fmul v0.4s, v6.4s, v7.4s
625 b.gt 2b
626
// Drain the pipeline: finish and store the group that is still in flight.
627 fmla v1.4s, v4.4s, v7.4s
628 fmls v0.4s, v4.4s, v5.4s
629 ext v1.16b, v1.16b, v1.16b, #8
630 fmul v0.4s, v0.4s, v31.s[0]
631 rev64 v1.4s, v1.4s
632 fmul v1.4s, v1.4s, v31.s[0]
633 zip1 v5.4s, v0.4s, v1.4s
634 zip2 v7.4s, v0.4s, v1.4s
635 st1 {v5.4s}, [x10], x8
636 st1 {v7.4s}, [x11], x7
637
638 ldp x21, x30, [sp]
639 add sp, sp, #0x20
640 ret
641 endfunc
642
// Constants for the 5-point transform, stored as (re, im) float pairs:
// [0] = exp(2 * i * pi / 5), [1] = exp(2 * i * pi * 2 / 5)
// fft_b15_calc_neon loads all four floats into v15; fft5_neon reads them as
// v15.s[0] .. v15.s[3].
644 const fact5, align=4
645 .float 0.30901699437494745, 0.95105651629515353
646 .float -0.80901699437494734, 0.58778525229247325
647 endconst