2 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 #include "libavutil/aarch64/asm.S"
23 #include "asm-offsets.h"
// shuffle a, b, c, d
// Emits a 16-byte constant whose name is "shuffle_" followed by the four
// arguments. Each argument is treated as a 32-bit lane number and expands to
// the four consecutive byte offsets 4*lane .. 4*lane+3, so the table has the
// layout of a byte-index vector selecting lanes a, b, c, d in that order.
// NOTE(review): the meaning of align=4 is defined by the const macro in
// asm.S; confirm there whether it is a byte count or a power of two.
25 .macro shuffle a, b, c, d
26 const shuffle_\a\b\c\d, align=4
27 .byte (\a * 4), (\a * 4 + 1), (\a * 4 + 2), (\a * 4 + 3)
28 .byte (\b * 4), (\b * 4 + 1), (\b * 4 + 2), (\b * 4 + 3)
29 .byte (\c * 4), (\c * 4 + 1), (\c * 4 + 2), (\c * 4 + 3)
30 .byte (\d * 4), (\d * 4 + 1), (\d * 4 + 2), (\d * 4 + 3)
// Five-point FFT kernel body.
// Gather five pairs of floats starting at x1; x1 advances by x2 after each
// load except the last. v24.2s receives the first pair as stored. The ld2
// lane loads split the other four pairs: v25 collects the first float of
// each pair and v26 the second (re and im, going by the rr/ri/ir/ii
// comments below).
42 ld1 {v24.2s}, [x1], x2
43 ld2 {v25.s,v26.s}[0], [x1], x2
44 ld2 {v25.s,v26.s}[1], [x1], x2
45 ld2 {v25.s,v26.s}[2], [x1], x2
46 ld2 {v25.s,v26.s}[3], [x1]
// First step of a horizontal sum over the v25 lanes and the v26 lanes.
50 faddp v0.4s, v25.4s, v26.4s
// Products of every input with lanes 0 and 1 of v15.
// NOTE(review): v15 is not loaded in the lines shown here; presumably it
// holds the constant table defined at the end of this file -- confirm.
52 fmul v16.4s, v25.4s, v15.s[0] // rr
53 fmul v17.4s, v25.4s, v15.s[1] // ri
54 fmul v18.4s, v26.4s, v15.s[0] // ir
55 fmul v19.4s, v26.4s, v15.s[1] // ii
// Second pairwise add: v0.2s = (sum of v25 lanes, sum of v26 lanes).
56 faddp v0.4s, v0.4s, v0.4s
// Same four products with lanes 2 and 3 of v15.
58 fmul v20.4s, v25.4s, v15.s[2] // rr
59 fmul v21.4s, v25.4s, v15.s[3] // ri
60 fmul v22.4s, v26.4s, v15.s[2] // ir
61 fmul v23.4s, v26.4s, v15.s[3] // ii
// First input plus the sum of the other four.
62 fadd v0.2s, v24.2s, v0.2s // out[0]
// Combine the products into (c) = rr - ii, ri + ir and
// (d) = rr + ii, ir - ri for both constant pairs.
64 // z[0123][0], z[0123][3]
65 fsub v24.4s, v16.4s, v19.4s // (c).re = rr - ii;
66 fadd v27.4s, v16.4s, v19.4s // (d).re = rr + ii;
69 fadd v28.4s, v17.4s, v18.4s // (c).im = ri + ir;
70 fsub v31.4s, v18.4s, v17.4s // (d).im = -ri + ir;
72 // z[0123][1], z[0123][2]
73 fsub v25.4s, v20.4s, v23.4s // (c).re = rr - ii;
74 fadd v26.4s, v20.4s, v23.4s // (d).re = rr + ii;
76 fadd v29.4s, v21.4s, v22.4s // (c).im = ri + ir;
77 fsub v30.4s, v22.4s, v21.4s // (d).im = -ri + ir;
// Permute the four real-part vectors with byte table lookups.
// NOTE(review): v16-v19 are index vectors at this point although they held
// products above, so they must be reloaded with permutation tables after
// those products are consumed; confirm against the shuffle_* constants.
80 tbl v20.16b, {v24.16b}, v16.16b
81 tbl v21.16b, {v25.16b}, v17.16b
82 tbl v22.16b, {v26.16b}, v18.16b
83 tbl v23.16b, {v27.16b}, v19.16b
// Same permutations for the imaginary parts; each lookup overwrites its
// own index vector with the result.
85 tbl v16.16b, {v28.16b}, v16.16b
86 tbl v17.16b, {v29.16b}, v17.16b
87 tbl v18.16b, {v30.16b}, v18.16b
88 tbl v19.16b, {v31.16b}, v19.16b
// Reduction: v6 += v20 + v21 + v22 + v23 and v7 += v16 + v17 + v18 + v19.
// NOTE(review): the starting values of v6 and v7 are set outside the lines
// shown here.
90 fadd v6.4s, v6.4s, v20.4s
91 fadd v22.4s, v22.4s, v23.4s
92 fadd v7.4s, v7.4s, v16.4s
93 fadd v18.4s, v18.4s, v19.4s
95 fadd v21.4s, v21.4s, v22.4s
96 fadd v17.4s, v17.4s, v18.4s
97 fadd v6.4s, v6.4s, v21.4s
98 fadd v7.4s, v7.4s, v17.4s
// Fifteen-point FFT body: combines the results of three five-point
// transforms with the exponent table held in v8-v14 and writes fifteen
// complex outputs through x0.
// Address setup for the three sub-transforms: x8 is the input base, x3 the
// stride, x1 the start of the current sub-sequence and x2 = 3 * x3.
106 add x2, x3, x3, lsl #1 // 3 * stride
108 add x1, x8, x3, lsl #3 // in + 1 * stride
114 add x1, x8, x3, lsl #4 // in + 2 * stride
115 add x2, x3, x3, lsl #1 // 3 * stride
// Interleave the low halves of v1 and v0: v1 = (v1[0], v0[0], v1[1], v0[1]).
117 zip1 v1.4s, v1.4s, v0.4s
121 mov x1, x8 // in + 0 * stride
122 add x2, x3, x3, lsl #1 // 3 * stride
// out[0]: pairwise sums of v1 plus the v0.2s value of the last sub-transform.
125 faddp v20.4s, v1.4s, v1.4s
// Rotate the exponent vectors by one lane, then keep every second entry.
// With v8/v9 = exp[1-4] and v10/v11 = exp[6-9] this yields exp[2,4,6,8].
127 ext v18.16b, v8.16b, v8.16b, #4
128 ext v19.16b, v9.16b, v9.16b, #4
131 fadd v20.2s, v20.2s, v0.2s
133 uzp1 v18.4s, v18.4s, v10.4s // exp[2,4,6,8].re
134 uzp1 v19.4s, v19.4s, v11.4s // exp[2,4,6,8].im
136 st1 {v20.2s}, [x0], #8 // out[0]
// Complex multiply-accumulate into v16 (re) and v17 (im):
//   (v2, v3) * (v8, v9) + (v4, v5) * (v18, v19)
// NOTE(review): v16/v17 are accumulators here; their initial values are set
// outside the lines shown.
138 fmla v16.4s, v2.4s, v8.4s
139 fmls v16.4s, v3.4s, v9.4s
141 fmla v17.4s, v2.4s, v9.4s
142 fmla v17.4s, v3.4s, v8.4s
144 fmla v16.4s, v4.4s, v18.4s
145 fmls v16.4s, v5.4s, v19.4s
147 fmla v17.4s, v4.4s, v19.4s
148 fmla v17.4s, v5.4s, v18.4s
// Re-interleave re and im for storing.
150 zip1 v18.4s, v16.4s, v17.4s
151 zip2 v19.4s, v16.4s, v17.4s
// out[5] and out[10]: duplicate each 64-bit half of v1 and pair it with
// the halves of v14 and v31.
// NOTE(review): v31 is read as an input by the zip1 below; it is prepared
// outside the lines shown.
154 trn1 v28.2d, v1.2d, v1.2d
155 trn2 v29.2d, v1.2d, v1.2d
156 zip1 v30.2d, v14.2d, v31.2d
157 zip2 v31.2d, v14.2d, v31.2d
159 st1 {v18.4s,v19.4s}, [x0], #32 // out[1-4]
// v16 = v28 * v30 - v29 * v31, v17 = v29 * v30 + v28 * v31, then pairwise
// sums and interleaving give two pairs, each added to v0.2s.
161 fmul v16.4s, v28.4s, v30.4s
162 fmul v17.4s, v29.4s, v30.4s
163 fmls v16.4s, v29.4s, v31.4s
164 fmla v17.4s, v28.4s, v31.4s
165 faddp v16.4s, v16.4s, v16.4s
166 faddp v17.4s, v17.4s, v17.4s
167 zip1 v18.2s, v16.2s, v17.2s
168 zip2 v19.2s, v16.2s, v17.2s
170 fadd v18.2s, v18.2s, v0.2s
171 fadd v0.2s, v19.2s, v0.2s
// Second-term exponents for out[6-9]: lanes 1 and 3 of v12/v13 followed by
// lanes 0 and 2 of v8/v9, i.e. exp[12,14,1,3] with the layout noted above.
173 ext v30.16b, v12.16b, v12.16b, #4
174 ext v31.16b, v13.16b, v13.16b, #4
178 uzp1 v30.4s, v30.4s, v8.4s
179 uzp1 v31.4s, v31.4s, v9.4s
181 st1 {v18.2s}, [x0], #8 // out[5]
// Same complex multiply-accumulate as for out[1-4], using exp[6-9]
// (v10/v11) for the first term.
183 fmla v16.4s, v2.4s, v10.4s
184 fmls v16.4s, v3.4s, v11.4s
186 fmla v17.4s, v2.4s, v11.4s
187 fmla v17.4s, v3.4s, v10.4s
189 fmla v16.4s, v4.4s, v30.4s
190 fmls v16.4s, v5.4s, v31.4s
192 fmla v17.4s, v4.4s, v31.4s
193 fmla v17.4s, v5.4s, v30.4s
195 zip1 v18.4s, v16.4s, v17.4s
196 zip2 v19.4s, v16.4s, v17.4s
// Second-term exponents for out[11-14]: lanes 1 and 3 of v10/v11 followed
// by lanes 0 and 2 of v12/v13, i.e. exp[7,9,11,13].
198 ext v30.16b, v10.16b, v10.16b, #4
199 ext v31.16b, v11.16b, v11.16b, #4
// The last group accumulates directly into v6 (re) and v7 (im).
201 fmla v6.4s, v2.4s, v12.4s
202 fmls v6.4s, v3.4s, v13.4s
204 st1 {v18.4s,v19.4s}, [x0], #32 // out[6-9]
206 uzp1 v30.4s, v30.4s, v12.4s
207 uzp1 v31.4s, v31.4s, v13.4s
209 fmla v7.4s, v2.4s, v13.4s
210 fmla v7.4s, v3.4s, v12.4s
212 st1 {v0.2s}, [x0], #8 // out[10]
214 fmla v6.4s, v4.4s, v30.4s
215 fmls v6.4s, v5.4s, v31.4s
217 fmla v7.4s, v4.4s, v31.4s
218 fmla v7.4s, v5.4s, v30.4s
220 zip1 v18.4s, v6.4s, v7.4s
221 zip2 v19.4s, v6.4s, v7.4s
223 st1 {v18.4s,v19.4s}, [x0], #32 // out[11-14]
// Radix-2 combination pass over two half-length spectra.
// For each index the value read through x1 is multiplied by the matching
// exptab entry; the sum with the value read through x0 is stored through
// x4 and the difference through x5.
// NOTE(review): x4 and x5 are output cursors set up outside the lines
// shown; presumably they start at the original x0 and x1 -- confirm.
228 // x0: out, x1: out+len2, x2: exptab, x3: len2
// First element: plain sum and difference, no twiddle multiply.
234 ld1 {v0.2s}, [x0], #8
235 ld1 {v1.2s}, [x1], #8
238 fadd v2.2s, v0.2s, v1.2s
239 fsub v3.2s, v0.2s, v1.2s
241 st1 {v2.2s}, [x4], #8
242 st1 {v3.2s}, [x5], #8
// Pipeline start: load four twiddles and split them into re (v6) and
// im (v7); ld2 loads the data already split into re and im registers.
264 ld1 {v4.4s,v5.4s}, [x2], #32
265 ld2 {v2.4s,v3.4s}, [x1], #32
266 uzp1 v6.4s, v4.4s, v5.4s
267 uzp2 v7.4s, v4.4s, v5.4s
268 ld2 {v0.4s,v1.4s}, [x0], #32
// (v4, v5) = (v2, v3) * (v6, v7) as a complex product; finished by the
// fmls/fmla pair below.
272 fmul v4.4s, v2.4s, v6.4s
273 fmul v5.4s, v2.4s, v7.4s
// Loads for the second register set (v16-v23) are interleaved with the
// arithmetic of the first set (v0-v7).
276 ld1 {v18.4s,v19.4s}, [x2], #32
278 fmls v4.4s, v3.4s, v7.4s
279 fmla v5.4s, v3.4s, v6.4s
281 ld2 {v22.4s,v23.4s}, [x1], #32
// Butterfly: v2/v3 = difference, v0/v1 = sum.
283 fsub v2.4s, v0.4s, v4.4s
284 fadd v0.4s, v0.4s, v4.4s
285 fsub v3.4s, v1.4s, v5.4s
286 fadd v1.4s, v1.4s, v5.4s
288 uzp1 v16.4s, v18.4s, v19.4s
289 uzp2 v17.4s, v18.4s, v19.4s
// st2 re-interleaves re and im while storing.
291 st2 {v2.4s,v3.4s}, [x5], #32
292 st2 {v0.4s,v1.4s}, [x4], #32
293 ld2 {v20.4s,v21.4s}, [x0], #32
// Same complex product and butterfly for the second register set.
295 fmul v18.4s, v22.4s, v16.4s
296 fmul v19.4s, v22.4s, v17.4s
299 ld1 {v4.4s,v5.4s}, [x2], #32
301 fmls v18.4s, v23.4s, v17.4s
302 fmla v19.4s, v23.4s, v16.4s
304 ld2 {v2.4s,v3.4s}, [x1], #32
306 fsub v22.4s, v20.4s, v18.4s
307 fadd v20.4s, v20.4s, v18.4s
308 fsub v23.4s, v21.4s, v19.4s
309 fadd v21.4s, v21.4s, v19.4s
311 uzp1 v6.4s, v4.4s, v5.4s
312 uzp2 v7.4s, v4.4s, v5.4s
314 st2 {v22.4s,v23.4s}, [x5], #32
315 st2 {v20.4s,v21.4s}, [x4], #32
316 ld2 {v0.4s,v1.4s}, [x0], #32
// Tail for the first register set: finish the product, butterfly, store.
320 fmls v4.4s, v3.4s, v7.4s
321 fmla v5.4s, v3.4s, v6.4s
323 fsub v2.4s, v0.4s, v4.4s
324 fadd v0.4s, v0.4s, v4.4s
325 fsub v3.4s, v1.4s, v5.4s
326 fadd v1.4s, v1.4s, v5.4s
328 st2 {v2.4s,v3.4s}, [x5], #32
329 st2 {v0.4s,v1.4s}, [x4], #32
// Tail for the second register set.
333 fmls v18.4s, v23.4s, v17.4s
334 fmla v19.4s, v23.4s, v16.4s
336 fsub v22.4s, v20.4s, v18.4s
337 fadd v20.4s, v20.4s, v18.4s
338 fsub v23.4s, v21.4s, v19.4s
339 fadd v21.4s, v21.4s, v19.4s
341 st2 {v22.4s,v23.4s}, [x5], #32
342 st2 {v20.4s,v21.4s}, [x4], #32
// fft30_neon: 30-point transform assembled from two 15-point halves and a
// combination pass that uses s->exptab[1].
// x22 and the link register are kept in the stack slot at sp + 0x10 for the
// duration of the function and restored from there before leaving.
347 function fft30_neon, align=6
350 stp x22, x30, [sp, #0x10]
// x1 = x22 + 8 * x20: input pointer for the second half.
// NOTE(review): x20/x22 are set up outside the lines shown; presumably
// x22 is the input base and x20 the stride -- confirm.
360 add x1, x22, x20, lsl #3
// x10 is used as the context pointer for the table lookup.
364 ldr x2, [x10, #(CELT_EXPTAB + 8)] // s->exptab[1]
369 ldp x22, x30, [sp, #0x10]
// Template for the larger power-of-two multiples: the macro parameter n is
// the transform size and n2 is used as the element offset of the second
// half of the output (times 8 bytes per complex float).
// x22/x30 and x23/x24 are saved at sp + 0x10 and sp + 0x20 and restored
// from the same slots before leaving.
375 function fft\n\()_neon, align=6
378 stp x22, x30, [sp, #0x10]
379 stp x23, x24, [sp, #0x20]
// Arguments for the second half: output at x21 + n2 * 8, input at
// x22 + 8 * x20.
388 add x1, x21, #(\n2 * 8)
389 add x2, x22, x20, lsl #3
// x2 = exptab entry number x23 of the context in x10.
394 add x5, x10, #CELT_EXPTAB
396 ldr x2, [x5, x23, lsl #3] // s->exptab[N]
397 add x1, x21, #(\n2 * 8)
400 ldp x22, x30, [sp, #0x10]
401 ldp x23, x24, [sp, #0x20]
// fft_b15_calc_neon: entry point for the 15 * 2^N transforms.
// Loads the base exponent table into v8-v14, takes the addresses of the
// shuffle tables and reads the fft_tab_neon entry selected by x3.
// x0 is the context pointer: s->exptab[0] is read from it.
413 function fft_b15_calc_neon
415 ldr x8, [x0, #CELT_EXPTAB] // s->exptab[0]
// Addresses of the four lane-permutation tables.
417 movrel x11, shuffle_0213
418 movrel x12, shuffle_1032
419 movrel x13, shuffle_2301
420 movrel x14, shuffle_3120
422 movrel x5, fft_tab_neon
// AAPCS64 requires the low 64 bits of v8-v15 to be preserved by the
// callee; they are spilled here and reloaded at the end.
424 stp d8, d9, [sp, #0x10]
425 stp d10, d11, [sp, #0x20]
426 stp d12, d13, [sp, #0x30]
427 stp d14, d15, [sp, #0x40]
// Read 14 consecutive complex exponents from the table in x8: groups of
// four into register pairs, the two in between into v6 and v7.
429 ld1 {v0.4s,v1.4s}, [x8], #32
430 ld1 {v6.2s}, [x8], #8
431 ld1 {v2.4s,v3.4s}, [x8], #32
432 ld1 {v7.2s}, [x8], #8
433 ld1 {v4.4s,v5.4s}, [x8], #32
// Split each group of four into a vector of real parts and a vector of
// imaginary parts.
434 uzp1 v8.4s, v0.4s, v1.4s // exp[ 1 - 4].re
435 uzp2 v9.4s, v0.4s, v1.4s // exp[ 1 - 4].im
436 uzp1 v10.4s, v2.4s, v3.4s // exp[ 6 - 9].re
437 uzp2 v11.4s, v2.4s, v3.4s // exp[ 6 - 9].im
438 uzp1 v12.4s, v4.4s, v5.4s // exp[11 - 14].re
439 uzp2 v13.4s, v4.4s, v5.4s // exp[11 - 14].im
440 zip1 v14.4s, v6.4s, v7.4s // exp[5,10].re/exp[5,10].im
// x6 = fft_tab_neon[x3]; the table stores offsets relative to its own
// address.
441 ldr x6, [x5, x3, lsl #3]
446 ldp d8, d9, [sp, #0x10]
447 ldp d10, d11, [sp, #0x20]
448 ldp d12, d13, [sp, #0x30]
449 ldp d14, d15, [sp, #0x40]
// Dispatch table: one 64-bit entry per supported transform size, each
// holding the distance from fft_tab_neon to the routine. Storing offsets
// instead of absolute addresses keeps the table free of relocations.
// Entry k is the routine for 15 * 2^k points.
455 .quad fft15_neon - fft_tab_neon
456 .quad fft30_neon - fft_tab_neon
457 .quad fft60_neon - fft_tab_neon
458 .quad fft120_neon - fft_tab_neon
459 .quad fft240_neon - fft_tab_neon
460 .quad fft480_neon - fft_tab_neon
461 .quad fft960_neon - fft_tab_neon
// ff_celt_imdct_half_neon: exported half IMDCT.
// Visible stages: a pre-rotation that multiplies strided input samples by
// the twiddle table and writes complex values to the temporary buffer, and
// a post-rotation that multiplies the transformed data by the twiddles,
// scales it by v31.s[0] and writes it back from both ends of the buffer.
// Register use on entry as far as the code below shows: x0 = context,
// x2 = input samples, x3 = stride in floats.
464 function ff_celt_imdct_half_neon, export=1
// w5 = len2 and w6 = the field that follows it.
469 ldp w5, w6, [x0, #CELT_LEN2] // CELT_LEN4
// Byte steps for the forward (x7) and backward (x8) input cursors.
473 lsl x7, x3, #3 // 2 * stride * sizeof(float)
474 sub x8, xzr, x3, lsl #3 // -2 * stride * sizeof(float)
// x9 = tmp buffer, x10 = twiddle table.
476 ldp x9, x10, [x0, #CELT_TMP] // CELT_TWIDDLE
477 ldr w3, [x0, #CELT_FFT_N]
// x5 = x2 + 4 * x5: start of the backward cursor.
478 add x5, x2, x5, lsl #2
// Pre-rotation prologue: gather four samples from each cursor one lane at
// a time (v0 from the backward cursor, v1 from the forward one) and split
// four twiddles into re (v2) and im (v3).
482 ld1 {v0.s}[0], [x5], x8
483 ld1 {v1.s}[0], [x2], x7
484 ld1 {v4.4s,v5.4s}, [x10], #32
485 ld1 {v0.s}[1], [x5], x8
486 ld1 {v1.s}[1], [x2], x7
487 uzp1 v2.4s, v4.4s, v5.4s
488 ld1 {v0.s}[2], [x5], x8
489 ld1 {v1.s}[2], [x2], x7
490 uzp2 v3.4s, v4.4s, v5.4s
491 ld1 {v0.s}[3], [x5], x8
492 ld1 {v1.s}[3], [x2], x7
// Pipelined body: gather the next four samples into v20/v21 while the
// previous four are multiplied.
496 ld1 {v20.s}[0], [x5], x8
497 ld1 {v21.s}[0], [x2], x7
498 ld1 {v4.4s,v5.4s}, [x10], #32
// (v6, v7) = (v0, v1) * (v2, v3) as a complex product.
500 fmul v6.4s, v0.4s, v2.4s
501 fmul v7.4s, v0.4s, v3.4s
503 ld1 {v20.s}[1], [x5], x8
504 ld1 {v21.s}[1], [x2], x7
506 fmls v6.4s, v1.4s, v3.4s
507 fmla v7.4s, v1.4s, v2.4s
509 ld1 {v20.s}[2], [x5], x8
510 ld1 {v21.s}[2], [x2], x7
512 uzp1 v2.4s, v4.4s, v5.4s
513 uzp2 v3.4s, v4.4s, v5.4s
514 ld1 {v20.s}[3], [x5], x8
515 ld1 {v21.s}[3], [x2], x7
// Interleave re and im and store four complex values to the tmp buffer.
517 zip1 v4.4s, v6.4s, v7.4s
518 zip2 v5.4s, v6.4s, v7.4s
520 fmul v6.4s, v20.4s, v2.4s
521 fmul v7.4s, v20.4s, v3.4s
523 st1 {v4.4s,v5.4s}, [x9], #32
525 fmls v6.4s, v21.4s, v3.4s
526 fmla v7.4s, v21.4s, v2.4s
// Second half of the pipelined body: reload v0/v1 and the twiddles while
// the v20/v21 result is interleaved and stored.
531 ld1 {v4.4s,v5.4s}, [x10], #32
532 ld1 {v0.s}[0], [x5], x8
533 ld1 {v1.s}[0], [x2], x7
534 uzp1 v2.4s, v4.4s, v5.4s
535 ld1 {v0.s}[1], [x5], x8
536 ld1 {v1.s}[1], [x2], x7
537 uzp2 v3.4s, v4.4s, v5.4s
538 ld1 {v0.s}[2], [x5], x8
539 ld1 {v1.s}[2], [x2], x7
540 zip1 v4.4s, v6.4s, v7.4s
541 zip2 v5.4s, v6.4s, v7.4s
542 ld1 {v0.s}[3], [x5], x8
543 ld1 {v1.s}[3], [x2], x7
545 st1 {v4.4s,v5.4s}, [x9], #32
// Pre-rotation epilogue: last group of four.
549 fmul v6.4s, v0.4s, v2.4s
550 fmul v7.4s, v0.4s, v3.4s
551 fmls v6.4s, v1.4s, v3.4s
552 fmla v7.4s, v1.4s, v2.4s
554 zip1 v4.4s, v6.4s, v7.4s
555 zip2 v5.4s, v6.4s, v7.4s
556 st1 {v4.4s,v5.4s}, [x9], #32
// Post-rotation setup.
// NOTE(review): x10 is used as the context pointer again from here on,
// although it held the twiddle cursor above; it is reassigned outside the
// lines shown -- confirm.
563 ldr w5, [x10, #CELT_LEN4]
564 ldr x6, [x10, #CELT_TWIDDLE]
// x1 and x3 point len4 floats into the data (base x21) and the twiddle
// table.
567 add x1, x21, x5, lsl #2
568 add x3, x6, x5, lsl #2
// Prologue: x0/x2 step by x8 and x1/x3 step by x7 through the data and
// the twiddles.
// NOTE(review): x0, x2, x7 and x8 are set outside the lines shown; the
// lane comments below imply x0/x2 walk backwards and x1/x3 forwards.
578 ld1 {v0.4s}, [x0], x8
579 ld1 {v1.4s}, [x1], x7
580 ld1 {v2.4s}, [x2], x8
581 ld1 {v3.4s}, [x3], x7
583 uzp1 v4.4s, v0.4s, v1.4s // z[-i-2, -i-1, +i, i+1].re
584 uzp2 v6.4s, v0.4s, v1.4s // z[-i-2, -i-1, +i, i+1].im
586 uzp1 v5.4s, v2.4s, v3.4s // twiddle_exptab[-i-2, -i-1, +i, i+1].re
587 uzp2 v7.4s, v2.4s, v3.4s // twiddle_exptab[-i-2, -i-1, +i, i+1].im
// v1 = z.im * tw.re + z.re * tw.im, v0 = z.im * tw.im - z.re * tw.re;
// both are completed by the fmla/fmls pair below.
589 fmul v1.4s, v6.4s, v5.4s
590 fmul v0.4s, v6.4s, v7.4s
// Pipelined body: loads for the next group are interleaved with the
// arithmetic of the current one.
594 ld1 {v20.4s}, [x0], x8
596 fmla v1.4s, v4.4s, v7.4s
597 fmls v0.4s, v4.4s, v5.4s
599 ld1 {v21.4s}, [x1], x7
// Swap the 64-bit halves of v1, then scale both results by v31.s[0].
// NOTE(review): v31 is loaded outside the lines shown.
601 ext v1.16b, v1.16b, v1.16b, #8
602 fmul v0.4s, v0.4s, v31.s[0]
604 ld1 {v2.4s}, [x2], x8
607 fmul v1.4s, v1.4s, v31.s[0]
609 ld1 {v3.4s}, [x3], x7
611 zip1 v5.4s, v0.4s, v1.4s
612 zip2 v7.4s, v0.4s, v1.4s
614 uzp1 v4.4s, v20.4s, v21.4s // z[-i-2, -i-1, +i, i+1].re
615 uzp2 v6.4s, v20.4s, v21.4s // z[-i-2, -i-1, +i, i+1].im
// Write back through the two output cursors (x10 steps by x8, x11 by x7).
617 st1 {v5.4s}, [x10], x8
618 st1 {v7.4s}, [x11], x7
620 uzp1 v5.4s, v2.4s, v3.4s // twiddle_exptab[-i-2, -i-1, +i, i+1].re
621 uzp2 v7.4s, v2.4s, v3.4s // twiddle_exptab[-i-2, -i-1, +i, i+1].im
623 fmul v1.4s, v6.4s, v5.4s
624 fmul v0.4s, v6.4s, v7.4s
// Epilogue: finish and store the last group.
627 fmla v1.4s, v4.4s, v7.4s
628 fmls v0.4s, v4.4s, v5.4s
629 ext v1.16b, v1.16b, v1.16b, #8
630 fmul v0.4s, v0.4s, v31.s[0]
632 fmul v1.4s, v1.4s, v31.s[0]
633 zip1 v5.4s, v0.4s, v1.4s
634 zip2 v7.4s, v0.4s, v1.4s
635 st1 {v5.4s}, [x10], x8
636 st1 {v7.4s}, [x11], x7
// Fifth roots of unity used by the five-point transform.
// Each entry is stored as (cos, sin), i.e. (re, im):
// cos/sin of 72 degrees, then cos/sin of 144 degrees.
643 // [0] = exp(2 * i * pi / 5), [1] = exp(2 * i * pi * 2 / 5)
645 .float 0.30901699437494745, 0.95105651629515353
646 .float -0.80901699437494734, 0.58778525229247325