| 1 | /* |
| 2 | * ARM NEON optimised FFT |
| 3 | * |
| 4 | * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> |
| 5 | * Copyright (c) 2009 Naotoshi Nojiri |
| 6 | * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net> |
| 7 | * |
| 8 | * This algorithm (though not any of the implementation details) is |
| 9 | * based on libdjbfft by D. J. Bernstein. |
| 10 | * |
| 11 | * This file is part of FFmpeg. |
| 12 | * |
| 13 | * FFmpeg is free software; you can redistribute it and/or |
| 14 | * modify it under the terms of the GNU Lesser General Public |
| 15 | * License as published by the Free Software Foundation; either |
| 16 | * version 2.1 of the License, or (at your option) any later version. |
| 17 | * |
| 18 | * FFmpeg is distributed in the hope that it will be useful, |
| 19 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 20 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 21 | * Lesser General Public License for more details. |
| 22 | * |
| 23 | * You should have received a copy of the GNU Lesser General Public |
| 24 | * License along with FFmpeg; if not, write to the Free Software |
| 25 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| 26 | */ |
| 27 | |
| 28 | #include "libavutil/aarch64/asm.S" |
| 29 | |
// 1/sqrt(2) as a decimal literal; the mppm constant table in this file is
// built from it.
| 30 | #define M_SQRT1_2 0.70710678118654752440 |
| 31 | |
// transpose d0, d1, s0, s1
// Emits trn1 into d0 followed by trn2 into d1, both reading s0 and s1.
// With the .2d arrangement used by every caller in this file, d0 receives
// the low 64-bit halves {s0[0], s1[0]} and d1 the high halves {s0[1], s1[1]}.
// d0 is written before trn2 reads its sources, so d0 must not alias s0 or s1.
| 32 | .macro transpose d0, d1, s0, s1 |
| 33 | trn1 \d0, \s0, \s1 |
| 34 | trn2 \d1, \s0, \s1 |
| 35 | .endm |
| 36 | |
| 37 | |
// fft4_neon: in-place 4-point butterfly over four complex floats at [x0].
//   In:    x0 = pointer to four {re,im} single-precision pairs z0..z3
//   Out:   [x0] = { z0+z1+z2+z3,
//                   (z0-z1) - i*(z2-z3),
//                   z0+z1-z2-z3,
//                   (z0-z1) + i*(z2-z3) }
//   Clobb: v0-v7, v16, v17 (x0 is left unchanged)
// Lane comments list lane 0 first.
| 38 | function fft4_neon |
| 39 | ld1 {v0.2s,v1.2s,v2.2s,v3.2s}, [x0] |
| 40 | |
| 41 | fadd v4.2s, v0.2s, v1.2s // r0+r1,i0+i1 |
| 42 | fsub v6.2s, v0.2s, v1.2s // r0-r1,i0-i1 |
| 43 | |
| 44 | ext v16.8b, v2.8b, v3.8b, #4 /* {i2,r3} */ |
| 45 | ext v17.8b, v3.8b, v2.8b, #4 /* {i3,r2} */ |
| 46 | |
| 47 | fadd v5.2s, v2.2s, v3.2s // r2+r3,i2+i3 |
| 48 | fsub v7.2s, v16.2s, v17.2s // i2-i3,r3-r2 which is -i*(z2-z3) |
| 49 | |
| 50 | fadd v0.2s, v4.2s, v5.2s |
| 51 | fsub v2.2s, v4.2s, v5.2s |
| 52 | fadd v1.2s, v6.2s, v7.2s |
| 53 | fsub v3.2s, v6.2s, v7.2s |
| 54 | |
| 55 | st1 {v0.2s,v1.2s,v2.2s,v3.2s}, [x0] |
| 56 | |
| 57 | ret |
| 58 | endfunc |
| 59 | |
// fft8_neon: in-place transform of the eight complex floats at [x0].
//   In:    x0  = pointer to eight {re,im} single-precision pairs
//          v28 = mppm constants (ff_fft_calc_neon loads them before it
//                dispatches); only v28.2s and v28.s[1] are read here
//   Out:   results for z[0..3] are stored through x1 (the original x0),
//          results for z[4..7] through the advanced x0
//   Clobb: x0 (left at input + 32), x1, v0-v7, v16-v27
// The butterflies on z[0..3] and on z[4..7] are interleaved instruction by
// instruction, so statement order matters. Lane comments list lane 0 first.
| 60 | function fft8_neon |
| 61 | mov x1, x0 /* keep the base; x0 is advanced by the first load */ |
| 62 | ld1 {v0.2s, v1.2s, v2.2s, v3.2s}, [x0], #32 |
| 63 | ld1 {v16.2s,v17.2s,v18.2s,v19.2s}, [x0] |
| 64 | ext v22.8b, v2.8b, v3.8b, #4 |
| 65 | ext v23.8b, v3.8b, v2.8b, #4 |
| 66 | fadd v4.2s, v16.2s, v17.2s // r4+r5,i4+i5 |
| 67 | fadd v5.2s, v18.2s, v19.2s // r6+r7,i6+i7 |
| 68 | fsub v17.2s, v16.2s, v17.2s // r4-r5,i4-i5 |
| 69 | fsub v19.2s, v18.2s, v19.2s // r6-r7,i6-i7 |
| 70 | rev64 v27.2s, v28.2s // v27 = low two lanes of v28 swapped: {+w,-w} when v28 = mppm |
| 71 | fadd v20.2s, v0.2s, v1.2s // r0+r1,i0+i1 |
| 72 | fadd v21.2s, v2.2s, v3.2s // r2+r3,i2+i3 |
| 73 | fmul v26.2s, v17.2s, v28.2s // -a2r*w,a2i*w |
| 74 | ext v6.8b, v4.8b, v5.8b, #4 |
| 75 | ext v7.8b, v5.8b, v4.8b, #4 |
| 76 | fmul v27.2s, v19.2s, v27.2s // a3r*w,-a3i*w |
| 77 | fsub v23.2s, v22.2s, v23.2s // i2-i3,r3-r2 |
| 78 | fsub v22.2s, v0.2s, v1.2s // r0-r1,i0-i1 |
| 79 | fmul v24.2s, v17.2s, v28.s[1] // a2r*w,a2i*w |
| 80 | fmul v25.2s, v19.2s, v28.s[1] // a3r*w,a3i*w |
| 81 | fadd v0.2s, v20.2s, v21.2s |
| 82 | fsub v2.2s, v20.2s, v21.2s |
| 83 | fadd v1.2s, v22.2s, v23.2s |
| 84 | rev64 v26.2s, v26.2s |
| 85 | rev64 v27.2s, v27.2s |
| 86 | fsub v3.2s, v22.2s, v23.2s |
| 87 | fsub v6.2s, v6.2s, v7.2s |
| 88 | fadd v24.2s, v24.2s, v26.2s // a2r+a2i,a2i-a2r t1,t2 |
| 89 | fadd v25.2s, v25.2s, v27.2s // a3r-a3i,a3i+a3r t5,t6 |
| 90 | fadd v7.2s, v4.2s, v5.2s |
| 91 | fsub v18.2s, v2.2s, v6.2s |
| 92 | ext v26.8b, v24.8b, v25.8b, #4 |
| 93 | ext v27.8b, v25.8b, v24.8b, #4 |
| 94 | fadd v2.2s, v2.2s, v6.2s |
| 95 | fsub v16.2s, v0.2s, v7.2s |
| 96 | fadd v5.2s, v25.2s, v24.2s |
| 97 | fsub v4.2s, v26.2s, v27.2s |
| 98 | fadd v0.2s, v0.2s, v7.2s |
| 99 | fsub v17.2s, v1.2s, v5.2s |
| 100 | fsub v19.2s, v3.2s, v4.2s |
| 101 | fadd v3.2s, v3.2s, v4.2s |
| 102 | fadd v1.2s, v1.2s, v5.2s |
| 103 | |
| 104 | st1 {v16.2s,v17.2s,v18.2s,v19.2s}, [x0] |
| 105 | st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [x1] |
| 106 | |
| 107 | ret |
| 108 | endfunc |
| 109 | |
// fft16_neon: in-place transform of the sixteen complex floats at [x0].
//   In:    x0  = pointer to sixteen {re,im} single-precision pairs
//          x14 = twiddle table, read with one 16-byte load (ff_cos_16 when
//                entered through ff_fft_calc_neon)
//          v28 = mppm, v29 = pmmp, v30 = trans4_float, v31 = trans8_float
//                (all loaded by ff_fft_calc_neon before it dispatches)
//   Clobb: x0 and x1 (both left at input + 128), v0-v7, v16-v27
// Structure: z[0..7] go through the same instruction sequence as fft8_neon;
// z[8..15] are loaded as q registers and processed as two 4-point blocks
// side by side; the last section combines the partial results with the same
// operation sequence as fft_pass_neon, fully unrolled for two iterations.
// The three parts are interleaved, so statement order matters.
| 110 | function fft16_neon |
| 111 | mov x1, x0 /* keep the base for the stores at the end */ |
| 112 | ld1 {v0.2s, v1.2s, v2.2s, v3.2s}, [x0], #32 |
| 113 | ld1 {v16.2s,v17.2s,v18.2s,v19.2s}, [x0], #32 |
| 114 | ext v22.8b, v2.8b, v3.8b, #4 |
| 115 | ext v23.8b, v3.8b, v2.8b, #4 |
| 116 | fadd v4.2s, v16.2s, v17.2s // r4+r5,i4+i5 |
| 117 | fadd v5.2s, v18.2s, v19.2s // r6+r7,i6+i7 |
| 118 | fsub v17.2s, v16.2s, v17.2s // r4-r5,i4-i5 |
| 119 | fsub v19.2s, v18.2s, v19.2s // r6-r7,i6-i7 |
| 120 | rev64 v27.2s, v28.2s // v27 = low two lanes of v28 swapped: {+w,-w} when v28 = mppm |
| 121 | fadd v20.2s, v0.2s, v1.2s // r0+r1,i0+i1 |
| 122 | fadd v21.2s, v2.2s, v3.2s // r2+r3,i2+i3 |
| 123 | fmul v26.2s, v17.2s, v28.2s // -a2r*w,a2i*w |
| 124 | ext v6.8b, v4.8b, v5.8b, #4 |
| 125 | ext v7.8b, v5.8b, v4.8b, #4 |
| 126 | fmul v27.2s, v19.2s, v27.2s // a3r*w,-a3i*w |
| 127 | fsub v23.2s, v22.2s, v23.2s // i2-i3,r3-r2 |
| 128 | fsub v22.2s, v0.2s, v1.2s // r0-r1,i0-i1 |
| 129 | fmul v24.2s, v17.2s, v28.s[1] // a2r*w,a2i*w |
| 130 | fmul v25.2s, v19.2s, v28.s[1] // a3r*w,a3i*w |
| 131 | fadd v0.2s, v20.2s, v21.2s |
| 132 | fsub v2.2s, v20.2s, v21.2s |
| 133 | fadd v1.2s, v22.2s, v23.2s |
| 134 | rev64 v26.2s, v26.2s |
| 135 | rev64 v27.2s, v27.2s |
| 136 | fsub v3.2s, v22.2s, v23.2s |
| 137 | fsub v6.2s, v6.2s, v7.2s |
| 138 | fadd v24.2s, v24.2s, v26.2s // a2r+a2i,a2i-a2r t1,t2 |
| 139 | fadd v25.2s, v25.2s, v27.2s // a3r-a3i,a3i+a3r t5,t6 |
| 140 | fadd v7.2s, v4.2s, v5.2s |
| 141 | fsub v18.2s, v2.2s, v6.2s |
| 142 | ld1 {v20.4s,v21.4s}, [x0], #32 |
| 143 | ld1 {v22.4s,v23.4s}, [x0], #32 |
| 144 | ext v26.8b, v24.8b, v25.8b, #4 |
| 145 | ext v27.8b, v25.8b, v24.8b, #4 |
| 146 | fadd v2.2s, v2.2s, v6.2s |
| 147 | fsub v16.2s, v0.2s, v7.2s |
| 148 | fadd v5.2s, v25.2s, v24.2s |
| 149 | fsub v4.2s, v26.2s, v27.2s |
| 150 | transpose v24.2d, v25.2d, v20.2d, v22.2d |
| 151 | transpose v26.2d, v27.2d, v21.2d, v23.2d |
| 152 | fadd v0.2s, v0.2s, v7.2s |
| 153 | fsub v17.2s, v1.2s, v5.2s |
| 154 | fsub v19.2s, v3.2s, v4.2s |
| 155 | fadd v3.2s, v3.2s, v4.2s |
| 156 | fadd v1.2s, v1.2s, v5.2s |
| 157 | ext v20.16b, v21.16b, v21.16b, #4 |
| 158 | ext v21.16b, v23.16b, v23.16b, #4 |
| 159 | |
// Pack the eight 64-bit results of the first half into four q registers.
| 160 | zip1 v0.2d, v0.2d, v1.2d // {z[0], z[1]} |
| 161 | zip1 v1.2d, v2.2d, v3.2d // {z[2], z[3]} |
| 162 | zip1 v2.2d, v16.2d, v17.2d // {z[o1], z[o1+1]} |
| 163 | zip1 v3.2d, v18.2d, v19.2d // {z[o1+2],z[o1+3]} |
| 164 | |
| 165 | // 2 x fft4 |
| 166 | transpose v22.2d, v23.2d, v20.2d, v21.2d |
| 167 | |
| 168 | fadd v4.4s, v24.4s, v25.4s |
| 169 | fadd v5.4s, v26.4s, v27.4s |
| 170 | fsub v6.4s, v24.4s, v25.4s |
| 171 | fsub v7.4s, v22.4s, v23.4s |
| 172 | |
| 173 | ld1 {v23.4s}, [x14] /* four twiddle floats, used by lane below */ |
| 174 | |
| 175 | fadd v24.4s, v4.4s, v5.4s // {z[o2+0],z[o2+1]} |
| 176 | fsub v26.4s, v4.4s, v5.4s // {z[o2+2],z[o2+3]} |
| 177 | fadd v25.4s, v6.4s, v7.4s // {z[o3+0],z[o3+1]} |
| 178 | fsub v27.4s, v6.4s, v7.4s // {z[o3+2],z[o3+3]} |
| 179 | |
| 180 | //fft_pass_neon_16 |
| 181 | rev64 v7.4s, v25.4s |
| 182 | fmul v25.4s, v25.4s, v23.s[1] |
| 183 | fmul v7.4s, v7.4s, v29.4s |
| 184 | fmla v25.4s, v7.4s, v23.s[3] // {t1a,t2a,t5a,t6a} |
| 185 | |
| 186 | zip1 v20.4s, v24.4s, v25.4s |
| 187 | zip2 v21.4s, v24.4s, v25.4s |
| 188 | fneg v22.4s, v20.4s |
| 189 | fadd v4.4s, v21.4s, v20.4s |
| 190 | fsub v6.4s, v20.4s, v21.4s // just the second half |
| 191 | fadd v5.4s, v21.4s, v22.4s // just the first half |
| 192 | |
| 193 | tbl v4.16b, {v4.16b}, v30.16b // trans4_float |
| 194 | tbl v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float |
| 195 | |
| 196 | fsub v20.4s, v0.4s, v4.4s // {z[o2],z[o2+1]} |
| 197 | fadd v16.4s, v0.4s, v4.4s // {z[0], z[1]} |
| 198 | fsub v22.4s, v2.4s, v5.4s // {z[o3],z[o3+1]} |
| 199 | fadd v18.4s, v2.4s, v5.4s // {z[o1],z[o1+1]} |
| 200 | |
| 201 | //second half |
| 202 | rev64 v6.4s, v26.4s |
| 203 | fmul v26.4s, v26.4s, v23.s[2] |
| 204 | rev64 v7.4s, v27.4s |
| 205 | fmul v27.4s, v27.4s, v23.s[3] |
| 206 | fmul v6.4s, v6.4s, v29.4s |
| 207 | fmul v7.4s, v7.4s, v29.4s |
| 208 | fmla v26.4s, v6.4s, v23.s[2] // {t1,t2,t5,t6} |
| 209 | fmla v27.4s, v7.4s, v23.s[1] // {t1a,t2a,t5a,t6a} |
| 210 | |
| 211 | zip1 v24.4s, v26.4s, v27.4s |
| 212 | zip2 v25.4s, v26.4s, v27.4s |
| 213 | fneg v26.4s, v24.4s |
| 214 | fadd v4.4s, v25.4s, v24.4s |
| 215 | fsub v6.4s, v24.4s, v25.4s // just the second half |
| 216 | fadd v5.4s, v25.4s, v26.4s // just the first half |
| 217 | |
| 218 | tbl v4.16b, {v4.16b}, v30.16b // trans4_float |
| 219 | tbl v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float |
| 220 | |
| 221 | fadd v17.4s, v1.4s, v4.4s // {z[2], z[3]} |
| 222 | fsub v21.4s, v1.4s, v4.4s // {z[o2+2],z[o2+3]} |
| 223 | fadd v19.4s, v3.4s, v5.4s // {z[o1+2],z[o1+3]} |
| 224 | fsub v23.4s, v3.4s, v5.4s // {z[o3+2],z[o3+3]} |
| 225 | |
// Write all sixteen results back in order, starting at the original x0.
| 226 | st1 {v16.4s,v17.4s}, [x1], #32 |
| 227 | st1 {v18.4s,v19.4s}, [x1], #32 |
| 228 | st1 {v20.4s,v21.4s}, [x1], #32 |
| 229 | st1 {v22.4s,v23.4s}, [x1], #32 |
| 230 | |
| 231 | ret |
| 232 | endfunc |
| 233 | |
| 234 | |
// trans4_float: byte indices for a one-register tbl. They select the 32-bit
// words 0, 2, 1, 3 of the source, i.e. swap the two middle floats.
// ff_fft_calc_neon loads this table into v30.
| 235 | const trans4_float, align=4 |
| 236 | .byte 0, 1, 2, 3 |
| 237 | .byte 8, 9, 10, 11 |
| 238 | .byte 4, 5, 6, 7 |
| 239 | .byte 12, 13, 14, 15 |
| 240 | endconst |
| 241 | |
// trans8_float: byte indices for a two-register (32-byte) tbl. With table
// registers {a, b} the result is { b.s[2], a.s[0], b.s[3], a.s[1] }.
// ff_fft_calc_neon loads this table into v31.
| 242 | const trans8_float, align=4 |
| 243 | .byte 24, 25, 26, 27 |
| 244 | .byte 0, 1, 2, 3 |
| 245 | .byte 28, 29, 30, 31 |
| 246 | .byte 4, 5, 6, 7 |
| 247 | endconst |
| 248 | |
// fft_pass_neon: combining pass over the four quarters of z. Every iteration
// reads and rewrites two complex values in each quarter.
//   In:    x0 = z
//          x2 = n; there are n iterations in total (one peeled, n - 1 in the
//               loop). n must be at least 2: the loop is entered
//               unconditionally with counter n - 1 and only exits when the
//               counter reaches exactly zero.
//          x4 = twiddle table; wre is read forwards from x4, wim is read
//               going down from x4 + 2*n*4 - 4
//          x7 = post-index step applied to the wim pointer (-8 when set up
//               by ff_fft_calc_neon)
//          v29 = pmmp, v30 = trans4_float, v31 = trans8_float
//   Clobb: x0-x6, flags, v4-v7, v16, v17, v20-v27
//   Quarter bases in complex elements: x0 = z, x1 = z + 2n (o1),
//   x2 = z + 4n (o2), x3 = z + 6n (o3).
| 249 | function fft_pass_neon |
| 250 | sub x6, x2, #1 // n - 1, loop counter |
| 251 | lsl x5, x2, #3 // 2 * n * sizeof FFTSample |
| 252 | lsl x1, x2, #4 // 2 * n * sizeof FFTComplex |
| 253 | add x5, x4, x5 // wim |
| 254 | add x3, x1, x2, lsl #5 // o1 offset + 4 * n * sizeof FFTComplex = 6 * n * sizeof FFTComplex |
| 255 | add x2, x0, x2, lsl #5 // &z[o2] |
| 256 | add x3, x0, x3 // &z[o3] |
| 257 | add x1, x0, x1 // &z[o1] |
// Peeled first iteration: the first element of each pair (v24) is used
// without any twiddle multiplication; only wre[1] and wim[-1] are applied.
| 258 | ld1 {v20.4s},[x2] // {z[o2],z[o2+1]} |
| 259 | ld1 {v22.4s},[x3] // {z[o3],z[o3+1]} |
| 260 | ld1 {v4.2s}, [x4], #8 // {wre[0],wre[1]} |
| 261 | trn2 v25.2d, v20.2d, v22.2d |
| 262 | sub x5, x5, #4 // wim-- |
| 263 | trn1 v24.2d, v20.2d, v22.2d |
| 264 | ld1 {v5.s}[0], [x5], x7 // d5[0] = wim[-1] |
| 265 | rev64 v7.4s, v25.4s |
| 266 | fmul v25.4s, v25.4s, v4.s[1] |
| 267 | ld1 {v16.4s}, [x0] // {z[0],z[1]} |
| 268 | fmul v7.4s, v7.4s, v29.4s |
| 269 | ld1 {v17.4s}, [x1] // {z[o1],z[o1+1]} |
| 270 | prfm pldl1keep, [x2, #16] |
| 271 | prfm pldl1keep, [x3, #16] |
| 272 | fmla v25.4s, v7.4s, v5.s[0] // {t1a,t2a,t5a,t6a} |
| 273 | prfm pldl1keep, [x0, #16] |
| 274 | prfm pldl1keep, [x1, #16] |
| 275 | |
| 276 | zip1 v20.4s, v24.4s, v25.4s |
| 277 | zip2 v21.4s, v24.4s, v25.4s |
| 278 | fneg v22.4s, v20.4s |
| 279 | fadd v4.4s, v21.4s, v20.4s |
| 280 | fsub v6.4s, v20.4s, v21.4s // just the second half |
| 281 | fadd v5.4s, v21.4s, v22.4s // just the first half |
| 282 | |
| 283 | tbl v4.16b, {v4.16b}, v30.16b // trans4_float |
| 284 | tbl v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float |
| 285 | |
| 286 | fadd v20.4s, v16.4s, v4.4s |
| 287 | fsub v22.4s, v16.4s, v4.4s |
| 288 | fadd v21.4s, v17.4s, v5.4s |
| 289 | st1 {v20.4s}, [x0], #16 // {z[0], z[1]} |
| 290 | fsub v23.4s, v17.4s, v5.4s |
| 291 | |
| 292 | st1 {v21.4s}, [x1], #16 // {z[o1],z[o1+1]} |
| 293 | st1 {v22.4s}, [x2], #16 // {z[o2],z[o2+1]} |
| 294 | st1 {v23.4s}, [x3], #16 // {z[o3],z[o3+1]} |
// Remaining n - 1 iterations: both elements of each pair are multiplied by
// their own twiddle. The flags set by subs are consumed by b.ne at the
// bottom; none of the instructions in between write the flags.
| 295 | 1: |
| 296 | ld1 {v20.4s},[x2] // {z[o2],z[o2+1]} |
| 297 | ld1 {v22.4s},[x3] // {z[o3],z[o3+1]} |
| 298 | ld1 {v4.2s}, [x4], #8 // {wre[0],wre[1]} |
| 299 | transpose v26.2d, v27.2d, v20.2d, v22.2d |
| 300 | ld1 {v5.2s}, [x5], x7 // {wim[-1],wim[0]} |
| 301 | rev64 v6.4s, v26.4s |
| 302 | fmul v26.4s, v26.4s, v4.s[0] |
| 303 | rev64 v7.4s, v27.4s |
| 304 | fmul v27.4s, v27.4s, v4.s[1] |
| 305 | fmul v6.4s, v6.4s, v29.4s |
| 306 | fmul v7.4s, v7.4s, v29.4s |
| 307 | ld1 {v16.4s},[x0] // {z[0],z[1]} |
| 308 | fmla v26.4s, v6.4s, v5.s[1] // {t1,t2,t5,t6} |
| 309 | fmla v27.4s, v7.4s, v5.s[0] // {t1a,t2a,t5a,t6a} |
| 310 | ld1 {v17.4s},[x1] // {z[o1],z[o1+1]} |
| 311 | |
| 312 | subs x6, x6, #1 // n-- |
| 313 | |
| 314 | zip1 v20.4s, v26.4s, v27.4s |
| 315 | zip2 v21.4s, v26.4s, v27.4s |
| 316 | fneg v22.4s, v20.4s |
| 317 | fadd v4.4s, v21.4s, v20.4s |
| 318 | fsub v6.4s, v20.4s, v21.4s // just the second half |
| 319 | fadd v5.4s, v21.4s, v22.4s // just the first half |
| 320 | |
| 321 | tbl v4.16b, {v4.16b}, v30.16b // trans4_float |
| 322 | tbl v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float |
| 323 | |
| 324 | fadd v20.4s, v16.4s, v4.4s |
| 325 | fsub v22.4s, v16.4s, v4.4s |
| 326 | fadd v21.4s, v17.4s, v5.4s |
| 327 | st1 {v20.4s}, [x0], #16 // {z[0], z[1]} |
| 328 | fsub v23.4s, v17.4s, v5.4s |
| 329 | |
| 330 | st1 {v21.4s}, [x1], #16 // {z[o1],z[o1+1]} |
| 331 | st1 {v22.4s}, [x2], #16 // {z[o2],z[o2+1]} |
| 332 | st1 {v23.4s}, [x3], #16 // {z[o3],z[o3+1]} |
| 333 | b.ne 1b |
| 334 | |
| 335 | ret |
| 336 | endfunc |
| 337 | |
// def_fft n, n2, n4: emit fft<n>_neon, the in-place n-point transform built
// from two smaller sizes (n2 = n/2 and n4 = n/4 in every use in this file).
//   In:    x0 = z. The constant registers the callees read are not set up
//          here; ff_fft_calc_neon loads them before dispatching.
//   Steps: fft<n2> on z[0 ..], fft<n4> on z[2*n4 ..] and on z[3*n4 ..]
//          (8 bytes per complex element), then a tail call to fft_pass_neon
//          with x0 = z, x4 = ff_cos_<n> and x2 = n4 / 2.
//   Stack: one 16-byte slot holding x28 and x30. x28 is callee-saved and
//          carries &z[2*n4] across the three calls; x30 must be saved because
//          bl overwrites it. The slot is pushed with a single pre-indexed stp
//          so the prologue mirrors the post-indexed ldp in the epilogue and
//          sp stays 16-byte aligned.
//   The frame is released before the final branch, so fft_pass_neon returns
//   straight to the caller of fft<n>_neon.
| 338 | .macro def_fft n, n2, n4 |
| 339 | function fft\n\()_neon, align=6 |
| 340 | stp x28, x30, [sp, #-16]! |
| 342 | add x28, x0, #\n4*2*8 |
| 343 | bl fft\n2\()_neon |
| 344 | mov x0, x28 |
| 345 | bl fft\n4\()_neon |
| 346 | add x0, x28, #\n4*1*8 |
| 347 | bl fft\n4\()_neon |
| 348 | sub x0, x28, #\n4*2*8 |
| 349 | ldp x28, x30, [sp], #16 |
| 350 | movrel x4, X(ff_cos_\n) |
| 351 | mov x2, #\n4>>1 |
| 352 | b fft_pass_neon |
| 353 | endfunc |
| 354 | .endm |
| 355 | |
// Instantiate fft32_neon through fft65536_neon. The arguments are n, n/2 and
// n/4; each instance calls the routines for the two smaller sizes named here.
| 356 | def_fft 32, 16, 8 |
| 357 | def_fft 64, 32, 16 |
| 358 | def_fft 128, 64, 32 |
| 359 | def_fft 256, 128, 64 |
| 360 | def_fft 512, 256, 128 |
| 361 | def_fft 1024, 512, 256 |
| 362 | def_fft 2048, 1024, 512 |
| 363 | def_fft 4096, 2048, 1024 |
| 364 | def_fft 8192, 4096, 2048 |
| 365 | def_fft 16384, 8192, 4096 |
| 366 | def_fft 32768, 16384, 8192 |
| 367 | def_fft 65536, 32768, 16384 |
| 368 | |
// ff_fft_calc_neon (exported): load the shared constants and jump to the
// size-specific transform.
//   In:    x0 = pointer to a structure whose first 32-bit word selects the
//               size (the same field ff_fft_permute_neon labels nbits)
//          x1 = data pointer; it is prefetched and handed to the transform
//               in x0
//   Setup left in place for the transform routines:
//          x7  = -8            post-index step for the wim pointer
//          x14 = ff_cos_16     twiddle table read by fft16_neon
//          v28 = mppm, v29 = pmmp, v30 = trans4_float, v31 = trans8_float
//   Dispatch: index = word - 2 into fft_tab_neon (8-byte entries), so the
//          word 2 selects fft4_neon. The index is not range-checked here;
//          NOTE(review): callers presumably guarantee 2 <= word <= 16.
//   The routine is entered with br, so its ret returns to our caller.
//   Clobb: x0, x2, x3, x7, x10-x14, v28-v31, plus whatever the selected
//          transform clobbers
| 369 | function ff_fft_calc_neon, export=1 |
| 370 | prfm pldl1keep, [x1] |
| 371 | movrel x10, trans4_float |
| 372 | ldr w2, [x0] |
| 373 | movrel x11, trans8_float |
| 374 | sub w2, w2, #2 /* table index; the w write zeroes the top half of x2 */ |
| 375 | movrel x3, fft_tab_neon |
| 376 | ld1 {v30.16b}, [x10] |
| 377 | mov x7, #-8 |
| 378 | movrel x12, pmmp |
| 379 | ldr x3, [x3, x2, lsl #3] |
| 380 | movrel x13, mppm |
| 381 | movrel x14, X(ff_cos_16) |
| 382 | ld1 {v31.16b}, [x11] |
| 383 | mov x0, x1 |
| 384 | ld1 {v29.4s}, [x12] // pmmp |
| 385 | ld1 {v28.4s}, [x13] /* mppm */ |
| 386 | br x3 |
| 387 | endfunc |
| 388 | |
// ff_fft_permute_neon (exported): scatter z into tmp_buf according to revtab,
// then copy tmp_buf back over z.
//   In:    x0 = pointer to a structure read at three offsets:
//               +0  32-bit nbits, +8 revtab pointer, +16 tmp_buf pointer
//          x1 = z, an array of n = 1 << nbits 8-byte elements
//   Pass 1: each iteration takes two elements from z and one 32-bit word
//          from revtab. The low 16 bits of that word are the destination
//          index of the first element, the high 16 bits that of the second.
//   Pass 2: x1 is rewound by n * 8 bytes and tmp_buf is copied back 32 bytes
//          (four elements) per iteration.
//   NOTE(review): pass 2 always moves four elements at a time, so n is
//          presumably a multiple of 4 (nbits >= 2); confirm with the caller.
//   Clobb: x0-x6, flags, v0, v1
| 389 | function ff_fft_permute_neon, export=1 |
| 390 | mov x6, #1 |
| 391 | ldr w2, [x0] // nbits |
| 392 | ldr x3, [x0, #16] // tmp_buf |
| 393 | ldr x0, [x0, #8] // revtab |
| 394 | lsl x6, x6, x2 /* n = 1 << nbits, counter for pass 1 */ |
| 395 | mov x2, x6 /* second copy of n for the rewind and pass 2 */ |
| 396 | 1: |
| 397 | ld1 {v0.2s,v1.2s}, [x1], #16 |
| 398 | ldr w4, [x0], #4 |
| 399 | uxth w5, w4 /* low 16 bits: index for v0 */ |
| 400 | lsr w4, w4, #16 /* high 16 bits: index for v1 */ |
| 401 | add x5, x3, x5, lsl #3 |
| 402 | add x4, x3, x4, lsl #3 |
| 403 | st1 {v0.2s}, [x5] |
| 404 | st1 {v1.2s}, [x4] |
| 405 | subs x6, x6, #2 |
| 406 | b.gt 1b |
| 407 | |
| 408 | sub x1, x1, x2, lsl #3 /* back to the start of z */ |
| 409 | 1: |
| 410 | ld1 {v0.4s,v1.4s}, [x3], #32 |
| 411 | st1 {v0.4s,v1.4s}, [x1], #32 |
| 412 | subs x2, x2, #4 |
| 413 | b.gt 1b |
| 414 | |
| 415 | ret |
| 416 | endfunc |
| 417 | |
// fft_tab_neon: dispatch table of 8-byte code addresses in ascending size
// order. ff_fft_calc_neon indexes it with (size word - 2), so entry 0 is the
// 4-point routine and entry 14 the 65536-point routine.
| 418 | const fft_tab_neon |
| 419 | .quad fft4_neon |
| 420 | .quad fft8_neon |
| 421 | .quad fft16_neon |
| 422 | .quad fft32_neon |
| 423 | .quad fft64_neon |
| 424 | .quad fft128_neon |
| 425 | .quad fft256_neon |
| 426 | .quad fft512_neon |
| 427 | .quad fft1024_neon |
| 428 | .quad fft2048_neon |
| 429 | .quad fft4096_neon |
| 430 | .quad fft8192_neon |
| 431 | .quad fft16384_neon |
| 432 | .quad fft32768_neon |
| 433 | .quad fft65536_neon |
| 434 | endconst |
| 435 | |
// pmmp: sign pattern {+1, -1, -1, +1}. ff_fft_calc_neon loads it into v29;
// fft16_neon and fft_pass_neon multiply their rev64-swapped vectors by it
// before the fmla with the twiddle factors.
| 436 | const pmmp, align=4 |
| 437 | .float +1.0, -1.0, -1.0, +1.0 |
| 438 | endconst |
| 439 | |
// mppm: {-w, +w, +w, -w} with w = M_SQRT1_2. ff_fft_calc_neon loads it into
// v28; fft8_neon and fft16_neon read the low half {-w, +w} and lane 1 (+w).
| 440 | const mppm, align=4 |
| 441 | .float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 |
| 442 | endconst |