| 1 | /* |
| 2 | * Copyright (c) 2013 RISC OS Open Ltd |
| 3 | * Author: Ben Avison <bavison@riscosopen.org> |
| 4 | * |
| 5 | * This file is part of FFmpeg. |
| 6 | * |
| 7 | * FFmpeg is free software; you can redistribute it and/or |
| 8 | * modify it under the terms of the GNU Lesser General Public |
| 9 | * License as published by the Free Software Foundation; either |
| 10 | * version 2.1 of the License, or (at your option) any later version. |
| 11 | * |
| 12 | * FFmpeg is distributed in the hope that it will be useful, |
| 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 15 | * Lesser General Public License for more details. |
| 16 | * |
| 17 | * You should have received a copy of the GNU Lesser General Public |
| 18 | * License along with FFmpeg; if not, write to the Free Software |
| 19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| 20 | */ |
| 21 | |
| 22 | #include "libavutil/arm/asm.S" |
| 23 | |
| 24 | @ The fftx_internal_vfp versions of the functions obey a modified AAPCS: |
| 25 | @ VFP is in RunFast mode, vector length 4, stride 1 thoroughout, and |
| 26 | @ all single-precision VFP registers may be corrupted on exit. The a2 |
| 27 | @ register may not be clobbered in these functions, as it holds the |
| 28 | @ stored original FPSCR. |
| 29 | |
| 30 | function ff_fft_calc_vfp, export=1 |
| 31 | ldr ip, [a1, #0] @ nbits |
| 32 | mov a1, a2 |
| 33 | A ldr pc, [pc, ip, lsl #2] |
| 34 | A .word 0 |
| 35 | A .word 0 |
| 36 | A .word 0 |
| 37 | T movrel a2, (fft_tab_vfp - 8) |
| 38 | T ldr pc, [a2, ip, lsl #2] |
| 39 | T endfunc |
| 40 | T const fft_tab_vfp |
| 41 | .word fft4_vfp |
| 42 | .word fft8_vfp |
| 43 | .word X(ff_fft16_vfp) @ this one alone is exported |
| 44 | .word fft32_vfp |
| 45 | .word fft64_vfp |
| 46 | .word fft128_vfp |
| 47 | .word fft256_vfp |
| 48 | .word fft512_vfp |
| 49 | .word fft1024_vfp |
| 50 | .word fft2048_vfp |
| 51 | .word fft4096_vfp |
| 52 | .word fft8192_vfp |
| 53 | .word fft16384_vfp |
| 54 | .word fft32768_vfp |
| 55 | .word fft65536_vfp |
| 56 | A endfunc |
| 57 | |
| 58 | function fft4_vfp |
| 59 | vldr d0, [a1, #0*2*4] @ s0,s1 = z[0] |
| 60 | vldr d4, [a1, #1*2*4] @ s8,s9 = z[1] |
| 61 | vldr d1, [a1, #2*2*4] @ s2,s3 = z[2] |
| 62 | vldr d5, [a1, #3*2*4] @ s10,s11 = z[3] |
| 63 | @ stall |
| 64 | vadd.f s12, s0, s8 @ i0 |
| 65 | vadd.f s13, s1, s9 @ i1 |
| 66 | vadd.f s14, s2, s10 @ i2 |
| 67 | vadd.f s15, s3, s11 @ i3 |
| 68 | vsub.f s8, s0, s8 @ i4 |
| 69 | vsub.f s9, s1, s9 @ i5 |
| 70 | vsub.f s10, s2, s10 @ i6 |
| 71 | vsub.f s11, s3, s11 @ i7 |
| 72 | @ stall |
| 73 | @ stall |
| 74 | vadd.f s0, s12, s14 @ z[0].re |
| 75 | vsub.f s4, s12, s14 @ z[2].re |
| 76 | vadd.f s1, s13, s15 @ z[0].im |
| 77 | vsub.f s5, s13, s15 @ z[2].im |
| 78 | vadd.f s7, s9, s10 @ z[3].im |
| 79 | vsub.f s3, s9, s10 @ z[1].im |
| 80 | vadd.f s2, s8, s11 @ z[1].re |
| 81 | vsub.f s6, s8, s11 @ z[3].re |
| 82 | @ stall |
| 83 | @ stall |
| 84 | vstr d0, [a1, #0*2*4] |
| 85 | vstr d2, [a1, #2*2*4] |
| 86 | @ stall |
| 87 | @ stall |
| 88 | vstr d1, [a1, #1*2*4] |
| 89 | vstr d3, [a1, #3*2*4] |
| 90 | |
| 91 | bx lr |
| 92 | endfunc |
| 93 | |
| 94 | .macro macro_fft8_head |
| 95 | @ FFT4 |
| 96 | vldr d4, [a1, #0 * 2*4] |
| 97 | vldr d6, [a1, #1 * 2*4] |
| 98 | vldr d5, [a1, #2 * 2*4] |
| 99 | vldr d7, [a1, #3 * 2*4] |
| 100 | @ BF |
| 101 | vldr d12, [a1, #4 * 2*4] |
| 102 | vadd.f s16, s8, s12 @ vector op |
| 103 | vldr d14, [a1, #5 * 2*4] |
| 104 | vldr d13, [a1, #6 * 2*4] |
| 105 | vldr d15, [a1, #7 * 2*4] |
| 106 | vsub.f s20, s8, s12 @ vector op |
| 107 | vadd.f s0, s16, s18 |
| 108 | vsub.f s2, s16, s18 |
| 109 | vadd.f s1, s17, s19 |
| 110 | vsub.f s3, s17, s19 |
| 111 | vadd.f s7, s21, s22 |
| 112 | vsub.f s5, s21, s22 |
| 113 | vadd.f s4, s20, s23 |
| 114 | vsub.f s6, s20, s23 |
| 115 | vsub.f s20, s24, s28 @ vector op |
| 116 | vstr d0, [a1, #0 * 2*4] @ transfer s0-s7 to s24-s31 via memory |
| 117 | vstr d1, [a1, #1 * 2*4] |
| 118 | vldr s0, cos1pi4 |
| 119 | vadd.f s16, s24, s28 @ vector op |
| 120 | vstr d2, [a1, #2 * 2*4] |
| 121 | vstr d3, [a1, #3 * 2*4] |
| 122 | vldr d12, [a1, #0 * 2*4] |
| 123 | @ TRANSFORM |
| 124 | vmul.f s20, s20, s0 @ vector x scalar op |
| 125 | vldr d13, [a1, #1 * 2*4] |
| 126 | vldr d14, [a1, #2 * 2*4] |
| 127 | vldr d15, [a1, #3 * 2*4] |
| 128 | @ BUTTERFLIES |
| 129 | vadd.f s0, s18, s16 |
| 130 | vadd.f s1, s17, s19 |
| 131 | vsub.f s2, s17, s19 |
| 132 | vsub.f s3, s18, s16 |
| 133 | vadd.f s4, s21, s20 |
| 134 | vsub.f s5, s21, s20 |
| 135 | vadd.f s6, s22, s23 |
| 136 | vsub.f s7, s22, s23 |
| 137 | vadd.f s8, s0, s24 @ vector op |
| 138 | vstr d0, [a1, #0 * 2*4] @ transfer s0-s3 to s12-s15 via memory |
| 139 | vstr d1, [a1, #1 * 2*4] |
| 140 | vldr d6, [a1, #0 * 2*4] |
| 141 | vldr d7, [a1, #1 * 2*4] |
| 142 | vadd.f s1, s5, s6 |
| 143 | vadd.f s0, s7, s4 |
| 144 | vsub.f s2, s5, s6 |
| 145 | vsub.f s3, s7, s4 |
| 146 | vsub.f s12, s24, s12 @ vector op |
| 147 | vsub.f s5, s29, s1 |
| 148 | vsub.f s4, s28, s0 |
| 149 | vsub.f s6, s30, s2 |
| 150 | vsub.f s7, s31, s3 |
| 151 | vadd.f s16, s0, s28 @ vector op |
| 152 | vstr d6, [a1, #4 * 2*4] |
| 153 | vstr d7, [a1, #6 * 2*4] |
| 154 | vstr d4, [a1, #0 * 2*4] |
| 155 | vstr d5, [a1, #2 * 2*4] |
| 156 | vstr d2, [a1, #5 * 2*4] |
| 157 | vstr d3, [a1, #7 * 2*4] |
| 158 | .endm |
| 159 | |
| 160 | .macro macro_fft8_tail |
| 161 | vstr d8, [a1, #1 * 2*4] |
| 162 | vstr d9, [a1, #3 * 2*4] |
| 163 | .endm |
| 164 | |
| 165 | function .Lfft8_internal_vfp |
| 166 | macro_fft8_head |
| 167 | macro_fft8_tail |
| 168 | bx lr |
| 169 | endfunc |
| 170 | |
| 171 | function fft8_vfp |
| 172 | ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1 |
| 173 | fmrx a2, FPSCR |
| 174 | fmxr FPSCR, a3 |
| 175 | vpush {s16-s31} |
| 176 | mov ip, lr |
| 177 | bl .Lfft8_internal_vfp |
| 178 | vpop {s16-s31} |
| 179 | fmxr FPSCR, a2 |
| 180 | bx ip |
| 181 | endfunc |
| 182 | |
| 183 | .align 3 |
| 184 | cos1pi4: @ cos(1*pi/4) = sqrt(2) |
| 185 | .float 0.707106769084930419921875 |
| 186 | cos1pi8: @ cos(1*pi/8) = sqrt(2+sqrt(2))/2 |
| 187 | .float 0.92387950420379638671875 |
| 188 | cos3pi8: @ cos(2*pi/8) = sqrt(2-sqrt(2))/2 |
| 189 | .float 0.3826834261417388916015625 |
| 190 | |
| 191 | function .Lfft16_internal_vfp |
| 192 | macro_fft8_head |
| 193 | @ FFT4(z+8) |
| 194 | vldr d10, [a1, #8 * 2*4] |
| 195 | vldr d12, [a1, #9 * 2*4] |
| 196 | vldr d11, [a1, #10 * 2*4] |
| 197 | vldr d13, [a1, #11 * 2*4] |
| 198 | macro_fft8_tail |
| 199 | vadd.f s16, s20, s24 @ vector op |
| 200 | @ FFT4(z+12) |
| 201 | vldr d4, [a1, #12 * 2*4] |
| 202 | vldr d6, [a1, #13 * 2*4] |
| 203 | vldr d5, [a1, #14 * 2*4] |
| 204 | vsub.f s20, s20, s24 @ vector op |
| 205 | vldr d7, [a1, #15 * 2*4] |
| 206 | vadd.f s0, s16, s18 |
| 207 | vsub.f s4, s16, s18 |
| 208 | vadd.f s1, s17, s19 |
| 209 | vsub.f s5, s17, s19 |
| 210 | vadd.f s7, s21, s22 |
| 211 | vsub.f s3, s21, s22 |
| 212 | vadd.f s2, s20, s23 |
| 213 | vsub.f s6, s20, s23 |
| 214 | vadd.f s16, s8, s12 @ vector op |
| 215 | vstr d0, [a1, #8 * 2*4] |
| 216 | vstr d2, [a1, #10 * 2*4] |
| 217 | vstr d1, [a1, #9 * 2*4] |
| 218 | vsub.f s20, s8, s12 |
| 219 | vstr d3, [a1, #11 * 2*4] |
| 220 | @ TRANSFORM(z[2],z[6],z[10],z[14],cos1pi4,cos1pi4) |
| 221 | vldr d12, [a1, #10 * 2*4] |
| 222 | vadd.f s0, s16, s18 |
| 223 | vadd.f s1, s17, s19 |
| 224 | vsub.f s6, s16, s18 |
| 225 | vsub.f s7, s17, s19 |
| 226 | vsub.f s3, s21, s22 |
| 227 | vadd.f s2, s20, s23 |
| 228 | vadd.f s5, s21, s22 |
| 229 | vsub.f s4, s20, s23 |
| 230 | vstr d0, [a1, #12 * 2*4] |
| 231 | vmov s0, s6 |
| 232 | @ TRANSFORM(z[1],z[5],z[9],z[13],cos1pi8,cos3pi8) |
| 233 | vldr d6, [a1, #9 * 2*4] |
| 234 | vstr d1, [a1, #13 * 2*4] |
| 235 | vldr d1, cos1pi4 @ s2 = cos1pi4, s3 = cos1pi8 |
| 236 | vstr d2, [a1, #15 * 2*4] |
| 237 | vldr d7, [a1, #13 * 2*4] |
| 238 | vadd.f s4, s25, s24 |
| 239 | vsub.f s5, s25, s24 |
| 240 | vsub.f s6, s0, s7 |
| 241 | vadd.f s7, s0, s7 |
| 242 | vmul.f s20, s12, s3 @ vector op |
| 243 | @ TRANSFORM(z[3],z[7],z[11],z[15],cos3pi8,cos1pi8) |
| 244 | vldr d4, [a1, #11 * 2*4] |
| 245 | vldr d5, [a1, #15 * 2*4] |
| 246 | vldr s1, cos3pi8 |
| 247 | vmul.f s24, s4, s2 @ vector * scalar op |
| 248 | vmul.f s28, s12, s1 @ vector * scalar op |
| 249 | vmul.f s12, s8, s1 @ vector * scalar op |
| 250 | vadd.f s4, s20, s29 |
| 251 | vsub.f s5, s21, s28 |
| 252 | vsub.f s6, s22, s31 |
| 253 | vadd.f s7, s23, s30 |
| 254 | vmul.f s8, s8, s3 @ vector * scalar op |
| 255 | vldr d8, [a1, #1 * 2*4] |
| 256 | vldr d9, [a1, #5 * 2*4] |
| 257 | vldr d10, [a1, #3 * 2*4] |
| 258 | vldr d11, [a1, #7 * 2*4] |
| 259 | vldr d14, [a1, #2 * 2*4] |
| 260 | vadd.f s0, s6, s4 |
| 261 | vadd.f s1, s5, s7 |
| 262 | vsub.f s2, s5, s7 |
| 263 | vsub.f s3, s6, s4 |
| 264 | vadd.f s4, s12, s9 |
| 265 | vsub.f s5, s13, s8 |
| 266 | vsub.f s6, s14, s11 |
| 267 | vadd.f s7, s15, s10 |
| 268 | vadd.f s12, s0, s16 @ vector op |
| 269 | vstr d0, [a1, #1 * 2*4] |
| 270 | vstr d1, [a1, #5 * 2*4] |
| 271 | vldr d4, [a1, #1 * 2*4] |
| 272 | vldr d5, [a1, #5 * 2*4] |
| 273 | vadd.f s0, s6, s4 |
| 274 | vadd.f s1, s5, s7 |
| 275 | vsub.f s2, s5, s7 |
| 276 | vsub.f s3, s6, s4 |
| 277 | vsub.f s8, s16, s8 @ vector op |
| 278 | vstr d6, [a1, #1 * 2*4] |
| 279 | vstr d7, [a1, #5 * 2*4] |
| 280 | vldr d15, [a1, #6 * 2*4] |
| 281 | vsub.f s4, s20, s0 |
| 282 | vsub.f s5, s21, s1 |
| 283 | vsub.f s6, s22, s2 |
| 284 | vsub.f s7, s23, s3 |
| 285 | vadd.f s20, s0, s20 @ vector op |
| 286 | vstr d4, [a1, #9 * 2*4] |
| 287 | @ TRANSFORM_ZERO(z[0],z[4],z[8],z[12]) |
| 288 | vldr d6, [a1, #8 * 2*4] |
| 289 | vstr d5, [a1, #13 * 2*4] |
| 290 | vldr d7, [a1, #12 * 2*4] |
| 291 | vstr d2, [a1, #11 * 2*4] |
| 292 | vldr d8, [a1, #0 * 2*4] |
| 293 | vstr d3, [a1, #15 * 2*4] |
| 294 | vldr d9, [a1, #4 * 2*4] |
| 295 | vadd.f s0, s26, s24 |
| 296 | vadd.f s1, s25, s27 |
| 297 | vsub.f s2, s25, s27 |
| 298 | vsub.f s3, s26, s24 |
| 299 | vadd.f s4, s14, s12 |
| 300 | vadd.f s5, s13, s15 |
| 301 | vsub.f s6, s13, s15 |
| 302 | vsub.f s7, s14, s12 |
| 303 | vadd.f s8, s0, s28 @ vector op |
| 304 | vstr d0, [a1, #3 * 2*4] |
| 305 | vstr d1, [a1, #7 * 2*4] |
| 306 | vldr d6, [a1, #3 * 2*4] |
| 307 | vldr d7, [a1, #7 * 2*4] |
| 308 | vsub.f s0, s16, s4 |
| 309 | vsub.f s1, s17, s5 |
| 310 | vsub.f s2, s18, s6 |
| 311 | vsub.f s3, s19, s7 |
| 312 | vsub.f s12, s28, s12 @ vector op |
| 313 | vadd.f s16, s4, s16 @ vector op |
| 314 | vstr d10, [a1, #3 * 2*4] |
| 315 | vstr d11, [a1, #7 * 2*4] |
| 316 | vstr d4, [a1, #2 * 2*4] |
| 317 | vstr d5, [a1, #6 * 2*4] |
| 318 | vstr d0, [a1, #8 * 2*4] |
| 319 | vstr d1, [a1, #12 * 2*4] |
| 320 | vstr d6, [a1, #10 * 2*4] |
| 321 | vstr d7, [a1, #14 * 2*4] |
| 322 | vstr d8, [a1, #0 * 2*4] |
| 323 | vstr d9, [a1, #4 * 2*4] |
| 324 | |
| 325 | bx lr |
| 326 | endfunc |
| 327 | |
| 328 | function ff_fft16_vfp, export=1 |
| 329 | ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1 |
| 330 | fmrx a2, FPSCR |
| 331 | fmxr FPSCR, a3 |
| 332 | vpush {s16-s31} |
| 333 | mov ip, lr |
| 334 | bl .Lfft16_internal_vfp |
| 335 | vpop {s16-s31} |
| 336 | fmxr FPSCR, a2 |
| 337 | bx ip |
| 338 | endfunc |
| 339 | |
| 340 | .macro pass n, z0, z1, z2, z3 |
| 341 | add v6, v5, #4*2*\n |
| 342 | @ TRANSFORM_ZERO(z[0],z[o1],z[o2],z[o3]) |
| 343 | @ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]) |
| 344 | @ TRANSFORM(z[0],z[o1],z[o2],z[o3],wre[0],wim[0]) |
| 345 | @ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]) |
| 346 | vldr d8, [\z2, #8*(o2+1)] @ s16,s17 |
| 347 | vldmdb v6!, {s2} |
| 348 | vldr d9, [\z3, #8*(o3+1)] @ s18,s19 |
| 349 | vldmia v5!, {s0,s1} @ s0 is unused |
| 350 | vldr s7, [\z2, #8*o2] @ t1 |
| 351 | vmul.f s20, s16, s2 @ vector * scalar |
| 352 | vldr s0, [\z3, #8*o3] @ t5 |
| 353 | vldr s6, [\z2, #8*o2+4] @ t2 |
| 354 | vldr s3, [\z3, #8*o3+4] @ t6 |
| 355 | vmul.f s16, s16, s1 @ vector * scalar |
| 356 | ldr a4, =\n-1 |
| 357 | 1: add \z0, \z0, #8*2 |
| 358 | .if \n*4*2 >= 512 |
| 359 | add \z1, \z1, #8*2 |
| 360 | .endif |
| 361 | .if \n*4*2 >= 256 |
| 362 | add \z2, \z2, #8*2 |
| 363 | .endif |
| 364 | .if \n*4*2 >= 512 |
| 365 | add \z3, \z3, #8*2 |
| 366 | .endif |
| 367 | @ up to 2 stalls (VFP vector issuing / waiting for s0) |
| 368 | @ depending upon whether this is the first iteration and |
| 369 | @ how many add instructions are inserted above |
| 370 | vadd.f s4, s0, s7 @ t5 |
| 371 | vadd.f s5, s6, s3 @ t6 |
| 372 | vsub.f s6, s6, s3 @ t4 |
| 373 | vsub.f s7, s0, s7 @ t3 |
| 374 | vldr d6, [\z0, #8*0-8*2] @ s12,s13 |
| 375 | vadd.f s0, s16, s21 @ t1 |
| 376 | vldr d7, [\z1, #8*o1-8*2] @ s14,s15 |
| 377 | vsub.f s1, s18, s23 @ t5 |
| 378 | vadd.f s8, s4, s12 @ vector + vector |
| 379 | @ stall (VFP vector issuing) |
| 380 | @ stall (VFP vector issuing) |
| 381 | @ stall (VFP vector issuing) |
| 382 | vsub.f s4, s12, s4 |
| 383 | vsub.f s5, s13, s5 |
| 384 | vsub.f s6, s14, s6 |
| 385 | vsub.f s7, s15, s7 |
| 386 | vsub.f s2, s17, s20 @ t2 |
| 387 | vadd.f s3, s19, s22 @ t6 |
| 388 | vstr d4, [\z0, #8*0-8*2] @ s8,s9 |
| 389 | vstr d5, [\z1, #8*o1-8*2] @ s10,s11 |
| 390 | @ stall (waiting for s5) |
| 391 | vstr d2, [\z2, #8*o2-8*2] @ s4,s5 |
| 392 | vadd.f s4, s1, s0 @ t5 |
| 393 | vstr d3, [\z3, #8*o3-8*2] @ s6,s7 |
| 394 | vsub.f s7, s1, s0 @ t3 |
| 395 | vadd.f s5, s2, s3 @ t6 |
| 396 | vsub.f s6, s2, s3 @ t4 |
| 397 | vldr d6, [\z0, #8*1-8*2] @ s12,s13 |
| 398 | vldr d7, [\z1, #8*(o1+1)-8*2] @ s14,s15 |
| 399 | vldr d4, [\z2, #8*o2] @ s8,s9 |
| 400 | vldmdb v6!, {s2,s3} |
| 401 | vldr d5, [\z3, #8*o3] @ s10,s11 |
| 402 | vadd.f s20, s4, s12 @ vector + vector |
| 403 | vldmia v5!, {s0,s1} |
| 404 | vldr d8, [\z2, #8*(o2+1)] @ s16,s17 |
| 405 | @ stall (VFP vector issuing) |
| 406 | vsub.f s4, s12, s4 |
| 407 | vsub.f s5, s13, s5 |
| 408 | vsub.f s6, s14, s6 |
| 409 | vsub.f s7, s15, s7 |
| 410 | vmul.f s12, s8, s3 @ vector * scalar |
| 411 | vstr d10, [\z0, #8*1-8*2] @ s20,s21 |
| 412 | vldr d9, [\z3, #8*(o3+1)] @ s18,s19 |
| 413 | vstr d11, [\z1, #8*(o1+1)-8*2] @ s22,s23 |
| 414 | vmul.f s8, s8, s0 @ vector * scalar |
| 415 | vstr d2, [\z2, #8*(o2+1)-8*2] @ s4,s5 |
| 416 | @ stall (waiting for s7) |
| 417 | vstr d3, [\z3, #8*(o3+1)-8*2] @ s6,s7 |
| 418 | vmul.f s20, s16, s2 @ vector * scalar |
| 419 | @ stall (VFP vector issuing) |
| 420 | @ stall (VFP vector issuing) |
| 421 | @ stall (VFP vector issuing) |
| 422 | vadd.f s7, s8, s13 @ t1 |
| 423 | vsub.f s6, s9, s12 @ t2 |
| 424 | vsub.f s0, s10, s15 @ t5 |
| 425 | vadd.f s3, s11, s14 @ t6 |
| 426 | vmul.f s16, s16, s1 @ vector * scalar |
| 427 | subs a4, a4, #1 |
| 428 | bne 1b |
| 429 | @ What remains is identical to the first two indentations of |
| 430 | @ the above, but without the increment of z |
| 431 | vadd.f s4, s0, s7 @ t5 |
| 432 | vadd.f s5, s6, s3 @ t6 |
| 433 | vsub.f s6, s6, s3 @ t4 |
| 434 | vsub.f s7, s0, s7 @ t3 |
| 435 | vldr d6, [\z0, #8*0] @ s12,s13 |
| 436 | vadd.f s0, s16, s21 @ t1 |
| 437 | vldr d7, [\z1, #8*o1] @ s14,s15 |
| 438 | vsub.f s1, s18, s23 @ t5 |
| 439 | vadd.f s8, s4, s12 @ vector + vector |
| 440 | vsub.f s4, s12, s4 |
| 441 | vsub.f s5, s13, s5 |
| 442 | vsub.f s6, s14, s6 |
| 443 | vsub.f s7, s15, s7 |
| 444 | vsub.f s2, s17, s20 @ t2 |
| 445 | vadd.f s3, s19, s22 @ t6 |
| 446 | vstr d4, [\z0, #8*0] @ s8,s9 |
| 447 | vstr d5, [\z1, #8*o1] @ s10,s11 |
| 448 | vstr d2, [\z2, #8*o2] @ s4,s5 |
| 449 | vadd.f s4, s1, s0 @ t5 |
| 450 | vstr d3, [\z3, #8*o3] @ s6,s7 |
| 451 | vsub.f s7, s1, s0 @ t3 |
| 452 | vadd.f s5, s2, s3 @ t6 |
| 453 | vsub.f s6, s2, s3 @ t4 |
| 454 | vldr d6, [\z0, #8*1] @ s12,s13 |
| 455 | vldr d7, [\z1, #8*(o1+1)] @ s14,s15 |
| 456 | vadd.f s20, s4, s12 @ vector + vector |
| 457 | vsub.f s4, s12, s4 |
| 458 | vsub.f s5, s13, s5 |
| 459 | vsub.f s6, s14, s6 |
| 460 | vsub.f s7, s15, s7 |
| 461 | vstr d10, [\z0, #8*1] @ s20,s21 |
| 462 | vstr d11, [\z1, #8*(o1+1)] @ s22,s23 |
| 463 | vstr d2, [\z2, #8*(o2+1)] @ s4,s5 |
| 464 | vstr d3, [\z3, #8*(o3+1)] @ s6,s7 |
| 465 | .endm |
| 466 | |
| 467 | .macro def_fft n, n2, n4 |
| 468 | function .Lfft\n\()_internal_vfp |
| 469 | .if \n >= 512 |
| 470 | push {v1-v6,lr} |
| 471 | .elseif \n >= 256 |
| 472 | push {v1-v2,v5-v6,lr} |
| 473 | .else |
| 474 | push {v1,v5-v6,lr} |
| 475 | .endif |
| 476 | mov v1, a1 |
| 477 | bl .Lfft\n2\()_internal_vfp |
| 478 | add a1, v1, #8*(\n/4)*2 |
| 479 | bl .Lfft\n4\()_internal_vfp |
| 480 | movrelx v5, X(ff_cos_\n), a1 |
| 481 | add a1, v1, #8*(\n/4)*3 |
| 482 | bl .Lfft\n4\()_internal_vfp |
| 483 | .if \n >= 512 |
| 484 | .set o1, 0*(\n/4/2) |
| 485 | .set o2, 0*(\n/4/2) |
| 486 | .set o3, 0*(\n/4/2) |
| 487 | add v2, v1, #8*2*(\n/4/2) |
| 488 | add v3, v1, #8*4*(\n/4/2) |
| 489 | add v4, v1, #8*6*(\n/4/2) |
| 490 | pass (\n/4/2), v1, v2, v3, v4 |
| 491 | pop {v1-v6,pc} |
| 492 | .elseif \n >= 256 |
| 493 | .set o1, 2*(\n/4/2) |
| 494 | .set o2, 0*(\n/4/2) |
| 495 | .set o3, 2*(\n/4/2) |
| 496 | add v2, v1, #8*4*(\n/4/2) |
| 497 | pass (\n/4/2), v1, v1, v2, v2 |
| 498 | pop {v1-v2,v5-v6,pc} |
| 499 | .else |
| 500 | .set o1, 2*(\n/4/2) |
| 501 | .set o2, 4*(\n/4/2) |
| 502 | .set o3, 6*(\n/4/2) |
| 503 | pass (\n/4/2), v1, v1, v1, v1 |
| 504 | pop {v1,v5-v6,pc} |
| 505 | .endif |
| 506 | endfunc |
| 507 | |
| 508 | function fft\n\()_vfp |
| 509 | ldr a3, =0x03030000 /* RunFast mode, vector length 4, stride 1 */ |
| 510 | fmrx a2, FPSCR |
| 511 | fmxr FPSCR, a3 |
| 512 | vpush {s16-s31} |
| 513 | mov ip, lr |
| 514 | bl .Lfft\n\()_internal_vfp |
| 515 | vpop {s16-s31} |
| 516 | fmxr FPSCR, a2 |
| 517 | bx ip |
| 518 | endfunc |
| 519 | |
| 520 | .ltorg |
| 521 | .endm |
| 522 | |
| 523 | def_fft 32, 16, 8 |
| 524 | def_fft 64, 32, 16 |
| 525 | def_fft 128, 64, 32 |
| 526 | def_fft 256, 128, 64 |
| 527 | def_fft 512, 256, 128 |
| 528 | def_fft 1024, 512, 256 |
| 529 | def_fft 2048, 1024, 512 |
| 530 | def_fft 4096, 2048, 1024 |
| 531 | def_fft 8192, 4096, 2048 |
| 532 | def_fft 16384, 8192, 4096 |
| 533 | def_fft 32768, 16384, 8192 |
| 534 | def_fft 65536, 32768, 16384 |