| 1 | /* |
| 2 | * Copyright (c) 2013 RISC OS Open Ltd |
| 3 | * Author: Ben Avison <bavison@riscosopen.org> |
| 4 | * |
| 5 | * This file is part of FFmpeg. |
| 6 | * |
| 7 | * FFmpeg is free software; you can redistribute it and/or |
| 8 | * modify it under the terms of the GNU Lesser General Public |
| 9 | * License as published by the Free Software Foundation; either |
| 10 | * version 2.1 of the License, or (at your option) any later version. |
| 11 | * |
| 12 | * FFmpeg is distributed in the hope that it will be useful, |
| 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 15 | * Lesser General Public License for more details. |
| 16 | * |
| 17 | * You should have received a copy of the GNU Lesser General Public |
| 18 | * License along with FFmpeg; if not, write to the Free Software |
| 19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| 20 | */ |
| 21 | |
| 22 | #include "libavutil/arm/asm.S" |
| 23 | |
@ Register aliases for the LFE FIR functions (arguments in a1-a3 per AAPCS).
POUT     .req   a1              @ output sample pointer (advanced by vstmia)
PIN      .req   a2              @ input sample pointer (read at offsets 0..-7 words)
PCOEF    .req   a3              @ FIR coefficient pointer
OLDFPSCR .req   a4              @ caller's FPSCR, restored before returning
COUNTER  .req   ip              @ inner-loop trip counter

@ Input samples held as scalars: in[0..-3] in s4-s7, in[-4..-7] in s0-s3.
@ All IN* live in the s0-s7 bank, which VFP short-vector mode treats as
@ scalar operands, so vmul/vmla below are vector * scalar.
IN0      .req   s4
IN1      .req   s5
IN2      .req   s6
IN3      .req   s7
IN4      .req   s0
IN5      .req   s1
IN6      .req   s2
IN7      .req   s3
COEF0    .req   s8              @ coefficient elements (s8-s15 vector bank)
COEF1    .req   s9
COEF2    .req   s10
COEF3    .req   s11
COEF4    .req   s12
COEF5    .req   s13
COEF6    .req   s14
COEF7    .req   s15
ACCUM0   .req   s16             @ double-buffered multiply-accumulate results
ACCUM4   .req   s20             @ (two length-4 vectors, summed at the end)
POST0    .req   s24             @ do long-latency post-multiply in this vector in parallel
POST1    .req   s25
POST2    .req   s26
POST3    .req   s27
| 52 | |
| 53 | |
@ inner_loop: one software-pipelined stage producing (head) or finishing
@ (tail) a group of 4 output samples of the LFE FIR.
@   decifactor - 32 or 64; the \decifactor == 32 body additionally folds
@                IN4-IN7 into the accumulators (8 taps per stage vs 4)
@   dir        - "up" walks PCOEF forwards (X=0, Y=+4); anything else walks
@                backwards from the end of the 4*JMAX-float block
@   tail       - non-empty: complete the PREVIOUS stage (vadd the two
@                accumulator vectors, store 4 results to POUT)
@   head       - non-empty: begin the NEXT stage (load coefficients, start
@                the multiply-accumulate chain)
@ Head-stage loads/arithmetic are interleaved with the tail stage to hide
@ VFP latency; all vmul/vmla/vadd here are length-4 vector operations
@ (FPSCR is configured by the caller).
.macro inner_loop  decifactor, dir, tail, head
 .ifc "\dir","up"
  .set X, 0                             @ start at first coefficient
  .set Y, 4                             @ and step forwards
 .else
  .set X, 4*JMAX*4 - 4                  @ start at last coefficient
  .set Y, -4                            @ and step backwards
 .endif
 .ifnc "\head",""
        @ Load the first 4 coefficients (one per output sample).
        vldr    COEF0, [PCOEF, #X + (0*JMAX + 0) * Y]
        vldr    COEF1, [PCOEF, #X + (1*JMAX + 0) * Y]
        vldr    COEF2, [PCOEF, #X + (2*JMAX + 0) * Y]
        vldr    COEF3, [PCOEF, #X + (3*JMAX + 0) * Y]
 .endif
 .ifnc "\tail",""
        vadd.f  POST0, ACCUM0, ACCUM4   @ vector operation
 .endif
 .ifnc "\head",""
        vmul.f  ACCUM0, COEF0, IN0      @ vector = vector * scalar
        vldr    COEF4, [PCOEF, #X + (0*JMAX + 1) * Y]
        vldr    COEF5, [PCOEF, #X + (1*JMAX + 1) * Y]
        vldr    COEF6, [PCOEF, #X + (2*JMAX + 1) * Y]
 .endif
 .ifnc "\head",""
        vldr    COEF7, [PCOEF, #X + (3*JMAX + 1) * Y]
  .ifc "\tail",""
        @ No tail interleaved: issue the second multiply immediately.
        vmul.f  ACCUM4, COEF4, IN1      @ vector operation
  .endif
        vldr    COEF0, [PCOEF, #X + (0*JMAX + 2) * Y]
        vldr    COEF1, [PCOEF, #X + (1*JMAX + 2) * Y]
  .ifnc "\tail",""
        @ Same multiply as above, scheduled two loads later so it does not
        @ contend with the tail's vadd still in flight.
        vmul.f  ACCUM4, COEF4, IN1      @ vector operation
  .endif
        vldr    COEF2, [PCOEF, #X + (2*JMAX + 2) * Y]
        vldr    COEF3, [PCOEF, #X + (3*JMAX + 2) * Y]
 .endif
 .ifnc "\tail",""
        vstmia  POUT!, {POST0-POST3}    @ emit previous stage's 4 outputs
 .endif
 .ifnc "\head",""
        vmla.f  ACCUM0, COEF0, IN2      @ vector = vector * scalar
        vldr    COEF4, [PCOEF, #X + (0*JMAX + 3) * Y]
        vldr    COEF5, [PCOEF, #X + (1*JMAX + 3) * Y]
        vldr    COEF6, [PCOEF, #X + (2*JMAX + 3) * Y]
        vldr    COEF7, [PCOEF, #X + (3*JMAX + 3) * Y]
        vmla.f  ACCUM4, COEF4, IN3      @ vector = vector * scalar
  .if \decifactor == 32
        @ decifactor 32: four more taps (IN4-IN7) per stage.
        vldr    COEF0, [PCOEF, #X + (0*JMAX + 4) * Y]
        vldr    COEF1, [PCOEF, #X + (1*JMAX + 4) * Y]
        vldr    COEF2, [PCOEF, #X + (2*JMAX + 4) * Y]
        vldr    COEF3, [PCOEF, #X + (3*JMAX + 4) * Y]
        vmla.f  ACCUM0, COEF0, IN4      @ vector = vector * scalar
        vldr    COEF4, [PCOEF, #X + (0*JMAX + 5) * Y]
        vldr    COEF5, [PCOEF, #X + (1*JMAX + 5) * Y]
        vldr    COEF6, [PCOEF, #X + (2*JMAX + 5) * Y]
        vldr    COEF7, [PCOEF, #X + (3*JMAX + 5) * Y]
        vmla.f  ACCUM4, COEF4, IN5      @ vector = vector * scalar
        vldr    COEF0, [PCOEF, #X + (0*JMAX + 6) * Y]
        vldr    COEF1, [PCOEF, #X + (1*JMAX + 6) * Y]
        vldr    COEF2, [PCOEF, #X + (2*JMAX + 6) * Y]
        vldr    COEF3, [PCOEF, #X + (3*JMAX + 6) * Y]
        vmla.f  ACCUM0, COEF0, IN6      @ vector = vector * scalar
        vldr    COEF4, [PCOEF, #X + (0*JMAX + 7) * Y]
        vldr    COEF5, [PCOEF, #X + (1*JMAX + 7) * Y]
        vldr    COEF6, [PCOEF, #X + (2*JMAX + 7) * Y]
        vldr    COEF7, [PCOEF, #X + (3*JMAX + 7) * Y]
        vmla.f  ACCUM4, COEF4, IN7      @ vector = vector * scalar
  .endif
 .endif
.endm
| 124 | |
@ dca_lfe_fir: emit ff_dca_lfe_fir<decifactor>_vfp.
@ Arguments (see register aliases above): a1 = out, a2 = in, a3 = coefs.
@   decifactor 64: 4 input samples per stage (JMAX=4)
@   decifactor 32: 8 input samples per stage (JMAX=8)
@ The coefficient matrix is walked forwards for the first decifactor/4
@ stages and then backwards for another decifactor/4 stages, using the
@ software-pipelined inner_loop macro (prime / steady-state / drain).
.macro dca_lfe_fir decifactor
function ff_dca_lfe_fir\decifactor\()_vfp, export=1
        fmrx    OLDFPSCR, FPSCR         @ save caller's FP state
        ldr     ip, =0x03030000         @ RunFast mode, short vectors of length 4, stride 1
        fmxr    FPSCR, ip
        @ Input history runs backwards in memory: in[0], in[-1], ...
        vldr    IN0, [PIN, #-0*4]
        vldr    IN1, [PIN, #-1*4]
        vldr    IN2, [PIN, #-2*4]
        vldr    IN3, [PIN, #-3*4]
 .if \decifactor == 32
  .set JMAX, 8
        vpush   {s16-s31}               @ all callee-saved VFP regs are used
        vldr    IN4, [PIN, #-4*4]
        vldr    IN5, [PIN, #-5*4]
        vldr    IN6, [PIN, #-6*4]
        vldr    IN7, [PIN, #-7*4]
 .else
  .set JMAX, 4
        vpush   {s16-s27}               @ s28-s31 unused in this variant
 .endif

        @ Pass 1: coefficients ascending.
        mov     COUNTER, #\decifactor/4 - 1
        inner_loop  \decifactor, up,, head          @ prime the pipeline
1:      add     PCOEF, PCOEF, #4*JMAX*4             @ next coefficient block
        subs    COUNTER, COUNTER, #1
        inner_loop  \decifactor, up, tail, head     @ steady state
        bne     1b
        inner_loop  \decifactor, up, tail           @ drain the pipeline

        @ Pass 2: the same coefficient blocks, descending.
        mov     COUNTER, #\decifactor/4 - 1
        inner_loop  \decifactor, down,, head
1:      sub     PCOEF, PCOEF, #4*JMAX*4
        subs    COUNTER, COUNTER, #1
        inner_loop  \decifactor, down, tail, head
        bne     1b
        inner_loop  \decifactor, down, tail

 .if \decifactor == 32
        vpop    {s16-s31}
 .else
        vpop    {s16-s27}
 .endif
        fmxr    FPSCR, OLDFPSCR         @ restore caller's FP state
        bx      lr
endfunc
.endm
| 171 | |
dca_lfe_fir 64                  @ emits ff_dca_lfe_fir64_vfp
        .ltorg                  @ flush literal pool (the =0x03030000 constant)
                                @ so it stays within ldr's addressing range
dca_lfe_fir 32                  @ emits ff_dca_lfe_fir32_vfp
| 175 | |
@ Release the LFE FIR register aliases; some names (e.g. OLDFPSCR) are
@ re-bound to different registers for the QMF function below.
        .unreq  POUT
        .unreq  PIN
        .unreq  PCOEF
        .unreq  OLDFPSCR
        .unreq  COUNTER

        .unreq  IN0
        .unreq  IN1
        .unreq  IN2
        .unreq  IN3
        .unreq  IN4
        .unreq  IN5
        .unreq  IN6
        .unreq  IN7
        .unreq  COEF0
        .unreq  COEF1
        .unreq  COEF2
        .unreq  COEF3
        .unreq  COEF4
        .unreq  COEF5
        .unreq  COEF6
        .unreq  COEF7
        .unreq  ACCUM0
        .unreq  ACCUM4
        .unreq  POST0
        .unreq  POST1
        .unreq  POST2
        .unreq  POST3
| 204 | |
| 205 | |
@ Register aliases for ff_dca_qmf_32_subbands_vfp (prototype below).
IN       .req   a1              @ samples_in: 32 rows of 8 floats
SBACT    .req   a2              @ sb_act: number of active subbands
OLDFPSCR .req   a3              @ caller's FPSCR, restored before calling out
IMDCT    .req   a4              @ imdct arg; respilled to a1 for each synth call
WINDOW   .req   v1              @ window arg, forwarded to the synth filter
OUT      .req   v2              @ samples_out cursor
BUF      .req   v3              @ cursor into the stack transpose buffer
SCALEINT .req   v4              @ only used in softfp case
COUNT    .req   v5              @ dual-purpose bit-packed counter (see body)

SCALE    .req   s0              @ scale arg (hardfp case only)
| 217 | |
| 218 | /* Stack layout differs in softfp and hardfp cases: |
| 219 | * |
| 220 | * hardfp |
| 221 | * fp -> 6 arg words saved by caller |
| 222 | * a3,a4,v1-v3,v5,fp,lr on entry (a3 just to pad to 8 bytes) |
| 223 | * s16-s23 on entry |
| 224 | * align 16 |
| 225 | * buf -> 8*32*4 bytes buffer |
| 226 | * s0 on entry |
| 227 | * sp -> 3 arg words for callee |
| 228 | * |
| 229 | * softfp |
| 230 | * fp -> 7 arg words saved by caller |
| 231 | * a4,v1-v5,fp,lr on entry |
| 232 | * s16-s23 on entry |
| 233 | * align 16 |
| 234 | * buf -> 8*32*4 bytes buffer |
| 235 | * sp -> 4 arg words for callee |
| 236 | */ |
| 237 | |
| 238 | /* void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act, |
| 239 | * SynthFilterContext *synth, FFTContext *imdct, |
| 240 | * float (*synth_buf_ptr)[512], |
| 241 | * int *synth_buf_offset, float (*synth_buf2)[32], |
| 242 | * const float (*window)[512], float *samples_out, |
| 243 | * float (*raXin)[32], float scale); |
| 244 | */ |
function ff_dca_qmf_32_subbands_vfp, export=1
VFP     push    {a3-a4,v1-v3,v5,fp,lr}  @ a3 pushed only to keep sp 8-aligned
NOVFP   push    {a4,v1-v5,fp,lr}
        add     fp, sp, #8*4            @ fp -> caller-stacked args (see layout above)
        vpush   {s16-s23}               @ callee-saved VFP regs clobbered below
        @ The buffer pointed at by raXin isn't big enough for us to do a
        @ complete matrix transposition as we want to, so allocate an
        @ alternative buffer from the stack. Align to 4 words for speed.
        sub     BUF, sp, #8*32*4        @ 8 rows x 32 floats
        bic     BUF, BUF, #15
        mov     sp, BUF
        ldr     lr, =0x03330000         @ RunFast mode, short vectors of length 4, stride 2
        fmrx    OLDFPSCR, FPSCR
        fmxr    FPSCR, lr
        @ COUNT is used to count down 2 things at once:
        @ bits 0-4 are the number of word pairs remaining in the output row
        @ bits 5-31 are the number of words to copy (with possible negation)
        @ from the source matrix before we start zeroing the remainder
        mov     COUNT, #(-4 << 5) + 16
        adds    COUNT, COUNT, SBACT, lsl #5
        bmi     2f                      @ fewer than 4 active subbands left
        @ Main loop: transpose 4 input rows of 8 samples per iteration into
        @ the stack buffer. With stride 2, each vneg covers 4 alternate regs
        @ (e.g. s8,s10,s12,s14), so input rows 0 and 3 of every group of 4
        @ are fully negated (presumably the QMF modulation sign pattern -
        @ confirm against the C reference).
1:
        vldr    s8, [IN, #(0*8+0)*4]    @ row 0 -> even-numbered regs
        vldr    s10, [IN, #(0*8+1)*4]
        vldr    s12, [IN, #(0*8+2)*4]
        vldr    s14, [IN, #(0*8+3)*4]
        vldr    s16, [IN, #(0*8+4)*4]
        vldr    s18, [IN, #(0*8+5)*4]
        vldr    s20, [IN, #(0*8+6)*4]
        vldr    s22, [IN, #(0*8+7)*4]
        vneg.f  s8, s8                  @ negate row 0 elements 0-3
        vldr    s9, [IN, #(1*8+0)*4]    @ row 1 -> odd-numbered regs
        vldr    s11, [IN, #(1*8+1)*4]
        vldr    s13, [IN, #(1*8+2)*4]
        vldr    s15, [IN, #(1*8+3)*4]
        vneg.f  s16, s16                @ negate row 0 elements 4-7
        vldr    s17, [IN, #(1*8+4)*4]
        vldr    s19, [IN, #(1*8+5)*4]
        vldr    s21, [IN, #(1*8+6)*4]
        vldr    s23, [IN, #(1*8+7)*4]
        @ Each dN = {row even, row odd} pair -> columns 0-1 of output row N.
        vstr    d4, [BUF, #(0*32+0)*4]
        vstr    d5, [BUF, #(1*32+0)*4]
        vstr    d6, [BUF, #(2*32+0)*4]
        vstr    d7, [BUF, #(3*32+0)*4]
        vstr    d8, [BUF, #(4*32+0)*4]
        vstr    d9, [BUF, #(5*32+0)*4]
        vstr    d10, [BUF, #(6*32+0)*4]
        vstr    d11, [BUF, #(7*32+0)*4]
        vldr    s9, [IN, #(3*8+0)*4]    @ row 3 -> odd-numbered regs
        vldr    s11, [IN, #(3*8+1)*4]
        vldr    s13, [IN, #(3*8+2)*4]
        vldr    s15, [IN, #(3*8+3)*4]
        vldr    s17, [IN, #(3*8+4)*4]
        vldr    s19, [IN, #(3*8+5)*4]
        vldr    s21, [IN, #(3*8+6)*4]
        vldr    s23, [IN, #(3*8+7)*4]
        vneg.f  s9, s9                  @ negate row 3 elements 0-3
        vldr    s8, [IN, #(2*8+0)*4]    @ row 2 -> even-numbered regs
        vldr    s10, [IN, #(2*8+1)*4]
        vldr    s12, [IN, #(2*8+2)*4]
        vldr    s14, [IN, #(2*8+3)*4]
        vneg.f  s17, s17                @ negate row 3 elements 4-7
        vldr    s16, [IN, #(2*8+4)*4]
        vldr    s18, [IN, #(2*8+5)*4]
        vldr    s20, [IN, #(2*8+6)*4]
        vldr    s22, [IN, #(2*8+7)*4]
        vstr    d4, [BUF, #(0*32+2)*4]  @ columns 2-3 of each output row
        vstr    d5, [BUF, #(1*32+2)*4]
        vstr    d6, [BUF, #(2*32+2)*4]
        vstr    d7, [BUF, #(3*32+2)*4]
        vstr    d8, [BUF, #(4*32+2)*4]
        vstr    d9, [BUF, #(5*32+2)*4]
        vstr    d10, [BUF, #(6*32+2)*4]
        vstr    d11, [BUF, #(7*32+2)*4]
        add     IN, IN, #4*8*4          @ 4 input rows consumed
        add     BUF, BUF, #4*4          @ 4 output columns produced
        subs    COUNT, COUNT, #(4 << 5) + 2  @ 4 words copied, 2 pairs written
        bpl     1b
2:      @ Now deal with trailing < 4 samples
        adds    COUNT, COUNT, #3 << 5
        bmi     4f                      @ sb_act was a multiple of 4
        bics    lr, COUNT, #0x1F        @ test the words-to-copy field
        bne     3f
        @ sb_act was n*4+1: one real row (negated), one zero row
        vldr    s8, [IN, #(0*8+0)*4]
        vldr    s10, [IN, #(0*8+1)*4]
        vldr    s12, [IN, #(0*8+2)*4]
        vldr    s14, [IN, #(0*8+3)*4]
        vldr    s16, [IN, #(0*8+4)*4]
        vldr    s18, [IN, #(0*8+5)*4]
        vldr    s20, [IN, #(0*8+6)*4]
        vldr    s22, [IN, #(0*8+7)*4]
        vneg.f  s8, s8
        vldr    s9, zero                @ pad partner row with zeros
        vldr    s11, zero
        vldr    s13, zero
        vldr    s15, zero
        vneg.f  s16, s16
        vldr    s17, zero
        vldr    s19, zero
        vldr    s21, zero
        vldr    s23, zero
        vstr    d4, [BUF, #(0*32+0)*4]
        vstr    d5, [BUF, #(1*32+0)*4]
        vstr    d6, [BUF, #(2*32+0)*4]
        vstr    d7, [BUF, #(3*32+0)*4]
        vstr    d8, [BUF, #(4*32+0)*4]
        vstr    d9, [BUF, #(5*32+0)*4]
        vstr    d10, [BUF, #(6*32+0)*4]
        vstr    d11, [BUF, #(7*32+0)*4]
        add     BUF, BUF, #2*4
        sub     COUNT, COUNT, #1        @ one word pair written
        b       4f
3:      @ sb_act was n*4+2 or n*4+3, so do the first 2
        vldr    s8, [IN, #(0*8+0)*4]
        vldr    s10, [IN, #(0*8+1)*4]
        vldr    s12, [IN, #(0*8+2)*4]
        vldr    s14, [IN, #(0*8+3)*4]
        vldr    s16, [IN, #(0*8+4)*4]
        vldr    s18, [IN, #(0*8+5)*4]
        vldr    s20, [IN, #(0*8+6)*4]
        vldr    s22, [IN, #(0*8+7)*4]
        vneg.f  s8, s8
        vldr    s9, [IN, #(1*8+0)*4]
        vldr    s11, [IN, #(1*8+1)*4]
        vldr    s13, [IN, #(1*8+2)*4]
        vldr    s15, [IN, #(1*8+3)*4]
        vneg.f  s16, s16
        vldr    s17, [IN, #(1*8+4)*4]
        vldr    s19, [IN, #(1*8+5)*4]
        vldr    s21, [IN, #(1*8+6)*4]
        vldr    s23, [IN, #(1*8+7)*4]
        vstr    d4, [BUF, #(0*32+0)*4]
        vstr    d5, [BUF, #(1*32+0)*4]
        vstr    d6, [BUF, #(2*32+0)*4]
        vstr    d7, [BUF, #(3*32+0)*4]
        vstr    d8, [BUF, #(4*32+0)*4]
        vstr    d9, [BUF, #(5*32+0)*4]
        vstr    d10, [BUF, #(6*32+0)*4]
        vstr    d11, [BUF, #(7*32+0)*4]
        add     BUF, BUF, #2*4
        sub     COUNT, COUNT, #(2 << 5) + 1
        bics    lr, COUNT, #0x1F        @ any words still to copy?
        bne     4f
        @ sb_act was n*4+3: row 2 real, row 3 zero (zero negates to zero,
        @ so no vneg is needed here)
        vldr    s8, [IN, #(2*8+0)*4]
        vldr    s10, [IN, #(2*8+1)*4]
        vldr    s12, [IN, #(2*8+2)*4]
        vldr    s14, [IN, #(2*8+3)*4]
        vldr    s16, [IN, #(2*8+4)*4]
        vldr    s18, [IN, #(2*8+5)*4]
        vldr    s20, [IN, #(2*8+6)*4]
        vldr    s22, [IN, #(2*8+7)*4]
        vldr    s9, zero
        vldr    s11, zero
        vldr    s13, zero
        vldr    s15, zero
        vldr    s17, zero
        vldr    s19, zero
        vldr    s21, zero
        vldr    s23, zero
        vstr    d4, [BUF, #(0*32+0)*4]
        vstr    d5, [BUF, #(1*32+0)*4]
        vstr    d6, [BUF, #(2*32+0)*4]
        vstr    d7, [BUF, #(3*32+0)*4]
        vstr    d8, [BUF, #(4*32+0)*4]
        vstr    d9, [BUF, #(5*32+0)*4]
        vstr    d10, [BUF, #(6*32+0)*4]
        vstr    d11, [BUF, #(7*32+0)*4]
        add     BUF, BUF, #2*4
        sub     COUNT, COUNT, #1
4:      @ Now fill the remainder with 0
        vldr    s8, zero
        vldr    s9, zero
        ands    COUNT, COUNT, #0x1F     @ word pairs left in the output rows
        beq     6f
5:      vstr    d4, [BUF, #(0*32+0)*4]  @ zero the same pair in all 8 rows
        vstr    d4, [BUF, #(1*32+0)*4]
        vstr    d4, [BUF, #(2*32+0)*4]
        vstr    d4, [BUF, #(3*32+0)*4]
        vstr    d4, [BUF, #(4*32+0)*4]
        vstr    d4, [BUF, #(5*32+0)*4]
        vstr    d4, [BUF, #(6*32+0)*4]
        vstr    d4, [BUF, #(7*32+0)*4]
        add     BUF, BUF, #2*4
        subs    COUNT, COUNT, #1
        bne     5b
6:
        fmxr    FPSCR, OLDFPSCR         @ leave vector mode before calling out
        ldr     WINDOW, [fp, #3*4]      @ stacked arg: window
        ldr     OUT, [fp, #4*4]         @ stacked arg: samples_out
        sub     BUF, BUF, #32*4         @ rewind to last transposed row
NOVFP   ldr     SCALEINT, [fp, #6*4]    @ stacked arg: scale (softfp, as int bits)
        mov     COUNT, #8               @ 8 rows of 32 -> 8 synth-filter calls
VFP     vpush   {SCALE}                 @ keep scale across the calls
VFP     sub     sp, sp, #3*4            @ outgoing stack args (+1 pad word above)
NOVFP   sub     sp, sp, #4*4
7:
VFP     ldr     a1, [fp, #-7*4]         @ imdct
NOVFP   ldr     a1, [fp, #-8*4]
        ldmia   fp, {a2-a4}             @ synth_buf_ptr, synth_buf_offset, synth_buf2
VFP     stmia   sp, {WINDOW, OUT, BUF}  @ stack args for the callee
NOVFP   stmia   sp, {WINDOW, OUT, BUF, SCALEINT}
VFP     vldr    SCALE, [sp, #3*4]       @ reload scale (callee may clobber s0)
        bl      X(ff_synth_filter_float_vfp)
        add     OUT, OUT, #32*4         @ next 32 output samples
        add     BUF, BUF, #32*4         @ next transposed row
        subs    COUNT, COUNT, #1
        bne     7b

        @ Unwind: point sp just below the vpushed s16-s23 (8 core + 8 VFP
        @ words under fp), then pop in reverse order of the prologue.
A       sub     sp, fp, #(8+8)*4
T       sub     fp, fp, #(8+8)*4        @ Thumb: sp arithmetic must go via fp
T       mov     sp, fp
        vpop    {s16-s23}
VFP     pop     {a3-a4,v1-v3,v5,fp,pc}
NOVFP   pop     {a4,v1-v5,fp,pc}
endfunc
| 462 | |
@ Release the QMF register aliases.
        .unreq  IN
        .unreq  SBACT
        .unreq  OLDFPSCR
        .unreq  IMDCT
        .unreq  WINDOW
        .unreq  OUT
        .unreq  BUF
        .unreq  SCALEINT
        .unreq  COUNT

        .unreq  SCALE

        .align  2
@ Single-precision 0.0 constant, loaded PC-relative via "vldr sN, zero".
zero:   .word   0