| 1 | /* |
| 2 | * Copyright (c) 2014 RISC OS Open Ltd |
| 3 | * Author: Ben Avison <bavison@riscosopen.org> |
| 4 | * |
| 5 | * This file is part of FFmpeg. |
| 6 | * |
| 7 | * FFmpeg is free software; you can redistribute it and/or |
| 8 | * modify it under the terms of the GNU Lesser General Public |
| 9 | * License as published by the Free Software Foundation; either |
| 10 | * version 2.1 of the License, or (at your option) any later version. |
| 11 | * |
| 12 | * FFmpeg is distributed in the hope that it will be useful, |
| 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 15 | * Lesser General Public License for more details. |
| 16 | * |
| 17 | * You should have received a copy of the GNU Lesser General Public |
| 18 | * License along with FFmpeg; if not, write to the Free Software |
| 19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| 20 | */ |
| 21 | |
| 22 | #include "libavutil/arm/asm.S" |
| 23 | |
// Buffer-layout constants used for the address arithmetic in this file.
// NOTE(review): presumably these have to agree with the C decoder's
// definitions of the same names - confirm before changing any of them.
//
// MAX_CHANNELS: element stride between consecutive samples of one channel
// (used for the sample_buffer, samples and bypassed_lsbs post-increments).
| 24 | #define MAX_CHANNELS 8 |
// MAX_FIR_ORDER: the FIR coefficients are read from word 0 of the coeff
// array and the IIR coefficients from word MAX_FIR_ORDER.
| 25 | #define MAX_FIR_ORDER 8 |
// MAX_IIR_ORDER is not referenced by the code shown in this file.
| 26 | #define MAX_IIR_ORDER 4 |
| 27 | #define MAX_RATEFACTOR 4 |
// The IIR state is addressed MAX_BLOCKSIZE + MAX_FIR_ORDER words above the
// FIR state pointer.
| 28 | #define MAX_BLOCKSIZE (40 * MAX_RATEFACTOR) |
| 29 | |
// Register aliases for ff_mlp_filter_channel_arm and its helper macros.
//
// PST / PCO: the state and coeff pointers, which arrive in a1 / a2.
| 30 | PST .req a1 |
| 31 | PCO .req a2 |
// AC0 / AC1: low / high word of the multiply-accumulate result. a3 / a4
// arrive holding firorder / iirorder and are only overwritten by the
// accumulator after the jump-table dispatch on those values has happened.
| 32 | AC0 .req a3 |
| 33 | AC1 .req a4 |
// CO0-CO3: coefficient registers; ST0-ST3: state registers. The numeric
// suffix is generated at assembly time by the load/multiply macros.
| 34 | CO0 .req v1 |
| 35 | CO1 .req v2 |
| 36 | CO2 .req v3 |
| 37 | CO3 .req v4 |
// ST0 and ST1 also carry filter_shift and mask when a filter loop is entered;
// ST2 and ST3 double as scratch registers in the post-processing code.
| 38 | ST0 .req v5 |
| 39 | ST1 .req v6 |
| 40 | ST2 .req sl |
| 41 | ST3 .req fp |
// I: number of samples still to process; PSAMP: pointer into sample_buffer.
| 42 | I .req ip |
| 43 | PSAMP .req lr |
| 44 | |
| 45 | |
| 46 | // Some macros that do loads/multiplies where the register number is determined |
| 47 | // from an assembly-time expression. Boy is GNU assembler's syntax ugly... |
| 48 | |
// load: emit "ldr <group><n>, [<base>, #<offset>]", where <n> is the value of
// the assembly-time expression given as index (e.g. group CO, index 2 -> CO2).
// Alternate macro mode is switched on only around the inner call so that
// %(...) is evaluated to a number before load_ pastes it onto the group name.
| 49 | .macro load group, index, base, offset |
| 50 | .altmacro |
| 51 | load_ \group, %(\index), \base, \offset |
| 52 | .noaltmacro |
| 53 | .endm |
| 54 | |
// load_: helper that does the actual emission of a single-word load.
// index must already be a plain number; it is concatenated directly onto
// group to form the register alias name.
| 55 | .macro load_ group, index, base, offset |
| 56 | ldr \group\index, [\base, #\offset] |
| 57 | .endm |
| 58 | |
// loadd: two-word counterpart of the single-register load macro. Evaluates
// index and index+1 at assembly time and hands both numbers to loadd_, which
// loads the consecutive registers <group><index> and <group><index+1> from
// offset and offset+4.
| 59 | .macro loadd group, index, base, offset |
| 60 | .altmacro |
| 61 | loadd_ \group, %(\index), %(\index+1), \base, \offset |
| 62 | .noaltmacro |
| 63 | .endm |
| 64 | |
// loadd_: helper that emits the two-word load; index0 and index1 must already
// be plain numbers.
// NOTE(review): the lines prefixed with "A" are presumably only assembled in
// ARM-state builds (prefix macro expected to come from asm.S) - confirm.
// In that configuration offsets of 256 and above are split into two ldr
// instructions, presumably because the ARM-state ldrd immediate cannot encode
// them; in every other case a single ldrd is used.
| 65 | .macro loadd_ group, index0, index1, base, offset |
| 66 | A .if \offset >= 256 |
| 67 | A ldr \group\index0, [\base, #\offset] |
| 68 | A ldr \group\index1, [\base, #(\offset) + 4] |
| 69 | A .else |
| 70 | ldrd \group\index0, \group\index1, [\base, #\offset] |
| 71 | A .endif |
| 72 | .endm |
| 73 | |
// multiply: emit one multiply of coefficient register CO<n> by state register
// ST<n>, where <n> is the value of the assembly-time expression index.
//   accumulate  non-zero: add the product to the accumulator;
//               zero: overwrite the accumulator with the product
//   long        non-zero: 64-bit result in AC1:AC0; zero: 32-bit result in AC0
// The expression is evaluated here (alternate macro mode) and the instruction
// selection is done by multiply_.
| 74 | .macro multiply index, accumulate, long |
| 75 | .altmacro |
| 76 | multiply_ %(\index), \accumulate, \long |
| 77 | .noaltmacro |
| 78 | .endm |
| 79 | |
// multiply_: helper that picks the instruction for one CO<index> * ST<index>
// product. index must already be a plain number.
//   long, accumulate      -> smlal (AC1:AC0 += product)
//   long, no accumulate   -> smull (AC1:AC0  = product)
//   short, accumulate     -> mla   (AC0 += product, low 32 bits only)
//   short, no accumulate  -> mul   (AC0  = product, low 32 bits only)
| 80 | .macro multiply_ index, accumulate, long |
| 81 | .if \long |
| 82 | .if \accumulate |
| 83 | smlal AC0, AC1, CO\index, ST\index |
| 84 | .else |
| 85 | smull AC0, AC1, CO\index, ST\index |
| 86 | .endif |
| 87 | .else |
| 88 | .if \accumulate |
| 89 | mla AC0, CO\index, ST\index, AC0 |
| 90 | .else |
| 91 | mul AC0, CO\index, ST\index |
| 92 | .endif |
| 93 | .endif |
| 94 | .endm |
| 95 | |
| 96 | // A macro to update the load register number and load offsets |
| 97 | |
// inc: advance the assembly-time load bookkeeping by howmany taps. Emits no
// instructions; it only updates these symbols:
//   LOAD_REG    register index the next load targets, wrapping within 0-3
//   OFFSET_CO   byte offset of the next coefficient relative to PCO
//   OFFSET_ST   byte offset of the next state word relative to PST
//   FIR_REMAIN  FIR taps not yet loaded
//   IIR_REMAIN  IIR taps not yet loaded
// Taps are charged to FIR_REMAIN while any FIR taps are outstanding; at the
// moment the last FIR tap is consumed both offsets are repointed at the start
// of the IIR coefficients and the IIR state. Otherwise the taps are charged
// to IIR_REMAIN.
| 98 | .macro inc howmany |
| 99 | .set LOAD_REG, (LOAD_REG + \howmany) & 3 |
| 100 | .set OFFSET_CO, OFFSET_CO + 4 * \howmany |
| 101 | .set OFFSET_ST, OFFSET_ST + 4 * \howmany |
| 102 | .if FIR_REMAIN > 0 |
| 103 | .set FIR_REMAIN, FIR_REMAIN - \howmany |
| 104 | .if FIR_REMAIN == 0 |
| 105 | .set OFFSET_CO, 4 * MAX_FIR_ORDER |
| 106 | .set OFFSET_ST, 4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER) |
| 107 | .endif |
| 108 | .elseif IIR_REMAIN > 0 |
| 109 | .set IIR_REMAIN, IIR_REMAIN - \howmany |
| 110 | .endif |
| 111 | .endm |
| 112 | |
| 113 | // Macro to implement the inner loop for one specific combination of parameters |
| 114 | |
// implement_filter: emit the complete per-sample loop for one fixed
// combination of parameters, followed by a branch to the function exit
// (label 99).
//   mask_minus1  non-zero when the caller has established that mask == -1,
//                so the AND with MASK is omitted
//   shift_0      non-zero when filter_shift == 0; only the low word of the
//                accumulator is then computed
//   shift_8      non-zero when filter_shift == 8; the shift amount is then an
//                immediate and no SHIFT register is needed
//   iir_taps     number of IIR taps (assembly-time constant)
//   fir_taps     number of FIR taps (assembly-time constant)
// Register state expected on entry (as set up by ff_mlp_filter_channel_arm):
//   PST = state, PCO = coeff, ST0 = filter_shift, ST1 = mask,
//   I = blocksize, PSAMP = sample_buffer.
| 115 | .macro implement_filter mask_minus1, shift_0, shift_8, iir_taps, fir_taps |
| 116 | .set TOTAL_TAPS, \iir_taps + \fir_taps |
| 117 | |
| 118 | // Deal with register allocation... |
// For SHIFT and MASK respectively:
//   DEFINED_x  an alias was created by this expansion and must be released
//              again at the end of the macro
//   SHUFFLE_x  the value is copied out of its arrival register (ST0 / ST1)
//              into a register that is free for the whole loop
//   SPILL_x    no register is free; the alias is ST0 / ST1 and the value is
//              reloaded from the stack arguments in every iteration
| 119 | .set DEFINED_SHIFT, 0 |
| 120 | .set DEFINED_MASK, 0 |
| 121 | .set SHUFFLE_SHIFT, 0 |
| 122 | .set SHUFFLE_MASK, 0 |
| 123 | .set SPILL_SHIFT, 0 |
| 124 | .set SPILL_MASK, 0 |
| 125 | .if TOTAL_TAPS == 0 |
| 126 | // Little register pressure in this case - just keep MASK where it was |
| 127 | .if !\mask_minus1 |
| 128 | MASK .req ST1 |
| 129 | .set DEFINED_MASK, 1 |
| 130 | .endif |
| 131 | .else |
| 132 | .if \shift_0 |
| 133 | .if !\mask_minus1 |
| 134 | // AC1 is unused with shift 0 |
| 135 | MASK .req AC1 |
| 136 | .set DEFINED_MASK, 1 |
| 137 | .set SHUFFLE_MASK, 1 |
| 138 | .endif |
| 139 | .elseif \shift_8 |
| 140 | .if !\mask_minus1 |
| 141 | .if TOTAL_TAPS <= 4 |
| 142 | // All coefficients are preloaded (so pointer not needed) |
| 143 | MASK .req PCO |
| 144 | .set DEFINED_MASK, 1 |
| 145 | .set SHUFFLE_MASK, 1 |
| 146 | .else |
| 147 | .set SPILL_MASK, 1 |
| 148 | .endif |
| 149 | .endif |
| 150 | .else // shift not 0 or 8 |
| 151 | .if TOTAL_TAPS <= 3 |
| 152 | // All coefficients are preloaded, and at least one CO register is unused |
// With an odd FIR tap count loading starts at CO1, so CO0 is the spare
// register; otherwise loading starts at CO0 and CO3 is the spare one.
| 153 | .if \fir_taps & 1 |
| 154 | SHIFT .req CO0 |
| 155 | .set DEFINED_SHIFT, 1 |
| 156 | .set SHUFFLE_SHIFT, 1 |
| 157 | .else |
| 158 | SHIFT .req CO3 |
| 159 | .set DEFINED_SHIFT, 1 |
| 160 | .set SHUFFLE_SHIFT, 1 |
| 161 | .endif |
| 162 | .if !\mask_minus1 |
| 163 | MASK .req PCO |
| 164 | .set DEFINED_MASK, 1 |
| 165 | .set SHUFFLE_MASK, 1 |
| 166 | .endif |
| 167 | .elseif TOTAL_TAPS == 4 |
| 168 | // All coefficients are preloaded |
| 169 | SHIFT .req PCO |
| 170 | .set DEFINED_SHIFT, 1 |
| 171 | .set SHUFFLE_SHIFT, 1 |
| 172 | .if !\mask_minus1 |
| 173 | .set SPILL_MASK, 1 |
| 174 | .endif |
| 175 | .else |
| 176 | .set SPILL_SHIFT, 1 |
| 177 | .if !\mask_minus1 |
| 178 | .set SPILL_MASK, 1 |
| 179 | .endif |
| 180 | .endif |
| 181 | .endif |
| 182 | .endif |
| 183 | .if SPILL_SHIFT |
| 184 | SHIFT .req ST0 |
| 185 | .set DEFINED_SHIFT, 1 |
| 186 | .endif |
| 187 | .if SPILL_MASK |
| 188 | MASK .req ST1 |
| 189 | .set DEFINED_MASK, 1 |
| 190 | .endif |
| 191 | |
| 192 | // Preload coefficients if possible |
// With at most four taps every coefficient fits in CO0-CO3, so they are
// loaded once here (FIR ones from offset 0, IIR ones from 4 * MAX_FIR_ORDER)
// and the loop body only loads state words.
| 193 | .if TOTAL_TAPS <= 4 |
| 194 | .set OFFSET_CO, 0 |
| 195 | .if \fir_taps & 1 |
| 196 | .set LOAD_REG, 1 |
| 197 | .else |
| 198 | .set LOAD_REG, 0 |
| 199 | .endif |
| 200 | .rept \fir_taps |
| 201 | load CO, LOAD_REG, PCO, OFFSET_CO |
| 202 | .set LOAD_REG, (LOAD_REG + 1) & 3 |
| 203 | .set OFFSET_CO, OFFSET_CO + 4 |
| 204 | .endr |
| 205 | .set OFFSET_CO, 4 * MAX_FIR_ORDER |
| 206 | .rept \iir_taps |
| 207 | load CO, LOAD_REG, PCO, OFFSET_CO |
| 208 | .set LOAD_REG, (LOAD_REG + 1) & 3 |
| 209 | .set OFFSET_CO, OFFSET_CO + 4 |
| 210 | .endr |
| 211 | .endif |
| 212 | |
| 213 | // Move mask/shift to final positions if necessary |
| 214 | // Need to do this after preloading, because in some cases we |
| 215 | // reuse the coefficient pointer register |
| 216 | .if SHUFFLE_SHIFT |
| 217 | mov SHIFT, ST0 |
| 218 | .endif |
| 219 | .if SHUFFLE_MASK |
| 220 | mov MASK, ST1 |
| 221 | .endif |
| 222 | |
| 223 | // Begin loop |
| 224 | 01: |
| 225 | .if TOTAL_TAPS == 0 |
| 226 | // Things simplify a lot in this case |
| 227 | // In fact this could be pipelined further if it's worth it... |
// No taps: the (optionally masked) input sample is written to the FIR state,
// to the IIR state and back to the sample buffer unchanged otherwise.
| 228 | ldr ST0, [PSAMP] |
| 229 | subs I, I, #1 |
| 230 | .if !\mask_minus1 |
| 231 | and ST0, ST0, MASK |
| 232 | .endif |
| 233 | str ST0, [PST, #-4]! |
| 234 | str ST0, [PST, #4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)] |
| 235 | str ST0, [PSAMP], #4 * MAX_CHANNELS |
| 236 | bne 01b |
| 237 | .else |
// Reset the assembly-time bookkeeping for the loop body. The starting
// register index is the same one used for the coefficient preload above.
| 238 | .if \fir_taps & 1 |
| 239 | .set LOAD_REG, 1 |
| 240 | .else |
| 241 | .set LOAD_REG, 0 |
| 242 | .endif |
| 243 | .set LOAD_BANK, 0 |
| 244 | .set FIR_REMAIN, \fir_taps |
| 245 | .set IIR_REMAIN, \iir_taps |
| 246 | .if FIR_REMAIN == 0 // only IIR terms |
| 247 | .set OFFSET_CO, 4 * MAX_FIR_ORDER |
| 248 | .set OFFSET_ST, 4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER) |
| 249 | .else |
| 250 | .set OFFSET_CO, 0 |
| 251 | .set OFFSET_ST, 0 |
| 252 | .endif |
| 253 | .set MUL_REG, LOAD_REG |
| 254 | .set COUNTER, 0 |
// Unrolled, software-pipelined body: each pass of this .rept issues at most
// one load step and, from the third pass onwards, one multiply, so that the
// multiplies trail the loads that feed them. TOTAL_TAPS multiplies are
// emitted in total.
| 255 | .rept TOTAL_TAPS + 2 |
| 256 | // Do load(s) |
// The first tap (and the second one when the FIR tap count is even) is loaded
// singly, which leaves LOAD_REG at 2; all later taps are loaded in pairs,
// alternating between a coefficient pair (LOAD_BANK 0) and the matching state
// pair (LOAD_BANK 1). A final unpaired IIR tap falls back to single loads.
// Coefficient loads are only emitted when they were not preloaded.
| 257 | .if FIR_REMAIN != 0 || IIR_REMAIN != 0 |
| 258 | .if COUNTER == 0 |
| 259 | .if TOTAL_TAPS > 4 |
| 260 | load CO, LOAD_REG, PCO, OFFSET_CO |
| 261 | .endif |
| 262 | load ST, LOAD_REG, PST, OFFSET_ST |
| 263 | inc 1 |
| 264 | .elseif COUNTER == 1 && (\fir_taps & 1) == 0 |
| 265 | .if TOTAL_TAPS > 4 |
| 266 | load CO, LOAD_REG, PCO, OFFSET_CO |
| 267 | .endif |
| 268 | load ST, LOAD_REG, PST, OFFSET_ST |
| 269 | inc 1 |
| 270 | .elseif LOAD_BANK == 0 |
| 271 | .if TOTAL_TAPS > 4 |
| 272 | .if FIR_REMAIN == 0 && IIR_REMAIN == 1 |
| 273 | load CO, LOAD_REG, PCO, OFFSET_CO |
| 274 | .else |
| 275 | loadd CO, LOAD_REG, PCO, OFFSET_CO |
| 276 | .endif |
| 277 | .endif |
| 278 | .set LOAD_BANK, 1 |
| 279 | .else |
| 280 | .if FIR_REMAIN == 0 && IIR_REMAIN == 1 |
| 281 | load ST, LOAD_REG, PST, OFFSET_ST |
| 282 | inc 1 |
| 283 | .else |
| 284 | loadd ST, LOAD_REG, PST, OFFSET_ST |
| 285 | inc 2 |
| 286 | .endif |
| 287 | .set LOAD_BANK, 0 |
| 288 | .endif |
| 289 | .endif |
| 290 | |
| 291 | // Do interleaved multiplies, slightly delayed |
// The first multiply (COUNTER == 2) initialises the accumulator, all later
// ones add to it; 64-bit products are used unless the shift is 0.
| 292 | .if COUNTER >= 2 |
| 293 | multiply MUL_REG, COUNTER > 2, !\shift_0 |
| 294 | .set MUL_REG, (MUL_REG + 1) & 3 |
| 295 | .endif |
| 296 | .set COUNTER, COUNTER + 1 |
| 297 | .endr |
| 298 | |
| 299 | // Post-process the result of the multiplies |
// Spilled values are fetched from the caller's stack arguments: the function
// prologue pushed nine words, after which word 0 is filter_shift and word 1
// is mask.
| 300 | .if SPILL_SHIFT |
| 301 | ldr SHIFT, [sp, #9*4 + 0*4] |
| 302 | .endif |
| 303 | .if SPILL_MASK |
| 304 | ldr MASK, [sp, #9*4 + 1*4] |
| 305 | .endif |
// ST2 = input sample
| 306 | ldr ST2, [PSAMP] |
| 307 | subs I, I, #1 |
// AC0 = low word of (AC1:AC0 >> shift). Nothing to do for a shift of 0.
| 308 | .if \shift_8 |
| 309 | mov AC0, AC0, lsr #8 |
| 310 | orr AC0, AC0, AC1, lsl #24 |
| 311 | .elseif !\shift_0 |
| 312 | rsb ST3, SHIFT, #32 |
| 313 | mov AC0, AC0, lsr SHIFT |
| 314 | A orr AC0, AC0, AC1, lsl ST3 |
| 315 | T mov AC1, AC1, lsl ST3 |
| 316 | T orr AC0, AC0, AC1 |
| 317 | .endif |
// ST3 = output = (sample + shifted accumulator) & mask
// ST2 = output - shifted accumulator, which is simply the input sample when
//       no masking is applied
| 318 | .if \mask_minus1 |
| 319 | add ST3, ST2, AC0 |
| 320 | .else |
| 321 | add ST2, ST2, AC0 |
| 322 | and ST3, ST2, MASK |
| 323 | sub ST2, ST3, AC0 |
| 324 | .endif |
// Step PST down one word and store the output as the newest FIR state word,
// store ST2 as the corresponding IIR state word, then overwrite the input
// sample with the output and advance to the next sample of this channel.
| 325 | str ST3, [PST, #-4]! |
| 326 | str ST2, [PST, #4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)] |
| 327 | str ST3, [PSAMP], #4 * MAX_CHANNELS |
| 328 | bne 01b |
| 329 | .endif |
| 330 | b 99f |
| 331 | |
// Release the aliases created above so that the next expansion of this macro
// can define them afresh.
| 332 | .if DEFINED_SHIFT |
| 333 | .unreq SHIFT |
| 334 | .endif |
| 335 | .if DEFINED_MASK |
| 336 | .unreq MASK |
| 337 | .endif |
| 338 | .endm |
| 339 | |
// switch_on_fir_taps: jump-table dispatch on the run-time FIR order in a3 to
// an implement_filter expansion specialised for that tap count. The first
// three parameters and iir_taps are passed through unchanged.
// Only FIR tap counts up to 8 - iir_taps get a table entry and an expansion.
// NOTE(review): a3 is not range-checked here; an out-of-range value would
// index past the end of the table - confirm that callers guarantee the range.
// NOTE(review): lines prefixed "A" / "T" are presumably assembled only for
// ARM-state / Thumb builds respectively (prefix macros expected from asm.S).
| 340 | .macro switch_on_fir_taps mask_minus1, shift_0, shift_8, iir_taps |
| 341 | A ldr pc, [pc, a3, lsl #2] // firorder is in range 0-(8-iir_taps) |
| 342 | T tbh [pc, a3, lsl #1] |
| 343 | 0: |
// The ARM-state table starts with a dummy word: pc reads as the address of
// the ldr plus 8 there, so index 0 selects the second word of the table.
// The Thumb table holds halfword offsets relative to label 0 instead.
| 344 | A .word 0, 70f, 71f, 72f, 73f, 74f |
| 345 | T .hword (70f - 0b) / 2, (71f - 0b) / 2, (72f - 0b) / 2, (73f - 0b) / 2, (74f - 0b) / 2 |
| 346 | .if \iir_taps <= 3 |
| 347 | A .word 75f |
| 348 | T .hword (75f - 0b) / 2 |
| 349 | .if \iir_taps <= 2 |
| 350 | A .word 76f |
| 351 | T .hword (76f - 0b) / 2 |
| 352 | .if \iir_taps <= 1 |
| 353 | A .word 77f |
| 354 | T .hword (77f - 0b) / 2 |
| 355 | .if \iir_taps == 0 |
| 356 | A .word 78f |
| 357 | T .hword (78f - 0b) / 2 |
| 358 | .endif |
| 359 | .endif |
| 360 | .endif |
| 361 | .endif |
// One specialised loop per FIR tap count; each one ends with its own branch
// to the function exit, so there is no fall-through between them.
| 362 | 70: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 0 |
| 363 | 71: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 1 |
| 364 | 72: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 2 |
| 365 | 73: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 3 |
| 366 | 74: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 4 |
| 367 | .if \iir_taps <= 3 |
| 368 | 75: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 5 |
| 369 | .if \iir_taps <= 2 |
| 370 | 76: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 6 |
| 371 | .if \iir_taps <= 1 |
| 372 | 77: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 7 |
| 373 | .if \iir_taps == 0 |
| 374 | 78: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 8 |
| 375 | .endif |
| 376 | .endif |
| 377 | .endif |
| 378 | .endif |
| 379 | .endm |
| 380 | |
// switch_on_iir_taps: jump-table dispatch on the run-time IIR order in a4.
// Each target is a switch_on_fir_taps expansion for that IIR tap count, which
// in turn dispatches on the FIR order. The three parameters are passed
// through unchanged.
// NOTE(review): a4 is not range-checked here - confirm that callers
// guarantee the 0-4 range.
| 381 | .macro switch_on_iir_taps mask_minus1, shift_0, shift_8 |
| 382 | A ldr pc, [pc, a4, lsl #2] // iirorder is in range 0-4 |
| 383 | T tbh [pc, a4, lsl #1] |
| 384 | 0: |
// The ARM-state table starts with a dummy word: pc reads as the address of
// the ldr plus 8 there, so index 0 selects the second word of the table.
| 385 | A .word 0, 60f, 61f, 62f, 63f, 64f |
| 386 | T .hword (60f - 0b) / 2, (61f - 0b) / 2, (62f - 0b) / 2, (63f - 0b) / 2, (64f - 0b) / 2 |
| 387 | 60: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 0 |
| 388 | 61: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 1 |
| 389 | 62: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 2 |
| 390 | 63: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 3 |
| 391 | 64: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 4 |
| 392 | .endm |
| 393 | |
| 394 | /* void ff_mlp_filter_channel_arm(int32_t *state, const int32_t *coeff, |
| 395 | * int firorder, int iirorder, |
| 396 | * unsigned int filter_shift, int32_t mask, |
| 397 | * int blocksize, int32_t *sample_buffer); |
| 398 | */ |
// Entry point. Fetches the four stack arguments, then selects one of six
// switch_on_iir_taps expansions according to whether mask == -1 and whether
// filter_shift is 0, 8 or anything else. Those expansions dispatch further on
// iirorder (a4) and firorder (a3); every specialised loop finishes by
// branching to label 99 at the end of this function.
| 399 | function ff_mlp_filter_channel_arm, export=1 |
// Nine registers are saved, so the stack arguments start at sp + 9*4.
| 400 | push {v1-fp,lr} |
| 401 | add v1, sp, #9*4 // point at arguments on stack |
// ST0 = filter_shift, ST1 = mask, I = blocksize, PSAMP = sample_buffer
| 402 | ldm v1, {ST0,ST1,I,PSAMP} |
| 403 | cmp ST1, #-1 |
| 404 | bne 30f |
// mask == -1 variants. After the movs, Z is set when the low three bits of
// the shift are zero (shift is 0 or 8) and C holds bit 3 of the shift, which
// tells 8 apart from 0.
| 405 | movs ST2, ST0, lsl #29 // shift is in range 0-15; we want to special-case 0 and 8 |
| 406 | bne 20f |
| 407 | bcs 10f |
| 408 | switch_on_iir_taps 1, 1, 0 |
| 409 | 10: switch_on_iir_taps 1, 0, 1 |
| 410 | 20: switch_on_iir_taps 1, 0, 0 |
// mask != -1 variants, selected by the same test on the shift.
| 411 | 30: movs ST2, ST0, lsl #29 // shift is in range 0-15; we want to special-case 0 and 8 |
| 412 | bne 50f |
| 413 | bcs 40f |
| 414 | switch_on_iir_taps 0, 1, 0 |
| 415 | 40: switch_on_iir_taps 0, 0, 1 |
| 416 | 50: switch_on_iir_taps 0, 0, 0 |
// Common exit: restore the saved registers and return.
| 417 | 99: pop {v1-fp,pc} |
| 418 | endfunc |
| 419 | |
// Release the register aliases used by the filter code. Several of these
// names (PCO, AC0, AC1, CO0-CO3) are defined again, partly with different
// registers, for the rematrix code further down.
| 420 | .unreq PST |
| 421 | .unreq PCO |
| 422 | .unreq AC0 |
| 423 | .unreq AC1 |
| 424 | .unreq CO0 |
| 425 | .unreq CO1 |
| 426 | .unreq CO2 |
| 427 | .unreq CO3 |
| 428 | .unreq ST0 |
| 429 | .unreq ST1 |
| 430 | .unreq ST2 |
| 431 | .unreq ST3 |
| 432 | .unreq I |
| 433 | .unreq PSAMP |
| 434 | |
| 435 | /********************************************************************/ |
| 436 | |
// Register aliases for ff_mlp_rematrix_channel_arm and its helper macros.
| 437 | PSA .req a1 // samples |
| 438 | PCO .req a2 // coeffs |
| 439 | PBL .req a3 // bypassed_lsbs |
// a4 arrives holding noise_buffer; the function saves that on the stack
// before reusing the register as the packed INDEX counter.
| 440 | INDEX .req a4 |
// CO0-CO3: coefficient registers; SA0-SA3: sample registers.
| 441 | CO0 .req v1 |
| 442 | CO1 .req v2 |
| 443 | CO2 .req v3 |
| 444 | CO3 .req v4 |
| 445 | SA0 .req v5 |
| 446 | SA1 .req v6 |
| 447 | SA2 .req sl |
| 448 | SA3 .req fp |
// AC0 / AC1: low / high word of the 64-bit multiply-accumulate result.
| 449 | AC0 .req ip |
| 450 | AC1 .req lr |
// The following four names share registers with SA0-SA3. They are only used
// once the sample registers are no longer needed in an iteration; NOISE, DCH
// and MASK are refilled from the stack each time round the loop.
| 451 | NOISE .req SA0 |
| 452 | LSB .req SA1 |
| 453 | DCH .req SA2 // dest_ch |
| 454 | MASK .req SA3 |
| 455 | |
| 456 | // INDEX is used as follows: |
| 457 | // bits 0..6 index2 (values up to 17, but wider so that we can |
| 458 | // add to index field without needing to mask) |
| 459 | // bits 7..14 i (values up to 160) |
| 460 | // bit 15 underflow detect for i |
| 461 | // bits 25..31 (if access_unit_size_pow2 == 128) \ index |
| 462 | // bits 26..31 (if access_unit_size_pow2 == 64) / |
| 463 | |
// implement_rematrix: emit the per-sample loop for one fixed combination of
// parameters, followed by a branch to the function exit (label 98).
//   shift        matrix_noise_shift (assembly-time constant); 0 selects the
//                variant that does not read the noise buffer
//   index_mask   63 or 127: whether the top 6 or the top 7 bits of INDEX hold
//                the noise index; not evaluated when shift is 0
//   mask_minus1  non-zero when mask == -1, so the AND with MASK is omitted
//   maxchan      1, 5 or 7; maxchan + 1 coefficient * sample products are
//                summed per output sample
// Expects PSA, PCO, PBL and INDEX to be set up, and the three words at sp to
// hold the noise buffer pointer, the biased dest_ch and the mask.
| 464 | .macro implement_rematrix shift, index_mask, mask_minus1, maxchan |
| 465 | .if \maxchan == 1 |
| 466 | // We can just leave the coefficients in registers in this case |
| 467 | ldrd CO0, CO1, [PCO] |
| 468 | .endif |
| 469 | 1: |
// Sum of products. In every variant the final product (CO1 * SA1) is left
// for the smlal that follows the register reload further down.
| 470 | .if \maxchan == 1 |
| 471 | ldrd SA0, SA1, [PSA] |
| 472 | smull AC0, AC1, CO0, SA0 |
| 473 | .elseif \maxchan == 5 |
| 474 | ldr CO0, [PCO, #0] |
| 475 | ldr SA0, [PSA, #0] |
| 476 | ldr CO1, [PCO, #4] |
| 477 | ldr SA1, [PSA, #4] |
| 478 | ldrd CO2, CO3, [PCO, #8] |
| 479 | smull AC0, AC1, CO0, SA0 |
| 480 | ldrd SA2, SA3, [PSA, #8] |
| 481 | smlal AC0, AC1, CO1, SA1 |
| 482 | ldrd CO0, CO1, [PCO, #16] |
| 483 | smlal AC0, AC1, CO2, SA2 |
| 484 | ldrd SA0, SA1, [PSA, #16] |
| 485 | smlal AC0, AC1, CO3, SA3 |
| 486 | smlal AC0, AC1, CO0, SA0 |
| 487 | .else // \maxchan == 7 |
| 488 | ldr CO2, [PCO, #0] |
| 489 | ldr SA2, [PSA, #0] |
| 490 | ldr CO3, [PCO, #4] |
| 491 | ldr SA3, [PSA, #4] |
| 492 | ldrd CO0, CO1, [PCO, #8] |
| 493 | smull AC0, AC1, CO2, SA2 |
| 494 | ldrd SA0, SA1, [PSA, #8] |
| 495 | smlal AC0, AC1, CO3, SA3 |
| 496 | ldrd CO2, CO3, [PCO, #16] |
| 497 | smlal AC0, AC1, CO0, SA0 |
| 498 | ldrd SA2, SA3, [PSA, #16] |
| 499 | smlal AC0, AC1, CO1, SA1 |
| 500 | ldrd CO0, CO1, [PCO, #24] |
| 501 | smlal AC0, AC1, CO2, SA2 |
| 502 | ldrd SA0, SA1, [PSA, #24] |
| 503 | smlal AC0, AC1, CO3, SA3 |
| 504 | smlal AC0, AC1, CO0, SA0 |
| 505 | .endif |
// Refill NOISE, DCH and MASK from the stack. This overwrites SA0, SA2 and
// SA3; SA1 survives for the last product and is only reused as LSB after it.
| 506 | ldm sp, {NOISE, DCH, MASK} |
| 507 | smlal AC0, AC1, CO1, SA1 |
| 508 | .if \shift != 0 |
// Form the noise byte address from the index field at the top of INDEX, load
// the signed noise byte, and advance the index field by adding the low bits
// of INDEX (index2) into the top of the word.
| 509 | .if \index_mask == 63 |
| 510 | add NOISE, NOISE, INDEX, lsr #32-6 |
| 511 | ldrb LSB, [PBL], #MAX_CHANNELS |
| 512 | ldrsb NOISE, [NOISE] |
| 513 | add INDEX, INDEX, INDEX, lsl #32-6 |
| 514 | .else // \index_mask == 127 |
| 515 | add NOISE, NOISE, INDEX, lsr #32-7 |
| 516 | ldrb LSB, [PBL], #MAX_CHANNELS |
| 517 | ldrsb NOISE, [NOISE] |
| 518 | add INDEX, INDEX, INDEX, lsl #32-7 |
| 519 | .endif |
// Count down the i field; the borrow out of it eventually sets bit 15.
| 520 | sub INDEX, INDEX, #1<<7 |
// Add the noise value, shifted left by shift + 7, to the 64-bit accumulator;
// the high word receives the sign of the noise value plus the carry.
| 521 | adds AC0, AC0, NOISE, lsl #\shift + 7 |
| 522 | adc AC1, AC1, NOISE, asr #31 |
| 523 | .else |
| 524 | ldrb LSB, [PBL], #MAX_CHANNELS |
| 525 | sub INDEX, INDEX, #1<<7 |
| 526 | .endif |
| 527 | add PSA, PSA, #MAX_CHANNELS*4 |
// AC0 = bits 14..45 of the accumulator
| 528 | mov AC0, AC0, lsr #14 |
| 529 | orr AC0, AC0, AC1, lsl #18 |
| 530 | .if !\mask_minus1 |
| 531 | and AC0, AC0, MASK |
| 532 | .endif |
| 533 | add AC0, AC0, LSB |
// Loop until the i field has underflowed into bit 15.
| 534 | tst INDEX, #1<<15 |
| 535 | str AC0, [PSA, DCH, lsl #2] // DCH is precompensated for the early increment of PSA |
| 536 | beq 1b |
| 537 | b 98f |
| 538 | .endm |
| 539 | |
// switch_on_maxchan: run-time selection of the implement_rematrix expansion
// matching maxchan, which the calling function has loaded into v4 and has
// already restricted to the values 1, 5 and 7. Values below 5 take the
// maxchan == 1 loop, 5 takes the maxchan == 5 loop, anything else falls
// through to the maxchan == 7 loop. The parameters are passed through.
| 540 | .macro switch_on_maxchan shift, index_mask, mask_minus1 |
| 541 | cmp v4, #5 |
| 542 | blo 51f |
| 543 | beq 50f |
| 544 | implement_rematrix \shift, \index_mask, \mask_minus1, 7 |
| 545 | 50: implement_rematrix \shift, \index_mask, \mask_minus1, 5 |
| 546 | 51: implement_rematrix \shift, \index_mask, \mask_minus1, 1 |
| 547 | .endm |
| 548 | |
// switch_on_mask: run-time selection on the mask argument, which the calling
// function has loaded into sl. A mask of -1 selects the loops that skip the
// masking step (mask_minus1 = 1); any other value selects the loops that
// apply it (mask_minus1 = 0).
| 549 | .macro switch_on_mask shift, index_mask |
| 550 | cmp sl, #-1 |
| 551 | bne 40f |
| 552 | switch_on_maxchan \shift, \index_mask, 1 |
| 553 | 40: switch_on_maxchan \shift, \index_mask, 0 |
| 554 | .endm |
| 555 | |
// switch_on_au_size: for a given assembly-time matrix_noise_shift, select
// the loops matching access_unit_size_pow2 (in v6, already restricted to 64
// or 128 by the calling function) and place the noise index (v1) in the top
// 6 or 7 bits of INDEX accordingly.
// With a shift of 0 the loops never read the noise buffer, so no index field
// is set up and a placeholder is passed as index_mask; the loop macro does
// not evaluate it in that case.
| 556 | .macro switch_on_au_size shift |
| 557 | .if \shift == 0 |
| 558 | switch_on_mask \shift, undefined |
| 559 | .else |
| 560 | teq v6, #64 |
| 561 | bne 30f |
| 562 | orr INDEX, INDEX, v1, lsl #32-6 |
| 563 | switch_on_mask \shift, 63 |
| 564 | 30: orr INDEX, INDEX, v1, lsl #32-7 |
| 565 | switch_on_mask \shift, 127 |
| 566 | .endif |
| 567 | .endm |
| 568 | |
| 569 | /* void ff_mlp_rematrix_channel_arm(int32_t *samples, |
| 570 | * const int32_t *coeffs, |
| 571 | * const uint8_t *bypassed_lsbs, |
| 572 | * const int8_t *noise_buffer, |
| 573 | * int index, |
| 574 | * unsigned int dest_ch, |
| 575 | * uint16_t blockpos, |
| 576 | * unsigned int maxchan, |
| 577 | * int matrix_noise_shift, |
| 578 | * int access_unit_size_pow2, |
| 579 | * int32_t mask); |
| 580 | */ |
// Entry point. Validates maxchan and access_unit_size_pow2, falling back to
// the C implementation for combinations that have no specialised loop, then
// builds the packed INDEX counter and dispatches on matrix_noise_shift.
// Every specialised loop finishes by branching to label 98.
| 581 | function ff_mlp_rematrix_channel_arm, export=1 |
// Nine registers are saved, so the stack arguments start at sp + 9*4.
| 582 | push {v1-fp,lr} |
| 583 | add v1, sp, #9*4 // point at arguments on stack |
// v1 = index, v2 = dest_ch, v3 = blockpos, v4 = maxchan,
// v5 = matrix_noise_shift, v6 = access_unit_size_pow2, sl = mask
| 584 | ldm v1, {v1-sl} |
// Only maxchan values of 1, 5 and 7 are handled here.
| 585 | teq v4, #1 |
| 586 | itt ne |
| 587 | teqne v4, #5 |
| 588 | teqne v4, #7 |
| 589 | bne 99f |
// Only access unit sizes of 64 and 128 are handled here.
| 590 | teq v6, #64 |
| 591 | it ne |
| 592 | teqne v6, #128 |
| 593 | bne 99f |
// Bias dest_ch because the loops advance PSA by MAX_CHANNELS words before
// storing the result.
| 594 | sub v2, v2, #MAX_CHANNELS |
// a4 still holds noise_buffer at this point; it becomes INDEX just below.
| 595 | push {a4,v2,sl} // initialise NOISE,DCH,MASK; make sp dword-aligned |
// Place blockpos in the i field of INDEX (bit 7 upwards).
| 596 | movs INDEX, v3, lsl #7 |
| 597 | beq 98f // just in case, do nothing if blockpos = 0 |
| 598 | subs INDEX, INDEX, #1<<7 // offset by 1 so we borrow at the right time |
| 599 | adc lr, v1, v1 // calculate index2 (C was set by preceding subs) |
| 600 | orr INDEX, INDEX, lr |
| 601 | // Switch on matrix_noise_shift: values 0 and 1 are |
| 602 | // disproportionately common so do those in a form the branch |
| 603 | // predictor can accelerate. Values can only go up to 15. |
| 604 | cmp v5, #1 |
| 605 | beq 11f |
| 606 | blo 10f |
// Values of 2 and above go through a jump table. The entries for 0 and 1 are
// placeholders, since those values were handled by the branches above; the
// ARM-state table has one further leading dummy word because pc reads as the
// address of the ldr plus 8 there.
| 607 | A ldr pc, [pc, v5, lsl #2] |
| 608 | T tbh [pc, v5, lsl #1] |
| 609 | 0: |
| 610 | A .word 0, 0, 0, 12f, 13f, 14f, 15f, 16f, 17f, 18f, 19f, 20f, 21f, 22f, 23f, 24f, 25f |
| 611 | T .hword 0, 0, (12f - 0b) / 2, (13f - 0b) / 2, (14f - 0b) / 2, (15f - 0b) / 2 |
| 612 | T .hword (16f - 0b) / 2, (17f - 0b) / 2, (18f - 0b) / 2, (19f - 0b) / 2 |
| 613 | T .hword (20f - 0b) / 2, (21f - 0b) / 2, (22f - 0b) / 2, (23f - 0b) / 2, (24f - 0b) / 2, (25f - 0b) / 2 |
// One group of specialised loops per matrix_noise_shift value (label number
// = shift + 10).
| 614 | 10: switch_on_au_size 0 |
| 615 | 11: switch_on_au_size 1 |
| 616 | 12: switch_on_au_size 2 |
| 617 | 13: switch_on_au_size 3 |
| 618 | 14: switch_on_au_size 4 |
| 619 | 15: switch_on_au_size 5 |
| 620 | 16: switch_on_au_size 6 |
| 621 | 17: switch_on_au_size 7 |
| 622 | 18: switch_on_au_size 8 |
| 623 | 19: switch_on_au_size 9 |
| 624 | 20: switch_on_au_size 10 |
| 625 | 21: switch_on_au_size 11 |
| 626 | 22: switch_on_au_size 12 |
| 627 | 23: switch_on_au_size 13 |
| 628 | 24: switch_on_au_size 14 |
| 629 | 25: switch_on_au_size 15 |
| 630 | |
// Normal exit: discard the three words pushed for NOISE, DCH and MASK, then
// restore the saved registers and return.
| 631 | 98: add sp, sp, #3*4 |
| 632 | pop {v1-fp,pc} |
| 633 | 99: // Can't handle these parameters, drop back to C |
// a1-a4 and the stack arguments have not been modified on this path, so the
// C function is entered with the original arguments and returns directly to
// this function's caller.
| 634 | pop {v1-fp,lr} |
| 635 | b X(ff_mlp_rematrix_channel) |
| 636 | endfunc |
| 637 | |
// Release the register aliases used by the rematrix code.
| 638 | .unreq PSA |
| 639 | .unreq PCO |
| 640 | .unreq PBL |
| 641 | .unreq INDEX |
| 642 | .unreq CO0 |
| 643 | .unreq CO1 |
| 644 | .unreq CO2 |
| 645 | .unreq CO3 |
| 646 | .unreq SA0 |
| 647 | .unreq SA1 |
| 648 | .unreq SA2 |
| 649 | .unreq SA3 |
| 650 | .unreq AC0 |
| 651 | .unreq AC1 |
| 652 | .unreq NOISE |
| 653 | .unreq LSB |
| 654 | .unreq DCH |
| 655 | .unreq MASK |