Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | /* |
2 | * Copyright (c) 2014 RISC OS Open Ltd | |
3 | * Author: Ben Avison <bavison@riscosopen.org> | |
4 | * | |
5 | * This file is part of FFmpeg. | |
6 | * | |
7 | * FFmpeg is free software; you can redistribute it and/or | |
8 | * modify it under the terms of the GNU Lesser General Public | |
9 | * License as published by the Free Software Foundation; either | |
10 | * version 2.1 of the License, or (at your option) any later version. | |
11 | * | |
12 | * FFmpeg is distributed in the hope that it will be useful, | |
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 | * Lesser General Public License for more details. | |
16 | * | |
17 | * You should have received a copy of the GNU Lesser General Public | |
18 | * License along with FFmpeg; if not, write to the Free Software | |
19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
20 | */ | |
21 | ||
22 | #include "libavutil/arm/asm.S" | |
23 | ||
24 | #define MAX_CHANNELS 8 /* per-sample stride: sample pointers advance by MAX_CHANNELS words, bypassed_lsbs by MAX_CHANNELS bytes */ | |
25 | #define MAX_FIR_ORDER 8 /* IIR coefficients start 4 * MAX_FIR_ORDER bytes into the coefficient array */ | |
26 | #define MAX_IIR_ORDER 4 /* not referenced by name below; the iirorder jump table covers 0-4 */ | |
27 | #define MAX_RATEFACTOR 4 /* only used to derive MAX_BLOCKSIZE */ | |
28 | #define MAX_BLOCKSIZE (40 * MAX_RATEFACTOR) /* IIR state sits 4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER) bytes above the FIR state */ | |
29 | ||
30 | PST .req a1 // state pointer (argument 1); pre-decremented by one word per sample | |
31 | PCO .req a2 // coefficient pointer (argument 2) | |
32 | AC0 .req a3 // accumulator, low word; holds firorder (argument 3) on entry | |
33 | AC1 .req a4 // accumulator, high word; holds iirorder (argument 4) on entry | |
34 | CO0 .req v1 // CO0-CO3: filter coefficients | |
35 | CO1 .req v2 | |
36 | CO2 .req v3 | |
37 | CO3 .req v4 | |
38 | ST0 .req v5 // ST0-ST3: filter state words; ST0 receives filter_shift at function entry | |
39 | ST1 .req v6 // receives mask at function entry | |
40 | ST2 .req sl // also holds the incoming sample during post-processing | |
41 | ST3 .req fp // also holds the outgoing sample during post-processing | |
42 | I .req ip // samples remaining (blocksize) | |
43 | PSAMP .req lr // sample_buffer pointer | |
44 | ||
45 | ||
46 | // Some macros that do loads/multiplies where the register number is determined | |
47 | // from an assembly-time expression. Each wrapper evaluates the expression with the altmacro percent operator and hands the resulting number to a worker macro named with a trailing underscore. Boy is GNU assembler's syntax ugly... | |
48 | ||
49 | .macro load group, index, base, offset // load one word from base + offset into register group+index (e.g. CO with 2 gives CO2) | |
50 | .altmacro | |
51 | load_ \group, %(\index), \base, \offset | |
52 | .noaltmacro | |
53 | .endm | |
54 | ||
55 | .macro load_ group, index, base, offset // worker for load: index is already a literal number, so it can be pasted onto the register-name prefix | |
56 | ldr \group\index, [\base, #\offset] | |
57 | .endm | |
58 | ||
59 | .macro loadd group, index, base, offset // load two consecutive words into registers group+index and group+(index+1); index+1 is not wrapped, so index must not be the last register of the group | |
60 | .altmacro | |
61 | loadd_ \group, %(\index), %(\index+1), \base, \offset | |
62 | .noaltmacro | |
63 | .endm | |
64 | ||
65 | .macro loadd_ group, index0, index1, base, offset // worker for loadd; NOTE(review): the A/T line prefixes come from libavutil/arm/asm.S and presumably select ARM-only / Thumb-only lines - confirm there | |
66 | A .if \offset >= 256 // ARM-state ldrd only encodes immediate offsets below 256, so split into two ldr | |
67 | A ldr \group\index0, [\base, #\offset] | |
68 | A ldr \group\index1, [\base, #(\offset) + 4] | |
69 | A .else | |
70 | ldrd \group\index0, \group\index1, [\base, #\offset] | |
71 | A .endif | |
72 | .endm | |
73 | ||
74 | .macro multiply index, accumulate, long // multiply coefficient and state registers number index; accumulate and long are assembly-time flags passed through to the worker | |
75 | .altmacro | |
76 | multiply_ %(\index), \accumulate, \long | |
77 | .noaltmacro | |
78 | .endm | |
79 | ||
80 | .macro multiply_ index, accumulate, long // worker for multiply: index is a literal number here | |
81 | .if \long // 64-bit result kept in AC1:AC0 | |
82 | .if \accumulate | |
83 | smlal AC0, AC1, CO\index, ST\index | |
84 | .else // first product: overwrite the accumulator instead of adding to it | |
85 | smull AC0, AC1, CO\index, ST\index | |
86 | .endif | |
87 | .else // 32-bit result kept in AC0 only | |
88 | .if \accumulate | |
89 | mla AC0, CO\index, ST\index, AC0 | |
90 | .else // first product: overwrite the accumulator instead of adding to it | |
91 | mul AC0, CO\index, ST\index | |
92 | .endif | |
93 | .endif | |
94 | .endm | |
95 | ||
96 | // A macro to update the load register number and load offsets after howmany taps have been loaded. It works purely on the assembly-time variables LOAD_REG, OFFSET_CO, OFFSET_ST, FIR_REMAIN and IIR_REMAIN, which implement_filter initialises, and emits no instructions. | |
97 | ||
98 | .macro inc howmany | |
99 | .set LOAD_REG, (LOAD_REG + \howmany) & 3 // the four registers of a group are used cyclically | |
100 | .set OFFSET_CO, OFFSET_CO + 4 * \howmany | |
101 | .set OFFSET_ST, OFFSET_ST + 4 * \howmany | |
102 | .if FIR_REMAIN > 0 | |
103 | .set FIR_REMAIN, FIR_REMAIN - \howmany | |
104 | .if FIR_REMAIN == 0 // FIR taps exhausted: redirect both offsets to the start of the IIR coefficients and IIR state | |
105 | .set OFFSET_CO, 4 * MAX_FIR_ORDER | |
106 | .set OFFSET_ST, 4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER) | |
107 | .endif | |
108 | .elseif IIR_REMAIN > 0 | |
109 | .set IIR_REMAIN, IIR_REMAIN - \howmany | |
110 | .endif | |
111 | .endm | |
112 | ||
113 | // Macro to implement the inner loop for one specific combination of parameters: mask_minus1 is nonzero when mask is -1 (masking is skipped), shift_0 / shift_8 are nonzero when filter_shift is 0 / 8, and iir_taps / fir_taps are the filter orders. Every expansion finishes with a branch to label 99. | |
114 | ||
115 | .macro implement_filter mask_minus1, shift_0, shift_8, iir_taps, fir_taps | |
116 | .set TOTAL_TAPS, \iir_taps + \fir_taps | |
117 | ||
118 | // Deal with register allocation... On entry ST0 holds the shift and ST1 the mask, but both registers are needed for state words once there are taps. DEFINED_x records that the alias exists (for .unreq at the end), SHUFFLE_x that the value is moved to its alias before the loop, SPILL_x that it is reloaded from the stacked arguments on every iteration. | |
119 | .set DEFINED_SHIFT, 0 | |
120 | .set DEFINED_MASK, 0 | |
121 | .set SHUFFLE_SHIFT, 0 | |
122 | .set SHUFFLE_MASK, 0 | |
123 | .set SPILL_SHIFT, 0 | |
124 | .set SPILL_MASK, 0 | |
125 | .if TOTAL_TAPS == 0 | |
126 | // Little register pressure in this case - just keep MASK where it was | |
127 | .if !\mask_minus1 | |
128 | MASK .req ST1 | |
129 | .set DEFINED_MASK, 1 | |
130 | .endif | |
131 | .else | |
132 | .if \shift_0 | |
133 | .if !\mask_minus1 | |
134 | // AC1 is unused with shift 0 (the multiplies are 32-bit and only write AC0) | |
135 | MASK .req AC1 | |
136 | .set DEFINED_MASK, 1 | |
137 | .set SHUFFLE_MASK, 1 | |
138 | .endif | |
139 | .elseif \shift_8 | |
140 | .if !\mask_minus1 | |
141 | .if TOTAL_TAPS <= 4 | |
142 | // All coefficients are preloaded (so pointer not needed) | |
143 | MASK .req PCO | |
144 | .set DEFINED_MASK, 1 | |
145 | .set SHUFFLE_MASK, 1 | |
146 | .else | |
147 | .set SPILL_MASK, 1 | |
148 | .endif | |
149 | .endif | |
150 | .else // shift not 0 or 8 | |
151 | .if TOTAL_TAPS <= 3 | |
152 | // All coefficients are preloaded, and at least one CO register is unused | |
153 | .if \fir_taps & 1 // odd fir_taps: preloading starts at CO1, so CO0 stays free; otherwise it starts at CO0 and CO3 stays free | |
154 | SHIFT .req CO0 | |
155 | .set DEFINED_SHIFT, 1 | |
156 | .set SHUFFLE_SHIFT, 1 | |
157 | .else | |
158 | SHIFT .req CO3 | |
159 | .set DEFINED_SHIFT, 1 | |
160 | .set SHUFFLE_SHIFT, 1 | |
161 | .endif | |
162 | .if !\mask_minus1 | |
163 | MASK .req PCO | |
164 | .set DEFINED_MASK, 1 | |
165 | .set SHUFFLE_MASK, 1 | |
166 | .endif | |
167 | .elseif TOTAL_TAPS == 4 | |
168 | // All coefficients are preloaded | |
169 | SHIFT .req PCO | |
170 | .set DEFINED_SHIFT, 1 | |
171 | .set SHUFFLE_SHIFT, 1 | |
172 | .if !\mask_minus1 | |
173 | .set SPILL_MASK, 1 | |
174 | .endif | |
175 | .else | |
176 | .set SPILL_SHIFT, 1 | |
177 | .if !\mask_minus1 | |
178 | .set SPILL_MASK, 1 | |
179 | .endif | |
180 | .endif | |
181 | .endif | |
182 | .endif | |
183 | .if SPILL_SHIFT // spilled values are reloaded into ST0 / ST1 after the multiplies have consumed the state words | |
184 | SHIFT .req ST0 | |
185 | .set DEFINED_SHIFT, 1 | |
186 | .endif | |
187 | .if SPILL_MASK | |
188 | MASK .req ST1 | |
189 | .set DEFINED_MASK, 1 | |
190 | .endif | |
191 | ||
192 | // Preload coefficients if possible (up to four taps fit in CO0-CO3, FIR coefficients first, then IIR) | |
193 | .if TOTAL_TAPS <= 4 | |
194 | .set OFFSET_CO, 0 | |
195 | .if \fir_taps & 1 | |
196 | .set LOAD_REG, 1 | |
197 | .else | |
198 | .set LOAD_REG, 0 | |
199 | .endif | |
200 | .rept \fir_taps | |
201 | load CO, LOAD_REG, PCO, OFFSET_CO | |
202 | .set LOAD_REG, (LOAD_REG + 1) & 3 | |
203 | .set OFFSET_CO, OFFSET_CO + 4 | |
204 | .endr | |
205 | .set OFFSET_CO, 4 * MAX_FIR_ORDER | |
206 | .rept \iir_taps | |
207 | load CO, LOAD_REG, PCO, OFFSET_CO | |
208 | .set LOAD_REG, (LOAD_REG + 1) & 3 | |
209 | .set OFFSET_CO, OFFSET_CO + 4 | |
210 | .endr | |
211 | .endif | |
212 | ||
213 | // Move mask/shift to final positions if necessary | |
214 | // Need to do this after preloading, because in some cases we | |
215 | // reuse the coefficient pointer register | |
216 | .if SHUFFLE_SHIFT | |
217 | mov SHIFT, ST0 | |
218 | .endif | |
219 | .if SHUFFLE_MASK | |
220 | mov MASK, ST1 | |
221 | .endif | |
222 | ||
223 | // Begin loop (one iteration per sample, I counts down to zero) | |
224 | 01: | |
225 | .if TOTAL_TAPS == 0 | |
226 | // Things simplify a lot in this case | |
227 | // In fact this could be pipelined further if it's worth it... | |
228 | ldr ST0, [PSAMP] | |
229 | subs I, I, #1 // flags survive to the bne below: nothing in between sets them | |
230 | .if !\mask_minus1 | |
231 | and ST0, ST0, MASK | |
232 | .endif | |
233 | str ST0, [PST, #-4]! // new FIR state word; PST moves down one word | |
234 | str ST0, [PST, #4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)] // new IIR state word | |
235 | str ST0, [PSAMP], #4 * MAX_CHANNELS // write the sample back and step to the next one | |
236 | bne 01b | |
237 | .else | |
238 | .if \fir_taps & 1 | |
239 | .set LOAD_REG, 1 | |
240 | .else | |
241 | .set LOAD_REG, 0 | |
242 | .endif | |
243 | .set LOAD_BANK, 0 | |
244 | .set FIR_REMAIN, \fir_taps | |
245 | .set IIR_REMAIN, \iir_taps | |
246 | .if FIR_REMAIN == 0 // only IIR terms | |
247 | .set OFFSET_CO, 4 * MAX_FIR_ORDER | |
248 | .set OFFSET_ST, 4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER) | |
249 | .else | |
250 | .set OFFSET_CO, 0 | |
251 | .set OFFSET_ST, 0 | |
252 | .endif | |
253 | .set MUL_REG, LOAD_REG | |
254 | .set COUNTER, 0 | |
255 | .rept TOTAL_TAPS + 2 // multiplies start two passes after the loads, so two extra passes are needed to emit them all | |
256 | // Do load(s) | |
257 | .if FIR_REMAIN != 0 || IIR_REMAIN != 0 | |
258 | .if COUNTER == 0 // first tap is always loaded singly | |
259 | .if TOTAL_TAPS > 4 | |
260 | load CO, LOAD_REG, PCO, OFFSET_CO | |
261 | .endif | |
262 | load ST, LOAD_REG, PST, OFFSET_ST | |
263 | inc 1 | |
264 | .elseif COUNTER == 1 && (\fir_taps & 1) == 0 // even fir_taps: a second single load brings LOAD_REG to an even register for the pair loads | |
265 | .if TOTAL_TAPS > 4 | |
266 | load CO, LOAD_REG, PCO, OFFSET_CO | |
267 | .endif | |
268 | load ST, LOAD_REG, PST, OFFSET_ST | |
269 | inc 1 | |
270 | .elseif LOAD_BANK == 0 // this pass: coefficient half of the next pair (nothing to do when preloaded) | |
271 | .if TOTAL_TAPS > 4 | |
272 | .if FIR_REMAIN == 0 && IIR_REMAIN == 1 | |
273 | load CO, LOAD_REG, PCO, OFFSET_CO | |
274 | .else | |
275 | loadd CO, LOAD_REG, PCO, OFFSET_CO | |
276 | .endif | |
277 | .endif | |
278 | .set LOAD_BANK, 1 | |
279 | .else // this pass: state half of the pair, then advance the bookkeeping | |
280 | .if FIR_REMAIN == 0 && IIR_REMAIN == 1 | |
281 | load ST, LOAD_REG, PST, OFFSET_ST | |
282 | inc 1 | |
283 | .else | |
284 | loadd ST, LOAD_REG, PST, OFFSET_ST | |
285 | inc 2 | |
286 | .endif | |
287 | .set LOAD_BANK, 0 | |
288 | .endif | |
289 | .endif | |
290 | ||
291 | // Do interleaved multiplies, slightly delayed | |
292 | .if COUNTER >= 2 // the first multiply (COUNTER == 2) initialises the accumulator, later ones accumulate; 64-bit unless shift is 0 | |
293 | multiply MUL_REG, COUNTER > 2, !\shift_0 | |
294 | .set MUL_REG, (MUL_REG + 1) & 3 | |
295 | .endif | |
296 | .set COUNTER, COUNTER + 1 | |
297 | .endr | |
298 | ||
299 | // Post-process the result of the multiplies | |
300 | .if SPILL_SHIFT | |
301 | ldr SHIFT, [sp, #9*4 + 0*4] | |
302 | .endif | |
303 | .if SPILL_MASK | |
304 | ldr MASK, [sp, #9*4 + 1*4] | |
305 | .endif | |
306 | ldr ST2, [PSAMP] | |
307 | subs I, I, #1 // flags survive to the bne below: nothing in between sets them | |
308 | .if \shift_8 // AC0 = low word of the 64-bit accumulator shifted right by 8 | |
309 | mov AC0, AC0, lsr #8 | |
310 | orr AC0, AC0, AC1, lsl #24 | |
311 | .elseif !\shift_0 // AC0 = low word of the 64-bit accumulator shifted right by SHIFT | |
312 | rsb ST3, SHIFT, #32 | |
313 | mov AC0, AC0, lsr SHIFT | |
314 | A orr AC0, AC0, AC1, lsl ST3 | |
315 | T mov AC1, AC1, lsl ST3 | |
316 | T orr AC0, AC0, AC1 | |
317 | .endif | |
318 | .if \mask_minus1 // no masking: result = sample + accumulator, and the unmodified sample is what goes to the IIR state | |
319 | add ST3, ST2, AC0 | |
320 | .else // result = (sample + accumulator) & MASK, and result - accumulator goes to the IIR state | |
321 | add ST2, ST2, AC0 | |
322 | and ST3, ST2, MASK | |
323 | sub ST2, ST3, AC0 | |
324 | .endif | |
325 | str ST3, [PST, #-4]! // new FIR state word; PST moves down one word | |
326 | str ST2, [PST, #4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)] // new IIR state word | |
327 | str ST3, [PSAMP], #4 * MAX_CHANNELS // write the result back and step to the next sample | |
328 | bne 01b | |
329 | .endif | |
330 | b 99f // label 99 is supplied by the code that expands this macro | |
331 | ||
332 | .if DEFINED_SHIFT | |
333 | .unreq SHIFT | |
334 | .endif | |
335 | .if DEFINED_MASK | |
336 | .unreq MASK | |
337 | .endif | |
338 | .endm | |
339 | ||
340 | .macro switch_on_fir_taps mask_minus1, shift_0, shift_8, iir_taps // jump table on firorder: one implement_filter expansion per fir_taps value for which iir_taps + fir_taps does not exceed 8 | |
341 | A ldr pc, [pc, a3, lsl #2] // a3 = firorder, which is in range 0-(8-iir_taps) | |
342 | T tbh [pc, a3, lsl #1] | |
343 | 0: | |
344 | A .word 0, 70f, 71f, 72f, 73f, 74f // leading 0 is padding: in ARM state pc reads two words past the ldr, i.e. one word past label 0 | |
345 | T .hword (70f - 0b) / 2, (71f - 0b) / 2, (72f - 0b) / 2, (73f - 0b) / 2, (74f - 0b) / 2 | |
346 | .if \iir_taps <= 3 | |
347 | A .word 75f | |
348 | T .hword (75f - 0b) / 2 | |
349 | .if \iir_taps <= 2 | |
350 | A .word 76f | |
351 | T .hword (76f - 0b) / 2 | |
352 | .if \iir_taps <= 1 | |
353 | A .word 77f | |
354 | T .hword (77f - 0b) / 2 | |
355 | .if \iir_taps == 0 | |
356 | A .word 78f | |
357 | T .hword (78f - 0b) / 2 | |
358 | .endif | |
359 | .endif | |
360 | .endif | |
361 | .endif | |
362 | 70: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 0 | |
363 | 71: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 1 | |
364 | 72: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 2 | |
365 | 73: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 3 | |
366 | 74: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 4 | |
367 | .if \iir_taps <= 3 | |
368 | 75: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 5 | |
369 | .if \iir_taps <= 2 | |
370 | 76: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 6 | |
371 | .if \iir_taps <= 1 | |
372 | 77: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 7 | |
373 | .if \iir_taps == 0 | |
374 | 78: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 8 | |
375 | .endif | |
376 | .endif | |
377 | .endif | |
378 | .endif | |
379 | .endm | |
380 | ||
381 | .macro switch_on_iir_taps mask_minus1, shift_0, shift_8 // jump table on iirorder; each target then dispatches on firorder | |
382 | A ldr pc, [pc, a4, lsl #2] // a4 = iirorder, which is in range 0-4 | |
383 | T tbh [pc, a4, lsl #1] | |
384 | 0: | |
385 | A .word 0, 60f, 61f, 62f, 63f, 64f // leading 0 is padding: in ARM state pc reads two words past the ldr, i.e. one word past label 0 | |
386 | T .hword (60f - 0b) / 2, (61f - 0b) / 2, (62f - 0b) / 2, (63f - 0b) / 2, (64f - 0b) / 2 | |
387 | 60: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 0 | |
388 | 61: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 1 | |
389 | 62: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 2 | |
390 | 63: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 3 | |
391 | 64: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 4 | |
392 | .endm | |
393 | ||
394 | /* void ff_mlp_filter_channel_arm(int32_t *state, const int32_t *coeff, | |
395 | * int firorder, int iirorder, | |
396 | * unsigned int filter_shift, int32_t mask, | |
397 | * int blocksize, int32_t *sample_buffer); | |
398 | * Dispatches first on whether mask is -1, then on filter_shift (0, 8 or other), then on iirorder and firorder, to a loop specialised for that combination; every loop returns through label 99. */ | |
399 | function ff_mlp_filter_channel_arm, export=1 | |
400 | push {v1-fp,lr} | |
401 | add v1, sp, #9*4 // point at arguments on stack (nine registers were just pushed) | |
402 | ldm v1, {ST0,ST1,I,PSAMP} // filter_shift, mask, blocksize, sample_buffer | |
403 | cmp ST1, #-1 // a mask of -1 lets the loop skip the masking step | |
404 | bne 30f | |
405 | movs ST2, ST0, lsl #29 // shift is in range 0-15; we want to special-case 0 and 8 | |
406 | bne 20f // low three bits of shift are nonzero: generic shift | |
407 | bcs 10f // carry holds bit 3 of shift: shift is 8 | |
408 | switch_on_iir_taps 1, 1, 0 | |
409 | 10: switch_on_iir_taps 1, 0, 1 | |
410 | 20: switch_on_iir_taps 1, 0, 0 | |
411 | 30: movs ST2, ST0, lsl #29 // shift is in range 0-15; we want to special-case 0 and 8 | |
412 | bne 50f // low three bits of shift are nonzero: generic shift | |
413 | bcs 40f // carry holds bit 3 of shift: shift is 8 | |
414 | switch_on_iir_taps 0, 1, 0 | |
415 | 40: switch_on_iir_taps 0, 0, 1 | |
416 | 50: switch_on_iir_taps 0, 0, 0 | |
417 | 99: pop {v1-fp,pc} | |
418 | endfunc | |
419 | ||
420 | .unreq PST // release the filter aliases; several of these names are bound again, some to different registers, for the rematrix code that follows | |
421 | .unreq PCO | |
422 | .unreq AC0 | |
423 | .unreq AC1 | |
424 | .unreq CO0 | |
425 | .unreq CO1 | |
426 | .unreq CO2 | |
427 | .unreq CO3 | |
428 | .unreq ST0 | |
429 | .unreq ST1 | |
430 | .unreq ST2 | |
431 | .unreq ST3 | |
432 | .unreq I | |
433 | .unreq PSAMP | |
434 | ||
435 | /********************************************************************/ | |
436 | ||
437 | PSA .req a1 // samples | |
438 | PCO .req a2 // coeffs | |
439 | PBL .req a3 // bypassed_lsbs | |
440 | INDEX .req a4 // packed loop state (layout below); a4 carries noise_buffer on entry and is saved to the stack before being reused | |
441 | CO0 .req v1 // CO0-CO3: matrix coefficients | |
442 | CO1 .req v2 | |
443 | CO2 .req v3 | |
444 | CO3 .req v4 | |
445 | SA0 .req v5 // SA0-SA3: input samples | |
446 | SA1 .req v6 | |
447 | SA2 .req sl | |
448 | SA3 .req fp | |
449 | AC0 .req ip // accumulator, low word | |
450 | AC1 .req lr // accumulator, high word | |
451 | NOISE .req SA0 // noise_buffer pointer, then the noise byte loaded from it | |
452 | LSB .req SA1 // byte loaded from bypassed_lsbs | |
453 | DCH .req SA2 // dest_ch | |
454 | MASK .req SA3 // NOISE, DCH and MASK are reloaded from the stack each iteration because they share registers with the samples | |
455 | ||
456 | // INDEX is used as follows: | |
457 | // bits 0..6 index2 (values up to 17, but wider so that we can | |
458 | // add to index field without needing to mask) | |
459 | // bits 7..14 i (values up to 160) | |
460 | // bit 15 underflow detect for i | |
461 | // bits 25..31 (if access_unit_size_pow2 == 128) \ index | |
462 | // bits 26..31 (if access_unit_size_pow2 == 64) / | |
463 | ||
464 | .macro implement_rematrix shift, index_mask, mask_minus1, maxchan // loop for one parameter combination: shift is matrix_noise_shift, index_mask is 63 or 127 (ignored when shift is 0), mask_minus1 is nonzero when mask is -1, maxchan is 1, 5 or 7; leaves through label 98 | |
465 | .if \maxchan == 1 | |
466 | // We can just leave the coefficients in registers in this case | |
467 | ldrd CO0, CO1, [PCO] | |
468 | .endif | |
469 | 1: | |
470 | .if \maxchan == 1 // each variant sums maxchan + 1 products; the last one (CO1 * SA1) is issued after the ldm below | |
471 | ldrd SA0, SA1, [PSA] | |
472 | smull AC0, AC1, CO0, SA0 | |
473 | .elseif \maxchan == 5 | |
474 | ldr CO0, [PCO, #0] | |
475 | ldr SA0, [PSA, #0] | |
476 | ldr CO1, [PCO, #4] | |
477 | ldr SA1, [PSA, #4] | |
478 | ldrd CO2, CO3, [PCO, #8] | |
479 | smull AC0, AC1, CO0, SA0 | |
480 | ldrd SA2, SA3, [PSA, #8] | |
481 | smlal AC0, AC1, CO1, SA1 | |
482 | ldrd CO0, CO1, [PCO, #16] | |
483 | smlal AC0, AC1, CO2, SA2 | |
484 | ldrd SA0, SA1, [PSA, #16] | |
485 | smlal AC0, AC1, CO3, SA3 | |
486 | smlal AC0, AC1, CO0, SA0 | |
487 | .else // \maxchan == 7 | |
488 | ldr CO2, [PCO, #0] | |
489 | ldr SA2, [PSA, #0] | |
490 | ldr CO3, [PCO, #4] | |
491 | ldr SA3, [PSA, #4] | |
492 | ldrd CO0, CO1, [PCO, #8] | |
493 | smull AC0, AC1, CO2, SA2 | |
494 | ldrd SA0, SA1, [PSA, #8] | |
495 | smlal AC0, AC1, CO3, SA3 | |
496 | ldrd CO2, CO3, [PCO, #16] | |
497 | smlal AC0, AC1, CO0, SA0 | |
498 | ldrd SA2, SA3, [PSA, #16] | |
499 | smlal AC0, AC1, CO1, SA1 | |
500 | ldrd CO0, CO1, [PCO, #24] | |
501 | smlal AC0, AC1, CO2, SA2 | |
502 | ldrd SA0, SA1, [PSA, #24] | |
503 | smlal AC0, AC1, CO3, SA3 | |
504 | smlal AC0, AC1, CO0, SA0 | |
505 | .endif | |
506 | ldm sp, {NOISE, DCH, MASK} // overwrites SA0, SA2 and SA3; SA1 is still needed by the next instruction | |
507 | smlal AC0, AC1, CO1, SA1 | |
508 | .if \shift != 0 | |
509 | .if \index_mask == 63 | |
510 | add NOISE, NOISE, INDEX, lsr #32-6 // address of noise_buffer[index], index being the top 6 bits of INDEX | |
511 | ldrb LSB, [PBL], #MAX_CHANNELS | |
512 | ldrsb NOISE, [NOISE] | |
513 | add INDEX, INDEX, INDEX, lsl #32-6 // index += index2; overflow out of bit 31 gives the wrap-around for free | |
514 | .else // \index_mask == 127 | |
515 | add NOISE, NOISE, INDEX, lsr #32-7 // address of noise_buffer[index], index being the top 7 bits of INDEX | |
516 | ldrb LSB, [PBL], #MAX_CHANNELS | |
517 | ldrsb NOISE, [NOISE] | |
518 | add INDEX, INDEX, INDEX, lsl #32-7 // index += index2; overflow out of bit 31 gives the wrap-around for free | |
519 | .endif | |
520 | sub INDEX, INDEX, #1<<7 // decrement the i field | |
521 | adds AC0, AC0, NOISE, lsl #\shift + 7 // 64-bit add of the sign-extended noise byte shifted left by shift + 7 | |
522 | adc AC1, AC1, NOISE, asr #31 | |
523 | .else | |
524 | ldrb LSB, [PBL], #MAX_CHANNELS | |
525 | sub INDEX, INDEX, #1<<7 // decrement the i field | |
526 | .endif | |
527 | add PSA, PSA, #MAX_CHANNELS*4 | |
528 | mov AC0, AC0, lsr #14 // AC0 = low word of the 64-bit accumulator shifted right by 14 | |
529 | orr AC0, AC0, AC1, lsl #18 | |
530 | .if !\mask_minus1 | |
531 | and AC0, AC0, MASK | |
532 | .endif | |
533 | add AC0, AC0, LSB | |
534 | tst INDEX, #1<<15 // bit 15 becomes set once the i field has borrowed, which ends the loop | |
535 | str AC0, [PSA, DCH, lsl #2] // DCH is precompensated for the early increment of PSA | |
536 | beq 1b | |
537 | b 98f // label 98 is supplied by the code that expands this macro | |
538 | .endm | |
539 | ||
540 | .macro switch_on_maxchan shift, index_mask, mask_minus1 // three-way branch on v4 to the loop for 7, 5 or 1; the values are compared only against 5 here | |
541 | cmp v4, #5 | |
542 | blo 51f // below 5: use the variant for 1 | |
543 | beq 50f // exactly 5 | |
544 | implement_rematrix \shift, \index_mask, \mask_minus1, 7 | |
545 | 50: implement_rematrix \shift, \index_mask, \mask_minus1, 5 | |
546 | 51: implement_rematrix \shift, \index_mask, \mask_minus1, 1 | |
547 | .endm | |
548 | ||
549 | .macro switch_on_mask shift, index_mask // branch on whether sl is -1, selecting the loop variants with or without the masking step | |
550 | cmp sl, #-1 | |
551 | bne 40f | |
552 | switch_on_maxchan \shift, \index_mask, 1 | |
553 | 40: switch_on_maxchan \shift, \index_mask, 0 | |
554 | .endm | |
555 | ||
556 | .macro switch_on_au_size shift // for nonzero shift, place v1 in the top 6 or 7 bits of INDEX depending on whether v6 is 64, then continue dispatching | |
557 | .if \shift == 0 // the loop adds no noise when shift is 0, so the index field is unused and index_mask is a dummy | |
558 | switch_on_mask \shift, undefined | |
559 | .else | |
560 | teq v6, #64 | |
561 | bne 30f | |
562 | orr INDEX, INDEX, v1, lsl #32-6 | |
563 | switch_on_mask \shift, 63 | |
564 | 30: orr INDEX, INDEX, v1, lsl #32-7 | |
565 | switch_on_mask \shift, 127 | |
566 | .endif | |
567 | .endm | |
568 | ||
569 | /* void ff_mlp_rematrix_channel_arm(int32_t *samples, | |
570 | * const int32_t *coeffs, | |
571 | * const uint8_t *bypassed_lsbs, | |
572 | * const int8_t *noise_buffer, | |
573 | * int index, | |
574 | * unsigned int dest_ch, | |
575 | * uint16_t blockpos, | |
576 | * unsigned int maxchan, | |
577 | * int matrix_noise_shift, | |
578 | * int access_unit_size_pow2, | |
579 | * int32_t mask); | |
580 | * Handles maxchan of 1, 5 or 7 with access_unit_size_pow2 of 64 or 128; any other combination is passed on unchanged to ff_mlp_rematrix_channel. */ | |
581 | function ff_mlp_rematrix_channel_arm, export=1 | |
582 | push {v1-fp,lr} | |
583 | add v1, sp, #9*4 // point at arguments on stack (nine registers were just pushed) | |
584 | ldm v1, {v1-sl} // v1=index, v2=dest_ch, v3=blockpos, v4=maxchan, v5=matrix_noise_shift, v6=access_unit_size_pow2, sl=mask | |
585 | teq v4, #1 | |
586 | itt ne | |
587 | teqne v4, #5 | |
588 | teqne v4, #7 | |
589 | bne 99f // maxchan is none of 1, 5, 7 | |
590 | teq v6, #64 | |
591 | it ne | |
592 | teqne v6, #128 | |
593 | bne 99f // access_unit_size_pow2 is neither 64 nor 128 | |
594 | sub v2, v2, #MAX_CHANNELS // the loop advances PSA by MAX_CHANNELS words before storing, so bias dest_ch to match | |
595 | push {a4,v2,sl} // initialise NOISE,DCH,MASK; make sp dword-aligned | |
596 | movs INDEX, v3, lsl #7 // i field = blockpos | |
597 | beq 98f // just in case, do nothing if blockpos = 0 | |
598 | subs INDEX, INDEX, #1<<7 // offset by 1 so we borrow at the right time | |
599 | adc lr, v1, v1 // calculate index2 (C was set by preceding subs), giving 2 * index + 1 | |
600 | orr INDEX, INDEX, lr | |
601 | // Switch on matrix_noise_shift: values 0 and 1 are | |
602 | // disproportionately common so do those in a form the branch | |
603 | // predictor can accelerate. Values can only go up to 15. | |
604 | cmp v5, #1 | |
605 | beq 11f | |
606 | blo 10f | |
607 | A ldr pc, [pc, v5, lsl #2] | |
608 | T tbh [pc, v5, lsl #1] | |
609 | 0: | |
610 | A .word 0, 0, 0, 12f, 13f, 14f, 15f, 16f, 17f, 18f, 19f, 20f, 21f, 22f, 23f, 24f, 25f // one padding word, then slots for 0 and 1 that are never used | |
611 | T .hword 0, 0, (12f - 0b) / 2, (13f - 0b) / 2, (14f - 0b) / 2, (15f - 0b) / 2 | |
612 | T .hword (16f - 0b) / 2, (17f - 0b) / 2, (18f - 0b) / 2, (19f - 0b) / 2 | |
613 | T .hword (20f - 0b) / 2, (21f - 0b) / 2, (22f - 0b) / 2, (23f - 0b) / 2, (24f - 0b) / 2, (25f - 0b) / 2 | |
614 | 10: switch_on_au_size 0 | |
615 | 11: switch_on_au_size 1 | |
616 | 12: switch_on_au_size 2 | |
617 | 13: switch_on_au_size 3 | |
618 | 14: switch_on_au_size 4 | |
619 | 15: switch_on_au_size 5 | |
620 | 16: switch_on_au_size 6 | |
621 | 17: switch_on_au_size 7 | |
622 | 18: switch_on_au_size 8 | |
623 | 19: switch_on_au_size 9 | |
624 | 20: switch_on_au_size 10 | |
625 | 21: switch_on_au_size 11 | |
626 | 22: switch_on_au_size 12 | |
627 | 23: switch_on_au_size 13 | |
628 | 24: switch_on_au_size 14 | |
629 | 25: switch_on_au_size 15 | |
630 | ||
631 | 98: add sp, sp, #3*4 // drop the three words holding NOISE, DCH and MASK | |
632 | pop {v1-fp,pc} | |
633 | 99: // Can't handle these parameters, drop back to C | |
634 | pop {v1-fp,lr} // a1-a4 and the stacked arguments are untouched on this path | |
635 | b X(ff_mlp_rematrix_channel) // tail call; NOTE(review): X() is defined outside this file, presumably for symbol name decoration - confirm in libavutil/arm/asm.S | |
636 | endfunc | |
637 | ||
638 | .unreq PSA // release every alias defined for the rematrix code | |
639 | .unreq PCO | |
640 | .unreq PBL | |
641 | .unreq INDEX | |
642 | .unreq CO0 | |
643 | .unreq CO1 | |
644 | .unreq CO2 | |
645 | .unreq CO3 | |
646 | .unreq SA0 | |
647 | .unreq SA1 | |
648 | .unreq SA2 | |
649 | .unreq SA3 | |
650 | .unreq AC0 | |
651 | .unreq AC1 | |
652 | .unreq NOISE | |
653 | .unreq LSB | |
654 | .unreq DCH | |
655 | .unreq MASK | |