/*
 * Copyright (c) 2013 RISC OS Open Ltd
 * Author: Ben Avison <bavison@riscosopen.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"
| 23 | |
| 24 | IMDCT .req r0 |
| 25 | ORIG_P_SB .req r1 |
| 26 | P_SB_OFF .req r2 |
| 27 | I .req r0 |
| 28 | P_SB2_UP .req r1 |
| 29 | OLDFPSCR .req r2 |
| 30 | P_SB2_DN .req r3 |
| 31 | P_WIN_DN .req r4 |
| 32 | P_OUT_DN .req r5 |
| 33 | P_SB .req r6 |
| 34 | J_WRAP .req r7 |
| 35 | P_WIN_UP .req r12 |
| 36 | P_OUT_UP .req r14 |
| 37 | |
| 38 | SCALE .req s0 |
| 39 | SBUF_DAT_REV0 .req s4 |
| 40 | SBUF_DAT_REV1 .req s5 |
| 41 | SBUF_DAT_REV2 .req s6 |
| 42 | SBUF_DAT_REV3 .req s7 |
| 43 | VA0 .req s8 |
| 44 | VA3 .req s11 |
| 45 | VB0 .req s12 |
| 46 | VB3 .req s15 |
| 47 | VC0 .req s8 |
| 48 | VC3 .req s11 |
| 49 | VD0 .req s12 |
| 50 | VD3 .req s15 |
| 51 | SBUF_DAT0 .req s16 |
| 52 | SBUF_DAT1 .req s17 |
| 53 | SBUF_DAT2 .req s18 |
| 54 | SBUF_DAT3 .req s19 |
| 55 | SBUF_DAT_ALT0 .req s20 |
| 56 | SBUF_DAT_ALT1 .req s21 |
| 57 | SBUF_DAT_ALT2 .req s22 |
| 58 | SBUF_DAT_ALT3 .req s23 |
| 59 | WIN_DN_DAT0 .req s24 |
| 60 | WIN_UP_DAT0 .req s28 |
| 61 | |
| 62 | |
| 63 | .macro inner_loop half, tail, head |
| 64 | .if (OFFSET & (64*4)) == 0 @ even numbered call |
| 65 | SBUF_DAT_THIS0 .req SBUF_DAT0 |
| 66 | SBUF_DAT_THIS1 .req SBUF_DAT1 |
| 67 | SBUF_DAT_THIS2 .req SBUF_DAT2 |
| 68 | SBUF_DAT_THIS3 .req SBUF_DAT3 |
| 69 | .ifnc "\head","" |
| 70 | vldr d8, [P_SB, #OFFSET] @ d8 = SBUF_DAT |
| 71 | vldr d9, [P_SB, #OFFSET+8] |
| 72 | .endif |
| 73 | .else |
| 74 | SBUF_DAT_THIS0 .req SBUF_DAT_ALT0 |
| 75 | SBUF_DAT_THIS1 .req SBUF_DAT_ALT1 |
| 76 | SBUF_DAT_THIS2 .req SBUF_DAT_ALT2 |
| 77 | SBUF_DAT_THIS3 .req SBUF_DAT_ALT3 |
| 78 | .ifnc "\head","" |
| 79 | vldr d10, [P_SB, #OFFSET] @ d10 = SBUF_DAT_ALT |
| 80 | vldr d11, [P_SB, #OFFSET+8] |
| 81 | .endif |
| 82 | .endif |
| 83 | .ifnc "\tail","" |
| 84 | .ifc "\half","ab" |
| 85 | vmls.f VA0, SBUF_DAT_REV0, WIN_DN_DAT0 @ all operands treated as vectors |
| 86 | .else |
| 87 | vmla.f VD0, SBUF_DAT_REV0, WIN_DN_DAT0 @ all operands treated as vectors |
| 88 | .endif |
| 89 | .endif |
| 90 | .ifnc "\head","" |
| 91 | vldr d14, [P_WIN_UP, #OFFSET] @ d14 = WIN_UP_DAT |
| 92 | vldr d15, [P_WIN_UP, #OFFSET+8] |
| 93 | vldr d12, [P_WIN_DN, #OFFSET] @ d12 = WIN_DN_DAT |
| 94 | vldr d13, [P_WIN_DN, #OFFSET+8] |
| 95 | vmov SBUF_DAT_REV3, SBUF_DAT_THIS0 |
| 96 | vmov SBUF_DAT_REV2, SBUF_DAT_THIS1 |
| 97 | vmov SBUF_DAT_REV1, SBUF_DAT_THIS2 |
| 98 | vmov SBUF_DAT_REV0, SBUF_DAT_THIS3 |
| 99 | .ifc "\half","ab" |
| 100 | vmla.f VB0, SBUF_DAT_THIS0, WIN_UP_DAT0 |
| 101 | .else |
| 102 | vmla.f VC0, SBUF_DAT_THIS0, WIN_UP_DAT0 |
| 103 | .endif |
| 104 | teq J_WRAP, #J |
| 105 | bne 2f @ strongly predictable, so better than cond exec in this case |
| 106 | sub P_SB, P_SB, #512*4 |
| 107 | 2: |
| 108 | .set J, J - 64 |
| 109 | .set OFFSET, OFFSET + 64*4 |
| 110 | .endif |
| 111 | .unreq SBUF_DAT_THIS0 |
| 112 | .unreq SBUF_DAT_THIS1 |
| 113 | .unreq SBUF_DAT_THIS2 |
| 114 | .unreq SBUF_DAT_THIS3 |
| 115 | .endm |
| 116 | |
| 117 | |
| 118 | /* void ff_synth_filter_float_vfp(FFTContext *imdct, |
| 119 | * float *synth_buf_ptr, int *synth_buf_offset, |
| 120 | * float synth_buf2[32], const float window[512], |
| 121 | * float out[32], const float in[32], float scale) |
| 122 | */ |
| 123 | function ff_synth_filter_float_vfp, export=1 |
| 124 | push {r3-r7,lr} |
| 125 | vpush {s16-s31} |
| 126 | ldr lr, [P_SB_OFF] |
| 127 | add a2, ORIG_P_SB, lr, lsl #2 @ calculate synth_buf to pass to imdct_half |
| 128 | mov P_SB, a2 @ and keep a copy for ourselves |
| 129 | bic J_WRAP, lr, #63 @ mangled to make testing for wrap easier in inner loop |
| 130 | sub lr, lr, #32 |
| 131 | and lr, lr, #512-32 |
| 132 | str lr, [P_SB_OFF] @ rotate offset, modulo buffer size, ready for next call |
| 133 | ldr a3, [sp, #(16+6+2)*4] @ fetch in from stack, to pass to imdct_half |
| 134 | VFP vmov s16, SCALE @ imdct_half is free to corrupt s0, but it contains one of our arguments in hardfp case |
| 135 | bl X(ff_imdct_half_vfp) |
| 136 | VFP vmov SCALE, s16 |
| 137 | |
| 138 | fmrx OLDFPSCR, FPSCR |
| 139 | ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1 |
| 140 | fmxr FPSCR, lr |
| 141 | ldr P_SB2_DN, [sp, #16*4] |
| 142 | ldr P_WIN_DN, [sp, #(16+6+0)*4] |
| 143 | ldr P_OUT_DN, [sp, #(16+6+1)*4] |
| 144 | NOVFP vldr SCALE, [sp, #(16+6+3)*4] |
| 145 | |
| 146 | #define IMM_OFF_SKEW 956 /* also valid immediate constant when you add 16*4 */ |
| 147 | add P_SB, P_SB, #IMM_OFF_SKEW @ so we can use -ve offsets to use full immediate offset range |
| 148 | add P_SB2_UP, P_SB2_DN, #16*4 |
| 149 | add P_WIN_UP, P_WIN_DN, #16*4+IMM_OFF_SKEW |
| 150 | add P_OUT_UP, P_OUT_DN, #16*4 |
| 151 | add P_SB2_DN, P_SB2_DN, #16*4 |
| 152 | add P_WIN_DN, P_WIN_DN, #12*4+IMM_OFF_SKEW |
| 153 | add P_OUT_DN, P_OUT_DN, #16*4 |
| 154 | mov I, #4 |
| 155 | 1: |
| 156 | vldmia P_SB2_UP!, {VB0-VB3} |
| 157 | vldmdb P_SB2_DN!, {VA0-VA3} |
| 158 | .set J, 512 - 64 |
| 159 | .set OFFSET, -IMM_OFF_SKEW |
| 160 | inner_loop ab,, head |
| 161 | .rept 7 |
| 162 | inner_loop ab, tail, head |
| 163 | .endr |
| 164 | inner_loop ab, tail |
| 165 | add P_WIN_UP, P_WIN_UP, #4*4 |
| 166 | sub P_WIN_DN, P_WIN_DN, #4*4 |
| 167 | vmul.f VB0, VB0, SCALE @ SCALE treated as scalar |
| 168 | add P_SB, P_SB, #(512+4)*4 |
| 169 | subs I, I, #1 |
| 170 | vmul.f VA0, VA0, SCALE |
| 171 | vstmia P_OUT_UP!, {VB0-VB3} |
| 172 | vstmdb P_OUT_DN!, {VA0-VA3} |
| 173 | bne 1b |
| 174 | |
| 175 | add P_SB2_DN, P_SB2_DN, #(16+28-12)*4 |
| 176 | sub P_SB2_UP, P_SB2_UP, #(16+16)*4 |
| 177 | add P_WIN_DN, P_WIN_DN, #(32+16+28-12)*4 |
| 178 | mov I, #4 |
| 179 | 1: |
| 180 | vldr.d d4, zero @ d4 = VC0 |
| 181 | vldr.d d5, zero |
| 182 | vldr.d d6, zero @ d6 = VD0 |
| 183 | vldr.d d7, zero |
| 184 | .set J, 512 - 64 |
| 185 | .set OFFSET, -IMM_OFF_SKEW |
| 186 | inner_loop cd,, head |
| 187 | .rept 7 |
| 188 | inner_loop cd, tail, head |
| 189 | .endr |
| 190 | inner_loop cd, tail |
| 191 | add P_WIN_UP, P_WIN_UP, #4*4 |
| 192 | sub P_WIN_DN, P_WIN_DN, #4*4 |
| 193 | add P_SB, P_SB, #(512+4)*4 |
| 194 | subs I, I, #1 |
| 195 | vstmia P_SB2_UP!, {VC0-VC3} |
| 196 | vstmdb P_SB2_DN!, {VD0-VD3} |
| 197 | bne 1b |
| 198 | |
| 199 | fmxr FPSCR, OLDFPSCR |
| 200 | vpop {s16-s31} |
| 201 | pop {r3-r7,pc} |
| 202 | endfunc |
| 203 | |
| 204 | .unreq IMDCT |
| 205 | .unreq ORIG_P_SB |
| 206 | .unreq P_SB_OFF |
| 207 | .unreq I |
| 208 | .unreq P_SB2_UP |
| 209 | .unreq OLDFPSCR |
| 210 | .unreq P_SB2_DN |
| 211 | .unreq P_WIN_DN |
| 212 | .unreq P_OUT_DN |
| 213 | .unreq P_SB |
| 214 | .unreq J_WRAP |
| 215 | .unreq P_WIN_UP |
| 216 | .unreq P_OUT_UP |
| 217 | |
| 218 | .unreq SCALE |
| 219 | .unreq SBUF_DAT_REV0 |
| 220 | .unreq SBUF_DAT_REV1 |
| 221 | .unreq SBUF_DAT_REV2 |
| 222 | .unreq SBUF_DAT_REV3 |
| 223 | .unreq VA0 |
| 224 | .unreq VA3 |
| 225 | .unreq VB0 |
| 226 | .unreq VB3 |
| 227 | .unreq VC0 |
| 228 | .unreq VC3 |
| 229 | .unreq VD0 |
| 230 | .unreq VD3 |
| 231 | .unreq SBUF_DAT0 |
| 232 | .unreq SBUF_DAT1 |
| 233 | .unreq SBUF_DAT2 |
| 234 | .unreq SBUF_DAT3 |
| 235 | .unreq SBUF_DAT_ALT0 |
| 236 | .unreq SBUF_DAT_ALT1 |
| 237 | .unreq SBUF_DAT_ALT2 |
| 238 | .unreq SBUF_DAT_ALT3 |
| 239 | .unreq WIN_DN_DAT0 |
| 240 | .unreq WIN_UP_DAT0 |
| 241 | |
| 242 | .align 3 |
| 243 | zero: .word 0, 0 |