| 1 | /* |
| 2 | * Copyright (c) 2013 RISC OS Open Ltd |
| 3 | * Author: Ben Avison <bavison@riscosopen.org> |
| 4 | * |
| 5 | * This file is part of FFmpeg. |
| 6 | * |
| 7 | * FFmpeg is free software; you can redistribute it and/or |
| 8 | * modify it under the terms of the GNU Lesser General Public |
| 9 | * License as published by the Free Software Foundation; either |
| 10 | * version 2.1 of the License, or (at your option) any later version. |
| 11 | * |
| 12 | * FFmpeg is distributed in the hope that it will be useful, |
| 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 15 | * Lesser General Public License for more details. |
| 16 | * |
| 17 | * You should have received a copy of the GNU Lesser General Public |
| 18 | * License along with FFmpeg; if not, write to the Free Software |
| 19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| 20 | */ |
| 21 | |
| 22 | #include "libavutil/arm/asm.S" |
| 23 | |
/*
 * Register aliases used throughout this file.
 *
 * Per the prototype of ff_imdct_half_vfp, a1-a3 hold the context, output and
 * input pointers on entry. Several aliases intentionally share one register
 * because their uses do not overlap:
 *   a2: ORIGOUT / J0          (ORIGOUT is copied into OUT before J0 is written)
 *   v5: OLDFPSCR / REVTAB_HI  (the rolled path keeps OLDFPSCR on the stack
 *                              while REVTAB_HI is in use)
 *   v6: IN_HI / OUT_HI        (pre-rotation loop vs. post-rotation loop)
 * J0-J3 receive the four output addresses computed from REVTAB entries.
 * The *_HI aliases are downward-moving pointers used only by the rolled
 * (generic size) macros.
 */
| 24 | CONTEXT .req a1 |
| 25 | ORIGOUT .req a2 |
| 26 | IN .req a3 |
| 27 | OUT .req v1 |
| 28 | REVTAB .req v2 |
| 29 | TCOS .req v3 |
| 30 | TSIN .req v4 |
| 31 | OLDFPSCR .req v5 |
| 32 | J0 .req a2 |
| 33 | J1 .req a4 |
| 34 | J2 .req ip |
| 35 | J3 .req lr |
| 36 | REVTAB_HI .req v5 |
| 37 | IN_HI .req v6 |
| 38 | OUT_HI .req v6 |
| 39 | TCOS_HI .req sl |
| 40 | TSIN_HI .req fp |
| 41 | |
/*
 * prerotation_innerloop - one unrolled pre-rotation step (fixed-size path).
 *
 * Reads the assembler symbols k and n4, which the invoking code must set,
 * and adds 2 to k on exit. Each expansion handles four points: two taken
 * from table index k upwards and two from index n4 - k - 2 upwards.
 *
 * The instructions marked "vector operation" depend on the short-vector
 * length of 4 that the caller programs into FPSCR, so each one processes
 * four consecutive registers. Per lane the result is
 *     s8..s11  = s0..s3 * cos - s4..s7 * sin
 *     s12..s15 = s0..s3 * sin + s4..s7 * cos
 * with cos in s16-s19 (from TCOS) and sin in s20-s23 (from TSIN).
 * Each lane's pair (s8+i, s12+i) is stored at OUT + 8 * index, where the
 * index comes from REVTAB.
 *
 * Clobbers J0-J3 and s0-s23.
 */
| 42 | .macro prerotation_innerloop |
/* Element offsets of the low/high positions in the twiddle and input arrays */
| 43 | .set trig_lo, k |
| 44 | .set trig_hi, n4 - k - 2 |
| 45 | .set in_lo, trig_lo * 2 |
| 46 | .set in_hi, trig_hi * 2 |
/* Two cosines from each end of TCOS */
| 47 | vldr d8, [TCOS, #trig_lo*4] @ s16,s17 |
| 48 | vldr d9, [TCOS, #trig_hi*4] @ s18,s19 |
/* s0-s3: odd-offset input words; the high-end pair is read in descending order */
| 49 | vldr s0, [IN, #in_hi*4 + 12] |
| 50 | vldr s1, [IN, #in_hi*4 + 4] |
| 51 | vldr s2, [IN, #in_lo*4 + 12] |
| 52 | vldr s3, [IN, #in_lo*4 + 4] |
| 53 | vmul.f s8, s0, s16 @ vector operation |
/* Two sines from each end of TSIN */
| 54 | vldr d10, [TSIN, #trig_lo*4] @ s20,s21 |
| 55 | vldr d11, [TSIN, #trig_hi*4] @ s22,s23 |
/* s4-s7: even-offset input words, low end first */
| 56 | vldr s4, [IN, #in_lo*4] |
| 57 | vldr s5, [IN, #in_lo*4 + 8] |
| 58 | vldr s6, [IN, #in_hi*4] |
| 59 | vldr s7, [IN, #in_hi*4 + 8] |
/* One 32-bit load fetches two adjacent 16-bit REVTAB entries */
| 60 | ldr J0, [REVTAB, #trig_lo*2] |
| 61 | vmul.f s12, s0, s20 @ vector operation |
| 62 | ldr J2, [REVTAB, #trig_hi*2] |
/* Split the word: upper halfword -> J1, low byte of lower halfword -> J0 */
| 63 | mov J1, J0, lsr #16 |
| 64 | and J0, J0, #255 @ halfword value will be < n4 |
| 65 | vmls.f s8, s4, s20 @ vector operation |
| 66 | mov J3, J2, lsr #16 |
| 67 | and J2, J2, #255 @ halfword value will be < n4 |
/* Turn each index into a byte address: OUT + index * 8 */
| 68 | add J0, OUT, J0, lsl #3 |
| 69 | vmla.f s12, s4, s16 @ vector operation |
| 70 | add J1, OUT, J1, lsl #3 |
| 71 | add J2, OUT, J2, lsl #3 |
| 72 | add J3, OUT, J3, lsl #3 |
/* First word of each output pair */
| 73 | vstr s8, [J0] |
| 74 | vstr s9, [J1] |
| 75 | vstr s10, [J2] |
| 76 | vstr s11, [J3] |
/* Second word of each output pair */
| 77 | vstr s12, [J0, #4] |
| 78 | vstr s13, [J1, #4] |
| 79 | vstr s14, [J2, #4] |
| 80 | vstr s15, [J3, #4] |
| 81 | .set k, k + 2 |
| 82 | .endm |
| 83 | |
/*
 * prerotation_innerloop_rolled - loop-body form of the pre-rotation step.
 *
 * Performs the same arithmetic and stores as prerotation_innerloop, but
 * addresses everything through moving pointers instead of assembly-time
 * offsets:
 *   TCOS, TSIN, IN, REVTAB            advance upwards
 *   TCOS_HI, TSIN_HI, IN_HI, REVTAB_HI move downwards
 * Per expansion TCOS/TSIN move by 8 bytes, IN/IN_HI by 16 bytes and
 * REVTAB/REVTAB_HI by 4 bytes. The "vector operation" instructions depend
 * on the FPSCR short-vector length of 4 set by the caller.
 *
 * Clobbers J0-J3 and s0-s23.
 */
| 84 | .macro prerotation_innerloop_rolled |
/* Cosines: two from the rising pointer, two from the falling pointer */
| 85 | vldmia TCOS!, {s16,s17} |
| 86 | vldmdb TCOS_HI!, {s18,s19} |
/* s0-s3: odd-offset input words (high end via negative offsets from IN_HI) */
| 87 | vldr s0, [IN_HI, #-4] |
| 88 | vldr s1, [IN_HI, #-12] |
| 89 | vldr s2, [IN, #12] |
| 90 | vldr s3, [IN, #4] |
| 91 | vmul.f s8, s0, s16 @ vector operation |
/* Sines, same layout as the cosines */
| 92 | vldmia TSIN!, {s20,s21} |
| 93 | vldmdb TSIN_HI!, {s22,s23} |
/* s4-s7: even-offset input words */
| 94 | vldr s4, [IN] |
| 95 | vldr s5, [IN, #8] |
| 96 | vldr s6, [IN_HI, #-16] |
| 97 | vldr s7, [IN_HI, #-8] |
| 98 | vmul.f s12, s0, s20 @ vector operation |
/* Step both input pointers towards each other by four words */
| 99 | add IN, IN, #16 |
| 100 | sub IN_HI, IN_HI, #16 |
/* Two indices from the rising REVTAB pointer */
| 101 | ldrh J0, [REVTAB], #2 |
| 102 | ldrh J1, [REVTAB], #2 |
| 103 | vmls.f s8, s4, s20 @ vector operation |
/* Two indices from the falling pointer; J3 is read first so J2 gets the lower address */
| 104 | ldrh J3, [REVTAB_HI, #-2]! |
| 105 | ldrh J2, [REVTAB_HI, #-2]! |
/* Turn each index into a byte address: OUT + index * 8 */
| 106 | add J0, OUT, J0, lsl #3 |
| 107 | vmla.f s12, s4, s16 @ vector operation |
| 108 | add J1, OUT, J1, lsl #3 |
| 109 | add J2, OUT, J2, lsl #3 |
| 110 | add J3, OUT, J3, lsl #3 |
/* First word of each output pair */
| 111 | vstr s8, [J0] |
| 112 | vstr s9, [J1] |
| 113 | vstr s10, [J2] |
| 114 | vstr s11, [J3] |
/* Second word of each output pair */
| 115 | vstr s12, [J0, #4] |
| 116 | vstr s13, [J1, #4] |
| 117 | vstr s14, [J2, #4] |
| 118 | vstr s15, [J3, #4] |
| 119 | .endm |
| 120 | |
/*
 * postrotation_innerloop tail, head - one software-pipelined, unrolled
 * post-rotation step operating in place on OUT (fixed-size path).
 *
 * Arguments (each is enabled by passing any non-empty string):
 *   head - start the step for the current k: load twiddles and four
 *          two-word OUT pairs, issue the two multiplies, then add 2 to k.
 *   tail - finish the step that was started with k - 2: issue the
 *          multiply-subtract / multiply-accumulate and store the results.
 * The two halves are interleaved line by line, so a call with both set
 * completes the previous step while starting the next one.
 *
 * Reads the assembler symbols k and n8. The "vector operation" instructions
 * depend on the FPSCR short-vector length of 4 set by the caller.
 * With a = +0 word and b = +4 word of each loaded pair, a finished lane is
 *     s8..s11  = b * sin - a * cos
 *     s12..s15 = a * sin + b * cos
 * s8-s11 go back to the +0 words in load order; s12-s15 go to the +4 words
 * in reverse order (hi+1, hi, lo+1, lo).
 */
| 121 | .macro postrotation_innerloop tail, head |
/* Element offsets for the step being started (head) ... */
| 122 | .set trig_lo_head, n8 - k - 2 |
| 123 | .set trig_hi_head, n8 + k |
| 124 | .set out_lo_head, trig_lo_head * 2 |
| 125 | .set out_hi_head, trig_hi_head * 2 |
/* ... and for the step being finished (tail), i.e. the same formulas at k - 2 */
| 126 | .set trig_lo_tail, n8 - (k - 2) - 2 |
| 127 | .set trig_hi_tail, n8 + (k - 2) |
| 128 | .set out_lo_tail, trig_lo_tail * 2 |
| 129 | .set out_hi_tail, trig_hi_tail * 2 |
/*
 * Alternate the TCOS registers between s20-s23 and s24-s27 on successive
 * steps: the head loads new cosines before the tail has finished using the
 * previous ones, so the two must not share registers.
 */
| 130 | .if (k & 2) == 0 |
| 131 | TCOS_D0_HEAD .req d10 @ s20,s21 |
| 132 | TCOS_D1_HEAD .req d11 @ s22,s23 |
| 133 | TCOS_S0_TAIL .req s24 |
| 134 | .else |
| 135 | TCOS_D0_HEAD .req d12 @ s24,s25 |
| 136 | TCOS_D1_HEAD .req d13 @ s26,s27 |
| 137 | TCOS_S0_TAIL .req s20 |
| 138 | .endif |
/* tail: s8..s11 -= a * cos */
| 139 | .ifnc "\tail","" |
| 140 | vmls.f s8, s0, TCOS_S0_TAIL @ vector operation |
| 141 | .endif |
/* head: sines for both ends, cosines for the low end */
| 142 | .ifnc "\head","" |
| 143 | vldr d8, [TSIN, #trig_lo_head*4] @ s16,s17 |
| 144 | vldr d9, [TSIN, #trig_hi_head*4] @ s18,s19 |
| 145 | vldr TCOS_D0_HEAD, [TCOS, #trig_lo_head*4] |
| 146 | .endif |
/* tail: s12..s15 += b * cos */
| 147 | .ifnc "\tail","" |
| 148 | vmla.f s12, s4, TCOS_S0_TAIL @ vector operation |
| 149 | .endif |
/* head: s0-s3 = +0 words, s4-s7 = +4 words of pairs lo, lo+1, hi, hi+1 */
| 150 | .ifnc "\head","" |
| 151 | vldr s0, [OUT, #out_lo_head*4] |
| 152 | vldr s1, [OUT, #out_lo_head*4 + 8] |
| 153 | vldr s2, [OUT, #out_hi_head*4] |
| 154 | vldr s3, [OUT, #out_hi_head*4 + 8] |
| 155 | vldr s4, [OUT, #out_lo_head*4 + 4] |
| 156 | vldr s5, [OUT, #out_lo_head*4 + 12] |
| 157 | vldr s6, [OUT, #out_hi_head*4 + 4] |
| 158 | vldr s7, [OUT, #out_hi_head*4 + 12] |
| 159 | .endif |
/* tail: write s8-s11 to the +0 words in load order */
| 160 | .ifnc "\tail","" |
| 161 | vstr s8, [OUT, #out_lo_tail*4] |
| 162 | vstr s9, [OUT, #out_lo_tail*4 + 8] |
| 163 | vstr s10, [OUT, #out_hi_tail*4] |
| 164 | vstr s11, [OUT, #out_hi_tail*4 + 8] |
| 165 | .endif |
/* head: s8..s11 = b * sin */
| 166 | .ifnc "\head","" |
| 167 | vmul.f s8, s4, s16 @ vector operation |
| 168 | .endif |
/* tail: write s12-s15 to the +4 words in reverse order */
| 169 | .ifnc "\tail","" |
| 170 | vstr s12, [OUT, #out_hi_tail*4 + 12] |
| 171 | vstr s13, [OUT, #out_hi_tail*4 + 4] |
| 172 | vstr s14, [OUT, #out_lo_tail*4 + 12] |
| 173 | vstr s15, [OUT, #out_lo_tail*4 + 4] |
| 174 | .endif |
/* head: s12..s15 = a * sin, then fetch the high-end cosines */
| 175 | .ifnc "\head","" |
| 176 | vmul.f s12, s0, s16 @ vector operation |
| 177 | vldr TCOS_D1_HEAD, [TCOS, #trig_hi_head*4] |
| 178 | .endif |
/* Drop the per-expansion aliases so the next expansion can redefine them */
| 179 | .unreq TCOS_D0_HEAD |
| 180 | .unreq TCOS_D1_HEAD |
| 181 | .unreq TCOS_S0_TAIL |
/* k only advances when a new step was started */
| 182 | .ifnc "\head","" |
| 183 | .set k, k + 2 |
| 184 | .endif |
| 185 | .endm |
| 186 | |
/*
 * postrotation_innerloop_rolled - loop-body form of the pipelined
 * post-rotation step, using moving pointers and caller-chosen registers.
 *
 * Arguments:
 *   tail, head       - non-empty to enable that half; tail finishes the
 *                      previously started step, head starts a new one.
 *   tcos_s0_head,
 *   tcos_s1_head     - registers loaded with two cosines from TCOS (rising).
 *   tcos_s2_head,
 *   tcos_s3_head     - registers loaded with two cosines from TCOS_HI (falling).
 *   tcos_s0_tail     - first register of the four-register cosine vector
 *                      consumed by the tail.
 *   out_offset_head,
 *   out_offset_tail  - byte offsets added to OUT and subtracted from OUT_HI
 *                      for the head loads and the tail stores respectively.
 *
 * The head advances TSIN and TCOS by 8 bytes and moves TSIN_HI and TCOS_HI
 * back by 8 bytes; OUT and OUT_HI are not modified here. The "vector
 * operation" instructions depend on the FPSCR short-vector length of 4 set
 * by the caller. With a = +0 word and b = +4 word of each loaded pair, a
 * finished lane is
 *     s8..s11  = b * sin - a * cos
 *     s12..s15 = a * sin + b * cos
 */
| 187 | .macro postrotation_innerloop_rolled tail, head, tcos_s0_head, tcos_s1_head, tcos_s2_head, tcos_s3_head, tcos_s0_tail, out_offset_head, out_offset_tail |
/* tail: s8..s11 -= a * cos */
| 188 | .ifnc "\tail","" |
| 189 | vmls.f s8, s0, \tcos_s0_tail @ vector operation |
| 190 | .endif |
/* head: sines from both ends, cosines from the rising pointer */
| 191 | .ifnc "\head","" |
| 192 | vldmia TSIN!, {s16,s17} |
| 193 | vldmdb TSIN_HI!, {s18,s19} |
| 194 | vldmia TCOS!, {\tcos_s0_head,\tcos_s1_head} |
| 195 | .endif |
/* tail: s12..s15 += b * cos */
| 196 | .ifnc "\tail","" |
| 197 | vmla.f s12, s4, \tcos_s0_tail @ vector operation |
| 198 | .endif |
/* head: s0-s3 = +0 words, s4-s7 = +4 words; two pairs above OUT, two below OUT_HI */
| 199 | .ifnc "\head","" |
| 200 | vldr s0, [OUT, #+\out_offset_head+0] |
| 201 | vldr s1, [OUT, #+\out_offset_head+8] |
| 202 | vldr s2, [OUT_HI, #-\out_offset_head-16] |
| 203 | vldr s3, [OUT_HI, #-\out_offset_head-8] |
| 204 | vldr s4, [OUT, #+\out_offset_head+4] |
| 205 | vldr s5, [OUT, #+\out_offset_head+12] |
| 206 | vldr s6, [OUT_HI, #-\out_offset_head-12] |
| 207 | vldr s7, [OUT_HI, #-\out_offset_head-4] |
| 208 | .endif |
/* tail: write s8-s11 to the +0 words in load order */
| 209 | .ifnc "\tail","" |
| 210 | vstr s8, [OUT, #+\out_offset_tail+0] |
| 211 | vstr s9, [OUT, #+\out_offset_tail+8] |
| 212 | vstr s10, [OUT_HI, #-\out_offset_tail-16] |
| 213 | vstr s11, [OUT_HI, #-\out_offset_tail-8] |
| 214 | .endif |
/* head: s8..s11 = b * sin */
| 215 | .ifnc "\head","" |
| 216 | vmul.f s8, s4, s16 @ vector operation |
| 217 | .endif |
/* tail: write s12-s15 to the +4 words in reverse order */
| 218 | .ifnc "\tail","" |
| 219 | vstr s12, [OUT_HI, #-\out_offset_tail-4] |
| 220 | vstr s13, [OUT_HI, #-\out_offset_tail-12] |
| 221 | vstr s14, [OUT, #+\out_offset_tail+12] |
| 222 | vstr s15, [OUT, #+\out_offset_tail+4] |
| 223 | .endif |
/* head: s12..s15 = a * sin, then fetch cosines from the falling pointer */
| 224 | .ifnc "\head","" |
| 225 | vmul.f s12, s0, s16 @ vector operation |
| 226 | vldmdb TCOS_HI!, {\tcos_s2_head,\tcos_s3_head} |
| 227 | .endif |
| 228 | .endm |
| 229 | |
| 230 | |
| 231 | /* void ff_imdct_half_vfp(FFTContext *s, |
| 232 | * FFTSample *output, |
| 233 | * const FFTSample *input) |
| 234 | */ |
/*
 * ff_imdct_half_vfp
 *
 * In:   a1 = context, a2 = output buffer, a3 = input buffer.
 * Flow: pre-rotate the input into the output buffer at REVTAB-selected
 *       positions, run the FFT in place on the output buffer, then
 *       post-rotate the output buffer in place.
 *
 * mdct_bits == 6 takes a fully unrolled path that calls ff_fft16_vfp
 * directly; every other size takes the rolled path at label 10, which
 * calls the function pointer stored at CONTEXT + 9*4.
 *
 * FPSCR is switched to short-vector mode only around the rotation code and
 * is restored to the caller's value before each call and before returning.
 *
 * Context offsets used: 2*4 REVTAB, 5*4 mdct_bits, 6*4 TCOS, 7*4 TSIN,
 * 9*4 fft_calc.
 */
| 235 | function ff_imdct_half_vfp, export=1 |
| 236 | ldr ip, [CONTEXT, #5*4] @ mdct_bits |
| 237 | teq ip, #6 |
| 238 | bne 10f |
| 239 | |
/* ---- Fixed-size path: sizes are assembly-time constants ---- */
| 240 | .set n, 1<<6 |
| 241 | .set n2, n/2 |
| 242 | .set n4, n/4 |
| 243 | .set n8, n/8 |
| 244 | |
/* 6 core registers + 12 VFP singles = 72 bytes, so sp stays 8-byte aligned */
| 245 | push {v1-v5,lr} |
| 246 | vpush {s16-s27} |
| 247 | fmrx OLDFPSCR, FPSCR |
| 248 | ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1 |
| 249 | fmxr FPSCR, lr |
/* a2 is about to be reused as J0, so move the output pointer to v1 */
| 250 | mov OUT, ORIGOUT |
| 251 | ldr REVTAB, [CONTEXT, #2*4] |
| 252 | ldr TCOS, [CONTEXT, #6*4] |
| 253 | ldr TSIN, [CONTEXT, #7*4] |
| 254 | |
/* Each expansion handles four points, so n8/2 expansions cover n4 points */
| 255 | .set k, 0 |
| 256 | .rept n8/2 |
| 257 | prerotation_innerloop |
| 258 | .endr |
| 259 | |
/* Restore the caller's FPSCR for the callee, then re-enter short-vector mode */
| 260 | fmxr FPSCR, OLDFPSCR |
| 261 | mov a1, OUT |
| 262 | bl X(ff_fft16_vfp) |
| 263 | ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1 |
| 264 | fmxr FPSCR, lr |
| 265 | |
/* Pipelined post-rotation: head-only prologue, combined steps, tail-only epilogue */
| 266 | .set k, 0 |
| 267 | postrotation_innerloop , head |
| 268 | .rept n8/2 - 1 |
| 269 | postrotation_innerloop tail, head |
| 270 | .endr |
| 271 | postrotation_innerloop tail |
| 272 | |
| 273 | fmxr FPSCR, OLDFPSCR |
| 274 | vpop {s16-s27} |
| 275 | pop {v1-v5,pc} |
| 276 | |
/* ---- Generic path: size taken from mdct_bits (still in ip) ---- */
| 277 | 10: |
/*
 * a4 is saved purely as padding. With it the frame is 10 core registers
 * (40 bytes) + 12 VFP singles (48 bytes) + the 2-word push below (8 bytes)
 * = 96 bytes, so sp is 8-byte aligned at the blx as the AAPCS requires.
 * Without it the frame was 92 bytes and the callee ran on a misaligned stack.
 */
| 278 | push {a4,v1-v6,sl,fp,lr} |
| 279 | vpush {s16-s27} |
| 280 | fmrx OLDFPSCR, FPSCR |
| 281 | ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1 |
| 282 | fmxr FPSCR, lr |
| 283 | mov lr, #1 |
| 284 | mov OUT, ORIGOUT |
| 285 | ldr REVTAB, [CONTEXT, #2*4] |
| 286 | ldr TCOS, [CONTEXT, #6*4] |
| 287 | ldr TSIN, [CONTEXT, #7*4] |
/* lr = 1 << mdct_bits */
| 288 | mov lr, lr, lsl ip |
| 289 | |
/* OLDFPSCR shares v5 with REVTAB_HI, and a1 is not preserved by the call: keep both on the stack */
| 290 | push {CONTEXT,OLDFPSCR} |
/* End pointers: IN + 2*lr bytes, REVTAB + lr/2 bytes, TCOS/TSIN + lr bytes */
| 291 | add IN_HI, IN, lr, lsl #1 |
| 292 | add REVTAB_HI, REVTAB, lr, lsr #1 |
| 293 | add TCOS_HI, TCOS, lr |
| 294 | add TSIN_HI, TSIN, lr |
/* Run until the rising and falling input pointers meet */
| 295 | 0: prerotation_innerloop_rolled |
| 296 | teq IN, IN_HI |
| 297 | bne 0b |
/* Reload without popping; the saved copies are needed again after the call */
| 298 | ldmia sp, {CONTEXT,OLDFPSCR} |
| 299 | |
/* Second argument = output buffer; the callee runs with the caller's FPSCR */
| 300 | mov ORIGOUT, OUT |
| 301 | fmxr FPSCR, OLDFPSCR |
| 302 | ldr ip, [CONTEXT, #9*4] |
| 303 | blx ip @ s->fft_calc(s, output) |
| 304 | |
| 305 | pop {CONTEXT,OLDFPSCR} |
| 306 | ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1 |
| 307 | ldr ip, [CONTEXT, #5*4] @ mdct_bits |
| 308 | fmxr FPSCR, lr |
| 309 | mov lr, #1 |
| 310 | mov lr, lr, lsl ip |
/* The pre-rotation loop advanced TCOS/TSIN by lr/2 bytes in total; rewind them */
| 311 | sub TCOS, TCOS, lr, lsr #1 |
| 312 | sub TSIN, TSIN, lr, lsr #1 |
| 313 | add OUT_HI, OUT, lr, lsl #1 |
| 314 | add TCOS_HI, TCOS, lr |
| 315 | add TSIN_HI, TSIN, lr |
/*
 * Pipelined post-rotation loop. Two steps per iteration alternate the cosine
 * registers between s20-s23 and s24-s27. The prologue starts the first step
 * and enters the loop at its second half.
 */
| 316 | postrotation_innerloop_rolled , head, s20, s21, s22, s23,, 0 |
| 317 | b 1f |
/* OUT/OUT_HI move 32 bytes per iteration, hence the -16 tail offset below */
| 318 | 0: add OUT, OUT, #32 |
| 319 | sub OUT_HI, OUT_HI, #32 |
| 320 | postrotation_innerloop_rolled tail, head, s20, s21, s22, s23, s24, 0, -16 |
| 321 | 1: postrotation_innerloop_rolled tail, head, s24, s25, s26, s27, s20, 16, 0 |
| 322 | teq TSIN, TSIN_HI |
| 323 | bne 0b |
/* Epilogue: finish the step started by the last head */
| 324 | postrotation_innerloop_rolled tail,,,,,, s24,, 16 |
| 325 | |
| 326 | fmxr FPSCR, OLDFPSCR |
| 327 | vpop {s16-s27} |
/* a4 is reloaded only to unwind the padding slot pushed on entry */
| 328 | pop {a4,v1-v6,sl,fp,pc} |
| 329 | endfunc |
| 330 | |
/* Remove every register alias defined at the top of this file */
| 331 | .unreq CONTEXT |
| 332 | .unreq ORIGOUT |
| 333 | .unreq IN |
| 334 | .unreq OUT |
| 335 | .unreq REVTAB |
| 336 | .unreq TCOS |
| 337 | .unreq TSIN |
| 338 | .unreq OLDFPSCR |
| 339 | .unreq J0 |
| 340 | .unreq J1 |
| 341 | .unreq J2 |
| 342 | .unreq J3 |
| 343 | .unreq REVTAB_HI |
| 344 | .unreq IN_HI |
| 345 | .unreq OUT_HI |
| 346 | .unreq TCOS_HI |
| 347 | .unreq TSIN_HI |