| 1 | /* |
| 2 | * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net> |
| 3 | * |
| 4 | * This file is part of FFmpeg. |
| 5 | * |
| 6 | * FFmpeg is free software; you can redistribute it and/or |
| 7 | * modify it under the terms of the GNU Lesser General Public |
| 8 | * License as published by the Free Software Foundation; either |
| 9 | * version 2.1 of the License, or (at your option) any later version. |
| 10 | * |
| 11 | * FFmpeg is distributed in the hope that it will be useful, |
| 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 14 | * Lesser General Public License for more details. |
| 15 | * |
| 16 | * You should have received a copy of the GNU Lesser General Public |
| 17 | * License along with FFmpeg; if not, write to the Free Software |
| 18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| 19 | */ |
| 20 | |
| 21 | #include "libavutil/aarch64/asm.S" |
| 22 | #include "asm-offsets.h" |
| 23 | |
/*
 * resample_one \fmt, \es
 *
 * Instantiates ff_resample_one_\fmt\()_neon, which computes a single
 * output sample as the dot product of filter_length source samples with
 * one phase of the polyphase filter bank:
 *
 *     dst[dst_index] = sum_i(src[sample_index + i] * filter[i])
 *
 * Register arguments (NOTE(review): inferred from the loads below and the
 * asm-offsets.h constants — confirm against the C prototype):
 *     x0  ResampleContext pointer (FILTER_BANK, FILTER_LENGTH and
 *         PHASE_SHIFT/phase_mask are read from it)
 *     x1  dst base pointer
 *     w2  dst index (32-bit)
 *     x3  src base pointer
 *     x4  combined index: (sample_index << phase_shift) | phase
 *
 * \es is log2 of the element size in bytes (2 for flt/s32, 3 for dbl,
 * 1 for s16).
 *
 * The per-format helper macros LOAD1/LOAD2/LOAD4/LOAD8 (load N elements
 * with post-increment), M_MUL/M_MLA (lanewise multiply / multiply-
 * accumulate) and STORE_ONE (horizontal reduce + store) must be defined
 * before each instantiation; they are purged at the end so the next
 * format can redefine them.
 */
.macro resample_one fmt, es=2
.ifnc \fmt, dbl
/* For every format except dbl, one M_MUL/M_MLA invocation already covers
 * the full element width (either a single register holds all lanes, or
 * the widening high-half instruction is emitted via the optional d2
 * argument), so the second-accumulator helpers are stubbed out here as
 * no-ops.  dbl uses the generic forwarding versions defined at file
 * scope instead. */
.macro M_MUL2 x:vararg
.endm
.macro M_MLA2 x:vararg
.endm
.endif
function ff_resample_one_\fmt\()_neon, export=1
        sxtw            x2,  w2                         // widen dst index for scaled store addressing
        ldr             x9,  [x0, #FILTER_BANK]         // base of the filter coefficient bank
        ldr             w6,  [x0, #FILTER_LENGTH]       // number of taps per filter phase
        ldp             w7,  w8,  [x0, #PHASE_SHIFT]    // and phase_mask (adjacent struct field)
        lsr             x10, x4,  x7                    // sample_index = index >> phase_shift
        and             x4,  x4,  x8                    // phase        = index &  phase_mask
        lsl             x11, x6,  #\es                  // filter_length * elem_size (bytes per phase)
        add             x3,  x3,  x10, lsl #\es         // src += sample_index (scaled by elem size)
        madd            x9,  x11, x4,  x9               // filter = bank + phase * (bytes per phase)
        cmp             w6,  #16
        b.lt            5f                              // short filter: zero accumulators, tail loops only
8:      // remaining filter_length at least 16; software-pipelined main
        // loop consuming 16 elements per pass, accumulating into v0
        // (and v1 where the format keeps a second accumulator)
        subs            w6,  w6,  #16
        LOAD8           v4,  v5,  v6,  v7,  x3          // first 8 src elements
        LOAD8           v16, v17, v18, v19, x9          // first 8 coefficients
        M_MUL           v0,  v4,  v16, v1               // initialize accumulators (multiply, no add)
        M_MUL2          v1,  v6,  v18
7:
        LOAD8           v20, v21, v22, v23, x3          // next 8 src elements
        M_MLA           v0,  v5,  v17, v1
        M_MLA2          v1,  v7,  v19
        LOAD8           v24, v25, v26, v27, x9          // next 8 coefficients
        M_MLA           v0,  v20, v24, v1
        M_MLA2          v1,  v22, v26
        b.eq            6f                              // exactly 16 consumed total: drain and store
        cmp             w6,  #16
        M_MLA           v0,  v21, v25, v1
        M_MLA2          v1,  v23, v27
        b.lt            4f                              // fewer than 16 left: finish in the 4-wide loop
        subs            w6,  w6,  #16
        LOAD8           v4,  v5,  v6,  v7,  x3
        LOAD8           v16, v17, v18, v19, x9
        M_MLA           v0,  v4, v16,  v1
        M_MLA2          v1,  v6, v18
        b               7b
6:      // filter exhausted on a 16-element boundary: final MLA, then store
        M_MLA           v0,  v21, v25, v1
        M_MLA2          v1,  v23, v27
        STORE_ONE       0,   x1,  x2,  v1
        ret
5:      // filter_length < 16: start with zeroed accumulators
        movi            v0.16b, #0
        movi            v1.16b, #0
4:      // remaining filter_length 1-15; consume 4 elements per pass
        cmp             w6,  #4
        b.lt            2f
        subs            w6,  w6,  #4
        LOAD4           v4,  v5,  x3
        LOAD4           v6,  v7,  x9
        M_MLA           v0,  v4,  v6, v1
        M_MLA2          v1,  v5,  v7
        b.eq            0f
        b               4b
2:      // remaining filter_length 1-3
        cmp             w6,  #2
        b.lt            1f
        LOAD2           2,   x3                         // NB: LOAD2/LOAD1 take bare register numbers
        LOAD2           3,   x9
        subs            w6,  w6,  #2
        M_MLA           v0,  v2,  v3                    // lanes beyond the loaded ones are zero
        b.eq            0f
1:      // remaining filter_length 1
        LOAD1           6,   x3
        LOAD1           7,   x9
        M_MLA           v0,  v6,  v7
0:      // horizontal-reduce v0 (+ v1 where used) and store dst[dst_index]
        STORE_ONE       0,   x1,  x2,  v1
        ret
endfunc

/* Purge all per-format helpers so the next instantiation can redefine
 * them without a duplicate-macro-definition error. */
.purgem LOAD1
.purgem LOAD2
.purgem LOAD4
.purgem LOAD8
.purgem M_MLA
.purgem M_MLA2
.purgem M_MUL
.purgem M_MUL2
.purgem STORE_ONE
.endm
| 112 | |
| 113 | |
/*
 * double precision (f64) helpers: 2 lanes per vector register, so 8
 * elements need four registers and a second accumulator (v1) is kept
 * live through the generic forwarding M_MUL2/M_MLA2 below.
 */
/* load 1 double; the rest of the vector register is zeroed */
.macro LOAD1 d1, addr
        ldr             d\d1, [\addr], #8
.endm
/* load 2 doubles (one full vector) */
.macro LOAD2 d1, addr
        ld1             {v\d1\().2d}, [\addr], #16
.endm
/* load 4 doubles into two registers */
.macro LOAD4 d1, d2, addr
        ld1             {\d1\().2d,\d2\().2d}, [\addr], #32
.endm
/* load 8 doubles into four registers */
.macro LOAD8 d1, d2, d3, d4, addr
        ld1             {\d1\().2d,\d2\().2d,\d3\().2d,\d4\().2d}, [\addr], #64
.endm
/* \d += \r0 * \r1, lanewise; optional \d2 is ignored here */
.macro M_MLA d, r0, r1, d2:vararg
        fmla            \d\().2d, \r0\().2d, \r1\().2d
.endm
/* second-accumulator variant: simply forwards to M_MLA (only dbl keeps
 * these generic versions; other formats stub them out) */
.macro M_MLA2 second:vararg
        M_MLA           \second
.endm
/* \d = \r0 * \r1, lanewise; optional \d2 is ignored here */
.macro M_MUL d, r0, r1, d2:vararg
        fmul            \d\().2d, \r0\().2d, \r1\().2d
.endm
/* second-accumulator variant: forwards to M_MUL */
.macro M_MUL2 second:vararg
        M_MUL           \second
.endm
/* merge the two accumulators, horizontal-add to a scalar double and
 * store dst[\idx] */
.macro STORE_ONE rn, addr, idx, d2
        fadd            v\rn\().2d, v\rn\().2d, \d2\().2d       // v\rn += second accumulator
        faddp           d\rn\(),    v\rn\().2d                  // pairwise add -> scalar sum
        str             d\rn\(),    [\addr, \idx, lsl #3]       // 8-byte elements
.endm

resample_one    dbl, 3                  // elem_size = 2^3 = 8 bytes
| 145 | |
| 146 | |
/*
 * single precision (f32) helpers: 4 lanes per vector register, so two
 * registers already cover 8 elements.  The \d3/\d4 and \d2 parameters
 * are unused placeholders kept for interface parity with dbl, and
 * M_MUL2/M_MLA2 are no-op stubs (defined inside resample_one).
 */
/* load 1 float; the rest of the vector register is zeroed */
.macro LOAD1 d1, addr
        ldr             s\d1, [\addr], #4
.endm
/* load 2 floats into the low half; upper lanes are zeroed */
.macro LOAD2 d1, addr
        ld1             {v\d1\().2s}, [\addr], #8
.endm
/* load 4 floats; \d2 unused */
.macro LOAD4 d1, d2, addr
        ld1             {\d1\().4s}, [\addr], #16
.endm
/* load 8 floats into two registers; \d3, \d4 unused */
.macro LOAD8 d1, d2, d3, d4, addr
        ld1             {\d1\().4s,\d2\().4s}, [\addr], #32
.endm
/* \d += \r0 * \r1, lanewise; optional \d2 ignored */
.macro M_MLA d, r0, r1, d2:vararg
        fmla            \d\().4s, \r0\().4s, \r1\().4s
.endm
/* \d = \r0 * \r1, lanewise; optional \d2 ignored */
.macro M_MUL d, r0, r1, d2:vararg
        fmul            \d\().4s, \r0\().4s, \r1\().4s
.endm
/* horizontal-add the 4 lanes of v\rn and store dst[\idx]; \d2 ignored —
 * the second accumulator is never populated for this format */
.macro STORE_ONE rn, addr, idx, d2
        faddp           v\rn\().4s, v\rn\().4s, v\rn\().4s      // [a+b, c+d, a+b, c+d]
        faddp           s\rn\(),    v\rn\().2s                  // (a+b) + (c+d)
        str             s\rn\(),    [\addr, \idx, lsl #2]       // 4-byte elements
.endm

resample_one    flt                     // default es=2: 4-byte elements
| 172 | |
| 173 | |
/*
 * signed 16-bit helpers: samples and coefficients are s16; products are
 * widened to s32 by smull/smlal, 4 lanes at a time.  The final >>15 with
 * rounding/saturation implies Q15 coefficients (NOTE(review): confirm
 * against the filter-bank build code).  \d3/\d4 and \d2 are unused
 * placeholders; M_MUL2/M_MLA2 are no-op stubs.
 */
/* load 1 s16; the rest of the vector register is zeroed */
.macro LOAD1 d1, addr
        ldr             h\d1, [\addr], #2
.endm
/* load 2 s16; the rest of the vector register is zeroed */
.macro LOAD2 d1, addr
        ldr             s\d1, [\addr], #4
.endm
/* load 4 s16; \d2 unused */
.macro LOAD4 d1, d2, addr
        ld1             {\d1\().4h}, [\addr], #8
.endm
/* load 8 s16 into two registers; \d3, \d4 unused */
.macro LOAD8 d1, d2, d3, d4, addr
        ld1             {\d1\().4h,\d2\().4h}, [\addr], #16
.endm
/* \d (s32 x4) += \r0 * \r1, widening multiply-accumulate; \d2 ignored */
.macro M_MLA d, r0, r1, d2:vararg
        smlal           \d\().4s, \r0\().4h, \r1\().4h
.endm
/* \d (s32 x4) = \r0 * \r1, widening multiply; \d2 ignored */
.macro M_MUL d, r0, r1, d2:vararg
        smull           \d\().4s, \r0\().4h, \r1\().4h
.endm
/* reduce the 4 s32 partial sums, round/narrow (saturating) by 15 bits
 * and store dst[\idx]; \d2 ignored */
.macro STORE_ONE rn, addr, idx, d2
        addp            v\rn\().4s, v\rn\().4s, v\rn\().4s      // 4 lanes -> 2 pair sums (duplicated)
        addp            v\rn\().4s, v\rn\().4s, v\rn\().4s      // lane 0 = total sum
        sqrshrn         v\rn\().4h, v\rn\().4s, #15             // >>15, rounded, saturating -> s16
        str             h\rn\(), [\addr, \idx, lsl #1]          // 2-byte elements
.endm

resample_one    s16, 1                  // elem_size = 2^1 = 2 bytes
| 200 | |
| 201 | |
/*
 * signed 32-bit helpers: products are widened to s64.  smlal/smull
 * handle the low 2 lanes into \d1 and, when the optional \d2 is given,
 * smlal2/smull2 handle the high 2 lanes into it — so both v0 and v1 act
 * as accumulators even though M_MUL2/M_MLA2 are no-op stubs for this
 * format.  The final >>30 with rounding/saturation implies Q30
 * coefficients (NOTE(review): confirm against the filter-bank build
 * code).
 */
/* load 1 s32; the rest of the vector register is zeroed */
.macro LOAD1 d1, addr
        ldr             s\d1, [\addr], #4
.endm
/* load 2 s32 into the low half; upper lanes are zeroed */
.macro LOAD2 d1, addr
        ld1             {v\d1\().2s}, [\addr], #8
.endm
/* load 4 s32; \d2 unused */
.macro LOAD4 d1, d2, addr
        ld1             {\d1\().4s}, [\addr], #16
.endm
/* load 8 s32 into two registers; \d3, \d4 unused */
.macro LOAD8 d1, d2, d3, d4, addr
        ld1             {\d1\().4s,\d2\().4s}, [\addr], #32
.endm
/* widening MLA: \d1 (s64 x2) += low 2 lanes; if \d2 is given,
 * \d2 (s64 x2) += high 2 lanes */
.macro M_MLA d1, r0, r1, d2:vararg
        smlal           \d1\().2d, \r0\().2s, \r1\().2s
.ifnb \d2
        smlal2          \d2\().2d, \r0\().4s, \r1\().4s
.endif
.endm
/* widening MUL: \d1 (s64 x2) = low 2 lanes; if \d2 is given,
 * \d2 (s64 x2) = high 2 lanes */
.macro M_MUL d1, r0, r1, d2:vararg
        smull           \d1\().2d, \r0\().2s, \r1\().2s
.ifnb \d2
        smull2          \d2\().2d, \r0\().4s, \r1\().4s
.endif
.endm
/* merge both s64 accumulators, horizontal-add, round/narrow
 * (saturating) by 30 bits and store dst[\idx] */
.macro STORE_ONE rn, addr, idx, d2
        add             v\rn\().2d, v\rn\().2d, \d2\().2d       // v\rn += second accumulator
        addp            d\rn\(),    v\rn\().2d                  // pairwise add -> scalar s64 sum
        sqrshrn         v\rn\().2s, v\rn\().2d, #30             // >>30, rounded, saturating -> s32
        str             s\rn\(), [\addr, \idx, lsl #2]          // 4-byte elements
.endm

resample_one    s32                     // default es=2: 4-byte elements