| 1 | /* |
| 2 | * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> |
| 3 | * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net> |
| 4 | * |
| 5 | * This file is part of FFmpeg. |
| 6 | * |
| 7 | * FFmpeg is free software; you can redistribute it and/or |
| 8 | * modify it under the terms of the GNU Lesser General Public |
| 9 | * License as published by the Free Software Foundation; either |
| 10 | * version 2.1 of the License, or (at your option) any later version. |
| 11 | * |
| 12 | * FFmpeg is distributed in the hope that it will be useful, |
| 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 15 | * Lesser General Public License for more details. |
| 16 | * |
| 17 | * You should have received a copy of the GNU Lesser General Public |
| 18 | * License along with FFmpeg; if not, write to the Free Software |
| 19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| 20 | */ |
| 21 | |
| 22 | #include "config.h" |
| 23 | #include "libavutil/aarch64/asm.S" |
| 24 | |
// Convert a contiguous run of 32-bit floats to signed 16-bit integers.
//
// In:    x0 = destination (16-bit elements, written sequentially)
//        x1 = source (32-bit floats, read sequentially)
//        w2 = number of samples; treated as a 32-bit value because the upper
//             half of a register carrying a 32-bit argument is unspecified
//             under AAPCS64.
// Clobb: x0, x1, w2, x12, v0-v7, flags
//
// Each float is converted to signed fixed point with 31 fractional bits
// (fcvtzs #31) and then narrowed to 16 bits with a rounding, saturating right
// shift by 16 (sqrshrn/sqrshrn2).
//
// The first 8 samples are loaded unconditionally and all counting is done in
// steps of 8 or 16, so the code expects the sample count to be a non-zero
// multiple of 8.
function ff_conv_flt_to_s16_neon, export=1
        subs            w2,  w2,  #8            // w2 = samples left after the first 8
        ld1             {v0.4s}, [x1],  #16
        fcvtzs          v4.4s,  v0.4s,  #31
        ld1             {v1.4s}, [x1],  #16
        fcvtzs          v5.4s,  v1.4s,  #31
        b.eq            3f                      // exactly 8 samples: narrow and store
        ands            w12, w2,  #~15          // w12 = remaining count rounded down to 16
        b.eq            2f                      // fewer than 16 left: single tail block
        // Main loop: stores 16 samples per iteration and preloads the next 8
        // into v4/v5. Flags from the subs survive to the b.ne because none of
        // the instructions in between write them.
1:      subs            w12, w12, #16
        sqrshrn         v4.4h,  v4.4s,  #16
        ld1             {v2.4s}, [x1],  #16
        fcvtzs          v6.4s,  v2.4s,  #31
        sqrshrn2        v4.8h,  v5.4s,  #16
        ld1             {v3.4s}, [x1],  #16
        fcvtzs          v7.4s,  v3.4s,  #31
        sqrshrn         v6.4h,  v6.4s,  #16
        st1             {v4.8h}, [x0],  #16
        sqrshrn2        v6.8h,  v7.4s,  #16
        ld1             {v0.4s}, [x1],  #16
        fcvtzs          v4.4s,  v0.4s,  #31
        ld1             {v1.4s}, [x1],  #16
        fcvtzs          v5.4s,  v1.4s,  #31
        st1             {v6.8h}, [x0],  #16
        b.ne            1b
        ands            w2,  w2,  #15           // anything beyond the preloaded 8?
        b.eq            3f
        // Tail: 8 samples pending in v4/v5 plus 8 more to load.
2:      ld1             {v2.4s}, [x1],  #16
        sqrshrn         v4.4h,  v4.4s,  #16
        fcvtzs          v6.4s,  v2.4s,  #31
        ld1             {v3.4s}, [x1],  #16
        sqrshrn2        v4.8h,  v5.4s,  #16
        fcvtzs          v7.4s,  v3.4s,  #31
        sqrshrn         v6.4h,  v6.4s,  #16
        st1             {v4.8h}, [x0],  #16
        sqrshrn2        v6.8h,  v7.4s,  #16
        st1             {v6.8h}, [x0]
        ret
        // Tail: only the 8 samples pending in v4/v5 remain.
3:      sqrshrn         v4.4h,  v4.4s,  #16
        sqrshrn2        v4.8h,  v5.4s,  #16
        st1             {v4.8h}, [x0]
        ret
endfunc
| 68 | |
// Convert two planar float channels to interleaved signed 16-bit samples.
//
// In:    x0 = destination (interleaved 16-bit pairs, written sequentially)
//        x1 = pointer to two consecutive channel pointers (32-bit float data)
//        w2 = number of samples per channel; treated as a 32-bit value because
//             the upper half of a register carrying a 32-bit argument is
//             unspecified under AAPCS64.
// Clobb: x0, w2, x4, x5, x12, v0-v7, v16-v23, flags
//
// Each float is converted to signed fixed point with 31 fractional bits
// (fcvtzs #31). "sri Vb, Va, #16" then shifts the first channel's value right
// by 16 into the low half of each 32-bit lane while keeping the top half of
// the second channel's value, so every lane becomes one interleaved pair built
// from the upper 16 bits of both values (no rounding step).
//
// The first 8 samples per channel are loaded unconditionally and counting is
// done in steps of 8 or 16, so the code expects the sample count to be a
// non-zero multiple of 8.
function ff_conv_fltp_to_s16_2ch_neon, export=1
        ldp             x4,  x5,  [x1]          // x4 = first channel, x5 = second channel
        subs            w2,  w2,  #8            // w2 = samples left after the first 8
        ld1             {v0.4s}, [x4],  #16
        fcvtzs          v4.4s,  v0.4s,  #31
        ld1             {v1.4s}, [x4],  #16
        fcvtzs          v5.4s,  v1.4s,  #31
        ld1             {v2.4s}, [x5],  #16
        fcvtzs          v6.4s,  v2.4s,  #31
        ld1             {v3.4s}, [x5],  #16
        fcvtzs          v7.4s,  v3.4s,  #31
        b.eq            3f                      // exactly 8 samples: pack and store
        ands            w12, w2,  #~15          // w12 = remaining count rounded down to 16
        b.eq            2f                      // fewer than 16 left: single tail block
        // Main loop: stores 16 sample pairs per iteration and preloads the
        // next 8 per channel into v4-v7. Flags from the subs survive to the
        // b.ne because none of the instructions in between write them.
1:      subs            w12, w12, #16
        ld1             {v16.4s}, [x4], #16
        fcvtzs          v20.4s, v16.4s, #31
        sri             v6.4s,  v4.4s,  #16
        ld1             {v17.4s}, [x4], #16
        fcvtzs          v21.4s, v17.4s, #31
        ld1             {v18.4s}, [x5], #16
        fcvtzs          v22.4s, v18.4s, #31
        ld1             {v19.4s}, [x5], #16
        sri             v7.4s,  v5.4s,  #16
        st1             {v6.4s}, [x0],  #16
        fcvtzs          v23.4s, v19.4s, #31
        st1             {v7.4s}, [x0],  #16
        sri             v22.4s, v20.4s, #16
        ld1             {v0.4s}, [x4],  #16
        sri             v23.4s, v21.4s, #16
        st1             {v22.4s}, [x0], #16
        fcvtzs          v4.4s,  v0.4s,  #31
        ld1             {v1.4s}, [x4],  #16
        fcvtzs          v5.4s,  v1.4s,  #31
        ld1             {v2.4s}, [x5],  #16
        fcvtzs          v6.4s,  v2.4s,  #31
        ld1             {v3.4s}, [x5],  #16
        fcvtzs          v7.4s,  v3.4s,  #31
        st1             {v23.4s}, [x0], #16
        b.ne            1b
        ands            w2,  w2,  #15           // anything beyond the preloaded 8?
        b.eq            3f
        // Tail: 8 samples per channel pending in v4-v7 plus 8 more to load.
2:      sri             v6.4s,  v4.4s,  #16
        ld1             {v0.4s}, [x4],  #16
        fcvtzs          v0.4s,  v0.4s,  #31
        ld1             {v1.4s}, [x4],  #16
        fcvtzs          v1.4s,  v1.4s,  #31
        ld1             {v2.4s}, [x5],  #16
        fcvtzs          v2.4s,  v2.4s,  #31
        sri             v7.4s,  v5.4s,  #16
        ld1             {v3.4s}, [x5],  #16
        fcvtzs          v3.4s,  v3.4s,  #31
        sri             v2.4s,  v0.4s,  #16
        st1             {v6.4s,v7.4s}, [x0], #32
        sri             v3.4s,  v1.4s,  #16
        st1             {v2.4s,v3.4s}, [x0], #32
        ret
        // Tail: only the 8 samples per channel pending in v4-v7 remain.
3:      sri             v6.4s,  v4.4s,  #16
        sri             v7.4s,  v5.4s,  #16
        st1             {v6.4s,v7.4s}, [x0]
        ret
endfunc
| 131 | |
// Convert planar float channels to interleaved signed 16-bit samples for an
// arbitrary channel count.
//
// In:    x0 = destination (interleaved 16-bit samples)
//        x1 = pointer to an array of channel pointers (32-bit float data)
//        w2 = number of samples per channel (32-bit)
//        w3 = number of channels (32-bit)
// Clobb: x0, x1, w3, x4-x9, x12, v0-v7, v16-v19, flags
//
// Dispatch:
//   - exactly 2 channels: tail-call ff_conv_fltp_to_s16_2ch_neon
//   - fewer than 2:       load the single channel pointer and tail-call
//                         ff_conv_flt_to_s16_neon
//   - more than 2:        handle channels in groups of 4, then 2, then 1,
//                         each group making one full pass over the samples
//                         and writing with a stride of one output frame.
//
// x12 holds that stride in bytes (channels * 2). It is derived from w3 with a
// 32-bit shift so that x12 is zero-extended: the upper half of a register
// carrying a 32-bit argument is unspecified under AAPCS64 and must not leak
// into the address arithmetic.
//
// Every pass converts with fcvtzs #31 and keeps the upper 16 bits of each
// 32-bit result (via sri packing or by storing the odd halfword lanes).
// Each pass loads at least 8 samples per channel and counts in steps of 8 or
// 16, so the code expects the sample count to be a non-zero multiple of 8.
function ff_conv_fltp_to_s16_neon, export=1
        cmp             w3,  #2
        b.eq            X(ff_conv_fltp_to_s16_2ch_neon)
        b.gt            1f
        ldr             x1,  [x1]               // single channel: pass its data pointer
        b               X(ff_conv_flt_to_s16_neon)
1:
        cmp             w3,  #4
        lsl             w12, w3,  #1            // x12 = output stride in bytes, zero-extended
        b.lt            4f                      // lsl leaves the flags of the cmp intact

5:      // 4 channels
        ldp             x4,  x5,  [x1], #16
        ldp             x6,  x7,  [x1], #16
        mov             w9,  w2                 // w9 = samples left in this pass
        mov             x8,  x0                 // x8 = output cursor for this pass
        ld1             {v4.4s}, [x4],  #16
        fcvtzs          v4.4s,  v4.4s,  #31
        ld1             {v5.4s}, [x5],  #16
        fcvtzs          v5.4s,  v5.4s,  #31
        ld1             {v6.4s}, [x6],  #16
        fcvtzs          v6.4s,  v6.4s,  #31
        ld1             {v7.4s}, [x7],  #16
        fcvtzs          v7.4s,  v7.4s,  #31
6:
        // 8 samples per iteration. sri packs two channels into each 32-bit
        // lane, zip1/zip2 pair those up so each 64-bit half holds the four
        // channel values of one sample.
        subs            w9,  w9,  #8
        ld1             {v0.4s}, [x4],  #16
        fcvtzs          v0.4s,  v0.4s,  #31
        sri             v5.4s,  v4.4s,  #16
        ld1             {v1.4s}, [x5],  #16
        fcvtzs          v1.4s,  v1.4s,  #31
        sri             v7.4s,  v6.4s,  #16
        ld1             {v2.4s}, [x6],  #16
        fcvtzs          v2.4s,  v2.4s,  #31
        zip1            v16.4s, v5.4s,  v7.4s
        ld1             {v3.4s}, [x7],  #16
        fcvtzs          v3.4s,  v3.4s,  #31
        zip2            v17.4s, v5.4s,  v7.4s
        st1             {v16.d}[0], [x8], x12
        sri             v1.4s,  v0.4s,  #16
        st1             {v16.d}[1], [x8], x12
        sri             v3.4s,  v2.4s,  #16
        st1             {v17.d}[0], [x8], x12
        zip1            v18.4s, v1.4s,  v3.4s
        st1             {v17.d}[1], [x8], x12
        zip2            v19.4s, v1.4s,  v3.4s
        b.eq            7f                      // last 8 samples: skip the preload
        ld1             {v4.4s}, [x4],  #16
        fcvtzs          v4.4s,  v4.4s,  #31
        st1             {v18.d}[0], [x8], x12
        ld1             {v5.4s}, [x5],  #16
        fcvtzs          v5.4s,  v5.4s,  #31
        st1             {v18.d}[1], [x8], x12
        ld1             {v6.4s}, [x6],  #16
        fcvtzs          v6.4s,  v6.4s,  #31
        st1             {v19.d}[0], [x8], x12
        ld1             {v7.4s}, [x7],  #16
        fcvtzs          v7.4s,  v7.4s,  #31
        st1             {v19.d}[1], [x8], x12
        b               6b
7:
        st1             {v18.d}[0], [x8], x12
        st1             {v18.d}[1], [x8], x12
        st1             {v19.d}[0], [x8], x12
        st1             {v19.d}[1], [x8], x12
        subs            w3,  w3,  #4            // four channels done
        b.eq            9f
        cmp             w3,  #4
        add             x0,  x0,  #8            // skip the 4 samples just written per frame
        b.ge            5b

4:      // 2 channels
        cmp             w3,  #2
        b.lt            4f
        ldp             x4,  x5,  [x1], #16
        mov             w9,  w2                 // w9 = samples left in this pass
        mov             x8,  x0                 // x8 = output cursor for this pass
        tst             w9,  #8                 // odd number of 8-sample groups?
        ld1             {v4.4s}, [x4],  #16
        fcvtzs          v4.4s,  v4.4s,  #31
        ld1             {v5.4s}, [x5],  #16
        fcvtzs          v5.4s,  v5.4s,  #31
        ld1             {v6.4s}, [x4],  #16
        fcvtzs          v6.4s,  v6.4s,  #31
        ld1             {v7.4s}, [x5],  #16
        fcvtzs          v7.4s,  v7.4s,  #31
        b.eq            6f                      // count is a multiple of 16
        subs            w9,  w9,  #8
        b.eq            7f                      // exactly 8 samples
        // Peel one group of 8 so the main loop can run in steps of 16.
        sri             v5.4s,  v4.4s,  #16
        ld1             {v4.4s}, [x4],  #16
        fcvtzs          v4.4s,  v4.4s,  #31
        st1             {v5.s}[0], [x8], x12
        sri             v7.4s,  v6.4s,  #16
        st1             {v5.s}[1], [x8], x12
        ld1             {v6.4s}, [x4],  #16
        fcvtzs          v6.4s,  v6.4s,  #31
        st1             {v5.s}[2], [x8], x12
        st1             {v5.s}[3], [x8], x12
        st1             {v7.s}[0], [x8], x12
        st1             {v7.s}[1], [x8], x12
        ld1             {v5.4s}, [x5],  #16
        fcvtzs          v5.4s,  v5.4s,  #31
        st1             {v7.s}[2], [x8], x12
        st1             {v7.s}[3], [x8], x12
        ld1             {v7.4s}, [x5],  #16
        fcvtzs          v7.4s,  v7.4s,  #31
6:
        // 16 samples per iteration; each 32-bit lane is one packed pair.
        subs            w9,  w9,  #16
        ld1             {v0.4s}, [x4],  #16
        sri             v5.4s,  v4.4s,  #16
        fcvtzs          v0.4s,  v0.4s,  #31
        ld1             {v1.4s}, [x5],  #16
        sri             v7.4s,  v6.4s,  #16
        st1             {v5.s}[0], [x8], x12
        st1             {v5.s}[1], [x8], x12
        fcvtzs          v1.4s,  v1.4s,  #31
        st1             {v5.s}[2], [x8], x12
        st1             {v5.s}[3], [x8], x12
        ld1             {v2.4s}, [x4],  #16
        st1             {v7.s}[0], [x8], x12
        fcvtzs          v2.4s,  v2.4s,  #31
        st1             {v7.s}[1], [x8], x12
        ld1             {v3.4s}, [x5],  #16
        st1             {v7.s}[2], [x8], x12
        fcvtzs          v3.4s,  v3.4s,  #31
        st1             {v7.s}[3], [x8], x12
        sri             v1.4s,  v0.4s,  #16
        sri             v3.4s,  v2.4s,  #16
        b.eq            6f                      // last 16 samples: skip the preload
        ld1             {v4.4s}, [x4],  #16
        st1             {v1.s}[0], [x8], x12
        fcvtzs          v4.4s,  v4.4s,  #31
        st1             {v1.s}[1], [x8], x12
        ld1             {v5.4s}, [x5],  #16
        st1             {v1.s}[2], [x8], x12
        fcvtzs          v5.4s,  v5.4s,  #31
        st1             {v1.s}[3], [x8], x12
        ld1             {v6.4s}, [x4],  #16
        st1             {v3.s}[0], [x8], x12
        fcvtzs          v6.4s,  v6.4s,  #31
        st1             {v3.s}[1], [x8], x12
        ld1             {v7.4s}, [x5],  #16
        st1             {v3.s}[2], [x8], x12
        fcvtzs          v7.4s,  v7.4s,  #31
        st1             {v3.s}[3], [x8], x12
        b.gt            6b                      // flags still from the subs above
6:
        st1             {v1.s}[0], [x8], x12
        st1             {v1.s}[1], [x8], x12
        st1             {v1.s}[2], [x8], x12
        st1             {v1.s}[3], [x8], x12
        st1             {v3.s}[0], [x8], x12
        st1             {v3.s}[1], [x8], x12
        st1             {v3.s}[2], [x8], x12
        st1             {v3.s}[3], [x8], x12
        b               8f
7:
        sri             v5.4s,  v4.4s,  #16
        sri             v7.4s,  v6.4s,  #16
        st1             {v5.s}[0], [x8], x12
        st1             {v5.s}[1], [x8], x12
        st1             {v5.s}[2], [x8], x12
        st1             {v5.s}[3], [x8], x12
        st1             {v7.s}[0], [x8], x12
        st1             {v7.s}[1], [x8], x12
        st1             {v7.s}[2], [x8], x12
        st1             {v7.s}[3], [x8], x12
8:
        subs            w3,  w3,  #2            // two channels done
        add             x0,  x0,  #4            // skip the 2 samples just written per frame
        b.eq            9f

4:      // 1 channel
        ldr             x4,  [x1]
        tst             w2,  #8                 // odd number of 8-sample groups?
        mov             w9,  w2                 // w9 = samples left in this pass
        mov             x5,  x0                 // x5 = output cursor for this pass
        ld1             {v0.4s}, [x4],  #16
        fcvtzs          v0.4s,  v0.4s,  #31
        ld1             {v1.4s}, [x4],  #16
        fcvtzs          v1.4s,  v1.4s,  #31
        b.ne            8f                      // peel one group of 8 first
6:
        // 16 samples per iteration; the odd halfword lanes are the upper
        // 16 bits of each converted 32-bit value.
        subs            w9,  w9,  #16
        ld1             {v2.4s}, [x4],  #16
        fcvtzs          v2.4s,  v2.4s,  #31
        ld1             {v3.4s}, [x4],  #16
        fcvtzs          v3.4s,  v3.4s,  #31
        st1             {v0.h}[1], [x5], x12
        st1             {v0.h}[3], [x5], x12
        st1             {v0.h}[5], [x5], x12
        st1             {v0.h}[7], [x5], x12
        st1             {v1.h}[1], [x5], x12
        st1             {v1.h}[3], [x5], x12
        st1             {v1.h}[5], [x5], x12
        st1             {v1.h}[7], [x5], x12
        b.eq            7f                      // last 16 samples: skip the preload
        ld1             {v0.4s}, [x4],  #16
        fcvtzs          v0.4s,  v0.4s,  #31
        ld1             {v1.4s}, [x4],  #16
        fcvtzs          v1.4s,  v1.4s,  #31
7:
        st1             {v2.h}[1], [x5], x12
        st1             {v2.h}[3], [x5], x12
        st1             {v2.h}[5], [x5], x12
        st1             {v2.h}[7], [x5], x12
        st1             {v3.h}[1], [x5], x12
        st1             {v3.h}[3], [x5], x12
        st1             {v3.h}[5], [x5], x12
        st1             {v3.h}[7], [x5], x12
        b.gt            6b                      // flags still from the subs above
        ret
8:
        subs            w9,  w9,  #8
        st1             {v0.h}[1], [x5], x12
        st1             {v0.h}[3], [x5], x12
        st1             {v0.h}[5], [x5], x12
        st1             {v0.h}[7], [x5], x12
        st1             {v1.h}[1], [x5], x12
        st1             {v1.h}[3], [x5], x12
        st1             {v1.h}[5], [x5], x12
        st1             {v1.h}[7], [x5], x12
        b.eq            9f                      // exactly 8 samples
        ld1             {v0.4s}, [x4],  #16
        fcvtzs          v0.4s,  v0.4s,  #31
        ld1             {v1.4s}, [x4],  #16
        fcvtzs          v1.4s,  v1.4s,  #31
        b               6b
9:
        ret
endfunc