| 1 | /* |
| 2 | * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> |
| 3 | * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net> |
| 4 | * |
| 5 | * This file is part of FFmpeg. |
| 6 | * |
| 7 | * FFmpeg is free software; you can redistribute it and/or |
| 8 | * modify it under the terms of the GNU Lesser General Public |
| 9 | * License as published by the Free Software Foundation; either |
| 10 | * version 2.1 of the License, or (at your option) any later version. |
| 11 | * |
| 12 | * FFmpeg is distributed in the hope that it will be useful, |
| 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 15 | * Lesser General Public License for more details. |
| 16 | * |
| 17 | * You should have received a copy of the GNU Lesser General Public |
| 18 | * License along with FFmpeg; if not, write to the Free Software |
| 19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| 20 | */ |
| 21 | |
| 22 | #include "libavutil/aarch64/asm.S" |
| 23 | #include "neon.S" |
| 24 | |
/*
 * void ff_h264_idct_add_neon(uint8_t *dst, int16_t *block, int stride)
 *
 * Adds the 4x4 H.264 inverse transform of the 16 coefficients at
 * block[] to the prediction at dst, saturating to [0,255].  The
 * coefficients are cleared as they are consumed and x1 is restored
 * before returning.
 *
 * In:       x0 = dst, x1 = block (16 x int16), w2 = stride
 * Clobbers: x0-x2, v0-v7, v16-v19, v30, flags untouched
 *
 * Both 1-D passes implement the reference butterfly:
 *   z0 = s0 + s2            z1 = s0 - s2
 *   z2 = (s1 >> 1) - s3     z3 = s1 + (s3 >> 1)
 *   out = (z0+z3, z1+z2, z1-z2, z0-z3)
 */
function ff_h264_idct_add_neon, export=1
        ld1             {v0.4H, v1.4H, v2.4H, v3.4H}, [x1] // four coefficient rows
        sxtw            x2,  w2                 // widen stride for addressing
        movi            v30.8H, #0              // zero, for clearing the block

        // first pass (down the columns), interleaved with clearing
        add             v4.4H,  v0.4H,  v2.4H   // z0
        sshr            v16.4H, v1.4H,  #1      // s1 >> 1
        st1             {v30.8H}, [x1], #16     // clear coefficients 0..7
        sshr            v17.4H, v3.4H,  #1      // s3 >> 1
        st1             {v30.8H}, [x1], #16     // clear coefficients 8..15
        sub             v5.4H,  v0.4H,  v2.4H   // z1
        add             v6.4H,  v1.4H,  v17.4H  // z3
        sub             v7.4H,  v16.4H, v3.4H   // z2
        add             v0.4H,  v4.4H,  v6.4H   // out0 = z0 + z3
        add             v1.4H,  v5.4H,  v7.4H   // out1 = z1 + z2
        sub             v2.4H,  v5.4H,  v7.4H   // out2 = z1 - z2 (was z0 - z3: rows 2/3 swapped)
        sub             v3.4H,  v4.4H,  v6.4H   // out3 = z0 - z3 (was z1 - z2: rows 2/3 swapped)

        transpose_4x4H  v0, v1, v2, v3, v4, v5, v6, v7

        // second pass (along the rows), interleaved with loading the
        // prediction; v2/v3 are s2/s3 again (the previous code used
        // them swapped, computing a different transform)
        add             v4.4H,  v0.4H,  v2.4H   // z0
        ld1             {v18.S}[0], [x0], x2    // pred row 0
        sshr            v16.4H, v3.4H,  #1      // s3 >> 1
        sshr            v17.4H, v1.4H,  #1      // s1 >> 1
        ld1             {v19.S}[1], [x0], x2    // pred row 1
        sub             v5.4H,  v0.4H,  v2.4H   // z1
        ld1             {v18.S}[1], [x0], x2    // pred row 2
        add             v6.4H,  v16.4H, v1.4H   // z3
        ins             v4.D[1], v5.D[0]        // v4 = {z0 | z1}
        sub             v7.4H,  v3.4H,  v17.4H  // -z2
        ld1             {v19.S}[0], [x0], x2    // pred row 3
        ins             v6.D[1], v7.D[0]        // v6 = {z3 | -z2}
        sub             x0,  x0,  x2,  lsl #2   // rewind dst to row 0
        add             v0.8H,  v4.8H,  v6.8H   // {out0 | out2} -> rows 0,2
        sub             v1.8H,  v4.8H,  v6.8H   // {out3 | out1} -> rows 3,1

        srshr           v0.8H,  v0.8H,  #6      // rounding: (x + 32) >> 6
        srshr           v1.8H,  v1.8H,  #6

        uaddw           v0.8H,  v0.8H,  v18.8B  // add prediction rows 0,2
        uaddw           v1.8H,  v1.8H,  v19.8B  // add prediction rows 3,1

        sqxtun          v0.8B,  v0.8H           // saturate to u8
        sqxtun          v1.8B,  v1.8H

        st1             {v0.S}[0], [x0], x2     // row 0
        st1             {v1.S}[1], [x0], x2     // row 1
        st1             {v0.S}[1], [x0], x2     // row 2
        st1             {v1.S}[0], [x0], x2     // row 3

        sub             x1,  x1,  #32           // restore block pointer
        ret
endfunc
| 78 | |
/*
 * void ff_h264_idct_dc_add_neon(uint8_t *dst, int16_t *block, int stride)
 *
 * DC-only special case: broadcasts block[0], rounds it with
 * (dc + 32) >> 6 and adds it to the 4x4 area at dst with unsigned
 * saturation.  block[0] is cleared as a side effect.
 *
 * In:       x0 = dst, x1 = block, w2 = stride
 * Clobbers: x0, x2, w3, v0-v4
 */
function ff_h264_idct_dc_add_neon, export=1
        sxtw            x2,  w2                 // widen stride for addressing
        mov             w3,  #0
        ld1r            {v2.8H}, [x1]           // splat dc = block[0] into all lanes
        strh            w3,  [x1]               // clear the dc coefficient
        srshr           v2.8H, v2.8H, #6        // (dc + 32) >> 6
        ld1             {v0.S}[0], [x0], x2     // pred row 0
        ld1             {v0.S}[1], [x0], x2     // pred row 1
        uaddw           v3.8H, v2.8H, v0.8B     // rows 0,1 + dc
        ld1             {v1.S}[0], [x0], x2     // pred row 2
        ld1             {v1.S}[1], [x0], x2     // pred row 3
        uaddw           v4.8H, v2.8H, v1.8B     // rows 2,3 + dc
        sqxtun          v0.8B, v3.8H            // saturate to u8
        sqxtun          v1.8B, v4.8H
        sub             x0,  x0,  x2, lsl #2    // rewind dst to row 0
        st1             {v0.S}[0], [x0], x2     // store rows back in load order
        st1             {v0.S}[1], [x0], x2
        st1             {v1.S}[0], [x0], x2
        st1             {v1.S}[1], [x0], x2
        ret
endfunc
| 100 | |
/*
 * ff_h264_idct_add16_neon(uint8_t *dst, const int *block_offset,
 *                         int16_t *block, int stride,
 *                         const uint8_t nnzc[])
 * (argument roles per the register moves below; prototype inferred —
 *  matches how x0-x4 are consumed)
 *
 * Loops over the 16 4x4 luma blocks; per block dispatches to:
 *   nnz == 0                 -> skip
 *   nnz == 1 && block[0]!=0  -> dc-only add
 *   otherwise                -> full idct add
 */
function ff_h264_idct_add16_neon, export=1
        mov             x12, x30                // save lr across the blr calls
        mov             x6,  x0 // dest
        mov             x5,  x1 // block_offset
        mov             x1,  x2 // block
        mov             w9,  w3 // stride
        movrel          x7,  scan8
        mov             x10, #16                // 16 blocks
        movrel          x13, X(ff_h264_idct_dc_add_neon)
        movrel          x14, X(ff_h264_idct_add_neon)
1:      mov             w2,  w9                 // stride argument for the callee
        ldrb            w3,  [x7], #1           // scan8[i]
        ldrsw           x0,  [x5], #4           // block_offset[i]
        ldrb            w3,  [x4, w3, uxtw]     // nnz = nnzc[scan8[i]]
        subs            w3,  w3,  #1
        b.lt            2f                      // nnz == 0: nothing to add
        ldrsh           w3,  [x1]               // dc = block[i*16]
        add             x0,  x0,  x6            // dst = dest + block_offset[i]
        ccmp            w3,  #0,  #4,  eq       // nnz==1: test dc; else force Z (eq)
        csel            x15, x13, x14, ne       // ne (nnz==1 && dc!=0) ? dc_add : idct_add
        blr             x15
2:      subs            x10, x10, #1
        add             x1,  x1,  #32           // next block (16 x int16)
        b.ne            1b
        ret             x12
endfunc
| 127 | |
/*
 * Intra-16x16 variant of add16: per block,
 *   nnz != 0                 -> full idct add
 *   nnz == 0 && block[0]!=0  -> dc-only add
 *   otherwise                -> skip
 * Register roles are identical to ff_h264_idct_add16_neon above.
 */
function ff_h264_idct_add16intra_neon, export=1
        mov             x12, x30                // save lr across the blr calls
        mov             x6,  x0 // dest
        mov             x5,  x1 // block_offset
        mov             x1,  x2 // block
        mov             w9,  w3 // stride
        movrel          x7,  scan8
        mov             x10, #16                // 16 blocks
        movrel          x13, X(ff_h264_idct_dc_add_neon)
        movrel          x14, X(ff_h264_idct_add_neon)
1:      mov             w2,  w9                 // stride argument for the callee
        ldrb            w3,  [x7], #1           // scan8[i]
        ldrsw           x0,  [x5], #4           // block_offset[i]
        ldrb            w3,  [x4, w3, uxtw]     // nnz = nnzc[scan8[i]]
        add             x0,  x0,  x6            // dst = dest + block_offset[i]
        cmp             w3,  #0
        ldrsh           w3,  [x1]               // dc = block[i*16]
        csel            x15, x13, x14, eq       // nnz==0 ? dc_add : idct_add
        ccmp            w3,  #0,  #0,  eq       // nnz==0: also require dc != 0
        b.eq            2f                      // nnz==0 && dc==0: skip block
        blr             x15
2:      subs            x10, x10, #1
        add             x1,  x1,  #32           // next block (16 x int16)
        b.ne            1b
        ret             x12
endfunc
| 154 | |
/*
 * Chroma variant: x0 is uint8_t **dest (dest[0]/dest[1] loaded below).
 * Iterates x10 = 0..3 (blocks 16..19 on dest[0]) and then, after the
 * csel pair switches state, x10 = 16..19 (blocks 32..35 on dest[1]).
 * Per-block dispatch is the same as add16intra: nnz -> full idct,
 * else dc != 0 -> dc add, else skip.
 * Uses callee-saved x19 (stride) and x20 (call target), hence the
 * stack frame.
 */
function ff_h264_idct_add8_neon, export=1
        sub             sp,  sp,  #0x40
        stp             x19, x20, [sp]          // save callee-saved regs we use
        mov             x12, x30                // save lr across the blr calls
        ldp             x6,  x15, [x0] // dest[0], dest[1]
        add             x5,  x1,  #16*4 // block_offset
        add             x9,  x2,  #16*32 // block
        mov             w19, w3 // stride
        movrel          x13, X(ff_h264_idct_dc_add_neon)
        movrel          x14, X(ff_h264_idct_add_neon)
        movrel          x7,  scan8+16           // chroma entries of scan8[]
        mov             x10, #0                 // block index i - 16
        mov             x11, #16                // value x10 jumps to after 4 blocks
1:      mov             w2,  w19                // stride argument for the callee
        ldrb            w3,  [x7,  x10] // scan8[i]
        ldrsw           x0,  [x5,  x10, lsl #2] // block_offset[i]
        ldrb            w3,  [x4,  w3,  uxtw] // nnzc[ scan8[i] ]
        add             x0,  x0,  x6 // block_offset[i] + dst[j-1]
        add             x1,  x9,  x10, lsl #5 // block + i * 16
        cmp             w3,  #0
        ldrsh           w3,  [x1] // block[i*16]
        csel            x20, x13, x14, eq       // nnz==0 ? dc_add : idct_add
        ccmp            w3,  #0,  #0,  eq       // nnz==0: also require dc != 0
        b.eq            2f                      // nothing to add for this block
        blr             x20
2:      add             x10, x10, #1
        cmp             x10, #4                 // finished the first chroma plane?
        csel            x10, x11, x10, eq // mov x10, #16
        csel            x6,  x15, x6,  eq       // ... and switch to dest[1]
        cmp             x10, #20
        b.lt            1b
        ldp             x19, x20, [sp]
        add             sp,  sp,  #0x40
        ret             x12
endfunc
| 190 | |
/*
 * idct8x8_cols — one 1-D pass of the 8x8 H.264 inverse transform over
 * the eight 8-lane coefficient rows held in v24..v31 (row i in v24+i).
 *
 * pass 0: rows 6,7 (v30,v31) are still in memory at entry, so they are
 * loaded here and their coefficients cleared; v19 must still contain
 * zero from the caller at that point.  The pass leaves one output row
 * in v18 via the va/vb aliases, saving a move before the transpose.
 * pass 1: same butterfly with the v18/v30 roles exchanged, restoring
 * the outputs to v24..v31.
 */
.macro idct8x8_cols pass
.if \pass == 0
        va .req v18
        vb .req v30
        // even half: rows 0,2,4,6 (v24,v26,v28,v30)
        sshr            v18.8H, v26.8H, #1      // s2 >> 1
        add             v16.8H, v24.8H, v28.8H  // s0 + s4
        ld1             {v30.8H, v31.8H}, [x1]  // load rows 6,7
        st1             {v19.8H}, [x1], #16     // clear them (v19 == 0 here)
        st1             {v19.8H}, [x1], #16
        sub             v17.8H, v24.8H, v28.8H  // s0 - s4
        sshr            v19.8H, v30.8H, #1      // s6 >> 1
        sub             v18.8H, v18.8H, v30.8H  // (s2 >> 1) - s6
        add             v19.8H, v19.8H, v26.8H  // s2 + (s6 >> 1)
.else
        va .req v30
        vb .req v18
        // even half, pass 1: row 3's data now lives in v18 (see pass 0)
        sshr            v30.8H, v26.8H, #1
        sshr            v19.8H, v18.8H, #1
        add             v16.8H, v24.8H, v28.8H
        sub             v17.8H, v24.8H, v28.8H
        sub             v30.8H, v30.8H, v18.8H
        add             v19.8H, v19.8H, v26.8H
.endif
        // combine the even half into v24, v26, v28, vb
        add             v26.8H, v17.8H, va.8H
        sub             v28.8H, v17.8H, va.8H
        add             v24.8H, v16.8H, v19.8H
        sub             vb.8H,  v16.8H, v19.8H
        // odd half: rows 1,3,5,7 (v25, v27/va, v29, v31) with the
        // spec's >>1 scalings ...
        sub             v16.8H, v29.8H, v27.8H
        add             v17.8H, v31.8H, v25.8H
        sub             va.8H,  v31.8H, v25.8H
        add             v19.8H, v29.8H, v27.8H
        sub             v16.8H, v16.8H, v31.8H
        sub             v17.8H, v17.8H, v27.8H
        add             va.8H,  va.8H,  v29.8H
        add             v19.8H, v19.8H, v25.8H
        sshr            v25.8H, v25.8H, #1
        sshr            v27.8H, v27.8H, #1
        sshr            v29.8H, v29.8H, #1
        sshr            v31.8H, v31.8H, #1
        sub             v16.8H, v16.8H, v31.8H
        sub             v17.8H, v17.8H, v27.8H
        add             va.8H,  va.8H,  v29.8H
        add             v19.8H, v19.8H, v25.8H
        // ... and the >>2 scalings of the intermediate odd terms
        sshr            v25.8H, v16.8H, #2
        sshr            v27.8H, v17.8H, #2
        sshr            v29.8H, va.8H,  #2
        sshr            v31.8H, v19.8H, #2
        sub             v19.8H, v19.8H, v25.8H
        sub             va.8H,  v27.8H, va.8H
        add             v17.8H, v17.8H, v29.8H
        add             v16.8H, v16.8H, v31.8H
        // final butterfly: even half +/- odd half -> output rows
.if \pass == 0
        sub             v31.8H, v24.8H, v19.8H
        add             v24.8H, v24.8H, v19.8H
        add             v25.8H, v26.8H, v18.8H
        sub             v18.8H, v26.8H, v18.8H  // this output row stays in v18
        add             v26.8H, v28.8H, v17.8H
        add             v27.8H, v30.8H, v16.8H
        sub             v29.8H, v28.8H, v17.8H
        sub             v28.8H, v30.8H, v16.8H
.else
        sub             v31.8H, v24.8H, v19.8H
        add             v24.8H, v24.8H, v19.8H
        add             v25.8H, v26.8H, v30.8H
        sub             v30.8H, v26.8H, v30.8H  // outputs are back in v24..v31
        add             v26.8H, v28.8H, v17.8H
        sub             v29.8H, v28.8H, v17.8H
        add             v27.8H, v18.8H, v16.8H
        sub             v28.8H, v18.8H, v16.8H
.endif
.unreq va
.unreq vb
.endm
| 264 | |
/*
 * void ff_h264_idct8_add_neon(uint8_t *dst, int16_t *block, int stride)
 *
 * 8x8 inverse transform + add: column pass, 8x8 transpose, row pass,
 * then (x + 32) >> 6, add the prediction and saturate to u8.
 * All 64 coefficients are cleared (partly here, partly inside
 * idct8x8_cols pass 0) and x1 is restored before returning.
 *
 * In: x0 = dst, x1 = block (64 x int16), x2 = stride
 */
function ff_h264_idct8_add_neon, export=1
        movi            v19.8H, #0              // zero; idct8x8_cols pass 0 relies on it
        ld1             {v24.8H, v25.8H}, [x1]  // rows 0,1
        st1             {v19.8H}, [x1], #16     // clear coefficients as we load
        st1             {v19.8H}, [x1], #16
        ld1             {v26.8H, v27.8H}, [x1]  // rows 2,3
        st1             {v19.8H}, [x1], #16
        st1             {v19.8H}, [x1], #16
        ld1             {v28.8H, v29.8H}, [x1]  // rows 4,5
        st1             {v19.8H}, [x1], #16
        st1             {v19.8H}, [x1], #16

        idct8x8_cols    0                       // column pass (loads + clears rows 6,7)
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v18, v31, v6, v7 // v18: see macro
        idct8x8_cols    1                       // row pass

        // round, add prediction, saturate; loads/stores interleaved
        mov             x3,  x0                 // keep dst base for the stores
        srshr           v24.8H, v24.8H, #6      // (x + 32) >> 6 per row
        ld1             {v0.8B}, [x0], x2       // pred row 0
        srshr           v25.8H, v25.8H, #6
        ld1             {v1.8B}, [x0], x2
        srshr           v26.8H, v26.8H, #6
        ld1             {v2.8B}, [x0], x2
        srshr           v27.8H, v27.8H, #6
        ld1             {v3.8B}, [x0], x2
        srshr           v28.8H, v28.8H, #6
        ld1             {v4.8B}, [x0], x2
        srshr           v29.8H, v29.8H, #6
        ld1             {v5.8B}, [x0], x2
        srshr           v30.8H, v30.8H, #6
        ld1             {v6.8B}, [x0], x2
        srshr           v31.8H, v31.8H, #6
        ld1             {v7.8B}, [x0], x2       // pred row 7
        uaddw           v24.8H, v24.8H, v0.8B   // add prediction
        uaddw           v25.8H, v25.8H, v1.8B
        uaddw           v26.8H, v26.8H, v2.8B
        sqxtun          v0.8B,  v24.8H          // saturate to u8
        uaddw           v27.8H, v27.8H, v3.8B
        sqxtun          v1.8B,  v25.8H
        uaddw           v28.8H, v28.8H, v4.8B
        sqxtun          v2.8B,  v26.8H
        st1             {v0.8B}, [x3], x2       // store row 0
        uaddw           v29.8H, v29.8H, v5.8B
        sqxtun          v3.8B,  v27.8H
        st1             {v1.8B}, [x3], x2
        uaddw           v30.8H, v30.8H, v6.8B
        sqxtun          v4.8B,  v28.8H
        st1             {v2.8B}, [x3], x2
        uaddw           v31.8H, v31.8H, v7.8B
        sqxtun          v5.8B,  v29.8H
        st1             {v3.8B}, [x3], x2
        sqxtun          v6.8B,  v30.8H
        sqxtun          v7.8B,  v31.8H
        st1             {v4.8B}, [x3], x2
        st1             {v5.8B}, [x3], x2
        st1             {v6.8B}, [x3], x2
        st1             {v7.8B}, [x3], x2       // store row 7

        sub             x1,  x1,  #128          // restore block pointer
        ret
endfunc
| 326 | |
/*
 * void ff_h264_idct8_dc_add_neon(uint8_t *dst, int16_t *block, int stride)
 *
 * DC-only 8x8 case: broadcasts block[0], rounds with (dc + 32) >> 6
 * and adds it to the 8x8 area at dst with unsigned saturation.
 * block[0] is cleared as a side effect.
 */
function ff_h264_idct8_dc_add_neon, export=1
        mov             w3,  #0
        sxtw            x2,  w2                 // widen stride for addressing
        ld1r            {v31.8H}, [x1]          // splat dc = block[0]
        strh            w3,  [x1]               // clear the dc coefficient
        ld1             {v0.8B}, [x0], x2       // pred row 0
        srshr           v31.8H, v31.8H, #6      // (dc + 32) >> 6
        ld1             {v1.8B}, [x0], x2
        ld1             {v2.8B}, [x0], x2
        uaddw           v24.8H, v31.8H, v0.8B   // row 0 + dc
        ld1             {v3.8B}, [x0], x2
        uaddw           v25.8H, v31.8H, v1.8B
        ld1             {v4.8B}, [x0], x2
        uaddw           v26.8H, v31.8H, v2.8B
        ld1             {v5.8B}, [x0], x2
        uaddw           v27.8H, v31.8H, v3.8B
        ld1             {v6.8B}, [x0], x2
        uaddw           v28.8H, v31.8H, v4.8B
        ld1             {v7.8B}, [x0], x2       // pred row 7
        uaddw           v29.8H, v31.8H, v5.8B
        uaddw           v30.8H, v31.8H, v6.8B
        uaddw           v31.8H, v31.8H, v7.8B   // row 7 + dc (v31 reused)
        sqxtun          v0.8B, v24.8H           // saturate to u8
        sqxtun          v1.8B, v25.8H
        sqxtun          v2.8B, v26.8H
        sqxtun          v3.8B, v27.8H
        sub             x0,  x0,  x2, lsl #3    // rewind dst to row 0
        st1             {v0.8B}, [x0], x2       // store rows, interleaved with
        sqxtun          v4.8B, v28.8H           // the remaining narrows
        st1             {v1.8B}, [x0], x2
        sqxtun          v5.8B, v29.8H
        st1             {v2.8B}, [x0], x2
        sqxtun          v6.8B, v30.8H
        st1             {v3.8B}, [x0], x2
        sqxtun          v7.8B, v31.8H
        st1             {v4.8B}, [x0], x2
        st1             {v5.8B}, [x0], x2
        st1             {v6.8B}, [x0], x2
        st1             {v7.8B}, [x0], x2
        ret
endfunc
| 368 | |
/*
 * Loops over the four 8x8 luma blocks (scan8 stepped by 4, offsets by
 * 16 bytes); per block dispatches exactly like add16:
 *   nnz == 0                 -> skip
 *   nnz == 1 && block[0]!=0  -> 8x8 dc-only add
 *   otherwise                -> full 8x8 idct add
 * w2 (stride) is set once: both callees leave w2 intact.
 */
function ff_h264_idct8_add4_neon, export=1
        mov             x12, x30                // save lr across the blr calls
        mov             x6,  x0                 // dest
        mov             x5,  x1                 // block_offset
        mov             x1,  x2                 // block
        mov             w2,  w3                 // stride, passed through to callees
        movrel          x7,  scan8
        mov             w10, #16                // loop counter, stepped by 4
        movrel          x13, X(ff_h264_idct8_dc_add_neon)
        movrel          x14, X(ff_h264_idct8_add_neon)
1:      ldrb            w9,  [x7], #4           // scan8[i]
        ldrsw           x0,  [x5], #16          // block_offset[i]
        ldrb            w9,  [x4, w9, UXTW]     // nnz = nnzc[scan8[i]]
        subs            w9,  w9,  #1
        b.lt            2f                      // nnz == 0: nothing to add
        ldrsh           w11, [x1]               // dc = block[i*16]
        add             x0,  x6,  x0            // dst = dest + block_offset[i]
        ccmp            w11, #0,  #4,  eq       // nnz==1: test dc; else force Z (eq)
        csel            x15, x13, x14, ne       // ne (nnz==1 && dc!=0) ? dc_add : idct8_add
        blr             x15
2:      subs            w10, w10, #4
        add             x1,  x1,  #128          // next 8x8 block (64 x int16)
        b.ne            1b
        ret             x12
endfunc
| 394 | |
/*
 * scan8[]: translates a block index into the index used into the
 * nnzc[] argument (x4) by the dispatch loops above.  Entries are
 * written as x + y*8 to make the 8-wide layout visible; entries 0-15
 * are luma, 16+ chroma (the add8 loop starts at scan8+16).
 */
const scan8
        .byte           4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
        .byte           6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
        .byte           4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
        .byte           6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
        .byte           4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
        .byte           6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
        .byte           4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
        .byte           6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
        .byte           4+11*8, 5+11*8, 4+12*8, 5+12*8
        .byte           6+11*8, 7+11*8, 6+12*8, 7+12*8
        .byte           4+13*8, 5+13*8, 4+14*8, 5+14*8
        .byte           6+13*8, 7+13*8, 6+14*8, 7+14*8
endconst