| 1 | /* |
| 2 | * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> |
| 3 | * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net> |
| 4 | * |
| 5 | * This file is part of FFmpeg. |
| 6 | * |
| 7 | * FFmpeg is free software; you can redistribute it and/or |
| 8 | * modify it under the terms of the GNU Lesser General Public |
| 9 | * License as published by the Free Software Foundation; either |
| 10 | * version 2.1 of the License, or (at your option) any later version. |
| 11 | * |
| 12 | * FFmpeg is distributed in the hope that it will be useful, |
| 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 15 | * Lesser General Public License for more details. |
| 16 | * |
| 17 | * You should have received a copy of the GNU Lesser General Public |
| 18 | * License along with FFmpeg; if not, write to the Free Software |
| 19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| 20 | */ |
| 21 | |
| 22 | #include "libavutil/aarch64/asm.S" |
| 23 | #include "neon.S" |
| 24 | |
| 25 | /* H.264 qpel MC */ |
| 26 | |
// Load the 6-tap filter multipliers into the low 32 bits of v6:
//   v6.h[0] = 5, v6.h[1] = 20
// r is a 32-bit general purpose scratch register and is overwritten.
.macro  lowpass_const   r
        movz            \r, #20, lsl #16        // upper halfword = 20
        movk            \r, #5                  // lower halfword = 5
        mov             v6.s[0], \r
.endm
| 32 | |
// 6-tap lowpass over two independent 8-pixel vectors.
//   r0:r1   first input, 16 consecutive bytes split over two 8-byte registers
//   r2:r3   second input, same layout
//   d0, d1  outputs for the first and second input
//   narrow  when non-zero (default) the 16-bit sums are rounded, shifted
//           right by 5 and saturated to unsigned bytes; when zero d0/d1
//           keep the raw 16-bit sums
// For i = 0..7:
//   sum[i] = in[i] + in[i+5] + 20*(in[i+2] + in[i+3]) - 5*(in[i+1] + in[i+4])
// Requires v6.h[0] = 5 and v6.h[1] = 20 (see lowpass_const).
// Callers in this file pass d0 = r0 and d1 = r1 or r2; the instruction
// order below only writes each output after its aliased input was last read.
// trashes v0-v5
.macro  lowpass_8       r0,  r1,  r2,  r3,  d0,  d1,  narrow=1
        ext             v2.8b,      \r0\().8b, \r1\().8b, #2
        ext             v3.8b,      \r0\().8b, \r1\().8b, #3
        uaddl           v2.8h,      v2.8b,     v3.8b            // centre taps
        ext             v4.8b,      \r0\().8b, \r1\().8b, #1
        ext             v5.8b,      \r0\().8b, \r1\().8b, #4
        uaddl           v4.8h,      v4.8b,     v5.8b            // inner taps
        ext             v1.8b,      \r0\().8b, \r1\().8b, #5
        uaddl           \d0\().8h,  \r0\().8b, v1.8b            // outer taps
        ext             v0.8b,      \r2\().8b, \r3\().8b, #2
        mla             \d0\().8h,  v2.8h,     v6.h[1]          // + 20 * centre
        ext             v1.8b,      \r2\().8b, \r3\().8b, #3
        uaddl           v0.8h,      v0.8b,     v1.8b
        ext             v1.8b,      \r2\().8b, \r3\().8b, #1
        mls             \d0\().8h,  v4.8h,     v6.h[0]          // - 5 * inner
        ext             v3.8b,      \r2\().8b, \r3\().8b, #4
        uaddl           v1.8h,      v1.8b,     v3.8b
        ext             v2.8b,      \r2\().8b, \r3\().8b, #5
        uaddl           \d1\().8h,  \r2\().8b, v2.8b
        mla             \d1\().8h,  v0.8h,     v6.h[1]
        mls             \d1\().8h,  v1.8h,     v6.h[0]
  .if \narrow
        sqrshrun        \d0\().8b,  \d0\().8h, #5
        sqrshrun        \d1\().8b,  \d1\().8h, #5
  .endif
.endm
| 60 | |
// In-place 6-tap lowpass of two 16-byte vectors, keeping 16-bit precision.
// On entry r0 and r1 each hold 16 consecutive bytes.  On exit r0.8h and
// r1.8h hold, for i = 0..7,
//   in[i] + in[i+5] + 20*(in[i+2] + in[i+3]) - 5*(in[i+1] + in[i+4])
// with no rounding and no narrowing.
// Requires v6.h[0] = 5 and v6.h[1] = 20 (see lowpass_const).
// trashes v0-v5, v7, v30-v31
.macro  lowpass_8H      r0,  r1
        ext             v0.16b,     \r0\().16b, \r0\().16b, #2
        ext             v1.16b,     \r0\().16b, \r0\().16b, #3
        uaddl           v0.8h,      v0.8b,      v1.8b           // centre taps
        ext             v2.16b,     \r0\().16b, \r0\().16b, #1
        ext             v3.16b,     \r0\().16b, \r0\().16b, #4
        uaddl           v2.8h,      v2.8b,      v3.8b           // inner taps
        ext             v30.16b,    \r0\().16b, \r0\().16b, #5
        uaddl           \r0\().8h,  \r0\().8b,  v30.8b          // outer taps
        ext             v4.16b,     \r1\().16b, \r1\().16b, #2
        mla             \r0\().8h,  v0.8h,      v6.h[1]         // + 20 * centre
        ext             v5.16b,     \r1\().16b, \r1\().16b, #3
        uaddl           v4.8h,      v4.8b,      v5.8b
        ext             v7.16b,     \r1\().16b, \r1\().16b, #1
        mls             \r0\().8h,  v2.8h,      v6.h[0]         // - 5 * inner
        ext             v0.16b,     \r1\().16b, \r1\().16b, #4
        uaddl           v7.8h,      v7.8b,      v0.8b
        ext             v31.16b,    \r1\().16b, \r1\().16b, #5
        uaddl           \r1\().8h,  \r1\().8b,  v31.8b
        mla             \r1\().8h,  v4.8h,      v6.h[1]
        mls             \r1\().8h,  v7.8h,      v6.h[0]
.endm
| 84 | |
// Single-vector variant of lowpass_8.
//   r0:r1   input, 16 consecutive bytes split over two 8-byte registers
//   d0      output
//   narrow  when non-zero (default) round, shift right by 5 and saturate
//           to unsigned bytes; when zero d0 keeps the 16-bit sums
// Requires v6.h[0] = 5 and v6.h[1] = 20 (see lowpass_const).
// trashes v2-v5, v30
.macro  lowpass_8_1     r0,  r1,  d0,  narrow=1
        ext             v2.8b,      \r0\().8b, \r1\().8b, #2
        ext             v3.8b,      \r0\().8b, \r1\().8b, #3
        uaddl           v2.8h,      v2.8b,     v3.8b            // centre taps
        ext             v4.8b,      \r0\().8b, \r1\().8b, #1
        ext             v5.8b,      \r0\().8b, \r1\().8b, #4
        uaddl           v4.8h,      v4.8b,     v5.8b            // inner taps
        ext             v30.8b,     \r0\().8b, \r1\().8b, #5
        uaddl           \d0\().8h,  \r0\().8b, v30.8b           // outer taps
        mla             \d0\().8h,  v2.8h,     v6.h[1]          // + 20 * centre
        mls             \d0\().8h,  v4.8h,     v6.h[0]          // - 5 * inner
  .if \narrow
        sqrshrun        \d0\().8b,  \d0\().8h, #5
  .endif
.endm
| 101 | |
// Second-pass 6-tap lowpass over 16-bit intermediates.
//   r0:r1  sixteen signed 16-bit values (r0 = elements 0-7, r1 = 8-15)
//   r2     receives eight unsigned bytes in its low half
// For i = 0..7:
//   out[i] = sat_u8((x[i] + x[i+5] + 20*(x[i+2] + x[i+3])
//                    - 5*(x[i+1] + x[i+4]) + 512) >> 10)
// The multiplications are built from shifts and adds in 32-bit lanes.
// trashes v0-v7 (this includes the filter constants in v6) and overwrites r1
.macro  lowpass_8.16    r0,  r1,  r2
        ext             v1.16b,     \r0\().16b, \r1\().16b, #4
        ext             v0.16b,     \r0\().16b, \r1\().16b, #6
        saddl           v5.4s,      v1.4h,      v0.4h           // centre, low
        ext             v2.16b,     \r0\().16b, \r1\().16b, #2
        saddl2          v1.4s,      v1.8h,      v0.8h           // centre, high
        ext             v3.16b,     \r0\().16b, \r1\().16b, #8
        saddl           v6.4s,      v2.4h,      v3.4h           // inner, low
        ext             \r1\().16b, \r0\().16b, \r1\().16b, #10
        saddl2          v2.4s,      v2.8h,      v3.8h           // inner, high
        saddl           v0.4s,      \r0\().4h,  \r1\().4h       // outer, low
        saddl2          v4.4s,      \r0\().8h,  \r1\().8h       // outer, high

        // low half: v5 = 20 * centre, v6 = 5 * inner
        shl             v3.4s,  v5.4s,  #4
        shl             v5.4s,  v5.4s,  #2
        shl             v7.4s,  v6.4s,  #2
        add             v5.4s,  v5.4s,  v3.4s
        add             v6.4s,  v6.4s,  v7.4s

        // high half: v1 = 20 * centre, v2 = 5 * inner
        shl             v3.4s,  v1.4s,  #4
        shl             v1.4s,  v1.4s,  #2
        shl             v7.4s,  v2.4s,  #2
        add             v1.4s,  v1.4s,  v3.4s
        add             v2.4s,  v2.4s,  v7.4s

        add             v5.4s,  v5.4s,  v0.4s
        sub             v5.4s,  v5.4s,  v6.4s

        add             v1.4s,  v1.4s,  v4.4s
        sub             v1.4s,  v1.4s,  v2.4s

        // round, drop 10 fraction bits, then saturate to unsigned bytes
        rshrn           v5.4h,  v5.4s,  #10
        rshrn2          v5.8h,  v1.4s,  #10

        sqxtun          \r2\().8b,  v5.8h
.endm
| 139 | |
// Horizontal lowpass of a 16-row, 16-pixel-wide area into a packed buffer.
//   x0  output pointer; rows are written 8 bytes apart and x0 is not
//       rewound between the two halves, so the right half follows the left
//   x1  source pointer, x2 source stride
// Clobbers x3, x4, x12 plus everything put_h264_qpel8_h_lowpass_neon does.
function put_h264_qpel16_h_lowpass_neon_packed
        mov             x4,  x30                // keep the return address
        mov             x3,  #8                 // packed output stride
        mov             x12, #16                // rows for the left half
        bl              put_h264_qpel8_h_lowpass_neon
        sub             x1,  x1,  x2, lsl #4    // back up 16 source rows
        add             x1,  x1,  #8            // move to the right half
        mov             x30, x4
        mov             x12, #16                // rows for the right half
        b               put_h264_qpel8_h_lowpass_neon
endfunc
| 151 | |
// Horizontal lowpass, instantiated for type = put and type = avg.
//   x0  destination, x3 destination stride
//   x1  source,      x2 source stride
//   x12 row count (qpel8 routine only; must be even and non-zero)
// Requires the constants from lowpass_const in v6.
.macro  h264_qpel_h_lowpass type
// 16 pixels wide: run the 8-wide routine on the left half, rewind both
// pointers by 16 rows, step 8 bytes right and continue into the 8-wide
// routine that is placed directly below.  Clobbers x13.
function \type\()_h264_qpel16_h_lowpass_neon
        mov             x13, x30
        mov             x12, #16
        bl              \type\()_h264_qpel8_h_lowpass_neon
        sub             x0,  x0,  x3, lsl #4
        add             x0,  x0,  #8
        sub             x1,  x1,  x2, lsl #4
        add             x1,  x1,  #8
        mov             x30, x13
        mov             x12, #16
        // no return here: execution runs on into the routine below
endfunc

// 8 pixels wide, two rows per iteration.
function \type\()_h264_qpel8_h_lowpass_neon
1:      ld1             {v28.8b, v29.8b}, [x1], x2
        ld1             {v16.8b, v17.8b}, [x1], x2
        subs            x12, x12, #2            // flags are consumed by b.ne
        lowpass_8       v28, v29, v16, v17, v28, v16
  .ifc \type,avg
        // average with the two rows already present at the destination
        ld1             {v2.8b},  [x0], x3
        urhadd          v28.8b, v28.8b, v2.8b
        ld1             {v3.8b},  [x0]
        urhadd          v16.8b, v16.8b, v3.8b
        sub             x0,  x0,  x3            // back to the first row
  .endif
        st1             {v28.8b}, [x0], x3
        st1             {v16.8b}, [x0], x3
        b.ne            1b
        ret
endfunc
.endm

        h264_qpel_h_lowpass put
        h264_qpel_h_lowpass avg
| 186 | |
// Horizontal lowpass averaged with a second source, instantiated for
// type = put and type = avg.
//   x0  destination
//   x1  source that is filtered
//   x3  second source, averaged with the filter result
//   x2  stride shared by x0, x1 and x3
//   x12 row count (qpel8 routine only; must be even and non-zero)
// Requires the constants from lowpass_const in v6.
.macro  h264_qpel_h_lowpass_l2 type
// 16 pixels wide: left half first, then rewind all three pointers by 16
// rows, step 8 bytes right and continue into the 8-wide routine placed
// directly below.  Clobbers x13.
function \type\()_h264_qpel16_h_lowpass_l2_neon
        mov             x13, x30
        mov             x12, #16
        bl              \type\()_h264_qpel8_h_lowpass_l2_neon
        sub             x0,  x0,  x2, lsl #4
        add             x0,  x0,  #8
        sub             x1,  x1,  x2, lsl #4
        add             x1,  x1,  #8
        sub             x3,  x3,  x2, lsl #4
        add             x3,  x3,  #8
        mov             x30, x13
        mov             x12, #16
        // no return here: execution runs on into the routine below
endfunc

// 8 pixels wide, two rows per iteration.
function \type\()_h264_qpel8_h_lowpass_l2_neon
1:      ld1             {v26.8b, v27.8b}, [x1], x2
        ld1             {v16.8b, v17.8b}, [x1], x2
        ld1             {v28.8b}, [x3], x2
        ld1             {v29.8b}, [x3], x2
        subs            x12, x12, #2            // flags are consumed by b.ne
        lowpass_8       v26, v27, v16, v17, v26, v27
        urhadd          v26.8b, v26.8b, v28.8b  // blend with second source
        urhadd          v27.8b, v27.8b, v29.8b
  .ifc \type,avg
        // average with the two rows already present at the destination
        ld1             {v2.8b},  [x0], x2
        urhadd          v26.8b, v26.8b, v2.8b
        ld1             {v3.8b},  [x0]
        urhadd          v27.8b, v27.8b, v3.8b
        sub             x0,  x0,  x2            // back to the first row
  .endif
        st1             {v26.8b}, [x0], x2
        st1             {v27.8b}, [x0], x2
        b.ne            1b
        ret
endfunc
.endm

        h264_qpel_h_lowpass_l2 put
        h264_qpel_h_lowpass_l2 avg
| 227 | |
// Vertical lowpass of a 16x16 area into a packed buffer, done as four 8x8
// tiles in the order top-left, bottom-left, top-right, bottom-right.
//   x0  output pointer; rows are written 8 bytes apart and x0 is never
//       rewound, so the four tiles land back to back
//   x1  source pointer, x3 source stride
// Clobbers x2, x4 plus everything put_h264_qpel8_v_lowpass_neon does.
function put_h264_qpel16_v_lowpass_neon_packed
        mov             x4,  x30                // keep the return address
        mov             x2,  #8                 // packed output stride
        bl              put_h264_qpel8_v_lowpass_neon
        // each call leaves x1 twelve rows down; step back four for the
        // tile that starts eight rows below the previous one
        sub             x1,  x1,  x3, lsl #2
        bl              put_h264_qpel8_v_lowpass_neon
        // x1 is now twenty rows down: back to the top, eight bytes right
        sub             x1,  x1,  x3, lsl #4
        sub             x1,  x1,  x3, lsl #2
        add             x1,  x1,  #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x4
        b               put_h264_qpel8_v_lowpass_neon
endfunc
| 242 | |
// Vertical lowpass, instantiated for type = put and type = avg.
//   x0  destination, x2 destination stride
//   x1  source,      x3 source stride
// Requires the constants from lowpass_const in v6.
.macro  h264_qpel_v_lowpass type
// 16x16: four 8x8 tiles in the order top-left, bottom-left, top-right,
// bottom-right.  The last tile is reached by running on into the 8x8
// routine placed directly below.  Clobbers x4.
function \type\()_h264_qpel16_v_lowpass_neon
        mov             x4,  x30
        bl              \type\()_h264_qpel8_v_lowpass_neon
        // the 8x8 routine leaves x1 twelve rows down; step back four
        sub             x1,  x1,  x3, lsl #2
        bl              \type\()_h264_qpel8_v_lowpass_neon
        // destination: up sixteen rows, eight bytes right
        sub             x0,  x0,  x2, lsl #4
        add             x0,  x0,  #8
        // source: up twenty rows, eight bytes right
        sub             x1,  x1,  x3, lsl #4
        sub             x1,  x1,  x3, lsl #2
        add             x1,  x1,  #8
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x4
        // no return here: execution runs on into the routine below
endfunc

// 8x8: load thirteen source rows, transpose so that columns become
// vectors, filter with lowpass_8, transpose back and store eight rows.
function \type\()_h264_qpel8_v_lowpass_neon
        // rows 0-7 go to the even registers, rows 8-11 to odd ones
.irp    reg, v16.8b, v18.8b, v20.8b, v22.8b, v24.8b, v26.8b, v28.8b, v30.8b, v17.8b, v19.8b, v21.8b, v23.8b
        ld1             {\reg}, [x1], x3
.endr
        ld1             {v25.8b}, [x1]          // row 12, no post-increment

        transpose_8x8B  v16, v18, v20, v22, v24, v26, v28, v30, v0,  v1
        transpose_8x8B  v17, v19, v21, v23, v25, v27, v29, v31, v0,  v1
        lowpass_8       v16, v17, v18, v19, v16, v17
        lowpass_8       v20, v21, v22, v23, v18, v19
        lowpass_8       v24, v25, v26, v27, v20, v21
        lowpass_8       v28, v29, v30, v31, v22, v23
        transpose_8x8B  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1

  .ifc \type,avg
        // average with the eight rows already present at the destination
        ld1             {v24.8b}, [x0], x2
        urhadd          v16.8b, v16.8b, v24.8b
        ld1             {v25.8b}, [x0], x2
        urhadd          v17.8b, v17.8b, v25.8b
        ld1             {v26.8b}, [x0], x2
        urhadd          v18.8b, v18.8b, v26.8b
        ld1             {v27.8b}, [x0], x2
        urhadd          v19.8b, v19.8b, v27.8b
        ld1             {v28.8b}, [x0], x2
        urhadd          v20.8b, v20.8b, v28.8b
        ld1             {v29.8b}, [x0], x2
        urhadd          v21.8b, v21.8b, v29.8b
        ld1             {v30.8b}, [x0], x2
        urhadd          v22.8b, v22.8b, v30.8b
        ld1             {v31.8b}, [x0], x2
        urhadd          v23.8b, v23.8b, v31.8b
        sub             x0,  x0,  x2, lsl #3    // back to the first row
  .endif

.irp    reg, v16.8b, v17.8b, v18.8b, v19.8b, v20.8b, v21.8b, v22.8b, v23.8b
        st1             {\reg}, [x0], x2
.endr

        ret
endfunc
.endm

        h264_qpel_v_lowpass put
        h264_qpel_v_lowpass avg
| 317 | |
// Vertical lowpass averaged with a second source, instantiated for
// type = put and type = avg.
//   x0  destination,          x3 destination stride
//   x1  source to be filtered, x3 source stride as well
//   x12 second source,        x2 second source stride
// Requires the constants from lowpass_const in v6.
.macro  h264_qpel_v_lowpass_l2 type
// 16x16: four 8x8 tiles in the order top-left, bottom-left, top-right,
// bottom-right.  The last tile is reached by running on into the 8x8
// routine placed directly below.  Clobbers x4.
function \type\()_h264_qpel16_v_lowpass_l2_neon
        mov             x4,  x30
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        // the 8x8 routine leaves x1 twelve rows down; step back four
        sub             x1,  x1,  x3, lsl #2
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        // destination and second source: up sixteen rows, eight bytes right
        sub             x0,  x0,  x3, lsl #4
        add             x0,  x0,  #8
        sub             x12, x12, x2, lsl #4
        add             x12, x12, #8
        // filtered source: up twenty rows, eight bytes right
        sub             x1,  x1,  x3, lsl #4
        sub             x1,  x1,  x3, lsl #2
        add             x1,  x1,  #8
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x4
        // no return here: execution runs on into the routine below
endfunc

// 8x8: load thirteen source rows, transpose so that columns become
// vectors, filter with lowpass_8, transpose back, blend with eight rows
// of the second source and store.
function \type\()_h264_qpel8_v_lowpass_l2_neon
        // rows 0-7 go to the even registers, rows 8-11 to odd ones
.irp    reg, v16.8b, v18.8b, v20.8b, v22.8b, v24.8b, v26.8b, v28.8b, v30.8b, v17.8b, v19.8b, v21.8b, v23.8b
        ld1             {\reg}, [x1], x3
.endr
        ld1             {v25.8b}, [x1]          // row 12, no post-increment

        transpose_8x8B  v16, v18, v20, v22, v24, v26, v28, v30, v0,  v1
        transpose_8x8B  v17, v19, v21, v23, v25, v27, v29, v31, v0,  v1
        lowpass_8       v16, v17, v18, v19, v16, v17
        lowpass_8       v20, v21, v22, v23, v18, v19
        lowpass_8       v24, v25, v26, v27, v20, v21
        lowpass_8       v28, v29, v30, v31, v22, v23
        transpose_8x8B  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1

        // blend with the second source
        ld1             {v24.8b}, [x12], x2
        ld1             {v25.8b}, [x12], x2
        ld1             {v26.8b}, [x12], x2
        ld1             {v27.8b}, [x12], x2
        ld1             {v28.8b}, [x12], x2
        urhadd          v16.8b, v24.8b, v16.8b
        urhadd          v17.8b, v25.8b, v17.8b
        ld1             {v29.8b}, [x12], x2
        urhadd          v18.8b, v26.8b, v18.8b
        urhadd          v19.8b, v27.8b, v19.8b
        ld1             {v30.8b}, [x12], x2
        urhadd          v20.8b, v28.8b, v20.8b
        urhadd          v21.8b, v29.8b, v21.8b
        ld1             {v31.8b}, [x12], x2
        urhadd          v22.8b, v30.8b, v22.8b
        urhadd          v23.8b, v31.8b, v23.8b

  .ifc \type,avg
        // average with the eight rows already present at the destination
        ld1             {v24.8b}, [x0], x3
        urhadd          v16.8b, v16.8b, v24.8b
        ld1             {v25.8b}, [x0], x3
        urhadd          v17.8b, v17.8b, v25.8b
        ld1             {v26.8b}, [x0], x3
        urhadd          v18.8b, v18.8b, v26.8b
        ld1             {v27.8b}, [x0], x3
        urhadd          v19.8b, v19.8b, v27.8b
        ld1             {v28.8b}, [x0], x3
        urhadd          v20.8b, v20.8b, v28.8b
        ld1             {v29.8b}, [x0], x3
        urhadd          v21.8b, v21.8b, v29.8b
        ld1             {v30.8b}, [x0], x3
        urhadd          v22.8b, v22.8b, v30.8b
        ld1             {v31.8b}, [x0], x3
        urhadd          v23.8b, v23.8b, v31.8b
        sub             x0,  x0,  x3, lsl #3    // back to the first row
  .endif

.irp    reg, v16.8b, v17.8b, v18.8b, v19.8b, v20.8b, v21.8b, v22.8b, v23.8b
        st1             {\reg}, [x0], x3
.endr

        ret
endfunc
.endm

        h264_qpel_v_lowpass_l2 put
        h264_qpel_v_lowpass_l2 avg
| 411 | |
// Core of the 8x8 two-dimensional lowpass.  Nothing is stored: the eight
// result rows are returned in the low halves of v16-v23.
//   x1  source pointer, x3 source stride
// Thirteen rows of 16 bytes are loaded.  Each row is filtered horizontally
// to 16-bit sums, the block is transposed, filtered again along the other
// axis with lowpass_8.16 and transposed back.
// Loads the filter constants itself; clobbers w12 and v0-v7, v16-v31.
function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const   w12
.irp    reg, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h
        ld1             {\reg}, [x1], x3
.endr
        ld1             {v28.8h}, [x1]          // row 12, no post-increment

        lowpass_8H      v16, v17
        lowpass_8H      v18, v19
        lowpass_8H      v20, v21
        lowpass_8H      v22, v23
        lowpass_8H      v24, v25
        lowpass_8H      v26, v27
        // v29 was never loaded; its lanes end up as elements 13-15 after
        // the transpose and lowpass_8.16 reads no element beyond 12
        lowpass_8H      v28, v29

        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v0,  v1

        // lowpass_8.16 overwrites v6, so the constants are gone from here on
        lowpass_8.16    v16, v24, v16
        lowpass_8.16    v17, v25, v17

        lowpass_8.16    v18, v26, v18
        lowpass_8.16    v19, v27, v19

        lowpass_8.16    v20, v28, v20
        lowpass_8.16    v21, v29, v21

        lowpass_8.16    v22, v30, v22
        lowpass_8.16    v23, v31, v23

        transpose_8x8B  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1

        ret
endfunc
| 454 | |
// 8x8 two-dimensional lowpass with store, instantiated for type = put and
// type = avg.
//   x0  destination, x2 destination stride
//   x1  source,      x3 source stride
// The return address is parked in x10 around the call to the shared core.
.macro  h264_qpel8_hv_lowpass type
function \type\()_h264_qpel8_hv_lowpass_neon
        mov             x10, x30
        bl              put_h264_qpel8_hv_lowpass_neon_top      // v16-v23
  .ifc \type,avg
        // average with the eight rows already present at the destination
        ld1             {v0.8b},  [x0], x2
        urhadd          v16.8b, v16.8b, v0.8b
        ld1             {v1.8b},  [x0], x2
        urhadd          v17.8b, v17.8b, v1.8b
        ld1             {v2.8b},  [x0], x2
        urhadd          v18.8b, v18.8b, v2.8b
        ld1             {v3.8b},  [x0], x2
        urhadd          v19.8b, v19.8b, v3.8b
        ld1             {v4.8b},  [x0], x2
        urhadd          v20.8b, v20.8b, v4.8b
        ld1             {v5.8b},  [x0], x2
        urhadd          v21.8b, v21.8b, v5.8b
        ld1             {v6.8b},  [x0], x2
        urhadd          v22.8b, v22.8b, v6.8b
        ld1             {v7.8b},  [x0], x2
        urhadd          v23.8b, v23.8b, v7.8b
        sub             x0,  x0,  x2, lsl #3    // back to the first row
  .endif

.irp    reg, v16.8b, v17.8b, v18.8b, v19.8b, v20.8b, v21.8b, v22.8b, v23.8b
        st1             {\reg}, [x0], x2
.endr

        ret             x10
endfunc
.endm

        h264_qpel8_hv_lowpass put
        h264_qpel8_hv_lowpass avg
| 494 | |
// 8x8 two-dimensional lowpass averaged with a packed second source,
// instantiated for type = put and type = avg.
//   x0  destination, x3 destination stride
//   x1  source,      x3 source stride as well
//   x2  second source: eight rows of 8 bytes stored back to back; x2 is
//       advanced by 64 bytes
// The return address is parked in x10 around the call to the shared core.
.macro  h264_qpel8_hv_lowpass_l2 type
function \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             x10, x30
        bl              put_h264_qpel8_hv_lowpass_neon_top      // v16-v23

        // blend with the second source, results move to v0-v7
        ld1             {v0.8b, v1.8b}, [x2], #16
        ld1             {v2.8b, v3.8b}, [x2], #16
        urhadd          v0.8b,  v0.8b,  v16.8b
        urhadd          v1.8b,  v1.8b,  v17.8b
        ld1             {v4.8b, v5.8b}, [x2], #16
        urhadd          v2.8b,  v2.8b,  v18.8b
        urhadd          v3.8b,  v3.8b,  v19.8b
        ld1             {v6.8b, v7.8b}, [x2], #16
        urhadd          v4.8b,  v4.8b,  v20.8b
        urhadd          v5.8b,  v5.8b,  v21.8b
        urhadd          v6.8b,  v6.8b,  v22.8b
        urhadd          v7.8b,  v7.8b,  v23.8b
  .ifc \type,avg
        // average with the eight rows already present at the destination
        ld1             {v16.8b}, [x0], x3
        urhadd          v0.8b,  v0.8b,  v16.8b
        ld1             {v17.8b}, [x0], x3
        urhadd          v1.8b,  v1.8b,  v17.8b
        ld1             {v18.8b}, [x0], x3
        urhadd          v2.8b,  v2.8b,  v18.8b
        ld1             {v19.8b}, [x0], x3
        urhadd          v3.8b,  v3.8b,  v19.8b
        ld1             {v20.8b}, [x0], x3
        urhadd          v4.8b,  v4.8b,  v20.8b
        ld1             {v21.8b}, [x0], x3
        urhadd          v5.8b,  v5.8b,  v21.8b
        ld1             {v22.8b}, [x0], x3
        urhadd          v6.8b,  v6.8b,  v22.8b
        ld1             {v23.8b}, [x0], x3
        urhadd          v7.8b,  v7.8b,  v23.8b
        sub             x0,  x0,  x3, lsl #3    // back to the first row
  .endif
.irp    reg, v0.8b, v1.8b, v2.8b, v3.8b, v4.8b, v5.8b, v6.8b, v7.8b
        st1             {\reg}, [x0], x3
.endr

        ret             x10
endfunc
.endm

        h264_qpel8_hv_lowpass_l2 put
        h264_qpel8_hv_lowpass_l2 avg
| 546 | |
// 16x16 two-dimensional lowpass built from four 8x8 calls in the order
// top-left, bottom-left, top-right, bottom-right.  Instantiated for
// type = put and type = avg.  Both routines park the return address in x13
// and leave through a tail call for the last tile.
.macro  h264_qpel16_hv  type
//   x0  destination, x2 destination stride
//   x1  source,      x3 source stride
function \type\()_h264_qpel16_hv_lowpass_neon
        mov             x13, x30
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        // each 8x8 call leaves x1 twelve rows down; step back four
        sub             x1,  x1,  x3, lsl #2
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        // destination: up sixteen rows, eight bytes right
        sub             x0,  x0,  x2, lsl #4
        add             x0,  x0,  #8
        // source: up twenty rows, eight bytes right
        sub             x1,  x1,  x3, lsl #4
        sub             x1,  x1,  x3, lsl #2
        add             x1,  x1,  #8
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x13
        b               \type\()_h264_qpel8_hv_lowpass_neon
endfunc

//   x0  destination, x3 destination stride
//   x1  source,      x3 source stride as well
//   x4  address just past a 256-byte packed second source; x2 is derived
//       from it here and then advanced 64 bytes by every 8x8 call
function \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov             x13, x30
        sub             x2,  x4,  #256          // start of the packed block
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             x1,  x1,  x3, lsl #2
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        // destination: up sixteen rows, eight bytes right
        sub             x0,  x0,  x3, lsl #4
        add             x0,  x0,  #8
        // source: up twenty rows, eight bytes right
        sub             x1,  x1,  x3, lsl #4
        sub             x1,  x1,  x3, lsl #2
        add             x1,  x1,  #8
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x13
        b               \type\()_h264_qpel8_hv_lowpass_l2_neon
endfunc
.endm

        h264_qpel16_hv put
        h264_qpel16_hv avg
| 584 | |
// Exported 8x8 quarter-pel entry points ff_<type>_h264_qpel8_mcXY_neon,
// instantiated for type = put and type = avg.
// Register use on entry, as read by the code below:
//   x0 = destination, x1 = source, x2 = line stride shared by both.
// Positions with X != 0 use the horizontal filter, positions with Y != 0
// the vertical one.  Entry points that share a tail jump to the internal
// labels <type>_h264_qpel8_mc01 / mc11 / mc21 / mc12.
// Non-leaf paths keep the return address in x14; paths that need a
// temporary buffer keep the caller sp in x11 and restore it before return.
.macro  h264_qpel8      type
// h filter averaged with the pixel at src
function ff_\type\()_h264_qpel8_mc10_neon, export=1
        lowpass_const   w3
        mov             x3,  x1                 // second source = src
        sub             x1,  x1,  #2            // filter window starts 2 left
        mov             x12, #8                 // rows
        b               \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

// h filter only
function ff_\type\()_h264_qpel8_mc20_neon, export=1
        lowpass_const   w3
        sub             x1,  x1,  #2
        mov             x3,  x2                 // destination stride
        mov             x12, #8                 // rows
        b               \type\()_h264_qpel8_h_lowpass_neon
endfunc

// h filter averaged with the pixel at src + 1
function ff_\type\()_h264_qpel8_mc30_neon, export=1
        lowpass_const   w3
        add             x3,  x1,  #1            // second source = src + 1
        sub             x1,  x1,  #2
        mov             x12, #8                 // rows
        b               \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

// v filter averaged with the pixel at src
function ff_\type\()_h264_qpel8_mc01_neon, export=1
        mov             x14, x30
        mov             x12, x1                 // second source = src
// shared tail: expects x12 = second source, x14 = return address
\type\()_h264_qpel8_mc01:
        lowpass_const   w3
        mov             x3,  x2
        sub             x1,  x1,  x2, lsl #1    // filter window starts 2 up
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        ret             x14
endfunc

// h filter into a 64-byte stack buffer, then v filter averaged with it
function ff_\type\()_h264_qpel8_mc11_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
// shared tail: expects x1 = source for the h pass, x9 = source for the
// v pass, x8 = destination, x14 = return address
\type\()_h264_qpel8_mc11:
        lowpass_const   w3
        mov             x11, sp
        sub             sp,  sp,  #64           // 8 rows of 8 bytes
        mov             x0,  sp
        sub             x1,  x1,  #2
        mov             x3,  #8                 // buffer stride
        mov             x12, #8                 // rows
        bl              put_h264_qpel8_h_lowpass_neon
        mov             x0,  x8
        mov             x3,  x2                 // picture stride
        mov             x12, sp                 // second source = buffer
        sub             x1,  x9,  x2, lsl #1
        mov             x2,  #8                 // second source stride
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// h filter into a stack buffer, then hv filter averaged with it
function ff_\type\()_h264_qpel8_mc21_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
// shared tail: expects x1 = source for the h pass, x9 = source for the
// hv pass, x8 = destination, x14 = return address
\type\()_h264_qpel8_mc21:
        lowpass_const   w3
        mov             x11, sp
        // only the first 8*8 bytes are referenced by the code in this file
        sub             sp,  sp,  #(8*8+16*12)
        sub             x1,  x1,  #2
        mov             x3,  #8                 // buffer stride
        mov             x0,  sp
        mov             x12, #8                 // rows
        bl              put_h264_qpel8_h_lowpass_neon
        mov             x4,  x0                 // one past the buffer
        mov             x0,  x8
        sub             x1,  x9,  x2, lsl #1
        sub             x1,  x1,  #2
        mov             x3,  x2
        sub             x2,  x4,  #64           // second source = buffer
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// like mc11, with the v pass taken one pixel to the right
function ff_\type\()_h264_qpel8_mc31_neon, export=1
        add             x1,  x1,  #1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1                 // v pass reads src + 1
        sub             x1,  x1,  #1            // h pass reads src
        b               \type\()_h264_qpel8_mc11
endfunc

// v filter only
function ff_\type\()_h264_qpel8_mc02_neon, export=1
        mov             x14, x30
        lowpass_const   w3
        sub             x1,  x1,  x2, lsl #1
        mov             x3,  x2
        bl              \type\()_h264_qpel8_v_lowpass_neon
        ret             x14
endfunc

// v filter into a stack buffer, then hv filter averaged with it
function ff_\type\()_h264_qpel8_mc12_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
// shared tail: expects x1 = source for the v pass, x9 = source for the
// hv pass, x8 = destination, x14 = return address
\type\()_h264_qpel8_mc12:
        lowpass_const   w3
        mov             x11, sp
        // only the first 8*8 bytes are referenced by the code in this file
        sub             sp,  sp,  #(8*8+16*12)
        sub             x1,  x1,  x2, lsl #1
        mov             x3,  x2                 // picture stride
        mov             x2,  #8                 // buffer stride
        mov             x0,  sp
        bl              put_h264_qpel8_v_lowpass_neon
        mov             x4,  x0                 // one past the buffer
        mov             x0,  x8
        sub             x1,  x9,  x3, lsl #1
        sub             x1,  x1,  #2
        sub             x2,  x4,  #64           // second source = buffer
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// hv filter only.  No stack is allocated on this path, so sp is left
// alone; the hv core loads the filter constants itself.
function ff_\type\()_h264_qpel8_mc22_neon, export=1
        mov             x14, x30
        sub             x1,  x1,  x2, lsl #1
        sub             x1,  x1,  #2
        mov             x3,  x2
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        ret             x14
endfunc

// like mc12, with the v pass taken one pixel to the right
function ff_\type\()_h264_qpel8_mc32_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  #1
        b               \type\()_h264_qpel8_mc12
endfunc

// v filter averaged with the pixel one row below src
function ff_\type\()_h264_qpel8_mc03_neon, export=1
        mov             x14, x30
        add             x12, x1,  x2            // second source = src + stride
        b               \type\()_h264_qpel8_mc01
endfunc

// like mc11, with the h pass taken one row down
function ff_\type\()_h264_qpel8_mc13_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        b               \type\()_h264_qpel8_mc11
endfunc

// like mc21, with the h pass taken one row down
function ff_\type\()_h264_qpel8_mc23_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        b               \type\()_h264_qpel8_mc21
endfunc

// like mc11, with the v pass one pixel right and the h pass one row down
function ff_\type\()_h264_qpel8_mc33_neon, export=1
        add             x1,  x1,  #1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1                 // v pass reads src + 1
        add             x1,  x1,  x2
        sub             x1,  x1,  #1            // h pass reads src + stride
        b               \type\()_h264_qpel8_mc11
endfunc
.endm

        h264_qpel8 put
        h264_qpel8 avg
| 763 | |
// Exported 16x16 quarter-pel entry points ff_<type>_h264_qpel16_mcXY_neon,
// instantiated for type = put and type = avg.
// Register use on entry, as read by the code below:
//   x0 = destination, x1 = source, x2 = line stride shared by both.
// Positions with X != 0 use the horizontal filter, positions with Y != 0
// the vertical one.  Entry points that share a tail jump to the internal
// labels <type>_h264_qpel16_mc01 / mc11 / mc21 / mc12.
// Non-leaf paths keep the return address in x14; paths that need a
// temporary buffer keep the caller sp in x11 and restore it before return.
.macro  h264_qpel16     type
// h filter averaged with the pixel at src
function ff_\type\()_h264_qpel16_mc10_neon, export=1
        lowpass_const   w3
        mov             x3,  x1                 // second source = src
        sub             x1,  x1,  #2            // filter window starts 2 left
        b               \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc

// h filter only
function ff_\type\()_h264_qpel16_mc20_neon, export=1
        lowpass_const   w3
        sub             x1,  x1,  #2
        mov             x3,  x2                 // destination stride
        b               \type\()_h264_qpel16_h_lowpass_neon
endfunc

// h filter averaged with the pixel at src + 1
function ff_\type\()_h264_qpel16_mc30_neon, export=1
        lowpass_const   w3
        add             x3,  x1,  #1            // second source = src + 1
        sub             x1,  x1,  #2
        b               \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc

// v filter averaged with the pixel at src
function ff_\type\()_h264_qpel16_mc01_neon, export=1
        mov             x14, x30
        mov             x12, x1                 // second source = src
// shared tail: expects x12 = second source, x14 = return address
\type\()_h264_qpel16_mc01:
        lowpass_const   w3
        mov             x3,  x2
        sub             x1,  x1,  x2, lsl #1    // filter window starts 2 up
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
        ret             x14
endfunc

// h filter into a 256-byte stack buffer, then v filter averaged with it
function ff_\type\()_h264_qpel16_mc11_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
// shared tail: expects x1 = source for the h pass, x9 = source for the
// v pass, x8 = destination, x14 = return address
\type\()_h264_qpel16_mc11:
        lowpass_const   w3
        mov             x11, sp
        sub             sp,  sp,  #256          // 16 rows of 16 bytes
        mov             x0,  sp
        sub             x1,  x1,  #2
        mov             x3,  #16                // buffer stride
        bl              put_h264_qpel16_h_lowpass_neon
        mov             x0,  x8
        mov             x3,  x2                 // picture stride
        mov             x12, sp                 // second source = buffer
        sub             x1,  x9,  x2, lsl #1
        mov             x2,  #16                // second source stride
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// h filter into a packed stack buffer, then hv filter averaged with it
function ff_\type\()_h264_qpel16_mc21_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
// shared tail: expects x1 = source for the h pass, x9 = source for the
// hv pass, x8 = destination, x14 = return address
\type\()_h264_qpel16_mc21:
        lowpass_const   w3
        mov             x11, sp
        // only the first 16*16 bytes are referenced by the code in this file
        sub             sp,  sp,  #(16*16+16*12)
        sub             x1,  x1,  #2
        mov             x0,  sp
        bl              put_h264_qpel16_h_lowpass_neon_packed
        mov             x4,  x0                 // one past the packed buffer
        mov             x0,  x8
        sub             x1,  x9,  x2, lsl #1
        sub             x1,  x1,  #2
        mov             x3,  x2
        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// like mc11, with the v pass taken one pixel to the right
function ff_\type\()_h264_qpel16_mc31_neon, export=1
        add             x1,  x1,  #1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1                 // v pass reads src + 1
        sub             x1,  x1,  #1            // h pass reads src
        b               \type\()_h264_qpel16_mc11
endfunc

// v filter only
function ff_\type\()_h264_qpel16_mc02_neon, export=1
        mov             x14, x30
        lowpass_const   w3
        sub             x1,  x1,  x2, lsl #1
        mov             x3,  x2
        bl              \type\()_h264_qpel16_v_lowpass_neon
        ret             x14
endfunc

// v filter into a packed stack buffer, then hv filter averaged with it
function ff_\type\()_h264_qpel16_mc12_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
// shared tail: expects x1 = source for the v pass, x9 = source for the
// hv pass, x8 = destination, x14 = return address
\type\()_h264_qpel16_mc12:
        lowpass_const   w3
        mov             x11, sp
        // only the first 16*16 bytes are referenced by the code in this file
        sub             sp,  sp,  #(16*16+16*12)
        sub             x1,  x1,  x2, lsl #1
        mov             x0,  sp
        mov             x3,  x2                 // picture stride
        bl              put_h264_qpel16_v_lowpass_neon_packed
        mov             x4,  x0                 // one past the packed buffer
        mov             x0,  x8
        sub             x1,  x9,  x3, lsl #1
        sub             x1,  x1,  #2
        // x2 needs no setup: the callee derives it from x4 straight away
        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// hv filter only.  No stack is allocated on this path, so sp is left
// alone; the hv core loads the filter constants itself on every 8x8 call.
function ff_\type\()_h264_qpel16_mc22_neon, export=1
        mov             x14, x30
        sub             x1,  x1,  x2, lsl #1
        sub             x1,  x1,  #2
        mov             x3,  x2
        bl              \type\()_h264_qpel16_hv_lowpass_neon
        ret             x14
endfunc

// like mc12, with the v pass taken one pixel to the right
function ff_\type\()_h264_qpel16_mc32_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  #1
        b               \type\()_h264_qpel16_mc12
endfunc

// v filter averaged with the pixel one row below src
function ff_\type\()_h264_qpel16_mc03_neon, export=1
        mov             x14, x30
        add             x12, x1,  x2            // second source = src + stride
        b               \type\()_h264_qpel16_mc01
endfunc

// like mc11, with the h pass taken one row down
function ff_\type\()_h264_qpel16_mc13_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        b               \type\()_h264_qpel16_mc11
endfunc

// like mc21, with the h pass taken one row down
function ff_\type\()_h264_qpel16_mc23_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        b               \type\()_h264_qpel16_mc21
endfunc

// like mc11, with the v pass one pixel right and the h pass one row down
function ff_\type\()_h264_qpel16_mc33_neon, export=1
        add             x1,  x1,  #1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1                 // v pass reads src + 1
        add             x1,  x1,  x2
        sub             x1,  x1,  #1            // h pass reads src + stride
        b               \type\()_h264_qpel16_mc11
endfunc
.endm

        h264_qpel16 put
        h264_qpel16 avg