| 1 | /* |
| 2 | * ARM NEON IDCT |
| 3 | * |
| 4 | * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> |
| 5 | * |
| 6 | * Based on Simple IDCT |
| 7 | * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at> |
| 8 | * |
| 9 | * This file is part of FFmpeg. |
| 10 | * |
| 11 | * FFmpeg is free software; you can redistribute it and/or |
| 12 | * modify it under the terms of the GNU Lesser General Public |
| 13 | * License as published by the Free Software Foundation; either |
| 14 | * version 2.1 of the License, or (at your option) any later version. |
| 15 | * |
| 16 | * FFmpeg is distributed in the hope that it will be useful, |
| 17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 19 | * Lesser General Public License for more details. |
| 20 | * |
| 21 | * You should have received a copy of the GNU Lesser General Public |
| 22 | * License along with FFmpeg; if not, write to the Free Software |
| 23 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| 24 | */ |
| 25 | |
| 26 | #include "libavutil/arm/asm.S" |
| 27 | |
#define W1 22725 //cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define W2 21407 //cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define W3 19266 //cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define W4 16383 //cos(4*M_PI/16)*sqrt(2)*(1<<14) + 0.5; NOTE(review): formula gives 16384, value is one less — presumably to avoid overflow; confirm before changing
#define W5 12873 //cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define W6 8867  //cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define W7 4520  //cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define W4c ((1<<(COL_SHIFT-1))/W4) //column-pass rounding bias, pre-divided by W4
| 36 | #define ROW_SHIFT 11 |
| 37 | #define COL_SHIFT 20 |
| 38 | |
/* Lane aliases into the coefficient vector d0/d1, which idct_start
 * loads from the idct_coeff_neon table below. */
#define w1 d0[0]
#define w2 d0[1]
#define w3 d0[2]
#define w4 d0[3]
#define w5 d1[0]
#define w6 d1[1]
#define w7 d1[2]
#define w4c d1[3]
| 47 | |
| 48 | .macro idct_col4_top |
| 49 | vmull.s16 q7, d6, w2 /* q9 = W2 * col[2] */ |
| 50 | vmull.s16 q8, d6, w6 /* q10 = W6 * col[2] */ |
| 51 | vmull.s16 q9, d4, w1 /* q9 = W1 * col[1] */ |
| 52 | vadd.i32 q11, q15, q7 |
| 53 | vmull.s16 q10, d4, w3 /* q10 = W3 * col[1] */ |
| 54 | vadd.i32 q12, q15, q8 |
| 55 | vmull.s16 q5, d4, w5 /* q5 = W5 * col[1] */ |
| 56 | vsub.i32 q13, q15, q8 |
| 57 | vmull.s16 q6, d4, w7 /* q6 = W7 * col[1] */ |
| 58 | vsub.i32 q14, q15, q7 |
| 59 | |
| 60 | vmlal.s16 q9, d8, w3 /* q9 += W3 * col[3] */ |
| 61 | vmlsl.s16 q10, d8, w7 /* q10 -= W7 * col[3] */ |
| 62 | vmlsl.s16 q5, d8, w1 /* q5 -= W1 * col[3] */ |
| 63 | vmlsl.s16 q6, d8, w5 /* q6 -= W5 * col[3] */ |
| 64 | .endm |
| 65 | |
| 66 | .text |
| 67 | .align 6 |
| 68 | |
| 69 | function idct_row4_pld_neon |
| 70 | pld [r0] |
| 71 | add r3, r0, r1, lsl #2 |
| 72 | pld [r0, r1] |
| 73 | pld [r0, r1, lsl #1] |
| 74 | A pld [r3, -r1] |
| 75 | pld [r3] |
| 76 | pld [r3, r1] |
| 77 | add r3, r3, r1, lsl #1 |
| 78 | pld [r3] |
| 79 | pld [r3, r1] |
| 80 | endfunc |
| 81 | |
| 82 | function idct_row4_neon |
| 83 | vmov.i32 q15, #(1<<(ROW_SHIFT-1)) |
| 84 | vld1.64 {d2-d5}, [r2,:128]! |
| 85 | vmlal.s16 q15, d2, w4 /* q15 += W4 * col[0] */ |
| 86 | vld1.64 {d6,d7}, [r2,:128]! |
| 87 | vorr d10, d3, d5 |
| 88 | vld1.64 {d8,d9}, [r2,:128]! |
| 89 | add r2, r2, #-64 |
| 90 | |
| 91 | vorr d11, d7, d9 |
| 92 | vorr d10, d10, d11 |
| 93 | vmov r3, r4, d10 |
| 94 | |
| 95 | idct_col4_top |
| 96 | |
| 97 | orrs r3, r3, r4 |
| 98 | beq 1f |
| 99 | |
| 100 | vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */ |
| 101 | vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */ |
| 102 | vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */ |
| 103 | vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */ |
| 104 | vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */ |
| 105 | vadd.i32 q11, q11, q7 |
| 106 | vsub.i32 q12, q12, q7 |
| 107 | vsub.i32 q13, q13, q7 |
| 108 | vadd.i32 q14, q14, q7 |
| 109 | vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */ |
| 110 | vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */ |
| 111 | vmlal.s16 q9, d9, w7 |
| 112 | vmlsl.s16 q10, d9, w5 |
| 113 | vmlal.s16 q5, d9, w3 |
| 114 | vmlsl.s16 q6, d9, w1 |
| 115 | vadd.i32 q11, q11, q7 |
| 116 | vsub.i32 q12, q12, q8 |
| 117 | vadd.i32 q13, q13, q8 |
| 118 | vsub.i32 q14, q14, q7 |
| 119 | |
| 120 | 1: vadd.i32 q3, q11, q9 |
| 121 | vadd.i32 q4, q12, q10 |
| 122 | vshrn.i32 d2, q3, #ROW_SHIFT |
| 123 | vshrn.i32 d4, q4, #ROW_SHIFT |
| 124 | vadd.i32 q7, q13, q5 |
| 125 | vadd.i32 q8, q14, q6 |
| 126 | vtrn.16 d2, d4 |
| 127 | vshrn.i32 d6, q7, #ROW_SHIFT |
| 128 | vshrn.i32 d8, q8, #ROW_SHIFT |
| 129 | vsub.i32 q14, q14, q6 |
| 130 | vsub.i32 q11, q11, q9 |
| 131 | vtrn.16 d6, d8 |
| 132 | vsub.i32 q13, q13, q5 |
| 133 | vshrn.i32 d3, q14, #ROW_SHIFT |
| 134 | vtrn.32 d2, d6 |
| 135 | vsub.i32 q12, q12, q10 |
| 136 | vtrn.32 d4, d8 |
| 137 | vshrn.i32 d5, q13, #ROW_SHIFT |
| 138 | vshrn.i32 d7, q12, #ROW_SHIFT |
| 139 | vshrn.i32 d9, q11, #ROW_SHIFT |
| 140 | |
| 141 | vtrn.16 d3, d5 |
| 142 | vtrn.16 d7, d9 |
| 143 | vtrn.32 d3, d7 |
| 144 | vtrn.32 d5, d9 |
| 145 | |
| 146 | vst1.64 {d2-d5}, [r2,:128]! |
| 147 | vst1.64 {d6-d9}, [r2,:128]! |
| 148 | |
| 149 | bx lr |
| 150 | endfunc |
| 151 | |
| 152 | function idct_col4_neon |
| 153 | mov ip, #16 |
| 154 | vld1.64 {d2}, [r2,:64], ip /* d2 = col[0] */ |
| 155 | vdup.16 d30, w4c |
| 156 | vld1.64 {d4}, [r2,:64], ip /* d3 = col[1] */ |
| 157 | vadd.i16 d30, d30, d2 |
| 158 | vld1.64 {d6}, [r2,:64], ip /* d4 = col[2] */ |
| 159 | vmull.s16 q15, d30, w4 /* q15 = W4*(col[0]+(1<<COL_SHIFT-1)/W4)*/ |
| 160 | vld1.64 {d8}, [r2,:64], ip /* d5 = col[3] */ |
| 161 | |
| 162 | ldrd r4, r5, [r2] |
| 163 | ldrd r6, r7, [r2, #16] |
| 164 | orrs r4, r4, r5 |
| 165 | |
| 166 | idct_col4_top |
| 167 | it eq |
| 168 | addeq r2, r2, #16 |
| 169 | beq 1f |
| 170 | |
| 171 | vld1.64 {d3}, [r2,:64], ip /* d6 = col[4] */ |
| 172 | vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */ |
| 173 | vadd.i32 q11, q11, q7 |
| 174 | vsub.i32 q12, q12, q7 |
| 175 | vsub.i32 q13, q13, q7 |
| 176 | vadd.i32 q14, q14, q7 |
| 177 | |
| 178 | 1: orrs r6, r6, r7 |
| 179 | ldrd r4, r5, [r2, #16] |
| 180 | it eq |
| 181 | addeq r2, r2, #16 |
| 182 | beq 2f |
| 183 | |
| 184 | vld1.64 {d5}, [r2,:64], ip /* d7 = col[5] */ |
| 185 | vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */ |
| 186 | vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */ |
| 187 | vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */ |
| 188 | vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */ |
| 189 | |
| 190 | 2: orrs r4, r4, r5 |
| 191 | ldrd r4, r5, [r2, #16] |
| 192 | it eq |
| 193 | addeq r2, r2, #16 |
| 194 | beq 3f |
| 195 | |
| 196 | vld1.64 {d7}, [r2,:64], ip /* d8 = col[6] */ |
| 197 | vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */ |
| 198 | vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */ |
| 199 | vadd.i32 q11, q11, q7 |
| 200 | vsub.i32 q14, q14, q7 |
| 201 | vsub.i32 q12, q12, q8 |
| 202 | vadd.i32 q13, q13, q8 |
| 203 | |
| 204 | 3: orrs r4, r4, r5 |
| 205 | it eq |
| 206 | addeq r2, r2, #16 |
| 207 | beq 4f |
| 208 | |
| 209 | vld1.64 {d9}, [r2,:64], ip /* d9 = col[7] */ |
| 210 | vmlal.s16 q9, d9, w7 |
| 211 | vmlsl.s16 q10, d9, w5 |
| 212 | vmlal.s16 q5, d9, w3 |
| 213 | vmlsl.s16 q6, d9, w1 |
| 214 | |
| 215 | 4: vaddhn.i32 d2, q11, q9 |
| 216 | vaddhn.i32 d3, q12, q10 |
| 217 | vaddhn.i32 d4, q13, q5 |
| 218 | vaddhn.i32 d5, q14, q6 |
| 219 | vsubhn.i32 d9, q11, q9 |
| 220 | vsubhn.i32 d8, q12, q10 |
| 221 | vsubhn.i32 d7, q13, q5 |
| 222 | vsubhn.i32 d6, q14, q6 |
| 223 | |
| 224 | bx lr |
| 225 | endfunc |
| 226 | |
| 227 | .align 6 |
| 228 | |
| 229 | function idct_col4_st8_neon |
| 230 | vqshrun.s16 d2, q1, #COL_SHIFT-16 |
| 231 | vqshrun.s16 d3, q2, #COL_SHIFT-16 |
| 232 | vqshrun.s16 d4, q3, #COL_SHIFT-16 |
| 233 | vqshrun.s16 d5, q4, #COL_SHIFT-16 |
| 234 | vst1.32 {d2[0]}, [r0,:32], r1 |
| 235 | vst1.32 {d2[1]}, [r0,:32], r1 |
| 236 | vst1.32 {d3[0]}, [r0,:32], r1 |
| 237 | vst1.32 {d3[1]}, [r0,:32], r1 |
| 238 | vst1.32 {d4[0]}, [r0,:32], r1 |
| 239 | vst1.32 {d4[1]}, [r0,:32], r1 |
| 240 | vst1.32 {d5[0]}, [r0,:32], r1 |
| 241 | vst1.32 {d5[1]}, [r0,:32], r1 |
| 242 | |
| 243 | bx lr |
| 244 | endfunc |
| 245 | |
| 246 | const idct_coeff_neon, align=4 |
| 247 | .short W1, W2, W3, W4, W5, W6, W7, W4c |
| 248 | endconst |
| 249 | |
| 250 | .macro idct_start data |
| 251 | push {r4-r7, lr} |
| 252 | pld [\data] |
| 253 | pld [\data, #64] |
| 254 | vpush {d8-d15} |
| 255 | movrel r3, idct_coeff_neon |
| 256 | vld1.64 {d0,d1}, [r3,:128] |
| 257 | .endm |
| 258 | |
| 259 | .macro idct_end |
| 260 | vpop {d8-d15} |
| 261 | pop {r4-r7, pc} |
| 262 | .endm |
| 263 | |
| 264 | /* void ff_simple_idct_put_neon(uint8_t *dst, int line_size, int16_t *data); */ |
| 265 | function ff_simple_idct_put_neon, export=1 |
| 266 | idct_start r2 |
| 267 | |
| 268 | bl idct_row4_pld_neon |
| 269 | bl idct_row4_neon |
| 270 | add r2, r2, #-128 |
| 271 | bl idct_col4_neon |
| 272 | bl idct_col4_st8_neon |
| 273 | sub r0, r0, r1, lsl #3 |
| 274 | add r0, r0, #4 |
| 275 | add r2, r2, #-120 |
| 276 | bl idct_col4_neon |
| 277 | bl idct_col4_st8_neon |
| 278 | |
| 279 | idct_end |
| 280 | endfunc |
| 281 | |
| 282 | .align 6 |
| 283 | |
| 284 | function idct_col4_add8_neon |
| 285 | mov ip, r0 |
| 286 | |
| 287 | vld1.32 {d10[0]}, [r0,:32], r1 |
| 288 | vshr.s16 q1, q1, #COL_SHIFT-16 |
| 289 | vld1.32 {d10[1]}, [r0,:32], r1 |
| 290 | vshr.s16 q2, q2, #COL_SHIFT-16 |
| 291 | vld1.32 {d11[0]}, [r0,:32], r1 |
| 292 | vshr.s16 q3, q3, #COL_SHIFT-16 |
| 293 | vld1.32 {d11[1]}, [r0,:32], r1 |
| 294 | vshr.s16 q4, q4, #COL_SHIFT-16 |
| 295 | vld1.32 {d12[0]}, [r0,:32], r1 |
| 296 | vaddw.u8 q1, q1, d10 |
| 297 | vld1.32 {d12[1]}, [r0,:32], r1 |
| 298 | vaddw.u8 q2, q2, d11 |
| 299 | vld1.32 {d13[0]}, [r0,:32], r1 |
| 300 | vqmovun.s16 d2, q1 |
| 301 | vld1.32 {d13[1]}, [r0,:32], r1 |
| 302 | vaddw.u8 q3, q3, d12 |
| 303 | vst1.32 {d2[0]}, [ip,:32], r1 |
| 304 | vqmovun.s16 d3, q2 |
| 305 | vst1.32 {d2[1]}, [ip,:32], r1 |
| 306 | vaddw.u8 q4, q4, d13 |
| 307 | vst1.32 {d3[0]}, [ip,:32], r1 |
| 308 | vqmovun.s16 d4, q3 |
| 309 | vst1.32 {d3[1]}, [ip,:32], r1 |
| 310 | vqmovun.s16 d5, q4 |
| 311 | vst1.32 {d4[0]}, [ip,:32], r1 |
| 312 | vst1.32 {d4[1]}, [ip,:32], r1 |
| 313 | vst1.32 {d5[0]}, [ip,:32], r1 |
| 314 | vst1.32 {d5[1]}, [ip,:32], r1 |
| 315 | |
| 316 | bx lr |
| 317 | endfunc |
| 318 | |
| 319 | /* void ff_simple_idct_add_neon(uint8_t *dst, int line_size, int16_t *data); */ |
| 320 | function ff_simple_idct_add_neon, export=1 |
| 321 | idct_start r2 |
| 322 | |
| 323 | bl idct_row4_pld_neon |
| 324 | bl idct_row4_neon |
| 325 | add r2, r2, #-128 |
| 326 | bl idct_col4_neon |
| 327 | bl idct_col4_add8_neon |
| 328 | sub r0, r0, r1, lsl #3 |
| 329 | add r0, r0, #4 |
| 330 | add r2, r2, #-120 |
| 331 | bl idct_col4_neon |
| 332 | bl idct_col4_add8_neon |
| 333 | |
| 334 | idct_end |
| 335 | endfunc |
| 336 | |
| 337 | .align 6 |
| 338 | |
| 339 | function idct_col4_st16_neon |
| 340 | mov ip, #16 |
| 341 | |
| 342 | vshr.s16 q1, q1, #COL_SHIFT-16 |
| 343 | vshr.s16 q2, q2, #COL_SHIFT-16 |
| 344 | vst1.64 {d2}, [r2,:64], ip |
| 345 | vshr.s16 q3, q3, #COL_SHIFT-16 |
| 346 | vst1.64 {d3}, [r2,:64], ip |
| 347 | vshr.s16 q4, q4, #COL_SHIFT-16 |
| 348 | vst1.64 {d4}, [r2,:64], ip |
| 349 | vst1.64 {d5}, [r2,:64], ip |
| 350 | vst1.64 {d6}, [r2,:64], ip |
| 351 | vst1.64 {d7}, [r2,:64], ip |
| 352 | vst1.64 {d8}, [r2,:64], ip |
| 353 | vst1.64 {d9}, [r2,:64], ip |
| 354 | |
| 355 | bx lr |
| 356 | endfunc |
| 357 | |
| 358 | /* void ff_simple_idct_neon(int16_t *data); */ |
| 359 | function ff_simple_idct_neon, export=1 |
| 360 | idct_start r0 |
| 361 | |
| 362 | mov r2, r0 |
| 363 | bl idct_row4_neon |
| 364 | bl idct_row4_neon |
| 365 | add r2, r2, #-128 |
| 366 | bl idct_col4_neon |
| 367 | add r2, r2, #-128 |
| 368 | bl idct_col4_st16_neon |
| 369 | add r2, r2, #-120 |
| 370 | bl idct_col4_neon |
| 371 | add r2, r2, #-128 |
| 372 | bl idct_col4_st16_neon |
| 373 | |
| 374 | idct_end |
| 375 | endfunc |