| 1 | /* |
| 2 | * Copyright (C) 2003 James Klicman <james@klicman.org> |
| 3 | * |
| 4 | * This file is part of FFmpeg. |
| 5 | * |
| 6 | * FFmpeg is free software; you can redistribute it and/or |
| 7 | * modify it under the terms of the GNU Lesser General Public |
| 8 | * License as published by the Free Software Foundation; either |
| 9 | * version 2.1 of the License, or (at your option) any later version. |
| 10 | * |
| 11 | * FFmpeg is distributed in the hope that it will be useful, |
| 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 14 | * Lesser General Public License for more details. |
| 15 | * |
| 16 | * You should have received a copy of the GNU Lesser General Public |
| 17 | * License along with FFmpeg; if not, write to the Free Software |
| 18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| 19 | */ |
| 20 | |
| 21 | #include "config.h" |
| 22 | #if HAVE_ALTIVEC_H |
| 23 | #include <altivec.h> |
| 24 | #endif |
| 25 | |
| 26 | #include "libavutil/attributes.h" |
| 27 | #include "libavutil/cpu.h" |
| 28 | #include "libavutil/ppc/cpu.h" |
| 29 | #include "libavcodec/fdctdsp.h" |
| 30 | #include "fdct.h" |
| 31 | |
| 32 | #if HAVE_ALTIVEC |
| 33 | |
/*
 * Shorthand casts to the AltiVec vector types used in this file.
 * Each macro parenthesizes both its argument and its whole expansion, so it
 * can be applied to any expression and used inside a larger expression.
 */
#define vs16(v) ((vector signed short)   (v))
#define vs32(v) ((vector signed int)     (v))
#define vu8(v)  ((vector unsigned char)  (v))
#define vu16(v) ((vector unsigned short) (v))
#define vu32(v) ((vector unsigned int)   (v))
| 39 | |
/*
 * Cosine table for the 8-point DCT.  The literals are written out to many
 * digits but appear to be the single-precision roundings of the exact
 * values (e.g. SQRT_2 is 1.41421353..., not 1.41421356...).
 */
#define C1     0.98078525066375732421875000 /* cos(1 * PI / 16) */
#define C2     0.92387950420379638671875000 /* cos(2 * PI / 16) */
#define C3     0.83146959543228149414062500 /* cos(3 * PI / 16) */
#define C4     0.70710676908493041992187500 /* cos(4 * PI / 16) */
#define C5     0.55557024478912353515625000 /* cos(5 * PI / 16) */
#define C6     0.38268342614173889160156250 /* cos(6 * PI / 16) */
#define C7     0.19509032368659973144531250 /* cos(7 * PI / 16) */
#define SQRT_2 1.41421353816986083984375000 /* sqrt(2)          */

/*
 * Multipliers used by the 1-D transform, all derived from the cosines above:
 *   W0..W2  even part (rotation producing outputs 2 and 6)
 *   W3      common term shared by the odd outputs
 *   W4..W7  per-input factors of the odd part
 *   W8..WB  factors applied to the pairwise sums of the odd part
 * Every expansion is fully parenthesized so the macros are safe inside any
 * larger expression.  Do not reorder the sums: the operand order fixes the
 * floating-point rounding of the resulting constants.
 */
#define W0 (-(2 * C2))
#define W1 (2 * C6)
#define W2 (SQRT_2 * C6)
#define W3 (SQRT_2 * C3)
#define W4 (SQRT_2 * (-C1 + C3 + C5 - C7))
#define W5 (SQRT_2 * (C1 + C3 - C5 + C7))
#define W6 (SQRT_2 * (C1 + C3 + C5 - C7))
#define W7 (SQRT_2 * (C1 + C3 - C5 - C7))
#define W8 (SQRT_2 * (C7 - C3))
#define W9 (SQRT_2 * (-C1 - C3))
#define WA (SQRT_2 * (-C3 - C5))
#define WB (SQRT_2 * (C5 - C3))
| 61 | |
| 62 | static const vector float fdctconsts[3] = { |
| 63 | { W0, W1, W2, W3 }, |
| 64 | { W4, W5, W6, W7 }, |
| 65 | { W8, W9, WA, WB } |
| 66 | }; |
| 67 | |
/*
 * LD_Wn yields a vector with every element equal to constant Wn.
 * The macros read the variables cnsts0, cnsts1 and cnsts2 from the scope in
 * which they are expanded; element j of cnstsN is replicated with vec_splat.
 */

/* elements of cnsts0 */
#define LD_W0 vec_splat(cnsts0, 0)
#define LD_W1 vec_splat(cnsts0, 1)
#define LD_W2 vec_splat(cnsts0, 2)
#define LD_W3 vec_splat(cnsts0, 3)

/* elements of cnsts1 */
#define LD_W4 vec_splat(cnsts1, 0)
#define LD_W5 vec_splat(cnsts1, 1)
#define LD_W6 vec_splat(cnsts1, 2)
#define LD_W7 vec_splat(cnsts1, 3)

/* elements of cnsts2 */
#define LD_W8 vec_splat(cnsts2, 0)
#define LD_W9 vec_splat(cnsts2, 1)
#define LD_WA vec_splat(cnsts2, 2)
#define LD_WB vec_splat(cnsts2, 3)
| 80 | |
/*
 * 8-point one-dimensional forward DCT on eight values, computed in place.
 *
 * b0..b7 are both inputs and outputs and must be plain lvalues (they are
 * assigned to).  The expansion also uses, from the enclosing scope:
 *   x0..x8  scratch variables (clobbered)
 *   cnst    scratch variable holding the current multiplier (clobbered)
 *   mzero   additive identity passed as the addend of a bare multiply
 *   cnsts0, cnsts1, cnsts2 (through the LD_W* macros)
 *
 * The body is wrapped in do { } while (0) so that "FDCTROW(...);" is a
 * single statement and stays correct as the body of an unbraced if/else.
 *
 * Note: this macro is not expanded anywhere in this file; the row pass of
 * ff_fdct_altivec() is written out by hand.  The statements here are the
 * same as those of FDCTCOL.
 */
#define FDCTROW(b0, b1, b2, b3, b4, b5, b6, b7) /* {{{ */                    \
    do {                                                                     \
        /* butterflies on mirrored input pairs */                            \
        x0 = vec_add(b0, b7);           /* x0 = b0 + b7; */                  \
        x7 = vec_sub(b0, b7);           /* x7 = b0 - b7; */                  \
        x1 = vec_add(b1, b6);           /* x1 = b1 + b6; */                  \
        x6 = vec_sub(b1, b6);           /* x6 = b1 - b6; */                  \
        x2 = vec_add(b2, b5);           /* x2 = b2 + b5; */                  \
        x5 = vec_sub(b2, b5);           /* x5 = b2 - b5; */                  \
        x3 = vec_add(b3, b4);           /* x3 = b3 + b4; */                  \
        x4 = vec_sub(b3, b4);           /* x4 = b3 - b4; */                  \
                                                                             \
        /* even part: outputs 0 and 4 */                                     \
        b7 = vec_add(x0, x3);           /* b7 = x0 + x3; */                  \
        b1 = vec_add(x1, x2);           /* b1 = x1 + x2; */                  \
        b0 = vec_add(b7, b1);           /* b0 = b7 + b1; */                  \
        b4 = vec_sub(b7, b1);           /* b4 = b7 - b1; */                  \
                                                                             \
        /* even part: outputs 2 and 6 */                                     \
        b2   = vec_sub(x0, x3);         /* b2 = x0 - x3; */                  \
        b6   = vec_sub(x1, x2);         /* b6 = x1 - x2; */                  \
        b5   = vec_add(b6, b2);         /* b5 = b6 + b2; */                  \
        cnst = LD_W2;                                                        \
        b5   = vec_madd(cnst, b5, mzero); /* b5 = b5 * W2; */                \
        cnst = LD_W1;                                                        \
        b2   = vec_madd(cnst, b2, b5);  /* b2 = b5 + b2 * W1; */             \
        cnst = LD_W0;                                                        \
        b6   = vec_madd(cnst, b6, b5);  /* b6 = b5 + b6 * W0; */             \
                                                                             \
        /* odd part: pairwise sums and their common term */                  \
        x0   = vec_add(x4, x7);         /* x0 = x4 + x7; */                  \
        x1   = vec_add(x5, x6);         /* x1 = x5 + x6; */                  \
        x2   = vec_add(x4, x6);         /* x2 = x4 + x6; */                  \
        x3   = vec_add(x5, x7);         /* x3 = x5 + x7; */                  \
        x8   = vec_add(x2, x3);         /* x8 = x2 + x3; */                  \
        cnst = LD_W3;                                                        \
        x8   = vec_madd(cnst, x8, mzero); /* x8 = x8 * W3; */                \
                                                                             \
        cnst = LD_W8;                                                        \
        x0   = vec_madd(cnst, x0, mzero); /* x0 *= W8; */                    \
        cnst = LD_W9;                                                        \
        x1   = vec_madd(cnst, x1, mzero); /* x1 *= W9; */                    \
        cnst = LD_WA;                                                        \
        x2   = vec_madd(cnst, x2, x8);  /* x2 = x2 * WA + x8; */             \
        cnst = LD_WB;                                                        \
        x3   = vec_madd(cnst, x3, x8);  /* x3 = x3 * WB + x8; */             \
                                                                             \
        /* odd part: outputs 7, 5, 3 and 1 */                                \
        cnst = LD_W4;                                                        \
        b7   = vec_madd(cnst, x4, x0);  /* b7 = x4 * W4 + x0; */             \
        cnst = LD_W5;                                                        \
        b5   = vec_madd(cnst, x5, x1);  /* b5 = x5 * W5 + x1; */             \
        cnst = LD_W6;                                                        \
        b3   = vec_madd(cnst, x6, x1);  /* b3 = x6 * W6 + x1; */             \
        cnst = LD_W7;                                                        \
        b1   = vec_madd(cnst, x7, x0);  /* b1 = x7 * W7 + x0; */             \
                                                                             \
        b7 = vec_add(b7, x2);           /* b7 = b7 + x2; */                  \
        b5 = vec_add(b5, x3);           /* b5 = b5 + x3; */                  \
        b3 = vec_add(b3, x2);           /* b3 = b3 + x2; */                  \
        b1 = vec_add(b1, x3);           /* b1 = b1 + x3; */                  \
    } while (0)
    /* }}} */
| 137 | |
/*
 * 8-point one-dimensional forward DCT on eight values, computed in place.
 * ff_fdct_altivec() expands it twice for the column pass, once per half of
 * the transposed block.
 *
 * b0..b7 are both inputs and outputs and must be plain lvalues (they are
 * assigned to).  The expansion also uses, from the enclosing scope:
 *   x0..x8  scratch variables (clobbered)
 *   cnst    scratch variable holding the current multiplier (clobbered)
 *   mzero   additive identity passed as the addend of a bare multiply
 *   cnsts0, cnsts1, cnsts2 (through the LD_W* macros)
 *
 * The body is wrapped in do { } while (0) so that "FDCTCOL(...);" is a
 * single statement and stays correct as the body of an unbraced if/else.
 */
#define FDCTCOL(b0, b1, b2, b3, b4, b5, b6, b7) /* {{{ */                    \
    do {                                                                     \
        /* butterflies on mirrored input pairs */                            \
        x0 = vec_add(b0, b7);           /* x0 = b0 + b7; */                  \
        x7 = vec_sub(b0, b7);           /* x7 = b0 - b7; */                  \
        x1 = vec_add(b1, b6);           /* x1 = b1 + b6; */                  \
        x6 = vec_sub(b1, b6);           /* x6 = b1 - b6; */                  \
        x2 = vec_add(b2, b5);           /* x2 = b2 + b5; */                  \
        x5 = vec_sub(b2, b5);           /* x5 = b2 - b5; */                  \
        x3 = vec_add(b3, b4);           /* x3 = b3 + b4; */                  \
        x4 = vec_sub(b3, b4);           /* x4 = b3 - b4; */                  \
                                                                             \
        /* even part: outputs 0 and 4 */                                     \
        b7 = vec_add(x0, x3);           /* b7 = x0 + x3; */                  \
        b1 = vec_add(x1, x2);           /* b1 = x1 + x2; */                  \
        b0 = vec_add(b7, b1);           /* b0 = b7 + b1; */                  \
        b4 = vec_sub(b7, b1);           /* b4 = b7 - b1; */                  \
                                                                             \
        /* even part: outputs 2 and 6 */                                     \
        b2   = vec_sub(x0, x3);         /* b2 = x0 - x3; */                  \
        b6   = vec_sub(x1, x2);         /* b6 = x1 - x2; */                  \
        b5   = vec_add(b6, b2);         /* b5 = b6 + b2; */                  \
        cnst = LD_W2;                                                        \
        b5   = vec_madd(cnst, b5, mzero); /* b5 = b5 * W2; */                \
        cnst = LD_W1;                                                        \
        b2   = vec_madd(cnst, b2, b5);  /* b2 = b5 + b2 * W1; */             \
        cnst = LD_W0;                                                        \
        b6   = vec_madd(cnst, b6, b5);  /* b6 = b5 + b6 * W0; */             \
                                                                             \
        /* odd part: pairwise sums and their common term */                  \
        x0   = vec_add(x4, x7);         /* x0 = x4 + x7; */                  \
        x1   = vec_add(x5, x6);         /* x1 = x5 + x6; */                  \
        x2   = vec_add(x4, x6);         /* x2 = x4 + x6; */                  \
        x3   = vec_add(x5, x7);         /* x3 = x5 + x7; */                  \
        x8   = vec_add(x2, x3);         /* x8 = x2 + x3; */                  \
        cnst = LD_W3;                                                        \
        x8   = vec_madd(cnst, x8, mzero); /* x8 = x8 * W3; */                \
                                                                             \
        cnst = LD_W8;                                                        \
        x0   = vec_madd(cnst, x0, mzero); /* x0 *= W8; */                    \
        cnst = LD_W9;                                                        \
        x1   = vec_madd(cnst, x1, mzero); /* x1 *= W9; */                    \
        cnst = LD_WA;                                                        \
        x2   = vec_madd(cnst, x2, x8);  /* x2 = x2 * WA + x8; */             \
        cnst = LD_WB;                                                        \
        x3   = vec_madd(cnst, x3, x8);  /* x3 = x3 * WB + x8; */             \
                                                                             \
        /* odd part: outputs 7, 5, 3 and 1 */                                \
        cnst = LD_W4;                                                        \
        b7   = vec_madd(cnst, x4, x0);  /* b7 = x4 * W4 + x0; */             \
        cnst = LD_W5;                                                        \
        b5   = vec_madd(cnst, x5, x1);  /* b5 = x5 * W5 + x1; */             \
        cnst = LD_W6;                                                        \
        b3   = vec_madd(cnst, x6, x1);  /* b3 = x6 * W6 + x1; */             \
        cnst = LD_W7;                                                        \
        b1   = vec_madd(cnst, x7, x0);  /* b1 = x7 * W7 + x0; */             \
                                                                             \
        b7 = vec_add(b7, x2);           /* b7 += x2; */                      \
        b5 = vec_add(b5, x3);           /* b5 += x3; */                      \
        b3 = vec_add(b3, x2);           /* b3 += x2; */                      \
        b1 = vec_add(b1, x3);           /* b1 += x3; */                      \
    } while (0)
    /* }}} */
| 194 | |
| 195 | /* two dimensional discrete cosine transform */ |
| 196 | void ff_fdct_altivec(int16_t *block) |
| 197 | { |
| 198 | vector signed short *bp; |
| 199 | vector float *cp = fdctconsts; |
| 200 | vector float b00, b10, b20, b30, b40, b50, b60, b70; |
| 201 | vector float b01, b11, b21, b31, b41, b51, b61, b71; |
| 202 | vector float mzero, cnst, cnsts0, cnsts1, cnsts2; |
| 203 | vector float x0, x1, x2, x3, x4, x5, x6, x7, x8; |
| 204 | |
| 205 | /* setup constants {{{ */ |
| 206 | /* mzero = -0.0 */ |
| 207 | mzero = ((vector float) vec_splat_u32(-1)); |
| 208 | mzero = ((vector float) vec_sl(vu32(mzero), vu32(mzero))); |
| 209 | cnsts0 = vec_ld(0, cp); |
| 210 | cp++; |
| 211 | cnsts1 = vec_ld(0, cp); |
| 212 | cp++; |
| 213 | cnsts2 = vec_ld(0, cp); |
| 214 | /* }}} */ |
| 215 | |
| 216 | /* 8x8 matrix transpose (vector short[8]) {{{ */ |
| 217 | #define MERGE_S16(hl, a, b) vec_merge ## hl(vs16(a), vs16(b)) |
| 218 | |
| 219 | bp = (vector signed short *) block; |
| 220 | b00 = ((vector float) vec_ld(0, bp)); |
| 221 | b40 = ((vector float) vec_ld(16 * 4, bp)); |
| 222 | b01 = ((vector float) MERGE_S16(h, b00, b40)); |
| 223 | b11 = ((vector float) MERGE_S16(l, b00, b40)); |
| 224 | bp++; |
| 225 | b10 = ((vector float) vec_ld(0, bp)); |
| 226 | b50 = ((vector float) vec_ld(16 * 4, bp)); |
| 227 | b21 = ((vector float) MERGE_S16(h, b10, b50)); |
| 228 | b31 = ((vector float) MERGE_S16(l, b10, b50)); |
| 229 | bp++; |
| 230 | b20 = ((vector float) vec_ld(0, bp)); |
| 231 | b60 = ((vector float) vec_ld(16 * 4, bp)); |
| 232 | b41 = ((vector float) MERGE_S16(h, b20, b60)); |
| 233 | b51 = ((vector float) MERGE_S16(l, b20, b60)); |
| 234 | bp++; |
| 235 | b30 = ((vector float) vec_ld(0, bp)); |
| 236 | b70 = ((vector float) vec_ld(16 * 4, bp)); |
| 237 | b61 = ((vector float) MERGE_S16(h, b30, b70)); |
| 238 | b71 = ((vector float) MERGE_S16(l, b30, b70)); |
| 239 | |
| 240 | x0 = ((vector float) MERGE_S16(h, b01, b41)); |
| 241 | x1 = ((vector float) MERGE_S16(l, b01, b41)); |
| 242 | x2 = ((vector float) MERGE_S16(h, b11, b51)); |
| 243 | x3 = ((vector float) MERGE_S16(l, b11, b51)); |
| 244 | x4 = ((vector float) MERGE_S16(h, b21, b61)); |
| 245 | x5 = ((vector float) MERGE_S16(l, b21, b61)); |
| 246 | x6 = ((vector float) MERGE_S16(h, b31, b71)); |
| 247 | x7 = ((vector float) MERGE_S16(l, b31, b71)); |
| 248 | |
| 249 | b00 = ((vector float) MERGE_S16(h, x0, x4)); |
| 250 | b10 = ((vector float) MERGE_S16(l, x0, x4)); |
| 251 | b20 = ((vector float) MERGE_S16(h, x1, x5)); |
| 252 | b30 = ((vector float) MERGE_S16(l, x1, x5)); |
| 253 | b40 = ((vector float) MERGE_S16(h, x2, x6)); |
| 254 | b50 = ((vector float) MERGE_S16(l, x2, x6)); |
| 255 | b60 = ((vector float) MERGE_S16(h, x3, x7)); |
| 256 | b70 = ((vector float) MERGE_S16(l, x3, x7)); |
| 257 | |
| 258 | #undef MERGE_S16 |
| 259 | /* }}} */ |
| 260 | |
| 261 | /* Some of the initial calculations can be done as vector short |
| 262 | * before conversion to vector float. The following code section |
| 263 | * takes advantage of this. */ |
| 264 | |
| 265 | /* fdct rows {{{ */ |
| 266 | x0 = ((vector float) vec_add(vs16(b00), vs16(b70))); |
| 267 | x7 = ((vector float) vec_sub(vs16(b00), vs16(b70))); |
| 268 | x1 = ((vector float) vec_add(vs16(b10), vs16(b60))); |
| 269 | x6 = ((vector float) vec_sub(vs16(b10), vs16(b60))); |
| 270 | x2 = ((vector float) vec_add(vs16(b20), vs16(b50))); |
| 271 | x5 = ((vector float) vec_sub(vs16(b20), vs16(b50))); |
| 272 | x3 = ((vector float) vec_add(vs16(b30), vs16(b40))); |
| 273 | x4 = ((vector float) vec_sub(vs16(b30), vs16(b40))); |
| 274 | |
| 275 | b70 = ((vector float) vec_add(vs16(x0), vs16(x3))); |
| 276 | b10 = ((vector float) vec_add(vs16(x1), vs16(x2))); |
| 277 | |
| 278 | b00 = ((vector float) vec_add(vs16(b70), vs16(b10))); |
| 279 | b40 = ((vector float) vec_sub(vs16(b70), vs16(b10))); |
| 280 | |
| 281 | #define CTF0(n) \ |
| 282 | b ## n ## 1 = ((vector float) vec_unpackl(vs16(b ## n ## 0))); \ |
| 283 | b ## n ## 0 = ((vector float) vec_unpackh(vs16(b ## n ## 0))); \ |
| 284 | b ## n ## 1 = vec_ctf(vs32(b ## n ## 1), 0); \ |
| 285 | b ## n ## 0 = vec_ctf(vs32(b ## n ## 0), 0) |
| 286 | |
| 287 | CTF0(0); |
| 288 | CTF0(4); |
| 289 | |
| 290 | b20 = ((vector float) vec_sub(vs16(x0), vs16(x3))); |
| 291 | b60 = ((vector float) vec_sub(vs16(x1), vs16(x2))); |
| 292 | |
| 293 | CTF0(2); |
| 294 | CTF0(6); |
| 295 | |
| 296 | #undef CTF0 |
| 297 | |
| 298 | x0 = vec_add(b60, b20); |
| 299 | x1 = vec_add(b61, b21); |
| 300 | |
| 301 | cnst = LD_W2; |
| 302 | x0 = vec_madd(cnst, x0, mzero); |
| 303 | x1 = vec_madd(cnst, x1, mzero); |
| 304 | cnst = LD_W1; |
| 305 | b20 = vec_madd(cnst, b20, x0); |
| 306 | b21 = vec_madd(cnst, b21, x1); |
| 307 | cnst = LD_W0; |
| 308 | b60 = vec_madd(cnst, b60, x0); |
| 309 | b61 = vec_madd(cnst, b61, x1); |
| 310 | |
| 311 | #define CTFX(x, b) \ |
| 312 | b ## 0 = ((vector float) vec_unpackh(vs16(x))); \ |
| 313 | b ## 1 = ((vector float) vec_unpackl(vs16(x))); \ |
| 314 | b ## 0 = vec_ctf(vs32(b ## 0), 0); \ |
| 315 | b ## 1 = vec_ctf(vs32(b ## 1), 0) |
| 316 | |
| 317 | CTFX(x4, b7); |
| 318 | CTFX(x5, b5); |
| 319 | CTFX(x6, b3); |
| 320 | CTFX(x7, b1); |
| 321 | |
| 322 | #undef CTFX |
| 323 | |
| 324 | x0 = vec_add(b70, b10); |
| 325 | x1 = vec_add(b50, b30); |
| 326 | x2 = vec_add(b70, b30); |
| 327 | x3 = vec_add(b50, b10); |
| 328 | x8 = vec_add(x2, x3); |
| 329 | cnst = LD_W3; |
| 330 | x8 = vec_madd(cnst, x8, mzero); |
| 331 | |
| 332 | cnst = LD_W8; |
| 333 | x0 = vec_madd(cnst, x0, mzero); |
| 334 | cnst = LD_W9; |
| 335 | x1 = vec_madd(cnst, x1, mzero); |
| 336 | cnst = LD_WA; |
| 337 | x2 = vec_madd(cnst, x2, x8); |
| 338 | cnst = LD_WB; |
| 339 | x3 = vec_madd(cnst, x3, x8); |
| 340 | |
| 341 | cnst = LD_W4; |
| 342 | b70 = vec_madd(cnst, b70, x0); |
| 343 | cnst = LD_W5; |
| 344 | b50 = vec_madd(cnst, b50, x1); |
| 345 | cnst = LD_W6; |
| 346 | b30 = vec_madd(cnst, b30, x1); |
| 347 | cnst = LD_W7; |
| 348 | b10 = vec_madd(cnst, b10, x0); |
| 349 | |
| 350 | b70 = vec_add(b70, x2); |
| 351 | b50 = vec_add(b50, x3); |
| 352 | b30 = vec_add(b30, x2); |
| 353 | b10 = vec_add(b10, x3); |
| 354 | |
| 355 | x0 = vec_add(b71, b11); |
| 356 | x1 = vec_add(b51, b31); |
| 357 | x2 = vec_add(b71, b31); |
| 358 | x3 = vec_add(b51, b11); |
| 359 | x8 = vec_add(x2, x3); |
| 360 | cnst = LD_W3; |
| 361 | x8 = vec_madd(cnst, x8, mzero); |
| 362 | |
| 363 | cnst = LD_W8; |
| 364 | x0 = vec_madd(cnst, x0, mzero); |
| 365 | cnst = LD_W9; |
| 366 | x1 = vec_madd(cnst, x1, mzero); |
| 367 | cnst = LD_WA; |
| 368 | x2 = vec_madd(cnst, x2, x8); |
| 369 | cnst = LD_WB; |
| 370 | x3 = vec_madd(cnst, x3, x8); |
| 371 | |
| 372 | cnst = LD_W4; |
| 373 | b71 = vec_madd(cnst, b71, x0); |
| 374 | cnst = LD_W5; |
| 375 | b51 = vec_madd(cnst, b51, x1); |
| 376 | cnst = LD_W6; |
| 377 | b31 = vec_madd(cnst, b31, x1); |
| 378 | cnst = LD_W7; |
| 379 | b11 = vec_madd(cnst, b11, x0); |
| 380 | |
| 381 | b71 = vec_add(b71, x2); |
| 382 | b51 = vec_add(b51, x3); |
| 383 | b31 = vec_add(b31, x2); |
| 384 | b11 = vec_add(b11, x3); |
| 385 | /* }}} */ |
| 386 | |
| 387 | /* 8x8 matrix transpose (vector float[8][2]) {{{ */ |
| 388 | x0 = vec_mergel(b00, b20); |
| 389 | x1 = vec_mergeh(b00, b20); |
| 390 | x2 = vec_mergel(b10, b30); |
| 391 | x3 = vec_mergeh(b10, b30); |
| 392 | |
| 393 | b00 = vec_mergeh(x1, x3); |
| 394 | b10 = vec_mergel(x1, x3); |
| 395 | b20 = vec_mergeh(x0, x2); |
| 396 | b30 = vec_mergel(x0, x2); |
| 397 | |
| 398 | x4 = vec_mergel(b41, b61); |
| 399 | x5 = vec_mergeh(b41, b61); |
| 400 | x6 = vec_mergel(b51, b71); |
| 401 | x7 = vec_mergeh(b51, b71); |
| 402 | |
| 403 | b41 = vec_mergeh(x5, x7); |
| 404 | b51 = vec_mergel(x5, x7); |
| 405 | b61 = vec_mergeh(x4, x6); |
| 406 | b71 = vec_mergel(x4, x6); |
| 407 | |
| 408 | x0 = vec_mergel(b01, b21); |
| 409 | x1 = vec_mergeh(b01, b21); |
| 410 | x2 = vec_mergel(b11, b31); |
| 411 | x3 = vec_mergeh(b11, b31); |
| 412 | |
| 413 | x4 = vec_mergel(b40, b60); |
| 414 | x5 = vec_mergeh(b40, b60); |
| 415 | x6 = vec_mergel(b50, b70); |
| 416 | x7 = vec_mergeh(b50, b70); |
| 417 | |
| 418 | b40 = vec_mergeh(x1, x3); |
| 419 | b50 = vec_mergel(x1, x3); |
| 420 | b60 = vec_mergeh(x0, x2); |
| 421 | b70 = vec_mergel(x0, x2); |
| 422 | |
| 423 | b01 = vec_mergeh(x5, x7); |
| 424 | b11 = vec_mergel(x5, x7); |
| 425 | b21 = vec_mergeh(x4, x6); |
| 426 | b31 = vec_mergel(x4, x6); |
| 427 | /* }}} */ |
| 428 | |
| 429 | FDCTCOL(b00, b10, b20, b30, b40, b50, b60, b70); |
| 430 | FDCTCOL(b01, b11, b21, b31, b41, b51, b61, b71); |
| 431 | |
| 432 | /* round, convert back to short {{{ */ |
| 433 | #define CTS(n) \ |
| 434 | b ## n ## 0 = vec_round(b ## n ## 0); \ |
| 435 | b ## n ## 1 = vec_round(b ## n ## 1); \ |
| 436 | b ## n ## 0 = ((vector float) vec_cts(b ## n ## 0, 0)); \ |
| 437 | b ## n ## 1 = ((vector float) vec_cts(b ## n ## 1, 0)); \ |
| 438 | b ## n ## 0 = ((vector float) vec_pack(vs32(b ## n ## 0), \ |
| 439 | vs32(b ## n ## 1))); \ |
| 440 | vec_st(vs16(b ## n ## 0), 0, bp) |
| 441 | |
| 442 | bp = (vector signed short *) block; |
| 443 | CTS(0); |
| 444 | bp++; |
| 445 | CTS(1); |
| 446 | bp++; |
| 447 | CTS(2); |
| 448 | bp++; |
| 449 | CTS(3); |
| 450 | bp++; |
| 451 | CTS(4); |
| 452 | bp++; |
| 453 | CTS(5); |
| 454 | bp++; |
| 455 | CTS(6); |
| 456 | bp++; |
| 457 | CTS(7); |
| 458 | |
| 459 | #undef CTS |
| 460 | /* }}} */ |
| 461 | } |
| 462 | |
| 463 | #endif /* HAVE_ALTIVEC */ |
| 464 | |
| 465 | av_cold void ff_fdctdsp_init_ppc(FDCTDSPContext *c, AVCodecContext *avctx, |
| 466 | unsigned high_bit_depth) |
| 467 | { |
| 468 | #if HAVE_ALTIVEC |
| 469 | if (!PPC_ALTIVEC(av_get_cpu_flags())) |
| 470 | return; |
| 471 | |
| 472 | if (!high_bit_depth) { |
| 473 | if (avctx->dct_algo == FF_DCT_AUTO || |
| 474 | avctx->dct_algo == FF_DCT_ALTIVEC) { |
| 475 | c->fdct = ff_fdct_altivec; |
| 476 | } |
| 477 | } |
| 478 | #endif /* HAVE_ALTIVEC */ |
| 479 | } |