| 1 | /* |
| 2 | * Copyright (C) 2009 David Conrad |
| 3 | * |
| 4 | * This file is part of FFmpeg. |
| 5 | * |
| 6 | * FFmpeg is free software; you can redistribute it and/or |
| 7 | * modify it under the terms of the GNU Lesser General Public |
| 8 | * License as published by the Free Software Foundation; either |
| 9 | * version 2.1 of the License, or (at your option) any later version. |
| 10 | * |
| 11 | * FFmpeg is distributed in the hope that it will be useful, |
| 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 14 | * Lesser General Public License for more details. |
| 15 | * |
| 16 | * You should have received a copy of the GNU Lesser General Public |
| 17 | * License along with FFmpeg; if not, write to the Free Software |
| 18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| 19 | */ |
| 20 | |
| 21 | #include <string.h> |
| 22 | |
| 23 | #include "config.h" |
| 24 | #include "libavutil/attributes.h" |
| 25 | #include "libavutil/cpu.h" |
| 26 | #include "libavutil/ppc/cpu.h" |
| 27 | #include "libavutil/ppc/types_altivec.h" |
| 28 | #include "libavutil/ppc/util_altivec.h" |
| 29 | #include "libavcodec/vp3dsp.h" |
| 30 | |
| 31 | #if HAVE_ALTIVEC |
| 32 | |
/* Cosine table for the VP3 IDCT: entry k (k=1..7) is cos(k*pi/16) scaled
 * by 65536 and truncated to 16 bits (entry 0 is unused padding).  Entries
 * 1..5 exceed 0x7FFF, so as signed 16-bit lanes they read as C-65536;
 * M16() compensates for that (see below). */
static const vec_s16 constants =
    {0, 64277, 60547, 54491, 46341, 36410, 25080, 12785};
/* Permute mask selecting the high 16 bits of each 32-bit product when
 * applied to the (even-products, odd-products) pair from vec_mule/vec_mulo;
 * used by M15() to form (a*C)>>16 per lane. */
static const vec_u8 interleave_high =
    {0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29};
| 37 | |
/* Common prologue for both IDCT functions: declares the butterfly
 * temporaries used by IDCT_1D, sets up the rounding bias (eight) and the
 * final shift amount (four), splats the seven cosine constants C1..C7 from
 * the table above, and loads the eight 16-byte rows of the 8x8 16-bit
 * coefficient block. */
#define IDCT_START \
    vec_s16 A, B, C, D, Ad, Bd, Cd, Dd, E, F, G, H;\
    vec_s16 Ed, Gd, Add, Bdd, Fd, Hd;\
    vec_s16 eight = vec_splat_s16(8);\
    vec_u16 four = vec_splat_u16(4);\
\
    vec_s16 C1 = vec_splat(constants, 1);\
    vec_s16 C2 = vec_splat(constants, 2);\
    vec_s16 C3 = vec_splat(constants, 3);\
    vec_s16 C4 = vec_splat(constants, 4);\
    vec_s16 C5 = vec_splat(constants, 5);\
    vec_s16 C6 = vec_splat(constants, 6);\
    vec_s16 C7 = vec_splat(constants, 7);\
\
    vec_s16 b0 = vec_ld(0x00, block);\
    vec_s16 b1 = vec_ld(0x10, block);\
    vec_s16 b2 = vec_ld(0x20, block);\
    vec_s16 b3 = vec_ld(0x30, block);\
    vec_s16 b4 = vec_ld(0x40, block);\
    vec_s16 b5 = vec_ld(0x50, block);\
    vec_s16 b6 = vec_ld(0x60, block);\
    vec_s16 b7 = vec_ld(0x70, block);
| 60 | |
// these functions do (a*C)>>16
// things are tricky because a is signed, but C unsigned.
// M15 is used if C fits in 15 bit unsigned (C6,C7)
// M16 is used if C requires 16 bits unsigned
static inline vec_s16 M15(vec_s16 a, vec_s16 C)
{
    // vec_mule/vec_mulo produce the full 32-bit products of the even/odd
    // 16-bit lanes; interleave_high then keeps only the high 16 bits of
    // each product, yielding (a*C)>>16 per lane.
    return (vec_s16)vec_perm(vec_mule(a,C), vec_mulo(a,C), interleave_high);
}
static inline vec_s16 M16(vec_s16 a, vec_s16 C)
{
    // For constants >= 2^15, the signed multiply in M15 effectively uses
    // C-65536; adding a (== (a*65536)>>16) restores the intended
    // unsigned (a*C)>>16.
    return vec_add(a, M15(a, C));
}
| 73 | |
/* One 8-point IDCT pass over the eight vectors b0..b7 (VP3/Theora
 * butterfly structure).  ADD and SHIFT are pluggable stages: the first
 * (row) pass uses NOP/NOP to keep full precision, the second (column)
 * pass uses ADD8/SHIFT4 to apply the rounding bias and the final >>4
 * scaling.  ADD is applied to the even-part terms E and F, through which
 * the bias reaches every one of the eight outputs exactly once. */
#define IDCT_1D(ADD, SHIFT)\
    A = vec_add(M16(b1, C1), M15(b7, C7));\
    B = vec_sub(M15(b1, C7), M16(b7, C1));\
    C = vec_add(M16(b3, C3), M16(b5, C5));\
    D = vec_sub(M16(b5, C3), M16(b3, C5));\
\
    Ad = M16(vec_sub(A, C), C4);\
    Bd = M16(vec_sub(B, D), C4);\
\
    Cd = vec_add(A, C);\
    Dd = vec_add(B, D);\
\
    E = ADD(M16(vec_add(b0, b4), C4));\
    F = ADD(M16(vec_sub(b0, b4), C4));\
\
    G = vec_add(M16(b2, C2), M15(b6, C6));\
    H = vec_sub(M15(b2, C6), M16(b6, C2));\
\
    Ed = vec_sub(E, G);\
    Gd = vec_add(E, G);\
\
    Add = vec_add(F, Ad);\
    Bdd = vec_sub(Bd, H);\
\
    Fd = vec_sub(F, Ad);\
    Hd = vec_add(Bd, H);\
\
    b0 = SHIFT(vec_add(Gd, Cd));\
    b7 = SHIFT(vec_sub(Gd, Cd));\
\
    b1 = SHIFT(vec_add(Add, Hd));\
    b2 = SHIFT(vec_sub(Add, Hd));\
\
    b3 = SHIFT(vec_add(Ed, Dd));\
    b4 = SHIFT(vec_sub(Ed, Dd));\
\
    b5 = SHIFT(vec_add(Fd, Bdd));\
    b6 = SHIFT(vec_sub(Fd, Bdd));
| 112 | |
/* Pluggable ADD/SHIFT stages for IDCT_1D (see above). */
// pass 1: identity — no rounding bias, no shift
#define NOP(a) a
// pass 2 ADD stage: add the rounding bias held in 'eight'
// (8, or 8 + 2048 in vp3_idct_put_altivec)
#define ADD8(a) vec_add(a, eight)
// pass 2 SHIFT stage: arithmetic >>4 to finish the IDCT scaling
#define SHIFT4(a) vec_sra(a, four)
| 116 | |
/**
 * Compute the 2D inverse DCT of the 8x8 coefficient block and store the
 * result into dst (one 8-pixel row per iteration, rows 'stride' bytes
 * apart), saturating each value to the unsigned 8-bit range.  The
 * coefficient block is cleared afterwards.
 *
 * NOTE(review): the vec_ste word stores below assume dst is at least
 * 4-byte aligned on every row -- confirm against callers.
 */
static void vp3_idct_put_altivec(uint8_t *dst, int stride, int16_t block[64])
{
    vec_u8 t;
    IDCT_START

    // pixels are signed; so add 128*16 in addition to the normal 8
    // (2048 >> 4 == 128, shifting output into the unsigned pixel range)
    vec_s16 v2048 = vec_sl(vec_splat_s16(1), vec_splat_u16(11));
    eight = vec_add(eight, v2048);

    IDCT_1D(NOP, NOP)
    TRANSPOSE8(b0, b1, b2, b3, b4, b5, b6, b7);
    IDCT_1D(ADD8, SHIFT4)

/* Saturate one row vector to unsigned bytes and store its first 8 bytes
 * as two 4-byte element stores. */
#define PUT(a)\
    t = vec_packsu(a, a);\
    vec_ste((vec_u32)t, 0, (unsigned int *)dst);\
    vec_ste((vec_u32)t, 4, (unsigned int *)dst);

    PUT(b0) dst += stride;
    PUT(b1) dst += stride;
    PUT(b2) dst += stride;
    PUT(b3) dst += stride;
    PUT(b4) dst += stride;
    PUT(b5) dst += stride;
    PUT(b6) dst += stride;
    PUT(b7)
    memset(block, 0, sizeof(*block) * 64);
}
| 145 | |
/**
 * Compute the 2D inverse DCT of the 8x8 coefficient block and add the
 * result to the existing pixels in dst (rows 'stride' bytes apart),
 * saturating to the unsigned 8-bit range.  The coefficient block is
 * cleared afterwards.
 *
 * NOTE(review): the vec_ste word stores below assume dst is at least
 * 4-byte aligned on every row -- confirm against callers.
 */
static void vp3_idct_add_altivec(uint8_t *dst, int stride, int16_t block[64])
{
    LOAD_ZERO;
    vec_u8 t, vdst;
    vec_s16 vdst_16;
    // Permute mask that zero-extends 8 dst bytes to 16-bit lanes: the
    // 0xFF indices (from splat_u8(-1)) pick bytes out of the zero vector
    // for each high byte, while the vec_lvsl indices select the (possibly
    // unaligned) dst bytes for each low byte.
    vec_u8 vdst_mask = vec_mergeh(vec_splat_u8(-1), vec_lvsl(0, dst));

    IDCT_START

    IDCT_1D(NOP, NOP)
    TRANSPOSE8(b0, b1, b2, b3, b4, b5, b6, b7);
    IDCT_1D(ADD8, SHIFT4)

/* Load 8 dst pixels, widen to 16 bits, add the IDCT row with signed
 * saturation, pack back to unsigned bytes and store 8 bytes as two
 * 4-byte element stores. */
#define ADD(a)\
    vdst = vec_ld(0, dst);\
    vdst_16 = (vec_s16)vec_perm(vdst, zero_u8v, vdst_mask);\
    vdst_16 = vec_adds(a, vdst_16);\
    t = vec_packsu(vdst_16, vdst_16);\
    vec_ste((vec_u32)t, 0, (unsigned int *)dst);\
    vec_ste((vec_u32)t, 4, (unsigned int *)dst);

    ADD(b0) dst += stride;
    ADD(b1) dst += stride;
    ADD(b2) dst += stride;
    ADD(b3) dst += stride;
    ADD(b4) dst += stride;
    ADD(b5) dst += stride;
    ADD(b6) dst += stride;
    ADD(b7)
    memset(block, 0, sizeof(*block) * 64);
}
| 177 | |
| 178 | #endif /* HAVE_ALTIVEC */ |
| 179 | |
| 180 | av_cold void ff_vp3dsp_init_ppc(VP3DSPContext *c, int flags) |
| 181 | { |
| 182 | #if HAVE_ALTIVEC |
| 183 | if (!PPC_ALTIVEC(av_get_cpu_flags())) |
| 184 | return; |
| 185 | |
| 186 | c->idct_put = vp3_idct_put_altivec; |
| 187 | c->idct_add = vp3_idct_add_altivec; |
| 188 | #endif |
| 189 | } |