/*
 * x86-optimized AC-3 DSP functions
 * Copyright (c) 2011 Justin Ruggles
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
| 21 | |
| 22 | #include "libavutil/attributes.h" |
| 23 | #include "libavutil/mem.h" |
| 24 | #include "libavutil/x86/asm.h" |
| 25 | #include "libavutil/x86/cpu.h" |
| 26 | #include "libavcodec/ac3.h" |
| 27 | #include "libavcodec/ac3dsp.h" |
| 28 | |
/*
 * Prototypes of the instruction-set specific routines that
 * ff_ac3dsp_init_x86() installs into an AC3DSPContext. None of them is
 * defined in this file; the suffix names the CPU extension under which the
 * init function selects the routine.
 */

/* Candidates for AC3DSPContext.ac3_exponent_min. */
void ff_ac3_exponent_min_mmx (uint8_t *exp, int num_reuse_blocks, int nb_coefs);
void ff_ac3_exponent_min_mmxext(uint8_t *exp, int num_reuse_blocks, int nb_coefs);
void ff_ac3_exponent_min_sse2 (uint8_t *exp, int num_reuse_blocks, int nb_coefs);

/* Candidates for AC3DSPContext.ac3_max_msb_abs_int16. */
int ff_ac3_max_msb_abs_int16_mmx (const int16_t *src, int len);
int ff_ac3_max_msb_abs_int16_mmxext(const int16_t *src, int len);
int ff_ac3_max_msb_abs_int16_sse2 (const int16_t *src, int len);
int ff_ac3_max_msb_abs_int16_ssse3(const int16_t *src, int len);

/* Candidates for AC3DSPContext.ac3_lshift_int16. */
void ff_ac3_lshift_int16_mmx (int16_t *src, unsigned int len, unsigned int shift);
void ff_ac3_lshift_int16_sse2(int16_t *src, unsigned int len, unsigned int shift);

/* Candidates for AC3DSPContext.ac3_rshift_int32. */
void ff_ac3_rshift_int32_mmx (int32_t *src, unsigned int len, unsigned int shift);
void ff_ac3_rshift_int32_sse2(int32_t *src, unsigned int len, unsigned int shift);

/* Candidates for AC3DSPContext.float_to_fixed24. */
void ff_float_to_fixed24_3dnow(int32_t *dst, const float *src, unsigned int len);
void ff_float_to_fixed24_sse (int32_t *dst, const float *src, unsigned int len);
void ff_float_to_fixed24_sse2 (int32_t *dst, const float *src, unsigned int len);

/* Candidate for AC3DSPContext.compute_mantissa_size. */
int ff_ac3_compute_mantissa_size_sse2(uint16_t mant_cnt[6][16]);

/* Candidates for AC3DSPContext.extract_exponents. */
void ff_ac3_extract_exponents_3dnow(uint8_t *exp, int32_t *coef, int nb_coefs);
void ff_ac3_extract_exponents_sse2 (uint8_t *exp, int32_t *coef, int nb_coefs);
void ff_ac3_extract_exponents_ssse3(uint8_t *exp, int32_t *coef, int nb_coefs);

/*
 * Candidates for AC3DSPContext.apply_window_int16. The init function picks
 * the "_round" variants only when bit_exact is not requested.
 */
void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
                                        const int16_t *window, unsigned int len);
void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
                                  const int16_t *window, unsigned int len);
void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
                                const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
| 66 | |
/*
 * On 32-bit x86 builds with the Intel compiler, force HAVE_7REGS to 0.
 * Everything in this file that is guarded by HAVE_SSE_INLINE && HAVE_7REGS
 * (the inline-assembly downmix and its registration) is then compiled out.
 * NOTE(review): presumably that compiler cannot satisfy the register demands
 * of the inline asm on x86-32 -- confirm.
 */
#if ARCH_X86_32 && defined(__INTEL_COMPILER)
# undef HAVE_7REGS
# define HAVE_7REGS 0
#endif
| 71 | |
| 72 | #if HAVE_SSE_INLINE && HAVE_7REGS |
| 73 | |
/*
 * Conditional-expansion helpers. IF1 passes its argument through unchanged,
 * IF0 swallows it. They are handed to MIX5 / MIX_MISC as macro arguments to
 * switch individual assembly lines on or off.
 */
#define IF1(code) code
#define IF0(code)
| 76 | |
/**
 * Inline-SSE fast path of ac3_downmix_sse() for five input channels.
 *
 * Expands to a single asm statement (including its trailing ';') that walks
 * the channel buffers four floats at a time and writes the result in place.
 * Only three coefficients are read: matrix[0][0], matrix[1][0] and
 * matrix[3][0] (byte offsets 0, 8 and 24 from matrix).
 *
 * With A = matrix[0][0], B = matrix[1][0], C = matrix[3][0] and s0..s4 the
 * five inputs:
 *   stereo (mono = IF0, stereo = IF1):
 *       samples[0] = s0 * A + s1 * B + s3 * C
 *       samples[1] = s2 * A + s1 * B + s4 * C
 *   mono (mono = IF1, stereo = IF0):
 *       samples[0] = s0 * A + s3 * C + (s2 * A + s1 * B + s4 * C)
 *
 * The macro does not look at any other matrix entry, so it is only a valid
 * downmix when the remaining coefficients match this pattern; checking that
 * is the job of the code that expands it.
 *
 * @param mono    IF1 to emit the mono-only instruction, IF0 to drop it
 * @param stereo  IF1 to emit the stereo-only instructions, IF0 to drop them
 *
 * Names taken from the expansion site:
 *   i        intptr_t, negative byte offset relative to the end of each
 *            channel; advanced by 16 per iteration, the loop stops once it
 *            is no longer negative (the body always runs at least once)
 *   matrix, samples, len
 *
 * All loads and stores of sample data use movaps, so the accessed addresses
 * have to be 16-byte aligned.
 */
#define MIX5(mono, stereo) \
    __asm__ volatile ( \
        /* broadcast the three coefficients to all four lanes */ \
        "movss 0(%1), %%xmm5 \n" \
        "movss 8(%1), %%xmm6 \n" \
        "movss 24(%1), %%xmm7 \n" \
        "shufps $0, %%xmm5, %%xmm5 \n" \
        "shufps $0, %%xmm6, %%xmm6 \n" \
        "shufps $0, %%xmm7, %%xmm7 \n" \
        "1: \n" \
        /* xmm0..xmm4 = four samples of channels 0..4 */ \
        "movaps (%0, %2), %%xmm0 \n" \
        "movaps (%0, %3), %%xmm1 \n" \
        "movaps (%0, %4), %%xmm2 \n" \
        "movaps (%0, %5), %%xmm3 \n" \
        "movaps (%0, %6), %%xmm4 \n" \
        "mulps %%xmm5, %%xmm0 \n" \
        "mulps %%xmm6, %%xmm1 \n" \
        "mulps %%xmm5, %%xmm2 \n" \
        "mulps %%xmm7, %%xmm3 \n" \
        "mulps %%xmm7, %%xmm4 \n" \
        stereo("addps %%xmm1, %%xmm0 \n") \
        "addps %%xmm1, %%xmm2 \n" \
        "addps %%xmm3, %%xmm0 \n" \
        "addps %%xmm4, %%xmm2 \n" \
        mono("addps %%xmm2, %%xmm0 \n") \
        /* results overwrite channel 0 (and channel 1 for stereo) */ \
        "movaps %%xmm0, (%0, %2) \n" \
        stereo("movaps %%xmm2, (%0, %3) \n") \
        "add $16, %0 \n" \
        "jl 1b \n" \
        : "+&r"(i) \
        : "r"(matrix), \
          "r"(samples[0] + len), \
          "r"(samples[1] + len), \
          "r"(samples[2] + len), \
          "r"(samples[3] + len), \
          "r"(samples[4] + len) \
        : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
                       "%xmm4", "%xmm5", "%xmm6", "%xmm7",) \
          "memory" \
    );
| 116 | |
/**
 * Generic inline-SSE downmix kernel of ac3_downmix_sse(), used for every
 * layout that MIX5 does not handle.
 *
 * Expands to a single asm statement (including its trailing ';'). For each
 * group of four samples it computes
 *     out0 = samp[0] * xmm4 + sum(ch = 1 .. in_ch-1) samp[ch] * matrix_simd[ch][0]
 *     out1 = samp[0] * xmm5 + sum(ch = 1 .. in_ch-1) samp[ch] * matrix_simd[ch][1]
 * (out1 only when stereo is IF1) and stores out0 over channel 0 and out1
 * over channel 1.
 *
 * @param stereo  IF1 to emit the second-output instructions, IF0 to omit them
 *
 * Register contract:
 *   - xmm4 / xmm5 are read but never loaded here; they must already hold the
 *     broadcast channel-0 coefficients when the statement is entered.
 *   - xmm0..xmm3 are overwritten although only "memory" is declared as
 *     clobbered.
 *
 * Operands (names are taken from the expansion site):
 *   %0  i   negative byte offset from the end of each channel, +16 per outer
 *           iteration until it is no longer negative
 *   %1  j   scratch: pointer of the input channel being accumulated, later
 *           the channel-1 pointer used for the stereo store
 *   %2  k   channel counter, runs from -4 * (in_ch - 1) up to 0 in steps of 4
 *   %3  m   pointer of channel 0
 *   %4      matrix_simd + in_ch; (%4, %2, 8) addresses the 32-byte entry of
 *           channel in_ch + k / 4
 *   %5      start value of k
 *   %6      samp + in_ch; (%6, %2, %c8) addresses samp[in_ch + k / 4]
 *   %7, %8  sizeof(float *) and sizeof(float *) / 4 as immediates, so the
 *           same k indexes the pointer array for any pointer size
 *
 * Both loops test their condition at the bottom, so the inner loop runs once
 * even when k starts at 0; NOTE(review): this looks like it requires
 * in_ch >= 2 -- confirm against the callers.
 */
#define MIX_MISC(stereo) \
    __asm__ volatile ( \
        "mov %5, %2 \n" \
        "1: \n" \
        /* k is at its start value here, so this loads samp[0] */ \
        "mov -%c7(%6, %2, %c8), %3 \n" \
        "movaps (%3, %0), %%xmm0 \n" \
        stereo("movaps %%xmm0, %%xmm1 \n") \
        "mulps %%xmm4, %%xmm0 \n" \
        stereo("mulps %%xmm5, %%xmm1 \n") \
        "2: \n" \
        /* accumulate the remaining channels 1 .. in_ch-1 */ \
        "mov (%6, %2, %c8), %1 \n" \
        "movaps (%1, %0), %%xmm2 \n" \
        stereo("movaps %%xmm2, %%xmm3 \n") \
        "mulps (%4, %2, 8), %%xmm2 \n" \
        stereo("mulps 16(%4, %2, 8), %%xmm3 \n") \
        "addps %%xmm2, %%xmm0 \n" \
        stereo("addps %%xmm3, %%xmm1 \n") \
        "add $4, %2 \n" \
        "jl 2b \n" \
        /* rewind k; with k rewound the next load yields samp[1] */ \
        "mov %5, %2 \n" \
        stereo("mov (%6, %2, %c8), %1 \n") \
        "movaps %%xmm0, (%3, %0) \n" \
        stereo("movaps %%xmm1, (%1, %0) \n") \
        "add $16, %0 \n" \
        "jl 1b \n" \
        : "+&r"(i), "=&r"(j), "=&r"(k), "=&r"(m) \
        : "r"(matrix_simd + in_ch), \
          "g"((intptr_t) - 4 * (in_ch - 1)), \
          "r"(samp + in_ch), \
          "i"(sizeof(float *)), "i"(sizeof(float *)/4) \
        : "memory" \
    );
| 149 | |
| 150 | static void ac3_downmix_sse(float **samples, float (*matrix)[2], |
| 151 | int out_ch, int in_ch, int len) |
| 152 | { |
| 153 | int (*matrix_cmp)[2] = (int(*)[2])matrix; |
| 154 | intptr_t i, j, k, m; |
| 155 | |
| 156 | i = -len * sizeof(float); |
| 157 | if (in_ch == 5 && out_ch == 2 && |
| 158 | !(matrix_cmp[0][1] | matrix_cmp[2][0] | |
| 159 | matrix_cmp[3][1] | matrix_cmp[4][0] | |
| 160 | (matrix_cmp[1][0] ^ matrix_cmp[1][1]) | |
| 161 | (matrix_cmp[0][0] ^ matrix_cmp[2][1]))) { |
| 162 | MIX5(IF0, IF1); |
| 163 | } else if (in_ch == 5 && out_ch == 1 && |
| 164 | matrix_cmp[0][0] == matrix_cmp[2][0] && |
| 165 | matrix_cmp[3][0] == matrix_cmp[4][0]) { |
| 166 | MIX5(IF1, IF0); |
| 167 | } else { |
| 168 | DECLARE_ALIGNED(16, float, matrix_simd)[AC3_MAX_CHANNELS][2][4]; |
| 169 | float *samp[AC3_MAX_CHANNELS]; |
| 170 | |
| 171 | for (j = 0; j < in_ch; j++) |
| 172 | samp[j] = samples[j] + len; |
| 173 | |
| 174 | j = 2 * in_ch * sizeof(float); |
| 175 | __asm__ volatile ( |
| 176 | "1: \n" |
| 177 | "sub $8, %0 \n" |
| 178 | "movss (%2, %0), %%xmm4 \n" |
| 179 | "movss 4(%2, %0), %%xmm5 \n" |
| 180 | "shufps $0, %%xmm4, %%xmm4 \n" |
| 181 | "shufps $0, %%xmm5, %%xmm5 \n" |
| 182 | "movaps %%xmm4, (%1, %0, 4) \n" |
| 183 | "movaps %%xmm5, 16(%1, %0, 4) \n" |
| 184 | "jg 1b \n" |
| 185 | : "+&r"(j) |
| 186 | : "r"(matrix_simd), "r"(matrix) |
| 187 | : "memory" |
| 188 | ); |
| 189 | if (out_ch == 2) { |
| 190 | MIX_MISC(IF1); |
| 191 | } else { |
| 192 | MIX_MISC(IF0); |
| 193 | } |
| 194 | } |
| 195 | } |
| 196 | |
| 197 | #endif /* HAVE_SSE_INLINE && HAVE_7REGS */ |
| 198 | |
| 199 | av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact) |
| 200 | { |
| 201 | int cpu_flags = av_get_cpu_flags(); |
| 202 | |
| 203 | if (EXTERNAL_MMX(cpu_flags)) { |
| 204 | c->ac3_exponent_min = ff_ac3_exponent_min_mmx; |
| 205 | c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmx; |
| 206 | c->ac3_lshift_int16 = ff_ac3_lshift_int16_mmx; |
| 207 | c->ac3_rshift_int32 = ff_ac3_rshift_int32_mmx; |
| 208 | } |
| 209 | if (EXTERNAL_AMD3DNOW(cpu_flags)) { |
| 210 | if (!bit_exact) { |
| 211 | c->float_to_fixed24 = ff_float_to_fixed24_3dnow; |
| 212 | } |
| 213 | } |
| 214 | if (EXTERNAL_MMXEXT(cpu_flags)) { |
| 215 | c->ac3_exponent_min = ff_ac3_exponent_min_mmxext; |
| 216 | c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmxext; |
| 217 | if (bit_exact) { |
| 218 | c->apply_window_int16 = ff_apply_window_int16_mmxext; |
| 219 | } else { |
| 220 | c->apply_window_int16 = ff_apply_window_int16_round_mmxext; |
| 221 | } |
| 222 | } |
| 223 | if (EXTERNAL_SSE(cpu_flags)) { |
| 224 | c->float_to_fixed24 = ff_float_to_fixed24_sse; |
| 225 | } |
| 226 | if (EXTERNAL_SSE2(cpu_flags)) { |
| 227 | c->ac3_exponent_min = ff_ac3_exponent_min_sse2; |
| 228 | c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2; |
| 229 | c->float_to_fixed24 = ff_float_to_fixed24_sse2; |
| 230 | c->compute_mantissa_size = ff_ac3_compute_mantissa_size_sse2; |
| 231 | c->extract_exponents = ff_ac3_extract_exponents_sse2; |
| 232 | if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW)) { |
| 233 | c->ac3_lshift_int16 = ff_ac3_lshift_int16_sse2; |
| 234 | c->ac3_rshift_int32 = ff_ac3_rshift_int32_sse2; |
| 235 | } |
| 236 | if (bit_exact) { |
| 237 | c->apply_window_int16 = ff_apply_window_int16_sse2; |
| 238 | } else if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW)) { |
| 239 | c->apply_window_int16 = ff_apply_window_int16_round_sse2; |
| 240 | } |
| 241 | } |
| 242 | if (EXTERNAL_SSSE3(cpu_flags)) { |
| 243 | c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_ssse3; |
| 244 | if (cpu_flags & AV_CPU_FLAG_ATOM) { |
| 245 | c->apply_window_int16 = ff_apply_window_int16_ssse3_atom; |
| 246 | } else { |
| 247 | c->extract_exponents = ff_ac3_extract_exponents_ssse3; |
| 248 | c->apply_window_int16 = ff_apply_window_int16_ssse3; |
| 249 | } |
| 250 | } |
| 251 | |
| 252 | #if HAVE_SSE_INLINE && HAVE_7REGS |
| 253 | if (INLINE_SSE(cpu_flags)) { |
| 254 | c->downmix = ac3_downmix_sse; |
| 255 | } |
| 256 | #endif |
| 257 | } |