/*
 * x86-optimized AC-3 DSP functions
 * Copyright (c) 2011 Justin Ruggles
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavutil/attributes.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/ac3.h"
#include "libavcodec/ac3dsp.h"
/*
 * Prototypes for the external (assembly-implemented) routines, one per
 * instruction-set variant. Every token below is present in the corrupted
 * original; this edit only removes the leaked line numbers and rejoins the
 * declarations that had been split mid-token, restoring valid C.
 */

/* Minimum of exponents across reused blocks. */
void ff_ac3_exponent_min_mmx   (uint8_t *exp, int num_reuse_blocks, int nb_coefs);
void ff_ac3_exponent_min_mmxext(uint8_t *exp, int num_reuse_blocks, int nb_coefs);
void ff_ac3_exponent_min_sse2  (uint8_t *exp, int num_reuse_blocks, int nb_coefs);

/* Maximum MSB of absolute int16 values. */
int ff_ac3_max_msb_abs_int16_mmx   (const int16_t *src, int len);
int ff_ac3_max_msb_abs_int16_mmxext(const int16_t *src, int len);
int ff_ac3_max_msb_abs_int16_sse2  (const int16_t *src, int len);
int ff_ac3_max_msb_abs_int16_ssse3 (const int16_t *src, int len);

/* In-place left shift of int16 buffer. */
void ff_ac3_lshift_int16_mmx (int16_t *src, unsigned int len, unsigned int shift);
void ff_ac3_lshift_int16_sse2(int16_t *src, unsigned int len, unsigned int shift);

/* In-place right shift of int32 buffer. */
void ff_ac3_rshift_int32_mmx (int32_t *src, unsigned int len, unsigned int shift);
void ff_ac3_rshift_int32_sse2(int32_t *src, unsigned int len, unsigned int shift);

/* float -> 24-bit fixed-point conversion. */
void ff_float_to_fixed24_3dnow(int32_t *dst, const float *src, unsigned int len);
void ff_float_to_fixed24_sse  (int32_t *dst, const float *src, unsigned int len);
void ff_float_to_fixed24_sse2 (int32_t *dst, const float *src, unsigned int len);

int ff_ac3_compute_mantissa_size_sse2(uint16_t mant_cnt[6][16]);

void ff_ac3_extract_exponents_3dnow(uint8_t *exp, int32_t *coef, int nb_coefs);
void ff_ac3_extract_exponents_sse2 (uint8_t *exp, int32_t *coef, int nb_coefs);
void ff_ac3_extract_exponents_ssse3(uint8_t *exp, int32_t *coef, int nb_coefs);

/* Windowing of int16 samples (round and non-round variants). */
void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
                                        const int16_t *window, unsigned int len);
void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
                                  const int16_t *window, unsigned int len);
void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
                                const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
67 #if ARCH_X86_32 && defined(__INTEL_COMPILER)
72 #if HAVE_SSE_INLINE && HAVE_7REGS
77 #define MIX5(mono, stereo) \
79 "movss 0(%1), %%xmm5 \n" \
80 "movss 8(%1), %%xmm6 \n" \
81 "movss 24(%1), %%xmm7 \n" \
82 "shufps $0, %%xmm5, %%xmm5 \n" \
83 "shufps $0, %%xmm6, %%xmm6 \n" \
84 "shufps $0, %%xmm7, %%xmm7 \n" \
86 "movaps (%0, %2), %%xmm0 \n" \
87 "movaps (%0, %3), %%xmm1 \n" \
88 "movaps (%0, %4), %%xmm2 \n" \
89 "movaps (%0, %5), %%xmm3 \n" \
90 "movaps (%0, %6), %%xmm4 \n" \
91 "mulps %%xmm5, %%xmm0 \n" \
92 "mulps %%xmm6, %%xmm1 \n" \
93 "mulps %%xmm5, %%xmm2 \n" \
94 "mulps %%xmm7, %%xmm3 \n" \
95 "mulps %%xmm7, %%xmm4 \n" \
96 stereo("addps %%xmm1, %%xmm0 \n") \
97 "addps %%xmm1, %%xmm2 \n" \
98 "addps %%xmm3, %%xmm0 \n" \
99 "addps %%xmm4, %%xmm2 \n" \
100 mono("addps %%xmm2, %%xmm0 \n") \
101 "movaps %%xmm0, (%0, %2) \n" \
102 stereo("movaps %%xmm2, (%0, %3) \n") \
107 "r"(samples[0] + len), \
108 "r"(samples[1] + len), \
109 "r"(samples[2] + len), \
110 "r"(samples[3] + len), \
111 "r"(samples[4] + len) \
112 : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
113 "%xmm4", "%xmm5", "%xmm6", "%xmm7",) \
/*
 * Generic matrixed downmix loop (SSE inline asm) for channel/matrix layouts
 * MIX5 cannot handle. Outer loop (label 1) walks the sample index %0; inner
 * loop (label 2) accumulates over input channels via the per-channel pointer
 * table and the broadcast coefficient table matrix_simd.
 * NOTE(review): the asm wrapper, both loop labels, the add/jl lines, the
 * samp operand and the closing clobber list were missing in this corrupted
 * copy and have been reconstructed from the visible operand usage
 * (%4 = matrix_simd + in_ch, %5 = -4*(in_ch-1), %6 = pointer table) —
 * confirm against upstream FFmpeg before relying on it.
 */
#define MIX_MISC(stereo)                                        \
    __asm__ volatile (                                          \
        "mov %5, %2 \n"                                         \
        "1: \n"                                                 \
        "mov -%c7(%6, %2, %c8), %3 \n"                          \
        "movaps (%3, %0), %%xmm0 \n"                            \
        stereo("movaps %%xmm0, %%xmm1 \n")                      \
        "mulps %%xmm4, %%xmm0 \n"                               \
        stereo("mulps %%xmm5, %%xmm1 \n")                       \
        "2: \n"                                                 \
        "mov (%6, %2, %c8), %1 \n"                              \
        "movaps (%1, %0), %%xmm2 \n"                            \
        stereo("movaps %%xmm2, %%xmm3 \n")                      \
        "mulps (%4, %2, 8), %%xmm2 \n"                          \
        stereo("mulps 16(%4, %2, 8), %%xmm3 \n")                \
        "addps %%xmm2, %%xmm0 \n"                               \
        stereo("addps %%xmm3, %%xmm1 \n")                       \
        "add $4, %2 \n"                                         \
        "jl 2b \n"                                              \
        "mov %5, %2 \n"                                         \
        stereo("mov (%6, %2, %c8), %1 \n")                      \
        "movaps %%xmm0, (%3, %0) \n"                            \
        stereo("movaps %%xmm1, (%1, %0) \n")                    \
        "add $16, %0 \n"                                        \
        "jl 1b \n"                                              \
        : "+&r"(i), "=&r"(j), "=&r"(k), "=&r"(m)                \
        : "r"(matrix_simd + in_ch),                             \
          "g"((intptr_t) - 4 * (in_ch - 1)),                    \
          "r"(samp + in_ch),                                    \
          "i"(sizeof(float *)), "i"(sizeof(float *)/4)          \
        : "memory"                                              \
    );
150 static void ac3_downmix_sse(float **samples
, float (*matrix
)[2],
151 int out_ch
, int in_ch
, int len
)
153 int (*matrix_cmp
)[2] = (int(*)[2])matrix
;
156 i
= -len
* sizeof(float);
157 if (in_ch
== 5 && out_ch
== 2 &&
158 !(matrix_cmp
[0][1] | matrix_cmp
[2][0] |
159 matrix_cmp
[3][1] | matrix_cmp
[4][0] |
160 (matrix_cmp
[1][0] ^ matrix_cmp
[1][1]) |
161 (matrix_cmp
[0][0] ^ matrix_cmp
[2][1]))) {
163 } else if (in_ch
== 5 && out_ch
== 1 &&
164 matrix_cmp
[0][0] == matrix_cmp
[2][0] &&
165 matrix_cmp
[3][0] == matrix_cmp
[4][0]) {
168 DECLARE_ALIGNED(16, float, matrix_simd
)[AC3_MAX_CHANNELS
][2][4];
169 float *samp
[AC3_MAX_CHANNELS
];
171 for (j
= 0; j
< in_ch
; j
++)
172 samp
[j
] = samples
[j
] + len
;
174 j
= 2 * in_ch
* sizeof(float);
178 "movss (%2, %0), %%xmm4 \n"
179 "movss 4(%2, %0), %%xmm5 \n"
180 "shufps $0, %%xmm4, %%xmm4 \n"
181 "shufps $0, %%xmm5, %%xmm5 \n"
182 "movaps %%xmm4, (%1, %0, 4) \n"
183 "movaps %%xmm5, 16(%1, %0, 4) \n"
186 : "r"(matrix_simd
), "r"(matrix
)
197 #endif /* HAVE_SSE_INLINE && HAVE_7REGS */
/*
 * Runtime CPU-feature dispatch: installs the fastest available x86
 * implementation for each AC3DSPContext function pointer. Checks run in
 * ascending ISA order (MMX, 3DNow, MMXEXT, SSE, SSE2, SSSE3), so later,
 * more capable sets overwrite the earlier assignments.
 *
 * NOTE(review): this function is truncated in this copy (its closing braces
 * lie beyond the visible text) and interior lines are missing — e.g. the
 * "if (bit_exact)" branches that originally separated the round/non-round
 * apply_window_int16 assignments, and several closing braces. The code
 * below is kept byte-identical; restore the lost lines from upstream
 * before building.
 */
199 av_cold
void ff_ac3dsp_init_x86(AC3DSPContext
*c
, int bit_exact
)
201 int cpu_flags
= av_get_cpu_flags();
/* --- MMX baseline --- */
203 if (EXTERNAL_MMX(cpu_flags
)) {
204 c
->ac3_exponent_min
= ff_ac3_exponent_min_mmx
;
205 c
->ac3_max_msb_abs_int16
= ff_ac3_max_msb_abs_int16_mmx
;
206 c
->ac3_lshift_int16
= ff_ac3_lshift_int16_mmx
;
207 c
->ac3_rshift_int32
= ff_ac3_rshift_int32_mmx
;
/* --- 3DNow: float_to_fixed24 (presumably gated on !bit_exact upstream —
 * the conditional line is missing here; confirm). --- */
209 if (EXTERNAL_AMD3DNOW(cpu_flags
)) {
211 c
->float_to_fixed24
= ff_float_to_fixed24_3dnow
;
/* --- MMXEXT --- */
214 if (EXTERNAL_MMXEXT(cpu_flags
)) {
215 c
->ac3_exponent_min
= ff_ac3_exponent_min_mmxext
;
216 c
->ac3_max_msb_abs_int16
= ff_ac3_max_msb_abs_int16_mmxext
;
/* NOTE(review): two consecutive assignments to apply_window_int16 — an
 * intervening "if (bit_exact) { ... } else {" was likely lost here. */
218 c
->apply_window_int16
= ff_apply_window_int16_mmxext
;
220 c
->apply_window_int16
= ff_apply_window_int16_round_mmxext
;
/* --- SSE --- */
223 if (EXTERNAL_SSE(cpu_flags
)) {
224 c
->float_to_fixed24
= ff_float_to_fixed24_sse
;
/* --- SSE2; SSE2SLOW CPUs keep the MMX shift versions --- */
226 if (EXTERNAL_SSE2(cpu_flags
)) {
227 c
->ac3_exponent_min
= ff_ac3_exponent_min_sse2
;
228 c
->ac3_max_msb_abs_int16
= ff_ac3_max_msb_abs_int16_sse2
;
229 c
->float_to_fixed24
= ff_float_to_fixed24_sse2
;
230 c
->compute_mantissa_size
= ff_ac3_compute_mantissa_size_sse2
;
231 c
->extract_exponents
= ff_ac3_extract_exponents_sse2
;
232 if (!(cpu_flags
& AV_CPU_FLAG_SSE2SLOW
)) {
233 c
->ac3_lshift_int16
= ff_ac3_lshift_int16_sse2
;
234 c
->ac3_rshift_int32
= ff_ac3_rshift_int32_sse2
;
/* NOTE(review): "} else if" below repeats the SSE2SLOW test — the closing
 * brace and an "if (bit_exact) {" line appear to be missing between the
 * next two assignments; confirm against upstream. */
237 c
->apply_window_int16
= ff_apply_window_int16_sse2
;
238 } else if (!(cpu_flags
& AV_CPU_FLAG_SSE2SLOW
)) {
239 c
->apply_window_int16
= ff_apply_window_int16_round_sse2
;
/* --- SSSE3; Atom gets its own windowing variant --- */
242 if (EXTERNAL_SSSE3(cpu_flags
)) {
243 c
->ac3_max_msb_abs_int16
= ff_ac3_max_msb_abs_int16_ssse3
;
244 if (cpu_flags
& AV_CPU_FLAG_ATOM
) {
245 c
->apply_window_int16
= ff_apply_window_int16_ssse3_atom
;
247 c
->extract_exponents
= ff_ac3_extract_exponents_ssse3
;
248 c
->apply_window_int16
= ff_apply_window_int16_ssse3
;
/* --- Inline-asm SSE downmix (needs 7 GPRs) --- */
252 #if HAVE_SSE_INLINE && HAVE_7REGS
253 if (INLINE_SSE(cpu_flags
)) {
254 c
->downmix
= ac3_downmix_sse
;