Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | /* |
2 | * x86-optimized AC-3 DSP functions | |
3 | * Copyright (c) 2011 Justin Ruggles | |
4 | * | |
5 | * This file is part of FFmpeg. | |
6 | * | |
7 | * FFmpeg is free software; you can redistribute it and/or | |
8 | * modify it under the terms of the GNU Lesser General Public | |
9 | * License as published by the Free Software Foundation; either | |
10 | * version 2.1 of the License, or (at your option) any later version. | |
11 | * | |
12 | * FFmpeg is distributed in the hope that it will be useful, | |
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 | * Lesser General Public License for more details. | |
16 | * | |
17 | * You should have received a copy of the GNU Lesser General Public | |
18 | * License along with FFmpeg; if not, write to the Free Software | |
19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
20 | */ | |
21 | ||
22 | #include "libavutil/attributes.h" | |
23 | #include "libavutil/mem.h" | |
24 | #include "libavutil/x86/asm.h" | |
25 | #include "libavutil/x86/cpu.h" | |
26 | #include "libavcodec/ac3.h" | |
27 | #include "libavcodec/ac3dsp.h" | |
28 | ||
/*
 * Prototypes of the x86 SIMD routines referenced by ff_ac3dsp_init_x86().
 * None of them is defined in this file; the name suffix matches the CPU
 * feature check under which the init function installs the routine
 * (presumably they live in external assembly -- the init code gates them
 * with EXTERNAL_* checks).
 */

/* AC3DSPContext.ac3_exponent_min */
void ff_ac3_exponent_min_mmx   (uint8_t *exp, int num_reuse_blocks, int nb_coefs);
void ff_ac3_exponent_min_mmxext(uint8_t *exp, int num_reuse_blocks, int nb_coefs);
void ff_ac3_exponent_min_sse2  (uint8_t *exp, int num_reuse_blocks, int nb_coefs);

/* AC3DSPContext.ac3_max_msb_abs_int16 */
int ff_ac3_max_msb_abs_int16_mmx   (const int16_t *src, int len);
int ff_ac3_max_msb_abs_int16_mmxext(const int16_t *src, int len);
int ff_ac3_max_msb_abs_int16_sse2  (const int16_t *src, int len);
int ff_ac3_max_msb_abs_int16_ssse3 (const int16_t *src, int len);

/* AC3DSPContext.ac3_lshift_int16 */
void ff_ac3_lshift_int16_mmx (int16_t *src, unsigned int len, unsigned int shift);
void ff_ac3_lshift_int16_sse2(int16_t *src, unsigned int len, unsigned int shift);

/* AC3DSPContext.ac3_rshift_int32 */
void ff_ac3_rshift_int32_mmx (int32_t *src, unsigned int len, unsigned int shift);
void ff_ac3_rshift_int32_sse2(int32_t *src, unsigned int len, unsigned int shift);

/* AC3DSPContext.float_to_fixed24 */
void ff_float_to_fixed24_3dnow(int32_t *dst, const float *src, unsigned int len);
void ff_float_to_fixed24_sse  (int32_t *dst, const float *src, unsigned int len);
void ff_float_to_fixed24_sse2 (int32_t *dst, const float *src, unsigned int len);

/* AC3DSPContext.compute_mantissa_size */
int ff_ac3_compute_mantissa_size_sse2(uint16_t mant_cnt[6][16]);

/* AC3DSPContext.extract_exponents
 * (the 3dnow variant is declared here but not installed by the init
 * function in this file) */
void ff_ac3_extract_exponents_3dnow(uint8_t *exp, int32_t *coef, int nb_coefs);
void ff_ac3_extract_exponents_sse2 (uint8_t *exp, int32_t *coef, int nb_coefs);
void ff_ac3_extract_exponents_ssse3(uint8_t *exp, int32_t *coef, int nb_coefs);

/* AC3DSPContext.apply_window_int16
 * The init function picks the "_round" variants only when bit_exact is 0. */
void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
                                        const int16_t *window, unsigned int len);
void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
                                  const int16_t *window, unsigned int len);
void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
                                const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
66 | ||
/*
 * On 32-bit x86 builds with the Intel compiler, force HAVE_7REGS to 0.
 * The inline-assembly downmix later in this file is guarded by
 * "HAVE_SSE_INLINE && HAVE_7REGS", so this compiles it out for that
 * configuration.
 */
#if ARCH_X86_32 && defined(__INTEL_COMPILER)
#   undef HAVE_7REGS
#   define HAVE_7REGS 0
#endif
71 | ||
72 | #if HAVE_SSE_INLINE && HAVE_7REGS | |
73 | ||
/*
 * Conditional-emission helpers passed as arguments to MIX5() / MIX_MISC():
 * IF1 expands to its argument, IF0 expands to nothing.  They let one macro
 * body emit or drop individual asm lines.
 */
#define IF1(x) x
#define IF0(x)
76 | ||
/*
 * MIX5(mono, stereo): inline-asm kernel used by the two 5-channel fast
 * paths of ac3_downmix_sse().
 *
 * Must be expanded in a scope that provides:
 *   i        - integer lvalue holding a negative byte offset; it is advanced
 *              by 16 per iteration and the loop ends once it is no longer
 *              negative (the body always runs at least once)
 *   matrix   - coefficient table; floats are read at byte offsets 0, 8
 *              and 24 and each is broadcast to a full vector
 *   samples  - five channel pointers; every access is relative to
 *              samples[c] + len
 *   len      - element count added to each channel pointer
 *
 * `mono` and `stereo` are function-like macros; exactly one of them should
 * expand to its argument (IF1) and the other to nothing (IF0).
 *
 * Per 4-float group, with c0 = coefficient at offset 0, c1 at offset 8 and
 * c3 at offset 24:
 *   stereo: samples[0] = ch0*c0 + ch1*c1 + ch3*c3
 *           samples[1] = ch2*c0 + ch1*c1 + ch4*c3
 *   mono:   samples[0] = (ch0*c0 + ch3*c3) + (ch2*c0 + ch1*c1 + ch4*c3)
 *
 * All loads and stores use movaps, so every accessed address has to be
 * 16-byte aligned.  The expansion already ends with ";".
 */
#define MIX5(mono, stereo)                                      \
    __asm__ volatile (                                          \
        /* broadcast the three coefficients */                  \
        "movss 0(%1), %%xmm5 \n"                                \
        "movss 8(%1), %%xmm6 \n"                                \
        "movss 24(%1), %%xmm7 \n"                               \
        "shufps $0, %%xmm5, %%xmm5 \n"                          \
        "shufps $0, %%xmm6, %%xmm6 \n"                          \
        "shufps $0, %%xmm7, %%xmm7 \n"                          \
        "1: \n"                                                 \
        /* load four floats from each of the five channels */   \
        "movaps (%0, %2), %%xmm0 \n"                            \
        "movaps (%0, %3), %%xmm1 \n"                            \
        "movaps (%0, %4), %%xmm2 \n"                            \
        "movaps (%0, %5), %%xmm3 \n"                            \
        "movaps (%0, %6), %%xmm4 \n"                            \
        "mulps %%xmm5, %%xmm0 \n"                               \
        "mulps %%xmm6, %%xmm1 \n"                               \
        "mulps %%xmm5, %%xmm2 \n"                               \
        "mulps %%xmm7, %%xmm3 \n"                               \
        "mulps %%xmm7, %%xmm4 \n"                               \
        stereo("addps %%xmm1, %%xmm0 \n")                       \
        "addps %%xmm1, %%xmm2 \n"                               \
        "addps %%xmm3, %%xmm0 \n"                               \
        "addps %%xmm4, %%xmm2 \n"                               \
        mono("addps %%xmm2, %%xmm0 \n")                         \
        /* write back in place */                               \
        "movaps %%xmm0, (%0, %2) \n"                            \
        stereo("movaps %%xmm2, (%0, %3) \n")                    \
        "add $16, %0 \n"                                        \
        "jl 1b \n"                                              \
        : "+&r"(i)                                              \
        : "r"(matrix),                                          \
          "r"(samples[0] + len),                                \
          "r"(samples[1] + len),                                \
          "r"(samples[2] + len),                                \
          "r"(samples[3] + len),                                \
          "r"(samples[4] + len)                                 \
        : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3",      \
                       "%xmm4", "%xmm5", "%xmm6", "%xmm7",)     \
          "memory"                                              \
    );
116 | ||
/*
 * MIX_MISC(stereo): generic inline-asm downmix kernel used by
 * ac3_downmix_sse() when neither 5-channel fast path applies.
 *
 * Must be expanded in a scope that provides:
 *   i            - integer lvalue holding a negative byte offset; advanced
 *                  by 16 per outer iteration until it is no longer negative
 *   j, k, m      - integer scratch lvalues (written before being read)
 *   matrix_simd  - array of [2][4] float vectors, one pair per input channel
 *                  (coefficient for output 0, then for output 1)
 *   samp         - array of per-channel pointers
 *   in_ch        - number of input channels
 *
 * `stereo` is IF1 to also produce a second output channel, IF0 otherwise.
 *
 * For each group of four floats the macro starts from samp[0] times the
 * vectors already held in %xmm4 (and %xmm5 for stereo), adds
 * samp[c] * matrix_simd[c][0] (and [c][1]) for c = 1 .. in_ch-1, then stores
 * the sums to samp[0] (and samp[1]).
 *
 * NOTE: %xmm4 / %xmm5 are inputs that are NOT loaded here and are not
 * expressed as asm operands; the caller must have left the channel-0
 * coefficient vectors in them immediately before expanding the macro.
 *
 * NOTE(review): the inner loop body runs before its counter is tested, so
 * channel 1 is always read; in_ch is presumably always >= 2 here -- confirm
 * against callers.
 *
 * All vector loads and stores use movaps / mulps memory operands, so the
 * accessed addresses have to be 16-byte aligned.  The expansion already ends
 * with ";".
 */
#define MIX_MISC(stereo)                                        \
    __asm__ volatile (                                          \
        "mov %5, %2 \n"                                         \
        "1: \n"                                                 \
        /* channel 0, scaled by the preloaded xmm4 / xmm5 */    \
        "mov -%c7(%6, %2, %c8), %3 \n"                          \
        "movaps (%3, %0), %%xmm0 \n"                            \
        stereo("movaps %%xmm0, %%xmm1 \n")                      \
        "mulps %%xmm4, %%xmm0 \n"                               \
        stereo("mulps %%xmm5, %%xmm1 \n")                       \
        "2: \n"                                                 \
        /* accumulate the remaining channels */                 \
        "mov (%6, %2, %c8), %1 \n"                              \
        "movaps (%1, %0), %%xmm2 \n"                            \
        stereo("movaps %%xmm2, %%xmm3 \n")                      \
        "mulps (%4, %2, 8), %%xmm2 \n"                          \
        stereo("mulps 16(%4, %2, 8), %%xmm3 \n")                \
        "addps %%xmm2, %%xmm0 \n"                               \
        stereo("addps %%xmm3, %%xmm1 \n")                       \
        "add $4, %2 \n"                                         \
        "jl 2b \n"                                              \
        /* rewind the channel counter and store the result */   \
        "mov %5, %2 \n"                                         \
        stereo("mov (%6, %2, %c8), %1 \n")                      \
        "movaps %%xmm0, (%3, %0) \n"                            \
        stereo("movaps %%xmm1, (%1, %0) \n")                    \
        "add $16, %0 \n"                                        \
        "jl 1b \n"                                              \
        : "+&r"(i), "=&r"(j), "=&r"(k), "=&r"(m)                \
        : "r"(matrix_simd + in_ch),                             \
          "g"((intptr_t) - 4 * (in_ch - 1)),                    \
          "r"(samp + in_ch),                                    \
          "i"(sizeof(float *)), "i"(sizeof(float *)/4)          \
        /* xmm0-xmm3 are overwritten by the code above */       \
        : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3",)     \
          "memory"                                              \
    );
149 | ||
150 | static void ac3_downmix_sse(float **samples, float (*matrix)[2], | |
151 | int out_ch, int in_ch, int len) | |
152 | { | |
153 | int (*matrix_cmp)[2] = (int(*)[2])matrix; | |
154 | intptr_t i, j, k, m; | |
155 | ||
156 | i = -len * sizeof(float); | |
157 | if (in_ch == 5 && out_ch == 2 && | |
158 | !(matrix_cmp[0][1] | matrix_cmp[2][0] | | |
159 | matrix_cmp[3][1] | matrix_cmp[4][0] | | |
160 | (matrix_cmp[1][0] ^ matrix_cmp[1][1]) | | |
161 | (matrix_cmp[0][0] ^ matrix_cmp[2][1]))) { | |
162 | MIX5(IF0, IF1); | |
163 | } else if (in_ch == 5 && out_ch == 1 && | |
164 | matrix_cmp[0][0] == matrix_cmp[2][0] && | |
165 | matrix_cmp[3][0] == matrix_cmp[4][0]) { | |
166 | MIX5(IF1, IF0); | |
167 | } else { | |
168 | DECLARE_ALIGNED(16, float, matrix_simd)[AC3_MAX_CHANNELS][2][4]; | |
169 | float *samp[AC3_MAX_CHANNELS]; | |
170 | ||
171 | for (j = 0; j < in_ch; j++) | |
172 | samp[j] = samples[j] + len; | |
173 | ||
174 | j = 2 * in_ch * sizeof(float); | |
175 | __asm__ volatile ( | |
176 | "1: \n" | |
177 | "sub $8, %0 \n" | |
178 | "movss (%2, %0), %%xmm4 \n" | |
179 | "movss 4(%2, %0), %%xmm5 \n" | |
180 | "shufps $0, %%xmm4, %%xmm4 \n" | |
181 | "shufps $0, %%xmm5, %%xmm5 \n" | |
182 | "movaps %%xmm4, (%1, %0, 4) \n" | |
183 | "movaps %%xmm5, 16(%1, %0, 4) \n" | |
184 | "jg 1b \n" | |
185 | : "+&r"(j) | |
186 | : "r"(matrix_simd), "r"(matrix) | |
187 | : "memory" | |
188 | ); | |
189 | if (out_ch == 2) { | |
190 | MIX_MISC(IF1); | |
191 | } else { | |
192 | MIX_MISC(IF0); | |
193 | } | |
194 | } | |
195 | } | |
196 | ||
197 | #endif /* HAVE_SSE_INLINE && HAVE_7REGS */ | |
198 | ||
199 | av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact) | |
200 | { | |
201 | int cpu_flags = av_get_cpu_flags(); | |
202 | ||
203 | if (EXTERNAL_MMX(cpu_flags)) { | |
204 | c->ac3_exponent_min = ff_ac3_exponent_min_mmx; | |
205 | c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmx; | |
206 | c->ac3_lshift_int16 = ff_ac3_lshift_int16_mmx; | |
207 | c->ac3_rshift_int32 = ff_ac3_rshift_int32_mmx; | |
208 | } | |
209 | if (EXTERNAL_AMD3DNOW(cpu_flags)) { | |
210 | if (!bit_exact) { | |
211 | c->float_to_fixed24 = ff_float_to_fixed24_3dnow; | |
212 | } | |
213 | } | |
214 | if (EXTERNAL_MMXEXT(cpu_flags)) { | |
215 | c->ac3_exponent_min = ff_ac3_exponent_min_mmxext; | |
216 | c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmxext; | |
217 | if (bit_exact) { | |
218 | c->apply_window_int16 = ff_apply_window_int16_mmxext; | |
219 | } else { | |
220 | c->apply_window_int16 = ff_apply_window_int16_round_mmxext; | |
221 | } | |
222 | } | |
223 | if (EXTERNAL_SSE(cpu_flags)) { | |
224 | c->float_to_fixed24 = ff_float_to_fixed24_sse; | |
225 | } | |
226 | if (EXTERNAL_SSE2(cpu_flags)) { | |
227 | c->ac3_exponent_min = ff_ac3_exponent_min_sse2; | |
228 | c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2; | |
229 | c->float_to_fixed24 = ff_float_to_fixed24_sse2; | |
230 | c->compute_mantissa_size = ff_ac3_compute_mantissa_size_sse2; | |
231 | c->extract_exponents = ff_ac3_extract_exponents_sse2; | |
232 | if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW)) { | |
233 | c->ac3_lshift_int16 = ff_ac3_lshift_int16_sse2; | |
234 | c->ac3_rshift_int32 = ff_ac3_rshift_int32_sse2; | |
235 | } | |
236 | if (bit_exact) { | |
237 | c->apply_window_int16 = ff_apply_window_int16_sse2; | |
238 | } else if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW)) { | |
239 | c->apply_window_int16 = ff_apply_window_int16_round_sse2; | |
240 | } | |
241 | } | |
242 | if (EXTERNAL_SSSE3(cpu_flags)) { | |
243 | c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_ssse3; | |
244 | if (cpu_flags & AV_CPU_FLAG_ATOM) { | |
245 | c->apply_window_int16 = ff_apply_window_int16_ssse3_atom; | |
246 | } else { | |
247 | c->extract_exponents = ff_ac3_extract_exponents_ssse3; | |
248 | c->apply_window_int16 = ff_apply_window_int16_ssse3; | |
249 | } | |
250 | } | |
251 | ||
252 | #if HAVE_SSE_INLINE && HAVE_7REGS | |
253 | if (INLINE_SSE(cpu_flags)) { | |
254 | c->downmix = ac3_downmix_sse; | |
255 | } | |
256 | #endif | |
257 | } |