Imported Debian version 2.4.3~trusty1
[deb_ffmpeg.git] / ffmpeg / libavcodec / x86 / mpegaudiodsp.c
CommitLineData
2ba45a60
DM
/*
 * SIMD-optimized MP3 decoding functions
 * Copyright (c) 2010 Vitor Sessak
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
21
22#include "libavutil/attributes.h"
23#include "libavutil/cpu.h"
24#include "libavutil/internal.h"
25#include "libavutil/x86/asm.h"
26#include "libavutil/x86/cpu.h"
27#include "libavcodec/mpegaudiodsp.h"
28
/*
 * Emit the two prototypes that belong to one CPU flavour:
 *   - ff_imdct36_float_<CPU>: non-static single-block routine; only its
 *     prototype appears in this file;
 *   - imdct36_blocks_<CPU>: static multi-block wrapper whose body is
 *     generated further down by DECL_IMDCT_BLOCKS().
 * The expansion ends with its own ';', so it is invoked without one.
 */
#define DECL(CPU)                                                             \
void ff_imdct36_float_ ## CPU(float *out, float *buf, float *in, float *win); \
static void imdct36_blocks_ ## CPU(float *out, float *buf, float *in,         \
                                   int count, int switch_point,               \
                                   int block_type);
32
33#if ARCH_X86_32
34DECL(sse)
35#endif
36DECL(sse2)
37DECL(sse3)
38DECL(ssse3)
39DECL(avx)
40
/*
 * Four-blocks-per-call IMDCT36 routines (SSE and AVX flavours).  Only the
 * prototypes live here.  The wrappers generated by DECL_IMDCT_BLOCKS() call
 * them with a window row from mdct_win_sse and a LOCAL_ALIGNED_16 scratch
 * array as tmpbuf.
 */
void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
                               float *tmpbuf);
void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win,
                               float *tmpbuf);
45
46DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];
47
48#if HAVE_6REGS && HAVE_SSE_INLINE
49
/* Multiply-accumulate: rt += ra * rb. */
#define MACS(rt, ra, rb) ((rt) += (ra) * (rb))
/* Multiply-subtract: rt -= ra * rb. */
#define MLSS(rt, ra, rb) ((rt) -= (ra) * (rb))

/*
 * Apply op (MACS or MLSS) to sum for the eight element pairs
 * (w[64 * k], p[64 * k]), k = 0..7, in increasing k order.
 *
 * Wrapped in do { } while (0) so that "SUM8(...);" is a single statement and
 * can be used as the body of an unbraced if/else.
 */
#define SUM8(op, sum, w, p)                \
do {                                       \
    op(sum, (w)[0 * 64], (p)[0 * 64]);     \
    op(sum, (w)[1 * 64], (p)[1 * 64]);     \
    op(sum, (w)[2 * 64], (p)[2 * 64]);     \
    op(sum, (w)[3 * 64], (p)[3 * 64]);     \
    op(sum, (w)[4 * 64], (p)[4 * 64]);     \
    op(sum, (w)[5 * 64], (p)[5 * 64]);     \
    op(sum, (w)[6 * 64], (p)[6 * 64]);     \
    op(sum, (w)[7 * 64], (p)[7 * 64]);     \
} while (0)
64
65static void apply_window(const float *buf, const float *win1,
66 const float *win2, float *sum1, float *sum2, int len)
67{
68 x86_reg count = - 4*len;
69 const float *win1a = win1+len;
70 const float *win2a = win2+len;
71 const float *bufa = buf+len;
72 float *sum1a = sum1+len;
73 float *sum2a = sum2+len;
74
75
76#define MULT(a, b) \
77 "movaps " #a "(%1,%0), %%xmm1 \n\t" \
78 "movaps " #a "(%3,%0), %%xmm2 \n\t" \
79 "mulps %%xmm2, %%xmm1 \n\t" \
80 "subps %%xmm1, %%xmm0 \n\t" \
81 "mulps " #b "(%2,%0), %%xmm2 \n\t" \
82 "subps %%xmm2, %%xmm4 \n\t" \
83
84 __asm__ volatile(
85 "1: \n\t"
86 "xorps %%xmm0, %%xmm0 \n\t"
87 "xorps %%xmm4, %%xmm4 \n\t"
88
89 MULT( 0, 0)
90 MULT( 256, 64)
91 MULT( 512, 128)
92 MULT( 768, 192)
93 MULT(1024, 256)
94 MULT(1280, 320)
95 MULT(1536, 384)
96 MULT(1792, 448)
97
98 "movaps %%xmm0, (%4,%0) \n\t"
99 "movaps %%xmm4, (%5,%0) \n\t"
100 "add $16, %0 \n\t"
101 "jl 1b \n\t"
102 :"+&r"(count)
103 :"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a)
104 );
105
106#undef MULT
107}
108
109static void apply_window_mp3(float *in, float *win, int *unused, float *out,
110 int incr)
111{
112 LOCAL_ALIGNED_16(float, suma, [17]);
113 LOCAL_ALIGNED_16(float, sumb, [17]);
114 LOCAL_ALIGNED_16(float, sumc, [17]);
115 LOCAL_ALIGNED_16(float, sumd, [17]);
116
117 float sum;
118
119 /* copy to avoid wrap */
120 __asm__ volatile(
121 "movaps 0(%0), %%xmm0 \n\t" \
122 "movaps 16(%0), %%xmm1 \n\t" \
123 "movaps 32(%0), %%xmm2 \n\t" \
124 "movaps 48(%0), %%xmm3 \n\t" \
125 "movaps %%xmm0, 0(%1) \n\t" \
126 "movaps %%xmm1, 16(%1) \n\t" \
127 "movaps %%xmm2, 32(%1) \n\t" \
128 "movaps %%xmm3, 48(%1) \n\t" \
129 "movaps 64(%0), %%xmm0 \n\t" \
130 "movaps 80(%0), %%xmm1 \n\t" \
131 "movaps 96(%0), %%xmm2 \n\t" \
132 "movaps 112(%0), %%xmm3 \n\t" \
133 "movaps %%xmm0, 64(%1) \n\t" \
134 "movaps %%xmm1, 80(%1) \n\t" \
135 "movaps %%xmm2, 96(%1) \n\t" \
136 "movaps %%xmm3, 112(%1) \n\t"
137 ::"r"(in), "r"(in+512)
138 :"memory"
139 );
140
141 apply_window(in + 16, win , win + 512, suma, sumc, 16);
142 apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);
143
144 SUM8(MACS, suma[0], win + 32, in + 48);
145
146 sumc[ 0] = 0;
147 sumb[16] = 0;
148 sumd[16] = 0;
149
150#define SUMS(suma, sumb, sumc, sumd, out1, out2) \
151 "movups " #sumd "(%4), %%xmm0 \n\t" \
152 "shufps $0x1b, %%xmm0, %%xmm0 \n\t" \
153 "subps " #suma "(%1), %%xmm0 \n\t" \
154 "movaps %%xmm0," #out1 "(%0) \n\t" \
155\
156 "movups " #sumc "(%3), %%xmm0 \n\t" \
157 "shufps $0x1b, %%xmm0, %%xmm0 \n\t" \
158 "addps " #sumb "(%2), %%xmm0 \n\t" \
159 "movaps %%xmm0," #out2 "(%0) \n\t"
160
161 if (incr == 1) {
162 __asm__ volatile(
163 SUMS( 0, 48, 4, 52, 0, 112)
164 SUMS(16, 32, 20, 36, 16, 96)
165 SUMS(32, 16, 36, 20, 32, 80)
166 SUMS(48, 0, 52, 4, 48, 64)
167
168 :"+&r"(out)
169 :"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0])
170 :"memory"
171 );
172 out += 16*incr;
173 } else {
174 int j;
175 float *out2 = out + 32 * incr;
176 out[0 ] = -suma[ 0];
177 out += incr;
178 out2 -= incr;
179 for(j=1;j<16;j++) {
180 *out = -suma[ j] + sumd[16-j];
181 *out2 = sumb[16-j] + sumc[ j];
182 out += incr;
183 out2 -= incr;
184 }
185 }
186
187 sum = 0;
188 SUM8(MLSS, sum, win + 16 + 32, in + 32);
189 *out = sum;
190}
191
192#endif /* HAVE_6REGS && HAVE_SSE_INLINE */
193
194#if HAVE_YASM
/*
 * Generate imdct36_blocks_<CPU1>(), which runs the IMDCT36 over count blocks.
 *
 * Complete groups of four blocks are handed to
 * ff_four_imdct36_float_<CPU2>() with a window row from mdct_win_sse; the
 * (count & 3) blocks left over go one by one through
 * ff_imdct36_float_<CPU1>() with a row of ff_mdct_win_float.
 */
#define DECL_IMDCT_BLOCKS(CPU1, CPU2)                                        \
static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in,       \
                                    int count, int switch_point,             \
                                    int block_type)                          \
{                                                                            \
    const int grouped = count & ~3;                                          \
    int blk = 0;                                                             \
                                                                             \
    /* four blocks per call */                                               \
    while (blk < grouped) {                                                  \
        LOCAL_ALIGNED_16(float, tmpbuf, [1024]);                             \
        /* table [1] only for the first group when switch_point is set */    \
        float *win = mdct_win_sse[switch_point && blk < 4][block_type];      \
                                                                             \
        ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf);           \
                                                                             \
        in  += 4 * 18;                                                       \
        buf += 4 * 18;                                                       \
        out += 4;                                                            \
        blk += 4;                                                            \
    }                                                                        \
                                                                             \
    /* leftover blocks, one per call */                                      \
    while (blk < count) {                                                    \
        /* window 0 for the first two blocks when switch_point is set; */    \
        /* odd-numbered blocks use the row four entries further on */        \
        const int win_idx = (switch_point && blk < 2) ? 0 : block_type;      \
        float *win = ff_mdct_win_float[win_idx + ((blk & 1) ? 4 : 0)];       \
                                                                             \
        ff_imdct36_float_ ## CPU1(out, buf, in, win);                        \
                                                                             \
        in += 18;                                                            \
        buf++;                                                               \
        out++;                                                               \
        blk++;                                                               \
    }                                                                        \
}
226
/*
 * Instantiate the multi-block wrappers.  The sse2, sse3 and ssse3 wrappers
 * share the SSE four-block routine and differ only in the single-block
 * routine they call for leftover blocks.
 */
#if HAVE_SSE
DECL_IMDCT_BLOCKS(sse2,  sse)
DECL_IMDCT_BLOCKS(sse3,  sse)
DECL_IMDCT_BLOCKS(ssse3, sse)
#if ARCH_X86_32
DECL_IMDCT_BLOCKS(sse,   sse)
#endif /* ARCH_X86_32 */
#endif /* HAVE_SSE */

#if HAVE_AVX_EXTERNAL
DECL_IMDCT_BLOCKS(avx, avx)
#endif /* HAVE_AVX_EXTERNAL */
238#endif /* HAVE_YASM */
239
240av_cold void ff_mpadsp_init_x86(MPADSPContext *s)
241{
242 int cpu_flags = av_get_cpu_flags();
243
244 int i, j;
245 for (j = 0; j < 4; j++) {
246 for (i = 0; i < 40; i ++) {
247 mdct_win_sse[0][j][4*i ] = ff_mdct_win_float[j ][i];
248 mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i];
249 mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j ][i];
250 mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
251 mdct_win_sse[1][j][4*i ] = ff_mdct_win_float[0 ][i];
252 mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4 ][i];
253 mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j ][i];
254 mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
255 }
256 }
257
258#if HAVE_6REGS && HAVE_SSE_INLINE
259 if (INLINE_SSE(cpu_flags)) {
260 s->apply_window_float = apply_window_mp3;
261 }
262#endif /* HAVE_SSE_INLINE */
263
264#if HAVE_YASM
265#if HAVE_SSE
266#if ARCH_X86_32
267 if (EXTERNAL_SSE(cpu_flags)) {
268 s->imdct36_blocks_float = imdct36_blocks_sse;
269 }
270#endif
271 if (EXTERNAL_SSE2(cpu_flags)) {
272 s->imdct36_blocks_float = imdct36_blocks_sse2;
273 }
274 if (EXTERNAL_SSE3(cpu_flags)) {
275 s->imdct36_blocks_float = imdct36_blocks_sse3;
276 }
277 if (EXTERNAL_SSSE3(cpu_flags)) {
278 s->imdct36_blocks_float = imdct36_blocks_ssse3;
279 }
280#endif
281#if HAVE_AVX_EXTERNAL
282 if (EXTERNAL_AVX(cpu_flags)) {
283 s->imdct36_blocks_float = imdct36_blocks_avx;
284 }
285#endif
286#endif /* HAVE_YASM */
287}