[deb_ffmpeg.git] / ffmpeg / libavcodec / x86 / mpegaudiodsp.c

/*
 * SIMD-optimized MP3 decoding functions
 * Copyright (c) 2010 Vitor Sessak
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/internal.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/mpegaudiodsp.h"

/*
 * Forward declarations for one CPU flavour:
 *  - imdct36_blocks_<CPU>():   static wrapper, defined further down by
 *                              DECL_IMDCT_BLOCKS() when HAVE_YASM is set;
 *  - ff_imdct36_float_<CPU>(): single-block routine with external linkage,
 *                              presumably provided by the external assembly
 *                              (TODO confirm against the .asm source).
 *
 * NOTE(review): the static prototypes are emitted unconditionally, while the
 * matching definitions exist only under HAVE_YASM (and HAVE_SSE /
 * HAVE_AVX_EXTERNAL).
 */
#define DECL(CPU)\
static void imdct36_blocks_ ## CPU(float *out, float *buf, float *in, int count, int switch_point, int block_type);\
void ff_imdct36_float_ ## CPU(float *out, float *buf, float *in, float *win);

/* The plain SSE flavour is only declared for 32-bit x86 builds. */
#if ARCH_X86_32
DECL(sse)
#endif
DECL(sse2)
DECL(sse3)
DECL(ssse3)
DECL(avx)

/*
 * Four-blocks-at-a-time variants, called from the imdct36_blocks_*()
 * wrappers below.  Besides the out/buf/in/win arguments of the single-block
 * routines they take a caller-provided scratch buffer (the wrappers pass a
 * 16-byte aligned array of 1024 floats).
 */
void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win,
                               float *tmpbuf);
void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
                               float *tmpbuf);

/*
 * Interleaved copy of ff_mdct_win_float used by the four-block IMDCT
 * routines; filled in by ff_mpadsp_init_x86().
 * Indexing: [uses switch-point windows][block type][4 * coefficient + lane].
 */
DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];

#if HAVE_6REGS && HAVE_SSE_INLINE

/* Multiply-accumulate: rt += ra * rb. */
#define MACS(rt, ra, rb) ((rt) += (ra) * (rb))
/* Multiply-subtract: rt -= ra * rb. */
#define MLSS(rt, ra, rb) ((rt) -= (ra) * (rb))

/*
 * Apply `op` (MACS or MLSS) to `sum` for eight coefficient/sample pairs taken
 * from w and p at a stride of 64 elements: indices 0, 64, 128, ..., 448.
 *
 * Wrapped in do { } while (0) so that the macro behaves like a single
 * statement, e.g. as the unbraced body of an if/else.
 */
#define SUM8(op, sum, w, p)               \
do {                                      \
    op(sum, (w)[0 * 64], (p)[0 * 64]);    \
    op(sum, (w)[1 * 64], (p)[1 * 64]);    \
    op(sum, (w)[2 * 64], (p)[2 * 64]);    \
    op(sum, (w)[3 * 64], (p)[3 * 64]);    \
    op(sum, (w)[4 * 64], (p)[4 * 64]);    \
    op(sum, (w)[5 * 64], (p)[5 * 64]);    \
    op(sum, (w)[6 * 64], (p)[6 * 64]);    \
    op(sum, (w)[7 * 64], (p)[7 * 64]);    \
} while (0)

/**
 * Compute two negated, windowed sums over eight taps with SSE.
 *
 * For every i in [0, len):
 *   sum1[i] = -(win1[i + 64*0] * buf[i + 64*0] + ... + win1[i + 64*7] * buf[i + 64*7])
 *   sum2[i] = -(win2[i + 16*0] * buf[i + 64*0] + ... + win2[i + 16*7] * buf[i + 64*7])
 *
 * Four floats are handled per iteration with aligned SSE accesses (movaps and
 * mulps with a memory operand), so buf, win1, win2, sum1 and sum2 must all be
 * 16-byte aligned and len must be a positive multiple of 4.  The loop body
 * runs at least once regardless of len.
 *
 * @param buf  input samples
 * @param win1 window coefficients, tap stride 64 floats
 * @param win2 window coefficients, tap stride 16 floats
 * @param sum1 receives len results computed with win1
 * @param sum2 receives len results computed with win2
 * @param len  number of results to produce per output array
 */
static void apply_window(const float *buf, const float *win1,
                         const float *win2, float *sum1, float *sum2, int len)
{
    /* Negative byte offset that counts up to zero; every pointer below is
     * pre-advanced by len elements so that (ptr, count) starts at element 0. */
    x86_reg count = - 4*len;
    const float *win1a = win1+len;
    const float *win2a = win2+len;
    const float *bufa  = buf+len;
    float *sum1a = sum1+len;
    float *sum2a = sum2+len;

/* One tap: a is the byte offset into win1/buf, b the byte offset into win2.
 * xmm0 accumulates -win1*buf, xmm4 accumulates -win2*buf. */
#define MULT(a, b)                                 \
    "movaps " #a "(%1,%0), %%xmm1           \n\t"  \
    "movaps " #a "(%3,%0), %%xmm2           \n\t"  \
    "mulps         %%xmm2, %%xmm1           \n\t"  \
    "subps         %%xmm1, %%xmm0           \n\t"  \
    "mulps  " #b "(%2,%0), %%xmm2           \n\t"  \
    "subps         %%xmm2, %%xmm4           \n\t"

    __asm__ volatile(
            "1:                                   \n\t"
            "xorps       %%xmm0, %%xmm0           \n\t"
            "xorps       %%xmm4, %%xmm4           \n\t"

            MULT(   0,   0)
            MULT( 256,  64)
            MULT( 512, 128)
            MULT( 768, 192)
            MULT(1024, 256)
            MULT(1280, 320)
            MULT(1536, 384)
            MULT(1792, 448)

            "movaps      %%xmm0, (%4,%0)          \n\t"
            "movaps      %%xmm4, (%5,%0)          \n\t"
            "add            $16,  %0              \n\t"
            "jl              1b                   \n\t"
            :"+&r"(count)
            :"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a)
            /* The asm stores through sum1a/sum2a, which are passed only as
             * register inputs; the "memory" clobber tells the compiler that
             * those arrays change so it cannot cache or drop accesses to
             * them around this statement. */
            :"memory"
            /* NOTE(review): xmm0, xmm1, xmm2 and xmm4 are modified but not
             * listed as clobbers; if libavutil/x86/asm.h in this tree
             * provides an XMM clobber helper it should be used here. */
            );

#undef MULT
}

/**
 * SSE windowing routine installed as MPADSPContext.apply_window_float by
 * ff_mpadsp_init_x86().
 *
 * Writes 32 output samples at out[0], out[incr], ..., out[31 * incr].
 *
 * @param in     sample buffer; its first 32 floats are copied to
 *               in[512..543] before windowing, so it must be writable and,
 *               because movaps is used, 16-byte aligned
 * @param win    window coefficients; read through apply_window() and SUM8()
 *               (aligned SSE accesses, so 16-byte alignment is required)
 * @param unused ignored by this implementation
 * @param out    destination samples; when incr == 1 they are stored with
 *               movaps and out must therefore be 16-byte aligned
 * @param incr   distance, in floats, between consecutive output samples
 */
static void apply_window_mp3(float *in, float *win, int *unused, float *out,
                             int incr)
{
    /* Partial sums produced by apply_window(); 17 entries because the
     * reversed unaligned loads in the incr == 1 path touch index 16. */
    LOCAL_ALIGNED_16(float, suma, [17]);
    LOCAL_ALIGNED_16(float, sumb, [17]);
    LOCAL_ALIGNED_16(float, sumc, [17]);
    LOCAL_ALIGNED_16(float, sumd, [17]);

    float sum;

    /* copy to avoid wrap */
    /* Duplicates in[0..31] (8 x 16 bytes) at in[512..543]. */
    __asm__ volatile(
            "movaps    0(%0), %%xmm0   \n\t" \
            "movaps   16(%0), %%xmm1   \n\t" \
            "movaps   32(%0), %%xmm2   \n\t" \
            "movaps   48(%0), %%xmm3   \n\t" \
            "movaps   %%xmm0,   0(%1) \n\t" \
            "movaps   %%xmm1,  16(%1) \n\t" \
            "movaps   %%xmm2,  32(%1) \n\t" \
            "movaps   %%xmm3,  48(%1) \n\t" \
            "movaps   64(%0), %%xmm0   \n\t" \
            "movaps   80(%0), %%xmm1   \n\t" \
            "movaps   96(%0), %%xmm2   \n\t" \
            "movaps  112(%0), %%xmm3   \n\t" \
            "movaps   %%xmm0,  64(%1) \n\t" \
            "movaps   %%xmm1,  80(%1) \n\t" \
            "movaps   %%xmm2,  96(%1) \n\t" \
            "movaps   %%xmm3, 112(%1) \n\t"
            ::"r"(in), "r"(in+512)
            :"memory"
            );

    /* Each call fills 16 entries of two sum arrays (indices 0..15). */
    apply_window(in + 16, win     , win + 512, suma, sumc, 16);
    apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);

    /* Extra eight-tap term added to suma[0] only. */
    SUM8(MACS, suma[0], win + 32, in + 48);

    /* Zero the entries that pair with nothing in the combination below. */
    sumc[ 0] = 0;
    sumb[16] = 0;
    sumd[16] = 0;

/*
 * Combine four lanes at a time; all arguments are byte offsets.
 *   out[out1 ...] = reversed(sumd[sumd ...]) - suma[suma ...]
 *   out[out2 ...] = reversed(sumc[sumc ...]) + sumb[sumb ...]
 * shufps $0x1b reverses the four floats of the register; the reversed
 * operands are loaded with movups because their offsets are not multiples
 * of 16.
 */
#define SUMS(suma, sumb, sumc, sumd, out1, out2)               \
            "movups " #sumd "(%4),       %%xmm0          \n\t" \
            "shufps         $0x1b,       %%xmm0, %%xmm0  \n\t" \
            "subps  " #suma "(%1),       %%xmm0          \n\t" \
            "movaps        %%xmm0," #out1 "(%0)          \n\t" \
\
            "movups " #sumc "(%3),       %%xmm0          \n\t" \
            "shufps         $0x1b,       %%xmm0, %%xmm0  \n\t" \
            "addps  " #sumb "(%2),       %%xmm0          \n\t" \
            "movaps        %%xmm0," #out2 "(%0)          \n\t"

    if (incr == 1) {
        /* Contiguous output: vectorised form of the scalar loop in the else
         * branch.  The last SUMS also stores a value to out[16] (computed
         * from sumc[16], which is never written); that lane is overwritten
         * by the final "*out = sum" below. */
        __asm__ volatile(
            SUMS( 0, 48,  4, 52,  0, 112)
            SUMS(16, 32, 20, 36, 16,  96)
            SUMS(32, 16, 36, 20, 32,  80)
            SUMS(48,  0, 52,  4, 48,  64)

            :"+&r"(out)
            :"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0])
            :"memory"
            );
        /* Leave out pointing at sample 16, as the scalar path does. */
        out += 16*incr;
    } else {
        /* Strided output: out walks forward over samples 0..15 while out2
         * walks backward over samples 31..17. */
        int j;
        float *out2 = out + 32 * incr;
        out[0  ]  = -suma[   0];
        out += incr;
        out2 -= incr;
        for(j=1;j<16;j++) {
            *out  = -suma[   j] + sumd[16-j];
            *out2 =  sumb[16-j] + sumc[   j];
            out  += incr;
            out2 -= incr;
        }
    }

    /* Sample 16 is a separate negated eight-tap sum; out points at it in
     * both branches. */
    sum = 0;
    SUM8(MLSS, sum, win + 16 + 32, in + 32);
    *out = sum;
}

#endif /* HAVE_6REGS && HAVE_SSE_INLINE */

#if HAVE_YASM
/*
 * Define imdct36_blocks_<CPU1>(), which processes `count` consecutive blocks.
 *
 * Complete groups of four blocks are handed to
 * ff_four_imdct36_float_<CPU2>() together with an interleaved window from
 * mdct_win_sse; the remaining (at most three) blocks are processed one by one
 * by ff_imdct36_float_<CPU1>() with a window from ff_mdct_win_float.
 */
#define DECL_IMDCT_BLOCKS(CPU1, CPU2)                                       \
static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in,      \
                               int count, int switch_point, int block_type) \
{                                                                           \
    const int quad_end = count - (count & 3);                               \
    int blk = 0;                                                            \
                                                                            \
    /* groups of four blocks */                                             \
    while (blk < quad_end) {                                                \
        LOCAL_ALIGNED_16(float, tmpbuf, [1024]);                            \
        /* only the first group can take the switch-point window set */     \
        const int mixed = switch_point && blk < 4;                          \
        float *win = mdct_win_sse[mixed][block_type];                       \
                                                                            \
        ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf);          \
                                                                            \
        in  += 4 * 18;                                                      \
        buf += 4 * 18;                                                      \
        out += 4;                                                           \
        blk += 4;                                                           \
    }                                                                       \
                                                                            \
    /* leftover blocks, one at a time */                                    \
    while (blk < count) {                                                   \
        int win_idx = block_type;                                           \
        float *win;                                                         \
                                                                            \
        /* the first two blocks use window 0 when switch_point is set */    \
        if (switch_point && blk < 2)                                        \
            win_idx = 0;                                                    \
        /* odd-numbered blocks use the second half of the window table */   \
        if (blk & 1)                                                        \
            win_idx += 4;                                                   \
        win = ff_mdct_win_float[win_idx];                                   \
                                                                            \
        ff_imdct36_float_ ## CPU1(out, buf, in, win);                       \
                                                                            \
        in  += 18;                                                          \
        buf += 1;                                                           \
        out += 1;                                                           \
        blk++;                                                              \
    }                                                                       \
}

/*
 * Instantiate the block wrappers.  The sse2, sse3 and ssse3 wrappers differ
 * only in the single-block routine they call; all three use the SSE
 * four-block routine.  The plain SSE wrapper exists only on 32-bit x86.
 */
#if HAVE_SSE
#if ARCH_X86_32
DECL_IMDCT_BLOCKS(sse,sse)
#endif
DECL_IMDCT_BLOCKS(sse2,sse)
DECL_IMDCT_BLOCKS(sse3,sse)
DECL_IMDCT_BLOCKS(ssse3,sse)
#endif
#if HAVE_AVX_EXTERNAL
DECL_IMDCT_BLOCKS(avx,avx)
#endif
#endif /* HAVE_YASM */

/**
 * Fill the interleaved window table mdct_win_sse and install the x86
 * implementations that the running CPU supports into the DSP context.
 *
 * @param s DSP context whose apply_window_float and imdct36_blocks_float
 *          pointers may be replaced
 */
av_cold void ff_mpadsp_init_x86(MPADSPContext *s)
{
    int cpu_flags = av_get_cpu_flags();
    int type, k;

    /*
     * Each group of four floats holds coefficient k for four consecutive
     * blocks, alternating between window `type` and window `type + 4`.
     * Table [0] uses the block type's own windows in all four lanes;
     * table [1] uses windows 0 and 4 in the first two lanes.
     */
    for (type = 0; type < 4; type++) {
        float *plain = mdct_win_sse[0][type];
        float *mixed = mdct_win_sse[1][type];

        for (k = 0; k < 40; k++) {
            const float lo = ff_mdct_win_float[type    ][k];
            const float hi = ff_mdct_win_float[type + 4][k];

            plain[4 * k + 0] = lo;
            plain[4 * k + 1] = hi;
            plain[4 * k + 2] = lo;
            plain[4 * k + 3] = hi;

            mixed[4 * k + 0] = ff_mdct_win_float[0][k];
            mixed[4 * k + 1] = ff_mdct_win_float[4][k];
            mixed[4 * k + 2] = lo;
            mixed[4 * k + 3] = hi;
        }
    }

#if HAVE_6REGS && HAVE_SSE_INLINE
    if (INLINE_SSE(cpu_flags))
        s->apply_window_float = apply_window_mp3;
#endif /* HAVE_6REGS && HAVE_SSE_INLINE */

    /* Checks run in order, so a later match overrides an earlier one. */
#if HAVE_YASM
#if HAVE_SSE
#if ARCH_X86_32
    if (EXTERNAL_SSE(cpu_flags))
        s->imdct36_blocks_float = imdct36_blocks_sse;
#endif
    if (EXTERNAL_SSE2(cpu_flags))
        s->imdct36_blocks_float = imdct36_blocks_sse2;
    if (EXTERNAL_SSE3(cpu_flags))
        s->imdct36_blocks_float = imdct36_blocks_sse3;
    if (EXTERNAL_SSSE3(cpu_flags))
        s->imdct36_blocks_float = imdct36_blocks_ssse3;
#endif
#if HAVE_AVX_EXTERNAL
    if (EXTERNAL_AVX(cpu_flags))
        s->imdct36_blocks_float = imdct36_blocks_avx;
#endif
#endif /* HAVE_YASM */
}
Commit	Line	Data
	1	/*
	2	* SIMD-optimized MP3 decoding functions
	3	* Copyright (c) 2010 Vitor Sessak
	4	*
	5	* This file is part of FFmpeg.
	6	*
	7	* FFmpeg is free software; you can redistribute it and/or
	8	* modify it under the terms of the GNU Lesser General Public
	9	* License as published by the Free Software Foundation; either
	10	* version 2.1 of the License, or (at your option) any later version.
	11	*
	12	* FFmpeg is distributed in the hope that it will be useful,
	13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	15	* Lesser General Public License for more details.
	16	*
	17	* You should have received a copy of the GNU Lesser General Public
	18	* License along with FFmpeg; if not, write to the Free Software
	19	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
	20	*/
	21
	22	#include "libavutil/attributes.h"
	23	#include "libavutil/cpu.h"
	24	#include "libavutil/internal.h"
	25	#include "libavutil/x86/asm.h"
	26	#include "libavutil/x86/cpu.h"
	27	#include "libavcodec/mpegaudiodsp.h"
	28
	29	#define DECL(CPU)\
	30	static void imdct36_blocks_ ## CPU(float *out, float *buf, float *in, int count, int switch_point, int block_type);\
	31	void ff_imdct36_float_ ## CPU(float *out, float *buf, float *in, float *win);
	32
	33	#if ARCH_X86_32
	34	DECL(sse)
	35	#endif
	36	DECL(sse2)
	37	DECL(sse3)
	38	DECL(ssse3)
	39	DECL(avx)
	40
	41	void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win,
	42	float *tmpbuf);
	43	void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
	44	float *tmpbuf);
	45
	46	DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];
	47
	48	#if HAVE_6REGS && HAVE_SSE_INLINE
	49
	50	#define MACS(rt, ra, rb) rt+=(ra)*(rb)
	51	#define MLSS(rt, ra, rb) rt-=(ra)*(rb)
	52
	53	#define SUM8(op, sum, w, p) \
	54	{ \
	55	op(sum, (w)[0 * 64], (p)[0 * 64]); \
	56	op(sum, (w)[1 * 64], (p)[1 * 64]); \
	57	op(sum, (w)[2 * 64], (p)[2 * 64]); \
	58	op(sum, (w)[3 * 64], (p)[3 * 64]); \
	59	op(sum, (w)[4 * 64], (p)[4 * 64]); \
	60	op(sum, (w)[5 * 64], (p)[5 * 64]); \
	61	op(sum, (w)[6 * 64], (p)[6 * 64]); \
	62	op(sum, (w)[7 * 64], (p)[7 * 64]); \
	63	}
	64
	65	static void apply_window(const float *buf, const float *win1,
	66	const float *win2, float *sum1, float *sum2, int len)
	67	{
	68	x86_reg count = - 4*len;
	69	const float *win1a = win1+len;
	70	const float *win2a = win2+len;
	71	const float *bufa = buf+len;
	72	float *sum1a = sum1+len;
	73	float *sum2a = sum2+len;
	74
	75
	76	#define MULT(a, b) \
	77	"movaps " #a "(%1,%0), %%xmm1 \n\t" \
	78	"movaps " #a "(%3,%0), %%xmm2 \n\t" \
	79	"mulps %%xmm2, %%xmm1 \n\t" \
	80	"subps %%xmm1, %%xmm0 \n\t" \
	81	"mulps " #b "(%2,%0), %%xmm2 \n\t" \
	82	"subps %%xmm2, %%xmm4 \n\t" \
	83
	84	__asm__ volatile(
	85	"1: \n\t"
	86	"xorps %%xmm0, %%xmm0 \n\t"
	87	"xorps %%xmm4, %%xmm4 \n\t"
	88
	89	MULT( 0, 0)
	90	MULT( 256, 64)
	91	MULT( 512, 128)
	92	MULT( 768, 192)
	93	MULT(1024, 256)
	94	MULT(1280, 320)
	95	MULT(1536, 384)
	96	MULT(1792, 448)
	97
	98	"movaps %%xmm0, (%4,%0) \n\t"
	99	"movaps %%xmm4, (%5,%0) \n\t"
	100	"add $16, %0 \n\t"
	101	"jl 1b \n\t"
	102	:"+&r"(count)
	103	:"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a)
	104	);
	105
	106	#undef MULT
	107	}
	108
	109	static void apply_window_mp3(float *in, float *win, int *unused, float *out,
	110	int incr)
	111	{
	112	LOCAL_ALIGNED_16(float, suma, [17]);
	113	LOCAL_ALIGNED_16(float, sumb, [17]);
	114	LOCAL_ALIGNED_16(float, sumc, [17]);
	115	LOCAL_ALIGNED_16(float, sumd, [17]);
	116
	117	float sum;
	118
	119	/* copy to avoid wrap */
	120	__asm__ volatile(
	121	"movaps 0(%0), %%xmm0 \n\t" \
	122	"movaps 16(%0), %%xmm1 \n\t" \
	123	"movaps 32(%0), %%xmm2 \n\t" \
	124	"movaps 48(%0), %%xmm3 \n\t" \
	125	"movaps %%xmm0, 0(%1) \n\t" \
	126	"movaps %%xmm1, 16(%1) \n\t" \
	127	"movaps %%xmm2, 32(%1) \n\t" \
	128	"movaps %%xmm3, 48(%1) \n\t" \
	129	"movaps 64(%0), %%xmm0 \n\t" \
	130	"movaps 80(%0), %%xmm1 \n\t" \
	131	"movaps 96(%0), %%xmm2 \n\t" \
	132	"movaps 112(%0), %%xmm3 \n\t" \
	133	"movaps %%xmm0, 64(%1) \n\t" \
	134	"movaps %%xmm1, 80(%1) \n\t" \
	135	"movaps %%xmm2, 96(%1) \n\t" \
	136	"movaps %%xmm3, 112(%1) \n\t"
	137	::"r"(in), "r"(in+512)
	138	:"memory"
	139	);
	140
	141	apply_window(in + 16, win , win + 512, suma, sumc, 16);
	142	apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);
	143
	144	SUM8(MACS, suma[0], win + 32, in + 48);
	145
	146	sumc[ 0] = 0;
	147	sumb[16] = 0;
	148	sumd[16] = 0;
	149
	150	#define SUMS(suma, sumb, sumc, sumd, out1, out2) \
	151	"movups " #sumd "(%4), %%xmm0 \n\t" \
	152	"shufps $0x1b, %%xmm0, %%xmm0 \n\t" \
	153	"subps " #suma "(%1), %%xmm0 \n\t" \
	154	"movaps %%xmm0," #out1 "(%0) \n\t" \
	155	\
	156	"movups " #sumc "(%3), %%xmm0 \n\t" \
	157	"shufps $0x1b, %%xmm0, %%xmm0 \n\t" \
	158	"addps " #sumb "(%2), %%xmm0 \n\t" \
	159	"movaps %%xmm0," #out2 "(%0) \n\t"
	160
	161	if (incr == 1) {
	162	__asm__ volatile(
	163	SUMS( 0, 48, 4, 52, 0, 112)
	164	SUMS(16, 32, 20, 36, 16, 96)
	165	SUMS(32, 16, 36, 20, 32, 80)
	166	SUMS(48, 0, 52, 4, 48, 64)
	167
	168	:"+&r"(out)
	169	:"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0])
	170	:"memory"
	171	);
	172	out += 16*incr;
	173	} else {
	174	int j;
	175	float *out2 = out + 32 * incr;
	176	out[0 ] = -suma[ 0];
	177	out += incr;
	178	out2 -= incr;
	179	for(j=1;j<16;j++) {
	180	*out = -suma[ j] + sumd[16-j];
	181	*out2 = sumb[16-j] + sumc[ j];
	182	out += incr;
	183	out2 -= incr;
	184	}
	185	}
	186
	187	sum = 0;
	188	SUM8(MLSS, sum, win + 16 + 32, in + 32);
	189	*out = sum;
	190	}
	191
	192	#endif /* HAVE_6REGS && HAVE_SSE_INLINE */
	193
	194	#if HAVE_YASM
	195	#define DECL_IMDCT_BLOCKS(CPU1, CPU2) \
	196	static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in, \
	197	int count, int switch_point, int block_type) \
	198	{ \
	199	int align_end = count - (count & 3); \
	200	int j; \
	201	for (j = 0; j < align_end; j+= 4) { \
	202	LOCAL_ALIGNED_16(float, tmpbuf, [1024]); \
	203	float *win = mdct_win_sse[switch_point && j < 4][block_type]; \
	204	/* apply window & overlap with previous buffer */ \
	205	\
	206	/* select window */ \
	207	ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf); \
	208	in += 4*18; \
	209	buf += 4*18; \
	210	out += 4; \
	211	} \
	212	for (; j < count; j++) { \
	213	/* apply window & overlap with previous buffer */ \
	214	\
	215	/* select window */ \
	216	int win_idx = (switch_point && j < 2) ? 0 : block_type; \
	217	float *win = ff_mdct_win_float[win_idx + (4 & -(j & 1))]; \
	218	\
	219	ff_imdct36_float_ ## CPU1(out, buf, in, win); \
	220	\
	221	in += 18; \
	222	buf++; \
	223	out++; \
	224	} \
	225	}
	226
	227	#if HAVE_SSE
	228	#if ARCH_X86_32
	229	DECL_IMDCT_BLOCKS(sse,sse)
	230	#endif
	231	DECL_IMDCT_BLOCKS(sse2,sse)
	232	DECL_IMDCT_BLOCKS(sse3,sse)
	233	DECL_IMDCT_BLOCKS(ssse3,sse)
	234	#endif
	235	#if HAVE_AVX_EXTERNAL
	236	DECL_IMDCT_BLOCKS(avx,avx)
	237	#endif
	238	#endif /* HAVE_YASM */
	239
	240	av_cold void ff_mpadsp_init_x86(MPADSPContext *s)
	241	{
	242	int cpu_flags = av_get_cpu_flags();
	243
	244	int i, j;
	245	for (j = 0; j < 4; j++) {
	246	for (i = 0; i < 40; i ++) {
	247	mdct_win_sse[0][j][4*i ] = ff_mdct_win_float[j ][i];
	248	mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i];
	249	mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j ][i];
	250	mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
	251	mdct_win_sse[1][j][4*i ] = ff_mdct_win_float[0 ][i];
	252	mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4 ][i];
	253	mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j ][i];
	254	mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
	255	}
	256	}
	257
	258	#if HAVE_6REGS && HAVE_SSE_INLINE
	259	if (INLINE_SSE(cpu_flags)) {
	260	s->apply_window_float = apply_window_mp3;
	261	}
	262	#endif /* HAVE_SSE_INLINE */
	263
	264	#if HAVE_YASM
	265	#if HAVE_SSE
	266	#if ARCH_X86_32
	267	if (EXTERNAL_SSE(cpu_flags)) {
	268	s->imdct36_blocks_float = imdct36_blocks_sse;
	269	}
	270	#endif
	271	if (EXTERNAL_SSE2(cpu_flags)) {
	272	s->imdct36_blocks_float = imdct36_blocks_sse2;
	273	}
	274	if (EXTERNAL_SSE3(cpu_flags)) {
	275	s->imdct36_blocks_float = imdct36_blocks_sse3;
	276	}
	277	if (EXTERNAL_SSSE3(cpu_flags)) {
	278	s->imdct36_blocks_float = imdct36_blocks_ssse3;
	279	}
	280	#endif
	281	#if HAVE_AVX_EXTERNAL
	282	if (EXTERNAL_AVX(cpu_flags)) {
	283	s->imdct36_blocks_float = imdct36_blocks_avx;
	284	}
	285	#endif
	286	#endif /* HAVE_YASM */
	287	}