ffmpeg/libavcodec/x86/mpegaudiodsp.c

   1 /*
   2  * SIMD-optimized MP3 decoding functions
   3  * Copyright (c) 2010 Vitor Sessak
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 #include "libavutil/attributes.h"
  23 #include "libavutil/cpu.h"
  24 #include "libavutil/internal.h"
  25 #include "libavutil/x86/asm.h"
  26 #include "libavutil/x86/cpu.h"
  27 #include "libavcodec/mpegaudiodsp.h"
  28
  29 #define DECL(CPU)\
  30 static void imdct36_blocks_ ## CPU(float *out, float *buf, float *in, int count, int switch_point, int block_type);\
  31 void ff_imdct36_float_ ## CPU(float *out, float *buf, float *in, float *win);
  32
  33 #if ARCH_X86_32
  34 DECL(sse)
  35 #endif
  36 DECL(sse2)
  37 DECL(sse3)
  38 DECL(ssse3)
  39 DECL(avx)
  40
  41 void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win,
  42                                float *tmpbuf);
  43 void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
  44                                float *tmpbuf);
  45
  46 DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];
  47
  48 #if HAVE_6REGS && HAVE_SSE_INLINE
  49
  50 #define MACS(rt, ra, rb) rt+=(ra)*(rb)
  51 #define MLSS(rt, ra, rb) rt-=(ra)*(rb)
  52
  53 #define SUM8(op, sum, w, p)               \
  54 {                                         \
  55     op(sum, (w)[0 * 64], (p)[0 * 64]);    \
  56     op(sum, (w)[1 * 64], (p)[1 * 64]);    \
  57     op(sum, (w)[2 * 64], (p)[2 * 64]);    \
  58     op(sum, (w)[3 * 64], (p)[3 * 64]);    \
  59     op(sum, (w)[4 * 64], (p)[4 * 64]);    \
  60     op(sum, (w)[5 * 64], (p)[5 * 64]);    \
  61     op(sum, (w)[6 * 64], (p)[6 * 64]);    \
  62     op(sum, (w)[7 * 64], (p)[7 * 64]);    \
  63 }
  64
  65 static void apply_window(const float *buf, const float *win1,
  66                          const float *win2, float *sum1, float *sum2, int len)
  67 {
  68     x86_reg count = - 4*len;
  69     const float *win1a = win1+len;
  70     const float *win2a = win2+len;
  71     const float *bufa  = buf+len;
  72     float *sum1a = sum1+len;
  73     float *sum2a = sum2+len;
  74
  75
  76 #define MULT(a, b)                                 \
  77     "movaps " #a "(%1,%0), %%xmm1           \n\t"  \
  78     "movaps " #a "(%3,%0), %%xmm2           \n\t"  \
  79     "mulps         %%xmm2, %%xmm1           \n\t"  \
  80     "subps         %%xmm1, %%xmm0           \n\t"  \
  81     "mulps  " #b "(%2,%0), %%xmm2           \n\t"  \
  82     "subps         %%xmm2, %%xmm4           \n\t"  \
  83
  84     __asm__ volatile(
  85             "1:                                   \n\t"
  86             "xorps       %%xmm0, %%xmm0           \n\t"
  87             "xorps       %%xmm4, %%xmm4           \n\t"
  88
  89             MULT(   0,   0)
  90             MULT( 256,  64)
  91             MULT( 512, 128)
  92             MULT( 768, 192)
  93             MULT(1024, 256)
  94             MULT(1280, 320)
  95             MULT(1536, 384)
  96             MULT(1792, 448)
  97
  98             "movaps      %%xmm0, (%4,%0)          \n\t"
  99             "movaps      %%xmm4, (%5,%0)          \n\t"
 100             "add            $16,  %0              \n\t"
 101             "jl              1b                   \n\t"
 102             :"+&r"(count)
 103             :"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a)
 104             );
 105
 106 #undef MULT
 107 }
 108
 109 static void apply_window_mp3(float *in, float *win, int *unused, float *out,
 110                              int incr)
 111 {
 112     LOCAL_ALIGNED_16(float, suma, [17]);
 113     LOCAL_ALIGNED_16(float, sumb, [17]);
 114     LOCAL_ALIGNED_16(float, sumc, [17]);
 115     LOCAL_ALIGNED_16(float, sumd, [17]);
 116
 117     float sum;
 118
 119     /* copy to avoid wrap */
 120     __asm__ volatile(
 121             "movaps    0(%0), %%xmm0   \n\t" \
 122             "movaps   16(%0), %%xmm1   \n\t" \
 123             "movaps   32(%0), %%xmm2   \n\t" \
 124             "movaps   48(%0), %%xmm3   \n\t" \
 125             "movaps   %%xmm0,   0(%1) \n\t" \
 126             "movaps   %%xmm1,  16(%1) \n\t" \
 127             "movaps   %%xmm2,  32(%1) \n\t" \
 128             "movaps   %%xmm3,  48(%1) \n\t" \
 129             "movaps   64(%0), %%xmm0   \n\t" \
 130             "movaps   80(%0), %%xmm1   \n\t" \
 131             "movaps   96(%0), %%xmm2   \n\t" \
 132             "movaps  112(%0), %%xmm3   \n\t" \
 133             "movaps   %%xmm0,  64(%1) \n\t" \
 134             "movaps   %%xmm1,  80(%1) \n\t" \
 135             "movaps   %%xmm2,  96(%1) \n\t" \
 136             "movaps   %%xmm3, 112(%1) \n\t"
 137             ::"r"(in), "r"(in+512)
 138             :"memory"
 139             );
 140
 141     apply_window(in + 16, win     , win + 512, suma, sumc, 16);
 142     apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);
 143
 144     SUM8(MACS, suma[0], win + 32, in + 48);
 145
 146     sumc[ 0] = 0;
 147     sumb[16] = 0;
 148     sumd[16] = 0;
 149
 150 #define SUMS(suma, sumb, sumc, sumd, out1, out2)               \
 151             "movups " #sumd "(%4),       %%xmm0          \n\t" \
 152             "shufps         $0x1b,       %%xmm0, %%xmm0  \n\t" \
 153             "subps  " #suma "(%1),       %%xmm0          \n\t" \
 154             "movaps        %%xmm0," #out1 "(%0)          \n\t" \
 155 \
 156             "movups " #sumc "(%3),       %%xmm0          \n\t" \
 157             "shufps         $0x1b,       %%xmm0, %%xmm0  \n\t" \
 158             "addps  " #sumb "(%2),       %%xmm0          \n\t" \
 159             "movaps        %%xmm0," #out2 "(%0)          \n\t"
 160
 161     if (incr == 1) {
 162         __asm__ volatile(
 163             SUMS( 0, 48,  4, 52,  0, 112)
 164             SUMS(16, 32, 20, 36, 16,  96)
 165             SUMS(32, 16, 36, 20, 32,  80)
 166             SUMS(48,  0, 52,  4, 48,  64)
 167
 168             :"+&r"(out)
 169             :"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0])
 170             :"memory"
 171             );
 172         out += 16*incr;
 173     } else {
 174         int j;
 175         float *out2 = out + 32 * incr;
 176         out[0  ]  = -suma[   0];
 177         out += incr;
 178         out2 -= incr;
 179         for(j=1;j<16;j++) {
 180             *out  = -suma[   j] + sumd[16-j];
 181             *out2 =  sumb[16-j] + sumc[   j];
 182             out  += incr;
 183             out2 -= incr;
 184         }
 185     }
 186
 187     sum = 0;
 188     SUM8(MLSS, sum, win + 16 + 32, in + 32);
 189     *out = sum;
 190 }
 191
 192 #endif /* HAVE_6REGS && HAVE_SSE_INLINE */
 193
 194 #if HAVE_YASM
 195 #define DECL_IMDCT_BLOCKS(CPU1, CPU2)                                       \
 196 static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in,      \
 197                                int count, int switch_point, int block_type) \
 198 {                                                                           \
 199     int align_end = count - (count & 3);                                \
 200     int j;                                                              \
 201     for (j = 0; j < align_end; j+= 4) {                                 \
 202         LOCAL_ALIGNED_16(float, tmpbuf, [1024]);                        \
 203         float *win = mdct_win_sse[switch_point && j < 4][block_type];   \
 204         /* apply window & overlap with previous buffer */               \
 205                                                                         \
 206         /* select window */                                             \
 207         ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf);      \
 208         in      += 4*18;                                                \
 209         buf     += 4*18;                                                \
 210         out     += 4;                                                   \
 211     }                                                                   \
 212     for (; j < count; j++) {                                            \
 213         /* apply window & overlap with previous buffer */               \
 214                                                                         \
 215         /* select window */                                             \
 216         int win_idx = (switch_point && j < 2) ? 0 : block_type;         \
 217         float *win = ff_mdct_win_float[win_idx + (4 & -(j & 1))];       \
 218                                                                         \
 219         ff_imdct36_float_ ## CPU1(out, buf, in, win);                   \
 220                                                                         \
 221         in  += 18;                                                      \
 222         buf++;                                                          \
 223         out++;                                                          \
 224     }                                                                   \
 225 }
 226
 227 #if HAVE_SSE
 228 #if ARCH_X86_32
 229 DECL_IMDCT_BLOCKS(sse,sse)
 230 #endif
 231 DECL_IMDCT_BLOCKS(sse2,sse)
 232 DECL_IMDCT_BLOCKS(sse3,sse)
 233 DECL_IMDCT_BLOCKS(ssse3,sse)
 234 #endif
 235 #if HAVE_AVX_EXTERNAL
 236 DECL_IMDCT_BLOCKS(avx,avx)
 237 #endif
 238 #endif /* HAVE_YASM */
 239
 240 av_cold void ff_mpadsp_init_x86(MPADSPContext *s)
 241 {
 242     int cpu_flags = av_get_cpu_flags();
 243
 244     int i, j;
 245     for (j = 0; j < 4; j++) {
 246         for (i = 0; i < 40; i ++) {
 247             mdct_win_sse[0][j][4*i    ] = ff_mdct_win_float[j    ][i];
 248             mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i];
 249             mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j    ][i];
 250             mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
 251             mdct_win_sse[1][j][4*i    ] = ff_mdct_win_float[0    ][i];
 252             mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4    ][i];
 253             mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j    ][i];
 254             mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
 255         }
 256     }
 257
 258 #if HAVE_6REGS && HAVE_SSE_INLINE
 259     if (INLINE_SSE(cpu_flags)) {
 260         s->apply_window_float = apply_window_mp3;
 261     }
 262 #endif /* HAVE_SSE_INLINE */
 263
 264 #if HAVE_YASM
 265 #if HAVE_SSE
 266 #if ARCH_X86_32
 267     if (EXTERNAL_SSE(cpu_flags)) {
 268         s->imdct36_blocks_float = imdct36_blocks_sse;
 269     }
 270 #endif
 271     if (EXTERNAL_SSE2(cpu_flags)) {
 272         s->imdct36_blocks_float = imdct36_blocks_sse2;
 273     }
 274     if (EXTERNAL_SSE3(cpu_flags)) {
 275         s->imdct36_blocks_float = imdct36_blocks_sse3;
 276     }
 277     if (EXTERNAL_SSSE3(cpu_flags)) {
 278         s->imdct36_blocks_float = imdct36_blocks_ssse3;
 279     }
 280 #endif
 281 #if HAVE_AVX_EXTERNAL
 282     if (EXTERNAL_AVX(cpu_flags)) {
 283         s->imdct36_blocks_float = imdct36_blocks_avx;
 284     }
 285 #endif
 286 #endif /* HAVE_YASM */
 287 }