[deb_ffmpeg.git] / ffmpeg / libavcodec / x86 / vp8dsp_init.c

/*
 * VP8 DSP functions x86-optimized
 * Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
 * Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/vp8dsp.h"

#if HAVE_YASM

/*
 * MC functions
 *
 * Prototypes only: none of these routines is defined in this file
 * (presumably they live in the accompanying assembly, since the whole
 * section is guarded by HAVE_YASM -- confirm against the build files).
 *
 * Every routine has the same signature: destination pointer and stride,
 * source pointer and stride, a row count (height) and the mx/my arguments,
 * which the C wrappers further down forward unchanged.
 *
 * Names follow ff_put_vp8_<filter><width>_<dir><taps>_<isa>; "h"/"v" select
 * the horizontal/vertical routine and the trailing digit is the tap count
 * (see how HVTAP pastes TAPNUMX/TAPNUMY into the names below).
 */

/* epel, 4 pixels wide, MMXEXT */
void ff_put_vp8_epel4_h4_mmxext(uint8_t *dst, ptrdiff_t dststride,
                                uint8_t *src, ptrdiff_t srcstride,
                                int height, int mx, int my);
void ff_put_vp8_epel4_h6_mmxext(uint8_t *dst, ptrdiff_t dststride,
                                uint8_t *src, ptrdiff_t srcstride,
                                int height, int mx, int my);
void ff_put_vp8_epel4_v4_mmxext(uint8_t *dst, ptrdiff_t dststride,
                                uint8_t *src, ptrdiff_t srcstride,
                                int height, int mx, int my);
void ff_put_vp8_epel4_v6_mmxext(uint8_t *dst, ptrdiff_t dststride,
                                uint8_t *src, ptrdiff_t srcstride,
                                int height, int mx, int my);

/* epel, 8 pixels wide, SSE2 */
void ff_put_vp8_epel8_h4_sse2  (uint8_t *dst, ptrdiff_t dststride,
                                uint8_t *src, ptrdiff_t srcstride,
                                int height, int mx, int my);
void ff_put_vp8_epel8_h6_sse2  (uint8_t *dst, ptrdiff_t dststride,
                                uint8_t *src, ptrdiff_t srcstride,
                                int height, int mx, int my);
void ff_put_vp8_epel8_v4_sse2  (uint8_t *dst, ptrdiff_t dststride,
                                uint8_t *src, ptrdiff_t srcstride,
                                int height, int mx, int my);
void ff_put_vp8_epel8_v6_sse2  (uint8_t *dst, ptrdiff_t dststride,
                                uint8_t *src, ptrdiff_t srcstride,
                                int height, int mx, int my);

/* epel, 4 and 8 pixels wide, SSSE3 */
void ff_put_vp8_epel4_h4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                uint8_t *src, ptrdiff_t srcstride,
                                int height, int mx, int my);
void ff_put_vp8_epel4_h6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                uint8_t *src, ptrdiff_t srcstride,
                                int height, int mx, int my);
void ff_put_vp8_epel4_v4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                uint8_t *src, ptrdiff_t srcstride,
                                int height, int mx, int my);
void ff_put_vp8_epel4_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                uint8_t *src, ptrdiff_t srcstride,
                                int height, int mx, int my);
void ff_put_vp8_epel8_h4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                uint8_t *src, ptrdiff_t srcstride,
                                int height, int mx, int my);
void ff_put_vp8_epel8_h6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                uint8_t *src, ptrdiff_t srcstride,
                                int height, int mx, int my);
void ff_put_vp8_epel8_v4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                uint8_t *src, ptrdiff_t srcstride,
                                int height, int mx, int my);
void ff_put_vp8_epel8_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                uint8_t *src, ptrdiff_t srcstride,
                                int height, int mx, int my);

/* bilinear, horizontal pass */
void ff_put_vp8_bilinear4_h_mmxext(uint8_t *dst, ptrdiff_t dststride,
                                   uint8_t *src, ptrdiff_t srcstride,
                                   int height, int mx, int my);
void ff_put_vp8_bilinear8_h_sse2  (uint8_t *dst, ptrdiff_t dststride,
                                   uint8_t *src, ptrdiff_t srcstride,
                                   int height, int mx, int my);
void ff_put_vp8_bilinear4_h_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                   uint8_t *src, ptrdiff_t srcstride,
                                   int height, int mx, int my);
void ff_put_vp8_bilinear8_h_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                   uint8_t *src, ptrdiff_t srcstride,
                                   int height, int mx, int my);

/* bilinear, vertical pass */
void ff_put_vp8_bilinear4_v_mmxext(uint8_t *dst, ptrdiff_t dststride,
                                   uint8_t *src, ptrdiff_t srcstride,
                                   int height, int mx, int my);
void ff_put_vp8_bilinear8_v_sse2  (uint8_t *dst, ptrdiff_t dststride,
                                   uint8_t *src, ptrdiff_t srcstride,
                                   int height, int mx, int my);
void ff_put_vp8_bilinear4_v_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                   uint8_t *src, ptrdiff_t srcstride,
                                   int height, int mx, int my);
void ff_put_vp8_bilinear8_v_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                   uint8_t *src, ptrdiff_t srcstride,
                                   int height, int mx, int my);


/* routines installed in the [0][0] slots of the MC tables by the init code */
void ff_put_vp8_pixels8_mmx (uint8_t *dst, ptrdiff_t dststride,
                             uint8_t *src, ptrdiff_t srcstride,
                             int height, int mx, int my);
void ff_put_vp8_pixels16_mmx(uint8_t *dst, ptrdiff_t dststride,
                             uint8_t *src, ptrdiff_t srcstride,
                             int height, int mx, int my);
void ff_put_vp8_pixels16_sse(uint8_t *dst, ptrdiff_t dststride,
                             uint8_t *src, ptrdiff_t srcstride,
                             int height, int mx, int my);

/*
 * Define a static 16-wide routine
 * ff_put_vp8_<FILTERTYPE>16_<TAPTYPE>_<OPT>() on top of the 8-wide routine
 * of the same filter/tap/ISA: the 8-wide routine is called once on the
 * block as given and once more with dst and src advanced by 8 bytes.
 * Strides, height, mx and my are forwarded unchanged to both calls.
 */
#define TAP_W16(OPT, FILTERTYPE, TAPTYPE) \
static void ff_put_vp8_ ## FILTERTYPE ## 16_ ## TAPTYPE ## _ ## OPT( \
    uint8_t *dst,  ptrdiff_t dststride, uint8_t *src, \
    ptrdiff_t srcstride, int height, int mx, int my) \
{ \
    uint8_t *const dst_right = dst + 8; \
    uint8_t *const src_right = src + 8; \
    \
    /* columns 0..7 */ \
    ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
        dst, dststride, src, srcstride, height, mx, my); \
    /* columns 8..15 */ \
    ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
        dst_right, dststride, src_right, srcstride, height, mx, my); \
}
/*
 * Define a static 8-wide routine
 * ff_put_vp8_<FILTERTYPE>8_<TAPTYPE>_<OPT>() on top of the 4-wide routine
 * of the same filter/tap/ISA: the 4-wide routine is called once on the
 * block as given and once more with dst and src advanced by 4 bytes.
 * Strides, height, mx and my are forwarded unchanged to both calls.
 */
#define TAP_W8(OPT, FILTERTYPE, TAPTYPE) \
static void ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
    uint8_t *dst,  ptrdiff_t dststride, uint8_t *src, \
    ptrdiff_t srcstride, int height, int mx, int my) \
{ \
    uint8_t *const dst_right = dst + 4; \
    uint8_t *const src_right = src + 4; \
    \
    /* columns 0..3 */ \
    ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \
        dst, dststride, src, srcstride, height, mx, my); \
    /* columns 4..7 */ \
    ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \
        dst_right, dststride, src_right, srcstride, height, mx, my); \
}

/*
 * Generate the wider single-direction routines that have no prototype above.
 *
 * MMXEXT (ARCH_X86_32 builds only): just the 4-wide routines are declared,
 * so the 8-wide ones are built from two 4-wide calls and the 16-wide ones
 * from two of those 8-wide wrappers.  Each TAP_W16 line therefore has to
 * follow the TAP_W8 line defining the function it calls.  No 16-wide
 * h4/v4 wrappers are generated.
 */
#if ARCH_X86_32
TAP_W8 (mmxext, epel, h4)
TAP_W8 (mmxext, epel, h6)
TAP_W16(mmxext, epel, h6)
TAP_W8 (mmxext, epel, v4)
TAP_W8 (mmxext, epel, v6)
TAP_W16(mmxext, epel, v6)
TAP_W8 (mmxext, bilinear, h)
TAP_W16(mmxext, bilinear, h)
TAP_W8 (mmxext, bilinear, v)
TAP_W16(mmxext, bilinear, v)
#endif

/* SSE2: the 8-wide routines are declared above; only 16-wide wrappers are needed. */
TAP_W16(sse2,  epel, h6)
TAP_W16(sse2,  epel, v6)
TAP_W16(sse2,  bilinear, h)
TAP_W16(sse2,  bilinear, v)

/* SSSE3: same set of 16-wide wrappers as for SSE2. */
TAP_W16(ssse3, epel, h6)
TAP_W16(ssse3, epel, v6)
TAP_W16(ssse3, bilinear, h)
TAP_W16(ssse3, bilinear, v)

/*
 * Define a static two-pass epel routine
 * ff_put_vp8_epel<SIZE>_h<TAPNUMX>v<TAPNUMY>_<OPT>().
 *
 * Pass 1 runs the horizontal TAPNUMX-tap routine into an on-stack scratch
 * buffer whose row pitch is SIZE bytes.  It starts (TAPNUMY / 2 - 1) source
 * rows above the block and produces (TAPNUMY - 1) rows more than the block
 * height.  Pass 2 runs the vertical TAPNUMY-tap routine from the scratch
 * buffer into dst, starting at the scratch row that corresponds to the
 * first row of the block.
 *
 * OPT         ISA suffix of the routines being combined
 * ALIGN       alignment handed to DECLARE_ALIGNED for the scratch buffer
 * TAPNUMX/Y   tap counts selecting the _h<N>_ and _v<N>_ routines
 * SIZE        width used in the routine names and as the scratch pitch
 * MAXHEIGHT   the scratch buffer holds MAXHEIGHT + TAPNUMY - 1 rows; height
 *             is not checked against it here
 */
#define HVTAP(OPT, ALIGN, TAPNUMX, TAPNUMY, SIZE, MAXHEIGHT) \
static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT( \
    uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
    ptrdiff_t srcstride, int height, int mx, int my) \
{ \
    enum { \
        hv_rows_above = TAPNUMY / 2 - 1, \
        hv_rows_extra = TAPNUMY - 1 \
    }; \
    DECLARE_ALIGNED(ALIGN, uint8_t, scratch)[SIZE * (MAXHEIGHT + hv_rows_extra)]; \
    \
    /* horizontal pass: source rows -> scratch */ \
    ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## _ ## OPT( \
        scratch, SIZE, src - srcstride * hv_rows_above, srcstride, \
        height + hv_rows_extra, mx, my); \
    /* vertical pass: scratch -> destination */ \
    ff_put_vp8_epel ## SIZE ## _v ## TAPNUMY ## _ ## OPT( \
        dst, dststride, scratch + SIZE * hv_rows_above, SIZE, \
        height, mx, my); \
}

/*
 * Instantiate the two-pass (h + v) epel routines.
 *
 * MMXEXT: HVTAPMMX(x, y) always generates the 4-wide variant (scratch sized
 * for 8 rows); on ARCH_X86_32 it additionally generates the 8-wide variant
 * (16 rows), and a single 16-wide h6v6 routine is generated as well.
 */
#if ARCH_X86_32
#define HVTAPMMX(x, y) \
HVTAP(mmxext, 8, x, y,  4,  8) \
HVTAP(mmxext, 8, x, y,  8, 16)

HVTAP(mmxext, 8, 6, 6, 16, 16)
#else
#define HVTAPMMX(x, y) \
HVTAP(mmxext, 8, x, y,  4,  8)
#endif

/* all four 4-tap/6-tap combinations for MMXEXT */
HVTAPMMX(4, 4)
HVTAPMMX(4, 6)
HVTAPMMX(6, 4)
HVTAPMMX(6, 6)

/* HVTAPSSE2 emits the same routine for both SSE2 and SSSE3, 16-byte aligned scratch */
#define HVTAPSSE2(x, y, w) \
HVTAP(sse2,  16, x, y, w, 16) \
HVTAP(ssse3, 16, x, y, w, 16)

/* 8-wide: all tap combinations; 16-wide: h6v6 only */
HVTAPSSE2(4, 4, 8)
HVTAPSSE2(4, 6, 8)
HVTAPSSE2(6, 4, 8)
HVTAPSSE2(6, 6, 8)
HVTAPSSE2(6, 6, 16)

/* 4-wide two-pass routines exist for SSSE3 only (no 4-wide SSE2 prototypes above) */
HVTAP(ssse3, 16, 4, 4, 4, 8)
HVTAP(ssse3, 16, 4, 6, 4, 8)
HVTAP(ssse3, 16, 6, 4, 4, 8)
HVTAP(ssse3, 16, 6, 6, 4, 8)

/*
 * Define a static two-pass bilinear routine
 * ff_put_vp8_bilinear<SIZE>_hv_<OPT>().
 *
 * The horizontal routine writes height + 1 rows into an on-stack scratch
 * buffer with a row pitch of SIZE bytes; the vertical routine then reads
 * that buffer from its first row and writes height rows to dst.
 *
 * OPT         ISA suffix of the routines being combined
 * ALIGN       alignment handed to DECLARE_ALIGNED for the scratch buffer
 * SIZE        width used in the routine names and as the scratch pitch
 * MAXHEIGHT   the scratch buffer holds MAXHEIGHT + 2 rows; height is not
 *             checked against it here
 */
#define HVBILIN(OPT, ALIGN, SIZE, MAXHEIGHT) \
static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \
    uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
    ptrdiff_t srcstride, int height, int mx, int my) \
{ \
    DECLARE_ALIGNED(ALIGN, uint8_t, scratch)[SIZE * (MAXHEIGHT + 2)]; \
    \
    /* horizontal pass: source rows -> scratch (one row more than the block) */ \
    ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT( \
        scratch, SIZE, src, srcstride, height + 1, mx, my); \
    /* vertical pass: scratch -> destination */ \
    ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT( \
        dst, dststride, scratch, SIZE, height, mx, my); \
}

/*
 * Instantiate the two-pass bilinear routines.  The 8- and 16-wide MMXEXT
 * variants are only built on ARCH_X86_32; there is no 4-wide SSE2 variant.
 *
 * NOTE(review): every variant, including SSE2/SSSE3, requests 8-byte
 * alignment for its scratch buffer, whereas HVTAP requests 16 for
 * SSE2/SSSE3 -- presumably the bilinear routines do not need 16-byte
 * aligned rows; confirm against the assembly before changing.
 */
HVBILIN(mmxext,  8,  4,  8)
#if ARCH_X86_32
HVBILIN(mmxext,  8,  8, 16)
HVBILIN(mmxext,  8, 16, 16)
#endif
HVBILIN(sse2,  8,  8, 16)
HVBILIN(sse2,  8, 16, 16)
HVBILIN(ssse3, 8,  4,  8)
HVBILIN(ssse3, 8,  8, 16)
HVBILIN(ssse3, 8, 16, 16)

/*
 * Inverse transform prototypes (declarations only; not defined in this
 * file).  ff_vp8dsp_init_x86() below installs them into the matching
 * VP8DSPContext members.
 */

/* DC-only add for one 16-coefficient block */
void ff_vp8_idct_dc_add_mmx(uint8_t *dst, int16_t block[16],
                            ptrdiff_t stride);
void ff_vp8_idct_dc_add_sse4(uint8_t *dst, int16_t block[16],
                             ptrdiff_t stride);
/* DC-only add taking four 16-coefficient blocks ("4y") */
void ff_vp8_idct_dc_add4y_mmx(uint8_t *dst, int16_t block[4][16],
                               ptrdiff_t stride);
void ff_vp8_idct_dc_add4y_sse2(uint8_t *dst, int16_t block[4][16],
                               ptrdiff_t stride);
/* DC-only add taking two 16-coefficient blocks ("4uv") */
void ff_vp8_idct_dc_add4uv_mmx(uint8_t *dst, int16_t block[2][16],
                               ptrdiff_t stride);
/* luma DC transform: 16 dc values in, a 4x4 grid of 16-coefficient blocks out/updated */
void ff_vp8_luma_dc_wht_mmx(int16_t block[4][4][16], int16_t dc[16]);
void ff_vp8_luma_dc_wht_sse(int16_t block[4][4][16], int16_t dc[16]);
/* full inverse transform + add for one 16-coefficient block */
void ff_vp8_idct_add_mmx(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
void ff_vp8_idct_add_sse(uint8_t *dst, int16_t block[16], ptrdiff_t stride);

/*
 * Declare the complete set of loop-filter prototypes for one ISA suffix
 * NAME.  For each of the v_ and h_ prefixes this declares:
 *   - loop_filter_simple:  (dst, stride, flim)
 *   - loop_filter16y_inner / _mbedge: one plane pointer, (e, i, hvt)
 *   - loop_filter8uv_inner / _mbedge: two plane pointers (dstU, dstV)
 *     sharing one stride, (e, i, hvt)
 * Nothing is defined here; the macro only emits declarations, and not every
 * declared symbol is necessarily referenced by the init code below.
 */
#define DECLARE_LOOP_FILTER(NAME)                                       \
void ff_vp8_v_loop_filter_simple_ ## NAME(uint8_t *dst,                 \
                                          ptrdiff_t stride,             \
                                          int flim);                    \
void ff_vp8_h_loop_filter_simple_ ## NAME(uint8_t *dst,                 \
                                          ptrdiff_t stride,             \
                                          int flim);                    \
void ff_vp8_v_loop_filter16y_inner_ ## NAME (uint8_t *dst,              \
                                             ptrdiff_t stride,          \
                                             int e, int i, int hvt);    \
void ff_vp8_h_loop_filter16y_inner_ ## NAME (uint8_t *dst,              \
                                             ptrdiff_t stride,          \
                                             int e, int i, int hvt);    \
void ff_vp8_v_loop_filter8uv_inner_ ## NAME (uint8_t *dstU,             \
                                             uint8_t *dstV,             \
                                             ptrdiff_t s,               \
                                             int e, int i, int hvt);    \
void ff_vp8_h_loop_filter8uv_inner_ ## NAME (uint8_t *dstU,             \
                                             uint8_t *dstV,             \
                                             ptrdiff_t s,               \
                                             int e, int i, int hvt);    \
void ff_vp8_v_loop_filter16y_mbedge_ ## NAME(uint8_t *dst,              \
                                             ptrdiff_t stride,          \
                                             int e, int i, int hvt);    \
void ff_vp8_h_loop_filter16y_mbedge_ ## NAME(uint8_t *dst,              \
                                             ptrdiff_t stride,          \
                                             int e, int i, int hvt);    \
void ff_vp8_v_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU,             \
                                             uint8_t *dstV,             \
                                             ptrdiff_t s,               \
                                             int e, int i, int hvt);    \
void ff_vp8_h_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU,             \
                                             uint8_t *dstV,             \
                                             ptrdiff_t s,               \
                                             int e, int i, int hvt);

/* one full prototype set per ISA level used in ff_vp8dsp_init_x86() */
DECLARE_LOOP_FILTER(mmx)
DECLARE_LOOP_FILTER(mmxext)
DECLARE_LOOP_FILTER(sse2)
DECLARE_LOOP_FILTER(ssse3)
DECLARE_LOOP_FILTER(sse4)

#endif /* HAVE_YASM */

/*
 * Install the three six-tap epel routines (h6, v6, h6v6) of width SIZE and
 * ISA suffix OPT into row IDX of c->put_vp8_epel_pixels_tab, at [0][2],
 * [2][0] and [2][2] respectively.
 *
 * Requires a pointer named c in scope.  The body is wrapped in
 * do { } while (0) so the macro expands to exactly one statement and stays
 * correct under an unbraced if/else; terminate the invocation with ';'.
 */
#define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \
    do { \
        c->put_vp8_epel_pixels_tab[IDX][0][2] = ff_put_vp8_epel ## SIZE ## _h6_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][2][0] = ff_put_vp8_epel ## SIZE ## _v6_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][2][2] = ff_put_vp8_epel ## SIZE ## _h6v6_ ## OPT; \
    } while (0)

/*
 * Install all eight filtered epel routines of width SIZE and ISA suffix OPT
 * into row IDX of c->put_vp8_epel_pixels_tab: the five entries involving a
 * four-tap filter are set here (h4 -> [0][1], v4 -> [1][0], h4v4 -> [1][1],
 * h6v4 -> [1][2], h4v6 -> [2][1]) and the three six-tap entries are
 * delegated to VP8_LUMA_MC_FUNC.  Entry [0][0] is not touched.
 *
 * Requires a pointer named c in scope.  The body is wrapped in
 * do { } while (0) so the macro expands to exactly one statement and stays
 * correct under an unbraced if/else; terminate the invocation with ';'.
 */
#define VP8_MC_FUNC(IDX, SIZE, OPT) \
    do { \
        c->put_vp8_epel_pixels_tab[IDX][0][1] = ff_put_vp8_epel ## SIZE ## _h4_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][1][0] = ff_put_vp8_epel ## SIZE ## _v4_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][1][1] = ff_put_vp8_epel ## SIZE ## _h4v4_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][1][2] = ff_put_vp8_epel ## SIZE ## _h6v4_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][2][1] = ff_put_vp8_epel ## SIZE ## _h4v6_ ## OPT; \
        VP8_LUMA_MC_FUNC(IDX, SIZE, OPT); \
    } while (0)

/*
 * Install the bilinear routines of width SIZE and ISA suffix OPT into row
 * IDX of c->put_vp8_bilinear_pixels_tab.  Only three routines exist per
 * width, so they are shared between slots: _h_ fills [0][1] and [0][2],
 * _v_ fills [1][0] and [2][0], and _hv_ fills the four remaining slots with
 * both indices non-zero.  Entry [0][0] is not touched.
 *
 * Requires a pointer named c in scope.  The body is wrapped in
 * do { } while (0) so the macro expands to exactly one statement and stays
 * correct under an unbraced if/else; terminate the invocation with ';'.
 */
#define VP8_BILINEAR_MC_FUNC(IDX, SIZE, OPT) \
    do { \
        c->put_vp8_bilinear_pixels_tab[IDX][0][1] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][0][2] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][1][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][1][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][1][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][2][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][2][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][2][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
    } while (0)


/**
 * Install the x86 motion-compensation routines into a VP8DSPContext.
 *
 * Entries of c->put_vp8_epel_pixels_tab and c->put_vp8_bilinear_pixels_tab
 * are filled according to the flags returned by av_get_cpu_flags().  The
 * checks run from the oldest instruction set to the newest, and a later
 * block overwrites any entry an earlier block already set, so the order of
 * the blocks is significant.  In this function the first table index is
 * always paired with a fixed width: 0 with 16, 1 with 8, 2 with 4.
 *
 * Does nothing when HAVE_YASM is not set.
 *
 * @param c context whose function tables are updated in place
 */
av_cold void ff_vp78dsp_init_x86(VP8DSPContext *c)
{
#if HAVE_YASM
    int cpu_flags = av_get_cpu_flags();

    /* [..][0][0] slots: the ff_put_vp8_pixels* routines, shared by both tables */
    if (EXTERNAL_MMX(cpu_flags)) {
#if ARCH_X86_32
        c->put_vp8_epel_pixels_tab[0][0][0]     =
        c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_mmx;
#endif
        c->put_vp8_epel_pixels_tab[1][0][0]     =
        c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx;
    }

    /* note that 4-tap width=16 functions are missing because w=16
     * is only used for luma, and luma is always a copy or sixtap. */
    if (EXTERNAL_MMXEXT(cpu_flags)) {
        VP8_MC_FUNC(2, 4, mmxext);
        VP8_BILINEAR_MC_FUNC(2, 4, mmxext);
#if ARCH_X86_32
        /* the 8- and 16-wide MMXEXT wrappers are only generated on x86-32 */
        VP8_LUMA_MC_FUNC(0, 16, mmxext);
        VP8_MC_FUNC(1, 8, mmxext);
        VP8_BILINEAR_MC_FUNC(0, 16, mmxext);
        VP8_BILINEAR_MC_FUNC(1,  8, mmxext);
#endif
    }

    /* replaces the MMX 16-wide pixels routine set above (if any) */
    if (EXTERNAL_SSE(cpu_flags)) {
        c->put_vp8_epel_pixels_tab[0][0][0]     =
        c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
    }

    /* taken when either the SSE2 or the SSE2SLOW flag is set */
    if (HAVE_SSE2_EXTERNAL && cpu_flags & (AV_CPU_FLAG_SSE2 | AV_CPU_FLAG_SSE2SLOW)) {
        VP8_LUMA_MC_FUNC(0, 16, sse2);
        VP8_MC_FUNC(1, 8, sse2);
        VP8_BILINEAR_MC_FUNC(0, 16, sse2);
        VP8_BILINEAR_MC_FUNC(1, 8, sse2);
    }

    /* SSSE3 provides every width, including the 4-wide row left to MMXEXT by SSE2 */
    if (EXTERNAL_SSSE3(cpu_flags)) {
        VP8_LUMA_MC_FUNC(0, 16, ssse3);
        VP8_MC_FUNC(1, 8, ssse3);
        VP8_MC_FUNC(2, 4, ssse3);
        VP8_BILINEAR_MC_FUNC(0, 16, ssse3);
        VP8_BILINEAR_MC_FUNC(1, 8, ssse3);
        VP8_BILINEAR_MC_FUNC(2, 4, ssse3);
    }
#endif /* HAVE_YASM */
}

/**
 * Install the x86 inverse-transform and loop-filter routines into a
 * VP8DSPContext.
 *
 * Members of c are assigned according to the flags returned by
 * av_get_cpu_flags().  The checks run from the oldest instruction set to
 * the newest, and a later block overwrites any member an earlier block
 * already set, so the order of the blocks is significant.
 *
 * Does nothing when HAVE_YASM is not set.
 *
 * @param c context whose function pointers are updated in place
 */
av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c)
{
#if HAVE_YASM
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_MMX(cpu_flags)) {
        c->vp8_idct_dc_add    = ff_vp8_idct_dc_add_mmx;
        c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_mmx;
#if ARCH_X86_32
        /* the remaining MMX routines are only installed on x86-32 */
        c->vp8_idct_dc_add4y  = ff_vp8_idct_dc_add4y_mmx;
        c->vp8_idct_add       = ff_vp8_idct_add_mmx;
        c->vp8_luma_dc_wht    = ff_vp8_luma_dc_wht_mmx;

        c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmx;
        c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmx;

        c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmx;
        c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmx;
        c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmx;
        c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmx;

        /* the non-"inner" context members take the _mbedge_ routines */
        c->vp8_v_loop_filter16y       = ff_vp8_v_loop_filter16y_mbedge_mmx;
        c->vp8_h_loop_filter16y       = ff_vp8_h_loop_filter16y_mbedge_mmx;
        c->vp8_v_loop_filter8uv       = ff_vp8_v_loop_filter8uv_mbedge_mmx;
        c->vp8_h_loop_filter8uv       = ff_vp8_h_loop_filter8uv_mbedge_mmx;
#endif
    }

    /* MMXEXT only replaces the loop filters, and only on x86-32; on other
     * builds this block is empty. */
    if (EXTERNAL_MMXEXT(cpu_flags)) {
#if ARCH_X86_32
        c->vp8_v_loop_filter_simple   = ff_vp8_v_loop_filter_simple_mmxext;
        c->vp8_h_loop_filter_simple   = ff_vp8_h_loop_filter_simple_mmxext;

        c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmxext;
        c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmxext;
        c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmxext;
        c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmxext;

        c->vp8_v_loop_filter16y       = ff_vp8_v_loop_filter16y_mbedge_mmxext;
        c->vp8_h_loop_filter16y       = ff_vp8_h_loop_filter16y_mbedge_mmxext;
        c->vp8_v_loop_filter8uv       = ff_vp8_v_loop_filter8uv_mbedge_mmxext;
        c->vp8_h_loop_filter8uv       = ff_vp8_h_loop_filter8uv_mbedge_mmxext;
#endif
    }

    if (EXTERNAL_SSE(cpu_flags)) {
        c->vp8_idct_add                         = ff_vp8_idct_add_sse;
        c->vp8_luma_dc_wht                      = ff_vp8_luma_dc_wht_sse;
    }

    /* vertical SSE2 loop filters: installed when either the SSE2 or the
     * SSE2SLOW flag is set */
    if (HAVE_SSE2_EXTERNAL && cpu_flags & (AV_CPU_FLAG_SSE2 | AV_CPU_FLAG_SSE2SLOW)) {
        c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2;

        c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2;
        c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2;

        c->vp8_v_loop_filter16y       = ff_vp8_v_loop_filter16y_mbedge_sse2;
        c->vp8_v_loop_filter8uv       = ff_vp8_v_loop_filter8uv_mbedge_sse2;
    }

    /* horizontal SSE2 loop filters and dc_add4y use the EXTERNAL_SSE2 test
     * instead (presumably stricter than the flag test above with respect
     * to SSE2SLOW -- confirm against libavutil/x86/cpu.h) */
    if (EXTERNAL_SSE2(cpu_flags)) {
        c->vp8_idct_dc_add4y          = ff_vp8_idct_dc_add4y_sse2;

        c->vp8_h_loop_filter_simple   = ff_vp8_h_loop_filter_simple_sse2;

        c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
        c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;

        c->vp8_h_loop_filter16y       = ff_vp8_h_loop_filter16y_mbedge_sse2;
        c->vp8_h_loop_filter8uv       = ff_vp8_h_loop_filter8uv_mbedge_sse2;
    }

    /* SSSE3 replaces the full loop-filter set, both directions */
    if (EXTERNAL_SSSE3(cpu_flags)) {
        c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_ssse3;
        c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_ssse3;

        c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_ssse3;
        c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_ssse3;
        c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_ssse3;
        c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_ssse3;

        c->vp8_v_loop_filter16y       = ff_vp8_v_loop_filter16y_mbedge_ssse3;
        c->vp8_h_loop_filter16y       = ff_vp8_h_loop_filter16y_mbedge_ssse3;
        c->vp8_v_loop_filter8uv       = ff_vp8_v_loop_filter8uv_mbedge_ssse3;
        c->vp8_h_loop_filter8uv       = ff_vp8_h_loop_filter8uv_mbedge_ssse3;
    }

    /* SSE4 only replaces dc_add and the horizontal simple/mbedge filters;
     * everything else keeps the value chosen by the blocks above */
    if (EXTERNAL_SSE4(cpu_flags)) {
        c->vp8_idct_dc_add            = ff_vp8_idct_dc_add_sse4;

        c->vp8_h_loop_filter_simple   = ff_vp8_h_loop_filter_simple_sse4;
        c->vp8_h_loop_filter16y       = ff_vp8_h_loop_filter16y_mbedge_sse4;
        c->vp8_h_loop_filter8uv       = ff_vp8_h_loop_filter8uv_mbedge_sse4;
    }
#endif /* HAVE_YASM */
}
Commit	Line	Data
2ba45a60 DM	1	/*
	2	* VP8 DSP functions x86-optimized
	3	* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
	4	* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
	5	*
	6	* This file is part of FFmpeg.
	7	*
	8	* FFmpeg is free software; you can redistribute it and/or
	9	* modify it under the terms of the GNU Lesser General Public
	10	* License as published by the Free Software Foundation; either
	11	* version 2.1 of the License, or (at your option) any later version.
	12	*
	13	* FFmpeg is distributed in the hope that it will be useful,
	14	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	15	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	16	* Lesser General Public License for more details.
	17	*
	18	* You should have received a copy of the GNU Lesser General Public
	19	* License along with FFmpeg; if not, write to the Free Software
	20	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
	21	*/
	22
	23	#include "libavutil/attributes.h"
	24	#include "libavutil/cpu.h"
	25	#include "libavutil/mem.h"
	26	#include "libavutil/x86/asm.h"
	27	#include "libavutil/x86/cpu.h"
	28	#include "libavcodec/vp8dsp.h"
	29
	30	#if HAVE_YASM
	31
	32	/*
	33	* MC functions
	34	*/
	35	void ff_put_vp8_epel4_h4_mmxext(uint8_t *dst, ptrdiff_t dststride,
	36	uint8_t *src, ptrdiff_t srcstride,
	37	int height, int mx, int my);
	38	void ff_put_vp8_epel4_h6_mmxext(uint8_t *dst, ptrdiff_t dststride,
	39	uint8_t *src, ptrdiff_t srcstride,
	40	int height, int mx, int my);
	41	void ff_put_vp8_epel4_v4_mmxext(uint8_t *dst, ptrdiff_t dststride,
	42	uint8_t *src, ptrdiff_t srcstride,
	43	int height, int mx, int my);
	44	void ff_put_vp8_epel4_v6_mmxext(uint8_t *dst, ptrdiff_t dststride,
	45	uint8_t *src, ptrdiff_t srcstride,
	46	int height, int mx, int my);
	47
	48	void ff_put_vp8_epel8_h4_sse2 (uint8_t *dst, ptrdiff_t dststride,
	49	uint8_t *src, ptrdiff_t srcstride,
	50	int height, int mx, int my);
	51	void ff_put_vp8_epel8_h6_sse2 (uint8_t *dst, ptrdiff_t dststride,
	52	uint8_t *src, ptrdiff_t srcstride,
	53	int height, int mx, int my);
	54	void ff_put_vp8_epel8_v4_sse2 (uint8_t *dst, ptrdiff_t dststride,
	55	uint8_t *src, ptrdiff_t srcstride,
	56	int height, int mx, int my);
	57	void ff_put_vp8_epel8_v6_sse2 (uint8_t *dst, ptrdiff_t dststride,
	58	uint8_t *src, ptrdiff_t srcstride,
	59	int height, int mx, int my);
	60
	61	void ff_put_vp8_epel4_h4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
	62	uint8_t *src, ptrdiff_t srcstride,
	63	int height, int mx, int my);
	64	void ff_put_vp8_epel4_h6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
65	uint8_t *src, ptrdiff_t srcstride,
66	int height, int mx, int my);
67	void ff_put_vp8_epel4_v4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
68	uint8_t *src, ptrdiff_t srcstride,
69	int height, int mx, int my);
70	void ff_put_vp8_epel4_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
71	uint8_t *src, ptrdiff_t srcstride,
72	int height, int mx, int my);
73	void ff_put_vp8_epel8_h4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
74	uint8_t *src, ptrdiff_t srcstride,
75	int height, int mx, int my);
76	void ff_put_vp8_epel8_h6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
77	uint8_t *src, ptrdiff_t srcstride,
78	int height, int mx, int my);
79	void ff_put_vp8_epel8_v4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
80	uint8_t *src, ptrdiff_t srcstride,
81	int height, int mx, int my);
82	void ff_put_vp8_epel8_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
83	uint8_t *src, ptrdiff_t srcstride,
84	int height, int mx, int my);
85
86	void ff_put_vp8_bilinear4_h_mmxext(uint8_t *dst, ptrdiff_t dststride,
87	uint8_t *src, ptrdiff_t srcstride,
88	int height, int mx, int my);
89	void ff_put_vp8_bilinear8_h_sse2 (uint8_t *dst, ptrdiff_t dststride,
90	uint8_t *src, ptrdiff_t srcstride,
91	int height, int mx, int my);
92	void ff_put_vp8_bilinear4_h_ssse3 (uint8_t *dst, ptrdiff_t dststride,
93	uint8_t *src, ptrdiff_t srcstride,
94	int height, int mx, int my);
95	void ff_put_vp8_bilinear8_h_ssse3 (uint8_t *dst, ptrdiff_t dststride,
96	uint8_t *src, ptrdiff_t srcstride,
97	int height, int mx, int my);
98
99	void ff_put_vp8_bilinear4_v_mmxext(uint8_t *dst, ptrdiff_t dststride,
100	uint8_t *src, ptrdiff_t srcstride,
101	int height, int mx, int my);
102	void ff_put_vp8_bilinear8_v_sse2 (uint8_t *dst, ptrdiff_t dststride,
103	uint8_t *src, ptrdiff_t srcstride,
104	int height, int mx, int my);
105	void ff_put_vp8_bilinear4_v_ssse3 (uint8_t *dst, ptrdiff_t dststride,
106	uint8_t *src, ptrdiff_t srcstride,
107	int height, int mx, int my);
108	void ff_put_vp8_bilinear8_v_ssse3 (uint8_t *dst, ptrdiff_t dststride,
109	uint8_t *src, ptrdiff_t srcstride,
110	int height, int mx, int my);
111
112
113	void ff_put_vp8_pixels8_mmx (uint8_t *dst, ptrdiff_t dststride,
114	uint8_t *src, ptrdiff_t srcstride,
115	int height, int mx, int my);
116	void ff_put_vp8_pixels16_mmx(uint8_t *dst, ptrdiff_t dststride,
117	uint8_t *src, ptrdiff_t srcstride,
118	int height, int mx, int my);
119	void ff_put_vp8_pixels16_sse(uint8_t *dst, ptrdiff_t dststride,
120	uint8_t *src, ptrdiff_t srcstride,
121	int height, int mx, int my);
122
123	#define TAP_W16(OPT, FILTERTYPE, TAPTYPE) \
124	static void ff_put_vp8_ ## FILTERTYPE ## 16_ ## TAPTYPE ## _ ## OPT( \
	125	uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
126	ptrdiff_t srcstride, int height, int mx, int my) \
127	{ \
128	ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
129	dst, dststride, src, srcstride, height, mx, my); \
130	ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
131	dst + 8, dststride, src + 8, srcstride, height, mx, my); \
132	}
133	#define TAP_W8(OPT, FILTERTYPE, TAPTYPE) \
134	static void ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
	135	uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
136	ptrdiff_t srcstride, int height, int mx, int my) \
137	{ \
138	ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \
139	dst, dststride, src, srcstride, height, mx, my); \
140	ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \
141	dst + 4, dststride, src + 4, srcstride, height, mx, my); \
142	}
143
144	#if ARCH_X86_32
145	TAP_W8 (mmxext, epel, h4)
146	TAP_W8 (mmxext, epel, h6)
147	TAP_W16(mmxext, epel, h6)
148	TAP_W8 (mmxext, epel, v4)
149	TAP_W8 (mmxext, epel, v6)
150	TAP_W16(mmxext, epel, v6)
151	TAP_W8 (mmxext, bilinear, h)
152	TAP_W16(mmxext, bilinear, h)
153	TAP_W8 (mmxext, bilinear, v)
154	TAP_W16(mmxext, bilinear, v)
155	#endif
156
157	TAP_W16(sse2, epel, h6)
158	TAP_W16(sse2, epel, v6)
159	TAP_W16(sse2, bilinear, h)
160	TAP_W16(sse2, bilinear, v)
161
162	TAP_W16(ssse3, epel, h6)
163	TAP_W16(ssse3, epel, v6)
164	TAP_W16(ssse3, bilinear, h)
165	TAP_W16(ssse3, bilinear, v)
166
167	#define HVTAP(OPT, ALIGN, TAPNUMX, TAPNUMY, SIZE, MAXHEIGHT) \
168	static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT( \
	169	uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
170	ptrdiff_t srcstride, int height, int mx, int my) \
171	{ \
172	DECLARE_ALIGNED(ALIGN, uint8_t, tmp)[SIZE * (MAXHEIGHT + TAPNUMY - 1)]; \
	173	uint8_t *tmpptr = tmp + SIZE * (TAPNUMY / 2 - 1); \
174	src -= srcstride * (TAPNUMY / 2 - 1); \
175	ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## _ ## OPT( \
176	tmp, SIZE, src, srcstride, height + TAPNUMY - 1, mx, my); \
177	ff_put_vp8_epel ## SIZE ## _v ## TAPNUMY ## _ ## OPT( \
178	dst, dststride, tmpptr, SIZE, height, mx, my); \
179	}
180
181	#if ARCH_X86_32
182	#define HVTAPMMX(x, y) \
183	HVTAP(mmxext, 8, x, y, 4, 8) \
184	HVTAP(mmxext, 8, x, y, 8, 16)
185
186	HVTAP(mmxext, 8, 6, 6, 16, 16)
187	#else
188	#define HVTAPMMX(x, y) \
189	HVTAP(mmxext, 8, x, y, 4, 8)
190	#endif
191
192	HVTAPMMX(4, 4)
193	HVTAPMMX(4, 6)
194	HVTAPMMX(6, 4)
195	HVTAPMMX(6, 6)
196
197	#define HVTAPSSE2(x, y, w) \
198	HVTAP(sse2, 16, x, y, w, 16) \
199	HVTAP(ssse3, 16, x, y, w, 16)
200
201	HVTAPSSE2(4, 4, 8)
202	HVTAPSSE2(4, 6, 8)
203	HVTAPSSE2(6, 4, 8)
204	HVTAPSSE2(6, 6, 8)
205	HVTAPSSE2(6, 6, 16)
206
207	HVTAP(ssse3, 16, 4, 4, 4, 8)
208	HVTAP(ssse3, 16, 4, 6, 4, 8)
209	HVTAP(ssse3, 16, 6, 4, 4, 8)
210	HVTAP(ssse3, 16, 6, 6, 4, 8)
211
212	#define HVBILIN(OPT, ALIGN, SIZE, MAXHEIGHT) \
213	static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \
	214	uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
215	ptrdiff_t srcstride, int height, int mx, int my) \
216	{ \
217	DECLARE_ALIGNED(ALIGN, uint8_t, tmp)[SIZE * (MAXHEIGHT + 2)]; \
218	ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT( \
219	tmp, SIZE, src, srcstride, height + 1, mx, my); \
220	ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT( \
221	dst, dststride, tmp, SIZE, height, mx, my); \
222	}
223
224	HVBILIN(mmxext, 8, 4, 8)
225	#if ARCH_X86_32
226	HVBILIN(mmxext, 8, 8, 16)
227	HVBILIN(mmxext, 8, 16, 16)
228	#endif
229	HVBILIN(sse2, 8, 8, 16)
230	HVBILIN(sse2, 8, 16, 16)
231	HVBILIN(ssse3, 8, 4, 8)
232	HVBILIN(ssse3, 8, 8, 16)
233	HVBILIN(ssse3, 8, 16, 16)
234
235	void ff_vp8_idct_dc_add_mmx(uint8_t *dst, int16_t block[16],
236	ptrdiff_t stride);
237	void ff_vp8_idct_dc_add_sse4(uint8_t *dst, int16_t block[16],
238	ptrdiff_t stride);
239	void ff_vp8_idct_dc_add4y_mmx(uint8_t *dst, int16_t block[4][16],
240	ptrdiff_t stride);
241	void ff_vp8_idct_dc_add4y_sse2(uint8_t *dst, int16_t block[4][16],
242	ptrdiff_t stride);
243	void ff_vp8_idct_dc_add4uv_mmx(uint8_t *dst, int16_t block[2][16],
244	ptrdiff_t stride);
245	void ff_vp8_luma_dc_wht_mmx(int16_t block[4][4][16], int16_t dc[16]);
246	void ff_vp8_luma_dc_wht_sse(int16_t block[4][4][16], int16_t dc[16]);
247	void ff_vp8_idct_add_mmx(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
248	void ff_vp8_idct_add_sse(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
249
/*
 * Declare the complete set of loop-filter prototypes for one instruction-set
 * variant OPT (the token is pasted onto the end of every function name):
 *   - the two "simple" filters (v and h),
 *   - the "inner" filters for the 16y and 8uv entry points (v and h),
 *   - the "mbedge" filters for the 16y and 8uv entry points (v and h).
 * The expansion already ends in a semicolon, so the macro is invoked without
 * a trailing one.
 */
#define DECLARE_LOOP_FILTER(OPT)                                              \
void ff_vp8_v_loop_filter_simple_ ## OPT(uint8_t *dst, ptrdiff_t stride,      \
                                         int flim);                           \
void ff_vp8_h_loop_filter_simple_ ## OPT(uint8_t *dst, ptrdiff_t stride,      \
                                         int flim);                           \
                                                                              \
void ff_vp8_v_loop_filter16y_inner_ ## OPT(uint8_t *dst, ptrdiff_t stride,    \
                                           int e, int i, int hvt);            \
void ff_vp8_h_loop_filter16y_inner_ ## OPT(uint8_t *dst, ptrdiff_t stride,    \
                                           int e, int i, int hvt);            \
void ff_vp8_v_loop_filter8uv_inner_ ## OPT(uint8_t *dstU, uint8_t *dstV,      \
                                           ptrdiff_t stride,                  \
                                           int e, int i, int hvt);            \
void ff_vp8_h_loop_filter8uv_inner_ ## OPT(uint8_t *dstU, uint8_t *dstV,      \
                                           ptrdiff_t stride,                  \
                                           int e, int i, int hvt);            \
                                                                              \
void ff_vp8_v_loop_filter16y_mbedge_ ## OPT(uint8_t *dst, ptrdiff_t stride,   \
                                            int e, int i, int hvt);           \
void ff_vp8_h_loop_filter16y_mbedge_ ## OPT(uint8_t *dst, ptrdiff_t stride,   \
                                            int e, int i, int hvt);           \
void ff_vp8_v_loop_filter8uv_mbedge_ ## OPT(uint8_t *dstU, uint8_t *dstV,     \
                                            ptrdiff_t stride,                 \
                                            int e, int i, int hvt);           \
void ff_vp8_h_loop_filter8uv_mbedge_ ## OPT(uint8_t *dstU, uint8_t *dstV,     \
                                            ptrdiff_t stride,                 \
                                            int e, int i, int hvt);
285
/* Declare the loop-filter prototypes for every instruction-set variant that
 * ff_vp8dsp_init_x86() refers to.  No trailing semicolons: the macro
 * expansion already ends with one. */
DECLARE_LOOP_FILTER(mmx)
DECLARE_LOOP_FILTER(mmxext)
DECLARE_LOOP_FILTER(sse2)
DECLARE_LOOP_FILTER(ssse3)
DECLARE_LOOP_FILTER(sse4)
291
292	#endif /* HAVE_YASM */
293
/*
 * Install the six-tap entries of c->put_vp8_epel_pixels_tab[IDX]:
 * [0][2] <- h6, [2][0] <- v6, [2][2] <- h6v6, using the functions named
 * ff_put_vp8_epel<SIZE>_*_<OPT>.
 *
 * Requires a pointer named `c` in scope at the point of use.  The body is
 * wrapped in do { } while (0) so the expansion is a single statement and is
 * safe under an unbraced if/else; invoke it with a trailing semicolon.
 */
#define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT)                                                 \
    do {                                                                                 \
        c->put_vp8_epel_pixels_tab[IDX][0][2] = ff_put_vp8_epel ## SIZE ## _h6_ ## OPT;   \
        c->put_vp8_epel_pixels_tab[IDX][2][0] = ff_put_vp8_epel ## SIZE ## _v6_ ## OPT;   \
        c->put_vp8_epel_pixels_tab[IDX][2][2] = ff_put_vp8_epel ## SIZE ## _h6v6_ ## OPT; \
    } while (0)

/*
 * Install every filtered entry of c->put_vp8_epel_pixels_tab[IDX]: the
 * entries that involve a four-tap function (h4, v4, h4v4, h6v4, h4v6) are
 * set here, and the six-tap-only entries are set through VP8_LUMA_MC_FUNC.
 * The [0][0] entry is not touched.
 *
 * Same requirements as VP8_LUMA_MC_FUNC: a pointer named `c` in scope, and a
 * trailing semicolon at the call site.
 */
#define VP8_MC_FUNC(IDX, SIZE, OPT)                                                      \
    do {                                                                                 \
        c->put_vp8_epel_pixels_tab[IDX][0][1] = ff_put_vp8_epel ## SIZE ## _h4_ ## OPT;   \
        c->put_vp8_epel_pixels_tab[IDX][1][0] = ff_put_vp8_epel ## SIZE ## _v4_ ## OPT;   \
        c->put_vp8_epel_pixels_tab[IDX][1][1] = ff_put_vp8_epel ## SIZE ## _h4v4_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][1][2] = ff_put_vp8_epel ## SIZE ## _h6v4_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][2][1] = ff_put_vp8_epel ## SIZE ## _h4v6_ ## OPT; \
        VP8_LUMA_MC_FUNC(IDX, SIZE, OPT);                                                \
    } while (0)
306
/*
 * Install the bilinear functions into c->put_vp8_bilinear_pixels_tab[IDX].
 * Indices 1 and 2 of each dimension receive the same function: the _h_
 * variant for [0][1] and [0][2], the _v_ variant for [1][0] and [2][0], and
 * the _hv_ variant for the four remaining combinations.  The [0][0] entry is
 * not touched.
 *
 * Requires a pointer named `c` in scope at the point of use.  The body is
 * wrapped in do { } while (0) so the expansion is a single statement and is
 * safe under an unbraced if/else; invoke it with a trailing semicolon.
 */
#define VP8_BILINEAR_MC_FUNC(IDX, SIZE, OPT)                                                      \
    do {                                                                                          \
        c->put_vp8_bilinear_pixels_tab[IDX][0][1] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT;     \
        c->put_vp8_bilinear_pixels_tab[IDX][0][2] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT;     \
        c->put_vp8_bilinear_pixels_tab[IDX][1][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT;     \
        c->put_vp8_bilinear_pixels_tab[IDX][1][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT;    \
        c->put_vp8_bilinear_pixels_tab[IDX][1][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT;    \
        c->put_vp8_bilinear_pixels_tab[IDX][2][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT;     \
        c->put_vp8_bilinear_pixels_tab[IDX][2][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT;    \
        c->put_vp8_bilinear_pixels_tab[IDX][2][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT;    \
    } while (0)
316
317
318	av_cold void ff_vp78dsp_init_x86(VP8DSPContext *c)
319	{
320	#if HAVE_YASM
321	int cpu_flags = av_get_cpu_flags();
322
323	if (EXTERNAL_MMX(cpu_flags)) {
324	#if ARCH_X86_32
325	c->put_vp8_epel_pixels_tab[0][0][0] =
326	c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_mmx;
327	#endif
328	c->put_vp8_epel_pixels_tab[1][0][0] =
329	c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx;
330	}
331
332	/* note that 4-tap width=16 functions are missing because w=16
333	* is only used for luma, and luma is always a copy or sixtap. */
334	if (EXTERNAL_MMXEXT(cpu_flags)) {
335	VP8_MC_FUNC(2, 4, mmxext);
336	VP8_BILINEAR_MC_FUNC(2, 4, mmxext);
337	#if ARCH_X86_32
338	VP8_LUMA_MC_FUNC(0, 16, mmxext);
339	VP8_MC_FUNC(1, 8, mmxext);
340	VP8_BILINEAR_MC_FUNC(0, 16, mmxext);
341	VP8_BILINEAR_MC_FUNC(1, 8, mmxext);
342	#endif
343	}
344
345	if (EXTERNAL_SSE(cpu_flags)) {
346	c->put_vp8_epel_pixels_tab[0][0][0] =
347	c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
348	}
349
350	if (HAVE_SSE2_EXTERNAL && cpu_flags & (AV_CPU_FLAG_SSE2 \| AV_CPU_FLAG_SSE2SLOW)) {
351	VP8_LUMA_MC_FUNC(0, 16, sse2);
352	VP8_MC_FUNC(1, 8, sse2);
353	VP8_BILINEAR_MC_FUNC(0, 16, sse2);
354	VP8_BILINEAR_MC_FUNC(1, 8, sse2);
355	}
356
357	if (EXTERNAL_SSSE3(cpu_flags)) {
358	VP8_LUMA_MC_FUNC(0, 16, ssse3);
359	VP8_MC_FUNC(1, 8, ssse3);
360	VP8_MC_FUNC(2, 4, ssse3);
361	VP8_BILINEAR_MC_FUNC(0, 16, ssse3);
362	VP8_BILINEAR_MC_FUNC(1, 8, ssse3);
363	VP8_BILINEAR_MC_FUNC(2, 4, ssse3);
364	}
365	#endif /* HAVE_YASM */
366	}
367
368	av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c)
369	{
370	#if HAVE_YASM
371	int cpu_flags = av_get_cpu_flags();
372
373	if (EXTERNAL_MMX(cpu_flags)) {
374	c->vp8_idct_dc_add = ff_vp8_idct_dc_add_mmx;
375	c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_mmx;
376	#if ARCH_X86_32
377	c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_mmx;
378	c->vp8_idct_add = ff_vp8_idct_add_mmx;
379	c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_mmx;
380
381	c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmx;
382	c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmx;
383
384	c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmx;
385	c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmx;
386	c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmx;
387	c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmx;
388
389	c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmx;
390	c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmx;
391	c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmx;
392	c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmx;
393	#endif
394	}
395
396	/* note that 4-tap width=16 functions are missing because w=16
397	* is only used for luma, and luma is always a copy or sixtap. */
398	if (EXTERNAL_MMXEXT(cpu_flags)) {
399	#if ARCH_X86_32
400	c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmxext;
401	c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmxext;
402
403	c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmxext;
404	c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmxext;
405	c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmxext;
406	c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmxext;
407
408	c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmxext;
409	c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmxext;
410	c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmxext;
411	c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmxext;
412	#endif
413	}
414
415	if (EXTERNAL_SSE(cpu_flags)) {
416	c->vp8_idct_add = ff_vp8_idct_add_sse;
417	c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_sse;
418	}
419
420	if (HAVE_SSE2_EXTERNAL && cpu_flags & (AV_CPU_FLAG_SSE2 \| AV_CPU_FLAG_SSE2SLOW)) {
421	c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2;
422
423	c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2;
424	c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2;
425
426	c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_sse2;
427	c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_sse2;
428	}
429
430	if (EXTERNAL_SSE2(cpu_flags)) {
431	c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_sse2;
432
433	c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;
434
435	c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
436	c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
437
438	c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse2;
439	c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse2;
440	}
441
442	if (EXTERNAL_SSSE3(cpu_flags)) {
443	c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_ssse3;
444	c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_ssse3;
445
446	c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_ssse3;
447	c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_ssse3;
448	c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_ssse3;
449	c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_ssse3;
450
451	c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_ssse3;
452	c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_ssse3;
453	c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_ssse3;
454	c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_ssse3;
455	}
456
457	if (EXTERNAL_SSE4(cpu_flags)) {
458	c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4;
459
460	c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse4;
461	c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse4;
462	c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse4;
463	}
464	#endif /* HAVE_YASM */
465	}