[deb_ffmpeg.git] / ffmpeg / libswscale / x86 / hscale_fast_bilinear_simd.c

/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "../swscale_internal.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"

#define RET 0xC3 // near return opcode for x86
// Prefetch mnemonic pasted into the inline asm of the scaler entry points
// below to start fetching the first bytes of the source row.
#define PREFETCH "prefetchnta"

#if HAVE_INLINE_ASM
/**
 * Generate (or only measure) the runtime MMXEXT fast-bilinear horizontal
 * scaler code together with its weight and position tables.
 *
 * Two inline-asm blocks below contain template code fragments. For every
 * group of four output pixels one template is copied into filterCode and the
 * immediates of its two pshufw instructions are patched so that they select
 * the input pixels this group needs.
 *
 * @param dstW       output width in pixels
 * @param xInc       source step per output pixel; the code treats it as 16.16
 *                   fixed point (integer part is ">> 16", fraction "& 0xFFFF")
 * @param filterCode buffer that receives the generated code, or NULL to only
 *                   compute the required size without writing anything
 * @param filter     receives one 7-bit weight per output pixel
 *                   (written only when filterCode is non-NULL)
 * @param filterPos  receives, at index i / 2, the source position of the group
 *                   starting at output pixel i
 *                   (written only when filterCode is non-NULL)
 * @param numSplits  code is generated for dstW / numSplits output pixels
 * @return number of bytes of generated code: all fragments plus the final RET
 */
av_cold int ff_init_hscaler_mmxext(int dstW, int xInc, uint8_t *filterCode,
                                       int16_t *filter, int32_t *filterPos,
                                       int numSplits)
{
    uint8_t *fragmentA;          // address of the 5-input-pixel template
    x86_reg imm8OfPShufW1A;      // offset of its first  pshufw immediate
    x86_reg imm8OfPShufW2A;      // offset of its second pshufw immediate
    x86_reg fragmentLengthA;     // its length in bytes
    uint8_t *fragmentB;          // same four values for the 4-input-pixel template
    x86_reg imm8OfPShufW1B;
    x86_reg imm8OfPShufW2B;
    x86_reg fragmentLengthB;
    int fragmentPos;             // write offset into filterCode

    int xpos, i;

    // create an optimized horizontal scaling routine
    /* This scaler is made of runtime-generated MMXEXT code using specially tuned
     * pshufw instructions. For every four output pixels, if four input pixels
     * are enough for the fast bilinear scaling, then a chunk of fragmentB is
     * used. If five input pixels are needed, then a chunk of fragmentA is used.
     */

    // code fragment

    /* Fragment A. The template between labels 0 and 9 is jumped over here; it
     * only serves as the source for the memcpy() further down. The code after
     * label 9 reports the template's address (%0), the offsets of the byte
     * just before labels 1 and 2 (%1, %2: the last byte of each pshufw, i.e.
     * its imm8) and the template length (%3), all relative to label 0.
     *
     * Register roles inside the template, as loaded by the asm in
     * ff_hyscale_fast_mmxext()/ff_hcscale_fast_mmxext():
     *   REG_c = src, REG_D = dst, REG_d = filter, REG_b = filterPos,
     *   REG_a = running byte offset, REG_S/esi = current source position,
     *   mm7   = 0.
     */
    __asm__ volatile (
        "jmp                         9f                 \n\t"
        // Begin
        "0:                                             \n\t"
        "movq    (%%"REG_d", %%"REG_a"), %%mm3          \n\t" // four weights
        "movd    (%%"REG_c", %%"REG_S"), %%mm0          \n\t" // src bytes at pos
        "movd   1(%%"REG_c", %%"REG_S"), %%mm1          \n\t" // src bytes at pos + 1
        "punpcklbw                %%mm7, %%mm1          \n\t" // widen bytes to words
        "punpcklbw                %%mm7, %%mm0          \n\t"
        "pshufw                   $0xFF, %%mm1, %%mm1   \n\t" // imm8 patched later
        "1:                                             \n\t"
        "pshufw                   $0xFF, %%mm0, %%mm0   \n\t" // imm8 patched later
        "2:                                             \n\t"
        "psubw                    %%mm1, %%mm0          \n\t" // mm0 = p0 - p1
        "movl   8(%%"REG_b", %%"REG_a"), %%esi          \n\t" // position for the next group
        "pmullw                   %%mm3, %%mm0          \n\t" // (p0 - p1) * weight
        "psllw                       $7, %%mm1          \n\t" // p1 * 128
        "paddw                    %%mm1, %%mm0          \n\t" // p1 * 128 + (p0 - p1) * weight

        "movq                     %%mm0, (%%"REG_D", %%"REG_a") \n\t" // store four outputs

        "add                         $8, %%"REG_a"      \n\t" // advance by 4 int16_t
        // End
        "9:                                             \n\t"
        // "int $3                                         \n\t"
        "lea       " LOCAL_MANGLE(0b) ", %0             \n\t"
        "lea       " LOCAL_MANGLE(1b) ", %1             \n\t"
        "lea       " LOCAL_MANGLE(2b) ", %2             \n\t"
        "dec                         %1                 \n\t"
        "dec                         %2                 \n\t"
        "sub                         %0, %1             \n\t"
        "sub                         %0, %2             \n\t"
        "lea       " LOCAL_MANGLE(9b) ", %3             \n\t"
        "sub                         %0, %3             \n\t"


        : "=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A),
          "=r" (fragmentLengthA)
        );

    /* Fragment B: same as fragment A, except that a single movd is done and
     * both pixel vectors are produced by shuffling that one register. */
    __asm__ volatile (
        "jmp                         9f                 \n\t"
        // Begin
        "0:                                             \n\t"
        "movq    (%%"REG_d", %%"REG_a"), %%mm3          \n\t" // four weights
        "movd    (%%"REG_c", %%"REG_S"), %%mm0          \n\t" // src bytes at pos
        "punpcklbw                %%mm7, %%mm0          \n\t" // widen bytes to words
        "pshufw                   $0xFF, %%mm0, %%mm1   \n\t" // imm8 patched later
        "1:                                             \n\t"
        "pshufw                   $0xFF, %%mm0, %%mm0   \n\t" // imm8 patched later
        "2:                                             \n\t"
        "psubw                    %%mm1, %%mm0          \n\t" // mm0 = p0 - p1
        "movl   8(%%"REG_b", %%"REG_a"), %%esi          \n\t" // position for the next group
        "pmullw                   %%mm3, %%mm0          \n\t" // (p0 - p1) * weight
        "psllw                       $7, %%mm1          \n\t" // p1 * 128
        "paddw                    %%mm1, %%mm0          \n\t" // p1 * 128 + (p0 - p1) * weight

        "movq                     %%mm0, (%%"REG_D", %%"REG_a") \n\t" // store four outputs

        "add                         $8, %%"REG_a"      \n\t" // advance by 4 int16_t
        // End
        "9:                                             \n\t"
        // "int                       $3                   \n\t"
        "lea       " LOCAL_MANGLE(0b) ", %0             \n\t"
        "lea       " LOCAL_MANGLE(1b) ", %1             \n\t"
        "lea       " LOCAL_MANGLE(2b) ", %2             \n\t"
        "dec                         %1                 \n\t"
        "dec                         %2                 \n\t"
        "sub                         %0, %1             \n\t"
        "sub                         %0, %2             \n\t"
        "lea       " LOCAL_MANGLE(9b) ", %3             \n\t"
        "sub                         %0, %3             \n\t"


        : "=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B),
          "=r" (fragmentLengthB)
        );

    xpos        = 0; // lumXInc/2 - 0x8000; // difference between pixel centers
    fragmentPos = 0;

    // Walk every output pixel so that xpos and i end at their final values,
    // but emit one code fragment only at the start of each group of four.
    for (i = 0; i < dstW / numSplits; i++) {
        int xx = xpos >> 16; // integer source position of output pixel i

        if ((i & 3) == 0) {
            // a..d: source offsets of the four outputs relative to xx
            int a                  = 0;
            int b                  = ((xpos + xInc) >> 16) - xx;
            int c                  = ((xpos + xInc * 2) >> 16) - xx;
            int d                  = ((xpos + xInc * 3) >> 16) - xx;
            // 1 when pixel d + 1 still fits in the four loaded pixels, so the
            // single-load fragment B can be used; 0 selects fragment A
            int inc                = (d + 1 < 4);
            uint8_t *fragment      = inc ? fragmentB : fragmentA;
            x86_reg imm8OfPShufW1  = inc ? imm8OfPShufW1B : imm8OfPShufW1A;
            x86_reg imm8OfPShufW2  = inc ? imm8OfPShufW2B : imm8OfPShufW2A;
            x86_reg fragmentLength = inc ? fragmentLengthB : fragmentLengthA;
            // room left before the largest 2-bit shuffle index exceeds 3
            int maxShift           = 3 - (d + inc);
            int shift              = 0;

            if (filterCode) {
                // weight = inverted 16-bit fraction of the position, cut to 7 bits
                filter[i]        = ((xpos              & 0xFFFF) ^ 0xFFFF) >> 9;
                filter[i + 1]    = (((xpos + xInc)     & 0xFFFF) ^ 0xFFFF) >> 9;
                filter[i + 2]    = (((xpos + xInc * 2) & 0xFFFF) ^ 0xFFFF) >> 9;
                filter[i + 3]    = (((xpos + xInc * 3) & 0xFFFF) ^ 0xFFFF) >> 9;
                // i / 2 int32_t entries = 8 bytes per group, the same stride
                // by which the fragments advance REG_a
                filterPos[i / 2] = xx;

                memcpy(filterCode + fragmentPos, fragment, fragmentLength);

                // first pshufw builds the "next pixel" vector: with fragment B
                // (inc = 1) the indices are one higher, with fragment A the
                // +1 is already in the load address
                filterCode[fragmentPos + imm8OfPShufW1] =  (a + inc)       |
                                                          ((b + inc) << 2) |
                                                          ((c + inc) << 4) |
                                                          ((d + inc) << 6);
                // second pshufw builds the "current pixel" vector
                filterCode[fragmentPos + imm8OfPShufW2] =  a | (b << 2) |
                                                               (c << 4) |
                                                               (d << 6);

                if (i + 4 - inc >= dstW)
                    shift = maxShift;               // avoid overread
                else if ((filterPos[i / 2] & 3) <= maxShift)
                    shift = filterPos[i / 2] & 3;   // align

                // Move the load address back by shift pixels and add shift to
                // each of the four 2-bit indices (0x55 = 01010101b), which
                // selects the same pixels from the earlier address.
                if (shift && i >= shift) {
                    filterCode[fragmentPos + imm8OfPShufW1] += 0x55 * shift;
                    filterCode[fragmentPos + imm8OfPShufW2] += 0x55 * shift;
                    filterPos[i / 2]                        -= shift;
                }
            }

            // advanced even when filterCode is NULL so the size can be measured
            fragmentPos += fragmentLength;

            // Terminate the code emitted so far; fragmentPos is not moved past
            // the RET, so the next fragment overwrites it.
            if (filterCode)
                filterCode[fragmentPos] = RET;
        }
        xpos += xInc;
    }
    // source position reached after dstW / numSplits output pixels
    if (filterCode)
        filterPos[((i / 2) + 1) & (~1)] = xpos >> 16;  // needed to jump to the next part

    return fragmentPos + 1; // + 1 for the final RET
}

/**
 * Fast bilinear horizontal scaling of one luma row using the runtime
 * generated MMXEXT code stored in c->lumMmxextFilterCode.
 *
 * The generated code is called eight times in a row; after each call the
 * source and destination pointers are advanced so that the next call
 * continues where the previous one stopped.
 *
 * @param c        scaler context providing hLumFilterPos, hLumFilter and
 *                 lumMmxextFilterCode
 * @param dst      output row, one int16_t per pixel
 * @param dstWidth number of output pixels
 * @param src      input row, one byte per pixel
 * @param srcW     number of input pixels
 * @param xInc     source step per output pixel (used as 16.16 fixed point
 *                 by the tail fix-up loop)
 */
void ff_hyscale_fast_mmxext(SwsContext *c, int16_t *dst,
                                 int dstWidth, const uint8_t *src,
                                 int srcW, int xInc)
{
    int32_t *filterPos = c->hLumFilterPos;
    int16_t *filter    = c->hLumFilter;
    void    *mmxextFilterCode = c->lumMmxextFilterCode;
    int i;
#if defined(PIC)
    uint64_t ebxsave;   // REG_b is not in the clobber list under PIC: saved by hand
#endif
#if ARCH_X86_64
    uint64_t retsave;   // copy of the word at -8(%rsp), see below
#endif

    __asm__ volatile(
        // Prologue: save REG_b (PIC only) and, on x86-64, the word at
        // -8(%rsp), which every "call" below overwrites with its return
        // address.
#if defined(PIC)
        "mov               %%"REG_b", %5        \n\t"
#if ARCH_X86_64
        "mov               -8(%%rsp), %%"REG_a" \n\t"
        "mov               %%"REG_a", %6        \n\t"
#endif
#else
#if ARCH_X86_64
        "mov               -8(%%rsp), %%"REG_a" \n\t"
        "mov               %%"REG_a", %5        \n\t"
#endif
#endif
        "pxor                  %%mm7, %%mm7     \n\t" // zero for punpcklbw
        "mov                      %0, %%"REG_c" \n\t" // src
        "mov                      %1, %%"REG_D" \n\t" // dst
        "mov                      %2, %%"REG_d" \n\t" // filter
        "mov                      %3, %%"REG_b" \n\t" // filterPos
        "xor               %%"REG_a", %%"REG_a" \n\t" // i
        PREFETCH"        (%%"REG_c")            \n\t"
        PREFETCH"      32(%%"REG_c")            \n\t"
        PREFETCH"      64(%%"REG_c")            \n\t"

        // CALL_MMXEXT_FILTER_CODE: load the first source position into esi,
        // run the generated code, then add the filterPos entry found at byte
        // offset REG_a to the source pointer, add REG_a (bytes written) to
        // the destination pointer and reset REG_a. The x86-64 variant goes
        // through esi/REG_S because the table entry is 32 bits wide while the
        // pointer is 64 bits. The macro is also used by
        // ff_hcscale_fast_mmxext() further down in this file.
#if ARCH_X86_64
#define CALL_MMXEXT_FILTER_CODE \
        "movl            (%%"REG_b"), %%esi     \n\t"\
        "call                    *%4            \n\t"\
        "movl (%%"REG_b", %%"REG_a"), %%esi     \n\t"\
        "add               %%"REG_S", %%"REG_c" \n\t"\
        "add               %%"REG_a", %%"REG_D" \n\t"\
        "xor               %%"REG_a", %%"REG_a" \n\t"\

#else
#define CALL_MMXEXT_FILTER_CODE \
        "movl (%%"REG_b"), %%esi        \n\t"\
        "call         *%4                       \n\t"\
        "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
        "add               %%"REG_a", %%"REG_D" \n\t"\
        "xor               %%"REG_a", %%"REG_a" \n\t"\

#endif /* ARCH_X86_64 */

        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE

        // Epilogue: restore what the prologue saved.
#if defined(PIC)
        "mov                      %5, %%"REG_b" \n\t"
#if ARCH_X86_64
        "mov                      %6, %%"REG_a" \n\t"
        "mov               %%"REG_a", -8(%%rsp) \n\t"
#endif
#else
#if ARCH_X86_64
        "mov                      %5, %%"REG_a" \n\t"
        "mov               %%"REG_a", -8(%%rsp) \n\t"
#endif
#endif
        :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
           "m" (mmxextFilterCode)
#if defined(PIC)
          ,"m" (ebxsave)
#endif
#if ARCH_X86_64
          ,"m"(retsave)
#endif
        // "memory": the generated code stores to dst[] and loads src[],
        // filter[] and filterPos[] through the pointers, while the "m"
        // operands above only describe the pointer variables themselves.
        // Without this clobber the compiler is free to move the dst[] fix-up
        // stores below (or earlier writes to the input buffers) across the
        // asm statement.
        : "memory", "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
         ,"%"REG_b
#endif
    );

    // Outputs whose source position is at or beyond the last input pixel were
    // interpolated with a byte past the end of the row; replace them with the
    // last pixel scaled by 128.
    for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
        dst[i] = src[srcW-1]*128;
}

/**
 * Fast bilinear horizontal scaling of two chroma rows using the runtime
 * generated MMXEXT code stored in c->chrMmxextFilterCode.
 *
 * The generated code is called four times for src1/dst1 and then four times
 * for src2/dst2, using the same filter and filterPos tables for both rows.
 *
 * @param c        scaler context providing hChrFilterPos, hChrFilter and
 *                 chrMmxextFilterCode
 * @param dst1     output row for src1, one int16_t per pixel
 * @param dst2     output row for src2, one int16_t per pixel
 * @param dstWidth number of output pixels per row
 * @param src1     first input row, one byte per pixel
 * @param src2     second input row, one byte per pixel
 * @param srcW     number of input pixels per row
 * @param xInc     source step per output pixel (used as 16.16 fixed point
 *                 by the tail fix-up loop)
 */
void ff_hcscale_fast_mmxext(SwsContext *c, int16_t *dst1, int16_t *dst2,
                                 int dstWidth, const uint8_t *src1,
                                 const uint8_t *src2, int srcW, int xInc)
{
    int32_t *filterPos = c->hChrFilterPos;
    int16_t *filter    = c->hChrFilter;
    void    *mmxextFilterCode = c->chrMmxextFilterCode;
    int i;
#if defined(PIC)
    DECLARE_ALIGNED(8, uint64_t, ebxsave);  // REG_b is saved by hand under PIC
#endif
#if ARCH_X86_64
    DECLARE_ALIGNED(8, uint64_t, retsave);  // copy of the word at -8(%rsp)
#endif

    __asm__ volatile(
        // Prologue: save REG_b (PIC only) and, on x86-64, the word at
        // -8(%rsp), which every "call" below overwrites with its return
        // address.
#if defined(PIC)
        "mov          %%"REG_b", %7         \n\t"
#if ARCH_X86_64
        "mov          -8(%%rsp), %%"REG_a"  \n\t"
        "mov          %%"REG_a", %8         \n\t"
#endif
#else
#if ARCH_X86_64
        "mov          -8(%%rsp), %%"REG_a"  \n\t"
        "mov          %%"REG_a", %7         \n\t"
#endif
#endif
        "pxor             %%mm7, %%mm7      \n\t" // zero for punpcklbw
        "mov                 %0, %%"REG_c"  \n\t" // src1
        "mov                 %1, %%"REG_D"  \n\t" // dst1
        "mov                 %2, %%"REG_d"  \n\t" // filter
        "mov                 %3, %%"REG_b"  \n\t" // filterPos
        "xor          %%"REG_a", %%"REG_a"  \n\t" // i
        PREFETCH"   (%%"REG_c")             \n\t"
        PREFETCH" 32(%%"REG_c")             \n\t"
        PREFETCH" 64(%%"REG_c")             \n\t"

        // First row. CALL_MMXEXT_FILTER_CODE is the macro defined inside
        // ff_hyscale_fast_mmxext() earlier in this file; it takes the code
        // pointer from operand %4.
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        // Second row: only the source and destination pointers change.
        "xor          %%"REG_a", %%"REG_a"  \n\t" // i
        "mov                 %5, %%"REG_c"  \n\t" // src
        "mov                 %6, %%"REG_D"  \n\t" // buf2
        PREFETCH"   (%%"REG_c")             \n\t"
        PREFETCH" 32(%%"REG_c")             \n\t"
        PREFETCH" 64(%%"REG_c")             \n\t"

        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE

        // Epilogue: restore what the prologue saved.
#if defined(PIC)
        "mov %7, %%"REG_b"    \n\t"
#if ARCH_X86_64
        "mov                 %8, %%"REG_a"  \n\t"
        "mov          %%"REG_a", -8(%%rsp)  \n\t"
#endif
#else
#if ARCH_X86_64
        "mov                 %7, %%"REG_a"  \n\t"
        "mov          %%"REG_a", -8(%%rsp)  \n\t"
#endif
#endif
        :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
           "m" (mmxextFilterCode), "m" (src2), "m"(dst2)
#if defined(PIC)
          ,"m" (ebxsave)
#endif
#if ARCH_X86_64
          ,"m"(retsave)
#endif
        // "memory": the generated code stores to dst1[]/dst2[] and loads
        // src1[]/src2[], filter[] and filterPos[] through the pointers, while
        // the "m" operands above only describe the pointer variables
        // themselves. Without this clobber the compiler is free to move the
        // fix-up stores below (or earlier writes to the input buffers) across
        // the asm statement.
        : "memory", "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
         ,"%"REG_b
#endif
    );

    // Outputs whose source position is at or beyond the last input pixel were
    // interpolated with a byte past the end of the row; replace them with the
    // last pixel scaled by 128.
    for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
        dst1[i] = src1[srcW-1]*128;
        dst2[i] = src2[srcW-1]*128;
    }
}
#endif //HAVE_INLINE_ASM
Commit	Line	Data
2ba45a60 DM	1	/*
	2	* Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
	3	*
	4	* This file is part of FFmpeg.
	5	*
	6	* FFmpeg is free software; you can redistribute it and/or
	7	* modify it under the terms of the GNU Lesser General Public
	8	* License as published by the Free Software Foundation; either
	9	* version 2.1 of the License, or (at your option) any later version.
	10	*
	11	* FFmpeg is distributed in the hope that it will be useful,
	12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	14	* Lesser General Public License for more details.
	15	*
	16	* You should have received a copy of the GNU Lesser General Public
	17	* License along with FFmpeg; if not, write to the Free Software
	18	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
	19	*/
	20
	21	#include "../swscale_internal.h"
	22	#include "libavutil/x86/asm.h"
	23	#include "libavutil/x86/cpu.h"
	24
	25	#define RET 0xC3 // near return opcode for x86
	26	#define PREFETCH "prefetchnta"
	27
	28	#if HAVE_INLINE_ASM
	29	av_cold int ff_init_hscaler_mmxext(int dstW, int xInc, uint8_t *filterCode,
	30	int16_t *filter, int32_t *filterPos,
	31	int numSplits)
	32	{
	33	uint8_t *fragmentA;
	34	x86_reg imm8OfPShufW1A;
	35	x86_reg imm8OfPShufW2A;
	36	x86_reg fragmentLengthA;
	37	uint8_t *fragmentB;
	38	x86_reg imm8OfPShufW1B;
	39	x86_reg imm8OfPShufW2B;
	40	x86_reg fragmentLengthB;
	41	int fragmentPos;
	42
	43	int xpos, i;
	44
	45	// create an optimized horizontal scaling routine
	46	/* This scaler is made of runtime-generated MMXEXT code using specially tuned
	47	* pshufw instructions. For every four output pixels, if four input pixels
	48	* are enough for the fast bilinear scaling, then a chunk of fragmentB is
	49	* used. If five input pixels are needed, then a chunk of fragmentA is used.
	50	*/
	51
	52	// code fragment
	53
	54	__asm__ volatile (
	55	"jmp 9f \n\t"
	56	// Begin
	57	"0: \n\t"
	58	"movq (%%"REG_d", %%"REG_a"), %%mm3 \n\t"
	59	"movd (%%"REG_c", %%"REG_S"), %%mm0 \n\t"
	60	"movd 1(%%"REG_c", %%"REG_S"), %%mm1 \n\t"
	61	"punpcklbw %%mm7, %%mm1 \n\t"
	62	"punpcklbw %%mm7, %%mm0 \n\t"
	63	"pshufw $0xFF, %%mm1, %%mm1 \n\t"
	64	"1: \n\t"
65	"pshufw $0xFF, %%mm0, %%mm0 \n\t"
66	"2: \n\t"
67	"psubw %%mm1, %%mm0 \n\t"
68	"movl 8(%%"REG_b", %%"REG_a"), %%esi \n\t"
69	"pmullw %%mm3, %%mm0 \n\t"
70	"psllw $7, %%mm1 \n\t"
71	"paddw %%mm1, %%mm0 \n\t"
72
73	"movq %%mm0, (%%"REG_D", %%"REG_a") \n\t"
74
75	"add $8, %%"REG_a" \n\t"
76	// End
77	"9: \n\t"
78	// "int $3 \n\t"
79	"lea " LOCAL_MANGLE(0b) ", %0 \n\t"
80	"lea " LOCAL_MANGLE(1b) ", %1 \n\t"
81	"lea " LOCAL_MANGLE(2b) ", %2 \n\t"
82	"dec %1 \n\t"
83	"dec %2 \n\t"
84	"sub %0, %1 \n\t"
85	"sub %0, %2 \n\t"
86	"lea " LOCAL_MANGLE(9b) ", %3 \n\t"
87	"sub %0, %3 \n\t"
88
89
90	: "=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A),
91	"=r" (fragmentLengthA)
92	);
93
94	__asm__ volatile (
95	"jmp 9f \n\t"
96	// Begin
97	"0: \n\t"
98	"movq (%%"REG_d", %%"REG_a"), %%mm3 \n\t"
99	"movd (%%"REG_c", %%"REG_S"), %%mm0 \n\t"
100	"punpcklbw %%mm7, %%mm0 \n\t"
101	"pshufw $0xFF, %%mm0, %%mm1 \n\t"
102	"1: \n\t"
103	"pshufw $0xFF, %%mm0, %%mm0 \n\t"
104	"2: \n\t"
105	"psubw %%mm1, %%mm0 \n\t"
106	"movl 8(%%"REG_b", %%"REG_a"), %%esi \n\t"
107	"pmullw %%mm3, %%mm0 \n\t"
108	"psllw $7, %%mm1 \n\t"
109	"paddw %%mm1, %%mm0 \n\t"
110
111	"movq %%mm0, (%%"REG_D", %%"REG_a") \n\t"
112
113	"add $8, %%"REG_a" \n\t"
114	// End
115	"9: \n\t"
116	// "int $3 \n\t"
117	"lea " LOCAL_MANGLE(0b) ", %0 \n\t"
118	"lea " LOCAL_MANGLE(1b) ", %1 \n\t"
119	"lea " LOCAL_MANGLE(2b) ", %2 \n\t"
120	"dec %1 \n\t"
121	"dec %2 \n\t"
122	"sub %0, %1 \n\t"
123	"sub %0, %2 \n\t"
124	"lea " LOCAL_MANGLE(9b) ", %3 \n\t"
125	"sub %0, %3 \n\t"
126
127
128	: "=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B),
129	"=r" (fragmentLengthB)
130	);
131
132	xpos = 0; // lumXInc/2 - 0x8000; // difference between pixel centers
133	fragmentPos = 0;
134
135	for (i = 0; i < dstW / numSplits; i++) {
136	int xx = xpos >> 16;
137
138	if ((i & 3) == 0) {
139	int a = 0;
140	int b = ((xpos + xInc) >> 16) - xx;
141	int c = ((xpos + xInc * 2) >> 16) - xx;
142	int d = ((xpos + xInc * 3) >> 16) - xx;
143	int inc = (d + 1 < 4);
144	uint8_t *fragment = inc ? fragmentB : fragmentA;
145	x86_reg imm8OfPShufW1 = inc ? imm8OfPShufW1B : imm8OfPShufW1A;
146	x86_reg imm8OfPShufW2 = inc ? imm8OfPShufW2B : imm8OfPShufW2A;
147	x86_reg fragmentLength = inc ? fragmentLengthB : fragmentLengthA;
148	int maxShift = 3 - (d + inc);
149	int shift = 0;
150
151	if (filterCode) {
152	filter[i] = ((xpos & 0xFFFF) ^ 0xFFFF) >> 9;
153	filter[i + 1] = (((xpos + xInc) & 0xFFFF) ^ 0xFFFF) >> 9;
154	filter[i + 2] = (((xpos + xInc * 2) & 0xFFFF) ^ 0xFFFF) >> 9;
155	filter[i + 3] = (((xpos + xInc * 3) & 0xFFFF) ^ 0xFFFF) >> 9;
156	filterPos[i / 2] = xx;
157
158	memcpy(filterCode + fragmentPos, fragment, fragmentLength);
159
	160	filterCode[fragmentPos + imm8OfPShufW1] = (a + inc) |
	161	((b + inc) << 2) |
	162	((c + inc) << 4) |
	163	((d + inc) << 6);
	164	filterCode[fragmentPos + imm8OfPShufW2] = a | (b << 2) |
	165	(c << 4) |
166	(d << 6);
167
168	if (i + 4 - inc >= dstW)
169	shift = maxShift; // avoid overread
170	else if ((filterPos[i / 2] & 3) <= maxShift)
171	shift = filterPos[i / 2] & 3; // align
172
173	if (shift && i >= shift) {
174	filterCode[fragmentPos + imm8OfPShufW1] += 0x55 * shift;
175	filterCode[fragmentPos + imm8OfPShufW2] += 0x55 * shift;
176	filterPos[i / 2] -= shift;
177	}
178	}
179
180	fragmentPos += fragmentLength;
181
182	if (filterCode)
183	filterCode[fragmentPos] = RET;
184	}
185	xpos += xInc;
186	}
187	if (filterCode)
188	filterPos[((i / 2) + 1) & (~1)] = xpos >> 16; // needed to jump to the next part
189
190	return fragmentPos + 1;
191	}
192
	193	void ff_hyscale_fast_mmxext(SwsContext *c, int16_t *dst,
194	int dstWidth, const uint8_t *src,
195	int srcW, int xInc)
196	{
197	int32_t *filterPos = c->hLumFilterPos;
198	int16_t *filter = c->hLumFilter;
199	void *mmxextFilterCode = c->lumMmxextFilterCode;
200	int i;
201	#if defined(PIC)
202	uint64_t ebxsave;
203	#endif
204	#if ARCH_X86_64
205	uint64_t retsave;
206	#endif
207
208	__asm__ volatile(
209	#if defined(PIC)
210	"mov %%"REG_b", %5 \n\t"
211	#if ARCH_X86_64
212	"mov -8(%%rsp), %%"REG_a" \n\t"
213	"mov %%"REG_a", %6 \n\t"
214	#endif
215	#else
216	#if ARCH_X86_64
217	"mov -8(%%rsp), %%"REG_a" \n\t"
218	"mov %%"REG_a", %5 \n\t"
219	#endif
220	#endif
221	"pxor %%mm7, %%mm7 \n\t"
222	"mov %0, %%"REG_c" \n\t"
223	"mov %1, %%"REG_D" \n\t"
224	"mov %2, %%"REG_d" \n\t"
225	"mov %3, %%"REG_b" \n\t"
226	"xor %%"REG_a", %%"REG_a" \n\t" // i
227	PREFETCH" (%%"REG_c") \n\t"
228	PREFETCH" 32(%%"REG_c") \n\t"
229	PREFETCH" 64(%%"REG_c") \n\t"
230
231	#if ARCH_X86_64
232	#define CALL_MMXEXT_FILTER_CODE \
233	"movl (%%"REG_b"), %%esi \n\t"\
234	"call *%4 \n\t"\
235	"movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
236	"add %%"REG_S", %%"REG_c" \n\t"\
237	"add %%"REG_a", %%"REG_D" \n\t"\
238	"xor %%"REG_a", %%"REG_a" \n\t"\
239
240	#else
241	#define CALL_MMXEXT_FILTER_CODE \
242	"movl (%%"REG_b"), %%esi \n\t"\
243	"call *%4 \n\t"\
244	"addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
245	"add %%"REG_a", %%"REG_D" \n\t"\
246	"xor %%"REG_a", %%"REG_a" \n\t"\
247
248	#endif /* ARCH_X86_64 */
249
250	CALL_MMXEXT_FILTER_CODE
251	CALL_MMXEXT_FILTER_CODE
252	CALL_MMXEXT_FILTER_CODE
253	CALL_MMXEXT_FILTER_CODE
254	CALL_MMXEXT_FILTER_CODE
255	CALL_MMXEXT_FILTER_CODE
256	CALL_MMXEXT_FILTER_CODE
257	CALL_MMXEXT_FILTER_CODE
258
259	#if defined(PIC)
260	"mov %5, %%"REG_b" \n\t"
261	#if ARCH_X86_64
262	"mov %6, %%"REG_a" \n\t"
263	"mov %%"REG_a", -8(%%rsp) \n\t"
264	#endif
265	#else
266	#if ARCH_X86_64
267	"mov %5, %%"REG_a" \n\t"
268	"mov %%"REG_a", -8(%%rsp) \n\t"
269	#endif
270	#endif
271	:: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
272	"m" (mmxextFilterCode)
273	#if defined(PIC)
274	,"m" (ebxsave)
275	#endif
276	#if ARCH_X86_64
277	,"m"(retsave)
278	#endif
279	: "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
280	#if !defined(PIC)
281	,"%"REG_b
282	#endif
283	);
284
285	for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
286	dst[i] = src[srcW-1]*128;
287	}
288
	289	void ff_hcscale_fast_mmxext(SwsContext *c, int16_t *dst1, int16_t *dst2,
290	int dstWidth, const uint8_t *src1,
291	const uint8_t *src2, int srcW, int xInc)
292	{
293	int32_t *filterPos = c->hChrFilterPos;
294	int16_t *filter = c->hChrFilter;
295	void *mmxextFilterCode = c->chrMmxextFilterCode;
296	int i;
297	#if defined(PIC)
298	DECLARE_ALIGNED(8, uint64_t, ebxsave);
299	#endif
300	#if ARCH_X86_64
301	DECLARE_ALIGNED(8, uint64_t, retsave);
302	#endif
303
304	__asm__ volatile(
305	#if defined(PIC)
306	"mov %%"REG_b", %7 \n\t"
307	#if ARCH_X86_64
308	"mov -8(%%rsp), %%"REG_a" \n\t"
309	"mov %%"REG_a", %8 \n\t"
310	#endif
311	#else
312	#if ARCH_X86_64
313	"mov -8(%%rsp), %%"REG_a" \n\t"
314	"mov %%"REG_a", %7 \n\t"
315	#endif
316	#endif
317	"pxor %%mm7, %%mm7 \n\t"
318	"mov %0, %%"REG_c" \n\t"
319	"mov %1, %%"REG_D" \n\t"
320	"mov %2, %%"REG_d" \n\t"
321	"mov %3, %%"REG_b" \n\t"
322	"xor %%"REG_a", %%"REG_a" \n\t" // i
323	PREFETCH" (%%"REG_c") \n\t"
324	PREFETCH" 32(%%"REG_c") \n\t"
325	PREFETCH" 64(%%"REG_c") \n\t"
326
327	CALL_MMXEXT_FILTER_CODE
328	CALL_MMXEXT_FILTER_CODE
329	CALL_MMXEXT_FILTER_CODE
330	CALL_MMXEXT_FILTER_CODE
331	"xor %%"REG_a", %%"REG_a" \n\t" // i
332	"mov %5, %%"REG_c" \n\t" // src
333	"mov %6, %%"REG_D" \n\t" // buf2
334	PREFETCH" (%%"REG_c") \n\t"
335	PREFETCH" 32(%%"REG_c") \n\t"
336	PREFETCH" 64(%%"REG_c") \n\t"
337
338	CALL_MMXEXT_FILTER_CODE
339	CALL_MMXEXT_FILTER_CODE
340	CALL_MMXEXT_FILTER_CODE
341	CALL_MMXEXT_FILTER_CODE
342
343	#if defined(PIC)
344	"mov %7, %%"REG_b" \n\t"
345	#if ARCH_X86_64
346	"mov %8, %%"REG_a" \n\t"
347	"mov %%"REG_a", -8(%%rsp) \n\t"
348	#endif
349	#else
350	#if ARCH_X86_64
351	"mov %7, %%"REG_a" \n\t"
352	"mov %%"REG_a", -8(%%rsp) \n\t"
353	#endif
354	#endif
355	:: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
356	"m" (mmxextFilterCode), "m" (src2), "m"(dst2)
357	#if defined(PIC)
358	,"m" (ebxsave)
359	#endif
360	#if ARCH_X86_64
361	,"m"(retsave)
362	#endif
363	: "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
364	#if !defined(PIC)
365	,"%"REG_b
366	#endif
367	);
368
369	for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
370	dst1[i] = src1[srcW-1]*128;
371	dst2[i] = src2[srcW-1]*128;
372	}
373	}
374	#endif //HAVE_INLINE_ASM