[deb_ffmpeg.git] / ffmpeg / libavcodec / x86 / mpegvideoenc_qns_template.c

/*
 * QNS functions are compiled 3 times for MMX/3DNOW/SSSE3
 * Copyright (c) 2004 Michael Niedermayer
 *
 * MMX optimization by Michael Niedermayer <michaelni@gmx.at>
 * 3DNow! and SSSE3 optimization by Zuxy Meng <zuxy.meng@gmail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "libavutil/avassert.h"
#include "libavutil/common.h"
#include "libavutil/x86/asm.h"

#include "inline_asm.h"

/*
 * Exclusive upper bound on |scale| for the SIMD code paths below: 512,
 * shifted right by SCALE_OFFSET when SCALE_OFFSET is positive.
 * NOTE(review): presumably chosen so that the pre-shifted scale still fits
 * in a signed 16-bit lane; confirm against BASIS_SHIFT / RECON_SHIFT in the
 * including file.
 */
#define MAX_ABS (512 >> (SCALE_OFFSET>0 ? SCALE_OFFSET : 0))

/**
 * Score the effect of adding a scaled basis function to a remainder block,
 * without modifying any of the input arrays.
 *
 * For every group of eight coefficients the loop:
 *   1. applies PMULHRW to the basis words with the broadcast scale (mm5) and
 *      the rounding constant prepared by SET_RND (mm6),
 *   2. adds the corresponding rem words and shifts the sums right by 6
 *      (arithmetic shift),
 *   3. multiplies by the weight words (low 16 bits kept),
 *   4. squares and pairwise-adds the words (pmaddwd with itself),
 *   5. shifts each 32-bit partial sum right by 4 and accumulates it in mm7.
 * After the loop the accumulator is reduced with PHADDD, shifted right by 2
 * and returned.
 *
 * SET_RND, PMULHRW and PHADDD are supplied by the file that includes this
 * template; their exact semantics are not visible here.
 *
 * @param rem    64 remainder coefficients (read only)
 * @param weight 64 weights (read only)
 * @param basis  64 basis-function coefficients (read only)
 * @param scale  scale factor; must satisfy |scale| < MAX_ABS
 * @return the accumulated score, truncated to int
 */
static int DEF(try_8x8basis)(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale)
{
    x86_reg i=0;

    av_assert2(FFABS(scale) < MAX_ABS);
    /* scale may be negative: multiply by a power of two instead of using
     * <<, because left-shifting a negative int is undefined behaviour. */
    scale *= 1 << (16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT);

    SET_RND(mm6);
    __asm__ volatile(
        "pxor %%mm7, %%mm7              \n\t" /* clear the accumulator */
        "movd  %4, %%mm5                \n\t"
        "punpcklwd %%mm5, %%mm5         \n\t" /* broadcast the low word of  */
        "punpcklwd %%mm5, %%mm5         \n\t" /* scale to all four lanes    */
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq  (%1, %0), %%mm0          \n\t"
        "movq  8(%1, %0), %%mm1         \n\t"
        PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
        "paddw (%2, %0), %%mm0          \n\t"
        "paddw 8(%2, %0), %%mm1         \n\t"
        "psraw $6, %%mm0                \n\t"
        "psraw $6, %%mm1                \n\t"
        "pmullw (%3, %0), %%mm0         \n\t"
        "pmullw 8(%3, %0), %%mm1        \n\t"
        "pmaddwd %%mm0, %%mm0           \n\t"
        "pmaddwd %%mm1, %%mm1           \n\t"
        "paddd %%mm1, %%mm0             \n\t"
        "psrld $4, %%mm0                \n\t"
        "paddd %%mm0, %%mm7             \n\t"
        "add $16, %0                    \n\t" /* i is a byte offset: 8 int16 per pass */
        "cmp $128, %0                   \n\t" //FIXME optimize & bench
        " jb 1b                         \n\t"
        PHADDD(%%mm7, %%mm6)
        "psrld $2, %%mm7                \n\t"
        "movd %%mm7, %0                 \n\t"

        : "+r" (i)
        : "r"(basis), "r"(rem), "r"(weight), "g"(scale)
        /* The asm dereferences basis/rem/weight, which are only passed as
         * register operands, so the compiler must be told memory is read. */
        : "memory"
    );
    return i;
}

/**
 * Add a scaled basis function to a remainder block in place.
 *
 * When |scale| < MAX_ABS the SIMD loop is used: for every group of eight
 * coefficients it applies PMULHRW to the basis words with the broadcast
 * scale (mm5) and the rounding constant prepared by SET_RND (mm6), adds the
 * result to the rem words and stores them back. Otherwise a scalar loop
 * computes the rounded product with full int precision.
 *
 * SET_RND and PMULHRW are supplied by the file that includes this template;
 * their exact semantics are not visible here.
 *
 * @param rem   64 remainder coefficients, updated in place
 * @param basis 64 basis-function coefficients (read only)
 * @param scale scale factor applied to basis before the addition
 */
static void DEF(add_8x8basis)(int16_t rem[64], int16_t basis[64], int scale)
{
    x86_reg i=0;

    if(FFABS(scale) < MAX_ABS){
        /* scale may be negative: multiply by a power of two instead of
         * using <<, because left-shifting a negative int is undefined. */
        scale *= 1 << (16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT);
        SET_RND(mm6);
        __asm__ volatile(
                "movd  %3, %%mm5        \n\t"
                "punpcklwd %%mm5, %%mm5 \n\t" /* broadcast the low word of */
                "punpcklwd %%mm5, %%mm5 \n\t" /* scale to all four lanes   */
                ".p2align 4             \n\t"
                "1:                     \n\t"
                "movq  (%1, %0), %%mm0  \n\t"
                "movq  8(%1, %0), %%mm1 \n\t"
                PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
                "paddw (%2, %0), %%mm0  \n\t"
                "paddw 8(%2, %0), %%mm1 \n\t"
                "movq %%mm0, (%2, %0)   \n\t"
                "movq %%mm1, 8(%2, %0)  \n\t"
                "add $16, %0            \n\t" /* i is a byte offset: 8 int16 per pass */
                "cmp $128, %0           \n\t" // FIXME optimize & bench
                " jb 1b                 \n\t"

                : "+r" (i)
                : "r"(basis), "r"(rem), "g"(scale)
                /* rem[] is written (and basis[] read) through pointers that
                 * are only register operands; without this clobber the
                 * compiler may assume rem[] is unchanged by the asm. */
                : "memory"
        );
    }else{
        /* Scale too large for the 16-bit SIMD path: scalar fallback using
         * the unshifted scale and explicit rounding. */
        for(i=0; i<8*8; i++){
            rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
        }
    }
}
Commit	Line	Data
	1	/*
	2	* QNS functions are compiled 3 times for MMX/3DNOW/SSSE3
	3	* Copyright (c) 2004 Michael Niedermayer
	4	*
	5	* MMX optimization by Michael Niedermayer <michaelni@gmx.at>
	6	* 3DNow! and SSSE3 optimization by Zuxy Meng <zuxy.meng@gmail.com>
	7	*
	8	* This file is part of FFmpeg.
	9	*
	10	* FFmpeg is free software; you can redistribute it and/or
	11	* modify it under the terms of the GNU Lesser General Public
	12	* License as published by the Free Software Foundation; either
	13	* version 2.1 of the License, or (at your option) any later version.
	14	*
	15	* FFmpeg is distributed in the hope that it will be useful,
	16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	18	* Lesser General Public License for more details.
	19	*
	20	* You should have received a copy of the GNU Lesser General Public
	21	* License along with FFmpeg; if not, write to the Free Software
	22	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
	23	*/
	24
	25	#include <stdint.h>
	26
	27	#include "libavutil/avassert.h"
	28	#include "libavutil/common.h"
	29	#include "libavutil/x86/asm.h"
	30
	31	#include "inline_asm.h"
	32
	33	#define MAX_ABS (512 >> (SCALE_OFFSET>0 ? SCALE_OFFSET : 0))
	34
	35	static int DEF(try_8x8basis)(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale)
	36	{
	37	x86_reg i=0;
	38
	39	av_assert2(FFABS(scale) < MAX_ABS);
	40	scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
	41
	42	SET_RND(mm6);
	43	__asm__ volatile(
	44	"pxor %%mm7, %%mm7 \n\t"
	45	"movd %4, %%mm5 \n\t"
	46	"punpcklwd %%mm5, %%mm5 \n\t"
	47	"punpcklwd %%mm5, %%mm5 \n\t"
	48	".p2align 4 \n\t"
	49	"1: \n\t"
	50	"movq (%1, %0), %%mm0 \n\t"
	51	"movq 8(%1, %0), %%mm1 \n\t"
	52	PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
	53	"paddw (%2, %0), %%mm0 \n\t"
	54	"paddw 8(%2, %0), %%mm1 \n\t"
	55	"psraw $6, %%mm0 \n\t"
	56	"psraw $6, %%mm1 \n\t"
	57	"pmullw (%3, %0), %%mm0 \n\t"
	58	"pmullw 8(%3, %0), %%mm1 \n\t"
	59	"pmaddwd %%mm0, %%mm0 \n\t"
	60	"pmaddwd %%mm1, %%mm1 \n\t"
	61	"paddd %%mm1, %%mm0 \n\t"
	62	"psrld $4, %%mm0 \n\t"
	63	"paddd %%mm0, %%mm7 \n\t"
	64	"add $16, %0 \n\t"
	65	"cmp $128, %0 \n\t" //FIXME optimize & bench
	66	" jb 1b \n\t"
	67	PHADDD(%%mm7, %%mm6)
	68	"psrld $2, %%mm7 \n\t"
	69	"movd %%mm7, %0 \n\t"
	70
	71	: "+r" (i)
	72	: "r"(basis), "r"(rem), "r"(weight), "g"(scale)
	73	);
	74	return i;
	75	}
	76
	77	static void DEF(add_8x8basis)(int16_t rem[64], int16_t basis[64], int scale)
	78	{
	79	x86_reg i=0;
	80
	81	if(FFABS(scale) < MAX_ABS){
	82	scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
	83	SET_RND(mm6);
	84	__asm__ volatile(
	85	"movd %3, %%mm5 \n\t"
	86	"punpcklwd %%mm5, %%mm5 \n\t"
	87	"punpcklwd %%mm5, %%mm5 \n\t"
	88	".p2align 4 \n\t"
	89	"1: \n\t"
	90	"movq (%1, %0), %%mm0 \n\t"
	91	"movq 8(%1, %0), %%mm1 \n\t"
	92	PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
	93	"paddw (%2, %0), %%mm0 \n\t"
	94	"paddw 8(%2, %0), %%mm1 \n\t"
	95	"movq %%mm0, (%2, %0) \n\t"
	96	"movq %%mm1, 8(%2, %0) \n\t"
	97	"add $16, %0 \n\t"
	98	"cmp $128, %0 \n\t" // FIXME optimize & bench
	99	" jb 1b \n\t"
	100
	101	: "+r" (i)
	102	: "r"(basis), "r"(rem), "g"(scale)
	103	);
	104	}else{
	105	for(i=0; i<8*8; i++){
	106	rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
	107	}
	108	}
	109	}