[deb_ffmpeg.git] / ffmpeg / libavcodec / alpha / hpeldsp_alpha.c

/*
 * Alpha optimized DSP utils
 * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavcodec/hpeldsp.h"
#include "hpeldsp_alpha.h"
#include "asm.h"

/*
 * Lane-wise average of two packed quadwords, rounding down.
 *
 * Uses the identity a + b == 2 * (a & b) + (a ^ b): the shared bits are kept
 * as they are and half of the differing bits is added on top.  The mask is
 * applied before the shift so that bit 0 of a lane cannot slide into the
 * lane below it.
 * NOTE(review): this assumes BYTE_VEC(0xfe) is 0xfe repeated in every byte
 * of the quadword; BYTE_VEC comes from asm.h and is not visible here.
 */
static inline uint64_t avg2_no_rnd(uint64_t a, uint64_t b)
{
    const uint64_t shared    = a & b;
    const uint64_t half_diff = ((a ^ b) & BYTE_VEC(0xfe)) >> 1;

    return shared + half_diff;
}

/*
 * Lane-wise average of two packed quadwords, rounding up.
 *
 * Uses the identity a + b == 2 * (a | b) - (a ^ b): start from the union of
 * the bits and take away half of the differing bits (rounded down), which
 * leaves the average rounded up.  The mask is applied before the shift so
 * that bit 0 of a lane cannot slide into the lane below it.
 * NOTE(review): this assumes BYTE_VEC(0xfe) is 0xfe repeated in every byte
 * of the quadword; BYTE_VEC comes from asm.h and is not visible here.
 */
static inline uint64_t avg2(uint64_t a, uint64_t b)
{
    const uint64_t either    = a | b;
    const uint64_t half_diff = ((a ^ b) & BYTE_VEC(0xfe)) >> 1;

    return either - half_diff;
}

#if 0
/* Reference form of the four-way average (compiled out).  The XY2 routines
   follow the same scheme but carry the per-row partial sums from one
   iteration to the next instead of recomputing them.

   Each quadword is split into the upper six bits of every byte, shifted down
   by two, and the lower two bits of every byte; the two parts are summed
   separately and recombined.  */
static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
{
    const uint64_t low2  = BYTE_VEC(0x03);
    const uint64_t high6 = ~BYTE_VEC(0x03);
    uint64_t high_sum;
    uint64_t low_sum;

    high_sum  = (l1 & high6) >> 2;
    high_sum += (l2 & high6) >> 2;
    high_sum += (l3 & high6) >> 2;
    high_sum += (l4 & high6) >> 2;

    low_sum  = l1 & low2;
    low_sum += l2 & low2;
    low_sum += l3 & low2;
    low_sum += l4 & low2;
    low_sum += BYTE_VEC(0x02);

    return high_sum + ((low_sum >> 2) & low2);
}
#endif

/*
 * Row loop without interpolation: for each of h rows, fetch one quadword
 * from `pixels` with LOAD and hand it to STORE together with `block`, then
 * step both pointers by line_size.
 *
 * Expands to a statement without a trailing semicolon.  It uses and modifies
 * `pixels`, `block` and `h` from the expanding scope and reads `line_size`.
 * The body runs before h is tested, so h is expected to be at least 1.
 */
#define OP(LOAD, STORE)                         \
    do {                                        \
        const uint64_t row = LOAD(pixels);      \
                                                \
        STORE(row, block);                      \
        block  += line_size;                    \
        pixels += line_size;                    \
    } while (--h != 0)

/*
 * Row loop for the horizontal half-pel case: for each of h rows, AVG2 the
 * loaded quadword with a copy of itself shifted right by 8 bits whose top
 * byte is filled from pixels[8], and pass the result to STORE.
 *
 * NOTE(review): the shift pairs each byte with its right-hand neighbour only
 * if LOAD yields the bytes in little-endian lane order - confirm in asm.h.
 *
 * Expands to a statement without a trailing semicolon.  It uses and modifies
 * `pixels`, `block` and `h` from the expanding scope, reads `line_size`, and
 * needs AVG2 to be defined at the point of expansion.  Nine source bytes are
 * read per row.  h is expected to be at least 1.
 */
#define OP_X2(LOAD, STORE)                                      \
    do {                                                        \
        const uint64_t cur     = LOAD(pixels);                  \
        const uint64_t ninth   = (uint64_t) pixels[8] << 56;    \
        const uint64_t shifted = (cur >> 8) | ninth;            \
                                                                \
        STORE(AVG2(cur, shifted), block);                       \
        block  += line_size;                                    \
        pixels += line_size;                                    \
    } while (--h != 0)

/*
 * Row loop for the vertical half-pel case: every output row is AVG2 of a
 * source row and the row line_size bytes further on.  The lower row of one
 * iteration is kept and reused as the upper row of the next, so each source
 * row is loaded once; h + 1 source rows are read in total.
 *
 * Expands to a statement without a trailing semicolon.  It uses and modifies
 * `pixels`, `block` and `h` from the expanding scope, reads `line_size`, and
 * needs AVG2 to be defined at the point of expansion.  h is expected to be
 * at least 1.
 */
#define OP_Y2(LOAD, STORE)                              \
    do {                                                \
        uint64_t above = LOAD(pixels);                  \
                                                        \
        do {                                            \
            uint64_t below;                             \
                                                        \
            pixels += line_size;                        \
            below = LOAD(pixels);                       \
            STORE(AVG2(above, below), block);           \
            above  = below;                             \
            block += line_size;                         \
        } while (--h != 0);                             \
    } while (0)

/*
 * Row loop for the combined horizontal + vertical half-pel case.
 *
 * For every row the loaded quadword (pix1/npix1) is paired with a copy of
 * itself shifted right by 8 bits whose top byte is filled from pixels[8]
 * (pix2/npix2).  The pair is reduced to two partial sums:
 *   *_l - sum of the bits selected by BYTE_VEC(0x03) in both quadwords;
 *   *_h - sum of the remaining bits of both quadwords, each shifted right
 *         by two first.
 * An output quadword is built from the partial sums of two consecutive
 * rows: the two *_l sums plus AVG4_ROUNDER are shifted right by two and
 * masked with BYTE_VEC(0x03), then both *_h sums are added.  The partial
 * sums of the lower row are carried over as the upper row of the next
 * iteration, so each source row is loaded and split once; h + 1 source rows
 * (nine bytes each) are read in total.
 *
 * NOTE(review): this reads as a carry-free per-byte (a + b + c + d +
 * rounder) >> 2 provided BYTE_VEC(x) repeats x in every byte and LOAD yields
 * the bytes in little-endian lane order - both come from asm.h; confirm.
 *
 * Expands to a statement without a trailing semicolon.  It uses and modifies
 * `pixels`, `block` and `h` from the expanding scope, reads `line_size`, and
 * needs AVG4_ROUNDER to be defined at the point of expansion.  The inner
 * body runs before h is tested, so h is expected to be at least 1.
 */
#define OP_XY2(LOAD, STORE)                                                 \
    do {                                                                    \
        uint64_t pix1 = LOAD(pixels);                                       \
        uint64_t pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56);           \
        uint64_t pix_l = (pix1 & BYTE_VEC(0x03))                            \
                       + (pix2 & BYTE_VEC(0x03));                           \
        uint64_t pix_h = ((pix1 & ~BYTE_VEC(0x03)) >> 2)                    \
                       + ((pix2 & ~BYTE_VEC(0x03)) >> 2);                   \
                                                                            \
        do {                                                                \
            uint64_t npix1, npix2;                                          \
            uint64_t npix_l, npix_h;                                        \
            uint64_t avg;                                                   \
                                                                            \
            pixels += line_size;                                            \
            npix1 = LOAD(pixels);                                           \
            npix2 = npix1 >> 8 | ((uint64_t) pixels[8] << 56);              \
            npix_l = (npix1 & BYTE_VEC(0x03))                               \
                   + (npix2 & BYTE_VEC(0x03));                              \
            npix_h = ((npix1 & ~BYTE_VEC(0x03)) >> 2)                       \
                   + ((npix2 & ~BYTE_VEC(0x03)) >> 2);                      \
            avg = (((pix_l + npix_l + AVG4_ROUNDER) >> 2) & BYTE_VEC(0x03)) \
                + pix_h + npix_h;                                           \
            STORE(avg, block);                                              \
                                                                            \
            block += line_size;                                             \
            pix_l = npix_l;                                                 \
            pix_h = npix_h;                                                 \
        } while (--h);                                                      \
    } while (0)

/*
 * Generates two static functions for one operation:
 *
 *   OPNAME_pixelsSUFF_axp    runs the OPKIND row loop on one quadword per
 *                            row, using ldq when the low three bits of the
 *                            `pixels` address are zero and uldq otherwise;
 *   OPNAME_pixels16SUFF_axp  covers a 16-byte-wide row by running the
 *                            function above on byte offsets 0 and 8.
 *
 * OPKIND is one of the OP* row-loop macros and STORE is the store macro it
 * is handed.  Only the source address is inspected; `block` is passed to
 * STORE unchanged.
 */
#define MAKE_OP(OPNAME, SUFF, OPKIND, STORE)                                \
static void OPNAME ## _pixels ## SUFF ## _axp                               \
        (uint8_t *restrict block, const uint8_t *restrict pixels,           \
         ptrdiff_t line_size, int h)                                        \
{                                                                           \
    const int src_is_aligned = ((size_t) pixels & 0x7) == 0;                \
                                                                            \
    if (src_is_aligned) {                                                   \
        OPKIND(ldq, STORE);                                                 \
    } else {                                                                \
        OPKIND(uldq, STORE);                                                \
    }                                                                       \
}                                                                           \
                                                                            \
static void OPNAME ## _pixels16 ## SUFF ## _axp                             \
        (uint8_t *restrict block, const uint8_t *restrict pixels,           \
         ptrdiff_t line_size, int h)                                        \
{                                                                           \
    int half;                                                               \
                                                                            \
    for (half = 0; half < 16; half += 8) {                                  \
        OPNAME ## _pixels ## SUFF ## _axp(block + half, pixels + half,      \
                                          line_size, h);                    \
    }                                                                       \
}

/*
 * Instantiates MAKE_OP once per interpolation kind for one operation:
 * no suffix (OP), _x2 (OP_X2), _y2 (OP_Y2) and _xy2 (OP_XY2).
 * PREFIX becomes the leading part of the generated function names and
 * ST is the store macro forwarded to every row loop.
 */
#define PIXOP(PREFIX, ST)                       \
    MAKE_OP(PREFIX, ,     OP,     ST)           \
    MAKE_OP(PREFIX, _x2,  OP_X2,  ST)           \
    MAKE_OP(PREFIX, _y2,  OP_Y2,  ST)           \
    MAKE_OP(PREFIX, _xy2, OP_XY2, ST)

/* Rounding primitives.  */
#define AVG2 avg2
/* avg4() is compiled out (#if 0) in this file and none of the OP* row loops
   reference AVG4; the xy2 loop takes its rounding from AVG4_ROUNDER.  */
#define AVG4 avg4
#define AVG4_ROUNDER BYTE_VEC(0x02)
/* put_*: write the computed quadword to the destination as is.  */
#define STORE(l, b) stq(l, b)
PIXOP(put, STORE);

#undef STORE
/* avg_*: AVG2 the computed quadword with the quadword already at the
   destination, then write it back.  The definition deliberately carries no
   trailing semicolon: every use site in the OP* macros supplies its own, so
   the macro expands to a single statement just like the put variant.  */
#define STORE(l, b) stq(AVG2(l, ldq(b)), b)
PIXOP(avg, STORE);

/* Not rounding primitives.  */
#undef AVG2
#undef AVG4
#undef AVG4_ROUNDER
#undef STORE
#define AVG2 avg2_no_rnd
/* No avg4_no_rnd() is defined in this file and none of the OP* row loops
   reference AVG4; the xy2 loop takes its rounding from AVG4_ROUNDER.  */
#define AVG4 avg4_no_rnd
#define AVG4_ROUNDER BYTE_VEC(0x01)
/* put_no_rnd_*: write the computed quadword to the destination as is.  */
#define STORE(l, b) stq(l, b)
PIXOP(put_no_rnd, STORE);

#undef STORE
/* avg_no_rnd_*: AVG2 (here avg2_no_rnd) the computed quadword with the
   quadword already at the destination, then write it back.  The definition
   deliberately carries no trailing semicolon: every use site in the OP*
   macros supplies its own, so the macro expands to a single statement just
   like the put variant.  */
#define STORE(l, b) stq(AVG2(l, ldq(b)), b)
PIXOP(avg_no_rnd, STORE);

/*
 * 16-byte-wide wrapper around put_pixels_axp_asm(): runs it once on byte
 * offset 0 and once on byte offset 8 of both buffers, with the same
 * line_size and h.
 */
static void put_pixels16_axp_asm(uint8_t *block, const uint8_t *pixels,
                                 ptrdiff_t line_size, int h)
{
    int half;

    for (half = 0; half < 16; half += 8) {
        put_pixels_axp_asm(block + half, pixels + half, line_size, h);
    }
}

/*
 * Fills the function tables of *c with the Alpha routines of this file.
 *
 * In the two-dimensional tables, first index 0 receives the *_pixels16_*
 * functions and first index 1 the plain *_pixels_* functions; the second
 * index selects the variant: 0 = no suffix, 1 = _x2, 2 = _y2, 3 = _xy2.
 * avg_no_rnd_pixels_tab is indexed by variant only and receives the
 * *_pixels16_* functions.  Slot 0 of both put tables uses the *_axp_asm
 * copy routines.  `flags` is not used.
 */
av_cold void ff_hpeldsp_init_alpha(HpelDSPContext *c, int flags)
{
    /* put, rounding */
    c->put_pixels_tab[0][0] = put_pixels16_axp_asm;
    c->put_pixels_tab[0][1] = put_pixels16_x2_axp;
    c->put_pixels_tab[0][2] = put_pixels16_y2_axp;
    c->put_pixels_tab[0][3] = put_pixels16_xy2_axp;
    c->put_pixels_tab[1][0] = put_pixels_axp_asm;
    c->put_pixels_tab[1][1] = put_pixels_x2_axp;
    c->put_pixels_tab[1][2] = put_pixels_y2_axp;
    c->put_pixels_tab[1][3] = put_pixels_xy2_axp;

    /* put, no rounding */
    c->put_no_rnd_pixels_tab[0][0] = put_pixels16_axp_asm;
    c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_axp;
    c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_axp;
    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_axp;
    c->put_no_rnd_pixels_tab[1][0] = put_pixels_axp_asm;
    c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels_x2_axp;
    c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels_y2_axp;
    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels_xy2_axp;

    /* avg, rounding */
    c->avg_pixels_tab[0][0] = avg_pixels16_axp;
    c->avg_pixels_tab[0][1] = avg_pixels16_x2_axp;
    c->avg_pixels_tab[0][2] = avg_pixels16_y2_axp;
    c->avg_pixels_tab[0][3] = avg_pixels16_xy2_axp;
    c->avg_pixels_tab[1][0] = avg_pixels_axp;
    c->avg_pixels_tab[1][1] = avg_pixels_x2_axp;
    c->avg_pixels_tab[1][2] = avg_pixels_y2_axp;
    c->avg_pixels_tab[1][3] = avg_pixels_xy2_axp;

    /* avg, no rounding */
    c->avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels16_axp;
    c->avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels16_x2_axp;
    c->avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels16_y2_axp;
    c->avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels16_xy2_axp;
}
Commit	Line	Data
	1	/*
	2	* Alpha optimized DSP utils
	3	* Copyright (c) 2002 Falk Hueffner <falk@debian.org>
	4	*
	5	* This file is part of FFmpeg.
	6	*
	7	* FFmpeg is free software; you can redistribute it and/or
	8	* modify it under the terms of the GNU Lesser General Public
	9	* License as published by the Free Software Foundation; either
	10	* version 2.1 of the License, or (at your option) any later version.
	11	*
	12	* FFmpeg is distributed in the hope that it will be useful,
	13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	15	* Lesser General Public License for more details.
	16	*
	17	* You should have received a copy of the GNU Lesser General Public
	18	* License along with FFmpeg; if not, write to the Free Software
	19	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
	20	*/
	21
	22	#include "libavutil/attributes.h"
	23	#include "libavcodec/hpeldsp.h"
	24	#include "hpeldsp_alpha.h"
	25	#include "asm.h"
	26
	27	static inline uint64_t avg2_no_rnd(uint64_t a, uint64_t b)
	28	{
	29	return (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
	30	}
	31
	32	static inline uint64_t avg2(uint64_t a, uint64_t b)
	33	{
	34	return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
	35	}
	36
	37	#if 0
	38	/* The XY2 routines basically utilize this scheme, but reuse parts in
	39	each iteration. */
	40	static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
	41	{
	42	uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
	43	+ ((l2 & ~BYTE_VEC(0x03)) >> 2)
	44	+ ((l3 & ~BYTE_VEC(0x03)) >> 2)
	45	+ ((l4 & ~BYTE_VEC(0x03)) >> 2);
	46	uint64_t r2 = (( (l1 & BYTE_VEC(0x03))
	47	+ (l2 & BYTE_VEC(0x03))
	48	+ (l3 & BYTE_VEC(0x03))
	49	+ (l4 & BYTE_VEC(0x03))
	50	+ BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
	51	return r1 + r2;
	52	}
	53	#endif
	54
	55	#define OP(LOAD, STORE) \
	56	do { \
	57	STORE(LOAD(pixels), block); \
	58	pixels += line_size; \
	59	block += line_size; \
	60	} while (--h)
	61
	62	#define OP_X2(LOAD, STORE) \
	63	do { \
	64	uint64_t pix1, pix2; \
	65	\
	66	pix1 = LOAD(pixels); \
	67	pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56); \
	68	STORE(AVG2(pix1, pix2), block); \
	69	pixels += line_size; \
	70	block += line_size; \
	71	} while (--h)
	72
	73	#define OP_Y2(LOAD, STORE) \
	74	do { \
	75	uint64_t pix = LOAD(pixels); \
	76	do { \
	77	uint64_t next_pix; \
	78	\
	79	pixels += line_size; \
	80	next_pix = LOAD(pixels); \
	81	STORE(AVG2(pix, next_pix), block); \
	82	block += line_size; \
	83	pix = next_pix; \
	84	} while (--h); \
	85	} while (0)
	86
	87	#define OP_XY2(LOAD, STORE) \
	88	do { \
	89	uint64_t pix1 = LOAD(pixels); \
	90	uint64_t pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56); \
	91	uint64_t pix_l = (pix1 & BYTE_VEC(0x03)) \
	92	+ (pix2 & BYTE_VEC(0x03)); \
	93	uint64_t pix_h = ((pix1 & ~BYTE_VEC(0x03)) >> 2) \
	94	+ ((pix2 & ~BYTE_VEC(0x03)) >> 2); \
	95	\
	96	do { \
	97	uint64_t npix1, npix2; \
	98	uint64_t npix_l, npix_h; \
	99	uint64_t avg; \
	100	\
	101	pixels += line_size; \
	102	npix1 = LOAD(pixels); \
	103	npix2 = npix1 >> 8 | ((uint64_t) pixels[8] << 56); \
	104	npix_l = (npix1 & BYTE_VEC(0x03)) \
	105	+ (npix2 & BYTE_VEC(0x03)); \
	106	npix_h = ((npix1 & ~BYTE_VEC(0x03)) >> 2) \
	107	+ ((npix2 & ~BYTE_VEC(0x03)) >> 2); \
	108	avg = (((pix_l + npix_l + AVG4_ROUNDER) >> 2) & BYTE_VEC(0x03)) \
	109	+ pix_h + npix_h; \
	110	STORE(avg, block); \
	111	\
	112	block += line_size; \
	113	pix_l = npix_l; \
	114	pix_h = npix_h; \
	115	} while (--h); \
	116	} while (0)
	117
	118	#define MAKE_OP(OPNAME, SUFF, OPKIND, STORE) \
	119	static void OPNAME ## _pixels ## SUFF ## _axp \
	120	(uint8_t *restrict block, const uint8_t *restrict pixels, \
	121	ptrdiff_t line_size, int h) \
	122	{ \
	123	if ((size_t) pixels & 0x7) { \
	124	OPKIND(uldq, STORE); \
	125	} else { \
	126	OPKIND(ldq, STORE); \
	127	} \
	128	} \
	129	\
	130	static void OPNAME ## _pixels16 ## SUFF ## _axp \
	131	(uint8_t *restrict block, const uint8_t *restrict pixels, \
	132	ptrdiff_t line_size, int h) \
	133	{ \
	134	OPNAME ## _pixels ## SUFF ## _axp(block, pixels, line_size, h); \
	135	OPNAME ## _pixels ## SUFF ## _axp(block + 8, pixels + 8, line_size, h); \
	136	}
	137
	138	#define PIXOP(OPNAME, STORE) \
	139	MAKE_OP(OPNAME, , OP, STORE) \
	140	MAKE_OP(OPNAME, _x2, OP_X2, STORE) \
	141	MAKE_OP(OPNAME, _y2, OP_Y2, STORE) \
	142	MAKE_OP(OPNAME, _xy2, OP_XY2, STORE)
	143
	144	/* Rounding primitives. */
	145	#define AVG2 avg2
	146	#define AVG4 avg4
	147	#define AVG4_ROUNDER BYTE_VEC(0x02)
	148	#define STORE(l, b) stq(l, b)
	149	PIXOP(put, STORE);
	150
	151	#undef STORE
	152	#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
	153	PIXOP(avg, STORE);
	154
	155	/* Not rounding primitives. */
	156	#undef AVG2
	157	#undef AVG4
	158	#undef AVG4_ROUNDER
	159	#undef STORE
	160	#define AVG2 avg2_no_rnd
	161	#define AVG4 avg4_no_rnd
	162	#define AVG4_ROUNDER BYTE_VEC(0x01)
	163	#define STORE(l, b) stq(l, b)
	164	PIXOP(put_no_rnd, STORE);
	165
	166	#undef STORE
	167	#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
	168	PIXOP(avg_no_rnd, STORE);
	169
	170	static void put_pixels16_axp_asm(uint8_t *block, const uint8_t *pixels,
	171	ptrdiff_t line_size, int h)
	172	{
	173	put_pixels_axp_asm(block, pixels, line_size, h);
	174	put_pixels_axp_asm(block + 8, pixels + 8, line_size, h);
	175	}
	176
	177	av_cold void ff_hpeldsp_init_alpha(HpelDSPContext *c, int flags)
	178	{
	179	c->put_pixels_tab[0][0] = put_pixels16_axp_asm;
	180	c->put_pixels_tab[0][1] = put_pixels16_x2_axp;
	181	c->put_pixels_tab[0][2] = put_pixels16_y2_axp;
	182	c->put_pixels_tab[0][3] = put_pixels16_xy2_axp;
	183
	184	c->put_no_rnd_pixels_tab[0][0] = put_pixels16_axp_asm;
	185	c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_axp;
	186	c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_axp;
	187	c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_axp;
	188
	189	c->avg_pixels_tab[0][0] = avg_pixels16_axp;
	190	c->avg_pixels_tab[0][1] = avg_pixels16_x2_axp;
	191	c->avg_pixels_tab[0][2] = avg_pixels16_y2_axp;
	192	c->avg_pixels_tab[0][3] = avg_pixels16_xy2_axp;
	193
	194	c->avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels16_axp;
	195	c->avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels16_x2_axp;
	196	c->avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels16_y2_axp;
	197	c->avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels16_xy2_axp;
	198
	199	c->put_pixels_tab[1][0] = put_pixels_axp_asm;
	200	c->put_pixels_tab[1][1] = put_pixels_x2_axp;
	201	c->put_pixels_tab[1][2] = put_pixels_y2_axp;
	202	c->put_pixels_tab[1][3] = put_pixels_xy2_axp;
	203
	204	c->put_no_rnd_pixels_tab[1][0] = put_pixels_axp_asm;
	205	c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels_x2_axp;
	206	c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels_y2_axp;
	207	c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels_xy2_axp;
	208
	209	c->avg_pixels_tab[1][0] = avg_pixels_axp;
	210	c->avg_pixels_tab[1][1] = avg_pixels_x2_axp;
	211	c->avg_pixels_tab[1][2] = avg_pixels_y2_axp;
	212	c->avg_pixels_tab[1][3] = avg_pixels_xy2_axp;
	213	}