[deb_ffmpeg.git] / ffmpeg / libavcodec / alpha / hpeldsp_alpha.c

/*
 * Alpha optimized DSP utils
 * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavcodec/hpeldsp.h"
#include "hpeldsp_alpha.h"
#include "asm.h"

/* Byte-wise average of two packed 64-bit words, rounding down.
   The bits common to both inputs are added to half of the bits that
   differ.  The differing bits are masked with BYTE_VEC(0xfe) before the
   shift (presumably 0xfe replicated into every byte -- see asm.h) so that
   bit 0 of one byte cannot slide into the byte below it.  */
static inline uint64_t avg2_no_rnd(uint64_t a, uint64_t b)
{
    const uint64_t shared    = a & b;
    const uint64_t half_diff = ((a ^ b) & BYTE_VEC(0xfe)) >> 1;

    return shared + half_diff;
}

/* Byte-wise average of two packed 64-bit words, rounding up.
   Starts from the union of the set bits and subtracts half of the bits
   that differ.  The differing bits are masked with BYTE_VEC(0xfe) before
   the shift (presumably 0xfe replicated into every byte -- see asm.h) so
   that bit 0 of one byte cannot slide into the byte below it.  */
static inline uint64_t avg2(uint64_t a, uint64_t b)
{
    const uint64_t either    = a | b;
    const uint64_t half_diff = ((a ^ b) & BYTE_VEC(0xfe)) >> 1;

    return either - half_diff;
}

#if 0
/* Reference form of the four-input average; compiled out and kept only
   as documentation.  The XY2 routines basically utilize this scheme, but
   reuse parts in each iteration.
   r1 adds the four inputs after clearing the two low bits of every byte
   and shifting right by two.  r2 adds the two low bits of every byte of
   the four inputs plus the constant BYTE_VEC(0x02), shifts that sum right
   by two and keeps the two low bits per byte.  */
static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
{
    uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
                + ((l2 & ~BYTE_VEC(0x03)) >> 2)
                + ((l3 & ~BYTE_VEC(0x03)) >> 2)
                + ((l4 & ~BYTE_VEC(0x03)) >> 2);
    uint64_t r2 = ((  (l1 & BYTE_VEC(0x03))
                    + (l2 & BYTE_VEC(0x03))
                    + (l3 & BYTE_VEC(0x03))
                    + (l4 & BYTE_VEC(0x03))
                    + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
    return r1 + r2;
}
#endif

/* Straight row transfer: for each of h rows, fetch one 64-bit word from
   pixels with LOAD and hand it to STORE together with block, then step
   both pointers by line_size.  Expands in a scope that provides pixels,
   block, line_size and h; all but line_size are modified.  The body runs
   before h is tested, so h must be non-zero on entry.  */
#define OP(LOAD, STORE)                         \
    do {                                        \
        const uint64_t row = LOAD(pixels);      \
                                                \
        STORE(row, block);                      \
        block  += line_size;                    \
        pixels += line_size;                    \
    } while (--h != 0)

/* Horizontal two-tap average: for each of h rows, combine the 64-bit word
   at pixels with the same word shifted right by one byte, whose top byte
   is refilled from pixels[8], using AVG2, and pass the result to STORE.
   Expands in a scope that provides pixels, block, line_size and h; the
   body runs before h is tested, so h must be non-zero on entry.  */
#define OP_X2(LOAD, STORE)                                              \
    do {                                                                \
        const uint64_t cur = LOAD(pixels);                              \
        const uint64_t nxt = ((uint64_t) pixels[8] << 56) | (cur >> 8); \
                                                                        \
        STORE(AVG2(cur, nxt), block);                                   \
        block  += line_size;                                            \
        pixels += line_size;                                            \
    } while (--h != 0)

/* Vertical two-tap average: each output row is AVG2 of the 64-bit word
   on the current source row and the word one line_size further on.  The
   lower row's word is carried over to serve as the upper row of the next
   iteration, so h + 1 source rows are loaded for h output rows.  Expands
   in a scope that provides pixels, block, line_size and h; h must be
   non-zero on entry.  */
#define OP_Y2(LOAD, STORE)                              \
    do {                                                \
        uint64_t upper = LOAD(pixels);                  \
                                                        \
        do {                                            \
            uint64_t lower;                             \
                                                        \
            pixels += line_size;                        \
            lower = LOAD(pixels);                       \
            STORE(AVG2(upper, lower), block);           \
            upper = lower;                              \
            block += line_size;                         \
        } while (--h != 0);                             \
    } while (0)

/* Four-tap (2x2) average.  For every source row the macro forms two
   partial sums over the word at pixels and the same word shifted right by
   one byte (top byte refilled from pixels[8]): lo_sum holds the two low
   bits of every byte, hi_sum holds the remaining bits pre-shifted right by
   two.  An output row adds the partial sums of two consecutive source
   rows; the low part gets AVG4_ROUNDER added before it is shifted right by
   two.  The partial sums of the lower row are carried into the next
   iteration, so h + 1 source rows are loaded for h output rows.  Expands
   in a scope that provides pixels, block, line_size and h; h must be
   non-zero on entry.  */
#define OP_XY2(LOAD, STORE)                                                 \
    do {                                                                    \
        uint64_t row_a  = LOAD(pixels);                                     \
        uint64_t row_b  = ((uint64_t) pixels[8] << 56) | (row_a >> 8);      \
        uint64_t lo_sum = (row_a & BYTE_VEC(0x03))                          \
                        + (row_b & BYTE_VEC(0x03));                         \
        uint64_t hi_sum = ((row_a & ~BYTE_VEC(0x03)) >> 2)                  \
                        + ((row_b & ~BYTE_VEC(0x03)) >> 2);                 \
                                                                            \
        do {                                                                \
            uint64_t next_a, next_b;                                        \
            uint64_t next_lo, next_hi;                                      \
            uint64_t lo_avg, quad;                                          \
                                                                            \
            pixels += line_size;                                            \
            next_a  = LOAD(pixels);                                         \
            next_b  = ((uint64_t) pixels[8] << 56) | (next_a >> 8);         \
            next_lo = (next_a & BYTE_VEC(0x03))                             \
                    + (next_b & BYTE_VEC(0x03));                            \
            next_hi = ((next_a & ~BYTE_VEC(0x03)) >> 2)                     \
                    + ((next_b & ~BYTE_VEC(0x03)) >> 2);                    \
            lo_avg  = ((lo_sum + next_lo + AVG4_ROUNDER) >> 2)              \
                    & BYTE_VEC(0x03);                                       \
            quad    = lo_avg + hi_sum + next_hi;                            \
            STORE(quad, block);                                             \
                                                                            \
            lo_sum = next_lo;                                               \
            hi_sum = next_hi;                                               \
            block += line_size;                                             \
        } while (--h != 0);                                                 \
    } while (0)

/* Emit two static functions for one operation:
     OPNAME_pixels<SUFF>_axp    runs OPKIND once on the given block;
     OPNAME_pixels16<SUFF>_axp  runs the function above at byte offsets
                                0 and 8 of block and pixels.
   The first function passes ldq to OPKIND when the low three address bits
   of pixels are all clear and uldq otherwise (both come from asm.h;
   presumably the aligned and unaligned 64-bit loads).  */
#define MAKE_OP(OPNAME, SUFF, OPKIND, STORE)                                \
static void OPNAME ## _pixels ## SUFF ## _axp                               \
        (uint8_t *restrict block, const uint8_t *restrict pixels,           \
         ptrdiff_t line_size, int h)                                        \
{                                                                           \
    if (((size_t) pixels & 0x7) == 0) {                                     \
        OPKIND(ldq, STORE);                                                 \
    } else {                                                                \
        OPKIND(uldq, STORE);                                                \
    }                                                                       \
}                                                                           \
                                                                            \
static void OPNAME ## _pixels16 ## SUFF ## _axp                             \
        (uint8_t *restrict block, const uint8_t *restrict pixels,           \
         ptrdiff_t line_size, int h)                                        \
{                                                                           \
    int x;                                                                  \
                                                                            \
    for (x = 0; x < 16; x += 8) {                                           \
        OPNAME ## _pixels ## SUFF ## _axp(block + x, pixels + x,            \
                                          line_size, h);                    \
    }                                                                       \
}

/* Instantiate MAKE_OP four times for the operation named OPNAME: once
   with an empty suffix using OP, and once each for the _x2, _y2 and _xy2
   suffixes using OP_X2, OP_Y2 and OP_XY2.  STORE is forwarded unchanged
   and is expanded with whatever definition is active where PIXOP is
   invoked.  */
#define PIXOP(OPNAME, STORE)                    \
    MAKE_OP(OPNAME, ,     OP,     STORE)        \
    MAKE_OP(OPNAME, _x2,  OP_X2,  STORE)        \
    MAKE_OP(OPNAME, _y2,  OP_Y2,  STORE)        \
    MAKE_OP(OPNAME, _xy2, OP_XY2, STORE)

/* Rounding primitives.  */
#define AVG2 avg2
#define AVG4 avg4
#define AVG4_ROUNDER BYTE_VEC(0x02)
#define STORE(l, b) stq(l, b)
PIXOP(put, STORE);

#undef STORE
#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
PIXOP(avg, STORE);

/* Not rounding primitives.  */
#undef AVG2
#undef AVG4
#undef AVG4_ROUNDER
#undef STORE
#define AVG2 avg2_no_rnd
#define AVG4 avg4_no_rnd
#define AVG4_ROUNDER BYTE_VEC(0x01)
#define STORE(l, b) stq(l, b)
PIXOP(put_no_rnd, STORE);

#undef STORE
#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
PIXOP(avg_no_rnd, STORE);

/* 16-byte-wide wrapper around put_pixels_axp_asm: invokes it once at byte
   offset 0 and once at byte offset 8 of block and pixels, with the same
   line_size and h.  */
static void put_pixels16_axp_asm(uint8_t *block, const uint8_t *pixels,
                                 ptrdiff_t line_size, int h)
{
    int x;

    for (x = 0; x < 16; x += 8) {
        put_pixels_axp_asm(block + x, pixels + x, line_size, h);
    }
}

/* Fill the function tables of c with the routines from this file.
   In the two-dimensional tables, row 0 receives the pixels16 variants and
   row 1 the 8-byte-wide variants; the second index selects plain, _x2,
   _y2 and _xy2 in that order.  avg_no_rnd_pixels_tab is indexed by that
   variant only and receives pixels16 routines.  flags is not used.  */
av_cold void ff_hpeldsp_init_alpha(HpelDSPContext *c, int flags)
{
    /* put, rounding */
    c->put_pixels_tab[0][0] = put_pixels16_axp_asm;
    c->put_pixels_tab[0][1] = put_pixels16_x2_axp;
    c->put_pixels_tab[0][2] = put_pixels16_y2_axp;
    c->put_pixels_tab[0][3] = put_pixels16_xy2_axp;
    c->put_pixels_tab[1][0] = put_pixels_axp_asm;
    c->put_pixels_tab[1][1] = put_pixels_x2_axp;
    c->put_pixels_tab[1][2] = put_pixels_y2_axp;
    c->put_pixels_tab[1][3] = put_pixels_xy2_axp;

    /* put, no rounding; the plain entries reuse the rounding table's copy
       routines (presumably because a plain copy does no averaging) */
    c->put_no_rnd_pixels_tab[0][0] = put_pixels16_axp_asm;
    c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_axp;
    c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_axp;
    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_axp;
    c->put_no_rnd_pixels_tab[1][0] = put_pixels_axp_asm;
    c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels_x2_axp;
    c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels_y2_axp;
    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels_xy2_axp;

    /* avg, rounding */
    c->avg_pixels_tab[0][0] = avg_pixels16_axp;
    c->avg_pixels_tab[0][1] = avg_pixels16_x2_axp;
    c->avg_pixels_tab[0][2] = avg_pixels16_y2_axp;
    c->avg_pixels_tab[0][3] = avg_pixels16_xy2_axp;
    c->avg_pixels_tab[1][0] = avg_pixels_axp;
    c->avg_pixels_tab[1][1] = avg_pixels_x2_axp;
    c->avg_pixels_tab[1][2] = avg_pixels_y2_axp;
    c->avg_pixels_tab[1][3] = avg_pixels_xy2_axp;

    /* avg, no rounding */
    c->avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels16_axp;
    c->avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels16_x2_axp;
    c->avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels16_y2_axp;
    c->avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels16_xy2_axp;
}
Commit	Line	Data
2ba45a60 DM	1	/*
	2	* Alpha optimized DSP utils
	3	* Copyright (c) 2002 Falk Hueffner <falk@debian.org>
	4	*
	5	* This file is part of FFmpeg.
	6	*
	7	* FFmpeg is free software; you can redistribute it and/or
	8	* modify it under the terms of the GNU Lesser General Public
	9	* License as published by the Free Software Foundation; either
	10	* version 2.1 of the License, or (at your option) any later version.
	11	*
	12	* FFmpeg is distributed in the hope that it will be useful,
	13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	15	* Lesser General Public License for more details.
	16	*
	17	* You should have received a copy of the GNU Lesser General Public
	18	* License along with FFmpeg; if not, write to the Free Software
	19	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
	20	*/
	21
	22	#include "libavutil/attributes.h"
	23	#include "libavcodec/hpeldsp.h"
	24	#include "hpeldsp_alpha.h"
	25	#include "asm.h"
	26
	27	static inline uint64_t avg2_no_rnd(uint64_t a, uint64_t b)
	28	{
	29	return (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
	30	}
	31
	32	static inline uint64_t avg2(uint64_t a, uint64_t b)
	33	{
	34	return (a \| b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
	35	}
	36
	37	#if 0
	38	/* The XY2 routines basically utilize this scheme, but reuse parts in
	39	each iteration. */
	40	static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
	41	{
	42	uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
	43	+ ((l2 & ~BYTE_VEC(0x03)) >> 2)
	44	+ ((l3 & ~BYTE_VEC(0x03)) >> 2)
	45	+ ((l4 & ~BYTE_VEC(0x03)) >> 2);
	46	uint64_t r2 = (( (l1 & BYTE_VEC(0x03))
	47	+ (l2 & BYTE_VEC(0x03))
	48	+ (l3 & BYTE_VEC(0x03))
	49	+ (l4 & BYTE_VEC(0x03))
	50	+ BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
	51	return r1 + r2;
	52	}
	53	#endif
	54
	55	#define OP(LOAD, STORE) \
	56	do { \
	57	STORE(LOAD(pixels), block); \
	58	pixels += line_size; \
	59	block += line_size; \
	60	} while (--h)
	61
	62	#define OP_X2(LOAD, STORE) \
	63	do { \
	64	uint64_t pix1, pix2; \
65	\
66	pix1 = LOAD(pixels); \
67	pix2 = pix1 >> 8 \| ((uint64_t) pixels[8] << 56); \
68	STORE(AVG2(pix1, pix2), block); \
69	pixels += line_size; \
70	block += line_size; \
71	} while (--h)
72
73	#define OP_Y2(LOAD, STORE) \
74	do { \
75	uint64_t pix = LOAD(pixels); \
76	do { \
77	uint64_t next_pix; \
78	\
79	pixels += line_size; \
80	next_pix = LOAD(pixels); \
81	STORE(AVG2(pix, next_pix), block); \
82	block += line_size; \
83	pix = next_pix; \
84	} while (--h); \
85	} while (0)
86
87	#define OP_XY2(LOAD, STORE) \
88	do { \
89	uint64_t pix1 = LOAD(pixels); \
90	uint64_t pix2 = pix1 >> 8 \| ((uint64_t) pixels[8] << 56); \
91	uint64_t pix_l = (pix1 & BYTE_VEC(0x03)) \
92	+ (pix2 & BYTE_VEC(0x03)); \
93	uint64_t pix_h = ((pix1 & ~BYTE_VEC(0x03)) >> 2) \
94	+ ((pix2 & ~BYTE_VEC(0x03)) >> 2); \
95	\
96	do { \
97	uint64_t npix1, npix2; \
98	uint64_t npix_l, npix_h; \
99	uint64_t avg; \
100	\
101	pixels += line_size; \
102	npix1 = LOAD(pixels); \
103	npix2 = npix1 >> 8 \| ((uint64_t) pixels[8] << 56); \
104	npix_l = (npix1 & BYTE_VEC(0x03)) \
105	+ (npix2 & BYTE_VEC(0x03)); \
106	npix_h = ((npix1 & ~BYTE_VEC(0x03)) >> 2) \
107	+ ((npix2 & ~BYTE_VEC(0x03)) >> 2); \
108	avg = (((pix_l + npix_l + AVG4_ROUNDER) >> 2) & BYTE_VEC(0x03)) \
109	+ pix_h + npix_h; \
110	STORE(avg, block); \
111	\
112	block += line_size; \
113	pix_l = npix_l; \
114	pix_h = npix_h; \
115	} while (--h); \
116	} while (0)
117
118	#define MAKE_OP(OPNAME, SUFF, OPKIND, STORE) \
119	static void OPNAME ## _pixels ## SUFF ## _axp \
120	(uint8_t restrict block, const uint8_t restrict pixels, \
121	ptrdiff_t line_size, int h) \
122	{ \
123	if ((size_t) pixels & 0x7) { \
124	OPKIND(uldq, STORE); \
125	} else { \
126	OPKIND(ldq, STORE); \
127	} \
128	} \
129	\
130	static void OPNAME ## _pixels16 ## SUFF ## _axp \
131	(uint8_t restrict block, const uint8_t restrict pixels, \
132	ptrdiff_t line_size, int h) \
133	{ \
134	OPNAME ## _pixels ## SUFF ## _axp(block, pixels, line_size, h); \
135	OPNAME ## _pixels ## SUFF ## _axp(block + 8, pixels + 8, line_size, h); \
136	}
137
138	#define PIXOP(OPNAME, STORE) \
139	MAKE_OP(OPNAME, , OP, STORE) \
140	MAKE_OP(OPNAME, _x2, OP_X2, STORE) \
141	MAKE_OP(OPNAME, _y2, OP_Y2, STORE) \
142	MAKE_OP(OPNAME, _xy2, OP_XY2, STORE)
143
144	/* Rounding primitives. */
145	#define AVG2 avg2
146	#define AVG4 avg4
147	#define AVG4_ROUNDER BYTE_VEC(0x02)
148	#define STORE(l, b) stq(l, b)
149	PIXOP(put, STORE);
150
151	#undef STORE
152	#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
153	PIXOP(avg, STORE);
154
155	/* Not rounding primitives. */
156	#undef AVG2
157	#undef AVG4
158	#undef AVG4_ROUNDER
159	#undef STORE
160	#define AVG2 avg2_no_rnd
161	#define AVG4 avg4_no_rnd
162	#define AVG4_ROUNDER BYTE_VEC(0x01)
163	#define STORE(l, b) stq(l, b)
164	PIXOP(put_no_rnd, STORE);
165
166	#undef STORE
167	#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
168	PIXOP(avg_no_rnd, STORE);
169
170	static void put_pixels16_axp_asm(uint8_t block, const uint8_t pixels,
171	ptrdiff_t line_size, int h)
172	{
173	put_pixels_axp_asm(block, pixels, line_size, h);
174	put_pixels_axp_asm(block + 8, pixels + 8, line_size, h);
175	}
176
177	av_cold void ff_hpeldsp_init_alpha(HpelDSPContext *c, int flags)
178	{
179	c->put_pixels_tab[0][0] = put_pixels16_axp_asm;
180	c->put_pixels_tab[0][1] = put_pixels16_x2_axp;
181	c->put_pixels_tab[0][2] = put_pixels16_y2_axp;
182	c->put_pixels_tab[0][3] = put_pixels16_xy2_axp;
183
184	c->put_no_rnd_pixels_tab[0][0] = put_pixels16_axp_asm;
185	c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_axp;
186	c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_axp;
187	c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_axp;
188
189	c->avg_pixels_tab[0][0] = avg_pixels16_axp;
190	c->avg_pixels_tab[0][1] = avg_pixels16_x2_axp;
191	c->avg_pixels_tab[0][2] = avg_pixels16_y2_axp;
192	c->avg_pixels_tab[0][3] = avg_pixels16_xy2_axp;
193
194	c->avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels16_axp;
195	c->avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels16_x2_axp;
196	c->avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels16_y2_axp;
197	c->avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels16_xy2_axp;
198
199	c->put_pixels_tab[1][0] = put_pixels_axp_asm;
200	c->put_pixels_tab[1][1] = put_pixels_x2_axp;
201	c->put_pixels_tab[1][2] = put_pixels_y2_axp;
202	c->put_pixels_tab[1][3] = put_pixels_xy2_axp;
203
204	c->put_no_rnd_pixels_tab[1][0] = put_pixels_axp_asm;
205	c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels_x2_axp;
206	c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels_y2_axp;
207	c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels_xy2_axp;
208
209	c->avg_pixels_tab[1][0] = avg_pixels_axp;
210	c->avg_pixels_tab[1][1] = avg_pixels_x2_axp;
211	c->avg_pixels_tab[1][2] = avg_pixels_y2_axp;
212	c->avg_pixels_tab[1][3] = avg_pixels_xy2_axp;
213	}