/*
 * Alpha optimized DSP utils
 * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavcodec/hpeldsp.h"
#include "hpeldsp_alpha.h"
#include "asm.h"   /* BYTE_VEC(), ldq(), uldq(), stq() */

/* Byte-wise average of two quadwords, truncating: per byte, (a + b) >> 1.
   Masking with 0xfe keeps the shift from moving bits across byte lanes. */
static inline uint64_t avg2_no_rnd(uint64_t a, uint64_t b)
{
    return (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
}

/* Byte-wise average of two quadwords, rounding up: per byte, (a + b + 1) >> 1. */
static inline uint64_t avg2(uint64_t a, uint64_t b)
{
    return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
}
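
/* Worked example for one byte lane, a = 0x05 and b = 0x08 (a ^ b = 0x0d):
 *   avg2:        (0x05 | 0x08) - ((0x0d & 0xfe) >> 1) = 0x0d - 0x06 = 0x07
 *   avg2_no_rnd: (0x05 & 0x08) + ((0x0d & 0xfe) >> 1) = 0x00 + 0x06 = 0x06
 * i.e. (5 + 8 + 1) >> 1 = 7 and (5 + 8) >> 1 = 6. The 0xfe mask is what
 * lets a single 64-bit operation average eight pixels at once without
 * carries leaking between neighbouring bytes. */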

/* The XY2 routines basically utilize this scheme, but reuse parts in
   each iteration.  */
static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
{
    /* Sum of the high six bits of each byte, pre-shifted so that four
       addends cannot carry into the neighbouring byte lane. */
    uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
                + ((l2 & ~BYTE_VEC(0x03)) >> 2)
                + ((l3 & ~BYTE_VEC(0x03)) >> 2)
                + ((l4 & ~BYTE_VEC(0x03)) >> 2);
    /* Rounded average of the low two bits of each byte; the final mask
       discards bits shifted in from the lane above. */
    uint64_t r2 = ((  (l1 & BYTE_VEC(0x03))
                    + (l2 & BYTE_VEC(0x03))
                    + (l3 & BYTE_VEC(0x03))
                    + (l4 & BYTE_VEC(0x03))
                    + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
    return r1 + r2;
}
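
/* Per byte lane, avg4() computes (l1 + l2 + l3 + l4 + 2) >> 2. E.g. for the
 * bytes 1, 2, 3, 4: r1 = 0+0+0+1 = 1, r2 = ((1+2+3+0 + 2) >> 2) & 3 = 2,
 * giving 3 = (1+2+3+4 + 2) >> 2. The OP_XY2 macro below uses AVG4_ROUNDER
 * in place of the hardcoded 0x02: 0x02 for the rounding variants and 0x01
 * for the no_rnd ones. */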

/* Copy: load one quadword (eight pixels) per row and store it. */
#define OP(LOAD, STORE)                         \
    do {                                        \
        STORE(LOAD(pixels), block);             \
        pixels += line_size;                    \
        block += line_size;                     \
    } while (--h)

/* Horizontal half-pel: average each pixel with its right neighbour.
   Alpha is little-endian, so pix1 >> 8 advances the row by one pixel
   and pixels[8] is spliced into the top byte. */
#define OP_X2(LOAD, STORE)                                      \
    do {                                                        \
        uint64_t pix1, pix2;                                    \
                                                                \
        pix1 = LOAD(pixels);                                    \
        pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56);        \
        STORE(AVG2(pix1, pix2), block);                         \
        pixels += line_size;                                    \
        block += line_size;                                     \
    } while (--h)

/* Vertical half-pel: average each row with the next, loading each
   source row only once. */
#define OP_Y2(LOAD, STORE)                      \
    do {                                        \
        uint64_t pix = LOAD(pixels);            \
        do {                                    \
            uint64_t next_pix;                  \
                                                \
            pixels += line_size;                \
            next_pix = LOAD(pixels);            \
            STORE(AVG2(pix, next_pix), block);  \
            block += line_size;                 \
            pix = next_pix;                     \
        } while (--h);                          \
    } while (0)

#define OP_XY2(LOAD, STORE)                                         \
    do {                                                            \
        uint64_t pix1 = LOAD(pixels);                               \
        uint64_t pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56);   \
        uint64_t pix_l = (pix1 & BYTE_VEC(0x03))                    \
                       + (pix2 & BYTE_VEC(0x03));                   \
        uint64_t pix_h = ((pix1 & ~BYTE_VEC(0x03)) >> 2)            \
                       + ((pix2 & ~BYTE_VEC(0x03)) >> 2);           \
                                                                    \
        do {                                                        \
            uint64_t npix1, npix2;                                  \
            uint64_t npix_l, npix_h;                                \
            uint64_t avg;                                           \
                                                                    \
            pixels += line_size;                                    \
            npix1  = LOAD(pixels);                                  \
            npix2  = npix1 >> 8 | ((uint64_t) pixels[8] << 56);     \
            npix_l = (npix1 & BYTE_VEC(0x03))                       \
                   + (npix2 & BYTE_VEC(0x03));                      \
            npix_h = ((npix1 & ~BYTE_VEC(0x03)) >> 2)               \
                   + ((npix2 & ~BYTE_VEC(0x03)) >> 2);              \
            avg    = (((pix_l + npix_l + AVG4_ROUNDER) >> 2)        \
                      & BYTE_VEC(0x03))                             \
                   + pix_h + npix_h;                                \
            STORE(avg, block);                                      \
                                                                    \
            block += line_size;                                     \
            pix_l  = npix_l;                                        \
            pix_h  = npix_h;                                        \
        } while (--h);                                              \
    } while (0)
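
/* OP_XY2 is the avg4() scheme from above with the per-row partial sums
 * (pix_l/pix_h) carried across iterations: each source row is loaded and
 * split into its low/high parts only once, even though it contributes to
 * two output rows. */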

#define MAKE_OP(OPNAME, SUFF, OPKIND, STORE)                                \
static void OPNAME ## _pixels ## SUFF ## _axp                               \
        (uint8_t *restrict block, const uint8_t *restrict pixels,           \
         ptrdiff_t line_size, int h)                                        \
{                                                                           \
    if ((size_t) pixels & 0x7) {                                            \
        OPKIND(uldq, STORE);                                                \
    } else {                                                                \
        OPKIND(ldq, STORE);                                                 \
    }                                                                       \
}                                                                           \
                                                                            \
static void OPNAME ## _pixels16 ## SUFF ## _axp                             \
        (uint8_t *restrict block, const uint8_t *restrict pixels,           \
         ptrdiff_t line_size, int h)                                        \
{                                                                           \
    OPNAME ## _pixels ## SUFF ## _axp(block,     pixels,     line_size, h); \
    OPNAME ## _pixels ## SUFF ## _axp(block + 8, pixels + 8, line_size, h); \
}

#define PIXOP(OPNAME, STORE)                    \
    MAKE_OP(OPNAME,     , OP,     STORE)        \
    MAKE_OP(OPNAME, _x2,  OP_X2,  STORE)        \
    MAKE_OP(OPNAME, _y2,  OP_Y2,  STORE)        \
    MAKE_OP(OPNAME, _xy2, OP_XY2, STORE)
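
/* ldq is the aligned quadword load; uldq is the unaligned-load helper from
 * asm.h, selected only when the source pointer is misaligned. Stores always
 * go through stq, i.e. the destination block is assumed 8-byte aligned. */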

/* Rounding primitives.  */
#define AVG2 avg2
#define AVG4 avg4
#define AVG4_ROUNDER BYTE_VEC(0x02)
#define STORE(l, b) stq(l, b)
PIXOP(put, STORE);

#undef STORE
#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
PIXOP(avg, STORE);

/* Not rounding primitives.  */
#undef AVG2
#undef AVG4
#undef AVG4_ROUNDER
#undef STORE
#define AVG2 avg2_no_rnd
#define AVG4 avg4_no_rnd
#define AVG4_ROUNDER BYTE_VEC(0x01)
#define STORE(l, b) stq(l, b)
PIXOP(put_no_rnd, STORE);

#undef STORE
#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
PIXOP(avg_no_rnd, STORE);
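
/* Each PIXOP() expansion defines eight static functions, an 8-pixel-wide
 * and a 16-pixel-wide one per half-pel mode, e.g. put_pixels_x2_axp and
 * put_pixels16_x2_axp. ff_hpeldsp_init_alpha() below plugs them into the
 * dispatch tables. */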

static void put_pixels16_axp_asm(uint8_t *block, const uint8_t *pixels,
                                 ptrdiff_t line_size, int h)
{
    put_pixels_axp_asm(block,     pixels,     line_size, h);
    put_pixels_axp_asm(block + 8, pixels + 8, line_size, h);
}

av_cold void ff_hpeldsp_init_alpha(HpelDSPContext *c, int flags)
{
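    /* Table layout: the first index of put_pixels_tab selects the block
     * width (0 = 16 pixels, 1 = 8 pixels), the second the half-pel mode
     * (0 = integer, 1 = horizontal, 2 = vertical, 3 = both).
     * avg_no_rnd_pixels_tab has only the 16-pixel variants. */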
    c->put_pixels_tab[0][0] = put_pixels16_axp_asm;
    c->put_pixels_tab[0][1] = put_pixels16_x2_axp;
    c->put_pixels_tab[0][2] = put_pixels16_y2_axp;
    c->put_pixels_tab[0][3] = put_pixels16_xy2_axp;

    c->put_no_rnd_pixels_tab[0][0] = put_pixels16_axp_asm;
    c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_axp;
    c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_axp;
    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_axp;

    c->avg_pixels_tab[0][0] = avg_pixels16_axp;
    c->avg_pixels_tab[0][1] = avg_pixels16_x2_axp;
    c->avg_pixels_tab[0][2] = avg_pixels16_y2_axp;
    c->avg_pixels_tab[0][3] = avg_pixels16_xy2_axp;

    c->avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels16_axp;
    c->avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels16_x2_axp;
    c->avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels16_y2_axp;
    c->avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels16_xy2_axp;

    c->put_pixels_tab[1][0] = put_pixels_axp_asm;
    c->put_pixels_tab[1][1] = put_pixels_x2_axp;
    c->put_pixels_tab[1][2] = put_pixels_y2_axp;
    c->put_pixels_tab[1][3] = put_pixels_xy2_axp;

    c->put_no_rnd_pixels_tab[1][0] = put_pixels_axp_asm;
    c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels_x2_axp;
    c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels_y2_axp;
    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels_xy2_axp;

    c->avg_pixels_tab[1][0] = avg_pixels_axp;
    c->avg_pixels_tab[1][1] = avg_pixels_x2_axp;
    c->avg_pixels_tab[1][2] = avg_pixels_y2_axp;
    c->avg_pixels_tab[1][3] = avg_pixels_xy2_axp;
}