/*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "config.h"
#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/hpeldsp.h"
#include "hpeldsp_altivec.h"

#if HAVE_ALTIVEC
/* next one assumes that ((line_size % 16) == 0) */
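/* Plain 16xh copy: one unaligned_load and one VEC_ST per row, with the
 * loop hand-unrolled by four (see the timing notes below). */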
void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register vector unsigned char pixelsv1;
    register vector unsigned char pixelsv1B;
    register vector unsigned char pixelsv1C;
    register vector unsigned char pixelsv1D;

    int i;
    register ptrdiff_t line_size_2 = line_size << 1;
    register ptrdiff_t line_size_3 = line_size + line_size_2;
    register ptrdiff_t line_size_4 = line_size << 2;

    // hand-unrolling the loop by 4 gains about 15%
    // minimum execution time goes from 74 to 60 cycles
    // it's faster than -funroll-loops, but using
    // -funroll-loops w/ this is bad - 74 cycles again.
    // all this is on a 7450, tuning for the 7450
    for (i = 0; i < h; i += 4) {
        pixelsv1  = unaligned_load(0, pixels);
        pixelsv1B = unaligned_load(line_size, pixels);
        pixelsv1C = unaligned_load(line_size_2, pixels);
        pixelsv1D = unaligned_load(line_size_3, pixels);
        VEC_ST(pixelsv1,  0,           (unsigned char*)block);
        VEC_ST(pixelsv1B, line_size,   (unsigned char*)block);
        VEC_ST(pixelsv1C, line_size_2, (unsigned char*)block);
        VEC_ST(pixelsv1D, line_size_3, (unsigned char*)block);
        /* advance four rows per iteration to match the unrolling */
        pixels += line_size_4;
        block  += line_size_4;
    }
}

/* next one assumes that ((line_size % 16) == 0) */
#define op_avg(a,b)  a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
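/* op_avg is the classic scalar SWAR rounded average: per byte of a 32-bit
 * word, (a|b) - (((a^b) & 0xfefefefe) >> 1) == (a + b + 1) >> 1. It is not
 * referenced by the AltiVec code below, which gets the same rounding from
 * vec_avg on all 16 bytes at once. */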
void ff_avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register vector unsigned char pixelsv, blockv;

    int i;
    for (i = 0; i < h; i++) {
        blockv  = vec_ld(0, block);
        pixelsv = VEC_LD(0, pixels);
        blockv  = vec_avg(blockv, pixelsv);
        vec_st(blockv, 0, (unsigned char*)block);
        pixels += line_size;
        block  += line_size;
    }
}

/* next one assumes that ((line_size % 8) == 0) */
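/* 8xh averaging. AltiVec has no 8-byte store, so the eight freshly
 * loaded bytes are merged into whichever half of the 16-byte destination
 * vector the block occupies; the other half ends up averaged with
 * itself, i.e. written back unchanged. */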
static void avg_pixels8_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    int i;

    for (i = 0; i < h; i++) {
        /* block is 8 bytes-aligned, so we're either in the
           left block (16 bytes-aligned) or in the right block (not) */
        int rightside = ((unsigned long)block & 0x0000000F);

        blockv  = vec_ld(0, block);
        pixelsv = VEC_LD(0, pixels);

        if (rightside) {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(0, 1, s0, s1));
        } else {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(s0, s1, 2, 3));
        }

        blockv = vec_avg(blockv, pixelsv);

        vec_st(blockv, 0, block);

        pixels += line_size;
        block  += line_size;
    }
}

/* next one assumes that ((line_size % 8) == 0) */
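/* 8xh put with half-pel interpolation in both x and y. Scalar reference
 * for one output pixel, with p the current source row and q the next:
 *     dst[x] = (p[x] + p[x+1] + q[x] + q[x+1] + 2) >> 2
 * The widened horizontal sums of a row are computed once and carried
 * over in pixelssum1 (rounding bias included), so each source row is
 * loaded and summed only once. */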
static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv;
    register vector unsigned short pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    pixelsv1 = VEC_LD(0, pixels);
    pixelsv2 = VEC_LD(1, pixels);
    pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
    pixelsv2 = VEC_MERGEH(vczero, pixelsv2);

    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    for (i = 0; i < h; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        pixelsv1 = unaligned_load(line_size, pixels);
        pixelsv2 = unaligned_load(line_size+1, pixels);
        pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
        pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside) {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block  += line_size;
        pixels += line_size;
    }
}

/* next one assumes that ((line_size % 8) == 0) */
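/* Same scheme as put_pixels8_xy2_altivec, but for the "no rounding"
 * table: the carried bias is vcone (+1) instead of vctwo (+2), giving
 * (a + b + c + d + 1) >> 2 per pixel. */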
static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv;
    register vector unsigned short pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    pixelsv1 = VEC_LD(0, pixels);
    pixelsv2 = VEC_LD(1, pixels);
    pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
    pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

    for (i = 0; i < h; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        pixelsv1 = unaligned_load(line_size, pixels);
        pixelsv2 = unaligned_load(line_size+1, pixels);
        pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
        pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vcone);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside) {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block  += line_size;
        pixels += line_size;
    }
}

/* next one assumes that ((line_size % 16) == 0) */
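/* 16xh version of the xy2 interpolation: each 16-byte row is widened
 * into two vectors of eight 16-bit sums (VEC_MERGEH for the first eight
 * bytes, VEC_MERGEL for the last eight), interpolated separately, and
 * repacked into 16 bytes with vec_packsu. */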
static void put_pixels16_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char blockv;
    register vector unsigned short temp3, temp4,
        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    pixelsv1 = VEC_LD(0, pixels);
    pixelsv2 = VEC_LD(1, pixels);
    pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
    pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
    pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
    pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vctwo);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    for (i = 0; i < h; i++) {
        blockv = vec_ld(0, block);

        pixelsv1 = unaligned_load(line_size, pixels);
        pixelsv2 = unaligned_load(line_size+1, pixels);

        pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
        pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
        pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
        pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        pixelssum3 = vec_add(pixelssum4, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);

        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block  += line_size;
        pixels += line_size;
    }
}

/* next one assumes that ((line_size % 16) == 0) */
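/* No-rounding 16xh variant: as put_pixels16_xy2_altivec, but the carried
 * sums are biased with vcone (+1) rather than vctwo (+2); since the full
 * 16-byte row is produced, the destination is never read here. */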
static void put_no_rnd_pixels16_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char blockv;
    register vector unsigned short temp3, temp4,
        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    pixelsv1 = VEC_LD(0, pixels);
    pixelsv2 = VEC_LD(1, pixels);
    pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
    pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
    pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
    pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vcone);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

    for (i = 0; i < h; i++) {
        pixelsv1 = unaligned_load(line_size, pixels);
        pixelsv2 = unaligned_load(line_size+1, pixels);

        pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
        pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
        pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
        pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        pixelssum3 = vec_add(pixelssum4, vcone);
        pixelssum1 = vec_add(pixelssum2, vcone);

        blockv = vec_packsu(temp3, temp4);

        VEC_ST(blockv, 0, block);

        block  += line_size;
        pixels += line_size;
    }
}

/* next one assumes that ((line_size % 8) == 0) */
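/* 8xh xy2 averaging: interpolates exactly like put_pixels8_xy2_altivec,
 * then vec_avg's the merged result (blocktemp) with the existing
 * destination before storing. */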
static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, blocktemp;
    register vector unsigned short pixelssum1, pixelssum2, temp3;

    register const vector unsigned char vczero = (const vector unsigned char)
                                        vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)
                                        vec_splat_u16(2);

    pixelsv1 = VEC_LD(0, pixels);
    pixelsv2 = VEC_LD(1, pixels);
    pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
    pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    for (i = 0; i < h; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        pixelsv1 = unaligned_load(line_size, pixels);
        pixelsv2 = unaligned_load(line_size+1, pixels);

        pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
        pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside) {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        blockv = vec_avg(blocktemp, blockv);
        vec_st(blockv, 0, block);

        block  += line_size;
        pixels += line_size;
    }
}

#endif /* HAVE_ALTIVEC */
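/* Install the AltiVec routines into the HpelDSP function tables.
 * Table indexing is [width][dxy]: width 0 = 16-pixel and 1 = 8-pixel
 * blocks; dxy 0 = integer position, 3 = half-pel offset in both x and y. */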
av_cold void ff_hpeldsp_init_ppc(HpelDSPContext *c, int flags)
{
#if HAVE_ALTIVEC
    if (!PPC_ALTIVEC(av_get_cpu_flags()))
        return;

    c->avg_pixels_tab[0][0]        = ff_avg_pixels16_altivec;
    c->avg_pixels_tab[1][0]        = avg_pixels8_altivec;
    c->avg_pixels_tab[1][3]        = avg_pixels8_xy2_altivec;

    c->put_pixels_tab[0][0]        = ff_put_pixels16_altivec;
    c->put_pixels_tab[1][3]        = put_pixels8_xy2_altivec;
    c->put_pixels_tab[0][3]        = put_pixels16_xy2_altivec;

    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_altivec;
    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
#endif /* HAVE_ALTIVEC */
}