/*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "config.h"
#if HAVE_ALTIVEC
#include <altivec.h>
#endif

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/hpeldsp.h"
#include "hpeldsp_altivec.h"

#if HAVE_ALTIVEC
38 /* next one assumes that ((line_size % 16) == 0) */
39 void ff_put_pixels16_altivec(uint8_t *block
, const uint8_t *pixels
, ptrdiff_t line_size
, int h
)
41 register vector
unsigned char pixelsv1
, pixelsv2
;
42 register vector
unsigned char pixelsv1B
, pixelsv2B
;
43 register vector
unsigned char pixelsv1C
, pixelsv2C
;
44 register vector
unsigned char pixelsv1D
, pixelsv2D
;
46 register vector
unsigned char perm
= vec_lvsl(0, pixels
);
48 register ptrdiff_t line_size_2
= line_size
<< 1;
49 register ptrdiff_t line_size_3
= line_size
+ line_size_2
;
50 register ptrdiff_t line_size_4
= line_size
<< 2;
52 // hand-unrolling the loop by 4 gains about 15%
53 // mininum execution time goes from 74 to 60 cycles
54 // it's faster than -funroll-loops, but using
55 // -funroll-loops w/ this is bad - 74 cycles again.
56 // all this is on a 7450, tuning for the 7450
57 for (i
= 0; i
< h
; i
+= 4) {
58 pixelsv1
= vec_ld( 0, pixels
);
59 pixelsv2
= vec_ld(15, pixels
);
60 pixelsv1B
= vec_ld(line_size
, pixels
);
61 pixelsv2B
= vec_ld(15 + line_size
, pixels
);
62 pixelsv1C
= vec_ld(line_size_2
, pixels
);
63 pixelsv2C
= vec_ld(15 + line_size_2
, pixels
);
64 pixelsv1D
= vec_ld(line_size_3
, pixels
);
65 pixelsv2D
= vec_ld(15 + line_size_3
, pixels
);
66 vec_st(vec_perm(pixelsv1
, pixelsv2
, perm
),
67 0, (unsigned char*)block
);
68 vec_st(vec_perm(pixelsv1B
, pixelsv2B
, perm
),
69 line_size
, (unsigned char*)block
);
70 vec_st(vec_perm(pixelsv1C
, pixelsv2C
, perm
),
71 line_size_2
, (unsigned char*)block
);
72 vec_st(vec_perm(pixelsv1D
, pixelsv2D
, perm
),
73 line_size_3
, (unsigned char*)block
);
79 /* next one assumes that ((line_size % 16) == 0) */
80 #define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
81 void ff_avg_pixels16_altivec(uint8_t *block
, const uint8_t *pixels
, ptrdiff_t line_size
, int h
)
83 register vector
unsigned char pixelsv1
, pixelsv2
, pixelsv
, blockv
;
84 register vector
unsigned char perm
= vec_lvsl(0, pixels
);
87 for (i
= 0; i
< h
; i
++) {
88 pixelsv1
= vec_ld( 0, pixels
);
89 pixelsv2
= vec_ld(16,pixels
);
90 blockv
= vec_ld(0, block
);
91 pixelsv
= vec_perm(pixelsv1
, pixelsv2
, perm
);
92 blockv
= vec_avg(blockv
,pixelsv
);
93 vec_st(blockv
, 0, (unsigned char*)block
);
99 /* next one assumes that ((line_size % 8) == 0) */
100 static void avg_pixels8_altivec(uint8_t * block
, const uint8_t * pixels
, ptrdiff_t line_size
, int h
)
102 register vector
unsigned char pixelsv1
, pixelsv2
, pixelsv
, blockv
;
105 for (i
= 0; i
< h
; i
++) {
106 /* block is 8 bytes-aligned, so we're either in the
107 left block (16 bytes-aligned) or in the right block (not) */
108 int rightside
= ((unsigned long)block
& 0x0000000F);
110 blockv
= vec_ld(0, block
);
111 pixelsv1
= vec_ld( 0, pixels
);
112 pixelsv2
= vec_ld(16, pixels
);
113 pixelsv
= vec_perm(pixelsv1
, pixelsv2
, vec_lvsl(0, pixels
));
116 pixelsv
= vec_perm(blockv
, pixelsv
, vcprm(0,1,s0
,s1
));
118 pixelsv
= vec_perm(blockv
, pixelsv
, vcprm(s0
,s1
,2,3));
121 blockv
= vec_avg(blockv
, pixelsv
);
123 vec_st(blockv
, 0, block
);
130 /* next one assumes that ((line_size % 8) == 0) */
131 static void put_pixels8_xy2_altivec(uint8_t *block
, const uint8_t *pixels
, ptrdiff_t line_size
, int h
)
134 register vector
unsigned char pixelsv1
, pixelsv2
, pixelsavg
;
135 register vector
unsigned char blockv
, temp1
, temp2
;
136 register vector
unsigned short pixelssum1
, pixelssum2
, temp3
;
137 register const vector
unsigned char vczero
= (const vector
unsigned char)vec_splat_u8(0);
138 register const vector
unsigned short vctwo
= (const vector
unsigned short)vec_splat_u16(2);
140 temp1
= vec_ld(0, pixels
);
141 temp2
= vec_ld(16, pixels
);
142 pixelsv1
= vec_perm(temp1
, temp2
, vec_lvsl(0, pixels
));
143 if ((((unsigned long)pixels
) & 0x0000000F) == 0x0000000F) {
146 pixelsv2
= vec_perm(temp1
, temp2
, vec_lvsl(1, pixels
));
148 pixelsv1
= vec_mergeh(vczero
, pixelsv1
);
149 pixelsv2
= vec_mergeh(vczero
, pixelsv2
);
150 pixelssum1
= vec_add((vector
unsigned short)pixelsv1
,
151 (vector
unsigned short)pixelsv2
);
152 pixelssum1
= vec_add(pixelssum1
, vctwo
);
154 for (i
= 0; i
< h
; i
++) {
155 int rightside
= ((unsigned long)block
& 0x0000000F);
156 blockv
= vec_ld(0, block
);
158 temp1
= vec_ld(line_size
, pixels
);
159 temp2
= vec_ld(line_size
+ 16, pixels
);
160 pixelsv1
= vec_perm(temp1
, temp2
, vec_lvsl(line_size
, pixels
));
161 if (((((unsigned long)pixels
) + line_size
) & 0x0000000F) == 0x0000000F) {
164 pixelsv2
= vec_perm(temp1
, temp2
, vec_lvsl(line_size
+ 1, pixels
));
167 pixelsv1
= vec_mergeh(vczero
, pixelsv1
);
168 pixelsv2
= vec_mergeh(vczero
, pixelsv2
);
169 pixelssum2
= vec_add((vector
unsigned short)pixelsv1
,
170 (vector
unsigned short)pixelsv2
);
171 temp3
= vec_add(pixelssum1
, pixelssum2
);
172 temp3
= vec_sra(temp3
, vctwo
);
173 pixelssum1
= vec_add(pixelssum2
, vctwo
);
174 pixelsavg
= vec_packsu(temp3
, (vector
unsigned short) vczero
);
177 blockv
= vec_perm(blockv
, pixelsavg
, vcprm(0, 1, s0
, s1
));
179 blockv
= vec_perm(blockv
, pixelsavg
, vcprm(s0
, s1
, 2, 3));
182 vec_st(blockv
, 0, block
);
189 /* next one assumes that ((line_size % 8) == 0) */
190 static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block
, const uint8_t *pixels
, ptrdiff_t line_size
, int h
)
193 register vector
unsigned char pixelsv1
, pixelsv2
, pixelsavg
;
194 register vector
unsigned char blockv
, temp1
, temp2
;
195 register vector
unsigned short pixelssum1
, pixelssum2
, temp3
;
196 register const vector
unsigned char vczero
= (const vector
unsigned char)vec_splat_u8(0);
197 register const vector
unsigned short vcone
= (const vector
unsigned short)vec_splat_u16(1);
198 register const vector
unsigned short vctwo
= (const vector
unsigned short)vec_splat_u16(2);
200 temp1
= vec_ld(0, pixels
);
201 temp2
= vec_ld(16, pixels
);
202 pixelsv1
= vec_perm(temp1
, temp2
, vec_lvsl(0, pixels
));
203 if ((((unsigned long)pixels
) & 0x0000000F) == 0x0000000F) {
206 pixelsv2
= vec_perm(temp1
, temp2
, vec_lvsl(1, pixels
));
208 pixelsv1
= vec_mergeh(vczero
, pixelsv1
);
209 pixelsv2
= vec_mergeh(vczero
, pixelsv2
);
210 pixelssum1
= vec_add((vector
unsigned short)pixelsv1
,
211 (vector
unsigned short)pixelsv2
);
212 pixelssum1
= vec_add(pixelssum1
, vcone
);
214 for (i
= 0; i
< h
; i
++) {
215 int rightside
= ((unsigned long)block
& 0x0000000F);
216 blockv
= vec_ld(0, block
);
218 temp1
= vec_ld(line_size
, pixels
);
219 temp2
= vec_ld(line_size
+ 16, pixels
);
220 pixelsv1
= vec_perm(temp1
, temp2
, vec_lvsl(line_size
, pixels
));
221 if (((((unsigned long)pixels
) + line_size
) & 0x0000000F) == 0x0000000F) {
224 pixelsv2
= vec_perm(temp1
, temp2
, vec_lvsl(line_size
+ 1, pixels
));
227 pixelsv1
= vec_mergeh(vczero
, pixelsv1
);
228 pixelsv2
= vec_mergeh(vczero
, pixelsv2
);
229 pixelssum2
= vec_add((vector
unsigned short)pixelsv1
,
230 (vector
unsigned short)pixelsv2
);
231 temp3
= vec_add(pixelssum1
, pixelssum2
);
232 temp3
= vec_sra(temp3
, vctwo
);
233 pixelssum1
= vec_add(pixelssum2
, vcone
);
234 pixelsavg
= vec_packsu(temp3
, (vector
unsigned short) vczero
);
237 blockv
= vec_perm(blockv
, pixelsavg
, vcprm(0, 1, s0
, s1
));
239 blockv
= vec_perm(blockv
, pixelsavg
, vcprm(s0
, s1
, 2, 3));
242 vec_st(blockv
, 0, block
);
249 /* next one assumes that ((line_size % 16) == 0) */
250 static void put_pixels16_xy2_altivec(uint8_t * block
, const uint8_t * pixels
, ptrdiff_t line_size
, int h
)
253 register vector
unsigned char pixelsv1
, pixelsv2
, pixelsv3
, pixelsv4
;
254 register vector
unsigned char blockv
, temp1
, temp2
;
255 register vector
unsigned short temp3
, temp4
,
256 pixelssum1
, pixelssum2
, pixelssum3
, pixelssum4
;
257 register const vector
unsigned char vczero
= (const vector
unsigned char)vec_splat_u8(0);
258 register const vector
unsigned short vctwo
= (const vector
unsigned short)vec_splat_u16(2);
260 temp1
= vec_ld(0, pixels
);
261 temp2
= vec_ld(16, pixels
);
262 pixelsv1
= vec_perm(temp1
, temp2
, vec_lvsl(0, pixels
));
263 if ((((unsigned long)pixels
) & 0x0000000F) == 0x0000000F) {
266 pixelsv2
= vec_perm(temp1
, temp2
, vec_lvsl(1, pixels
));
268 pixelsv3
= vec_mergel(vczero
, pixelsv1
);
269 pixelsv4
= vec_mergel(vczero
, pixelsv2
);
270 pixelsv1
= vec_mergeh(vczero
, pixelsv1
);
271 pixelsv2
= vec_mergeh(vczero
, pixelsv2
);
272 pixelssum3
= vec_add((vector
unsigned short)pixelsv3
,
273 (vector
unsigned short)pixelsv4
);
274 pixelssum3
= vec_add(pixelssum3
, vctwo
);
275 pixelssum1
= vec_add((vector
unsigned short)pixelsv1
,
276 (vector
unsigned short)pixelsv2
);
277 pixelssum1
= vec_add(pixelssum1
, vctwo
);
279 for (i
= 0; i
< h
; i
++) {
280 blockv
= vec_ld(0, block
);
282 temp1
= vec_ld(line_size
, pixels
);
283 temp2
= vec_ld(line_size
+ 16, pixels
);
284 pixelsv1
= vec_perm(temp1
, temp2
, vec_lvsl(line_size
, pixels
));
285 if (((((unsigned long)pixels
) + line_size
) & 0x0000000F) == 0x0000000F) {
288 pixelsv2
= vec_perm(temp1
, temp2
, vec_lvsl(line_size
+ 1, pixels
));
291 pixelsv3
= vec_mergel(vczero
, pixelsv1
);
292 pixelsv4
= vec_mergel(vczero
, pixelsv2
);
293 pixelsv1
= vec_mergeh(vczero
, pixelsv1
);
294 pixelsv2
= vec_mergeh(vczero
, pixelsv2
);
296 pixelssum4
= vec_add((vector
unsigned short)pixelsv3
,
297 (vector
unsigned short)pixelsv4
);
298 pixelssum2
= vec_add((vector
unsigned short)pixelsv1
,
299 (vector
unsigned short)pixelsv2
);
300 temp4
= vec_add(pixelssum3
, pixelssum4
);
301 temp4
= vec_sra(temp4
, vctwo
);
302 temp3
= vec_add(pixelssum1
, pixelssum2
);
303 temp3
= vec_sra(temp3
, vctwo
);
305 pixelssum3
= vec_add(pixelssum4
, vctwo
);
306 pixelssum1
= vec_add(pixelssum2
, vctwo
);
308 blockv
= vec_packsu(temp3
, temp4
);
310 vec_st(blockv
, 0, block
);
317 /* next one assumes that ((line_size % 16) == 0) */
318 static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block
, const uint8_t * pixels
, ptrdiff_t line_size
, int h
)
321 register vector
unsigned char pixelsv1
, pixelsv2
, pixelsv3
, pixelsv4
;
322 register vector
unsigned char blockv
, temp1
, temp2
;
323 register vector
unsigned short temp3
, temp4
,
324 pixelssum1
, pixelssum2
, pixelssum3
, pixelssum4
;
325 register const vector
unsigned char vczero
= (const vector
unsigned char)vec_splat_u8(0);
326 register const vector
unsigned short vcone
= (const vector
unsigned short)vec_splat_u16(1);
327 register const vector
unsigned short vctwo
= (const vector
unsigned short)vec_splat_u16(2);
329 temp1
= vec_ld(0, pixels
);
330 temp2
= vec_ld(16, pixels
);
331 pixelsv1
= vec_perm(temp1
, temp2
, vec_lvsl(0, pixels
));
332 if ((((unsigned long)pixels
) & 0x0000000F) == 0x0000000F) {
335 pixelsv2
= vec_perm(temp1
, temp2
, vec_lvsl(1, pixels
));
337 pixelsv3
= vec_mergel(vczero
, pixelsv1
);
338 pixelsv4
= vec_mergel(vczero
, pixelsv2
);
339 pixelsv1
= vec_mergeh(vczero
, pixelsv1
);
340 pixelsv2
= vec_mergeh(vczero
, pixelsv2
);
341 pixelssum3
= vec_add((vector
unsigned short)pixelsv3
,
342 (vector
unsigned short)pixelsv4
);
343 pixelssum3
= vec_add(pixelssum3
, vcone
);
344 pixelssum1
= vec_add((vector
unsigned short)pixelsv1
,
345 (vector
unsigned short)pixelsv2
);
346 pixelssum1
= vec_add(pixelssum1
, vcone
);
348 for (i
= 0; i
< h
; i
++) {
349 blockv
= vec_ld(0, block
);
351 temp1
= vec_ld(line_size
, pixels
);
352 temp2
= vec_ld(line_size
+ 16, pixels
);
353 pixelsv1
= vec_perm(temp1
, temp2
, vec_lvsl(line_size
, pixels
));
354 if (((((unsigned long)pixels
) + line_size
) & 0x0000000F) == 0x0000000F) {
357 pixelsv2
= vec_perm(temp1
, temp2
, vec_lvsl(line_size
+ 1, pixels
));
360 pixelsv3
= vec_mergel(vczero
, pixelsv1
);
361 pixelsv4
= vec_mergel(vczero
, pixelsv2
);
362 pixelsv1
= vec_mergeh(vczero
, pixelsv1
);
363 pixelsv2
= vec_mergeh(vczero
, pixelsv2
);
365 pixelssum4
= vec_add((vector
unsigned short)pixelsv3
,
366 (vector
unsigned short)pixelsv4
);
367 pixelssum2
= vec_add((vector
unsigned short)pixelsv1
,
368 (vector
unsigned short)pixelsv2
);
369 temp4
= vec_add(pixelssum3
, pixelssum4
);
370 temp4
= vec_sra(temp4
, vctwo
);
371 temp3
= vec_add(pixelssum1
, pixelssum2
);
372 temp3
= vec_sra(temp3
, vctwo
);
374 pixelssum3
= vec_add(pixelssum4
, vcone
);
375 pixelssum1
= vec_add(pixelssum2
, vcone
);
377 blockv
= vec_packsu(temp3
, temp4
);
379 vec_st(blockv
, 0, block
);
386 /* next one assumes that ((line_size % 8) == 0) */
387 static void avg_pixels8_xy2_altivec(uint8_t *block
, const uint8_t *pixels
, ptrdiff_t line_size
, int h
)
390 register vector
unsigned char pixelsv1
, pixelsv2
, pixelsavg
;
391 register vector
unsigned char blockv
, temp1
, temp2
, blocktemp
;
392 register vector
unsigned short pixelssum1
, pixelssum2
, temp3
;
394 register const vector
unsigned char vczero
= (const vector
unsigned char)
396 register const vector
unsigned short vctwo
= (const vector
unsigned short)
399 temp1
= vec_ld(0, pixels
);
400 temp2
= vec_ld(16, pixels
);
401 pixelsv1
= vec_perm(temp1
, temp2
, vec_lvsl(0, pixels
));
402 if ((((unsigned long)pixels
) & 0x0000000F) == 0x0000000F) {
405 pixelsv2
= vec_perm(temp1
, temp2
, vec_lvsl(1, pixels
));
407 pixelsv1
= vec_mergeh(vczero
, pixelsv1
);
408 pixelsv2
= vec_mergeh(vczero
, pixelsv2
);
409 pixelssum1
= vec_add((vector
unsigned short)pixelsv1
,
410 (vector
unsigned short)pixelsv2
);
411 pixelssum1
= vec_add(pixelssum1
, vctwo
);
413 for (i
= 0; i
< h
; i
++) {
414 int rightside
= ((unsigned long)block
& 0x0000000F);
415 blockv
= vec_ld(0, block
);
417 temp1
= vec_ld(line_size
, pixels
);
418 temp2
= vec_ld(line_size
+ 16, pixels
);
419 pixelsv1
= vec_perm(temp1
, temp2
, vec_lvsl(line_size
, pixels
));
420 if (((((unsigned long)pixels
) + line_size
) & 0x0000000F) == 0x0000000F) {
423 pixelsv2
= vec_perm(temp1
, temp2
, vec_lvsl(line_size
+ 1, pixels
));
426 pixelsv1
= vec_mergeh(vczero
, pixelsv1
);
427 pixelsv2
= vec_mergeh(vczero
, pixelsv2
);
428 pixelssum2
= vec_add((vector
unsigned short)pixelsv1
,
429 (vector
unsigned short)pixelsv2
);
430 temp3
= vec_add(pixelssum1
, pixelssum2
);
431 temp3
= vec_sra(temp3
, vctwo
);
432 pixelssum1
= vec_add(pixelssum2
, vctwo
);
433 pixelsavg
= vec_packsu(temp3
, (vector
unsigned short) vczero
);
436 blocktemp
= vec_perm(blockv
, pixelsavg
, vcprm(0, 1, s0
, s1
));
438 blocktemp
= vec_perm(blockv
, pixelsavg
, vcprm(s0
, s1
, 2, 3));
441 blockv
= vec_avg(blocktemp
, blockv
);
442 vec_st(blockv
, 0, block
);
448 #endif /* HAVE_ALTIVEC */
450 av_cold
void ff_hpeldsp_init_ppc(HpelDSPContext
*c
, int flags
)
453 if (!PPC_ALTIVEC(av_get_cpu_flags()))
456 c
->avg_pixels_tab
[0][0] = ff_avg_pixels16_altivec
;
457 c
->avg_pixels_tab
[1][0] = avg_pixels8_altivec
;
458 c
->avg_pixels_tab
[1][3] = avg_pixels8_xy2_altivec
;
460 c
->put_pixels_tab
[0][0] = ff_put_pixels16_altivec
;
461 c
->put_pixels_tab
[1][3] = put_pixels8_xy2_altivec
;
462 c
->put_pixels_tab
[0][3] = put_pixels16_xy2_altivec
;
464 c
->put_no_rnd_pixels_tab
[0][0] = ff_put_pixels16_altivec
;
465 c
->put_no_rnd_pixels_tab
[1][3] = put_no_rnd_pixels8_xy2_altivec
;
466 c
->put_no_rnd_pixels_tab
[0][3] = put_no_rnd_pixels16_xy2_altivec
;
467 #endif /* HAVE_ALTIVEC */