/*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "config.h"
#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/me_cmp.h"

#if HAVE_ALTIVEC

#if HAVE_BIGENDIAN
#define GET_PERM(per1, per2, pix) {\
    per1 = vec_lvsl(0, pix);\
    per2 = vec_add(per1, vec_splat_u8(1));\
}
#define LOAD_PIX(v, iv, pix, per1, per2) {\
    vector unsigned char pix2l = vec_ld(0, pix);\
    vector unsigned char pix2r = vec_ld(16, pix);\
    v  = vec_perm(pix2l, pix2r, per1);\
    iv = vec_perm(pix2l, pix2r, per2);\
}
#else
#define GET_PERM(per1, per2, pix) {}
#define LOAD_PIX(v, iv, pix, per1, per2) {\
    v  = vec_vsx_ld(0, pix);\
    iv = vec_vsx_ld(1, pix);\
}
#endif /* HAVE_BIGENDIAN */
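
/* On big-endian AltiVec, a misaligned 16-byte load is emulated with two
 * aligned vec_ld loads plus a vec_perm: vec_lvsl(0, pix) returns the shuffle
 * that selects bytes pix[0]..pix[15] out of the 32-byte window, and adding 1
 * to every lane of that permute vector shifts the selection to
 * pix[1]..pix[16]. On little-endian POWER, vec_vsx_ld handles misaligned
 * addresses directly, so GET_PERM degenerates to nothing. */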
static int sad16_x2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                            ptrdiff_t stride, int h)
{
    int i;
    int __attribute__((aligned(16))) s = 0;
    const vector unsigned char zero =
        (const vector unsigned char) vec_splat_u8(0);
    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
    vector signed int sumdiffs;
    vector unsigned char perm1, perm2, pix2v, pix2iv;

    GET_PERM(perm1, perm2, pix2);
    for (i = 0; i < h; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
         * pix1v: pix1[0] - pix1[15]
         * pix2v: pix2[0] - pix2[15]      pix2iv: pix2[1] - pix2[16] */
        vector unsigned char pix1v = vec_ld(0, pix1);
        LOAD_PIX(pix2v, pix2iv, pix2, perm1, perm2);

        /* Calculate the average vector. */
        vector unsigned char avgv = vec_avg(pix2v, pix2iv);
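
        /* AltiVec at this ISA level has no absolute-difference instruction,
         * so |a - b| is built as max(a, b) - min(a, b) below; subtracting
         * the operands directly would wrap around on unsigned bytes. */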
        /* Calculate a sum of abs differences vector. */
        vector unsigned char t5 = vec_sub(vec_max(pix1v, avgv),
                                          vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad. */
        sad = vec_sum4s(t5, sad);

        pix1 += stride;
        pix2 += stride;
    }
    /* Sum up the four partial sums, and put the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
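
/* The three-step reduction above recurs throughout this file: vec_sums folds
 * the four 32-bit partial sums into element 3 of the vector, vec_splat
 * broadcasts that element to every lane, and vec_ste stores a single element
 * back to the scalar s, hence the aligned(16) attribute on s. */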

static int sad16_y2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                            ptrdiff_t stride, int h)
{
    int i;
    int __attribute__((aligned(16))) s = 0;
    const vector unsigned char zero =
        (const vector unsigned char) vec_splat_u8(0);
    vector unsigned char pix1v, pix3v, avgv, t5;
    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
    vector signed int sumdiffs;
    uint8_t *pix3 = pix2 + stride;

    /* Due to the fact that pix3 = pix2 + stride, the pix3 of one
     * iteration becomes pix2 in the next iteration. We can use this
     * fact to avoid a potentially expensive unaligned read, each
     * time around the loop.
     * Read unaligned pixels into our vectors. The vectors are as follows:
     * pix2v: pix2[0] - pix2[15] */
    vector unsigned char pix2v = VEC_LD(0, pix2);

    for (i = 0; i < h; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
         * pix1v: pix1[0] - pix1[15]
         * pix3v: pix3[0] - pix3[15] */
        pix1v = vec_ld(0, pix1);
        pix3v = VEC_LD(0, pix3);

        /* Calculate the average vector. */
        avgv = vec_avg(pix2v, pix3v);

        /* Calculate a sum of abs differences vector. */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad. */
        sad = vec_sum4s(t5, sad);

        pix1 += stride;
        pix2v = pix3v;
        pix3 += stride;
    }

    /* Sum up the four partial sums, and put the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

static int sad16_xy2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                             ptrdiff_t stride, int h)
{
    int i;
    int __attribute__((aligned(16))) s = 0;
    uint8_t *pix3 = pix2 + stride;
    const vector unsigned char zero =
        (const vector unsigned char) vec_splat_u8(0);
    const vector unsigned short two =
        (const vector unsigned short) vec_splat_u16(2);
    vector unsigned char avgv, t5;
    vector unsigned char pix1v, pix3v, pix3iv;
    vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
    vector unsigned short avghv, avglv;
    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
    vector signed int sumdiffs;
    vector unsigned char perm1, perm2, pix2v, pix2iv;

    GET_PERM(perm1, perm2, pix2);

    /* Due to the fact that pix3 = pix2 + stride, the pix3 of one
     * iteration becomes pix2 in the next iteration. We can use this
     * fact to avoid a potentially expensive unaligned read, as well
     * as some splitting, and vector addition each time around the loop.
     * Read unaligned pixels into our vectors. The vectors are as follows:
     * pix2v: pix2[0] - pix2[15]      pix2iv: pix2[1] - pix2[16]
     * Split the pixel vectors into shorts. */
    LOAD_PIX(pix2v, pix2iv, pix2, perm1, perm2);
    vector unsigned short pix2hv =
        (vector unsigned short) VEC_MERGEH(zero, pix2v);
    vector unsigned short pix2lv =
        (vector unsigned short) VEC_MERGEL(zero, pix2v);
    vector unsigned short pix2ihv =
        (vector unsigned short) VEC_MERGEH(zero, pix2iv);
    vector unsigned short pix2ilv =
        (vector unsigned short) VEC_MERGEL(zero, pix2iv);
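
    /* Merging a vector of zero bytes with the pixel bytes interleaves a zero
     * in front of each pixel, i.e. VEC_MERGEH/VEC_MERGEL zero-extend the
     * unsigned chars to unsigned shorts (high and low halves respectively). */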
    vector unsigned short t1 = vec_add(pix2hv, pix2ihv);
    vector unsigned short t2 = vec_add(pix2lv, pix2ilv);
    vector unsigned short t3, t4;

    for (i = 0; i < h; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
         * pix1v: pix1[0] - pix1[15]
         * pix3v: pix3[0] - pix3[15]      pix3iv: pix3[1] - pix3[16] */
        pix1v = vec_ld(0, pix1);
        LOAD_PIX(pix3v, pix3iv, pix3, perm1, perm2);

        /* Note that AltiVec does have vec_avg, but this works on vector pairs
         * and rounds up. We could do avg(avg(a, b), avg(c, d)), but the
         * rounding would mean that, for example, avg(3, 0, 0, 1) = 2, when
         * it should be 1. Instead, we have to split the pixel vectors into
         * vectors of shorts and do the averaging by hand. */
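        /* Concretely: the correctly rounded mean of 3, 0, 0, 1 is
         * (3 + 0 + 0 + 1 + 2) >> 2 = 1, but cascading the round-up averages
         * gives avg(avg(3, 0), avg(0, 1)) = avg(2, 1) = 2: each vec_avg adds
         * its own +1 bias, so the rounding error accumulates. */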

        /* Split the pixel vectors into shorts. */
        pix3hv  = (vector unsigned short) VEC_MERGEH(zero, pix3v);
        pix3lv  = (vector unsigned short) VEC_MERGEL(zero, pix3v);
        pix3ihv = (vector unsigned short) VEC_MERGEH(zero, pix3iv);
        pix3ilv = (vector unsigned short) VEC_MERGEL(zero, pix3iv);

        /* Do the averaging on them. */
        t3 = vec_add(pix3hv, pix3ihv);
        t4 = vec_add(pix3lv, pix3ilv);

        avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
        avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);
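        /* t1 + t3 (and t2 + t4) now hold a + b + c + d per 16-bit lane;
         * adding 2 and shifting right by 2 yields the correctly rounded
         * four-sample average (a + b + c + d + 2) >> 2. */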

        /* Pack the shorts back into a result. */
        avgv = vec_pack(avghv, avglv);

        /* Calculate a sum of abs differences vector. */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad. */
        sad = vec_sum4s(t5, sad);

        pix1 += stride;
        pix3 += stride;
        /* Transfer the calculated values for pix3 into pix2. */
        t1 = t3;
        t2 = t4;
    }
    /* Sum up the four partial sums, and put the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

static int sad16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         ptrdiff_t stride, int h)
{
    int i;
    int __attribute__((aligned(16))) s;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
    vector signed int sumdiffs;

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2. */
        vector unsigned char t1 = vec_ld(0, pix1);
        vector unsigned char t2 = VEC_LD(0, pix2);
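
        /* The plain vec_ld on pix1 silently truncates the address to a
         * 16-byte boundary, so this code relies on the caller passing an
         * aligned pix1; pix2 is the reference block and may sit at any
         * offset, hence the misaligned-capable VEC_LD. */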

        /* Calculate a sum of abs differences vector. */
        vector unsigned char t3 = vec_max(t1, t2);
        vector unsigned char t4 = vec_min(t1, t2);
        vector unsigned char t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad. */
        sad = vec_sum4s(t5, sad);

        pix1 += stride;
        pix2 += stride;
    }
    /* Sum up the four partial sums, and put the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

static int sad8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                        ptrdiff_t stride, int h)
{
    int i;
    int __attribute__((aligned(16))) s;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    const vector unsigned char permclear =
        (vector unsigned char)
        { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 };
    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
    vector signed int sumdiffs;

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2.
         * Since we're reading 16 pixels, and actually only want 8,
         * mask out the last 8 pixels. The 0s don't change the sum. */
        vector unsigned char pix1l = VEC_LD(0, pix1);
        vector unsigned char pix2l = VEC_LD(0, pix2);
        vector unsigned char t1 = vec_and(pix1l, permclear);
        vector unsigned char t2 = vec_and(pix2l, permclear);

        /* Calculate a sum of abs differences vector. */
        vector unsigned char t3 = vec_max(t1, t2);
        vector unsigned char t4 = vec_min(t1, t2);
        vector unsigned char t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad. */
        sad = vec_sum4s(t5, sad);

        pix1 += stride;
        pix2 += stride;
    }
    /* Sum up the four partial sums, and put the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

/* Sum of Squared Errors for an 8x8 block, AltiVec-enhanced.
 * It's the sad8_altivec code above w/ squaring added. */
static int sse8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                        ptrdiff_t stride, int h)
{
    int i;
    int __attribute__((aligned(16))) s;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    const vector unsigned char permclear =
        (vector unsigned char)
        { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 };
    vector unsigned int sum = (vector unsigned int) vec_splat_u32(0);
    vector signed int sumsqr;

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2.
         * Since we're reading 16 pixels, and actually only want 8,
         * mask out the last 8 pixels. The 0s don't change the sum. */
        vector unsigned char t1 = vec_and(VEC_LD(0, pix1), permclear);
        vector unsigned char t2 = vec_and(VEC_LD(0, pix2), permclear);

        /* Since we want to use unsigned chars, we can take advantage
         * of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */

        /* Calculate abs differences vector. */
        vector unsigned char t3 = vec_max(t1, t2);
        vector unsigned char t4 = vec_min(t1, t2);
        vector unsigned char t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum. */
        sum = vec_msum(t5, t5, sum);
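
        /* vec_msum squares the 16 byte lanes (t5 * t5) and adds each group
         * of four adjacent products into the matching 32-bit lane of sum in
         * a single instruction; one product is at most 255 * 255 = 65025,
         * so the 32-bit accumulators cannot overflow at these block heights. */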

        pix1 += stride;
        pix2 += stride;
    }

    /* Sum up the four partial sums, and put the result into s. */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}

/* Sum of Squared Errors for a 16x16 block, AltiVec-enhanced.
 * It's the sad16_altivec code above w/ squaring added. */
static int sse16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         ptrdiff_t stride, int h)
{
    int i;
    int __attribute__((aligned(16))) s;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    vector unsigned int sum = (vector unsigned int) vec_splat_u32(0);
    vector signed int sumsqr;

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2. */
        vector unsigned char t1 = vec_ld(0, pix1);
        vector unsigned char t2 = VEC_LD(0, pix2);

        /* Since we want to use unsigned chars, we can take advantage
         * of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */

        /* Calculate abs differences vector. */
        vector unsigned char t3 = vec_max(t1, t2);
        vector unsigned char t4 = vec_min(t1, t2);
        vector unsigned char t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum. */
        sum = vec_msum(t5, t5, sum);

        pix1 += stride;
        pix2 += stride;
    }

    /* Sum up the four partial sums, and put the result into s. */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}

static int hadamard8_diff8x8_altivec(MpegEncContext *s, uint8_t *dst,
                                     uint8_t *src, ptrdiff_t stride, int h)
{
    int __attribute__((aligned(16))) sum;
    register const vector unsigned char vzero =
        (const vector unsigned char) vec_splat_u8(0);
    register vector signed short temp0, temp1, temp2, temp3, temp4,
                                 temp5, temp6, temp7;
    register const vector signed short vprod1 =
        (const vector signed short) { 1, -1, 1, -1, 1, -1, 1, -1 };
    register const vector signed short vprod2 =
        (const vector signed short) { 1, 1, -1, -1, 1, 1, -1, -1 };
    register const vector signed short vprod3 =
        (const vector signed short) { 1, 1, 1, 1, -1, -1, -1, -1 };
    register const vector unsigned char perm1 =
        (const vector unsigned char)
        { 0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
          0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D };
    register const vector unsigned char perm2 =
        (const vector unsigned char)
        { 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
          0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B };
    register const vector unsigned char perm3 =
        (const vector unsigned char)
        { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
          0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };

#define ONEITERBUTTERFLY(i, res) \
    { \
        register vector unsigned char srcO = unaligned_load(stride * i, src); \
        register vector unsigned char dstO = unaligned_load(stride * i, dst); \
        \
        /* Promote the unsigned chars to signed shorts. */ \
        /* We're in the 8x8 function, we only care for the first 8. */ \
        register vector signed short srcV = \
            (vector signed short) VEC_MERGEH((vector signed char) vzero, \
                                             (vector signed char) srcO); \
        register vector signed short dstV = \
            (vector signed short) VEC_MERGEH((vector signed char) vzero, \
                                             (vector signed char) dstO); \
        \
        /* subtractions inside the first butterfly */ \
        register vector signed short but0 = vec_sub(srcV, dstV); \
        register vector signed short op1  = vec_perm(but0, but0, perm1); \
        register vector signed short but1 = vec_mladd(but0, vprod1, op1); \
        register vector signed short op2  = vec_perm(but1, but1, perm2); \
        register vector signed short but2 = vec_mladd(but1, vprod2, op2); \
        register vector signed short op3  = vec_perm(but2, but2, perm3); \
        res = vec_mladd(but2, vprod3, op3); \
    }
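
/* Each of the three stages above is a horizontal butterfly: vec_perm swaps
 * every lane with its butterfly partner, and vec_mladd(but, vprod, op)
 * computes but * (+-1) + op, so lanes with a +1 in vprod get partner + self
 * (the sum) and lanes with a -1 get partner - self (the difference). Three
 * stages over partner strides 1, 2 and 4 give the 8-point Hadamard transform
 * of the row of differences. */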

    ONEITERBUTTERFLY(0, temp0);
    ONEITERBUTTERFLY(1, temp1);
    ONEITERBUTTERFLY(2, temp2);
    ONEITERBUTTERFLY(3, temp3);
    ONEITERBUTTERFLY(4, temp4);
    ONEITERBUTTERFLY(5, temp5);
    ONEITERBUTTERFLY(6, temp6);
    ONEITERBUTTERFLY(7, temp7);

#undef ONEITERBUTTERFLY

    {
        register vector signed int vsum;
        register vector signed short line0 = vec_add(temp0, temp1);
        register vector signed short line1 = vec_sub(temp0, temp1);
        register vector signed short line2 = vec_add(temp2, temp3);
        register vector signed short line3 = vec_sub(temp2, temp3);
        register vector signed short line4 = vec_add(temp4, temp5);
        register vector signed short line5 = vec_sub(temp4, temp5);
        register vector signed short line6 = vec_add(temp6, temp7);
        register vector signed short line7 = vec_sub(temp6, temp7);

        register vector signed short line0B = vec_add(line0, line2);
        register vector signed short line2B = vec_sub(line0, line2);
        register vector signed short line1B = vec_add(line1, line3);
        register vector signed short line3B = vec_sub(line1, line3);
        register vector signed short line4B = vec_add(line4, line6);
        register vector signed short line6B = vec_sub(line4, line6);
        register vector signed short line5B = vec_add(line5, line7);
        register vector signed short line7B = vec_sub(line5, line7);

        register vector signed short line0C = vec_add(line0B, line4B);
        register vector signed short line4C = vec_sub(line0B, line4B);
        register vector signed short line1C = vec_add(line1B, line5B);
        register vector signed short line5C = vec_sub(line1B, line5B);
        register vector signed short line2C = vec_add(line2B, line6B);
        register vector signed short line6C = vec_sub(line2B, line6B);
        register vector signed short line3C = vec_add(line3B, line7B);
        register vector signed short line7C = vec_sub(line3B, line7B);
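
        /* These three add/sub stages operate across rows, completing the
         * vertical half of the 8x8 Hadamard transform; summing the absolute
         * values of the fully transformed line*C rows below yields the SATD
         * of the difference block. */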
        vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
        vsum = vec_sum4s(vec_abs(line1C), vsum);
        vsum = vec_sum4s(vec_abs(line2C), vsum);
        vsum = vec_sum4s(vec_abs(line3C), vsum);
        vsum = vec_sum4s(vec_abs(line4C), vsum);
        vsum = vec_sum4s(vec_abs(line5C), vsum);
        vsum = vec_sum4s(vec_abs(line6C), vsum);
        vsum = vec_sum4s(vec_abs(line7C), vsum);
        vsum = vec_sums(vsum, (vector signed int) vzero);
        vsum = vec_splat(vsum, 3);

        vec_ste(vsum, 0, &sum);
    }
    return sum;
}

/*
 * 16x8 works with 16 elements; it allows to avoid replicating loads, and
 * gives the compiler more room for scheduling. It's only used from
 * inside hadamard8_diff16_altivec.
 *
 * Unfortunately, it seems gcc-3.3 is a bit dumb, and the compiled code has
 * a LOT of spill code, it seems gcc (unlike xlc) cannot keep everything in
 * registers by itself. The following code includes hand-made register
 * allocation. It's not clean, but on a 7450 the resulting code is much faster
 * (best case falls from 700+ cycles to 550).
 *
 * xlc doesn't add spill code, but it doesn't know how to schedule for the
 * 7450, and its code isn't much faster than gcc-3.3 on the 7450 (but uses
 * 25% fewer instructions...)
 *
 * On the 970, the hand-made RA is still a win (around 690 vs. around 780),
 * but xlc goes to around 660 on the regular C code...
 */
static int hadamard8_diff16x8_altivec(MpegEncContext *s, uint8_t *dst,
                                      uint8_t *src, ptrdiff_t stride, int h)
{
    int __attribute__((aligned(16))) sum;
    register vector signed short
        temp0 __asm__ ("v0"),
        temp1 __asm__ ("v1"),
        temp2 __asm__ ("v2"),
        temp3 __asm__ ("v3"),
        temp4 __asm__ ("v4"),
        temp5 __asm__ ("v5"),
        temp6 __asm__ ("v6"),
        temp7 __asm__ ("v7");
    register vector signed short
        temp0S __asm__ ("v8"),
        temp1S __asm__ ("v9"),
        temp2S __asm__ ("v10"),
        temp3S __asm__ ("v11"),
        temp4S __asm__ ("v12"),
        temp5S __asm__ ("v13"),
        temp6S __asm__ ("v14"),
        temp7S __asm__ ("v15");
    register const vector unsigned char vzero __asm__ ("v31") =
        (const vector unsigned char) vec_splat_u8(0);
    register const vector signed short vprod1 __asm__ ("v16") =
        (const vector signed short) { 1, -1, 1, -1, 1, -1, 1, -1 };
    register const vector signed short vprod2 __asm__ ("v17") =
        (const vector signed short) { 1, 1, -1, -1, 1, 1, -1, -1 };
    register const vector signed short vprod3 __asm__ ("v18") =
        (const vector signed short) { 1, 1, 1, 1, -1, -1, -1, -1 };
    register const vector unsigned char perm1 __asm__ ("v19") =
        (const vector unsigned char)
        { 0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
          0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D };
    register const vector unsigned char perm2 __asm__ ("v20") =
        (const vector unsigned char)
        { 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
          0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B };
    register const vector unsigned char perm3 __asm__ ("v21") =
        (const vector unsigned char)
        { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
          0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };

#define ONEITERBUTTERFLY(i, res1, res2) \
    { \
        register vector unsigned char srcO __asm__ ("v22") = \
            unaligned_load(stride * i, src); \
        register vector unsigned char dstO __asm__ ("v23") = \
            unaligned_load(stride * i, dst); \
        \
        /* Promote the unsigned chars to signed shorts. */ \
        register vector signed short srcV __asm__ ("v24") = \
            (vector signed short) VEC_MERGEH((vector signed char) vzero, \
                                             (vector signed char) srcO); \
        register vector signed short dstV __asm__ ("v25") = \
            (vector signed short) VEC_MERGEH((vector signed char) vzero, \
                                             (vector signed char) dstO); \
        register vector signed short srcW __asm__ ("v26") = \
            (vector signed short) VEC_MERGEL((vector signed char) vzero, \
                                             (vector signed char) srcO); \
        register vector signed short dstW __asm__ ("v27") = \
            (vector signed short) VEC_MERGEL((vector signed char) vzero, \
                                             (vector signed char) dstO); \
        \
        /* subtractions inside the first butterfly */ \
        register vector signed short but0  __asm__ ("v28") = \
            vec_sub(srcV, dstV); \
        register vector signed short but0S __asm__ ("v29") = \
            vec_sub(srcW, dstW); \
        register vector signed short op1   __asm__ ("v30") = \
            vec_perm(but0, but0, perm1); \
        register vector signed short but1  __asm__ ("v22") = \
            vec_mladd(but0, vprod1, op1); \
        register vector signed short op1S  __asm__ ("v23") = \
            vec_perm(but0S, but0S, perm1); \
        register vector signed short but1S __asm__ ("v24") = \
            vec_mladd(but0S, vprod1, op1S); \
        register vector signed short op2   __asm__ ("v25") = \
            vec_perm(but1, but1, perm2); \
        register vector signed short but2  __asm__ ("v26") = \
            vec_mladd(but1, vprod2, op2); \
        register vector signed short op2S  __asm__ ("v27") = \
            vec_perm(but1S, but1S, perm2); \
        register vector signed short but2S __asm__ ("v28") = \
            vec_mladd(but1S, vprod2, op2S); \
        register vector signed short op3   __asm__ ("v29") = \
            vec_perm(but2, but2, perm3); \
        register vector signed short op3S  __asm__ ("v30") = \
            vec_perm(but2S, but2S, perm3); \
        res1 = vec_mladd(but2, vprod3, op3); \
        res2 = vec_mladd(but2S, vprod3, op3S); \
    }
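
/* The hand-made register allocation reuses v22-v30 aggressively: each value
 * is pinned to a register only until its last use (srcO in v22 is dead by
 * the time but1 is assigned to v22, and so on), which is what keeps the
 * whole 16-element butterfly in the vector register file without spills. */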

    ONEITERBUTTERFLY(0, temp0, temp0S);
    ONEITERBUTTERFLY(1, temp1, temp1S);
    ONEITERBUTTERFLY(2, temp2, temp2S);
    ONEITERBUTTERFLY(3, temp3, temp3S);
    ONEITERBUTTERFLY(4, temp4, temp4S);
    ONEITERBUTTERFLY(5, temp5, temp5S);
    ONEITERBUTTERFLY(6, temp6, temp6S);
    ONEITERBUTTERFLY(7, temp7, temp7S);

#undef ONEITERBUTTERFLY

    {
        register vector signed int vsum;

        register vector signed short line0 = vec_add(temp0, temp1);
        register vector signed short line1 = vec_sub(temp0, temp1);
        register vector signed short line2 = vec_add(temp2, temp3);
        register vector signed short line3 = vec_sub(temp2, temp3);
        register vector signed short line4 = vec_add(temp4, temp5);
        register vector signed short line5 = vec_sub(temp4, temp5);
        register vector signed short line6 = vec_add(temp6, temp7);
        register vector signed short line7 = vec_sub(temp6, temp7);

        register vector signed short line0B = vec_add(line0, line2);
        register vector signed short line2B = vec_sub(line0, line2);
        register vector signed short line1B = vec_add(line1, line3);
        register vector signed short line3B = vec_sub(line1, line3);
        register vector signed short line4B = vec_add(line4, line6);
        register vector signed short line6B = vec_sub(line4, line6);
        register vector signed short line5B = vec_add(line5, line7);
        register vector signed short line7B = vec_sub(line5, line7);

        register vector signed short line0C = vec_add(line0B, line4B);
        register vector signed short line4C = vec_sub(line0B, line4B);
        register vector signed short line1C = vec_add(line1B, line5B);
        register vector signed short line5C = vec_sub(line1B, line5B);
        register vector signed short line2C = vec_add(line2B, line6B);
        register vector signed short line6C = vec_sub(line2B, line6B);
        register vector signed short line3C = vec_add(line3B, line7B);
        register vector signed short line7C = vec_sub(line3B, line7B);

        register vector signed short line0S = vec_add(temp0S, temp1S);
        register vector signed short line1S = vec_sub(temp0S, temp1S);
        register vector signed short line2S = vec_add(temp2S, temp3S);
        register vector signed short line3S = vec_sub(temp2S, temp3S);
        register vector signed short line4S = vec_add(temp4S, temp5S);
        register vector signed short line5S = vec_sub(temp4S, temp5S);
        register vector signed short line6S = vec_add(temp6S, temp7S);
        register vector signed short line7S = vec_sub(temp6S, temp7S);

        register vector signed short line0BS = vec_add(line0S, line2S);
        register vector signed short line2BS = vec_sub(line0S, line2S);
        register vector signed short line1BS = vec_add(line1S, line3S);
        register vector signed short line3BS = vec_sub(line1S, line3S);
        register vector signed short line4BS = vec_add(line4S, line6S);
        register vector signed short line6BS = vec_sub(line4S, line6S);
        register vector signed short line5BS = vec_add(line5S, line7S);
        register vector signed short line7BS = vec_sub(line5S, line7S);

        register vector signed short line0CS = vec_add(line0BS, line4BS);
        register vector signed short line4CS = vec_sub(line0BS, line4BS);
        register vector signed short line1CS = vec_add(line1BS, line5BS);
        register vector signed short line5CS = vec_sub(line1BS, line5BS);
        register vector signed short line2CS = vec_add(line2BS, line6BS);
        register vector signed short line6CS = vec_sub(line2BS, line6BS);
        register vector signed short line3CS = vec_add(line3BS, line7BS);
        register vector signed short line7CS = vec_sub(line3BS, line7BS);

        vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
        vsum = vec_sum4s(vec_abs(line1C), vsum);
        vsum = vec_sum4s(vec_abs(line2C), vsum);
        vsum = vec_sum4s(vec_abs(line3C), vsum);
        vsum = vec_sum4s(vec_abs(line4C), vsum);
        vsum = vec_sum4s(vec_abs(line5C), vsum);
        vsum = vec_sum4s(vec_abs(line6C), vsum);
        vsum = vec_sum4s(vec_abs(line7C), vsum);

        vsum = vec_sum4s(vec_abs(line0CS), vsum);
        vsum = vec_sum4s(vec_abs(line1CS), vsum);
        vsum = vec_sum4s(vec_abs(line2CS), vsum);
        vsum = vec_sum4s(vec_abs(line3CS), vsum);
        vsum = vec_sum4s(vec_abs(line4CS), vsum);
        vsum = vec_sum4s(vec_abs(line5CS), vsum);
        vsum = vec_sum4s(vec_abs(line6CS), vsum);
        vsum = vec_sum4s(vec_abs(line7CS), vsum);
        vsum = vec_sums(vsum, (vector signed int) vzero);
        vsum = vec_splat(vsum, 3);

        vec_ste(vsum, 0, &sum);
    }
    return sum;
}

static int hadamard8_diff16_altivec(MpegEncContext *s, uint8_t *dst,
                                    uint8_t *src, ptrdiff_t stride, int h)
{
    int score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8);

    if (h == 16) {
        dst += 8 * stride;
        src += 8 * stride;
        score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
    }
    return score;
}
#endif /* HAVE_ALTIVEC */

av_cold void ff_me_cmp_init_ppc(MECmpContext *c, AVCodecContext *avctx)
{
#if HAVE_ALTIVEC
    if (!PPC_ALTIVEC(av_get_cpu_flags()))
        return;

    c->pix_abs[0][1] = sad16_x2_altivec;
    c->pix_abs[0][2] = sad16_y2_altivec;
    c->pix_abs[0][3] = sad16_xy2_altivec;
    c->pix_abs[0][0] = sad16_altivec;
    c->pix_abs[1][0] = sad8_altivec;
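
    /* The pix_abs table is indexed [size][interpolation]: first index 0 is
     * 16x16 and 1 is 8x8; the second index selects full-pel (0), half-pel x
     * (1), half-pel y (2) and half-pel x+y (3) comparisons, matching the
     * _x2/_y2/_xy2 suffixes above. */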

    c->sad[0] = sad16_altivec;
    c->sad[1] = sad8_altivec;
    c->sse[0] = sse16_altivec;
    c->sse[1] = sse8_altivec;

    c->hadamard8_diff[0] = hadamard8_diff16_altivec;
    c->hadamard8_diff[1] = hadamard8_diff8x8_altivec;
#endif /* HAVE_ALTIVEC */
}