/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavutil/mem.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#ifdef DEBUG
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)ptr&0x0000000F));
#else
#define ASSERT_ALIGNED(ptr) ;
#endif
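/*
 * Debug-only sanity check: vec_ld()/vec_st() silently drop the low four
 * address bits, so the stores below rely on dst (and tmp) being 16-byte
 * aligned.  The check compiles to nothing unless DEBUG is defined.
 */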
#if HAVE_BIGENDIAN
#define load_alignment(s, ali, pm2, pm1, pp0, pp1, pp2, pp3){\
    vec_u8 srcR1 = vec_ld(-2, s);\
    vec_u8 srcR2 = vec_ld(14, s);\
    switch (ali) {\
    default: {\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = vec_perm(srcR1, srcR2, pp1);\
        srcP2 = vec_perm(srcR1, srcR2, pp2);\
        srcP3 = vec_perm(srcR1, srcR2, pp3);\
    } break;\
    case 11: {\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = vec_perm(srcR1, srcR2, pp1);\
        srcP2 = vec_perm(srcR1, srcR2, pp2);\
        srcP3 = srcR2;\
    } break;\
    case 12: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = vec_perm(srcR1, srcR2, pp1);\
        srcP2 = srcR2;\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    case 13: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = srcR2;\
        srcP2 = vec_perm(srcR2, srcR3, pp2);\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    case 14: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = srcR2;\
        srcP1 = vec_perm(srcR2, srcR3, pp1);\
        srcP2 = vec_perm(srcR2, srcR3, pp2);\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    case 15: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = srcR2;\
        srcP0 = vec_perm(srcR2, srcR3, pp0);\
        srcP1 = vec_perm(srcR2, srcR3, pp1);\
        srcP2 = vec_perm(srcR2, srcR3, pp2);\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    }\
 }
#else
#define load_alignment(s, ali, pm2, pm1, pp0, pp1, pp2, pp3){\
    srcM2 = vec_vsx_ld(-2, s);\
    srcM1 = vec_vsx_ld(-1, s);\
    srcP0 = vec_vsx_ld(0, s);\
    srcP1 = vec_vsx_ld(1, s);\
    srcP2 = vec_vsx_ld(2, s);\
    srcP3 = vec_vsx_ld(3, s);\
 }
#endif /* HAVE_BIGENDIAN */
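/*
 * load_alignment() fills the caller-declared vectors srcM2..srcP3 with
 * the 16 bytes starting at s-2, s-1, s+0, s+1, s+2 and s+3 respectively.
 * On big-endian AltiVec this is built from aligned vec_ld() plus
 * vec_perm() with the vec_lvsl() permute vectors passed as pm2..pp3;
 * alignments 11..15 are special-cased: the vector that starts exactly on
 * the second 16-byte boundary is taken directly from srcR2, and for
 * alignments 12..15 a third aligned load (srcR3) covers the bytes past
 * offset 31.  On little-endian VSX the unaligned vec_vsx_ld() does the
 * work and the ali/perm arguments are unused.
 *
 * Illustrative call (a sketch mirroring the loops below):
 *
 *     vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
 *     load_alignment(src, align, permM2, permM1, permP0, permP1,
 *                    permP2, permP3);
 */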
/* this code assumes stride % 16 == 0 */
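/*
 * Horizontal half-pel interpolation: each output pixel is the H.264
 * 6-tap filter (1, -5, 20, 20, -5, 1) applied to src[x-2..x+3].  A scalar
 * paraphrase of the vector loop below (a sketch for reference, not part
 * of the original code; av_clip_uint8() stands in for the saturating pack):
 *
 *     dst[x] = av_clip_uint8((20 * (src[x]   + src[x+1])
 *                             -  5 * (src[x-1] + src[x+2])
 *                             +       (src[x-2] + src[x+3]) + 16) >> 5);
 *
 * v20ss and v5ss hold the 20 and 5 weights, v16ss is the +16 rounding
 * bias folded into the first vec_mladd(), vec_sra(..., v5us) performs
 * the >>5 and vec_packsu() clamps to 0..255.  OP_U8_ALTIVEC then writes
 * the result out (a plain store for the put_ instantiations of this
 * template, an average with the existing dst pixels for avg_).
 */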
#ifdef PREFIX_h264_qpel16_h_lowpass_altivec
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t *dst,
                                                 const uint8_t *src,
                                                 int dstStride, int srcStride)
{
    register int i;

    LOAD_ZERO;
    vec_u8 permM2, permM1, permP0, permP1, permP2, permP3;
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB;

    vec_u8 sum, fsum;

#if HAVE_BIGENDIAN
    permM2 = vec_lvsl(-2, src);
    permM1 = vec_lvsl(-1, src);
    permP0 = vec_lvsl(+0, src);
    permP1 = vec_lvsl(+1, src);
    permP2 = vec_lvsl(+2, src);
    permP3 = vec_lvsl(+3, src);
#endif /* HAVE_BIGENDIAN */

    for (i = 0 ; i < 16 ; i ++) {
        load_alignment(src, align, permM2, permM1, permP0, permP1, permP2, permP3);

        srcP0A = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
        srcP0B = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
        srcP1A = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
        srcP1B = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);

        srcP2A = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
        srcP2B = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);
        srcP3A = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
        srcP3B = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);

        srcM1A = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
        srcM1B = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
        srcM2A = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
        srcM2B = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
}
#endif /* PREFIX_h264_qpel16_h_lowpass_altivec */
/* this code assumes stride % 16 == 0 */
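/*
 * Vertical half-pel interpolation: the same 6-tap filter and (+16)>>5
 * rounding as above, applied down a column over the rows at
 * src[-2*stride .. +3*stride].  The first five rows are loaded once
 * before the loop; each iteration loads only the new bottom row (srcP3)
 * and then slides the row window down by one, so every source row is
 * read exactly once.
 */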
#ifdef PREFIX_h264_qpel16_v_lowpass_altivec
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t *dst,
                                                 const uint8_t *src,
                                                 int dstStride, int srcStride)
{
    register int i;

    LOAD_ZERO;
    vec_u8 perm;
#if HAVE_BIGENDIAN
    perm = vec_lvsl(0, src);
#endif
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    const uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8 srcM2 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcM1 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcP0 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcP1 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcP2 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;

    vec_s16 srcM2ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB,
            srcP3ssA, srcP3ssB,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, fsum, srcP3;

    for (i = 0 ; i < 16 ; i ++) {
        srcP3 = load_with_perm_vec(0, srcbis, perm);
        srcbis += srcStride;

        srcP3ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        /* slide the five-row window down by one row for the next iteration */
        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif /* PREFIX_h264_qpel16_v_lowpass_altivec */
/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
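/*
 * 2D (horizontal, then vertical) half-pel interpolation, done in two
 * passes.  Pass 1 runs the horizontal 6-tap filter over 21 rows (16
 * output rows plus the 5 extra rows the vertical filter needs) and
 * stores the unrounded 16-bit intermediates in tmp[].  Pass 2 applies
 * the vertical 6-tap filter to those intermediates; since the products
 * no longer fit in 16 bits, vec_mule()/vec_mulo() widen the even and odd
 * lanes to 32 bits (the weight-1 term's even lanes are recovered with an
 * arithmetic shift of the 32-bit reinterpretation instead of a multiply),
 * the rounding bias of +512 (v512si) is added, the result is shifted
 * right by 10 (v10ui), and mperm re-interleaves the even/odd lanes
 * before the final clamp to bytes.  Scalar paraphrase of the second pass
 * (a sketch, with tm2..t3 the six 16-bit intermediates in one column):
 *
 *     dst[x] = av_clip_uint8((20 * (t0 + t1) - 5 * (tm1 + t2)
 *                             + (tm2 + t3) + 512) >> 10);
 */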
#ifdef PREFIX_h264_qpel16_hv_lowpass_altivec
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t *dst, int16_t *tmp,
                                                  const uint8_t *src,
                                                  int dstStride, int tmpStride,
                                                  int srcStride)
{
    register int i;
    LOAD_ZERO;
    vec_u8 permM2, permM1, permP0, permP1, permP2, permP3;
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
            tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
            tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
            pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
            pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
            ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum;
    vec_s16 ssume, ssumo;

#if HAVE_BIGENDIAN
    permM2 = vec_lvsl(-2, src);
    permM1 = vec_lvsl(-1, src);
    permP0 = vec_lvsl(+0, src);
    permP1 = vec_lvsl(+1, src);
    permP2 = vec_lvsl(+2, src);
    permP3 = vec_lvsl(+3, src);
#endif /* HAVE_BIGENDIAN */

    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

        load_alignment(src, align, permM2, permM1, permP0, permP1, permP2, permP3);

        srcP0A = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
        srcP0B = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
        srcP1A = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
        srcP1B = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);

        srcP2A = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
        srcP2B = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);
        srcP3A = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
        srcP3B = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);

        srcM1A = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
        srcM1B = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
        srcM2A = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
        srcM2B = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }

    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

    for (i = 0 ; i < 16 ; i ++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        /* slide the five-row window of intermediates down by one row */
        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Bo = vec_mulo(sum3B, v1ss);
#if !HAVE_BIGENDIAN
        sum3A = (vec_s16)vec_perm(sum3A, sum3A,vcswapi2s(0,1,2,3));
        sum3B = (vec_s16)vec_perm(sum3B, sum3B,vcswapi2s(0,1,2,3));
#endif
        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);