/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavutil/mem.h"
#include <assert.h>

#ifdef DEBUG
/* assert that ptr is 16-byte aligned (low four address bits clear) */
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)ptr&0x0000000F));
#else
#define ASSERT_ALIGNED(ptr) ;
#endif
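/*
 * This file is a template: it is meant to be #included with
 * PREFIX_h264_qpel16_*_lowpass_altivec and OP_U8_ALTIVEC defined by the
 * including file, once for the "put" variants and once for the "avg"
 * variants, so each body below is compiled twice with a different final
 * store operation.
 */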
/* this code assumes stride % 16 == 0 */
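/*
 * Horizontal 6-tap luma half-pel filter. H.264 uses the taps
 * (1, -5, 20, 20, -5, 1) with rounding; a scalar sketch of one output
 * pixel of this pass (clip_uint8() standing in for a clamp to 0..255)
 * would be:
 *
 *     dst[x] = clip_uint8((src[x-2] - 5*src[x-1] + 20*src[x] +
 *                          20*src[x+1] - 5*src[x+2] + src[x+3] + 16) >> 5);
 *
 * The vector code below produces 16 such outputs per loop iteration.
 */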
#ifdef PREFIX_h264_qpel16_h_lowpass_altivec
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t *dst,
                                                 const uint8_t *src,
                                                 int dstStride, int srcStride)
{
    register int i;

    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
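    /*
     * vec_lvsl() returns the permute control that, applied with vec_perm()
     * to two adjacent aligned quadwords, extracts the 16 bytes starting at
     * the (possibly unaligned) address; the six controls above select the
     * six filter-tap windows src-2 .. src+3.
     */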
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
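    /*
     * The vec_splat_s16()/vec_splat_u16() immediates are limited to the
     * range -16..15, so the constants 20 and 16 are built as 5 << 2 and
     * 1 << 4 instead of being splatted directly.
     */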
    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB;

    vec_u8 sum, fsum;
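    /*
     * Each 16-pixel row is widened to two vec_s16 halves ("A" = first
     * eight pixels, "B" = last eight) by merging with a zero vector, so
     * the filter arithmetic runs on 16-bit intermediates.
     */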
    for (i = 0 ; i < 16 ; i ++) {
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        /* align-specific load: reuse srcR2 directly when a tap window
           starts exactly on the second quadword, and load a third
           quadword only when a window extends past byte 29 */
        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }
        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);
        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);
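        /*
         * pp1 = 20*(P0+P1) + 16 (the rounding constant rides along in the
         * multiply-add), pp2 = 5*(M1+P2), so the result below is
         * (pp1 + (M2+P3) - pp2) >> 5, i.e. the (1,-5,20,20,-5,1) filter
         * with rounding.
         */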
        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);
        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
}
#endif
/* this code assumes stride % 16 == 0 */
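/*
 * Vertical 6-tap luma half-pel filter: the same (1, -5, 20, 20, -5, 1)
 * taps as the horizontal pass, applied down a column. Five rows are
 * preloaded before the loop; each iteration then loads only the one new
 * row it needs and slides the older rows up one tap position.
 */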
#ifdef PREFIX_h264_qpel16_v_lowpass_altivec
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t *dst,
                                                 const uint8_t *src,
                                                 int dstStride, int srcStride)
{
    register int i;

    LOAD_ZERO;
    const vec_u8 perm = vec_lvsl(0, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
    const uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8 srcM2a = vec_ld(0, srcbis);
    const vec_u8 srcM2b = vec_ld(16, srcbis);
    const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);
    //srcbis += srcStride;
    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcM1b = vec_ld(16, srcbis);
    const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP0b = vec_ld(16, srcbis);
    const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP1b = vec_ld(16, srcbis);
    const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP2b = vec_ld(16, srcbis);
    const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);
    //srcbis += srcStride;
    vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);
    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB,
            srcP3ssA, srcP3ssB,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, fsum, srcP3a, srcP3b, srcP3;
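    /* one output row per iteration: load the new bottom row (P3), apply
       the taps, store, then rotate the row registers for the next line */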
    for (i = 0 ; i < 16 ; i++) {
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3 = vec_perm(srcP3a, srcP3b, perm);
        srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
        //srcbis += srcStride;
        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        /* slide the five retained rows up one tap position */
        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;
        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);
        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif
/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
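/*
 * Combined horizontal+vertical half-pel filter. A first pass runs the
 * horizontal 6-tap filter over 16 + 5 = 21 rows and stores the unrounded
 * 16-bit intermediates to tmp; a second pass applies the vertical 6-tap
 * filter to those intermediates in 32-bit precision. A scalar sketch of
 * the second pass for one pixel, with t[] walking down one column of tmp,
 * would be:
 *
 *     dst[x] = clip_uint8((t[-2] - 5*t[-1] + 20*t[0] + 20*t[1]
 *                          - 5*t[2] + t[3] + 512) >> 10);
 */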
#ifdef PREFIX_h264_qpel16_hv_lowpass_altivec
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t *dst, int16_t *tmp,
                                                  const uint8_t *src,
                                                  int dstStride, int tmpStride,
                                                  int srcStride)
{
    register int i;

    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);
    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
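    /* mperm re-interleaves the separately packed even and odd lanes
       (bytes 0..7 with bytes 8..15) back into pixel order */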
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
            tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
            tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
            pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
            pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
            ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum;
    vec_s16 ssume, ssumo;
    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }
        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);
        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);
        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);
        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }
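    /* second pass: vertical filter over the 16-bit intermediates, read
       back through tmpbis with the usual five-row prologue */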
    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);
        tmpbis += tmpStride;

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        /* slide the five retained rows of intermediates up one tap */
        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;
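        /*
         * The second-pass products no longer fit in 16 bits, so the
         * multiplies are done in 32-bit precision: vec_mule()/vec_mulo()
         * yield the even- and odd-lane products separately. sum3's
         * coefficient is 1, so its even lanes come from an arithmetic
         * shift of the reinterpreted vector (sign extension) instead of
         * a multiply.
         */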
        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif