/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#if HAVE_UNISTD_H
#include <unistd.h>
#endif

#include "libavutil/mem.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#ifdef DEBUG
#include <assert.h>
/* assert that ptr is 16-byte aligned, i.e. the low four address bits are zero */
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)ptr&0x0000000F));
#else
#define ASSERT_ALIGNED(ptr) ;
#endif

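/* load_alignment() fills the six byte vectors srcM2..srcP3 covering
 * src-2 .. src+3 (each 16 bytes wide).  On big-endian AltiVec this is
 * done with aligned vec_ld plus vec_perm using the vec_lvsl permute
 * vectors passed in; the switch on the alignment of src-2 handles the
 * cases where one of the offsets is itself aligned or where a third
 * aligned load (srcR3) is required.  On little-endian, VSX unaligned
 * loads make the permutes unnecessary. */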
#if HAVE_BIGENDIAN
#define load_alignment(s, ali, pm2, pm1, pp0, pp1, pp2, pp3){\
    vec_u8 srcR1 = vec_ld(-2, s);\
    vec_u8 srcR2 = vec_ld(14, s);\
    switch (ali) {\
    default: {\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = vec_perm(srcR1, srcR2, pp1);\
        srcP2 = vec_perm(srcR1, srcR2, pp2);\
        srcP3 = vec_perm(srcR1, srcR2, pp3);\
    } break;\
    case 11: {\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = vec_perm(srcR1, srcR2, pp1);\
        srcP2 = vec_perm(srcR1, srcR2, pp2);\
        srcP3 = srcR2;\
    } break;\
    case 12: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = vec_perm(srcR1, srcR2, pp1);\
        srcP2 = srcR2;\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    case 13: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = srcR2;\
        srcP2 = vec_perm(srcR2, srcR3, pp2);\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    case 14: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = srcR2;\
        srcP1 = vec_perm(srcR2, srcR3, pp1);\
        srcP2 = vec_perm(srcR2, srcR3, pp2);\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    case 15: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = srcR2;\
        srcP0 = vec_perm(srcR2, srcR3, pp0);\
        srcP1 = vec_perm(srcR2, srcR3, pp1);\
        srcP2 = vec_perm(srcR2, srcR3, pp2);\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    }\
}
#else
#define load_alignment(s, ali, pm2, pm1, pp0, pp1, pp2, pp3){\
    srcM2 = vec_vsx_ld(-2, s);\
    srcM1 = vec_vsx_ld(-1, s);\
    srcP0 = vec_vsx_ld(0, s);\
    srcP1 = vec_vsx_ld(1, s);\
    srcP2 = vec_vsx_ld(2, s);\
    srcP3 = vec_vsx_ld(3, s);\
}
#endif /* HAVE_BIGENDIAN */

/* this code assumes stride % 16 == 0 */
#ifdef PREFIX_h264_qpel16_h_lowpass_altivec
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t *dst,
                                                 const uint8_t *src,
                                                 int dstStride, int srcStride)
{
    register int i;

    LOAD_ZERO;
    vec_u8 permM2, permM1, permP0, permP1, permP2, permP3;
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB;

    vec_u8 sum, fsum;

#if HAVE_BIGENDIAN
    permM2 = vec_lvsl(-2, src);
    permM1 = vec_lvsl(-1, src);
    permP0 = vec_lvsl(+0, src);
    permP1 = vec_lvsl(+1, src);
    permP2 = vec_lvsl(+2, src);
    permP3 = vec_lvsl(+3, src);
#endif /* HAVE_BIGENDIAN */

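    /* For each of the 16 rows, apply the H.264 6-tap horizontal filter
     *   dst[x] = clip((src[x-2] - 5*src[x-1] + 20*src[x] + 20*src[x+1]
     *                  - 5*src[x+2] + src[x+3] + 16) >> 5)
     * to 16 pixels at a time.  The bytes are widened into two vec_s16
     * halves (A/B); v20ss and v5ss are the filter taps, v16ss the rounding
     * term and v5us the final shift.  On big-endian, `align` selects the
     * load_alignment() variant.  OP_U8_ALTIVEC is supplied by the file that
     * includes this template as either a plain store (put) or an average
     * with the existing destination (avg). */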
    for (i = 0 ; i < 16 ; i ++) {
        load_alignment(src, align, permM2, permM1, permP0, permP1, permP2, permP3);

        srcP0A = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
        srcP0B = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
        srcP1A = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
        srcP1B = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);

        srcP2A = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
        srcP2B = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);
        srcP3A = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
        srcP3B = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);

        srcM1A = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
        srcM1B = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
        srcM2A = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
        srcM2B = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
}
#endif

/* this code assumes stride % 16 == 0 */
#ifdef PREFIX_h264_qpel16_v_lowpass_altivec
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t *dst,
                                                 const uint8_t *src,
                                                 int dstStride, int srcStride)
{
    register int i;

    LOAD_ZERO;
    vec_u8 perm;
#if HAVE_BIGENDIAN
    perm = vec_lvsl(0, src);
#endif
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    const uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8 srcM2 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcM1 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcP0 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcP1 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcP2 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;

    vec_s16 srcM2ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB,
            srcP3ssA, srcP3ssB,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, fsum, srcP3;

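    /* Same 6-tap filter as the horizontal pass, applied down the columns.
     * Five rows of widened samples are kept in registers; each iteration
     * loads only the new bottom row (srcP3) and then rotates the row
     * history (M2 <- M1 <- P0 <- P1 <- P2 <- P3), so only one row is
     * loaded per output row. */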
    for (i = 0 ; i < 16 ; i++) {
        srcP3 = load_with_perm_vec(0, srcbis, perm);
        srcbis += srcStride;

        srcP3ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif

/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
#ifdef PREFIX_h264_qpel16_hv_lowpass_altivec
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t *dst, int16_t *tmp,
                                                  const uint8_t *src,
                                                  int dstStride, int tmpStride,
                                                  int srcStride)
{
    register int i;
    LOAD_ZERO;
    vec_u8 permM2, permM1, permP0, permP1, permP2, permP3;
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, psumA, psumB;

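    /* permute used to re-interleave the two separately packed half-vectors
     * of even- and odd-lane results back into pixel order */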
    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
            tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
            tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
            pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
            pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
            ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum;
    vec_s16 ssume, ssumo;

#if HAVE_BIGENDIAN
    permM2 = vec_lvsl(-2, src);
    permM1 = vec_lvsl(-1, src);
    permP0 = vec_lvsl(+0, src);
    permP1 = vec_lvsl(+1, src);
    permP2 = vec_lvsl(+2, src);
    permP3 = vec_lvsl(+3, src);
#endif /* HAVE_BIGENDIAN */

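    /* First pass: run the horizontal 6-tap filter over 21 rows (16 output
     * rows plus 2 above and 3 below) without rounding or shifting, and
     * store the 16-bit intermediates in tmp.  The second pass below
     * filters tmp vertically with 32-bit precision, adds the rounding
     * term 512 and shifts right by 10, matching the scalar
     *   ((t[-2] - 5*t[-1] + 20*t[0] + 20*t[1] - 5*t[2] + t[3] + 512) >> 10)
     * used for the 2D half-pel position in H.264. */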
    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

        load_alignment(src, align, permM2, permM1, permP0, permP1, permP2, permP3);

        srcP0A = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
        srcP0B = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
        srcP1A = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
        srcP1B = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);

        srcP2A = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
        srcP2B = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);
        srcP3A = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
        srcP3B = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);

        srcM1A = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
        srcM1B = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
        srcM2A = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
        srcM2B = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }

    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

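    /* Second pass: vertical 6-tap filter on the 16-bit intermediates.
     * Products can exceed 16 bits, so vec_mule/vec_mulo split each row
     * into even- and odd-lane 32-bit results, which are combined, rounded
     * with v512si, shifted right by 10 (v10ui), saturated back to bytes
     * and re-interleaved with mperm. */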
    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

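        /* sum3 has a filter tap of 1: the odd lanes use vec_mulo with v1ss,
         * while the even lanes are obtained by sign-extending the even s16
         * elements via a 32-bit arithmetic shift right by 16 (after swapping
         * the element pairs on little-endian). */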
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Bo = vec_mulo(sum3B, v1ss);
#if !HAVE_BIGENDIAN
        sum3A = (vec_s16)vec_perm(sum3A, sum3A, vcswapi2s(0,1,2,3));
        sum3B = (vec_s16)vec_perm(sum3B, sum3B, vcswapi2s(0,1,2,3));
#endif
        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif