/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mem.h"

#ifdef DEBUG
#include <assert.h>
/* assert that ptr is 16-byte aligned (low four address bits clear) */
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)ptr&0x0000000F));
#else
#define ASSERT_ALIGNED(ptr) ;
#endif

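/*
 * This file is a template: the including file compiles it once per output
 * operation by defining PREFIX_h264_qpel16_{h,v,hv}_lowpass_altivec to the
 * desired function names and OP_U8_ALTIVEC to the final store operation
 * (plain "put", or averaging with the existing destination).  A minimal
 * sketch of how an includer might instantiate the "put" and "avg" variants,
 * with the macros #undef'd between inclusions; the macro bodies below are
 * illustrative assumptions, not copied from the includer:
 *
 *   #define OP_U8_ALTIVEC(d, s, dst)              d = s
 *   #define PREFIX_h264_qpel16_h_lowpass_altivec  put_h264_qpel16_h_lowpass_altivec
 *   #include "h264qpel_template.c"
 *
 *   #define OP_U8_ALTIVEC(d, s, dst)              d = vec_avg(s, dst)
 *   #define PREFIX_h264_qpel16_h_lowpass_altivec  avg_h264_qpel16_h_lowpass_altivec
 *   #include "h264qpel_template.c"
 *
 * LOAD_ZERO, zero_u8v and zero_s16v come from the AltiVec helpers under
 * libavutil/ppc/.
 */
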
/* this code assumes stride % 16 == 0 */
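/*
 * The horizontal pass below evaluates the H.264 six-tap half-pel filter
 * (1, -5, 20, 20, -5, 1) on each of the 16 rows, one 16-pixel vector per
 * iteration.  For reference, a scalar sketch of a single output pixel
 * (hypothetical helper, not used by this file):
 *
 *   static inline uint8_t h_lowpass_pixel(const uint8_t *s)
 *   {
 *       int v = 20 * (s[0] + s[1]) - 5 * (s[-1] + s[2]) + (s[-2] + s[3]);
 *       v = (v + 16) >> 5;
 *       return v < 0 ? 0 : (v > 255 ? 255 : v);
 *   }
 *
 * The vector code builds sum1 = s[0]+s[1], sum2 = s[-1]+s[2] and
 * sum3 = s[-2]+s[3] on 16-bit lanes, combines them as
 * 20*sum1 + 16 + sum3 - 5*sum2, shifts right by 5 and saturates back to
 * unsigned bytes with vec_packsu().
 */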
#ifdef PREFIX_h264_qpel16_h_lowpass_altivec
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t *dst,
                                                 const uint8_t *src,
                                                 int dstStride, int srcStride)
{
    register int i;

    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB;

    vec_u8 sum, fsum;

    for (i = 0 ; i < 16 ; i ++) {
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        /* Gather the six shifted source vectors src[-2..+3] from the two
         * aligned loads; for alignments 11-15 the 21-byte window runs past
         * srcR2, so a third aligned load is needed (and one of the shifted
         * vectors comes for free as an aligned block). */
        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        /* zero-extend the bytes to 16-bit lanes (A = high half, B = low half) */
        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        /* six-tap filter: 20*(P0+P1) - 5*(M1+P2) + (M2+P3) + 16, then >> 5 */
        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        /* saturate to unsigned bytes and apply the put/avg store operation */
        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
}
#endif

/* this code assumes stride % 16 == 0 */
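/*
 * Vertical counterpart of the horizontal pass above: the same
 * (1, -5, 20, 20, -5, 1) filter is applied down each column.  Six source
 * rows are kept widened in registers and the window is shifted down by one
 * row per iteration, so only the new bottom row (P3) is loaded each time.
 */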
#ifdef PREFIX_h264_qpel16_v_lowpass_altivec
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t *dst,
                                                 const uint8_t *src,
                                                 int dstStride, int srcStride)
{
    register int i;

    LOAD_ZERO;
    const vec_u8 perm = vec_lvsl(0, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    const uint8_t *srcbis = src - (srcStride * 2);

    /* preload the first five source rows, src - 2*stride .. src + 2*stride */
    const vec_u8 srcM2a = vec_ld(0, srcbis);
    const vec_u8 srcM2b = vec_ld(16, srcbis);
    const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);
    //srcbis += srcStride;
    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcM1b = vec_ld(16, srcbis);
    const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP0b = vec_ld(16, srcbis);
    const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP1b = vec_ld(16, srcbis);
    const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP2b = vec_ld(16, srcbis);
    const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);
    //srcbis += srcStride;

    /* zero-extend the bytes to 16-bit lanes (A = high half, B = low half) */
    vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB,
            srcP3ssA, srcP3ssB,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, fsum, srcP3a, srcP3b, srcP3;

    for (i = 0 ; i < 16 ; i++) {
        /* load the next row (P3) and widen it to 16-bit lanes */
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3 = vec_perm(srcP3a, srcP3b, perm);
        srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
        //srcbis += srcStride;

        /* six-tap filter: 20*(P0+P1) - 5*(M1+P2) + (M2+P3) + 16, then >> 5 */
        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        /* slide the six-row window down by one row for the next iteration */
        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        /* saturate to unsigned bytes and apply the put/avg store operation */
        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif

/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
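/*
 * Half-pel interpolation in both directions, done in two passes: the first
 * pass runs the horizontal six-tap filter over 16 + 5 = 21 rows and stores
 * the unrounded, unshifted 16-bit results in tmp; the second pass filters
 * tmp vertically, then rounds with +512 and shifts right by 10 (each pass
 * has a DC gain of 32, hence the combined 1024 = 2^10 normalization).
 */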
#ifdef PREFIX_h264_qpel16_hv_lowpass_altivec
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t *dst, int16_t *tmp,
                                                  const uint8_t *src,
                                                  int dstStride, int tmpStride,
                                                  int srcStride)
{
    register int i;
    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
            tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
            tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
            pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
            pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
            ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum;
    vec_s16 ssume, ssumo;

    /* first pass: horizontal six-tap filter on 21 rows, results kept
     * unrounded and unshifted as 16-bit intermediates in tmp */
    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        /* gather src[-2..+3] from the aligned loads, as in the h pass above */
        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        /* 20*(P0+P1) - 5*(M1+P2) + (M2+P3), no rounding or shift yet */
        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }

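    /*
     * Second pass: the same six-tap filter is applied vertically to the
     * 16-bit intermediates in tmp.  Because 20 * (t[0] + t[1]) no longer
     * fits in 16 bits, the products are widened with vec_mule/vec_mulo
     * (even/odd lanes in 32-bit), rounded with +512, shifted right by 10,
     * packed back to 16 bits and then to unsigned bytes; mperm re-interleaves
     * the even/odd results into pixel order.  Scalar sketch of the value fed
     * to OP_U8_ALTIVEC(), where t[-2] .. t[3] are six vertically consecutive
     * tmp values in the same column (illustrative only):
     *
     *   int v = 20 * (t[0] + t[1]) - 5 * (t[-1] + t[2]) + (t[-2] + t[3]);
     *   v = (v + 512) >> 10;
     *   v = v < 0 ? 0 : (v > 255 ? 255 : v);
     */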
    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        /* slide the six-row window down by one row for the next iteration */
        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

        /* widen the products to 32 bits, even and odd lanes separately */
        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        /* sum3 * 1: even lanes via an arithmetic shift of the 32-bit view,
         * odd lanes via vec_mulo with a constant 1 */
        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);

        /* add the rounding constant 512, combine, and shift right by 10 */
        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        /* pack back to 16 bits, then to unsigned bytes, and re-interleave
         * the even/odd lanes into pixel order */
        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif