/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#if HAVE_UNISTD_H
#include <unistd.h>
#endif

#include "libavutil/mem.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"

#ifdef DEBUG
#include <assert.h>
/* assert that the low four address bits are zero, i.e. 16-byte alignment */
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)(ptr) & 0x0000000F));
#else
#define ASSERT_ALIGNED(ptr) ;
#endif

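/* load_alignment() fills srcM2..srcP3 with the six byte-shifted source
 * windows needed by the 6-tap filter. On big-endian AltiVec it combines
 * aligned vec_ld() loads with vec_perm() using lvsl-derived permute
 * vectors; alignments 11-15 are special-cased because one window then
 * falls exactly on the second 16-byte block (no permute needed) while
 * later windows spill into a third aligned load. On little-endian, VSX
 * unaligned loads are used directly. */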
#if HAVE_BIGENDIAN
#define load_alignment(s, ali, pm2, pm1, pp0, pp1, pp2, pp3){\
    vec_u8 srcR1 = vec_ld(-2, s);\
    vec_u8 srcR2 = vec_ld(14, s);\
    switch (ali) {\
    default: {\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = vec_perm(srcR1, srcR2, pp1);\
        srcP2 = vec_perm(srcR1, srcR2, pp2);\
        srcP3 = vec_perm(srcR1, srcR2, pp3);\
    } break;\
    case 11: {\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = vec_perm(srcR1, srcR2, pp1);\
        srcP2 = vec_perm(srcR1, srcR2, pp2);\
        srcP3 = srcR2;\
    } break;\
    case 12: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = vec_perm(srcR1, srcR2, pp1);\
        srcP2 = srcR2;\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    case 13: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = srcR2;\
        srcP2 = vec_perm(srcR2, srcR3, pp2);\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    case 14: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = srcR2;\
        srcP1 = vec_perm(srcR2, srcR3, pp1);\
        srcP2 = vec_perm(srcR2, srcR3, pp2);\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    case 15: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = srcR2;\
        srcP0 = vec_perm(srcR2, srcR3, pp0);\
        srcP1 = vec_perm(srcR2, srcR3, pp1);\
        srcP2 = vec_perm(srcR2, srcR3, pp2);\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    }\
}
#else
#define load_alignment(s, ali, pm2, pm1, pp0, pp1, pp2, pp3){\
    srcM2 = vec_vsx_ld(-2, s);\
    srcM1 = vec_vsx_ld(-1, s);\
    srcP0 = vec_vsx_ld(0, s);\
    srcP1 = vec_vsx_ld(1, s);\
    srcP2 = vec_vsx_ld(2, s);\
    srcP3 = vec_vsx_ld(3, s);\
}
#endif /* HAVE_BIGENDIAN */

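/* The H.264 luma 6-tap half-pel filter has taps (1, -5, 20, 20, -5, 1):
 * for input samples m2, m1, p0, p1, p2, p3 the single-pass output is
 *     clip(((p0 + p1) * 20 - (m1 + p2) * 5 + (m2 + p3) + 16) >> 5)
 * computed below on two vec_s16 halves (A/B) of each 16-pixel row. */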
/* this code assumes stride % 16 == 0 */
#ifdef PREFIX_h264_qpel16_h_lowpass_altivec
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t *dst,
                                                 const uint8_t *src,
                                                 int dstStride, int srcStride)
{
    register int i;

    LOAD_ZERO;
    vec_u8 permM2, permM1, permP0, permP1, permP2, permP3;
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB;

    vec_u8 sum, fsum;

#if HAVE_BIGENDIAN
    permM2 = vec_lvsl(-2, src);
    permM1 = vec_lvsl(-1, src);
    permP0 = vec_lvsl(+0, src);
    permP1 = vec_lvsl(+1, src);
    permP2 = vec_lvsl(+2, src);
    permP3 = vec_lvsl(+3, src);
#endif /* HAVE_BIGENDIAN */

    for (i = 0 ; i < 16 ; i ++) {
        load_alignment(src, align, permM2, permM1, permP0, permP1, permP2, permP3);

        srcP0A = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
        srcP0B = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
        srcP1A = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
        srcP1B = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);

        srcP2A = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
        srcP2B = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);
        srcP3A = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
        srcP3B = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);

        srcM1A = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
        srcM1B = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
        srcM2A = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
        srcM2B = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
}
#endif

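/* Vertical variant of the same 6-tap filter: six consecutive source rows
 * are kept zero-extended in registers and rotated down one row per
 * iteration, so each of the 16 output rows costs a single new load. */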
/* this code assumes stride % 16 == 0 */
#ifdef PREFIX_h264_qpel16_v_lowpass_altivec
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t *dst,
                                                 const uint8_t *src,
                                                 int dstStride, int srcStride)
{
    register int i;

    LOAD_ZERO;
    vec_u8 perm;
#if HAVE_BIGENDIAN
    perm = vec_lvsl(0, src);
#endif
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    const uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8 srcM2 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcM1 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcP0 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcP1 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcP2 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;

    vec_s16 srcM2ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB,
            srcP3ssA, srcP3ssB,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, fsum, srcP3;

    for (i = 0 ; i < 16 ; i++) {
        srcP3 = load_with_perm_vec(0, srcbis, perm);
        srcbis += srcStride;

        srcP3ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        /* slide the six-row filter window down by one row */
        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif

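/* Two-pass variant for the center positions: a horizontal pass stores
 * unrounded 16-bit intermediates into tmp, then a vertical pass filters
 * those intermediates in 32-bit precision and normalizes the combined
 * result with (v + 512) >> 10. */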
/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
#ifdef PREFIX_h264_qpel16_hv_lowpass_altivec
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t *dst, int16_t *tmp,
                                                  const uint8_t *src,
                                                  int dstStride, int tmpStride,
                                                  int srcStride)
{
    register int i;
    LOAD_ZERO;
    vec_u8 permM2, permM1, permP0, permP1, permP2, permP3;
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
            tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
            tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
            pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
            pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
            ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum;
    vec_s16 ssume, ssumo;

#if HAVE_BIGENDIAN
    permM2 = vec_lvsl(-2, src);
    permM1 = vec_lvsl(-1, src);
    permP0 = vec_lvsl(+0, src);
    permP1 = vec_lvsl(+1, src);
    permP2 = vec_lvsl(+2, src);
    permP3 = vec_lvsl(+3, src);
#endif /* HAVE_BIGENDIAN */

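    /* horizontal pass: the 16 output rows need 16 + 5 = 21 intermediate
     * rows (two above, three below) for the vertical 6-tap that follows */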
    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

        load_alignment(src, align, permM2, permM1, permP0, permP1, permP2, permP3);

        srcP0A = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
        srcP0B = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
        srcP1A = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
        srcP1B = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);

        srcP2A = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
        srcP2B = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);
        srcP3A = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
        srcP3B = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);

        srcM1A = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
        srcM1B = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
        srcM2A = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
        srcM2B = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        /* unrounded intermediate: (p0+p1)*20 - (m1+p2)*5 + (m2+p3) */
        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }

    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

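    /* vertical pass: the 16-bit intermediates can overflow once scaled by
     * 20, so the 20x and 5x terms are formed as full 32-bit products split
     * into even/odd lanes with vec_mule()/vec_mulo() and recombined. */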
    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        /* slide the six-row filter window down by one row */
        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        /* sign-extend sum3 to 32 bits: multiply-odd by 1 for the odd lanes,
         * arithmetic shift right by 16 for the even lanes */
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Bo = vec_mulo(sum3B, v1ss);
#if !HAVE_BIGENDIAN
        sum3A = (vec_s16)vec_perm(sum3A, sum3A,vcswapi2s(0,1,2,3));
        sum3B = (vec_s16)vec_perm(sum3B, sum3B,vcswapi2s(0,1,2,3));
#endif
        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        /* re-interleave the even/odd lanes back into pixel order */
        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif