/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mem.h"

#ifdef DEBUG
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)ptr&0x0000000F));
#else
#define ASSERT_ALIGNED(ptr) ;
#endif
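
/* ASSERT_ALIGNED documents the requirement that pointers handed to vec_st()
 * below (dst, and tmp in the hv variant) are 16-byte aligned; when DEBUG is
 * defined, the including file is also expected to provide assert()
 * (<assert.h>). */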

/* this code assumes stride % 16 == 0 */
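/* Horizontal pass: the H.264 six-tap luma half-pel filter
 *     dst[x] = clip(( src[x-2] - 5*src[x-1] + 20*src[x] + 20*src[x+1]
 *                     - 5*src[x+2] + src[x+3] + 16 ) >> 5)
 * applied to one 16x16 block, with the filtered result combined into dst
 * through OP_U8_ALTIVEC. */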
#ifdef PREFIX_h264_qpel16_h_lowpass_altivec
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t *dst,
                                                 const uint8_t *src,
                                                 int dstStride, int srcStride)
{
    register int i;

    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);
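    /* vec_ld() can only load from 16-byte aligned addresses, so each row is
     * fetched as two (or, for large offsets, three) aligned 16-byte loads and
     * the 16 pixels starting at src+k are extracted with vec_perm() using the
     * vec_lvsl() patterns above.  'align' is the offset of src-2 inside its
     * aligned block; for align >= 11 some of the windows begin at or beyond
     * the second block, where the lvsl pattern wraps around, so the
     * corresponding cases in the switch below take their bytes from srcR2
     * (and from a third load, srcR3) instead. */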

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB;

    vec_u8 sum, fsum;

    for (i = 0 ; i < 16 ; i ++) {
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

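        /* Zero-extend the six unsigned 16-pixel tap vectors to vec_s16 halves
         * (A = pixels 0-7, B = pixels 8-15) by interleaving them with the
         * zero vector, so the filter can be evaluated in 16-bit arithmetic
         * without overflow. */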
        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

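        /* 20*(P0+P1) + 16 via vec_mladd, minus 5*(M1+P2), plus (M2+P3), then
         * an arithmetic shift right by 5 and a saturating pack back down to
         * unsigned bytes. */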
        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

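        /* OP_U8_ALTIVEC is supplied by the including file: put_* variants keep
         * the filtered result as-is, avg_* variants average it with the pixels
         * already present in dst. */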
        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
}
#endif

/* this code assumes stride % 16 == 0 */
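/* Vertical pass: the same six-tap filter applied down the columns.  Since
 * stride % 16 == 0, every row of the block has the same alignment, so one
 * vec_lvsl(0, src) permute pattern realigns all of the loads. */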
#ifdef PREFIX_h264_qpel16_v_lowpass_altivec
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t *dst,
                                                 const uint8_t *src,
                                                 int dstStride, int srcStride)
{
    register int i;

    LOAD_ZERO;
    const vec_u8 perm = vec_lvsl(0, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    const uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8 srcM2a = vec_ld(0, srcbis);
    const vec_u8 srcM2b = vec_ld(16, srcbis);
    const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);
    //srcbis += srcStride;
    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcM1b = vec_ld(16, srcbis);
    const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP0b = vec_ld(16, srcbis);
    const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP1b = vec_ld(16, srcbis);
    const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP2b = vec_ld(16, srcbis);
    const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);
    //srcbis += srcStride;

    vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB,
            srcP3ssA, srcP3ssB,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, fsum, srcP3a, srcP3b, srcP3;

    for (i = 0 ; i < 16 ; i++) {
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3 = vec_perm(srcP3a, srcP3b, perm);
        srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
        //srcbis += srcStride;

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

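        /* Slide the window down one row: the five carried rows (M2..P2) take
         * the values of the rows below them, so each source row is loaded and
         * widened exactly once. */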
        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif

/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
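/* Centre (hv) pass: the six-tap filter is applied separably.  The first loop
 * filters 21 source rows (16 output rows plus the 5 extra rows the vertical
 * taps need) horizontally and stores the unrounded 16-bit sums in tmp; the
 * second loop filters tmp vertically in 32-bit precision and normalises the
 * result with (x + 512) >> 10. */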
#ifdef PREFIX_h264_qpel16_hv_lowpass_altivec
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t *dst, int16_t *tmp,
                                                  const uint8_t *src,
                                                  int dstStride, int tmpStride,
                                                  int srcStride)
{
    register int i;
    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));
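    /* v20ss, v5ss and v1ss are the filter weights, v512si and v10ui are the
     * rounding constant and shift of the second pass, and v16ui (a shift
     * count of 16) is used below to sign-extend even-indexed 16-bit lanes to
     * 32 bits. */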

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
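    /* mperm re-interleaves the separately packed even- and odd-indexed result
     * pixels (bytes 0-7 and 8-15 of sumv) back into their natural order. */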
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
            tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
            tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
            pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
            pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
            ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum;
    vec_s16 ssume, ssumo;

    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }

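    /* Second pass: run the same six-tap filter down the columns of tmp.
     * Prime the window with rows M2..P2, then emit one output row per
     * iteration of the loop below. */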
    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

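        /* The vertical accumulation no longer fits in 16 bits, so everything
         * is widened to 32-bit even/odd lanes: vec_mule()/vec_mulo() produce
         * the even- and odd-indexed products, while sum3 is widened by an
         * arithmetic shift right of 16 (even lanes) and a multiply by one
         * (odd lanes). */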
        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif