Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | /* |
2 | * Copyright (c) 2002 Brian Foley | |
3 | * Copyright (c) 2002 Dieter Shirley | |
4 | * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org> | |
5 | * | |
6 | * This file is part of FFmpeg. | |
7 | * | |
8 | * FFmpeg is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License as published by the Free Software Foundation; either | |
11 | * version 2.1 of the License, or (at your option) any later version. | |
12 | * | |
13 | * FFmpeg is distributed in the hope that it will be useful, | |
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
16 | * Lesser General Public License for more details. | |
17 | * | |
18 | * You should have received a copy of the GNU Lesser General Public | |
19 | * License along with FFmpeg; if not, write to the Free Software | |
20 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
21 | */ | |
22 | ||
23 | #include "config.h" | |
24 | ||
25 | #if HAVE_ALTIVEC_H | |
26 | #include <altivec.h> | |
27 | #endif | |
28 | ||
29 | #include "libavutil/attributes.h" | |
30 | #include "libavutil/cpu.h" | |
31 | #include "libavutil/ppc/cpu.h" | |
32 | #include "libavutil/ppc/types_altivec.h" | |
33 | #include "libavutil/ppc/util_altivec.h" | |
34 | #include "libavcodec/hpeldsp.h" | |
35 | #include "hpeldsp_altivec.h" | |
36 | ||
37 | #if HAVE_ALTIVEC | |
38 | /* next one assumes that ((line_size % 16) == 0) */ | |
39 | void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | |
40 | { | |
f6fa7814 DM |
41 | register vector unsigned char pixelsv1; |
42 | register vector unsigned char pixelsv1B; | |
43 | register vector unsigned char pixelsv1C; | |
44 | register vector unsigned char pixelsv1D; | |
2ba45a60 | 45 | |
2ba45a60 DM |
46 | int i; |
47 | register ptrdiff_t line_size_2 = line_size << 1; | |
48 | register ptrdiff_t line_size_3 = line_size + line_size_2; | |
49 | register ptrdiff_t line_size_4 = line_size << 2; | |
50 | ||
51 | // hand-unrolling the loop by 4 gains about 15% | |
52 | // mininum execution time goes from 74 to 60 cycles | |
53 | // it's faster than -funroll-loops, but using | |
54 | // -funroll-loops w/ this is bad - 74 cycles again. | |
55 | // all this is on a 7450, tuning for the 7450 | |
56 | for (i = 0; i < h; i += 4) { | |
f6fa7814 DM |
57 | pixelsv1 = unaligned_load( 0, pixels); |
58 | pixelsv1B = unaligned_load(line_size, pixels); | |
59 | pixelsv1C = unaligned_load(line_size_2, pixels); | |
60 | pixelsv1D = unaligned_load(line_size_3, pixels); | |
61 | VEC_ST(pixelsv1, 0, (unsigned char*)block); | |
62 | VEC_ST(pixelsv1B, line_size, (unsigned char*)block); | |
63 | VEC_ST(pixelsv1C, line_size_2, (unsigned char*)block); | |
64 | VEC_ST(pixelsv1D, line_size_3, (unsigned char*)block); | |
2ba45a60 DM |
65 | pixels+=line_size_4; |
66 | block +=line_size_4; | |
67 | } | |
68 | } | |
69 | ||
70 | /* next one assumes that ((line_size % 16) == 0) */ | |
71 | #define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) ) | |
72 | void ff_avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | |
73 | { | |
f6fa7814 | 74 | register vector unsigned char pixelsv, blockv; |
2ba45a60 | 75 | |
f6fa7814 | 76 | int i; |
2ba45a60 | 77 | for (i = 0; i < h; i++) { |
2ba45a60 | 78 | blockv = vec_ld(0, block); |
f6fa7814 | 79 | pixelsv = VEC_LD( 0, pixels); |
2ba45a60 DM |
80 | blockv = vec_avg(blockv,pixelsv); |
81 | vec_st(blockv, 0, (unsigned char*)block); | |
82 | pixels+=line_size; | |
83 | block +=line_size; | |
84 | } | |
85 | } | |
86 | ||
87 | /* next one assumes that ((line_size % 8) == 0) */ | |
88 | static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h) | |
89 | { | |
90 | register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; | |
91 | int i; | |
92 | ||
93 | for (i = 0; i < h; i++) { | |
94 | /* block is 8 bytes-aligned, so we're either in the | |
95 | left block (16 bytes-aligned) or in the right block (not) */ | |
96 | int rightside = ((unsigned long)block & 0x0000000F); | |
97 | ||
98 | blockv = vec_ld(0, block); | |
f6fa7814 | 99 | pixelsv = VEC_LD( 0, pixels); |
2ba45a60 DM |
100 | |
101 | if (rightside) { | |
102 | pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1)); | |
103 | } else { | |
104 | pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3)); | |
105 | } | |
106 | ||
107 | blockv = vec_avg(blockv, pixelsv); | |
108 | ||
109 | vec_st(blockv, 0, block); | |
110 | ||
111 | pixels += line_size; | |
112 | block += line_size; | |
113 | } | |
114 | } | |
115 | ||
/* Half-pel interpolation in both directions for 8-pixel-wide rows:
 * block[x] = (p[x] + p[x+1] + p_next[x] + p_next[x+1] + 2) >> 2.
 * Assumes ((line_size % 8) == 0). */
static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv;
    register vector unsigned short pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    /* Prime the pipeline: 16-bit horizontal sums (p[x] + p[x+1]) for
     * row 0, with the rounding constant 2 folded in up front. */
    pixelsv1 = VEC_LD(0, pixels);
    pixelsv2 = VEC_LD(1, pixels);
    /* Interleave with zero bytes to widen unsigned 8-bit to 16-bit lanes. */
    pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
    pixelsv2 = VEC_MERGEH(vczero, pixelsv2);

    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    for (i = 0; i < h ; i++) {
        /* block is 8-byte aligned: non-zero means this row lives in the
         * right (odd) half of the 16-byte-aligned vector. */
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        /* Horizontal sum of the NEXT row, widened the same way. */
        pixelsv1 = unaligned_load(line_size, pixels);
        pixelsv2 = unaligned_load(line_size+1, pixels);
        pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
        pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        /* (this row's sum + next row's sum + 2) >> 2 — pixelssum1
         * already carries the +2 rounding bias. */
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        /* Carry next row's sum (re-biased) into the next iteration so
         * each row is loaded only once. */
        pixelssum1 = vec_add(pixelssum2, vctwo);
        /* Saturating pack back to 8 bits; only the low 8 results are used. */
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        /* Merge the 8 averaged bytes into the correct half of the
         * destination vector, preserving the other half. */
        if (rightside) {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}
162 | ||
/* No-rounding variant of put_pixels8_xy2: uses bias 1 instead of 2, i.e.
 * block[x] = (p[x] + p[x+1] + p_next[x] + p_next[x+1] + 1) >> 2.
 * Assumes ((line_size % 8) == 0). */
static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv;
    register vector unsigned short pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    /* Prime the pipeline: 16-bit horizontal sums of row 0 with the
     * no-round bias (+1) folded in. */
    pixelsv1 = VEC_LD(0, pixels);
    pixelsv2 = VEC_LD(1, pixels);
    /* Interleave with zero bytes to widen unsigned 8-bit to 16-bit lanes. */
    pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
    pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

    for (i = 0; i < h ; i++) {
        /* Non-zero when this row occupies the right half of the
         * 16-byte-aligned vector. */
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        /* Horizontal sum of the next row. */
        pixelsv1 = unaligned_load(line_size, pixels);
        pixelsv2 = unaligned_load(line_size+1, pixels);
        pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
        pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        /* (row sum + next row sum + 1) >> 2; vctwo here is the shift count. */
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        /* Carry next row's biased sum into the next iteration. */
        pixelssum1 = vec_add(pixelssum2, vcone);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        /* Write only the half of the vector this 8-byte row belongs to. */
        if (rightside) {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}
209 | ||
/* Half-pel interpolation in both directions for 16-pixel-wide rows:
 * block[x] = (p[x] + p[x+1] + p_next[x] + p_next[x+1] + 2) >> 2.
 * The 16 bytes are processed as two 8-lane 16-bit halves (MERGEH = low
 * 8 bytes, MERGEL = high 8 bytes).  Assumes ((line_size % 16) == 0). */
static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char blockv;
    register vector unsigned short temp3, temp4,
        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    /* Prime the pipeline: widen row 0 and its 1-byte-shifted copy to
     * 16-bit lanes, then form the horizontal sums for both halves with
     * the +2 rounding bias folded in. */
    pixelsv1 = VEC_LD(0, pixels);
    pixelsv2 = VEC_LD(1, pixels);
    pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
    pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
    pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
    pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vctwo);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    for (i = 0; i < h ; i++) {
        blockv = vec_ld(0, block);

        /* Next row and its 1-byte-shifted copy. */
        pixelsv1 = unaligned_load(line_size, pixels);
        pixelsv2 = unaligned_load(line_size+1, pixels);

        pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
        pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
        pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
        pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        /* (row sum + next row sum + 2) >> 2 for each half; the bias is
         * already in pixelssum1/pixelssum3. */
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        /* Carry both halves' next-row sums (re-biased) into the next
         * iteration so every source row is loaded only once. */
        pixelssum3 = vec_add(pixelssum4, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);

        /* Saturating pack both halves back to 16 bytes. */
        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}
264 | ||
/* No-rounding variant of put_pixels16_xy2: uses bias 1 instead of 2,
 * i.e. block[x] = (p[x] + p[x+1] + p_next[x] + p_next[x+1] + 1) >> 2.
 * Assumes ((line_size % 16) == 0). */
static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char blockv;
    register vector unsigned short temp3, temp4,
        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    /* Prime the pipeline: horizontal sums of row 0 for both 8-lane
     * halves (MERGEH = low bytes, MERGEL = high bytes), with the
     * no-round bias (+1) folded in. */
    pixelsv1 = VEC_LD(0, pixels);
    pixelsv2 = VEC_LD(1, pixels);
    pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
    pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
    pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
    pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vcone);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

    for (i = 0; i < h ; i++) {
        /* Next row and its 1-byte-shifted copy, widened to 16 bits. */
        pixelsv1 = unaligned_load(line_size, pixels);
        pixelsv2 = unaligned_load(line_size+1, pixels);

        pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
        pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
        pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
        pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        /* (row sum + next row sum + 1) >> 2; vctwo is the shift count. */
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        /* Carry the next-row sums (re-biased) into the next iteration. */
        pixelssum3 = vec_add(pixelssum4, vcone);
        pixelssum1 = vec_add(pixelssum2, vcone);

        /* Pack both halves; the full 16-byte row is overwritten, so no
         * read-modify-write of the destination is needed here. */
        blockv = vec_packsu(temp3, temp4);

        VEC_ST(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}
318 | ||
/* Averages the rounded 2x2 interpolation into the destination:
 * block[x] = avg(block[x],
 *                (p[x] + p[x+1] + p_next[x] + p_next[x+1] + 2) >> 2).
 * Assumes ((line_size % 8) == 0). */
static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, blocktemp;
    register vector unsigned short pixelssum1, pixelssum2, temp3;

    register const vector unsigned char vczero = (const vector unsigned char)
                                        vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)
                                        vec_splat_u16(2);

    /* Prime the pipeline: 16-bit horizontal sums (p[x] + p[x+1]) of
     * row 0 with the +2 rounding bias folded in. */
    pixelsv1 = VEC_LD(0, pixels);
    pixelsv2 = VEC_LD(1, pixels);
    pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
    pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    for (i = 0; i < h ; i++) {
        /* Non-zero when this 8-byte row occupies the right half of the
         * 16-byte-aligned vector. */
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        /* Horizontal sum of the next row, widened to 16-bit lanes. */
        pixelsv1 = unaligned_load(line_size, pixels);
        pixelsv2 = unaligned_load(line_size+1, pixels);

        pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
        pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        /* (row sum + next row sum + 2) >> 2. */
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        /* Carry the next row's biased sum into the next iteration. */
        pixelssum1 = vec_add(pixelssum2, vctwo);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        /* Merge the interpolated bytes into the half they belong to,
         * copying the other half from blockv so vec_avg preserves it. */
        if (rightside) {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        blockv = vec_avg(blocktemp, blockv);
        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}
369 | #endif /* HAVE_ALTIVEC */ | |
370 | ||
371 | av_cold void ff_hpeldsp_init_ppc(HpelDSPContext *c, int flags) | |
372 | { | |
373 | #if HAVE_ALTIVEC | |
374 | if (!PPC_ALTIVEC(av_get_cpu_flags())) | |
375 | return; | |
376 | ||
377 | c->avg_pixels_tab[0][0] = ff_avg_pixels16_altivec; | |
378 | c->avg_pixels_tab[1][0] = avg_pixels8_altivec; | |
379 | c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec; | |
380 | ||
381 | c->put_pixels_tab[0][0] = ff_put_pixels16_altivec; | |
382 | c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec; | |
383 | c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec; | |
384 | ||
385 | c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_altivec; | |
386 | c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec; | |
387 | c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec; | |
388 | #endif /* HAVE_ALTIVEC */ | |
389 | } |