/*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"

#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/hpeldsp.h"
#include "hpeldsp_altivec.h"

#if HAVE_ALTIVEC
/* next one assumes that ((line_size % 16) == 0) */
void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register vector unsigned char pixelsv1, pixelsv2;
    register vector unsigned char pixelsv1B, pixelsv2B;
    register vector unsigned char pixelsv1C, pixelsv2C;
    register vector unsigned char pixelsv1D, pixelsv2D;

    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;
    register ptrdiff_t line_size_2 = line_size << 1;
    register ptrdiff_t line_size_3 = line_size + line_size_2;
    register ptrdiff_t line_size_4 = line_size << 2;

    // Hand-unrolling the loop by 4 gains about 15%;
    // minimum execution time goes from 74 to 60 cycles.
    // It is faster than -funroll-loops, but using
    // -funroll-loops with this is bad - 74 cycles again.
    // All this is on a 7450, tuning for the 7450.
    for (i = 0; i < h; i += 4) {
        pixelsv1  = vec_ld( 0, pixels);
        pixelsv2  = vec_ld(15, pixels);
        pixelsv1B = vec_ld(line_size, pixels);
        pixelsv2B = vec_ld(15 + line_size, pixels);
        pixelsv1C = vec_ld(line_size_2, pixels);
        pixelsv2C = vec_ld(15 + line_size_2, pixels);
        pixelsv1D = vec_ld(line_size_3, pixels);
        pixelsv2D = vec_ld(15 + line_size_3, pixels);
        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
               0, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
               line_size, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
               line_size_2, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
               line_size_3, (unsigned char*)block);
        pixels += line_size_4;
        block  += line_size_4;
    }
}
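
/*
 * For reference, a scalar sketch of what ff_put_pixels16_altivec computes
 * (an illustration only, not part of the build): each of the h rows is a
 * plain 16-byte copy from the possibly unaligned source to the aligned
 * destination.
 *
 *     for (i = 0; i < h; i++) {
 *         memcpy(block, pixels, 16);
 *         pixels += line_size;
 *         block  += line_size;
 *     }
 */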

/* next one assumes that ((line_size % 16) == 0) */
#define op_avg(a,b)  a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
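/*
 * op_avg is the classic scalar bit trick for a rounded byte-wise average:
 * for each of the four bytes packed in a 32-bit word it computes
 * (a + b + 1) >> 1 via the identity
 * avg(a, b) = (a | b) - (((a ^ b) & 0xFEFEFEFE) >> 1),
 * where the 0xFEFEFEFE mask keeps shifted bits from leaking across byte
 * boundaries. vec_avg() used below applies the same rounding in hardware,
 * so the macro serves as the scalar reference for the vector code.
 */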
void ff_avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;

    for (i = 0; i < h; i++) {
        pixelsv1 = vec_ld( 0, pixels);
        pixelsv2 = vec_ld(16, pixels);
        blockv   = vec_ld( 0, block);
        pixelsv  = vec_perm(pixelsv1, pixelsv2, perm);
        blockv   = vec_avg(blockv, pixelsv);
        vec_st(blockv, 0, (unsigned char*)block);
        pixels += line_size;
        block  += line_size;
    }
}

/* next one assumes that ((line_size % 8) == 0) */
static void avg_pixels8_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    int i;

    for (i = 0; i < h; i++) {
        /* block is 8-byte aligned, so it sits either in the left half
           of a 16-byte vector (16-byte aligned) or in the right half (not) */
        int rightside = ((unsigned long)block & 0x0000000F);

        blockv   = vec_ld( 0, block);
        pixelsv1 = vec_ld( 0, pixels);
        pixelsv2 = vec_ld(16, pixels);
        pixelsv  = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));

        /* Merge the 8 new pixels into whichever half of the 16-byte vector
           the block occupies; the other half keeps blockv's words, so
           averaging it with blockv below leaves it unchanged. */
        if (rightside) {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(0, 1, s0, s1));
        } else {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(s0, s1, 2, 3));
        }

        blockv = vec_avg(blockv, pixelsv);

        vec_st(blockv, 0, block);

        pixels += line_size;
        block  += line_size;
    }
}

/* next one assumes that ((line_size % 8) == 0) */
static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    /* Load the first source row at offsets 0 and 1. When pixels sits on
       the last byte of a 16-byte block, pixels + 1 is aligned and temp2
       already holds the shifted row. */
    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    /* Widen to 16 bits and carry the running sum (row + shifted row + 2)
       across iterations, so each source row is loaded only once. */
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    for (i = 0; i < h; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside) {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block  += line_size;
        pixels += line_size;
    }
}
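
/*
 * For reference, a scalar sketch of the half-pel (x+1/2, y+1/2)
 * interpolation computed above (illustrative only): every output byte is
 * the rounded average of a 2x2 source neighbourhood.
 *
 *     for (i = 0; i < h; i++) {
 *         for (x = 0; x < 8; x++)
 *             block[x] = (pixels[x] + pixels[x + 1] +
 *                         pixels[x + line_size] +
 *                         pixels[x + line_size + 1] + 2) >> 2;
 *         pixels += line_size;
 *         block  += line_size;
 *     }
 */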

/* next one assumes that ((line_size % 8) == 0) */
static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

    for (i = 0; i < h; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vcone);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside) {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block  += line_size;
        pixels += line_size;
    }
}
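
/*
 * The only difference from put_pixels8_xy2_altivec above is the rounding
 * constant: vcone (+1) instead of vctwo (+2), i.e. (sum + 1) >> 2, which
 * is the "no rounding" variant. The two 16-pixel-wide functions that
 * follow apply the same pair of formulas to a full vector, widening both
 * halves of each row with vec_mergeh/vec_mergel.
 */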

/* next one assumes that ((line_size % 16) == 0) */
static void put_pixels16_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short temp3, temp4,
        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv3 = vec_mergel(vczero, pixelsv1);
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vctwo);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    for (i = 0; i < h; i++) {
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);

        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        pixelssum3 = vec_add(pixelssum4, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);

        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block  += line_size;
        pixels += line_size;
    }
}

/* next one assumes that ((line_size % 16) == 0) */
static void put_no_rnd_pixels16_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short temp3, temp4,
        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv3 = vec_mergel(vczero, pixelsv1);
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vcone);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

    for (i = 0; i < h; i++) {
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);

        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        pixelssum3 = vec_add(pixelssum4, vcone);
        pixelssum1 = vec_add(pixelssum2, vcone);

        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block  += line_size;
        pixels += line_size;
    }
}

/* next one assumes that ((line_size % 8) == 0) */
static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2, blocktemp;
    register vector unsigned short pixelssum1, pixelssum2, temp3;

    register const vector unsigned char vczero = (const vector unsigned char)
                                        vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)
                                        vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    for (i = 0; i < h; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside) {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        blockv = vec_avg(blocktemp, blockv);
        vec_st(blockv, 0, block);

        block  += line_size;
        pixels += line_size;
    }
}
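
/*
 * avg_pixels8_xy2_altivec combines the two techniques above: blocktemp
 * holds the same 2x2 half-pel interpolation as put_pixels8_xy2_altivec,
 * and the final vec_avg() folds it into the destination, i.e. per byte
 *
 *     block[x] = (block[x] + interp[x] + 1) >> 1;
 *
 * where interp[] is the rounded 2x2 average shown in the earlier sketch.
 */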
#endif /* HAVE_ALTIVEC */

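/*
 * The HpelDSPContext function tables are indexed by block width and
 * half-pel position: the first index selects the width (0: 16 pixels,
 * 1: 8 pixels), and the second encodes the half-pel offsets (bit 0:
 * horizontal, bit 1: vertical, so 0 is a plain copy and 3 is the diagonal
 * case handled by the *_xy2 functions above). This follows the layout
 * documented in libavcodec/hpeldsp.h.
 */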
av_cold void ff_hpeldsp_init_ppc(HpelDSPContext *c, int flags)
{
#if HAVE_ALTIVEC
    if (!PPC_ALTIVEC(av_get_cpu_flags()))
        return;

    c->avg_pixels_tab[0][0]        = ff_avg_pixels16_altivec;
    c->avg_pixels_tab[1][0]        = avg_pixels8_altivec;
    c->avg_pixels_tab[1][3]        = avg_pixels8_xy2_altivec;

    c->put_pixels_tab[0][0]        = ff_put_pixels16_altivec;
    c->put_pixels_tab[1][3]        = put_pixels8_xy2_altivec;
    c->put_pixels_tab[0][3]        = put_pixels16_xy2_altivec;

    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_altivec;
    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
#endif /* HAVE_ALTIVEC */
}