Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | /* |
2 | * AltiVec acceleration for colorspace conversion | |
3 | * | |
4 | * copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com> | |
5 | * | |
6 | * This file is part of FFmpeg. | |
7 | * | |
8 | * FFmpeg is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License as published by the Free Software Foundation; either | |
11 | * version 2.1 of the License, or (at your option) any later version. | |
12 | * | |
13 | * FFmpeg is distributed in the hope that it will be useful, | |
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
16 | * Lesser General Public License for more details. | |
17 | * | |
18 | * You should have received a copy of the GNU Lesser General Public | |
19 | * License along with FFmpeg; if not, write to the Free Software | |
20 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
21 | */ | |
22 | ||
23 | /* | |
24 | * Convert I420 YV12 to RGB in various formats, | |
25 | * it rejects images that are not in 420 formats, | |
26 | * it rejects images that don't have widths of multiples of 16, | |
27 | * it rejects images that don't have heights of multiples of 2. | |
28 | * Reject defers to C simulation code. | |
29 | * | |
30 | * Lots of optimizations to be done here. | |
31 | * | |
32 | * 1. Need to fix saturation code. I just couldn't get it to fly with packs | |
33 | * and adds, so we currently use max/min to clip. | |
34 | * | |
35 | * 2. The inefficient use of chroma loading needs a bit of brushing up. | |
36 | * | |
37 | * 3. Analysis of pipeline stalls needs to be done. Use shark to identify | |
38 | * pipeline stalls. | |
39 | * | |
40 | * | |
41 | * MODIFIED to calculate coeffs from currently selected color space. | |
42 | * MODIFIED core to be a macro where you specify the output format. | |
43 | * ADDED UYVY conversion which is never called due to something in swscale. | |
44 | * CORRECTED algorithm selection to be strict on input formats. | |
45 | * ADDED runtime detection of AltiVec. | |
46 | * | |
47 | * ADDED altivec_yuv2packedX vertical scl + RGB converter | |
48 | * | |
49 | * March 27,2004 | |
50 | * PERFORMANCE ANALYSIS | |
51 | * | |
52 | * The C version uses 25% of the processor or ~250Mips for D1 video rawvideo | |
53 | * used as test. | |
54 | * The AltiVec version uses 10% of the processor or ~100Mips for D1 video | |
55 | * same sequence. | |
56 | * | |
57 | * 720 * 480 * 30 ~10MPS | |
58 | * | |
59 | * so we have roughly 10 clocks per pixel. This is too high, something has | |
60 | * to be wrong. | |
61 | * | |
62 | * OPTIMIZED clip codes to utilize vec_max and vec_packs removing the | |
63 | * need for vec_min. | |
64 | * | |
65 | * OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to | |
66 | * have the input video frame, it was just decompressed so it probably resides | |
67 | * in L1 caches. However, we are creating the output video stream. This needs | |
68 | * to use the DSTST instruction to optimize for the cache. We couple this with | |
69 | * the fact that we are not going to be visiting the input buffer again so we | |
70 | * mark it Least Recently Used. This shaves 25% of the processor cycles off. | |
71 | * | |
72 | * Now memcpy is the largest mips consumer in the system, probably due | |
73 | * to the inefficient X11 stuff. | |
74 | * | |
75 | * GL libraries seem to be very slow on this machine 1.33Ghz PB running | |
76 | * Jaguar, this is not the case for my 1Ghz PB. I thought it might be | |
77 | * a versioning issue, however I have libGL.1.2.dylib for both | |
78 | * machines. (We need to figure this out now.) | |
79 | * | |
80 | * GL2 libraries work now with patch for RGB32. | |
81 | * | |
82 | * NOTE: quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor. | |
83 | * | |
84 | * Integrated luma prescaling adjustment for saturation/contrast/brightness | |
85 | * adjustment. | |
86 | */ | |
87 | ||
88 | #include <stdio.h> | |
89 | #include <stdlib.h> | |
90 | #include <string.h> | |
91 | #include <inttypes.h> | |
92 | #include <assert.h> | |
93 | ||
94 | #include "config.h" | |
95 | #include "libswscale/rgb2rgb.h" | |
96 | #include "libswscale/swscale.h" | |
97 | #include "libswscale/swscale_internal.h" | |
98 | #include "libavutil/attributes.h" | |
99 | #include "libavutil/cpu.h" | |
100 | #include "libavutil/pixdesc.h" | |
101 | #include "yuv2rgb_altivec.h" | |
102 | ||
103 | #if HAVE_ALTIVEC | |
104 | ||
105 | #undef PROFILE_THE_BEAST | |
106 | #undef INC_SCALING | |
107 | ||
/* Shorthand byte types; ubyte is the element type of the raw input plane
 * pointers used by the converters below. */
typedef unsigned char ubyte;
typedef signed char sbyte;
110 | ||
111 | /* RGB interleaver, 16 planar pels 8-bit samples per channel in | |
112 | * homogeneous vector registers x0,x1,x2 are interleaved with the | |
113 | * following technique: | |
114 | * | |
115 | * o0 = vec_mergeh(x0, x1); | |
116 | * o1 = vec_perm(o0, x2, perm_rgb_0); | |
117 | * o2 = vec_perm(o0, x2, perm_rgb_1); | |
118 | * o3 = vec_mergel(x0, x1); | |
119 | * o4 = vec_perm(o3, o2, perm_rgb_2); | |
120 | * o5 = vec_perm(o3, o2, perm_rgb_3); | |
121 | * | |
122 | * perm_rgb_0: o0(RG).h v1(B) --> o1* | |
123 | * 0 1 2 3 4 | |
124 | * rgbr|gbrg|brgb|rgbr | |
125 | * 0010 0100 1001 0010 | |
126 | * 0102 3145 2673 894A | |
127 | * | |
128 | * perm_rgb_1: o0(RG).h v1(B) --> o2 | |
129 | * 0 1 2 3 4 | |
130 | * gbrg|brgb|bbbb|bbbb | |
131 | * 0100 1001 1111 1111 | |
132 | * B5CD 6EF7 89AB CDEF | |
133 | * | |
134 | * perm_rgb_2: o3(RG).l o2(rgbB.l) --> o4* | |
135 | * 0 1 2 3 4 | |
136 | * gbrg|brgb|rgbr|gbrg | |
137 | * 1111 1111 0010 0100 | |
138 | * 89AB CDEF 0182 3945 | |
139 | * | |
140 | * perm_rgb_3: o3(RG).l o2(rgbB.l) ---> o5* | |
141 | * 0 1 2 3 4 | |
142 | * brgb|rgbr|gbrg|brgb | |
143 | * 1001 0010 0100 1001 | |
144 | * a67b 89cA BdCD eEFf | |
145 | * | |
146 | */ | |
147 | static const vector unsigned char | |
148 | perm_rgb_0 = { 0x00, 0x01, 0x10, 0x02, 0x03, 0x11, 0x04, 0x05, | |
149 | 0x12, 0x06, 0x07, 0x13, 0x08, 0x09, 0x14, 0x0a }, | |
150 | perm_rgb_1 = { 0x0b, 0x15, 0x0c, 0x0d, 0x16, 0x0e, 0x0f, 0x17, | |
151 | 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f }, | |
152 | perm_rgb_2 = { 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, | |
153 | 0x00, 0x01, 0x18, 0x02, 0x03, 0x19, 0x04, 0x05 }, | |
154 | perm_rgb_3 = { 0x1a, 0x06, 0x07, 0x1b, 0x08, 0x09, 0x1c, 0x0a, | |
155 | 0x0b, 0x1d, 0x0c, 0x0d, 0x1e, 0x0e, 0x0f, 0x1f }; | |
156 | ||
/* Interleave three 16-byte channel vectors into three vectors of packed
 * 3-byte pixels: y0, y1, y2 receive the 48 output bytes in order, with the
 * x0 channel first in each triplet, then x1, then x2. */
#define vec_merge3(x2, x1, x0, y0, y1, y2)                                  \
    do {                                                                    \
        __typeof__(x0) m3_pair_hi, m3_spill, m3_pair_lo;                    \
        /* x0/x1 pairs for the first eight pixels */                        \
        m3_pair_hi = vec_mergeh(x0, x1);                                    \
        y0         = vec_perm(m3_pair_hi, x2, perm_rgb_0);                  \
        /* leftover pairs plus the second half of x2 */                     \
        m3_spill   = vec_perm(m3_pair_hi, x2, perm_rgb_1);                  \
        /* x0/x1 pairs for the last eight pixels */                         \
        m3_pair_lo = vec_mergel(x0, x1);                                    \
        y1         = vec_perm(m3_pair_lo, m3_spill, perm_rgb_2);            \
        y2         = vec_perm(m3_pair_lo, m3_spill, perm_rgb_3);            \
    } while (0)
167 | ||
/* Interleave x0, x1, x2 with vec_merge3(x0, x1, x2, ...) and store the three
 * resulting vectors at ptr, ptr + 1 and ptr + 2; ptr is advanced by 3. */
#define vec_mstbgr24(x0, x1, x2, ptr)                                       \
    do {                                                                    \
        __typeof__(x0) bgr_v0, bgr_v1, bgr_v2;                              \
        vec_merge3(x0, x1, x2, bgr_v0, bgr_v1, bgr_v2);                     \
        vec_st(bgr_v0, 0, ptr);                                             \
        vec_st(bgr_v1, 0, ptr + 1);                                         \
        vec_st(bgr_v2, 0, ptr + 2);                                         \
        ptr += 3;                                                           \
    } while (0)
176 | ||
/* Same as vec_mstbgr24 but with the channel order reversed: the operands
 * are handed to vec_merge3 as (x2, x1, x0). Stores three vectors at ptr,
 * ptr + 1 and ptr + 2; ptr is advanced by 3. */
#define vec_mstrgb24(x0, x1, x2, ptr)                                       \
    do {                                                                    \
        __typeof__(x0) rgb_v0, rgb_v1, rgb_v2;                              \
        vec_merge3(x2, x1, x0, rgb_v0, rgb_v1, rgb_v2);                     \
        vec_st(rgb_v0, 0, ptr);                                             \
        vec_st(rgb_v1, 0, ptr + 1);                                         \
        vec_st(rgb_v2, 0, ptr + 2);                                         \
        ptr += 3;                                                           \
    } while (0)
185 | ||
/* Pack four 16-byte channel vectors into 16 four-byte pixels and store them.
 * Each pixel is laid out as x0, x1, x2, x3 in element order. 64 bytes are
 * written at ptr (viewed as T *), then ptr is advanced by 4 elements.
 */
#define vec_mstrgb32(T, x0, x1, x2, x3, ptr)                                \
    do {                                                                    \
        T rgb32_ab, rgb32_cd, rgb32_q0, rgb32_q1;                           \
        /* pixels 0..7: pair up bytes, then pair up the 16-bit pairs */     \
        rgb32_ab = vec_mergeh(x0, x1);                                      \
        rgb32_cd = vec_mergeh(x2, x3);                                      \
        rgb32_q0 = (T) vec_mergeh((vector unsigned short) rgb32_ab,         \
                                  (vector unsigned short) rgb32_cd);        \
        rgb32_q1 = (T) vec_mergel((vector unsigned short) rgb32_ab,         \
                                  (vector unsigned short) rgb32_cd);        \
        vec_st(rgb32_q0, 0 * 16, (T *) ptr);                                \
        vec_st(rgb32_q1, 1 * 16, (T *) ptr);                                \
        /* pixels 8..15 */                                                  \
        rgb32_ab = vec_mergel(x0, x1);                                      \
        rgb32_cd = vec_mergel(x2, x3);                                      \
        rgb32_q0 = (T) vec_mergeh((vector unsigned short) rgb32_ab,         \
                                  (vector unsigned short) rgb32_cd);        \
        rgb32_q1 = (T) vec_mergel((vector unsigned short) rgb32_ab,         \
                                  (vector unsigned short) rgb32_cd);        \
        vec_st(rgb32_q0, 2 * 16, (T *) ptr);                                \
        vec_st(rgb32_q1, 3 * 16, (T *) ptr);                                \
        ptr += 4;                                                           \
    } while (0)
211 | ||
212 | /* | |
213 | * 1 0 1.4021 | | Y | | |
214 | * 1 -0.3441 -0.7142 |x| Cb| | |
215 | * 1 1.7718 0 | | Cr| | |
216 | * | |
217 | * | |
218 | * Y: [-128 127] | |
219 | * Cb/Cr : [-128 127] | |
220 | * | |
221 | * typical YUV conversion works on Y: 0-255 this version has been | |
222 | * optimized for JPEG decoding. | |
223 | */ | |
224 | ||
/* Zero-extend bytes 0..7 of x into eight 16-bit lanes. In every output
 * halfword the first byte comes from the all-zero second operand (selector
 * 0x10) and the second byte from x. The result is cast to
 * vector signed short. */
#define vec_unh(x)                                                          \
    (vector signed short)                                                   \
        vec_perm(x, (__typeof__(x)) { 0 },                                  \
                 ((vector unsigned char) {                                  \
                     0x10, 0x00, 0x10, 0x01, 0x10, 0x02, 0x10, 0x03,        \
                     0x10, 0x04, 0x10, 0x05, 0x10, 0x06, 0x10, 0x07 }))
231 | ||
/* Zero-extend bytes 8..15 of x into eight 16-bit lanes; the counterpart of
 * vec_unh for the second half of the vector. */
#define vec_unl(x)                                                          \
    (vector signed short)                                                   \
        vec_perm(x, (__typeof__(x)) { 0 },                                  \
                 ((vector unsigned char) {                                  \
                     0x10, 0x08, 0x10, 0x09, 0x10, 0x0A, 0x10, 0x0B,        \
                     0x10, 0x0C, 0x10, 0x0D, 0x10, 0x0E, 0x10, 0x0F }))
238 | ||
/* Clamp every 16-bit lane of x to the range [16, 235]: first vec_min against
 * 235, then vec_max against 16. The vertical scaler in this file applies it
 * to both luma and chroma vectors. */
#define vec_clip_s16(x)                                                     \
    vec_max(vec_min(x, ((vector signed short) {                             \
                            235, 235, 235, 235, 235, 235, 235, 235 })),     \
            ((vector signed short) { 16, 16, 16, 16, 16, 16, 16, 16 }))
243 | ||
/* Pack two vectors of eight signed 16-bit lanes into one vector of sixteen
 * unsigned bytes. Negative lanes are first raised to 0 with vec_max; the
 * result is then reinterpreted as unsigned shorts and narrowed with
 * vec_packs (upper bound handled by the pack — see the "OPTIMIZED clip
 * codes" note in the file header). */
#define vec_packclp(x, y)                                                   \
    (vector unsigned char)                                                  \
        vec_packs((vector unsigned short)                                   \
                      vec_max(x, ((vector signed short) { 0 })),            \
                  (vector unsigned short)                                   \
                      vec_max(y, ((vector signed short) { 0 })))
250 | ||
251 | static inline void cvtyuvtoRGB(SwsContext *c, vector signed short Y, | |
252 | vector signed short U, vector signed short V, | |
253 | vector signed short *R, vector signed short *G, | |
254 | vector signed short *B) | |
255 | { | |
256 | vector signed short vx, ux, uvx; | |
257 | ||
258 | Y = vec_mradds(Y, c->CY, c->OY); | |
259 | U = vec_sub(U, (vector signed short) | |
260 | vec_splat((vector signed short) { 128 }, 0)); | |
261 | V = vec_sub(V, (vector signed short) | |
262 | vec_splat((vector signed short) { 128 }, 0)); | |
263 | ||
264 | // ux = (CBU * (u << c->CSHIFT) + 0x4000) >> 15; | |
265 | ux = vec_sl(U, c->CSHIFT); | |
266 | *B = vec_mradds(ux, c->CBU, Y); | |
267 | ||
268 | // vx = (CRV * (v << c->CSHIFT) + 0x4000) >> 15; | |
269 | vx = vec_sl(V, c->CSHIFT); | |
270 | *R = vec_mradds(vx, c->CRV, Y); | |
271 | ||
272 | // uvx = ((CGU * u) + (CGV * v)) >> 15; | |
273 | uvx = vec_mradds(U, c->CGU, Y); | |
274 | *G = vec_mradds(V, c->CGV, uvx); | |
275 | } | |
276 | ||
277 | /* | |
278 | * ------------------------------------------------------------------------------ | |
279 | * CS converters | |
280 | * ------------------------------------------------------------------------------ | |
281 | */ | |
282 | ||
/*
 * DEFCSP420_CVT(name, out_pixels) defines
 *     static int altivec_<name>(SwsContext *c, const unsigned char **in, ...)
 *
 * The generated function converts planar input (in[0] = luma, in[1] = U,
 * in[2] = V, one chroma sample per 2x2 luma block) to packed RGB.
 *
 *  - Each outer iteration handles two luma rows (y1i, y2i) that share one
 *    chroma row and writes two output rows (oute, outo); h / 2 iterations.
 *  - Each inner iteration handles 16 pixels; w / 16 iterations, so any
 *    remainder of the width is not converted.
 *  - Inputs are loaded with vec_lvsl + vec_perm over two adjacent vectors, so
 *    the source pointers need not be 16-byte aligned. Output is written with
 *    vec_st through out_pixels, and the odd row is located with
 *    outstrides[0] >> 4 vector elements.
 *  - out_pixels(R, G, B, ptr) must store one 16-pixel group and advance ptr.
 *
 * Returns srcSliceH.
 */
#define DEFCSP420_CVT(name, out_pixels)                                       \
static int altivec_ ## name(SwsContext *c, const unsigned char **in,          \
                            int *instrides, int srcSliceY, int srcSliceH,     \
                            unsigned char **oplanes, int *outstrides)         \
{                                                                             \
    int w = c->srcW;                                                          \
    int h = srcSliceH;                                                        \
    int i, j;                                                                 \
    int instrides_scl[3];                                                     \
    vector unsigned char y0, y1;                                              \
                                                                              \
    vector signed char u, v;                                                  \
                                                                              \
    vector signed short Y0, Y1, Y2, Y3;                                       \
    vector signed short U, V;                                                 \
    vector signed short vx, ux, uvx;                                          \
    vector signed short vx0, ux0, uvx0;                                       \
    vector signed short vx1, ux1, uvx1;                                       \
    vector signed short R0, G0, B0;                                           \
    vector signed short R1, G1, B1;                                           \
    vector unsigned char R, G, B;                                             \
                                                                              \
    const vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP;                  \
    vector unsigned char align_perm;                                          \
                                                                              \
    /* local copies of the per-context coefficient vectors */                 \
    vector signed short lCY       = c->CY;                                    \
    vector signed short lOY       = c->OY;                                    \
    vector signed short lCRV      = c->CRV;                                   \
    vector signed short lCBU      = c->CBU;                                   \
    vector signed short lCGU      = c->CGU;                                   \
    vector signed short lCGV      = c->CGV;                                   \
    vector unsigned short lCSHIFT = c->CSHIFT;                                \
                                                                              \
    const ubyte *y1i = in[0];                                                 \
    const ubyte *y2i = in[0] + instrides[0];                                  \
    const ubyte *ui  = in[1];                                                 \
    const ubyte *vi  = in[2];                                                 \
                                                                              \
    vector unsigned char *oute, *outo;                                        \
                                                                              \
    /* Row-end adjustments: the inner loop has already advanced each */       \
    /* pointer across the row, so only the remainder is added.       */       \
    /* loop moves y{1, 2}i by w */                                            \
    instrides_scl[0] = instrides[0] * 2 - w;                                  \
    /* loop moves ui by w / 2 */                                              \
    instrides_scl[1] = instrides[1] - w / 2;                                  \
    /* loop moves vi by w / 2 */                                              \
    instrides_scl[2] = instrides[2] - w / 2;                                  \
                                                                              \
    for (i = 0; i < h / 2; i++) {                                             \
        /* even and odd output rows of this row pair */                       \
        oute = (vector unsigned char *)(oplanes[0] + outstrides[0] *          \
                                        (srcSliceY + i * 2));                 \
        outo = oute + (outstrides[0] >> 4);                                   \
        /* data-stream-touch-for-store cache hints on both output rows */     \
        vec_dstst(outo, (0x02000002 | (((w * 3 + 32) / 32) << 16)), 0);       \
        vec_dstst(oute, (0x02000002 | (((w * 3 + 32) / 32) << 16)), 1);       \
                                                                              \
        for (j = 0; j < w / 16; j++) {                                        \
            y1ivP = (const vector unsigned char *) y1i;                       \
            y2ivP = (const vector unsigned char *) y2i;                       \
            uivP  = (const vector unsigned char *) ui;                        \
            vivP  = (const vector unsigned char *) vi;                        \
                                                                              \
            /* unaligned loads: permute across two adjacent vectors */        \
            align_perm = vec_lvsl(0, y1i);                                    \
            y0 = (vector unsigned char)                                       \
                     vec_perm(y1ivP[0], y1ivP[1], align_perm);                \
                                                                              \
            align_perm = vec_lvsl(0, y2i);                                    \
            y1 = (vector unsigned char)                                       \
                     vec_perm(y2ivP[0], y2ivP[1], align_perm);                \
                                                                              \
            align_perm = vec_lvsl(0, ui);                                     \
            u = (vector signed char)                                          \
                    vec_perm(uivP[0], uivP[1], align_perm);                   \
                                                                              \
            align_perm = vec_lvsl(0, vi);                                     \
            v = (vector signed char)                                          \
                    vec_perm(vivP[0], vivP[1], align_perm);                   \
                                                                              \
            /* remove the 128 chroma bias (byte arithmetic wraps) */          \
            u = (vector signed char)                                          \
                    vec_sub(u,                                                \
                            (vector signed char)                              \
                                vec_splat((vector signed char) { 128 }, 0));  \
            v = (vector signed char)                                          \
                    vec_sub(v,                                                \
                            (vector signed char)                              \
                                vec_splat((vector signed char) { 128 }, 0));  \
                                                                              \
            /* only the first 8 chroma samples are needed for 16 pixels */    \
            U = vec_unpackh(u);                                               \
            V = vec_unpackh(v);                                               \
                                                                              \
            /* Y0/Y1: row 1 pixels 0-7 / 8-15, Y2/Y3: same for row 2 */       \
            Y0 = vec_unh(y0);                                                 \
            Y1 = vec_unl(y0);                                                 \
            Y2 = vec_unh(y1);                                                 \
            Y3 = vec_unl(y1);                                                 \
                                                                              \
            Y0 = vec_mradds(Y0, lCY, lOY);                                    \
            Y1 = vec_mradds(Y1, lCY, lOY);                                    \
            Y2 = vec_mradds(Y2, lCY, lOY);                                    \
            Y3 = vec_mradds(Y3, lCY, lOY);                                    \
                                                                              \
            /* ux = (CBU * (u << CSHIFT) + 0x4000) >> 15 */                   \
            ux  = vec_sl(U, lCSHIFT);                                         \
            ux  = vec_mradds(ux, lCBU, (vector signed short) { 0 });          \
            /* duplicate each term for the two pixels sharing it */           \
            ux0 = vec_mergeh(ux, ux);                                         \
            ux1 = vec_mergel(ux, ux);                                         \
                                                                              \
            /* vx = (CRV * (v << CSHIFT) + 0x4000) >> 15; */                  \
            vx  = vec_sl(V, lCSHIFT);                                         \
            vx  = vec_mradds(vx, lCRV, (vector signed short) { 0 });          \
            vx0 = vec_mergeh(vx, vx);                                         \
            vx1 = vec_mergel(vx, vx);                                         \
                                                                              \
            /* uvx = ((CGU * u) + (CGV * v)) >> 15 */                         \
            uvx  = vec_mradds(U, lCGU, (vector signed short) { 0 });          \
            uvx  = vec_mradds(V, lCGV, uvx);                                  \
            uvx0 = vec_mergeh(uvx, uvx);                                      \
            uvx1 = vec_mergel(uvx, uvx);                                      \
                                                                              \
            /* first (even) row */                                            \
            R0 = vec_add(Y0, vx0);                                            \
            G0 = vec_add(Y0, uvx0);                                           \
            B0 = vec_add(Y0, ux0);                                            \
            R1 = vec_add(Y1, vx1);                                            \
            G1 = vec_add(Y1, uvx1);                                           \
            B1 = vec_add(Y1, ux1);                                            \
                                                                              \
            R = vec_packclp(R0, R1);                                          \
            G = vec_packclp(G0, G1);                                          \
            B = vec_packclp(B0, B1);                                          \
                                                                              \
            out_pixels(R, G, B, oute);                                        \
                                                                              \
            /* second (odd) row reuses the same chroma terms */               \
            R0 = vec_add(Y2, vx0);                                            \
            G0 = vec_add(Y2, uvx0);                                           \
            B0 = vec_add(Y2, ux0);                                            \
            R1 = vec_add(Y3, vx1);                                            \
            G1 = vec_add(Y3, uvx1);                                           \
            B1 = vec_add(Y3, ux1);                                            \
            R  = vec_packclp(R0, R1);                                         \
            G  = vec_packclp(G0, G1);                                         \
            B  = vec_packclp(B0, B1);                                         \
                                                                              \
                                                                              \
            out_pixels(R, G, B, outo);                                        \
                                                                              \
            y1i += 16;                                                        \
            y2i += 16;                                                        \
            ui  += 8;                                                         \
            vi  += 8;                                                         \
        }                                                                     \
                                                                              \
        /* step to the next chroma row and the next pair of luma rows */      \
        ui  += instrides_scl[1];                                              \
        vi  += instrides_scl[2];                                              \
        y1i += instrides_scl[0];                                              \
        y2i += instrides_scl[0];                                              \
    }                                                                         \
    return srcSliceH;                                                         \
}
438 | ||
/*
 * Per-format pixel writers: out_<fmt>(a, b, c, ptr) takes the R, G and B
 * byte vectors (in that order) and stores 16 pixels at ptr, advancing ptr.
 *
 * The 32-bit variants insert a constant alpha of 255. The constant has to be
 * broadcast with vec_splat(): a bare `{ 255 }` vector literal initialises
 * lane 0 only and zero-fills the other fifteen lanes, which would leave 15
 * out of every 16 pixels fully transparent.
 */
#define out_abgr(a, b, c, ptr)                                              \
    vec_mstrgb32(__typeof__(a),                                             \
                 (vec_splat(((__typeof__(a)) { 255 }), 0)), c, b, a, ptr)
#define out_bgra(a, b, c, ptr)                                              \
    vec_mstrgb32(__typeof__(a),                                             \
                 c, b, a, (vec_splat(((__typeof__(a)) { 255 }), 0)), ptr)
#define out_rgba(a, b, c, ptr)                                              \
    vec_mstrgb32(__typeof__(a),                                             \
                 a, b, c, (vec_splat(((__typeof__(a)) { 255 }), 0)), ptr)
#define out_argb(a, b, c, ptr)                                              \
    vec_mstrgb32(__typeof__(a),                                             \
                 (vec_splat(((__typeof__(a)) { 255 }), 0)), a, b, c, ptr)
#define out_rgb24(a, b, c, ptr) vec_mstrgb24(a, b, c, ptr)
#define out_bgr24(a, b, c, ptr) vec_mstbgr24(a, b, c, ptr)
449 | ||
/* Instantiate one planar 4:2:0 converter per packed output layout:
 * altivec_yuv2_abgr, altivec_yuv2_bgra, altivec_yuv2_rgba,
 * altivec_yuv2_argb, altivec_yuv2_rgb24 and altivec_yuv2_bgr24. */
DEFCSP420_CVT(yuv2_abgr,  out_abgr)
DEFCSP420_CVT(yuv2_bgra,  out_bgra)
DEFCSP420_CVT(yuv2_rgba,  out_rgba)
DEFCSP420_CVT(yuv2_argb,  out_argb)
DEFCSP420_CVT(yuv2_rgb24, out_rgb24)
DEFCSP420_CVT(yuv2_bgr24, out_bgr24)
456 | ||
457 | // uyvy|uyvy|uyvy|uyvy | |
458 | // 0123 4567 89ab cdef | |
459 | static const vector unsigned char | |
460 | demux_u = { 0x10, 0x00, 0x10, 0x00, | |
461 | 0x10, 0x04, 0x10, 0x04, | |
462 | 0x10, 0x08, 0x10, 0x08, | |
463 | 0x10, 0x0c, 0x10, 0x0c }, | |
464 | demux_v = { 0x10, 0x02, 0x10, 0x02, | |
465 | 0x10, 0x06, 0x10, 0x06, | |
466 | 0x10, 0x0A, 0x10, 0x0A, | |
467 | 0x10, 0x0E, 0x10, 0x0E }, | |
468 | demux_y = { 0x10, 0x01, 0x10, 0x03, | |
469 | 0x10, 0x05, 0x10, 0x07, | |
470 | 0x10, 0x09, 0x10, 0x0B, | |
471 | 0x10, 0x0D, 0x10, 0x0F }; | |
472 | ||
473 | /* | |
474 | * this is so I can play live CCIR raw video | |
475 | */ | |
476 | static int altivec_uyvy_rgb32(SwsContext *c, const unsigned char **in, | |
477 | int *instrides, int srcSliceY, int srcSliceH, | |
478 | unsigned char **oplanes, int *outstrides) | |
479 | { | |
480 | int w = c->srcW; | |
481 | int h = srcSliceH; | |
482 | int i, j; | |
483 | vector unsigned char uyvy; | |
484 | vector signed short Y, U, V; | |
485 | vector signed short R0, G0, B0, R1, G1, B1; | |
486 | vector unsigned char R, G, B; | |
487 | vector unsigned char *out; | |
488 | const ubyte *img; | |
489 | ||
490 | img = in[0]; | |
491 | out = (vector unsigned char *) (oplanes[0] + srcSliceY * outstrides[0]); | |
492 | ||
493 | for (i = 0; i < h; i++) | |
494 | for (j = 0; j < w / 16; j++) { | |
495 | uyvy = vec_ld(0, img); | |
496 | ||
497 | U = (vector signed short) | |
498 | vec_perm(uyvy, (vector unsigned char) { 0 }, demux_u); | |
499 | V = (vector signed short) | |
500 | vec_perm(uyvy, (vector unsigned char) { 0 }, demux_v); | |
501 | Y = (vector signed short) | |
502 | vec_perm(uyvy, (vector unsigned char) { 0 }, demux_y); | |
503 | ||
504 | cvtyuvtoRGB(c, Y, U, V, &R0, &G0, &B0); | |
505 | ||
506 | uyvy = vec_ld(16, img); | |
507 | ||
508 | U = (vector signed short) | |
509 | vec_perm(uyvy, (vector unsigned char) { 0 }, demux_u); | |
510 | V = (vector signed short) | |
511 | vec_perm(uyvy, (vector unsigned char) { 0 }, demux_v); | |
512 | Y = (vector signed short) | |
513 | vec_perm(uyvy, (vector unsigned char) { 0 }, demux_y); | |
514 | ||
515 | cvtyuvtoRGB(c, Y, U, V, &R1, &G1, &B1); | |
516 | ||
517 | R = vec_packclp(R0, R1); | |
518 | G = vec_packclp(G0, G1); | |
519 | B = vec_packclp(B0, B1); | |
520 | ||
521 | // vec_mstbgr24 (R,G,B, out); | |
522 | out_rgba(R, G, B, out); | |
523 | ||
524 | img += 32; | |
525 | } | |
526 | return srcSliceH; | |
527 | } | |
528 | ||
529 | #endif /* HAVE_ALTIVEC */ | |
530 | ||
531 | /* Ok currently the acceleration routine only supports | |
532 | * inputs of widths a multiple of 16 | |
533 | * and heights a multiple 2 | |
534 | * | |
535 | * So we just fall back to the C codes for this. | |
536 | */ | |
537 | av_cold SwsFunc ff_yuv2rgb_init_ppc(SwsContext *c) | |
538 | { | |
539 | #if HAVE_ALTIVEC | |
540 | if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC)) | |
541 | return NULL; | |
542 | ||
543 | /* | |
544 | * and this seems not to matter too much I tried a bunch of | |
545 | * videos with abnormal widths and MPlayer crashes elsewhere. | |
546 | * mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv | |
547 | * boom with X11 bad match. | |
548 | * | |
549 | */ | |
550 | if ((c->srcW & 0xf) != 0) | |
551 | return NULL; | |
552 | ||
553 | switch (c->srcFormat) { | |
554 | case AV_PIX_FMT_YUV410P: | |
555 | case AV_PIX_FMT_YUV420P: | |
556 | /*case IMGFMT_CLPL: ??? */ | |
557 | case AV_PIX_FMT_GRAY8: | |
558 | case AV_PIX_FMT_NV12: | |
559 | case AV_PIX_FMT_NV21: | |
560 | if ((c->srcH & 0x1) != 0) | |
561 | return NULL; | |
562 | ||
563 | switch (c->dstFormat) { | |
564 | case AV_PIX_FMT_RGB24: | |
565 | av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n"); | |
566 | return altivec_yuv2_rgb24; | |
567 | case AV_PIX_FMT_BGR24: | |
568 | av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n"); | |
569 | return altivec_yuv2_bgr24; | |
570 | case AV_PIX_FMT_ARGB: | |
571 | av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n"); | |
572 | return altivec_yuv2_argb; | |
573 | case AV_PIX_FMT_ABGR: | |
574 | av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n"); | |
575 | return altivec_yuv2_abgr; | |
576 | case AV_PIX_FMT_RGBA: | |
577 | av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n"); | |
578 | return altivec_yuv2_rgba; | |
579 | case AV_PIX_FMT_BGRA: | |
580 | av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n"); | |
581 | return altivec_yuv2_bgra; | |
582 | default: return NULL; | |
583 | } | |
584 | break; | |
585 | ||
586 | case AV_PIX_FMT_UYVY422: | |
587 | switch (c->dstFormat) { | |
588 | case AV_PIX_FMT_BGR32: | |
589 | av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n"); | |
590 | return altivec_uyvy_rgb32; | |
591 | default: return NULL; | |
592 | } | |
593 | break; | |
594 | } | |
595 | #endif /* HAVE_ALTIVEC */ | |
596 | ||
597 | return NULL; | |
598 | } | |
599 | ||
600 | av_cold void ff_yuv2rgb_init_tables_ppc(SwsContext *c, | |
601 | const int inv_table[4], | |
602 | int brightness, | |
603 | int contrast, | |
604 | int saturation) | |
605 | { | |
606 | #if HAVE_ALTIVEC | |
607 | union { | |
608 | DECLARE_ALIGNED(16, signed short, tmp)[8]; | |
609 | vector signed short vec; | |
610 | } buf; | |
611 | ||
612 | if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC)) | |
613 | return; | |
614 | ||
615 | buf.tmp[0] = ((0xffffLL) * contrast >> 8) >> 9; // cy | |
616 | buf.tmp[1] = -256 * brightness; // oy | |
617 | buf.tmp[2] = (inv_table[0] >> 3) * (contrast >> 16) * (saturation >> 16); // crv | |
618 | buf.tmp[3] = (inv_table[1] >> 3) * (contrast >> 16) * (saturation >> 16); // cbu | |
619 | buf.tmp[4] = -((inv_table[2] >> 1) * (contrast >> 16) * (saturation >> 16)); // cgu | |
620 | buf.tmp[5] = -((inv_table[3] >> 1) * (contrast >> 16) * (saturation >> 16)); // cgv | |
621 | ||
622 | c->CSHIFT = (vector unsigned short) vec_splat_u16(2); | |
623 | c->CY = vec_splat((vector signed short) buf.vec, 0); | |
624 | c->OY = vec_splat((vector signed short) buf.vec, 1); | |
625 | c->CRV = vec_splat((vector signed short) buf.vec, 2); | |
626 | c->CBU = vec_splat((vector signed short) buf.vec, 3); | |
627 | c->CGU = vec_splat((vector signed short) buf.vec, 4); | |
628 | c->CGV = vec_splat((vector signed short) buf.vec, 5); | |
629 | return; | |
630 | #endif /* HAVE_ALTIVEC */ | |
631 | } | |
632 | ||
633 | #if HAVE_ALTIVEC | |
634 | ||
635 | static av_always_inline void yuv2packedX_altivec(SwsContext *c, | |
636 | const int16_t *lumFilter, | |
637 | const int16_t **lumSrc, | |
638 | int lumFilterSize, | |
639 | const int16_t *chrFilter, | |
640 | const int16_t **chrUSrc, | |
641 | const int16_t **chrVSrc, | |
642 | int chrFilterSize, | |
643 | const int16_t **alpSrc, | |
644 | uint8_t *dest, | |
645 | int dstW, int dstY, | |
646 | enum AVPixelFormat target) | |
647 | { | |
648 | int i, j; | |
649 | vector signed short X, X0, X1, Y0, U0, V0, Y1, U1, V1, U, V; | |
650 | vector signed short R0, G0, B0, R1, G1, B1; | |
651 | ||
652 | vector unsigned char R, G, B; | |
653 | vector unsigned char *out, *nout; | |
654 | ||
655 | vector signed short RND = vec_splat_s16(1 << 3); | |
656 | vector unsigned short SCL = vec_splat_u16(4); | |
657 | DECLARE_ALIGNED(16, unsigned int, scratch)[16]; | |
658 | ||
659 | vector signed short *YCoeffs, *CCoeffs; | |
660 | ||
661 | YCoeffs = c->vYCoeffsBank + dstY * lumFilterSize; | |
662 | CCoeffs = c->vCCoeffsBank + dstY * chrFilterSize; | |
663 | ||
664 | out = (vector unsigned char *) dest; | |
665 | ||
666 | for (i = 0; i < dstW; i += 16) { | |
667 | Y0 = RND; | |
668 | Y1 = RND; | |
669 | /* extract 16 coeffs from lumSrc */ | |
670 | for (j = 0; j < lumFilterSize; j++) { | |
671 | X0 = vec_ld(0, &lumSrc[j][i]); | |
672 | X1 = vec_ld(16, &lumSrc[j][i]); | |
673 | Y0 = vec_mradds(X0, YCoeffs[j], Y0); | |
674 | Y1 = vec_mradds(X1, YCoeffs[j], Y1); | |
675 | } | |
676 | ||
677 | U = RND; | |
678 | V = RND; | |
679 | /* extract 8 coeffs from U,V */ | |
680 | for (j = 0; j < chrFilterSize; j++) { | |
681 | X = vec_ld(0, &chrUSrc[j][i / 2]); | |
682 | U = vec_mradds(X, CCoeffs[j], U); | |
683 | X = vec_ld(0, &chrVSrc[j][i / 2]); | |
684 | V = vec_mradds(X, CCoeffs[j], V); | |
685 | } | |
686 | ||
687 | /* scale and clip signals */ | |
688 | Y0 = vec_sra(Y0, SCL); | |
689 | Y1 = vec_sra(Y1, SCL); | |
690 | U = vec_sra(U, SCL); | |
691 | V = vec_sra(V, SCL); | |
692 | ||
693 | Y0 = vec_clip_s16(Y0); | |
694 | Y1 = vec_clip_s16(Y1); | |
695 | U = vec_clip_s16(U); | |
696 | V = vec_clip_s16(V); | |
697 | ||
698 | /* now we have | |
699 | * Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15 | |
700 | * U = u0 u1 u2 u3 u4 u5 u6 u7 V = v0 v1 v2 v3 v4 v5 v6 v7 | |
701 | * | |
702 | * Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15 | |
703 | * U0 = u0 u0 u1 u1 u2 u2 u3 u3 U1 = u4 u4 u5 u5 u6 u6 u7 u7 | |
704 | * V0 = v0 v0 v1 v1 v2 v2 v3 v3 V1 = v4 v4 v5 v5 v6 v6 v7 v7 | |
705 | */ | |
706 | ||
707 | U0 = vec_mergeh(U, U); | |
708 | V0 = vec_mergeh(V, V); | |
709 | ||
710 | U1 = vec_mergel(U, U); | |
711 | V1 = vec_mergel(V, V); | |
712 | ||
713 | cvtyuvtoRGB(c, Y0, U0, V0, &R0, &G0, &B0); | |
714 | cvtyuvtoRGB(c, Y1, U1, V1, &R1, &G1, &B1); | |
715 | ||
716 | R = vec_packclp(R0, R1); | |
717 | G = vec_packclp(G0, G1); | |
718 | B = vec_packclp(B0, B1); | |
719 | ||
720 | switch (target) { | |
721 | case AV_PIX_FMT_ABGR: | |
722 | out_abgr(R, G, B, out); | |
723 | break; | |
724 | case AV_PIX_FMT_BGRA: | |
725 | out_bgra(R, G, B, out); | |
726 | break; | |
727 | case AV_PIX_FMT_RGBA: | |
728 | out_rgba(R, G, B, out); | |
729 | break; | |
730 | case AV_PIX_FMT_ARGB: | |
731 | out_argb(R, G, B, out); | |
732 | break; | |
733 | case AV_PIX_FMT_RGB24: | |
734 | out_rgb24(R, G, B, out); | |
735 | break; | |
736 | case AV_PIX_FMT_BGR24: | |
737 | out_bgr24(R, G, B, out); | |
738 | break; | |
739 | default: | |
740 | { | |
741 | /* If this is reached, the caller should have called yuv2packedXinC | |
742 | * instead. */ | |
743 | static int printed_error_message; | |
744 | if (!printed_error_message) { | |
745 | av_log(c, AV_LOG_ERROR, | |
746 | "altivec_yuv2packedX doesn't support %s output\n", | |
747 | av_get_pix_fmt_name(c->dstFormat)); | |
748 | printed_error_message = 1; | |
749 | } | |
750 | return; | |
751 | } | |
752 | } | |
753 | } | |
754 | ||
755 | if (i < dstW) { | |
756 | i -= 16; | |
757 | ||
758 | Y0 = RND; | |
759 | Y1 = RND; | |
760 | /* extract 16 coeffs from lumSrc */ | |
761 | for (j = 0; j < lumFilterSize; j++) { | |
762 | X0 = vec_ld(0, &lumSrc[j][i]); | |
763 | X1 = vec_ld(16, &lumSrc[j][i]); | |
764 | Y0 = vec_mradds(X0, YCoeffs[j], Y0); | |
765 | Y1 = vec_mradds(X1, YCoeffs[j], Y1); | |
766 | } | |
767 | ||
768 | U = RND; | |
769 | V = RND; | |
770 | /* extract 8 coeffs from U,V */ | |
771 | for (j = 0; j < chrFilterSize; j++) { | |
772 | X = vec_ld(0, &chrUSrc[j][i / 2]); | |
773 | U = vec_mradds(X, CCoeffs[j], U); | |
774 | X = vec_ld(0, &chrVSrc[j][i / 2]); | |
775 | V = vec_mradds(X, CCoeffs[j], V); | |
776 | } | |
777 | ||
778 | /* scale and clip signals */ | |
779 | Y0 = vec_sra(Y0, SCL); | |
780 | Y1 = vec_sra(Y1, SCL); | |
781 | U = vec_sra(U, SCL); | |
782 | V = vec_sra(V, SCL); | |
783 | ||
784 | Y0 = vec_clip_s16(Y0); | |
785 | Y1 = vec_clip_s16(Y1); | |
786 | U = vec_clip_s16(U); | |
787 | V = vec_clip_s16(V); | |
788 | ||
789 | /* now we have | |
790 | * Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15 | |
791 | * U = u0 u1 u2 u3 u4 u5 u6 u7 V = v0 v1 v2 v3 v4 v5 v6 v7 | |
792 | * | |
793 | * Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15 | |
794 | * U0 = u0 u0 u1 u1 u2 u2 u3 u3 U1 = u4 u4 u5 u5 u6 u6 u7 u7 | |
795 | * V0 = v0 v0 v1 v1 v2 v2 v3 v3 V1 = v4 v4 v5 v5 v6 v6 v7 v7 | |
796 | */ | |
797 | ||
798 | U0 = vec_mergeh(U, U); | |
799 | V0 = vec_mergeh(V, V); | |
800 | ||
801 | U1 = vec_mergel(U, U); | |
802 | V1 = vec_mergel(V, V); | |
803 | ||
804 | cvtyuvtoRGB(c, Y0, U0, V0, &R0, &G0, &B0); | |
805 | cvtyuvtoRGB(c, Y1, U1, V1, &R1, &G1, &B1); | |
806 | ||
807 | R = vec_packclp(R0, R1); | |
808 | G = vec_packclp(G0, G1); | |
809 | B = vec_packclp(B0, B1); | |
810 | ||
811 | nout = (vector unsigned char *) scratch; | |
812 | switch (target) { | |
813 | case AV_PIX_FMT_ABGR: | |
814 | out_abgr(R, G, B, nout); | |
815 | break; | |
816 | case AV_PIX_FMT_BGRA: | |
817 | out_bgra(R, G, B, nout); | |
818 | break; | |
819 | case AV_PIX_FMT_RGBA: | |
820 | out_rgba(R, G, B, nout); | |
821 | break; | |
822 | case AV_PIX_FMT_ARGB: | |
823 | out_argb(R, G, B, nout); | |
824 | break; | |
825 | case AV_PIX_FMT_RGB24: | |
826 | out_rgb24(R, G, B, nout); | |
827 | break; | |
828 | case AV_PIX_FMT_BGR24: | |
829 | out_bgr24(R, G, B, nout); | |
830 | break; | |
831 | default: | |
832 | /* Unreachable, I think. */ | |
833 | av_log(c, AV_LOG_ERROR, | |
834 | "altivec_yuv2packedX doesn't support %s output\n", | |
835 | av_get_pix_fmt_name(c->dstFormat)); | |
836 | return; | |
837 | } | |
838 | ||
839 | memcpy(&((uint32_t *) dest)[i], scratch, (dstW - i) / 4); | |
840 | } | |
841 | } | |
842 | ||
/*
 * YUV2PACKEDX_WRAPPER(suffix, pixfmt) defines the non-static function
 * ff_yuv2<suffix>_X_altivec(), which forwards all of its arguments to
 * yuv2packedX_altivec() with pixfmt as the constant target format.
 */
#define YUV2PACKEDX_WRAPPER(suffix, pixfmt)                                   \
void ff_yuv2 ## suffix ## _X_altivec(SwsContext *c,                           \
                                     const int16_t *lumFilter,                \
                                     const int16_t **lumSrc,                  \
                                     int lumFilterSize,                       \
                                     const int16_t *chrFilter,                \
                                     const int16_t **chrUSrc,                 \
                                     const int16_t **chrVSrc,                 \
                                     int chrFilterSize,                       \
                                     const int16_t **alpSrc,                  \
                                     uint8_t *dest, int dstW, int dstY)       \
{                                                                             \
    yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize,                  \
                        chrFilter, chrUSrc, chrVSrc,                          \
                        chrFilterSize, alpSrc,                                \
                        dest, dstW, dstY, pixfmt);                            \
}
860 | ||
861 | YUV2PACKEDX_WRAPPER(abgr, AV_PIX_FMT_ABGR); | |
862 | YUV2PACKEDX_WRAPPER(bgra, AV_PIX_FMT_BGRA); | |
863 | YUV2PACKEDX_WRAPPER(argb, AV_PIX_FMT_ARGB); | |
864 | YUV2PACKEDX_WRAPPER(rgba, AV_PIX_FMT_RGBA); | |
865 | YUV2PACKEDX_WRAPPER(rgb24, AV_PIX_FMT_RGB24); | |
866 | YUV2PACKEDX_WRAPPER(bgr24, AV_PIX_FMT_BGR24); | |
867 | ||
868 | #endif /* HAVE_ALTIVEC */ |