/*
 * AltiVec-enhanced yuv2yuvX
 *
 * Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
 * based on the equivalent C code in swscale.c
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <inttypes.h>

#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "yuv2rgb_altivec.h"

34 #define vzero vec_splat_s32(0)
/*
 * Multiply-accumulate eight consecutive 16-bit samples of one source row
 * into two 4 x int32 accumulators.
 *
 *   d1, d2 - accumulators for output samples 0-3 and 4-7 (updated in place)
 *   l1     - aligned vector already loaded from src at byte offset (x << 1);
 *            on exit it holds the next aligned vector, so that a second
 *            invocation for x + 8 can reuse it instead of reloading
 *   src    - source row (int16_t samples)
 *   x      - index of the first sample to process
 *   perm   - vec_lvsl() permute vector used to realign the l1:l2 pair
 *   filter - vector holding the filter coefficient in every element
 *
 * vec_mule/vec_mulo produce the products of the even and odd elements
 * separately; vec_mergeh/vec_mergel interleave them back into sample order.
 */
#define yuv2planeX_8(d1, d2, l1, src, x, perm, filter) do {     \
        vector signed short l2  = vec_ld(((x) << 1) + 16, src); \
        vector signed short ls  = vec_perm(l1, l2, perm);       \
        vector signed int   i1  = vec_mule(filter, ls);         \
        vector signed int   i2  = vec_mulo(filter, ls);         \
        vector signed int   vf1 = vec_mergeh(i1, i2);           \
        vector signed int   vf2 = vec_mergel(i1, i2);           \
        d1 = vec_add(d1, vf1);                                  \
        d2 = vec_add(d2, vf2);                                  \
        l1 = l2;                                                \
    } while (0)

48 static void yuv2planeX_16_altivec(const int16_t *filter
, int filterSize
,
49 const int16_t **src
, uint8_t *dest
,
50 const uint8_t *dither
, int offset
, int x
)
53 DECLARE_ALIGNED(16, int, val
)[16];
54 vector
signed int vo1
, vo2
, vo3
, vo4
;
55 vector
unsigned short vs1
, vs2
;
56 vector
unsigned char vf
;
57 vector
unsigned int altivec_vectorShiftInt19
=
58 vec_add(vec_splat_u32(10), vec_splat_u32(9));
60 for (i
= 0; i
< 16; i
++)
61 val
[i
] = dither
[(x
+ i
+ offset
) & 7] << 12;
64 vo2
= vec_ld(16, val
);
65 vo3
= vec_ld(32, val
);
66 vo4
= vec_ld(48, val
);
68 for (j
= 0; j
< filterSize
; j
++) {
69 vector
signed short l1
, vLumFilter
= vec_ld(j
<< 1, filter
);
70 vector
unsigned char perm
, perm0
= vec_lvsl(j
<< 1, filter
);
71 vLumFilter
= vec_perm(vLumFilter
, vLumFilter
, perm0
);
72 vLumFilter
= vec_splat(vLumFilter
, 0); // lumFilter[j] is loaded 8 times in vLumFilter
74 perm
= vec_lvsl(x
<< 1, src
[j
]);
75 l1
= vec_ld(x
<< 1, src
[j
]);
77 yuv2planeX_8(vo1
, vo2
, l1
, src
[j
], x
, perm
, vLumFilter
);
78 yuv2planeX_8(vo3
, vo4
, l1
, src
[j
], x
+ 8, perm
, vLumFilter
);
81 vo1
= vec_sra(vo1
, altivec_vectorShiftInt19
);
82 vo2
= vec_sra(vo2
, altivec_vectorShiftInt19
);
83 vo3
= vec_sra(vo3
, altivec_vectorShiftInt19
);
84 vo4
= vec_sra(vo4
, altivec_vectorShiftInt19
);
85 vs1
= vec_packsu(vo1
, vo2
);
86 vs2
= vec_packsu(vo3
, vo4
);
87 vf
= vec_packsu(vs1
, vs2
);
/*
 * Scalar vertical filter for the pixels the vector path cannot handle.
 *
 * Computes dest[i] for every i in [x, dstW) as
 *     clip_uint8((dither[(i + offset) & 7] << 12
 *                 + sum over j of src[j][i] * filter[j]) >> 19)
 *
 *   filter     - filterSize vertical coefficients
 *   filterSize - number of source rows / coefficients
 *   src        - filterSize pointers to int16_t source rows
 *   dest       - output row (indexed with the same i as the source rows)
 *   dstW       - exclusive end index
 *   dither     - dither table, indexed modulo 8
 *   offset     - dither phase offset
 *   x          - first index to process; nothing is written when x >= dstW
 */
static inline void yuv2planeX_u(const int16_t *filter, int filterSize,
                                const int16_t **src, uint8_t *dest, int dstW,
                                const uint8_t *dither, int offset, int x)
{
    int i, j;

    for (i = x; i < dstW; i++) {
        int t = dither[(i + offset) & 7] << 12;
        for (j = 0; j < filterSize; j++)
            t += src[j][i] * filter[j];
        dest[i] = av_clip_uint8(t >> 19);
    }
}

/*
 * Vertical filter entry point: scalar head up to the first 16-byte aligned
 * destination address, AltiVec body in chunks of 16 pixels, scalar tail.
 *
 *   filter     - filterSize vertical coefficients
 *   filterSize - number of source rows / coefficients
 *   src        - filterSize pointers to int16_t source rows
 *   dest       - output row of dstW bytes (any alignment)
 *   dstW       - number of output pixels
 *   dither     - dither table, indexed modulo 8
 *   offset     - dither phase offset
 */
static void yuv2planeX_altivec(const int16_t *filter, int filterSize,
                               const int16_t **src, uint8_t *dest, int dstW,
                               const uint8_t *dither, int offset)
{
    /* Pixels to emit before dest + dst_u is 16-byte aligned. */
    int dst_u = -(uintptr_t)dest & 15;
    int i;

    /* A row shorter than the unaligned head must not be overrun. */
    if (dst_u > dstW)
        dst_u = dstW;

    yuv2planeX_u(filter, filterSize, src, dest, dst_u, dither, offset, 0);

    /* dest + i is aligned here, as the vector store requires. */
    for (i = dst_u; i < dstW - 15; i += 16)
        yuv2planeX_16_altivec(filter, filterSize, src, dest + i, dither,
                              offset, i);

    /* Whatever is left (fewer than 16 pixels) is done in scalar code. */
    yuv2planeX_u(filter, filterSize, src, dest, dstW, dither, offset, i);
}

121 static void hScale_altivec_real(SwsContext
*c
, int16_t *dst
, int dstW
,
122 const uint8_t *src
, const int16_t *filter
,
123 const int32_t *filterPos
, int filterSize
)
126 DECLARE_ALIGNED(16, int, tempo
)[4];
128 if (filterSize
% 4) {
129 for (i
= 0; i
< dstW
; i
++) {
131 register int srcPos
= filterPos
[i
];
132 register int val
= 0;
133 for (j
= 0; j
< filterSize
; j
++)
134 val
+= ((int)src
[srcPos
+ j
]) * filter
[filterSize
* i
+ j
];
135 dst
[i
] = FFMIN(val
>> 7, (1 << 15) - 1);
138 switch (filterSize
) {
140 for (i
= 0; i
< dstW
; i
++) {
141 register int srcPos
= filterPos
[i
];
143 vector
unsigned char src_v0
= vec_ld(srcPos
, src
);
144 vector
unsigned char src_v1
, src_vF
;
145 vector
signed short src_v
, filter_v
;
146 vector
signed int val_vEven
, val_s
;
147 if ((((uintptr_t)src
+ srcPos
) % 16) > 12) {
148 src_v1
= vec_ld(srcPos
+ 16, src
);
150 src_vF
= vec_perm(src_v0
, src_v1
, vec_lvsl(srcPos
, src
));
152 src_v
= // vec_unpackh sign-extends...
153 (vector
signed short)(vec_mergeh((vector
unsigned char)vzero
, src_vF
));
154 // now put our elements in the even slots
155 src_v
= vec_mergeh(src_v
, (vector
signed short)vzero
);
157 filter_v
= vec_ld(i
<< 3, filter
);
158 // The 3 above is 2 (filterSize == 4) + 1 (sizeof(short) == 2).
160 // The neat trick: We only care for half the elements,
161 // high or low depending on (i<<3)%16 (it's 0 or 8 here),
162 // and we're going to use vec_mule, so we choose
163 // carefully how to "unpack" the elements into the even slots.
165 filter_v
= vec_mergel(filter_v
, (vector
signed short)vzero
);
167 filter_v
= vec_mergeh(filter_v
, (vector
signed short)vzero
);
169 val_vEven
= vec_mule(src_v
, filter_v
);
170 val_s
= vec_sums(val_vEven
, vzero
);
171 vec_st(val_s
, 0, tempo
);
172 dst
[i
] = FFMIN(tempo
[3] >> 7, (1 << 15) - 1);
177 for (i
= 0; i
< dstW
; i
++) {
178 register int srcPos
= filterPos
[i
];
180 vector
unsigned char src_v0
= vec_ld(srcPos
, src
);
181 vector
unsigned char src_v1
, src_vF
;
182 vector
signed short src_v
, filter_v
;
183 vector
signed int val_v
, val_s
;
184 if ((((uintptr_t)src
+ srcPos
) % 16) > 8) {
185 src_v1
= vec_ld(srcPos
+ 16, src
);
187 src_vF
= vec_perm(src_v0
, src_v1
, vec_lvsl(srcPos
, src
));
189 src_v
= // vec_unpackh sign-extends...
190 (vector
signed short)(vec_mergeh((vector
unsigned char)vzero
, src_vF
));
191 filter_v
= vec_ld(i
<< 4, filter
);
192 // the 4 above is 3 (filterSize == 8) + 1 (sizeof(short) == 2)
194 val_v
= vec_msums(src_v
, filter_v
, (vector
signed int)vzero
);
195 val_s
= vec_sums(val_v
, vzero
);
196 vec_st(val_s
, 0, tempo
);
197 dst
[i
] = FFMIN(tempo
[3] >> 7, (1 << 15) - 1);
202 for (i
= 0; i
< dstW
; i
++) {
203 register int srcPos
= filterPos
[i
];
205 vector
unsigned char src_v0
= vec_ld(srcPos
, src
);
206 vector
unsigned char src_v1
= vec_ld(srcPos
+ 16, src
);
207 vector
unsigned char src_vF
= vec_perm(src_v0
, src_v1
, vec_lvsl(srcPos
, src
));
209 vector
signed short src_vA
= // vec_unpackh sign-extends...
210 (vector
signed short)(vec_mergeh((vector
unsigned char)vzero
, src_vF
));
211 vector
signed short src_vB
= // vec_unpackh sign-extends...
212 (vector
signed short)(vec_mergel((vector
unsigned char)vzero
, src_vF
));
214 vector
signed short filter_v0
= vec_ld(i
<< 5, filter
);
215 vector
signed short filter_v1
= vec_ld((i
<< 5) + 16, filter
);
216 // the 5 above are 4 (filterSize == 16) + 1 (sizeof(short) == 2)
218 vector
signed int val_acc
= vec_msums(src_vA
, filter_v0
, (vector
signed int)vzero
);
219 vector
signed int val_v
= vec_msums(src_vB
, filter_v1
, val_acc
);
221 vector
signed int val_s
= vec_sums(val_v
, vzero
);
223 vec_st(val_s
, 0, tempo
);
224 dst
[i
] = FFMIN(tempo
[3] >> 7, (1 << 15) - 1);
229 for (i
= 0; i
< dstW
; i
++) {
231 register int srcPos
= filterPos
[i
];
233 vector
signed int val_s
, val_v
= (vector
signed int)vzero
;
234 vector
signed short filter_v0R
= vec_ld(i
* 2 * filterSize
, filter
);
235 vector
unsigned char permF
= vec_lvsl((i
* 2 * filterSize
), filter
);
237 vector
unsigned char src_v0
= vec_ld(srcPos
, src
);
238 vector
unsigned char permS
= vec_lvsl(srcPos
, src
);
240 for (j
= 0; j
< filterSize
- 15; j
+= 16) {
241 vector
unsigned char src_v1
= vec_ld(srcPos
+ j
+ 16, src
);
242 vector
unsigned char src_vF
= vec_perm(src_v0
, src_v1
, permS
);
244 vector
signed short src_vA
= // vec_unpackh sign-extends...
245 (vector
signed short)(vec_mergeh((vector
unsigned char)vzero
, src_vF
));
246 vector
signed short src_vB
= // vec_unpackh sign-extends...
247 (vector
signed short)(vec_mergel((vector
unsigned char)vzero
, src_vF
));
249 vector
signed short filter_v1R
= vec_ld((i
* 2 * filterSize
) + (j
* 2) + 16, filter
);
250 vector
signed short filter_v2R
= vec_ld((i
* 2 * filterSize
) + (j
* 2) + 32, filter
);
251 vector
signed short filter_v0
= vec_perm(filter_v0R
, filter_v1R
, permF
);
252 vector
signed short filter_v1
= vec_perm(filter_v1R
, filter_v2R
, permF
);
254 vector
signed int val_acc
= vec_msums(src_vA
, filter_v0
, val_v
);
255 val_v
= vec_msums(src_vB
, filter_v1
, val_acc
);
257 filter_v0R
= filter_v2R
;
261 if (j
< filterSize
- 7) {
262 // loading src_v0 is useless, it's already done above
263 // vector unsigned char src_v0 = vec_ld(srcPos + j, src);
264 vector
unsigned char src_v1
, src_vF
;
265 vector
signed short src_v
, filter_v1R
, filter_v
;
266 if ((((uintptr_t)src
+ srcPos
) % 16) > 8) {
267 src_v1
= vec_ld(srcPos
+ j
+ 16, src
);
269 src_vF
= vec_perm(src_v0
, src_v1
, permS
);
271 src_v
= // vec_unpackh sign-extends...
272 (vector
signed short)(vec_mergeh((vector
unsigned char)vzero
, src_vF
));
273 // loading filter_v0R is useless, it's already done above
274 // vector signed short filter_v0R = vec_ld((i * 2 * filterSize) + j, filter);
275 filter_v1R
= vec_ld((i
* 2 * filterSize
) + (j
* 2) + 16, filter
);
276 filter_v
= vec_perm(filter_v0R
, filter_v1R
, permF
);
278 val_v
= vec_msums(src_v
, filter_v
, val_v
);
281 val_s
= vec_sums(val_v
, vzero
);
283 vec_st(val_s
, 0, tempo
);
284 dst
[i
] = FFMIN(tempo
[3] >> 7, (1 << 15) - 1);
288 #endif /* HAVE_ALTIVEC */
290 av_cold
void ff_sws_init_swscale_ppc(SwsContext
*c
)
293 enum AVPixelFormat dstFormat
= c
->dstFormat
;
295 if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC
))
298 if (c
->srcBpc
== 8 && c
->dstBpc
<= 14) {
299 c
->hyScale
= c
->hcScale
= hScale_altivec_real
;
301 if (!is16BPS(dstFormat
) && !is9_OR_10BPS(dstFormat
) &&
302 dstFormat
!= AV_PIX_FMT_NV12
&& dstFormat
!= AV_PIX_FMT_NV21
&&
304 c
->yuv2planeX
= yuv2planeX_altivec
;
307 /* The following list of supported dstFormat values should
308 * match what's found in the body of ff_yuv2packedX_altivec() */
309 if (!(c
->flags
& (SWS_BITEXACT
| SWS_FULL_CHR_H_INT
)) && !c
->alpPixBuf
) {
310 switch (c
->dstFormat
) {
311 case AV_PIX_FMT_ABGR
:
312 c
->yuv2packedX
= ff_yuv2abgr_X_altivec
;
314 case AV_PIX_FMT_BGRA
:
315 c
->yuv2packedX
= ff_yuv2bgra_X_altivec
;
317 case AV_PIX_FMT_ARGB
:
318 c
->yuv2packedX
= ff_yuv2argb_X_altivec
;
320 case AV_PIX_FMT_RGBA
:
321 c
->yuv2packedX
= ff_yuv2rgba_X_altivec
;
323 case AV_PIX_FMT_BGR24
:
324 c
->yuv2packedX
= ff_yuv2bgr24_X_altivec
;
326 case AV_PIX_FMT_RGB24
:
327 c
->yuv2packedX
= ff_yuv2rgb24_X_altivec
;
331 #endif /* HAVE_ALTIVEC */