2 * Copyright (C) 2013 Wei Gao <weigao@multicorewareinc.com>
3 * Copyright (C) 2013 Lenny Wang
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 #ifndef AVFILTER_UNSHARP_OPENCL_KERNEL_H
23 #define AVFILTER_UNSHARP_OPENCL_KERNEL_H
25 #include "libavutil/opencl.h"
27 const char *ff_kernel_unsharp_opencl
= AV_OPENCL_KERNEL(
28 inline unsigned char clip_uint8(int a
)
36 kernel
void unsharp_luma(
37 global
unsigned char *src
,
38 global
unsigned char *dst
,
48 int2 threadIdx
, blockIdx
, globalIdx
;
49 threadIdx
.x
= get_local_id(0);
50 threadIdx
.y
= get_local_id(1);
51 blockIdx
.x
= get_group_id(0);
52 blockIdx
.y
= get_group_id(1);
53 globalIdx
.x
= get_global_id(0);
54 globalIdx
.y
= get_global_id(1);
57 if (globalIdx
.x
< width
&& globalIdx
.y
< height
)
58 dst
[globalIdx
.x
+ globalIdx
.y
*dst_stride
] = src
[globalIdx
.x
+ globalIdx
.y
*src_stride
];
62 local uchar l
[32][32];
63 local
int lc
[LU_RADIUS_X
*LU_RADIUS_Y
];
64 int indexIx
, indexIy
, i
, j
;
66 for(i
= 0; i
<= 1; i
++) {
67 indexIy
= -8 + (blockIdx
.y
+ i
) * 16 + threadIdx
.y
;
68 indexIy
= indexIy
< 0 ? 0 : indexIy
;
69 indexIy
= indexIy
>= height
? height
- 1: indexIy
;
70 for(j
= 0; j
<= 1; j
++) {
71 indexIx
= -8 + (blockIdx
.x
+ j
) * 16 + threadIdx
.x
;
72 indexIx
= indexIx
< 0 ? 0 : indexIx
;
73 indexIx
= indexIx
>= width
? width
- 1: indexIx
;
74 l
[i
*16 + threadIdx
.y
][j
*16 + threadIdx
.x
] = src
[indexIy
*src_stride
+ indexIx
];
78 int indexL
= threadIdx
.y
*16 + threadIdx
.x
;
79 if (indexL
< LU_RADIUS_X
*LU_RADIUS_Y
)
80 lc
[indexL
] = mask
[indexL
];
81 barrier(CLK_LOCAL_MEM_FENCE
);
83 int idx
, idy
, maskIndex
;
85 int steps_x
= LU_RADIUS_X
/2;
86 int steps_y
= LU_RADIUS_Y
/2;
89 for (i
= -steps_y
; i
<= steps_y
; i
++) {
90 idy
= 8 + i
+ threadIdx
.y
;
92 for (j
= -steps_x
; j
<= steps_x
; j
++) {
93 idx
= 8 + j
+ threadIdx
.x
;
94 maskIndex
= (i
+ steps_y
)*LU_RADIUS_X
+ j
+ steps_x
;
95 sum
+= (int)l
[idy
][idx
] * lc
[maskIndex
];
98 int temp
= (int)l
[threadIdx
.y
+ 8][threadIdx
.x
+ 8];
99 int res
= temp
+ (((temp
- (int)((sum
+ halfscale
) >> scalebits
)) * amount
) >> 16);
100 if (globalIdx
.x
< width
&& globalIdx
.y
< height
)
101 dst
[globalIdx
.x
+ globalIdx
.y
*dst_stride
] = clip_uint8(res
);
104 kernel
void unsharp_chroma(
105 global
unsigned char *src_y
,
106 global
unsigned char *dst_y
,
120 global
unsigned char *dst_u
= dst_y
+ height
* dst_stride_lu
;
121 global
unsigned char *dst_v
= dst_u
+ ch
* dst_stride_ch
;
122 global
unsigned char *src_u
= src_y
+ height
* src_stride_lu
;
123 global
unsigned char *src_v
= src_u
+ ch
* src_stride_ch
;
124 int2 threadIdx
, blockIdx
, globalIdx
;
125 threadIdx
.x
= get_local_id(0);
126 threadIdx
.y
= get_local_id(1);
127 blockIdx
.x
= get_group_id(0);
128 blockIdx
.y
= get_group_id(1);
129 globalIdx
.x
= get_global_id(0);
130 globalIdx
.y
= get_global_id(1);
131 int padch
= get_global_size(1)/2;
132 global
unsigned char *src
= globalIdx
.y
>=padch
? src_v
: src_u
;
133 global
unsigned char *dst
= globalIdx
.y
>=padch
? dst_v
: dst_u
;
135 blockIdx
.y
= globalIdx
.y
>=padch
? blockIdx
.y
- get_num_groups(1)/2 : blockIdx
.y
;
136 globalIdx
.y
= globalIdx
.y
>=padch
? globalIdx
.y
- padch
: globalIdx
.y
;
139 if (globalIdx
.x
< cw
&& globalIdx
.y
< ch
)
140 dst
[globalIdx
.x
+ globalIdx
.y
*dst_stride_ch
] = src
[globalIdx
.x
+ globalIdx
.y
*src_stride_ch
];
144 local uchar l
[32][32];
145 local
int lc
[CH_RADIUS_X
*CH_RADIUS_Y
];
146 int indexIx
, indexIy
, i
, j
;
147 for(i
= 0; i
<= 1; i
++) {
148 indexIy
= -8 + (blockIdx
.y
+ i
) * 16 + threadIdx
.y
;
149 indexIy
= indexIy
< 0 ? 0 : indexIy
;
150 indexIy
= indexIy
>= ch
? ch
- 1: indexIy
;
151 for(j
= 0; j
<= 1; j
++) {
152 indexIx
= -8 + (blockIdx
.x
+ j
) * 16 + threadIdx
.x
;
153 indexIx
= indexIx
< 0 ? 0 : indexIx
;
154 indexIx
= indexIx
>= cw
? cw
- 1: indexIx
;
155 l
[i
*16 + threadIdx
.y
][j
*16 + threadIdx
.x
] = src
[indexIy
* src_stride_ch
+ indexIx
];
159 int indexL
= threadIdx
.y
*16 + threadIdx
.x
;
160 if (indexL
< CH_RADIUS_X
*CH_RADIUS_Y
)
161 lc
[indexL
] = mask
[indexL
];
162 barrier(CLK_LOCAL_MEM_FENCE
);
164 int idx
, idy
, maskIndex
;
166 int steps_x
= CH_RADIUS_X
/2;
167 int steps_y
= CH_RADIUS_Y
/2;
170 for (i
= -steps_y
; i
<= steps_y
; i
++) {
171 idy
= 8 + i
+ threadIdx
.y
;
173 for (j
= -steps_x
; j
<= steps_x
; j
++) {
174 idx
= 8 + j
+ threadIdx
.x
;
175 maskIndex
= (i
+ steps_y
)*CH_RADIUS_X
+ j
+ steps_x
;
176 sum
+= (int)l
[idy
][idx
] * lc
[maskIndex
];
179 int temp
= (int)l
[threadIdx
.y
+ 8][threadIdx
.x
+ 8];
180 int res
= temp
+ (((temp
- (int)((sum
+ halfscale
) >> scalebits
)) * amount
) >> 16);
181 if (globalIdx
.x
< cw
&& globalIdx
.y
< ch
)
182 dst
[globalIdx
.x
+ globalIdx
.y
*dst_stride_ch
] = clip_uint8(res
);
185 kernel
void unsharp_default(global
unsigned char *src
,
186 global
unsigned char *dst
,
187 const global
unsigned int *mask_lu
,
188 const global
unsigned int *mask_ch
,
208 global
unsigned char *dst_y
= dst
;
209 global
unsigned char *dst_u
= dst_y
+ height
* dst_stride_lu
;
210 global
unsigned char *dst_v
= dst_u
+ ch
* dst_stride_ch
;
212 global
unsigned char *src_y
= src
;
213 global
unsigned char *src_u
= src_y
+ height
* src_stride_lu
;
214 global
unsigned char *src_v
= src_u
+ ch
* src_stride_ch
;
216 global
unsigned char *temp_dst
;
217 global
unsigned char *temp_src
;
218 const global
unsigned int *temp_mask
;
219 int global_id
= get_global_id(0);
220 int i
, j
, x
, y
, temp_src_stride
, temp_dst_stride
, temp_height
, temp_width
, temp_steps_x
, temp_steps_y
,
221 temp_amount
, temp_scalebits
, temp_halfscale
, sum
, idx_x
, idx_y
, temp
, res
;
222 if (global_id
< width
* height
) {
223 y
= global_id
/ width
;
224 x
= global_id
% width
;
227 temp_src_stride
= src_stride_lu
;
228 temp_dst_stride
= dst_stride_lu
;
229 temp_height
= height
;
231 temp_steps_x
= step_x_lu
;
232 temp_steps_y
= step_y_lu
;
234 temp_amount
= amount_lu
;
235 temp_scalebits
= scalebits_lu
;
236 temp_halfscale
= halfscale_lu
;
237 } else if ((global_id
>= width
* height
) && (global_id
< width
* height
+ ch
* cw
)) {
238 y
= (global_id
- width
* height
) / cw
;
239 x
= (global_id
- width
* height
) % cw
;
242 temp_src_stride
= src_stride_ch
;
243 temp_dst_stride
= dst_stride_ch
;
246 temp_steps_x
= step_x_ch
;
247 temp_steps_y
= step_y_ch
;
249 temp_amount
= amount_ch
;
250 temp_scalebits
= scalebits_ch
;
251 temp_halfscale
= halfscale_ch
;
253 y
= (global_id
- width
* height
- ch
* cw
) / cw
;
254 x
= (global_id
- width
* height
- ch
* cw
) % cw
;
257 temp_src_stride
= src_stride_ch
;
258 temp_dst_stride
= dst_stride_ch
;
261 temp_steps_x
= step_x_ch
;
262 temp_steps_y
= step_y_ch
;
264 temp_amount
= amount_ch
;
265 temp_scalebits
= scalebits_ch
;
266 temp_halfscale
= halfscale_ch
;
270 for (j
= 0; j
<= 2 * temp_steps_y
; j
++) {
271 idx_y
= (y
- temp_steps_y
+ j
) <= 0 ? 0 : (y
- temp_steps_y
+ j
) >= temp_height
? temp_height
-1 : y
- temp_steps_y
+ j
;
272 for (i
= 0; i
<= 2 * temp_steps_x
; i
++) {
273 idx_x
= (x
- temp_steps_x
+ i
) <= 0 ? 0 : (x
- temp_steps_x
+ i
) >= temp_width
? temp_width
-1 : x
- temp_steps_x
+ i
;
274 sum
+= temp_mask
[i
+ j
* (2 * temp_steps_x
+ 1)] * temp_src
[idx_x
+ idx_y
* temp_src_stride
];
277 temp
= (int)temp_src
[x
+ y
* temp_src_stride
];
278 res
= temp
+ (((temp
- (int)((sum
+ temp_halfscale
) >> temp_scalebits
)) * temp_amount
) >> 16);
279 temp_dst
[x
+ y
* temp_dst_stride
] = clip_uint8(res
);
281 temp_dst
[x
+ y
* temp_dst_stride
] = temp_src
[x
+ y
* temp_src_stride
];
286 #endif /* AVFILTER_UNSHARP_OPENCL_KERNEL_H */