Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | /* |
2 | * Copyright (C) 2013 Wei Gao <weigao@multicorewareinc.com> | |
3 | * Copyright (C) 2013 Lenny Wang | |
4 | * | |
5 | * This file is part of FFmpeg. | |
6 | * | |
7 | * FFmpeg is free software; you can redistribute it and/or | |
8 | * modify it under the terms of the GNU Lesser General Public | |
9 | * License as published by the Free Software Foundation; either | |
10 | * version 2.1 of the License, or (at your option) any later version. | |
11 | * | |
12 | * FFmpeg is distributed in the hope that it will be useful, | |
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 | * Lesser General Public License for more details. | |
16 | * | |
17 | * You should have received a copy of the GNU Lesser General Public | |
18 | * License along with FFmpeg; if not, write to the Free Software | |
19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
20 | */ | |
21 | ||
22 | #ifndef AVFILTER_UNSHARP_OPENCL_KERNEL_H | |
23 | #define AVFILTER_UNSHARP_OPENCL_KERNEL_H | |
24 | ||
25 | #include "libavutil/opencl.h" | |
26 | ||
27 | const char *ff_kernel_unsharp_opencl = AV_OPENCL_KERNEL( | |
28 | inline unsigned char clip_uint8(int a) | |
29 | { | |
30 | if (a & (~0xFF)) | |
31 | return (-a)>>31; | |
32 | else | |
33 | return a; | |
34 | } | |
35 | ||
36 | kernel void unsharp_luma( | |
37 | global unsigned char *src, | |
38 | global unsigned char *dst, | |
39 | global int *mask, | |
40 | int amount, | |
41 | int scalebits, | |
42 | int halfscale, | |
43 | int src_stride, | |
44 | int dst_stride, | |
45 | int width, | |
46 | int height) | |
47 | { | |
48 | int2 threadIdx, blockIdx, globalIdx; | |
49 | threadIdx.x = get_local_id(0); | |
50 | threadIdx.y = get_local_id(1); | |
51 | blockIdx.x = get_group_id(0); | |
52 | blockIdx.y = get_group_id(1); | |
53 | globalIdx.x = get_global_id(0); | |
54 | globalIdx.y = get_global_id(1); | |
55 | ||
56 | if (!amount) { | |
57 | if (globalIdx.x < width && globalIdx.y < height) | |
58 | dst[globalIdx.x + globalIdx.y*dst_stride] = src[globalIdx.x + globalIdx.y*src_stride]; | |
59 | return; | |
60 | } | |
61 | ||
62 | local uchar l[32][32]; | |
63 | local int lc[LU_RADIUS_X*LU_RADIUS_Y]; | |
64 | int indexIx, indexIy, i, j; | |
65 | ||
66 | for(i = 0; i <= 1; i++) { | |
67 | indexIy = -8 + (blockIdx.y + i) * 16 + threadIdx.y; | |
68 | indexIy = indexIy < 0 ? 0 : indexIy; | |
69 | indexIy = indexIy >= height ? height - 1: indexIy; | |
70 | for(j = 0; j <= 1; j++) { | |
71 | indexIx = -8 + (blockIdx.x + j) * 16 + threadIdx.x; | |
72 | indexIx = indexIx < 0 ? 0 : indexIx; | |
73 | indexIx = indexIx >= width ? width - 1: indexIx; | |
74 | l[i*16 + threadIdx.y][j*16 + threadIdx.x] = src[indexIy*src_stride + indexIx]; | |
75 | } | |
76 | } | |
77 | ||
78 | int indexL = threadIdx.y*16 + threadIdx.x; | |
79 | if (indexL < LU_RADIUS_X*LU_RADIUS_Y) | |
80 | lc[indexL] = mask[indexL]; | |
81 | barrier(CLK_LOCAL_MEM_FENCE); | |
82 | ||
83 | int idx, idy, maskIndex; | |
84 | int sum = 0; | |
85 | int steps_x = LU_RADIUS_X/2; | |
86 | int steps_y = LU_RADIUS_Y/2; | |
87 | ||
88 | \n#pragma unroll\n | |
89 | for (i = -steps_y; i <= steps_y; i++) { | |
90 | idy = 8 + i + threadIdx.y; | |
91 | \n#pragma unroll\n | |
92 | for (j = -steps_x; j <= steps_x; j++) { | |
93 | idx = 8 + j + threadIdx.x; | |
94 | maskIndex = (i + steps_y)*LU_RADIUS_X + j + steps_x; | |
95 | sum += (int)l[idy][idx] * lc[maskIndex]; | |
96 | } | |
97 | } | |
98 | int temp = (int)l[threadIdx.y + 8][threadIdx.x + 8]; | |
99 | int res = temp + (((temp - (int)((sum + halfscale) >> scalebits)) * amount) >> 16); | |
100 | if (globalIdx.x < width && globalIdx.y < height) | |
101 | dst[globalIdx.x + globalIdx.y*dst_stride] = clip_uint8(res); | |
102 | } | |
103 | ||
104 | kernel void unsharp_chroma( | |
105 | global unsigned char *src_y, | |
106 | global unsigned char *dst_y, | |
107 | global int *mask, | |
108 | int amount, | |
109 | int scalebits, | |
110 | int halfscale, | |
111 | int src_stride_lu, | |
112 | int src_stride_ch, | |
113 | int dst_stride_lu, | |
114 | int dst_stride_ch, | |
115 | int width, | |
116 | int height, | |
117 | int cw, | |
118 | int ch) | |
119 | { | |
120 | global unsigned char *dst_u = dst_y + height * dst_stride_lu; | |
121 | global unsigned char *dst_v = dst_u + ch * dst_stride_ch; | |
122 | global unsigned char *src_u = src_y + height * src_stride_lu; | |
123 | global unsigned char *src_v = src_u + ch * src_stride_ch; | |
124 | int2 threadIdx, blockIdx, globalIdx; | |
125 | threadIdx.x = get_local_id(0); | |
126 | threadIdx.y = get_local_id(1); | |
127 | blockIdx.x = get_group_id(0); | |
128 | blockIdx.y = get_group_id(1); | |
129 | globalIdx.x = get_global_id(0); | |
130 | globalIdx.y = get_global_id(1); | |
131 | int padch = get_global_size(1)/2; | |
132 | global unsigned char *src = globalIdx.y>=padch ? src_v : src_u; | |
133 | global unsigned char *dst = globalIdx.y>=padch ? dst_v : dst_u; | |
134 | ||
135 | blockIdx.y = globalIdx.y>=padch ? blockIdx.y - get_num_groups(1)/2 : blockIdx.y; | |
136 | globalIdx.y = globalIdx.y>=padch ? globalIdx.y - padch : globalIdx.y; | |
137 | ||
138 | if (!amount) { | |
139 | if (globalIdx.x < cw && globalIdx.y < ch) | |
140 | dst[globalIdx.x + globalIdx.y*dst_stride_ch] = src[globalIdx.x + globalIdx.y*src_stride_ch]; | |
141 | return; | |
142 | } | |
143 | ||
144 | local uchar l[32][32]; | |
145 | local int lc[CH_RADIUS_X*CH_RADIUS_Y]; | |
146 | int indexIx, indexIy, i, j; | |
147 | for(i = 0; i <= 1; i++) { | |
148 | indexIy = -8 + (blockIdx.y + i) * 16 + threadIdx.y; | |
149 | indexIy = indexIy < 0 ? 0 : indexIy; | |
150 | indexIy = indexIy >= ch ? ch - 1: indexIy; | |
151 | for(j = 0; j <= 1; j++) { | |
152 | indexIx = -8 + (blockIdx.x + j) * 16 + threadIdx.x; | |
153 | indexIx = indexIx < 0 ? 0 : indexIx; | |
154 | indexIx = indexIx >= cw ? cw - 1: indexIx; | |
155 | l[i*16 + threadIdx.y][j*16 + threadIdx.x] = src[indexIy * src_stride_ch + indexIx]; | |
156 | } | |
157 | } | |
158 | ||
159 | int indexL = threadIdx.y*16 + threadIdx.x; | |
160 | if (indexL < CH_RADIUS_X*CH_RADIUS_Y) | |
161 | lc[indexL] = mask[indexL]; | |
162 | barrier(CLK_LOCAL_MEM_FENCE); | |
163 | ||
164 | int idx, idy, maskIndex; | |
165 | int sum = 0; | |
166 | int steps_x = CH_RADIUS_X/2; | |
167 | int steps_y = CH_RADIUS_Y/2; | |
168 | ||
169 | \n#pragma unroll\n | |
170 | for (i = -steps_y; i <= steps_y; i++) { | |
171 | idy = 8 + i + threadIdx.y; | |
172 | \n#pragma unroll\n | |
173 | for (j = -steps_x; j <= steps_x; j++) { | |
174 | idx = 8 + j + threadIdx.x; | |
175 | maskIndex = (i + steps_y)*CH_RADIUS_X + j + steps_x; | |
176 | sum += (int)l[idy][idx] * lc[maskIndex]; | |
177 | } | |
178 | } | |
179 | int temp = (int)l[threadIdx.y + 8][threadIdx.x + 8]; | |
180 | int res = temp + (((temp - (int)((sum + halfscale) >> scalebits)) * amount) >> 16); | |
181 | if (globalIdx.x < cw && globalIdx.y < ch) | |
182 | dst[globalIdx.x + globalIdx.y*dst_stride_ch] = clip_uint8(res); | |
183 | } | |
184 | ||
185 | kernel void unsharp_default(global unsigned char *src, | |
186 | global unsigned char *dst, | |
187 | const global unsigned int *mask_lu, | |
188 | const global unsigned int *mask_ch, | |
189 | int amount_lu, | |
190 | int amount_ch, | |
191 | int step_x_lu, | |
192 | int step_y_lu, | |
193 | int step_x_ch, | |
194 | int step_y_ch, | |
195 | int scalebits_lu, | |
196 | int scalebits_ch, | |
197 | int halfscale_lu, | |
198 | int halfscale_ch, | |
199 | int src_stride_lu, | |
200 | int src_stride_ch, | |
201 | int dst_stride_lu, | |
202 | int dst_stride_ch, | |
203 | int height, | |
204 | int width, | |
205 | int ch, | |
206 | int cw) | |
207 | { | |
208 | global unsigned char *dst_y = dst; | |
209 | global unsigned char *dst_u = dst_y + height * dst_stride_lu; | |
210 | global unsigned char *dst_v = dst_u + ch * dst_stride_ch; | |
211 | ||
212 | global unsigned char *src_y = src; | |
213 | global unsigned char *src_u = src_y + height * src_stride_lu; | |
214 | global unsigned char *src_v = src_u + ch * src_stride_ch; | |
215 | ||
216 | global unsigned char *temp_dst; | |
217 | global unsigned char *temp_src; | |
218 | const global unsigned int *temp_mask; | |
219 | int global_id = get_global_id(0); | |
220 | int i, j, x, y, temp_src_stride, temp_dst_stride, temp_height, temp_width, temp_steps_x, temp_steps_y, | |
221 | temp_amount, temp_scalebits, temp_halfscale, sum, idx_x, idx_y, temp, res; | |
222 | if (global_id < width * height) { | |
223 | y = global_id / width; | |
224 | x = global_id % width; | |
225 | temp_dst = dst_y; | |
226 | temp_src = src_y; | |
227 | temp_src_stride = src_stride_lu; | |
228 | temp_dst_stride = dst_stride_lu; | |
229 | temp_height = height; | |
230 | temp_width = width; | |
231 | temp_steps_x = step_x_lu; | |
232 | temp_steps_y = step_y_lu; | |
233 | temp_mask = mask_lu; | |
234 | temp_amount = amount_lu; | |
235 | temp_scalebits = scalebits_lu; | |
236 | temp_halfscale = halfscale_lu; | |
237 | } else if ((global_id >= width * height) && (global_id < width * height + ch * cw)) { | |
238 | y = (global_id - width * height) / cw; | |
239 | x = (global_id - width * height) % cw; | |
240 | temp_dst = dst_u; | |
241 | temp_src = src_u; | |
242 | temp_src_stride = src_stride_ch; | |
243 | temp_dst_stride = dst_stride_ch; | |
244 | temp_height = ch; | |
245 | temp_width = cw; | |
246 | temp_steps_x = step_x_ch; | |
247 | temp_steps_y = step_y_ch; | |
248 | temp_mask = mask_ch; | |
249 | temp_amount = amount_ch; | |
250 | temp_scalebits = scalebits_ch; | |
251 | temp_halfscale = halfscale_ch; | |
252 | } else { | |
253 | y = (global_id - width * height - ch * cw) / cw; | |
254 | x = (global_id - width * height - ch * cw) % cw; | |
255 | temp_dst = dst_v; | |
256 | temp_src = src_v; | |
257 | temp_src_stride = src_stride_ch; | |
258 | temp_dst_stride = dst_stride_ch; | |
259 | temp_height = ch; | |
260 | temp_width = cw; | |
261 | temp_steps_x = step_x_ch; | |
262 | temp_steps_y = step_y_ch; | |
263 | temp_mask = mask_ch; | |
264 | temp_amount = amount_ch; | |
265 | temp_scalebits = scalebits_ch; | |
266 | temp_halfscale = halfscale_ch; | |
267 | } | |
268 | if (temp_amount) { | |
269 | sum = 0; | |
270 | for (j = 0; j <= 2 * temp_steps_y; j++) { | |
271 | idx_y = (y - temp_steps_y + j) <= 0 ? 0 : (y - temp_steps_y + j) >= temp_height ? temp_height-1 : y - temp_steps_y + j; | |
272 | for (i = 0; i <= 2 * temp_steps_x; i++) { | |
273 | idx_x = (x - temp_steps_x + i) <= 0 ? 0 : (x - temp_steps_x + i) >= temp_width ? temp_width-1 : x - temp_steps_x + i; | |
274 | sum += temp_mask[i + j * (2 * temp_steps_x + 1)] * temp_src[idx_x + idx_y * temp_src_stride]; | |
275 | } | |
276 | } | |
277 | temp = (int)temp_src[x + y * temp_src_stride]; | |
278 | res = temp + (((temp - (int)((sum + temp_halfscale) >> temp_scalebits)) * temp_amount) >> 16); | |
279 | temp_dst[x + y * temp_dst_stride] = clip_uint8(res); | |
280 | } else { | |
281 | temp_dst[x + y * temp_dst_stride] = temp_src[x + y * temp_src_stride]; | |
282 | } | |
283 | } | |
284 | ); | |
285 | ||
286 | #endif /* AVFILTER_UNSHARP_OPENCL_KERNEL_H */ |