Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | /* |
2 | * Copyright (C) 2013 Wei Gao <weigao@multicorewareinc.com> | |
3 | * Copyright (C) 2013 Lenny Wang | |
4 | * | |
5 | * This file is part of FFmpeg. | |
6 | * | |
7 | * FFmpeg is free software; you can redistribute it and/or | |
8 | * modify it under the terms of the GNU Lesser General Public | |
9 | * License as published by the Free Software Foundation; either | |
10 | * version 2.1 of the License, or (at your option) any later version. | |
11 | * | |
12 | * FFmpeg is distributed in the hope that it will be useful, | |
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 | * Lesser General Public License for more details. | |
16 | * | |
17 | * You should have received a copy of the GNU Lesser General Public | |
18 | * License along with FFmpeg; if not, write to the Free Software | |
19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
20 | */ | |
21 | ||
22 | /** | |
23 | * @file | |
24 | * unsharp input video | |
25 | */ | |
26 | ||
27 | #include "unsharp_opencl.h" | |
28 | #include "libavutil/common.h" | |
29 | #include "libavutil/opencl_internal.h" | |
30 | ||
31 | #define PLANE_NUM 3 | |
32 | #define ROUND_TO_16(a) (((((a) - 1)/16)+1)*16) | |
33 | ||
34 | static inline void add_mask_counter(uint32_t *dst, uint32_t *counter1, uint32_t *counter2, int len) | |
35 | { | |
36 | int i; | |
37 | for (i = 0; i < len; i++) { | |
38 | dst[i] = counter1[i] + counter2[i]; | |
39 | } | |
40 | } | |
41 | ||
42 | static int compute_mask(int step, uint32_t *mask) | |
43 | { | |
44 | int i, z, ret = 0; | |
45 | int counter_size = sizeof(uint32_t) * (2 * step + 1); | |
46 | uint32_t *temp1_counter, *temp2_counter, **counter; | |
47 | temp1_counter = av_mallocz(counter_size); | |
48 | if (!temp1_counter) { | |
49 | ret = AVERROR(ENOMEM); | |
50 | goto end; | |
51 | } | |
52 | temp2_counter = av_mallocz(counter_size); | |
53 | if (!temp2_counter) { | |
54 | ret = AVERROR(ENOMEM); | |
55 | goto end; | |
56 | } | |
57 | counter = av_mallocz_array(2 * step + 1, sizeof(uint32_t *)); | |
58 | if (!counter) { | |
59 | ret = AVERROR(ENOMEM); | |
60 | goto end; | |
61 | } | |
62 | for (i = 0; i < 2 * step + 1; i++) { | |
63 | counter[i] = av_mallocz(counter_size); | |
64 | if (!counter[i]) { | |
65 | ret = AVERROR(ENOMEM); | |
66 | goto end; | |
67 | } | |
68 | } | |
69 | for (i = 0; i < 2 * step + 1; i++) { | |
70 | memset(temp1_counter, 0, counter_size); | |
71 | temp1_counter[i] = 1; | |
72 | for (z = 0; z < step * 2; z += 2) { | |
73 | add_mask_counter(temp2_counter, counter[z], temp1_counter, step * 2); | |
74 | memcpy(counter[z], temp1_counter, counter_size); | |
75 | add_mask_counter(temp1_counter, counter[z + 1], temp2_counter, step * 2); | |
76 | memcpy(counter[z + 1], temp2_counter, counter_size); | |
77 | } | |
78 | } | |
79 | memcpy(mask, temp1_counter, counter_size); | |
80 | end: | |
81 | av_freep(&temp1_counter); | |
82 | av_freep(&temp2_counter); | |
83 | for (i = 0; i < 2 * step + 1; i++) { | |
84 | av_freep(&counter[i]); | |
85 | } | |
86 | av_freep(&counter); | |
87 | return ret; | |
88 | } | |
89 | ||
90 | static int compute_mask_matrix(cl_mem cl_mask_matrix, int step_x, int step_y) | |
91 | { | |
92 | int i, j, ret = 0; | |
93 | uint32_t *mask_matrix, *mask_x, *mask_y; | |
94 | size_t size_matrix = sizeof(uint32_t) * (2 * step_x + 1) * (2 * step_y + 1); | |
95 | mask_x = av_mallocz_array(2 * step_x + 1, sizeof(uint32_t)); | |
96 | if (!mask_x) { | |
97 | ret = AVERROR(ENOMEM); | |
98 | goto end; | |
99 | } | |
100 | mask_y = av_mallocz_array(2 * step_y + 1, sizeof(uint32_t)); | |
101 | if (!mask_y) { | |
102 | ret = AVERROR(ENOMEM); | |
103 | goto end; | |
104 | } | |
105 | mask_matrix = av_mallocz(size_matrix); | |
106 | if (!mask_matrix) { | |
107 | ret = AVERROR(ENOMEM); | |
108 | goto end; | |
109 | } | |
110 | ret = compute_mask(step_x, mask_x); | |
111 | if (ret < 0) | |
112 | goto end; | |
113 | ret = compute_mask(step_y, mask_y); | |
114 | if (ret < 0) | |
115 | goto end; | |
116 | for (j = 0; j < 2 * step_y + 1; j++) { | |
117 | for (i = 0; i < 2 * step_x + 1; i++) { | |
118 | mask_matrix[i + j * (2 * step_x + 1)] = mask_y[j] * mask_x[i]; | |
119 | } | |
120 | } | |
121 | ret = av_opencl_buffer_write(cl_mask_matrix, (uint8_t *)mask_matrix, size_matrix); | |
122 | end: | |
123 | av_freep(&mask_x); | |
124 | av_freep(&mask_y); | |
125 | av_freep(&mask_matrix); | |
126 | return ret; | |
127 | } | |
128 | ||
129 | static int generate_mask(AVFilterContext *ctx) | |
130 | { | |
131 | UnsharpContext *unsharp = ctx->priv; | |
132 | int i, ret = 0, step_x[2], step_y[2]; | |
133 | cl_mem mask_matrix[2]; | |
134 | mask_matrix[0] = unsharp->opencl_ctx.cl_luma_mask; | |
135 | mask_matrix[1] = unsharp->opencl_ctx.cl_chroma_mask; | |
136 | step_x[0] = unsharp->luma.steps_x; | |
137 | step_x[1] = unsharp->chroma.steps_x; | |
138 | step_y[0] = unsharp->luma.steps_y; | |
139 | step_y[1] = unsharp->chroma.steps_y; | |
140 | ||
141 | /* use default kernel if any matrix dim larger than 8 due to limited local mem size */ | |
142 | if (step_x[0]>8 || step_x[1]>8 || step_y[0]>8 || step_y[1]>8) | |
143 | unsharp->opencl_ctx.use_fast_kernels = 0; | |
144 | else | |
145 | unsharp->opencl_ctx.use_fast_kernels = 1; | |
146 | ||
147 | if (!mask_matrix[0] || !mask_matrix[1]) { | |
148 | av_log(ctx, AV_LOG_ERROR, "Luma mask and chroma mask should not be NULL\n"); | |
149 | return AVERROR(EINVAL); | |
150 | } | |
151 | for (i = 0; i < 2; i++) { | |
152 | ret = compute_mask_matrix(mask_matrix[i], step_x[i], step_y[i]); | |
153 | if (ret < 0) | |
154 | return ret; | |
155 | } | |
156 | return ret; | |
157 | } | |
158 | ||
159 | int ff_opencl_apply_unsharp(AVFilterContext *ctx, AVFrame *in, AVFrame *out) | |
160 | { | |
161 | int ret; | |
162 | AVFilterLink *link = ctx->inputs[0]; | |
163 | UnsharpContext *unsharp = ctx->priv; | |
164 | cl_int status; | |
165 | FFOpenclParam kernel1 = {0}; | |
166 | FFOpenclParam kernel2 = {0}; | |
167 | int width = link->w; | |
168 | int height = link->h; | |
169 | int cw = FF_CEIL_RSHIFT(link->w, unsharp->hsub); | |
170 | int ch = FF_CEIL_RSHIFT(link->h, unsharp->vsub); | |
171 | size_t globalWorkSize1d = width * height + 2 * ch * cw; | |
172 | size_t globalWorkSize2dLuma[2]; | |
173 | size_t globalWorkSize2dChroma[2]; | |
174 | size_t localWorkSize2d[2] = {16, 16}; | |
175 | ||
176 | if (unsharp->opencl_ctx.use_fast_kernels) { | |
177 | globalWorkSize2dLuma[0] = (size_t)ROUND_TO_16(width); | |
178 | globalWorkSize2dLuma[1] = (size_t)ROUND_TO_16(height); | |
179 | globalWorkSize2dChroma[0] = (size_t)ROUND_TO_16(cw); | |
180 | globalWorkSize2dChroma[1] = (size_t)(2*ROUND_TO_16(ch)); | |
181 | ||
182 | kernel1.ctx = ctx; | |
183 | kernel1.kernel = unsharp->opencl_ctx.kernel_luma; | |
184 | ret = avpriv_opencl_set_parameter(&kernel1, | |
185 | FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_inbuf), | |
186 | FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_outbuf), | |
187 | FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_luma_mask), | |
188 | FF_OPENCL_PARAM_INFO(unsharp->luma.amount), | |
189 | FF_OPENCL_PARAM_INFO(unsharp->luma.scalebits), | |
190 | FF_OPENCL_PARAM_INFO(unsharp->luma.halfscale), | |
191 | FF_OPENCL_PARAM_INFO(in->linesize[0]), | |
192 | FF_OPENCL_PARAM_INFO(out->linesize[0]), | |
193 | FF_OPENCL_PARAM_INFO(width), | |
194 | FF_OPENCL_PARAM_INFO(height), | |
195 | NULL); | |
196 | if (ret < 0) | |
197 | return ret; | |
198 | ||
199 | kernel2.ctx = ctx; | |
200 | kernel2.kernel = unsharp->opencl_ctx.kernel_chroma; | |
201 | ret = avpriv_opencl_set_parameter(&kernel2, | |
202 | FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_inbuf), | |
203 | FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_outbuf), | |
204 | FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_chroma_mask), | |
205 | FF_OPENCL_PARAM_INFO(unsharp->chroma.amount), | |
206 | FF_OPENCL_PARAM_INFO(unsharp->chroma.scalebits), | |
207 | FF_OPENCL_PARAM_INFO(unsharp->chroma.halfscale), | |
208 | FF_OPENCL_PARAM_INFO(in->linesize[0]), | |
209 | FF_OPENCL_PARAM_INFO(in->linesize[1]), | |
210 | FF_OPENCL_PARAM_INFO(out->linesize[0]), | |
211 | FF_OPENCL_PARAM_INFO(out->linesize[1]), | |
212 | FF_OPENCL_PARAM_INFO(link->w), | |
213 | FF_OPENCL_PARAM_INFO(link->h), | |
214 | FF_OPENCL_PARAM_INFO(cw), | |
215 | FF_OPENCL_PARAM_INFO(ch), | |
216 | NULL); | |
217 | if (ret < 0) | |
218 | return ret; | |
219 | status = clEnqueueNDRangeKernel(unsharp->opencl_ctx.command_queue, | |
220 | unsharp->opencl_ctx.kernel_luma, 2, NULL, | |
221 | globalWorkSize2dLuma, localWorkSize2d, 0, NULL, NULL); | |
222 | status |=clEnqueueNDRangeKernel(unsharp->opencl_ctx.command_queue, | |
223 | unsharp->opencl_ctx.kernel_chroma, 2, NULL, | |
224 | globalWorkSize2dChroma, localWorkSize2d, 0, NULL, NULL); | |
225 | if (status != CL_SUCCESS) { | |
226 | av_log(ctx, AV_LOG_ERROR, "OpenCL run kernel error occurred: %s\n", av_opencl_errstr(status)); | |
227 | return AVERROR_EXTERNAL; | |
228 | } | |
229 | } else { /* use default kernel */ | |
230 | kernel1.ctx = ctx; | |
231 | kernel1.kernel = unsharp->opencl_ctx.kernel_default; | |
232 | ||
233 | ret = avpriv_opencl_set_parameter(&kernel1, | |
234 | FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_inbuf), | |
235 | FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_outbuf), | |
236 | FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_luma_mask), | |
237 | FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_chroma_mask), | |
238 | FF_OPENCL_PARAM_INFO(unsharp->luma.amount), | |
239 | FF_OPENCL_PARAM_INFO(unsharp->chroma.amount), | |
240 | FF_OPENCL_PARAM_INFO(unsharp->luma.steps_x), | |
241 | FF_OPENCL_PARAM_INFO(unsharp->luma.steps_y), | |
242 | FF_OPENCL_PARAM_INFO(unsharp->chroma.steps_x), | |
243 | FF_OPENCL_PARAM_INFO(unsharp->chroma.steps_y), | |
244 | FF_OPENCL_PARAM_INFO(unsharp->luma.scalebits), | |
245 | FF_OPENCL_PARAM_INFO(unsharp->chroma.scalebits), | |
246 | FF_OPENCL_PARAM_INFO(unsharp->luma.halfscale), | |
247 | FF_OPENCL_PARAM_INFO(unsharp->chroma.halfscale), | |
248 | FF_OPENCL_PARAM_INFO(in->linesize[0]), | |
249 | FF_OPENCL_PARAM_INFO(in->linesize[1]), | |
250 | FF_OPENCL_PARAM_INFO(out->linesize[0]), | |
251 | FF_OPENCL_PARAM_INFO(out->linesize[1]), | |
252 | FF_OPENCL_PARAM_INFO(link->h), | |
253 | FF_OPENCL_PARAM_INFO(link->w), | |
254 | FF_OPENCL_PARAM_INFO(ch), | |
255 | FF_OPENCL_PARAM_INFO(cw), | |
256 | NULL); | |
257 | if (ret < 0) | |
258 | return ret; | |
259 | status = clEnqueueNDRangeKernel(unsharp->opencl_ctx.command_queue, | |
260 | unsharp->opencl_ctx.kernel_default, 1, NULL, | |
261 | &globalWorkSize1d, NULL, 0, NULL, NULL); | |
262 | if (status != CL_SUCCESS) { | |
263 | av_log(ctx, AV_LOG_ERROR, "OpenCL run kernel error occurred: %s\n", av_opencl_errstr(status)); | |
264 | return AVERROR_EXTERNAL; | |
265 | } | |
266 | } | |
267 | clFinish(unsharp->opencl_ctx.command_queue); | |
268 | return av_opencl_buffer_read_image(out->data, unsharp->opencl_ctx.out_plane_size, | |
269 | unsharp->opencl_ctx.plane_num, unsharp->opencl_ctx.cl_outbuf, | |
270 | unsharp->opencl_ctx.cl_outbuf_size); | |
271 | } | |
272 | ||
273 | int ff_opencl_unsharp_init(AVFilterContext *ctx) | |
274 | { | |
275 | int ret = 0; | |
276 | char build_opts[96]; | |
277 | UnsharpContext *unsharp = ctx->priv; | |
278 | ret = av_opencl_init(NULL); | |
279 | if (ret < 0) | |
280 | return ret; | |
281 | ret = av_opencl_buffer_create(&unsharp->opencl_ctx.cl_luma_mask, | |
282 | sizeof(uint32_t) * (2 * unsharp->luma.steps_x + 1) * (2 * unsharp->luma.steps_y + 1), | |
283 | CL_MEM_READ_ONLY, NULL); | |
284 | if (ret < 0) | |
285 | return ret; | |
286 | ret = av_opencl_buffer_create(&unsharp->opencl_ctx.cl_chroma_mask, | |
287 | sizeof(uint32_t) * (2 * unsharp->chroma.steps_x + 1) * (2 * unsharp->chroma.steps_y + 1), | |
288 | CL_MEM_READ_ONLY, NULL); | |
289 | if (ret < 0) | |
290 | return ret; | |
291 | ret = generate_mask(ctx); | |
292 | if (ret < 0) | |
293 | return ret; | |
294 | unsharp->opencl_ctx.plane_num = PLANE_NUM; | |
295 | unsharp->opencl_ctx.command_queue = av_opencl_get_command_queue(); | |
296 | if (!unsharp->opencl_ctx.command_queue) { | |
297 | av_log(ctx, AV_LOG_ERROR, "Unable to get OpenCL command queue in filter 'unsharp'\n"); | |
298 | return AVERROR(EINVAL); | |
299 | } | |
300 | snprintf(build_opts, 96, "-D LU_RADIUS_X=%d -D LU_RADIUS_Y=%d -D CH_RADIUS_X=%d -D CH_RADIUS_Y=%d", | |
301 | 2*unsharp->luma.steps_x+1, 2*unsharp->luma.steps_y+1, 2*unsharp->chroma.steps_x+1, 2*unsharp->chroma.steps_y+1); | |
302 | unsharp->opencl_ctx.program = av_opencl_compile("unsharp", build_opts); | |
303 | if (!unsharp->opencl_ctx.program) { | |
304 | av_log(ctx, AV_LOG_ERROR, "OpenCL failed to compile program 'unsharp'\n"); | |
305 | return AVERROR(EINVAL); | |
306 | } | |
307 | if (unsharp->opencl_ctx.use_fast_kernels) { | |
308 | if (!unsharp->opencl_ctx.kernel_luma) { | |
309 | unsharp->opencl_ctx.kernel_luma = clCreateKernel(unsharp->opencl_ctx.program, "unsharp_luma", &ret); | |
310 | if (ret != CL_SUCCESS) { | |
311 | av_log(ctx, AV_LOG_ERROR, "OpenCL failed to create kernel 'unsharp_luma'\n"); | |
312 | return ret; | |
313 | } | |
314 | } | |
315 | if (!unsharp->opencl_ctx.kernel_chroma) { | |
316 | unsharp->opencl_ctx.kernel_chroma = clCreateKernel(unsharp->opencl_ctx.program, "unsharp_chroma", &ret); | |
317 | if (ret < 0) { | |
318 | av_log(ctx, AV_LOG_ERROR, "OpenCL failed to create kernel 'unsharp_chroma'\n"); | |
319 | return ret; | |
320 | } | |
321 | } | |
322 | } | |
323 | else { | |
324 | if (!unsharp->opencl_ctx.kernel_default) { | |
325 | unsharp->opencl_ctx.kernel_default = clCreateKernel(unsharp->opencl_ctx.program, "unsharp_default", &ret); | |
326 | if (ret < 0) { | |
327 | av_log(ctx, AV_LOG_ERROR, "OpenCL failed to create kernel 'unsharp_default'\n"); | |
328 | return ret; | |
329 | } | |
330 | } | |
331 | } | |
332 | return ret; | |
333 | } | |
334 | ||
335 | void ff_opencl_unsharp_uninit(AVFilterContext *ctx) | |
336 | { | |
337 | UnsharpContext *unsharp = ctx->priv; | |
338 | av_opencl_buffer_release(&unsharp->opencl_ctx.cl_inbuf); | |
339 | av_opencl_buffer_release(&unsharp->opencl_ctx.cl_outbuf); | |
340 | av_opencl_buffer_release(&unsharp->opencl_ctx.cl_luma_mask); | |
341 | av_opencl_buffer_release(&unsharp->opencl_ctx.cl_chroma_mask); | |
342 | clReleaseKernel(unsharp->opencl_ctx.kernel_default); | |
343 | clReleaseKernel(unsharp->opencl_ctx.kernel_luma); | |
344 | clReleaseKernel(unsharp->opencl_ctx.kernel_chroma); | |
345 | clReleaseProgram(unsharp->opencl_ctx.program); | |
346 | unsharp->opencl_ctx.command_queue = NULL; | |
347 | av_opencl_uninit(); | |
348 | } | |
349 | ||
350 | int ff_opencl_unsharp_process_inout_buf(AVFilterContext *ctx, AVFrame *in, AVFrame *out) | |
351 | { | |
352 | int ret = 0; | |
353 | AVFilterLink *link = ctx->inputs[0]; | |
354 | UnsharpContext *unsharp = ctx->priv; | |
355 | int ch = FF_CEIL_RSHIFT(link->h, unsharp->vsub); | |
356 | ||
357 | if ((!unsharp->opencl_ctx.cl_inbuf) || (!unsharp->opencl_ctx.cl_outbuf)) { | |
358 | unsharp->opencl_ctx.in_plane_size[0] = (in->linesize[0] * in->height); | |
359 | unsharp->opencl_ctx.in_plane_size[1] = (in->linesize[1] * ch); | |
360 | unsharp->opencl_ctx.in_plane_size[2] = (in->linesize[2] * ch); | |
361 | unsharp->opencl_ctx.out_plane_size[0] = (out->linesize[0] * out->height); | |
362 | unsharp->opencl_ctx.out_plane_size[1] = (out->linesize[1] * ch); | |
363 | unsharp->opencl_ctx.out_plane_size[2] = (out->linesize[2] * ch); | |
364 | unsharp->opencl_ctx.cl_inbuf_size = unsharp->opencl_ctx.in_plane_size[0] + | |
365 | unsharp->opencl_ctx.in_plane_size[1] + | |
366 | unsharp->opencl_ctx.in_plane_size[2]; | |
367 | unsharp->opencl_ctx.cl_outbuf_size = unsharp->opencl_ctx.out_plane_size[0] + | |
368 | unsharp->opencl_ctx.out_plane_size[1] + | |
369 | unsharp->opencl_ctx.out_plane_size[2]; | |
370 | if (!unsharp->opencl_ctx.cl_inbuf) { | |
371 | ret = av_opencl_buffer_create(&unsharp->opencl_ctx.cl_inbuf, | |
372 | unsharp->opencl_ctx.cl_inbuf_size, | |
373 | CL_MEM_READ_ONLY, NULL); | |
374 | if (ret < 0) | |
375 | return ret; | |
376 | } | |
377 | if (!unsharp->opencl_ctx.cl_outbuf) { | |
378 | ret = av_opencl_buffer_create(&unsharp->opencl_ctx.cl_outbuf, | |
379 | unsharp->opencl_ctx.cl_outbuf_size, | |
380 | CL_MEM_READ_WRITE, NULL); | |
381 | if (ret < 0) | |
382 | return ret; | |
383 | } | |
384 | } | |
385 | return av_opencl_buffer_write_image(unsharp->opencl_ctx.cl_inbuf, | |
386 | unsharp->opencl_ctx.cl_inbuf_size, | |
387 | 0, in->data, unsharp->opencl_ctx.in_plane_size, | |
388 | unsharp->opencl_ctx.plane_num); | |
389 | } |