2 * Copyright (C) 2013 Lenny Wang
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 #include "libavutil/opt.h"
22 #include "libavutil/time.h"
23 #include "libavutil/log.h"
24 #include "libavutil/opencl.h"
32 } OpenCLDeviceBenchmark
;
34 const char *ocl_bench_source
= AV_OPENCL_KERNEL(
35 inline unsigned char clip_uint8(int a
)
43 kernel
void unsharp_bench(
44 global
unsigned char *src
,
45 global
unsigned char *dst
,
50 int i
, j
, local_idx
, lc_idx
, sum
= 0;
51 int2 thread_idx
, block_idx
, global_idx
, lm_idx
;
52 thread_idx
.x
= get_local_id(0);
53 thread_idx
.y
= get_local_id(1);
54 block_idx
.x
= get_group_id(0);
55 block_idx
.y
= get_group_id(1);
56 global_idx
.x
= get_global_id(0);
57 global_idx
.y
= get_global_id(1);
58 local uchar data
[32][32];
61 for (i
= 0; i
<= 1; i
++) {
62 lm_idx
.y
= -8 + (block_idx
.y
+ i
) * 16 + thread_idx
.y
;
63 lm_idx
.y
= lm_idx
.y
< 0 ? 0 : lm_idx
.y
;
64 lm_idx
.y
= lm_idx
.y
>= height
? height
- 1: lm_idx
.y
;
65 for (j
= 0; j
<= 1; j
++) {
66 lm_idx
.x
= -8 + (block_idx
.x
+ j
) * 16 + thread_idx
.x
;
67 lm_idx
.x
= lm_idx
.x
< 0 ? 0 : lm_idx
.x
;
68 lm_idx
.x
= lm_idx
.x
>= width
? width
- 1: lm_idx
.x
;
69 data
[i
*16 + thread_idx
.y
][j
*16 + thread_idx
.x
] = src
[lm_idx
.y
*width
+ lm_idx
.x
];
72 local_idx
= thread_idx
.y
*16 + thread_idx
.x
;
74 lc
[local_idx
] = mask
[local_idx
];
75 barrier(CLK_LOCAL_MEM_FENCE
);
78 for (i
= -4; i
<= 4; i
++) {
79 lm_idx
.y
= 8 + i
+ thread_idx
.y
;
81 for (j
= -4; j
<= 4; j
++) {
82 lm_idx
.x
= 8 + j
+ thread_idx
.x
;
83 lc_idx
= (i
+ 4)*8 + j
+ 4;
84 sum
+= (int)data
[lm_idx
.y
][lm_idx
.x
] * lc
[lc_idx
];
87 int temp
= (int)data
[thread_idx
.y
+ 8][thread_idx
.x
+ 8];
88 int res
= temp
+ (((temp
- (int)((sum
+ 1<<15) >> 16))) >> 16);
89 if (global_idx
.x
< width
&& global_idx
.y
< height
)
90 dst
[global_idx
.x
+ global_idx
.y
*width
] = clip_uint8(res
);
94 #define OCLCHECK(method, ... ) \
96 status = method(__VA_ARGS__); \
97 if (status != CL_SUCCESS) { \
98 av_log(NULL, AV_LOG_ERROR, # method " error '%s'\n", \
99 av_opencl_errstr(status)); \
100 ret = AVERROR_EXTERNAL; \
105 #define CREATEBUF(out, flags, size) \
107 out = clCreateBuffer(ext_opencl_env->context, flags, size, NULL, &status); \
108 if (status != CL_SUCCESS) { \
109 av_log(NULL, AV_LOG_ERROR, "Could not create OpenCL buffer\n"); \
110 ret = AVERROR_EXTERNAL; \
115 static void fill_rand_int(int *data
, int n
)
119 for (i
= 0; i
< n
; i
++)
123 #define OPENCL_NB_ITER 5
124 static int64_t run_opencl_bench(AVOpenCLExternalEnv
*ext_opencl_env
)
126 int i
, arg
= 0, width
= 1920, height
= 1088;
127 int64_t start
, ret
= 0;
132 int buf_size
= width
* height
* sizeof(char);
133 int mask_size
= sizeof(uint32_t) * 128;
135 cl_mem cl_mask
, cl_inbuf
, cl_outbuf
;
136 cl_kernel kernel
= NULL
;
137 cl_program program
= NULL
;
138 size_t local_work_size_2d
[2] = {16, 16};
139 size_t global_work_size_2d
[2] = {(size_t)width
, (size_t)height
};
141 if (!(inbuf
= av_malloc(buf_size
)) || !(mask
= av_malloc(mask_size
))) {
142 av_log(NULL
, AV_LOG_ERROR
, "Out of memory\n");
143 ret
= AVERROR(ENOMEM
);
146 fill_rand_int((int*)inbuf
, buf_size
/4);
147 fill_rand_int(mask
, mask_size
/4);
149 CREATEBUF(cl_mask
, CL_MEM_READ_ONLY
, mask_size
);
150 CREATEBUF(cl_inbuf
, CL_MEM_READ_ONLY
, buf_size
);
151 CREATEBUF(cl_outbuf
, CL_MEM_READ_WRITE
, buf_size
);
153 kernel_len
= strlen(ocl_bench_source
);
154 program
= clCreateProgramWithSource(ext_opencl_env
->context
, 1, &ocl_bench_source
,
155 &kernel_len
, &status
);
156 if (status
!= CL_SUCCESS
|| !program
) {
157 av_log(NULL
, AV_LOG_ERROR
, "OpenCL unable to create benchmark program\n");
158 ret
= AVERROR_EXTERNAL
;
161 status
= clBuildProgram(program
, 1, &(ext_opencl_env
->device_id
), NULL
, NULL
, NULL
);
162 if (status
!= CL_SUCCESS
) {
163 av_log(NULL
, AV_LOG_ERROR
, "OpenCL unable to build benchmark program\n");
164 ret
= AVERROR_EXTERNAL
;
167 kernel
= clCreateKernel(program
, "unsharp_bench", &status
);
168 if (status
!= CL_SUCCESS
) {
169 av_log(NULL
, AV_LOG_ERROR
, "OpenCL unable to create benchmark kernel\n");
170 ret
= AVERROR_EXTERNAL
;
174 OCLCHECK(clEnqueueWriteBuffer
, ext_opencl_env
->command_queue
, cl_inbuf
, CL_TRUE
, 0,
175 buf_size
, inbuf
, 0, NULL
, NULL
);
176 OCLCHECK(clEnqueueWriteBuffer
, ext_opencl_env
->command_queue
, cl_mask
, CL_TRUE
, 0,
177 mask_size
, mask
, 0, NULL
, NULL
);
178 OCLCHECK(clSetKernelArg
, kernel
, arg
++, sizeof(cl_mem
), &cl_inbuf
);
179 OCLCHECK(clSetKernelArg
, kernel
, arg
++, sizeof(cl_mem
), &cl_outbuf
);
180 OCLCHECK(clSetKernelArg
, kernel
, arg
++, sizeof(cl_mem
), &cl_mask
);
181 OCLCHECK(clSetKernelArg
, kernel
, arg
++, sizeof(cl_int
), &width
);
182 OCLCHECK(clSetKernelArg
, kernel
, arg
++, sizeof(cl_int
), &height
);
184 start
= av_gettime_relative();
185 for (i
= 0; i
< OPENCL_NB_ITER
; i
++)
186 OCLCHECK(clEnqueueNDRangeKernel
, ext_opencl_env
->command_queue
, kernel
, 2, NULL
,
187 global_work_size_2d
, local_work_size_2d
, 0, NULL
, NULL
);
188 clFinish(ext_opencl_env
->command_queue
);
189 ret
= (av_gettime_relative() - start
)/OPENCL_NB_ITER
;
192 clReleaseKernel(kernel
);
194 clReleaseProgram(program
);
196 clReleaseMemObject(cl_inbuf
);
198 clReleaseMemObject(cl_outbuf
);
200 clReleaseMemObject(cl_mask
);
206 static int compare_ocl_device_desc(const void *a
, const void *b
)
208 return ((OpenCLDeviceBenchmark
*)a
)->runtime
- ((OpenCLDeviceBenchmark
*)b
)->runtime
;
211 int opt_opencl_bench(void *optctx
, const char *opt
, const char *arg
)
213 int i
, j
, nb_devices
= 0, count
= 0;
215 AVOpenCLDeviceList
*device_list
;
216 AVOpenCLDeviceNode
*device_node
= NULL
;
217 OpenCLDeviceBenchmark
*devices
= NULL
;
218 cl_platform_id platform
;
220 av_opencl_get_device_list(&device_list
);
221 for (i
= 0; i
< device_list
->platform_num
; i
++)
222 nb_devices
+= device_list
->platform_node
[i
]->device_num
;
224 av_log(NULL
, AV_LOG_ERROR
, "No OpenCL device detected!\n");
225 return AVERROR(EINVAL
);
227 if (!(devices
= av_malloc_array(nb_devices
, sizeof(OpenCLDeviceBenchmark
)))) {
228 av_log(NULL
, AV_LOG_ERROR
, "Could not allocate buffer\n");
229 return AVERROR(ENOMEM
);
232 for (i
= 0; i
< device_list
->platform_num
; i
++) {
233 for (j
= 0; j
< device_list
->platform_node
[i
]->device_num
; j
++) {
234 device_node
= device_list
->platform_node
[i
]->device_node
[j
];
235 platform
= device_list
->platform_node
[i
]->platform_id
;
236 score
= av_opencl_benchmark(device_node
, platform
, run_opencl_bench
);
238 devices
[count
].platform_idx
= i
;
239 devices
[count
].device_idx
= j
;
240 devices
[count
].runtime
= score
;
241 strcpy(devices
[count
].device_name
, device_node
->device_name
);
246 qsort(devices
, count
, sizeof(OpenCLDeviceBenchmark
), compare_ocl_device_desc
);
247 fprintf(stderr
, "platform_idx\tdevice_idx\tdevice_name\truntime\n");
248 for (i
= 0; i
< count
; i
++)
249 fprintf(stdout
, "%d\t%d\t%s\t%"PRId64
"\n",
250 devices
[i
].platform_idx
, devices
[i
].device_idx
,
251 devices
[i
].device_name
, devices
[i
].runtime
);
253 av_opencl_free_device_list(&device_list
);
258 int opt_opencl(void *optctx
, const char *opt
, const char *arg
)
261 const char *opts
= arg
;
264 ret
= av_opt_get_key_value(&opts
, "=", ":", 0, &key
, &value
);
267 ret
= av_opencl_set_option(key
, value
);