WinGui: Fix another instance of the Caliburn vs Json.net sillyness where objects...
[HandBrake.git] / libhb / oclscale.c
blobc771a1f2b7e30da3b08620ed3b8882209656080e
1 /* oclscale.c
3 Copyright (c) 2003-2015 HandBrake Team
4 This file is part of the HandBrake source code
5 Homepage: <http://handbrake.fr/>.
6 It may be used under the terms of the GNU General Public License v2.
7 For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html
9 Authors: Peng Gao <peng@multicorewareinc.com> <http://www.multicorewareinc.com/>
10 Li Cao <li@multicorewareinc.com> <http://www.multicorewareinc.com/>
14 #include <math.h>
15 #include "common.h"
16 #include "opencl.h"
17 #include "openclwrapper.h"
18 #define FILTER_LEN 4
20 #define _A -0.5f
22 cl_float cubic(cl_float x)
24 if (x < 0)
25 x = -x;
27 if (x < 1)
28 return (_A + 2.0f) * (x * x * x) - (_A + 3.0f) * (x * x) + 0 + 1;
29 else if (x < 2)
30 return (_A) * (x * x * x) - (5.0f * _A) * (x * x) + (8.0f * _A) * x - (4.0f * _A);
31 else
32 return 0;
36 cl_float *hb_bicubic_weights(cl_float scale, int length)
38 cl_float *weights = (cl_float*) malloc(length * sizeof(cl_float) * 4);
40 int i; // C rocks
41 cl_float *out = weights;
42 for (i = 0; i < length; ++i)
44 cl_float x = i / scale;
45 cl_float dx = x - (int)x;
46 *out++ = cubic(-dx - 1.0f);
47 *out++ = cubic(-dx);
48 *out++ = cubic(-dx + 1.0f);
49 *out++ = cubic(-dx + 2.0f);
51 return weights;
54 int setupScaleWeights(cl_float xscale, cl_float yscale, int width, int height, hb_oclscale_t *os, KernelEnv *kenv);
56 /**
57 * executive scale using opencl
58 * get filter args
59 * create output buffer
60 * create horizontal filter buffer
61 * create vertical filter buffer
62 * create kernels
64 int hb_ocl_scale_func( void **data, KernelEnv *kenv )
66 cl_int status;
68 cl_mem in_buf = data[0];
69 cl_mem out_buf = data[1];
70 int crop_top = (intptr_t)data[2];
71 int crop_bottom = (intptr_t)data[3];
72 int crop_left = (intptr_t)data[4];
73 int crop_right = (intptr_t)data[5];
74 cl_int in_frame_w = (intptr_t)data[6];
75 cl_int in_frame_h = (intptr_t)data[7];
76 cl_int out_frame_w = (intptr_t)data[8];
77 cl_int out_frame_h = (intptr_t)data[9];
78 hb_oclscale_t *os = data[10];
79 hb_buffer_t *in = data[11];
80 hb_buffer_t *out = data[12];
82 if (hb_ocl == NULL)
84 hb_error("hb_ocl_scale_func: OpenCL support not available");
85 return 0;
88 if (os->initialized == 0)
90 hb_log( "Scaling With OpenCL" );
91 if (kenv->isAMD != 0)
92 hb_log( "Using Zero Copy");
93 // create the block kernel
94 cl_int status;
95 os->m_kernel = hb_ocl->clCreateKernel(kenv->program, "frame_scale", &status);
97 os->initialized = 1;
101 // Use the new kernel
102 cl_event events[5];
103 int eventCount = 0;
105 if (kenv->isAMD == 0) {
106 status = hb_ocl->clEnqueueUnmapMemObject(kenv->command_queue,
107 in->cl.buffer, in->data, 0,
108 NULL, &events[eventCount++]);
109 status = hb_ocl->clEnqueueUnmapMemObject(kenv->command_queue,
110 out->cl.buffer, out->data, 0,
111 NULL, &events[eventCount++]);
114 cl_int srcPlaneOffset0 = in->plane[0].data - in->data;
115 cl_int srcPlaneOffset1 = in->plane[1].data - in->data;
116 cl_int srcPlaneOffset2 = in->plane[2].data - in->data;
117 cl_int srcRowWords0 = in->plane[0].stride;
118 cl_int srcRowWords1 = in->plane[1].stride;
119 cl_int srcRowWords2 = in->plane[2].stride;
120 cl_int dstPlaneOffset0 = out->plane[0].data - out->data;
121 cl_int dstPlaneOffset1 = out->plane[1].data - out->data;
122 cl_int dstPlaneOffset2 = out->plane[2].data - out->data;
123 cl_int dstRowWords0 = out->plane[0].stride;
124 cl_int dstRowWords1 = out->plane[1].stride;
125 cl_int dstRowWords2 = out->plane[2].stride;
127 if (crop_top != 0 || crop_bottom != 0 || crop_left != 0 || crop_right != 0) {
128 srcPlaneOffset0 += crop_left + crop_top * srcRowWords0;
129 srcPlaneOffset1 += crop_left / 2 + (crop_top / 2) * srcRowWords1;
130 srcPlaneOffset2 += crop_left / 2 + (crop_top / 2) * srcRowWords2;
131 in_frame_w = in_frame_w - crop_right - crop_left;
132 in_frame_h = in_frame_h - crop_bottom - crop_top;
135 cl_float xscale = (out_frame_w * 1.0f) / in_frame_w;
136 cl_float yscale = (out_frame_h * 1.0f) / in_frame_h;
137 setupScaleWeights(xscale, yscale, out_frame_w, out_frame_h, os, kenv);
139 HB_OCL_CHECK(hb_ocl->clSetKernelArg, os->m_kernel, 0, sizeof(cl_mem), &out_buf);
140 HB_OCL_CHECK(hb_ocl->clSetKernelArg, os->m_kernel, 1, sizeof(cl_mem), &in_buf);
141 HB_OCL_CHECK(hb_ocl->clSetKernelArg, os->m_kernel, 2, sizeof(cl_float), &xscale);
142 HB_OCL_CHECK(hb_ocl->clSetKernelArg, os->m_kernel, 3, sizeof(cl_float), &yscale);
143 HB_OCL_CHECK(hb_ocl->clSetKernelArg, os->m_kernel, 4, sizeof(cl_int), &srcPlaneOffset0);
144 HB_OCL_CHECK(hb_ocl->clSetKernelArg, os->m_kernel, 5, sizeof(cl_int), &srcPlaneOffset1);
145 HB_OCL_CHECK(hb_ocl->clSetKernelArg, os->m_kernel, 6, sizeof(cl_int), &srcPlaneOffset2);
146 HB_OCL_CHECK(hb_ocl->clSetKernelArg, os->m_kernel, 7, sizeof(cl_int), &dstPlaneOffset0);
147 HB_OCL_CHECK(hb_ocl->clSetKernelArg, os->m_kernel, 8, sizeof(cl_int), &dstPlaneOffset1);
148 HB_OCL_CHECK(hb_ocl->clSetKernelArg, os->m_kernel, 9, sizeof(cl_int), &dstPlaneOffset2);
149 HB_OCL_CHECK(hb_ocl->clSetKernelArg, os->m_kernel, 10, sizeof(cl_int), &srcRowWords0);
150 HB_OCL_CHECK(hb_ocl->clSetKernelArg, os->m_kernel, 11, sizeof(cl_int), &srcRowWords1);
151 HB_OCL_CHECK(hb_ocl->clSetKernelArg, os->m_kernel, 12, sizeof(cl_int), &srcRowWords2);
152 HB_OCL_CHECK(hb_ocl->clSetKernelArg, os->m_kernel, 13, sizeof(cl_int), &dstRowWords0);
153 HB_OCL_CHECK(hb_ocl->clSetKernelArg, os->m_kernel, 14, sizeof(cl_int), &dstRowWords1);
154 HB_OCL_CHECK(hb_ocl->clSetKernelArg, os->m_kernel, 15, sizeof(cl_int), &dstRowWords2);
155 HB_OCL_CHECK(hb_ocl->clSetKernelArg, os->m_kernel, 16, sizeof(cl_int), &in_frame_w);
156 HB_OCL_CHECK(hb_ocl->clSetKernelArg, os->m_kernel, 17, sizeof(cl_int), &in_frame_h);
157 HB_OCL_CHECK(hb_ocl->clSetKernelArg, os->m_kernel, 18, sizeof(cl_int), &out_frame_w);
158 HB_OCL_CHECK(hb_ocl->clSetKernelArg, os->m_kernel, 19, sizeof(cl_int), &out_frame_h);
159 HB_OCL_CHECK(hb_ocl->clSetKernelArg, os->m_kernel, 20, sizeof(cl_mem), &os->bicubic_x_weights);
160 HB_OCL_CHECK(hb_ocl->clSetKernelArg, os->m_kernel, 21, sizeof(cl_mem), &os->bicubic_y_weights);
162 size_t workOffset[] = { 0, 0, 0 };
163 size_t globalWorkSize[] = { 1, 1, 1 };
164 size_t localWorkSize[] = { 1, 1, 1 };
166 int xgroups = (out_frame_w + 63) / 64;
167 int ygroups = (out_frame_h + 15) / 16;
169 localWorkSize[0] = 64;
170 localWorkSize[1] = 1;
171 localWorkSize[2] = 1;
172 globalWorkSize[0] = xgroups * 64;
173 globalWorkSize[1] = ygroups;
174 globalWorkSize[2] = 3;
176 HB_OCL_CHECK(hb_ocl->clEnqueueNDRangeKernel, kenv->command_queue,
177 os->m_kernel, 3, workOffset, globalWorkSize, localWorkSize,
178 eventCount, eventCount == 0 ? NULL : &events[0], &events[eventCount]);
179 ++eventCount;
181 if (kenv->isAMD == 0) {
182 in->data = hb_ocl->clEnqueueMapBuffer(kenv->command_queue, in->cl.buffer,
183 CL_FALSE, CL_MAP_READ|CL_MAP_WRITE,
184 0, in->alloc,
185 eventCount ? 1 : 0,
186 eventCount ? &events[eventCount - 1] : NULL,
187 &events[eventCount], &status);
188 out->data = hb_ocl->clEnqueueMapBuffer(kenv->command_queue, out->cl.buffer,
189 CL_FALSE, CL_MAP_READ|CL_MAP_WRITE,
190 0, out->alloc,
191 eventCount ? 1 : 0,
192 eventCount ? &events[eventCount - 1] : NULL,
193 &events[eventCount + 1], &status);
194 eventCount += 2;
197 hb_ocl->clFlush(kenv->command_queue);
198 hb_ocl->clWaitForEvents(eventCount, &events[0]);
199 int i;
200 for (i = 0; i < eventCount; ++i)
202 hb_ocl->clReleaseEvent(events[i]);
206 return 1;
209 int setupScaleWeights(cl_float xscale, cl_float yscale, int width, int height, hb_oclscale_t *os, KernelEnv *kenv)
211 cl_int status;
213 if (hb_ocl == NULL)
215 hb_error("setupScaleWeights: OpenCL support not available");
216 return 1;
219 if (os->xscale != xscale || os->width < width)
221 cl_float *xweights = hb_bicubic_weights(xscale, width);
222 HB_OCL_BUF_FREE (hb_ocl, os->bicubic_x_weights);
223 HB_OCL_BUF_CREATE(hb_ocl, os->bicubic_x_weights, CL_MEM_READ_ONLY,
224 sizeof(cl_float) * width * 4);
225 HB_OCL_CHECK(hb_ocl->clEnqueueWriteBuffer, kenv->command_queue, os->bicubic_x_weights,
226 CL_TRUE, 0, sizeof(cl_float) * width * 4, xweights, 0, NULL, NULL);
227 os->width = width;
228 os->xscale = xscale;
229 free(xweights);
232 if ((os->yscale != yscale) || (os->height < height))
234 cl_float *yweights = hb_bicubic_weights(yscale, height);
235 HB_OCL_BUF_FREE (hb_ocl, os->bicubic_y_weights);
236 HB_OCL_BUF_CREATE(hb_ocl, os->bicubic_y_weights, CL_MEM_READ_ONLY,
237 sizeof(cl_float) * height * 4);
238 HB_OCL_CHECK(hb_ocl->clEnqueueWriteBuffer, kenv->command_queue, os->bicubic_y_weights,
239 CL_TRUE, 0, sizeof(cl_float) * height * 4, yweights, 0, NULL, NULL);
240 os->height = height;
241 os->yscale = yscale;
242 free(yweights);
244 return 0;
249 * function describe: this function is used to scaling video frame. it uses the gausi scaling algorithm
250 * parameter:
251 * inputFrameBuffer: the source video frame opencl buffer
252 * outputdata: the destination video frame buffer
253 * inputWidth: the width of the source video frame
254 * inputHeight: the height of the source video frame
255 * outputWidth: the width of destination video frame
256 * outputHeight: the height of destination video frame
260 static int s_scale_init_flag = 0;
262 int do_scale_init()
264 if ( s_scale_init_flag==0 )
266 int st = hb_register_kernel_wrapper( "frame_scale", hb_ocl_scale_func );
267 if( !st )
269 hb_log( "register kernel[%s] failed", "frame_scale" );
270 return 0;
272 s_scale_init_flag++;
274 return 1;
278 int hb_ocl_scale(hb_buffer_t *in, hb_buffer_t *out, int *crop, hb_oclscale_t *os)
280 void *data[13];
282 if (do_scale_init() == 0)
283 return 0;
285 data[0] = in->cl.buffer;
286 data[1] = out->cl.buffer;
287 data[2] = (void*)(intptr_t)(crop[0]);
288 data[3] = (void*)(intptr_t)(crop[1]);
289 data[4] = (void*)(intptr_t)(crop[2]);
290 data[5] = (void*)(intptr_t)(crop[3]);
291 data[6] = (void*)(intptr_t)(in->f.width);
292 data[7] = (void*)(intptr_t)(in->f.height);
293 data[8] = (void*)(intptr_t)(out->f.width);
294 data[9] = (void*)(intptr_t)(out->f.height);
295 data[10] = os;
296 data[11] = in;
297 data[12] = out;
299 if( !hb_run_kernel( "frame_scale", data ) )
300 hb_log( "run kernel[%s] failed", "frame_scale" );
301 return 0;