3 Copyright (c) 2003-2015 HandBrake Team
4 This file is part of the HandBrake source code
5 Homepage: <http://handbrake.fr/>.
6 It may be used under the terms of the GNU General Public License v2.
7 For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html
9 Authors: Peng Gao <peng@multicorewareinc.com> <http://www.multicorewareinc.com/>
10 Li Cao <li@multicorewareinc.com> <http://www.multicorewareinc.com/>
17 #include "openclwrapper.h"
22 cl_float
cubic(cl_float x
)
28 return (_A
+ 2.0f
) * (x
* x
* x
) - (_A
+ 3.0f
) * (x
* x
) + 0 + 1;
30 return (_A
) * (x
* x
* x
) - (5.0f
* _A
) * (x
* x
) + (8.0f
* _A
) * x
- (4.0f
* _A
);
36 cl_float
*hb_bicubic_weights(cl_float scale
, int length
)
38 cl_float
*weights
= (cl_float
*) malloc(length
* sizeof(cl_float
) * 4);
41 cl_float
*out
= weights
;
42 for (i
= 0; i
< length
; ++i
)
44 cl_float x
= i
/ scale
;
45 cl_float dx
= x
- (int)x
;
46 *out
++ = cubic(-dx
- 1.0f
);
48 *out
++ = cubic(-dx
+ 1.0f
);
49 *out
++ = cubic(-dx
+ 2.0f
);
54 int setupScaleWeights(cl_float xscale
, cl_float yscale
, int width
, int height
, hb_oclscale_t
*os
, KernelEnv
*kenv
);
57 * executive scale using opencl
59 * create output buffer
60 * create horizontal filter buffer
61 * create vertical filter buffer
64 int hb_ocl_scale_func( void **data
, KernelEnv
*kenv
)
68 cl_mem in_buf
= data
[0];
69 cl_mem out_buf
= data
[1];
70 int crop_top
= (intptr_t)data
[2];
71 int crop_bottom
= (intptr_t)data
[3];
72 int crop_left
= (intptr_t)data
[4];
73 int crop_right
= (intptr_t)data
[5];
74 cl_int in_frame_w
= (intptr_t)data
[6];
75 cl_int in_frame_h
= (intptr_t)data
[7];
76 cl_int out_frame_w
= (intptr_t)data
[8];
77 cl_int out_frame_h
= (intptr_t)data
[9];
78 hb_oclscale_t
*os
= data
[10];
79 hb_buffer_t
*in
= data
[11];
80 hb_buffer_t
*out
= data
[12];
84 hb_error("hb_ocl_scale_func: OpenCL support not available");
88 if (os
->initialized
== 0)
90 hb_log( "Scaling With OpenCL" );
92 hb_log( "Using Zero Copy");
93 // create the block kernel
95 os
->m_kernel
= hb_ocl
->clCreateKernel(kenv
->program
, "frame_scale", &status
);
101 // Use the new kernel
105 if (kenv
->isAMD
== 0) {
106 status
= hb_ocl
->clEnqueueUnmapMemObject(kenv
->command_queue
,
107 in
->cl
.buffer
, in
->data
, 0,
108 NULL
, &events
[eventCount
++]);
109 status
= hb_ocl
->clEnqueueUnmapMemObject(kenv
->command_queue
,
110 out
->cl
.buffer
, out
->data
, 0,
111 NULL
, &events
[eventCount
++]);
114 cl_int srcPlaneOffset0
= in
->plane
[0].data
- in
->data
;
115 cl_int srcPlaneOffset1
= in
->plane
[1].data
- in
->data
;
116 cl_int srcPlaneOffset2
= in
->plane
[2].data
- in
->data
;
117 cl_int srcRowWords0
= in
->plane
[0].stride
;
118 cl_int srcRowWords1
= in
->plane
[1].stride
;
119 cl_int srcRowWords2
= in
->plane
[2].stride
;
120 cl_int dstPlaneOffset0
= out
->plane
[0].data
- out
->data
;
121 cl_int dstPlaneOffset1
= out
->plane
[1].data
- out
->data
;
122 cl_int dstPlaneOffset2
= out
->plane
[2].data
- out
->data
;
123 cl_int dstRowWords0
= out
->plane
[0].stride
;
124 cl_int dstRowWords1
= out
->plane
[1].stride
;
125 cl_int dstRowWords2
= out
->plane
[2].stride
;
127 if (crop_top
!= 0 || crop_bottom
!= 0 || crop_left
!= 0 || crop_right
!= 0) {
128 srcPlaneOffset0
+= crop_left
+ crop_top
* srcRowWords0
;
129 srcPlaneOffset1
+= crop_left
/ 2 + (crop_top
/ 2) * srcRowWords1
;
130 srcPlaneOffset2
+= crop_left
/ 2 + (crop_top
/ 2) * srcRowWords2
;
131 in_frame_w
= in_frame_w
- crop_right
- crop_left
;
132 in_frame_h
= in_frame_h
- crop_bottom
- crop_top
;
135 cl_float xscale
= (out_frame_w
* 1.0f
) / in_frame_w
;
136 cl_float yscale
= (out_frame_h
* 1.0f
) / in_frame_h
;
137 setupScaleWeights(xscale
, yscale
, out_frame_w
, out_frame_h
, os
, kenv
);
139 HB_OCL_CHECK(hb_ocl
->clSetKernelArg
, os
->m_kernel
, 0, sizeof(cl_mem
), &out_buf
);
140 HB_OCL_CHECK(hb_ocl
->clSetKernelArg
, os
->m_kernel
, 1, sizeof(cl_mem
), &in_buf
);
141 HB_OCL_CHECK(hb_ocl
->clSetKernelArg
, os
->m_kernel
, 2, sizeof(cl_float
), &xscale
);
142 HB_OCL_CHECK(hb_ocl
->clSetKernelArg
, os
->m_kernel
, 3, sizeof(cl_float
), &yscale
);
143 HB_OCL_CHECK(hb_ocl
->clSetKernelArg
, os
->m_kernel
, 4, sizeof(cl_int
), &srcPlaneOffset0
);
144 HB_OCL_CHECK(hb_ocl
->clSetKernelArg
, os
->m_kernel
, 5, sizeof(cl_int
), &srcPlaneOffset1
);
145 HB_OCL_CHECK(hb_ocl
->clSetKernelArg
, os
->m_kernel
, 6, sizeof(cl_int
), &srcPlaneOffset2
);
146 HB_OCL_CHECK(hb_ocl
->clSetKernelArg
, os
->m_kernel
, 7, sizeof(cl_int
), &dstPlaneOffset0
);
147 HB_OCL_CHECK(hb_ocl
->clSetKernelArg
, os
->m_kernel
, 8, sizeof(cl_int
), &dstPlaneOffset1
);
148 HB_OCL_CHECK(hb_ocl
->clSetKernelArg
, os
->m_kernel
, 9, sizeof(cl_int
), &dstPlaneOffset2
);
149 HB_OCL_CHECK(hb_ocl
->clSetKernelArg
, os
->m_kernel
, 10, sizeof(cl_int
), &srcRowWords0
);
150 HB_OCL_CHECK(hb_ocl
->clSetKernelArg
, os
->m_kernel
, 11, sizeof(cl_int
), &srcRowWords1
);
151 HB_OCL_CHECK(hb_ocl
->clSetKernelArg
, os
->m_kernel
, 12, sizeof(cl_int
), &srcRowWords2
);
152 HB_OCL_CHECK(hb_ocl
->clSetKernelArg
, os
->m_kernel
, 13, sizeof(cl_int
), &dstRowWords0
);
153 HB_OCL_CHECK(hb_ocl
->clSetKernelArg
, os
->m_kernel
, 14, sizeof(cl_int
), &dstRowWords1
);
154 HB_OCL_CHECK(hb_ocl
->clSetKernelArg
, os
->m_kernel
, 15, sizeof(cl_int
), &dstRowWords2
);
155 HB_OCL_CHECK(hb_ocl
->clSetKernelArg
, os
->m_kernel
, 16, sizeof(cl_int
), &in_frame_w
);
156 HB_OCL_CHECK(hb_ocl
->clSetKernelArg
, os
->m_kernel
, 17, sizeof(cl_int
), &in_frame_h
);
157 HB_OCL_CHECK(hb_ocl
->clSetKernelArg
, os
->m_kernel
, 18, sizeof(cl_int
), &out_frame_w
);
158 HB_OCL_CHECK(hb_ocl
->clSetKernelArg
, os
->m_kernel
, 19, sizeof(cl_int
), &out_frame_h
);
159 HB_OCL_CHECK(hb_ocl
->clSetKernelArg
, os
->m_kernel
, 20, sizeof(cl_mem
), &os
->bicubic_x_weights
);
160 HB_OCL_CHECK(hb_ocl
->clSetKernelArg
, os
->m_kernel
, 21, sizeof(cl_mem
), &os
->bicubic_y_weights
);
162 size_t workOffset
[] = { 0, 0, 0 };
163 size_t globalWorkSize
[] = { 1, 1, 1 };
164 size_t localWorkSize
[] = { 1, 1, 1 };
166 int xgroups
= (out_frame_w
+ 63) / 64;
167 int ygroups
= (out_frame_h
+ 15) / 16;
169 localWorkSize
[0] = 64;
170 localWorkSize
[1] = 1;
171 localWorkSize
[2] = 1;
172 globalWorkSize
[0] = xgroups
* 64;
173 globalWorkSize
[1] = ygroups
;
174 globalWorkSize
[2] = 3;
176 HB_OCL_CHECK(hb_ocl
->clEnqueueNDRangeKernel
, kenv
->command_queue
,
177 os
->m_kernel
, 3, workOffset
, globalWorkSize
, localWorkSize
,
178 eventCount
, eventCount
== 0 ? NULL
: &events
[0], &events
[eventCount
]);
181 if (kenv
->isAMD
== 0) {
182 in
->data
= hb_ocl
->clEnqueueMapBuffer(kenv
->command_queue
, in
->cl
.buffer
,
183 CL_FALSE
, CL_MAP_READ
|CL_MAP_WRITE
,
186 eventCount
? &events
[eventCount
- 1] : NULL
,
187 &events
[eventCount
], &status
);
188 out
->data
= hb_ocl
->clEnqueueMapBuffer(kenv
->command_queue
, out
->cl
.buffer
,
189 CL_FALSE
, CL_MAP_READ
|CL_MAP_WRITE
,
192 eventCount
? &events
[eventCount
- 1] : NULL
,
193 &events
[eventCount
+ 1], &status
);
197 hb_ocl
->clFlush(kenv
->command_queue
);
198 hb_ocl
->clWaitForEvents(eventCount
, &events
[0]);
200 for (i
= 0; i
< eventCount
; ++i
)
202 hb_ocl
->clReleaseEvent(events
[i
]);
209 int setupScaleWeights(cl_float xscale
, cl_float yscale
, int width
, int height
, hb_oclscale_t
*os
, KernelEnv
*kenv
)
215 hb_error("setupScaleWeights: OpenCL support not available");
219 if (os
->xscale
!= xscale
|| os
->width
< width
)
221 cl_float
*xweights
= hb_bicubic_weights(xscale
, width
);
222 HB_OCL_BUF_FREE (hb_ocl
, os
->bicubic_x_weights
);
223 HB_OCL_BUF_CREATE(hb_ocl
, os
->bicubic_x_weights
, CL_MEM_READ_ONLY
,
224 sizeof(cl_float
) * width
* 4);
225 HB_OCL_CHECK(hb_ocl
->clEnqueueWriteBuffer
, kenv
->command_queue
, os
->bicubic_x_weights
,
226 CL_TRUE
, 0, sizeof(cl_float
) * width
* 4, xweights
, 0, NULL
, NULL
);
232 if ((os
->yscale
!= yscale
) || (os
->height
< height
))
234 cl_float
*yweights
= hb_bicubic_weights(yscale
, height
);
235 HB_OCL_BUF_FREE (hb_ocl
, os
->bicubic_y_weights
);
236 HB_OCL_BUF_CREATE(hb_ocl
, os
->bicubic_y_weights
, CL_MEM_READ_ONLY
,
237 sizeof(cl_float
) * height
* 4);
238 HB_OCL_CHECK(hb_ocl
->clEnqueueWriteBuffer
, kenv
->command_queue
, os
->bicubic_y_weights
,
239 CL_TRUE
, 0, sizeof(cl_float
) * height
* 4, yweights
, 0, NULL
, NULL
);
249 * function describe: this function is used to scaling video frame. it uses the gausi scaling algorithm
251 * inputFrameBuffer: the source video frame opencl buffer
252 * outputdata: the destination video frame buffer
253 * inputWidth: the width of the source video frame
254 * inputHeight: the height of the source video frame
255 * outputWidth: the width of destination video frame
256 * outputHeight: the height of destination video frame
260 static int s_scale_init_flag
= 0;
264 if ( s_scale_init_flag
==0 )
266 int st
= hb_register_kernel_wrapper( "frame_scale", hb_ocl_scale_func
);
269 hb_log( "register kernel[%s] failed", "frame_scale" );
278 int hb_ocl_scale(hb_buffer_t
*in
, hb_buffer_t
*out
, int *crop
, hb_oclscale_t
*os
)
282 if (do_scale_init() == 0)
285 data
[0] = in
->cl
.buffer
;
286 data
[1] = out
->cl
.buffer
;
287 data
[2] = (void*)(intptr_t)(crop
[0]);
288 data
[3] = (void*)(intptr_t)(crop
[1]);
289 data
[4] = (void*)(intptr_t)(crop
[2]);
290 data
[5] = (void*)(intptr_t)(crop
[3]);
291 data
[6] = (void*)(intptr_t)(in
->f
.width
);
292 data
[7] = (void*)(intptr_t)(in
->f
.height
);
293 data
[8] = (void*)(intptr_t)(out
->f
.width
);
294 data
[9] = (void*)(intptr_t)(out
->f
.height
);
299 if( !hb_run_kernel( "frame_scale", data
) )
300 hb_log( "run kernel[%s] failed", "frame_scale" );