WinGui: Fix another instance of the Caliburn vs Json.net sillyness where objects...
[HandBrake.git] / libhb / openclkernels.h
blob5ab5124831b426e1a90b923ca021d0587f5bd831
1 /* openclkernels.h
3 Copyright (c) 2003-2015 HandBrake Team
4 This file is part of the HandBrake source code
5 Homepage: <http://handbrake.fr/>.
6 It may be used under the terms of the GNU General Public License v2.
7 For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html
9 Authors: Peng Gao <peng@multicorewareinc.com> <http://www.multicorewareinc.com/>
10 Li Cao <li@multicorewareinc.com> <http://www.multicorewareinc.com/>
14 #ifndef USE_EXTERNAL_KERNEL
16 #define KERNEL( ... )# __VA_ARGS__
19 char *kernel_src_hscale = KERNEL (
21 typedef unsigned char fixed8;
23 /*******************************************************************************************************
24 dst: Horizontal scale destination;
25 src: YUV content in opencl buf;
26 hf_Y: Horizontal filter coefficients for Y planes;
27 hf_UV: Horizontal filter coefficients for UV planes;
28 hi_Y: Horizontal filter index for Y planes;
29 hi_UV: Horizontal filter index for UV planes;
30 stride: Src width;
31 filter_len: Length of filter;
32 ********************************************************************************************************/
33 kernel void frame_h_scale (
34 global fixed8 *src,
35 global float *hf_Y,
36 global float *hf_UV,
37 global int *hi_Y,
38 global int *hi_UV,
39 global fixed8 *dst,
40 int stride, //src_width
41 int filter_len
44 int x = get_global_id( 0 );
45 int y = get_global_id( 1 );
46 int width = get_global_size( 0 );
47 int height = get_global_size( 1 );
48 float result_Y = 0, result_U = 0, result_V = 0;
49 int i = 0;
51 global fixed8 *src_Y = src;
52 global fixed8 *src_U = src_Y + stride * height;
53 global fixed8 *src_V = src_U + (stride >> 1) * (height >> 1);
55 global fixed8 *dst_Y = dst;
56 global fixed8 *dst_U = dst_Y + width * height;
57 global fixed8 *dst_V = dst_U + (width >> 1) * (height >> 1);
59 int xy = y * width + x;
60 global fixed8 *rowdata_Y = src_Y + (y * stride);
61 for( int i = 0; i < filter_len; i++ )
63 result_Y += ( hf_Y[x + i * width] * rowdata_Y[hi_Y[x] + i]);
65 dst_Y[xy] = result_Y;
67 if( y < (height >> 1) && x < (width >> 1) )
69 int xy = y * (width >> 1) + x;
70 global fixed8 *rowdata_U = src_U + (y * (stride >> 1));
71 global fixed8 *rowdata_V = src_V + (y * (stride >> 1));
72 for( i = 0; i < filter_len; i++ )
74 result_U += ( hf_UV[x + i * (width >> 1)] * rowdata_U[hi_UV[x] + i]);
75 result_V += ( hf_UV[x + i * (width >> 1)] * rowdata_V[hi_UV[x] + i]);
77 dst_U[xy] = result_U;
78 dst_V[xy] = result_V;
83 /*******************************************************************************************************
84 dst: Vertical scale destination;
85 src: YUV content in opencl buf;
86 hf_Y: Vertical filter coefficients for Y planes;
87 hf_UV: Vertical filter coefficients for UV planes;
88 hi_Y: Vertical filter index for Y planes;
89 hi_UV: Vertical filter index for UV planes;
90 stride: Src height;
91 filter_len: Length of filter;
92 ********************************************************************************************************/
93 char *kernel_src_vscale = KERNEL (
95 kernel void frame_v_scale (
96 global fixed8 *src,
97 global float *vf_Y,
98 global float *vf_UV,
99 global int *vi_Y,
100 global int *vi_UV,
101 global fixed8 *dst,
102 int src_height,
103 int filter_len
106 int x = get_global_id( 0 );
107 int y = get_global_id( 1 );
108 int width = get_global_size( 0 );
109 int height = get_global_size( 1 );
110 float result_Y = 0, result_U = 0, result_V = 0;
111 int i = 0;
113 global fixed8 *src_Y = src;
114 global fixed8 *src_U = src_Y + src_height * width;
115 global fixed8 *src_V = src_U + (src_height >> 1) * (width >> 1);
117 global fixed8 *dst_Y = dst;
118 global fixed8 *dst_U = dst_Y + height * width;
119 global fixed8 *dst_V = dst_U + (height >> 1) * (width >> 1);
121 int xy = y * width + x;
122 for( i = 0; i < filter_len; i++ )
124 result_Y += vf_Y[y + i * height] * src_Y[(vi_Y[y] + i) * width + x];
126 dst_Y[xy] = result_Y;
128 if( y < (height >> 1) && x < (width >> 1) )
130 int xy = y * (width >> 1) + x;
131 for( i = 0; i < filter_len; i++ )
133 result_U += vf_UV[y + i * (height >> 1)] * src_U[(vi_UV[y] + i) * (width >> 1) + x];
134 result_V += vf_UV[y + i * (height >> 1)] * src_V[(vi_UV[y] + i) * (width >> 1) + x];
136 dst_U[xy] = result_U;
137 dst_V[xy] = result_V;
142 /*******************************************************************************************************
143 input: Input buffer;
144 output: Output buffer;
145 w: Width of frame;
146 h: Height of frame;
147 ********************************************************************************************************/
148 char *kernel_src_nvtoyuv = KERNEL (
150 kernel void nv12toyuv ( global char *input, global char* output, int w, int h )
152 int x = get_global_id( 0 );
153 int y = get_global_id( 1 );
154 int idx = y * (w >> 1) + x;
155 vstore4((vload4( 0, input + (idx << 2))), 0, output + (idx << 2)); //Y
156 char2 uv = vload2( 0, input + (idx << 1) + w * h );
157 output[idx + w * h] = uv.s0;
158 output[idx + w * h + ((w * h) >> 2)] = uv.s1;
162 /*******************************************************************************************************
163 dst: Horizontal scale destination;
164 src: YUV content in opencl buf;
165 yfilter: Opencl memory of horizontal filter coefficients for luma/alpha planes;
166 yfilterPos: Opencl memory of horizontal filter starting positions for each dst[i] for luma/alpha planes;
167 yfilterSize: Horizontal filter size for luma/alpha pixels;
168 cfilter: Opencl memory of horizontal filter coefficients for chroma planes;
169 cfilterPos: Opencl memory of horizontal filter starting positions for each dst[i] for chroma planes;
170 cfilterSize: Horizontal filter size for chroma pixels;
171 dstStride: Width of destination luma/alpha planes;
172 dstChrStride: Width of destination chroma planes;
173 ********************************************************************************************************/
175 char *kernel_src_hscaleall = KERNEL (
177 kernel void hscale_all_opencl (
178 global short *dst,
179 const global unsigned char *src,
180 const global short *yfilter,
181 const global int *yfilterPos,
182 int yfilterSize,
183 const global short *cfilter,
184 const global int *cfilterPos,
185 int cfilterSize,
186 int dstWidth,
187 int dstHeight,
188 int srcWidth,
189 int srcHeight,
190 int dstStride,
191 int dstChrStride,
192 int srcStride,
193 int srcChrStride)
195 int w = get_global_id(0);
196 int h = get_global_id(1);
198 int chrWidth = get_global_size(0);
199 int chrHeight = get_global_size(1);
201 int srcPos1 = h * srcStride + yfilterPos[w];
202 int srcPos2 = h * srcStride + yfilterPos[w + chrWidth];
203 int srcPos3 = (h + (srcHeight >> 1)) * srcStride + yfilterPos[w];
204 int srcPos4 = (h + (srcHeight >> 1)) * srcStride + yfilterPos[w + chrWidth];
205 int srcc1Pos = srcStride * srcHeight + (h) * (srcChrStride) + cfilterPos[w];
206 int srcc2Pos = srcc1Pos + ((srcChrStride)*(chrHeight));
208 int val1 = 0;
209 int val2 = 0;
210 int val3 = 0;
211 int val4 = 0;
212 int val5 = 0;
213 int val6 = 0;
215 int filterPos1 = yfilterSize * w;
216 int filterPos2 = yfilterSize * (w + chrWidth);
217 int cfilterPos1 = cfilterSize * w;
219 int j;
220 for (j = 0; j < yfilterSize; j++)
222 val1 += src[srcPos1 + j] * yfilter[filterPos1+ j];
223 val2 += src[srcPos2 + j] * yfilter[filterPos2 + j];
224 val3 += src[srcPos3 + j] * yfilter[filterPos1 + j];
225 val4 += src[srcPos4 + j] * yfilter[filterPos2 + j];
226 val5 += src[srcc1Pos+j] * cfilter[cfilterPos1 + j];
227 val6 += src[srcc2Pos+j] * cfilter[cfilterPos1 + j];
229 int dstPos1 = h *dstStride;
230 int dstPos2 = (h + chrHeight) * dstStride;
232 dst[dstPos1 + w] = ((val1 >> 7) > ((1 << 15) - 1) ? ((1 << 15) - 1) : (val1 >> 7));
233 dst[dstPos1 + w + chrWidth] = ((val2 >> 7) > ((1 << 15) - 1) ? ((1 << 15) - 1) : (val2 >> 7));
234 dst[dstPos2 + w] = ((val3 >> 7) > ((1 << 15) - 1) ? ((1 << 15) - 1) : (val3 >> 7));
235 dst[dstPos2 + w + chrWidth] = ((val4 >> 7) > ((1 << 15) - 1) ? ((1 << 15) - 1) : (val4 >> 7));
237 int dstPos3 = h * (dstChrStride) + w + dstStride * dstHeight;
238 int dstPos4 = h * (dstChrStride) + w + dstStride * dstHeight + ((dstChrStride) * chrHeight);
239 dst[dstPos3] = ((val5 >> 7) > ((1 << 15) - 1) ? ((1 << 15) - 1) : (val5 >> 7));
240 dst[dstPos4] = ((val6 >> 7) > ((1 << 15) - 1) ? ((1 << 15) - 1) : (val6 >> 7));
244 char *kernel_src_hscalefast = KERNEL (
246 kernel void hscale_fast_opencl (
247 global short *dst,
248 const global unsigned char *src,
249 int xInc,
250 int chrXInc,
251 int dstWidth,
252 int dstHeight,
253 int srcWidth,
254 int srcHeight,
255 int dstStride,
256 int dstChrStride,
257 int srcStride,
258 int srcChrStride)
261 int w = get_global_id(0);
262 int h = get_global_id(1);
264 int chrWidth = get_global_size(0);
265 int chrHeight = get_global_size(1);
266 int xpos1 = 0;
267 int xpos2 = 0;
268 int xx = xpos1 >> 16;
269 int xalpha = (xpos1 & 0xFFFF) >> 9;
270 dst[h * dstStride + w] = (src[h * srcStride + xx] << 7) + (src[h * srcStride + xx + 1] -src[h * srcStride + xx]) * xalpha;
271 int lowpart = h + (chrHeight);
272 dst[lowpart * dstStride + w] = (src[lowpart * srcStride + xx] << 7) + (src[lowpart * srcStride + xx + 1] - src[lowpart * srcStride + xx]) * xalpha;
274 int inv_i = w * xInc >> 16;
275 if( inv_i >= srcWidth - 1)
277 dst[h*dstStride + w] = src[h*srcStride + srcWidth-1]*128;
278 dst[lowpart*dstStride + w] = src[lowpart*srcStride + srcWidth - 1] * 128;
281 int rightpart = w + (chrWidth);
282 xx = xpos2 >> 16;
283 xalpha = (xpos2 & 0xFFFF) >> 9;
284 dst[h * dstStride + rightpart] = (src[h *srcStride + xx] << 7) + (src[h * srcStride + xx + 1] - src[h * srcStride + xx]) * xalpha;
285 dst[lowpart * dstStride + rightpart] = (src[lowpart * srcStride + xx] << 7) + (src[lowpart * srcStride + xx + 1] - src[lowpart * srcStride + xx]) * xalpha;
286 inv_i = rightpart * xInc >> 16;
287 if( inv_i >= srcWidth - 1)
289 dst[h * dstStride + rightpart] = src[h * srcStride + srcWidth - 1] * 128;
290 dst[lowpart * dstStride + rightpart] = src[lowpart * srcStride + srcWidth - 1] * 128;
293 int xpos = 0;
294 xpos = chrXInc * w;
295 xx = xpos >> 16;
296 xalpha = (xpos & 0xFFFF) >> 9;
297 src += srcStride * srcHeight;
298 dst += dstStride * dstHeight;
299 dst[h * (dstChrStride) + w] = (src[h * (srcChrStride) + xx] * (xalpha^127) + src[h * (srcChrStride) + xx + 1] * xalpha);
300 inv_i = w * xInc >> 16;
301 if( inv_i >= (srcWidth >> 1) - 1)
303 dst[h * (dstChrStride) + w] = src[h * (srcChrStride) + (srcWidth >> 1) -1]*128;
306 xpos = chrXInc * (w);
307 xx = xpos >> 16;
308 src += srcChrStride * srcHeight >> 1;
309 dst += (dstChrStride * chrHeight);
310 dst[h * (dstChrStride) + w] = (src[h * (srcChrStride) + xx] * (xalpha^127) + src[h * (srcChrStride) + xx + 1 ] * xalpha);
312 if( inv_i >= (srcWidth >> 1) - 1)
314 //v channel:
315 dst[h * (dstChrStride) + w] = src[h * (srcChrStride) + (srcWidth >> 1) -1] * 128;
320 char *kernel_src_vscalealldither = KERNEL (
322 kernel void vscale_all_dither_opencl (
323 global unsigned char *dst,
324 const global short *src,
325 const global short *yfilter,
326 int yfilterSize,
327 const global short *cfilter,
328 int cfilterSize,
329 const global int *yfilterPos,
330 const global int *cfilterPos,
331 int dstWidth,
332 int dstHeight,
333 int srcWidth,
334 int srcHeight,
335 int dstStride,
336 int dstChrStride,
337 int srcStride,
338 int srcChrStride)
340 const unsigned char hb_dither_8x8_128[8][8] = {
341 { 36, 68, 60, 92, 34, 66, 58, 90, },
342 { 100, 4, 124, 28, 98, 2, 122, 26, },
343 { 52, 84, 44, 76, 50, 82, 42, 74, },
344 { 116, 20, 108, 12, 114, 18, 106, 10, },
345 { 32, 64, 56, 88, 38, 70, 62, 94, },
346 { 96, 0, 120, 24, 102, 6, 126, 30, },
347 { 48, 80, 40, 72, 54, 86, 46, 78, },
348 { 112, 16, 104, 8, 118, 22, 110, 14, },
352 int w = get_global_id(0);
353 int h = get_global_id(1);
355 int chrWidth = get_global_size(0);
356 int chrHeight = get_global_size(1);
357 const unsigned char *local_up_dither;
358 const unsigned char *local_down_dither;
360 local_up_dither = hb_dither_8x8_128[h & 7];
361 local_down_dither = hb_dither_8x8_128[(h + chrHeight) & 7];
363 //yscale;
364 int srcPos1 = (yfilterPos[h]) * srcStride + w;
365 int srcPos2 = (yfilterPos[h]) * srcStride + w + (chrWidth);
366 int srcPos3 = (yfilterPos[h + chrHeight]) * srcStride + w;
367 int srcPos4 = (yfilterPos[h + chrHeight]) * srcStride + w + chrWidth;
368 int src1Pos = dstStride * srcHeight + (cfilterPos[h]) * dstChrStride + (w);
369 int src2Pos = dstStride * srcHeight + (dstChrStride*(srcHeight>>1)) + (cfilterPos[h]) * dstChrStride + w;
371 int val1 = (local_up_dither[w & 7] << 12); //y offset is 0;
372 int val2 = (local_up_dither[(w + chrWidth) & 7] << 12);
373 int val3 = (local_down_dither[w &7] << 12);
374 int val4 = (local_down_dither[(w + chrWidth) & 7] << 12);
375 int val5 = (local_up_dither[w & 7] << 12);
376 int val6 = (local_up_dither[(w + 3) & 7] << 12); // 3 is offset of the chrome channel.
378 int j;
379 int filterPos1 = h * yfilterSize;
380 int filterPos2 = ( h + chrHeight ) * yfilterSize;
381 for(j = 0; j < yfilterSize; j++)
383 val1 += src[srcPos1] * yfilter[filterPos1 + j];
384 srcPos1 += srcStride;
385 val2 += src[srcPos2] * yfilter[filterPos1 + j];
386 srcPos2 += srcStride;
387 val3 += src[srcPos3] * yfilter[filterPos2 + j];
388 srcPos3 += srcStride;
389 val4 += src[srcPos4] * yfilter[filterPos2 + j];
390 srcPos4 += srcStride;
391 val5 += src[src1Pos] * cfilter[filterPos1 + j];
392 val6 += src[src2Pos] * cfilter[filterPos1 + j];
393 src1Pos += dstChrStride;
394 src2Pos += dstChrStride;
396 dst[h * dstStride + w] = (((val1 >> 19)&(~0xFF)) ? ((-(val1 >> 19)) >> 31) : (val1 >> 19));
397 dst[h * dstStride + w + chrWidth] = (((val2 >> 19)&(~0xFF)) ? ((-(val2 >> 19)) >> 31) : (val2 >> 19));
398 dst[(h + chrHeight) * dstStride + w] = (((val3 >> 19)&(~0xFF)) ? ((-(val3 >> 19)) >> 31) : (val3 >> 19));
399 dst[(h + chrHeight) * dstStride + w + chrWidth] = (((val4 >> 19)&(~0xFF)) ? ((-(val4 >> 19)) >> 31) : (val4 >> 19));
401 int dst1Pos = dstStride * dstHeight + h*(dstChrStride)+(w);
402 int dst2Pos = (dstChrStride * chrHeight) + dst1Pos;
403 dst[dst1Pos] = (((val5 >> 19)&(~0xFF)) ? ((-(val5 >> 19)) >> 31) : (val5 >> 19));
404 dst[dst2Pos] = (((val6 >> 19)&(~0xFF)) ? ((-(val6 >> 19)) >> 31) : (val6 >> 19));
408 char *kernel_src_vscaleallnodither = KERNEL (
410 kernel void vscale_all_nodither_opencl (
411 global unsigned char *dst,
412 const global short *src,
413 const global short *yfilter,
414 int yfilterSize,
415 const global short *cfilter,
416 int cfilterSize,
417 const global int *yfilterPos,
418 const global int *cfilterPos,
419 int dstWidth,
420 int dstHeight,
421 int srcWidth,
422 int srcHeight,
423 int dstStride,
424 int dstChrStride,
425 int srcStride,
426 int srcChrStride)
428 const unsigned char hb_sws_pb_64[8] = {
429 64, 64, 64, 64, 64, 64, 64, 64
432 int w = get_global_id(0);
433 int h = get_global_id(1);
435 int chrWidth = get_global_size(0);
436 int chrHeight = get_global_size(1);
437 const unsigned char *local_up_dither;
438 const unsigned char *local_down_dither;
440 local_up_dither = hb_sws_pb_64;
441 local_down_dither = hb_sws_pb_64;
444 //yscale;
445 int srcPos1 = (yfilterPos[h]) * srcStride + w;
446 int srcPos2 = (yfilterPos[h]) * srcStride + w + (chrWidth);
447 int srcPos3 = (yfilterPos[h + chrHeight]) * srcStride + w;
448 int srcPos4 = (yfilterPos[h + chrHeight]) * srcStride + w + chrWidth;
449 int src1Pos = dstStride * srcHeight + (cfilterPos[h]) * dstChrStride + (w);
450 int src2Pos = dstStride * srcHeight + (dstChrStride*(srcHeight>>1)) + (cfilterPos[h]) * dstChrStride + w;
452 int val1 = (local_up_dither[w & 7] << 12); //y offset is 0;
453 int val2 = (local_up_dither[(w + chrWidth) & 7] << 12);
454 int val3 = (local_down_dither[w &7] << 12);
455 int val4 = (local_down_dither[(w + chrWidth) & 7] << 12);
456 int val5 = (local_up_dither[w & 7] << 12);
457 int val6 = (local_up_dither[(w + 3) & 7] << 12); // 3 is offset of the chrome channel.
460 int j;
461 int filterPos1 = h * yfilterSize;
462 int filterPos2 = ( h + chrHeight ) * yfilterSize;
463 for(j = 0; j < yfilterSize; j++)
465 val1 += src[srcPos1] * yfilter[filterPos1 + j];
466 srcPos1 += srcStride;
467 val2 += src[srcPos2] * yfilter[filterPos1 + j];
468 srcPos2 += srcStride;
469 val3 += src[srcPos3] * yfilter[filterPos2 + j];
470 srcPos3 += srcStride;
471 val4 += src[srcPos4] * yfilter[filterPos2 + j];
472 srcPos4 += srcStride;
473 val5 += src[src1Pos] * cfilter[filterPos1 + j];
474 val6 += src[src2Pos] * cfilter[filterPos1 + j];
475 src1Pos += dstChrStride;
476 src2Pos += dstChrStride;
478 dst[h * dstStride + w] = (((val1 >> 19)&(~0xFF)) ? ((-(val1 >> 19)) >> 31) : (val1 >> 19));
479 dst[h * dstStride + w + chrWidth] = (((val2 >> 19)&(~0xFF)) ? ((-(val2 >> 19)) >> 31) : (val2 >> 19));
480 dst[(h + chrHeight) * dstStride + w] = (((val3 >> 19)&(~0xFF)) ? ((-(val3 >> 19)) >> 31) : (val3 >> 19));
481 dst[(h + chrHeight) * dstStride + w + chrWidth] = (((val4 >> 19)&(~0xFF)) ? ((-(val4 >> 19)) >> 31) : (val4 >> 19));;
483 int dst1Pos = dstStride * dstHeight + h * (dstChrStride) + (w);
484 int dst2Pos = (dstChrStride * chrHeight) + dst1Pos;
485 dst[dst1Pos] = (((val5 >> 19)&(~0xFF)) ? ((-(val5 >> 19)) >> 31) : (val5 >> 19));
486 dst[dst2Pos] = (((val6 >> 19)&(~0xFF)) ? ((-(val6 >> 19)) >> 31) : (val6 >> 19));
490 char *kernel_src_vscalefast = KERNEL (
492 kernel void vscale_fast_opencl (
493 global unsigned char *dst,
494 const global short *src,
495 const global int *yfilterPos,
496 const global int *cfilterPos,
497 int dstWidth,
498 int dstHeight,
499 int srcWidth,
500 int srcHeight,
501 int dstStride,
502 int dstChrStride,
503 int srcStride,
504 int srcChrStride)
506 const unsigned char hb_sws_pb_64[8] = {
507 64, 64, 64, 64, 64, 64, 64, 64
510 int w = get_global_id(0);
511 int h = get_global_id(1);
513 int chrWidth = get_global_size(0);
514 int chrHeight = get_global_size(1);
516 const unsigned char *local_up_dither;
517 const unsigned char *local_down_dither;
519 local_up_dither = hb_sws_pb_64;
520 local_down_dither = hb_sws_pb_64;
523 int rightpart = w + chrWidth;
524 int bh = h + chrHeight; // bottom part
525 short val1 = (src[(yfilterPos[h]) * dstStride + w] + local_up_dither[(w + 0) & 7]) >> 7; //lum offset is 0;
526 short val2 = (src[(yfilterPos[h]) * dstStride + rightpart] + local_up_dither[rightpart & 7]) >> 7;
527 short val3 = (src[(yfilterPos[bh]) * dstStride + w] + local_down_dither[w & 7]) >> 7;
528 short val4 = (src[(yfilterPos[bh]) * dstStride + rightpart] + local_down_dither[rightpart & 7]) >> 7;
529 dst[h * dstStride + w] = ((val1&(~0xFF)) ? ((-val1) >> 31) : (val1));
530 dst[h * dstStride + rightpart] = ((val2&(~0xFF)) ? ((-val2) >> 31) : (val2));
531 dst[bh * dstStride + w] = ((val3&(~0xFF)) ? ((-val3) >> 31) : (val3));
532 dst[bh * dstStride + rightpart] = ((val4&(~0xFF)) ? ((-val4) >> 31) : (val4));
534 src += dstStride * srcHeight;
535 dst += dstStride * dstHeight;
536 val1 = (src[cfilterPos[h] * (dstChrStride) + w] + local_up_dither[ w & 7]) >> 7;
537 dst[h * (dstChrStride) + w] = ((val1&(~0xFF)) ? ((-val1) >> 31) : (val1));
539 src += dstChrStride * (srcHeight >> 1);
540 dst += dstChrStride * chrHeight;
541 val1 = (src[cfilterPos[h] * dstChrStride + w] + local_up_dither[ (w + 3) & 7] ) >> 7;
542 dst[h * dstChrStride + w] = ((val1&(~0xFF)) ? ((-val1) >> 31) : (val1));
547 char *kernel_src_scale = KERNEL (
549 __kernel __attribute__((reqd_work_group_size(64, 1, 1))) void frame_scale(__global uchar *dst,
550 __global const uchar *src,
551 const float xscale,
552 const float yscale,
553 const int srcPlaneOffset0,
554 const int srcPlaneOffset1,
555 const int srcPlaneOffset2,
556 const int dstPlaneOffset0,
557 const int dstPlaneOffset1,
558 const int dstPlaneOffset2,
559 const int srcRowWords0,
560 const int srcRowWords1,
561 const int srcRowWords2,
562 const int dstRowWords0,
563 const int dstRowWords1,
564 const int dstRowWords2,
565 const int srcWidth,
566 const int srcHeight,
567 const int dstWidth,
568 const int dstHeight,
569 __global const float4* restrict xweights,
570 __global const float4* restrict yweights
573 const int x = get_global_id(0);
574 const int y = get_global_id(1);
575 const int z = get_global_id(2);
577 // Abort work items outside the dst image bounds.
579 if ((get_group_id(0) * 64 >= (dstWidth >> ((z == 0) ? 0 : 1))) || (get_group_id(1) * 16 >= (dstHeight >> ((z == 0) ? 0 : 1))))
580 return;
582 const int srcPlaneOffset = (z == 0) ? srcPlaneOffset0 : ((z == 1) ? srcPlaneOffset1 : srcPlaneOffset2);
583 const int dstPlaneOffset = (z == 0) ? dstPlaneOffset0 : ((z == 1) ? dstPlaneOffset1 : dstPlaneOffset2);
584 const int srcRowWords = (z == 0) ? srcRowWords0: ((z == 1) ? srcRowWords1 : srcRowWords2);
585 const int dstRowWords = (z == 0) ? dstRowWords0: ((z == 1) ? dstRowWords1 : dstRowWords2);
587 __local uchar pixels[64 * 36];
588 const int localRowPixels = 64;
589 const int groupHeight = 16; // src pixel height output by the workgroup
590 const int ypad = 2;
591 const int localx = get_local_id(0);
593 const int globalStartRow = floor((get_group_id(1) * groupHeight) / yscale);
594 const int globalRowCount = ceil(groupHeight / yscale) + 2 * ypad;
596 float4 weights = xweights[x];
597 int4 woffs = floor(x / xscale);
598 woffs += (int4)(-1, 0, 1, 2);
599 woffs = clamp(woffs, 0, (srcWidth >> ((z == 0) ? 0 : 1)) - 1);
600 const int maxy = (srcHeight >> ((z == 0) ? 0 : 1)) - 1;
602 // Scale x from global into LDS
604 for (int i = 0; i <= globalRowCount; ++i) {
605 int4 offs = srcPlaneOffset + clamp(globalStartRow - ypad + i, 0, maxy) * srcRowWords;
606 offs += woffs;
607 pixels[localx + i * localRowPixels] = convert_uchar(clamp(round(dot(weights,
608 (float4)(src[offs.x], src[offs.y], src[offs.z], src[offs.w]))), 0.0f, 255.0f));
611 barrier(CLK_LOCAL_MEM_FENCE);
613 // Scale y from LDS into global
615 if (x >= dstWidth >> ((z == 0) ? 0 : 1))
616 return;
618 int off = dstPlaneOffset + x + (get_group_id(1) * groupHeight) * dstRowWords;
620 for (int i = 0; i < groupHeight; ++i) {
621 if (y >= dstHeight >> ((z == 0) ? 0 : 1))
622 break;
623 int localy = floor((get_group_id(1) * groupHeight + i) / yscale);
624 localy = localy - globalStartRow + ypad;
625 int loff = localx + localy * localRowPixels;
626 dst[off] = convert_uchar(clamp(round(dot(yweights[get_group_id(1) * groupHeight + i],
627 (float4)(pixels[loff - localRowPixels], pixels[loff], pixels[loff + localRowPixels]
628 , pixels[loff + localRowPixels * 2]))), 0.0f, 255.0f));
629 off += dstRowWords;
635 char *kernel_src_yadif_filter = KERNEL(
636 void filter_v6(
637 global unsigned char *dst,
638 global unsigned char *prev,
639 global unsigned char *cur,
640 global unsigned char *next,
641 int x,
642 int y,
643 int width,
644 int height,
645 int parity,
646 int inlinesize,
647 int outlinesize,
648 int inmode,
649 int uvflag
653 int flag = uvflag * (y >=height) * height;
654 int prefs = select(-(inlinesize), inlinesize,((y+1) - flag) <height);
655 int mrefs = select(inlinesize, -(inlinesize),y - flag);
656 int mode = select(inmode,2,(y - flag==1) || (y - flag + 2==height));
657 int score;
659 global unsigned char *prev2 = parity ? prev : cur ;
660 global unsigned char *next2 = parity ? cur : next;
661 int index = x + y * inlinesize;
662 int outindex = x + y * outlinesize;
663 int c = cur[index + mrefs];
664 int d = (prev2[index] + next2[index])>>1;
665 int e = cur[index + prefs];
666 int temporal_diff0 = abs((prev2[index]) - (next2[index]));
667 int temporal_diff1 =(abs(prev[index + mrefs] - c) + abs(prev[index + prefs] - e) )>>1;
668 int temporal_diff2 =(abs(next[index + mrefs] - c) + abs(next[index + prefs] - e) )>>1;
669 int diff = max(max(temporal_diff0>>1, temporal_diff1), temporal_diff2);
670 int spatial_pred = (c+e)>>1;
671 int spatial_score = abs(cur[index + mrefs-1] - cur[index + prefs-1]) + abs(c-e) + abs(cur[index + mrefs+1] - cur[index + prefs+1]) - 1;
672 //check -1
673 score = abs(cur[index + mrefs-2] - cur[index + prefs])
674 + abs(cur[index + mrefs-1] - cur[index + prefs+1])
675 + abs(cur[index + mrefs] - cur[index + prefs+2]);
676 if (score < spatial_score)
678 spatial_score= score;
679 spatial_pred= (cur[index + mrefs-1] + cur[index + prefs+1])>>1;
681 //check -2
682 score = abs(cur[index + mrefs-3] - cur[index + prefs+1])
683 + abs(cur[index + mrefs-2] - cur[index + prefs+2])
684 + abs(cur[index + mrefs-1] - cur[index + prefs+3]);
685 if (score < spatial_score)
687 spatial_score= score;
688 spatial_pred= (cur[index + mrefs-2] + cur[index + prefs+2])>>1;
690 //check 1
691 score = abs(cur[index + mrefs] - cur[index + prefs-2])
692 + abs(cur[index + mrefs+1] - cur[index + prefs-1])
693 + abs(cur[index + mrefs+2] - cur[index + prefs]);
694 if (score < spatial_score)
696 spatial_score= score;
697 spatial_pred= (cur[index + mrefs+1] + cur[index + prefs-1])>>1;
699 //check 2
700 score = abs(cur[index + mrefs+1] - cur[index + prefs-3])
701 + abs(cur[index + mrefs+2] - cur[index + prefs-2])
702 + abs(cur[index + mrefs+3] - cur[index + prefs-1]);
703 if (score < spatial_score)
705 spatial_score= score;
706 spatial_pred= (cur[index + mrefs+2] + cur[index + prefs-2])>>1;
708 if (mode < 2)
710 int b = (prev2[index + (mrefs<<1)] + next2[index + (mrefs<<1)])>>1;
711 int f = (prev2[index + (prefs<<1)] + next2[index + (prefs<<1)])>>1;
712 int diffmax = max(max(d-e, d-c), min(b-c, f-e));
713 int diffmin = min(min(d-e, d-c), max(b-c, f-e));
715 diff = max(max(diff, diffmin), -diffmax);
717 if (spatial_pred > d + diff)
719 spatial_pred = d + diff;
721 else if (spatial_pred < d - diff)
723 spatial_pred = d - diff;
726 dst[outindex] = spatial_pred;
729 kernel void yadif_filter(
730 global unsigned char *dst,
731 global unsigned char *prev,
732 global unsigned char *cur,
733 global unsigned char *next,
734 int parity,
735 int inlinesizeY,
736 int inlinesizeUV,
737 int outlinesizeY,
738 int outlinesizeUV,
739 int mode)
741 int x=get_global_id(0);
742 int y=(get_global_id(1)<<1) + (!parity);
743 int width=(get_global_size(0)<<1)/3;
744 int height=get_global_size(1)<<1;
747 global unsigned char *dst_Y=dst;
748 global unsigned char *dst_U=dst_Y+height*outlinesizeY;
750 global unsigned char *prev_Y=prev;
751 global unsigned char *prev_U=prev_Y+height*inlinesizeY;
753 global unsigned char *cur_Y=cur;
754 global unsigned char *cur_U=cur_Y+height*inlinesizeY;
756 global unsigned char *next_Y=next;
757 global unsigned char *next_U=next_Y+height*inlinesizeY;
759 if(x < width)
761 filter_v6(dst_Y,prev_Y,cur_Y,next_Y,x,y,width,height,parity,inlinesizeY,outlinesizeY,mode,0);
763 else
765 x = x - width;
766 filter_v6(dst_U,prev_U,cur_U,next_U,x,y,width>>1,height>>1,parity,inlinesizeUV,outlinesizeUV,mode,1);
771 #endif