3 Copyright (c) 2003-2015 HandBrake Team
4 This file is part of the HandBrake source code
5 Homepage: <http://handbrake.fr/>.
6 It may be used under the terms of the GNU General Public License v2.
7 For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html
9 Authors: Peng Gao <peng@multicorewareinc.com> <http://www.multicorewareinc.com/>
10 Li Cao <li@multicorewareinc.com> <http://www.multicorewareinc.com/>
14 #ifndef USE_EXTERNAL_KERNEL
16 #define KERNEL( ... )# __VA_ARGS__
19 char *kernel_src_hscale
= KERNEL (
21 typedef unsigned char fixed8
;
23 /*******************************************************************************************************
24 dst: Horizontal scale destination;
25 src: YUV content in opencl buf;
26 hf_Y: Horizontal filter coefficients for Y planes;
27 hf_UV: Horizontal filter coefficients for UV planes;
28 hi_Y: Horizontal filter index for Y planes;
29 hi_UV: Horizontal filter index for UV planes;
31 filter_len: Length of filter;
32 ********************************************************************************************************/
33 kernel
void frame_h_scale (
40 int stride
, //src_width
44 int x
= get_global_id( 0 );
45 int y
= get_global_id( 1 );
46 int width
= get_global_size( 0 );
47 int height
= get_global_size( 1 );
48 float result_Y
= 0, result_U
= 0, result_V
= 0;
51 global fixed8
*src_Y
= src
;
52 global fixed8
*src_U
= src_Y
+ stride
* height
;
53 global fixed8
*src_V
= src_U
+ (stride
>> 1) * (height
>> 1);
55 global fixed8
*dst_Y
= dst
;
56 global fixed8
*dst_U
= dst_Y
+ width
* height
;
57 global fixed8
*dst_V
= dst_U
+ (width
>> 1) * (height
>> 1);
59 int xy
= y
* width
+ x
;
60 global fixed8
*rowdata_Y
= src_Y
+ (y
* stride
);
61 for( int i
= 0; i
< filter_len
; i
++ )
63 result_Y
+= ( hf_Y
[x
+ i
* width
] * rowdata_Y
[hi_Y
[x
] + i
]);
67 if( y
< (height
>> 1) && x
< (width
>> 1) )
69 int xy
= y
* (width
>> 1) + x
;
70 global fixed8
*rowdata_U
= src_U
+ (y
* (stride
>> 1));
71 global fixed8
*rowdata_V
= src_V
+ (y
* (stride
>> 1));
72 for( i
= 0; i
< filter_len
; i
++ )
74 result_U
+= ( hf_UV
[x
+ i
* (width
>> 1)] * rowdata_U
[hi_UV
[x
] + i
]);
75 result_V
+= ( hf_UV
[x
+ i
* (width
>> 1)] * rowdata_V
[hi_UV
[x
] + i
]);
83 /*******************************************************************************************************
84 dst: Vertical scale destination;
85 src: YUV content in opencl buf;
86 hf_Y: Vertical filter coefficients for Y planes;
87 hf_UV: Vertical filter coefficients for UV planes;
88 hi_Y: Vertical filter index for Y planes;
89 hi_UV: Vertical filter index for UV planes;
91 filter_len: Length of filter;
92 ********************************************************************************************************/
93 char *kernel_src_vscale
= KERNEL (
95 kernel
void frame_v_scale (
106 int x
= get_global_id( 0 );
107 int y
= get_global_id( 1 );
108 int width
= get_global_size( 0 );
109 int height
= get_global_size( 1 );
110 float result_Y
= 0, result_U
= 0, result_V
= 0;
113 global fixed8
*src_Y
= src
;
114 global fixed8
*src_U
= src_Y
+ src_height
* width
;
115 global fixed8
*src_V
= src_U
+ (src_height
>> 1) * (width
>> 1);
117 global fixed8
*dst_Y
= dst
;
118 global fixed8
*dst_U
= dst_Y
+ height
* width
;
119 global fixed8
*dst_V
= dst_U
+ (height
>> 1) * (width
>> 1);
121 int xy
= y
* width
+ x
;
122 for( i
= 0; i
< filter_len
; i
++ )
124 result_Y
+= vf_Y
[y
+ i
* height
] * src_Y
[(vi_Y
[y
] + i
) * width
+ x
];
126 dst_Y
[xy
] = result_Y
;
128 if( y
< (height
>> 1) && x
< (width
>> 1) )
130 int xy
= y
* (width
>> 1) + x
;
131 for( i
= 0; i
< filter_len
; i
++ )
133 result_U
+= vf_UV
[y
+ i
* (height
>> 1)] * src_U
[(vi_UV
[y
] + i
) * (width
>> 1) + x
];
134 result_V
+= vf_UV
[y
+ i
* (height
>> 1)] * src_V
[(vi_UV
[y
] + i
) * (width
>> 1) + x
];
136 dst_U
[xy
] = result_U
;
137 dst_V
[xy
] = result_V
;
142 /*******************************************************************************************************
144 output: Output buffer;
147 ********************************************************************************************************/
148 char *kernel_src_nvtoyuv
= KERNEL (
150 kernel
void nv12toyuv ( global
char *input
, global
char* output
, int w
, int h
)
152 int x
= get_global_id( 0 );
153 int y
= get_global_id( 1 );
154 int idx
= y
* (w
>> 1) + x
;
155 vstore4((vload4( 0, input
+ (idx
<< 2))), 0, output
+ (idx
<< 2)); //Y
156 char2 uv
= vload2( 0, input
+ (idx
<< 1) + w
* h
);
157 output
[idx
+ w
* h
] = uv
.s0
;
158 output
[idx
+ w
* h
+ ((w
* h
) >> 2)] = uv
.s1
;
162 /*******************************************************************************************************
163 dst: Horizontal scale destination;
164 src: YUV content in opencl buf;
165 yfilter: Opencl memory of horizontal filter coefficients for luma/alpha planes;
166 yfilterPos: Opencl memory of horizontal filter starting positions for each dst[i] for luma/alpha planes;
167 yfilterSize: Horizontal filter size for luma/alpha pixels;
168 cfilter: Opencl memory of horizontal filter coefficients for chroma planes;
169 cfilterPos: Opencl memory of horizontal filter starting positions for each dst[i] for chroma planes;
170 cfilterSize: Horizontal filter size for chroma pixels;
171 dstStride: Width of destination luma/alpha planes;
172 dstChrStride: Width of destination chroma planes;
173 ********************************************************************************************************/
175 char *kernel_src_hscaleall
= KERNEL (
177 kernel
void hscale_all_opencl (
179 const global
unsigned char *src
,
180 const global
short *yfilter
,
181 const global
int *yfilterPos
,
183 const global
short *cfilter
,
184 const global
int *cfilterPos
,
195 int w
= get_global_id(0);
196 int h
= get_global_id(1);
198 int chrWidth
= get_global_size(0);
199 int chrHeight
= get_global_size(1);
201 int srcPos1
= h
* srcStride
+ yfilterPos
[w
];
202 int srcPos2
= h
* srcStride
+ yfilterPos
[w
+ chrWidth
];
203 int srcPos3
= (h
+ (srcHeight
>> 1)) * srcStride
+ yfilterPos
[w
];
204 int srcPos4
= (h
+ (srcHeight
>> 1)) * srcStride
+ yfilterPos
[w
+ chrWidth
];
205 int srcc1Pos
= srcStride
* srcHeight
+ (h
) * (srcChrStride
) + cfilterPos
[w
];
206 int srcc2Pos
= srcc1Pos
+ ((srcChrStride
)*(chrHeight
));
215 int filterPos1
= yfilterSize
* w
;
216 int filterPos2
= yfilterSize
* (w
+ chrWidth
);
217 int cfilterPos1
= cfilterSize
* w
;
220 for (j
= 0; j
< yfilterSize
; j
++)
222 val1
+= src
[srcPos1
+ j
] * yfilter
[filterPos1
+ j
];
223 val2
+= src
[srcPos2
+ j
] * yfilter
[filterPos2
+ j
];
224 val3
+= src
[srcPos3
+ j
] * yfilter
[filterPos1
+ j
];
225 val4
+= src
[srcPos4
+ j
] * yfilter
[filterPos2
+ j
];
226 val5
+= src
[srcc1Pos
+j
] * cfilter
[cfilterPos1
+ j
];
227 val6
+= src
[srcc2Pos
+j
] * cfilter
[cfilterPos1
+ j
];
229 int dstPos1
= h
*dstStride
;
230 int dstPos2
= (h
+ chrHeight
) * dstStride
;
232 dst
[dstPos1
+ w
] = ((val1
>> 7) > ((1 << 15) - 1) ? ((1 << 15) - 1) : (val1
>> 7));
233 dst
[dstPos1
+ w
+ chrWidth
] = ((val2
>> 7) > ((1 << 15) - 1) ? ((1 << 15) - 1) : (val2
>> 7));
234 dst
[dstPos2
+ w
] = ((val3
>> 7) > ((1 << 15) - 1) ? ((1 << 15) - 1) : (val3
>> 7));
235 dst
[dstPos2
+ w
+ chrWidth
] = ((val4
>> 7) > ((1 << 15) - 1) ? ((1 << 15) - 1) : (val4
>> 7));
237 int dstPos3
= h
* (dstChrStride
) + w
+ dstStride
* dstHeight
;
238 int dstPos4
= h
* (dstChrStride
) + w
+ dstStride
* dstHeight
+ ((dstChrStride
) * chrHeight
);
239 dst
[dstPos3
] = ((val5
>> 7) > ((1 << 15) - 1) ? ((1 << 15) - 1) : (val5
>> 7));
240 dst
[dstPos4
] = ((val6
>> 7) > ((1 << 15) - 1) ? ((1 << 15) - 1) : (val6
>> 7));
244 char *kernel_src_hscalefast
= KERNEL (
246 kernel
void hscale_fast_opencl (
248 const global
unsigned char *src
,
261 int w
= get_global_id(0);
262 int h
= get_global_id(1);
264 int chrWidth
= get_global_size(0);
265 int chrHeight
= get_global_size(1);
268 int xx
= xpos1
>> 16;
269 int xalpha
= (xpos1
& 0xFFFF) >> 9;
270 dst
[h
* dstStride
+ w
] = (src
[h
* srcStride
+ xx
] << 7) + (src
[h
* srcStride
+ xx
+ 1] -src
[h
* srcStride
+ xx
]) * xalpha
;
271 int lowpart
= h
+ (chrHeight
);
272 dst
[lowpart
* dstStride
+ w
] = (src
[lowpart
* srcStride
+ xx
] << 7) + (src
[lowpart
* srcStride
+ xx
+ 1] - src
[lowpart
* srcStride
+ xx
]) * xalpha
;
274 int inv_i
= w
* xInc
>> 16;
275 if( inv_i
>= srcWidth
- 1)
277 dst
[h
*dstStride
+ w
] = src
[h
*srcStride
+ srcWidth
-1]*128;
278 dst
[lowpart
*dstStride
+ w
] = src
[lowpart
*srcStride
+ srcWidth
- 1] * 128;
281 int rightpart
= w
+ (chrWidth
);
283 xalpha
= (xpos2
& 0xFFFF) >> 9;
284 dst
[h
* dstStride
+ rightpart
] = (src
[h
*srcStride
+ xx
] << 7) + (src
[h
* srcStride
+ xx
+ 1] - src
[h
* srcStride
+ xx
]) * xalpha
;
285 dst
[lowpart
* dstStride
+ rightpart
] = (src
[lowpart
* srcStride
+ xx
] << 7) + (src
[lowpart
* srcStride
+ xx
+ 1] - src
[lowpart
* srcStride
+ xx
]) * xalpha
;
286 inv_i
= rightpart
* xInc
>> 16;
287 if( inv_i
>= srcWidth
- 1)
289 dst
[h
* dstStride
+ rightpart
] = src
[h
* srcStride
+ srcWidth
- 1] * 128;
290 dst
[lowpart
* dstStride
+ rightpart
] = src
[lowpart
* srcStride
+ srcWidth
- 1] * 128;
296 xalpha
= (xpos
& 0xFFFF) >> 9;
297 src
+= srcStride
* srcHeight
;
298 dst
+= dstStride
* dstHeight
;
299 dst
[h
* (dstChrStride
) + w
] = (src
[h
* (srcChrStride
) + xx
] * (xalpha
^127) + src
[h
* (srcChrStride
) + xx
+ 1] * xalpha
);
300 inv_i
= w
* xInc
>> 16;
301 if( inv_i
>= (srcWidth
>> 1) - 1)
303 dst
[h
* (dstChrStride
) + w
] = src
[h
* (srcChrStride
) + (srcWidth
>> 1) -1]*128;
306 xpos
= chrXInc
* (w
);
308 src
+= srcChrStride
* srcHeight
>> 1;
309 dst
+= (dstChrStride
* chrHeight
);
310 dst
[h
* (dstChrStride
) + w
] = (src
[h
* (srcChrStride
) + xx
] * (xalpha
^127) + src
[h
* (srcChrStride
) + xx
+ 1 ] * xalpha
);
312 if( inv_i
>= (srcWidth
>> 1) - 1)
315 dst
[h
* (dstChrStride
) + w
] = src
[h
* (srcChrStride
) + (srcWidth
>> 1) -1] * 128;
320 char *kernel_src_vscalealldither
= KERNEL (
322 kernel
void vscale_all_dither_opencl (
323 global
unsigned char *dst
,
324 const global
short *src
,
325 const global
short *yfilter
,
327 const global
short *cfilter
,
329 const global
int *yfilterPos
,
330 const global
int *cfilterPos
,
340 const unsigned char hb_dither_8x8_128
[8][8] = {
341 { 36, 68, 60, 92, 34, 66, 58, 90, },
342 { 100, 4, 124, 28, 98, 2, 122, 26, },
343 { 52, 84, 44, 76, 50, 82, 42, 74, },
344 { 116, 20, 108, 12, 114, 18, 106, 10, },
345 { 32, 64, 56, 88, 38, 70, 62, 94, },
346 { 96, 0, 120, 24, 102, 6, 126, 30, },
347 { 48, 80, 40, 72, 54, 86, 46, 78, },
348 { 112, 16, 104, 8, 118, 22, 110, 14, },
352 int w
= get_global_id(0);
353 int h
= get_global_id(1);
355 int chrWidth
= get_global_size(0);
356 int chrHeight
= get_global_size(1);
357 const unsigned char *local_up_dither
;
358 const unsigned char *local_down_dither
;
360 local_up_dither
= hb_dither_8x8_128
[h
& 7];
361 local_down_dither
= hb_dither_8x8_128
[(h
+ chrHeight
) & 7];
364 int srcPos1
= (yfilterPos
[h
]) * srcStride
+ w
;
365 int srcPos2
= (yfilterPos
[h
]) * srcStride
+ w
+ (chrWidth
);
366 int srcPos3
= (yfilterPos
[h
+ chrHeight
]) * srcStride
+ w
;
367 int srcPos4
= (yfilterPos
[h
+ chrHeight
]) * srcStride
+ w
+ chrWidth
;
368 int src1Pos
= dstStride
* srcHeight
+ (cfilterPos
[h
]) * dstChrStride
+ (w
);
369 int src2Pos
= dstStride
* srcHeight
+ (dstChrStride
*(srcHeight
>>1)) + (cfilterPos
[h
]) * dstChrStride
+ w
;
371 int val1
= (local_up_dither
[w
& 7] << 12); //y offset is 0;
372 int val2
= (local_up_dither
[(w
+ chrWidth
) & 7] << 12);
373 int val3
= (local_down_dither
[w
&7] << 12);
374 int val4
= (local_down_dither
[(w
+ chrWidth
) & 7] << 12);
375 int val5
= (local_up_dither
[w
& 7] << 12);
376 int val6
= (local_up_dither
[(w
+ 3) & 7] << 12); // 3 is offset of the chrome channel.
379 int filterPos1
= h
* yfilterSize
;
380 int filterPos2
= ( h
+ chrHeight
) * yfilterSize
;
381 for(j
= 0; j
< yfilterSize
; j
++)
383 val1
+= src
[srcPos1
] * yfilter
[filterPos1
+ j
];
384 srcPos1
+= srcStride
;
385 val2
+= src
[srcPos2
] * yfilter
[filterPos1
+ j
];
386 srcPos2
+= srcStride
;
387 val3
+= src
[srcPos3
] * yfilter
[filterPos2
+ j
];
388 srcPos3
+= srcStride
;
389 val4
+= src
[srcPos4
] * yfilter
[filterPos2
+ j
];
390 srcPos4
+= srcStride
;
391 val5
+= src
[src1Pos
] * cfilter
[filterPos1
+ j
];
392 val6
+= src
[src2Pos
] * cfilter
[filterPos1
+ j
];
393 src1Pos
+= dstChrStride
;
394 src2Pos
+= dstChrStride
;
396 dst
[h
* dstStride
+ w
] = (((val1
>> 19)&(~0xFF)) ? ((-(val1
>> 19)) >> 31) : (val1
>> 19));
397 dst
[h
* dstStride
+ w
+ chrWidth
] = (((val2
>> 19)&(~0xFF)) ? ((-(val2
>> 19)) >> 31) : (val2
>> 19));
398 dst
[(h
+ chrHeight
) * dstStride
+ w
] = (((val3
>> 19)&(~0xFF)) ? ((-(val3
>> 19)) >> 31) : (val3
>> 19));
399 dst
[(h
+ chrHeight
) * dstStride
+ w
+ chrWidth
] = (((val4
>> 19)&(~0xFF)) ? ((-(val4
>> 19)) >> 31) : (val4
>> 19));
401 int dst1Pos
= dstStride
* dstHeight
+ h
*(dstChrStride
)+(w
);
402 int dst2Pos
= (dstChrStride
* chrHeight
) + dst1Pos
;
403 dst
[dst1Pos
] = (((val5
>> 19)&(~0xFF)) ? ((-(val5
>> 19)) >> 31) : (val5
>> 19));
404 dst
[dst2Pos
] = (((val6
>> 19)&(~0xFF)) ? ((-(val6
>> 19)) >> 31) : (val6
>> 19));
408 char *kernel_src_vscaleallnodither
= KERNEL (
410 kernel
void vscale_all_nodither_opencl (
411 global
unsigned char *dst
,
412 const global
short *src
,
413 const global
short *yfilter
,
415 const global
short *cfilter
,
417 const global
int *yfilterPos
,
418 const global
int *cfilterPos
,
428 const unsigned char hb_sws_pb_64
[8] = {
429 64, 64, 64, 64, 64, 64, 64, 64
432 int w
= get_global_id(0);
433 int h
= get_global_id(1);
435 int chrWidth
= get_global_size(0);
436 int chrHeight
= get_global_size(1);
437 const unsigned char *local_up_dither
;
438 const unsigned char *local_down_dither
;
440 local_up_dither
= hb_sws_pb_64
;
441 local_down_dither
= hb_sws_pb_64
;
445 int srcPos1
= (yfilterPos
[h
]) * srcStride
+ w
;
446 int srcPos2
= (yfilterPos
[h
]) * srcStride
+ w
+ (chrWidth
);
447 int srcPos3
= (yfilterPos
[h
+ chrHeight
]) * srcStride
+ w
;
448 int srcPos4
= (yfilterPos
[h
+ chrHeight
]) * srcStride
+ w
+ chrWidth
;
449 int src1Pos
= dstStride
* srcHeight
+ (cfilterPos
[h
]) * dstChrStride
+ (w
);
450 int src2Pos
= dstStride
* srcHeight
+ (dstChrStride
*(srcHeight
>>1)) + (cfilterPos
[h
]) * dstChrStride
+ w
;
452 int val1
= (local_up_dither
[w
& 7] << 12); //y offset is 0;
453 int val2
= (local_up_dither
[(w
+ chrWidth
) & 7] << 12);
454 int val3
= (local_down_dither
[w
&7] << 12);
455 int val4
= (local_down_dither
[(w
+ chrWidth
) & 7] << 12);
456 int val5
= (local_up_dither
[w
& 7] << 12);
457 int val6
= (local_up_dither
[(w
+ 3) & 7] << 12); // 3 is offset of the chrome channel.
461 int filterPos1
= h
* yfilterSize
;
462 int filterPos2
= ( h
+ chrHeight
) * yfilterSize
;
463 for(j
= 0; j
< yfilterSize
; j
++)
465 val1
+= src
[srcPos1
] * yfilter
[filterPos1
+ j
];
466 srcPos1
+= srcStride
;
467 val2
+= src
[srcPos2
] * yfilter
[filterPos1
+ j
];
468 srcPos2
+= srcStride
;
469 val3
+= src
[srcPos3
] * yfilter
[filterPos2
+ j
];
470 srcPos3
+= srcStride
;
471 val4
+= src
[srcPos4
] * yfilter
[filterPos2
+ j
];
472 srcPos4
+= srcStride
;
473 val5
+= src
[src1Pos
] * cfilter
[filterPos1
+ j
];
474 val6
+= src
[src2Pos
] * cfilter
[filterPos1
+ j
];
475 src1Pos
+= dstChrStride
;
476 src2Pos
+= dstChrStride
;
478 dst
[h
* dstStride
+ w
] = (((val1
>> 19)&(~0xFF)) ? ((-(val1
>> 19)) >> 31) : (val1
>> 19));
479 dst
[h
* dstStride
+ w
+ chrWidth
] = (((val2
>> 19)&(~0xFF)) ? ((-(val2
>> 19)) >> 31) : (val2
>> 19));
480 dst
[(h
+ chrHeight
) * dstStride
+ w
] = (((val3
>> 19)&(~0xFF)) ? ((-(val3
>> 19)) >> 31) : (val3
>> 19));
481 dst
[(h
+ chrHeight
) * dstStride
+ w
+ chrWidth
] = (((val4
>> 19)&(~0xFF)) ? ((-(val4
>> 19)) >> 31) : (val4
>> 19));;
483 int dst1Pos
= dstStride
* dstHeight
+ h
* (dstChrStride
) + (w
);
484 int dst2Pos
= (dstChrStride
* chrHeight
) + dst1Pos
;
485 dst
[dst1Pos
] = (((val5
>> 19)&(~0xFF)) ? ((-(val5
>> 19)) >> 31) : (val5
>> 19));
486 dst
[dst2Pos
] = (((val6
>> 19)&(~0xFF)) ? ((-(val6
>> 19)) >> 31) : (val6
>> 19));
490 char *kernel_src_vscalefast
= KERNEL (
492 kernel
void vscale_fast_opencl (
493 global
unsigned char *dst
,
494 const global
short *src
,
495 const global
int *yfilterPos
,
496 const global
int *cfilterPos
,
506 const unsigned char hb_sws_pb_64
[8] = {
507 64, 64, 64, 64, 64, 64, 64, 64
510 int w
= get_global_id(0);
511 int h
= get_global_id(1);
513 int chrWidth
= get_global_size(0);
514 int chrHeight
= get_global_size(1);
516 const unsigned char *local_up_dither
;
517 const unsigned char *local_down_dither
;
519 local_up_dither
= hb_sws_pb_64
;
520 local_down_dither
= hb_sws_pb_64
;
523 int rightpart
= w
+ chrWidth
;
524 int bh
= h
+ chrHeight
; // bottom part
525 short val1
= (src
[(yfilterPos
[h
]) * dstStride
+ w
] + local_up_dither
[(w
+ 0) & 7]) >> 7; //lum offset is 0;
526 short val2
= (src
[(yfilterPos
[h
]) * dstStride
+ rightpart
] + local_up_dither
[rightpart
& 7]) >> 7;
527 short val3
= (src
[(yfilterPos
[bh
]) * dstStride
+ w
] + local_down_dither
[w
& 7]) >> 7;
528 short val4
= (src
[(yfilterPos
[bh
]) * dstStride
+ rightpart
] + local_down_dither
[rightpart
& 7]) >> 7;
529 dst
[h
* dstStride
+ w
] = ((val1
&(~0xFF)) ? ((-val1
) >> 31) : (val1
));
530 dst
[h
* dstStride
+ rightpart
] = ((val2
&(~0xFF)) ? ((-val2
) >> 31) : (val2
));
531 dst
[bh
* dstStride
+ w
] = ((val3
&(~0xFF)) ? ((-val3
) >> 31) : (val3
));
532 dst
[bh
* dstStride
+ rightpart
] = ((val4
&(~0xFF)) ? ((-val4
) >> 31) : (val4
));
534 src
+= dstStride
* srcHeight
;
535 dst
+= dstStride
* dstHeight
;
536 val1
= (src
[cfilterPos
[h
] * (dstChrStride
) + w
] + local_up_dither
[ w
& 7]) >> 7;
537 dst
[h
* (dstChrStride
) + w
] = ((val1
&(~0xFF)) ? ((-val1
) >> 31) : (val1
));
539 src
+= dstChrStride
* (srcHeight
>> 1);
540 dst
+= dstChrStride
* chrHeight
;
541 val1
= (src
[cfilterPos
[h
] * dstChrStride
+ w
] + local_up_dither
[ (w
+ 3) & 7] ) >> 7;
542 dst
[h
* dstChrStride
+ w
] = ((val1
&(~0xFF)) ? ((-val1
) >> 31) : (val1
));
547 char *kernel_src_scale
= KERNEL (
549 __kernel
__attribute__((reqd_work_group_size(64, 1, 1))) void frame_scale(__global uchar
*dst
,
550 __global
const uchar
*src
,
553 const int srcPlaneOffset0
,
554 const int srcPlaneOffset1
,
555 const int srcPlaneOffset2
,
556 const int dstPlaneOffset0
,
557 const int dstPlaneOffset1
,
558 const int dstPlaneOffset2
,
559 const int srcRowWords0
,
560 const int srcRowWords1
,
561 const int srcRowWords2
,
562 const int dstRowWords0
,
563 const int dstRowWords1
,
564 const int dstRowWords2
,
569 __global
const float4
* restrict xweights
,
570 __global
const float4
* restrict yweights
573 const int x
= get_global_id(0);
574 const int y
= get_global_id(1);
575 const int z
= get_global_id(2);
577 // Abort work items outside the dst image bounds.
579 if ((get_group_id(0) * 64 >= (dstWidth
>> ((z
== 0) ? 0 : 1))) || (get_group_id(1) * 16 >= (dstHeight
>> ((z
== 0) ? 0 : 1))))
582 const int srcPlaneOffset
= (z
== 0) ? srcPlaneOffset0
: ((z
== 1) ? srcPlaneOffset1
: srcPlaneOffset2
);
583 const int dstPlaneOffset
= (z
== 0) ? dstPlaneOffset0
: ((z
== 1) ? dstPlaneOffset1
: dstPlaneOffset2
);
584 const int srcRowWords
= (z
== 0) ? srcRowWords0
: ((z
== 1) ? srcRowWords1
: srcRowWords2
);
585 const int dstRowWords
= (z
== 0) ? dstRowWords0
: ((z
== 1) ? dstRowWords1
: dstRowWords2
);
587 __local uchar pixels
[64 * 36];
588 const int localRowPixels
= 64;
589 const int groupHeight
= 16; // src pixel height output by the workgroup
591 const int localx
= get_local_id(0);
593 const int globalStartRow
= floor((get_group_id(1) * groupHeight
) / yscale
);
594 const int globalRowCount
= ceil(groupHeight
/ yscale
) + 2 * ypad
;
596 float4 weights
= xweights
[x
];
597 int4 woffs
= floor(x
/ xscale
);
598 woffs
+= (int4
)(-1, 0, 1, 2);
599 woffs
= clamp(woffs
, 0, (srcWidth
>> ((z
== 0) ? 0 : 1)) - 1);
600 const int maxy
= (srcHeight
>> ((z
== 0) ? 0 : 1)) - 1;
602 // Scale x from global into LDS
604 for (int i
= 0; i
<= globalRowCount
; ++i
) {
605 int4 offs
= srcPlaneOffset
+ clamp(globalStartRow
- ypad
+ i
, 0, maxy
) * srcRowWords
;
607 pixels
[localx
+ i
* localRowPixels
] = convert_uchar(clamp(round(dot(weights
,
608 (float4
)(src
[offs
.x
], src
[offs
.y
], src
[offs
.z
], src
[offs
.w
]))), 0.0f
, 255.0f
));
611 barrier(CLK_LOCAL_MEM_FENCE
);
613 // Scale y from LDS into global
615 if (x
>= dstWidth
>> ((z
== 0) ? 0 : 1))
618 int off
= dstPlaneOffset
+ x
+ (get_group_id(1) * groupHeight
) * dstRowWords
;
620 for (int i
= 0; i
< groupHeight
; ++i
) {
621 if (y
>= dstHeight
>> ((z
== 0) ? 0 : 1))
623 int localy
= floor((get_group_id(1) * groupHeight
+ i
) / yscale
);
624 localy
= localy
- globalStartRow
+ ypad
;
625 int loff
= localx
+ localy
* localRowPixels
;
626 dst
[off
] = convert_uchar(clamp(round(dot(yweights
[get_group_id(1) * groupHeight
+ i
],
627 (float4
)(pixels
[loff
- localRowPixels
], pixels
[loff
], pixels
[loff
+ localRowPixels
]
628 , pixels
[loff
+ localRowPixels
* 2]))), 0.0f
, 255.0f
));
635 char *kernel_src_yadif_filter
= KERNEL(
637 global
unsigned char *dst
,
638 global
unsigned char *prev
,
639 global
unsigned char *cur
,
640 global
unsigned char *next
,
653 int flag
= uvflag
* (y
>=height
) * height
;
654 int prefs
= select(-(inlinesize
), inlinesize
,((y
+1) - flag
) <height
);
655 int mrefs
= select(inlinesize
, -(inlinesize
),y
- flag
);
656 int mode
= select(inmode
,2,(y
- flag
==1) || (y
- flag
+ 2==height
));
659 global
unsigned char *prev2
= parity
? prev
: cur
;
660 global
unsigned char *next2
= parity
? cur
: next
;
661 int index
= x
+ y
* inlinesize
;
662 int outindex
= x
+ y
* outlinesize
;
663 int c
= cur
[index
+ mrefs
];
664 int d
= (prev2
[index
] + next2
[index
])>>1;
665 int e
= cur
[index
+ prefs
];
666 int temporal_diff0
= abs((prev2
[index
]) - (next2
[index
]));
667 int temporal_diff1
=(abs(prev
[index
+ mrefs
] - c
) + abs(prev
[index
+ prefs
] - e
) )>>1;
668 int temporal_diff2
=(abs(next
[index
+ mrefs
] - c
) + abs(next
[index
+ prefs
] - e
) )>>1;
669 int diff
= max(max(temporal_diff0
>>1, temporal_diff1
), temporal_diff2
);
670 int spatial_pred
= (c
+e
)>>1;
671 int spatial_score
= abs(cur
[index
+ mrefs
-1] - cur
[index
+ prefs
-1]) + abs(c
-e
) + abs(cur
[index
+ mrefs
+1] - cur
[index
+ prefs
+1]) - 1;
673 score
= abs(cur
[index
+ mrefs
-2] - cur
[index
+ prefs
])
674 + abs(cur
[index
+ mrefs
-1] - cur
[index
+ prefs
+1])
675 + abs(cur
[index
+ mrefs
] - cur
[index
+ prefs
+2]);
676 if (score
< spatial_score
)
678 spatial_score
= score
;
679 spatial_pred
= (cur
[index
+ mrefs
-1] + cur
[index
+ prefs
+1])>>1;
682 score
= abs(cur
[index
+ mrefs
-3] - cur
[index
+ prefs
+1])
683 + abs(cur
[index
+ mrefs
-2] - cur
[index
+ prefs
+2])
684 + abs(cur
[index
+ mrefs
-1] - cur
[index
+ prefs
+3]);
685 if (score
< spatial_score
)
687 spatial_score
= score
;
688 spatial_pred
= (cur
[index
+ mrefs
-2] + cur
[index
+ prefs
+2])>>1;
691 score
= abs(cur
[index
+ mrefs
] - cur
[index
+ prefs
-2])
692 + abs(cur
[index
+ mrefs
+1] - cur
[index
+ prefs
-1])
693 + abs(cur
[index
+ mrefs
+2] - cur
[index
+ prefs
]);
694 if (score
< spatial_score
)
696 spatial_score
= score
;
697 spatial_pred
= (cur
[index
+ mrefs
+1] + cur
[index
+ prefs
-1])>>1;
700 score
= abs(cur
[index
+ mrefs
+1] - cur
[index
+ prefs
-3])
701 + abs(cur
[index
+ mrefs
+2] - cur
[index
+ prefs
-2])
702 + abs(cur
[index
+ mrefs
+3] - cur
[index
+ prefs
-1]);
703 if (score
< spatial_score
)
705 spatial_score
= score
;
706 spatial_pred
= (cur
[index
+ mrefs
+2] + cur
[index
+ prefs
-2])>>1;
710 int b
= (prev2
[index
+ (mrefs
<<1)] + next2
[index
+ (mrefs
<<1)])>>1;
711 int f
= (prev2
[index
+ (prefs
<<1)] + next2
[index
+ (prefs
<<1)])>>1;
712 int diffmax
= max(max(d
-e
, d
-c
), min(b
-c
, f
-e
));
713 int diffmin
= min(min(d
-e
, d
-c
), max(b
-c
, f
-e
));
715 diff
= max(max(diff
, diffmin
), -diffmax
);
717 if (spatial_pred
> d
+ diff
)
719 spatial_pred
= d
+ diff
;
721 else if (spatial_pred
< d
- diff
)
723 spatial_pred
= d
- diff
;
726 dst
[outindex
] = spatial_pred
;
729 kernel
void yadif_filter(
730 global
unsigned char *dst
,
731 global
unsigned char *prev
,
732 global
unsigned char *cur
,
733 global
unsigned char *next
,
741 int x
=get_global_id(0);
742 int y
=(get_global_id(1)<<1) + (!parity
);
743 int width
=(get_global_size(0)<<1)/3;
744 int height
=get_global_size(1)<<1;
747 global
unsigned char *dst_Y
=dst
;
748 global
unsigned char *dst_U
=dst_Y
+height
*outlinesizeY
;
750 global
unsigned char *prev_Y
=prev
;
751 global
unsigned char *prev_U
=prev_Y
+height
*inlinesizeY
;
753 global
unsigned char *cur_Y
=cur
;
754 global
unsigned char *cur_U
=cur_Y
+height
*inlinesizeY
;
756 global
unsigned char *next_Y
=next
;
757 global
unsigned char *next_U
=next_Y
+height
*inlinesizeY
;
761 filter_v6(dst_Y
,prev_Y
,cur_Y
,next_Y
,x
,y
,width
,height
,parity
,inlinesizeY
,outlinesizeY
,mode
,0);
766 filter_v6(dst_U
,prev_U
,cur_U
,next_U
,x
,y
,width
>>1,height
>>1,parity
,inlinesizeUV
,outlinesizeUV
,mode
,1);