Blur Optimization. [Part 3]
[xy_vsfilter.git] / src / subtitles / xy_filter.cpp
blob b6c8e52326a898ae6f995325462e10d9aa15cc74
#include "stdafx.h"

typedef const UINT8 CUINT8, *PCUINT8;
/****
 * dst memory layout:
 * 1. Source content starts from @dst;
 * 2. Output content starts from @dst-@mwidth (@mwidth = @filter_width);
 *
 *   |------------------------- stride --------------------------|
 *   | <- @dst-@mwidth --------- | <- @dst+0 ------------------- |
 *   |                           |                               |
 *   |          margin           |    src width*height items     |
 *   |    mwidth*height items    |                               |
 *   |   do NOT need to init     |                               |
 *   |   when input              |                               |
 *   |------------------------------------------------------------|
 **/
void xy_filter_c(float *dst, int width, int height, int stride, const float *filter, int filter_width)
{
    ASSERT( stride>=4*(width+filter_width) );

    int xx_fix = width > filter_width ? 0 : filter_width - width;
    const float *filter_start = filter;

    BYTE* end = reinterpret_cast<BYTE*>(dst) + height*stride;
    BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
    for( ; dst_byte<end; dst_byte+=stride )
    {
        float *dst_f = reinterpret_cast<float*>(dst_byte);
        float *dst2 = dst_f - filter_width;
        float *dst_endr = dst_f + width;
        float *dst_end0 = dst_endr - filter_width;
        float *dst_endl = dst_f - xx_fix;
        ASSERT(xx_fix==0 || dst_end0==dst_endl);

        ASSERT(filter_start == filter);
        filter_start += filter_width;
        const float *filter_end = filter_start;

        for (;dst2<dst_endl;dst2++, filter_start--)//left margin
        {
            const float *src = dst_f;
            float sum = 0;
            for(const float* f=filter_start;f<filter_end;f++, src++)
            {
                sum += src[0] * f[0];
            }
            *dst2 = sum;
        }
        for (;dst2<dst_f;dst2++, filter_start--, filter_end--)//if width < filter_width
        {
            const float *src = dst_f;
            float sum = 0;
            for(const float* f=filter_start;f<filter_end;f++, src++)
            {
                sum += src[0] * f[0];
            }
            *dst2 = sum;
        }
        ASSERT(filter_start==filter);
        for (;dst2<dst_end0;dst2++)//center: full filter window
        {
            const float *src = dst2;
            float sum = 0;
            for(const float* f=filter_start;f<filter_end;f++, src++)
            {
                sum += src[0] * f[0];
            }
            *dst2 = sum;
        }
        for (;dst2<dst_endr;dst2++, filter_end--)//right margin
        {
            const float *src = dst2;
            float sum = 0;
            for(const float* f=filter;f<filter_end;f++, src++)
            {
                sum += src[0] * f[0];
            }
            *dst2 = sum;
        }
    }
}
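
The scalar filter runs in place: each row is both input and output, and the widened result (width + filter_width samples) lands at dst - filter_width, per the layout comment above. A minimal sketch of one row with a hypothetical 3-tap kernel (the demo function, names, and values are illustrative, not part of the original file):

/* Illustrative only: one 4-sample row, 3-tap kernel.
 * Expected output in buff[0..6]: 0, 0.25, 0.75, 1.0, 1.0, 0.75, 0.25 */
void xy_filter_c_demo()
{
    const int width = 4, filter_width = 3;
    const float filter[filter_width] = {0.25f, 0.5f, 0.25f};
    float buff[filter_width + width + filter_width] = {0};
    float *dst = buff + filter_width;   // source samples start at @dst
    dst[0] = dst[1] = dst[2] = dst[3] = 1.0f;
    // one row; the stride (in bytes) covers the whole buffer
    xy_filter_c(dst, width, 1, sizeof(buff), filter, filter_width);
}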
/****
 * An inline function sometimes generates poor code, so a macro is used instead.
 *
 * @src4, @src_5_8, @f4, @sum: __m128
 * @src_5_8, @f4: const
 * @sum: output
 * @src4: undefined after the macro runs
 **/
#define XY_FILTER_4(src4, src_5_8, f4, sum) \
    __m128 f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(0,0,0,0));\
    f4_1 = _mm_mul_ps(f4_1, src4);\
    sum = _mm_add_ps(sum, f4_1);\
    __m128 src_3_6 = _mm_shuffle_ps(src4, src_5_8, _MM_SHUFFLE(1,0,3,2));/*3 4 5 6*/\
    f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(2,2,2,2));\
    f4_1 = _mm_mul_ps(f4_1, src_3_6);\
    sum = _mm_add_ps(sum, f4_1);\
    src4 = _mm_shuffle_ps(src4, src_3_6, _MM_SHUFFLE(2,1,2,1));/*2 3 4 5*/\
    f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(1,1,1,1));\
    f4_1 = _mm_mul_ps(f4_1, src4);\
    sum = _mm_add_ps(sum, f4_1);\
    src_3_6 = _mm_shuffle_ps(src_3_6, src_5_8, _MM_SHUFFLE(2,1,2,1));/*4 5 6 7*/\
    f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(3,3,3,3));\
    f4_1 = _mm_mul_ps(f4_1, src_3_6);\
    sum = _mm_add_ps(sum, f4_1)
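
Unrolled to scalar form, the macro adds the partial products of four consecutive taps to four adjacent accumulators (an equivalence sketch, not code from the file):

/* Scalar reference for XY_FILTER_4 (illustrative sketch): given eight
 * consecutive samples src[0..7] (src4 holds src[0..3], src_5_8 holds
 * src[4..7]) and taps f[0..3], it accumulates 4-tap partial products
 * into four adjacent outputs. The shuffles build the shifted windows
 * src[1..4], src[2..5], and src[3..6] entirely in registers. */
void xy_filter_4_scalar_ref(const float src[8], const float f[4], float sum[4])
{
    for (int i = 0; i < 4; i++)
        for (int k = 0; k < 4; k++)
            sum[i] += src[i + k] * f[k];
}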
/****
 * See @xy_filter_c
 **/
void xy_filter_sse(float *dst, int width, int height, int stride, const float *filter, int filter_width)
{
    ASSERT( stride>=4*(width+filter_width) );
    ASSERT( ((stride|(4*width)|(4*filter_width)|reinterpret_cast<int>(dst)|reinterpret_cast<int>(filter))&15)==0 );

    int xx_fix = width > filter_width ? 0 : filter_width - width;
    const float *filter_start = filter;
    BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
    BYTE* end = dst_byte + height*stride;
    for( ; dst_byte<end; dst_byte+=stride )
    {
        float *dst_f = reinterpret_cast<float*>(dst_byte);
        float *dst2 = dst_f - filter_width;
        float *dst_endr = dst_f + width;
        float *dst_end0 = dst_endr - filter_width;
        float *dst_endl = dst_f - xx_fix;
        ASSERT(xx_fix==0 || dst_end0==dst_endl);

        ASSERT(filter_start == filter);
        filter_start += filter_width;
        const float *filter_end = filter_start;

        for (;dst2<dst_endl;dst2+=4)//left margin
        {
            const float *src = dst_f;
            filter_start -= 4;

            //filter 4
            __m128 src4 = _mm_setzero_ps();/*1 2 3 4*/
            __m128 sum = _mm_setzero_ps();
            for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
            {
                __m128 src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
                __m128 f4 = _mm_load_ps(f);

                { XY_FILTER_4(src4, src_5_8, f4, sum); }
                src4 = src_5_8;
            }
            //store result
            _mm_store_ps(dst2, sum);
        }
        for (;dst2<dst_f;dst2+=4)//if width < filter_width
        {
            const float *src = dst_f;
            filter_start-=4;
            filter_end-=4;

            __m128 src4 = _mm_setzero_ps();/*1 2 3 4*/
            __m128 sum = _mm_setzero_ps();
            __m128 src_5_8, f4;
            for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
            {
                src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
                f4 = _mm_load_ps(f);

                { XY_FILTER_4(src4, src_5_8, f4, sum); }
                src4 = src_5_8;
            }
            src_5_8 = _mm_setzero_ps();
            f4 = _mm_load_ps(filter_end);
            { XY_FILTER_4(src4, src_5_8, f4, sum); }
            //store result
            _mm_store_ps(dst2, sum);
        }
        ASSERT(filter_start == filter);
        for (;dst2<dst_end0;dst2+=4)//center: full filter window
        {
            const float *src = dst2;

            //filter 4
            __m128 src4 = _mm_load_ps(src);/*1 2 3 4*/
            __m128 sum = _mm_setzero_ps();
            for(const float* f=filter_start;f<filter_end;f+=4)
            {
                src+=4;
                __m128 src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
                __m128 f4 = _mm_load_ps(f);

                { XY_FILTER_4(src4, src_5_8, f4, sum); }
                src4 = src_5_8;
            }
            //store result
            _mm_store_ps(dst2, sum);
        }
        for (;dst2<dst_endr;dst2+=4)//right margin
        {
            const float *src = dst2;
            filter_end-=4;

            //filter 4
            __m128 src4 = _mm_load_ps(src);//1 2 3 4
            __m128 sum = _mm_setzero_ps();
            __m128 src_5_8, f4;
            for(const float* f=filter_start;f<filter_end;f+=4)
            {
                src+=4;
                src_5_8 = _mm_load_ps(src);//5 6 7 8
                f4 = _mm_load_ps(f);

                { XY_FILTER_4(src4, src_5_8, f4, sum); }
                src4 = src_5_8;//shift the new right-hand 4 floats into the old position
            }
            src_5_8 = _mm_setzero_ps();
            f4 = _mm_load_ps(filter_end);
            { XY_FILTER_4(src4, src_5_8, f4, sum); }
            //store result
            _mm_store_ps(dst2, sum);
        }
    }
}
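
Unlike the C version, the SSE path asserts 16-byte alignment on dst, filter, stride, 4*width, and 4*filter_width, so callers round everything up to multiples of four floats. A conforming setup might look like the sketch below (the demo function, w, h, r, and the buffer handling are assumptions; _mm_malloc stands in for the project's aligned allocator xy_malloc):

/* Hypothetical caller sketch for xy_filter_sse (not part of the file). */
void xy_filter_sse_setup_demo(int w, int h, int r, const float *filter)
{
    int width        = (w + 3) & ~3;           // 4*width divisible by 16
    int filter_width = ((2*r + 1) + 3) & ~3;   // same for the kernel width
    int stride       = (width + filter_width) * sizeof(float);
    float *base = (float*)_mm_malloc(h * stride, 16);
    float *dst  = base + filter_width;         // room for the widened output
    // ... fill source rows at dst (extra columns zeroed); @filter must
    // hold filter_width floats, 16-byte aligned, unused taps zeroed ...
    xy_filter_sse(dst, width, h, stride, filter, filter_width);
    _mm_free(base);
}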
/****
 * Copy and convert src to dst line by line.
 * @dst_width MUST be >= @width;
 * if @dst_width > @width, the extra elements are filled with 0.
 **/
void xy_byte_2_float_c(float *dst, int dst_width, int dst_stride,
    PCUINT8 src, int width, int height, int stride)
{
    PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);

    PCUINT8 src_end = src + height*stride;
    for( ; src<src_end; src+=stride, dst_byte+=dst_stride )
    {
        PCUINT8 src2 = src;
        PCUINT8 src2_end = src2 + width;
        float *dst2 = reinterpret_cast<float*>(dst_byte);

        for (;src2<src2_end;src2++, dst2++)
        {
            *dst2 = *src2;
        }
        float *dst2_end=reinterpret_cast<float*>(dst_byte)+dst_width;
        for (;dst2<dst2_end;dst2++)
        {
            *dst2=0;
        }
    }
}
/****
 * See @xy_byte_2_float_c
 **/
void xy_byte_2_float_sse(float *dst, int dst_width, int dst_stride,
    PCUINT8 src, int width, int height, int stride)
{
    ASSERT( dst_width>=width );
    ASSERT( ((reinterpret_cast<int>(dst)|dst_stride)&15)==0 );
    PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);

    PCUINT8 src_end = src + height*stride;
    for( ; src<src_end; src+=stride, dst_byte+=dst_stride )
    {
        PCUINT8 src2 = src;
        PCUINT8 src2_end0 = src2 + (width&~15);
        float *dst2 = reinterpret_cast<float*>(dst_byte);

        for (;src2<src2_end0;src2+=16, dst2+=16)
        {
            //convert 16 bytes: zero-extend each byte to 32 bits, then to float
            __m128i src16 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src2));
            __m128i src16_lo = _mm_unpacklo_epi8(src16, _mm_setzero_si128());
            __m128i src16_hi = _mm_unpackhi_epi8(src16, _mm_setzero_si128());
            __m128i src16_lo_lo = _mm_unpacklo_epi8(src16_lo, _mm_setzero_si128());
            __m128i src16_lo_hi = _mm_unpackhi_epi8(src16_lo, _mm_setzero_si128());
            __m128i src16_hi_lo = _mm_unpacklo_epi8(src16_hi, _mm_setzero_si128());
            __m128i src16_hi_hi = _mm_unpackhi_epi8(src16_hi, _mm_setzero_si128());
            __m128 dst_f1 = _mm_cvtepi32_ps(src16_lo_lo);
            __m128 dst_f2 = _mm_cvtepi32_ps(src16_lo_hi);
            __m128 dst_f3 = _mm_cvtepi32_ps(src16_hi_lo);
            __m128 dst_f4 = _mm_cvtepi32_ps(src16_hi_hi);
            _mm_store_ps(dst2, dst_f1);
            _mm_store_ps(dst2+4, dst_f2);
            _mm_store_ps(dst2+8, dst_f3);
            _mm_store_ps(dst2+12, dst_f4);
        }
        PCUINT8 src2_end = src + width;
        for (;src2<src2_end;src2++,dst2++)
        {
            *dst2 = *src2;
        }
        float *dst2_end=reinterpret_cast<float*>(dst_byte)+dst_width;
        for (;dst2<dst2_end;dst2++)
        {
            *dst2=0;
        }
    }
}
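
The double unpack against zero is plain zero-extension; traced out per lane (an illustrative note, not from the file), it reads:

/* Lane trace of the unpack tree (illustrative):
 * src16       : b0 b1 b2 ... b15                              16 x u8
 * src16_lo    : b0 0  b1 0  ... b7 0                           8 x u16
 * src16_lo_lo : b0 0 0 0  b1 0 0 0  b2 0 0 0  b3 0 0 0         4 x u32
 * Every byte ends up zero-extended to 32 bits before _mm_cvtepi32_ps;
 * the scalar equivalent is simply dst2[i] = (float)src2[i]. */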
/****
 * Copy the transpose of matrix @src to @dst.
 * @dst_width MUST be >= @height;
 * if @dst_width > @height, the extra elements are filled with 0.
 **/
void xy_float_2_float_transpose_c(float *dst, int dst_width, int dst_stride,
    const float *src, int width, int height, int src_stride)
{
    ASSERT(dst_width >= height);
    PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
    const float* src_end = src + width;
    PCUINT8 src2_end = reinterpret_cast<PCUINT8>(src) + height*src_stride;
    for( ; src<src_end; src++, dst_byte+=dst_stride )
    {
        PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);

        float *dst2 = reinterpret_cast<float*>(dst_byte);
        for (;src2<src2_end;src2+=src_stride,dst2++)
        {
            *dst2 = *reinterpret_cast<const float*>(src2);
        }
        float *dst2_end = reinterpret_cast<float*>(dst_byte) + dst_width;
        for (;dst2<dst2_end;dst2++)
        {
            *dst2 = 0;
        }
    }
}
/****
 * See @xy_float_2_float_transpose_c
 **/
void xy_float_2_float_transpose_sse(float *dst, int dst_width, int dst_stride,
    const float *src, int width, int height, int src_stride)
{
    typedef float DstT;
    typedef const float SrcT;

    ASSERT( (((int)dst|dst_stride)&15)==0 );
    ASSERT(dst_width >= height);
    PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
    SrcT* src_end = src + width;
    PCUINT8 src2_end1 = reinterpret_cast<PCUINT8>(src) + (height&~3)*src_stride;
    PCUINT8 src2_end2 = reinterpret_cast<PCUINT8>(src) + height*src_stride;
    for( ; src<src_end; src++, dst_byte+=dst_stride )
    {
        PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);

        DstT *dst2 = reinterpret_cast<DstT*>(dst_byte);
        for (;src2<src2_end1;src2+=4*src_stride,dst2+=4)
        {
            //gather 4 rows of the current column into one vector
            __m128 m1 = _mm_set_ps(
                *(SrcT*)(src2+3*src_stride),
                *(SrcT*)(src2+2*src_stride),
                *(SrcT*)(src2+src_stride),
                *(SrcT*)(src2));
            _mm_store_ps(dst2, m1);
        }
        for (;src2<src2_end2;src2+=src_stride,dst2++)
        {
            *dst2 = *reinterpret_cast<SrcT*>(src2);
        }
        DstT *dst2_end = reinterpret_cast<DstT*>(dst_byte) + dst_width;
        for (;dst2<dst2_end;dst2++)
        {
            *dst2 = 0;
        }
    }
}
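
The transpose is what lets the vertical blur reuse xy_filter_sse as a second horizontal pass over contiguous rows. The contract both variants satisfy can be written as a check (a hypothetical helper, not from the file; strides in bytes as above):

/* Hypothetical contract check for the transpose routines: dst row x
 * holds column x of src, zero-padded out to dst_width. */
void transpose_contract_check(const float *dst, int dst_width, int dst_stride,
                              const float *src, int width, int height, int src_stride)
{
    for (int x = 0; x < width; x++)
        for (int y = 0; y < dst_width; y++)
        {
            const float *d = dst + x*(dst_stride/4) + y;
            ASSERT(*d == (y < height ? *(src + y*(src_stride/4) + x) : 0));
        }
}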
/****
 * Transpose and round matrix @src, then copy to @dst.
 * @dst_width MUST be >= @height;
 * if @dst_width > @height, the extra elements are filled with 0.
 **/
void xy_float_2_byte_transpose_c(UINT8 *dst, int dst_width, int dst_stride,
    const float *src, int width, int height, int src_stride)
{
    ASSERT(dst_width >= height);
    PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
    const float* src_end = src + width;
    PCUINT8 src2_end = reinterpret_cast<PCUINT8>(src) + height*src_stride;
    for( ; src<src_end; src++, dst_byte+=dst_stride )
    {
        PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);

        UINT8 *dst2 = reinterpret_cast<UINT8*>(dst_byte);
        for (;src2<src2_end;src2+=src_stride,dst2++)
        {
            *dst2 = static_cast<UINT8>(*reinterpret_cast<const float*>(src2)+0.5);
        }
        UINT8 *dst2_end = reinterpret_cast<UINT8*>(dst_byte) + dst_width;
        for (;dst2<dst2_end;dst2++)
        {
            *dst2 = 0;
        }
    }
}
void xy_float_2_byte_transpose_sse(UINT8 *dst, int dst_width, int dst_stride,
    const float *src, int width, int height, int src_stride)
{
    typedef UINT8 DstT;
    typedef const float SrcT;

    ASSERT(dst_width >= height);
    PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
    SrcT* src_end = src + width;
    PCUINT8 src2_end00 = reinterpret_cast<PCUINT8>(src) + (height&~15)*src_stride;
    PCUINT8 src2_end = reinterpret_cast<PCUINT8>(src) + height*src_stride;
    for( ; src<src_end; src++, dst_byte+=dst_stride )
    {
        PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);

        DstT *dst2 = reinterpret_cast<DstT*>(dst_byte);
        for (;src2<src2_end00;src2+=16*src_stride,dst2+=16)
        {
            //gather 16 rows of the current column
            __m128 m1 = _mm_set_ps(
                *(SrcT*)(src2+3*src_stride),
                *(SrcT*)(src2+2*src_stride),
                *(SrcT*)(src2+src_stride),
                *(SrcT*)(src2));
            __m128 m2 = _mm_set_ps(
                *(SrcT*)(src2+7*src_stride),
                *(SrcT*)(src2+6*src_stride),
                *(SrcT*)(src2+5*src_stride),
                *(SrcT*)(src2+4*src_stride));
            __m128 m3 = _mm_set_ps(
                *(SrcT*)(src2+11*src_stride),
                *(SrcT*)(src2+10*src_stride),
                *(SrcT*)(src2+9*src_stride),
                *(SrcT*)(src2+8*src_stride));
            __m128 m4 = _mm_set_ps(
                *(SrcT*)(src2+15*src_stride),
                *(SrcT*)(src2+14*src_stride),
                *(SrcT*)(src2+13*src_stride),
                *(SrcT*)(src2+12*src_stride));

            //round to int, then pack down to bytes with saturation
            __m128i i1 = _mm_cvtps_epi32(m1);
            __m128i i2 = _mm_cvtps_epi32(m2);
            __m128i i3 = _mm_cvtps_epi32(m3);
            __m128i i4 = _mm_cvtps_epi32(m4);

            i1 = _mm_packs_epi32(i1,i2);
            i3 = _mm_packs_epi32(i3,i4);
            i1 = _mm_packus_epi16(i1,i3);

            _mm_store_si128((__m128i*)dst2, i1);
        }
        for (;src2<src2_end;src2+=src_stride,dst2++)
        {
            *dst2 = static_cast<DstT>(*reinterpret_cast<SrcT*>(src2)+0.5);
        }
        DstT *dst2_end = reinterpret_cast<DstT*>(dst_byte) + dst_width;
        for (;dst2<dst2_end;dst2++)
        {
            *dst2 = 0;
        }
    }
}
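
Per element, the convert-and-pack chain amounts to rounding followed by clamping (a scalar sketch, not from the file; note that _mm_cvtps_epi32 rounds to nearest under the default MXCSR mode, while the scalar tail above truncates after adding 0.5):

/* Scalar equivalent of the cvt/pack chain (illustrative):
 * _mm_cvtps_epi32 rounds to the nearest integer; _mm_packs_epi32 then
 * _mm_packus_epi16 saturate the result into [0, 255]. */
UINT8 float_2_byte_scalar_ref(float x)
{
    int v = (int)(x + 0.5f);
    if (v < 0)   v = 0;
    if (v > 255) v = 255;
    return (UINT8)v;
}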
/****
 * To do: decent CPU capability check.
 **/
void xy_gaussian_blur(PUINT8 dst, int dst_stride,
    PCUINT8 src, int width, int height, int stride,
    const float *gt, int r, int gt_ex_width)
{
    ASSERT(width<=stride && width+2*r<=dst_stride);
    int ex_mask_width = ((r*2+1)+3)&~3;
    ASSERT(ex_mask_width<=gt_ex_width);
    if (ex_mask_width>gt_ex_width)
    {
        //deliberate crash so a kernel-size mismatch cannot pass silently in release builds
        int o=0;
        o=o/o;
        exit(-1);
    }

    int fwidth = (width+3)&~3;
    int fstride = (fwidth + ex_mask_width)*sizeof(float);
    int fheight = (height+3)&~3;
    int fstride_ver = (fheight+ex_mask_width)*sizeof(float);

    PUINT8 buff_base = reinterpret_cast<PUINT8>(xy_malloc(height*fstride + (fwidth + ex_mask_width)*fstride_ver));

    float *hor_buff_base = reinterpret_cast<float*>(buff_base);
    float *hor_buff = hor_buff_base + ex_mask_width;

    // byte to float
    ASSERT( ((width+15)&~15)<=stride );
    xy_byte_2_float_sse(hor_buff, fwidth, fstride, src, width, height, stride);

    // horizontal pass
    xy_filter_sse(hor_buff, fwidth, height, fstride, gt, ex_mask_width);

    // transpose
    float *ver_buff_base = reinterpret_cast<float*>(buff_base + height*fstride);
    float *ver_buff = ver_buff_base + ex_mask_width;

    int true_width = width+r*2;
    xy_float_2_float_transpose_sse(ver_buff, fheight, fstride_ver, hor_buff-r*2, true_width, height, fstride);

    // vertical pass
    xy_filter_sse(ver_buff, fheight, true_width, fstride_ver, gt, ex_mask_width);

    // transpose back
    int true_height = height + 2*r;
    xy_float_2_byte_transpose_sse(dst, true_width, dst_stride, ver_buff-r*2, true_height, true_width, fstride_ver);

    xy_free(buff_base);
    _mm_empty();
}
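
The kernel gt is built by the caller elsewhere in the project. A hypothetical construction that satisfies the contracts above — taps in gt[0..2r], zero padding up to a multiple of four floats, 16-byte aligned, normalized to sum to 1 — might look like this (the demo function and its parameters are assumptions, not code from this file):

/* Hypothetical caller sketch. dst must provide (width+2*r) x (height+2*r)
 * samples with dst_stride >= width+2*r, per the first ASSERT above. */
void gaussian_blur_demo(PUINT8 dst, int dst_stride,
                        PCUINT8 src, int width, int height, int stride,
                        double sigma)
{
    int r = (int)ceil(3 * sigma);             // cover +/- 3 sigma
    int gt_ex_width = ((2*r + 1) + 3) & ~3;   // same padding as ex_mask_width
    float *gt = (float*)_mm_malloc(gt_ex_width * sizeof(float), 16);
    memset(gt, 0, gt_ex_width * sizeof(float));
    double sum = 0;
    for (int i = 0; i <= 2*r; i++)
        sum += gt[i] = (float)exp(-(i - r)*(i - r) / (2 * sigma * sigma));
    for (int i = 0; i <= 2*r; i++)
        gt[i] = (float)(gt[i] / sum);         // normalize: taps sum to 1
    xy_gaussian_blur(dst, dst_stride, src, width, height, stride, gt, r, gt_ex_width);
    _mm_free(gt);
}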