Blur Optimization. [Part 4]
[xy_vsfilter.git] / src / subtitles / xy_filter.cpp
blob 77f6531ad1832216399459282b40545c8a8b76cd
#include "stdafx.h"

typedef const UINT8 CUINT8, *PCUINT8;
/****
 * See @xy_filter_c
 **/
__forceinline void xy_filter_one_line_c(float *dst, int width, const float *filter, int filter_width)
{
    const float *filter_start = filter;
    int xx_fix = width > filter_width ? 0 : filter_width - width;
    float *dst2 = dst - filter_width;
    float *dst_endr = dst + width;
    float *dst_end0 = dst_endr - filter_width;
    float *dst_endl = dst - xx_fix;
    ASSERT(xx_fix==0 || dst_end0==dst_endl);

    ASSERT(filter_start == filter);
    filter_start += filter_width;
    const float *filter_end = filter_start;
    for (;dst2<dst_endl;dst2++, filter_start--)//left margin
    {
        const float *src = dst;
        float sum = 0;
        for(const float* f=filter_start;f<filter_end;f++, src++)
        {
            sum += src[0] * f[0];
        }
        *dst2 = sum;
    }
    for (;dst2<dst;dst2++, filter_start--, filter_end--)//if width < filter_width
    {
        const float *src = dst;
        float sum = 0;
        for(const float* f=filter_start;f<filter_end;f++, src++)
        {
            sum += src[0] * f[0];
        }
        *dst2 = sum;
    }
    ASSERT(filter_start==filter);
    for (;dst2<dst_end0;dst2++)//body
    {
        const float *src = dst2;
        float sum = 0;
        for(const float* f=filter_start;f<filter_end;f++, src++)
        {
            sum += src[0] * f[0];
        }
        *dst2 = sum;
    }
    for (;dst2<dst_endr;dst2++, filter_end--)//right margin
    {
        const float *src = dst2;
        float sum = 0;
        for(const float* f=filter;f<filter_end;f++, src++)
        {
            sum += src[0] * f[0];
        }
        *dst2 = sum;
    }
}
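/****
 * Editor's note: every output item follows the same formula,
 *   out[p] = in[p]*filter[0] + in[p+1]*filter[1] + ... + in[p+fw-1]*filter[fw-1],
 * with in[] treated as 0 outside [dst, dst+width); the four loops above
 * exist only so the margins never actually read outside that range.
 * The result is written in place and occupies [dst-filter_width, dst+width).
 **/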
/****
 * dst memory layout:
 * 1. Source content starts from @dst;
 * 2. Output content starts from @dst-@mwidth;
 *
 * |- stride -|
 * | <- @dst-@mwidth --------| <- @dst+0 --------------------|
 * |-                       -|-                             -|
 * |- margin                -|- src width*height items      -|
 * |- mwidth*height items   -|-                             -|
 * |- do NOT need to init   -|-                             -|
 * |- when input            -|-                             -|
 * |---------------------------------------------------------|
 **/
void xy_filter_c(float *dst, int width, int height, int stride, const float *filter, int filter_width)
{
    ASSERT( stride>=4*(width+filter_width) );

    BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
    BYTE* end = dst_byte + height*stride;
    for( ; dst_byte<end; dst_byte+=stride )
    {
        xy_filter_one_line_c(reinterpret_cast<float*>(dst_byte), width, filter, filter_width);
    }
}
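/****
 * Editor's usage sketch (hypothetical, not part of the original file):
 * one 8-item row filtered with a normalized 4-tap box kernel.
 *
 *   const int width = 8, filter_width = 4;
 *   const int stride = 4*(width+filter_width);        // bytes per row
 *   float buff[width+filter_width];                   // margin + source
 *   float *data = buff + filter_width;                // source starts here
 *   // ... fill data[0..width-1]; buff[0..3] need no init ...
 *   const float filter[filter_width] = {0.25f, 0.25f, 0.25f, 0.25f};
 *   xy_filter_c(data, width, 1, stride, filter, filter_width);
 *   // result now occupies buff[0 .. width+filter_width-1]
 **/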
/****
 * An inline function for this sometimes generates poor code, so a macro is
 * used instead.
 *
 * @src4, @src_5_8, @f4, @sum : __m128
 * @src_5_8, @f4: const
 * @sum : output
 * @src4: undefined after the macro
 **/
#define XY_FILTER_4(src4, src_5_8, f4, sum) \
    __m128 f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(0,0,0,0));\
    f4_1 = _mm_mul_ps(f4_1, src4);\
    sum = _mm_add_ps(sum, f4_1);\
    __m128 src_3_6 = _mm_shuffle_ps(src4, src_5_8, _MM_SHUFFLE(1,0,3,2));/*3 4 5 6*/\
    f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(2,2,2,2));\
    f4_1 = _mm_mul_ps(f4_1, src_3_6);\
    sum = _mm_add_ps(sum, f4_1);\
    src4 = _mm_shuffle_ps(src4, src_3_6, _MM_SHUFFLE(2,1,2,1));/*2 3 4 5*/\
    f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(1,1,1,1));\
    f4_1 = _mm_mul_ps(f4_1, src4);\
    sum = _mm_add_ps(sum, f4_1);\
    src_3_6 = _mm_shuffle_ps(src_3_6, src_5_8, _MM_SHUFFLE(2,1,2,1));/*4 5 6 7*/\
    f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(3,3,3,3));\
    f4_1 = _mm_mul_ps(f4_1, src_3_6);\
    sum = _mm_add_ps(sum, f4_1)
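/****
 * Editor's note: with src[0..7] = {src4, src_5_8}, the macro accumulates
 * four shifted dot products at once:
 *   sum[k] += f4[0]*src[k] + f4[1]*src[k+1] + f4[2]*src[k+2] + f4[3]*src[k+3],
 * for k = 0..3. The shuffles build the sliding windows "2 3 4 5",
 * "3 4 5 6" and "4 5 6 7" in registers, avoiding unaligned loads.
 **/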
__forceinline void xy_filter_one_line_sse_v4(float *dst, int width, const float *filter, int filter_width)
{
    int xx_fix = width > filter_width ? 0 : filter_width - width;
    const float *filter_start = filter;
    float *dst2 = dst - filter_width;
    float *dst_endr = dst + width;
    float *dst_end0 = dst_endr - filter_width;
    float *dst_endl = dst - xx_fix;
    ASSERT(xx_fix==0 || dst_end0==dst_endl);

    ASSERT(filter_start == filter);
    filter_start += filter_width;
    const float *filter_end = filter_start;

    for (;dst2<dst_endl;dst2+=4)//left margin
    {
        const float *src = dst;
        filter_start -= 4;

        //filter 4
        __m128 src4 = _mm_setzero_ps();/*1 2 3 4*/
        __m128 sum = _mm_setzero_ps();
        for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
        {
            __m128 src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
            __m128 f4 = _mm_load_ps(f);
            { XY_FILTER_4(src4, src_5_8, f4, sum); }
            src4 = src_5_8;
        }
        //store result
        _mm_store_ps(dst2, sum);
    }
    for (;dst2<dst;dst2+=4)//if width < filter_width
    {
        const float *src = dst;
        filter_start-=4;
        filter_end-=4;

        __m128 src4 = _mm_setzero_ps();/*1 2 3 4*/
        __m128 sum = _mm_setzero_ps();
        __m128 src_5_8, f4;
        for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
        {
            src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
            f4 = _mm_load_ps(f);
            { XY_FILTER_4(src4, src_5_8, f4, sum); }
            src4 = src_5_8;
        }
        src_5_8 = _mm_setzero_ps();
        f4 = _mm_load_ps(filter_end);
        { XY_FILTER_4(src4, src_5_8, f4, sum); }
        //store result
        _mm_store_ps(dst2, sum);
    }
    ASSERT(filter_start == filter);
    for (;dst2<dst_end0;dst2+=4)//body
    {
        const float *src = dst2;

        //filter 4
        __m128 src4 = _mm_load_ps(src);/*1 2 3 4*/
        __m128 sum = _mm_setzero_ps();
        for(const float* f=filter_start;f<filter_end;f+=4)
        {
            src+=4;
            __m128 src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
            __m128 f4 = _mm_load_ps(f);
            { XY_FILTER_4(src4, src_5_8, f4, sum); }
            src4 = src_5_8;
        }
        //store result
        _mm_store_ps(dst2, sum);
    }
    for (;dst2<dst_endr;dst2+=4)//right margin
    {
        const float *src = dst2;
        filter_end-=4;

        //filter 4
        __m128 src4 = _mm_load_ps(src);//1 2 3 4
        __m128 sum = _mm_setzero_ps();
        __m128 src_5_8, f4;
        for(const float* f=filter_start;f<filter_end;f+=4)
        {
            src+=4;
            src_5_8 = _mm_load_ps(src);//5 6 7 8
            f4 = _mm_load_ps(f);
            { XY_FILTER_4(src4, src_5_8, f4, sum); }
            src4 = src_5_8;//move new 4 in_n_out to old 4 in_n_out
        }
        src_5_8 = _mm_setzero_ps();
        f4 = _mm_load_ps(filter_end);
        { XY_FILTER_4(src4, src_5_8, f4, sum); }
        //store result
        _mm_store_ps(dst2, sum);
    }
}
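/****
 * Editor's note: this mirrors the four stages of xy_filter_one_line_c but
 * produces 4 outputs per iteration, so it needs 16-byte aligned @dst and
 * @filter plus @width and @filter_width that are multiples of 4 -- see the
 * asserts in xy_filter_sse_v4 below.
 **/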
/****
 * See @xy_filter_c
 **/
void xy_filter_sse_v4(float *dst, int width, int height, int stride, const float *filter, int filter_width)
{
    ASSERT( stride>=4*(width+filter_width) );
    ASSERT( ((stride|(4*width)|(4*filter_width)|reinterpret_cast<int>(dst)|reinterpret_cast<int>(filter))&15)==0 );

    BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
    BYTE* end = dst_byte + height*stride;
    for( ; dst_byte<end; dst_byte+=stride )
    {
        xy_filter_one_line_sse_v4(reinterpret_cast<float*>(dst_byte), width, filter, filter_width);
    }
}
template<int LENGTH>
struct M128s
{
    __m128 x;
    M128s<LENGTH - 4> next;

    template<int Index> __forceinline __m128& GetAt()
    {
        return next.GetAt<Index - 4>();
    }
    template<> __forceinline __m128& GetAt<0>()
    {
        return x;
    }
    template<int Start, int Offset> __forceinline __m128& GetAt()
    {
        return GetAt<Start + Offset>();
    }

    __forceinline void Load(const float* src)
    {
        x = _mm_load_ps(src);
        next.Load(src+4);
    }
};

template<>
struct M128s<0>
{
    void Load(const float* src)
    {
    }
};
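/****
 * Editor's note: M128s<LENGTH> is a compile-time array of LENGTH/4 __m128
 * values, e.g. M128s<12> holds {x, next.x, next.next.x}, and GetAt<8>()
 * resolves to next.next.x with no runtime indexing. This lets the templates
 * below keep an entire filter kernel in xmm registers.
 **/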
template<int FILTER_LENGTH, int START, int LENGTH>
struct Filter4
{
    static __forceinline void do_cal(__m128& src0_128, const float * src4, M128s<FILTER_LENGTH>& filter128s, __m128& sum)
    {
        Filter4<FILTER_LENGTH,START,LENGTH-4>::do_cal(src0_128, src4, filter128s, sum);
        __m128 src4_128 = _mm_load_ps(src4+LENGTH-4);
        XY_FILTER_4(src0_128, src4_128, filter128s.GetAt<START+LENGTH-4>(), sum);
        src0_128 = src4_128;
    }
};

template<int FILTER_LENGTH, int START>
struct Filter4<FILTER_LENGTH, START, 0>
{
    static __forceinline void do_cal(__m128& src0_128, const float * src4, M128s<FILTER_LENGTH>& filter128s, __m128& sum)
    {
    }
};
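/****
 * Editor's note: Filter4<FL,START,LENGTH> recursively expands into LENGTH/4
 * applications of XY_FILTER_4, i.e. a fully unrolled inner convolution loop
 * whose taps come straight from the registers
 * filter128s.GetAt<START>() .. GetAt<START+LENGTH-4>().
 **/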
template<int FILTER_LENGTH,int MARGIN_LENGTH>
struct FilterAllMargin
{
    static __forceinline void cal_left_margin(float * src, M128s<FILTER_LENGTH>& filter128s)
    {
        //filter 4
        __m128 src0 = _mm_setzero_ps();
        __m128 sum = _mm_setzero_ps();
        Filter4<FILTER_LENGTH,MARGIN_LENGTH-4,FILTER_LENGTH-MARGIN_LENGTH+4>::do_cal(src0, src, filter128s, sum);
        _mm_store_ps(src-MARGIN_LENGTH, sum);
        FilterAllMargin<FILTER_LENGTH,MARGIN_LENGTH-4>::cal_left_margin(src, filter128s);
    }

    static __forceinline void cal_right_margin(float * src, M128s<FILTER_LENGTH>& filter128s)
    {
        //filter 4
        __m128 src0 = _mm_load_ps(src);
        __m128 sum = _mm_setzero_ps();
        Filter4<FILTER_LENGTH,0,MARGIN_LENGTH-4>::do_cal(src0, src+4, filter128s, sum);
        __m128 src4 = _mm_setzero_ps();
        XY_FILTER_4(src0, src4, filter128s.GetAt<MARGIN_LENGTH-4>(), sum);
        //store result
        _mm_store_ps(src, sum);
        FilterAllMargin<FILTER_LENGTH,MARGIN_LENGTH-4>::cal_right_margin(src+4, filter128s);
    }
};

template<int FILTER_LENGTH>
struct FilterAllMargin<FILTER_LENGTH,0>
{
    static __forceinline void cal_left_margin(float * src, M128s<FILTER_LENGTH>& filter128s)
    {
    }
    static __forceinline void cal_right_margin(float * src, M128s<FILTER_LENGTH>& filter128s)
    {
    }
};
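/****
 * Editor's note: MARGIN_LENGTH counts down by 4 per recursion step, so
 * cal_left_margin/cal_right_margin expand into FILTER_LENGTH/4 stores each.
 * This is the margin handling of xy_filter_one_line_sse_v4, but fully
 * unrolled at compile time; the <FILTER_LENGTH,0> specialization terminates
 * the recursion.
 **/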
/****
 * See @xy_filter_c
 * Constraint:
 *   FILTER_LENGTH%4 == 0 && FILTER_LENGTH<=width
 **/
template<int FILTER_LENGTH>
void xy_filter_sse_template(float *dst, int width, int height, int stride, const float *filter)
{
    ASSERT( stride>=4*(width+FILTER_LENGTH) );
    ASSERT( ((stride|(4*width)|(4*FILTER_LENGTH)|reinterpret_cast<int>(dst)|reinterpret_cast<int>(filter))&15)==0 );

    M128s<FILTER_LENGTH> filter128s;
    filter128s.Load(filter);

    const float *filter_start = filter;
    BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
    BYTE* end = dst_byte + height*stride;
    for( ; dst_byte<end; dst_byte+=stride )
    {
        float *dst2 = reinterpret_cast<float*>(dst_byte);

        //left margin
        FilterAllMargin<FILTER_LENGTH,FILTER_LENGTH>::cal_left_margin(dst2, filter128s);
        float *dst_end1 = dst2 + width;
        float *dst_end0 = dst_end1 - FILTER_LENGTH;
        for (;dst2<dst_end0;dst2+=4)//body
        {
            const float *src = dst2;

            //filter 4
            __m128 src0 = _mm_load_ps(src);/*1 2 3 4*/
            src += 4;
            __m128 sum = _mm_setzero_ps();
            Filter4<FILTER_LENGTH,0,FILTER_LENGTH>::do_cal(src0, src, filter128s, sum);
            //store result
            _mm_store_ps(dst2, sum);
        }
        //right margin
        FilterAllMargin<FILTER_LENGTH,FILTER_LENGTH>::cal_right_margin(dst2, filter128s);
    }
}
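/****
 * Editor's note: with FILTER_LENGTH fixed at compile time, the whole kernel
 * sits in xmm registers (M128s) and every loop over filter taps is unrolled
 * (Filter4, FilterAllMargin); only the walk along each row remains a
 * runtime loop.
 **/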
/****
 * See @xy_filter_c
 **/
void xy_filter_sse(float *dst, int width, int height, int stride, const float *filter, int filter_width)
{
    typedef void (*Filter)(float *dst, int width, int height, int stride, const float *filter);
    const Filter filters[] = {
        xy_filter_sse_template<4>, xy_filter_sse_template<8>, xy_filter_sse_template<12>, xy_filter_sse_template<16>,
        xy_filter_sse_template<20>, xy_filter_sse_template<24>, xy_filter_sse_template<28>
    };
    if (filter_width<=28 && filter_width<=width)
    {
        ASSERT(filter_width%4==0);
        filters[(filter_width-1)/4](dst, width, height, stride, filter);
    }
    else
    {
        xy_filter_sse_v4(dst, width, height, stride, filter, filter_width);
    }
}
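/****
 * Editor's note: (filter_width-1)/4 maps the kernel widths 4,8,...,28 to
 * indices 0..6, e.g. filter_width==12 selects xy_filter_sse_template<12>;
 * anything wider than 28 (or wider than the row) falls back to the generic
 * xy_filter_sse_v4.
 **/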
/****
 * Copy and convert src to dst line by line.
 * @dst_width MUST be >= @width;
 * if @dst_width > @width, the extra elements will be filled with 0.
 **/
void xy_byte_2_float_c(float *dst, int dst_width, int dst_stride,
    PCUINT8 src, int width, int height, int stride)
{
    PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);

    PCUINT8 src_end = src + height*stride;
    for( ; src<src_end; src+=stride, dst_byte+=dst_stride )
    {
        PCUINT8 src2 = src;
        PCUINT8 src2_end = src2 + width;
        float *dst2 = reinterpret_cast<float*>(dst_byte);

        for (;src2<src2_end;src2++, dst2++)
        {
            *dst2 = *src2;
        }
        float *dst2_end = reinterpret_cast<float*>(dst_byte)+dst_width;
        for (;dst2<dst2_end;dst2++)
        {
            *dst2 = 0;
        }
    }
}
/****
 * See @xy_byte_2_float_c
 **/
void xy_byte_2_float_sse(float *dst, int dst_width, int dst_stride,
    PCUINT8 src, int width, int height, int stride)
{
    ASSERT( dst_width>=width );
    ASSERT( ((reinterpret_cast<int>(dst)|dst_stride)&15)==0 );
    PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);

    PCUINT8 src_end = src + height*stride;
    for( ; src<src_end; src+=stride, dst_byte+=dst_stride )
    {
        PCUINT8 src2 = src;
        PCUINT8 src2_end0 = src2 + (width&~15);
        float *dst2 = reinterpret_cast<float*>(dst_byte);

        for (;src2<src2_end0;src2+=16, dst2+=16)
        {
            //convert 16 bytes to 16 floats
            __m128i src16 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src2));
            __m128i src16_lo = _mm_unpacklo_epi8(src16, _mm_setzero_si128());
            __m128i src16_hi = _mm_unpackhi_epi8(src16, _mm_setzero_si128());
            __m128i src16_lo_lo = _mm_unpacklo_epi8(src16_lo, _mm_setzero_si128());
            __m128i src16_lo_hi = _mm_unpackhi_epi8(src16_lo, _mm_setzero_si128());
            __m128i src16_hi_lo = _mm_unpacklo_epi8(src16_hi, _mm_setzero_si128());
            __m128i src16_hi_hi = _mm_unpackhi_epi8(src16_hi, _mm_setzero_si128());
            __m128 dst_f1 = _mm_cvtepi32_ps(src16_lo_lo);
            __m128 dst_f2 = _mm_cvtepi32_ps(src16_lo_hi);
            __m128 dst_f3 = _mm_cvtepi32_ps(src16_hi_lo);
            __m128 dst_f4 = _mm_cvtepi32_ps(src16_hi_hi);
            _mm_store_ps(dst2, dst_f1);
            _mm_store_ps(dst2+4, dst_f2);
            _mm_store_ps(dst2+8, dst_f3);
            _mm_store_ps(dst2+12, dst_f4);
        }
        PCUINT8 src2_end = src + width;
        for (;src2<src2_end;src2++,dst2++)
        {
            *dst2 = *src2;
        }
        float *dst2_end = reinterpret_cast<float*>(dst_byte)+dst_width;
        for (;dst2<dst2_end;dst2++)
        {
            *dst2 = 0;
        }
    }
}
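/****
 * Editor's note: unpacking against a zero vector widens u8 -> u16 -> u32;
 * the second round can reuse _mm_unpacklo/hi_epi8 because the high bytes
 * are already zero, so interleaving with zero again lands each pixel in its
 * own 32-bit lane, ready for _mm_cvtepi32_ps. 16 pixels per iteration.
 **/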
/****
 * Copy the transpose of Matrix src to dst.
 * @dst_width MUST be >= @height;
 * if @dst_width > @height, the extra elements will be filled with 0.
 **/
void xy_float_2_float_transpose_c(float *dst, int dst_width, int dst_stride,
    const float *src, int width, int height, int src_stride)
{
    ASSERT(dst_width >= height);
    PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
    const float* src_end = src + width;
    PCUINT8 src2_end = reinterpret_cast<PCUINT8>(src) + height*src_stride;
    for( ; src<src_end; src++, dst_byte+=dst_stride )
    {
        PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);

        float *dst2 = reinterpret_cast<float*>(dst_byte);
        for (;src2<src2_end;src2+=src_stride,dst2++)
        {
            *dst2 = *reinterpret_cast<const float*>(src2);
        }
        float *dst2_end = reinterpret_cast<float*>(dst_byte) + dst_width;
        for (;dst2<dst2_end;dst2++)
        {
            *dst2 = 0;
        }
    }
}
/****
 * See @xy_float_2_float_transpose_c
 **/
void xy_float_2_float_transpose_sse(float *dst, int dst_width, int dst_stride,
    const float *src, int width, int height, int src_stride)
{
    typedef float DstT;
    typedef const float SrcT;

    ASSERT( (((int)dst|dst_stride)&15)==0 );
    ASSERT(dst_width >= height);
    PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
    SrcT* src_end = src + width;
    PCUINT8 src2_end1 = reinterpret_cast<PCUINT8>(src) + (height&~3)*src_stride;
    PCUINT8 src2_end2 = reinterpret_cast<PCUINT8>(src) + height*src_stride;
    for( ; src<src_end; src++, dst_byte+=dst_stride )
    {
        PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);

        DstT *dst2 = reinterpret_cast<DstT*>(dst_byte);
        for (;src2<src2_end1;src2+=4*src_stride,dst2+=4)
        {
            //gather 4 rows of the current column
            __m128 m1 = _mm_set_ps(
                *(SrcT*)(src2+3*src_stride),
                *(SrcT*)(src2+2*src_stride),
                *(SrcT*)(src2+src_stride),
                *(SrcT*)(src2));
            _mm_store_ps(dst2, m1);
        }
        for (;src2<src2_end2;src2+=src_stride,dst2++)
        {
            *dst2 = *reinterpret_cast<SrcT*>(src2);
        }
        DstT *dst2_end = reinterpret_cast<DstT*>(dst_byte) + dst_width;
        for (;dst2<dst2_end;dst2++)
        {
            *dst2 = 0;
        }
    }
}
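/****
 * Editor's note: only the stores are vectorized here -- _mm_set_ps gathers
 * one column from 4 strided rows with scalar loads, so the gain over the C
 * version comes from the aligned 16-byte stores into the transposed
 * destination.
 **/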
/****
 * Transpose and round Matrix src, then copy to dst.
 * @dst_width MUST be >= @height;
 * if @dst_width > @height, the extra elements will be filled with 0.
 **/
void xy_float_2_byte_transpose_c(UINT8 *dst, int dst_width, int dst_stride,
    const float *src, int width, int height, int src_stride)
{
    ASSERT(dst_width >= height);
    PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
    const float* src_end = src + width;
    PCUINT8 src2_end = reinterpret_cast<PCUINT8>(src) + height*src_stride;
    for( ; src<src_end; src++, dst_byte+=dst_stride )
    {
        PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);

        UINT8 *dst2 = reinterpret_cast<UINT8*>(dst_byte);
        for (;src2<src2_end;src2+=src_stride,dst2++)
        {
            *dst2 = static_cast<UINT8>(*reinterpret_cast<const float*>(src2)+0.5);//round to nearest
        }
        UINT8 *dst2_end = reinterpret_cast<UINT8*>(dst_byte) + dst_width;
        for (;dst2<dst2_end;dst2++)
        {
            *dst2 = 0;
        }
    }
}
void xy_float_2_byte_transpose_sse(UINT8 *dst, int dst_width, int dst_stride,
    const float *src, int width, int height, int src_stride)
{
    typedef UINT8 DstT;
    typedef const float SrcT;

    ASSERT(dst_width >= height);
    ASSERT((((int)dst|dst_stride)&15)==0);
    PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
    SrcT* src_end = src + width;
    PCUINT8 src2_end00 = reinterpret_cast<PCUINT8>(src) + (height&~15)*src_stride;
    PCUINT8 src2_end = reinterpret_cast<PCUINT8>(src) + height*src_stride;
    for( ; src<src_end; src++, dst_byte+=dst_stride )
    {
        PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);

        DstT *dst2 = reinterpret_cast<DstT*>(dst_byte);
        for (;src2<src2_end00;src2+=16*src_stride,dst2+=16)
        {
            //gather 16 rows of the current column
            __m128 m1 = _mm_set_ps(
                *(SrcT*)(src2+3*src_stride),
                *(SrcT*)(src2+2*src_stride),
                *(SrcT*)(src2+src_stride),
                *(SrcT*)(src2));
            __m128 m2 = _mm_set_ps(
                *(SrcT*)(src2+7*src_stride),
                *(SrcT*)(src2+6*src_stride),
                *(SrcT*)(src2+5*src_stride),
                *(SrcT*)(src2+4*src_stride));
            __m128 m3 = _mm_set_ps(
                *(SrcT*)(src2+11*src_stride),
                *(SrcT*)(src2+10*src_stride),
                *(SrcT*)(src2+9*src_stride),
                *(SrcT*)(src2+8*src_stride));
            __m128 m4 = _mm_set_ps(
                *(SrcT*)(src2+15*src_stride),
                *(SrcT*)(src2+14*src_stride),
                *(SrcT*)(src2+13*src_stride),
                *(SrcT*)(src2+12*src_stride));

            //round and narrow: float -> int32 -> int16 -> uint8
            __m128i i1 = _mm_cvtps_epi32(m1);
            __m128i i2 = _mm_cvtps_epi32(m2);
            __m128i i3 = _mm_cvtps_epi32(m3);
            __m128i i4 = _mm_cvtps_epi32(m4);

            i1 = _mm_packs_epi32(i1,i2);
            i3 = _mm_packs_epi32(i3,i4);
            i1 = _mm_packus_epi16(i1,i3);

            _mm_store_si128((__m128i*)dst2, i1);
        }
        for (;src2<src2_end;src2+=src_stride,dst2++)
        {
            *dst2 = static_cast<DstT>(*reinterpret_cast<SrcT*>(src2)+0.5);
        }
        DstT *dst2_end = reinterpret_cast<DstT*>(dst_byte) + dst_width;
        for (;dst2<dst2_end;dst2++)
        {
            *dst2 = 0;
        }
    }
}
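/****
 * Editor's note: _mm_cvtps_epi32 rounds to nearest under the default MXCSR
 * mode (the scalar tail approximates this with +0.5 and a truncating cast),
 * then _mm_packs_epi32/_mm_packus_epi16 saturate int32 -> int16 -> uint8,
 * clamping the output to [0,255].
 **/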
/****
 * To do: a decent CPU capability check
 **/
void xy_gaussian_blur(PUINT8 dst, int dst_stride,
    PCUINT8 src, int width, int height, int stride,
    const float *gt, int r, int gt_ex_width)
{
    ASSERT(width<=stride && width+2*r<=dst_stride);

    int ex_mask_width = ((r*2+1)+3)&~3;
    ASSERT(ex_mask_width<=gt_ex_width);
    if (ex_mask_width>gt_ex_width)
    {
        //crash on purpose: the caller did not reserve enough kernel padding
        int o=0;
        o=o/o;
        exit(-1);
    }

    int fwidth = (width+3)&~3;
    int fstride = (fwidth + ex_mask_width)*sizeof(float);
    int fheight = (height+3)&~3;
    int fstride_ver = (fheight+ex_mask_width)*sizeof(float);

    PUINT8 buff_base = reinterpret_cast<PUINT8>(xy_malloc(height*fstride + (fwidth + ex_mask_width)*fstride_ver));

    float *hor_buff_base = reinterpret_cast<float*>(buff_base);
    float *hor_buff = hor_buff_base + ex_mask_width;

    // byte to float
    ASSERT( ((width+15)&~15)<=stride );
    xy_byte_2_float_sse(hor_buff, fwidth, fstride, src, width, height, stride);

    // horizontal pass
    xy_filter_sse(hor_buff, fwidth, height, fstride, gt, ex_mask_width);

    // transpose
    float *ver_buff_base = reinterpret_cast<float*>(buff_base + height*fstride);
    float *ver_buff = ver_buff_base + ex_mask_width;

    int true_width = width+r*2;
    xy_float_2_float_transpose_sse(ver_buff, fheight, fstride_ver, hor_buff-r*2, true_width, height, fstride);

    // vertical pass
    xy_filter_sse(ver_buff, fheight, true_width, fstride_ver, gt, ex_mask_width);

    // transpose
    int true_height = height + 2*r;
    xy_float_2_byte_transpose_sse(dst, true_width, dst_stride, ver_buff-r*2, true_height, true_width, fstride_ver);

    xy_free(buff_base);
    _mm_empty();
}
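/****
 * Editor's usage sketch (hypothetical, not from the original file): the
 * caller supplies a Gaussian table of at least ((2*r+1)+3)&~3 floats, with
 * the padding entries set to 0. For example, with sigma = r/3:
 *
 *   int r = 3;
 *   int gt_ex_width = ((2*r+1)+3)&~3;                    // 8
 *   float *gt = (float*)xy_malloc(gt_ex_width*sizeof(float));
 *   float sum = 0;
 *   for (int i=0;i<gt_ex_width;i++)
 *   {
 *       gt[i] = i<=2*r ? expf(-(i-r)*(i-r)/(2.0f*r*r/9)) : 0;
 *       sum += gt[i];
 *   }
 *   for (int i=0;i<gt_ex_width;i++) gt[i] /= sum;        // normalize
 *   // dst must hold (width+2*r) x (height+2*r); dst and dst_stride must be
 *   // 16-byte aligned for the final transpose
 *   xy_gaussian_blur(dst, dst_stride, src, width, height, stride,
 *                    gt, r, gt_ex_width);
 **/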