X64 transport [Part 2] (we do not(?) need _mm_empty in x64 build)
[xy_vsfilter.git] / src / subtitles / xy_filter.cpp
blob 7c4ce28d179f9ba49a9e443dd822237544e3f145
1 #include "stdafx.h"
2 #include "../dsutil/vd.h"
4 typedef const UINT8 CUINT8, *PCUINT8;
5 typedef const UINT CUINT, *PCUINT;
7 //
8 // ref: "Comparing floating point numbers" by Bruce Dawson
9 // http://www.cygnus-software.com/papers/comparingfloats/comparingfloats.htm
11 bool AlmostEqual(float A, float B, int maxUlps=0)
13 // Make sure maxUlps is non-negative and small enough that the
14 // default NAN won't compare as equal to anything.
15 ASSERT(maxUlps >= 0 && maxUlps < 4 * 1024 * 1024);
16 int aInt = *(int*)&A;
17 // Make aInt lexicographically ordered as a twos-complement int
18 if (aInt < 0)
19 aInt = 0x80000000 - aInt;
20 // Make bInt lexicographically ordered as a twos-complement int
21 int bInt = *(int*)&B;
22 if (bInt < 0)
23 bInt = 0x80000000 - bInt;
25 int intDiff = abs(aInt - bInt);
26 if (intDiff <= maxUlps)
27 return true;
29 return false;
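// Usage sketch (illustrative, not part of the original code): with the default maxUlps==0
// this degenerates to an exact bit-pattern comparison, which is how xy_filter_sse below
// uses it to detect exactly-zero trailing filter taps; something like
//     AlmostEqual(1.0f, 1.0f + FLT_EPSILON, 4)
// would return true, the two values being only 1 ulp apart.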
32 /****
33 * See @xy_filter_c
34 **/
35 __forceinline void xy_filter_one_line_c(float *dst, int width, const float *filter, int filter_width)
37 const float *filter_start = filter;
38 int xx_fix = width > filter_width ? 0 : filter_width - width;
39 float *dst2 = dst - filter_width;
40 float *dst_endr = dst + width;
41 float *dst_end0 = dst_endr - filter_width;
42 float *dst_endl = dst - xx_fix;
43 ASSERT(xx_fix==0 || dst_end0==dst_endl);
45 ASSERT(filter_start == filter);
46 filter_start += filter_width;
47 const float *filter_end = filter_start;
48 for (;dst2<dst_endl;dst2++, filter_start--)//left margin
50 const float *src = dst;
51 float sum = 0;
52 for(const float* f=filter_start;f<filter_end;f++, src++)
54 sum += src[0] * f[0];
56 *dst2 = sum;
58 for (;dst2<dst;dst2++, filter_start--, filter_end--)//if width < filter_width
60 const float *src = dst;
61 float sum = 0;
62 for(const float* f=filter_start;f<filter_end;f++, src++)
64 sum += src[0] * f[0];
66 *dst2 = sum;
68 ASSERT(filter_start==filter);
69 for (;dst2<dst_end0;dst2++)
71 const float *src = dst2;
73 float sum = 0;
74 for(const float* f=filter_start;f<filter_end;f++, src++)
76 sum += src[0] * f[0];
78 *dst2 = sum;
80 for (;dst2<dst_endr;dst2++, filter_end--)//right margin
82 const float *src = dst2;
83 float sum = 0;
84 for(const float* f=filter;f<filter_end;f++, src++)
86 sum += src[0] * f[0];
88 *dst2 = sum;
92 /****
93 * dst memory layout:
94 * 1. Source content starts from @dst;
95 * 2. Output content starts from @dst-@mwidth;
97 * |- stride -|
98 * | <- @dst-@mwidth --------| <- @dst+0 --------------------|
99 * |- -|- -|
100 * |- margin -|- src width*height items -|
101 |- mwidth*height items -|- -|
102 * |- do NOT need to init -|- -|
103 * |- when input -|- -|
104 * |---------------------------------------------------------|
106 void xy_filter_c(float *dst, int width, int height, int stride, const float *filter, int filter_width)
108 ASSERT( stride>=4*(width+filter_width) );
109 BYTE* end = reinterpret_cast<BYTE*>(dst) + height*stride;
110 BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
111 for( ; dst_byte<end; dst_byte+=stride )
113 xy_filter_one_line_c(reinterpret_cast<float*>(dst_byte), width, filter, filter_width);
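// Caller-side sketch (an illustration of the layout comment above; buff and the sizes
// are hypothetical, not code from this file):
#if 0
    int stride  = (width + filter_width) * sizeof(float);   // satisfies stride >= 4*(width+filter_width)
    float *buff = reinterpret_cast<float*>(xy_malloc(height * stride));
    float *dst  = buff + filter_width;                       // source samples of row 0 start here
    // fill dst[0..width-1] of every row (the left margin needs no init), then:
    xy_filter_c(dst, width, height, stride, filter, filter_width);
    // the filtered output of each row now starts at dst - filter_width
#endif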
117 /****
118 * an inline function sometimes generates poor code, so a macro is used here instead
120 * @src4, @src_5_8, @f4, @sum : __m128
121 * @src_5_8, @f4: const
122 * @sum : output
123 * @src4: undefined
124 **/
125 #define XY_FILTER_4(src4, src_5_8, f4, sum) \
126 __m128 f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(0,0,0,0));\
127 f4_1 = _mm_mul_ps(f4_1, src4);\
128 sum = _mm_add_ps(sum, f4_1);\
129 __m128 src_3_6 = _mm_shuffle_ps(src4, src_5_8, _MM_SHUFFLE(1,0,3,2));/*3 4 5 6*/\
130 f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(2,2,2,2));\
131 f4_1 = _mm_mul_ps(f4_1, src_3_6);\
132 sum = _mm_add_ps(sum, f4_1);\
133 src4 = _mm_shuffle_ps(src4, src_3_6, _MM_SHUFFLE(2,1,2,1));/*2 3 4 5*/\
134 f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(1,1,1,1));\
135 f4_1 = _mm_mul_ps(f4_1, src4);\
136 sum = _mm_add_ps(sum, f4_1);\
137 src_3_6 = _mm_shuffle_ps(src_3_6, src_5_8, _MM_SHUFFLE(2,1,2,1));/*4 5 6 7*/\
138 f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(3,3,3,3));\
139 f4_1 = _mm_mul_ps(f4_1, src_3_6);\
140 sum = _mm_add_ps(sum, f4_1)
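// Scalar equivalent of XY_FILTER_4 (a reference sketch, not used by the build): with
// src4 = {s0,s1,s2,s3}, src_5_8 = {s4,s5,s6,s7} and f4 = {f0,f1,f2,f3}, it accumulates
//     for (int i = 0; i < 4; i++)
//         sum[i] += s[i]*f0 + s[i+1]*f1 + s[i+2]*f2 + s[i+3]*f3;
// i.e. the contribution of 4 filter taps to 4 adjacent output samples; src4 is clobbered.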
142 /****
143 * @src4, @f4_1, @sum : __m128
144 * @f4_1: const
145 * @sum : output
146 * @src4: undefined
147 **/
148 #define XY_FILTER_4_1(src4, f4_1, sum) \
149 src4 = _mm_mul_ps(src4, f4_1);\
150 sum = _mm_add_ps(sum, src4);
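// Scalar equivalent of XY_FILTER_4_1 (sketch): for i in 0..3, sum[i] += src4[i] * f4_1[i];
// src4 is clobbered (it holds the products afterwards).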
152 __forceinline void xy_filter_one_line_sse_v6(float *dst, int width, const float *filter, int filter_width)
154 int xx_fix = width > filter_width ? 0 : filter_width - width;
155 const float *filter_start = filter;
156 float *dst2 = dst - filter_width;
157 float *dst_endr = dst + width;
158 float *dst_end0 = dst_endr - filter_width - 4;
159 float *dst_endl = dst - xx_fix;
160 ASSERT(xx_fix==0 || dst_end0==dst_endl-4);
162 ASSERT(filter_start == filter);
163 filter_start += filter_width;
164 const float *filter_end = filter_start;
166 for (;dst2<dst_endl;dst2+=4)//left margin
168 const float *src = dst;
169 filter_start -= 4;
171 //filter 4
172 __m128 src4 = _mm_setzero_ps();/*1 2 3 4*/
173 __m128 sum = _mm_setzero_ps();
174 for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
176 __m128 src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
177 __m128 f4 = _mm_load_ps(f);
179 { XY_FILTER_4(src4, src_5_8, f4, sum); }
181 src4 = src_5_8;
183 //store result
184 _mm_store_ps(dst2, sum);
186 for (;dst2<dst;dst2+=4)//if width < filter_width
188 const float *src = dst;
189 filter_start-=4;
190 filter_end-=4;
192 __m128 src4 = _mm_setzero_ps();/*1 2 3 4*/
193 __m128 sum = _mm_setzero_ps();
194 __m128 src_5_8, f4;
195 for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
197 src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
198 f4 = _mm_load_ps(f);
200 { XY_FILTER_4(src4, src_5_8, f4, sum); }
201 src4 = src_5_8;
203 src_5_8 = _mm_setzero_ps();
204 f4 = _mm_load_ps(filter_end);
205 { XY_FILTER_4(src4, src_5_8, f4, sum); }
206 //store result
207 _mm_store_ps(dst2, sum);
209 ASSERT(filter_start == filter);
210 for (;dst2<dst_end0;dst2+=8)
212 const float *src = dst2;
213 const float* f=filter_start;
215 //filter 8
216 __m128 src4 = _mm_load_ps(src);/*1 2 3 4*/
217 src+=4;
218 __m128 src_5_8;
219 __m128 sum = _mm_setzero_ps();
220 __m128 sum2 = _mm_setzero_ps();
221 __m128 f4 = _mm_load_ps(f);
222 f+=4;
223 src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
224 src+=4;
225 { XY_FILTER_4(src4, src_5_8, f4, sum); }
226 for(;f<filter_end;f+=4,src+=4)
228 src4 = _mm_load_ps(src);/*1 2 3 4*/
229 __m128 tmp = src_5_8;//important!
230 { XY_FILTER_4(tmp, src4, f4, sum2); }
232 f4 = _mm_load_ps(f);
233 { XY_FILTER_4(src_5_8, src4, f4, sum); }
234 src_5_8 = src4;
236 src4 = _mm_load_ps(src);/*1 2 3 4*/
237 { XY_FILTER_4(src_5_8, src4, f4, sum2); }
239 //store result
240 _mm_store_ps(dst2, sum);
241 _mm_store_ps(dst2+4, sum2);
243 if (dst2==dst_end0)
245 const float *src = dst2;
246 //filter 4
247 __m128 src4 = _mm_load_ps(src);//1 2 3 4
248 src+=4;
249 __m128 sum = _mm_setzero_ps();
250 for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
252 __m128 src_5_8 = _mm_load_ps(src);//5 6 7 8
253 __m128 f4 = _mm_load_ps(f);
255 { XY_FILTER_4(src4, src_5_8, f4, sum); }
256 src4 = src_5_8;
258 //store result
259 _mm_store_ps(dst2, sum);
260 dst2+=4;
262 for (;dst2<dst_endr;dst2+=4)//right margin
264 const float *src = dst2;
265 filter_end-=4;
267 //filter 4
268 __m128 src4 = _mm_load_ps(src);//1 2 3 4
269 __m128 sum = _mm_setzero_ps();
270 __m128 src_5_8, f4;
271 for(const float* f=filter_start;f<filter_end;f+=4)
273 src+=4;
274 src_5_8 = _mm_load_ps(src);//5 6 7 8
275 f4 = _mm_load_ps(f);
277 { XY_FILTER_4(src4, src_5_8, f4, sum); }
279 src4 = src_5_8;
280 //move new 4 in_n_out to old 4 in_n_out
282 src_5_8 = _mm_setzero_ps();
283 f4 = _mm_load_ps(filter_end);
284 { XY_FILTER_4(src4, src_5_8, f4, sum); }
285 //store result
286 _mm_store_ps(dst2, sum);
290 /****
291 * See @xy_filter_c
293 void xy_filter_sse_v6(float *dst, int width, int height, int stride, const float *filter, int filter_width)
295 ASSERT( stride>=4*(width+filter_width) );
296 ASSERT( ((stride|(4*width)|(4*filter_width)|reinterpret_cast<intptr_t>(dst)|reinterpret_cast<intptr_t>(filter))&15)==0 );
298 BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
299 BYTE* end = dst_byte + height*stride;
300 for( ; dst_byte<end; dst_byte+=stride )
302 xy_filter_one_line_sse_v6(reinterpret_cast<float*>(dst_byte), width, filter, filter_width);
306 /****
307 * Constrain:
308 * LENGTH%4 == 0 || LENGTH%4 == 1
310 template<int LENGTH>
311 struct M128s
313 __m128 x;
314 M128s<LENGTH - 4> next;
316 template<int Index> __forceinline __m128& GetAt()
318 return next.GetAt<Index - 4>();
320 template<> __forceinline __m128& GetAt<0>()
322 return x;
325 template<int Start, int Offset> __forceinline __m128& GetAt()
327 return GetAt<Start + Offset>();
330 __forceinline void Load(const float* src)
332 x = _mm_load_ps(src);
333 next.Load(src+4);
337 template<>
338 struct M128s<1>
340 __m128 x;
342 template<int Index> __forceinline __m128& GetAt()
344 return x;
346 __forceinline void Load(const float* src)
348 x = _mm_set1_ps(*src);
352 template<>
353 struct M128s<0>
355 void Load(const float* src)
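// Usage sketch (illustrative, not from the original code): M128s<N> stores a filter of
// N coefficients in __m128 members chosen at compile time, e.g.
//     M128s<8> coeffs;                  // holds two __m128
//     coeffs.Load(filter);              // filter: 8 aligned floats
//     __m128 f4_hi = coeffs.GetAt<4>(); // coefficients 4..7
// The constraint LENGTH%4==0 || LENGTH%4==1 above reflects the specializations: a tail
// of 1 stores a single coefficient broadcast with _mm_set1_ps.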
360 template<int FILTER_LENGTH, int START, int LENGTH>
361 struct Filter4
363 static __forceinline void do_cal(__m128& src0_128, const float * src4, M128s<FILTER_LENGTH>& filter128s, __m128& sum)
365 __m128 src4_128 = _mm_load_ps(src4);
366 { XY_FILTER_4(src0_128, src4_128, filter128s.GetAt<START>(), sum); }
367 Filter4<FILTER_LENGTH,START+4,LENGTH-4>::do_cal(src4_128, src4+4, filter128s, sum);
368 src0_128 = src4_128;
372 template<int FILTER_LENGTH, int START>
373 struct Filter4<FILTER_LENGTH, START, 1>
375 static __forceinline void do_cal(__m128& src0_128, const float * src4, M128s<FILTER_LENGTH>& filter128s, __m128& sum)
377 cal_tail<FILTER_LENGTH-START>(src0_128, src4, filter128s, sum);
379 template<int TAIL>
380 static __forceinline void cal_tail(__m128& src0_128, const float * src4, M128s<FILTER_LENGTH>& filter128s, __m128& sum);
381 template<>
382 static __forceinline void cal_tail<1>(__m128& src0_128, const float * src4, M128s<FILTER_LENGTH>& filter128s, __m128& sum)
384 { XY_FILTER_4_1(src0_128, filter128s.GetAt<FILTER_LENGTH-1>(), sum); }
388 template<int FILTER_LENGTH, int START>
389 struct Filter4<FILTER_LENGTH, START, 0>
391 static __forceinline void do_cal(__m128& src0_128, const float * src4, M128s<FILTER_LENGTH>& filter128s, __m128& sum)
396 template<int FILTER_LENGTH,int MARGIN_LENGTH>
397 struct FilterAllLeftMargin
399 static __forceinline void cal(float * src, M128s<FILTER_LENGTH>& filter128s)
401 do_cal<FILTER_LENGTH%4>(src, filter128s);
403 template<int FILTER_TAIL>
404 static __forceinline void do_cal(float * src, M128s<FILTER_LENGTH>& filter128s)
406 //filter 4
407 __m128 src0 = _mm_setzero_ps();
408 __m128 sum = _mm_setzero_ps();
409 Filter4<FILTER_LENGTH,MARGIN_LENGTH-4,FILTER_LENGTH-MARGIN_LENGTH+4>::do_cal(src0, src, filter128s, sum);
410 _mm_store_ps(src-MARGIN_LENGTH, sum);
411 FilterAllLeftMargin<FILTER_LENGTH,MARGIN_LENGTH-4>::do_cal<0>(src, filter128s);
413 template<>
414 static __forceinline void do_cal<1>(float * src, M128s<FILTER_LENGTH>& filter128s)
416 //filter 4
417 __m128 sum = _mm_setzero_ps();
418 //Only one of the last 4 filter coefficients is non-zero
419 _mm_store_ps(src-MARGIN_LENGTH, sum);
420 FilterAllLeftMargin<FILTER_LENGTH,MARGIN_LENGTH-4>::do_cal<0>(src, filter128s);
424 template<int FILTER_LENGTH, int MARGIN_LENGTH>
425 struct FilterAllRightMargin
427 static __forceinline void cal(float * src, M128s<FILTER_LENGTH>& filter128s)
429 do_cal<FILTER_LENGTH%4>(src, filter128s);
431 template<int FILTER_TAIL>
432 static __forceinline void do_cal(float * src, M128s<FILTER_LENGTH>& filter128s)
434 //filter 4
436 __m128 src0 = _mm_load_ps(src);
437 __m128 sum = _mm_setzero_ps();
438 Filter4<FILTER_LENGTH,0,MARGIN_LENGTH-4>::do_cal(src0, src+4, filter128s, sum);
439 __m128 src4 = _mm_setzero_ps();
440 { XY_FILTER_4(src0, src4, filter128s.GetAt<MARGIN_LENGTH-4>(), sum); }
441 //store result
442 _mm_store_ps(src, sum);
444 FilterAllRightMargin<FILTER_LENGTH,MARGIN_LENGTH-4>::do_cal<0>(src+4, filter128s);
446 template<>
447 static __forceinline void do_cal<1>(float * src, M128s<FILTER_LENGTH>& filter128s)
449 //filter 4
451 __m128 src0 = _mm_load_ps(src);
452 __m128 sum = _mm_setzero_ps();
453 Filter4<FILTER_LENGTH,0,MARGIN_LENGTH-4>::do_cal(src0, src+4, filter128s, sum);
454 //Only one of the last 4 filter coefficients is non-zero
455 { XY_FILTER_4_1(src0, filter128s.GetAt<MARGIN_LENGTH-4>(), sum); }
456 //store result
457 _mm_store_ps(src, sum);
459 FilterAllRightMargin<FILTER_LENGTH,MARGIN_LENGTH-4>::do_cal<0>(src+4, filter128s);
463 template<int FILTER_LENGTH>
464 struct FilterAllLeftMargin<FILTER_LENGTH,0>
466 template<int FILTER_TAIL>
467 static __forceinline void do_cal(float * src, M128s<FILTER_LENGTH>& filter128s)
472 template<int FILTER_LENGTH>
473 struct FilterAllRightMargin<FILTER_LENGTH,0>
475 template<int FILTER_TAIL>
476 static __forceinline void do_cal(float * src, M128s<FILTER_LENGTH>& filter128s)
481 /****
482 * Equivalent:
483 * xy_filter_c(float *dst, int width, int height, int stride, const float *filter, (FILTER_LENGTH+3)&~3 );
484 * See @xy_filter_c
485 * Constrain:
486 * FILTER_LENGTH<=width && width%4==0
488 template<int FILTER_LENGTH>
489 void xy_filter_sse_template(float *dst, int width, int height, int stride, const float *filter)
491 ASSERT( stride>=4*(width+FILTER_LENGTH) );
492 ASSERT( ((stride|(4*width)|reinterpret_cast<intptr_t>(dst)|reinterpret_cast<intptr_t>(filter))&15)==0 );
494 M128s<FILTER_LENGTH> filter128s;
495 filter128s.Load(filter);
497 const float *filter_start = filter;
498 BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
499 BYTE* end = dst_byte + height*stride;
500 for( ; dst_byte<end; dst_byte+=stride )
502 float *dst2 = reinterpret_cast<float*>(dst_byte);
504 //left margin
505 FilterAllLeftMargin<FILTER_LENGTH,((FILTER_LENGTH+3)&~3)>::cal(dst2, filter128s);
506 float *dst_end1 = dst2 + width;
507 float *dst_end0 = dst_end1 - ((FILTER_LENGTH+3)&~3);
508 for (;dst2<dst_end0;dst2+=4)
510 const float *src = dst2;
512 //filter 4
513 __m128 src0 = _mm_load_ps(src);/*1 2 3 4*/
514 src += 4;
515 __m128 sum = _mm_setzero_ps();
516 Filter4<FILTER_LENGTH,0,FILTER_LENGTH>::do_cal(src0, src, filter128s, sum);
517 //store result
518 _mm_store_ps(dst2, sum);
520 FilterAllRightMargin<FILTER_LENGTH,((FILTER_LENGTH+3)&~3)>::cal(dst2, filter128s);
525 /****
526 * @src4, @src_5_8, @f3_1, @f3_2, @sum: __m128
527 * @src4, @src_5_8, @f3_1, @f3_2: const
528 * @sum: output
530 #define XY_3_TAG_SYMMETRIC_FILTER_4(src4, src_5_8, f3_1, f3_2, sum) \
531 __m128 src_3_6 = _mm_shuffle_ps(src4, src_5_8, _MM_SHUFFLE(1,0,3,2));/*3 4 5 6*/\
532 __m128 src_2_5 = _mm_shuffle_ps(src4, src_3_6, _MM_SHUFFLE(2,1,2,1));/*2 3 4 5*/\
533 sum = _mm_mul_ps(f3_1, src4);\
534 __m128 mul2 = _mm_mul_ps(f3_2, src_2_5);\
535 __m128 mul3 = _mm_mul_ps(f3_1, src_3_6);\
536 sum = _mm_add_ps(sum, mul2);\
537 sum = _mm_add_ps(sum, mul3);
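// Scalar equivalent of XY_3_TAG_SYMMETRIC_FILTER_4 (sketch): with src4 = {s0..s3} and
// src_5_8 = {s4..s7}, it computes, for i in 0..3,
//     sum[i] = f3_1[i]*s[i] + f3_2[i]*s[i+1] + f3_1[i]*s[i+2];
// i.e. a 3-tap symmetric filter (the 4th tap of the generic path is zero).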
539 /****
540 * Equivalent:
541 * xy_filter_c(float *dst, int width, int height, int stride, const float *filter, 4 );
542 * See @xy_filter_c
543 * Constrain:
544 * filter[3] == 0 && filter[0] == filter[2] (symmetric) (&& sum(filter)==1)
546 void xy_3_tag_symmetric_filter_sse(float *dst, int width, int height, int stride, const float *filter)
548 const int filter_width = 4;
549 ASSERT( stride>=4*(width+filter_width) );
550 ASSERT( ((stride|(4*width)|(4*filter_width)|reinterpret_cast<intptr_t>(dst)|reinterpret_cast<intptr_t>(filter))&15)==0 );
552 ASSERT(filter_width==4 && filter[3]==0 && filter[2]==filter[0]);
554 __m128 f3_1 = _mm_set1_ps(filter[0]);
555 __m128 f3_2 = _mm_set1_ps(filter[1]);
557 BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
558 BYTE* end = dst_byte + height*stride;
559 for( ; dst_byte<end; dst_byte+=stride )
561 float *dst_f = reinterpret_cast<float*>(dst_byte);
562 float *dst2 = dst_f;
564 float *dst_end0 = dst_f + width - 4;
565 //filter 4
566 __m128 src4 = _mm_load_ps(dst2);/*1 2 3 4*/
568 __m128 sum;
569 __m128 src_4 = _mm_setzero_ps();
570 { XY_3_TAG_SYMMETRIC_FILTER_4(src_4, src4, f3_1, f3_2, sum); }
571 _mm_store_ps(dst2-4, sum);
573 for (;dst2<dst_end0;dst2+=4)
575 __m128 sum;
576 __m128 src_5_8 = _mm_load_ps(dst2+4);/*5 6 7 8*/
577 { XY_3_TAG_SYMMETRIC_FILTER_4(src4, src_5_8, f3_1, f3_2, sum); }
578 src4 = src_5_8;
579 //store result
580 _mm_store_ps(dst2, sum);
583 __m128 sum;
584 __m128 src_5_8 = _mm_setzero_ps();/*5 6 7 8*/
585 { XY_3_TAG_SYMMETRIC_FILTER_4(src4, src_5_8, f3_1, f3_2, sum); }
586 src4 = src_5_8;
587 //store result
588 _mm_store_ps(dst2, sum);
594 /****
595 * See @xy_filter_c
597 void xy_filter_sse(float *dst, int width, int height, int stride, const float *filter, int filter_width)
599 typedef void (*Filter)(float *dst, int width, int height, int stride, const float *filter);
600 const Filter filters[] = {
601 NULL,
602 xy_filter_sse_template<1>, xy_filter_sse_template<4>, xy_filter_sse_template<4>, xy_filter_sse_template<4>,
603 xy_filter_sse_template<5>, xy_filter_sse_template<8>, xy_filter_sse_template<8>, xy_filter_sse_template<8>,
604 xy_filter_sse_template<9>, xy_filter_sse_template<12>, xy_filter_sse_template<12>, xy_filter_sse_template<12>,
605 xy_filter_sse_template<13>, xy_filter_sse_template<16>, xy_filter_sse_template<16>, xy_filter_sse_template<16>,
606 xy_filter_sse_template<17>, xy_filter_sse_template<20>, xy_filter_sse_template<20>, xy_filter_sse_template<20>,
607 xy_filter_sse_template<21>, xy_filter_sse_template<24>, xy_filter_sse_template<24>, xy_filter_sse_template<24>,
608 xy_filter_sse_template<25>, xy_filter_sse_template<28>, xy_filter_sse_template<28>, xy_filter_sse_template<28>
610 if (filter_width<=28 && filter_width<=width)
612 int tmp = filter_width;
613 // Skip trailing zeros, but we cannot (and don't have to) support more than 3 trailing zeros currently.
614 while( AlmostEqual(filter[tmp-1],0.0f) && filter_width-tmp<3 )
615 tmp--;
616 if (tmp==3&&filter[0]==filter[2])
618 xy_3_tag_symmetric_filter_sse(dst, width, height, stride, filter);
620 else
622 filters[tmp](dst, width, height, stride, filter);
625 else
627 xy_filter_sse_v6(dst, width, height, stride, filter, filter_width);
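// Dispatch note (a reading of the table above, not authoritative): the index into
// filters[] is the filter width after trailing near-zero taps are trimmed, so e.g. a
// 6-tap filter is handled by xy_filter_sse_template<8>, while widths of the form 4k+1
// (5, 9, 13, ...) get an exact instantiation, matching the M128s constraint
// LENGTH%4==0 || LENGTH%4==1. The filter buffer is assumed to be readable (zero padded)
// up to the next multiple of 4 floats, which xy_gaussian_blur checks via its
// gt_ex_width_x/gt_ex_width_y arguments.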
631 /****
632 * Copy and convert src to dst line by line.
633 * @dst_width MUST >= @width
634 * if @dst_width>@width, the extra elements will be filled with 0.
636 void xy_byte_2_float_c(float *dst, int dst_width, int dst_stride,
637 PCUINT8 src, int width, int height, int stride)
639 PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
641 PCUINT8 src_end = src + height*stride;
642 for( ; src<src_end; src+=stride, dst_byte+=dst_stride )
644 PCUINT8 src2 = src;
645 PCUINT8 src_end = src2 + width;
646 float *dst2 = reinterpret_cast<float*>(dst_byte);
648 for (;src2<src_end;src2++, dst2++)
650 *dst2 = *src2;
652 float *dst2_end=reinterpret_cast<float*>(dst_byte)+dst_width;
653 for (;dst2<dst2_end;dst2++)
655 *dst2=0;
660 /****
661 * See @xy_byte_2_float_c
663 void xy_byte_2_float_sse(float *dst, int dst_width, int dst_stride,
664 PCUINT8 src, int width, int height, int stride)
666 ASSERT( dst_width>=width );
667 ASSERT( ((reinterpret_cast<intptr_t>(dst)|dst_stride)&15)==0 );
668 PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
670 PCUINT8 src_end = src + height*stride;
671 for( ; src<src_end; src+=stride, dst_byte+=dst_stride )
673 PCUINT8 src2 = src;
674 PCUINT8 src2_end0 = src2 + (width&~15);
675 float *dst2 = reinterpret_cast<float*>(dst_byte);
677 for (;src2<src2_end0;src2+=16, dst2+=16)
679 //convert 16 bytes to 16 floats
680 __m128i src16 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src2));
681 __m128i src16_lo = _mm_unpacklo_epi8(src16, _mm_setzero_si128());
682 __m128i src16_hi = _mm_unpackhi_epi8(src16, _mm_setzero_si128());
683 __m128i src16_lo_lo = _mm_unpacklo_epi8(src16_lo, _mm_setzero_si128());
684 __m128i src16_lo_hi = _mm_unpackhi_epi8(src16_lo, _mm_setzero_si128());
685 __m128i src16_hi_lo = _mm_unpacklo_epi8(src16_hi, _mm_setzero_si128());
686 __m128i src16_hi_hi = _mm_unpackhi_epi8(src16_hi, _mm_setzero_si128());
687 __m128 dst_f1 = _mm_cvtepi32_ps(src16_lo_lo);
688 __m128 dst_f2 = _mm_cvtepi32_ps(src16_lo_hi);
689 __m128 dst_f3 = _mm_cvtepi32_ps(src16_hi_lo);
690 __m128 dst_f4 = _mm_cvtepi32_ps(src16_hi_hi);
691 _mm_store_ps(dst2, dst_f1);
692 _mm_store_ps(dst2+4, dst_f2);
693 _mm_store_ps(dst2+8, dst_f3);
694 _mm_store_ps(dst2+12, dst_f4);
696 PCUINT8 src2_end = src + width;
697 for (;src2<src2_end;src2++,dst2++)
699 *dst2 = *src2;
701 float *dst2_end=reinterpret_cast<float*>(dst_byte)+dst_width;
702 for (;dst2<dst2_end;dst2++)
704 *dst2=0;
709 /****
710 * Copy transposed Matrix src to dst.
711 * @dst_width MUST >= @height.
712 * if @dst_width > @height, the extra elements will be filled with 0.
714 void xy_float_2_float_transpose_c(float *dst, int dst_width, int dst_stride,
715 const float *src, int width, int height, int src_stride)
717 ASSERT(dst_width >= height);
718 PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
719 const float* src_end = src + width;
720 PCUINT8 src2_end = reinterpret_cast<PCUINT8>(src) + height*src_stride;
721 for( ; src<src_end; src++, dst_byte+=dst_stride )
723 PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);
725 float *dst2 = reinterpret_cast<float*>(dst_byte);
726 for (;src2<src2_end;src2+=src_stride,dst2++)
728 *dst2 = *reinterpret_cast<const float*>(src2);
730 float *dst2_end = reinterpret_cast<float*>(dst_byte) + dst_width;
731 for (;dst2<dst2_end;dst2++)
733 *dst2 = 0;
738 /****
739 * see @xy_float_2_float_transpose_c
741 void xy_float_2_float_transpose_sse(float *dst, int dst_width, int dst_stride,
742 const float *src, int width, int height, int src_stride)
744 typedef float DstT;
745 typedef const float SrcT;
747 ASSERT( (((intptr_t)dst|dst_stride)&15)==0 );
748 ASSERT(dst_width >= height);
749 PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
750 SrcT* src_end = src + width;
751 PCUINT8 src2_end1 = reinterpret_cast<PCUINT8>(src) + (height&~3)*src_stride;
752 PCUINT8 src2_end2 = reinterpret_cast<PCUINT8>(src) + height*src_stride;
753 for( ; src<src_end; src++, dst_byte+=dst_stride )
755 PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);
757 DstT *dst2 = reinterpret_cast<DstT*>(dst_byte);
758 for (;src2<src2_end1;src2+=4*src_stride,dst2+=4)
760 __m128 m1 = _mm_set_ps(
761 *(SrcT*)(src2+3*src_stride),
762 *(SrcT*)(src2+2*src_stride),
763 *(SrcT*)(src2+src_stride),
764 *(SrcT*)(src2));
765 _mm_store_ps(dst2, m1);
767 for (;src2<src2_end2;src2+=src_stride,dst2++)
769 *dst2 = *reinterpret_cast<SrcT*>(src2);
771 float *dst2_end = reinterpret_cast<DstT*>(dst_byte) + dst_width;
772 for (;dst2<dst2_end;dst2++)
774 *dst2 = 0;
779 /****
780 * Transpose and round Matrix src, then copy to dst.
781 * @dst_width MUST >= @height.
782 * if @dst_width > @height, the extra elements will be filled with 0.
784 void xy_float_2_byte_transpose_c(UINT8 *dst, int dst_width, int dst_stride,
785 const float *src, int width, int height, int src_stride)
787 ASSERT(dst_width >= height);
788 PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
789 const float* src_end = src + width;
790 PCUINT8 src2_end = reinterpret_cast<PCUINT8>(src) + height*src_stride;
791 for( ; src<src_end; src++, dst_byte+=dst_stride )
793 PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);
795 UINT8 *dst2 = reinterpret_cast<UINT8*>(dst_byte);
796 for (;src2<src2_end;src2+=src_stride,dst2++)
798 *dst2 = static_cast<UINT8>(*reinterpret_cast<const float*>(src2)+0.5);
800 UINT8 *dst2_end = reinterpret_cast<UINT8*>(dst_byte) + dst_width;
801 for (;dst2<dst2_end;dst2++)
803 *dst2 = 0;
808 void xy_float_2_byte_transpose_sse(UINT8 *dst, int dst_width, int dst_stride,
809 const float *src, int width, int height, int src_stride)
811 typedef UINT8 DstT;
812 typedef const float SrcT;
814 ASSERT(dst_width >= height);
815 ASSERT((((intptr_t)dst|dst_stride)&15)==0);
816 PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
817 SrcT* src_end = src + width;
818 PCUINT8 src2_end00 = reinterpret_cast<PCUINT8>(src) + (height&~15)*src_stride;
819 PCUINT8 src2_end = reinterpret_cast<PCUINT8>(src) + height*src_stride;
820 for( ; src<src_end; src++, dst_byte+=dst_stride )
822 PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);
824 DstT *dst2 = reinterpret_cast<DstT*>(dst_byte);
825 for (;src2<src2_end00;src2+=16*src_stride,dst2+=16)
827 __m128 m1 = _mm_set_ps(
828 *(SrcT*)(src2+3*src_stride),
829 *(SrcT*)(src2+2*src_stride),
830 *(SrcT*)(src2+src_stride),
831 *(SrcT*)(src2));
832 __m128 m2 = _mm_set_ps(
833 *(SrcT*)(src2+7*src_stride),
834 *(SrcT*)(src2+6*src_stride),
835 *(SrcT*)(src2+5*src_stride),
836 *(SrcT*)(src2+4*src_stride));
837 __m128 m3 = _mm_set_ps(
838 *(SrcT*)(src2+11*src_stride),
839 *(SrcT*)(src2+10*src_stride),
840 *(SrcT*)(src2+9*src_stride),
841 *(SrcT*)(src2+8*src_stride));
842 __m128 m4 = _mm_set_ps(
843 *(SrcT*)(src2+15*src_stride),
844 *(SrcT*)(src2+14*src_stride),
845 *(SrcT*)(src2+13*src_stride),
846 *(SrcT*)(src2+12*src_stride));
848 __m128i i1 = _mm_cvtps_epi32(m1);
849 __m128i i2 = _mm_cvtps_epi32(m2);
850 __m128i i3 = _mm_cvtps_epi32(m3);
851 __m128i i4 = _mm_cvtps_epi32(m4);
853 i1 = _mm_packs_epi32(i1,i2);
854 i3 = _mm_packs_epi32(i3,i4);
855 i1 = _mm_packus_epi16(i1,i3);
857 _mm_store_si128((__m128i*)dst2, i1);
859 for (;src2<src2_end;src2+=src_stride,dst2++)
861 *dst2 = static_cast<DstT>(*reinterpret_cast<SrcT*>(src2)+0.5);
863 DstT *dst2_end = reinterpret_cast<DstT*>(dst_byte) + dst_width;
864 for (;dst2<dst2_end;dst2++)
866 *dst2 = 0;
871 /****
872 * To Do: decent CPU capability check
874 void xy_gaussian_blur(PUINT8 dst, int dst_stride,
875 PCUINT8 src, int width, int height, int stride,
876 const float *gt_x, int r_x, int gt_ex_width_x,
877 const float *gt_y, int r_y, int gt_ex_width_y)
879 ASSERT(width<=stride && width+2*r_x<=dst_stride);
880 int ex_mask_width_x = ((r_x*2+1)+3)&~3;
881 ASSERT(ex_mask_width_x<=gt_ex_width_x);
882 if (ex_mask_width_x>gt_ex_width_x)
884 int o=0;
885 o=o/o;
886 exit(-1);
889 int ex_mask_width_y = ((r_y*2+1)+3)&~3;
890 ASSERT(ex_mask_width_y<=gt_ex_width_y);
891 if (ex_mask_width_y>gt_ex_width_y)
893 int o=0;
894 o=o/o;
895 exit(-1);
899 int fwidth = (width+3)&~3;
900 int fstride = (fwidth + ex_mask_width_x)*sizeof(float);
901 int fheight = (height+3)&~3;
902 int fstride_ver = (fheight+ex_mask_width_y)*sizeof(float);
904 PUINT8 buff_base = reinterpret_cast<PUINT8>(xy_malloc(height*fstride + (fwidth + ex_mask_width_x)*fstride_ver));
906 float *hor_buff_base = reinterpret_cast<float*>(buff_base);
907 float *hor_buff = hor_buff_base + ex_mask_width_x;
909 // byte to float
910 ASSERT( ((width+15)&~15)<=stride );
911 xy_byte_2_float_sse(hor_buff, fwidth, fstride, src, width, height, stride);
913 // horizontal pass
914 xy_filter_sse(hor_buff, fwidth, height, fstride, gt_x, ex_mask_width_x);
917 // transpose
918 float *ver_buff_base = reinterpret_cast<float*>(buff_base + height*fstride);
919 float *ver_buff = ver_buff_base + ex_mask_width_y;
921 int true_width = width+r_x*2;
922 xy_float_2_float_transpose_sse(ver_buff, fheight, fstride_ver, hor_buff-r_x*2, true_width, height, fstride);
924 // vertical pass
925 xy_filter_sse(ver_buff, fheight, true_width, fstride_ver, gt_y, ex_mask_width_y);
927 // transpose
928 int true_height = height + 2*r_y;
929 xy_float_2_byte_transpose_sse(dst, true_width, dst_stride, ver_buff-r_y*2, true_height, true_width, fstride_ver);
931 xy_free(buff_base);
932 #ifndef _WIN64
933 // TODOX64 : fixme!
934 _mm_empty();
935 #endif
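// Size note (derived from the transposes above; stated as a sketch): the blurred output
// written to @dst covers (width + 2*r_x) x (height + 2*r_y) bytes, which is why the entry
// ASSERT requires width + 2*r_x <= dst_stride; the caller must also provide at least
// height + 2*r_y rows in @dst.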
939 enum RoundingPolicy
941 ROUND_DOWN
942 , ROUND_HALF_DOWN
943 , ROUND_HALF_UP
944 , ROUND_HALF_TO_EVEN
945 , COUNT_ROUND_POLICY
948 template<int ROUNDING_POLICY, int precision>
949 struct XyRounding
951 __forceinline void init_sse();
952 __forceinline __m128i round(__m128i in);
953 __forceinline int round(int in);
956 template<int precision>
957 struct XyRounding<ROUND_DOWN, precision>
959 __forceinline void init_sse()
963 __forceinline __m128i round(__m128i in)
965 return in;
968 __forceinline int round(unsigned in)
970 return in;
975 template<int precision>
976 struct XyRounding<ROUND_HALF_DOWN, precision>
978 __forceinline void init_sse()
980 m_rounding_patch = _mm_set1_epi16( (1<<(precision-1))-1 );
982 __forceinline __m128i round(__m128i in)
984 return _mm_adds_epu16(in, m_rounding_patch);
987 __forceinline int round(unsigned in)
989 return in + ((1<<(precision-1))-1);
991 __m128i m_rounding_patch;
995 template<int precision>
996 struct XyRounding<ROUND_HALF_UP, precision>
998 __forceinline void init_sse()
1000 m_rounding_patch = _mm_set1_epi16( 1<<(precision-1) );
1002 __forceinline __m128i round(__m128i in)
1004 return _mm_adds_epu16(in, m_rounding_patch);
1007 __forceinline int round(unsigned in)
1009 return in + (1<<(precision-1));
1011 __m128i m_rounding_patch;
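// Worked example (sketch): with precision==2 (as used by xy_be_filter_*), the fixed-point
// value 6 means 1.5; ROUND_HALF_DOWN gives (6 + 1) >> 2 == 1, ROUND_HALF_UP gives
// (6 + 2) >> 2 == 2.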
1015 template<int precision>
1016 struct XyRounding<ROUND_HALF_TO_EVEN, precision>
1018 __forceinline void init_sse()
1020 m_rounding_patch = _mm_set1_epi16( 1<<(precision-1) );
1022 __forceinline __m128i round(__m128i in)
1024 in = _mm_adds_epu16(in, m_rounding_patch);
1025 __m128i tmp = _mm_slli_epi16(in, 15-precision);
1026 tmp = _mm_srli_epi16(tmp, 15);
1027 return _mm_adds_epu16(in, tmp);
1030 __forceinline int round(unsigned in)
1032 return in + (1<<(precision-1)) + ((in>>precision)&1);
1034 __m128i m_rounding_patch;
1037 /****
1038 * filter with [1,2,1]
1039 * 1. It is an in-place horizontal filter
1040 * 2. Boundary Pixels are filtered by padding 0. E.g.
1041 * dst[0] = (0*1 + dst[0]*2 + dst[1]*1)/4;
1043 template<int ROUNDING_POLICY>
1044 void xy_be_filter_c(PUINT8 dst, int width, int height, int stride)
1046 ASSERT(width>=1);
1047 if (width<=0)
1049 return;
1051 PUINT8 dst2 = NULL;
1052 XyRounding<ROUNDING_POLICY, 2> xy_rounding;
1053 for (int y=0;y<height;y++)
1055 dst2 = dst + y*stride;
1056 int old_sum = dst2[0];
1057 int tmp = 0;
1058 int x=0;
1059 for (x=0;x<width-1;x++)
1061 int new_sum = dst2[x]+dst2[x+1];
1062 tmp = old_sum + new_sum;//old_sum == src2[x-1]+src2[x];
1063 dst2[x] = (xy_rounding.round(tmp)>>2);
1064 old_sum = new_sum;
1066 tmp = old_sum + dst2[x];
1067 dst2[x] = (xy_rounding.round(tmp)>>2);
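// Interior-pixel formula implemented by the loop above (a restatement, for reference):
//     dst[x] = round((old_dst[x-1] + 2*old_dst[x] + old_dst[x+1]) / 4)
// where old_dst[] are the values before this in-place pass and round() follows ROUNDING_POLICY.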
1071 /****
1072 * 1. It is an in-place symmetric 3-tap horizontal filter
1073 * 2. Boundary Pixels are filtered by padding 0. E.g.
1074 * dst[0] = (0*1 + dst[0]*2 + dst[1]*1)/4;
1075 * 3. sum(filter) == 256
1077 template<int ROUNDING_POLICY>
1078 void xy_be_filter2_c(PUINT8 dst, int width, int height, int stride, PCUINT filter)
1080 ASSERT(width>=1);
1081 if (width<=0)
1083 return;
1086 const int VOLUME_BITS = 8;
1087 const int VOLUME = (1<<VOLUME_BITS);
1088 if (filter[0]==0)
1090 return;//nothing to do;
1092 else if (filter[0]== (VOLUME>>2))
1094 return xy_be_filter_c<ROUNDING_POLICY>(dst, width, height, stride);
1097 PUINT8 dst2 = NULL;
1098 XyRounding<ROUNDING_POLICY, VOLUME_BITS> xy_rounding;
1099 for (int y=0;y<height;y++)
1101 dst2 = dst + y*stride;
1102 int old_pix = 0;
1103 int tmp = 0;
1104 int x=0;
1105 for (x=0;x<width-1;x++)
1107 tmp = (old_pix + dst2[x+1]) * filter[0] + dst2[x] * filter[1];
1108 old_pix = dst2[x];
1109 dst2[x] = (xy_rounding.round(tmp)>>VOLUME_BITS);
1111 tmp = old_pix * filter[0] + dst2[x] * filter[1];
1112 dst2[x] = (xy_rounding.round(tmp)>>VOLUME_BITS);
1116 /****
1117 * See @xy_be_filter_c
1118 * No alignment requirement.
1120 template<int ROUNDING_POLICY>
1121 void xy_be_filter_sse(PUINT8 dst, int width, int height, int stride)
1123 ASSERT(width>=1);
1124 if (width<=0)
1126 return;
1128 int width_mod8 = ((width-1)&~7);
1129 XyRounding<ROUNDING_POLICY, 2> xy_rounding;
1130 xy_rounding.init_sse();
1131 for (int y = 0; y < height; y++) {
1132 PUINT8 dst2=dst+y*stride;
1134 __m128i old_pix_128 = _mm_cvtsi32_si128(dst2[0]);
1135 __m128i old_sum_128 = old_pix_128;
1137 int x = 0;
1138 for (; x < width_mod8; x+=8) {
1139 __m128i new_pix = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(dst2+x+1));
1140 new_pix = _mm_unpacklo_epi8(new_pix, _mm_setzero_si128());
1141 __m128i temp = _mm_slli_si128(new_pix,2);
1142 temp = _mm_add_epi16(temp, old_pix_128);
1143 temp = _mm_add_epi16(temp, new_pix);
1144 old_pix_128 = _mm_srli_si128(new_pix,14);
1146 new_pix = _mm_slli_si128(temp,2);
1147 new_pix = _mm_add_epi16(new_pix, old_sum_128);
1148 new_pix = _mm_add_epi16(new_pix, temp);
1149 old_sum_128 = _mm_srli_si128(temp, 14);
1151 new_pix = xy_rounding.round(new_pix);
1153 new_pix = _mm_srli_epi16(new_pix, 2);
1154 new_pix = _mm_packus_epi16(new_pix, new_pix);
1156 _mm_storel_epi64( reinterpret_cast<__m128i*>(dst2+x), new_pix );
1158 int old_sum = _mm_cvtsi128_si32(old_sum_128);
1159 old_sum &= 0xffff;
1160 int tmp = 0;
1161 for ( ; x < width-1; x++) {
1162 int new_sum = dst2[x] + dst2[x+1];
1163 tmp = old_sum + new_sum;
1164 dst2[x] = (xy_rounding.round(tmp)>>2);
1165 old_sum = new_sum;
1167 tmp = old_sum + dst2[x];
1168 dst2[x] = (xy_rounding.round(tmp)>>2);
1172 /****
1173 * See @xy_be_filter2_c
1174 * No alignment requirement.
1176 template<int ROUNDING_POLICY>
1177 void xy_be_filter2_sse(PUINT8 dst, int width, int height, int stride, PCUINT filter)
1179 const int VOLUME_BITS = 8;
1180 const int VOLUME = (1<<VOLUME_BITS);
1181 ASSERT(filter[0]==filter[2]);
1182 ASSERT(filter[0]+filter[1]+filter[2]==VOLUME);
1183 ASSERT(width>=1);
1184 if (width<=0)
1186 return;
1189 XyRounding<ROUNDING_POLICY, VOLUME_BITS> xy_rounding;
1190 xy_rounding.init_sse();
1191 __m128i f3_1 = _mm_set1_epi16(filter[0]);
1192 __m128i f3_2 = _mm_set1_epi16(filter[1]);
1194 int width_mod8 = ((width-1)&~7);
1195 //__m128i round = _mm_set1_epi16(8);
1197 PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
1198 PUINT8 end = dst_byte + height*stride;
1199 for( ; dst_byte<end; dst_byte+=stride )
1201 PUINT8 dst2 = dst_byte;
1203 PUINT8 dst_end0 = dst_byte + width - 4;
1205 __m128i old_pix1_128 = _mm_setzero_si128();
1206 __m128i old_pix2_128 = _mm_cvtsi32_si128(dst2[0]);
1208 int x = 0;
1209 for (; x < width_mod8; x+=8) {
1210 __m128i pix2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(dst2+x+1));
1211 pix2 = _mm_unpacklo_epi8(pix2, _mm_setzero_si128());
1212 __m128i pix1 = _mm_slli_si128(pix2,2);
1213 pix1 = _mm_add_epi8(pix1, old_pix2_128);
1214 __m128i pix0 = _mm_slli_si128(pix1,2);
1215 pix0 = _mm_add_epi8(pix0, old_pix1_128);
1216 old_pix1_128 = _mm_srli_si128(pix1,14);
1217 old_pix2_128 = _mm_srli_si128(pix2,14);
1219 pix0 = _mm_add_epi16(pix0, pix2);
1220 pix0 = _mm_mullo_epi16(pix0, f3_1);
1221 pix1 = _mm_mullo_epi16(pix1, f3_2);
1223 pix1 = _mm_adds_epu16(pix1, pix0);
1225 pix1 = xy_rounding.round(pix1);
1227 pix1 = _mm_srli_epi16(pix1, VOLUME_BITS);
1228 pix1 = _mm_packus_epi16(pix1, pix1);
1230 _mm_storel_epi64( reinterpret_cast<__m128i*>(dst2+x), pix1 );
1232 int old_pix1 = _mm_cvtsi128_si32(old_pix1_128);
1233 old_pix1 &= 0xff;
1234 int tmp = 0;
1235 for ( ; x < width-1; x++) {
1236 tmp = (old_pix1 + dst2[x+1]) * filter[0] + dst2[x] * filter[1];
1237 old_pix1 = dst2[x];
1239 dst2[x] = (xy_rounding.round(tmp)>>VOLUME_BITS);
1241 tmp = old_pix1*filter[0] + dst2[x]*filter[1];
1242 dst2[x] = (xy_rounding.round(tmp)>>VOLUME_BITS);
1246 /****
1247 * See @xy_be_blur
1248 * Construct the filter used in the final horizontal/vertical pass of @xy_be_blur when @pass is NOT an integer.
1249 * This filter is constructed to satisfy:
1250 * If @p is a pixel in the middle of the image, pixels @(p-1) and @(p+1) lie just at the left and the right
1251 * of @p respectively. The value of @p after all horizontal filtering equals
1252 * a*value_old(@(p-1)) + b*value_old(@p) + a*value_old(@(p+1)) + other pixels' weighted sum,
1253 * then
1254 * a/b = @pass/(@pass+1).
1255 * It makes sense because the above property also holds when @pass is an integer.
1257 * @return
1258 * Let n = floor(pass);
1259 * filter = [ (pass-n)(n+2) / (2*(1+3pass-n)), 1-(pass-n)(n+2)/(1+3pass-n), (pass-n)(n+2)/ (2*(1+3pass-n)) ]
1261 void xy_calculate_filter(float pass, PUINT filter)
1263 const int VOLUME = (1<<8);
1264 int n = (int)pass;
1265 if (n==0)
1267 filter[0] = VOLUME * pass/(1+3*pass);
1269 else if (n==1)
1271 filter[0] = VOLUME * (pass-1)/(2*pass);
1273 else
1275 filter[0] = VOLUME * (pass-n)*(n+2)/ (2*(1+3*pass-n));
1277 filter[1] = VOLUME - 2*filter[0];
1278 filter[2] = filter[0];
1280 if (2*filter[0]>filter[1])
1282 //this should not happen
1283 ASSERT(0);
1284 filter[0] = VOLUME/4;
1285 filter[1] = VOLUME - 2*filter[0];
1286 filter[2] = filter[0];
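// Worked example for xy_calculate_filter (a sketch; values truncated to UINT as in the code):
//     pass = 1.5  =>  n = 1,  filter[0] = 256*(1.5-1)/(2*1.5) = 42,  filter = {42, 172, 42}
//     pass = 0.5  =>  n = 0,  filter[0] = 256*0.5/(1+1.5)     = 51,  filter = {51, 154, 51}
// In both cases 2*filter[0] <= filter[1] and the entries sum to 256 (VOLUME).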
1290 /****
1291 * See @xy_float_2_byte_transpose_c
1293 void xy_byte_2_byte_transpose_c(UINT8 *dst, int dst_width, int dst_stride,
1294 PCUINT8 src, int width, int height, int src_stride)
1296 ASSERT(dst_width >= height);
1297 PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
1298 PCUINT8 src_end = src + width;
1299 PCUINT8 src2_end = reinterpret_cast<PCUINT8>(src) + height*src_stride;
1300 for( ; src<src_end; src++, dst_byte+=dst_stride )
1302 PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);
1304 UINT8 *dst2 = reinterpret_cast<UINT8*>(dst_byte);
1305 for (;src2<src2_end;src2+=src_stride,dst2++)
1307 *dst2 = *src2;
1309 UINT8 *dst2_end = reinterpret_cast<UINT8*>(dst_byte) + dst_width;
1310 for (;dst2<dst2_end;dst2++)
1312 *dst2 = 0;
1317 /****
1318 * Repeat the [1,2,1] filter @pass_x times horizontally and @pass_y times vertically
1319 * Boundary Pixels are filtered by padding 0, see @xy_be_filter_c.
1321 * @pass_x:
1322 * When @pass_x is not an integer, the horizontal filter [1,2,1] is repeated (int)@pass_x times,
1323 * and a 3-tap symmetric filter generated from @pass_x is applied.
1324 * The final 3-tap symmetric filter is constructed to satisfy the following property:
1325 * If @pass_xx > @pass_x, the filtered result of @pass_x should NOT be more blurred than
1326 * the result of @pass_xx. More specifically, the filtered result of @pass_x should NOT be more
1327 * blurred than that of (int)@pass_x+1 and should NOT be less blurred than that of (int)@pass_x;
1329 * Rounding:
1330 * The original VSFilter \be effect uses a simple round down, which has a bias of -7.5/16 per pass.
1331 * That rounding error is really huge with big @pass values. It has become part of the \be effect.
1332 * We can simulate VSFilter's rounding bias by combining different rounding methods. However, a simple
1333 * test shows that the result still has some visual difference from VSFilter's.
1335 * Known issue:
1336 * It seems that this separable filter implementation is more sensitive to precision than
1337 * VSFilter's simple implementation. Vertical blocky artifacts can be observed with big pass values
1338 * (and @pass_x==@pass_y). So it is only used for filtering the fractional part of the \be strength.
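// Example (illustrative): pass_x = 2.7 runs the integer [1,2,1] pass twice, then one
// extra pass with the 3-tap filter from xy_calculate_filter(2.7, f); pass_x = 3.0 would
// simply run the [1,2,1] pass three times with no fractional pass.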
1340 void xy_be_blur(PUINT8 src, int width, int height, int stride, float pass_x, float pass_y)
1342 //ASSERT(pass_x>0 && pass_y>0);
1344 typedef void (*XyBeFilter)(PUINT8 src, int width, int height, int stride);
1345 typedef void (*XyFilter2)(PUINT8 src, int width, int height, int stride, PCUINT filter);
1347 XyBeFilter filter = (g_cpuid.m_flags & CCpuID::sse2) ? xy_be_filter_sse<ROUND_HALF_TO_EVEN> : xy_be_filter_c<ROUND_HALF_TO_EVEN>;
1348 XyFilter2 filter2 = (g_cpuid.m_flags & CCpuID::sse2) ? xy_be_filter2_sse<ROUND_HALF_TO_EVEN> : xy_be_filter2_c<ROUND_HALF_TO_EVEN>;
1350 int stride_ver = height;
1351 PUINT8 tmp = reinterpret_cast<PUINT8>(xy_malloc(width*height));
1352 ASSERT(tmp);
1353 // horizontal pass
1354 int pass_x_int = static_cast<int>(pass_x);
1355 for (int i=0; i<pass_x_int; i++)
1357 filter(src, width, height, stride);
1359 if (pass_x-pass_x_int>0)
1361 UINT f[3] = {0};
1362 xy_calculate_filter(pass_x, f);
1363 filter2(src, width, height, stride, f);
1366 // transpose
1367 xy_byte_2_byte_transpose_c(tmp, height, stride_ver, src, width, height, stride);
1369 // vertical pass
1370 int pass_y_int = static_cast<int>(pass_y);
1371 for (int i=0;i<pass_y_int;i++)
1373 filter(tmp, height, width, stride_ver);
1375 if (pass_y-pass_y_int>0)
1377 UINT f[3] = {0};
1378 xy_calculate_filter(pass_y, f);
1379 filter2(tmp, height, width, stride_ver, f);
1382 // transpose
1383 xy_byte_2_byte_transpose_c(src, width, stride, tmp, height, width, stride_ver);
1385 xy_free(tmp);
1386 return;