X64 transport [Part 2] (we do not(?) need _mm_empty in x64 build)
[xy_vsfilter.git] / src / subtitles / xy_filter.cpp
blob 7c4ce28d179f9ba49a9e443dd822237544e3f145
1 #include "stdafx.h"
2 #include "../dsutil/vd.h"
4 typedef const UINT8 CUINT8, *PCUINT8;
5 typedef const UINT CUINT, *PCUINT;
7 //
8 // ref: "Comparing floating point numbers" by Bruce Dawson
9 // http://www.cygnus-software.com/papers/comparingfloats/comparingfloats.htm
11 bool AlmostEqual(float A, float B, int maxUlps=0)
13 // Make sure maxUlps is non-negative and small enough that the
14 // default NAN won't compare as equal to anything.
15 ASSERT(maxUlps >= 0 && maxUlps < 4 * 1024 * 1024);
16 int aInt = *(int*)&A;
17 // Make aInt lexicographically ordered as a twos-complement int
18 if (aInt < 0)
19 aInt = 0x80000000 - aInt;
20 // Make bInt lexicographically ordered as a twos-complement int
21 int bInt = *(int*)&B;
22 if (bInt < 0)
23 bInt = 0x80000000 - bInt;
25 int intDiff = abs(aInt - bInt);
26 if (intDiff <= maxUlps)
27 return true;
29 return false;
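// Usage sketch (illustrative, not part of the original code): with the default maxUlps==0
// this degenerates to an exact bit-pattern comparison, which is how xy_filter_sse below
// uses it to detect exactly-zero trailing filter taps; something like
//     AlmostEqual(1.0f, 1.0f + FLT_EPSILON, 4)
// would return true, the two values being only 1 ulp apart.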
32 /****
33 * See @xy_filter_c
34 **/
35 __forceinline void xy_filter_one_line_c(float *dst, int width, const float *filter, int filter_width)
37 const float *filter_start = filter;
38 int xx_fix = width > filter_width ? 0 : filter_width - width;
39 float *dst2 = dst - filter_width;
40 float *dst_endr = dst + width;
41 float *dst_end0 = dst_endr - filter_width;
42 float *dst_endl = dst - xx_fix;
43 ASSERT(xx_fix==0 || dst_end0==dst_endl);
45 ASSERT(filter_start == filter);
46 filter_start += filter_width;
47 const float *filter_end = filter_start;
48 for (;dst2<dst_endl;dst2++, filter_start--)//left margin
50 const float *src = dst;
51 float sum = 0;
52 for(const float* f=filter_start;f<filter_end;f++, src++)
54 sum += src[0] * f[0];
56 *dst2 = sum;
58 for (;dst2<dst;dst2++, filter_start--, filter_end--)//if width < filter_width
60 const float *src = dst;
61 float sum = 0;
62 for(const float* f=filter_start;f<filter_end;f++, src++)
64 sum += src[0] * f[0];
66 *dst2 = sum;
68 ASSERT(filter_start==filter);
69 for (;dst2<dst_end0;dst2++)
71 const float *src = dst2;
73 float sum = 0;
74 for(const float* f=filter_start;f<filter_end;f++, src++)
76 sum += src[0] * f[0];
78 *dst2 = sum;
80 for (;dst2<dst_endr;dst2++, filter_end--)//right margin
82 const float *src = dst2;
83 float sum = 0;
84 for(const float* f=filter;f<filter_end;f++, src++)
86 sum += src[0] * f[0];
88 *dst2 = sum;
92 /****
93 * dst memory layout:
94 * 1. Source content starts from @dst;
95 * 2. Output content starts from @dst-@mwidth;
97 * |- stride -|
98 * | <- @dst-@mwidth --------| <- @dst+0 --------------------|
99 * |- -|- -|
100 * |- margin -|- src width*height items -|
101 |- mwidth*height items -|- -|
102 * |- do NOT need to init -|- -|
103 * |- when input -|- -|
104 * |---------------------------------------------------------|
106 void xy_filter_c(float *dst, int width, int height, int stride, const float *filter, int filter_width)
108 ASSERT( stride>=4*(width+filter_width) );
109 BYTE* end = reinterpret_cast<BYTE*>(dst) + height*stride;
110 BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
111 for( ; dst_byte<end; dst_byte+=stride )
113 xy_filter_one_line_c(reinterpret_cast<float*>(dst_byte), width, filter, filter_width);
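// Caller-side sketch (an illustration of the layout comment above; buff and the sizes
// are hypothetical, not code from this file):
#if 0
    int stride  = (width + filter_width) * sizeof(float);   // satisfies stride >= 4*(width+filter_width)
    float *buff = reinterpret_cast<float*>(xy_malloc(height * stride));
    float *dst  = buff + filter_width;                       // source samples of row 0 start here
    // fill dst[0..width-1] of every row (the left margin needs no init), then:
    xy_filter_c(dst, width, height, stride, filter, filter_width);
    // the filtered output of each row now starts at dst - filter_width
#endif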
117 /****
118 * an inline function sometimes generates poor code, so a macro is used here instead
120 * @src4, @src_5_8, @f4, @sum : __m128
121 * @src_5_8, @f4: const
122 * @sum : output
123 * @src4: undefined
124 **/
125 #define XY_FILTER_4(src4, src_5_8, f4, sum) \
126 __m128 f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(0,0,0,0));\
127 f4_1 = _mm_mul_ps(f4_1, src4);\
128 sum = _mm_add_ps(sum, f4_1);\
129 __m128 src_3_6 = _mm_shuffle_ps(src4, src_5_8, _MM_SHUFFLE(1,0,3,2));/*3 4 5 6*/\
130 f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(2,2,2,2));\
131 f4_1 = _mm_mul_ps(f4_1, src_3_6);\
132 sum = _mm_add_ps(sum, f4_1);\
133 src4 = _mm_shuffle_ps(src4, src_3_6, _MM_SHUFFLE(2,1,2,1));/*2 3 4 5*/\
134 f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(1,1,1,1));\
135 f4_1 = _mm_mul_ps(f4_1, src4);\
136 sum = _mm_add_ps(sum, f4_1);\
137 src_3_6 = _mm_shuffle_ps(src_3_6, src_5_8, _MM_SHUFFLE(2,1,2,1));/*4 5 6 7*/\
138 f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(3,3,3,3));\
139 f4_1 = _mm_mul_ps(f4_1, src_3_6);\
140 sum = _mm_add_ps(sum, f4_1)
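// Scalar equivalent of XY_FILTER_4 (a reference sketch, not used by the build): with
// src4 = {s0,s1,s2,s3}, src_5_8 = {s4,s5,s6,s7} and f4 = {f0,f1,f2,f3}, it accumulates
//     for (int i = 0; i < 4; i++)
//         sum[i] += s[i]*f0 + s[i+1]*f1 + s[i+2]*f2 + s[i+3]*f3;
// i.e. the contribution of 4 filter taps to 4 adjacent output samples; src4 is clobbered.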
142 /****
143 * @src4, @f4_1, @sum : __m128
144 * @f4_1: const
145 * @sum : output
146 * @src4: undefined
147 **/
148 #define XY_FILTER_4_1(src4, f4_1, sum) \
149 src4 = _mm_mul_ps(src4, f4_1);\
150 sum = _mm_add_ps(sum, src4);
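// Scalar equivalent of XY_FILTER_4_1 (sketch): for i in 0..3, sum[i] += src4[i] * f4_1[i];
// src4 is clobbered (it holds the products afterwards).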
152 __forceinline void xy_filter_one_line_sse_v6(float *dst, int width, const float *filter, int filter_width)
154 int xx_fix = width > filter_width ? 0 : filter_width - width;
155 const float *filter_start = filter;
156 float *dst2 = dst - filter_width;
157 float *dst_endr = dst + width;
158 float *dst_end0 = dst_endr - filter_width - 4;
159 float *dst_endl = dst - xx_fix;
160 ASSERT(xx_fix==0 || dst_end0==dst_endl-4);
162 ASSERT(filter_start == filter);
163 filter_start += filter_width;
164 const float *filter_end = filter_start;
166 for (;dst2<dst_endl;dst2+=4)//left margin
168 const float *src = dst;
169 filter_start -= 4;
171 //filter 4
172 __m128 src4 = _mm_setzero_ps();/*1 2 3 4*/
173 __m128 sum = _mm_setzero_ps();
174 for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
176 __m128 src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
177 __m128 f4 = _mm_load_ps(f);
179 { XY_FILTER_4(src4, src_5_8, f4, sum); }
181 src4 = src_5_8;
183 //store result
184 _mm_store_ps(dst2, sum);
186 for (;dst2<dst;dst2+=4)//if width < filter_width
188 const float *src = dst;
189 filter_start-=4;
190 filter_end-=4;
192 __m128 src4 = _mm_setzero_ps();/*1 2 3 4*/
193 __m128 sum = _mm_setzero_ps();
194 __m128 src_5_8, f4;
195 for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
197 src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
198 f4 = _mm_load_ps(f);
200 { XY_FILTER_4(src4, src_5_8, f4, sum); }
201 src4 = src_5_8;
203 src_5_8 = _mm_setzero_ps();
204 f4 = _mm_load_ps(filter_end);
205 { XY_FILTER_4(src4, src_5_8, f4, sum); }
206 //store result
207 _mm_store_ps(dst2, sum);
209 ASSERT(filter_start == filter);
210 for (;dst2<dst_end0;dst2+=8)
212 const float *src = dst2;
213 const float* f=filter_start;
215 //filter 8
216 __m128 src4 = _mm_load_ps(src);/*1 2 3 4*/
217 src+=4;
218 __m128 src_5_8;
219 __m128 sum = _mm_setzero_ps();
220 __m128 sum2 = _mm_setzero_ps();
221 __m128 f4 = _mm_load_ps(f);
222 f+=4;
223 src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
224 src+=4;
225 { XY_FILTER_4(src4, src_5_8, f4, sum); }
226 for(;f<filter_end;f+=4,src+=4)
228 src4 = _mm_load_ps(src);/*1 2 3 4*/
229 __m128 tmp = src_5_8;//important!
230 { XY_FILTER_4(tmp, src4, f4, sum2); }
232 f4 = _mm_load_ps(f);
233 { XY_FILTER_4(src_5_8, src4, f4, sum); }
234 src_5_8 = src4;
236 src4 = _mm_load_ps(src);/*1 2 3 4*/
237 { XY_FILTER_4(src_5_8, src4, f4, sum2); }
239 //store result
240 _mm_store_ps(dst2, sum);
241 _mm_store_ps(dst2+4, sum2);
243 if (dst2==dst_end0)
245 const float *src = dst2;
246 //filter 4
247 __m128 src4 = _mm_load_ps(src);//1 2 3 4
248 src+=4;
249 __m128 sum = _mm_setzero_ps();
250 for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
252 __m128 src_5_8 = _mm_load_ps(src);//5 6 7 8
253 __m128 f4 = _mm_load_ps(f);
255 { XY_FILTER_4(src4, src_5_8, f4, sum); }
256 src4 = src_5_8;
258 //store result
259 _mm_store_ps(dst2, sum);
260 dst2+=4;
262 for (;dst2<dst_endr;dst2+=4)//right margin
264 const float *src = dst2;
265 filter_end-=4;
267 //filter 4
268 __m128 src4 = _mm_load_ps(src);//1 2 3 4
269 __m128 sum = _mm_setzero_ps();
270 __m128 src_5_8, f4;
271 for(const float* f=filter_start;f<filter_end;f+=4)
273 src+=4;
274 src_5_8 = _mm_load_ps(src);//5 6 7 8
275 f4 = _mm_load_ps(f);
277 { XY_FILTER_4(src4, src_5_8, f4, sum); }
279 src4 = src_5_8;
280 //move new 4 in_n_out to old 4 in_n_out
282 src_5_8 = _mm_setzero_ps();
283 f4 = _mm_load_ps(filter_end);
284 { XY_FILTER_4(src4, src_5_8, f4, sum); }
285 //store result
286 _mm_store_ps(dst2, sum);
290 /****
291 * See @xy_filter_c
293 void xy_filter_sse_v6(float *dst, int width, int height, int stride, const float *filter, int filter_width)
295 ASSERT( stride>=4*(width+filter_width) );
296 ASSERT( ((stride|(4*width)|(4*filter_width)|reinterpret_cast<intptr_t>(dst)|reinterpret_cast<intptr_t>(filter))&15)==0 );
298 BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
299 BYTE* end = dst_byte + height*stride;
300 for( ; dst_byte<end; dst_byte+=stride )
302 xy_filter_one_line_sse_v6(reinterpret_cast<float*>(dst_byte), width, filter, filter_width);
306 /****
307 * Constrain:
308 * LENGTH%4 == 0 || LENGTH%4 == 1
310 template<int LENGTH>
311 struct M128s
313 __m128 x;
314 M128s<LENGTH - 4> next;
316 template<int Index> __forceinline __m128& GetAt()
318 return next.GetAt<Index - 4>();
320 template<> __forceinline __m128& GetAt<0>()
322 return x;
325 template<int Start, int Offset> __forceinline __m128& GetAt()
327 return GetAt<Start + Offset>();
330 __forceinline void Load(const float* src)
332 x = _mm_load_ps(src);
333 next.Load(src+4);
337 template<>
338 struct M128s<1>
340 __m128 x;
342 template<int Index> __forceinline __m128& GetAt()
344 return x;
346 __forceinline void Load(const float* src)
348 x = _mm_set1_ps(*src);
352 template<>
353 struct M128s<0>
355 void Load(const float* src)
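// Usage sketch (illustrative, not from the original code): M128s<N> stores a filter of
// N coefficients in __m128 members chosen at compile time, e.g.
//     M128s<8> coeffs;                  // holds two __m128
//     coeffs.Load(filter);              // filter: 8 aligned floats
//     __m128 f4_hi = coeffs.GetAt<4>(); // coefficients 4..7
// The constraint LENGTH%4==0 || LENGTH%4==1 above reflects the specializations: a tail
// of 1 stores a single coefficient broadcast with _mm_set1_ps.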
360 template<int FILTER_LENGTH, int START, int LENGTH>
361 struct Filter4
363 static __forceinline void do_cal(__m128& src0_128, const float * src4, M128s<FILTER_LENGTH>& filter128s, __m128& sum)
365 __m128 src4_128 = _mm_load_ps(src4);
366 { XY_FILTER_4(src0_128, src4_128, filter128s.GetAt<START>(), sum); }
367 Filter4<FILTER_LENGTH,START+4,LENGTH-4>::do_cal(src4_128, src4+4, filter128s, sum);
368 src0_128 = src4_128;
372 template<int FILTER_LENGTH, int START>
373 struct Filter4<FILTER_LENGTH, START, 1>
375 static __forceinline void do_cal(__m128& src0_128, const float * src4, M128s<FILTER_LENGTH>& filter128s, __m128& sum)
377 cal_tail<FILTER_LENGTH-START>(src0_128, src4, filter128s, sum);
379 template<int TAIL>
380 static __forceinline void cal_tail(__m128& src0_128, const float * src4, M128s<FILTER_LENGTH>& filter128s, __m128& sum);
381 template<>
382 static __forceinline void cal_tail<1>(__m128& src0_128, const float * src4, M128s<FILTER_LENGTH>& filter128s, __m128& sum)
384 { XY_FILTER_4_1(src0_128, filter128s.GetAt<FILTER_LENGTH-1>(), sum); }
388 template<int FILTER_LENGTH, int START>
389 struct Filter4<FILTER_LENGTH, START, 0>
391 static __forceinline void do_cal(__m128& src0_128, const float * src4, M128s<FILTER_LENGTH>& filter128s, __m128& sum)
396 template<int FILTER_LENGTH,int MARGIN_LENGTH>
397 struct FilterAllLeftMargin
399 static __forceinline void cal(float * src, M128s<FILTER_LENGTH>& filter128s)
401 do_cal<FILTER_LENGTH%4>(src, filter128s);
403 template<int FILTER_TAIL>
404 static __forceinline void do_cal(float * src, M128s<FILTER_LENGTH>& filter128s)
406 //filter 4
407 __m128 src0 = _mm_setzero_ps();
408 __m128 sum = _mm_setzero_ps();
409 Filter4<FILTER_LENGTH,MARGIN_LENGTH-4,FILTER_LENGTH-MARGIN_LENGTH+4>::do_cal(src0, src, filter128s, sum);
410 _mm_store_ps(src-MARGIN_LENGTH, sum);
411 FilterAllLeftMargin<FILTER_LENGTH,MARGIN_LENGTH-4>::do_cal<0>(src, filter128s);
413 template<>
414 static __forceinline void do_cal<1>(float * src, M128s<FILTER_LENGTH>& filter128s)
416 //filter 4
417 __m128 sum = _mm_setzero_ps();
418 //Only one of the last 4 filter coefficients is non-zero
419 _mm_store_ps(src-MARGIN_LENGTH, sum);
420 FilterAllLeftMargin<FILTER_LENGTH,MARGIN_LENGTH-4>::do_cal<0>(src, filter128s);
424 template<int FILTER_LENGTH, int MARGIN_LENGTH>
425 struct FilterAllRightMargin
427 static __forceinline void cal(float * src, M128s<FILTER_LENGTH>& filter128s)
429 do_cal<FILTER_LENGTH%4>(src, filter128s);
431 template<int FILTER_TAIL>
432 static __forceinline void do_cal(float * src, M128s<FILTER_LENGTH>& filter128s)
434 //filter 4
436 __m128 src0 = _mm_load_ps(src);
437 __m128 sum = _mm_setzero_ps();
438 Filter4<FILTER_LENGTH,0,MARGIN_LENGTH-4>::do_cal(src0, src+4, filter128s, sum);
439 __m128 src4 = _mm_setzero_ps();
440 { XY_FILTER_4(src0, src4, filter128s.GetAt<MARGIN_LENGTH-4>(), sum); }
441 //store result
442 _mm_store_ps(src, sum);
444 FilterAllRightMargin<FILTER_LENGTH,MARGIN_LENGTH-4>::do_cal<0>(src+4, filter128s);
446 template<>
447 static __forceinline void do_cal<1>(float * src, M128s<FILTER_LENGTH>& filter128s)
449 //filter 4
451 __m128 src0 = _mm_load_ps(src);
452 __m128 sum = _mm_setzero_ps();
453 Filter4<FILTER_LENGTH,0,MARGIN_LENGTH-4>::do_cal(src0, src+4, filter128s, sum);
454 //Only one of the last 4 filter coefficients is non-zero
455 { XY_FILTER_4_1(src0, filter128s.GetAt<MARGIN_LENGTH-4>(), sum); }
456 //store result
457 _mm_store_ps(src, sum);
459 FilterAllRightMargin<FILTER_LENGTH,MARGIN_LENGTH-4>::do_cal<0>(src+4, filter128s);
463 template<int FILTER_LENGTH>
464 struct FilterAllLeftMargin<FILTER_LENGTH,0>
466 template<int FILTER_TAIL>
467 static __forceinline void do_cal(float * src, M128s<FILTER_LENGTH>& filter128s)
472 template<int FILTER_LENGTH>
473 struct FilterAllRightMargin<FILTER_LENGTH,0>
475 template<int FILTER_TAIL>
476 static __forceinline void do_cal(float * src, M128s<FILTER_LENGTH>& filter128s)
481 /****
482 * Equivalent:
483 * xy_filter_c(float *dst, int width, int height, int stride, const float *filter, (FILTER_LENGTH+3)&~3 );
484 * See @xy_filter_c
485 * Constrain:
486 * FILTER_LENGTH<=width && width%4==0
488 template<int FILTER_LENGTH>
489 void xy_filter_sse_template(float *dst, int width, int height, int stride, const float *filter)
491 ASSERT( stride>=4*(width+FILTER_LENGTH) );
492 ASSERT( ((stride|(4*width)|reinterpret_cast<intptr_t>(dst)|reinterpret_cast<intptr_t>(filter))&15)==0 );
494 M128s<FILTER_LENGTH> filter128s;
495 filter128s.Load(filter);
497 const float *filter_start = filter;
498 BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
499 BYTE* end = dst_byte + height*stride;
500 for( ; dst_byte<end; dst_byte+=stride )
502 float *dst2 = reinterpret_cast<float*>(dst_byte);
504 //left margin
505 FilterAllLeftMargin<FILTER_LENGTH,((FILTER_LENGTH+3)&~3)>::cal(dst2, filter128s);
506 float *dst_end1 = dst2 + width;
507 float *dst_end0 = dst_end1 - ((FILTER_LENGTH+3)&~3);
508 for (;dst2<dst_end0;dst2+=4)
510 const float *src = dst2;
512 //filter 4
513 __m128 src0 = _mm_load_ps(src);/*1 2 3 4*/
514 src += 4;
515 __m128 sum = _mm_setzero_ps();
516 Filter4<FILTER_LENGTH,0,FILTER_LENGTH>::do_cal(src0, src, filter128s, sum);
517 //store result
518 _mm_store_ps(dst2, sum);
520 FilterAllRightMargin<FILTER_LENGTH,((FILTER_LENGTH+3)&~3)>::cal(dst2, filter128s);
525 /****
526 * @src4, @src_5_8, @f3_1, @f3_2, @sum: __m128
527 * @src4, @src_5_8, @f3_1, @f3_2: const
528 * @sum: output
530 #define XY_3_TAG_SYMMETRIC_FILTER_4(src4, src_5_8, f3_1, f3_2, sum) \
531 __m128 src_3_6 = _mm_shuffle_ps(src4, src_5_8, _MM_SHUFFLE(1,0,3,2));/*3 4 5 6*/\
532 __m128 src_2_5 = _mm_shuffle_ps(src4, src_3_6, _MM_SHUFFLE(2,1,2,1));/*2 3 4 5*/\
533 sum = _mm_mul_ps(f3_1, src4);\
534 __m128 mul2 = _mm_mul_ps(f3_2, src_2_5);\
535 __m128 mul3 = _mm_mul_ps(f3_1, src_3_6);\
536 sum = _mm_add_ps(sum, mul2);\
537 sum = _mm_add_ps(sum, mul3);
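// Scalar equivalent of XY_3_TAG_SYMMETRIC_FILTER_4 (sketch): with src4 = {s0..s3} and
// src_5_8 = {s4..s7}, it computes, for i in 0..3,
//     sum[i] = f3_1[i]*s[i] + f3_2[i]*s[i+1] + f3_1[i]*s[i+2];
// i.e. a 3-tap symmetric filter (the 4th tap of the generic path is zero).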
539 /****
540 * Equivalent:
541 * xy_filter_c(float *dst, int width, int height, int stride, const float *filter, 4 );
542 * See @xy_filter_c
543 * Constrain:
544 * filter[3] == 0 && filter[0] == filter[2] (symmetric) (&& sum(filter)==1)
546 void xy_3_tag_symmetric_filter_sse(float *dst, int width, int height, int stride, const float *filter)
548 const int filter_width = 4;
549 ASSERT( stride>=4*(width+filter_width) );
550 ASSERT( ((stride|(4*width)|(4*filter_width)|reinterpret_cast<intptr_t>(dst)|reinterpret_cast<intptr_t>(filter))&15)==0 );
552 ASSERT(filter_width==4 && filter[3]==0 && filter[2]==filter[0]);
554 __m128 f3_1 = _mm_set1_ps(filter[0]);
555 __m128 f3_2 = _mm_set1_ps(filter[1]);
557 BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
558 BYTE* end = dst_byte + height*stride;
559 for( ; dst_byte<end; dst_byte+=stride )
561 float *dst_f = reinterpret_cast<float*>(dst_byte);
562 float *dst2 = dst_f;
564 float *dst_end0 = dst_f + width - 4;
565 //filter 4
566 __m128 src4 = _mm_load_ps(dst2);/*1 2 3 4*/
568 __m128 sum;
569 __m128 src_4 = _mm_setzero_ps();
570 { XY_3_TAG_SYMMETRIC_FILTER_4(src_4, src4, f3_1, f3_2, sum); }
571 _mm_store_ps(dst2-4, sum);
573 for (;dst2<dst_end0;dst2+=4)
575 __m128 sum;
576 __m128 src_5_8 = _mm_load_ps(dst2+4);/*5 6 7 8*/
577 { XY_3_TAG_SYMMETRIC_FILTER_4(src4, src_5_8, f3_1, f3_2, sum); }
578 src4 = src_5_8;
579 //store result
580 _mm_store_ps(dst2, sum);
583 __m128 sum;
584 __m128 src_5_8 = _mm_setzero_ps();/*5 6 7 8*/
585 { XY_3_TAG_SYMMETRIC_FILTER_4(src4, src_5_8, f3_1, f3_2, sum); }
586 src4 = src_5_8;
587 //store result
588 _mm_store_ps(dst2, sum);
594 /****
595 * See @xy_filter_c
597 void xy_filter_sse(float *dst, int width, int height, int stride, const float *filter, int filter_width)
599 typedef void (*Filter)(float *dst, int width, int height, int stride, const float *filter);
600 const Filter filters[] = {
601 NULL,
602 xy_filter_sse_template<1>, xy_filter_sse_template<4>, xy_filter_sse_template<4>, xy_filter_sse_template<4>,
603 xy_filter_sse_template<5>, xy_filter_sse_template<8>, xy_filter_sse_template<8>, xy_filter_sse_template<8>,
604 xy_filter_sse_template<9>, xy_filter_sse_template<12>, xy_filter_sse_template<12>, xy_filter_sse_template<12>,
605 xy_filter_sse_template<13>, xy_filter_sse_template<16>, xy_filter_sse_template<16>, xy_filter_sse_template<16>,
606 xy_filter_sse_template<17>, xy_filter_sse_template<20>, xy_filter_sse_template<20>, xy_filter_sse_template<20>,
607 xy_filter_sse_template<21>, xy_filter_sse_template<24>, xy_filter_sse_template<24>, xy_filter_sse_template<24>,
608 xy_filter_sse_template<25>, xy_filter_sse_template<28>, xy_filter_sse_template<28>, xy_filter_sse_template<28>
610 if (filter_width<=28 && filter_width<=width)
612 int tmp = filter_width;
613 // Skip trailing zeros, but we cannot (and don't have to) support more than 3 trailing zeros currently.
614 while( AlmostEqual(filter[tmp-1],0.0f) && filter_width-tmp<3 )
615 tmp--;
616 if (tmp==3&&filter[0]==filter[2])
618 xy_3_tag_symmetric_filter_sse(dst, width, height, stride, filter);
620 else
622 filters[tmp](dst, width, height, stride, filter);
625 else
627 xy_filter_sse_v6(dst, width, height, stride, filter, filter_width);
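// Dispatch note (a reading of the table above, not authoritative): the index into
// filters[] is the filter width after trailing near-zero taps are trimmed, so e.g. a
// 6-tap filter is handled by xy_filter_sse_template<8>, while widths of the form 4k+1
// (5, 9, 13, ...) get an exact instantiation, matching the M128s constraint
// LENGTH%4==0 || LENGTH%4==1. The filter buffer is assumed to be readable (zero padded)
// up to the next multiple of 4 floats, which xy_gaussian_blur checks via its
// gt_ex_width_x/gt_ex_width_y arguments.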
631 /****
632 * Copy and convert src to dst line by line.
633 * @dst_width MUST >= @width
634 * if @dst_width>@width, the extra elements will be filled with 0.
636 void xy_byte_2_float_c(float *dst, int dst_width, int dst_stride,
637 PCUINT8 src, int width, int height, int stride)
639 PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
641 PCUINT8 src_end = src + height*stride;
642 for( ; src<src_end; src+=stride, dst_byte+=dst_stride )
644 PCUINT8 src2 = src;
645 PCUINT8 src_end = src2 + width;
646 float *dst2 = reinterpret_cast<float*>(dst_byte);
648 for (;src2<src_end;src2++, dst2++)
650 *dst2 = *src2;
652 float *dst2_end=reinterpret_cast<float*>(dst_byte)+dst_width;
653 for (;dst2<dst2_end;dst2++)
655 *dst2=0;
660 /****
661 * See @xy_byte_2_float_c
663 void xy_byte_2_float_sse(float *dst, int dst_width, int dst_stride,
664 PCUINT8 src, int width, int height, int stride)
666 ASSERT( dst_width>=width );
667 ASSERT( ((reinterpret_cast<intptr_t>(dst)|dst_stride)&15)==0 );
668 PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
670 PCUINT8 src_end = src + height*stride;
671 for( ; src<src_end; src+=stride, dst_byte+=dst_stride )
673 PCUINT8 src2 = src;
674 PCUINT8 src2_end0 = src2 + (width&~15);
675 float *dst2 = reinterpret_cast<float*>(dst_byte);
677 for (;src2<src2_end0;src2+=16, dst2+=16)
679 //convert 16 bytes to 16 floats
680 __m128i src16 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src2));
681 __m128i src16_lo = _mm_unpacklo_epi8(src16, _mm_setzero_si128());
682 __m128i src16_hi = _mm_unpackhi_epi8(src16, _mm_setzero_si128());
683 __m128i src16_lo_lo = _mm_unpacklo_epi8(src16_lo, _mm_setzero_si128());
684 __m128i src16_lo_hi = _mm_unpackhi_epi8(src16_lo, _mm_setzero_si128());
685 __m128i src16_hi_lo = _mm_unpacklo_epi8(src16_hi, _mm_setzero_si128());
686 __m128i src16_hi_hi = _mm_unpackhi_epi8(src16_hi, _mm_setzero_si128());
687 __m128 dst_f1 = _mm_cvtepi32_ps(src16_lo_lo);
688 __m128 dst_f2 = _mm_cvtepi32_ps(src16_lo_hi);
689 __m128 dst_f3 = _mm_cvtepi32_ps(src16_hi_lo);
690 __m128 dst_f4 = _mm_cvtepi32_ps(src16_hi_hi);
691 _mm_store_ps(dst2, dst_f1);
692 _mm_store_ps(dst2+4, dst_f2);
693 _mm_store_ps(dst2+8, dst_f3);
694 _mm_store_ps(dst2+12, dst_f4);
696 PCUINT8 src2_end = src + width;
697 for (;src2<src2_end;src2++,dst2++)
699 *dst2 = *src2;
701 float *dst2_end=reinterpret_cast<float*>(dst_byte)+dst_width;
702 for (;dst2<dst2_end;dst2++)
704 *dst2=0;
709 /****
710 * Copy transposed Matrix src to dst.
711 * @dst_width MUST >= @height.
712 * if @dst_width > @height, the extra elements will be filled with 0.
714 void xy_float_2_float_transpose_c(float *dst, int dst_width, int dst_stride,
715 const float *src, int width, int height, int src_stride)
717 ASSERT(dst_width >= height);
718 PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
719 const float* src_end = src + width;
720 PCUINT8 src2_end = reinterpret_cast<PCUINT8>(src) + height*src_stride;
721 for( ; src<src_end; src++, dst_byte+=dst_stride )
723 PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);
725 float *dst2 = reinterpret_cast<float*>(dst_byte);
726 for (;src2<src2_end;src2+=src_stride,dst2++)
728 *dst2 = *reinterpret_cast<const float*>(src2);
730 float *dst2_end = reinterpret_cast<float*>(dst_byte) + dst_width;
731 for (;dst2<dst2_end;dst2++)
733 *dst2 = 0;
738 /****
739 * see @xy_float_2_float_transpose_c
741 void xy_float_2_float_transpose_sse(float *dst, int dst_width, int dst_stride,
742 const float *src, int width, int height, int src_stride)
744 typedef float DstT;
745 typedef const float SrcT;
747 ASSERT( (((intptr_t)dst|dst_stride)&15)==0 );
748 ASSERT(dst_width >= height);
749 PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
750 SrcT* src_end = src + width;
751 PCUINT8 src2_end1 = reinterpret_cast<PCUINT8>(src) + (height&~3)*src_stride;
752 PCUINT8 src2_end2 = reinterpret_cast<PCUINT8>(src) + height*src_stride;
753 for( ; src<src_end; src++, dst_byte+=dst_stride )
755 PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);
757 DstT *dst2 = reinterpret_cast<DstT*>(dst_byte);
758 for (;src2<src2_end1;src2+=4*src_stride,dst2+=4)
760 __m128 m1 = _mm_set_ps(
761 *(SrcT*)(src2+3*src_stride),
762 *(SrcT*)(src2+2*src_stride),
763 *(SrcT*)(src2+src_stride),
764 *(SrcT*)(src2));
765 _mm_store_ps(dst2, m1);
767 for (;src2<src2_end2;src2+=src_stride,dst2++)
769 *dst2 = *reinterpret_cast<SrcT*>(src2);
771 float *dst2_end = reinterpret_cast<DstT*>(dst_byte) + dst_width;
772 for (;dst2<dst2_end;dst2++)
774 *dst2 = 0;
779 /****
780 * Transpose and round Matrix src, then copy to dst.
781 * @dst_width MUST >= @height.
782 * if @dst_width > @height, the extra elements will be filled with 0.
784 void xy_float_2_byte_transpose_c(UINT8 *dst, int dst_width, int dst_stride,
785 const float *src, int width, int height, int src_stride)
787 ASSERT(dst_width >= height);
788 PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
789 const float* src_end = src + width;
790 PCUINT8 src2_end = reinterpret_cast<PCUINT8>(src) + height*src_stride;
791 for( ; src<src_end; src++, dst_byte+=dst_stride )
793 PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);
795 UINT8 *dst2 = reinterpret_cast<UINT8*>(dst_byte);
796 for (;src2<src2_end;src2+=src_stride,dst2++)
798 *dst2 = static_cast<UINT8>(*reinterpret_cast<const float*>(src2)+0.5);
800 UINT8 *dst2_end = reinterpret_cast<UINT8*>(dst_byte) + dst_width;
801 for (;dst2<dst2_end;dst2++)
803 *dst2 = 0;
808 void xy_float_2_byte_transpose_sse(UINT8 *dst, int dst_width, int dst_stride,
809 const float *src, int width, int height, int src_stride)
811 typedef UINT8 DstT;
812 typedef const float SrcT;
814 ASSERT(dst_width >= height);
815 ASSERT((((intptr_t)dst|dst_stride)&15)==0);
816 PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
817 SrcT* src_end = src + width;
818 PCUINT8 src2_end00 = reinterpret_cast<PCUINT8>(src) + (height&~15)*src_stride;
819 PCUINT8 src2_end = reinterpret_cast<PCUINT8>(src) + height*src_stride;
820 for( ; src<src_end; src++, dst_byte+=dst_stride )
822 PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);
824 DstT *dst2 = reinterpret_cast<DstT*>(dst_byte);
825 for (;src2<src2_end00;src2+=16*src_stride,dst2+=16)
827 __m128 m1 = _mm_set_ps(
828 *(SrcT*)(src2+3*src_stride),
829 *(SrcT*)(src2+2*src_stride),
830 *(SrcT*)(src2+src_stride),
831 *(SrcT*)(src2));
832 __m128 m2 = _mm_set_ps(
833 *(SrcT*)(src2+7*src_stride),
834 *(SrcT*)(src2+6*src_stride),
835 *(SrcT*)(src2+5*src_stride),
836 *(SrcT*)(src2+4*src_stride));
837 __m128 m3 = _mm_set_ps(
838 *(SrcT*)(src2+11*src_stride),
839 *(SrcT*)(src2+10*src_stride),
840 *(SrcT*)(src2+9*src_stride),
841 *(SrcT*)(src2+8*src_stride));
842 __m128 m4 = _mm_set_ps(
843 *(SrcT*)(src2+15*src_stride),
844 *(SrcT*)(src2+14*src_stride),
845 *(SrcT*)(src2+13*src_stride),
846 *(SrcT*)(src2+12*src_stride));
848 __m128i i1 = _mm_cvtps_epi32(m1);
849 __m128i i2 = _mm_cvtps_epi32(m2);
850 __m128i i3 = _mm_cvtps_epi32(m3);
851 __m128i i4 = _mm_cvtps_epi32(m4);
853 i1 = _mm_packs_epi32(i1,i2);
854 i3 = _mm_packs_epi32(i3,i4);
855 i1 = _mm_packus_epi16(i1,i3);
857 _mm_store_si128((__m128i*)dst2, i1);
859 for (;src2<src2_end;src2+=src_stride,dst2++)
861 *dst2 = static_cast<DstT>(*reinterpret_cast<SrcT*>(src2)+0.5);
863 DstT *dst2_end = reinterpret_cast<DstT*>(dst_byte) + dst_width;
864 for (;dst2<dst2_end;dst2++)
866 *dst2 = 0;
871 /****
872 * To Do: decent CPU capability check
874 void xy_gaussian_blur(PUINT8 dst, int dst_stride,
875 PCUINT8 src, int width, int height, int stride,
876 const float *gt_x, int r_x, int gt_ex_width_x,
877 const float *gt_y, int r_y, int gt_ex_width_y)
879 ASSERT(width<=stride && width+2*r_x<=dst_stride);
880 int ex_mask_width_x = ((r_x*2+1)+3)&~3;
881 ASSERT(ex_mask_width_x<=gt_ex_width_x);
882 if (ex_mask_width_x>gt_ex_width_x)
884 int o=0;
885 o=o/o;
886 exit(-1);
889 int ex_mask_width_y = ((r_y*2+1)+3)&~3;
890 ASSERT(ex_mask_width_y<=gt_ex_width_y);
891 if (ex_mask_width_y>gt_ex_width_y)
893 int o=0;
894 o=o/o;
895 exit(-1);
899 int fwidth = (width+3)&~3;
900 int fstride = (fwidth + ex_mask_width_x)*sizeof(float);
901 int fheight = (height+3)&~3;
902 int fstride_ver = (fheight+ex_mask_width_y)*sizeof(float);
904 PUINT8 buff_base = reinterpret_cast<PUINT8>(xy_malloc(height*fstride + (fwidth + ex_mask_width_x)*fstride_ver));
906 float *hor_buff_base = reinterpret_cast<float*>(buff_base);
907 float *hor_buff = hor_buff_base + ex_mask_width_x;
909 // byte to float
910 ASSERT( ((width+15)&~15)<=stride );
911 xy_byte_2_float_sse(hor_buff, fwidth, fstride, src, width, height, stride);
913 // horizontal pass
914 xy_filter_sse(hor_buff, fwidth, height, fstride, gt_x, ex_mask_width_x);
917 // transpose
918 float *ver_buff_base = reinterpret_cast<float*>(buff_base + height*fstride);
919 float *ver_buff = ver_buff_base + ex_mask_width_y;
921 int true_width = width+r_x*2;
922 xy_float_2_float_transpose_sse(ver_buff, fheight, fstride_ver, hor_buff-r_x*2, true_width, height, fstride);
924 // vertical pass
925 xy_filter_sse(ver_buff, fheight, true_width, fstride_ver, gt_y, ex_mask_width_y);
927 // transpose
928 int true_height = height + 2*r_y;
929 xy_float_2_byte_transpose_sse(dst, true_width, dst_stride, ver_buff-r_y*2, true_height, true_width, fstride_ver);
931 xy_free(buff_base);
932 #ifndef _WIN64
933 // TODOX64 : fixme!
934 _mm_empty();
935 #endif
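// Size note (derived from the transposes above; stated as a sketch): the blurred output
// written to @dst covers (width + 2*r_x) x (height + 2*r_y) bytes, which is why the entry
// ASSERT requires width + 2*r_x <= dst_stride; the caller must also provide at least
// height + 2*r_y rows in @dst.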
939 enum RoundingPolicy
941 ROUND_DOWN
942 , ROUND_HALF_DOWN
943 , ROUND_HALF_UP
944 , ROUND_HALF_TO_EVEN
945 , COUNT_ROUND_POLICY
948 template<int ROUNDING_POLICY, int precision>
949 struct XyRounding
951 __forceinline void init_sse();
952 __forceinline __m128i round(__m128i in);
953 __forceinline int round(int in);
956 template<int precision>
957 struct XyRounding<ROUND_DOWN, precision>
959 __forceinline void init_sse()
963 __forceinline __m128i round(__m128i in)
965 return in;
968 __forceinline int round(unsigned in)
970 return in;
975 template<int precision>
976 struct XyRounding<ROUND_HALF_DOWN, precision>
978 __forceinline void init_sse()
980 m_rounding_patch = _mm_set1_epi16( (1<<(precision-1))-1 );
982 __forceinline __m128i round(__m128i in)
984 return _mm_adds_epu16(in, m_rounding_patch);
987 __forceinline int round(unsigned in)
989 return in + ((1<<(precision-1))-1);
991 __m128i m_rounding_patch;
995 template<int precision>
996 struct XyRounding<ROUND_HALF_UP, precision>
998 __forceinline void init_sse()
1000 m_rounding_patch = _mm_set1_epi16( 1<<(precision-1) );
1002 __forceinline __m128i round(__m128i in)
1004 return _mm_adds_epu16(in, m_rounding_patch);
1007 __forceinline int round(unsigned in)
1009 return in + (1<<(precision-1));
1011 __m128i m_rounding_patch;
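// Worked example (sketch): with precision==2 (as used by xy_be_filter_*), the fixed-point
// value 6 means 1.5; ROUND_HALF_DOWN gives (6 + 1) >> 2 == 1, ROUND_HALF_UP gives
// (6 + 2) >> 2 == 2.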
1015 template<int precision>
1016 struct XyRounding<ROUND_HALF_TO_EVEN, precision>
1018 __forceinline void init_sse()
1020 m_rounding_patch = _mm_set1_epi16( 1<<(precision-1) );
1022 __forceinline __m128i round(__m128i in)
1024 in = _mm_adds_epu16(in, m_rounding_patch);
1025 __m128i tmp = _mm_slli_epi16(in, 15-precision);
1026 tmp = _mm_srli_epi16(tmp, 15);
1027 return _mm_adds_epu16(in, tmp);
1030 __forceinline int round(unsigned in)
1032 return in + (1<<(precision-1)) + ((in>>precision)&1);
1034 __m128i m_rounding_patch;
1037 /****
1038 * filter with [1,2,1]
1039 * 1. It is an in-place horizontal filter
1040 * 2. Boundary Pixels are filtered by padding 0. E.g.
1041 * dst[0] = (0*1 + dst[0]*2 + dst[1]*1)/4;
1043 template<int ROUNDING_POLICY>
1044 void xy_be_filter_c(PUINT8 dst, int width, int height, int stride)
1046 ASSERT(width>=1);
1047 if (width<=0)
1049 return;
1051 PUINT8 dst2 = NULL;
1052 XyRounding<ROUNDING_POLICY, 2> xy_rounding;
1053 for (int y=0;y<height;y++)
1055 dst2 = dst + y*stride;
1056 int old_sum = dst2[0];
1057 int tmp = 0;
1058 int x=0;
1059 for (x=0;x<width-1;x++)
1061 int new_sum = dst2[x]+dst2[x+1];
1062 tmp = old_sum + new_sum;//old_sum == src2[x-1]+src2[x];
1063 dst2[x] = (xy_rounding.round(tmp)>>2);
1064 old_sum = new_sum;
1066 tmp = old_sum + dst2[x];
1067 dst2[x] = (xy_rounding.round(tmp)>>2);
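// Interior-pixel formula implemented by the loop above (a restatement, for reference):
//     dst[x] = round((old_dst[x-1] + 2*old_dst[x] + old_dst[x+1]) / 4)
// where old_dst[] are the values before this in-place pass and round() follows ROUNDING_POLICY.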
1071 /****
1072 * 1. It is an in-place symmetric 3-tap horizontal filter
1073 * 2. Boundary Pixels are filtered by padding 0. E.g.
1074 * dst[0] = (0*1 + dst[0]*2 + dst[1]*1)/4;
1075 * 3. sum(filter) == 256
1077 template<int ROUNDING_POLICY>
1078 void xy_be_filter2_c(PUINT8 dst, int width, int height, int stride, PCUINT filter)
1080 ASSERT(width>=1);
1081 if (width<=0)
1083 return;
1086 const int VOLUME_BITS = 8;
1087 const int VOLUME = (1<<VOLUME_BITS);
1088 if (filter[0]==0)
1090 return;//nothing to do;
1092 else if (filter[0]== (VOLUME>>2))
1094 return xy_be_filter_c<ROUNDING_POLICY>(dst, width, height, stride);
1097 PUINT8 dst2 = NULL;
1098 XyRounding<ROUNDING_POLICY, VOLUME_BITS> xy_rounding;
1099 for (int y=0;y<height;y++)
1101 dst2 = dst + y*stride;
1102 int old_pix = 0;
1103 int tmp = 0;
1104 int x=0;
1105 for (x=0;x<width-1;x++)
1107 tmp = (old_pix + dst2[x+1]) * filter[0] + dst2[x] * filter[1];
1108 old_pix = dst2[x];
1109 dst2[x] = (xy_rounding.round(tmp)>>VOLUME_BITS);
1111 tmp = old_pix * filter[0] + dst2[x] * filter[1];
1112 dst2[x] = (xy_rounding.round(tmp)>>VOLUME_BITS);
1116 /****
1117 * See @xy_be_filter_c
1118 * No alignment requirement.
1120 template<int ROUNDING_POLICY>
1121 void xy_be_filter_sse(PUINT8 dst, int width, int height, int stride)
1123 ASSERT(width>=1);
1124 if (width<=0)
1126 return;
1128 int width_mod8 = ((width-1)&~7);
1129 XyRounding<ROUNDING_POLICY, 2> xy_rounding;
1130 xy_rounding.init_sse();
1131 for (int y = 0; y < height; y++) {
1132 PUINT8 dst2=dst+y*stride;
1134 __m128i old_pix_128 = _mm_cvtsi32_si128(dst2[0]);
1135 __m128i old_sum_128 = old_pix_128;
1137 int x = 0;
1138 for (; x < width_mod8; x+=8) {
1139 __m128i new_pix = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(dst2+x+1));
1140 new_pix = _mm_unpacklo_epi8(new_pix, _mm_setzero_si128());
1141 __m128i temp = _mm_slli_si128(new_pix,2);
1142 temp = _mm_add_epi16(temp, old_pix_128);
1143 temp = _mm_add_epi16(temp, new_pix);
1144 old_pix_128 = _mm_srli_si128(new_pix,14);
1146 new_pix = _mm_slli_si128(temp,2);
1147 new_pix = _mm_add_epi16(new_pix, old_sum_128);
1148 new_pix = _mm_add_epi16(new_pix, temp);
1149 old_sum_128 = _mm_srli_si128(temp, 14);
1151 new_pix = xy_rounding.round(new_pix);
1153 new_pix = _mm_srli_epi16(new_pix, 2);
1154 new_pix = _mm_packus_epi16(new_pix, new_pix);
1156 _mm_storel_epi64( reinterpret_cast<__m128i*>(dst2+x), new_pix );
1158 int old_sum = _mm_cvtsi128_si32(old_sum_128);
1159 old_sum &= 0xffff;
1160 int tmp = 0;
1161 for ( ; x < width-1; x++) {
1162 int new_sum = dst2[x] + dst2[x+1];
1163 tmp = old_sum + new_sum;
1164 dst2[x] = (xy_rounding.round(tmp)>>2);
1165 old_sum = new_sum;
1167 tmp = old_sum + dst2[x];
1168 dst2[x] = (xy_rounding.round(tmp)>>2);
1172 /****
1173 * See @xy_be_filter2_c
1174 * No alignment requirement.
1176 template<int ROUNDING_POLICY>
1177 void xy_be_filter2_sse(PUINT8 dst, int width, int height, int stride, PCUINT filter)
1179 const int VOLUME_BITS = 8;
1180 const int VOLUME = (1<<VOLUME_BITS);
1181 ASSERT(filter[0]==filter[2]);
1182 ASSERT(filter[0]+filter[1]+filter[2]==VOLUME);
1183 ASSERT(width>=1);
1184 if (width<=0)
1186 return;
1189 XyRounding<ROUNDING_POLICY, VOLUME_BITS> xy_rounding;
1190 xy_rounding.init_sse();
1191 __m128i f3_1 = _mm_set1_epi16(filter[0]);
1192 __m128i f3_2 = _mm_set1_epi16(filter[1]);
1194 int width_mod8 = ((width-1)&~7);
1195 //__m128i round = _mm_set1_epi16(8);
1197 PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
1198 PUINT8 end = dst_byte + height*stride;
1199 for( ; dst_byte<end; dst_byte+=stride )
1201 PUINT8 dst2 = dst_byte;
1203 PUINT8 dst_end0 = dst_byte + width - 4;
1205 __m128i old_pix1_128 = _mm_setzero_si128();
1206 __m128i old_pix2_128 = _mm_cvtsi32_si128(dst2[0]);
1208 int x = 0;
1209 for (; x < width_mod8; x+=8) {
1210 __m128i pix2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(dst2+x+1));
1211 pix2 = _mm_unpacklo_epi8(pix2, _mm_setzero_si128());
1212 __m128i pix1 = _mm_slli_si128(pix2,2);
1213 pix1 = _mm_add_epi8(pix1, old_pix2_128);
1214 __m128i pix0 = _mm_slli_si128(pix1,2);
1215 pix0 = _mm_add_epi8(pix0, old_pix1_128);
1216 old_pix1_128 = _mm_srli_si128(pix1,14);
1217 old_pix2_128 = _mm_srli_si128(pix2,14);
1219 pix0 = _mm_add_epi16(pix0, pix2);
1220 pix0 = _mm_mullo_epi16(pix0, f3_1);
1221 pix1 = _mm_mullo_epi16(pix1, f3_2);
1223 pix1 = _mm_adds_epu16(pix1, pix0);
1225 pix1 = xy_rounding.round(pix1);
1227 pix1 = _mm_srli_epi16(pix1, VOLUME_BITS);
1228 pix1 = _mm_packus_epi16(pix1, pix1);
1230 _mm_storel_epi64( reinterpret_cast<__m128i*>(dst2+x), pix1 );
1232 int old_pix1 = _mm_cvtsi128_si32(old_pix1_128);
1233 old_pix1 &= 0xff;
1234 int tmp = 0;
1235 for ( ; x < width-1; x++) {
1236 tmp = (old_pix1 + dst2[x+1]) * filter[0] + dst2[x] * filter[1];
1237 old_pix1 = dst2[x];
1239 dst2[x] = (xy_rounding.round(tmp)>>VOLUME_BITS);
1241 tmp = old_pix1*filter[0] + dst2[x]*filter[1];
1242 dst2[x] = (xy_rounding.round(tmp)>>VOLUME_BITS);
1246 /****
1247 * See @xy_be_blur
1248 * Construct the filter used in the final horizontal/vertical pass of @xy_be_blur when @pass is NOT an integer.
1249 * This filter is constructed to satisfy:
1250 * If @p is a pixel in the middle of the image, pixels @(p-1) and @(p+1) lie just at the left and the right
1251 * of @p respectively. The value of @p after all horizontal filtering equals
1252 * a*value_old(@(p-1)) + b*value_old(@p) + a*value_old(@(p+1)) + other pixels' weighted sum,
1253 * then
1254 * a/b = @pass/(@pass+1).
1255 * It makes sense because the above property also holds when @pass is an integer.
1257 * @return
1258 * Let n = floor(pass);
1259 * filter = [ (pass-n)(n+2) / (2*(1+3pass-n)), 1-(pass-n)(n+2)/(1+3pass-n), (pass-n)(n+2)/ (2*(1+3pass-n)) ]
1261 void xy_calculate_filter(float pass, PUINT filter)
1263 const int VOLUME = (1<<8);
1264 int n = (int)pass;
1265 if (n==0)
1267 filter[0] = VOLUME * pass/(1+3*pass);
1269 else if (n==1)
1271 filter[0] = VOLUME * (pass-1)/(2*pass);
1273 else
1275 filter[0] = VOLUME * (pass-n)*(n+2)/ (2*(1+3*pass-n));
1277 filter[1] = VOLUME - 2*filter[0];
1278 filter[2] = filter[0];
1280 if (2*filter[0]>filter[1])
1282 //this should not happen
1283 ASSERT(0);
1284 filter[0] = VOLUME/4;
1285 filter[1] = VOLUME - 2*filter[0];
1286 filter[2] = filter[0];
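// Worked example for xy_calculate_filter (a sketch; values truncated to UINT as in the code):
//     pass = 1.5  =>  n = 1,  filter[0] = 256*(1.5-1)/(2*1.5) = 42,  filter = {42, 172, 42}
//     pass = 0.5  =>  n = 0,  filter[0] = 256*0.5/(1+1.5)     = 51,  filter = {51, 154, 51}
// In both cases 2*filter[0] <= filter[1] and the entries sum to 256 (VOLUME).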
1290 /****
1291 * See @xy_float_2_byte_transpose_c
1293 void xy_byte_2_byte_transpose_c(UINT8 *dst, int dst_width, int dst_stride,
1294 PCUINT8 src, int width, int height, int src_stride)
1296 ASSERT(dst_width >= height);
1297 PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
1298 PCUINT8 src_end = src + width;
1299 PCUINT8 src2_end = reinterpret_cast<PCUINT8>(src) + height*src_stride;
1300 for( ; src<src_end; src++, dst_byte+=dst_stride )
1302 PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);
1304 UINT8 *dst2 = reinterpret_cast<UINT8*>(dst_byte);
1305 for (;src2<src2_end;src2+=src_stride,dst2++)
1307 *dst2 = *src2;
1309 UINT8 *dst2_end = reinterpret_cast<UINT8*>(dst_byte) + dst_width;
1310 for (;dst2<dst2_end;dst2++)
1312 *dst2 = 0;
1317 /****
1318 * Repeat the [1,2,1] filter @pass_x times horizontally and @pass_y times vertically
1319 * Boundary Pixels are filtered by padding 0, see @xy_be_filter_c.
1321 * @pass_x:
1322 * When @pass_x is not an integer, the horizontal filter [1,2,1] is repeated (int)@pass_x times,
1323 * and a 3-tap symmetric filter generated from @pass_x is applied.
1324 * The final 3-tap symmetric filter is constructed to satisfy the following property:
1325 * If @pass_xx > @pass_x, the filtered result of @pass_x should NOT be more blurred than
1326 * the result of @pass_xx. More specifically, the filtered result of @pass_x should NOT be more
1327 * blurred than that of (int)@pass_x+1 and should NOT be less blurred than that of (int)@pass_x;
1329 * Rounding:
1330 * The original VSFilter \be effect uses a simple round down, which has a bias of -7.5/16 per pass.
1331 * That rounding error is really huge with big @pass values. It has become part of the \be effect.
1332 * We can simulate VSFilter's rounding bias by combining different rounding methods. However, a simple
1333 * test shows that the result still has some visual difference from VSFilter's.
1335 * Known issue:
1336 * It seems that this separable filter implementation is more sensitive to precision than
1337 * VSFilter's simple implementation. Vertical blocky artifacts can be observed with big pass values
1338 * (and @pass_x==@pass_y). So it is only used for filtering the fractional part of the \be strength.
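// Example (illustrative): pass_x = 2.7 runs the integer [1,2,1] pass twice, then one
// extra pass with the 3-tap filter from xy_calculate_filter(2.7, f); pass_x = 3.0 would
// simply run the [1,2,1] pass three times with no fractional pass.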
1340 void xy_be_blur(PUINT8 src, int width, int height, int stride, float pass_x, float pass_y)
1342 //ASSERT(pass_x>0 && pass_y>0);
1344 typedef void (*XyBeFilter)(PUINT8 src, int width, int height, int stride);
1345 typedef void (*XyFilter2)(PUINT8 src, int width, int height, int stride, PCUINT filter);
1347 XyBeFilter filter = (g_cpuid.m_flags & CCpuID::sse2) ? xy_be_filter_sse<ROUND_HALF_TO_EVEN> : xy_be_filter_c<ROUND_HALF_TO_EVEN>;
1348 XyFilter2 filter2 = (g_cpuid.m_flags & CCpuID::sse2) ? xy_be_filter2_sse<ROUND_HALF_TO_EVEN> : xy_be_filter2_c<ROUND_HALF_TO_EVEN>;
1350 int stride_ver = height;
1351 PUINT8 tmp = reinterpret_cast<PUINT8>(xy_malloc(width*height));
1352 ASSERT(tmp);
1353 // horizontal pass
1354 int pass_x_int = static_cast<int>(pass_x);
1355 for (int i=0; i<pass_x_int; i++)
1357 filter(src, width, height, stride);
1359 if (pass_x-pass_x_int>0)
1361 UINT f[3] = {0};
1362 xy_calculate_filter(pass_x, f);
1363 filter2(src, width, height, stride, f);
1366 // transpose
1367 xy_byte_2_byte_transpose_c(tmp, height, stride_ver, src, width, height, stride);
1369 // vertical pass
1370 int pass_y_int = static_cast<int>(pass_y);
1371 for (int i=0;i<pass_y_int;i++)
1373 filter(tmp, height, width, stride_ver);
1375 if (pass_y-pass_y_int>0)
1377 UINT f[3] = {0};
1378 xy_calculate_filter(pass_y, f);
1379 filter2(tmp, height, width, stride_ver, f);
1382 // transpose
1383 xy_byte_2_byte_transpose_c(src, width, stride, tmp, height, width, stride_ver);
1385 xy_free(tmp);
1386 return;