Blur Optimization. [Part 7]
[xy_vsfilter.git] / test / unit_test / xy_filter_benchmark.h
#ifndef __XY_FILTER_BENCHMARK_F0ED9F75_C0A7_4B0E_910E_7A82ECE2D7AD_H__
#define __XY_FILTER_BENCHMARK_F0ED9F75_C0A7_4B0E_910E_7A82ECE2D7AD_H__

#include <gtest/gtest.h>
#include <wtypes.h>
#include <math.h>
#include <xmmintrin.h>
#include <emmintrin.h> //SSE2: the __m128i loads/packs used by the byte<->float conversions below

#include "xy_malloc.h"

typedef const UINT8 CUINT8, *PCUINT8;
//////////////////////////////////////////////////////////////////////

// filter test
void xy_filter_c_v0(float *dst, int width, int height, int stride, const float *filter, int filter_width)
{
    ASSERT( stride>=4*(width+filter_width) );

    int xx_fix = width > filter_width ? 0 : filter_width - width;
    const float *filter_start = filter;

    BYTE* end = reinterpret_cast<BYTE*>(dst) + height*stride;
    BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
    for( ; dst_byte<end; dst_byte+=stride )
    {
        float *dst_f = reinterpret_cast<float*>(dst_byte);
        float *dst2 = dst_f - filter_width;
        float *dst_endr = dst_f + width;
        float *dst_end0 = dst_endr - filter_width;
        float *dst_endl = dst_f - xx_fix;
        ASSERT(xx_fix==0 || dst_end0==dst_endl);

        ASSERT(filter_start == filter);
        filter_start += filter_width;
        const float *filter_end = filter_start;
        for (;dst2<dst_endl;dst2++, filter_start--)//left margin
        {
            const float *src = dst_f;
            float sum = 0;
            for(const float* f=filter_start;f<filter_end;f++, src++)
            {
                sum += src[0] * f[0];
            }
            *dst2 = sum;
        }
        for (;dst2<dst_f;dst2++, filter_start--, filter_end--)//if width < filter_width
        {
            const float *src = dst_f;
            float sum = 0;
            for(const float* f=filter_start;f<filter_end;f++, src++)
            {
                sum += src[0] * f[0];
            }
            *dst2 = sum;
        }
        ASSERT(filter_start==filter);
        for (;dst2<dst_end0;dst2++)
        {
            const float *src = dst2;
            float sum = 0;
            for(const float* f=filter_start;f<filter_end;f++, src++)
            {
                sum += src[0] * f[0];
            }
            *dst2 = sum;
        }
        for (;dst2<dst_endr;dst2++, filter_end--)//right margin
        {
            const float *src = dst2;
            float sum = 0;
            for(const float* f=filter;f<filter_end;f++, src++)
            {
                sum += src[0] * f[0];
            }
            *dst2 = sum;
        }
    }
}
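// Reference semantics (as implemented by the loops above): the filter runs
// in place on each row, with src==dst and outputs for x in [-filter_width, width):
//   dst[x] = sum( k in [0,filter_width) : src[x+k]*filter[k] )
// where src[i] counts as 0 outside [0,width). Each row therefore gains
// filter_width extra outputs on its left, which is why callers must leave
// filter_width floats of headroom before every row (stride>=4*(width+filter_width)).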
void xy_filter_sse_v0(float *dst, int width, int height, int stride, const float *filter, int filter_width)
{
#ifdef XY_FILTER_4
#  undef XY_FILTER_4
#endif
    ASSERT( stride>=4*(width+filter_width) );
    ASSERT( ((stride|(4*width)|(4*filter_width)|reinterpret_cast<int>(dst)|reinterpret_cast<int>(filter))&15)==0 );

    int xx_fix = width > filter_width ? 0 : filter_width - width;
    const float *filter_start = filter;
    BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
    BYTE* end = dst_byte + height*stride;
    for( ; dst_byte<end; dst_byte+=stride )
    {
        float *dst_f = reinterpret_cast<float*>(dst_byte);
        float *dst2 = dst_f - filter_width;
        float *dst_endr = dst_f + width;
        float *dst_end0 = dst_endr - filter_width;
        float *dst_endl = dst_f - xx_fix;
        ASSERT(xx_fix==0 || dst_end0==dst_endl);

        ASSERT(filter_start == filter);
        filter_start += filter_width;
        const float *filter_end = filter_start;

        for (;dst2<dst_endl;dst2+=4)//left margin
        {
            const float *src = dst_f;
            filter_start -= 4;

            //filter 4
            __m128 src4 = _mm_setzero_ps();/*1 2 3 4*/
            __m128 sum = _mm_setzero_ps();
            for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
            {
                __m128 src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
                __m128 f4 = _mm_load_ps(f);

#define XY_FILTER_4(src4, src_5_8, f4, sum) \
    __m128 f4_1 = _mm_unpacklo_ps(f4,f4);\
    f4_1 = _mm_unpacklo_ps(f4_1, f4_1);\
    f4_1 = _mm_mul_ps(f4_1, src4);\
    sum = _mm_add_ps(sum, f4_1);\
    __m128 src_3_6 = _mm_shuffle_ps(src4, src_5_8, _MM_SHUFFLE(1,0,3,2));/*3 4 5 6*/\
    f4_1 = _mm_unpackhi_ps(f4,f4);\
    f4_1 = _mm_unpacklo_ps(f4_1,f4_1);\
    f4_1 = _mm_mul_ps(f4_1, src_3_6);\
    sum = _mm_add_ps(sum, f4_1);\
    src4 = _mm_shuffle_ps(src4, src_3_6, _MM_SHUFFLE(2,1,2,1));/*2 3 4 5*/\
    f4_1 = _mm_unpacklo_ps(f4,f4);\
    f4_1 = _mm_unpackhi_ps(f4_1, f4_1);\
    f4_1 = _mm_mul_ps(f4_1, src4);\
    sum = _mm_add_ps(sum, f4_1);\
    src_3_6 = _mm_shuffle_ps(src_3_6, src_5_8, _MM_SHUFFLE(2,1,2,1));/*4 5 6 7*/\
    f4_1 = _mm_unpackhi_ps(f4,f4);\
    f4_1 = _mm_unpackhi_ps(f4_1, f4_1);\
    f4_1 = _mm_mul_ps(f4_1, src_3_6);\
    sum = _mm_add_ps(sum, f4_1)

                { XY_FILTER_4(src4, src_5_8, f4, sum); }
                src4 = src_5_8;
            }
            //store result
            _mm_stream_ps(dst2, sum);
        }
        for (;dst2<dst_f;dst2+=4)//if width < filter_width
        {
            const float *src = dst_f;
            filter_start-=4;
            filter_end-=4;

            __m128 src4 = _mm_setzero_ps();/*1 2 3 4*/
            __m128 sum = _mm_setzero_ps();
            __m128 src_5_8, f4;
            for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
            {
                src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
                f4 = _mm_load_ps(f);

                { XY_FILTER_4(src4, src_5_8, f4, sum); }
                src4 = src_5_8;
            }
            src_5_8 = _mm_setzero_ps();
            f4 = _mm_load_ps(filter_end);
            { XY_FILTER_4(src4, src_5_8, f4, sum); }
            //store result
            _mm_stream_ps(dst2, sum);
        }
        ASSERT(filter_start == filter);
        for (;dst2<dst_end0;dst2+=4)
        {
            const float *src = dst2;

            //filter 4
            __m128 src4 = _mm_load_ps(src);/*1 2 3 4*/
            __m128 sum = _mm_setzero_ps();
            for(const float* f=filter_start;f<filter_end;f+=4)
            {
                src+=4;
                __m128 src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
                __m128 f4 = _mm_load_ps(f);

                { XY_FILTER_4(src4, src_5_8, f4, sum); }
                src4 = src_5_8;
            }
            //store result
            _mm_stream_ps(dst2, sum);
        }
        for (;dst2<dst_endr;dst2+=4)//right margin
        {
            const float *src = dst2;
            filter_end-=4;

            //filter 4
            __m128 src4 = _mm_load_ps(src);//1 2 3 4
            __m128 sum = _mm_setzero_ps();
            __m128 src_5_8, f4;
            for(const float* f=filter_start;f<filter_end;f+=4)
            {
                src+=4;
                src_5_8 = _mm_load_ps(src);//5 6 7 8
                f4 = _mm_load_ps(f);

                { XY_FILTER_4(src4, src_5_8, f4, sum); }

                src4 = src_5_8;
                //move new 4 in_n_out to old 4 in_n_out
            }
            src_5_8 = _mm_setzero_ps();
            f4 = _mm_load_ps(filter_end);
            { XY_FILTER_4(src4, src_5_8, f4, sum); }
            //store result
            _mm_stream_ps(dst2, sum);
        }
    }
#undef XY_FILTER_4
}
void xy_filter_sse_v1(float *dst, int width, int height, int stride, const float *filter, int filter_width)
{
#ifdef XY_FILTER_4
#  undef XY_FILTER_4
#endif
    ASSERT( stride>=4*(width+filter_width) );
    ASSERT( ((stride|(4*width)|(4*filter_width)|reinterpret_cast<int>(dst)|reinterpret_cast<int>(filter))&15)==0 );

    int xx_fix = width > filter_width ? 0 : filter_width - width;
    const float *filter_start = filter;
    BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
    BYTE* end = dst_byte + height*stride;
    for( ; dst_byte<end; dst_byte+=stride )
    {
        float *dst_f = reinterpret_cast<float*>(dst_byte);
        float *dst2 = dst_f - filter_width;
        float *dst_endr = dst_f + width;
        float *dst_end0 = dst_endr - filter_width;
        float *dst_endl = dst_f - xx_fix;
        ASSERT(xx_fix==0 || dst_end0==dst_endl);

        ASSERT(filter_start == filter);
        filter_start += filter_width;
        const float *filter_end = filter_start;

        for (;dst2<dst_endl;dst2+=4)//left margin
        {
            const float *src = dst_f;
            filter_start -= 4;

            //filter 4
            __m128 src4 = _mm_setzero_ps();/*1 2 3 4*/
            __m128 sum = _mm_setzero_ps();
            for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
            {
                __m128 src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
                __m128 f4 = _mm_load_ps(f);

#define XY_FILTER_4(src4, src_5_8, f4, sum) \
    __m128 f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(0,0,0,0));\
    f4_1 = _mm_mul_ps(f4_1, src4);\
    sum = _mm_add_ps(sum, f4_1);\
    __m128 src_3_6 = _mm_shuffle_ps(src4, src_5_8, _MM_SHUFFLE(1,0,3,2));/*3 4 5 6*/\
    f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(2,2,2,2));\
    f4_1 = _mm_mul_ps(f4_1, src_3_6);\
    sum = _mm_add_ps(sum, f4_1);\
    src4 = _mm_shuffle_ps(src4, src_3_6, _MM_SHUFFLE(2,1,2,1));/*2 3 4 5*/\
    f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(1,1,1,1));\
    f4_1 = _mm_mul_ps(f4_1, src4);\
    sum = _mm_add_ps(sum, f4_1);\
    src_3_6 = _mm_shuffle_ps(src_3_6, src_5_8, _MM_SHUFFLE(2,1,2,1));/*4 5 6 7*/\
    f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(3,3,3,3));\
    f4_1 = _mm_mul_ps(f4_1, src_3_6);\
    sum = _mm_add_ps(sum, f4_1)

                { XY_FILTER_4(src4, src_5_8, f4, sum); }
                src4 = src_5_8;
            }
            //store result
            _mm_stream_ps(dst2, sum);
        }
        for (;dst2<dst_f;dst2+=4)//if width < filter_width
        {
            const float *src = dst_f;
            filter_start-=4;
            filter_end-=4;

            __m128 src4 = _mm_setzero_ps();/*1 2 3 4*/
            __m128 sum = _mm_setzero_ps();
            __m128 src_5_8, f4;
            for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
            {
                src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
                f4 = _mm_load_ps(f);

                { XY_FILTER_4(src4, src_5_8, f4, sum); }
                src4 = src_5_8;
            }
            src_5_8 = _mm_setzero_ps();
            f4 = _mm_load_ps(filter_end);
            { XY_FILTER_4(src4, src_5_8, f4, sum); }
            //store result
            _mm_stream_ps(dst2, sum);
        }
        ASSERT(filter_start == filter);
        for (;dst2<dst_end0;dst2+=4)
        {
            const float *src = dst2;

            //filter 4
            __m128 src4 = _mm_load_ps(src);/*1 2 3 4*/
            __m128 sum = _mm_setzero_ps();
            for(const float* f=filter_start;f<filter_end;f+=4)
            {
                src+=4;
                __m128 src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
                __m128 f4 = _mm_load_ps(f);

                { XY_FILTER_4(src4, src_5_8, f4, sum); }
                src4 = src_5_8;
            }
            //store result
            _mm_stream_ps(dst2, sum);
        }
        for (;dst2<dst_endr;dst2+=4)//right margin
        {
            const float *src = dst2;
            filter_end-=4;

            //filter 4
            __m128 src4 = _mm_load_ps(src);//1 2 3 4
            __m128 sum = _mm_setzero_ps();
            __m128 src_5_8, f4;
            for(const float* f=filter_start;f<filter_end;f+=4)
            {
                src+=4;
                src_5_8 = _mm_load_ps(src);//5 6 7 8
                f4 = _mm_load_ps(f);

                { XY_FILTER_4(src4, src_5_8, f4, sum); }

                src4 = src_5_8;
                //move new 4 in_n_out to old 4 in_n_out
            }
            src_5_8 = _mm_setzero_ps();
            f4 = _mm_load_ps(filter_end);
            { XY_FILTER_4(src4, src_5_8, f4, sum); }
            //store result
            _mm_stream_ps(dst2, sum);
        }
    }
#undef XY_FILTER_4
}
void xy_filter_sse_v2(float *dst, int width, int height, int stride, const float *filter, int filter_width)
{
#ifdef XY_FILTER_4
#  undef XY_FILTER_4
#endif
    ASSERT( stride>=4*(width+filter_width) );
    ASSERT( ((stride|(4*width)|(4*filter_width)|reinterpret_cast<int>(dst)|reinterpret_cast<int>(filter))&15)==0 );

    int xx_fix = width > filter_width ? 0 : filter_width - width;
    const float *filter_start = filter;
    BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
    BYTE* end = dst_byte + height*stride;
    for( ; dst_byte<end; dst_byte+=stride )
    {
        float *dst_f = reinterpret_cast<float*>(dst_byte);
        float *dst2 = dst_f - filter_width;
        float *dst_endr = dst_f + width;
        float *dst_end0 = dst_endr - filter_width;
        float *dst_endl = dst_f - xx_fix;
        ASSERT(xx_fix==0 || dst_end0==dst_endl);

        ASSERT(filter_start == filter);
        filter_start += filter_width;
        const float *filter_end = filter_start;

        for (;dst2<dst_endl;dst2+=4)//left margin
        {
            const float *src = dst_f;
            filter_start -= 4;

            //filter 4
            __m128 src4 = _mm_setzero_ps();/*1 2 3 4*/
            __m128 sum = _mm_setzero_ps();
            for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
            {
                __m128 src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
                __m128 f4 = _mm_load_ps(f);

#define XY_FILTER_4(src4, src_5_8, f4, sum) \
    __m128 f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(0,0,0,0));\
    f4_1 = _mm_mul_ps(f4_1, src4);\
    sum = _mm_add_ps(sum, f4_1);\
    __m128 src_3_6 = _mm_shuffle_ps(src4, src_5_8, _MM_SHUFFLE(1,0,3,2));/*3 4 5 6*/\
    f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(2,2,2,2));\
    f4_1 = _mm_mul_ps(f4_1, src_3_6);\
    sum = _mm_add_ps(sum, f4_1);\
    src4 = _mm_shuffle_ps(src4, src_3_6, _MM_SHUFFLE(2,1,2,1));/*2 3 4 5*/\
    f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(1,1,1,1));\
    f4_1 = _mm_mul_ps(f4_1, src4);\
    sum = _mm_add_ps(sum, f4_1);\
    src_3_6 = _mm_shuffle_ps(src_3_6, src_5_8, _MM_SHUFFLE(2,1,2,1));/*4 5 6 7*/\
    f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(3,3,3,3));\
    f4_1 = _mm_mul_ps(f4_1, src_3_6);\
    sum = _mm_add_ps(sum, f4_1)

                { XY_FILTER_4(src4, src_5_8, f4, sum); }
                src4 = src_5_8;
            }
            //store result
            _mm_store_ps(dst2, sum);
        }
        for (;dst2<dst_f;dst2+=4)//if width < filter_width
        {
            const float *src = dst_f;
            filter_start-=4;
            filter_end-=4;

            __m128 src4 = _mm_setzero_ps();/*1 2 3 4*/
            __m128 sum = _mm_setzero_ps();
            __m128 src_5_8, f4;
            for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
            {
                src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
                f4 = _mm_load_ps(f);

                { XY_FILTER_4(src4, src_5_8, f4, sum); }
                src4 = src_5_8;
            }
            src_5_8 = _mm_setzero_ps();
            f4 = _mm_load_ps(filter_end);
            { XY_FILTER_4(src4, src_5_8, f4, sum); }
            //store result
            _mm_store_ps(dst2, sum);
        }
        ASSERT(filter_start == filter);
        for (;dst2<dst_end0;dst2+=4)
        {
            const float *src = dst2;

            //filter 4
            __m128 src4 = _mm_load_ps(src);/*1 2 3 4*/
            __m128 sum = _mm_setzero_ps();
            for(const float* f=filter_start;f<filter_end;f+=4)
            {
                src+=4;
                __m128 src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
                __m128 f4 = _mm_load_ps(f);

                { XY_FILTER_4(src4, src_5_8, f4, sum); }
                src4 = src_5_8;
            }
            //store result
            _mm_store_ps(dst2, sum);
        }
        for (;dst2<dst_endr;dst2+=4)//right margin
        {
            const float *src = dst2;
            filter_end-=4;

            //filter 4
            __m128 src4 = _mm_load_ps(src);//1 2 3 4
            __m128 sum = _mm_setzero_ps();
            __m128 src_5_8, f4;
            for(const float* f=filter_start;f<filter_end;f+=4)
            {
                src+=4;
                src_5_8 = _mm_load_ps(src);//5 6 7 8
                f4 = _mm_load_ps(f);

                { XY_FILTER_4(src4, src_5_8, f4, sum); }

                src4 = src_5_8;
                //move new 4 in_n_out to old 4 in_n_out
            }
            src_5_8 = _mm_setzero_ps();
            f4 = _mm_load_ps(filter_end);
            { XY_FILTER_4(src4, src_5_8, f4, sum); }
            //store result
            _mm_store_ps(dst2, sum);
        }
    }
#undef XY_FILTER_4
}
static __forceinline void xy_filter_4_inline(__m128& src4, const __m128& src_5_8, const __m128& f4, __m128& sum)
{
    __m128 f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(0,0,0,0));
    f4_1 = _mm_mul_ps(f4_1, src4);
    sum = _mm_add_ps(sum, f4_1);
    __m128 src_3_6 = _mm_shuffle_ps(src4, src_5_8, _MM_SHUFFLE(1,0,3,2));/*3 4 5 6*/
    f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(2,2,2,2));
    f4_1 = _mm_mul_ps(f4_1, src_3_6);
    sum = _mm_add_ps(sum, f4_1);
    src4 = _mm_shuffle_ps(src4, src_3_6, _MM_SHUFFLE(2,1,2,1));/*2 3 4 5*/
    f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(1,1,1,1));
    f4_1 = _mm_mul_ps(f4_1, src4);
    sum = _mm_add_ps(sum, f4_1);
    src_3_6 = _mm_shuffle_ps(src_3_6, src_5_8, _MM_SHUFFLE(2,1,2,1));/*4 5 6 7*/
    f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(3,3,3,3));
    f4_1 = _mm_mul_ps(f4_1, src_3_6);
    sum = _mm_add_ps(sum, f4_1);
}
void xy_filter_sse_v3(float *dst, int width, int height, int stride, const float *filter, int filter_width)
{
    ASSERT( stride>=4*(width+filter_width) );
    ASSERT( ((stride|(4*width)|(4*filter_width)|reinterpret_cast<int>(dst)|reinterpret_cast<int>(filter))&15)==0 );

    int xx_fix = width > filter_width ? 0 : filter_width - width;
    const float *filter_start = filter;
    BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
    BYTE* end = dst_byte + height*stride;
    for( ; dst_byte<end; dst_byte+=stride )
    {
        float *dst_f = reinterpret_cast<float*>(dst_byte);
        float *dst2 = dst_f - filter_width;
        float *dst_endr = dst_f + width;
        float *dst_end0 = dst_endr - filter_width;
        float *dst_endl = dst_f - xx_fix;
        ASSERT(xx_fix==0 || dst_end0==dst_endl);

        ASSERT(filter_start == filter);
        filter_start += filter_width;
        const float *filter_end = filter_start;

        for (;dst2<dst_endl;dst2+=4)//left margin
        {
            const float *src = dst_f;
            filter_start -= 4;

            //filter 4
            __m128 src4 = _mm_setzero_ps();/*1 2 3 4*/
            __m128 sum = _mm_setzero_ps();
            for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
            {
                __m128 src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
                __m128 f4 = _mm_load_ps(f);

                xy_filter_4_inline(src4, src_5_8, f4, sum);

                src4 = src_5_8;
            }
            //store result
            _mm_store_ps(dst2, sum);
        }
        for (;dst2<dst_f;dst2+=4)//if width < filter_width
        {
            const float *src = dst_f;
            filter_start-=4;
            filter_end-=4;

            __m128 src4 = _mm_setzero_ps();/*1 2 3 4*/
            __m128 sum = _mm_setzero_ps();
            __m128 src_5_8, f4;
            for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
            {
                src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
                f4 = _mm_load_ps(f);

                xy_filter_4_inline(src4, src_5_8, f4, sum);
                src4 = src_5_8;
            }
            src_5_8 = _mm_setzero_ps();
            f4 = _mm_load_ps(filter_end);
            xy_filter_4_inline(src4, src_5_8, f4, sum);
            //store result
            _mm_store_ps(dst2, sum);
        }
        ASSERT(filter_start == filter);
        for (;dst2<dst_end0;dst2+=4)
        {
            const float *src = dst2;

            //filter 4
            __m128 src4 = _mm_load_ps(src);/*1 2 3 4*/
            __m128 sum = _mm_setzero_ps();
            for(const float* f=filter_start;f<filter_end;f+=4)
            {
                src+=4;
                __m128 src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
                __m128 f4 = _mm_load_ps(f);

                xy_filter_4_inline(src4, src_5_8, f4, sum);
                src4 = src_5_8;
            }
            //store result
            _mm_store_ps(dst2, sum);
        }
        for (;dst2<dst_endr;dst2+=4)//right margin
        {
            const float *src = dst2;
            filter_end-=4;

            //filter 4
            __m128 src4 = _mm_load_ps(src);//1 2 3 4
            __m128 sum = _mm_setzero_ps();
            __m128 src_5_8, f4;
            for(const float* f=filter_start;f<filter_end;f+=4)
            {
                src+=4;
                src_5_8 = _mm_load_ps(src);//5 6 7 8
                f4 = _mm_load_ps(f);

                xy_filter_4_inline(src4, src_5_8, f4, sum);

                src4 = src_5_8;
                //move new 4 in_n_out to old 4 in_n_out
            }
            src_5_8 = _mm_setzero_ps();
            f4 = _mm_load_ps(filter_end);
            xy_filter_4_inline(src4, src_5_8, f4, sum);
            //store result
            _mm_store_ps(dst2, sum);
        }
    }
}
/****
 * The inline-function version (xy_filter_4_inline above) sometimes makes the
 * compiler generate poor code, hence this macro variant.
 *
 * @src4, @src_5_8, @f4, @sum : __m128
 * @src_5_8, @f4: const
 * @sum : output
 * @src4: undefined (clobbered)
 **/
#define XY_FILTER_4(src4, src_5_8, f4, sum) \
    __m128 f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(0,0,0,0));\
    f4_1 = _mm_mul_ps(f4_1, src4);\
    sum = _mm_add_ps(sum, f4_1);\
    __m128 src_3_6 = _mm_shuffle_ps(src4, src_5_8, _MM_SHUFFLE(1,0,3,2));/*3 4 5 6*/\
    f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(2,2,2,2));\
    f4_1 = _mm_mul_ps(f4_1, src_3_6);\
    sum = _mm_add_ps(sum, f4_1);\
    src4 = _mm_shuffle_ps(src4, src_3_6, _MM_SHUFFLE(2,1,2,1));/*2 3 4 5*/\
    f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(1,1,1,1));\
    f4_1 = _mm_mul_ps(f4_1, src4);\
    sum = _mm_add_ps(sum, f4_1);\
    src_3_6 = _mm_shuffle_ps(src_3_6, src_5_8, _MM_SHUFFLE(2,1,2,1));/*4 5 6 7*/\
    f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(3,3,3,3));\
    f4_1 = _mm_mul_ps(f4_1, src_3_6);\
    sum = _mm_add_ps(sum, f4_1)
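// Usage sketch (illustrative only, not called by any benchmark): one aligned
// block of 4 outputs from an 8-tap filter, with samples past src8[7] taken as
// 0. Both src8 and taps8 are assumed 16-byte aligned. Computes
//   out[i] = sum( k in [0,8) : src8[i+k]*taps8[k] ),  i = 0..3
static __forceinline __m128 xy_filter_4_example(const float *src8, const float *taps8)
{
    __m128 src4 = _mm_load_ps(src8);         /*samples 1 2 3 4*/
    __m128 src_5_8 = _mm_load_ps(src8+4);    /*samples 5 6 7 8*/
    __m128 sum = _mm_setzero_ps();
    __m128 f4 = _mm_load_ps(taps8);          /*taps 1 2 3 4*/
    { XY_FILTER_4(src4, src_5_8, f4, sum); } /*src4 is clobbered, as documented*/
    src4 = src_5_8;
    src_5_8 = _mm_setzero_ps();              /*no samples beyond the 8th*/
    f4 = _mm_load_ps(taps8+4);               /*taps 5 6 7 8*/
    { XY_FILTER_4(src4, src_5_8, f4, sum); }
    return sum;
}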
__forceinline void xy_filter_one_line_sse_v4(float *dst, int width, const float *filter, int filter_width)
{
    int xx_fix = width > filter_width ? 0 : filter_width - width;
    const float *filter_start = filter;
    float *dst2 = dst - filter_width;
    float *dst_endr = dst + width;
    float *dst_end0 = dst_endr - filter_width;
    float *dst_endl = dst - xx_fix;
    ASSERT(xx_fix==0 || dst_end0==dst_endl);

    ASSERT(filter_start == filter);
    filter_start += filter_width;
    const float *filter_end = filter_start;

    for (;dst2<dst_endl;dst2+=4)//left margin
    {
        const float *src = dst;
        filter_start -= 4;

        //filter 4
        __m128 src4 = _mm_setzero_ps();/*1 2 3 4*/
        __m128 sum = _mm_setzero_ps();
        for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
        {
            __m128 src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
            __m128 f4 = _mm_load_ps(f);

            { XY_FILTER_4(src4, src_5_8, f4, sum); }

            src4 = src_5_8;
        }
        //store result
        _mm_store_ps(dst2, sum);
    }
    for (;dst2<dst;dst2+=4)//if width < filter_width
    {
        const float *src = dst;
        filter_start-=4;
        filter_end-=4;

        __m128 src4 = _mm_setzero_ps();/*1 2 3 4*/
        __m128 sum = _mm_setzero_ps();
        __m128 src_5_8, f4;
        for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
        {
            src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
            f4 = _mm_load_ps(f);

            { XY_FILTER_4(src4, src_5_8, f4, sum); }
            src4 = src_5_8;
        }
        src_5_8 = _mm_setzero_ps();
        f4 = _mm_load_ps(filter_end);
        { XY_FILTER_4(src4, src_5_8, f4, sum); }
        //store result
        _mm_store_ps(dst2, sum);
    }
    ASSERT(filter_start == filter);
    for (;dst2<dst_end0;dst2+=4)
    {
        const float *src = dst2;

        //filter 4
        __m128 src4 = _mm_load_ps(src);/*1 2 3 4*/
        __m128 sum = _mm_setzero_ps();
        for(const float* f=filter_start;f<filter_end;f+=4)
        {
            src+=4;
            __m128 src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
            __m128 f4 = _mm_load_ps(f);

            { XY_FILTER_4(src4, src_5_8, f4, sum); }
            src4 = src_5_8;
        }
        //store result
        _mm_store_ps(dst2, sum);
    }
    for (;dst2<dst_endr;dst2+=4)//right margin
    {
        const float *src = dst2;
        filter_end-=4;

        //filter 4
        __m128 src4 = _mm_load_ps(src);//1 2 3 4
        __m128 sum = _mm_setzero_ps();
        __m128 src_5_8, f4;
        for(const float* f=filter_start;f<filter_end;f+=4)
        {
            src+=4;
            src_5_8 = _mm_load_ps(src);//5 6 7 8
            f4 = _mm_load_ps(f);

            { XY_FILTER_4(src4, src_5_8, f4, sum); }

            src4 = src_5_8;
            //move new 4 in_n_out to old 4 in_n_out
        }
        src_5_8 = _mm_setzero_ps();
        f4 = _mm_load_ps(filter_end);
        { XY_FILTER_4(src4, src_5_8, f4, sum); }
        //store result
        _mm_store_ps(dst2, sum);
    }
}
void xy_filter_sse_v4(float *dst, int width, int height, int stride, const float *filter, int filter_width)
{
    ASSERT( stride>=4*(width+filter_width) );
    ASSERT( ((stride|(4*width)|(4*filter_width)|reinterpret_cast<int>(dst)|reinterpret_cast<int>(filter))&15)==0 );

    BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
    BYTE* end = dst_byte + height*stride;
    for( ; dst_byte<end; dst_byte+=stride )
    {
        xy_filter_one_line_sse_v4(reinterpret_cast<float*>(dst_byte), width, filter, filter_width);
    }
}
// v5
template<int LENGTH>
struct M128s
{
    __m128 x;
    M128s<LENGTH - 4> next;

    template<int Index> __forceinline __m128& GetAt()
    {
        return next.GetAt<Index - 4>();
    }
    template<> __forceinline __m128& GetAt<0>()
    {
        return x;
    }

    template<int Start, int Offset> __forceinline __m128& GetAt()
    {
        return GetAt<Start + Offset>();
    }

    __forceinline void Load(const float* src)
    {
        x = _mm_load_ps(src);
        next.Load(src+4);
    }
};

template<>
struct M128s<0>
{
    void Load(const float* src)
    {
    }
};
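// Illustration (assumed reading, not used by the tests): M128s<8> flattens to
// two __m128 members; after Load(filter) the first holds filter[0..3] and the
// second filter[4..7]. GetAt<0>() and GetAt<4>() resolve to them at compile time.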
template<int FILTER_LENGTH, int START, int LENGTH>
struct Filter4
{
    static __forceinline void do_cal(__m128& src0_128, const float * src4, M128s<FILTER_LENGTH>& filter128s, __m128& sum)
    {
        Filter4<FILTER_LENGTH,START,LENGTH-4>::do_cal(src0_128, src4, filter128s, sum);
        __m128 src4_128 = _mm_load_ps(src4+LENGTH-4);
        XY_FILTER_4(src0_128, src4_128, filter128s.GetAt<START+LENGTH-4>(), sum);
        src0_128 = src4_128;
    }
};

template<int FILTER_LENGTH, int START>
struct Filter4<FILTER_LENGTH, START, 0>
{
    static __forceinline void do_cal(__m128& src0_128, const float * src4, M128s<FILTER_LENGTH>& filter128s, __m128& sum)
    {
    }
};
template<int FILTER_LENGTH,int MARGIN_LENGTH>
struct FilterAllMargin
{
    static __forceinline void cal_left_margin(float * src, M128s<FILTER_LENGTH>& filter128s)
    {
        //filter 4
        __m128 src0 = _mm_setzero_ps();
        __m128 sum = _mm_setzero_ps();
        Filter4<FILTER_LENGTH,MARGIN_LENGTH-4,FILTER_LENGTH-MARGIN_LENGTH+4>::do_cal(src0, src, filter128s, sum);
        _mm_store_ps(src-MARGIN_LENGTH, sum);
        FilterAllMargin<FILTER_LENGTH,MARGIN_LENGTH-4>::cal_left_margin(src, filter128s);
    }

    static __forceinline void cal_right_margin(float * src, M128s<FILTER_LENGTH>& filter128s)
    {
        //filter 4
        {
            __m128 src0 = _mm_load_ps(src);
            __m128 sum = _mm_setzero_ps();
            Filter4<FILTER_LENGTH,0,MARGIN_LENGTH-4>::do_cal(src0, src+4, filter128s, sum);
            __m128 src4 = _mm_setzero_ps();
            XY_FILTER_4(src0, src4, filter128s.GetAt<MARGIN_LENGTH-4>(), sum);
            //store result
            _mm_store_ps(src, sum);
        }
        FilterAllMargin<FILTER_LENGTH,MARGIN_LENGTH-4>::cal_right_margin(src+4, filter128s);
    }
};

template<int FILTER_LENGTH>
struct FilterAllMargin<FILTER_LENGTH,0>
{
    static __forceinline void cal_left_margin(float * src, M128s<FILTER_LENGTH>& filter128s)
    {
    }
    static __forceinline void cal_right_margin(float * src, M128s<FILTER_LENGTH>& filter128s)
    {
    }
};
/****
 * See @xy_filter_c
 * Constrain:
 *   FILTER_LENGTH%4 == 0 && FILTER_LENGTH<=width
 **/
template<int FILTER_LENGTH>
void xy_filter_sse_template_v0(float *dst, int width, int height, int stride, const float *filter)
{
    ASSERT( stride>=4*(width+FILTER_LENGTH) );
    ASSERT( ((stride|(4*width)|(4*FILTER_LENGTH)|reinterpret_cast<int>(dst)|reinterpret_cast<int>(filter))&15)==0 );

    M128s<FILTER_LENGTH> filter128s;
    filter128s.Load(filter);

    const float *filter_start = filter;
    BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
    BYTE* end = dst_byte + height*stride;
    for( ; dst_byte<end; dst_byte+=stride )
    {
        float *dst2 = reinterpret_cast<float*>(dst_byte);

        //left margin
        FilterAllMargin<FILTER_LENGTH,FILTER_LENGTH>::cal_left_margin(dst2, filter128s);
        float *dst_end1 = dst2 + width;
        float *dst_end0 = dst_end1 - FILTER_LENGTH;
        for (;dst2<dst_end0;dst2+=4)
        {
            const float *src = dst2;

            //filter 4
            __m128 src0 = _mm_load_ps(src);/*1 2 3 4*/
            src += 4;
            __m128 sum = _mm_setzero_ps();
            Filter4<FILTER_LENGTH,0,FILTER_LENGTH>::do_cal(src0, src, filter128s, sum);
            //store result
            _mm_store_ps(dst2, sum);
        }
        FilterAllMargin<FILTER_LENGTH,FILTER_LENGTH>::cal_right_margin(dst2, filter128s);
    }
}
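// The point of the template: FILTER_LENGTH is a compile-time constant, so the
// margin recursions above unroll completely and the filter coefficients stay
// in registers. xy_filter_sse_v5 below picks the matching instantiation.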
void xy_filter_sse_v5(float *dst, int width, int height, int stride, const float *filter, int filter_width)
{
    typedef void (*Filter)(float *dst, int width, int height, int stride, const float *filter);
    const Filter filters[] = { xy_filter_sse_template_v0<4>, xy_filter_sse_template_v0<8>, xy_filter_sse_template_v0<12>, xy_filter_sse_template_v0<16>,
        xy_filter_sse_template_v0<20>, xy_filter_sse_template_v0<24>, xy_filter_sse_template_v0<28>
    };
    if (filter_width<=28 && filter_width<=width)
    {
        ASSERT(filter_width%4==0);
        filters[(filter_width-1)/4](dst, width, height, stride, filter);
    }
    else
    {
        xy_filter_sse_v4(dst, width, height, stride, filter, filter_width);
    }
}
// v5 end
__forceinline void xy_filter_one_line_sse_v6(float *dst, int width, const float *filter, int filter_width)
{
    int xx_fix = width > filter_width ? 0 : filter_width - width;
    const float *filter_start = filter;
    float *dst2 = dst - filter_width;
    float *dst_endr = dst + width;
    float *dst_end0 = dst_endr - filter_width - 4;
    float *dst_endl = dst - xx_fix;
    ASSERT(xx_fix==0 || dst_end0==dst_endl-4);

    ASSERT(filter_start == filter);
    filter_start += filter_width;
    const float *filter_end = filter_start;

    for (;dst2<dst_endl;dst2+=4)//left margin
    {
        const float *src = dst;
        filter_start -= 4;

        //filter 4
        __m128 src4 = _mm_setzero_ps();/*1 2 3 4*/
        __m128 sum = _mm_setzero_ps();
        for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
        {
            __m128 src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
            __m128 f4 = _mm_load_ps(f);

            { XY_FILTER_4(src4, src_5_8, f4, sum); }

            src4 = src_5_8;
        }
        //store result
        _mm_store_ps(dst2, sum);
    }
    for (;dst2<dst;dst2+=4)//if width < filter_width
    {
        const float *src = dst;
        filter_start-=4;
        filter_end-=4;

        __m128 src4 = _mm_setzero_ps();/*1 2 3 4*/
        __m128 sum = _mm_setzero_ps();
        __m128 src_5_8, f4;
        for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
        {
            src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
            f4 = _mm_load_ps(f);

            { XY_FILTER_4(src4, src_5_8, f4, sum); }
            src4 = src_5_8;
        }
        src_5_8 = _mm_setzero_ps();
        f4 = _mm_load_ps(filter_end);
        { XY_FILTER_4(src4, src_5_8, f4, sum); }
        //store result
        _mm_store_ps(dst2, sum);
    }
    ASSERT(filter_start == filter);
    for (;dst2<dst_end0;dst2+=8)
    {
        const float *src = dst2;
        const float* f=filter_start;

        //filter 4
        __m128 src4 = _mm_load_ps(src);/*1 2 3 4*/
        src+=4;
        __m128 src_5_8;
        __m128 sum = _mm_setzero_ps();
        __m128 sum2 = _mm_setzero_ps();
        __m128 f4 = _mm_load_ps(f);
        f+=4;
        src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
        src+=4;
        { XY_FILTER_4(src4, src_5_8, f4, sum); }
        for(;f<filter_end;f+=4,src+=4)
        {
            src4 = _mm_load_ps(src);/*1 2 3 4*/
            __m128 tmp = src_5_8;//important!
            { XY_FILTER_4(tmp, src4, f4, sum2); }

            f4 = _mm_load_ps(f);
            { XY_FILTER_4(src_5_8, src4, f4, sum); }
            src_5_8 = src4;
        }
        src4 = _mm_load_ps(src);/*1 2 3 4*/
        { XY_FILTER_4(src_5_8, src4, f4, sum2); }

        //store result
        _mm_store_ps(dst2, sum);
        _mm_store_ps(dst2+4, sum2);
    }
    if (dst2==dst_end0)
    {
        const float *src = dst2;
        //filter 4
        __m128 src4 = _mm_load_ps(src);//1 2 3 4
        src+=4;
        __m128 sum = _mm_setzero_ps();
        for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
        {
            __m128 src_5_8 = _mm_load_ps(src);//5 6 7 8
            __m128 f4 = _mm_load_ps(f);

            { XY_FILTER_4(src4, src_5_8, f4, sum); }
            src4 = src_5_8;
        }
        //store result
        _mm_store_ps(dst2, sum);
        dst2+=4;
    }
    for (;dst2<dst_endr;dst2+=4)//right margin
    {
        const float *src = dst2;
        filter_end-=4;

        //filter 4
        __m128 src4 = _mm_load_ps(src);//1 2 3 4
        __m128 sum = _mm_setzero_ps();
        __m128 src_5_8, f4;
        for(const float* f=filter_start;f<filter_end;f+=4)
        {
            src+=4;
            src_5_8 = _mm_load_ps(src);//5 6 7 8
            f4 = _mm_load_ps(f);

            { XY_FILTER_4(src4, src_5_8, f4, sum); }

            src4 = src_5_8;
            //move new 4 in_n_out to old 4 in_n_out
        }
        src_5_8 = _mm_setzero_ps();
        f4 = _mm_load_ps(filter_end);
        { XY_FILTER_4(src4, src_5_8, f4, sum); }
        //store result
        _mm_store_ps(dst2, sum);
    }
}

void xy_filter_sse_v6(float *dst, int width, int height, int stride, const float *filter, int filter_width)
{
    ASSERT( stride>=4*(width+filter_width) );
    ASSERT( ((stride|(4*width)|(4*filter_width)|reinterpret_cast<int>(dst)|reinterpret_cast<int>(filter))&15)==0 );

    BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
    BYTE* end = dst_byte + height*stride;
    for( ; dst_byte<end; dst_byte+=stride )
    {
        xy_filter_one_line_sse_v6(reinterpret_cast<float*>(dst_byte), width, filter, filter_width);
    }
}
// v7

// ref: "Comparing floating point numbers" by Bruce Dawson
// http://www.cygnus-software.com/papers/comparingfloats/comparingfloats.htm
bool AlmostEqual(float A, float B, int maxUlps=0)
{
    // Make sure maxUlps is non-negative and small enough that the
    // default NAN won't compare as equal to anything.
    // (>=0 rather than >0: callers rely on the default maxUlps==0,
    // which demands bit-exact equality.)
    ASSERT(maxUlps >= 0 && maxUlps < 4 * 1024 * 1024);
    int aInt = *(int*)&A;
    // Make aInt lexicographically ordered as a twos-complement int
    if (aInt < 0)
        aInt = 0x80000000 - aInt;
    // Make bInt lexicographically ordered as a twos-complement int
    int bInt = *(int*)&B;
    if (bInt < 0)
        bInt = 0x80000000 - bInt;

    int intDiff = abs(aInt - bInt);
    if (intDiff <= maxUlps)
        return true;

    return false;
}
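// Note (illustrative): the tail-zero scans in xy_filter_sse_v7/v8 call
// AlmostEqual(filter[tmp-1], 0.0f) with the default maxUlps==0, so only a
// bit-exact 0.0f (or -0.0f, which maps to the same ordered value) counts as zero.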
/****
 * @src4, @f4_1, @sum : __m128
 * @f4_1: const
 * @sum : output
 * @src4: undefined (clobbered)
 **/
#define XY_FILTER_4_1(src4, f4_1, sum) \
    src4 = _mm_mul_ps(src4, f4_1);\
    sum = _mm_add_ps(sum, src4);
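// XY_FILTER_4_1 handles a single leftover tap: f4_1 carries one coefficient
// broadcast to all four lanes (see M128s_V1<1>::Load below), so this is just
// sum += src4*coef for four outputs at once.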
/****
 * Constrain:
 *   LENGTH%4 == 0 || LENGTH%4 == 1
 **/
template<int LENGTH>
struct M128s_V1
{
    __m128 x;
    M128s_V1<LENGTH - 4> next;

    template<int Index> __forceinline __m128& GetAt()
    {
        return next.GetAt<Index - 4>();
    }
    template<> __forceinline __m128& GetAt<0>()
    {
        return x;
    }

    template<int Start, int Offset> __forceinline __m128& GetAt()
    {
        return GetAt<Start + Offset>();
    }

    __forceinline void Load(const float* src)
    {
        x = _mm_load_ps(src);
        next.Load(src+4);
    }
};

template<>
struct M128s_V1<1>
{
    __m128 x;

    template<int Index> __forceinline __m128& GetAt()
    {
        return x;
    }
    __forceinline void Load(const float* src)
    {
        x = _mm_set1_ps(*src);
    }
};

template<>
struct M128s_V1<0>
{
    void Load(const float* src)
    {
    }
};
template<int FILTER_LENGTH, int START, int LENGTH>
struct Filter4_V1
{
    static __forceinline void do_cal(__m128& src0_128, const float * src4, M128s_V1<FILTER_LENGTH>& filter128s, __m128& sum)
    {
        __m128 src4_128 = _mm_load_ps(src4);
        { XY_FILTER_4(src0_128, src4_128, filter128s.GetAt<START>(), sum); }
        Filter4_V1<FILTER_LENGTH,START+4,LENGTH-4>::do_cal(src4_128, src4+4, filter128s, sum);
        src0_128 = src4_128;
    }
};

template<int FILTER_LENGTH, int START>
struct Filter4_V1<FILTER_LENGTH, START, 1>
{
    static __forceinline void do_cal(__m128& src0_128, const float * src4, M128s_V1<FILTER_LENGTH>& filter128s, __m128& sum)
    {
        cal_tail<FILTER_LENGTH-START>(src0_128, src4, filter128s, sum);
    }
    template<int TAIL>
    static __forceinline void cal_tail(__m128& src0_128, const float * src4, M128s_V1<FILTER_LENGTH>& filter128s, __m128& sum)
    {
    }
    template<>
    static __forceinline void cal_tail<1>(__m128& src0_128, const float * src4, M128s_V1<FILTER_LENGTH>& filter128s, __m128& sum)
    {
        { XY_FILTER_4_1(src0_128, filter128s.GetAt<FILTER_LENGTH-1>(), sum); }
    }
};

template<int FILTER_LENGTH, int START>
struct Filter4_V1<FILTER_LENGTH, START, 0>
{
    static __forceinline void do_cal(__m128& src0_128, const float * src4, M128s_V1<FILTER_LENGTH>& filter128s, __m128& sum)
    {
    }
};
template<int FILTER_LENGTH,int MARGIN_LENGTH>
struct FilterAllLeftMargin_V1
{
    static __forceinline void cal(float * src, M128s_V1<FILTER_LENGTH>& filter128s)
    {
        do_cal<FILTER_LENGTH%4>(src, filter128s);
    }
    template<int FILTER_TAIL>
    static __forceinline void do_cal(float * src, M128s_V1<FILTER_LENGTH>& filter128s)
    {
        //filter 4
        __m128 src0 = _mm_setzero_ps();
        __m128 sum = _mm_setzero_ps();
        Filter4_V1<FILTER_LENGTH,MARGIN_LENGTH-4,FILTER_LENGTH-MARGIN_LENGTH+4>::do_cal(src0, src, filter128s, sum);
        _mm_store_ps(src-MARGIN_LENGTH, sum);
        FilterAllLeftMargin_V1<FILTER_LENGTH,MARGIN_LENGTH-4>::do_cal<0>(src, filter128s);
    }
    template<>
    static __forceinline void do_cal<1>(float * src, M128s_V1<FILTER_LENGTH>& filter128s)
    {
        //filter 4
        __m128 sum = _mm_setzero_ps();
        //Only one of the last 4 filter coefficients is non-zero
        _mm_store_ps(src-MARGIN_LENGTH, sum);
        FilterAllLeftMargin_V1<FILTER_LENGTH,MARGIN_LENGTH-4>::do_cal<0>(src, filter128s);
    }
};

template<int FILTER_LENGTH, int MARGIN_LENGTH>
struct FilterAllRightMargin_V1
{
    static __forceinline void cal(float * src, M128s_V1<FILTER_LENGTH>& filter128s)
    {
        do_cal<FILTER_LENGTH%4>(src, filter128s);
    }
    template<int FILTER_TAIL>
    static __forceinline void do_cal(float * src, M128s_V1<FILTER_LENGTH>& filter128s)
    {
        //filter 4
        {
            __m128 src0 = _mm_load_ps(src);
            __m128 sum = _mm_setzero_ps();
            Filter4_V1<FILTER_LENGTH,0,MARGIN_LENGTH-4>::do_cal(src0, src+4, filter128s, sum);
            __m128 src4 = _mm_setzero_ps();
            { XY_FILTER_4(src0, src4, filter128s.GetAt<MARGIN_LENGTH-4>(), sum); }
            //store result
            _mm_store_ps(src, sum);
        }
        FilterAllRightMargin_V1<FILTER_LENGTH,MARGIN_LENGTH-4>::do_cal<0>(src+4, filter128s);
    }
    template<>
    static __forceinline void do_cal<1>(float * src, M128s_V1<FILTER_LENGTH>& filter128s)
    {
        //filter 4
        {
            __m128 src0 = _mm_load_ps(src);
            __m128 sum = _mm_setzero_ps();
            Filter4_V1<FILTER_LENGTH,0,MARGIN_LENGTH-4>::do_cal(src0, src+4, filter128s, sum);
            //Only one of the last 4 filter coefficients is non-zero
            { XY_FILTER_4_1(src0, filter128s.GetAt<MARGIN_LENGTH-4>(), sum); }
            //store result
            _mm_store_ps(src, sum);
        }
        FilterAllRightMargin_V1<FILTER_LENGTH,MARGIN_LENGTH-4>::do_cal<0>(src+4, filter128s);
    }
};

template<int FILTER_LENGTH>
struct FilterAllLeftMargin_V1<FILTER_LENGTH,0>
{
    template<int FILTER_TAIL>
    static __forceinline void do_cal(float * src, M128s_V1<FILTER_LENGTH>& filter128s)
    {
    }
};

template<int FILTER_LENGTH>
struct FilterAllRightMargin_V1<FILTER_LENGTH,0>
{
    template<int FILTER_TAIL>
    static __forceinline void do_cal(float * src, M128s_V1<FILTER_LENGTH>& filter128s)
    {
    }
};
/****
 * Equivalent to:
 *   xy_filter_c(float *dst, int width, int height, int stride, const float *filter, (FILTER_LENGTH+3)&~3 );
 * See @xy_filter_c
 * Constrain:
 *   FILTER_LENGTH<=width && width%4==0
 **/
template<int FILTER_LENGTH>
void xy_filter_sse_template_v1(float *dst, int width, int height, int stride, const float *filter)
{
    ASSERT( stride>=4*(width+FILTER_LENGTH) );
    ASSERT( ((stride|(4*width)|(4*FILTER_LENGTH)|reinterpret_cast<int>(dst)|reinterpret_cast<int>(filter))&15)==0 );

    M128s_V1<FILTER_LENGTH> filter128s;
    filter128s.Load(filter);

    const float *filter_start = filter;
    BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
    BYTE* end = dst_byte + height*stride;
    for( ; dst_byte<end; dst_byte+=stride )
    {
        float *dst2 = reinterpret_cast<float*>(dst_byte);

        //left margin
        FilterAllLeftMargin_V1<FILTER_LENGTH,((FILTER_LENGTH+3)&~3)>::cal(dst2, filter128s);
        float *dst_end1 = dst2 + width;
        float *dst_end0 = dst_end1 - ((FILTER_LENGTH+3)&~3);
        for (;dst2<dst_end0;dst2+=4)
        {
            const float *src = dst2;

            //filter 4
            __m128 src0 = _mm_load_ps(src);/*1 2 3 4*/
            src += 4;
            __m128 sum = _mm_setzero_ps();
            Filter4_V1<FILTER_LENGTH,0,FILTER_LENGTH>::do_cal(src0, src, filter128s, sum);
            //store result
            _mm_store_ps(dst2, sum);
        }
        FilterAllRightMargin_V1<FILTER_LENGTH,((FILTER_LENGTH+3)&~3)>::cal(dst2, filter128s);
    }
}
/****
 * See @xy_filter_c
 **/
void xy_filter_sse_v7(float *dst, int width, int height, int stride, const float *filter, int filter_width)
{
    typedef void (*Filter)(float *dst, int width, int height, int stride, const float *filter);
    const Filter filters[] = {
        NULL,
        xy_filter_sse_template_v1<1>, xy_filter_sse_template_v1<4>, xy_filter_sse_template_v1<4>, xy_filter_sse_template_v1<4>,
        xy_filter_sse_template_v1<5>, xy_filter_sse_template_v1<8>, xy_filter_sse_template_v1<8>, xy_filter_sse_template_v1<8>,
        xy_filter_sse_template_v1<9>, xy_filter_sse_template_v1<12>, xy_filter_sse_template_v1<12>, xy_filter_sse_template_v1<12>,
        xy_filter_sse_template_v1<13>, xy_filter_sse_template_v1<16>, xy_filter_sse_template_v1<16>, xy_filter_sse_template_v1<16>,
        xy_filter_sse_template_v1<17>, xy_filter_sse_template_v1<20>, xy_filter_sse_template_v1<20>, xy_filter_sse_template_v1<20>,
        xy_filter_sse_template_v1<21>, xy_filter_sse_template_v1<24>, xy_filter_sse_template_v1<24>, xy_filter_sse_template_v1<24>,
        xy_filter_sse_template_v1<25>, xy_filter_sse_template_v1<28>, xy_filter_sse_template_v1<28>, xy_filter_sse_template_v1<28>
    };
    if (filter_width<=28 && filter_width<=width)
    {
        int tmp = filter_width;
        //Skip tail zeros, but at most 3 of them; the template versions cannot absorb more currently.
        while( AlmostEqual(filter[tmp-1],0.0f) && filter_width-tmp<3 )
            tmp--;
        filters[tmp](dst, width, height, stride, filter);
    }
    else
    {
        xy_filter_sse_v6(dst, width, height, stride, filter, filter_width);
    }
}
// v7 end
// v8

/****
 * @src4, @src_5_8, @f3_1, @f3_2, @sum: __m128
 * @src4, @src_5_8, @f3_1, @f3_2: const
 * @sum: output
 **/
#define XY_3_TAG_SYMMETRIC_FILTER_4(src4, src_5_8, f3_1, f3_2, sum) \
    __m128 src_3_6 = _mm_shuffle_ps(src4, src_5_8, _MM_SHUFFLE(1,0,3,2));/*3 4 5 6*/\
    __m128 src_2_5 = _mm_shuffle_ps(src4, src_3_6, _MM_SHUFFLE(2,1,2,1));/*2 3 4 5*/\
    sum = _mm_mul_ps(f3_1, src4);\
    __m128 mul2 = _mm_mul_ps(f3_2, src_2_5);\
    __m128 mul3 = _mm_mul_ps(f3_1, src_3_6);\
    sum = _mm_add_ps(sum, mul2);\
    sum = _mm_add_ps(sum, mul3);
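// Worked out per output lane i (src4 = src[0..3], src_5_8 = src[4..7],
// f3_1 = a broadcast, f3_2 = b broadcast):
//   sum[i] = a*src[i] + b*src[i+1] + a*src[i+2]
// i.e. a symmetric 3-tap convolution of 4 outputs in 3 multiplies instead of 4.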
/****
 * Equivalent to:
 *   xy_filter_c(float *dst, int width, int height, int stride, const float *filter, 4 );
 * See @xy_filter_c
 * Constrain:
 *   filter[3]==0 && filter[0]==filter[2] (symmetric) (&& sum(filter)==1)
 **/
void xy_3_tag_symmetric_filter_sse(float *dst, int width, int height, int stride, const float *filter)
{
    const int filter_width = 4;
    ASSERT( stride>=4*(width+filter_width) );
    ASSERT( ((stride|(4*width)|(4*filter_width)|reinterpret_cast<int>(dst)|reinterpret_cast<int>(filter))&15)==0 );

    ASSERT(filter_width==4 && filter[3]==0 && filter[2]==filter[0]);

    __m128 f3_1 = _mm_set1_ps(filter[0]);
    __m128 f3_2 = _mm_set1_ps(filter[1]);

    BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
    BYTE* end = dst_byte + height*stride;
    for( ; dst_byte<end; dst_byte+=stride )
    {
        float *dst_f = reinterpret_cast<float*>(dst_byte);
        float *dst2 = dst_f;

        float *dst_end0 = dst_f + width - 4;
        //filter 4
        __m128 src4 = _mm_load_ps(dst2);/*1 2 3 4*/
        {
            __m128 sum;
            __m128 src_4 = _mm_setzero_ps();
            { XY_3_TAG_SYMMETRIC_FILTER_4(src_4, src4, f3_1, f3_2, sum); }
            //left margin block: stored at dst2-4 so the in-place layout matches xy_filter_c
            _mm_store_ps(dst2-4, sum);
        }
        for (;dst2<dst_end0;dst2+=4)
        {
            __m128 sum;
            __m128 src_5_8 = _mm_load_ps(dst2+4);/*5 6 7 8*/
            { XY_3_TAG_SYMMETRIC_FILTER_4(src4, src_5_8, f3_1, f3_2, sum); }
            src4 = src_5_8;
            //store result
            _mm_store_ps(dst2, sum);
        }
        {
            __m128 sum;
            __m128 src_5_8 = _mm_setzero_ps();/*5 6 7 8*/
            { XY_3_TAG_SYMMETRIC_FILTER_4(src4, src_5_8, f3_1, f3_2, sum); }
            src4 = src_5_8;
            //store result
            _mm_store_ps(dst2, sum);
        }
    }
}

void xy_filter_sse_v8(float *dst, int width, int height, int stride, const float *filter, int filter_width)
{
    typedef void (*Filter)(float *dst, int width, int height, int stride, const float *filter);
    const Filter filters[] = {
        NULL,
        xy_filter_sse_template_v1<1>, xy_filter_sse_template_v1<4>, xy_filter_sse_template_v1<4>, xy_filter_sse_template_v1<4>,
        xy_filter_sse_template_v1<5>, xy_filter_sse_template_v1<8>, xy_filter_sse_template_v1<8>, xy_filter_sse_template_v1<8>,
        xy_filter_sse_template_v1<9>, xy_filter_sse_template_v1<12>, xy_filter_sse_template_v1<12>, xy_filter_sse_template_v1<12>,
        xy_filter_sse_template_v1<13>, xy_filter_sse_template_v1<16>, xy_filter_sse_template_v1<16>, xy_filter_sse_template_v1<16>,
        xy_filter_sse_template_v1<17>, xy_filter_sse_template_v1<20>, xy_filter_sse_template_v1<20>, xy_filter_sse_template_v1<20>,
        xy_filter_sse_template_v1<21>, xy_filter_sse_template_v1<24>, xy_filter_sse_template_v1<24>, xy_filter_sse_template_v1<24>,
        xy_filter_sse_template_v1<25>, xy_filter_sse_template_v1<28>, xy_filter_sse_template_v1<28>, xy_filter_sse_template_v1<28>
    };
    if (filter_width<=28 && filter_width<=width)
    {
        int tmp = filter_width;
        // Skip tail zeros, but we cannot (and don't have to) support more than 3 of them currently.
        while( AlmostEqual(filter[tmp-1],0.0f) && filter_width-tmp<3 )
            tmp--;
        if (tmp==3 && filter[0]==filter[2])
        {
            xy_3_tag_symmetric_filter_sse(dst, width, height, stride, filter);
        }
        else
        {
            filters[tmp](dst, width, height, stride, filter);
        }
    }
    else
    {
        xy_filter_sse_v6(dst, width, height, stride, filter, filter_width);
    }
}
// v8 end
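// Call sketch (illustrative; it mirrors what the FilterTest fixture below sets
// up). width and the padded filter length must be multiples of 4, and data and
// filter_f 16-byte aligned, to satisfy the asserts above:
//
//   int ex_w = (filter_width+3)&~3;                  //padded length actually passed
//   int stride = ((width+ex_w+3)&~3)*sizeof(float);  //bytes, multiple of 16
//   float *base = (float*)xy_malloc(height*stride + ex_w*sizeof(float));
//   float *data = base + ex_w;                       //row 0; headroom sits before it
//   //... fill data and the zero-padded filter_f ...
//   xy_filter_sse_v8(data, width, height, stride, filter_f, ex_w);
//   xy_free(base);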
class XyFilterTest : public ::testing::Test
{
public:
    static const int MAX_FILTER_LENGTH = 256;
    static const int MAX_DATA_BUFF_SIZE = 512*512;
    static const int MAX_BUFF_SIZE = MAX_DATA_BUFF_SIZE + 2*MAX_FILTER_LENGTH;

    float *buff_base;

    float *filter_f;

    float *data;
    int w, h, pitch;
    int filter_width;
    int ex_filter_width;

    XyFilterTest():buff_base(NULL),filter_f(NULL),data(NULL)
    {
        buff_base = (float*)xy_malloc(MAX_BUFF_SIZE*sizeof(float));
        data = buff_base + MAX_FILTER_LENGTH;
        filter_f = data + MAX_DATA_BUFF_SIZE;
    }
    ~XyFilterTest()
    {
        xy_free(buff_base); buff_base=NULL;
    }

    const XyFilterTest& copy (const XyFilterTest& rhs)
    {
        w = rhs.w;
        h = rhs.h;
        pitch = rhs.pitch;
        filter_width = rhs.filter_width;
        ex_filter_width = rhs.ex_filter_width;

        memcpy(buff_base, rhs.buff_base, MAX_BUFF_SIZE*sizeof(float));
        return *this;
    }

    void FillRandData(int w, int h, int filter_width)
    {
        this->w = w;
        this->h = h;
        this->filter_width = filter_width;
        this->ex_filter_width = ((filter_width+3)&~3);
        this->pitch = ((w+ex_filter_width+3)&~3)*sizeof(float);

        //pitch is in bytes while MAX_DATA_BUFF_SIZE counts floats, so this
        //bound is 4x conservative but safe
        ASSERT(this->pitch*this->h <= MAX_DATA_BUFF_SIZE);

        for (int i=0;i<h;i++)
        {
            float *data2 = (float*)((PUINT8)data + i*pitch);
            for (int j=0;j<w+filter_width;j++)
            {
                data2[j] = rand()&0xFF;
            }
        }

        //build a normalized, symmetric random filter, zero-padded to ex_filter_width
        float sum = 0;
        for (int i=0;i<filter_width/2+1;i++)
        {
            filter_f[i] = abs((rand()&0xFFFFF)+0xFFFFF);
            sum += filter_f[i];
        }
        for (int i=filter_width/2+1;i<filter_width;i++)
        {
            filter_f[i] = filter_f[filter_width-i-1];
            sum += filter_f[i];
        }
        for (int i=0;i<filter_width;i++)
        {
            filter_f[i] /= sum;
        }
        for (int i=filter_width;i<ex_filter_width;i++)
        {
            filter_f[i] = 0;
        }
    }
protected:
    virtual void SetUp()
    {
    }
private:
    XyFilterTest(const XyFilterTest&);
    const XyFilterTest& operator= (const XyFilterTest& rhs);
};
#define FilterTest(width, height, FILTER_LENGTH, loop_num, function) \
    TEST_F(XyFilterTest, function ## _ ## width ## _ ## height ## _ ## FILTER_LENGTH ## _ ## loop_num ) \
    {\
        FillRandData(width, height, FILTER_LENGTH);\
        for(int i=0;i<loop_num;i++)\
        {\
            function(data, w, h, pitch, filter_f, (FILTER_LENGTH+3)&~3);\
        }\
        /*ASSERT_EQ(data[-filter_width],0)*/;/*anti compiler optimization*/\
        ASSERT_EQ(0,0);\
    }

#define DUAL_TEST(width, height, FILTER_LENGTH, loop_num) \
    FilterTest(width, height, FILTER_LENGTH, loop_num, xy_filter_sse_v7) \
    FilterTest(width, height, FILTER_LENGTH, loop_num, xy_filter_sse_v8)

DUAL_TEST(128, 16, 3, 30000)
DUAL_TEST(256, 16, 3, 30000)
DUAL_TEST(512, 16, 3, 30000)
DUAL_TEST(128, 32, 3, 30000)
DUAL_TEST(256, 32, 3, 30000)
DUAL_TEST(512, 32, 3, 30000)
DUAL_TEST(128, 64, 3, 30000)
DUAL_TEST(256, 64, 3, 30000)
DUAL_TEST(512, 64, 3, 30000)

DUAL_TEST(128, 16, 20, 20000)
DUAL_TEST(256, 16, 20, 20000)
DUAL_TEST(512, 16, 20, 20000)
DUAL_TEST(128, 32, 20, 20000)
DUAL_TEST(256, 32, 20, 20000)
DUAL_TEST(512, 32, 20, 20000)
DUAL_TEST(128, 64, 20, 20000)
DUAL_TEST(256, 64, 20, 20000)
DUAL_TEST(512, 64, 20, 20000)

DUAL_TEST(128, 16, 21, 20000)
DUAL_TEST(256, 16, 21, 20000)
DUAL_TEST(512, 16, 21, 20000)
DUAL_TEST(128, 32, 21, 20000)
DUAL_TEST(256, 32, 21, 20000)
DUAL_TEST(512, 32, 21, 20000)
DUAL_TEST(128, 64, 21, 20000)
DUAL_TEST(256, 64, 21, 20000)
DUAL_TEST(512, 64, 21, 20000)
//////////////////////////////////////////////////////////////////////

// transpose test
void xy_float_2_float_transpose_c_v0(float *dst, int dst_width, int dst_stride,
    const float *src, int width, int height, int src_stride)
{
    ASSERT(dst_width >= height);
    PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
    const float* src_end = src + width;
    PCUINT8 src2_end = reinterpret_cast<PCUINT8>(src) + height*src_stride;
    for( ; src<src_end; src++, dst_byte+=dst_stride )
    {
        PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);

        float *dst2 = reinterpret_cast<float*>(dst_byte);
        for (;src2<src2_end;src2+=src_stride,dst2++)
        {
            *dst2 = *reinterpret_cast<const float*>(src2);
        }
        float *dst2_end = reinterpret_cast<float*>(dst_byte) + dst_width;
        for (;dst2<dst2_end;dst2++)
        {
            *dst2 = 0;
        }
    }
}
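// Semantics: a plain transpose with zero padding. For column j of src and row i,
//   ((float*)((PUINT8)dst + j*dst_stride))[i] = ((const float*)((PCUINT8)src + i*src_stride))[j]
// for j in [0,width), i in [0,height); entries [height,dst_width) of each dst
// row are cleared to 0. Both strides are in bytes.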
void xy_float_2_float_transpose_sse_v0(float *dst, int dst_width, int dst_stride,
    const float *src, int width, int height, int src_stride)
{
    typedef float DstT;
    typedef const float SrcT;

    ASSERT( (((int)dst|dst_stride)&15)==0 );
    ASSERT(dst_width >= height);
    PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
    SrcT* src_end = src + width;
    PCUINT8 src2_end1 = reinterpret_cast<PCUINT8>(src) + (height&~3)*src_stride;
    PCUINT8 src2_end2 = reinterpret_cast<PCUINT8>(src) + height*src_stride;
    for( ; src<src_end; src++, dst_byte+=dst_stride )
    {
        PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);

        DstT *dst2 = reinterpret_cast<DstT*>(dst_byte);
        for (;src2<src2_end1;src2+=4*src_stride,dst2+=4)
        {
            //gather 4 rows of the current column into one aligned store
            __m128 m1 = _mm_set_ps(
                *(SrcT*)(src2+3*src_stride),
                *(SrcT*)(src2+2*src_stride),
                *(SrcT*)(src2+src_stride),
                *(SrcT*)(src2));
            _mm_store_ps(dst2, m1);
        }
        for (;src2<src2_end2;src2+=src_stride,dst2++)
        {
            *dst2 = *reinterpret_cast<SrcT*>(src2);
        }
        float *dst2_end = reinterpret_cast<DstT*>(dst_byte) + dst_width;
        for (;dst2<dst2_end;dst2++)
        {
            *dst2 = 0;
        }
    }
}
void float_2_byte_transpose_v0(UINT8 *dst, int dst_width, int dst_stride,
    const float *src, int width, int height, int src_stride)
{
    ASSERT(dst_width >= height);
    PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
    const float* src_end = src + width;
    PCUINT8 src2_end = reinterpret_cast<PCUINT8>(src) + height*src_stride;
    for( ; src<src_end; src++, dst_byte+=dst_stride )
    {
        PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);

        UINT8 *dst2 = reinterpret_cast<UINT8*>(dst_byte);
        for (;src2<src2_end;src2+=src_stride,dst2++)
        {
            *dst2 = static_cast<UINT8>(*reinterpret_cast<const float*>(src2)+0.5);
        }
        UINT8 *dst2_end = reinterpret_cast<UINT8*>(dst_byte) + dst_width;
        for (;dst2<dst2_end;dst2++)
        {
            *dst2 = 0;
        }
    }
}
void float_2_byte_transpose_sse_v0(UINT8 *dst, int dst_width, int dst_stride,
    const float *src, int width, int height, int src_stride)
{
    typedef UINT8 DstT;
    typedef const float SrcT;

    ASSERT(dst_width >= height);
    PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
    SrcT* src_end = src + width;
    PCUINT8 src2_end00 = reinterpret_cast<PCUINT8>(src) + (height&~15)*src_stride;
    PCUINT8 src2_end = reinterpret_cast<PCUINT8>(src) + height*src_stride;
    for( ; src<src_end; src++, dst_byte+=dst_stride )
    {
        PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);

        DstT *dst2 = reinterpret_cast<DstT*>(dst_byte);
        for (;src2<src2_end00;src2+=16*src_stride,dst2+=16)
        {
            //gather 16 rows of the current column, then convert and pack down to bytes
            __m128 m1 = _mm_set_ps(
                *(SrcT*)(src2+3*src_stride),
                *(SrcT*)(src2+2*src_stride),
                *(SrcT*)(src2+src_stride),
                *(SrcT*)(src2));
            __m128 m2 = _mm_set_ps(
                *(SrcT*)(src2+7*src_stride),
                *(SrcT*)(src2+6*src_stride),
                *(SrcT*)(src2+5*src_stride),
                *(SrcT*)(src2+4*src_stride));
            __m128 m3 = _mm_set_ps(
                *(SrcT*)(src2+11*src_stride),
                *(SrcT*)(src2+10*src_stride),
                *(SrcT*)(src2+9*src_stride),
                *(SrcT*)(src2+8*src_stride));
            __m128 m4 = _mm_set_ps(
                *(SrcT*)(src2+15*src_stride),
                *(SrcT*)(src2+14*src_stride),
                *(SrcT*)(src2+13*src_stride),
                *(SrcT*)(src2+12*src_stride));

            __m128i i1 = _mm_cvtps_epi32(m1);
            __m128i i2 = _mm_cvtps_epi32(m2);
            __m128i i3 = _mm_cvtps_epi32(m3);
            __m128i i4 = _mm_cvtps_epi32(m4);

            i1 = _mm_packs_epi32(i1,i2);
            i3 = _mm_packs_epi32(i3,i4);
            i1 = _mm_packus_epi16(i1,i3);

            _mm_store_si128((__m128i*)dst2, i1);
        }
        for (;src2<src2_end;src2+=src_stride,dst2++)
        {
            *dst2 = static_cast<DstT>(*reinterpret_cast<SrcT*>(src2)+0.5);
        }
        DstT *dst2_end = reinterpret_cast<DstT*>(dst_byte) + dst_width;
        for (;dst2<dst2_end;dst2++)
        {
            *dst2 = 0;
        }
    }
}
void byte_2_float_c_v0(float *dst, int dst_width, int dst_stride,
    PCUINT8 src, int width, int height, int stride)
{
    PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);

    PCUINT8 src_end = src + height*stride;
    for( ; src<src_end; src+=stride, dst_byte+=dst_stride )
    {
        PCUINT8 src2 = src;
        PCUINT8 src2_end = src2 + width;
        float *dst2 = reinterpret_cast<float*>(dst_byte);

        for (;src2<src2_end;src2++, dst2++)
        {
            *dst2 = *src2;
        }
        float *dst2_end=reinterpret_cast<float*>(dst_byte)+dst_width;
        for (;dst2<dst2_end;dst2++)
        {
            *dst2=0;
        }
    }
}
void byte_2_float_sse_v0(float *dst, int dst_width, int dst_stride,
    PCUINT8 src, int width, int height, int stride)
{
    ASSERT( dst_width>=width );
    ASSERT( ((reinterpret_cast<int>(dst)|dst_stride)&15)==0 );
    PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);

    PCUINT8 src_end = src + height*stride;
    for( ; src<src_end; src+=stride, dst_byte+=dst_stride )
    {
        PCUINT8 src2 = src;
        PCUINT8 src2_end0 = src2 + (width&~15);
        float *dst2 = reinterpret_cast<float*>(dst_byte);

        for (;src2<src2_end0;src2+=16, dst2+=16)
        {
            //unpack 16 bytes into 16 floats
            __m128i src16 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src2));
            __m128i src16_lo = _mm_unpacklo_epi8(src16, _mm_setzero_si128());
            __m128i src16_hi = _mm_unpackhi_epi8(src16, _mm_setzero_si128());
            __m128i src16_lo_lo = _mm_unpacklo_epi8(src16_lo, _mm_setzero_si128());
            __m128i src16_lo_hi = _mm_unpackhi_epi8(src16_lo, _mm_setzero_si128());
            __m128i src16_hi_lo = _mm_unpacklo_epi8(src16_hi, _mm_setzero_si128());
            __m128i src16_hi_hi = _mm_unpackhi_epi8(src16_hi, _mm_setzero_si128());
            __m128 dst_f1 = _mm_cvtepi32_ps(src16_lo_lo);
            __m128 dst_f2 = _mm_cvtepi32_ps(src16_lo_hi);
            __m128 dst_f3 = _mm_cvtepi32_ps(src16_hi_lo);
            __m128 dst_f4 = _mm_cvtepi32_ps(src16_hi_hi);
            _mm_stream_ps(dst2, dst_f1);
            _mm_stream_ps(dst2+4, dst_f2);
            _mm_stream_ps(dst2+8, dst_f3);
            _mm_stream_ps(dst2+12, dst_f4);
        }
        PCUINT8 src2_end = src + width;
        for (;src2<src2_end;src2++,dst2++)
        {
            *dst2 = *src2;
        }
        float *dst2_end=reinterpret_cast<float*>(dst_byte)+dst_width;
        for (;dst2<dst2_end;dst2++)
        {
            *dst2=0;
        }
    }
}
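// byte_2_float_sse_v1 below performs exactly the same unpack/convert; the only
// difference is _mm_store_ps (cached stores) in place of _mm_stream_ps
// (non-temporal stores), which is what this pair of benchmarks compares.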
void byte_2_float_sse_v1(float *dst, int dst_width, int dst_stride,
    PCUINT8 src, int width, int height, int stride)
{
    ASSERT( dst_width>=width );
    ASSERT( ((reinterpret_cast<int>(dst)|dst_stride)&15)==0 );
    PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);

    PCUINT8 src_end = src + height*stride;
    for( ; src<src_end; src+=stride, dst_byte+=dst_stride )
    {
        PCUINT8 src2 = src;
        PCUINT8 src2_end0 = src2 + (width&~15);
        float *dst2 = reinterpret_cast<float*>(dst_byte);

        for (;src2<src2_end0;src2+=16, dst2+=16)
        {
            //unpack 16 bytes into 16 floats
            __m128i src16 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src2));
            __m128i src16_lo = _mm_unpacklo_epi8(src16, _mm_setzero_si128());
            __m128i src16_hi = _mm_unpackhi_epi8(src16, _mm_setzero_si128());
            __m128i src16_lo_lo = _mm_unpacklo_epi8(src16_lo, _mm_setzero_si128());
            __m128i src16_lo_hi = _mm_unpackhi_epi8(src16_lo, _mm_setzero_si128());
            __m128i src16_hi_lo = _mm_unpacklo_epi8(src16_hi, _mm_setzero_si128());
            __m128i src16_hi_hi = _mm_unpackhi_epi8(src16_hi, _mm_setzero_si128());
            __m128 dst_f1 = _mm_cvtepi32_ps(src16_lo_lo);
            __m128 dst_f2 = _mm_cvtepi32_ps(src16_lo_hi);
            __m128 dst_f3 = _mm_cvtepi32_ps(src16_hi_lo);
            __m128 dst_f4 = _mm_cvtepi32_ps(src16_hi_hi);
            _mm_store_ps(dst2, dst_f1);
            _mm_store_ps(dst2+4, dst_f2);
            _mm_store_ps(dst2+8, dst_f3);
            _mm_store_ps(dst2+12, dst_f4);
        }
        PCUINT8 src2_end = src + width;
        for (;src2<src2_end;src2++,dst2++)
        {
            *dst2 = *src2;
        }
        float *dst2_end=reinterpret_cast<float*>(dst_byte)+dst_width;
        for (;dst2<dst2_end;dst2++)
        {
            *dst2=0;
        }
    }
}
class XyTransposeTest : public ::testing::Test
{
public:
    static const int MAX_FILTER_LENGTH = 128;
    static const int MAX_DATA_BUFF_SIZE = 512*128;

    UINT8 data_byte_base[MAX_DATA_BUFF_SIZE+15];
    UINT8 *data_b;

    float data_base_f[2*MAX_DATA_BUFF_SIZE+3];
    float *data_f;
    float *data_f2;
    int w, h, pitch;

protected:
    virtual void SetUp()
    {
    }

    void FillRandData(int w, int h)
    {
        this->w = w;
        this->h = h;
        ASSERT(w%4==0);

        this->pitch = ((w+3)&~3)*sizeof(float);
        //pitch is in bytes, but data_f2 below is offset by h*pitch *floats*, so
        //both the source rows and the transposed output (up to w rows of
        //(w+15)&~3 bytes) must fit into the 2*MAX_DATA_BUFF_SIZE float buffer
        ASSERT(this->pitch*this->h + this->w*((this->w+15)&~15)/(int)sizeof(float) <= 2*MAX_DATA_BUFF_SIZE);

        data_f = data_base_f;
        if ((int)(data_f)&15)
        {
            data_f = (float*)(((int)data_f + 15) & ~15);
        }
        for (int i=0;i<h;i++)
        {
            float *data2 = (float*)((PUINT8)data_f + i*pitch);
            for (int j=0;j<w;j++)
            {
                data2[j] = rand()*0.1f/0xFF;
            }
        }
        data_f2 = data_f + h*pitch;

        data_b = data_byte_base;
        if ((int)(data_b)&15)
        {
            data_b = (UINT8*)(((int)data_b + 15) & ~15);
        }
    }
};
#define Float2FloatTest(width, height, loop_num, function) \
    TEST_F(XyTransposeTest, function ## _ ## width ## _ ## height ## _ ## loop_num ) \
    {\
        FillRandData(width, height);\
        for(int i=0;i<loop_num;i++)\
        {\
            function(data_f2, height, (width+15)&~15, data_f, width, height, pitch);\
        }\
        ASSERT_EQ(data_f2[0],data_f2[0]);\
    }

Float2FloatTest( 272, 80, 10000, xy_float_2_float_transpose_c_v0)
Float2FloatTest( 272, 80, 10000, xy_float_2_float_transpose_sse_v0)


#define Float2ByteTest(width, height, loop_num, function) \
    TEST_F(XyTransposeTest, function ## _ ## width ## _ ## height ## _ ## loop_num ) \
    {\
        FillRandData(width, height);\
        for(int i=0;i<loop_num;i++)\
        {\
            function(data_b, height, (width+15)&~15, data_f, width, height, pitch);\
        }\
        ASSERT_EQ(data_b[0],data_b[0]);\
    }

Float2ByteTest( 272, 80, 10000, float_2_byte_transpose_v0)
Float2ByteTest( 272, 80, 10000, float_2_byte_transpose_sse_v0)


#define Byte2FloatTest(dst_width, width, height, loop_num, function) \
    TEST_F(XyTransposeTest, function ## _ ## width ## _ ## height ## _ ## loop_num ) \
    {\
        FillRandData(width, height);\
        for(int i=0;i<loop_num;i++)\
        {\
            function(data_f, dst_width, (width+15)&~15, data_b, width, height, pitch);\
        }\
        ASSERT_EQ(data_b[0],data_b[0]);\
    }

Byte2FloatTest( 272, 256, 64, 10000, byte_2_float_c_v0)
Byte2FloatTest( 272, 256, 64, 10000, byte_2_float_sse_v0)
Byte2FloatTest( 272, 256, 64, 10000, byte_2_float_sse_v1)
#endif // __XY_FILTER_BENCHMARK_F0ED9F75_C0A7_4B0E_910E_7A82ECE2D7AD_H__