3 typedef const UINT8 CUINT8
, *PCUINT8
;
8 __forceinline
void xy_filter_one_line_c(float *dst
, int width
, const float *filter
, int filter_width
)
10 const float *filter_start
= filter
;
11 int xx_fix
= width
> filter_width
? 0 : filter_width
- width
;
12 float *dst2
= dst
- filter_width
;
13 float *dst_endr
= dst
+ width
;
14 float *dst_end0
= dst_endr
- filter_width
;
15 float *dst_endl
= dst
- xx_fix
;
16 ASSERT(xx_fix
==0 || dst_end0
==dst_endl
);
18 ASSERT(filter_start
== filter
);
19 filter_start
+= filter_width
;
20 const float *filter_end
= filter_start
;
21 for (;dst2
<dst_endl
;dst2
++, filter_start
--)//left margin
23 const float *src
= dst
;
25 for(const float* f
=filter_start
;f
<filter_end
;f
++, src
++)
31 for (;dst2
<dst
;dst2
++, filter_start
--, filter_end
--)//if width < filter_width
33 const float *src
= dst
;
35 for(const float* f
=filter_start
;f
<filter_end
;f
++, src
++)
41 ASSERT(filter_start
==filter
);
42 for (;dst2
<dst_end0
;dst2
++)
44 const float *src
= dst2
;
47 for(const float* f
=filter_start
;f
<filter_end
;f
++, src
++)
53 for (;dst2
<dst_endr
;dst2
++, filter_end
--)//right margin
55 const float *src
= dst2
;
57 for(const float* f
=filter
;f
<filter_end
;f
++, src
++)
67 * 1. Source content starts from @dst;
68 * 2. Output content starts from @dst-@mwitdth;
71 * | <- @dst-@mwidth --------| <- @dst+0 --------------------|
73 * |- margin -|- src width*height items -|
74 * |- mwidth*heigth items -|- -|
75 * |- do NOT need to init -|- -|
76 * |- when input -|- -|
77 * |---------------------------------------------------------|
79 void xy_filter_c(float *dst
, int width
, int height
, int stride
, const float *filter
, int filter_width
)
81 ASSERT( stride
>=4*(width
+filter_width
) );
82 BYTE
* end
= reinterpret_cast<BYTE
*>(dst
) + height
*stride
;
83 BYTE
* dst_byte
= reinterpret_cast<BYTE
*>(dst
);
84 for( ; dst_byte
<end
; dst_byte
+=stride
)
86 xy_filter_one_line_c(reinterpret_cast<float*>(dst_byte
), width
, filter
, filter_width
);
/***
 * Core 4-tap x 4-output SSE FIR step. Kept as a macro because an inline
 * function here sometimes generates poor code with some compilers.
 * @src4, @src_5_8, @f4, @sum : __m128
 * @src_5_8, @f4 : const
 * With src[0..7] = concat(src4, src_5_8), computes for i in [0,4):
 *   sum[i] += f4[0]*src[i] + f4[1]*src[i+1] + f4[2]*src[i+2] + f4[3]*src[i+3]
 * NOTE: declares locals f4_1/src_3_6 and overwrites @src4 — wrap every
 * invocation in its own braces: { XY_FILTER_4(...); }
 **/
#define XY_FILTER_4(src4, src_5_8, f4, sum) \
    __m128 f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(0,0,0,0));\
    f4_1 = _mm_mul_ps(f4_1, src4);\
    sum = _mm_add_ps(sum, f4_1);\
    __m128 src_3_6 = _mm_shuffle_ps(src4, src_5_8, _MM_SHUFFLE(1,0,3,2));/*3 4 5 6*/\
    f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(2,2,2,2));\
    f4_1 = _mm_mul_ps(f4_1, src_3_6);\
    sum = _mm_add_ps(sum, f4_1);\
    src4 = _mm_shuffle_ps(src4, src_3_6, _MM_SHUFFLE(2,1,2,1));/*2 3 4 5*/\
    f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(1,1,1,1));\
    f4_1 = _mm_mul_ps(f4_1, src4);\
    sum = _mm_add_ps(sum, f4_1);\
    src_3_6 = _mm_shuffle_ps(src_3_6, src_5_8, _MM_SHUFFLE(2,1,2,1));/*4 5 6 7*/\
    f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(3,3,3,3));\
    f4_1 = _mm_mul_ps(f4_1, src_3_6);\
    sum = _mm_add_ps(sum, f4_1)
116 __forceinline
void xy_filter_one_line_sse_v4(float *dst
, int width
, const float *filter
, int filter_width
)
118 int xx_fix
= width
> filter_width
? 0 : filter_width
- width
;
119 const float *filter_start
= filter
;
120 float *dst2
= dst
- filter_width
;
121 float *dst_endr
= dst
+ width
;
122 float *dst_end0
= dst_endr
- filter_width
;
123 float *dst_endl
= dst
- xx_fix
;
124 ASSERT(xx_fix
==0 || dst_end0
==dst_endl
);
126 ASSERT(filter_start
== filter
);
127 filter_start
+= filter_width
;
128 const float *filter_end
= filter_start
;
130 for (;dst2
<dst_endl
;dst2
+=4)//left margin
132 const float *src
= dst
;
136 __m128 src4
= _mm_setzero_ps();/*1 2 3 4*/
137 __m128 sum
= _mm_setzero_ps();
138 for(const float* f
=filter_start
;f
<filter_end
;f
+=4,src
+=4)
140 __m128 src_5_8
= _mm_load_ps(src
);/*5 6 7 8*/
141 __m128 f4
= _mm_load_ps(f
);
143 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
148 _mm_store_ps(dst2
, sum
);
150 for (;dst2
<dst
;dst2
+=4)//if width < filter_width
152 const float *src
= dst
;
156 __m128 src4
= _mm_setzero_ps();/*1 2 3 4*/
157 __m128 sum
= _mm_setzero_ps();
159 for(const float* f
=filter_start
;f
<filter_end
;f
+=4,src
+=4)
161 src_5_8
= _mm_load_ps(src
);/*5 6 7 8*/
164 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
167 src_5_8
= _mm_setzero_ps();
168 f4
= _mm_load_ps(filter_end
);
169 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
171 _mm_store_ps(dst2
, sum
);
173 ASSERT(filter_start
== filter
);
174 for (;dst2
<dst_end0
;dst2
+=4)
176 const float *src
= dst2
;
179 __m128 src4
= _mm_load_ps(src
);/*1 2 3 4*/
180 __m128 sum
= _mm_setzero_ps();
181 for(const float* f
=filter_start
;f
<filter_end
;f
+=4)
184 __m128 src_5_8
= _mm_load_ps(src
);/*5 6 7 8*/
185 __m128 f4
= _mm_load_ps(f
);
187 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
191 _mm_store_ps(dst2
, sum
);
193 for (;dst2
<dst_endr
;dst2
+=4)//right margin
195 const float *src
= dst2
;
199 __m128 src4
= _mm_load_ps(src
);//1 2 3 4
200 __m128 sum
= _mm_setzero_ps();
202 for(const float* f
=filter_start
;f
<filter_end
;f
+=4)
205 src_5_8
= _mm_load_ps(src
);//5 6 7 8
208 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
211 //move new 4 in_n_out to old 4 in_n_out
213 src_5_8
= _mm_setzero_ps();
214 f4
= _mm_load_ps(filter_end
);
215 { XY_FILTER_4(src4
, src_5_8
, f4
, sum
); }
217 _mm_store_ps(dst2
, sum
);
224 void xy_filter_sse_v4(float *dst
, int width
, int height
, int stride
, const float *filter
, int filter_width
)
226 ASSERT( stride
>=4*(width
+filter_width
) );
227 ASSERT( ((stride
|(4*width
)|(4*filter_width
)|reinterpret_cast<int>(dst
)|reinterpret_cast<int>(filter
))&15)==0 );
229 BYTE
* dst_byte
= reinterpret_cast<BYTE
*>(dst
);
230 BYTE
* end
= dst_byte
+ height
*stride
;
231 for( ; dst_byte
<end
; dst_byte
+=stride
)
233 xy_filter_one_line_sse_v4(reinterpret_cast<float*>(dst_byte
), width
, filter
, filter_width
);
241 M128s
<LENGTH
- 4> next
;
243 template<int Index
> __forceinline __m128
& GetAt()
245 return next
.GetAt
<Index
- 4>();
247 template<> __forceinline __m128
& GetAt
<0>()
252 template<int Start
, int Offset
> __forceinline __m128
& GetAt()
254 return GetAt
<Start
+ Offset
>();
257 __forceinline
void Load(const float* src
)
259 x
= _mm_load_ps(src
);
267 void Load(const float* src
)
272 template<int FILTER_LENGTH
, int START
, int LENGTH
>
275 static __forceinline
void do_cal(__m128
& src0_128
, const float * src4
, M128s
<FILTER_LENGTH
>& filter128s
, __m128
& sum
)
277 Filter4
<FILTER_LENGTH
,START
,LENGTH
-4>::do_cal(src0_128
, src4
, filter128s
, sum
);
278 __m128 src4_128
= _mm_load_ps(src4
+LENGTH
-4);
279 XY_FILTER_4(src0_128
, src4_128
, filter128s
.GetAt
<START
+LENGTH
-4>(), sum
);
284 template<int FILTER_LENGTH
, int START
>
285 struct Filter4
<FILTER_LENGTH
, START
, 0>
287 static __forceinline
void do_cal(__m128
& src0_128
, const float * src4
, M128s
<FILTER_LENGTH
>& filter128s
, __m128
& sum
)
292 template<int FILTER_LENGTH
,int MARGIN_LENGTH
>
293 struct FilterAllMargin
295 static __forceinline
void cal_left_margin(float * src
, M128s
<FILTER_LENGTH
>& filter128s
)
298 __m128 src0
= _mm_setzero_ps();
299 __m128 sum
= _mm_setzero_ps();
300 Filter4
<FILTER_LENGTH
,MARGIN_LENGTH
-4,FILTER_LENGTH
-MARGIN_LENGTH
+4>::do_cal(src0
, src
, filter128s
, sum
);
301 _mm_store_ps(src
-MARGIN_LENGTH
, sum
);
302 FilterAllMargin
<FILTER_LENGTH
,MARGIN_LENGTH
-4>::cal_left_margin(src
, filter128s
);
305 static __forceinline
void cal_right_margin(float * src
, M128s
<FILTER_LENGTH
>& filter128s
)
309 __m128 src0
= _mm_load_ps(src
);
310 __m128 sum
= _mm_setzero_ps();
311 Filter4
<FILTER_LENGTH
,0,MARGIN_LENGTH
-4>::do_cal(src0
, src
+4, filter128s
, sum
);
312 __m128 src4
= _mm_setzero_ps();
313 XY_FILTER_4(src0
, src4
, filter128s
.GetAt
<MARGIN_LENGTH
-4>(), sum
);
315 _mm_store_ps(src
, sum
);
317 FilterAllMargin
<FILTER_LENGTH
,MARGIN_LENGTH
-4>::cal_right_margin(src
+4, filter128s
);
321 template<int FILTER_LENGTH
>
322 struct FilterAllMargin
<FILTER_LENGTH
,0>
324 static __forceinline
void cal_left_margin(float * src
, M128s
<FILTER_LENGTH
>& filter128s
)
327 static __forceinline
void cal_right_margin(float * src
, M128s
<FILTER_LENGTH
>& filter128s
)
335 * FILTER_LENGTH%4 == 0 && FILTER_LENGTH<=width
337 template<int FILTER_LENGTH
>
338 void xy_filter_sse_template(float *dst
, int width
, int height
, int stride
, const float *filter
)
340 ASSERT( stride
>=4*(width
+FILTER_LENGTH
) );
341 ASSERT( ((stride
|(4*width
)|(4*FILTER_LENGTH
)|reinterpret_cast<int>(dst
)|reinterpret_cast<int>(filter
))&15)==0 );
343 M128s
<FILTER_LENGTH
> filter128s
;
344 filter128s
.Load(filter
);
346 const float *filter_start
= filter
;
347 BYTE
* dst_byte
= reinterpret_cast<BYTE
*>(dst
);
348 BYTE
* end
= dst_byte
+ height
*stride
;
349 for( ; dst_byte
<end
; dst_byte
+=stride
)
351 float *dst2
= reinterpret_cast<float*>(dst_byte
);
354 FilterAllMargin
<FILTER_LENGTH
,FILTER_LENGTH
>::cal_left_margin(dst2
, filter128s
);
355 float *dst_end1
= dst2
+ width
;
356 float *dst_end0
= dst_end1
- FILTER_LENGTH
;
357 for (;dst2
<dst_end0
;dst2
+=4)
359 const float *src
= dst2
;
362 __m128 src0
= _mm_load_ps(src
);/*1 2 3 4*/
364 __m128 sum
= _mm_setzero_ps();
365 Filter4
<FILTER_LENGTH
,0,FILTER_LENGTH
>::do_cal(src0
, src
, filter128s
, sum
);
367 _mm_store_ps(dst2
, sum
);
369 FilterAllMargin
<FILTER_LENGTH
,FILTER_LENGTH
>::cal_right_margin(dst2
, filter128s
);
376 void xy_filter_sse(float *dst
, int width
, int height
, int stride
, const float *filter
, int filter_width
)
378 typedef void (*Filter
)(float *dst
, int width
, int height
, int stride
, const float *filter
);
379 const Filter filters
[] = { xy_filter_sse_template
<4>, xy_filter_sse_template
<8>, xy_filter_sse_template
<12>, xy_filter_sse_template
<16>,
380 xy_filter_sse_template
<20>, xy_filter_sse_template
<24>, xy_filter_sse_template
<28>
382 if (filter_width
<=28 && filter_width
<=width
)
384 ASSERT(filter_width
%4==0);
385 filters
[(filter_width
-1)/4](dst
, width
, height
, stride
, filter
);
389 xy_filter_sse_v4(dst
, width
, height
, stride
, filter
, filter_width
);
394 * Copy and convert src to dst line by line.
395 * @dst_width MUST >= @width
396 * if @dst_width>@width, the extra elements will be filled with 0.
398 void xy_byte_2_float_c(float *dst
, int dst_width
, int dst_stride
,
399 PCUINT8 src
, int width
, int height
, int stride
)
401 PUINT8 dst_byte
= reinterpret_cast<PUINT8
>(dst
);
403 PCUINT8 src_end
= src
+ height
*stride
;
404 for( ; src
<src_end
; src
+=stride
, dst_byte
+=dst_stride
)
407 PCUINT8 src_end
= src2
+ width
;
408 float *dst2
= reinterpret_cast<float*>(dst_byte
);
410 for (;src2
<src_end
;src2
++, dst2
++)
414 float *dst2_end
=reinterpret_cast<float*>(dst_byte
)+dst_width
;
415 for (;dst2
<dst2_end
;dst2
++)
423 * See @xy_byte_2_float_c
425 void xy_byte_2_float_sse(float *dst
, int dst_width
, int dst_stride
,
426 PCUINT8 src
, int width
, int height
, int stride
)
428 ASSERT( dst_width
>=width
);
429 ASSERT( ((reinterpret_cast<int>(dst
)|dst_stride
)&15)==0 );
430 PUINT8 dst_byte
= reinterpret_cast<PUINT8
>(dst
);
432 PCUINT8 src_end
= src
+ height
*stride
;
433 for( ; src
<src_end
; src
+=stride
, dst_byte
+=dst_stride
)
436 PCUINT8 src2_end0
= src2
+ (width
&~15);
437 float *dst2
= reinterpret_cast<float*>(dst_byte
);
439 for (;src2
<src2_end0
;src2
+=16, dst2
+=16)
442 __m128i src16
= _mm_loadu_si128(reinterpret_cast<const __m128i
*>(src2
));
443 __m128i src16_lo
= _mm_unpacklo_epi8(src16
, _mm_setzero_si128());
444 __m128i src16_hi
= _mm_unpackhi_epi8(src16
, _mm_setzero_si128());
445 __m128i src16_lo_lo
= _mm_unpacklo_epi8(src16_lo
, _mm_setzero_si128());
446 __m128i src16_lo_hi
= _mm_unpackhi_epi8(src16_lo
, _mm_setzero_si128());
447 __m128i src16_hi_lo
= _mm_unpacklo_epi8(src16_hi
, _mm_setzero_si128());
448 __m128i src16_hi_hi
= _mm_unpackhi_epi8(src16_hi
, _mm_setzero_si128());
449 __m128 dst_f1
= _mm_cvtepi32_ps(src16_lo_lo
);
450 __m128 dst_f2
= _mm_cvtepi32_ps(src16_lo_hi
);
451 __m128 dst_f3
= _mm_cvtepi32_ps(src16_hi_lo
);
452 __m128 dst_f4
= _mm_cvtepi32_ps(src16_hi_hi
);
453 _mm_store_ps(dst2
, dst_f1
);
454 _mm_store_ps(dst2
+4, dst_f2
);
455 _mm_store_ps(dst2
+8, dst_f3
);
456 _mm_store_ps(dst2
+12, dst_f4
);
458 PCUINT8 src2_end
= src
+ width
;
459 for (;src2
<src2_end
;src2
++,dst2
++)
463 float *dst2_end
=reinterpret_cast<float*>(dst_byte
)+dst_width
;
464 for (;dst2
<dst2_end
;dst2
++)
472 * Copy transposed Matrix src to dst.
473 * @dst_width MUST >= @height.
474 * if @dst_width > @height, the extra elements will be filled with 0.
476 void xy_float_2_float_transpose_c(float *dst
, int dst_width
, int dst_stride
,
477 const float *src
, int width
, int height
, int src_stride
)
479 ASSERT(dst_width
>= height
);
480 PUINT8 dst_byte
= reinterpret_cast<PUINT8
>(dst
);
481 const float* src_end
= src
+ width
;
482 PCUINT8 src2_end
= reinterpret_cast<PCUINT8
>(src
) + height
*src_stride
;
483 for( ; src
<src_end
; src
++, dst_byte
+=dst_stride
)
485 PCUINT8 src2
= reinterpret_cast<PCUINT8
>(src
);
487 float *dst2
= reinterpret_cast<float*>(dst_byte
);
488 for (;src2
<src2_end
;src2
+=src_stride
,dst2
++)
490 *dst2
= *reinterpret_cast<const float*>(src2
);
492 float *dst2_end
= reinterpret_cast<float*>(dst_byte
) + dst_width
;
493 for (;dst2
<dst2_end
;dst2
++)
501 * see @xy_float_2_float_transpose_c
503 void xy_float_2_float_transpose_sse(float *dst
, int dst_width
, int dst_stride
,
504 const float *src
, int width
, int height
, int src_stride
)
507 typedef const float SrcT
;
509 ASSERT( (((int)dst
|dst_stride
)&15)==0 );
510 ASSERT(dst_width
>= height
);
511 PUINT8 dst_byte
= reinterpret_cast<PUINT8
>(dst
);
512 SrcT
* src_end
= src
+ width
;
513 PCUINT8 src2_end1
= reinterpret_cast<PCUINT8
>(src
) + (height
&~3)*src_stride
;
514 PCUINT8 src2_end2
= reinterpret_cast<PCUINT8
>(src
) + height
*src_stride
;
515 for( ; src
<src_end
; src
++, dst_byte
+=dst_stride
)
517 PCUINT8 src2
= reinterpret_cast<PCUINT8
>(src
);
519 DstT
*dst2
= reinterpret_cast<DstT
*>(dst_byte
);
520 for (;src2
<src2_end1
;src2
+=4*src_stride
,dst2
+=4)
522 __m128 m1
= _mm_set_ps(
523 *(SrcT
*)(src2
+3*src_stride
),
524 *(SrcT
*)(src2
+2*src_stride
),
525 *(SrcT
*)(src2
+src_stride
),
527 _mm_store_ps(dst2
, m1
);
529 for (;src2
<src2_end2
;src2
+=src_stride
,dst2
++)
531 *dst2
= *reinterpret_cast<SrcT
*>(src2
);
533 float *dst2_end
= reinterpret_cast<DstT
*>(dst_byte
) + dst_width
;
534 for (;dst2
<dst2_end
;dst2
++)
542 * Transpose and round Matrix src, then copy to dst.
543 * @dst_width MUST >= @height.
544 * if @dst_width > @height, the extra elements will be filled with 0.
546 void xy_float_2_byte_transpose_c(UINT8
*dst
, int dst_width
, int dst_stride
,
547 const float *src
, int width
, int height
, int src_stride
)
549 ASSERT(dst_width
>= height
);
550 PUINT8 dst_byte
= reinterpret_cast<PUINT8
>(dst
);
551 const float* src_end
= src
+ width
;
552 PCUINT8 src2_end
= reinterpret_cast<PCUINT8
>(src
) + height
*src_stride
;
553 for( ; src
<src_end
; src
++, dst_byte
+=dst_stride
)
555 PCUINT8 src2
= reinterpret_cast<PCUINT8
>(src
);
557 UINT8
*dst2
= reinterpret_cast<UINT8
*>(dst_byte
);
558 for (;src2
<src2_end
;src2
+=src_stride
,dst2
++)
560 *dst2
= static_cast<UINT8
>(*reinterpret_cast<const float*>(src2
)+0.5);
562 UINT8
*dst2_end
= reinterpret_cast<UINT8
*>(dst_byte
) + dst_width
;
563 for (;dst2
<dst2_end
;dst2
++)
570 void xy_float_2_byte_transpose_sse(UINT8
*dst
, int dst_width
, int dst_stride
,
571 const float *src
, int width
, int height
, int src_stride
)
574 typedef const float SrcT
;
576 ASSERT(dst_width
>= height
);
577 ASSERT((((int)dst
|dst_stride
)&15)==0);
578 PUINT8 dst_byte
= reinterpret_cast<PUINT8
>(dst
);
579 SrcT
* src_end
= src
+ width
;
580 PCUINT8 src2_end00
= reinterpret_cast<PCUINT8
>(src
) + (height
&~15)*src_stride
;
581 PCUINT8 src2_end
= reinterpret_cast<PCUINT8
>(src
) + height
*src_stride
;
582 for( ; src
<src_end
; src
++, dst_byte
+=dst_stride
)
584 PCUINT8 src2
= reinterpret_cast<PCUINT8
>(src
);
586 DstT
*dst2
= reinterpret_cast<DstT
*>(dst_byte
);
587 for (;src2
<src2_end00
;src2
+=16*src_stride
,dst2
+=16)
589 __m128 m1
= _mm_set_ps(
590 *(SrcT
*)(src2
+3*src_stride
),
591 *(SrcT
*)(src2
+2*src_stride
),
592 *(SrcT
*)(src2
+src_stride
),
594 __m128 m2
= _mm_set_ps(
595 *(SrcT
*)(src2
+7*src_stride
),
596 *(SrcT
*)(src2
+6*src_stride
),
597 *(SrcT
*)(src2
+5*src_stride
),
598 *(SrcT
*)(src2
+4*src_stride
));
599 __m128 m3
= _mm_set_ps(
600 *(SrcT
*)(src2
+11*src_stride
),
601 *(SrcT
*)(src2
+10*src_stride
),
602 *(SrcT
*)(src2
+9*src_stride
),
603 *(SrcT
*)(src2
+8*src_stride
));
604 __m128 m4
= _mm_set_ps(
605 *(SrcT
*)(src2
+15*src_stride
),
606 *(SrcT
*)(src2
+14*src_stride
),
607 *(SrcT
*)(src2
+13*src_stride
),
608 *(SrcT
*)(src2
+12*src_stride
));
610 __m128i i1
= _mm_cvtps_epi32(m1
);
611 __m128i i2
= _mm_cvtps_epi32(m2
);
612 __m128i i3
= _mm_cvtps_epi32(m3
);
613 __m128i i4
= _mm_cvtps_epi32(m4
);
615 i1
= _mm_packs_epi32(i1
,i2
);
616 i3
= _mm_packs_epi32(i3
,i4
);
617 i1
= _mm_packus_epi16(i1
,i3
);
619 _mm_store_si128((__m128i
*)dst2
, i1
);
621 for (;src2
<src2_end
;src2
+=src_stride
,dst2
++)
623 *dst2
= static_cast<DstT
>(*reinterpret_cast<SrcT
*>(src2
)+0.5);
625 DstT
*dst2_end
= reinterpret_cast<DstT
*>(dst_byte
) + dst_width
;
626 for (;dst2
<dst2_end
;dst2
++)
634 * To Do: decent CPU capability check
636 void xy_gaussian_blur(PUINT8 dst
, int dst_stride
,
637 PCUINT8 src
, int width
, int height
, int stride
,
638 const float *gt
, int r
, int gt_ex_width
)
640 ASSERT(width
<=stride
&& width
+2*r
<=dst_stride
);
641 int ex_mask_width
= ((r
*2+1)+3)&~3;
642 ASSERT(ex_mask_width
<=gt_ex_width
);
643 if (ex_mask_width
>gt_ex_width
)
650 int fwidth
= (width
+3)&~3;
651 int fstride
= (fwidth
+ ex_mask_width
)*sizeof(float);
652 int fheight
= (height
+3)&~3;
653 int fstride_ver
= (fheight
+ex_mask_width
)*sizeof(float);
655 PUINT8 buff_base
= reinterpret_cast<PUINT8
>(xy_malloc(height
*fstride
+ (fwidth
+ ex_mask_width
)*fstride_ver
));
657 float *hor_buff_base
= reinterpret_cast<float*>(buff_base
);
658 float *hor_buff
= hor_buff_base
+ ex_mask_width
;
661 ASSERT( ((width
+15)&~15)<=stride
);
662 xy_byte_2_float_sse(hor_buff
, fwidth
, fstride
, src
, width
, height
, stride
);
665 xy_filter_sse(hor_buff
, fwidth
, height
, fstride
, gt
, ex_mask_width
);
669 float *ver_buff_base
= reinterpret_cast<float*>(buff_base
+ height
*fstride
);
670 float *ver_buff
= ver_buff_base
+ ex_mask_width
;
672 int true_width
= width
+r
*2;
673 xy_float_2_float_transpose_sse(ver_buff
, fheight
, fstride_ver
, hor_buff
-r
*2, true_width
, height
, fstride
);
676 xy_filter_sse(ver_buff
, fheight
, true_width
, fstride_ver
, gt
, ex_mask_width
);
679 int true_height
= height
+ 2*r
;
680 xy_float_2_byte_transpose_sse(dst
, true_width
, dst_stride
, ver_buff
-r
*2, true_height
, true_width
, fstride_ver
);