Let's get to float: new Gaussian blur implementation.
[xy_vsfilter.git] / src / subtitles / xy_filter.cpp
blob 67d634414137a1f64d09523faf75a9c38c37d847
#include "stdafx.h"

typedef const UINT8 CUINT8, *PCUINT8;

/****
 * dst memory layout:
 * 1. Source content starts from @dst;
 * 2. Output content starts from @dst-@mwidth;
 *
 * |-                       stride                           -|
 * | <- @dst-@mwidth --------| <- @dst+0 --------------------|
 * |-        margin         -|-   src width*height items    -|
 * |-  mwidth*height items  -|-                              -|
 * |-  do NOT need to init  -|-                              -|
 * |-      when input       -|-                              -|
 * |---------------------------------------------------------|
 **/
void xy_filter_c(float *dst, int width, int height, int stride, const float *filter, int filter_width)
{
    ASSERT( stride>=4*(width+filter_width) );

    int xx_fix = width > filter_width ? 0 : filter_width - width;
    const float *filter_start = filter;

    BYTE* end = reinterpret_cast<BYTE*>(dst) + height*stride;
    BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
    for( ; dst_byte<end; dst_byte+=stride )
    {
        float *dst_f = reinterpret_cast<float*>(dst_byte);
        float *dst2 = dst_f - filter_width;
        float *dst_endr = dst_f + width;
        float *dst_end0 = dst_endr - filter_width;
        float *dst_endl = dst_f - xx_fix;
        ASSERT(xx_fix==0 || dst_end0==dst_endl);

        ASSERT(filter_start == filter);
        filter_start += filter_width;
        const float *filter_end = filter_start;
        for (;dst2<dst_endl;dst2++, filter_start--)//left margin
        {
            const float *src = dst_f;
            float sum = 0;
            for(const float* f=filter_start;f<filter_end;f++, src++)
            {
                sum += src[0] * f[0];
            }
            *dst2 = sum;
        }
        for (;dst2<dst_f;dst2++, filter_start--, filter_end--)//if width < filter_width
        {
            const float *src = dst_f;
            float sum = 0;
            for(const float* f=filter_start;f<filter_end;f++, src++)
            {
                sum += src[0] * f[0];
            }
            *dst2 = sum;
        }
        ASSERT(filter_start==filter);
        for (;dst2<dst_end0;dst2++)
        {
            const float *src = dst2;
            float sum = 0;
            for(const float* f=filter_start;f<filter_end;f++, src++)
            {
                sum += src[0] * f[0];
            }
            *dst2 = sum;
        }
        for (;dst2<dst_endr;dst2++, filter_end--)//right margin
        {
            const float *src = dst2;
            float sum = 0;
            for(const float* f=filter;f<filter_end;f++, src++)
            {
                sum += src[0] * f[0];
            }
            *dst2 = sum;
        }
    }
}
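
For orientation, this is how a caller might drive xy_filter_c: the row buffer needs filter_width extra floats of margin on the left, exactly as the layout diagram above describes, and the kernel should already be normalized. A minimal sketch (the demo function and its values are mine, not part of the file):

// Usage sketch for xy_filter_c (illustrative only, not from the file).
void xy_filter_c_demo()
{
    const int width = 8, height = 1, filter_width = 4;
    const int stride = 4*(width + filter_width);      // row stride in bytes: margin + source row
    float buff[width + filter_width];                 // [margin | source row]
    float *dst = buff + filter_width;                 // source content starts at @dst
    for (int i = 0; i < width; i++)
        dst[i] = static_cast<float>(i);
    const float filter[filter_width] = {0.25f, 0.25f, 0.25f, 0.25f}; // normalized 4-tap box
    xy_filter_c(dst, width, height, stride, filter, filter_width);
    // width+filter_width output items now start at dst-filter_width (== buff)
}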
/****
 * See @xy_filter_c
 **/
void xy_filter_sse(float *dst, int width, int height, int stride, const float *filter, int filter_width)
{
#ifdef XY_FILTER_4
# undef XY_FILTER_4
#endif
    ASSERT( stride>=4*(width+filter_width) );
    ASSERT( ((stride|(4*width)|(4*filter_width)|reinterpret_cast<int>(dst)|reinterpret_cast<int>(filter))&15)==0 );

    int xx_fix = width > filter_width ? 0 : filter_width - width;
    const float *filter_start = filter;
    BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
    BYTE* end = dst_byte + height*stride;
    for( ; dst_byte<end; dst_byte+=stride )
    {
        float *dst_f = reinterpret_cast<float*>(dst_byte);
        float *dst2 = dst_f - filter_width;
        float *dst_endr = dst_f + width;
        float *dst_end0 = dst_endr - filter_width;
        float *dst_endl = dst_f - xx_fix;
        ASSERT(xx_fix==0 || dst_end0==dst_endl);

        ASSERT(filter_start == filter);
        filter_start += filter_width;
        const float *filter_end = filter_start;
        for (;dst2<dst_endl;dst2+=4)//left margin
        {
            const float *src = dst_f;
            filter_start -= 4;

            //filter 4
            __m128 src4 = _mm_setzero_ps();/*1 2 3 4*/
            __m128 sum = _mm_setzero_ps();
            for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
            {
                __m128 src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
                __m128 f4 = _mm_load_ps(f);

#define XY_FILTER_4(src4, src_5_8, f4, sum) \
    __m128 f4_1 = _mm_unpacklo_ps(f4,f4);\
    f4_1 = _mm_unpacklo_ps(f4_1, f4_1);\
    f4_1 = _mm_mul_ps(f4_1, src4);\
    sum = _mm_add_ps(sum, f4_1);\
    __m128 src_3_6 = _mm_shuffle_ps(src4, src_5_8, _MM_SHUFFLE(1,0,3,2));/*3 4 5 6*/\
    f4_1 = _mm_unpackhi_ps(f4,f4);\
    f4_1 = _mm_unpacklo_ps(f4_1,f4_1);\
    f4_1 = _mm_mul_ps(f4_1, src_3_6);\
    sum = _mm_add_ps(sum, f4_1);\
    src4 = _mm_shuffle_ps(src4, src_3_6, _MM_SHUFFLE(2,1,2,1));/*2 3 4 5*/\
    f4_1 = _mm_unpacklo_ps(f4,f4);\
    f4_1 = _mm_unpackhi_ps(f4_1, f4_1);\
    f4_1 = _mm_mul_ps(f4_1, src4);\
    sum = _mm_add_ps(sum, f4_1);\
    src_3_6 = _mm_shuffle_ps(src_3_6, src_5_8, _MM_SHUFFLE(2,1,2,1));/*4 5 6 7*/\
    f4_1 = _mm_unpackhi_ps(f4,f4);\
    f4_1 = _mm_unpackhi_ps(f4_1, f4_1);\
    f4_1 = _mm_mul_ps(f4_1, src_3_6);\
    sum = _mm_add_ps(sum, f4_1)

                { XY_FILTER_4(src4, src_5_8, f4, sum); }
                src4 = src_5_8;
            }
            //store result
            _mm_stream_ps(dst2, sum);
        }
        for (;dst2<dst_f;dst2+=4)//if width < filter_width
        {
            const float *src = dst_f;
            filter_start-=4;
            filter_end-=4;

            __m128 src4 = _mm_setzero_ps();/*1 2 3 4*/
            __m128 sum = _mm_setzero_ps();
            __m128 src_5_8, f4;
            for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
            {
                src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
                f4 = _mm_load_ps(f);

                { XY_FILTER_4(src4, src_5_8, f4, sum); }
                src4 = src_5_8;
            }
            src_5_8 = _mm_setzero_ps();
            f4 = _mm_load_ps(filter_end);
            { XY_FILTER_4(src4, src_5_8, f4, sum); }
            //store result
            _mm_stream_ps(dst2, sum);
        }
        ASSERT(filter_start == filter);
        for (;dst2<dst_end0;dst2+=4)
        {
            const float *src = dst2;

            //filter 4
            __m128 src4 = _mm_load_ps(src);/*1 2 3 4*/
            __m128 sum = _mm_setzero_ps();
            for(const float* f=filter_start;f<filter_end;f+=4)
            {
                src+=4;
                __m128 src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
                __m128 f4 = _mm_load_ps(f);

                { XY_FILTER_4(src4, src_5_8, f4, sum); }
                src4 = src_5_8;
            }
            //store result
            _mm_stream_ps(dst2, sum);
        }
        for (;dst2<dst_endr;dst2+=4)//right margin
        {
            const float *src = dst2;
            filter_end-=4;

            //filter 4
            __m128 src4 = _mm_load_ps(src);//1 2 3 4
            __m128 sum = _mm_setzero_ps();
            __m128 src_5_8, f4;
            for(const float* f=filter_start;f<filter_end;f+=4)
            {
                src+=4;
                src_5_8 = _mm_load_ps(src);//5 6 7 8
                f4 = _mm_load_ps(f);

                { XY_FILTER_4(src4, src_5_8, f4, sum); }

                src4 = src_5_8;
                //move new 4 in_n_out to old 4 in_n_out
            }
            src_5_8 = _mm_setzero_ps();
            f4 = _mm_load_ps(filter_end);
            { XY_FILTER_4(src4, src_5_8, f4, sum); }
            //store result
            _mm_stream_ps(dst2, sum);
        }
    }
#undef XY_FILTER_4
}
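
The XY_FILTER_4 macro is the heart of the SSE path and is easier to audit against a scalar model: one invocation consumes four filter taps and eight consecutive source samples, and accumulates four adjacent output positions at once. A scalar rendition of what the shuffles compute (my reading, for reference only):

// Scalar equivalent of one XY_FILTER_4 invocation (illustrative).
// src8 = the samples in src4 followed by src_5_8; f = 4 filter taps;
// sum[i] accumulates the dot product for output position i.
inline void xy_filter_4_scalar(const float src8[8], const float f[4], float sum[4])
{
    for (int i = 0; i < 4; i++)       // 4 adjacent output positions
        for (int k = 0; k < 4; k++)   // 4 filter taps
            sum[i] += src8[i+k] * f[k];
}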
/****
 * Copy and convert src to dst line by line.
 * @dst_width MUST >= @width
 * if @dst_width > @width, the extra elements will be filled with 0.
 **/
void xy_byte_2_float_c(float *dst, int dst_width, int dst_stride,
    PCUINT8 src, int width, int height, int stride)
{
    PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);

    PCUINT8 src_end = src + height*stride;
    for( ; src<src_end; src+=stride, dst_byte+=dst_stride )
    {
        PCUINT8 src2 = src;
        PCUINT8 src_end = src2 + width;
        float *dst2 = reinterpret_cast<float*>(dst_byte);

        for (;src2<src_end;src2++, dst2++)
        {
            *dst2 = *src2;
        }
        float *dst2_end=reinterpret_cast<float*>(dst_byte)+dst_width;
        for (;dst2<dst2_end;dst2++)
        {
            *dst2=0;
        }
    }
}
/****
 * See @xy_byte_2_float_c
 **/
void xy_byte_2_float_sse(float *dst, int dst_width, int dst_stride,
    PCUINT8 src, int width, int height, int stride)
{
    ASSERT( dst_width>=width );
    ASSERT( ((reinterpret_cast<int>(dst)|dst_stride)&15)==0 );
    PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);

    PCUINT8 src_end = src + height*stride;
    for( ; src<src_end; src+=stride, dst_byte+=dst_stride )
    {
        PCUINT8 src2 = src;
        PCUINT8 src2_end0 = src2 + (width&~15);
        float *dst2 = reinterpret_cast<float*>(dst_byte);

        for (;src2<src2_end0;src2+=16, dst2+=16)
        {
            //convert 16 bytes to 16 floats: bytes -> words -> dwords -> floats
            //(the second-level epi8 unpack is safe because the high bytes are already zero)
            __m128i src16 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src2));
            __m128i src16_lo = _mm_unpacklo_epi8(src16, _mm_setzero_si128());
            __m128i src16_hi = _mm_unpackhi_epi8(src16, _mm_setzero_si128());
            __m128i src16_lo_lo = _mm_unpacklo_epi8(src16_lo, _mm_setzero_si128());
            __m128i src16_lo_hi = _mm_unpackhi_epi8(src16_lo, _mm_setzero_si128());
            __m128i src16_hi_lo = _mm_unpacklo_epi8(src16_hi, _mm_setzero_si128());
            __m128i src16_hi_hi = _mm_unpackhi_epi8(src16_hi, _mm_setzero_si128());
            __m128 dst_f1 = _mm_cvtepi32_ps(src16_lo_lo);
            __m128 dst_f2 = _mm_cvtepi32_ps(src16_lo_hi);
            __m128 dst_f3 = _mm_cvtepi32_ps(src16_hi_lo);
            __m128 dst_f4 = _mm_cvtepi32_ps(src16_hi_hi);
            _mm_stream_ps(dst2, dst_f1);
            _mm_stream_ps(dst2+4, dst_f2);
            _mm_stream_ps(dst2+8, dst_f3);
            _mm_stream_ps(dst2+12, dst_f4);
        }
        PCUINT8 src2_end = src + width;
        for (;src2<src2_end;src2++,dst2++)
        {
            *dst2 = *src2;
        }
        float *dst2_end=reinterpret_cast<float*>(dst_byte)+dst_width;
        for (;dst2<dst2_end;dst2++)
        {
            *dst2=0;
        }
    }
}
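
Since the SSE conversion only widens integers, it should agree with the C reference bit-for-bit, which makes a self-test cheap. A sketch, assuming xy_malloc/xy_free (used by xy_gaussian_blur below) return 16-byte-aligned blocks:

// Test sketch (illustrative): the SSE path should match the C reference exactly.
bool xy_byte_2_float_selftest()
{
    const int width = 37, height = 3, stride = 48;    // width deliberately not a multiple of 16
    const int dst_width = 40, dst_stride = dst_width*sizeof(float);
    UINT8 src[height*stride];
    for (int i = 0; i < height*stride; i++)
        src[i] = static_cast<UINT8>(rand() & 0xff);
    float *out_c   = reinterpret_cast<float*>(xy_malloc(height*dst_stride));
    float *out_sse = reinterpret_cast<float*>(xy_malloc(height*dst_stride));
    xy_byte_2_float_c  (out_c,   dst_width, dst_stride, src, width, height, stride);
    xy_byte_2_float_sse(out_sse, dst_width, dst_stride, src, width, height, stride);
    bool ok = memcmp(out_c, out_sse, height*dst_stride)==0;
    xy_free(out_c);
    xy_free(out_sse);
    return ok;
}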
/****
 * Copy transposed Matrix src to dst.
 * @dst_width MUST >= @height.
 * if @dst_width > @height, the extra elements will be filled with 0.
 **/
void xy_transpose(float *dst, int dst_width, int dst_stride,
    const float *src, int width, int height, int src_stride)
{
    ASSERT(dst_width >= height);
    PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
    const float* src_end = src + width;
    PCUINT8 src2_end = reinterpret_cast<PCUINT8>(src) + height*src_stride;
    for( ; src<src_end; src++, dst_byte+=dst_stride )
    {
        PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);

        float *dst2 = reinterpret_cast<float*>(dst_byte);
        for (;src2<src2_end;src2+=src_stride,dst2++)
        {
            *dst2 = *reinterpret_cast<const float*>(src2);
        }
        float *dst2_end = reinterpret_cast<float*>(dst_byte) + dst_width;
        for (;dst2<dst2_end;dst2++)
        {
            *dst2 = 0;
        }
    }
}
/****
 * Transpose and round Matrix src, then copy to dst.
 * @dst_width MUST >= @height.
 * if @dst_width > @height, the extra elements will be filled with 0.
 **/
void xy_transpose(UINT8 *dst, int dst_width, int dst_stride,
    const float *src, int width, int height, int src_stride)
{
    ASSERT(dst_width >= height);
    PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
    const float* src_end = src + width;
    PCUINT8 src2_end = reinterpret_cast<PCUINT8>(src) + height*src_stride;
    for( ; src<src_end; src++, dst_byte+=dst_stride )
    {
        PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);

        UINT8 *dst2 = reinterpret_cast<UINT8*>(dst_byte);
        for (;src2<src2_end;src2+=src_stride,dst2++)
        {
            *dst2 = static_cast<UINT8>(*reinterpret_cast<const float*>(src2)+0.5);
        }
        UINT8 *dst2_end = reinterpret_cast<UINT8*>(dst_byte) + dst_width;
        for (;dst2<dst2_end;dst2++)
        {
            *dst2 = 0;
        }
    }
}
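
A worked example makes the transpose contract concrete: each source column becomes a dst row of dst_width items, zero-padded past height. (Demo values are mine; this uses the float overload.)

// Worked example of the transpose contract (illustrative only).
void xy_transpose_demo()
{
    float src[2][3] = { {1,2,3}, {4,5,6} };           // width=3, height=2
    float dst[3][4];                                   // width rows of dst_width items
    xy_transpose(&dst[0][0], 4, 4*sizeof(float),
                 &src[0][0], 3, 2, 3*sizeof(float));
    // dst == { {1,4,0,0}, {2,5,0,0}, {3,6,0,0} }
}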
/****
 * To Do: decent CPU capability check
 **/
void xy_gaussian_blur(PUINT8 dst, int dst_stride,
    PCUINT8 src, int width, int height, int stride,
    const float *gt, int r, int gt_ex_width)
{
    ASSERT(width<=stride && width+2*r<=dst_stride);
    int ex_mask_width = ((r*2+1)+3)&~3;
    ASSERT(ex_mask_width<=gt_ex_width);
    if (ex_mask_width>gt_ex_width)
    {
        //deliberate crash when the gaussian table is too small
        //(the ASSERT above only fires in debug builds)
        int o=0;
        o=o/o;
        exit(-1);
    }
    int fwidth = (width+3)&~3;
    int fstride = (fwidth + ex_mask_width)*sizeof(float);
    int fheight = (height+3)&~3;
    int fstride_ver = (fheight+ex_mask_width)*sizeof(float);

    PUINT8 buff_base = reinterpret_cast<PUINT8>(xy_malloc(height*fstride + (fwidth + ex_mask_width)*fstride_ver));

    float *hor_buff_base = reinterpret_cast<float*>(buff_base);
    float *hor_buff = hor_buff_base + ex_mask_width;

    // byte to float
    ASSERT( ((width+15)&~15)<=stride );
    xy_byte_2_float_sse(hor_buff, fwidth, fstride, src, width, height, stride);

    // horizontal pass
    xy_filter_sse(hor_buff, fwidth, height, fstride, gt, ex_mask_width);

    // transpose
    float *ver_buff_base = reinterpret_cast<float*>(buff_base + height*fstride);
    float *ver_buff = ver_buff_base + ex_mask_width;

    int true_width = width+r*2;
    xy_transpose(ver_buff, fheight, fstride_ver, hor_buff-r*2, true_width, height, fstride);

    // vertical pass
    xy_filter_sse(ver_buff, fheight, true_width, fstride_ver, gt, ex_mask_width);

    // transpose
    int true_height = height + 2*r;
    xy_transpose(dst, true_width, dst_stride, ver_buff-r*2, true_height, true_width, fstride_ver);

    xy_free(buff_base);
    _mm_empty();
}
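
Finally, a sketch of a complete call. Only the signatures come from this file; the Gaussian table construction (normalized taps, zero-padded out to a multiple of four) is my assumption about what the caller must provide, and xy_malloc is assumed to return the 16-byte-aligned memory the SSE filter asserts on:

// Usage sketch for xy_gaussian_blur (illustrative; not code from this file).
void xy_gaussian_blur_demo(PCUINT8 src, int width, int height, int stride)
{
    const int r = 4;                                  // blur radius
    const double sigma = r/2.0;
    const int gt_ex_width = ((2*r+1)+3)&~3;           // 2r+1 taps, padded to a multiple of 4
    float *gt = reinterpret_cast<float*>(xy_malloc(gt_ex_width*sizeof(float)));
    float sum = 0;
    for (int i = 0; i <= 2*r; i++)
    {
        gt[i] = static_cast<float>(exp(-(i-r)*(i-r)/(2*sigma*sigma)));
        sum += gt[i];
    }
    for (int i = 0; i <= 2*r; i++)
        gt[i] /= sum;                                 // normalize to unit gain
    for (int i = 2*r+1; i < gt_ex_width; i++)
        gt[i] = 0;                                    // zero the padding taps

    int dst_stride = width + 2*r;                     // output grows by r on each side
    PUINT8 dst = reinterpret_cast<PUINT8>(xy_malloc((height+2*r)*dst_stride));
    xy_gaussian_blur(dst, dst_stride, src, width, height, stride, gt, r, gt_ex_width);
    // dst now holds the (width+2*r) x (height+2*r) blurred image
    xy_free(dst);
    xy_free(gt);
}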