Blur Optimization. [Part 3]
[xy_vsfilter.git] / src / subtitles / xy_filter.cpp
blob b6c8e52326a898ae6f995325462e10d9aa15cc74
#include "stdafx.h"

typedef const UINT8 CUINT8, *PCUINT8;
/****
 * dst memory layout:
 * 1. Source content starts from @dst;
 * 2. Output content starts from @dst-@mwidth (@mwidth = @filter_width);
 *
 *   |------------------------- stride --------------------------|
 *   | <- @dst-@mwidth --------- | <- @dst+0 ------------------- |
 *   |                           |                               |
 *   |          margin           |    src width*height items     |
 *   |    mwidth*height items    |                               |
 *   |   do NOT need to init     |                               |
 *   |   when input              |                               |
 *   |------------------------------------------------------------|
 **/
void xy_filter_c(float *dst, int width, int height, int stride, const float *filter, int filter_width)
{
    ASSERT( stride>=4*(width+filter_width) );

    int xx_fix = width > filter_width ? 0 : filter_width - width;
    const float *filter_start = filter;

    BYTE* end = reinterpret_cast<BYTE*>(dst) + height*stride;
    BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
    for( ; dst_byte<end; dst_byte+=stride )
    {
        float *dst_f = reinterpret_cast<float*>(dst_byte);
        float *dst2 = dst_f - filter_width;
        float *dst_endr = dst_f + width;
        float *dst_end0 = dst_endr - filter_width;
        float *dst_endl = dst_f - xx_fix;
        ASSERT(xx_fix==0 || dst_end0==dst_endl);

        ASSERT(filter_start == filter);
        filter_start += filter_width;
        const float *filter_end = filter_start;

        for (;dst2<dst_endl;dst2++, filter_start--)//left margin
        {
            const float *src = dst_f;
            float sum = 0;
            for(const float* f=filter_start;f<filter_end;f++, src++)
            {
                sum += src[0] * f[0];
            }
            *dst2 = sum;
        }
        for (;dst2<dst_f;dst2++, filter_start--, filter_end--)//if width < filter_width
        {
            const float *src = dst_f;
            float sum = 0;
            for(const float* f=filter_start;f<filter_end;f++, src++)
            {
                sum += src[0] * f[0];
            }
            *dst2 = sum;
        }
        ASSERT(filter_start==filter);
        for (;dst2<dst_end0;dst2++)//center: full filter window
        {
            const float *src = dst2;
            float sum = 0;
            for(const float* f=filter_start;f<filter_end;f++, src++)
            {
                sum += src[0] * f[0];
            }
            *dst2 = sum;
        }
        for (;dst2<dst_endr;dst2++, filter_end--)//right margin
        {
            const float *src = dst2;
            float sum = 0;
            for(const float* f=filter;f<filter_end;f++, src++)
            {
                sum += src[0] * f[0];
            }
            *dst2 = sum;
        }
    }
}
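
The scalar filter runs in place: each row is both input and output, and the widened result (width + filter_width samples) lands at dst - filter_width, per the layout comment above. A minimal sketch of one row with a hypothetical 3-tap kernel (the demo function, names, and values are illustrative, not part of the original file):

/* Illustrative only: one 4-sample row, 3-tap kernel.
 * Expected output in buff[0..6]: 0, 0.25, 0.75, 1.0, 1.0, 0.75, 0.25 */
void xy_filter_c_demo()
{
    const int width = 4, filter_width = 3;
    const float filter[filter_width] = {0.25f, 0.5f, 0.25f};
    float buff[filter_width + width + filter_width] = {0};
    float *dst = buff + filter_width;   // source samples start at @dst
    dst[0] = dst[1] = dst[2] = dst[3] = 1.0f;
    // one row; the stride (in bytes) covers the whole buffer
    xy_filter_c(dst, width, 1, sizeof(buff), filter, filter_width);
}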
/****
 * An inline function sometimes generates poor code, so a macro is used instead.
 *
 * @src4, @src_5_8, @f4, @sum: __m128
 * @src_5_8, @f4: const
 * @sum: output
 * @src4: undefined after the macro runs
 **/
#define XY_FILTER_4(src4, src_5_8, f4, sum) \
    __m128 f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(0,0,0,0));\
    f4_1 = _mm_mul_ps(f4_1, src4);\
    sum = _mm_add_ps(sum, f4_1);\
    __m128 src_3_6 = _mm_shuffle_ps(src4, src_5_8, _MM_SHUFFLE(1,0,3,2));/*3 4 5 6*/\
    f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(2,2,2,2));\
    f4_1 = _mm_mul_ps(f4_1, src_3_6);\
    sum = _mm_add_ps(sum, f4_1);\
    src4 = _mm_shuffle_ps(src4, src_3_6, _MM_SHUFFLE(2,1,2,1));/*2 3 4 5*/\
    f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(1,1,1,1));\
    f4_1 = _mm_mul_ps(f4_1, src4);\
    sum = _mm_add_ps(sum, f4_1);\
    src_3_6 = _mm_shuffle_ps(src_3_6, src_5_8, _MM_SHUFFLE(2,1,2,1));/*4 5 6 7*/\
    f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(3,3,3,3));\
    f4_1 = _mm_mul_ps(f4_1, src_3_6);\
    sum = _mm_add_ps(sum, f4_1)
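
Unrolled to scalar form, the macro adds the partial products of four consecutive taps to four adjacent accumulators (an equivalence sketch, not code from the file):

/* Scalar reference for XY_FILTER_4 (illustrative sketch): given eight
 * consecutive samples src[0..7] (src4 holds src[0..3], src_5_8 holds
 * src[4..7]) and taps f[0..3], it accumulates 4-tap partial products
 * into four adjacent outputs. The shuffles build the shifted windows
 * src[1..4], src[2..5], and src[3..6] entirely in registers. */
void xy_filter_4_scalar_ref(const float src[8], const float f[4], float sum[4])
{
    for (int i = 0; i < 4; i++)
        for (int k = 0; k < 4; k++)
            sum[i] += src[i + k] * f[k];
}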
/****
 * See @xy_filter_c
 **/
void xy_filter_sse(float *dst, int width, int height, int stride, const float *filter, int filter_width)
{
    ASSERT( stride>=4*(width+filter_width) );
    ASSERT( ((stride|(4*width)|(4*filter_width)|reinterpret_cast<int>(dst)|reinterpret_cast<int>(filter))&15)==0 );

    int xx_fix = width > filter_width ? 0 : filter_width - width;
    const float *filter_start = filter;
    BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
    BYTE* end = dst_byte + height*stride;
    for( ; dst_byte<end; dst_byte+=stride )
    {
        float *dst_f = reinterpret_cast<float*>(dst_byte);
        float *dst2 = dst_f - filter_width;
        float *dst_endr = dst_f + width;
        float *dst_end0 = dst_endr - filter_width;
        float *dst_endl = dst_f - xx_fix;
        ASSERT(xx_fix==0 || dst_end0==dst_endl);

        ASSERT(filter_start == filter);
        filter_start += filter_width;
        const float *filter_end = filter_start;

        for (;dst2<dst_endl;dst2+=4)//left margin
        {
            const float *src = dst_f;
            filter_start -= 4;

            //filter 4
            __m128 src4 = _mm_setzero_ps();/*1 2 3 4*/
            __m128 sum = _mm_setzero_ps();
            for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
            {
                __m128 src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
                __m128 f4 = _mm_load_ps(f);

                { XY_FILTER_4(src4, src_5_8, f4, sum); }
                src4 = src_5_8;
            }
            //store result
            _mm_store_ps(dst2, sum);
        }
        for (;dst2<dst_f;dst2+=4)//if width < filter_width
        {
            const float *src = dst_f;
            filter_start-=4;
            filter_end-=4;

            __m128 src4 = _mm_setzero_ps();/*1 2 3 4*/
            __m128 sum = _mm_setzero_ps();
            __m128 src_5_8, f4;
            for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
            {
                src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
                f4 = _mm_load_ps(f);

                { XY_FILTER_4(src4, src_5_8, f4, sum); }
                src4 = src_5_8;
            }
            src_5_8 = _mm_setzero_ps();
            f4 = _mm_load_ps(filter_end);
            { XY_FILTER_4(src4, src_5_8, f4, sum); }
            //store result
            _mm_store_ps(dst2, sum);
        }
        ASSERT(filter_start == filter);
        for (;dst2<dst_end0;dst2+=4)//center: full filter window
        {
            const float *src = dst2;

            //filter 4
            __m128 src4 = _mm_load_ps(src);/*1 2 3 4*/
            __m128 sum = _mm_setzero_ps();
            for(const float* f=filter_start;f<filter_end;f+=4)
            {
                src+=4;
                __m128 src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
                __m128 f4 = _mm_load_ps(f);

                { XY_FILTER_4(src4, src_5_8, f4, sum); }
                src4 = src_5_8;
            }
            //store result
            _mm_store_ps(dst2, sum);
        }
        for (;dst2<dst_endr;dst2+=4)//right margin
        {
            const float *src = dst2;
            filter_end-=4;

            //filter 4
            __m128 src4 = _mm_load_ps(src);//1 2 3 4
            __m128 sum = _mm_setzero_ps();
            __m128 src_5_8, f4;
            for(const float* f=filter_start;f<filter_end;f+=4)
            {
                src+=4;
                src_5_8 = _mm_load_ps(src);//5 6 7 8
                f4 = _mm_load_ps(f);

                { XY_FILTER_4(src4, src_5_8, f4, sum); }
                src4 = src_5_8;//shift the new right-hand 4 floats into the old position
            }
            src_5_8 = _mm_setzero_ps();
            f4 = _mm_load_ps(filter_end);
            { XY_FILTER_4(src4, src_5_8, f4, sum); }
            //store result
            _mm_store_ps(dst2, sum);
        }
    }
}
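
Unlike the C version, the SSE path asserts 16-byte alignment on dst, filter, stride, 4*width, and 4*filter_width, so callers round everything up to multiples of four floats. A conforming setup might look like the sketch below (the demo function, w, h, r, and the buffer handling are assumptions; _mm_malloc stands in for the project's aligned allocator xy_malloc):

/* Hypothetical caller sketch for xy_filter_sse (not part of the file). */
void xy_filter_sse_setup_demo(int w, int h, int r, const float *filter)
{
    int width        = (w + 3) & ~3;           // 4*width divisible by 16
    int filter_width = ((2*r + 1) + 3) & ~3;   // same for the kernel width
    int stride       = (width + filter_width) * sizeof(float);
    float *base = (float*)_mm_malloc(h * stride, 16);
    float *dst  = base + filter_width;         // room for the widened output
    // ... fill source rows at dst (extra columns zeroed); @filter must
    // hold filter_width floats, 16-byte aligned, unused taps zeroed ...
    xy_filter_sse(dst, width, h, stride, filter, filter_width);
    _mm_free(base);
}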
/****
 * Copy and convert src to dst line by line.
 * @dst_width MUST be >= @width;
 * if @dst_width > @width, the extra elements are filled with 0.
 **/
void xy_byte_2_float_c(float *dst, int dst_width, int dst_stride,
    PCUINT8 src, int width, int height, int stride)
{
    PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);

    PCUINT8 src_end = src + height*stride;
    for( ; src<src_end; src+=stride, dst_byte+=dst_stride )
    {
        PCUINT8 src2 = src;
        PCUINT8 src2_end = src2 + width;
        float *dst2 = reinterpret_cast<float*>(dst_byte);

        for (;src2<src2_end;src2++, dst2++)
        {
            *dst2 = *src2;
        }
        float *dst2_end=reinterpret_cast<float*>(dst_byte)+dst_width;
        for (;dst2<dst2_end;dst2++)
        {
            *dst2=0;
        }
    }
}
/****
 * See @xy_byte_2_float_c
 **/
void xy_byte_2_float_sse(float *dst, int dst_width, int dst_stride,
    PCUINT8 src, int width, int height, int stride)
{
    ASSERT( dst_width>=width );
    ASSERT( ((reinterpret_cast<int>(dst)|dst_stride)&15)==0 );
    PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);

    PCUINT8 src_end = src + height*stride;
    for( ; src<src_end; src+=stride, dst_byte+=dst_stride )
    {
        PCUINT8 src2 = src;
        PCUINT8 src2_end0 = src2 + (width&~15);
        float *dst2 = reinterpret_cast<float*>(dst_byte);

        for (;src2<src2_end0;src2+=16, dst2+=16)
        {
            //convert 16 bytes: zero-extend each byte to 32 bits, then to float
            __m128i src16 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src2));
            __m128i src16_lo = _mm_unpacklo_epi8(src16, _mm_setzero_si128());
            __m128i src16_hi = _mm_unpackhi_epi8(src16, _mm_setzero_si128());
            __m128i src16_lo_lo = _mm_unpacklo_epi8(src16_lo, _mm_setzero_si128());
            __m128i src16_lo_hi = _mm_unpackhi_epi8(src16_lo, _mm_setzero_si128());
            __m128i src16_hi_lo = _mm_unpacklo_epi8(src16_hi, _mm_setzero_si128());
            __m128i src16_hi_hi = _mm_unpackhi_epi8(src16_hi, _mm_setzero_si128());
            __m128 dst_f1 = _mm_cvtepi32_ps(src16_lo_lo);
            __m128 dst_f2 = _mm_cvtepi32_ps(src16_lo_hi);
            __m128 dst_f3 = _mm_cvtepi32_ps(src16_hi_lo);
            __m128 dst_f4 = _mm_cvtepi32_ps(src16_hi_hi);
            _mm_store_ps(dst2, dst_f1);
            _mm_store_ps(dst2+4, dst_f2);
            _mm_store_ps(dst2+8, dst_f3);
            _mm_store_ps(dst2+12, dst_f4);
        }
        PCUINT8 src2_end = src + width;
        for (;src2<src2_end;src2++,dst2++)
        {
            *dst2 = *src2;
        }
        float *dst2_end=reinterpret_cast<float*>(dst_byte)+dst_width;
        for (;dst2<dst2_end;dst2++)
        {
            *dst2=0;
        }
    }
}
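
The double unpack against zero is plain zero-extension; traced out per lane (an illustrative note, not from the file), it reads:

/* Lane trace of the unpack tree (illustrative):
 * src16       : b0 b1 b2 ... b15                              16 x u8
 * src16_lo    : b0 0  b1 0  ... b7 0                           8 x u16
 * src16_lo_lo : b0 0 0 0  b1 0 0 0  b2 0 0 0  b3 0 0 0         4 x u32
 * Every byte ends up zero-extended to 32 bits before _mm_cvtepi32_ps;
 * the scalar equivalent is simply dst2[i] = (float)src2[i]. */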
/****
 * Copy the transpose of matrix @src to @dst.
 * @dst_width MUST be >= @height;
 * if @dst_width > @height, the extra elements are filled with 0.
 **/
void xy_float_2_float_transpose_c(float *dst, int dst_width, int dst_stride,
    const float *src, int width, int height, int src_stride)
{
    ASSERT(dst_width >= height);
    PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
    const float* src_end = src + width;
    PCUINT8 src2_end = reinterpret_cast<PCUINT8>(src) + height*src_stride;
    for( ; src<src_end; src++, dst_byte+=dst_stride )
    {
        PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);

        float *dst2 = reinterpret_cast<float*>(dst_byte);
        for (;src2<src2_end;src2+=src_stride,dst2++)
        {
            *dst2 = *reinterpret_cast<const float*>(src2);
        }
        float *dst2_end = reinterpret_cast<float*>(dst_byte) + dst_width;
        for (;dst2<dst2_end;dst2++)
        {
            *dst2 = 0;
        }
    }
}
/****
 * See @xy_float_2_float_transpose_c
 **/
void xy_float_2_float_transpose_sse(float *dst, int dst_width, int dst_stride,
    const float *src, int width, int height, int src_stride)
{
    typedef float DstT;
    typedef const float SrcT;

    ASSERT( (((int)dst|dst_stride)&15)==0 );
    ASSERT(dst_width >= height);
    PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
    SrcT* src_end = src + width;
    PCUINT8 src2_end1 = reinterpret_cast<PCUINT8>(src) + (height&~3)*src_stride;
    PCUINT8 src2_end2 = reinterpret_cast<PCUINT8>(src) + height*src_stride;
    for( ; src<src_end; src++, dst_byte+=dst_stride )
    {
        PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);

        DstT *dst2 = reinterpret_cast<DstT*>(dst_byte);
        for (;src2<src2_end1;src2+=4*src_stride,dst2+=4)
        {
            //gather 4 rows of the current column into one vector
            __m128 m1 = _mm_set_ps(
                *(SrcT*)(src2+3*src_stride),
                *(SrcT*)(src2+2*src_stride),
                *(SrcT*)(src2+src_stride),
                *(SrcT*)(src2));
            _mm_store_ps(dst2, m1);
        }
        for (;src2<src2_end2;src2+=src_stride,dst2++)
        {
            *dst2 = *reinterpret_cast<SrcT*>(src2);
        }
        DstT *dst2_end = reinterpret_cast<DstT*>(dst_byte) + dst_width;
        for (;dst2<dst2_end;dst2++)
        {
            *dst2 = 0;
        }
    }
}
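
The transpose is what lets the vertical blur reuse xy_filter_sse as a second horizontal pass over contiguous rows. The contract both variants satisfy can be written as a check (a hypothetical helper, not from the file; strides in bytes as above):

/* Hypothetical contract check for the transpose routines: dst row x
 * holds column x of src, zero-padded out to dst_width. */
void transpose_contract_check(const float *dst, int dst_width, int dst_stride,
                              const float *src, int width, int height, int src_stride)
{
    for (int x = 0; x < width; x++)
        for (int y = 0; y < dst_width; y++)
        {
            const float *d = dst + x*(dst_stride/4) + y;
            ASSERT(*d == (y < height ? *(src + y*(src_stride/4) + x) : 0));
        }
}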
/****
 * Transpose and round matrix @src, then copy to @dst.
 * @dst_width MUST be >= @height;
 * if @dst_width > @height, the extra elements are filled with 0.
 **/
void xy_float_2_byte_transpose_c(UINT8 *dst, int dst_width, int dst_stride,
    const float *src, int width, int height, int src_stride)
{
    ASSERT(dst_width >= height);
    PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
    const float* src_end = src + width;
    PCUINT8 src2_end = reinterpret_cast<PCUINT8>(src) + height*src_stride;
    for( ; src<src_end; src++, dst_byte+=dst_stride )
    {
        PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);

        UINT8 *dst2 = reinterpret_cast<UINT8*>(dst_byte);
        for (;src2<src2_end;src2+=src_stride,dst2++)
        {
            *dst2 = static_cast<UINT8>(*reinterpret_cast<const float*>(src2)+0.5);
        }
        UINT8 *dst2_end = reinterpret_cast<UINT8*>(dst_byte) + dst_width;
        for (;dst2<dst2_end;dst2++)
        {
            *dst2 = 0;
        }
    }
}
void xy_float_2_byte_transpose_sse(UINT8 *dst, int dst_width, int dst_stride,
    const float *src, int width, int height, int src_stride)
{
    typedef UINT8 DstT;
    typedef const float SrcT;

    ASSERT(dst_width >= height);
    PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
    SrcT* src_end = src + width;
    PCUINT8 src2_end00 = reinterpret_cast<PCUINT8>(src) + (height&~15)*src_stride;
    PCUINT8 src2_end = reinterpret_cast<PCUINT8>(src) + height*src_stride;
    for( ; src<src_end; src++, dst_byte+=dst_stride )
    {
        PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);

        DstT *dst2 = reinterpret_cast<DstT*>(dst_byte);
        for (;src2<src2_end00;src2+=16*src_stride,dst2+=16)
        {
            //gather 16 rows of the current column
            __m128 m1 = _mm_set_ps(
                *(SrcT*)(src2+3*src_stride),
                *(SrcT*)(src2+2*src_stride),
                *(SrcT*)(src2+src_stride),
                *(SrcT*)(src2));
            __m128 m2 = _mm_set_ps(
                *(SrcT*)(src2+7*src_stride),
                *(SrcT*)(src2+6*src_stride),
                *(SrcT*)(src2+5*src_stride),
                *(SrcT*)(src2+4*src_stride));
            __m128 m3 = _mm_set_ps(
                *(SrcT*)(src2+11*src_stride),
                *(SrcT*)(src2+10*src_stride),
                *(SrcT*)(src2+9*src_stride),
                *(SrcT*)(src2+8*src_stride));
            __m128 m4 = _mm_set_ps(
                *(SrcT*)(src2+15*src_stride),
                *(SrcT*)(src2+14*src_stride),
                *(SrcT*)(src2+13*src_stride),
                *(SrcT*)(src2+12*src_stride));

            //round to int, then pack down to bytes with saturation
            __m128i i1 = _mm_cvtps_epi32(m1);
            __m128i i2 = _mm_cvtps_epi32(m2);
            __m128i i3 = _mm_cvtps_epi32(m3);
            __m128i i4 = _mm_cvtps_epi32(m4);

            i1 = _mm_packs_epi32(i1,i2);
            i3 = _mm_packs_epi32(i3,i4);
            i1 = _mm_packus_epi16(i1,i3);

            _mm_store_si128((__m128i*)dst2, i1);
        }
        for (;src2<src2_end;src2+=src_stride,dst2++)
        {
            *dst2 = static_cast<DstT>(*reinterpret_cast<SrcT*>(src2)+0.5);
        }
        DstT *dst2_end = reinterpret_cast<DstT*>(dst_byte) + dst_width;
        for (;dst2<dst2_end;dst2++)
        {
            *dst2 = 0;
        }
    }
}
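
Per element, the convert-and-pack chain amounts to rounding followed by clamping (a scalar sketch, not from the file; note that _mm_cvtps_epi32 rounds to nearest under the default MXCSR mode, while the scalar tail above truncates after adding 0.5):

/* Scalar equivalent of the cvt/pack chain (illustrative):
 * _mm_cvtps_epi32 rounds to the nearest integer; _mm_packs_epi32 then
 * _mm_packus_epi16 saturate the result into [0, 255]. */
UINT8 float_2_byte_scalar_ref(float x)
{
    int v = (int)(x + 0.5f);
    if (v < 0)   v = 0;
    if (v > 255) v = 255;
    return (UINT8)v;
}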
/****
 * To do: decent CPU capability check.
 **/
void xy_gaussian_blur(PUINT8 dst, int dst_stride,
    PCUINT8 src, int width, int height, int stride,
    const float *gt, int r, int gt_ex_width)
{
    ASSERT(width<=stride && width+2*r<=dst_stride);
    int ex_mask_width = ((r*2+1)+3)&~3;
    ASSERT(ex_mask_width<=gt_ex_width);
    if (ex_mask_width>gt_ex_width)
    {
        //deliberate crash so a kernel-size mismatch cannot pass silently in release builds
        int o=0;
        o=o/o;
        exit(-1);
    }

    int fwidth = (width+3)&~3;
    int fstride = (fwidth + ex_mask_width)*sizeof(float);
    int fheight = (height+3)&~3;
    int fstride_ver = (fheight+ex_mask_width)*sizeof(float);

    PUINT8 buff_base = reinterpret_cast<PUINT8>(xy_malloc(height*fstride + (fwidth + ex_mask_width)*fstride_ver));

    float *hor_buff_base = reinterpret_cast<float*>(buff_base);
    float *hor_buff = hor_buff_base + ex_mask_width;

    // byte to float
    ASSERT( ((width+15)&~15)<=stride );
    xy_byte_2_float_sse(hor_buff, fwidth, fstride, src, width, height, stride);

    // horizontal pass
    xy_filter_sse(hor_buff, fwidth, height, fstride, gt, ex_mask_width);

    // transpose
    float *ver_buff_base = reinterpret_cast<float*>(buff_base + height*fstride);
    float *ver_buff = ver_buff_base + ex_mask_width;

    int true_width = width+r*2;
    xy_float_2_float_transpose_sse(ver_buff, fheight, fstride_ver, hor_buff-r*2, true_width, height, fstride);

    // vertical pass
    xy_filter_sse(ver_buff, fheight, true_width, fstride_ver, gt, ex_mask_width);

    // transpose back
    int true_height = height + 2*r;
    xy_float_2_byte_transpose_sse(dst, true_width, dst_stride, ver_buff-r*2, true_height, true_width, fstride_ver);

    xy_free(buff_base);
    _mm_empty();
}
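
The kernel gt is built by the caller elsewhere in the project. A hypothetical construction that satisfies the contracts above — taps in gt[0..2r], zero padding up to a multiple of four floats, 16-byte aligned, normalized to sum to 1 — might look like this (the demo function and its parameters are assumptions, not code from this file):

/* Hypothetical caller sketch. dst must provide (width+2*r) x (height+2*r)
 * samples with dst_stride >= width+2*r, per the first ASSERT above. */
void gaussian_blur_demo(PUINT8 dst, int dst_stride,
                        PCUINT8 src, int width, int height, int stride,
                        double sigma)
{
    int r = (int)ceil(3 * sigma);             // cover +/- 3 sigma
    int gt_ex_width = ((2*r + 1) + 3) & ~3;   // same padding as ex_mask_width
    float *gt = (float*)_mm_malloc(gt_ex_width * sizeof(float), 16);
    memset(gt, 0, gt_ex_width * sizeof(float));
    double sum = 0;
    for (int i = 0; i <= 2*r; i++)
        sum += gt[i] = (float)exp(-(i - r)*(i - r) / (2 * sigma * sigma));
    for (int i = 0; i <= 2*r; i++)
        gt[i] = (float)(gt[i] / sum);         // normalize: taps sum to 1
    xy_gaussian_blur(dst, dst_stride, src, width, height, stride, gt, r, gt_ex_width);
    _mm_free(gt);
}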