src/subtitles/Rasterizer.cpp

   1 /*
   2  *      Copyright (C) 2003-2006 Gabest
   3  *      http://www.gabest.org
   4  *
   5  *  This Program is free software; you can redistribute it and/or modify
   6  *  it under the terms of the GNU General Public License as published by
   7  *  the Free Software Foundation; either version 2, or (at your option)
   8  *  any later version.
   9  *
  10  *  This Program is distributed in the hope that it will be useful,
  11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13  *  GNU General Public License for more details.
  14  *
  15  *  You should have received a copy of the GNU General Public License
  16  *  along with GNU Make; see the file COPYING.  If not, write to
  17  *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  18  *  http://www.gnu.org/copyleft/gpl.html
  19  *
  20  */
  21
  22 #include "stdafx.h"
  23 #include <string.h>
  24 #include <math.h>
  25 #include <vector>
  26 #include <algorithm>
  27 #include "Rasterizer.h"
  28 #include "SeparableFilter.h"
  29 #include "xy_logger.h"
  30 #include <boost/flyweight/key_value.hpp>
  31 #include "xy_bitmap.h"
  32
// Provide _MAX/_MIN and _IMPL_MAX/_IMPL_MIN helpers.
// When _MAX is not defined yet, all four names are mapped onto std::max/std::min
// (the parenthesized form keeps a function-like `max`/`min` macro from expanding).
// When _MAX is already defined, the _IMPL_ names simply forward to the existing
// _MAX/_MIN.
#ifndef _MAX    /* avoid collision with common (nonconforming) macros */
#define _MAX    (std::max)
#define _MIN    (std::min)
#define _IMPL_MAX std::max
#define _IMPL_MIN std::min
#else
#define _IMPL_MAX _MAX
#define _IMPL_MIN _MIN
#endif
  42
  43
  44 //NOTE: signed or unsigned affects the result seriously
  45 #define COMBINE_AYUV(a, y, u, v) ((((((((int)(a))<<8)|y)<<8)|u)<<8)|v)
  46
// Unpacks a 32-bit AYUV value into its four 8-bit channels.
// `a`, `y`, `u` and `v` are pointers; they receive bits 24-31, 16-23, 8-15 and
// 0-7 of `color` respectively. `color` is evaluated four times.
// The do { } while(0) wrapper makes the macro usable as a single statement.
#define SPLIT_AYUV(color, a, y, u, v) do { \
        *(v)=(color)&0xff; \
        *(u)=((color)>>8) &0xff; \
        *(y)=((color)>>16)&0xff;\
        *(a)=((color)>>24)&0xff;\
    } while(0)
  53
// Precomputed fixed-point gaussian kernel tables for one blur strength (sigma).
class ass_synth_priv
{
public:
    // Fixed-point scale of the kernel: generate_tables() normalizes the taps so
    // that they sum to about 1<<VOLUME_BITS.
    static const int VOLUME_BITS = 22;//should not exceed 32-8, and better not exceed 31-8

    // Starts with empty tables and then calls generate_tables(sigma).
    ass_synth_priv(const double sigma);
    // Copies the widths, sigma and both tables of `priv`.
    ass_synth_priv(const ass_synth_priv& priv);

    // Frees g and gt2.
    ~ass_synth_priv();
    // (Re)builds g and gt2 for `sigma`. Returns 0 on success or when `sigma`
    // equals the current one, -1 when a table could not be allocated.
    int generate_tables(double sigma);

    int g_r;        // kernel radius: g_w / 2
    int g_w;        // kernel width: (int)ceil(sigma*3) | 1, hence odd

    unsigned *g;    // g_w kernel taps
    unsigned *gt2;  // 256 * g_w products: gt2[g_w * i + mx] == g[mx] * i

    double sigma;   // sigma the tables were generated for
};
  73
  74 struct ass_synth_priv_key
  75 {
  76     const double& operator()(const ass_synth_priv& x)const
  77     {
  78         return x.sigma;
  79     }
  80 };
  81
// Scratch buffer of `size` unsigned values, allocated with malloc by the
// constructors and released with free by the destructor.
struct ass_tmp_buf
{
public:
    // Allocates room for `size` unsigned values (the result is not checked here).
    ass_tmp_buf(size_t size);
    // Allocates a fresh buffer of buf.size values; the contents of `buf` are not copied.
    ass_tmp_buf(const ass_tmp_buf& buf);
    // Frees tmp.
    ~ass_tmp_buf();
    size_t size;    // number of unsigned elements requested
    unsigned *tmp;  // the malloc'ed storage; NULL if the allocation failed
};
  91
  92 struct ass_tmp_buf_get_size
  93 {
  94     const size_t& operator()(const ass_tmp_buf& buf)const
  95     {
  96         return buf.size;
  97     }
  98 };
  99
// File-local constants.
// NOTE(review): no use of either constant is visible in this part of the file -
// confirm they are still needed further down.
static const unsigned int maxcolor = 255;
static const unsigned base = 256;
 102
 103 ass_synth_priv::ass_synth_priv(const double sigma)
 104 {
 105     g_r = 0;
 106     g_w = 0;
 107
 108     g = NULL;
 109     gt2 = NULL;
 110
 111     this->sigma = 0;
 112     generate_tables(sigma);
 113 }
 114
 115 ass_synth_priv::ass_synth_priv(const ass_synth_priv& priv):g_r(priv.g_r),g_w(priv.g_w),sigma(priv.sigma)
 116 {
 117     if (this->g_w > 0 && this != &priv) {
 118         this->g = (unsigned*)realloc(this->g, this->g_w * sizeof(unsigned));
 119         this->gt2 = (unsigned*)realloc(this->gt2, 256 * this->g_w * sizeof(unsigned));
 120         //if (this->g == null || this->gt2 == null) {
 121         //    return -1;
 122         //}
 123         memcpy(g, priv.g, this->g_w * sizeof(unsigned));
 124         memcpy(gt2, priv.gt2, 256 * this->g_w * sizeof(unsigned));
 125     }
 126 }
 127
 128 ass_synth_priv::~ass_synth_priv()
 129 {
 130     free(g); g=NULL;
 131     free(gt2); gt2=NULL;
 132 }
 133
 134 int ass_synth_priv::generate_tables(double sigma)
 135 {
 136     const int TARGET_VOLUME = 1<<VOLUME_BITS;
 137     const int MAX_VOLUME_ERROR = VOLUME_BITS>=22 ? 16 : 1;
 138
 139     double a = -1 / (sigma * sigma * 2);
 140     double exp_a = exp(a);
 141
 142     double volume_factor = 0;
 143     double volume_start =  0, volume_end = 0;
 144     unsigned volume;
 145
 146     if (this->sigma == sigma)
 147         return 0;
 148     else
 149         this->sigma = sigma;
 150
 151     this->g_w = (int)ceil(sigma*3) | 1;
 152     this->g_r = this->g_w / 2;
 153
 154     if (this->g_w > 0) {
 155         this->g = (unsigned*)realloc(this->g, this->g_w * sizeof(unsigned));
 156         this->gt2 = (unsigned*)realloc(this->gt2, 256 * this->g_w * sizeof(unsigned));
 157         if (this->g == NULL || this->gt2 == NULL) {
 158             return -1;
 159         }
 160     }
 161
 162     if (this->g_w > 0) {
 163         volume_start = 0;
 164
 165         double exp_0 = 1.0;
 166         double exp_1 = exp_a;
 167         double exp_2 = exp_1 * exp_1;
 168         volume_start += exp_0;
 169         for(int i=0;i<this->g_r;++i)
 170         {
 171             exp_0 *= exp_1;
 172             exp_1 *= exp_2;
 173             volume_start += exp_0;
 174             volume_start += exp_0;
 175         }
 176         //euqivalent:
 177         //  for (i = 0; i < this->g_w; ++i) {
 178         //      volume_start += exp(a * (i - this->g_r) * (i - this->g_r));
 179         //  }
 180
 181         volume_end = (TARGET_VOLUME+g_w)/volume_start;
 182         volume_start = (TARGET_VOLUME-g_w)/volume_start;
 183
 184         volume = 0;
 185         while( volume_start+0.000001<volume_end )
 186         {
 187             volume_factor = (volume_start+volume_end)*0.5;
 188             volume = 0;
 189
 190             exp_0 = volume_factor;
 191             exp_1 = exp_a;
 192             exp_2 = exp_1 * exp_1;
 193
 194             volume = static_cast<int>(exp_0+.5);
 195             this->g[this->g_r] = volume;
 196
 197             unsigned* p_left = this->g+this->g_r-1;
 198             unsigned* p_right= this->g+this->g_r+1;
 199             for(int i=0; i<this->g_r;++i,p_left--,p_right++)
 200             {
 201                 exp_0 *= exp_1;
 202                 exp_1 *= exp_2;
 203                 *p_left = static_cast<int>(exp_0+.5);
 204                 *p_right = *p_left;
 205                 volume += (*p_left<<1);
 206             }
 207             //equivalent:
 208             //    for (i = 0; i < this->g_w; ++i) {
 209             //        this->g[i] = (unsigned) ( exp(a * (i - this->g_r) * (i - this->g_r))* volume_factor + .5 );
 210             //        volume += this->g[i];
 211             //    }
 212
 213             // volume don't have to be equal to TARGET_VOLUME,
 214             // even if volume=TARGET_VOLUME+MAX_VOLUME_ERROR,
 215             // max error introducing in later blur operation,
 216             // which is (dot_product(g_w, pixel))/TARGET_VOLUME with pixel<256,
 217             // would not exceed (MAX_VOLUME_ERROR*256)/TARGET_VOLUME,
 218             // as long as MAX_VOLUME_ERROR/TARGET_VOLUME is small enough, error introduced would be kept in safe range
 219             //
 220             // NOTE: when it comes to rounding, no matter how small the error is,
 221             // it may result a different rounding output
 222             if( volume>=TARGET_VOLUME && volume< (TARGET_VOLUME+MAX_VOLUME_ERROR) )
 223                 break;
 224             else if(volume < TARGET_VOLUME)
 225             {
 226                 volume_start = volume_factor;
 227             }
 228             else if(volume >= TARGET_VOLUME+MAX_VOLUME_ERROR)
 229             {
 230                 volume_end = volume_factor;
 231             }
 232         }
 233         if(volume==0)
 234         {
 235             volume_factor = volume_end;
 236
 237             exp_0 = volume_factor;
 238             exp_1 = exp_a;
 239             exp_2 = exp_1 * exp_1;
 240
 241             volume = static_cast<int>(exp_0+.5);
 242             this->g[this->g_r] = volume;
 243
 244             unsigned* p_left = this->g+this->g_r-1;
 245             unsigned* p_right= this->g+this->g_r+1;
 246             for(int i=0; i<this->g_r;++i,p_left--,p_right++)
 247             {
 248                 exp_0 *= exp_1;
 249                 exp_1 *= exp_2;
 250                 *p_left = static_cast<int>(exp_0+.5);
 251                 *p_right = *p_left;
 252                 volume += (*p_left<<1);
 253             }
 254             //equivalent:
 255             //    for (i = 0; i < this->g_w; ++i) {
 256             //        this->g[i] = (unsigned) ( exp(a * (i - this->g_r) * (i - this->g_r))* volume_factor + .5 );
 257             //        volume += this->g[i];
 258             //    }
 259         }
 260
 261         // gauss table:
 262         for (int mx = 0; mx < this->g_w; mx++) {
 263             int last_mul = 0;
 264             unsigned *p_gt2 = this->gt2 + mx;
 265             *p_gt2 = 0;
 266             for (int i = 1; i < 256; i++) {
 267                 last_mul = last_mul+this->g[mx];
 268                 p_gt2 += this->g_w;
 269                 *p_gt2 = last_mul;
 270                 //equivalent:
 271                 //    this->gt2[this->g_w * i+ mx] = this->g[mx] * i;
 272             }
 273         }
 274     }
 275     return 0;
 276 }
 277
 278 ass_tmp_buf::ass_tmp_buf(size_t size)
 279 {
 280     tmp = (unsigned *)malloc(size * sizeof(unsigned));
 281     this->size = size;
 282 }
 283
 284 ass_tmp_buf::ass_tmp_buf(const ass_tmp_buf& buf)
 285     :size(buf.size)
 286 {
 287     tmp = (unsigned *)malloc(size * sizeof(unsigned));
 288 }
 289
// Releases the scratch buffer (free(NULL) is a no-op, so a failed allocation is fine).
ass_tmp_buf::~ass_tmp_buf()
{
    free(tmp);
}
 294
 295 /*
 296  * \brief gaussian blur.  an fast pure c implementation from libass.
 297  */
 298 static void ass_gauss_blur(unsigned char *buffer, unsigned *tmp2,
 299                            int width, int height, int stride, const unsigned *m2,
 300                            int r, int mwidth)
 301 {
 302
 303     int x, y;
 304
 305     unsigned char *s = buffer;
 306     unsigned *t = tmp2 + 1;
 307     for (y = 0; y < height; y++) {
 308         memset(t - 1, 0, (width + 1) * sizeof(*t));
 309         x = 0;
 310         if(x < r)//in case that r < 0
 311         {
 312             const int src = s[x];
 313             if (src) {
 314                 register unsigned *dstp = t + x - r;
 315                 int mx;
 316                 const unsigned *m3 = m2 + src * mwidth;
 317                 unsigned sum = 0;
 318                 for (mx = mwidth-1; mx >= r - x ; mx--) {
 319                     sum += m3[mx];
 320                     dstp[mx] += sum;
 321                 }
 322             }
 323         }
 324
 325         for (x = 1; x < r; x++) {
 326             const int src = s[x];
 327             if (src) {
 328                 register unsigned *dstp = t + x - r;
 329                 int mx;
 330                 const unsigned *m3 = m2 + src * mwidth;
 331                 for (mx = r - x; mx < mwidth; mx++) {
 332                     dstp[mx] += m3[mx];
 333                 }
 334             }
 335         }
 336
 337         for (; x < width - r; x++) {
 338             const int src = s[x];
 339             if (src) {
 340                 register unsigned *dstp = t + x - r;
 341                 int mx;
 342                 const unsigned *m3 = m2 + src * mwidth;
 343                 for (mx = 0; mx < mwidth; mx++) {
 344                     dstp[mx] += m3[mx];
 345                 }
 346             }
 347         }
 348
 349         for (; x < width-1; x++) {
 350             const int src = s[x];
 351             if (src) {
 352                 register unsigned *dstp = t + x - r;
 353                 int mx;
 354                 const int x2 = r + width - x;
 355                 const unsigned *m3 = m2 + src * mwidth;
 356                 for (mx = 0; mx < x2; mx++) {
 357                     dstp[mx] += m3[mx];
 358                 }
 359             }
 360         }
 361         if(x==width-1) //important: x==width-1 failed, if r==0
 362         {
 363             const int src = s[x];
 364             if (src) {
 365                 register unsigned *dstp = t + x - r;
 366                 int mx;
 367                 const int x2 = r + width - x;
 368                 const unsigned *m3 = m2 + src * mwidth;
 369                 unsigned sum = 0;
 370                 for (mx = 0; mx < x2; mx++) {
 371                     sum += m3[mx];
 372                     dstp[mx] += sum;
 373                 }
 374             }
 375         }
 376
 377         s += stride;
 378         t += width + 1;
 379     }
 380
 381     t = tmp2;
 382     for (x = 0; x < width; x++) {
 383         y = 0;
 384         if(y < r)//in case that r<0
 385         {
 386             unsigned *srcp = t + y * (width + 1) + 1;
 387             int src = *srcp;
 388             if (src) {
 389                 register unsigned *dstp = srcp - 1 + (mwidth -r +y)*(width + 1);
 390                 const int src2 = (src + (1<<(ass_synth_priv::VOLUME_BITS-1))) >> ass_synth_priv::VOLUME_BITS;
 391                 const unsigned *m3 = m2 + src2 * mwidth;
 392                 unsigned sum = 0;
 393                 int mx;
 394                 *srcp = (1<<(ass_synth_priv::VOLUME_BITS-1));
 395                 for (mx = mwidth-1; mx >=r - y ; mx--) {
 396                     sum += m3[mx];
 397                     *dstp += sum;
 398                     dstp -= width + 1;
 399                 }
 400             }
 401         }
 402         for (y = 1; y < r; y++) {
 403             unsigned *srcp = t + y * (width + 1) + 1;
 404             int src = *srcp;
 405             if (src) {
 406                 register unsigned *dstp = srcp - 1 + width + 1;
 407                 const int src2 = (src + (1<<(ass_synth_priv::VOLUME_BITS-1))) >> ass_synth_priv::VOLUME_BITS;
 408                 const unsigned *m3 = m2 + src2 * mwidth;
 409
 410                 int mx;
 411                 *srcp = (1<<(ass_synth_priv::VOLUME_BITS-1));
 412                 for (mx = r - y; mx < mwidth; mx++) {
 413                     *dstp += m3[mx];
 414                     dstp += width + 1;
 415                 }
 416             }
 417         }
 418         for (; y < height - r; y++) {
 419             unsigned *srcp = t + y * (width + 1) + 1;
 420             int src = *srcp;
 421             if (src) {
 422                 register unsigned *dstp = srcp - 1 - r * (width + 1);
 423                 const int src2 = (src + (1<<(ass_synth_priv::VOLUME_BITS-1))) >> ass_synth_priv::VOLUME_BITS;
 424                 const unsigned *m3 = m2 + src2 * mwidth;
 425
 426                 int mx;
 427                 *srcp = (1<<(ass_synth_priv::VOLUME_BITS-1));
 428                 for (mx = 0; mx < mwidth; mx++) {
 429                     *dstp += m3[mx];
 430                     dstp += width + 1;
 431                 }
 432             }
 433         }
 434         for (; y < height-1; y++) {
 435             unsigned *srcp = t + y * (width + 1) + 1;
 436             int src = *srcp;
 437             if (src) {
 438                 const int y2 = r + height - y;
 439                 register unsigned *dstp = srcp - 1 - r * (width + 1);
 440                 const int src2 = (src + (1<<(ass_synth_priv::VOLUME_BITS-1))) >> ass_synth_priv::VOLUME_BITS;
 441                 const unsigned *m3 = m2 + src2 * mwidth;
 442
 443                 int mx;
 444                 *srcp = (1<<(ass_synth_priv::VOLUME_BITS-1));
 445                 for (mx = 0; mx < y2; mx++) {
 446                     *dstp += m3[mx];
 447                     dstp += width + 1;
 448                 }
 449             }
 450         }
 451         if(y == height - 1)//important: y == height - 1 failed if r==0
 452         {
 453             unsigned *srcp = t + y * (width + 1) + 1;
 454             int src = *srcp;
 455             if (src) {
 456                 const int y2 = r + height - y;
 457                 register unsigned *dstp = srcp - 1 - r * (width + 1);
 458                 const int src2 = (src + (1<<(ass_synth_priv::VOLUME_BITS-1))) >> ass_synth_priv::VOLUME_BITS;
 459                 const unsigned *m3 = m2 + src2 * mwidth;
 460                 unsigned sum = 0;
 461                 int mx;
 462                 *srcp = (1<<(ass_synth_priv::VOLUME_BITS-1));
 463                 for (mx = 0; mx < y2; mx++) {
 464                     sum += m3[mx];
 465                     *dstp += sum;
 466                     dstp += width + 1;
 467                 }
 468             }
 469         }
 470         t++;
 471     }
 472
 473     t = tmp2;
 474     s = buffer;
 475     for (y = 0; y < height; y++) {
 476         for (x = 0; x < width; x++) {
 477             s[x] = t[x] >> ass_synth_priv::VOLUME_BITS;
 478         }
 479         s += stride;
 480         t += width + 1;
 481     }
 482 }
 483
 484 /**
 485  * \brief blur with [[1,2,1]. [2,4,2], [1,2,1]] kernel.
 486  */
 487 static void be_blur(unsigned char *buf, unsigned *tmp_base, int w, int h, int stride)
 488 {
 489     WORD *col_pix_buf_base = reinterpret_cast<WORD*>(xy_malloc(w*sizeof(WORD)));
 490     WORD *col_sum_buf_base = reinterpret_cast<WORD*>(xy_malloc(w*sizeof(WORD)));
 491     if(!col_sum_buf_base || !col_pix_buf_base)
 492     {
 493         //ToDo: error handling
 494         return;
 495     }
 496     memset(col_pix_buf_base, 0, w*sizeof(WORD));
 497     memset(col_sum_buf_base, 0, w*sizeof(WORD));
 498     WORD *col_pix_buf = col_pix_buf_base-2;//for aligment;
 499     WORD *col_sum_buf = col_sum_buf_base-2;//for aligment;
 500     {
 501         int y = 0;
 502         unsigned char *src=buf+y*stride;
 503
 504         int x = 2;
 505         int old_pix = src[x-1];
 506         int old_sum = old_pix + src[x-2];
 507         for ( ; x < w; x++) {
 508             int temp1 = src[x];
 509             int temp2 = old_pix + temp1;
 510             old_pix = temp1;
 511             temp1 = old_sum + temp2;
 512             old_sum = temp2;
 513             col_pix_buf[x] = temp1;
 514         }
 515     }
 516     {
 517         int y = 1;
 518         unsigned char *src=buf+y*stride;
 519
 520
 521         int x = 2;
 522         int old_pix = src[x-1];
 523         int old_sum = old_pix + src[x-2];
 524         for ( ; x < w; x++) {
 525             int temp1 = src[x];
 526             int temp2 = old_pix + temp1;
 527             old_pix = temp1;
 528             temp1 = old_sum + temp2;
 529             old_sum = temp2;
 530
 531             temp2 = col_pix_buf[x] + temp1;
 532             col_pix_buf[x] = temp1;
 533             //dst[x-1] = (col_sum_buf[x] + temp2) >> 4;
 534             col_sum_buf[x] = temp2;
 535         }
 536     }
 537
 538     //__m128i round = _mm_set1_epi16(8);
 539     for (int y = 2; y < h; y++) {
 540         unsigned char *src=buf+y*stride;
 541         unsigned char *dst=buf+(y-1)*stride;
 542
 543
 544         int x = 2;
 545         __m128i old_pix_128 = _mm_cvtsi32_si128(src[1]);
 546         __m128i old_sum_128 = _mm_cvtsi32_si128(src[0]+src[1]);
 547         for ( ; x < ((w-2)&(~7)); x+=8) {
 548             __m128i new_pix = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src+x));
 549             new_pix = _mm_unpacklo_epi8(new_pix, _mm_setzero_si128());
 550             __m128i temp = _mm_slli_si128(new_pix,2);
 551             temp = _mm_add_epi16(temp, old_pix_128);
 552             temp = _mm_add_epi16(temp, new_pix);
 553             old_pix_128 = _mm_srli_si128(new_pix,14);
 554
 555             new_pix = _mm_slli_si128(temp,2);
 556             new_pix = _mm_add_epi16(new_pix, old_sum_128);
 557             new_pix = _mm_add_epi16(new_pix, temp);
 558             old_sum_128 = _mm_srli_si128(temp, 14);
 559
 560             __m128i old_col_pix = _mm_loadu_si128( reinterpret_cast<const __m128i*>(col_pix_buf+x) );
 561             __m128i old_col_sum = _mm_loadu_si128( reinterpret_cast<const __m128i*>(col_sum_buf+x) );
 562             _mm_storeu_si128( reinterpret_cast<__m128i*>(col_pix_buf+x), new_pix );
 563             temp = _mm_add_epi16(new_pix, old_col_pix);
 564             _mm_storeu_si128( reinterpret_cast<__m128i*>(col_sum_buf+x), temp );
 565
 566             old_col_sum = _mm_add_epi16(old_col_sum, temp);
 567             //old_col_sum = _mm_add_epi16(old_col_sum, round);
 568             old_col_sum = _mm_srli_epi16(old_col_sum, 4);
 569             old_col_sum = _mm_packus_epi16(old_col_sum, old_col_sum);
 570             _mm_storel_epi64( reinterpret_cast<__m128i*>(dst+x-1), old_col_sum );
 571         }
 572         int old_pix = src[x-1];
 573         int old_sum = old_pix + src[x-2];
 574         for ( ; x < w; x++) {
 575             int temp1 = src[x];
 576             int temp2 = old_pix + temp1;
 577             old_pix = temp1;
 578             temp1 = old_sum + temp2;
 579             old_sum = temp2;
 580
 581             temp2 = col_pix_buf[x] + temp1;
 582             col_pix_buf[x] = temp1;
 583             dst[x-1] = (col_sum_buf[x] + temp2) >> 4;
 584             col_sum_buf[x] = temp2;
 585         }
 586     }
 587
 588     xy_free(col_sum_buf_base);
 589     xy_free(col_pix_buf_base);
 590 }
 591
 592 static void Bilinear(unsigned char *buf, int w, int h, int stride, int x_factor, int y_factor)
 593 {
 594     WORD *col_pix_buf_base = reinterpret_cast<WORD*>(xy_malloc(w*sizeof(WORD)));
 595     if(!col_pix_buf_base)
 596     {
 597         //ToDo: error handling
 598         return;
 599     }
 600     memset(col_pix_buf_base, 0, w*sizeof(WORD));
 601
 602     for (int y = 0; y < h; y++){
 603         unsigned char *src=buf+y*stride;
 604
 605         WORD *col_pix_buf = col_pix_buf_base;
 606         int last=0;
 607         for(int x = 0; x < w; x++)
 608         {
 609             int temp1 = src[x];
 610             int temp2 = temp1*x_factor;
 611             temp1 <<= 3;
 612             temp1 -= temp2;
 613             temp1 += last;
 614             last = temp2;
 615
 616             temp2 = temp1*y_factor;
 617             temp1 <<= 3;
 618             temp1 -= temp2;
 619             temp1 += col_pix_buf[x];
 620             src[x] = ((temp1+32)>>6);
 621             col_pix_buf[x] = temp2;
 622         }
 623     }
 624     xy_free(col_pix_buf_base);
 625 }
 626
 627 bool Rasterizer::Rasterize(const ScanLineData2& scan_line_data2, int xsub, int ysub, SharedPtrOverlay overlay)
 628 {
 629     using namespace ::boost::flyweights;
 630
 631     if(!overlay)
 632     {
 633         return false;
 634     }
 635     overlay->CleanUp();
 636     const ScanLineData& scan_line_data = *scan_line_data2.m_scan_line_data;
 637     if(!scan_line_data.mWidth || !scan_line_data.mHeight)
 638     {
 639         return true;
 640     }
 641     xsub &= 7;
 642     ysub &= 7;
 643     //xsub = ysub = 0;
 644     int width = scan_line_data.mWidth + xsub;
 645     int height = scan_line_data.mHeight + ysub;
 646     overlay->mfWideOutlineEmpty = scan_line_data2.mWideOutline.empty();
 647     if(!overlay->mfWideOutlineEmpty)
 648     {
 649         int wide_border = (scan_line_data2.mWideBorder+7)&~7;
 650
 651         width += 2*wide_border ;
 652         height += 2*wide_border ;
 653         xsub += wide_border ;
 654         ysub += wide_border ;
 655     }
 656     overlay->mOffsetX = scan_line_data2.mPathOffsetX - xsub;
 657     overlay->mOffsetY = scan_line_data2.mPathOffsetY - ysub;
 658
 659     overlay->mWidth = width;
 660     overlay->mHeight = height;
 661     overlay->mOverlayWidth = ((width+7)>>3) + 1;
 662     overlay->mOverlayHeight = ((height+7)>>3) + 1;
 663     overlay->mOverlayPitch = (overlay->mOverlayWidth+15)&~15;
 664
 665     BYTE* body = reinterpret_cast<BYTE*>(xy_malloc(overlay->mOverlayPitch * overlay->mOverlayHeight));
 666     if( body==NULL )
 667     {
 668         return false;
 669     }
 670     overlay->mBody.reset(body, xy_free);
 671     memset(body, 0, overlay->mOverlayPitch * overlay->mOverlayHeight);
 672     BYTE* border = NULL;
 673     if (!overlay->mfWideOutlineEmpty)
 674     {
 675         border = reinterpret_cast<BYTE*>(xy_malloc(overlay->mOverlayPitch * overlay->mOverlayHeight));
 676         if (border==NULL)
 677         {
 678             return false;
 679         }
 680         overlay->mBorder.reset(border, xy_free);
 681         memset(border, 0, overlay->mOverlayPitch * overlay->mOverlayHeight);
 682     }
 683
 684     // Are we doing a border?
 685     const tSpanBuffer* pOutline[2] = {&(scan_line_data.mOutline), &(scan_line_data2.mWideOutline)};
 686     for(int i = countof(pOutline)-1; i >= 0; i--)
 687     {
 688         tSpanBuffer::const_iterator it = pOutline[i]->begin();
 689         tSpanBuffer::const_iterator itEnd = pOutline[i]->end();
 690         byte* plan_selected = i==0 ? body : border;
 691         int pitch = overlay->mOverlayPitch;
 692         for(; it!=itEnd; ++it)
 693         {
 694             int y = (int)(((*it).first >> 32) - 0x40000000 + ysub);
 695             int x1 = (int)(((*it).first & 0xffffffff) - 0x40000000 + xsub);
 696             int x2 = (int)(((*it).second & 0xffffffff) - 0x40000000 + xsub);
 697             if(x2 > x1)
 698             {
 699                 int first = x1>>3;
 700                 int last = (x2-1)>>3;
 701                 byte* dst = plan_selected + (pitch*(y>>3) + first);
 702                 if(first == last)
 703                     *dst += x2-x1;
 704                 else
 705                 {
 706                     *dst += ((first+1)<<3) - x1;
 707                     dst += 1;
 708                     while(++first < last)
 709                     {
 710                         *dst += 0x08;
 711                         dst += 1;
 712                     }
 713                     *dst += x2 - (last<<3);
 714                 }
 715             }
 716         }
 717     }
 718
 719     return true;
 720 }
 721
 722 // @return: true if actually a blur operation has done, or else false and output is leave unset.
 723 bool Rasterizer::Blur(const Overlay& input_overlay, int fBlur, double fGaussianBlur,
 724     SharedPtrOverlay output_overlay)
 725 {
 726     using namespace ::boost::flyweights;
 727
 728     if(!output_overlay)
 729     {
 730         return false;
 731     }
 732     output_overlay->CleanUp();
 733
 734     output_overlay->mOffsetX = input_overlay.mOffsetX;
 735     output_overlay->mOffsetY = input_overlay.mOffsetY;
 736     output_overlay->mWidth = input_overlay.mWidth;
 737     output_overlay->mHeight = input_overlay.mHeight;
 738     output_overlay->mOverlayWidth = input_overlay.mOverlayWidth;
 739     output_overlay->mOverlayHeight = input_overlay.mOverlayHeight;
 740     output_overlay->mfWideOutlineEmpty = input_overlay.mfWideOutlineEmpty;
 741
 742     int bluradjust = 0;
 743     if(fBlur || fGaussianBlur > 0.1)
 744     {
 745         if (fGaussianBlur > 0)
 746             bluradjust += (int)(fGaussianBlur*3*8 + 0.5) | 1;
 747         if (fBlur)
 748             bluradjust += 8;
 749         // Expand the buffer a bit when we're blurring, since that can also widen the borders a bit
 750         bluradjust = (bluradjust+7)&~7;
 751
 752         output_overlay->mOffsetX -= bluradjust;
 753         output_overlay->mOffsetY -= bluradjust;
 754         output_overlay->mWidth += (bluradjust<<1);
 755         output_overlay->mHeight += (bluradjust<<1);
 756         output_overlay->mOverlayWidth += (bluradjust>>2);
 757         output_overlay->mOverlayHeight += (bluradjust>>2);
 758     }
 759     else
 760     {
 761         return false;
 762     }
 763
 764     output_overlay->mOverlayPitch = (output_overlay->mOverlayWidth+15)&~15;
 765
 766     BYTE* body = reinterpret_cast<BYTE*>(xy_malloc(output_overlay->mOverlayPitch * output_overlay->mOverlayHeight));
 767     if( body==NULL )
 768     {
 769         return false;
 770     }
 771     output_overlay->mBody.reset(body, xy_free);
 772     memset(body, 0, output_overlay->mOverlayPitch * output_overlay->mOverlayHeight);
 773     BYTE* border = NULL;
 774     if (!output_overlay->mfWideOutlineEmpty)
 775     {
 776         border = reinterpret_cast<BYTE*>(xy_malloc(output_overlay->mOverlayPitch * output_overlay->mOverlayHeight));
 777         if (border==NULL)
 778         {
 779             return false;
 780         }
 781         output_overlay->mBorder.reset(border, xy_free);
 782         memset(border, 0, output_overlay->mOverlayPitch * output_overlay->mOverlayHeight);
 783     }
 784
 785     //copy buffer
 786     for(int i = 1; i >= 0; i--)
 787     {
 788         byte* plan_selected = i==0 ? body : border;
 789         const byte* plan_input = i==0 ? input_overlay.mBody.get() : input_overlay.mBorder.get();
 790
 791         plan_selected += (bluradjust>>3) + (bluradjust>>3)*output_overlay->mOverlayPitch;
 792         if ( plan_selected!=NULL && plan_input!=NULL )
 793         {
 794             for (int j=0;j<input_overlay.mOverlayHeight;j++)
 795             {
 796                 memcpy(plan_selected, plan_input, input_overlay.mOverlayPitch);
 797                 plan_selected += output_overlay->mOverlayPitch;
 798                 plan_input += input_overlay.mOverlayPitch;
 799             }
 800         }
 801     }
 802
 803     ass_tmp_buf tmp_buf( max((output_overlay->mOverlayPitch+1)*(output_overlay->mOverlayHeight+1),0) );
 804     //flyweight<key_value<int, ass_tmp_buf, ass_tmp_buf_get_size>, no_locking> tmp_buf((overlay->mOverlayWidth+1)*(overlay->mOverlayPitch+1));
 805     // Do some gaussian blur magic
 806     if (fGaussianBlur > 0.1)//(fGaussianBlur > 0) return true even if fGaussianBlur very small
 807     {
 808         byte* plan_selected= output_overlay->mfWideOutlineEmpty ? body : border;
 809         flyweight<key_value<double, ass_synth_priv, ass_synth_priv_key>, no_locking> fw_priv_blur(fGaussianBlur);
 810         const ass_synth_priv& priv_blur = fw_priv_blur.get();
 811         if (output_overlay->mOverlayWidth>=priv_blur.g_w && output_overlay->mOverlayHeight>=priv_blur.g_w)
 812         {
 813             ass_gauss_blur(plan_selected, tmp_buf.tmp, output_overlay->mOverlayWidth, output_overlay->mOverlayHeight, output_overlay->mOverlayPitch,
 814                 priv_blur.gt2, priv_blur.g_r, priv_blur.g_w);
 815         }
 816     }
 817
 818     for (int pass = 0; pass < fBlur; pass++)
 819     {
 820         if(output_overlay->mOverlayWidth >= 3 && output_overlay->mOverlayHeight >= 3)
 821         {
 822             int pitch = output_overlay->mOverlayPitch;
 823             byte* plan_selected= output_overlay->mfWideOutlineEmpty ? body : border;
 824             be_blur(plan_selected, tmp_buf.tmp, output_overlay->mOverlayWidth, output_overlay->mOverlayHeight, pitch);
 825         }
 826     }
 827     return true;
 828 }
 829
 830 ///////////////////////////////////////////////////////////////////////////
 831
 832 static __forceinline void pixmix(DWORD *dst, DWORD color, DWORD alpha)
 833 {
 834     int a = alpha;
 835     // Make sure both a and ia are in range 1..256 for the >>8 operations below to be correct
 836     int ia = 256-a;
 837     a+=1;
 838     *dst = ((((*dst&0x00ff00ff)*ia + (color&0x00ff00ff)*a)&0xff00ff00)>>8)
 839            | ((((*dst&0x0000ff00)*ia + (color&0x0000ff00)*a)&0x00ff0000)>>8)
 840            | ((((*dst>>8)&0x00ff0000)*ia)&0xff000000);
 841 }
 842
 843 static __forceinline void pixmix2(DWORD *dst, DWORD color, DWORD shapealpha, DWORD clipalpha)
 844 {
 845     int a = (((shapealpha)*(clipalpha)*(color>>24))>>12)&0xff;
 846     int ia = 256-a;
 847     a+=1;
 848     *dst = ((((*dst&0x00ff00ff)*ia + (color&0x00ff00ff)*a)&0xff00ff00)>>8)
 849            | ((((*dst&0x0000ff00)*ia + (color&0x0000ff00)*a)&0x00ff0000)>>8)
 850            | ((((*dst>>8)&0x00ff0000)*ia)&0xff000000);
 851 }
 852
 853 #include <xmmintrin.h>
 854 #include <emmintrin.h>
 855
 856 static __forceinline void pixmix_sse2(DWORD* dst, DWORD color, DWORD alpha)
 857 {
 858 //    alpha = (((alpha) * (color>>24)) >> 6) & 0xff;
 859     color &= 0xffffff;
 860     __m128i zero = _mm_setzero_si128();
 861     __m128i a = _mm_set1_epi32(((alpha+1) << 16) | (0x100 - alpha));
 862     __m128i d = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*dst), zero);
 863     __m128i s = _mm_unpacklo_epi8(_mm_cvtsi32_si128(color), zero);
 864     __m128i r = _mm_unpacklo_epi16(d, s);
 865     r = _mm_madd_epi16(r, a);
 866     r = _mm_srli_epi32(r, 8);
 867     r = _mm_packs_epi32(r, r);
 868     r = _mm_packus_epi16(r, r);
 869     *dst = (DWORD)_mm_cvtsi128_si32(r);
 870 }
 871
 872 static __forceinline void pixmix2_sse2(DWORD* dst, DWORD color, DWORD shapealpha, DWORD clipalpha)
 873 {
 874     int alpha = (((shapealpha)*(clipalpha)*(color>>24))>>12)&0xff;
 875     color &= 0xffffff;
 876     __m128i zero = _mm_setzero_si128();
 877     __m128i a = _mm_set1_epi32(((alpha+1) << 16) | (0x100 - alpha));
 878     __m128i d = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*dst), zero);
 879     __m128i s = _mm_unpacklo_epi8(_mm_cvtsi32_si128(color), zero);
 880     __m128i r = _mm_unpacklo_epi16(d, s);
 881     r = _mm_madd_epi16(r, a);
 882     r = _mm_srli_epi32(r, 8);
 883     r = _mm_packs_epi32(r, r);
 884     r = _mm_packus_epi16(r, r);
 885     *dst = (DWORD)_mm_cvtsi128_si32(r);
 886 }
 887
 888 #include <mmintrin.h>
 889
 890 // Calculate a - b clamping to 0 instead of underflowing
 891 static __forceinline DWORD safe_subtract(DWORD a, DWORD b)
 892 {
 893     __m64 ap = _mm_cvtsi32_si64(a);
 894     __m64 bp = _mm_cvtsi32_si64(b);
 895     __m64 rp = _mm_subs_pu16(ap, bp);
 896     DWORD r = (DWORD)_mm_cvtsi64_si32(rp);
 897     _mm_empty();
 898     return r;
 899     //return (b > a) ? 0 : a - b;
 900 }
 901
 902 /***
 903  * No aligned requirement
 904  *
 905  **/
 906 void AlphaBlt(byte* pY,
 907     const byte* pAlphaMask,
 908     const byte Y,
 909     int h, int w, int src_stride, int dst_stride)
 910 {
 911     __m128i zero = _mm_setzero_si128();
 912     __m128i s = _mm_set1_epi16(Y);               //s = c  0  c  0  c  0  c  0  c  0  c  0  c  0  c  0
 913
 914     if( w>16 )//IMPORTANT! The result of the following code is undefined with w<15.
 915     {
 916         for( ; h>0; h--, pAlphaMask += src_stride, pY += dst_stride )
 917         {
 918             const BYTE* sa = pAlphaMask;
 919             BYTE* dy = pY;
 920             const BYTE* dy_first_mod16 = reinterpret_cast<BYTE*>((reinterpret_cast<int>(pY)+15)&~15);  //IMPORTANT! w must >= 15
 921             const BYTE* dy_end_mod16 = reinterpret_cast<BYTE*>(reinterpret_cast<int>(pY+w)&~15);
 922             const BYTE* dy_end = pY + w;
 923
 924             for(;dy < dy_first_mod16; sa++, dy++)
 925             {
 926                 *dy = (*dy * (256 - *sa)+ Y*(*sa+1))>>8;
 927             }
 928             for(; dy < dy_end_mod16; sa+=8, dy+=16)
 929             {
 930                 __m128i a = _mm_loadl_epi64((__m128i*)sa);
 931
 932                 //Y
 933                 __m128i d = _mm_load_si128((__m128i*)dy);
 934
 935                 //__m128i ones = _mm_cmpeq_epi32(zero,zero); //ones = ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
 936                 //__m128i ia = _mm_xor_si128(a,ones);        //ia   = ~a
 937                 //ia = _mm_unpacklo_epi8(ia,zero);           //ia   = ~a0 0 ~a1 0 ~a2 0 ~a3 0 ~a4 0 ~a5 0 ~a6 0 ~a7 0
 938                 a = _mm_unpacklo_epi8(a,zero);               //a= a0 0  a1 0  a2 0  a3 0  a4 0  a5 0  a6 0  a7 0
 939                 __m128i ones = _mm_set1_epi16(256);          //ones = 0  1  0  1  0  1  0  1  0  1  0  1  0  1  0  1
 940                 __m128i ia = _mm_sub_epi16(ones, a);         //ia   = 256-a0 ... 256-a7
 941                 ones = _mm_srli_epi16(ones, 8);
 942                 a = _mm_add_epi16(a, ones);                  //a= 1+a0 ... 1+a7
 943
 944                 __m128i dl = _mm_unpacklo_epi8(d,zero);               //d    = b0 0  b1 0  b2 0  b3 0  b4 0  b5 0  b6 0  b7 0
 945                 __m128i sl = _mm_mullo_epi16(s,a);            //sl   = c0*a0  c1*a1  ... c7*a7
 946
 947                 dl = _mm_mullo_epi16(dl,ia);                   //d    = b0*~a0 b1*~a1 ... b7*~a7
 948
 949                 dl = _mm_add_epi16(dl,sl);                     //d   = d + sl
 950                 dl = _mm_srli_epi16(dl, 8);                    //d   = d>>8
 951
 952                 sa += 8;
 953                 a = _mm_loadl_epi64((__m128i*)sa);
 954
 955                 a = _mm_unpacklo_epi8(a,zero);
 956                 ones = _mm_slli_epi16(ones, 8);
 957                 ia = _mm_sub_epi16(ones, a);
 958                 ones = _mm_srli_epi16(ones, 8);
 959                 a = _mm_add_epi16(a,ones);
 960
 961                 d = _mm_unpackhi_epi8(d,zero);
 962                 sl = _mm_mullo_epi16(s,a);
 963                 d = _mm_mullo_epi16(d,ia);
 964                 d = _mm_add_epi16(d,sl);
 965                 d = _mm_srli_epi16(d, 8);
 966
 967                 dl = _mm_packus_epi16(dl,d);
 968
 969                 _mm_store_si128((__m128i*)dy, dl);
 970             }
 971             for(;dy < dy_end; sa++, dy++)
 972             {
 973                 *dy = (*dy * (256 - *sa)+ Y*(*sa+1))>>8;
 974             }
 975         }
 976     }
 977     else
 978     {
 979         for( ; h>0; h--, pAlphaMask += src_stride, pY += dst_stride )
 980         {
 981             const BYTE* sa = pAlphaMask;
 982             BYTE* dy = pY;
 983             const BYTE* dy_end = pY + w;
 984
 985             for(;dy < dy_end; sa++, dy++)
 986             {
 987                 *dy = (*dy * (256 - *sa)+ Y*(*sa+1))>>8;
 988             }
 989         }
 990     }
 991     //__asm emms;
 992 }
 993
 994 /***
 995  * No aligned requirement
 996  *
 997  **/
 998 void AlphaBlt(byte* pY,
 999     const byte alpha,
1000     const byte Y,
1001     int h, int w, int dst_stride)
1002 {
1003     int yPremul = Y*(alpha+1);
1004     int dstAlpha = 0x100 - alpha;
1005     if( w>32 )//IMPORTANT! The result of the following code is undefined with w<15.
1006     {
1007         __m128i zero = _mm_setzero_si128();
1008         __m128i s = _mm_set1_epi16(yPremul);    //s = c  0  c  0  c  0  c  0  c  0  c  0  c  0  c  0
1009         __m128i ia = _mm_set1_epi16(dstAlpha);
1010         for( ; h>0; h--, pY += dst_stride )
1011         {
1012             BYTE* dy = pY;
1013             const BYTE* dy_first_mod16 = reinterpret_cast<BYTE*>((reinterpret_cast<int>(pY)+15)&~15);  //IMPORTANT! w must >= 15
1014             const BYTE* dy_end_mod16 = reinterpret_cast<BYTE*>(reinterpret_cast<int>(pY+w)&~15);
1015             const BYTE* dy_end = pY + w;
1016
1017             for(;dy < dy_first_mod16; dy++)
1018             {
1019                 *dy = (*dy * dstAlpha + yPremul)>>8;
1020             }
1021             for(; dy < dy_end_mod16; dy+=16)
1022             {
1023                 //Y
1024                 __m128i d = _mm_load_si128(reinterpret_cast<const __m128i*>(dy));
1025                 __m128i dl = _mm_unpacklo_epi8(d,zero);        //d    = b0 0  b1 0  b2 0  b3 0  b4 0  b5 0  b6 0  b7 0
1026
1027                 dl = _mm_mullo_epi16(dl,ia);                   //d    = b0*~a0 b1*~a1 ... b7*~a7
1028                 dl = _mm_adds_epu16(dl,s);                     //d   = d + s
1029                 dl = _mm_srli_epi16(dl, 8);                    //d   = d>>8
1030
1031                 d = _mm_unpackhi_epi8(d,zero);
1032                 d = _mm_mullo_epi16(d,ia);
1033                 d = _mm_adds_epu16(d,s);
1034                 d = _mm_srli_epi16(d, 8);
1035
1036                 dl = _mm_packus_epi16(dl,d);
1037
1038                 _mm_store_si128(reinterpret_cast<__m128i*>(dy), dl);
1039             }
1040             for(;dy < dy_end; dy++)
1041             {
1042                 *dy = (*dy * dstAlpha + yPremul)>>8;
1043             }
1044         }
1045     }
1046     else
1047     {
1048         for( ; h>0; h--, pY += dst_stride )
1049         {
1050             BYTE* dy = pY;
1051             const BYTE* dy_end = pY + w;
1052
1053             for(;dy < dy_end; dy++)
1054             {
1055                 *dy = (*dy * dstAlpha + yPremul)>>8;
1056             }
1057         }
1058     }
1059     //__asm emms;
1060 }
1061
1062 /***
1063  * No aligned requirement
1064  *
1065  **/
1066 void AlphaBltC(byte* pY,
1067     const byte alpha,
1068     const byte Y,
1069     int h, int w, int dst_stride)
1070 {
1071     int yPremul = Y*(alpha+1);
1072     int dstAlpha = 0x100 - alpha;
1073
1074     for( ; h>0; h--, pY += dst_stride )
1075     {
1076         BYTE* dy = pY;
1077         const BYTE* dy_end = pY + w;
1078
1079         for(;dy < dy_end; dy++)
1080         {
1081             *dy = (*dy * dstAlpha + yPremul)>>8;
1082         }
1083     }
1084 }
1085
1086 // For CPUID usage in Rasterizer::Draw
1087 #include "../dsutil/vd.h"
1088
1089 void OverlapRegion(tSpanBuffer& dst, const tSpanBuffer& src, int dx, int dy)
1090 {
1091     tSpanBuffer temp;
1092     temp.reserve(dst.size() + src.size());
1093     dst.swap(temp);
1094     tSpanBuffer::iterator itA = temp.begin();
1095     tSpanBuffer::iterator itAE = temp.end();
1096     tSpanBuffer::const_iterator itB = src.begin();
1097     tSpanBuffer::const_iterator itBE = src.end();
1098     // Don't worry -- even if dy<0 this will still work! // G: hehe, the evil twin :)
1099     unsigned __int64 offset1 = (((__int64)dy)<<32) - dx;
1100     unsigned __int64 offset2 = (((__int64)dy)<<32) + dx;
1101     while(itA != itAE && itB != itBE)
1102     {
1103         if((*itB).first + offset1 < (*itA).first)
1104         {
1105             // B span is earlier.  Use it.
1106             unsigned __int64 x1 = (*itB).first + offset1;
1107             unsigned __int64 x2 = (*itB).second + offset2;
1108             ++itB;
1109             // B spans don't overlap, so begin merge loop with A first.
1110             for(;;)
1111             {
1112                 // If we run out of A spans or the A span doesn't overlap,
1113                 // then the next B span can't either (because B spans don't
1114                 // overlap) and we exit.
1115                 if(itA == itAE || (*itA).first > x2)
1116                     break;
1117                 do {x2 = _MAX(x2, (*itA++).second);}
1118                 while(itA != itAE && (*itA).first <= x2);
1119                 // If we run out of B spans or the B span doesn't overlap,
1120                 // then the next A span can't either (because A spans don't
1121                 // overlap) and we exit.
1122                 if(itB == itBE || (*itB).first + offset1 > x2)
1123                     break;
1124                 do {x2 = _MAX(x2, (*itB++).second + offset2);}
1125                 while(itB != itBE && (*itB).first + offset1 <= x2);
1126             }
1127             // Flush span.
1128             dst.push_back(tSpan(x1, x2));
1129         }
1130         else
1131         {
1132             // A span is earlier.  Use it.
1133             unsigned __int64 x1 = (*itA).first;
1134             unsigned __int64 x2 = (*itA).second;
1135             ++itA;
1136             // A spans don't overlap, so begin merge loop with B first.
1137             for(;;)
1138             {
1139                 // If we run out of B spans or the B span doesn't overlap,
1140                 // then the next A span can't either (because A spans don't
1141                 // overlap) and we exit.
1142                 if(itB == itBE || (*itB).first + offset1 > x2)
1143                     break;
1144                 do {x2 = _MAX(x2, (*itB++).second + offset2);}
1145                 while(itB != itBE && (*itB).first + offset1 <= x2);
1146                 // If we run out of A spans or the A span doesn't overlap,
1147                 // then the next B span can't either (because B spans don't
1148                 // overlap) and we exit.
1149                 if(itA == itAE || (*itA).first > x2)
1150                     break;
1151                 do {x2 = _MAX(x2, (*itA++).second);}
1152                 while(itA != itAE && (*itA).first <= x2);
1153             }
1154             // Flush span.
1155             dst.push_back(tSpan(x1, x2));
1156         }
1157     }
1158     // Copy over leftover spans.
1159     while(itA != itAE)
1160         dst.push_back(*itA++);
1161     while(itB != itBE)
1162     {
1163         dst.push_back(tSpan((*itB).first + offset1, (*itB).second + offset2));
1164         ++itB;
1165     }
1166 }
1167
1168 // Render a subpicture onto a surface.
1169 // spd is the surface to render on.
1170 // clipRect is a rectangular clip region to render inside.
1171 // pAlphaMask is an alpha clipping mask.
1172 // xsub and ysub ???
1173 // switchpts seems to be an array of fill colours interlaced with coordinates.
1174 //    switchpts[i*2] contains a colour and switchpts[i*2+1] contains the coordinate to use that colour from
1175 // fBody tells whether to render the body of the subs.
1176 // fBorder tells whether to render the border of the subs.
1177 SharedPtrByte Rasterizer::CompositeAlphaMask(const SharedPtrOverlay& overlay, const CRect& clipRect,
1178     const GrayImage2* alpha_mask,
1179     int xsub, int ysub, const DWORD* switchpts, bool fBody, bool fBorder,
1180     CRect *outputDirtyRect)
1181 {
1182     //fix me: check and log error
1183     SharedPtrByte result;
1184     *outputDirtyRect = CRect(0, 0, 0, 0);
1185     if (!switchpts || !fBody && !fBorder) return result;
1186     if (fBorder && !overlay->mBorder) return result;
1187
1188     CRect r = clipRect;
1189     if (alpha_mask!=NULL)
1190     {
1191         r &= CRect(alpha_mask->left_top, alpha_mask->size);
1192     }
1193
1194     // Remember that all subtitle coordinates are specified in 1/8 pixels
1195     // (x+4)>>3 rounds to nearest whole pixel.
1196     // ??? What is xsub, ysub, mOffsetX and mOffsetY ?
1197     int x = (xsub + overlay->mOffsetX + 4)>>3;
1198     int y = (ysub + overlay->mOffsetY + 4)>>3;
1199     int w = overlay->mOverlayWidth;
1200     int h = overlay->mOverlayHeight;
1201     int xo = 0, yo = 0;
1202     // Again, limiting?
1203     if(x < r.left) {xo = r.left-x; w -= r.left-x; x = r.left;}
1204     if(y < r.top) {yo = r.top-y; h -= r.top-y; y = r.top;}
1205     if(x+w > r.right) w = r.right-x;
1206     if(y+h > r.bottom) h = r.bottom-y;
1207     // Check if there's actually anything to render
1208     if(w <= 0 || h <= 0) return(result);
1209     outputDirtyRect->SetRect(x, y, x+w, y+h);
1210
1211     bool fSingleColor = (switchpts[1]==0xffffffff);
1212
1213     // draw
1214     // Grab the first colour
1215     DWORD color = switchpts[0];
1216     byte* s_base = (byte*)xy_malloc(overlay->mOverlayPitch * overlay->mOverlayHeight);
1217     const byte* alpha_mask_data = alpha_mask != NULL ? alpha_mask->data.get() : NULL;
1218     const int alpha_mask_pitch = alpha_mask != NULL ? alpha_mask->pitch : 0;
1219     if(alpha_mask_data!=NULL )
1220         alpha_mask_data += alpha_mask->pitch * y + x - alpha_mask->left_top.y*alpha_mask->pitch - alpha_mask->left_top.x;
1221
1222     if(fSingleColor)
1223     {
1224         overlay->FillAlphaMash(s_base, fBody, fBorder, xo, yo, w, h,
1225             alpha_mask_data, alpha_mask_pitch,
1226             color>>24 );
1227     }
1228     else
1229     {
1230         int last_x = xo;
1231         const DWORD *sw = switchpts;
1232         while( last_x<w+xo )
1233         {
1234             byte alpha = sw[0]>>24;
1235             while( sw[3]<w+xo && (sw[2]>>24)==alpha )
1236             {
1237                 sw += 2;
1238             }
1239             int new_x = sw[3] < w+xo ? sw[3] : w+xo;
1240             overlay->FillAlphaMash(s_base, fBody, fBorder,
1241                 last_x, yo, new_x-last_x, h,
1242                 alpha_mask_data, alpha_mask_pitch,
1243                 alpha );
1244             last_x = new_x;
1245             sw += 2;
1246         }
1247     }
1248     result.reset( s_base, xy_free );
1249     return result;
1250 }
1251
1252
1253 //
1254 // draw overlay[clipRect] to bitmap[0,0,w,h]
1255 //
1256 void Rasterizer::Draw(XyBitmap* bitmap, SharedPtrOverlay overlay, const CRect& clipRect, byte* s_base,
1257     int xsub, int ysub, const DWORD* switchpts, bool fBody, bool fBorder)
1258 {
1259     if (!switchpts || !fBody && !fBorder) return;
1260     if (bitmap==NULL)
1261     {
1262         ASSERT(0);
1263         return;
1264     }
1265     // clip
1266     // Limit drawn area to rectangular clip area
1267     CRect r = clipRect;
1268     // Remember that all subtitle coordinates are specified in 1/8 pixels
1269     // (x+4)>>3 rounds to nearest whole pixel.
1270     int overlayPitch = overlay->mOverlayPitch;
1271     int x = (xsub + overlay->mOffsetX + 4)>>3;
1272     int y = (ysub + overlay->mOffsetY + 4)>>3;
1273     int w = overlay->mOverlayWidth;
1274     int h = overlay->mOverlayHeight;
1275     int xo = 0, yo = 0;
1276
1277     if(x < r.left) {xo = r.left-x; w -= r.left-x; x = r.left;}
1278     if(y < r.top) {yo = r.top-y; h -= r.top-y; y = r.top;}
1279     if(x+w > r.right) w = r.right-x;
1280     if(y+h > r.bottom) h = r.bottom-y;
1281     // Check if there's actually anything to render
1282     if (w <= 0 || h <= 0) return;
1283     // must have enough space to draw into
1284     ASSERT(x >= bitmap->x && y >= bitmap->y && x+w <= bitmap->x + bitmap->w && y+h <= bitmap->y + bitmap->h );
1285
1286     // CPUID from VDub
1287     bool fSSE2 = !!(g_cpuid.m_flags & CCpuID::sse2);
1288     bool fSingleColor = (switchpts[1]==0xffffffff);
1289     bool PLANAR = (bitmap->type==XyBitmap::PLANNA);
1290     int draw_method = 0;
1291     if(fSingleColor)
1292         draw_method |= DM::SINGLE_COLOR;
1293     if(fSSE2)
1294         draw_method |= DM::SSE2;
1295     if(PLANAR)
1296         draw_method |= DM::AYUV_PLANAR;
1297
1298     // draw
1299     // Grab the first colour
1300     DWORD color = switchpts[0];
1301     const byte* s = s_base + overlay->mOverlayPitch*yo + xo;
1302
1303     int dst_offset = 0;
1304     if (bitmap->type==XyBitmap::PLANNA)
1305         dst_offset = bitmap->pitch*(y-bitmap->y) + x - bitmap->x;
1306     else
1307         dst_offset = bitmap->pitch*(y-bitmap->y) + (x - bitmap->x)*4;
1308     unsigned long* dst = (unsigned long*)((BYTE*)bitmap->plans[0] + dst_offset);
1309
1310     // Every remaining line in the bitmap to be rendered...
1311     switch(draw_method)
1312     {
1313     case   DM::SINGLE_COLOR |   DM::SSE2 | 0*DM::AYUV_PLANAR :
1314     {
1315         while(h--)
1316         {
1317             for(int wt=0; wt<w; ++wt)
1318                 // The <<6 is due to pixmix expecting the alpha parameter to be
1319                 // the multiplication of two 6-bit unsigned numbers but we
1320                 // only have one here. (No alpha mask.)
1321                 pixmix_sse2(&dst[wt], color, s[wt]);
1322             s += overlayPitch;
1323             dst = (unsigned long *)((char *)dst + bitmap->pitch);
1324         }
1325     }
1326     break;
1327     case   DM::SINGLE_COLOR | 0*DM::SSE2 | 0*DM::AYUV_PLANAR :
1328     {
1329         while(h--)
1330         {
1331             for(int wt=0; wt<w; ++wt)
1332                 pixmix(&dst[wt], color, s[wt]);
1333             s += overlayPitch;
1334             dst = (unsigned long *)((char *)dst + bitmap->pitch);
1335         }
1336     }
1337     break;
1338     case 0*DM::SINGLE_COLOR |   DM::SSE2 | 0*DM::AYUV_PLANAR :
1339     {
1340         while(h--)
1341         {
1342             const DWORD *sw = switchpts;
1343             for(int wt=0; wt<w; ++wt)
1344             {
1345                 // xo is the offset (usually negative) we have moved into the image
1346                 // So if we have passed the switchpoint (?) switch to another colour
1347                 // (So switchpts stores both colours *and* coordinates?)
1348                 if(wt+xo >= sw[1]) {while(wt+xo >= sw[1]) sw += 2; color = sw[-2];}
1349                 pixmix_sse2(&dst[wt], color, s[wt]);
1350             }
1351             s += overlayPitch;
1352             dst = (unsigned long *)((char *)dst + bitmap->pitch);
1353         }
1354     }
1355     break;
1356     case 0*DM::SINGLE_COLOR | 0*DM::SSE2 | 0*DM::AYUV_PLANAR :
1357     {
1358         while(h--)
1359         {
1360             const DWORD *sw = switchpts;
1361             for(int wt=0; wt<w; ++wt)
1362             {
1363                 if(wt+xo >= sw[1]) {while(wt+xo >= sw[1]) sw += 2; color = sw[-2];}
1364                 pixmix(&dst[wt], color, s[wt]);
1365             }
1366             s += overlayPitch;
1367             dst = (unsigned long *)((char *)dst + bitmap->pitch);
1368         }
1369     }
1370     break;
1371     case   DM::SINGLE_COLOR |   DM::SSE2 |   DM::AYUV_PLANAR :
1372     {
1373         unsigned char* dst_A = bitmap->plans[0] + dst_offset;
1374         unsigned char* dst_Y = bitmap->plans[1] + dst_offset;
1375         unsigned char* dst_U = bitmap->plans[2] + dst_offset;
1376         unsigned char* dst_V = bitmap->plans[3] + dst_offset;
1377
1378         AlphaBlt(dst_Y, s, ((color)>>16)&0xff, h, w, overlayPitch, bitmap->pitch);
1379         AlphaBlt(dst_U, s, ((color)>>8)&0xff, h, w, overlayPitch, bitmap->pitch);
1380         AlphaBlt(dst_V, s, ((color))&0xff, h, w, overlayPitch, bitmap->pitch);
1381         AlphaBlt(dst_A, s, 0, h, w, overlayPitch, bitmap->pitch);
1382     }
1383     break;
1384     case 0*DM::SINGLE_COLOR |   DM::SSE2 |   DM::AYUV_PLANAR :
1385     {
1386         unsigned char* dst_A = bitmap->plans[0] + dst_offset;
1387         unsigned char* dst_Y = bitmap->plans[1] + dst_offset;
1388         unsigned char* dst_U = bitmap->plans[2] + dst_offset;
1389         unsigned char* dst_V = bitmap->plans[3] + dst_offset;
1390
1391         const DWORD *sw = switchpts;
1392         int last_x = xo;
1393         color = sw[0];
1394         while(last_x<w+xo)
1395         {
1396             int new_x = sw[3] < w+xo ? sw[3] : w+xo;
1397             color = sw[0];
1398             sw += 2;
1399             if( new_x < last_x )
1400                 continue;
1401             AlphaBlt(dst_Y, s + last_x - xo, (color>>16)&0xff, h, new_x-last_x, overlayPitch, bitmap->pitch);
1402             AlphaBlt(dst_U, s + last_x - xo, (color>>8)&0xff, h, new_x-last_x, overlayPitch, bitmap->pitch);
1403             AlphaBlt(dst_V, s + last_x - xo, (color)&0xff, h, new_x-last_x, overlayPitch, bitmap->pitch);
1404             AlphaBlt(dst_A, s + last_x - xo, 0, h, new_x-last_x, overlayPitch, bitmap->pitch);
1405
1406             dst_A += new_x - last_x;
1407             dst_Y += new_x - last_x;
1408             dst_U += new_x - last_x;
1409             dst_V += new_x - last_x;
1410             last_x = new_x;
1411         }
1412     }
1413     break;
1414     case   DM::SINGLE_COLOR | 0*DM::SSE2 |   DM::AYUV_PLANAR :
1415     {
1416 //        char * debug_dst=(char*)dst;int h2 = h;
1417 //        XY_DO_ONCE( xy_logger::write_file("G:\\b2_rt", (char*)&color, sizeof(color)) );
1418 //        XY_DO_ONCE( xy_logger::write_file("G:\\b2_rt", debug_dst, (h2-1)*spd.pitch) );
1419 //        debug_dst += spd.pitch*spd.h;
1420 //        XY_DO_ONCE( xy_logger::write_file("G:\\b2_rt", debug_dst, (h2-1)*spd.pitch) );
1421 //        debug_dst += spd.pitch*spd.h;
1422 //        XY_DO_ONCE( xy_logger::write_file("G:\\b2_rt", debug_dst, (h2-1)*spd.pitch) );
1423 //        debug_dst += spd.pitch*spd.h;
1424 //        XY_DO_ONCE( xy_logger::write_file("G:\\b2_rt", debug_dst, (h2-1)*spd.pitch) );
1425 //        debug_dst=(char*)dst;
1426
1427         unsigned char* dst_A = bitmap->plans[0] + dst_offset;
1428         unsigned char* dst_Y = bitmap->plans[1] + dst_offset;
1429         unsigned char* dst_U = bitmap->plans[2] + dst_offset;
1430         unsigned char* dst_V = bitmap->plans[3] + dst_offset;
1431         while(h--)
1432         {
1433             for(int wt=0; wt<w; ++wt)
1434             {
1435                 DWORD temp = COMBINE_AYUV(dst_A[wt], dst_Y[wt], dst_U[wt], dst_V[wt]);
1436                 pixmix(&temp, color, s[wt]);
1437                 SPLIT_AYUV(temp, dst_A+wt, dst_Y+wt, dst_U+wt, dst_V+wt);
1438             }
1439             s += overlayPitch;
1440             dst_A += bitmap->pitch;
1441             dst_Y += bitmap->pitch;
1442             dst_U += bitmap->pitch;
1443             dst_V += bitmap->pitch;
1444         }
1445 //        XY_DO_ONCE( xy_logger::write_file("G:\\a2_rt", debug_dst, (h2-1)*spd.pitch) );
1446 //        debug_dst += spd.pitch*spd.h;
1447 //        XY_DO_ONCE( xy_logger::write_file("G:\\a2_rt", debug_dst, (h2-1)*spd.pitch) );
1448 //        debug_dst += spd.pitch*spd.h;
1449 //        XY_DO_ONCE( xy_logger::write_file("G:\\a2_rt", debug_dst, (h2-1)*spd.pitch) );
1450 //        debug_dst += spd.pitch*spd.h;
1451 //        XY_DO_ONCE( xy_logger::write_file("G:\\a2_rt", debug_dst, (h2-1)*spd.pitch) );
1452     }
1453     break;
1454     case 0*DM::SINGLE_COLOR | 0*DM::SSE2 |   DM::AYUV_PLANAR :
1455     {
1456         unsigned char* dst_A = bitmap->plans[0] + dst_offset;
1457         unsigned char* dst_Y = bitmap->plans[1] + dst_offset;
1458         unsigned char* dst_U = bitmap->plans[2] + dst_offset;
1459         unsigned char* dst_V = bitmap->plans[3] + dst_offset;
1460         while(h--)
1461         {
1462             const DWORD *sw = switchpts;
1463             for(int wt=0; wt<w; ++wt)
1464             {
1465                 if(wt+xo >= sw[1]) {while(wt+xo >= sw[1]) sw += 2; color = sw[-2];}
1466                 DWORD temp = COMBINE_AYUV(dst_A[wt], dst_Y[wt], dst_U[wt], dst_V[wt]);
1467                 pixmix(&temp, color, (s[wt]*(color>>24))>>8);
1468                 SPLIT_AYUV(temp, dst_A+wt, dst_Y+wt, dst_U+wt, dst_V+wt);
1469             }
1470             s += overlayPitch;
1471             dst_A += bitmap->pitch;
1472             dst_Y += bitmap->pitch;
1473             dst_U += bitmap->pitch;
1474             dst_V += bitmap->pitch;
1475         }
1476     }
1477     break;
1478     }
1479     // Remember to EMMS!
1480     // Rendering fails in funny ways if we don't do this.
1481     _mm_empty();
1482     return;
1483 }
1484
1485 void Rasterizer::FillSolidRect(SubPicDesc& spd, int x, int y, int nWidth, int nHeight, DWORD argb)
1486 {
1487     bool fSSE2 = !!(g_cpuid.m_flags & CCpuID::sse2);
1488     bool AYUV_PLANAR = (spd.type==MSP_AYUV_PLANAR);
1489     int draw_method = 0;
1490     if(fSSE2)
1491         draw_method |= DM::SSE2;
1492     if(AYUV_PLANAR)
1493         draw_method |= DM::AYUV_PLANAR;
1494
1495     switch (draw_method)
1496     {
1497     case   DM::SSE2 | 0*DM::AYUV_PLANAR :
1498     {
1499         for (int wy=y; wy<y+nHeight; wy++) {
1500             DWORD* dst = (DWORD*)((BYTE*)spd.bits + spd.pitch * wy) + x;
1501             for(int wt=0; wt<nWidth; ++wt) {
1502                 pixmix_sse2(&dst[wt], argb, argb>>24);
1503             }
1504         }
1505     }
1506     break;
1507     case 0*DM::SSE2 | 0*DM::AYUV_PLANAR :
1508     {
1509         for (int wy=y; wy<y+nHeight; wy++) {
1510             DWORD* dst = (DWORD*)((BYTE*)spd.bits + spd.pitch * wy) + x;
1511             for(int wt=0; wt<nWidth; ++wt) {
1512                 pixmix(&dst[wt], argb,  argb>>24);
1513             }
1514         }
1515     }
1516     break;
1517     case   DM::SSE2 |   DM::AYUV_PLANAR :
1518     {
1519         BYTE* dst = reinterpret_cast<BYTE*>(spd.bits) + spd.pitch * y + x;
1520         BYTE* dst_A = dst;
1521         BYTE* dst_Y = dst_A + spd.pitch*spd.h;
1522         BYTE* dst_U = dst_Y + spd.pitch*spd.h;
1523         BYTE* dst_V = dst_U + spd.pitch*spd.h;
1524         AlphaBlt(dst_Y, argb>>24, ((argb)>>16)&0xff, nHeight, nWidth, spd.pitch);
1525         AlphaBlt(dst_U, argb>>24, ((argb)>>8)&0xff, nHeight, nWidth, spd.pitch);
1526         AlphaBlt(dst_V, argb>>24, ((argb))&0xff, nHeight, nWidth, spd.pitch);
1527         AlphaBlt(dst_A, argb>>24, 0, nHeight, nWidth, spd.pitch);
1528     }
1529     break;
1530     case 0*DM::SSE2 |   DM::AYUV_PLANAR :
1531     {
1532         BYTE* dst = reinterpret_cast<BYTE*>(spd.bits) + spd.pitch * y + x;
1533         BYTE* dst_A = dst;
1534         BYTE* dst_Y = dst_A + spd.pitch*spd.h;
1535         BYTE* dst_U = dst_Y + spd.pitch*spd.h;
1536         BYTE* dst_V = dst_U + spd.pitch*spd.h;
1537         AlphaBltC(dst_Y, argb>>24, ((argb)>>16)&0xff, nHeight, nWidth, spd.pitch);
1538         AlphaBltC(dst_U, argb>>24, ((argb)>>8)&0xff, nHeight, nWidth, spd.pitch);
1539         AlphaBltC(dst_V, argb>>24, ((argb))&0xff, nHeight, nWidth, spd.pitch);
1540         AlphaBltC(dst_A, argb>>24, 0, nHeight, nWidth, spd.pitch);
1541     }
1542     break;
1543     }
1544     _mm_empty();
1545 }
1546
1547 ///////////////////////////////////////////////////////////////
1548
1549 // Overlay
1550
1551 void Overlay::_DoFillAlphaMash(byte* outputAlphaMask, const byte* pBody, const byte* pBorder, int x, int y, int w, int h,
1552     const byte* pAlphaMask, int pitch, DWORD color_alpha )
1553 {
1554     pBody = pBody!=NULL ? pBody + y*mOverlayPitch + x: NULL;
1555     pBorder = pBorder!=NULL ? pBorder + y*mOverlayPitch + x: NULL;
1556     byte* dst = outputAlphaMask + y*mOverlayPitch + x;
1557
1558     const int x0 = ((reinterpret_cast<int>(dst)+3)&~3) - reinterpret_cast<int>(dst) < w ?
1559                     ((reinterpret_cast<int>(dst)+3)&~3) - reinterpret_cast<int>(dst) : w; //IMPORTANT! Should not exceed w.
1560     const int x00 = ((reinterpret_cast<int>(dst)+15)&~15) - reinterpret_cast<int>(dst) < w ?
1561                     ((reinterpret_cast<int>(dst)+15)&~15) - reinterpret_cast<int>(dst) : w;//IMPORTANT! Should not exceed w.
1562     const int x_end00  = ((reinterpret_cast<int>(dst)+w)&~15) - reinterpret_cast<int>(dst);
1563     const int x_end0 = ((reinterpret_cast<int>(dst)+w)&~3) - reinterpret_cast<int>(dst);
1564     const int x_end = w;
1565
1566     __m64 color_alpha_64 = _mm_set1_pi16(color_alpha);
1567     __m128i color_alpha_128 = _mm_set1_epi16(color_alpha);
1568
1569     if(pAlphaMask==NULL && pBody!=NULL && pBorder!=NULL)
1570     {
1571         /*
1572         __asm
1573         {
1574         mov        eax, color_alpha
1575         movd       XMM3, eax
1576         punpcklwd  XMM3, XMM3
1577         pshufd     XMM3, XMM3, 0
1578         }
1579         */
1580         while(h--)
1581         {
1582             int j=0;
1583             for( ; j<x0; j++ )
1584             {
1585                 int temp = pBorder[j]-pBody[j];
1586                 temp = temp<0 ? 0 : temp;
1587                 dst[j] = (temp * color_alpha)>>6;
1588             }
1589             for( ;j<x00;j+=4 )
1590             {
1591                 __m64 border = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBorder+j));
1592                 __m64 body = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBody+j));
1593                 border = _mm_subs_pu8(border, body);
1594                 __m64 zero = _mm_setzero_si64();
1595                 border = _mm_unpacklo_pi8(border, zero);
1596                 border = _mm_mullo_pi16(border, color_alpha_64);
1597                 border = _mm_srli_pi16(border, 6);
1598                 border = _mm_packs_pu16(border,border);
1599                 *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(border);
1600             }
1601             __m128i zero = _mm_setzero_si128();
1602             for( ;j<x_end00;j+=16)
1603             {
1604                 __m128i border = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pBorder+j));
1605                 __m128i body = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pBody+j));
1606                 border = _mm_subs_epu8(border,body);
1607                 __m128i srchi = border;
1608                 border = _mm_unpacklo_epi8(border, zero);
1609                 srchi = _mm_unpackhi_epi8(srchi, zero);
1610                 border = _mm_mullo_epi16(border, color_alpha_128);
1611                 srchi = _mm_mullo_epi16(srchi, color_alpha_128);
1612                 border = _mm_srli_epi16(border, 6);
1613                 srchi = _mm_srli_epi16(srchi, 6);
1614                 border = _mm_packus_epi16(border, srchi);
1615                 _mm_storeu_si128(reinterpret_cast<__m128i*>(dst+j), border);
1616             }
1617             for( ;j<x_end0;j+=4)
1618             {
1619                 __m64 border = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBorder+j));
1620                 __m64 body = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBody+j));
1621                 border = _mm_subs_pu8(border, body);
1622                 __m64 zero = _mm_setzero_si64();
1623                 border = _mm_unpacklo_pi8(border, zero);
1624                 border = _mm_mullo_pi16(border, color_alpha_64);
1625                 border = _mm_srli_pi16(border, 6);
1626                 border = _mm_packs_pu16(border,border);
1627                 *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(border);
1628             }
1629             for( ;j<x_end;j++)
1630             {
1631                 int temp = pBorder[j]-pBody[j];
1632                 temp = temp<0 ? 0 : temp;
1633                 dst[j] = (temp * color_alpha)>>6;
1634             }
1635             pBody += mOverlayPitch;
1636             pBorder += mOverlayPitch;
1637             //pAlphaMask += pitch;
1638             dst += mOverlayPitch;
1639         }
1640     }
1641     else if( ((pBody==NULL) + (pBorder==NULL))==1 && pAlphaMask==NULL)
1642     {
1643         const BYTE* src1 = pBody!=NULL ? pBody : pBorder;
1644         while(h--)
1645         {
1646             int j=0;
1647             for( ; j<x0; j++ )
1648             {
1649                 dst[j] = (src1[j] * color_alpha)>>6;
1650             }
1651             for( ;j<x00;j+=4 )
1652             {
1653                 __m64 src = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(src1+j));
1654                 __m64 zero = _mm_setzero_si64();
1655                 src = _mm_unpacklo_pi8(src, zero);
1656                 src = _mm_mullo_pi16(src, color_alpha_64);
1657                 src = _mm_srli_pi16(src, 6);
1658                 src = _mm_packs_pu16(src,src);
1659                 *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(src);
1660             }
1661             __m128i zero = _mm_setzero_si128();
1662             for( ;j<x_end00;j+=16)
1663             {
1664                 __m128i src = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src1+j));
1665                 __m128i srchi = src;
1666                 src = _mm_unpacklo_epi8(src, zero);
1667                 srchi = _mm_unpackhi_epi8(srchi, zero);
1668                 src = _mm_mullo_epi16(src, color_alpha_128);
1669                 srchi = _mm_mullo_epi16(srchi, color_alpha_128);
1670                 src = _mm_srli_epi16(src, 6);
1671                 srchi = _mm_srli_epi16(srchi, 6);
1672                 src = _mm_packus_epi16(src, srchi);
1673                 _mm_storeu_si128(reinterpret_cast<__m128i*>(dst+j), src);
1674             }
1675             for( ;j<x_end0;j+=4)
1676             {
1677                 __m64 src = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(src1+j));
1678                 __m64 zero = _mm_setzero_si64();
1679                 src = _mm_unpacklo_pi8(src, zero);
1680                 src = _mm_mullo_pi16(src, color_alpha_64);
1681                 src = _mm_srli_pi16(src, 6);
1682                 src = _mm_packs_pu16(src,src);
1683                 *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(src);
1684             }
1685             for( ;j<x_end;j++)
1686             {
1687                 dst[j] = (src1[j] * color_alpha)>>6;
1688             }
1689             src1 += mOverlayPitch;
1690             //pAlphaMask += pitch;
1691             dst += mOverlayPitch;
1692         }
1693     }
1694     else if( ((pBody==NULL) + (pBorder==NULL))==1 && pAlphaMask!=NULL)
1695     {
1696         const BYTE* src1 = pBody!=NULL ? pBody : pBorder;
1697         while(h--)
1698         {
1699             int j=0;
1700             for( ; j<x0; j++ )
1701             {
1702                 dst[j] = (src1[j] * pAlphaMask[j] * color_alpha)>>12;
1703             }
1704             for( ;j<x00;j+=4 )
1705             {
1706                 __m64 src = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(src1+j));
1707                 __m64 mask = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pAlphaMask+j));
1708                 __m64 zero = _mm_setzero_si64();
1709                 src = _mm_unpacklo_pi8(src, zero);
1710                 src = _mm_mullo_pi16(src, color_alpha_64);
1711                 mask = _mm_unpacklo_pi8(zero, mask); //important!
1712                 src = _mm_mulhi_pi16(src, mask); //important!
1713                 src = _mm_srli_pi16(src, 12+8-16); //important!
1714                 src = _mm_packs_pu16(src,src);
1715                 *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(src);
1716             }
1717             __m128i zero = _mm_setzero_si128();
1718             for( ;j<x_end00;j+=16)
1719             {
1720                 __m128i src = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src1+j));
1721                 __m128i mask = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pAlphaMask+j));
1722                 __m128i srchi = src;
1723                 __m128i maskhi = mask;
1724                 src = _mm_unpacklo_epi8(src, zero);
1725                 srchi = _mm_unpackhi_epi8(srchi, zero);
1726                 mask = _mm_unpacklo_epi8(zero, mask); //important!
1727                 maskhi = _mm_unpackhi_epi8(zero, maskhi);
1728                 src = _mm_mullo_epi16(src, color_alpha_128);
1729                 srchi = _mm_mullo_epi16(srchi, color_alpha_128);
1730                 src = _mm_mulhi_epu16(src, mask); //important!
1731                 srchi = _mm_mulhi_epu16(srchi, maskhi);
1732                 src = _mm_srli_epi16(src, 12+8-16); //important!
1733                 srchi = _mm_srli_epi16(srchi, 12+8-16);
1734                 src = _mm_packus_epi16(src, srchi);
1735                 _mm_storeu_si128(reinterpret_cast<__m128i*>(dst+j), src);
1736             }
1737             for( ;j<x_end0;j+=4)
1738             {
1739                 __m64 src = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(src1+j));
1740                 __m64 mask = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pAlphaMask+j));
1741                 __m64 zero = _mm_setzero_si64();
1742                 src = _mm_unpacklo_pi8(src, zero);
1743                 src = _mm_mullo_pi16(src, color_alpha_64);
1744                 mask = _mm_unpacklo_pi8(zero, mask); //important!
1745                 src = _mm_mulhi_pi16(src, mask); //important!
1746                 src = _mm_srli_pi16(src, 12+8-16); //important!
1747                 src = _mm_packs_pu16(src,src);
1748                 *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(src);
1749             }
1750             for( ;j<x_end;j++)
1751             {
1752                 dst[j] = (src1[j] * pAlphaMask[j] * color_alpha)>>12;
1753             }
1754             src1 += mOverlayPitch;
1755             pAlphaMask += pitch;
1756             dst += mOverlayPitch;
1757         }
1758     }
1759     else if( pAlphaMask!=NULL && pBody!=NULL && pBorder!=NULL )
1760     {
1761         while(h--)
1762         {
1763             int j=0;
1764             for( ; j<x0; j++ )
1765             {
1766                 int temp = pBorder[j]-pBody[j];
1767                 temp = temp<0 ? 0 : temp;
1768                 dst[j] = (temp * pAlphaMask[j] * color_alpha)>>12;
1769             }
1770             for( ;j<x00;j+=4 )
1771             {
1772                 __m64 border = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBorder+j));
1773                 __m64 body = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBody+j));
1774                 border = _mm_subs_pu8(border, body);
1775                 __m64 mask = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pAlphaMask+j));
1776                 __m64 zero = _mm_setzero_si64();
1777                 border = _mm_unpacklo_pi8(border, zero);
1778                 border = _mm_mullo_pi16(border, color_alpha_64);
1779                 mask = _mm_unpacklo_pi8(zero, mask); //important!
1780                 border = _mm_mulhi_pi16(border, mask); //important!
1781                 border = _mm_srli_pi16(border, 12+8-16); //important!
1782                 border = _mm_packs_pu16(border,border);
1783                 *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(border);
1784             }
1785             __m128i zero = _mm_setzero_si128();
1786             for( ;j<x_end00;j+=16)
1787             {
1788                 __m128i border = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pBorder+j));
1789                 __m128i body = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pBody+j));
1790                 border = _mm_subs_epu8(border,body);
1791
1792                 __m128i mask = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pAlphaMask+j));
1793                 __m128i srchi = border;
1794                 __m128i maskhi = mask;
1795                 border = _mm_unpacklo_epi8(border, zero);
1796                 srchi = _mm_unpackhi_epi8(srchi, zero);
1797                 mask = _mm_unpacklo_epi8(zero, mask); //important!
1798                 maskhi = _mm_unpackhi_epi8(zero, maskhi);
1799                 border = _mm_mullo_epi16(border, color_alpha_128);
1800                 srchi = _mm_mullo_epi16(srchi, color_alpha_128);
1801                 border = _mm_mulhi_epu16(border, mask); //important!
1802                 srchi = _mm_mulhi_epu16(srchi, maskhi);
1803                 border = _mm_srli_epi16(border, 12+8-16); //important!
1804                 srchi = _mm_srli_epi16(srchi, 12+8-16);
1805                 border = _mm_packus_epi16(border, srchi);
1806                 _mm_storeu_si128(reinterpret_cast<__m128i*>(dst+j), border);
1807             }
1808             for( ;j<x_end0;j+=4)
1809             {
1810                 __m64 border = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBorder+j));
1811                 __m64 body = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBody+j));
1812                 border = _mm_subs_pu8(border, body);
1813                 __m64 mask = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pAlphaMask+j));
1814                 __m64 zero = _mm_setzero_si64();
1815                 border = _mm_unpacklo_pi8(border, zero);
1816                 border = _mm_mullo_pi16(border, color_alpha_64);
1817                 mask = _mm_unpacklo_pi8(zero, mask); //important!
1818                 border = _mm_mulhi_pi16(border, mask); //important!
1819                 border = _mm_srli_pi16(border, 12+8-16); //important!
1820                 border = _mm_packs_pu16(border,border);
1821                 *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(border);
1822             }
1823             for( ;j<x_end;j++)
1824             {
1825                 int temp = pBorder[j]-pBody[j];
1826                 temp = temp<0 ? 0 : temp;
1827                 dst[j] = (temp * pAlphaMask[j] * color_alpha)>>12;
1828             }
1829             pBody += mOverlayPitch;
1830             pBorder += mOverlayPitch;
1831             pAlphaMask += pitch;
1832             dst += mOverlayPitch;
1833         }
1834     }
1835     else
1836     {
1837         //should NOT happen!
1838         ASSERT(0);
1839         while(h--)
1840         {
1841             for(int j=0;j<x_end;j++)
1842             {
1843                 dst[j] = 0;
1844             }
1845             dst += mOverlayPitch;
1846         }
1847     }
1848 }
1849
1850 void Overlay::FillAlphaMash( byte* outputAlphaMask, bool fBody, bool fBorder, int x, int y, int w, int h, const byte* pAlphaMask, int pitch, DWORD color_alpha)
1851 {
1852     if(!fBorder && fBody && pAlphaMask==NULL)
1853     {
1854         _DoFillAlphaMash(outputAlphaMask, mBody.get(), NULL, x, y, w, h, pAlphaMask, pitch, color_alpha);
1855     }
1856     else if(/*fBorder &&*/ fBody && pAlphaMask==NULL)
1857     {
1858         _DoFillAlphaMash(outputAlphaMask, NULL, mBorder.get(), x, y, w, h, pAlphaMask, pitch, color_alpha);
1859     }
1860     else if(!fBody && fBorder /* pAlphaMask==NULL or not*/)
1861     {
1862         _DoFillAlphaMash(outputAlphaMask, mBody.get(), mBorder.get(), x, y, w, h, pAlphaMask, pitch, color_alpha);
1863     }
1864     else if(!fBorder && fBody && pAlphaMask!=NULL)
1865     {
1866         _DoFillAlphaMash(outputAlphaMask, mBody.get(), NULL, x, y, w, h, pAlphaMask, pitch, color_alpha);
1867     }
1868     else if(fBorder && fBody && pAlphaMask!=NULL)
1869     {
1870         _DoFillAlphaMash(outputAlphaMask, NULL, mBorder.get(), x, y, w, h, pAlphaMask, pitch, color_alpha);
1871     }
1872     else
1873     {
1874         //should NOT happen
1875         ASSERT(0);
1876     }
1877 }
1878
1879 Overlay* Overlay::GetSubpixelVariance(unsigned int xshift, unsigned int yshift)
1880 {
1881     Overlay* overlay = new Overlay();
1882     if(!overlay)
1883     {
1884         return NULL;
1885     }
1886     xshift &= 7;
1887     yshift &= 7;
1888
1889     overlay->mOffsetX = mOffsetX - xshift;
1890     overlay->mOffsetY = mOffsetY - yshift;
1891     overlay->mWidth = mWidth + xshift;
1892     overlay->mHeight = mHeight + yshift;
1893
1894     overlay->mOverlayWidth = ((overlay->mWidth+7)>>3) + 1;
1895     overlay->mOverlayHeight = ((overlay->mHeight + 7)>>3) + 1;
1896     overlay->mOverlayPitch = (overlay->mOverlayWidth+15)&~15;
1897
1898
1899     overlay->mfWideOutlineEmpty = mfWideOutlineEmpty;
1900
1901     if (overlay->mOverlayPitch * overlay->mOverlayHeight<=0)
1902     {
1903         return NULL;
1904     }
1905
1906     BYTE* body = reinterpret_cast<BYTE*>(xy_malloc(overlay->mOverlayPitch * overlay->mOverlayHeight));
1907     if( body==NULL )
1908     {
1909         return NULL;
1910     }
1911     overlay->mBody.reset(body, xy_free);
1912     BYTE* border = NULL;
1913     if (!overlay->mfWideOutlineEmpty)
1914     {
1915         border = reinterpret_cast<BYTE*>(xy_malloc(overlay->mOverlayPitch * overlay->mOverlayHeight));
1916         if (border==NULL)
1917         {
1918             return NULL;
1919         }
1920         overlay->mBorder.reset(border, xy_free);
1921     }
1922
1923     if(overlay->mOverlayPitch==mOverlayPitch && overlay->mOverlayHeight>=mOverlayHeight)
1924     {
1925         if (body && mBody)
1926         {
1927             memcpy(body, mBody.get(), mOverlayPitch * mOverlayHeight);
1928             memset(body+mOverlayPitch*mOverlayHeight, 0, mOverlayPitch * (overlay->mOverlayHeight-mOverlayHeight));
1929         }
1930         else if ( (!!body)!=(!!mBody)/*==NULL*/)
1931         {
1932             return NULL;
1933         }
1934
1935         if (border && mBorder)
1936         {
1937             memcpy(border, mBorder.get(), mOverlayPitch * mOverlayHeight);
1938             memset(border+mOverlayPitch*mOverlayHeight, 0, mOverlayPitch * (overlay->mOverlayHeight-mOverlayHeight));
1939         }
1940         else if ( (!!border)!=(!!mBorder)/*==NULL*/ )
1941         {
1942             return NULL;
1943         }
1944     }
1945     else
1946     {
1947         memset(body, 0, overlay->mOverlayPitch * overlay->mOverlayHeight);
1948         byte* dst = body;
1949         const byte* src = mBody.get();
1950         for (int i=0;i<mOverlayHeight;i++)
1951         {
1952             memcpy(dst, src, mOverlayPitch);
1953             dst += overlay->mOverlayPitch;
1954             src += mOverlayPitch;
1955         }
1956         if (!overlay->mfWideOutlineEmpty)
1957         {
1958             ASSERT(border && mBorder);
1959             memset(border, 0, overlay->mOverlayPitch * overlay->mOverlayHeight);
1960             dst = border;
1961             src = mBorder.get();
1962             for (int i=0;i<mOverlayHeight;i++)
1963             {
1964                 memcpy(dst, src, mOverlayPitch);
1965                 dst += overlay->mOverlayPitch;
1966                 src += mOverlayPitch;
1967             }
1968         }
1969     }
1970     //not equal
1971     //  Bilinear(overlay->mpOverlayBuffer.base, overlay->mOverlayWidth, 2*overlay->mOverlayHeight, overlay->mOverlayPitch, xshift, yshift);
1972     Bilinear(body, overlay->mOverlayWidth, overlay->mOverlayHeight, overlay->mOverlayPitch, xshift, yshift);
1973     if (!overlay->mfWideOutlineEmpty)
1974     {
1975         Bilinear(border, overlay->mOverlayWidth, overlay->mOverlayHeight, overlay->mOverlayPitch, xshift, yshift);
1976     }
1977     return overlay;
1978 }
1979
1980 ///////////////////////////////////////////////////////////////
1981
1982 // PathData
1983
1984 PathData::PathData():mpPathTypes(NULL), mpPathPoints(NULL), mPathPoints(0)
1985 {
1986 }
1987
1988 PathData::PathData( const PathData& src ):mpPathTypes(NULL), mpPathPoints(NULL), mPathPoints(src.mPathPoints)
1989 {
1990     //TODO: deal with the case that src.mPathPoints<0
1991     if(mPathPoints>0)
1992     {
1993         mpPathTypes = static_cast<BYTE*>(malloc(mPathPoints * sizeof(BYTE)));
1994         mpPathPoints = static_cast<POINT*>(malloc(mPathPoints * sizeof(POINT)));
1995     }
1996     if(mPathPoints>0)
1997     {
1998         memcpy(mpPathTypes, src.mpPathTypes, mPathPoints*sizeof(BYTE));
1999         memcpy(mpPathPoints, src.mpPathPoints, mPathPoints*sizeof(POINT));
2000     }
2001 }
2002
2003 const PathData& PathData::operator=( const PathData& src )
2004 {
2005     if(this!=&src)
2006     {
2007         if(mPathPoints!=src.mPathPoints && src.mPathPoints>0)
2008         {
2009             _TrashPath();
2010             mPathPoints = src.mPathPoints;
2011             mpPathTypes = static_cast<BYTE*>(malloc(mPathPoints * sizeof(BYTE)));
2012             mpPathPoints = static_cast<POINT*>(malloc(mPathPoints * sizeof(POINT)));//better than realloc
2013         }
2014         if(src.mPathPoints>0)
2015         {
2016             memcpy(mpPathTypes, src.mpPathTypes, mPathPoints*sizeof(BYTE));
2017             memcpy(mpPathPoints, src.mpPathPoints, mPathPoints*sizeof(POINT));
2018         }
2019     }
2020     return *this;
2021 }
2022
2023 PathData::~PathData()
2024 {
2025     _TrashPath();
2026 }
2027
2028 bool PathData::operator==( const PathData& rhs ) const
2029 {
2030     return (this==&rhs) || (
2031         mPathPoints==rhs.mPathPoints
2032         && !memcmp(mpPathTypes, rhs.mpPathTypes, mPathPoints * sizeof(BYTE) )
2033         && !memcmp(mpPathPoints, rhs.mpPathPoints, mPathPoints * sizeof(POINT) )
2034         );
2035 }
2036
2037 void PathData::_TrashPath()
2038 {
2039     if (mpPathTypes)
2040     {
2041         free(mpPathTypes);
2042         mpPathTypes = NULL;
2043     }
2044     if (mpPathPoints)
2045     {
2046         free(mpPathPoints);
2047         mpPathPoints = NULL;
2048     }
2049     mPathPoints = 0;
2050 }
2051
2052 bool PathData::BeginPath(HDC hdc)
2053 {
2054     _TrashPath();
2055     return !!::BeginPath(hdc);
2056 }
2057
2058 bool PathData::EndPath(HDC hdc)
2059 {
2060     ::CloseFigure(hdc);
2061     if(::EndPath(hdc))
2062     {
2063         mPathPoints = GetPath(hdc, NULL, NULL, 0);
2064         if(!mPathPoints)
2065             return true;
2066         mpPathTypes = (BYTE*)malloc(sizeof(BYTE) * mPathPoints);
2067         mpPathPoints = (POINT*)malloc(sizeof(POINT) * mPathPoints);
2068         if(mPathPoints == GetPath(hdc, mpPathPoints, mpPathTypes, mPathPoints))
2069             return true;
2070     }
2071     ::AbortPath(hdc);
2072     return false;
2073 }
2074
2075 bool PathData::PartialBeginPath(HDC hdc, bool bClearPath)
2076 {
2077     if(bClearPath)
2078         _TrashPath();
2079     return !!::BeginPath(hdc);
2080 }
2081
2082 bool PathData::PartialEndPath(HDC hdc, long dx, long dy)
2083 {
2084     ::CloseFigure(hdc);
2085     if(::EndPath(hdc))
2086     {
2087         int nPoints;
2088         BYTE* pNewTypes;
2089         POINT* pNewPoints;
2090         nPoints = GetPath(hdc, NULL, NULL, 0);
2091         if(!nPoints)
2092             return true;
2093         pNewTypes = (BYTE*)realloc(mpPathTypes, (mPathPoints + nPoints) * sizeof(BYTE));
2094         pNewPoints = (POINT*)realloc(mpPathPoints, (mPathPoints + nPoints) * sizeof(POINT));
2095         if(pNewTypes)
2096             mpPathTypes = pNewTypes;
2097         if(pNewPoints)
2098             mpPathPoints = pNewPoints;
2099         BYTE* pTypes = new BYTE[nPoints];
2100         POINT* pPoints = new POINT[nPoints];
2101         if(pNewTypes && pNewPoints && nPoints == GetPath(hdc, pPoints, pTypes, nPoints))
2102         {
2103             for(int i = 0; i < nPoints; ++i)
2104             {
2105                 mpPathPoints[mPathPoints + i].x = pPoints[i].x + dx;
2106                 mpPathPoints[mPathPoints + i].y = pPoints[i].y + dy;
2107                 mpPathTypes[mPathPoints + i] = pTypes[i];
2108             }
2109             mPathPoints += nPoints;
2110             delete[] pTypes;
2111             delete[] pPoints;
2112             return true;
2113         }
2114         else
2115             DebugBreak();
2116         delete[] pTypes;
2117         delete[] pPoints;
2118     }
2119     ::AbortPath(hdc);
2120     return false;
2121 }
2122
2123 void PathData::AlignLeftTop(CPoint *left_top, CSize *size)
2124 {
2125     int minx = INT_MAX;
2126     int miny = INT_MAX;
2127     int maxx = INT_MIN;
2128     int maxy = INT_MIN;
2129     for(int i=0; i<mPathPoints; ++i)
2130     {
2131         int ix = mpPathPoints[i].x;
2132         int iy = mpPathPoints[i].y;
2133         if(ix < minx) minx = ix;
2134         if(ix > maxx) maxx = ix;
2135         if(iy < miny) miny = iy;
2136         if(iy > maxy) maxy = iy;
2137     }
2138     if(minx > maxx || miny > maxy)
2139     {
2140         _TrashPath();
2141         *left_top = CPoint(0, 0);
2142         *size = CSize(0, 0);
2143         return;
2144     }
2145     minx = (minx >> 3) & ~7;
2146     miny = (miny >> 3) & ~7;
2147     maxx = (maxx + 7) >> 3;
2148     maxy = (maxy + 7) >> 3;
2149     for(int i=0; i<mPathPoints; ++i)
2150     {
2151         mpPathPoints[i].x -= minx*8;
2152         mpPathPoints[i].y -= miny*8;
2153     }
2154     *left_top = CPoint(minx, miny);
2155     *size = CSize(maxx+1-minx, maxy+1-miny);
2156     return;
2157 }
2158
2159 //////////////////////////////////////////////////////////////////////////
2160
2161 // ScanLineData
2162
2163 ScanLineData::ScanLineData()
2164 {
2165 }
2166
2167 ScanLineData::~ScanLineData()
2168 {
2169 }
2170
2171 void ScanLineData::_ReallocEdgeBuffer(int edges)
2172 {
2173     mEdgeHeapSize = edges;
2174     mpEdgeBuffer = (Edge*)realloc(mpEdgeBuffer, sizeof(Edge)*edges);
2175 }
2176
2177 void ScanLineData::_EvaluateBezier(const PathData& path_data, int ptbase, bool fBSpline)
2178 {
2179     const POINT* pt0 = path_data.mpPathPoints + ptbase;
2180     const POINT* pt1 = path_data.mpPathPoints + ptbase + 1;
2181     const POINT* pt2 = path_data.mpPathPoints + ptbase + 2;
2182     const POINT* pt3 = path_data.mpPathPoints + ptbase + 3;
2183     double x0 = pt0->x;
2184     double x1 = pt1->x;
2185     double x2 = pt2->x;
2186     double x3 = pt3->x;
2187     double y0 = pt0->y;
2188     double y1 = pt1->y;
2189     double y2 = pt2->y;
2190     double y3 = pt3->y;
2191     double cx3, cx2, cx1, cx0, cy3, cy2, cy1, cy0;
2192     if(fBSpline)
2193     {
2194         // 1   [-1 +3 -3 +1]
2195         // - * [+3 -6 +3  0]
2196         // 6   [-3  0 +3  0]
2197         //         [+1 +4 +1  0]
2198         double _1div6 = 1.0/6.0;
2199         cx3 = _1div6*(-  x0+3*x1-3*x2+x3);
2200         cx2 = _1div6*( 3*x0-6*x1+3*x2);
2201         cx1 = _1div6*(-3*x0        +3*x2);
2202         cx0 = _1div6*(   x0+4*x1+1*x2);
2203         cy3 = _1div6*(-  y0+3*y1-3*y2+y3);
2204         cy2 = _1div6*( 3*y0-6*y1+3*y2);
2205         cy1 = _1div6*(-3*y0     +3*y2);
2206         cy0 = _1div6*(   y0+4*y1+1*y2);
2207     }
2208     else // bezier
2209     {
2210         // [-1 +3 -3 +1]
2211         // [+3 -6 +3  0]
2212         // [-3 +3  0  0]
2213         // [+1  0  0  0]
2214         cx3 = -  x0+3*x1-3*x2+x3;
2215         cx2 =  3*x0-6*x1+3*x2;
2216         cx1 = -3*x0+3*x1;
2217         cx0 =    x0;
2218         cy3 = -  y0+3*y1-3*y2+y3;
2219         cy2 =  3*y0-6*y1+3*y2;
2220         cy1 = -3*y0+3*y1;
2221         cy0 =    y0;
2222     }
2223     //
2224     // This equation is from Graphics Gems I.
2225     //
2226     // The idea is that since we're approximating a cubic curve with lines,
2227     // any error we incur is due to the curvature of the line, which we can
2228     // estimate by calculating the maximum acceleration of the curve.  For
2229     // a cubic, the acceleration (second derivative) is a line, meaning that
2230     // the absolute maximum acceleration must occur at either the beginning
2231     // (|c2|) or the end (|c2+c3|).  Our bounds here are a little more
2232     // conservative than that, but that's okay.
2233     //
2234     // If the acceleration of the parametric formula is zero (c2 = c3 = 0),
2235     // that component of the curve is linear and does not incur any error.
2236     // If a=0 for both X and Y, the curve is a line segment and we can
2237     // use a step size of 1.
2238     double maxaccel1 = fabs(2*cy2) + fabs(6*cy3);
2239     double maxaccel2 = fabs(2*cx2) + fabs(6*cx3);
2240     double maxaccel = maxaccel1 > maxaccel2 ? maxaccel1 : maxaccel2;
2241     double h = 1.0;
2242     if(maxaccel > 8.0) h = sqrt(8.0 / maxaccel);
2243     if(!fFirstSet) {firstp.x = (LONG)cx0; firstp.y = (LONG)cy0; lastp = firstp; fFirstSet = true;}
2244     for(double t = 0; t < 1.0; t += h)
2245     {
2246         double x = cx0 + t*(cx1 + t*(cx2 + t*cx3));
2247         double y = cy0 + t*(cy1 + t*(cy2 + t*cy3));
2248         _EvaluateLine(lastp.x, lastp.y, (int)x, (int)y);
2249     }
2250     double x = cx0 + cx1 + cx2 + cx3;
2251     double y = cy0 + cy1 + cy2 + cy3;
2252     _EvaluateLine(lastp.x, lastp.y, (int)x, (int)y);
2253 }
2254
2255 void ScanLineData::_EvaluateLine(const PathData& path_data, int pt1idx, int pt2idx)
2256 {
2257     const POINT* pt1 = path_data.mpPathPoints + pt1idx;
2258     const POINT* pt2 = path_data.mpPathPoints + pt2idx;
2259     _EvaluateLine(pt1->x, pt1->y, pt2->x, pt2->y);
2260 }
2261
2262 void ScanLineData::_EvaluateLine(int x0, int y0, int x1, int y1)
2263 {
2264     if(lastp.x != x0 || lastp.y != y0)
2265     {
2266         _EvaluateLine(lastp.x, lastp.y, x0, y0);
2267     }
2268     if(!fFirstSet) {firstp.x = x0; firstp.y = y0; fFirstSet = true;}
2269     lastp.x = x1;
2270     lastp.y = y1;
2271     if(y1 > y0) // down
2272     {
2273         __int64 xacc = (__int64)x0 << 13;
2274         // prestep y0 down
2275         int dy = y1 - y0;
2276         int y = ((y0 + 3)&~7) + 4;
2277         int iy = y >> 3;
2278         y1 = (y1 - 5) >> 3;
2279         if(iy <= y1)
2280         {
2281             __int64 invslope = (__int64(x1 - x0) << 16) / dy;
2282             while(mEdgeNext + y1 + 1 - iy > mEdgeHeapSize)
2283                 _ReallocEdgeBuffer(mEdgeHeapSize*2);
2284             xacc += (invslope * (y - y0)) >> 3;
2285             while(iy <= y1)
2286             {
2287                 int ix = (int)((xacc + 32768) >> 16);
2288                 mpEdgeBuffer[mEdgeNext].next = mpScanBuffer[iy];
2289                 mpEdgeBuffer[mEdgeNext].posandflag = ix*2 + 1;
2290                 mpScanBuffer[iy] = mEdgeNext++;
2291                 ++iy;
2292                 xacc += invslope;
2293             }
2294         }
2295     }
2296     else if(y1 < y0) // up
2297     {
2298         __int64 xacc = (__int64)x1 << 13;
2299         // prestep y1 down
2300         int dy = y0 - y1;
2301         int y = ((y1 + 3)&~7) + 4;
2302         int iy = y >> 3;
2303         y0 = (y0 - 5) >> 3;
2304         if(iy <= y0)
2305         {
2306             __int64 invslope = (__int64(x0 - x1) << 16) / dy;
2307             while(mEdgeNext + y0 + 1 - iy > mEdgeHeapSize)
2308                 _ReallocEdgeBuffer(mEdgeHeapSize*2);
2309             xacc += (invslope * (y - y1)) >> 3;
2310             while(iy <= y0)
2311             {
2312                 int ix = (int)((xacc + 32768) >> 16);
2313                 mpEdgeBuffer[mEdgeNext].next = mpScanBuffer[iy];
2314                 mpEdgeBuffer[mEdgeNext].posandflag = ix*2;
2315                 mpScanBuffer[iy] = mEdgeNext++;
2316                 ++iy;
2317                 xacc += invslope;
2318             }
2319         }
2320     }
2321 }
2322
2323 bool ScanLineData::ScanConvert(const PathData& path_data, const CSize& size)
2324 {
2325     int lastmoveto = -1;
2326     int i;
2327     // Drop any outlines we may have.
2328     mOutline.clear();
2329     // Determine bounding box
2330     if(!path_data.mPathPoints)
2331     {
2332         mWidth = mHeight = 0;
2333         return false;
2334     }
2335     mWidth = size.cx;
2336     mHeight = size.cy;
2337     // Initialize edge buffer.  We use edge 0 as a sentinel.
2338     mEdgeNext = 1;
2339     mEdgeHeapSize = 2048;
2340     mpEdgeBuffer = (Edge*)malloc(sizeof(Edge)*mEdgeHeapSize);
2341     // Initialize scanline list.
2342     mpScanBuffer = new unsigned int[mHeight];
2343     memset(mpScanBuffer, 0, mHeight*sizeof(unsigned int));
2344     // Scan convert the outline.  Yuck, Bezier curves....
2345     // Unfortunately, Windows 95/98 GDI has a bad habit of giving us text
2346     // paths with all but the first figure left open, so we can't rely
2347     // on the PT_CLOSEFIGURE flag being used appropriately.
2348     fFirstSet = false;
2349     firstp.x = firstp.y = 0;
2350     lastp.x = lastp.y = 0;
2351     for(i=0; i<path_data.mPathPoints; ++i)
2352     {
2353         BYTE t = path_data.mpPathTypes[i] & ~PT_CLOSEFIGURE;
2354         switch(t)
2355         {
2356         case PT_MOVETO:
2357             if(lastmoveto >= 0 && firstp != lastp)
2358                 _EvaluateLine(lastp.x, lastp.y, firstp.x, firstp.y);
2359             lastmoveto = i;
2360             fFirstSet = false;
2361             lastp = path_data.mpPathPoints[i];
2362             break;
2363         case PT_MOVETONC:
2364             break;
2365         case PT_LINETO:
2366             if(path_data.mPathPoints - (i-1) >= 2) _EvaluateLine(path_data, i-1, i);
2367             break;
2368         case PT_BEZIERTO:
2369             if(path_data.mPathPoints - (i-1) >= 4) _EvaluateBezier(path_data, i-1, false);
2370             i += 2;
2371             break;
2372         case PT_BSPLINETO:
2373             if(path_data.mPathPoints - (i-1) >= 4) _EvaluateBezier(path_data, i-1, true);
2374             i += 2;
2375             break;
2376         case PT_BSPLINEPATCHTO:
2377             if(path_data.mPathPoints - (i-3) >= 4) _EvaluateBezier(path_data, i-3, true);
2378             break;
2379         }
2380     }
2381     if(lastmoveto >= 0 && firstp != lastp)
2382         _EvaluateLine(lastp.x, lastp.y, firstp.x, firstp.y);
2383     // Convert the edges to spans.  We couldn't do this before because some of
2384     // the regions may have winding numbers >+1 and it would have been a pain
2385     // to try to adjust the spans on the fly.  We use one heap to detangle
2386     // a scanline's worth of edges from the singly-linked lists, and another
2387     // to collect the actual scans.
2388     std::vector<int> heap;
2389     mOutline.reserve(mEdgeNext / 2);
2390     __int64 y = 0;
2391     for(y=0; y<mHeight; ++y)
2392     {
2393         int count = 0;
2394         // Detangle scanline into edge heap.
2395         for(unsigned ptr = (unsigned)(mpScanBuffer[y]&0xffffffff); ptr; ptr = mpEdgeBuffer[ptr].next)
2396         {
2397             heap.push_back(mpEdgeBuffer[ptr].posandflag);
2398         }
2399         // Sort edge heap.  Note that we conveniently made the opening edges
2400         // one more than closing edges at the same spot, so we won't have any
2401         // problems with abutting spans.
2402         std::sort(heap.begin(), heap.end()/*begin() + heap.size()*/);
2403         // Process edges and add spans.  Since we only check for a non-zero
2404         // winding number, it doesn't matter which way the outlines go!
2405         std::vector<int>::iterator itX1 = heap.begin();
2406         std::vector<int>::iterator itX2 = heap.end(); // begin() + heap.size();
2407         int x1, x2;
2408         for(; itX1 != itX2; ++itX1)
2409         {
2410             int x = *itX1;
2411             if(!count)
2412                 x1 = (x>>1);
2413             if(x&1)
2414                 ++count;
2415             else
2416                 --count;
2417             if(!count)
2418             {
2419                 x2 = (x>>1);
2420                 if(x2>x1)
2421                     mOutline.push_back(std::pair<__int64,__int64>((y<<32)+x1+0x4000000040000000i64, (y<<32)+x2+0x4000000040000000i64)); // G: damn Avery, this is evil! :)
2422             }
2423         }
2424         heap.clear();
2425     }
2426     // Dump the edge and scan buffers, since we no longer need them.
2427     free(mpEdgeBuffer);
2428     delete [] mpScanBuffer;
2429     // All done!
2430     return true;
2431 }
2432
2433 using namespace std;
2434
2435 void ScanLineData::DeleteOutlines()
2436 {
2437     mOutline.clear();
2438 }
2439
2440 bool ScanLineData2::CreateWidenedRegion(int rx, int ry)
2441 {
2442     if(rx < 0) rx = 0;
2443     if(ry < 0) ry = 0;
2444     mWideBorder = max(rx,ry);
2445     mWideOutline.clear();
2446
2447     const tSpanBuffer& out_line = m_scan_line_data->mOutline;
2448     if (ry > 0)
2449     {
2450         // Do a half circle.
2451         // _OverlapRegion mirrors this so both halves are done.
2452         for(int y = -ry; y <= ry; ++y)
2453         {
2454             int x = (int)(0.5 + sqrt(float(ry*ry - y*y)) * float(rx)/float(ry));
2455             OverlapRegion(mWideOutline, out_line, x, y);
2456         }
2457     }
2458     else if (ry == 0 && rx > 0)
2459     {
2460         // There are artifacts if we don't make at least two overlaps of the line, even at same Y coord
2461         OverlapRegion(mWideOutline, out_line, rx, 0);
2462         OverlapRegion(mWideOutline, out_line, rx, 0);
2463     }
2464     return true;
2465 }