src/subtitles/Rasterizer.cpp

   1 /*
   2  *      Copyright (C) 2003-2006 Gabest
   3  *      http://www.gabest.org
   4  *
   5  *  This Program is free software; you can redistribute it and/or modify
   6  *  it under the terms of the GNU General Public License as published by
   7  *  the Free Software Foundation; either version 2, or (at your option)
   8  *  any later version.
   9  *
  10  *  This Program is distributed in the hope that it will be useful,
  11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13  *  GNU General Public License for more details.
  14  *
  15  *  You should have received a copy of the GNU General Public License
  16  *  along with GNU Make; see the file COPYING.  If not, write to
  17  *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  18  *  http://www.gnu.org/copyleft/gpl.html
  19  *
  20  */
  21
  22 #include "stdafx.h"
  23 #include <string.h>
  24 #include <math.h>
  25 #include <vector>
  26 #include <algorithm>
  27 #include "Rasterizer.h"
  28 #include "SeparableFilter.h"
  29 #include "xy_logger.h"
  30 #include <boost/flyweight/key_value.hpp>
  31
  32 #ifndef _MAX    /* avoid collision with common (nonconforming) macros */
  33 #define _MAX    (max)
  34 #define _MIN    (min)
  35 #define _IMPL_MAX max
  36 #define _IMPL_MIN min
  37 #else
  38 #define _IMPL_MAX _MAX
  39 #define _IMPL_MIN _MIN
  40 #endif
  41
  42
  43 //NOTE: signed or unsigned affects the result seriously
  44 #define COMBINE_AYUV(a, y, u, v) ((((((((int)(a))<<8)|y)<<8)|u)<<8)|v)
  45
  46 #define SPLIT_AYUV(color, a, y, u, v) do { \
  47         *(v)=(color)&0xff; \
  48         *(u)=((color)>>8) &0xff; \
  49         *(y)=((color)>>16)&0xff;\
  50         *(a)=((color)>>24)&0xff;\
  51     } while(0)
  52
  53 class ass_synth_priv
  54 {
  55 public:
  56     static const int VOLUME_BITS = 22;//should not exceed 32-8, and better not exceed 31-8
  57
  58     ass_synth_priv(const double sigma);
  59     ass_synth_priv(const ass_synth_priv& priv);
  60
  61     ~ass_synth_priv();
  62     int generate_tables(double sigma);
  63
  64     int g_r;
  65     int g_w;
  66
  67     unsigned *g;
  68     unsigned *gt2;
  69
  70     double sigma;
  71 };
  72
  73 struct ass_synth_priv_key
  74 {
  75     const double& operator()(const ass_synth_priv& x)const
  76     {
  77         return x.sigma;
  78     }
  79 };
  80
  81 struct ass_tmp_buf
  82 {
  83 public:
  84     ass_tmp_buf(size_t size);
  85     ass_tmp_buf(const ass_tmp_buf& buf);
  86     ~ass_tmp_buf();
  87     size_t size;
  88     unsigned *tmp;
  89 };
  90
  91 struct ass_tmp_buf_get_size
  92 {
  93     const size_t& operator()(const ass_tmp_buf& buf)const
  94     {
  95         return buf.size;
  96     }
  97 };
  98
  99 static const unsigned int maxcolor = 255;
 100 static const unsigned base = 256;
 101
 102 ass_synth_priv::ass_synth_priv(const double sigma)
 103 {
 104     g_r = 0;
 105     g_w = 0;
 106
 107     g = NULL;
 108     gt2 = NULL;
 109
 110     this->sigma = 0;
 111     generate_tables(sigma);
 112 }
 113
 114 ass_synth_priv::ass_synth_priv(const ass_synth_priv& priv):g_r(priv.g_r),g_w(priv.g_w),sigma(priv.sigma)
 115 {
 116     if (this->g_w > 0 && this != &priv) {
 117         this->g = (unsigned*)realloc(this->g, this->g_w * sizeof(unsigned));
 118         this->gt2 = (unsigned*)realloc(this->gt2, 256 * this->g_w * sizeof(unsigned));
 119         //if (this->g == null || this->gt2 == null) {
 120         //    return -1;
 121         //}
 122         memcpy(g, priv.g, this->g_w * sizeof(unsigned));
 123         memcpy(gt2, priv.gt2, 256 * this->g_w * sizeof(unsigned));
 124     }
 125 }
 126
 127 ass_synth_priv::~ass_synth_priv()
 128 {
 129     free(g); g=NULL;
 130     free(gt2); gt2=NULL;
 131 }
 132
 133 int ass_synth_priv::generate_tables(double sigma)
 134 {
 135     const int TARGET_VOLUME = 1<<VOLUME_BITS;
 136     const int MAX_VOLUME_ERROR = VOLUME_BITS>=22 ? 16 : 1;
 137
 138     double a = -1 / (sigma * sigma * 2);
 139     double exp_a = exp(a);
 140
 141     double volume_factor = 0;
 142     double volume_start =  0, volume_end = 0;
 143     unsigned volume;
 144
 145     if (this->sigma == sigma)
 146         return 0;
 147     else
 148         this->sigma = sigma;
 149
 150     this->g_w = (int)ceil(sigma*3) | 1;
 151     this->g_r = this->g_w / 2;
 152
 153     if (this->g_w > 0) {
 154         this->g = (unsigned*)realloc(this->g, this->g_w * sizeof(unsigned));
 155         this->gt2 = (unsigned*)realloc(this->gt2, 256 * this->g_w * sizeof(unsigned));
 156         if (this->g == NULL || this->gt2 == NULL) {
 157             return -1;
 158         }
 159     }
 160
 161     if (this->g_w > 0) {
 162         volume_start = 0;
 163
 164         double exp_0 = 1.0;
 165         double exp_1 = exp_a;
 166         double exp_2 = exp_1 * exp_1;
 167         volume_start += exp_0;
 168         for(int i=0;i<this->g_r;++i)
 169         {
 170             exp_0 *= exp_1;
 171             exp_1 *= exp_2;
 172             volume_start += exp_0;
 173             volume_start += exp_0;
 174         }
 175         //euqivalent:
 176         //  for (i = 0; i < this->g_w; ++i) {
 177         //      volume_start += exp(a * (i - this->g_r) * (i - this->g_r));
 178         //  }
 179
 180         volume_end = (TARGET_VOLUME+g_w)/volume_start;
 181         volume_start = (TARGET_VOLUME-g_w)/volume_start;
 182
 183         volume = 0;
 184         while( volume_start+0.000001<volume_end )
 185         {
 186             volume_factor = (volume_start+volume_end)*0.5;
 187             volume = 0;
 188
 189             exp_0 = volume_factor;
 190             exp_1 = exp_a;
 191             exp_2 = exp_1 * exp_1;
 192
 193             volume = static_cast<int>(exp_0+.5);
 194             this->g[this->g_r] = volume;
 195
 196             unsigned* p_left = this->g+this->g_r-1;
 197             unsigned* p_right= this->g+this->g_r+1;
 198             for(int i=0; i<this->g_r;++i,p_left--,p_right++)
 199             {
 200                 exp_0 *= exp_1;
 201                 exp_1 *= exp_2;
 202                 *p_left = static_cast<int>(exp_0+.5);
 203                 *p_right = *p_left;
 204                 volume += (*p_left<<1);
 205             }
 206             //equivalent:
 207             //    for (i = 0; i < this->g_w; ++i) {
 208             //        this->g[i] = (unsigned) ( exp(a * (i - this->g_r) * (i - this->g_r))* volume_factor + .5 );
 209             //        volume += this->g[i];
 210             //    }
 211
 212             // volume don't have to be equal to TARGET_VOLUME,
 213             // even if volume=TARGET_VOLUME+MAX_VOLUME_ERROR,
 214             // max error introducing in later blur operation,
 215             // which is (dot_product(g_w, pixel))/TARGET_VOLUME with pixel<256,
 216             // would not exceed (MAX_VOLUME_ERROR*256)/TARGET_VOLUME,
 217             // as long as MAX_VOLUME_ERROR/TARGET_VOLUME is small enough, error introduced would be kept in safe range
 218             //
 219             // NOTE: when it comes to rounding, no matter how small the error is,
 220             // it may result a different rounding output
 221             if( volume>=TARGET_VOLUME && volume< (TARGET_VOLUME+MAX_VOLUME_ERROR) )
 222                 break;
 223             else if(volume < TARGET_VOLUME)
 224             {
 225                 volume_start = volume_factor;
 226             }
 227             else if(volume >= TARGET_VOLUME+MAX_VOLUME_ERROR)
 228             {
 229                 volume_end = volume_factor;
 230             }
 231         }
 232         if(volume==0)
 233         {
 234             volume_factor = volume_end;
 235
 236             exp_0 = volume_factor;
 237             exp_1 = exp_a;
 238             exp_2 = exp_1 * exp_1;
 239
 240             volume = static_cast<int>(exp_0+.5);
 241             this->g[this->g_r] = volume;
 242
 243             unsigned* p_left = this->g+this->g_r-1;
 244             unsigned* p_right= this->g+this->g_r+1;
 245             for(int i=0; i<this->g_r;++i,p_left--,p_right++)
 246             {
 247                 exp_0 *= exp_1;
 248                 exp_1 *= exp_2;
 249                 *p_left = static_cast<int>(exp_0+.5);
 250                 *p_right = *p_left;
 251                 volume += (*p_left<<1);
 252             }
 253             //equivalent:
 254             //    for (i = 0; i < this->g_w; ++i) {
 255             //        this->g[i] = (unsigned) ( exp(a * (i - this->g_r) * (i - this->g_r))* volume_factor + .5 );
 256             //        volume += this->g[i];
 257             //    }
 258         }
 259
 260         // gauss table:
 261         for (int mx = 0; mx < this->g_w; mx++) {
 262             int last_mul = 0;
 263             unsigned *p_gt2 = this->gt2 + mx;
 264             *p_gt2 = 0;
 265             for (int i = 1; i < 256; i++) {
 266                 last_mul = last_mul+this->g[mx];
 267                 p_gt2 += this->g_w;
 268                 *p_gt2 = last_mul;
 269                 //equivalent:
 270                 //    this->gt2[this->g_w * i+ mx] = this->g[mx] * i;
 271             }
 272         }
 273     }
 274     return 0;
 275 }
 276
 277 ass_tmp_buf::ass_tmp_buf(size_t size)
 278 {
 279     tmp = (unsigned *)malloc(size * sizeof(unsigned));
 280     this->size = size;
 281 }
 282
 283 ass_tmp_buf::ass_tmp_buf(const ass_tmp_buf& buf)
 284     :size(buf.size)
 285 {
 286     tmp = (unsigned *)malloc(size * sizeof(unsigned));
 287 }
 288
 289 ass_tmp_buf::~ass_tmp_buf()
 290 {
 291     free(tmp);
 292 }
 293
 294 /*
 295  * \brief gaussian blur.  an fast pure c implementation from libass.
 296  */
 297 static void ass_gauss_blur(unsigned char *buffer, unsigned *tmp2,
 298                            int width, int height, int stride, const unsigned *m2,
 299                            int r, int mwidth)
 300 {
 301
 302     int x, y;
 303
 304     unsigned char *s = buffer;
 305     unsigned *t = tmp2 + 1;
 306     for (y = 0; y < height; y++) {
 307         memset(t - 1, 0, (width + 1) * sizeof(*t));
 308         x = 0;
 309         if(x < r)//in case that r < 0
 310         {
 311             const int src = s[x];
 312             if (src) {
 313                 register unsigned *dstp = t + x - r;
 314                 int mx;
 315                 const unsigned *m3 = m2 + src * mwidth;
 316                 unsigned sum = 0;
 317                 for (mx = mwidth-1; mx >= r - x ; mx--) {
 318                     sum += m3[mx];
 319                     dstp[mx] += sum;
 320                 }
 321             }
 322         }
 323
 324         for (x = 1; x < r; x++) {
 325             const int src = s[x];
 326             if (src) {
 327                 register unsigned *dstp = t + x - r;
 328                 int mx;
 329                 const unsigned *m3 = m2 + src * mwidth;
 330                 for (mx = r - x; mx < mwidth; mx++) {
 331                     dstp[mx] += m3[mx];
 332                 }
 333             }
 334         }
 335
 336         for (; x < width - r; x++) {
 337             const int src = s[x];
 338             if (src) {
 339                 register unsigned *dstp = t + x - r;
 340                 int mx;
 341                 const unsigned *m3 = m2 + src * mwidth;
 342                 for (mx = 0; mx < mwidth; mx++) {
 343                     dstp[mx] += m3[mx];
 344                 }
 345             }
 346         }
 347
 348         for (; x < width-1; x++) {
 349             const int src = s[x];
 350             if (src) {
 351                 register unsigned *dstp = t + x - r;
 352                 int mx;
 353                 const int x2 = r + width - x;
 354                 const unsigned *m3 = m2 + src * mwidth;
 355                 for (mx = 0; mx < x2; mx++) {
 356                     dstp[mx] += m3[mx];
 357                 }
 358             }
 359         }
 360         if(x==width-1) //important: x==width-1 failed, if r==0
 361         {
 362             const int src = s[x];
 363             if (src) {
 364                 register unsigned *dstp = t + x - r;
 365                 int mx;
 366                 const int x2 = r + width - x;
 367                 const unsigned *m3 = m2 + src * mwidth;
 368                 unsigned sum = 0;
 369                 for (mx = 0; mx < x2; mx++) {
 370                     sum += m3[mx];
 371                     dstp[mx] += sum;
 372                 }
 373             }
 374         }
 375
 376         s += stride;
 377         t += width + 1;
 378     }
 379
 380     t = tmp2;
 381     for (x = 0; x < width; x++) {
 382         y = 0;
 383         if(y < r)//in case that r<0
 384         {
 385             unsigned *srcp = t + y * (width + 1) + 1;
 386             int src = *srcp;
 387             if (src) {
 388                 register unsigned *dstp = srcp - 1 + (mwidth -r +y)*(width + 1);
 389                 const int src2 = (src + (1<<(ass_synth_priv::VOLUME_BITS-1))) >> ass_synth_priv::VOLUME_BITS;
 390                 const unsigned *m3 = m2 + src2 * mwidth;
 391                 unsigned sum = 0;
 392                 int mx;
 393                 *srcp = (1<<(ass_synth_priv::VOLUME_BITS-1));
 394                 for (mx = mwidth-1; mx >=r - y ; mx--) {
 395                     sum += m3[mx];
 396                     *dstp += sum;
 397                     dstp -= width + 1;
 398                 }
 399             }
 400         }
 401         for (y = 1; y < r; y++) {
 402             unsigned *srcp = t + y * (width + 1) + 1;
 403             int src = *srcp;
 404             if (src) {
 405                 register unsigned *dstp = srcp - 1 + width + 1;
 406                 const int src2 = (src + (1<<(ass_synth_priv::VOLUME_BITS-1))) >> ass_synth_priv::VOLUME_BITS;
 407                 const unsigned *m3 = m2 + src2 * mwidth;
 408
 409                 int mx;
 410                 *srcp = (1<<(ass_synth_priv::VOLUME_BITS-1));
 411                 for (mx = r - y; mx < mwidth; mx++) {
 412                     *dstp += m3[mx];
 413                     dstp += width + 1;
 414                 }
 415             }
 416         }
 417         for (; y < height - r; y++) {
 418             unsigned *srcp = t + y * (width + 1) + 1;
 419             int src = *srcp;
 420             if (src) {
 421                 register unsigned *dstp = srcp - 1 - r * (width + 1);
 422                 const int src2 = (src + (1<<(ass_synth_priv::VOLUME_BITS-1))) >> ass_synth_priv::VOLUME_BITS;
 423                 const unsigned *m3 = m2 + src2 * mwidth;
 424
 425                 int mx;
 426                 *srcp = (1<<(ass_synth_priv::VOLUME_BITS-1));
 427                 for (mx = 0; mx < mwidth; mx++) {
 428                     *dstp += m3[mx];
 429                     dstp += width + 1;
 430                 }
 431             }
 432         }
 433         for (; y < height-1; y++) {
 434             unsigned *srcp = t + y * (width + 1) + 1;
 435             int src = *srcp;
 436             if (src) {
 437                 const int y2 = r + height - y;
 438                 register unsigned *dstp = srcp - 1 - r * (width + 1);
 439                 const int src2 = (src + (1<<(ass_synth_priv::VOLUME_BITS-1))) >> ass_synth_priv::VOLUME_BITS;
 440                 const unsigned *m3 = m2 + src2 * mwidth;
 441
 442                 int mx;
 443                 *srcp = (1<<(ass_synth_priv::VOLUME_BITS-1));
 444                 for (mx = 0; mx < y2; mx++) {
 445                     *dstp += m3[mx];
 446                     dstp += width + 1;
 447                 }
 448             }
 449         }
 450         if(y == height - 1)//important: y == height - 1 failed if r==0
 451         {
 452             unsigned *srcp = t + y * (width + 1) + 1;
 453             int src = *srcp;
 454             if (src) {
 455                 const int y2 = r + height - y;
 456                 register unsigned *dstp = srcp - 1 - r * (width + 1);
 457                 const int src2 = (src + (1<<(ass_synth_priv::VOLUME_BITS-1))) >> ass_synth_priv::VOLUME_BITS;
 458                 const unsigned *m3 = m2 + src2 * mwidth;
 459                 unsigned sum = 0;
 460                 int mx;
 461                 *srcp = (1<<(ass_synth_priv::VOLUME_BITS-1));
 462                 for (mx = 0; mx < y2; mx++) {
 463                     sum += m3[mx];
 464                     *dstp += sum;
 465                     dstp += width + 1;
 466                 }
 467             }
 468         }
 469         t++;
 470     }
 471
 472     t = tmp2;
 473     s = buffer;
 474     for (y = 0; y < height; y++) {
 475         for (x = 0; x < width; x++) {
 476             s[x] = t[x] >> ass_synth_priv::VOLUME_BITS;
 477         }
 478         s += stride;
 479         t += width + 1;
 480     }
 481 }
 482
 483 /**
 484  * \brief blur with [[1,2,1]. [2,4,2], [1,2,1]] kernel.
 485  */
 486 static void be_blur(unsigned char *buf, unsigned *tmp_base, int w, int h, int stride)
 487 {
 488     WORD *col_pix_buf_base = reinterpret_cast<WORD*>(xy_malloc(w*sizeof(WORD)));
 489     WORD *col_sum_buf_base = reinterpret_cast<WORD*>(xy_malloc(w*sizeof(WORD)));
 490     if(!col_sum_buf_base || !col_pix_buf_base)
 491     {
 492         //ToDo: error handling
 493         return;
 494     }
 495     memset(col_pix_buf_base, 0, w*sizeof(WORD));
 496     memset(col_sum_buf_base, 0, w*sizeof(WORD));
 497     WORD *col_pix_buf = col_pix_buf_base-2;//for aligment;
 498     WORD *col_sum_buf = col_sum_buf_base-2;//for aligment;
 499     {
 500         int y = 0;
 501         unsigned char *src=buf+y*stride;
 502
 503         int x = 2;
 504         int old_pix = src[x-1];
 505         int old_sum = old_pix + src[x-2];
 506         for ( ; x < w; x++) {
 507             int temp1 = src[x];
 508             int temp2 = old_pix + temp1;
 509             old_pix = temp1;
 510             temp1 = old_sum + temp2;
 511             old_sum = temp2;
 512             col_pix_buf[x] = temp1;
 513         }
 514     }
 515     {
 516         int y = 1;
 517         unsigned char *src=buf+y*stride;
 518
 519
 520         int x = 2;
 521         int old_pix = src[x-1];
 522         int old_sum = old_pix + src[x-2];
 523         for ( ; x < w; x++) {
 524             int temp1 = src[x];
 525             int temp2 = old_pix + temp1;
 526             old_pix = temp1;
 527             temp1 = old_sum + temp2;
 528             old_sum = temp2;
 529
 530             temp2 = col_pix_buf[x] + temp1;
 531             col_pix_buf[x] = temp1;
 532             //dst[x-1] = (col_sum_buf[x] + temp2) >> 4;
 533             col_sum_buf[x] = temp2;
 534         }
 535     }
 536
 537     //__m128i round = _mm_set1_epi16(8);
 538     for (int y = 2; y < h; y++) {
 539         unsigned char *src=buf+y*stride;
 540         unsigned char *dst=buf+(y-1)*stride;
 541
 542
 543         int x = 2;
 544         __m128i old_pix_128 = _mm_cvtsi32_si128(src[1]);
 545         __m128i old_sum_128 = _mm_cvtsi32_si128(src[0]+src[1]);
 546         for ( ; x < ((w-2)&(~7)); x+=8) {
 547             __m128i new_pix = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src+x));
 548             new_pix = _mm_unpacklo_epi8(new_pix, _mm_setzero_si128());
 549             __m128i temp = _mm_slli_si128(new_pix,2);
 550             temp = _mm_add_epi16(temp, old_pix_128);
 551             temp = _mm_add_epi16(temp, new_pix);
 552             old_pix_128 = _mm_srli_si128(new_pix,14);
 553
 554             new_pix = _mm_slli_si128(temp,2);
 555             new_pix = _mm_add_epi16(new_pix, old_sum_128);
 556             new_pix = _mm_add_epi16(new_pix, temp);
 557             old_sum_128 = _mm_srli_si128(temp, 14);
 558
 559             __m128i old_col_pix = _mm_loadu_si128( reinterpret_cast<const __m128i*>(col_pix_buf+x) );
 560             __m128i old_col_sum = _mm_loadu_si128( reinterpret_cast<const __m128i*>(col_sum_buf+x) );
 561             _mm_storeu_si128( reinterpret_cast<__m128i*>(col_pix_buf+x), new_pix );
 562             temp = _mm_add_epi16(new_pix, old_col_pix);
 563             _mm_storeu_si128( reinterpret_cast<__m128i*>(col_sum_buf+x), temp );
 564
 565             old_col_sum = _mm_add_epi16(old_col_sum, temp);
 566             //old_col_sum = _mm_add_epi16(old_col_sum, round);
 567             old_col_sum = _mm_srli_epi16(old_col_sum, 4);
 568             old_col_sum = _mm_packus_epi16(old_col_sum, old_col_sum);
 569             _mm_storel_epi64( reinterpret_cast<__m128i*>(dst+x-1), old_col_sum );
 570         }
 571         int old_pix = src[x-1];
 572         int old_sum = old_pix + src[x-2];
 573         for ( ; x < w; x++) {
 574             int temp1 = src[x];
 575             int temp2 = old_pix + temp1;
 576             old_pix = temp1;
 577             temp1 = old_sum + temp2;
 578             old_sum = temp2;
 579
 580             temp2 = col_pix_buf[x] + temp1;
 581             col_pix_buf[x] = temp1;
 582             dst[x-1] = (col_sum_buf[x] + temp2) >> 4;
 583             col_sum_buf[x] = temp2;
 584         }
 585     }
 586
 587     xy_free(col_sum_buf_base);
 588     xy_free(col_pix_buf_base);
 589 }
 590
 591 static void Bilinear(unsigned char *buf, int w, int h, int stride, int x_factor, int y_factor)
 592 {
 593     WORD *col_pix_buf_base = reinterpret_cast<WORD*>(xy_malloc(w*sizeof(WORD)));
 594     if(!col_pix_buf_base)
 595     {
 596         //ToDo: error handling
 597         return;
 598     }
 599     memset(col_pix_buf_base, 0, w*sizeof(WORD));
 600
 601     for (int y = 0; y < h; y++){
 602         unsigned char *src=buf+y*stride;
 603
 604         WORD *col_pix_buf = col_pix_buf_base;
 605         int last=0;
 606         for(int x = 0; x < w; x++)
 607         {
 608             int temp1 = src[x];
 609             int temp2 = temp1*x_factor;
 610             temp1 <<= 3;
 611             temp1 -= temp2;
 612             temp1 += last;
 613             last = temp2;
 614
 615             temp2 = temp1*y_factor;
 616             temp1 <<= 3;
 617             temp1 -= temp2;
 618             temp1 += col_pix_buf[x];
 619             src[x] = ((temp1+32)>>6);
 620             col_pix_buf[x] = temp2;
 621         }
 622     }
 623     xy_free(col_pix_buf_base);
 624 }
 625
 626 bool Rasterizer::Rasterize(const ScanLineData2& scan_line_data2, int xsub, int ysub, SharedPtrOverlay overlay)
 627 {
 628     using namespace ::boost::flyweights;
 629
 630     if(!overlay)
 631     {
 632         return false;
 633     }
 634     overlay->CleanUp();
 635
 636     if(!scan_line_data2.mWidth || !scan_line_data2.mHeight)
 637     {
 638         return true;
 639     }
 640     xsub &= 7;
 641     ysub &= 7;
 642     //xsub = ysub = 0;
 643     int width = scan_line_data2.mWidth + xsub;
 644     int height = scan_line_data2.mHeight + ysub;
 645     overlay->mOffsetX = scan_line_data2.mPathOffsetX - xsub;
 646     overlay->mOffsetY = scan_line_data2.mPathOffsetY - ysub;
 647     int wide_border = (scan_line_data2.mWideBorder+7)&~7;
 648     overlay->mfWideOutlineEmpty = scan_line_data2.mWideOutline.empty();
 649     if(!overlay->mfWideOutlineEmpty)
 650     {
 651         width += 2*wide_border ;
 652         height += 2*wide_border ;
 653         xsub += wide_border ;
 654         ysub += wide_border ;
 655         overlay->mOffsetX -= wide_border;
 656         overlay->mOffsetY -= wide_border;
 657     }
 658
 659     overlay->mWidth = width;
 660     overlay->mHeight = height;
 661     overlay->mOverlayWidth = ((width+7)>>3) + 1;
 662     overlay->mOverlayHeight = ((height+7)>>3) + 1;
 663     overlay->mOverlayPitch = (overlay->mOverlayWidth+15)&~15;
 664
 665     overlay->mpOverlayBuffer.base = (byte*)xy_malloc(2 * overlay->mOverlayPitch * overlay->mOverlayHeight);
 666     memset(overlay->mpOverlayBuffer.base, 0, 2 * overlay->mOverlayPitch * overlay->mOverlayHeight);
 667     overlay->mpOverlayBuffer.body = overlay->mpOverlayBuffer.base;
 668     overlay->mpOverlayBuffer.border = overlay->mpOverlayBuffer.base + overlay->mOverlayPitch * overlay->mOverlayHeight;
 669
 670     // Are we doing a border?
 671     const ScanLineData::tSpanBuffer* pOutline[2] = {&(scan_line_data2.mOutline), &(scan_line_data2.mWideOutline)};
 672     for(int i = countof(pOutline)-1; i >= 0; i--)
 673     {
 674         ScanLineData::tSpanBuffer::const_iterator it = pOutline[i]->begin();
 675         ScanLineData::tSpanBuffer::const_iterator itEnd = pOutline[i]->end();
 676         byte* plan_selected = i==0 ? overlay->mpOverlayBuffer.body : overlay->mpOverlayBuffer.border;
 677         int pitch = overlay->mOverlayPitch;
 678         for(; it!=itEnd; ++it)
 679         {
 680             int y = (int)(((*it).first >> 32) - 0x40000000 + ysub);
 681             int x1 = (int)(((*it).first & 0xffffffff) - 0x40000000 + xsub);
 682             int x2 = (int)(((*it).second & 0xffffffff) - 0x40000000 + xsub);
 683             if(x2 > x1)
 684             {
 685                 int first = x1>>3;
 686                 int last = (x2-1)>>3;
 687                 byte* dst = plan_selected + (pitch*(y>>3) + first);
 688                 if(first == last)
 689                     *dst += x2-x1;
 690                 else
 691                 {
 692                     *dst += ((first+1)<<3) - x1;
 693                     dst += 1;
 694                     while(++first < last)
 695                     {
 696                         *dst += 0x08;
 697                         dst += 1;
 698                     }
 699                     *dst += x2 - (last<<3);
 700                 }
 701             }
 702         }
 703     }
 704
 705     return true;
 706 }
 707
 708 // @return: true if actually a blur operation has done, or else false and output is leave unset.
 709 bool Rasterizer::Blur(const Overlay& input_overlay, int fBlur, double fGaussianBlur,
 710     SharedPtrOverlay output_overlay)
 711 {
 712     using namespace ::boost::flyweights;
 713
 714     if(!output_overlay)
 715     {
 716         return false;
 717     }
 718     output_overlay->CleanUp();
 719
 720     output_overlay->mOffsetX = input_overlay.mOffsetX;
 721     output_overlay->mOffsetY = input_overlay.mOffsetY;
 722     output_overlay->mWidth = input_overlay.mWidth;
 723     output_overlay->mHeight = input_overlay.mHeight;
 724     output_overlay->mOverlayWidth = input_overlay.mOverlayWidth;
 725     output_overlay->mOverlayHeight = input_overlay.mOverlayHeight;
 726     output_overlay->mfWideOutlineEmpty = input_overlay.mfWideOutlineEmpty;
 727
 728     int bluradjust = 0;
 729     if(fBlur || fGaussianBlur > 0.1)
 730     {
 731         if (fGaussianBlur > 0)
 732             bluradjust += (int)(fGaussianBlur*3*8 + 0.5) | 1;
 733         if (fBlur)
 734             bluradjust += 8;
 735         // Expand the buffer a bit when we're blurring, since that can also widen the borders a bit
 736         bluradjust = (bluradjust+7)&~7;
 737
 738         output_overlay->mOffsetX -= bluradjust;
 739         output_overlay->mOffsetY -= bluradjust;
 740         output_overlay->mWidth += (bluradjust<<1);
 741         output_overlay->mHeight += (bluradjust<<1);
 742         output_overlay->mOverlayWidth += (bluradjust>>2);
 743         output_overlay->mOverlayHeight += (bluradjust>>2);
 744     }
 745     else
 746     {
 747         return false;
 748     }
 749
 750     output_overlay->mOverlayPitch = (output_overlay->mOverlayWidth+15)&~15;
 751
 752     output_overlay->mpOverlayBuffer.base = (byte*)xy_malloc(2 * output_overlay->mOverlayPitch * output_overlay->mOverlayHeight);
 753     memset(output_overlay->mpOverlayBuffer.base, 0, 2 * output_overlay->mOverlayPitch * output_overlay->mOverlayHeight);
 754     output_overlay->mpOverlayBuffer.body = output_overlay->mpOverlayBuffer.base;
 755     output_overlay->mpOverlayBuffer.border = output_overlay->mpOverlayBuffer.base + output_overlay->mOverlayPitch * output_overlay->mOverlayHeight;
 756
 757     //copy buffer
 758     for(int i = 1; i >= 0; i--)
 759     {
 760         byte* plan_selected = i==0 ? output_overlay->mpOverlayBuffer.body : output_overlay->mpOverlayBuffer.border;
 761         const byte* plan_input = i==0 ? input_overlay.mpOverlayBuffer.body : input_overlay.mpOverlayBuffer.border;
 762
 763         plan_selected += (bluradjust>>3) + (bluradjust>>3)*output_overlay->mOverlayPitch;
 764         for (int j=0;j<input_overlay.mOverlayHeight;j++)
 765         {
 766             memcpy(plan_selected, plan_input, input_overlay.mOverlayPitch);
 767             plan_selected += output_overlay->mOverlayPitch;
 768             plan_input += input_overlay.mOverlayPitch;
 769         }
 770     }
 771
 772     ass_tmp_buf tmp_buf( max((output_overlay->mOverlayPitch+1)*(output_overlay->mOverlayHeight+1),0) );
 773     //flyweight<key_value<int, ass_tmp_buf, ass_tmp_buf_get_size>, no_locking> tmp_buf((overlay->mOverlayWidth+1)*(overlay->mOverlayPitch+1));
 774     // Do some gaussian blur magic
 775     if (fGaussianBlur > 0.1)//(fGaussianBlur > 0) return true even if fGaussianBlur very small
 776     {
 777         byte* plan_selected= output_overlay->mfWideOutlineEmpty ? output_overlay->mpOverlayBuffer.body : output_overlay->mpOverlayBuffer.border;
 778         flyweight<key_value<double, ass_synth_priv, ass_synth_priv_key>, no_locking> fw_priv_blur(fGaussianBlur);
 779         const ass_synth_priv& priv_blur = fw_priv_blur.get();
 780         if (output_overlay->mOverlayWidth>=priv_blur.g_w && output_overlay->mOverlayHeight>=priv_blur.g_w)
 781         {
 782             ass_gauss_blur(plan_selected, tmp_buf.tmp, output_overlay->mOverlayWidth, output_overlay->mOverlayHeight, output_overlay->mOverlayPitch,
 783                 priv_blur.gt2, priv_blur.g_r, priv_blur.g_w);
 784         }
 785     }
 786
 787     for (int pass = 0; pass < fBlur; pass++)
 788     {
 789         if(output_overlay->mOverlayWidth >= 3 && output_overlay->mOverlayHeight >= 3)
 790         {
 791             int pitch = output_overlay->mOverlayPitch;
 792             byte* plan_selected= output_overlay->mfWideOutlineEmpty ? output_overlay->mpOverlayBuffer.body : output_overlay->mpOverlayBuffer.border;
 793             be_blur(plan_selected, tmp_buf.tmp, output_overlay->mOverlayWidth, output_overlay->mOverlayHeight, pitch);
 794         }
 795     }
 796     return true;
 797 }
 798
 799 ///////////////////////////////////////////////////////////////////////////
 800
 801 static __forceinline void pixmix(DWORD *dst, DWORD color, DWORD alpha)
 802 {
 803     int a = alpha;
 804     // Make sure both a and ia are in range 1..256 for the >>8 operations below to be correct
 805     int ia = 256-a;
 806     a+=1;
 807     *dst = ((((*dst&0x00ff00ff)*ia + (color&0x00ff00ff)*a)&0xff00ff00)>>8)
 808            | ((((*dst&0x0000ff00)*ia + (color&0x0000ff00)*a)&0x00ff0000)>>8)
 809            | ((((*dst>>8)&0x00ff0000)*ia)&0xff000000);
 810 }
 811
 812 static __forceinline void pixmix2(DWORD *dst, DWORD color, DWORD shapealpha, DWORD clipalpha)
 813 {
 814     int a = (((shapealpha)*(clipalpha)*(color>>24))>>12)&0xff;
 815     int ia = 256-a;
 816     a+=1;
 817     *dst = ((((*dst&0x00ff00ff)*ia + (color&0x00ff00ff)*a)&0xff00ff00)>>8)
 818            | ((((*dst&0x0000ff00)*ia + (color&0x0000ff00)*a)&0x00ff0000)>>8)
 819            | ((((*dst>>8)&0x00ff0000)*ia)&0xff000000);
 820 }
 821
 822 #include <xmmintrin.h>
 823 #include <emmintrin.h>
 824
 825 static __forceinline void pixmix_sse2(DWORD* dst, DWORD color, DWORD alpha)
 826 {
 827 //    alpha = (((alpha) * (color>>24)) >> 6) & 0xff;
 828     color &= 0xffffff;
 829     __m128i zero = _mm_setzero_si128();
 830     __m128i a = _mm_set1_epi32(((alpha+1) << 16) | (0x100 - alpha));
 831     __m128i d = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*dst), zero);
 832     __m128i s = _mm_unpacklo_epi8(_mm_cvtsi32_si128(color), zero);
 833     __m128i r = _mm_unpacklo_epi16(d, s);
 834     r = _mm_madd_epi16(r, a);
 835     r = _mm_srli_epi32(r, 8);
 836     r = _mm_packs_epi32(r, r);
 837     r = _mm_packus_epi16(r, r);
 838     *dst = (DWORD)_mm_cvtsi128_si32(r);
 839 }
 840
 841 static __forceinline void pixmix2_sse2(DWORD* dst, DWORD color, DWORD shapealpha, DWORD clipalpha)
 842 {
 843     int alpha = (((shapealpha)*(clipalpha)*(color>>24))>>12)&0xff;
 844     color &= 0xffffff;
 845     __m128i zero = _mm_setzero_si128();
 846     __m128i a = _mm_set1_epi32(((alpha+1) << 16) | (0x100 - alpha));
 847     __m128i d = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*dst), zero);
 848     __m128i s = _mm_unpacklo_epi8(_mm_cvtsi32_si128(color), zero);
 849     __m128i r = _mm_unpacklo_epi16(d, s);
 850     r = _mm_madd_epi16(r, a);
 851     r = _mm_srli_epi32(r, 8);
 852     r = _mm_packs_epi32(r, r);
 853     r = _mm_packus_epi16(r, r);
 854     *dst = (DWORD)_mm_cvtsi128_si32(r);
 855 }
 856
 857 #include <mmintrin.h>
 858
 859 // Calculate a - b clamping to 0 instead of underflowing
 860 static __forceinline DWORD safe_subtract(DWORD a, DWORD b)
 861 {
 862     __m64 ap = _mm_cvtsi32_si64(a);
 863     __m64 bp = _mm_cvtsi32_si64(b);
 864     __m64 rp = _mm_subs_pu16(ap, bp);
 865     DWORD r = (DWORD)_mm_cvtsi64_si32(rp);
 866     _mm_empty();
 867     return r;
 868     //return (b > a) ? 0 : a - b;
 869 }
 870
 871 /***
 872  * No aligned requirement
 873  *
 874  **/
 875 void AlphaBlt(byte* pY,
 876     const byte* pAlphaMask,
 877     const byte Y,
 878     int h, int w, int src_stride, int dst_stride)
 879 {
 880     __m128i zero = _mm_setzero_si128();
 881     __m128i s = _mm_set1_epi16(Y);               //s = c  0  c  0  c  0  c  0  c  0  c  0  c  0  c  0
 882
 883     if( w>16 )//IMPORTANT! The result of the following code is undefined with w<15.
 884     {
 885         for( ; h>0; h--, pAlphaMask += src_stride, pY += dst_stride )
 886         {
 887             const BYTE* sa = pAlphaMask;
 888             BYTE* dy = pY;
 889             const BYTE* dy_first_mod16 = reinterpret_cast<BYTE*>((reinterpret_cast<int>(pY)+15)&~15);  //IMPORTANT! w must >= 15
 890             const BYTE* dy_end_mod16 = reinterpret_cast<BYTE*>(reinterpret_cast<int>(pY+w)&~15);
 891             const BYTE* dy_end = pY + w;
 892
 893             for(;dy < dy_first_mod16; sa++, dy++)
 894             {
 895                 *dy = (*dy * (256 - *sa)+ Y*(*sa+1))>>8;
 896             }
 897             for(; dy < dy_end_mod16; sa+=8, dy+=16)
 898             {
 899                 __m128i a = _mm_loadl_epi64((__m128i*)sa);
 900
 901                 //Y
 902                 __m128i d = _mm_load_si128((__m128i*)dy);
 903
 904                 //__m128i ones = _mm_cmpeq_epi32(zero,zero); //ones = ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
 905                 //__m128i ia = _mm_xor_si128(a,ones);        //ia   = ~a
 906                 //ia = _mm_unpacklo_epi8(ia,zero);           //ia   = ~a0 0 ~a1 0 ~a2 0 ~a3 0 ~a4 0 ~a5 0 ~a6 0 ~a7 0
 907                 a = _mm_unpacklo_epi8(a,zero);               //a= a0 0  a1 0  a2 0  a3 0  a4 0  a5 0  a6 0  a7 0
 908                 __m128i ones = _mm_set1_epi16(256);          //ones = 0  1  0  1  0  1  0  1  0  1  0  1  0  1  0  1
 909                 __m128i ia = _mm_sub_epi16(ones, a);         //ia   = 256-a0 ... 256-a7
 910                 ones = _mm_srli_epi16(ones, 8);
 911                 a = _mm_add_epi16(a, ones);                  //a= 1+a0 ... 1+a7
 912
 913                 __m128i dl = _mm_unpacklo_epi8(d,zero);               //d    = b0 0  b1 0  b2 0  b3 0  b4 0  b5 0  b6 0  b7 0
 914                 __m128i sl = _mm_mullo_epi16(s,a);            //sl   = c0*a0  c1*a1  ... c7*a7
 915
 916                 dl = _mm_mullo_epi16(dl,ia);                   //d    = b0*~a0 b1*~a1 ... b7*~a7
 917
 918                 dl = _mm_add_epi16(dl,sl);                     //d   = d + sl
 919                 dl = _mm_srli_epi16(dl, 8);                    //d   = d>>8
 920
 921                 sa += 8;
 922                 a = _mm_loadl_epi64((__m128i*)sa);
 923
 924                 a = _mm_unpacklo_epi8(a,zero);
 925                 ones = _mm_slli_epi16(ones, 8);
 926                 ia = _mm_sub_epi16(ones, a);
 927                 ones = _mm_srli_epi16(ones, 8);
 928                 a = _mm_add_epi16(a,ones);
 929
 930                 d = _mm_unpackhi_epi8(d,zero);
 931                 sl = _mm_mullo_epi16(s,a);
 932                 d = _mm_mullo_epi16(d,ia);
 933                 d = _mm_add_epi16(d,sl);
 934                 d = _mm_srli_epi16(d, 8);
 935
 936                 dl = _mm_packus_epi16(dl,d);
 937
 938                 _mm_store_si128((__m128i*)dy, dl);
 939             }
 940             for(;dy < dy_end; sa++, dy++)
 941             {
 942                 *dy = (*dy * (256 - *sa)+ Y*(*sa+1))>>8;
 943             }
 944         }
 945     }
 946     else
 947     {
 948         for( ; h>0; h--, pAlphaMask += src_stride, pY += dst_stride )
 949         {
 950             const BYTE* sa = pAlphaMask;
 951             BYTE* dy = pY;
 952             const BYTE* dy_end = pY + w;
 953
 954             for(;dy < dy_end; sa++, dy++)
 955             {
 956                 *dy = (*dy * (256 - *sa)+ Y*(*sa+1))>>8;
 957             }
 958         }
 959     }
 960     //__asm emms;
 961 }
 962
 963 /***
 964  * No aligned requirement
 965  *
 966  **/
 967 void AlphaBlt(byte* pY,
 968     const byte alpha,
 969     const byte Y,
 970     int h, int w, int dst_stride)
 971 {
 972     int yPremul = Y*(alpha+1);
 973     int dstAlpha = 0x100 - alpha;
 974     if( w>32 )//IMPORTANT! The result of the following code is undefined with w<15.
 975     {
 976         __m128i zero = _mm_setzero_si128();
 977         __m128i s = _mm_set1_epi16(yPremul);    //s = c  0  c  0  c  0  c  0  c  0  c  0  c  0  c  0
 978         __m128i ia = _mm_set1_epi16(dstAlpha);
 979         for( ; h>0; h--, pY += dst_stride )
 980         {
 981             BYTE* dy = pY;
 982             const BYTE* dy_first_mod16 = reinterpret_cast<BYTE*>((reinterpret_cast<int>(pY)+15)&~15);  //IMPORTANT! w must >= 15
 983             const BYTE* dy_end_mod16 = reinterpret_cast<BYTE*>(reinterpret_cast<int>(pY+w)&~15);
 984             const BYTE* dy_end = pY + w;
 985
 986             for(;dy < dy_first_mod16; dy++)
 987             {
 988                 *dy = (*dy * dstAlpha + yPremul)>>8;
 989             }
 990             for(; dy < dy_end_mod16; dy+=16)
 991             {
 992                 //Y
 993                 __m128i d = _mm_load_si128(reinterpret_cast<const __m128i*>(dy));
 994                 __m128i dl = _mm_unpacklo_epi8(d,zero);        //d    = b0 0  b1 0  b2 0  b3 0  b4 0  b5 0  b6 0  b7 0
 995
 996                 dl = _mm_mullo_epi16(dl,ia);                   //d    = b0*~a0 b1*~a1 ... b7*~a7
 997                 dl = _mm_adds_epu16(dl,s);                     //d   = d + s
 998                 dl = _mm_srli_epi16(dl, 8);                    //d   = d>>8
 999
1000                 d = _mm_unpackhi_epi8(d,zero);
1001                 d = _mm_mullo_epi16(d,ia);
1002                 d = _mm_adds_epu16(d,s);
1003                 d = _mm_srli_epi16(d, 8);
1004
1005                 dl = _mm_packus_epi16(dl,d);
1006
1007                 _mm_store_si128(reinterpret_cast<__m128i*>(dy), dl);
1008             }
1009             for(;dy < dy_end; dy++)
1010             {
1011                 *dy = (*dy * dstAlpha + yPremul)>>8;
1012             }
1013         }
1014     }
1015     else
1016     {
1017         for( ; h>0; h--, pY += dst_stride )
1018         {
1019             BYTE* dy = pY;
1020             const BYTE* dy_end = pY + w;
1021
1022             for(;dy < dy_end; dy++)
1023             {
1024                 *dy = (*dy * dstAlpha + yPremul)>>8;
1025             }
1026         }
1027     }
1028     //__asm emms;
1029 }
1030
1031 /***
1032  * No aligned requirement
1033  *
1034  **/
1035 void AlphaBltC(byte* pY,
1036     const byte alpha,
1037     const byte Y,
1038     int h, int w, int dst_stride)
1039 {
1040     int yPremul = Y*(alpha+1);
1041     int dstAlpha = 0x100 - alpha;
1042
1043     for( ; h>0; h--, pY += dst_stride )
1044     {
1045         BYTE* dy = pY;
1046         const BYTE* dy_end = pY + w;
1047
1048         for(;dy < dy_end; dy++)
1049         {
1050             *dy = (*dy * dstAlpha + yPremul)>>8;
1051         }
1052     }
1053 }
1054
1055 // For CPUID usage in Rasterizer::Draw
1056 #include "../dsutil/vd.h"
1057
// 64-bit constant with 0x00ff in each of its four 16-bit words.
// NOTE(review): not referenced in the part of the file reviewed here - presumably
// a mask for packed 16-bit arithmetic; confirm it is still used before keeping it.
static const __int64 _00ff00ff00ff00ff = 0x00ff00ff00ff00ffi64;
1059
1060 // Render a subpicture onto a surface.
1061 // spd is the surface to render on.
1062 // clipRect is a rectangular clip region to render inside.
1063 // pAlphaMask is an alpha clipping mask.
1064 // xsub and ysub ???
1065 // switchpts seems to be an array of fill colours interlaced with coordinates.
1066 //    switchpts[i*2] contains a colour and switchpts[i*2+1] contains the coordinate to use that colour from
1067 // fBody tells whether to render the body of the subs.
1068 // fBorder tells whether to render the border of the subs.
1069 SharedPtrByte Rasterizer::CompositeAlphaMask(SubPicDesc& spd, SharedPtrOverlay overlay, const CRect& clipRect, byte* pAlphaMask,
1070     int xsub, int ysub, const DWORD* switchpts, bool fBody, bool fBorder,
1071     CRect *outputDirtyRect)
1072 {
1073     //fix me: check and log error
1074     SharedPtrByte result;
1075     *outputDirtyRect = CRect(0, 0, 0, 0);
1076     if(!switchpts || !fBody && !fBorder) return(result);
1077
1078     // clip
1079     // Limit drawn area to intersection of rendering surface and rectangular clip area
1080     CRect r(0, 0, spd.w, spd.h);
1081     r &= clipRect;
1082     // Remember that all subtitle coordinates are specified in 1/8 pixels
1083     // (x+4)>>3 rounds to nearest whole pixel.
1084     // ??? What is xsub, ysub, mOffsetX and mOffsetY ?
1085     int x = (xsub + overlay->mOffsetX + 4)>>3;
1086     int y = (ysub + overlay->mOffsetY + 4)>>3;
1087     int w = overlay->mOverlayWidth;
1088     int h = overlay->mOverlayHeight;
1089     int xo = 0, yo = 0;
1090     // Again, limiting?
1091     if(x < r.left) {xo = r.left-x; w -= r.left-x; x = r.left;}
1092     if(y < r.top) {yo = r.top-y; h -= r.top-y; y = r.top;}
1093     if(x+w > r.right) w = r.right-x;
1094     if(y+h > r.bottom) h = r.bottom-y;
1095     // Check if there's actually anything to render
1096     if(w <= 0 || h <= 0) return(result);
1097     outputDirtyRect->SetRect(x, y, x+w, y+h);
1098     *outputDirtyRect &= CRect(0, 0, spd.w, spd.h);
1099
1100     bool fSingleColor = (switchpts[1]==0xffffffff);
1101
1102     // draw
1103     // Grab the first colour
1104     DWORD color = switchpts[0];
1105     byte* s_base = (byte*)xy_malloc(overlay->mOverlayPitch * overlay->mOverlayHeight);
1106
1107     if(fSingleColor)
1108     {
1109         overlay->FillAlphaMash(s_base, fBody, fBorder, xo, yo, w, h,
1110             pAlphaMask==NULL ? NULL : pAlphaMask + spd.w * y + x, spd.w,
1111             color>>24 );
1112     }
1113     else
1114     {
1115         int last_x = xo;
1116         const DWORD *sw = switchpts;
1117         while( last_x<w+xo )
1118         {
1119             byte alpha = sw[0]>>24;
1120             while( sw[3]<w+xo && (sw[2]>>24)==alpha )
1121             {
1122                 sw += 2;
1123             }
1124             int new_x = sw[3] < w+xo ? sw[3] : w+xo;
1125             overlay->FillAlphaMash(s_base, fBody, fBorder,
1126                 last_x, yo, new_x-last_x, h,
1127                 pAlphaMask==NULL ? NULL : pAlphaMask + spd.w * y + x + last_x - xo, spd.w,
1128                 alpha );
1129             last_x = new_x;
1130             sw += 2;
1131         }
1132     }
1133     result.reset( s_base, xy_free );
1134     return result;
1135 }
1136
1137 CRect Rasterizer::Draw(SubPicDesc& spd, SharedPtrOverlay overlay, const CRect& clipRect, byte* pAlphaMask,
1138     int xsub, int ysub, const DWORD* switchpts, bool fBody, bool fBorder)
1139 {
1140     CRect bbox(0,0,0,0);
1141     if(!switchpts || !fBody && !fBorder) return(bbox);
1142
1143     // clip
1144     // Limit drawn area to intersection of rendering surface and rectangular clip area
1145     CRect r(0, 0, spd.w, spd.h);
1146     r &= clipRect;
1147     // Remember that all subtitle coordinates are specified in 1/8 pixels
1148     // (x+4)>>3 rounds to nearest whole pixel.
1149     // ??? What is xsub, ysub, mOffsetX and mOffsetY ?
1150     int overlayPitch = overlay->mOverlayPitch;
1151     int x = (xsub + overlay->mOffsetX + 4)>>3;
1152     int y = (ysub + overlay->mOffsetY + 4)>>3;
1153     int w = overlay->mOverlayWidth;
1154     int h = overlay->mOverlayHeight;
1155     int xo = 0, yo = 0;
1156     // Again, limiting?
1157     if(x < r.left) {xo = r.left-x; w -= r.left-x; x = r.left;}
1158     if(y < r.top) {yo = r.top-y; h -= r.top-y; y = r.top;}
1159     if(x+w > r.right) w = r.right-x;
1160     if(y+h > r.bottom) h = r.bottom-y;
1161     // Check if there's actually anything to render
1162     if(w <= 0 || h <= 0) return(bbox);
1163
1164     // CPUID from VDub
1165     bool fSSE2 = !!(g_cpuid.m_flags & CCpuID::sse2);
1166     bool fSingleColor = (switchpts[1]==0xffffffff);
1167     bool AYUV_PLANAR = (spd.type==MSP_AYUV_PLANAR);
1168     int draw_method = 0;
1169     if(fSingleColor)
1170         draw_method |= DM::SINGLE_COLOR;
1171     if(fSSE2)
1172         draw_method |= DM::SSE2;
1173     if(AYUV_PLANAR)
1174         draw_method |= DM::AYUV_PLANAR;
1175
1176     // draw
1177     // Grab the first colour
1178     DWORD color = switchpts[0];
1179     SharedPtrByte s_base = CompositeAlphaMask(spd, overlay, clipRect, pAlphaMask, xsub, ysub, switchpts,
1180         fBody, fBorder, &bbox);
1181     const byte* s = s_base.get() + overlay->mOverlayPitch*yo + xo;
1182
1183     // How would this differ from src?
1184     unsigned long* dst = (unsigned long *)(((char *)spd.bits + spd.pitch * y) + ((x*spd.bpp)>>3));
1185
1186     // Every remaining line in the bitmap to be rendered...
1187     switch(draw_method)
1188     {
1189     case   DM::SINGLE_COLOR |   DM::SSE2 | 0*DM::AYUV_PLANAR :
1190     {
1191         while(h--)
1192         {
1193             for(int wt=0; wt<w; ++wt)
1194                 // The <<6 is due to pixmix expecting the alpha parameter to be
1195                 // the multiplication of two 6-bit unsigned numbers but we
1196                 // only have one here. (No alpha mask.)
1197                 pixmix_sse2(&dst[wt], color, s[wt]);
1198             s += overlayPitch;
1199             dst = (unsigned long *)((char *)dst + spd.pitch);
1200         }
1201     }
1202     break;
1203     case   DM::SINGLE_COLOR | 0*DM::SSE2 | 0*DM::AYUV_PLANAR :
1204     {
1205         while(h--)
1206         {
1207             for(int wt=0; wt<w; ++wt)
1208                 pixmix(&dst[wt], color, s[wt]);
1209             s += overlayPitch;
1210             dst = (unsigned long *)((char *)dst + spd.pitch);
1211         }
1212     }
1213     break;
1214     case 0*DM::SINGLE_COLOR |   DM::SSE2 | 0*DM::AYUV_PLANAR :
1215     {
1216         while(h--)
1217         {
1218             const DWORD *sw = switchpts;
1219             for(int wt=0; wt<w; ++wt)
1220             {
1221                 // xo is the offset (usually negative) we have moved into the image
1222                 // So if we have passed the switchpoint (?) switch to another colour
1223                 // (So switchpts stores both colours *and* coordinates?)
1224                 if(wt+xo >= sw[1]) {while(wt+xo >= sw[1]) sw += 2; color = sw[-2];}
1225                 pixmix_sse2(&dst[wt], color, s[wt]);
1226             }
1227             s += overlayPitch;
1228             dst = (unsigned long *)((char *)dst + spd.pitch);
1229         }
1230     }
1231     break;
1232     case 0*DM::SINGLE_COLOR | 0*DM::SSE2 | 0*DM::AYUV_PLANAR :
1233     {
1234         while(h--)
1235         {
1236             const DWORD *sw = switchpts;
1237             for(int wt=0; wt<w; ++wt)
1238             {
1239                 if(wt+xo >= sw[1]) {while(wt+xo >= sw[1]) sw += 2; color = sw[-2];}
1240                 pixmix(&dst[wt], color, s[wt]);
1241             }
1242             s += overlayPitch;
1243             dst = (unsigned long *)((char *)dst + spd.pitch);
1244         }
1245     }
1246     break;
1247     case   DM::SINGLE_COLOR |   DM::SSE2 |   DM::AYUV_PLANAR :
1248     {
1249         unsigned char* dst_A = (unsigned char*)dst;
1250         unsigned char* dst_Y = dst_A + spd.pitch*spd.h;
1251         unsigned char* dst_U = dst_Y + spd.pitch*spd.h;
1252         unsigned char* dst_V = dst_U + spd.pitch*spd.h;
1253
1254         AlphaBlt(dst_Y, s, ((color)>>16)&0xff, h, w, overlayPitch, spd.pitch);
1255         AlphaBlt(dst_U, s, ((color)>>8)&0xff, h, w, overlayPitch, spd.pitch);
1256         AlphaBlt(dst_V, s, ((color))&0xff, h, w, overlayPitch, spd.pitch);
1257         AlphaBlt(dst_A, s, 0, h, w, overlayPitch, spd.pitch);
1258     }
1259     break;
1260     case 0*DM::SINGLE_COLOR |   DM::SSE2 |   DM::AYUV_PLANAR :
1261     {
1262         unsigned char* dst_A = (unsigned char*)dst;
1263         unsigned char* dst_Y = dst_A + spd.pitch*spd.h;
1264         unsigned char* dst_U = dst_Y + spd.pitch*spd.h;
1265         unsigned char* dst_V = dst_U + spd.pitch*spd.h;
1266
1267         const DWORD *sw = switchpts;
1268         int last_x = xo;
1269         color = sw[0];
1270         while(last_x<w+xo)
1271         {
1272             int new_x = sw[3] < w+xo ? sw[3] : w+xo;
1273             color = sw[0];
1274             sw += 2;
1275             if( new_x < last_x )
1276                 continue;
1277             AlphaBlt(dst_Y, s + last_x - xo, (color>>16)&0xff, h, new_x-last_x, overlayPitch, spd.pitch);
1278             AlphaBlt(dst_U, s + last_x - xo, (color>>8)&0xff, h, new_x-last_x, overlayPitch, spd.pitch);
1279             AlphaBlt(dst_V, s + last_x - xo, (color)&0xff, h, new_x-last_x, overlayPitch, spd.pitch);
1280             AlphaBlt(dst_A, s + last_x - xo, 0, h, new_x-last_x, overlayPitch, spd.pitch);
1281
1282             dst_A += new_x - last_x;
1283             dst_Y += new_x - last_x;
1284             dst_U += new_x - last_x;
1285             dst_V += new_x - last_x;
1286             last_x = new_x;
1287         }
1288     }
1289     break;
1290     case   DM::SINGLE_COLOR | 0*DM::SSE2 |   DM::AYUV_PLANAR :
1291     {
1292 //        char * debug_dst=(char*)dst;int h2 = h;
1293 //        XY_DO_ONCE( xy_logger::write_file("G:\\b2_rt", (char*)&color, sizeof(color)) );
1294 //        XY_DO_ONCE( xy_logger::write_file("G:\\b2_rt", debug_dst, (h2-1)*spd.pitch) );
1295 //        debug_dst += spd.pitch*spd.h;
1296 //        XY_DO_ONCE( xy_logger::write_file("G:\\b2_rt", debug_dst, (h2-1)*spd.pitch) );
1297 //        debug_dst += spd.pitch*spd.h;
1298 //        XY_DO_ONCE( xy_logger::write_file("G:\\b2_rt", debug_dst, (h2-1)*spd.pitch) );
1299 //        debug_dst += spd.pitch*spd.h;
1300 //        XY_DO_ONCE( xy_logger::write_file("G:\\b2_rt", debug_dst, (h2-1)*spd.pitch) );
1301 //        debug_dst=(char*)dst;
1302
1303         unsigned char* dst_A = (unsigned char*)dst;
1304         unsigned char* dst_Y = dst_A + spd.pitch*spd.h;
1305         unsigned char* dst_U = dst_Y + spd.pitch*spd.h;
1306         unsigned char* dst_V = dst_U + spd.pitch*spd.h;
1307         while(h--)
1308         {
1309             for(int wt=0; wt<w; ++wt)
1310             {
1311                 DWORD temp = COMBINE_AYUV(dst_A[wt], dst_Y[wt], dst_U[wt], dst_V[wt]);
1312                 pixmix(&temp, color, s[wt]);
1313                 SPLIT_AYUV(temp, dst_A+wt, dst_Y+wt, dst_U+wt, dst_V+wt);
1314             }
1315             s += overlayPitch;
1316             dst_A += spd.pitch;
1317             dst_Y += spd.pitch;
1318             dst_U += spd.pitch;
1319             dst_V += spd.pitch;
1320         }
1321 //        XY_DO_ONCE( xy_logger::write_file("G:\\a2_rt", debug_dst, (h2-1)*spd.pitch) );
1322 //        debug_dst += spd.pitch*spd.h;
1323 //        XY_DO_ONCE( xy_logger::write_file("G:\\a2_rt", debug_dst, (h2-1)*spd.pitch) );
1324 //        debug_dst += spd.pitch*spd.h;
1325 //        XY_DO_ONCE( xy_logger::write_file("G:\\a2_rt", debug_dst, (h2-1)*spd.pitch) );
1326 //        debug_dst += spd.pitch*spd.h;
1327 //        XY_DO_ONCE( xy_logger::write_file("G:\\a2_rt", debug_dst, (h2-1)*spd.pitch) );
1328     }
1329     break;
1330     case 0*DM::SINGLE_COLOR | 0*DM::SSE2 |   DM::AYUV_PLANAR :
1331     {
1332         unsigned char* dst_A = (unsigned char*)dst;
1333         unsigned char* dst_Y = dst_A + spd.pitch*spd.h;
1334         unsigned char* dst_U = dst_Y + spd.pitch*spd.h;
1335         unsigned char* dst_V = dst_U + spd.pitch*spd.h;
1336         while(h--)
1337         {
1338             const DWORD *sw = switchpts;
1339             for(int wt=0; wt<w; ++wt)
1340             {
1341                 if(wt+xo >= sw[1]) {while(wt+xo >= sw[1]) sw += 2; color = sw[-2];}
1342                 DWORD temp = COMBINE_AYUV(dst_A[wt], dst_Y[wt], dst_U[wt], dst_V[wt]);
1343                 pixmix(&temp, color, (s[wt]*(color>>24))>>8);
1344                 SPLIT_AYUV(temp, dst_A+wt, dst_Y+wt, dst_U+wt, dst_V+wt);
1345             }
1346             s += overlayPitch;
1347             dst_A += spd.pitch;
1348             dst_Y += spd.pitch;
1349             dst_U += spd.pitch;
1350             dst_V += spd.pitch;
1351         }
1352     }
1353     break;
1354     }
1355     // Remember to EMMS!
1356     // Rendering fails in funny ways if we don't do this.
1357     _mm_empty();
1358     return bbox;
1359 }
1360
1361 CRect Rasterizer::Draw( SubPicDesc& spd, DrawItem& draw_item )
1362 {
1363     return Draw(spd, draw_item.overlay, draw_item.clip_rect, draw_item.alpha_mask.get(),
1364         draw_item.xsub, draw_item.ysub, draw_item.switchpts, draw_item.fBody, draw_item.fBorder);
1365 }
1366
1367 DrawItem* Rasterizer::CreateDrawItem( SubPicDesc& spd, SharedPtrOverlay overlay, const CRect& clipRect, SharedArrayByte pAlphaMask, int xsub, int ysub, const DWORD* switchpts, bool fBody, bool fBorder )
1368 {
1369     DrawItem* result = new DrawItem();
1370     result->overlay = overlay;
1371     result->clip_rect = clipRect;
1372     result->alpha_mask = pAlphaMask;
1373     result->xsub = xsub;
1374     result->ysub = ysub;
1375
1376     memcpy(result->switchpts, switchpts, sizeof(result->switchpts));
1377     result->fBody = fBody;
1378     result->fBorder = fBorder;
1379     return result;
1380 }
1381
1382 CRect Rasterizer::DryDraw( SubPicDesc& spd, SharedPtrOverlay overlay, const CRect& clipRect, byte* pAlphaMask, int xsub, int ysub, const DWORD* switchpts, bool fBody, bool fBorder )
1383 {
1384     CRect bbox(0, 0, 0, 0);
1385     if(!switchpts || !fBody && !fBorder) return(bbox);
1386
1387     // clip
1388     // Limit drawn area to intersection of rendering surface and rectangular clip area
1389     CRect r(0, 0, spd.w, spd.h);
1390     r &= clipRect;
1391     // Remember that all subtitle coordinates are specified in 1/8 pixels
1392     // (x+4)>>3 rounds to nearest whole pixel.
1393     // ??? What is xsub, ysub, mOffsetX and mOffsetY ?
1394     int overlayPitch = overlay->mOverlayPitch;
1395     int x = (xsub + overlay->mOffsetX + 4)>>3;
1396     int y = (ysub + overlay->mOffsetY + 4)>>3;
1397     int w = overlay->mOverlayWidth;
1398     int h = overlay->mOverlayHeight;
1399     int xo = 0, yo = 0;
1400     // Again, limiting?
1401     if(x < r.left) {xo = r.left-x; w -= r.left-x; x = r.left;}
1402     if(y < r.top) {yo = r.top-y; h -= r.top-y; y = r.top;}
1403     if(x+w > r.right) w = r.right-x;
1404     if(y+h > r.bottom) h = r.bottom-y;
1405     // Check if there's actually anything to render
1406     if(w <= 0 || h <= 0) return(bbox);
1407     bbox.SetRect(x, y, x+w, y+h);
1408     bbox &= CRect(0, 0, spd.w, spd.h);
1409
1410     return bbox;
1411 }
1412
1413 CRect Rasterizer::DryDraw( SubPicDesc& spd, DrawItem& draw_item )
1414 {
1415     return DryDraw(spd, draw_item.overlay, draw_item.clip_rect, draw_item.alpha_mask.get(),
1416         draw_item.xsub, draw_item.ysub, draw_item.switchpts, draw_item.fBody, draw_item.fBorder);
1417 }
1418
1419 void Rasterizer::FillSolidRect(SubPicDesc& spd, int x, int y, int nWidth, int nHeight, DWORD argb)
1420 {
1421     bool fSSE2 = !!(g_cpuid.m_flags & CCpuID::sse2);
1422     bool AYUV_PLANAR = (spd.type==MSP_AYUV_PLANAR);
1423     int draw_method = 0;
1424     if(fSSE2)
1425         draw_method |= DM::SSE2;
1426     if(AYUV_PLANAR)
1427         draw_method |= DM::AYUV_PLANAR;
1428
1429     switch (draw_method)
1430     {
1431     case   DM::SSE2 | 0*DM::AYUV_PLANAR :
1432     {
1433         for (int wy=y; wy<y+nHeight; wy++) {
1434             DWORD* dst = (DWORD*)((BYTE*)spd.bits + spd.pitch * wy) + x;
1435             for(int wt=0; wt<nWidth; ++wt) {
1436                 pixmix_sse2(&dst[wt], argb, argb>>24);
1437             }
1438         }
1439     }
1440     break;
1441     case 0*DM::SSE2 | 0*DM::AYUV_PLANAR :
1442     {
1443         for (int wy=y; wy<y+nHeight; wy++) {
1444             DWORD* dst = (DWORD*)((BYTE*)spd.bits + spd.pitch * wy) + x;
1445             for(int wt=0; wt<nWidth; ++wt) {
1446                 pixmix(&dst[wt], argb,  argb>>24);
1447             }
1448         }
1449     }
1450     break;
1451     case   DM::SSE2 |   DM::AYUV_PLANAR :
1452     {
1453         BYTE* dst = reinterpret_cast<BYTE*>(spd.bits) + spd.pitch * y + x;
1454         BYTE* dst_A = dst;
1455         BYTE* dst_Y = dst_A + spd.pitch*spd.h;
1456         BYTE* dst_U = dst_Y + spd.pitch*spd.h;
1457         BYTE* dst_V = dst_U + spd.pitch*spd.h;
1458         AlphaBlt(dst_Y, argb>>24, ((argb)>>16)&0xff, nHeight, nWidth, spd.pitch);
1459         AlphaBlt(dst_U, argb>>24, ((argb)>>8)&0xff, nHeight, nWidth, spd.pitch);
1460         AlphaBlt(dst_V, argb>>24, ((argb))&0xff, nHeight, nWidth, spd.pitch);
1461         AlphaBlt(dst_A, argb>>24, 0, nHeight, nWidth, spd.pitch);
1462     }
1463     break;
1464     case 0*DM::SSE2 |   DM::AYUV_PLANAR :
1465     {
1466         BYTE* dst = reinterpret_cast<BYTE*>(spd.bits) + spd.pitch * y + x;
1467         BYTE* dst_A = dst;
1468         BYTE* dst_Y = dst_A + spd.pitch*spd.h;
1469         BYTE* dst_U = dst_Y + spd.pitch*spd.h;
1470         BYTE* dst_V = dst_U + spd.pitch*spd.h;
1471         AlphaBltC(dst_Y, argb>>24, ((argb)>>16)&0xff, nHeight, nWidth, spd.pitch);
1472         AlphaBltC(dst_U, argb>>24, ((argb)>>8)&0xff, nHeight, nWidth, spd.pitch);
1473         AlphaBltC(dst_V, argb>>24, ((argb))&0xff, nHeight, nWidth, spd.pitch);
1474         AlphaBltC(dst_A, argb>>24, 0, nHeight, nWidth, spd.pitch);
1475     }
1476     break;
1477     }
1478     _mm_empty();
1479 }
1480
1481 ///////////////////////////////////////////////////////////////
1482
1483 // Overlay
1484
1485 void Overlay::_DoFillAlphaMash(byte* outputAlphaMask, const byte* pBody, const byte* pBorder, int x, int y, int w, int h, const byte* pAlphaMask, int pitch, DWORD color_alpha )
1486 {
1487     pBody = pBody!=NULL ? pBody + y*mOverlayPitch + x: NULL;
1488     pBorder = pBorder!=NULL ? pBorder + y*mOverlayPitch + x: NULL;
1489     byte* dst = outputAlphaMask + y*mOverlayPitch + x;
1490
1491     const int x0 = ((reinterpret_cast<int>(dst)+3)&~3) - reinterpret_cast<int>(dst) < w ?
1492                     ((reinterpret_cast<int>(dst)+3)&~3) - reinterpret_cast<int>(dst) : w; //IMPORTANT! Should not exceed w.
1493     const int x00 = ((reinterpret_cast<int>(dst)+15)&~15) - reinterpret_cast<int>(dst) < w ?
1494                     ((reinterpret_cast<int>(dst)+15)&~15) - reinterpret_cast<int>(dst) : w;//IMPORTANT! Should not exceed w.
1495     const int x_end00  = ((reinterpret_cast<int>(dst)+w)&~15) - reinterpret_cast<int>(dst);
1496     const int x_end0 = ((reinterpret_cast<int>(dst)+w)&~3) - reinterpret_cast<int>(dst);
1497     const int x_end = w;
1498
1499     __m64 color_alpha_64 = _mm_set1_pi16(color_alpha);
1500     __m128i color_alpha_128 = _mm_set1_epi16(color_alpha);
1501
1502     if(pAlphaMask==NULL && pBody!=NULL && pBorder!=NULL)
1503     {
1504         /*
1505         __asm
1506         {
1507         mov        eax, color_alpha
1508         movd       XMM3, eax
1509         punpcklwd  XMM3, XMM3
1510         pshufd     XMM3, XMM3, 0
1511         }
1512         */
1513         while(h--)
1514         {
1515             int j=0;
1516             for( ; j<x0; j++ )
1517             {
1518                 int temp = pBorder[j]-pBody[j];
1519                 temp = temp<0 ? 0 : temp;
1520                 dst[j] = (temp * color_alpha)>>6;
1521             }
1522             for( ;j<x00;j+=4 )
1523             {
1524                 __m64 border = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBorder+j));
1525                 __m64 body = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBody+j));
1526                 border = _mm_subs_pu8(border, body);
1527                 __m64 zero = _mm_setzero_si64();
1528                 border = _mm_unpacklo_pi8(border, zero);
1529                 border = _mm_mullo_pi16(border, color_alpha_64);
1530                 border = _mm_srli_pi16(border, 6);
1531                 border = _mm_packs_pu16(border,border);
1532                 *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(border);
1533             }
1534             __m128i zero = _mm_setzero_si128();
1535             for( ;j<x_end00;j+=16)
1536             {
1537                 __m128i border = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pBorder+j));
1538                 __m128i body = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pBody+j));
1539                 border = _mm_subs_epu8(border,body);
1540                 __m128i srchi = border;
1541                 border = _mm_unpacklo_epi8(border, zero);
1542                 srchi = _mm_unpackhi_epi8(srchi, zero);
1543                 border = _mm_mullo_epi16(border, color_alpha_128);
1544                 srchi = _mm_mullo_epi16(srchi, color_alpha_128);
1545                 border = _mm_srli_epi16(border, 6);
1546                 srchi = _mm_srli_epi16(srchi, 6);
1547                 border = _mm_packus_epi16(border, srchi);
1548                 _mm_storeu_si128(reinterpret_cast<__m128i*>(dst+j), border);
1549             }
1550             for( ;j<x_end0;j+=4)
1551             {
1552                 __m64 border = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBorder+j));
1553                 __m64 body = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBody+j));
1554                 border = _mm_subs_pu8(border, body);
1555                 __m64 zero = _mm_setzero_si64();
1556                 border = _mm_unpacklo_pi8(border, zero);
1557                 border = _mm_mullo_pi16(border, color_alpha_64);
1558                 border = _mm_srli_pi16(border, 6);
1559                 border = _mm_packs_pu16(border,border);
1560                 *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(border);
1561             }
1562             for( ;j<x_end;j++)
1563             {
1564                 int temp = pBorder[j]-pBody[j];
1565                 temp = temp<0 ? 0 : temp;
1566                 dst[j] = (temp * color_alpha)>>6;
1567             }
1568             pBody += mOverlayPitch;
1569             pBorder += mOverlayPitch;
1570             //pAlphaMask += pitch;
1571             dst += mOverlayPitch;
1572         }
1573     }
1574     else if( ((pBody==NULL) + (pBorder==NULL))==1 && pAlphaMask==NULL)
1575     {
1576         const BYTE* src1 = pBody!=NULL ? pBody : pBorder;
1577         while(h--)
1578         {
1579             int j=0;
1580             for( ; j<x0; j++ )
1581             {
1582                 dst[j] = (src1[j] * color_alpha)>>6;
1583             }
1584             for( ;j<x00;j+=4 )
1585             {
1586                 __m64 src = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(src1+j));
1587                 __m64 zero = _mm_setzero_si64();
1588                 src = _mm_unpacklo_pi8(src, zero);
1589                 src = _mm_mullo_pi16(src, color_alpha_64);
1590                 src = _mm_srli_pi16(src, 6);
1591                 src = _mm_packs_pu16(src,src);
1592                 *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(src);
1593             }
1594             __m128i zero = _mm_setzero_si128();
1595             for( ;j<x_end00;j+=16)
1596             {
1597                 __m128i src = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src1+j));
1598                 __m128i srchi = src;
1599                 src = _mm_unpacklo_epi8(src, zero);
1600                 srchi = _mm_unpackhi_epi8(srchi, zero);
1601                 src = _mm_mullo_epi16(src, color_alpha_128);
1602                 srchi = _mm_mullo_epi16(srchi, color_alpha_128);
1603                 src = _mm_srli_epi16(src, 6);
1604                 srchi = _mm_srli_epi16(srchi, 6);
1605                 src = _mm_packus_epi16(src, srchi);
1606                 _mm_storeu_si128(reinterpret_cast<__m128i*>(dst+j), src);
1607             }
1608             for( ;j<x_end0;j+=4)
1609             {
1610                 __m64 src = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(src1+j));
1611                 __m64 zero = _mm_setzero_si64();
1612                 src = _mm_unpacklo_pi8(src, zero);
1613                 src = _mm_mullo_pi16(src, color_alpha_64);
1614                 src = _mm_srli_pi16(src, 6);
1615                 src = _mm_packs_pu16(src,src);
1616                 *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(src);
1617             }
1618             for( ;j<x_end;j++)
1619             {
1620                 dst[j] = (src1[j] * color_alpha)>>6;
1621             }
1622             src1 += mOverlayPitch;
1623             //pAlphaMask += pitch;
1624             dst += mOverlayPitch;
1625         }
1626     }
1627     else if( ((pBody==NULL) + (pBorder==NULL))==1 && pAlphaMask!=NULL)
1628     {
1629         const BYTE* src1 = pBody!=NULL ? pBody : pBorder;
1630         while(h--)
1631         {
1632             int j=0;
1633             for( ; j<x0; j++ )
1634             {
1635                 dst[j] = (src1[j] * pAlphaMask[j] * color_alpha)>>12;
1636             }
1637             for( ;j<x00;j+=4 )
1638             {
1639                 __m64 src = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(src1+j));
1640                 __m64 mask = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pAlphaMask+j));
1641                 __m64 zero = _mm_setzero_si64();
1642                 src = _mm_unpacklo_pi8(src, zero);
1643                 src = _mm_mullo_pi16(src, color_alpha_64);
1644                 mask = _mm_unpacklo_pi8(zero, mask); //important!
1645                 src = _mm_mulhi_pi16(src, mask); //important!
1646                 src = _mm_srli_pi16(src, 12+8-16); //important!
1647                 src = _mm_packs_pu16(src,src);
1648                 *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(src);
1649             }
1650             __m128i zero = _mm_setzero_si128();
1651             for( ;j<x_end00;j+=16)
1652             {
1653                 __m128i src = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src1+j));
1654                 __m128i mask = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pAlphaMask+j));
1655                 __m128i srchi = src;
1656                 __m128i maskhi = mask;
1657                 src = _mm_unpacklo_epi8(src, zero);
1658                 srchi = _mm_unpackhi_epi8(srchi, zero);
1659                 mask = _mm_unpacklo_epi8(zero, mask); //important!
1660                 maskhi = _mm_unpackhi_epi8(zero, maskhi);
1661                 src = _mm_mullo_epi16(src, color_alpha_128);
1662                 srchi = _mm_mullo_epi16(srchi, color_alpha_128);
1663                 src = _mm_mulhi_epu16(src, mask); //important!
1664                 srchi = _mm_mulhi_epu16(srchi, maskhi);
1665                 src = _mm_srli_epi16(src, 12+8-16); //important!
1666                 srchi = _mm_srli_epi16(srchi, 12+8-16);
1667                 src = _mm_packus_epi16(src, srchi);
1668                 _mm_storeu_si128(reinterpret_cast<__m128i*>(dst+j), src);
1669             }
1670             for( ;j<x_end0;j+=4)
1671             {
1672                 __m64 src = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(src1+j));
1673                 __m64 mask = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pAlphaMask+j));
1674                 __m64 zero = _mm_setzero_si64();
1675                 src = _mm_unpacklo_pi8(src, zero);
1676                 src = _mm_mullo_pi16(src, color_alpha_64);
1677                 mask = _mm_unpacklo_pi8(zero, mask); //important!
1678                 src = _mm_mulhi_pi16(src, mask); //important!
1679                 src = _mm_srli_pi16(src, 12+8-16); //important!
1680                 src = _mm_packs_pu16(src,src);
1681                 *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(src);
1682             }
1683             for( ;j<x_end;j++)
1684             {
1685                 dst[j] = (src1[j] * pAlphaMask[j] * color_alpha)>>12;
1686             }
1687             src1 += mOverlayPitch;
1688             pAlphaMask += pitch;
1689             dst += mOverlayPitch;
1690         }
1691     }
1692     else if( pAlphaMask!=NULL && pBody!=NULL && pBorder!=NULL )
1693     {
1694         while(h--)
1695         {
1696             int j=0;
1697             for( ; j<x0; j++ )
1698             {
1699                 int temp = pBorder[j]-pBody[j];
1700                 temp = temp<0 ? 0 : temp;
1701                 dst[j] = (temp * pAlphaMask[j] * color_alpha)>>12;
1702             }
1703             for( ;j<x00;j+=4 )
1704             {
1705                 __m64 border = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBorder+j));
1706                 __m64 body = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBody+j));
1707                 border = _mm_subs_pu8(border, body);
1708                 __m64 mask = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pAlphaMask+j));
1709                 __m64 zero = _mm_setzero_si64();
1710                 border = _mm_unpacklo_pi8(border, zero);
1711                 border = _mm_mullo_pi16(border, color_alpha_64);
1712                 mask = _mm_unpacklo_pi8(zero, mask); //important!
1713                 border = _mm_mulhi_pi16(border, mask); //important!
1714                 border = _mm_srli_pi16(border, 12+8-16); //important!
1715                 border = _mm_packs_pu16(border,border);
1716                 *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(border);
1717             }
1718             __m128i zero = _mm_setzero_si128();
1719             for( ;j<x_end00;j+=16)
1720             {
1721                 __m128i border = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pBorder+j));
1722                 __m128i body = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pBody+j));
1723                 border = _mm_subs_epu8(border,body);
1724
1725                 __m128i mask = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pAlphaMask+j));
1726                 __m128i srchi = border;
1727                 __m128i maskhi = mask;
1728                 border = _mm_unpacklo_epi8(border, zero);
1729                 srchi = _mm_unpackhi_epi8(srchi, zero);
1730                 mask = _mm_unpacklo_epi8(zero, mask); //important!
1731                 maskhi = _mm_unpackhi_epi8(zero, maskhi);
1732                 border = _mm_mullo_epi16(border, color_alpha_128);
1733                 srchi = _mm_mullo_epi16(srchi, color_alpha_128);
1734                 border = _mm_mulhi_epu16(border, mask); //important!
1735                 srchi = _mm_mulhi_epu16(srchi, maskhi);
1736                 border = _mm_srli_epi16(border, 12+8-16); //important!
1737                 srchi = _mm_srli_epi16(srchi, 12+8-16);
1738                 border = _mm_packus_epi16(border, srchi);
1739                 _mm_storeu_si128(reinterpret_cast<__m128i*>(dst+j), border);
1740             }
1741             for( ;j<x_end0;j+=4)
1742             {
1743                 __m64 border = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBorder+j));
1744                 __m64 body = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBody+j));
1745                 border = _mm_subs_pu8(border, body);
1746                 __m64 mask = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pAlphaMask+j));
1747                 __m64 zero = _mm_setzero_si64();
1748                 border = _mm_unpacklo_pi8(border, zero);
1749                 border = _mm_mullo_pi16(border, color_alpha_64);
1750                 mask = _mm_unpacklo_pi8(zero, mask); //important!
1751                 border = _mm_mulhi_pi16(border, mask); //important!
1752                 border = _mm_srli_pi16(border, 12+8-16); //important!
1753                 border = _mm_packs_pu16(border,border);
1754                 *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(border);
1755             }
1756             for( ;j<x_end;j++)
1757             {
1758                 int temp = pBorder[j]-pBody[j];
1759                 temp = temp<0 ? 0 : temp;
1760                 dst[j] = (temp * pAlphaMask[j] * color_alpha)>>12;
1761             }
1762             pBody += mOverlayPitch;
1763             pBorder += mOverlayPitch;
1764             pAlphaMask += pitch;
1765             dst += mOverlayPitch;
1766         }
1767     }
1768     else
1769     {
1770         //should NOT happen!
1771         ASSERT(0);
1772     }
1773 }
1774
1775 void Overlay::FillAlphaMash( byte* outputAlphaMask, bool fBody, bool fBorder, int x, int y, int w, int h, const byte* pAlphaMask, int pitch, DWORD color_alpha)
1776 {
1777     if(!fBorder && fBody && pAlphaMask==NULL)
1778     {
1779         _DoFillAlphaMash(outputAlphaMask, mpOverlayBuffer.body, NULL, x, y, w, h, pAlphaMask, pitch, color_alpha);
1780     }
1781     else if(/*fBorder &&*/ fBody && pAlphaMask==NULL)
1782     {
1783         _DoFillAlphaMash(outputAlphaMask, NULL, mpOverlayBuffer.border, x, y, w, h, pAlphaMask, pitch, color_alpha);
1784     }
1785     else if(!fBody && fBorder /* pAlphaMask==NULL or not*/)
1786     {
1787         _DoFillAlphaMash(outputAlphaMask, mpOverlayBuffer.body, mpOverlayBuffer.border, x, y, w, h, pAlphaMask, pitch, color_alpha);
1788     }
1789     else if(!fBorder && fBody && pAlphaMask!=NULL)
1790     {
1791         _DoFillAlphaMash(outputAlphaMask, mpOverlayBuffer.body, NULL, x, y, w, h, pAlphaMask, pitch, color_alpha);
1792     }
1793     else if(fBorder && fBody && pAlphaMask!=NULL)
1794     {
1795         _DoFillAlphaMash(outputAlphaMask, NULL, mpOverlayBuffer.border, x, y, w, h, pAlphaMask, pitch, color_alpha);
1796     }
1797     else
1798     {
1799         //should NOT happen
1800         ASSERT(0);
1801     }
1802 }
1803
1804 Overlay* Overlay::GetSubpixelVariance(unsigned int xshift, unsigned int yshift)
1805 {
1806     Overlay* overlay = new Overlay();
1807     if(!overlay)
1808     {
1809         return NULL;
1810     }
1811     xshift &= 7;
1812     yshift &= 7;
1813
1814     overlay->mOffsetX = mOffsetX - xshift;
1815     overlay->mOffsetY = mOffsetY - yshift;
1816     overlay->mWidth = mWidth + xshift;
1817     overlay->mHeight = mHeight + yshift;
1818
1819     overlay->mOverlayWidth = ((overlay->mWidth+7)>>3) + 1;
1820     overlay->mOverlayHeight = ((overlay->mHeight + 7)>>3) + 1;
1821     overlay->mOverlayPitch = (overlay->mOverlayWidth+15)&~15;
1822
1823     overlay->mpOverlayBuffer.base = reinterpret_cast<byte*>(xy_malloc(2 * overlay->mOverlayPitch * overlay->mOverlayHeight));
1824     overlay->mpOverlayBuffer.body = overlay->mpOverlayBuffer.base;
1825     overlay->mpOverlayBuffer.border = overlay->mpOverlayBuffer.base + overlay->mOverlayPitch * overlay->mOverlayHeight;
1826
1827     overlay->mfWideOutlineEmpty = mfWideOutlineEmpty;
1828
1829     if(overlay->mOverlayWidth==mOverlayWidth && overlay->mOverlayHeight==mOverlayHeight)
1830         memcpy(overlay->mpOverlayBuffer.base, mpOverlayBuffer.base, 2 * mOverlayPitch * mOverlayHeight);
1831     else
1832     {
1833         memset(overlay->mpOverlayBuffer.base, 0, 2 * overlay->mOverlayPitch * overlay->mOverlayHeight);
1834         byte* dst = overlay->mpOverlayBuffer.body;
1835         const byte* src = mpOverlayBuffer.body;
1836         for (int i=0;i<mOverlayHeight;i++)
1837         {
1838             memcpy(dst, src, mOverlayPitch);
1839             dst += overlay->mOverlayPitch;
1840             src += mOverlayPitch;
1841         }
1842         dst = overlay->mpOverlayBuffer.border;
1843         src = mpOverlayBuffer.border;
1844         for (int i=0;i<mOverlayHeight;i++)
1845         {
1846             memcpy(dst, src, mOverlayPitch);
1847             dst += overlay->mOverlayPitch;
1848             src += mOverlayPitch;
1849         }
1850     }
1851     //not equal
1852     //  Bilinear(overlay->mpOverlayBuffer.base, overlay->mOverlayWidth, 2*overlay->mOverlayHeight, overlay->mOverlayPitch, xshift, yshift);
1853     Bilinear(overlay->mpOverlayBuffer.body, overlay->mOverlayWidth, overlay->mOverlayHeight, overlay->mOverlayPitch, xshift, yshift);
1854     Bilinear(overlay->mpOverlayBuffer.border, overlay->mOverlayWidth, overlay->mOverlayHeight, overlay->mOverlayPitch, xshift, yshift);
1855     return overlay;
1856 }
1857
1858 ///////////////////////////////////////////////////////////////
1859
1860 // PathData
1861
1862 PathData::PathData():mpPathTypes(NULL), mpPathPoints(NULL), mPathPoints(0)
1863 {
1864 }
1865
1866 PathData::PathData( const PathData& src ):mpPathTypes(NULL), mpPathPoints(NULL), mPathPoints(src.mPathPoints)
1867 {
1868     //TODO: deal with the case that src.mPathPoints<0
1869     if(mPathPoints>0)
1870     {
1871         mpPathTypes = static_cast<BYTE*>(malloc(mPathPoints * sizeof(BYTE)));
1872         mpPathPoints = static_cast<POINT*>(malloc(mPathPoints * sizeof(POINT)));
1873     }
1874     if(mPathPoints>0)
1875     {
1876         memcpy(mpPathTypes, src.mpPathTypes, mPathPoints*sizeof(BYTE));
1877         memcpy(mpPathPoints, src.mpPathPoints, mPathPoints*sizeof(POINT));
1878     }
1879 }
1880
1881 const PathData& PathData::operator=( const PathData& src )
1882 {
1883     if(this!=&src)
1884     {
1885         if(mPathPoints!=src.mPathPoints && src.mPathPoints>0)
1886         {
1887             _TrashPath();
1888             mPathPoints = src.mPathPoints;
1889             mpPathTypes = static_cast<BYTE*>(malloc(mPathPoints * sizeof(BYTE)));
1890             mpPathPoints = static_cast<POINT*>(malloc(mPathPoints * sizeof(POINT)));//better than realloc
1891         }
1892         if(src.mPathPoints>0)
1893         {
1894             memcpy(mpPathTypes, src.mpPathTypes, mPathPoints*sizeof(BYTE));
1895             memcpy(mpPathPoints, src.mpPathPoints, mPathPoints*sizeof(POINT));
1896         }
1897     }
1898     return *this;
1899 }
1900
1901 PathData::~PathData()
1902 {
1903     _TrashPath();
1904 }
1905
1906 void PathData::_TrashPath()
1907 {
1908     if (mpPathTypes)
1909     {
1910         free(mpPathTypes);
1911         mpPathTypes = NULL;
1912     }
1913     if (mpPathPoints)
1914     {
1915         free(mpPathPoints);
1916         mpPathPoints = NULL;
1917     }
1918     mPathPoints = 0;
1919 }
1920
1921 bool PathData::BeginPath(HDC hdc)
1922 {
1923     _TrashPath();
1924     return !!::BeginPath(hdc);
1925 }
1926
1927 bool PathData::EndPath(HDC hdc)
1928 {
1929     ::CloseFigure(hdc);
1930     if(::EndPath(hdc))
1931     {
1932         mPathPoints = GetPath(hdc, NULL, NULL, 0);
1933         if(!mPathPoints)
1934             return true;
1935         mpPathTypes = (BYTE*)malloc(sizeof(BYTE) * mPathPoints);
1936         mpPathPoints = (POINT*)malloc(sizeof(POINT) * mPathPoints);
1937         if(mPathPoints == GetPath(hdc, mpPathPoints, mpPathTypes, mPathPoints))
1938             return true;
1939     }
1940     ::AbortPath(hdc);
1941     return false;
1942 }
1943
1944 bool PathData::PartialBeginPath(HDC hdc, bool bClearPath)
1945 {
1946     if(bClearPath)
1947         _TrashPath();
1948     return !!::BeginPath(hdc);
1949 }
1950
1951 bool PathData::PartialEndPath(HDC hdc, long dx, long dy)
1952 {
1953     ::CloseFigure(hdc);
1954     if(::EndPath(hdc))
1955     {
1956         int nPoints;
1957         BYTE* pNewTypes;
1958         POINT* pNewPoints;
1959         nPoints = GetPath(hdc, NULL, NULL, 0);
1960         if(!nPoints)
1961             return true;
1962         pNewTypes = (BYTE*)realloc(mpPathTypes, (mPathPoints + nPoints) * sizeof(BYTE));
1963         pNewPoints = (POINT*)realloc(mpPathPoints, (mPathPoints + nPoints) * sizeof(POINT));
1964         if(pNewTypes)
1965             mpPathTypes = pNewTypes;
1966         if(pNewPoints)
1967             mpPathPoints = pNewPoints;
1968         BYTE* pTypes = new BYTE[nPoints];
1969         POINT* pPoints = new POINT[nPoints];
1970         if(pNewTypes && pNewPoints && nPoints == GetPath(hdc, pPoints, pTypes, nPoints))
1971         {
1972             for(int i = 0; i < nPoints; ++i)
1973             {
1974                 mpPathPoints[mPathPoints + i].x = pPoints[i].x + dx;
1975                 mpPathPoints[mPathPoints + i].y = pPoints[i].y + dy;
1976                 mpPathTypes[mPathPoints + i] = pTypes[i];
1977             }
1978             mPathPoints += nPoints;
1979             delete[] pTypes;
1980             delete[] pPoints;
1981             return true;
1982         }
1983         else
1984             DebugBreak();
1985         delete[] pTypes;
1986         delete[] pPoints;
1987     }
1988     ::AbortPath(hdc);
1989     return false;
1990 }
1991
1992 void PathData::AlignLeftTop(CPoint *left_top, CSize *size)
1993 {
1994     int minx = INT_MAX;
1995     int miny = INT_MAX;
1996     int maxx = INT_MIN;
1997     int maxy = INT_MIN;
1998     for(int i=0; i<mPathPoints; ++i)
1999     {
2000         int ix = mpPathPoints[i].x;
2001         int iy = mpPathPoints[i].y;
2002         if(ix < minx) minx = ix;
2003         if(ix > maxx) maxx = ix;
2004         if(iy < miny) miny = iy;
2005         if(iy > maxy) maxy = iy;
2006     }
2007     if(minx > maxx || miny > maxy)
2008     {
2009         _TrashPath();
2010         *left_top = CPoint(0, 0);
2011         *size = CSize(0, 0);
2012         return;
2013     }
2014     minx = (minx >> 3) & ~7;
2015     miny = (miny >> 3) & ~7;
2016     maxx = (maxx + 7) >> 3;
2017     maxy = (maxy + 7) >> 3;
2018     for(int i=0; i<mPathPoints; ++i)
2019     {
2020         mpPathPoints[i].x -= minx*8;
2021         mpPathPoints[i].y -= miny*8;
2022     }
2023     *left_top = CPoint(minx, miny);
2024     *size = CSize(maxx+1-minx, maxy+1-miny);
2025     return;
2026 }
2027
2028 //////////////////////////////////////////////////////////////////////////
2029
2030 // ScanLineData
2031
2032 ScanLineData::ScanLineData()
2033 {
2034 }
2035
2036 ScanLineData::~ScanLineData()
2037 {
2038 }
2039
2040 void ScanLineData::_ReallocEdgeBuffer(int edges)
2041 {
2042     mEdgeHeapSize = edges;
2043     mpEdgeBuffer = (Edge*)realloc(mpEdgeBuffer, sizeof(Edge)*edges);
2044 }
2045
2046 void ScanLineData::_EvaluateBezier(const PathData& path_data, int ptbase, bool fBSpline)
2047 {
2048     const POINT* pt0 = path_data.mpPathPoints + ptbase;
2049     const POINT* pt1 = path_data.mpPathPoints + ptbase + 1;
2050     const POINT* pt2 = path_data.mpPathPoints + ptbase + 2;
2051     const POINT* pt3 = path_data.mpPathPoints + ptbase + 3;
2052     double x0 = pt0->x;
2053     double x1 = pt1->x;
2054     double x2 = pt2->x;
2055     double x3 = pt3->x;
2056     double y0 = pt0->y;
2057     double y1 = pt1->y;
2058     double y2 = pt2->y;
2059     double y3 = pt3->y;
2060     double cx3, cx2, cx1, cx0, cy3, cy2, cy1, cy0;
2061     if(fBSpline)
2062     {
2063         // 1   [-1 +3 -3 +1]
2064         // - * [+3 -6 +3  0]
2065         // 6   [-3  0 +3  0]
2066         //         [+1 +4 +1  0]
2067         double _1div6 = 1.0/6.0;
2068         cx3 = _1div6*(-  x0+3*x1-3*x2+x3);
2069         cx2 = _1div6*( 3*x0-6*x1+3*x2);
2070         cx1 = _1div6*(-3*x0        +3*x2);
2071         cx0 = _1div6*(   x0+4*x1+1*x2);
2072         cy3 = _1div6*(-  y0+3*y1-3*y2+y3);
2073         cy2 = _1div6*( 3*y0-6*y1+3*y2);
2074         cy1 = _1div6*(-3*y0     +3*y2);
2075         cy0 = _1div6*(   y0+4*y1+1*y2);
2076     }
2077     else // bezier
2078     {
2079         // [-1 +3 -3 +1]
2080         // [+3 -6 +3  0]
2081         // [-3 +3  0  0]
2082         // [+1  0  0  0]
2083         cx3 = -  x0+3*x1-3*x2+x3;
2084         cx2 =  3*x0-6*x1+3*x2;
2085         cx1 = -3*x0+3*x1;
2086         cx0 =    x0;
2087         cy3 = -  y0+3*y1-3*y2+y3;
2088         cy2 =  3*y0-6*y1+3*y2;
2089         cy1 = -3*y0+3*y1;
2090         cy0 =    y0;
2091     }
2092     //
2093     // This equation is from Graphics Gems I.
2094     //
2095     // The idea is that since we're approximating a cubic curve with lines,
2096     // any error we incur is due to the curvature of the line, which we can
2097     // estimate by calculating the maximum acceleration of the curve.  For
2098     // a cubic, the acceleration (second derivative) is a line, meaning that
2099     // the absolute maximum acceleration must occur at either the beginning
2100     // (|c2|) or the end (|c2+c3|).  Our bounds here are a little more
2101     // conservative than that, but that's okay.
2102     //
2103     // If the acceleration of the parametric formula is zero (c2 = c3 = 0),
2104     // that component of the curve is linear and does not incur any error.
2105     // If a=0 for both X and Y, the curve is a line segment and we can
2106     // use a step size of 1.
2107     double maxaccel1 = fabs(2*cy2) + fabs(6*cy3);
2108     double maxaccel2 = fabs(2*cx2) + fabs(6*cx3);
2109     double maxaccel = maxaccel1 > maxaccel2 ? maxaccel1 : maxaccel2;
2110     double h = 1.0;
2111     if(maxaccel > 8.0) h = sqrt(8.0 / maxaccel);
2112     if(!fFirstSet) {firstp.x = (LONG)cx0; firstp.y = (LONG)cy0; lastp = firstp; fFirstSet = true;}
2113     for(double t = 0; t < 1.0; t += h)
2114     {
2115         double x = cx0 + t*(cx1 + t*(cx2 + t*cx3));
2116         double y = cy0 + t*(cy1 + t*(cy2 + t*cy3));
2117         _EvaluateLine(lastp.x, lastp.y, (int)x, (int)y);
2118     }
2119     double x = cx0 + cx1 + cx2 + cx3;
2120     double y = cy0 + cy1 + cy2 + cy3;
2121     _EvaluateLine(lastp.x, lastp.y, (int)x, (int)y);
2122 }
2123
2124 void ScanLineData::_EvaluateLine(const PathData& path_data, int pt1idx, int pt2idx)
2125 {
2126     const POINT* pt1 = path_data.mpPathPoints + pt1idx;
2127     const POINT* pt2 = path_data.mpPathPoints + pt2idx;
2128     _EvaluateLine(pt1->x, pt1->y, pt2->x, pt2->y);
2129 }
2130
2131 void ScanLineData::_EvaluateLine(int x0, int y0, int x1, int y1)
2132 {
2133     if(lastp.x != x0 || lastp.y != y0)
2134     {
2135         _EvaluateLine(lastp.x, lastp.y, x0, y0);
2136     }
2137     if(!fFirstSet) {firstp.x = x0; firstp.y = y0; fFirstSet = true;}
2138     lastp.x = x1;
2139     lastp.y = y1;
2140     if(y1 > y0) // down
2141     {
2142         __int64 xacc = (__int64)x0 << 13;
2143         // prestep y0 down
2144         int dy = y1 - y0;
2145         int y = ((y0 + 3)&~7) + 4;
2146         int iy = y >> 3;
2147         y1 = (y1 - 5) >> 3;
2148         if(iy <= y1)
2149         {
2150             __int64 invslope = (__int64(x1 - x0) << 16) / dy;
2151             while(mEdgeNext + y1 + 1 - iy > mEdgeHeapSize)
2152                 _ReallocEdgeBuffer(mEdgeHeapSize*2);
2153             xacc += (invslope * (y - y0)) >> 3;
2154             while(iy <= y1)
2155             {
2156                 int ix = (int)((xacc + 32768) >> 16);
2157                 mpEdgeBuffer[mEdgeNext].next = mpScanBuffer[iy];
2158                 mpEdgeBuffer[mEdgeNext].posandflag = ix*2 + 1;
2159                 mpScanBuffer[iy] = mEdgeNext++;
2160                 ++iy;
2161                 xacc += invslope;
2162             }
2163         }
2164     }
2165     else if(y1 < y0) // up
2166     {
2167         __int64 xacc = (__int64)x1 << 13;
2168         // prestep y1 down
2169         int dy = y0 - y1;
2170         int y = ((y1 + 3)&~7) + 4;
2171         int iy = y >> 3;
2172         y0 = (y0 - 5) >> 3;
2173         if(iy <= y0)
2174         {
2175             __int64 invslope = (__int64(x0 - x1) << 16) / dy;
2176             while(mEdgeNext + y0 + 1 - iy > mEdgeHeapSize)
2177                 _ReallocEdgeBuffer(mEdgeHeapSize*2);
2178             xacc += (invslope * (y - y1)) >> 3;
2179             while(iy <= y0)
2180             {
2181                 int ix = (int)((xacc + 32768) >> 16);
2182                 mpEdgeBuffer[mEdgeNext].next = mpScanBuffer[iy];
2183                 mpEdgeBuffer[mEdgeNext].posandflag = ix*2;
2184                 mpScanBuffer[iy] = mEdgeNext++;
2185                 ++iy;
2186                 xacc += invslope;
2187             }
2188         }
2189     }
2190 }
2191
2192 bool ScanLineData::ScanConvert(const PathData& path_data, const CSize& size)
2193 {
2194     int lastmoveto = -1;
2195     int i;
2196     // Drop any outlines we may have.
2197     mOutline.clear();
2198     mWideOutline.clear();
2199     mWideBorder = 0;
2200     // Determine bounding box
2201     if(!path_data.mPathPoints)
2202     {
2203         mWidth = mHeight = 0;
2204         return false;
2205     }
2206     mWidth = size.cx;
2207     mHeight = size.cy;
2208     // Initialize edge buffer.  We use edge 0 as a sentinel.
2209     mEdgeNext = 1;
2210     mEdgeHeapSize = 2048;
2211     mpEdgeBuffer = (Edge*)malloc(sizeof(Edge)*mEdgeHeapSize);
2212     // Initialize scanline list.
2213     mpScanBuffer = new unsigned int[mHeight];
2214     memset(mpScanBuffer, 0, mHeight*sizeof(unsigned int));
2215     // Scan convert the outline.  Yuck, Bezier curves....
2216     // Unfortunately, Windows 95/98 GDI has a bad habit of giving us text
2217     // paths with all but the first figure left open, so we can't rely
2218     // on the PT_CLOSEFIGURE flag being used appropriately.
2219     fFirstSet = false;
2220     firstp.x = firstp.y = 0;
2221     lastp.x = lastp.y = 0;
2222     for(i=0; i<path_data.mPathPoints; ++i)
2223     {
2224         BYTE t = path_data.mpPathTypes[i] & ~PT_CLOSEFIGURE;
2225         switch(t)
2226         {
2227         case PT_MOVETO:
2228             if(lastmoveto >= 0 && firstp != lastp)
2229                 _EvaluateLine(lastp.x, lastp.y, firstp.x, firstp.y);
2230             lastmoveto = i;
2231             fFirstSet = false;
2232             lastp = path_data.mpPathPoints[i];
2233             break;
2234         case PT_MOVETONC:
2235             break;
2236         case PT_LINETO:
2237             if(path_data.mPathPoints - (i-1) >= 2) _EvaluateLine(path_data, i-1, i);
2238             break;
2239         case PT_BEZIERTO:
2240             if(path_data.mPathPoints - (i-1) >= 4) _EvaluateBezier(path_data, i-1, false);
2241             i += 2;
2242             break;
2243         case PT_BSPLINETO:
2244             if(path_data.mPathPoints - (i-1) >= 4) _EvaluateBezier(path_data, i-1, true);
2245             i += 2;
2246             break;
2247         case PT_BSPLINEPATCHTO:
2248             if(path_data.mPathPoints - (i-3) >= 4) _EvaluateBezier(path_data, i-3, true);
2249             break;
2250         }
2251     }
2252     if(lastmoveto >= 0 && firstp != lastp)
2253         _EvaluateLine(lastp.x, lastp.y, firstp.x, firstp.y);
2254     // Convert the edges to spans.  We couldn't do this before because some of
2255     // the regions may have winding numbers >+1 and it would have been a pain
2256     // to try to adjust the spans on the fly.  We use one heap to detangle
2257     // a scanline's worth of edges from the singly-linked lists, and another
2258     // to collect the actual scans.
2259     std::vector<int> heap;
2260     mOutline.reserve(mEdgeNext / 2);
2261     __int64 y = 0;
2262     for(y=0; y<mHeight; ++y)
2263     {
2264         int count = 0;
2265         // Detangle scanline into edge heap.
2266         for(unsigned ptr = (unsigned)(mpScanBuffer[y]&0xffffffff); ptr; ptr = mpEdgeBuffer[ptr].next)
2267         {
2268             heap.push_back(mpEdgeBuffer[ptr].posandflag);
2269         }
2270         // Sort edge heap.  Note that we conveniently made the opening edges
2271         // one more than closing edges at the same spot, so we won't have any
2272         // problems with abutting spans.
2273         std::sort(heap.begin(), heap.end()/*begin() + heap.size()*/);
2274         // Process edges and add spans.  Since we only check for a non-zero
2275         // winding number, it doesn't matter which way the outlines go!
2276         std::vector<int>::iterator itX1 = heap.begin();
2277         std::vector<int>::iterator itX2 = heap.end(); // begin() + heap.size();
2278         int x1, x2;
2279         for(; itX1 != itX2; ++itX1)
2280         {
2281             int x = *itX1;
2282             if(!count)
2283                 x1 = (x>>1);
2284             if(x&1)
2285                 ++count;
2286             else
2287                 --count;
2288             if(!count)
2289             {
2290                 x2 = (x>>1);
2291                 if(x2>x1)
2292                     mOutline.push_back(std::pair<__int64,__int64>((y<<32)+x1+0x4000000040000000i64, (y<<32)+x2+0x4000000040000000i64)); // G: damn Avery, this is evil! :)
2293             }
2294         }
2295         heap.clear();
2296     }
2297     // Dump the edge and scan buffers, since we no longer need them.
2298     free(mpEdgeBuffer);
2299     delete [] mpScanBuffer;
2300     // All done!
2301     return true;
2302 }
2303
2304 using namespace std;
2305
2306 void ScanLineData::_OverlapRegion(tSpanBuffer& dst, tSpanBuffer& src, int dx, int dy)
2307 {
2308     tSpanBuffer temp;
2309     temp.reserve(dst.size() + src.size());
2310     dst.swap(temp);
2311     tSpanBuffer::iterator itA = temp.begin();
2312     tSpanBuffer::iterator itAE = temp.end();
2313     tSpanBuffer::iterator itB = src.begin();
2314     tSpanBuffer::iterator itBE = src.end();
2315     // Don't worry -- even if dy<0 this will still work! // G: hehe, the evil twin :)
2316     unsigned __int64 offset1 = (((__int64)dy)<<32) - dx;
2317     unsigned __int64 offset2 = (((__int64)dy)<<32) + dx;
2318     while(itA != itAE && itB != itBE)
2319     {
2320         if((*itB).first + offset1 < (*itA).first)
2321         {
2322             // B span is earlier.  Use it.
2323             unsigned __int64 x1 = (*itB).first + offset1;
2324             unsigned __int64 x2 = (*itB).second + offset2;
2325             ++itB;
2326             // B spans don't overlap, so begin merge loop with A first.
2327             for(;;)
2328             {
2329                 // If we run out of A spans or the A span doesn't overlap,
2330                 // then the next B span can't either (because B spans don't
2331                 // overlap) and we exit.
2332                 if(itA == itAE || (*itA).first > x2)
2333                     break;
2334                 do {x2 = _MAX(x2, (*itA++).second);}
2335                 while(itA != itAE && (*itA).first <= x2);
2336                 // If we run out of B spans or the B span doesn't overlap,
2337                 // then the next A span can't either (because A spans don't
2338                 // overlap) and we exit.
2339                 if(itB == itBE || (*itB).first + offset1 > x2)
2340                     break;
2341                 do {x2 = _MAX(x2, (*itB++).second + offset2);}
2342                 while(itB != itBE && (*itB).first + offset1 <= x2);
2343             }
2344             // Flush span.
2345             dst.push_back(tSpan(x1, x2));
2346         }
2347         else
2348         {
2349             // A span is earlier.  Use it.
2350             unsigned __int64 x1 = (*itA).first;
2351             unsigned __int64 x2 = (*itA).second;
2352             ++itA;
2353             // A spans don't overlap, so begin merge loop with B first.
2354             for(;;)
2355             {
2356                 // If we run out of B spans or the B span doesn't overlap,
2357                 // then the next A span can't either (because A spans don't
2358                 // overlap) and we exit.
2359                 if(itB == itBE || (*itB).first + offset1 > x2)
2360                     break;
2361                 do {x2 = _MAX(x2, (*itB++).second + offset2);}
2362                 while(itB != itBE && (*itB).first + offset1 <= x2);
2363                 // If we run out of A spans or the A span doesn't overlap,
2364                 // then the next B span can't either (because B spans don't
2365                 // overlap) and we exit.
2366                 if(itA == itAE || (*itA).first > x2)
2367                     break;
2368                 do {x2 = _MAX(x2, (*itA++).second);}
2369                 while(itA != itAE && (*itA).first <= x2);
2370             }
2371             // Flush span.
2372             dst.push_back(tSpan(x1, x2));
2373         }
2374     }
2375     // Copy over leftover spans.
2376     while(itA != itAE)
2377         dst.push_back(*itA++);
2378     while(itB != itBE)
2379     {
2380         dst.push_back(tSpan((*itB).first + offset1, (*itB).second + offset2));
2381         ++itB;
2382     }
2383 }
2384
2385 bool ScanLineData::CreateWidenedRegion(int rx, int ry)
2386 {
2387     if(rx < 0) rx = 0;
2388     if(ry < 0) ry = 0;
2389     mWideBorder = max(rx,ry);
2390     if (ry > 0)
2391     {
2392         // Do a half circle.
2393         // _OverlapRegion mirrors this so both halves are done.
2394         for(int y = -ry; y <= ry; ++y)
2395         {
2396             int x = (int)(0.5 + sqrt(float(ry*ry - y*y)) * float(rx)/float(ry));
2397             _OverlapRegion(mWideOutline, mOutline, x, y);
2398         }
2399     }
2400     else if (ry == 0 && rx > 0)
2401     {
2402         // There are artifacts if we don't make at least two overlaps of the line, even at same Y coord
2403         _OverlapRegion(mWideOutline, mOutline, rx, 0);
2404         _OverlapRegion(mWideOutline, mOutline, rx, 0);
2405     }
2406     return true;
2407 }
2408
2409 void ScanLineData::DeleteOutlines()
2410 {
2411     mWideOutline.clear();
2412     mOutline.clear();
2413 }