Fixed(?) _mm_empty
[xy_vsfilter.git] / src / subtitles / Rasterizer.cpp
bloba44ff4f731571fa11f0d8c08b8572abc0d2cd822
1 /*
2 * Copyright (C) 2003-2006 Gabest
3 * http://www.gabest.org
5 * This Program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2, or (at your option)
8 * any later version.
10 * This Program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with GNU Make; see the file COPYING. If not, write to
17 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
18 * http://www.gnu.org/copyleft/gpl.html
22 #include "stdafx.h"
23 #include <string.h>
24 #include <cmath>
25 #include <vector>
26 #include <algorithm>
27 #include "Rasterizer.h"
28 #include "SeparableFilter.h"
29 #include "xy_logger.h"
30 #include <boost/flyweight/key_value.hpp>
31 #include "xy_bitmap.h"
32 #include "xy_widen_regoin.h"
// Min/max helper macros.
// When _MAX is not already defined, _MAX/_MIN and _IMPL_MAX/_IMPL_MIN are mapped onto
// std::max/std::min; otherwise the _IMPL_ variants simply forward to the existing _MAX/_MIN.
34 #ifndef _MAX /* avoid collision with common (nonconforming) macros */
// NOTE(review): the parentheses presumably keep a function-like min/max macro from expanding here.
35 #define _MAX (std::max)
36 #define _MIN (std::min)
37 #define _IMPL_MAX std::max
38 #define _IMPL_MIN std::min
39 #else
40 #define _IMPL_MAX _MAX
41 #define _IMPL_MIN _MIN
42 #endif
// CUINT8 is a read-only UINT8; PCUINT8 is a pointer to a read-only UINT8.
44 typedef const UINT8 CUINT8, *PCUINT8;
//NOTE: signed or unsigned affects the result seriously
// Packs four 8-bit channels into one value laid out as 0xAAYYUUVV.
// The alpha channel is cast to int first, so the result is a signed int.
// Every argument is parenthesized so that expressions such as `c ? a : b`
// can be passed safely.
#define COMBINE_AYUV(a, y, u, v) ((((((((int)(a))<<8)|(y))<<8)|(u))<<8)|(v))

// Inverse of COMBINE_AYUV: stores the four 8-bit channels of `color` through
// the pointers a, y, u and v.
#define SPLIT_AYUV(color, a, y, u, v) do { \
        *(v)=(color)&0xff; \
        *(u)=((color)>>8) &0xff; \
        *(y)=((color)>>16)&0xff;\
        *(a)=((color)>>24)&0xff;\
    } while(0)
56 class GaussianCoefficients
58 public:
59 int g_r;
60 int g_w;
61 int g_w_ex;
62 float *g_f;
64 double sigma;
65 public:
66 GaussianCoefficients(const double sigma)
68 g_r = 0;
69 g_w = 0;
70 g_w_ex = 0;
72 g_f = NULL;
74 this->sigma = 0;
75 init(sigma);
77 GaussianCoefficients(const GaussianCoefficients& priv)
78 :g_r(priv.g_r),g_w(priv.g_w),sigma(priv.sigma),g_f(NULL)
79 ,g_w_ex(priv.g_w_ex)
81 if (this->g_w_ex > 0 && this != &priv) {
82 this->g_f = reinterpret_cast<float*>(xy_malloc(this->g_w_ex * sizeof(float)));
83 ASSERT(this->g_f);
84 memcpy(g_f, priv.g_f, this->g_w_ex * sizeof(g_f[0]));
88 ~GaussianCoefficients()
90 xy_free(g_f); g_f=NULL;
93 private:
94 int init(double sigma)
96 double a = -1 / (sigma * sigma * 2);
97 double exp_a = exp(a);
99 double volume = 0;
101 if (this->sigma == sigma)
102 return 0;
103 else
104 this->sigma = sigma;
106 this->g_w = (int)ceil(sigma*3) | 1;
107 this->g_r = this->g_w / 2;
108 this->g_w_ex = (this->g_w + 3) & ~3;
110 if (this->g_w_ex > 0) {
111 xy_free(this->g_f);
112 this->g_f = reinterpret_cast<float*>(xy_malloc(this->g_w_ex * sizeof(float)));
113 if (this->g_f == NULL) {
114 return -1;
118 if (this->g_w > 0) {
119 volume = 0;
121 double exp_0 = 1.0;
122 double exp_1 = exp_a;
123 double exp_2 = exp_1 * exp_1;
124 volume = exp_0;
125 this->g_f[this->g_r] = exp_0;
126 float* p_left = this->g_f+this->g_r-1;
127 float* p_right= this->g_f+this->g_r+1;
128 for(int i=0; i<this->g_r;++i,p_left--,p_right++)
130 exp_0 *= exp_1;
131 exp_1 *= exp_2;
133 *p_left = exp_0;
134 *p_right = exp_0;
136 volume += exp_0;
137 volume += exp_0;
139 //equivalent:
140 // for (i = 0; i < this->g_w; ++i) {
141 // this->g[i] = (unsigned) ( exp(a * (i - this->g_r) * (i - this->g_r))* volume_factor + .5 );
142 // volume += this->g[i];
143 // }
144 ASSERT(volume>0);
145 for (int i=0;i<this->g_w;i++)
147 this->g_f[i] /= volume;
149 for (int i=this->g_w;i<this->g_w_ex;i++)
151 this->g_f[i] = 0;
154 return 0;
/**
 * Fixed point Gaussian kernel and its multiplication lookup table.
 *
 * g holds g_w integer taps centered on index g_r; gt2 holds, for every pixel
 * value 0..255, the g_w taps multiplied by that value. Both buffers are
 * filled by generate_tables() and released by the destructor.
 */
class ass_synth_priv
{
public:
    static const int VOLUME_BITS = 22;//should not exceed 32-8, and better not exceed 31-8

    // construction / destruction
    ass_synth_priv(const double sigma);
    ass_synth_priv(const ass_synth_priv& priv);
    ~ass_synth_priv();

    // rebuilds g and gt2 for the given sigma; returns 0 on success, -1 on allocation failure
    int generate_tables(double sigma);

    int g_r;        // kernel radius
    int g_w;        // kernel width

    unsigned *g;    // g_w taps
    unsigned *gt2;  // 256 * g_w entries

    double sigma;   // sigma the tables were generated for
};
// GaussianFilter = GaussianCoefficients or ass_synth_priv
/**
 * Key extractor: returns a reference to the sigma member of a filter object.
 */
template<class GaussianFilter>
struct GaussianFilterKey
{
    const double& operator()(const GaussianFilter& filter) const
    {
        const double& key = filter.sigma;
        return key;
    }
};
/**
 * Scratch buffer of `size` unsigned values, allocated by the constructors and
 * released by the destructor.
 */
struct ass_tmp_buf
{
    ass_tmp_buf(size_t size);
    ass_tmp_buf(const ass_tmp_buf& buf);
    ~ass_tmp_buf();

    size_t size;    // number of unsigned elements requested
    unsigned *tmp;  // the buffer itself
};
200 struct ass_tmp_buf_get_size
202 const size_t& operator()(const ass_tmp_buf& buf)const
204 return buf.size;
// File-local constants.
// NOTE(review): neither is referenced in the visible part of this file; confirm they are still used.
208 static const unsigned int maxcolor = 255;
209 static const unsigned base = 256;
211 ass_synth_priv::ass_synth_priv(const double sigma)
213 g_r = 0;
214 g_w = 0;
216 g = NULL;
217 gt2 = NULL;
219 this->sigma = 0;
220 generate_tables(sigma);
223 ass_synth_priv::ass_synth_priv(const ass_synth_priv& priv):g_r(priv.g_r),g_w(priv.g_w),sigma(priv.sigma)
225 if (this->g_w > 0 && this != &priv) {
226 this->g = (unsigned*)realloc(this->g, this->g_w * sizeof(unsigned));
227 this->gt2 = (unsigned*)realloc(this->gt2, 256 * this->g_w * sizeof(unsigned));
228 //if (this->g == null || this->gt2 == null) {
229 // return -1;
231 memcpy(g, priv.g, this->g_w * sizeof(unsigned));
232 memcpy(gt2, priv.gt2, 256 * this->g_w * sizeof(unsigned));
236 ass_synth_priv::~ass_synth_priv()
238 free(g); g=NULL;
239 free(gt2); gt2=NULL;
// Builds the integer Gaussian kernel g (g_w taps centered on g_r) and the lookup table gt2
// (gt2[g_w * i + mx] == g[mx] * i for i in 0..255) for the given sigma.
// The kernel is scaled so that the sum of its taps lands in
// [TARGET_VOLUME, TARGET_VOLUME + MAX_VOLUME_ERROR) when the search below succeeds.
// Returns 0 on success (or when sigma is unchanged) and -1 when an allocation fails.
242 int ass_synth_priv::generate_tables(double sigma)
244 const int TARGET_VOLUME = 1<<VOLUME_BITS;
245 const int MAX_VOLUME_ERROR = VOLUME_BITS>=22 ? 16 : 1;
247 double a = -1 / (sigma * sigma * 2);
248 double exp_a = exp(a);
250 double volume_factor = 0;
251 double volume_start = 0, volume_end = 0;
252 unsigned volume;
// Tables for this sigma already exist: nothing to do.
254 if (this->sigma == sigma)
255 return 0;
256 else
257 this->sigma = sigma;
// Kernel width is ceil(3*sigma) forced to be odd; the radius is half of it.
259 this->g_w = (int)ceil(sigma*3) | 1;
260 this->g_r = this->g_w / 2;
262 if (this->g_w > 0) {
// NOTE(review): realloc results overwrite the old pointers, so a failed realloc loses the previous block.
263 this->g = (unsigned*)realloc(this->g, this->g_w * sizeof(unsigned));
264 this->gt2 = (unsigned*)realloc(this->gt2, 256 * this->g_w * sizeof(unsigned));
265 if (this->g == NULL || this->gt2 == NULL) {
266 return -1;
270 if (this->g_w > 0) {
271 volume_start = 0;
// Sum of the unscaled Gaussian over the kernel. exp(a*k*k) is produced incrementally:
// exp_0 is multiplied by exp_1 on each step and exp_1 by exp_2 (= exp(2a)).
273 double exp_0 = 1.0;
274 double exp_1 = exp_a;
275 double exp_2 = exp_1 * exp_1;
276 volume_start += exp_0;
277 for(int i=0;i<this->g_r;++i)
279 exp_0 *= exp_1;
280 exp_1 *= exp_2;
281 volume_start += exp_0;
282 volume_start += exp_0;
284 //equivalent:
285 // for (i = 0; i < this->g_w; ++i) {
286 // volume_start += exp(a * (i - this->g_r) * (i - this->g_r));
287 // }
// Initial bracket for the scale factor: factors that would give a real-valued sum of
// TARGET_VOLUME-g_w and TARGET_VOLUME+g_w respectively.
289 volume_end = (TARGET_VOLUME+g_w)/volume_start;
290 volume_start = (TARGET_VOLUME-g_w)/volume_start;
292 volume = 0;
// Bisection on the scale factor: fill g with rounded taps for the midpoint factor,
// then narrow the bracket depending on the resulting integer sum.
293 while( volume_start+0.000001<volume_end )
295 volume_factor = (volume_start+volume_end)*0.5;
296 volume = 0;
298 exp_0 = volume_factor;
299 exp_1 = exp_a;
300 exp_2 = exp_1 * exp_1;
// Center tap first, then both sides symmetrically.
302 volume = static_cast<int>(exp_0+.5);
303 this->g[this->g_r] = volume;
305 unsigned* p_left = this->g+this->g_r-1;
306 unsigned* p_right= this->g+this->g_r+1;
307 for(int i=0; i<this->g_r;++i,p_left--,p_right++)
309 exp_0 *= exp_1;
310 exp_1 *= exp_2;
311 *p_left = static_cast<int>(exp_0+.5);
312 *p_right = *p_left;
313 volume += (*p_left<<1);
315 //equivalent:
316 // for (i = 0; i < this->g_w; ++i) {
317 // this->g[i] = (unsigned) ( exp(a * (i - this->g_r) * (i - this->g_r))* volume_factor + .5 );
318 // volume += this->g[i];
319 // }
321 // volume doesn't have to be equal to TARGET_VOLUME,
322 // even if volume=TARGET_VOLUME+MAX_VOLUME_ERROR,
323 // max error introduced in the later blur operation,
324 // which is (dot_product(g_w, pixel))/TARGET_VOLUME with pixel<256,
325 // would not exceed (MAX_VOLUME_ERROR*256)/TARGET_VOLUME,
326 // as long as MAX_VOLUME_ERROR/TARGET_VOLUME is small enough, the error introduced is kept in a safe range
328 // NOTE: when it comes to rounding, no matter how small the error is,
329 // it may result in a different rounding output
330 if( volume>=TARGET_VOLUME && volume< (TARGET_VOLUME+MAX_VOLUME_ERROR) )
331 break;
332 else if(volume < TARGET_VOLUME)
334 volume_start = volume_factor;
336 else if(volume >= TARGET_VOLUME+MAX_VOLUME_ERROR)
338 volume_end = volume_factor;
// Fallback when the search left volume at 0: fill the kernel once using the upper bracket bound.
341 if(volume==0)
343 volume_factor = volume_end;
345 exp_0 = volume_factor;
346 exp_1 = exp_a;
347 exp_2 = exp_1 * exp_1;
349 volume = static_cast<int>(exp_0+.5);
350 this->g[this->g_r] = volume;
352 unsigned* p_left = this->g+this->g_r-1;
353 unsigned* p_right= this->g+this->g_r+1;
354 for(int i=0; i<this->g_r;++i,p_left--,p_right++)
356 exp_0 *= exp_1;
357 exp_1 *= exp_2;
358 *p_left = static_cast<int>(exp_0+.5);
359 *p_right = *p_left;
360 volume += (*p_left<<1);
362 //equivalent:
363 // for (i = 0; i < this->g_w; ++i) {
364 // this->g[i] = (unsigned) ( exp(a * (i - this->g_r) * (i - this->g_r))* volume_factor + .5 );
365 // volume += this->g[i];
366 // }
// Multiplication table, built by repeated addition: row i holds every tap times i.
369 // gauss table:
370 for (int mx = 0; mx < this->g_w; mx++) {
371 int last_mul = 0;
372 unsigned *p_gt2 = this->gt2 + mx;
373 *p_gt2 = 0;
374 for (int i = 1; i < 256; i++) {
375 last_mul = last_mul+this->g[mx];
376 p_gt2 += this->g_w;
377 *p_gt2 = last_mul;
378 //equivalent:
379 // this->gt2[this->g_w * i+ mx] = this->g[mx] * i;
383 return 0;
386 ass_tmp_buf::ass_tmp_buf(size_t size)
388 tmp = (unsigned *)malloc(size * sizeof(unsigned));
389 this->size = size;
392 ass_tmp_buf::ass_tmp_buf(const ass_tmp_buf& buf)
393 :size(buf.size)
395 tmp = (unsigned *)malloc(size * sizeof(unsigned));
398 ass_tmp_buf::~ass_tmp_buf()
400 free(tmp);
404 * \brief Gaussian blur. A fast pure C implementation from libass.
// Separable fixed point Gaussian blur, done in place on `buffer`.
//   buffer             : width x height bytes, rows `stride` bytes apart; overwritten with the result
//   tmp2               : scratch area, used as `height` rows of (width + 1) unsigned values
//   g_t_x / g_t_y      : lookup tables indexed as table[value * g_width + tap]
//   g_r_* / g_width_*  : kernel radius and width for each direction
// Pass 1 scatters every non-zero source pixel horizontally into tmp2, pass 2 scatters every
// non-zero tmp2 value vertically, pass 3 shifts the sums down by VOLUME_BITS back into buffer.
406 static void ass_gauss_blur(unsigned char *buffer, unsigned *tmp2,
407 int width, int height, int stride,
408 const unsigned *g_t_x, int g_r_x, int g_width_x,
409 const unsigned *g_t_y, int g_r_y, int g_width_y)
412 int x, y;
// ---- Pass 1: horizontal. t points one element into each tmp2 row. ----
414 unsigned char *s = buffer;
415 unsigned *t = tmp2 + 1;
416 for (y = 0; y < height; y++) {
// Clear the whole (width + 1) element row, including the element in front of t.
417 memset(t - 1, 0, (width + 1) * sizeof(*t));
// First pixel of the row: taps are accumulated from the right (running sum), so the part of
// the kernel that would fall left of the row is folded back into the row.
418 x = 0;
419 if(x < g_r_x)//in case that r < 0
421 const int src = s[x];
422 if (src) {
423 register unsigned *dstp = t + x - g_r_x;
424 int mx;
425 const unsigned *m3 = g_t_x + src * g_width_x;
426 unsigned sum = 0;
427 for (mx = g_width_x-1; mx >= g_r_x - x ; mx--) {
428 sum += m3[mx];
429 dstp[mx] += sum;
// Remaining pixels closer than the radius to the left edge: skip the taps that would land before t[0].
434 for (x = 1; x < g_r_x; x++) {
435 const int src = s[x];
436 if (src) {
437 register unsigned *dstp = t + x - g_r_x;
438 int mx;
439 const unsigned *m3 = g_t_x + src * g_width_x;
440 for (mx = g_r_x - x; mx < g_width_x; mx++) {
441 dstp[mx] += m3[mx];
// Interior pixels: the full kernel fits inside the row.
446 for (; x < width - g_r_x; x++) {
447 const int src = s[x];
448 if (src) {
449 register unsigned *dstp = t + x - g_r_x;
450 int mx;
451 const unsigned *m3 = g_t_x + src * g_width_x;
452 for (mx = 0; mx < g_width_x; mx++) {
453 dstp[mx] += m3[mx];
// Pixels closer than the radius to the right edge (except the last one): only the first x2 taps are applied.
458 for (; x < width-1; x++) {
459 const int src = s[x];
460 if (src) {
461 register unsigned *dstp = t + x - g_r_x;
462 int mx;
463 const int x2 = g_r_x + width - x;
464 const unsigned *m3 = g_t_x + src * g_width_x;
465 for (mx = 0; mx < x2; mx++) {
466 dstp[mx] += m3[mx];
// Last pixel of the row: taps are accumulated as a running sum from the left.
470 if(x==width-1) //important: x==width-1 failed, if r==0
472 const int src = s[x];
473 if (src) {
474 register unsigned *dstp = t + x - g_r_x;
475 int mx;
476 const int x2 = g_r_x + width - x;
477 const unsigned *m3 = g_t_x + src * g_width_x;
478 unsigned sum = 0;
479 for (mx = 0; mx < x2; mx++) {
480 sum += m3[mx];
481 dstp[mx] += sum;
486 s += stride;
487 t += width + 1;
// ---- Pass 2: vertical, one column at a time. ----
// Each input cell is read at row offset +1 (srcp), rounded down to a table index (src2) and
// replaced by the rounding constant 1<<(VOLUME_BITS-1); its contribution is scattered into the
// cells one element to the left (srcp - 1), stepping one row (width + 1 elements) per tap.
490 t = tmp2;
491 for (x = 0; x < width; x++) {
// First row: running sum, walking upwards from the starting row.
492 y = 0;
493 if(y < g_r_y)//in case that r<0
495 unsigned *srcp = t + y * (width + 1) + 1;
496 int src = *srcp;
497 if (src) {
498 register unsigned *dstp = srcp - 1 + (g_width_y -g_r_y +y)*(width + 1);
499 const int src2 = (src + (1<<(ass_synth_priv::VOLUME_BITS-1))) >> ass_synth_priv::VOLUME_BITS;
500 const unsigned *m3 = g_t_y + src2 * g_width_y;
501 unsigned sum = 0;
502 int mx;
503 *srcp = (1<<(ass_synth_priv::VOLUME_BITS-1));
504 for (mx = g_width_y-1; mx >=g_r_y - y ; mx--) {
505 sum += m3[mx];
506 *dstp += sum;
507 dstp -= width + 1;
// Remaining rows closer than the radius to the top edge.
// NOTE(review): unlike the other branches, dstp here starts one row below the source row
// instead of g_r_y rows above it -- confirm this is intended.
511 for (y = 1; y < g_r_y; y++) {
512 unsigned *srcp = t + y * (width + 1) + 1;
513 int src = *srcp;
514 if (src) {
515 register unsigned *dstp = srcp - 1 + width + 1;
516 const int src2 = (src + (1<<(ass_synth_priv::VOLUME_BITS-1))) >> ass_synth_priv::VOLUME_BITS;
517 const unsigned *m3 = g_t_y + src2 * g_width_y;
519 int mx;
520 *srcp = (1<<(ass_synth_priv::VOLUME_BITS-1));
521 for (mx = g_r_y - y; mx < g_width_y; mx++) {
522 *dstp += m3[mx];
523 dstp += width + 1;
// Interior rows: the full kernel is applied, starting g_r_y rows above the source row.
527 for (; y < height - g_r_y; y++) {
528 unsigned *srcp = t + y * (width + 1) + 1;
529 int src = *srcp;
530 if (src) {
531 register unsigned *dstp = srcp - 1 - g_r_y * (width + 1);
532 const int src2 = (src + (1<<(ass_synth_priv::VOLUME_BITS-1))) >> ass_synth_priv::VOLUME_BITS;
533 const unsigned *m3 = g_t_y + src2 * g_width_y;
535 int mx;
536 *srcp = (1<<(ass_synth_priv::VOLUME_BITS-1));
537 for (mx = 0; mx < g_width_y; mx++) {
538 *dstp += m3[mx];
539 dstp += width + 1;
// Rows closer than the radius to the bottom edge (except the last one): only the first y2 taps are applied.
543 for (; y < height-1; y++) {
544 unsigned *srcp = t + y * (width + 1) + 1;
545 int src = *srcp;
546 if (src) {
547 const int y2 = g_r_y + height - y;
548 register unsigned *dstp = srcp - 1 - g_r_y * (width + 1);
549 const int src2 = (src + (1<<(ass_synth_priv::VOLUME_BITS-1))) >> ass_synth_priv::VOLUME_BITS;
550 const unsigned *m3 = g_t_y + src2 * g_width_y;
552 int mx;
553 *srcp = (1<<(ass_synth_priv::VOLUME_BITS-1));
554 for (mx = 0; mx < y2; mx++) {
555 *dstp += m3[mx];
556 dstp += width + 1;
// Last row: taps are accumulated as a running sum from the top.
560 if(y == height - 1)//important: y == height - 1 failed if r==0
562 unsigned *srcp = t + y * (width + 1) + 1;
563 int src = *srcp;
564 if (src) {
565 const int y2 = g_r_y + height - y;
566 register unsigned *dstp = srcp - 1 - g_r_y * (width + 1);
567 const int src2 = (src + (1<<(ass_synth_priv::VOLUME_BITS-1))) >> ass_synth_priv::VOLUME_BITS;
568 const unsigned *m3 = g_t_y + src2 * g_width_y;
569 unsigned sum = 0;
570 int mx;
571 *srcp = (1<<(ass_synth_priv::VOLUME_BITS-1));
572 for (mx = 0; mx < y2; mx++) {
573 sum += m3[mx];
574 *dstp += sum;
575 dstp += width + 1;
// Move on to the next column.
579 t++;
// ---- Pass 3: scale the accumulated sums back to bytes and store them into buffer. ----
582 t = tmp2;
583 s = buffer;
584 for (y = 0; y < height; y++) {
585 for (x = 0; x < width; x++) {
586 s[x] = t[x] >> ass_synth_priv::VOLUME_BITS;
588 s += stride;
589 t += width + 1;
// Floating point Gaussian blur, defined elsewhere. In this file it is called with the
// coefficient buffer, radius and padded width of a GaussianCoefficients per direction.
// NOTE(review): the exact contract of dst/src sizes is not visible here -- see the definition.
593 void xy_gaussian_blur(PUINT8 dst, int dst_stride,
594 PCUINT8 src, int width, int height, int stride,
595 const float *gt_x, int r_x, int gt_ex_width_x,
596 const float *gt_y, int r_y, int gt_ex_width_y);
// Fractional-strength "be" blur, defined elsewhere. In this file it is called with the
// fractional remainder of the scaled blur strength for both pass_x and pass_y.
598 void xy_be_blur(PUINT8 src, int width, int height, int stride, float pass_x, float pass_y);
601 * \brief blur with [[1,2,1], [2,4,2], [1,2,1]] kernel.
603 static void be_blur(unsigned char *buf, unsigned *tmp_base, int w, int h, int stride)
605 WORD *col_pix_buf_base = reinterpret_cast<WORD*>(xy_malloc(w*sizeof(WORD)));
606 WORD *col_sum_buf_base = reinterpret_cast<WORD*>(xy_malloc(w*sizeof(WORD)));
607 if(!col_sum_buf_base || !col_pix_buf_base)
609 //ToDo: error handling
610 return;
612 memset(col_pix_buf_base, 0, w*sizeof(WORD));
613 memset(col_sum_buf_base, 0, w*sizeof(WORD));
614 WORD *col_pix_buf = col_pix_buf_base-2;//for aligment;
615 WORD *col_sum_buf = col_sum_buf_base-2;//for aligment;
617 int y = 0;
618 unsigned char *src=buf+y*stride;
620 int x = 2;
621 int old_pix = src[x-1];
622 int old_sum = old_pix + src[x-2];
623 for ( ; x < w; x++) {
624 int temp1 = src[x];
625 int temp2 = old_pix + temp1;
626 old_pix = temp1;
627 temp1 = old_sum + temp2;
628 old_sum = temp2;
629 col_pix_buf[x] = temp1;
633 int y = 1;
634 unsigned char *src=buf+y*stride;
637 int x = 2;
638 int old_pix = src[x-1];
639 int old_sum = old_pix + src[x-2];
640 for ( ; x < w; x++) {
641 int temp1 = src[x];
642 int temp2 = old_pix + temp1;
643 old_pix = temp1;
644 temp1 = old_sum + temp2;
645 old_sum = temp2;
647 temp2 = col_pix_buf[x] + temp1;
648 col_pix_buf[x] = temp1;
649 //dst[x-1] = (col_sum_buf[x] + temp2) >> 4;
650 col_sum_buf[x] = temp2;
654 //__m128i round = _mm_set1_epi16(8);
655 for (int y = 2; y < h; y++) {
656 unsigned char *src=buf+y*stride;
657 unsigned char *dst=buf+(y-1)*stride;
660 int x = 2;
661 __m128i old_pix_128 = _mm_cvtsi32_si128(src[1]);
662 __m128i old_sum_128 = _mm_cvtsi32_si128(src[0]+src[1]);
663 for ( ; x < ((w-2)&(~7)); x+=8) {
664 __m128i new_pix = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src+x));
665 new_pix = _mm_unpacklo_epi8(new_pix, _mm_setzero_si128());
666 __m128i temp = _mm_slli_si128(new_pix,2);
667 temp = _mm_add_epi16(temp, old_pix_128);
668 temp = _mm_add_epi16(temp, new_pix);
669 old_pix_128 = _mm_srli_si128(new_pix,14);
671 new_pix = _mm_slli_si128(temp,2);
672 new_pix = _mm_add_epi16(new_pix, old_sum_128);
673 new_pix = _mm_add_epi16(new_pix, temp);
674 old_sum_128 = _mm_srli_si128(temp, 14);
676 __m128i old_col_pix = _mm_loadu_si128( reinterpret_cast<const __m128i*>(col_pix_buf+x) );
677 __m128i old_col_sum = _mm_loadu_si128( reinterpret_cast<const __m128i*>(col_sum_buf+x) );
678 _mm_storeu_si128( reinterpret_cast<__m128i*>(col_pix_buf+x), new_pix );
679 temp = _mm_add_epi16(new_pix, old_col_pix);
680 _mm_storeu_si128( reinterpret_cast<__m128i*>(col_sum_buf+x), temp );
682 old_col_sum = _mm_add_epi16(old_col_sum, temp);
683 //old_col_sum = _mm_add_epi16(old_col_sum, round);
684 old_col_sum = _mm_srli_epi16(old_col_sum, 4);
685 old_col_sum = _mm_packus_epi16(old_col_sum, old_col_sum);
686 _mm_storel_epi64( reinterpret_cast<__m128i*>(dst+x-1), old_col_sum );
688 int old_pix = src[x-1];
689 int old_sum = old_pix + src[x-2];
690 for ( ; x < w; x++) {
691 int temp1 = src[x];
692 int temp2 = old_pix + temp1;
693 old_pix = temp1;
694 temp1 = old_sum + temp2;
695 old_sum = temp2;
697 temp2 = col_pix_buf[x] + temp1;
698 col_pix_buf[x] = temp1;
699 dst[x-1] = (col_sum_buf[x] + temp2) >> 4;
700 col_sum_buf[x] = temp2;
704 xy_free(col_sum_buf_base);
705 xy_free(col_pix_buf_base);
709 * see @be_blur
711 static void be_blur_c(unsigned char *buf, unsigned *tmp_base, int w, int h, int stride)
713 WORD *col_pix_buf_base = reinterpret_cast<WORD*>(xy_malloc(w*sizeof(WORD)));
714 WORD *col_sum_buf_base = reinterpret_cast<WORD*>(xy_malloc(w*sizeof(WORD)));
715 if(!col_sum_buf_base || !col_pix_buf_base)
717 //ToDo: error handling
718 return;
720 memset(col_pix_buf_base, 0, w*sizeof(WORD));
721 memset(col_sum_buf_base, 0, w*sizeof(WORD));
722 WORD *col_pix_buf = col_pix_buf_base-2;//for aligment;
723 WORD *col_sum_buf = col_sum_buf_base-2;//for aligment;
725 int y = 0;
726 unsigned char *src=buf+y*stride;
728 int x = 2;
729 int old_pix = src[x-1];
730 int old_sum = old_pix + src[x-2];
731 for ( ; x < w; x++) {
732 int temp1 = src[x];
733 int temp2 = old_pix + temp1;
734 old_pix = temp1;
735 temp1 = old_sum + temp2;
736 old_sum = temp2;
737 col_pix_buf[x] = temp1;
741 int y = 1;
742 unsigned char *src=buf+y*stride;
745 int x = 2;
746 int old_pix = src[x-1];
747 int old_sum = old_pix + src[x-2];
748 for ( ; x < w; x++) {
749 int temp1 = src[x];
750 int temp2 = old_pix + temp1;
751 old_pix = temp1;
752 temp1 = old_sum + temp2;
753 old_sum = temp2;
755 temp2 = col_pix_buf[x] + temp1;
756 col_pix_buf[x] = temp1;
757 //dst[x-1] = (col_sum_buf[x] + temp2) >> 4;
758 col_sum_buf[x] = temp2;
762 for (int y = 2; y < h; y++) {
763 unsigned char *src=buf+y*stride;
764 unsigned char *dst=buf+(y-1)*stride;
766 int x = 2;
767 int old_pix = src[x-1];
768 int old_sum = old_pix + src[x-2];
769 for ( ; x < w; x++) {
770 int temp1 = src[x];
771 int temp2 = old_pix + temp1;
772 old_pix = temp1;
773 temp1 = old_sum + temp2;
774 old_sum = temp2;
776 temp2 = col_pix_buf[x] + temp1;
777 col_pix_buf[x] = temp1;
778 dst[x-1] = (col_sum_buf[x] + temp2) >> 4;
779 col_sum_buf[x] = temp2;
783 xy_free(col_sum_buf_base);
784 xy_free(col_pix_buf_base);
787 static void Bilinear(unsigned char *buf, int w, int h, int stride, int x_factor, int y_factor)
789 WORD *col_pix_buf_base = reinterpret_cast<WORD*>(xy_malloc(w*sizeof(WORD)));
790 if(!col_pix_buf_base)
792 //ToDo: error handling
793 return;
795 memset(col_pix_buf_base, 0, w*sizeof(WORD));
797 for (int y = 0; y < h; y++){
798 unsigned char *src=buf+y*stride;
800 WORD *col_pix_buf = col_pix_buf_base;
801 int last=0;
802 for(int x = 0; x < w; x++)
804 int temp1 = src[x];
805 int temp2 = temp1*x_factor;
806 temp1 <<= 3;
807 temp1 -= temp2;
808 temp1 += last;
809 last = temp2;
811 temp2 = temp1*y_factor;
812 temp1 <<= 3;
813 temp1 -= temp2;
814 temp1 += col_pix_buf[x];
815 src[x] = ((temp1+32)>>6);
816 col_pix_buf[x] = temp2;
819 xy_free(col_pix_buf_base);
// Renders the scan line spans of scan_line_data2 into the coverage planes of `overlay`.
// Span coordinates are in 1/8 pixel units: each span adds up to 8 per covered pixel to the
// target plane. The plain outline goes to the body plane, the wide outline (if any) to the
// border plane.
// @param xsub, ysub  sub-pixel offset; only the low 3 bits are used
// @return false when overlay is null or a plane cannot be allocated; true otherwise
//         (including the case of empty scan line data, where the overlay is only cleaned up)
822 bool Rasterizer::Rasterize(const ScanLineData2& scan_line_data2, int xsub, int ysub, SharedPtrOverlay overlay)
824 using namespace ::boost::flyweights;
826 if(!overlay)
828 return false;
830 overlay->CleanUp();
831 const ScanLineData& scan_line_data = *scan_line_data2.m_scan_line_data;
832 if(!scan_line_data.mWidth || !scan_line_data.mHeight)
834 return true;
836 xsub &= 7;
837 ysub &= 7;
838 //xsub = ysub = 0;
839 int width = scan_line_data.mWidth + xsub;
840 int height = scan_line_data.mHeight + ysub;
841 overlay->mfWideOutlineEmpty = scan_line_data2.mWideOutline.empty();
// With a wide outline, grow the area on every side by the border width rounded up to a multiple of 8.
842 if(!overlay->mfWideOutlineEmpty)
844 int wide_border = (scan_line_data2.mWideBorder+7)&~7;
846 width += 2*wide_border ;
847 height += 2*wide_border ;
848 xsub += wide_border ;
849 ysub += wide_border ;
851 overlay->mOffsetX = scan_line_data2.mPathOffsetX - xsub;
852 overlay->mOffsetY = scan_line_data2.mPathOffsetY - ysub;
854 overlay->mWidth = width;
855 overlay->mHeight = height;
// Plane size in whole pixels (1/8 units rounded up, plus one); pitch is rounded up to a multiple of 16.
856 overlay->mOverlayWidth = ((width+7)>>3) + 1;
857 overlay->mOverlayHeight = ((height+7)>>3) + 1;
858 overlay->mOverlayPitch = (overlay->mOverlayWidth+15)&~15;
860 BYTE* body = reinterpret_cast<BYTE*>(xy_malloc(overlay->mOverlayPitch * overlay->mOverlayHeight));
861 if( body==NULL )
863 return false;
865 overlay->mBody.reset(body, xy_free);
866 memset(body, 0, overlay->mOverlayPitch * overlay->mOverlayHeight);
867 BYTE* border = NULL;
868 if (!overlay->mfWideOutlineEmpty)
870 border = reinterpret_cast<BYTE*>(xy_malloc(overlay->mOverlayPitch * overlay->mOverlayHeight));
871 if (border==NULL)
873 return false;
875 overlay->mBorder.reset(border, xy_free);
876 memset(border, 0, overlay->mOverlayPitch * overlay->mOverlayHeight);
879 // Are we doing a border?
// Index 0 is the plain outline (drawn into body), index 1 the wide outline (drawn into border).
880 const tSpanBuffer* pOutline[2] = {&(scan_line_data.mOutline), &(scan_line_data2.mWideOutline)};
881 for(int i = countof(pOutline)-1; i >= 0; i--)
883 tSpanBuffer::const_iterator it = pOutline[i]->begin();
884 tSpanBuffer::const_iterator itEnd = pOutline[i]->end();
885 byte* plan_selected = i==0 ? body : border;
886 int pitch = overlay->mOverlayPitch;
887 for(; it!=itEnd; ++it)
// Each span packs y in the high 32 bits and x in the low 32 bits, both biased by 0x40000000.
889 int y = (int)(((*it).first >> 32) - 0x40000000 + ysub);
890 int x1 = (int)(((*it).first & 0xffffffff) - 0x40000000 + xsub);
891 int x2 = (int)(((*it).second & 0xffffffff) - 0x40000000 + xsub);
892 if(x2 > x1)
// first/last are the pixel columns that contain the span's start and its last covered sub-pixel.
894 int first = x1>>3;
895 int last = (x2-1)>>3;
896 byte* dst = plan_selected + (pitch*(y>>3) + first);
// Span inside a single pixel: add its length. Otherwise: partial first pixel,
// 8 for every fully covered pixel in between, partial last pixel.
897 if(first == last)
898 *dst += x2-x1;
899 else
901 *dst += ((first+1)<<3) - x1;
902 dst += 1;
903 while(++first < last)
905 *dst += 0x08;
906 dst += 1;
908 *dst += x2 - (last<<3);
914 return true;
917 const float Rasterizer::GAUSSIAN_BLUR_THREHOLD = 0.333333f;
919 bool Rasterizer::IsItReallyBlur( float be_strength, double gaussian_blur_strength )
921 if (be_strength<=0 && gaussian_blur_strength<=GAUSSIAN_BLUR_THREHOLD)
923 return false;
925 return true;
// Fixed point blur path: copies input_overlay into an enlarged output_overlay and blurs the
// border plane (or the body plane when there is no wide outline) with ass_gauss_blur and/or
// the be_blur family.
// target_scale_x / target_scale_y scale the blur strengths per direction.
928 // @return: true if a blur operation was actually performed; otherwise false and the output is left unset.
929 // To Do: rewrite it or delete it
930 bool Rasterizer::OldFixedPointBlur(const Overlay& input_overlay, float be_strength, double gaussian_blur_strength,
931 double target_scale_x, double target_scale_y, SharedPtrOverlay output_overlay)
933 using namespace ::boost::flyweights;
935 ASSERT(IsItReallyBlur(be_strength, gaussian_blur_strength));
936 if(!output_overlay)
938 return false;
940 output_overlay->CleanUp();
// Start from the input geometry; it is enlarged below by the blur margin.
942 output_overlay->mOffsetX = input_overlay.mOffsetX;
943 output_overlay->mOffsetY = input_overlay.mOffsetY;
944 output_overlay->mWidth = input_overlay.mWidth;
945 output_overlay->mHeight = input_overlay.mHeight;
946 output_overlay->mOverlayWidth = input_overlay.mOverlayWidth;
947 output_overlay->mOverlayHeight = input_overlay.mOverlayHeight;
948 output_overlay->mfWideOutlineEmpty = input_overlay.mfWideOutlineEmpty;
950 double gaussian_blur_strength_x = gaussian_blur_strength*target_scale_x;
951 double gaussian_blur_strength_y = gaussian_blur_strength*target_scale_y;
// Same radius formula as ass_synth_priv::generate_tables: (ceil(3*sigma) | 1) / 2.
953 int gaussian_blur_radius_x = (static_cast<int>( ceil(gaussian_blur_strength_x*3) ) | 1)/2;//fix me: rounding err?
954 int gaussian_blur_radius_y = (static_cast<int>( ceil(gaussian_blur_strength_y*3) ) | 1)/2;//fix me: rounding err?
955 if( gaussian_blur_radius_x < 1 && gaussian_blur_strength>GAUSSIAN_BLUR_THREHOLD )
956 gaussian_blur_radius_x = 1;//make sure that it really does a blur
957 if( gaussian_blur_radius_y < 1 && gaussian_blur_strength>GAUSSIAN_BLUR_THREHOLD )
958 gaussian_blur_radius_y = 1;//make sure that it really does a blur
// bluradjust_* is the margin (in 1/8 pixel units) added on each side of the overlay.
960 int bluradjust_x = 0, bluradjust_y = 0;
961 if ( IsItReallyBlur(be_strength, gaussian_blur_strength) )
963 if (gaussian_blur_strength > 0)
965 bluradjust_x += gaussian_blur_radius_x * 8;
966 bluradjust_y += gaussian_blur_radius_y * 8;
968 if (be_strength)
970 int be_adjust_x = static_cast<int>( target_scale_x*std::sqrt(be_strength*0.25f)+0.5 );//fix me: rounding err?
971 be_adjust_x *= 8;
972 int be_adjust_y = static_cast<int>(target_scale_y*std::sqrt(be_strength*0.25f)+0.5);//fix me: rounding err?
973 be_adjust_y *= 8;
975 bluradjust_x += be_adjust_x;
976 bluradjust_y += be_adjust_y;
978 // Expand the buffer a bit when we're blurring, since that can also widen the borders a bit
979 bluradjust_x = (bluradjust_x+7)&~7;
980 bluradjust_y = (bluradjust_y+7)&~7;
// The margin applies to both sides: width/height grow by 2*margin, the pixel-sized plane by 2*margin/8.
982 output_overlay->mOffsetX -= bluradjust_x;
983 output_overlay->mOffsetY -= bluradjust_y;
984 output_overlay->mWidth += (bluradjust_x<<1);
985 output_overlay->mHeight += (bluradjust_y<<1);
986 output_overlay->mOverlayWidth += (bluradjust_x>>2);
987 output_overlay->mOverlayHeight += (bluradjust_y>>2);
989 else
991 return false;
994 output_overlay->mOverlayPitch = (output_overlay->mOverlayWidth+15)&~15;
996 BYTE* body = reinterpret_cast<BYTE*>(xy_malloc(output_overlay->mOverlayPitch * output_overlay->mOverlayHeight));
997 if( body==NULL )
999 return false;
1001 output_overlay->mBody.reset(body, xy_free);
1002 memset(body, 0, output_overlay->mOverlayPitch * output_overlay->mOverlayHeight);
1003 BYTE* border = NULL;
1004 if (!output_overlay->mfWideOutlineEmpty)
1006 border = reinterpret_cast<BYTE*>(xy_malloc(output_overlay->mOverlayPitch * output_overlay->mOverlayHeight));
1007 if (border==NULL)
1009 return false;
1011 output_overlay->mBorder.reset(border, xy_free);
1012 memset(border, 0, output_overlay->mOverlayPitch * output_overlay->mOverlayHeight);
// Copy each input plane into the output plane, shifted by the margin (in whole pixels).
1015 //copy buffer
1016 for(int i = 1; i >= 0; i--)
1018 byte* plan_selected = i==0 ? body : border;
1019 const byte* plan_input = i==0 ? input_overlay.mBody.get() : input_overlay.mBorder.get();
// NOTE(review): the offset is added before the NULL test below, so plan_selected!=NULL no longer
// detects a missing border plane; the copy is then only skipped if plan_input is NULL -- verify.
1021 plan_selected += (bluradjust_x>>3) + (bluradjust_y>>3)*output_overlay->mOverlayPitch;
1022 if ( plan_selected!=NULL && plan_input!=NULL )
1024 for (int j=0;j<input_overlay.mOverlayHeight;j++)
// NOTE(review): a full input pitch is copied per row, not just mOverlayWidth bytes -- confirm it always fits the output row.
1026 memcpy(plan_selected, plan_input, input_overlay.mOverlayPitch);
1027 plan_selected += output_overlay->mOverlayPitch;
1028 plan_input += input_overlay.mOverlayPitch;
// Scratch buffer shared by the blur calls below.
1033 ass_tmp_buf tmp_buf( max((output_overlay->mOverlayPitch+1)*(output_overlay->mOverlayHeight+1),0) );
1034 //flyweight<key_value<int, ass_tmp_buf, ass_tmp_buf_get_size>, no_locking> tmp_buf((overlay->mOverlayWidth+1)*(overlay->mOverlayPitch+1));
1035 // Do some gaussian blur magic
1036 if ( gaussian_blur_strength > GAUSSIAN_BLUR_THREHOLD )
1038 byte* plan_selected= output_overlay->mfWideOutlineEmpty ? body : border;
// The kernel tables are obtained through flyweights keyed by the scaled sigma.
1040 flyweight<key_value<double, ass_synth_priv, GaussianFilterKey<ass_synth_priv>>, no_locking>
1041 fw_priv_blur_x(gaussian_blur_strength_x);
1042 flyweight<key_value<double, ass_synth_priv, GaussianFilterKey<ass_synth_priv>>, no_locking>
1043 fw_priv_blur_y(gaussian_blur_strength_y);
1045 const ass_synth_priv& priv_blur_x = fw_priv_blur_x.get();
1046 const ass_synth_priv& priv_blur_y = fw_priv_blur_y.get();
// The Gaussian blur is skipped when the plane is smaller than the kernel.
1047 if (output_overlay->mOverlayWidth>=priv_blur_x.g_w && output_overlay->mOverlayHeight>=priv_blur_y.g_w)
1049 ass_gauss_blur(plan_selected, tmp_buf.tmp, output_overlay->mOverlayWidth, output_overlay->mOverlayHeight, output_overlay->mOverlayPitch,
1050 priv_blur_x.gt2, priv_blur_x.g_r, priv_blur_x.g_w,
1051 priv_blur_y.gt2, priv_blur_y.g_r, priv_blur_y.g_w);
// "be" blur: the integer part of the scaled strength is the number of full 3x3 passes,
// the fractional remainder is handed to xy_be_blur.
1055 float scaled_be_strength = be_strength * 0.5f * (target_scale_x+target_scale_y);
1056 int pass_num = static_cast<int>(scaled_be_strength);
1057 int pitch = output_overlay->mOverlayPitch;
1058 byte* blur_plan = output_overlay->mfWideOutlineEmpty ? body : border;
1060 for (int pass = 0; pass < pass_num; pass++)
1062 if(output_overlay->mOverlayWidth >= 3 && output_overlay->mOverlayHeight >= 3)
1064 if (g_cpuid.m_flags & CCpuID::sse2)
1066 be_blur(blur_plan, tmp_buf.tmp, output_overlay->mOverlayWidth, output_overlay->mOverlayHeight, pitch);
1068 else
1070 be_blur_c(blur_plan, tmp_buf.tmp, output_overlay->mOverlayWidth, output_overlay->mOverlayHeight, pitch);
1074 if (scaled_be_strength>pass_num)
1076 xy_be_blur(blur_plan, output_overlay->mOverlayWidth, output_overlay->mOverlayHeight, pitch,
1077 scaled_be_strength-pass_num, scaled_be_strength-pass_num);
1080 return true;
1083 // @return: true if actually a blur operation has done, or else false and output is leave unset.
1084 bool Rasterizer::Blur(const Overlay& input_overlay, float be_strength,
1085 double gaussian_blur_strength,
1086 double target_scale_x, double target_scale_y,
1087 SharedPtrOverlay output_overlay)
1089 using namespace ::boost::flyweights;
1091 ASSERT(IsItReallyBlur(be_strength, gaussian_blur_strength));
1092 if(!output_overlay || !IsItReallyBlur(be_strength, gaussian_blur_strength))
1094 return false;
1096 if (input_overlay.mOverlayWidth<=0 || input_overlay.mOverlayHeight<=0)
1098 return true;
1101 if (!(g_cpuid.m_flags & CCpuID::sse2))
1103 // C code path of floating point version is extremely slow,
1104 // so we fall back to fixed point version instead
1105 return Rasterizer::OldFixedPointBlur(input_overlay, be_strength,
1106 gaussian_blur_strength, target_scale_x, target_scale_y, output_overlay);//fix me: important!
1109 if (gaussian_blur_strength>0)
1111 if (be_strength)//this insane thing should NEVER happen
1113 SharedPtrOverlay tmp(new Overlay());
1115 bool rv = GaussianBlur(input_overlay, gaussian_blur_strength, target_scale_x, target_scale_y, tmp);
1116 ASSERT(rv);
1117 rv = BeBlur(*tmp, be_strength, target_scale_x, target_scale_y, output_overlay);
1118 ASSERT(rv);
1120 else
1122 bool rv = GaussianBlur(input_overlay, gaussian_blur_strength, target_scale_x, target_scale_y, output_overlay);
1123 ASSERT(rv);
1126 else if (be_strength)
1128 bool rv = BeBlur(input_overlay, be_strength, target_scale_x, target_scale_y, output_overlay);
1129 ASSERT(rv);
1131 return true;
1134 bool Rasterizer::GaussianBlur( const Overlay& input_overlay, double gaussian_blur_strength,
1135 double target_scale_x, double target_scale_y,
1136 SharedPtrOverlay output_overlay )
1138 using namespace ::boost::flyweights;
1140 ASSERT(output_overlay);
1141 output_overlay->CleanUp();
1142 output_overlay->mfWideOutlineEmpty = input_overlay.mfWideOutlineEmpty;
1144 ASSERT(gaussian_blur_strength > 0);
1146 double gaussian_blur_strength_x = gaussian_blur_strength*target_scale_x;
1147 double gaussian_blur_strength_y = gaussian_blur_strength*target_scale_y;
1149 int gaussian_blur_radius_x = (static_cast<int>( ceil(gaussian_blur_strength_x*3) ) | 1)/2;//fix me: rounding err?
1150 int gaussian_blur_radius_y = (static_cast<int>( ceil(gaussian_blur_strength_y*3) ) | 1)/2;//fix me: rounding err?
1151 if( gaussian_blur_radius_x < 1 && gaussian_blur_strength>GAUSSIAN_BLUR_THREHOLD )
1152 gaussian_blur_radius_x = 1;//make sure that it really do a blur
1153 if( gaussian_blur_radius_y < 1 && gaussian_blur_strength>GAUSSIAN_BLUR_THREHOLD )
1154 gaussian_blur_radius_y = 1;//make sure that it really do a blur
1156 flyweight<key_value<double, GaussianCoefficients, GaussianFilterKey<GaussianCoefficients>>, no_locking>
1157 fw_filter_x(gaussian_blur_strength_x);
1158 flyweight<key_value<double, GaussianCoefficients, GaussianFilterKey<GaussianCoefficients>>, no_locking>
1159 fw_filter_y(gaussian_blur_strength_y);
1161 const GaussianCoefficients& filter_x = fw_filter_x.get();
1162 const GaussianCoefficients& filter_y = fw_filter_y.get();
1164 int bluradjust_x = filter_x.g_r * 8;
1165 int bluradjust_y = filter_y.g_r * 8;
1166 output_overlay->mOffsetX = input_overlay.mOffsetX - bluradjust_x;
1167 output_overlay->mOffsetY = input_overlay.mOffsetY - bluradjust_y;
1168 output_overlay->mWidth = input_overlay.mWidth + (bluradjust_x<<1);
1169 output_overlay->mHeight = input_overlay.mHeight + (bluradjust_y<<1);
1170 output_overlay->mOverlayWidth = input_overlay.mOverlayWidth + (bluradjust_x>>2);
1171 output_overlay->mOverlayHeight = input_overlay.mOverlayHeight + (bluradjust_y>>2);
1173 output_overlay->mOverlayPitch = (output_overlay->mOverlayWidth+15)&~15;
1175 BYTE* blur_plan = reinterpret_cast<BYTE*>(xy_malloc(output_overlay->mOverlayPitch * output_overlay->mOverlayHeight));
1176 //memset(blur_plan, 0, output_overlay->mOverlayPitch * output_overlay->mOverlayHeight);
1178 const BYTE* plan_input = input_overlay.mfWideOutlineEmpty ? input_overlay.mBody.get() : input_overlay.mBorder.get();
1179 ASSERT(output_overlay->mOverlayWidth>=filter_x.g_w && output_overlay->mOverlayHeight>=filter_y.g_w);
1180 xy_gaussian_blur(blur_plan, output_overlay->mOverlayPitch,
1181 plan_input, input_overlay.mOverlayWidth, input_overlay.mOverlayHeight, input_overlay.mOverlayPitch,
1182 filter_x.g_f, filter_x.g_r, filter_x.g_w_ex,
1183 filter_y.g_f, filter_y.g_r, filter_y.g_w_ex);
1184 if (input_overlay.mfWideOutlineEmpty)
1186 output_overlay->mBody.reset(blur_plan, xy_free);
1188 else
1190 output_overlay->mBorder.reset(blur_plan, xy_free);
1192 BYTE* body = reinterpret_cast<BYTE*>(xy_malloc(output_overlay->mOverlayPitch * output_overlay->mOverlayHeight));
1193 if( body==NULL )
1195 return false;
1197 output_overlay->mBody.reset(body, xy_free);
1198 memset(body, 0, output_overlay->mOverlayPitch * (bluradjust_y>>3));
1199 body += (bluradjust_y>>3)*output_overlay->mOverlayPitch;
1200 plan_input = input_overlay.mBody.get();
1201 ASSERT(plan_input);
1202 for (int j=0;j<input_overlay.mOverlayHeight;j++)
1204 memset(body, 0, (bluradjust_x>>3));
1205 memcpy(body+(bluradjust_x>>3), plan_input, input_overlay.mOverlayWidth);
1206 memset(body+(bluradjust_x>>3)+input_overlay.mOverlayWidth, 0, (bluradjust_x>>3));
1207 body += output_overlay->mOverlayPitch;
1208 plan_input += input_overlay.mOverlayPitch;
1210 memset(body, 0, output_overlay->mOverlayPitch * (bluradjust_y>>3));
1212 return true;
1215 bool Rasterizer::BeBlur( const Overlay& input_overlay, float be_strength,
1216 float target_scale_x, float target_scale_y, SharedPtrOverlay output_overlay )
1218 ASSERT(output_overlay);
1219 output_overlay->CleanUp();
1220 output_overlay->mfWideOutlineEmpty = input_overlay.mfWideOutlineEmpty;
1222 ASSERT(be_strength>0 && target_scale_x>0 && target_scale_y>0);
1223 int bluradjust_x = static_cast<int>( target_scale_x*std::sqrt(be_strength*0.25f)+0.5 );//fix me: rounding err?
1224 bluradjust_x *= 8;
1225 int bluradjust_y = static_cast<int>(target_scale_y*std::sqrt(be_strength*0.25f)+0.5);//fix me: rounding err?
1226 bluradjust_y *= 8;
1228 output_overlay->mOffsetX = input_overlay.mOffsetX - bluradjust_x;
1229 output_overlay->mOffsetY = input_overlay.mOffsetY - bluradjust_y;
1230 output_overlay->mWidth = input_overlay.mWidth + (bluradjust_x<<1);
1231 output_overlay->mHeight = input_overlay.mHeight + (bluradjust_y<<1);
1232 output_overlay->mOverlayWidth = input_overlay.mOverlayWidth + (bluradjust_x>>2);
1233 output_overlay->mOverlayHeight = input_overlay.mOverlayHeight + (bluradjust_y>>2);
1235 output_overlay->mOverlayPitch = (output_overlay->mOverlayWidth+15)&~15;
1237 BYTE* body = reinterpret_cast<BYTE*>(xy_malloc(output_overlay->mOverlayPitch * output_overlay->mOverlayHeight));
1238 if( body==NULL )
1240 return false;
1242 output_overlay->mBody.reset(body, xy_free);
1243 memset(body, 0, output_overlay->mOverlayPitch * output_overlay->mOverlayHeight);
1244 BYTE* border = NULL;
1245 if (!output_overlay->mfWideOutlineEmpty)
1247 border = reinterpret_cast<BYTE*>(xy_malloc(output_overlay->mOverlayPitch * output_overlay->mOverlayHeight));
1248 if (border==NULL)
1250 return false;
1252 output_overlay->mBorder.reset(border, xy_free);
1253 memset(border, 0, output_overlay->mOverlayPitch * output_overlay->mOverlayHeight);
1256 //copy buffer
1257 for(int i = 1; i >= 0; i--)
1259 byte* plan_selected = i==0 ? body : border;
1260 const byte* plan_input = i==0 ? input_overlay.mBody.get() : input_overlay.mBorder.get();
1262 plan_selected += (bluradjust_x>>3) + (bluradjust_y>>3)*output_overlay->mOverlayPitch;
1263 if ( plan_selected!=NULL && plan_input!=NULL )
1265 for (int j=0;j<input_overlay.mOverlayHeight;j++)
1267 memcpy(plan_selected, plan_input, input_overlay.mOverlayWidth*sizeof(plan_input[0]));
1268 plan_selected += output_overlay->mOverlayPitch;
1269 plan_input += input_overlay.mOverlayPitch;
1273 if (be_strength<=0)
1275 return true;
1278 float scaled_be_strength = be_strength * 0.5f * (target_scale_x+target_scale_y);
1279 int pass_num = static_cast<int>(scaled_be_strength);
1280 int pitch = output_overlay->mOverlayPitch;
1281 byte* blur_plan = output_overlay->mfWideOutlineEmpty ? body : border;
1282 ass_tmp_buf tmp_buf( max((output_overlay->mOverlayPitch+1)*(output_overlay->mOverlayHeight+1),0) );
1283 for (int pass = 0; pass < pass_num; pass++)
1285 if(output_overlay->mOverlayWidth >= 3 && output_overlay->mOverlayHeight >= 3)
1287 if (g_cpuid.m_flags & CCpuID::sse2)
1289 be_blur(blur_plan, tmp_buf.tmp, output_overlay->mOverlayWidth, output_overlay->mOverlayHeight, pitch);
1291 else
1293 be_blur_c(blur_plan, tmp_buf.tmp, output_overlay->mOverlayWidth, output_overlay->mOverlayHeight, pitch);
1297 if (scaled_be_strength>pass_num)
1299 xy_be_blur(blur_plan, output_overlay->mOverlayWidth, output_overlay->mOverlayHeight, pitch,
1300 scaled_be_strength-pass_num, scaled_be_strength-pass_num);
1303 return true;
1306 ///////////////////////////////////////////////////////////////////////////
1308 static __forceinline void pixmix(DWORD *dst, DWORD color, DWORD alpha)
1310 int a = alpha;
1311 // Make sure both a and ia are in range 1..256 for the >>8 operations below to be correct
1312 int ia = 256-a;
1313 a+=1;
1314 *dst = ((((*dst&0x00ff00ff)*ia + (color&0x00ff00ff)*a)&0xff00ff00)>>8)
1315 | ((((*dst&0x0000ff00)*ia + (color&0x0000ff00)*a)&0x00ff0000)>>8)
1316 | ((((*dst>>8)&0x00ff0000)*ia)&0xff000000);
1319 static __forceinline void pixmix2(DWORD *dst, DWORD color, DWORD shapealpha, DWORD clipalpha)
1321 int a = (((shapealpha)*(clipalpha)*(color>>24))>>12)&0xff;
1322 int ia = 256-a;
1323 a+=1;
1324 *dst = ((((*dst&0x00ff00ff)*ia + (color&0x00ff00ff)*a)&0xff00ff00)>>8)
1325 | ((((*dst&0x0000ff00)*ia + (color&0x0000ff00)*a)&0x00ff0000)>>8)
1326 | ((((*dst>>8)&0x00ff0000)*ia)&0xff000000);
1329 #include <xmmintrin.h>
1330 #include <emmintrin.h>
1332 static __forceinline void pixmix_sse2(DWORD* dst, DWORD color, DWORD alpha)
1334 // alpha = (((alpha) * (color>>24)) >> 6) & 0xff;
1335 color &= 0xffffff;
1336 __m128i zero = _mm_setzero_si128();
1337 __m128i a = _mm_set1_epi32(((alpha+1) << 16) | (0x100 - alpha));
1338 __m128i d = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*dst), zero);
1339 __m128i s = _mm_unpacklo_epi8(_mm_cvtsi32_si128(color), zero);
1340 __m128i r = _mm_unpacklo_epi16(d, s);
1341 r = _mm_madd_epi16(r, a);
1342 r = _mm_srli_epi32(r, 8);
1343 r = _mm_packs_epi32(r, r);
1344 r = _mm_packus_epi16(r, r);
1345 *dst = (DWORD)_mm_cvtsi128_si32(r);
1348 static __forceinline void pixmix2_sse2(DWORD* dst, DWORD color, DWORD shapealpha, DWORD clipalpha)
1350 int alpha = (((shapealpha)*(clipalpha)*(color>>24))>>12)&0xff;
1351 color &= 0xffffff;
1352 __m128i zero = _mm_setzero_si128();
1353 __m128i a = _mm_set1_epi32(((alpha+1) << 16) | (0x100 - alpha));
1354 __m128i d = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*dst), zero);
1355 __m128i s = _mm_unpacklo_epi8(_mm_cvtsi32_si128(color), zero);
1356 __m128i r = _mm_unpacklo_epi16(d, s);
1357 r = _mm_madd_epi16(r, a);
1358 r = _mm_srli_epi32(r, 8);
1359 r = _mm_packs_epi32(r, r);
1360 r = _mm_packus_epi16(r, r);
1361 *dst = (DWORD)_mm_cvtsi128_si32(r);
1364 #include <mmintrin.h>
1366 // Calculate a - b clamping to 0 instead of underflowing
1367 static __forceinline DWORD safe_subtract(DWORD a, DWORD b)
1369 __m64 ap = _mm_cvtsi32_si64(a);
1370 __m64 bp = _mm_cvtsi32_si64(b);
1371 __m64 rp = _mm_subs_pu16(ap, bp);
1372 DWORD r = (DWORD)_mm_cvtsi64_si32(rp);
1373 _mm_empty();
1374 return r;
1375 //return (b > a) ? 0 : a - b;
1378 /***
1379 * No aligned requirement
1382 void AlphaBlt(byte* pY,
1383 const byte* pAlphaMask,
1384 const byte Y,
1385 int h, int w, int src_stride, int dst_stride)
1387 __m128i zero = _mm_setzero_si128();
1388 __m128i s = _mm_set1_epi16(Y); //s = c 0 c 0 c 0 c 0 c 0 c 0 c 0 c 0
1390 if( w>16 )//IMPORTANT! The result of the following code is undefined with w<15.
1392 for( ; h>0; h--, pAlphaMask += src_stride, pY += dst_stride )
1394 const BYTE* sa = pAlphaMask;
1395 BYTE* dy = pY;
1396 const BYTE* dy_first_mod16 = reinterpret_cast<BYTE*>((reinterpret_cast<int>(pY)+15)&~15); //IMPORTANT! w must >= 15
1397 const BYTE* dy_end_mod16 = reinterpret_cast<BYTE*>(reinterpret_cast<int>(pY+w)&~15);
1398 const BYTE* dy_end = pY + w;
1400 for(;dy < dy_first_mod16; sa++, dy++)
1402 *dy = (*dy * (256 - *sa)+ Y*(*sa+1))>>8;
1404 for(; dy < dy_end_mod16; sa+=8, dy+=16)
1406 __m128i a = _mm_loadl_epi64((__m128i*)sa);
1409 __m128i d = _mm_load_si128((__m128i*)dy);
1411 //__m128i ones = _mm_cmpeq_epi32(zero,zero); //ones = ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
1412 //__m128i ia = _mm_xor_si128(a,ones); //ia = ~a
1413 //ia = _mm_unpacklo_epi8(ia,zero); //ia = ~a0 0 ~a1 0 ~a2 0 ~a3 0 ~a4 0 ~a5 0 ~a6 0 ~a7 0
1414 a = _mm_unpacklo_epi8(a,zero); //a= a0 0 a1 0 a2 0 a3 0 a4 0 a5 0 a6 0 a7 0
1415 __m128i ones = _mm_set1_epi16(256); //ones = 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1
1416 __m128i ia = _mm_sub_epi16(ones, a); //ia = 256-a0 ... 256-a7
1417 ones = _mm_srli_epi16(ones, 8);
1418 a = _mm_add_epi16(a, ones); //a= 1+a0 ... 1+a7
1420 __m128i dl = _mm_unpacklo_epi8(d,zero); //d = b0 0 b1 0 b2 0 b3 0 b4 0 b5 0 b6 0 b7 0
1421 __m128i sl = _mm_mullo_epi16(s,a); //sl = c0*a0 c1*a1 ... c7*a7
1423 dl = _mm_mullo_epi16(dl,ia); //d = b0*~a0 b1*~a1 ... b7*~a7
1425 dl = _mm_add_epi16(dl,sl); //d = d + sl
1426 dl = _mm_srli_epi16(dl, 8); //d = d>>8
1428 sa += 8;
1429 a = _mm_loadl_epi64((__m128i*)sa);
1431 a = _mm_unpacklo_epi8(a,zero);
1432 ones = _mm_slli_epi16(ones, 8);
1433 ia = _mm_sub_epi16(ones, a);
1434 ones = _mm_srli_epi16(ones, 8);
1435 a = _mm_add_epi16(a,ones);
1437 d = _mm_unpackhi_epi8(d,zero);
1438 sl = _mm_mullo_epi16(s,a);
1439 d = _mm_mullo_epi16(d,ia);
1440 d = _mm_add_epi16(d,sl);
1441 d = _mm_srli_epi16(d, 8);
1443 dl = _mm_packus_epi16(dl,d);
1445 _mm_store_si128((__m128i*)dy, dl);
1447 for(;dy < dy_end; sa++, dy++)
1449 *dy = (*dy * (256 - *sa)+ Y*(*sa+1))>>8;
1453 else
1455 for( ; h>0; h--, pAlphaMask += src_stride, pY += dst_stride )
1457 const BYTE* sa = pAlphaMask;
1458 BYTE* dy = pY;
1459 const BYTE* dy_end = pY + w;
1461 for(;dy < dy_end; sa++, dy++)
1463 *dy = (*dy * (256 - *sa)+ Y*(*sa+1))>>8;
1467 //__asm emms;
1470 /***
1471 * No aligned requirement
1474 void AlphaBlt(byte* pY,
1475 const byte alpha,
1476 const byte Y,
1477 int h, int w, int dst_stride)
1479 int yPremul = Y*(alpha+1);
1480 int dstAlpha = 0x100 - alpha;
1481 if( w>32 )//IMPORTANT! The result of the following code is undefined with w<15.
1483 __m128i zero = _mm_setzero_si128();
1484 __m128i s = _mm_set1_epi16(yPremul); //s = c 0 c 0 c 0 c 0 c 0 c 0 c 0 c 0
1485 __m128i ia = _mm_set1_epi16(dstAlpha);
1486 for( ; h>0; h--, pY += dst_stride )
1488 BYTE* dy = pY;
1489 const BYTE* dy_first_mod16 = reinterpret_cast<BYTE*>((reinterpret_cast<int>(pY)+15)&~15); //IMPORTANT! w must >= 15
1490 const BYTE* dy_end_mod16 = reinterpret_cast<BYTE*>(reinterpret_cast<int>(pY+w)&~15);
1491 const BYTE* dy_end = pY + w;
1493 for(;dy < dy_first_mod16; dy++)
1495 *dy = (*dy * dstAlpha + yPremul)>>8;
1497 for(; dy < dy_end_mod16; dy+=16)
1500 __m128i d = _mm_load_si128(reinterpret_cast<const __m128i*>(dy));
1501 __m128i dl = _mm_unpacklo_epi8(d,zero); //d = b0 0 b1 0 b2 0 b3 0 b4 0 b5 0 b6 0 b7 0
1503 dl = _mm_mullo_epi16(dl,ia); //d = b0*~a0 b1*~a1 ... b7*~a7
1504 dl = _mm_adds_epu16(dl,s); //d = d + s
1505 dl = _mm_srli_epi16(dl, 8); //d = d>>8
1507 d = _mm_unpackhi_epi8(d,zero);
1508 d = _mm_mullo_epi16(d,ia);
1509 d = _mm_adds_epu16(d,s);
1510 d = _mm_srli_epi16(d, 8);
1512 dl = _mm_packus_epi16(dl,d);
1514 _mm_store_si128(reinterpret_cast<__m128i*>(dy), dl);
1516 for(;dy < dy_end; dy++)
1518 *dy = (*dy * dstAlpha + yPremul)>>8;
1522 else
1524 for( ; h>0; h--, pY += dst_stride )
1526 BYTE* dy = pY;
1527 const BYTE* dy_end = pY + w;
1529 for(;dy < dy_end; dy++)
1531 *dy = (*dy * dstAlpha + yPremul)>>8;
1535 //__asm emms;
1538 /***
1539 * No aligned requirement
1542 void AlphaBltC(byte* pY,
1543 const byte alpha,
1544 const byte Y,
1545 int h, int w, int dst_stride)
1547 int yPremul = Y*(alpha+1);
1548 int dstAlpha = 0x100 - alpha;
1550 for( ; h>0; h--, pY += dst_stride )
1552 BYTE* dy = pY;
1553 const BYTE* dy_end = pY + w;
1555 for(;dy < dy_end; dy++)
1557 *dy = (*dy * dstAlpha + yPremul)>>8;
1562 // For CPUID usage in Rasterizer::Draw
1563 #include "../dsutil/vd.h"
1565 void OverlapRegion(tSpanBuffer& dst, const tSpanBuffer& src, int dx, int dy)
1567 tSpanBuffer temp;
1568 temp.reserve(dst.size() + src.size());
1569 dst.swap(temp);
1570 tSpanBuffer::iterator itA = temp.begin();
1571 tSpanBuffer::iterator itAE = temp.end();
1572 tSpanBuffer::const_iterator itB = src.begin();
1573 tSpanBuffer::const_iterator itBE = src.end();
1574 // Don't worry -- even if dy<0 this will still work! // G: hehe, the evil twin :)
1575 unsigned __int64 offset1 = (((__int64)dy)<<32) - dx;
1576 unsigned __int64 offset2 = (((__int64)dy)<<32) + dx;
1577 while(itA != itAE && itB != itBE)
1579 if((*itB).first + offset1 < (*itA).first)
1581 // B span is earlier. Use it.
1582 unsigned __int64 x1 = (*itB).first + offset1;
1583 unsigned __int64 x2 = (*itB).second + offset2;
1584 ++itB;
1585 // B spans don't overlap, so begin merge loop with A first.
1586 for(;;)
1588 // If we run out of A spans or the A span doesn't overlap,
1589 // then the next B span can't either (because B spans don't
1590 // overlap) and we exit.
1591 if(itA == itAE || (*itA).first > x2)
1592 break;
1593 do {x2 = _MAX(x2, (*itA++).second);}
1594 while(itA != itAE && (*itA).first <= x2);
1595 // If we run out of B spans or the B span doesn't overlap,
1596 // then the next A span can't either (because A spans don't
1597 // overlap) and we exit.
1598 if(itB == itBE || (*itB).first + offset1 > x2)
1599 break;
1600 do {x2 = _MAX(x2, (*itB++).second + offset2);}
1601 while(itB != itBE && (*itB).first + offset1 <= x2);
1603 // Flush span.
1604 dst.push_back(tSpan(x1, x2));
1606 else
1608 // A span is earlier. Use it.
1609 unsigned __int64 x1 = (*itA).first;
1610 unsigned __int64 x2 = (*itA).second;
1611 ++itA;
1612 // A spans don't overlap, so begin merge loop with B first.
1613 for(;;)
1615 // If we run out of B spans or the B span doesn't overlap,
1616 // then the next A span can't either (because A spans don't
1617 // overlap) and we exit.
1618 if(itB == itBE || (*itB).first + offset1 > x2)
1619 break;
1620 do {x2 = _MAX(x2, (*itB++).second + offset2);}
1621 while(itB != itBE && (*itB).first + offset1 <= x2);
1622 // If we run out of A spans or the A span doesn't overlap,
1623 // then the next B span can't either (because B spans don't
1624 // overlap) and we exit.
1625 if(itA == itAE || (*itA).first > x2)
1626 break;
1627 do {x2 = _MAX(x2, (*itA++).second);}
1628 while(itA != itAE && (*itA).first <= x2);
1630 // Flush span.
1631 dst.push_back(tSpan(x1, x2));
1634 // Copy over leftover spans.
1635 while(itA != itAE)
1636 dst.push_back(*itA++);
1637 while(itB != itBE)
1639 dst.push_back(tSpan((*itB).first + offset1, (*itB).second + offset2));
1640 ++itB;
1644 // Render a subpicture onto a surface.
1645 // spd is the surface to render on.
1646 // clipRect is a rectangular clip region to render inside.
1647 // pAlphaMask is an alpha clipping mask.
1648 // xsub and ysub ???
1649 // switchpts seems to be an array of fill colours interlaced with coordinates.
1650 // switchpts[i*2] contains a colour and switchpts[i*2+1] contains the coordinate to use that colour from
1651 // fBody tells whether to render the body of the subs.
1652 // fBorder tells whether to render the border of the subs.
1653 SharedPtrByte Rasterizer::CompositeAlphaMask(const SharedPtrOverlay& overlay, const CRect& clipRect,
1654 const GrayImage2* alpha_mask,
1655 int xsub, int ysub, const DWORD* switchpts, bool fBody, bool fBorder,
1656 CRect *outputDirtyRect)
1658 //fix me: check and log error
1659 SharedPtrByte result;
1660 *outputDirtyRect = CRect(0, 0, 0, 0);
1661 if (!switchpts || !fBody && !fBorder) return result;
1662 if (fBorder && !overlay->mBorder) return result;
1664 CRect r = clipRect;
1665 if (alpha_mask!=NULL)
1667 r &= CRect(alpha_mask->left_top, alpha_mask->size);
1670 // Remember that all subtitle coordinates are specified in 1/8 pixels
1671 // (x+4)>>3 rounds to nearest whole pixel.
1672 // ??? What is xsub, ysub, mOffsetX and mOffsetY ?
1673 int x = (xsub + overlay->mOffsetX + 4)>>3;
1674 int y = (ysub + overlay->mOffsetY + 4)>>3;
1675 int w = overlay->mOverlayWidth;
1676 int h = overlay->mOverlayHeight;
1677 int xo = 0, yo = 0;
1678 // Again, limiting?
1679 if(x < r.left) {xo = r.left-x; w -= r.left-x; x = r.left;}
1680 if(y < r.top) {yo = r.top-y; h -= r.top-y; y = r.top;}
1681 if(x+w > r.right) w = r.right-x;
1682 if(y+h > r.bottom) h = r.bottom-y;
1683 // Check if there's actually anything to render
1684 if(w <= 0 || h <= 0) return(result);
1685 outputDirtyRect->SetRect(x, y, x+w, y+h);
1687 bool fSingleColor = (switchpts[1]==0xffffffff);
1689 // draw
1690 // Grab the first colour
1691 DWORD color = switchpts[0];
1692 byte* s_base = (byte*)xy_malloc(overlay->mOverlayPitch * overlay->mOverlayHeight);
1693 const byte* alpha_mask_data = alpha_mask != NULL ? alpha_mask->data.get() : NULL;
1694 const int alpha_mask_pitch = alpha_mask != NULL ? alpha_mask->pitch : 0;
1695 if(alpha_mask_data!=NULL )
1696 alpha_mask_data += alpha_mask->pitch * y + x - alpha_mask->left_top.y*alpha_mask->pitch - alpha_mask->left_top.x;
1698 if(fSingleColor)
1700 overlay->FillAlphaMash(s_base, fBody, fBorder, xo, yo, w, h,
1701 alpha_mask_data, alpha_mask_pitch,
1702 color>>24 );
1704 else
1706 int last_x = xo;
1707 const DWORD *sw = switchpts;
1708 while( last_x<w+xo )
1710 byte alpha = sw[0]>>24;
1711 while( sw[3]<w+xo && (sw[2]>>24)==alpha )
1713 sw += 2;
1715 int new_x = sw[3] < w+xo ? sw[3] : w+xo;
1716 overlay->FillAlphaMash(s_base, fBody, fBorder,
1717 last_x, yo, new_x-last_x, h,
1718 alpha_mask_data, alpha_mask_pitch,
1719 alpha );
1720 last_x = new_x;
1721 sw += 2;
1724 result.reset( s_base, xy_free );
1725 return result;
1730 // draw overlay[clipRect] to bitmap[0,0,w,h]
// Blends the 8-bit alpha plane s_base onto bitmap, using the colour(s) given by
// switchpts, inside clipRect.
//   bitmap     destination; either packed 32-bit pixels or four separate 8-bit
//              planes (bitmap->type == XyBitmap::PLANNA)
//   overlay    supplies offset, size and pitch of the alpha plane
//   clipRect   rectangular clip region in whole pixels
//   s_base     alpha plane laid out with overlay->mOverlayPitch bytes per row
//   xsub,ysub  added to overlay->mOffsetX/Y before rounding to whole pixels
//   switchpts  colour/coordinate pairs; switchpts[1]==0xffffffff means one colour
//   fBody, fBorder  only consulted by the early-out test below
// The function returns without drawing when switchpts is NULL, when neither
// fBody nor fBorder is set, when bitmap is NULL, or when the clipped area is empty.
// NOTE(review): s_base and overlay are dereferenced without a NULL check.
1732 void Rasterizer::Draw(XyBitmap* bitmap, SharedPtrOverlay overlay, const CRect& clipRect, byte* s_base,
1733 int xsub, int ysub, const DWORD* switchpts, bool fBody, bool fBorder)
1735 if (!switchpts || !fBody && !fBorder) return;
1736 if (bitmap==NULL)
1738 ASSERT(0);
1739 return;
1741 // clip
1742 // Limit drawn area to rectangular clip area
1743 CRect r = clipRect;
1744 // Remember that all subtitle coordinates are specified in 1/8 pixels
1745 // (x+4)>>3 rounds to nearest whole pixel.
1746 int overlayPitch = overlay->mOverlayPitch;
1747 int x = (xsub + overlay->mOffsetX + 4)>>3;
1748 int y = (ysub + overlay->mOffsetY + 4)>>3;
1749 int w = overlay->mOverlayWidth;
1750 int h = overlay->mOverlayHeight;
// xo/yo record how far the clip rectangle cut into the left/top of the overlay.
1751 int xo = 0, yo = 0;
1753 if(x < r.left) {xo = r.left-x; w -= r.left-x; x = r.left;}
1754 if(y < r.top) {yo = r.top-y; h -= r.top-y; y = r.top;}
1755 if(x+w > r.right) w = r.right-x;
1756 if(y+h > r.bottom) h = r.bottom-y;
1757 // Check if there's actually anything to render
1758 if (w <= 0 || h <= 0) return;
1759 // must have enough space to draw into
1760 ASSERT(x >= bitmap->x && y >= bitmap->y && x+w <= bitmap->x + bitmap->w && y+h <= bitmap->y + bitmap->h );
1762 // CPUID from VDub
// draw_method is an OR of DM flags (single colour / SSE2 / planar) and selects
// one of the eight switch cases below.
1763 bool fSSE2 = !!(g_cpuid.m_flags & CCpuID::sse2);
1764 bool fSingleColor = (switchpts[1]==0xffffffff);
1765 bool PLANAR = (bitmap->type==XyBitmap::PLANNA);
1766 int draw_method = 0;
1767 if(fSingleColor)
1768 draw_method |= DM::SINGLE_COLOR;
1769 if(fSSE2)
1770 draw_method |= DM::SSE2;
1771 if(PLANAR)
1772 draw_method |= DM::AYUV_PLANAR;
1774 // draw
1775 // Grab the first colour
1776 DWORD color = switchpts[0];
// s points at the alpha byte for the first visible pixel (column xo, row yo).
1777 const byte* s = s_base + overlay->mOverlayPitch*yo + xo;
// Byte offset of pixel (x, y) inside the bitmap: one byte per pixel for planar
// bitmaps, four bytes per pixel otherwise.
1779 int dst_offset = 0;
1780 if (bitmap->type==XyBitmap::PLANNA)
1781 dst_offset = bitmap->pitch*(y-bitmap->y) + x - bitmap->x;
1782 else
1783 dst_offset = bitmap->pitch*(y-bitmap->y) + (x - bitmap->x)*4;
1784 unsigned long* dst = (unsigned long*)((BYTE*)bitmap->plans[0] + dst_offset);
1786 // Every remaining line in the bitmap to be rendered...
1787 switch(draw_method)
// Packed pixels, single colour, SSE2.
1789 case DM::SINGLE_COLOR | DM::SSE2 | 0*DM::AYUV_PLANAR :
1791 while(h--)
1793 for(int wt=0; wt<w; ++wt)
// NOTE(review): the three comment lines below mention a <<6, but the call passes
// s[wt] unshifted; the comment looks stale.
1794 // The <<6 is due to pixmix expecting the alpha parameter to be
1795 // the multiplication of two 6-bit unsigned numbers but we
1796 // only have one here. (No alpha mask.)
1797 pixmix_sse2(&dst[wt], color, s[wt]);
1798 s += overlayPitch;
1799 dst = (unsigned long *)((char *)dst + bitmap->pitch);
1802 break;
// Packed pixels, single colour, plain C++.
1803 case DM::SINGLE_COLOR | 0*DM::SSE2 | 0*DM::AYUV_PLANAR :
1805 while(h--)
1807 for(int wt=0; wt<w; ++wt)
1808 pixmix(&dst[wt], color, s[wt]);
1809 s += overlayPitch;
1810 dst = (unsigned long *)((char *)dst + bitmap->pitch);
1813 break;
// Packed pixels, several colours, SSE2. The switch-point walk restarts from
// switchpts on every row.
1814 case 0*DM::SINGLE_COLOR | DM::SSE2 | 0*DM::AYUV_PLANAR :
1816 while(h--)
1818 const DWORD *sw = switchpts;
1819 for(int wt=0; wt<w; ++wt)
1821 // xo is the offset (usually negative) we have moved into the image
1822 // So if we have passed the switchpoint (?) switch to another colour
1823 // (So switchpts stores both colours *and* coordinates?)
1824 if(wt+xo >= sw[1]) {while(wt+xo >= sw[1]) sw += 2; color = sw[-2];}
1825 pixmix_sse2(&dst[wt], color, s[wt]);
1827 s += overlayPitch;
1828 dst = (unsigned long *)((char *)dst + bitmap->pitch);
1831 break;
// Packed pixels, several colours, plain C++.
1832 case 0*DM::SINGLE_COLOR | 0*DM::SSE2 | 0*DM::AYUV_PLANAR :
1834 while(h--)
1836 const DWORD *sw = switchpts;
1837 for(int wt=0; wt<w; ++wt)
1839 if(wt+xo >= sw[1]) {while(wt+xo >= sw[1]) sw += 2; color = sw[-2];}
1840 pixmix(&dst[wt], color, s[wt]);
1842 s += overlayPitch;
1843 dst = (unsigned long *)((char *)dst + bitmap->pitch);
1846 break;
// Planar, single colour, SSE2: each plane is blended separately. Bits 16-23,
// 8-15 and 0-7 of the colour go to plans[1], plans[2] and plans[3]; plans[0] is
// blended towards 0.
1847 case DM::SINGLE_COLOR | DM::SSE2 | DM::AYUV_PLANAR :
1849 unsigned char* dst_A = bitmap->plans[0] + dst_offset;
1850 unsigned char* dst_Y = bitmap->plans[1] + dst_offset;
1851 unsigned char* dst_U = bitmap->plans[2] + dst_offset;
1852 unsigned char* dst_V = bitmap->plans[3] + dst_offset;
1854 AlphaBlt(dst_Y, s, ((color)>>16)&0xff, h, w, overlayPitch, bitmap->pitch);
1855 AlphaBlt(dst_U, s, ((color)>>8)&0xff, h, w, overlayPitch, bitmap->pitch);
1856 AlphaBlt(dst_V, s, ((color))&0xff, h, w, overlayPitch, bitmap->pitch);
1857 AlphaBlt(dst_A, s, 0, h, w, overlayPitch, bitmap->pitch);
1859 break;
// Planar, several colours, SSE2: the visible width is cut into vertical strips,
// one per switch point, and every strip is blended over the full height h.
1860 case 0*DM::SINGLE_COLOR | DM::SSE2 | DM::AYUV_PLANAR :
1862 unsigned char* dst_A = bitmap->plans[0] + dst_offset;
1863 unsigned char* dst_Y = bitmap->plans[1] + dst_offset;
1864 unsigned char* dst_U = bitmap->plans[2] + dst_offset;
1865 unsigned char* dst_V = bitmap->plans[3] + dst_offset;
1867 const DWORD *sw = switchpts;
1868 int last_x = xo;
1869 color = sw[0];
1870 while(last_x<w+xo)
// sw[3] is the x coordinate at which the next colour starts; the strip ends
// there or at the right edge, whichever comes first.
1872 int new_x = sw[3] < w+xo ? sw[3] : w+xo;
1873 color = sw[0];
1874 sw += 2;
// Switch points that lie left of the visible area are skipped.
1875 if( new_x < last_x )
1876 continue;
1877 AlphaBlt(dst_Y, s + last_x - xo, (color>>16)&0xff, h, new_x-last_x, overlayPitch, bitmap->pitch);
1878 AlphaBlt(dst_U, s + last_x - xo, (color>>8)&0xff, h, new_x-last_x, overlayPitch, bitmap->pitch);
1879 AlphaBlt(dst_V, s + last_x - xo, (color)&0xff, h, new_x-last_x, overlayPitch, bitmap->pitch);
1880 AlphaBlt(dst_A, s + last_x - xo, 0, h, new_x-last_x, overlayPitch, bitmap->pitch);
1882 dst_A += new_x - last_x;
1883 dst_Y += new_x - last_x;
1884 dst_U += new_x - last_x;
1885 dst_V += new_x - last_x;
1886 last_x = new_x;
1889 break;
// Planar, single colour, plain C++: the four plane bytes are packed into one
// DWORD, blended with pixmix and written back to the planes.
1890 case DM::SINGLE_COLOR | 0*DM::SSE2 | DM::AYUV_PLANAR :
1892 // char * debug_dst=(char*)dst;int h2 = h;
1893 // XY_DO_ONCE( xy_logger::write_file("G:\\b2_rt", (char*)&color, sizeof(color)) );
1894 // XY_DO_ONCE( xy_logger::write_file("G:\\b2_rt", debug_dst, (h2-1)*spd.pitch) );
1895 // debug_dst += spd.pitch*spd.h;
1896 // XY_DO_ONCE( xy_logger::write_file("G:\\b2_rt", debug_dst, (h2-1)*spd.pitch) );
1897 // debug_dst += spd.pitch*spd.h;
1898 // XY_DO_ONCE( xy_logger::write_file("G:\\b2_rt", debug_dst, (h2-1)*spd.pitch) );
1899 // debug_dst += spd.pitch*spd.h;
1900 // XY_DO_ONCE( xy_logger::write_file("G:\\b2_rt", debug_dst, (h2-1)*spd.pitch) );
1901 // debug_dst=(char*)dst;
1903 unsigned char* dst_A = bitmap->plans[0] + dst_offset;
1904 unsigned char* dst_Y = bitmap->plans[1] + dst_offset;
1905 unsigned char* dst_U = bitmap->plans[2] + dst_offset;
1906 unsigned char* dst_V = bitmap->plans[3] + dst_offset;
1907 while(h--)
1909 for(int wt=0; wt<w; ++wt)
1911 DWORD temp = COMBINE_AYUV(dst_A[wt], dst_Y[wt], dst_U[wt], dst_V[wt]);
1912 pixmix(&temp, color, s[wt]);
1913 SPLIT_AYUV(temp, dst_A+wt, dst_Y+wt, dst_U+wt, dst_V+wt);
1915 s += overlayPitch;
1916 dst_A += bitmap->pitch;
1917 dst_Y += bitmap->pitch;
1918 dst_U += bitmap->pitch;
1919 dst_V += bitmap->pitch;
1921 // XY_DO_ONCE( xy_logger::write_file("G:\\a2_rt", debug_dst, (h2-1)*spd.pitch) );
1922 // debug_dst += spd.pitch*spd.h;
1923 // XY_DO_ONCE( xy_logger::write_file("G:\\a2_rt", debug_dst, (h2-1)*spd.pitch) );
1924 // debug_dst += spd.pitch*spd.h;
1925 // XY_DO_ONCE( xy_logger::write_file("G:\\a2_rt", debug_dst, (h2-1)*spd.pitch) );
1926 // debug_dst += spd.pitch*spd.h;
1927 // XY_DO_ONCE( xy_logger::write_file("G:\\a2_rt", debug_dst, (h2-1)*spd.pitch) );
1929 break;
// Planar, several colours, plain C++.
// NOTE(review): this is the only case that additionally scales the alpha by
// (color>>24); the other cases pass s[wt] unchanged. Confirm this is intended.
1930 case 0*DM::SINGLE_COLOR | 0*DM::SSE2 | DM::AYUV_PLANAR :
1932 unsigned char* dst_A = bitmap->plans[0] + dst_offset;
1933 unsigned char* dst_Y = bitmap->plans[1] + dst_offset;
1934 unsigned char* dst_U = bitmap->plans[2] + dst_offset;
1935 unsigned char* dst_V = bitmap->plans[3] + dst_offset;
1936 while(h--)
1938 const DWORD *sw = switchpts;
1939 for(int wt=0; wt<w; ++wt)
1941 if(wt+xo >= sw[1]) {while(wt+xo >= sw[1]) sw += 2; color = sw[-2];}
1942 DWORD temp = COMBINE_AYUV(dst_A[wt], dst_Y[wt], dst_U[wt], dst_V[wt]);
1943 pixmix(&temp, color, (s[wt]*(color>>24))>>8);
1944 SPLIT_AYUV(temp, dst_A+wt, dst_Y+wt, dst_U+wt, dst_V+wt);
1946 s += overlayPitch;
1947 dst_A += bitmap->pitch;
1948 dst_Y += bitmap->pitch;
1949 dst_U += bitmap->pitch;
1950 dst_V += bitmap->pitch;
1953 break;
// NOTE(review): no EMMS is issued in this function; the reminder below appears
// to be left over from an earlier MMX-based implementation.
1955 // Remember to EMMS!
1956 // Rendering fails in funny ways if we don't do this.
1957 return;
1960 void Rasterizer::FillSolidRect(SubPicDesc& spd, int x, int y, int nWidth, int nHeight, DWORD argb)
1962 bool fSSE2 = !!(g_cpuid.m_flags & CCpuID::sse2);
1963 bool AYUV_PLANAR = (spd.type==MSP_AYUV_PLANAR);
1964 int draw_method = 0;
1965 if(fSSE2)
1966 draw_method |= DM::SSE2;
1967 if(AYUV_PLANAR)
1968 draw_method |= DM::AYUV_PLANAR;
1970 switch (draw_method)
1972 case DM::SSE2 | 0*DM::AYUV_PLANAR :
1974 for (int wy=y; wy<y+nHeight; wy++) {
1975 DWORD* dst = (DWORD*)((BYTE*)spd.bits + spd.pitch * wy) + x;
1976 for(int wt=0; wt<nWidth; ++wt) {
1977 pixmix_sse2(&dst[wt], argb, argb>>24);
1981 break;
1982 case 0*DM::SSE2 | 0*DM::AYUV_PLANAR :
1984 for (int wy=y; wy<y+nHeight; wy++) {
1985 DWORD* dst = (DWORD*)((BYTE*)spd.bits + spd.pitch * wy) + x;
1986 for(int wt=0; wt<nWidth; ++wt) {
1987 pixmix(&dst[wt], argb, argb>>24);
1991 break;
1992 case DM::SSE2 | DM::AYUV_PLANAR :
1994 BYTE* dst = reinterpret_cast<BYTE*>(spd.bits) + spd.pitch * y + x;
1995 BYTE* dst_A = dst;
1996 BYTE* dst_Y = dst_A + spd.pitch*spd.h;
1997 BYTE* dst_U = dst_Y + spd.pitch*spd.h;
1998 BYTE* dst_V = dst_U + spd.pitch*spd.h;
1999 AlphaBlt(dst_Y, argb>>24, ((argb)>>16)&0xff, nHeight, nWidth, spd.pitch);
2000 AlphaBlt(dst_U, argb>>24, ((argb)>>8)&0xff, nHeight, nWidth, spd.pitch);
2001 AlphaBlt(dst_V, argb>>24, ((argb))&0xff, nHeight, nWidth, spd.pitch);
2002 AlphaBlt(dst_A, argb>>24, 0, nHeight, nWidth, spd.pitch);
2004 break;
2005 case 0*DM::SSE2 | DM::AYUV_PLANAR :
2007 BYTE* dst = reinterpret_cast<BYTE*>(spd.bits) + spd.pitch * y + x;
2008 BYTE* dst_A = dst;
2009 BYTE* dst_Y = dst_A + spd.pitch*spd.h;
2010 BYTE* dst_U = dst_Y + spd.pitch*spd.h;
2011 BYTE* dst_V = dst_U + spd.pitch*spd.h;
2012 AlphaBltC(dst_Y, argb>>24, ((argb)>>16)&0xff, nHeight, nWidth, spd.pitch);
2013 AlphaBltC(dst_U, argb>>24, ((argb)>>8)&0xff, nHeight, nWidth, spd.pitch);
2014 AlphaBltC(dst_V, argb>>24, ((argb))&0xff, nHeight, nWidth, spd.pitch);
2015 AlphaBltC(dst_A, argb>>24, 0, nHeight, nWidth, spd.pitch);
2017 break;
2022 ///////////////////////////////////////////////////////////////
2024 // Overlay
2026 void Overlay::_DoFillAlphaMash(byte* outputAlphaMask, const byte* pBody, const byte* pBorder, int x, int y, int w, int h,
2027 const byte* pAlphaMask, int pitch, DWORD color_alpha )
2029 if (g_cpuid.m_flags & CCpuID::sse2)
2031 pBody = pBody!=NULL ? pBody + y*mOverlayPitch + x: NULL;
2032 pBorder = pBorder!=NULL ? pBorder + y*mOverlayPitch + x: NULL;
2033 byte* dst = outputAlphaMask + y*mOverlayPitch + x;
2035 const int x0 = ((reinterpret_cast<int>(dst)+3)&~3) - reinterpret_cast<int>(dst) < w ?
2036 ((reinterpret_cast<int>(dst)+3)&~3) - reinterpret_cast<int>(dst) : w; //IMPORTANT! Should not exceed w.
2037 const int x00 = ((reinterpret_cast<int>(dst)+15)&~15) - reinterpret_cast<int>(dst) < w ?
2038 ((reinterpret_cast<int>(dst)+15)&~15) - reinterpret_cast<int>(dst) : w;//IMPORTANT! Should not exceed w.
2039 const int x_end00 = ((reinterpret_cast<int>(dst)+w)&~15) - reinterpret_cast<int>(dst);
2040 const int x_end0 = ((reinterpret_cast<int>(dst)+w)&~3) - reinterpret_cast<int>(dst);
2041 const int x_end = w;
2043 __m64 color_alpha_64 = _mm_set1_pi16(color_alpha);
2044 __m128i color_alpha_128 = _mm_set1_epi16(color_alpha);
2046 if(pAlphaMask==NULL && pBody!=NULL && pBorder!=NULL)
2049 __asm
2051 mov eax, color_alpha
2052 movd XMM3, eax
2053 punpcklwd XMM3, XMM3
2054 pshufd XMM3, XMM3, 0
2057 while(h--)
2059 int j=0;
2060 for( ; j<x0; j++ )
2062 int temp = pBorder[j]-pBody[j];
2063 temp = temp<0 ? 0 : temp;
2064 dst[j] = (temp * color_alpha)>>6;
2066 for( ;j<x00;j+=4 )
2068 __m64 border = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBorder+j));
2069 __m64 body = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBody+j));
2070 border = _mm_subs_pu8(border, body);
2071 __m64 zero = _mm_setzero_si64();
2072 border = _mm_unpacklo_pi8(border, zero);
2073 border = _mm_mullo_pi16(border, color_alpha_64);
2074 border = _mm_srli_pi16(border, 6);
2075 border = _mm_packs_pu16(border,border);
2076 *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(border);
2078 __m128i zero = _mm_setzero_si128();
2079 for( ;j<x_end00;j+=16)
2081 __m128i border = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pBorder+j));
2082 __m128i body = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pBody+j));
2083 border = _mm_subs_epu8(border,body);
2084 __m128i srchi = border;
2085 border = _mm_unpacklo_epi8(border, zero);
2086 srchi = _mm_unpackhi_epi8(srchi, zero);
2087 border = _mm_mullo_epi16(border, color_alpha_128);
2088 srchi = _mm_mullo_epi16(srchi, color_alpha_128);
2089 border = _mm_srli_epi16(border, 6);
2090 srchi = _mm_srli_epi16(srchi, 6);
2091 border = _mm_packus_epi16(border, srchi);
2092 _mm_storeu_si128(reinterpret_cast<__m128i*>(dst+j), border);
2094 for( ;j<x_end0;j+=4)
2096 __m64 border = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBorder+j));
2097 __m64 body = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBody+j));
2098 border = _mm_subs_pu8(border, body);
2099 __m64 zero = _mm_setzero_si64();
2100 border = _mm_unpacklo_pi8(border, zero);
2101 border = _mm_mullo_pi16(border, color_alpha_64);
2102 border = _mm_srli_pi16(border, 6);
2103 border = _mm_packs_pu16(border,border);
2104 *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(border);
2106 for( ;j<x_end;j++)
2108 int temp = pBorder[j]-pBody[j];
2109 temp = temp<0 ? 0 : temp;
2110 dst[j] = (temp * color_alpha)>>6;
2112 pBody += mOverlayPitch;
2113 pBorder += mOverlayPitch;
2114 //pAlphaMask += pitch;
2115 dst += mOverlayPitch;
2118 else if( ((pBody==NULL) + (pBorder==NULL))==1 && pAlphaMask==NULL)
2120 const BYTE* src1 = pBody!=NULL ? pBody : pBorder;
2121 while(h--)
2123 int j=0;
2124 for( ; j<x0; j++ )
2126 dst[j] = (src1[j] * color_alpha)>>6;
2128 for( ;j<x00;j+=4 )
2130 __m64 src = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(src1+j));
2131 __m64 zero = _mm_setzero_si64();
2132 src = _mm_unpacklo_pi8(src, zero);
2133 src = _mm_mullo_pi16(src, color_alpha_64);
2134 src = _mm_srli_pi16(src, 6);
2135 src = _mm_packs_pu16(src,src);
2136 *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(src);
2138 __m128i zero = _mm_setzero_si128();
2139 for( ;j<x_end00;j+=16)
2141 __m128i src = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src1+j));
2142 __m128i srchi = src;
2143 src = _mm_unpacklo_epi8(src, zero);
2144 srchi = _mm_unpackhi_epi8(srchi, zero);
2145 src = _mm_mullo_epi16(src, color_alpha_128);
2146 srchi = _mm_mullo_epi16(srchi, color_alpha_128);
2147 src = _mm_srli_epi16(src, 6);
2148 srchi = _mm_srli_epi16(srchi, 6);
2149 src = _mm_packus_epi16(src, srchi);
2150 _mm_storeu_si128(reinterpret_cast<__m128i*>(dst+j), src);
2152 for( ;j<x_end0;j+=4)
2154 __m64 src = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(src1+j));
2155 __m64 zero = _mm_setzero_si64();
2156 src = _mm_unpacklo_pi8(src, zero);
2157 src = _mm_mullo_pi16(src, color_alpha_64);
2158 src = _mm_srli_pi16(src, 6);
2159 src = _mm_packs_pu16(src,src);
2160 *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(src);
2162 for( ;j<x_end;j++)
2164 dst[j] = (src1[j] * color_alpha)>>6;
2166 src1 += mOverlayPitch;
2167 //pAlphaMask += pitch;
2168 dst += mOverlayPitch;
2171 else if( ((pBody==NULL) + (pBorder==NULL))==1 && pAlphaMask!=NULL)
2173 const BYTE* src1 = pBody!=NULL ? pBody : pBorder;
2174 while(h--)
2176 int j=0;
2177 for( ; j<x0; j++ )
2179 dst[j] = (src1[j] * pAlphaMask[j] * color_alpha)>>12;
2181 for( ;j<x00;j+=4 )
2183 __m64 src = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(src1+j));
2184 __m64 mask = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pAlphaMask+j));
2185 __m64 zero = _mm_setzero_si64();
2186 src = _mm_unpacklo_pi8(src, zero);
2187 src = _mm_mullo_pi16(src, color_alpha_64);
2188 mask = _mm_unpacklo_pi8(zero, mask); //important!
2189 src = _mm_mulhi_pi16(src, mask); //important!
2190 src = _mm_srli_pi16(src, 12+8-16); //important!
2191 src = _mm_packs_pu16(src,src);
2192 *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(src);
2194 __m128i zero = _mm_setzero_si128();
2195 for( ;j<x_end00;j+=16)
2197 __m128i src = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src1+j));
2198 __m128i mask = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pAlphaMask+j));
2199 __m128i srchi = src;
2200 __m128i maskhi = mask;
2201 src = _mm_unpacklo_epi8(src, zero);
2202 srchi = _mm_unpackhi_epi8(srchi, zero);
2203 mask = _mm_unpacklo_epi8(zero, mask); //important!
2204 maskhi = _mm_unpackhi_epi8(zero, maskhi);
2205 src = _mm_mullo_epi16(src, color_alpha_128);
2206 srchi = _mm_mullo_epi16(srchi, color_alpha_128);
2207 src = _mm_mulhi_epu16(src, mask); //important!
2208 srchi = _mm_mulhi_epu16(srchi, maskhi);
2209 src = _mm_srli_epi16(src, 12+8-16); //important!
2210 srchi = _mm_srli_epi16(srchi, 12+8-16);
2211 src = _mm_packus_epi16(src, srchi);
2212 _mm_storeu_si128(reinterpret_cast<__m128i*>(dst+j), src);
2214 for( ;j<x_end0;j+=4)
2216 __m64 src = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(src1+j));
2217 __m64 mask = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pAlphaMask+j));
2218 __m64 zero = _mm_setzero_si64();
2219 src = _mm_unpacklo_pi8(src, zero);
2220 src = _mm_mullo_pi16(src, color_alpha_64);
2221 mask = _mm_unpacklo_pi8(zero, mask); //important!
2222 src = _mm_mulhi_pi16(src, mask); //important!
2223 src = _mm_srli_pi16(src, 12+8-16); //important!
2224 src = _mm_packs_pu16(src,src);
2225 *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(src);
2227 for( ;j<x_end;j++)
2229 dst[j] = (src1[j] * pAlphaMask[j] * color_alpha)>>12;
2231 src1 += mOverlayPitch;
2232 pAlphaMask += pitch;
2233 dst += mOverlayPitch;
2236 else if( pAlphaMask!=NULL && pBody!=NULL && pBorder!=NULL )
2238 while(h--)
2240 int j=0;
2241 for( ; j<x0; j++ )
2243 int temp = pBorder[j]-pBody[j];
2244 temp = temp<0 ? 0 : temp;
2245 dst[j] = (temp * pAlphaMask[j] * color_alpha)>>12;
2247 for( ;j<x00;j+=4 )
2249 __m64 border = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBorder+j));
2250 __m64 body = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBody+j));
2251 border = _mm_subs_pu8(border, body);
2252 __m64 mask = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pAlphaMask+j));
2253 __m64 zero = _mm_setzero_si64();
2254 border = _mm_unpacklo_pi8(border, zero);
2255 border = _mm_mullo_pi16(border, color_alpha_64);
2256 mask = _mm_unpacklo_pi8(zero, mask); //important!
2257 border = _mm_mulhi_pi16(border, mask); //important!
2258 border = _mm_srli_pi16(border, 12+8-16); //important!
2259 border = _mm_packs_pu16(border,border);
2260 *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(border);
2262 __m128i zero = _mm_setzero_si128();
2263 for( ;j<x_end00;j+=16)
2265 __m128i border = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pBorder+j));
2266 __m128i body = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pBody+j));
2267 border = _mm_subs_epu8(border,body);
2269 __m128i mask = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pAlphaMask+j));
2270 __m128i srchi = border;
2271 __m128i maskhi = mask;
2272 border = _mm_unpacklo_epi8(border, zero);
2273 srchi = _mm_unpackhi_epi8(srchi, zero);
2274 mask = _mm_unpacklo_epi8(zero, mask); //important!
2275 maskhi = _mm_unpackhi_epi8(zero, maskhi);
2276 border = _mm_mullo_epi16(border, color_alpha_128);
2277 srchi = _mm_mullo_epi16(srchi, color_alpha_128);
2278 border = _mm_mulhi_epu16(border, mask); //important!
2279 srchi = _mm_mulhi_epu16(srchi, maskhi);
2280 border = _mm_srli_epi16(border, 12+8-16); //important!
2281 srchi = _mm_srli_epi16(srchi, 12+8-16);
2282 border = _mm_packus_epi16(border, srchi);
2283 _mm_storeu_si128(reinterpret_cast<__m128i*>(dst+j), border);
2285 for( ;j<x_end0;j+=4)
2287 __m64 border = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBorder+j));
2288 __m64 body = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBody+j));
2289 border = _mm_subs_pu8(border, body);
2290 __m64 mask = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pAlphaMask+j));
2291 __m64 zero = _mm_setzero_si64();
2292 border = _mm_unpacklo_pi8(border, zero);
2293 border = _mm_mullo_pi16(border, color_alpha_64);
2294 mask = _mm_unpacklo_pi8(zero, mask); //important!
2295 border = _mm_mulhi_pi16(border, mask); //important!
2296 border = _mm_srli_pi16(border, 12+8-16); //important!
2297 border = _mm_packs_pu16(border,border);
2298 *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(border);
2300 for( ;j<x_end;j++)
2302 int temp = pBorder[j]-pBody[j];
2303 temp = temp<0 ? 0 : temp;
2304 dst[j] = (temp * pAlphaMask[j] * color_alpha)>>12;
2306 pBody += mOverlayPitch;
2307 pBorder += mOverlayPitch;
2308 pAlphaMask += pitch;
2309 dst += mOverlayPitch;
2312 else
2314 //should NOT happen!
2315 ASSERT(0);
2316 while(h--)
2318 for(int j=0;j<x_end;j++)
2320 dst[j] = 0;
2322 dst += mOverlayPitch;
2325 _mm_empty();
2327 else
2329 _DoFillAlphaMash_c(outputAlphaMask, pBody, pBorder, x, y, w, h, pAlphaMask, pitch, color_alpha);
2330 return;
2334 void Overlay::_DoFillAlphaMash_c(byte* outputAlphaMask, const byte* pBody, const byte* pBorder, int x, int y, int w, int h,
2335 const byte* pAlphaMask, int pitch, DWORD color_alpha )
2337 pBody = pBody!=NULL ? pBody + y*mOverlayPitch + x: NULL;
2338 pBorder = pBorder!=NULL ? pBorder + y*mOverlayPitch + x: NULL;
2339 byte* dst = outputAlphaMask + y*mOverlayPitch + x;
2341 if(pAlphaMask==NULL && pBody!=NULL && pBorder!=NULL)
2343 while(h--)
2345 int j=0;
2346 for( ;j<w;j++)
2348 int temp = pBorder[j]-pBody[j];
2349 temp = temp<0 ? 0 : temp;
2350 dst[j] = (temp * color_alpha)>>6;
2352 pBody += mOverlayPitch;
2353 pBorder += mOverlayPitch;
2354 //pAlphaMask += pitch;
2355 dst += mOverlayPitch;
2358 else if( ((pBody==NULL) + (pBorder==NULL))==1 && pAlphaMask==NULL)
2360 const BYTE* src1 = pBody!=NULL ? pBody : pBorder;
2361 while(h--)
2363 int j=0;
2364 for( ; j<w; j++ )
2366 dst[j] = (src1[j] * color_alpha)>>6;
2368 src1 += mOverlayPitch;
2369 //pAlphaMask += pitch;
2370 dst += mOverlayPitch;
2373 else if( ((pBody==NULL) + (pBorder==NULL))==1 && pAlphaMask!=NULL)
2375 const BYTE* src1 = pBody!=NULL ? pBody : pBorder;
2376 while(h--)
2378 int j=0;
2379 for( ; j<w; j++ )
2381 dst[j] = (src1[j] * pAlphaMask[j] * color_alpha)>>12;
2383 src1 += mOverlayPitch;
2384 pAlphaMask += pitch;
2385 dst += mOverlayPitch;
2388 else if( pAlphaMask!=NULL && pBody!=NULL && pBorder!=NULL )
2390 while(h--)
2392 int j=0;
2393 for( ; j<w; j++ )
2395 int temp = pBorder[j]-pBody[j];
2396 temp = temp<0 ? 0 : temp;
2397 dst[j] = (temp * pAlphaMask[j] * color_alpha)>>12;
2399 pBody += mOverlayPitch;
2400 pBorder += mOverlayPitch;
2401 pAlphaMask += pitch;
2402 dst += mOverlayPitch;
2405 else
2407 //should NOT happen!
2408 ASSERT(0);
2409 while(h--)
2411 for(int j=0;j<w;j++)
2413 dst[j] = 0;
2415 dst += mOverlayPitch;
2420 void Overlay::FillAlphaMash( byte* outputAlphaMask, bool fBody, bool fBorder, int x, int y, int w, int h, const byte* pAlphaMask, int pitch, DWORD color_alpha)
2422 if(!fBorder && fBody && pAlphaMask==NULL)
2424 _DoFillAlphaMash(outputAlphaMask, mBody.get(), NULL, x, y, w, h, pAlphaMask, pitch, color_alpha);
2426 else if(/*fBorder &&*/ fBody && pAlphaMask==NULL)
2428 _DoFillAlphaMash(outputAlphaMask, NULL, mBorder.get(), x, y, w, h, pAlphaMask, pitch, color_alpha);
2430 else if(!fBody && fBorder /* pAlphaMask==NULL or not*/)
2432 _DoFillAlphaMash(outputAlphaMask, mBody.get(), mBorder.get(), x, y, w, h, pAlphaMask, pitch, color_alpha);
2434 else if(!fBorder && fBody && pAlphaMask!=NULL)
2436 _DoFillAlphaMash(outputAlphaMask, mBody.get(), NULL, x, y, w, h, pAlphaMask, pitch, color_alpha);
2438 else if(fBorder && fBody && pAlphaMask!=NULL)
2440 _DoFillAlphaMash(outputAlphaMask, NULL, mBorder.get(), x, y, w, h, pAlphaMask, pitch, color_alpha);
2442 else
2444 //should NOT happen
2445 ASSERT(0);
2449 Overlay* Overlay::GetSubpixelVariance(unsigned int xshift, unsigned int yshift)
2451 Overlay* overlay = new Overlay();
2452 if(!overlay)
2454 return NULL;
2456 xshift &= 7;
2457 yshift &= 7;
2459 overlay->mOffsetX = mOffsetX - xshift;
2460 overlay->mOffsetY = mOffsetY - yshift;
2461 overlay->mWidth = mWidth + xshift;
2462 overlay->mHeight = mHeight + yshift;
2464 overlay->mOverlayWidth = ((overlay->mWidth+7)>>3) + 1;
2465 overlay->mOverlayHeight = ((overlay->mHeight + 7)>>3) + 1;
2466 overlay->mOverlayPitch = (overlay->mOverlayWidth+15)&~15;
2469 overlay->mfWideOutlineEmpty = mfWideOutlineEmpty;
2471 if (overlay->mOverlayPitch * overlay->mOverlayHeight<=0)
2473 return NULL;
2476 BYTE* body = reinterpret_cast<BYTE*>(xy_malloc(overlay->mOverlayPitch * overlay->mOverlayHeight));
2477 if( body==NULL )
2479 return NULL;
2481 overlay->mBody.reset(body, xy_free);
2482 memset(body, 0, overlay->mOverlayPitch*overlay->mOverlayHeight);
2483 BYTE* border = NULL;
2484 if (!overlay->mfWideOutlineEmpty)
2486 border = reinterpret_cast<BYTE*>(xy_malloc(overlay->mOverlayPitch * overlay->mOverlayHeight));
2487 if (border==NULL)
2489 return NULL;
2491 overlay->mBorder.reset(border, xy_free);
2492 memset(border, 0, overlay->mOverlayPitch*overlay->mOverlayHeight);
2495 if( overlay->mOverlayPitch==mOverlayPitch && overlay->mOverlayWidth==mOverlayWidth &&
2496 overlay->mOverlayHeight>=mOverlayHeight )
2498 if (body && mBody)
2500 memcpy(body, mBody.get(), mOverlayPitch * mOverlayHeight);
2502 else if ( (!!body)!=(!!mBody)/*==NULL*/)
2504 return NULL;
2507 if (border && mBorder)
2509 memcpy(border, mBorder.get(), mOverlayPitch * mOverlayHeight);
2511 else if ( (!!border)!=(!!mBorder)/*==NULL*/ )
2513 return NULL;
2516 else
2518 byte* dst = body;
2519 const byte* src = mBody.get();
2520 for (int i=0;i<mOverlayHeight;i++)
2522 memcpy(dst, src, mOverlayWidth);
2523 dst += overlay->mOverlayPitch;
2524 src += mOverlayPitch;
2526 if (!overlay->mfWideOutlineEmpty)
2528 ASSERT(border && mBorder);
2529 dst = border;
2530 src = mBorder.get();
2531 for (int i=0;i<mOverlayHeight;i++)
2533 memcpy(dst, src, mOverlayWidth);
2534 dst += overlay->mOverlayPitch;
2535 src += mOverlayPitch;
2539 //not equal
2540 // Bilinear(overlay->mpOverlayBuffer.base, overlay->mOverlayWidth, 2*overlay->mOverlayHeight, overlay->mOverlayPitch, xshift, yshift);
2541 Bilinear(body, overlay->mOverlayWidth, overlay->mOverlayHeight, overlay->mOverlayPitch, xshift, yshift);
2542 if (!overlay->mfWideOutlineEmpty)
2544 Bilinear(border, overlay->mOverlayWidth, overlay->mOverlayHeight, overlay->mOverlayPitch, xshift, yshift);
2546 return overlay;
2549 ///////////////////////////////////////////////////////////////
2551 // PathData
2553 PathData::PathData():mpPathTypes(NULL), mpPathPoints(NULL), mPathPoints(0)
2557 PathData::PathData( const PathData& src ):mpPathTypes(NULL), mpPathPoints(NULL), mPathPoints(src.mPathPoints)
2559 //TODO: deal with the case that src.mPathPoints<0
2560 if(mPathPoints>0)
2562 mpPathTypes = static_cast<BYTE*>(malloc(mPathPoints * sizeof(BYTE)));
2563 mpPathPoints = static_cast<POINT*>(malloc(mPathPoints * sizeof(POINT)));
2565 if(mPathPoints>0)
2567 memcpy(mpPathTypes, src.mpPathTypes, mPathPoints*sizeof(BYTE));
2568 memcpy(mpPathPoints, src.mpPathPoints, mPathPoints*sizeof(POINT));
2572 const PathData& PathData::operator=( const PathData& src )
2574 if(this!=&src)
2576 if(mPathPoints!=src.mPathPoints && src.mPathPoints>0)
2578 _TrashPath();
2579 mPathPoints = src.mPathPoints;
2580 mpPathTypes = static_cast<BYTE*>(malloc(mPathPoints * sizeof(BYTE)));
2581 mpPathPoints = static_cast<POINT*>(malloc(mPathPoints * sizeof(POINT)));//better than realloc
2583 if(src.mPathPoints>0)
2585 memcpy(mpPathTypes, src.mpPathTypes, mPathPoints*sizeof(BYTE));
2586 memcpy(mpPathPoints, src.mpPathPoints, mPathPoints*sizeof(POINT));
2589 return *this;
2592 PathData::~PathData()
2594 _TrashPath();
2597 bool PathData::operator==( const PathData& rhs ) const
2599 return (this==&rhs) || (
2600 mPathPoints==rhs.mPathPoints
2601 && !memcmp(mpPathTypes, rhs.mpPathTypes, mPathPoints * sizeof(BYTE) )
2602 && !memcmp(mpPathPoints, rhs.mpPathPoints, mPathPoints * sizeof(POINT) )
2606 void PathData::_TrashPath()
2608 if (mpPathTypes)
2610 free(mpPathTypes);
2611 mpPathTypes = NULL;
2613 if (mpPathPoints)
2615 free(mpPathPoints);
2616 mpPathPoints = NULL;
2618 mPathPoints = 0;
2621 bool PathData::BeginPath(HDC hdc)
2623 _TrashPath();
2624 return !!::BeginPath(hdc);
2627 bool PathData::EndPath(HDC hdc)
2629 ::CloseFigure(hdc);
2630 if(::EndPath(hdc))
2632 mPathPoints = GetPath(hdc, NULL, NULL, 0);
2633 if(!mPathPoints)
2634 return true;
2635 mpPathTypes = (BYTE*)malloc(sizeof(BYTE) * mPathPoints);
2636 mpPathPoints = (POINT*)malloc(sizeof(POINT) * mPathPoints);
2637 if(mPathPoints == GetPath(hdc, mpPathPoints, mpPathTypes, mPathPoints))
2638 return true;
2640 ::AbortPath(hdc);
2641 return false;
2644 bool PathData::PartialBeginPath(HDC hdc, bool bClearPath)
2646 if(bClearPath)
2647 _TrashPath();
2648 return !!::BeginPath(hdc);
2651 bool PathData::PartialEndPath(HDC hdc, long dx, long dy)
2653 ::CloseFigure(hdc);
2654 if(::EndPath(hdc))
2656 int nPoints;
2657 BYTE* pNewTypes;
2658 POINT* pNewPoints;
2659 nPoints = GetPath(hdc, NULL, NULL, 0);
2660 if(!nPoints)
2661 return true;
2662 pNewTypes = (BYTE*)realloc(mpPathTypes, (mPathPoints + nPoints) * sizeof(BYTE));
2663 pNewPoints = (POINT*)realloc(mpPathPoints, (mPathPoints + nPoints) * sizeof(POINT));
2664 if(pNewTypes)
2665 mpPathTypes = pNewTypes;
2666 if(pNewPoints)
2667 mpPathPoints = pNewPoints;
2668 BYTE* pTypes = new BYTE[nPoints];
2669 POINT* pPoints = new POINT[nPoints];
2670 if(pNewTypes && pNewPoints && nPoints == GetPath(hdc, pPoints, pTypes, nPoints))
2672 for(int i = 0; i < nPoints; ++i)
2674 mpPathPoints[mPathPoints + i].x = pPoints[i].x + dx;
2675 mpPathPoints[mPathPoints + i].y = pPoints[i].y + dy;
2676 mpPathTypes[mPathPoints + i] = pTypes[i];
2678 mPathPoints += nPoints;
2679 delete[] pTypes;
2680 delete[] pPoints;
2681 return true;
2683 else
2684 DebugBreak();
2685 delete[] pTypes;
2686 delete[] pPoints;
2688 ::AbortPath(hdc);
2689 return false;
2692 void PathData::AlignLeftTop(CPoint *left_top, CSize *size)
2694 int minx = INT_MAX;
2695 int miny = INT_MAX;
2696 int maxx = INT_MIN;
2697 int maxy = INT_MIN;
2698 for(int i=0; i<mPathPoints; ++i)
2700 int ix = mpPathPoints[i].x;
2701 int iy = mpPathPoints[i].y;
2702 if(ix < minx) minx = ix;
2703 if(ix > maxx) maxx = ix;
2704 if(iy < miny) miny = iy;
2705 if(iy > maxy) maxy = iy;
2707 if(minx > maxx || miny > maxy)
2709 _TrashPath();
2710 *left_top = CPoint(0, 0);
2711 *size = CSize(0, 0);
2712 return;
2714 minx = (minx >> 3) & ~7;
2715 miny = (miny >> 3) & ~7;
2716 maxx = (maxx + 7) >> 3;
2717 maxy = (maxy + 7) >> 3;
2718 for(int i=0; i<mPathPoints; ++i)
2720 mpPathPoints[i].x -= minx*8;
2721 mpPathPoints[i].y -= miny*8;
2723 *left_top = CPoint(minx, miny);
2724 *size = CSize(maxx+1-minx, maxy+1-miny);
2725 return;
2728 //////////////////////////////////////////////////////////////////////////
2730 // ScanLineData
2732 ScanLineData::ScanLineData()
2736 ScanLineData::~ScanLineData()
2740 void ScanLineData::_ReallocEdgeBuffer(int edges)
2742 mEdgeHeapSize = edges;
2743 mpEdgeBuffer = (Edge*)realloc(mpEdgeBuffer, sizeof(Edge)*edges);
2746 void ScanLineData::_EvaluateBezier(const PathData& path_data, int ptbase, bool fBSpline)
2748 const POINT* pt0 = path_data.mpPathPoints + ptbase;
2749 const POINT* pt1 = path_data.mpPathPoints + ptbase + 1;
2750 const POINT* pt2 = path_data.mpPathPoints + ptbase + 2;
2751 const POINT* pt3 = path_data.mpPathPoints + ptbase + 3;
2752 double x0 = pt0->x;
2753 double x1 = pt1->x;
2754 double x2 = pt2->x;
2755 double x3 = pt3->x;
2756 double y0 = pt0->y;
2757 double y1 = pt1->y;
2758 double y2 = pt2->y;
2759 double y3 = pt3->y;
2760 double cx3, cx2, cx1, cx0, cy3, cy2, cy1, cy0;
2761 if(fBSpline)
2763 // 1 [-1 +3 -3 +1]
2764 // - * [+3 -6 +3 0]
2765 // 6 [-3 0 +3 0]
2766 // [+1 +4 +1 0]
2767 double _1div6 = 1.0/6.0;
2768 cx3 = _1div6*(- x0+3*x1-3*x2+x3);
2769 cx2 = _1div6*( 3*x0-6*x1+3*x2);
2770 cx1 = _1div6*(-3*x0 +3*x2);
2771 cx0 = _1div6*( x0+4*x1+1*x2);
2772 cy3 = _1div6*(- y0+3*y1-3*y2+y3);
2773 cy2 = _1div6*( 3*y0-6*y1+3*y2);
2774 cy1 = _1div6*(-3*y0 +3*y2);
2775 cy0 = _1div6*( y0+4*y1+1*y2);
2777 else // bezier
2779 // [-1 +3 -3 +1]
2780 // [+3 -6 +3 0]
2781 // [-3 +3 0 0]
2782 // [+1 0 0 0]
2783 cx3 = - x0+3*x1-3*x2+x3;
2784 cx2 = 3*x0-6*x1+3*x2;
2785 cx1 = -3*x0+3*x1;
2786 cx0 = x0;
2787 cy3 = - y0+3*y1-3*y2+y3;
2788 cy2 = 3*y0-6*y1+3*y2;
2789 cy1 = -3*y0+3*y1;
2790 cy0 = y0;
2793 // This equation is from Graphics Gems I.
2795 // The idea is that since we're approximating a cubic curve with lines,
2796 // any error we incur is due to the curvature of the line, which we can
2797 // estimate by calculating the maximum acceleration of the curve. For
2798 // a cubic, the acceleration (second derivative) is a line, meaning that
2799 // the absolute maximum acceleration must occur at either the beginning
2800 // (|c2|) or the end (|c2+c3|). Our bounds here are a little more
2801 // conservative than that, but that's okay.
2803 // If the acceleration of the parametric formula is zero (c2 = c3 = 0),
2804 // that component of the curve is linear and does not incur any error.
2805 // If a=0 for both X and Y, the curve is a line segment and we can
2806 // use a step size of 1.
2807 double maxaccel1 = fabs(2*cy2) + fabs(6*cy3);
2808 double maxaccel2 = fabs(2*cx2) + fabs(6*cx3);
2809 double maxaccel = maxaccel1 > maxaccel2 ? maxaccel1 : maxaccel2;
2810 double h = 1.0;
2811 if(maxaccel > 8.0) h = sqrt(8.0 / maxaccel);
2812 if(!fFirstSet) {firstp.x = (LONG)cx0; firstp.y = (LONG)cy0; lastp = firstp; fFirstSet = true;}
2813 for(double t = 0; t < 1.0; t += h)
2815 double x = cx0 + t*(cx1 + t*(cx2 + t*cx3));
2816 double y = cy0 + t*(cy1 + t*(cy2 + t*cy3));
2817 _EvaluateLine(lastp.x, lastp.y, (int)x, (int)y);
2819 double x = cx0 + cx1 + cx2 + cx3;
2820 double y = cy0 + cy1 + cy2 + cy3;
2821 _EvaluateLine(lastp.x, lastp.y, (int)x, (int)y);
2824 void ScanLineData::_EvaluateLine(const PathData& path_data, int pt1idx, int pt2idx)
2826 const POINT* pt1 = path_data.mpPathPoints + pt1idx;
2827 const POINT* pt2 = path_data.mpPathPoints + pt2idx;
2828 _EvaluateLine(pt1->x, pt1->y, pt2->x, pt2->y);
2831 void ScanLineData::_EvaluateLine(int x0, int y0, int x1, int y1)
2833 if(lastp.x != x0 || lastp.y != y0)
2835 _EvaluateLine(lastp.x, lastp.y, x0, y0);
2837 if(!fFirstSet) {firstp.x = x0; firstp.y = y0; fFirstSet = true;}
2838 lastp.x = x1;
2839 lastp.y = y1;
2840 if(y1 > y0) // down
2842 __int64 xacc = (__int64)x0 << 13;
2843 // prestep y0 down
2844 int dy = y1 - y0;
2845 int y = ((y0 + 3)&~7) + 4;
2846 int iy = y >> 3;
2847 y1 = (y1 - 5) >> 3;
2848 if(iy <= y1)
2850 __int64 invslope = (__int64(x1 - x0) << 16) / dy;
2851 while(mEdgeNext + y1 + 1 - iy > mEdgeHeapSize)
2852 _ReallocEdgeBuffer(mEdgeHeapSize*2);
2853 xacc += (invslope * (y - y0)) >> 3;
2854 while(iy <= y1)
2856 int ix = (int)((xacc + 32768) >> 16);
2857 mpEdgeBuffer[mEdgeNext].next = mpScanBuffer[iy];
2858 mpEdgeBuffer[mEdgeNext].posandflag = ix*2 + 1;
2859 mpScanBuffer[iy] = mEdgeNext++;
2860 ++iy;
2861 xacc += invslope;
2865 else if(y1 < y0) // up
2867 __int64 xacc = (__int64)x1 << 13;
2868 // prestep y1 down
2869 int dy = y0 - y1;
2870 int y = ((y1 + 3)&~7) + 4;
2871 int iy = y >> 3;
2872 y0 = (y0 - 5) >> 3;
2873 if(iy <= y0)
2875 __int64 invslope = (__int64(x0 - x1) << 16) / dy;
2876 while(mEdgeNext + y0 + 1 - iy > mEdgeHeapSize)
2877 _ReallocEdgeBuffer(mEdgeHeapSize*2);
2878 xacc += (invslope * (y - y1)) >> 3;
2879 while(iy <= y0)
2881 int ix = (int)((xacc + 32768) >> 16);
2882 mpEdgeBuffer[mEdgeNext].next = mpScanBuffer[iy];
2883 mpEdgeBuffer[mEdgeNext].posandflag = ix*2;
2884 mpScanBuffer[iy] = mEdgeNext++;
2885 ++iy;
2886 xacc += invslope;
2892 bool ScanLineData::ScanConvert(const PathData& path_data, const CSize& size)
2894 int lastmoveto = -1;
2895 int i;
2896 // Drop any outlines we may have.
2897 mOutline.clear();
2898 // Determine bounding box
2899 if(!path_data.mPathPoints)
2901 mWidth = mHeight = 0;
2902 return false;
2904 mWidth = size.cx;
2905 mHeight = size.cy;
2906 // Initialize edge buffer. We use edge 0 as a sentinel.
2907 mEdgeNext = 1;
2908 mEdgeHeapSize = 2048;
2909 mpEdgeBuffer = (Edge*)malloc(sizeof(Edge)*mEdgeHeapSize);
2910 // Initialize scanline list.
2911 mpScanBuffer = new unsigned int[mHeight];
2912 memset(mpScanBuffer, 0, mHeight*sizeof(unsigned int));
2913 // Scan convert the outline. Yuck, Bezier curves....
2914 // Unfortunately, Windows 95/98 GDI has a bad habit of giving us text
2915 // paths with all but the first figure left open, so we can't rely
2916 // on the PT_CLOSEFIGURE flag being used appropriately.
2917 fFirstSet = false;
2918 firstp.x = firstp.y = 0;
2919 lastp.x = lastp.y = 0;
2920 for(i=0; i<path_data.mPathPoints; ++i)
2922 BYTE t = path_data.mpPathTypes[i] & ~PT_CLOSEFIGURE;
2923 switch(t)
2925 case PT_MOVETO:
2926 if(lastmoveto >= 0 && firstp != lastp)
2927 _EvaluateLine(lastp.x, lastp.y, firstp.x, firstp.y);
2928 lastmoveto = i;
2929 fFirstSet = false;
2930 lastp = path_data.mpPathPoints[i];
2931 break;
2932 case PT_MOVETONC:
2933 break;
2934 case PT_LINETO:
2935 if(path_data.mPathPoints - (i-1) >= 2) _EvaluateLine(path_data, i-1, i);
2936 break;
2937 case PT_BEZIERTO:
2938 if(path_data.mPathPoints - (i-1) >= 4) _EvaluateBezier(path_data, i-1, false);
2939 i += 2;
2940 break;
2941 case PT_BSPLINETO:
2942 if(path_data.mPathPoints - (i-1) >= 4) _EvaluateBezier(path_data, i-1, true);
2943 i += 2;
2944 break;
2945 case PT_BSPLINEPATCHTO:
2946 if(path_data.mPathPoints - (i-3) >= 4) _EvaluateBezier(path_data, i-3, true);
2947 break;
2950 if(lastmoveto >= 0 && firstp != lastp)
2951 _EvaluateLine(lastp.x, lastp.y, firstp.x, firstp.y);
2952 // Convert the edges to spans. We couldn't do this before because some of
2953 // the regions may have winding numbers >+1 and it would have been a pain
2954 // to try to adjust the spans on the fly. We use one heap to detangle
2955 // a scanline's worth of edges from the singly-linked lists, and another
2956 // to collect the actual scans.
2957 std::vector<int> heap;
2958 mOutline.reserve(mEdgeNext / 2);
2959 __int64 y = 0;
2960 for(y=0; y<mHeight; ++y)
2962 int count = 0;
2963 // Detangle scanline into edge heap.
2964 for(unsigned ptr = (unsigned)(mpScanBuffer[y]&0xffffffff); ptr; ptr = mpEdgeBuffer[ptr].next)
2966 heap.push_back(mpEdgeBuffer[ptr].posandflag);
2968 // Sort edge heap. Note that we conveniently made the opening edges
2969 // one more than closing edges at the same spot, so we won't have any
2970 // problems with abutting spans.
2971 std::sort(heap.begin(), heap.end()/*begin() + heap.size()*/);
2972 // Process edges and add spans. Since we only check for a non-zero
2973 // winding number, it doesn't matter which way the outlines go!
2974 std::vector<int>::iterator itX1 = heap.begin();
2975 std::vector<int>::iterator itX2 = heap.end(); // begin() + heap.size();
2976 int x1, x2;
2977 for(; itX1 != itX2; ++itX1)
2979 int x = *itX1;
2980 if(!count)
2981 x1 = (x>>1);
2982 if(x&1)
2983 ++count;
2984 else
2985 --count;
2986 if(!count)
2988 x2 = (x>>1);
2989 if(x2>x1)
2990 mOutline.push_back(std::pair<__int64,__int64>((y<<32)+x1+0x4000000040000000i64, (y<<32)+x2+0x4000000040000000i64)); // G: damn Avery, this is evil! :)
2993 heap.clear();
2995 // Dump the edge and scan buffers, since we no longer need them.
2996 free(mpEdgeBuffer);
2997 delete [] mpScanBuffer;
2998 // All done!
2999 return true;
3002 void ScanLineData::DeleteOutlines()
3004 mOutline.clear();
3007 bool ScanLineData2::CreateWidenedRegion(int rx, int ry)
3009 if(rx < 0) rx = 0;
3010 if(ry < 0) ry = 0;
3011 mWideBorder = max(rx,ry);
3012 mWideOutline.clear();
3014 const tSpanBuffer& out_line = m_scan_line_data->mOutline;
3015 if (ry > 0)
3017 WidenRegionCreater *widen_region_creater = WidenRegionCreater::GetDefaultWidenRegionCreater();
3018 widen_region_creater->xy_overlap_region(&mWideOutline, out_line, rx, ry);
3020 else if (ry == 0 && rx > 0)
3022 // There are artifacts if we don't make at least two overlaps of the line, even at same Y coord
3023 OverlapRegion(mWideOutline, out_line, rx, 0);
3024 OverlapRegion(mWideOutline, out_line, rx, 0);
3026 return true;