Minor refactor.
[xy_vsfilter.git] / src / subtitles / Rasterizer.cpp
blob32cdebd6752f8d9b7390e072eed624feded290bc
1 /*
2 * Copyright (C) 2003-2006 Gabest
3 * http://www.gabest.org
5 * This Program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2, or (at your option)
8 * any later version.
10 * This Program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with GNU Make; see the file COPYING. If not, write to
17 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
18 * http://www.gnu.org/copyleft/gpl.html
22 #include "stdafx.h"
23 #include <string.h>
24 #include <math.h>
25 #include <vector>
26 #include <algorithm>
27 #include "Rasterizer.h"
28 #include "SeparableFilter.h"
29 #include "xy_logger.h"
30 #include <boost/flyweight/key_value.hpp>
31 #include "xy_bitmap.h"
// Min/max helper macros. When _MAX is not already defined, _MAX/_MIN and the
// _IMPL_ variants are mapped onto std::max/std::min; otherwise the _IMPL_
// variants simply forward to the pre-existing _MAX/_MIN.
33 #ifndef _MAX /* avoid collision with common (nonconforming) macros */
34 #define _MAX (std::max)
35 #define _MIN (std::min)
36 #define _IMPL_MAX std::max
37 #define _IMPL_MIN std::min
38 #else
39 #define _IMPL_MAX _MAX
40 #define _IMPL_MIN _MIN
41 #endif
// Packs four 8-bit channels into one 32-bit AYUV value laid out as 0xAAYYUUVV.
// NOTE: signed or unsigned affects the result seriously -- the alpha channel is
// deliberately cast to int before it is shifted.
// Every argument is parenthesized so that compound expressions (for example a
// conditional expression) can be passed without changing the grouping.
#define COMBINE_AYUV(a, y, u, v) ((((((((int)(a))<<8)|(y))<<8)|(u))<<8)|(v))

// Splits a 32-bit AYUV value (0xAAYYUUVV) into its four channels.
// a, y, u and v are pointers that receive the individual 8-bit components.
#define SPLIT_AYUV(color, a, y, u, v) do { \
        *(v)=(color)&0xff; \
        *(u)=((color)>>8) &0xff; \
        *(y)=((color)>>16)&0xff;\
        *(a)=((color)>>24)&0xff;\
    } while(0)
// Holds the precomputed integer tables for one Gaussian blur strength (sigma).
54 class ass_synth_priv
56 public:
// Fixed-point scale of the kernel: generate_tables normalises the kernel weights
// so that they sum to roughly 1 << VOLUME_BITS.
57 static const int VOLUME_BITS = 22;//should not exceed 32-8, and better not exceed 31-8
// Starts empty and then calls generate_tables(sigma).
59 ass_synth_priv(const double sigma);
// Copies the kernel geometry and both tables.
60 ass_synth_priv(const ass_synth_priv& priv);
// Frees g and gt2.
62 ~ass_synth_priv();
// Rebuilds g and gt2 for the given sigma. Returns 0 on success (or when sigma is
// unchanged) and -1 when a table allocation fails.
63 int generate_tables(double sigma);
// Kernel radius, set to g_w / 2 by generate_tables.
65 int g_r;
// Number of kernel taps; generate_tables forces it to be odd.
66 int g_w;
// Integer kernel weights (g_w entries).
68 unsigned *g;
// Multiplication table: gt2[g_w * i + mx] == g[mx] * i for i in 0..255.
69 unsigned *gt2;
// Sigma the current tables were generated for.
71 double sigma;
// Key extractor: returns the sigma an ass_synth_priv was built for.
// Rasterizer::Blur passes it as the key-extraction functor of its flyweight.
74 struct ass_synth_priv_key
76 const double& operator()(const ass_synth_priv& x)const
78 return x.sigma;
// Scratch buffer of `size` unsigned values, allocated with malloc in the
// constructors and released with free in the destructor.
82 struct ass_tmp_buf
84 public:
// Allocates `size` uninitialised elements.
85 ass_tmp_buf(size_t size);
// Allocates a new buffer of the same size; the contents are not copied.
86 ass_tmp_buf(const ass_tmp_buf& buf);
87 ~ass_tmp_buf();
// Number of unsigned elements in tmp.
88 size_t size;
// The buffer itself; the malloc result is stored unchecked, so it may be NULL.
89 unsigned *tmp;
// Key extractor returning the element count of an ass_tmp_buf.
// In the visible code it is only referenced from a commented-out flyweight
// declaration in Rasterizer::Blur.
92 struct ass_tmp_buf_get_size
94 const size_t& operator()(const ass_tmp_buf& buf)const
96 return buf.size;
// File-local constants; no use of them is visible in this part of the file.
100 static const unsigned int maxcolor = 255;
101 static const unsigned base = 256;
// Starts from an empty state (no tables, sigma 0) and then builds the tables for
// the requested sigma. The return value of generate_tables is ignored.
// Because the stored sigma starts at 0, generate_tables returns early for
// sigma == 0 and both tables stay NULL in that case.
103 ass_synth_priv::ass_synth_priv(const double sigma)
105 g_r = 0;
106 g_w = 0;
108 g = NULL;
109 gt2 = NULL;
111 this->sigma = 0;
112 generate_tables(sigma);
115 ass_synth_priv::ass_synth_priv(const ass_synth_priv& priv):g_r(priv.g_r),g_w(priv.g_w),sigma(priv.sigma)
117 if (this->g_w > 0 && this != &priv) {
118 this->g = (unsigned*)realloc(this->g, this->g_w * sizeof(unsigned));
119 this->gt2 = (unsigned*)realloc(this->gt2, 256 * this->g_w * sizeof(unsigned));
120 //if (this->g == null || this->gt2 == null) {
121 // return -1;
123 memcpy(g, priv.g, this->g_w * sizeof(unsigned));
124 memcpy(gt2, priv.gt2, 256 * this->g_w * sizeof(unsigned));
// Releases both lookup tables and clears the pointers.
128 ass_synth_priv::~ass_synth_priv()
130 free(g); g=NULL;
131 free(gt2); gt2=NULL;
134 int ass_synth_priv::generate_tables(double sigma)
136 const int TARGET_VOLUME = 1<<VOLUME_BITS;
137 const int MAX_VOLUME_ERROR = VOLUME_BITS>=22 ? 16 : 1;
139 double a = -1 / (sigma * sigma * 2);
140 double exp_a = exp(a);
142 double volume_factor = 0;
143 double volume_start = 0, volume_end = 0;
144 unsigned volume;
146 if (this->sigma == sigma)
147 return 0;
148 else
149 this->sigma = sigma;
151 this->g_w = (int)ceil(sigma*3) | 1;
152 this->g_r = this->g_w / 2;
154 if (this->g_w > 0) {
155 this->g = (unsigned*)realloc(this->g, this->g_w * sizeof(unsigned));
156 this->gt2 = (unsigned*)realloc(this->gt2, 256 * this->g_w * sizeof(unsigned));
157 if (this->g == NULL || this->gt2 == NULL) {
158 return -1;
162 if (this->g_w > 0) {
163 volume_start = 0;
165 double exp_0 = 1.0;
166 double exp_1 = exp_a;
167 double exp_2 = exp_1 * exp_1;
168 volume_start += exp_0;
169 for(int i=0;i<this->g_r;++i)
171 exp_0 *= exp_1;
172 exp_1 *= exp_2;
173 volume_start += exp_0;
174 volume_start += exp_0;
176 //euqivalent:
177 // for (i = 0; i < this->g_w; ++i) {
178 // volume_start += exp(a * (i - this->g_r) * (i - this->g_r));
179 // }
181 volume_end = (TARGET_VOLUME+g_w)/volume_start;
182 volume_start = (TARGET_VOLUME-g_w)/volume_start;
184 volume = 0;
185 while( volume_start+0.000001<volume_end )
187 volume_factor = (volume_start+volume_end)*0.5;
188 volume = 0;
190 exp_0 = volume_factor;
191 exp_1 = exp_a;
192 exp_2 = exp_1 * exp_1;
194 volume = static_cast<int>(exp_0+.5);
195 this->g[this->g_r] = volume;
197 unsigned* p_left = this->g+this->g_r-1;
198 unsigned* p_right= this->g+this->g_r+1;
199 for(int i=0; i<this->g_r;++i,p_left--,p_right++)
201 exp_0 *= exp_1;
202 exp_1 *= exp_2;
203 *p_left = static_cast<int>(exp_0+.5);
204 *p_right = *p_left;
205 volume += (*p_left<<1);
207 //equivalent:
208 // for (i = 0; i < this->g_w; ++i) {
209 // this->g[i] = (unsigned) ( exp(a * (i - this->g_r) * (i - this->g_r))* volume_factor + .5 );
210 // volume += this->g[i];
211 // }
213 // volume don't have to be equal to TARGET_VOLUME,
214 // even if volume=TARGET_VOLUME+MAX_VOLUME_ERROR,
215 // max error introducing in later blur operation,
216 // which is (dot_product(g_w, pixel))/TARGET_VOLUME with pixel<256,
217 // would not exceed (MAX_VOLUME_ERROR*256)/TARGET_VOLUME,
218 // as long as MAX_VOLUME_ERROR/TARGET_VOLUME is small enough, error introduced would be kept in safe range
220 // NOTE: when it comes to rounding, no matter how small the error is,
221 // it may result a different rounding output
222 if( volume>=TARGET_VOLUME && volume< (TARGET_VOLUME+MAX_VOLUME_ERROR) )
223 break;
224 else if(volume < TARGET_VOLUME)
226 volume_start = volume_factor;
228 else if(volume >= TARGET_VOLUME+MAX_VOLUME_ERROR)
230 volume_end = volume_factor;
233 if(volume==0)
235 volume_factor = volume_end;
237 exp_0 = volume_factor;
238 exp_1 = exp_a;
239 exp_2 = exp_1 * exp_1;
241 volume = static_cast<int>(exp_0+.5);
242 this->g[this->g_r] = volume;
244 unsigned* p_left = this->g+this->g_r-1;
245 unsigned* p_right= this->g+this->g_r+1;
246 for(int i=0; i<this->g_r;++i,p_left--,p_right++)
248 exp_0 *= exp_1;
249 exp_1 *= exp_2;
250 *p_left = static_cast<int>(exp_0+.5);
251 *p_right = *p_left;
252 volume += (*p_left<<1);
254 //equivalent:
255 // for (i = 0; i < this->g_w; ++i) {
256 // this->g[i] = (unsigned) ( exp(a * (i - this->g_r) * (i - this->g_r))* volume_factor + .5 );
257 // volume += this->g[i];
258 // }
261 // gauss table:
262 for (int mx = 0; mx < this->g_w; mx++) {
263 int last_mul = 0;
264 unsigned *p_gt2 = this->gt2 + mx;
265 *p_gt2 = 0;
266 for (int i = 1; i < 256; i++) {
267 last_mul = last_mul+this->g[mx];
268 p_gt2 += this->g_w;
269 *p_gt2 = last_mul;
270 //equivalent:
271 // this->gt2[this->g_w * i+ mx] = this->g[mx] * i;
275 return 0;
// Allocates an uninitialised buffer of `size` unsigned values.
// The malloc result is not checked, so tmp may be NULL afterwards.
278 ass_tmp_buf::ass_tmp_buf(size_t size)
280 tmp = (unsigned *)malloc(size * sizeof(unsigned));
281 this->size = size;
// Copy constructor: allocates a fresh buffer of the same size.
// The contents of the source buffer are NOT copied.
284 ass_tmp_buf::ass_tmp_buf(const ass_tmp_buf& buf)
285 :size(buf.size)
287 tmp = (unsigned *)malloc(size * sizeof(unsigned));
// Frees the scratch buffer.
290 ass_tmp_buf::~ass_tmp_buf()
292 free(tmp);
296 * \brief Gaussian blur. A fast pure C implementation from libass.
// Separable Gaussian blur of an 8-bit plane, done in place.
//
// buffer - the plane; rows are `stride` bytes apart, width x height pixels.
// tmp2   - scratch area addressed as rows of (width + 1) unsigned values.
// m2     - multiplication table indexed as m2[value * mwidth + tap]
//          (the caller in Rasterizer::Blur passes ass_synth_priv::gt2).
// r      - kernel radius; mwidth - number of kernel taps.
//
// NOTE(review): the only visible caller makes sure width and height are at
// least mwidth before calling; the edge loops below appear to rely on that.
298 static void ass_gauss_blur(unsigned char *buffer, unsigned *tmp2,
299 int width, int height, int stride, const unsigned *m2,
300 int r, int mwidth)
303 int x, y;
305 unsigned char *s = buffer;
306 unsigned *t = tmp2 + 1;
// Pass 1 (horizontal): every non-zero source pixel scatters its table row m3
// into the scratch row. t points one element into the row, so t[-1] is a spare
// leading element that is cleared together with the row.
307 for (y = 0; y < height; y++) {
308 memset(t - 1, 0, (width + 1) * sizeof(*t));
309 x = 0;
// Left-most pixel: walks the taps from the last one down to tap r and adds the
// running sum of the taps seen so far, so t[0 .. mwidth-1-r] each receive the
// sum of all taps from their own position to the end of the kernel.
310 if(x < r)//in case that r < 0
312 const int src = s[x];
313 if (src) {
314 register unsigned *dstp = t + x - r;
315 int mx;
316 const unsigned *m3 = m2 + src * mwidth;
317 unsigned sum = 0;
318 for (mx = mwidth-1; mx >= r - x ; mx--) {
319 sum += m3[mx];
320 dstp[mx] += sum;
// Pixels 1 .. r-1: the kernel is clipped on the left (taps before r - x are skipped).
325 for (x = 1; x < r; x++) {
326 const int src = s[x];
327 if (src) {
328 register unsigned *dstp = t + x - r;
329 int mx;
330 const unsigned *m3 = m2 + src * mwidth;
331 for (mx = r - x; mx < mwidth; mx++) {
332 dstp[mx] += m3[mx];
// Interior pixels: the full kernel is applied.
337 for (; x < width - r; x++) {
338 const int src = s[x];
339 if (src) {
340 register unsigned *dstp = t + x - r;
341 int mx;
342 const unsigned *m3 = m2 + src * mwidth;
343 for (mx = 0; mx < mwidth; mx++) {
344 dstp[mx] += m3[mx];
// Pixels close to the right edge: the kernel is clipped on the right (only the first x2 taps).
349 for (; x < width-1; x++) {
350 const int src = s[x];
351 if (src) {
352 register unsigned *dstp = t + x - r;
353 int mx;
354 const int x2 = r + width - x;
355 const unsigned *m3 = m2 + src * mwidth;
356 for (mx = 0; mx < x2; mx++) {
357 dstp[mx] += m3[mx];
// Right-most pixel: adds the running sum of the taps from tap 0 upwards.
// When r == 0 the interior loop has already consumed every pixel and x == width here.
361 if(x==width-1) //important: x==width-1 failed, if r==0
363 const int src = s[x];
364 if (src) {
365 register unsigned *dstp = t + x - r;
366 int mx;
367 const int x2 = r + width - x;
368 const unsigned *m3 = m2 + src * mwidth;
369 unsigned sum = 0;
370 for (mx = 0; mx < x2; mx++) {
371 sum += m3[mx];
372 dstp[mx] += sum;
377 s += stride;
378 t += width + 1;
// Pass 2 (vertical): for each column the horizontal result is read from srcp
// (element x + 1 of the scratch row), rounded back to an 8-bit value src2,
// replaced by the rounding bias 1 << (VOLUME_BITS - 1), and its table row is
// scattered down the column one element to the left (srcp - 1).
381 t = tmp2;
382 for (x = 0; x < width; x++) {
383 y = 0;
// Top row: running sum of the taps, written from the bottom of the range upwards.
// NOTE(review): the first write lands (mwidth - r) rows below the source row,
// whereas the horizontal counterpart starts at offset mwidth - 1 - r; confirm
// the one-row difference is intended.
384 if(y < r)//in case that r<0
386 unsigned *srcp = t + y * (width + 1) + 1;
387 int src = *srcp;
388 if (src) {
389 register unsigned *dstp = srcp - 1 + (mwidth -r +y)*(width + 1);
390 const int src2 = (src + (1<<(ass_synth_priv::VOLUME_BITS-1))) >> ass_synth_priv::VOLUME_BITS;
391 const unsigned *m3 = m2 + src2 * mwidth;
392 unsigned sum = 0;
393 int mx;
394 *srcp = (1<<(ass_synth_priv::VOLUME_BITS-1));
395 for (mx = mwidth-1; mx >=r - y ; mx--) {
396 sum += m3[mx];
397 *dstp += sum;
398 dstp -= width + 1;
// Rows 1 .. r-1: kernel clipped at the top.
// NOTE(review): dstp starts one row below the source row rather than at the
// top of the column; confirm this matches the intended behaviour.
402 for (y = 1; y < r; y++) {
403 unsigned *srcp = t + y * (width + 1) + 1;
404 int src = *srcp;
405 if (src) {
406 register unsigned *dstp = srcp - 1 + width + 1;
407 const int src2 = (src + (1<<(ass_synth_priv::VOLUME_BITS-1))) >> ass_synth_priv::VOLUME_BITS;
408 const unsigned *m3 = m2 + src2 * mwidth;
410 int mx;
411 *srcp = (1<<(ass_synth_priv::VOLUME_BITS-1));
412 for (mx = r - y; mx < mwidth; mx++) {
413 *dstp += m3[mx];
414 dstp += width + 1;
// Interior rows: full kernel, starting r rows above the source row.
418 for (; y < height - r; y++) {
419 unsigned *srcp = t + y * (width + 1) + 1;
420 int src = *srcp;
421 if (src) {
422 register unsigned *dstp = srcp - 1 - r * (width + 1);
423 const int src2 = (src + (1<<(ass_synth_priv::VOLUME_BITS-1))) >> ass_synth_priv::VOLUME_BITS;
424 const unsigned *m3 = m2 + src2 * mwidth;
426 int mx;
427 *srcp = (1<<(ass_synth_priv::VOLUME_BITS-1));
428 for (mx = 0; mx < mwidth; mx++) {
429 *dstp += m3[mx];
430 dstp += width + 1;
// Rows close to the bottom edge: kernel clipped at the bottom (first y2 taps only).
434 for (; y < height-1; y++) {
435 unsigned *srcp = t + y * (width + 1) + 1;
436 int src = *srcp;
437 if (src) {
438 const int y2 = r + height - y;
439 register unsigned *dstp = srcp - 1 - r * (width + 1);
440 const int src2 = (src + (1<<(ass_synth_priv::VOLUME_BITS-1))) >> ass_synth_priv::VOLUME_BITS;
441 const unsigned *m3 = m2 + src2 * mwidth;
443 int mx;
444 *srcp = (1<<(ass_synth_priv::VOLUME_BITS-1));
445 for (mx = 0; mx < y2; mx++) {
446 *dstp += m3[mx];
447 dstp += width + 1;
// Bottom row: running sum of the taps from tap 0 upwards.
451 if(y == height - 1)//important: y == height - 1 failed if r==0
453 unsigned *srcp = t + y * (width + 1) + 1;
454 int src = *srcp;
455 if (src) {
456 const int y2 = r + height - y;
457 register unsigned *dstp = srcp - 1 - r * (width + 1);
458 const int src2 = (src + (1<<(ass_synth_priv::VOLUME_BITS-1))) >> ass_synth_priv::VOLUME_BITS;
459 const unsigned *m3 = m2 + src2 * mwidth;
460 unsigned sum = 0;
461 int mx;
462 *srcp = (1<<(ass_synth_priv::VOLUME_BITS-1));
463 for (mx = 0; mx < y2; mx++) {
464 sum += m3[mx];
465 *dstp += sum;
466 dstp += width + 1;
470 t++;
// Pass 3: scale the accumulated values back to 8 bits and store them in the plane.
473 t = tmp2;
474 s = buffer;
475 for (y = 0; y < height; y++) {
476 for (x = 0; x < width; x++) {
477 s[x] = t[x] >> ass_synth_priv::VOLUME_BITS;
479 s += stride;
480 t += width + 1;
485 * \brief Blur with the 3x3 kernel [[1,2,1], [2,4,2], [1,2,1]].
487 static void be_blur(unsigned char *buf, unsigned *tmp_base, int w, int h, int stride)
489 WORD *col_pix_buf_base = reinterpret_cast<WORD*>(xy_malloc(w*sizeof(WORD)));
490 WORD *col_sum_buf_base = reinterpret_cast<WORD*>(xy_malloc(w*sizeof(WORD)));
491 if(!col_sum_buf_base || !col_pix_buf_base)
493 //ToDo: error handling
494 return;
496 memset(col_pix_buf_base, 0, w*sizeof(WORD));
497 memset(col_sum_buf_base, 0, w*sizeof(WORD));
498 WORD *col_pix_buf = col_pix_buf_base-2;//for aligment;
499 WORD *col_sum_buf = col_sum_buf_base-2;//for aligment;
501 int y = 0;
502 unsigned char *src=buf+y*stride;
504 int x = 2;
505 int old_pix = src[x-1];
506 int old_sum = old_pix + src[x-2];
507 for ( ; x < w; x++) {
508 int temp1 = src[x];
509 int temp2 = old_pix + temp1;
510 old_pix = temp1;
511 temp1 = old_sum + temp2;
512 old_sum = temp2;
513 col_pix_buf[x] = temp1;
517 int y = 1;
518 unsigned char *src=buf+y*stride;
521 int x = 2;
522 int old_pix = src[x-1];
523 int old_sum = old_pix + src[x-2];
524 for ( ; x < w; x++) {
525 int temp1 = src[x];
526 int temp2 = old_pix + temp1;
527 old_pix = temp1;
528 temp1 = old_sum + temp2;
529 old_sum = temp2;
531 temp2 = col_pix_buf[x] + temp1;
532 col_pix_buf[x] = temp1;
533 //dst[x-1] = (col_sum_buf[x] + temp2) >> 4;
534 col_sum_buf[x] = temp2;
538 //__m128i round = _mm_set1_epi16(8);
539 for (int y = 2; y < h; y++) {
540 unsigned char *src=buf+y*stride;
541 unsigned char *dst=buf+(y-1)*stride;
544 int x = 2;
545 __m128i old_pix_128 = _mm_cvtsi32_si128(src[1]);
546 __m128i old_sum_128 = _mm_cvtsi32_si128(src[0]+src[1]);
547 for ( ; x < ((w-2)&(~7)); x+=8) {
548 __m128i new_pix = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src+x));
549 new_pix = _mm_unpacklo_epi8(new_pix, _mm_setzero_si128());
550 __m128i temp = _mm_slli_si128(new_pix,2);
551 temp = _mm_add_epi16(temp, old_pix_128);
552 temp = _mm_add_epi16(temp, new_pix);
553 old_pix_128 = _mm_srli_si128(new_pix,14);
555 new_pix = _mm_slli_si128(temp,2);
556 new_pix = _mm_add_epi16(new_pix, old_sum_128);
557 new_pix = _mm_add_epi16(new_pix, temp);
558 old_sum_128 = _mm_srli_si128(temp, 14);
560 __m128i old_col_pix = _mm_loadu_si128( reinterpret_cast<const __m128i*>(col_pix_buf+x) );
561 __m128i old_col_sum = _mm_loadu_si128( reinterpret_cast<const __m128i*>(col_sum_buf+x) );
562 _mm_storeu_si128( reinterpret_cast<__m128i*>(col_pix_buf+x), new_pix );
563 temp = _mm_add_epi16(new_pix, old_col_pix);
564 _mm_storeu_si128( reinterpret_cast<__m128i*>(col_sum_buf+x), temp );
566 old_col_sum = _mm_add_epi16(old_col_sum, temp);
567 //old_col_sum = _mm_add_epi16(old_col_sum, round);
568 old_col_sum = _mm_srli_epi16(old_col_sum, 4);
569 old_col_sum = _mm_packus_epi16(old_col_sum, old_col_sum);
570 _mm_storel_epi64( reinterpret_cast<__m128i*>(dst+x-1), old_col_sum );
572 int old_pix = src[x-1];
573 int old_sum = old_pix + src[x-2];
574 for ( ; x < w; x++) {
575 int temp1 = src[x];
576 int temp2 = old_pix + temp1;
577 old_pix = temp1;
578 temp1 = old_sum + temp2;
579 old_sum = temp2;
581 temp2 = col_pix_buf[x] + temp1;
582 col_pix_buf[x] = temp1;
583 dst[x-1] = (col_sum_buf[x] + temp2) >> 4;
584 col_sum_buf[x] = temp2;
588 xy_free(col_sum_buf_base);
589 xy_free(col_pix_buf_base);
// In-place blend of every pixel with its left and upper neighbours.
//
// Horizontally each pixel becomes src[x] * (8 - x_factor) + src[x-1] * x_factor
// (the left neighbour of column 0 counts as 0). Vertically that value is mixed
// the same way with the horizontal result of the row above using y_factor, and
// the total is rounded and scaled back with (value + 32) >> 6.
//
// buf holds w x h pixels with rows `stride` bytes apart. If the scratch row
// cannot be allocated the plane is left unchanged.
592 static void Bilinear(unsigned char *buf, int w, int h, int stride, int x_factor, int y_factor)
594 WORD *col_pix_buf_base = reinterpret_cast<WORD*>(xy_malloc(w*sizeof(WORD)));
595 if(!col_pix_buf_base)
597 //ToDo: error handling
598 return;
// The scratch row carries y_factor times the horizontal result of the previous row; it starts at zero.
600 memset(col_pix_buf_base, 0, w*sizeof(WORD));
602 for (int y = 0; y < h; y++){
603 unsigned char *src=buf+y*stride;
605 WORD *col_pix_buf = col_pix_buf_base;
// `last` carries x_factor times the previous pixel of this row.
606 int last=0;
607 for(int x = 0; x < w; x++)
609 int temp1 = src[x];
610 int temp2 = temp1*x_factor;
611 temp1 <<= 3;
612 temp1 -= temp2;
613 temp1 += last;
614 last = temp2;
616 temp2 = temp1*y_factor;
617 temp1 <<= 3;
618 temp1 -= temp2;
619 temp1 += col_pix_buf[x];
620 src[x] = ((temp1+32)>>6);
621 col_pix_buf[x] = temp2;
624 xy_free(col_pix_buf_base);
// Converts the scan-line spans in scan_line_data2 into 8-bit coverage planes
// stored in `overlay` (mBody, plus mBorder when a wide outline exists).
//
// xsub / ysub - sub-pixel offsets; only their low three bits are used.
// Returns false when overlay is null or a plane cannot be allocated, and true
// otherwise (including the case of an empty scan-line set, where the overlay is
// only cleaned up).
627 bool Rasterizer::Rasterize(const ScanLineData2& scan_line_data2, int xsub, int ysub, SharedPtrOverlay overlay)
629 using namespace ::boost::flyweights;
631 if(!overlay)
633 return false;
635 overlay->CleanUp();
636 const ScanLineData& scan_line_data = *scan_line_data2.m_scan_line_data;
637 if(!scan_line_data.mWidth || !scan_line_data.mHeight)
639 return true;
641 xsub &= 7;
642 ysub &= 7;
643 //xsub = ysub = 0;
644 int width = scan_line_data.mWidth + xsub;
645 int height = scan_line_data.mHeight + ysub;
646 overlay->mfWideOutlineEmpty = scan_line_data2.mWideOutline.empty();
// With a wide outline the area grows on every side by the border width rounded up to a multiple of 8.
647 if(!overlay->mfWideOutlineEmpty)
649 int wide_border = (scan_line_data2.mWideBorder+7)&~7;
651 width += 2*wide_border ;
652 height += 2*wide_border ;
653 xsub += wide_border ;
654 ysub += wide_border ;
656 overlay->mOffsetX = scan_line_data2.mPathOffsetX - xsub;
657 overlay->mOffsetY = scan_line_data2.mPathOffsetY - ysub;
659 overlay->mWidth = width;
660 overlay->mHeight = height;
// Plane size is the 1/8-unit size divided by 8 (rounded up) plus one; the pitch is rounded up to a multiple of 16.
661 overlay->mOverlayWidth = ((width+7)>>3) + 1;
662 overlay->mOverlayHeight = ((height+7)>>3) + 1;
663 overlay->mOverlayPitch = (overlay->mOverlayWidth+15)&~15;
665 BYTE* body = reinterpret_cast<BYTE*>(xy_malloc(overlay->mOverlayPitch * overlay->mOverlayHeight));
666 if( body==NULL )
668 return false;
670 overlay->mBody.reset(body, xy_free);
671 memset(body, 0, overlay->mOverlayPitch * overlay->mOverlayHeight);
672 BYTE* border = NULL;
673 if (!overlay->mfWideOutlineEmpty)
675 border = reinterpret_cast<BYTE*>(xy_malloc(overlay->mOverlayPitch * overlay->mOverlayHeight));
676 if (border==NULL)
678 return false;
680 overlay->mBorder.reset(border, xy_free);
681 memset(border, 0, overlay->mOverlayPitch * overlay->mOverlayHeight);
684 // Are we doing a border?
// pOutline[0] is rendered into body, pOutline[1] (the wide outline) into border.
// When there is no wide outline its span list is empty, so the NULL border is never written.
685 const tSpanBuffer* pOutline[2] = {&(scan_line_data.mOutline), &(scan_line_data2.mWideOutline)};
686 for(int i = countof(pOutline)-1; i >= 0; i--)
688 tSpanBuffer::const_iterator it = pOutline[i]->begin();
689 tSpanBuffer::const_iterator itEnd = pOutline[i]->end();
690 byte* plan_selected = i==0 ? body : border;
691 int pitch = overlay->mOverlayPitch;
692 for(; it!=itEnd; ++it)
// Each span packs y in the upper 32 bits and x in the lower 32 bits, both biased by 0x40000000.
694 int y = (int)(((*it).first >> 32) - 0x40000000 + ysub);
695 int x1 = (int)(((*it).first & 0xffffffff) - 0x40000000 + xsub);
696 int x2 = (int)(((*it).second & 0xffffffff) - 0x40000000 + xsub);
697 if(x2 > x1)
// Add the number of covered 1/8 columns to every plane byte the span touches.
699 int first = x1>>3;
700 int last = (x2-1)>>3;
701 byte* dst = plan_selected + (pitch*(y>>3) + first);
702 if(first == last)
703 *dst += x2-x1;
704 else
706 *dst += ((first+1)<<3) - x1;
707 dst += 1;
708 while(++first < last)
710 *dst += 0x08;
711 dst += 1;
713 *dst += x2 - (last<<3);
719 return true;
722 // @return: true if actually a blur operation has done, or else false and output is leave unset.
723 bool Rasterizer::Blur(const Overlay& input_overlay, int fBlur, double fGaussianBlur,
724 SharedPtrOverlay output_overlay)
726 using namespace ::boost::flyweights;
728 if(!output_overlay)
730 return false;
732 output_overlay->CleanUp();
734 output_overlay->mOffsetX = input_overlay.mOffsetX;
735 output_overlay->mOffsetY = input_overlay.mOffsetY;
736 output_overlay->mWidth = input_overlay.mWidth;
737 output_overlay->mHeight = input_overlay.mHeight;
738 output_overlay->mOverlayWidth = input_overlay.mOverlayWidth;
739 output_overlay->mOverlayHeight = input_overlay.mOverlayHeight;
740 output_overlay->mfWideOutlineEmpty = input_overlay.mfWideOutlineEmpty;
742 int bluradjust = 0;
743 if(fBlur || fGaussianBlur > 0.1)
745 if (fGaussianBlur > 0)
746 bluradjust += (int)(fGaussianBlur*3*8 + 0.5) | 1;
747 if (fBlur)
748 bluradjust += 8;
749 // Expand the buffer a bit when we're blurring, since that can also widen the borders a bit
750 bluradjust = (bluradjust+7)&~7;
752 output_overlay->mOffsetX -= bluradjust;
753 output_overlay->mOffsetY -= bluradjust;
754 output_overlay->mWidth += (bluradjust<<1);
755 output_overlay->mHeight += (bluradjust<<1);
756 output_overlay->mOverlayWidth += (bluradjust>>2);
757 output_overlay->mOverlayHeight += (bluradjust>>2);
759 else
761 return false;
764 output_overlay->mOverlayPitch = (output_overlay->mOverlayWidth+15)&~15;
766 BYTE* body = reinterpret_cast<BYTE*>(xy_malloc(output_overlay->mOverlayPitch * output_overlay->mOverlayHeight));
767 if( body==NULL )
769 return false;
771 output_overlay->mBody.reset(body, xy_free);
772 memset(body, 0, output_overlay->mOverlayPitch * output_overlay->mOverlayHeight);
773 BYTE* border = NULL;
774 if (!output_overlay->mfWideOutlineEmpty)
776 border = reinterpret_cast<BYTE*>(xy_malloc(output_overlay->mOverlayPitch * output_overlay->mOverlayHeight));
777 if (border==NULL)
779 return false;
781 output_overlay->mBorder.reset(border, xy_free);
782 memset(border, 0, output_overlay->mOverlayPitch * output_overlay->mOverlayHeight);
785 //copy buffer
786 for(int i = 1; i >= 0; i--)
788 byte* plan_selected = i==0 ? body : border;
789 const byte* plan_input = i==0 ? input_overlay.mBody.get() : input_overlay.mBorder.get();
791 plan_selected += (bluradjust>>3) + (bluradjust>>3)*output_overlay->mOverlayPitch;
792 if ( plan_selected!=NULL && plan_input!=NULL )
794 for (int j=0;j<input_overlay.mOverlayHeight;j++)
796 memcpy(plan_selected, plan_input, input_overlay.mOverlayPitch);
797 plan_selected += output_overlay->mOverlayPitch;
798 plan_input += input_overlay.mOverlayPitch;
803 ass_tmp_buf tmp_buf( max((output_overlay->mOverlayPitch+1)*(output_overlay->mOverlayHeight+1),0) );
804 //flyweight<key_value<int, ass_tmp_buf, ass_tmp_buf_get_size>, no_locking> tmp_buf((overlay->mOverlayWidth+1)*(overlay->mOverlayPitch+1));
805 // Do some gaussian blur magic
806 if (fGaussianBlur > 0.1)//(fGaussianBlur > 0) return true even if fGaussianBlur very small
808 byte* plan_selected= output_overlay->mfWideOutlineEmpty ? body : border;
809 flyweight<key_value<double, ass_synth_priv, ass_synth_priv_key>, no_locking> fw_priv_blur(fGaussianBlur);
810 const ass_synth_priv& priv_blur = fw_priv_blur.get();
811 if (output_overlay->mOverlayWidth>=priv_blur.g_w && output_overlay->mOverlayHeight>=priv_blur.g_w)
813 ass_gauss_blur(plan_selected, tmp_buf.tmp, output_overlay->mOverlayWidth, output_overlay->mOverlayHeight, output_overlay->mOverlayPitch,
814 priv_blur.gt2, priv_blur.g_r, priv_blur.g_w);
818 for (int pass = 0; pass < fBlur; pass++)
820 if(output_overlay->mOverlayWidth >= 3 && output_overlay->mOverlayHeight >= 3)
822 int pitch = output_overlay->mOverlayPitch;
823 byte* plan_selected= output_overlay->mfWideOutlineEmpty ? body : border;
824 be_blur(plan_selected, tmp_buf.tmp, output_overlay->mOverlayWidth, output_overlay->mOverlayHeight, pitch);
827 return true;
830 ///////////////////////////////////////////////////////////////////////////
// Blends `color` over *dst with an 8-bit alpha.
// The byte pairs 0/2 and byte 1 are mixed as (dst * ia + color * a) >> 8 with
// a = alpha + 1 and ia = 256 - alpha; the top byte of *dst is only scaled by ia.
832 static __forceinline void pixmix(DWORD *dst, DWORD color, DWORD alpha)
834 int a = alpha;
835 // Make sure both a and ia are in range 1..256 for the >>8 operations below to be correct
836 int ia = 256-a;
837 a+=1;
838 *dst = ((((*dst&0x00ff00ff)*ia + (color&0x00ff00ff)*a)&0xff00ff00)>>8)
839 | ((((*dst&0x0000ff00)*ia + (color&0x0000ff00)*a)&0x00ff0000)>>8)
840 | ((((*dst>>8)&0x00ff0000)*ia)&0xff000000);
// Same blend as pixmix, but the alpha is derived from three factors:
// (shapealpha * clipalpha * top byte of color) >> 12, masked to 8 bits.
843 static __forceinline void pixmix2(DWORD *dst, DWORD color, DWORD shapealpha, DWORD clipalpha)
845 int a = (((shapealpha)*(clipalpha)*(color>>24))>>12)&0xff;
846 int ia = 256-a;
847 a+=1;
848 *dst = ((((*dst&0x00ff00ff)*ia + (color&0x00ff00ff)*a)&0xff00ff00)>>8)
849 | ((((*dst&0x0000ff00)*ia + (color&0x0000ff00)*a)&0x00ff0000)>>8)
850 | ((((*dst>>8)&0x00ff0000)*ia)&0xff000000);
853 #include <xmmintrin.h>
854 #include <emmintrin.h>
// SSE2 blend of `color` over *dst with an 8-bit alpha.
// The top byte of color is cleared first. Destination and colour bytes are
// widened to 16 bits and interleaved, so one multiply-add yields
// dst * (0x100 - alpha) + color * (alpha + 1) per byte; the sums are shifted
// right by 8 and packed back with saturation.
856 static __forceinline void pixmix_sse2(DWORD* dst, DWORD color, DWORD alpha)
858 // alpha = (((alpha) * (color>>24)) >> 6) & 0xff;
859 color &= 0xffffff;
860 __m128i zero = _mm_setzero_si128();
// Low word of each pair: weight for dst; high word: weight for color.
861 __m128i a = _mm_set1_epi32(((alpha+1) << 16) | (0x100 - alpha));
862 __m128i d = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*dst), zero);
863 __m128i s = _mm_unpacklo_epi8(_mm_cvtsi32_si128(color), zero);
864 __m128i r = _mm_unpacklo_epi16(d, s);
865 r = _mm_madd_epi16(r, a);
866 r = _mm_srli_epi32(r, 8);
867 r = _mm_packs_epi32(r, r);
868 r = _mm_packus_epi16(r, r);
869 *dst = (DWORD)_mm_cvtsi128_si32(r);
// SSE2 blend like pixmix_sse2, with the alpha computed as
// (shapealpha * clipalpha * top byte of color) >> 12, masked to 8 bits.
872 static __forceinline void pixmix2_sse2(DWORD* dst, DWORD color, DWORD shapealpha, DWORD clipalpha)
874 int alpha = (((shapealpha)*(clipalpha)*(color>>24))>>12)&0xff;
875 color &= 0xffffff;
876 __m128i zero = _mm_setzero_si128();
// Low word of each pair: weight for dst; high word: weight for color.
877 __m128i a = _mm_set1_epi32(((alpha+1) << 16) | (0x100 - alpha));
878 __m128i d = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*dst), zero);
879 __m128i s = _mm_unpacklo_epi8(_mm_cvtsi32_si128(color), zero);
880 __m128i r = _mm_unpacklo_epi16(d, s);
881 r = _mm_madd_epi16(r, a);
882 r = _mm_srli_epi32(r, 8);
883 r = _mm_packs_epi32(r, r);
884 r = _mm_packus_epi16(r, r);
885 *dst = (DWORD)_mm_cvtsi128_si32(r);
888 #include <mmintrin.h>
890 // Calculate a - b clamping to 0 instead of underflowing
// Subtracts b from a with unsigned saturation, using the MMX instruction behind
// _mm_subs_pu16. The saturation is applied to each 16-bit lane separately, so
// the result equals max(a - b, 0) only when both values fit in 16 bits.
// _mm_empty() clears the MMX state before returning.
891 static __forceinline DWORD safe_subtract(DWORD a, DWORD b)
893 __m64 ap = _mm_cvtsi32_si64(a);
894 __m64 bp = _mm_cvtsi32_si64(b);
895 __m64 rp = _mm_subs_pu16(ap, bp);
896 DWORD r = (DWORD)_mm_cvtsi64_si32(rp);
897 _mm_empty();
898 return r;
899 //return (b > a) ? 0 : a - b;
902 /***
903 * No aligned requirement
906 void AlphaBlt(byte* pY,
907 const byte* pAlphaMask,
908 const byte Y,
909 int h, int w, int src_stride, int dst_stride)
911 __m128i zero = _mm_setzero_si128();
912 __m128i s = _mm_set1_epi16(Y); //s = c 0 c 0 c 0 c 0 c 0 c 0 c 0 c 0
914 if( w>16 )//IMPORTANT! The result of the following code is undefined with w<15.
916 for( ; h>0; h--, pAlphaMask += src_stride, pY += dst_stride )
918 const BYTE* sa = pAlphaMask;
919 BYTE* dy = pY;
920 const BYTE* dy_first_mod16 = reinterpret_cast<BYTE*>((reinterpret_cast<int>(pY)+15)&~15); //IMPORTANT! w must >= 15
921 const BYTE* dy_end_mod16 = reinterpret_cast<BYTE*>(reinterpret_cast<int>(pY+w)&~15);
922 const BYTE* dy_end = pY + w;
924 for(;dy < dy_first_mod16; sa++, dy++)
926 *dy = (*dy * (256 - *sa)+ Y*(*sa+1))>>8;
928 for(; dy < dy_end_mod16; sa+=8, dy+=16)
930 __m128i a = _mm_loadl_epi64((__m128i*)sa);
933 __m128i d = _mm_load_si128((__m128i*)dy);
935 //__m128i ones = _mm_cmpeq_epi32(zero,zero); //ones = ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
936 //__m128i ia = _mm_xor_si128(a,ones); //ia = ~a
937 //ia = _mm_unpacklo_epi8(ia,zero); //ia = ~a0 0 ~a1 0 ~a2 0 ~a3 0 ~a4 0 ~a5 0 ~a6 0 ~a7 0
938 a = _mm_unpacklo_epi8(a,zero); //a= a0 0 a1 0 a2 0 a3 0 a4 0 a5 0 a6 0 a7 0
939 __m128i ones = _mm_set1_epi16(256); //ones = 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1
940 __m128i ia = _mm_sub_epi16(ones, a); //ia = 256-a0 ... 256-a7
941 ones = _mm_srli_epi16(ones, 8);
942 a = _mm_add_epi16(a, ones); //a= 1+a0 ... 1+a7
944 __m128i dl = _mm_unpacklo_epi8(d,zero); //d = b0 0 b1 0 b2 0 b3 0 b4 0 b5 0 b6 0 b7 0
945 __m128i sl = _mm_mullo_epi16(s,a); //sl = c0*a0 c1*a1 ... c7*a7
947 dl = _mm_mullo_epi16(dl,ia); //d = b0*~a0 b1*~a1 ... b7*~a7
949 dl = _mm_add_epi16(dl,sl); //d = d + sl
950 dl = _mm_srli_epi16(dl, 8); //d = d>>8
952 sa += 8;
953 a = _mm_loadl_epi64((__m128i*)sa);
955 a = _mm_unpacklo_epi8(a,zero);
956 ones = _mm_slli_epi16(ones, 8);
957 ia = _mm_sub_epi16(ones, a);
958 ones = _mm_srli_epi16(ones, 8);
959 a = _mm_add_epi16(a,ones);
961 d = _mm_unpackhi_epi8(d,zero);
962 sl = _mm_mullo_epi16(s,a);
963 d = _mm_mullo_epi16(d,ia);
964 d = _mm_add_epi16(d,sl);
965 d = _mm_srli_epi16(d, 8);
967 dl = _mm_packus_epi16(dl,d);
969 _mm_store_si128((__m128i*)dy, dl);
971 for(;dy < dy_end; sa++, dy++)
973 *dy = (*dy * (256 - *sa)+ Y*(*sa+1))>>8;
977 else
979 for( ; h>0; h--, pAlphaMask += src_stride, pY += dst_stride )
981 const BYTE* sa = pAlphaMask;
982 BYTE* dy = pY;
983 const BYTE* dy_end = pY + w;
985 for(;dy < dy_end; sa++, dy++)
987 *dy = (*dy * (256 - *sa)+ Y*(*sa+1))>>8;
991 //__asm emms;
994 /***
995 * No aligned requirement
998 void AlphaBlt(byte* pY,
999 const byte alpha,
1000 const byte Y,
1001 int h, int w, int dst_stride)
1003 int yPremul = Y*(alpha+1);
1004 int dstAlpha = 0x100 - alpha;
1005 if( w>32 )//IMPORTANT! The result of the following code is undefined with w<15.
1007 __m128i zero = _mm_setzero_si128();
1008 __m128i s = _mm_set1_epi16(yPremul); //s = c 0 c 0 c 0 c 0 c 0 c 0 c 0 c 0
1009 __m128i ia = _mm_set1_epi16(dstAlpha);
1010 for( ; h>0; h--, pY += dst_stride )
1012 BYTE* dy = pY;
1013 const BYTE* dy_first_mod16 = reinterpret_cast<BYTE*>((reinterpret_cast<int>(pY)+15)&~15); //IMPORTANT! w must >= 15
1014 const BYTE* dy_end_mod16 = reinterpret_cast<BYTE*>(reinterpret_cast<int>(pY+w)&~15);
1015 const BYTE* dy_end = pY + w;
1017 for(;dy < dy_first_mod16; dy++)
1019 *dy = (*dy * dstAlpha + yPremul)>>8;
1021 for(; dy < dy_end_mod16; dy+=16)
1024 __m128i d = _mm_load_si128(reinterpret_cast<const __m128i*>(dy));
1025 __m128i dl = _mm_unpacklo_epi8(d,zero); //d = b0 0 b1 0 b2 0 b3 0 b4 0 b5 0 b6 0 b7 0
1027 dl = _mm_mullo_epi16(dl,ia); //d = b0*~a0 b1*~a1 ... b7*~a7
1028 dl = _mm_adds_epu16(dl,s); //d = d + s
1029 dl = _mm_srli_epi16(dl, 8); //d = d>>8
1031 d = _mm_unpackhi_epi8(d,zero);
1032 d = _mm_mullo_epi16(d,ia);
1033 d = _mm_adds_epu16(d,s);
1034 d = _mm_srli_epi16(d, 8);
1036 dl = _mm_packus_epi16(dl,d);
1038 _mm_store_si128(reinterpret_cast<__m128i*>(dy), dl);
1040 for(;dy < dy_end; dy++)
1042 *dy = (*dy * dstAlpha + yPremul)>>8;
1046 else
1048 for( ; h>0; h--, pY += dst_stride )
1050 BYTE* dy = pY;
1051 const BYTE* dy_end = pY + w;
1053 for(;dy < dy_end; dy++)
1055 *dy = (*dy * dstAlpha + yPremul)>>8;
1059 //__asm emms;
1062 /***
1063 * No aligned requirement
// Plain C++ version of the constant-alpha blend: every byte of the w x h plane
// becomes (dst * (0x100 - alpha) + Y * (alpha + 1)) >> 8.
// pY points at the first row; rows are dst_stride bytes apart.
1066 void AlphaBltC(byte* pY,
1067 const byte alpha,
1068 const byte Y,
1069 int h, int w, int dst_stride)
// Both factors are computed once, outside the loops.
1071 int yPremul = Y*(alpha+1);
1072 int dstAlpha = 0x100 - alpha;
1074 for( ; h>0; h--, pY += dst_stride )
1076 BYTE* dy = pY;
1077 const BYTE* dy_end = pY + w;
1079 for(;dy < dy_end; dy++)
1081 *dy = (*dy * dstAlpha + yPremul)>>8;
1086 // For CPUID usage in Rasterizer::Draw
1087 #include "../dsutil/vd.h"
// Merges the spans of `src` into `dst`.
// Every src span is shifted by dy in the upper 32 bits of its packed
// coordinates and widened by dx on both sides: dx is subtracted from its start
// and added to its end. Spans that touch or overlap after the shift are fused
// into one. The merge walks both lists front to back in a single pass, so it
// presumably expects both to be sorted -- TODO confirm against the callers.
1089 void OverlapRegion(tSpanBuffer& dst, const tSpanBuffer& src, int dx, int dy)
// The old contents of dst are moved into temp; dst is then rebuilt from scratch.
1091 tSpanBuffer temp;
1092 temp.reserve(dst.size() + src.size());
1093 dst.swap(temp);
1094 tSpanBuffer::iterator itA = temp.begin();
1095 tSpanBuffer::iterator itAE = temp.end();
1096 tSpanBuffer::const_iterator itB = src.begin();
1097 tSpanBuffer::const_iterator itBE = src.end();
1098 // Don't worry -- even if dy<0 this will still work! // G: hehe, the evil twin :)
// offset1 is applied to span starts, offset2 to span ends.
1099 unsigned __int64 offset1 = (((__int64)dy)<<32) - dx;
1100 unsigned __int64 offset2 = (((__int64)dy)<<32) + dx;
1101 while(itA != itAE && itB != itBE)
1103 if((*itB).first + offset1 < (*itA).first)
1105 // B span is earlier. Use it.
1106 unsigned __int64 x1 = (*itB).first + offset1;
1107 unsigned __int64 x2 = (*itB).second + offset2;
1108 ++itB;
1109 // B spans don't overlap, so begin merge loop with A first.
1110 for(;;)
1112 // If we run out of A spans or the A span doesn't overlap,
1113 // then the next B span can't either (because B spans don't
1114 // overlap) and we exit.
1115 if(itA == itAE || (*itA).first > x2)
1116 break;
1117 do {x2 = _MAX(x2, (*itA++).second);}
1118 while(itA != itAE && (*itA).first <= x2);
1119 // If we run out of B spans or the B span doesn't overlap,
1120 // then the next A span can't either (because A spans don't
1121 // overlap) and we exit.
1122 if(itB == itBE || (*itB).first + offset1 > x2)
1123 break;
1124 do {x2 = _MAX(x2, (*itB++).second + offset2);}
1125 while(itB != itBE && (*itB).first + offset1 <= x2);
1127 // Flush span.
1128 dst.push_back(tSpan(x1, x2));
1130 else
1132 // A span is earlier. Use it.
1133 unsigned __int64 x1 = (*itA).first;
1134 unsigned __int64 x2 = (*itA).second;
1135 ++itA;
1136 // A spans don't overlap, so begin merge loop with B first.
1137 for(;;)
1139 // If we run out of B spans or the B span doesn't overlap,
1140 // then the next A span can't either (because A spans don't
1141 // overlap) and we exit.
1142 if(itB == itBE || (*itB).first + offset1 > x2)
1143 break;
1144 do {x2 = _MAX(x2, (*itB++).second + offset2);}
1145 while(itB != itBE && (*itB).first + offset1 <= x2);
1146 // If we run out of A spans or the A span doesn't overlap,
1147 // then the next B span can't either (because B spans don't
1148 // overlap) and we exit.
1149 if(itA == itAE || (*itA).first > x2)
1150 break;
1151 do {x2 = _MAX(x2, (*itA++).second);}
1152 while(itA != itAE && (*itA).first <= x2);
1154 // Flush span.
1155 dst.push_back(tSpan(x1, x2));
1158 // Copy over leftover spans.
1159 while(itA != itAE)
1160 dst.push_back(*itA++);
1161 while(itB != itBE)
1163 dst.push_back(tSpan((*itB).first + offset1, (*itB).second + offset2));
1164 ++itB;
1168 // Render a subpicture onto a surface.
1169 // spd is the surface to render on.
1170 // clipRect is a rectangular clip region to render inside.
1171 // pAlphaMask is an alpha clipping mask.
1172 // xsub and ysub ???
1173 // switchpts seems to be an array of fill colours interlaced with coordinates.
1174 // switchpts[i*2] contains a colour and switchpts[i*2+1] contains the coordinate to use that colour from
1175 // fBody tells whether to render the body of the subs.
1176 // fBorder tells whether to render the border of the subs.
1177 SharedPtrByte Rasterizer::CompositeAlphaMask(const SharedPtrOverlay& overlay, const CRect& clipRect,
1178 const GrayImage2* alpha_mask,
1179 int xsub, int ysub, const DWORD* switchpts, bool fBody, bool fBorder,
1180 CRect *outputDirtyRect)
1182 //fix me: check and log error
1183 SharedPtrByte result;
1184 *outputDirtyRect = CRect(0, 0, 0, 0);
1185 if (!switchpts || !fBody && !fBorder) return result;
1186 if (fBorder && !overlay->mBorder) return result;
1188 CRect r = clipRect;
1189 if (alpha_mask!=NULL)
1191 r &= CRect(alpha_mask->left_top, alpha_mask->size);
1194 // Remember that all subtitle coordinates are specified in 1/8 pixels
1195 // (x+4)>>3 rounds to nearest whole pixel.
1196 // ??? What is xsub, ysub, mOffsetX and mOffsetY ?
1197 int x = (xsub + overlay->mOffsetX + 4)>>3;
1198 int y = (ysub + overlay->mOffsetY + 4)>>3;
1199 int w = overlay->mOverlayWidth;
1200 int h = overlay->mOverlayHeight;
1201 int xo = 0, yo = 0;
1202 // Again, limiting?
1203 if(x < r.left) {xo = r.left-x; w -= r.left-x; x = r.left;}
1204 if(y < r.top) {yo = r.top-y; h -= r.top-y; y = r.top;}
1205 if(x+w > r.right) w = r.right-x;
1206 if(y+h > r.bottom) h = r.bottom-y;
1207 // Check if there's actually anything to render
1208 if(w <= 0 || h <= 0) return(result);
1209 outputDirtyRect->SetRect(x, y, x+w, y+h);
1211 bool fSingleColor = (switchpts[1]==0xffffffff);
1213 // draw
1214 // Grab the first colour
1215 DWORD color = switchpts[0];
1216 byte* s_base = (byte*)xy_malloc(overlay->mOverlayPitch * overlay->mOverlayHeight);
1217 const byte* alpha_mask_data = alpha_mask != NULL ? alpha_mask->data.get() : NULL;
1218 const int alpha_mask_pitch = alpha_mask != NULL ? alpha_mask->pitch : 0;
1219 if(alpha_mask_data!=NULL )
1220 alpha_mask_data += alpha_mask->pitch * y + x - alpha_mask->left_top.y*alpha_mask->pitch - alpha_mask->left_top.x;
1222 if(fSingleColor)
1224 overlay->FillAlphaMash(s_base, fBody, fBorder, xo, yo, w, h,
1225 alpha_mask_data, alpha_mask_pitch,
1226 color>>24 );
1228 else
1230 int last_x = xo;
1231 const DWORD *sw = switchpts;
1232 while( last_x<w+xo )
1234 byte alpha = sw[0]>>24;
1235 while( sw[3]<w+xo && (sw[2]>>24)==alpha )
1237 sw += 2;
1239 int new_x = sw[3] < w+xo ? sw[3] : w+xo;
1240 overlay->FillAlphaMash(s_base, fBody, fBorder,
1241 last_x, yo, new_x-last_x, h,
1242 alpha_mask_data, alpha_mask_pitch,
1243 alpha );
1244 last_x = new_x;
1245 sw += 2;
1248 result.reset( s_base, xy_free );
1249 return result;
1254 // draw overlay[clipRect] to bitmap[0,0,w,h]
//
// bitmap    - destination; either planar (XyBitmap::PLANNA, one byte per pixel
//             in each of plans[0..3], used here as A, Y, U, V) or packed with
//             4 bytes per pixel in plans[0]. A NULL bitmap asserts and returns.
// overlay   - supplies offsets, size and the pitch of s_base.
// clipRect  - rectangle (whole pixels) the drawing is limited to.
// s_base    - per-pixel alpha bytes laid out with overlay->mOverlayPitch.
// xsub/ysub - added to the overlay offsets, in 1/8 pixel units.
// switchpts - colours interlaced with x coordinates; switchpts[1]==0xffffffff
//             selects the single colour code paths.
// fBody/fBorder - only checked for "nothing requested" here.
1256 void Rasterizer::Draw(XyBitmap* bitmap, SharedPtrOverlay overlay, const CRect& clipRect, byte* s_base,
1257 int xsub, int ysub, const DWORD* switchpts, bool fBody, bool fBorder)
1259 if (!switchpts || !fBody && !fBorder) return;
1260 if (bitmap==NULL)
1262 ASSERT(0);
1263 return;
1265 // clip
1266 // Limit drawn area to rectangular clip area
1267 CRect r = clipRect;
1268 // Remember that all subtitle coordinates are specified in 1/8 pixels
1269 // (x+4)>>3 rounds to nearest whole pixel.
1270 int overlayPitch = overlay->mOverlayPitch;
1271 int x = (xsub + overlay->mOffsetX + 4)>>3;
1272 int y = (ysub + overlay->mOffsetY + 4)>>3;
1273 int w = overlay->mOverlayWidth;
1274 int h = overlay->mOverlayHeight;
// xo/yo: number of overlay columns/rows cut off by the left/top clip edge.
1275 int xo = 0, yo = 0;
1277 if(x < r.left) {xo = r.left-x; w -= r.left-x; x = r.left;}
1278 if(y < r.top) {yo = r.top-y; h -= r.top-y; y = r.top;}
1279 if(x+w > r.right) w = r.right-x;
1280 if(y+h > r.bottom) h = r.bottom-y;
1281 // Check if there's actually anything to render
1282 if (w <= 0 || h <= 0) return;
1283 // must have enough space to draw into
1284 ASSERT(x >= bitmap->x && y >= bitmap->y && x+w <= bitmap->x + bitmap->w && y+h <= bitmap->y + bitmap->h );
1286 // CPUID from VDub
1287 bool fSSE2 = !!(g_cpuid.m_flags & CCpuID::sse2);
1288 bool fSingleColor = (switchpts[1]==0xffffffff);
1289 bool PLANAR = (bitmap->type==XyBitmap::PLANNA);
// Fold the three independent choices into one value so that a single switch
// can select the matching loop below.
1290 int draw_method = 0;
1291 if(fSingleColor)
1292 draw_method |= DM::SINGLE_COLOR;
1293 if(fSSE2)
1294 draw_method |= DM::SSE2;
1295 if(PLANAR)
1296 draw_method |= DM::AYUV_PLANAR;
1298 // draw
1299 // Grab the first colour
1300 DWORD color = switchpts[0];
// First alpha byte of the clipped region inside s_base.
1301 const byte* s = s_base + overlay->mOverlayPitch*yo + xo;
// Byte offset of the clipped top-left pixel relative to the bitmap origin:
// one byte per pixel for planar bitmaps, four bytes per pixel otherwise.
1303 int dst_offset = 0;
1304 if (bitmap->type==XyBitmap::PLANNA)
1305 dst_offset = bitmap->pitch*(y-bitmap->y) + x - bitmap->x;
1306 else
1307 dst_offset = bitmap->pitch*(y-bitmap->y) + (x - bitmap->x)*4;
1308 unsigned long* dst = (unsigned long*)((BYTE*)bitmap->plans[0] + dst_offset);
1310 // Every remaining line in the bitmap to be rendered...
1311 switch(draw_method)
// Packed pixels, single colour, SSE2.
1313 case DM::SINGLE_COLOR | DM::SSE2 | 0*DM::AYUV_PLANAR :
1315 while(h--)
1317 for(int wt=0; wt<w; ++wt)
1318 // The <<6 is due to pixmix expecting the alpha parameter to be
1319 // the multiplication of two 6-bit unsigned numbers but we
1320 // only have one here. (No alpha mask.)
// NOTE(review): no <<6 appears in the call below; the comment above looks
// like a leftover from an earlier version -- confirm.
1321 pixmix_sse2(&dst[wt], color, s[wt]);
1322 s += overlayPitch;
1323 dst = (unsigned long *)((char *)dst + bitmap->pitch);
1326 break;
// Packed pixels, single colour, plain C.
1327 case DM::SINGLE_COLOR | 0*DM::SSE2 | 0*DM::AYUV_PLANAR :
1329 while(h--)
1331 for(int wt=0; wt<w; ++wt)
1332 pixmix(&dst[wt], color, s[wt]);
1333 s += overlayPitch;
1334 dst = (unsigned long *)((char *)dst + bitmap->pitch);
1337 break;
// Packed pixels, several colours, SSE2.
1338 case 0*DM::SINGLE_COLOR | DM::SSE2 | 0*DM::AYUV_PLANAR :
1340 while(h--)
// Restart the switch point scan at the beginning of every row.
1342 const DWORD *sw = switchpts;
1343 for(int wt=0; wt<w; ++wt)
1345 // xo is the offset (usually negative) we have moved into the image
1346 // So if we have passed the switchpoint (?) switch to another colour
1347 // (So switchpts stores both colours *and* coordinates?)
1348 if(wt+xo >= sw[1]) {while(wt+xo >= sw[1]) sw += 2; color = sw[-2];}
1349 pixmix_sse2(&dst[wt], color, s[wt]);
1351 s += overlayPitch;
1352 dst = (unsigned long *)((char *)dst + bitmap->pitch);
1355 break;
// Packed pixels, several colours, plain C.
1356 case 0*DM::SINGLE_COLOR | 0*DM::SSE2 | 0*DM::AYUV_PLANAR :
1358 while(h--)
1360 const DWORD *sw = switchpts;
1361 for(int wt=0; wt<w; ++wt)
1363 if(wt+xo >= sw[1]) {while(wt+xo >= sw[1]) sw += 2; color = sw[-2];}
1364 pixmix(&dst[wt], color, s[wt]);
1366 s += overlayPitch;
1367 dst = (unsigned long *)((char *)dst + bitmap->pitch);
1370 break;
// Planar, single colour, SSE2: each plane is handed to AlphaBlt with its own
// colour component (Y, U, V taken from bits 16-23, 8-15, 0-7; A gets 0).
1371 case DM::SINGLE_COLOR | DM::SSE2 | DM::AYUV_PLANAR :
1373 unsigned char* dst_A = bitmap->plans[0] + dst_offset;
1374 unsigned char* dst_Y = bitmap->plans[1] + dst_offset;
1375 unsigned char* dst_U = bitmap->plans[2] + dst_offset;
1376 unsigned char* dst_V = bitmap->plans[3] + dst_offset;
1378 AlphaBlt(dst_Y, s, ((color)>>16)&0xff, h, w, overlayPitch, bitmap->pitch);
1379 AlphaBlt(dst_U, s, ((color)>>8)&0xff, h, w, overlayPitch, bitmap->pitch);
1380 AlphaBlt(dst_V, s, ((color))&0xff, h, w, overlayPitch, bitmap->pitch);
1381 AlphaBlt(dst_A, s, 0, h, w, overlayPitch, bitmap->pitch);
1383 break;
// Planar, several colours, SSE2: the clipped width is processed as vertical
// strips, one strip per colour run.
1384 case 0*DM::SINGLE_COLOR | DM::SSE2 | DM::AYUV_PLANAR :
1386 unsigned char* dst_A = bitmap->plans[0] + dst_offset;
1387 unsigned char* dst_Y = bitmap->plans[1] + dst_offset;
1388 unsigned char* dst_U = bitmap->plans[2] + dst_offset;
1389 unsigned char* dst_V = bitmap->plans[3] + dst_offset;
1391 const DWORD *sw = switchpts;
1392 int last_x = xo;
1393 color = sw[0];
1394 while(last_x<w+xo)
// The strip ends at the next switch point or at the right clip edge.
1396 int new_x = sw[3] < w+xo ? sw[3] : w+xo;
1397 color = sw[0];
1398 sw += 2;
// Runs that end left of the current position are skipped.
1399 if( new_x < last_x )
1400 continue;
1401 AlphaBlt(dst_Y, s + last_x - xo, (color>>16)&0xff, h, new_x-last_x, overlayPitch, bitmap->pitch);
1402 AlphaBlt(dst_U, s + last_x - xo, (color>>8)&0xff, h, new_x-last_x, overlayPitch, bitmap->pitch);
1403 AlphaBlt(dst_V, s + last_x - xo, (color)&0xff, h, new_x-last_x, overlayPitch, bitmap->pitch);
1404 AlphaBlt(dst_A, s + last_x - xo, 0, h, new_x-last_x, overlayPitch, bitmap->pitch);
1406 dst_A += new_x - last_x;
1407 dst_Y += new_x - last_x;
1408 dst_U += new_x - last_x;
1409 dst_V += new_x - last_x;
1410 last_x = new_x;
1413 break;
// Planar, single colour, plain C: the four plane bytes are packed into one
// AYUV value, mixed with pixmix and written back to the planes.
1414 case DM::SINGLE_COLOR | 0*DM::SSE2 | DM::AYUV_PLANAR :
1416 // char * debug_dst=(char*)dst;int h2 = h;
1417 // XY_DO_ONCE( xy_logger::write_file("G:\\b2_rt", (char*)&color, sizeof(color)) );
1418 // XY_DO_ONCE( xy_logger::write_file("G:\\b2_rt", debug_dst, (h2-1)*spd.pitch) );
1419 // debug_dst += spd.pitch*spd.h;
1420 // XY_DO_ONCE( xy_logger::write_file("G:\\b2_rt", debug_dst, (h2-1)*spd.pitch) );
1421 // debug_dst += spd.pitch*spd.h;
1422 // XY_DO_ONCE( xy_logger::write_file("G:\\b2_rt", debug_dst, (h2-1)*spd.pitch) );
1423 // debug_dst += spd.pitch*spd.h;
1424 // XY_DO_ONCE( xy_logger::write_file("G:\\b2_rt", debug_dst, (h2-1)*spd.pitch) );
1425 // debug_dst=(char*)dst;
1427 unsigned char* dst_A = bitmap->plans[0] + dst_offset;
1428 unsigned char* dst_Y = bitmap->plans[1] + dst_offset;
1429 unsigned char* dst_U = bitmap->plans[2] + dst_offset;
1430 unsigned char* dst_V = bitmap->plans[3] + dst_offset;
1431 while(h--)
1433 for(int wt=0; wt<w; ++wt)
1435 DWORD temp = COMBINE_AYUV(dst_A[wt], dst_Y[wt], dst_U[wt], dst_V[wt]);
1436 pixmix(&temp, color, s[wt]);
1437 SPLIT_AYUV(temp, dst_A+wt, dst_Y+wt, dst_U+wt, dst_V+wt);
1439 s += overlayPitch;
1440 dst_A += bitmap->pitch;
1441 dst_Y += bitmap->pitch;
1442 dst_U += bitmap->pitch;
1443 dst_V += bitmap->pitch;
1445 // XY_DO_ONCE( xy_logger::write_file("G:\\a2_rt", debug_dst, (h2-1)*spd.pitch) );
1446 // debug_dst += spd.pitch*spd.h;
1447 // XY_DO_ONCE( xy_logger::write_file("G:\\a2_rt", debug_dst, (h2-1)*spd.pitch) );
1448 // debug_dst += spd.pitch*spd.h;
1449 // XY_DO_ONCE( xy_logger::write_file("G:\\a2_rt", debug_dst, (h2-1)*spd.pitch) );
1450 // debug_dst += spd.pitch*spd.h;
1451 // XY_DO_ONCE( xy_logger::write_file("G:\\a2_rt", debug_dst, (h2-1)*spd.pitch) );
1453 break;
// Planar, several colours, plain C.
1454 case 0*DM::SINGLE_COLOR | 0*DM::SSE2 | DM::AYUV_PLANAR :
1456 unsigned char* dst_A = bitmap->plans[0] + dst_offset;
1457 unsigned char* dst_Y = bitmap->plans[1] + dst_offset;
1458 unsigned char* dst_U = bitmap->plans[2] + dst_offset;
1459 unsigned char* dst_V = bitmap->plans[3] + dst_offset;
1460 while(h--)
1462 const DWORD *sw = switchpts;
1463 for(int wt=0; wt<w; ++wt)
1465 if(wt+xo >= sw[1]) {while(wt+xo >= sw[1]) sw += 2; color = sw[-2];}
1466 DWORD temp = COMBINE_AYUV(dst_A[wt], dst_Y[wt], dst_U[wt], dst_V[wt]);
// NOTE(review): this is the only path that scales s[wt] by the colour's
// alpha byte before mixing; every other path passes s[wt] unchanged.
// Confirm whether the extra scaling is intended.
1467 pixmix(&temp, color, (s[wt]*(color>>24))>>8);
1468 SPLIT_AYUV(temp, dst_A+wt, dst_Y+wt, dst_U+wt, dst_V+wt);
1470 s += overlayPitch;
1471 dst_A += bitmap->pitch;
1472 dst_Y += bitmap->pitch;
1473 dst_U += bitmap->pitch;
1474 dst_V += bitmap->pitch;
1477 break;
1479 // Remember to EMMS!
1480 // Rendering fails in funny ways if we don't do this.
1481 _mm_empty();
1482 return;
1485 void Rasterizer::FillSolidRect(SubPicDesc& spd, int x, int y, int nWidth, int nHeight, DWORD argb)
1487 bool fSSE2 = !!(g_cpuid.m_flags & CCpuID::sse2);
1488 bool AYUV_PLANAR = (spd.type==MSP_AYUV_PLANAR);
1489 int draw_method = 0;
1490 if(fSSE2)
1491 draw_method |= DM::SSE2;
1492 if(AYUV_PLANAR)
1493 draw_method |= DM::AYUV_PLANAR;
1495 switch (draw_method)
1497 case DM::SSE2 | 0*DM::AYUV_PLANAR :
1499 for (int wy=y; wy<y+nHeight; wy++) {
1500 DWORD* dst = (DWORD*)((BYTE*)spd.bits + spd.pitch * wy) + x;
1501 for(int wt=0; wt<nWidth; ++wt) {
1502 pixmix_sse2(&dst[wt], argb, argb>>24);
1506 break;
1507 case 0*DM::SSE2 | 0*DM::AYUV_PLANAR :
1509 for (int wy=y; wy<y+nHeight; wy++) {
1510 DWORD* dst = (DWORD*)((BYTE*)spd.bits + spd.pitch * wy) + x;
1511 for(int wt=0; wt<nWidth; ++wt) {
1512 pixmix(&dst[wt], argb, argb>>24);
1516 break;
1517 case DM::SSE2 | DM::AYUV_PLANAR :
1519 BYTE* dst = reinterpret_cast<BYTE*>(spd.bits) + spd.pitch * y + x;
1520 BYTE* dst_A = dst;
1521 BYTE* dst_Y = dst_A + spd.pitch*spd.h;
1522 BYTE* dst_U = dst_Y + spd.pitch*spd.h;
1523 BYTE* dst_V = dst_U + spd.pitch*spd.h;
1524 AlphaBlt(dst_Y, argb>>24, ((argb)>>16)&0xff, nHeight, nWidth, spd.pitch);
1525 AlphaBlt(dst_U, argb>>24, ((argb)>>8)&0xff, nHeight, nWidth, spd.pitch);
1526 AlphaBlt(dst_V, argb>>24, ((argb))&0xff, nHeight, nWidth, spd.pitch);
1527 AlphaBlt(dst_A, argb>>24, 0, nHeight, nWidth, spd.pitch);
1529 break;
1530 case 0*DM::SSE2 | DM::AYUV_PLANAR :
1532 BYTE* dst = reinterpret_cast<BYTE*>(spd.bits) + spd.pitch * y + x;
1533 BYTE* dst_A = dst;
1534 BYTE* dst_Y = dst_A + spd.pitch*spd.h;
1535 BYTE* dst_U = dst_Y + spd.pitch*spd.h;
1536 BYTE* dst_V = dst_U + spd.pitch*spd.h;
1537 AlphaBltC(dst_Y, argb>>24, ((argb)>>16)&0xff, nHeight, nWidth, spd.pitch);
1538 AlphaBltC(dst_U, argb>>24, ((argb)>>8)&0xff, nHeight, nWidth, spd.pitch);
1539 AlphaBltC(dst_V, argb>>24, ((argb))&0xff, nHeight, nWidth, spd.pitch);
1540 AlphaBltC(dst_A, argb>>24, 0, nHeight, nWidth, spd.pitch);
1542 break;
1544 _mm_empty();
1547 ///////////////////////////////////////////////////////////////
1549 // Overlay
// Fills the w x h region at column x, row y of outputAlphaMask (row stride
// mOverlayPitch) with alpha values derived from the overlay coverage.
//
// pBody/pBorder - coverage buffers with the same layout as the output; either
//                 may be NULL. With both present the source value is the
//                 saturating difference border-body, otherwise the one buffer
//                 that is present is used.
// pAlphaMask    - optional mask, indexed from column 0 of the region with row
//                 stride `pitch`; NULL means no mask.
// color_alpha   - alpha factor applied to every pixel.
//
// Result per pixel: (src * color_alpha) >> 6 without a mask and
// (src * mask * color_alpha) >> 12 with one.
// NOTE(review): MMX intrinsics are used and no _mm_empty() is issued in this
// function -- confirm that callers take care of it.
1551 void Overlay::_DoFillAlphaMash(byte* outputAlphaMask, const byte* pBody, const byte* pBorder, int x, int y, int w, int h,
1552 const byte* pAlphaMask, int pitch, DWORD color_alpha )
1554 pBody = pBody!=NULL ? pBody + y*mOverlayPitch + x: NULL;
1555 pBorder = pBorder!=NULL ? pBorder + y*mOverlayPitch + x: NULL;
1556 byte* dst = outputAlphaMask + y*mOverlayPitch + x;
// Each row is processed in five pieces: single bytes up to the first 4-byte
// boundary of dst (x0), 4-byte groups up to the first 16-byte boundary (x00),
// 16-byte groups up to the last 16-byte boundary (x_end00), 4-byte groups up
// to the last 4-byte boundary (x_end0) and single bytes up to w (x_end).
// The boundaries are computed once from the first row's dst.
// NOTE(review): this presumably relies on mOverlayPitch being a multiple of
// 16 so that every row has the same alignment -- confirm. The pointer is
// also cast to int, which assumes 32-bit pointers.
1558 const int x0 = ((reinterpret_cast<int>(dst)+3)&~3) - reinterpret_cast<int>(dst) < w ?
1559 ((reinterpret_cast<int>(dst)+3)&~3) - reinterpret_cast<int>(dst) : w; //IMPORTANT! Should not exceed w.
1560 const int x00 = ((reinterpret_cast<int>(dst)+15)&~15) - reinterpret_cast<int>(dst) < w ?
1561 ((reinterpret_cast<int>(dst)+15)&~15) - reinterpret_cast<int>(dst) : w;//IMPORTANT! Should not exceed w.
1562 const int x_end00 = ((reinterpret_cast<int>(dst)+w)&~15) - reinterpret_cast<int>(dst);
1563 const int x_end0 = ((reinterpret_cast<int>(dst)+w)&~3) - reinterpret_cast<int>(dst);
1564 const int x_end = w;
// color_alpha replicated into every 16-bit lane for the MMX and SSE2 loops.
1566 __m64 color_alpha_64 = _mm_set1_pi16(color_alpha);
1567 __m128i color_alpha_128 = _mm_set1_epi16(color_alpha);
// Case 1: border minus body, no mask.
1569 if(pAlphaMask==NULL && pBody!=NULL && pBorder!=NULL)
// NOTE(review): XMM3 is loaded here but the intrinsic code below never names
// it explicitly; this looks like a leftover -- confirm before removing.
1572 __asm
1574 mov eax, color_alpha
1575 movd XMM3, eax
1576 punpcklwd XMM3, XMM3
1577 pshufd XMM3, XMM3, 0
1580 while(h--)
1582 int j=0;
1583 for( ; j<x0; j++ )
1585 int temp = pBorder[j]-pBody[j];
1586 temp = temp<0 ? 0 : temp;
1587 dst[j] = (temp * color_alpha)>>6;
1589 for( ;j<x00;j+=4 )
1591 __m64 border = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBorder+j));
1592 __m64 body = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBody+j));
// Saturating subtract: the vector form of clamping border-body at zero.
1593 border = _mm_subs_pu8(border, body);
1594 __m64 zero = _mm_setzero_si64();
1595 border = _mm_unpacklo_pi8(border, zero);
1596 border = _mm_mullo_pi16(border, color_alpha_64);
1597 border = _mm_srli_pi16(border, 6);
1598 border = _mm_packs_pu16(border,border);
1599 *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(border);
1601 __m128i zero = _mm_setzero_si128();
1602 for( ;j<x_end00;j+=16)
1604 __m128i border = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pBorder+j));
1605 __m128i body = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pBody+j));
1606 border = _mm_subs_epu8(border,body);
1607 __m128i srchi = border;
// Widen the low and high eight bytes to 16-bit lanes, scale, shift, repack.
1608 border = _mm_unpacklo_epi8(border, zero);
1609 srchi = _mm_unpackhi_epi8(srchi, zero);
1610 border = _mm_mullo_epi16(border, color_alpha_128);
1611 srchi = _mm_mullo_epi16(srchi, color_alpha_128);
1612 border = _mm_srli_epi16(border, 6);
1613 srchi = _mm_srli_epi16(srchi, 6);
1614 border = _mm_packus_epi16(border, srchi);
1615 _mm_storeu_si128(reinterpret_cast<__m128i*>(dst+j), border);
1617 for( ;j<x_end0;j+=4)
1619 __m64 border = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBorder+j));
1620 __m64 body = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBody+j));
1621 border = _mm_subs_pu8(border, body);
1622 __m64 zero = _mm_setzero_si64();
1623 border = _mm_unpacklo_pi8(border, zero);
1624 border = _mm_mullo_pi16(border, color_alpha_64);
1625 border = _mm_srli_pi16(border, 6);
1626 border = _mm_packs_pu16(border,border);
1627 *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(border);
1629 for( ;j<x_end;j++)
1631 int temp = pBorder[j]-pBody[j];
1632 temp = temp<0 ? 0 : temp;
1633 dst[j] = (temp * color_alpha)>>6;
1635 pBody += mOverlayPitch;
1636 pBorder += mOverlayPitch;
1637 //pAlphaMask += pitch;
1638 dst += mOverlayPitch;
// Case 2: exactly one of body/border, no mask.
1641 else if( ((pBody==NULL) + (pBorder==NULL))==1 && pAlphaMask==NULL)
1643 const BYTE* src1 = pBody!=NULL ? pBody : pBorder;
1644 while(h--)
1646 int j=0;
1647 for( ; j<x0; j++ )
1649 dst[j] = (src1[j] * color_alpha)>>6;
1651 for( ;j<x00;j+=4 )
1653 __m64 src = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(src1+j));
1654 __m64 zero = _mm_setzero_si64();
1655 src = _mm_unpacklo_pi8(src, zero);
1656 src = _mm_mullo_pi16(src, color_alpha_64);
1657 src = _mm_srli_pi16(src, 6);
1658 src = _mm_packs_pu16(src,src);
1659 *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(src);
1661 __m128i zero = _mm_setzero_si128();
1662 for( ;j<x_end00;j+=16)
1664 __m128i src = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src1+j));
1665 __m128i srchi = src;
1666 src = _mm_unpacklo_epi8(src, zero);
1667 srchi = _mm_unpackhi_epi8(srchi, zero);
1668 src = _mm_mullo_epi16(src, color_alpha_128);
1669 srchi = _mm_mullo_epi16(srchi, color_alpha_128);
1670 src = _mm_srli_epi16(src, 6);
1671 srchi = _mm_srli_epi16(srchi, 6);
1672 src = _mm_packus_epi16(src, srchi);
1673 _mm_storeu_si128(reinterpret_cast<__m128i*>(dst+j), src);
1675 for( ;j<x_end0;j+=4)
1677 __m64 src = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(src1+j));
1678 __m64 zero = _mm_setzero_si64();
1679 src = _mm_unpacklo_pi8(src, zero);
1680 src = _mm_mullo_pi16(src, color_alpha_64);
1681 src = _mm_srli_pi16(src, 6);
1682 src = _mm_packs_pu16(src,src);
1683 *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(src);
1685 for( ;j<x_end;j++)
1687 dst[j] = (src1[j] * color_alpha)>>6;
1689 src1 += mOverlayPitch;
1690 //pAlphaMask += pitch;
1691 dst += mOverlayPitch;
// Case 3: exactly one of body/border, with mask.
1694 else if( ((pBody==NULL) + (pBorder==NULL))==1 && pAlphaMask!=NULL)
1696 const BYTE* src1 = pBody!=NULL ? pBody : pBorder;
1697 while(h--)
1699 int j=0;
1700 for( ; j<x0; j++ )
1702 dst[j] = (src1[j] * pAlphaMask[j] * color_alpha)>>12;
1704 for( ;j<x00;j+=4 )
1706 __m64 src = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(src1+j));
1707 __m64 mask = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pAlphaMask+j));
1708 __m64 zero = _mm_setzero_si64();
1709 src = _mm_unpacklo_pi8(src, zero);
1710 src = _mm_mullo_pi16(src, color_alpha_64);
// The mask byte is placed in the HIGH byte of each 16-bit lane (mask<<8), so
// the high-half multiply yields (src*alpha*mask)>>8; the remaining shift of
// 12+8-16 = 4 bits completes the >>12 of the scalar code.
// NOTE(review): this MMX path uses the signed _mm_mulhi_pi16 whereas the
// SSE2 path uses the unsigned _mm_mulhi_epu16; the results only agree while
// mask<<8 stays below 0x8000 -- confirm the mask's value range.
1711 mask = _mm_unpacklo_pi8(zero, mask); //important!
1712 src = _mm_mulhi_pi16(src, mask); //important!
1713 src = _mm_srli_pi16(src, 12+8-16); //important!
1714 src = _mm_packs_pu16(src,src);
1715 *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(src);
1717 __m128i zero = _mm_setzero_si128();
1718 for( ;j<x_end00;j+=16)
1720 __m128i src = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src1+j));
1721 __m128i mask = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pAlphaMask+j));
1722 __m128i srchi = src;
1723 __m128i maskhi = mask;
1724 src = _mm_unpacklo_epi8(src, zero);
1725 srchi = _mm_unpackhi_epi8(srchi, zero);
1726 mask = _mm_unpacklo_epi8(zero, mask); //important!
1727 maskhi = _mm_unpackhi_epi8(zero, maskhi);
1728 src = _mm_mullo_epi16(src, color_alpha_128);
1729 srchi = _mm_mullo_epi16(srchi, color_alpha_128);
1730 src = _mm_mulhi_epu16(src, mask); //important!
1731 srchi = _mm_mulhi_epu16(srchi, maskhi);
1732 src = _mm_srli_epi16(src, 12+8-16); //important!
1733 srchi = _mm_srli_epi16(srchi, 12+8-16);
1734 src = _mm_packus_epi16(src, srchi);
1735 _mm_storeu_si128(reinterpret_cast<__m128i*>(dst+j), src);
1737 for( ;j<x_end0;j+=4)
1739 __m64 src = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(src1+j));
1740 __m64 mask = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pAlphaMask+j));
1741 __m64 zero = _mm_setzero_si64();
1742 src = _mm_unpacklo_pi8(src, zero);
1743 src = _mm_mullo_pi16(src, color_alpha_64);
1744 mask = _mm_unpacklo_pi8(zero, mask); //important!
1745 src = _mm_mulhi_pi16(src, mask); //important!
1746 src = _mm_srli_pi16(src, 12+8-16); //important!
1747 src = _mm_packs_pu16(src,src);
1748 *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(src);
1750 for( ;j<x_end;j++)
1752 dst[j] = (src1[j] * pAlphaMask[j] * color_alpha)>>12;
1754 src1 += mOverlayPitch;
1755 pAlphaMask += pitch;
1756 dst += mOverlayPitch;
// Case 4: border minus body, with mask.
1759 else if( pAlphaMask!=NULL && pBody!=NULL && pBorder!=NULL )
1761 while(h--)
1763 int j=0;
1764 for( ; j<x0; j++ )
1766 int temp = pBorder[j]-pBody[j];
1767 temp = temp<0 ? 0 : temp;
1768 dst[j] = (temp * pAlphaMask[j] * color_alpha)>>12;
1770 for( ;j<x00;j+=4 )
1772 __m64 border = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBorder+j));
1773 __m64 body = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBody+j));
1774 border = _mm_subs_pu8(border, body);
1775 __m64 mask = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pAlphaMask+j));
1776 __m64 zero = _mm_setzero_si64();
1777 border = _mm_unpacklo_pi8(border, zero);
1778 border = _mm_mullo_pi16(border, color_alpha_64);
1779 mask = _mm_unpacklo_pi8(zero, mask); //important!
1780 border = _mm_mulhi_pi16(border, mask); //important!
1781 border = _mm_srli_pi16(border, 12+8-16); //important!
1782 border = _mm_packs_pu16(border,border);
1783 *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(border);
1785 __m128i zero = _mm_setzero_si128();
1786 for( ;j<x_end00;j+=16)
1788 __m128i border = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pBorder+j));
1789 __m128i body = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pBody+j));
1790 border = _mm_subs_epu8(border,body);
1792 __m128i mask = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pAlphaMask+j));
1793 __m128i srchi = border;
1794 __m128i maskhi = mask;
1795 border = _mm_unpacklo_epi8(border, zero);
1796 srchi = _mm_unpackhi_epi8(srchi, zero);
1797 mask = _mm_unpacklo_epi8(zero, mask); //important!
1798 maskhi = _mm_unpackhi_epi8(zero, maskhi);
1799 border = _mm_mullo_epi16(border, color_alpha_128);
1800 srchi = _mm_mullo_epi16(srchi, color_alpha_128);
1801 border = _mm_mulhi_epu16(border, mask); //important!
1802 srchi = _mm_mulhi_epu16(srchi, maskhi);
1803 border = _mm_srli_epi16(border, 12+8-16); //important!
1804 srchi = _mm_srli_epi16(srchi, 12+8-16);
1805 border = _mm_packus_epi16(border, srchi);
1806 _mm_storeu_si128(reinterpret_cast<__m128i*>(dst+j), border);
1808 for( ;j<x_end0;j+=4)
1810 __m64 border = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBorder+j));
1811 __m64 body = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBody+j));
1812 border = _mm_subs_pu8(border, body);
1813 __m64 mask = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pAlphaMask+j));
1814 __m64 zero = _mm_setzero_si64();
1815 border = _mm_unpacklo_pi8(border, zero);
1816 border = _mm_mullo_pi16(border, color_alpha_64);
1817 mask = _mm_unpacklo_pi8(zero, mask); //important!
1818 border = _mm_mulhi_pi16(border, mask); //important!
1819 border = _mm_srli_pi16(border, 12+8-16); //important!
1820 border = _mm_packs_pu16(border,border);
1821 *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(border);
1823 for( ;j<x_end;j++)
1825 int temp = pBorder[j]-pBody[j];
1826 temp = temp<0 ? 0 : temp;
1827 dst[j] = (temp * pAlphaMask[j] * color_alpha)>>12;
1829 pBody += mOverlayPitch;
1830 pBorder += mOverlayPitch;
1831 pAlphaMask += pitch;
1832 dst += mOverlayPitch;
// Fallback: neither body nor border was supplied. Asserts and clears the region.
1835 else
1837 //should NOT happen!
1838 ASSERT(0);
1839 while(h--)
1841 for(int j=0;j<x_end;j++)
1843 dst[j] = 0;
1845 dst += mOverlayPitch;
1850 void Overlay::FillAlphaMash( byte* outputAlphaMask, bool fBody, bool fBorder, int x, int y, int w, int h, const byte* pAlphaMask, int pitch, DWORD color_alpha)
1852 if(!fBorder && fBody && pAlphaMask==NULL)
1854 _DoFillAlphaMash(outputAlphaMask, mBody.get(), NULL, x, y, w, h, pAlphaMask, pitch, color_alpha);
1856 else if(/*fBorder &&*/ fBody && pAlphaMask==NULL)
1858 _DoFillAlphaMash(outputAlphaMask, NULL, mBorder.get(), x, y, w, h, pAlphaMask, pitch, color_alpha);
1860 else if(!fBody && fBorder /* pAlphaMask==NULL or not*/)
1862 _DoFillAlphaMash(outputAlphaMask, mBody.get(), mBorder.get(), x, y, w, h, pAlphaMask, pitch, color_alpha);
1864 else if(!fBorder && fBody && pAlphaMask!=NULL)
1866 _DoFillAlphaMash(outputAlphaMask, mBody.get(), NULL, x, y, w, h, pAlphaMask, pitch, color_alpha);
1868 else if(fBorder && fBody && pAlphaMask!=NULL)
1870 _DoFillAlphaMash(outputAlphaMask, NULL, mBorder.get(), x, y, w, h, pAlphaMask, pitch, color_alpha);
1872 else
1874 //should NOT happen
1875 ASSERT(0);
1879 Overlay* Overlay::GetSubpixelVariance(unsigned int xshift, unsigned int yshift)
1881 Overlay* overlay = new Overlay();
1882 if(!overlay)
1884 return NULL;
1886 xshift &= 7;
1887 yshift &= 7;
1889 overlay->mOffsetX = mOffsetX - xshift;
1890 overlay->mOffsetY = mOffsetY - yshift;
1891 overlay->mWidth = mWidth + xshift;
1892 overlay->mHeight = mHeight + yshift;
1894 overlay->mOverlayWidth = ((overlay->mWidth+7)>>3) + 1;
1895 overlay->mOverlayHeight = ((overlay->mHeight + 7)>>3) + 1;
1896 overlay->mOverlayPitch = (overlay->mOverlayWidth+15)&~15;
1899 overlay->mfWideOutlineEmpty = mfWideOutlineEmpty;
1901 if (overlay->mOverlayPitch * overlay->mOverlayHeight<=0)
1903 return NULL;
1906 BYTE* body = reinterpret_cast<BYTE*>(xy_malloc(overlay->mOverlayPitch * overlay->mOverlayHeight));
1907 if( body==NULL )
1909 return NULL;
1911 overlay->mBody.reset(body, xy_free);
1912 BYTE* border = NULL;
1913 if (!overlay->mfWideOutlineEmpty)
1915 border = reinterpret_cast<BYTE*>(xy_malloc(overlay->mOverlayPitch * overlay->mOverlayHeight));
1916 if (border==NULL)
1918 return NULL;
1920 overlay->mBorder.reset(border, xy_free);
1923 if(overlay->mOverlayPitch==mOverlayPitch && overlay->mOverlayHeight>=mOverlayHeight)
1925 if (body && mBody)
1927 memcpy(body, mBody.get(), mOverlayPitch * mOverlayHeight);
1928 memset(body+mOverlayPitch*mOverlayHeight, 0, mOverlayPitch * (overlay->mOverlayHeight-mOverlayHeight));
1930 else if ( (!!body)!=(!!mBody)/*==NULL*/)
1932 return NULL;
1935 if (border && mBorder)
1937 memcpy(border, mBorder.get(), mOverlayPitch * mOverlayHeight);
1938 memset(border+mOverlayPitch*mOverlayHeight, 0, mOverlayPitch * (overlay->mOverlayHeight-mOverlayHeight));
1940 else if ( (!!border)!=(!!mBorder)/*==NULL*/ )
1942 return NULL;
1945 else
1947 memset(body, 0, overlay->mOverlayPitch * overlay->mOverlayHeight);
1948 byte* dst = body;
1949 const byte* src = mBody.get();
1950 for (int i=0;i<mOverlayHeight;i++)
1952 memcpy(dst, src, mOverlayPitch);
1953 dst += overlay->mOverlayPitch;
1954 src += mOverlayPitch;
1956 if (!overlay->mfWideOutlineEmpty)
1958 ASSERT(border && mBorder);
1959 memset(border, 0, overlay->mOverlayPitch * overlay->mOverlayHeight);
1960 dst = border;
1961 src = mBorder.get();
1962 for (int i=0;i<mOverlayHeight;i++)
1964 memcpy(dst, src, mOverlayPitch);
1965 dst += overlay->mOverlayPitch;
1966 src += mOverlayPitch;
1970 //not equal
1971 // Bilinear(overlay->mpOverlayBuffer.base, overlay->mOverlayWidth, 2*overlay->mOverlayHeight, overlay->mOverlayPitch, xshift, yshift);
1972 Bilinear(body, overlay->mOverlayWidth, overlay->mOverlayHeight, overlay->mOverlayPitch, xshift, yshift);
1973 if (!overlay->mfWideOutlineEmpty)
1975 Bilinear(border, overlay->mOverlayWidth, overlay->mOverlayHeight, overlay->mOverlayPitch, xshift, yshift);
1977 return overlay;
1980 ///////////////////////////////////////////////////////////////
1982 // PathData
// Default constructor: starts with no type buffer, no point buffer and a
// point count of zero.
1984 PathData::PathData():mpPathTypes(NULL), mpPathPoints(NULL), mPathPoints(0)
1988 PathData::PathData( const PathData& src ):mpPathTypes(NULL), mpPathPoints(NULL), mPathPoints(src.mPathPoints)
1990 //TODO: deal with the case that src.mPathPoints<0
1991 if(mPathPoints>0)
1993 mpPathTypes = static_cast<BYTE*>(malloc(mPathPoints * sizeof(BYTE)));
1994 mpPathPoints = static_cast<POINT*>(malloc(mPathPoints * sizeof(POINT)));
1996 if(mPathPoints>0)
1998 memcpy(mpPathTypes, src.mpPathTypes, mPathPoints*sizeof(BYTE));
1999 memcpy(mpPathPoints, src.mpPathPoints, mPathPoints*sizeof(POINT));
2003 const PathData& PathData::operator=( const PathData& src )
2005 if(this!=&src)
2007 if(mPathPoints!=src.mPathPoints && src.mPathPoints>0)
2009 _TrashPath();
2010 mPathPoints = src.mPathPoints;
2011 mpPathTypes = static_cast<BYTE*>(malloc(mPathPoints * sizeof(BYTE)));
2012 mpPathPoints = static_cast<POINT*>(malloc(mPathPoints * sizeof(POINT)));//better than realloc
2014 if(src.mPathPoints>0)
2016 memcpy(mpPathTypes, src.mpPathTypes, mPathPoints*sizeof(BYTE));
2017 memcpy(mpPathPoints, src.mpPathPoints, mPathPoints*sizeof(POINT));
2020 return *this;
2023 PathData::~PathData()
2025 _TrashPath();
2028 bool PathData::operator==( const PathData& rhs ) const
2030 return (this==&rhs) || (
2031 mPathPoints==rhs.mPathPoints
2032 && !memcmp(mpPathTypes, rhs.mpPathTypes, mPathPoints * sizeof(BYTE) )
2033 && !memcmp(mpPathPoints, rhs.mpPathPoints, mPathPoints * sizeof(POINT) )
2037 void PathData::_TrashPath()
2039 if (mpPathTypes)
2041 free(mpPathTypes);
2042 mpPathTypes = NULL;
2044 if (mpPathPoints)
2046 free(mpPathPoints);
2047 mpPathPoints = NULL;
2049 mPathPoints = 0;
2052 bool PathData::BeginPath(HDC hdc)
2054 _TrashPath();
2055 return !!::BeginPath(hdc);
2058 bool PathData::EndPath(HDC hdc)
2060 ::CloseFigure(hdc);
2061 if(::EndPath(hdc))
2063 mPathPoints = GetPath(hdc, NULL, NULL, 0);
2064 if(!mPathPoints)
2065 return true;
2066 mpPathTypes = (BYTE*)malloc(sizeof(BYTE) * mPathPoints);
2067 mpPathPoints = (POINT*)malloc(sizeof(POINT) * mPathPoints);
2068 if(mPathPoints == GetPath(hdc, mpPathPoints, mpPathTypes, mPathPoints))
2069 return true;
2071 ::AbortPath(hdc);
2072 return false;
2075 bool PathData::PartialBeginPath(HDC hdc, bool bClearPath)
2077 if(bClearPath)
2078 _TrashPath();
2079 return !!::BeginPath(hdc);
2082 bool PathData::PartialEndPath(HDC hdc, long dx, long dy)
2084 ::CloseFigure(hdc);
2085 if(::EndPath(hdc))
2087 int nPoints;
2088 BYTE* pNewTypes;
2089 POINT* pNewPoints;
2090 nPoints = GetPath(hdc, NULL, NULL, 0);
2091 if(!nPoints)
2092 return true;
2093 pNewTypes = (BYTE*)realloc(mpPathTypes, (mPathPoints + nPoints) * sizeof(BYTE));
2094 pNewPoints = (POINT*)realloc(mpPathPoints, (mPathPoints + nPoints) * sizeof(POINT));
2095 if(pNewTypes)
2096 mpPathTypes = pNewTypes;
2097 if(pNewPoints)
2098 mpPathPoints = pNewPoints;
2099 BYTE* pTypes = new BYTE[nPoints];
2100 POINT* pPoints = new POINT[nPoints];
2101 if(pNewTypes && pNewPoints && nPoints == GetPath(hdc, pPoints, pTypes, nPoints))
2103 for(int i = 0; i < nPoints; ++i)
2105 mpPathPoints[mPathPoints + i].x = pPoints[i].x + dx;
2106 mpPathPoints[mPathPoints + i].y = pPoints[i].y + dy;
2107 mpPathTypes[mPathPoints + i] = pTypes[i];
2109 mPathPoints += nPoints;
2110 delete[] pTypes;
2111 delete[] pPoints;
2112 return true;
2114 else
2115 DebugBreak();
2116 delete[] pTypes;
2117 delete[] pPoints;
2119 ::AbortPath(hdc);
2120 return false;
2123 void PathData::AlignLeftTop(CPoint *left_top, CSize *size)
2125 int minx = INT_MAX;
2126 int miny = INT_MAX;
2127 int maxx = INT_MIN;
2128 int maxy = INT_MIN;
2129 for(int i=0; i<mPathPoints; ++i)
2131 int ix = mpPathPoints[i].x;
2132 int iy = mpPathPoints[i].y;
2133 if(ix < minx) minx = ix;
2134 if(ix > maxx) maxx = ix;
2135 if(iy < miny) miny = iy;
2136 if(iy > maxy) maxy = iy;
2138 if(minx > maxx || miny > maxy)
2140 _TrashPath();
2141 *left_top = CPoint(0, 0);
2142 *size = CSize(0, 0);
2143 return;
2145 minx = (minx >> 3) & ~7;
2146 miny = (miny >> 3) & ~7;
2147 maxx = (maxx + 7) >> 3;
2148 maxy = (maxy + 7) >> 3;
2149 for(int i=0; i<mPathPoints; ++i)
2151 mpPathPoints[i].x -= minx*8;
2152 mpPathPoints[i].y -= miny*8;
2154 *left_top = CPoint(minx, miny);
2155 *size = CSize(maxx+1-minx, maxy+1-miny);
2156 return;
2159 //////////////////////////////////////////////////////////////////////////
2161 // ScanLineData
2163 ScanLineData::ScanLineData()
2167 ScanLineData::~ScanLineData()
2171 void ScanLineData::_ReallocEdgeBuffer(int edges)
2173 mEdgeHeapSize = edges;
2174 mpEdgeBuffer = (Edge*)realloc(mpEdgeBuffer, sizeof(Edge)*edges);
2177 void ScanLineData::_EvaluateBezier(const PathData& path_data, int ptbase, bool fBSpline)
2179 const POINT* pt0 = path_data.mpPathPoints + ptbase;
2180 const POINT* pt1 = path_data.mpPathPoints + ptbase + 1;
2181 const POINT* pt2 = path_data.mpPathPoints + ptbase + 2;
2182 const POINT* pt3 = path_data.mpPathPoints + ptbase + 3;
2183 double x0 = pt0->x;
2184 double x1 = pt1->x;
2185 double x2 = pt2->x;
2186 double x3 = pt3->x;
2187 double y0 = pt0->y;
2188 double y1 = pt1->y;
2189 double y2 = pt2->y;
2190 double y3 = pt3->y;
2191 double cx3, cx2, cx1, cx0, cy3, cy2, cy1, cy0;
2192 if(fBSpline)
2194 // 1 [-1 +3 -3 +1]
2195 // - * [+3 -6 +3 0]
2196 // 6 [-3 0 +3 0]
2197 // [+1 +4 +1 0]
2198 double _1div6 = 1.0/6.0;
2199 cx3 = _1div6*(- x0+3*x1-3*x2+x3);
2200 cx2 = _1div6*( 3*x0-6*x1+3*x2);
2201 cx1 = _1div6*(-3*x0 +3*x2);
2202 cx0 = _1div6*( x0+4*x1+1*x2);
2203 cy3 = _1div6*(- y0+3*y1-3*y2+y3);
2204 cy2 = _1div6*( 3*y0-6*y1+3*y2);
2205 cy1 = _1div6*(-3*y0 +3*y2);
2206 cy0 = _1div6*( y0+4*y1+1*y2);
2208 else // bezier
2210 // [-1 +3 -3 +1]
2211 // [+3 -6 +3 0]
2212 // [-3 +3 0 0]
2213 // [+1 0 0 0]
2214 cx3 = - x0+3*x1-3*x2+x3;
2215 cx2 = 3*x0-6*x1+3*x2;
2216 cx1 = -3*x0+3*x1;
2217 cx0 = x0;
2218 cy3 = - y0+3*y1-3*y2+y3;
2219 cy2 = 3*y0-6*y1+3*y2;
2220 cy1 = -3*y0+3*y1;
2221 cy0 = y0;
2224 // This equation is from Graphics Gems I.
2226 // The idea is that since we're approximating a cubic curve with lines,
2227 // any error we incur is due to the curvature of the line, which we can
2228 // estimate by calculating the maximum acceleration of the curve. For
2229 // a cubic, the acceleration (second derivative) is a line, meaning that
2230 // the absolute maximum acceleration must occur at either the beginning
2231 // (|c2|) or the end (|c2+c3|). Our bounds here are a little more
2232 // conservative than that, but that's okay.
2234 // If the acceleration of the parametric formula is zero (c2 = c3 = 0),
2235 // that component of the curve is linear and does not incur any error.
2236 // If a=0 for both X and Y, the curve is a line segment and we can
2237 // use a step size of 1.
2238 double maxaccel1 = fabs(2*cy2) + fabs(6*cy3);
2239 double maxaccel2 = fabs(2*cx2) + fabs(6*cx3);
2240 double maxaccel = maxaccel1 > maxaccel2 ? maxaccel1 : maxaccel2;
2241 double h = 1.0;
2242 if(maxaccel > 8.0) h = sqrt(8.0 / maxaccel);
2243 if(!fFirstSet) {firstp.x = (LONG)cx0; firstp.y = (LONG)cy0; lastp = firstp; fFirstSet = true;}
2244 for(double t = 0; t < 1.0; t += h)
2246 double x = cx0 + t*(cx1 + t*(cx2 + t*cx3));
2247 double y = cy0 + t*(cy1 + t*(cy2 + t*cy3));
2248 _EvaluateLine(lastp.x, lastp.y, (int)x, (int)y);
2250 double x = cx0 + cx1 + cx2 + cx3;
2251 double y = cy0 + cy1 + cy2 + cy3;
2252 _EvaluateLine(lastp.x, lastp.y, (int)x, (int)y);
2255 void ScanLineData::_EvaluateLine(const PathData& path_data, int pt1idx, int pt2idx)
2257 const POINT* pt1 = path_data.mpPathPoints + pt1idx;
2258 const POINT* pt2 = path_data.mpPathPoints + pt2idx;
2259 _EvaluateLine(pt1->x, pt1->y, pt2->x, pt2->y);
2262 void ScanLineData::_EvaluateLine(int x0, int y0, int x1, int y1)
2264 if(lastp.x != x0 || lastp.y != y0)
2266 _EvaluateLine(lastp.x, lastp.y, x0, y0);
2268 if(!fFirstSet) {firstp.x = x0; firstp.y = y0; fFirstSet = true;}
2269 lastp.x = x1;
2270 lastp.y = y1;
2271 if(y1 > y0) // down
2273 __int64 xacc = (__int64)x0 << 13;
2274 // prestep y0 down
2275 int dy = y1 - y0;
2276 int y = ((y0 + 3)&~7) + 4;
2277 int iy = y >> 3;
2278 y1 = (y1 - 5) >> 3;
2279 if(iy <= y1)
2281 __int64 invslope = (__int64(x1 - x0) << 16) / dy;
2282 while(mEdgeNext + y1 + 1 - iy > mEdgeHeapSize)
2283 _ReallocEdgeBuffer(mEdgeHeapSize*2);
2284 xacc += (invslope * (y - y0)) >> 3;
2285 while(iy <= y1)
2287 int ix = (int)((xacc + 32768) >> 16);
2288 mpEdgeBuffer[mEdgeNext].next = mpScanBuffer[iy];
2289 mpEdgeBuffer[mEdgeNext].posandflag = ix*2 + 1;
2290 mpScanBuffer[iy] = mEdgeNext++;
2291 ++iy;
2292 xacc += invslope;
2296 else if(y1 < y0) // up
2298 __int64 xacc = (__int64)x1 << 13;
2299 // prestep y1 down
2300 int dy = y0 - y1;
2301 int y = ((y1 + 3)&~7) + 4;
2302 int iy = y >> 3;
2303 y0 = (y0 - 5) >> 3;
2304 if(iy <= y0)
2306 __int64 invslope = (__int64(x0 - x1) << 16) / dy;
2307 while(mEdgeNext + y0 + 1 - iy > mEdgeHeapSize)
2308 _ReallocEdgeBuffer(mEdgeHeapSize*2);
2309 xacc += (invslope * (y - y1)) >> 3;
2310 while(iy <= y0)
2312 int ix = (int)((xacc + 32768) >> 16);
2313 mpEdgeBuffer[mEdgeNext].next = mpScanBuffer[iy];
2314 mpEdgeBuffer[mEdgeNext].posandflag = ix*2;
2315 mpScanBuffer[iy] = mEdgeNext++;
2316 ++iy;
2317 xacc += invslope;
2323 bool ScanLineData::ScanConvert(const PathData& path_data, const CSize& size)
2325 int lastmoveto = -1;
2326 int i;
2327 // Drop any outlines we may have.
2328 mOutline.clear();
2329 // Determine bounding box
2330 if(!path_data.mPathPoints)
2332 mWidth = mHeight = 0;
2333 return false;
2335 mWidth = size.cx;
2336 mHeight = size.cy;
2337 // Initialize edge buffer. We use edge 0 as a sentinel.
2338 mEdgeNext = 1;
2339 mEdgeHeapSize = 2048;
2340 mpEdgeBuffer = (Edge*)malloc(sizeof(Edge)*mEdgeHeapSize);
2341 // Initialize scanline list.
2342 mpScanBuffer = new unsigned int[mHeight];
2343 memset(mpScanBuffer, 0, mHeight*sizeof(unsigned int));
2344 // Scan convert the outline. Yuck, Bezier curves....
2345 // Unfortunately, Windows 95/98 GDI has a bad habit of giving us text
2346 // paths with all but the first figure left open, so we can't rely
2347 // on the PT_CLOSEFIGURE flag being used appropriately.
2348 fFirstSet = false;
2349 firstp.x = firstp.y = 0;
2350 lastp.x = lastp.y = 0;
2351 for(i=0; i<path_data.mPathPoints; ++i)
2353 BYTE t = path_data.mpPathTypes[i] & ~PT_CLOSEFIGURE;
2354 switch(t)
2356 case PT_MOVETO:
2357 if(lastmoveto >= 0 && firstp != lastp)
2358 _EvaluateLine(lastp.x, lastp.y, firstp.x, firstp.y);
2359 lastmoveto = i;
2360 fFirstSet = false;
2361 lastp = path_data.mpPathPoints[i];
2362 break;
2363 case PT_MOVETONC:
2364 break;
2365 case PT_LINETO:
2366 if(path_data.mPathPoints - (i-1) >= 2) _EvaluateLine(path_data, i-1, i);
2367 break;
2368 case PT_BEZIERTO:
2369 if(path_data.mPathPoints - (i-1) >= 4) _EvaluateBezier(path_data, i-1, false);
2370 i += 2;
2371 break;
2372 case PT_BSPLINETO:
2373 if(path_data.mPathPoints - (i-1) >= 4) _EvaluateBezier(path_data, i-1, true);
2374 i += 2;
2375 break;
2376 case PT_BSPLINEPATCHTO:
2377 if(path_data.mPathPoints - (i-3) >= 4) _EvaluateBezier(path_data, i-3, true);
2378 break;
2381 if(lastmoveto >= 0 && firstp != lastp)
2382 _EvaluateLine(lastp.x, lastp.y, firstp.x, firstp.y);
2383 // Convert the edges to spans. We couldn't do this before because some of
2384 // the regions may have winding numbers >+1 and it would have been a pain
2385 // to try to adjust the spans on the fly. We use one heap to detangle
2386 // a scanline's worth of edges from the singly-linked lists, and another
2387 // to collect the actual scans.
2388 std::vector<int> heap;
2389 mOutline.reserve(mEdgeNext / 2);
2390 __int64 y = 0;
2391 for(y=0; y<mHeight; ++y)
2393 int count = 0;
2394 // Detangle scanline into edge heap.
2395 for(unsigned ptr = (unsigned)(mpScanBuffer[y]&0xffffffff); ptr; ptr = mpEdgeBuffer[ptr].next)
2397 heap.push_back(mpEdgeBuffer[ptr].posandflag);
2399 // Sort edge heap. Note that we conveniently made the opening edges
2400 // one more than closing edges at the same spot, so we won't have any
2401 // problems with abutting spans.
2402 std::sort(heap.begin(), heap.end()/*begin() + heap.size()*/);
2403 // Process edges and add spans. Since we only check for a non-zero
2404 // winding number, it doesn't matter which way the outlines go!
2405 std::vector<int>::iterator itX1 = heap.begin();
2406 std::vector<int>::iterator itX2 = heap.end(); // begin() + heap.size();
2407 int x1, x2;
2408 for(; itX1 != itX2; ++itX1)
2410 int x = *itX1;
2411 if(!count)
2412 x1 = (x>>1);
2413 if(x&1)
2414 ++count;
2415 else
2416 --count;
2417 if(!count)
2419 x2 = (x>>1);
2420 if(x2>x1)
2421 mOutline.push_back(std::pair<__int64,__int64>((y<<32)+x1+0x4000000040000000i64, (y<<32)+x2+0x4000000040000000i64)); // G: damn Avery, this is evil! :)
2424 heap.clear();
2426 // Dump the edge and scan buffers, since we no longer need them.
2427 free(mpEdgeBuffer);
2428 delete [] mpScanBuffer;
2429 // All done!
2430 return true;
2433 using namespace std;
2435 void ScanLineData::DeleteOutlines()
2437 mOutline.clear();
2440 bool ScanLineData2::CreateWidenedRegion(int rx, int ry)
2442 if(rx < 0) rx = 0;
2443 if(ry < 0) ry = 0;
2444 mWideBorder = max(rx,ry);
2445 mWideOutline.clear();
2447 const tSpanBuffer& out_line = m_scan_line_data->mOutline;
2448 if (ry > 0)
2450 // Do a half circle.
2451 // _OverlapRegion mirrors this so both halves are done.
2452 for(int y = -ry; y <= ry; ++y)
2454 int x = (int)(0.5 + sqrt(float(ry*ry - y*y)) * float(rx)/float(ry));
2455 OverlapRegion(mWideOutline, out_line, x, y);
2458 else if (ry == 0 && rx > 0)
2460 // There are artifacts if we don't make at least two overlaps of the line, even at same Y coord
2461 OverlapRegion(mWideOutline, out_line, rx, 0);
2462 OverlapRegion(mWideOutline, out_line, rx, 0);
2464 return true;