Split scan convert operation for a new cache. [Part 2]
[xy_vsfilter.git] / src / subtitles / Rasterizer.cpp
bloba7fc92c8ac1563c4c11d3eded0646f83be93f3bd
1 /*
2 * Copyright (C) 2003-2006 Gabest
3 * http://www.gabest.org
5 * This Program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2, or (at your option)
8 * any later version.
10 * This Program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with GNU Make; see the file COPYING. If not, write to
17 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
18 * http://www.gnu.org/copyleft/gpl.html
22 #include "stdafx.h"
23 #include <string.h>
24 #include <math.h>
25 #include <vector>
26 #include <algorithm>
27 #include "Rasterizer.h"
28 #include "SeparableFilter.h"
29 #include "xy_logger.h"
30 #include <boost/flyweight/key_value.hpp>
// Fallback definitions for the _MAX/_MIN helper macros.
// When _MAX is not already defined, _MAX/_MIN expand to (max)/(min) and the
// _IMPL_ variants to the bare names max/min; otherwise the _IMPL_ variants
// simply forward to the existing _MAX/_MIN.
32 #ifndef _MAX /* avoid collision with common (nonconforming) macros */
33 #define _MAX (max)
34 #define _MIN (min)
35 #define _IMPL_MAX max
36 #define _IMPL_MIN min
37 #else
38 #define _IMPL_MAX _MAX
39 #define _IMPL_MIN _MIN
40 #endif
// Packs four 8-bit channels into one value laid out as 0xAAYYUUVV.
//NOTE: signed or unsigned affects the result seriously
// (the alpha channel is cast to int on purpose, so the packed value is a signed int).
// Every argument is parenthesized so that expression arguments such as
// `cond ? u1 : u2` are evaluated before being OR-ed into the result.
#define COMBINE_AYUV(a, y, u, v) ((((((((int)(a))<<8)|(y))<<8)|(u))<<8)|(v))

// Unpacks a 0xAAYYUUVV value; a, y, u and v are pointers receiving one channel each.
#define SPLIT_AYUV(color, a, y, u, v) do { \
        *(v)=(color)&0xff; \
        *(u)=((color)>>8) &0xff; \
        *(y)=((color)>>16)&0xff;\
        *(a)=((color)>>24)&0xff;\
    } while(0)
// Gaussian kernel lookup tables for one blur strength (sigma).
// The object owns the g and gt2 buffers, which are released in the destructor.
// NOTE(review): no copy-assignment operator is declared in the visible source,
// so assigning one instance to another would share (and later double-free)
// the buffers -- confirm that no caller assigns instances.
class ass_synth_priv
{
public:
    // Number of fractional bits of the fixed-point kernel: the integer
    // coefficients are scaled so that they sum to roughly 1 << VOLUME_BITS.
    static const int VOLUME_BITS = 22;//should not exceed 32-8, and better not exceed 31-8

    ass_synth_priv(const double sigma);
    ass_synth_priv(const ass_synth_priv& priv);

    ~ass_synth_priv();
    // (Re)builds g and gt2 for the given sigma; returns 0 on success, -1 on allocation failure.
    int generate_tables(double sigma);

    int g_r;        // kernel radius (g_w / 2)
    int g_w;        // kernel width in taps

    unsigned *g;    // g_w kernel coefficients
    unsigned *gt2;  // 256 * g_w table: gt2[g_w * i + mx] == g[mx] * i

    double sigma;   // sigma the tables were generated for
};
73 struct ass_synth_priv_key
75 const double& operator()(const ass_synth_priv& x)const
77 return x.sigma;
// Scratch buffer of `size` unsigned values, owned by the object and released
// in its destructor.
struct ass_tmp_buf
{
public:
    ass_tmp_buf(size_t size);
    ass_tmp_buf(const ass_tmp_buf& buf);
    ~ass_tmp_buf();

    size_t size;    // number of unsigned elements requested
    unsigned *tmp;  // malloc'ed storage; NULL if the allocation failed
};
91 struct ass_tmp_buf_get_size
93 const size_t& operator()(const ass_tmp_buf& buf)const
95 return buf.size;
// File-local numeric constants; neither is referenced in the visible part of this file.
static const unsigned int maxcolor = 255;
static const unsigned base = 256;
102 ass_synth_priv::ass_synth_priv(const double sigma)
104 g_r = 0;
105 g_w = 0;
107 g = NULL;
108 gt2 = NULL;
110 this->sigma = 0;
111 generate_tables(sigma);
114 ass_synth_priv::ass_synth_priv(const ass_synth_priv& priv):g_r(priv.g_r),g_w(priv.g_w),sigma(priv.sigma)
116 if (this->g_w > 0 && this != &priv) {
117 this->g = (unsigned*)realloc(this->g, this->g_w * sizeof(unsigned));
118 this->gt2 = (unsigned*)realloc(this->gt2, 256 * this->g_w * sizeof(unsigned));
119 //if (this->g == null || this->gt2 == null) {
120 // return -1;
122 memcpy(g, priv.g, this->g_w * sizeof(unsigned));
123 memcpy(gt2, priv.gt2, 256 * this->g_w * sizeof(unsigned));
127 ass_synth_priv::~ass_synth_priv()
129 free(g); g=NULL;
130 free(gt2); gt2=NULL;
133 int ass_synth_priv::generate_tables(double sigma)
135 const int TARGET_VOLUME = 1<<VOLUME_BITS;
136 const int MAX_VOLUME_ERROR = VOLUME_BITS>=22 ? 16 : 1;
138 double a = -1 / (sigma * sigma * 2);
139 double exp_a = exp(a);
141 double volume_factor = 0;
142 double volume_start = 0, volume_end = 0;
143 unsigned volume;
145 if (this->sigma == sigma)
146 return 0;
147 else
148 this->sigma = sigma;
150 this->g_w = (int)ceil(sigma*3) | 1;
151 this->g_r = this->g_w / 2;
153 if (this->g_w > 0) {
154 this->g = (unsigned*)realloc(this->g, this->g_w * sizeof(unsigned));
155 this->gt2 = (unsigned*)realloc(this->gt2, 256 * this->g_w * sizeof(unsigned));
156 if (this->g == NULL || this->gt2 == NULL) {
157 return -1;
161 if (this->g_w > 0) {
162 volume_start = 0;
164 double exp_0 = 1.0;
165 double exp_1 = exp_a;
166 double exp_2 = exp_1 * exp_1;
167 volume_start += exp_0;
168 for(int i=0;i<this->g_r;++i)
170 exp_0 *= exp_1;
171 exp_1 *= exp_2;
172 volume_start += exp_0;
173 volume_start += exp_0;
175 //euqivalent:
176 // for (i = 0; i < this->g_w; ++i) {
177 // volume_start += exp(a * (i - this->g_r) * (i - this->g_r));
178 // }
180 volume_end = (TARGET_VOLUME+g_w)/volume_start;
181 volume_start = (TARGET_VOLUME-g_w)/volume_start;
183 volume = 0;
184 while( volume_start+0.000001<volume_end )
186 volume_factor = (volume_start+volume_end)*0.5;
187 volume = 0;
189 exp_0 = volume_factor;
190 exp_1 = exp_a;
191 exp_2 = exp_1 * exp_1;
193 volume = static_cast<int>(exp_0+.5);
194 this->g[this->g_r] = volume;
196 unsigned* p_left = this->g+this->g_r-1;
197 unsigned* p_right= this->g+this->g_r+1;
198 for(int i=0; i<this->g_r;++i,p_left--,p_right++)
200 exp_0 *= exp_1;
201 exp_1 *= exp_2;
202 *p_left = static_cast<int>(exp_0+.5);
203 *p_right = *p_left;
204 volume += (*p_left<<1);
206 //equivalent:
207 // for (i = 0; i < this->g_w; ++i) {
208 // this->g[i] = (unsigned) ( exp(a * (i - this->g_r) * (i - this->g_r))* volume_factor + .5 );
209 // volume += this->g[i];
210 // }
212 // volume don't have to be equal to TARGET_VOLUME,
213 // even if volume=TARGET_VOLUME+MAX_VOLUME_ERROR,
214 // max error introducing in later blur operation,
215 // which is (dot_product(g_w, pixel))/TARGET_VOLUME with pixel<256,
216 // would not exceed (MAX_VOLUME_ERROR*256)/TARGET_VOLUME,
217 // as long as MAX_VOLUME_ERROR/TARGET_VOLUME is small enough, error introduced would be kept in safe range
219 // NOTE: when it comes to rounding, no matter how small the error is,
220 // it may result a different rounding output
221 if( volume>=TARGET_VOLUME && volume< (TARGET_VOLUME+MAX_VOLUME_ERROR) )
222 break;
223 else if(volume < TARGET_VOLUME)
225 volume_start = volume_factor;
227 else if(volume >= TARGET_VOLUME+MAX_VOLUME_ERROR)
229 volume_end = volume_factor;
232 if(volume==0)
234 volume_factor = volume_end;
236 exp_0 = volume_factor;
237 exp_1 = exp_a;
238 exp_2 = exp_1 * exp_1;
240 volume = static_cast<int>(exp_0+.5);
241 this->g[this->g_r] = volume;
243 unsigned* p_left = this->g+this->g_r-1;
244 unsigned* p_right= this->g+this->g_r+1;
245 for(int i=0; i<this->g_r;++i,p_left--,p_right++)
247 exp_0 *= exp_1;
248 exp_1 *= exp_2;
249 *p_left = static_cast<int>(exp_0+.5);
250 *p_right = *p_left;
251 volume += (*p_left<<1);
253 //equivalent:
254 // for (i = 0; i < this->g_w; ++i) {
255 // this->g[i] = (unsigned) ( exp(a * (i - this->g_r) * (i - this->g_r))* volume_factor + .5 );
256 // volume += this->g[i];
257 // }
260 // gauss table:
261 for (int mx = 0; mx < this->g_w; mx++) {
262 int last_mul = 0;
263 unsigned *p_gt2 = this->gt2 + mx;
264 *p_gt2 = 0;
265 for (int i = 1; i < 256; i++) {
266 last_mul = last_mul+this->g[mx];
267 p_gt2 += this->g_w;
268 *p_gt2 = last_mul;
269 //equivalent:
270 // this->gt2[this->g_w * i+ mx] = this->g[mx] * i;
274 return 0;
277 ass_tmp_buf::ass_tmp_buf(size_t size)
279 tmp = (unsigned *)malloc(size * sizeof(unsigned));
280 this->size = size;
283 ass_tmp_buf::ass_tmp_buf(const ass_tmp_buf& buf)
284 :size(buf.size)
286 tmp = (unsigned *)malloc(size * sizeof(unsigned));
289 ass_tmp_buf::~ass_tmp_buf()
291 free(tmp);
295 * \brief Gaussian blur: a fast pure C implementation taken from libass.
// Blurs `buffer` (width x height bytes, rows `stride` bytes apart) in place.
//
// m2 is a lookup table indexed as m2[value * mwidth + tap]; r is the kernel
// radius and mwidth the kernel width. tmp2 is scratch space used as rows of
// (width + 1) unsigned values.
// NOTE(review): tmp2 presumably has to hold at least (width + 1) * height
// elements, and the vertical pass can address rows around that range --
// confirm the required size against the caller.
//
// Three stages:
//   1. horizontal: every non-zero source byte adds its table row into tmp2;
//   2. vertical: every non-zero tmp2 cell is scaled back to a table index and
//      its table row is added down the column, one cell to the left of the
//      cell it was read from;
//   3. tmp2 >> VOLUME_BITS is written back to buffer.
297 static void ass_gauss_blur(unsigned char *buffer, unsigned *tmp2,
298 int width, int height, int stride, const unsigned *m2,
299 int r, int mwidth)
302 int x, y;
304 unsigned char *s = buffer;
// t points one element into the row, so results for column x land at t[x - r + mx].
305 unsigned *t = tmp2 + 1;
// ---- stage 1: horizontal pass, one source row per iteration ----
306 for (y = 0; y < height; y++) {
// clear the whole scratch row, including the extra leading element
307 memset(t - 1, 0, (width + 1) * sizeof(*t));
// first column: the taps are added as a running sum, walking from the last tap down to tap r - x
308 x = 0;
309 if(x < r)//in case that r < 0
311 const int src = s[x];
312 if (src) {
313 register unsigned *dstp = t + x - r;
314 int mx;
315 const unsigned *m3 = m2 + src * mwidth;
316 unsigned sum = 0;
317 for (mx = mwidth-1; mx >= r - x ; mx--) {
318 sum += m3[mx];
319 dstp[mx] += sum;
// remaining columns left of the radius: taps that would fall before the row start are skipped
324 for (x = 1; x < r; x++) {
325 const int src = s[x];
326 if (src) {
327 register unsigned *dstp = t + x - r;
328 int mx;
329 const unsigned *m3 = m2 + src * mwidth;
330 for (mx = r - x; mx < mwidth; mx++) {
331 dstp[mx] += m3[mx];
// interior columns: the full kernel is applied
336 for (; x < width - r; x++) {
337 const int src = s[x];
338 if (src) {
339 register unsigned *dstp = t + x - r;
340 int mx;
341 const unsigned *m3 = m2 + src * mwidth;
342 for (mx = 0; mx < mwidth; mx++) {
343 dstp[mx] += m3[mx];
// columns right of width - r (except the last): only the first x2 taps are applied
348 for (; x < width-1; x++) {
349 const int src = s[x];
350 if (src) {
351 register unsigned *dstp = t + x - r;
352 int mx;
353 const int x2 = r + width - x;
354 const unsigned *m3 = m2 + src * mwidth;
355 for (mx = 0; mx < x2; mx++) {
356 dstp[mx] += m3[mx];
// last column: like the first column, the taps are added as a running sum
360 if(x==width-1) //important: x==width-1 failed, if r==0
362 const int src = s[x];
363 if (src) {
364 register unsigned *dstp = t + x - r;
365 int mx;
366 const int x2 = r + width - x;
367 const unsigned *m3 = m2 + src * mwidth;
368 unsigned sum = 0;
369 for (mx = 0; mx < x2; mx++) {
370 sum += m3[mx];
371 dstp[mx] += sum;
376 s += stride;
377 t += width + 1;
// ---- stage 2: vertical pass, one column per iteration ----
// srcp reads the stage-1 result of the column; dstp addresses the cell one
// element to its left. Each consumed cell is first scaled back to a table
// index (rounded, >> VOLUME_BITS) and then overwritten with the rounding
// constant 1 << (VOLUME_BITS - 1); that cell is where the next column's
// results are accumulated.
380 t = tmp2;
381 for (x = 0; x < width; x++) {
// first row: taps added as a running sum, walking upwards through the rows
382 y = 0;
383 if(y < r)//in case that r<0
385 unsigned *srcp = t + y * (width + 1) + 1;
386 int src = *srcp;
387 if (src) {
388 register unsigned *dstp = srcp - 1 + (mwidth -r +y)*(width + 1);
389 const int src2 = (src + (1<<(ass_synth_priv::VOLUME_BITS-1))) >> ass_synth_priv::VOLUME_BITS;
390 const unsigned *m3 = m2 + src2 * mwidth;
391 unsigned sum = 0;
392 int mx;
393 *srcp = (1<<(ass_synth_priv::VOLUME_BITS-1));
394 for (mx = mwidth-1; mx >=r - y ; mx--) {
395 sum += m3[mx];
396 *dstp += sum;
397 dstp -= width + 1;
// remaining rows above the radius
// NOTE(review): dstp starts one row below the source row here, unlike the
// other branches which start r rows above it -- confirm this is intended.
401 for (y = 1; y < r; y++) {
402 unsigned *srcp = t + y * (width + 1) + 1;
403 int src = *srcp;
404 if (src) {
405 register unsigned *dstp = srcp - 1 + width + 1;
406 const int src2 = (src + (1<<(ass_synth_priv::VOLUME_BITS-1))) >> ass_synth_priv::VOLUME_BITS;
407 const unsigned *m3 = m2 + src2 * mwidth;
409 int mx;
410 *srcp = (1<<(ass_synth_priv::VOLUME_BITS-1));
411 for (mx = r - y; mx < mwidth; mx++) {
412 *dstp += m3[mx];
413 dstp += width + 1;
// interior rows: the full kernel, starting r rows above the source row
417 for (; y < height - r; y++) {
418 unsigned *srcp = t + y * (width + 1) + 1;
419 int src = *srcp;
420 if (src) {
421 register unsigned *dstp = srcp - 1 - r * (width + 1);
422 const int src2 = (src + (1<<(ass_synth_priv::VOLUME_BITS-1))) >> ass_synth_priv::VOLUME_BITS;
423 const unsigned *m3 = m2 + src2 * mwidth;
425 int mx;
426 *srcp = (1<<(ass_synth_priv::VOLUME_BITS-1));
427 for (mx = 0; mx < mwidth; mx++) {
428 *dstp += m3[mx];
429 dstp += width + 1;
// rows below height - r (except the last): only the first y2 taps are applied
433 for (; y < height-1; y++) {
434 unsigned *srcp = t + y * (width + 1) + 1;
435 int src = *srcp;
436 if (src) {
437 const int y2 = r + height - y;
438 register unsigned *dstp = srcp - 1 - r * (width + 1);
439 const int src2 = (src + (1<<(ass_synth_priv::VOLUME_BITS-1))) >> ass_synth_priv::VOLUME_BITS;
440 const unsigned *m3 = m2 + src2 * mwidth;
442 int mx;
443 *srcp = (1<<(ass_synth_priv::VOLUME_BITS-1));
444 for (mx = 0; mx < y2; mx++) {
445 *dstp += m3[mx];
446 dstp += width + 1;
// last row: taps added as a running sum
450 if(y == height - 1)//important: y == height - 1 failed if r==0
452 unsigned *srcp = t + y * (width + 1) + 1;
453 int src = *srcp;
454 if (src) {
455 const int y2 = r + height - y;
456 register unsigned *dstp = srcp - 1 - r * (width + 1);
457 const int src2 = (src + (1<<(ass_synth_priv::VOLUME_BITS-1))) >> ass_synth_priv::VOLUME_BITS;
458 const unsigned *m3 = m2 + src2 * mwidth;
459 unsigned sum = 0;
460 int mx;
461 *srcp = (1<<(ass_synth_priv::VOLUME_BITS-1));
462 for (mx = 0; mx < y2; mx++) {
463 sum += m3[mx];
464 *dstp += sum;
465 dstp += width + 1;
469 t++;
// ---- stage 3: scale the accumulated values back to bytes ----
472 t = tmp2;
473 s = buffer;
474 for (y = 0; y < height; y++) {
475 for (x = 0; x < width; x++) {
476 s[x] = t[x] >> ass_synth_priv::VOLUME_BITS;
478 s += stride;
479 t += width + 1;
484 * \brief blur with [[1,2,1], [2,4,2], [1,2,1]] kernel.
486 static void be_blur(unsigned char *buf, unsigned *tmp_base, int w, int h, int stride)
488 WORD *col_pix_buf_base = reinterpret_cast<WORD*>(xy_malloc(w*sizeof(WORD)));
489 WORD *col_sum_buf_base = reinterpret_cast<WORD*>(xy_malloc(w*sizeof(WORD)));
490 if(!col_sum_buf_base || !col_pix_buf_base)
492 //ToDo: error handling
493 return;
495 memset(col_pix_buf_base, 0, w*sizeof(WORD));
496 memset(col_sum_buf_base, 0, w*sizeof(WORD));
497 WORD *col_pix_buf = col_pix_buf_base-2;//for aligment;
498 WORD *col_sum_buf = col_sum_buf_base-2;//for aligment;
500 int y = 0;
501 unsigned char *src=buf+y*stride;
503 int x = 2;
504 int old_pix = src[x-1];
505 int old_sum = old_pix + src[x-2];
506 for ( ; x < w; x++) {
507 int temp1 = src[x];
508 int temp2 = old_pix + temp1;
509 old_pix = temp1;
510 temp1 = old_sum + temp2;
511 old_sum = temp2;
512 col_pix_buf[x] = temp1;
516 int y = 1;
517 unsigned char *src=buf+y*stride;
520 int x = 2;
521 int old_pix = src[x-1];
522 int old_sum = old_pix + src[x-2];
523 for ( ; x < w; x++) {
524 int temp1 = src[x];
525 int temp2 = old_pix + temp1;
526 old_pix = temp1;
527 temp1 = old_sum + temp2;
528 old_sum = temp2;
530 temp2 = col_pix_buf[x] + temp1;
531 col_pix_buf[x] = temp1;
532 //dst[x-1] = (col_sum_buf[x] + temp2) >> 4;
533 col_sum_buf[x] = temp2;
537 //__m128i round = _mm_set1_epi16(8);
538 for (int y = 2; y < h; y++) {
539 unsigned char *src=buf+y*stride;
540 unsigned char *dst=buf+(y-1)*stride;
543 int x = 2;
544 __m128i old_pix_128 = _mm_cvtsi32_si128(src[1]);
545 __m128i old_sum_128 = _mm_cvtsi32_si128(src[0]+src[1]);
546 for ( ; x < ((w-2)&(~7)); x+=8) {
547 __m128i new_pix = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src+x));
548 new_pix = _mm_unpacklo_epi8(new_pix, _mm_setzero_si128());
549 __m128i temp = _mm_slli_si128(new_pix,2);
550 temp = _mm_add_epi16(temp, old_pix_128);
551 temp = _mm_add_epi16(temp, new_pix);
552 old_pix_128 = _mm_srli_si128(new_pix,14);
554 new_pix = _mm_slli_si128(temp,2);
555 new_pix = _mm_add_epi16(new_pix, old_sum_128);
556 new_pix = _mm_add_epi16(new_pix, temp);
557 old_sum_128 = _mm_srli_si128(temp, 14);
559 __m128i old_col_pix = _mm_loadu_si128( reinterpret_cast<const __m128i*>(col_pix_buf+x) );
560 __m128i old_col_sum = _mm_loadu_si128( reinterpret_cast<const __m128i*>(col_sum_buf+x) );
561 _mm_storeu_si128( reinterpret_cast<__m128i*>(col_pix_buf+x), new_pix );
562 temp = _mm_add_epi16(new_pix, old_col_pix);
563 _mm_storeu_si128( reinterpret_cast<__m128i*>(col_sum_buf+x), temp );
565 old_col_sum = _mm_add_epi16(old_col_sum, temp);
566 //old_col_sum = _mm_add_epi16(old_col_sum, round);
567 old_col_sum = _mm_srli_epi16(old_col_sum, 4);
568 old_col_sum = _mm_packus_epi16(old_col_sum, old_col_sum);
569 _mm_storel_epi64( reinterpret_cast<__m128i*>(dst+x-1), old_col_sum );
571 int old_pix = src[x-1];
572 int old_sum = old_pix + src[x-2];
573 for ( ; x < w; x++) {
574 int temp1 = src[x];
575 int temp2 = old_pix + temp1;
576 old_pix = temp1;
577 temp1 = old_sum + temp2;
578 old_sum = temp2;
580 temp2 = col_pix_buf[x] + temp1;
581 col_pix_buf[x] = temp1;
582 dst[x-1] = (col_sum_buf[x] + temp2) >> 4;
583 col_sum_buf[x] = temp2;
587 xy_free(col_sum_buf_base);
588 xy_free(col_pix_buf_base);
591 static void Bilinear(unsigned char *buf, int w, int h, int stride, int x_factor, int y_factor)
593 WORD *col_pix_buf_base = reinterpret_cast<WORD*>(xy_malloc(w*sizeof(WORD)));
594 if(!col_pix_buf_base)
596 //ToDo: error handling
597 return;
599 memset(col_pix_buf_base, 0, w*sizeof(WORD));
601 for (int y = 0; y < h; y++){
602 unsigned char *src=buf+y*stride;
604 WORD *col_pix_buf = col_pix_buf_base;
605 int last=0;
606 for(int x = 0; x < w; x++)
608 int temp1 = src[x];
609 int temp2 = temp1*x_factor;
610 temp1 <<= 3;
611 temp1 -= temp2;
612 temp1 += last;
613 last = temp2;
615 temp2 = temp1*y_factor;
616 temp1 <<= 3;
617 temp1 -= temp2;
618 temp1 += col_pix_buf[x];
619 src[x] = ((temp1+32)>>6);
620 col_pix_buf[x] = temp2;
623 xy_free(col_pix_buf_base);
626 bool Rasterizer::Rasterize(const ScanLineData2& scan_line_data2, int xsub, int ysub, SharedPtrOverlay overlay)
628 using namespace ::boost::flyweights;
630 if(!overlay)
632 return false;
634 overlay->CleanUp();
636 if(!scan_line_data2.mWidth || !scan_line_data2.mHeight)
638 return true;
640 xsub &= 7;
641 ysub &= 7;
642 //xsub = ysub = 0;
643 int width = scan_line_data2.mWidth + xsub;
644 int height = scan_line_data2.mHeight + ysub;
645 overlay->mOffsetX = scan_line_data2.mPathOffsetX - xsub;
646 overlay->mOffsetY = scan_line_data2.mPathOffsetY - ysub;
647 int wide_border = (scan_line_data2.mWideBorder+7)&~7;
648 overlay->mfWideOutlineEmpty = scan_line_data2.mWideOutline.empty();
649 if(!overlay->mfWideOutlineEmpty)
651 width += 2*wide_border ;
652 height += 2*wide_border ;
653 xsub += wide_border ;
654 ysub += wide_border ;
655 overlay->mOffsetX -= wide_border;
656 overlay->mOffsetY -= wide_border;
659 overlay->mWidth = width;
660 overlay->mHeight = height;
661 overlay->mOverlayWidth = ((width+7)>>3) + 1;
662 overlay->mOverlayHeight = ((height+7)>>3) + 1;
663 overlay->mOverlayPitch = (overlay->mOverlayWidth+15)&~15;
665 overlay->mpOverlayBuffer.base = (byte*)xy_malloc(2 * overlay->mOverlayPitch * overlay->mOverlayHeight);
666 memset(overlay->mpOverlayBuffer.base, 0, 2 * overlay->mOverlayPitch * overlay->mOverlayHeight);
667 overlay->mpOverlayBuffer.body = overlay->mpOverlayBuffer.base;
668 overlay->mpOverlayBuffer.border = overlay->mpOverlayBuffer.base + overlay->mOverlayPitch * overlay->mOverlayHeight;
670 // Are we doing a border?
671 const ScanLineData::tSpanBuffer* pOutline[2] = {&(scan_line_data2.mOutline), &(scan_line_data2.mWideOutline)};
672 for(int i = countof(pOutline)-1; i >= 0; i--)
674 ScanLineData::tSpanBuffer::const_iterator it = pOutline[i]->begin();
675 ScanLineData::tSpanBuffer::const_iterator itEnd = pOutline[i]->end();
676 byte* plan_selected = i==0 ? overlay->mpOverlayBuffer.body : overlay->mpOverlayBuffer.border;
677 int pitch = overlay->mOverlayPitch;
678 for(; it!=itEnd; ++it)
680 int y = (int)(((*it).first >> 32) - 0x40000000 + ysub);
681 int x1 = (int)(((*it).first & 0xffffffff) - 0x40000000 + xsub);
682 int x2 = (int)(((*it).second & 0xffffffff) - 0x40000000 + xsub);
683 if(x2 > x1)
685 int first = x1>>3;
686 int last = (x2-1)>>3;
687 byte* dst = plan_selected + (pitch*(y>>3) + first);
688 if(first == last)
689 *dst += x2-x1;
690 else
692 *dst += ((first+1)<<3) - x1;
693 dst += 1;
694 while(++first < last)
696 *dst += 0x08;
697 dst += 1;
699 *dst += x2 - (last<<3);
705 return true;
708 // @return: true if actually a blur operation has done, or else false and output is leave unset.
709 bool Rasterizer::Blur(const Overlay& input_overlay, int fBlur, double fGaussianBlur,
710 SharedPtrOverlay output_overlay)
712 using namespace ::boost::flyweights;
714 if(!output_overlay)
716 return false;
718 output_overlay->CleanUp();
720 output_overlay->mOffsetX = input_overlay.mOffsetX;
721 output_overlay->mOffsetY = input_overlay.mOffsetY;
722 output_overlay->mWidth = input_overlay.mWidth;
723 output_overlay->mHeight = input_overlay.mHeight;
724 output_overlay->mOverlayWidth = input_overlay.mOverlayWidth;
725 output_overlay->mOverlayHeight = input_overlay.mOverlayHeight;
726 output_overlay->mfWideOutlineEmpty = input_overlay.mfWideOutlineEmpty;
728 int bluradjust = 0;
729 if(fBlur || fGaussianBlur > 0.1)
731 if (fGaussianBlur > 0)
732 bluradjust += (int)(fGaussianBlur*3*8 + 0.5) | 1;
733 if (fBlur)
734 bluradjust += 8;
735 // Expand the buffer a bit when we're blurring, since that can also widen the borders a bit
736 bluradjust = (bluradjust+7)&~7;
738 output_overlay->mOffsetX -= bluradjust;
739 output_overlay->mOffsetY -= bluradjust;
740 output_overlay->mWidth += (bluradjust<<1);
741 output_overlay->mHeight += (bluradjust<<1);
742 output_overlay->mOverlayWidth += (bluradjust>>2);
743 output_overlay->mOverlayHeight += (bluradjust>>2);
745 else
747 return false;
750 output_overlay->mOverlayPitch = (output_overlay->mOverlayWidth+15)&~15;
752 output_overlay->mpOverlayBuffer.base = (byte*)xy_malloc(2 * output_overlay->mOverlayPitch * output_overlay->mOverlayHeight);
753 memset(output_overlay->mpOverlayBuffer.base, 0, 2 * output_overlay->mOverlayPitch * output_overlay->mOverlayHeight);
754 output_overlay->mpOverlayBuffer.body = output_overlay->mpOverlayBuffer.base;
755 output_overlay->mpOverlayBuffer.border = output_overlay->mpOverlayBuffer.base + output_overlay->mOverlayPitch * output_overlay->mOverlayHeight;
757 //copy buffer
758 for(int i = 1; i >= 0; i--)
760 byte* plan_selected = i==0 ? output_overlay->mpOverlayBuffer.body : output_overlay->mpOverlayBuffer.border;
761 const byte* plan_input = i==0 ? input_overlay.mpOverlayBuffer.body : input_overlay.mpOverlayBuffer.border;
763 plan_selected += (bluradjust>>3) + (bluradjust>>3)*output_overlay->mOverlayPitch;
764 for (int j=0;j<input_overlay.mOverlayHeight;j++)
766 memcpy(plan_selected, plan_input, input_overlay.mOverlayPitch);
767 plan_selected += output_overlay->mOverlayPitch;
768 plan_input += input_overlay.mOverlayPitch;
772 ass_tmp_buf tmp_buf( max((output_overlay->mOverlayPitch+1)*(output_overlay->mOverlayHeight+1),0) );
773 //flyweight<key_value<int, ass_tmp_buf, ass_tmp_buf_get_size>, no_locking> tmp_buf((overlay->mOverlayWidth+1)*(overlay->mOverlayPitch+1));
774 // Do some gaussian blur magic
775 if (fGaussianBlur > 0.1)//(fGaussianBlur > 0) return true even if fGaussianBlur very small
777 byte* plan_selected= output_overlay->mfWideOutlineEmpty ? output_overlay->mpOverlayBuffer.body : output_overlay->mpOverlayBuffer.border;
778 flyweight<key_value<double, ass_synth_priv, ass_synth_priv_key>, no_locking> fw_priv_blur(fGaussianBlur);
779 const ass_synth_priv& priv_blur = fw_priv_blur.get();
780 if (output_overlay->mOverlayWidth>=priv_blur.g_w && output_overlay->mOverlayHeight>=priv_blur.g_w)
782 ass_gauss_blur(plan_selected, tmp_buf.tmp, output_overlay->mOverlayWidth, output_overlay->mOverlayHeight, output_overlay->mOverlayPitch,
783 priv_blur.gt2, priv_blur.g_r, priv_blur.g_w);
787 for (int pass = 0; pass < fBlur; pass++)
789 if(output_overlay->mOverlayWidth >= 3 && output_overlay->mOverlayHeight >= 3)
791 int pitch = output_overlay->mOverlayPitch;
792 byte* plan_selected= output_overlay->mfWideOutlineEmpty ? output_overlay->mpOverlayBuffer.body : output_overlay->mpOverlayBuffer.border;
793 be_blur(plan_selected, tmp_buf.tmp, output_overlay->mOverlayWidth, output_overlay->mOverlayHeight, pitch);
796 return true;
799 ///////////////////////////////////////////////////////////////////////////
801 static __forceinline void pixmix(DWORD *dst, DWORD color, DWORD alpha)
803 int a = alpha;
804 // Make sure both a and ia are in range 1..256 for the >>8 operations below to be correct
805 int ia = 256-a;
806 a+=1;
807 *dst = ((((*dst&0x00ff00ff)*ia + (color&0x00ff00ff)*a)&0xff00ff00)>>8)
808 | ((((*dst&0x0000ff00)*ia + (color&0x0000ff00)*a)&0x00ff0000)>>8)
809 | ((((*dst>>8)&0x00ff0000)*ia)&0xff000000);
812 static __forceinline void pixmix2(DWORD *dst, DWORD color, DWORD shapealpha, DWORD clipalpha)
814 int a = (((shapealpha)*(clipalpha)*(color>>24))>>12)&0xff;
815 int ia = 256-a;
816 a+=1;
817 *dst = ((((*dst&0x00ff00ff)*ia + (color&0x00ff00ff)*a)&0xff00ff00)>>8)
818 | ((((*dst&0x0000ff00)*ia + (color&0x0000ff00)*a)&0x00ff0000)>>8)
819 | ((((*dst>>8)&0x00ff0000)*ia)&0xff000000);
822 #include <xmmintrin.h>
823 #include <emmintrin.h>
825 static __forceinline void pixmix_sse2(DWORD* dst, DWORD color, DWORD alpha)
827 // alpha = (((alpha) * (color>>24)) >> 6) & 0xff;
828 color &= 0xffffff;
829 __m128i zero = _mm_setzero_si128();
830 __m128i a = _mm_set1_epi32(((alpha+1) << 16) | (0x100 - alpha));
831 __m128i d = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*dst), zero);
832 __m128i s = _mm_unpacklo_epi8(_mm_cvtsi32_si128(color), zero);
833 __m128i r = _mm_unpacklo_epi16(d, s);
834 r = _mm_madd_epi16(r, a);
835 r = _mm_srli_epi32(r, 8);
836 r = _mm_packs_epi32(r, r);
837 r = _mm_packus_epi16(r, r);
838 *dst = (DWORD)_mm_cvtsi128_si32(r);
841 static __forceinline void pixmix2_sse2(DWORD* dst, DWORD color, DWORD shapealpha, DWORD clipalpha)
843 int alpha = (((shapealpha)*(clipalpha)*(color>>24))>>12)&0xff;
844 color &= 0xffffff;
845 __m128i zero = _mm_setzero_si128();
846 __m128i a = _mm_set1_epi32(((alpha+1) << 16) | (0x100 - alpha));
847 __m128i d = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*dst), zero);
848 __m128i s = _mm_unpacklo_epi8(_mm_cvtsi32_si128(color), zero);
849 __m128i r = _mm_unpacklo_epi16(d, s);
850 r = _mm_madd_epi16(r, a);
851 r = _mm_srli_epi32(r, 8);
852 r = _mm_packs_epi32(r, r);
853 r = _mm_packus_epi16(r, r);
854 *dst = (DWORD)_mm_cvtsi128_si32(r);
857 #include <mmintrin.h>
859 // Calculate a - b clamping to 0 instead of underflowing
860 static __forceinline DWORD safe_subtract(DWORD a, DWORD b)
862 __m64 ap = _mm_cvtsi32_si64(a);
863 __m64 bp = _mm_cvtsi32_si64(b);
864 __m64 rp = _mm_subs_pu16(ap, bp);
865 DWORD r = (DWORD)_mm_cvtsi64_si32(rp);
866 _mm_empty();
867 return r;
868 //return (b > a) ? 0 : a - b;
871 /***
872 * No aligned requirement
875 void AlphaBlt(byte* pY,
876 const byte* pAlphaMask,
877 const byte Y,
878 int h, int w, int src_stride, int dst_stride)
880 __m128i zero = _mm_setzero_si128();
881 __m128i s = _mm_set1_epi16(Y); //s = c 0 c 0 c 0 c 0 c 0 c 0 c 0 c 0
883 if( w>16 )//IMPORTANT! The result of the following code is undefined with w<15.
885 for( ; h>0; h--, pAlphaMask += src_stride, pY += dst_stride )
887 const BYTE* sa = pAlphaMask;
888 BYTE* dy = pY;
889 const BYTE* dy_first_mod16 = reinterpret_cast<BYTE*>((reinterpret_cast<int>(pY)+15)&~15); //IMPORTANT! w must >= 15
890 const BYTE* dy_end_mod16 = reinterpret_cast<BYTE*>(reinterpret_cast<int>(pY+w)&~15);
891 const BYTE* dy_end = pY + w;
893 for(;dy < dy_first_mod16; sa++, dy++)
895 *dy = (*dy * (256 - *sa)+ Y*(*sa+1))>>8;
897 for(; dy < dy_end_mod16; sa+=8, dy+=16)
899 __m128i a = _mm_loadl_epi64((__m128i*)sa);
902 __m128i d = _mm_load_si128((__m128i*)dy);
904 //__m128i ones = _mm_cmpeq_epi32(zero,zero); //ones = ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
905 //__m128i ia = _mm_xor_si128(a,ones); //ia = ~a
906 //ia = _mm_unpacklo_epi8(ia,zero); //ia = ~a0 0 ~a1 0 ~a2 0 ~a3 0 ~a4 0 ~a5 0 ~a6 0 ~a7 0
907 a = _mm_unpacklo_epi8(a,zero); //a= a0 0 a1 0 a2 0 a3 0 a4 0 a5 0 a6 0 a7 0
908 __m128i ones = _mm_set1_epi16(256); //ones = 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1
909 __m128i ia = _mm_sub_epi16(ones, a); //ia = 256-a0 ... 256-a7
910 ones = _mm_srli_epi16(ones, 8);
911 a = _mm_add_epi16(a, ones); //a= 1+a0 ... 1+a7
913 __m128i dl = _mm_unpacklo_epi8(d,zero); //d = b0 0 b1 0 b2 0 b3 0 b4 0 b5 0 b6 0 b7 0
914 __m128i sl = _mm_mullo_epi16(s,a); //sl = c0*a0 c1*a1 ... c7*a7
916 dl = _mm_mullo_epi16(dl,ia); //d = b0*~a0 b1*~a1 ... b7*~a7
918 dl = _mm_add_epi16(dl,sl); //d = d + sl
919 dl = _mm_srli_epi16(dl, 8); //d = d>>8
921 sa += 8;
922 a = _mm_loadl_epi64((__m128i*)sa);
924 a = _mm_unpacklo_epi8(a,zero);
925 ones = _mm_slli_epi16(ones, 8);
926 ia = _mm_sub_epi16(ones, a);
927 ones = _mm_srli_epi16(ones, 8);
928 a = _mm_add_epi16(a,ones);
930 d = _mm_unpackhi_epi8(d,zero);
931 sl = _mm_mullo_epi16(s,a);
932 d = _mm_mullo_epi16(d,ia);
933 d = _mm_add_epi16(d,sl);
934 d = _mm_srli_epi16(d, 8);
936 dl = _mm_packus_epi16(dl,d);
938 _mm_store_si128((__m128i*)dy, dl);
940 for(;dy < dy_end; sa++, dy++)
942 *dy = (*dy * (256 - *sa)+ Y*(*sa+1))>>8;
946 else
948 for( ; h>0; h--, pAlphaMask += src_stride, pY += dst_stride )
950 const BYTE* sa = pAlphaMask;
951 BYTE* dy = pY;
952 const BYTE* dy_end = pY + w;
954 for(;dy < dy_end; sa++, dy++)
956 *dy = (*dy * (256 - *sa)+ Y*(*sa+1))>>8;
960 //__asm emms;
963 /***
964 * No aligned requirement
967 void AlphaBlt(byte* pY,
968 const byte alpha,
969 const byte Y,
970 int h, int w, int dst_stride)
972 int yPremul = Y*(alpha+1);
973 int dstAlpha = 0x100 - alpha;
974 if( w>32 )//IMPORTANT! The result of the following code is undefined with w<15.
976 __m128i zero = _mm_setzero_si128();
977 __m128i s = _mm_set1_epi16(yPremul); //s = c 0 c 0 c 0 c 0 c 0 c 0 c 0 c 0
978 __m128i ia = _mm_set1_epi16(dstAlpha);
979 for( ; h>0; h--, pY += dst_stride )
981 BYTE* dy = pY;
982 const BYTE* dy_first_mod16 = reinterpret_cast<BYTE*>((reinterpret_cast<int>(pY)+15)&~15); //IMPORTANT! w must >= 15
983 const BYTE* dy_end_mod16 = reinterpret_cast<BYTE*>(reinterpret_cast<int>(pY+w)&~15);
984 const BYTE* dy_end = pY + w;
986 for(;dy < dy_first_mod16; dy++)
988 *dy = (*dy * dstAlpha + yPremul)>>8;
990 for(; dy < dy_end_mod16; dy+=16)
993 __m128i d = _mm_load_si128(reinterpret_cast<const __m128i*>(dy));
994 __m128i dl = _mm_unpacklo_epi8(d,zero); //d = b0 0 b1 0 b2 0 b3 0 b4 0 b5 0 b6 0 b7 0
996 dl = _mm_mullo_epi16(dl,ia); //d = b0*~a0 b1*~a1 ... b7*~a7
997 dl = _mm_adds_epu16(dl,s); //d = d + s
998 dl = _mm_srli_epi16(dl, 8); //d = d>>8
1000 d = _mm_unpackhi_epi8(d,zero);
1001 d = _mm_mullo_epi16(d,ia);
1002 d = _mm_adds_epu16(d,s);
1003 d = _mm_srli_epi16(d, 8);
1005 dl = _mm_packus_epi16(dl,d);
1007 _mm_store_si128(reinterpret_cast<__m128i*>(dy), dl);
1009 for(;dy < dy_end; dy++)
1011 *dy = (*dy * dstAlpha + yPremul)>>8;
1015 else
1017 for( ; h>0; h--, pY += dst_stride )
1019 BYTE* dy = pY;
1020 const BYTE* dy_end = pY + w;
1022 for(;dy < dy_end; dy++)
1024 *dy = (*dy * dstAlpha + yPremul)>>8;
1028 //__asm emms;
1031 /***
1032 * No aligned requirement
1035 void AlphaBltC(byte* pY,
1036 const byte alpha,
1037 const byte Y,
1038 int h, int w, int dst_stride)
1040 int yPremul = Y*(alpha+1);
1041 int dstAlpha = 0x100 - alpha;
1043 for( ; h>0; h--, pY += dst_stride )
1045 BYTE* dy = pY;
1046 const BYTE* dy_end = pY + w;
1048 for(;dy < dy_end; dy++)
1050 *dy = (*dy * dstAlpha + yPremul)>>8;
1055 // For CPUID usage in Rasterizer::Draw
1056 #include "../dsutil/vd.h"
// 64-bit constant with 0x00ff in each 16-bit word.
// It is not referenced in the part of the file visible here.
1058 static const __int64 _00ff00ff00ff00ff = 0x00ff00ff00ff00ffi64;
1060 // Render a subpicture onto a surface.
1061 // spd is the surface to render on.
1062 // clipRect is a rectangular clip region to render inside.
1063 // pAlphaMask is an alpha clipping mask.
1064 // xsub and ysub ???
1065 // switchpts seems to be an array of fill colours interlaced with coordinates.
1066 // switchpts[i*2] contains a colour and switchpts[i*2+1] contains the coordinate to use that colour from
1067 // fBody tells whether to render the body of the subs.
1068 // fBorder tells whether to render the border of the subs.
1069 SharedPtrByte Rasterizer::CompositeAlphaMask(SubPicDesc& spd, SharedPtrOverlay overlay, const CRect& clipRect, byte* pAlphaMask,
1070 int xsub, int ysub, const DWORD* switchpts, bool fBody, bool fBorder,
1071 CRect *outputDirtyRect)
1073 //fix me: check and log error
1074 SharedPtrByte result;
1075 *outputDirtyRect = CRect(0, 0, 0, 0);
1076 if(!switchpts || !fBody && !fBorder) return(result);
1078 // clip
1079 // Limit drawn area to intersection of rendering surface and rectangular clip area
1080 CRect r(0, 0, spd.w, spd.h);
1081 r &= clipRect;
1082 // Remember that all subtitle coordinates are specified in 1/8 pixels
1083 // (x+4)>>3 rounds to nearest whole pixel.
1084 // ??? What is xsub, ysub, mOffsetX and mOffsetY ?
1085 int x = (xsub + overlay->mOffsetX + 4)>>3;
1086 int y = (ysub + overlay->mOffsetY + 4)>>3;
1087 int w = overlay->mOverlayWidth;
1088 int h = overlay->mOverlayHeight;
1089 int xo = 0, yo = 0;
1090 // Again, limiting?
1091 if(x < r.left) {xo = r.left-x; w -= r.left-x; x = r.left;}
1092 if(y < r.top) {yo = r.top-y; h -= r.top-y; y = r.top;}
1093 if(x+w > r.right) w = r.right-x;
1094 if(y+h > r.bottom) h = r.bottom-y;
1095 // Check if there's actually anything to render
1096 if(w <= 0 || h <= 0) return(result);
1097 outputDirtyRect->SetRect(x, y, x+w, y+h);
1098 *outputDirtyRect &= CRect(0, 0, spd.w, spd.h);
1100 bool fSingleColor = (switchpts[1]==0xffffffff);
1102 // draw
1103 // Grab the first colour
1104 DWORD color = switchpts[0];
1105 byte* s_base = (byte*)xy_malloc(overlay->mOverlayPitch * overlay->mOverlayHeight);
1107 if(fSingleColor)
1109 overlay->FillAlphaMash(s_base, fBody, fBorder, xo, yo, w, h,
1110 pAlphaMask==NULL ? NULL : pAlphaMask + spd.w * y + x, spd.w,
1111 color>>24 );
1113 else
1115 int last_x = xo;
1116 const DWORD *sw = switchpts;
1117 while( last_x<w+xo )
1119 byte alpha = sw[0]>>24;
1120 while( sw[3]<w+xo && (sw[2]>>24)==alpha )
1122 sw += 2;
1124 int new_x = sw[3] < w+xo ? sw[3] : w+xo;
1125 overlay->FillAlphaMash(s_base, fBody, fBorder,
1126 last_x, yo, new_x-last_x, h,
1127 pAlphaMask==NULL ? NULL : pAlphaMask + spd.w * y + x + last_x - xo, spd.w,
1128 alpha );
1129 last_x = new_x;
1130 sw += 2;
1133 result.reset( s_base, xy_free );
1134 return result;
// Blends one overlay onto the sub-picture surface `spd`.
//
// The function clips the overlay against spd and clipRect, asks
// CompositeAlphaMask() for the per-pixel alpha buffer (which also fills in
// the returned bounding box), and then blends with one of eight code paths
// chosen by three flags: single colour vs. multiple switch points, SSE2
// available or not, and packed 32-bit vs. planar AYUV destination.
//
// For the planar destination the A, Y, U and V planes are addressed as four
// consecutive blocks of spd.pitch*spd.h bytes each.
//
// Returns an empty rect when switchpts is NULL, when neither fBody nor
// fBorder is set, or when the clipped area is empty.
1137 CRect Rasterizer::Draw(SubPicDesc& spd, SharedPtrOverlay overlay, const CRect& clipRect, byte* pAlphaMask,
1138 int xsub, int ysub, const DWORD* switchpts, bool fBody, bool fBorder)
1140 CRect bbox(0,0,0,0);
1141 if(!switchpts || !fBody && !fBorder) return(bbox);
1143 // clip
1144 // Limit drawn area to intersection of rendering surface and rectangular clip area
1145 CRect r(0, 0, spd.w, spd.h);
1146 r &= clipRect;
1147 // Remember that all subtitle coordinates are specified in 1/8 pixels
1148 // (x+4)>>3 rounds to nearest whole pixel.
1149 // ??? What is xsub, ysub, mOffsetX and mOffsetY ?
1150 int overlayPitch = overlay->mOverlayPitch;
1151 int x = (xsub + overlay->mOffsetX + 4)>>3;
1152 int y = (ysub + overlay->mOffsetY + 4)>>3;
1153 int w = overlay->mOverlayWidth;
1154 int h = overlay->mOverlayHeight;
// (xo, yo) becomes the first visible overlay pixel once the left/top edges are clipped.
1155 int xo = 0, yo = 0;
1156 // Again, limiting?
1157 if(x < r.left) {xo = r.left-x; w -= r.left-x; x = r.left;}
1158 if(y < r.top) {yo = r.top-y; h -= r.top-y; y = r.top;}
1159 if(x+w > r.right) w = r.right-x;
1160 if(y+h > r.bottom) h = r.bottom-y;
1161 // Check if there's actually anything to render
1162 if(w <= 0 || h <= 0) return(bbox);
1164 // CPUID from VDub
1165 bool fSSE2 = !!(g_cpuid.m_flags & CCpuID::sse2);
1166 bool fSingleColor = (switchpts[1]==0xffffffff);
1167 bool AYUV_PLANAR = (spd.type==MSP_AYUV_PLANAR);
// Pack the three flags into one value so a single switch can pick the blend loop.
1168 int draw_method = 0;
1169 if(fSingleColor)
1170 draw_method |= DM::SINGLE_COLOR;
1171 if(fSSE2)
1172 draw_method |= DM::SSE2;
1173 if(AYUV_PLANAR)
1174 draw_method |= DM::AYUV_PLANAR;
1176 // draw
1177 // Grab the first colour
1178 DWORD color = switchpts[0];
// CompositeAlphaMask repeats the clipping above and writes the dirty rect into bbox.
// NOTE(review): the returned pointer is used without a null check; confirm that
// CompositeAlphaMask cannot hand back an empty pointer here (e.g. on allocation failure).
1179 SharedPtrByte s_base = CompositeAlphaMask(spd, overlay, clipRect, pAlphaMask, xsub, ysub, switchpts,
1180 fBody, fBorder, &bbox);
// s points at the first visible alpha sample; rows are overlayPitch bytes apart.
1181 const byte* s = s_base.get() + overlay->mOverlayPitch*yo + xo;
1183 // How would this differ from src?
1184 unsigned long* dst = (unsigned long *)(((char *)spd.bits + spd.pitch * y) + ((x*spd.bpp)>>3));
1186 // Every remaining line in the bitmap to be rendered...
1187 switch(draw_method)
1189 case DM::SINGLE_COLOR | DM::SSE2 | 0*DM::AYUV_PLANAR :
1191 while(h--)
1193 for(int wt=0; wt<w; ++wt)
1194 // The <<6 is due to pixmix expecting the alpha parameter to be
1195 // the multiplication of two 6-bit unsigned numbers but we
1196 // only have one here. (No alpha mask.)
// NOTE(review): no <<6 appears in the call below; the comment above looks stale.
1197 pixmix_sse2(&dst[wt], color, s[wt]);
1198 s += overlayPitch;
1199 dst = (unsigned long *)((char *)dst + spd.pitch);
1202 break;
1203 case DM::SINGLE_COLOR | 0*DM::SSE2 | 0*DM::AYUV_PLANAR :
1205 while(h--)
1207 for(int wt=0; wt<w; ++wt)
1208 pixmix(&dst[wt], color, s[wt]);
1209 s += overlayPitch;
1210 dst = (unsigned long *)((char *)dst + spd.pitch);
1213 break;
1214 case 0*DM::SINGLE_COLOR | DM::SSE2 | 0*DM::AYUV_PLANAR :
1216 while(h--)
// The switch-point cursor restarts at the beginning of the table for every row.
1218 const DWORD *sw = switchpts;
1219 for(int wt=0; wt<w; ++wt)
1221 // xo is the offset (usually negative) we have moved into the image
1222 // So if we have passed the switchpoint (?) switch to another colour
1223 // (So switchpts stores both colours *and* coordinates?)
1224 if(wt+xo >= sw[1]) {while(wt+xo >= sw[1]) sw += 2; color = sw[-2];}
1225 pixmix_sse2(&dst[wt], color, s[wt]);
1227 s += overlayPitch;
1228 dst = (unsigned long *)((char *)dst + spd.pitch);
1231 break;
1232 case 0*DM::SINGLE_COLOR | 0*DM::SSE2 | 0*DM::AYUV_PLANAR :
1234 while(h--)
1236 const DWORD *sw = switchpts;
1237 for(int wt=0; wt<w; ++wt)
1239 if(wt+xo >= sw[1]) {while(wt+xo >= sw[1]) sw += 2; color = sw[-2];}
1240 pixmix(&dst[wt], color, s[wt]);
1242 s += overlayPitch;
1243 dst = (unsigned long *)((char *)dst + spd.pitch);
1246 break;
1247 case DM::SINGLE_COLOR | DM::SSE2 | DM::AYUV_PLANAR :
1249 unsigned char* dst_A = (unsigned char*)dst;
1250 unsigned char* dst_Y = dst_A + spd.pitch*spd.h;
1251 unsigned char* dst_U = dst_Y + spd.pitch*spd.h;
1252 unsigned char* dst_V = dst_U + spd.pitch*spd.h;
// Each plane is blended separately; the A plane is blended towards 0.
1254 AlphaBlt(dst_Y, s, ((color)>>16)&0xff, h, w, overlayPitch, spd.pitch);
1255 AlphaBlt(dst_U, s, ((color)>>8)&0xff, h, w, overlayPitch, spd.pitch);
1256 AlphaBlt(dst_V, s, ((color))&0xff, h, w, overlayPitch, spd.pitch);
1257 AlphaBlt(dst_A, s, 0, h, w, overlayPitch, spd.pitch);
1259 break;
1260 case 0*DM::SINGLE_COLOR | DM::SSE2 | DM::AYUV_PLANAR :
1262 unsigned char* dst_A = (unsigned char*)dst;
1263 unsigned char* dst_Y = dst_A + spd.pitch*spd.h;
1264 unsigned char* dst_U = dst_Y + spd.pitch*spd.h;
1265 unsigned char* dst_V = dst_U + spd.pitch*spd.h;
// Here the image is processed as vertical strips, one per colour run,
// instead of switching colour per pixel.
1267 const DWORD *sw = switchpts;
1268 int last_x = xo;
1269 color = sw[0];
1270 while(last_x<w+xo)
1272 int new_x = sw[3] < w+xo ? sw[3] : w+xo;
1273 color = sw[0];
1274 sw += 2;
// Runs that end left of the current position lie in the clipped-away part; skip them.
1275 if( new_x < last_x )
1276 continue;
1277 AlphaBlt(dst_Y, s + last_x - xo, (color>>16)&0xff, h, new_x-last_x, overlayPitch, spd.pitch);
1278 AlphaBlt(dst_U, s + last_x - xo, (color>>8)&0xff, h, new_x-last_x, overlayPitch, spd.pitch);
1279 AlphaBlt(dst_V, s + last_x - xo, (color)&0xff, h, new_x-last_x, overlayPitch, spd.pitch);
1280 AlphaBlt(dst_A, s + last_x - xo, 0, h, new_x-last_x, overlayPitch, spd.pitch);
1282 dst_A += new_x - last_x;
1283 dst_Y += new_x - last_x;
1284 dst_U += new_x - last_x;
1285 dst_V += new_x - last_x;
1286 last_x = new_x;
1289 break;
1290 case DM::SINGLE_COLOR | 0*DM::SSE2 | DM::AYUV_PLANAR :
1292 // char * debug_dst=(char*)dst;int h2 = h;
1293 // XY_DO_ONCE( xy_logger::write_file("G:\\b2_rt", (char*)&color, sizeof(color)) );
1294 // XY_DO_ONCE( xy_logger::write_file("G:\\b2_rt", debug_dst, (h2-1)*spd.pitch) );
1295 // debug_dst += spd.pitch*spd.h;
1296 // XY_DO_ONCE( xy_logger::write_file("G:\\b2_rt", debug_dst, (h2-1)*spd.pitch) );
1297 // debug_dst += spd.pitch*spd.h;
1298 // XY_DO_ONCE( xy_logger::write_file("G:\\b2_rt", debug_dst, (h2-1)*spd.pitch) );
1299 // debug_dst += spd.pitch*spd.h;
1300 // XY_DO_ONCE( xy_logger::write_file("G:\\b2_rt", debug_dst, (h2-1)*spd.pitch) );
1301 // debug_dst=(char*)dst;
1303 unsigned char* dst_A = (unsigned char*)dst;
1304 unsigned char* dst_Y = dst_A + spd.pitch*spd.h;
1305 unsigned char* dst_U = dst_Y + spd.pitch*spd.h;
1306 unsigned char* dst_V = dst_U + spd.pitch*spd.h;
1307 while(h--)
1309 for(int wt=0; wt<w; ++wt)
// Gather the four planar samples into one packed AYUV value, blend, scatter back.
1311 DWORD temp = COMBINE_AYUV(dst_A[wt], dst_Y[wt], dst_U[wt], dst_V[wt]);
1312 pixmix(&temp, color, s[wt]);
1313 SPLIT_AYUV(temp, dst_A+wt, dst_Y+wt, dst_U+wt, dst_V+wt);
1315 s += overlayPitch;
1316 dst_A += spd.pitch;
1317 dst_Y += spd.pitch;
1318 dst_U += spd.pitch;
1319 dst_V += spd.pitch;
1321 // XY_DO_ONCE( xy_logger::write_file("G:\\a2_rt", debug_dst, (h2-1)*spd.pitch) );
1322 // debug_dst += spd.pitch*spd.h;
1323 // XY_DO_ONCE( xy_logger::write_file("G:\\a2_rt", debug_dst, (h2-1)*spd.pitch) );
1324 // debug_dst += spd.pitch*spd.h;
1325 // XY_DO_ONCE( xy_logger::write_file("G:\\a2_rt", debug_dst, (h2-1)*spd.pitch) );
1326 // debug_dst += spd.pitch*spd.h;
1327 // XY_DO_ONCE( xy_logger::write_file("G:\\a2_rt", debug_dst, (h2-1)*spd.pitch) );
1329 break;
1330 case 0*DM::SINGLE_COLOR | 0*DM::SSE2 | DM::AYUV_PLANAR :
1332 unsigned char* dst_A = (unsigned char*)dst;
1333 unsigned char* dst_Y = dst_A + spd.pitch*spd.h;
1334 unsigned char* dst_U = dst_Y + spd.pitch*spd.h;
1335 unsigned char* dst_V = dst_U + spd.pitch*spd.h;
1336 while(h--)
1338 const DWORD *sw = switchpts;
1339 for(int wt=0; wt<w; ++wt)
1341 if(wt+xo >= sw[1]) {while(wt+xo >= sw[1]) sw += 2; color = sw[-2];}
1342 DWORD temp = COMBINE_AYUV(dst_A[wt], dst_Y[wt], dst_U[wt], dst_V[wt]);
// NOTE(review): this is the only branch that scales s[wt] by (color>>24) before
// calling pixmix; every other branch passes s[wt] unchanged. Confirm whether the
// colour alpha ends up applied twice here.
1343 pixmix(&temp, color, (s[wt]*(color>>24))>>8);
1344 SPLIT_AYUV(temp, dst_A+wt, dst_Y+wt, dst_U+wt, dst_V+wt);
1346 s += overlayPitch;
1347 dst_A += spd.pitch;
1348 dst_Y += spd.pitch;
1349 dst_U += spd.pitch;
1350 dst_V += spd.pitch;
1353 break;
1355 // Remember to EMMS!
1356 // Rendering fails in funny ways if we don't do this.
1357 _mm_empty();
1358 return bbox;
1361 CRect Rasterizer::Draw( SubPicDesc& spd, DrawItem& draw_item )
1363 return Draw(spd, draw_item.overlay, draw_item.clip_rect, draw_item.alpha_mask.get(),
1364 draw_item.xsub, draw_item.ysub, draw_item.switchpts, draw_item.fBody, draw_item.fBorder);
1367 DrawItem* Rasterizer::CreateDrawItem( SubPicDesc& spd, SharedPtrOverlay overlay, const CRect& clipRect, SharedArrayByte pAlphaMask, int xsub, int ysub, const DWORD* switchpts, bool fBody, bool fBorder )
1369 DrawItem* result = new DrawItem();
1370 result->overlay = overlay;
1371 result->clip_rect = clipRect;
1372 result->alpha_mask = pAlphaMask;
1373 result->xsub = xsub;
1374 result->ysub = ysub;
1376 memcpy(result->switchpts, switchpts, sizeof(result->switchpts));
1377 result->fBody = fBody;
1378 result->fBorder = fBorder;
1379 return result;
1382 CRect Rasterizer::DryDraw( SubPicDesc& spd, SharedPtrOverlay overlay, const CRect& clipRect, byte* pAlphaMask, int xsub, int ysub, const DWORD* switchpts, bool fBody, bool fBorder )
1384 CRect bbox(0, 0, 0, 0);
1385 if(!switchpts || !fBody && !fBorder) return(bbox);
1387 // clip
1388 // Limit drawn area to intersection of rendering surface and rectangular clip area
1389 CRect r(0, 0, spd.w, spd.h);
1390 r &= clipRect;
1391 // Remember that all subtitle coordinates are specified in 1/8 pixels
1392 // (x+4)>>3 rounds to nearest whole pixel.
1393 // ??? What is xsub, ysub, mOffsetX and mOffsetY ?
1394 int overlayPitch = overlay->mOverlayPitch;
1395 int x = (xsub + overlay->mOffsetX + 4)>>3;
1396 int y = (ysub + overlay->mOffsetY + 4)>>3;
1397 int w = overlay->mOverlayWidth;
1398 int h = overlay->mOverlayHeight;
1399 int xo = 0, yo = 0;
1400 // Again, limiting?
1401 if(x < r.left) {xo = r.left-x; w -= r.left-x; x = r.left;}
1402 if(y < r.top) {yo = r.top-y; h -= r.top-y; y = r.top;}
1403 if(x+w > r.right) w = r.right-x;
1404 if(y+h > r.bottom) h = r.bottom-y;
1405 // Check if there's actually anything to render
1406 if(w <= 0 || h <= 0) return(bbox);
1407 bbox.SetRect(x, y, x+w, y+h);
1408 bbox &= CRect(0, 0, spd.w, spd.h);
1410 return bbox;
1413 CRect Rasterizer::DryDraw( SubPicDesc& spd, DrawItem& draw_item )
1415 return DryDraw(spd, draw_item.overlay, draw_item.clip_rect, draw_item.alpha_mask.get(),
1416 draw_item.xsub, draw_item.ysub, draw_item.switchpts, draw_item.fBody, draw_item.fBorder);
1419 void Rasterizer::FillSolidRect(SubPicDesc& spd, int x, int y, int nWidth, int nHeight, DWORD argb)
1421 bool fSSE2 = !!(g_cpuid.m_flags & CCpuID::sse2);
1422 bool AYUV_PLANAR = (spd.type==MSP_AYUV_PLANAR);
1423 int draw_method = 0;
1424 if(fSSE2)
1425 draw_method |= DM::SSE2;
1426 if(AYUV_PLANAR)
1427 draw_method |= DM::AYUV_PLANAR;
1429 switch (draw_method)
1431 case DM::SSE2 | 0*DM::AYUV_PLANAR :
1433 for (int wy=y; wy<y+nHeight; wy++) {
1434 DWORD* dst = (DWORD*)((BYTE*)spd.bits + spd.pitch * wy) + x;
1435 for(int wt=0; wt<nWidth; ++wt) {
1436 pixmix_sse2(&dst[wt], argb, argb>>24);
1440 break;
1441 case 0*DM::SSE2 | 0*DM::AYUV_PLANAR :
1443 for (int wy=y; wy<y+nHeight; wy++) {
1444 DWORD* dst = (DWORD*)((BYTE*)spd.bits + spd.pitch * wy) + x;
1445 for(int wt=0; wt<nWidth; ++wt) {
1446 pixmix(&dst[wt], argb, argb>>24);
1450 break;
1451 case DM::SSE2 | DM::AYUV_PLANAR :
1453 BYTE* dst = reinterpret_cast<BYTE*>(spd.bits) + spd.pitch * y + x;
1454 BYTE* dst_A = dst;
1455 BYTE* dst_Y = dst_A + spd.pitch*spd.h;
1456 BYTE* dst_U = dst_Y + spd.pitch*spd.h;
1457 BYTE* dst_V = dst_U + spd.pitch*spd.h;
1458 AlphaBlt(dst_Y, argb>>24, ((argb)>>16)&0xff, nHeight, nWidth, spd.pitch);
1459 AlphaBlt(dst_U, argb>>24, ((argb)>>8)&0xff, nHeight, nWidth, spd.pitch);
1460 AlphaBlt(dst_V, argb>>24, ((argb))&0xff, nHeight, nWidth, spd.pitch);
1461 AlphaBlt(dst_A, argb>>24, 0, nHeight, nWidth, spd.pitch);
1463 break;
1464 case 0*DM::SSE2 | DM::AYUV_PLANAR :
1466 BYTE* dst = reinterpret_cast<BYTE*>(spd.bits) + spd.pitch * y + x;
1467 BYTE* dst_A = dst;
1468 BYTE* dst_Y = dst_A + spd.pitch*spd.h;
1469 BYTE* dst_U = dst_Y + spd.pitch*spd.h;
1470 BYTE* dst_V = dst_U + spd.pitch*spd.h;
1471 AlphaBltC(dst_Y, argb>>24, ((argb)>>16)&0xff, nHeight, nWidth, spd.pitch);
1472 AlphaBltC(dst_U, argb>>24, ((argb)>>8)&0xff, nHeight, nWidth, spd.pitch);
1473 AlphaBltC(dst_V, argb>>24, ((argb))&0xff, nHeight, nWidth, spd.pitch);
1474 AlphaBltC(dst_A, argb>>24, 0, nHeight, nWidth, spd.pitch);
1476 break;
1478 _mm_empty();
1481 ///////////////////////////////////////////////////////////////
1483 // Overlay
// Fills the rectangle (x, y, w, h) of outputAlphaMask with per-pixel alpha.
//
// Source coverage:
//   - pBody and pBorder both non-NULL: saturating (pBorder - pBody);
//   - exactly one of them non-NULL:    that buffer as is.
// Scaling:
//   - pAlphaMask == NULL: out = (coverage * color_alpha) >> 6;
//   - pAlphaMask != NULL: out = (coverage * pAlphaMask * color_alpha) >> 12.
//
// outputAlphaMask, pBody and pBorder are addressed with mOverlayPitch bytes
// per row; pAlphaMask is addressed with `pitch` bytes per row and is expected
// to point at the sample matching (x, y) already.
//
// Each row is processed in five stages: scalar head up to 4-byte alignment of
// dst, 4-byte MMX steps up to 16-byte alignment, 16-byte SSE2 steps, 4-byte
// MMX tail, scalar tail.
//
// NOTE(review): no _mm_empty() is visible in this function although MMX
// intrinsics are used; confirm the callers clear the MMX state.
1485 void Overlay::_DoFillAlphaMash(byte* outputAlphaMask, const byte* pBody, const byte* pBorder, int x, int y, int w, int h, const byte* pAlphaMask, int pitch, DWORD color_alpha )
1487 pBody = pBody!=NULL ? pBody + y*mOverlayPitch + x: NULL;
1488 pBorder = pBorder!=NULL ? pBorder + y*mOverlayPitch + x: NULL;
1489 byte* dst = outputAlphaMask + y*mOverlayPitch + x;
// Stage boundaries, expressed as offsets from dst. They are computed once from
// the first row's address and reused for every row.
// NOTE(review): reinterpret_cast<int>(dst) only works where a pointer fits in
// an int (32-bit builds); confirm this file is never built for 64-bit.
// NOTE(review): x0/x00 are capped at w, but the MMX loops advance in steps of 4;
// when the cap applies and the remaining width is not a multiple of 4 the last
// step appears to touch bytes beyond w — confirm this is covered by row padding.
1491 const int x0 = ((reinterpret_cast<int>(dst)+3)&~3) - reinterpret_cast<int>(dst) < w ?
1492 ((reinterpret_cast<int>(dst)+3)&~3) - reinterpret_cast<int>(dst) : w; //IMPORTANT! Should not exceed w.
1493 const int x00 = ((reinterpret_cast<int>(dst)+15)&~15) - reinterpret_cast<int>(dst) < w ?
1494 ((reinterpret_cast<int>(dst)+15)&~15) - reinterpret_cast<int>(dst) : w;//IMPORTANT! Should not exceed w.
1495 const int x_end00 = ((reinterpret_cast<int>(dst)+w)&~15) - reinterpret_cast<int>(dst);
1496 const int x_end0 = ((reinterpret_cast<int>(dst)+w)&~3) - reinterpret_cast<int>(dst);
1497 const int x_end = w;
// color_alpha replicated into every 16-bit lane for the MMX and SSE2 multiplies.
1499 __m64 color_alpha_64 = _mm_set1_pi16(color_alpha);
1500 __m128i color_alpha_128 = _mm_set1_epi16(color_alpha);
// Case 1: border minus body, no alpha mask.
1502 if(pAlphaMask==NULL && pBody!=NULL && pBorder!=NULL)
// NOTE(review): this inline-asm block broadcasts color_alpha into XMM3, but no
// code visible below refers to XMM3 (the intrinsics use color_alpha_128);
// it looks like a leftover — confirm before removing.
1505 __asm
1507 mov eax, color_alpha
1508 movd XMM3, eax
1509 punpcklwd XMM3, XMM3
1510 pshufd XMM3, XMM3, 0
1513 while(h--)
1515 int j=0;
1516 for( ; j<x0; j++ )
1518 int temp = pBorder[j]-pBody[j];
1519 temp = temp<0 ? 0 : temp;
// NOTE(review): the scalar store truncates to a byte, whereas the SIMD paths
// below saturate when packing; the two agree only while the result fits in 8 bits.
1520 dst[j] = (temp * color_alpha)>>6;
1522 for( ;j<x00;j+=4 )
1524 __m64 border = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBorder+j));
1525 __m64 body = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBody+j));
1526 border = _mm_subs_pu8(border, body);
1527 __m64 zero = _mm_setzero_si64();
1528 border = _mm_unpacklo_pi8(border, zero);
1529 border = _mm_mullo_pi16(border, color_alpha_64);
1530 border = _mm_srli_pi16(border, 6);
1531 border = _mm_packs_pu16(border,border);
1532 *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(border);
1534 __m128i zero = _mm_setzero_si128();
1535 for( ;j<x_end00;j+=16)
1537 __m128i border = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pBorder+j));
1538 __m128i body = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pBody+j));
1539 border = _mm_subs_epu8(border,body);
1540 __m128i srchi = border;
1541 border = _mm_unpacklo_epi8(border, zero);
1542 srchi = _mm_unpackhi_epi8(srchi, zero);
1543 border = _mm_mullo_epi16(border, color_alpha_128);
1544 srchi = _mm_mullo_epi16(srchi, color_alpha_128);
1545 border = _mm_srli_epi16(border, 6);
1546 srchi = _mm_srli_epi16(srchi, 6);
1547 border = _mm_packus_epi16(border, srchi);
1548 _mm_storeu_si128(reinterpret_cast<__m128i*>(dst+j), border);
1550 for( ;j<x_end0;j+=4)
1552 __m64 border = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBorder+j));
1553 __m64 body = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBody+j));
1554 border = _mm_subs_pu8(border, body);
1555 __m64 zero = _mm_setzero_si64();
1556 border = _mm_unpacklo_pi8(border, zero);
1557 border = _mm_mullo_pi16(border, color_alpha_64);
1558 border = _mm_srli_pi16(border, 6);
1559 border = _mm_packs_pu16(border,border);
1560 *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(border);
1562 for( ;j<x_end;j++)
1564 int temp = pBorder[j]-pBody[j];
1565 temp = temp<0 ? 0 : temp;
1566 dst[j] = (temp * color_alpha)>>6;
1568 pBody += mOverlayPitch;
1569 pBorder += mOverlayPitch;
1570 //pAlphaMask += pitch;
1571 dst += mOverlayPitch;
// Case 2: a single source buffer (body or border), no alpha mask.
1574 else if( ((pBody==NULL) + (pBorder==NULL))==1 && pAlphaMask==NULL)
1576 const BYTE* src1 = pBody!=NULL ? pBody : pBorder;
1577 while(h--)
1579 int j=0;
1580 for( ; j<x0; j++ )
1582 dst[j] = (src1[j] * color_alpha)>>6;
1584 for( ;j<x00;j+=4 )
1586 __m64 src = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(src1+j));
1587 __m64 zero = _mm_setzero_si64();
1588 src = _mm_unpacklo_pi8(src, zero);
1589 src = _mm_mullo_pi16(src, color_alpha_64);
1590 src = _mm_srli_pi16(src, 6);
1591 src = _mm_packs_pu16(src,src);
1592 *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(src);
1594 __m128i zero = _mm_setzero_si128();
1595 for( ;j<x_end00;j+=16)
1597 __m128i src = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src1+j));
1598 __m128i srchi = src;
1599 src = _mm_unpacklo_epi8(src, zero);
1600 srchi = _mm_unpackhi_epi8(srchi, zero);
1601 src = _mm_mullo_epi16(src, color_alpha_128);
1602 srchi = _mm_mullo_epi16(srchi, color_alpha_128);
1603 src = _mm_srli_epi16(src, 6);
1604 srchi = _mm_srli_epi16(srchi, 6);
1605 src = _mm_packus_epi16(src, srchi);
1606 _mm_storeu_si128(reinterpret_cast<__m128i*>(dst+j), src);
1608 for( ;j<x_end0;j+=4)
1610 __m64 src = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(src1+j));
1611 __m64 zero = _mm_setzero_si64();
1612 src = _mm_unpacklo_pi8(src, zero);
1613 src = _mm_mullo_pi16(src, color_alpha_64);
1614 src = _mm_srli_pi16(src, 6);
1615 src = _mm_packs_pu16(src,src);
1616 *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(src);
1618 for( ;j<x_end;j++)
1620 dst[j] = (src1[j] * color_alpha)>>6;
1622 src1 += mOverlayPitch;
1623 //pAlphaMask += pitch;
1624 dst += mOverlayPitch;
// Case 3: a single source buffer combined with the alpha mask.
1627 else if( ((pBody==NULL) + (pBorder==NULL))==1 && pAlphaMask!=NULL)
1629 const BYTE* src1 = pBody!=NULL ? pBody : pBorder;
1630 while(h--)
1632 int j=0;
1633 for( ; j<x0; j++ )
1635 dst[j] = (src1[j] * pAlphaMask[j] * color_alpha)>>12;
1637 for( ;j<x00;j+=4 )
1639 __m64 src = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(src1+j));
1640 __m64 mask = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pAlphaMask+j));
1641 __m64 zero = _mm_setzero_si64();
1642 src = _mm_unpacklo_pi8(src, zero);
1643 src = _mm_mullo_pi16(src, color_alpha_64);
// Unpacking as (zero, mask) puts each mask byte in the HIGH byte of its lane
// (mask<<8). The high-half multiply then yields (src*mask)>>8, so the remaining
// shift is 12+8-16 = 4 for a total of >>12.
// NOTE(review): the MMX paths use the signed _mm_mulhi_pi16 while the SSE2 path
// uses the unsigned _mm_mulhi_epu16; they agree only while both operands stay
// below 0x8000 — confirm the value ranges of src*color_alpha and pAlphaMask.
1644 mask = _mm_unpacklo_pi8(zero, mask); //important!
1645 src = _mm_mulhi_pi16(src, mask); //important!
1646 src = _mm_srli_pi16(src, 12+8-16); //important!
1647 src = _mm_packs_pu16(src,src);
1648 *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(src);
1650 __m128i zero = _mm_setzero_si128();
1651 for( ;j<x_end00;j+=16)
1653 __m128i src = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src1+j));
1654 __m128i mask = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pAlphaMask+j));
1655 __m128i srchi = src;
1656 __m128i maskhi = mask;
1657 src = _mm_unpacklo_epi8(src, zero);
1658 srchi = _mm_unpackhi_epi8(srchi, zero);
1659 mask = _mm_unpacklo_epi8(zero, mask); //important!
1660 maskhi = _mm_unpackhi_epi8(zero, maskhi);
1661 src = _mm_mullo_epi16(src, color_alpha_128);
1662 srchi = _mm_mullo_epi16(srchi, color_alpha_128);
1663 src = _mm_mulhi_epu16(src, mask); //important!
1664 srchi = _mm_mulhi_epu16(srchi, maskhi);
1665 src = _mm_srli_epi16(src, 12+8-16); //important!
1666 srchi = _mm_srli_epi16(srchi, 12+8-16);
1667 src = _mm_packus_epi16(src, srchi);
1668 _mm_storeu_si128(reinterpret_cast<__m128i*>(dst+j), src);
1670 for( ;j<x_end0;j+=4)
1672 __m64 src = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(src1+j));
1673 __m64 mask = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pAlphaMask+j));
1674 __m64 zero = _mm_setzero_si64();
1675 src = _mm_unpacklo_pi8(src, zero);
1676 src = _mm_mullo_pi16(src, color_alpha_64);
1677 mask = _mm_unpacklo_pi8(zero, mask); //important!
1678 src = _mm_mulhi_pi16(src, mask); //important!
1679 src = _mm_srli_pi16(src, 12+8-16); //important!
1680 src = _mm_packs_pu16(src,src);
1681 *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(src);
1683 for( ;j<x_end;j++)
1685 dst[j] = (src1[j] * pAlphaMask[j] * color_alpha)>>12;
1687 src1 += mOverlayPitch;
// The alpha mask has its own row stride.
1688 pAlphaMask += pitch;
1689 dst += mOverlayPitch;
// Case 4: border minus body, combined with the alpha mask.
1692 else if( pAlphaMask!=NULL && pBody!=NULL && pBorder!=NULL )
1694 while(h--)
1696 int j=0;
1697 for( ; j<x0; j++ )
1699 int temp = pBorder[j]-pBody[j];
1700 temp = temp<0 ? 0 : temp;
1701 dst[j] = (temp * pAlphaMask[j] * color_alpha)>>12;
1703 for( ;j<x00;j+=4 )
1705 __m64 border = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBorder+j));
1706 __m64 body = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBody+j));
1707 border = _mm_subs_pu8(border, body);
1708 __m64 mask = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pAlphaMask+j));
1709 __m64 zero = _mm_setzero_si64();
1710 border = _mm_unpacklo_pi8(border, zero);
1711 border = _mm_mullo_pi16(border, color_alpha_64);
1712 mask = _mm_unpacklo_pi8(zero, mask); //important!
1713 border = _mm_mulhi_pi16(border, mask); //important!
1714 border = _mm_srli_pi16(border, 12+8-16); //important!
1715 border = _mm_packs_pu16(border,border);
1716 *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(border);
1718 __m128i zero = _mm_setzero_si128();
1719 for( ;j<x_end00;j+=16)
1721 __m128i border = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pBorder+j));
1722 __m128i body = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pBody+j));
1723 border = _mm_subs_epu8(border,body);
1725 __m128i mask = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pAlphaMask+j));
1726 __m128i srchi = border;
1727 __m128i maskhi = mask;
1728 border = _mm_unpacklo_epi8(border, zero);
1729 srchi = _mm_unpackhi_epi8(srchi, zero);
1730 mask = _mm_unpacklo_epi8(zero, mask); //important!
1731 maskhi = _mm_unpackhi_epi8(zero, maskhi);
1732 border = _mm_mullo_epi16(border, color_alpha_128);
1733 srchi = _mm_mullo_epi16(srchi, color_alpha_128);
1734 border = _mm_mulhi_epu16(border, mask); //important!
1735 srchi = _mm_mulhi_epu16(srchi, maskhi);
1736 border = _mm_srli_epi16(border, 12+8-16); //important!
1737 srchi = _mm_srli_epi16(srchi, 12+8-16);
1738 border = _mm_packus_epi16(border, srchi);
1739 _mm_storeu_si128(reinterpret_cast<__m128i*>(dst+j), border);
1741 for( ;j<x_end0;j+=4)
1743 __m64 border = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBorder+j));
1744 __m64 body = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBody+j));
1745 border = _mm_subs_pu8(border, body);
1746 __m64 mask = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pAlphaMask+j));
1747 __m64 zero = _mm_setzero_si64();
1748 border = _mm_unpacklo_pi8(border, zero);
1749 border = _mm_mullo_pi16(border, color_alpha_64);
1750 mask = _mm_unpacklo_pi8(zero, mask); //important!
1751 border = _mm_mulhi_pi16(border, mask); //important!
1752 border = _mm_srli_pi16(border, 12+8-16); //important!
1753 border = _mm_packs_pu16(border,border);
1754 *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(border);
1756 for( ;j<x_end;j++)
1758 int temp = pBorder[j]-pBody[j];
1759 temp = temp<0 ? 0 : temp;
1760 dst[j] = (temp * pAlphaMask[j] * color_alpha)>>12;
1762 pBody += mOverlayPitch;
1763 pBorder += mOverlayPitch;
1764 pAlphaMask += pitch;
1765 dst += mOverlayPitch;
// Remaining combination: both pBody and pBorder are NULL.
1768 else
1770 //should NOT happen!
1771 ASSERT(0);
1775 void Overlay::FillAlphaMash( byte* outputAlphaMask, bool fBody, bool fBorder, int x, int y, int w, int h, const byte* pAlphaMask, int pitch, DWORD color_alpha)
1777 if(!fBorder && fBody && pAlphaMask==NULL)
1779 _DoFillAlphaMash(outputAlphaMask, mpOverlayBuffer.body, NULL, x, y, w, h, pAlphaMask, pitch, color_alpha);
1781 else if(/*fBorder &&*/ fBody && pAlphaMask==NULL)
1783 _DoFillAlphaMash(outputAlphaMask, NULL, mpOverlayBuffer.border, x, y, w, h, pAlphaMask, pitch, color_alpha);
1785 else if(!fBody && fBorder /* pAlphaMask==NULL or not*/)
1787 _DoFillAlphaMash(outputAlphaMask, mpOverlayBuffer.body, mpOverlayBuffer.border, x, y, w, h, pAlphaMask, pitch, color_alpha);
1789 else if(!fBorder && fBody && pAlphaMask!=NULL)
1791 _DoFillAlphaMash(outputAlphaMask, mpOverlayBuffer.body, NULL, x, y, w, h, pAlphaMask, pitch, color_alpha);
1793 else if(fBorder && fBody && pAlphaMask!=NULL)
1795 _DoFillAlphaMash(outputAlphaMask, NULL, mpOverlayBuffer.border, x, y, w, h, pAlphaMask, pitch, color_alpha);
1797 else
1799 //should NOT happen
1800 ASSERT(0);
1804 Overlay* Overlay::GetSubpixelVariance(unsigned int xshift, unsigned int yshift)
1806 Overlay* overlay = new Overlay();
1807 if(!overlay)
1809 return NULL;
1811 xshift &= 7;
1812 yshift &= 7;
1814 overlay->mOffsetX = mOffsetX - xshift;
1815 overlay->mOffsetY = mOffsetY - yshift;
1816 overlay->mWidth = mWidth + xshift;
1817 overlay->mHeight = mHeight + yshift;
1819 overlay->mOverlayWidth = ((overlay->mWidth+7)>>3) + 1;
1820 overlay->mOverlayHeight = ((overlay->mHeight + 7)>>3) + 1;
1821 overlay->mOverlayPitch = (overlay->mOverlayWidth+15)&~15;
1823 overlay->mpOverlayBuffer.base = reinterpret_cast<byte*>(xy_malloc(2 * overlay->mOverlayPitch * overlay->mOverlayHeight));
1824 overlay->mpOverlayBuffer.body = overlay->mpOverlayBuffer.base;
1825 overlay->mpOverlayBuffer.border = overlay->mpOverlayBuffer.base + overlay->mOverlayPitch * overlay->mOverlayHeight;
1827 overlay->mfWideOutlineEmpty = mfWideOutlineEmpty;
1829 if(overlay->mOverlayWidth==mOverlayWidth && overlay->mOverlayHeight==mOverlayHeight)
1830 memcpy(overlay->mpOverlayBuffer.base, mpOverlayBuffer.base, 2 * mOverlayPitch * mOverlayHeight);
1831 else
1833 memset(overlay->mpOverlayBuffer.base, 0, 2 * overlay->mOverlayPitch * overlay->mOverlayHeight);
1834 byte* dst = overlay->mpOverlayBuffer.body;
1835 const byte* src = mpOverlayBuffer.body;
1836 for (int i=0;i<mOverlayHeight;i++)
1838 memcpy(dst, src, mOverlayPitch);
1839 dst += overlay->mOverlayPitch;
1840 src += mOverlayPitch;
1842 dst = overlay->mpOverlayBuffer.border;
1843 src = mpOverlayBuffer.border;
1844 for (int i=0;i<mOverlayHeight;i++)
1846 memcpy(dst, src, mOverlayPitch);
1847 dst += overlay->mOverlayPitch;
1848 src += mOverlayPitch;
1851 //not equal
1852 // Bilinear(overlay->mpOverlayBuffer.base, overlay->mOverlayWidth, 2*overlay->mOverlayHeight, overlay->mOverlayPitch, xshift, yshift);
1853 Bilinear(overlay->mpOverlayBuffer.body, overlay->mOverlayWidth, overlay->mOverlayHeight, overlay->mOverlayPitch, xshift, yshift);
1854 Bilinear(overlay->mpOverlayBuffer.border, overlay->mOverlayWidth, overlay->mOverlayHeight, overlay->mOverlayPitch, xshift, yshift);
1855 return overlay;
1858 ///////////////////////////////////////////////////////////////
1860 // PathData
// Default constructor: starts with an empty path (both buffers NULL, zero points).
// NOTE(review): the constructor body is not visible in this excerpt.
1862 PathData::PathData():mpPathTypes(NULL), mpPathPoints(NULL), mPathPoints(0)
1866 PathData::PathData( const PathData& src ):mpPathTypes(NULL), mpPathPoints(NULL), mPathPoints(src.mPathPoints)
1868 //TODO: deal with the case that src.mPathPoints<0
1869 if(mPathPoints>0)
1871 mpPathTypes = static_cast<BYTE*>(malloc(mPathPoints * sizeof(BYTE)));
1872 mpPathPoints = static_cast<POINT*>(malloc(mPathPoints * sizeof(POINT)));
1874 if(mPathPoints>0)
1876 memcpy(mpPathTypes, src.mpPathTypes, mPathPoints*sizeof(BYTE));
1877 memcpy(mpPathPoints, src.mpPathPoints, mPathPoints*sizeof(POINT));
1881 const PathData& PathData::operator=( const PathData& src )
1883 if(this!=&src)
1885 if(mPathPoints!=src.mPathPoints && src.mPathPoints>0)
1887 _TrashPath();
1888 mPathPoints = src.mPathPoints;
1889 mpPathTypes = static_cast<BYTE*>(malloc(mPathPoints * sizeof(BYTE)));
1890 mpPathPoints = static_cast<POINT*>(malloc(mPathPoints * sizeof(POINT)));//better than realloc
1892 if(src.mPathPoints>0)
1894 memcpy(mpPathTypes, src.mpPathTypes, mPathPoints*sizeof(BYTE));
1895 memcpy(mpPathPoints, src.mpPathPoints, mPathPoints*sizeof(POINT));
1898 return *this;
1901 PathData::~PathData()
1903 _TrashPath();
1906 void PathData::_TrashPath()
1908 if (mpPathTypes)
1910 free(mpPathTypes);
1911 mpPathTypes = NULL;
1913 if (mpPathPoints)
1915 free(mpPathPoints);
1916 mpPathPoints = NULL;
1918 mPathPoints = 0;
1921 bool PathData::BeginPath(HDC hdc)
1923 _TrashPath();
1924 return !!::BeginPath(hdc);
1927 bool PathData::EndPath(HDC hdc)
1929 ::CloseFigure(hdc);
1930 if(::EndPath(hdc))
1932 mPathPoints = GetPath(hdc, NULL, NULL, 0);
1933 if(!mPathPoints)
1934 return true;
1935 mpPathTypes = (BYTE*)malloc(sizeof(BYTE) * mPathPoints);
1936 mpPathPoints = (POINT*)malloc(sizeof(POINT) * mPathPoints);
1937 if(mPathPoints == GetPath(hdc, mpPathPoints, mpPathTypes, mPathPoints))
1938 return true;
1940 ::AbortPath(hdc);
1941 return false;
1944 bool PathData::PartialBeginPath(HDC hdc, bool bClearPath)
1946 if(bClearPath)
1947 _TrashPath();
1948 return !!::BeginPath(hdc);
1951 bool PathData::PartialEndPath(HDC hdc, long dx, long dy)
1953 ::CloseFigure(hdc);
1954 if(::EndPath(hdc))
1956 int nPoints;
1957 BYTE* pNewTypes;
1958 POINT* pNewPoints;
1959 nPoints = GetPath(hdc, NULL, NULL, 0);
1960 if(!nPoints)
1961 return true;
1962 pNewTypes = (BYTE*)realloc(mpPathTypes, (mPathPoints + nPoints) * sizeof(BYTE));
1963 pNewPoints = (POINT*)realloc(mpPathPoints, (mPathPoints + nPoints) * sizeof(POINT));
1964 if(pNewTypes)
1965 mpPathTypes = pNewTypes;
1966 if(pNewPoints)
1967 mpPathPoints = pNewPoints;
1968 BYTE* pTypes = new BYTE[nPoints];
1969 POINT* pPoints = new POINT[nPoints];
1970 if(pNewTypes && pNewPoints && nPoints == GetPath(hdc, pPoints, pTypes, nPoints))
1972 for(int i = 0; i < nPoints; ++i)
1974 mpPathPoints[mPathPoints + i].x = pPoints[i].x + dx;
1975 mpPathPoints[mPathPoints + i].y = pPoints[i].y + dy;
1976 mpPathTypes[mPathPoints + i] = pTypes[i];
1978 mPathPoints += nPoints;
1979 delete[] pTypes;
1980 delete[] pPoints;
1981 return true;
1983 else
1984 DebugBreak();
1985 delete[] pTypes;
1986 delete[] pPoints;
1988 ::AbortPath(hdc);
1989 return false;
1992 void PathData::AlignLeftTop(CPoint *left_top, CSize *size)
1994 int minx = INT_MAX;
1995 int miny = INT_MAX;
1996 int maxx = INT_MIN;
1997 int maxy = INT_MIN;
1998 for(int i=0; i<mPathPoints; ++i)
2000 int ix = mpPathPoints[i].x;
2001 int iy = mpPathPoints[i].y;
2002 if(ix < minx) minx = ix;
2003 if(ix > maxx) maxx = ix;
2004 if(iy < miny) miny = iy;
2005 if(iy > maxy) maxy = iy;
2007 if(minx > maxx || miny > maxy)
2009 _TrashPath();
2010 *left_top = CPoint(0, 0);
2011 *size = CSize(0, 0);
2012 return;
2014 minx = (minx >> 3) & ~7;
2015 miny = (miny >> 3) & ~7;
2016 maxx = (maxx + 7) >> 3;
2017 maxy = (maxy + 7) >> 3;
2018 for(int i=0; i<mPathPoints; ++i)
2020 mpPathPoints[i].x -= minx*8;
2021 mpPathPoints[i].y -= miny*8;
2023 *left_top = CPoint(minx, miny);
2024 *size = CSize(maxx+1-minx, maxy+1-miny);
2025 return;
2028 //////////////////////////////////////////////////////////////////////////
2030 // ScanLineData
// Default constructor.
// NOTE(review): the body is not visible in this excerpt. _ReallocEdgeBuffer passes
// mpEdgeBuffer to realloc, so confirm it is initialised before that first call.
2032 ScanLineData::ScanLineData()
// Destructor.
// NOTE(review): the body is not visible in this excerpt; confirm that the buffer
// grown by _ReallocEdgeBuffer is released somewhere.
2036 ScanLineData::~ScanLineData()
2040 void ScanLineData::_ReallocEdgeBuffer(int edges)
2042 mEdgeHeapSize = edges;
2043 mpEdgeBuffer = (Edge*)realloc(mpEdgeBuffer, sizeof(Edge)*edges);
2046 void ScanLineData::_EvaluateBezier(const PathData& path_data, int ptbase, bool fBSpline)
2048 const POINT* pt0 = path_data.mpPathPoints + ptbase;
2049 const POINT* pt1 = path_data.mpPathPoints + ptbase + 1;
2050 const POINT* pt2 = path_data.mpPathPoints + ptbase + 2;
2051 const POINT* pt3 = path_data.mpPathPoints + ptbase + 3;
2052 double x0 = pt0->x;
2053 double x1 = pt1->x;
2054 double x2 = pt2->x;
2055 double x3 = pt3->x;
2056 double y0 = pt0->y;
2057 double y1 = pt1->y;
2058 double y2 = pt2->y;
2059 double y3 = pt3->y;
2060 double cx3, cx2, cx1, cx0, cy3, cy2, cy1, cy0;
2061 if(fBSpline)
2063 // 1 [-1 +3 -3 +1]
2064 // - * [+3 -6 +3 0]
2065 // 6 [-3 0 +3 0]
2066 // [+1 +4 +1 0]
2067 double _1div6 = 1.0/6.0;
2068 cx3 = _1div6*(- x0+3*x1-3*x2+x3);
2069 cx2 = _1div6*( 3*x0-6*x1+3*x2);
2070 cx1 = _1div6*(-3*x0 +3*x2);
2071 cx0 = _1div6*( x0+4*x1+1*x2);
2072 cy3 = _1div6*(- y0+3*y1-3*y2+y3);
2073 cy2 = _1div6*( 3*y0-6*y1+3*y2);
2074 cy1 = _1div6*(-3*y0 +3*y2);
2075 cy0 = _1div6*( y0+4*y1+1*y2);
2077 else // bezier
2079 // [-1 +3 -3 +1]
2080 // [+3 -6 +3 0]
2081 // [-3 +3 0 0]
2082 // [+1 0 0 0]
2083 cx3 = - x0+3*x1-3*x2+x3;
2084 cx2 = 3*x0-6*x1+3*x2;
2085 cx1 = -3*x0+3*x1;
2086 cx0 = x0;
2087 cy3 = - y0+3*y1-3*y2+y3;
2088 cy2 = 3*y0-6*y1+3*y2;
2089 cy1 = -3*y0+3*y1;
2090 cy0 = y0;
2093 // This equation is from Graphics Gems I.
2095 // The idea is that since we're approximating a cubic curve with lines,
2096 // any error we incur is due to the curvature of the line, which we can
2097 // estimate by calculating the maximum acceleration of the curve. For
2098 // a cubic, the acceleration (second derivative) is a line, meaning that
2099 // the absolute maximum acceleration must occur at either the beginning
2100 // (|c2|) or the end (|c2+c3|). Our bounds here are a little more
2101 // conservative than that, but that's okay.
2103 // If the acceleration of the parametric formula is zero (c2 = c3 = 0),
2104 // that component of the curve is linear and does not incur any error.
2105 // If a=0 for both X and Y, the curve is a line segment and we can
2106 // use a step size of 1.
2107 double maxaccel1 = fabs(2*cy2) + fabs(6*cy3);
2108 double maxaccel2 = fabs(2*cx2) + fabs(6*cx3);
2109 double maxaccel = maxaccel1 > maxaccel2 ? maxaccel1 : maxaccel2;
2110 double h = 1.0;
2111 if(maxaccel > 8.0) h = sqrt(8.0 / maxaccel);
2112 if(!fFirstSet) {firstp.x = (LONG)cx0; firstp.y = (LONG)cy0; lastp = firstp; fFirstSet = true;}
2113 for(double t = 0; t < 1.0; t += h)
2115 double x = cx0 + t*(cx1 + t*(cx2 + t*cx3));
2116 double y = cy0 + t*(cy1 + t*(cy2 + t*cy3));
2117 _EvaluateLine(lastp.x, lastp.y, (int)x, (int)y);
2119 double x = cx0 + cx1 + cx2 + cx3;
2120 double y = cy0 + cy1 + cy2 + cy3;
2121 _EvaluateLine(lastp.x, lastp.y, (int)x, (int)y);
2124 void ScanLineData::_EvaluateLine(const PathData& path_data, int pt1idx, int pt2idx)
2126 const POINT* pt1 = path_data.mpPathPoints + pt1idx;
2127 const POINT* pt2 = path_data.mpPathPoints + pt2idx;
2128 _EvaluateLine(pt1->x, pt1->y, pt2->x, pt2->y);
2131 void ScanLineData::_EvaluateLine(int x0, int y0, int x1, int y1)
2133 if(lastp.x != x0 || lastp.y != y0)
2135 _EvaluateLine(lastp.x, lastp.y, x0, y0);
2137 if(!fFirstSet) {firstp.x = x0; firstp.y = y0; fFirstSet = true;}
2138 lastp.x = x1;
2139 lastp.y = y1;
2140 if(y1 > y0) // down
2142 __int64 xacc = (__int64)x0 << 13;
2143 // prestep y0 down
2144 int dy = y1 - y0;
2145 int y = ((y0 + 3)&~7) + 4;
2146 int iy = y >> 3;
2147 y1 = (y1 - 5) >> 3;
2148 if(iy <= y1)
2150 __int64 invslope = (__int64(x1 - x0) << 16) / dy;
2151 while(mEdgeNext + y1 + 1 - iy > mEdgeHeapSize)
2152 _ReallocEdgeBuffer(mEdgeHeapSize*2);
2153 xacc += (invslope * (y - y0)) >> 3;
2154 while(iy <= y1)
2156 int ix = (int)((xacc + 32768) >> 16);
2157 mpEdgeBuffer[mEdgeNext].next = mpScanBuffer[iy];
2158 mpEdgeBuffer[mEdgeNext].posandflag = ix*2 + 1;
2159 mpScanBuffer[iy] = mEdgeNext++;
2160 ++iy;
2161 xacc += invslope;
2165 else if(y1 < y0) // up
2167 __int64 xacc = (__int64)x1 << 13;
2168 // prestep y1 down
2169 int dy = y0 - y1;
2170 int y = ((y1 + 3)&~7) + 4;
2171 int iy = y >> 3;
2172 y0 = (y0 - 5) >> 3;
2173 if(iy <= y0)
2175 __int64 invslope = (__int64(x0 - x1) << 16) / dy;
2176 while(mEdgeNext + y0 + 1 - iy > mEdgeHeapSize)
2177 _ReallocEdgeBuffer(mEdgeHeapSize*2);
2178 xacc += (invslope * (y - y1)) >> 3;
2179 while(iy <= y0)
2181 int ix = (int)((xacc + 32768) >> 16);
2182 mpEdgeBuffer[mEdgeNext].next = mpScanBuffer[iy];
2183 mpEdgeBuffer[mEdgeNext].posandflag = ix*2;
2184 mpScanBuffer[iy] = mEdgeNext++;
2185 ++iy;
2186 xacc += invslope;
2192 bool ScanLineData::ScanConvert(const PathData& path_data, const CSize& size)
2194 int lastmoveto = -1;
2195 int i;
2196 // Drop any outlines we may have.
2197 mOutline.clear();
2198 mWideOutline.clear();
2199 mWideBorder = 0;
2200 // Determine bounding box
2201 if(!path_data.mPathPoints)
2203 mWidth = mHeight = 0;
2204 return false;
2206 mWidth = size.cx;
2207 mHeight = size.cy;
2208 // Initialize edge buffer. We use edge 0 as a sentinel.
2209 mEdgeNext = 1;
2210 mEdgeHeapSize = 2048;
2211 mpEdgeBuffer = (Edge*)malloc(sizeof(Edge)*mEdgeHeapSize);
2212 // Initialize scanline list.
2213 mpScanBuffer = new unsigned int[mHeight];
2214 memset(mpScanBuffer, 0, mHeight*sizeof(unsigned int));
2215 // Scan convert the outline. Yuck, Bezier curves....
2216 // Unfortunately, Windows 95/98 GDI has a bad habit of giving us text
2217 // paths with all but the first figure left open, so we can't rely
2218 // on the PT_CLOSEFIGURE flag being used appropriately.
2219 fFirstSet = false;
2220 firstp.x = firstp.y = 0;
2221 lastp.x = lastp.y = 0;
2222 for(i=0; i<path_data.mPathPoints; ++i)
2224 BYTE t = path_data.mpPathTypes[i] & ~PT_CLOSEFIGURE;
2225 switch(t)
2227 case PT_MOVETO:
2228 if(lastmoveto >= 0 && firstp != lastp)
2229 _EvaluateLine(lastp.x, lastp.y, firstp.x, firstp.y);
2230 lastmoveto = i;
2231 fFirstSet = false;
2232 lastp = path_data.mpPathPoints[i];
2233 break;
2234 case PT_MOVETONC:
2235 break;
2236 case PT_LINETO:
2237 if(path_data.mPathPoints - (i-1) >= 2) _EvaluateLine(path_data, i-1, i);
2238 break;
2239 case PT_BEZIERTO:
2240 if(path_data.mPathPoints - (i-1) >= 4) _EvaluateBezier(path_data, i-1, false);
2241 i += 2;
2242 break;
2243 case PT_BSPLINETO:
2244 if(path_data.mPathPoints - (i-1) >= 4) _EvaluateBezier(path_data, i-1, true);
2245 i += 2;
2246 break;
2247 case PT_BSPLINEPATCHTO:
2248 if(path_data.mPathPoints - (i-3) >= 4) _EvaluateBezier(path_data, i-3, true);
2249 break;
2252 if(lastmoveto >= 0 && firstp != lastp)
2253 _EvaluateLine(lastp.x, lastp.y, firstp.x, firstp.y);
2254 // Convert the edges to spans. We couldn't do this before because some of
2255 // the regions may have winding numbers >+1 and it would have been a pain
2256 // to try to adjust the spans on the fly. We use one heap to detangle
2257 // a scanline's worth of edges from the singly-linked lists, and another
2258 // to collect the actual scans.
2259 std::vector<int> heap;
2260 mOutline.reserve(mEdgeNext / 2);
2261 __int64 y = 0;
2262 for(y=0; y<mHeight; ++y)
2264 int count = 0;
2265 // Detangle scanline into edge heap.
2266 for(unsigned ptr = (unsigned)(mpScanBuffer[y]&0xffffffff); ptr; ptr = mpEdgeBuffer[ptr].next)
2268 heap.push_back(mpEdgeBuffer[ptr].posandflag);
2270 // Sort edge heap. Note that we conveniently made the opening edges
2271 // one more than closing edges at the same spot, so we won't have any
2272 // problems with abutting spans.
2273 std::sort(heap.begin(), heap.end()/*begin() + heap.size()*/);
2274 // Process edges and add spans. Since we only check for a non-zero
2275 // winding number, it doesn't matter which way the outlines go!
2276 std::vector<int>::iterator itX1 = heap.begin();
2277 std::vector<int>::iterator itX2 = heap.end(); // begin() + heap.size();
2278 int x1, x2;
2279 for(; itX1 != itX2; ++itX1)
2281 int x = *itX1;
2282 if(!count)
2283 x1 = (x>>1);
2284 if(x&1)
2285 ++count;
2286 else
2287 --count;
2288 if(!count)
2290 x2 = (x>>1);
2291 if(x2>x1)
2292 mOutline.push_back(std::pair<__int64,__int64>((y<<32)+x1+0x4000000040000000i64, (y<<32)+x2+0x4000000040000000i64)); // G: damn Avery, this is evil! :)
2295 heap.clear();
2297 // Dump the edge and scan buffers, since we no longer need them.
2298 free(mpEdgeBuffer);
2299 delete [] mpScanBuffer;
2300 // All done!
2301 return true;
2304 using namespace std;
2306 void ScanLineData::_OverlapRegion(tSpanBuffer& dst, tSpanBuffer& src, int dx, int dy)
2308 tSpanBuffer temp;
2309 temp.reserve(dst.size() + src.size());
2310 dst.swap(temp);
2311 tSpanBuffer::iterator itA = temp.begin();
2312 tSpanBuffer::iterator itAE = temp.end();
2313 tSpanBuffer::iterator itB = src.begin();
2314 tSpanBuffer::iterator itBE = src.end();
2315 // Don't worry -- even if dy<0 this will still work! // G: hehe, the evil twin :)
2316 unsigned __int64 offset1 = (((__int64)dy)<<32) - dx;
2317 unsigned __int64 offset2 = (((__int64)dy)<<32) + dx;
2318 while(itA != itAE && itB != itBE)
2320 if((*itB).first + offset1 < (*itA).first)
2322 // B span is earlier. Use it.
2323 unsigned __int64 x1 = (*itB).first + offset1;
2324 unsigned __int64 x2 = (*itB).second + offset2;
2325 ++itB;
2326 // B spans don't overlap, so begin merge loop with A first.
2327 for(;;)
2329 // If we run out of A spans or the A span doesn't overlap,
2330 // then the next B span can't either (because B spans don't
2331 // overlap) and we exit.
2332 if(itA == itAE || (*itA).first > x2)
2333 break;
2334 do {x2 = _MAX(x2, (*itA++).second);}
2335 while(itA != itAE && (*itA).first <= x2);
2336 // If we run out of B spans or the B span doesn't overlap,
2337 // then the next A span can't either (because A spans don't
2338 // overlap) and we exit.
2339 if(itB == itBE || (*itB).first + offset1 > x2)
2340 break;
2341 do {x2 = _MAX(x2, (*itB++).second + offset2);}
2342 while(itB != itBE && (*itB).first + offset1 <= x2);
2344 // Flush span.
2345 dst.push_back(tSpan(x1, x2));
2347 else
2349 // A span is earlier. Use it.
2350 unsigned __int64 x1 = (*itA).first;
2351 unsigned __int64 x2 = (*itA).second;
2352 ++itA;
2353 // A spans don't overlap, so begin merge loop with B first.
2354 for(;;)
2356 // If we run out of B spans or the B span doesn't overlap,
2357 // then the next A span can't either (because A spans don't
2358 // overlap) and we exit.
2359 if(itB == itBE || (*itB).first + offset1 > x2)
2360 break;
2361 do {x2 = _MAX(x2, (*itB++).second + offset2);}
2362 while(itB != itBE && (*itB).first + offset1 <= x2);
2363 // If we run out of A spans or the A span doesn't overlap,
2364 // then the next B span can't either (because B spans don't
2365 // overlap) and we exit.
2366 if(itA == itAE || (*itA).first > x2)
2367 break;
2368 do {x2 = _MAX(x2, (*itA++).second);}
2369 while(itA != itAE && (*itA).first <= x2);
2371 // Flush span.
2372 dst.push_back(tSpan(x1, x2));
2375 // Copy over leftover spans.
2376 while(itA != itAE)
2377 dst.push_back(*itA++);
2378 while(itB != itBE)
2380 dst.push_back(tSpan((*itB).first + offset1, (*itB).second + offset2));
2381 ++itB;
2385 bool ScanLineData::CreateWidenedRegion(int rx, int ry)
2387 if(rx < 0) rx = 0;
2388 if(ry < 0) ry = 0;
2389 mWideBorder = max(rx,ry);
2390 if (ry > 0)
2392 // Do a half circle.
2393 // _OverlapRegion mirrors this so both halves are done.
2394 for(int y = -ry; y <= ry; ++y)
2396 int x = (int)(0.5 + sqrt(float(ry*ry - y*y)) * float(rx)/float(ry));
2397 _OverlapRegion(mWideOutline, mOutline, x, y);
2400 else if (ry == 0 && rx > 0)
2402 // There are artifacts if we don't make at least two overlaps of the line, even at same Y coord
2403 _OverlapRegion(mWideOutline, mOutline, rx, 0);
2404 _OverlapRegion(mWideOutline, mOutline, rx, 0);
2406 return true;
2409 void ScanLineData::DeleteOutlines()
2411 mWideOutline.clear();
2412 mOutline.clear();