Added MMX/SSE optimizations for blur.
[tagua/yd.git] / src / imageeffects.cpp
bloba6828d3935ab4e779caab8a27e529c870e16c920
1 /*
2 Copyright (c) 2006 Maurizio Monge <maurizio.monge@kdemail.net>
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
9 Part of the code has been contributed by Jani Huhtanen.
10 Copyright (c) 2006 Jani Huhtanen.
14 #include <cmath>
15 #include <QPainter>
16 #include "imageeffects.h"
17 #if defined(HAVE_X86_MMX) || defined(HAVE_X86_SSE)
18 #include <kcpuinfo.h>
19 #endif
23 template<int aprec, int zprec>
24 static inline void blurinner(unsigned char *bptr, int &zR,
25 int &zG, int &zB, int &zA, int alpha);
27 template<int aprec,int zprec>
28 static inline void blurrow(QImage & im, int line, int alpha);
30 template<int aprec, int zprec>
31 static inline void blurcol(QImage & im, int col, int alpha);
34 * expblur(QImage &img, int radius)
36 * In-place blur of image 'img' with kernel
37 * of approximate radius 'radius'.
39 * Blurs with two sided exponential impulse
40 * response.
42 * aprec = precision of alpha parameter
43 * in fixed-point format 0.aprec
45 * zprec = precision of state parameters
46 * zR,zG,zB and zA in fp format 8.zprec
48 template<int aprec,int zprec>
49 static void expblur(QImage &img, int radius )
51 if (radius < 1)
52 return;
54 /* Calculate the alpha such that 90% of
55 the kernel is within the radius.
56 (Kernel extends to infinity)
58 int alpha = (int)((1<<aprec)*(1.0f-expf(-2.3f/(radius+1.f))));
60 for(int row=0;row<img.height();row++) {
61 blurrow<aprec,zprec>(img,row,alpha);
64 for(int col=0;col<img.width();col++) {
65 blurcol<aprec,zprec>(img,col,alpha);
67 return;
/*
 * One filter step on a single 4-byte pixel.
 *
 * bptr        points at the four channel bytes of the pixel; they are
 *             overwritten with the filtered values.
 * zR,zG,zB,zA running filter state of each channel (fixed point with zprec
 *             fractional bits), updated in place.
 * alpha       filter coefficient (fixed point with aprec fractional bits).
 */
template<int aprec, int zprec>
static inline void blurinner(unsigned char *bptr,
                             int &zR, int &zG, int &zB, int &zA, int alpha)
{
    // Read all four channel bytes before any state is modified.
    const int sample[4] = { bptr[0], bptr[1], bptr[2], bptr[3] };
    int *const state[4] = { &zR, &zG, &zB, &zA };

    // Move every state towards its sample by the fraction alpha.
    for (int ch = 0; ch < 4; ++ch)
        *state[ch] += (alpha * ((sample[ch] << zprec) - *state[ch])) >> aprec;

    // The integer part of each state is the new channel value.
    for (int ch = 0; ch < 4; ++ch)
        bptr[ch] = *state[ch] >> zprec;
}
91 template<int aprec,int zprec>
92 static inline void blurrow( QImage & im, int line, int alpha)
94 int zR,zG,zB,zA;
96 QRgb *ptr = (QRgb *)im.scanLine(line);
98 zR = *((unsigned char *)ptr )<<zprec;
99 zG = *((unsigned char *)ptr + 1)<<zprec;
100 zB = *((unsigned char *)ptr + 2)<<zprec;
101 zA = *((unsigned char *)ptr + 3)<<zprec;
103 for(int index=1; index<im.width(); index++) {
104 blurinner<aprec,zprec>((unsigned char *)&ptr[index],
105 zR, zG, zB, zA, alpha);
107 for(int index=im.width()-2; index>=0; index--) {
108 blurinner<aprec,zprec>((unsigned char *)&ptr[index],
109 zR, zG, zB, zA, alpha);
115 template<int aprec, int zprec>
116 static inline void blurcol(QImage & im, int col, int alpha)
118 int zR,zG,zB,zA;
120 QRgb *ptr = (QRgb *)im.bits();
121 ptr+=col;
123 zR = *((unsigned char *)ptr )<<zprec;
124 zG = *((unsigned char *)ptr + 1)<<zprec;
125 zB = *((unsigned char *)ptr + 2)<<zprec;
126 zA = *((unsigned char *)ptr + 3)<<zprec;
128 for(int index=im.width(); index<(im.height()-1)*im.width();
129 index+=im.width()) {
130 blurinner<aprec,zprec>((unsigned char *)&ptr[index],
131 zR, zG, zB, zA, alpha);
134 for(int index=(im.height()-2)*im.width(); index>=0;
135 index-=im.width()) {
136 blurinner<aprec,zprec>((unsigned char *)&ptr[index],
137 zR, zG, zB, zA, alpha);
142 #ifdef HAVE_X86_SSE
144 #include <inttypes.h>
145 #include <xmmintrin.h>
147 union vec4i
149 uint16_t i[8];
150 __m128i v;
153 static inline void blur_sse_near(void* pixels, __m128i& state, __m128i alpha)
155 uint64_t z1 = 0ULL;
156 uint64_t z2 = 0ULL;
157 uint64_t z3 = 0ULL;
159 asm(
160 "movq %[ppix], %[pixels]\n"
161 "punpcklbw %[pixels], %[aux1]\n" // unpack two pixels setting their bytes
162 // as the most significant in the corr. word
163 "psrlw $1, %[aux1]\n" // shift right by 1, i.e. shift the colour
164 // bytes left by 7
165 "psubw %[state], %[aux1]\n" // - state
166 "pmulhw %[alpha], %[aux1]\n" // * alpha, and take the 16 most significant bits
167 "psllw $1, %[aux1]\n" // shift left (we trade 1 bit for performance, here)
168 "paddw %[aux1], %[state]\n" // sum result to state
169 "movdqa %[state], %[aux2]\n" // copy state to the aux2 register
170 "psrlw $7, %[aux2]\n" // shift right by 7: this is the new pixel value
171 "packuswb %[aux2], %[aux2]\n" // pack pixels as 8 bits
172 "movq %[aux2], %[ppix]\n"
173 : [state] "+x"(state)
174 , [ppix] "+m"(*(uint64_t*)pixels)
175 , [aux1] "+x"(z1)
176 , [aux2] "+x"(z2)
177 , [pixels] "+x"(z3)
178 : [alpha] "x"(alpha)
182 static inline void blur_sse_sep(void* pixel1, void* pixel2, __m128i& state, __m128i alpha)
184 uint64_t z1 = 0ULL;
185 uint64_t z2 = 0ULL;
186 uint64_t z3 = 0ULL;
187 uint64_t z4 = 0ULL;
189 asm(
190 "movd %[ppix1], %[pixels]\n" // load the first pixel
191 "movd %[ppix2], %[tmp]\n" // load the second pixel in [tmp]
192 "pslldq $4, %[tmp]\n" // shift left the second pixel
193 "paddd %[tmp], %[pixels]\n" // now both pixel are packed in [pixels]
195 "punpcklbw %[pixels], %[aux1]\n" // unpack two pixels setting their bytes
196 // as the most significant in the corr. word
197 "psrlw $1, %[aux1]\n" // shift right by 1, i.e. shift the colour
198 // bytes left by 7
199 "psubw %[state], %[aux1]\n" // - state
200 "pmulhw %[alpha], %[aux1]\n" // * alpha, and take the 16 most significant bits
201 "psllw $1, %[aux1]\n" // shift left (we trade 1 bit for performance, here)
202 "paddw %[aux1], %[state]\n" // sum result to state
203 "movdqa %[state], %[aux2]\n" // copy state to the aux2 register
204 "psrlw $7, %[aux2]\n" // shift right by 7: this is the new pixel value
205 "packuswb %[aux2], %[aux2]\n" // pack pixels as 8 bits
207 "movd %[aux2], %[ppix1]\n"
208 "psrldq $4, %[aux2]\n"
209 "movd %[aux2], %[ppix2]\n"
210 : [state] "+x"(state)
211 , [ppix1] "+m"(*(uint32_t*)pixel1)
212 , [ppix2] "+m"(*(uint32_t*)pixel2)
213 , [aux1] "+x"(z1)
214 , [aux2] "+x"(z2)
215 , [tmp] "+x"(z3)
216 , [pixels] "+x"(z4)
217 : [alpha] "x"(alpha)
221 static void expblur_sse( QImage &img, int radius )
223 if(radius<1)
224 return;
226 /* Calculate the alpha such that 90% of
227 the kernel is within the radius.
228 (Kernel extends to infinity)
230 uint16_t alpha = (uint16_t)((1<<15)*(1.0f-expf(-2.3f/(radius+1.f))));
232 vec4i a;
233 QRgb *ptr = (QRgb *)img.bits();
234 int h = img.height();
235 int w = img.width();
236 int hw = (img.height()-1)*img.width();
237 for(int i=0;i<8;i++)
238 a.i[i] = alpha;
240 for(int row=0;row<h-1;row+=2)
242 vec4i z;
243 uint8_t *cptr = (uint8_t*)(ptr+row*w);
244 for(int i=0;i<4;i++)
245 z.i[i] = cptr[i]<<7;
246 for(int i=0;i<4;i++)
247 z.i[4+i] = cptr[w*4+i]<<7;
249 for(int index=1; index<w; index++)
250 blur_sse_sep(&cptr[index*4], &cptr[(index+w)*4], z.v, a.v);
252 for(int index=w-2; index>=0; index--)
253 blur_sse_sep(&cptr[index*4], &cptr[(index+w)*4] , z.v, a.v);
256 if(h & 1)
258 vec4i z;
259 int dummy;
260 uint8_t *cptr = (uint8_t*)(ptr+(h-1)*w);
261 for(int i=0;i<4;i++)
262 z.i[i] = cptr[i]<<7;
264 for(int index=1; index<w; index++)
265 blur_sse_sep(&cptr[index*4], &dummy, z.v, a.v);
267 for(int index=w-2; index>=0; index--)
268 blur_sse_sep(&cptr[index*4], &dummy, z.v, a.v);
271 for(int col=0;col<w-1;col+=2)
273 vec4i z;
274 uint8_t *cptr = (uint8_t*)(ptr+col);
276 for(int i=0;i<8;i++)
277 z.i[i] = cptr[i]<<7;
279 for(int index=w; index<hw; index+=w)
280 blur_sse_near(&cptr[index*4], z.v, a.v);
282 for(int index=hw-2*w; index>=0; index-=w)
283 blur_sse_near(&cptr[index*4], z.v, a.v);
286 if(w & 1)
288 vec4i z;
289 int dummy;
290 uint8_t *cptr = (uint8_t*)(ptr+w-1);
292 for(int i=0;i<4;i++)
293 z.i[i] = cptr[i]<<7;
295 for(int index=w; index<hw; index+=w)
296 blur_sse_sep(&cptr[index*4], &dummy, z.v, a.v);
298 for(int index=hw-w; index>=0; index-=w)
299 blur_sse_sep(&cptr[index*4], &dummy, z.v, a.v);
302 return;
304 #endif //HAVE_X86_SSE
307 #ifdef HAVE_X86_MMX
309 #include <inttypes.h>
310 #include <mmintrin.h>
312 union vec4s
314 uint16_t i[4];
315 __m64 v;
318 static inline void blur_mmx(void *px, __m64& v, __m64& alpha)
320 uint64_t z1 = 0ULL;
321 uint64_t z2 = 0ULL;
322 asm(
323 "movd %[pixel], %[t1]\n"
324 "punpcklbw %[t1], %[t2]\n"
325 "psrlw $1, %[t2]\n"
326 "psubw %[accum], %[t2]\n"
327 "pmulhw %[alpha], %[t2]\n"
328 "psllw $1, %[t2]\n"
329 "paddw %[t2], %[accum]\n"
330 "movq %[accum], %[t1]\n"
331 "psrlw $7, %[t1]\n"
332 // "pand %[mask], %[t1]\n"
333 "packuswb %[t1], %[t1]\n"
334 "movd %[t1], %[pixel]\n"
335 : [pixel] "+m"(*(uint32_t*)px)
336 , [accum] "+y"(v)
337 , [t1] "+y"(z1)
338 , [t2] "+y"(z2)
339 : [alpha] "y"(alpha)
340 // , [mask] "y"(0x00ff00ff00ff00ffULL)
344 static void expblur_mmx( QImage &img, int radius )
346 if(radius<1)
347 return;
349 /* Calculate the alpha such that 90% of
350 the kernel is within the radius.
351 (Kernel extends to infinity)
353 uint16_t alpha = (uint16_t)((1<<15)*(1.0f-expf(-2.3f/(radius+1.f))));
355 vec4s a;
356 QRgb *ptr = (QRgb *)img.bits();
357 int h = img.height();
358 int w = img.width();
359 int hw = (img.height()-1)*img.width();
360 for(int i=0;i<4;i++)
361 a.i[i] = alpha;
363 for(int row=0;row<h;row++)
365 vec4s z;
366 uint8_t *cptr = (uint8_t*)(ptr+row*w);
367 for(int i=0;i<4;i++)
368 z.i[i] = cptr[i]<<7;
370 for(int index=1; index<w; index++)
371 blur_mmx(&cptr[index*4], z.v, a.v);
373 for(int index=w-2; index>=0; index--)
374 blur_mmx(&cptr[index*4], z.v, a.v);
377 for(int col=0;col<w;col++)
379 vec4s z;
380 uint8_t *cptr = (uint8_t*)(ptr+col);
382 for(int i=0;i<4;i++)
383 z.i[i] = cptr[i]<<7;
385 for(int index=w; index<hw; index+=w)
386 blur_mmx(&cptr[index*4], z.v, a.v);
388 for(int index=hw-w; index>=0; index-=w)
389 blur_mmx(&cptr[index*4], z.v, a.v);
392 asm("emms");
393 return;
395 #endif //HAVE_X86_MMX
400 namespace ImageEffects {
402 void expBlur(QImage& img, int radius) {
403 #ifdef HAVE_X86_SSE
404 if(KCPUInfo::haveExtension( KCPUInfo::IntelSSE ) )
405 return expblur_sse(img, radius);
406 #endif
407 #ifdef HAVE_X86_MMX
408 if(KCPUInfo::haveExtension( KCPUInfo::IntelMMX ) )
409 return expblur_mmx(img, radius);
410 #endif
411 return expblur<15,7>(img, radius);
414 QImage addShadow(const QImage& image, int r, QColor color,
415 int offx, int offy, int growx, int growy) {
416 QPainter p;
417 QImage retv(image.width()+growx, image.height()+growy, QImage::Format_ARGB32_Premultiplied);
418 int dx = (growx-offx)/2, dy = (growy-offy)/2;
420 p.begin(&retv);
421 p.setCompositionMode(QPainter::CompositionMode_Source);
422 p.fillRect(0,0,retv.width(), retv.height(), QColor(0,0,0,0));
423 p.fillRect(dx+offx, dy+offy, image.width(), image.height(), color);
424 p.setCompositionMode(QPainter::CompositionMode_DestinationAtop );
425 p.drawImage(dx+offx, dy+offy, image);
426 p.end();
428 expBlur(retv, r);
430 p.begin(&retv);
431 p.drawImage(dx, dy, image);
432 p.end();
434 return retv;
437 QImage growBorder(const QImage& image) {
438 int w = image.width();
439 int h = image.height();
440 QPainter p;
441 QImage retv(w+2, h+2, QImage::Format_ARGB32_Premultiplied);
443 p.begin(&retv);
444 p.setCompositionMode(QPainter::CompositionMode_Source);
445 p.drawImage(0, 0, image, 0, 0, 1, 1);
446 p.drawImage(w+1, 0, image, w-1, 0, 1, 1);
447 p.drawImage(0, h+1, image, 0, h-1, 1, 1);
448 p.drawImage(w+1, h+1, image, w-1, h-1, 1, 1);
449 p.drawImage(1, 0, image, 0, 0, w, 1);
450 p.drawImage(1, h+1, image, 0, h-1, w, 1);
451 p.drawImage(0, 1, image, 0, 0, 1, h);
452 p.drawImage(w+1, 1, image, w-1, 0, 1, h);
453 p.drawImage(1, 1, image);
454 p.end();
456 return retv;
/*
 * A horizontal run of pixels used by the flood fills: row y, columns
 * x1 (inclusive) to x2 (exclusive).
 */
struct Line {
  int y;
  int x1;
  int x2;

  // Leaves the members uninitialised.
  Line() {}

  Line(int row, int first, int end)
    : y(row)
    , x1(first)
    , x2(end)
  {}
};
466 void floodFill(QImage& image, QPoint point, QColor color,
467 bool invade_border, std::vector<QPoint>* border) {
469 int* ptr = (int*)image.bits();
470 int h = image.height();
471 int w = image.width();
472 int newcol = color.rgba();
473 int oldcol = ptr[point.x()+point.y()*w];
474 std::vector<Line> lines;
477 Line l(point.y(), point.x(), point.x()+1);
478 int *scanline = ptr+point.y()*w;
479 scanline[l.x1] = newcol;
480 while(l.x1 > 0 && scanline[l.x1-1] == oldcol)
481 scanline[--l.x1] = newcol;
482 while(l.x2 < w && scanline[l.x2] == oldcol)
483 scanline[l.x2++] = newcol;
484 lines.push_back(l);
487 while(!lines.empty()) {
488 Line ll = lines[lines.size()-1];
489 lines.pop_back();
491 if(ll.x1>0) {
492 if(invade_border)
493 ptr[ll.y*w + ll.x1 - 1] = newcol;
494 if(border)
495 border->push_back(QPoint(ll.x1-1, ll.y));
497 if(ll.x2<w) {
498 if(invade_border)
499 ptr[ll.y*w + ll.x2] = newcol;
500 if(border)
501 border->push_back(QPoint(ll.x2, ll.y));
504 for(int d=-1; d<=1; d+=2)
505 if( (d == -1) ? (ll.y > 0) : (ll.y < h-1) ) {
506 int *scanline = ptr + (ll.y+d)*w;
508 for(int i=ll.x1;i<ll.x2;i++){
509 if(scanline[i]==oldcol) {
510 Line l(ll.y+d, i, i+1);
512 scanline[l.x1] = newcol;
513 while(l.x1 > 0 && scanline[l.x1-1] == oldcol)
514 scanline[--l.x1] = newcol;
515 while(l.x2 < w && scanline[l.x2] == oldcol)
516 scanline[l.x2++] = newcol;
517 lines.push_back(l);
518 i = l.x2;
520 if(i<ll.x2 && scanline[i]!=newcol) {
521 if(invade_border)
522 scanline[i]=newcol;
523 if(border)
524 border->push_back(QPoint(i, ll.y+d));
531 void floodFillBlueThreshold(QImage& image, QPoint point, QColor color, unsigned int thresh,
532 bool invade_border, std::vector<QPoint>* border) {
534 unsigned int* ptr = (unsigned int*)image.bits();
535 int h = image.height();
536 int w = image.width();
537 unsigned int newcol = color.rgba();
538 std::vector<Line> lines;
540 #define TEST(x) ((((x) & 0xff) < thresh) && ((x) != newcol))
542 Line l(point.y(), point.x(), point.x()+1);
543 unsigned int *scanline = ptr+point.y()*w;
544 scanline[l.x1] = newcol;
545 while(l.x1 > 0 && TEST(scanline[l.x1-1]))
546 scanline[--l.x1] = newcol;
547 while(l.x2 < w && TEST(scanline[l.x2]))
548 scanline[l.x2++] = newcol;
549 lines.push_back(l);
552 while(!lines.empty()) {
553 Line ll = lines[lines.size()-1];
554 lines.pop_back();
556 if(ll.x1>0) {
557 if(invade_border)
558 ptr[ll.y*w + ll.x1 - 1] = newcol;
559 if(border)
560 border->push_back(QPoint(ll.x1-1, ll.y));
562 if(ll.x2<w) {
563 if(invade_border)
564 ptr[ll.y*w + ll.x2] = newcol;
565 if(border)
566 border->push_back(QPoint(ll.x2, ll.y));
569 for(int d=-1; d<=1; d+=2)
570 if( (d == -1) ? (ll.y > 0) : (ll.y < h-1) ) {
571 unsigned int *scanline = ptr + (ll.y+d)*w;
573 for(int i=ll.x1;i<ll.x2;i++){
574 if(TEST(scanline[i])) {
575 Line l(ll.y+d, i, i+1);
577 scanline[l.x1] = newcol;
578 while(l.x1 > 0 && TEST(scanline[l.x1-1]))
579 scanline[--l.x1] = newcol;
580 while(l.x2 < w && TEST(scanline[l.x2]))
581 scanline[l.x2++] = newcol;
582 lines.push_back(l);
583 i = l.x2;
585 if(i<ll.x2 && scanline[i]!=newcol) {
586 if(invade_border)
587 scanline[i]=newcol;
588 if(border)
589 border->push_back(QPoint(i, ll.y+d));