// Snapshot of src/imageeffects_sse.cpp from tagua/yd.git
// (blob cdcfd10e289367c439e523788756bd4012a34894, commit "Prepare 1.0 alpha3 release.")
/*
  Copyright (c) 2006 Paolo Capriotti <p.capriotti@gmail.com>
            (c) 2006 Maurizio Monge <maurizio.monge@kdemail.net>

  This program is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.
*/
#include <math.h>
#include <inttypes.h>
#include <xmmintrin.h>
#include <emmintrin.h>
#include <QImage>
17 namespace ImageEffects {
19 union vec4i
21 uint16_t i[8];
22 __m128i v;
25 static inline void blur_sse_near(void* pixels, __m128i& state, __m128i alpha)
27 uint64_t z1 = 0ULL;
28 uint64_t z2 = 0ULL;
29 uint64_t z3 = 0ULL;
31 asm(
32 "movq %[ppix], %[pixels]\n"
33 "punpcklbw %[pixels], %[aux1]\n" // unpack two pixels setting their bytes
34 // as the most significant in the corr. word
35 "psrlw $1, %[aux1]\n" // shift right by 1, i.e. shift the colour
36 // bytes left by 7
37 "psubw %[state], %[aux1]\n" // - state
38 "pmulhw %[alpha], %[aux1]\n" // * alpha, and take the 16 most significant bits
39 "psllw $1, %[aux1]\n" // shift left (we trade 1 bit for performance, here)
40 "paddw %[aux1], %[state]\n" // sum result to state
41 "movdqa %[state], %[aux2]\n" // copy state to the aux2 register
42 "psrlw $7, %[aux2]\n" // shift right by 7: this is the new pixel value
43 "packuswb %[aux2], %[aux2]\n" // pack pixels as 8 bits
44 "movq %[aux2], %[ppix]\n"
45 : [state] "+x"(state)
46 , [ppix] "+m"(*(uint64_t*)pixels)
47 , [aux1] "+x"(z1)
48 , [aux2] "+x"(z2)
49 , [pixels] "+x"(z3)
50 : [alpha] "x"(alpha)
54 static inline void blur_sse_sep(void* pixel1, void* pixel2, __m128i& state, __m128i alpha)
56 uint64_t z1 = 0ULL;
57 uint64_t z2 = 0ULL;
58 uint64_t z3 = 0ULL;
59 uint64_t z4 = 0ULL;
61 asm(
62 "movd %[ppix1], %[pixels]\n" // load the first pixel
63 "movd %[ppix2], %[tmp]\n" // load the second pixel in [tmp]
64 "pslldq $4, %[tmp]\n" // shift left the second pixel
65 "paddd %[tmp], %[pixels]\n" // now both pixel are packed in [pixels]
67 "punpcklbw %[pixels], %[aux1]\n" // unpack two pixels setting their bytes
68 // as the most significant in the corr. word
69 "psrlw $1, %[aux1]\n" // shift right by 1, i.e. shift the colour
70 // bytes left by 7
71 "psubw %[state], %[aux1]\n" // - state
72 "pmulhw %[alpha], %[aux1]\n" // * alpha, and take the 16 most significant bits
73 "psllw $1, %[aux1]\n" // shift left (we trade 1 bit for performance, here)
74 "paddw %[aux1], %[state]\n" // sum result to state
75 "movdqa %[state], %[aux2]\n" // copy state to the aux2 register
76 "psrlw $7, %[aux2]\n" // shift right by 7: this is the new pixel value
77 "packuswb %[aux2], %[aux2]\n" // pack pixels as 8 bits
79 "movd %[aux2], %[ppix1]\n"
80 "psrldq $4, %[aux2]\n"
81 "movd %[aux2], %[ppix2]\n"
82 : [state] "+x"(state)
83 , [ppix1] "+m"(*(uint32_t*)pixel1)
84 , [ppix2] "+m"(*(uint32_t*)pixel2)
85 , [aux1] "+x"(z1)
86 , [aux2] "+x"(z2)
87 , [tmp] "+x"(z3)
88 , [pixels] "+x"(z4)
89 : [alpha] "x"(alpha)
93 void expblur_sse( QImage &img, int radius )
95 if(radius<1)
96 return;
98 /* Calculate the alpha such that 90% of
99 the kernel is within the radius.
100 (Kernel extends to infinity)
102 uint16_t alpha = (uint16_t)((1<<15)*(1.0f-expf(-2.3f/(radius+1.f))));
104 vec4i a;
105 QRgb *ptr = (QRgb *)img.bits();
106 int h = img.height();
107 int w = img.width();
108 int hw = (img.height()-1)*img.width();
109 for(int i=0;i<8;i++)
110 a.i[i] = alpha;
112 for(int row=0;row<h-1;row+=2)
114 vec4i z;
115 uint8_t *cptr = (uint8_t*)(ptr+row*w);
116 for(int i=0;i<4;i++)
117 z.i[i] = cptr[i]<<7;
118 for(int i=0;i<4;i++)
119 z.i[4+i] = cptr[w*4+i]<<7;
121 for(int index=1; index<w; index++)
122 blur_sse_sep(&cptr[index*4], &cptr[(index+w)*4], z.v, a.v);
124 for(int index=w-2; index>=0; index--)
125 blur_sse_sep(&cptr[index*4], &cptr[(index+w)*4] , z.v, a.v);
128 if(h & 1)
130 vec4i z;
131 int dummy;
132 uint8_t *cptr = (uint8_t*)(ptr+(h-1)*w);
133 for(int i=0;i<4;i++)
134 z.i[i] = cptr[i]<<7;
136 for(int index=1; index<w; index++)
137 blur_sse_sep(&cptr[index*4], &dummy, z.v, a.v);
139 for(int index=w-2; index>=0; index--)
140 blur_sse_sep(&cptr[index*4], &dummy, z.v, a.v);
143 for(int col=0;col<w-1;col+=2)
145 vec4i z;
146 uint8_t *cptr = (uint8_t*)(ptr+col);
148 for(int i=0;i<8;i++)
149 z.i[i] = cptr[i]<<7;
151 for(int index=w; index<hw; index+=w)
152 blur_sse_near(&cptr[index*4], z.v, a.v);
154 for(int index=hw-2*w; index>=0; index-=w)
155 blur_sse_near(&cptr[index*4], z.v, a.v);
158 if(w & 1)
160 vec4i z;
161 int dummy;
162 uint8_t *cptr = (uint8_t*)(ptr+w-1);
164 for(int i=0;i<4;i++)
165 z.i[i] = cptr[i]<<7;
167 for(int index=w; index<hw; index+=w)
168 blur_sse_sep(&cptr[index*4], &dummy, z.v, a.v);
170 for(int index=hw-w; index>=0; index-=w)
171 blur_sse_sep(&cptr[index*4], &dummy, z.v, a.v);
174 return;
177 } //end namespace ImageEffects