/*
 * Simple Framebuffer Gfx/GUI lib
 *
 * coded by Ketmar // Invisible Vector <ketmar@ketmar.no-ip.org>
 * Understanding is not required. Only obedience.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, version 3 of the License ONLY.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
module iv.egra.gfx.lowlevel /*is aliced*/;
private:
// uncomment this to disable SSE4.1 optimisations
//version = egfx_disable_sse41;

version(egfx_disable_sse41) {
  version(egfx_use_sse41) {
    static assert(false, "EGRA: SSE4.1 is both forced and disabled. wtf?!");
  }
} else {
  version(D_InlineAsm_X86) {
    version(X86) {
      version = egfx_use_sse41;
    } else {
      version(egfx_use_sse41) {
        static assert(false, "EGRA: SSE4.1 is not supported on 64-bit architectures.");
      }
    }
  } else {
    version(egfx_use_sse41) {
      static assert(false, "EGRA: SSE4.1 is not supported on non-DMD compilers.");
    }
  }
}

version(egfx_use_sse41) {
  public enum EGfxUseSSE41 = true;
} else {
  public enum EGfxUseSSE41 = false;
}
// ////////////////////////////////////////////////////////////////////////// //
public void egfxCheckCPU () nothrow @trusted @nogc {
  version(egfx_use_sse41) {
    import core.cpuid : sse41;
    if (!sse41) {
      import core.stdc.stdio : stderr, fprintf;
      fprintf(stderr, "ERROR: EGRA requires CPU with SSE4.1 support!\n");
      assert(0, "ERROR: EGRA requires CPU with SSE4.1 support!");
    }
  }
}
// ////////////////////////////////////////////////////////////////////////// //
// mix `dcvar` with ARGB (or ABGR) `colvar`; dc A is ignored (set to 255)
// main code almost never calls this with solid or transparent `colvar`
// the result will be put into `destvar` (it is written only once, at the end)
// `colvar` and `dcvar` may be read several times
// see http://stereopsis.com/doubleblend.html for the inspiration
version(none) {
// this works for solid and transparent colors too
public enum GxColMixMixin(string destvar, string dcvar, string colvar) = `{
  immutable uint col_ = `~colvar~`;
  immutable uint dc_ = (`~dcvar~`)&0xffffffu;
  /*immutable uint a_ = 256-(col_>>24);*/ /* to not lose bits */
  immutable uint a_ = (col_>>24)+1; /* so it will work for both 0 and 255 correctly */
  immutable uint srb_ = (col_&0xff00ffu);
  immutable uint sg_ = (col_&0x00ff00u);
  immutable uint drb_ = (dc_&0xff00ffu);
  immutable uint dg_ = (dc_&0x00ff00u);
  immutable uint orb_ = (drb_+(((srb_-drb_)*a_+0x800080u)>>8))&0xff00ffu;
  immutable uint og_ = (dg_+(((sg_-dg_)*a_+0x008000u)>>8))&0x00ff00u;
  (`~destvar~`) = orb_|og_|0xff_00_00_00u;
}`;
} else {
// this works for solid and transparent colors too
public enum GxColMixMixin(string destvar, string dcvar, string colvar) = `{
  immutable uint a_ = ((`~colvar~`)>>24)+1u; /* to not lose bits */
  uint rb_ = (`~dcvar~`)&0xff00ffu;
  uint g_ = (`~dcvar~`)&0x00ff00u;
  rb_ += ((cast(uint)((`~colvar~`)&0xff00ffu)-rb_)*a_)>>8;
  g_ += ((cast(uint)((`~colvar~`)&0x00ff00u)-g_)*a_)>>8;
  /* g is mixed with solid alpha; replace "0xff_" with other alpha if you want to */
  (`~destvar~`) = (rb_&0xff00ffu)|(g_&0xff_00ff00u)|0xff_00_00_00u;
}`;
}
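
// minimal usage sketch for the mixin (this mirrors how the non-SSE `memBlendColor`
// below uses it); the concrete values are only an illustration, the asserts check
// properties that hold for any half-transparent color
unittest {
  uint px = 0xFF_10_20_30u; // background pixel
  uint* mptr = &px;
  immutable uint clr = 0x80_ff_ff_ffu; // roughly half-transparent white
  mixin(GxColMixMixin!("*mptr++", "*mptr", "clr"));
  assert(mptr == &px+1); // `destvar` advanced the pointer
  assert((px>>24) == 0xff); // destination alpha is forced to "solid"
  // every channel moved towards white, and none overshot a byte
  assert(((px>>16)&0xff) >= 0x10 && ((px>>8)&0xff) >= 0x20 && (px&0xff) >= 0x30);
}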
// t is [0..1]
public int gxInterpolateColorF (in uint c0, in uint c1, in float t) pure nothrow @safe @nogc {
  import iv.bclamp;
  import iv.egra.gfx.base;

  if (t <= 0.0f) return c0;
  if (t >= 1.0f) return c1;

  static ubyte interpByte (in ubyte b0, in ubyte b1, in float t) pure nothrow @safe @nogc {
    pragma(inline, true);
    return (b0 == b1 ? b0 : clampToByte(b0+cast(int)((cast(int)b1-cast(int)b0)*t)));
  }

  immutable ubyte r = interpByte(gxGetRed(c0), gxGetRed(c1), t);
  immutable ubyte g = interpByte(gxGetGreen(c0), gxGetGreen(c1), t);
  immutable ubyte b = interpByte(gxGetBlue(c0), gxGetBlue(c1), t);
  immutable ubyte a = interpByte(gxGetAlpha(c0), gxGetAlpha(c1), t);
  return (a<<24)|(r<<16)|(g<<8)|b;
}
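
// quick sanity sketch: out-of-range `t` returns the endpoints verbatim, and the
// halfway blue value lands in the middle (allowing for rounding in the float path)
unittest {
  assert(cast(uint)gxInterpolateColorF(0xFF_00_00_00u, 0xFF_00_00_FFu, -1.0f) == 0xFF_00_00_00u);
  assert(cast(uint)gxInterpolateColorF(0xFF_00_00_00u, 0xFF_00_00_FFu, 2.0f) == 0xFF_00_00_FFu);
  immutable int mid = gxInterpolateColorF(0xFF_00_00_00u, 0xFF_00_00_FFu, 0.5f);
  assert((mid&0xff) >= 127 && (mid&0xff) <= 128);
  assert((mid&0xff_ff_ff_00u) == 0xFF_00_00_00u);
}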
// t is [0..65535]
public int gxInterpolateColorI (in uint c0, in uint c1, in int t) pure nothrow @safe @nogc {
  if (t <= 0) return c0;
  if (t >= 65535) return c1;
  if (c0 == c1) return c0;

  version(all) {
    immutable uint a_ = cast(uint)(t>>8)+1; // to not lose bits
    uint rb_ = c0&0xff00ffu;
    uint g_ = c0&0x00ff00u;
    rb_ += (((c1&0xff00ffu)-rb_)*a_)>>8;
    g_ += (((c1&0x00ff00u)-g_)*a_)>>8;
    /* g is mixed with solid alpha; replace "0xff_" with other alpha if you want to */
    immutable uint res = (rb_&0xff00ffu)|(g_&0x00ff00u);
    // now mix alpha
    immutable int a0 = (c0>>24);
    immutable int a1 = (c1>>24);
    // same alpha?
    if (a0 == a1) return res|(c0&0xff000000u);
    // mix both alphas
    return res|(((((a1-a0)*(t+1))>>16)+a0)<<24);
  } else {
    //return gxInterpolateColorF(c0, c1, cast(float)t/65535.0);
    int b0 = cast(int)cast(ubyte)c0;
    int b1 = cast(int)cast(ubyte)c1;
    if (b0 != b1) b0 = cast(ubyte)((((b1-b0)*(t+1))>>16)+b0);

    int g0 = cast(int)cast(ubyte)(c0>>8);
    int g1 = cast(int)cast(ubyte)(c1>>8);
    if (g0 != g1) g0 = cast(ubyte)((((g1-g0)*(t+1))>>16)+g0);

    int r0 = cast(int)cast(ubyte)(c0>>16);
    int r1 = cast(int)cast(ubyte)(c1>>16);
    if (r0 != r1) r0 = cast(ubyte)((((r1-r0)*(t+1))>>16)+r0);

    int a0 = cast(int)cast(ubyte)(c0>>24);
    int a1 = cast(int)cast(ubyte)(c1>>24);
    if (a0 != a1) a0 = cast(ubyte)((((a1-a0)*(t+1))>>16)+a0);

    return cast(uint)((a0<<24)|(r0<<16)|(g0<<8)|b0);
  }
}
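
// integer variant sanity sketch: the endpoints pass through, and the halfway point
// is close to the middle (the integer path only keeps 8 bits of `t` for the channels)
unittest {
  assert(cast(uint)gxInterpolateColorI(0xFF_00_00_00u, 0xFF_00_00_FFu, 0) == 0xFF_00_00_00u);
  assert(cast(uint)gxInterpolateColorI(0xFF_00_00_00u, 0xFF_00_00_FFu, 65535) == 0xFF_00_00_FFu);
  immutable int mid = gxInterpolateColorI(0xFF_00_00_00u, 0xFF_00_00_FFu, 32768);
  assert((mid&0xff) >= 127 && (mid&0xff) <= 129);
  assert((mid&0xff_ff_ff_00u) == 0xFF_00_00_00u);
}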
// ////////////////////////////////////////////////////////////////////////// //
// size is in dwords
version(egfx_use_sse41) {
//pragma(msg," !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ");

align(16) immutable ubyte[16] sseSpreadOneColor = [
  0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03,
  0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03,
];
// for x86 naked functions, DMD will pass the last arg in EAX
// sadly, with -O DMD makes some assumptions about dead registers, and nothing works
// right unless we preserve ESI and EDI (and EBX in case of PIC code) ourselves
public uint* memFillDW (uint* mptr, in uint value, in int count) nothrow @trusted @nogc {
  asm nothrow @trusted @nogc {
    naked;
    xchg EDI,/*SS:*/[ESP+8]; // EDI=mptr; also, save old EDI
    cmp EAX,0;
    jle done;

    mov ECX,EAX; // ECX=count (because the last arg is in EAX)
    mov EAX,/*SS:*/[ESP+4]; // EAX=value

    cmp ECX,8;
    jc simplestore; // too small

    // load XMM0 with our color
    push EAX;
    push EAX;
    // `movdqu` could be used here, because it indicates the int type;
    // this doesn't matter, it just looks nicer
    // `movlps` does the same job and is one byte shorter
    movlps XMM0,/*SS:*/[ESP];
    movlhps XMM0,XMM0; // copy low 64 bits of XMM0 to high 64 bits of XMM0
    //movdqu XMM0,/*SS:*/[ESP];
    //pshufb XMM0,[sseSpreadOneColor];
    add ESP,8;

    // if we cannot align at all, use "rep stosd"
    // this should not happen, so i won't bother optimising it
    test EDI,0x03;
    jnz simplestore;

    // align EDI (we have at least 8 pixels to fill here, so it is safe)
  alignloop:
    test EDI,0x0f;
    jz alignok;
    stosd;
    dec ECX;
    jmp alignloop;

  alignok:
    // ECX is never zero here
    cmp ECX,4;
    jc simplestore; // too small

    // save last 2 bits of counter (we'll mask them later)
    movzx EDX,CL;

    // fill by 4 pixels while we can
    shr ECX,2;
    //align 16; // why not
  alignfill:
    movaps [EDI],XMM0;
    add EDI,16;
    dec ECX;
    jnz alignfill;

    // fill last 1-3 pixels
    mov ECX,EDX;
    and CL,0x03;
    jz done;

  simplestore:
    rep; stosd;

  done:
    mov EAX,EDI; // return new mptr
    mov EDI,/*SS:*/[ESP+8]; // restore EDI
    ret 4*2;
  }
}
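
// usage sketch: fill a (possibly unaligned) span of dwords and get back the advanced
// pointer; the odd offset and length are chosen to exercise the alignment paths
unittest {
  uint[40] buf = 0;
  uint* end = memFillDW(buf.ptr+1, 0xdead_f00du, 37);
  assert(end == buf.ptr+1+37);
  assert(buf[0] == 0 && buf[38] == 0 && buf[39] == 0); // neighbours untouched
  foreach (immutable i; 1..38) assert(buf[i] == 0xdead_f00du);
}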
// WARNING! this function is not quite right (0 and 255 alphas will still modify the colors)
// WARNING! do not call it with fully opaque or fully transparent `clr`!
public alias memBlendColor = sseBlendColor;

public uint* memBlendColor (uint* mptr, in uint clr, int count) nothrow @trusted @nogc {
  pragma(inline, true);
  version(all) {
    if (count < 1) return mptr;
    immutable int c4 = (count>>2); // it is actually unsigned
    if (c4) { mptr = sseBlendColor4px(mptr, clr, cast(uint)c4); count -= (c4<<2); }
    return (count ? memBlendColorSlow(mptr, clr, count) : mptr);
  } else {
    return memBlendColorSlow(mptr, clr, count);
  }
}
align(16) immutable ubyte[16] sseSpreadAlpha = [
  0xff, 0x03, 0xff, 0x03, 0xff, 0x03, 0xff, 0xff,
  0xff, 0x03, 0xff, 0x03, 0xff, 0x03, 0xff, 0xff,
];

align(16) immutable ubyte[16] sseMaxAlpha = [
  0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0x00,
  0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0x00,
];

align(16) immutable ubyte[16] sseFullByteAlpha = [
  0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff,
  0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff,
];
// mix foreground into background
// EAX is the pixel count
// background = (alpha * foreground) + (1-alpha)*background
// WARNING! this function is not quite right (0 and 255 alphas will still modify the colors)
// WARNING! do not call it with fully opaque or fully transparent `clr`!
public uint* sseBlendColor (uint* dest, uint clr, uint count) nothrow @trusted @nogc {
  asm nothrow @trusted @nogc {
    naked;
    //enter 0,0; // this is actually slower than the byte soup below
    push EBP;
    mov EBP,ESP;
    // save modified registers
    push EDI;

    mov EDI,[EBP+12]; // dest
    // it can be negative
    cmp EAX,0;
    jle done;
    mov ECX,EAX; // counter

    // EAX: count
    // [EBP+8]: clr
    // [EBP+12]: dest

    // align stack
    sub ESP,16;
    and ESP,0xfffffff0u;

    mov EAX,[EBP+8]; // clr

    // we can premultiply clr first, and convert alpha to 255-alpha

    // prepare SSE data -- 2 pixels
    mov /*SS:*/[ESP],EAX;
    mov /*SS:*/[ESP+4],EAX;

    // `movdqa` could be used here, because it indicates the int type;
    // this doesn't matter, it just looks nicer
    // `movlps` does the same job and is one byte shorter
    movlps XMM0,/*SS:*/[ESP];
    //movdqa XMM0,/*SS:*/[ESP];
    // expand 8 ubytes to 8 ushorts
    pmovzxbw XMM1,XMM0;
    // XMM0: xx xx xx xx ar gb ar gb
    // XMM1: 0a 0r 0g 0b 0a 0r 0g 0b
    pshufb XMM0,[sseSpreadAlpha];
    // XMM0: 00 0a 0a 0a 00 0a 0a 0a
    movdqa XMM7,[sseMaxAlpha];
    psubw XMM7,XMM0; // XMM7 is 255-alpha
    // XMM7: 00 0a 0a 0a 00 0a 0a 0a
    pmulhuw XMM0,XMM1;
    // XMM0: 00 0r 0g 0b 00 0r 0g 0b
    movdqa XMM6,[sseFullByteAlpha];

    //XMM0: 2 premultiplied colors
    //XMM7: 2 inverted alphas
    //XMM6: destination alpha (replace value)
    // totally unaligned?
    // this should never happen, but meh...
    test EDI,0x03;
    jnz slowestpath; // alas, the slowest path

    // align the address (if necessary)
    test EDI,0x0f;
    jz trymix8aligned;

    // we need to mix 1-3 pixels to make the address aligned
    // check counter here to allow the "slow, but aligned" path (see the code below)
    cmp ECX,4;
    jc slowestpath; // alas

    // process 4 pixels (we will drop unused ones)
    movdqu XMM5,[EDI]; // 4 background pixels
    pmovzxbw XMM1,XMM5; // expand 2 lower pixels to XMM1
    // copy high part of XMM5 to low part of XMM5
    movhlps XMM5,XMM5;
    pmovzxbw XMM2,XMM5; // expand 2 upper pixels to XMM2
    //XMM1: 2 lower pixels
    //XMM2: 2 upper pixels

    pmulhuw XMM1,XMM7; // multiply by alpha
    pmulhuw XMM2,XMM7; // multiply by alpha

    paddusw XMM1,XMM0; // add premultiplied colors
    paddusw XMM2,XMM0; // add premultiplied colors

    packuswb XMM1,XMM2;

    // set destination alpha
    por XMM1,XMM6;

    // now write 1-3 pixels to align the address
    // we are guaranteed to have at least 4 pixels to mix here
    // i.e. 4 processed pixels, and at least 4 pixels in the counter

    // put them in temp storage (it is aligned)
    movdqa /*SS:*/[ESP],XMM1;
    mov EDX,ESI; // save ESI (DMD expects it unchanged)
    lea ESI,[ESP];
  uastoreloop:
    movsd;
    dec ECX;
    test EDI,0x0f;
    jnz uastoreloop;
    mov ESI,EDX; // restore ESI
    // ECX is at least 1 here, and EDI is aligned
  trymix8aligned:
    // ECX is never zero here
    // use the "slow, but aligned" path if we have less than 8 pixels to process
    cmp ECX,8;
    jc slowalignedpath;

    // save last 3 bits in EAX
    // we'll mask them later
    movzx EAX,CL;

    // process by 8 pixels while we can
    shr ECX,3;

  mix8aligned:
    movdqa XMM5,[EDI]; // 4 background pixels
    pmovzxbw XMM1,XMM5; // expand 2 lower pixels to XMM1
    // copy high part of XMM5 to low part of XMM5
    movhlps XMM5,XMM5;
    pmovzxbw XMM2,XMM5; // expand 2 upper pixels to XMM2
    //XMM1: 2 lower pixels
    //XMM2: 2 upper pixels

    movdqa XMM5,[EDI+16]; // 4 background pixels
    pmovzxbw XMM3,XMM5; // expand 2 lower pixels to XMM3
    // copy high part of XMM5 to low part of XMM5
    movhlps XMM5,XMM5;
    pmovzxbw XMM4,XMM5; // expand 2 upper pixels to XMM4
    //XMM3: 2 lower pixels
    //XMM4: 2 upper pixels

    pmulhuw XMM1,XMM7; // multiply by alpha
    pmulhuw XMM2,XMM7; // multiply by alpha
    pmulhuw XMM3,XMM7; // multiply by alpha
    pmulhuw XMM4,XMM7; // multiply by alpha

    paddusw XMM1,XMM0; // add premultiplied colors
    paddusw XMM2,XMM0; // add premultiplied colors
    paddusw XMM3,XMM0; // add premultiplied colors
    paddusw XMM4,XMM0; // add premultiplied colors

    packuswb XMM1,XMM2;
    packuswb XMM3,XMM4;

    // set destination alpha
    por XMM1,XMM6;
    por XMM3,XMM6;

    movdqa [EDI],XMM1;
    movdqa [EDI+16],XMM3;

    add EDI,32;
    dec ECX;
    jnz mix8aligned;

    // do the last 1-7 pixels (the leftover count is in EAX)
    // EDI is guaranteed to be aligned here
    mov ECX,EAX;
    and CL,0x07;
    jnz slowalignedpath;

    // we're done
    mov EAX,EDI;
    mov EDI,[EBP-4]; // restore EDI
    //leave; // this is actually slower than the byte soup below
    mov ESP,EBP;
    pop EBP;
    ret 4*2;
  align 16;
    // mix by 4 pixels, unaligned
  slowestpath:
    // mix 4 pixels
    movdqu XMM5,[EDI]; // 4 background pixels
    pmovzxbw XMM1,XMM5; // expand 2 lower pixels to XMM1
    // copy high part of XMM5 to low part of XMM5
    movhlps XMM5,XMM5;
    pmovzxbw XMM2,XMM5; // expand 2 upper pixels to XMM2
    //XMM1: 2 lower pixels
    //XMM2: 2 upper pixels

    pmulhuw XMM1,XMM7; // multiply by alpha
    pmulhuw XMM2,XMM7; // multiply by alpha

    paddusw XMM1,XMM0; // add premultiplied colors
    paddusw XMM2,XMM0; // add premultiplied colors

    packuswb XMM1,XMM2;

    // set destination alpha
    por XMM1,XMM6;

    sub ECX,4;
    jc slowestlast;

    movdqu [EDI],XMM1;
    add EDI,16;
    jecxz done;
    jmp slowestpath;

    // last 1-3 pixels (never 0)
  slowestlast:
    // put them in temp storage (it is aligned)
    movdqa /*SS:*/[ESP],XMM1;
    mov EDX,ESI; // save ESI (DMD expects it unchanged)
    lea ESI,[ESP];
    and ECX,0x03; // leftover counter
    rep; movsd;
    mov ESI,EDX; // restore ESI
    jmp done;

  done:
    mov EAX,EDI;
    mov EDI,[EBP-4]; // restore EDI
    //leave; // this is actually slower than the byte soup below
    mov ESP,EBP;
    pop EBP;
    ret 4*2;
  align 16;
    // mix by 4 pixels, aligned (used for 1-7 pixels)
  slowalignedpath:
    // mix 4 pixels
    movdqa XMM5,[EDI]; // 4 background pixels
    pmovzxbw XMM1,XMM5; // expand 2 lower pixels to XMM1
    // copy high part of XMM5 to low part of XMM5
    movhlps XMM5,XMM5;
    pmovzxbw XMM2,XMM5; // expand 2 upper pixels to XMM2
    //XMM1: 2 lower pixels
    //XMM2: 2 upper pixels

    pmulhuw XMM1,XMM7; // multiply by alpha
    pmulhuw XMM2,XMM7; // multiply by alpha

    paddusw XMM1,XMM0; // add premultiplied colors
    paddusw XMM2,XMM0; // add premultiplied colors

    packuswb XMM1,XMM2;

    // set destination alpha
    por XMM1,XMM6;

    sub ECX,4;
    jc slowestlast;

    movdqa [EDI],XMM1;
    add EDI,16;
    jecxz done;
    jmp slowalignedpath;
  }
}
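
// a scalar sketch of what the SSE blend above computes, per my reading of the code:
// every channel becomes ((bg*(255-a))>>8) + ((fg*a)>>8) and the destination alpha is
// forced to 0xff; the self-test only compares the asm against this model with a little
// slack, so it is an illustration rather than an exact specification
unittest {
  static uint modelBlend (in uint bg, in uint clr) pure nothrow @safe @nogc {
    immutable uint a = clr>>24;
    uint res = 0xff_00_00_00u;
    foreach (immutable i; 0..3) {
      immutable int shift = i*8;
      immutable uint b = (bg>>shift)&0xff;
      immutable uint f = (clr>>shift)&0xff;
      res |= (((b*(255u-a))>>8)+((f*a)>>8))<<shift;
    }
    return res;
  }
  uint[16] buf = 0xFF_10_80_F0u;
  immutable uint clr = 0x80_20_40_60u; // roughly half-transparent color
  uint* end = sseBlendColor(buf.ptr, clr, cast(uint)buf.length);
  assert(end == buf.ptr+buf.length);
  immutable uint want = modelBlend(0xFF_10_80_F0u, clr);
  foreach (immutable px; buf) {
    assert((px>>24) == 0xff); // destination alpha is forced to solid
    foreach (immutable i; 0..3) {
      immutable int shift = i*8;
      immutable int d = cast(int)((px>>shift)&0xff)-cast(int)((want>>shift)&0xff);
      assert(d >= -1 && d <= 1); // leave slack for truncation details
    }
  }
}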
// for x86 naked functions, DMD will pass the last arg in EAX
// sadly, with -O DMD makes some assumptions about dead registers, and nothing works
// right unless we preserve ESI and EDI (and EBX in case of PIC code) ourselves
// this doesn't change every 2nd pixel; `count` is the count of ALL pixels
public uint* memFillDWDash (uint* mptr, in uint value, in int count) nothrow @trusted @nogc {
  asm nothrow @trusted @nogc {
    naked;
    xchg EDI,/*SS:*/[ESP+8]; // EDI=mptr; also, save old EDI
    // it can be negative
    cmp EAX,1;
    jl quit; // 0 or less
    // another jump is done later, after the setup
    mov ECX,EAX; // ECX=count (because the last arg is in EAX)
    mov EAX,/*SS:*/[ESP+4]; // EAX=value
    // if we are only filling one pixel, just do it
    je onepixel;

    // ECX is always >=2 here, and we are actually processing 2 pixels at a time anyway
    mov DL,CL; // save the last bit for later use (we may need to set the last pixel)
    shr ECX,1;

  storeloop:
    mov [EDI],EAX;
    add EDI,8;
    dec ECX;
    jnz storeloop;

    // set the last pixel
    test DL,1;
    jz quit;
  onepixel:
    stosd;

  quit:
    mov EAX,EDI; // return new mptr
    mov EDI,/*SS:*/[ESP+8]; // restore EDI
    ret 8;
  }
}
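
// usage sketch: the "dashed" fill touches pixels 0, 2, 4, ... and leaves the odd ones
// alone, but the returned pointer is still advanced past ALL `count` pixels
unittest {
  uint[7] buf = 0x11111111u;
  uint* end = memFillDWDash(buf.ptr, 0xff00ff00u, cast(int)buf.length);
  assert(end == buf.ptr+buf.length);
  foreach (immutable i, immutable v; buf) {
    assert(v == (i&1 ? 0x11111111u : 0xff00ff00u));
  }
}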
//TODO: rewrite this with SSE
// EAX is `count`
// sadly, with -O DMD makes some assumptions about dead registers, and nothing works
// right unless we preserve ESI and EDI (and EBX in case of PIC code) ourselves
// this doesn't change every 2nd pixel; `count` is the count of ALL pixels
public uint* memBlendColorDash (uint* mptr, in uint clr, in int count) nothrow @trusted @nogc {
  asm nothrow @trusted @nogc {
    naked;

    xchg EDI,/*SS:*/[ESP+8]; // EDI=mptr; also, save old EDI
    cmp EAX,0;
    jle quit;

    push EBP; // EBP will contain the counter
    push EBX; // EBX is a temporary register
    push ESI; // DMD expects ESI to be unmodified at exit
    mov EBP,EAX; // EBP=counter

    mov EAX,/*SS:*/[ESP+16]; // EAX=clr
    mov ECX,EAX; // ECX will be clrA
    // clrG=clr&0x00ff00u;
    and EAX,0x00ff00u;
    push EAX;
    // clrRB=clr&0xff00ffu;
    mov EAX,ECX;
    and EAX,0xff00ffu;
    push EAX;
    // ECX=clrA=(clr>>24)+1; -- `+1` to keep some precision
    shr ECX,24;
    inc ECX;

    // [ESP+0]: clrRB
    // [ESP+4]: clrG
    // ESI
    // EBX
    // EBP
    // ret addr
    // clr
    // mptr
    // EBP=counter
    // EDI=mptr
    // ECX=clrA

  align 16; // why not

    /*
      clrA = (clr>>24)+1;
      clrRB = clr&0xff00ffu;
      clrG = clr&0x00ff00u;

      rb = (*mptr)&0xff00ffu;
      rb += ((clrRB-rb)*clrA)>>8;
      rb &= 0xff00ffu;

      g = (*mptr)&0x00ff00u;
      g += ((clrG-g)*clrA)>>8;
      g &= 0x00ff00u;

      *mptr++ = rb|g|0xff000000u;
    */
  mixloop:
    // rb = (*mptr)&0xff00ffu;
    // rb += (((clrRB-rb)*clrA)>>8)&0xff00ffu;
    mov EBX,[EDI];
    mov ESI,EBX; // save `*mptr`
    and EBX,0xff00ffu; // EBX=rb=(*mptr)&0xff00ffu
    mov EAX,/*SS:*/[ESP]; // EAX=clrRB
    sub EAX,EBX; // EAX=clrRB-rb
    mul ECX; // EAX=(clrRB-rb)*clrA (EDX is dead)
    shr EAX,8; // EAX=((clrRB-rb)*clrA)>>8
    add EBX,EAX; // EBX=rb+(((clrRB-rb)*clrA)>>8)
    and EBX,0xff00ffu; // EBX=(rb+(((clrRB-rb)*clrA)>>8))&0xff00ffu

    // g = (*mptr)&0x00ff00u;
    // g += (((clrG-g)*clrA)>>8)&0x00ff00u;
    mov EDX,ESI; // EDX=*mptr
    and EDX,0x00ff00u; // EDX=g=(*mptr)&0x00ff00u
    mov ESI,EDX; // save g, we will need it later
    mov EAX,/*SS:*/[ESP+4]; // EAX=clrG
    sub EAX,EDX; // EAX=clrG-g
    mul ECX; // EAX=(clrG-g)*clrA (EDX is dead)
    shr EAX,8; // EAX=((clrG-g)*clrA)>>8
    add EAX,ESI; // EAX=(((clrG-g)*clrA)>>8)+g
    and EAX,0x00ff00u; // EAX=((((clrG-g)*clrA)>>8)+g)&0x00ff00u

    // mix
    or EAX,EBX;
    or EAX,0xff000000u;

    stosd;
    dec EBP;
    jz mixdone;
    add EDI,4;
    dec EBP;
    jnz mixloop;

  mixdone:
    add ESP,2*4; // drop temp vars
    // restore registers
    pop ESI;
    pop EBX;
    pop EBP;

  quit:
    mov EAX,EDI; // result
    mov EDI,/*SS:*/[ESP+8]; // restore EDI
    ret 8;
  }
}
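
// usage sketch: the dashed blend touches pixels 0, 2, 4, ... only; the untouched odd
// pixels keep their original value (alpha byte included), the blended ones get a solid alpha
unittest {
  uint[6] buf = 0x40_10_20_30u;
  uint* end = memBlendColorDash(buf.ptr, 0x80_ff_ff_ffu, cast(int)buf.length);
  assert(end == buf.ptr+buf.length);
  foreach (immutable i, immutable v; buf) {
    if (i&1) assert(v == 0x40_10_20_30u); // untouched
    else assert((v>>24) == 0xff); // blended, alpha forced to solid
  }
}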
// EAX is `count`
// sadly, with -O DMD makes some assumptions about dead registers, and nothing works
// right unless we preserve ESI and EDI (and EBX in case of PIC code) ourselves
// this is using a branch for empty/opaque alphas; i didn't profile it, but i think it is faster than 3 muls
public void memBlendColorCoverage (uint* mptr, const(ubyte)* coverage, in uint clr, in int count) nothrow @trusted @nogc {
  asm nothrow @trusted @nogc {
    naked;
    //enter 0,0; // this is actually slower than the byte soup below
    push EBP;
    mov EBP,ESP;
    // save modified registers
    push EDI;
    push ESI;
    push EBX;

    mov EDI,[EBP+16]; // dest

    cmp EAX,0;
    jle done;
    mov ECX,EAX;

    mov AL,byte ptr [EBP+11]; // c.a
    // a completely transparent color changes nothing, so we are done (just in case)
    or AL,AL;
    jz done;

    mov ESI,[EBP+12]; // coverage
    // for a fully opaque color we can skip one load and one mul per pixel
    inc AL;
    jz fullyopaque;
  mixloop:
    movzx EAX,byte ptr [ESI]; // load coverage byte
    inc ESI;
    or AL,AL;
    jz mixskip;
    // alpha = (*coverage)*c.a;
    movzx EDX,byte ptr [EBP+11]; // c.a
    inc EDX; // for better precision
    mul EDX;
    // is the resulting alpha completely opaque?
    cmp AX,0xff00; // 256*255 == 0xff00
    jz mixopaque;
    mov EBX,EAX;
    // EBX: alpha

    // cast(ubyte)((((c.b-v)*alpha)>>16)+v);
    movzx EAX,byte ptr [EBP+8]; // c.b
    movzx EDX,byte ptr [EDI]; // v
    sub EAX,EDX;
    // EAX: c.b-v
    mul EBX;
    // EAX: (c.b-v)*alpha
    // EDX: dead
    shr EAX,16;
    add byte ptr [EDI],AL;
    inc EDI;

    movzx EAX,byte ptr [EBP+9]; // c.g
    movzx EDX,byte ptr [EDI]; // v
    sub EAX,EDX;
    // EAX: c.g-v
    mul EBX;
    // EAX: (c.g-v)*alpha
    // EDX: dead
    shr EAX,16;
    add byte ptr [EDI],AL;
    inc EDI;

    movzx EAX,byte ptr [EBP+10]; // c.r
    movzx EDX,byte ptr [EDI]; // v
    sub EAX,EDX;
    // EAX: c.r-v
    mul EBX;
    // EAX: (c.r-v)*alpha
    // EDX: dead
    shr EAX,16;
    add byte ptr [EDI],AL;
    inc EDI;

    mov byte ptr [EDI],0xff;
    inc EDI;
  mixnext:
    dec ECX;
    jnz mixloop;

  done:
    pop EBX;
    pop ESI;
    pop EDI;
    //leave; // this is actually slower than the byte soup below
    mov ESP,EBP;
    pop EBP;
    ret 4*3;

  mixskip:
    add EDI,4; // skip the destination pixel
    jmp mixnext;
  mixopaque:
    mov EAX,[EBP+8]; // source pixel
    or EAX,0xff000000; // alpha
    stosd; // copy it
    jmp mixnext;

    // the source color is completely opaque
    // use slightly faster code to calculate the coverage alpha
  fullyopaque:
    movzx EAX,byte ptr [ESI]; // load coverage byte
    inc ESI;
    or AL,AL;
    jz mixskipopaque;
    cmp AL,0xff;
    jz mixopaqueopaque;
    shl EAX,8;
    mov EBX,EAX;
    // EBX: alpha

    // cast(ubyte)((((c.b-v)*alpha)>>16)+v);
    movzx EAX,byte ptr [EBP+8]; // c.b
    movzx EDX,byte ptr [EDI]; // v
    sub EAX,EDX;
    // EAX: c.b-v
    mul EBX;
    // EAX: (c.b-v)*alpha
    // EDX: dead
    shr EAX,16;
    add byte ptr [EDI],AL;
    inc EDI;

    movzx EAX,byte ptr [EBP+9]; // c.g
    movzx EDX,byte ptr [EDI]; // v
    sub EAX,EDX;
    // EAX: c.g-v
    mul EBX;
    // EAX: (c.g-v)*alpha
    // EDX: dead
    shr EAX,16;
    add byte ptr [EDI],AL;
    inc EDI;

    movzx EAX,byte ptr [EBP+10]; // c.r
    movzx EDX,byte ptr [EDI]; // v
    sub EAX,EDX;
    // EAX: c.r-v
    mul EBX;
    // EAX: (c.r-v)*alpha
    // EDX: dead
    shr EAX,16;
    add byte ptr [EDI],AL;
    inc EDI;

    mov byte ptr [EDI],0xff;
    inc EDI;
  mixnextopaque:
    dec ECX;
    jnz fullyopaque;
    jmp done;

  mixskipopaque:
    add EDI,4; // skip the destination pixel
    jmp mixnextopaque;

  mixopaqueopaque:
    mov EAX,[EBP+8]; // source pixel
    or EAX,0xff000000; // alpha
    stosd; // copy it
    jmp mixnextopaque;
  }
}
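
// usage sketch: blend one color into a scanline through a coverage mask (the kind of
// mask an AA rasterizer produces); coverage 0 leaves the pixel alone, 255 applies `clr` fully
unittest {
  uint[3] buf = 0xFF_00_00_00u;
  immutable ubyte[3] cov = [0, 128, 255];
  memBlendColorCoverage(buf.ptr, cov.ptr, 0xff_ff_ff_ffu, cast(int)buf.length);
  assert(buf[0] == 0xFF_00_00_00u); // zero coverage: untouched
  assert(buf[2] == 0xff_ff_ff_ffu); // full coverage of an opaque color: replaced
  assert((buf[1]&0xff) > 0 && (buf[1]&0xff) < 0xff); // partial coverage: somewhere in between
  assert((buf[1]>>24) == 0xff);
}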
} else {
// no SSE
public uint* memFillDW (uint* ptr, in uint value, in int count) nothrow @trusted @nogc {
  pragma(inline, true);
  if (count > 0) {
    ptr[0..cast(usize)count] = value;
    ptr += cast(usize)count;
  }
  return ptr;
}

public uint* memFillDWDash (uint* ptr, in uint value, in int count) nothrow @trusted @nogc {
  pragma(inline, true);
  foreach (immutable c; 0..count) { if (!(c&1)) *ptr++ = value; else ++ptr; }
  return ptr;
}

public uint* memBlendColor (uint* mptr, in uint clr, in int count) nothrow @trusted @nogc {
  foreach (immutable _; 0..count) { mixin(GxColMixMixin!("*mptr++", "*mptr", "clr")); }
  return mptr;
}

public uint* memBlendColorDash (uint* mptr, in uint clr, in int count) nothrow @trusted @nogc {
  foreach (immutable c; 0..count) { if (!(c&1)) mixin(GxColMixMixin!("*mptr++", "*mptr", "clr")); else ++mptr; }
  return mptr;
}
public void memBlendColorCoverage (uint* mptr, const(ubyte)* coverage, in uint clr, in int count) nothrow @trusted @nogc {
  immutable uint cb = clr&0xff;
  immutable uint cg = (clr>>8)&0xff;
  immutable uint cr = (clr>>16)&0xff;
  immutable uint ca = clr>>24;
  ubyte* p = cast(ubyte*)mptr;
  foreach (immutable _; 0..count) {
    immutable uint alpha = (*coverage++)*ca;
    version(none) {
      uint v = *p; *p++ = cast(ubyte)((((cb-v)*alpha)+(v<<16))>>16);
      v = *p; *p++ = cast(ubyte)((((cg-v)*alpha)+(v<<16))>>16);
      v = *p; *p++ = cast(ubyte)((((cr-v)*alpha)+(v<<16))>>16);
    } else {
      uint v = *p; *p++ = cast(ubyte)((((cb-v)*alpha)>>16)+v);
      v = *p; *p++ = cast(ubyte)((((cg-v)*alpha)>>16)+v);
      v = *p; *p++ = cast(ubyte)((((cr-v)*alpha)>>16)+v);
    }
    *p++ = 0xff;