/*
 * Simple Framebuffer Gfx/GUI lib
 *
 * coded by Ketmar // Invisible Vector <ketmar@ketmar.no-ip.org>
 * Understanding is not required. Only obedience.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, version 3 of the License ONLY.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
module iv.egra.gfx.lowlevel /*is aliced*/;
private:
// uncomment this to disable SSE4.1 optimisations
//version = egfx_disable_sse41;

version(egfx_disable_sse41) {
  version(egfx_use_sse41) {
    static assert(false, "EGRA: SSE4.1 is both forced and disabled. wtf?!");
  }
} else {
  version(D_InlineAsm_X86) {
    version(X86) {
      version = egfx_use_sse41;
    } else {
      version(egfx_use_sse41) {
        static assert(false, "EGRA: SSE4.1 is not supported on 64-bit architectures.");
      }
    }
  } else {
    version(egfx_use_sse41) {
      static assert(false, "EGRA: SSE4.1 is not supported on non-DMD compilers.");
    }
  }
}

version(egfx_use_sse41) {
  public enum EGfxUseSSE41 = true;
} else {
  public enum EGfxUseSSE41 = false;
}
// ////////////////////////////////////////////////////////////////////////// //
public void egfxCheckCPU () nothrow @trusted @nogc {
  version(egfx_use_sse41) {
    import core.cpuid : sse41;
    if (!sse41) {
      import core.stdc.stdio : stderr, fprintf;
      fprintf(stderr, "ERROR: EGRA requires CPU with SSE4.1 support!\n");
      assert(0, "ERROR: EGRA requires CPU with SSE4.1 support!");
    }
  }
}
// ////////////////////////////////////////////////////////////////////////// //
// mix `dcvar` with ARGB (or ABGR) `colvar`; dc A is ignored (set to 255)
// main code almost never calls this with solid or transparent `colvar`
// the result will be put into `destvar` (it is written only once, at the end)
// `colvar` and `dcvar` may be read several times
// see http://stereopsis.com/doubleblend.html for the inspiration
version(none) {
// this works for solid and transparent colors too
public enum GxColMixMixin(string destvar, string dcvar, string colvar) = `{
  immutable uint col_ = `~colvar~`;
  immutable uint dc_ = (`~dcvar~`)&0xffffffu;
  /*immutable uint a_ = 256-(col_>>24);*/ /* to not lose bits */
  immutable uint a_ = (col_>>24)+1; /* so it will work for both 0 and 255 correctly */
  immutable uint srb_ = (col_&0xff00ffu);
  immutable uint sg_ = (col_&0x00ff00u);
  immutable uint drb_ = (dc_&0xff00ffu);
  immutable uint dg_ = (dc_&0x00ff00u);
  immutable uint orb_ = (drb_+(((srb_-drb_)*a_+0x800080u)>>8))&0xff00ffu;
  immutable uint og_ = (dg_+(((sg_-dg_)*a_+0x008000u)>>8))&0x00ff00u;
  (`~destvar~`) = orb_|og_|0xff_00_00_00u;
}`;
} else {
// this works for solid and transparent colors too
public enum GxColMixMixin(string destvar, string dcvar, string colvar) = `{
  immutable uint a_ = ((`~colvar~`)>>24)+1u; /* to not lose bits */
  uint rb_ = (`~dcvar~`)&0xff00ffu;
  uint g_ = (`~dcvar~`)&0x00ff00u;
  rb_ += ((cast(uint)((`~colvar~`)&0xff00ffu)-rb_)*a_)>>8;
  g_ += ((cast(uint)((`~colvar~`)&0x00ff00u)-g_)*a_)>>8;
  /* g is mixed with solid alpha; replace "0xff_" with other alpha if you want to */
  (`~destvar~`) = (rb_&0xff00ffu)|(g_&0xff_00ff00u)|0xff_00_00_00u;
}`;
}
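
// minimal usage sketch for the mixin (this mirrors how the non-SSE `memBlendColor`
// below uses it); the concrete values are only an illustration, the asserts check
// properties that hold for any half-transparent color
unittest {
  uint px = 0xFF_10_20_30u; // background pixel
  uint* mptr = &px;
  immutable uint clr = 0x80_ff_ff_ffu; // roughly half-transparent white
  mixin(GxColMixMixin!("*mptr++", "*mptr", "clr"));
  assert(mptr == &px+1); // `destvar` advanced the pointer
  assert((px>>24) == 0xff); // destination alpha is forced to "solid"
  // every channel moved towards white, and none overshot a byte
  assert(((px>>16)&0xff) >= 0x10 && ((px>>8)&0xff) >= 0x20 && (px&0xff) >= 0x30);
}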
// t is [0..1]
public int gxInterpolateColorF (in uint c0, in uint c1, in float t) pure nothrow @safe @nogc {
  import iv.bclamp;
  import iv.egra.gfx.base;

  if (t <= 0.0f) return c0;
  if (t >= 1.0f) return c1;

  static ubyte interpByte (in ubyte b0, in ubyte b1, in float t) pure nothrow @safe @nogc {
    pragma(inline, true);
    return (b0 == b1 ? b0 : clampToByte(b0+cast(int)((cast(int)b1-cast(int)b0)*t)));
  }

  immutable ubyte r = interpByte(gxGetRed(c0), gxGetRed(c1), t);
  immutable ubyte g = interpByte(gxGetGreen(c0), gxGetGreen(c1), t);
  immutable ubyte b = interpByte(gxGetBlue(c0), gxGetBlue(c1), t);
  immutable ubyte a = interpByte(gxGetAlpha(c0), gxGetAlpha(c1), t);
  return (a<<24)|(r<<16)|(g<<8)|b;
}
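
// quick sanity sketch: out-of-range `t` returns the endpoints verbatim, and the
// halfway blue value lands in the middle (allowing for rounding in the float path)
unittest {
  assert(cast(uint)gxInterpolateColorF(0xFF_00_00_00u, 0xFF_00_00_FFu, -1.0f) == 0xFF_00_00_00u);
  assert(cast(uint)gxInterpolateColorF(0xFF_00_00_00u, 0xFF_00_00_FFu, 2.0f) == 0xFF_00_00_FFu);
  immutable int mid = gxInterpolateColorF(0xFF_00_00_00u, 0xFF_00_00_FFu, 0.5f);
  assert((mid&0xff) >= 127 && (mid&0xff) <= 128);
  assert((mid&0xff_ff_ff_00u) == 0xFF_00_00_00u);
}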
// t is [0..65535]
public int gxInterpolateColorI (in uint c0, in uint c1, in int t) pure nothrow @safe @nogc {
  if (t <= 0) return c0;
  if (t >= 65535) return c1;
  if (c0 == c1) return c0;

  version(all) {
    immutable uint a_ = cast(uint)(t>>8)+1; // to not lose bits
    uint rb_ = c0&0xff00ffu;
    uint g_ = c0&0x00ff00u;
    rb_ += (((c1&0xff00ffu)-rb_)*a_)>>8;
    g_ += (((c1&0x00ff00u)-g_)*a_)>>8;
    /* g is mixed with solid alpha; replace "0xff_" with other alpha if you want to */
    immutable uint res = (rb_&0xff00ffu)|(g_&0x00ff00u);
    // now mix alpha
    immutable int a0 = (c0>>24);
    immutable int a1 = (c1>>24);
    // same alpha?
    if (a0 == a1) return res|(c0&0xff000000u);
    // mix both alphas
    return res|(((((a1-a0)*(t+1))>>16)+a0)<<24);
  } else {
    //return gxInterpolateColorF(c0, c1, cast(float)t/65535.0);
    int b0 = cast(int)cast(ubyte)c0;
    int b1 = cast(int)cast(ubyte)c1;
    if (b0 != b1) b0 = cast(ubyte)((((b1-b0)*(t+1))>>16)+b0);

    int g0 = cast(int)cast(ubyte)(c0>>8);
    int g1 = cast(int)cast(ubyte)(c1>>8);
    if (g0 != g1) g0 = cast(ubyte)((((g1-g0)*(t+1))>>16)+g0);

    int r0 = cast(int)cast(ubyte)(c0>>16);
    int r1 = cast(int)cast(ubyte)(c1>>16);
    if (r0 != r1) r0 = cast(ubyte)((((r1-r0)*(t+1))>>16)+r0);

    int a0 = cast(int)cast(ubyte)(c0>>24);
    int a1 = cast(int)cast(ubyte)(c1>>24);
    if (a0 != a1) a0 = cast(ubyte)((((a1-a0)*(t+1))>>16)+a0);

    return cast(uint)((a0<<24)|(r0<<16)|(g0<<8)|b0);
  }
}
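
// integer variant sanity sketch: the endpoints pass through, and the halfway point
// is close to the middle (the integer path only keeps 8 bits of `t` for the channels)
unittest {
  assert(cast(uint)gxInterpolateColorI(0xFF_00_00_00u, 0xFF_00_00_FFu, 0) == 0xFF_00_00_00u);
  assert(cast(uint)gxInterpolateColorI(0xFF_00_00_00u, 0xFF_00_00_FFu, 65535) == 0xFF_00_00_FFu);
  immutable int mid = gxInterpolateColorI(0xFF_00_00_00u, 0xFF_00_00_FFu, 32768);
  assert((mid&0xff) >= 127 && (mid&0xff) <= 129);
  assert((mid&0xff_ff_ff_00u) == 0xFF_00_00_00u);
}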
// ////////////////////////////////////////////////////////////////////////// //
// size is in dwords
version(egfx_use_sse41) {
//pragma(msg," !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ");

align(16) immutable ubyte[16] sseSpreadOneColor = [
  0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03,
  0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03,
];
// for x86 naked functions, DMD will pass the last arg in EAX
// sadly, with -O DMD makes some assumptions about dead registers, and nothing works
// right unless we preserve ESI and EDI (and EBX in case of PIC code) ourselves
public uint* memFillDW (uint* mptr, in uint value, in int count) nothrow @trusted @nogc {
  asm nothrow @trusted @nogc {
    naked;
    xchg EDI,/*SS:*/[ESP+8]; // EDI=mptr; also, save old EDI
    cmp EAX,0;
    jle done;

    mov ECX,EAX; // ECX=count (because the last arg is in EAX)
    mov EAX,/*SS:*/[ESP+4]; // EAX=value

    cmp ECX,8;
    jc simplestore; // too small

    // load XMM0 with our color
    push EAX;
    push EAX;
    // `movdqu` could be used here, because it indicates the int type;
    // this doesn't matter, it just looks nicer
    // `movlps` does the same job and is one byte shorter
    movlps XMM0,/*SS:*/[ESP];
    movlhps XMM0,XMM0; // copy low 64 bits of XMM0 to high 64 bits of XMM0
    //movdqu XMM0,/*SS:*/[ESP];
    //pshufb XMM0,[sseSpreadOneColor];
    add ESP,8;

    // if we cannot align at all, use "rep stosd"
    // this should not happen, so i won't bother optimising it
    test EDI,0x03;
    jnz simplestore;

    // align EDI (we have at least 8 pixels to fill here, so it is safe)
  alignloop:
    test EDI,0x0f;
    jz alignok;
    stosd;
    dec ECX;
    jmp alignloop;

  alignok:
    // ECX is never zero here
    cmp ECX,4;
    jc simplestore; // too small

    // save last 2 bits of counter (we'll mask them later)
    movzx EDX,CL;

    // fill by 4 pixels while we can
    shr ECX,2;
    //align 16; // why not
  alignfill:
    movaps [EDI],XMM0;
    add EDI,16;
    dec ECX;
    jnz alignfill;

    // fill last 1-3 pixels
    mov ECX,EDX;
    and CL,0x03;
    jz done;

  simplestore:
    rep; stosd;

  done:
    mov EAX,EDI; // return new mptr
    mov EDI,/*SS:*/[ESP+8]; // restore EDI
    ret 4*2;
  }
}
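
// usage sketch: fill a (possibly unaligned) span of dwords and get back the advanced
// pointer; the odd offset and length are chosen to exercise the alignment paths
unittest {
  uint[40] buf = 0;
  uint* end = memFillDW(buf.ptr+1, 0xdead_f00du, 37);
  assert(end == buf.ptr+1+37);
  assert(buf[0] == 0 && buf[38] == 0 && buf[39] == 0); // neighbours untouched
  foreach (immutable i; 1..38) assert(buf[i] == 0xdead_f00du);
}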
// WARNING! this function is not quite right (0 and 255 alphas will still modify the colors)
// WARNING! do not call it with fully opaque or fully transparent `clr`!
public alias memBlendColor = sseBlendColor;

public uint* memBlendColor (uint* mptr, in uint clr, int count) nothrow @trusted @nogc {
  pragma(inline, true);
  version(all) {
    if (count < 1) return mptr;
    immutable int c4 = (count>>2); // it is actually unsigned
    if (c4) { mptr = sseBlendColor4px(mptr, clr, cast(uint)c4); count -= (c4<<2); }
    return (count ? memBlendColorSlow(mptr, clr, count) : mptr);
  } else {
    return memBlendColorSlow(mptr, clr, count);
  }
}
align(16) immutable ubyte[16] sseSpreadAlpha = [
  0xff, 0x03, 0xff, 0x03, 0xff, 0x03, 0xff, 0xff,
  0xff, 0x03, 0xff, 0x03, 0xff, 0x03, 0xff, 0xff,
];

align(16) immutable ubyte[16] sseMaxAlpha = [
  0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0x00,
  0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0x00,
];

align(16) immutable ubyte[16] sseFullByteAlpha = [
  0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff,
  0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff,
];
// mix foreground into background
// EAX is the pixel count
// background = (alpha * foreground) + (1-alpha)*background
// WARNING! this function is not quite right (0 and 255 alphas will still modify the colors)
// WARNING! do not call it with fully opaque or fully transparent `clr`!
public uint* sseBlendColor (uint* dest, uint clr, uint count) nothrow @trusted @nogc {
  asm nothrow @trusted @nogc {
    naked;
    //enter 0,0; // this is actually slower than the byte soup below
    push EBP;
    mov EBP,ESP;
    // save modified registers
    push EDI;

    mov EDI,[EBP+12]; // dest
    // it can be negative
    cmp EAX,0;
    jle done;
    mov ECX,EAX; // counter

    // EAX: count
    // [EBP+8]: clr
    // [EBP+12]: dest

    // align stack
    sub ESP,16;
    and ESP,0xfffffff0u;

    mov EAX,[EBP+8]; // clr

    // we can premultiply clr first, and convert alpha to 255-alpha

    // prepare SSE data -- 2 pixels
    mov /*SS:*/[ESP],EAX;
    mov /*SS:*/[ESP+4],EAX;

    // `movdqa` could be used here, because it indicates the int type;
    // this doesn't matter, it just looks nicer
    // `movlps` does the same job and is one byte shorter
    movlps XMM0,/*SS:*/[ESP];
    //movdqa XMM0,/*SS:*/[ESP];
    // expand 8 ubytes to 8 ushorts
    pmovzxbw XMM1,XMM0;
    // XMM0: xx xx xx xx ar gb ar gb
    // XMM1: 0a 0r 0g 0b 0a 0r 0g 0b
    pshufb XMM0,[sseSpreadAlpha];
    // XMM0: 00 0a 0a 0a 00 0a 0a 0a
    movdqa XMM7,[sseMaxAlpha];
    psubw XMM7,XMM0; // XMM7 is 255-alpha
    // XMM7: 00 0a 0a 0a 00 0a 0a 0a
    pmulhuw XMM0,XMM1;
    // XMM0: 00 0r 0g 0b 00 0r 0g 0b
    movdqa XMM6,[sseFullByteAlpha];

    //XMM0: 2 premultiplied colors
    //XMM7: 2 inverted alphas
    //XMM6: destination alpha (replace value)
    // totally unaligned?
    // this should never happen, but meh...
    test EDI,0x03;
    jnz slowestpath; // alas, the slowest path

    // align the address (if necessary)
    test EDI,0x0f;
    jz trymix8aligned;

    // we need to mix 1-3 pixels to make the address aligned
    // check counter here to allow the "slow, but aligned" path (see the code below)
    cmp ECX,4;
    jc slowestpath; // alas

    // process 4 pixels (we will drop unused ones)
    movdqu XMM5,[EDI]; // 4 background pixels
    pmovzxbw XMM1,XMM5; // expand 2 lower pixels to XMM1
    // copy high part of XMM5 to low part of XMM5
    movhlps XMM5,XMM5;
    pmovzxbw XMM2,XMM5; // expand 2 upper pixels to XMM2
    //XMM1: 2 lower pixels
    //XMM2: 2 upper pixels

    pmulhuw XMM1,XMM7; // multiply by alpha
    pmulhuw XMM2,XMM7; // multiply by alpha

    paddusw XMM1,XMM0; // add premultiplied colors
    paddusw XMM2,XMM0; // add premultiplied colors

    packuswb XMM1,XMM2;

    // set destination alpha
    por XMM1,XMM6;

    // now write 1-3 pixels to align the address
    // we are guaranteed to have at least 4 pixels to mix here
    // i.e. 4 processed pixels, and at least 4 pixels in the counter

    // put them in temp storage (it is aligned)
    movdqa /*SS:*/[ESP],XMM1;
    mov EDX,ESI; // save ESI (DMD expects it unchanged)
    lea ESI,[ESP];
  uastoreloop:
    movsd;
    dec ECX;
    test EDI,0x0f;
    jnz uastoreloop;
    mov ESI,EDX; // restore ESI
    // ECX is at least 1 here, and EDI is aligned
  trymix8aligned:
    // ECX is never zero here
    // use the "slow, but aligned" path if we have less than 8 pixels to process
    cmp ECX,8;
    jc slowalignedpath;

    // save last 3 bits in EAX
    // we'll mask them later
    movzx EAX,CL;

    // process by 8 pixels while we can
    shr ECX,3;

  mix8aligned:
    movdqa XMM5,[EDI]; // 4 background pixels
    pmovzxbw XMM1,XMM5; // expand 2 lower pixels to XMM1
    // copy high part of XMM5 to low part of XMM5
    movhlps XMM5,XMM5;
    pmovzxbw XMM2,XMM5; // expand 2 upper pixels to XMM2
    //XMM1: 2 lower pixels
    //XMM2: 2 upper pixels

    movdqa XMM5,[EDI+16]; // 4 background pixels
    pmovzxbw XMM3,XMM5; // expand 2 lower pixels to XMM3
    // copy high part of XMM5 to low part of XMM5
    movhlps XMM5,XMM5;
    pmovzxbw XMM4,XMM5; // expand 2 upper pixels to XMM4
    //XMM3: 2 lower pixels
    //XMM4: 2 upper pixels

    pmulhuw XMM1,XMM7; // multiply by alpha
    pmulhuw XMM2,XMM7; // multiply by alpha
    pmulhuw XMM3,XMM7; // multiply by alpha
    pmulhuw XMM4,XMM7; // multiply by alpha

    paddusw XMM1,XMM0; // add premultiplied colors
    paddusw XMM2,XMM0; // add premultiplied colors
    paddusw XMM3,XMM0; // add premultiplied colors
    paddusw XMM4,XMM0; // add premultiplied colors

    packuswb XMM1,XMM2;
    packuswb XMM3,XMM4;

    // set destination alpha
    por XMM1,XMM6;
    por XMM3,XMM6;

    movdqa [EDI],XMM1;
    movdqa [EDI+16],XMM3;

    add EDI,32;
    dec ECX;
    jnz mix8aligned;

    // do the last 1-7 pixels (the leftover count is in EAX)
    // EDI is guaranteed to be aligned here
    mov ECX,EAX;
    and CL,0x07;
    jnz slowalignedpath;

    // we're done
    mov EAX,EDI;
    mov EDI,[EBP-4]; // restore EDI
    //leave; // this is actually slower than the byte soup below
    mov ESP,EBP;
    pop EBP;
    ret 4*2;
  align 16;
    // mix by 4 pixels, unaligned
  slowestpath:
    // mix 4 pixels
    movdqu XMM5,[EDI]; // 4 background pixels
    pmovzxbw XMM1,XMM5; // expand 2 lower pixels to XMM1
    // copy high part of XMM5 to low part of XMM5
    movhlps XMM5,XMM5;
    pmovzxbw XMM2,XMM5; // expand 2 upper pixels to XMM2
    //XMM1: 2 lower pixels
    //XMM2: 2 upper pixels

    pmulhuw XMM1,XMM7; // multiply by alpha
    pmulhuw XMM2,XMM7; // multiply by alpha

    paddusw XMM1,XMM0; // add premultiplied colors
    paddusw XMM2,XMM0; // add premultiplied colors

    packuswb XMM1,XMM2;

    // set destination alpha
    por XMM1,XMM6;

    sub ECX,4;
    jc slowestlast;

    movdqu [EDI],XMM1;
    add EDI,16;
    jecxz done;
    jmp slowestpath;

    // last 1-3 pixels (never 0)
  slowestlast:
    // put them in temp storage (it is aligned)
    movdqa /*SS:*/[ESP],XMM1;
    mov EDX,ESI; // save ESI (DMD expects it unchanged)
    lea ESI,[ESP];
    and ECX,0x03; // leftover counter
    rep; movsd;
    mov ESI,EDX; // restore ESI
    jmp done;

  done:
    mov EAX,EDI;
    mov EDI,[EBP-4]; // restore EDI
    //leave; // this is actually slower than the byte soup below
    mov ESP,EBP;
    pop EBP;
    ret 4*2;
  align 16;
    // mix by 4 pixels, aligned (used for 1-7 pixels)
  slowalignedpath:
    // mix 4 pixels
    movdqa XMM5,[EDI]; // 4 background pixels
    pmovzxbw XMM1,XMM5; // expand 2 lower pixels to XMM1
    // copy high part of XMM5 to low part of XMM5
    movhlps XMM5,XMM5;
    pmovzxbw XMM2,XMM5; // expand 2 upper pixels to XMM2
    //XMM1: 2 lower pixels
    //XMM2: 2 upper pixels

    pmulhuw XMM1,XMM7; // multiply by alpha
    pmulhuw XMM2,XMM7; // multiply by alpha

    paddusw XMM1,XMM0; // add premultiplied colors
    paddusw XMM2,XMM0; // add premultiplied colors

    packuswb XMM1,XMM2;

    // set destination alpha
    por XMM1,XMM6;

    sub ECX,4;
    jc slowestlast;

    movdqa [EDI],XMM1;
    add EDI,16;
    jecxz done;
    jmp slowalignedpath;
  }
}
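
// a scalar sketch of what the SSE blend above computes, per my reading of the code:
// every channel becomes ((bg*(255-a))>>8) + ((fg*a)>>8) and the destination alpha is
// forced to 0xff; the self-test only compares the asm against this model with a little
// slack, so it is an illustration rather than an exact specification
unittest {
  static uint modelBlend (in uint bg, in uint clr) pure nothrow @safe @nogc {
    immutable uint a = clr>>24;
    uint res = 0xff_00_00_00u;
    foreach (immutable i; 0..3) {
      immutable int shift = i*8;
      immutable uint b = (bg>>shift)&0xff;
      immutable uint f = (clr>>shift)&0xff;
      res |= (((b*(255u-a))>>8)+((f*a)>>8))<<shift;
    }
    return res;
  }
  uint[16] buf = 0xFF_10_80_F0u;
  immutable uint clr = 0x80_20_40_60u; // roughly half-transparent color
  uint* end = sseBlendColor(buf.ptr, clr, cast(uint)buf.length);
  assert(end == buf.ptr+buf.length);
  immutable uint want = modelBlend(0xFF_10_80_F0u, clr);
  foreach (immutable px; buf) {
    assert((px>>24) == 0xff); // destination alpha is forced to solid
    foreach (immutable i; 0..3) {
      immutable int shift = i*8;
      immutable int d = cast(int)((px>>shift)&0xff)-cast(int)((want>>shift)&0xff);
      assert(d >= -1 && d <= 1); // leave slack for truncation details
    }
  }
}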
// for x86 naked functions, DMD will pass the last arg in EAX
// sadly, with -O DMD makes some assumptions about dead registers, and nothing works
// right unless we preserve ESI and EDI (and EBX in case of PIC code) ourselves
// this doesn't change every 2nd pixel; `count` is the count of ALL pixels
public uint* memFillDWDash (uint* mptr, in uint value, in int count) nothrow @trusted @nogc {
  asm nothrow @trusted @nogc {
    naked;
    xchg EDI,/*SS:*/[ESP+8]; // EDI=mptr; also, save old EDI
    // it can be negative
    cmp EAX,1;
    jl quit; // 0 or less
    // another jump is done later, after the setup
    mov ECX,EAX; // ECX=count (because the last arg is in EAX)
    mov EAX,/*SS:*/[ESP+4]; // EAX=value
    // if we are only filling one pixel, just do it
    je onepixel;

    // ECX is always >=2 here, and we are actually processing 2 pixels at a time anyway
    mov DL,CL; // save the last bit for later use (we may need to set the last pixel)
    shr ECX,1;

  storeloop:
    mov [EDI],EAX;
    add EDI,8;
    dec ECX;
    jnz storeloop;

    // set the last pixel
    test DL,1;
    jz quit;
  onepixel:
    stosd;

  quit:
    mov EAX,EDI; // return new mptr
    mov EDI,/*SS:*/[ESP+8]; // restore EDI
    ret 8;
  }
}
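
// usage sketch: the "dashed" fill touches pixels 0, 2, 4, ... and leaves the odd ones
// alone, but the returned pointer is still advanced past ALL `count` pixels
unittest {
  uint[7] buf = 0x11111111u;
  uint* end = memFillDWDash(buf.ptr, 0xff00ff00u, cast(int)buf.length);
  assert(end == buf.ptr+buf.length);
  foreach (immutable i, immutable v; buf) {
    assert(v == (i&1 ? 0x11111111u : 0xff00ff00u));
  }
}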
//TODO: rewrite this with SSE
// EAX is `count`
// sadly, with -O DMD makes some assumptions about dead registers, and nothing works
// right unless we preserve ESI and EDI (and EBX in case of PIC code) ourselves
// this doesn't change every 2nd pixel; `count` is the count of ALL pixels
public uint* memBlendColorDash (uint* mptr, in uint clr, in int count) nothrow @trusted @nogc {
  asm nothrow @trusted @nogc {
    naked;

    xchg EDI,/*SS:*/[ESP+8]; // EDI=mptr; also, save old EDI
    cmp EAX,0;
    jle quit;

    push EBP; // EBP will contain the counter
    push EBX; // EBX is a temporary register
    push ESI; // DMD expects ESI to be unmodified at exit
    mov EBP,EAX; // EBP=counter

    mov EAX,/*SS:*/[ESP+16]; // EAX=clr
    mov ECX,EAX; // ECX will be clrA
    // clrG=clr&0x00ff00u;
    and EAX,0x00ff00u;
    push EAX;
    // clrRB=clr&0xff00ffu;
    mov EAX,ECX;
    and EAX,0xff00ffu;
    push EAX;
    // ECX=clrA=(clr>>24)+1; -- `+1` to keep some precision
    shr ECX,24;
    inc ECX;

    // [ESP+0]: clrRB
    // [ESP+4]: clrG
    // ESI
    // EBX
    // EBP
    // ret addr
    // clr
    // mptr
    // EBP=counter
    // EDI=mptr
    // ECX=clrA

  align 16; // why not

    /*
      clrA = (clr>>24)+1;
      clrRB = clr&0xff00ffu;
      clrG = clr&0x00ff00u;

      rb = (*mptr)&0xff00ffu;
      rb += ((clrRB-rb)*clrA)>>8;
      rb &= 0xff00ffu;

      g = (*mptr)&0x00ff00u;
      g += ((clrG-g)*clrA)>>8;
      g &= 0x00ff00u;

      *mptr++ = rb|g|0xff000000u;
    */
  mixloop:
    // rb = (*mptr)&0xff00ffu;
    // rb += (((clrRB-rb)*clrA)>>8)&0xff00ffu;
    mov EBX,[EDI];
    mov ESI,EBX; // save `*mptr`
    and EBX,0xff00ffu; // EBX=rb=(*mptr)&0xff00ffu
    mov EAX,/*SS:*/[ESP]; // EAX=clrRB
    sub EAX,EBX; // EAX=clrRB-rb
    mul ECX; // EAX=(clrRB-rb)*clrA (EDX is dead)
    shr EAX,8; // EAX=((clrRB-rb)*clrA)>>8
    add EBX,EAX; // EBX=rb+(((clrRB-rb)*clrA)>>8)
    and EBX,0xff00ffu; // EBX=(rb+(((clrRB-rb)*clrA)>>8))&0xff00ffu

    // g = (*mptr)&0x00ff00u;
    // g += (((clrG-g)*clrA)>>8)&0x00ff00u;
    mov EDX,ESI; // EDX=*mptr
    and EDX,0x00ff00u; // EDX=g=(*mptr)&0x00ff00u
    mov ESI,EDX; // save g, we will need it later
    mov EAX,/*SS:*/[ESP+4]; // EAX=clrG
    sub EAX,EDX; // EAX=clrG-g
    mul ECX; // EAX=(clrG-g)*clrA (EDX is dead)
    shr EAX,8; // EAX=((clrG-g)*clrA)>>8
    add EAX,ESI; // EAX=(((clrG-g)*clrA)>>8)+g
    and EAX,0x00ff00u; // EAX=((((clrG-g)*clrA)>>8)+g)&0x00ff00u

    // mix
    or EAX,EBX;
    or EAX,0xff000000u;

    stosd;
    dec EBP;
    jz mixdone;
    add EDI,4;
    dec EBP;
    jnz mixloop;

  mixdone:
    add ESP,2*4; // drop temp vars
    // restore registers
    pop ESI;
    pop EBX;
    pop EBP;

  quit:
    mov EAX,EDI; // result
    mov EDI,/*SS:*/[ESP+8]; // restore EDI
    ret 8;
  }
}
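
// usage sketch: the dashed blend touches pixels 0, 2, 4, ... only; the untouched odd
// pixels keep their original value (alpha byte included), the blended ones get a solid alpha
unittest {
  uint[6] buf = 0x40_10_20_30u;
  uint* end = memBlendColorDash(buf.ptr, 0x80_ff_ff_ffu, cast(int)buf.length);
  assert(end == buf.ptr+buf.length);
  foreach (immutable i, immutable v; buf) {
    if (i&1) assert(v == 0x40_10_20_30u); // untouched
    else assert((v>>24) == 0xff); // blended, alpha forced to solid
  }
}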
// EAX is `count`
// sadly, with -O DMD makes some assumptions about dead registers, and nothing works
// right unless we preserve ESI and EDI (and EBX in case of PIC code) ourselves
// this is using a branch for empty/opaque alphas; i didn't profile it, but i think it is faster than 3 muls
public void memBlendColorCoverage (uint* mptr, const(ubyte)* coverage, in uint clr, in int count) nothrow @trusted @nogc {
  asm nothrow @trusted @nogc {
    naked;
    //enter 0,0; // this is actually slower than the byte soup below
    push EBP;
    mov EBP,ESP;
    // save modified registers
    push EDI;
    push ESI;
    push EBX;

    mov EDI,[EBP+16]; // dest

    cmp EAX,0;
    jle done;
    mov ECX,EAX;

    mov AL,byte ptr [EBP+11]; // c.a
    // a completely transparent color changes nothing, so we are done (just in case)
    or AL,AL;
    jz done;

    mov ESI,[EBP+12]; // coverage
    // for a fully opaque color we can skip one load and one mul per pixel
    inc AL;
    jz fullyopaque;
  mixloop:
    movzx EAX,byte ptr [ESI]; // load coverage byte
    inc ESI;
    or AL,AL;
    jz mixskip;
    // alpha = (*coverage)*c.a;
    movzx EDX,byte ptr [EBP+11]; // c.a
    inc EDX; // for better precision
    mul EDX;
    // is the resulting alpha completely opaque?
    cmp AX,0xff00; // 256*255 == 0xff00
    jz mixopaque;
    mov EBX,EAX;
    // EBX: alpha

    // cast(ubyte)((((c.b-v)*alpha)>>16)+v);
    movzx EAX,byte ptr [EBP+8]; // c.b
    movzx EDX,byte ptr [EDI]; // v
    sub EAX,EDX;
    // EAX: c.b-v
    mul EBX;
    // EAX: (c.b-v)*alpha
    // EDX: dead
    shr EAX,16;
    add byte ptr [EDI],AL;
    inc EDI;

    movzx EAX,byte ptr [EBP+9]; // c.g
    movzx EDX,byte ptr [EDI]; // v
    sub EAX,EDX;
    // EAX: c.g-v
    mul EBX;
    // EAX: (c.g-v)*alpha
    // EDX: dead
    shr EAX,16;
    add byte ptr [EDI],AL;
    inc EDI;

    movzx EAX,byte ptr [EBP+10]; // c.r
    movzx EDX,byte ptr [EDI]; // v
    sub EAX,EDX;
    // EAX: c.r-v
    mul EBX;
    // EAX: (c.r-v)*alpha
    // EDX: dead
    shr EAX,16;
    add byte ptr [EDI],AL;
    inc EDI;

    mov byte ptr [EDI],0xff;
    inc EDI;
  mixnext:
    dec ECX;
    jnz mixloop;

  done:
    pop EBX;
    pop ESI;
    pop EDI;
    //leave; // this is actually slower than the byte soup below
    mov ESP,EBP;
    pop EBP;
    ret 4*3;

  mixskip:
    add EDI,4; // skip the destination pixel
    jmp mixnext;
  mixopaque:
    mov EAX,[EBP+8]; // source pixel
    or EAX,0xff000000; // alpha
    stosd; // copy it
    jmp mixnext;

    // the source color is completely opaque
    // use slightly faster code to calculate the coverage alpha
  fullyopaque:
    movzx EAX,byte ptr [ESI]; // load coverage byte
    inc ESI;
    or AL,AL;
    jz mixskipopaque;
    cmp AL,0xff;
    jz mixopaqueopaque;
    shl EAX,8;
    mov EBX,EAX;
    // EBX: alpha

    // cast(ubyte)((((c.b-v)*alpha)>>16)+v);
    movzx EAX,byte ptr [EBP+8]; // c.b
    movzx EDX,byte ptr [EDI]; // v
    sub EAX,EDX;
    // EAX: c.b-v
    mul EBX;
    // EAX: (c.b-v)*alpha
    // EDX: dead
    shr EAX,16;
    add byte ptr [EDI],AL;
    inc EDI;

    movzx EAX,byte ptr [EBP+9]; // c.g
    movzx EDX,byte ptr [EDI]; // v
    sub EAX,EDX;
    // EAX: c.g-v
    mul EBX;
    // EAX: (c.g-v)*alpha
    // EDX: dead
    shr EAX,16;
    add byte ptr [EDI],AL;
    inc EDI;

    movzx EAX,byte ptr [EBP+10]; // c.r
    movzx EDX,byte ptr [EDI]; // v
    sub EAX,EDX;
    // EAX: c.r-v
    mul EBX;
    // EAX: (c.r-v)*alpha
    // EDX: dead
    shr EAX,16;
    add byte ptr [EDI],AL;
    inc EDI;

    mov byte ptr [EDI],0xff;
    inc EDI;
  mixnextopaque:
    dec ECX;
    jnz fullyopaque;
    jmp done;

  mixskipopaque:
    add EDI,4; // skip the destination pixel
    jmp mixnextopaque;

  mixopaqueopaque:
    mov EAX,[EBP+8]; // source pixel
    or EAX,0xff000000; // alpha
    stosd; // copy it
    jmp mixnextopaque;
  }
}
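
// usage sketch: blend one color into a scanline through a coverage mask (the kind of
// mask an AA rasterizer produces); coverage 0 leaves the pixel alone, 255 applies `clr` fully
unittest {
  uint[3] buf = 0xFF_00_00_00u;
  immutable ubyte[3] cov = [0, 128, 255];
  memBlendColorCoverage(buf.ptr, cov.ptr, 0xff_ff_ff_ffu, cast(int)buf.length);
  assert(buf[0] == 0xFF_00_00_00u); // zero coverage: untouched
  assert(buf[2] == 0xff_ff_ff_ffu); // full coverage of an opaque color: replaced
  assert((buf[1]&0xff) > 0 && (buf[1]&0xff) < 0xff); // partial coverage: somewhere in between
  assert((buf[1]>>24) == 0xff);
}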
} else {
// no SSE
public uint* memFillDW (uint* ptr, in uint value, in int count) nothrow @trusted @nogc {
  pragma(inline, true);
  if (count > 0) {
    ptr[0..cast(usize)count] = value;
    ptr += cast(usize)count;
  }
  return ptr;
}

public uint* memFillDWDash (uint* ptr, in uint value, in int count) nothrow @trusted @nogc {
  pragma(inline, true);
  foreach (immutable c; 0..count) { if (!(c&1)) *ptr++ = value; else ++ptr; }
  return ptr;
}

public uint* memBlendColor (uint* mptr, in uint clr, in int count) nothrow @trusted @nogc {
  foreach (immutable _; 0..count) { mixin(GxColMixMixin!("*mptr++", "*mptr", "clr")); }
  return mptr;
}

public uint* memBlendColorDash (uint* mptr, in uint clr, in int count) nothrow @trusted @nogc {
  foreach (immutable c; 0..count) { if (!(c&1)) mixin(GxColMixMixin!("*mptr++", "*mptr", "clr")); else ++mptr; }
  return mptr;
}
public void memBlendColorCoverage (uint* mptr, const(ubyte)* coverage, in uint clr, in int count) nothrow @trusted @nogc {
  immutable uint cb = clr&0xff;
  immutable uint cg = (clr>>8)&0xff;
  immutable uint cr = (clr>>16)&0xff;
  immutable uint ca = clr>>24;
  ubyte* p = cast(ubyte*)mptr;
  foreach (immutable _; 0..count) {
    immutable uint alpha = (*coverage++)*ca;
    version(none) {
      uint v = *p; *p++ = cast(ubyte)((((cb-v)*alpha)+(v<<16))>>16);
      v = *p; *p++ = cast(ubyte)((((cg-v)*alpha)+(v<<16))>>16);
      v = *p; *p++ = cast(ubyte)((((cr-v)*alpha)+(v<<16))>>16);
    } else {
      uint v = *p; *p++ = cast(ubyte)((((cb-v)*alpha)>>16)+v);
      v = *p; *p++ = cast(ubyte)((((cg-v)*alpha)>>16)+v);
      v = *p; *p++ = cast(ubyte)((((cr-v)*alpha)>>16)+v);
    }
    *p++ = 0xff;