wrlib/x86_specific.c

   1 /* x86_convert.c - convert RImage to XImage with x86 optimizations
   2  *
   3  * Raster graphics library
   4  *
   5  * Copyright (c) 2000-2003 Alfredo K. Kojima
   6  *
   7  *  This library is free software; you can redistribute it and/or
   8  *  modify it under the terms of the GNU Library General Public
   9  *  License as published by the Free Software Foundation; either
  10  *  version 2 of the License, or (at your option) any later version.
  11  *
  12  *  This library is distributed in the hope that it will be useful,
  13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  *  Library General Public License for more details.
  16  *
  17  *  You should have received a copy of the GNU Library General Public
  18  *  License along with this library; if not, write to the Free
  19  *  Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  20  */
  21
  22 #include <config.h>
  23
  24 #ifdef ASM_X86
  25
  26
  27 #ifdef ASM_X86_MMX
  28
  /*
   * x86_check_mmx - report whether the running CPU advertises MMX.
   *
   * Returns 1 when bit 21 of EFLAGS can be toggled (the test used here for
   * CPUID availability) and CPUID leaf 1 reports bit 23 of EDX set;
   * returns 0 otherwise.  The answer is cached in a function-local static,
   * so the probe itself runs only on the first call.
   *
   * The asm declares no clobbers; it relies on pushal/popal to save and
   * restore every general purpose register instead.
   *
   * NOTE(review): the labels (.NotPentium, .NotMMX, .noPop) are plain
   * named labels, so they would presumably clash if the compiler ever
   * emitted this asm statement more than once - confirm against the build.
   */
  29 int
  30 x86_check_mmx()
  31 {
  32     static int result = -1;   // -1 = not probed yet, 0 = no MMX, 1 = MMX
  33
  34     if (result >= 0)          // already probed: return the cached answer
  35         return result;
  36
  37     result = 0;               // default answer; the asm only ever stores 1
  38
  39     asm volatile
  40         ("pushal                \n\t" // save all registers (nothing is declared as clobbered)
  41          "pushfl                \n\t" // check whether cpuid supported
  42          "pop %%eax             \n\t" // eax = current EFLAGS
  43          "movl %%eax, %%ebx     \n\t" // ebx = copy of the original EFLAGS
  44          "xorl $(1<<21), %%eax  \n\t" // flip bit 21
  45          "pushl %%eax           \n\t"
  46          "popfl                 \n\t" // try to write the modified EFLAGS
  47          "pushfl                \n\t"
  48          "popl %%eax            \n\t" // read EFLAGS back
  49          "xorl %%ebx, %%eax     \n\t" // bits that differ from the original
  50          "andl $(1<<21), %%eax  \n\t" // keep only bit 21
  51          "jz .NotPentium        \n\t" // bit 21 did not change: no cpuid
  52          "xorl %%eax, %%eax     \n\t" // no eax effect because of the movl below
  53                                       // except resetting flags. is it needed?
  54          "movl $1, %%eax        \n\t" // cpuid leaf 1
  55          "cpuid                 \n\t"
  56          "test $(1<<23), %%edx  \n\t" // bit 23 of edx is the bit tested for MMX
  57          "jz .NotMMX            \n\t"
  58
  59          "popal                 \n\t" // popal needed because the address of
  60          "movl $1, %0           \n\t" // variable %0 may be kept in a register
  61          "jmp .noPop            \n"   // registers already restored: skip the popal below
  62
  63 ".NotMMX:                       \n"
  64 ".NotPentium:                   \n\t"
  65          "popal                 \n"   // failure paths: restore registers, result stays 0
  66 ".noPop:                        \n\t"
  67
  68          : "=m" (result));
  69
  70     return result;
  71 }
  72
  73
  74 /*
  75  * TODO:
  76  *              32/8    24/8    32/16   24/16   32/24   24/24
  77  * PPlain       YES     YES
  78  * MMX                          DONE
  79  *
  80  *
  81  * - try to align stack (local variable space) into quadword boundary
  82  */
  /*
   * x86_mmx_TrueColor_32_to_16 - MMX conversion of 4-byte pixels to 16-bit
   * pixels with error diffusion.
   *
   * image        source bytes; read through esi, which advances 4 bytes per
   *              pixel and is never adjusted between rows
   * ximage       destination; one 16-bit word is stored per pixel
   * err, nerr    two error rows, 8 bytes (four 16-bit words) per pixel;
   *              which one is read and which one is accumulated into is
   *              chosen per row from the low bit of the row counter
   * rtable,
   * gtable,
   * btable       tables of shorts indexed by the error-adjusted component
   * dr, dg, db   per-component multipliers (low 16 bits used); the looked-up
   *              level times this value is subtracted from the adjusted
   *              component to obtain the error that is diffused
   * roffs,
   * goffs,
   * boffs        left-shift counts applied to the looked-up red, green and
   *              blue levels before they are OR-ed into the output word
   * width        pixels per row; the inner loop is decrement-then-jnz, so
   *              it presumably expects width >= 1 - TODO confirm with callers
   * height       number of rows; the asm decrements this variable in place
   * line_offset  byte count added to the ximage pointer after each row
   *
   * The asm rewrites the ximage and height parameter variables and the local
   * scratch variables through operands that are declared as inputs, and it
   * stores through ximage, err and nerr.
   *
   * NOTE(review): no output operands and no memory clobber are declared for
   * those writes - confirm that the compilers in use tolerate this.
   * NOTE(review): ebx is saved with pushl/popl inside the asm; if any m
   * operand were addressed relative to esp its offset would presumably be
   * off by 4 in between - confirm the build keeps a frame pointer.
   */
  83 void
  84 x86_mmx_TrueColor_32_to_16(unsigned char *image,
  85                            unsigned short *ximage,
  86                            short *err,
  87                            short *nerr,
  88                            short *rtable,
  89                            short *gtable,
  90                            short *btable,
  91                            int dr,
  92                            int dg,
  93                            int db,
  94                            unsigned int roffs,
  95                            unsigned int goffs,
  96                            unsigned int boffs,
  97                            int width,
  98                            int height,
  99                            int line_offset)
 100 {
     // 8-byte scratch: the three looked-up levels plus a zero word, also
     // reused below to build the 64-bit constants loaded into mm3..mm7
 101     union {
 102         long long rrggbbaa;
 103         struct {short int rr, gg, bb, aa;} words;
 104     } rrggbbaa;
 105
     // 8-byte scratch: the error-adjusted pixel, one 16-bit word per component
 106     union {
 107         long long pixel;
 108         struct {short int rr, gg, bb, aa;} words;
 109     } pixel;
 110
     // NOTE(review): as used by the asm, tmp_nerr (%25) holds the pointer
     // into the row being read and tmp_err (%24) the pointer into the row
     // being accumulated, i.e. the opposite of what the names suggest
 111     short *tmp_err;
 112     short *tmp_nerr;
 113     int x;                    // pixels left in the current row
 114
 115     asm volatile
 116         (
 117          "pushl %%ebx                        \n\t" // ebx is not in the clobber list: saved by hand
 118
 119          // pack dr, dg and db into mm6
 120          "movl  %7, %%eax               \n\t"
 121          "movl  %8, %%ebx               \n\t"
 122          "movl  %9, %%ecx               \n\t"
 123          "movw  %%ax, %16               \n\t"
 124          "movw  %%bx, %17               \n\t"
 125          "movw  %%cx, %18               \n\t"
 126          "movw  $0,  %19                \n\t"
 127
 128          "movq  %16, %%mm6              \n\t" // dr dg db 0
 129
 130          // pack 4|4|4|4 into mm7, for shifting (/16)
          // NOTE(review): psrlw with a register operand presumably takes the
          // whole 64-bit register as its count, which would be far above 15
          // with 4 replicated in every word - confirm against the Intel SDM
 131          "movl $0x00040004, %16         \n\t"
 132          "movl $0x00040004, %18         \n\t"
 133          "movq %16, %%mm7               \n\t"
 134
 135          // store constant values for using with mmx when dithering
 136          "movl $0x00070007, %16         \n\t"
 137          "movl $0x00070007, %18         \n\t"
 138          "movq %16, %%mm5               \n\t" // mm5 = 7|7|7|7
 139
 140          "movl $0x00050005, %16         \n\t"
 141          "movl $0x00050005, %18         \n\t"
 142          "movq %16, %%mm4               \n\t" // mm4 = 5|5|5|5
 143
 144          "movl $0x00030003, %16         \n\t"
 145          "movl $0x00030003, %18         \n\t"
 146          "movq %16, %%mm3               \n\t" // mm3 = 3|3|3|3
 147
 148          // process 1 pixel / cycle, each component treated as 16bit
 149          "movl %0, %%esi                \n"   // esi = image (source pointer)
 150
 151 ".LoopYa:                               \n\t"
 152          "movl %13, %%eax               \n\t"
 153          "movl %%eax, %26               \n\t" // x = width
 154
 155          "movl %14, %%eax               \n\t"
 156          "decl %%eax                    \n\t" // y--
 157          "movl %%eax, %14               \n\t" // written back into the height variable
 158          "js .Enda                      \n\t" // if y < 0, goto end
 159          "andl $1, %%eax                \n\t"
 160          "jz .LoopY_1a                  \n"   // if (y&1) == 0 goto LoopY_1a
 161
 162 ".LoopY_0a:                             \n\t"
 163
 164          "movl %2, %%ebx                \n\t" // ebx = err
 165          "movl %%ebx, %25               \n\t" // %25 = row being read = err
 166          "movl %3, %%eax                \n\t" //
 167          "movl %%eax, %24               \n\t" // %24 = row being accumulated = nerr
 168
 169          "jmp .LoopXa                   \n"
 170
 171 ".LoopY_1a:                             \n\t"
 172
 173          "movl %3, %%ebx                \n\t" // ebx = nerr
 174          "movl %%ebx, %25               \n\t" // %25 = row being read = nerr
 175          "movl %2, %%eax                \n\t" //
 176          "movl %%eax, %24               \n\t" // %24 = row being accumulated = err
 177
 178          ".align 16                     \n"
 179 ".LoopXa:                               \n\t"
 180
 181          // calculate errors and pixel components
 182
 183          // depend on ebx, esi, mm6
 184          "movq (%%ebx), %%mm1           \n\t" // mm1 = error[0..3]
 185          "punpcklbw (%%esi), %%mm0      \n\t" // mm0 = image->data[0..3]
 186          "psrlw $8, %%mm0               \n\t" // fixup mm0
 187          "paddusb %%mm1, %%mm0          \n\t" // mm0 = mm0 + mm1 (sat. to 255)
 188          "movq %%mm0, %20               \n\t" // save the pixel
 189
 190          "movzwl %20, %%ecx             \n\t" // ecx = pixel.red
 191          "movl %4, %%edi                \n\t" // edi = rtable
 192          //agi
 193          "leal (%%edi, %%ecx, 2), %%eax \n\t" // eax = &rtable[pixel.red]
 194          // agi
 195          "movw (%%eax), %%dx            \n\t" // dx = rtable[pixel.red]
 196          "movw %%dx, %16                \n\t" // save rr
 197
 198          "movzwl %21, %%ecx             \n\t" // ecx = pixel.green
 199          "movl %5, %%edi                \n\t" // edi = gtable
 200          //agi
 201          "leal (%%edi, %%ecx, 2), %%eax \n\t" // eax = &gtable[pixel.green]
 202          //agi
 203          "movw (%%eax), %%dx            \n\t" // dx = gtable[pixel.green]
 204          "movw %%dx, %17                \n\t" // save gg
 205
 206          "movzwl %22, %%ecx             \n\t" // ecx = pixel.blue
 207          "movl %6, %%edi                \n\t" // edi = btable
 208          //agi
 209          "leal (%%edi, %%ecx, 2), %%eax \n\t" // eax = &btable[pixel.blue]
 210          //agi
 211          "movw (%%eax), %%dx            \n\t" // dx = btable[pixel.blue]
 212          "movw %%dx, %18                \n\t" // save bb
 213
 214          "movw $0, %19                  \n\t" // save dummy aa
 215
 216          "movq %16, %%mm1               \n\t" // load mm1 with rrggbbaa
 217          "pmullw %%mm6, %%mm1           \n\t" // mm1 = rr*dr|...
 218          "psubsw %%mm1, %%mm0           \n\t" // error = pixel - mm1
 219
 220
 221          // distribute the error
 222
 223          // depend on mm0, mm7, mm3, mm4, mm5
          // NOTE(review): as coded, weight 5 (mm4) goes to the previous pixel
          // of the other row and weight 3 (mm3) to the same column, and the
          // weight-3 term is added to the word at 8(%ebx) but stored at
          // (%ebx); the earlier comments described 3 and 5 the other way
          // round - confirm which of the two was intended
 224
 225          "movl %25, %%ebx               \n\t" // ebx = pointer into the row being read
 226
 227          "movq %%mm0, %%mm1             \n\t"
 228          "pmullw %%mm5, %%mm1           \n\t" // mm1 = mm1*7
 229          "psrlw %%mm7, %%mm1            \n\t" // mm1 = mm1/16
 230          "paddw 8(%%ebx), %%mm1                \n\t"
 231          "movq %%mm1, 8(%%ebx)          \n\t" // this row, next pixel += error*7/16
 232
 233
 234          "movl %24, %%ebx               \n\t" // ebx = pointer into the row being accumulated
 235
 236          "movq %%mm0, %%mm1             \n\t"
 237          "pmullw %%mm4, %%mm1           \n\t" // mm1 = mm1*5
 238          "psrlw %%mm7, %%mm1            \n\t" // mm1 = mm1/16
 239          "paddw -8(%%ebx), %%mm1                \n\t"
 240          "movq %%mm1, -8(%%ebx)         \n\t" // other row, previous pixel += error*5/16
 241
 242          "movq %%mm0, %%mm1             \n\t"
 243          "pmullw %%mm3, %%mm1           \n\t" // mm1 = mm1*3
 244          "psrlw %%mm7, %%mm1            \n\t" // mm1 = mm1/16
 245          "paddw 8(%%ebx), %%mm1         \n\t" // adds the words at 8(%ebx), not (%ebx)
 246          "movq %%mm1, (%%ebx)           \n\t" // other row, same pixel = that sum (error*3/16 term)
 247
 248          "psrlw %%mm7, %%mm0            \n\t" // mm0 = mm0/16
 249          "movq %%mm0, 8(%%ebx)          \n\t" // other row, next pixel = error/16 (overwrites)
 250
 251
 252          // calculate final pixel value and store
 253          "movl %10, %%ecx               \n\t"
 254          "movw %16, %%ax                \n\t"
 255          "shlw %%cl, %%ax               \n\t" //NP* ax = r<<roffs
 256
 257          "movl %11, %%ecx               \n\t"
 258          "movw %17, %%bx                \n\t"
 259          "shlw %%cl, %%bx               \n\t" //NP* bx = g<<goffs
 260          "orw %%bx, %%ax                \n\t"
 261
 262          "movl %12, %%ecx               \n\t"
 263          "movw %18, %%bx                \n\t"
 264          "shlw %%cl, %%bx               \n\t" //NP* bx = b<<boffs
 265          "orw %%bx, %%ax                \n\t"
 266
 267          "movl %1, %%edx                \n\t"
 268          "movw %%ax, (%%edx)            \n\t" // *ximage = packed 16-bit pixel
 269          "addl $2, %%edx                \n\t" // increment ximage
 270          "movl %%edx, %1                \n\t" // written back into the ximage variable
 271
 272          // prepare for next iteration on X
 273
 274          "addl $8, %24                  \n\t" // accumulated-row pointer += 8
 275
 276          "movl %25, %%ebx               \n\t"
 277          "addl $8, %%ebx                \n\t"
 278          "movl %%ebx, %25               \n\t" // ebx = read-row pointer += 8
 279
 280
 281          // Note: in the last pixel, this would cause an invalid memory access
 282          // because, punpcklbw is used (which reads 8 bytes) and the last
 283          // pixel is only 4 bytes. This is no problem because the image data
 284          // was allocated with extra 4 bytes when created.
 285          "addl $4, %%esi                        \n\t" // image->data += 4
 286
 287
 288          "decl %26                      \n\t" // x--
 289          "jnz .LoopXa                   \n\t" // if x != 0, goto .LoopXa
 290
 291
 292          // depend on edx
 293          "addl %15, %%edx               \n\t" // add extra offset to ximage
 294          "movl %%edx, %1                \n\t"
 295
 296
 297          "jmp .LoopYa                   \n"
 298
 299 ".Enda:                                 \n\t" // THE END
 300          "emms                          \n\t" // leave MMX state
 301          "popl %%ebx                         \n\t" // restore the ebx saved on entry
 302          :
 303          :
 304          "m" (image),                      // %0
 305          "m" (ximage),                     // %1
 306          "m" (err),                        // %2
 307          "m" (nerr),                       // %3
 308          "m" (rtable),                     // %4
 309          "m" (gtable),                     // %5
 310          "m" (btable),                     // %6
 311          "m" (dr),                         // %7
 312          "m" (dg),                         // %8
 313          "m" (db),                         // %9
 314          "m" (roffs),                      // %10
 315          "m" (goffs),                      // %11
 316          "m" (boffs),                      // %12
 317          "m" (width),                      // %13
 318          "m" (height),                     // %14
 319          "m" (line_offset),                // %15
 320          "m" (rrggbbaa.rrggbbaa),          // %16 (access to rr)
 321          "m" (rrggbbaa.words.gg),          // %17 (access to gg)
 322          "m" (rrggbbaa.words.bb),          // %18 (access to bb)
 323          "m" (rrggbbaa.words.aa),          // %19 (access to aa)
 324          "m" (pixel.pixel),                // %20 (access to pixel.r)
 325          "m" (pixel.words.gg),             // %21 (access to pixel.g)
 326          "m" (pixel.words.bb),             // %22 (access to pixel.b)
 327          "m" (pixel.words.aa),             // %23 (access to pixel.a)
 328          "m" (tmp_err),                    // %24
 329          "m" (tmp_nerr),                   // %25
 330          "m" (x)                           // %26
 331          : "eax", "ecx", "edx", "esi", "edi"
 332         );
 333 }
 334
 335
 /*
  * x86_mmx_TrueColor_24_to_16 - unfinished skeleton of a 24-bit to 16-bit
  * conversion.
  *
  * The parameter list mirrors x86_mmx_TrueColor_32_to_16, but the asm below
  * contains only the row/column loop scaffolding: nothing is read from
  * image and nothing is stored through ximage, err or nerr.  Most operands
  * (%0, %1, %4..%12, %15..%23) are never referenced.
  *
  * What the asm does do: it computes w1 = width / 4 and w2 = width & 3,
  * decrements the height variable in place and runs empty counting loops.
  *
  * NOTE(review): height is decremented twice per row iteration (once by
  * decl %14 and once more through eax).
  * NOTE(review): on the .LoopY_1c path x is loaded from eax, which holds
  * the err pointer at that point rather than w1, so that path presumably
  * spins for a very long time - confirm that nothing calls this function.
  */
 336 void
 337 x86_mmx_TrueColor_24_to_16(unsigned char *image,
 338                            unsigned short *ximage,
 339                            short *err,
 340                            short *nerr,
 341                            short *rtable,
 342                            short *gtable,
 343                            short *btable,
 344                            int dr,
 345                            int dg,
 346                            int db,
 347                            unsigned int roffs,
 348                            unsigned int goffs,
 349                            unsigned int boffs,
 350                            int width,
 351                            int height,
 352                            int line_offset)
 353 {
     // scratch unions declared as in the 32-bit variant; the asm never uses them
 354     union {
 355         long long rrggbbaa;
 356         struct {short int rr, gg, bb, aa;} words;
 357     } rrggbbaa;
 358
 359     union {
 360         long long pixel;
 361         struct {short int rr, gg, bb, aa;} words;
 362     } pixel;
 363
 364     short *tmp_err;
 365     short *tmp_nerr;
 366
 367     int x;                    // loop counter
 368     int w1;                   // width / 4
 369     int w2;                   // width & 3
 370
 371     asm volatile
 372         (
 373          "pushl %%ebx                        \n\t" // ebx is not in the clobber list: saved by hand
 374
 375          "movl %13, %%eax               \n\t" // eax = width
 376          "movl %%eax, %%ebx             \n\t"
 377          "shrl $2, %%eax                \n\t"
 378          "movl %%eax, %27               \n\t" // w1 = width / 4
 379          "andl $3, %%ebx                \n\t"
 380          "movl %%ebx, %28               \n"   // w2 = width & 3
 381
 382
 383 ".LoopYc:                               \n\t"
 384          "movl %13, %%eax               \n\t"
 385          "movl %%eax, %26               \n\t" // x = width
 386
 387          "decl %14                      \n\t" // height--
 388          "js .Endc                      \n\t" // if height < 0 then end
 389
 390          "movl %14, %%eax               \n\t"
 391          "decl %%eax                    \n\t" // y-- (second decrement in the same iteration)
 392          "movl %%eax, %14               \n\t"
 393          "js .Endc                      \n\t" // if y < 0, goto end
 394          "andl $1, %%eax                \n\t"
 395          "jz .LoopY_1c                  \n"   // if (y&1) == 0 goto LoopY_1c
 396
 397 ".LoopY_0c:                             \n\t"
 398
 399          "movl %2, %%ebx                \n\t" // ebx = err
 400          "movl %%ebx, %25               \n\t" // %25 = err
 401          "movl %3, %%eax                \n\t" //
 402          "movl %%eax, %24               \n\t" // %24 = nerr
 403
 404          "jmp .LoopX_1c                 \n"   // enters the loop with x = width
 405
 406 ".LoopY_1c:                             \n\t"
 407
 408          "movl %3, %%ebx                \n\t" // ebx = nerr
 409          "movl %%ebx, %25               \n\t" // %25 = nerr
 410          "movl %2, %%eax                \n\t" //
 411          "movl %%eax, %24               \n\t" // %24 = err
 412
 413          ".align 16                     \n\t"
 414
 415          "movl %%eax, %26               \n"   // x = eax, which still holds err here (not w1)
 416 ".LoopX_1c:                             \n\t"
 417          "decl %26                      \n\t" // x--
 418          "js .Xend1_c                   \n\t" // if x < 0 then end
 419
 420          // do conversion of 4 pixels
 421          "movq %2, %%mm0                \n\t" // loads 8 bytes starting at the err variable itself
 422
 423
 424
 425
 426          "jmp .LoopX_1c                 \n"
 427 ".Xend1_c:                              \n\t"
 428
 429          "movl %28, %%eax               \n\t"
 430          "movl %%eax, %26               \n"   // x = w2
 431 ".LoopX_2c:                             \n\t"
 432          "decl %26                      \n\t" // x--
 433          "js .Xend2_c                   \n\t" // if x < 0 then end
 434          // do conversion
 435          "jmp .LoopX_2c                 \n"
 436 ".Xend2_c:                              \n\t"
 437
 438          "movl %27, %%eax               \n\t" // eax = w1 (overwritten at the top of .LoopYc)
 439          "jmp .LoopYc                   \n"
 440
 441 ".Endc:                                 \n\t" // THE END
 442          "emms                          \n\t"
 443          "popl %%ebx                         \n\t" // restore the ebx saved on entry
 444          :
 445          :
 446          "m" (image),                      // %0
 447          "m" (ximage),                     // %1
 448          "m" (err),                        // %2
 449          "m" (nerr),                       // %3
 450          "m" (rtable),                     // %4
 451          "m" (gtable),                     // %5
 452          "m" (btable),                     // %6
 453          "m" (dr),                         // %7
 454          "m" (dg),                         // %8
 455          "m" (db),                         // %9
 456          "m" (roffs),                      // %10
 457          "m" (goffs),                      // %11
 458          "m" (boffs),                      // %12
 459          "m" (width),                      // %13
 460          "m" (height),                     // %14
 461          "m" (line_offset),                // %15
 462          "m" (rrggbbaa.rrggbbaa),          // %16 (access to rr)
 463          "m" (rrggbbaa.words.gg),          // %17 (access to gg)
 464          "m" (rrggbbaa.words.bb),          // %18 (access to bb)
 465          "m" (rrggbbaa.words.aa),          // %19 (access to aa)
 466          "m" (pixel.pixel),                // %20 (access to pixel.r)
 467          "m" (pixel.words.gg),             // %21 (access to pixel.g)
 468          "m" (pixel.words.bb),             // %22 (access to pixel.b)
 469          "m" (pixel.words.aa),             // %23 (access to pixel.a)
 470          "m" (tmp_err),                    // %24
 471          "m" (tmp_nerr),                   // %25
 472          "m" (x),                          // %26
 473          "m" (w1),                         // %27
 474          "m" (w2)                          // %28
 475           : "eax", "ecx", "edx", "esi", "edi"
 476         );
 477 }
 478
 479
 480
 481 #endif /* ASM_X86_MMX */
 482
 483
 484
 /*
  * x86_PseudoColor_32_to_8 - convert pixels to 8-bit colormap entries with
  * error diffusion, in plain i386 assembly.
  *
  * image          source bytes; bytes 0, 1 and 2 of each pixel are read as
  *                red, green and blue, and the pointer advances by
  *                bytesPerPixel per pixel (never adjusted between rows)
  * ximage         destination; one byte is stored per pixel
  * err, nerr      two error rows of signed bytes, 4 bytes per pixel; which
  *                one is read and which one is accumulated into is chosen
  *                per row from the low bit of the row counter
  * ctable         table of shorts indexed by the clamped (0..255) component;
  *                the same table is used for all three components
  * dr, dg, db     multipliers; mulb uses only the first byte of each
  * pixels         table with 4-byte stride; the first byte of entry
  *                rr*cpc*cpc + gg*cpc + bb is written to ximage
  * cpc            see pixels; the products are formed with 8-bit mulb
  * width          pixels per row; the inner loop is decrement-then-jnz, so
  *                it presumably expects width >= 1 - TODO confirm with callers
  * height         number of rows; the asm decrements this variable in place
  * bytesPerPixel  source stride per pixel, in bytes
  * line_offset    byte count added to ximage after each row
  *
  * The asm declares no outputs and no clobbers: registers are preserved with
  * pushal/popal, and the ximage and height variables and the locals are
  * rewritten through operands that are declared as inputs.
  *
  * NOTE(review): the blue 3/16 term is added at -4(%ecx), the slot the red
  * term uses; by analogy with red (-4) and green (-3) it looks like it
  * should be -2 - confirm before changing.
  * NOTE(review): the epilogue executes emms although no MMX register is
  * used here and this function sits outside the ASM_X86_MMX guard.
  * NOTE(review): the error rows are written at -4 relative to the start of
  * the row, so callers presumably pass pointers with at least one pixel of
  * padding in front - confirm.
  */
 485 void
 486 x86_PseudoColor_32_to_8(unsigned char *image,
 487                         unsigned char *ximage,
 488                         char *err,
 489                         char *nerr,
 490                         short *ctable,
 491                         int dr,
 492                         int dg,
 493                         int db,
 494                         unsigned long *pixels,
 495                         int cpc,
 496                         int width,
 497                         int height,
 498                         int bytesPerPixel,
 499                         int line_offset)
 500 {
 501     int x;                    // pixels left in the current row
 502     int cpcpc;                // cpc * cpc, computed by the asm
 503
 504     int rr;                   // looked-up red level (low 16 bits written)
 505     int gg;                   // looked-up green level (low 16 bits written)
 506     int bb;                   // looked-up blue level (low 16 bits written)
 507
 508     char *tmp_err;            // %19: pointer into the row being accumulated
 509     char *tmp_nerr;           // %20: only referenced by commented-out lines
 510
 511     char ndr; // apparently not used
 512     char ndg; // apparently not used
 513     char ndb; // apparently not used
 514
 515     asm volatile
 516         (
 517          "pushal                        \n\t" // save all registers (no clobber list is declared)
 518
 519          "movl %9, %%eax                \n\t"
 520          "mulb %9                       \n\t"
 521          "movl %%eax, %15               \n\t" // cpcpc = cpc*cpc
 522
 523          // eax will always be <= 0xffff
 524
 525          // process 1 pixel / cycle, each component treated as 16bit
 526          "movl %0, %%esi                \n"   // esi = image (source pointer)
 527
 528 ".LoopYb:                               \n\t"
 529          "movl %10, %%ecx               \n\t"
 530          "movl %%ecx, %14               \n\t" // x = width
 531
 532          "movl %11, %%ecx               \n\t"
 533          "decl %%ecx                    \n\t" // y--
 534          "movl %%ecx, %11               \n\t" // written back into the height variable
 535          "js .Endb                      \n\t" // if y < 0, goto end
 536          "andl $1, %%ecx                \n\t"
 537          "jz .LoopY_1b                  \n"   // if (y&1) == 0 goto LoopY_1b
 538
 539 ".LoopY_0b:                             \n\t"
 540
 541          "movl %2, %%ebx                \n\t" // ebx = err
 542 //useless "movl %%ebx, %20              \n\t" // [-36] = err
 543          "movl %3, %%ecx                \n\t" //
 544          "movl %%ecx, %19               \n\t" // %19 = row being accumulated = nerr
 545
 546          "movl $0, (%%ecx)              \n\t" // zero the first 4 bytes of that row
 547
 548          "jmp .LoopXb                   \n"
 549
 550 ".LoopY_1b:                             \n\t"
 551
 552          "movl %3, %%ebx                \n\t" // ebx = nerr
 553 //useless "movl %%ebx, %20              \n\t" // [-36] = nerr
 554          "movl %2, %%ecx                \n\t" //
 555          "movl %%ecx, %19               \n\t" // %19 = row being accumulated = err
 556
 557          "movl $0, (%%ecx)              \n\t" // zero the first 4 bytes of that row
 558
 559
 560          ".align 16                     \n"
 561 ".LoopXb:                               \n\t"
 562
 563
 564          "movl %4, %%edi                \n\t" // edi = ctable
 565          "xorl %%edx, %%edx             \n\t" // zero the upper word on edx
 566
 567          // RED
 568
 569          // depends on ebx==err, esi==image->data, edi
 570          "movzbw (%%esi), %%dx          \n\t" // dx = image->data[0]
 571          "movsbw (%%ebx), %%ax          \n\t" // ax = error[0]
 572          "addw %%ax, %%dx               \n\t" // pixel.red = data[0] + error[0]
 573
 574          "testb %%dh, %%dh              \n\t" // test if pixel.red < 0 or > 255
 575          "jz .OKRb                      \n\t" // 0 <= pixel.red <= 255
 576          "js .NEGRb                     \n\t" // pixel.red < 0
 577          "movw $0xff, %%dx              \n\t" // pixel.red > 255
 578          "jmp .OKRb                     \n"
 579 ".NEGRb:                                \n\t"
 580          "xorw %%dx, %%dx               \n"
 581 ".OKRb:                                 \n\t"
 582          //partial reg
 583          "leal (%%edi, %%edx, 2), %%ecx \n\t" // ecx = &ctable[pixel.red]
 584          //agi
          // NOTE(review): 32-bit load here, where green and blue use movw
 585          "movl (%%ecx), %%eax           \n\t" // ax = ctable[pixel.red]
 586          "movw %%ax, %16                \n\t" // save rr
 587
 588          "mulb %5                       \n\t" // ax = rr*dr
 589          "subw %%ax, %%dx               \n\t" // rer = dx = dx - rr*dr
 590
 591          "movswl %%dx, %%eax            \n\t" // save rer
 592
 593          // distribute error
 594          "leal (, %%eax, 8), %%ecx      \n\t"
 595          "subw %%dx, %%cx               \n\t" // cx = rer * 7
 596          "sarw $4, %%cx                 \n\t" // cx = rer * 7 / 16
 597          "addb %%cl, 4(%%ebx)           \n\t" // err[x+1] += rer * 7 / 16
 598
 599          "movl %19, %%ecx               \n\t" // ecx = nerr
 600
 601          "leaw (%%eax, %%eax, 4), %%dx  \n\t" // dx = rer * 5
 602          "sarw $4, %%dx                 \n\t" // dx = rer * 5 / 16
 603          "addb %%dl, (%%ecx)            \n\t" // nerr[x] += rer * 5 / 16
 604
 605          "leaw (%%eax, %%eax, 2), %%dx  \n\t" // dx = rer * 3
 606          "sarw $4, %%dx                 \n\t" // dx = rer * 3 / 16
 607          "addb %%dl, -4(%%ecx)          \n\t" // nerr[x-1] += rer * 3 / 16
 608
 609          "sarw $4, %%ax                 \n\t" // ax = rer / 16
 610          "movb %%al, 4(%%ecx)           \n\t" // nerr[x+1] = rer / 16
 611
 612
 613          // GREEN
 614
 615          // depends on ebx, esi, edi
 616          "movzbw 1(%%esi), %%dx         \n\t" // dx = image->data[1]
 617          "movsbw 1(%%ebx), %%ax         \n\t" // ax = error[1]
 618          "addw %%ax, %%dx               \n\t" // pixel.grn = data[1] + error[1]
 619
 620          "testb %%dh, %%dh              \n\t" // test if pixel.grn < 0 or > 255
 621          "jz .OKGb                      \n\t" // 0 <= pixel.grn <= 255
 622          "js .NEGGb                     \n\t" // pixel.grn < 0
 623          "movw $0xff, %%dx              \n\t" // pixel.grn > 255
 624          "jmp .OKGb                     \n"
 625 ".NEGGb:                                \n\t"
 626          "xorw %%dx, %%dx               \n"
 627 ".OKGb:                                 \n\t"
 628          // partial reg
 629          "leal (%%edi, %%edx, 2), %%ecx \n\t" // ecx = &ctable[pixel.grn]
 630          //agi
 631          "movw (%%ecx), %%ax            \n\t" // ax = ctable[pixel.grn]
 632          "movw %%ax, %17                \n\t" // save gg
 633
 634          "mulb %6                       \n\t" // ax = gg*dg
 635          "subw %%ax, %%dx               \n\t" // ger = dx = dx - gg*dg
 636
 637          "movswl %%dx, %%eax            \n\t" // save ger
 638
 639          // distribute error
 640
 641          "leal (, %%eax, 8), %%ecx      \n\t"
 642          "subw %%dx, %%cx               \n\t" // cx = ger * 7
 643          "sarw $4, %%cx                 \n\t" // cx = ger * 7 / 16
 644          "addb %%cl, 5(%%ebx)           \n\t" // err[x+1] += ger * 7 / 16
 645
 646          "movl %19, %%ecx               \n\t" // ecx = nerr
 647
 648          "leaw (%%eax, %%eax, 4), %%dx  \n\t" // dx = ger * 5
 649          "sarw $4, %%dx                 \n\t" // dx = ger * 5 / 16
 650          "addb %%dl, 1(%%ecx)           \n\t" // nerr[x] += ger * 5 / 16
 651
 652          "leaw (%%eax, %%eax, 2), %%dx  \n\t" // dx = ger * 3
 653          "sarw $4, %%dx                 \n\t" // dx = ger * 3 / 16
 654          "addb %%dl, -3(%%ecx)          \n\t" // nerr[x-1] += ger * 3 / 16
 655
 656          "sarw $4, %%ax                 \n\t" // ax = ger / 16
 657          "movb %%al, 5(%%ecx)           \n\t" // nerr[x+1] = ger / 16
 658
 659
 660          // BLUE
 661
 662          // depends on ebx, esi
 663          "movzbw 2(%%esi), %%dx         \n\t" // dx = image->data[2]
 664          "movsbw 2(%%ebx), %%ax         \n\t" // ax = error[2]
 665          "addw %%ax, %%dx               \n\t" // pixel.blu = data[2] + error[2]
 666
 667          "testb %%dh, %%dh              \n\t" // test if pixel.blu < 0 or > 255
 668          "jz .OKBb                      \n\t" // 0 <= pixel.blu <= 255
 669          "js .NEGBb                     \n\t" // pixel.blu < 0
 670          "movw $0xff, %%dx              \n\t" // pixel.blu > 255
 671          "jmp .OKBb                     \n"
 672 ".NEGBb:                                \n\t"
 673          "xorw %%dx, %%dx                       \n"
 674 ".OKBb:                                 \n\t"
 675          //partial reg
 676          "leal (%%edi, %%edx, 2), %%ecx \n\t" // ecx = &ctable[pixel.blu]
 677          //agi
 678          "movw (%%ecx), %%ax            \n\t" // ax = ctable[pixel.blu]
 679          "movw %%ax, %18                \n\t" // save bb
 680
 681          "mulb %7                       \n\t" // ax = bb*db
 682          "subw %%ax, %%dx               \n\t" // ber = dx = dx - bb*db
 683          "movswl %%dx, %%eax            \n\t" // save ber
 684
 685          // distribute error
 686          "leal (, %%eax, 8), %%ecx      \n\t"
 687          "subw %%dx, %%cx               \n\t" // cx = ber * 7
 688          "sarw $4, %%cx                 \n\t" // cx = ber * 7 / 16
 689          "addb %%cl, 6(%%ebx)           \n\t" // err[x+1] += ber * 7 / 16
 690
 691          "movl %19, %%ecx               \n\t" // ecx = nerr
 692
 693          "leaw (%%eax, %%eax, 4), %%dx  \n\t" // dx = ber * 5
 694          "sarw $4, %%dx                 \n\t" // dx = ber * 5 / 16
 695          "addb %%dl, 2(%%ecx)           \n\t" // nerr[x] += ber * 5 / 16
 696
 697          "leaw (%%eax, %%eax, 2), %%dx  \n\t" // dx = ber * 3
 698          "sarw $4, %%dx                 \n\t" // dx = ber * 3 / 16
          // NOTE(review): offset -4 is the red slot of the previous pixel;
          // -2 would be the blue one
 699          "addb %%dl, -4(%%ecx)          \n\t" // previous pixel, byte -4 += ber * 3 / 16
 700
 701          "sarw $4, %%ax                 \n\t" // ax = ber / 16
 702          "movb %%al, 6(%%ecx)           \n\t" // nerr[x+1] = ber / 16
 703
 704          "andl $0xffff, %%eax           \n\t"
 705          // depends on eax & 0xffff0000 == 0
 706          // calculate the index of the value of the pixel
 707          "movw %16, %%ax                \n\t" // ax = rr
 708          "mulb %15                      \n\t" // ax = cpcpc*rr
 709          "movw %%ax, %%cx               \n\t"
 710          "movw %17, %%ax                \n\t" // ax = gg
 711          "mulb %9                       \n\t" // ax = cpc*gg
 712          "addw %%cx, %%ax               \n\t" // ax = cpc*gg + cpcpc*rr
 713          "addw %18, %%ax                \n\t" // ax = cpcpc*rr + cpc*gg + bb
 714
 715          "movl %8, %%ecx                \n\t" // ecx = pixels
 716          //agi
 717          "leal (%%ecx, %%eax, 4), %%edx \n\t" // edx = address of the 4-byte entry
 718          //agi
 719          "movb (%%edx), %%cl            \n\t" // cl = pixels[ax]
 720
 721          // store the pixel
 722          "movl %1, %%eax                \n\t"
 723          "movb %%cl, (%%eax)            \n\t" // *ximage = cl
 724          "incl %1                       \n\t" // ximage++
 725
 726          // prepare for next iteration on X
 727
 728          "addl $4, %19                  \n\t" // nerr += 4
 729          "addl $4, %%ebx                \n\t" // err += 4
 730
 731          "addl %12, %%esi               \n\t" // image->data += bpp
 732
 733          "decl %14                      \n\t" // x--
 734          "jnz .LoopXb                   \n\t" // if x != 0, goto .LoopXb
 735
 736
 737          "movl %13, %%eax               \n\t"
 738          "addl %%eax, %1                \n\t" // add extra offset to ximage
 739
 740          "jmp .LoopYb                   \n"
 741
 742 ".Endb:                                 \n\t"
 743          "emms                          \n\t" // NOTE(review): no MMX register is used in this function
 744          "popal                         \n\t" // restore the registers saved on entry
 745          :
 746          :
 747          "m" (image),         // %0
 748          "m" (ximage),        // %1
 749          "m" (err),           // %2
 750          "m" (nerr),          // %3
 751          "m" (ctable),        // %4
 752          "m" (dr),            // %5
 753          "m" (dg),            // %6
 754          "m" (db),            // %7
 755          "m" (pixels),        // %8
 756          "m" (cpc),           // %9
 757          "m" (width),         // %10
 758          "m" (height),        // %11
 759          "m" (bytesPerPixel), // %12
 760          "m" (line_offset),   // %13
 761          "m" (x),             // %14
 762          "m" (cpcpc),         // %15
 763          "m" (rr),            // %16
 764          "m" (gg),            // %17
 765          "m" (bb),            // %18
 766          "m" (tmp_err),       // %19
 767          "m" (tmp_nerr),      // %20
 768          "m" (ndr),           // %21
 769          "m" (ndg),           // %22
 770          "m" (ndb)            // %23
 771         );
 772 }
 773
 774 #endif /* ASM_X86 */
 775