wrlib/x86_specific.c

   1 /* x86_specific.c - convert RImage to XImage with x86 optimizations
   2  *
   3  * Raster graphics library
   4  *
   5  * Copyright (c) 2000-2003 Alfredo K. Kojima
   6  *
   7  *  This library is free software; you can redistribute it and/or
   8  *  modify it under the terms of the GNU Library General Public
   9  *  License as published by the Free Software Foundation; either
  10  *  version 2 of the License, or (at your option) any later version.
  11  *
  12  *  This library is distributed in the hope that it will be useful,
  13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  *  Library General Public License for more details.
  16  *
  17  *  You should have received a copy of the GNU Library General Public
  18  *  License along with this library; if not, write to the Free
  19  *  Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  20  */
  21
  22 #include <config.h>
  23
  24 #ifdef ASM_X86
  25
  26 #ifdef ASM_X86_MMX
  27
  28 int x86_check_mmx()
  29 {
  30         static int result = -1;
  31
  32         if (result >= 0)
  33                 return result;
  34
  35         result = 0;
  36
  37         asm volatile
  38          ("pushal                \n\t"  // please dont forget this in any asm
  39           "pushfl                \n\t"  // check whether cpuid supported
  40           "pop %%eax             \n\t" "movl %%eax, %%ebx     \n\t" "xorl $(1<<21), %%eax  \n\t" "pushl %%eax           \n\t" "popfl                 \n\t" "pushfl                \n\t" "popl %%eax            \n\t" "xorl %%ebx, %%eax     \n\t" "andl $(1<<21), %%eax  \n\t" "jz .NotPentium        \n\t" "xorl %%eax, %%eax     \n\t"        // no eax effect because of the movl below
  41           // except reseting flags. is it needed?
  42           "movl $1, %%eax        \n\t" "cpuid                 \n\t" "test $(1<<23), %%edx  \n\t" "jz .NotMMX            \n\t" "popal                 \n\t"      // popal needed because the address of
  43           "movl $1, %0           \n\t"  // variable %0 may be kept in a register
  44           "jmp .noPop            \n"
  45           ".NotMMX:                       \n"
  46           ".NotPentium:                   \n\t"
  47           "popal                 \n" ".noPop:                        \n\t":"=m" (result));
  48
  49         return result;
  50 }
  51
  52 /*
  53  * TODO:
  54  *              32/8    24/8    32/16   24/16   32/24   24/24
  55  * PPlain       YES     YES
  56  * MMX                          DONE
  57  *
  58  *
  59  * - try to align stack (local variable space) into quadword boundary
  60  */
  61 void
  62 x86_mmx_TrueColor_32_to_16(unsigned char *image,
  63                            unsigned short *ximage,
  64                            short *err,
  65                            short *nerr,
  66                            unsigned short *rtable,
  67                            unsigned short *gtable,
  68                            unsigned short *btable,
  69                            int dr,
  70                            int dg,
  71                            int db,
  72                            unsigned int roffs,
  73                            unsigned int goffs, unsigned int boffs, int width, int height, int line_offset)
  74 {
  75         union {
  76                 long long rrggbbaa;
  77                 struct {
  78                         short int rr, gg, bb, aa;
  79                 } words;
  80         } rrggbbaa;
  81
  82         union {
  83                 long long pixel;
  84                 struct {
  85                         short int rr, gg, bb, aa;
  86                 } words;
  87         } pixel;
  88
  89         short *tmp_err;
  90         short *tmp_nerr;
  91         int x;
  92
  93         asm volatile
  94          ("pushl %%ebx                   \n\t"
  95           // pack dr, dg and db into mm6
  96           "movl  %7, %%eax               \n\t" "movl  %8, %%ebx               \n\t" "movl  %9, %%ecx               \n\t" "movw  %%ax, %16               \n\t" "movw  %%bx, %17               \n\t" "movw  %%cx, %18               \n\t" "movw  $0,  %19                \n\t" "movq  %16, %%mm6              \n\t"       // dr dg db 0
  97           // pack 4|4|4|4 into mm7, for shifting (/16)
  98           "movl $0x00040004, %16         \n\t"
  99           "movl $0x00040004, %18         \n\t" "movq %16, %%mm7               \n\t"
 100           // store constant values for using with mmx when dithering
 101           "movl $0x00070007, %16         \n\t"
 102           "movl $0x00070007, %18         \n\t"
 103           "movq %16, %%mm5               \n\t"
 104           "movl $0x00050005, %16         \n\t"
 105           "movl $0x00050005, %18         \n\t"
 106           "movq %16, %%mm4               \n\t"
 107           "movl $0x00030003, %16         \n\t"
 108           "movl $0x00030003, %18         \n\t" "movq %16, %%mm3               \n\t"
 109           // process 1 pixel / cycle, each component treated as 16bit
 110           "movl %0, %%esi                \n"    // esi = image->data
 111            ".LoopYa:                               \n\t" "movl %13, %%eax               \n\t" "movl %%eax, %26               \n\t"      // x = width
 112            "movl %14, %%eax               \n\t" "decl %%eax                    \n\t"    // y--
 113           "movl %%eax, %14               \n\t" "js .Enda                      \n\t"     // if y < 0, goto end
 114           "andl $1, %%eax                \n\t" "jz .LoopY_1a                  \n"       // if (y&1) goto LoopY_1
 115            ".LoopY_0a:                             \n\t" "movl %2, %%ebx                \n\t"   // ebx = err
 116           "movl %%ebx, %25               \n\t"  // [-36] = err
 117           "movl %3, %%eax                \n\t"  //
 118           "movl %%eax, %24               \n\t"  // [-32] = nerr
 119            "jmp .LoopXa                   \n" ".LoopY_1a:                             \n\t" "movl %3, %%ebx                \n\t"        // ebx = nerr
 120           "movl %%ebx, %25               \n\t"  // [-36] = nerr
 121           "movl %2, %%eax                \n\t"  //
 122           "movl %%eax, %24               \n\t"  // [-32] = eerr
 123            ".align 16                     \n" ".LoopXa:                               \n\t"
 124           // calculate errors and pixel components
 125           // depend on ebx, esi, mm6
 126           "movq (%%ebx), %%mm1           \n\t"  // mm1 = error[0..3]
 127           "punpcklbw (%%esi), %%mm0      \n\t"  // mm0 = image->data[0..3]
 128           "psrlw $8, %%mm0               \n\t"  // fixup mm0
 129           "paddusb %%mm1, %%mm0          \n\t"  // mm0 = mm0 + mm1 (sat. to 255)
 130           "movq %%mm0, %20               \n\t"  // save the pixel
 131            "movzwl %20, %%ecx             \n\t" // ecx = pixel.red
 132           "movl %4, %%edi                \n\t"  // edi = rtable
 133           //agi
 134           "leal (%%edi, %%ecx, 2), %%eax \n\t"  // eax = &rtable[pixel.red]
 135           // agi
 136           "movw (%%eax), %%dx            \n\t"  // dx = rtable[pixel.red]
 137           "movw %%dx, %16                \n\t"  // save rr
 138            "movzwl %21, %%ecx             \n\t" // ecx = pixel.green
 139           "movl %5, %%edi                \n\t"  // edi = gtable
 140           //agi
 141           "leal (%%edi, %%ecx, 2), %%eax \n\t"  // eax = &gtable[pixel.green]
 142           //agi
 143           "movw (%%eax), %%dx            \n\t"  // dx = gtable[pixel.green]
 144           "movw %%dx, %17                \n\t"  // save gg
 145            "movzwl %22, %%ecx             \n\t" // ecx = pixel.blue
 146           "movl %6, %%edi                \n\t"  // ebx = btable
 147           //agi
 148           "leal (%%edi, %%ecx, 2), %%eax \n\t"  // eax = &btable[pixel.blue]
 149           //agi
 150           "movw (%%eax), %%dx            \n\t"  // dx = btable[pixel.blue]
 151           "movw %%dx, %18                \n\t"  // save bb
 152            "movw $0, %19                  \n\t" // save dummy aa
 153            "movq %16, %%mm1               \n\t" // load mm1 with rrggbbaa
 154           "pmullw %%mm6, %%mm1           \n\t"  // mm1 = rr*dr|...
 155           "psubsw %%mm1, %%mm0           \n\t"  // error = pixel - mm1
 156           // distribute the error
 157           // depend on mm0, mm7, mm3, mm4, mm5
 158            "movl %25, %%ebx               \n\t" "movq %%mm0, %%mm1             \n\t" "pmullw %%mm5, %%mm1           \n\t"       // mm1 = mm1*7
 159           "psrlw %%mm7, %%mm1            \n\t"  // mm1 = mm1/16
 160           "paddw 8(%%ebx), %%mm1         \n\t" "movq %%mm1, 8(%%ebx)          \n\t"     // err[x+1,y] = rer*7/16
 161            "movl %24, %%ebx               \n\t" "movq %%mm0, %%mm1             \n\t" "pmullw %%mm4, %%mm1           \n\t"       // mm1 = mm1*5
 162           "psrlw %%mm7, %%mm1            \n\t"  // mm1 = mm1/16
 163           "paddw -8(%%ebx), %%mm1        \n\t" "movq %%mm1, -8(%%ebx)         \n\t"     // err[x-1,y+1] += rer*3/16
 164            "movq %%mm0, %%mm1             \n\t" "pmullw %%mm3, %%mm1           \n\t"    // mm1 = mm1*3
 165           "psrlw %%mm7, %%mm1            \n\t"  // mm1 = mm1/16
 166           "paddw 8(%%ebx), %%mm1         \n\t" "movq %%mm1, (%%ebx)           \n\t"     // err[x,y+1] += rer*5/16
 167            "psrlw %%mm7, %%mm0            \n\t" // mm0 = mm0/16
 168           "movq %%mm0, 8(%%ebx)          \n\t"  // err[x+1,y+1] = rer/16
 169           // calculate final pixel value and store
 170           "movl %10, %%ecx               \n\t" "movw %16, %%ax                \n\t" "shlw %%cl, %%ax               \n\t"        //NP* ax = r<<roffs
 171            "movl %11, %%ecx               \n\t" "movw %17, %%bx                \n\t" "shlw %%cl, %%bx               \n\t"       //NP*
 172           "orw %%bx, %%ax                \n\t" "movl %12, %%ecx               \n\t" "movw %18, %%bx                \n\t" "shlw %%cl, %%bx               \n\t"   //NP*
 173           "orw %%bx, %%ax                \n\t" "movl %1, %%edx                \n\t" "movw %%ax, (%%edx)            \n\t" "addl $2, %%edx                \n\t"   // increment ximage
 174           "movl %%edx, %1                \n\t"
 175           // prepare for next iteration on X
 176            "addl $8, %24                  \n\t" // nerr += 8
 177            "movl %25, %%ebx               \n\t" "addl $8, %%ebx                \n\t" "movl %%ebx, %25               \n\t"       // ebx = err += 8
 178           // Note: in the last pixel, this would cause an invalid memory access
 179           // because, punpcklbw is used (which reads 8 bytes) and the last
 180           // pixel is only 4 bytes. This is no problem because the image data
 181           // was allocated with extra 4 bytes when created.
 182           "addl $4, %%esi                \n\t"  // image->data += 4
 183            "decl %26                      \n\t" // x--
 184           "jnz .LoopXa                   \n\t"  // if x>0, goto .LoopX
 185           // depend on edx
 186           "addl %15, %%edx               \n\t"  // add extra offset to ximage
 187           "movl %%edx, %1                \n\t" "jmp .LoopYa                   \n" ".Enda:                                 \n\t" // THE END
 188           "emms                          \n\t" "popl %%ebx                    \n\t":: "m" (image),      // %0
 189           "m"(ximage),          // %1
 190           "m"(err),             // %2
 191           "m"(nerr),            // %3
 192           "m"(rtable),          // %4
 193           "m"(gtable),          // %5
 194           "m"(btable),          // %6
 195           "m"(dr),              // %7
 196           "m"(dg),              // %8
 197           "m"(db),              // %9
 198           "m"(roffs),           // %10
 199           "m"(goffs),           // %11
 200           "m"(boffs),           // %12
 201           "m"(width),           // %13
 202           "m"(height),          // %14
 203           "m"(line_offset),     // %15
 204           "m"(rrggbbaa.words.rr),       // %16 (access to rr)
 205           "m"(rrggbbaa.words.gg),       // %17 (access to gg)
 206           "m"(rrggbbaa.words.bb),       // %18 (access to bb)
 207           "m"(rrggbbaa.words.aa),       // %19 (access to aa)
 208           "m"(pixel.words.rr),  // %20 (access to pixel.r)
 209           "m"(pixel.words.gg),  // %21 (access to pixel.g)
 210           "m"(pixel.words.bb),  // %22 (access to pixel.b)
 211           "m"(pixel.words.aa),  // %23 (access to pixel.a)
 212           "m"(tmp_err),         // %24
 213           "m"(tmp_nerr),        // %25
 214           "m"(x)                // %26
 215           :"eax", "ecx", "edx", "esi", "edi");
 216 }
 217
 218 void
 219 x86_mmx_TrueColor_24_to_16(unsigned char *image,
 220                            unsigned short *ximage,
 221                            short *err,
 222                            short *nerr,
 223                            short *rtable,
 224                            short *gtable,
 225                            short *btable,
 226                            int dr,
 227                            int dg,
 228                            int db,
 229                            unsigned int roffs,
 230                            unsigned int goffs, unsigned int boffs, int width, int height, int line_offset)
 231 {
 232         union {
 233                 long long rrggbbaa;
 234                 struct {
 235                         short int rr, gg, bb, aa;
 236                 } words;
 237         } rrggbbaa;
 238
 239         union {
 240                 long long pixel;
 241                 struct {
 242                         short int rr, gg, bb, aa;
 243                 } words;
 244         } pixel;
 245
 246         short *tmp_err;
 247         short *tmp_nerr;
 248
 249         int x;
 250         int w1;
 251         int w2;
 252
 253         asm volatile
 254          ("pushl %%ebx                   \n\t" "movl %13, %%eax               \n\t"     // eax = width
 255           "movl %%eax, %%ebx             \n\t" "shrl $2, %%eax                \n\t" "movl %%eax, %27               \n\t"        // w1 = width / 4
 256           "andl $3, %%ebx                \n\t" "movl %%ebx, %28               \n"       // w2 = width %% 4
 257            ".LoopYc:                               \n\t" "movl %13, %%eax               \n\t" "movl %%eax, %26               \n\t"      // x = width
 258            "decl %14                      \n\t" // height--
 259           "js .Endc                      \n\t"  // if height < 0 then end
 260            "movl %14, %%eax               \n\t" "decl %%eax                    \n\t"    // y--
 261           "movl %%eax, %14               \n\t" "js .Endc                      \n\t"     // if y < 0, goto end
 262           "andl $1, %%eax                \n\t" "jz .LoopY_1c                  \n"       // if (y&1) goto LoopY_1
 263            ".LoopY_0c:                             \n\t" "movl %2, %%ebx                \n\t"   // ebx = err
 264           "movl %%ebx, %25               \n\t"  // [-36] = err
 265           "movl %3, %%eax                \n\t"  //
 266           "movl %%eax, %24               \n\t"  // [-32] = nerr
 267            "jmp .LoopX_1c                 \n" ".LoopY_1c:                             \n\t" "movl %3, %%ebx                \n\t"        // ebx = nerr
 268           "movl %%ebx, %25               \n\t"  // [-36] = nerr
 269           "movl %2, %%eax                \n\t"  //
 270           "movl %%eax, %24               \n\t"  // [-32] = eerr
 271            ".align 16                     \n\t" "movl %%eax, %26               \n"      // x = w1
 272           ".LoopX_1c:                             \n\t" "decl %26                      \n\t"    // x--
 273           "js .Xend1_c                   \n\t"  // if x < 0 then end
 274           // do conversion of 4 pixels
 275           "movq %2, %%mm0                \n\t"  // mm0 = err
 276            "jmp .LoopX_1c                 \n" ".Xend1_c:                              \n\t" "movl %28, %%eax               \n\t" "movl %%eax, %26               \n"     // x = w2
 277           ".LoopX_2c:                             \n\t" "decl %26                      \n\t"    // x--
 278           "js .Xend2_c                   \n\t"  //
 279           // do conversion
 280           "jmp .LoopX_2c                 \n" ".Xend2_c:                              \n\t" "movl %27, %%eax               \n\t" "jmp .LoopYc                   \n" ".Endc:                                 \n\t"        // THE END
 281           "emms                          \n\t" "popl %%ebx                    \n\t":: "m" (image),      // %0
 282           "m"(ximage),          // %1
 283           "m"(err),             // %2
 284           "m"(nerr),            // %3
 285           "m"(rtable),          // %4
 286           "m"(gtable),          // %5
 287           "m"(btable),          // %6
 288           "m"(dr),              // %7
 289           "m"(dg),              // %8
 290           "m"(db),              // %9
 291           "m"(roffs),           // %10
 292           "m"(goffs),           // %11
 293           "m"(boffs),           // %12
 294           "m"(width),           // %13
 295           "m"(height),          // %14
 296           "m"(line_offset),     // %15
 297           "m"(rrggbbaa.words.rr),       // %16 (access to rr)
 298           "m"(rrggbbaa.words.gg),       // %17 (access to gg)
 299           "m"(rrggbbaa.words.bb),       // %18 (access to bb)
 300           "m"(rrggbbaa.words.aa),       // %19 (access to aa)
 301           "m"(pixel.words.rr),  // %20 (access to pixel.r)
 302           "m"(pixel.words.gg),  // %21 (access to pixel.g)
 303           "m"(pixel.words.bb),  // %22 (access to pixel.b)
 304           "m"(pixel.words.aa),  // %23 (access to pixel.a)
 305           "m"(tmp_err),         // %24
 306           "m"(tmp_nerr),        // %25
 307           "m"(x),               // %26
 308           "m"(w1),              // %27
 309           "m"(w2)               // %28
 310           :"eax", "ecx", "edx", "esi", "edi");
 311 }
 312
 313 #endif                          /* ASM_X86_MMX */
 314
 315 void
 316 x86_PseudoColor_32_to_8(unsigned char *image,
 317                         unsigned char *ximage,
 318                         char *err,
 319                         char *nerr,
 320                         short *ctable,
 321                         int dr,
 322                         int dg,
 323                         int db,
 324                         unsigned long *pixels, int cpc, int width, int height, int bytesPerPixel, int line_offset)
 325 {
 326         int x;
 327         int cpcpc;
 328
 329         int rr;
 330         int gg;
 331         int bb;
 332
 333         char *tmp_err;
 334         char *tmp_nerr;
 335
 336         char ndr;               // aparently not used
 337         char ndg;               // aparently not used
 338         char ndb;               // aparently not used
 339
 340         asm volatile
 341          ("pushal                        \n\t" "movl %9, %%eax                \n\t" "mulb %9                       \n\t" "movl %%eax, %15               \n\t"   // cpcpc = cpc*cpc
 342           // eax will always be <= 0xffff
 343           // process 1 pixel / cycle, each component treated as 16bit
 344           "movl %0, %%esi                \n"    // esi = image->data
 345            ".LoopYb:                               \n\t" "movl %10, %%ecx               \n\t" "movl %%ecx, %14               \n\t"      // x = width
 346            "movl %11, %%ecx               \n\t" "decl %%ecx                    \n\t"    // y--
 347           "movl %%ecx, %11               \n\t" "js .Endb                      \n\t"     // if y < 0, goto end
 348           "andl $1, %%ecx                \n\t" "jz .LoopY_1b                  \n"       // if (y&1) goto LoopY_1
 349            ".LoopY_0b:                             \n\t" "movl %2, %%ebx                \n\t"   // ebx = err
 350 //useless "movl %%ebx, %20              \n\t" // [-36] = err
 351           "movl %3, %%ecx                \n\t"  //
 352           "movl %%ecx, %19               \n\t"  // [-32] = nerr
 353            "movl $0, (%%ecx)              \n\t" // init error of nerr[0] to 0
 354            "jmp .LoopXb                   \n" ".LoopY_1b:                             \n\t" "movl %3, %%ebx                \n\t"        // ebx = nerr
 355 //useless "movl %%ebx, %20              \n\t" // [-36] = nerr
 356           "movl %2, %%ecx                \n\t"  //
 357           "movl %%ecx, %19               \n\t"  // [-32] = err
 358            "movl $0, (%%ecx)              \n\t" // init error of nerr[0] to 0
 359            ".align 16                     \n" ".LoopXb:                               \n\t" "movl %4, %%edi                \n\t"        // edi = ctable
 360           "xorl %%edx, %%edx             \n\t"  // zero the upper word on edx
 361           // RED
 362           // depends on ebx==err, esi==image->data, edi
 363           "movzbw (%%esi), %%dx          \n\t"  // dx = image->data[0]
 364           "movsbw (%%ebx), %%ax          \n\t"  // ax = error[0]
 365           "addw %%ax, %%dx               \n\t"  // pixel.red = data[0] + error[0]
 366            "testb %%dh, %%dh              \n\t" // test if pixel.red < 0 or > 255
 367           "jz .OKRb                      \n\t"  // 0 <= pixel.red <= 255
 368           "js .NEGRb                     \n\t"  // pixel.red < 0
 369           "movw $0xff, %%dx              \n\t"  // pixel.red > 255
 370           "jmp .OKRb                     \n"
 371           ".NEGRb:                                \n\t"
 372           "xorw %%dx, %%dx               \n" ".OKRb:                                 \n\t"
 373           //partial reg
 374           "leal (%%edi, %%edx, 2), %%ecx \n\t"  // ecx = &ctable[pixel.red]
 375           //agi
 376           "movl (%%ecx), %%eax           \n\t"  // ax = ctable[pixel.red]
 377           "movw %%ax, %16                \n\t"  // save rr
 378            "mulb %5                       \n\t" // ax = rr*dr
 379           "subw %%ax, %%dx               \n\t"  // rer = dx = dx - rr*dr
 380            "movswl %%dx, %%eax            \n\t" // save rer
 381           // distribute error
 382           "leal (, %%eax, 8), %%ecx      \n\t" "subw %%dx, %%cx               \n\t"     // cx = rer * 7
 383           "sarw $4, %%cx                 \n\t"  // cx = rer * 7 / 16
 384           "addb %%cl, 4(%%ebx)           \n\t"  // err[x+1] += rer * 7 / 16
 385            "movl %19, %%ecx               \n\t" // ecx = nerr
 386            "leaw (%%eax, %%eax, 4), %%dx  \n\t" // dx = rer * 5
 387           "sarw $4, %%dx                 \n\t"  // dx = rer * 5 / 16
 388           "addb %%dl, (%%ecx)            \n\t"  // nerr[x] += rer * 5 / 16
 389            "leaw (%%eax, %%eax, 2), %%dx  \n\t" // dx = rer * 3
 390           "sarw $4, %%dx                 \n\t"  // dx = rer * 3 / 16
 391           "addb %%dl, -4(%%ecx)          \n\t"  // nerr[x-1] += rer * 3 / 16
 392            "sarw $4, %%ax                 \n\t" // ax = rer / 16
 393           "movb %%al, 4(%%ecx)           \n\t"  // nerr[x+1] = rer / 16
 394           // GREEN
 395           // depends on ebx, esi, edi
 396           "movzbw 1(%%esi), %%dx         \n\t"  // dx = image->data[1]
 397           "movsbw 1(%%ebx), %%ax         \n\t"  // ax = error[1]
 398           "addw %%ax, %%dx               \n\t"  // pixel.grn = data[1] + error[1]
 399            "testb %%dh, %%dh              \n\t" // test if pixel.grn < 0 or > 255
 400           "jz .OKGb                      \n\t"  // 0 <= pixel.grn <= 255
 401           "js .NEGGb                     \n\t"  // pixel.grn < 0
 402           "movw $0xff, %%dx              \n\t"  // pixel.grn > 255
 403           "jmp .OKGb                     \n"
 404           ".NEGGb:                                \n\t"
 405           "xorw %%dx, %%dx               \n" ".OKGb:                                 \n\t"
 406           // partial reg
 407           "leal (%%edi, %%edx, 2), %%ecx \n\t"  // ecx = &ctable[pixel.grn]
 408           //agi
 409           "movw (%%ecx), %%ax            \n\t"  // ax = ctable[pixel.grn]
 410           "movw %%ax, %17                \n\t"  // save gg
 411            "mulb %6                       \n\t" // ax = gg*dg
 412           "subw %%ax, %%dx               \n\t"  // ger = dx = dx - gg*dg
 413            "movswl %%dx, %%eax            \n\t" // save ger
 414           // distribute error
 415            "leal (, %%eax, 8), %%ecx      \n\t" "subw %%dx, %%cx               \n\t"    // cx = ger * 7
 416           "sarw $4, %%cx                 \n\t"  // cx = ger * 7 / 16
 417           "addb %%cl, 5(%%ebx)           \n\t"  // err[x+1] += ger * 7 / 16
 418            "movl %19, %%ecx               \n\t" // ecx = nerr
 419            "leaw (%%eax, %%eax, 4), %%dx  \n\t" // dx = ger * 5
 420           "sarw $4, %%dx                 \n\t"  // dx = ger * 5 / 16
 421           "addb %%dl, 1(%%ecx)           \n\t"  // nerr[x] += ger * 5 / 16
 422            "leaw (%%eax, %%eax, 2), %%dx  \n\t" // dx = ger * 3
 423           "sarw $4, %%dx                 \n\t"  // dx = ger * 3 / 16
 424           "addb %%dl, -3(%%ecx)          \n\t"  // nerr[x-1] += ger * 3 / 16
 425            "sarw $4, %%ax                 \n\t" // ax = ger / 16
 426           "movb %%al, 5(%%ecx)           \n\t"  // nerr[x+1] = ger / 16
 427           // BLUE
 428           // depends on ebx, esi
 429           "movzbw 2(%%esi), %%dx         \n\t"  // dx = image->data[2]
 430           "movsbw 2(%%ebx), %%ax         \n\t"  // ax = error[2]
 431           "addw %%ax, %%dx               \n\t"  // pixel.grn = data[2] + error[2]
 432            "testb %%dh, %%dh              \n\t" // test if pixel.blu < 0 or > 255
 433           "jz .OKBb                      \n\t"  // 0 <= pixel.blu <= 255
 434           "js .NEGBb                     \n\t"  // pixel.blu < 0
 435           "movw $0xff, %%dx              \n\t"  // pixel.blu > 255
 436           "jmp .OKBb                     \n"
 437           ".NEGBb:                                \n\t"
 438           "xorw %%dx, %%dx               \n" ".OKBb:                                 \n\t"
 439           //partial reg
 440           "leal (%%edi, %%edx, 2), %%ecx \n\t"  // ecx = &ctable[pixel.blu]
 441           //agi
 442           "movw (%%ecx), %%ax            \n\t"  // ax = ctable[pixel.blu]
 443           "movw %%ax, %18                \n\t"  // save bb
 444            "mulb %7                       \n\t" // ax = bb*db
 445           "subw %%ax, %%dx               \n\t"  // ber = dx = dx - bb*db
 446           "movswl %%dx, %%eax            \n\t"  // save ber
 447           // distribute error
 448           "leal (, %%eax, 8), %%ecx      \n\t" "subw %%dx, %%cx               \n\t"     // cx = ber * 7
 449           "sarw $4, %%cx                 \n\t"  // cx = ber * 7 / 16
 450           "addb %%cl, 6(%%ebx)           \n\t"  // err[x+1] += ber * 7 / 16
 451            "movl %19, %%ecx               \n\t" // ecx = nerr
 452            "leaw (%%eax, %%eax, 4), %%dx  \n\t" // dx = ber * 5
 453           "sarw $4, %%dx                 \n\t"  // dx = ber * 5 / 16
 454           "addb %%dl, 2(%%ecx)           \n\t"  // nerr[x] += ber * 5 / 16
 455            "leaw (%%eax, %%eax, 2), %%dx  \n\t" // dx = ber * 3
 456           "sarw $4, %%dx                 \n\t"  // dx = ber * 3 / 16
 457           "addb %%dl, -4(%%ecx)          \n\t"  // nerr[x-1] += ber * 3 / 16
 458            "sarw $4, %%ax                 \n\t" // ax = ber / 16
 459           "movb %%al, 6(%%ecx)           \n\t"  // nerr[x+1] = ber / 16
 460            "andl $0xffff, %%eax           \n\t"
 461           // depends on eax & 0xffff0000 == 0
 462           // calculate the index of the value of the pixel
 463           "movw %16, %%ax                \n\t"  // ax = rr
 464           "mulb %15                      \n\t"  // ax = cpcpc*rr
 465           "movw %%ax, %%cx               \n\t" "movw %17, %%ax                \n\t"     // ax = gg
 466           "mulb %9                       \n\t"  // ax = cpc*gg
 467           "addw %%cx, %%ax               \n\t"  // ax = cpc*gg + cpcpc*rr
 468           "addw %18, %%ax                \n\t"  // ax = cpcpc*rr + cpc*gg + bb
 469            "movl %8, %%ecx                \n\t"
 470           //agi
 471           "leal (%%ecx, %%eax, 4), %%edx \n\t"
 472           //agi
 473           "movb (%%edx), %%cl            \n\t"  // cl = pixels[ax]
 474           // store the pixel
 475           "movl %1, %%eax                \n\t" "movb %%cl, (%%eax)            \n\t"     // *ximage = cl
 476           "incl %1                       \n\t"  // ximage++
 477           // prepare for next iteration on X
 478            "addl $4, %19                  \n\t" // nerr += 4
 479           "addl $4, %%ebx                \n\t"  // err += 4
 480            "addl %12, %%esi               \n\t" // image->data += bpp
 481            "decl %14                      \n\t" // x--
 482           "jnz .LoopXb                   \n\t"  // if x>0, goto .LoopX
 483            "movl %13, %%eax               \n\t" "addl %%eax, %1                \n\t"    // add extra offset to ximage
 484            "jmp .LoopYb                   \n" ".Endb:                                 \n\t" "emms                          \n\t" "popal                         \n\t":: "m" (image),    // %0
 485           "m"(ximage),          // %1
 486           "m"(err),             // %2
 487           "m"(nerr),            // %3
 488           "m"(ctable),          // %4
 489           "m"(dr),              // %5
 490           "m"(dg),              // %6
 491           "m"(db),              // %7
 492           "m"(pixels),          // %8
 493           "m"(cpc),             // %9
 494           "m"(width),           // %10
 495           "m"(height),          // %11
 496           "m"(bytesPerPixel),   // %12
 497           "m"(line_offset),     // %13
 498           "m"(x),               // %14
 499           "m"(cpcpc),           // %15
 500           "m"(rr),              // %16
 501           "m"(gg),              // %17
 502           "m"(bb),              // %18
 503           "m"(tmp_err),         // %19
 504           "m"(tmp_nerr),        // %20
 505           "m"(ndr),             // %21
 506           "m"(ndg),             // %22
 507           "m"(ndb)              // %23
 508             );
 509 }
 510
 511 #endif                          /* ASM_X86 */