Make compilation even less verbose
[wmaker-crm.git] / wrlib / x86_specific.c
bloba67f5717472c4e4ae278272bc51b368d4fb10e7a
1 /* x86_specific.c - convert RImage to XImage with x86 optimizations
3 * Raster graphics library
5 * Copyright (c) 2000-2003 Alfredo K. Kojima
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Library General Public
9 * License as published by the Free Software Foundation; either
10 * version 2 of the License, or (at your option) any later version.
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Library General Public License for more details.
17 * You should have received a copy of the GNU Library General Public
18 * License along with this library; if not, write to the Free
19 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22 #include <config.h>
24 #ifdef ASM_X86
27 #ifdef ASM_X86_MMX
/*
 * x86_check_mmx - report whether the CPU advertises MMX support.
 *
 * Returns 1 when the CPUID instruction is usable (bit 21 of EFLAGS can be
 * toggled) and CPUID function 1 sets bit 23 of EDX; returns 0 otherwise.
 * The probe runs once; the answer is cached in a function-local static
 * and returned directly on later calls.
 *
 * The probe sequence (pushal/pushfl/popfl) only assembles for 32-bit x86,
 * so on any other target the function simply reports 0.
 */
int
x86_check_mmx(void)
{
    static int result = -1;

    if (result >= 0)
        return result;

    result = 0;

#if defined(__i386__)
    /*
     * "+m": the not-supported path leaves `result` untouched, so the asm
     * must be declared as reading it too; with a write-only "=m" the
     * compiler is free to drop the `result = 0` store above.
     * Numeric local labels keep the symbols unique if the asm is ever
     * duplicated (inlining/cloning).
     */
    __asm__ __volatile__
        ("pushal                        \n\t" // save every GPR; none is declared clobbered
         "pushfl                        \n\t" // check whether cpuid is supported
         "popl %%eax                    \n\t"
         "movl %%eax, %%ebx             \n\t" // ebx = original EFLAGS
         "xorl $(1<<21), %%eax          \n\t" // flip the ID bit
         "pushl %%eax                   \n\t"
         "popfl                         \n\t"
         "pushfl                        \n\t"
         "popl %%eax                    \n\t"
         "xorl %%ebx, %%eax             \n\t"
         "andl $(1<<21), %%eax          \n\t" // did the ID bit actually change?
         "jz 1f                         \n\t" // no: cpuid is not available
         "movl $1, %%eax                \n\t" // cpuid function 1: feature flags
         "cpuid                         \n\t"
         "testl $(1<<23), %%edx         \n\t" // EDX bit 23
         "jz 1f                         \n\t"
         "popal                         \n\t" // restore first: the address of %0
         "movl $1, %0                   \n\t" // may be held in a register
         "jmp 2f                        \n"
         "1:                            \n\t"
         "popal                         \n"
         "2:                            \n\t"
         : "+m" (result)
         :
         : "cc");
#endif

    return result;
}
75 * TODO:
76 * 32/8 24/8 32/16 24/16 32/24 24/24
77 * PPlain YES YES
78 * MMX DONE
81 * - try to align stack (local variable space) into quadword boundary
/*
 * x86_mmx_TrueColor_32_to_16 - convert 4-byte-per-pixel image data to
 * 16-bit pixels using MMX, one pixel per inner-loop iteration.
 *
 *   image        source bytes; 4 bytes are consumed per pixel.  Each pixel
 *                is fetched with an 8-byte load, so the buffer must be
 *                readable 4 bytes past the last pixel.
 *   ximage       destination; one 16-bit value is stored per pixel and
 *                line_offset (bytes) is added to the pointer after each row.
 *   err, nerr    error rows, four 16-bit words (8 bytes) per pixel.  Which
 *                one is the "current" row and which the "next" row is
 *                chosen per row from the parity of the remaining row count.
 *                Entries at pixel offsets -1 and +1 are accessed.
 *   rtable, gtable, btable
 *                lookup tables indexed by the (error adjusted) component.
 *   dr, dg, db   multipliers applied to the table values to rebuild the
 *                represented intensity (low 16 bits are used).
 *   roffs, goffs, boffs
 *                left-shift counts used to pack the three table values.
 *   width        pixels per row.  The X loop is a do/while (decl + jnz),
 *                so width is presumably required to be > 0 - confirm
 *                against the callers.
 *   height       number of rows; the parameter itself is used as the row
 *                counter.
 *
 * The asm writes through ximage/err/nerr and changes the flags, so it
 * declares "memory" and "cc" clobbers; without them the compiler may cache
 * or reorder accesses to those buffers across the statement.
 *
 * NOTE(review): the instruction stream is deliberately left as it was.
 * The following look wrong but must be fixed together and tested on real
 * hardware, not piecemeal:
 *   - "psrlw %mm7, ..." uses mm7 = 0x0004000400040004 as a shift count.
 *     The register form takes the whole 64-bit value as the count, which
 *     would zero the result instead of dividing by 16 - confirm against
 *     the PSRLW reference.  psrlw is also a logical shift applied to
 *     signed error terms.
 *   - The third error term adds 8(%ebx) but stores to (%ebx).
 *   - The weights 5 and 3 are applied to next[x-1] and next[x]
 *     respectively, the reverse of x86_PseudoColor_32_to_8.
 *   - paddusb saturates per byte although the operands are 16-bit words.
 *   - "pushl %ebx" moves %esp while "m" operands are still in use; this
 *     presumably relies on the operands being addressed via %ebp.
 *   - Several operands listed as inputs are written by the asm.
 */
void
x86_mmx_TrueColor_32_to_16(unsigned char *image,
                           unsigned short *ximage,
                           short *err,
                           short *nerr,
                           unsigned short *rtable,
                           unsigned short *gtable,
                           unsigned short *btable,
                           int dr,
                           int dg,
                           int db,
                           unsigned int roffs,
                           unsigned int goffs,
                           unsigned int boffs,
                           int width,
                           int height,
                           int line_offset)
{
    /* scratch quadword: table values rr|gg|bb|0, also used to build constants */
    union {
        long long rrggbbaa;
        struct {
            short int rr, gg, bb, aa;
        } words;
    } rrggbbaa;

    /* scratch quadword: current pixel, one 16-bit word per component */
    union {
        long long pixel;
        struct {
            short int rr, gg, bb, aa;
        } words;
    } pixel;

    short *tmp_err;     /* %24: walks the NEXT row's error buffer    */
    short *tmp_nerr;    /* %25: walks the CURRENT row's error buffer */
    int x;              /* %26: pixels left in the current row       */

    __asm__ __volatile__
        (
         "pushl %%ebx                   \n\t"

         // pack dr, dg and db into mm6
         "movl %7, %%eax                \n\t"
         "movl %8, %%ebx                \n\t"
         "movl %9, %%ecx                \n\t"
         "movw %%ax, %16                \n\t"
         "movw %%bx, %17                \n\t"
         "movw %%cx, %18                \n\t"
         "movw $0, %19                  \n\t"

         "movq %16, %%mm6               \n\t" // mm6 = dr dg db 0

         // pack 4|4|4|4 into mm7, used as the shift count below
         "movl $0x00040004, %16         \n\t"
         "movl $0x00040004, %18         \n\t"
         "movq %16, %%mm7               \n\t"

         // constant multipliers for the error distribution
         "movl $0x00070007, %16         \n\t"
         "movl $0x00070007, %18         \n\t"
         "movq %16, %%mm5               \n\t" // mm5 = 7|7|7|7

         "movl $0x00050005, %16         \n\t"
         "movl $0x00050005, %18         \n\t"
         "movq %16, %%mm4               \n\t" // mm4 = 5|5|5|5

         "movl $0x00030003, %16         \n\t"
         "movl $0x00030003, %18         \n\t"
         "movq %16, %%mm3               \n\t" // mm3 = 3|3|3|3

         // process 1 pixel / cycle, each component treated as 16bit
         "movl %0, %%esi                \n"   // esi = image

         ".LoopYa:                      \n\t"
         "movl %13, %%eax               \n\t"
         "movl %%eax, %26               \n\t" // x = width

         "movl %14, %%eax               \n\t"
         "decl %%eax                    \n\t" // height--
         "movl %%eax, %14               \n\t"
         "js .Enda                      \n\t" // if height < 0, goto end
         "andl $1, %%eax                \n\t"
         "jz .LoopY_1a                  \n"   // even remaining count: swapped buffers

         ".LoopY_0a:                    \n\t"

         "movl %2, %%ebx                \n\t" // ebx = err
         "movl %%ebx, %25               \n\t" // current row errors = err
         "movl %3, %%eax                \n\t"
         "movl %%eax, %24               \n\t" // next row errors = nerr

         "jmp .LoopXa                   \n"

         ".LoopY_1a:                    \n\t"

         "movl %3, %%ebx                \n\t" // ebx = nerr
         "movl %%ebx, %25               \n\t" // current row errors = nerr
         "movl %2, %%eax                \n\t"
         "movl %%eax, %24               \n\t" // next row errors = err

         ".align 16                     \n"
         ".LoopXa:                      \n\t"

         // calculate errors and pixel components

         // depend on ebx, esi, mm6
         "movq (%%ebx), %%mm1           \n\t" // mm1 = current error words
         "punpcklbw (%%esi), %%mm0      \n\t" // image bytes into the high byte of each word
         "psrlw $8, %%mm0               \n\t" // move them down: mm0 = 4 zero-extended components
         "paddusb %%mm1, %%mm0          \n\t" // mm0 += mm1 (unsigned byte saturation)
         "movq %%mm0, %20               \n\t" // save the pixel

         "movzwl %20, %%ecx             \n\t" // ecx = pixel.red
         "movl %4, %%edi                \n\t" // edi = rtable
         "leal (%%edi, %%ecx, 2), %%eax \n\t" // eax = &rtable[pixel.red]
         "movw (%%eax), %%dx            \n\t" // dx = rtable[pixel.red]
         "movw %%dx, %16                \n\t" // save rr

         "movzwl %21, %%ecx             \n\t" // ecx = pixel.green
         "movl %5, %%edi                \n\t" // edi = gtable
         "leal (%%edi, %%ecx, 2), %%eax \n\t" // eax = &gtable[pixel.green]
         "movw (%%eax), %%dx            \n\t" // dx = gtable[pixel.green]
         "movw %%dx, %17                \n\t" // save gg

         "movzwl %22, %%ecx             \n\t" // ecx = pixel.blue
         "movl %6, %%edi                \n\t" // edi = btable
         "leal (%%edi, %%ecx, 2), %%eax \n\t" // eax = &btable[pixel.blue]
         "movw (%%eax), %%dx            \n\t" // dx = btable[pixel.blue]
         "movw %%dx, %18                \n\t" // save bb

         "movw $0, %19                  \n\t" // save dummy aa

         "movq %16, %%mm1               \n\t" // load mm1 with rrggbbaa
         "pmullw %%mm6, %%mm1           \n\t" // mm1 = rr*dr|gg*dg|bb*db|0
         "psubsw %%mm1, %%mm0           \n\t" // error = pixel - mm1

         // distribute the error

         // depend on mm0, mm7, mm3, mm4, mm5

         "movl %25, %%ebx               \n\t" // ebx = current row errors

         "movq %%mm0, %%mm1             \n\t"
         "pmullw %%mm5, %%mm1           \n\t" // mm1 = error * 7
         "psrlw %%mm7, %%mm1            \n\t" // shift by mm7 (see NOTE in header)
         "paddw 8(%%ebx), %%mm1         \n\t"
         "movq %%mm1, 8(%%ebx)          \n\t" // current[x+1] += mm1

         "movl %24, %%ebx               \n\t" // ebx = next row errors

         "movq %%mm0, %%mm1             \n\t"
         "pmullw %%mm4, %%mm1           \n\t" // mm1 = error * 5
         "psrlw %%mm7, %%mm1            \n\t" // shift by mm7
         "paddw -8(%%ebx), %%mm1        \n\t"
         "movq %%mm1, -8(%%ebx)         \n\t" // next[x-1] += mm1

         "movq %%mm0, %%mm1             \n\t"
         "pmullw %%mm3, %%mm1           \n\t" // mm1 = error * 3
         "psrlw %%mm7, %%mm1            \n\t" // shift by mm7
         "paddw 8(%%ebx), %%mm1         \n\t" // adds next[x+1] ...
         "movq %%mm1, (%%ebx)           \n\t" // ... but stores to next[x] (see NOTE)

         "psrlw %%mm7, %%mm0            \n\t" // shift by mm7
         "movq %%mm0, 8(%%ebx)          \n\t" // next[x+1] = mm0

         // calculate final pixel value and store
         "movl %10, %%ecx               \n\t"
         "movw %16, %%ax                \n\t"
         "shlw %%cl, %%ax               \n\t" // ax = rr << roffs

         "movl %11, %%ecx               \n\t"
         "movw %17, %%bx                \n\t"
         "shlw %%cl, %%bx               \n\t" // bx = gg << goffs
         "orw %%bx, %%ax                \n\t"

         "movl %12, %%ecx               \n\t"
         "movw %18, %%bx                \n\t"
         "shlw %%cl, %%bx               \n\t" // bx = bb << boffs
         "orw %%bx, %%ax                \n\t"

         "movl %1, %%edx                \n\t"
         "movw %%ax, (%%edx)            \n\t" // *ximage = packed pixel
         "addl $2, %%edx                \n\t" // increment ximage
         "movl %%edx, %1                \n\t"

         // prepare for next iteration on X

         "addl $8, %24                  \n\t" // next row errors += 8 bytes

         "movl %25, %%ebx               \n\t"
         "addl $8, %%ebx                \n\t"
         "movl %%ebx, %25               \n\t" // ebx = current row errors += 8 bytes

         // Note: in the last pixel, this would cause an invalid memory access
         // because, punpcklbw is used (which reads 8 bytes) and the last
         // pixel is only 4 bytes. This is no problem because the image data
         // was allocated with extra 4 bytes when created.
         "addl $4, %%esi                \n\t" // image += 4

         "decl %26                      \n\t" // x--
         "jnz .LoopXa                   \n\t" // if x != 0, goto .LoopXa

         // depend on edx (still holds the advanced ximage pointer)
         "addl %15, %%edx               \n\t" // add extra offset to ximage
         "movl %%edx, %1                \n\t"

         "jmp .LoopYa                   \n"

         ".Enda:                        \n\t" // THE END
         "emms                          \n\t"
         "popl %%ebx                    \n\t"
         :
         :
         "m" (image),                   // %0
         "m" (ximage),                  // %1
         "m" (err),                     // %2
         "m" (nerr),                    // %3
         "m" (rtable),                  // %4
         "m" (gtable),                  // %5
         "m" (btable),                  // %6
         "m" (dr),                      // %7
         "m" (dg),                      // %8
         "m" (db),                      // %9
         "m" (roffs),                   // %10
         "m" (goffs),                   // %11
         "m" (boffs),                   // %12
         "m" (width),                   // %13
         "m" (height),                  // %14
         "m" (line_offset),             // %15
         "m" (rrggbbaa.words.rr),       // %16 (access to rr)
         "m" (rrggbbaa.words.gg),       // %17 (access to gg)
         "m" (rrggbbaa.words.bb),       // %18 (access to bb)
         "m" (rrggbbaa.words.aa),       // %19 (access to aa)
         "m" (pixel.words.rr),          // %20 (access to pixel.r)
         "m" (pixel.words.gg),          // %21 (access to pixel.g)
         "m" (pixel.words.bb),          // %22 (access to pixel.b)
         "m" (pixel.words.aa),          // %23 (access to pixel.a)
         "m" (tmp_err),                 // %24
         "m" (tmp_nerr),                // %25
         "m" (x)                        // %26
         : "eax", "ecx", "edx", "esi", "edi", "cc", "memory"
        );
}
/*
 * x86_mmx_TrueColor_24_to_16 - UNFINISHED skeleton of a 24-bit to 16-bit
 * MMX converter.
 *
 * Only the loop structure exists: for every row it selects the current and
 * next error buffers, then runs one loop width/4 times (meant to convert
 * 4 pixels per pass) and a second loop width%4 times (remaining pixels).
 * Neither loop contains conversion code, and nothing is read from image or
 * written to ximage.  The parameters mirror x86_mmx_TrueColor_32_to_16.
 *
 * Changes relative to the previous skeleton:
 *   - height is decremented once per row (it used to be decremented twice,
 *     so only every other row was visited);
 *   - both row-setup paths now load x = w1 before the 4-pixel loop (one
 *     path used to store the err pointer into x, giving a practically
 *     endless loop, and the other entered the loop with x = width);
 *   - "cc" and "memory" clobbers are declared.
 *
 * NOTE(review): "pushl %ebx" moves %esp while "m" operands are in use;
 * this presumably relies on the operands being addressed via %ebp.
 */
void
x86_mmx_TrueColor_24_to_16(unsigned char *image,
                           unsigned short *ximage,
                           short *err,
                           short *nerr,
                           short *rtable,
                           short *gtable,
                           short *btable,
                           int dr,
                           int dg,
                           int db,
                           unsigned int roffs,
                           unsigned int goffs,
                           unsigned int boffs,
                           int width,
                           int height,
                           int line_offset)
{
    /* scratch quadwords; not referenced by the skeleton's instructions yet */
    union {
        long long rrggbbaa;
        struct {
            short int rr, gg, bb, aa;
        } words;
    } rrggbbaa;

    union {
        long long pixel;
        struct {
            short int rr, gg, bb, aa;
        } words;
    } pixel;

    short *tmp_err;     /* %24: next row's error buffer    */
    short *tmp_nerr;    /* %25: current row's error buffer */

    int x;              /* %26: loop counter               */
    int w1;             /* %27: width / 4                  */
    int w2;             /* %28: width % 4                  */

    __asm__ __volatile__
        (
         "pushl %%ebx                   \n\t"

         "movl %13, %%eax               \n\t" // eax = width
         "movl %%eax, %%ebx             \n\t"
         "shrl $2, %%eax                \n\t"
         "movl %%eax, %27               \n\t" // w1 = width / 4
         "andl $3, %%ebx                \n\t"
         "movl %%ebx, %28               \n"   // w2 = width %% 4

         ".LoopYc:                      \n\t"
         "movl %14, %%eax               \n\t"
         "decl %%eax                    \n\t" // height--
         "movl %%eax, %14               \n\t"
         "js .Endc                      \n\t" // if height < 0, goto end
         "andl $1, %%eax                \n\t"
         "jz .LoopY_1c                  \n"   // even remaining count: swapped buffers

         ".LoopY_0c:                    \n\t"

         "movl %2, %%ebx                \n\t" // ebx = err
         "movl %%ebx, %25               \n\t" // current row errors = err
         "movl %3, %%eax                \n\t"
         "movl %%eax, %24               \n\t" // next row errors = nerr

         "jmp .RowStartc                \n"

         ".LoopY_1c:                    \n\t"

         "movl %3, %%ebx                \n\t" // ebx = nerr
         "movl %%ebx, %25               \n\t" // current row errors = nerr
         "movl %2, %%eax                \n\t"
         "movl %%eax, %24               \n"   // next row errors = err

         ".RowStartc:                   \n\t"
         "movl %27, %%eax               \n\t"
         "movl %%eax, %26               \n\t" // x = w1

         ".align 16                     \n"
         ".LoopX_1c:                    \n\t"
         "decl %26                      \n\t" // x--
         "js .Xend1_c                   \n\t" // if x < 0 then end

         // do conversion of 4 pixels (placeholder: not implemented)
         "movq %2, %%mm0                \n\t" // loads 8 bytes at the err variable itself

         "jmp .LoopX_1c                 \n"
         ".Xend1_c:                     \n\t"

         "movl %28, %%eax               \n\t"
         "movl %%eax, %26               \n"   // x = w2
         ".LoopX_2c:                    \n\t"
         "decl %26                      \n\t" // x--
         "js .Xend2_c                   \n\t"
         // do conversion (placeholder: not implemented)
         "jmp .LoopX_2c                 \n"
         ".Xend2_c:                     \n\t"

         "jmp .LoopYc                   \n"

         ".Endc:                        \n\t" // THE END
         "emms                          \n\t"
         "popl %%ebx                    \n\t"
         :
         :
         "m" (image),                   // %0
         "m" (ximage),                  // %1
         "m" (err),                     // %2
         "m" (nerr),                    // %3
         "m" (rtable),                  // %4
         "m" (gtable),                  // %5
         "m" (btable),                  // %6
         "m" (dr),                      // %7
         "m" (dg),                      // %8
         "m" (db),                      // %9
         "m" (roffs),                   // %10
         "m" (goffs),                   // %11
         "m" (boffs),                   // %12
         "m" (width),                   // %13
         "m" (height),                  // %14
         "m" (line_offset),             // %15
         "m" (rrggbbaa.words.rr),       // %16 (access to rr)
         "m" (rrggbbaa.words.gg),       // %17 (access to gg)
         "m" (rrggbbaa.words.bb),       // %18 (access to bb)
         "m" (rrggbbaa.words.aa),       // %19 (access to aa)
         "m" (pixel.words.rr),          // %20 (access to pixel.r)
         "m" (pixel.words.gg),          // %21 (access to pixel.g)
         "m" (pixel.words.bb),          // %22 (access to pixel.b)
         "m" (pixel.words.aa),          // %23 (access to pixel.a)
         "m" (tmp_err),                 // %24
         "m" (tmp_nerr),                // %25
         "m" (x),                       // %26
         "m" (w1),                      // %27
         "m" (w2)                       // %28
         : "eax", "ecx", "edx", "esi", "edi", "cc", "memory"
        );
}
489 #endif /* ASM_X86_MMX */
/*
 * x86_PseudoColor_32_to_8 - convert image data to 8-bit colormap pixels
 * with error diffusion, in plain (non-MMX) i386 assembly.
 *
 *   image          source bytes; bytes 0, 1 and 2 of each pixel are read as
 *                  red, green and blue, and the pointer advances by
 *                  bytesPerPixel per pixel.
 *   ximage         destination; one byte is stored per pixel and
 *                  line_offset is added to the pointer after each row.
 *   err, nerr      signed-byte error rows laid out 4 bytes per pixel
 *                  (r, g, b, unused).  Current/next roles are chosen per
 *                  row from the parity of the remaining row count.  The
 *                  entries for pixel -1 and pixel width are accessed.
 *   ctable         16-bit table indexed by the error-adjusted component
 *                  clamped to 0..255; yields the colour-cube coordinate.
 *   dr, dg, db     per-channel step; coordinate * step is subtracted from
 *                  the component to obtain the error (only the low byte
 *                  of each value is used by the 8-bit multiply).
 *   pixels         table of 4-byte entries indexed by
 *                  rr*cpc*cpc + gg*cpc + bb; the low byte of the entry is
 *                  the output pixel.
 *   cpc            multiplier used to build that index (8-bit multiplies:
 *                  presumably small values only - confirm).
 *   width, height  image size.  The X loop is a do/while (decl + jnz), so
 *                  width is presumably required to be > 0 - confirm.
 *
 * Error weights: 7/16 to current[x+1], 5/16 to next[x], 3/16 to next[x-1];
 * next[x+1] is overwritten with 1/16.
 *
 * Fixes relative to the previous revision:
 *   - the blue 3/16 term was added at -4(next), i.e. into the RED slot of
 *     pixel x-1; it now goes to -2(next) like red (-4) and green (-3);
 *   - the red table lookup used a 32-bit load, reading 2 bytes past the
 *     16-bit entry; it now uses a 16-bit load like green and blue;
 *   - the trailing "emms" was removed: no MMX register is used here and
 *     this routine is built even when ASM_X86_MMX is not defined;
 *   - unused locals/operands (tmp_nerr, ndr, ndg, ndb) were dropped;
 *   - "cc" and "memory" clobbers are declared, since the asm stores
 *     through ximage/err/nerr.
 *
 * NOTE(review): "pushal" moves %esp by 32 bytes while "m" operands are in
 * use; this presumably relies on the operands being addressed via %ebp.
 * Several operands listed as inputs (ximage, height, x, cpcpc, rr, gg, bb,
 * tmp_err) are written by the asm.
 */
void
x86_PseudoColor_32_to_8(unsigned char *image,
                        unsigned char *ximage,
                        char *err,
                        char *nerr,
                        short *ctable,
                        int dr,
                        int dg,
                        int db,
                        unsigned long *pixels,
                        int cpc,
                        int width,
                        int height,
                        int bytesPerPixel,
                        int line_offset)
{
    int x;              /* %14: pixels left in the current row */
    int cpcpc;          /* %15: cpc * cpc                      */

    int rr;             /* %16..%18: cube coordinates of the   */
    int gg;             /*           current pixel (low 16     */
    int bb;             /*           bits are used)            */

    char *tmp_err;      /* %19: walks the NEXT row's error buffer */

    __asm__ __volatile__
        (
         "pushal                        \n\t"

         "movl %9, %%eax                \n\t"
         "mulb %9                       \n\t"
         "movl %%eax, %15               \n\t" // cpcpc = cpc*cpc

         // eax will always be <= 0xffff

         // process 1 pixel / cycle, each component treated as 16bit
         "movl %0, %%esi                \n"   // esi = image

         ".LoopYb:                      \n\t"
         "movl %10, %%ecx               \n\t"
         "movl %%ecx, %14               \n\t" // x = width

         "movl %11, %%ecx               \n\t"
         "decl %%ecx                    \n\t" // height--
         "movl %%ecx, %11               \n\t"
         "js .Endb                      \n\t" // if height < 0, goto end
         "andl $1, %%ecx                \n\t"
         "jz .LoopY_1b                  \n"   // even remaining count: swapped buffers

         ".LoopY_0b:                    \n\t"

         "movl %2, %%ebx                \n\t" // ebx = err (current row)
         "movl %3, %%ecx                \n\t"
         "movl %%ecx, %19               \n\t" // next row errors = nerr

         "movl $0, (%%ecx)              \n\t" // clear the first next-row entry

         "jmp .LoopXb                   \n"

         ".LoopY_1b:                    \n\t"

         "movl %3, %%ebx                \n\t" // ebx = nerr (current row)
         "movl %2, %%ecx                \n\t"
         "movl %%ecx, %19               \n\t" // next row errors = err

         "movl $0, (%%ecx)              \n\t" // clear the first next-row entry

         ".align 16                     \n"
         ".LoopXb:                      \n\t"

         "movl %4, %%edi                \n\t" // edi = ctable
         "xorl %%edx, %%edx             \n\t" // zero the upper word of edx

         // RED

         // depends on ebx==current errors, esi==image, edi
         "movzbw (%%esi), %%dx          \n\t" // dx = image[0]
         "movsbw (%%ebx), %%ax          \n\t" // ax = error[0]
         "addw %%ax, %%dx               \n\t" // pixel.red = image[0] + error[0]

         "testb %%dh, %%dh              \n\t" // test if pixel.red < 0 or > 255
         "jz .OKRb                      \n\t" // 0 <= pixel.red <= 255
         "js .NEGRb                     \n\t" // pixel.red < 0
         "movw $0xff, %%dx              \n\t" // pixel.red > 255
         "jmp .OKRb                     \n"
         ".NEGRb:                       \n\t"
         "xorw %%dx, %%dx               \n"
         ".OKRb:                        \n\t"
         "leal (%%edi, %%edx, 2), %%ecx \n\t" // ecx = &ctable[pixel.red]
         "movw (%%ecx), %%ax            \n\t" // ax = ctable[pixel.red] (16-bit entry)
         "movw %%ax, %16                \n\t" // save rr

         "mulb %5                       \n\t" // ax = rr*dr
         "subw %%ax, %%dx               \n\t" // rer = dx = dx - rr*dr

         "movswl %%dx, %%eax            \n\t" // save rer

         // distribute error
         "leal (, %%eax, 8), %%ecx      \n\t"
         "subw %%dx, %%cx               \n\t" // cx = rer * 7
         "sarw $4, %%cx                 \n\t" // cx = rer * 7 / 16
         "addb %%cl, 4(%%ebx)           \n\t" // err[x+1] += rer * 7 / 16

         "movl %19, %%ecx               \n\t" // ecx = next row errors

         "leaw (%%eax, %%eax, 4), %%dx  \n\t" // dx = rer * 5
         "sarw $4, %%dx                 \n\t" // dx = rer * 5 / 16
         "addb %%dl, (%%ecx)            \n\t" // nerr[x] += rer * 5 / 16

         "leaw (%%eax, %%eax, 2), %%dx  \n\t" // dx = rer * 3
         "sarw $4, %%dx                 \n\t" // dx = rer * 3 / 16
         "addb %%dl, -4(%%ecx)          \n\t" // nerr[x-1] += rer * 3 / 16

         "sarw $4, %%ax                 \n\t" // ax = rer / 16
         "movb %%al, 4(%%ecx)           \n\t" // nerr[x+1] = rer / 16

         // GREEN

         // depends on ebx, esi, edi
         "movzbw 1(%%esi), %%dx         \n\t" // dx = image[1]
         "movsbw 1(%%ebx), %%ax         \n\t" // ax = error[1]
         "addw %%ax, %%dx               \n\t" // pixel.grn = image[1] + error[1]

         "testb %%dh, %%dh              \n\t" // test if pixel.grn < 0 or > 255
         "jz .OKGb                      \n\t" // 0 <= pixel.grn <= 255
         "js .NEGGb                     \n\t" // pixel.grn < 0
         "movw $0xff, %%dx              \n\t" // pixel.grn > 255
         "jmp .OKGb                     \n"
         ".NEGGb:                       \n\t"
         "xorw %%dx, %%dx               \n"
         ".OKGb:                        \n\t"
         "leal (%%edi, %%edx, 2), %%ecx \n\t" // ecx = &ctable[pixel.grn]
         "movw (%%ecx), %%ax            \n\t" // ax = ctable[pixel.grn]
         "movw %%ax, %17                \n\t" // save gg

         "mulb %6                       \n\t" // ax = gg*dg
         "subw %%ax, %%dx               \n\t" // ger = dx = dx - gg*dg

         "movswl %%dx, %%eax            \n\t" // save ger

         // distribute error

         "leal (, %%eax, 8), %%ecx      \n\t"
         "subw %%dx, %%cx               \n\t" // cx = ger * 7
         "sarw $4, %%cx                 \n\t" // cx = ger * 7 / 16
         "addb %%cl, 5(%%ebx)           \n\t" // err[x+1] += ger * 7 / 16

         "movl %19, %%ecx               \n\t" // ecx = next row errors

         "leaw (%%eax, %%eax, 4), %%dx  \n\t" // dx = ger * 5
         "sarw $4, %%dx                 \n\t" // dx = ger * 5 / 16
         "addb %%dl, 1(%%ecx)           \n\t" // nerr[x] += ger * 5 / 16

         "leaw (%%eax, %%eax, 2), %%dx  \n\t" // dx = ger * 3
         "sarw $4, %%dx                 \n\t" // dx = ger * 3 / 16
         "addb %%dl, -3(%%ecx)          \n\t" // nerr[x-1] += ger * 3 / 16

         "sarw $4, %%ax                 \n\t" // ax = ger / 16
         "movb %%al, 5(%%ecx)           \n\t" // nerr[x+1] = ger / 16

         // BLUE

         // depends on ebx, esi
         "movzbw 2(%%esi), %%dx         \n\t" // dx = image[2]
         "movsbw 2(%%ebx), %%ax         \n\t" // ax = error[2]
         "addw %%ax, %%dx               \n\t" // pixel.blu = image[2] + error[2]

         "testb %%dh, %%dh              \n\t" // test if pixel.blu < 0 or > 255
         "jz .OKBb                      \n\t" // 0 <= pixel.blu <= 255
         "js .NEGBb                     \n\t" // pixel.blu < 0
         "movw $0xff, %%dx              \n\t" // pixel.blu > 255
         "jmp .OKBb                     \n"
         ".NEGBb:                       \n\t"
         "xorw %%dx, %%dx               \n"
         ".OKBb:                        \n\t"
         "leal (%%edi, %%edx, 2), %%ecx \n\t" // ecx = &ctable[pixel.blu]
         "movw (%%ecx), %%ax            \n\t" // ax = ctable[pixel.blu]
         "movw %%ax, %18                \n\t" // save bb

         "mulb %7                       \n\t" // ax = bb*db
         "subw %%ax, %%dx               \n\t" // ber = dx = dx - bb*db
         "movswl %%dx, %%eax            \n\t" // save ber

         // distribute error
         "leal (, %%eax, 8), %%ecx      \n\t"
         "subw %%dx, %%cx               \n\t" // cx = ber * 7
         "sarw $4, %%cx                 \n\t" // cx = ber * 7 / 16
         "addb %%cl, 6(%%ebx)           \n\t" // err[x+1] += ber * 7 / 16

         "movl %19, %%ecx               \n\t" // ecx = next row errors

         "leaw (%%eax, %%eax, 4), %%dx  \n\t" // dx = ber * 5
         "sarw $4, %%dx                 \n\t" // dx = ber * 5 / 16
         "addb %%dl, 2(%%ecx)           \n\t" // nerr[x] += ber * 5 / 16

         "leaw (%%eax, %%eax, 2), %%dx  \n\t" // dx = ber * 3
         "sarw $4, %%dx                 \n\t" // dx = ber * 3 / 16
         "addb %%dl, -2(%%ecx)          \n\t" // nerr[x-1] += ber * 3 / 16 (blue slot)

         "sarw $4, %%ax                 \n\t" // ax = ber / 16
         "movb %%al, 6(%%ecx)           \n\t" // nerr[x+1] = ber / 16

         "andl $0xffff, %%eax           \n\t"
         // depends on eax & 0xffff0000 == 0
         // calculate the index of the value of the pixel
         "movw %16, %%ax                \n\t" // ax = rr
         "mulb %15                      \n\t" // ax = cpcpc*rr
         "movw %%ax, %%cx               \n\t"
         "movw %17, %%ax                \n\t" // ax = gg
         "mulb %9                       \n\t" // ax = cpc*gg
         "addw %%cx, %%ax               \n\t" // ax = cpc*gg + cpcpc*rr
         "addw %18, %%ax                \n\t" // ax = cpcpc*rr + cpc*gg + bb

         "movl %8, %%ecx                \n\t"
         "leal (%%ecx, %%eax, 4), %%edx \n\t" // edx = &pixels[ax]
         "movb (%%edx), %%cl            \n\t" // cl = low byte of pixels[ax]

         // store the pixel
         "movl %1, %%eax                \n\t"
         "movb %%cl, (%%eax)            \n\t" // *ximage = cl
         "incl %1                       \n\t" // ximage++

         // prepare for next iteration on X

         "addl $4, %19                  \n\t" // next row errors += 4
         "addl $4, %%ebx                \n\t" // current row errors += 4

         "addl %12, %%esi               \n\t" // image += bytesPerPixel

         "decl %14                      \n\t" // x--
         "jnz .LoopXb                   \n\t" // if x != 0, goto .LoopXb

         "movl %13, %%eax               \n\t"
         "addl %%eax, %1                \n\t" // add extra offset to ximage

         "jmp .LoopYb                   \n"

         ".Endb:                        \n\t"
         "popal                         \n\t"
         :
         :
         "m" (image),                   // %0
         "m" (ximage),                  // %1
         "m" (err),                     // %2
         "m" (nerr),                    // %3
         "m" (ctable),                  // %4
         "m" (dr),                      // %5
         "m" (dg),                      // %6
         "m" (db),                      // %7
         "m" (pixels),                  // %8
         "m" (cpc),                     // %9
         "m" (width),                   // %10
         "m" (height),                  // %11
         "m" (bytesPerPixel),           // %12
         "m" (line_offset),             // %13
         "m" (x),                       // %14
         "m" (cpcpc),                   // %15
         "m" (rr),                      // %16
         "m" (gg),                      // %17
         "m" (bb),                      // %18
         "m" (tmp_err)                  // %19
         : "cc", "memory"
        );
}
782 #endif /* ASM_X86 */