- fixed gcc-4 compilation issues (Vladimir Nadvornik <nadvornik@suse.cz>)
[wmaker-crm.git] / wrlib / x86_specific.c
blobacd9503cf66b029b7f61008e1e1897b9276b2001
1 /* x86_convert.c - convert RImage to XImage with x86 optimizations
3 * Raster graphics library
5 * Copyright (c) 2000-2003 Alfredo K. Kojima
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Library General Public
9 * License as published by the Free Software Foundation; either
10 * version 2 of the License, or (at your option) any later version.
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Library General Public License for more details.
17 * You should have received a copy of the GNU Library General Public
18 * License along with this library; if not, write to the Free
19 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22 #include <config.h>
24 #ifdef ASM_X86
27 #ifdef ASM_X86_MMX
/*
 * x86_check_mmx - tell whether the CPU running this code supports MMX.
 *
 * The probe runs once; its answer is cached in a function-local static and
 * returned directly by every later call.
 *
 * On i386 the function first checks that the CPUID instruction exists by
 * trying to toggle bit 21 of EFLAGS, then executes CPUID with EAX = 1 and
 * looks at bit 23 of EDX.  On x86-64 the EFLAGS probe is skipped and CPUID
 * is executed directly.
 *
 * Returns 1 when the MMX feature bit is set, 0 otherwise (including when
 * CPUID is unavailable or when built for a non-x86 target).
 */
int
x86_check_mmx(void)
{
    static int result = -1;
    unsigned int leaf = 1;      /* CPUID function number, in/out through EAX */
    unsigned int features = 0;  /* EDX as returned by CPUID function 1 */

    if (result >= 0)
        return result;

    /* Assume "no MMX" until the feature bit has actually been seen. */
    result = 0;

#if defined(__i386__)
    {
        unsigned int saved_flags;
        unsigned int probed_flags;

        /*
         * Only register operands are used here, so the temporary pushes
         * cannot invalidate any stack-relative operand address.
         */
        __asm__ volatile
            ("pushfl               \n\t"
             "popl    %0           \n\t"  /* %0 = current EFLAGS */
             "movl    %0, %1       \n\t"
             "xorl    %2, %1       \n\t"  /* flip bit 21 */
             "pushl   %1           \n\t"
             "popfl                \n\t"
             "pushfl               \n\t"
             "popl    %1           \n\t"  /* %1 = EFLAGS after the attempt */
             "pushl   %0           \n\t"
             "popfl                \n\t"  /* put the original EFLAGS back */
             : "=&r" (saved_flags), "=&r" (probed_flags)
             : "i" (1 << 21)
             : "cc");

        /* Bit 21 did not change: no CPUID instruction, hence no MMX. */
        if (((saved_flags ^ probed_flags) & (1u << 21)) == 0)
            return result;
    }

    /* %ebx is saved by hand because it may be reserved (e.g. for PIC). */
    __asm__ volatile
        ("pushl   %%ebx        \n\t"
         "cpuid                \n\t"
         "popl    %%ebx        \n\t"
         : "+a" (leaf), "=d" (features)
         :
         : "ecx", "cc");
#elif defined(__x86_64__)
    __asm__ volatile
        ("cpuid"
         : "+a" (leaf), "=d" (features)
         :
         : "rbx", "rcx", "cc");
#endif

    if (features & (1u << 23))
        result = 1;

    return result;
}
/*
 * TODO:
 *              32/8    24/8    32/16   24/16   32/24   24/24
 * PPlain       YES     YES
 * MMX                          DONE
 *
 * - try to align the stack (local variable space) on a quadword boundary
 */
83 void
84 x86_mmx_TrueColor_32_to_16(unsigned char *image,
85 unsigned short *ximage,
86 short *err,
87 short *nerr,
88 short *rtable,
89 short *gtable,
90 short *btable,
91 int dr,
92 int dg,
93 int db,
94 unsigned int roffs,
95 unsigned int goffs,
96 unsigned int boffs,
97 int width,
98 int height,
99 int line_offset)
101 union {
102 long long rrggbbaa;
103 struct {short int rr, gg, bb, aa;} words;
104 } rrggbbaa;
106 union {
107 long long pixel;
108 struct {short int rr, gg, bb, aa;} words;
109 } pixel;
111 short *tmp_err;
112 short *tmp_nerr;
113 int x;
115 asm volatile
117 "pushl %%ebx \n\t"
119 // pack dr, dg and db into mm6
120 "movl %7, %%eax \n\t"
121 "movl %8, %%ebx \n\t"
122 "movl %9, %%ecx \n\t"
123 "movw %%ax, %16 \n\t"
124 "movw %%bx, %17 \n\t"
125 "movw %%cx, %18 \n\t"
126 "movw $0, %19 \n\t"
128 "movq %16, %%mm6 \n\t" // dr dg db 0
130 // pack 4|4|4|4 into mm7, for shifting (/16)
131 "movl $0x00040004, %16 \n\t"
132 "movl $0x00040004, %18 \n\t"
133 "movq %16, %%mm7 \n\t"
135 // store constant values for using with mmx when dithering
136 "movl $0x00070007, %16 \n\t"
137 "movl $0x00070007, %18 \n\t"
138 "movq %16, %%mm5 \n\t"
140 "movl $0x00050005, %16 \n\t"
141 "movl $0x00050005, %18 \n\t"
142 "movq %16, %%mm4 \n\t"
144 "movl $0x00030003, %16 \n\t"
145 "movl $0x00030003, %18 \n\t"
146 "movq %16, %%mm3 \n\t"
148 // process 1 pixel / cycle, each component treated as 16bit
149 "movl %0, %%esi \n" // esi = image->data
151 ".LoopYa: \n\t"
152 "movl %13, %%eax \n\t"
153 "movl %%eax, %26 \n\t" // x = width
155 "movl %14, %%eax \n\t"
156 "decl %%eax \n\t" // y--
157 "movl %%eax, %14 \n\t"
158 "js .Enda \n\t" // if y < 0, goto end
159 "andl $1, %%eax \n\t"
160 "jz .LoopY_1a \n" // if (y&1) goto LoopY_1
162 ".LoopY_0a: \n\t"
164 "movl %2, %%ebx \n\t" // ebx = err
165 "movl %%ebx, %25 \n\t" // [-36] = err
166 "movl %3, %%eax \n\t" //
167 "movl %%eax, %24 \n\t" // [-32] = nerr
169 "jmp .LoopXa \n"
171 ".LoopY_1a: \n\t"
173 "movl %3, %%ebx \n\t" // ebx = nerr
174 "movl %%ebx, %25 \n\t" // [-36] = nerr
175 "movl %2, %%eax \n\t" //
176 "movl %%eax, %24 \n\t" // [-32] = eerr
178 ".align 16 \n"
179 ".LoopXa: \n\t"
181 // calculate errors and pixel components
183 // depend on ebx, esi, mm6
184 "movq (%%ebx), %%mm1 \n\t" // mm1 = error[0..3]
185 "punpcklbw (%%esi), %%mm0 \n\t" // mm0 = image->data[0..3]
186 "psrlw $8, %%mm0 \n\t" // fixup mm0
187 "paddusb %%mm1, %%mm0 \n\t" // mm0 = mm0 + mm1 (sat. to 255)
188 "movq %%mm0, %20 \n\t" // save the pixel
190 "movzwl %20, %%ecx \n\t" // ecx = pixel.red
191 "movl %4, %%edi \n\t" // edi = rtable
192 //agi
193 "leal (%%edi, %%ecx, 2), %%eax \n\t" // eax = &rtable[pixel.red]
194 // agi
195 "movw (%%eax), %%dx \n\t" // dx = rtable[pixel.red]
196 "movw %%dx, %16 \n\t" // save rr
198 "movzwl %21, %%ecx \n\t" // ecx = pixel.green
199 "movl %5, %%edi \n\t" // edi = gtable
200 //agi
201 "leal (%%edi, %%ecx, 2), %%eax \n\t" // eax = &gtable[pixel.green]
202 //agi
203 "movw (%%eax), %%dx \n\t" // dx = gtable[pixel.green]
204 "movw %%dx, %17 \n\t" // save gg
206 "movzwl %22, %%ecx \n\t" // ecx = pixel.blue
207 "movl %6, %%edi \n\t" // ebx = btable
208 //agi
209 "leal (%%edi, %%ecx, 2), %%eax \n\t" // eax = &btable[pixel.blue]
210 //agi
211 "movw (%%eax), %%dx \n\t" // dx = btable[pixel.blue]
212 "movw %%dx, %18 \n\t" // save bb
214 "movw $0, %19 \n\t" // save dummy aa
216 "movq %16, %%mm1 \n\t" // load mm1 with rrggbbaa
217 "pmullw %%mm6, %%mm1 \n\t" // mm1 = rr*dr|...
218 "psubsw %%mm1, %%mm0 \n\t" // error = pixel - mm1
221 // distribute the error
223 // depend on mm0, mm7, mm3, mm4, mm5
225 "movl %25, %%ebx \n\t"
227 "movq %%mm0, %%mm1 \n\t"
228 "pmullw %%mm5, %%mm1 \n\t" // mm1 = mm1*7
229 "psrlw %%mm7, %%mm1 \n\t" // mm1 = mm1/16
230 "paddw 8(%%ebx), %%mm1 \n\t"
231 "movq %%mm1, 8(%%ebx) \n\t" // err[x+1,y] = rer*7/16
234 "movl %24, %%ebx \n\t"
236 "movq %%mm0, %%mm1 \n\t"
237 "pmullw %%mm4, %%mm1 \n\t" // mm1 = mm1*5
238 "psrlw %%mm7, %%mm1 \n\t" // mm1 = mm1/16
239 "paddw -8(%%ebx), %%mm1 \n\t"
240 "movq %%mm1, -8(%%ebx) \n\t" // err[x-1,y+1] += rer*3/16
242 "movq %%mm0, %%mm1 \n\t"
243 "pmullw %%mm3, %%mm1 \n\t" // mm1 = mm1*3
244 "psrlw %%mm7, %%mm1 \n\t" // mm1 = mm1/16
245 "paddw 8(%%ebx), %%mm1 \n\t"
246 "movq %%mm1, (%%ebx) \n\t" // err[x,y+1] += rer*5/16
248 "psrlw %%mm7, %%mm0 \n\t" // mm0 = mm0/16
249 "movq %%mm0, 8(%%ebx) \n\t" // err[x+1,y+1] = rer/16
252 // calculate final pixel value and store
253 "movl %10, %%ecx \n\t"
254 "movw %16, %%ax \n\t"
255 "shlw %%cl, %%ax \n\t" //NP* ax = r<<roffs
257 "movl %11, %%ecx \n\t"
258 "movw %17, %%bx \n\t"
259 "shlw %%cl, %%bx \n\t" //NP*
260 "orw %%bx, %%ax \n\t"
262 "movl %12, %%ecx \n\t"
263 "movw %18, %%bx \n\t"
264 "shlw %%cl, %%bx \n\t" //NP*
265 "orw %%bx, %%ax \n\t"
267 "movl %1, %%edx \n\t"
268 "movw %%ax, (%%edx) \n\t"
269 "addl $2, %%edx \n\t" // increment ximage
270 "movl %%edx, %1 \n\t"
272 // prepare for next iteration on X
274 "addl $8, %24 \n\t" // nerr += 8
276 "movl %25, %%ebx \n\t"
277 "addl $8, %%ebx \n\t"
278 "movl %%ebx, %25 \n\t" // ebx = err += 8
281 // Note: in the last pixel, this would cause an invalid memory access
282 // because, punpcklbw is used (which reads 8 bytes) and the last
283 // pixel is only 4 bytes. This is no problem because the image data
284 // was allocated with extra 4 bytes when created.
285 "addl $4, %%esi \n\t" // image->data += 4
288 "decl %26 \n\t" // x--
289 "jnz .LoopXa \n\t" // if x>0, goto .LoopX
292 // depend on edx
293 "addl %15, %%edx \n\t" // add extra offset to ximage
294 "movl %%edx, %1 \n\t"
297 "jmp .LoopYa \n"
299 ".Enda: \n\t" // THE END
300 "emms \n\t"
301 "popl %%ebx \n\t"
304 "m" (image), // %0
305 "m" (ximage), // %1
306 "m" (err), // %2
307 "m" (nerr), // %3
308 "m" (rtable), // %4
309 "m" (gtable), // %5
310 "m" (btable), // %6
311 "m" (dr), // %7
312 "m" (dg), // %8
313 "m" (db), // %9
314 "m" (roffs), // %10
315 "m" (goffs), // %11
316 "m" (boffs), // %12
317 "m" (width), // %13
318 "m" (height), // %14
319 "m" (line_offset), // %15
320 "m" (rrggbbaa.rrggbbaa), // %16 (access to rr)
321 "m" (rrggbbaa.words.gg), // %17 (access to gg)
322 "m" (rrggbbaa.words.bb), // %18 (access to bb)
323 "m" (rrggbbaa.words.aa), // %19 (access to aa)
324 "m" (pixel.pixel), // %20 (access to pixel.r)
325 "m" (pixel.words.gg), // %21 (access to pixel.g)
326 "m" (pixel.words.bb), // %22 (access to pixel.b)
327 "m" (pixel.words.aa), // %23 (access to pixel.a)
328 "m" (tmp_err), // %24
329 "m" (tmp_nerr), // %25
330 "m" (x) // %26
331 : "eax", "ecx", "edx", "esi", "edi"
336 void
337 x86_mmx_TrueColor_24_to_16(unsigned char *image,
338 unsigned short *ximage,
339 short *err,
340 short *nerr,
341 short *rtable,
342 short *gtable,
343 short *btable,
344 int dr,
345 int dg,
346 int db,
347 unsigned int roffs,
348 unsigned int goffs,
349 unsigned int boffs,
350 int width,
351 int height,
352 int line_offset)
354 union {
355 long long rrggbbaa;
356 struct {short int rr, gg, bb, aa;} words;
357 } rrggbbaa;
359 union {
360 long long pixel;
361 struct {short int rr, gg, bb, aa;} words;
362 } pixel;
364 short *tmp_err;
365 short *tmp_nerr;
367 int x;
368 int w1;
369 int w2;
371 asm volatile
373 "pushl %%ebx \n\t"
375 "movl %13, %%eax \n\t" // eax = width
376 "movl %%eax, %%ebx \n\t"
377 "shrl $2, %%eax \n\t"
378 "movl %%eax, %27 \n\t" // w1 = width / 4
379 "andl $3, %%ebx \n\t"
380 "movl %%ebx, %28 \n" // w2 = width %% 4
383 ".LoopYc: \n\t"
384 "movl %13, %%eax \n\t"
385 "movl %%eax, %26 \n\t" // x = width
387 "decl %14 \n\t" // height--
388 "js .Endc \n\t" // if height < 0 then end
390 "movl %14, %%eax \n\t"
391 "decl %%eax \n\t" // y--
392 "movl %%eax, %14 \n\t"
393 "js .Endc \n\t" // if y < 0, goto end
394 "andl $1, %%eax \n\t"
395 "jz .LoopY_1c \n" // if (y&1) goto LoopY_1
397 ".LoopY_0c: \n\t"
399 "movl %2, %%ebx \n\t" // ebx = err
400 "movl %%ebx, %25 \n\t" // [-36] = err
401 "movl %3, %%eax \n\t" //
402 "movl %%eax, %24 \n\t" // [-32] = nerr
404 "jmp .LoopX_1c \n"
406 ".LoopY_1c: \n\t"
408 "movl %3, %%ebx \n\t" // ebx = nerr
409 "movl %%ebx, %25 \n\t" // [-36] = nerr
410 "movl %2, %%eax \n\t" //
411 "movl %%eax, %24 \n\t" // [-32] = eerr
413 ".align 16 \n\t"
415 "movl %%eax, %26 \n" // x = w1
416 ".LoopX_1c: \n\t"
417 "decl %26 \n\t" // x--
418 "js .Xend1_c \n\t" // if x < 0 then end
420 // do conversion of 4 pixels
421 "movq %2, %%mm0 \n\t" // mm0 = err
426 "jmp .LoopX_1c \n"
427 ".Xend1_c: \n\t"
429 "movl %28, %%eax \n\t"
430 "movl %%eax, %26 \n" // x = w2
431 ".LoopX_2c: \n\t"
432 "decl %26 \n\t" // x--
433 "js .Xend2_c \n\t" //
434 // do conversion
435 "jmp .LoopX_2c \n"
436 ".Xend2_c: \n\t"
438 "movl %27, %%eax \n\t"
439 "jmp .LoopYc \n"
441 ".Endc: \n\t" // THE END
442 "emms \n\t"
443 "popl %%ebx \n\t"
446 "m" (image), // %0
447 "m" (ximage), // %1
448 "m" (err), // %2
449 "m" (nerr), // %3
450 "m" (rtable), // %4
451 "m" (gtable), // %5
452 "m" (btable), // %6
453 "m" (dr), // %7
454 "m" (dg), // %8
455 "m" (db), // %9
456 "m" (roffs), // %10
457 "m" (goffs), // %11
458 "m" (boffs), // %12
459 "m" (width), // %13
460 "m" (height), // %14
461 "m" (line_offset), // %15
462 "m" (rrggbbaa.rrggbbaa), // %16 (access to rr)
463 "m" (rrggbbaa.words.gg), // %17 (access to gg)
464 "m" (rrggbbaa.words.bb), // %18 (access to bb)
465 "m" (rrggbbaa.words.aa), // %19 (access to aa)
466 "m" (pixel.pixel), // %20 (access to pixel.r)
467 "m" (pixel.words.gg), // %21 (access to pixel.g)
468 "m" (pixel.words.bb), // %22 (access to pixel.b)
469 "m" (pixel.words.aa), // %23 (access to pixel.a)
470 "m" (tmp_err), // %24
471 "m" (tmp_nerr), // %25
472 "m" (x), // %26
473 "m" (w1), // %27
474 "m" (w2) // %28
475 : "eax", "ecx", "edx", "esi", "edi"
481 #endif /* ASM_X86_MMX */
485 void
486 x86_PseudoColor_32_to_8(unsigned char *image,
487 unsigned char *ximage,
488 char *err,
489 char *nerr,
490 short *ctable,
491 int dr,
492 int dg,
493 int db,
494 unsigned long *pixels,
495 int cpc,
496 int width,
497 int height,
498 int bytesPerPixel,
499 int line_offset)
501 int x;
502 int cpcpc;
504 int rr;
505 int gg;
506 int bb;
508 char *tmp_err;
509 char *tmp_nerr;
511 char ndr; // aparently not used
512 char ndg; // aparently not used
513 char ndb; // aparently not used
515 asm volatile
517 "pushal \n\t"
519 "movl %9, %%eax \n\t"
520 "mulb %9 \n\t"
521 "movl %%eax, %15 \n\t" // cpcpc = cpc*cpc
523 // eax will always be <= 0xffff
525 // process 1 pixel / cycle, each component treated as 16bit
526 "movl %0, %%esi \n" // esi = image->data
528 ".LoopYb: \n\t"
529 "movl %10, %%ecx \n\t"
530 "movl %%ecx, %14 \n\t" // x = width
532 "movl %11, %%ecx \n\t"
533 "decl %%ecx \n\t" // y--
534 "movl %%ecx, %11 \n\t"
535 "js .Endb \n\t" // if y < 0, goto end
536 "andl $1, %%ecx \n\t"
537 "jz .LoopY_1b \n" // if (y&1) goto LoopY_1
539 ".LoopY_0b: \n\t"
541 "movl %2, %%ebx \n\t" // ebx = err
542 //useless "movl %%ebx, %20 \n\t" // [-36] = err
543 "movl %3, %%ecx \n\t" //
544 "movl %%ecx, %19 \n\t" // [-32] = nerr
546 "movl $0, (%%ecx) \n\t" // init error of nerr[0] to 0
548 "jmp .LoopXb \n"
550 ".LoopY_1b: \n\t"
552 "movl %3, %%ebx \n\t" // ebx = nerr
553 //useless "movl %%ebx, %20 \n\t" // [-36] = nerr
554 "movl %2, %%ecx \n\t" //
555 "movl %%ecx, %19 \n\t" // [-32] = err
557 "movl $0, (%%ecx) \n\t" // init error of nerr[0] to 0
560 ".align 16 \n"
561 ".LoopXb: \n\t"
564 "movl %4, %%edi \n\t" // edi = ctable
565 "xorl %%edx, %%edx \n\t" // zero the upper word on edx
567 // RED
569 // depends on ebx==err, esi==image->data, edi
570 "movzbw (%%esi), %%dx \n\t" // dx = image->data[0]
571 "movsbw (%%ebx), %%ax \n\t" // ax = error[0]
572 "addw %%ax, %%dx \n\t" // pixel.red = data[0] + error[0]
574 "testb %%dh, %%dh \n\t" // test if pixel.red < 0 or > 255
575 "jz .OKRb \n\t" // 0 <= pixel.red <= 255
576 "js .NEGRb \n\t" // pixel.red < 0
577 "movw $0xff, %%dx \n\t" // pixel.red > 255
578 "jmp .OKRb \n"
579 ".NEGRb: \n\t"
580 "xorw %%dx, %%dx \n"
581 ".OKRb: \n\t"
582 //partial reg
583 "leal (%%edi, %%edx, 2), %%ecx \n\t" // ecx = &ctable[pixel.red]
584 //agi
585 "movl (%%ecx), %%eax \n\t" // ax = ctable[pixel.red]
586 "movw %%ax, %16 \n\t" // save rr
588 "mulb %5 \n\t" // ax = rr*dr
589 "subw %%ax, %%dx \n\t" // rer = dx = dx - rr*dr
591 "movswl %%dx, %%eax \n\t" // save rer
593 // distribute error
594 "leal (, %%eax, 8), %%ecx \n\t"
595 "subw %%dx, %%cx \n\t" // cx = rer * 7
596 "sarw $4, %%cx \n\t" // cx = rer * 7 / 16
597 "addb %%cl, 4(%%ebx) \n\t" // err[x+1] += rer * 7 / 16
599 "movl %19, %%ecx \n\t" // ecx = nerr
601 "leaw (%%eax, %%eax, 4), %%dx \n\t" // dx = rer * 5
602 "sarw $4, %%dx \n\t" // dx = rer * 5 / 16
603 "addb %%dl, (%%ecx) \n\t" // nerr[x] += rer * 5 / 16
605 "leaw (%%eax, %%eax, 2), %%dx \n\t" // dx = rer * 3
606 "sarw $4, %%dx \n\t" // dx = rer * 3 / 16
607 "addb %%dl, -4(%%ecx) \n\t" // nerr[x-1] += rer * 3 / 16
609 "sarw $4, %%ax \n\t" // ax = rer / 16
610 "movb %%al, 4(%%ecx) \n\t" // nerr[x+1] = rer / 16
613 // GREEN
615 // depends on ebx, esi, edi
616 "movzbw 1(%%esi), %%dx \n\t" // dx = image->data[1]
617 "movsbw 1(%%ebx), %%ax \n\t" // ax = error[1]
618 "addw %%ax, %%dx \n\t" // pixel.grn = data[1] + error[1]
620 "testb %%dh, %%dh \n\t" // test if pixel.grn < 0 or > 255
621 "jz .OKGb \n\t" // 0 <= pixel.grn <= 255
622 "js .NEGGb \n\t" // pixel.grn < 0
623 "movw $0xff, %%dx \n\t" // pixel.grn > 255
624 "jmp .OKGb \n"
625 ".NEGGb: \n\t"
626 "xorw %%dx, %%dx \n"
627 ".OKGb: \n\t"
628 // partial reg
629 "leal (%%edi, %%edx, 2), %%ecx \n\t" // ecx = &ctable[pixel.grn]
630 //agi
631 "movw (%%ecx), %%ax \n\t" // ax = ctable[pixel.grn]
632 "movw %%ax, %17 \n\t" // save gg
634 "mulb %6 \n\t" // ax = gg*dg
635 "subw %%ax, %%dx \n\t" // ger = dx = dx - gg*dg
637 "movswl %%dx, %%eax \n\t" // save ger
639 // distribute error
641 "leal (, %%eax, 8), %%ecx \n\t"
642 "subw %%dx, %%cx \n\t" // cx = ger * 7
643 "sarw $4, %%cx \n\t" // cx = ger * 7 / 16
644 "addb %%cl, 5(%%ebx) \n\t" // err[x+1] += ger * 7 / 16
646 "movl %19, %%ecx \n\t" // ecx = nerr
648 "leaw (%%eax, %%eax, 4), %%dx \n\t" // dx = ger * 5
649 "sarw $4, %%dx \n\t" // dx = ger * 5 / 16
650 "addb %%dl, 1(%%ecx) \n\t" // nerr[x] += ger * 5 / 16
652 "leaw (%%eax, %%eax, 2), %%dx \n\t" // dx = ger * 3
653 "sarw $4, %%dx \n\t" // dx = ger * 3 / 16
654 "addb %%dl, -3(%%ecx) \n\t" // nerr[x-1] += ger * 3 / 16
656 "sarw $4, %%ax \n\t" // ax = ger / 16
657 "movb %%al, 5(%%ecx) \n\t" // nerr[x+1] = ger / 16
660 // BLUE
662 // depends on ebx, esi
663 "movzbw 2(%%esi), %%dx \n\t" // dx = image->data[2]
664 "movsbw 2(%%ebx), %%ax \n\t" // ax = error[2]
665 "addw %%ax, %%dx \n\t" // pixel.grn = data[2] + error[2]
667 "testb %%dh, %%dh \n\t" // test if pixel.blu < 0 or > 255
668 "jz .OKBb \n\t" // 0 <= pixel.blu <= 255
669 "js .NEGBb \n\t" // pixel.blu < 0
670 "movw $0xff, %%dx \n\t" // pixel.blu > 255
671 "jmp .OKBb \n"
672 ".NEGBb: \n\t"
673 "xorw %%dx, %%dx \n"
674 ".OKBb: \n\t"
675 //partial reg
676 "leal (%%edi, %%edx, 2), %%ecx \n\t" // ecx = &ctable[pixel.blu]
677 //agi
678 "movw (%%ecx), %%ax \n\t" // ax = ctable[pixel.blu]
679 "movw %%ax, %18 \n\t" // save bb
681 "mulb %7 \n\t" // ax = bb*db
682 "subw %%ax, %%dx \n\t" // ber = dx = dx - bb*db
683 "movswl %%dx, %%eax \n\t" // save ber
685 // distribute error
686 "leal (, %%eax, 8), %%ecx \n\t"
687 "subw %%dx, %%cx \n\t" // cx = ber * 7
688 "sarw $4, %%cx \n\t" // cx = ber * 7 / 16
689 "addb %%cl, 6(%%ebx) \n\t" // err[x+1] += ber * 7 / 16
691 "movl %19, %%ecx \n\t" // ecx = nerr
693 "leaw (%%eax, %%eax, 4), %%dx \n\t" // dx = ber * 5
694 "sarw $4, %%dx \n\t" // dx = ber * 5 / 16
695 "addb %%dl, 2(%%ecx) \n\t" // nerr[x] += ber * 5 / 16
697 "leaw (%%eax, %%eax, 2), %%dx \n\t" // dx = ber * 3
698 "sarw $4, %%dx \n\t" // dx = ber * 3 / 16
699 "addb %%dl, -4(%%ecx) \n\t" // nerr[x-1] += ber * 3 / 16
701 "sarw $4, %%ax \n\t" // ax = ber / 16
702 "movb %%al, 6(%%ecx) \n\t" // nerr[x+1] = ber / 16
704 "andl $0xffff, %%eax \n\t"
705 // depends on eax & 0xffff0000 == 0
706 // calculate the index of the value of the pixel
707 "movw %16, %%ax \n\t" // ax = rr
708 "mulb %15 \n\t" // ax = cpcpc*rr
709 "movw %%ax, %%cx \n\t"
710 "movw %17, %%ax \n\t" // ax = gg
711 "mulb %9 \n\t" // ax = cpc*gg
712 "addw %%cx, %%ax \n\t" // ax = cpc*gg + cpcpc*rr
713 "addw %18, %%ax \n\t" // ax = cpcpc*rr + cpc*gg + bb
715 "movl %8, %%ecx \n\t"
716 //agi
717 "leal (%%ecx, %%eax, 4), %%edx \n\t"
718 //agi
719 "movb (%%edx), %%cl \n\t" // cl = pixels[ax]
721 // store the pixel
722 "movl %1, %%eax \n\t"
723 "movb %%cl, (%%eax) \n\t" // *ximage = cl
724 "incl %1 \n\t" // ximage++
726 // prepare for next iteration on X
728 "addl $4, %19 \n\t" // nerr += 4
729 "addl $4, %%ebx \n\t" // err += 4
731 "addl %12, %%esi \n\t" // image->data += bpp
733 "decl %14 \n\t" // x--
734 "jnz .LoopXb \n\t" // if x>0, goto .LoopX
737 "movl %13, %%eax \n\t"
738 "addl %%eax, %1 \n\t" // add extra offset to ximage
740 "jmp .LoopYb \n"
742 ".Endb: \n\t"
743 "emms \n\t"
744 "popal \n\t"
747 "m" (image), // %0
748 "m" (ximage), // %1
749 "m" (err), // %2
750 "m" (nerr), // %3
751 "m" (ctable), // %4
752 "m" (dr), // %5
753 "m" (dg), // %6
754 "m" (db), // %7
755 "m" (pixels), // %8
756 "m" (cpc), // %9
757 "m" (width), // %10
758 "m" (height), // %11
759 "m" (bytesPerPixel), // %12
760 "m" (line_offset), // %13
761 "m" (x), // %14
762 "m" (cpcpc), // %15
763 "m" (rr), // %16
764 "m" (gg), // %17
765 "m" (bb), // %18
766 "m" (tmp_err), // %19
767 "m" (tmp_nerr), // %20
768 "m" (ndr), // %21
769 "m" (ndg), // %22
770 "m" (ndb) // %23
774 #endif /* ASM_X86 */