1 /* x86_convert.c - convert RImage to XImage with x86 optimizations
3 * Raster graphics library
5 * Copyright (c) 2000-2003 Alfredo K. Kojima
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Library General Public
9 * License as published by the Free Software Foundation; either
10 * version 2 of the License, or (at your option) any later version.
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Library General Public License for more details.
17 * You should have received a copy of the GNU Library General Public
18 * License along with this library; if not, write to the Free
19 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
// Cached outcome of the CPU feature probe performed by the asm below.
// NOTE(review): -1 presumably means not probed yet; the enclosing function
// and several asm lines are not visible in this excerpt -- confirm.
32 static int result
= -1;
40 ("pushal \n\t" // save all general registers; please do not forget this in any asm
41 "pushfl \n\t" // push the flags register: check whether cpuid is supported
43 "movl %%eax, %%ebx \n\t" // ebx = copy of eax, used for the comparison below
44 "xorl $(1<<21), %%eax \n\t" // toggle bit 21 in eax
49 "xorl %%ebx, %%eax \n\t" // eax = bits that differ from the copy kept in ebx
50 "andl $(1<<21), %%eax \n\t" // keep only bit 21 of that difference
52 "xorl %%eax, %%eax \n\t" // no eax effect because of the movl below
53 // except resetting flags. is it needed?
56 "test $(1<<23), %%edx \n\t" // test bit 23 of edx; presumably the MMX feature bit returned by cpuid (the cpuid call is not visible here) -- confirm
59 "popal \n\t" // popal needed because the address of
60 "movl $1, %0 \n\t" // variable %0 may be kept in a register
76 * 32/8 24/8 32/16 24/16 32/24 24/24
81 * - try to align stack (local variable space) into quadword boundary
// MMX inline-asm converter: consumes 4 bytes per source pixel from image,
// looks each component up in rtable/gtable/btable, writes one 16-bit value
// per pixel to ximage and spreads the quantization error over the two
// error rows (operands %2 and %3).
// NOTE(review): many lines of this function are missing from this excerpt;
// operands %0-%14 and %26 are described only as far as the surviving
// comments show.
84 x86_mmx_TrueColor_32_to_16(unsigned char *image
,
85 unsigned short *ximage
,
88 unsigned short *rtable
,
89 unsigned short *gtable
,
90 unsigned short *btable
,
// Two declarations of four 16-bit components follow; the operand list at
// the end addresses such fields as .words.rr/.gg/.bb/.aa (%16-%23).
104 short int rr
, gg
, bb
, aa
;
111 short int rr
, gg
, bb
, aa
;
123 // pack dr, dg and db into mm6
124 "movl %7, %%eax \n\t"
125 "movl %8, %%ebx \n\t"
126 "movl %9, %%ecx \n\t"
127 "movw %%ax, %16 \n\t" // rr slot = low word of %7
128 "movw %%bx, %17 \n\t" // gg slot = low word of %8
129 "movw %%cx, %18 \n\t" // bb slot = low word of %9
132 "movq %16, %%mm6 \n\t" // dr dg db 0
134 // pack 4|4|4|4 into mm7, for shifting (/16)
135 "movl $0x00040004, %16 \n\t" // 32-bit store starting at the rr slot (covers rr and gg if the four shorts are contiguous)
136 "movl $0x00040004, %18 \n\t" // 32-bit store starting at the bb slot (covers bb and aa likewise)
137 "movq %16, %%mm7 \n\t"
139 // store constant values for using with mmx when dithering
140 "movl $0x00070007, %16 \n\t"
141 "movl $0x00070007, %18 \n\t"
142 "movq %16, %%mm5 \n\t" // mm5 = 7|7|7|7
144 "movl $0x00050005, %16 \n\t"
145 "movl $0x00050005, %18 \n\t"
146 "movq %16, %%mm4 \n\t" // mm4 = 5|5|5|5
148 "movl $0x00030003, %16 \n\t"
149 "movl $0x00030003, %18 \n\t"
150 "movq %16, %%mm3 \n\t" // mm3 = 3|3|3|3
152 // process 1 pixel / cycle, each component treated as 16bit
153 "movl %0, %%esi \n" // esi = image->data
156 "movl %13, %%eax \n\t"
157 "movl %%eax, %26 \n\t" // x = width
159 "movl %14, %%eax \n\t"
160 "decl %%eax \n\t" // y--
161 "movl %%eax, %14 \n\t"
162 "js .Enda \n\t" // if y < 0, goto end
163 "andl $1, %%eax \n\t"
164 "jz .LoopY_1a \n" // if (y&1) == 0 goto .LoopY_1a
168 "movl %2, %%ebx \n\t" // ebx = err
169 "movl %%ebx, %25 \n\t" // %25 (declared as tmp_nerr in the operand list) = err
170 "movl %3, %%eax \n\t" // eax = nerr
171 "movl %%eax, %24 \n\t" // %24 (declared as tmp_err in the operand list) = nerr
177 "movl %3, %%ebx \n\t" // ebx = nerr
178 "movl %%ebx, %25 \n\t" // %25 = nerr (the two rows swap roles on this path)
179 "movl %2, %%eax \n\t" // eax = err
180 "movl %%eax, %24 \n\t" // %24 = err
185 // calculate errors and pixel components
187 // depend on ebx, esi, mm6
188 "movq (%%ebx), %%mm1 \n\t" // mm1 = error[0..3]
189 "punpcklbw (%%esi), %%mm0 \n\t" // mm0 = image->data[0..3]
190 "psrlw $8, %%mm0 \n\t" // fixup mm0
191 "paddusb %%mm1, %%mm0 \n\t" // mm0 = mm0 + mm1 (sat. to 255)
192 "movq %%mm0, %20 \n\t" // save the pixel
194 "movzwl %20, %%ecx \n\t" // ecx = pixel.red
195 "movl %4, %%edi \n\t" // edi = rtable
197 "leal (%%edi, %%ecx, 2), %%eax \n\t" // eax = &rtable[pixel.red]
199 "movw (%%eax), %%dx \n\t" // dx = rtable[pixel.red]
200 "movw %%dx, %16 \n\t" // save rr
202 "movzwl %21, %%ecx \n\t" // ecx = pixel.green
203 "movl %5, %%edi \n\t" // edi = gtable
205 "leal (%%edi, %%ecx, 2), %%eax \n\t" // eax = &gtable[pixel.green]
207 "movw (%%eax), %%dx \n\t" // dx = gtable[pixel.green]
208 "movw %%dx, %17 \n\t" // save gg
210 "movzwl %22, %%ecx \n\t" // ecx = pixel.blue
211 "movl %6, %%edi \n\t" // edi = btable
213 "leal (%%edi, %%ecx, 2), %%eax \n\t" // eax = &btable[pixel.blue]
215 "movw (%%eax), %%dx \n\t" // dx = btable[pixel.blue]
216 "movw %%dx, %18 \n\t" // save bb
218 "movw $0, %19 \n\t" // save dummy aa
220 "movq %16, %%mm1 \n\t" // load mm1 with rrggbbaa
221 "pmullw %%mm6, %%mm1 \n\t" // mm1 = rr*dr|...
222 "psubsw %%mm1, %%mm0 \n\t" // error = pixel - mm1
225 // distribute the error
227 // depend on mm0, mm7, mm3, mm4, mm5
229 "movl %25, %%ebx \n\t" // ebx = current-row error pointer saved in %25
231 "movq %%mm0, %%mm1 \n\t"
232 "pmullw %%mm5, %%mm1 \n\t" // mm1 = mm1*7
233 "psrlw %%mm7, %%mm1 \n\t" // mm1 = mm1/16
// NOTE(review): psrlw with a register count takes the shift amount from the
// whole 64-bit register, and mm7 holds 4|4|4|4 rather than plain 4 -- verify
// against the instruction reference that this really divides by 16.
234 "paddw 8(%%ebx), %%mm1 \n\t"
235 "movq %%mm1, 8(%%ebx) \n\t" // err[x+1,y] += rer*7/16
238 "movl %24, %%ebx \n\t" // ebx = other-row error pointer saved in %24
240 "movq %%mm0, %%mm1 \n\t"
241 "pmullw %%mm4, %%mm1 \n\t" // mm1 = mm1*5
242 "psrlw %%mm7, %%mm1 \n\t" // mm1 = mm1/16
243 "paddw -8(%%ebx), %%mm1 \n\t"
244 "movq %%mm1, -8(%%ebx) \n\t" // err[x-1,y+1] += rer*5/16 as coded. NOTE(review): the original comment said 3/16; the 3 and 5 weights look swapped relative to the comments -- confirm which is intended
246 "movq %%mm0, %%mm1 \n\t"
247 "pmullw %%mm3, %%mm1 \n\t" // mm1 = mm1*3
248 "psrlw %%mm7, %%mm1 \n\t" // mm1 = mm1/16
249 "paddw 8(%%ebx), %%mm1 \n\t"
250 "movq %%mm1, (%%ebx) \n\t" // err[x,y+1] = value at 8(ebx) + rer*3/16 as coded. NOTE(review): the original comment said += rer*5/16, and the add reads offset 8 while the store writes offset 0 -- confirm
252 "psrlw %%mm7, %%mm0 \n\t" // mm0 = mm0/16
253 "movq %%mm0, 8(%%ebx) \n\t" // err[x+1,y+1] = rer/16
256 // calculate final pixel value and store
257 "movl %10, %%ecx \n\t" // cl = shift count for the red component
258 "movw %16, %%ax \n\t"
259 "shlw %%cl, %%ax \n\t" //NP* ax = r<<roffs
261 "movl %11, %%ecx \n\t" // cl = shift count for the green component
262 "movw %17, %%bx \n\t"
263 "shlw %%cl, %%bx \n\t" //NP*
264 "orw %%bx, %%ax \n\t"
266 "movl %12, %%ecx \n\t" // cl = shift count for the blue component
267 "movw %18, %%bx \n\t"
268 "shlw %%cl, %%bx \n\t" //NP*
269 "orw %%bx, %%ax \n\t"
271 "movl %1, %%edx \n\t" // edx = ximage
272 "movw %%ax, (%%edx) \n\t" // *ximage = combined 16-bit pixel
273 "addl $2, %%edx \n\t" // increment ximage
274 "movl %%edx, %1 \n\t"
276 // prepare for next iteration on X
278 "addl $8, %24 \n\t" // nerr += 8
280 "movl %25, %%ebx \n\t"
281 "addl $8, %%ebx \n\t"
282 "movl %%ebx, %25 \n\t" // ebx = err += 8
285 // Note: in the last pixel, this would cause an invalid memory access
286 // because punpcklbw is used (which reads 8 bytes) and the last
287 // pixel is only 4 bytes. This is no problem because the image data
288 // was allocated with extra 4 bytes when created.
289 "addl $4, %%esi \n\t" // image->data += 4
292 "decl %26 \n\t" // x--
293 "jnz .LoopXa \n\t" // if x != 0, goto .LoopXa
297 "addl %15, %%edx \n\t" // add extra offset to ximage
298 "movl %%edx, %1 \n\t"
303 ".Enda: \n\t" // THE END
323 "m" (line_offset
), // %15
324 "m" (rrggbbaa
.words
.rr
), // %16 (access to rr)
325 "m" (rrggbbaa
.words
.gg
), // %17 (access to gg)
326 "m" (rrggbbaa
.words
.bb
), // %18 (access to bb)
327 "m" (rrggbbaa
.words
.aa
), // %19 (access to aa)
328 "m" (pixel
.words
.rr
), // %20 (access to pixel.r)
329 "m" (pixel
.words
.gg
), // %21 (access to pixel.g)
330 "m" (pixel
.words
.bb
), // %22 (access to pixel.b)
331 "m" (pixel
.words
.aa
), // %23 (access to pixel.a)
332 "m" (tmp_err
), // %24
333 "m" (tmp_nerr
), // %25
// NOTE(review): ebx is written by the asm above but is not in this clobber
// list; presumably it is saved and restored by instructions missing from
// this excerpt -- confirm.
335 : "eax", "ecx", "edx", "esi", "edi"
// Inline-asm skeleton of the 24-bit variant of the TrueColor to 16-bit
// converter: it splits each row into width/4 groups of four pixels plus
// width%4 leftover pixels and sets up the err/nerr row pointers.
// NOTE(review): the per-pixel conversion code is not visible in this
// excerpt (lines are missing, or the routine is unfinished) -- confirm.
341 x86_mmx_TrueColor_24_to_16(unsigned char *image
,
342 unsigned short *ximage
,
// Two declarations of four 16-bit components follow; the operand list at
// the end addresses such fields as .words.rr/.gg/.bb/.aa (%16-%23).
361 short int rr
, gg
, bb
, aa
;
368 short int rr
, gg
, bb
, aa
;
383 "movl %13, %%eax \n\t" // eax = width
384 "movl %%eax, %%ebx \n\t" // ebx = width
385 "shrl $2, %%eax \n\t"
386 "movl %%eax, %27 \n\t" // w1 = width / 4
387 "andl $3, %%ebx \n\t"
388 "movl %%ebx, %28 \n" // w2 = width % 4
392 "movl %13, %%eax \n\t"
393 "movl %%eax, %26 \n\t" // x = width
395 "decl %14 \n\t" // height--
396 "js .Endc \n\t" // if height < 0 then end
398 "movl %14, %%eax \n\t"
399 "decl %%eax \n\t" // y--
400 "movl %%eax, %14 \n\t"
401 "js .Endc \n\t" // if y < 0, goto end
402 "andl $1, %%eax \n\t"
403 "jz .LoopY_1c \n" // if (y&1) == 0 goto .LoopY_1c
407 "movl %2, %%ebx \n\t" // ebx = err
408 "movl %%ebx, %25 \n\t" // %25 (declared as tmp_nerr in the operand list) = err
409 "movl %3, %%eax \n\t" // eax = nerr
410 "movl %%eax, %24 \n\t" // %24 (declared as tmp_err in the operand list) = nerr
416 "movl %3, %%ebx \n\t" // ebx = nerr
417 "movl %%ebx, %25 \n\t" // %25 = nerr (the two rows swap roles on this path)
418 "movl %2, %%eax \n\t" // eax = err
419 "movl %%eax, %24 \n\t" // %24 = err
423 "movl %%eax, %26 \n" // x = w1
425 "decl %26 \n\t" // x--
426 "js .Xend1_c \n\t" // if x < 0 then end
428 // do conversion of 4 pixels
429 "movq %2, %%mm0 \n\t" // mm0 = err. NOTE(review): %2 is the operand loaded as a pointer above, so this reads 8 bytes at the variable itself rather than the error data it points to -- confirm intent
437 "movl %28, %%eax \n\t"
438 "movl %%eax, %26 \n" // x = w2
440 "decl %26 \n\t" // x--
441 "js .Xend2_c \n\t" // if x < 0, leftover pixels are done
446 "movl %27, %%eax \n\t" // eax = w1
449 ".Endc: \n\t" // THE END
469 "m" (line_offset
), // %15
470 "m" (rrggbbaa
.words
.rr
), // %16 (access to rr)
471 "m" (rrggbbaa
.words
.gg
), // %17 (access to gg)
472 "m" (rrggbbaa
.words
.bb
), // %18 (access to bb)
473 "m" (rrggbbaa
.words
.aa
), // %19 (access to aa)
474 "m" (pixel
.words
.rr
), // %20 (access to pixel.r)
475 "m" (pixel
.words
.gg
), // %21 (access to pixel.g)
476 "m" (pixel
.words
.bb
), // %22 (access to pixel.b)
477 "m" (pixel
.words
.aa
), // %23 (access to pixel.a)
478 "m" (tmp_err
), // %24
479 "m" (tmp_nerr
), // %25
// NOTE(review): ebx is written by the asm above but is not in this clobber
// list; presumably it is saved and restored by instructions missing from
// this excerpt -- confirm.
483 : "eax", "ecx", "edx", "esi", "edi"
489 #endif /* ASM_X86_MMX */
// Plain x86 inline-asm converter to 8-bit pseudo-color: for every pixel it
// adds the stored error to each of the three components, clamps to 0..255,
// looks the component up in ctable, diffuses the remaining error into the
// err/nerr rows, then writes the low byte of pixels[cpcpc*rr + cpc*gg + bb]
// to ximage.
// NOTE(review): several lines of this function are missing from this
// excerpt; operand meanings are taken from the surviving comments.
494 x86_PseudoColor_32_to_8(unsigned char *image
,
495 unsigned char *ximage
,
502 unsigned long *pixels
,
519 char ndr
; // apparently not used
520 char ndg
; // apparently not used
521 char ndb
; // apparently not used
527 "movl %9, %%eax \n\t" // eax = cpc
529 "movl %%eax, %15 \n\t" // cpcpc = cpc*cpc
531 // eax will always be <= 0xffff
533 // process 1 pixel / cycle, each component treated as 16bit
534 "movl %0, %%esi \n" // esi = image->data
537 "movl %10, %%ecx \n\t"
538 "movl %%ecx, %14 \n\t" // x = width
540 "movl %11, %%ecx \n\t"
541 "decl %%ecx \n\t" // y--
542 "movl %%ecx, %11 \n\t"
543 "js .Endb \n\t" // if y < 0, goto end
544 "andl $1, %%ecx \n\t"
545 "jz .LoopY_1b \n" // if (y&1) == 0 goto .LoopY_1b
549 "movl %2, %%ebx \n\t" // ebx = err
550 //useless "movl %%ebx, %20 \n\t" // [-36] = err
551 "movl %3, %%ecx \n\t" // ecx = nerr
552 "movl %%ecx, %19 \n\t" // %19 (declared as tmp_err in the operand list) = nerr
554 "movl $0, (%%ecx) \n\t" // init error of nerr[0] to 0
560 "movl %3, %%ebx \n\t" // ebx = nerr
561 //useless "movl %%ebx, %20 \n\t" // [-36] = nerr
562 "movl %2, %%ecx \n\t" // ecx = err
563 "movl %%ecx, %19 \n\t" // %19 = err (the two rows swap roles on this path)
565 "movl $0, (%%ecx) \n\t" // init error of nerr[0] to 0
572 "movl %4, %%edi \n\t" // edi = ctable
573 "xorl %%edx, %%edx \n\t" // zero the upper word of edx
577 // depends on ebx==err, esi==image->data, edi
578 "movzbw (%%esi), %%dx \n\t" // dx = image->data[0]
579 "movsbw (%%ebx), %%ax \n\t" // ax = error[0]
580 "addw %%ax, %%dx \n\t" // pixel.red = data[0] + error[0]
582 "testb %%dh, %%dh \n\t" // test if pixel.red < 0 or > 255
583 "jz .OKRb \n\t" // 0 <= pixel.red <= 255
584 "js .NEGRb \n\t" // pixel.red < 0
585 "movw $0xff, %%dx \n\t" // pixel.red > 255
591 "leal (%%edi, %%edx, 2), %%ecx \n\t" // ecx = &ctable[pixel.red]
593 "movl (%%ecx), %%eax \n\t" // ax = ctable[pixel.red]. NOTE(review): this is a 32-bit load while green and blue use movw; only ax is saved below -- confirm
594 "movw %%ax, %16 \n\t" // save rr
596 "mulb %5 \n\t" // ax = rr*dr
597 "subw %%ax, %%dx \n\t" // rer = dx = dx - rr*dr
599 "movswl %%dx, %%eax \n\t" // save rer
602 "leal (, %%eax, 8), %%ecx \n\t" // ecx = rer * 8
603 "subw %%dx, %%cx \n\t" // cx = rer * 7
604 "sarw $4, %%cx \n\t" // cx = rer * 7 / 16
605 "addb %%cl, 4(%%ebx) \n\t" // err[x+1] += rer * 7 / 16
607 "movl %19, %%ecx \n\t" // ecx = nerr
609 "leaw (%%eax, %%eax, 4), %%dx \n\t" // dx = rer * 5
610 "sarw $4, %%dx \n\t" // dx = rer * 5 / 16
611 "addb %%dl, (%%ecx) \n\t" // nerr[x] += rer * 5 / 16
613 "leaw (%%eax, %%eax, 2), %%dx \n\t" // dx = rer * 3
614 "sarw $4, %%dx \n\t" // dx = rer * 3 / 16
615 "addb %%dl, -4(%%ecx) \n\t" // nerr[x-1] += rer * 3 / 16
617 "sarw $4, %%ax \n\t" // ax = rer / 16
618 "movb %%al, 4(%%ecx) \n\t" // nerr[x+1] = rer / 16
623 // depends on ebx, esi, edi
624 "movzbw 1(%%esi), %%dx \n\t" // dx = image->data[1]
625 "movsbw 1(%%ebx), %%ax \n\t" // ax = error[1]
626 "addw %%ax, %%dx \n\t" // pixel.grn = data[1] + error[1]
628 "testb %%dh, %%dh \n\t" // test if pixel.grn < 0 or > 255
629 "jz .OKGb \n\t" // 0 <= pixel.grn <= 255
630 "js .NEGGb \n\t" // pixel.grn < 0
631 "movw $0xff, %%dx \n\t" // pixel.grn > 255
637 "leal (%%edi, %%edx, 2), %%ecx \n\t" // ecx = &ctable[pixel.grn]
639 "movw (%%ecx), %%ax \n\t" // ax = ctable[pixel.grn]
640 "movw %%ax, %17 \n\t" // save gg
642 "mulb %6 \n\t" // ax = gg*dg
643 "subw %%ax, %%dx \n\t" // ger = dx = dx - gg*dg
645 "movswl %%dx, %%eax \n\t" // save ger
649 "leal (, %%eax, 8), %%ecx \n\t" // ecx = ger * 8
650 "subw %%dx, %%cx \n\t" // cx = ger * 7
651 "sarw $4, %%cx \n\t" // cx = ger * 7 / 16
652 "addb %%cl, 5(%%ebx) \n\t" // err[x+1] += ger * 7 / 16
654 "movl %19, %%ecx \n\t" // ecx = nerr
656 "leaw (%%eax, %%eax, 4), %%dx \n\t" // dx = ger * 5
657 "sarw $4, %%dx \n\t" // dx = ger * 5 / 16
658 "addb %%dl, 1(%%ecx) \n\t" // nerr[x] += ger * 5 / 16
660 "leaw (%%eax, %%eax, 2), %%dx \n\t" // dx = ger * 3
661 "sarw $4, %%dx \n\t" // dx = ger * 3 / 16
662 "addb %%dl, -3(%%ecx) \n\t" // nerr[x-1] += ger * 3 / 16
664 "sarw $4, %%ax \n\t" // ax = ger / 16
665 "movb %%al, 5(%%ecx) \n\t" // nerr[x+1] = ger / 16
670 // depends on ebx, esi
671 "movzbw 2(%%esi), %%dx \n\t" // dx = image->data[2]
672 "movsbw 2(%%ebx), %%ax \n\t" // ax = error[2]
673 "addw %%ax, %%dx \n\t" // pixel.blu = data[2] + error[2]
675 "testb %%dh, %%dh \n\t" // test if pixel.blu < 0 or > 255
676 "jz .OKBb \n\t" // 0 <= pixel.blu <= 255
677 "js .NEGBb \n\t" // pixel.blu < 0
678 "movw $0xff, %%dx \n\t" // pixel.blu > 255
684 "leal (%%edi, %%edx, 2), %%ecx \n\t" // ecx = &ctable[pixel.blu]
686 "movw (%%ecx), %%ax \n\t" // ax = ctable[pixel.blu]
687 "movw %%ax, %18 \n\t" // save bb
689 "mulb %7 \n\t" // ax = bb*db
690 "subw %%ax, %%dx \n\t" // ber = dx = dx - bb*db
691 "movswl %%dx, %%eax \n\t" // save ber
694 "leal (, %%eax, 8), %%ecx \n\t" // ecx = ber * 8
695 "subw %%dx, %%cx \n\t" // cx = ber * 7
696 "sarw $4, %%cx \n\t" // cx = ber * 7 / 16
697 "addb %%cl, 6(%%ebx) \n\t" // err[x+1] += ber * 7 / 16
699 "movl %19, %%ecx \n\t" // ecx = nerr
701 "leaw (%%eax, %%eax, 4), %%dx \n\t" // dx = ber * 5
702 "sarw $4, %%dx \n\t" // dx = ber * 5 / 16
703 "addb %%dl, 2(%%ecx) \n\t" // nerr[x] += ber * 5 / 16
705 "leaw (%%eax, %%eax, 2), %%dx \n\t" // dx = ber * 3
706 "sarw $4, %%dx \n\t" // dx = ber * 3 / 16
707 "addb %%dl, -4(%%ecx) \n\t" // nerr[x-1] += ber * 3 / 16. NOTE(review): red uses offset -4 and green -3, so by that pattern blue would be -2; -4 lands on the red slot -- confirm
709 "sarw $4, %%ax \n\t" // ax = ber / 16
710 "movb %%al, 6(%%ecx) \n\t" // nerr[x+1] = ber / 16
712 "andl $0xffff, %%eax \n\t" // clear the upper word of eax
713 // depends on eax & 0xffff0000 == 0
714 // calculate the index of the value of the pixel
715 "movw %16, %%ax \n\t" // ax = rr
716 "mulb %15 \n\t" // ax = cpcpc*rr
717 "movw %%ax, %%cx \n\t" // cx = cpcpc*rr
718 "movw %17, %%ax \n\t" // ax = gg
719 "mulb %9 \n\t" // ax = cpc*gg
720 "addw %%cx, %%ax \n\t" // ax = cpc*gg + cpcpc*rr
721 "addw %18, %%ax \n\t" // ax = cpcpc*rr + cpc*gg + bb
723 "movl %8, %%ecx \n\t" // ecx = pixels
725 "leal (%%ecx, %%eax, 4), %%edx \n\t" // edx = &pixels[ax], entries addressed with a 4-byte stride
727 "movb (%%edx), %%cl \n\t" // cl = pixels[ax]
730 "movl %1, %%eax \n\t" // eax = ximage
731 "movb %%cl, (%%eax) \n\t" // *ximage = cl
732 "incl %1 \n\t" // ximage++
734 // prepare for next iteration on X
736 "addl $4, %19 \n\t" // nerr += 4
737 "addl $4, %%ebx \n\t" // err += 4
739 "addl %12, %%esi \n\t" // image->data += bpp
741 "decl %14 \n\t" // x--
742 "jnz .LoopXb \n\t" // if x != 0, goto .LoopXb
745 "movl %13, %%eax \n\t" // eax = line_offset
746 "addl %%eax, %1 \n\t" // add extra offset to ximage
767 "m" (bytesPerPixel
), // %12
768 "m" (line_offset
), // %13
774 "m" (tmp_err
), // %19 (used above to hold the next-row error pointer)
775 "m" (tmp_nerr
), // %20 (referenced only by the commented-out lines visible in this excerpt)