wrlib/x86_specific.c

   1 /* x86_specific.c - convert RImage to XImage with x86 optimizations
   2  *
   3  *  Raster graphics library
   4  *
   5  *  Copyright (c) 2000 Alfredo K. Kojima
   6  *
   7  *  This library is free software; you can redistribute it and/or
   8  *  modify it under the terms of the GNU Library General Public
   9  *  License as published by the Free Software Foundation; either
  10  *  version 2 of the License, or (at your option) any later version.
  11  *
  12  *  This library is distributed in the hope that it will be useful,
  13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  *  Library General Public License for more details.
  16  *
  17  *  You should have received a copy of the GNU Library General Public
  18  *  License along with this library; if not, write to the Free
  19  *  Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  20  */
  21
  22 #include <config.h>
  23
  24
  25 #ifdef ASM_X86_MMX
  26
  27 int
  28 x86_check_mmx()
  29 {
  30     static int result = 1;
  31
  32     if (result >= 0)
  33         return result;
  34
  35     result = 0;
  36 #if 0
  37     asm volatile
  38         ("pushfl                \n" // check whether cpuid supported
  39          "pop %%eax             \n"
  40          "movl %%eax, %%ebx     \n"
  41          "xorl 1<<21, %%eax     \n"
  42          "pushfl %%eax          \n"
  43          "popfd                 \n"
  44          "pushfl                \n"
  45          "popl %%eax            \n"
  46          "xorl %%eax, %%ebx     \n"
  47          "andl 1<<21, %%eax     \n"
  48          "jz .NotPentium        \n"
  49          "xorl %%eax, %%eax     \n"
  50
  51          "movl $1, %%eax        \n"
  52          "cpuid                 \n"
  53          "test 1<<23, %%edx     \n"
  54          "jz .NotMMX            \n"
  55          "movl $1, %%0          \n"
  56
  57          ".NotMMX:              \n"
  58          ".Bye:                 \n"
  59          ".NotPentium:          \n"
  60
  61          : "=rm" (result));
  62 #endif
  63     return result;
  64 }
  65
  66
      /*
       * x86_TrueColor_32_to_16 - convert 4-byte-per-pixel image data into
       * 16-bit pixels using hand-written MMX assembly.
       *
       * For every pixel the routine looks up the red, green and blue
       * components in rtable/gtable/btable, shifts the looked-up values left
       * by roffs/goffs/boffs, ORs them together and stores the 16-bit result
       * through ximage.  It also computes a per-component error
       * (pixel - table value * dr/dg/db) and spreads it over neighbouring
       * entries of the err and nerr buffers.
       *
       * Parameters (the number in each trailing comment is the offset from
       * %ebp at which the assembly reads that argument):
       *   image        source pixels; the pointer advances 4 bytes per pixel
       *   ximage       destination; one 16-bit pixel is stored per source
       *                pixel and the pointer advances 2 bytes each time
       *   err, nerr    error buffers with 4 shorts (8 bytes) per pixel; which
       *                one acts as current row and which as next row depends
       *                on the low bit of the row counter
       *   rtable, gtable, btable
       *                tables of shorts indexed by the component value
       *   dr, dg, db   multipliers applied to the table values when the
       *                error is computed
       *   roffs, goffs, boffs
       *                left-shift counts for the three components
       *   width        number of pixels per row
       *   height       number of rows
       *   line_offset  added to the ximage address after every row (it is
       *                added to the raw address, so it counts bytes)
       *
       * The argument slots for ximage (12) and height (64) are used as
       * working variables and are overwritten while the routine runs.
       *
       * NOTE(review): arguments and scratch space are addressed at fixed
       * offsets from %ebp, so this presumably needs a 32-bit x86 build with a
       * conventional %ebp stack frame -- confirm the build flags.
       * NOTE(review): width <= 0 is not handled; the x counter is decremented
       * before it is tested, so a zero width does not end the inner loop
       * after zero pixels.
       * NOTE(review): the error code reads and writes one pixel (8 bytes)
       * before and after the current position in err/nerr; callers presumably
       * pass buffers with slack on both sides -- confirm.
       */
   67 void
   68 x86_TrueColor_32_to_16(unsigned char *image, // 8
   69                        unsigned short *ximage, // 12
   70                        short *err, // 16
   71                        short *nerr, // 20
   72                        short *rtable, // 24
   73                        short *gtable, // 28
   74                        short *btable, // 32
   75                        int dr, // 36
   76                        int dg, // 40
   77                        int db, // 44
   78                        unsigned int roffs, // 48
   79                        unsigned int goffs, // 52
   80                        unsigned int boffs, // 56
   81                        int width, // 60
   82                        int height, // 64
   83                        int line_offset) // 68
   84 {
   85     /*
   86      Scratch slots used by the assembly, as offsets from %ebp.  They are
   87      not declared as C variables; the asm addresses them directly.
   88        -4   int x                  pixels left in the current row
   89        -16  long long rrggbbaa     4 words: table values, also const setup
   90        -24  long long pixel        4 words: pixel components plus error
   91        -32  next-row error pointer, -36  current-row error pointer
   92      */
   92
   93     asm volatile
   94         (
               // NOTE(review): there is no matching addl for this subl inside
               // the asm; %esp is presumably restored from %ebp by the function
               // epilogue -- confirm.
   95          "subl $64, %esp                \n" // alloc some more stack
   96
               // The asm statement has no operand or clobber list, so every
               // general register is saved here and restored by popa at the end.
   97          "pusha                         \n"
   98
   99          // pack dr, dg and db into mm6
  100          "movl  36(%ebp), %eax          \n" // eax = dr
  101          "movl  40(%ebp), %ebx          \n" // ebx = dg
  102          "movw  %ax, -16(%ebp)          \n" // word 0 = dr
  103
  104          "movw  %bx, -14(%ebp)          \n" // word 1 = dg
  105          "movl  44(%ebp), %eax          \n" // eax = db
  106          "movw  $0, -10(%ebp)           \n" // word 3 = 0
  107          "movw  %ax, -12(%ebp)          \n" // word 2 = db
  108
  109          "movq  -16(%ebp), %mm6         \n" // dr dg db 0
  110
  111          // pack 4|4|4|4 into mm7, for shifting (/16)
               // NOTE(review): psrlw with a register operand takes the whole
               // 64-bit register as ONE shift count, not a count per word.  With
               // mm7 = 0x0004000400040004 the count exceeds 15, which would zero
               // the destination instead of dividing by 16 -- verify against the
               // Intel instruction reference.
  112          "movl $0x00040004, -16(%ebp)   \n"
  113          "movl $0x00040004, -12(%ebp)   \n"
  114          "movq -16(%ebp), %mm7          \n"
  115
  116          // store constant values for using with mmx when dithering
  117          "movl $0x00070007, -16(%ebp)   \n"
  118          "movl $0x00070007, -12(%ebp)   \n"
  119          "movq -16(%ebp), %mm5          \n" // mm5 = 7|7|7|7
  120
  121          "movl $0x00050005, -16(%ebp)   \n"
  122          "movl $0x00050005, -12(%ebp)   \n"
  123          "movq -16(%ebp), %mm4          \n" // mm4 = 5|5|5|5
  124
  125          "movl $0x00030003, -16(%ebp)   \n"
  126          "movl $0x00030003, -12(%ebp)   \n"
  127          "movq -16(%ebp), %mm3          \n" // mm3 = 3|3|3|3
  128
  129          // process 1 pixel / cycle, each component treated as 16bit
  130          "movl 8(%ebp), %esi            \n" // esi = image->data
  131
               // ---- row loop: the height argument slot is the row counter ----
  132 ".LoopY:                                \n"
  133          "movl 60(%ebp), %eax           \n"
  134          "movl %eax, -4(%ebp)           \n" // x = width
  135
  136          "movl 64(%ebp), %eax           \n"
  137          "decl %eax                     \n" // y--
  138          "movl %eax, 64(%ebp)           \n" // movl leaves the flags from decl intact
  139          "js .End                       \n" // if y < 0, goto end
  140          "andl $1, %eax                 \n"
  141          "jz .LoopY_1                   \n" // if (y&1) == 0 goto .LoopY_1
  142
               // odd y: err is the current row, nerr the next row
  143 ".LoopY_0:                              \n"
  144
  145          "movl 16(%ebp), %ebx           \n" // ebx = err
  146          "movl %ebx, -36(%ebp)          \n" // [-36] = err
  147          "movl 20(%ebp), %eax           \n" //
  148          "movl %eax, -32(%ebp)          \n" // [-32] = nerr
  149
  150          "jmp .LoopX                    \n"
  151
               // even y: the two buffers swap roles
  152 ".LoopY_1:                              \n"
  153
  154          "movl 20(%ebp), %ebx           \n" // ebx = nerr
  155          "movl %ebx, -36(%ebp)          \n" // [-36] = nerr
  156          "movl 16(%ebp), %eax           \n" //
  157          "movl %eax, -32(%ebp)          \n" // [-32] = err
  158
  159
               // ---- pixel loop: on entry ebx = current-row error pointer ----
  160 ".LoopX:                                \n"
  161
  162          // calculate errors and pixel components
  163
  164          // depend on ebx, esi, mm6
  165          "movq (%ebx), %mm1             \n" // mm1 = error[0..3]
               // punpcklbw puts each image byte in the high byte of a word (the
               // low byte is whatever mm0 held); psrlw $8 then moves the image
               // byte down and discards the old mm0 contents.
  166          "punpcklbw (%esi), %mm0        \n" // mm0 = image->data[0..3]
  167          "psrlw $8, %mm0                \n" // fixup mm0
               // NOTE(review): paddusb is a BYTE-wise unsigned saturating add,
               // while mm0 and mm1 hold 16-bit words here; the high byte of each
               // error word lands in the high byte of the pixel word.  The words
               // are used below as table indexes -- confirm this is intended.
  168          "paddusb %mm1, %mm0            \n" // mm0 = mm0 + mm1 (sat. to 255)
  169          "movq %mm0, -24(%ebp)          \n" // save the pixel
  170
               // NOTE(review): each lookup below loads 32 bits from a table of
               // shorts and keeps only the low 16 (%dx); the load touches 2 bytes
               // past the addressed entry.
  171          "movzwl -24(%ebp), %ecx        \n" // ecx = pixel.red
  172          "movl 24(%ebp), %edi           \n" // edi = rtable
  173          "leal (%edi, %ecx, 2), %eax    \n" // eax = &rtable[pixel.red]
  174          "movl (%eax), %edx             \n" // edx = rtable[pixel.red]
  175          "movw %dx, -16(%ebp)           \n" // save rr
  176
  177          "movzwl -22(%ebp), %ecx        \n" // ecx = pixel.green
  178          "movl 28(%ebp), %edi           \n" // edi = gtable
  179          "leal (%edi, %ecx, 2), %eax    \n" // eax = &gtable[pixel.green]
  180          "movl (%eax), %edx             \n" // edx = gtable[pixel.green]
  181          "movw %dx, -14(%ebp)           \n" // save gg
  182
  183          "movzwl -20(%ebp), %ecx        \n" // ecx = pixel.blue
  184          "movl 32(%ebp), %edi           \n" // edi = btable
  185          "leal (%edi, %ecx, 2), %eax    \n" // eax = &btable[pixel.blue]
  186          "movl (%eax), %edx             \n" // edx = btable[pixel.blue]
  187          "movw %dx, -12(%ebp)           \n" // save bb
  188
  189          "movw $0, -10(%ebp)            \n" // save dummy aa
  190
  191          "movq -16(%ebp), %mm1          \n" // load mm1 with rrggbbaa
  192          "pmullw %mm6, %mm1             \n" // mm1 = rr*dr|...
  193          "psubsw %mm1, %mm0             \n" // error = pixel - mm1
  194
  195
  196          // distribute the error
  197
  198          // depend on mm0, mm7, mm3, mm4, mm5
  199
  200          "movl -36(%ebp), %ebx          \n" // ebx = current-row error pointer
  201
  202          "movq %mm0, %mm1               \n"
  203          "pmullw %mm5, %mm1             \n" // mm1 = mm1*7
  204          "psrlw %mm7, %mm1              \n" // mm1 = mm1/16 (see NOTE at mm7 setup)
  205          "paddw 8(%ebx), %mm1           \n"
  206          "movq %mm1, 8(%ebx)            \n" // err[x+1,y] += rer*7/16
  207
  208
  209          "movl -32(%ebp), %ebx          \n" // ebx = next-row error pointer
  210
               // NOTE(review): this step multiplies by 5 (mm4) and the next one
               // by 3 (mm3); the original comments named the opposite weights
               // (3/16 for x-1, 5/16 for x).  Confirm which pairing is intended.
  211          "movq %mm0, %mm1               \n"
  212          "pmullw %mm4, %mm1             \n" // mm1 = mm1*5
  213          "psrlw %mm7, %mm1              \n" // mm1 = mm1/16 (see NOTE at mm7 setup)
  214          "paddw -8(%ebx), %mm1          \n"
  215          "movq %mm1, -8(%ebx)           \n" // err[x-1,y+1] += rer*5/16 as coded
  216
               // NOTE(review): the add reads 8(%ebx) (x+1) but the store goes to
               // (%ebx) (x), so this is not an in-place += on one entry.
  217          "movq %mm0, %mm1               \n"
  218          "pmullw %mm3, %mm1             \n" // mm1 = mm1*3
  219          "psrlw %mm7, %mm1              \n" // mm1 = mm1/16 (see NOTE at mm7 setup)
  220          "paddw 8(%ebx), %mm1           \n"
  221          "movq %mm1, (%ebx)             \n" // err[x,y+1] = err[x+1,y+1] + rer*3/16 as coded
  222
  223          "psrlw %mm7, %mm0              \n" // mm0 = mm0/16
  224          "movq %mm0, 8(%ebx)            \n" // err[x+1,y+1] = rer/16
  225
  226
  227          // calculate final pixel value and store
               // (uses the table values saved at -16/-14/-12; ebx is scratch here
               // and is reloaded from [-36] before the next iteration)
  228          "movl 48(%ebp), %ecx           \n" // ecx = roffs
  229          "movw -16(%ebp), %ax           \n"
  230          "shlw %cl, %ax                 \n" //NP* ax = r<<roffs
  231
  232          "movl 52(%ebp), %ecx           \n" // ecx = goffs
  233          "movw -14(%ebp), %bx           \n"
  234          "shlw %cl, %bx                 \n" //NP*
  235          "orw %bx, %ax                  \n" // ax |= g<<goffs
  236
  237          "movl 56(%ebp), %ecx           \n" // ecx = boffs
  238          "movw -12(%ebp), %bx           \n"
  239          "shlw %cl, %bx                 \n" //NP*
  240          "orw %bx, %ax                  \n" // ax |= b<<boffs
  241
  242          "movl 12(%ebp), %edx           \n" // edx = ximage
  243          "movw %ax, (%edx)              \n" // store the 16-bit pixel
  244          "addl $2, %edx                 \n" // increment ximage
  245          "movl %edx, 12(%ebp)           \n" // written back into the argument slot
  246
  247          // prepare for next iteration on X
  248
  249          "addl $8, -32(%ebp)            \n" // nerr += 8
  250
  251          "movl -36(%ebp), %ebx          \n"
  252          "addl $8, %ebx                 \n"
  253          "movl %ebx, -36(%ebp)          \n" // ebx = err += 8
  254
  255
  256          // Note: in the last pixel, this would cause an invalid memory access
  257          // because, punpcklbw is used (which reads 8 bytes) and the last
  258          // pixel is only 4 bytes. This is no problem because the image data
  259          // was allocated with extra 4 bytes when created.
  260          "addl $4, %esi                 \n" // image->data += 4
  261
  262
  263          "decl -4(%ebp)                 \n" // x--
  264          "jnz .LoopX                    \n" // if x != 0, goto .LoopX
  265
  266
  267          // depend on edx (still the advanced ximage pointer from the last pixel)
  268          "addl 68(%ebp), %edx           \n" // add extra offset to ximage
  269          "movl %edx, 12(%ebp)           \n"
  270
  271
  272          "jmp .LoopY                    \n"
  273
  274 ".End:                                  \n" // THE END
  275
  276          "emms                          \n" // leave MMX state
  277
  278          "popa                          \n" // restore the registers saved by pusha
  279          );
  280 }
 281
 282
 283
 284 #endif /* ASM_X86_MMX */
 285