src/video/SDL_RLEaccel.c

   1 /*
   2     SDL - Simple DirectMedia Layer
   3     Copyright (C) 1997-2006 Sam Lantinga
   4
   5     This library is free software; you can redistribute it and/or
   6     modify it under the terms of the GNU Lesser General Public
   7     License as published by the Free Software Foundation; either
   8     version 2.1 of the License, or (at your option) any later version.
   9
  10     This library is distributed in the hope that it will be useful,
  11     but WITHOUT ANY WARRANTY; without even the implied warranty of
  12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13     Lesser General Public License for more details.
  14
  15     You should have received a copy of the GNU Lesser General Public
  16     License along with this library; if not, write to the Free Software
  17     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  18
  19     Sam Lantinga
  20     slouken@libsdl.org
  21 */
  22 #include "SDL_config.h"
  23
  24 /*
  25  * RLE encoding for software colorkey and alpha-channel acceleration
  26  *
  27  * Original version by Sam Lantinga
  28  *
  29  * Mattias Engdegård (Yorick): Rewrite. New encoding format, encoder and
  30  * decoder. Added per-surface alpha blitter. Added per-pixel alpha
  31  * format, encoder and blitter.
  32  *
  33  * Many thanks to Xark and johns for hints, benchmarks and useful comments
  34  * leading to this code.
  35  *
  36  * Welcome to Macro Mayhem.
  37  */
  38
  39 /*
  40  * The encoding translates the image data to a stream of segments of the form
  41  *
  42  * <skip> <run> <data>
  43  *
  44  * where <skip> is the number of transparent pixels to skip,
  45  *       <run>  is the number of opaque pixels to blit,
  46  * and   <data> are the pixels themselves.
  47  *
  48  * This basic structure is used both for colorkeyed surfaces, used for simple
  49  * binary transparency and for per-surface alpha blending, and for surfaces
  50  * with per-pixel alpha. The details differ, however:
  51  *
  52  * Encoding of colorkeyed surfaces:
  53  *
  54  *   Encoded pixels always have the same format as the target surface.
  55  *   <skip> and <run> are unsigned 8 bit integers, except for 32 bit depth
  56  *   where they are 16 bit. This makes the pixel data aligned at all times.
  57  *   Segments never wrap around from one scan line to the next.
  58  *
   59  *   The end of the sequence is marked by a zero <skip>,<run> pair at the
   60  *   beginning of a line.
  61  *
  62  * Encoding of surfaces with per-pixel alpha:
  63  *
  64  *   The sequence begins with a struct RLEDestFormat describing the target
  65  *   pixel format, to provide reliable un-encoding.
  66  *
  67  *   Each scan line is encoded twice: First all completely opaque pixels,
  68  *   encoded in the target format as described above, and then all
  69  *   partially transparent (translucent) pixels (where 1 <= alpha <= 254),
  70  *   in the following 32-bit format:
  71  *
  72  *   For 32-bit targets, each pixel has the target RGB format but with
  73  *   the alpha value occupying the highest 8 bits. The <skip> and <run>
  74  *   counts are 16 bit.
  75  *
  76  *   For 16-bit targets, each pixel has the target RGB format, but with
  77  *   the middle component (usually green) shifted 16 steps to the left,
  78  *   and the hole filled with the 5 most significant bits of the alpha value.
  79  *   i.e. if the target has the format         rrrrrggggggbbbbb,
  80  *   the encoded pixel will be 00000gggggg00000rrrrr0aaaaabbbbb.
  81  *   The <skip> and <run> counts are 8 bit for the opaque lines, 16 bit
  82  *   for the translucent lines. Two padding bytes may be inserted
  83  *   before each translucent line to keep them 32-bit aligned.
  84  *
  85  *   The end of the sequence is marked by a zero <skip>,<run> pair at the
  86  *   beginning of an opaque line.
  87  */
  88
  89 #include "SDL_video.h"
  90 #include "SDL_sysvideo.h"
  91 #include "SDL_blit.h"
  92 #include "SDL_RLEaccel_c.h"
  93
  94 #if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) && SDL_ASSEMBLY_ROUTINES
  95 #define MMX_ASMBLIT
  96 #endif
  97
  98 #ifdef MMX_ASMBLIT
  99 #include "mmx.h"
 100 #include "SDL_cpuinfo.h"
 101 #endif
 102
 103 #ifndef MAX
 104 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 105 #endif
 106 #ifndef MIN
 107 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 108 #endif
 109
 110 #define PIXEL_COPY(to, from, len, bpp)                  \
 111 do {                                                    \
 112     if(bpp == 4) {                                      \
 113         SDL_memcpy4(to, from, (size_t)(len));           \
 114     } else {                                            \
 115         SDL_memcpy(to, from, (size_t)(len) * (bpp));    \
 116     }                                                   \
 117 } while(0)
 118
 119 /*
 120  * Various colorkey blit methods, for opaque and per-surface alpha
 121  */
 122
 123 #define OPAQUE_BLIT(to, from, length, bpp, alpha)       \
 124     PIXEL_COPY(to, from, length, bpp)
 125
 126 #ifdef MMX_ASMBLIT
 127
 128 #define ALPHA_BLIT32_888MMX(to, from, length, bpp, alpha)       \
 129     do {                                                        \
 130         Uint32 *srcp = (Uint32 *)(from);                        \
 131         Uint32 *dstp = (Uint32 *)(to);                          \
 132         int i = 0x00FF00FF;                                     \
 133         movd_m2r(*(&i), mm3);                                   \
 134         punpckldq_r2r(mm3, mm3);                                \
 135         i = 0xFF000000;                                         \
 136         movd_m2r(*(&i), mm7);                                   \
 137         punpckldq_r2r(mm7, mm7);                                \
 138         i = alpha | alpha << 16;                                \
 139         movd_m2r(*(&i), mm4);                                   \
 140         punpckldq_r2r(mm4, mm4);                                \
 141         pcmpeqd_r2r(mm5,mm5); /* set mm5 to "1" */              \
 142         pxor_r2r(mm7, mm5); /* make clear alpha mask */         \
 143         i = length;                                             \
 144         if(i & 1) {                                             \
 145           movd_m2r((*srcp), mm1); /* src -> mm1 */              \
 146           punpcklbw_r2r(mm1, mm1);                              \
 147           pand_r2r(mm3, mm1);                                   \
 148           movd_m2r((*dstp), mm2); /* dst -> mm2 */              \
 149           punpcklbw_r2r(mm2, mm2);                              \
 150           pand_r2r(mm3, mm2);                                   \
 151           psubw_r2r(mm2, mm1);                                  \
 152           pmullw_r2r(mm4, mm1);                                 \
 153           psrlw_i2r(8, mm1);                                    \
 154           paddw_r2r(mm1, mm2);                                  \
 155           pand_r2r(mm3, mm2);                                   \
 156           packuswb_r2r(mm2, mm2);                               \
 157           pand_r2r(mm5, mm2); /* 00000RGB -> mm2 */             \
 158           movd_r2m(mm2, *dstp);                                 \
 159           ++srcp;                                               \
 160           ++dstp;                                               \
 161           i--;                                                  \
 162         }                                                       \
 163         for(; i > 0; --i) {                                     \
 164           movq_m2r((*srcp), mm0);                               \
 165           movq_r2r(mm0, mm1);                                   \
 166           punpcklbw_r2r(mm0, mm0);                              \
 167           movq_m2r((*dstp), mm2);                               \
 168           punpckhbw_r2r(mm1, mm1);                              \
 169           movq_r2r(mm2, mm6);                                   \
 170           pand_r2r(mm3, mm0);                                   \
 171           punpcklbw_r2r(mm2, mm2);                              \
 172           pand_r2r(mm3, mm1);                                   \
 173           punpckhbw_r2r(mm6, mm6);                              \
 174           pand_r2r(mm3, mm2);                                   \
 175           psubw_r2r(mm2, mm0);                                  \
 176           pmullw_r2r(mm4, mm0);                                 \
 177           pand_r2r(mm3, mm6);                                   \
 178           psubw_r2r(mm6, mm1);                                  \
 179           pmullw_r2r(mm4, mm1);                                 \
 180           psrlw_i2r(8, mm0);                                    \
 181           paddw_r2r(mm0, mm2);                                  \
 182           psrlw_i2r(8, mm1);                                    \
 183           paddw_r2r(mm1, mm6);                                  \
 184           pand_r2r(mm3, mm2);                                   \
 185           pand_r2r(mm3, mm6);                                   \
 186           packuswb_r2r(mm2, mm2);                               \
 187           packuswb_r2r(mm6, mm6);                               \
 188           psrlq_i2r(32, mm2);                                   \
 189           psllq_i2r(32, mm6);                                   \
 190           por_r2r(mm6, mm2);                                    \
 191           pand_r2r(mm5, mm2); /* 00000RGB -> mm2 */             \
 192          movq_r2m(mm2, *dstp);                                  \
 193           srcp += 2;                                            \
 194           dstp += 2;                                            \
 195           i--;                                                  \
 196         }                                                       \
 197         emms();                                                 \
 198     } while(0)
 199
 200 #define ALPHA_BLIT16_565MMX(to, from, length, bpp, alpha)       \
 201     do {                                                \
 202         int i, n = 0;                                   \
 203         Uint16 *srcp = (Uint16 *)(from);                \
 204         Uint16 *dstp = (Uint16 *)(to);                  \
 205         Uint32 ALPHA = 0xF800;                          \
 206         movd_m2r(*(&ALPHA), mm1);                       \
 207         punpcklwd_r2r(mm1, mm1);                        \
 208         punpcklwd_r2r(mm1, mm1);                        \
 209         ALPHA = 0x07E0;                                 \
 210         movd_m2r(*(&ALPHA), mm4);                       \
 211         punpcklwd_r2r(mm4, mm4);                        \
 212         punpcklwd_r2r(mm4, mm4);                        \
 213         ALPHA = 0x001F;                                 \
 214         movd_m2r(*(&ALPHA), mm7);                       \
 215         punpcklwd_r2r(mm7, mm7);                        \
 216         punpcklwd_r2r(mm7, mm7);                        \
 217         alpha &= ~(1+2+4);                              \
 218         i = (Uint32)alpha | (Uint32)alpha << 16;        \
 219         movd_m2r(*(&i), mm0);                           \
 220         punpckldq_r2r(mm0, mm0);                        \
 221         ALPHA = alpha >> 3;                             \
 222         i = ((int)(length) & 3);                        \
 223         for(; i > 0; --i) {                             \
 224             Uint32 s = *srcp++;                         \
 225             Uint32 d = *dstp;                           \
 226             s = (s | s << 16) & 0x07e0f81f;             \
 227             d = (d | d << 16) & 0x07e0f81f;             \
 228             d += (s - d) * ALPHA >> 5;                  \
 229             d &= 0x07e0f81f;                            \
 230             *dstp++ = d | d >> 16;                      \
 231             n++;                                        \
 232         }                                               \
 233         i = (int)(length) - n;                          \
 234         for(; i > 0; --i) {                             \
 235           movq_m2r((*dstp), mm3);                       \
 236           movq_m2r((*srcp), mm2);                       \
 237           movq_r2r(mm2, mm5);                           \
 238           pand_r2r(mm1 , mm5);                          \
 239           psrlq_i2r(11, mm5);                           \
 240           movq_r2r(mm3, mm6);                           \
 241           pand_r2r(mm1 , mm6);                          \
 242           psrlq_i2r(11, mm6);                           \
 243           psubw_r2r(mm6, mm5);                          \
 244           pmullw_r2r(mm0, mm5);                         \
 245           psrlw_i2r(8, mm5);                            \
 246           paddw_r2r(mm5, mm6);                          \
 247           psllq_i2r(11, mm6);                           \
 248           pand_r2r(mm1, mm6);                           \
 249           movq_r2r(mm4, mm5);                           \
 250           por_r2r(mm7, mm5);                            \
 251           pand_r2r(mm5, mm3);                           \
 252           por_r2r(mm6, mm3);                            \
 253           movq_r2r(mm2, mm5);                           \
 254           pand_r2r(mm4 , mm5);                          \
 255           psrlq_i2r(5, mm5);                            \
 256           movq_r2r(mm3, mm6);                           \
 257           pand_r2r(mm4 , mm6);                          \
 258           psrlq_i2r(5, mm6);                            \
 259           psubw_r2r(mm6, mm5);                          \
 260           pmullw_r2r(mm0, mm5);                         \
 261           psrlw_i2r(8, mm5);                            \
 262           paddw_r2r(mm5, mm6);                          \
 263           psllq_i2r(5, mm6);                            \
 264           pand_r2r(mm4, mm6);                           \
 265           movq_r2r(mm1, mm5);                           \
 266           por_r2r(mm7, mm5);                            \
 267           pand_r2r(mm5, mm3);                           \
 268           por_r2r(mm6, mm3);                            \
 269           movq_r2r(mm2, mm5);                           \
 270           pand_r2r(mm7 , mm5);                          \
 271           movq_r2r(mm3, mm6);                           \
 272           pand_r2r(mm7 , mm6);                          \
 273           psubw_r2r(mm6, mm5);                          \
 274           pmullw_r2r(mm0, mm5);                         \
 275           psrlw_i2r(8, mm5);                            \
 276           paddw_r2r(mm5, mm6);                          \
 277           pand_r2r(mm7, mm6);                           \
 278           movq_r2r(mm1, mm5);                           \
 279           por_r2r(mm4, mm5);                            \
 280           pand_r2r(mm5, mm3);                           \
 281           por_r2r(mm6, mm3);                            \
 282           movq_r2m(mm3, *dstp);                         \
 283           srcp += 4;                                    \
 284           dstp += 4;                                    \
 285           i -= 3;                                       \
 286         }                                               \
 287         emms();                                         \
 288     } while(0)
 289
 290 #define ALPHA_BLIT16_555MMX(to, from, length, bpp, alpha)       \
 291     do {                                                \
 292         int i, n = 0;                                   \
 293         Uint16 *srcp = (Uint16 *)(from);                \
 294         Uint16 *dstp = (Uint16 *)(to);                  \
 295         Uint32 ALPHA = 0x7C00;                          \
 296         movd_m2r(*(&ALPHA), mm1);                       \
 297         punpcklwd_r2r(mm1, mm1);                        \
 298         punpcklwd_r2r(mm1, mm1);                        \
 299         ALPHA = 0x03E0;                                 \
 300         movd_m2r(*(&ALPHA), mm4);                       \
 301         punpcklwd_r2r(mm4, mm4);                        \
 302         punpcklwd_r2r(mm4, mm4);                        \
 303         ALPHA = 0x001F;                                 \
 304         movd_m2r(*(&ALPHA), mm7);                       \
 305         punpcklwd_r2r(mm7, mm7);                        \
 306         punpcklwd_r2r(mm7, mm7);                        \
 307         alpha &= ~(1+2+4);                              \
 308         i = (Uint32)alpha | (Uint32)alpha << 16;        \
 309         movd_m2r(*(&i), mm0);                           \
 310         punpckldq_r2r(mm0, mm0);                        \
 311         i = ((int)(length) & 3);                                \
 312         ALPHA = alpha >> 3;                             \
 313         for(; i > 0; --i) {                             \
 314             Uint32 s = *srcp++;                         \
 315             Uint32 d = *dstp;                           \
 316             s = (s | s << 16) & 0x03e07c1f;             \
 317             d = (d | d << 16) & 0x03e07c1f;             \
 318             d += (s - d) * ALPHA >> 5;                  \
 319             d &= 0x03e07c1f;                            \
 320             *dstp++ = d | d >> 16;                      \
 321             n++;                                        \
 322         }                                               \
 323         i = (int)(length) - n;                          \
 324         for(; i > 0; --i) {                             \
 325           movq_m2r((*dstp), mm3);                       \
 326           movq_m2r((*srcp), mm2);                       \
 327           movq_r2r(mm2, mm5);                           \
 328           pand_r2r(mm1 , mm5);                          \
 329           psrlq_i2r(10, mm5);                           \
 330           movq_r2r(mm3, mm6);                           \
 331           pand_r2r(mm1 , mm6);                          \
 332           psrlq_i2r(10, mm6);                           \
 333           psubw_r2r(mm6, mm5);                          \
 334           pmullw_r2r(mm0, mm5);                         \
 335           psrlw_i2r(8, mm5);                            \
 336           paddw_r2r(mm5, mm6);                          \
 337           psllq_i2r(10, mm6);                           \
 338           pand_r2r(mm1, mm6);                           \
 339           movq_r2r(mm4, mm5);                           \
 340           por_r2r(mm7, mm5);                            \
 341           pand_r2r(mm5, mm3);                           \
 342           por_r2r(mm6, mm3);                            \
 343           movq_r2r(mm2, mm5);                           \
 344           pand_r2r(mm4 , mm5);                          \
 345           psrlq_i2r(5, mm5);                            \
 346           movq_r2r(mm3, mm6);                           \
 347           pand_r2r(mm4 , mm6);                          \
 348           psrlq_i2r(5, mm6);                            \
 349           psubw_r2r(mm6, mm5);                          \
 350           pmullw_r2r(mm0, mm5);                         \
 351           psrlw_i2r(8, mm5);                            \
 352           paddw_r2r(mm5, mm6);                          \
 353           psllq_i2r(5, mm6);                            \
 354           pand_r2r(mm4, mm6);                           \
 355           movq_r2r(mm1, mm5);                           \
 356           por_r2r(mm7, mm5);                            \
 357           pand_r2r(mm5, mm3);                           \
 358           por_r2r(mm6, mm3);                            \
 359           movq_r2r(mm2, mm5);                           \
 360           pand_r2r(mm7 , mm5);                          \
 361           movq_r2r(mm3, mm6);                           \
 362           pand_r2r(mm7 , mm6);                          \
 363           psubw_r2r(mm6, mm5);                          \
 364           pmullw_r2r(mm0, mm5);                         \
 365           psrlw_i2r(8, mm5);                            \
 366           paddw_r2r(mm5, mm6);                          \
 367           pand_r2r(mm7, mm6);                           \
 368           movq_r2r(mm1, mm5);                           \
 369           por_r2r(mm4, mm5);                            \
 370           pand_r2r(mm5, mm3);                           \
 371           por_r2r(mm6, mm3);                            \
 372           movq_r2m(mm3, *dstp);                         \
 373           srcp += 4;                                    \
 374           dstp += 4;                                    \
 375           i -= 3;                                       \
 376         }                                               \
 377         emms();                                         \
 378     } while(0)
 379
 380 #endif
 381
 382 /*
 383  * For 32bpp pixels on the form 0x00rrggbb:
 384  * If we treat the middle component separately, we can process the two
 385  * remaining in parallel. This is safe to do because of the gap to the left
 386  * of each component, so the bits from the multiplication don't collide.
 387  * This can be used for any RGB permutation of course.
 388  */
 389 #define ALPHA_BLIT32_888(to, from, length, bpp, alpha)          \
 390     do {                                                        \
 391         int i;                                                  \
 392         Uint32 *src = (Uint32 *)(from);                         \
 393         Uint32 *dst = (Uint32 *)(to);                           \
 394         for(i = 0; i < (int)(length); i++) {                    \
 395             Uint32 s = *src++;                                  \
 396             Uint32 d = *dst;                                    \
 397             Uint32 s1 = s & 0xff00ff;                           \
 398             Uint32 d1 = d & 0xff00ff;                           \
 399             d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;    \
 400             s &= 0xff00;                                        \
 401             d &= 0xff00;                                        \
 402             d = (d + ((s - d) * alpha >> 8)) & 0xff00;          \
 403             *dst++ = d1 | d;                                    \
 404         }                                                       \
 405     } while(0)
 406
 407 /*
 408  * For 16bpp pixels we can go a step further: put the middle component
 409  * in the high 16 bits of a 32 bit word, and process all three RGB
 410  * components at the same time. Since the smallest gap is here just
 411  * 5 bits, we have to scale alpha down to 5 bits as well.
 412  */
 413 #define ALPHA_BLIT16_565(to, from, length, bpp, alpha)  \
 414     do {                                                \
 415         int i;                                          \
 416         Uint16 *src = (Uint16 *)(from);                 \
 417         Uint16 *dst = (Uint16 *)(to);                   \
 418         Uint32 ALPHA = alpha >> 3;                      \
 419         for(i = 0; i < (int)(length); i++) {            \
 420             Uint32 s = *src++;                          \
 421             Uint32 d = *dst;                            \
 422             s = (s | s << 16) & 0x07e0f81f;             \
 423             d = (d | d << 16) & 0x07e0f81f;             \
 424             d += (s - d) * ALPHA >> 5;                  \
 425             d &= 0x07e0f81f;                            \
 426             *dst++ = (Uint16)(d | d >> 16);                     \
 427         }                                               \
 428     } while(0)
 429
 430 #define ALPHA_BLIT16_555(to, from, length, bpp, alpha)  \
 431     do {                                                \
 432         int i;                                          \
 433         Uint16 *src = (Uint16 *)(from);                 \
 434         Uint16 *dst = (Uint16 *)(to);                   \
 435         Uint32 ALPHA = alpha >> 3;                      \
 436         for(i = 0; i < (int)(length); i++) {            \
 437             Uint32 s = *src++;                          \
 438             Uint32 d = *dst;                            \
 439             s = (s | s << 16) & 0x03e07c1f;             \
 440             d = (d | d << 16) & 0x03e07c1f;             \
 441             d += (s - d) * ALPHA >> 5;                  \
 442             d &= 0x03e07c1f;                            \
 443             *dst++ = (Uint16)(d | d >> 16);                     \
 444         }                                               \
 445     } while(0)
 446
 447 /*
 448  * The general slow catch-all function, for remaining depths and formats
 449  */
 450 #define ALPHA_BLIT_ANY(to, from, length, bpp, alpha)                    \
 451     do {                                                                \
 452         int i;                                                          \
 453         Uint8 *src = from;                                              \
 454         Uint8 *dst = to;                                                \
 455         for(i = 0; i < (int)(length); i++) {                            \
 456             Uint32 s, d;                                                \
 457             unsigned rs, gs, bs, rd, gd, bd;                            \
 458             switch(bpp) {                                               \
 459             case 2:                                                     \
 460                 s = *(Uint16 *)src;                                     \
 461                 d = *(Uint16 *)dst;                                     \
 462                 break;                                                  \
 463             case 3:                                                     \
 464                 if(SDL_BYTEORDER == SDL_BIG_ENDIAN) {                   \
 465                     s = (src[0] << 16) | (src[1] << 8) | src[2];        \
 466                     d = (dst[0] << 16) | (dst[1] << 8) | dst[2];        \
 467                 } else {                                                \
 468                     s = (src[2] << 16) | (src[1] << 8) | src[0];        \
 469                     d = (dst[2] << 16) | (dst[1] << 8) | dst[0];        \
 470                 }                                                       \
 471                 break;                                                  \
 472             case 4:                                                     \
 473                 s = *(Uint32 *)src;                                     \
 474                 d = *(Uint32 *)dst;                                     \
 475                 break;                                                  \
 476             }                                                           \
 477             RGB_FROM_PIXEL(s, fmt, rs, gs, bs);                         \
 478             RGB_FROM_PIXEL(d, fmt, rd, gd, bd);                         \
 479             rd += (rs - rd) * alpha >> 8;                               \
 480             gd += (gs - gd) * alpha >> 8;                               \
 481             bd += (bs - bd) * alpha >> 8;                               \
 482             PIXEL_FROM_RGB(d, fmt, rd, gd, bd);                         \
 483             switch(bpp) {                                               \
 484             case 2:                                                     \
 485                 *(Uint16 *)dst = (Uint16)d;                                     \
 486                 break;                                                  \
 487             case 3:                                                     \
 488                 if(SDL_BYTEORDER == SDL_BIG_ENDIAN) {                   \
 489                     dst[0] = (Uint8)(d >> 16);                                  \
 490                     dst[1] = (Uint8)(d >> 8);                                   \
 491                     dst[2] = (Uint8)(d);                                                \
 492                 } else {                                                \
 493                     dst[0] = (Uint8)d;                                          \
 494                     dst[1] = (Uint8)(d >> 8);                                   \
 495                     dst[2] = (Uint8)(d >> 16);                                  \
 496                 }                                                       \
 497                 break;                                                  \
 498             case 4:                                                     \
 499                 *(Uint32 *)dst = d;                                     \
 500                 break;                                                  \
 501             }                                                           \
 502             src += bpp;                                                 \
 503             dst += bpp;                                                 \
 504         }                                                               \
 505     } while(0)
 506
 507 #ifdef MMX_ASMBLIT
 508
 509 #define ALPHA_BLIT32_888_50MMX(to, from, length, bpp, alpha)            \
 510     do {                                                                \
 511         Uint32 *srcp = (Uint32 *)(from);                                \
 512         Uint32 *dstp = (Uint32 *)(to);                                  \
 513         int i = 0x00fefefe;                                             \
 514         movd_m2r(*(&i), mm4);                                           \
 515         punpckldq_r2r(mm4, mm4);                                        \
 516         i = 0x00010101;                                                 \
 517         movd_m2r(*(&i), mm3);                                           \
 518         punpckldq_r2r(mm3, mm3);                                        \
 519         i = (int)(length);                                              \
 520         if( i & 1 ) {                                                   \
 521           Uint32 s = *srcp++;                                           \
 522           Uint32 d = *dstp;                                             \
 523           *dstp++ = (((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)        \
 524                      + (s & d & 0x00010101);                            \
 525           i--;                                                          \
 526         }                                                               \
 527         for(; i > 0; --i) {                                             \
 528             movq_m2r((*dstp), mm2); /* dst -> mm2 */                    \
 529             movq_r2r(mm2, mm6); /* dst -> mm6 */                        \
 530             movq_m2r((*srcp), mm1); /* src -> mm1 */                    \
 531             movq_r2r(mm1, mm5); /* src -> mm5 */                        \
 532             pand_r2r(mm4, mm6); /* dst & 0x00fefefe -> mm6 */           \
 533             pand_r2r(mm4, mm5); /* src & 0x00fefefe -> mm5 */           \
 534             paddd_r2r(mm6, mm5); /* (dst & 0x00fefefe) + (dst & 0x00fefefe) -> mm5 */   \
 535             psrld_i2r(1, mm5);                                          \
 536             pand_r2r(mm1, mm2); /* s & d -> mm2 */                      \
 537             pand_r2r(mm3, mm2); /* s & d & 0x00010101 -> mm2 */         \
 538             paddd_r2r(mm5, mm2);                                        \
 539             movq_r2m(mm2, (*dstp));                                     \
 540             dstp += 2;                                                  \
 541             srcp += 2;                                                  \
 542             i--;                                                        \
 543         }                                                               \
 544         emms();                                                         \
 545     } while(0)
 546
 547 #endif
 548
 549 /*
 550  * Special case: 50% alpha (alpha=128)
 551  * This is treated specially because it can be optimized very well, and
 552  * since it is good for many cases of semi-translucency.
 553  * The theory is to do all three components at the same time:
 554  * First zero the lowest bit of each component, which gives us room to
 555  * add them. Then shift right and add the sum of the lowest bits.
 556  */
 557 #define ALPHA_BLIT32_888_50(to, from, length, bpp, alpha)               \
 558     do {                                                                \
 559         int i;                                                          \
 560         Uint32 *src = (Uint32 *)(from);                                 \
 561         Uint32 *dst = (Uint32 *)(to);                                   \
 562         for(i = 0; i < (int)(length); i++) {                            \
 563             Uint32 s = *src++;                                          \
 564             Uint32 d = *dst;                                            \
 565             *dst++ = (((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)       \
 566                      + (s & d & 0x00010101);                            \
 567         }                                                               \
 568     } while(0)
 569
 570 /*
 571  * For 16bpp, we can actually blend two pixels in parallel, if we take
 572  * care to shift before we add, not after.
 573  */
 574
 575 /* helper: blend a single 16 bit pixel at 50% */
 576 #define BLEND16_50(dst, src, mask)                      \
 577     do {                                                \
 578         Uint32 s = *src++;                              \
 579         Uint32 d = *dst;                                \
 580         *dst++ = (Uint16)((((s & mask) + (d & mask)) >> 1) +    \
 581                           (s & d & (~mask & 0xffff)));          \
 582     } while(0)
 583
 584 /* basic 16bpp blender. mask is the pixels to keep when adding. */
 585 #define ALPHA_BLIT16_50(to, from, length, bpp, alpha, mask)             \
 586     do {                                                                \
 587         unsigned n = (length);                                          \
 588         Uint16 *src = (Uint16 *)(from);                                 \
 589         Uint16 *dst = (Uint16 *)(to);                                   \
 590         if(((uintptr_t)src ^ (uintptr_t)dst) & 3) {                     \
 591             /* source and destination not in phase, blit one by one */  \
 592             while(n--)                                                  \
 593                 BLEND16_50(dst, src, mask);                             \
 594         } else {                                                        \
 595             if((uintptr_t)src & 3) {                                    \
 596                 /* first odd pixel */                                   \
 597                 BLEND16_50(dst, src, mask);                             \
 598                 n--;                                                    \
 599             }                                                           \
 600             for(; n > 1; n -= 2) {                                      \
 601                 Uint32 s = *(Uint32 *)src;                              \
 602                 Uint32 d = *(Uint32 *)dst;                              \
 603                 *(Uint32 *)dst = ((s & (mask | mask << 16)) >> 1)       \
 604                                + ((d & (mask | mask << 16)) >> 1)       \
 605                                + (s & d & (~(mask | mask << 16)));      \
 606                 src += 2;                                               \
 607                 dst += 2;                                               \
 608             }                                                           \
 609             if(n)                                                       \
 610                 BLEND16_50(dst, src, mask); /* last odd pixel */        \
 611         }                                                               \
 612     } while(0)
 613
/*
 * 50% blend for 5-6-5 pixels: mask 0xf7de is 0xffff with bits 0, 5 and 11
 * cleared, i.e. the lowest bit of each of the three colour fields.
 */
#define ALPHA_BLIT16_565_50(to, from, length, bpp, alpha)       \
    ALPHA_BLIT16_50(to, from, length, bpp, alpha, 0xf7de)

/*
 * 50% blend for 5-5-5 pixels: mask 0xfbde is 0xffff with bits 0, 5 and 10
 * cleared, i.e. the lowest bit of each of the three colour fields.
 */
#define ALPHA_BLIT16_555_50(to, from, length, bpp, alpha)       \
    ALPHA_BLIT16_50(to, from, length, bpp, alpha, 0xfbde)
 619
 620 #ifdef MMX_ASMBLIT
 621
/*
 * Pick the pixel operation for a blit and expand `blitter` with it
 * (variant built when MMX_ASMBLIT is defined).
 *
 * blitter: a macro invoked as blitter(bpp, Type, do_blit). bpp is the
 *          literal bytes-per-pixel, Type is Uint8 for 1-3 bytes per pixel
 *          and Uint16 for 4, do_blit is the pixel operation macro.
 * alpha:   per-surface alpha; 255 selects OPAQUE_BLIT for every depth.
 * fmt:     pointer to the pixel format whose BytesPerPixel and R/G/B masks
 *          drive the selection.
 *
 * For alpha != 255:
 *  - 1 byte per pixel: nothing is expanded (no 8bpp alpha blitting).
 *  - 2 bytes per pixel: 565 and 555 layouts get dedicated blitters, with
 *    the 50% blender when alpha == 128 and otherwise the MMX blitter when
 *    SDL_HasMMX() is nonzero; any other layout uses ALPHA_BLIT_ANY.
 *  - 3 bytes per pixel: ALPHA_BLIT_ANY.
 *  - 4 bytes per pixel: 888 layouts (colour masks cover 0x00ffffff and one
 *    of them is 0xff00) get the 888 blitters, again with a 50% special
 *    case and MMX variants; anything else uses ALPHA_BLIT_ANY.
 *
 * The expansion defines the label `general16`, so it can appear at most
 * once per function.
 */
#define CHOOSE_BLIT(blitter, alpha, fmt)                                \
    do {                                                                \
        if(alpha == 255) {                                              \
            switch(fmt->BytesPerPixel) {                                \
            case 1: blitter(1, Uint8, OPAQUE_BLIT); break;              \
            case 2: blitter(2, Uint8, OPAQUE_BLIT); break;              \
            case 3: blitter(3, Uint8, OPAQUE_BLIT); break;              \
            case 4: blitter(4, Uint16, OPAQUE_BLIT); break;             \
            }                                                           \
        } else {                                                        \
            switch(fmt->BytesPerPixel) {                                \
            case 1:                                                     \
                /* No 8bpp alpha blitting */                            \
                break;                                                  \
                                                                        \
            case 2:                                                     \
                switch(fmt->Rmask | fmt->Gmask | fmt->Bmask) {          \
                case 0xffff:                                            \
                    if(fmt->Gmask == 0x07e0                             \
                       || fmt->Rmask == 0x07e0                          \
                       || fmt->Bmask == 0x07e0) {                       \
                        if(alpha == 128)                                \
                            blitter(2, Uint8, ALPHA_BLIT16_565_50);     \
                        else {                                          \
                            if(SDL_HasMMX())                            \
                                blitter(2, Uint8, ALPHA_BLIT16_565MMX); \
                            else                                        \
                                blitter(2, Uint8, ALPHA_BLIT16_565);    \
                        }                                               \
                    } else                                              \
                        goto general16;                                 \
                    break;                                              \
                                                                        \
                case 0x7fff:                                            \
                    if(fmt->Gmask == 0x03e0                             \
                       || fmt->Rmask == 0x03e0                          \
                       || fmt->Bmask == 0x03e0) {                       \
                        if(alpha == 128)                                \
                            blitter(2, Uint8, ALPHA_BLIT16_555_50);     \
                        else {                                          \
                            if(SDL_HasMMX())                            \
                                blitter(2, Uint8, ALPHA_BLIT16_555MMX); \
                            else                                        \
                                blitter(2, Uint8, ALPHA_BLIT16_555);    \
                        }                                               \
                        break;                                          \
                    }                                                   \
                    /* fallthrough */                                   \
                                                                        \
                default:                                                \
                general16:                                              \
                    blitter(2, Uint8, ALPHA_BLIT_ANY);                  \
                }                                                       \
                break;                                                  \
                                                                        \
            case 3:                                                     \
                blitter(3, Uint8, ALPHA_BLIT_ANY);                      \
                break;                                                  \
                                                                        \
            case 4:                                                     \
                if((fmt->Rmask | fmt->Gmask | fmt->Bmask) == 0x00ffffff \
                   && (fmt->Gmask == 0xff00 || fmt->Rmask == 0xff00     \
                       || fmt->Bmask == 0xff00)) {                      \
                    if(alpha == 128)                                    \
                    {                                                   \
                        if(SDL_HasMMX())                                \
                                blitter(4, Uint16, ALPHA_BLIT32_888_50MMX);\
                        else                                            \
                                blitter(4, Uint16, ALPHA_BLIT32_888_50);\
                    }                                                   \
                    else                                                \
                    {                                                   \
                        if(SDL_HasMMX())                                \
                                blitter(4, Uint16, ALPHA_BLIT32_888MMX);\
                        else                                            \
                                blitter(4, Uint16, ALPHA_BLIT32_888);   \
                    }                                                   \
                } else                                                  \
                    blitter(4, Uint16, ALPHA_BLIT_ANY);                 \
                break;                                                  \
            }                                                           \
        }                                                               \
    } while(0)
 705
 706 #else
 707
/*
 * Pick the pixel operation for a blit and expand `blitter` with it
 * (variant built when MMX_ASMBLIT is not defined).
 *
 * blitter: a macro invoked as blitter(bpp, Type, do_blit). bpp is the
 *          literal bytes-per-pixel, Type is Uint8 for 1-3 bytes per pixel
 *          and Uint16 for 4, do_blit is the pixel operation macro.
 * alpha:   per-surface alpha; 255 selects OPAQUE_BLIT for every depth.
 * fmt:     pointer to the pixel format whose BytesPerPixel and R/G/B masks
 *          drive the selection.
 *
 * For alpha != 255:
 *  - 1 byte per pixel: nothing is expanded (no 8bpp alpha blitting).
 *  - 2 bytes per pixel: 565 and 555 layouts get dedicated blitters, with
 *    the 50% blender when alpha == 128; any other layout uses
 *    ALPHA_BLIT_ANY.
 *  - 3 bytes per pixel: ALPHA_BLIT_ANY.
 *  - 4 bytes per pixel: 888 layouts (colour masks cover 0x00ffffff and one
 *    of them is 0xff00) get the 888 blitters, again with a 50% special
 *    case; anything else uses ALPHA_BLIT_ANY.
 *
 * The expansion defines the label `general16`, so it can appear at most
 * once per function.
 */
#define CHOOSE_BLIT(blitter, alpha, fmt)                                \
    do {                                                                \
        if(alpha == 255) {                                              \
            switch(fmt->BytesPerPixel) {                                \
            case 1: blitter(1, Uint8, OPAQUE_BLIT); break;              \
            case 2: blitter(2, Uint8, OPAQUE_BLIT); break;              \
            case 3: blitter(3, Uint8, OPAQUE_BLIT); break;              \
            case 4: blitter(4, Uint16, OPAQUE_BLIT); break;             \
            }                                                           \
        } else {                                                        \
            switch(fmt->BytesPerPixel) {                                \
            case 1:                                                     \
                /* No 8bpp alpha blitting */                            \
                break;                                                  \
                                                                        \
            case 2:                                                     \
                switch(fmt->Rmask | fmt->Gmask | fmt->Bmask) {          \
                case 0xffff:                                            \
                    if(fmt->Gmask == 0x07e0                             \
                       || fmt->Rmask == 0x07e0                          \
                       || fmt->Bmask == 0x07e0) {                       \
                        if(alpha == 128)                                \
                            blitter(2, Uint8, ALPHA_BLIT16_565_50);     \
                        else {                                          \
                            blitter(2, Uint8, ALPHA_BLIT16_565);        \
                        }                                               \
                    } else                                              \
                        goto general16;                                 \
                    break;                                              \
                                                                        \
                case 0x7fff:                                            \
                    if(fmt->Gmask == 0x03e0                             \
                       || fmt->Rmask == 0x03e0                          \
                       || fmt->Bmask == 0x03e0) {                       \
                        if(alpha == 128)                                \
                            blitter(2, Uint8, ALPHA_BLIT16_555_50);     \
                        else {                                          \
                            blitter(2, Uint8, ALPHA_BLIT16_555);        \
                        }                                               \
                        break;                                          \
                    }                                                   \
                    /* fallthrough */                                   \
                                                                        \
                default:                                                \
                general16:                                              \
                    blitter(2, Uint8, ALPHA_BLIT_ANY);                  \
                }                                                       \
                break;                                                  \
                                                                        \
            case 3:                                                     \
                blitter(3, Uint8, ALPHA_BLIT_ANY);                      \
                break;                                                  \
                                                                        \
            case 4:                                                     \
                if((fmt->Rmask | fmt->Gmask | fmt->Bmask) == 0x00ffffff \
                   && (fmt->Gmask == 0xff00 || fmt->Rmask == 0xff00     \
                       || fmt->Bmask == 0xff00)) {                      \
                    if(alpha == 128)                                    \
                        blitter(4, Uint16, ALPHA_BLIT32_888_50);        \
                    else                                                \
                        blitter(4, Uint16, ALPHA_BLIT32_888);           \
                } else                                                  \
                    blitter(4, Uint16, ALPHA_BLIT_ANY);                 \
                break;                                                  \
            }                                                           \
        }                                                               \
    } while(0)
 775
 776 #endif
 777
/*
 * This takes care of the case when the surface is clipped on the left and/or
 * right. Top clipping has already been taken care of.
 *
 * w:       width of one encoded line in pixels; reaching it ends a line.
 * srcbuf:  RLE stream, read as <skip> <run> pairs of the count type chosen
 *          by CHOOSE_BLIT, each followed by run * bpp bytes of pixel data.
 * dst:     destination surface; supplies the pixel format and the pitch.
 * dstbuf:  destination pointer for the first visible pixel of the first
 *          line (it is rebased by srcrect->x pixels below).
 * srcrect: x and w give the visible column range, h the number of lines.
 * alpha:   per-surface alpha handed to the selected pixel operation.
 */
static void RLEClipBlit(int w, Uint8 *srcbuf, SDL_Surface *dst,
                        Uint8 *dstbuf, SDL_Rect *srcrect, unsigned alpha)
{
    SDL_PixelFormat *fmt = dst->format;

/*
 * Clipped blit loop for one pixel size. bpp is the literal byte count per
 * pixel, Type the integer type of the skip/run counts, do_blit the pixel
 * operation. dstbuf is first moved back by `left` pixels so that it can be
 * indexed with source column numbers. A segment with a zero run found at
 * offset 0 ends the loop. The goto label is built from bpp and do_blit by
 * token pasting.
 */
#define RLECLIPBLIT(bpp, Type, do_blit)                                    \
    do {                                                                   \
        int linecount = srcrect->h;                                        \
        int ofs = 0;                                                       \
        int left = srcrect->x;                                             \
        int right = left + srcrect->w;                                     \
        dstbuf -= left * bpp;                                              \
        for(;;) {                                                          \
            int run;                                                       \
            ofs += *(Type *)srcbuf;                                        \
            run = ((Type *)srcbuf)[1];                                     \
            srcbuf += 2 * sizeof(Type);                                    \
            if(run) {                                                      \
                /* clip to left and right borders */                       \
                if(ofs < right) {                                          \
                    int start = 0;                                         \
                    int len = run;                                         \
                    int startcol;                                          \
                    if(left - ofs > 0) {                                   \
                        start = left - ofs;                                \
                        len -= start;                                      \
                        if(len <= 0)                                       \
                            goto nocopy ## bpp ## do_blit;                 \
                    }                                                      \
                    startcol = ofs + start;                                \
                    if(len > right - startcol)                             \
                        len = right - startcol;                            \
                    do_blit(dstbuf + startcol * bpp, srcbuf + start * bpp, \
                            len, bpp, alpha);                              \
                }                                                          \
            nocopy ## bpp ## do_blit:                                      \
                srcbuf += run * bpp;                                       \
                ofs += run;                                                \
            } else if(!ofs)                                                \
                break;                                                     \
            if(ofs == w) {                                                 \
                ofs = 0;                                                   \
                dstbuf += dst->pitch;                                      \
                if(!--linecount)                                           \
                    break;                                                 \
            }                                                              \
        }                                                                  \
    } while(0)

    CHOOSE_BLIT(RLECLIPBLIT, alpha, fmt);

#undef RLECLIPBLIT

}
 836
 837
/*
 * Blit a colorkeyed RLE surface.
 *
 * src:     source surface; its encoded data is read from
 *          src->map->sw_data->aux_data.
 * srcrect: part of the source to blit. srcrect->y encoded lines are
 *          skipped first; a nonzero x or a w different from src->w sends
 *          the blit through RLEClipBlit.
 * dst:     destination surface; locked for the duration of the blit when
 *          SDL_MUSTLOCK(dst) is true.
 * dstrect: x and y give the destination position.
 *
 * Returns -1 if the destination could not be locked, 0 otherwise.
 */
int SDL_RLEBlit(SDL_Surface *src, SDL_Rect *srcrect,
                SDL_Surface *dst, SDL_Rect *dstrect)
{
        Uint8 *dstbuf;
        Uint8 *srcbuf;
        int x, y;
        int w = src->w;
        unsigned alpha;

        /* Lock the destination if necessary */
        if ( SDL_MUSTLOCK(dst) ) {
                if ( SDL_LockSurface(dst) < 0 ) {
                        return(-1);
                }
        }

        /* Set up the source and destination pointers */
        x = dstrect->x;
        y = dstrect->y;
        /* NOTE(review): the x offset uses the source's BytesPerPixel;
           presumably source and destination share a pixel size -- confirm */
        dstbuf = (Uint8 *)dst->pixels
                 + y * dst->pitch + x * src->format->BytesPerPixel;
        srcbuf = (Uint8 *)src->map->sw_data->aux_data;

        {
            /* skip lines at the top if necessary */
            int vskip = srcrect->y;
            int ofs = 0;
            if(vskip) {

/*
 * Advance srcbuf past vskip encoded lines without blitting. A segment with
 * a zero run found at offset 0 jumps straight to `done`.
 */
#define RLESKIP(bpp, Type)                      \
                for(;;) {                       \
                    int run;                    \
                    ofs += *(Type *)srcbuf;     \
                    run = ((Type *)srcbuf)[1];  \
                    srcbuf += sizeof(Type) * 2; \
                    if(run) {                   \
                        srcbuf += run * bpp;    \
                        ofs += run;             \
                    } else if(!ofs)             \
                        goto done;              \
                    if(ofs == w) {              \
                        ofs = 0;                \
                        if(!--vskip)            \
                            break;              \
                    }                           \
                }

                switch(src->format->BytesPerPixel) {
                case 1: RLESKIP(1, Uint8); break;
                case 2: RLESKIP(2, Uint8); break;
                case 3: RLESKIP(3, Uint8); break;
                case 4: RLESKIP(4, Uint16); break;
                }

#undef RLESKIP

            }
        }

        /* per-surface alpha only applies when SDL_SRCALPHA is set;
           otherwise the blit is opaque (255) */
        alpha = (src->flags & SDL_SRCALPHA) == SDL_SRCALPHA
                ? src->format->alpha : 255;
        /* if left or right edge clipping needed, call clip blit */
        if ( srcrect->x || srcrect->w != src->w ) {
            RLEClipBlit(w, srcbuf, dst, dstbuf, srcrect, alpha);
        } else {
            SDL_PixelFormat *fmt = src->format;

/*
 * Unclipped blit loop for one pixel size: every run is handed to do_blit
 * in full, and dstbuf moves down one pitch each time a line of w pixels
 * has been consumed. Stops after srcrect->h lines or at a zero run found
 * at offset 0.
 */
#define RLEBLIT(bpp, Type, do_blit)                                           \
            do {                                                              \
                int linecount = srcrect->h;                                   \
                int ofs = 0;                                                  \
                for(;;) {                                                     \
                    unsigned run;                                             \
                    ofs += *(Type *)srcbuf;                                   \
                    run = ((Type *)srcbuf)[1];                                \
                    srcbuf += 2 * sizeof(Type);                               \
                    if(run) {                                                 \
                        do_blit(dstbuf + ofs * bpp, srcbuf, run, bpp, alpha); \
                        srcbuf += run * bpp;                                  \
                        ofs += run;                                           \
                    } else if(!ofs)                                           \
                        break;                                                \
                    if(ofs == w) {                                            \
                        ofs = 0;                                              \
                        dstbuf += dst->pitch;                                 \
                        if(!--linecount)                                      \
                            break;                                            \
                    }                                                         \
                }                                                             \
            } while(0)

            CHOOSE_BLIT(RLEBLIT, alpha, fmt);

#undef RLEBLIT
        }

done:
        /* Unlock the destination if necessary */
        if ( SDL_MUSTLOCK(dst) ) {
                SDL_UnlockSurface(dst);
        }
        return(0);
}
 942
 943 #undef OPAQUE_BLIT
 944
 945 /*
 946  * Per-pixel blitting macros for translucent pixels:
 947  * These use the same techniques as the per-surface blitting macros
 948  */
 949
 950 /*
 951  * For 32bpp pixels, we have made sure the alpha is stored in the top
 952  * 8 bits, so proceed as usual
 953  */
 954 #define BLIT_TRANSL_888(src, dst)                               \
 955     do {                                                        \
 956         Uint32 s = src;                                         \
 957         Uint32 d = dst;                                         \
 958         unsigned alpha = s >> 24;                               \
 959         Uint32 s1 = s & 0xff00ff;                               \
 960         Uint32 d1 = d & 0xff00ff;                               \
 961         d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;        \
 962         s &= 0xff00;                                            \
 963         d &= 0xff00;                                            \
 964         d = (d + ((s - d) * alpha >> 8)) & 0xff00;              \
 965         dst = d1 | d;                                           \
 966     } while(0)
 967
 968 /*
 969  * For 16bpp pixels, we have stored the 5 most significant alpha bits in
 970  * bits 5-10. As before, we can process all 3 RGB components at the same time.
 971  */
 972 #define BLIT_TRANSL_565(src, dst)               \
 973     do {                                        \
 974         Uint32 s = src;                         \
 975         Uint32 d = dst;                         \
 976         unsigned alpha = (s & 0x3e0) >> 5;      \
 977         s &= 0x07e0f81f;                        \
 978         d = (d | d << 16) & 0x07e0f81f;         \
 979         d += (s - d) * alpha >> 5;              \
 980         d &= 0x07e0f81f;                        \
 981         dst = (Uint16)(d | d >> 16);                    \
 982     } while(0)
 983
 984 #define BLIT_TRANSL_555(src, dst)               \
 985     do {                                        \
 986         Uint32 s = src;                         \
 987         Uint32 d = dst;                         \
 988         unsigned alpha = (s & 0x3e0) >> 5;      \
 989         s &= 0x03e07c1f;                        \
 990         d = (d | d << 16) & 0x03e07c1f;         \
 991         d += (s - d) * alpha >> 5;              \
 992         d &= 0x03e07c1f;                        \
 993         dst = (Uint16)(d | d >> 16);                    \
 994     } while(0)
 995
/* used to save the destination format in the encoding. Designed to be
   macro-compatible with SDL_PixelFormat but without the unneeded fields.
   The member names must therefore stay identical to the SDL_PixelFormat
   members that the pixel macros access.
   NOTE(review): member meanings below are presumed to mirror the
   same-named SDL_PixelFormat members -- confirm against SDL_video.h */
typedef struct {
        Uint8  BytesPerPixel;   /* presumably bytes per destination pixel */
        Uint8  Rloss;           /* presumably precision loss per component */
        Uint8  Gloss;
        Uint8  Bloss;
        Uint8  Rshift;          /* presumably bit position per component */
        Uint8  Gshift;
        Uint8  Bshift;
        Uint8  Ashift;
        Uint32 Rmask;           /* presumably bit mask per component */
        Uint32 Gmask;
        Uint32 Bmask;
        Uint32 Amask;
} RLEDestFormat;
1012
1013 /* blit a pixel-alpha RLE surface clipped at the right and/or left edges */
1014 static void RLEAlphaClipBlit(int w, Uint8 *srcbuf, SDL_Surface *dst,
1015                              Uint8 *dstbuf, SDL_Rect *srcrect)
1016 {
1017     SDL_PixelFormat *df = dst->format;
1018     /*
1019      * clipped blitter: Ptype is the destination pixel type,
1020      * Ctype the translucent count type, and do_blend the macro
1021      * to blend one pixel.
1022      */
1023 #define RLEALPHACLIPBLIT(Ptype, Ctype, do_blend)                          \
1024     do {                                                                  \
1025         int linecount = srcrect->h;                                       \
1026         int left = srcrect->x;                                            \
1027         int right = left + srcrect->w;                                    \
1028         dstbuf -= left * sizeof(Ptype);                                   \
1029         do {                                                              \
1030             int ofs = 0;                                                  \
1031             /* blit opaque pixels on one line */                          \
1032             do {                                                          \
1033                 unsigned run;                                             \
1034                 ofs += ((Ctype *)srcbuf)[0];                              \
1035                 run = ((Ctype *)srcbuf)[1];                               \
1036                 srcbuf += 2 * sizeof(Ctype);                              \
1037                 if(run) {                                                 \
1038                     /* clip to left and right borders */                  \
1039                     int cofs = ofs;                                       \
1040                     int crun = run;                                       \
1041                     if(left - cofs > 0) {                                 \
1042                         crun -= left - cofs;                              \
1043                         cofs = left;                                      \
1044                     }                                                     \
1045                     if(crun > right - cofs)                               \
1046                         crun = right - cofs;                              \
1047                     if(crun > 0)                                          \
1048                         PIXEL_COPY(dstbuf + cofs * sizeof(Ptype),         \
1049                                    srcbuf + (cofs - ofs) * sizeof(Ptype), \
1050                                    (unsigned)crun, sizeof(Ptype));        \
1051                     srcbuf += run * sizeof(Ptype);                        \
1052                     ofs += run;                                           \
1053                 } else if(!ofs)                                           \
1054                     return;                                               \
1055             } while(ofs < w);                                             \
1056             /* skip padding if necessary */                               \
1057             if(sizeof(Ptype) == 2)                                        \
1058                 srcbuf += (uintptr_t)srcbuf & 2;                          \
1059             /* blit translucent pixels on the same line */                \
1060             ofs = 0;                                                      \
1061             do {                                                          \
1062                 unsigned run;                                             \
1063                 ofs += ((Uint16 *)srcbuf)[0];                             \
1064                 run = ((Uint16 *)srcbuf)[1];                              \
1065                 srcbuf += 4;                                              \
1066                 if(run) {                                                 \
1067                     /* clip to left and right borders */                  \
1068                     int cofs = ofs;                                       \
1069                     int crun = run;                                       \
1070                     if(left - cofs > 0) {                                 \
1071                         crun -= left - cofs;                              \
1072                         cofs = left;                                      \
1073                     }                                                     \
1074                     if(crun > right - cofs)                               \
1075                         crun = right - cofs;                              \
1076                     if(crun > 0) {                                        \
1077                         Ptype *dst = (Ptype *)dstbuf + cofs;              \
1078                         Uint32 *src = (Uint32 *)srcbuf + (cofs - ofs);    \
1079                         int i;                                            \
1080                         for(i = 0; i < crun; i++)                         \
1081                             do_blend(src[i], dst[i]);                     \
1082                     }                                                     \
1083                     srcbuf += run * 4;                                    \
1084                     ofs += run;                                           \
1085                 }                                                         \
1086             } while(ofs < w);                                             \
1087             dstbuf += dst->pitch;                                         \
1088         } while(--linecount);                                             \
1089     } while(0)
1090
1091     switch(df->BytesPerPixel) {
1092     case 2:
1093         if(df->Gmask == 0x07e0 || df->Rmask == 0x07e0
1094            || df->Bmask == 0x07e0)
1095             RLEALPHACLIPBLIT(Uint16, Uint8, BLIT_TRANSL_565);
1096         else
1097             RLEALPHACLIPBLIT(Uint16, Uint8, BLIT_TRANSL_555);
1098         break;
1099     case 4:
1100         RLEALPHACLIPBLIT(Uint32, Uint16, BLIT_TRANSL_888);
1101         break;
1102     }
1103 }
1104
1105 /* blit a pixel-alpha RLE surface */
1106 int SDL_RLEAlphaBlit(SDL_Surface *src, SDL_Rect *srcrect,
1107                      SDL_Surface *dst, SDL_Rect *dstrect)
1108 {
1109     int x, y;
1110     int w = src->w;
1111     Uint8 *srcbuf, *dstbuf;
1112     SDL_PixelFormat *df = dst->format;
1113
1114     /* Lock the destination if necessary */
1115     if ( SDL_MUSTLOCK(dst) ) {
1116         if ( SDL_LockSurface(dst) < 0 ) {
1117             return -1;
1118         }
1119     }
1120
1121     x = dstrect->x;
1122     y = dstrect->y;
1123     dstbuf = (Uint8 *)dst->pixels
1124              + y * dst->pitch + x * df->BytesPerPixel;
1125     srcbuf = (Uint8 *)src->map->sw_data->aux_data + sizeof(RLEDestFormat);
1126
1127     {
1128         /* skip lines at the top if necessary */
1129         int vskip = srcrect->y;
1130         if(vskip) {
1131             int ofs;
1132             if(df->BytesPerPixel == 2) {
1133                 /* the 16/32 interleaved format */
1134                 do {
1135                     /* skip opaque line */
1136                     ofs = 0;
1137                     do {
1138                         int run;
1139                         ofs += srcbuf[0];
1140                         run = srcbuf[1];
1141                         srcbuf += 2;
1142                         if(run) {
1143                             srcbuf += 2 * run;
1144                             ofs += run;
1145                         } else if(!ofs)
1146                             goto done;
1147                     } while(ofs < w);
1148
1149                     /* skip padding */
1150                     srcbuf += (uintptr_t)srcbuf & 2;
1151
1152                     /* skip translucent line */
1153                     ofs = 0;
1154                     do {
1155                         int run;
1156                         ofs += ((Uint16 *)srcbuf)[0];
1157                         run = ((Uint16 *)srcbuf)[1];
1158                         srcbuf += 4 * (run + 1);
1159                         ofs += run;
1160                     } while(ofs < w);
1161                 } while(--vskip);
1162             } else {
1163                 /* the 32/32 interleaved format */
1164                 vskip <<= 1;    /* opaque and translucent have same format */
1165                 do {
1166                     ofs = 0;
1167                     do {
1168                         int run;
1169                         ofs += ((Uint16 *)srcbuf)[0];
1170                         run = ((Uint16 *)srcbuf)[1];
1171                         srcbuf += 4;
1172                         if(run) {
1173                             srcbuf += 4 * run;
1174                             ofs += run;
1175                         } else if(!ofs)
1176                             goto done;
1177                     } while(ofs < w);
1178                 } while(--vskip);
1179             }
1180         }
1181     }
1182
1183     /* if left or right edge clipping needed, call clip blit */
1184     if(srcrect->x || srcrect->w != src->w) {
1185         RLEAlphaClipBlit(w, srcbuf, dst, dstbuf, srcrect);
1186     } else {
1187
1188         /*
1189          * non-clipped blitter. Ptype is the destination pixel type,
1190          * Ctype the translucent count type, and do_blend the
1191          * macro to blend one pixel.
1192          */
1193 #define RLEALPHABLIT(Ptype, Ctype, do_blend)                             \
1194         do {                                                             \
1195             int linecount = srcrect->h;                                  \
1196             do {                                                         \
1197                 int ofs = 0;                                             \
1198                 /* blit opaque pixels on one line */                     \
1199                 do {                                                     \
1200                     unsigned run;                                        \
1201                     ofs += ((Ctype *)srcbuf)[0];                         \
1202                     run = ((Ctype *)srcbuf)[1];                          \
1203                     srcbuf += 2 * sizeof(Ctype);                         \
1204                     if(run) {                                            \
1205                         PIXEL_COPY(dstbuf + ofs * sizeof(Ptype), srcbuf, \
1206                                    run, sizeof(Ptype));                  \
1207                         srcbuf += run * sizeof(Ptype);                   \
1208                         ofs += run;                                      \
1209                     } else if(!ofs)                                      \
1210                         goto done;                                       \
1211                 } while(ofs < w);                                        \
1212                 /* skip padding if necessary */                          \
1213                 if(sizeof(Ptype) == 2)                                   \
1214                     srcbuf += (uintptr_t)srcbuf & 2;                     \
1215                 /* blit translucent pixels on the same line */           \
1216                 ofs = 0;                                                 \
1217                 do {                                                     \
1218                     unsigned run;                                        \
1219                     ofs += ((Uint16 *)srcbuf)[0];                        \
1220                     run = ((Uint16 *)srcbuf)[1];                         \
1221                     srcbuf += 4;                                         \
1222                     if(run) {                                            \
1223                         Ptype *dst = (Ptype *)dstbuf + ofs;              \
1224                         unsigned i;                                      \
1225                         for(i = 0; i < run; i++) {                       \
1226                             Uint32 src = *(Uint32 *)srcbuf;              \
1227                             do_blend(src, *dst);                         \
1228                             srcbuf += 4;                                 \
1229                             dst++;                                       \
1230                         }                                                \
1231                         ofs += run;                                      \
1232                     }                                                    \
1233                 } while(ofs < w);                                        \
1234                 dstbuf += dst->pitch;                                    \
1235             } while(--linecount);                                        \
1236         } while(0)
1237
1238         switch(df->BytesPerPixel) {
1239         case 2:
1240             if(df->Gmask == 0x07e0 || df->Rmask == 0x07e0
1241                || df->Bmask == 0x07e0)
1242                 RLEALPHABLIT(Uint16, Uint8, BLIT_TRANSL_565);
1243             else
1244                 RLEALPHABLIT(Uint16, Uint8, BLIT_TRANSL_555);
1245             break;
1246         case 4:
1247             RLEALPHABLIT(Uint32, Uint16, BLIT_TRANSL_888);
1248             break;
1249         }
1250     }
1251
1252  done:
1253     /* Unlock the destination if necessary */
1254     if ( SDL_MUSTLOCK(dst) ) {
1255         SDL_UnlockSurface(dst);
1256     }
1257     return 0;
1258 }
1259
1260 /*
1261  * Auxiliary functions:
1262  * The encoding functions take 32bpp rgb + a, and
1263  * return the number of bytes copied to the destination.
1264  * The decoding functions copy to 32bpp rgb + a, and
1265  * return the number of bytes copied from the source.
1266  * These are only used in the encoder and un-RLE code and are therefore not
1267  * highly optimised.
1268  */
1269
1270 /* encode 32bpp rgb + a into 16bpp rgb, losing alpha */
1271 static int copy_opaque_16(void *dst, Uint32 *src, int n,
1272                           SDL_PixelFormat *sfmt, SDL_PixelFormat *dfmt)
1273 {
1274     int i;
1275     Uint16 *d = dst;
1276     for(i = 0; i < n; i++) {
1277         unsigned r, g, b;
1278         RGB_FROM_PIXEL(*src, sfmt, r, g, b);
1279         PIXEL_FROM_RGB(*d, dfmt, r, g, b);
1280         src++;
1281         d++;
1282     }
1283     return n * 2;
1284 }
1285
1286 /* decode opaque pixels from 16bpp to 32bpp rgb + a */
1287 static int uncopy_opaque_16(Uint32 *dst, void *src, int n,
1288                             RLEDestFormat *sfmt, SDL_PixelFormat *dfmt)
1289 {
1290     int i;
1291     Uint16 *s = src;
1292     unsigned alpha = dfmt->Amask ? 255 : 0;
1293     for(i = 0; i < n; i++) {
1294         unsigned r, g, b;
1295         RGB_FROM_PIXEL(*s, sfmt, r, g, b);
1296         PIXEL_FROM_RGBA(*dst, dfmt, r, g, b, alpha);
1297         s++;
1298         dst++;
1299     }
1300     return n * 2;
1301 }
1302
1303
1304
1305 /* encode 32bpp rgb + a into 32bpp G0RAB format for blitting into 565 */
1306 static int copy_transl_565(void *dst, Uint32 *src, int n,
1307                            SDL_PixelFormat *sfmt, SDL_PixelFormat *dfmt)
1308 {
1309     int i;
1310     Uint32 *d = dst;
1311     for(i = 0; i < n; i++) {
1312         unsigned r, g, b, a;
1313         Uint16 pix;
1314         RGBA_FROM_8888(*src, sfmt, r, g, b, a);
1315         PIXEL_FROM_RGB(pix, dfmt, r, g, b);
1316         *d = ((pix & 0x7e0) << 16) | (pix & 0xf81f) | ((a << 2) & 0x7e0);
1317         src++;
1318         d++;
1319     }
1320     return n * 4;
1321 }
1322
1323 /* encode 32bpp rgb + a into 32bpp G0RAB format for blitting into 555 */
1324 static int copy_transl_555(void *dst, Uint32 *src, int n,
1325                            SDL_PixelFormat *sfmt, SDL_PixelFormat *dfmt)
1326 {
1327     int i;
1328     Uint32 *d = dst;
1329     for(i = 0; i < n; i++) {
1330         unsigned r, g, b, a;
1331         Uint16 pix;
1332         RGBA_FROM_8888(*src, sfmt, r, g, b, a);
1333         PIXEL_FROM_RGB(pix, dfmt, r, g, b);
1334         *d = ((pix & 0x3e0) << 16) | (pix & 0xfc1f) | ((a << 2) & 0x3e0);
1335         src++;
1336         d++;
1337     }
1338     return n * 4;
1339 }
1340
1341 /* decode translucent pixels from 32bpp GORAB to 32bpp rgb + a */
1342 static int uncopy_transl_16(Uint32 *dst, void *src, int n,
1343                             RLEDestFormat *sfmt, SDL_PixelFormat *dfmt)
1344 {
1345     int i;
1346     Uint32 *s = src;
1347     for(i = 0; i < n; i++) {
1348         unsigned r, g, b, a;
1349         Uint32 pix = *s++;
1350         a = (pix & 0x3e0) >> 2;
1351         pix = (pix & ~0x3e0) | pix >> 16;
1352         RGB_FROM_PIXEL(pix, sfmt, r, g, b);
1353         PIXEL_FROM_RGBA(*dst, dfmt, r, g, b, a);
1354         dst++;
1355     }
1356     return n * 4;
1357 }
1358
1359 /* encode 32bpp rgba into 32bpp rgba, keeping alpha (dual purpose) */
1360 static int copy_32(void *dst, Uint32 *src, int n,
1361                    SDL_PixelFormat *sfmt, SDL_PixelFormat *dfmt)
1362 {
1363     int i;
1364     Uint32 *d = dst;
1365     for(i = 0; i < n; i++) {
1366         unsigned r, g, b, a;
1367         Uint32 pixel;
1368         RGBA_FROM_8888(*src, sfmt, r, g, b, a);
1369         PIXEL_FROM_RGB(pixel, dfmt, r, g, b);
1370         *d++ = pixel | a << 24;
1371         src++;
1372     }
1373     return n * 4;
1374 }
1375
1376 /* decode 32bpp rgba into 32bpp rgba, keeping alpha (dual purpose) */
1377 static int uncopy_32(Uint32 *dst, void *src, int n,
1378                      RLEDestFormat *sfmt, SDL_PixelFormat *dfmt)
1379 {
1380     int i;
1381     Uint32 *s = src;
1382     for(i = 0; i < n; i++) {
1383         unsigned r, g, b, a;
1384         Uint32 pixel = *s++;
1385         RGB_FROM_PIXEL(pixel, sfmt, r, g, b);
1386         a = pixel >> 24;
1387         PIXEL_FROM_RGBA(*dst, dfmt, r, g, b, a);
1388         dst++;
1389     }
1390     return n * 4;
1391 }
1392
/*
 * Alpha classification of a pixel according to fmt (anything with
 * Amask and Ashift members, passed as a pointer expression).
 * Both arguments are parenthesized so that any expression, e.g. &format,
 * can be passed safely.
 */

/* true when the alpha component equals 255 (fully opaque) */
#define ISOPAQUE(pixel, fmt) \
    ((((pixel) & (fmt)->Amask) >> (fmt)->Ashift) == 255)

/*
 * true when the alpha component lies in 1..254.  Subtracting 1U makes an
 * alpha of 0 wrap around to a huge value, so one unsigned comparison
 * rejects both fully transparent (0) and fully opaque (255) pixels.
 */
#define ISTRANSL(pixel, fmt)    \
    ((unsigned)((((pixel) & (fmt)->Amask) >> (fmt)->Ashift) - 1U) < 254U)
1397
1398 /* convert surface to be quickly alpha-blittable onto dest, if possible */
1399 static int RLEAlphaSurface(SDL_Surface *surface)
1400 {
1401     SDL_Surface *dest;
1402     SDL_PixelFormat *df;
1403     int maxsize = 0;
1404     int max_opaque_run;
1405     int max_transl_run = 65535;
1406     unsigned masksum;
1407     Uint8 *rlebuf, *dst;
1408     int (*copy_opaque)(void *, Uint32 *, int,
1409                        SDL_PixelFormat *, SDL_PixelFormat *);
1410     int (*copy_transl)(void *, Uint32 *, int,
1411                        SDL_PixelFormat *, SDL_PixelFormat *);
1412
1413     dest = surface->map->dst;
1414     if(!dest)
1415         return -1;
1416     df = dest->format;
1417     if(surface->format->BitsPerPixel != 32)
1418         return -1;              /* only 32bpp source supported */
1419
1420     /* find out whether the destination is one we support,
1421        and determine the max size of the encoded result */
1422     masksum = df->Rmask | df->Gmask | df->Bmask;
1423     switch(df->BytesPerPixel) {
1424     case 2:
1425         /* 16bpp: only support 565 and 555 formats */
1426         switch(masksum) {
1427         case 0xffff:
1428             if(df->Gmask == 0x07e0
1429                || df->Rmask == 0x07e0 || df->Bmask == 0x07e0) {
1430                 copy_opaque = copy_opaque_16;
1431                 copy_transl = copy_transl_565;
1432             } else
1433                 return -1;
1434             break;
1435         case 0x7fff:
1436             if(df->Gmask == 0x03e0
1437                || df->Rmask == 0x03e0 || df->Bmask == 0x03e0) {
1438                 copy_opaque = copy_opaque_16;
1439                 copy_transl = copy_transl_555;
1440             } else
1441                 return -1;
1442             break;
1443         default:
1444             return -1;
1445         }
1446         max_opaque_run = 255;   /* runs stored as bytes */
1447
1448         /* worst case is alternating opaque and translucent pixels,
1449            with room for alignment padding between lines */
1450         maxsize = surface->h * (2 + (4 + 2) * (surface->w + 1)) + 2;
1451         break;
1452     case 4:
1453         if(masksum != 0x00ffffff)
1454             return -1;          /* requires unused high byte */
1455         copy_opaque = copy_32;
1456         copy_transl = copy_32;
1457         max_opaque_run = 255;   /* runs stored as short ints */
1458
1459         /* worst case is alternating opaque and translucent pixels */
1460         maxsize = surface->h * 2 * 4 * (surface->w + 1) + 4;
1461         break;
1462     default:
1463         return -1;              /* anything else unsupported right now */
1464     }
1465
1466     maxsize += sizeof(RLEDestFormat);
1467     rlebuf = (Uint8 *)SDL_malloc(maxsize);
1468     if(!rlebuf) {
1469         SDL_OutOfMemory();
1470         return -1;
1471     }
1472     {
1473         /* save the destination format so we can undo the encoding later */
1474         RLEDestFormat *r = (RLEDestFormat *)rlebuf;
1475         r->BytesPerPixel = df->BytesPerPixel;
1476         r->Rloss = df->Rloss;
1477         r->Gloss = df->Gloss;
1478         r->Bloss = df->Bloss;
1479         r->Rshift = df->Rshift;
1480         r->Gshift = df->Gshift;
1481         r->Bshift = df->Bshift;
1482         r->Ashift = df->Ashift;
1483         r->Rmask = df->Rmask;
1484         r->Gmask = df->Gmask;
1485         r->Bmask = df->Bmask;
1486         r->Amask = df->Amask;
1487     }
1488     dst = rlebuf + sizeof(RLEDestFormat);
1489
1490     /* Do the actual encoding */
1491     {
1492         int x, y;
1493         int h = surface->h, w = surface->w;
1494         SDL_PixelFormat *sf = surface->format;
1495         Uint32 *src = (Uint32 *)surface->pixels;
1496         Uint8 *lastline = dst;  /* end of last non-blank line */
1497
1498         /* opaque counts are 8 or 16 bits, depending on target depth */
1499 #define ADD_OPAQUE_COUNTS(n, m)                 \
1500         if(df->BytesPerPixel == 4) {            \
1501             ((Uint16 *)dst)[0] = n;             \
1502             ((Uint16 *)dst)[1] = m;             \
1503             dst += 4;                           \
1504         } else {                                \
1505             dst[0] = n;                         \
1506             dst[1] = m;                         \
1507             dst += 2;                           \
1508         }
1509
1510         /* translucent counts are always 16 bit */
1511 #define ADD_TRANSL_COUNTS(n, m)         \
1512         (((Uint16 *)dst)[0] = n, ((Uint16 *)dst)[1] = m, dst += 4)
1513
1514         for(y = 0; y < h; y++) {
1515             int runstart, skipstart;
1516             int blankline = 0;
1517             /* First encode all opaque pixels of a scan line */
1518             x = 0;
1519             do {
1520                 int run, skip, len;
1521                 skipstart = x;
1522                 while(x < w && !ISOPAQUE(src[x], sf))
1523                     x++;
1524                 runstart = x;
1525                 while(x < w && ISOPAQUE(src[x], sf))
1526                     x++;
1527                 skip = runstart - skipstart;
1528                 if(skip == w)
1529                     blankline = 1;
1530                 run = x - runstart;
1531                 while(skip > max_opaque_run) {
1532                     ADD_OPAQUE_COUNTS(max_opaque_run, 0);
1533                     skip -= max_opaque_run;
1534                 }
1535                 len = MIN(run, max_opaque_run);
1536                 ADD_OPAQUE_COUNTS(skip, len);
1537                 dst += copy_opaque(dst, src + runstart, len, sf, df);
1538                 runstart += len;
1539                 run -= len;
1540                 while(run) {
1541                     len = MIN(run, max_opaque_run);
1542                     ADD_OPAQUE_COUNTS(0, len);
1543                     dst += copy_opaque(dst, src + runstart, len, sf, df);
1544                     runstart += len;
1545                     run -= len;
1546                 }
1547             } while(x < w);
1548
1549             /* Make sure the next output address is 32-bit aligned */
1550             dst += (uintptr_t)dst & 2;
1551
1552             /* Next, encode all translucent pixels of the same scan line */
1553             x = 0;
1554             do {
1555                 int run, skip, len;
1556                 skipstart = x;
1557                 while(x < w && !ISTRANSL(src[x], sf))
1558                     x++;
1559                 runstart = x;
1560                 while(x < w && ISTRANSL(src[x], sf))
1561                     x++;
1562                 skip = runstart - skipstart;
1563                 blankline &= (skip == w);
1564                 run = x - runstart;
1565                 while(skip > max_transl_run) {
1566                     ADD_TRANSL_COUNTS(max_transl_run, 0);
1567                     skip -= max_transl_run;
1568                 }
1569                 len = MIN(run, max_transl_run);
1570                 ADD_TRANSL_COUNTS(skip, len);
1571                 dst += copy_transl(dst, src + runstart, len, sf, df);
1572                 runstart += len;
1573                 run -= len;
1574                 while(run) {
1575                     len = MIN(run, max_transl_run);
1576                     ADD_TRANSL_COUNTS(0, len);
1577                     dst += copy_transl(dst, src + runstart, len, sf, df);
1578                     runstart += len;
1579                     run -= len;
1580                 }
1581                 if(!blankline)
1582                     lastline = dst;
1583             } while(x < w);
1584
1585             src += surface->pitch >> 2;
1586         }
1587         dst = lastline;         /* back up past trailing blank lines */
1588         ADD_OPAQUE_COUNTS(0, 0);
1589     }
1590
1591 #undef ADD_OPAQUE_COUNTS
1592 #undef ADD_TRANSL_COUNTS
1593
1594     /* Now that we have it encoded, release the original pixels */
1595     if((surface->flags & SDL_PREALLOC) != SDL_PREALLOC
1596        && (surface->flags & SDL_HWSURFACE) != SDL_HWSURFACE) {
1597         SDL_free( surface->pixels );
1598         surface->pixels = NULL;
1599     }
1600
1601     /* realloc the buffer to release unused memory */
1602     {
1603         Uint8 *p = SDL_realloc(rlebuf, dst - rlebuf);
1604         if(!p)
1605             p = rlebuf;
1606         surface->map->sw_data->aux_data = p;
1607     }
1608
1609     return 0;
1610 }
1611
1612 static Uint32 getpix_8(Uint8 *srcbuf)
1613 {
1614     return *srcbuf;
1615 }
1616
1617 static Uint32 getpix_16(Uint8 *srcbuf)
1618 {
1619     return *(Uint16 *)srcbuf;
1620 }
1621
1622 static Uint32 getpix_24(Uint8 *srcbuf)
1623 {
1624 #if SDL_BYTEORDER == SDL_LIL_ENDIAN
1625     return srcbuf[0] + (srcbuf[1] << 8) + (srcbuf[2] << 16);
1626 #else
1627     return (srcbuf[0] << 16) + (srcbuf[1] << 8) + srcbuf[2];
1628 #endif
1629 }
1630
1631 static Uint32 getpix_32(Uint8 *srcbuf)
1632 {
1633     return *(Uint32 *)srcbuf;
1634 }
1635
1636 typedef Uint32 (*getpix_func)(Uint8 *);
1637
1638 static getpix_func getpixes[4] = {
1639     getpix_8, getpix_16, getpix_24, getpix_32
1640 };
1641
1642 static int RLEColorkeySurface(SDL_Surface *surface)
1643 {
1644         Uint8 *rlebuf, *dst;
1645         int maxn;
1646         int y;
1647         Uint8 *srcbuf, *curbuf, *lastline;
1648         int maxsize = 0;
1649         int skip, run;
1650         int bpp = surface->format->BytesPerPixel;
1651         getpix_func getpix;
1652         Uint32 ckey, rgbmask;
1653         int w, h;
1654
1655         /* calculate the worst case size for the compressed surface */
1656         switch(bpp) {
1657         case 1:
1658             /* worst case is alternating opaque and transparent pixels,
1659                starting with an opaque pixel */
1660             maxsize = surface->h * 3 * (surface->w / 2 + 1) + 2;
1661             break;
1662         case 2:
1663         case 3:
1664             /* worst case is solid runs, at most 255 pixels wide */
1665             maxsize = surface->h * (2 * (surface->w / 255 + 1)
1666                                     + surface->w * bpp) + 2;
1667             break;
1668         case 4:
1669             /* worst case is solid runs, at most 65535 pixels wide */
1670             maxsize = surface->h * (4 * (surface->w / 65535 + 1)
1671                                     + surface->w * 4) + 4;
1672             break;
1673         }
1674
1675         rlebuf = (Uint8 *)SDL_malloc(maxsize);
1676         if ( rlebuf == NULL ) {
1677                 SDL_OutOfMemory();
1678                 return(-1);
1679         }
1680
1681         /* Set up the conversion */
1682         srcbuf = (Uint8 *)surface->pixels;
1683         curbuf = srcbuf;
1684         maxn = bpp == 4 ? 65535 : 255;
1685         skip = run = 0;
1686         dst = rlebuf;
1687         rgbmask = ~surface->format->Amask;
1688         ckey = surface->format->colorkey & rgbmask;
1689         lastline = dst;
1690         getpix = getpixes[bpp - 1];
1691         w = surface->w;
1692         h = surface->h;
1693
1694 #define ADD_COUNTS(n, m)                        \
1695         if(bpp == 4) {                          \
1696             ((Uint16 *)dst)[0] = n;             \
1697             ((Uint16 *)dst)[1] = m;             \
1698             dst += 4;                           \
1699         } else {                                \
1700             dst[0] = n;                         \
1701             dst[1] = m;                         \
1702             dst += 2;                           \
1703         }
1704
1705         for(y = 0; y < h; y++) {
1706             int x = 0;
1707             int blankline = 0;
1708             do {
1709                 int run, skip, len;
1710                 int runstart;
1711                 int skipstart = x;
1712
1713                 /* find run of transparent, then opaque pixels */
1714                 while(x < w && (getpix(srcbuf + x * bpp) & rgbmask) == ckey)
1715                     x++;
1716                 runstart = x;
1717                 while(x < w && (getpix(srcbuf + x * bpp) & rgbmask) != ckey)
1718                     x++;
1719                 skip = runstart - skipstart;
1720                 if(skip == w)
1721                     blankline = 1;
1722                 run = x - runstart;
1723
1724                 /* encode segment */
1725                 while(skip > maxn) {
1726                     ADD_COUNTS(maxn, 0);
1727                     skip -= maxn;
1728                 }
1729                 len = MIN(run, maxn);
1730                 ADD_COUNTS(skip, len);
1731                 SDL_memcpy(dst, srcbuf + runstart * bpp, len * bpp);
1732                 dst += len * bpp;
1733                 run -= len;
1734                 runstart += len;
1735                 while(run) {
1736                     len = MIN(run, maxn);
1737                     ADD_COUNTS(0, len);
1738                     SDL_memcpy(dst, srcbuf + runstart * bpp, len * bpp);
1739                     dst += len * bpp;
1740                     runstart += len;
1741                     run -= len;
1742                 }
1743                 if(!blankline)
1744                     lastline = dst;
1745             } while(x < w);
1746
1747             srcbuf += surface->pitch;
1748         }
1749         dst = lastline;         /* back up bast trailing blank lines */
1750         ADD_COUNTS(0, 0);
1751
1752 #undef ADD_COUNTS
1753
1754         /* Now that we have it encoded, release the original pixels */
1755         if((surface->flags & SDL_PREALLOC) != SDL_PREALLOC
1756            && (surface->flags & SDL_HWSURFACE) != SDL_HWSURFACE) {
1757             SDL_free( surface->pixels );
1758             surface->pixels = NULL;
1759         }
1760
1761         /* realloc the buffer to release unused memory */
1762         {
1763             /* If realloc returns NULL, the original block is left intact */
1764             Uint8 *p = SDL_realloc(rlebuf, dst - rlebuf);
1765             if(!p)
1766                 p = rlebuf;
1767             surface->map->sw_data->aux_data = p;
1768         }
1769
1770         return(0);
1771 }
1772
1773 int SDL_RLESurface(SDL_Surface *surface)
1774 {
1775         int retcode;
1776
1777         /* Clear any previous RLE conversion */
1778         if ( (surface->flags & SDL_RLEACCEL) == SDL_RLEACCEL ) {
1779                 SDL_UnRLESurface(surface, 1);
1780         }
1781
1782         /* We don't support RLE encoding of bitmaps */
1783         if ( surface->format->BitsPerPixel < 8 ) {
1784                 return(-1);
1785         }
1786
1787         /* Lock the surface if it's in hardware */
1788         if ( SDL_MUSTLOCK(surface) ) {
1789                 if ( SDL_LockSurface(surface) < 0 ) {
1790                         return(-1);
1791                 }
1792         }
1793
1794         /* Encode */
1795         if((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) {
1796             retcode = RLEColorkeySurface(surface);
1797         } else {
1798             if((surface->flags & SDL_SRCALPHA) == SDL_SRCALPHA
1799                && surface->format->Amask != 0)
1800                 retcode = RLEAlphaSurface(surface);
1801             else
1802                 retcode = -1;   /* no RLE for per-surface alpha sans ckey */
1803         }
1804
1805         /* Unlock the surface if it's in hardware */
1806         if ( SDL_MUSTLOCK(surface) ) {
1807                 SDL_UnlockSurface(surface);
1808         }
1809
1810         if(retcode < 0)
1811             return -1;
1812
1813         /* The surface is now accelerated */
1814         surface->flags |= SDL_RLEACCEL;
1815
1816         return(0);
1817 }
1818
1819 /*
1820  * Un-RLE a surface with pixel alpha: allocates a new surface->pixels buffer, zero-fills it and decodes
1821  * the RLE data stored after the RLEDestFormat header into it. Returns SDL_FALSE only if that allocation
1822  * fails. This may not give back exactly the image before RLE-encoding; all completely transparent pixels
1823  * will be lost, and colour and alpha depth may have been reduced (when encoding for 16bpp targets).
1824  */
1825 static SDL_bool UnRLEAlpha(SDL_Surface *surface)
1826 {
1827     Uint8 *srcbuf;   /* read position in the encoded data */
1828     Uint32 *dst;     /* first pixel of the row currently being decoded */
1829     SDL_PixelFormat *sf = surface->format;
1830     RLEDestFormat *df = surface->map->sw_data->aux_data;   /* RLE header; the encoded data follows it */
1831     int (*uncopy_opaque)(Uint32 *, void *, int,
1832                          RLEDestFormat *, SDL_PixelFormat *);   /* decoder used for runs of opaque pixels */
1833     int (*uncopy_transl)(Uint32 *, void *, int,
1834                          RLEDestFormat *, SDL_PixelFormat *);   /* decoder used for runs of translucent pixels */
1835     int w = surface->w;
1836     int bpp = df->BytesPerPixel;   /* bytes per pixel as recorded in the RLE header */
1837
1838     if(bpp == 2) {   /* 2-byte data: separate decoders for opaque and translucent runs */
1839         uncopy_opaque = uncopy_opaque_16;
1840         uncopy_transl = uncopy_transl_16;
1841     } else {   /* otherwise a single decoder handles both kinds of run */
1842         uncopy_opaque = uncopy_transl = uncopy_32;
1843     }
1844
1845     surface->pixels = SDL_malloc(surface->h * surface->pitch);   /* h rows of pitch bytes each */
1846     if ( !surface->pixels ) {
1847         return(SDL_FALSE);   /* allocation failed; surface->pixels is left NULL */
1848     }
1849     /* fill background with transparent pixels (every byte of the new buffer is set to 0) */
1850     SDL_memset(surface->pixels, 0, surface->h * surface->pitch);
1851
1852     dst = surface->pixels;
1853     srcbuf = (Uint8 *)(df + 1);   /* encoded data starts right after the header */
1854     for(;;) {   /* one pass per row; the only way out is the return in the opaque pass */
1855         /* copy opaque pixels: a sequence of <skip> <run> <data> segments */
1856         int ofs = 0;   /* x position within the current row */
1857         do {
1858             unsigned run;
1859             if(bpp == 2) {   /* 2-byte data: skip and run are one byte each */
1860                 ofs += srcbuf[0];
1861                 run = srcbuf[1];
1862                 srcbuf += 2;
1863             } else {   /* otherwise skip and run are read as two Uint16 values */
1864                 ofs += ((Uint16 *)srcbuf)[0];
1865                 run = ((Uint16 *)srcbuf)[1];
1866                 srcbuf += 4;
1867             }
1868             if(run) {
1869                 srcbuf += uncopy_opaque(dst + ofs, srcbuf, run, df, sf);   /* return value is how far to advance srcbuf */
1870                 ofs += run;
1871             } else if(!ofs)   /* zero run while still at offset zero: end of the encoded data */
1872                 return(SDL_TRUE);
1873         } while(ofs < w);   /* repeat until the row width is covered */
1874
1875         /* skip padding if needed: step 2 bytes forward when bit 1 of the address is set */
1876         if(bpp == 2)
1877             srcbuf += (uintptr_t)srcbuf & 2;
1878
1879         /* copy translucent pixels; here skip and run are always read as Uint16 values */
1880         ofs = 0;
1881         do {
1882             unsigned run;
1883             ofs += ((Uint16 *)srcbuf)[0];
1884             run = ((Uint16 *)srcbuf)[1];
1885             srcbuf += 4;
1886             if(run) {
1887                 srcbuf += uncopy_transl(dst + ofs, srcbuf, run, df, sf);   /* return value is how far to advance srcbuf */
1888                 ofs += run;
1889             }
1890         } while(ofs < w);
1891         dst += surface->pitch >> 2;   /* next row: dst is a Uint32 pointer, so step by pitch / 4 elements */
1892     }
1893     /* Make the compiler happy: not reached, the loop above only exits by returning */
1894     return(SDL_TRUE);
1895 }
1896
1897 void SDL_UnRLESurface(SDL_Surface *surface, int recode)   /* undo the RLE acceleration of surface */
1898 {   /* recode != 0: also try to rebuild surface->pixels from the encoded data before discarding it */
1899     if ( (surface->flags & SDL_RLEACCEL) == SDL_RLEACCEL ) {   /* nothing to do unless the flag is set */
1900         surface->flags &= ~SDL_RLEACCEL;   /* cleared up front; set again below if a pixel buffer cannot be allocated */
1901
1902         if(recode && (surface->flags & SDL_PREALLOC) != SDL_PREALLOC
1903            && (surface->flags & SDL_HWSURFACE) != SDL_HWSURFACE) {   /* decode only on request, never for preallocated or hardware surfaces */
1904             if((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) {   /* colorkeyed surface: rebuild by blitting the encoded data */
1905                 SDL_Rect full;
1906                 unsigned alpha_flag;
1907
1908                 /* re-create the original surface: start with a freshly allocated pixel buffer */
1909                 surface->pixels = SDL_malloc(surface->h * surface->pitch);
1910                 if ( !surface->pixels ) {
1911                         /* Oh crap... allocation failed; mark the surface as RLE-accelerated again and give up */
1912                         surface->flags |= SDL_RLEACCEL;
1913                         return;
1914                 }
1915
1916                 /* fill it with the background colour (the surface's colorkey) */
1917                 SDL_FillRect(surface, NULL, surface->format->colorkey);
1918
1919                 /* now render the encoded surface over the whole area, onto the surface itself */
1920                 full.x = full.y = 0;
1921                 full.w = surface->w;
1922                 full.h = surface->h;
1923                 alpha_flag = surface->flags & SDL_SRCALPHA;   /* remembered so the flag can be put back after the blit */
1924                 surface->flags &= ~SDL_SRCALPHA; /* opaque blit */
1925                 SDL_RLEBlit(surface, &full, surface, &full);   /* source and destination are the same surface */
1926                 surface->flags |= alpha_flag;
1927             } else {   /* no colorkey: decode with UnRLEAlpha */
1928                 if ( !UnRLEAlpha(surface) ) {
1929                     /* Oh crap... decoding failed; mark the surface as RLE-accelerated again and give up */
1930                     surface->flags |= SDL_RLEACCEL;
1931                     return;
1932                 }
1933             }
1934         }
1935
1936         if ( surface->map && surface->map->sw_data->aux_data ) {   /* reached even when no pixels were rebuilt above */
1937             SDL_free(surface->map->sw_data->aux_data);   /* aux_data is where UnRLEAlpha reads the encoded data from */
1938             surface->map->sw_data->aux_data = NULL;
1939         }
1940     }
1941 }
1942
1943