libswscale / rgb2rgb_template.c
1 /*
2 * software RGB to RGB converter
3 * pluralize by software PAL8 to RGB converter
4 * software YUV to YUV converter
5 * software YUV to RGB converter
6 * Written by Nick Kurshev.
7 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
8 * lot of big-endian byte order fixes by Alex Beregszaszi
10 * This file is part of FFmpeg.
12 * FFmpeg is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License as published by the Free Software Foundation; either
15 * version 2.1 of the License, or (at your option) any later version.
17 * FFmpeg is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Lesser General Public License for more details.
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with FFmpeg; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25 */
27 #include <stddef.h>
29 #undef PREFETCH
30 #undef MOVNTQ
31 #undef EMMS
32 #undef SFENCE
33 #undef MMREG_SIZE
34 #undef PAVGB
36 #if COMPILE_TEMPLATE_SSE2
37 #define MMREG_SIZE 16
38 #else
39 #define MMREG_SIZE 8
40 #endif
42 #if COMPILE_TEMPLATE_AMD3DNOW
43 #define PREFETCH "prefetch"
44 #define PAVGB "pavgusb"
45 #elif COMPILE_TEMPLATE_MMX2
46 #define PREFETCH "prefetchnta"
47 #define PAVGB "pavgb"
48 #else
49 #define PREFETCH " # nop"
50 #endif
52 #if COMPILE_TEMPLATE_AMD3DNOW
53 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
54 #define EMMS "femms"
55 #else
56 #define EMMS "emms"
57 #endif
59 #if COMPILE_TEMPLATE_MMX2
60 #define MOVNTQ "movntq"
61 #define SFENCE "sfence"
62 #else
63 #define MOVNTQ "movq"
64 #define SFENCE " # nop"
65 #endif
67 static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
68 {
69 uint8_t *dest = dst;
70 const uint8_t *s = src;
71 const uint8_t *end;
72 #if COMPILE_TEMPLATE_MMX
73 const uint8_t *mm_end;
74 #endif
75 end = s + src_size;
76 #if COMPILE_TEMPLATE_MMX
77 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
78 mm_end = end - 23;
79 __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
80 while (s < mm_end) {
81 __asm__ volatile(
82 PREFETCH" 32%1 \n\t"
83 "movd %1, %%mm0 \n\t"
84 "punpckldq 3%1, %%mm0 \n\t"
85 "movd 6%1, %%mm1 \n\t"
86 "punpckldq 9%1, %%mm1 \n\t"
87 "movd 12%1, %%mm2 \n\t"
88 "punpckldq 15%1, %%mm2 \n\t"
89 "movd 18%1, %%mm3 \n\t"
90 "punpckldq 21%1, %%mm3 \n\t"
91 "por %%mm7, %%mm0 \n\t"
92 "por %%mm7, %%mm1 \n\t"
93 "por %%mm7, %%mm2 \n\t"
94 "por %%mm7, %%mm3 \n\t"
95 MOVNTQ" %%mm0, %0 \n\t"
96 MOVNTQ" %%mm1, 8%0 \n\t"
97 MOVNTQ" %%mm2, 16%0 \n\t"
98 MOVNTQ" %%mm3, 24%0"
99 :"=m"(*dest)
100 :"m"(*s)
101 :"memory");
102 dest += 32;
103 s += 24;
104 }
105 __asm__ volatile(SFENCE:::"memory");
106 __asm__ volatile(EMMS:::"memory");
107 #endif
108 while (s < end) {
109 #if HAVE_BIGENDIAN
110 /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
111 *dest++ = 255;
112 *dest++ = s[2];
113 *dest++ = s[1];
114 *dest++ = s[0];
115 s+=3;
116 #else
117 *dest++ = *s++;
118 *dest++ = *s++;
119 *dest++ = *s++;
120 *dest++ = 255;
121 #endif
122 }
123 }
125 #define STORE_BGR24_MMX \
126 "psrlq $8, %%mm2 \n\t" \
127 "psrlq $8, %%mm3 \n\t" \
128 "psrlq $8, %%mm6 \n\t" \
129 "psrlq $8, %%mm7 \n\t" \
130 "pand "MANGLE(mask24l)", %%mm0\n\t" \
131 "pand "MANGLE(mask24l)", %%mm1\n\t" \
132 "pand "MANGLE(mask24l)", %%mm4\n\t" \
133 "pand "MANGLE(mask24l)", %%mm5\n\t" \
134 "pand "MANGLE(mask24h)", %%mm2\n\t" \
135 "pand "MANGLE(mask24h)", %%mm3\n\t" \
136 "pand "MANGLE(mask24h)", %%mm6\n\t" \
137 "pand "MANGLE(mask24h)", %%mm7\n\t" \
138 "por %%mm2, %%mm0 \n\t" \
139 "por %%mm3, %%mm1 \n\t" \
140 "por %%mm6, %%mm4 \n\t" \
141 "por %%mm7, %%mm5 \n\t" \
143 "movq %%mm1, %%mm2 \n\t" \
144 "movq %%mm4, %%mm3 \n\t" \
145 "psllq $48, %%mm2 \n\t" \
146 "psllq $32, %%mm3 \n\t" \
147 "pand "MANGLE(mask24hh)", %%mm2\n\t" \
148 "pand "MANGLE(mask24hhh)", %%mm3\n\t" \
149 "por %%mm2, %%mm0 \n\t" \
150 "psrlq $16, %%mm1 \n\t" \
151 "psrlq $32, %%mm4 \n\t" \
152 "psllq $16, %%mm5 \n\t" \
153 "por %%mm3, %%mm1 \n\t" \
154 "pand "MANGLE(mask24hhhh)", %%mm5\n\t" \
155 "por %%mm5, %%mm4 \n\t" \
157 MOVNTQ" %%mm0, %0 \n\t" \
158 MOVNTQ" %%mm1, 8%0 \n\t" \
159 MOVNTQ" %%mm4, 16%0"
162 static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
163 {
164 uint8_t *dest = dst;
165 const uint8_t *s = src;
166 const uint8_t *end;
167 #if COMPILE_TEMPLATE_MMX
168 const uint8_t *mm_end;
169 #endif
170 end = s + src_size;
171 #if COMPILE_TEMPLATE_MMX
172 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
173 mm_end = end - 31;
174 while (s < mm_end) {
175 __asm__ volatile(
176 PREFETCH" 32%1 \n\t"
177 "movq %1, %%mm0 \n\t"
178 "movq 8%1, %%mm1 \n\t"
179 "movq 16%1, %%mm4 \n\t"
180 "movq 24%1, %%mm5 \n\t"
181 "movq %%mm0, %%mm2 \n\t"
182 "movq %%mm1, %%mm3 \n\t"
183 "movq %%mm4, %%mm6 \n\t"
184 "movq %%mm5, %%mm7 \n\t"
185 STORE_BGR24_MMX
186 :"=m"(*dest)
187 :"m"(*s)
188 :"memory");
189 dest += 24;
190 s += 32;
191 }
192 __asm__ volatile(SFENCE:::"memory");
193 __asm__ volatile(EMMS:::"memory");
194 #endif
195 while (s < end) {
196 #if HAVE_BIGENDIAN
197 /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
198 s++;
199 dest[2] = *s++;
200 dest[1] = *s++;
201 dest[0] = *s++;
202 dest += 3;
203 #else
204 *dest++ = *s++;
205 *dest++ = *s++;
206 *dest++ = *s++;
207 s++;
208 #endif
209 }
210 }

212 /*
213 original by Strepto/Astral
214 ported to gcc & bugfixed: A'rpi
215 MMX2, 3DNOW optimization by Nick Kurshev
216 32-bit C version, and and&add trick by Michael Niedermayer
217 */
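/* The "and&add" trick credited above, as used in the C paths below: for a
   15-bit pixel x = 0rrrrrgggggbbbbb, (x & 0x7FFF) + (x & 0x7FE0) adds the
   red and green fields to themselves, i.e. shifts them up one bit into the
   565 layout (the new green LSB is 0) while the five blue bits stay put;
   applied to a 32-bit word it converts two pixels per add. */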
218 static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size)
219 {
220 register const uint8_t* s=src;
221 register uint8_t* d=dst;
222 register const uint8_t *end;
223 const uint8_t *mm_end;
224 end = s + src_size;
225 #if COMPILE_TEMPLATE_MMX
226 __asm__ volatile(PREFETCH" %0"::"m"(*s));
227 __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
228 mm_end = end - 15;
229 while (s<mm_end) {
230 __asm__ volatile(
231 PREFETCH" 32%1 \n\t"
232 "movq %1, %%mm0 \n\t"
233 "movq 8%1, %%mm2 \n\t"
234 "movq %%mm0, %%mm1 \n\t"
235 "movq %%mm2, %%mm3 \n\t"
236 "pand %%mm4, %%mm0 \n\t"
237 "pand %%mm4, %%mm2 \n\t"
238 "paddw %%mm1, %%mm0 \n\t"
239 "paddw %%mm3, %%mm2 \n\t"
240 MOVNTQ" %%mm0, %0 \n\t"
241 MOVNTQ" %%mm2, 8%0"
242 :"=m"(*d)
243 :"m"(*s)
244 );
245 d+=16;
246 s+=16;
247 }
248 __asm__ volatile(SFENCE:::"memory");
249 __asm__ volatile(EMMS:::"memory");
250 #endif
251 mm_end = end - 3;
252 while (s < mm_end) {
253 register unsigned x= *((const uint32_t *)s);
254 *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
255 d+=4;
256 s+=4;
257 }
258 if (s < end) {
259 register unsigned short x= *((const uint16_t *)s);
260 *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
261 }
262 }
264 static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, long src_size)
265 {
266 register const uint8_t* s=src;
267 register uint8_t* d=dst;
268 register const uint8_t *end;
269 const uint8_t *mm_end;
270 end = s + src_size;
271 #if COMPILE_TEMPLATE_MMX
272 __asm__ volatile(PREFETCH" %0"::"m"(*s));
273 __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
274 __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
275 mm_end = end - 15;
276 while (s<mm_end) {
277 __asm__ volatile(
278 PREFETCH" 32%1 \n\t"
279 "movq %1, %%mm0 \n\t"
280 "movq 8%1, %%mm2 \n\t"
281 "movq %%mm0, %%mm1 \n\t"
282 "movq %%mm2, %%mm3 \n\t"
283 "psrlq $1, %%mm0 \n\t"
284 "psrlq $1, %%mm2 \n\t"
285 "pand %%mm7, %%mm0 \n\t"
286 "pand %%mm7, %%mm2 \n\t"
287 "pand %%mm6, %%mm1 \n\t"
288 "pand %%mm6, %%mm3 \n\t"
289 "por %%mm1, %%mm0 \n\t"
290 "por %%mm3, %%mm2 \n\t"
291 MOVNTQ" %%mm0, %0 \n\t"
292 MOVNTQ" %%mm2, 8%0"
293 :"=m"(*d)
294 :"m"(*s)
295 );
296 d+=16;
297 s+=16;
298 }
299 __asm__ volatile(SFENCE:::"memory");
300 __asm__ volatile(EMMS:::"memory");
301 #endif
302 mm_end = end - 3;
303 while (s < mm_end) {
304 register uint32_t x= *((const uint32_t*)s);
305 *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
306 s+=4;
307 d+=4;
308 }
309 if (s < end) {
310 register uint16_t x= *((const uint16_t*)s);
311 *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
312 }
313 }
315 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
316 {
317 const uint8_t *s = src;
318 const uint8_t *end;
319 #if COMPILE_TEMPLATE_MMX
320 const uint8_t *mm_end;
321 #endif
322 uint16_t *d = (uint16_t *)dst;
323 end = s + src_size;
324 #if COMPILE_TEMPLATE_MMX
325 mm_end = end - 15;
326 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
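/* Descriptive note: this variant replaces the per-channel psrlq/psllq
   shifts with one pmaddwd per register pair. Blue and red are masked
   together (mask3216br) and pmaddwd's per-word multipliers move both
   fields toward their 565 positions at once; green is kept separately via
   mask3216g and merged before the final psrld/pslld that align everything. */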
327 __asm__ volatile(
328 "movq %3, %%mm5 \n\t"
329 "movq %4, %%mm6 \n\t"
330 "movq %5, %%mm7 \n\t"
331 "jmp 2f \n\t"
332 ASMALIGN(4)
333 "1: \n\t"
334 PREFETCH" 32(%1) \n\t"
335 "movd (%1), %%mm0 \n\t"
336 "movd 4(%1), %%mm3 \n\t"
337 "punpckldq 8(%1), %%mm0 \n\t"
338 "punpckldq 12(%1), %%mm3 \n\t"
339 "movq %%mm0, %%mm1 \n\t"
340 "movq %%mm3, %%mm4 \n\t"
341 "pand %%mm6, %%mm0 \n\t"
342 "pand %%mm6, %%mm3 \n\t"
343 "pmaddwd %%mm7, %%mm0 \n\t"
344 "pmaddwd %%mm7, %%mm3 \n\t"
345 "pand %%mm5, %%mm1 \n\t"
346 "pand %%mm5, %%mm4 \n\t"
347 "por %%mm1, %%mm0 \n\t"
348 "por %%mm4, %%mm3 \n\t"
349 "psrld $5, %%mm0 \n\t"
350 "pslld $11, %%mm3 \n\t"
351 "por %%mm3, %%mm0 \n\t"
352 MOVNTQ" %%mm0, (%0) \n\t"
353 "add $16, %1 \n\t"
354 "add $8, %0 \n\t"
355 "2: \n\t"
356 "cmp %2, %1 \n\t"
357 " jb 1b \n\t"
358 : "+r" (d), "+r"(s)
359 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
360 );
361 #else
362 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
363 __asm__ volatile(
364 "movq %0, %%mm7 \n\t"
365 "movq %1, %%mm6 \n\t"
366 ::"m"(red_16mask),"m"(green_16mask));
367 while (s < mm_end) {
368 __asm__ volatile(
369 PREFETCH" 32%1 \n\t"
370 "movd %1, %%mm0 \n\t"
371 "movd 4%1, %%mm3 \n\t"
372 "punpckldq 8%1, %%mm0 \n\t"
373 "punpckldq 12%1, %%mm3 \n\t"
374 "movq %%mm0, %%mm1 \n\t"
375 "movq %%mm0, %%mm2 \n\t"
376 "movq %%mm3, %%mm4 \n\t"
377 "movq %%mm3, %%mm5 \n\t"
378 "psrlq $3, %%mm0 \n\t"
379 "psrlq $3, %%mm3 \n\t"
380 "pand %2, %%mm0 \n\t"
381 "pand %2, %%mm3 \n\t"
382 "psrlq $5, %%mm1 \n\t"
383 "psrlq $5, %%mm4 \n\t"
384 "pand %%mm6, %%mm1 \n\t"
385 "pand %%mm6, %%mm4 \n\t"
386 "psrlq $8, %%mm2 \n\t"
387 "psrlq $8, %%mm5 \n\t"
388 "pand %%mm7, %%mm2 \n\t"
389 "pand %%mm7, %%mm5 \n\t"
390 "por %%mm1, %%mm0 \n\t"
391 "por %%mm4, %%mm3 \n\t"
392 "por %%mm2, %%mm0 \n\t"
393 "por %%mm5, %%mm3 \n\t"
394 "psllq $16, %%mm3 \n\t"
395 "por %%mm3, %%mm0 \n\t"
396 MOVNTQ" %%mm0, %0 \n\t"
397 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
398 d += 4;
399 s += 16;
400 }
401 #endif
402 __asm__ volatile(SFENCE:::"memory");
403 __asm__ volatile(EMMS:::"memory");
404 #endif
405 while (s < end) {
406 register int rgb = *(const uint32_t*)s; s += 4;
407 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
408 }
409 }
411 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
412 {
413 const uint8_t *s = src;
414 const uint8_t *end;
415 #if COMPILE_TEMPLATE_MMX
416 const uint8_t *mm_end;
417 #endif
418 uint16_t *d = (uint16_t *)dst;
419 end = s + src_size;
420 #if COMPILE_TEMPLATE_MMX
421 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
422 __asm__ volatile(
423 "movq %0, %%mm7 \n\t"
424 "movq %1, %%mm6 \n\t"
425 ::"m"(red_16mask),"m"(green_16mask));
426 mm_end = end - 15;
427 while (s < mm_end) {
428 __asm__ volatile(
429 PREFETCH" 32%1 \n\t"
430 "movd %1, %%mm0 \n\t"
431 "movd 4%1, %%mm3 \n\t"
432 "punpckldq 8%1, %%mm0 \n\t"
433 "punpckldq 12%1, %%mm3 \n\t"
434 "movq %%mm0, %%mm1 \n\t"
435 "movq %%mm0, %%mm2 \n\t"
436 "movq %%mm3, %%mm4 \n\t"
437 "movq %%mm3, %%mm5 \n\t"
438 "psllq $8, %%mm0 \n\t"
439 "psllq $8, %%mm3 \n\t"
440 "pand %%mm7, %%mm0 \n\t"
441 "pand %%mm7, %%mm3 \n\t"
442 "psrlq $5, %%mm1 \n\t"
443 "psrlq $5, %%mm4 \n\t"
444 "pand %%mm6, %%mm1 \n\t"
445 "pand %%mm6, %%mm4 \n\t"
446 "psrlq $19, %%mm2 \n\t"
447 "psrlq $19, %%mm5 \n\t"
448 "pand %2, %%mm2 \n\t"
449 "pand %2, %%mm5 \n\t"
450 "por %%mm1, %%mm0 \n\t"
451 "por %%mm4, %%mm3 \n\t"
452 "por %%mm2, %%mm0 \n\t"
453 "por %%mm5, %%mm3 \n\t"
454 "psllq $16, %%mm3 \n\t"
455 "por %%mm3, %%mm0 \n\t"
456 MOVNTQ" %%mm0, %0 \n\t"
457 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
458 d += 4;
459 s += 16;
460 }
461 __asm__ volatile(SFENCE:::"memory");
462 __asm__ volatile(EMMS:::"memory");
463 #endif
464 while (s < end) {
465 register int rgb = *(const uint32_t*)s; s += 4;
466 *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
467 }
468 }
470 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
471 {
472 const uint8_t *s = src;
473 const uint8_t *end;
474 #if COMPILE_TEMPLATE_MMX
475 const uint8_t *mm_end;
476 #endif
477 uint16_t *d = (uint16_t *)dst;
478 end = s + src_size;
479 #if COMPILE_TEMPLATE_MMX
480 mm_end = end - 15;
481 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
482 __asm__ volatile(
483 "movq %3, %%mm5 \n\t"
484 "movq %4, %%mm6 \n\t"
485 "movq %5, %%mm7 \n\t"
486 "jmp 2f \n\t"
487 ASMALIGN(4)
488 "1: \n\t"
489 PREFETCH" 32(%1) \n\t"
490 "movd (%1), %%mm0 \n\t"
491 "movd 4(%1), %%mm3 \n\t"
492 "punpckldq 8(%1), %%mm0 \n\t"
493 "punpckldq 12(%1), %%mm3 \n\t"
494 "movq %%mm0, %%mm1 \n\t"
495 "movq %%mm3, %%mm4 \n\t"
496 "pand %%mm6, %%mm0 \n\t"
497 "pand %%mm6, %%mm3 \n\t"
498 "pmaddwd %%mm7, %%mm0 \n\t"
499 "pmaddwd %%mm7, %%mm3 \n\t"
500 "pand %%mm5, %%mm1 \n\t"
501 "pand %%mm5, %%mm4 \n\t"
502 "por %%mm1, %%mm0 \n\t"
503 "por %%mm4, %%mm3 \n\t"
504 "psrld $6, %%mm0 \n\t"
505 "pslld $10, %%mm3 \n\t"
506 "por %%mm3, %%mm0 \n\t"
507 MOVNTQ" %%mm0, (%0) \n\t"
508 "add $16, %1 \n\t"
509 "add $8, %0 \n\t"
510 "2: \n\t"
511 "cmp %2, %1 \n\t"
512 " jb 1b \n\t"
513 : "+r" (d), "+r"(s)
514 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
515 );
516 #else
517 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
518 __asm__ volatile(
519 "movq %0, %%mm7 \n\t"
520 "movq %1, %%mm6 \n\t"
521 ::"m"(red_15mask),"m"(green_15mask));
522 while (s < mm_end) {
523 __asm__ volatile(
524 PREFETCH" 32%1 \n\t"
525 "movd %1, %%mm0 \n\t"
526 "movd 4%1, %%mm3 \n\t"
527 "punpckldq 8%1, %%mm0 \n\t"
528 "punpckldq 12%1, %%mm3 \n\t"
529 "movq %%mm0, %%mm1 \n\t"
530 "movq %%mm0, %%mm2 \n\t"
531 "movq %%mm3, %%mm4 \n\t"
532 "movq %%mm3, %%mm5 \n\t"
533 "psrlq $3, %%mm0 \n\t"
534 "psrlq $3, %%mm3 \n\t"
535 "pand %2, %%mm0 \n\t"
536 "pand %2, %%mm3 \n\t"
537 "psrlq $6, %%mm1 \n\t"
538 "psrlq $6, %%mm4 \n\t"
539 "pand %%mm6, %%mm1 \n\t"
540 "pand %%mm6, %%mm4 \n\t"
541 "psrlq $9, %%mm2 \n\t"
542 "psrlq $9, %%mm5 \n\t"
543 "pand %%mm7, %%mm2 \n\t"
544 "pand %%mm7, %%mm5 \n\t"
545 "por %%mm1, %%mm0 \n\t"
546 "por %%mm4, %%mm3 \n\t"
547 "por %%mm2, %%mm0 \n\t"
548 "por %%mm5, %%mm3 \n\t"
549 "psllq $16, %%mm3 \n\t"
550 "por %%mm3, %%mm0 \n\t"
551 MOVNTQ" %%mm0, %0 \n\t"
552 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
553 d += 4;
554 s += 16;
555 }
556 #endif
557 __asm__ volatile(SFENCE:::"memory");
558 __asm__ volatile(EMMS:::"memory");
559 #endif
560 while (s < end) {
561 register int rgb = *(const uint32_t*)s; s += 4;
562 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
563 }
564 }
566 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
567 {
568 const uint8_t *s = src;
569 const uint8_t *end;
570 #if COMPILE_TEMPLATE_MMX
571 const uint8_t *mm_end;
572 #endif
573 uint16_t *d = (uint16_t *)dst;
574 end = s + src_size;
575 #if COMPILE_TEMPLATE_MMX
576 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
577 __asm__ volatile(
578 "movq %0, %%mm7 \n\t"
579 "movq %1, %%mm6 \n\t"
580 ::"m"(red_15mask),"m"(green_15mask));
581 mm_end = end - 15;
582 while (s < mm_end) {
583 __asm__ volatile(
584 PREFETCH" 32%1 \n\t"
585 "movd %1, %%mm0 \n\t"
586 "movd 4%1, %%mm3 \n\t"
587 "punpckldq 8%1, %%mm0 \n\t"
588 "punpckldq 12%1, %%mm3 \n\t"
589 "movq %%mm0, %%mm1 \n\t"
590 "movq %%mm0, %%mm2 \n\t"
591 "movq %%mm3, %%mm4 \n\t"
592 "movq %%mm3, %%mm5 \n\t"
593 "psllq $7, %%mm0 \n\t"
594 "psllq $7, %%mm3 \n\t"
595 "pand %%mm7, %%mm0 \n\t"
596 "pand %%mm7, %%mm3 \n\t"
597 "psrlq $6, %%mm1 \n\t"
598 "psrlq $6, %%mm4 \n\t"
599 "pand %%mm6, %%mm1 \n\t"
600 "pand %%mm6, %%mm4 \n\t"
601 "psrlq $19, %%mm2 \n\t"
602 "psrlq $19, %%mm5 \n\t"
603 "pand %2, %%mm2 \n\t"
604 "pand %2, %%mm5 \n\t"
605 "por %%mm1, %%mm0 \n\t"
606 "por %%mm4, %%mm3 \n\t"
607 "por %%mm2, %%mm0 \n\t"
608 "por %%mm5, %%mm3 \n\t"
609 "psllq $16, %%mm3 \n\t"
610 "por %%mm3, %%mm0 \n\t"
611 MOVNTQ" %%mm0, %0 \n\t"
612 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
613 d += 4;
614 s += 16;
615 }
616 __asm__ volatile(SFENCE:::"memory");
617 __asm__ volatile(EMMS:::"memory");
618 #endif
619 while (s < end) {
620 register int rgb = *(const uint32_t*)s; s += 4;
621 *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
622 }
623 }
625 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
626 {
627 const uint8_t *s = src;
628 const uint8_t *end;
629 #if COMPILE_TEMPLATE_MMX
630 const uint8_t *mm_end;
631 #endif
632 uint16_t *d = (uint16_t *)dst;
633 end = s + src_size;
634 #if COMPILE_TEMPLATE_MMX
635 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
636 __asm__ volatile(
637 "movq %0, %%mm7 \n\t"
638 "movq %1, %%mm6 \n\t"
639 ::"m"(red_16mask),"m"(green_16mask));
640 mm_end = end - 11;
641 while (s < mm_end) {
642 __asm__ volatile(
643 PREFETCH" 32%1 \n\t"
644 "movd %1, %%mm0 \n\t"
645 "movd 3%1, %%mm3 \n\t"
646 "punpckldq 6%1, %%mm0 \n\t"
647 "punpckldq 9%1, %%mm3 \n\t"
648 "movq %%mm0, %%mm1 \n\t"
649 "movq %%mm0, %%mm2 \n\t"
650 "movq %%mm3, %%mm4 \n\t"
651 "movq %%mm3, %%mm5 \n\t"
652 "psrlq $3, %%mm0 \n\t"
653 "psrlq $3, %%mm3 \n\t"
654 "pand %2, %%mm0 \n\t"
655 "pand %2, %%mm3 \n\t"
656 "psrlq $5, %%mm1 \n\t"
657 "psrlq $5, %%mm4 \n\t"
658 "pand %%mm6, %%mm1 \n\t"
659 "pand %%mm6, %%mm4 \n\t"
660 "psrlq $8, %%mm2 \n\t"
661 "psrlq $8, %%mm5 \n\t"
662 "pand %%mm7, %%mm2 \n\t"
663 "pand %%mm7, %%mm5 \n\t"
664 "por %%mm1, %%mm0 \n\t"
665 "por %%mm4, %%mm3 \n\t"
666 "por %%mm2, %%mm0 \n\t"
667 "por %%mm5, %%mm3 \n\t"
668 "psllq $16, %%mm3 \n\t"
669 "por %%mm3, %%mm0 \n\t"
670 MOVNTQ" %%mm0, %0 \n\t"
671 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
672 d += 4;
673 s += 12;
674 }
675 __asm__ volatile(SFENCE:::"memory");
676 __asm__ volatile(EMMS:::"memory");
677 #endif
678 while (s < end) {
679 const int b = *s++;
680 const int g = *s++;
681 const int r = *s++;
682 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
683 }
684 }
686 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
687 {
688 const uint8_t *s = src;
689 const uint8_t *end;
690 #if COMPILE_TEMPLATE_MMX
691 const uint8_t *mm_end;
692 #endif
693 uint16_t *d = (uint16_t *)dst;
694 end = s + src_size;
695 #if COMPILE_TEMPLATE_MMX
696 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
697 __asm__ volatile(
698 "movq %0, %%mm7 \n\t"
699 "movq %1, %%mm6 \n\t"
700 ::"m"(red_16mask),"m"(green_16mask));
701 mm_end = end - 15;
702 while (s < mm_end) {
703 __asm__ volatile(
704 PREFETCH" 32%1 \n\t"
705 "movd %1, %%mm0 \n\t"
706 "movd 3%1, %%mm3 \n\t"
707 "punpckldq 6%1, %%mm0 \n\t"
708 "punpckldq 9%1, %%mm3 \n\t"
709 "movq %%mm0, %%mm1 \n\t"
710 "movq %%mm0, %%mm2 \n\t"
711 "movq %%mm3, %%mm4 \n\t"
712 "movq %%mm3, %%mm5 \n\t"
713 "psllq $8, %%mm0 \n\t"
714 "psllq $8, %%mm3 \n\t"
715 "pand %%mm7, %%mm0 \n\t"
716 "pand %%mm7, %%mm3 \n\t"
717 "psrlq $5, %%mm1 \n\t"
718 "psrlq $5, %%mm4 \n\t"
719 "pand %%mm6, %%mm1 \n\t"
720 "pand %%mm6, %%mm4 \n\t"
721 "psrlq $19, %%mm2 \n\t"
722 "psrlq $19, %%mm5 \n\t"
723 "pand %2, %%mm2 \n\t"
724 "pand %2, %%mm5 \n\t"
725 "por %%mm1, %%mm0 \n\t"
726 "por %%mm4, %%mm3 \n\t"
727 "por %%mm2, %%mm0 \n\t"
728 "por %%mm5, %%mm3 \n\t"
729 "psllq $16, %%mm3 \n\t"
730 "por %%mm3, %%mm0 \n\t"
731 MOVNTQ" %%mm0, %0 \n\t"
732 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
733 d += 4;
734 s += 12;
735 }
736 __asm__ volatile(SFENCE:::"memory");
737 __asm__ volatile(EMMS:::"memory");
738 #endif
739 while (s < end) {
740 const int r = *s++;
741 const int g = *s++;
742 const int b = *s++;
743 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
744 }
745 }
747 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
748 {
749 const uint8_t *s = src;
750 const uint8_t *end;
751 #if COMPILE_TEMPLATE_MMX
752 const uint8_t *mm_end;
753 #endif
754 uint16_t *d = (uint16_t *)dst;
755 end = s + src_size;
756 #if COMPILE_TEMPLATE_MMX
757 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
758 __asm__ volatile(
759 "movq %0, %%mm7 \n\t"
760 "movq %1, %%mm6 \n\t"
761 ::"m"(red_15mask),"m"(green_15mask));
762 mm_end = end - 11;
763 while (s < mm_end) {
764 __asm__ volatile(
765 PREFETCH" 32%1 \n\t"
766 "movd %1, %%mm0 \n\t"
767 "movd 3%1, %%mm3 \n\t"
768 "punpckldq 6%1, %%mm0 \n\t"
769 "punpckldq 9%1, %%mm3 \n\t"
770 "movq %%mm0, %%mm1 \n\t"
771 "movq %%mm0, %%mm2 \n\t"
772 "movq %%mm3, %%mm4 \n\t"
773 "movq %%mm3, %%mm5 \n\t"
774 "psrlq $3, %%mm0 \n\t"
775 "psrlq $3, %%mm3 \n\t"
776 "pand %2, %%mm0 \n\t"
777 "pand %2, %%mm3 \n\t"
778 "psrlq $6, %%mm1 \n\t"
779 "psrlq $6, %%mm4 \n\t"
780 "pand %%mm6, %%mm1 \n\t"
781 "pand %%mm6, %%mm4 \n\t"
782 "psrlq $9, %%mm2 \n\t"
783 "psrlq $9, %%mm5 \n\t"
784 "pand %%mm7, %%mm2 \n\t"
785 "pand %%mm7, %%mm5 \n\t"
786 "por %%mm1, %%mm0 \n\t"
787 "por %%mm4, %%mm3 \n\t"
788 "por %%mm2, %%mm0 \n\t"
789 "por %%mm5, %%mm3 \n\t"
790 "psllq $16, %%mm3 \n\t"
791 "por %%mm3, %%mm0 \n\t"
792 MOVNTQ" %%mm0, %0 \n\t"
793 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
794 d += 4;
795 s += 12;
796 }
797 __asm__ volatile(SFENCE:::"memory");
798 __asm__ volatile(EMMS:::"memory");
799 #endif
800 while (s < end) {
801 const int b = *s++;
802 const int g = *s++;
803 const int r = *s++;
804 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
805 }
806 }
808 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
809 {
810 const uint8_t *s = src;
811 const uint8_t *end;
812 #if COMPILE_TEMPLATE_MMX
813 const uint8_t *mm_end;
814 #endif
815 uint16_t *d = (uint16_t *)dst;
816 end = s + src_size;
817 #if COMPILE_TEMPLATE_MMX
818 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
819 __asm__ volatile(
820 "movq %0, %%mm7 \n\t"
821 "movq %1, %%mm6 \n\t"
822 ::"m"(red_15mask),"m"(green_15mask));
823 mm_end = end - 15;
824 while (s < mm_end) {
825 __asm__ volatile(
826 PREFETCH" 32%1 \n\t"
827 "movd %1, %%mm0 \n\t"
828 "movd 3%1, %%mm3 \n\t"
829 "punpckldq 6%1, %%mm0 \n\t"
830 "punpckldq 9%1, %%mm3 \n\t"
831 "movq %%mm0, %%mm1 \n\t"
832 "movq %%mm0, %%mm2 \n\t"
833 "movq %%mm3, %%mm4 \n\t"
834 "movq %%mm3, %%mm5 \n\t"
835 "psllq $7, %%mm0 \n\t"
836 "psllq $7, %%mm3 \n\t"
837 "pand %%mm7, %%mm0 \n\t"
838 "pand %%mm7, %%mm3 \n\t"
839 "psrlq $6, %%mm1 \n\t"
840 "psrlq $6, %%mm4 \n\t"
841 "pand %%mm6, %%mm1 \n\t"
842 "pand %%mm6, %%mm4 \n\t"
843 "psrlq $19, %%mm2 \n\t"
844 "psrlq $19, %%mm5 \n\t"
845 "pand %2, %%mm2 \n\t"
846 "pand %2, %%mm5 \n\t"
847 "por %%mm1, %%mm0 \n\t"
848 "por %%mm4, %%mm3 \n\t"
849 "por %%mm2, %%mm0 \n\t"
850 "por %%mm5, %%mm3 \n\t"
851 "psllq $16, %%mm3 \n\t"
852 "por %%mm3, %%mm0 \n\t"
853 MOVNTQ" %%mm0, %0 \n\t"
854 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
855 d += 4;
856 s += 12;
857 }
858 __asm__ volatile(SFENCE:::"memory");
859 __asm__ volatile(EMMS:::"memory");
860 #endif
861 while (s < end) {
862 const int r = *s++;
863 const int g = *s++;
864 const int b = *s++;
865 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
866 }
867 }

869 /*
870 I use less accurate approximation here by simply left-shifting the input
871 value and filling the low order bits with zeroes. This method improves PNG
872 compression but this scheme cannot reproduce white exactly, since it does
873 not generate an all-ones maximum value; the net effect is to darken the
874 image slightly.
876 The better method should be "left bit replication":
878 4 3 2 1 0
879 ---------
880 1 1 0 1 1
882 7 6 5 4 3 2 1 0
883 ----------------
884 1 1 0 1 1 1 1 0
885  |=======|  |===|
886      |      leftmost bits repeated to fill open bits
887      |
888  original bits
889 */
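/* A minimal sketch of the left bit replication described above, for one
   5-bit channel value; illustrative only, the conversions below keep the
   plain-shift approximation: */
static inline uint8_t RENAME(replicate_5bit)(uint8_t v)
{
    /* the three vacated low bits are refilled with the top bits of v,
       so 31 maps to 255 and white stays white */
    return (uint8_t)((v << 3) | (v >> 2));
}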
890 static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
891 {
892 const uint16_t *end;
893 #if COMPILE_TEMPLATE_MMX
894 const uint16_t *mm_end;
895 #endif
896 uint8_t *d = dst;
897 const uint16_t *s = (const uint16_t*)src;
898 end = s + src_size/2;
899 #if COMPILE_TEMPLATE_MMX
900 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
901 mm_end = end - 7;
902 while (s < mm_end) {
903 __asm__ volatile(
904 PREFETCH" 32%1 \n\t"
905 "movq %1, %%mm0 \n\t"
906 "movq %1, %%mm1 \n\t"
907 "movq %1, %%mm2 \n\t"
908 "pand %2, %%mm0 \n\t"
909 "pand %3, %%mm1 \n\t"
910 "pand %4, %%mm2 \n\t"
911 "psllq $3, %%mm0 \n\t"
912 "psrlq $2, %%mm1 \n\t"
913 "psrlq $7, %%mm2 \n\t"
914 "movq %%mm0, %%mm3 \n\t"
915 "movq %%mm1, %%mm4 \n\t"
916 "movq %%mm2, %%mm5 \n\t"
917 "punpcklwd %5, %%mm0 \n\t"
918 "punpcklwd %5, %%mm1 \n\t"
919 "punpcklwd %5, %%mm2 \n\t"
920 "punpckhwd %5, %%mm3 \n\t"
921 "punpckhwd %5, %%mm4 \n\t"
922 "punpckhwd %5, %%mm5 \n\t"
923 "psllq $8, %%mm1 \n\t"
924 "psllq $16, %%mm2 \n\t"
925 "por %%mm1, %%mm0 \n\t"
926 "por %%mm2, %%mm0 \n\t"
927 "psllq $8, %%mm4 \n\t"
928 "psllq $16, %%mm5 \n\t"
929 "por %%mm4, %%mm3 \n\t"
930 "por %%mm5, %%mm3 \n\t"
932 "movq %%mm0, %%mm6 \n\t"
933 "movq %%mm3, %%mm7 \n\t"
935 "movq 8%1, %%mm0 \n\t"
936 "movq 8%1, %%mm1 \n\t"
937 "movq 8%1, %%mm2 \n\t"
938 "pand %2, %%mm0 \n\t"
939 "pand %3, %%mm1 \n\t"
940 "pand %4, %%mm2 \n\t"
941 "psllq $3, %%mm0 \n\t"
942 "psrlq $2, %%mm1 \n\t"
943 "psrlq $7, %%mm2 \n\t"
944 "movq %%mm0, %%mm3 \n\t"
945 "movq %%mm1, %%mm4 \n\t"
946 "movq %%mm2, %%mm5 \n\t"
947 "punpcklwd %5, %%mm0 \n\t"
948 "punpcklwd %5, %%mm1 \n\t"
949 "punpcklwd %5, %%mm2 \n\t"
950 "punpckhwd %5, %%mm3 \n\t"
951 "punpckhwd %5, %%mm4 \n\t"
952 "punpckhwd %5, %%mm5 \n\t"
953 "psllq $8, %%mm1 \n\t"
954 "psllq $16, %%mm2 \n\t"
955 "por %%mm1, %%mm0 \n\t"
956 "por %%mm2, %%mm0 \n\t"
957 "psllq $8, %%mm4 \n\t"
958 "psllq $16, %%mm5 \n\t"
959 "por %%mm4, %%mm3 \n\t"
960 "por %%mm5, %%mm3 \n\t"
962 :"=m"(*d)
963 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
964 :"memory");
965 /* borrowed 32 to 24 */
966 __asm__ volatile(
967 "movq %%mm0, %%mm4 \n\t"
968 "movq %%mm3, %%mm5 \n\t"
969 "movq %%mm6, %%mm0 \n\t"
970 "movq %%mm7, %%mm1 \n\t"
972 "movq %%mm4, %%mm6 \n\t"
973 "movq %%mm5, %%mm7 \n\t"
974 "movq %%mm0, %%mm2 \n\t"
975 "movq %%mm1, %%mm3 \n\t"
977 STORE_BGR24_MMX
979 :"=m"(*d)
980 :"m"(*s)
981 :"memory");
982 d += 24;
983 s += 8;
984 }
985 __asm__ volatile(SFENCE:::"memory");
986 __asm__ volatile(EMMS:::"memory");
987 #endif
988 while (s < end) {
989 register uint16_t bgr;
990 bgr = *s++;
991 *d++ = (bgr&0x1F)<<3;
992 *d++ = (bgr&0x3E0)>>2;
993 *d++ = (bgr&0x7C00)>>7;
994 }
995 }
997 static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
998 {
999 const uint16_t *end;
1000 #if COMPILE_TEMPLATE_MMX
1001 const uint16_t *mm_end;
1002 #endif
1003 uint8_t *d = (uint8_t *)dst;
1004 const uint16_t *s = (const uint16_t *)src;
1005 end = s + src_size/2;
1006 #if COMPILE_TEMPLATE_MMX
1007 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
1008 mm_end = end - 7;
1009 while (s < mm_end) {
1010 __asm__ volatile(
1011 PREFETCH" 32%1 \n\t"
1012 "movq %1, %%mm0 \n\t"
1013 "movq %1, %%mm1 \n\t"
1014 "movq %1, %%mm2 \n\t"
1015 "pand %2, %%mm0 \n\t"
1016 "pand %3, %%mm1 \n\t"
1017 "pand %4, %%mm2 \n\t"
1018 "psllq $3, %%mm0 \n\t"
1019 "psrlq $3, %%mm1 \n\t"
1020 "psrlq $8, %%mm2 \n\t"
1021 "movq %%mm0, %%mm3 \n\t"
1022 "movq %%mm1, %%mm4 \n\t"
1023 "movq %%mm2, %%mm5 \n\t"
1024 "punpcklwd %5, %%mm0 \n\t"
1025 "punpcklwd %5, %%mm1 \n\t"
1026 "punpcklwd %5, %%mm2 \n\t"
1027 "punpckhwd %5, %%mm3 \n\t"
1028 "punpckhwd %5, %%mm4 \n\t"
1029 "punpckhwd %5, %%mm5 \n\t"
1030 "psllq $8, %%mm1 \n\t"
1031 "psllq $16, %%mm2 \n\t"
1032 "por %%mm1, %%mm0 \n\t"
1033 "por %%mm2, %%mm0 \n\t"
1034 "psllq $8, %%mm4 \n\t"
1035 "psllq $16, %%mm5 \n\t"
1036 "por %%mm4, %%mm3 \n\t"
1037 "por %%mm5, %%mm3 \n\t"
1039 "movq %%mm0, %%mm6 \n\t"
1040 "movq %%mm3, %%mm7 \n\t"
1042 "movq 8%1, %%mm0 \n\t"
1043 "movq 8%1, %%mm1 \n\t"
1044 "movq 8%1, %%mm2 \n\t"
1045 "pand %2, %%mm0 \n\t"
1046 "pand %3, %%mm1 \n\t"
1047 "pand %4, %%mm2 \n\t"
1048 "psllq $3, %%mm0 \n\t"
1049 "psrlq $3, %%mm1 \n\t"
1050 "psrlq $8, %%mm2 \n\t"
1051 "movq %%mm0, %%mm3 \n\t"
1052 "movq %%mm1, %%mm4 \n\t"
1053 "movq %%mm2, %%mm5 \n\t"
1054 "punpcklwd %5, %%mm0 \n\t"
1055 "punpcklwd %5, %%mm1 \n\t"
1056 "punpcklwd %5, %%mm2 \n\t"
1057 "punpckhwd %5, %%mm3 \n\t"
1058 "punpckhwd %5, %%mm4 \n\t"
1059 "punpckhwd %5, %%mm5 \n\t"
1060 "psllq $8, %%mm1 \n\t"
1061 "psllq $16, %%mm2 \n\t"
1062 "por %%mm1, %%mm0 \n\t"
1063 "por %%mm2, %%mm0 \n\t"
1064 "psllq $8, %%mm4 \n\t"
1065 "psllq $16, %%mm5 \n\t"
1066 "por %%mm4, %%mm3 \n\t"
1067 "por %%mm5, %%mm3 \n\t"
1068 :"=m"(*d)
1069 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
1070 :"memory");
1071 /* borrowed 32 to 24 */
1072 __asm__ volatile(
1073 "movq %%mm0, %%mm4 \n\t"
1074 "movq %%mm3, %%mm5 \n\t"
1075 "movq %%mm6, %%mm0 \n\t"
1076 "movq %%mm7, %%mm1 \n\t"
1078 "movq %%mm4, %%mm6 \n\t"
1079 "movq %%mm5, %%mm7 \n\t"
1080 "movq %%mm0, %%mm2 \n\t"
1081 "movq %%mm1, %%mm3 \n\t"
1083 STORE_BGR24_MMX
1085 :"=m"(*d)
1086 :"m"(*s)
1087 :"memory");
1088 d += 24;
1089 s += 8;
1090 }
1091 __asm__ volatile(SFENCE:::"memory");
1092 __asm__ volatile(EMMS:::"memory");
1093 #endif
1094 while (s < end) {
1095 register uint16_t bgr;
1096 bgr = *s++;
1097 *d++ = (bgr&0x1F)<<3;
1098 *d++ = (bgr&0x7E0)>>3;
1099 *d++ = (bgr&0xF800)>>8;
1100 }
1101 }

1103 /*
1104 * mm0 = 00 B3 00 B2 00 B1 00 B0
1105 * mm1 = 00 G3 00 G2 00 G1 00 G0
1106 * mm2 = 00 R3 00 R2 00 R1 00 R0
1107 * mm6 = FF FF FF FF FF FF FF FF
1108 * mm7 = 00 00 00 00 00 00 00 00
1109 */
1110 #define PACK_RGB32 \
1111 "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
1112 "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
1113 "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
1114 "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
1115 "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
1116 "movq %%mm0, %%mm3 \n\t" \
1117 "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
1118 "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
1119 MOVNTQ" %%mm0, %0 \n\t" \
1120 MOVNTQ" %%mm3, 8%0 \n\t" \
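
/* For reference, a scalar sketch of one PACK_RGB32 output pixel on a
   little-endian target, where the bytes land in memory as B,G,R,0xFF;
   purely illustrative, not used by the code below: */
static inline uint32_t RENAME(pack_rgb32_pixel)(uint8_t b, uint8_t g, uint8_t r)
{
    return (uint32_t)b | ((uint32_t)g << 8) | ((uint32_t)r << 16) | 0xFF000000u;
}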
1122 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
1123 {
1124 const uint16_t *end;
1125 #if COMPILE_TEMPLATE_MMX
1126 const uint16_t *mm_end;
1127 #endif
1128 uint8_t *d = dst;
1129 const uint16_t *s = (const uint16_t *)src;
1130 end = s + src_size/2;
1131 #if COMPILE_TEMPLATE_MMX
1132 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
1133 __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
1134 __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
1135 mm_end = end - 3;
1136 while (s < mm_end) {
1137 __asm__ volatile(
1138 PREFETCH" 32%1 \n\t"
1139 "movq %1, %%mm0 \n\t"
1140 "movq %1, %%mm1 \n\t"
1141 "movq %1, %%mm2 \n\t"
1142 "pand %2, %%mm0 \n\t"
1143 "pand %3, %%mm1 \n\t"
1144 "pand %4, %%mm2 \n\t"
1145 "psllq $3, %%mm0 \n\t"
1146 "psrlq $2, %%mm1 \n\t"
1147 "psrlq $7, %%mm2 \n\t"
1148 PACK_RGB32
1149 :"=m"(*d)
1150 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1151 :"memory");
1152 d += 16;
1153 s += 4;
1154 }
1155 __asm__ volatile(SFENCE:::"memory");
1156 __asm__ volatile(EMMS:::"memory");
1157 #endif
1158 while (s < end) {
1159 register uint16_t bgr;
1160 bgr = *s++;
1161 #if HAVE_BIGENDIAN
1162 *d++ = 255;
1163 *d++ = (bgr&0x7C00)>>7;
1164 *d++ = (bgr&0x3E0)>>2;
1165 *d++ = (bgr&0x1F)<<3;
1166 #else
1167 *d++ = (bgr&0x1F)<<3;
1168 *d++ = (bgr&0x3E0)>>2;
1169 *d++ = (bgr&0x7C00)>>7;
1170 *d++ = 255;
1171 #endif
1172 }
1173 }
1175 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
1176 {
1177 const uint16_t *end;
1178 #if COMPILE_TEMPLATE_MMX
1179 const uint16_t *mm_end;
1180 #endif
1181 uint8_t *d = dst;
1182 const uint16_t *s = (const uint16_t*)src;
1183 end = s + src_size/2;
1184 #if COMPILE_TEMPLATE_MMX
1185 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
1186 __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
1187 __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
1188 mm_end = end - 3;
1189 while (s < mm_end) {
1190 __asm__ volatile(
1191 PREFETCH" 32%1 \n\t"
1192 "movq %1, %%mm0 \n\t"
1193 "movq %1, %%mm1 \n\t"
1194 "movq %1, %%mm2 \n\t"
1195 "pand %2, %%mm0 \n\t"
1196 "pand %3, %%mm1 \n\t"
1197 "pand %4, %%mm2 \n\t"
1198 "psllq $3, %%mm0 \n\t"
1199 "psrlq $3, %%mm1 \n\t"
1200 "psrlq $8, %%mm2 \n\t"
1201 PACK_RGB32
1202 :"=m"(*d)
1203 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1204 :"memory");
1205 d += 16;
1206 s += 4;
1207 }
1208 __asm__ volatile(SFENCE:::"memory");
1209 __asm__ volatile(EMMS:::"memory");
1210 #endif
1211 while (s < end) {
1212 register uint16_t bgr;
1213 bgr = *s++;
1214 #if HAVE_BIGENDIAN
1215 *d++ = 255;
1216 *d++ = (bgr&0xF800)>>8;
1217 *d++ = (bgr&0x7E0)>>3;
1218 *d++ = (bgr&0x1F)<<3;
1219 #else
1220 *d++ = (bgr&0x1F)<<3;
1221 *d++ = (bgr&0x7E0)>>3;
1222 *d++ = (bgr&0xF800)>>8;
1223 *d++ = 255;
1224 #endif
1225 }
1226 }
1228 static inline void RENAME(shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, long src_size)
1229 {
1230 x86_reg idx = 15 - src_size;
1231 const uint8_t *s = src-idx;
1232 uint8_t *d = dst-idx;
1233 #if COMPILE_TEMPLATE_MMX
1234 __asm__ volatile(
1235 "test %0, %0 \n\t"
1236 "jns 2f \n\t"
1237 PREFETCH" (%1, %0) \n\t"
1238 "movq %3, %%mm7 \n\t"
1239 "pxor %4, %%mm7 \n\t"
1240 "movq %%mm7, %%mm6 \n\t"
1241 "pxor %5, %%mm7 \n\t"
1242 ASMALIGN(4)
1243 "1: \n\t"
1244 PREFETCH" 32(%1, %0) \n\t"
1245 "movq (%1, %0), %%mm0 \n\t"
1246 "movq 8(%1, %0), %%mm1 \n\t"
1247 # if COMPILE_TEMPLATE_MMX2
1248 "pshufw $177, %%mm0, %%mm3 \n\t"
1249 "pshufw $177, %%mm1, %%mm5 \n\t"
1250 "pand %%mm7, %%mm0 \n\t"
1251 "pand %%mm6, %%mm3 \n\t"
1252 "pand %%mm7, %%mm1 \n\t"
1253 "pand %%mm6, %%mm5 \n\t"
1254 "por %%mm3, %%mm0 \n\t"
1255 "por %%mm5, %%mm1 \n\t"
1256 # else
1257 "movq %%mm0, %%mm2 \n\t"
1258 "movq %%mm1, %%mm4 \n\t"
1259 "pand %%mm7, %%mm0 \n\t"
1260 "pand %%mm6, %%mm2 \n\t"
1261 "pand %%mm7, %%mm1 \n\t"
1262 "pand %%mm6, %%mm4 \n\t"
1263 "movq %%mm2, %%mm3 \n\t"
1264 "movq %%mm4, %%mm5 \n\t"
1265 "pslld $16, %%mm2 \n\t"
1266 "psrld $16, %%mm3 \n\t"
1267 "pslld $16, %%mm4 \n\t"
1268 "psrld $16, %%mm5 \n\t"
1269 "por %%mm2, %%mm0 \n\t"
1270 "por %%mm4, %%mm1 \n\t"
1271 "por %%mm3, %%mm0 \n\t"
1272 "por %%mm5, %%mm1 \n\t"
1273 # endif
1274 MOVNTQ" %%mm0, (%2, %0) \n\t"
1275 MOVNTQ" %%mm1, 8(%2, %0) \n\t"
1276 "add $16, %0 \n\t"
1277 "js 1b \n\t"
1278 SFENCE" \n\t"
1279 EMMS" \n\t"
1280 "2: \n\t"
1281 : "+&r"(idx)
1282 : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
1283 : "memory");
1284 #endif
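/* scalar tail/fallback: keep bytes 1 and 3 of each 32-bit pixel (g) and
   swap bytes 0 and 2, e.g. BGRA -> RGBA */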
1285 for (; idx<15; idx+=4) {
1286 register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
1287 v &= 0xff00ff;
1288 *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
1289 }
1290 }
1292 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
1293 {
1294 unsigned i;
1295 #if COMPILE_TEMPLATE_MMX
1296 x86_reg mmx_size= 23 - src_size;
1297 __asm__ volatile (
1298 "test %%"REG_a", %%"REG_a" \n\t"
1299 "jns 2f \n\t"
1300 "movq "MANGLE(mask24r)", %%mm5 \n\t"
1301 "movq "MANGLE(mask24g)", %%mm6 \n\t"
1302 "movq "MANGLE(mask24b)", %%mm7 \n\t"
1303 ASMALIGN(4)
1304 "1: \n\t"
1305 PREFETCH" 32(%1, %%"REG_a") \n\t"
1306 "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1307 "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
1308 "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
1309 "psllq $16, %%mm0 \n\t" // 00 BGR BGR
1310 "pand %%mm5, %%mm0 \n\t"
1311 "pand %%mm6, %%mm1 \n\t"
1312 "pand %%mm7, %%mm2 \n\t"
1313 "por %%mm0, %%mm1 \n\t"
1314 "por %%mm2, %%mm1 \n\t"
1315 "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1316 MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
1317 "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
1318 "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
1319 "pand %%mm7, %%mm0 \n\t"
1320 "pand %%mm5, %%mm1 \n\t"
1321 "pand %%mm6, %%mm2 \n\t"
1322 "por %%mm0, %%mm1 \n\t"
1323 "por %%mm2, %%mm1 \n\t"
1324 "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
1325 MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
1326 "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
1327 "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
1328 "pand %%mm6, %%mm0 \n\t"
1329 "pand %%mm7, %%mm1 \n\t"
1330 "pand %%mm5, %%mm2 \n\t"
1331 "por %%mm0, %%mm1 \n\t"
1332 "por %%mm2, %%mm1 \n\t"
1333 MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
1334 "add $24, %%"REG_a" \n\t"
1335 " js 1b \n\t"
1336 "2: \n\t"
1337 : "+a" (mmx_size)
1338 : "r" (src-mmx_size), "r"(dst-mmx_size)
1339 );
1341 __asm__ volatile(SFENCE:::"memory");
1342 __asm__ volatile(EMMS:::"memory");
1344 if (mmx_size==23) return; //finished, was multiple of 8
1346 src+= src_size;
1347 dst+= src_size;
1348 src_size= 23-mmx_size;
1349 src-= src_size;
1350 dst-= src_size;
1351 #endif
1352 for (i=0; i<src_size; i+=3) {
1353 register uint8_t x;
1354 x = src[i + 2];
1355 dst[i + 1] = src[i + 1];
1356 dst[i + 2] = src[i + 0];
1357 dst[i + 0] = x;
1358 }
1359 }
1361 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1362 long width, long height,
1363 long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1364 {
1365 long y;
1366 const x86_reg chromWidth= width>>1;
1367 for (y=0; y<height; y++) {
1368 #if COMPILE_TEMPLATE_MMX
1369 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1370 __asm__ volatile(
1371 "xor %%"REG_a", %%"REG_a" \n\t"
1372 ASMALIGN(4)
1373 "1: \n\t"
1374 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1375 PREFETCH" 32(%2, %%"REG_a") \n\t"
1376 PREFETCH" 32(%3, %%"REG_a") \n\t"
1377 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1378 "movq %%mm0, %%mm2 \n\t" // U(0)
1379 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1380 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1381 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1383 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1384 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1385 "movq %%mm3, %%mm4 \n\t" // Y(0)
1386 "movq %%mm5, %%mm6 \n\t" // Y(8)
1387 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1388 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1389 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1390 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1392 MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
1393 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1394 MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
1395 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1397 "add $8, %%"REG_a" \n\t"
1398 "cmp %4, %%"REG_a" \n\t"
1399 " jb 1b \n\t"
1400 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1401 : "%"REG_a
1402 );
1403 #else
1405 #if ARCH_ALPHA && HAVE_MVI
1406 #define pl2yuy2(n) \
1407 y1 = yc[n]; \
1408 y2 = yc2[n]; \
1409 u = uc[n]; \
1410 v = vc[n]; \
1411 __asm__("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
1412 __asm__("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
1413 __asm__("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
1414 __asm__("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
1415 yuv1 = (u << 8) + (v << 24); \
1416 yuv2 = yuv1 + y2; \
1417 yuv1 += y1; \
1418 qdst[n] = yuv1; \
1419 qdst2[n] = yuv2;
1421 int i;
1422 uint64_t *qdst = (uint64_t *) dst;
1423 uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1424 const uint32_t *yc = (uint32_t *) ysrc;
1425 const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1426 const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1427 for (i = 0; i < chromWidth; i += 8) {
1428 uint64_t y1, y2, yuv1, yuv2;
1429 uint64_t u, v;
1430 /* Prefetch */
1431 __asm__("ldq $31,64(%0)" :: "r"(yc));
1432 __asm__("ldq $31,64(%0)" :: "r"(yc2));
1433 __asm__("ldq $31,64(%0)" :: "r"(uc));
1434 __asm__("ldq $31,64(%0)" :: "r"(vc));
1436 pl2yuy2(0);
1437 pl2yuy2(1);
1438 pl2yuy2(2);
1439 pl2yuy2(3);
1441 yc += 4;
1442 yc2 += 4;
1443 uc += 4;
1444 vc += 4;
1445 qdst += 4;
1446 qdst2 += 4;
1447 }
1448 y++;
1449 ysrc += lumStride;
1450 dst += dstStride;
1452 #elif HAVE_FAST_64BIT
1453 int i;
1454 uint64_t *ldst = (uint64_t *) dst;
1455 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1456 for (i = 0; i < chromWidth; i += 2) {
1457 uint64_t k, l;
1458 k = yc[0] + (uc[0] << 8) +
1459 (yc[1] << 16) + (vc[0] << 24);
1460 l = yc[2] + (uc[1] << 8) +
1461 (yc[3] << 16) + (vc[1] << 24);
1462 *ldst++ = k + (l << 32);
1463 yc += 4;
1464 uc += 2;
1465 vc += 2;
1466 }
1468 #else
1469 int i, *idst = (int32_t *) dst;
1470 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1471 for (i = 0; i < chromWidth; i++) {
1472 #if HAVE_BIGENDIAN
1473 *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
1474 (yc[1] << 8) + (vc[0] << 0);
1475 #else
1476 *idst++ = yc[0] + (uc[0] << 8) +
1477 (yc[1] << 16) + (vc[0] << 24);
1478 #endif
1479 yc += 2;
1480 uc++;
1481 vc++;
1482 }
1483 #endif
1484 #endif
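/* advance the chroma pointers only once every vertLumPerChroma luma lines
   (a power of two: 2 when called for 4:2:0 (YV12) input, 1 for 4:2:2) */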
1485 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1486 usrc += chromStride;
1487 vsrc += chromStride;
1488 }
1489 ysrc += lumStride;
1490 dst += dstStride;
1491 }
1492 #if COMPILE_TEMPLATE_MMX
1493 __asm__(EMMS" \n\t"
1494 SFENCE" \n\t"
1495 :::"memory");
1496 #endif
1497 }

1499 /**
1500 * Height should be a multiple of 2 and width should be a multiple of 16.
1501 * (If this is a problem for anyone then tell me, and I will fix it.)
1502 */
1503 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1504 long width, long height,
1505 long lumStride, long chromStride, long dstStride)
1506 {
1507 //FIXME interpolate chroma
1508 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1509 }
1511 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1512 long width, long height,
1513 long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1514 {
1515 long y;
1516 const x86_reg chromWidth= width>>1;
1517 for (y=0; y<height; y++) {
1518 #if COMPILE_TEMPLATE_MMX
1519 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1520 __asm__ volatile(
1521 "xor %%"REG_a", %%"REG_a" \n\t"
1522 ASMALIGN(4)
1523 "1: \n\t"
1524 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1525 PREFETCH" 32(%2, %%"REG_a") \n\t"
1526 PREFETCH" 32(%3, %%"REG_a") \n\t"
1527 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1528 "movq %%mm0, %%mm2 \n\t" // U(0)
1529 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1530 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1531 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1533 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1534 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1535 "movq %%mm0, %%mm4 \n\t" // Y(0)
1536 "movq %%mm2, %%mm6 \n\t" // Y(8)
1537 "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
1538 "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
1539 "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
1540 "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)
1542 MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
1543 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1544 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
1545 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1547 "add $8, %%"REG_a" \n\t"
1548 "cmp %4, %%"REG_a" \n\t"
1549 " jb 1b \n\t"
1550 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1551 : "%"REG_a
1552 );
1553 #else
1554 //FIXME adapt the Alpha ASM code from yv12->yuy2
1556 #if HAVE_FAST_64BIT
1557 int i;
1558 uint64_t *ldst = (uint64_t *) dst;
1559 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1560 for (i = 0; i < chromWidth; i += 2) {
1561 uint64_t k, l;
1562 k = uc[0] + (yc[0] << 8) +
1563 (vc[0] << 16) + (yc[1] << 24);
1564 l = uc[1] + (yc[2] << 8) +
1565 (vc[1] << 16) + (yc[3] << 24);
1566 *ldst++ = k + (l << 32);
1567 yc += 4;
1568 uc += 2;
1569 vc += 2;
1570 }
1572 #else
1573 int i, *idst = (int32_t *) dst;
1574 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1575 for (i = 0; i < chromWidth; i++) {
1576 #if HAVE_BIGENDIAN
1577 *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
1578 (vc[0] << 8) + (yc[1] << 0);
1579 #else
1580 *idst++ = uc[0] + (yc[0] << 8) +
1581 (vc[0] << 16) + (yc[1] << 24);
1582 #endif
1583 yc += 2;
1584 uc++;
1585 vc++;
1586 }
1587 #endif
1588 #endif
1589 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1590 usrc += chromStride;
1591 vsrc += chromStride;
1592 }
1593 ysrc += lumStride;
1594 dst += dstStride;
1595 }
1596 #if COMPILE_TEMPLATE_MMX
1597 __asm__(EMMS" \n\t"
1598 SFENCE" \n\t"
1599 :::"memory");
1600 #endif
1601 }

1603 /**
1604 * Height should be a multiple of 2 and width should be a multiple of 16
1605 * (If this is a problem for anyone then tell me, and I will fix it.)
1606 */
1607 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1608 long width, long height,
1609 long lumStride, long chromStride, long dstStride)
1610 {
1611 //FIXME interpolate chroma
1612 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1613 }

1615 /**
1616 * Width should be a multiple of 16.
1617 */
1618 static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1619 long width, long height,
1620 long lumStride, long chromStride, long dstStride)
1621 {
1622 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1623 }

1625 /**
1626 * Width should be a multiple of 16.
1627 */
1628 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1629 long width, long height,
1630 long lumStride, long chromStride, long dstStride)
1631 {
1632 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1633 }

1635 /**
1636 * Height should be a multiple of 2 and width should be a multiple of 16.
1637 * (If this is a problem for anyone then tell me, and I will fix it.)
1638 */
1639 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1640 long width, long height,
1641 long lumStride, long chromStride, long srcStride)
1642 {
1643 long y;
1644 const x86_reg chromWidth= width>>1;
1645 for (y=0; y<height; y+=2) {
1646 #if COMPILE_TEMPLATE_MMX
1647 __asm__ volatile(
1648 "xor %%"REG_a", %%"REG_a" \n\t"
1649 "pcmpeqw %%mm7, %%mm7 \n\t"
1650 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1651 ASMALIGN(4)
1652 "1: \n\t"
1653 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1654 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1655 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1656 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1657 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1658 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1659 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1660 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1661 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1662 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1663 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1665 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
1667 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
1668 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
1669 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1670 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1671 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1672 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1673 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1674 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1675 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1676 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1678 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
1680 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1681 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1682 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1683 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1684 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1685 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1686 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1687 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1689 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1690 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1692 "add $8, %%"REG_a" \n\t"
1693 "cmp %4, %%"REG_a" \n\t"
1694 " jb 1b \n\t"
1695 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1696 : "memory", "%"REG_a
1697 );
1699 ydst += lumStride;
1700 src += srcStride;
1702 __asm__ volatile(
1703 "xor %%"REG_a", %%"REG_a" \n\t"
1704 ASMALIGN(4)
1705 "1: \n\t"
1706 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1707 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1708 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1709 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
1710 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
1711 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1712 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1713 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1714 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1715 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1716 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1718 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
1719 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1721 "add $8, %%"REG_a" \n\t"
1722 "cmp %4, %%"REG_a" \n\t"
1723 " jb 1b \n\t"
1725 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1726 : "memory", "%"REG_a
1727 );
1728 #else
1729 long i;
1730 for (i=0; i<chromWidth; i++) {
1731 ydst[2*i+0] = src[4*i+0];
1732 udst[i] = src[4*i+1];
1733 ydst[2*i+1] = src[4*i+2];
1734 vdst[i] = src[4*i+3];
1735 }
1736 ydst += lumStride;
1737 src += srcStride;
1739 for (i=0; i<chromWidth; i++) {
1740 ydst[2*i+0] = src[4*i+0];
1741 ydst[2*i+1] = src[4*i+2];
1742 }
1743 #endif
1744 udst += chromStride;
1745 vdst += chromStride;
1746 ydst += lumStride;
1747 src += srcStride;
1748 }
1749 #if COMPILE_TEMPLATE_MMX
1750 __asm__ volatile(EMMS" \n\t"
1751 SFENCE" \n\t"
1752 :::"memory");
1753 #endif
1754 }
1756 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
1757 {
1758 long x,y;
1760 dst[0]= src[0];
1762 // first line
1763 for (x=0; x<srcWidth-1; x++) {
1764 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1765 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1766 }
1767 dst[2*srcWidth-1]= src[srcWidth-1];
1769 dst+= dstStride;
1771 for (y=1; y<srcHeight; y++) {
1772 #if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
1773 const x86_reg mmxSize= srcWidth&~15;
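/* note: each chained pair of PAVGB (rounding byte average) instructions
   below computes avg(a, avg(a, b)), approximating the (3*a + b) >> 2
   weighting used by the scalar interpolation further down */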
1774 __asm__ volatile(
1775 "mov %4, %%"REG_a" \n\t"
1776 "movq "MANGLE(mmx_ff)", %%mm0 \n\t"
1777 "movq (%0, %%"REG_a"), %%mm4 \n\t"
1778 "movq %%mm4, %%mm2 \n\t"
1779 "psllq $8, %%mm4 \n\t"
1780 "pand %%mm0, %%mm2 \n\t"
1781 "por %%mm2, %%mm4 \n\t"
1782 "movq (%1, %%"REG_a"), %%mm5 \n\t"
1783 "movq %%mm5, %%mm3 \n\t"
1784 "psllq $8, %%mm5 \n\t"
1785 "pand %%mm0, %%mm3 \n\t"
1786 "por %%mm3, %%mm5 \n\t"
1787 "1: \n\t"
1788 "movq (%0, %%"REG_a"), %%mm0 \n\t"
1789 "movq (%1, %%"REG_a"), %%mm1 \n\t"
1790 "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
1791 "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
1792 PAVGB" %%mm0, %%mm5 \n\t"
1793 PAVGB" %%mm0, %%mm3 \n\t"
1794 PAVGB" %%mm0, %%mm5 \n\t"
1795 PAVGB" %%mm0, %%mm3 \n\t"
1796 PAVGB" %%mm1, %%mm4 \n\t"
1797 PAVGB" %%mm1, %%mm2 \n\t"
1798 PAVGB" %%mm1, %%mm4 \n\t"
1799 PAVGB" %%mm1, %%mm2 \n\t"
1800 "movq %%mm5, %%mm7 \n\t"
1801 "movq %%mm4, %%mm6 \n\t"
1802 "punpcklbw %%mm3, %%mm5 \n\t"
1803 "punpckhbw %%mm3, %%mm7 \n\t"
1804 "punpcklbw %%mm2, %%mm4 \n\t"
1805 "punpckhbw %%mm2, %%mm6 \n\t"
1806 #if 1
1807 MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t"
1808 MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t"
1809 MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t"
1810 MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t"
1811 #else
1812 "movq %%mm5, (%2, %%"REG_a", 2) \n\t"
1813 "movq %%mm7, 8(%2, %%"REG_a", 2) \n\t"
1814 "movq %%mm4, (%3, %%"REG_a", 2) \n\t"
1815 "movq %%mm6, 8(%3, %%"REG_a", 2) \n\t"
1816 #endif
1817 "add $8, %%"REG_a" \n\t"
1818 "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
1819 "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
1820 " js 1b \n\t"
1821 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
1822 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1823 "g" (-mmxSize)
1824 : "%"REG_a
1825 );
1826 #else
1827 const x86_reg mmxSize=1;
1829 dst[0 ]= (3*src[0] + src[srcStride])>>2;
1830 dst[dstStride]= ( src[0] + 3*src[srcStride])>>2;
1831 #endif
1833 for (x=mmxSize-1; x<srcWidth-1; x++) {
1834 dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1835 dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1836 dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1837 dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
1838 }
1839 dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
1840 dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1842 dst+=dstStride*2;
1843 src+=srcStride;
1844 }
1846 // last line
1847 #if 1
1848 dst[0]= src[0];
1850 for (x=0; x<srcWidth-1; x++) {
1851 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1852 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1853 }
1854 dst[2*srcWidth-1]= src[srcWidth-1];
1855 #else
1856 for (x=0; x<srcWidth; x++) {
1857 dst[2*x+0]=
1858 dst[2*x+1]= src[x];
1859 }
1860 #endif
1862 #if COMPILE_TEMPLATE_MMX
1863 __asm__ volatile(EMMS" \n\t"
1864 SFENCE" \n\t"
1865 :::"memory");
1866 #endif
1867 }

1869 /**
1870 * Height should be a multiple of 2 and width should be a multiple of 16.
1871 * (If this is a problem for anyone then tell me, and I will fix it.)
1872 * Chrominance data is only taken from every second line, others are ignored.
1873 * FIXME: Write HQ version.
1874 */
1875 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1876 long width, long height,
1877 long lumStride, long chromStride, long srcStride)
1878 {
1879 long y;
1880 const x86_reg chromWidth= width>>1;
1881 for (y=0; y<height; y+=2) {
1882 #if COMPILE_TEMPLATE_MMX
1883 __asm__ volatile(
1884 "xor %%"REG_a", %%"REG_a" \n\t"
1885 "pcmpeqw %%mm7, %%mm7 \n\t"
1886 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1887 ASMALIGN(4)
1888 "1: \n\t"
1889 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1890 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
1891 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
1892 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
1893 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
1894 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
1895 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
1896 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1897 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1898 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1899 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1901 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
1903 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8)
1904 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12)
1905 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
1906 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
1907 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
1908 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
1909 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1910 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1911 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1912 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1914 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
1916 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1917 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1918 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1919 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1920 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1921 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1922 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1923 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1925 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1926 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1928 "add $8, %%"REG_a" \n\t"
1929 "cmp %4, %%"REG_a" \n\t"
1930 " jb 1b \n\t"
1931 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1932 : "memory", "%"REG_a
1933 );
1935 ydst += lumStride;
1936 src += srcStride;
1938 __asm__ volatile(
1939 "xor %%"REG_a", %%"REG_a" \n\t"
1940 ASMALIGN(4)
1941 "1: \n\t"
1942 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1943 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1944 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1945 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
1946 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
1947 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1948 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1949 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1950 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1951 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1952 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1954 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
1955 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1957 "add $8, %%"REG_a" \n\t"
1958 "cmp %4, %%"REG_a" \n\t"
1959 " jb 1b \n\t"
1961 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1962 : "memory", "%"REG_a
1963 );
1964 #else
1965 long i;
1966 for (i=0; i<chromWidth; i++) {
1967 udst[i] = src[4*i+0];
1968 ydst[2*i+0] = src[4*i+1];
1969 vdst[i] = src[4*i+2];
1970 ydst[2*i+1] = src[4*i+3];
1971 }
1972 ydst += lumStride;
1973 src += srcStride;
1975 for (i=0; i<chromWidth; i++) {
1976 ydst[2*i+0] = src[4*i+1];
1977 ydst[2*i+1] = src[4*i+3];
1978 }
1979 #endif
1980 udst += chromStride;
1981 vdst += chromStride;
1982 ydst += lumStride;
1983 src += srcStride;
1984 }
1985 #if COMPILE_TEMPLATE_MMX
1986 __asm__ volatile(EMMS" \n\t"
1987 SFENCE" \n\t"
1988 :::"memory");
1989 #endif
1990 }

1992 /**
1992 /**
1993 * Height should be a multiple of 2 and width should be a multiple of 2.
1994 * (If this is a problem for anyone then tell me, and I will fix it.)
1995 * Chrominance data is only taken from every second line,
1996 * others are ignored in the C version.
1997 * FIXME: Write HQ version.
1998 */
1999 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2000 long width, long height,
2001 long lumStride, long chromStride, long srcStride)
2002 {
2003 long y;
2004 const x86_reg chromWidth= width>>1;
2005 #if COMPILE_TEMPLATE_MMX
2006 for (y=0; y<height-2; y+=2) {
2007 long i;
2008 for (i=0; i<2; i++) {
2009 __asm__ volatile(
2010 "mov %2, %%"REG_a" \n\t"
2011 "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t"
2012 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2013 "pxor %%mm7, %%mm7 \n\t"
2014 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
2015 ASMALIGN(4)
2016 "1: \n\t"
2017 PREFETCH" 64(%0, %%"REG_d") \n\t"
2018 "movd (%0, %%"REG_d"), %%mm0 \n\t"
2019 "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
2020 "punpcklbw %%mm7, %%mm0 \n\t"
2021 "punpcklbw %%mm7, %%mm1 \n\t"
2022 "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
2023 "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
2024 "punpcklbw %%mm7, %%mm2 \n\t"
2025 "punpcklbw %%mm7, %%mm3 \n\t"
2026 "pmaddwd %%mm6, %%mm0 \n\t"
2027 "pmaddwd %%mm6, %%mm1 \n\t"
2028 "pmaddwd %%mm6, %%mm2 \n\t"
2029 "pmaddwd %%mm6, %%mm3 \n\t"
2030 #ifndef FAST_BGR2YV12
2031 "psrad $8, %%mm0 \n\t"
2032 "psrad $8, %%mm1 \n\t"
2033 "psrad $8, %%mm2 \n\t"
2034 "psrad $8, %%mm3 \n\t"
2035 #endif
2036 "packssdw %%mm1, %%mm0 \n\t"
2037 "packssdw %%mm3, %%mm2 \n\t"
2038 "pmaddwd %%mm5, %%mm0 \n\t"
2039 "pmaddwd %%mm5, %%mm2 \n\t"
2040 "packssdw %%mm2, %%mm0 \n\t"
2041 "psraw $7, %%mm0 \n\t"
2043 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2044 "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
2045 "punpcklbw %%mm7, %%mm4 \n\t"
2046 "punpcklbw %%mm7, %%mm1 \n\t"
2047 "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
2048 "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
2049 "punpcklbw %%mm7, %%mm2 \n\t"
2050 "punpcklbw %%mm7, %%mm3 \n\t"
2051 "pmaddwd %%mm6, %%mm4 \n\t"
2052 "pmaddwd %%mm6, %%mm1 \n\t"
2053 "pmaddwd %%mm6, %%mm2 \n\t"
2054 "pmaddwd %%mm6, %%mm3 \n\t"
2055 #ifndef FAST_BGR2YV12
2056 "psrad $8, %%mm4 \n\t"
2057 "psrad $8, %%mm1 \n\t"
2058 "psrad $8, %%mm2 \n\t"
2059 "psrad $8, %%mm3 \n\t"
2060 #endif
2061 "packssdw %%mm1, %%mm4 \n\t"
2062 "packssdw %%mm3, %%mm2 \n\t"
2063 "pmaddwd %%mm5, %%mm4 \n\t"
2064 "pmaddwd %%mm5, %%mm2 \n\t"
2065 "add $24, %%"REG_d" \n\t"
2066 "packssdw %%mm2, %%mm4 \n\t"
2067 "psraw $7, %%mm4 \n\t"
2069 "packuswb %%mm4, %%mm0 \n\t"
2070 "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"
2072 MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
2073 "add $8, %%"REG_a" \n\t"
2074 " js 1b \n\t"
2075 : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
2076 : "%"REG_a, "%"REG_d
2077 );
2078 ydst += lumStride;
2079 src += srcStride;
2080 }
2081 src -= srcStride*2;
2082 __asm__ volatile(
2083 "mov %4, %%"REG_a" \n\t"
2084 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2085 "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t"
2086 "pxor %%mm7, %%mm7 \n\t"
2087 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
2088 "add %%"REG_d", %%"REG_d" \n\t"
2089 ASMALIGN(4)
2090 "1: \n\t"
2091 PREFETCH" 64(%0, %%"REG_d") \n\t"
2092 PREFETCH" 64(%1, %%"REG_d") \n\t"
2093 #if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
2094 "movq (%0, %%"REG_d"), %%mm0 \n\t"
2095 "movq (%1, %%"REG_d"), %%mm1 \n\t"
2096 "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
2097 "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
2098 PAVGB" %%mm1, %%mm0 \n\t"
2099 PAVGB" %%mm3, %%mm2 \n\t"
2100 "movq %%mm0, %%mm1 \n\t"
2101 "movq %%mm2, %%mm3 \n\t"
2102 "psrlq $24, %%mm0 \n\t"
2103 "psrlq $24, %%mm2 \n\t"
2104 PAVGB" %%mm1, %%mm0 \n\t"
2105 PAVGB" %%mm3, %%mm2 \n\t"
2106 "punpcklbw %%mm7, %%mm0 \n\t"
2107 "punpcklbw %%mm7, %%mm2 \n\t"
2108 #else
2109 "movd (%0, %%"REG_d"), %%mm0 \n\t"
2110 "movd (%1, %%"REG_d"), %%mm1 \n\t"
2111 "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
2112 "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
2113 "punpcklbw %%mm7, %%mm0 \n\t"
2114 "punpcklbw %%mm7, %%mm1 \n\t"
2115 "punpcklbw %%mm7, %%mm2 \n\t"
2116 "punpcklbw %%mm7, %%mm3 \n\t"
2117 "paddw %%mm1, %%mm0 \n\t"
2118 "paddw %%mm3, %%mm2 \n\t"
2119 "paddw %%mm2, %%mm0 \n\t"
2120 "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
2121 "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
2122 "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
2123 "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
2124 "punpcklbw %%mm7, %%mm4 \n\t"
2125 "punpcklbw %%mm7, %%mm1 \n\t"
2126 "punpcklbw %%mm7, %%mm2 \n\t"
2127 "punpcklbw %%mm7, %%mm3 \n\t"
2128 "paddw %%mm1, %%mm4 \n\t"
2129 "paddw %%mm3, %%mm2 \n\t"
2130 "paddw %%mm4, %%mm2 \n\t"
2131 "psrlw $2, %%mm0 \n\t"
2132 "psrlw $2, %%mm2 \n\t"
2133 #endif
2134 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
2135 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
2137 "pmaddwd %%mm0, %%mm1 \n\t"
2138 "pmaddwd %%mm2, %%mm3 \n\t"
2139 "pmaddwd %%mm6, %%mm0 \n\t"
2140 "pmaddwd %%mm6, %%mm2 \n\t"
2141 #ifndef FAST_BGR2YV12
2142 "psrad $8, %%mm0 \n\t"
2143 "psrad $8, %%mm1 \n\t"
2144 "psrad $8, %%mm2 \n\t"
2145 "psrad $8, %%mm3 \n\t"
2146 #endif
2147 "packssdw %%mm2, %%mm0 \n\t"
2148 "packssdw %%mm3, %%mm1 \n\t"
2149 "pmaddwd %%mm5, %%mm0 \n\t"
2150 "pmaddwd %%mm5, %%mm1 \n\t"
2151 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2152 "psraw $7, %%mm0 \n\t"
2154 #if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
2155 "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
2156 "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
2157 "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
2158 "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
2159 PAVGB" %%mm1, %%mm4 \n\t"
2160 PAVGB" %%mm3, %%mm2 \n\t"
2161 "movq %%mm4, %%mm1 \n\t"
2162 "movq %%mm2, %%mm3 \n\t"
2163 "psrlq $24, %%mm4 \n\t"
2164 "psrlq $24, %%mm2 \n\t"
2165 PAVGB" %%mm1, %%mm4 \n\t"
2166 PAVGB" %%mm3, %%mm2 \n\t"
2167 "punpcklbw %%mm7, %%mm4 \n\t"
2168 "punpcklbw %%mm7, %%mm2 \n\t"
2169 #else
2170 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2171 "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
2172 "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
2173 "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
2174 "punpcklbw %%mm7, %%mm4 \n\t"
2175 "punpcklbw %%mm7, %%mm1 \n\t"
2176 "punpcklbw %%mm7, %%mm2 \n\t"
2177 "punpcklbw %%mm7, %%mm3 \n\t"
2178 "paddw %%mm1, %%mm4 \n\t"
2179 "paddw %%mm3, %%mm2 \n\t"
2180 "paddw %%mm2, %%mm4 \n\t"
2181 "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
2182 "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
2183 "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
2184 "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
2185 "punpcklbw %%mm7, %%mm5 \n\t"
2186 "punpcklbw %%mm7, %%mm1 \n\t"
2187 "punpcklbw %%mm7, %%mm2 \n\t"
2188 "punpcklbw %%mm7, %%mm3 \n\t"
2189 "paddw %%mm1, %%mm5 \n\t"
2190 "paddw %%mm3, %%mm2 \n\t"
2191 "paddw %%mm5, %%mm2 \n\t"
2192 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2193 "psrlw $2, %%mm4 \n\t"
2194 "psrlw $2, %%mm2 \n\t"
2195 #endif
2196 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
2197 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
2199 "pmaddwd %%mm4, %%mm1 \n\t"
2200 "pmaddwd %%mm2, %%mm3 \n\t"
2201 "pmaddwd %%mm6, %%mm4 \n\t"
2202 "pmaddwd %%mm6, %%mm2 \n\t"
2203 #ifndef FAST_BGR2YV12
2204 "psrad $8, %%mm4 \n\t"
2205 "psrad $8, %%mm1 \n\t"
2206 "psrad $8, %%mm2 \n\t"
2207 "psrad $8, %%mm3 \n\t"
2208 #endif
2209 "packssdw %%mm2, %%mm4 \n\t"
2210 "packssdw %%mm3, %%mm1 \n\t"
2211 "pmaddwd %%mm5, %%mm4 \n\t"
2212 "pmaddwd %%mm5, %%mm1 \n\t"
2213 "add $24, %%"REG_d" \n\t"
2214 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2215 "psraw $7, %%mm4 \n\t"
2217 "movq %%mm0, %%mm1 \n\t"
2218 "punpckldq %%mm4, %%mm0 \n\t"
2219 "punpckhdq %%mm4, %%mm1 \n\t"
2220 "packsswb %%mm1, %%mm0 \n\t"
2221 "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
2222 "movd %%mm0, (%2, %%"REG_a") \n\t"
2223 "punpckhdq %%mm0, %%mm0 \n\t"
2224 "movd %%mm0, (%3, %%"REG_a") \n\t"
2225 "add $4, %%"REG_a" \n\t"
2226 " js 1b \n\t"
2227 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2228 : "%"REG_a, "%"REG_d
2229 );
2231 udst += chromStride;
2232 vdst += chromStride;
2233 src += srcStride*2;
2234 }
2236 __asm__ volatile(EMMS" \n\t"
2237 SFENCE" \n\t"
2238 :::"memory");
2239 #else
2240 y=0;
2241 #endif
2242 for (; y<height; y+=2) {
2243 long i;
2244 for (i=0; i<chromWidth; i++) {
2245 unsigned int b = src[6*i+0];
2246 unsigned int g = src[6*i+1];
2247 unsigned int r = src[6*i+2];
2249 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2250 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2251 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2253 udst[i] = U;
2254 vdst[i] = V;
2255 ydst[2*i] = Y;
2257 b = src[6*i+3];
2258 g = src[6*i+4];
2259 r = src[6*i+5];
2261 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2262 ydst[2*i+1] = Y;
2263 }
2264 ydst += lumStride;
2265 src += srcStride;
2267 for (i=0; i<chromWidth; i++) {
2268 unsigned int b = src[6*i+0];
2269 unsigned int g = src[6*i+1];
2270 unsigned int r = src[6*i+2];
2272 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2274 ydst[2*i] = Y;
2276 b = src[6*i+3];
2277 g = src[6*i+4];
2278 r = src[6*i+5];
2280 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2281 ydst[2*i+1] = Y;
2282 }
2283 udst += chromStride;
2284 vdst += chromStride;
2285 ydst += lumStride;
2286 src += srcStride;
2287 }
2288 }
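/*
 * The C fallback above computes Y/U/V in fixed point with RGB2YUV_SHIFT
 * fractional bits. As a worked example, assuming the usual BT.601
 * studio-swing coefficients (RY ~= 0.257, GY ~= 0.504, BY ~= 0.098 before
 * scaling by 1<<RGB2YUV_SHIFT), white (r=g=b=255) gives
 * (0.257+0.504+0.098)*255 + 16 ~= 235, the nominal luma maximum. Sketch:
 */
#if 0
static unsigned int rgb_to_y_ref(unsigned int r, unsigned int g, unsigned int b)
{
    /* same expression as the scalar loops above */
    return ((RY*r + GY*g + BY*b) >> RGB2YUV_SHIFT) + 16;
}
#endif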
2290 static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dest,
2291 long width, long height, long src1Stride,
2292 long src2Stride, long dstStride)
2293 {
2294 long h;
2296 for (h=0; h < height; h++) {
2297 long w;
2299 #if COMPILE_TEMPLATE_MMX
2300 #if COMPILE_TEMPLATE_SSE2
2301 __asm__(
2302 "xor %%"REG_a", %%"REG_a" \n\t"
2303 "1: \n\t"
2304 PREFETCH" 64(%1, %%"REG_a") \n\t"
2305 PREFETCH" 64(%2, %%"REG_a") \n\t"
2306 "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
2307 "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
2308 "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
2309 "punpcklbw %%xmm2, %%xmm0 \n\t"
2310 "punpckhbw %%xmm2, %%xmm1 \n\t"
2311 "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t"
2312 "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t"
2313 "add $16, %%"REG_a" \n\t"
2314 "cmp %3, %%"REG_a" \n\t"
2315 " jb 1b \n\t"
2316 ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
2317 : "memory", "%"REG_a""
2318 );
2319 #else
2320 __asm__(
2321 "xor %%"REG_a", %%"REG_a" \n\t"
2322 "1: \n\t"
2323 PREFETCH" 64(%1, %%"REG_a") \n\t"
2324 PREFETCH" 64(%2, %%"REG_a") \n\t"
2325 "movq (%1, %%"REG_a"), %%mm0 \n\t"
2326 "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
2327 "movq %%mm0, %%mm1 \n\t"
2328 "movq %%mm2, %%mm3 \n\t"
2329 "movq (%2, %%"REG_a"), %%mm4 \n\t"
2330 "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
2331 "punpcklbw %%mm4, %%mm0 \n\t"
2332 "punpckhbw %%mm4, %%mm1 \n\t"
2333 "punpcklbw %%mm5, %%mm2 \n\t"
2334 "punpckhbw %%mm5, %%mm3 \n\t"
2335 MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t"
2336 MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t"
2337 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t"
2338 MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t"
2339 "add $16, %%"REG_a" \n\t"
2340 "cmp %3, %%"REG_a" \n\t"
2341 " jb 1b \n\t"
2342 ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
2343 : "memory", "%"REG_a
2344 );
2345 #endif
2346 for (w= (width&(~15)); w < width; w++) {
2347 dest[2*w+0] = src1[w];
2348 dest[2*w+1] = src2[w];
2349 }
2350 #else
2351 for (w=0; w < width; w++) {
2352 dest[2*w+0] = src1[w];
2353 dest[2*w+1] = src2[w];
2354 }
2355 #endif
2356 dest += dstStride;
2357 src1 += src1Stride;
2358 src2 += src2Stride;
2359 }
2360 #if COMPILE_TEMPLATE_MMX
2361 __asm__(
2362 EMMS" \n\t"
2363 SFENCE" \n\t"
2364 ::: "memory"
2365 );
2366 #endif
2367 }
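/*
 * interleaveBytes zips two planes into one buffer: the SIMD paths above handle
 * 16 input bytes per iteration and the scalar loop mops up the last width&15
 * bytes (hence the "width-15" loop bound). One output row, as a sketch:
 */
#if 0
static void interleave_row_ref(const uint8_t *src1, const uint8_t *src2,
                               uint8_t *dest, long width)
{
    long w;
    for (w = 0; w < width; w++) {
        dest[2*w+0] = src1[w];  /* even bytes from the first plane */
        dest[2*w+1] = src2[w];  /* odd bytes from the second plane */
    }
}
#endif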
2369 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2370 uint8_t *dst1, uint8_t *dst2,
2371 long width, long height,
2372 long srcStride1, long srcStride2,
2373 long dstStride1, long dstStride2)
2374 {
2375 x86_reg y;
2376 long x,w,h;
2377 w=width/2; h=height/2;
2378 #if COMPILE_TEMPLATE_MMX
2379 __asm__ volatile(
2380 PREFETCH" %0 \n\t"
2381 PREFETCH" %1 \n\t"
2382 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
2383 #endif
2384 for (y=0;y<h;y++) {
2385 const uint8_t* s1=src1+srcStride1*(y>>1);
2386 uint8_t* d=dst1+dstStride1*y;
2387 x=0;
2388 #if COMPILE_TEMPLATE_MMX
2389 for (;x<w-31;x+=32) {
2390 __asm__ volatile(
2391 PREFETCH" 32%1 \n\t"
2392 "movq %1, %%mm0 \n\t"
2393 "movq 8%1, %%mm2 \n\t"
2394 "movq 16%1, %%mm4 \n\t"
2395 "movq 24%1, %%mm6 \n\t"
2396 "movq %%mm0, %%mm1 \n\t"
2397 "movq %%mm2, %%mm3 \n\t"
2398 "movq %%mm4, %%mm5 \n\t"
2399 "movq %%mm6, %%mm7 \n\t"
2400 "punpcklbw %%mm0, %%mm0 \n\t"
2401 "punpckhbw %%mm1, %%mm1 \n\t"
2402 "punpcklbw %%mm2, %%mm2 \n\t"
2403 "punpckhbw %%mm3, %%mm3 \n\t"
2404 "punpcklbw %%mm4, %%mm4 \n\t"
2405 "punpckhbw %%mm5, %%mm5 \n\t"
2406 "punpcklbw %%mm6, %%mm6 \n\t"
2407 "punpckhbw %%mm7, %%mm7 \n\t"
2408 MOVNTQ" %%mm0, %0 \n\t"
2409 MOVNTQ" %%mm1, 8%0 \n\t"
2410 MOVNTQ" %%mm2, 16%0 \n\t"
2411 MOVNTQ" %%mm3, 24%0 \n\t"
2412 MOVNTQ" %%mm4, 32%0 \n\t"
2413 MOVNTQ" %%mm5, 40%0 \n\t"
2414 MOVNTQ" %%mm6, 48%0 \n\t"
2415 MOVNTQ" %%mm7, 56%0"
2416 :"=m"(d[2*x])
2417 :"m"(s1[x])
2418 :"memory");
2419 }
2420 #endif
2421 for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
2422 }
2423 for (y=0;y<h;y++) {
2424 const uint8_t* s2=src2+srcStride2*(y>>1);
2425 uint8_t* d=dst2+dstStride2*y;
2426 x=0;
2427 #if COMPILE_TEMPLATE_MMX
2428 for (;x<w-31;x+=32) {
2429 __asm__ volatile(
2430 PREFETCH" 32%1 \n\t"
2431 "movq %1, %%mm0 \n\t"
2432 "movq 8%1, %%mm2 \n\t"
2433 "movq 16%1, %%mm4 \n\t"
2434 "movq 24%1, %%mm6 \n\t"
2435 "movq %%mm0, %%mm1 \n\t"
2436 "movq %%mm2, %%mm3 \n\t"
2437 "movq %%mm4, %%mm5 \n\t"
2438 "movq %%mm6, %%mm7 \n\t"
2439 "punpcklbw %%mm0, %%mm0 \n\t"
2440 "punpckhbw %%mm1, %%mm1 \n\t"
2441 "punpcklbw %%mm2, %%mm2 \n\t"
2442 "punpckhbw %%mm3, %%mm3 \n\t"
2443 "punpcklbw %%mm4, %%mm4 \n\t"
2444 "punpckhbw %%mm5, %%mm5 \n\t"
2445 "punpcklbw %%mm6, %%mm6 \n\t"
2446 "punpckhbw %%mm7, %%mm7 \n\t"
2447 MOVNTQ" %%mm0, %0 \n\t"
2448 MOVNTQ" %%mm1, 8%0 \n\t"
2449 MOVNTQ" %%mm2, 16%0 \n\t"
2450 MOVNTQ" %%mm3, 24%0 \n\t"
2451 MOVNTQ" %%mm4, 32%0 \n\t"
2452 MOVNTQ" %%mm5, 40%0 \n\t"
2453 MOVNTQ" %%mm6, 48%0 \n\t"
2454 MOVNTQ" %%mm7, 56%0"
2455 :"=m"(d[2*x])
2456 :"m"(s2[x])
2457 :"memory");
2458 }
2459 #endif
2460 for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2461 }
2462 #if COMPILE_TEMPLATE_MMX
2463 __asm__(
2464 EMMS" \n\t"
2465 SFENCE" \n\t"
2466 ::: "memory"
2467 );
2468 #endif
2469 }
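/*
 * vu9_to_vu12 upsamples chroma 2x in both directions by plain replication:
 * punpcklbw/punpckhbw of a register with itself doubles every byte, and the
 * srcStride*(y>>1) indexing reuses each source line for two output lines.
 * Scalar equivalent of one line (sketch only):
 */
#if 0
static void double_line_ref(const uint8_t *s, uint8_t *d, long w)
{
    long x;
    for (x = 0; x < w; x++)
        d[2*x] = d[2*x+1] = s[x];   /* nearest-neighbour doubling */
}
#endif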
2471 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2472 uint8_t *dst,
2473 long width, long height,
2474 long srcStride1, long srcStride2,
2475 long srcStride3, long dstStride)
2476 {
2477 x86_reg x;
2478 long y,w,h;
2479 w=width/2; h=height;
2480 for (y=0;y<h;y++) {
2481 const uint8_t* yp=src1+srcStride1*y;
2482 const uint8_t* up=src2+srcStride2*(y>>2);
2483 const uint8_t* vp=src3+srcStride3*(y>>2);
2484 uint8_t* d=dst+dstStride*y;
2485 x=0;
2486 #if COMPILE_TEMPLATE_MMX
2487 for (;x<w-7;x+=8) {
2488 __asm__ volatile(
2489 PREFETCH" 32(%1, %0) \n\t"
2490 PREFETCH" 32(%2, %0) \n\t"
2491 PREFETCH" 32(%3, %0) \n\t"
2492 "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2493 "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
2494 "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
2495 "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2496 "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
2497 "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
2498 "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
2499 "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
2500 "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
2501 "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */
2503 "movq %%mm1, %%mm6 \n\t"
2504 "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
2505 "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2506 "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2507 MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
2508 MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"
2510 "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
2511 "movq 8(%1, %0, 4), %%mm0 \n\t"
2512 "movq %%mm0, %%mm3 \n\t"
2513 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
2514 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
2515 MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
2516 MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"
2518 "movq %%mm4, %%mm6 \n\t"
2519 "movq 16(%1, %0, 4), %%mm0 \n\t"
2520 "movq %%mm0, %%mm3 \n\t"
2521 "punpcklbw %%mm5, %%mm4 \n\t"
2522 "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
2523 "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
2524 MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
2525 MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"
2527 "punpckhbw %%mm5, %%mm6 \n\t"
2528 "movq 24(%1, %0, 4), %%mm0 \n\t"
2529 "movq %%mm0, %%mm3 \n\t"
2530 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
2531 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
2532 MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
2533 MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"
2535 : "+r" (x)
2536 : "r"(yp), "r" (up), "r"(vp), "r"(d)
2537 :"memory");
2538 }
2539 #endif
2540 for (; x<w; x++) {
2541 const long x2 = x<<2;
2542 d[8*x+0] = yp[x2];
2543 d[8*x+1] = up[x];
2544 d[8*x+2] = yp[x2+1];
2545 d[8*x+3] = vp[x];
2546 d[8*x+4] = yp[x2+2];
2547 d[8*x+5] = up[x];
2548 d[8*x+6] = yp[x2+3];
2549 d[8*x+7] = vp[x];
2550 }
2551 }
2552 #if COMPILE_TEMPLATE_MMX
2553 __asm__(
2554 EMMS" \n\t"
2555 SFENCE" \n\t"
2556 ::: "memory"
2557 );
2558 #endif
2559 }
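/*
 * YVU9 stores one chroma sample per 4x4 luma block, so up[]/vp[] advance a
 * quarter as fast as yp[]: srcStride*(y>>2) vertically and one chroma index
 * per four luma samples horizontally. Rough shape of the packed YUY2 output
 * (sketch, assuming width divisible by 4):
 */
#if 0
static void pack_yuy2_ref(const uint8_t *yp, const uint8_t *up,
                          const uint8_t *vp, uint8_t *d, long pairs)
{
    long x;
    for (x = 0; x < pairs; x++) {   /* one iteration = 2 output pixels */
        d[4*x+0] = yp[2*x+0];
        d[4*x+1] = up[x>>1];        /* 4:1 horizontal chroma */
        d[4*x+2] = yp[2*x+1];
        d[4*x+3] = vp[x>>1];
    }
}
#endif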
2561 static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
2562 {
2563 dst += count;
2564 src += 2*count;
2565 count= - count;
2567 #if COMPILE_TEMPLATE_MMX
2568 if(count <= -16) {
2569 count += 15;
2570 __asm__ volatile(
2571 "pcmpeqw %%mm7, %%mm7 \n\t"
2572 "psrlw $8, %%mm7 \n\t"
2573 "1: \n\t"
2574 "movq -30(%1, %0, 2), %%mm0 \n\t"
2575 "movq -22(%1, %0, 2), %%mm1 \n\t"
2576 "movq -14(%1, %0, 2), %%mm2 \n\t"
2577 "movq -6(%1, %0, 2), %%mm3 \n\t"
2578 "pand %%mm7, %%mm0 \n\t"
2579 "pand %%mm7, %%mm1 \n\t"
2580 "pand %%mm7, %%mm2 \n\t"
2581 "pand %%mm7, %%mm3 \n\t"
2582 "packuswb %%mm1, %%mm0 \n\t"
2583 "packuswb %%mm3, %%mm2 \n\t"
2584 MOVNTQ" %%mm0,-15(%2, %0) \n\t"
2585 MOVNTQ" %%mm2,- 7(%2, %0) \n\t"
2586 "add $16, %0 \n\t"
2587 " js 1b \n\t"
2588 : "+r"(count)
2589 : "r"(src), "r"(dst)
2590 );
2591 count -= 15;
2592 }
2593 #endif
2594 while(count<0) {
2595 dst[count]= src[2*count];
2596 count++;
2597 }
2598 }
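/*
 * Note the indexing idiom shared by the extract_* helpers (above and below):
 * the pointers are advanced to the end of the row and count runs from -width
 * up to 0, so both the asm ("js 1b") and the C tail only need a sign test.
 * Restated as a sketch:
 */
#if 0
static void extract_even_ref(const uint8_t *src, uint8_t *dst, long count)
{
    dst += count;               /* point one past the last output byte */
    src += 2*count;
    count = -count;             /* negative indices walk back from the end */
    while (count < 0) {
        dst[count] = src[2*count];
        count++;
    }
}
#endif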
2600 static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2601 {
2602 dst0+= count;
2603 dst1+= count;
2604 src += 4*count;
2605 count= - count;
2606 #if COMPILE_TEMPLATE_MMX
2607 if(count <= -8) {
2608 count += 7;
2609 __asm__ volatile(
2610 "pcmpeqw %%mm7, %%mm7 \n\t"
2611 "psrlw $8, %%mm7 \n\t"
2612 "1: \n\t"
2613 "movq -28(%1, %0, 4), %%mm0 \n\t"
2614 "movq -20(%1, %0, 4), %%mm1 \n\t"
2615 "movq -12(%1, %0, 4), %%mm2 \n\t"
2616 "movq -4(%1, %0, 4), %%mm3 \n\t"
2617 "pand %%mm7, %%mm0 \n\t"
2618 "pand %%mm7, %%mm1 \n\t"
2619 "pand %%mm7, %%mm2 \n\t"
2620 "pand %%mm7, %%mm3 \n\t"
2621 "packuswb %%mm1, %%mm0 \n\t"
2622 "packuswb %%mm3, %%mm2 \n\t"
2623 "movq %%mm0, %%mm1 \n\t"
2624 "movq %%mm2, %%mm3 \n\t"
2625 "psrlw $8, %%mm0 \n\t"
2626 "psrlw $8, %%mm2 \n\t"
2627 "pand %%mm7, %%mm1 \n\t"
2628 "pand %%mm7, %%mm3 \n\t"
2629 "packuswb %%mm2, %%mm0 \n\t"
2630 "packuswb %%mm3, %%mm1 \n\t"
2631 MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
2632 MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
2633 "add $8, %0 \n\t"
2634 " js 1b \n\t"
2635 : "+r"(count)
2636 : "r"(src), "r"(dst0), "r"(dst1)
2637 );
2638 count -= 7;
2639 }
2640 #endif
2641 while(count<0) {
2642 dst0[count]= src[4*count+0];
2643 dst1[count]= src[4*count+2];
2644 count++;
2645 }
2646 }
2648 static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2649 {
2650 dst0 += count;
2651 dst1 += count;
2652 src0 += 4*count;
2653 src1 += 4*count;
2654 count= - count;
2655 #ifdef PAVGB
2656 if(count <= -8) {
2657 count += 7;
2658 __asm__ volatile(
2659 "pcmpeqw %%mm7, %%mm7 \n\t"
2660 "psrlw $8, %%mm7 \n\t"
2661 "1: \n\t"
2662 "movq -28(%1, %0, 4), %%mm0 \n\t"
2663 "movq -20(%1, %0, 4), %%mm1 \n\t"
2664 "movq -12(%1, %0, 4), %%mm2 \n\t"
2665 "movq -4(%1, %0, 4), %%mm3 \n\t"
2666 PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
2667 PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
2668 PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2669 PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
2670 "pand %%mm7, %%mm0 \n\t"
2671 "pand %%mm7, %%mm1 \n\t"
2672 "pand %%mm7, %%mm2 \n\t"
2673 "pand %%mm7, %%mm3 \n\t"
2674 "packuswb %%mm1, %%mm0 \n\t"
2675 "packuswb %%mm3, %%mm2 \n\t"
2676 "movq %%mm0, %%mm1 \n\t"
2677 "movq %%mm2, %%mm3 \n\t"
2678 "psrlw $8, %%mm0 \n\t"
2679 "psrlw $8, %%mm2 \n\t"
2680 "pand %%mm7, %%mm1 \n\t"
2681 "pand %%mm7, %%mm3 \n\t"
2682 "packuswb %%mm2, %%mm0 \n\t"
2683 "packuswb %%mm3, %%mm1 \n\t"
2684 MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2685 MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2686 "add $8, %0 \n\t"
2687 " js 1b \n\t"
2688 : "+r"(count)
2689 : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
2690 );
2691 count -= 7;
2692 }
2693 #endif
2694 while(count<0) {
2695 dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2696 dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
2697 count++;
2698 }
2699 }
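/*
 * One subtlety in the *avg helpers: PAVGB/PAVGUSB round up, (a+b+1)>>1, while
 * the C fallback truncates, (a+b)>>1, so the SIMD and C paths may differ by
 * one LSB. Both variants, for comparison (sketch):
 */
#if 0
static inline int avg_pavgb(int a, int b) { return (a + b + 1) >> 1; } /* MMX2/3DNow */
static inline int avg_c    (int a, int b) { return (a + b)     >> 1; } /* C fallback */
#endif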
2701 static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2702 {
2703 dst0+= count;
2704 dst1+= count;
2705 src += 4*count;
2706 count= - count;
2707 #if COMPILE_TEMPLATE_MMX
2708 if(count <= -8) {
2709 count += 7;
2710 __asm__ volatile(
2711 "pcmpeqw %%mm7, %%mm7 \n\t"
2712 "psrlw $8, %%mm7 \n\t"
2713 "1: \n\t"
2714 "movq -28(%1, %0, 4), %%mm0 \n\t"
2715 "movq -20(%1, %0, 4), %%mm1 \n\t"
2716 "movq -12(%1, %0, 4), %%mm2 \n\t"
2717 "movq -4(%1, %0, 4), %%mm3 \n\t"
2718 "psrlw $8, %%mm0 \n\t"
2719 "psrlw $8, %%mm1 \n\t"
2720 "psrlw $8, %%mm2 \n\t"
2721 "psrlw $8, %%mm3 \n\t"
2722 "packuswb %%mm1, %%mm0 \n\t"
2723 "packuswb %%mm3, %%mm2 \n\t"
2724 "movq %%mm0, %%mm1 \n\t"
2725 "movq %%mm2, %%mm3 \n\t"
2726 "psrlw $8, %%mm0 \n\t"
2727 "psrlw $8, %%mm2 \n\t"
2728 "pand %%mm7, %%mm1 \n\t"
2729 "pand %%mm7, %%mm3 \n\t"
2730 "packuswb %%mm2, %%mm0 \n\t"
2731 "packuswb %%mm3, %%mm1 \n\t"
2732 MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
2733 MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
2734 "add $8, %0 \n\t"
2735 " js 1b \n\t"
2736 : "+r"(count)
2737 : "r"(src), "r"(dst0), "r"(dst1)
2738 );
2739 count -= 7;
2740 }
2741 #endif
2742 src++;
2743 while(count<0) {
2744 dst0[count]= src[4*count+0];
2745 dst1[count]= src[4*count+2];
2746 count++;
2747 }
2748 }
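/*
 * extract_odd2 wants the high byte of every 16-bit pair: the asm gets it with
 * "psrlw $8" before packing, and the scalar tail gets the same effect from the
 * src++ above. Equivalent formulation (sketch):
 */
#if 0
static void extract_odd2_ref(const uint8_t *src, uint8_t *dst0, uint8_t *dst1,
                             long n)
{
    long i;
    for (i = 0; i < n; i++) {
        dst0[i] = src[4*i+1];   /* == (src+1)[4*i+0] */
        dst1[i] = src[4*i+3];   /* == (src+1)[4*i+2] */
    }
}
#endif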
2750 static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2751 {
2752 dst0 += count;
2753 dst1 += count;
2754 src0 += 4*count;
2755 src1 += 4*count;
2756 count= - count;
2757 #ifdef PAVGB
2758 if(count <= -8) {
2759 count += 7;
2760 __asm__ volatile(
2761 "pcmpeqw %%mm7, %%mm7 \n\t"
2762 "psrlw $8, %%mm7 \n\t"
2763 "1: \n\t"
2764 "movq -28(%1, %0, 4), %%mm0 \n\t"
2765 "movq -20(%1, %0, 4), %%mm1 \n\t"
2766 "movq -12(%1, %0, 4), %%mm2 \n\t"
2767 "movq -4(%1, %0, 4), %%mm3 \n\t"
2768 PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
2769 PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
2770 PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2771 PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
2772 "psrlw $8, %%mm0 \n\t"
2773 "psrlw $8, %%mm1 \n\t"
2774 "psrlw $8, %%mm2 \n\t"
2775 "psrlw $8, %%mm3 \n\t"
2776 "packuswb %%mm1, %%mm0 \n\t"
2777 "packuswb %%mm3, %%mm2 \n\t"
2778 "movq %%mm0, %%mm1 \n\t"
2779 "movq %%mm2, %%mm3 \n\t"
2780 "psrlw $8, %%mm0 \n\t"
2781 "psrlw $8, %%mm2 \n\t"
2782 "pand %%mm7, %%mm1 \n\t"
2783 "pand %%mm7, %%mm3 \n\t"
2784 "packuswb %%mm2, %%mm0 \n\t"
2785 "packuswb %%mm3, %%mm1 \n\t"
2786 MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2787 MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2788 "add $8, %0 \n\t"
2789 " js 1b \n\t"
2790 : "+r"(count)
2791 : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
2792 );
2793 count -= 7;
2794 }
2795 #endif
2796 src0++;
2797 src1++;
2798 while(count<0) {
2799 dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2800 dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
2801 count++;
2802 }
2803 }
2805 static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2806 long width, long height,
2807 long lumStride, long chromStride, long srcStride)
2808 {
2809 long y;
2810 const long chromWidth= -((-width)>>1);
2812 for (y=0; y<height; y++) {
2813 RENAME(extract_even)(src, ydst, width);
2814 if(y&1) {
2815 RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth);
2816 udst+= chromStride;
2817 vdst+= chromStride;
2818 }
2820 src += srcStride;
2821 ydst+= lumStride;
2822 }
2823 #if COMPILE_TEMPLATE_MMX
2824 __asm__(
2825 EMMS" \n\t"
2826 SFENCE" \n\t"
2827 ::: "memory"
2828 );
2829 #endif
2830 }
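/*
 * Going from 4:2:2 to 4:2:0 halves the vertical chroma resolution, so the
 * chroma of two adjacent source lines must be merged: extract_odd2avg runs
 * only on odd lines and averages the current line with the previous one
 * (src-srcStride). Rough per-frame shape, with hypothetical copy_luma()/
 * avg_chroma() helpers standing in for the calls above (sketch, even height
 * assumed):
 */
#if 0
static void yuy2_to_420_shape(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
                              uint8_t *vdst, long width, long height,
                              long lumStride, long chromStride, long srcStride)
{
    long y, chromWidth = width >> 1;
    for (y = 0; y < height; y++) {
        copy_luma(src, ydst, width);                   /* every line */
        if (y & 1) {                                   /* once per line pair */
            avg_chroma(src - srcStride, src, udst, vdst, chromWidth);
            udst += chromStride;
            vdst += chromStride;
        }
        src  += srcStride;
        ydst += lumStride;
    }
}
#endif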
2832 static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2833 long width, long height,
2834 long lumStride, long chromStride, long srcStride)
2835 {
2836 long y;
2837 const long chromWidth= -((-width)>>1);
2839 for (y=0; y<height; y++) {
2840 RENAME(extract_even)(src, ydst, width);
2841 RENAME(extract_odd2)(src, udst, vdst, chromWidth);
2843 src += srcStride;
2844 ydst+= lumStride;
2845 udst+= chromStride;
2846 vdst+= chromStride;
2847 }
2848 #if COMPILE_TEMPLATE_MMX
2849 __asm__(
2850 EMMS" \n\t"
2851 SFENCE" \n\t"
2852 ::: "memory"
2853 );
2854 #endif
2855 }
2857 static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2858 long width, long height,
2859 long lumStride, long chromStride, long srcStride)
2860 {
2861 long y;
2862 const long chromWidth= -((-width)>>1);
2864 for (y=0; y<height; y++) {
2865 RENAME(extract_even)(src+1, ydst, width);
2866 if(y&1) {
2867 RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
2868 udst+= chromStride;
2869 vdst+= chromStride;
2870 }
2872 src += srcStride;
2873 ydst+= lumStride;
2874 }
2875 #if COMPILE_TEMPLATE_MMX
2876 __asm__(
2877 EMMS" \n\t"
2878 SFENCE" \n\t"
2879 ::: "memory"
2880 );
2881 #endif
2882 }
2884 static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2885 long width, long height,
2886 long lumStride, long chromStride, long srcStride)
2887 {
2888 long y;
2889 const long chromWidth= -((-width)>>1);
2891 for (y=0; y<height; y++) {
2892 RENAME(extract_even)(src+1, ydst, width);
2893 RENAME(extract_even2)(src, udst, vdst, chromWidth);
2895 src += srcStride;
2896 ydst+= lumStride;
2897 udst+= chromStride;
2898 vdst+= chromStride;
2899 }
2900 #if COMPILE_TEMPLATE_MMX
2901 __asm__(
2902 EMMS" \n\t"
2903 SFENCE" \n\t"
2904 ::: "memory"
2905 );
2906 #endif
2907 }
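/*
 * The only difference between the yuyv* and uyvy* entry points above is the
 * byte order of the packed source; in UYVY luma sits at odd offsets (hence
 * extract_even(src+1, ...)), in YUYV at even offsets:
 *
 *   YUYV: Y0 U0 Y1 V0 | Y2 U1 Y3 V1
 *   UYVY: U0 Y0 V0 Y1 | U1 Y2 V1 Y3
 */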
2909 static inline void RENAME(rgb2rgb_init)(void)
2910 {
2911 rgb15to16 = RENAME(rgb15to16);
2912 rgb15tobgr24 = RENAME(rgb15tobgr24);
2913 rgb15to32 = RENAME(rgb15to32);
2914 rgb16tobgr24 = RENAME(rgb16tobgr24);
2915 rgb16to32 = RENAME(rgb16to32);
2916 rgb16to15 = RENAME(rgb16to15);
2917 rgb24tobgr16 = RENAME(rgb24tobgr16);
2918 rgb24tobgr15 = RENAME(rgb24tobgr15);
2919 rgb24tobgr32 = RENAME(rgb24tobgr32);
2920 rgb32to16 = RENAME(rgb32to16);
2921 rgb32to15 = RENAME(rgb32to15);
2922 rgb32tobgr24 = RENAME(rgb32tobgr24);
2923 rgb24to15 = RENAME(rgb24to15);
2924 rgb24to16 = RENAME(rgb24to16);
2925 rgb24tobgr24 = RENAME(rgb24tobgr24);
2926 shuffle_bytes_2103 = RENAME(shuffle_bytes_2103);
2927 rgb32tobgr16 = RENAME(rgb32tobgr16);
2928 rgb32tobgr15 = RENAME(rgb32tobgr15);
2929 yv12toyuy2 = RENAME(yv12toyuy2);
2930 yv12touyvy = RENAME(yv12touyvy);
2931 yuv422ptoyuy2 = RENAME(yuv422ptoyuy2);
2932 yuv422ptouyvy = RENAME(yuv422ptouyvy);
2933 yuy2toyv12 = RENAME(yuy2toyv12);
2934 planar2x = RENAME(planar2x);
2935 rgb24toyv12 = RENAME(rgb24toyv12);
2936 interleaveBytes = RENAME(interleaveBytes);
2937 vu9_to_vu12 = RENAME(vu9_to_vu12);
2938 yvu9_to_yuy2 = RENAME(yvu9_to_yuy2);
2940 uyvytoyuv420 = RENAME(uyvytoyuv420);
2941 uyvytoyuv422 = RENAME(uyvytoyuv422);
2942 yuyvtoyuv420 = RENAME(yuyvtoyuv420);
2943 yuyvtoyuv422 = RENAME(yuyvtoyuv422);
2944 }
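/*
 * This initializer only repoints the public function pointers at the variants
 * compiled for the current CPU flavour. Illustratively (not the literal
 * rgb2rgb.c selection logic), the template is instantiated roughly like this:
 */
#if 0
#undef  RENAME
#define RENAME(a) a ## _MMX2    /* one such block per CPU flavour */
#include "rgb2rgb_template.c"   /* yields rgb2rgb_init_MMX2() etc. */
#endif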