libswscale/rgb2rgb_template.c
1 /*
2 * software RGB to RGB converter
3 * plus software PAL8 to RGB converter
4 * software YUV to YUV converter
5 * software YUV to RGB converter
6 * Written by Nick Kurshev.
7 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
8 * lots of big-endian byte-order fixes by Alex Beregszaszi
10 * This file is part of FFmpeg.
12 * FFmpeg is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
17 * FFmpeg is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
22 * You should have received a copy of the GNU General Public License
23 * along with FFmpeg; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
26 * The C code (not assembly, MMX, ...) of this file can be used
27 * under the LGPL license.
28 */
30 #include <stddef.h>
32 #undef PREFETCH
33 #undef MOVNTQ
34 #undef EMMS
35 #undef SFENCE
36 #undef MMREG_SIZE
37 #undef PREFETCHW
38 #undef PAVGB
40 #if HAVE_SSE2
41 #define MMREG_SIZE 16
42 #else
43 #define MMREG_SIZE 8
44 #endif
46 #if HAVE_AMD3DNOW
47 #define PREFETCH "prefetch"
48 #define PREFETCHW "prefetchw"
49 #define PAVGB "pavgusb"
50 #elif HAVE_MMX2
51 #define PREFETCH "prefetchnta"
52 #define PREFETCHW "prefetcht0"
53 #define PAVGB "pavgb"
54 #else
55 #define PREFETCH " # nop"
56 #define PREFETCHW " # nop"
57 #endif
59 #if HAVE_AMD3DNOW
60 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
61 #define EMMS "femms"
62 #else
63 #define EMMS "emms"
64 #endif
66 #if HAVE_MMX2
67 #define MOVNTQ "movntq"
68 #define SFENCE "sfence"
69 #else
70 #define MOVNTQ "movq"
71 #define SFENCE " # nop"
72 #endif
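/* Illustration (hypothetical helper, not part of the original file): how the
 * macros above combine into one non-temporal copy step. PREFETCH, MOVNTQ,
 * SFENCE and EMMS are the macros defined above, and RENAME() follows this
 * template's naming convention; the function itself is an assumption made
 * for this sketch only. */
#if HAVE_MMX
static inline void RENAME(example_stream_copy8)(uint8_t *dst, const uint8_t *src)
{
    __asm__ volatile(
        PREFETCH" 32%1 \n\t"     // hint: start fetching the next block
        "movq %1, %%mm0 \n\t"    // load 8 bytes into an MMX register
        MOVNTQ" %%mm0, %0 \n\t"  // store; bypasses the cache on MMX2
        :"=m"(*dst)
        :"m"(*src)
        :"memory");
    __asm__ volatile(SFENCE:::"memory"); // order the streaming store
    __asm__ volatile(EMMS:::"memory");   // restore FPU state after MMX use
}
#endif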
74 static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
76 uint8_t *dest = dst;
77 const uint8_t *s = src;
78 const uint8_t *end;
79 #if HAVE_MMX
80 const uint8_t *mm_end;
81 #endif
82 end = s + src_size;
83 #if HAVE_MMX
84 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
85 mm_end = end - 23;
86 __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
87 while (s < mm_end) {
88 __asm__ volatile(
89 PREFETCH" 32%1 \n\t"
90 "movd %1, %%mm0 \n\t"
91 "punpckldq 3%1, %%mm0 \n\t"
92 "movd 6%1, %%mm1 \n\t"
93 "punpckldq 9%1, %%mm1 \n\t"
94 "movd 12%1, %%mm2 \n\t"
95 "punpckldq 15%1, %%mm2 \n\t"
96 "movd 18%1, %%mm3 \n\t"
97 "punpckldq 21%1, %%mm3 \n\t"
98 "por %%mm7, %%mm0 \n\t"
99 "por %%mm7, %%mm1 \n\t"
100 "por %%mm7, %%mm2 \n\t"
101 "por %%mm7, %%mm3 \n\t"
102 MOVNTQ" %%mm0, %0 \n\t"
103 MOVNTQ" %%mm1, 8%0 \n\t"
104 MOVNTQ" %%mm2, 16%0 \n\t"
105 MOVNTQ" %%mm3, 24%0"
106 :"=m"(*dest)
107 :"m"(*s)
108 :"memory");
109 dest += 32;
110 s += 24;
112 __asm__ volatile(SFENCE:::"memory");
113 __asm__ volatile(EMMS:::"memory");
114 #endif
115 while (s < end) {
116 #if HAVE_BIGENDIAN
117 /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
118 *dest++ = 255;
119 *dest++ = s[2];
120 *dest++ = s[1];
121 *dest++ = s[0];
122 s+=3;
123 #else
124 *dest++ = *s++;
125 *dest++ = *s++;
126 *dest++ = *s++;
127 *dest++ = 255;
128 #endif
132 static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
134 uint8_t *dest = dst;
135 const uint8_t *s = src;
136 const uint8_t *end;
137 #if HAVE_MMX
138 const uint8_t *mm_end;
139 #endif
140 end = s + src_size;
141 #if HAVE_MMX
142 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
143 mm_end = end - 31;
144 while (s < mm_end) {
145 __asm__ volatile(
146 PREFETCH" 32%1 \n\t"
147 "movq %1, %%mm0 \n\t"
148 "movq 8%1, %%mm1 \n\t"
149 "movq 16%1, %%mm4 \n\t"
150 "movq 24%1, %%mm5 \n\t"
151 "movq %%mm0, %%mm2 \n\t"
152 "movq %%mm1, %%mm3 \n\t"
153 "movq %%mm4, %%mm6 \n\t"
154 "movq %%mm5, %%mm7 \n\t"
155 "psrlq $8, %%mm2 \n\t"
156 "psrlq $8, %%mm3 \n\t"
157 "psrlq $8, %%mm6 \n\t"
158 "psrlq $8, %%mm7 \n\t"
159 "pand %2, %%mm0 \n\t"
160 "pand %2, %%mm1 \n\t"
161 "pand %2, %%mm4 \n\t"
162 "pand %2, %%mm5 \n\t"
163 "pand %3, %%mm2 \n\t"
164 "pand %3, %%mm3 \n\t"
165 "pand %3, %%mm6 \n\t"
166 "pand %3, %%mm7 \n\t"
167 "por %%mm2, %%mm0 \n\t"
168 "por %%mm3, %%mm1 \n\t"
169 "por %%mm6, %%mm4 \n\t"
170 "por %%mm7, %%mm5 \n\t"
172 "movq %%mm1, %%mm2 \n\t"
173 "movq %%mm4, %%mm3 \n\t"
174 "psllq $48, %%mm2 \n\t"
175 "psllq $32, %%mm3 \n\t"
176 "pand %4, %%mm2 \n\t"
177 "pand %5, %%mm3 \n\t"
178 "por %%mm2, %%mm0 \n\t"
179 "psrlq $16, %%mm1 \n\t"
180 "psrlq $32, %%mm4 \n\t"
181 "psllq $16, %%mm5 \n\t"
182 "por %%mm3, %%mm1 \n\t"
183 "pand %6, %%mm5 \n\t"
184 "por %%mm5, %%mm4 \n\t"
186 MOVNTQ" %%mm0, %0 \n\t"
187 MOVNTQ" %%mm1, 8%0 \n\t"
188 MOVNTQ" %%mm4, 16%0"
189 :"=m"(*dest)
190 :"m"(*s),"m"(mask24l),
191 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
192 :"memory");
193 dest += 24;
194 s += 32;
196 __asm__ volatile(SFENCE:::"memory");
197 __asm__ volatile(EMMS:::"memory");
198 #endif
199 while (s < end) {
200 #if HAVE_BIGENDIAN
201 /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
202 s++;
203 dest[2] = *s++;
204 dest[1] = *s++;
205 dest[0] = *s++;
206 dest += 3;
207 #else
208 *dest++ = *s++;
209 *dest++ = *s++;
210 *dest++ = *s++;
211 s++;
212 #endif
216 /*
217 original by Strepto/Astral
218 ported to gcc & bugfixed: A'rpi
219 MMX2, 3DNOW optimization by Nick Kurshev
220 32-bit C version, and and&add trick by Michael Niedermayer
221 */
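/* Worked example of the and&add trick (added illustration): for a 15-bit
 * pixel x = 0rrrrrgggggbbbbb, (x & 0x7FFF) + (x & 0x7FE0) adds the R/G bit
 * field to itself, i.e. shifts it left by one place while leaving blue
 * untouched, yielding rrrrrggggg0bbbbb -- RGB565 with a zero as the new low
 * green bit. E.g. x = 0x7FFF (15-bit white):
 * 0x7FFF + 0x7FE0 = 0xFFDF = 11111 111110 11111. */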
222 static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size)
224 register const uint8_t* s=src;
225 register uint8_t* d=dst;
226 register const uint8_t *end;
227 const uint8_t *mm_end;
228 end = s + src_size;
229 #if HAVE_MMX
230 __asm__ volatile(PREFETCH" %0"::"m"(*s));
231 __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
232 mm_end = end - 15;
233 while (s<mm_end) {
234 __asm__ volatile(
235 PREFETCH" 32%1 \n\t"
236 "movq %1, %%mm0 \n\t"
237 "movq 8%1, %%mm2 \n\t"
238 "movq %%mm0, %%mm1 \n\t"
239 "movq %%mm2, %%mm3 \n\t"
240 "pand %%mm4, %%mm0 \n\t"
241 "pand %%mm4, %%mm2 \n\t"
242 "paddw %%mm1, %%mm0 \n\t"
243 "paddw %%mm3, %%mm2 \n\t"
244 MOVNTQ" %%mm0, %0 \n\t"
245 MOVNTQ" %%mm2, 8%0"
246 :"=m"(*d)
247 :"m"(*s)
249 d+=16;
250 s+=16;
252 __asm__ volatile(SFENCE:::"memory");
253 __asm__ volatile(EMMS:::"memory");
254 #endif
255 mm_end = end - 3;
256 while (s < mm_end) {
257 register unsigned x= *((const uint32_t *)s);
258 *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
259 d+=4;
260 s+=4;
262 if (s < end) {
263 register unsigned short x= *((const uint16_t *)s);
264 *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
268 static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, long src_size)
270 register const uint8_t* s=src;
271 register uint8_t* d=dst;
272 register const uint8_t *end;
273 const uint8_t *mm_end;
274 end = s + src_size;
275 #if HAVE_MMX
276 __asm__ volatile(PREFETCH" %0"::"m"(*s));
277 __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
278 __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
279 mm_end = end - 15;
280 while (s<mm_end) {
281 __asm__ volatile(
282 PREFETCH" 32%1 \n\t"
283 "movq %1, %%mm0 \n\t"
284 "movq 8%1, %%mm2 \n\t"
285 "movq %%mm0, %%mm1 \n\t"
286 "movq %%mm2, %%mm3 \n\t"
287 "psrlq $1, %%mm0 \n\t"
288 "psrlq $1, %%mm2 \n\t"
289 "pand %%mm7, %%mm0 \n\t"
290 "pand %%mm7, %%mm2 \n\t"
291 "pand %%mm6, %%mm1 \n\t"
292 "pand %%mm6, %%mm3 \n\t"
293 "por %%mm1, %%mm0 \n\t"
294 "por %%mm3, %%mm2 \n\t"
295 MOVNTQ" %%mm0, %0 \n\t"
296 MOVNTQ" %%mm2, 8%0"
297 :"=m"(*d)
298 :"m"(*s)
300 d+=16;
301 s+=16;
303 __asm__ volatile(SFENCE:::"memory");
304 __asm__ volatile(EMMS:::"memory");
305 #endif
306 mm_end = end - 3;
307 while (s < mm_end) {
308 register uint32_t x= *((const uint32_t*)s);
309 *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
310 s+=4;
311 d+=4;
313 if (s < end) {
314 register uint16_t x= *((const uint16_t*)s);
315 *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
319 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
321 const uint8_t *s = src;
322 const uint8_t *end;
323 #if HAVE_MMX
324 const uint8_t *mm_end;
325 #endif
326 uint16_t *d = (uint16_t *)dst;
327 end = s + src_size;
328 #if HAVE_MMX
329 mm_end = end - 15;
330 #if 1 // faster only if multiplies are reasonably fast (FIXME: figure out on which CPUs this is faster; on Athlon it is slightly faster)
331 __asm__ volatile(
332 "movq %3, %%mm5 \n\t"
333 "movq %4, %%mm6 \n\t"
334 "movq %5, %%mm7 \n\t"
335 "jmp 2f \n\t"
336 ASMALIGN(4)
337 "1: \n\t"
338 PREFETCH" 32(%1) \n\t"
339 "movd (%1), %%mm0 \n\t"
340 "movd 4(%1), %%mm3 \n\t"
341 "punpckldq 8(%1), %%mm0 \n\t"
342 "punpckldq 12(%1), %%mm3 \n\t"
343 "movq %%mm0, %%mm1 \n\t"
344 "movq %%mm3, %%mm4 \n\t"
345 "pand %%mm6, %%mm0 \n\t"
346 "pand %%mm6, %%mm3 \n\t"
347 "pmaddwd %%mm7, %%mm0 \n\t"
348 "pmaddwd %%mm7, %%mm3 \n\t"
349 "pand %%mm5, %%mm1 \n\t"
350 "pand %%mm5, %%mm4 \n\t"
351 "por %%mm1, %%mm0 \n\t"
352 "por %%mm4, %%mm3 \n\t"
353 "psrld $5, %%mm0 \n\t"
354 "pslld $11, %%mm3 \n\t"
355 "por %%mm3, %%mm0 \n\t"
356 MOVNTQ" %%mm0, (%0) \n\t"
357 "add $16, %1 \n\t"
358 "add $8, %0 \n\t"
359 "2: \n\t"
360 "cmp %2, %1 \n\t"
361 " jb 1b \n\t"
362 : "+r" (d), "+r"(s)
363 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
365 #else
366 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
367 __asm__ volatile(
368 "movq %0, %%mm7 \n\t"
369 "movq %1, %%mm6 \n\t"
370 ::"m"(red_16mask),"m"(green_16mask));
371 while (s < mm_end) {
372 __asm__ volatile(
373 PREFETCH" 32%1 \n\t"
374 "movd %1, %%mm0 \n\t"
375 "movd 4%1, %%mm3 \n\t"
376 "punpckldq 8%1, %%mm0 \n\t"
377 "punpckldq 12%1, %%mm3 \n\t"
378 "movq %%mm0, %%mm1 \n\t"
379 "movq %%mm0, %%mm2 \n\t"
380 "movq %%mm3, %%mm4 \n\t"
381 "movq %%mm3, %%mm5 \n\t"
382 "psrlq $3, %%mm0 \n\t"
383 "psrlq $3, %%mm3 \n\t"
384 "pand %2, %%mm0 \n\t"
385 "pand %2, %%mm3 \n\t"
386 "psrlq $5, %%mm1 \n\t"
387 "psrlq $5, %%mm4 \n\t"
388 "pand %%mm6, %%mm1 \n\t"
389 "pand %%mm6, %%mm4 \n\t"
390 "psrlq $8, %%mm2 \n\t"
391 "psrlq $8, %%mm5 \n\t"
392 "pand %%mm7, %%mm2 \n\t"
393 "pand %%mm7, %%mm5 \n\t"
394 "por %%mm1, %%mm0 \n\t"
395 "por %%mm4, %%mm3 \n\t"
396 "por %%mm2, %%mm0 \n\t"
397 "por %%mm5, %%mm3 \n\t"
398 "psllq $16, %%mm3 \n\t"
399 "por %%mm3, %%mm0 \n\t"
400 MOVNTQ" %%mm0, %0 \n\t"
401 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
402 d += 4;
403 s += 16;
405 #endif
406 __asm__ volatile(SFENCE:::"memory");
407 __asm__ volatile(EMMS:::"memory");
408 #endif
409 while (s < end) {
410 register int rgb = *(const uint32_t*)s; s += 4;
411 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
415 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
417 const uint8_t *s = src;
418 const uint8_t *end;
419 #if HAVE_MMX
420 const uint8_t *mm_end;
421 #endif
422 uint16_t *d = (uint16_t *)dst;
423 end = s + src_size;
424 #if HAVE_MMX
425 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
426 __asm__ volatile(
427 "movq %0, %%mm7 \n\t"
428 "movq %1, %%mm6 \n\t"
429 ::"m"(red_16mask),"m"(green_16mask));
430 mm_end = end - 15;
431 while (s < mm_end) {
432 __asm__ volatile(
433 PREFETCH" 32%1 \n\t"
434 "movd %1, %%mm0 \n\t"
435 "movd 4%1, %%mm3 \n\t"
436 "punpckldq 8%1, %%mm0 \n\t"
437 "punpckldq 12%1, %%mm3 \n\t"
438 "movq %%mm0, %%mm1 \n\t"
439 "movq %%mm0, %%mm2 \n\t"
440 "movq %%mm3, %%mm4 \n\t"
441 "movq %%mm3, %%mm5 \n\t"
442 "psllq $8, %%mm0 \n\t"
443 "psllq $8, %%mm3 \n\t"
444 "pand %%mm7, %%mm0 \n\t"
445 "pand %%mm7, %%mm3 \n\t"
446 "psrlq $5, %%mm1 \n\t"
447 "psrlq $5, %%mm4 \n\t"
448 "pand %%mm6, %%mm1 \n\t"
449 "pand %%mm6, %%mm4 \n\t"
450 "psrlq $19, %%mm2 \n\t"
451 "psrlq $19, %%mm5 \n\t"
452 "pand %2, %%mm2 \n\t"
453 "pand %2, %%mm5 \n\t"
454 "por %%mm1, %%mm0 \n\t"
455 "por %%mm4, %%mm3 \n\t"
456 "por %%mm2, %%mm0 \n\t"
457 "por %%mm5, %%mm3 \n\t"
458 "psllq $16, %%mm3 \n\t"
459 "por %%mm3, %%mm0 \n\t"
460 MOVNTQ" %%mm0, %0 \n\t"
461 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
462 d += 4;
463 s += 16;
465 __asm__ volatile(SFENCE:::"memory");
466 __asm__ volatile(EMMS:::"memory");
467 #endif
468 while (s < end) {
469 register int rgb = *(const uint32_t*)s; s += 4;
470 *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
474 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
476 const uint8_t *s = src;
477 const uint8_t *end;
478 #if HAVE_MMX
479 const uint8_t *mm_end;
480 #endif
481 uint16_t *d = (uint16_t *)dst;
482 end = s + src_size;
483 #if HAVE_MMX
484 mm_end = end - 15;
485 #if 1 // faster only if multiplies are reasonably fast (FIXME: figure out on which CPUs this is faster; on Athlon it is slightly faster)
486 __asm__ volatile(
487 "movq %3, %%mm5 \n\t"
488 "movq %4, %%mm6 \n\t"
489 "movq %5, %%mm7 \n\t"
490 "jmp 2f \n\t"
491 ASMALIGN(4)
492 "1: \n\t"
493 PREFETCH" 32(%1) \n\t"
494 "movd (%1), %%mm0 \n\t"
495 "movd 4(%1), %%mm3 \n\t"
496 "punpckldq 8(%1), %%mm0 \n\t"
497 "punpckldq 12(%1), %%mm3 \n\t"
498 "movq %%mm0, %%mm1 \n\t"
499 "movq %%mm3, %%mm4 \n\t"
500 "pand %%mm6, %%mm0 \n\t"
501 "pand %%mm6, %%mm3 \n\t"
502 "pmaddwd %%mm7, %%mm0 \n\t"
503 "pmaddwd %%mm7, %%mm3 \n\t"
504 "pand %%mm5, %%mm1 \n\t"
505 "pand %%mm5, %%mm4 \n\t"
506 "por %%mm1, %%mm0 \n\t"
507 "por %%mm4, %%mm3 \n\t"
508 "psrld $6, %%mm0 \n\t"
509 "pslld $10, %%mm3 \n\t"
510 "por %%mm3, %%mm0 \n\t"
511 MOVNTQ" %%mm0, (%0) \n\t"
512 "add $16, %1 \n\t"
513 "add $8, %0 \n\t"
514 "2: \n\t"
515 "cmp %2, %1 \n\t"
516 " jb 1b \n\t"
517 : "+r" (d), "+r"(s)
518 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
520 #else
521 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
522 __asm__ volatile(
523 "movq %0, %%mm7 \n\t"
524 "movq %1, %%mm6 \n\t"
525 ::"m"(red_15mask),"m"(green_15mask));
526 while (s < mm_end) {
527 __asm__ volatile(
528 PREFETCH" 32%1 \n\t"
529 "movd %1, %%mm0 \n\t"
530 "movd 4%1, %%mm3 \n\t"
531 "punpckldq 8%1, %%mm0 \n\t"
532 "punpckldq 12%1, %%mm3 \n\t"
533 "movq %%mm0, %%mm1 \n\t"
534 "movq %%mm0, %%mm2 \n\t"
535 "movq %%mm3, %%mm4 \n\t"
536 "movq %%mm3, %%mm5 \n\t"
537 "psrlq $3, %%mm0 \n\t"
538 "psrlq $3, %%mm3 \n\t"
539 "pand %2, %%mm0 \n\t"
540 "pand %2, %%mm3 \n\t"
541 "psrlq $6, %%mm1 \n\t"
542 "psrlq $6, %%mm4 \n\t"
543 "pand %%mm6, %%mm1 \n\t"
544 "pand %%mm6, %%mm4 \n\t"
545 "psrlq $9, %%mm2 \n\t"
546 "psrlq $9, %%mm5 \n\t"
547 "pand %%mm7, %%mm2 \n\t"
548 "pand %%mm7, %%mm5 \n\t"
549 "por %%mm1, %%mm0 \n\t"
550 "por %%mm4, %%mm3 \n\t"
551 "por %%mm2, %%mm0 \n\t"
552 "por %%mm5, %%mm3 \n\t"
553 "psllq $16, %%mm3 \n\t"
554 "por %%mm3, %%mm0 \n\t"
555 MOVNTQ" %%mm0, %0 \n\t"
556 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
557 d += 4;
558 s += 16;
560 #endif
561 __asm__ volatile(SFENCE:::"memory");
562 __asm__ volatile(EMMS:::"memory");
563 #endif
564 while (s < end) {
565 register int rgb = *(const uint32_t*)s; s += 4;
566 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
570 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
572 const uint8_t *s = src;
573 const uint8_t *end;
574 #if HAVE_MMX
575 const uint8_t *mm_end;
576 #endif
577 uint16_t *d = (uint16_t *)dst;
578 end = s + src_size;
579 #if HAVE_MMX
580 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
581 __asm__ volatile(
582 "movq %0, %%mm7 \n\t"
583 "movq %1, %%mm6 \n\t"
584 ::"m"(red_15mask),"m"(green_15mask));
585 mm_end = end - 15;
586 while (s < mm_end) {
587 __asm__ volatile(
588 PREFETCH" 32%1 \n\t"
589 "movd %1, %%mm0 \n\t"
590 "movd 4%1, %%mm3 \n\t"
591 "punpckldq 8%1, %%mm0 \n\t"
592 "punpckldq 12%1, %%mm3 \n\t"
593 "movq %%mm0, %%mm1 \n\t"
594 "movq %%mm0, %%mm2 \n\t"
595 "movq %%mm3, %%mm4 \n\t"
596 "movq %%mm3, %%mm5 \n\t"
597 "psllq $7, %%mm0 \n\t"
598 "psllq $7, %%mm3 \n\t"
599 "pand %%mm7, %%mm0 \n\t"
600 "pand %%mm7, %%mm3 \n\t"
601 "psrlq $6, %%mm1 \n\t"
602 "psrlq $6, %%mm4 \n\t"
603 "pand %%mm6, %%mm1 \n\t"
604 "pand %%mm6, %%mm4 \n\t"
605 "psrlq $19, %%mm2 \n\t"
606 "psrlq $19, %%mm5 \n\t"
607 "pand %2, %%mm2 \n\t"
608 "pand %2, %%mm5 \n\t"
609 "por %%mm1, %%mm0 \n\t"
610 "por %%mm4, %%mm3 \n\t"
611 "por %%mm2, %%mm0 \n\t"
612 "por %%mm5, %%mm3 \n\t"
613 "psllq $16, %%mm3 \n\t"
614 "por %%mm3, %%mm0 \n\t"
615 MOVNTQ" %%mm0, %0 \n\t"
616 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
617 d += 4;
618 s += 16;
620 __asm__ volatile(SFENCE:::"memory");
621 __asm__ volatile(EMMS:::"memory");
622 #endif
623 while (s < end) {
624 register int rgb = *(const uint32_t*)s; s += 4;
625 *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
629 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
631 const uint8_t *s = src;
632 const uint8_t *end;
633 #if HAVE_MMX
634 const uint8_t *mm_end;
635 #endif
636 uint16_t *d = (uint16_t *)dst;
637 end = s + src_size;
638 #if HAVE_MMX
639 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
640 __asm__ volatile(
641 "movq %0, %%mm7 \n\t"
642 "movq %1, %%mm6 \n\t"
643 ::"m"(red_16mask),"m"(green_16mask));
644 mm_end = end - 11;
645 while (s < mm_end) {
646 __asm__ volatile(
647 PREFETCH" 32%1 \n\t"
648 "movd %1, %%mm0 \n\t"
649 "movd 3%1, %%mm3 \n\t"
650 "punpckldq 6%1, %%mm0 \n\t"
651 "punpckldq 9%1, %%mm3 \n\t"
652 "movq %%mm0, %%mm1 \n\t"
653 "movq %%mm0, %%mm2 \n\t"
654 "movq %%mm3, %%mm4 \n\t"
655 "movq %%mm3, %%mm5 \n\t"
656 "psrlq $3, %%mm0 \n\t"
657 "psrlq $3, %%mm3 \n\t"
658 "pand %2, %%mm0 \n\t"
659 "pand %2, %%mm3 \n\t"
660 "psrlq $5, %%mm1 \n\t"
661 "psrlq $5, %%mm4 \n\t"
662 "pand %%mm6, %%mm1 \n\t"
663 "pand %%mm6, %%mm4 \n\t"
664 "psrlq $8, %%mm2 \n\t"
665 "psrlq $8, %%mm5 \n\t"
666 "pand %%mm7, %%mm2 \n\t"
667 "pand %%mm7, %%mm5 \n\t"
668 "por %%mm1, %%mm0 \n\t"
669 "por %%mm4, %%mm3 \n\t"
670 "por %%mm2, %%mm0 \n\t"
671 "por %%mm5, %%mm3 \n\t"
672 "psllq $16, %%mm3 \n\t"
673 "por %%mm3, %%mm0 \n\t"
674 MOVNTQ" %%mm0, %0 \n\t"
675 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
676 d += 4;
677 s += 12;
679 __asm__ volatile(SFENCE:::"memory");
680 __asm__ volatile(EMMS:::"memory");
681 #endif
682 while (s < end) {
683 const int b = *s++;
684 const int g = *s++;
685 const int r = *s++;
686 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
690 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
692 const uint8_t *s = src;
693 const uint8_t *end;
694 #if HAVE_MMX
695 const uint8_t *mm_end;
696 #endif
697 uint16_t *d = (uint16_t *)dst;
698 end = s + src_size;
699 #if HAVE_MMX
700 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
701 __asm__ volatile(
702 "movq %0, %%mm7 \n\t"
703 "movq %1, %%mm6 \n\t"
704 ::"m"(red_16mask),"m"(green_16mask));
705 mm_end = end - 15;
706 while (s < mm_end) {
707 __asm__ volatile(
708 PREFETCH" 32%1 \n\t"
709 "movd %1, %%mm0 \n\t"
710 "movd 3%1, %%mm3 \n\t"
711 "punpckldq 6%1, %%mm0 \n\t"
712 "punpckldq 9%1, %%mm3 \n\t"
713 "movq %%mm0, %%mm1 \n\t"
714 "movq %%mm0, %%mm2 \n\t"
715 "movq %%mm3, %%mm4 \n\t"
716 "movq %%mm3, %%mm5 \n\t"
717 "psllq $8, %%mm0 \n\t"
718 "psllq $8, %%mm3 \n\t"
719 "pand %%mm7, %%mm0 \n\t"
720 "pand %%mm7, %%mm3 \n\t"
721 "psrlq $5, %%mm1 \n\t"
722 "psrlq $5, %%mm4 \n\t"
723 "pand %%mm6, %%mm1 \n\t"
724 "pand %%mm6, %%mm4 \n\t"
725 "psrlq $19, %%mm2 \n\t"
726 "psrlq $19, %%mm5 \n\t"
727 "pand %2, %%mm2 \n\t"
728 "pand %2, %%mm5 \n\t"
729 "por %%mm1, %%mm0 \n\t"
730 "por %%mm4, %%mm3 \n\t"
731 "por %%mm2, %%mm0 \n\t"
732 "por %%mm5, %%mm3 \n\t"
733 "psllq $16, %%mm3 \n\t"
734 "por %%mm3, %%mm0 \n\t"
735 MOVNTQ" %%mm0, %0 \n\t"
736 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
737 d += 4;
738 s += 12;
740 __asm__ volatile(SFENCE:::"memory");
741 __asm__ volatile(EMMS:::"memory");
742 #endif
743 while (s < end) {
744 const int r = *s++;
745 const int g = *s++;
746 const int b = *s++;
747 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
751 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
753 const uint8_t *s = src;
754 const uint8_t *end;
755 #if HAVE_MMX
756 const uint8_t *mm_end;
757 #endif
758 uint16_t *d = (uint16_t *)dst;
759 end = s + src_size;
760 #if HAVE_MMX
761 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
762 __asm__ volatile(
763 "movq %0, %%mm7 \n\t"
764 "movq %1, %%mm6 \n\t"
765 ::"m"(red_15mask),"m"(green_15mask));
766 mm_end = end - 11;
767 while (s < mm_end) {
768 __asm__ volatile(
769 PREFETCH" 32%1 \n\t"
770 "movd %1, %%mm0 \n\t"
771 "movd 3%1, %%mm3 \n\t"
772 "punpckldq 6%1, %%mm0 \n\t"
773 "punpckldq 9%1, %%mm3 \n\t"
774 "movq %%mm0, %%mm1 \n\t"
775 "movq %%mm0, %%mm2 \n\t"
776 "movq %%mm3, %%mm4 \n\t"
777 "movq %%mm3, %%mm5 \n\t"
778 "psrlq $3, %%mm0 \n\t"
779 "psrlq $3, %%mm3 \n\t"
780 "pand %2, %%mm0 \n\t"
781 "pand %2, %%mm3 \n\t"
782 "psrlq $6, %%mm1 \n\t"
783 "psrlq $6, %%mm4 \n\t"
784 "pand %%mm6, %%mm1 \n\t"
785 "pand %%mm6, %%mm4 \n\t"
786 "psrlq $9, %%mm2 \n\t"
787 "psrlq $9, %%mm5 \n\t"
788 "pand %%mm7, %%mm2 \n\t"
789 "pand %%mm7, %%mm5 \n\t"
790 "por %%mm1, %%mm0 \n\t"
791 "por %%mm4, %%mm3 \n\t"
792 "por %%mm2, %%mm0 \n\t"
793 "por %%mm5, %%mm3 \n\t"
794 "psllq $16, %%mm3 \n\t"
795 "por %%mm3, %%mm0 \n\t"
796 MOVNTQ" %%mm0, %0 \n\t"
797 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
798 d += 4;
799 s += 12;
801 __asm__ volatile(SFENCE:::"memory");
802 __asm__ volatile(EMMS:::"memory");
803 #endif
804 while (s < end) {
805 const int b = *s++;
806 const int g = *s++;
807 const int r = *s++;
808 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
812 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
814 const uint8_t *s = src;
815 const uint8_t *end;
816 #if HAVE_MMX
817 const uint8_t *mm_end;
818 #endif
819 uint16_t *d = (uint16_t *)dst;
820 end = s + src_size;
821 #if HAVE_MMX
822 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
823 __asm__ volatile(
824 "movq %0, %%mm7 \n\t"
825 "movq %1, %%mm6 \n\t"
826 ::"m"(red_15mask),"m"(green_15mask));
827 mm_end = end - 15;
828 while (s < mm_end) {
829 __asm__ volatile(
830 PREFETCH" 32%1 \n\t"
831 "movd %1, %%mm0 \n\t"
832 "movd 3%1, %%mm3 \n\t"
833 "punpckldq 6%1, %%mm0 \n\t"
834 "punpckldq 9%1, %%mm3 \n\t"
835 "movq %%mm0, %%mm1 \n\t"
836 "movq %%mm0, %%mm2 \n\t"
837 "movq %%mm3, %%mm4 \n\t"
838 "movq %%mm3, %%mm5 \n\t"
839 "psllq $7, %%mm0 \n\t"
840 "psllq $7, %%mm3 \n\t"
841 "pand %%mm7, %%mm0 \n\t"
842 "pand %%mm7, %%mm3 \n\t"
843 "psrlq $6, %%mm1 \n\t"
844 "psrlq $6, %%mm4 \n\t"
845 "pand %%mm6, %%mm1 \n\t"
846 "pand %%mm6, %%mm4 \n\t"
847 "psrlq $19, %%mm2 \n\t"
848 "psrlq $19, %%mm5 \n\t"
849 "pand %2, %%mm2 \n\t"
850 "pand %2, %%mm5 \n\t"
851 "por %%mm1, %%mm0 \n\t"
852 "por %%mm4, %%mm3 \n\t"
853 "por %%mm2, %%mm0 \n\t"
854 "por %%mm5, %%mm3 \n\t"
855 "psllq $16, %%mm3 \n\t"
856 "por %%mm3, %%mm0 \n\t"
857 MOVNTQ" %%mm0, %0 \n\t"
858 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
859 d += 4;
860 s += 12;
862 __asm__ volatile(SFENCE:::"memory");
863 __asm__ volatile(EMMS:::"memory");
864 #endif
865 while (s < end) {
866 const int r = *s++;
867 const int g = *s++;
868 const int b = *s++;
869 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
873 /*
874 I use a less accurate approximation here by simply left-shifting the input
875 value and filling the low order bits with zeroes. This method improves PNG
876 compression, but this scheme cannot reproduce white exactly, since it does
877 not generate an all-ones maximum value; the net effect is to darken the
878 image slightly.
880 The better method is "left bit replication":
882  4 3 2 1 0
883  ---------
884  1 1 0 1 1
886  7 6 5 4 3 2 1 0
887  ----------------
888  1 1 0 1 1  1 1 0
889  |=======|  |===|
890      |      leftmost bits repeated to fill open bits
891      |
892  original bits
893 */
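/* A minimal sketch of left bit replication (added illustration; the
 * converters below deliberately keep the faster shift-and-zero-fill
 * approach). The helper is hypothetical; RENAME() follows this template's
 * convention: */
static inline uint8_t RENAME(example_replicate5to8)(uint8_t x5)
{
    /* 0b11011 -> 0b11011110, and 0x1F maps to 0xFF, so white stays white */
    return (uint8_t)((x5 << 3) | (x5 >> 2));
}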
894 static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
896 const uint16_t *end;
897 #if HAVE_MMX
898 const uint16_t *mm_end;
899 #endif
900 uint8_t *d = dst;
901 const uint16_t *s = (const uint16_t*)src;
902 end = s + src_size/2;
903 #if HAVE_MMX
904 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
905 mm_end = end - 7;
906 while (s < mm_end) {
907 __asm__ volatile(
908 PREFETCH" 32%1 \n\t"
909 "movq %1, %%mm0 \n\t"
910 "movq %1, %%mm1 \n\t"
911 "movq %1, %%mm2 \n\t"
912 "pand %2, %%mm0 \n\t"
913 "pand %3, %%mm1 \n\t"
914 "pand %4, %%mm2 \n\t"
915 "psllq $3, %%mm0 \n\t"
916 "psrlq $2, %%mm1 \n\t"
917 "psrlq $7, %%mm2 \n\t"
918 "movq %%mm0, %%mm3 \n\t"
919 "movq %%mm1, %%mm4 \n\t"
920 "movq %%mm2, %%mm5 \n\t"
921 "punpcklwd %5, %%mm0 \n\t"
922 "punpcklwd %5, %%mm1 \n\t"
923 "punpcklwd %5, %%mm2 \n\t"
924 "punpckhwd %5, %%mm3 \n\t"
925 "punpckhwd %5, %%mm4 \n\t"
926 "punpckhwd %5, %%mm5 \n\t"
927 "psllq $8, %%mm1 \n\t"
928 "psllq $16, %%mm2 \n\t"
929 "por %%mm1, %%mm0 \n\t"
930 "por %%mm2, %%mm0 \n\t"
931 "psllq $8, %%mm4 \n\t"
932 "psllq $16, %%mm5 \n\t"
933 "por %%mm4, %%mm3 \n\t"
934 "por %%mm5, %%mm3 \n\t"
936 "movq %%mm0, %%mm6 \n\t"
937 "movq %%mm3, %%mm7 \n\t"
939 "movq 8%1, %%mm0 \n\t"
940 "movq 8%1, %%mm1 \n\t"
941 "movq 8%1, %%mm2 \n\t"
942 "pand %2, %%mm0 \n\t"
943 "pand %3, %%mm1 \n\t"
944 "pand %4, %%mm2 \n\t"
945 "psllq $3, %%mm0 \n\t"
946 "psrlq $2, %%mm1 \n\t"
947 "psrlq $7, %%mm2 \n\t"
948 "movq %%mm0, %%mm3 \n\t"
949 "movq %%mm1, %%mm4 \n\t"
950 "movq %%mm2, %%mm5 \n\t"
951 "punpcklwd %5, %%mm0 \n\t"
952 "punpcklwd %5, %%mm1 \n\t"
953 "punpcklwd %5, %%mm2 \n\t"
954 "punpckhwd %5, %%mm3 \n\t"
955 "punpckhwd %5, %%mm4 \n\t"
956 "punpckhwd %5, %%mm5 \n\t"
957 "psllq $8, %%mm1 \n\t"
958 "psllq $16, %%mm2 \n\t"
959 "por %%mm1, %%mm0 \n\t"
960 "por %%mm2, %%mm0 \n\t"
961 "psllq $8, %%mm4 \n\t"
962 "psllq $16, %%mm5 \n\t"
963 "por %%mm4, %%mm3 \n\t"
964 "por %%mm5, %%mm3 \n\t"
966 :"=m"(*d)
967 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
968 :"memory");
969 /* borrowed 32 to 24 */
970 __asm__ volatile(
971 "movq %%mm0, %%mm4 \n\t"
972 "movq %%mm3, %%mm5 \n\t"
973 "movq %%mm6, %%mm0 \n\t"
974 "movq %%mm7, %%mm1 \n\t"
976 "movq %%mm4, %%mm6 \n\t"
977 "movq %%mm5, %%mm7 \n\t"
978 "movq %%mm0, %%mm2 \n\t"
979 "movq %%mm1, %%mm3 \n\t"
981 "psrlq $8, %%mm2 \n\t"
982 "psrlq $8, %%mm3 \n\t"
983 "psrlq $8, %%mm6 \n\t"
984 "psrlq $8, %%mm7 \n\t"
985 "pand %2, %%mm0 \n\t"
986 "pand %2, %%mm1 \n\t"
987 "pand %2, %%mm4 \n\t"
988 "pand %2, %%mm5 \n\t"
989 "pand %3, %%mm2 \n\t"
990 "pand %3, %%mm3 \n\t"
991 "pand %3, %%mm6 \n\t"
992 "pand %3, %%mm7 \n\t"
993 "por %%mm2, %%mm0 \n\t"
994 "por %%mm3, %%mm1 \n\t"
995 "por %%mm6, %%mm4 \n\t"
996 "por %%mm7, %%mm5 \n\t"
998 "movq %%mm1, %%mm2 \n\t"
999 "movq %%mm4, %%mm3 \n\t"
1000 "psllq $48, %%mm2 \n\t"
1001 "psllq $32, %%mm3 \n\t"
1002 "pand %4, %%mm2 \n\t"
1003 "pand %5, %%mm3 \n\t"
1004 "por %%mm2, %%mm0 \n\t"
1005 "psrlq $16, %%mm1 \n\t"
1006 "psrlq $32, %%mm4 \n\t"
1007 "psllq $16, %%mm5 \n\t"
1008 "por %%mm3, %%mm1 \n\t"
1009 "pand %6, %%mm5 \n\t"
1010 "por %%mm5, %%mm4 \n\t"
1012 MOVNTQ" %%mm0, %0 \n\t"
1013 MOVNTQ" %%mm1, 8%0 \n\t"
1014 MOVNTQ" %%mm4, 16%0"
1016 :"=m"(*d)
1017 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1018 :"memory");
1019 d += 24;
1020 s += 8;
1022 __asm__ volatile(SFENCE:::"memory");
1023 __asm__ volatile(EMMS:::"memory");
1024 #endif
1025 while (s < end) {
1026 register uint16_t bgr;
1027 bgr = *s++;
1028 *d++ = (bgr&0x1F)<<3;
1029 *d++ = (bgr&0x3E0)>>2;
1030 *d++ = (bgr&0x7C00)>>7;
1034 static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
1036 const uint16_t *end;
1037 #if HAVE_MMX
1038 const uint16_t *mm_end;
1039 #endif
1040 uint8_t *d = (uint8_t *)dst;
1041 const uint16_t *s = (const uint16_t *)src;
1042 end = s + src_size/2;
1043 #if HAVE_MMX
1044 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
1045 mm_end = end - 7;
1046 while (s < mm_end) {
1047 __asm__ volatile(
1048 PREFETCH" 32%1 \n\t"
1049 "movq %1, %%mm0 \n\t"
1050 "movq %1, %%mm1 \n\t"
1051 "movq %1, %%mm2 \n\t"
1052 "pand %2, %%mm0 \n\t"
1053 "pand %3, %%mm1 \n\t"
1054 "pand %4, %%mm2 \n\t"
1055 "psllq $3, %%mm0 \n\t"
1056 "psrlq $3, %%mm1 \n\t"
1057 "psrlq $8, %%mm2 \n\t"
1058 "movq %%mm0, %%mm3 \n\t"
1059 "movq %%mm1, %%mm4 \n\t"
1060 "movq %%mm2, %%mm5 \n\t"
1061 "punpcklwd %5, %%mm0 \n\t"
1062 "punpcklwd %5, %%mm1 \n\t"
1063 "punpcklwd %5, %%mm2 \n\t"
1064 "punpckhwd %5, %%mm3 \n\t"
1065 "punpckhwd %5, %%mm4 \n\t"
1066 "punpckhwd %5, %%mm5 \n\t"
1067 "psllq $8, %%mm1 \n\t"
1068 "psllq $16, %%mm2 \n\t"
1069 "por %%mm1, %%mm0 \n\t"
1070 "por %%mm2, %%mm0 \n\t"
1071 "psllq $8, %%mm4 \n\t"
1072 "psllq $16, %%mm5 \n\t"
1073 "por %%mm4, %%mm3 \n\t"
1074 "por %%mm5, %%mm3 \n\t"
1076 "movq %%mm0, %%mm6 \n\t"
1077 "movq %%mm3, %%mm7 \n\t"
1079 "movq 8%1, %%mm0 \n\t"
1080 "movq 8%1, %%mm1 \n\t"
1081 "movq 8%1, %%mm2 \n\t"
1082 "pand %2, %%mm0 \n\t"
1083 "pand %3, %%mm1 \n\t"
1084 "pand %4, %%mm2 \n\t"
1085 "psllq $3, %%mm0 \n\t"
1086 "psrlq $3, %%mm1 \n\t"
1087 "psrlq $8, %%mm2 \n\t"
1088 "movq %%mm0, %%mm3 \n\t"
1089 "movq %%mm1, %%mm4 \n\t"
1090 "movq %%mm2, %%mm5 \n\t"
1091 "punpcklwd %5, %%mm0 \n\t"
1092 "punpcklwd %5, %%mm1 \n\t"
1093 "punpcklwd %5, %%mm2 \n\t"
1094 "punpckhwd %5, %%mm3 \n\t"
1095 "punpckhwd %5, %%mm4 \n\t"
1096 "punpckhwd %5, %%mm5 \n\t"
1097 "psllq $8, %%mm1 \n\t"
1098 "psllq $16, %%mm2 \n\t"
1099 "por %%mm1, %%mm0 \n\t"
1100 "por %%mm2, %%mm0 \n\t"
1101 "psllq $8, %%mm4 \n\t"
1102 "psllq $16, %%mm5 \n\t"
1103 "por %%mm4, %%mm3 \n\t"
1104 "por %%mm5, %%mm3 \n\t"
1105 :"=m"(*d)
1106 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
1107 :"memory");
1108 /* borrowed 32 to 24 */
1109 __asm__ volatile(
1110 "movq %%mm0, %%mm4 \n\t"
1111 "movq %%mm3, %%mm5 \n\t"
1112 "movq %%mm6, %%mm0 \n\t"
1113 "movq %%mm7, %%mm1 \n\t"
1115 "movq %%mm4, %%mm6 \n\t"
1116 "movq %%mm5, %%mm7 \n\t"
1117 "movq %%mm0, %%mm2 \n\t"
1118 "movq %%mm1, %%mm3 \n\t"
1120 "psrlq $8, %%mm2 \n\t"
1121 "psrlq $8, %%mm3 \n\t"
1122 "psrlq $8, %%mm6 \n\t"
1123 "psrlq $8, %%mm7 \n\t"
1124 "pand %2, %%mm0 \n\t"
1125 "pand %2, %%mm1 \n\t"
1126 "pand %2, %%mm4 \n\t"
1127 "pand %2, %%mm5 \n\t"
1128 "pand %3, %%mm2 \n\t"
1129 "pand %3, %%mm3 \n\t"
1130 "pand %3, %%mm6 \n\t"
1131 "pand %3, %%mm7 \n\t"
1132 "por %%mm2, %%mm0 \n\t"
1133 "por %%mm3, %%mm1 \n\t"
1134 "por %%mm6, %%mm4 \n\t"
1135 "por %%mm7, %%mm5 \n\t"
1137 "movq %%mm1, %%mm2 \n\t"
1138 "movq %%mm4, %%mm3 \n\t"
1139 "psllq $48, %%mm2 \n\t"
1140 "psllq $32, %%mm3 \n\t"
1141 "pand %4, %%mm2 \n\t"
1142 "pand %5, %%mm3 \n\t"
1143 "por %%mm2, %%mm0 \n\t"
1144 "psrlq $16, %%mm1 \n\t"
1145 "psrlq $32, %%mm4 \n\t"
1146 "psllq $16, %%mm5 \n\t"
1147 "por %%mm3, %%mm1 \n\t"
1148 "pand %6, %%mm5 \n\t"
1149 "por %%mm5, %%mm4 \n\t"
1151 MOVNTQ" %%mm0, %0 \n\t"
1152 MOVNTQ" %%mm1, 8%0 \n\t"
1153 MOVNTQ" %%mm4, 16%0"
1155 :"=m"(*d)
1156 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1157 :"memory");
1158 d += 24;
1159 s += 8;
1161 __asm__ volatile(SFENCE:::"memory");
1162 __asm__ volatile(EMMS:::"memory");
1163 #endif
1164 while (s < end) {
1165 register uint16_t bgr;
1166 bgr = *s++;
1167 *d++ = (bgr&0x1F)<<3;
1168 *d++ = (bgr&0x7E0)>>3;
1169 *d++ = (bgr&0xF800)>>8;
1173 /*
1174 * mm0 = 00 B3 00 B2 00 B1 00 B0
1175 * mm1 = 00 G3 00 G2 00 G1 00 G0
1176 * mm2 = 00 R3 00 R2 00 R1 00 R0
1177 * mm6 = FF FF FF FF FF FF FF FF
1178 * mm7 = 00 00 00 00 00 00 00 00
1179 */
1180 #define PACK_RGB32 \
1181 "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
1182 "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
1183 "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
1184 "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
1185 "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
1186 "movq %%mm0, %%mm3 \n\t" \
1187 "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
1188 "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
1189 MOVNTQ" %%mm0, %0 \n\t" \
1190 MOVNTQ" %%mm3, 8%0 \n\t" \
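/* Note (added): PACK_RGB32 packs the four separated pixels back into two
 * 8-byte words with 0xFF filling the alpha bytes and streams them to %0 and
 * 8%0; rgb15to32() and rgb16to32() below expand each channel into the
 * mm0-mm2 layout shown above before invoking it. */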
1192 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
1194 const uint16_t *end;
1195 #if HAVE_MMX
1196 const uint16_t *mm_end;
1197 #endif
1198 uint8_t *d = dst;
1199 const uint16_t *s = (const uint16_t *)src;
1200 end = s + src_size/2;
1201 #if HAVE_MMX
1202 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
1203 __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
1204 __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
1205 mm_end = end - 3;
1206 while (s < mm_end) {
1207 __asm__ volatile(
1208 PREFETCH" 32%1 \n\t"
1209 "movq %1, %%mm0 \n\t"
1210 "movq %1, %%mm1 \n\t"
1211 "movq %1, %%mm2 \n\t"
1212 "pand %2, %%mm0 \n\t"
1213 "pand %3, %%mm1 \n\t"
1214 "pand %4, %%mm2 \n\t"
1215 "psllq $3, %%mm0 \n\t"
1216 "psrlq $2, %%mm1 \n\t"
1217 "psrlq $7, %%mm2 \n\t"
1218 PACK_RGB32
1219 :"=m"(*d)
1220 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1221 :"memory");
1222 d += 16;
1223 s += 4;
1225 __asm__ volatile(SFENCE:::"memory");
1226 __asm__ volatile(EMMS:::"memory");
1227 #endif
1228 while (s < end) {
1229 register uint16_t bgr;
1230 bgr = *s++;
1231 #if HAVE_BIGENDIAN
1232 *d++ = 255;
1233 *d++ = (bgr&0x7C00)>>7;
1234 *d++ = (bgr&0x3E0)>>2;
1235 *d++ = (bgr&0x1F)<<3;
1236 #else
1237 *d++ = (bgr&0x1F)<<3;
1238 *d++ = (bgr&0x3E0)>>2;
1239 *d++ = (bgr&0x7C00)>>7;
1240 *d++ = 255;
1241 #endif
1245 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
1247 const uint16_t *end;
1248 #if HAVE_MMX
1249 const uint16_t *mm_end;
1250 #endif
1251 uint8_t *d = dst;
1252 const uint16_t *s = (const uint16_t*)src;
1253 end = s + src_size/2;
1254 #if HAVE_MMX
1255 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
1256 __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
1257 __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
1258 mm_end = end - 3;
1259 while (s < mm_end) {
1260 __asm__ volatile(
1261 PREFETCH" 32%1 \n\t"
1262 "movq %1, %%mm0 \n\t"
1263 "movq %1, %%mm1 \n\t"
1264 "movq %1, %%mm2 \n\t"
1265 "pand %2, %%mm0 \n\t"
1266 "pand %3, %%mm1 \n\t"
1267 "pand %4, %%mm2 \n\t"
1268 "psllq $3, %%mm0 \n\t"
1269 "psrlq $3, %%mm1 \n\t"
1270 "psrlq $8, %%mm2 \n\t"
1271 PACK_RGB32
1272 :"=m"(*d)
1273 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1274 :"memory");
1275 d += 16;
1276 s += 4;
1278 __asm__ volatile(SFENCE:::"memory");
1279 __asm__ volatile(EMMS:::"memory");
1280 #endif
1281 while (s < end) {
1282 register uint16_t bgr;
1283 bgr = *s++;
1284 #if HAVE_BIGENDIAN
1285 *d++ = 255;
1286 *d++ = (bgr&0xF800)>>8;
1287 *d++ = (bgr&0x7E0)>>3;
1288 *d++ = (bgr&0x1F)<<3;
1289 #else
1290 *d++ = (bgr&0x1F)<<3;
1291 *d++ = (bgr&0x7E0)>>3;
1292 *d++ = (bgr&0xF800)>>8;
1293 *d++ = 255;
1294 #endif
1298 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
1300 x86_reg idx = 15 - src_size;
1301 const uint8_t *s = src-idx;
1302 uint8_t *d = dst-idx;
1303 #if HAVE_MMX
1304 __asm__ volatile(
1305 "test %0, %0 \n\t"
1306 "jns 2f \n\t"
1307 PREFETCH" (%1, %0) \n\t"
1308 "movq %3, %%mm7 \n\t"
1309 "pxor %4, %%mm7 \n\t"
1310 "movq %%mm7, %%mm6 \n\t"
1311 "pxor %5, %%mm7 \n\t"
1312 ASMALIGN(4)
1313 "1: \n\t"
1314 PREFETCH" 32(%1, %0) \n\t"
1315 "movq (%1, %0), %%mm0 \n\t"
1316 "movq 8(%1, %0), %%mm1 \n\t"
1317 # if HAVE_MMX2
1318 "pshufw $177, %%mm0, %%mm3 \n\t"
1319 "pshufw $177, %%mm1, %%mm5 \n\t"
1320 "pand %%mm7, %%mm0 \n\t"
1321 "pand %%mm6, %%mm3 \n\t"
1322 "pand %%mm7, %%mm1 \n\t"
1323 "pand %%mm6, %%mm5 \n\t"
1324 "por %%mm3, %%mm0 \n\t"
1325 "por %%mm5, %%mm1 \n\t"
1326 # else
1327 "movq %%mm0, %%mm2 \n\t"
1328 "movq %%mm1, %%mm4 \n\t"
1329 "pand %%mm7, %%mm0 \n\t"
1330 "pand %%mm6, %%mm2 \n\t"
1331 "pand %%mm7, %%mm1 \n\t"
1332 "pand %%mm6, %%mm4 \n\t"
1333 "movq %%mm2, %%mm3 \n\t"
1334 "movq %%mm4, %%mm5 \n\t"
1335 "pslld $16, %%mm2 \n\t"
1336 "psrld $16, %%mm3 \n\t"
1337 "pslld $16, %%mm4 \n\t"
1338 "psrld $16, %%mm5 \n\t"
1339 "por %%mm2, %%mm0 \n\t"
1340 "por %%mm4, %%mm1 \n\t"
1341 "por %%mm3, %%mm0 \n\t"
1342 "por %%mm5, %%mm1 \n\t"
1343 # endif
1344 MOVNTQ" %%mm0, (%2, %0) \n\t"
1345 MOVNTQ" %%mm1, 8(%2, %0) \n\t"
1346 "add $16, %0 \n\t"
1347 "js 1b \n\t"
1348 SFENCE" \n\t"
1349 EMMS" \n\t"
1350 "2: \n\t"
1351 : "+&r"(idx)
1352 : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
1353 : "memory");
1354 #endif
1355 for (; idx<15; idx+=4) {
1356 register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
1357 v &= 0xff00ff;
1358 *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
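/* Worked example for the scalar tail above (added): with the 32-bit word
 * x = 0xAARRGGBB, g = x & 0xFF00FF00 keeps the A and G bytes in place,
 * v = x & 0x00FF00FF isolates R and B, and (v>>16) + (v<<16) exchanges
 * them, so the sum is 0xAABBGGRR -- R and B swapped in three operations. */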
1362 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
1364 unsigned i;
1365 #if HAVE_MMX
1366 x86_reg mmx_size= 23 - src_size;
1367 __asm__ volatile (
1368 "test %%"REG_a", %%"REG_a" \n\t"
1369 "jns 2f \n\t"
1370 "movq "MANGLE(mask24r)", %%mm5 \n\t"
1371 "movq "MANGLE(mask24g)", %%mm6 \n\t"
1372 "movq "MANGLE(mask24b)", %%mm7 \n\t"
1373 ASMALIGN(4)
1374 "1: \n\t"
1375 PREFETCH" 32(%1, %%"REG_a") \n\t"
1376 "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1377 "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
1378 "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
1379 "psllq $16, %%mm0 \n\t" // 00 BGR BGR
1380 "pand %%mm5, %%mm0 \n\t"
1381 "pand %%mm6, %%mm1 \n\t"
1382 "pand %%mm7, %%mm2 \n\t"
1383 "por %%mm0, %%mm1 \n\t"
1384 "por %%mm2, %%mm1 \n\t"
1385 "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1386 MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
1387 "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
1388 "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
1389 "pand %%mm7, %%mm0 \n\t"
1390 "pand %%mm5, %%mm1 \n\t"
1391 "pand %%mm6, %%mm2 \n\t"
1392 "por %%mm0, %%mm1 \n\t"
1393 "por %%mm2, %%mm1 \n\t"
1394 "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
1395 MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
1396 "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
1397 "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
1398 "pand %%mm6, %%mm0 \n\t"
1399 "pand %%mm7, %%mm1 \n\t"
1400 "pand %%mm5, %%mm2 \n\t"
1401 "por %%mm0, %%mm1 \n\t"
1402 "por %%mm2, %%mm1 \n\t"
1403 MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
1404 "add $24, %%"REG_a" \n\t"
1405 " js 1b \n\t"
1406 "2: \n\t"
1407 : "+a" (mmx_size)
1408 : "r" (src-mmx_size), "r"(dst-mmx_size)
1411 __asm__ volatile(SFENCE:::"memory");
1412 __asm__ volatile(EMMS:::"memory");
1414 if (mmx_size==23) return; //finished, was multiple of 8
1416 src+= src_size;
1417 dst+= src_size;
1418 src_size= 23-mmx_size;
1419 src-= src_size;
1420 dst-= src_size;
1421 #endif
1422 for (i=0; i<src_size; i+=3) {
1423 register uint8_t x;
1424 x = src[i + 2];
1425 dst[i + 1] = src[i + 1];
1426 dst[i + 2] = src[i + 0];
1427 dst[i + 0] = x;
1431 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1432 long width, long height,
1433 long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1435 long y;
1436 const x86_reg chromWidth= width>>1;
1437 for (y=0; y<height; y++) {
1438 #if HAVE_MMX
1439 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1440 __asm__ volatile(
1441 "xor %%"REG_a", %%"REG_a" \n\t"
1442 ASMALIGN(4)
1443 "1: \n\t"
1444 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1445 PREFETCH" 32(%2, %%"REG_a") \n\t"
1446 PREFETCH" 32(%3, %%"REG_a") \n\t"
1447 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1448 "movq %%mm0, %%mm2 \n\t" // U(0)
1449 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1450 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1451 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1453 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1454 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1455 "movq %%mm3, %%mm4 \n\t" // Y(0)
1456 "movq %%mm5, %%mm6 \n\t" // Y(8)
1457 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1458 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1459 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1460 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1462 MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
1463 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1464 MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
1465 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1467 "add $8, %%"REG_a" \n\t"
1468 "cmp %4, %%"REG_a" \n\t"
1469 " jb 1b \n\t"
1470 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1471 : "%"REG_a
1473 #else
1475 #if ARCH_ALPHA && HAVE_MVI
1476 #define pl2yuy2(n) \
1477 y1 = yc[n]; \
1478 y2 = yc2[n]; \
1479 u = uc[n]; \
1480 v = vc[n]; \
1481 __asm__("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
1482 __asm__("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
1483 __asm__("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
1484 __asm__("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
1485 yuv1 = (u << 8) + (v << 24); \
1486 yuv2 = yuv1 + y2; \
1487 yuv1 += y1; \
1488 qdst[n] = yuv1; \
1489 qdst2[n] = yuv2;
1491 int i;
1492 uint64_t *qdst = (uint64_t *) dst;
1493 uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1494 const uint32_t *yc = (uint32_t *) ysrc;
1495 const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1496 const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1497 for (i = 0; i < chromWidth; i += 8) {
1498 uint64_t y1, y2, yuv1, yuv2;
1499 uint64_t u, v;
1500 /* Prefetch */
1501 __asm__("ldq $31,64(%0)" :: "r"(yc));
1502 __asm__("ldq $31,64(%0)" :: "r"(yc2));
1503 __asm__("ldq $31,64(%0)" :: "r"(uc));
1504 __asm__("ldq $31,64(%0)" :: "r"(vc));
1506 pl2yuy2(0);
1507 pl2yuy2(1);
1508 pl2yuy2(2);
1509 pl2yuy2(3);
1511 yc += 4;
1512 yc2 += 4;
1513 uc += 4;
1514 vc += 4;
1515 qdst += 4;
1516 qdst2 += 4;
1518 y++;
1519 ysrc += lumStride;
1520 dst += dstStride;
1522 #elif HAVE_FAST_64BIT
1523 int i;
1524 uint64_t *ldst = (uint64_t *) dst;
1525 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1526 for (i = 0; i < chromWidth; i += 2) {
1527 uint64_t k, l;
1528 k = yc[0] + (uc[0] << 8) +
1529 (yc[1] << 16) + (vc[0] << 24);
1530 l = yc[2] + (uc[1] << 8) +
1531 (yc[3] << 16) + (vc[1] << 24);
1532 *ldst++ = k + (l << 32);
1533 yc += 4;
1534 uc += 2;
1535 vc += 2;
1538 #else
1539 int i, *idst = (int32_t *) dst;
1540 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1541 for (i = 0; i < chromWidth; i++) {
1542 #if HAVE_BIGENDIAN
1543 *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
1544 (yc[1] << 8) + (vc[0] << 0);
1545 #else
1546 *idst++ = yc[0] + (uc[0] << 8) +
1547 (yc[1] << 16) + (vc[0] << 24);
1548 #endif
1549 yc += 2;
1550 uc++;
1551 vc++;
1553 #endif
1554 #endif
1555 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1556 usrc += chromStride;
1557 vsrc += chromStride;
1559 ysrc += lumStride;
1560 dst += dstStride;
1562 #if HAVE_MMX
1563 __asm__(EMMS" \n\t"
1564 SFENCE" \n\t"
1565 :::"memory");
1566 #endif
1569 /**
1570 * Height should be a multiple of 2 and width should be a multiple of 16.
1571 * (If this is a problem for anyone then tell me, and I will fix it.)
1572 */
1573 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1574 long width, long height,
1575 long lumStride, long chromStride, long dstStride)
1577 //FIXME interpolate chroma
1578 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1581 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1582 long width, long height,
1583 long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1585 long y;
1586 const x86_reg chromWidth= width>>1;
1587 for (y=0; y<height; y++) {
1588 #if HAVE_MMX
1589 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1590 __asm__ volatile(
1591 "xor %%"REG_a", %%"REG_a" \n\t"
1592 ASMALIGN(4)
1593 "1: \n\t"
1594 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1595 PREFETCH" 32(%2, %%"REG_a") \n\t"
1596 PREFETCH" 32(%3, %%"REG_a") \n\t"
1597 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1598 "movq %%mm0, %%mm2 \n\t" // U(0)
1599 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1600 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1601 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1603 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1604 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1605 "movq %%mm0, %%mm4 \n\t" // Y(0)
1606 "movq %%mm2, %%mm6 \n\t" // Y(8)
1607 "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
1608 "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
1609 "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
1610 "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)
1612 MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
1613 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1614 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
1615 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1617 "add $8, %%"REG_a" \n\t"
1618 "cmp %4, %%"REG_a" \n\t"
1619 " jb 1b \n\t"
1620 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1621 : "%"REG_a
1623 #else
1624 //FIXME adapt the Alpha ASM code from yv12->yuy2
1626 #if HAVE_FAST_64BIT
1627 int i;
1628 uint64_t *ldst = (uint64_t *) dst;
1629 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1630 for (i = 0; i < chromWidth; i += 2) {
1631 uint64_t k, l;
1632 k = uc[0] + (yc[0] << 8) +
1633 (vc[0] << 16) + (yc[1] << 24);
1634 l = uc[1] + (yc[2] << 8) +
1635 (vc[1] << 16) + (yc[3] << 24);
1636 *ldst++ = k + (l << 32);
1637 yc += 4;
1638 uc += 2;
1639 vc += 2;
1642 #else
1643 int i, *idst = (int32_t *) dst;
1644 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1645 for (i = 0; i < chromWidth; i++) {
1646 #if HAVE_BIGENDIAN
1647 *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
1648 (vc[0] << 8) + (yc[1] << 0);
1649 #else
1650 *idst++ = uc[0] + (yc[0] << 8) +
1651 (vc[0] << 16) + (yc[1] << 24);
1652 #endif
1653 yc += 2;
1654 uc++;
1655 vc++;
1657 #endif
1658 #endif
1659 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1660 usrc += chromStride;
1661 vsrc += chromStride;
1663 ysrc += lumStride;
1664 dst += dstStride;
1666 #if HAVE_MMX
1667 __asm__(EMMS" \n\t"
1668 SFENCE" \n\t"
1669 :::"memory");
1670 #endif
1673 /**
1674 * Height should be a multiple of 2 and width should be a multiple of 16.
1675 * (If this is a problem for anyone then tell me, and I will fix it.)
1676 */
1677 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1678 long width, long height,
1679 long lumStride, long chromStride, long dstStride)
1681 //FIXME interpolate chroma
1682 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1685 /**
1686 * Width should be a multiple of 16.
1687 */
1688 static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1689 long width, long height,
1690 long lumStride, long chromStride, long dstStride)
1692 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1695 /**
1696 * Width should be a multiple of 16.
1697 */
1698 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1699 long width, long height,
1700 long lumStride, long chromStride, long dstStride)
1702 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
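/* Note (added): vertLumPerChroma (a power of two) is the number of luma
 * lines per chroma line: the planar-to-packed workers above advance
 * usrc/vsrc only every vertLumPerChroma-th line, so 2 packs 4:2:0 input
 * (the yv12* wrappers) and 1 packs 4:2:2 input (the yuv422p* wrappers). */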
1705 /**
1706 * Height should be a multiple of 2 and width should be a multiple of 16.
1707 * (If this is a problem for anyone then tell me, and I will fix it.)
1708 */
1709 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1710 long width, long height,
1711 long lumStride, long chromStride, long srcStride)
1713 long y;
1714 const x86_reg chromWidth= width>>1;
1715 for (y=0; y<height; y+=2) {
1716 #if HAVE_MMX
1717 __asm__ volatile(
1718 "xor %%"REG_a", %%"REG_a" \n\t"
1719 "pcmpeqw %%mm7, %%mm7 \n\t"
1720 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1721 ASMALIGN(4)
1722 "1: \n\t"
1723 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1724 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1725 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1726 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1727 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1728 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1729 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1730 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1731 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1732 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1733 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1735 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
1737 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
1738 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
1739 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1740 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1741 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1742 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1743 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1744 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1745 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1746 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1748 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
1750 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1751 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1752 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1753 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1754 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1755 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1756 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1757 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1759 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1760 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1762 "add $8, %%"REG_a" \n\t"
1763 "cmp %4, %%"REG_a" \n\t"
1764 " jb 1b \n\t"
1765 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1766 : "memory", "%"REG_a
1769 ydst += lumStride;
1770 src += srcStride;
1772 __asm__ volatile(
1773 "xor %%"REG_a", %%"REG_a" \n\t"
1774 ASMALIGN(4)
1775 "1: \n\t"
1776 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1777 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1778 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1779 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
1780 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
1781 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1782 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1783 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1784 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1785 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1786 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1788 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
1789 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1791 "add $8, %%"REG_a" \n\t"
1792 "cmp %4, %%"REG_a" \n\t"
1793 " jb 1b \n\t"
1795 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1796 : "memory", "%"REG_a
1798 #else
1799 long i;
1800 for (i=0; i<chromWidth; i++) {
1801 ydst[2*i+0] = src[4*i+0];
1802 udst[i] = src[4*i+1];
1803 ydst[2*i+1] = src[4*i+2];
1804 vdst[i] = src[4*i+3];
1806 ydst += lumStride;
1807 src += srcStride;
1809 for (i=0; i<chromWidth; i++) {
1810 ydst[2*i+0] = src[4*i+0];
1811 ydst[2*i+1] = src[4*i+2];
1813 #endif
1814 udst += chromStride;
1815 vdst += chromStride;
1816 ydst += lumStride;
1817 src += srcStride;
1819 #if HAVE_MMX
1820 __asm__ volatile(EMMS" \n\t"
1821 SFENCE" \n\t"
1822 :::"memory");
1823 #endif
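/* Note (added): the YUY2 split above relies on mm7 = 0x00FF00FF...: pand
 * keeps the even-numbered (luma) bytes, psrlw $8 exposes the odd-numbered
 * (chroma) bytes, and packuswb narrows each register pair back to bytes. */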
1826 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1827 uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1828 long width, long height, long lumStride, long chromStride)
1830 /* Y Plane */
1831 memcpy(ydst, ysrc, width*height);
1833 /* XXX: implement upscaling for U,V */
1836 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
1838 long x,y;
1840 dst[0]= src[0];
1842 // first line
1843 for (x=0; x<srcWidth-1; x++) {
1844 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1845 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1847 dst[2*srcWidth-1]= src[srcWidth-1];
1849 dst+= dstStride;
1851 for (y=1; y<srcHeight; y++) {
1852 #if HAVE_MMX2 || HAVE_AMD3DNOW
1853 const x86_reg mmxSize= srcWidth&~15;
1854 __asm__ volatile(
1855 "mov %4, %%"REG_a" \n\t"
1856 "1: \n\t"
1857 "movq (%0, %%"REG_a"), %%mm0 \n\t"
1858 "movq (%1, %%"REG_a"), %%mm1 \n\t"
1859 "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
1860 "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
1861 "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
1862 "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
1863 PAVGB" %%mm0, %%mm5 \n\t"
1864 PAVGB" %%mm0, %%mm3 \n\t"
1865 PAVGB" %%mm0, %%mm5 \n\t"
1866 PAVGB" %%mm0, %%mm3 \n\t"
1867 PAVGB" %%mm1, %%mm4 \n\t"
1868 PAVGB" %%mm1, %%mm2 \n\t"
1869 PAVGB" %%mm1, %%mm4 \n\t"
1870 PAVGB" %%mm1, %%mm2 \n\t"
1871 "movq %%mm5, %%mm7 \n\t"
1872 "movq %%mm4, %%mm6 \n\t"
1873 "punpcklbw %%mm3, %%mm5 \n\t"
1874 "punpckhbw %%mm3, %%mm7 \n\t"
1875 "punpcklbw %%mm2, %%mm4 \n\t"
1876 "punpckhbw %%mm2, %%mm6 \n\t"
1877 #if 1
1878 MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t"
1879 MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t"
1880 MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t"
1881 MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t"
1882 #else
1883 "movq %%mm5, (%2, %%"REG_a", 2) \n\t"
1884 "movq %%mm7, 8(%2, %%"REG_a", 2) \n\t"
1885 "movq %%mm4, (%3, %%"REG_a", 2) \n\t"
1886 "movq %%mm6, 8(%3, %%"REG_a", 2) \n\t"
1887 #endif
1888 "add $8, %%"REG_a" \n\t"
1889 " js 1b \n\t"
1890 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
1891 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1892 "g" (-mmxSize)
1893 : "%"REG_a
1896 #else
1897 const x86_reg mmxSize=1;
1898 #endif
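/* Note (added): each PAVGB above is applied twice against the same source
 * row, i.e. avg(avg(a,b),b), which approximates (with slightly different
 * rounding) the (3*b + a)>>2 weighting that the scalar interpolation just
 * below computes exactly. */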
1899 dst[0 ]= (3*src[0] + src[srcStride])>>2;
1900 dst[dstStride]= ( src[0] + 3*src[srcStride])>>2;
1902 for (x=mmxSize-1; x<srcWidth-1; x++) {
1903 dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1904 dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1905 dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1906 dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
1908 dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
1909 dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1911 dst+=dstStride*2;
1912 src+=srcStride;
1915 // last line
1916 #if 1
1917 dst[0]= src[0];
1919 for (x=0; x<srcWidth-1; x++) {
1920 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1921 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1923 dst[2*srcWidth-1]= src[srcWidth-1];
1924 #else
1925 for (x=0; x<srcWidth; x++) {
1926 dst[2*x+0]=
1927 dst[2*x+1]= src[x];
1929 #endif
1931 #if HAVE_MMX
1932 __asm__ volatile(EMMS" \n\t"
1933 SFENCE" \n\t"
1934 :::"memory");
1935 #endif
1938 /**
1939 * Height should be a multiple of 2 and width should be a multiple of 16.
1940 * (If this is a problem for anyone then tell me, and I will fix it.)
1941 * Chrominance data is only taken from every second line, others are ignored.
1942 * FIXME: Write HQ version.
1943 */
1944 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1945 long width, long height,
1946 long lumStride, long chromStride, long srcStride)
1948 long y;
1949 const x86_reg chromWidth= width>>1;
1950 for (y=0; y<height; y+=2) {
1951 #if HAVE_MMX
1952 __asm__ volatile(
1953 "xor %%"REG_a", %%"REG_a" \n\t"
1954 "pcmpeqw %%mm7, %%mm7 \n\t"
1955 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1956 ASMALIGN(4)
1957 "1: \n\t"
1958 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1959 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
1960 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
1961 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
1962 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
1963 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
1964 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
1965 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1966 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1967 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1968 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1970 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
1972 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8)
1973 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12)
1974 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
1975 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
1976 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
1977 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
1978 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1979 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1980 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1981 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1983 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
1985 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1986 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1987 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1988 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1989 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1990 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1991 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1992 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1994 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1995 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1997 "add $8, %%"REG_a" \n\t"
1998 "cmp %4, %%"REG_a" \n\t"
1999 " jb 1b \n\t"
2000 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2001 : "memory", "%"REG_a
2004 ydst += lumStride;
2005 src += srcStride;
2007 __asm__ volatile(
2008 "xor %%"REG_a", %%"REG_a" \n\t"
2009 ASMALIGN(4)
2010 "1: \n\t"
2011 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
2012 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
2013 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
2014 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
2015 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
2016 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
2017 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
2018 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
2019 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
2020 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
2021 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
2023 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
2024 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
2026 "add $8, %%"REG_a" \n\t"
2027 "cmp %4, %%"REG_a" \n\t"
2028 " jb 1b \n\t"
2030 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2031 : "memory", "%"REG_a
2033 #else
2034 long i;
2035 for (i=0; i<chromWidth; i++) {
2036 udst[i] = src[4*i+0];
2037 ydst[2*i+0] = src[4*i+1];
2038 vdst[i] = src[4*i+2];
2039 ydst[2*i+1] = src[4*i+3];
2040 }
2041 ydst += lumStride;
2042 src += srcStride;
2044 for (i=0; i<chromWidth; i++) {
2045 ydst[2*i+0] = src[4*i+1];
2046 ydst[2*i+1] = src[4*i+3];
2047 }
2048 #endif
2049 udst += chromStride;
2050 vdst += chromStride;
2051 ydst += lumStride;
2052 src += srcStride;
2053 }
2054 #if HAVE_MMX
2055 __asm__ volatile(EMMS" \n\t"
2056 SFENCE" \n\t"
2057 :::"memory");
2058 #endif
2059 }
2061 /**
2062 * Height should be a multiple of 2 and width should be a multiple of 2.
2063 * (If this is a problem for anyone then tell me, and I will fix it.)
2064 * Chrominance data is only taken from every second line,
2065 * others are ignored in the C version.
2066 * FIXME: Write HQ version.
2067 */
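/* Both paths below are assumed to use the usual BT.601 weights scaled by
 * (1<<RGB2YUV_SHIFT): the MMX code via the ff_bgr2*Coeff/Offset tables, the
 * C code via the RY..BV macros, e.g.
 * Y = ((RY*r + GY*g + BY*b) >> RGB2YUV_SHIFT) + 16. */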
2068 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2069 long width, long height,
2070 long lumStride, long chromStride, long srcStride)
2071 {
2072 long y;
2073 const x86_reg chromWidth= width>>1;
2074 #if HAVE_MMX
2075 for (y=0; y<height-2; y+=2) {
2076 long i;
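// Each pass of the y loop converts two source rows: the i loop computes
// luma for both rows, then src is rewound (src -= srcStride*2) and the
// chroma pass averages the row pair into one U row and one V row.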
2077 for (i=0; i<2; i++) {
2078 __asm__ volatile(
2079 "mov %2, %%"REG_a" \n\t"
2080 "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t"
2081 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2082 "pxor %%mm7, %%mm7 \n\t"
2083 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
2084 ASMALIGN(4)
2085 "1: \n\t"
2086 PREFETCH" 64(%0, %%"REG_d") \n\t"
2087 "movd (%0, %%"REG_d"), %%mm0 \n\t"
2088 "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
2089 "punpcklbw %%mm7, %%mm0 \n\t"
2090 "punpcklbw %%mm7, %%mm1 \n\t"
2091 "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
2092 "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
2093 "punpcklbw %%mm7, %%mm2 \n\t"
2094 "punpcklbw %%mm7, %%mm3 \n\t"
2095 "pmaddwd %%mm6, %%mm0 \n\t"
2096 "pmaddwd %%mm6, %%mm1 \n\t"
2097 "pmaddwd %%mm6, %%mm2 \n\t"
2098 "pmaddwd %%mm6, %%mm3 \n\t"
2099 #ifndef FAST_BGR2YV12
2100 "psrad $8, %%mm0 \n\t"
2101 "psrad $8, %%mm1 \n\t"
2102 "psrad $8, %%mm2 \n\t"
2103 "psrad $8, %%mm3 \n\t"
2104 #endif
2105 "packssdw %%mm1, %%mm0 \n\t"
2106 "packssdw %%mm3, %%mm2 \n\t"
2107 "pmaddwd %%mm5, %%mm0 \n\t"
2108 "pmaddwd %%mm5, %%mm2 \n\t"
2109 "packssdw %%mm2, %%mm0 \n\t"
2110 "psraw $7, %%mm0 \n\t"
2112 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2113 "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
2114 "punpcklbw %%mm7, %%mm4 \n\t"
2115 "punpcklbw %%mm7, %%mm1 \n\t"
2116 "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
2117 "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
2118 "punpcklbw %%mm7, %%mm2 \n\t"
2119 "punpcklbw %%mm7, %%mm3 \n\t"
2120 "pmaddwd %%mm6, %%mm4 \n\t"
2121 "pmaddwd %%mm6, %%mm1 \n\t"
2122 "pmaddwd %%mm6, %%mm2 \n\t"
2123 "pmaddwd %%mm6, %%mm3 \n\t"
2124 #ifndef FAST_BGR2YV12
2125 "psrad $8, %%mm4 \n\t"
2126 "psrad $8, %%mm1 \n\t"
2127 "psrad $8, %%mm2 \n\t"
2128 "psrad $8, %%mm3 \n\t"
2129 #endif
2130 "packssdw %%mm1, %%mm4 \n\t"
2131 "packssdw %%mm3, %%mm2 \n\t"
2132 "pmaddwd %%mm5, %%mm4 \n\t"
2133 "pmaddwd %%mm5, %%mm2 \n\t"
2134 "add $24, %%"REG_d" \n\t"
2135 "packssdw %%mm2, %%mm4 \n\t"
2136 "psraw $7, %%mm4 \n\t"
2138 "packuswb %%mm4, %%mm0 \n\t"
2139 "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"
2141 MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
2142 "add $8, %%"REG_a" \n\t"
2143 " js 1b \n\t"
2144 : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
2145 : "%"REG_a, "%"REG_d
2147 ydst += lumStride;
2148 src += srcStride;
2149 }
2150 src -= srcStride*2;
2151 __asm__ volatile(
2152 "mov %4, %%"REG_a" \n\t"
2153 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2154 "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t"
2155 "pxor %%mm7, %%mm7 \n\t"
2156 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
2157 "add %%"REG_d", %%"REG_d" \n\t"
2158 ASMALIGN(4)
2159 "1: \n\t"
2160 PREFETCH" 64(%0, %%"REG_d") \n\t"
2161 PREFETCH" 64(%1, %%"REG_d") \n\t"
2162 #if HAVE_MMX2 || HAVE_AMD3DNOW
2163 "movq (%0, %%"REG_d"), %%mm0 \n\t"
2164 "movq (%1, %%"REG_d"), %%mm1 \n\t"
2165 "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
2166 "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
2167 PAVGB" %%mm1, %%mm0 \n\t"
2168 PAVGB" %%mm3, %%mm2 \n\t"
2169 "movq %%mm0, %%mm1 \n\t"
2170 "movq %%mm2, %%mm3 \n\t"
2171 "psrlq $24, %%mm0 \n\t"
2172 "psrlq $24, %%mm2 \n\t"
2173 PAVGB" %%mm1, %%mm0 \n\t"
2174 PAVGB" %%mm3, %%mm2 \n\t"
2175 "punpcklbw %%mm7, %%mm0 \n\t"
2176 "punpcklbw %%mm7, %%mm2 \n\t"
2177 #else
2178 "movd (%0, %%"REG_d"), %%mm0 \n\t"
2179 "movd (%1, %%"REG_d"), %%mm1 \n\t"
2180 "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
2181 "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
2182 "punpcklbw %%mm7, %%mm0 \n\t"
2183 "punpcklbw %%mm7, %%mm1 \n\t"
2184 "punpcklbw %%mm7, %%mm2 \n\t"
2185 "punpcklbw %%mm7, %%mm3 \n\t"
2186 "paddw %%mm1, %%mm0 \n\t"
2187 "paddw %%mm3, %%mm2 \n\t"
2188 "paddw %%mm2, %%mm0 \n\t"
2189 "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
2190 "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
2191 "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
2192 "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
2193 "punpcklbw %%mm7, %%mm4 \n\t"
2194 "punpcklbw %%mm7, %%mm1 \n\t"
2195 "punpcklbw %%mm7, %%mm2 \n\t"
2196 "punpcklbw %%mm7, %%mm3 \n\t"
2197 "paddw %%mm1, %%mm4 \n\t"
2198 "paddw %%mm3, %%mm2 \n\t"
2199 "paddw %%mm4, %%mm2 \n\t"
2200 "psrlw $2, %%mm0 \n\t"
2201 "psrlw $2, %%mm2 \n\t"
2202 #endif
2203 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
2204 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
2206 "pmaddwd %%mm0, %%mm1 \n\t"
2207 "pmaddwd %%mm2, %%mm3 \n\t"
2208 "pmaddwd %%mm6, %%mm0 \n\t"
2209 "pmaddwd %%mm6, %%mm2 \n\t"
2210 #ifndef FAST_BGR2YV12
2211 "psrad $8, %%mm0 \n\t"
2212 "psrad $8, %%mm1 \n\t"
2213 "psrad $8, %%mm2 \n\t"
2214 "psrad $8, %%mm3 \n\t"
2215 #endif
2216 "packssdw %%mm2, %%mm0 \n\t"
2217 "packssdw %%mm3, %%mm1 \n\t"
2218 "pmaddwd %%mm5, %%mm0 \n\t"
2219 "pmaddwd %%mm5, %%mm1 \n\t"
2220 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2221 "psraw $7, %%mm0 \n\t"
2223 #if HAVE_MMX2 || HAVE_AMD3DNOW
2224 "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
2225 "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
2226 "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
2227 "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
2228 PAVGB" %%mm1, %%mm4 \n\t"
2229 PAVGB" %%mm3, %%mm2 \n\t"
2230 "movq %%mm4, %%mm1 \n\t"
2231 "movq %%mm2, %%mm3 \n\t"
2232 "psrlq $24, %%mm4 \n\t"
2233 "psrlq $24, %%mm2 \n\t"
2234 PAVGB" %%mm1, %%mm4 \n\t"
2235 PAVGB" %%mm3, %%mm2 \n\t"
2236 "punpcklbw %%mm7, %%mm4 \n\t"
2237 "punpcklbw %%mm7, %%mm2 \n\t"
2238 #else
2239 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2240 "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
2241 "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
2242 "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
2243 "punpcklbw %%mm7, %%mm4 \n\t"
2244 "punpcklbw %%mm7, %%mm1 \n\t"
2245 "punpcklbw %%mm7, %%mm2 \n\t"
2246 "punpcklbw %%mm7, %%mm3 \n\t"
2247 "paddw %%mm1, %%mm4 \n\t"
2248 "paddw %%mm3, %%mm2 \n\t"
2249 "paddw %%mm2, %%mm4 \n\t"
2250 "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
2251 "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
2252 "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
2253 "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
2254 "punpcklbw %%mm7, %%mm5 \n\t"
2255 "punpcklbw %%mm7, %%mm1 \n\t"
2256 "punpcklbw %%mm7, %%mm2 \n\t"
2257 "punpcklbw %%mm7, %%mm3 \n\t"
2258 "paddw %%mm1, %%mm5 \n\t"
2259 "paddw %%mm3, %%mm2 \n\t"
2260 "paddw %%mm5, %%mm2 \n\t"
2261 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2262 "psrlw $2, %%mm4 \n\t"
2263 "psrlw $2, %%mm2 \n\t"
2264 #endif
2265 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
2266 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
2268 "pmaddwd %%mm4, %%mm1 \n\t"
2269 "pmaddwd %%mm2, %%mm3 \n\t"
2270 "pmaddwd %%mm6, %%mm4 \n\t"
2271 "pmaddwd %%mm6, %%mm2 \n\t"
2272 #ifndef FAST_BGR2YV12
2273 "psrad $8, %%mm4 \n\t"
2274 "psrad $8, %%mm1 \n\t"
2275 "psrad $8, %%mm2 \n\t"
2276 "psrad $8, %%mm3 \n\t"
2277 #endif
2278 "packssdw %%mm2, %%mm4 \n\t"
2279 "packssdw %%mm3, %%mm1 \n\t"
2280 "pmaddwd %%mm5, %%mm4 \n\t"
2281 "pmaddwd %%mm5, %%mm1 \n\t"
2282 "add $24, %%"REG_d" \n\t"
2283 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2284 "psraw $7, %%mm4 \n\t"
2286 "movq %%mm0, %%mm1 \n\t"
2287 "punpckldq %%mm4, %%mm0 \n\t"
2288 "punpckhdq %%mm4, %%mm1 \n\t"
2289 "packsswb %%mm1, %%mm0 \n\t"
2290 "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
2291 "movd %%mm0, (%2, %%"REG_a") \n\t"
2292 "punpckhdq %%mm0, %%mm0 \n\t"
2293 "movd %%mm0, (%3, %%"REG_a") \n\t"
2294 "add $4, %%"REG_a" \n\t"
2295 " js 1b \n\t"
2296 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2297 : "%"REG_a, "%"REG_d
2300 udst += chromStride;
2301 vdst += chromStride;
2302 src += srcStride*2;
2303 }
2305 __asm__ volatile(EMMS" \n\t"
2306 SFENCE" \n\t"
2307 :::"memory");
2308 #else
2309 y=0;
2310 #endif
2311 for (; y<height; y+=2) {
2312 long i;
2313 for (i=0; i<chromWidth; i++) {
2314 unsigned int b = src[6*i+0];
2315 unsigned int g = src[6*i+1];
2316 unsigned int r = src[6*i+2];
2318 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2319 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2320 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2322 udst[i] = U;
2323 vdst[i] = V;
2324 ydst[2*i] = Y;
2326 b = src[6*i+3];
2327 g = src[6*i+4];
2328 r = src[6*i+5];
2330 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2331 ydst[2*i+1] = Y;
2332 }
2333 ydst += lumStride;
2334 src += srcStride;
2336 for (i=0; i<chromWidth; i++) {
2337 unsigned int b = src[6*i+0];
2338 unsigned int g = src[6*i+1];
2339 unsigned int r = src[6*i+2];
2341 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2343 ydst[2*i] = Y;
2345 b = src[6*i+3];
2346 g = src[6*i+4];
2347 r = src[6*i+5];
2349 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2350 ydst[2*i+1] = Y;
2351 }
2352 udst += chromStride;
2353 vdst += chromStride;
2354 ydst += lumStride;
2355 src += srcStride;
2356 }
2357 }
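/* Interleaves two byte planes: dest[2*w+0] = src1[w], dest[2*w+1] = src2[w],
 * e.g. for merging separate U and V planes into packed UV lines. The SSE2
 * path uses movdqa/movntdq and therefore assumes 16-byte-aligned rows. */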
2359 static void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2360 long width, long height, long src1Stride,
2361 long src2Stride, long dstStride)
2362 {
2363 long h;
2365 for (h=0; h < height; h++) {
2366 long w;
2368 #if HAVE_MMX
2369 #if HAVE_SSE2
2370 __asm__(
2371 "xor %%"REG_a", %%"REG_a" \n\t"
2372 "1: \n\t"
2373 PREFETCH" 64(%1, %%"REG_a") \n\t"
2374 PREFETCH" 64(%2, %%"REG_a") \n\t"
2375 "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
2376 "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
2377 "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
2378 "punpcklbw %%xmm2, %%xmm0 \n\t"
2379 "punpckhbw %%xmm2, %%xmm1 \n\t"
2380 "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t"
2381 "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t"
2382 "add $16, %%"REG_a" \n\t"
2383 "cmp %3, %%"REG_a" \n\t"
2384 " jb 1b \n\t"
2385 ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
2386 : "memory", "%"REG_a""
2388 #else
2389 __asm__(
2390 "xor %%"REG_a", %%"REG_a" \n\t"
2391 "1: \n\t"
2392 PREFETCH" 64(%1, %%"REG_a") \n\t"
2393 PREFETCH" 64(%2, %%"REG_a") \n\t"
2394 "movq (%1, %%"REG_a"), %%mm0 \n\t"
2395 "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
2396 "movq %%mm0, %%mm1 \n\t"
2397 "movq %%mm2, %%mm3 \n\t"
2398 "movq (%2, %%"REG_a"), %%mm4 \n\t"
2399 "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
2400 "punpcklbw %%mm4, %%mm0 \n\t"
2401 "punpckhbw %%mm4, %%mm1 \n\t"
2402 "punpcklbw %%mm5, %%mm2 \n\t"
2403 "punpckhbw %%mm5, %%mm3 \n\t"
2404 MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t"
2405 MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t"
2406 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t"
2407 MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t"
2408 "add $16, %%"REG_a" \n\t"
2409 "cmp %3, %%"REG_a" \n\t"
2410 " jb 1b \n\t"
2411 ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
2412 : "memory", "%"REG_a
2414 #endif
2415 for (w= (width&(~15)); w < width; w++) {
2416 dest[2*w+0] = src1[w];
2417 dest[2*w+1] = src2[w];
2418 }
2419 #else
2420 for (w=0; w < width; w++) {
2421 dest[2*w+0] = src1[w];
2422 dest[2*w+1] = src2[w];
2423 }
2424 #endif
2425 dest += dstStride;
2426 src1 += src1Stride;
2427 src2 += src2Stride;
2428 }
2429 #if HAVE_MMX
2430 __asm__(
2431 EMMS" \n\t"
2432 SFENCE" \n\t"
2433 ::: "memory"
2435 #endif
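/* Upsamples two quarter-size chroma planes by 2x in each direction: every
 * source byte is doubled horizontally (punpck with itself) and every source
 * row feeds two destination rows (srcStride*(y>>1)). */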
2438 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2439 uint8_t *dst1, uint8_t *dst2,
2440 long width, long height,
2441 long srcStride1, long srcStride2,
2442 long dstStride1, long dstStride2)
2443 {
2444 x86_reg y;
2445 long x,w,h;
2446 w=width/2; h=height/2;
2447 #if HAVE_MMX
2448 __asm__ volatile(
2449 PREFETCH" %0 \n\t"
2450 PREFETCH" %1 \n\t"
2451 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
2452 #endif
2453 for (y=0;y<h;y++) {
2454 const uint8_t* s1=src1+srcStride1*(y>>1);
2455 uint8_t* d=dst1+dstStride1*y;
2456 x=0;
2457 #if HAVE_MMX
2458 for (;x<w-31;x+=32) {
2459 __asm__ volatile(
2460 PREFETCH" 32%1 \n\t"
2461 "movq %1, %%mm0 \n\t"
2462 "movq 8%1, %%mm2 \n\t"
2463 "movq 16%1, %%mm4 \n\t"
2464 "movq 24%1, %%mm6 \n\t"
2465 "movq %%mm0, %%mm1 \n\t"
2466 "movq %%mm2, %%mm3 \n\t"
2467 "movq %%mm4, %%mm5 \n\t"
2468 "movq %%mm6, %%mm7 \n\t"
2469 "punpcklbw %%mm0, %%mm0 \n\t"
2470 "punpckhbw %%mm1, %%mm1 \n\t"
2471 "punpcklbw %%mm2, %%mm2 \n\t"
2472 "punpckhbw %%mm3, %%mm3 \n\t"
2473 "punpcklbw %%mm4, %%mm4 \n\t"
2474 "punpckhbw %%mm5, %%mm5 \n\t"
2475 "punpcklbw %%mm6, %%mm6 \n\t"
2476 "punpckhbw %%mm7, %%mm7 \n\t"
2477 MOVNTQ" %%mm0, %0 \n\t"
2478 MOVNTQ" %%mm1, 8%0 \n\t"
2479 MOVNTQ" %%mm2, 16%0 \n\t"
2480 MOVNTQ" %%mm3, 24%0 \n\t"
2481 MOVNTQ" %%mm4, 32%0 \n\t"
2482 MOVNTQ" %%mm5, 40%0 \n\t"
2483 MOVNTQ" %%mm6, 48%0 \n\t"
2484 MOVNTQ" %%mm7, 56%0"
2485 :"=m"(d[2*x])
2486 :"m"(s1[x])
2487 :"memory");
2489 #endif
2490 for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
2491 }
2492 for (y=0;y<h;y++) {
2493 const uint8_t* s2=src2+srcStride2*(y>>1);
2494 uint8_t* d=dst2+dstStride2*y;
2495 x=0;
2496 #if HAVE_MMX
2497 for (;x<w-31;x+=32) {
2498 __asm__ volatile(
2499 PREFETCH" 32%1 \n\t"
2500 "movq %1, %%mm0 \n\t"
2501 "movq 8%1, %%mm2 \n\t"
2502 "movq 16%1, %%mm4 \n\t"
2503 "movq 24%1, %%mm6 \n\t"
2504 "movq %%mm0, %%mm1 \n\t"
2505 "movq %%mm2, %%mm3 \n\t"
2506 "movq %%mm4, %%mm5 \n\t"
2507 "movq %%mm6, %%mm7 \n\t"
2508 "punpcklbw %%mm0, %%mm0 \n\t"
2509 "punpckhbw %%mm1, %%mm1 \n\t"
2510 "punpcklbw %%mm2, %%mm2 \n\t"
2511 "punpckhbw %%mm3, %%mm3 \n\t"
2512 "punpcklbw %%mm4, %%mm4 \n\t"
2513 "punpckhbw %%mm5, %%mm5 \n\t"
2514 "punpcklbw %%mm6, %%mm6 \n\t"
2515 "punpckhbw %%mm7, %%mm7 \n\t"
2516 MOVNTQ" %%mm0, %0 \n\t"
2517 MOVNTQ" %%mm1, 8%0 \n\t"
2518 MOVNTQ" %%mm2, 16%0 \n\t"
2519 MOVNTQ" %%mm3, 24%0 \n\t"
2520 MOVNTQ" %%mm4, 32%0 \n\t"
2521 MOVNTQ" %%mm5, 40%0 \n\t"
2522 MOVNTQ" %%mm6, 48%0 \n\t"
2523 MOVNTQ" %%mm7, 56%0"
2524 :"=m"(d[2*x])
2525 :"m"(s2[x])
2526 :"memory");
2528 #endif
2529 for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2530 }
2531 #if HAVE_MMX
2532 __asm__(
2533 EMMS" \n\t"
2534 SFENCE" \n\t"
2535 ::: "memory"
2537 #endif
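/* YVU9 (one chroma sample per 4x4 luma block) -> packed YUY2: U and V are
 * repeated four times horizontally and reused for four lines (y>>2),
 * interleaved with luma as Y U Y V. */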
2540 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2541 uint8_t *dst,
2542 long width, long height,
2543 long srcStride1, long srcStride2,
2544 long srcStride3, long dstStride)
2545 {
2546 x86_reg x;
2547 long y,w,h;
2548 w=width/2; h=height;
2549 for (y=0;y<h;y++) {
2550 const uint8_t* yp=src1+srcStride1*y;
2551 const uint8_t* up=src2+srcStride2*(y>>2);
2552 const uint8_t* vp=src3+srcStride3*(y>>2);
2553 uint8_t* d=dst+dstStride*y;
2554 x=0;
2555 #if HAVE_MMX
2556 for (;x<w-7;x+=8) {
2557 __asm__ volatile(
2558 PREFETCH" 32(%1, %0) \n\t"
2559 PREFETCH" 32(%2, %0) \n\t"
2560 PREFETCH" 32(%3, %0) \n\t"
2561 "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2562 "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
2563 "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
2564 "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2565 "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
2566 "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
2567 "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
2568 "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
2569 "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
2570 "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */
2572 "movq %%mm1, %%mm6 \n\t"
2573 "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
2574 "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2575 "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2576 MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
2577 MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"
2579 "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
2580 "movq 8(%1, %0, 4), %%mm0 \n\t"
2581 "movq %%mm0, %%mm3 \n\t"
2582 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
2583 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
2584 MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
2585 MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"
2587 "movq %%mm4, %%mm6 \n\t"
2588 "movq 16(%1, %0, 4), %%mm0 \n\t"
2589 "movq %%mm0, %%mm3 \n\t"
2590 "punpcklbw %%mm5, %%mm4 \n\t"
2591 "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
2592 "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
2593 MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
2594 MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"
2596 "punpckhbw %%mm5, %%mm6 \n\t"
2597 "movq 24(%1, %0, 4), %%mm0 \n\t"
2598 "movq %%mm0, %%mm3 \n\t"
2599 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
2600 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
2601 MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
2602 MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"
2604 : "+r" (x)
2605 : "r"(yp), "r" (up), "r"(vp), "r"(d)
2606 :"memory");
2608 #endif
2609 for (; x<w; x++) {
2610 const long x2 = x<<2;
2611 d[8*x+0] = yp[x2];
2612 d[8*x+1] = up[x];
2613 d[8*x+2] = yp[x2+1];
2614 d[8*x+3] = vp[x];
2615 d[8*x+4] = yp[x2+2];
2616 d[8*x+5] = up[x];
2617 d[8*x+6] = yp[x2+3];
2618 d[8*x+7] = vp[x];
2619 }
2620 }
2621 #if HAVE_MMX
2622 __asm__(
2623 EMMS" \n\t"
2624 SFENCE" \n\t"
2625 ::: "memory"
2627 #endif
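/* extract_even: dst[i] = src[2*i] for 0 <= i < count, i.e. the even bytes
 * (e.g. luma from YUYV). The MMX loop runs an index from -count up to 0,
 * masking and packing 16 output bytes per iteration; the scalar loop
 * finishes the tail. */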
2630 static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
2631 {
2632 dst += count;
2633 src += 2*count;
2634 count= - count;
2636 #if HAVE_MMX
2637 if(count <= -16) {
2638 count += 15;
2639 __asm__ volatile(
2640 "pcmpeqw %%mm7, %%mm7 \n\t"
2641 "psrlw $8, %%mm7 \n\t"
2642 "1: \n\t"
2643 "movq -30(%1, %0, 2), %%mm0 \n\t"
2644 "movq -22(%1, %0, 2), %%mm1 \n\t"
2645 "movq -14(%1, %0, 2), %%mm2 \n\t"
2646 "movq -6(%1, %0, 2), %%mm3 \n\t"
2647 "pand %%mm7, %%mm0 \n\t"
2648 "pand %%mm7, %%mm1 \n\t"
2649 "pand %%mm7, %%mm2 \n\t"
2650 "pand %%mm7, %%mm3 \n\t"
2651 "packuswb %%mm1, %%mm0 \n\t"
2652 "packuswb %%mm3, %%mm2 \n\t"
2653 MOVNTQ" %%mm0,-15(%2, %0) \n\t"
2654 MOVNTQ" %%mm2,- 7(%2, %0) \n\t"
2655 "add $16, %0 \n\t"
2656 " js 1b \n\t"
2657 : "+r"(count)
2658 : "r"(src), "r"(dst)
2660 count -= 15;
2662 #endif
2663 while(count<0) {
2664 dst[count]= src[2*count];
2665 count++;
2666 }
2667 }
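/* extract_even2: dst0[i] = src[4*i], dst1[i] = src[4*i+2] -- splits the two
 * even-offset channels of each 4-byte group (e.g. U and V from UYVY). */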
2669 static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2670 {
2671 dst0+= count;
2672 dst1+= count;
2673 src += 4*count;
2674 count= - count;
2675 #if HAVE_MMX
2676 if(count <= -8) {
2677 count += 7;
2678 __asm__ volatile(
2679 "pcmpeqw %%mm7, %%mm7 \n\t"
2680 "psrlw $8, %%mm7 \n\t"
2681 "1: \n\t"
2682 "movq -28(%1, %0, 4), %%mm0 \n\t"
2683 "movq -20(%1, %0, 4), %%mm1 \n\t"
2684 "movq -12(%1, %0, 4), %%mm2 \n\t"
2685 "movq -4(%1, %0, 4), %%mm3 \n\t"
2686 "pand %%mm7, %%mm0 \n\t"
2687 "pand %%mm7, %%mm1 \n\t"
2688 "pand %%mm7, %%mm2 \n\t"
2689 "pand %%mm7, %%mm3 \n\t"
2690 "packuswb %%mm1, %%mm0 \n\t"
2691 "packuswb %%mm3, %%mm2 \n\t"
2692 "movq %%mm0, %%mm1 \n\t"
2693 "movq %%mm2, %%mm3 \n\t"
2694 "psrlw $8, %%mm0 \n\t"
2695 "psrlw $8, %%mm2 \n\t"
2696 "pand %%mm7, %%mm1 \n\t"
2697 "pand %%mm7, %%mm3 \n\t"
2698 "packuswb %%mm2, %%mm0 \n\t"
2699 "packuswb %%mm3, %%mm1 \n\t"
2700 MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
2701 MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
2702 "add $8, %0 \n\t"
2703 " js 1b \n\t"
2704 : "+r"(count)
2705 : "r"(src), "r"(dst0), "r"(dst1)
2707 count -= 7;
2709 #endif
2710 while(count<0) {
2711 dst0[count]= src[4*count+0];
2712 dst1[count]= src[4*count+2];
2713 count++;
2714 }
2715 }
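/* extract_even2avg: like extract_even2, but averages rows src0 and src1
 * first. Note that PAVGB rounds ((a+b+1)>>1) while the scalar fallback
 * truncates ((a+b)>>1). */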
2717 static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2718 {
2719 dst0 += count;
2720 dst1 += count;
2721 src0 += 4*count;
2722 src1 += 4*count;
2723 count= - count;
2724 #ifdef PAVGB
2725 if(count <= -8) {
2726 count += 7;
2727 __asm__ volatile(
2728 "pcmpeqw %%mm7, %%mm7 \n\t"
2729 "psrlw $8, %%mm7 \n\t"
2730 "1: \n\t"
2731 "movq -28(%1, %0, 4), %%mm0 \n\t"
2732 "movq -20(%1, %0, 4), %%mm1 \n\t"
2733 "movq -12(%1, %0, 4), %%mm2 \n\t"
2734 "movq -4(%1, %0, 4), %%mm3 \n\t"
2735 PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
2736 PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
2737 PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2738 PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
2739 "pand %%mm7, %%mm0 \n\t"
2740 "pand %%mm7, %%mm1 \n\t"
2741 "pand %%mm7, %%mm2 \n\t"
2742 "pand %%mm7, %%mm3 \n\t"
2743 "packuswb %%mm1, %%mm0 \n\t"
2744 "packuswb %%mm3, %%mm2 \n\t"
2745 "movq %%mm0, %%mm1 \n\t"
2746 "movq %%mm2, %%mm3 \n\t"
2747 "psrlw $8, %%mm0 \n\t"
2748 "psrlw $8, %%mm2 \n\t"
2749 "pand %%mm7, %%mm1 \n\t"
2750 "pand %%mm7, %%mm3 \n\t"
2751 "packuswb %%mm2, %%mm0 \n\t"
2752 "packuswb %%mm3, %%mm1 \n\t"
2753 MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2754 MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2755 "add $8, %0 \n\t"
2756 " js 1b \n\t"
2757 : "+r"(count)
2758 : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
2760 count -= 7;
2762 #endif
2763 while(count<0) {
2764 dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2765 dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
2766 count++;
2767 }
2768 }
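/* extract_odd2: dst0[i] = src[4*i+1], dst1[i] = src[4*i+3] -- the odd-offset
 * channels; the src++ before the scalar loop keeps its indices in step with
 * the MMX path. */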
2770 static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2771 {
2772 dst0+= count;
2773 dst1+= count;
2774 src += 4*count;
2775 count= - count;
2776 #if HAVE_MMX
2777 if(count <= -8) {
2778 count += 7;
2779 __asm__ volatile(
2780 "pcmpeqw %%mm7, %%mm7 \n\t"
2781 "psrlw $8, %%mm7 \n\t"
2782 "1: \n\t"
2783 "movq -28(%1, %0, 4), %%mm0 \n\t"
2784 "movq -20(%1, %0, 4), %%mm1 \n\t"
2785 "movq -12(%1, %0, 4), %%mm2 \n\t"
2786 "movq -4(%1, %0, 4), %%mm3 \n\t"
2787 "psrlw $8, %%mm0 \n\t"
2788 "psrlw $8, %%mm1 \n\t"
2789 "psrlw $8, %%mm2 \n\t"
2790 "psrlw $8, %%mm3 \n\t"
2791 "packuswb %%mm1, %%mm0 \n\t"
2792 "packuswb %%mm3, %%mm2 \n\t"
2793 "movq %%mm0, %%mm1 \n\t"
2794 "movq %%mm2, %%mm3 \n\t"
2795 "psrlw $8, %%mm0 \n\t"
2796 "psrlw $8, %%mm2 \n\t"
2797 "pand %%mm7, %%mm1 \n\t"
2798 "pand %%mm7, %%mm3 \n\t"
2799 "packuswb %%mm2, %%mm0 \n\t"
2800 "packuswb %%mm3, %%mm1 \n\t"
2801 MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
2802 MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
2803 "add $8, %0 \n\t"
2804 " js 1b \n\t"
2805 : "+r"(count)
2806 : "r"(src), "r"(dst0), "r"(dst1)
2808 count -= 7;
2810 #endif
2811 src++;
2812 while(count<0) {
2813 dst0[count]= src[4*count+0];
2814 dst1[count]= src[4*count+2];
2815 count++;
2816 }
2817 }
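/* extract_odd2avg: odd-offset variant of extract_even2avg, with the same
 * PAVGB-rounds / C-truncates difference between the two paths. */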
2819 static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2820 {
2821 dst0 += count;
2822 dst1 += count;
2823 src0 += 4*count;
2824 src1 += 4*count;
2825 count= - count;
2826 #ifdef PAVGB
2827 if(count <= -8) {
2828 count += 7;
2829 __asm__ volatile(
2830 "pcmpeqw %%mm7, %%mm7 \n\t"
2831 "psrlw $8, %%mm7 \n\t"
2832 "1: \n\t"
2833 "movq -28(%1, %0, 4), %%mm0 \n\t"
2834 "movq -20(%1, %0, 4), %%mm1 \n\t"
2835 "movq -12(%1, %0, 4), %%mm2 \n\t"
2836 "movq -4(%1, %0, 4), %%mm3 \n\t"
2837 PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
2838 PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
2839 PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2840 PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
2841 "psrlw $8, %%mm0 \n\t"
2842 "psrlw $8, %%mm1 \n\t"
2843 "psrlw $8, %%mm2 \n\t"
2844 "psrlw $8, %%mm3 \n\t"
2845 "packuswb %%mm1, %%mm0 \n\t"
2846 "packuswb %%mm3, %%mm2 \n\t"
2847 "movq %%mm0, %%mm1 \n\t"
2848 "movq %%mm2, %%mm3 \n\t"
2849 "psrlw $8, %%mm0 \n\t"
2850 "psrlw $8, %%mm2 \n\t"
2851 "pand %%mm7, %%mm1 \n\t"
2852 "pand %%mm7, %%mm3 \n\t"
2853 "packuswb %%mm2, %%mm0 \n\t"
2854 "packuswb %%mm3, %%mm1 \n\t"
2855 MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2856 MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2857 "add $8, %0 \n\t"
2858 " js 1b \n\t"
2859 : "+r"(count)
2860 : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
2862 count -= 7;
2864 #endif
2865 src0++;
2866 src1++;
2867 while(count<0) {
2868 dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2869 dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
2870 count++;
2871 }
2872 }
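/* YUYV (YUY2) -> planar YV12: the even bytes of every line are luma; the
 * odd bytes are chroma, vertically averaged over each line pair on the odd
 * rows. */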
2874 static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2875 long width, long height,
2876 long lumStride, long chromStride, long srcStride)
2877 {
2878 long y;
2879 const long chromWidth= -((-width)>>1);
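// -((-width)>>1) is ceil(width/2): an odd trailing luma pixel still needs
// one chroma sample.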
2881 for (y=0; y<height; y++) {
2882 RENAME(extract_even)(src, ydst, width);
2883 if(y&1) {
2884 RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth);
2885 udst+= chromStride;
2886 vdst+= chromStride;
2887 }
2889 src += srcStride;
2890 ydst+= lumStride;
2891 }
2892 #if HAVE_MMX
2893 __asm__(
2894 EMMS" \n\t"
2895 SFENCE" \n\t"
2896 ::: "memory"
2898 #endif
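/* UYVY -> YV12: src+1 puts the Y bytes on even offsets for extract_even;
 * chroma already sits on the even offsets of the unshifted line, hence
 * extract_even2avg below. */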
2901 static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2902 long width, long height,
2903 long lumStride, long chromStride, long srcStride)
2904 {
2905 long y;
2906 const long chromWidth= -((-width)>>1);
2908 for (y=0; y<height; y++) {
2909 RENAME(extract_even)(src, ydst, width);
2910 RENAME(extract_odd2)(src, udst, vdst, chromWidth);
2912 src += srcStride;
2913 ydst+= lumStride;
2914 udst+= chromStride;
2915 vdst+= chromStride;
2916 }
2917 #if HAVE_MMX
2918 __asm__(
2919 EMMS" \n\t"
2920 SFENCE" \n\t"
2921 ::: "memory"
2923 #endif
2926 static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2927 long width, long height,
2928 long lumStride, long chromStride, long srcStride)
2929 {
2930 long y;
2931 const long chromWidth= -((-width)>>1);
2933 for (y=0; y<height; y++) {
2934 RENAME(extract_even)(src+1, ydst, width);
2935 if(y&1) {
2936 RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
2937 udst+= chromStride;
2938 vdst+= chromStride;
2939 }
2941 src += srcStride;
2942 ydst+= lumStride;
2943 }
2944 #if HAVE_MMX
2945 __asm__(
2946 EMMS" \n\t"
2947 SFENCE" \n\t"
2948 ::: "memory"
2950 #endif
2953 static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2954 long width, long height,
2955 long lumStride, long chromStride, long srcStride)
2956 {
2957 long y;
2958 const long chromWidth= -((-width)>>1);
2960 for (y=0; y<height; y++) {
2961 RENAME(extract_even)(src+1, ydst, width);
2962 RENAME(extract_even2)(src, udst, vdst, chromWidth);
2964 src += srcStride;
2965 ydst+= lumStride;
2966 udst+= chromStride;
2967 vdst+= chromStride;
2968 }
2969 #if HAVE_MMX
2970 __asm__(
2971 EMMS" \n\t"
2972 SFENCE" \n\t"
2973 ::: "memory"
2975 #endif
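/* Registers this template's versions in the global function pointers from
 * rgb2rgb.h. RENAME() is presumed to append the CPU-specific suffix
 * (_C, _MMX, _MMX2, ...) that rgb2rgb.c defines before each inclusion of
 * this file. */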
2978 static inline void RENAME(rgb2rgb_init)(void)
2979 {
2980 rgb15to16 = RENAME(rgb15to16);
2981 rgb15tobgr24 = RENAME(rgb15tobgr24);
2982 rgb15to32 = RENAME(rgb15to32);
2983 rgb16tobgr24 = RENAME(rgb16tobgr24);
2984 rgb16to32 = RENAME(rgb16to32);
2985 rgb16to15 = RENAME(rgb16to15);
2986 rgb24tobgr16 = RENAME(rgb24tobgr16);
2987 rgb24tobgr15 = RENAME(rgb24tobgr15);
2988 rgb24tobgr32 = RENAME(rgb24tobgr32);
2989 rgb32to16 = RENAME(rgb32to16);
2990 rgb32to15 = RENAME(rgb32to15);
2991 rgb32tobgr24 = RENAME(rgb32tobgr24);
2992 rgb24to15 = RENAME(rgb24to15);
2993 rgb24to16 = RENAME(rgb24to16);
2994 rgb24tobgr24 = RENAME(rgb24tobgr24);
2995 rgb32tobgr32 = RENAME(rgb32tobgr32);
2996 rgb32tobgr16 = RENAME(rgb32tobgr16);
2997 rgb32tobgr15 = RENAME(rgb32tobgr15);
2998 yv12toyuy2 = RENAME(yv12toyuy2);
2999 yv12touyvy = RENAME(yv12touyvy);
3000 yuv422ptoyuy2 = RENAME(yuv422ptoyuy2);
3001 yuv422ptouyvy = RENAME(yuv422ptouyvy);
3002 yuy2toyv12 = RENAME(yuy2toyv12);
3003 // yvu9toyv12 = RENAME(yvu9toyv12);
3004 planar2x = RENAME(planar2x);
3005 rgb24toyv12 = RENAME(rgb24toyv12);
3006 interleaveBytes = RENAME(interleaveBytes);
3007 vu9_to_vu12 = RENAME(vu9_to_vu12);
3008 yvu9_to_yuy2 = RENAME(yvu9_to_yuy2);
3010 uyvytoyuv420 = RENAME(uyvytoyuv420);
3011 uyvytoyuv422 = RENAME(uyvytoyuv422);
3012 yuyvtoyuv420 = RENAME(yuyvtoyuv420);
3013 yuyvtoyuv422 = RENAME(yuyvtoyuv422);
3014 }