2 * software RGB to RGB converter
3 * plus software PAL8 to RGB converter
4 * software YUV to YUV converter
5 * software YUV to RGB converter
6 * Written by Nick Kurshev.
7 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
8 * lots of big-endian byte-order fixes by Alex Beregszaszi
10 * This file is part of FFmpeg.
12 * FFmpeg is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
17 * FFmpeg is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
22 * You should have received a copy of the GNU General Public License
23 * along with FFmpeg; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
26 * The C code (not assembly, MMX, ...) of this file can be used
27 * under the LGPL license.
46 #define PREFETCH "prefetch"
47 #define PAVGB "pavgusb"
49 #define PREFETCH "prefetchnta"
52 #define PREFETCH " # nop"
56 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
63 #define MOVNTQ "movntq"
64 #define SFENCE "sfence"
67 #define SFENCE " # nop"
70 static inline void RENAME(rgb24tobgr32
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
73 const uint8_t *s
= src
;
76 const uint8_t *mm_end
;
80 __asm__
volatile(PREFETCH
" %0"::"m"(*s
):"memory");
82 __asm__
volatile("movq %0, %%mm7"::"m"(mask32a
):"memory");
87 "punpckldq 3%1, %%mm0 \n\t"
88 "movd 6%1, %%mm1 \n\t"
89 "punpckldq 9%1, %%mm1 \n\t"
90 "movd 12%1, %%mm2 \n\t"
91 "punpckldq 15%1, %%mm2 \n\t"
92 "movd 18%1, %%mm3 \n\t"
93 "punpckldq 21%1, %%mm3 \n\t"
94 "por %%mm7, %%mm0 \n\t"
95 "por %%mm7, %%mm1 \n\t"
96 "por %%mm7, %%mm2 \n\t"
97 "por %%mm7, %%mm3 \n\t"
98 MOVNTQ
" %%mm0, %0 \n\t"
99 MOVNTQ
" %%mm1, 8%0 \n\t"
100 MOVNTQ
" %%mm2, 16%0 \n\t"
108 __asm__
volatile(SFENCE:::"memory");
109 __asm__
volatile(EMMS:::"memory");
113 /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
128 static inline void RENAME(rgb32tobgr24
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
131 const uint8_t *s
= src
;
134 const uint8_t *mm_end
;
138 __asm__
volatile(PREFETCH
" %0"::"m"(*s
):"memory");
143 "movq %1, %%mm0 \n\t"
144 "movq 8%1, %%mm1 \n\t"
145 "movq 16%1, %%mm4 \n\t"
146 "movq 24%1, %%mm5 \n\t"
147 "movq %%mm0, %%mm2 \n\t"
148 "movq %%mm1, %%mm3 \n\t"
149 "movq %%mm4, %%mm6 \n\t"
150 "movq %%mm5, %%mm7 \n\t"
151 "psrlq $8, %%mm2 \n\t"
152 "psrlq $8, %%mm3 \n\t"
153 "psrlq $8, %%mm6 \n\t"
154 "psrlq $8, %%mm7 \n\t"
155 "pand %2, %%mm0 \n\t"
156 "pand %2, %%mm1 \n\t"
157 "pand %2, %%mm4 \n\t"
158 "pand %2, %%mm5 \n\t"
159 "pand %3, %%mm2 \n\t"
160 "pand %3, %%mm3 \n\t"
161 "pand %3, %%mm6 \n\t"
162 "pand %3, %%mm7 \n\t"
163 "por %%mm2, %%mm0 \n\t"
164 "por %%mm3, %%mm1 \n\t"
165 "por %%mm6, %%mm4 \n\t"
166 "por %%mm7, %%mm5 \n\t"
168 "movq %%mm1, %%mm2 \n\t"
169 "movq %%mm4, %%mm3 \n\t"
170 "psllq $48, %%mm2 \n\t"
171 "psllq $32, %%mm3 \n\t"
172 "pand %4, %%mm2 \n\t"
173 "pand %5, %%mm3 \n\t"
174 "por %%mm2, %%mm0 \n\t"
175 "psrlq $16, %%mm1 \n\t"
176 "psrlq $32, %%mm4 \n\t"
177 "psllq $16, %%mm5 \n\t"
178 "por %%mm3, %%mm1 \n\t"
179 "pand %6, %%mm5 \n\t"
180 "por %%mm5, %%mm4 \n\t"
182 MOVNTQ
" %%mm0, %0 \n\t"
183 MOVNTQ
" %%mm1, 8%0 \n\t"
186 :"m"(*s
),"m"(mask24l
),
187 "m"(mask24h
),"m"(mask24hh
),"m"(mask24hhh
),"m"(mask24hhhh
)
192 __asm__
volatile(SFENCE:::"memory");
193 __asm__
volatile(EMMS:::"memory");
197 /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
213 original by Strepto/Astral
214 ported to gcc & bugfixed: A'rpi
215 MMX2, 3DNOW optimization by Nick Kurshev
216 32-bit C version, and and&add trick by Michael Niedermayer
218 static inline void RENAME(rgb15to16
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
220 register const uint8_t* s
=src
;
221 register uint8_t* d
=dst
;
222 register const uint8_t *end
;
223 const uint8_t *mm_end
;
226 __asm__
volatile(PREFETCH
" %0"::"m"(*s
));
227 __asm__
volatile("movq %0, %%mm4"::"m"(mask15s
));
232 "movq %1, %%mm0 \n\t"
233 "movq 8%1, %%mm2 \n\t"
234 "movq %%mm0, %%mm1 \n\t"
235 "movq %%mm2, %%mm3 \n\t"
236 "pand %%mm4, %%mm0 \n\t"
237 "pand %%mm4, %%mm2 \n\t"
238 "paddw %%mm1, %%mm0 \n\t"
239 "paddw %%mm3, %%mm2 \n\t"
240 MOVNTQ
" %%mm0, %0 \n\t"
248 __asm__
volatile(SFENCE:::"memory");
249 __asm__
volatile(EMMS:::"memory");
253 register unsigned x
= *((const uint32_t *)s
);
254 *((uint32_t *)d
) = (x
&0x7FFF7FFF) + (x
&0x7FE07FE0);
259 register unsigned short x
= *((const uint16_t *)s
);
260 *((uint16_t *)d
) = (x
&0x7FFF) + (x
&0x7FE0);
264 static inline void RENAME(rgb16to15
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
266 register const uint8_t* s
=src
;
267 register uint8_t* d
=dst
;
268 register const uint8_t *end
;
269 const uint8_t *mm_end
;
272 __asm__
volatile(PREFETCH
" %0"::"m"(*s
));
273 __asm__
volatile("movq %0, %%mm7"::"m"(mask15rg
));
274 __asm__
volatile("movq %0, %%mm6"::"m"(mask15b
));
279 "movq %1, %%mm0 \n\t"
280 "movq 8%1, %%mm2 \n\t"
281 "movq %%mm0, %%mm1 \n\t"
282 "movq %%mm2, %%mm3 \n\t"
283 "psrlq $1, %%mm0 \n\t"
284 "psrlq $1, %%mm2 \n\t"
285 "pand %%mm7, %%mm0 \n\t"
286 "pand %%mm7, %%mm2 \n\t"
287 "pand %%mm6, %%mm1 \n\t"
288 "pand %%mm6, %%mm3 \n\t"
289 "por %%mm1, %%mm0 \n\t"
290 "por %%mm3, %%mm2 \n\t"
291 MOVNTQ
" %%mm0, %0 \n\t"
299 __asm__
volatile(SFENCE:::"memory");
300 __asm__
volatile(EMMS:::"memory");
304 register uint32_t x
= *((const uint32_t*)s
);
305 *((uint32_t *)d
) = ((x
>>1)&0x7FE07FE0) | (x
&0x001F001F);
310 register uint16_t x
= *((const uint16_t*)s
);
311 *((uint16_t *)d
) = ((x
>>1)&0x7FE0) | (x
&0x001F);
315 static inline void RENAME(rgb32to16
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
317 const uint8_t *s
= src
;
320 const uint8_t *mm_end
;
322 uint16_t *d
= (uint16_t *)dst
;
326 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
328 "movq %3, %%mm5 \n\t"
329 "movq %4, %%mm6 \n\t"
330 "movq %5, %%mm7 \n\t"
334 PREFETCH
" 32(%1) \n\t"
335 "movd (%1), %%mm0 \n\t"
336 "movd 4(%1), %%mm3 \n\t"
337 "punpckldq 8(%1), %%mm0 \n\t"
338 "punpckldq 12(%1), %%mm3 \n\t"
339 "movq %%mm0, %%mm1 \n\t"
340 "movq %%mm3, %%mm4 \n\t"
341 "pand %%mm6, %%mm0 \n\t"
342 "pand %%mm6, %%mm3 \n\t"
343 "pmaddwd %%mm7, %%mm0 \n\t"
344 "pmaddwd %%mm7, %%mm3 \n\t"
345 "pand %%mm5, %%mm1 \n\t"
346 "pand %%mm5, %%mm4 \n\t"
347 "por %%mm1, %%mm0 \n\t"
348 "por %%mm4, %%mm3 \n\t"
349 "psrld $5, %%mm0 \n\t"
350 "pslld $11, %%mm3 \n\t"
351 "por %%mm3, %%mm0 \n\t"
352 MOVNTQ
" %%mm0, (%0) \n\t"
359 : "r" (mm_end
), "m" (mask3216g
), "m" (mask3216br
), "m" (mul3216
)
362 __asm__
volatile(PREFETCH
" %0"::"m"(*src
):"memory");
364 "movq %0, %%mm7 \n\t"
365 "movq %1, %%mm6 \n\t"
366 ::"m"(red_16mask
),"m"(green_16mask
));
370 "movd %1, %%mm0 \n\t"
371 "movd 4%1, %%mm3 \n\t"
372 "punpckldq 8%1, %%mm0 \n\t"
373 "punpckldq 12%1, %%mm3 \n\t"
374 "movq %%mm0, %%mm1 \n\t"
375 "movq %%mm0, %%mm2 \n\t"
376 "movq %%mm3, %%mm4 \n\t"
377 "movq %%mm3, %%mm5 \n\t"
378 "psrlq $3, %%mm0 \n\t"
379 "psrlq $3, %%mm3 \n\t"
380 "pand %2, %%mm0 \n\t"
381 "pand %2, %%mm3 \n\t"
382 "psrlq $5, %%mm1 \n\t"
383 "psrlq $5, %%mm4 \n\t"
384 "pand %%mm6, %%mm1 \n\t"
385 "pand %%mm6, %%mm4 \n\t"
386 "psrlq $8, %%mm2 \n\t"
387 "psrlq $8, %%mm5 \n\t"
388 "pand %%mm7, %%mm2 \n\t"
389 "pand %%mm7, %%mm5 \n\t"
390 "por %%mm1, %%mm0 \n\t"
391 "por %%mm4, %%mm3 \n\t"
392 "por %%mm2, %%mm0 \n\t"
393 "por %%mm5, %%mm3 \n\t"
394 "psllq $16, %%mm3 \n\t"
395 "por %%mm3, %%mm0 \n\t"
396 MOVNTQ
" %%mm0, %0 \n\t"
397 :"=m"(*d
):"m"(*s
),"m"(blue_16mask
):"memory");
402 __asm__
volatile(SFENCE:::"memory");
403 __asm__
volatile(EMMS:::"memory");
406 register int rgb
= *(const uint32_t*)s
; s
+= 4;
407 *d
++ = ((rgb
&0xFF)>>3) + ((rgb
&0xFC00)>>5) + ((rgb
&0xF80000)>>8);
411 static inline void RENAME(rgb32tobgr16
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
413 const uint8_t *s
= src
;
416 const uint8_t *mm_end
;
418 uint16_t *d
= (uint16_t *)dst
;
421 __asm__
volatile(PREFETCH
" %0"::"m"(*src
):"memory");
423 "movq %0, %%mm7 \n\t"
424 "movq %1, %%mm6 \n\t"
425 ::"m"(red_16mask
),"m"(green_16mask
));
430 "movd %1, %%mm0 \n\t"
431 "movd 4%1, %%mm3 \n\t"
432 "punpckldq 8%1, %%mm0 \n\t"
433 "punpckldq 12%1, %%mm3 \n\t"
434 "movq %%mm0, %%mm1 \n\t"
435 "movq %%mm0, %%mm2 \n\t"
436 "movq %%mm3, %%mm4 \n\t"
437 "movq %%mm3, %%mm5 \n\t"
438 "psllq $8, %%mm0 \n\t"
439 "psllq $8, %%mm3 \n\t"
440 "pand %%mm7, %%mm0 \n\t"
441 "pand %%mm7, %%mm3 \n\t"
442 "psrlq $5, %%mm1 \n\t"
443 "psrlq $5, %%mm4 \n\t"
444 "pand %%mm6, %%mm1 \n\t"
445 "pand %%mm6, %%mm4 \n\t"
446 "psrlq $19, %%mm2 \n\t"
447 "psrlq $19, %%mm5 \n\t"
448 "pand %2, %%mm2 \n\t"
449 "pand %2, %%mm5 \n\t"
450 "por %%mm1, %%mm0 \n\t"
451 "por %%mm4, %%mm3 \n\t"
452 "por %%mm2, %%mm0 \n\t"
453 "por %%mm5, %%mm3 \n\t"
454 "psllq $16, %%mm3 \n\t"
455 "por %%mm3, %%mm0 \n\t"
456 MOVNTQ
" %%mm0, %0 \n\t"
457 :"=m"(*d
):"m"(*s
),"m"(blue_16mask
):"memory");
461 __asm__
volatile(SFENCE:::"memory");
462 __asm__
volatile(EMMS:::"memory");
465 register int rgb
= *(const uint32_t*)s
; s
+= 4;
466 *d
++ = ((rgb
&0xF8)<<8) + ((rgb
&0xFC00)>>5) + ((rgb
&0xF80000)>>19);
470 static inline void RENAME(rgb32to15
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
472 const uint8_t *s
= src
;
475 const uint8_t *mm_end
;
477 uint16_t *d
= (uint16_t *)dst
;
481 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
483 "movq %3, %%mm5 \n\t"
484 "movq %4, %%mm6 \n\t"
485 "movq %5, %%mm7 \n\t"
489 PREFETCH
" 32(%1) \n\t"
490 "movd (%1), %%mm0 \n\t"
491 "movd 4(%1), %%mm3 \n\t"
492 "punpckldq 8(%1), %%mm0 \n\t"
493 "punpckldq 12(%1), %%mm3 \n\t"
494 "movq %%mm0, %%mm1 \n\t"
495 "movq %%mm3, %%mm4 \n\t"
496 "pand %%mm6, %%mm0 \n\t"
497 "pand %%mm6, %%mm3 \n\t"
498 "pmaddwd %%mm7, %%mm0 \n\t"
499 "pmaddwd %%mm7, %%mm3 \n\t"
500 "pand %%mm5, %%mm1 \n\t"
501 "pand %%mm5, %%mm4 \n\t"
502 "por %%mm1, %%mm0 \n\t"
503 "por %%mm4, %%mm3 \n\t"
504 "psrld $6, %%mm0 \n\t"
505 "pslld $10, %%mm3 \n\t"
506 "por %%mm3, %%mm0 \n\t"
507 MOVNTQ
" %%mm0, (%0) \n\t"
514 : "r" (mm_end
), "m" (mask3215g
), "m" (mask3216br
), "m" (mul3215
)
517 __asm__
volatile(PREFETCH
" %0"::"m"(*src
):"memory");
519 "movq %0, %%mm7 \n\t"
520 "movq %1, %%mm6 \n\t"
521 ::"m"(red_15mask
),"m"(green_15mask
));
525 "movd %1, %%mm0 \n\t"
526 "movd 4%1, %%mm3 \n\t"
527 "punpckldq 8%1, %%mm0 \n\t"
528 "punpckldq 12%1, %%mm3 \n\t"
529 "movq %%mm0, %%mm1 \n\t"
530 "movq %%mm0, %%mm2 \n\t"
531 "movq %%mm3, %%mm4 \n\t"
532 "movq %%mm3, %%mm5 \n\t"
533 "psrlq $3, %%mm0 \n\t"
534 "psrlq $3, %%mm3 \n\t"
535 "pand %2, %%mm0 \n\t"
536 "pand %2, %%mm3 \n\t"
537 "psrlq $6, %%mm1 \n\t"
538 "psrlq $6, %%mm4 \n\t"
539 "pand %%mm6, %%mm1 \n\t"
540 "pand %%mm6, %%mm4 \n\t"
541 "psrlq $9, %%mm2 \n\t"
542 "psrlq $9, %%mm5 \n\t"
543 "pand %%mm7, %%mm2 \n\t"
544 "pand %%mm7, %%mm5 \n\t"
545 "por %%mm1, %%mm0 \n\t"
546 "por %%mm4, %%mm3 \n\t"
547 "por %%mm2, %%mm0 \n\t"
548 "por %%mm5, %%mm3 \n\t"
549 "psllq $16, %%mm3 \n\t"
550 "por %%mm3, %%mm0 \n\t"
551 MOVNTQ
" %%mm0, %0 \n\t"
552 :"=m"(*d
):"m"(*s
),"m"(blue_15mask
):"memory");
557 __asm__
volatile(SFENCE:::"memory");
558 __asm__
volatile(EMMS:::"memory");
561 register int rgb
= *(const uint32_t*)s
; s
+= 4;
562 *d
++ = ((rgb
&0xFF)>>3) + ((rgb
&0xF800)>>6) + ((rgb
&0xF80000)>>9);
566 static inline void RENAME(rgb32tobgr15
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
568 const uint8_t *s
= src
;
571 const uint8_t *mm_end
;
573 uint16_t *d
= (uint16_t *)dst
;
576 __asm__
volatile(PREFETCH
" %0"::"m"(*src
):"memory");
578 "movq %0, %%mm7 \n\t"
579 "movq %1, %%mm6 \n\t"
580 ::"m"(red_15mask
),"m"(green_15mask
));
585 "movd %1, %%mm0 \n\t"
586 "movd 4%1, %%mm3 \n\t"
587 "punpckldq 8%1, %%mm0 \n\t"
588 "punpckldq 12%1, %%mm3 \n\t"
589 "movq %%mm0, %%mm1 \n\t"
590 "movq %%mm0, %%mm2 \n\t"
591 "movq %%mm3, %%mm4 \n\t"
592 "movq %%mm3, %%mm5 \n\t"
593 "psllq $7, %%mm0 \n\t"
594 "psllq $7, %%mm3 \n\t"
595 "pand %%mm7, %%mm0 \n\t"
596 "pand %%mm7, %%mm3 \n\t"
597 "psrlq $6, %%mm1 \n\t"
598 "psrlq $6, %%mm4 \n\t"
599 "pand %%mm6, %%mm1 \n\t"
600 "pand %%mm6, %%mm4 \n\t"
601 "psrlq $19, %%mm2 \n\t"
602 "psrlq $19, %%mm5 \n\t"
603 "pand %2, %%mm2 \n\t"
604 "pand %2, %%mm5 \n\t"
605 "por %%mm1, %%mm0 \n\t"
606 "por %%mm4, %%mm3 \n\t"
607 "por %%mm2, %%mm0 \n\t"
608 "por %%mm5, %%mm3 \n\t"
609 "psllq $16, %%mm3 \n\t"
610 "por %%mm3, %%mm0 \n\t"
611 MOVNTQ
" %%mm0, %0 \n\t"
612 :"=m"(*d
):"m"(*s
),"m"(blue_15mask
):"memory");
616 __asm__
volatile(SFENCE:::"memory");
617 __asm__
volatile(EMMS:::"memory");
620 register int rgb
= *(const uint32_t*)s
; s
+= 4;
621 *d
++ = ((rgb
&0xF8)<<7) + ((rgb
&0xF800)>>6) + ((rgb
&0xF80000)>>19);
625 static inline void RENAME(rgb24tobgr16
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
627 const uint8_t *s
= src
;
630 const uint8_t *mm_end
;
632 uint16_t *d
= (uint16_t *)dst
;
635 __asm__
volatile(PREFETCH
" %0"::"m"(*src
):"memory");
637 "movq %0, %%mm7 \n\t"
638 "movq %1, %%mm6 \n\t"
639 ::"m"(red_16mask
),"m"(green_16mask
));
644 "movd %1, %%mm0 \n\t"
645 "movd 3%1, %%mm3 \n\t"
646 "punpckldq 6%1, %%mm0 \n\t"
647 "punpckldq 9%1, %%mm3 \n\t"
648 "movq %%mm0, %%mm1 \n\t"
649 "movq %%mm0, %%mm2 \n\t"
650 "movq %%mm3, %%mm4 \n\t"
651 "movq %%mm3, %%mm5 \n\t"
652 "psrlq $3, %%mm0 \n\t"
653 "psrlq $3, %%mm3 \n\t"
654 "pand %2, %%mm0 \n\t"
655 "pand %2, %%mm3 \n\t"
656 "psrlq $5, %%mm1 \n\t"
657 "psrlq $5, %%mm4 \n\t"
658 "pand %%mm6, %%mm1 \n\t"
659 "pand %%mm6, %%mm4 \n\t"
660 "psrlq $8, %%mm2 \n\t"
661 "psrlq $8, %%mm5 \n\t"
662 "pand %%mm7, %%mm2 \n\t"
663 "pand %%mm7, %%mm5 \n\t"
664 "por %%mm1, %%mm0 \n\t"
665 "por %%mm4, %%mm3 \n\t"
666 "por %%mm2, %%mm0 \n\t"
667 "por %%mm5, %%mm3 \n\t"
668 "psllq $16, %%mm3 \n\t"
669 "por %%mm3, %%mm0 \n\t"
670 MOVNTQ
" %%mm0, %0 \n\t"
671 :"=m"(*d
):"m"(*s
),"m"(blue_16mask
):"memory");
675 __asm__
volatile(SFENCE:::"memory");
676 __asm__
volatile(EMMS:::"memory");
682 *d
++ = (b
>>3) | ((g
&0xFC)<<3) | ((r
&0xF8)<<8);
686 static inline void RENAME(rgb24to16
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
688 const uint8_t *s
= src
;
691 const uint8_t *mm_end
;
693 uint16_t *d
= (uint16_t *)dst
;
696 __asm__
volatile(PREFETCH
" %0"::"m"(*src
):"memory");
698 "movq %0, %%mm7 \n\t"
699 "movq %1, %%mm6 \n\t"
700 ::"m"(red_16mask
),"m"(green_16mask
));
705 "movd %1, %%mm0 \n\t"
706 "movd 3%1, %%mm3 \n\t"
707 "punpckldq 6%1, %%mm0 \n\t"
708 "punpckldq 9%1, %%mm3 \n\t"
709 "movq %%mm0, %%mm1 \n\t"
710 "movq %%mm0, %%mm2 \n\t"
711 "movq %%mm3, %%mm4 \n\t"
712 "movq %%mm3, %%mm5 \n\t"
713 "psllq $8, %%mm0 \n\t"
714 "psllq $8, %%mm3 \n\t"
715 "pand %%mm7, %%mm0 \n\t"
716 "pand %%mm7, %%mm3 \n\t"
717 "psrlq $5, %%mm1 \n\t"
718 "psrlq $5, %%mm4 \n\t"
719 "pand %%mm6, %%mm1 \n\t"
720 "pand %%mm6, %%mm4 \n\t"
721 "psrlq $19, %%mm2 \n\t"
722 "psrlq $19, %%mm5 \n\t"
723 "pand %2, %%mm2 \n\t"
724 "pand %2, %%mm5 \n\t"
725 "por %%mm1, %%mm0 \n\t"
726 "por %%mm4, %%mm3 \n\t"
727 "por %%mm2, %%mm0 \n\t"
728 "por %%mm5, %%mm3 \n\t"
729 "psllq $16, %%mm3 \n\t"
730 "por %%mm3, %%mm0 \n\t"
731 MOVNTQ
" %%mm0, %0 \n\t"
732 :"=m"(*d
):"m"(*s
),"m"(blue_16mask
):"memory");
736 __asm__
volatile(SFENCE:::"memory");
737 __asm__
volatile(EMMS:::"memory");
743 *d
++ = (b
>>3) | ((g
&0xFC)<<3) | ((r
&0xF8)<<8);
747 static inline void RENAME(rgb24tobgr15
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
749 const uint8_t *s
= src
;
752 const uint8_t *mm_end
;
754 uint16_t *d
= (uint16_t *)dst
;
757 __asm__
volatile(PREFETCH
" %0"::"m"(*src
):"memory");
759 "movq %0, %%mm7 \n\t"
760 "movq %1, %%mm6 \n\t"
761 ::"m"(red_15mask
),"m"(green_15mask
));
766 "movd %1, %%mm0 \n\t"
767 "movd 3%1, %%mm3 \n\t"
768 "punpckldq 6%1, %%mm0 \n\t"
769 "punpckldq 9%1, %%mm3 \n\t"
770 "movq %%mm0, %%mm1 \n\t"
771 "movq %%mm0, %%mm2 \n\t"
772 "movq %%mm3, %%mm4 \n\t"
773 "movq %%mm3, %%mm5 \n\t"
774 "psrlq $3, %%mm0 \n\t"
775 "psrlq $3, %%mm3 \n\t"
776 "pand %2, %%mm0 \n\t"
777 "pand %2, %%mm3 \n\t"
778 "psrlq $6, %%mm1 \n\t"
779 "psrlq $6, %%mm4 \n\t"
780 "pand %%mm6, %%mm1 \n\t"
781 "pand %%mm6, %%mm4 \n\t"
782 "psrlq $9, %%mm2 \n\t"
783 "psrlq $9, %%mm5 \n\t"
784 "pand %%mm7, %%mm2 \n\t"
785 "pand %%mm7, %%mm5 \n\t"
786 "por %%mm1, %%mm0 \n\t"
787 "por %%mm4, %%mm3 \n\t"
788 "por %%mm2, %%mm0 \n\t"
789 "por %%mm5, %%mm3 \n\t"
790 "psllq $16, %%mm3 \n\t"
791 "por %%mm3, %%mm0 \n\t"
792 MOVNTQ
" %%mm0, %0 \n\t"
793 :"=m"(*d
):"m"(*s
),"m"(blue_15mask
):"memory");
797 __asm__
volatile(SFENCE:::"memory");
798 __asm__
volatile(EMMS:::"memory");
804 *d
++ = (b
>>3) | ((g
&0xF8)<<2) | ((r
&0xF8)<<7);
808 static inline void RENAME(rgb24to15
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
810 const uint8_t *s
= src
;
813 const uint8_t *mm_end
;
815 uint16_t *d
= (uint16_t *)dst
;
818 __asm__
volatile(PREFETCH
" %0"::"m"(*src
):"memory");
820 "movq %0, %%mm7 \n\t"
821 "movq %1, %%mm6 \n\t"
822 ::"m"(red_15mask
),"m"(green_15mask
));
827 "movd %1, %%mm0 \n\t"
828 "movd 3%1, %%mm3 \n\t"
829 "punpckldq 6%1, %%mm0 \n\t"
830 "punpckldq 9%1, %%mm3 \n\t"
831 "movq %%mm0, %%mm1 \n\t"
832 "movq %%mm0, %%mm2 \n\t"
833 "movq %%mm3, %%mm4 \n\t"
834 "movq %%mm3, %%mm5 \n\t"
835 "psllq $7, %%mm0 \n\t"
836 "psllq $7, %%mm3 \n\t"
837 "pand %%mm7, %%mm0 \n\t"
838 "pand %%mm7, %%mm3 \n\t"
839 "psrlq $6, %%mm1 \n\t"
840 "psrlq $6, %%mm4 \n\t"
841 "pand %%mm6, %%mm1 \n\t"
842 "pand %%mm6, %%mm4 \n\t"
843 "psrlq $19, %%mm2 \n\t"
844 "psrlq $19, %%mm5 \n\t"
845 "pand %2, %%mm2 \n\t"
846 "pand %2, %%mm5 \n\t"
847 "por %%mm1, %%mm0 \n\t"
848 "por %%mm4, %%mm3 \n\t"
849 "por %%mm2, %%mm0 \n\t"
850 "por %%mm5, %%mm3 \n\t"
851 "psllq $16, %%mm3 \n\t"
852 "por %%mm3, %%mm0 \n\t"
853 MOVNTQ
" %%mm0, %0 \n\t"
854 :"=m"(*d
):"m"(*s
),"m"(blue_15mask
):"memory");
858 __asm__
volatile(SFENCE:::"memory");
859 __asm__
volatile(EMMS:::"memory");
865 *d
++ = (b
>>3) | ((g
&0xF8)<<2) | ((r
&0xF8)<<7);
870 I use less accurate approximation here by simply left-shifting the input
871 value and filling the low order bits with zeroes. This method improves PNG
872 compression but this scheme cannot reproduce white exactly, since it does
873 * not generate an all-ones maximum value; the net effect is to darken the image slightly.
876 The better method should be "left bit replication":
886 | leftmost bits repeated to fill open bits
890 static inline void RENAME(rgb15tobgr24
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
894 const uint16_t *mm_end
;
897 const uint16_t *s
= (const uint16_t*)src
;
898 end
= s
+ src_size
/2;
900 __asm__
volatile(PREFETCH
" %0"::"m"(*s
):"memory");
905 "movq %1, %%mm0 \n\t"
906 "movq %1, %%mm1 \n\t"
907 "movq %1, %%mm2 \n\t"
908 "pand %2, %%mm0 \n\t"
909 "pand %3, %%mm1 \n\t"
910 "pand %4, %%mm2 \n\t"
911 "psllq $3, %%mm0 \n\t"
912 "psrlq $2, %%mm1 \n\t"
913 "psrlq $7, %%mm2 \n\t"
914 "movq %%mm0, %%mm3 \n\t"
915 "movq %%mm1, %%mm4 \n\t"
916 "movq %%mm2, %%mm5 \n\t"
917 "punpcklwd %5, %%mm0 \n\t"
918 "punpcklwd %5, %%mm1 \n\t"
919 "punpcklwd %5, %%mm2 \n\t"
920 "punpckhwd %5, %%mm3 \n\t"
921 "punpckhwd %5, %%mm4 \n\t"
922 "punpckhwd %5, %%mm5 \n\t"
923 "psllq $8, %%mm1 \n\t"
924 "psllq $16, %%mm2 \n\t"
925 "por %%mm1, %%mm0 \n\t"
926 "por %%mm2, %%mm0 \n\t"
927 "psllq $8, %%mm4 \n\t"
928 "psllq $16, %%mm5 \n\t"
929 "por %%mm4, %%mm3 \n\t"
930 "por %%mm5, %%mm3 \n\t"
932 "movq %%mm0, %%mm6 \n\t"
933 "movq %%mm3, %%mm7 \n\t"
935 "movq 8%1, %%mm0 \n\t"
936 "movq 8%1, %%mm1 \n\t"
937 "movq 8%1, %%mm2 \n\t"
938 "pand %2, %%mm0 \n\t"
939 "pand %3, %%mm1 \n\t"
940 "pand %4, %%mm2 \n\t"
941 "psllq $3, %%mm0 \n\t"
942 "psrlq $2, %%mm1 \n\t"
943 "psrlq $7, %%mm2 \n\t"
944 "movq %%mm0, %%mm3 \n\t"
945 "movq %%mm1, %%mm4 \n\t"
946 "movq %%mm2, %%mm5 \n\t"
947 "punpcklwd %5, %%mm0 \n\t"
948 "punpcklwd %5, %%mm1 \n\t"
949 "punpcklwd %5, %%mm2 \n\t"
950 "punpckhwd %5, %%mm3 \n\t"
951 "punpckhwd %5, %%mm4 \n\t"
952 "punpckhwd %5, %%mm5 \n\t"
953 "psllq $8, %%mm1 \n\t"
954 "psllq $16, %%mm2 \n\t"
955 "por %%mm1, %%mm0 \n\t"
956 "por %%mm2, %%mm0 \n\t"
957 "psllq $8, %%mm4 \n\t"
958 "psllq $16, %%mm5 \n\t"
959 "por %%mm4, %%mm3 \n\t"
960 "por %%mm5, %%mm3 \n\t"
963 :"m"(*s
),"m"(mask15b
),"m"(mask15g
),"m"(mask15r
), "m"(mmx_null
)
965 /* borrowed 32 to 24 */
967 "movq %%mm0, %%mm4 \n\t"
968 "movq %%mm3, %%mm5 \n\t"
969 "movq %%mm6, %%mm0 \n\t"
970 "movq %%mm7, %%mm1 \n\t"
972 "movq %%mm4, %%mm6 \n\t"
973 "movq %%mm5, %%mm7 \n\t"
974 "movq %%mm0, %%mm2 \n\t"
975 "movq %%mm1, %%mm3 \n\t"
977 "psrlq $8, %%mm2 \n\t"
978 "psrlq $8, %%mm3 \n\t"
979 "psrlq $8, %%mm6 \n\t"
980 "psrlq $8, %%mm7 \n\t"
981 "pand %2, %%mm0 \n\t"
982 "pand %2, %%mm1 \n\t"
983 "pand %2, %%mm4 \n\t"
984 "pand %2, %%mm5 \n\t"
985 "pand %3, %%mm2 \n\t"
986 "pand %3, %%mm3 \n\t"
987 "pand %3, %%mm6 \n\t"
988 "pand %3, %%mm7 \n\t"
989 "por %%mm2, %%mm0 \n\t"
990 "por %%mm3, %%mm1 \n\t"
991 "por %%mm6, %%mm4 \n\t"
992 "por %%mm7, %%mm5 \n\t"
994 "movq %%mm1, %%mm2 \n\t"
995 "movq %%mm4, %%mm3 \n\t"
996 "psllq $48, %%mm2 \n\t"
997 "psllq $32, %%mm3 \n\t"
998 "pand %4, %%mm2 \n\t"
999 "pand %5, %%mm3 \n\t"
1000 "por %%mm2, %%mm0 \n\t"
1001 "psrlq $16, %%mm1 \n\t"
1002 "psrlq $32, %%mm4 \n\t"
1003 "psllq $16, %%mm5 \n\t"
1004 "por %%mm3, %%mm1 \n\t"
1005 "pand %6, %%mm5 \n\t"
1006 "por %%mm5, %%mm4 \n\t"
1008 MOVNTQ
" %%mm0, %0 \n\t"
1009 MOVNTQ
" %%mm1, 8%0 \n\t"
1010 MOVNTQ
" %%mm4, 16%0"
1013 :"m"(*s
),"m"(mask24l
),"m"(mask24h
),"m"(mask24hh
),"m"(mask24hhh
),"m"(mask24hhhh
)
1018 __asm__
volatile(SFENCE:::"memory");
1019 __asm__
volatile(EMMS:::"memory");
1022 register uint16_t bgr
;
1024 *d
++ = (bgr
&0x1F)<<3;
1025 *d
++ = (bgr
&0x3E0)>>2;
1026 *d
++ = (bgr
&0x7C00)>>7;
1030 static inline void RENAME(rgb16tobgr24
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
1032 const uint16_t *end
;
1034 const uint16_t *mm_end
;
1036 uint8_t *d
= (uint8_t *)dst
;
1037 const uint16_t *s
= (const uint16_t *)src
;
1038 end
= s
+ src_size
/2;
1040 __asm__
volatile(PREFETCH
" %0"::"m"(*s
):"memory");
1042 while (s
< mm_end
) {
1044 PREFETCH
" 32%1 \n\t"
1045 "movq %1, %%mm0 \n\t"
1046 "movq %1, %%mm1 \n\t"
1047 "movq %1, %%mm2 \n\t"
1048 "pand %2, %%mm0 \n\t"
1049 "pand %3, %%mm1 \n\t"
1050 "pand %4, %%mm2 \n\t"
1051 "psllq $3, %%mm0 \n\t"
1052 "psrlq $3, %%mm1 \n\t"
1053 "psrlq $8, %%mm2 \n\t"
1054 "movq %%mm0, %%mm3 \n\t"
1055 "movq %%mm1, %%mm4 \n\t"
1056 "movq %%mm2, %%mm5 \n\t"
1057 "punpcklwd %5, %%mm0 \n\t"
1058 "punpcklwd %5, %%mm1 \n\t"
1059 "punpcklwd %5, %%mm2 \n\t"
1060 "punpckhwd %5, %%mm3 \n\t"
1061 "punpckhwd %5, %%mm4 \n\t"
1062 "punpckhwd %5, %%mm5 \n\t"
1063 "psllq $8, %%mm1 \n\t"
1064 "psllq $16, %%mm2 \n\t"
1065 "por %%mm1, %%mm0 \n\t"
1066 "por %%mm2, %%mm0 \n\t"
1067 "psllq $8, %%mm4 \n\t"
1068 "psllq $16, %%mm5 \n\t"
1069 "por %%mm4, %%mm3 \n\t"
1070 "por %%mm5, %%mm3 \n\t"
1072 "movq %%mm0, %%mm6 \n\t"
1073 "movq %%mm3, %%mm7 \n\t"
1075 "movq 8%1, %%mm0 \n\t"
1076 "movq 8%1, %%mm1 \n\t"
1077 "movq 8%1, %%mm2 \n\t"
1078 "pand %2, %%mm0 \n\t"
1079 "pand %3, %%mm1 \n\t"
1080 "pand %4, %%mm2 \n\t"
1081 "psllq $3, %%mm0 \n\t"
1082 "psrlq $3, %%mm1 \n\t"
1083 "psrlq $8, %%mm2 \n\t"
1084 "movq %%mm0, %%mm3 \n\t"
1085 "movq %%mm1, %%mm4 \n\t"
1086 "movq %%mm2, %%mm5 \n\t"
1087 "punpcklwd %5, %%mm0 \n\t"
1088 "punpcklwd %5, %%mm1 \n\t"
1089 "punpcklwd %5, %%mm2 \n\t"
1090 "punpckhwd %5, %%mm3 \n\t"
1091 "punpckhwd %5, %%mm4 \n\t"
1092 "punpckhwd %5, %%mm5 \n\t"
1093 "psllq $8, %%mm1 \n\t"
1094 "psllq $16, %%mm2 \n\t"
1095 "por %%mm1, %%mm0 \n\t"
1096 "por %%mm2, %%mm0 \n\t"
1097 "psllq $8, %%mm4 \n\t"
1098 "psllq $16, %%mm5 \n\t"
1099 "por %%mm4, %%mm3 \n\t"
1100 "por %%mm5, %%mm3 \n\t"
1102 :"m"(*s
),"m"(mask16b
),"m"(mask16g
),"m"(mask16r
),"m"(mmx_null
)
1104 /* borrowed 32 to 24 */
1106 "movq %%mm0, %%mm4 \n\t"
1107 "movq %%mm3, %%mm5 \n\t"
1108 "movq %%mm6, %%mm0 \n\t"
1109 "movq %%mm7, %%mm1 \n\t"
1111 "movq %%mm4, %%mm6 \n\t"
1112 "movq %%mm5, %%mm7 \n\t"
1113 "movq %%mm0, %%mm2 \n\t"
1114 "movq %%mm1, %%mm3 \n\t"
1116 "psrlq $8, %%mm2 \n\t"
1117 "psrlq $8, %%mm3 \n\t"
1118 "psrlq $8, %%mm6 \n\t"
1119 "psrlq $8, %%mm7 \n\t"
1120 "pand %2, %%mm0 \n\t"
1121 "pand %2, %%mm1 \n\t"
1122 "pand %2, %%mm4 \n\t"
1123 "pand %2, %%mm5 \n\t"
1124 "pand %3, %%mm2 \n\t"
1125 "pand %3, %%mm3 \n\t"
1126 "pand %3, %%mm6 \n\t"
1127 "pand %3, %%mm7 \n\t"
1128 "por %%mm2, %%mm0 \n\t"
1129 "por %%mm3, %%mm1 \n\t"
1130 "por %%mm6, %%mm4 \n\t"
1131 "por %%mm7, %%mm5 \n\t"
1133 "movq %%mm1, %%mm2 \n\t"
1134 "movq %%mm4, %%mm3 \n\t"
1135 "psllq $48, %%mm2 \n\t"
1136 "psllq $32, %%mm3 \n\t"
1137 "pand %4, %%mm2 \n\t"
1138 "pand %5, %%mm3 \n\t"
1139 "por %%mm2, %%mm0 \n\t"
1140 "psrlq $16, %%mm1 \n\t"
1141 "psrlq $32, %%mm4 \n\t"
1142 "psllq $16, %%mm5 \n\t"
1143 "por %%mm3, %%mm1 \n\t"
1144 "pand %6, %%mm5 \n\t"
1145 "por %%mm5, %%mm4 \n\t"
1147 MOVNTQ
" %%mm0, %0 \n\t"
1148 MOVNTQ
" %%mm1, 8%0 \n\t"
1149 MOVNTQ
" %%mm4, 16%0"
1152 :"m"(*s
),"m"(mask24l
),"m"(mask24h
),"m"(mask24hh
),"m"(mask24hhh
),"m"(mask24hhhh
)
1157 __asm__
volatile(SFENCE:::"memory");
1158 __asm__
volatile(EMMS:::"memory");
1161 register uint16_t bgr
;
1163 *d
++ = (bgr
&0x1F)<<3;
1164 *d
++ = (bgr
&0x7E0)>>3;
1165 *d
++ = (bgr
&0xF800)>>8;
1170 * mm0 = 00 B3 00 B2 00 B1 00 B0
1171 * mm1 = 00 G3 00 G2 00 G1 00 G0
1172 * mm2 = 00 R3 00 R2 00 R1 00 R0
1173 * mm6 = FF FF FF FF FF FF FF FF
1174 * mm7 = 00 00 00 00 00 00 00 00
1176 #define PACK_RGB32 \
1177 "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
1178 "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
1179 "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
1180 "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
1181 "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
1182 "movq %%mm0, %%mm3 \n\t" \
1183 "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
1184 "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
1185 MOVNTQ" %%mm0, %0 \n\t" \
1186 MOVNTQ" %%mm3, 8%0 \n\t" \
1188 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
1190 const uint16_t *end
;
1192 const uint16_t *mm_end
;
1195 const uint16_t *s
= (const uint16_t *)src
;
1196 end
= s
+ src_size
/2;
1198 __asm__
volatile(PREFETCH
" %0"::"m"(*s
):"memory");
1199 __asm__
volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
1200 __asm__
volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
1202 while (s
< mm_end
) {
1204 PREFETCH
" 32%1 \n\t"
1205 "movq %1, %%mm0 \n\t"
1206 "movq %1, %%mm1 \n\t"
1207 "movq %1, %%mm2 \n\t"
1208 "pand %2, %%mm0 \n\t"
1209 "pand %3, %%mm1 \n\t"
1210 "pand %4, %%mm2 \n\t"
1211 "psllq $3, %%mm0 \n\t"
1212 "psrlq $2, %%mm1 \n\t"
1213 "psrlq $7, %%mm2 \n\t"
1216 :"m"(*s
),"m"(mask15b
),"m"(mask15g
),"m"(mask15r
)
1221 __asm__
volatile(SFENCE:::"memory");
1222 __asm__
volatile(EMMS:::"memory");
1225 register uint16_t bgr
;
1229 *d
++ = (bgr
&0x7C00)>>7;
1230 *d
++ = (bgr
&0x3E0)>>2;
1231 *d
++ = (bgr
&0x1F)<<3;
1233 *d
++ = (bgr
&0x1F)<<3;
1234 *d
++ = (bgr
&0x3E0)>>2;
1235 *d
++ = (bgr
&0x7C00)>>7;
1241 static inline void RENAME(rgb16to32
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
1243 const uint16_t *end
;
1245 const uint16_t *mm_end
;
1248 const uint16_t *s
= (const uint16_t*)src
;
1249 end
= s
+ src_size
/2;
1251 __asm__
volatile(PREFETCH
" %0"::"m"(*s
):"memory");
1252 __asm__
volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
1253 __asm__
volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
1255 while (s
< mm_end
) {
1257 PREFETCH
" 32%1 \n\t"
1258 "movq %1, %%mm0 \n\t"
1259 "movq %1, %%mm1 \n\t"
1260 "movq %1, %%mm2 \n\t"
1261 "pand %2, %%mm0 \n\t"
1262 "pand %3, %%mm1 \n\t"
1263 "pand %4, %%mm2 \n\t"
1264 "psllq $3, %%mm0 \n\t"
1265 "psrlq $3, %%mm1 \n\t"
1266 "psrlq $8, %%mm2 \n\t"
1269 :"m"(*s
),"m"(mask16b
),"m"(mask16g
),"m"(mask16r
)
1274 __asm__
volatile(SFENCE:::"memory");
1275 __asm__
volatile(EMMS:::"memory");
1278 register uint16_t bgr
;
1282 *d
++ = (bgr
&0xF800)>>8;
1283 *d
++ = (bgr
&0x7E0)>>3;
1284 *d
++ = (bgr
&0x1F)<<3;
1286 *d
++ = (bgr
&0x1F)<<3;
1287 *d
++ = (bgr
&0x7E0)>>3;
1288 *d
++ = (bgr
&0xF800)>>8;
1294 static inline void RENAME(rgb32tobgr32
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
1296 x86_reg idx
= 15 - src_size
;
1297 const uint8_t *s
= src
-idx
;
1298 uint8_t *d
= dst
-idx
;
1303 PREFETCH
" (%1, %0) \n\t"
1304 "movq %3, %%mm7 \n\t"
1305 "pxor %4, %%mm7 \n\t"
1306 "movq %%mm7, %%mm6 \n\t"
1307 "pxor %5, %%mm7 \n\t"
1310 PREFETCH
" 32(%1, %0) \n\t"
1311 "movq (%1, %0), %%mm0 \n\t"
1312 "movq 8(%1, %0), %%mm1 \n\t"
1314 "pshufw $177, %%mm0, %%mm3 \n\t"
1315 "pshufw $177, %%mm1, %%mm5 \n\t"
1316 "pand %%mm7, %%mm0 \n\t"
1317 "pand %%mm6, %%mm3 \n\t"
1318 "pand %%mm7, %%mm1 \n\t"
1319 "pand %%mm6, %%mm5 \n\t"
1320 "por %%mm3, %%mm0 \n\t"
1321 "por %%mm5, %%mm1 \n\t"
1323 "movq %%mm0, %%mm2 \n\t"
1324 "movq %%mm1, %%mm4 \n\t"
1325 "pand %%mm7, %%mm0 \n\t"
1326 "pand %%mm6, %%mm2 \n\t"
1327 "pand %%mm7, %%mm1 \n\t"
1328 "pand %%mm6, %%mm4 \n\t"
1329 "movq %%mm2, %%mm3 \n\t"
1330 "movq %%mm4, %%mm5 \n\t"
1331 "pslld $16, %%mm2 \n\t"
1332 "psrld $16, %%mm3 \n\t"
1333 "pslld $16, %%mm4 \n\t"
1334 "psrld $16, %%mm5 \n\t"
1335 "por %%mm2, %%mm0 \n\t"
1336 "por %%mm4, %%mm1 \n\t"
1337 "por %%mm3, %%mm0 \n\t"
1338 "por %%mm5, %%mm1 \n\t"
1340 MOVNTQ
" %%mm0, (%2, %0) \n\t"
1341 MOVNTQ
" %%mm1, 8(%2, %0) \n\t"
1348 : "r" (s
), "r" (d
), "m" (mask32b
), "m" (mask32r
), "m" (mmx_one
)
1351 for (; idx
<15; idx
+=4) {
1352 register int v
= *(const uint32_t *)&s
[idx
], g
= v
& 0xff00ff00;
1354 *(uint32_t *)&d
[idx
] = (v
>>16) + g
+ (v
<<16);
1358 static inline void RENAME(rgb24tobgr24
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
1362 x86_reg mmx_size
= 23 - src_size
;
1364 "test %%"REG_a
", %%"REG_a
" \n\t"
1366 "movq "MANGLE(mask24r
)", %%mm5 \n\t"
1367 "movq "MANGLE(mask24g
)", %%mm6 \n\t"
1368 "movq "MANGLE(mask24b
)", %%mm7 \n\t"
1371 PREFETCH
" 32(%1, %%"REG_a
") \n\t"
1372 "movq (%1, %%"REG_a
"), %%mm0 \n\t" // BGR BGR BG
1373 "movq (%1, %%"REG_a
"), %%mm1 \n\t" // BGR BGR BG
1374 "movq 2(%1, %%"REG_a
"), %%mm2 \n\t" // R BGR BGR B
1375 "psllq $16, %%mm0 \n\t" // 00 BGR BGR
1376 "pand %%mm5, %%mm0 \n\t"
1377 "pand %%mm6, %%mm1 \n\t"
1378 "pand %%mm7, %%mm2 \n\t"
1379 "por %%mm0, %%mm1 \n\t"
1380 "por %%mm2, %%mm1 \n\t"
1381 "movq 6(%1, %%"REG_a
"), %%mm0 \n\t" // BGR BGR BG
1382 MOVNTQ
" %%mm1, (%2, %%"REG_a
") \n\t" // RGB RGB RG
1383 "movq 8(%1, %%"REG_a
"), %%mm1 \n\t" // R BGR BGR B
1384 "movq 10(%1, %%"REG_a
"), %%mm2 \n\t" // GR BGR BGR
1385 "pand %%mm7, %%mm0 \n\t"
1386 "pand %%mm5, %%mm1 \n\t"
1387 "pand %%mm6, %%mm2 \n\t"
1388 "por %%mm0, %%mm1 \n\t"
1389 "por %%mm2, %%mm1 \n\t"
1390 "movq 14(%1, %%"REG_a
"), %%mm0 \n\t" // R BGR BGR B
1391 MOVNTQ
" %%mm1, 8(%2, %%"REG_a
") \n\t" // B RGB RGB R
1392 "movq 16(%1, %%"REG_a
"), %%mm1 \n\t" // GR BGR BGR
1393 "movq 18(%1, %%"REG_a
"), %%mm2 \n\t" // BGR BGR BG
1394 "pand %%mm6, %%mm0 \n\t"
1395 "pand %%mm7, %%mm1 \n\t"
1396 "pand %%mm5, %%mm2 \n\t"
1397 "por %%mm0, %%mm1 \n\t"
1398 "por %%mm2, %%mm1 \n\t"
1399 MOVNTQ
" %%mm1, 16(%2, %%"REG_a
") \n\t"
1400 "add $24, %%"REG_a
" \n\t"
1404 : "r" (src
-mmx_size
), "r"(dst
-mmx_size
)
1407 __asm__
volatile(SFENCE:::"memory");
1408 __asm__
volatile(EMMS:::"memory");
1410 if (mmx_size
==23) return; //finished, was multiple of 8
1414 src_size
= 23-mmx_size
;
1418 for (i
=0; i
<src_size
; i
+=3) {
1421 dst
[i
+ 1] = src
[i
+ 1];
1422 dst
[i
+ 2] = src
[i
+ 0];
1427 static inline void RENAME(yuvPlanartoyuy2
)(const uint8_t *ysrc
, const uint8_t *usrc
, const uint8_t *vsrc
, uint8_t *dst
,
1428 long width
, long height
,
1429 long lumStride
, long chromStride
, long dstStride
, long vertLumPerChroma
)
1432 const x86_reg chromWidth
= width
>>1;
1433 for (y
=0; y
<height
; y
++) {
1435 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1437 "xor %%"REG_a
", %%"REG_a
" \n\t"
1440 PREFETCH
" 32(%1, %%"REG_a
", 2) \n\t"
1441 PREFETCH
" 32(%2, %%"REG_a
") \n\t"
1442 PREFETCH
" 32(%3, %%"REG_a
") \n\t"
1443 "movq (%2, %%"REG_a
"), %%mm0 \n\t" // U(0)
1444 "movq %%mm0, %%mm2 \n\t" // U(0)
1445 "movq (%3, %%"REG_a
"), %%mm1 \n\t" // V(0)
1446 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1447 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1449 "movq (%1, %%"REG_a
",2), %%mm3 \n\t" // Y(0)
1450 "movq 8(%1, %%"REG_a
",2), %%mm5 \n\t" // Y(8)
1451 "movq %%mm3, %%mm4 \n\t" // Y(0)
1452 "movq %%mm5, %%mm6 \n\t" // Y(8)
1453 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1454 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1455 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1456 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1458 MOVNTQ
" %%mm3, (%0, %%"REG_a
", 4) \n\t"
1459 MOVNTQ
" %%mm4, 8(%0, %%"REG_a
", 4) \n\t"
1460 MOVNTQ
" %%mm5, 16(%0, %%"REG_a
", 4) \n\t"
1461 MOVNTQ
" %%mm6, 24(%0, %%"REG_a
", 4) \n\t"
1463 "add $8, %%"REG_a
" \n\t"
1464 "cmp %4, %%"REG_a
" \n\t"
1466 ::"r"(dst
), "r"(ysrc
), "r"(usrc
), "r"(vsrc
), "g" (chromWidth
)
1471 #if ARCH_ALPHA && HAVE_MVI
1472 #define pl2yuy2(n) \
1477 __asm__("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
1478 __asm__("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
1479 __asm__("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
1480 __asm__("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
1481 yuv1 = (u << 8) + (v << 24); \
1488 uint64_t *qdst
= (uint64_t *) dst
;
1489 uint64_t *qdst2
= (uint64_t *) (dst
+ dstStride
);
1490 const uint32_t *yc
= (uint32_t *) ysrc
;
1491 const uint32_t *yc2
= (uint32_t *) (ysrc
+ lumStride
);
1492 const uint16_t *uc
= (uint16_t*) usrc
, *vc
= (uint16_t*) vsrc
;
1493 for (i
= 0; i
< chromWidth
; i
+= 8) {
1494 uint64_t y1
, y2
, yuv1
, yuv2
;
1497 __asm__("ldq $31,64(%0)" :: "r"(yc
));
1498 __asm__("ldq $31,64(%0)" :: "r"(yc2
));
1499 __asm__("ldq $31,64(%0)" :: "r"(uc
));
1500 __asm__("ldq $31,64(%0)" :: "r"(vc
));
1518 #elif HAVE_FAST_64BIT
1520 uint64_t *ldst
= (uint64_t *) dst
;
1521 const uint8_t *yc
= ysrc
, *uc
= usrc
, *vc
= vsrc
;
1522 for (i
= 0; i
< chromWidth
; i
+= 2) {
1524 k
= yc
[0] + (uc
[0] << 8) +
1525 (yc
[1] << 16) + (vc
[0] << 24);
1526 l
= yc
[2] + (uc
[1] << 8) +
1527 (yc
[3] << 16) + (vc
[1] << 24);
1528 *ldst
++ = k
+ (l
<< 32);
1535 int i
, *idst
= (int32_t *) dst
;
1536 const uint8_t *yc
= ysrc
, *uc
= usrc
, *vc
= vsrc
;
1537 for (i
= 0; i
< chromWidth
; i
++) {
1539 *idst
++ = (yc
[0] << 24)+ (uc
[0] << 16) +
1540 (yc
[1] << 8) + (vc
[0] << 0);
1542 *idst
++ = yc
[0] + (uc
[0] << 8) +
1543 (yc
[1] << 16) + (vc
[0] << 24);
1551 if ((y
&(vertLumPerChroma
-1)) == vertLumPerChroma
-1) {
1552 usrc
+= chromStride
;
1553 vsrc
+= chromStride
;
1566 * Height should be a multiple of 2 and width should be a multiple of 16.
1567 * (If this is a problem for anyone then tell me, and I will fix it.)
1569 static inline void RENAME(yv12toyuy2
)(const uint8_t *ysrc
, const uint8_t *usrc
, const uint8_t *vsrc
, uint8_t *dst
,
1570 long width
, long height
,
1571 long lumStride
, long chromStride
, long dstStride
)
1573 //FIXME interpolate chroma
1574 RENAME(yuvPlanartoyuy2
)(ysrc
, usrc
, vsrc
, dst
, width
, height
, lumStride
, chromStride
, dstStride
, 2);
1577 static inline void RENAME(yuvPlanartouyvy
)(const uint8_t *ysrc
, const uint8_t *usrc
, const uint8_t *vsrc
, uint8_t *dst
,
1578 long width
, long height
,
1579 long lumStride
, long chromStride
, long dstStride
, long vertLumPerChroma
)
1582 const x86_reg chromWidth
= width
>>1;
1583 for (y
=0; y
<height
; y
++) {
1585 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1587 "xor %%"REG_a
", %%"REG_a
" \n\t"
1590 PREFETCH
" 32(%1, %%"REG_a
", 2) \n\t"
1591 PREFETCH
" 32(%2, %%"REG_a
") \n\t"
1592 PREFETCH
" 32(%3, %%"REG_a
") \n\t"
1593 "movq (%2, %%"REG_a
"), %%mm0 \n\t" // U(0)
1594 "movq %%mm0, %%mm2 \n\t" // U(0)
1595 "movq (%3, %%"REG_a
"), %%mm1 \n\t" // V(0)
1596 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1597 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1599 "movq (%1, %%"REG_a
",2), %%mm3 \n\t" // Y(0)
1600 "movq 8(%1, %%"REG_a
",2), %%mm5 \n\t" // Y(8)
1601 "movq %%mm0, %%mm4 \n\t" // Y(0)
1602 "movq %%mm2, %%mm6 \n\t" // Y(8)
1603 "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
1604 "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
1605 "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
1606 "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)
1608 MOVNTQ
" %%mm0, (%0, %%"REG_a
", 4) \n\t"
1609 MOVNTQ
" %%mm4, 8(%0, %%"REG_a
", 4) \n\t"
1610 MOVNTQ
" %%mm2, 16(%0, %%"REG_a
", 4) \n\t"
1611 MOVNTQ
" %%mm6, 24(%0, %%"REG_a
", 4) \n\t"
1613 "add $8, %%"REG_a
" \n\t"
1614 "cmp %4, %%"REG_a
" \n\t"
1616 ::"r"(dst
), "r"(ysrc
), "r"(usrc
), "r"(vsrc
), "g" (chromWidth
)
1620 //FIXME adapt the Alpha ASM code from yv12->yuy2
1624 uint64_t *ldst
= (uint64_t *) dst
;
1625 const uint8_t *yc
= ysrc
, *uc
= usrc
, *vc
= vsrc
;
1626 for (i
= 0; i
< chromWidth
; i
+= 2) {
1628 k
= uc
[0] + (yc
[0] << 8) +
1629 (vc
[0] << 16) + (yc
[1] << 24);
1630 l
= uc
[1] + (yc
[2] << 8) +
1631 (vc
[1] << 16) + (yc
[3] << 24);
1632 *ldst
++ = k
+ (l
<< 32);
1639 int i
, *idst
= (int32_t *) dst
;
1640 const uint8_t *yc
= ysrc
, *uc
= usrc
, *vc
= vsrc
;
1641 for (i
= 0; i
< chromWidth
; i
++) {
1643 *idst
++ = (uc
[0] << 24)+ (yc
[0] << 16) +
1644 (vc
[0] << 8) + (yc
[1] << 0);
1646 *idst
++ = uc
[0] + (yc
[0] << 8) +
1647 (vc
[0] << 16) + (yc
[1] << 24);
1655 if ((y
&(vertLumPerChroma
-1)) == vertLumPerChroma
-1) {
1656 usrc
+= chromStride
;
1657 vsrc
+= chromStride
;
1670 * Height should be a multiple of 2 and width should be a multiple of 16
1671 * (If this is a problem for anyone then tell me, and I will fix it.)
1673 static inline void RENAME(yv12touyvy
)(const uint8_t *ysrc
, const uint8_t *usrc
, const uint8_t *vsrc
, uint8_t *dst
,
1674 long width
, long height
,
1675 long lumStride
, long chromStride
, long dstStride
)
1677 //FIXME interpolate chroma
1678 RENAME(yuvPlanartouyvy
)(ysrc
, usrc
, vsrc
, dst
, width
, height
, lumStride
, chromStride
, dstStride
, 2);
1682 * Width should be a multiple of 16.
1684 static inline void RENAME(yuv422ptouyvy
)(const uint8_t *ysrc
, const uint8_t *usrc
, const uint8_t *vsrc
, uint8_t *dst
,
1685 long width
, long height
,
1686 long lumStride
, long chromStride
, long dstStride
)
1688 RENAME(yuvPlanartouyvy
)(ysrc
, usrc
, vsrc
, dst
, width
, height
, lumStride
, chromStride
, dstStride
, 1);
1692 * Width should be a multiple of 16.
1694 static inline void RENAME(yuv422ptoyuy2
)(const uint8_t *ysrc
, const uint8_t *usrc
, const uint8_t *vsrc
, uint8_t *dst
,
1695 long width
, long height
,
1696 long lumStride
, long chromStride
, long dstStride
)
1698 RENAME(yuvPlanartoyuy2
)(ysrc
, usrc
, vsrc
, dst
, width
, height
, lumStride
, chromStride
, dstStride
, 1);
1702 * Height should be a multiple of 2 and width should be a multiple of 16.
1703 * (If this is a problem for anyone then tell me, and I will fix it.)
1705 static inline void RENAME(yuy2toyv12
)(const uint8_t *src
, uint8_t *ydst
, uint8_t *udst
, uint8_t *vdst
,
1706 long width
, long height
,
1707 long lumStride
, long chromStride
, long srcStride
)
1710 const x86_reg chromWidth
= width
>>1;
1711 for (y
=0; y
<height
; y
+=2) {
1714 "xor %%"REG_a
", %%"REG_a
" \n\t"
1715 "pcmpeqw %%mm7, %%mm7 \n\t"
1716 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1719 PREFETCH
" 64(%0, %%"REG_a
", 4) \n\t"
1720 "movq (%0, %%"REG_a
", 4), %%mm0 \n\t" // YUYV YUYV(0)
1721 "movq 8(%0, %%"REG_a
", 4), %%mm1 \n\t" // YUYV YUYV(4)
1722 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1723 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1724 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1725 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1726 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1727 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1728 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1729 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1731 MOVNTQ
" %%mm2, (%1, %%"REG_a
", 2) \n\t"
1733 "movq 16(%0, %%"REG_a
", 4), %%mm1 \n\t" // YUYV YUYV(8)
1734 "movq 24(%0, %%"REG_a
", 4), %%mm2 \n\t" // YUYV YUYV(12)
1735 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1736 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1737 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1738 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1739 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1740 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1741 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1742 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1744 MOVNTQ
" %%mm3, 8(%1, %%"REG_a
", 2) \n\t"
1746 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1747 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1748 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1749 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1750 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1751 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1752 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1753 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1755 MOVNTQ
" %%mm0, (%3, %%"REG_a
") \n\t"
1756 MOVNTQ
" %%mm2, (%2, %%"REG_a
") \n\t"
1758 "add $8, %%"REG_a
" \n\t"
1759 "cmp %4, %%"REG_a
" \n\t"
1761 ::"r"(src
), "r"(ydst
), "r"(udst
), "r"(vdst
), "g" (chromWidth
)
1762 : "memory", "%"REG_a
1769 "xor %%"REG_a
", %%"REG_a
" \n\t"
1772 PREFETCH
" 64(%0, %%"REG_a
", 4) \n\t"
1773 "movq (%0, %%"REG_a
", 4), %%mm0 \n\t" // YUYV YUYV(0)
1774 "movq 8(%0, %%"REG_a
", 4), %%mm1 \n\t" // YUYV YUYV(4)
1775 "movq 16(%0, %%"REG_a
", 4), %%mm2 \n\t" // YUYV YUYV(8)
1776 "movq 24(%0, %%"REG_a
", 4), %%mm3 \n\t" // YUYV YUYV(12)
1777 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1778 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1779 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1780 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1781 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1782 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1784 MOVNTQ
" %%mm0, (%1, %%"REG_a
", 2) \n\t"
1785 MOVNTQ
" %%mm2, 8(%1, %%"REG_a
", 2) \n\t"
1787 "add $8, %%"REG_a
" \n\t"
1788 "cmp %4, %%"REG_a
" \n\t"
1791 ::"r"(src
), "r"(ydst
), "r"(udst
), "r"(vdst
), "g" (chromWidth
)
1792 : "memory", "%"REG_a
1796 for (i
=0; i
<chromWidth
; i
++) {
1797 ydst
[2*i
+0] = src
[4*i
+0];
1798 udst
[i
] = src
[4*i
+1];
1799 ydst
[2*i
+1] = src
[4*i
+2];
1800 vdst
[i
] = src
[4*i
+3];
1805 for (i
=0; i
<chromWidth
; i
++) {
1806 ydst
[2*i
+0] = src
[4*i
+0];
1807 ydst
[2*i
+1] = src
[4*i
+2];
1810 udst
+= chromStride
;
1811 vdst
+= chromStride
;
1816 __asm__
volatile(EMMS
" \n\t"
1822 static inline void RENAME(yvu9toyv12
)(const uint8_t *ysrc
, const uint8_t *usrc
, const uint8_t *vsrc
,
1823 uint8_t *ydst
, uint8_t *udst
, uint8_t *vdst
,
1824 long width
, long height
, long lumStride
, long chromStride
)
1827 memcpy(ydst
, ysrc
, width
*height
);
1829 /* XXX: implement upscaling for U,V */
1832 static inline void RENAME(planar2x
)(const uint8_t *src
, uint8_t *dst
, long srcWidth
, long srcHeight
, long srcStride
, long dstStride
)
1839 for (x
=0; x
<srcWidth
-1; x
++) {
1840 dst
[2*x
+1]= (3*src
[x
] + src
[x
+1])>>2;
1841 dst
[2*x
+2]= ( src
[x
] + 3*src
[x
+1])>>2;
1843 dst
[2*srcWidth
-1]= src
[srcWidth
-1];
1847 for (y
=1; y
<srcHeight
; y
++) {
1848 #if HAVE_MMX2 || HAVE_AMD3DNOW
1849 const x86_reg mmxSize
= srcWidth
&~15;
1851 "mov %4, %%"REG_a
" \n\t"
1853 "movq (%0, %%"REG_a
"), %%mm0 \n\t"
1854 "movq (%1, %%"REG_a
"), %%mm1 \n\t"
1855 "movq 1(%0, %%"REG_a
"), %%mm2 \n\t"
1856 "movq 1(%1, %%"REG_a
"), %%mm3 \n\t"
1857 "movq -1(%0, %%"REG_a
"), %%mm4 \n\t"
1858 "movq -1(%1, %%"REG_a
"), %%mm5 \n\t"
1859 PAVGB
" %%mm0, %%mm5 \n\t"
1860 PAVGB
" %%mm0, %%mm3 \n\t"
1861 PAVGB
" %%mm0, %%mm5 \n\t"
1862 PAVGB
" %%mm0, %%mm3 \n\t"
1863 PAVGB
" %%mm1, %%mm4 \n\t"
1864 PAVGB
" %%mm1, %%mm2 \n\t"
1865 PAVGB
" %%mm1, %%mm4 \n\t"
1866 PAVGB
" %%mm1, %%mm2 \n\t"
1867 "movq %%mm5, %%mm7 \n\t"
1868 "movq %%mm4, %%mm6 \n\t"
1869 "punpcklbw %%mm3, %%mm5 \n\t"
1870 "punpckhbw %%mm3, %%mm7 \n\t"
1871 "punpcklbw %%mm2, %%mm4 \n\t"
1872 "punpckhbw %%mm2, %%mm6 \n\t"
1874 MOVNTQ
" %%mm5, (%2, %%"REG_a
", 2) \n\t"
1875 MOVNTQ
" %%mm7, 8(%2, %%"REG_a
", 2) \n\t"
1876 MOVNTQ
" %%mm4, (%3, %%"REG_a
", 2) \n\t"
1877 MOVNTQ
" %%mm6, 8(%3, %%"REG_a
", 2) \n\t"
1879 "movq %%mm5, (%2, %%"REG_a
", 2) \n\t"
1880 "movq %%mm7, 8(%2, %%"REG_a
", 2) \n\t"
1881 "movq %%mm4, (%3, %%"REG_a
", 2) \n\t"
1882 "movq %%mm6, 8(%3, %%"REG_a
", 2) \n\t"
1884 "add $8, %%"REG_a
" \n\t"
1886 :: "r" (src
+ mmxSize
), "r" (src
+ srcStride
+ mmxSize
),
1887 "r" (dst
+ mmxSize
*2), "r" (dst
+ dstStride
+ mmxSize
*2),
1893 const x86_reg mmxSize
=1;
1895 dst
[0 ]= (3*src
[0] + src
[srcStride
])>>2;
1896 dst
[dstStride
]= ( src
[0] + 3*src
[srcStride
])>>2;
1898 for (x
=mmxSize
-1; x
<srcWidth
-1; x
++) {
1899 dst
[2*x
+1]= (3*src
[x
+0] + src
[x
+srcStride
+1])>>2;
1900 dst
[2*x
+dstStride
+2]= ( src
[x
+0] + 3*src
[x
+srcStride
+1])>>2;
1901 dst
[2*x
+dstStride
+1]= ( src
[x
+1] + 3*src
[x
+srcStride
])>>2;
1902 dst
[2*x
+2]= (3*src
[x
+1] + src
[x
+srcStride
])>>2;
1904 dst
[srcWidth
*2 -1 ]= (3*src
[srcWidth
-1] + src
[srcWidth
-1 + srcStride
])>>2;
1905 dst
[srcWidth
*2 -1 + dstStride
]= ( src
[srcWidth
-1] + 3*src
[srcWidth
-1 + srcStride
])>>2;
1915 for (x
=0; x
<srcWidth
-1; x
++) {
1916 dst
[2*x
+1]= (3*src
[x
] + src
[x
+1])>>2;
1917 dst
[2*x
+2]= ( src
[x
] + 3*src
[x
+1])>>2;
1919 dst
[2*srcWidth
-1]= src
[srcWidth
-1];
1921 for (x
=0; x
<srcWidth
; x
++) {
1928 __asm__
volatile(EMMS
" \n\t"
1935 * Height should be a multiple of 2 and width should be a multiple of 16.
1936 * (If this is a problem for anyone then tell me, and I will fix it.)
1937 * Chrominance data is only taken from every second line, others are ignored.
1938 * FIXME: Write HQ version.
1940 static inline void RENAME(uyvytoyv12
)(const uint8_t *src
, uint8_t *ydst
, uint8_t *udst
, uint8_t *vdst
,
1941 long width
, long height
,
1942 long lumStride
, long chromStride
, long srcStride
)
1945 const x86_reg chromWidth
= width
>>1;
1946 for (y
=0; y
<height
; y
+=2) {
1949 "xor %%"REG_a
", %%"REG_a
" \n\t"
1950 "pcmpeqw %%mm7, %%mm7 \n\t"
1951 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1954 PREFETCH
" 64(%0, %%"REG_a
", 4) \n\t"
1955 "movq (%0, %%"REG_a
", 4), %%mm0 \n\t" // UYVY UYVY(0)
1956 "movq 8(%0, %%"REG_a
", 4), %%mm1 \n\t" // UYVY UYVY(4)
1957 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
1958 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
1959 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
1960 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
1961 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1962 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1963 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1964 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1966 MOVNTQ
" %%mm2, (%1, %%"REG_a
", 2) \n\t"
1968 "movq 16(%0, %%"REG_a
", 4), %%mm1 \n\t" // UYVY UYVY(8)
1969 "movq 24(%0, %%"REG_a
", 4), %%mm2 \n\t" // UYVY UYVY(12)
1970 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
1971 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
1972 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
1973 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
1974 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1975 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1976 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1977 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1979 MOVNTQ
" %%mm3, 8(%1, %%"REG_a
", 2) \n\t"
1981 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1982 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1983 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1984 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1985 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1986 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1987 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1988 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1990 MOVNTQ
" %%mm0, (%3, %%"REG_a
") \n\t"
1991 MOVNTQ
" %%mm2, (%2, %%"REG_a
") \n\t"
1993 "add $8, %%"REG_a
" \n\t"
1994 "cmp %4, %%"REG_a
" \n\t"
1996 ::"r"(src
), "r"(ydst
), "r"(udst
), "r"(vdst
), "g" (chromWidth
)
1997 : "memory", "%"REG_a
2004 "xor %%"REG_a
", %%"REG_a
" \n\t"
2007 PREFETCH
" 64(%0, %%"REG_a
", 4) \n\t"
2008 "movq (%0, %%"REG_a
", 4), %%mm0 \n\t" // YUYV YUYV(0)
2009 "movq 8(%0, %%"REG_a
", 4), %%mm1 \n\t" // YUYV YUYV(4)
2010 "movq 16(%0, %%"REG_a
", 4), %%mm2 \n\t" // YUYV YUYV(8)
2011 "movq 24(%0, %%"REG_a
", 4), %%mm3 \n\t" // YUYV YUYV(12)
2012 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
2013 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
2014 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
2015 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
2016 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
2017 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
2019 MOVNTQ
" %%mm0, (%1, %%"REG_a
", 2) \n\t"
2020 MOVNTQ
" %%mm2, 8(%1, %%"REG_a
", 2) \n\t"
2022 "add $8, %%"REG_a
" \n\t"
2023 "cmp %4, %%"REG_a
" \n\t"
2026 ::"r"(src
), "r"(ydst
), "r"(udst
), "r"(vdst
), "g" (chromWidth
)
2027 : "memory", "%"REG_a
2031 for (i
=0; i
<chromWidth
; i
++) {
2032 udst
[i
] = src
[4*i
+0];
2033 ydst
[2*i
+0] = src
[4*i
+1];
2034 vdst
[i
] = src
[4*i
+2];
2035 ydst
[2*i
+1] = src
[4*i
+3];
2040 for (i
=0; i
<chromWidth
; i
++) {
2041 ydst
[2*i
+0] = src
[4*i
+1];
2042 ydst
[2*i
+1] = src
[4*i
+3];
2045 udst
+= chromStride
;
2046 vdst
+= chromStride
;
2051 __asm__
volatile(EMMS
" \n\t"
2058 * Height should be a multiple of 2 and width should be a multiple of 2.
2059 * (If this is a problem for anyone then tell me, and I will fix it.)
2060 * Chrominance data is only taken from every second line,
2061 * others are ignored in the C version.
2062 * FIXME: Write HQ version.
2064 static inline void RENAME(rgb24toyv12
)(const uint8_t *src
, uint8_t *ydst
, uint8_t *udst
, uint8_t *vdst
,
2065 long width
, long height
,
2066 long lumStride
, long chromStride
, long srcStride
)
2069 const x86_reg chromWidth
= width
>>1;
2071 for (y
=0; y
<height
-2; y
+=2) {
2073 for (i
=0; i
<2; i
++) {
2075 "mov %2, %%"REG_a
" \n\t"
2076 "movq "MANGLE(ff_bgr2YCoeff
)", %%mm6 \n\t"
2077 "movq "MANGLE(ff_w1111
)", %%mm5 \n\t"
2078 "pxor %%mm7, %%mm7 \n\t"
2079 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_d
" \n\t"
2082 PREFETCH
" 64(%0, %%"REG_d
") \n\t"
2083 "movd (%0, %%"REG_d
"), %%mm0 \n\t"
2084 "movd 3(%0, %%"REG_d
"), %%mm1 \n\t"
2085 "punpcklbw %%mm7, %%mm0 \n\t"
2086 "punpcklbw %%mm7, %%mm1 \n\t"
2087 "movd 6(%0, %%"REG_d
"), %%mm2 \n\t"
2088 "movd 9(%0, %%"REG_d
"), %%mm3 \n\t"
2089 "punpcklbw %%mm7, %%mm2 \n\t"
2090 "punpcklbw %%mm7, %%mm3 \n\t"
2091 "pmaddwd %%mm6, %%mm0 \n\t"
2092 "pmaddwd %%mm6, %%mm1 \n\t"
2093 "pmaddwd %%mm6, %%mm2 \n\t"
2094 "pmaddwd %%mm6, %%mm3 \n\t"
2095 #ifndef FAST_BGR2YV12
2096 "psrad $8, %%mm0 \n\t"
2097 "psrad $8, %%mm1 \n\t"
2098 "psrad $8, %%mm2 \n\t"
2099 "psrad $8, %%mm3 \n\t"
2101 "packssdw %%mm1, %%mm0 \n\t"
2102 "packssdw %%mm3, %%mm2 \n\t"
2103 "pmaddwd %%mm5, %%mm0 \n\t"
2104 "pmaddwd %%mm5, %%mm2 \n\t"
2105 "packssdw %%mm2, %%mm0 \n\t"
2106 "psraw $7, %%mm0 \n\t"
2108 "movd 12(%0, %%"REG_d
"), %%mm4 \n\t"
2109 "movd 15(%0, %%"REG_d
"), %%mm1 \n\t"
2110 "punpcklbw %%mm7, %%mm4 \n\t"
2111 "punpcklbw %%mm7, %%mm1 \n\t"
2112 "movd 18(%0, %%"REG_d
"), %%mm2 \n\t"
2113 "movd 21(%0, %%"REG_d
"), %%mm3 \n\t"
2114 "punpcklbw %%mm7, %%mm2 \n\t"
2115 "punpcklbw %%mm7, %%mm3 \n\t"
2116 "pmaddwd %%mm6, %%mm4 \n\t"
2117 "pmaddwd %%mm6, %%mm1 \n\t"
2118 "pmaddwd %%mm6, %%mm2 \n\t"
2119 "pmaddwd %%mm6, %%mm3 \n\t"
2120 #ifndef FAST_BGR2YV12
2121 "psrad $8, %%mm4 \n\t"
2122 "psrad $8, %%mm1 \n\t"
2123 "psrad $8, %%mm2 \n\t"
2124 "psrad $8, %%mm3 \n\t"
2126 "packssdw %%mm1, %%mm4 \n\t"
2127 "packssdw %%mm3, %%mm2 \n\t"
2128 "pmaddwd %%mm5, %%mm4 \n\t"
2129 "pmaddwd %%mm5, %%mm2 \n\t"
2130 "add $24, %%"REG_d
" \n\t"
2131 "packssdw %%mm2, %%mm4 \n\t"
2132 "psraw $7, %%mm4 \n\t"
2134 "packuswb %%mm4, %%mm0 \n\t"
2135 "paddusb "MANGLE(ff_bgr2YOffset
)", %%mm0 \n\t"
2137 MOVNTQ
" %%mm0, (%1, %%"REG_a
") \n\t"
2138 "add $8, %%"REG_a
" \n\t"
2140 : : "r" (src
+width
*3), "r" (ydst
+width
), "g" ((x86_reg
)-width
)
2141 : "%"REG_a
, "%"REG_d
2148 "mov %4, %%"REG_a
" \n\t"
2149 "movq "MANGLE(ff_w1111
)", %%mm5 \n\t"
2150 "movq "MANGLE(ff_bgr2UCoeff
)", %%mm6 \n\t"
2151 "pxor %%mm7, %%mm7 \n\t"
2152 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_d
" \n\t"
2153 "add %%"REG_d
", %%"REG_d
" \n\t"
2156 PREFETCH
" 64(%0, %%"REG_d
") \n\t"
2157 PREFETCH
" 64(%1, %%"REG_d
") \n\t"
2158 #if HAVE_MMX2 || HAVE_AMD3DNOW
2159 "movq (%0, %%"REG_d
"), %%mm0 \n\t"
2160 "movq (%1, %%"REG_d
"), %%mm1 \n\t"
2161 "movq 6(%0, %%"REG_d
"), %%mm2 \n\t"
2162 "movq 6(%1, %%"REG_d
"), %%mm3 \n\t"
2163 PAVGB
" %%mm1, %%mm0 \n\t"
2164 PAVGB
" %%mm3, %%mm2 \n\t"
2165 "movq %%mm0, %%mm1 \n\t"
2166 "movq %%mm2, %%mm3 \n\t"
2167 "psrlq $24, %%mm0 \n\t"
2168 "psrlq $24, %%mm2 \n\t"
2169 PAVGB
" %%mm1, %%mm0 \n\t"
2170 PAVGB
" %%mm3, %%mm2 \n\t"
2171 "punpcklbw %%mm7, %%mm0 \n\t"
2172 "punpcklbw %%mm7, %%mm2 \n\t"
2174 "movd (%0, %%"REG_d
"), %%mm0 \n\t"
2175 "movd (%1, %%"REG_d
"), %%mm1 \n\t"
2176 "movd 3(%0, %%"REG_d
"), %%mm2 \n\t"
2177 "movd 3(%1, %%"REG_d
"), %%mm3 \n\t"
2178 "punpcklbw %%mm7, %%mm0 \n\t"
2179 "punpcklbw %%mm7, %%mm1 \n\t"
2180 "punpcklbw %%mm7, %%mm2 \n\t"
2181 "punpcklbw %%mm7, %%mm3 \n\t"
2182 "paddw %%mm1, %%mm0 \n\t"
2183 "paddw %%mm3, %%mm2 \n\t"
2184 "paddw %%mm2, %%mm0 \n\t"
2185 "movd 6(%0, %%"REG_d
"), %%mm4 \n\t"
2186 "movd 6(%1, %%"REG_d
"), %%mm1 \n\t"
2187 "movd 9(%0, %%"REG_d
"), %%mm2 \n\t"
2188 "movd 9(%1, %%"REG_d
"), %%mm3 \n\t"
2189 "punpcklbw %%mm7, %%mm4 \n\t"
2190 "punpcklbw %%mm7, %%mm1 \n\t"
2191 "punpcklbw %%mm7, %%mm2 \n\t"
2192 "punpcklbw %%mm7, %%mm3 \n\t"
2193 "paddw %%mm1, %%mm4 \n\t"
2194 "paddw %%mm3, %%mm2 \n\t"
2195 "paddw %%mm4, %%mm2 \n\t"
2196 "psrlw $2, %%mm0 \n\t"
2197 "psrlw $2, %%mm2 \n\t"
2199 "movq "MANGLE(ff_bgr2VCoeff
)", %%mm1 \n\t"
2200 "movq "MANGLE(ff_bgr2VCoeff
)", %%mm3 \n\t"
2202 "pmaddwd %%mm0, %%mm1 \n\t"
2203 "pmaddwd %%mm2, %%mm3 \n\t"
2204 "pmaddwd %%mm6, %%mm0 \n\t"
2205 "pmaddwd %%mm6, %%mm2 \n\t"
2206 #ifndef FAST_BGR2YV12
2207 "psrad $8, %%mm0 \n\t"
2208 "psrad $8, %%mm1 \n\t"
2209 "psrad $8, %%mm2 \n\t"
2210 "psrad $8, %%mm3 \n\t"
2212 "packssdw %%mm2, %%mm0 \n\t"
2213 "packssdw %%mm3, %%mm1 \n\t"
2214 "pmaddwd %%mm5, %%mm0 \n\t"
2215 "pmaddwd %%mm5, %%mm1 \n\t"
2216 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2217 "psraw $7, %%mm0 \n\t"
2219 #if HAVE_MMX2 || HAVE_AMD3DNOW
2220 "movq 12(%0, %%"REG_d
"), %%mm4 \n\t"
2221 "movq 12(%1, %%"REG_d
"), %%mm1 \n\t"
2222 "movq 18(%0, %%"REG_d
"), %%mm2 \n\t"
2223 "movq 18(%1, %%"REG_d
"), %%mm3 \n\t"
2224 PAVGB
" %%mm1, %%mm4 \n\t"
2225 PAVGB
" %%mm3, %%mm2 \n\t"
2226 "movq %%mm4, %%mm1 \n\t"
2227 "movq %%mm2, %%mm3 \n\t"
2228 "psrlq $24, %%mm4 \n\t"
2229 "psrlq $24, %%mm2 \n\t"
2230 PAVGB
" %%mm1, %%mm4 \n\t"
2231 PAVGB
" %%mm3, %%mm2 \n\t"
2232 "punpcklbw %%mm7, %%mm4 \n\t"
2233 "punpcklbw %%mm7, %%mm2 \n\t"
2235 "movd 12(%0, %%"REG_d
"), %%mm4 \n\t"
2236 "movd 12(%1, %%"REG_d
"), %%mm1 \n\t"
2237 "movd 15(%0, %%"REG_d
"), %%mm2 \n\t"
2238 "movd 15(%1, %%"REG_d
"), %%mm3 \n\t"
2239 "punpcklbw %%mm7, %%mm4 \n\t"
2240 "punpcklbw %%mm7, %%mm1 \n\t"
2241 "punpcklbw %%mm7, %%mm2 \n\t"
2242 "punpcklbw %%mm7, %%mm3 \n\t"
2243 "paddw %%mm1, %%mm4 \n\t"
2244 "paddw %%mm3, %%mm2 \n\t"
2245 "paddw %%mm2, %%mm4 \n\t"
2246 "movd 18(%0, %%"REG_d
"), %%mm5 \n\t"
2247 "movd 18(%1, %%"REG_d
"), %%mm1 \n\t"
2248 "movd 21(%0, %%"REG_d
"), %%mm2 \n\t"
2249 "movd 21(%1, %%"REG_d
"), %%mm3 \n\t"
2250 "punpcklbw %%mm7, %%mm5 \n\t"
2251 "punpcklbw %%mm7, %%mm1 \n\t"
2252 "punpcklbw %%mm7, %%mm2 \n\t"
2253 "punpcklbw %%mm7, %%mm3 \n\t"
2254 "paddw %%mm1, %%mm5 \n\t"
2255 "paddw %%mm3, %%mm2 \n\t"
2256 "paddw %%mm5, %%mm2 \n\t"
2257 "movq "MANGLE(ff_w1111
)", %%mm5 \n\t"
2258 "psrlw $2, %%mm4 \n\t"
2259 "psrlw $2, %%mm2 \n\t"
2261 "movq "MANGLE(ff_bgr2VCoeff
)", %%mm1 \n\t"
2262 "movq "MANGLE(ff_bgr2VCoeff
)", %%mm3 \n\t"
2264 "pmaddwd %%mm4, %%mm1 \n\t"
2265 "pmaddwd %%mm2, %%mm3 \n\t"
2266 "pmaddwd %%mm6, %%mm4 \n\t"
2267 "pmaddwd %%mm6, %%mm2 \n\t"
2268 #ifndef FAST_BGR2YV12
2269 "psrad $8, %%mm4 \n\t"
2270 "psrad $8, %%mm1 \n\t"
2271 "psrad $8, %%mm2 \n\t"
2272 "psrad $8, %%mm3 \n\t"
2274 "packssdw %%mm2, %%mm4 \n\t"
2275 "packssdw %%mm3, %%mm1 \n\t"
2276 "pmaddwd %%mm5, %%mm4 \n\t"
2277 "pmaddwd %%mm5, %%mm1 \n\t"
2278 "add $24, %%"REG_d
" \n\t"
2279 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2280 "psraw $7, %%mm4 \n\t"
2282 "movq %%mm0, %%mm1 \n\t"
2283 "punpckldq %%mm4, %%mm0 \n\t"
2284 "punpckhdq %%mm4, %%mm1 \n\t"
2285 "packsswb %%mm1, %%mm0 \n\t"
2286 "paddb "MANGLE(ff_bgr2UVOffset
)", %%mm0 \n\t"
2287 "movd %%mm0, (%2, %%"REG_a
") \n\t"
2288 "punpckhdq %%mm0, %%mm0 \n\t"
2289 "movd %%mm0, (%3, %%"REG_a
") \n\t"
2290 "add $4, %%"REG_a
" \n\t"
2292 : : "r" (src
+chromWidth
*6), "r" (src
+srcStride
+chromWidth
*6), "r" (udst
+chromWidth
), "r" (vdst
+chromWidth
), "g" (-chromWidth
)
2293 : "%"REG_a
, "%"REG_d
2296 udst
+= chromStride
;
2297 vdst
+= chromStride
;
2301 __asm__
volatile(EMMS
" \n\t"
2307 for (; y
<height
; y
+=2) {
2309 for (i
=0; i
<chromWidth
; i
++) {
2310 unsigned int b
= src
[6*i
+0];
2311 unsigned int g
= src
[6*i
+1];
2312 unsigned int r
= src
[6*i
+2];
2314 unsigned int Y
= ((RY
*r
+ GY
*g
+ BY
*b
)>>RGB2YUV_SHIFT
) + 16;
2315 unsigned int V
= ((RV
*r
+ GV
*g
+ BV
*b
)>>RGB2YUV_SHIFT
) + 128;
2316 unsigned int U
= ((RU
*r
+ GU
*g
+ BU
*b
)>>RGB2YUV_SHIFT
) + 128;
2326 Y
= ((RY
*r
+ GY
*g
+ BY
*b
)>>RGB2YUV_SHIFT
) + 16;
2332 for (i
=0; i
<chromWidth
; i
++) {
2333 unsigned int b
= src
[6*i
+0];
2334 unsigned int g
= src
[6*i
+1];
2335 unsigned int r
= src
[6*i
+2];
2337 unsigned int Y
= ((RY
*r
+ GY
*g
+ BY
*b
)>>RGB2YUV_SHIFT
) + 16;
2345 Y
= ((RY
*r
+ GY
*g
+ BY
*b
)>>RGB2YUV_SHIFT
) + 16;
2348 udst
+= chromStride
;
2349 vdst
+= chromStride
;
2355 static void RENAME(interleaveBytes
)(const uint8_t *src1
, const uint8_t *src2
, uint8_t *dest
,
2356 long width
, long height
, long src1Stride
,
2357 long src2Stride
, long dstStride
)
2361 for (h
=0; h
< height
; h
++) {
2367 "xor %%"REG_a
", %%"REG_a
" \n\t"
2369 PREFETCH
" 64(%1, %%"REG_a
") \n\t"
2370 PREFETCH
" 64(%2, %%"REG_a
") \n\t"
2371 "movdqa (%1, %%"REG_a
"), %%xmm0 \n\t"
2372 "movdqa (%1, %%"REG_a
"), %%xmm1 \n\t"
2373 "movdqa (%2, %%"REG_a
"), %%xmm2 \n\t"
2374 "punpcklbw %%xmm2, %%xmm0 \n\t"
2375 "punpckhbw %%xmm2, %%xmm1 \n\t"
2376 "movntdq %%xmm0, (%0, %%"REG_a
", 2) \n\t"
2377 "movntdq %%xmm1, 16(%0, %%"REG_a
", 2) \n\t"
2378 "add $16, %%"REG_a
" \n\t"
2379 "cmp %3, %%"REG_a
" \n\t"
2381 ::"r"(dest
), "r"(src1
), "r"(src2
), "r" ((x86_reg
)width
-15)
2382 : "memory", "%"REG_a
""
2386 "xor %%"REG_a
", %%"REG_a
" \n\t"
2388 PREFETCH
" 64(%1, %%"REG_a
") \n\t"
2389 PREFETCH
" 64(%2, %%"REG_a
") \n\t"
2390 "movq (%1, %%"REG_a
"), %%mm0 \n\t"
2391 "movq 8(%1, %%"REG_a
"), %%mm2 \n\t"
2392 "movq %%mm0, %%mm1 \n\t"
2393 "movq %%mm2, %%mm3 \n\t"
2394 "movq (%2, %%"REG_a
"), %%mm4 \n\t"
2395 "movq 8(%2, %%"REG_a
"), %%mm5 \n\t"
2396 "punpcklbw %%mm4, %%mm0 \n\t"
2397 "punpckhbw %%mm4, %%mm1 \n\t"
2398 "punpcklbw %%mm5, %%mm2 \n\t"
2399 "punpckhbw %%mm5, %%mm3 \n\t"
2400 MOVNTQ
" %%mm0, (%0, %%"REG_a
", 2) \n\t"
2401 MOVNTQ
" %%mm1, 8(%0, %%"REG_a
", 2) \n\t"
2402 MOVNTQ
" %%mm2, 16(%0, %%"REG_a
", 2) \n\t"
2403 MOVNTQ
" %%mm3, 24(%0, %%"REG_a
", 2) \n\t"
2404 "add $16, %%"REG_a
" \n\t"
2405 "cmp %3, %%"REG_a
" \n\t"
2407 ::"r"(dest
), "r"(src1
), "r"(src2
), "r" ((x86_reg
)width
-15)
2408 : "memory", "%"REG_a
2411 for (w
= (width
&(~15)); w
< width
; w
++) {
2412 dest
[2*w
+0] = src1
[w
];
2413 dest
[2*w
+1] = src2
[w
];
2416 for (w
=0; w
< width
; w
++) {
2417 dest
[2*w
+0] = src1
[w
];
2418 dest
[2*w
+1] = src2
[w
];
2434 static inline void RENAME(vu9_to_vu12
)(const uint8_t *src1
, const uint8_t *src2
,
2435 uint8_t *dst1
, uint8_t *dst2
,
2436 long width
, long height
,
2437 long srcStride1
, long srcStride2
,
2438 long dstStride1
, long dstStride2
)
2442 w
=width
/2; h
=height
/2;
2447 ::"m"(*(src1
+srcStride1
)),"m"(*(src2
+srcStride2
)):"memory");
2450 const uint8_t* s1
=src1
+srcStride1
*(y
>>1);
2451 uint8_t* d
=dst1
+dstStride1
*y
;
2454 for (;x
<w
-31;x
+=32) {
2456 PREFETCH
" 32%1 \n\t"
2457 "movq %1, %%mm0 \n\t"
2458 "movq 8%1, %%mm2 \n\t"
2459 "movq 16%1, %%mm4 \n\t"
2460 "movq 24%1, %%mm6 \n\t"
2461 "movq %%mm0, %%mm1 \n\t"
2462 "movq %%mm2, %%mm3 \n\t"
2463 "movq %%mm4, %%mm5 \n\t"
2464 "movq %%mm6, %%mm7 \n\t"
2465 "punpcklbw %%mm0, %%mm0 \n\t"
2466 "punpckhbw %%mm1, %%mm1 \n\t"
2467 "punpcklbw %%mm2, %%mm2 \n\t"
2468 "punpckhbw %%mm3, %%mm3 \n\t"
2469 "punpcklbw %%mm4, %%mm4 \n\t"
2470 "punpckhbw %%mm5, %%mm5 \n\t"
2471 "punpcklbw %%mm6, %%mm6 \n\t"
2472 "punpckhbw %%mm7, %%mm7 \n\t"
2473 MOVNTQ
" %%mm0, %0 \n\t"
2474 MOVNTQ
" %%mm1, 8%0 \n\t"
2475 MOVNTQ
" %%mm2, 16%0 \n\t"
2476 MOVNTQ
" %%mm3, 24%0 \n\t"
2477 MOVNTQ
" %%mm4, 32%0 \n\t"
2478 MOVNTQ
" %%mm5, 40%0 \n\t"
2479 MOVNTQ
" %%mm6, 48%0 \n\t"
2480 MOVNTQ
" %%mm7, 56%0"
2486 for (;x
<w
;x
++) d
[2*x
]=d
[2*x
+1]=s1
[x
];
2489 const uint8_t* s2
=src2
+srcStride2
*(y
>>1);
2490 uint8_t* d
=dst2
+dstStride2
*y
;
2493 for (;x
<w
-31;x
+=32) {
2495 PREFETCH
" 32%1 \n\t"
2496 "movq %1, %%mm0 \n\t"
2497 "movq 8%1, %%mm2 \n\t"
2498 "movq 16%1, %%mm4 \n\t"
2499 "movq 24%1, %%mm6 \n\t"
2500 "movq %%mm0, %%mm1 \n\t"
2501 "movq %%mm2, %%mm3 \n\t"
2502 "movq %%mm4, %%mm5 \n\t"
2503 "movq %%mm6, %%mm7 \n\t"
2504 "punpcklbw %%mm0, %%mm0 \n\t"
2505 "punpckhbw %%mm1, %%mm1 \n\t"
2506 "punpcklbw %%mm2, %%mm2 \n\t"
2507 "punpckhbw %%mm3, %%mm3 \n\t"
2508 "punpcklbw %%mm4, %%mm4 \n\t"
2509 "punpckhbw %%mm5, %%mm5 \n\t"
2510 "punpcklbw %%mm6, %%mm6 \n\t"
2511 "punpckhbw %%mm7, %%mm7 \n\t"
2512 MOVNTQ
" %%mm0, %0 \n\t"
2513 MOVNTQ
" %%mm1, 8%0 \n\t"
2514 MOVNTQ
" %%mm2, 16%0 \n\t"
2515 MOVNTQ
" %%mm3, 24%0 \n\t"
2516 MOVNTQ
" %%mm4, 32%0 \n\t"
2517 MOVNTQ
" %%mm5, 40%0 \n\t"
2518 MOVNTQ
" %%mm6, 48%0 \n\t"
2519 MOVNTQ
" %%mm7, 56%0"
2525 for (;x
<w
;x
++) d
[2*x
]=d
[2*x
+1]=s2
[x
];
2536 static inline void RENAME(yvu9_to_yuy2
)(const uint8_t *src1
, const uint8_t *src2
, const uint8_t *src3
,
2538 long width
, long height
,
2539 long srcStride1
, long srcStride2
,
2540 long srcStride3
, long dstStride
)
2544 w
=width
/2; h
=height
;
2546 const uint8_t* yp
=src1
+srcStride1
*y
;
2547 const uint8_t* up
=src2
+srcStride2
*(y
>>2);
2548 const uint8_t* vp
=src3
+srcStride3
*(y
>>2);
2549 uint8_t* d
=dst
+dstStride
*y
;
2554 PREFETCH
" 32(%1, %0) \n\t"
2555 PREFETCH
" 32(%2, %0) \n\t"
2556 PREFETCH
" 32(%3, %0) \n\t"
2557 "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2558 "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
2559 "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
2560 "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2561 "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
2562 "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
2563 "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
2564 "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
2565 "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
2566 "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */
2568 "movq %%mm1, %%mm6 \n\t"
2569 "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
2570 "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2571 "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2572 MOVNTQ
" %%mm0, (%4, %0, 8) \n\t"
2573 MOVNTQ
" %%mm3, 8(%4, %0, 8) \n\t"
2575 "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
2576 "movq 8(%1, %0, 4), %%mm0 \n\t"
2577 "movq %%mm0, %%mm3 \n\t"
2578 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
2579 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
2580 MOVNTQ
" %%mm0, 16(%4, %0, 8) \n\t"
2581 MOVNTQ
" %%mm3, 24(%4, %0, 8) \n\t"
2583 "movq %%mm4, %%mm6 \n\t"
2584 "movq 16(%1, %0, 4), %%mm0 \n\t"
2585 "movq %%mm0, %%mm3 \n\t"
2586 "punpcklbw %%mm5, %%mm4 \n\t"
2587 "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
2588 "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
2589 MOVNTQ
" %%mm0, 32(%4, %0, 8) \n\t"
2590 MOVNTQ
" %%mm3, 40(%4, %0, 8) \n\t"
2592 "punpckhbw %%mm5, %%mm6 \n\t"
2593 "movq 24(%1, %0, 4), %%mm0 \n\t"
2594 "movq %%mm0, %%mm3 \n\t"
2595 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
2596 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
2597 MOVNTQ
" %%mm0, 48(%4, %0, 8) \n\t"
2598 MOVNTQ
" %%mm3, 56(%4, %0, 8) \n\t"
2601 : "r"(yp
), "r" (up
), "r"(vp
), "r"(d
)
2606 const long x2
= x
<<2;
2609 d
[8*x
+2] = yp
[x2
+1];
2611 d
[8*x
+4] = yp
[x2
+2];
2613 d
[8*x
+6] = yp
[x2
+3];
2626 static void RENAME(extract_even
)(const uint8_t *src
, uint8_t *dst
, x86_reg count
)
2636 "pcmpeqw %%mm7, %%mm7 \n\t"
2637 "psrlw $8, %%mm7 \n\t"
2639 "movq -30(%1, %0, 2), %%mm0 \n\t"
2640 "movq -22(%1, %0, 2), %%mm1 \n\t"
2641 "movq -14(%1, %0, 2), %%mm2 \n\t"
2642 "movq -6(%1, %0, 2), %%mm3 \n\t"
2643 "pand %%mm7, %%mm0 \n\t"
2644 "pand %%mm7, %%mm1 \n\t"
2645 "pand %%mm7, %%mm2 \n\t"
2646 "pand %%mm7, %%mm3 \n\t"
2647 "packuswb %%mm1, %%mm0 \n\t"
2648 "packuswb %%mm3, %%mm2 \n\t"
2649 MOVNTQ
" %%mm0,-15(%2, %0) \n\t"
2650 MOVNTQ
" %%mm2,- 7(%2, %0) \n\t"
2654 : "r"(src
), "r"(dst
)
2660 dst
[count
]= src
[2*count
];
2665 static void RENAME(extract_even2
)(const uint8_t *src
, uint8_t *dst0
, uint8_t *dst1
, x86_reg count
)
2675 "pcmpeqw %%mm7, %%mm7 \n\t"
2676 "psrlw $8, %%mm7 \n\t"
2678 "movq -28(%1, %0, 4), %%mm0 \n\t"
2679 "movq -20(%1, %0, 4), %%mm1 \n\t"
2680 "movq -12(%1, %0, 4), %%mm2 \n\t"
2681 "movq -4(%1, %0, 4), %%mm3 \n\t"
2682 "pand %%mm7, %%mm0 \n\t"
2683 "pand %%mm7, %%mm1 \n\t"
2684 "pand %%mm7, %%mm2 \n\t"
2685 "pand %%mm7, %%mm3 \n\t"
2686 "packuswb %%mm1, %%mm0 \n\t"
2687 "packuswb %%mm3, %%mm2 \n\t"
2688 "movq %%mm0, %%mm1 \n\t"
2689 "movq %%mm2, %%mm3 \n\t"
2690 "psrlw $8, %%mm0 \n\t"
2691 "psrlw $8, %%mm2 \n\t"
2692 "pand %%mm7, %%mm1 \n\t"
2693 "pand %%mm7, %%mm3 \n\t"
2694 "packuswb %%mm2, %%mm0 \n\t"
2695 "packuswb %%mm3, %%mm1 \n\t"
2696 MOVNTQ
" %%mm0,- 7(%3, %0) \n\t"
2697 MOVNTQ
" %%mm1,- 7(%2, %0) \n\t"
2701 : "r"(src
), "r"(dst0
), "r"(dst1
)
2707 dst0
[count
]= src
[4*count
+0];
2708 dst1
[count
]= src
[4*count
+2];
2713 static void RENAME(extract_even2avg
)(const uint8_t *src0
, const uint8_t *src1
, uint8_t *dst0
, uint8_t *dst1
, x86_reg count
)
2724 "pcmpeqw %%mm7, %%mm7 \n\t"
2725 "psrlw $8, %%mm7 \n\t"
2727 "movq -28(%1, %0, 4), %%mm0 \n\t"
2728 "movq -20(%1, %0, 4), %%mm1 \n\t"
2729 "movq -12(%1, %0, 4), %%mm2 \n\t"
2730 "movq -4(%1, %0, 4), %%mm3 \n\t"
2731 PAVGB
" -28(%2, %0, 4), %%mm0 \n\t"
2732 PAVGB
" -20(%2, %0, 4), %%mm1 \n\t"
2733 PAVGB
" -12(%2, %0, 4), %%mm2 \n\t"
2734 PAVGB
" - 4(%2, %0, 4), %%mm3 \n\t"
2735 "pand %%mm7, %%mm0 \n\t"
2736 "pand %%mm7, %%mm1 \n\t"
2737 "pand %%mm7, %%mm2 \n\t"
2738 "pand %%mm7, %%mm3 \n\t"
2739 "packuswb %%mm1, %%mm0 \n\t"
2740 "packuswb %%mm3, %%mm2 \n\t"
2741 "movq %%mm0, %%mm1 \n\t"
2742 "movq %%mm2, %%mm3 \n\t"
2743 "psrlw $8, %%mm0 \n\t"
2744 "psrlw $8, %%mm2 \n\t"
2745 "pand %%mm7, %%mm1 \n\t"
2746 "pand %%mm7, %%mm3 \n\t"
2747 "packuswb %%mm2, %%mm0 \n\t"
2748 "packuswb %%mm3, %%mm1 \n\t"
2749 MOVNTQ
" %%mm0,- 7(%4, %0) \n\t"
2750 MOVNTQ
" %%mm1,- 7(%3, %0) \n\t"
2754 : "r"(src0
), "r"(src1
), "r"(dst0
), "r"(dst1
)
2760 dst0
[count
]= (src0
[4*count
+0]+src1
[4*count
+0])>>1;
2761 dst1
[count
]= (src0
[4*count
+2]+src1
[4*count
+2])>>1;
2766 static void RENAME(extract_odd2
)(const uint8_t *src
, uint8_t *dst0
, uint8_t *dst1
, x86_reg count
)
2776 "pcmpeqw %%mm7, %%mm7 \n\t"
2777 "psrlw $8, %%mm7 \n\t"
2779 "movq -28(%1, %0, 4), %%mm0 \n\t"
2780 "movq -20(%1, %0, 4), %%mm1 \n\t"
2781 "movq -12(%1, %0, 4), %%mm2 \n\t"
2782 "movq -4(%1, %0, 4), %%mm3 \n\t"
2783 "psrlw $8, %%mm0 \n\t"
2784 "psrlw $8, %%mm1 \n\t"
2785 "psrlw $8, %%mm2 \n\t"
2786 "psrlw $8, %%mm3 \n\t"
2787 "packuswb %%mm1, %%mm0 \n\t"
2788 "packuswb %%mm3, %%mm2 \n\t"
2789 "movq %%mm0, %%mm1 \n\t"
2790 "movq %%mm2, %%mm3 \n\t"
2791 "psrlw $8, %%mm0 \n\t"
2792 "psrlw $8, %%mm2 \n\t"
2793 "pand %%mm7, %%mm1 \n\t"
2794 "pand %%mm7, %%mm3 \n\t"
2795 "packuswb %%mm2, %%mm0 \n\t"
2796 "packuswb %%mm3, %%mm1 \n\t"
2797 MOVNTQ
" %%mm0,- 7(%3, %0) \n\t"
2798 MOVNTQ
" %%mm1,- 7(%2, %0) \n\t"
2802 : "r"(src
), "r"(dst0
), "r"(dst1
)
2809 dst0
[count
]= src
[4*count
+0];
2810 dst1
[count
]= src
[4*count
+2];
2815 static void RENAME(extract_odd2avg
)(const uint8_t *src0
, const uint8_t *src1
, uint8_t *dst0
, uint8_t *dst1
, x86_reg count
)
2826 "pcmpeqw %%mm7, %%mm7 \n\t"
2827 "psrlw $8, %%mm7 \n\t"
2829 "movq -28(%1, %0, 4), %%mm0 \n\t"
2830 "movq -20(%1, %0, 4), %%mm1 \n\t"
2831 "movq -12(%1, %0, 4), %%mm2 \n\t"
2832 "movq -4(%1, %0, 4), %%mm3 \n\t"
2833 PAVGB
" -28(%2, %0, 4), %%mm0 \n\t"
2834 PAVGB
" -20(%2, %0, 4), %%mm1 \n\t"
2835 PAVGB
" -12(%2, %0, 4), %%mm2 \n\t"
2836 PAVGB
" - 4(%2, %0, 4), %%mm3 \n\t"
2837 "psrlw $8, %%mm0 \n\t"
2838 "psrlw $8, %%mm1 \n\t"
2839 "psrlw $8, %%mm2 \n\t"
2840 "psrlw $8, %%mm3 \n\t"
2841 "packuswb %%mm1, %%mm0 \n\t"
2842 "packuswb %%mm3, %%mm2 \n\t"
2843 "movq %%mm0, %%mm1 \n\t"
2844 "movq %%mm2, %%mm3 \n\t"
2845 "psrlw $8, %%mm0 \n\t"
2846 "psrlw $8, %%mm2 \n\t"
2847 "pand %%mm7, %%mm1 \n\t"
2848 "pand %%mm7, %%mm3 \n\t"
2849 "packuswb %%mm2, %%mm0 \n\t"
2850 "packuswb %%mm3, %%mm1 \n\t"
2851 MOVNTQ
" %%mm0,- 7(%4, %0) \n\t"
2852 MOVNTQ
" %%mm1,- 7(%3, %0) \n\t"
2856 : "r"(src0
), "r"(src1
), "r"(dst0
), "r"(dst1
)
2864 dst0
[count
]= (src0
[4*count
+0]+src1
[4*count
+0])>>1;
2865 dst1
[count
]= (src0
[4*count
+2]+src1
[4*count
+2])>>1;
2870 static void RENAME(yuyvtoyuv420
)(uint8_t *ydst
, uint8_t *udst
, uint8_t *vdst
, const uint8_t *src
,
2871 long width
, long height
,
2872 long lumStride
, long chromStride
, long srcStride
)
2875 const long chromWidth
= -((-width
)>>1);
2877 for (y
=0; y
<height
; y
++) {
2878 RENAME(extract_even
)(src
, ydst
, width
);
2880 RENAME(extract_odd2avg
)(src
-srcStride
, src
, udst
, vdst
, chromWidth
);
2897 static void RENAME(yuyvtoyuv422
)(uint8_t *ydst
, uint8_t *udst
, uint8_t *vdst
, const uint8_t *src
,
2898 long width
, long height
,
2899 long lumStride
, long chromStride
, long srcStride
)
2902 const long chromWidth
= -((-width
)>>1);
2904 for (y
=0; y
<height
; y
++) {
2905 RENAME(extract_even
)(src
, ydst
, width
);
2906 RENAME(extract_odd2
)(src
, udst
, vdst
, chromWidth
);
2922 static void RENAME(uyvytoyuv420
)(uint8_t *ydst
, uint8_t *udst
, uint8_t *vdst
, const uint8_t *src
,
2923 long width
, long height
,
2924 long lumStride
, long chromStride
, long srcStride
)
2927 const long chromWidth
= -((-width
)>>1);
2929 for (y
=0; y
<height
; y
++) {
2930 RENAME(extract_even
)(src
+1, ydst
, width
);
2932 RENAME(extract_even2avg
)(src
-srcStride
, src
, udst
, vdst
, chromWidth
);
2949 static void RENAME(uyvytoyuv422
)(uint8_t *ydst
, uint8_t *udst
, uint8_t *vdst
, const uint8_t *src
,
2950 long width
, long height
,
2951 long lumStride
, long chromStride
, long srcStride
)
2954 const long chromWidth
= -((-width
)>>1);
2956 for (y
=0; y
<height
; y
++) {
2957 RENAME(extract_even
)(src
+1, ydst
, width
);
2958 RENAME(extract_even2
)(src
, udst
, vdst
, chromWidth
);
2974 static inline void RENAME(rgb2rgb_init
)(void)
2976 rgb15to16
= RENAME(rgb15to16
);
2977 rgb15tobgr24
= RENAME(rgb15tobgr24
);
2978 rgb15to32
= RENAME(rgb15to32
);
2979 rgb16tobgr24
= RENAME(rgb16tobgr24
);
2980 rgb16to32
= RENAME(rgb16to32
);
2981 rgb16to15
= RENAME(rgb16to15
);
2982 rgb24tobgr16
= RENAME(rgb24tobgr16
);
2983 rgb24tobgr15
= RENAME(rgb24tobgr15
);
2984 rgb24tobgr32
= RENAME(rgb24tobgr32
);
2985 rgb32to16
= RENAME(rgb32to16
);
2986 rgb32to15
= RENAME(rgb32to15
);
2987 rgb32tobgr24
= RENAME(rgb32tobgr24
);
2988 rgb24to15
= RENAME(rgb24to15
);
2989 rgb24to16
= RENAME(rgb24to16
);
2990 rgb24tobgr24
= RENAME(rgb24tobgr24
);
2991 rgb32tobgr32
= RENAME(rgb32tobgr32
);
2992 rgb32tobgr16
= RENAME(rgb32tobgr16
);
2993 rgb32tobgr15
= RENAME(rgb32tobgr15
);
2994 yv12toyuy2
= RENAME(yv12toyuy2
);
2995 yv12touyvy
= RENAME(yv12touyvy
);
2996 yuv422ptoyuy2
= RENAME(yuv422ptoyuy2
);
2997 yuv422ptouyvy
= RENAME(yuv422ptouyvy
);
2998 yuy2toyv12
= RENAME(yuy2toyv12
);
2999 // yvu9toyv12 = RENAME(yvu9toyv12);
3000 planar2x
= RENAME(planar2x
);
3001 rgb24toyv12
= RENAME(rgb24toyv12
);
3002 interleaveBytes
= RENAME(interleaveBytes
);
3003 vu9_to_vu12
= RENAME(vu9_to_vu12
);
3004 yvu9_to_yuy2
= RENAME(yvu9_to_yuy2
);
3006 uyvytoyuv420
= RENAME(uyvytoyuv420
);
3007 uyvytoyuv422
= RENAME(uyvytoyuv422
);
3008 yuyvtoyuv420
= RENAME(yuyvtoyuv420
);
3009 yuyvtoyuv422
= RENAME(yuyvtoyuv422
);