streamtools/rgbtorgb.cc

   1 #include <stdint.h>
   2 #include <stdlib.h> // for size_t
   3 #include <vector>
   4 #include <cmath>
   5
   6 /* RGB to RGB and RGB from/to I420 conversions written by Bisqwit
   7  * Copyright (C) 1992,2008 Joel Yliluoma (http://iki.fi/bisqwit/)
   8  */
   9
  10 typedef uint_least64_t uint64_t;
  11
  12 #include "quantize.hh"
  13 #include "rgbtorgb.hh"
  14 #include "simd.hh"
  15
  16 /* For BPP conversions */
  17
  /* Masks used by Convert32To24_32bytes when repacking 32-bit pixels as 24-bit:
   * mask24l keeps bytes 0-2 and mask24h keeps bytes 3-5 of a 64-bit word, while
   * mask24hh, mask24hhh and mask24hhhh keep its top 2, 4 and 6 bytes. */
  18 static const uint64_t mask24l        __attribute__((aligned(8))) = 0x0000000000FFFFFFULL;
  19 static const uint64_t mask24h        __attribute__((aligned(8))) = 0x0000FFFFFF000000ULL;
  20 static const uint64_t mask24hh       __attribute__((aligned(8))) = 0xffff000000000000ULL;
  21 static const uint64_t mask24hhh      __attribute__((aligned(8))) = 0xffffffff00000000ULL;
  22 static const uint64_t mask24hhhh     __attribute__((aligned(8))) = 0xffffffffffff0000ULL;
  23
  /* Lane masks over a 64-bit word: mask64l/mask64h select the even/odd bytes,
   * mask64lw/mask64hw the even/odd 16-bit words, and mask64ld/mask64hd the
   * low/high 32-bit halves. */
  24 static const uint64_t mask64h        __attribute__((aligned(8))) = 0xFF00FF00FF00FF00ULL;
  25 static const uint64_t mask64l        __attribute__((aligned(8))) = 0x00FF00FF00FF00FFULL;
  26 static const uint64_t mask64hw       __attribute__((aligned(8))) = 0xFFFF0000FFFF0000ULL;
  27 static const uint64_t mask64lw       __attribute__((aligned(8))) = 0x0000FFFF0000FFFFULL;
  28 static const uint64_t mask64hd       __attribute__((aligned(8))) = 0xFFFFFFFF00000000ULL;
  29 static const uint64_t mask64ld       __attribute__((aligned(8))) = 0x00000000FFFFFFFFULL;
  30
  31 /* For RGB2YUV: */
  32
  /* Fixed-point coefficients for the RGB -> YUV converters below.
   * Each value is the floating-point factor shown in its trailing comment,
   * scaled by (1 << RGB2YUV_SHIFT) and truncated to int; the converters shift
   * the weighted sum right by RGB2YUV_SHIFT again.  In the names, the first
   * letter is the input channel and the second the output component
   * (e.g. RY is the weight of red in Y). */
  33 static const int RGB2YUV_SHIFT = 15; /* highest value where [RGB][YUV] fit in signed short */
  34
  35 static const int RY = 8414;  //  ((int)(( 65.738/256.0)*(1<<RGB2YUV_SHIFT)+0.5));
  36 static const int RV = 14392; //  ((int)((112.439/256.0)*(1<<RGB2YUV_SHIFT)+0.5));
  37 static const int RU = -4856; //  ((int)((-37.945/256.0)*(1<<RGB2YUV_SHIFT)+0.5));
  38
  39 static const int GY = 16519; //  ((int)((129.057/256.0)*(1<<RGB2YUV_SHIFT)+0.5));
  40 static const int GV = -12051;//  ((int)((-94.154/256.0)*(1<<RGB2YUV_SHIFT)+0.5));
  41 static const int GU = -9534; //  ((int)((-74.494/256.0)*(1<<RGB2YUV_SHIFT)+0.5));
  42
  43 static const int BY = 3208;  //  ((int)(( 25.064/256.0)*(1<<RGB2YUV_SHIFT)+0.5));
  44 static const int BV = -2339; //  ((int)((-18.285/256.0)*(1<<RGB2YUV_SHIFT)+0.5));
  45 static const int BU = 14392; //  ((int)((112.439/256.0)*(1<<RGB2YUV_SHIFT)+0.5));
  46
  /* Offsets added to the shifted sums: 16 for luma, 128 for both chroma values. */
  47 static const int Y_ADD = 16;
  48 static const int U_ADD = 128;
  49 static const int V_ADD = 128;
  50
  51 /* For YUV2RGB: */
  52
  /* Fixed-point coefficients for the reverse (YUV -> RGB) direction, each the
   * factor in its trailing comment scaled by (1 << YUV2RGB_SHIFT).
   * NOTE(review): none of these is referenced in the part of the file reviewed
   * here; presumably they are used by YUV -> RGB converters further down --
   * confirm before changing them. */
  53 static const int YUV2RGB_SHIFT = 13; /* highest value where UB still fits in signed short */
  54
  55 static const int Y_REV = 9539; // ((int)( (  255 / 219.0 )     * (1<<YUV2RGB_SHIFT)+0.5));
  56 static const int VR = 14688;   // ((int)( ( 117504 / 65536.0 ) * (1<<YUV2RGB_SHIFT)+0.5));
  57 static const int VG = -6659;   // ((int)( ( -53279 / 65536.0 ) * (1<<YUV2RGB_SHIFT)+0.5));
  58 static const int UG = -3208;   // ((int)( ( -25675 / 65536.0 ) * (1<<YUV2RGB_SHIFT)+0.5));
  59 static const int UB = 16525;   // ((int)( ( 132201 / 65536.0 ) * (1<<YUV2RGB_SHIFT)+0.5));
  60
  61 /****************/
  62
  63 template<typename c64>
  64 static inline void Convert32To24_32bytes(c64 w0, c64 w1, c64 w2, c64 w3, unsigned char* dest)
  65 {
  66     c64 r0 = (w0 & mask24l) | ((w0 >> 8) & mask24h); /* bbbaaa */
  67     c64 r1 = (w1 & mask24l) | ((w1 >> 8) & mask24h); /* dddccc */
  68     c64 r2 = (w2 & mask24l) | ((w2 >> 8) & mask24h); /* fffeee */
  69     c64 r3 = (w3 & mask24l) | ((w3 >> 8) & mask24h); /* hhhggg */
  70
  71     /* ccbbbaaa */
  72     ((r0      )  | ((r1 << 48) & mask24hh)).Put(dest+0);
  73     /* feeedddc */
  74     ((r1 >> 16)  | ((r2 << 32) & mask24hhh)).Put(dest+8);
  75     /* hhhgggff */
  76     ((r2 >> 32)  | ((r3 << 16) & mask24hhhh)).Put(dest+16);
  77 }
  78
  79 #if defined(__x86_64) || defined(USE_MMX)
  80 static void Convert32To24_32bytes(const unsigned char* src,
  81                                   unsigned char* dest)
  82 {
  83     c64 w0; w0.Get(src+0);
  84     c64 w1; w1.Get(src+8);
  85     c64 w2; w2.Get(src+16);
  86     c64 w3; w3.Get(src+24);
  87     Convert32To24_32bytes(w0,w1,w2,w3, dest);
  88 }
  89 #endif
  90
  91 void Convert32To24Frame(const void* data, unsigned char* dest, unsigned npixels)
  92 {
  93     const unsigned char* src = (const unsigned char*)data;
  94
  95     #if defined(__x86_64) || defined(USE_MMX)
  96     while(npixels >= 8)
  97     {
  98         Convert32To24_32bytes(src, dest);
  99         src  += 4*8;
 100         dest += 3*8;
 101         npixels -= 8;
 102     }
 103      #ifdef USE_MMX
 104      MMX_clear();
 105      #endif
 106     #endif
 107
 108     for(unsigned pos=0; pos<npixels; ++pos)
 109     {
 110         dest[3*pos+0] = src[4*pos+0];
 111         dest[3*pos+1] = src[4*pos+1];
 112         dest[3*pos+2] = src[4*pos+2];
 113     }
 114 }
 115
 /* Expands one 16-bit 5-6-5 pixel into three 8-bit channels.
  *
  * target[0] receives bits 11-15, target[1] bits 5-10 and target[2] bits 0-4
  * of rgb16 (higher bits of rgb16 are ignored).
  *
  * Each field is widened by shifting it to the top of the byte and refilling
  * the vacated low bits with the field's own top bits.  This maps a full field
  * to 255 (plain multiplication stopped at 248 / 252) and is the same scaling
  * the eight-pixel kernel applies, so pixels handled here match the ones
  * handled by the kernel.
  */
 static void Unbuild16(unsigned char* target, unsigned rgb16)
 {
     const unsigned b5 =  rgb16        & 31;
     const unsigned g6 = (rgb16 >> 5)  & 63;
     const unsigned r5 = (rgb16 >> 11) & 31;

     target[0] = (unsigned char)((r5 << 3) | (r5 >> 2));
     target[1] = (unsigned char)((g6 << 2) | (g6 >> 4));
     target[2] = (unsigned char)((b5 << 3) | (b5 >> 2));
 }
 125
 /* Expands one 15-bit 5-5-5 pixel into three 8-bit channels.
  *
  * target[0] receives bits 10-14, target[1] bits 5-9 and target[2] bits 0-4
  * of rgb16 (bit 15 and above are ignored).
  *
  * Each 5-bit field is widened by shifting it left by three and refilling the
  * low three bits with the field's own top bits.  This maps 31 to 255 (plain
  * multiplication stopped at 248) and is the same scaling the eight-pixel
  * kernel applies, so pixels handled here match the ones handled by the kernel.
  */
 static void Unbuild15(unsigned char* target, unsigned rgb16)
 {
     const unsigned b5 =  rgb16        & 31;
     const unsigned g5 = (rgb16 >> 5)  & 31;
     const unsigned r5 = (rgb16 >> 10) & 31;

     target[0] = (unsigned char)((r5 << 3) | (r5 >> 2));
     target[1] = (unsigned char)((g5 << 3) | (g5 >> 2));
     target[2] = (unsigned char)((b5 << 3) | (b5 >> 2));
 }
 135
 /* Compile-time 64-bit constant made of four 16-bit lanes holding
  * lo, hi, lo, hi (lowest lane first).  Both template arguments are truncated
  * to 16 bits.  static_value is usable in constant expressions; value is the
  * same number as an object with an out-of-class definition.
  */
 template<int basevalue_lo, int basevalue_hi>
 struct Bits16const
 {
 private:
     /* One lo/hi pair occupying the low 32 bits */
     static const uint64_t pair32 =
           ((uint64_t)(unsigned short) basevalue_lo)
         | (((uint64_t)(unsigned short) basevalue_hi) << 16);
 public:
     static const uint64_t static_value = pair32 | (pair32 << 32);
     static const uint64_t value;
 };

 template<int basevalue_lo, int basevalue_hi>
 const uint64_t
 Bits16const<basevalue_lo, basevalue_hi>::value
     = Bits16const<basevalue_lo, basevalue_hi>::static_value;
 149
 /* Compile-time 64-bit constant whose low 32 bits are basevalue_lo and whose
  * high 32 bits are basevalue_hi (each converted to unsigned int first).
  *
  * There is no out-of-class definition of value, so use it only where a
  * constant expression suffices.
  */
 template<int basevalue_lo, int basevalue_hi>
 struct Bits32const
 {
 private:
     static const uint64_t low_half  = (uint64_t)(unsigned int) basevalue_lo;
     static const uint64_t high_half = (uint64_t)(unsigned int) basevalue_hi;
 public:
     static const uint64_t static_value = low_half | (high_half << 32);
     static const uint64_t value = static_value;
 };
 161
 /* Compile-time 64-bit constant made of eight byte lanes holding
  * lo, hi, lo, hi, ... (lowest byte first).  The arguments are not masked, so
  * callers are expected to pass values that fit in one byte.
  */
 template<uint64_t basevalue_lo, uint64_t basevalue_hi>
 struct Bits8const
 {
 private:
     /* Build the pattern by doubling: 2 lanes, then 4, then all 8. */
     static const uint64_t pair16 = basevalue_lo | (basevalue_hi << 8);
     static const uint64_t quad32 = pair16 | (pair16 << 16);
 public:
     static const uint64_t static_value = quad32 | (quad32 << 32);
     static const uint64_t value = static_value;
 };
 176
 177
 178 template<int lowbitcount, int highbitcount, int leftshift>
 179 struct MaskBconst
 180 {
 181     static const uint64_t basevalue_lo = (1 <<  lowbitcount) - 1;
 182     static const uint64_t basevalue_hi = (1 << highbitcount) - 1;
 183     static const uint64_t value = Bits8const<basevalue_lo,basevalue_hi>::value << leftshift;
 184 };
 185
 186 template<int bits>
 187 struct Convert_2byte_consts
 188 {
 189     static const uint64_t mask_lo;//   = MaskBconst<bits,0, 0>::value;
 190     static const uint64_t mask_hi;//   = MaskBconst<bits,0, 8>::value;
 191     static const uint64_t mask_frac;// = MaskBconst<8-bits,8-bits, 0>::value;
 192 };
 193 template<int bits>
 194 const uint64_t Convert_2byte_consts<bits>::mask_lo   = MaskBconst<bits, 0, 0>::value;
 195 template<int bits>
 196 const uint64_t Convert_2byte_consts<bits>::mask_hi   = MaskBconst<bits, 0, 8>::value;
 197 template<int bits>
 198 const uint64_t Convert_2byte_consts<bits>::mask_frac = MaskBconst<8-bits, 8-bits, 0>::value;
 199
 200 template<int offs, int bits>
 201 struct Convert_2byte_helper
 202 {
 203     c64 lo, hi;
 204
 205     Convert_2byte_helper(c64 p4a, c64 p4b)
 206     {
 207         const uint64_t& mask_lo   = Convert_2byte_consts<bits>::mask_lo;
 208         const uint64_t& mask_hi   = Convert_2byte_consts<bits>::mask_hi;
 209         const uint64_t& mask_frac = Convert_2byte_consts<bits>::mask_frac;
 210
 211         /* STEP 1: SEPARATE THE PIXELS INTO RED, GREEN AND BLUE COMPONENTS */
 212
 213         /* 000BBBBB 000bbbbb  000BBBBB 000bbbbb  000BBBBB 000bbbbb  000BBBBB 000bbbbb */
 214         c64 s5 = ((p4a >> offs) & mask_lo) | ((p4b << (8-offs)) & mask_hi);
 215
 216         /* STEP 2: SCALE THE COLOR COMPONENTS TO 256 RANGE */
 217
 218         /* BBBBB000 bbbbb000  BBBBB000 bbbbb000  BBBBB000 bbbbb000  BBBBB000 bbbbb000 */
 219         /* 00000BBB 00000bbb  00000BBB 00000bbb  00000BBB 00000bbb  00000BBB 00000bbb */
 220         c64 v8 = (s5 << (8-bits)) | ((s5 >> (bits-(8-bits))) & mask_frac);
 221         /* v8:
 222          *
 223          * BBBBBBBB bbbbbbbb  BBBBBBBB bbbbbbbb  BBBBBBBB bbbbbbbb  BBBBBBBB bbbbbbbb *
 224          */
 225
 226         /* STEP 3: DEINTERLACE THE PIXELS */
 227         lo = (v8     ) & mask64l;
 228         hi = (v8 >> 8) & mask64l;
 229     }
 230 };
 231
 232 /*
 233 template<int roffs,int rbits, int goffs,int gbits, int boffs,int bbits>
 234 static void Convert_2byte_to_24Common(const unsigned char* src, unsigned char* dest)
 235     __attribute((noinline));
 236 */
 /* Converts eight 16-bit pixels (16 bytes at src) into eight 32-bit pixels
  * (32 bytes at dest) or, when rgb24 is true, into 24 bytes of 3-byte pixels.
  *
  * roffs/rbits, goffs/gbits and boffs/bbits give the bit offset and width of
  * the three colour fields inside a 16-bit pixel.  In every output pixel the
  * field taken from boffs occupies the lowest byte of the 32-bit value, the
  * goffs field the next one, the roffs field the third, and the fourth byte
  * is zero (assuming c64::Put stores the word low byte first -- TODO confirm
  * in simd.hh).
  *
  * The compiled branch does not call MMX_clear(); the frame-level callers in
  * this file do that after their loops.
  */
 237 template<int roffs,int rbits, int goffs,int gbits, int boffs,int bbits, bool rgb24>
 238 static void Convert_2byte_to_24or32Common(const unsigned char* src, unsigned char* dest)
 239 {
 240     c64 p4a; p4a.Get(src+0); // four pixels
 241     c64 p4b; p4b.Get(src+8); // another four pixels
 242
 243     /* in: In both registers: */
 244
 245     Convert_2byte_helper<roffs,rbits> r(p4a,p4b);
 246     Convert_2byte_helper<boffs,bbits> b(p4a,p4b);
 247     Convert_2byte_helper<goffs,gbits> g(p4a,p4b);
 248
 249     /* STEP 4: CONVERT PIXELS INTO RGB32 */
 250
 251     /* Now we have:
 252      *               b.lo =  0j0g0d0a
 253      *               g.lo =  0k0h0e0b
 254      *               r.lo =  0l0i0f0c
 255      *               b.hi =  0J0G0D0A
 256      *               g.hi =  0K0H0E0B
 257      *               r.hi =  0L0I0F0C
 258      * We want:
 259      *                 w1 =  0fed0cba
 260      *                 w2 =  0lkj0ihg
 261      *                 w3 =  0FED0CBA
 262      *                 w4 =  0LKJ0IHG
 263      */
 264
 265 #if 0 && defined(__MMX__) /* FIXME why is this 0&&? */
 266     // punpcklbw  0k0h0e0b, 0j0g0d0a -> 00ed00ba
 267     // punpcklwd  0l0i0f0c, ________ -> 0f__0c__
 268     c64 w1 = r.lo.unpacklwd(0) | g.lo.unpacklbw(b.lo); // pix 0,1
 269     // punpckhbw  0k0h0e0b, 0j0g0d0a -> 00kj00hg
 270     // punpckhwd  0l0i0f0c, ________ -> 0l__0i__
 271     c64 w2 = r.lo.unpackhwd(0) | g.lo.unpackhbw(b.lo); // pix 2,3
 272
 273     c64 w3 = r.hi.unpacklwd(0) | g.hi.unpacklbw(b.hi); // pix 4,5
 274     c64 w4 = r.hi.unpackhwd(0) | g.hi.unpackhbw(b.hi); // pix 6,7
 275     #ifndef USE_MMX
 276      MMX_clear();
 277     #endif
 278 #else
 279     /* With 64-bit registers, this code is greatly simpler than
 280      * the emulation of unpack opcodes. However, when the
 281      * unpack opcodes are available, using them is shorter.
 282      * Which way is faster? FIXME: Find out
 283      */
 284
 285     //        mask64lw:  00**00**
 286     //        mask64hw:  **00**00
 287     // b.lo & mask64lw:  000g000a
 288     // g.lo & mask64lw:  000h000b
 289     // r.lo & mask64lw:  000i000c
 290     // b.lo & mask64hw:  0j000d00
 291     // g.lo & mask64hw:  0k000e00
 292     // r.lo & mask64hw:  0l000f00
 293
 294     c64 tlo1 = ((b.lo & mask64lw)     ) | ((g.lo & mask64lw) << 8) | ((r.lo & mask64lw) << 16);
 295     c64 tlo2 = ((b.lo & mask64hw) >>16) | ((g.lo & mask64hw) >> 8) | ((r.lo & mask64hw)      );
 296
 297     c64 thi1 = ((b.hi & mask64lw)     ) | ((g.hi & mask64lw) << 8) | ((r.hi & mask64lw) << 16);
 298     c64 thi2 = ((b.hi & mask64hw) >>16) | ((g.hi & mask64hw) >> 8) | ((r.hi & mask64hw)      );
 299     /*
 300      *                tlo1 =  0ihg0cba
 301      *                tlo2 =  0lkj0fed
 302      *                thi1 =  0IHG0CBA
 303      *                thi2 =  0LKJ0FED
 304      *            mask64ld =  0000****
 305      *            mask64hd =  ****0000
 306      */
 307
 308     c64 w1 = (tlo1 & mask64ld) | ((tlo2 & mask64ld) << 32); // 00000cba | 0fed0000 = 0fed0cba
 309     c64 w2 = (tlo2 & mask64hd) | ((tlo1 & mask64hd) >> 32); // 0lkj0000 | 00000ihg = 0lkj0ihg
 310
 311     c64 w3 = (thi1 & mask64ld) | ((thi2 & mask64ld) << 32);
 312     c64 w4 = (thi2 & mask64hd) | ((thi1 & mask64hd) >> 32);
 313 #endif
 314
 315     if(rgb24)
 316     {
 317         /* STEP 5A: CONVERT PIXELS INTO RGB24 */
 318         Convert32To24_32bytes(w1,w2,w3,w4, dest);
 319     }
 320     else
 321     {
 322         /* STEP 5B: STORE RGB32 */
 323         w1.Put(dest+0);
 324         w2.Put(dest+8);
 325         w3.Put(dest+16);
 326         w4.Put(dest+24);
 327     }
 328
 329     /*
 330      punpcklbw    ____ABCD, ____abcd = AaBbCcDd
 331      punpcklwd    ____ABCD, ____abcd = ABabCDcd
 332      punpckldq    ____ABCD, ____abcd = ABCDabcd
 333
 334      punpckhbw    ABCD____, abcd____ = AaBbCcDd
 335      punpckhwd    ABCD____, abcd____ = ABabCDcd
 336      punpckhdq    ABCD____, abcd____ = ABCDabcd
 337     */
 338 }
 339
 340 void Convert15To24Frame(const void* data, unsigned char* dest, unsigned npixels, bool swap_red_blue)
 341 {
 342     const unsigned char* src = (const unsigned char*)data;
 343
 344     if(swap_red_blue)
 345         for(; npixels >= 8; src += 8*2, dest += 8*3, npixels -= 8)
 346             Convert_2byte_to_24or32Common<0,5, 5,5, 10,5, true> (src, dest);
 347     else
 348         for(; npixels >= 8; src += 8*2, dest += 8*3, npixels -= 8)
 349             Convert_2byte_to_24or32Common<10,5, 5,5, 0,5, true> (src, dest);
 350
 351     #ifdef USE_MMX
 352      MMX_clear();
 353     #endif
 354     for(unsigned a=0; a<npixels; ++a)
 355     {
 356         unsigned short v = ((const unsigned short*)src)[a];
 357         Unbuild15(&dest[a*3], v);
 358     }
 359 }
 360
 361 void Convert16To24Frame(const void* data, unsigned char* dest, unsigned npixels, bool swap_red_blue)
 362 {
 363     const unsigned char* src = (const unsigned char*)data;
 364
 365     if(swap_red_blue)
 366         for(; npixels >= 8; src += 8*2, dest += 8*3, npixels -= 8)
 367             Convert_2byte_to_24or32Common<0,5, 5,6, 11,5, true> (src, dest);
 368     else
 369         for(; npixels >= 8; src += 8*2, dest += 8*3, npixels -= 8)
 370             Convert_2byte_to_24or32Common<11,5, 5,6, 0,5, true> (src, dest);
 371
 372     #ifdef USE_MMX
 373      MMX_clear();
 374     #endif
 375     for(unsigned a=0; a<npixels; ++a)
 376     {
 377         unsigned short v = ((const unsigned short*)src)[a];
 378         Unbuild16(&dest[a*3], v);
 379     }
 380 }
 381
 382 void Convert15To32Frame(const void* data, unsigned char* dest, unsigned npixels, bool swap_red_blue)
 383 {
 384     const unsigned char* src = (const unsigned char*)data;
 385
 386     if(swap_red_blue)
 387         for(; npixels >= 8; src += 8*2, dest += 8*4, npixels -= 8)
 388             Convert_2byte_to_24or32Common<0,5, 5,5, 10,5, false> (src, dest);
 389     else
 390         for(; npixels >= 8; src += 8*2, dest += 8*4, npixels -= 8)
 391             Convert_2byte_to_24or32Common<10,5, 5,5, 0,5, false> (src, dest);
 392
 393     #ifdef USE_MMX
 394      MMX_clear();
 395     #endif
 396     for(unsigned a=0; a<npixels; ++a)
 397     {
 398         unsigned short v = ((const unsigned short*)src)[a];
 399         Unbuild15(&dest[a*4], v);
 400     }
 401 }
 402
 403 void Convert16To32Frame(const void* data, unsigned char* dest, unsigned npixels, bool swap_red_blue)
 404 {
 405     const unsigned char* src = (const unsigned char*)data;
 406
 407     if(swap_red_blue)
 408         for(; npixels >= 8; src += 8*2, dest += 8*4, npixels -= 8)
 409             Convert_2byte_to_24or32Common<0,5, 5,6, 11,5, false> (src, dest);
 410     else
 411         for(; npixels >= 8; src += 8*2, dest += 8*4, npixels -= 8)
 412             Convert_2byte_to_24or32Common<11,5, 5,6, 0,5, false> (src, dest);
 413
 414     #ifdef USE_MMX
 415      MMX_clear();
 416     #endif
 417     for(unsigned a=0; a<npixels; ++a)
 418     {
 419         unsigned short v = ((const unsigned short*)src)[a];
 420         Unbuild16(&dest[a*4], v);
 421     }
 422 }
 423
 424 static inline unsigned Build16(unsigned x,unsigned y, const unsigned char* rgbdata)
 425 {
 426     unsigned o16 = (x + 4*y) % 16;
 427     return (Quantize4x4<31>(o16, rgbdata[2]) << 0)
 428          | (Quantize4x4<63>(o16, rgbdata[1]) << 5)
 429          | (Quantize4x4<31>(o16, rgbdata[0]) << 11);
 430 }
 431 static inline unsigned Build15(unsigned x,unsigned y, const unsigned char* rgbdata)
 432 {
 433     unsigned o16 = (x + 4*y) % 16;
 434     return (Quantize4x4<31>(o16, rgbdata[2]) << 0)
 435          | (Quantize4x4<31>(o16, rgbdata[1]) << 5)
 436          | (Quantize4x4<31>(o16, rgbdata[0]) << 10);
 437 }
 438
 439 void Convert24To16Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
 440 {
 441     const unsigned char* logodata = (const unsigned char*) data;
 442     unsigned short* result = (unsigned short*) dest;
 443     unsigned x=0,y=0;
 444     for(unsigned pos=0; pos<npixels; ++pos)
 445     {
 446         result[pos] = Build16(x,y, &logodata[pos*3]);
 447         if(++x >= width) { x=0; ++y; }
 448     }
 449 }
 450
 451 void Convert24To15Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
 452 {
 453     const unsigned char* logodata = (const unsigned char*) data;
 454     unsigned short* result = (unsigned short*) dest;
 455     unsigned x=0,y=0;
 456     for(unsigned pos=0; pos<npixels; ++pos)
 457     {
 458         result[pos] = Build15(x,y, &logodata[pos*3]);
 459         if(++x >= width) { x=0; ++y; }
 460     }
 461 }
 462
 463 #ifdef __MMX__
 /* Converts four 32-bit pixels into four luma bytes plus one U and one V byte.
  *
  * p0_1 and p2_3 each carry two pixels; at the visible call site they are two
  * horizontally adjacent pixels of one row and the two pixels directly below.
  * Two luma bytes are stored at dest_y0 (from p0_1) and two at dest_y1 (from
  * p2_3).  The chroma samples are computed from the per-channel sum of all
  * four pixels, hence the extra shift by 2, and stored at dest_u / dest_v.
  *
  * NOTE(review): the luma stores go through a (short*) cast of an unsigned
  * char pointer, so they rely on the target tolerating unaligned 16-bit
  * stores -- confirm for non-x86 builds.
  * NOTE(review): which lane each Init16 argument ends up in depends on
  * c64_MMX (simd.hh), which is not visible here.
  */
 464 static inline void Convert_I420_MMX_Common
 465     (c64_MMX p0_1, c64_MMX p2_3,
 466      unsigned char* dest_y0,
 467      unsigned char* dest_y1,
 468      unsigned char* dest_u,
 469      unsigned char* dest_v)
 470 {
 471     c64_MMX p0 = c64_MMX(0).unpacklbw(p0_1); // expand to 64-bit (4*16)
 472     c64_MMX p1 = c64_MMX(0).unpackhbw(p0_1);
 473     c64_MMX p2 = c64_MMX(0).unpacklbw(p2_3);
 474     c64_MMX p3 = c64_MMX(0).unpackhbw(p2_3);
 475
 476     c64_MMX ry_gy_by; ry_gy_by.Init16(RY,GY,BY, 0);
 477     c64_MMX rgb_u;    rgb_u.Init16(RU,GU,BU, 0);
 478     c64_MMX rgb_v;    rgb_v.Init16(RV,GV,BV, 0);
 479
          // Per-channel sum of the four pixels; must be taken before p0..p3
          // are overwritten with the luma products below.
 480     c64_MMX ctotal = p0.add16(
 481                      p2.add16(
 482                      p1.add16(
 483                      p3)));
 484
 485     p0 = _mm_madd_pi16(ry_gy_by.value, p0.value);
 486     p1 = _mm_madd_pi16(ry_gy_by.value, p1.value);
 487     p2 = _mm_madd_pi16(ry_gy_by.value, p2.value);
 488     p3 = _mm_madd_pi16(ry_gy_by.value, p3.value);
 489
          // Each luma value is the sum of the two 32-bit partial products.
 490     c64_MMX yy;
 491     yy.Init16( ((p0.Extract32<0>() + p0.Extract32<1>()) >> (RGB2YUV_SHIFT)),
 492                ((p1.Extract32<0>() + p1.Extract32<1>()) >> (RGB2YUV_SHIFT)),
 493                ((p2.Extract32<0>() + p2.Extract32<1>()) >> (RGB2YUV_SHIFT)),
 494                ((p3.Extract32<0>() + p3.Extract32<1>()) >> (RGB2YUV_SHIFT)) );
 495     yy = yy.add16( Bits16const<Y_ADD,Y_ADD>::value );
 496
 497     // Because we're writing to adjacent pixels, we optimize this by
 498     // writing two 8-bit values at once in both cases.
 499     *(short*)dest_y0 = yy.Extract88_from_1616lo();
 500     *(short*)dest_y1 = yy.Extract88_from_1616hi();
 501
 502     c64_MMX u_total32 = _mm_madd_pi16(rgb_u.value, ctotal.value);
 503     c64_MMX v_total32 = _mm_madd_pi16(rgb_v.value, ctotal.value);
 504
          // +2 in the shift divides by the four pixels summed into ctotal.
 505     *dest_u = U_ADD + ((u_total32.Extract32<0>() + u_total32.Extract32<1>()) >> (RGB2YUV_SHIFT+2));
 506     *dest_v = V_ADD + ((v_total32.Extract32<0>() + v_total32.Extract32<1>()) >> (RGB2YUV_SHIFT+2));
 507 }
 508
 /* Converts four 32-bit pixels into eight bytes of packed 4:2:2 output.
  *
  * p0_1 and p2_3 each carry two pixels; at the visible call sites they are
  * four consecutive pixels of one row.  One luma value is produced per pixel
  * and one V and one U value per pixel pair (from the per-channel sum of the
  * pair, hence the extra shift by 1).  The result is stored at dest_yvyu as
  * four 16-bit lanes, each holding a luma value in its low byte and a chroma
  * value in its high byte; the chroma values are passed to Init16 in the
  * order V, U, V, U.
  *
  * NOTE(review): which lane each Init16 argument ends up in depends on
  * c64_MMX (simd.hh), which is not visible here.
  */
 509 static inline void Convert_YUY2_MMX_Common
 510     (c64_MMX p0_1, c64_MMX p2_3,
 511      unsigned char* dest_yvyu)
 512 {
 513     c64_MMX p0 = c64_MMX(0).unpacklbw(p0_1); // expand to 64-bit (4*16)
 514     c64_MMX p1 = c64_MMX(0).unpackhbw(p0_1);
 515     c64_MMX p2 = c64_MMX(0).unpacklbw(p2_3); // expand to 64-bit (4*16)
 516     c64_MMX p3 = c64_MMX(0).unpackhbw(p2_3);
 517
 518     c64_MMX ry_gy_by; ry_gy_by.Init16(RY,GY,BY, 0);
 519     c64_MMX rgb_u;    rgb_u.Init16(RU,GU,BU, 0);
 520     c64_MMX rgb_v;    rgb_v.Init16(RV,GV,BV, 0);
 521
          // Per-channel sums of each pixel pair; taken before p0..p3 are
          // overwritten with the luma products below.
 522     c64_MMX ctotal0 = p0.add16(p1);
 523     c64_MMX ctotal2 = p2.add16(p3);
 524
 525     p0 = _mm_madd_pi16(ry_gy_by.value, p0.value);
 526     p1 = _mm_madd_pi16(ry_gy_by.value, p1.value);
 527     p2 = _mm_madd_pi16(ry_gy_by.value, p2.value);
 528     p3 = _mm_madd_pi16(ry_gy_by.value, p3.value);
 529
 530     c64_MMX yy;
 531     yy.Init16( ((p0.Extract32<0>() + p0.Extract32<1>()) >> (RGB2YUV_SHIFT)),
 532                ((p1.Extract32<0>() + p1.Extract32<1>()) >> (RGB2YUV_SHIFT)),
 533                ((p2.Extract32<0>() + p2.Extract32<1>()) >> (RGB2YUV_SHIFT)),
 534                ((p3.Extract32<0>() + p3.Extract32<1>()) >> (RGB2YUV_SHIFT)) );
 535
 536     yy = yy.add16( Bits16const<Y_ADD,Y_ADD>::value );
 537
 538     c64_MMX u_total32_0 = _mm_madd_pi16(rgb_u.value, ctotal0.value);
 539     c64_MMX v_total32_0 = _mm_madd_pi16(rgb_v.value, ctotal0.value);
 540     c64_MMX u_total32_2 = _mm_madd_pi16(rgb_u.value, ctotal2.value);
 541     c64_MMX v_total32_2 = _mm_madd_pi16(rgb_v.value, ctotal2.value);
 542
 543     c64_MMX quadword = yy; // four y values: at 0, 2, 4 and 6
 544
          // +1 in the shifts divides by the two pixels summed into each ctotal.
 545     c64_MMX uv; uv.Init16(
 546         ((v_total32_0.Extract32<0>() + v_total32_0.Extract32<1>()) >> (RGB2YUV_SHIFT+1)),
 547         ((u_total32_0.Extract32<0>() + u_total32_0.Extract32<1>()) >> (RGB2YUV_SHIFT+1)),
 548         ((v_total32_2.Extract32<0>() + v_total32_2.Extract32<1>()) >> (RGB2YUV_SHIFT+1)),
 549         ((u_total32_2.Extract32<0>() + u_total32_2.Extract32<1>()) >> (RGB2YUV_SHIFT+1)) );
 550     c64_MMX uv_adds; uv_adds.Init16(V_ADD, U_ADD, V_ADD, U_ADD);
 551     uv = uv.add16(uv_adds);
 552
 553     quadword |= uv << 8;     // two u and v values: at 1, 3, 5 and 7.
 554     quadword.Put(dest_yvyu); // write all eight bytes: y at 0, 2, 4, 6; v/u at 1, 3, 5, 7
 555 }
 556 #endif
 557
 558 /*template<int PixStride>
 559 void Convert_4byte_To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
 560     __attribute__((noinline));*/
 561
 /* Converts a packed frame with PixStride bytes per pixel into planar 4:2:0
  * output at dest.
  *
  * data     source pixels, `width` per row, rows stored back to back
  * dest     receives npixels luma bytes followed by two chroma planes of
  *          npixels/4 bytes each
  * npixels  total pixel count; the row count is npixels / width
  * width    pixels per row
  *
  * The frame is walked in 2x2 blocks: every block yields four luma bytes and
  * one sample in each chroma plane.  The value computed with the RV/GV/BV
  * weights is written to the plane starting at offset npixels, the one with
  * the RU/GU/BU weights to the plane starting at npixels + npixels/4.
  * Source bytes 0, 1 and 2 of each pixel are weighted as R, G and B.
  *
  * Each iteration also reads the row below and the pixel to the right, with
  * no bounds check, so width and the row count are presumably expected to be
  * even -- confirm at the call sites.
  * NOTE(review): I420 conventionally stores the U plane before the V plane;
  * here the V-weighted sample goes to the first chroma plane.  Whether that
  * is intended depends on the source byte order used by the callers.
  */
 562 template<int PixStride>
 563 void Convert_4byte_To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
 564 {
 565     const unsigned char* src = (const unsigned char*) data;
 566     unsigned height = npixels / width;
 567
 568     unsigned pos = 0;
 569     unsigned ypos = 0;
 570     unsigned vpos = npixels;
 571     unsigned upos = vpos + npixels / 4;
 572     unsigned stride = width*PixStride;
 573
 574     /*fprintf(stderr, "npixels=%u, width=%u, height=%u, ypos=%u,upos=%u,vpos=%u",
 575         npixels,width,height, ypos,upos,vpos);*/
 576
 577     /* This function is based on code from x264 svn version 711 */
 578     /* TODO: Apply MMX optimization for 24-bit pixels */
 579
 580     for(unsigned y=0; y<height; y += 2)
 581     {
 582         for(unsigned x=0; x<width; x += 2)
 583         {
 584         #ifdef __MMX__
 585           if(PixStride == 4)
 586           {
 587             c64_MMX p0_1; p0_1.Get(&src[pos]);        // two 32-bit pixels (4*8)
 588             c64_MMX p2_3; p2_3.Get(&src[pos+stride]); // two 32-bit pixels
 589
 590             pos += PixStride*2;
 591
 592             Convert_I420_MMX_Common(p0_1, p2_3,
 593                 dest+ypos,
 594                 dest+ypos+width,
 595                 dest+upos++,
 596                 dest+vpos++);
 597           }
 598           else
 599         #endif
 600           {
                  /* c[] accumulates the per-channel sum of the 2x2 block;
                   * rgb[channel][k] keeps the individual pixels for luma. */
 601             int c[3], rgb[3][4];
 602
 603             /* luma */
 604             for(int n=0; n<3; ++n) c[n]  = rgb[n][0] = src[pos + n];
 605             for(int n=0; n<3; ++n) c[n] += rgb[n][1] = src[pos + n + stride];
 606             pos += PixStride;
 607
 608             for(int n=0; n<3; ++n) c[n] += rgb[n][2] = src[pos + n];
 609             for(int n=0; n<3; ++n) c[n] += rgb[n][3] = src[pos + n + stride];
 610             pos += PixStride;
 611
                  /* Same order as rgb[][k]: top-left, bottom-left, top-right, bottom-right */
 612             unsigned destpos[4] = { ypos, ypos+width, ypos+1, ypos+width+1 };
 613             for(int n=0; n<4; ++n)
 614             {
 615                 dest[destpos[n]]
 616                     = Y_ADD + ((RY * rgb[0][n]
 617                               + GY * rgb[1][n]
 618                               + BY * rgb[2][n]
 619                                ) >> RGB2YUV_SHIFT);  // y
 620             }
 621
                  /* +2 in the shift divides by the four pixels summed into c[] */
 622             dest[upos++] = (U_ADD + ((RU * c[0] + GU * c[1] + BU * c[2]) >> (RGB2YUV_SHIFT+2)) );
 623             dest[vpos++] = (V_ADD + ((RV * c[0] + GV * c[1] + BV * c[2]) >> (RGB2YUV_SHIFT+2)) );
 624           }
 625
 626             ypos += 2;
 627         }
              /* Skip the second row of the pair, which was consumed above */
 628         pos += stride;
 629         ypos += width;
 630     }
 631
 632     /*fprintf(stderr, ",yr=%u,ur=%u,vr=%u\n",
 633         ypos,upos,vpos);*/
 634
 635     #ifdef __MMX__
 636      MMX_clear();
 637     #endif
 638 }
 639
 /* Converts a packed frame with PixStride bytes per pixel into packed 4:2:2
  * output at dest, two output bytes per pixel.
  *
  * data     source pixels, `width` per row, rows stored back to back
  * dest     receives 2*npixels bytes
  * npixels  total pixel count; the row count is npixels / width
  * width    pixels per row
  *
  * Every pixel pair becomes four bytes: luma of the first pixel, the sample
  * computed with the RV/GV/BV weights, luma of the second pixel, and the
  * sample computed with the RU/GU/BU weights.  Source bytes 0, 1 and 2 of
  * each pixel are weighted as R, G and B.
  *
  * With __MMX__ and PixStride == 4, four pixels are converted per iteration;
  * that branch advances x and ypos once itself, on top of the common
  * increments, so width is presumably expected to be a multiple of four in
  * that configuration (a multiple of two otherwise) -- nothing checks it.
  */
 640 template<int PixStride>
 641 void Convert_4byte_To_YUY2Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
 642 {
 643     const unsigned char* src = (const unsigned char*) data;
 644     unsigned height = npixels / width;
 645     unsigned pos = 0;
 646     unsigned ypos = 0;
 647     unsigned stride = width*PixStride;
 648
 649     /* This function is based on code from x264 svn version 711 */
 650     /* TODO: Apply MMX optimization for 24-bit pixels */
 651
 652     for(unsigned y=0; y<height; ++y)
 653     {
 654         for(unsigned x=0; x<width; x += 2)
 655         {
 656         #ifdef __MMX__
 657           if(PixStride == 4)
 658           {
 659             c64_MMX p0_1; p0_1.Get(&src[pos]);        // two 32-bit pixels (4*8)
 660             pos += PixStride*2;
 661
 662             c64_MMX p2_3; p2_3.Get(&src[pos]);        // two 32-bit pixels (4*8)
 663             pos += PixStride*2;
 664             x += 2;
 665
 666             Convert_YUY2_MMX_Common(p0_1, p2_3,
 667                 dest+ypos);
 668
                  /* First half of the 8 output bytes; the common increment
                   * after the if/else accounts for the other half. */
 669             ypos += 4;
 670           }
 671           else
 672         #endif
 673           {
                  /* c[] holds the per-channel sum of the pixel pair;
                   * rgb[channel][k] keeps the two pixels for luma. */
 674             int c[3], rgb[3][2];
 675
 676             /* luma */
 677             for(int n=0; n<3; ++n) c[n]  = rgb[n][0] = src[pos + n];
 678             pos += PixStride;
 679
 680             for(int n=0; n<3; ++n) c[n] += rgb[n][1] = src[pos + n];
 681             pos += PixStride;
 682
 683             for(int n=0; n<2; ++n)
 684             {
 685                 dest[ypos + n*2]
 686                     = Y_ADD + ((RY * rgb[0][n]
 687                               + GY * rgb[1][n]
 688                               + BY * rgb[2][n]
 689                                ) >> RGB2YUV_SHIFT);  // y
 690             }
 691
                  /* +1 in the shift divides by the two pixels summed into c[] */
 692             dest[ypos+3] = (U_ADD + ((RU * c[0] + GU * c[1] + BU * c[2]) >> (RGB2YUV_SHIFT+1)) );
 693             dest[ypos+1] = (V_ADD + ((RV * c[0] + GV * c[1] + BV * c[2]) >> (RGB2YUV_SHIFT+1)) );
 694           }
 695             ypos += 4;
 696         }
 697     }
 698     #ifdef __MMX__
 699     MMX_clear();
 700     #endif
 701 }
 702
 703 /*template<int roffs,int rbits, int goffs,int gbits, int boffs,int bbits>
 704 void Convert_2byte_To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
 705     __attribute__((noinline));*/
 706
 /* Converts a frame of 16-bit pixels into planar 4:2:0 output at dest.
  *
  * roffs/rbits, goffs/gbits, boffs/bbits describe the colour fields inside a
  * 16-bit pixel and are forwarded to Convert_2byte_to_24or32Common.
  *
  * data     source pixels, `width` per row, rows stored back to back
  * dest     receives npixels luma bytes followed by two chroma planes of
  *          npixels/4 bytes each (the RV/GV/BV-weighted sample goes to the
  *          first, the RU/GU/BU-weighted one to the second)
  * npixels  total pixel count; the row count is npixels / width
  * width    pixels per row
  *
  * Work is done on tiles of 8x2 pixels: both rows of a tile are first widened
  * to 32-bit pixels in a local buffer, then every 2x2 block of the tile yields
  * four luma bytes and one sample per chroma plane.  Nothing checks that width
  * is a multiple of 8 or that the row count is even.
  *
  * NOTE(review): the scalar path weights byte 0 of each widened pixel with
  * the R coefficients, while Convert_2byte_to_24or32Common stores the field
  * selected by boffs in byte 0 -- so the instantiations presumably pass the
  * red field's position as boffs; confirm where this template is used.
  */
 707 template<int roffs,int rbits, int goffs,int gbits, int boffs,int bbits>
 708 void Convert_2byte_To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
 709 {
 710     const unsigned PixStride = 2;
 711     const unsigned char* src = (const unsigned char*) data;
 712     unsigned height = npixels / width;
 713     unsigned pos = 0;
 714     unsigned ypos = 0;
 715     unsigned vpos = npixels;
 716     unsigned upos = vpos + npixels / 4;
 717     unsigned stride = width*PixStride;
 718
 719     /* This function is based on code from x264 svn version 711 */
 720
 721     for(unsigned y=0; y<height; y += 2)
 722     {
 723         for(unsigned x=0; x<width; x += 8)
 724         {
 725             unsigned char Rgb2byteBuf[2][8][4];
 726
 727             /* Convert 8 pixels from two scanlines (16 in total)
 728              * from RGB15 / RGB16 to RGB32
 729              * (Not RGB24, because RGB32 conversion is faster)
 730              */
 731             Convert_2byte_to_24or32Common
 732                 <roffs,rbits, goffs,gbits, boffs,bbits, false>
 733                 (src+pos,        Rgb2byteBuf[0][0]);
 734
 735             Convert_2byte_to_24or32Common
 736                 <roffs,rbits, goffs,gbits, boffs,bbits, false>
 737                 (src+pos+stride, Rgb2byteBuf[1][0]);
 738
 739             pos += 16;
 740
 741             for(int x8 = 0; x8 < 8; x8 += 2)
 742             {
                    /* NOTE(review): the macro tested here is _q_MMX__, not
                     * __MMX__, so the MMX branch is compiled out unless
                     * something defines that name -- presumably a deliberate
                     * way of disabling it; confirm. */
 743               #ifdef _q_MMX__
 744                 c64_MMX p0_1; p0_1.Get(&Rgb2byteBuf[0][x8][0]); // two 32-bit pixels (4*8)
 745                 c64_MMX p2_3; p2_3.Get(&Rgb2byteBuf[1][x8][0]); // two 32-bit pixels
 746
 747                 Convert_I420_MMX_Common(p0_1, p2_3,
 748                     dest+ypos,
 749                     dest+ypos+width,
 750                     dest+upos++,
 751                     dest+vpos++);
 752               #else
 753                 int c[3];
 754                 /* TODO: Some faster means than using pointers */
 755                 unsigned char* rgb[4] =
 756                 {
 757                     Rgb2byteBuf[0][x8+0],
 758                     Rgb2byteBuf[0][x8+1],
 759                     Rgb2byteBuf[1][x8+0],
 760                     Rgb2byteBuf[1][x8+1]
 761                 };
 762
                      /* Per-channel sum of the 2x2 block, for chroma */
 763                 for(int m=0; m<3; ++m) c[m] = 0;
 764                 for(int n=0; n<4; ++n)
 765                     for(int m=0; m<3; ++m)
 766                         c[m] += rgb[n][m];
 767
                      /* Same order as rgb[]: top-left, top-right, bottom-left, bottom-right */
 768                 unsigned destpos[4] = { ypos, ypos+1, ypos+width, ypos+width+1 };
 769                 for(int n=0; n<4; ++n)
 770                 {
 771                     dest[destpos[n]]
 772                         = Y_ADD + ((RY * rgb[n][0]
 773                                   + GY * rgb[n][1]
 774                                   + BY * rgb[n][2]
 775                                    ) >> RGB2YUV_SHIFT);  // y
 776                 }
 777
 778                 /*c[0] /= 4; c[1] /= 4; c[2] /= 4;*/
 779                 // Note: +2 is because c[] contains 4 values
 780                 dest[upos++] = U_ADD + ((RU * c[0] + GU * c[1] + BU * c[2]) >> (RGB2YUV_SHIFT+2));
 781                 dest[vpos++] = V_ADD + ((RV * c[0] + GV * c[1] + BV * c[2]) >> (RGB2YUV_SHIFT+2));
 782               #endif
 783                 ypos += 2;
 784             }
 785         }
              /* Skip the second row of the pair, which was consumed above */
 786         pos += stride;
 787         ypos += width;
 788     }
 789
 790     #ifdef __MMX__
 791     MMX_clear();
 792     #endif
 793 }
 794
 795 template<int roffs,int rbits, int goffs,int gbits, int boffs,int bbits>
 796 void Convert_2byte_To_YUY2Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
 797 {
 798     const unsigned PixStride = 2;
 799     const unsigned char* src = (const unsigned char*) data;
 800     unsigned height = npixels / width;
 801     unsigned pos = 0;
 802     unsigned ypos = 0;
 803     unsigned stride = width*PixStride;
 804
 805     for(unsigned y=0; y<height; ++y)
 806     {
 807         for(unsigned x=0; x<width; x += 8)
 808         {
 809             unsigned char Rgb2byteBuf[8][4];
 810
 811             /* Convert 8 pixels from a scanline
 812              * from RGB15 / RGB16 to RGB32
 813              * (Not RGB32, because RGB32 conversion is faster)
 814              */
 815             Convert_2byte_to_24or32Common
 816                 <roffs,rbits, goffs,gbits, boffs,bbits, false>
 817                 (src+pos, Rgb2byteBuf[0]);
 818
 819             pos += 16;
 820
 821             for(int x8 = 0; x8 < 8; )
 822             {
 823               #ifdef __MMX__
 824                 c64_MMX p0_1; p0_1.Get(&Rgb2byteBuf[x8  ][0]); // two 32-bit pixels (4*8)
 825                 c64_MMX p2_3; p2_3.Get(&Rgb2byteBuf[x8+2][0]); // two 32-bit pixels (4*8)
 826                 Convert_YUY2_MMX_Common(p0_1, p2_3, dest+ypos);
 827                 x8   += 4;
 828                 ypos += 8;
 829               #else
 830                 int c[3];
 831                 /* TODO: Some faster means than using pointers */
 832                 unsigned char* rgb[2] =
 833                 {
 834                     Rgb2byteBuf[x8+0],
 835                     Rgb2byteBuf[x8+1],
 836                 };
 837
 838                 for(int m=0; m<3; ++m) c[m] = 0;
 839                 for(int n=0; n<2; ++n)
 840                     for(int m=0; m<3; ++m)
 841                         c[m] += rgb[n][m];
 842
 843                 for(int n=0; n<2; ++n)
 844                 {
 845                     dest[ypos + n*2]
 846                         = Y_ADD + ((RY * rgb[n][0]
 847                                   + GY * rgb[n][1]
 848                                   + BY * rgb[n][2]
 849                                    ) >> RGB2YUV_SHIFT);  // y
 850                 }
 851
 852                 /*c[0] /= 4; c[1] /= 4; c[2] /= 4;*/
 853                 // Note: +2 is because c[] contains 4 values
 854                 dest[ypos+3] = U_ADD + ((RU * c[0] + GU * c[1] + BU * c[2]) >> (RGB2YUV_SHIFT+1));
 855                 dest[ypos+1] = V_ADD + ((RV * c[0] + GV * c[1] + BV * c[2]) >> (RGB2YUV_SHIFT+1));
 856                 x8   += 2;
 857                 ypos += 4;
 858               #endif
 859             }
 860         }
 861     }
 862
 863     #ifdef __MMX__
 864     MMX_clear();
 865     #endif
 866 }
 867
 868
 869 /***/
 870
 871 void Convert_I420To24Frame(const void* data, unsigned char* dest,
 872                            unsigned npixels, unsigned width, bool swap_red_blue)
 873 {
 874     const unsigned char* src = (const unsigned char*) data;
 875     unsigned height = npixels / width;
 876     unsigned pos = 0;
 877     unsigned ypos = 0;
 878     unsigned vpos = npixels;
 879     unsigned upos = vpos + npixels / 4;
 880
 881     /*fprintf(stderr, "npixels=%u, width=%u, height=%u, ypos=%u,upos=%u,vpos=%u\n",
 882         npixels,width,height, ypos,upos,vpos);*/
 883
 884     #ifdef __MMX__
 885     c64_MMX rgb[4], yy[4];
 886     static const c64_MMX vmul/*; vmul.Init16*/(VR, VG, 0, 0);  // R,G,B,0 * vmul = V
 887     static const c64_MMX umul/*; umul.Init16*/(0, UG, UB, 0);  // R,G,B,0 * umul = U
 888     #endif
 889
 890     /*
 891         Y input: 16..235
 892         U input: 16..240
 893         V input: 16..240
 894
 895     */
 896
 897   #pragma omp parallel for
 898     for(unsigned y=0; y<height; y += 2)
 899     {
 900         for(unsigned x=0; x<width; )
 901         {
 902         #ifdef __MMX__
 903             rgb[0]=rgb[1]=rgb[2]=rgb[3]=yy[0]=yy[1]=yy[2]=yy[3]=c64_MMX(mask64hd)|mask64ld;
 904             /* Somehow, this line above fixes an error
 905              * where U&V seem to be off by 4 pixels.
 906              * Probably a GCC bug? */
 907
 908             /* Load 4 U and V values and subtract U_ADD and V_ADD from them. */
 909             uint64_t tmp_u = *(uint32_t*)&src[upos];
 910             uint64_t tmp_v = *(uint32_t*)&src[vpos];
 911             c64_MMX uuq = c64_MMX(0)
 912                      .unpacklbw(tmp_u) // 8-bit to 16-bit
 913                      .sub16(Bits16const<U_ADD,U_ADD>::value)
 914                      .shl16(16 - YUV2RGB_SHIFT); // shift them so that *13bitconst results in upper 16 bits having the actual value
 915             c64_MMX vvq = c64_MMX(0)
 916                      .unpacklbw(tmp_v)
 917                      .sub16(Bits16const<V_ADD,V_ADD>::value)
 918                      .shl16(16 - YUV2RGB_SHIFT); // shift them so that *13bitconst results in upper 16 bits having the actual value
 919
 920             const short* uu = (const short*)&uuq;
 921             const short* vv = (const short*)&vvq;
 922
 923             /* c64_MMX rgb[4]; // four sets of 4*int16, each representing 1 rgb value */
 924             for(int n=0; n<4; ++n)
 925             {
 926                 /* vv is shifted by 3 bits, vmul is shifted by 13 bits
 927                  * 16 bits in total, so mul16hi gets the 16-bit downscaled part */
 928                 c64_MMX v; v.Init16(vv[n]);
 929                 c64_MMX u; u.Init16(uu[n]);
 930                 rgb[n] = v.mul16hi(vmul).add16(
 931                          u.mul16hi(umul)      );
 932             }
 933
 934             /* rgb[0] : U,V increment of RGB32 for x0,y0 - x1,y1
 935              * rgb[1] : U,V increment of RGB32 for x2,y0 - x3,y1
 936              * rgb[2] : U,V increment of RGB32 for x4,y0 - x5,y1
 937              * rgb[3] : U,V increment of RGB32 for x6,y0 - x7,y1
 938              */
 939
 940             unsigned yyoffs[4] = { ypos, ypos+1, ypos+width, ypos+width+1 };
 941             /* c64_MMX yy[4]; // four sets of 4*int16, each representing four Y values */
 942             for(int n=0; n<4; ++n)
 943             {
 944                 c64_MMX luma; luma.Init16(
 945                     src[yyoffs[0]+n*2],  /* n(0..3): x0y0,x2y0,x4y0,x6y0 */
 946                     src[yyoffs[1]+n*2],  /* n(0..3): x1y0,x3y0,x5y0,x7y0 */
 947                     src[yyoffs[2]+n*2],  /* n(0..3): x0y1,x2y1,x4y1,x6y1 */
 948                     src[yyoffs[3]+n*2]   /* n(0..3): x1y1,x3y1,x5y1,x7y1 */
 949                 );
 950                 luma = luma.sub16(Bits16const<Y_ADD,Y_ADD>::value);
 951                 luma = luma.shl16(16 - YUV2RGB_SHIFT);
 952                 yy[n] = luma.mul16hi(Bits16const<Y_REV,Y_REV>::value);
 953             }
 954             const short* const yyval = (const short*) &yy[0].value;
 955             /*
 956                 values in order:
 957                    x0y0 x1y0 x0y1 x1y1
 958                    x2y0 x3y0 x2y1 x3y1
 959                    x4y0 x5y0 x4y1 x5y1
 960                    x6y0 x7y0 x6y1 x7y1
 961             */
 962             int tmppos = pos;
 963             for(int ny = 0; ny < 4; ny += 2)
 964             {
 965                 /* Note: We must use 16-bit pixels here instead of 8-bit,
 966                  * because the rgb+Y addition can overflow. conv_s16_u8()
 967                  * does the necessary clamping, which would not be done
 968                  * if the values were 8-bit.
 969                  */
 970                 // 8 pixels for one scanline, repeated twice
 971                 /* Note: C++ has no named constructors, so we
 972                  * use statement blocks here as substitutes.
 973                  */
 974                 c64_MMX r0
 975                     = rgb[0].add16( ({ c64_MMX tmp; tmp.Init16(yyval[ny+0]); tmp; }) )
 976                            .conv_s16_u8(
 977                       rgb[0].add16( ({ c64_MMX tmp; tmp.Init16(yyval[ny+1]); tmp; }) ));
 978                 c64_MMX r1
 979                     = rgb[1].add16( ({ c64_MMX tmp; tmp.Init16(yyval[ny+4]); tmp; }) )
 980                            .conv_s16_u8(
 981                       rgb[1].add16( ({ c64_MMX tmp; tmp.Init16(yyval[ny+5]); tmp; }) ));
 982                 c64_MMX r2
 983                     = rgb[2].add16( ({ c64_MMX tmp; tmp.Init16(yyval[ny+8]); tmp; }) )
 984                            .conv_s16_u8(
 985                       rgb[2].add16( ({ c64_MMX tmp; tmp.Init16(yyval[ny+9]); tmp; }) ));
 986                 c64_MMX r3
 987                     = rgb[3].add16( ({ c64_MMX tmp; tmp.Init16(yyval[ny+12]); tmp; }) )
 988                            .conv_s16_u8(
 989                       rgb[3].add16( ({ c64_MMX tmp; tmp.Init16(yyval[ny+13]); tmp; }) ));
 990
 991                 Convert32To24_32bytes(r0,r1,r2,r3, &dest[tmppos]);
 992                 tmppos += width*3; // next line
 993             }
 994             upos += 4;
 995             vpos += 4;
 996             ypos += 8;   // eight bytes for this line (and eight from next too)
 997             pos  += 8*3; // eight triplets generated on this line
 998             x    += 8;   // eight yy values used on this line
 999         #else /* non-MMX */
1000             int u = src[upos] - U_ADD;
1001             int v = src[vpos] - V_ADD;
1002
1003             int rgb[3] =
1004                 {
1005                    (VR * v         ) >> (YUV2RGB_SHIFT),
1006                    (VG * v + UG * u) >> (YUV2RGB_SHIFT),
1007                    (       + UB * u) >> (YUV2RGB_SHIFT)
1008                 };
1009
1010             unsigned incr[4] = {0,1,width,width+1};
1011
1012             for(unsigned r=0; r<4; ++r)
1013                 for(unsigned doffs=pos + incr[r]*3, yoffs=ypos + incr[r],
1014                         yy = (Y_REV * (src[yoffs] - Y_ADD)) >> YUV2RGB_SHIFT,
1015                         n=0; n<3; ++n)
1016                     dest[doffs+n] = c64::clamp_u8(rgb[n] + (int)yy);
1017
1018             upos += 1;
1019             vpos += 1;
1020             ypos += 2; // two bytes for this line (two from next line)
1021             pos  += 2*3; // two triplets generated on this line
1022             x    += 2; // two yy values used on this line
1023         #endif
1024         }
1025         ypos += width;
1026         pos += 3*width;
1027     }
1028     #ifdef __MMX__
1029     MMX_clear();
1030     #endif
1031 }
1032
1033 void Convert_YUY2To24Frame(const void* data, unsigned char* dest,
1034                            unsigned npixels, unsigned width, bool swap_red_blue)
1035 {
1036     const unsigned char* src = (const unsigned char*) data;
1037     unsigned height = npixels / width;
1038     unsigned pos = 0;
1039     unsigned ypos = 0;
1040
1041     /* TODO: MMX optimization */
1042
1043     /*
1044         Y input: 16..235
1045         U input: 16..240
1046         V input: 16..240
1047
1048     */
1049   #pragma omp parallel for
1050     for(unsigned y=0; y<height; ++y)
1051     {
1052         for(unsigned x=0; x<width; x += 2)
1053         {
1054             /* non-MMX */
1055             int u = src[ypos+1] - U_ADD;
1056             int v = src[ypos+3] - V_ADD;
1057
1058             int rgb[3] =
1059                 {
1060                    (VR * v         ) >> (YUV2RGB_SHIFT),
1061                    (VG * v + UG * u) >> (YUV2RGB_SHIFT),
1062                    (       + UB * u) >> (YUV2RGB_SHIFT)
1063                 };
1064
1065             for(unsigned r=0; r<2; ++r)
1066                 for(unsigned doffs=pos + r*3, yoffs=ypos+r*2,
1067                         yy = (Y_REV * (src[yoffs] - Y_ADD)) >> YUV2RGB_SHIFT,
1068                         n=0; n<3; ++n)
1069                     dest[doffs+n] = c64::clamp_u8(rgb[n] + (int)yy);
1070
1071             ypos += 4; // four bytes for this line (y,u,y,v)
1072             pos  += 2*3; // two triplets generated on this line
1073             x    += 2; // two yy values used on this line
1074         }
1075     }
1076 }
1077
1078 /***/
1079 void Convert24To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
1080 {
1081     Convert_4byte_To_I420Frame<3>(data,dest,npixels,width);
1082 }
1083 void Convert32To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
1084 {
1085     Convert_4byte_To_I420Frame<4>(data,dest,npixels,width);
1086 }
1087 void Convert15To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
1088 {
1089     Convert_2byte_To_I420Frame<10,5, 5,5, 0,5>(data,dest,npixels,width);
1090 }
1091 void Convert16To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
1092 {
1093     Convert_2byte_To_I420Frame<11,5, 5,6, 0,5>(data,dest,npixels,width);
1094 }
1095 /***/
1096 void Convert24To_YUY2Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
1097 {
1098     Convert_4byte_To_YUY2Frame<3>(data,dest,npixels,width);
1099 }
1100 void Convert32To_YUY2Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
1101 {
1102     Convert_4byte_To_YUY2Frame<4>(data,dest,npixels,width);
1103 }
1104 void Convert15To_YUY2Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
1105 {
1106     Convert_2byte_To_YUY2Frame<10,5, 5,5, 0,5>(data,dest,npixels,width);
1107 }
1108 void Convert16To_YUY2Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
1109 {
1110     Convert_2byte_To_YUY2Frame<11,5, 5,6, 0,5>(data,dest,npixels,width);
1111 }