/* Source: jpcrr.git / streamtools / rgbtorgb.cc
 * (blob 88743b81da9a073dbd7739500f6ea55e3f972cf5,
 *  viewed at commit "Allow size-specific resizers")
 */
#include <stdint.h>
#include <stdlib.h> // for size_t
#include <string.h> // for memcpy
#include <vector>
#include <cmath>

/* RGB to RGB and RGB from/to I420 conversions written by Bisqwit
 * Copyright (C) 1992,2008 Joel Yliluoma (http://iki.fi/bisqwit/)
 */

typedef uint_least64_t uint64_t;

#include "quantize.hh"
#include "rgbtorgb.hh"
#include "simd.hh"
/* For BPP conversions */

/* Masks used when packing 32-bit pixels into 24-bit pixels. */
static const uint64_t mask24l    __attribute__((aligned(8))) = 0x0000000000FFFFFFULL;
static const uint64_t mask24h    __attribute__((aligned(8))) = 0x0000FFFFFF000000ULL;
static const uint64_t mask24hh   __attribute__((aligned(8))) = 0xffff000000000000ULL;
static const uint64_t mask24hhh  __attribute__((aligned(8))) = 0xffffffff00000000ULL;
static const uint64_t mask24hhhh __attribute__((aligned(8))) = 0xffffffffffff0000ULL;

/* Lane-selection masks: alternating bytes, words and dwords. */
static const uint64_t mask64h  __attribute__((aligned(8))) = 0xFF00FF00FF00FF00ULL;
static const uint64_t mask64l  __attribute__((aligned(8))) = 0x00FF00FF00FF00FFULL;
static const uint64_t mask64hw __attribute__((aligned(8))) = 0xFFFF0000FFFF0000ULL;
static const uint64_t mask64lw __attribute__((aligned(8))) = 0x0000FFFF0000FFFFULL;
static const uint64_t mask64hd __attribute__((aligned(8))) = 0xFFFFFFFF00000000ULL;
static const uint64_t mask64ld __attribute__((aligned(8))) = 0x00000000FFFFFFFFULL;
/* For RGB2YUV: */

static const int RGB2YUV_SHIFT = 15; /* highest value where [RGB][YUV] fit in signed short */

/* Fixed-point coefficients, pre-scaled by (1 << RGB2YUV_SHIFT). */
static const int RY = 8414;  //  ((int)(( 65.738/256.0)*(1<<RGB2YUV_SHIFT)+0.5));
static const int RV = 14392; //  ((int)((112.439/256.0)*(1<<RGB2YUV_SHIFT)+0.5));
static const int RU = -4856; //  ((int)((-37.945/256.0)*(1<<RGB2YUV_SHIFT)+0.5));

static const int GY = 16519; //  ((int)((129.057/256.0)*(1<<RGB2YUV_SHIFT)+0.5));
static const int GV = -12051;//  ((int)((-94.154/256.0)*(1<<RGB2YUV_SHIFT)+0.5));
static const int GU = -9534; //  ((int)((-74.494/256.0)*(1<<RGB2YUV_SHIFT)+0.5));

static const int BY = 3208;  //  ((int)(( 25.064/256.0)*(1<<RGB2YUV_SHIFT)+0.5));
static const int BV = -2339; //  ((int)((-18.285/256.0)*(1<<RGB2YUV_SHIFT)+0.5));
static const int BU = 14392; //  ((int)((112.439/256.0)*(1<<RGB2YUV_SHIFT)+0.5));

/* Offsets added to the scaled Y, U and V values. */
static const int Y_ADD = 16;
static const int U_ADD = 128;
static const int V_ADD = 128;

/* For YUV2RGB: */

static const int YUV2RGB_SHIFT = 13; /* highest value where UB still fits in signed short */

/* Fixed-point coefficients, pre-scaled by (1 << YUV2RGB_SHIFT). */
static const int Y_REV = 9539; // ((int)( (  255 / 219.0 )     * (1<<YUV2RGB_SHIFT)+0.5));
static const int VR = 14688;   // ((int)( ( 117504 / 65536.0 ) * (1<<YUV2RGB_SHIFT)+0.5));
static const int VG = -6659;   // ((int)( ( -53279 / 65536.0 ) * (1<<YUV2RGB_SHIFT)+0.5));
static const int UG = -3208;   // ((int)( ( -25675 / 65536.0 ) * (1<<YUV2RGB_SHIFT)+0.5));
static const int UB = 16525;   // ((int)( ( 132201 / 65536.0 ) * (1<<YUV2RGB_SHIFT)+0.5));
61 /****************/
63 template<typename c64>
64 static inline void Convert32To24_32bytes(c64 w0, c64 w1, c64 w2, c64 w3, unsigned char* dest)
66 c64 r0 = (w0 & mask24l) | ((w0 >> 8) & mask24h); /* bbbaaa */
67 c64 r1 = (w1 & mask24l) | ((w1 >> 8) & mask24h); /* dddccc */
68 c64 r2 = (w2 & mask24l) | ((w2 >> 8) & mask24h); /* fffeee */
69 c64 r3 = (w3 & mask24l) | ((w3 >> 8) & mask24h); /* hhhggg */
71 /* ccbbbaaa */
72 ((r0 ) | ((r1 << 48) & mask24hh)).Put(dest+0);
73 /* feeedddc */
74 ((r1 >> 16) | ((r2 << 32) & mask24hhh)).Put(dest+8);
75 /* hhhgggff */
76 ((r2 >> 32) | ((r3 << 16) & mask24hhhh)).Put(dest+16);
79 #if defined(__x86_64) || defined(USE_MMX)
80 static void Convert32To24_32bytes(const unsigned char* src,
81 unsigned char* dest)
83 c64 w0; w0.Get(src+0);
84 c64 w1; w1.Get(src+8);
85 c64 w2; w2.Get(src+16);
86 c64 w3; w3.Get(src+24);
87 Convert32To24_32bytes(w0,w1,w2,w3, dest);
89 #endif
91 void Convert32To24Frame(const void* data, unsigned char* dest, unsigned npixels)
93 const unsigned char* src = (const unsigned char*)data;
95 #if defined(__x86_64) || defined(USE_MMX)
96 while(npixels >= 8)
98 Convert32To24_32bytes(src, dest);
99 src += 4*8;
100 dest += 3*8;
101 npixels -= 8;
103 #ifdef USE_MMX
104 MMX_clear();
105 #endif
106 #endif
108 for(unsigned pos=0; pos<npixels; ++pos)
110 dest[3*pos+0] = src[4*pos+0];
111 dest[3*pos+1] = src[4*pos+1];
112 dest[3*pos+2] = src[4*pos+2];
/* Expands one 5:6:5 pixel into three 8-bit components.
 *
 * target[0] receives bits 11..15, target[1] bits 5..10 and target[2]
 * bits 0..4 of rgb16. Each component is widened by shifting it to the
 * top of the byte and replicating its high bits into the vacated low
 * bits, which is the same scaling Convert_2byte_helper applies, so a
 * full-scale input yields 255 rather than 248/252.
 */
static void Unbuild16(unsigned char* target, unsigned rgb16)
{
    unsigned lo5 = rgb16 & 31u;          /* bits 0..4   */
    unsigned mid6 = (rgb16 >> 5) & 63u;  /* bits 5..10  */
    unsigned hi5 = (rgb16 >> 11) & 31u;  /* bits 11..15 */

    target[0] = (unsigned char)((hi5 << 3) | (hi5 >> 2));
    target[1] = (unsigned char)((mid6 << 2) | (mid6 >> 4));
    target[2] = (unsigned char)((lo5 << 3) | (lo5 >> 2));
}
/* Expands one 5:5:5 pixel into three 8-bit components.
 *
 * target[0] receives bits 10..14, target[1] bits 5..9 and target[2]
 * bits 0..4 of rgb16; bit 15 is ignored. Each component is widened by
 * shifting it to the top of the byte and replicating its high bits into
 * the vacated low bits, which is the same scaling Convert_2byte_helper
 * applies, so a full-scale input yields 255 rather than 248.
 */
static void Unbuild15(unsigned char* target, unsigned rgb16)
{
    unsigned lo5 = rgb16 & 31u;          /* bits 0..4   */
    unsigned mid5 = (rgb16 >> 5) & 31u;  /* bits 5..9   */
    unsigned hi5 = (rgb16 >> 10) & 31u;  /* bits 10..14 */

    target[0] = (unsigned char)((hi5 << 3) | (hi5 >> 2));
    target[1] = (unsigned char)((mid5 << 3) | (mid5 >> 2));
    target[2] = (unsigned char)((lo5 << 3) | (lo5 >> 2));
}
/* 64-bit constant made of four 16-bit lanes: lo, hi, lo, hi (lowest
 * lane first). Each argument is truncated to 16 bits.
 *
 * static_value is the in-class compile-time value; value is the same
 * number with an out-of-class definition, so it has an address.
 */
template<int basevalue_lo, int basevalue_hi>
struct Bits16const
{
    static const uint64_t static_value =
      (( ((uint64_t)(unsigned short) basevalue_lo) <<  0)
     | ( ((uint64_t)(unsigned short) basevalue_hi) << 16)
     | ( ((uint64_t)(unsigned short) basevalue_lo) << 32)
     | ( ((uint64_t)(unsigned short) basevalue_hi) << 48));
    static const uint64_t value;
};
template<int basevalue_lo, int basevalue_hi>
const uint64_t Bits16const<basevalue_lo, basevalue_hi>::value =
    Bits16const<basevalue_lo, basevalue_hi>::static_value;
/* 64-bit constant made of two 32-bit lanes: lo in the low half, hi in
 * the high half. Each argument is reinterpreted as unsigned int.
 */
template<int basevalue_lo, int basevalue_hi>
struct Bits32const
{
    static const uint64_t static_value =
      (( ((uint64_t)(unsigned int) basevalue_lo) <<  0)
     | ( ((uint64_t)(unsigned int) basevalue_hi) << 32));
    static const uint64_t value = static_value;
};/*
template<int basevalue_lo, int basevalue_hi>
const uint64_t Bits32const<basevalue_lo, basevalue_hi>::value =
    Bits32const<basevalue_lo, basevalue_hi>::static_value;*/
/* 64-bit constant made of eight byte lanes that alternate lo, hi, lo,
 * hi, ... starting from the lowest byte. The arguments are not masked,
 * so callers are expected to pass values that fit in one byte.
 */
template<uint64_t basevalue_lo, uint64_t basevalue_hi>
struct Bits8const
{
    static const uint64_t static_value =
       ((basevalue_lo <<  0)
      | (basevalue_hi <<  8)
      | (basevalue_lo << 16)
      | (basevalue_hi << 24)
      | (basevalue_lo << 32)
      | (basevalue_hi << 40)
      | (basevalue_lo << 48)
      | (basevalue_hi << 56));
    static const uint64_t value = static_value;
};
178 template<int lowbitcount, int highbitcount, int leftshift>
179 struct MaskBconst
181 static const uint64_t basevalue_lo = (1 << lowbitcount) - 1;
182 static const uint64_t basevalue_hi = (1 << highbitcount) - 1;
183 static const uint64_t value = Bits8const<basevalue_lo,basevalue_hi>::value << leftshift;
186 template<int bits>
187 struct Convert_2byte_consts
189 static const uint64_t mask_lo;// = MaskBconst<bits,0, 0>::value;
190 static const uint64_t mask_hi;// = MaskBconst<bits,0, 8>::value;
191 static const uint64_t mask_frac;// = MaskBconst<8-bits,8-bits, 0>::value;
193 template<int bits>
194 const uint64_t Convert_2byte_consts<bits>::mask_lo = MaskBconst<bits, 0, 0>::value;
195 template<int bits>
196 const uint64_t Convert_2byte_consts<bits>::mask_hi = MaskBconst<bits, 0, 8>::value;
197 template<int bits>
198 const uint64_t Convert_2byte_consts<bits>::mask_frac = MaskBconst<8-bits, 8-bits, 0>::value;
200 template<int offs, int bits>
201 struct Convert_2byte_helper
203 c64 lo, hi;
205 Convert_2byte_helper(c64 p4a, c64 p4b)
207 const uint64_t& mask_lo = Convert_2byte_consts<bits>::mask_lo;
208 const uint64_t& mask_hi = Convert_2byte_consts<bits>::mask_hi;
209 const uint64_t& mask_frac = Convert_2byte_consts<bits>::mask_frac;
211 /* STEP 1: SEPARATE THE PIXELS INTO RED, GREEN AND BLUE COMPONENTS */
213 /* 000BBBBB 000bbbbb 000BBBBB 000bbbbb 000BBBBB 000bbbbb 000BBBBB 000bbbbb */
214 c64 s5 = ((p4a >> offs) & mask_lo) | ((p4b << (8-offs)) & mask_hi);
216 /* STEP 2: SCALE THE COLOR COMPONENTS TO 256 RANGE */
218 /* BBBBB000 bbbbb000 BBBBB000 bbbbb000 BBBBB000 bbbbb000 BBBBB000 bbbbb000 */
219 /* 00000BBB 00000bbb 00000BBB 00000bbb 00000BBB 00000bbb 00000BBB 00000bbb */
220 c64 v8 = (s5 << (8-bits)) | ((s5 >> (bits-(8-bits))) & mask_frac);
221 /* v8:
223 * BBBBBBBB bbbbbbbb BBBBBBBB bbbbbbbb BBBBBBBB bbbbbbbb BBBBBBBB bbbbbbbb *
226 /* STEP 3: DEINTERLACE THE PIXELS */
227 lo = (v8 ) & mask64l;
228 hi = (v8 >> 8) & mask64l;
/* Forward declaration only (marked noinline); no definition of this
 * template is visible in this part of the file. */
template<int roffs,int rbits, int goffs,int gbits, int boffs,int bbits>
static void Convert_2byte_to_24Common(const unsigned char* src, unsigned char* dest)
    __attribute((noinline));
237 template<int roffs,int rbits, int goffs,int gbits, int boffs,int bbits, bool rgb24>
238 static void Convert_2byte_to_24or32Common(const unsigned char* src, unsigned char* dest)
240 c64 p4a; p4a.Get(src+0); // four pixels
241 c64 p4b; p4b.Get(src+8); // another four pixels
243 /* in: In both registers: */
245 Convert_2byte_helper<roffs,rbits> r(p4a,p4b);
246 Convert_2byte_helper<boffs,bbits> b(p4a,p4b);
247 Convert_2byte_helper<goffs,gbits> g(p4a,p4b);
249 /* STEP 4: CONVERT PIXELS INTO RGB32 */
251 /* Now we have:
252 * b.lo = 0j0g0d0a
253 * g.lo = 0k0h0e0b
254 * r.lo = 0l0i0f0c
255 * b.hi = 0J0G0D0A
256 * g.hi = 0K0H0E0B
257 * r.hi = 0L0I0F0C
258 * We want:
259 * w1 = 0fed0cba
260 * w2 = 0lkj0ihg
261 * w3 = 0FED0CBA
262 * w4 = 0LKJ0IHG
265 #if 0 && defined(__MMX__) /* FIXME why is this 0&&? */
266 // punpcklbw 0k0h0e0b, 0j0g0d0a -> 00ed00ba
267 // punpcklwd 0l0i0f0c, ________ -> 0f__0c__
268 c64 w1 = r.lo.unpacklwd(0) | g.lo.unpacklbw(b.lo); // pix 0,1
269 // punpckhbw 0k0h0e0b, 0j0g0d0a -> 00kj00hg
270 // punpckhwd 0l0i0f0c, ________ -> 0l__0i__
271 c64 w2 = r.lo.unpackhwd(0) | g.lo.unpackhbw(b.lo); // pix 2,3
273 c64 w3 = r.hi.unpacklwd(0) | g.hi.unpacklbw(b.hi); // pix 4,5
274 c64 w4 = r.hi.unpackhwd(0) | g.hi.unpackhbw(b.hi); // pix 6,7
275 #ifndef USE_MMX
276 MMX_clear();
277 #endif
278 #else
279 /* With 64-bit registers, this code is greatly simpler than
280 * the emulation of unpack opcodes. However, when the
281 * unpack opcodes is available, using them is shorter.
282 * Which way is faster? FIXME: Find out
285 // mask64lw: 00**00**
286 // mask64hw: **00**00
287 // b.lo & mask64lw: 000g000a
288 // g.lo & mask64lw: 000h000b
289 // r.lo & mask64lw: 000i000c
290 // b.lo & mask64hw: 0j000d00
291 // g.lo & mask64hw: 0k000e00
292 // r.lo & mask64hw: 0l000f00
294 c64 tlo1 = ((b.lo & mask64lw) ) | ((g.lo & mask64lw) << 8) | ((r.lo & mask64lw) << 16);
295 c64 tlo2 = ((b.lo & mask64hw) >>16) | ((g.lo & mask64hw) >> 8) | ((r.lo & mask64hw) );
297 c64 thi1 = ((b.hi & mask64lw) ) | ((g.hi & mask64lw) << 8) | ((r.hi & mask64lw) << 16);
298 c64 thi2 = ((b.hi & mask64hw) >>16) | ((g.hi & mask64hw) >> 8) | ((r.hi & mask64hw) );
300 * tlo1 = 0ihg0cba
301 * tlo2 = 0lkj0fed
302 * thi1 = 0IHG0CBA
303 * thi2 = 0LKJ0FED
304 * mask64ld = 0000****
305 * mask64hd = ****0000
308 c64 w1 = (tlo1 & mask64ld) | ((tlo2 & mask64ld) << 32); // 00000cba | 00000fed = 0fed0bca
309 c64 w2 = (tlo2 & mask64hd) | ((tlo1 & mask64hd) >> 32); // 0lkj0000 | 0ihg0000 = 0lkj0ihg
311 c64 w3 = (thi1 & mask64ld) | ((thi2 & mask64ld) << 32);
312 c64 w4 = (thi2 & mask64hd) | ((thi1 & mask64hd) >> 32);
313 #endif
315 if(rgb24)
317 /* STEP 5A: CONVERT PIXELS INTO RGB24 */
318 Convert32To24_32bytes(w1,w2,w3,w4, dest);
320 else
322 /* STEP 5B: STORE RGB32 */
323 w1.Put(dest+0);
324 w2.Put(dest+8);
325 w3.Put(dest+16);
326 w4.Put(dest+24);
330 punpcklbw ____ABCD, ____abcd = AaBbCcDd
331 punpcklwd ____ABCD, ____abcd = ABabCDcd
332 punpckldq ____ABCD, ____abcd = ABCDabcd
334 punpckhbw ABCD____, abcd____ = AaBbCcDd
335 punpckhwd ABCD____, abcd____ = ABabCDcd
336 punpckhdq ABCD____, abcd____ = ABCDabcd
340 void Convert15To24Frame(const void* data, unsigned char* dest, unsigned npixels, bool swap_red_blue)
342 const unsigned char* src = (const unsigned char*)data;
344 if(swap_red_blue)
345 for(; npixels >= 8; src += 8*2, dest += 8*3, npixels -= 8)
346 Convert_2byte_to_24or32Common<0,5, 5,5, 10,5, true> (src, dest);
347 else
348 for(; npixels >= 8; src += 8*2, dest += 8*3, npixels -= 8)
349 Convert_2byte_to_24or32Common<10,5, 5,5, 0,5, true> (src, dest);
351 #ifdef USE_MMX
352 MMX_clear();
353 #endif
354 for(unsigned a=0; a<npixels; ++a)
356 unsigned short v = ((const unsigned short*)src)[a];
357 Unbuild15(&dest[a*3], v);
361 void Convert16To24Frame(const void* data, unsigned char* dest, unsigned npixels, bool swap_red_blue)
363 const unsigned char* src = (const unsigned char*)data;
365 if(swap_red_blue)
366 for(; npixels >= 8; src += 8*2, dest += 8*3, npixels -= 8)
367 Convert_2byte_to_24or32Common<0,5, 5,6, 11,5, true> (src, dest);
368 else
369 for(; npixels >= 8; src += 8*2, dest += 8*3, npixels -= 8)
370 Convert_2byte_to_24or32Common<11,5, 5,6, 0,5, true> (src, dest);
372 #ifdef USE_MMX
373 MMX_clear();
374 #endif
375 for(unsigned a=0; a<npixels; ++a)
377 unsigned short v = ((const unsigned short*)src)[a];
378 Unbuild16(&dest[a*3], v);
382 void Convert15To32Frame(const void* data, unsigned char* dest, unsigned npixels, bool swap_red_blue)
384 const unsigned char* src = (const unsigned char*)data;
386 if(swap_red_blue)
387 for(; npixels >= 8; src += 8*2, dest += 8*4, npixels -= 8)
388 Convert_2byte_to_24or32Common<0,5, 5,5, 10,5, false> (src, dest);
389 else
390 for(; npixels >= 8; src += 8*2, dest += 8*4, npixels -= 8)
391 Convert_2byte_to_24or32Common<10,5, 5,5, 0,5, false> (src, dest);
393 #ifdef USE_MMX
394 MMX_clear();
395 #endif
396 for(unsigned a=0; a<npixels; ++a)
398 unsigned short v = ((const unsigned short*)src)[a];
399 Unbuild15(&dest[a*4], v);
403 void Convert16To32Frame(const void* data, unsigned char* dest, unsigned npixels, bool swap_red_blue)
405 const unsigned char* src = (const unsigned char*)data;
407 if(swap_red_blue)
408 for(; npixels >= 8; src += 8*2, dest += 8*4, npixels -= 8)
409 Convert_2byte_to_24or32Common<0,5, 5,6, 11,5, false> (src, dest);
410 else
411 for(; npixels >= 8; src += 8*2, dest += 8*4, npixels -= 8)
412 Convert_2byte_to_24or32Common<11,5, 5,6, 0,5, false> (src, dest);
414 #ifdef USE_MMX
415 MMX_clear();
416 #endif
417 for(unsigned a=0; a<npixels; ++a)
419 unsigned short v = ((const unsigned short*)src)[a];
420 Unbuild16(&dest[a*4], v);
424 static inline unsigned Build16(unsigned x,unsigned y, const unsigned char* rgbdata)
426 unsigned o16 = (x + 4*y) % 16;
427 return (Quantize4x4<31>(o16, rgbdata[2]) << 0)
428 | (Quantize4x4<63>(o16, rgbdata[1]) << 5)
429 | (Quantize4x4<31>(o16, rgbdata[0]) << 11);
431 static inline unsigned Build15(unsigned x,unsigned y, const unsigned char* rgbdata)
433 unsigned o16 = (x + 4*y) % 16;
434 return (Quantize4x4<31>(o16, rgbdata[2]) << 0)
435 | (Quantize4x4<31>(o16, rgbdata[1]) << 5)
436 | (Quantize4x4<31>(o16, rgbdata[0]) << 10);
439 void Convert24To16Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
441 const unsigned char* logodata = (const unsigned char*) data;
442 unsigned short* result = (unsigned short*) dest;
443 unsigned x=0,y=0;
444 for(unsigned pos=0; pos<npixels; ++pos)
446 result[pos] = Build16(x,y, &logodata[pos*3]);
447 if(++x >= width) { x=0; ++y; }
451 void Convert24To15Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
453 const unsigned char* logodata = (const unsigned char*) data;
454 unsigned short* result = (unsigned short*) dest;
455 unsigned x=0,y=0;
456 for(unsigned pos=0; pos<npixels; ++pos)
458 result[pos] = Build15(x,y, &logodata[pos*3]);
459 if(++x >= width) { x=0; ++y; }
463 #ifdef __MMX__
464 static inline void Convert_I420_MMX_Common
465 (c64_MMX p0_1, c64_MMX p2_3,
466 unsigned char* dest_y0,
467 unsigned char* dest_y1,
468 unsigned char* dest_u,
469 unsigned char* dest_v)
471 c64_MMX p0 = c64_MMX(0).unpacklbw(p0_1); // expand to 64-bit (4*16)
472 c64_MMX p1 = c64_MMX(0).unpackhbw(p0_1);
473 c64_MMX p2 = c64_MMX(0).unpacklbw(p2_3);
474 c64_MMX p3 = c64_MMX(0).unpackhbw(p2_3);
476 c64_MMX ry_gy_by; ry_gy_by.Init16(RY,GY,BY, 0);
477 c64_MMX rgb_u; rgb_u.Init16(RU,GU,BU, 0);
478 c64_MMX rgb_v; rgb_v.Init16(RV,GV,BV, 0);
480 c64_MMX ctotal = p0.add16(
481 p2.add16(
482 p1.add16(
483 p3)));
485 p0 = _mm_madd_pi16(ry_gy_by.value, p0.value);
486 p1 = _mm_madd_pi16(ry_gy_by.value, p1.value);
487 p2 = _mm_madd_pi16(ry_gy_by.value, p2.value);
488 p3 = _mm_madd_pi16(ry_gy_by.value, p3.value);
490 c64_MMX yy;
491 yy.Init16( ((p0.Extract32<0>() + p0.Extract32<1>()) >> (RGB2YUV_SHIFT)),
492 ((p1.Extract32<0>() + p1.Extract32<1>()) >> (RGB2YUV_SHIFT)),
493 ((p2.Extract32<0>() + p2.Extract32<1>()) >> (RGB2YUV_SHIFT)),
494 ((p3.Extract32<0>() + p3.Extract32<1>()) >> (RGB2YUV_SHIFT)) );
495 yy = yy.add16( Bits16const<Y_ADD,Y_ADD>::value );
497 // Because we're writing to adjacent pixels, we optimize this by
498 // writing two 8-bit values at once in both cases.
499 *(short*)dest_y0 = yy.Extract88_from_1616lo();
500 *(short*)dest_y1 = yy.Extract88_from_1616hi();
502 c64_MMX u_total32 = _mm_madd_pi16(rgb_u.value, ctotal.value);
503 c64_MMX v_total32 = _mm_madd_pi16(rgb_v.value, ctotal.value);
505 *dest_u = U_ADD + ((u_total32.Extract32<0>() + u_total32.Extract32<1>()) >> (RGB2YUV_SHIFT+2));
506 *dest_v = V_ADD + ((v_total32.Extract32<0>() + v_total32.Extract32<1>()) >> (RGB2YUV_SHIFT+2));
509 static inline void Convert_YUY2_MMX_Common
510 (c64_MMX p0_1, c64_MMX p2_3,
511 unsigned char* dest_yvyu)
513 c64_MMX p0 = c64_MMX(0).unpacklbw(p0_1); // expand to 64-bit (4*16)
514 c64_MMX p1 = c64_MMX(0).unpackhbw(p0_1);
515 c64_MMX p2 = c64_MMX(0).unpacklbw(p2_3); // expand to 64-bit (4*16)
516 c64_MMX p3 = c64_MMX(0).unpackhbw(p2_3);
518 c64_MMX ry_gy_by; ry_gy_by.Init16(RY,GY,BY, 0);
519 c64_MMX rgb_u; rgb_u.Init16(RU,GU,BU, 0);
520 c64_MMX rgb_v; rgb_v.Init16(RV,GV,BV, 0);
522 c64_MMX ctotal0 = p0.add16(p1);
523 c64_MMX ctotal2 = p2.add16(p3);
525 p0 = _mm_madd_pi16(ry_gy_by.value, p0.value);
526 p1 = _mm_madd_pi16(ry_gy_by.value, p1.value);
527 p2 = _mm_madd_pi16(ry_gy_by.value, p2.value);
528 p3 = _mm_madd_pi16(ry_gy_by.value, p3.value);
530 c64_MMX yy;
531 yy.Init16( ((p0.Extract32<0>() + p0.Extract32<1>()) >> (RGB2YUV_SHIFT)),
532 ((p1.Extract32<0>() + p1.Extract32<1>()) >> (RGB2YUV_SHIFT)),
533 ((p2.Extract32<0>() + p2.Extract32<1>()) >> (RGB2YUV_SHIFT)),
534 ((p3.Extract32<0>() + p3.Extract32<1>()) >> (RGB2YUV_SHIFT)) );
536 yy = yy.add16( Bits16const<Y_ADD,Y_ADD>::value );
538 c64_MMX u_total32_0 = _mm_madd_pi16(rgb_u.value, ctotal0.value);
539 c64_MMX v_total32_0 = _mm_madd_pi16(rgb_v.value, ctotal0.value);
540 c64_MMX u_total32_2 = _mm_madd_pi16(rgb_u.value, ctotal2.value);
541 c64_MMX v_total32_2 = _mm_madd_pi16(rgb_v.value, ctotal2.value);
543 c64_MMX quadword = yy; // four y values: at 0, 2, 4 and 6
545 c64_MMX uv; uv.Init16(
546 ((v_total32_0.Extract32<0>() + v_total32_0.Extract32<1>()) >> (RGB2YUV_SHIFT+1)),
547 ((u_total32_0.Extract32<0>() + u_total32_0.Extract32<1>()) >> (RGB2YUV_SHIFT+1)),
548 ((v_total32_2.Extract32<0>() + v_total32_2.Extract32<1>()) >> (RGB2YUV_SHIFT+1)),
549 ((u_total32_2.Extract32<0>() + u_total32_2.Extract32<1>()) >> (RGB2YUV_SHIFT+1)) );
550 c64_MMX uv_adds; uv_adds.Init16(V_ADD, U_ADD, V_ADD, U_ADD);
551 uv = uv.add16(uv_adds);
553 quadword |= uv << 8; // two u and v values: at 1, 3, 5 and 7.
554 quadword.Put(dest_yvyu); // write four y values: at 0, 2, 4 and 6
556 #endif
558 /*template<int PixStride>
559 void Convert_4byte_To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
560 __attribute__((noinline));*/
562 template<int PixStride>
563 void Convert_4byte_To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
565 const unsigned char* src = (const unsigned char*) data;
566 unsigned height = npixels / width;
568 unsigned pos = 0;
569 unsigned ypos = 0;
570 unsigned vpos = npixels;
571 unsigned upos = vpos + npixels / 4;
572 unsigned stride = width*PixStride;
574 /*fprintf(stderr, "npixels=%u, width=%u, height=%u, ypos=%u,upos=%u,vpos=%u",
575 npixels,width,height, ypos,upos,vpos);*/
577 /* This function is based on code from x264 svn version 711 */
578 /* TODO: Apply MMX optimization for 24-bit pixels */
580 for(unsigned y=0; y<height; y += 2)
582 for(unsigned x=0; x<width; x += 2)
584 #ifdef __MMX__
585 if(PixStride == 4)
587 c64_MMX p0_1; p0_1.Get(&src[pos]); // two 32-bit pixels (4*8)
588 c64_MMX p2_3; p2_3.Get(&src[pos+stride]); // two 32-bit pixels
590 pos += PixStride*2;
592 Convert_I420_MMX_Common(p0_1, p2_3,
593 dest+ypos,
594 dest+ypos+width,
595 dest+upos++,
596 dest+vpos++);
598 else
599 #endif
601 int c[3], rgb[3][4];
603 /* luma */
604 for(int n=0; n<3; ++n) c[n] = rgb[n][0] = src[pos + n];
605 for(int n=0; n<3; ++n) c[n] += rgb[n][1] = src[pos + n + stride];
606 pos += PixStride;
608 for(int n=0; n<3; ++n) c[n] += rgb[n][2] = src[pos + n];
609 for(int n=0; n<3; ++n) c[n] += rgb[n][3] = src[pos + n + stride];
610 pos += PixStride;
612 unsigned destpos[4] = { ypos, ypos+width, ypos+1, ypos+width+1 };
613 for(int n=0; n<4; ++n)
615 dest[destpos[n]]
616 = Y_ADD + ((RY * rgb[0][n]
617 + GY * rgb[1][n]
618 + BY * rgb[2][n]
619 ) >> RGB2YUV_SHIFT); // y
622 dest[upos++] = (U_ADD + ((RU * c[0] + GU * c[1] + BU * c[2]) >> (RGB2YUV_SHIFT+2)) );
623 dest[vpos++] = (V_ADD + ((RV * c[0] + GV * c[1] + BV * c[2]) >> (RGB2YUV_SHIFT+2)) );
626 ypos += 2;
628 pos += stride;
629 ypos += width;
632 /*fprintf(stderr, ",yr=%u,ur=%u,vr=%u\n",
633 ypos,upos,vpos);*/
635 #ifdef __MMX__
636 MMX_clear();
637 #endif
640 template<int PixStride>
641 void Convert_4byte_To_YUY2Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
643 const unsigned char* src = (const unsigned char*) data;
644 unsigned height = npixels / width;
645 unsigned pos = 0;
646 unsigned ypos = 0;
647 unsigned stride = width*PixStride;
649 /* This function is based on code from x264 svn version 711 */
650 /* TODO: Apply MMX optimization for 24-bit pixels */
652 for(unsigned y=0; y<height; ++y)
654 for(unsigned x=0; x<width; x += 2)
656 #ifdef __MMX__
657 if(PixStride == 4)
659 c64_MMX p0_1; p0_1.Get(&src[pos]); // two 32-bit pixels (4*8)
660 pos += PixStride*2;
662 c64_MMX p2_3; p2_3.Get(&src[pos]); // two 32-bit pixels (4*8)
663 pos += PixStride*2;
664 x += 2;
666 Convert_YUY2_MMX_Common(p0_1, p2_3,
667 dest+ypos);
669 ypos += 4;
671 else
672 #endif
674 int c[3], rgb[3][2];
676 /* luma */
677 for(int n=0; n<3; ++n) c[n] = rgb[n][0] = src[pos + n];
678 pos += PixStride;
680 for(int n=0; n<3; ++n) c[n] += rgb[n][1] = src[pos + n];
681 pos += PixStride;
683 for(int n=0; n<2; ++n)
685 dest[ypos + n*2]
686 = Y_ADD + ((RY * rgb[0][n]
687 + GY * rgb[1][n]
688 + BY * rgb[2][n]
689 ) >> RGB2YUV_SHIFT); // y
692 dest[ypos+3] = (U_ADD + ((RU * c[0] + GU * c[1] + BU * c[2]) >> (RGB2YUV_SHIFT+1)) );
693 dest[ypos+1] = (V_ADD + ((RV * c[0] + GV * c[1] + BV * c[2]) >> (RGB2YUV_SHIFT+1)) );
695 ypos += 4;
698 #ifdef __MMX__
699 MMX_clear();
700 #endif
703 /*template<int roffs,int rbits, int goffs,int gbits, int boffs,int bbits>
704 void Convert_2byte_To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
705 __attribute__((noinline));*/
707 template<int roffs,int rbits, int goffs,int gbits, int boffs,int bbits>
708 void Convert_2byte_To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
710 const unsigned PixStride = 2;
711 const unsigned char* src = (const unsigned char*) data;
712 unsigned height = npixels / width;
713 unsigned pos = 0;
714 unsigned ypos = 0;
715 unsigned vpos = npixels;
716 unsigned upos = vpos + npixels / 4;
717 unsigned stride = width*PixStride;
719 /* This function is based on code from x264 svn version 711 */
721 for(unsigned y=0; y<height; y += 2)
723 for(unsigned x=0; x<width; x += 8)
725 unsigned char Rgb2byteBuf[2][8][4];
727 /* Convert 8 pixels from two scanlines (16 in total)
728 * from RGB15 / RGB16 to RGB32
729 * (Not RGB32, because RGB32 conversion is faster)
731 Convert_2byte_to_24or32Common
732 <roffs,rbits, goffs,gbits, boffs,bbits, false>
733 (src+pos, Rgb2byteBuf[0][0]);
735 Convert_2byte_to_24or32Common
736 <roffs,rbits, goffs,gbits, boffs,bbits, false>
737 (src+pos+stride, Rgb2byteBuf[1][0]);
739 pos += 16;
741 for(int x8 = 0; x8 < 8; x8 += 2)
743 #ifdef _q_MMX__
744 c64_MMX p0_1; p0_1.Get(&Rgb2byteBuf[0][x8][0]); // two 32-bit pixels (4*8)
745 c64_MMX p2_3; p2_3.Get(&Rgb2byteBuf[1][x8][0]); // two 32-bit pixels
747 Convert_I420_MMX_Common(p0_1, p2_3,
748 dest+ypos,
749 dest+ypos+width,
750 dest+upos++,
751 dest+vpos++);
752 #else
753 int c[3];
754 /* TODO: Some faster means than using pointers */
755 unsigned char* rgb[4] =
757 Rgb2byteBuf[0][x8+0],
758 Rgb2byteBuf[0][x8+1],
759 Rgb2byteBuf[1][x8+0],
760 Rgb2byteBuf[1][x8+1]
763 for(int m=0; m<3; ++m) c[m] = 0;
764 for(int n=0; n<4; ++n)
765 for(int m=0; m<3; ++m)
766 c[m] += rgb[n][m];
768 unsigned destpos[4] = { ypos, ypos+1, ypos+width, ypos+width+1 };
769 for(int n=0; n<4; ++n)
771 dest[destpos[n]]
772 = Y_ADD + ((RY * rgb[n][0]
773 + GY * rgb[n][1]
774 + BY * rgb[n][2]
775 ) >> RGB2YUV_SHIFT); // y
778 /*c[0] /= 4; c[1] /= 4; c[2] /= 4;*/
779 // Note: +2 is because c[] contains 4 values
780 dest[upos++] = U_ADD + ((RU * c[0] + GU * c[1] + BU * c[2]) >> (RGB2YUV_SHIFT+2));
781 dest[vpos++] = V_ADD + ((RV * c[0] + GV * c[1] + BV * c[2]) >> (RGB2YUV_SHIFT+2));
782 #endif
783 ypos += 2;
786 pos += stride;
787 ypos += width;
790 #ifdef __MMX__
791 MMX_clear();
792 #endif
795 template<int roffs,int rbits, int goffs,int gbits, int boffs,int bbits>
796 void Convert_2byte_To_YUY2Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
798 const unsigned PixStride = 2;
799 const unsigned char* src = (const unsigned char*) data;
800 unsigned height = npixels / width;
801 unsigned pos = 0;
802 unsigned ypos = 0;
803 unsigned stride = width*PixStride;
805 for(unsigned y=0; y<height; ++y)
807 for(unsigned x=0; x<width; x += 8)
809 unsigned char Rgb2byteBuf[8][4];
811 /* Convert 8 pixels from a scanline
812 * from RGB15 / RGB16 to RGB32
813 * (Not RGB32, because RGB32 conversion is faster)
815 Convert_2byte_to_24or32Common
816 <roffs,rbits, goffs,gbits, boffs,bbits, false>
817 (src+pos, Rgb2byteBuf[0]);
819 pos += 16;
821 for(int x8 = 0; x8 < 8; )
823 #ifdef __MMX__
824 c64_MMX p0_1; p0_1.Get(&Rgb2byteBuf[x8 ][0]); // two 32-bit pixels (4*8)
825 c64_MMX p2_3; p2_3.Get(&Rgb2byteBuf[x8+2][0]); // two 32-bit pixels (4*8)
826 Convert_YUY2_MMX_Common(p0_1, p2_3, dest+ypos);
827 x8 += 4;
828 ypos += 8;
829 #else
830 int c[3];
831 /* TODO: Some faster means than using pointers */
832 unsigned char* rgb[2] =
834 Rgb2byteBuf[x8+0],
835 Rgb2byteBuf[x8+1],
838 for(int m=0; m<3; ++m) c[m] = 0;
839 for(int n=0; n<2; ++n)
840 for(int m=0; m<3; ++m)
841 c[m] += rgb[n][m];
843 for(int n=0; n<2; ++n)
845 dest[ypos + n*2]
846 = Y_ADD + ((RY * rgb[n][0]
847 + GY * rgb[n][1]
848 + BY * rgb[n][2]
849 ) >> RGB2YUV_SHIFT); // y
852 /*c[0] /= 4; c[1] /= 4; c[2] /= 4;*/
853 // Note: +2 is because c[] contains 4 values
854 dest[ypos+3] = U_ADD + ((RU * c[0] + GU * c[1] + BU * c[2]) >> (RGB2YUV_SHIFT+1));
855 dest[ypos+1] = V_ADD + ((RV * c[0] + GV * c[1] + BV * c[2]) >> (RGB2YUV_SHIFT+1));
856 x8 += 2;
857 ypos += 4;
858 #endif
863 #ifdef __MMX__
864 MMX_clear();
865 #endif
869 /***/
871 void Convert_I420To24Frame(const void* data, unsigned char* dest,
872 unsigned npixels, unsigned width, bool swap_red_blue)
874 const unsigned char* src = (const unsigned char*) data;
875 unsigned height = npixels / width;
876 unsigned pos = 0;
877 unsigned ypos = 0;
878 unsigned vpos = npixels;
879 unsigned upos = vpos + npixels / 4;
881 /*fprintf(stderr, "npixels=%u, width=%u, height=%u, ypos=%u,upos=%u,vpos=%u\n",
882 npixels,width,height, ypos,upos,vpos);*/
884 #ifdef __MMX__
885 c64_MMX rgb[4], yy[4];
886 static const c64_MMX vmul/*; vmul.Init16*/(VR, VG, 0, 0); // R,G,B,0 * vmul = V
887 static const c64_MMX umul/*; umul.Init16*/(0, UG, UB, 0); // R,G,B,0 * umul = U
888 #endif
891 Y input: 16..235
892 U input: 16..240
893 V input: 16..240
897 #pragma omp parallel for
898 for(unsigned y=0; y<height; y += 2)
900 for(unsigned x=0; x<width; )
902 #ifdef __MMX__
903 rgb[0]=rgb[1]=rgb[2]=rgb[3]=yy[0]=yy[1]=yy[2]=yy[3]=c64_MMX(mask64hd)|mask64ld;
904 /* Somehow, this line above fixes an error
905 * where U&V seem to be off by 4 pixels.
906 * Probably a GCC bug? */
908 /* Load 4 U and V values and subtract U_ADD and V_ADD from them. */
909 uint64_t tmp_u = *(uint32_t*)&src[upos];
910 uint64_t tmp_v = *(uint32_t*)&src[vpos];
911 c64_MMX uuq = c64_MMX(0)
912 .unpacklbw(tmp_u) // 8-bit to 16-bit
913 .sub16(Bits16const<U_ADD,U_ADD>::value)
914 .shl16(16 - YUV2RGB_SHIFT); // shift them so that *13bitconst results in upper 16 bits having the actual value
915 c64_MMX vvq = c64_MMX(0)
916 .unpacklbw(tmp_v)
917 .sub16(Bits16const<V_ADD,V_ADD>::value)
918 .shl16(16 - YUV2RGB_SHIFT); // shift them so that *13bitconst results in upper 16 bits having the actual value
920 const short* uu = (const short*)&uuq;
921 const short* vv = (const short*)&vvq;
923 /* c64_MMX rgb[4]; // four sets of 4*int16, each representing 1 rgb value */
924 for(int n=0; n<4; ++n)
926 /* vv is shifted by 3 bits, vmul is shifted by 13 bits
927 * 16 bits in total, so mul16hi gets the 16-bit downscaled part */
928 c64_MMX v; v.Init16(vv[n]);
929 c64_MMX u; u.Init16(uu[n]);
930 rgb[n] = v.mul16hi(vmul).add16(
931 u.mul16hi(umul) );
934 /* rgb[0] : U,V increment of RGB32 for x0,y0 - x1,y1
935 * rgb[1] : U,V increment of RGB32 for x2,y0 - x3,y1
936 * rgb[2] : U,V increment of RGB32 for x4,y0 - x5,y1
937 * rgb[3] : U,V increment of RGB32 for x6,y0 - x7,y1
940 unsigned yyoffs[4] = { ypos, ypos+1, ypos+width, ypos+width+1 };
941 /* c64_MMX yy[4]; // four sets of 4*int16, each representing four Y values */
942 for(int n=0; n<4; ++n)
944 c64_MMX luma; luma.Init16(
945 src[yyoffs[0]+n*2], /* n(0..3): x0y0,x2y0,x4y0,x6y0 */
946 src[yyoffs[1]+n*2], /* n(0..3): x1y0,x3y0,x5y0,x7y0 */
947 src[yyoffs[2]+n*2], /* n(0..3): x0y1,x2y1,x4y1,x6y1 */
948 src[yyoffs[3]+n*2] /* n(0..3): x1y1,x3y1,x5y1,x7y1 */
950 luma = luma.sub16(Bits16const<Y_ADD,Y_ADD>::value);
951 luma = luma.shl16(16 - YUV2RGB_SHIFT);
952 yy[n] = luma.mul16hi(Bits16const<Y_REV,Y_REV>::value);
954 const short* const yyval = (const short*) &yy[0].value;
956 values in order:
957 x0y0 x1y0 x0y1 x1y1
958 x2y0 x3y0 x2y1 x3y1
959 x4y0 x5y0 x4y1 x5y1
960 x6y0 x7y0 x6y1 x7y1
962 int tmppos = pos;
963 for(int ny = 0; ny < 4; ny += 2)
965 /* Note: We must use 16-bit pixels here instead of 8-bit,
966 * because the rgb+Y addition can overflow. conv_s16_u8()
967 * does the necessary clamping, which would not be done
968 * if the values were 8-bit.
970 // 8 pixels for one scanline, repeated twice
971 /* Note: C++ has no named constructors, so we
972 * use statement blocks here as substitutes.
974 c64_MMX r0
975 = rgb[0].add16( ({ c64_MMX tmp; tmp.Init16(yyval[ny+0]); tmp; }) )
976 .conv_s16_u8(
977 rgb[0].add16( ({ c64_MMX tmp; tmp.Init16(yyval[ny+1]); tmp; }) ));
978 c64_MMX r1
979 = rgb[1].add16( ({ c64_MMX tmp; tmp.Init16(yyval[ny+4]); tmp; }) )
980 .conv_s16_u8(
981 rgb[1].add16( ({ c64_MMX tmp; tmp.Init16(yyval[ny+5]); tmp; }) ));
982 c64_MMX r2
983 = rgb[2].add16( ({ c64_MMX tmp; tmp.Init16(yyval[ny+8]); tmp; }) )
984 .conv_s16_u8(
985 rgb[2].add16( ({ c64_MMX tmp; tmp.Init16(yyval[ny+9]); tmp; }) ));
986 c64_MMX r3
987 = rgb[3].add16( ({ c64_MMX tmp; tmp.Init16(yyval[ny+12]); tmp; }) )
988 .conv_s16_u8(
989 rgb[3].add16( ({ c64_MMX tmp; tmp.Init16(yyval[ny+13]); tmp; }) ));
991 Convert32To24_32bytes(r0,r1,r2,r3, &dest[tmppos]);
992 tmppos += width*3; // next line
994 upos += 4;
995 vpos += 4;
996 ypos += 8; // eight bytes for this line (and eight from next too)
997 pos += 8*3; // eight triplets generated on this line
998 x += 8; // eight yy values used on this line
999 #else /* non-MMX */
1000 int u = src[upos] - U_ADD;
1001 int v = src[vpos] - V_ADD;
1003 int rgb[3] =
1005 (VR * v ) >> (YUV2RGB_SHIFT),
1006 (VG * v + UG * u) >> (YUV2RGB_SHIFT),
1007 ( + UB * u) >> (YUV2RGB_SHIFT)
1010 unsigned incr[4] = {0,1,width,width+1};
1012 for(unsigned r=0; r<4; ++r)
1013 for(unsigned doffs=pos + incr[r]*3, yoffs=ypos + incr[r],
1014 yy = (Y_REV * (src[yoffs] - Y_ADD)) >> YUV2RGB_SHIFT,
1015 n=0; n<3; ++n)
1016 dest[doffs+n] = c64::clamp_u8(rgb[n] + (int)yy);
1018 upos += 1;
1019 vpos += 1;
1020 ypos += 2; // two bytes for this line (two from next line)
1021 pos += 2*3; // two triplets generated on this line
1022 x += 2; // two yy values used on this line
1023 #endif
1025 ypos += width;
1026 pos += 3*width;
1028 #ifdef __MMX__
1029 MMX_clear();
1030 #endif
1033 void Convert_YUY2To24Frame(const void* data, unsigned char* dest,
1034 unsigned npixels, unsigned width, bool swap_red_blue)
1036 const unsigned char* src = (const unsigned char*) data;
1037 unsigned height = npixels / width;
1038 unsigned pos = 0;
1039 unsigned ypos = 0;
1041 /* TODO: MMX optimization */
1044 Y input: 16..235
1045 U input: 16..240
1046 V input: 16..240
1049 #pragma omp parallel for
1050 for(unsigned y=0; y<height; ++y)
1052 for(unsigned x=0; x<width; x += 2)
1054 /* non-MMX */
1055 int u = src[ypos+1] - U_ADD;
1056 int v = src[ypos+3] - V_ADD;
1058 int rgb[3] =
1060 (VR * v ) >> (YUV2RGB_SHIFT),
1061 (VG * v + UG * u) >> (YUV2RGB_SHIFT),
1062 ( + UB * u) >> (YUV2RGB_SHIFT)
1065 for(unsigned r=0; r<2; ++r)
1066 for(unsigned doffs=pos + r*3, yoffs=ypos+r*2,
1067 yy = (Y_REV * (src[yoffs] - Y_ADD)) >> YUV2RGB_SHIFT,
1068 n=0; n<3; ++n)
1069 dest[doffs+n] = c64::clamp_u8(rgb[n] + (int)yy);
1071 ypos += 4; // four bytes for this line (y,u,y,v)
1072 pos += 2*3; // two triplets generated on this line
1073 x += 2; // two yy values used on this line
1078 /***/
1079 void Convert24To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
1081 Convert_4byte_To_I420Frame<3>(data,dest,npixels,width);
1083 void Convert32To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
1085 Convert_4byte_To_I420Frame<4>(data,dest,npixels,width);
1087 void Convert15To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
1089 Convert_2byte_To_I420Frame<10,5, 5,5, 0,5>(data,dest,npixels,width);
1091 void Convert16To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
1093 Convert_2byte_To_I420Frame<11,5, 5,6, 0,5>(data,dest,npixels,width);
1095 /***/
1096 void Convert24To_YUY2Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
1098 Convert_4byte_To_YUY2Frame<3>(data,dest,npixels,width);
1100 void Convert32To_YUY2Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
1102 Convert_4byte_To_YUY2Frame<4>(data,dest,npixels,width);
1104 void Convert15To_YUY2Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
1106 Convert_2byte_To_YUY2Frame<10,5, 5,5, 0,5>(data,dest,npixels,width);
1108 void Convert16To_YUY2Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
1110 Convert_2byte_To_YUY2Frame<11,5, 5,6, 0,5>(data,dest,npixels,width);