2 #include <stdlib.h> // for size_t
6 /* RGB to RGB and RGB from/to I420 conversions written by Bisqwit
7 * Copyright (C) 1992,2008 Joel Yliluoma (http://iki.fi/bisqwit/)
10 typedef uint_least64_t uint64_t;
12 #include "quantize.hh"
13 #include "rgbtorgb.hh"
16 /* For BPP conversions */
/* 64-bit lane masks used by the pixel-format converters in this file.
 * All are 8-byte aligned so they can be used directly as 64-bit operands.
 *
 * mask24*  : byte-group selectors used when packing 32-bit pixels into
 *            24-bit pixels (see Convert32To24_32bytes).
 * mask64*  : alternating byte / 16-bit word / 32-bit dword selectors.
 */

/* Low 24 bits (bytes 0..2) and the next 24 bits (bytes 3..5). */
static const uint64_t mask24l    __attribute__((aligned(8))) = 0x0000000000FFFFFFULL;
static const uint64_t mask24h    __attribute__((aligned(8))) = 0x0000FFFFFF000000ULL;

/* Everything above the low 48, 32 and 16 bits respectively. */
static const uint64_t mask24hh   __attribute__((aligned(8))) = 0xffff000000000000ULL;
static const uint64_t mask24hhh  __attribute__((aligned(8))) = 0xffffffff00000000ULL;
static const uint64_t mask24hhhh __attribute__((aligned(8))) = 0xffffffffffff0000ULL;

/* High / low byte of every 16-bit word. */
static const uint64_t mask64h    __attribute__((aligned(8))) = 0xFF00FF00FF00FF00ULL;
static const uint64_t mask64l    __attribute__((aligned(8))) = 0x00FF00FF00FF00FFULL;

/* High / low 16-bit word of every 32-bit dword. */
static const uint64_t mask64hw   __attribute__((aligned(8))) = 0xFFFF0000FFFF0000ULL;
static const uint64_t mask64lw   __attribute__((aligned(8))) = 0x0000FFFF0000FFFFULL;

/* High / low 32-bit dword of the 64-bit value. */
static const uint64_t mask64hd   __attribute__((aligned(8))) = 0xFFFFFFFF00000000ULL;
static const uint64_t mask64ld   __attribute__((aligned(8))) = 0x00000000FFFFFFFFULL;
/* Fixed-point coefficients for the RGB -> YUV direction.
 *
 * Each coefficient is  round(k / 256 * 2^RGB2YUV_SHIFT)  for the
 * floating-point factor k shown in the trailing comment; results are
 * shifted right by RGB2YUV_SHIFT after the multiply-accumulate and the
 * *_ADD offset is added afterwards.
 */
static const int RGB2YUV_SHIFT = 15; /* highest value where [RGB][YUV] fit in signed short */

/* Red contributions */
static const int RY =   8414; // ((int)(( 65.738/256.0)*(1<<RGB2YUV_SHIFT)+0.5));
static const int RV =  14392; // ((int)((112.439/256.0)*(1<<RGB2YUV_SHIFT)+0.5));
static const int RU =  -4856; // ((int)((-37.945/256.0)*(1<<RGB2YUV_SHIFT)+0.5));

/* Green contributions */
static const int GY =  16519; // ((int)((129.057/256.0)*(1<<RGB2YUV_SHIFT)+0.5));
static const int GV = -12051; // ((int)((-94.154/256.0)*(1<<RGB2YUV_SHIFT)+0.5));
static const int GU =  -9534; // ((int)((-74.494/256.0)*(1<<RGB2YUV_SHIFT)+0.5));

/* Blue contributions */
static const int BY =   3208; // ((int)(( 25.064/256.0)*(1<<RGB2YUV_SHIFT)+0.5));
static const int BV =  -2339; // ((int)((-18.285/256.0)*(1<<RGB2YUV_SHIFT)+0.5));
static const int BU =  14392; // ((int)((112.439/256.0)*(1<<RGB2YUV_SHIFT)+0.5));

/* Offsets added to the scaled Y, U and V sums. */
static const int Y_ADD =  16;
static const int U_ADD = 128;
static const int V_ADD = 128;
/* Fixed-point coefficients for the YUV -> RGB direction.
 *
 * Each coefficient is  round(k * 2^YUV2RGB_SHIFT)  for the factor k shown
 * in the trailing comment; products are shifted right by YUV2RGB_SHIFT.
 */
static const int YUV2RGB_SHIFT = 13; /* highest value where UB still fits in signed short */

/* Luma scale */
static const int Y_REV =  9539; // ((int)( (  255 / 219.0 )     * (1<<YUV2RGB_SHIFT)+0.5));

/* V contributions to red and green */
static const int VR    = 14688; // ((int)( ( 117504 / 65536.0 ) * (1<<YUV2RGB_SHIFT)+0.5));
static const int VG    = -6659; // ((int)( ( -53279 / 65536.0 ) * (1<<YUV2RGB_SHIFT)+0.5));

/* U contributions to green and blue */
static const int UG    = -3208; // ((int)( ( -25675 / 65536.0 ) * (1<<YUV2RGB_SHIFT)+0.5));
static const int UB    = 16525; // ((int)( ( 132201 / 65536.0 ) * (1<<YUV2RGB_SHIFT)+0.5));
63 template<typename c64
>
64 static inline void Convert32To24_32bytes(c64 w0
, c64 w1
, c64 w2
, c64 w3
, unsigned char* dest
)
66 c64 r0
= (w0
& mask24l
) | ((w0
>> 8) & mask24h
); /* bbbaaa */
67 c64 r1
= (w1
& mask24l
) | ((w1
>> 8) & mask24h
); /* dddccc */
68 c64 r2
= (w2
& mask24l
) | ((w2
>> 8) & mask24h
); /* fffeee */
69 c64 r3
= (w3
& mask24l
) | ((w3
>> 8) & mask24h
); /* hhhggg */
72 ((r0
) | ((r1
<< 48) & mask24hh
)).Put(dest
+0);
74 ((r1
>> 16) | ((r2
<< 32) & mask24hhh
)).Put(dest
+8);
76 ((r2
>> 32) | ((r3
<< 16) & mask24hhhh
)).Put(dest
+16);
79 #if defined(__x86_64) || defined(USE_MMX)
80 static void Convert32To24_32bytes(const unsigned char* src
,
83 c64 w0
; w0
.Get(src
+0);
84 c64 w1
; w1
.Get(src
+8);
85 c64 w2
; w2
.Get(src
+16);
86 c64 w3
; w3
.Get(src
+24);
87 Convert32To24_32bytes(w0
,w1
,w2
,w3
, dest
);
91 void Convert32To24Frame(const void* data
, unsigned char* dest
, unsigned npixels
)
93 const unsigned char* src
= (const unsigned char*)data
;
95 #if defined(__x86_64) || defined(USE_MMX)
98 Convert32To24_32bytes(src
, dest
);
108 for(unsigned pos
=0; pos
<npixels
; ++pos
)
110 dest
[3*pos
+0] = src
[4*pos
+0];
111 dest
[3*pos
+1] = src
[4*pos
+1];
112 dest
[3*pos
+2] = src
[4*pos
+2];
116 static void Unbuild16(unsigned char* target
, unsigned rgb16
)
118 unsigned B
= (rgb16
%32)*256/32;
119 unsigned G
= ((rgb16
/32)%64)*256/64;
120 unsigned R
= ((rgb16
/(32*64))%32)*256/32;
126 static void Unbuild15(unsigned char* target
, unsigned rgb16
)
128 unsigned B
= (rgb16
%32)*256/32;
129 unsigned G
= ((rgb16
/32)%32)*256/32;
130 unsigned R
= ((rgb16
/(32*32))%32)*256/32;
136 template<int basevalue_lo
, int basevalue_hi
>
139 static const uint64_t static_value
=
140 (( ((uint64_t)(unsigned short) basevalue_lo
) << 0)
141 | ( ((uint64_t)(unsigned short) basevalue_hi
) << 16)
142 | ( ((uint64_t)(unsigned short) basevalue_lo
) << 32)
143 | ( ((uint64_t)(unsigned short) basevalue_hi
) << 48));
144 static const uint64_t value
;
146 template<int basevalue_lo
, int basevalue_hi
>
147 const uint64_t Bits16const
<basevalue_lo
, basevalue_hi
>::value
=
148 Bits16const
<basevalue_lo
, basevalue_hi
>::static_value
;
150 template<int basevalue_lo
, int basevalue_hi
>
153 static const uint64_t static_value
=
154 (( ((uint64_t)(unsigned int) basevalue_lo
) << 0)
155 | ( ((uint64_t)(unsigned int) basevalue_hi
) << 32));
156 static const uint64_t value
= static_value
;
158 template<int basevalue_lo, int basevalue_hi>
159 const uint64_t Bits32const<basevalue_lo, basevalue_hi>::value =
160 Bits32const<basevalue_lo, basevalue_hi>::static_value;*/
162 template<uint64_t basevalue_lo
, uint64_t basevalue_hi
>
165 static const uint64_t static_value
=
167 | (basevalue_hi
<< 8)
168 | (basevalue_lo
<< 16)
169 | (basevalue_hi
<< 24)
170 | (basevalue_lo
<< 32)
171 | (basevalue_hi
<< 40)
172 | (basevalue_lo
<< 48)
173 | (basevalue_hi
<< 56));
174 static const uint64_t value
= static_value
;
178 template<int lowbitcount
, int highbitcount
, int leftshift
>
181 static const uint64_t basevalue_lo
= (1 << lowbitcount
) - 1;
182 static const uint64_t basevalue_hi
= (1 << highbitcount
) - 1;
183 static const uint64_t value
= Bits8const
<basevalue_lo
,basevalue_hi
>::value
<< leftshift
;
187 struct Convert_2byte_consts
189 static const uint64_t mask_lo
;// = MaskBconst<bits,0, 0>::value;
190 static const uint64_t mask_hi
;// = MaskBconst<bits,0, 8>::value;
191 static const uint64_t mask_frac
;// = MaskBconst<8-bits,8-bits, 0>::value;
194 const uint64_t Convert_2byte_consts
<bits
>::mask_lo
= MaskBconst
<bits
, 0, 0>::value
;
196 const uint64_t Convert_2byte_consts
<bits
>::mask_hi
= MaskBconst
<bits
, 0, 8>::value
;
198 const uint64_t Convert_2byte_consts
<bits
>::mask_frac
= MaskBconst
<8-bits
, 8-bits
, 0>::value
;
200 template<int offs
, int bits
>
201 struct Convert_2byte_helper
205 Convert_2byte_helper(c64 p4a
, c64 p4b
)
207 const uint64_t& mask_lo
= Convert_2byte_consts
<bits
>::mask_lo
;
208 const uint64_t& mask_hi
= Convert_2byte_consts
<bits
>::mask_hi
;
209 const uint64_t& mask_frac
= Convert_2byte_consts
<bits
>::mask_frac
;
211 /* STEP 1: SEPARATE THE PIXELS INTO RED, GREEN AND BLUE COMPONENTS */
213 /* 000BBBBB 000bbbbb 000BBBBB 000bbbbb 000BBBBB 000bbbbb 000BBBBB 000bbbbb */
214 c64 s5
= ((p4a
>> offs
) & mask_lo
) | ((p4b
<< (8-offs
)) & mask_hi
);
216 /* STEP 2: SCALE THE COLOR COMPONENTS TO 256 RANGE */
218 /* BBBBB000 bbbbb000 BBBBB000 bbbbb000 BBBBB000 bbbbb000 BBBBB000 bbbbb000 */
219 /* 00000BBB 00000bbb 00000BBB 00000bbb 00000BBB 00000bbb 00000BBB 00000bbb */
220 c64 v8
= (s5
<< (8-bits
)) | ((s5
>> (bits
-(8-bits
))) & mask_frac
);
223 * BBBBBBBB bbbbbbbb BBBBBBBB bbbbbbbb BBBBBBBB bbbbbbbb BBBBBBBB bbbbbbbb *
226 /* STEP 3: DEINTERLACE THE PIXELS */
227 lo
= (v8
) & mask64l
;
228 hi
= (v8
>> 8) & mask64l
;
233 template<int roffs,int rbits, int goffs,int gbits, int boffs,int bbits>
234 static void Convert_2byte_to_24Common(const unsigned char* src, unsigned char* dest)
235 __attribute((noinline));
237 template<int roffs
,int rbits
, int goffs
,int gbits
, int boffs
,int bbits
, bool rgb24
>
238 static void Convert_2byte_to_24or32Common(const unsigned char* src
, unsigned char* dest
)
240 c64 p4a
; p4a
.Get(src
+0); // four pixels
241 c64 p4b
; p4b
.Get(src
+8); // another four pixels
243 /* in: In both registers: */
245 Convert_2byte_helper
<roffs
,rbits
> r(p4a
,p4b
);
246 Convert_2byte_helper
<boffs
,bbits
> b(p4a
,p4b
);
247 Convert_2byte_helper
<goffs
,gbits
> g(p4a
,p4b
);
249 /* STEP 4: CONVERT PIXELS INTO RGB32 */
265 #if 0 && defined(__MMX__) /* FIXME why is this 0&&? */
266 // punpcklbw 0k0h0e0b, 0j0g0d0a -> 00ed00ba
267 // punpcklwd 0l0i0f0c, ________ -> 0f__0c__
268 c64 w1
= r
.lo
.unpacklwd(0) | g
.lo
.unpacklbw(b
.lo
); // pix 0,1
269 // punpckhbw 0k0h0e0b, 0j0g0d0a -> 00kj00hg
270 // punpckhwd 0l0i0f0c, ________ -> 0l__0i__
271 c64 w2
= r
.lo
.unpackhwd(0) | g
.lo
.unpackhbw(b
.lo
); // pix 2,3
273 c64 w3
= r
.hi
.unpacklwd(0) | g
.hi
.unpacklbw(b
.hi
); // pix 4,5
274 c64 w4
= r
.hi
.unpackhwd(0) | g
.hi
.unpackhbw(b
.hi
); // pix 6,7
279 /* With 64-bit registers, this code is greatly simpler than
280 * the emulation of unpack opcodes. However, when the
281 * unpack opcodes are available, using them is shorter.
282 * Which way is faster? FIXME: Find out
285 // mask64lw: 00**00**
286 // mask64hw: **00**00
287 // b.lo & mask64lw: 000g000a
288 // g.lo & mask64lw: 000h000b
289 // r.lo & mask64lw: 000i000c
290 // b.lo & mask64hw: 0j000d00
291 // g.lo & mask64hw: 0k000e00
292 // r.lo & mask64hw: 0l000f00
294 c64 tlo1
= ((b
.lo
& mask64lw
) ) | ((g
.lo
& mask64lw
) << 8) | ((r
.lo
& mask64lw
) << 16);
295 c64 tlo2
= ((b
.lo
& mask64hw
) >>16) | ((g
.lo
& mask64hw
) >> 8) | ((r
.lo
& mask64hw
) );
297 c64 thi1
= ((b
.hi
& mask64lw
) ) | ((g
.hi
& mask64lw
) << 8) | ((r
.hi
& mask64lw
) << 16);
298 c64 thi2
= ((b
.hi
& mask64hw
) >>16) | ((g
.hi
& mask64hw
) >> 8) | ((r
.hi
& mask64hw
) );
304 * mask64ld = 0000****
305 * mask64hd = ****0000
308 c64 w1
= (tlo1
& mask64ld
) | ((tlo2
& mask64ld
) << 32); // 00000cba | 00000fed = 0fed0bca
309 c64 w2
= (tlo2
& mask64hd
) | ((tlo1
& mask64hd
) >> 32); // 0lkj0000 | 0ihg0000 = 0lkj0ihg
311 c64 w3
= (thi1
& mask64ld
) | ((thi2
& mask64ld
) << 32);
312 c64 w4
= (thi2
& mask64hd
) | ((thi1
& mask64hd
) >> 32);
317 /* STEP 5A: CONVERT PIXELS INTO RGB24 */
318 Convert32To24_32bytes(w1
,w2
,w3
,w4
, dest
);
322 /* STEP 5B: STORE RGB32 */
330 punpcklbw ____ABCD, ____abcd = AaBbCcDd
331 punpcklwd ____ABCD, ____abcd = ABabCDcd
332 punpckldq ____ABCD, ____abcd = ABCDabcd
334 punpckhbw ABCD____, abcd____ = AaBbCcDd
335 punpckhwd ABCD____, abcd____ = ABabCDcd
336 punpckhdq ABCD____, abcd____ = ABCDabcd
340 void Convert15To24Frame(const void* data
, unsigned char* dest
, unsigned npixels
, bool swap_red_blue
)
342 const unsigned char* src
= (const unsigned char*)data
;
345 for(; npixels
>= 8; src
+= 8*2, dest
+= 8*3, npixels
-= 8)
346 Convert_2byte_to_24or32Common
<0,5, 5,5, 10,5, true> (src
, dest
);
348 for(; npixels
>= 8; src
+= 8*2, dest
+= 8*3, npixels
-= 8)
349 Convert_2byte_to_24or32Common
<10,5, 5,5, 0,5, true> (src
, dest
);
354 for(unsigned a
=0; a
<npixels
; ++a
)
356 unsigned short v
= ((const unsigned short*)src
)[a
];
357 Unbuild15(&dest
[a
*3], v
);
361 void Convert16To24Frame(const void* data
, unsigned char* dest
, unsigned npixels
, bool swap_red_blue
)
363 const unsigned char* src
= (const unsigned char*)data
;
366 for(; npixels
>= 8; src
+= 8*2, dest
+= 8*3, npixels
-= 8)
367 Convert_2byte_to_24or32Common
<0,5, 5,6, 11,5, true> (src
, dest
);
369 for(; npixels
>= 8; src
+= 8*2, dest
+= 8*3, npixels
-= 8)
370 Convert_2byte_to_24or32Common
<11,5, 5,6, 0,5, true> (src
, dest
);
375 for(unsigned a
=0; a
<npixels
; ++a
)
377 unsigned short v
= ((const unsigned short*)src
)[a
];
378 Unbuild16(&dest
[a
*3], v
);
382 void Convert15To32Frame(const void* data
, unsigned char* dest
, unsigned npixels
, bool swap_red_blue
)
384 const unsigned char* src
= (const unsigned char*)data
;
387 for(; npixels
>= 8; src
+= 8*2, dest
+= 8*4, npixels
-= 8)
388 Convert_2byte_to_24or32Common
<0,5, 5,5, 10,5, false> (src
, dest
);
390 for(; npixels
>= 8; src
+= 8*2, dest
+= 8*4, npixels
-= 8)
391 Convert_2byte_to_24or32Common
<10,5, 5,5, 0,5, false> (src
, dest
);
396 for(unsigned a
=0; a
<npixels
; ++a
)
398 unsigned short v
= ((const unsigned short*)src
)[a
];
399 Unbuild15(&dest
[a
*4], v
);
403 void Convert16To32Frame(const void* data
, unsigned char* dest
, unsigned npixels
, bool swap_red_blue
)
405 const unsigned char* src
= (const unsigned char*)data
;
408 for(; npixels
>= 8; src
+= 8*2, dest
+= 8*4, npixels
-= 8)
409 Convert_2byte_to_24or32Common
<0,5, 5,6, 11,5, false> (src
, dest
);
411 for(; npixels
>= 8; src
+= 8*2, dest
+= 8*4, npixels
-= 8)
412 Convert_2byte_to_24or32Common
<11,5, 5,6, 0,5, false> (src
, dest
);
417 for(unsigned a
=0; a
<npixels
; ++a
)
419 unsigned short v
= ((const unsigned short*)src
)[a
];
420 Unbuild16(&dest
[a
*4], v
);
424 static inline unsigned Build16(unsigned x
,unsigned y
, const unsigned char* rgbdata
)
426 unsigned o16
= (x
+ 4*y
) % 16;
427 return (Quantize4x4
<31>(o16
, rgbdata
[2]) << 0)
428 | (Quantize4x4
<63>(o16
, rgbdata
[1]) << 5)
429 | (Quantize4x4
<31>(o16
, rgbdata
[0]) << 11);
431 static inline unsigned Build15(unsigned x
,unsigned y
, const unsigned char* rgbdata
)
433 unsigned o16
= (x
+ 4*y
) % 16;
434 return (Quantize4x4
<31>(o16
, rgbdata
[2]) << 0)
435 | (Quantize4x4
<31>(o16
, rgbdata
[1]) << 5)
436 | (Quantize4x4
<31>(o16
, rgbdata
[0]) << 10);
439 void Convert24To16Frame(const void* data
, unsigned char* dest
, unsigned npixels
, unsigned width
)
441 const unsigned char* logodata
= (const unsigned char*) data
;
442 unsigned short* result
= (unsigned short*) dest
;
444 for(unsigned pos
=0; pos
<npixels
; ++pos
)
446 result
[pos
] = Build16(x
,y
, &logodata
[pos
*3]);
447 if(++x
>= width
) { x
=0; ++y
; }
451 void Convert24To15Frame(const void* data
, unsigned char* dest
, unsigned npixels
, unsigned width
)
453 const unsigned char* logodata
= (const unsigned char*) data
;
454 unsigned short* result
= (unsigned short*) dest
;
456 for(unsigned pos
=0; pos
<npixels
; ++pos
)
458 result
[pos
] = Build15(x
,y
, &logodata
[pos
*3]);
459 if(++x
>= width
) { x
=0; ++y
; }
464 static inline void Convert_I420_MMX_Common
465 (c64_MMX p0_1
, c64_MMX p2_3
,
466 unsigned char* dest_y0
,
467 unsigned char* dest_y1
,
468 unsigned char* dest_u
,
469 unsigned char* dest_v
)
471 c64_MMX p0
= c64_MMX(0).unpacklbw(p0_1
); // expand to 64-bit (4*16)
472 c64_MMX p1
= c64_MMX(0).unpackhbw(p0_1
);
473 c64_MMX p2
= c64_MMX(0).unpacklbw(p2_3
);
474 c64_MMX p3
= c64_MMX(0).unpackhbw(p2_3
);
476 c64_MMX ry_gy_by
; ry_gy_by
.Init16(RY
,GY
,BY
, 0);
477 c64_MMX rgb_u
; rgb_u
.Init16(RU
,GU
,BU
, 0);
478 c64_MMX rgb_v
; rgb_v
.Init16(RV
,GV
,BV
, 0);
480 c64_MMX ctotal
= p0
.add16(
485 p0
= _mm_madd_pi16(ry_gy_by
.value
, p0
.value
);
486 p1
= _mm_madd_pi16(ry_gy_by
.value
, p1
.value
);
487 p2
= _mm_madd_pi16(ry_gy_by
.value
, p2
.value
);
488 p3
= _mm_madd_pi16(ry_gy_by
.value
, p3
.value
);
491 yy
.Init16( ((p0
.Extract32
<0>() + p0
.Extract32
<1>()) >> (RGB2YUV_SHIFT
)),
492 ((p1
.Extract32
<0>() + p1
.Extract32
<1>()) >> (RGB2YUV_SHIFT
)),
493 ((p2
.Extract32
<0>() + p2
.Extract32
<1>()) >> (RGB2YUV_SHIFT
)),
494 ((p3
.Extract32
<0>() + p3
.Extract32
<1>()) >> (RGB2YUV_SHIFT
)) );
495 yy
= yy
.add16( Bits16const
<Y_ADD
,Y_ADD
>::value
);
497 // Because we're writing to adjacent pixels, we optimize this by
498 // writing two 8-bit values at once in both cases.
499 *(short*)dest_y0
= yy
.Extract88_from_1616lo();
500 *(short*)dest_y1
= yy
.Extract88_from_1616hi();
502 c64_MMX u_total32
= _mm_madd_pi16(rgb_u
.value
, ctotal
.value
);
503 c64_MMX v_total32
= _mm_madd_pi16(rgb_v
.value
, ctotal
.value
);
505 *dest_u
= U_ADD
+ ((u_total32
.Extract32
<0>() + u_total32
.Extract32
<1>()) >> (RGB2YUV_SHIFT
+2));
506 *dest_v
= V_ADD
+ ((v_total32
.Extract32
<0>() + v_total32
.Extract32
<1>()) >> (RGB2YUV_SHIFT
+2));
509 static inline void Convert_YUY2_MMX_Common
510 (c64_MMX p0_1
, c64_MMX p2_3
,
511 unsigned char* dest_yvyu
)
513 c64_MMX p0
= c64_MMX(0).unpacklbw(p0_1
); // expand to 64-bit (4*16)
514 c64_MMX p1
= c64_MMX(0).unpackhbw(p0_1
);
515 c64_MMX p2
= c64_MMX(0).unpacklbw(p2_3
); // expand to 64-bit (4*16)
516 c64_MMX p3
= c64_MMX(0).unpackhbw(p2_3
);
518 c64_MMX ry_gy_by
; ry_gy_by
.Init16(RY
,GY
,BY
, 0);
519 c64_MMX rgb_u
; rgb_u
.Init16(RU
,GU
,BU
, 0);
520 c64_MMX rgb_v
; rgb_v
.Init16(RV
,GV
,BV
, 0);
522 c64_MMX ctotal0
= p0
.add16(p1
);
523 c64_MMX ctotal2
= p2
.add16(p3
);
525 p0
= _mm_madd_pi16(ry_gy_by
.value
, p0
.value
);
526 p1
= _mm_madd_pi16(ry_gy_by
.value
, p1
.value
);
527 p2
= _mm_madd_pi16(ry_gy_by
.value
, p2
.value
);
528 p3
= _mm_madd_pi16(ry_gy_by
.value
, p3
.value
);
531 yy
.Init16( ((p0
.Extract32
<0>() + p0
.Extract32
<1>()) >> (RGB2YUV_SHIFT
)),
532 ((p1
.Extract32
<0>() + p1
.Extract32
<1>()) >> (RGB2YUV_SHIFT
)),
533 ((p2
.Extract32
<0>() + p2
.Extract32
<1>()) >> (RGB2YUV_SHIFT
)),
534 ((p3
.Extract32
<0>() + p3
.Extract32
<1>()) >> (RGB2YUV_SHIFT
)) );
536 yy
= yy
.add16( Bits16const
<Y_ADD
,Y_ADD
>::value
);
538 c64_MMX u_total32_0
= _mm_madd_pi16(rgb_u
.value
, ctotal0
.value
);
539 c64_MMX v_total32_0
= _mm_madd_pi16(rgb_v
.value
, ctotal0
.value
);
540 c64_MMX u_total32_2
= _mm_madd_pi16(rgb_u
.value
, ctotal2
.value
);
541 c64_MMX v_total32_2
= _mm_madd_pi16(rgb_v
.value
, ctotal2
.value
);
543 c64_MMX quadword
= yy
; // four y values: at 0, 2, 4 and 6
545 c64_MMX uv
; uv
.Init16(
546 ((v_total32_0
.Extract32
<0>() + v_total32_0
.Extract32
<1>()) >> (RGB2YUV_SHIFT
+1)),
547 ((u_total32_0
.Extract32
<0>() + u_total32_0
.Extract32
<1>()) >> (RGB2YUV_SHIFT
+1)),
548 ((v_total32_2
.Extract32
<0>() + v_total32_2
.Extract32
<1>()) >> (RGB2YUV_SHIFT
+1)),
549 ((u_total32_2
.Extract32
<0>() + u_total32_2
.Extract32
<1>()) >> (RGB2YUV_SHIFT
+1)) );
550 c64_MMX uv_adds
; uv_adds
.Init16(V_ADD
, U_ADD
, V_ADD
, U_ADD
);
551 uv
= uv
.add16(uv_adds
);
553 quadword
|= uv
<< 8; // two u and v values: at 1, 3, 5 and 7.
554 quadword
.Put(dest_yvyu
); // write four y values: at 0, 2, 4 and 6
558 /*template<int PixStride>
559 void Convert_4byte_To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
560 __attribute__((noinline));*/
562 template<int PixStride
>
563 void Convert_4byte_To_I420Frame(const void* data
, unsigned char* dest
, unsigned npixels
, unsigned width
)
565 const unsigned char* src
= (const unsigned char*) data
;
566 unsigned height
= npixels
/ width
;
570 unsigned vpos
= npixels
;
571 unsigned upos
= vpos
+ npixels
/ 4;
572 unsigned stride
= width
*PixStride
;
574 /*fprintf(stderr, "npixels=%u, width=%u, height=%u, ypos=%u,upos=%u,vpos=%u",
575 npixels,width,height, ypos,upos,vpos);*/
577 /* This function is based on code from x264 svn version 711 */
578 /* TODO: Apply MMX optimization for 24-bit pixels */
580 for(unsigned y
=0; y
<height
; y
+= 2)
582 for(unsigned x
=0; x
<width
; x
+= 2)
587 c64_MMX p0_1
; p0_1
.Get(&src
[pos
]); // two 32-bit pixels (4*8)
588 c64_MMX p2_3
; p2_3
.Get(&src
[pos
+stride
]); // two 32-bit pixels
592 Convert_I420_MMX_Common(p0_1
, p2_3
,
604 for(int n
=0; n
<3; ++n
) c
[n
] = rgb
[n
][0] = src
[pos
+ n
];
605 for(int n
=0; n
<3; ++n
) c
[n
] += rgb
[n
][1] = src
[pos
+ n
+ stride
];
608 for(int n
=0; n
<3; ++n
) c
[n
] += rgb
[n
][2] = src
[pos
+ n
];
609 for(int n
=0; n
<3; ++n
) c
[n
] += rgb
[n
][3] = src
[pos
+ n
+ stride
];
612 unsigned destpos
[4] = { ypos
, ypos
+width
, ypos
+1, ypos
+width
+1 };
613 for(int n
=0; n
<4; ++n
)
616 = Y_ADD
+ ((RY
* rgb
[0][n
]
619 ) >> RGB2YUV_SHIFT
); // y
622 dest
[upos
++] = (U_ADD
+ ((RU
* c
[0] + GU
* c
[1] + BU
* c
[2]) >> (RGB2YUV_SHIFT
+2)) );
623 dest
[vpos
++] = (V_ADD
+ ((RV
* c
[0] + GV
* c
[1] + BV
* c
[2]) >> (RGB2YUV_SHIFT
+2)) );
632 /*fprintf(stderr, ",yr=%u,ur=%u,vr=%u\n",
640 template<int PixStride
>
641 void Convert_4byte_To_YUY2Frame(const void* data
, unsigned char* dest
, unsigned npixels
, unsigned width
)
643 const unsigned char* src
= (const unsigned char*) data
;
644 unsigned height
= npixels
/ width
;
647 unsigned stride
= width
*PixStride
;
649 /* This function is based on code from x264 svn version 711 */
650 /* TODO: Apply MMX optimization for 24-bit pixels */
652 for(unsigned y
=0; y
<height
; ++y
)
654 for(unsigned x
=0; x
<width
; x
+= 2)
659 c64_MMX p0_1
; p0_1
.Get(&src
[pos
]); // two 32-bit pixels (4*8)
662 c64_MMX p2_3
; p2_3
.Get(&src
[pos
]); // two 32-bit pixels (4*8)
666 Convert_YUY2_MMX_Common(p0_1
, p2_3
,
677 for(int n
=0; n
<3; ++n
) c
[n
] = rgb
[n
][0] = src
[pos
+ n
];
680 for(int n
=0; n
<3; ++n
) c
[n
] += rgb
[n
][1] = src
[pos
+ n
];
683 for(int n
=0; n
<2; ++n
)
686 = Y_ADD
+ ((RY
* rgb
[0][n
]
689 ) >> RGB2YUV_SHIFT
); // y
692 dest
[ypos
+3] = (U_ADD
+ ((RU
* c
[0] + GU
* c
[1] + BU
* c
[2]) >> (RGB2YUV_SHIFT
+1)) );
693 dest
[ypos
+1] = (V_ADD
+ ((RV
* c
[0] + GV
* c
[1] + BV
* c
[2]) >> (RGB2YUV_SHIFT
+1)) );
703 /*template<int roffs,int rbits, int goffs,int gbits, int boffs,int bbits>
704 void Convert_2byte_To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
705 __attribute__((noinline));*/
707 template<int roffs
,int rbits
, int goffs
,int gbits
, int boffs
,int bbits
>
708 void Convert_2byte_To_I420Frame(const void* data
, unsigned char* dest
, unsigned npixels
, unsigned width
)
710 const unsigned PixStride
= 2;
711 const unsigned char* src
= (const unsigned char*) data
;
712 unsigned height
= npixels
/ width
;
715 unsigned vpos
= npixels
;
716 unsigned upos
= vpos
+ npixels
/ 4;
717 unsigned stride
= width
*PixStride
;
719 /* This function is based on code from x264 svn version 711 */
721 for(unsigned y
=0; y
<height
; y
+= 2)
723 for(unsigned x
=0; x
<width
; x
+= 8)
725 unsigned char Rgb2byteBuf
[2][8][4];
727 /* Convert 8 pixels from two scanlines (16 in total)
728 * from RGB15 / RGB16 to RGB32
729 * (Not RGB24, because RGB32 conversion is faster)
731 Convert_2byte_to_24or32Common
732 <roffs
,rbits
, goffs
,gbits
, boffs
,bbits
, false>
733 (src
+pos
, Rgb2byteBuf
[0][0]);
735 Convert_2byte_to_24or32Common
736 <roffs
,rbits
, goffs
,gbits
, boffs
,bbits
, false>
737 (src
+pos
+stride
, Rgb2byteBuf
[1][0]);
741 for(int x8
= 0; x8
< 8; x8
+= 2)
744 c64_MMX p0_1
; p0_1
.Get(&Rgb2byteBuf
[0][x8
][0]); // two 32-bit pixels (4*8)
745 c64_MMX p2_3
; p2_3
.Get(&Rgb2byteBuf
[1][x8
][0]); // two 32-bit pixels
747 Convert_I420_MMX_Common(p0_1
, p2_3
,
754 /* TODO: Some faster means than using pointers */
755 unsigned char* rgb
[4] =
757 Rgb2byteBuf
[0][x8
+0],
758 Rgb2byteBuf
[0][x8
+1],
759 Rgb2byteBuf
[1][x8
+0],
763 for(int m
=0; m
<3; ++m
) c
[m
] = 0;
764 for(int n
=0; n
<4; ++n
)
765 for(int m
=0; m
<3; ++m
)
768 unsigned destpos
[4] = { ypos
, ypos
+1, ypos
+width
, ypos
+width
+1 };
769 for(int n
=0; n
<4; ++n
)
772 = Y_ADD
+ ((RY
* rgb
[n
][0]
775 ) >> RGB2YUV_SHIFT
); // y
778 /*c[0] /= 4; c[1] /= 4; c[2] /= 4;*/
779 // Note: +2 is because c[] contains 4 values
780 dest
[upos
++] = U_ADD
+ ((RU
* c
[0] + GU
* c
[1] + BU
* c
[2]) >> (RGB2YUV_SHIFT
+2));
781 dest
[vpos
++] = V_ADD
+ ((RV
* c
[0] + GV
* c
[1] + BV
* c
[2]) >> (RGB2YUV_SHIFT
+2));
795 template<int roffs
,int rbits
, int goffs
,int gbits
, int boffs
,int bbits
>
796 void Convert_2byte_To_YUY2Frame(const void* data
, unsigned char* dest
, unsigned npixels
, unsigned width
)
798 const unsigned PixStride
= 2;
799 const unsigned char* src
= (const unsigned char*) data
;
800 unsigned height
= npixels
/ width
;
803 unsigned stride
= width
*PixStride
;
805 for(unsigned y
=0; y
<height
; ++y
)
807 for(unsigned x
=0; x
<width
; x
+= 8)
809 unsigned char Rgb2byteBuf
[8][4];
811 /* Convert 8 pixels from a scanline
812 * from RGB15 / RGB16 to RGB32
813 * (Not RGB24, because RGB32 conversion is faster)
815 Convert_2byte_to_24or32Common
816 <roffs
,rbits
, goffs
,gbits
, boffs
,bbits
, false>
817 (src
+pos
, Rgb2byteBuf
[0]);
821 for(int x8
= 0; x8
< 8; )
824 c64_MMX p0_1
; p0_1
.Get(&Rgb2byteBuf
[x8
][0]); // two 32-bit pixels (4*8)
825 c64_MMX p2_3
; p2_3
.Get(&Rgb2byteBuf
[x8
+2][0]); // two 32-bit pixels (4*8)
826 Convert_YUY2_MMX_Common(p0_1
, p2_3
, dest
+ypos
);
831 /* TODO: Some faster means than using pointers */
832 unsigned char* rgb
[2] =
838 for(int m
=0; m
<3; ++m
) c
[m
] = 0;
839 for(int n
=0; n
<2; ++n
)
840 for(int m
=0; m
<3; ++m
)
843 for(int n
=0; n
<2; ++n
)
846 = Y_ADD
+ ((RY
* rgb
[n
][0]
849 ) >> RGB2YUV_SHIFT
); // y
852 /*c[0] /= 4; c[1] /= 4; c[2] /= 4;*/
853 // Note: +1 is because c[] contains 2 values
854 dest
[ypos
+3] = U_ADD
+ ((RU
* c
[0] + GU
* c
[1] + BU
* c
[2]) >> (RGB2YUV_SHIFT
+1));
855 dest
[ypos
+1] = V_ADD
+ ((RV
* c
[0] + GV
* c
[1] + BV
* c
[2]) >> (RGB2YUV_SHIFT
+1));
871 void Convert_I420To24Frame(const void* data
, unsigned char* dest
,
872 unsigned npixels
, unsigned width
, bool swap_red_blue
)
874 const unsigned char* src
= (const unsigned char*) data
;
875 unsigned height
= npixels
/ width
;
878 unsigned vpos
= npixels
;
879 unsigned upos
= vpos
+ npixels
/ 4;
881 /*fprintf(stderr, "npixels=%u, width=%u, height=%u, ypos=%u,upos=%u,vpos=%u\n",
882 npixels,width,height, ypos,upos,vpos);*/
885 c64_MMX rgb
[4], yy
[4];
886 static const c64_MMX vmul
/*; vmul.Init16*/(VR
, VG
, 0, 0); // R,G,B,0 * vmul = V
887 static const c64_MMX umul
/*; umul.Init16*/(0, UG
, UB
, 0); // R,G,B,0 * umul = U
897 #pragma omp parallel for
898 for(unsigned y
=0; y
<height
; y
+= 2)
900 for(unsigned x
=0; x
<width
; )
903 rgb
[0]=rgb
[1]=rgb
[2]=rgb
[3]=yy
[0]=yy
[1]=yy
[2]=yy
[3]=c64_MMX(mask64hd
)|mask64ld
;
904 /* Somehow, this line above fixes an error
905 * where U&V seem to be off by 4 pixels.
906 * Probably a GCC bug? */
908 /* Load 4 U and V values and subtract U_ADD and V_ADD from them. */
909 uint64_t tmp_u
= *(uint32_t*)&src
[upos
];
910 uint64_t tmp_v
= *(uint32_t*)&src
[vpos
];
911 c64_MMX uuq
= c64_MMX(0)
912 .unpacklbw(tmp_u
) // 8-bit to 16-bit
913 .sub16(Bits16const
<U_ADD
,U_ADD
>::value
)
914 .shl16(16 - YUV2RGB_SHIFT
); // shift them so that *13bitconst results in upper 16 bits having the actual value
915 c64_MMX vvq
= c64_MMX(0)
917 .sub16(Bits16const
<V_ADD
,V_ADD
>::value
)
918 .shl16(16 - YUV2RGB_SHIFT
); // shift them so that *13bitconst results in upper 16 bits having the actual value
920 const short* uu
= (const short*)&uuq
;
921 const short* vv
= (const short*)&vvq
;
923 /* c64_MMX rgb[4]; // four sets of 4*int16, each representing 1 rgb value */
924 for(int n
=0; n
<4; ++n
)
926 /* vv is shifted by 3 bits, vmul is shifted by 13 bits
927 * 16 bits in total, so mul16hi gets the 16-bit downscaled part */
928 c64_MMX v
; v
.Init16(vv
[n
]);
929 c64_MMX u
; u
.Init16(uu
[n
]);
930 rgb
[n
] = v
.mul16hi(vmul
).add16(
934 /* rgb[0] : U,V increment of RGB32 for x0,y0 - x1,y1
935 * rgb[1] : U,V increment of RGB32 for x2,y0 - x3,y1
936 * rgb[2] : U,V increment of RGB32 for x4,y0 - x5,y1
937 * rgb[3] : U,V increment of RGB32 for x6,y0 - x7,y1
940 unsigned yyoffs
[4] = { ypos
, ypos
+1, ypos
+width
, ypos
+width
+1 };
941 /* c64_MMX yy[4]; // four sets of 4*int16, each representing four Y values */
942 for(int n
=0; n
<4; ++n
)
944 c64_MMX luma
; luma
.Init16(
945 src
[yyoffs
[0]+n
*2], /* n(0..3): x0y0,x2y0,x4y0,x6y0 */
946 src
[yyoffs
[1]+n
*2], /* n(0..3): x1y0,x3y0,x5y0,x7y0 */
947 src
[yyoffs
[2]+n
*2], /* n(0..3): x0y1,x2y1,x4y1,x6y1 */
948 src
[yyoffs
[3]+n
*2] /* n(0..3): x1y1,x3y1,x5y1,x7y1 */
950 luma
= luma
.sub16(Bits16const
<Y_ADD
,Y_ADD
>::value
);
951 luma
= luma
.shl16(16 - YUV2RGB_SHIFT
);
952 yy
[n
] = luma
.mul16hi(Bits16const
<Y_REV
,Y_REV
>::value
);
954 const short* const yyval
= (const short*) &yy
[0].value
;
963 for(int ny
= 0; ny
< 4; ny
+= 2)
965 /* Note: We must use 16-bit pixels here instead of 8-bit,
966 * because the rgb+Y addition can overflow. conv_s16_u8()
967 * does the necessary clamping, which would not be done
968 * if the values were 8-bit.
970 // 8 pixels for one scanline, repeated twice
971 /* Note: C++ has no named constructors, so we
972 * use statement blocks here as substitutes.
975 = rgb
[0].add16( ({ c64_MMX tmp
; tmp
.Init16(yyval
[ny
+0]); tmp
; }) )
977 rgb
[0].add16( ({ c64_MMX tmp
; tmp
.Init16(yyval
[ny
+1]); tmp
; }) ));
979 = rgb
[1].add16( ({ c64_MMX tmp
; tmp
.Init16(yyval
[ny
+4]); tmp
; }) )
981 rgb
[1].add16( ({ c64_MMX tmp
; tmp
.Init16(yyval
[ny
+5]); tmp
; }) ));
983 = rgb
[2].add16( ({ c64_MMX tmp
; tmp
.Init16(yyval
[ny
+8]); tmp
; }) )
985 rgb
[2].add16( ({ c64_MMX tmp
; tmp
.Init16(yyval
[ny
+9]); tmp
; }) ));
987 = rgb
[3].add16( ({ c64_MMX tmp
; tmp
.Init16(yyval
[ny
+12]); tmp
; }) )
989 rgb
[3].add16( ({ c64_MMX tmp
; tmp
.Init16(yyval
[ny
+13]); tmp
; }) ));
991 Convert32To24_32bytes(r0
,r1
,r2
,r3
, &dest
[tmppos
]);
992 tmppos
+= width
*3; // next line
996 ypos
+= 8; // eight bytes for this line (and eight from next too)
997 pos
+= 8*3; // eight triplets generated on this line
998 x
+= 8; // eight yy values used on this line
1000 int u
= src
[upos
] - U_ADD
;
1001 int v
= src
[vpos
] - V_ADD
;
1005 (VR
* v
) >> (YUV2RGB_SHIFT
),
1006 (VG
* v
+ UG
* u
) >> (YUV2RGB_SHIFT
),
1007 ( + UB
* u
) >> (YUV2RGB_SHIFT
)
1010 unsigned incr
[4] = {0,1,width
,width
+1};
1012 for(unsigned r
=0; r
<4; ++r
)
1013 for(unsigned doffs
=pos
+ incr
[r
]*3, yoffs
=ypos
+ incr
[r
],
1014 yy
= (Y_REV
* (src
[yoffs
] - Y_ADD
)) >> YUV2RGB_SHIFT
,
1016 dest
[doffs
+n
] = c64::clamp_u8(rgb
[n
] + (int)yy
);
1020 ypos
+= 2; // two bytes for this line (two from next line)
1021 pos
+= 2*3; // two triplets generated on this line
1022 x
+= 2; // two yy values used on this line
1033 void Convert_YUY2To24Frame(const void* data
, unsigned char* dest
,
1034 unsigned npixels
, unsigned width
, bool swap_red_blue
)
1036 const unsigned char* src
= (const unsigned char*) data
;
1037 unsigned height
= npixels
/ width
;
1041 /* TODO: MMX optimization */
1049 #pragma omp parallel for
1050 for(unsigned y
=0; y
<height
; ++y
)
1052 for(unsigned x
=0; x
<width
; x
+= 2)
1055 int u
= src
[ypos
+1] - U_ADD
;
1056 int v
= src
[ypos
+3] - V_ADD
;
1060 (VR
* v
) >> (YUV2RGB_SHIFT
),
1061 (VG
* v
+ UG
* u
) >> (YUV2RGB_SHIFT
),
1062 ( + UB
* u
) >> (YUV2RGB_SHIFT
)
1065 for(unsigned r
=0; r
<2; ++r
)
1066 for(unsigned doffs
=pos
+ r
*3, yoffs
=ypos
+r
*2,
1067 yy
= (Y_REV
* (src
[yoffs
] - Y_ADD
)) >> YUV2RGB_SHIFT
,
1069 dest
[doffs
+n
] = c64::clamp_u8(rgb
[n
] + (int)yy
);
1071 ypos
+= 4; // four bytes for this line (y,u,y,v)
1072 pos
+= 2*3; // two triplets generated on this line
1073 x
+= 2; // two yy values used on this line
1079 void Convert24To_I420Frame(const void* data
, unsigned char* dest
, unsigned npixels
, unsigned width
)
1081 Convert_4byte_To_I420Frame
<3>(data
,dest
,npixels
,width
);
1083 void Convert32To_I420Frame(const void* data
, unsigned char* dest
, unsigned npixels
, unsigned width
)
1085 Convert_4byte_To_I420Frame
<4>(data
,dest
,npixels
,width
);
1087 void Convert15To_I420Frame(const void* data
, unsigned char* dest
, unsigned npixels
, unsigned width
)
1089 Convert_2byte_To_I420Frame
<10,5, 5,5, 0,5>(data
,dest
,npixels
,width
);
1091 void Convert16To_I420Frame(const void* data
, unsigned char* dest
, unsigned npixels
, unsigned width
)
1093 Convert_2byte_To_I420Frame
<11,5, 5,6, 0,5>(data
,dest
,npixels
,width
);
1096 void Convert24To_YUY2Frame(const void* data
, unsigned char* dest
, unsigned npixels
, unsigned width
)
1098 Convert_4byte_To_YUY2Frame
<3>(data
,dest
,npixels
,width
);
1100 void Convert32To_YUY2Frame(const void* data
, unsigned char* dest
, unsigned npixels
, unsigned width
)
1102 Convert_4byte_To_YUY2Frame
<4>(data
,dest
,npixels
,width
);
1104 void Convert15To_YUY2Frame(const void* data
, unsigned char* dest
, unsigned npixels
, unsigned width
)
1106 Convert_2byte_To_YUY2Frame
<10,5, 5,5, 0,5>(data
,dest
,npixels
,width
);
1108 void Convert16To_YUY2Frame(const void* data
, unsigned char* dest
, unsigned npixels
, unsigned width
)
1110 Convert_2byte_To_YUY2Frame
<11,5, 5,6, 0,5>(data
,dest
,npixels
,width
);