Add built-in VGA font
[jpcrr.git] / streamtools / simd.hh
blobb138e692524b43fde2fb2228108811c6fb057ce2
/* Build-time feature selection.
 * USE_MMX is defined when the compiler targets MMX and __x86_64 is not
 * defined; USE_SSE is defined whenever the compiler targets SSE. */
#ifdef __MMX__
# ifndef __x86_64
#  define USE_MMX
# endif
#endif
#ifdef __SSE__
# define USE_SSE
#endif
/* SIMD interface (MMX) written by Bisqwit
 * Copyright (C) 1992,2008 Joel Yliluoma (http://iki.fi/bisqwit/)
 */
12 #ifdef __3dNOW__
13 # include <mm3dnow.h> /* Note: not available on ICC */
14 #elif defined(__MMX__)
15 # include <mmintrin.h>
16 #endif
/* Scalar helpers shared by the 64-bit SIMD wrapper types:
 * saturating narrowing and byte/word spreading. */
struct c64_common
{
    /* Saturate v to the signed 8-bit range [-128, 127]. */
    static signed char clamp_s8(int_fast64_t v)
    {
        if(v < -128) return -128;
        if(v > 127) return 127;
        return (signed char)v;
    }

    /* Saturate v to the unsigned 8-bit range [0, 255]. */
    static unsigned char clamp_u8(int_fast64_t v)
    {
        if(v < 0) return 0;
        if(v > 255) return 255;
        return (unsigned char)v;
    }

    /* Saturate v to the signed 16-bit range [-32768, 32767]. */
    static short clamp_s16(int_fast64_t v)
    {
        if(v < -32768) return -32768;
        if(v > 32767) return 32767;
        return (short)v;
    }

    /* Spread the four low bytes of a over the even byte positions of a
     * 64-bit value; the odd byte positions are zero. Bits of a above
     * bit 31 are ignored. */
    static inline uint_fast64_t expand32_8(uint_fast32_t a)
    {
        // 0000abcd -> 0a0b0c0d
        typedef uint_fast64_t v;
        const v byte0 = a & 0xFFU;
        const v byte1 = a & 0xFF00U;
        const v byte2 = a & 0xFF0000U;
        const v byte3 = a & 0xFF000000UL;
        return byte0
            | (byte1 << 8)   // base:  8+8  = 16
            | (byte2 << 16)  // base: 16+16 = 32
            | (byte3 << 24); // base: 24+24 = 48
    }

    /* Spread the two low 16-bit words of a over the even word positions
     * of a 64-bit value; the odd word positions are zero. Bits of a
     * above bit 31 are ignored. */
    static inline uint_fast64_t expand32_16(uint_fast32_t a)
    {
        // 0000abcd -> 00ab00cd
        typedef uint_fast64_t v;
        const v word0 = a & 0xFFFFU;
        const v word1 = a & 0xFFFF0000UL;
        return word0
            | (word1 << 16); // base: 16+16 = 32
    }
};
45 #ifdef __MMX__
46 /* 64-bit integers that use MMX / 3Dnow operations where relevant */
47 struct c64_MMX: public c64_common
49 typedef c64_MMX c64;
51 __m64 value;
53 inline c64_MMX() { }
54 inline c64_MMX(__m64 v) : value(v) { }
55 inline c64_MMX(const uint64_t& v) : value( *(const __m64*)& v) { }
56 inline c64_MMX(int v) : value(_m_from_int(v)) { }
57 inline c64_MMX(short a,short b,short c, short d)
58 : value(_mm_setr_pi16(a,b,c,d)) { }
60 inline c64 operator<< (int b) const { if(b < 0) return *this >> -b; return shl64(b); }
61 inline c64 operator>> (int b) const { if(b < 0) return *this << -b; return shr64(b); }
62 c64& operator<<= (int n) { return *this = shl64(n); }
63 c64& operator>>= (int n) { return *this = shr64(n); }
65 c64 conv_s16_u8() const { return conv_s16_u8(*this); }
66 c64 conv_s16_s8() const { return conv_s16_s8(*this); }
68 void Get(const unsigned char* p) { value = *(const __m64*)p; }
69 void Put( unsigned char* p)const { *(__m64*)p = value; }
71 void Init16(short a,short b,short c, short d)
72 { value = _mm_setr_pi16(a,b,c,d); }
73 void Init16(short a)
74 { value = _mm_set1_pi16(a); }
76 void GetD(const unsigned char* p) { value = *(const __m64*)p; }
78 template<int n>
79 short Extract16() const { return ((const short*)&value)[n]; }
80 template<int n>
81 int Extract32() const { return ((const int*)&value)[n]; }
83 short Extract88_from_1616lo() const
85 const unsigned char* data = (const unsigned char*)&value;
86 // bytes: 76543210
87 // shorts: 33221100
88 // take: H L
89 return data[0] | *(short*)(data+1);
90 //return data[0] | ((*(const unsigned int*)data) >> 8);
92 short Extract88_from_1616hi() const
94 const unsigned char* data = 4+(const unsigned char*)&value;
95 // bytes: 76543210
96 // shorts: 33221100
97 // take: H L
98 return data[0] | *(short*)(data+1);
99 //return data[0] | ((*(const unsigned int*)data) >> 8);
103 c64& operator&= (const c64& b) { value=_mm_and_si64(value,b.value); return *this; }
104 c64& operator|= (const c64& b) { value=_mm_or_si64(value,b.value); return *this; }
105 c64& operator^= (const c64& b) { value=_mm_xor_si64(value,b.value); return *this; }
106 c64& operator+= (const c64& b) { return *this = *this + b; }
107 c64& operator-= (const c64& b) { return *this = *this - b; }
109 c64 operator~ () const {
110 static const uint_least64_t negpat = ~(uint_least64_t)0;
111 return c64(_mm_xor_si64(value, *(const __m64*)&negpat));
114 /* psllqi: p = packed
115 s = shift
116 r = right, l = left
117 l = shift in zero, a = shift in sign bit
118 q = 64-bit, d = 32-bit, w = 16-bit
119 [i = immed amount]
121 c64 operator& (const c64& b) const { return c64(_mm_and_si64(value,b.value)); }
122 c64 operator| (const c64& b) const { return c64(_mm_or_si64(value,b.value)); }
123 c64 operator^ (const c64& b) const { return c64(_mm_xor_si64(value,b.value)); }
125 c64 operator- (const c64& b) const
127 #ifdef __SSE2__
128 return _mm_sub_si64(value, b.value);
129 #else
130 return (const uint64_t&)value - (const uint64_t&)b.value;
131 #endif
133 c64 operator+ (const c64& b) const
135 #ifdef __SSE2__
136 return _mm_add_si64(value, b.value);
137 #else
138 return (const uint64_t&)value + (const uint64_t&)b.value;
139 #endif
143 c64 shl64(int b) const { return _mm_slli_si64(value, b); }
144 c64 shr64(int b) const { return _mm_srli_si64(value, b); }
145 c64 shl16(int b) const { return _mm_slli_pi16(value, b); }
146 c64 shr16(int b) const { return _mm_srli_pi16(value, b); }
147 c64 sar32(int b) const { return _mm_srai_pi32(value, b); }
148 c64 sar16(int b) const { return _mm_srai_pi16(value, b); }
149 c64 add32(const c64& b) const { return _mm_add_pi32(value, b.value); }
150 c64 add16(const c64& b) const { return _mm_add_pi16(value, b.value); }
151 c64 sub32(const c64& b) const { return _mm_sub_pi32(value, b.value); }
152 c64 sub16(const c64& b) const { return _mm_sub_pi16(value, b.value); }
153 c64 mul16(const c64& b) const { return _mm_mullo_pi16(value, b.value); }
154 c64 mul16hi(const c64& b) const { return _mm_mulhi_pi16(value, b.value); }
155 //c64 mul32(const c64& b) const { return _mm_mullo_pi32(value, b.value); }
156 c64 add8(const c64& b) const { return _mm_add_pi8(value, b.value); }
157 c64 sub8(const c64& b) const { return _mm_sub_pi8(value, b.value); }
159 c64 unpacklbw(const c64& b) const { return _mm_unpacklo_pi8(b.value,value); }
160 c64 unpacklwd(const c64& b) const { return _mm_unpacklo_pi16(b.value,value); }
161 c64 unpackhbw(const c64& b) const { return _mm_unpackhi_pi8(b.value,value); }
162 c64 unpackhwd(const c64& b) const { return _mm_unpackhi_pi16(b.value,value); }
163 c64 unpackldq(const c64& b) const { return _mm_unpacklo_pi32(b.value,value); }
164 c64 unpackldq() const { return _mm_unpacklo_pi32(value,value); }
166 c64 operator& (const uint64_t& v) { return c64(_mm_and_si64(value, *(const __m64*)& v)); }
168 c64 conv_s32_s16(const c64& b) const { return _mm_packs_pi32(value, b.value); }
169 c64 conv_s16_u8(const c64& b) const { return _mm_packs_pu16(value, b.value); }
170 c64 conv_s16_s8(const c64& b) const { return _mm_packs_pi16(value, b.value); }
172 #endif
174 struct c64_nonMMX: public c64_common
176 typedef c64_nonMMX c64;
178 uint_least64_t value;
180 inline c64_nonMMX() { }
181 inline c64_nonMMX(uint64_t v) : value(v) { }
182 inline c64_nonMMX(int v) : value(v) { }
183 inline c64_nonMMX(short a,short b,short c, short d)
184 { Init16(a,b,c,d); }
186 c64 operator<< (int b) const { if(b < 0) return *this >> -b; return shl64(b); }
187 c64 operator>> (int b) const { if(b < 0) return *this << -b; return shr64(b); }
188 c64& operator<<= (int n) { return *this = shl64(n); }
189 c64& operator>>= (int n) { return *this = shr64(n); }
191 c64 conv_s16_u8() const { return conv_s16_u8(*this); }
192 c64 conv_s16_s8() const { return conv_s16_s8(*this); }
194 void Init16(short a,short b,short c, short d)
195 { uint_fast64_t aa = (unsigned short)a,
196 bb = (unsigned short)b,
197 cc = (unsigned short)c,
198 dd = (unsigned short)d;
199 value = aa | (bb << 16) | (cc << 32) | (dd << 48); }
200 void Init16(short a)
201 { Init16(a,a,a,a); }
202 void Init8(unsigned char a,unsigned char b,unsigned char c,unsigned char d,
203 unsigned char e,unsigned char f,unsigned char g,unsigned char h)
205 value = ((uint_fast64_t)(a | (b << 8) | (c << 16) | (d << 24)))
206 | (((uint_fast64_t)e) << 32)
207 | (((uint_fast64_t)f) << 40)
208 | (((uint_fast64_t)g) << 48)
209 | (((uint_fast64_t)h) << 56);
212 void Get(const unsigned char* p) { value = *(const uint_least64_t*)p; }
213 void Put( unsigned char* p)const { *(uint_least64_t*)p = value; }
215 c64& operator&= (const c64& b) { value&=b.value; return *this; }
216 c64& operator|= (const c64& b) { value|=b.value; return *this; }
217 c64& operator^= (const c64& b) { value^=b.value; return *this; }
218 c64& operator+= (const c64& b) { value+=b.value; return *this; }
219 c64& operator-= (const c64& b) { value-=b.value; return *this; }
220 c64 operator& (const c64& b) const { return value & b.value; }
221 c64 operator| (const c64& b) const { return value | b.value; }
222 c64 operator^ (const c64& b) const { return value ^ b.value; }
223 c64 operator- (const c64& b) const { return value - b.value; }
224 c64 operator+ (const c64& b) const { return value + b.value; }
226 c64 operator& (uint_fast64_t b) const { return value & b; }
228 c64 operator~ () const { return ~value; }
230 #define usimdsim(type, count, op) \
231 type* p = (type*)&res.value; \
232 for(int n=0; n<count; ++n) p[n] = (p[n] op b)
234 #define simdsim(type, count, op) \
235 type* p = (type*)&res.value; \
236 const type* o = (const type*)&b.value; \
237 for(int n=0; n<count; ++n) p[n] = (p[n] op o[n])
239 c64 shl64(int b) const { return value << b; }
240 c64 shr64(int b) const { return value >> b; }
241 c64 shl16(int b) const { c64 res = *this; usimdsim(short, 2, <<); return res; }
242 c64 shr16(int b) const { c64 res = *this; usimdsim(unsigned short, 2, >>); return res; }
243 c64 sar32(int b) const { c64 res = *this; usimdsim(int, 2, >>); return res; }
244 c64 sar16(int b) const { c64 res = *this; usimdsim(short, 2, >>); return res; }
246 c64 add16(const c64& b) const { c64 res = *this; simdsim(short, 4, +); return res; }
247 c64 sub16(const c64& b) const { c64 res = *this; simdsim(short, 4, -); return res; }
248 c64 add32(const c64& b) const { c64 res = *this; simdsim(int, 2, +); return res; }
249 c64 sub32(const c64& b) const { c64 res = *this; simdsim(int, 2, -); return res; }
250 c64 mul16(const c64& b) const { c64 res = *this; simdsim(short, 4, *); return res; }
251 c64 mul16hi(const c64& b) const { c64 res = *this; simdsim(short, 4, *) >> 16; return res; }
252 c64 add8(const c64& b) const { c64 res = *this; simdsim(unsigned char, 8, +); return res; }
253 c64 sub8(const c64& b) const { c64 res = *this; simdsim(unsigned char, 8, -); return res; }
255 #undef simdsim
256 #undef usimdsim
258 c64 conv_s32_s16(const c64& b) const
260 c64 res; res.
261 Init16(clamp_s16(value & 0xFFFFFFFFU),
262 clamp_s16(value >> 32),
263 clamp_s16(b.value & 0xFFFFFFFFU),
264 clamp_s16(b.value >> 32));
265 return res;
267 c64 conv_s16_u8(const c64& b) const
269 c64 res; res.
270 Init8(clamp_u8(value & 0xFFFF),
271 clamp_u8((value >> 16) & 0xFFFF),
272 clamp_u8((value >> 32) & 0xFFFF),
273 clamp_u8((value >> 48) & 0xFFFF),
274 clamp_u8(b.value & 0xFFFF),
275 clamp_u8((b.value >> 16) & 0xFFFF),
276 clamp_u8((b.value >> 32) & 0xFFFF),
277 clamp_u8((b.value >> 48) & 0xFFFF));
278 return res;
280 c64 conv_s16_s8(const c64& b) const
282 c64 res; res.
283 Init8(clamp_s8(value & 0xFFFF),
284 clamp_s8((value >> 16) & 0xFFFF),
285 clamp_s8((value >> 32) & 0xFFFF),
286 clamp_s8((value >> 48) & 0xFFFF),
287 clamp_s8(b.value & 0xFFFF),
288 clamp_s8((b.value >> 16) & 0xFFFF),
289 clamp_s8((b.value >> 32) & 0xFFFF),
290 clamp_s8((b.value >> 48) & 0xFFFF));
291 return res;
294 /* TODO: Verify that these are correct (though they should never be used anyway) */
295 c64 unpacklbw(const c64& p) const
297 #if defined(__MMX__) && !defined(__ICC)
298 /* ICC says [error: type of cast must be integral or enum]
299 * on the return value cast,
300 * so we cannot use this code on ICC. Fine for GCC. */
301 return (uint_least64_t)_m_punpcklbw(*(__m64*)&p.value, *(__m64*)&value);
302 #else
303 uint_fast64_t a=value, b=p.value;
304 return expand32_8(a) | (expand32_8(b) << 8);
305 #endif
307 c64 unpackhbw(const c64& p) const
309 #if defined(__MMX__) && !defined(__ICC)
310 return (uint_least64_t)_m_punpckhbw(*(__m64*)&p.value, *(__m64*)&value);
311 #else
312 uint_fast64_t a=value, b=p.value;
313 return expand32_8(a>>32) | (expand32_8(b>>32) << 8);
314 #endif
316 c64 unpacklwd(const c64& p) const
318 #if defined(__MMX__) && !defined(__ICC)
319 return (uint_least64_t)_m_punpcklwd(*(__m64*)&p.value, *(__m64*)&value);
320 #else
321 uint_fast64_t a=value, b=p.value;
322 return expand32_16(a) | (expand32_16(b) << 16);
323 #endif
325 c64 unpackhwd(const c64& p) const
327 #if defined(__MMX__) && !defined(__ICC)
328 return (uint_least64_t)_m_punpckhwd(*(__m64*)&p.value, *(__m64*)&value);
329 #else
330 uint_fast64_t a=value, b=p.value;
331 return expand32_16(a>>32) | (expand32_16(b>>32) << 16);
332 #endif
334 c64 unpackldq() const { return unpackldq(*this); }
335 c64 unpackldq(const c64& p) const
337 #if defined(__MMX__) && !defined(__ICC)
338 return (uint_least64_t)_m_punpckldq(*(__m64*)&p.value, *(__m64*)&value);
339 #else
340 return value | (p.value << 32);
341 #endif
345 #ifdef USE_MMX
346 typedef c64_MMX c64;
347 #else
348 typedef c64_nonMMX c64;
349 #endif
/* Issue the end-of-MMX-section intrinsic: _m_femms() when compiling for
 * 3DNow!, otherwise _mm_empty() when compiling for MMX. Does nothing on
 * targets that have neither. */
static inline void MMX_clear()
{
    #ifdef __3dNOW__
    _m_femms(); /* Note: not available on ICC or Valgrind */
    //_mm_empty();
    #elif defined(__MMX__)
    _mm_empty();
    #endif
}