1 #if defined(__MMX__) && !defined(__x86_64)
8 /* SIMD interface (MMX) written by Bisqwit
9 * Copyright (C) 1992,2008 Joel Yliluoma (http://iki.fi/bisqwit/)
13 # include <mm3dnow.h> /* Note: not available on ICC */
14 #elif defined(__MMX__)
15 # include <mmintrin.h>
/* Saturate v to the signed 8-bit range [-128, 127]. */
static signed char clamp_s8(int_fast64_t v)
{
    if (v < -128) {
        return -128;
    }
    if (v > 127) {
        return 127;
    }
    return (signed char)v;
}
/* Saturate v to the unsigned 8-bit range [0, 255]. */
static unsigned char clamp_u8(int_fast64_t v)
{
    if (v < 0) {
        return 0;
    }
    if (v > 255) {
        return 255;
    }
    return (unsigned char)v;
}
/* Saturate v to the signed 16-bit range [-32768, 32767]. */
static short clamp_s16(int_fast64_t v)
{
    if (v < -32768) {
        return -32768;
    }
    if (v > 32767) {
        return 32767;
    }
    return (short)v;
}
/* Spread the four low bytes of a so that each one becomes the low byte of
 * its own 16-bit lane: 0000abcd -> 0a0b0c0d. */
static inline uint_fast64_t expand32_8(uint_fast32_t a)
{
    typedef uint_fast64_t v;
    const v wide = (v)a;

    const v lane0 = wide & 0xFFU;                   /* stays at bit 0   */
    const v lane1 = (wide & 0xFF00U) << 8;          /* base: 8+8 = 16   */
    const v lane2 = (wide & 0xFF0000U) << 16;       /* base: 16+16 = 32 */
    const v lane3 = (wide & 0xFF000000UL) << 24;    /* base: 24+24 = 48 */

    return lane0 | lane1 | lane2 | lane3;
}
/* Spread the two low 16-bit words of a so that each one becomes the low
 * word of its own 32-bit lane: 0000abcd -> 00ab00cd. */
static inline uint_fast64_t expand32_16(uint_fast32_t a)
{
    typedef uint_fast64_t v;
    const v wide = (v)a;

    const v lane0 = wide & 0xFFFFU;                 /* stays at bit 0   */
    const v lane1 = (wide & 0xFFFF0000UL) << 16;    /* base: 16+16 = 32 */

    return lane0 | lane1;
}
46 /* 64-bit integers that use MMX / 3Dnow operations where relevant */
47 struct c64_MMX
: public c64_common
54 inline c64_MMX(__m64 v
) : value(v
) { }
55 inline c64_MMX(const uint64_t& v
) : value( *(const __m64
*)& v
) { }
56 inline c64_MMX(int v
) : value(_m_from_int(v
)) { }
57 inline c64_MMX(short a
,short b
,short c
, short d
)
58 : value(_mm_setr_pi16(a
,b
,c
,d
)) { }
60 inline c64
operator<< (int b
) const { if(b
< 0) return *this >> -b
; return shl64(b
); }
61 inline c64
operator>> (int b
) const { if(b
< 0) return *this << -b
; return shr64(b
); }
62 c64
& operator<<= (int n
) { return *this = shl64(n
); }
63 c64
& operator>>= (int n
) { return *this = shr64(n
); }
65 c64
conv_s16_u8() const { return conv_s16_u8(*this); }
66 c64
conv_s16_s8() const { return conv_s16_s8(*this); }
68 void Get(const unsigned char* p
) { value
= *(const __m64
*)p
; }
69 void Put( unsigned char* p
)const { *(__m64
*)p
= value
; }
71 void Init16(short a
,short b
,short c
, short d
)
72 { value
= _mm_setr_pi16(a
,b
,c
,d
); }
74 { value
= _mm_set1_pi16(a
); }
76 void GetD(const unsigned char* p
) { value
= *(const __m64
*)p
; }
/* Extract16: returns element n of value viewed as an array of short.
 * Extract32: returns element n of value viewed as an array of int.
 * NOTE(review): n is not declared on any line visible here - presumably a
 * template parameter of each method; confirm against the full file. */
79 short Extract16() const { return ((const short*)&value
)[n
]; }
81 int Extract32() const { return ((const int*)&value
)[n
]; }
/* Combines byte 0 of value with a short read from byte offset 1 (which
 * covers bytes 1 and 2) by OR-ing them together.
 * NOTE(review): the short read is at an odd offset and goes through a cast
 * that drops const. Byte 1 is OR-ed into the same bits as byte 0, so the
 * result is a clean two-byte pack only if byte 1 is zero - confirm callers
 * guarantee that. */
83 short Extract88_from_1616lo() const
85 const unsigned char* data
= (const unsigned char*)&value
;
89 return data
[0] | *(short*)(data
+1);
90 //return data[0] | ((*(const unsigned int*)data) >> 8);
/* Same computation as Extract88_from_1616lo, but starting 4 bytes into
 * value: combines byte 4 with a short read from byte offset 5 (bytes 5
 * and 6) by OR-ing them together.
 * NOTE(review): the short read is at an odd offset and goes through a cast
 * that drops const. Byte 5 is OR-ed into the same bits as byte 4, so the
 * result is a clean two-byte pack only if byte 5 is zero - confirm callers
 * guarantee that. */
92 short Extract88_from_1616hi() const
94 const unsigned char* data
= 4+(const unsigned char*)&value
;
98 return data
[0] | *(short*)(data
+1);
99 //return data[0] | ((*(const unsigned int*)data) >> 8);
103 c64
& operator&= (const c64
& b
) { value
=_mm_and_si64(value
,b
.value
); return *this; }
104 c64
& operator|= (const c64
& b
) { value
=_mm_or_si64(value
,b
.value
); return *this; }
105 c64
& operator^= (const c64
& b
) { value
=_mm_xor_si64(value
,b
.value
); return *this; }
106 c64
& operator+= (const c64
& b
) { return *this = *this + b
; }
107 c64
& operator-= (const c64
& b
) { return *this = *this - b
; }
109 c64
operator~ () const {
110 static const uint_least64_t negpat
= ~(uint_least64_t)0;
111 return c64(_mm_xor_si64(value
, *(const __m64
*)&negpat
));
114 /* psllqi: p = packed
117 l = shift in zero, a = shift in sign bit
118 q = 64-bit, d = 32-bit, w = 16-bit
121 c64
operator& (const c64
& b
) const { return c64(_mm_and_si64(value
,b
.value
)); }
122 c64
operator| (const c64
& b
) const { return c64(_mm_or_si64(value
,b
.value
)); }
123 c64
operator^ (const c64
& b
) const { return c64(_mm_xor_si64(value
,b
.value
)); }
/* 64-bit subtraction of b from this value.
 * Two alternative return statements are visible: one uses _mm_sub_si64, the
 * other subtracts the two values reinterpreted as uint64_t references.
 * NOTE(review): presumably a preprocessor conditional (not visible here)
 * selects between them - confirm against the full file. */
125 c64
operator- (const c64
& b
) const
128 return _mm_sub_si64(value
, b
.value
);
130 return (const uint64_t&)value
- (const uint64_t&)b
.value
;
/* 64-bit addition of this value and b.
 * Two alternative return statements are visible: one uses _mm_add_si64, the
 * other adds the two values reinterpreted as uint64_t references.
 * NOTE(review): presumably a preprocessor conditional (not visible here)
 * selects between them - confirm against the full file. */
133 c64
operator+ (const c64
& b
) const
136 return _mm_add_si64(value
, b
.value
);
138 return (const uint64_t&)value
+ (const uint64_t&)b
.value
;
143 c64
shl64(int b
) const { return _mm_slli_si64(value
, b
); }
144 c64
shr64(int b
) const { return _mm_srli_si64(value
, b
); }
145 c64
shl16(int b
) const { return _mm_slli_pi16(value
, b
); }
146 c64
shr16(int b
) const { return _mm_srli_pi16(value
, b
); }
147 c64
sar32(int b
) const { return _mm_srai_pi32(value
, b
); }
148 c64
sar16(int b
) const { return _mm_srai_pi16(value
, b
); }
149 c64
add32(const c64
& b
) const { return _mm_add_pi32(value
, b
.value
); }
150 c64
add16(const c64
& b
) const { return _mm_add_pi16(value
, b
.value
); }
151 c64
sub32(const c64
& b
) const { return _mm_sub_pi32(value
, b
.value
); }
152 c64
sub16(const c64
& b
) const { return _mm_sub_pi16(value
, b
.value
); }
153 c64
mul16(const c64
& b
) const { return _mm_mullo_pi16(value
, b
.value
); }
154 c64
mul16hi(const c64
& b
) const { return _mm_mulhi_pi16(value
, b
.value
); }
155 //c64 mul32(const c64& b) const { return _mm_mullo_pi32(value, b.value); }
156 c64
add8(const c64
& b
) const { return _mm_add_pi8(value
, b
.value
); }
157 c64
sub8(const c64
& b
) const { return _mm_sub_pi8(value
, b
.value
); }
159 c64
unpacklbw(const c64
& b
) const { return _mm_unpacklo_pi8(b
.value
,value
); }
160 c64
unpacklwd(const c64
& b
) const { return _mm_unpacklo_pi16(b
.value
,value
); }
161 c64
unpackhbw(const c64
& b
) const { return _mm_unpackhi_pi8(b
.value
,value
); }
162 c64
unpackhwd(const c64
& b
) const { return _mm_unpackhi_pi16(b
.value
,value
); }
163 c64
unpackldq(const c64
& b
) const { return _mm_unpacklo_pi32(b
.value
,value
); }
164 c64
unpackldq() const { return _mm_unpacklo_pi32(value
,value
); }
166 c64
operator& (const uint64_t& v
) { return c64(_mm_and_si64(value
, *(const __m64
*)& v
)); }
168 c64
conv_s32_s16(const c64
& b
) const { return _mm_packs_pi32(value
, b
.value
); }
169 c64
conv_s16_u8(const c64
& b
) const { return _mm_packs_pu16(value
, b
.value
); }
170 c64
conv_s16_s8(const c64
& b
) const { return _mm_packs_pi16(value
, b
.value
); }
174 struct c64_nonMMX
: public c64_common
176 typedef c64_nonMMX c64
;
178 uint_least64_t value
;
180 inline c64_nonMMX() { }
181 inline c64_nonMMX(uint64_t v
) : value(v
) { }
182 inline c64_nonMMX(int v
) : value(v
) { }
183 inline c64_nonMMX(short a
,short b
,short c
, short d
)
186 c64
operator<< (int b
) const { if(b
< 0) return *this >> -b
; return shl64(b
); }
187 c64
operator>> (int b
) const { if(b
< 0) return *this << -b
; return shr64(b
); }
188 c64
& operator<<= (int n
) { return *this = shl64(n
); }
189 c64
& operator>>= (int n
) { return *this = shr64(n
); }
191 c64
conv_s16_u8() const { return conv_s16_u8(*this); }
192 c64
conv_s16_s8() const { return conv_s16_s8(*this); }
194 void Init16(short a
,short b
,short c
, short d
)
195 { uint_fast64_t aa
= (unsigned short)a
,
196 bb
= (unsigned short)b
,
197 cc
= (unsigned short)c
,
198 dd
= (unsigned short)d
;
199 value
= aa
| (bb
<< 16) | (cc
<< 32) | (dd
<< 48); }
202 void Init8(unsigned char a
,unsigned char b
,unsigned char c
,unsigned char d
,
203 unsigned char e
,unsigned char f
,unsigned char g
,unsigned char h
)
205 value
= ((uint_fast64_t)(a
| (b
<< 8) | (c
<< 16) | (d
<< 24)))
206 | (((uint_fast64_t)e
) << 32)
207 | (((uint_fast64_t)f
) << 40)
208 | (((uint_fast64_t)g
) << 48)
209 | (((uint_fast64_t)h
) << 56);
212 void Get(const unsigned char* p
) { value
= *(const uint_least64_t*)p
; }
213 void Put( unsigned char* p
)const { *(uint_least64_t*)p
= value
; }
215 c64
& operator&= (const c64
& b
) { value
&=b
.value
; return *this; }
216 c64
& operator|= (const c64
& b
) { value
|=b
.value
; return *this; }
217 c64
& operator^= (const c64
& b
) { value
^=b
.value
; return *this; }
218 c64
& operator+= (const c64
& b
) { value
+=b
.value
; return *this; }
219 c64
& operator-= (const c64
& b
) { value
-=b
.value
; return *this; }
220 c64
operator& (const c64
& b
) const { return value
& b
.value
; }
221 c64
operator| (const c64
& b
) const { return value
| b
.value
; }
222 c64
operator^ (const c64
& b
) const { return value
^ b
.value
; }
223 c64
operator- (const c64
& b
) const { return value
- b
.value
; }
224 c64
operator+ (const c64
& b
) const { return value
+ b
.value
; }
226 c64
operator& (uint_fast64_t b
) const { return value
& b
; }
228 c64
operator~ () const { return ~value
; }
/* usimdsim(type, count, op): views res.value as an array of `type` and, for
 * the first `count` elements, replaces each element with (element op b),
 * where b is a scalar. Expects `res` and `b` to be in scope at the point of
 * use and introduces the local name p. The expansion is not wrapped in
 * braces and ends without a semicolon. */
230 #define usimdsim(type, count, op) \
231 type* p = (type*)&res.value; \
232 for(int n=0; n<count; ++n) p[n] = (p[n] op b)
/* simdsim(type, count, op): like usimdsim, but the right-hand operand is the
 * matching element of b.value (b is an object with a `value` member).
 * Introduces the local names p and o. Because the expansion ends with the
 * parenthesized expression, trailing tokens written after the macro call
 * become part of the assigned expression. */
234 #define simdsim(type, count, op) \
235 type* p = (type*)&res.value; \
236 const type* o = (const type*)&b.value; \
237 for(int n=0; n<count; ++n) p[n] = (p[n] op o[n])
239 c64
shl64(int b
) const { return value
<< b
; }
240 c64
shr64(int b
) const { return value
>> b
; }
241 c64
shl16(int b
) const { c64 res
= *this; usimdsim(short, 2, <<); return res
; }
242 c64
shr16(int b
) const { c64 res
= *this; usimdsim(unsigned short, 2, >>); return res
; }
243 c64
sar32(int b
) const { c64 res
= *this; usimdsim(int, 2, >>); return res
; }
244 c64
sar16(int b
) const { c64 res
= *this; usimdsim(short, 2, >>); return res
; }
246 c64
add16(const c64
& b
) const { c64 res
= *this; simdsim(short, 4, +); return res
; }
247 c64
sub16(const c64
& b
) const { c64 res
= *this; simdsim(short, 4, -); return res
; }
248 c64
add32(const c64
& b
) const { c64 res
= *this; simdsim(int, 2, +); return res
; }
249 c64
sub32(const c64
& b
) const { c64 res
= *this; simdsim(int, 2, -); return res
; }
250 c64
mul16(const c64
& b
) const { c64 res
= *this; simdsim(short, 4, *); return res
; }
251 c64
mul16hi(const c64
& b
) const { c64 res
= *this; simdsim(short, 4, *) >> 16; return res
; }
252 c64
add8(const c64
& b
) const { c64 res
= *this; simdsim(unsigned char, 8, +); return res
; }
253 c64
sub8(const c64
& b
) const { c64 res
= *this; simdsim(unsigned char, 8, -); return res
; }
258 c64
conv_s32_s16(const c64
& b
) const
261 Init16(clamp_s16(value
& 0xFFFFFFFFU
),
262 clamp_s16(value
>> 32),
263 clamp_s16(b
.value
& 0xFFFFFFFFU
),
264 clamp_s16(b
.value
>> 32));
267 c64
conv_s16_u8(const c64
& b
) const
270 Init8(clamp_u8(value
& 0xFFFF),
271 clamp_u8((value
>> 16) & 0xFFFF),
272 clamp_u8((value
>> 32) & 0xFFFF),
273 clamp_u8((value
>> 48) & 0xFFFF),
274 clamp_u8(b
.value
& 0xFFFF),
275 clamp_u8((b
.value
>> 16) & 0xFFFF),
276 clamp_u8((b
.value
>> 32) & 0xFFFF),
277 clamp_u8((b
.value
>> 48) & 0xFFFF));
280 c64
conv_s16_s8(const c64
& b
) const
283 Init8(clamp_s8(value
& 0xFFFF),
284 clamp_s8((value
>> 16) & 0xFFFF),
285 clamp_s8((value
>> 32) & 0xFFFF),
286 clamp_s8((value
>> 48) & 0xFFFF),
287 clamp_s8(b
.value
& 0xFFFF),
288 clamp_s8((b
.value
>> 16) & 0xFFFF),
289 clamp_s8((b
.value
>> 32) & 0xFFFF),
290 clamp_s8((b
.value
>> 48) & 0xFFFF));
294 /* TODO: Verify that these are correct (though they should never be used anyway) */
/* Byte interleave of the low 32 bits of this value and of p.
 * With __MMX__ defined (and not ICC) the result is taken from
 * _m_punpcklbw(p.value, value). The other visible return builds
 * expand32_8(value) | (expand32_8(p.value) << 8), which places this value's
 * bytes at the even byte positions and p's bytes at the odd ones.
 * NOTE(review): confirm that both paths agree on operand order; the
 * intrinsic is given p first, the portable expression puts this value
 * first. */
295 c64
unpacklbw(const c64
& p
) const
297 #if defined(__MMX__) && !defined(__ICC)
298 /* ICC says [error: type of cast must be integral or enum]
299 * on the return value cast,
300 * so we cannot use this code on ICC. Fine for GCC. */
301 return (uint_least64_t)_m_punpcklbw(*(__m64
*)&p
.value
, *(__m64
*)&value
);
303 uint_fast64_t a
=value
, b
=p
.value
;
304 return expand32_8(a
) | (expand32_8(b
) << 8);
/* Byte interleave of the high 32 bits of this value and of p.
 * With __MMX__ defined (and not ICC) the result is taken from
 * _m_punpckhbw(p.value, value). The other visible return builds
 * expand32_8(value >> 32) | (expand32_8(p.value >> 32) << 8), which places
 * this value's bytes at the even byte positions and p's at the odd ones.
 * NOTE(review): confirm that both paths agree on operand order. */
307 c64
unpackhbw(const c64
& p
) const
309 #if defined(__MMX__) && !defined(__ICC)
310 return (uint_least64_t)_m_punpckhbw(*(__m64
*)&p
.value
, *(__m64
*)&value
);
312 uint_fast64_t a
=value
, b
=p
.value
;
313 return expand32_8(a
>>32) | (expand32_8(b
>>32) << 8);
/* 16-bit word interleave of the low 32 bits of this value and of p.
 * With __MMX__ defined (and not ICC) the result is taken from
 * _m_punpcklwd(p.value, value). The other visible return builds
 * expand32_16(value) | (expand32_16(p.value) << 16), which places this
 * value's words at the even word positions and p's at the odd ones.
 * NOTE(review): confirm that both paths agree on operand order. */
316 c64
unpacklwd(const c64
& p
) const
318 #if defined(__MMX__) && !defined(__ICC)
319 return (uint_least64_t)_m_punpcklwd(*(__m64
*)&p
.value
, *(__m64
*)&value
);
321 uint_fast64_t a
=value
, b
=p
.value
;
322 return expand32_16(a
) | (expand32_16(b
) << 16);
/* 16-bit word interleave of the high 32 bits of this value and of p.
 * With __MMX__ defined (and not ICC) the result is taken from
 * _m_punpckhwd(p.value, value). The other visible return builds
 * expand32_16(value >> 32) | (expand32_16(p.value >> 32) << 16), which
 * places this value's words at the even word positions and p's at the odd
 * ones.
 * NOTE(review): confirm that both paths agree on operand order. */
325 c64
unpackhwd(const c64
& p
) const
327 #if defined(__MMX__) && !defined(__ICC)
328 return (uint_least64_t)_m_punpckhwd(*(__m64
*)&p
.value
, *(__m64
*)&value
);
330 uint_fast64_t a
=value
, b
=p
.value
;
331 return expand32_16(a
>>32) | (expand32_16(b
>>32) << 16);
/* unpackldq(): single-operand form, forwards to unpackldq(*this).
 * unpackldq(p): combines 32-bit halves of this value and p.
 * With __MMX__ defined (and not ICC) the result is taken from
 * _m_punpckldq(p.value, value). The other visible return is
 * value | (p.value << 32).
 * NOTE(review): in that expression the upper 32 bits of value are not
 * masked off, so they are OR-ed into the half that receives p's low 32
 * bits. Also confirm that both paths agree on operand order. */
334 c64
unpackldq() const { return unpackldq(*this); }
335 c64
unpackldq(const c64
& p
) const
337 #if defined(__MMX__) && !defined(__ICC)
338 return (uint_least64_t)_m_punpckldq(*(__m64
*)&p
.value
, *(__m64
*)&value
);
340 return value
| (p
.value
<< 32);
348 typedef c64_nonMMX c64
;
351 static inline void MMX_clear()
354 _m_femms(); /* Note: not available on ICC or Valgrind */
356 #elif defined(__MMX__)