streamtools/simd.hh

   1 #if defined(__MMX__) && !defined(__x86_64)
   2 #define USE_MMX
   3 #endif
   4 #if defined(__SSE__)
   5 #define USE_SSE
   6 #endif
   7
   8 /* SIMD interface (MMX) written by Bisqwit
   9  * Copyright (C) 1992,2008 Joel Yliluoma (http://iki.fi/bisqwit/)
  10  */
  11
  12 #ifdef __3dNOW__
  13 # include <mm3dnow.h> /* Note: not available on ICC */
  14 #elif defined(__MMX__)
  15 # include <mmintrin.h>
  16 #endif
  17
  18 struct c64_common
  19 {
  20     static signed char clamp_s8(int_fast64_t v)
  21         { return v<-128 ? -128 : (v > 127 ? 127 : v); }
  22     static unsigned char clamp_u8(int_fast64_t v)
  23         { return v<0 ? 0 : (v > 255 ? 255 : v); }
  24     static short clamp_s16(int_fast64_t v)
  25         { return v<-32768 ? -32768 : (v > 32767 ? 32767 : v); }
  26
  27     static inline uint_fast64_t expand32_8(uint_fast32_t a)
  28     {
  29         // 0000abcd -> 0a0b0c0d
  30         typedef uint_fast64_t v;
  31         return (a&0xFFU)
  32             | ((a&0xFF00U)<<8)    // base: 8+8 = 16
  33             | ((v)(a&0xFF0000U)<<16) // base: 16+16 = 32
  34             | ((v)(a&0xFF000000UL)<<24); // base: 24+24 = 48
  35     }
  36     static inline uint_fast64_t expand32_16(uint_fast32_t a)
  37     {
  38         // 0000abcd -> 00ab00cd
  39         typedef uint_fast64_t v;
  40         return (a&0xFFFFU)
  41          | ((v)(a&0xFFFF0000UL)<<16);   // base: 16+16 = 32
  42     }
  43 };
  44
  45 #ifdef __MMX__
  46 /* 64-bit integers that use MMX / 3Dnow operations where relevant */
  47 struct c64_MMX: public c64_common
  48 {
  49     typedef c64_MMX c64;
  50
  51     __m64 value;
  52
  53     inline c64_MMX() { }
  54     inline c64_MMX(__m64 v) : value(v) { }
  55     inline c64_MMX(const uint64_t& v) : value( *(const __m64*)& v) { }
  56     inline c64_MMX(int v) : value(_m_from_int(v)) { }
  57     inline c64_MMX(short a,short b,short c, short d)
  58         : value(_mm_setr_pi16(a,b,c,d)) { }
  59
  60     inline c64 operator<< (int b) const { if(b < 0) return *this >> -b; return shl64(b); }
  61     inline c64 operator>> (int b) const { if(b < 0) return *this << -b; return shr64(b); }
  62     c64& operator<<= (int n) { return *this = shl64(n); }
  63     c64& operator>>= (int n) { return *this = shr64(n); }
  64
  65     c64 conv_s16_u8() const { return conv_s16_u8(*this); }
  66     c64 conv_s16_s8() const { return conv_s16_s8(*this); }
  67
  68     void Get(const unsigned char* p)      { value = *(const __m64*)p; }
  69     void Put(      unsigned char* p)const { *(__m64*)p =  value; }
  70
  71     void Init16(short a,short b,short c, short d)
  72         { value = _mm_setr_pi16(a,b,c,d); }
  73     void Init16(short a)
  74         { value = _mm_set1_pi16(a); }
  75
  76     void GetD(const unsigned char* p)      { value = *(const __m64*)p; }
  77
  78     template<int n>
  79     short Extract16() const { return ((const short*)&value)[n]; }
  80     template<int n>
  81     int Extract32() const { return ((const int*)&value)[n]; }
  82
  83     short Extract88_from_1616lo() const
  84     {
  85         const unsigned char* data = (const unsigned char*)&value;
  86         // bytes:  76543210
  87         // shorts: 33221100
  88         // take:        H L
  89         return data[0] | *(short*)(data+1);
  90         //return data[0] | ((*(const unsigned int*)data) >> 8);
  91     }
  92     short Extract88_from_1616hi() const
  93     {
  94         const unsigned char* data = 4+(const unsigned char*)&value;
  95         // bytes:  76543210
  96         // shorts: 33221100
  97         // take:    H L
  98         return data[0] | *(short*)(data+1);
  99         //return data[0] | ((*(const unsigned int*)data) >> 8);
 100     }
 101
 102
 103     c64& operator&= (const c64& b) { value=_mm_and_si64(value,b.value); return *this; }
 104     c64& operator|= (const c64& b) { value=_mm_or_si64(value,b.value); return *this; }
 105     c64& operator^= (const c64& b) { value=_mm_xor_si64(value,b.value); return *this; }
 106     c64& operator+= (const c64& b) { return *this = *this + b; }
 107     c64& operator-= (const c64& b) { return *this = *this - b; }
 108
 109     c64 operator~ () const {
 110         static const uint_least64_t negpat = ~(uint_least64_t)0;
 111         return c64(_mm_xor_si64(value, *(const __m64*)&negpat));
 112     }
 113
 114             /* psllqi: p = packed
 115                        s = shift
 116                        r = right, l = left
 117                        l = shift in zero, a = shift in sign bit
 118                        q = 64-bit, d = 32-bit, w = 16-bit
 119                       [i = immed amount]
 120              */
 121     c64 operator& (const c64& b) const { return c64(_mm_and_si64(value,b.value)); }
 122     c64 operator| (const c64& b) const { return c64(_mm_or_si64(value,b.value)); }
 123     c64 operator^ (const c64& b) const { return c64(_mm_xor_si64(value,b.value)); }
 124
 125     c64 operator- (const c64& b) const
 126     {
 127         #ifdef __SSE2__
 128         return _mm_sub_si64(value, b.value);
 129         #else
 130         return (const uint64_t&)value - (const uint64_t&)b.value;
 131         #endif
 132     }
 133     c64 operator+ (const c64& b) const
 134     {
 135         #ifdef __SSE2__
 136         return _mm_add_si64(value, b.value);
 137         #else
 138         return (const uint64_t&)value + (const uint64_t&)b.value;
 139         #endif
 140     }
 141
 142
 143     c64 shl64(int b) const { return _mm_slli_si64(value, b); }
 144     c64 shr64(int b) const { return _mm_srli_si64(value, b); }
 145     c64 shl16(int b) const { return _mm_slli_pi16(value, b); }
 146     c64 shr16(int b) const { return _mm_srli_pi16(value, b); }
 147     c64 sar32(int b) const { return _mm_srai_pi32(value, b); }
 148     c64 sar16(int b) const { return _mm_srai_pi16(value, b); }
 149     c64 add32(const c64& b) const { return _mm_add_pi32(value, b.value); }
 150     c64 add16(const c64& b) const { return _mm_add_pi16(value, b.value); }
 151     c64 sub32(const c64& b) const { return _mm_sub_pi32(value, b.value); }
 152     c64 sub16(const c64& b) const { return _mm_sub_pi16(value, b.value); }
 153     c64 mul16(const c64& b) const   { return _mm_mullo_pi16(value, b.value); }
 154     c64 mul16hi(const c64& b) const { return _mm_mulhi_pi16(value, b.value); }
 155     //c64 mul32(const c64& b) const { return _mm_mullo_pi32(value, b.value); }
 156     c64 add8(const c64& b) const { return _mm_add_pi8(value, b.value); }
 157     c64 sub8(const c64& b) const { return _mm_sub_pi8(value, b.value); }
 158
 159     c64 unpacklbw(const c64& b) const { return _mm_unpacklo_pi8(b.value,value); }
 160     c64 unpacklwd(const c64& b) const { return _mm_unpacklo_pi16(b.value,value); }
 161     c64 unpackhbw(const c64& b) const { return _mm_unpackhi_pi8(b.value,value); }
 162     c64 unpackhwd(const c64& b) const { return _mm_unpackhi_pi16(b.value,value); }
 163     c64 unpackldq(const c64& b) const { return _mm_unpacklo_pi32(b.value,value); }
 164     c64 unpackldq() const { return _mm_unpacklo_pi32(value,value); }
 165
 166     c64 operator& (const uint64_t& v) { return c64(_mm_and_si64(value, *(const __m64*)& v)); }
 167
 168     c64 conv_s32_s16(const c64& b) const { return _mm_packs_pi32(value, b.value); }
 169     c64 conv_s16_u8(const c64& b) const { return _mm_packs_pu16(value, b.value); }
 170     c64 conv_s16_s8(const c64& b) const { return _mm_packs_pi16(value, b.value); }
 171 };
 172 #endif
 173
 174 struct c64_nonMMX: public c64_common
 175 {
 176     typedef c64_nonMMX c64;
 177
 178     uint_least64_t value;
 179
 180     inline c64_nonMMX() { }
 181     inline c64_nonMMX(uint64_t v) : value(v) { }
 182     inline c64_nonMMX(int v) : value(v) { }
 183     inline c64_nonMMX(short a,short b,short c, short d)
 184         { Init16(a,b,c,d); }
 185
 186     c64 operator<< (int b) const { if(b < 0) return *this >> -b; return shl64(b); }
 187     c64 operator>> (int b) const { if(b < 0) return *this << -b; return shr64(b); }
 188     c64& operator<<= (int n) { return *this = shl64(n); }
 189     c64& operator>>= (int n) { return *this = shr64(n); }
 190
 191     c64 conv_s16_u8() const { return conv_s16_u8(*this); }
 192     c64 conv_s16_s8() const { return conv_s16_s8(*this); }
 193
 194     void Init16(short a,short b,short c, short d)
 195         { uint_fast64_t aa = (unsigned short)a,
 196                         bb = (unsigned short)b,
 197                         cc = (unsigned short)c,
 198                         dd = (unsigned short)d;
 199           value = aa | (bb << 16) | (cc << 32) | (dd << 48); }
 200     void Init16(short a)
 201         { Init16(a,a,a,a); }
 202     void Init8(unsigned char a,unsigned char b,unsigned char c,unsigned char d,
 203                unsigned char e,unsigned char f,unsigned char g,unsigned char h)
 204     {
 205         value = ((uint_fast64_t)(a | (b << 8) | (c << 16) | (d << 24)))
 206               | (((uint_fast64_t)e) << 32)
 207               | (((uint_fast64_t)f) << 40)
 208               | (((uint_fast64_t)g) << 48)
 209               | (((uint_fast64_t)h) << 56);
 210     }
 211
 212     void Get(const unsigned char* p)      { value = *(const uint_least64_t*)p; }
 213     void Put(      unsigned char* p)const { *(uint_least64_t*)p =  value; }
 214
 215     c64& operator&= (const c64& b) { value&=b.value; return *this; }
 216     c64& operator|= (const c64& b) { value|=b.value; return *this; }
 217     c64& operator^= (const c64& b) { value^=b.value; return *this; }
 218     c64& operator+= (const c64& b) { value+=b.value; return *this; }
 219     c64& operator-= (const c64& b) { value-=b.value; return *this; }
 220     c64 operator& (const c64& b) const { return value & b.value; }
 221     c64 operator| (const c64& b) const { return value | b.value; }
 222     c64 operator^ (const c64& b) const { return value ^ b.value; }
 223     c64 operator- (const c64& b) const { return value - b.value; }
 224     c64 operator+ (const c64& b) const { return value + b.value; }
 225
 226     c64 operator& (uint_fast64_t b) const { return value & b; }
 227
 228     c64 operator~ () const { return ~value; }
 229
 230     #define usimdsim(type, count, op) \
 231         type* p = (type*)&res.value; \
 232         for(int n=0; n<count; ++n) p[n] = (p[n] op b)
 233
 234     #define simdsim(type, count, op) \
 235         type* p = (type*)&res.value; \
 236         const type* o = (const type*)&b.value; \
 237         for(int n=0; n<count; ++n) p[n] = (p[n] op o[n])
 238
 239     c64 shl64(int b) const { return value << b; }
 240     c64 shr64(int b) const { return value >> b; }
 241     c64 shl16(int b) const { c64 res = *this; usimdsim(short, 2, <<); return res; }
 242     c64 shr16(int b) const { c64 res = *this; usimdsim(unsigned short, 2, >>); return res; }
 243     c64 sar32(int b) const { c64 res = *this; usimdsim(int, 2, >>); return res; }
 244     c64 sar16(int b) const { c64 res = *this; usimdsim(short, 2, >>); return res; }
 245
 246     c64 add16(const c64& b) const { c64 res = *this; simdsim(short, 4, +); return res; }
 247     c64 sub16(const c64& b) const { c64 res = *this; simdsim(short, 4, -); return res; }
 248     c64 add32(const c64& b) const { c64 res = *this; simdsim(int,   2, +); return res; }
 249     c64 sub32(const c64& b) const { c64 res = *this; simdsim(int,   2, -); return res; }
 250     c64 mul16(const c64& b) const { c64 res = *this; simdsim(short, 4, *); return res; }
 251     c64 mul16hi(const c64& b) const { c64 res = *this; simdsim(short, 4, *) >> 16; return res; }
 252     c64 add8(const c64& b) const { c64 res = *this; simdsim(unsigned char, 8, +); return res; }
 253     c64 sub8(const c64& b) const { c64 res = *this; simdsim(unsigned char, 8, -); return res; }
 254
 255     #undef simdsim
 256     #undef usimdsim
 257
 258     c64 conv_s32_s16(const c64& b) const
 259     {
 260         c64 res; res.
 261         Init16(clamp_s16(value & 0xFFFFFFFFU),
 262                clamp_s16(value >> 32),
 263                clamp_s16(b.value & 0xFFFFFFFFU),
 264                clamp_s16(b.value >> 32));
 265         return res;
 266     }
 267     c64 conv_s16_u8(const c64& b) const
 268     {
 269         c64 res; res.
 270         Init8(clamp_u8(value & 0xFFFF),
 271               clamp_u8((value >> 16) & 0xFFFF),
 272               clamp_u8((value >> 32) & 0xFFFF),
 273               clamp_u8((value >> 48) & 0xFFFF),
 274               clamp_u8(b.value & 0xFFFF),
 275               clamp_u8((b.value >> 16) & 0xFFFF),
 276               clamp_u8((b.value >> 32) & 0xFFFF),
 277               clamp_u8((b.value >> 48) & 0xFFFF));
 278         return res;
 279     }
 280     c64 conv_s16_s8(const c64& b) const
 281     {
 282         c64 res; res.
 283         Init8(clamp_s8(value & 0xFFFF),
 284               clamp_s8((value >> 16) & 0xFFFF),
 285               clamp_s8((value >> 32) & 0xFFFF),
 286               clamp_s8((value >> 48) & 0xFFFF),
 287               clamp_s8(b.value & 0xFFFF),
 288               clamp_s8((b.value >> 16) & 0xFFFF),
 289               clamp_s8((b.value >> 32) & 0xFFFF),
 290               clamp_s8((b.value >> 48) & 0xFFFF));
 291         return res;
 292     }
 293
 294     /* TODO: Verify that these are correct (though they should never be used anyway) */
 295     c64 unpacklbw(const c64& p) const
 296     {
 297     #if defined(__MMX__) && !defined(__ICC)
 298         /* ICC says [error: type of cast must be integral or enum]
 299          * on the return value cast,
 300          * so we cannot use this code on ICC. Fine for GCC. */
 301         return (uint_least64_t)_m_punpcklbw(*(__m64*)&p.value, *(__m64*)&value);
 302     #else
 303         uint_fast64_t a=value, b=p.value;
 304         return expand32_8(a) | (expand32_8(b) << 8);
 305     #endif
 306     }
 307     c64 unpackhbw(const c64& p) const
 308     {
 309     #if defined(__MMX__) && !defined(__ICC)
 310         return (uint_least64_t)_m_punpckhbw(*(__m64*)&p.value, *(__m64*)&value);
 311     #else
 312         uint_fast64_t a=value, b=p.value;
 313         return expand32_8(a>>32) | (expand32_8(b>>32) << 8);
 314     #endif
 315     }
 316     c64 unpacklwd(const c64& p) const
 317     {
 318     #if defined(__MMX__) && !defined(__ICC)
 319         return (uint_least64_t)_m_punpcklwd(*(__m64*)&p.value, *(__m64*)&value);
 320     #else
 321         uint_fast64_t a=value, b=p.value;
 322         return expand32_16(a) | (expand32_16(b) << 16);
 323     #endif
 324     }
 325     c64 unpackhwd(const c64& p) const
 326     {
 327     #if defined(__MMX__) && !defined(__ICC)
 328         return (uint_least64_t)_m_punpckhwd(*(__m64*)&p.value, *(__m64*)&value);
 329     #else
 330         uint_fast64_t a=value, b=p.value;
 331         return expand32_16(a>>32) | (expand32_16(b>>32) << 16);
 332     #endif
 333     }
 334     c64 unpackldq() const { return unpackldq(*this); }
 335     c64 unpackldq(const c64& p) const
 336     {
 337     #if defined(__MMX__) && !defined(__ICC)
 338         return (uint_least64_t)_m_punpckldq(*(__m64*)&p.value, *(__m64*)&value);
 339     #else
 340         return value | (p.value << 32);
 341     #endif
 342     }
 343 };
 344
 345 #ifdef USE_MMX
 346 typedef c64_MMX c64;
 347 #else
 348 typedef c64_nonMMX c64;
 349 #endif
 350
 351 static inline void MMX_clear()
 352 {
 353     #ifdef __3dNOW__
 354     _m_femms(); /* Note: not available on ICC or Valgrind */
 355     //_mm_empty();
 356     #elif defined(__MMX__)
 357     _mm_empty();
 358     #endif
 359 }