simd.h

   1 // This file is part of the ustl library, an STL implementation.
   2 //
   3 // Copyright (C) 2005 by Mike Sharov <msharov@users.sourceforge.net>
   4 // This file is free software, distributed under the MIT License.
   5 //
   6 /// \file simd.h
   7 /// \brief SIMD-type algorithms, with hardware acceleration, if available.
   8 ///
   9 /// All algorithms are container-based because iterator syntax is just too
  10 /// damn verbose and because the specializations need to be able to tell
  11 /// how many elements are in the container in order to choose proper SIMD
  12 /// instruction set (i.e.: 4 floats select SSE, while 2 floats select 3dNow!)
  13 /// Specializations are only for the tuple template because the container
  14 /// must be of a fixed and compile-time-known size for the compiler to be
  15 /// able to choose the specialization.
  16 ///
  17
  18 #ifndef SIMD_H_39BE2D970DF4BD00508CCFFB482496F9
  19 #define SIMD_H_39BE2D970DF4BD00508CCFFB482496F9
  20
  21 #include "ulimits.h"
  22 #if HAVE_MATH_H
  23     #include <math.h>
  24 #endif
  25
  26 namespace ustl {
  27 namespace simd {
  28
  29 //----------------------------------------------------------------------
  30 // Generic algorithms
  31 //----------------------------------------------------------------------
  32
  33 /// Applies \p op to each element in \p op1.
  34 template <typename Ctr, typename UnaryOperation>
  35 inline void packop (Ctr& op1, UnaryOperation op)
  36 {
  37     foreach (typename Ctr::iterator, i, op1)
  38         op (*i);
  39 }
  40
  41 /// Applies \p op to each element in \p op1 and \p op2 and stores in \p op2.
  42 template <typename Ctr, typename BinaryOperation>
  43 inline void packop (const Ctr& op1, Ctr& op2, BinaryOperation op)
  44 {
  45     assert (op2.size() <= op1.size());
  46     typename Ctr::const_iterator i1 (op1.begin());
  47     typename Ctr::iterator i2 (op2.begin());
  48     for (; i2 != op2.end(); ++i1, ++i2)
  49         *i2 = op (*i2, *i1);
  50 }
  51
  52 /// Applies \p op to corresponding elements in \p op1 and \p op2 and stores in \p result.
  53 template <typename Ctr, typename BinaryOperation>
  54 inline void packop (const Ctr& op1, const Ctr& op2, Ctr& result, BinaryOperation op)
  55 {
  56     assert (op1.size() <= op2.size() && op1.size() <= result.size());
  57     passign (op1, result);
  58     packop (op2, result);
  59 }
  60
  61 /// Copies \p op1 into \p result.
  62 template <typename Ctr>
  63 inline void passign (const Ctr& op1, Ctr& result)
  64 {
  65     assert (op1.size() <= result.size());
  66     typename Ctr::iterator d (result.begin());
  67     foreach (typename Ctr::const_iterator, s, op1)
  68         *d++ = *s;
  69 }
  70
  71 /// Copies \p result.size() elements from \p op1 to \p result.
  72 template <typename Ctr>
  73 inline void ipassign (typename Ctr::const_iterator op1, Ctr& result)
  74 {
  75     foreach (typename Ctr::iterator, d, result)
  76         *d = *op1++;
  77 }
  78
  79 template <typename Ctr1, typename Ctr2, typename ConvertFunction>
  80 inline void pconvert (const Ctr1& op1, Ctr2& op2, ConvertFunction f)
  81 {
  82     assert (op1.size() <= op2.size());
  83     typename Ctr1::const_iterator i1 (op1.begin());
  84     typename Ctr2::iterator i2 (op2.begin());
  85     for (; i1 != op1.end(); ++i1, ++i2)
  86         *i2 = f (*i1);
  87 }
  88
  89 // Functors for SIMD-style element operations: saturating add/subtract,
     // shifts, min/max, rounded average, casts and, when math.h is available,
     // reciprocal, square root, trigonometry and rounding.
     // NOTE(review): STD_BINARY_FUNCTOR, STD_UNARY_FUNCTOR and
     // STD_CONVERSION_FUNCTOR are defined outside this file; presumably each
     // generates a functor template whose operator() evaluates the given
     // expression on its argument(s) a (and b), with D naming the destination
     // type of a conversion -- confirm against their definitions.
     //
     // fpadds yields numeric_limits<T>::max() when b > max - a, otherwise a + b.
     // fpsubs yields numeric_limits<T>::min() when a < min + b, otherwise a - b.
     // NOTE(review): fpadds has no lower-bound test and fpsubs no upper-bound
     // test, so signed operands do not look saturated in those directions --
     // verify whether that is intended.
  90 STD_BINARY_FUNCTOR (fpadds, T, ((b > numeric_limits<T>::max() - a) ? numeric_limits<T>::max() : a + b))
  91 STD_BINARY_FUNCTOR (fpsubs, T, ((a < numeric_limits<T>::min() + b) ? numeric_limits<T>::min() : a - b))
  92 STD_BINARY_FUNCTOR (fpshl,  T, (a << b))
  93 STD_BINARY_FUNCTOR (fpshr,  T, (a >> b))
  94 STD_BINARY_FUNCTOR (fpmin,  T, (min (a, b)))
  95 STD_BINARY_FUNCTOR (fpmax,  T, (max (a, b)))
     // Generic average adds 1 before halving; the float and double
     // specializations at the end of this group use plain (a + b) / 2.
  96 STD_BINARY_FUNCTOR (fpavg,  T, ((a + b + 1) / 2))
  97 STD_CONVERSION_FUNCTOR (fcast, (D(a)))
  98 #if HAVE_MATH_H
     // The math functors call reset_mmx() before touching <math.h> routines.
     // NOTE(review): reset_mmx() is defined elsewhere; presumably it leaves MMX
     // state so that x87 floating point is usable again -- confirm.
  99 STD_UNARY_FUNCTOR (fpreciprocal,T, (1 / a))
 100 STD_UNARY_FUNCTOR (fpsqrt,      T, (reset_mmx(), T (sqrt (a))))
 101 STD_UNARY_FUNCTOR (fprecipsqrt, T, (reset_mmx(), 1 / T(sqrt (a))))
 102 STD_UNARY_FUNCTOR (fsin,        T, (reset_mmx(), T (sin (a))))
 103 STD_UNARY_FUNCTOR (fcos,        T, (reset_mmx(), T (cos (a))))
 104 STD_UNARY_FUNCTOR (ftan,        T, (reset_mmx(), T (tan (a))))
     // fround rounds with rintf() when available, rint() otherwise, then casts
     // the result to the destination type D.
 105 #if HAVE_RINTF
 106 STD_CONVERSION_FUNCTOR (fround, (reset_mmx(), D(rintf(a))))
 107 #else
 108 STD_CONVERSION_FUNCTOR (fround, (reset_mmx(), D(rint(a))))
 109 #endif
     // double -> int32_t always goes through the double-precision rint().
 110 template <> inline int32_t fround<double,int32_t>::operator()(const double& a) const { reset_mmx(); return (int32_t(rint(a))); }
 111 #endif
     // Floating-point averages: no +1 rounding term.
 112 template <> inline float fpavg<float>::operator()(const float& a, const float& b) const { return ((a + b) / 2); }
 113 template <> inline double fpavg<double>::operator()(const double& a, const double& b) const { return ((a + b) / 2); }
 114
 115 #define SIMD_PACKEDOP1(name, operation)         \
 116 template <typename Ctr>                         \
 117 inline void name (Ctr& op1)                     \
 118 {                                               \
 119     typedef typename Ctr::value_type value_t;   \
 120     packop (op1, operation<value_t>());         \
 121 }
 122 #define SIMD_PACKEDOP2(name, operation)         \
 123 template <typename Ctr>                         \
 124 inline void name (const Ctr& op1, Ctr& op2)     \
 125 {                                               \
 126     typedef typename Ctr::value_type value_t;   \
 127     packop (op1, op2, operation<value_t>());    \
 128 }
 129 #define SIMD_PACKEDOP3(name, operation)                 \
 130 template <typename Ctr>                                 \
 131 inline void name (const Ctr& op1, const Ctr& op2, Ctr& result)  \
 132 {                                                       \
 133     typedef typename Ctr::value_type value_t;           \
 134     packop (op1, op2, result, operation<value_t>());    \
 135 }
 136 #define SIMD_SINGLEOP1(name, operation)         \
 137 template <typename T>                           \
 138 inline T name (T op)                            \
 139 {                                               \
 140     operation<T> obj;                           \
 141     return (obj(op));                           \
 142 }
 143 #define SIMD_CONVERTOP(name, operation)         \
 144 template <typename Ctr1, typename Ctr2>         \
 145 inline void name (const Ctr1& op1, Ctr2& op2)   \
 146 {                                               \
 147     typedef typename Ctr1::value_type value1_t; \
 148     typedef typename Ctr2::value_type value2_t; \
 149     pconvert (op1, op2, operation<value1_t, value2_t>());\
 150 }
 151
 152 SIMD_PACKEDOP2 (padd, plus)
 153 SIMD_PACKEDOP2 (psub, minus)
 154 SIMD_PACKEDOP2 (pmul, multiplies)
 155 SIMD_PACKEDOP2 (pdiv, divides)
 156 SIMD_PACKEDOP2 (pand, bitwise_and)
 157 SIMD_PACKEDOP2 (por, bitwise_or)
 158 SIMD_PACKEDOP2 (pxor, bitwise_xor)
 159 SIMD_PACKEDOP2 (pshl, fpshl)
 160 SIMD_PACKEDOP2 (pshr, fpshr)
 161 SIMD_PACKEDOP2 (psubs, fpsubs)
 162 SIMD_PACKEDOP2 (pmin, fpmin)
 163 SIMD_PACKEDOP2 (pmax, fpmax)
 164 SIMD_PACKEDOP2 (pavg, fpavg)
 165
 166 SIMD_PACKEDOP3 (padd, plus)
 167 SIMD_PACKEDOP3 (psub, minus)
 168 SIMD_PACKEDOP3 (pmul, multiplies)
 169 SIMD_PACKEDOP3 (pdiv, divides)
 170 SIMD_PACKEDOP3 (pand, bitwise_and)
 171 SIMD_PACKEDOP3 (por, bitwise_or)
 172 SIMD_PACKEDOP3 (pxor, bitwise_xor)
 173 SIMD_PACKEDOP3 (pshl, fpshl)
 174 SIMD_PACKEDOP3 (pshr, fpshr)
 175 SIMD_PACKEDOP3 (padds, fpadds)
 176 SIMD_PACKEDOP3 (psubs, fpsubs)
 177 SIMD_PACKEDOP3 (pmin, fpmin)
 178 SIMD_PACKEDOP3 (pmax, fpmax)
 179 SIMD_PACKEDOP3 (pavg, fpavg)
 180
 181 #if HAVE_MATH_H
 182 SIMD_PACKEDOP1 (precip, fpreciprocal)
 183 SIMD_PACKEDOP1 (psqrt, fpsqrt)
 184 SIMD_PACKEDOP1 (precipsqrt, fprecipsqrt)
 185 SIMD_PACKEDOP1 (psin, fsin)
 186 SIMD_PACKEDOP1 (pcos, fcos)
 187 SIMD_PACKEDOP1 (ptan, ftan)
 188
 189 SIMD_SINGLEOP1 (srecip, fpreciprocal)
 190 SIMD_SINGLEOP1 (ssqrt, fpsqrt)
 191 SIMD_SINGLEOP1 (srecipsqrt, fprecipsqrt)
 192 SIMD_SINGLEOP1 (ssin, fsin)
 193 SIMD_SINGLEOP1 (scos, fcos)
 194 SIMD_SINGLEOP1 (stan, ftan)
 195
 196 SIMD_CONVERTOP (pround, fround)
 197
 198 template <typename T> inline int32_t sround (T op) { fround<T,int32_t> obj; return (obj (op)); }
 199 #endif
 200
 201 #undef SIMD_SINGLEOP1
 202 #undef SIMD_PACKEDOP3
 203 #undef SIMD_PACKEDOP2
 204 #undef SIMD_PACKEDOP1
 205
 206 //----------------------------------------------------------------------
 207 // Vector types to cast tuple data to
 208 //----------------------------------------------------------------------
 209
 210 #if HAVE_VECTOR_EXTENSIONS && __GNUC__ >= 4
     // Vector extensions available: tag the typedef as a vector of vs bytes.
     // The mode argument is not used by either definition of the macro.
 211 #define VECTOR_ATTRIBUTE(mode,vs)       __attribute__((vector_size(vs)))
 212 #else
     // No vector extensions: the attribute expands to nothing, so every
     // typedef below is a plain alias of its element type.
 213 #define VECTOR_ATTRIBUTE(mode,vs)
 214 #endif
     // Each typedef pairs an element type with a total vector size in bytes
     // (8 or 16), e.g. v8qi_t is uint8_t with an 8-byte vector size.
 215 typedef uint8_t v8qi_t VECTOR_ATTRIBUTE (V8QI,8);
 216 typedef uint16_t v4hi_t VECTOR_ATTRIBUTE (V4HI,8);
 217 typedef uint16_t v8hi_t VECTOR_ATTRIBUTE (V8HI,16);
 218 typedef uint32_t v2si_t VECTOR_ATTRIBUTE (V2SI,8);
 219 typedef uint32_t v4si_t VECTOR_ATTRIBUTE (V4SI,16);
     // The 64-bit element type is only declared when int64 support is configured.
 220 #if HAVE_INT64_T
 221 typedef uint64_t v1di_t VECTOR_ATTRIBUTE (V1DI,8);
 222 #endif
 223 typedef float v2sf_t VECTOR_ATTRIBUTE (V2SF,8);
 224 typedef float v4sf_t VECTOR_ATTRIBUTE (V4SF,16);
 225 typedef double v2df_t VECTOR_ATTRIBUTE (V2DF,16);
     // The helper is local to this group of typedefs.
 226 #undef VECTOR_ATTRIBUTE
 227
 228 #define SIMDA_RI(n)             "m"(oin[n])     // asm input operand: element n of oin
 229 #define SIMDA_RO(n)             "m"(oout[n])    // asm input operand: element n of oout
 230 #define SIMDA_WI(n)             "=m"(oin[n])    // asm output operand: element n of oin
 231 #define SIMDA_WO(n)             "=m"(oout[n])   // asm output operand: element n of oout
 232
 233 //----------------------------------------------------------------------
 234 // Hardware accelerated specializations
 235 //----------------------------------------------------------------------
 236
     // Signature generators for the explicit specializations that follow. Each
     // expands to the "template <> inline void ..." header only; the macro user
     // supplies the body. The parameters are always named oin (source) and oout
     // (destination) because the inline-asm operand lists refer to those names.
     //
     // SIMD_PKOP2_SPEC: in-place binary packop on tuple<n,type>, selected by an
     // unnamed functor argument of type optype<type>.
 237 #define SIMD_PKOP2_SPEC(n, type, optype)        \
 238 template <>                                     \
 239 inline void packop (const tuple<n,type>& oin, tuple<n,type>& oout, optype<type>)
     // SIMD_PASSIGN_SPEC: passign from one tuple<n,type> to another.
 240 #define SIMD_PASSIGN_SPEC(n, type)              \
 241 template <>                                     \
 242 inline void passign (const tuple<n,type>& oin, tuple<n,type>& oout)
     // SIMD_IPASSIGN_SPEC: ipassign from a tuple const_iterator into a tuple.
 243 #define SIMD_IPASSIGN_SPEC(n, type)             \
 244 template <>                                     \
 245 inline void ipassign (tuple<n,type>::const_iterator oin, tuple<n,type>& oout)
     // SIMD_CONVERT_SPEC: pconvert from tuple<n,type1> to tuple<n,type2>,
     // selected by an unnamed functor argument of type optype<type1,type2>.
 246 #define SIMD_CONVERT_SPEC(n, type1, type2, optype)      \
 247 template <>                                     \
 248 inline void pconvert (const tuple<n,type1>& oin, tuple<n,type2>& oout, optype<type1,type2>)
 249
 250 #if CPU_HAS_MMX
 251 #define STD_MMX_ARGS    "=m"(oout[0]) : "m"(oin[0]) : "mm0", "st", "memory"
 252 #define DBL_MMX_ARGS    "=m"(oout[0]), "=m"(oout[2]) : "m"(oin[0]), "m"(oin[2]) : "mm0", "mm1", "st", "st(1)", "memory"
 253 #define MMX_PKOP2_SPEC(n,type,optype,instruction)       \
 254 SIMD_PKOP2_SPEC(n,type,optype)          \
 255 { asm ("movq %0, %%mm0\n\t" #instruction " %1, %%mm0\n\tmovq %%mm0, %0" : STD_MMX_ARGS); reset_mmx(); }
 256 #define MMX_DBL_PKOP2_SPEC(n,type,optype,instruction)   \
 257 SIMD_PKOP2_SPEC(n,type,optype)          \
 258 { asm ("movq %0, %%mm0\n\tmovq %1, %%mm1\n\t" #instruction " %2, %%mm0\n\t" #instruction " %3, %%mm1\n\tmovq %%mm0, %0\n\tmovq %%mm1, %1" : DBL_MMX_ARGS); reset_mmx(); }
 259 #define MMX_PASSIGN_SPEC(n,type)        \
 260 SIMD_PASSIGN_SPEC(n,type)               \
 261 { asm ("movq %1, %%mm0\n\tmovq %%mm0, %0" : STD_MMX_ARGS); reset_mmx(); }
 262 #define MMX_DBL_PASSIGN_SPEC(n,type)    \
 263 SIMD_PASSIGN_SPEC(n,type)               \
 264 { asm ("movq %2, %%mm0\n\tmovq %3, %%mm1\n\tmovq %%mm0, %0\n\tmovq %%mm1, %1" : DBL_MMX_ARGS); reset_mmx(); }
 265 #define MMX_IPASSIGN_SPEC(n,type)       \
 266 SIMD_IPASSIGN_SPEC(n,type)              \
 267 { asm ("movq %1, %%mm0\n\tmovq %%mm0, %0" : STD_MMX_ARGS); reset_mmx(); }
 268 #define MMX_DBL_IPASSIGN_SPEC(n,type)   \
 269 SIMD_IPASSIGN_SPEC(n,type)              \
 270 { asm ("movq %2, %%mm0\n\tmovq %3, %%mm1\n\tmovq %%mm0, %0\n\tmovq %%mm1, %1" : DBL_MMX_ARGS); reset_mmx(); }
 271
 272 MMX_PASSIGN_SPEC(8,uint8_t)    // 8 x uint8_t: copy, then add/sub, bitwise ops, saturating add/sub (paddusb/psubusb)
 273 MMX_PKOP2_SPEC(8,uint8_t,plus,paddb)
 274 MMX_PKOP2_SPEC(8,uint8_t,minus,psubb)
 275 MMX_PKOP2_SPEC(8,uint8_t,bitwise_and,pand)
 276 MMX_PKOP2_SPEC(8,uint8_t,bitwise_or,por)
 277 MMX_PKOP2_SPEC(8,uint8_t,bitwise_xor,pxor)
 278 MMX_PKOP2_SPEC(8,uint8_t,fpadds,paddusb)
 279 MMX_PKOP2_SPEC(8,uint8_t,fpsubs,psubusb)
 280
     // 8 x int8_t: same set; saturating forms use paddsb/psubsb.
 281 MMX_PASSIGN_SPEC(8,int8_t)
 282 MMX_PKOP2_SPEC(8,int8_t,plus,paddb)
 283 MMX_PKOP2_SPEC(8,int8_t,minus,psubb)
 284 MMX_PKOP2_SPEC(8,int8_t,bitwise_and,pand)
 285 MMX_PKOP2_SPEC(8,int8_t,bitwise_or,por)
 286 MMX_PKOP2_SPEC(8,int8_t,bitwise_xor,pxor)
 287 MMX_PKOP2_SPEC(8,int8_t,fpadds,paddsb)
 288 MMX_PKOP2_SPEC(8,int8_t,fpsubs,psubsb)
 289
     // 4 x uint16_t: saturating forms use paddusw/psubusw. The shift
     // specializations are disabled (see the todo below).
 290 MMX_PASSIGN_SPEC(4,uint16_t)
 291 MMX_PKOP2_SPEC(4,uint16_t,plus,paddw)
 292 MMX_PKOP2_SPEC(4,uint16_t,minus,psubw)
 293 MMX_PKOP2_SPEC(4,uint16_t,bitwise_and,pand)
 294 MMX_PKOP2_SPEC(4,uint16_t,bitwise_or,por)
 295 MMX_PKOP2_SPEC(4,uint16_t,bitwise_xor,pxor)
 296 /// \todo psllw does not work like other operations, it uses the first element for shift count.
 297 //MMX_PKOP2_SPEC(4,uint16_t,fpshl,psllw)
 298 //MMX_PKOP2_SPEC(4,uint16_t,fpshr,psrlw)
 299 MMX_PKOP2_SPEC(4,uint16_t,fpadds,paddusw)
 300 MMX_PKOP2_SPEC(4,uint16_t,fpsubs,psubusw)
 301
     // 4 x int16_t: saturating forms use paddsw/psubsw.
 302 MMX_PASSIGN_SPEC(4,int16_t)
 303 MMX_PKOP2_SPEC(4,int16_t,plus,paddw)
 304 MMX_PKOP2_SPEC(4,int16_t,minus,psubw)
 305 MMX_PKOP2_SPEC(4,int16_t,bitwise_and,pand)
 306 MMX_PKOP2_SPEC(4,int16_t,bitwise_or,por)
 307 MMX_PKOP2_SPEC(4,int16_t,bitwise_xor,pxor)
 308 //MMX_PKOP2_SPEC(4,int16_t,fpshl,psllw)
 309 //MMX_PKOP2_SPEC(4,int16_t,fpshr,psrlw)
 310 MMX_PKOP2_SPEC(4,int16_t,fpadds,paddsw)
 311 MMX_PKOP2_SPEC(4,int16_t,fpsubs,psubsw)
 312
     // 2 x uint32_t: add/sub and bitwise ops only; no saturating forms are
     // specialized for 32-bit elements.
 313 MMX_PASSIGN_SPEC(2,uint32_t)
 314 MMX_PKOP2_SPEC(2,uint32_t,plus,paddd)
 315 MMX_PKOP2_SPEC(2,uint32_t,minus,psubd)
 316 MMX_PKOP2_SPEC(2,uint32_t,bitwise_and,pand)
 317 MMX_PKOP2_SPEC(2,uint32_t,bitwise_or,por)
 318 MMX_PKOP2_SPEC(2,uint32_t,bitwise_xor,pxor)
 319 //MMX_PKOP2_SPEC(2,uint32_t,fpshl,pslld)
 320 //MMX_PKOP2_SPEC(2,uint32_t,fpshr,psrld)
 321
     // 2 x int32_t: same set as uint32_t.
 322 MMX_PASSIGN_SPEC(2,int32_t)
 323 MMX_PKOP2_SPEC(2,int32_t,plus,paddd)
 324 MMX_PKOP2_SPEC(2,int32_t,minus,psubd)
 325 MMX_PKOP2_SPEC(2,int32_t,bitwise_and,pand)
 326 MMX_PKOP2_SPEC(2,int32_t,bitwise_or,por)
 327 MMX_PKOP2_SPEC(2,int32_t,bitwise_xor,pxor)
 328 //MMX_PKOP2_SPEC(2,int32_t,fpshl,pslld)
 329 //MMX_PKOP2_SPEC(2,int32_t,fpshr,psrld)
 330
     // 4 x uint32_t: processed as two 64-bit halves (elements 0-1 and 2-3).
     // NOTE(review): the disabled shift lines in this group and the next name
     // n=2 although the groups are for 4-element tuples; adjust if re-enabled.
 331 MMX_DBL_PKOP2_SPEC(4,uint32_t,plus,paddd)
 332 MMX_DBL_PKOP2_SPEC(4,uint32_t,minus,psubd)
 333 MMX_DBL_PKOP2_SPEC(4,uint32_t,bitwise_and,pand)
 334 MMX_DBL_PKOP2_SPEC(4,uint32_t,bitwise_or,por)
 335 MMX_DBL_PKOP2_SPEC(4,uint32_t,bitwise_xor,pxor)
 336 //MMX_DBL_PKOP2_SPEC(2,uint32_t,fpshl,pslld)
 337 //MMX_DBL_PKOP2_SPEC(2,uint32_t,fpshr,psrld)
 338
     // 4 x int32_t: processed as two 64-bit halves.
 339 MMX_DBL_PKOP2_SPEC(4,int32_t,plus,paddd)
 340 MMX_DBL_PKOP2_SPEC(4,int32_t,minus,psubd)
 341 MMX_DBL_PKOP2_SPEC(4,int32_t,bitwise_and,pand)
 342 MMX_DBL_PKOP2_SPEC(4,int32_t,bitwise_or,por)
 343 MMX_DBL_PKOP2_SPEC(4,int32_t,bitwise_xor,pxor)
 344 //MMX_DBL_PKOP2_SPEC(2,int32_t,fpshl,pslld)
 345 //MMX_DBL_PKOP2_SPEC(2,int32_t,fpshr,psrld)
 346
 347 #if CPU_HAS_SSE || CPU_HAS_3DNOW
 348 MMX_PKOP2_SPEC(8,uint8_t,fpavg,pavgb)
 349 MMX_PKOP2_SPEC(8,int8_t,fpavg,pavgb)
 350 MMX_PKOP2_SPEC(4,uint16_t,fpavg,pavgw)
 351 MMX_PKOP2_SPEC(4,int16_t,fpavg,pavgw)
 352 MMX_PKOP2_SPEC(8,uint8_t,fpmin,pminub)
 353 MMX_PKOP2_SPEC(8,uint8_t,fpmax,pmaxub)
 354 MMX_PKOP2_SPEC(4,int16_t,fpmax,pmaxsw)
 355 MMX_PKOP2_SPEC(4,int16_t,fpmin,pminsw)
 356 #endif // CPU_HAS_SSE || CPU_HAS_3DNOW
 357
 358 #if CPU_HAS_3DNOW
 359 MMX_PASSIGN_SPEC(2,float)
 360 MMX_PKOP2_SPEC(2,float,plus,pfadd)
 361 MMX_PKOP2_SPEC(2,float,minus,pfsub)
 362 MMX_PKOP2_SPEC(2,float,multiplies,pfmul)
 363 MMX_PKOP2_SPEC(2,float,fpmin,pfmin)
 364 MMX_PKOP2_SPEC(2,float,fpmax,pfmax)
 365 #ifndef CPU_HAS_SSE
 366 MMX_DBL_PKOP2_SPEC(4,float,plus,pfadd)
 367 MMX_DBL_PKOP2_SPEC(4,float,minus,pfsub)
 368 MMX_DBL_PKOP2_SPEC(4,float,multiplies,pfmul)
 369 MMX_DBL_PKOP2_SPEC(4,float,fpmin,pfmin)
 370 MMX_DBL_PKOP2_SPEC(4,float,fpmax,pfmax)
 371 #endif
 372 #endif // CPU_HAS_3DNOW
 373
 374 MMX_IPASSIGN_SPEC(8,uint8_t)
 375 MMX_IPASSIGN_SPEC(4,uint16_t)
 376 MMX_IPASSIGN_SPEC(2,uint32_t)
 377 MMX_IPASSIGN_SPEC(2,float)
 378
 379 #ifndef CPU_HAS_SSE
 380 MMX_DBL_PASSIGN_SPEC(4,float)
 381 MMX_DBL_PASSIGN_SPEC(4,uint32_t)
 382 MMX_DBL_PASSIGN_SPEC(4,int32_t)
 383 MMX_DBL_IPASSIGN_SPEC(4,float)
 384 MMX_DBL_IPASSIGN_SPEC(4,uint32_t)
 385 MMX_DBL_IPASSIGN_SPEC(4,int32_t)
 386 #endif
 387
 388 #undef MMX_IPASSIGN_SPEC
 389 #undef MMX_PASSIGN_SPEC
 390 #undef MMX_PKOP2_SPEC
 391 #undef STD_MMX_ARGS
 392 #endif // CPU_HAS_MMX
 393
 394 #if CPU_HAS_SSE
 395 #define STD_SSE_ARGS    "=m"(oout[0]) : "m"(oin[0]) : "xmm0", "memory"
 396 #define SSE_PKOP2_SPEC(n,type,optype,instruction)       \
 397 SIMD_PKOP2_SPEC(n,type,optype)          \
 398 { asm ("movups %0, %%xmm0\n\tmovups %1, %%xmm1\n\t" #instruction " %%xmm1, %%xmm0\n\tmovups %%xmm0, %0" : STD_SSE_ARGS);}
 399 #define SSE_PASSIGN_SPEC(n,type)                        \
 400 SIMD_PASSIGN_SPEC(n,type)               \
 401 { asm ("movups %1, %%xmm0\n\tmovups %%xmm0, %0" : STD_SSE_ARGS);}
 402 #define SSE_IPASSIGN_SPEC(n,type)       \
 403 SIMD_IPASSIGN_SPEC(n,type)              \
 404 { asm ("movups %1, %%xmm0\n\tmovups %%xmm0, %0" : STD_SSE_ARGS);}
 405 SSE_PASSIGN_SPEC(4,float)
 406 SSE_PASSIGN_SPEC(4,int32_t)
 407 SSE_PASSIGN_SPEC(4,uint32_t)
 408 SSE_PKOP2_SPEC(4,float,plus,addps)
 409 SSE_PKOP2_SPEC(4,float,minus,subps)
 410 SSE_PKOP2_SPEC(4,float,multiplies,mulps)
 411 SSE_PKOP2_SPEC(4,float,divides,divps)
 412 SSE_PKOP2_SPEC(4,float,bitwise_and,andps)
 413 SSE_PKOP2_SPEC(4,float,bitwise_or,orps)
 414 SSE_PKOP2_SPEC(4,float,bitwise_xor,xorps)
 415 SSE_PKOP2_SPEC(4,float,fpmax,maxps)
 416 SSE_PKOP2_SPEC(4,float,fpmin,minps)
 417
 418 SIMD_CONVERT_SPEC(4,float,int32_t,fround) {
         // Converts four floats to int32_t two at a time: cvtps2pi turns the
         // float pairs at oin[0] (%2) and oin[2] (%3) into integer pairs in
         // mm0/mm1, which are then stored to oout[0] (%0) and oout[2] (%1).
         // Operand numbering comes from DBL_MMX_ARGS, which is only defined in
         // the CPU_HAS_MMX section of this file.
 419     asm ("cvtps2pi %2, %%mm0\n\t"
 420          "cvtps2pi %3, %%mm1\n\t"
 421          "movq %%mm0, %0\n\t"
 422          "movq %%mm1, %1"
 423          : DBL_MMX_ARGS);
         // MMX registers were written above, hence the reset afterwards.
 424     reset_mmx();
 425 }
 426 SIMD_CONVERT_SPEC(4,int32_t,float,fround) {
         // Converts four int32_t to float and stores them with one unaligned
         // 128-bit write. Operands: %0 = oout[0], %1 = oin[0], %2 = oin[2].
         //   1. convert oin[2..3] into the low half of xmm0
         //   2. shufps 0x4E swaps the 64-bit halves, moving them to the high half
         //   3. convert oin[0..1] into the low half
         //   4. store all four floats to oout
 427     asm ("cvtpi2ps %2, %%xmm0\n\t"
 428          "shufps $0x4E,%%xmm0,%%xmm0\n\t"
 429          "cvtpi2ps %1, %%xmm0\n\t"
 430          "movups %%xmm0, %0"
 431          : "=m"(oout[0]) : "m"(oin[0]), "m"(oin[2]) : "xmm0", "memory");
 432 }
 433 template <> inline int32_t fround<float,int32_t>::operator()(const float& a) const {
 434     register int32_t rv;
 435     asm ("movss %1, %%xmm0\n\t"
 436          "cvtss2si %%xmm0, %0"
 437          : "=r"(rv) : "m"(a) : "xmm0" );
 438     return (rv);
 439 }
 440 template <> inline uint32_t fround<float,uint32_t>::operator()(const float& a) const {
 441     register uint32_t rv;
 442     asm ("movss %1, %%xmm0\n\t"
 443          "cvtss2si %%xmm0, %0"
 444          : "=r"(rv) : "m"(a) : "xmm0" );
 445     return (rv);
 446 }
 447
 448 SSE_IPASSIGN_SPEC(4,float)     // 128-bit copies from an iterator source, one per 4-element 32-bit tuple type
 449 SSE_IPASSIGN_SPEC(4,int32_t)
 450 SSE_IPASSIGN_SPEC(4,uint32_t)
 451
     // The SSE generator macros and operand list are local to this section.
 452 #undef SSE_IPASSIGN_SPEC
 453 #undef SSE_PASSIGN_SPEC
 454 #undef SSE_PKOP2_SPEC
 455 #undef STD_SSE_ARGS
 456 #endif // CPU_HAS_SSE
 457
 458 #undef SIMDA_RI
 459 #undef SIMDA_RO
 460 #undef SIMDA_WI
 461 #undef SIMDA_WO
 462 #undef SIMD_PACKEDOP_SPEC
 463
 464 } // namespace simd
 465 } // namespace ustl
 466
 467 #endif
 468