src/gromacs/simd/impl_x86_sse2/impl_x86_sse2_simd4_float.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2014,2015, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 #ifndef GMX_SIMD_IMPL_X86_SSE2_SIMD4_FLOAT_H
  36 #define GMX_SIMD_IMPL_X86_SSE2_SIMD4_FLOAT_H
  37
  38 #include "config.h"
  39
  40 #include <cassert>
  41 #include <cstddef>
  42
  43 #include <emmintrin.h>
  44
  45 namespace gmx
  46 {
  47
  48 class Simd4Float
  49 {
  50     public:
  51         Simd4Float() {}
  52
  53         Simd4Float(float f) : simdInternal_(_mm_set1_ps(f)) {}
  54
  55         // Internal utility constructor to simplify return statements
  56         Simd4Float(__m128 simd) : simdInternal_(simd) {}
  57
  58         __m128  simdInternal_;
  59 };
  60
  61 class Simd4FBool
  62 {
  63     public:
  64         Simd4FBool() {}
  65
  66         //! \brief Construct from scalar bool
  67         Simd4FBool(bool b) : simdInternal_(_mm_castsi128_ps(_mm_set1_epi32( b ? 0xFFFFFFFF : 0))) {}
  68
  69         // Internal utility constructor to simplify return statements
  70         Simd4FBool(__m128 simd) : simdInternal_(simd) {}
  71
  72         __m128  simdInternal_;
  73 };
  74
  75 static inline Simd4Float gmx_simdcall
  76 load4(const float *m)
  77 {
  78     assert(size_t(m) % 16 == 0);
  79     return {
  80                _mm_load_ps(m)
  81     };
  82 }
  83
  84 static inline void gmx_simdcall
  85 store4(float *m, Simd4Float a)
  86 {
  87     assert(size_t(m) % 16 == 0);
  88     _mm_store_ps(m, a.simdInternal_);
  89 }
  90
  91 static inline Simd4Float gmx_simdcall
  92 load4U(const float *m)
  93 {
  94     return {
  95                _mm_loadu_ps(m)
  96     };
  97 }
  98
  99 static inline void gmx_simdcall
 100 store4U(float *m, Simd4Float a)
 101 {
 102     _mm_storeu_ps(m, a.simdInternal_);
 103 }
 104
 105 static inline Simd4Float gmx_simdcall
 106 simd4SetZeroF()
 107 {
 108     return {
 109                _mm_setzero_ps()
 110     };
 111 }
 112
 113 static inline Simd4Float gmx_simdcall
 114 operator&(Simd4Float a, Simd4Float b)
 115 {
 116     return {
 117                _mm_and_ps(a.simdInternal_, b.simdInternal_)
 118     };
 119 }
 120
 121 static inline Simd4Float gmx_simdcall
 122 andNot(Simd4Float a, Simd4Float b)
 123 {
 124     return {
 125                _mm_andnot_ps(a.simdInternal_, b.simdInternal_)
 126     };
 127 }
 128
 129 static inline Simd4Float gmx_simdcall
 130 operator|(Simd4Float a, Simd4Float b)
 131 {
 132     return {
 133                _mm_or_ps(a.simdInternal_, b.simdInternal_)
 134     };
 135 }
 136
 137 static inline Simd4Float gmx_simdcall
 138 operator^(Simd4Float a, Simd4Float b)
 139 {
 140     return {
 141                _mm_xor_ps(a.simdInternal_, b.simdInternal_)
 142     };
 143 }
 144
 145 static inline Simd4Float gmx_simdcall
 146 operator+(Simd4Float a, Simd4Float b)
 147 {
 148     return {
 149                _mm_add_ps(a.simdInternal_, b.simdInternal_)
 150     };
 151 }
 152
 153 static inline Simd4Float gmx_simdcall
 154 operator-(Simd4Float a, Simd4Float b)
 155 {
 156     return {
 157                _mm_sub_ps(a.simdInternal_, b.simdInternal_)
 158     };
 159 }
 160
 161 static inline Simd4Float gmx_simdcall
 162 operator-(Simd4Float x)
 163 {
 164     return {
 165                _mm_xor_ps(x.simdInternal_, _mm_set1_ps(GMX_FLOAT_NEGZERO))
 166     };
 167 }
 168
 169 static inline Simd4Float gmx_simdcall
 170 operator*(Simd4Float a, Simd4Float b)
 171 {
 172     return {
 173                _mm_mul_ps(a.simdInternal_, b.simdInternal_)
 174     };
 175 }
 176
 177 // Override for AVX-128-FMA and higher
 178 #if GMX_SIMD_X86_SSE2 || GMX_SIMD_X86_SSE4_1
 179 static inline Simd4Float gmx_simdcall
 180 fma(Simd4Float a, Simd4Float b, Simd4Float c)
 181 {
 182     return {
 183                _mm_add_ps(_mm_mul_ps(a.simdInternal_, b.simdInternal_), c.simdInternal_)
 184     };
 185 }
 186
 187 static inline Simd4Float gmx_simdcall
 188 fms(Simd4Float a, Simd4Float b, Simd4Float c)
 189 {
 190     return {
 191                _mm_sub_ps(_mm_mul_ps(a.simdInternal_, b.simdInternal_), c.simdInternal_)
 192     };
 193 }
 194
 195 static inline Simd4Float gmx_simdcall
 196 fnma(Simd4Float a, Simd4Float b, Simd4Float c)
 197 {
 198     return {
 199                _mm_sub_ps(c.simdInternal_, _mm_mul_ps(a.simdInternal_, b.simdInternal_))
 200     };
 201 }
 202
 203 static inline Simd4Float gmx_simdcall
 204 fnms(Simd4Float a, Simd4Float b, Simd4Float c)
 205 {
 206     return {
 207                _mm_sub_ps(_mm_setzero_ps(), _mm_add_ps(_mm_mul_ps(a.simdInternal_, b.simdInternal_), c.simdInternal_))
 208     };
 209 }
 210 #endif
 211
 212 static inline Simd4Float gmx_simdcall
 213 rsqrt(Simd4Float x)
 214 {
 215     return {
 216                _mm_rsqrt_ps(x.simdInternal_)
 217     };
 218 }
 219
 220 static inline Simd4Float gmx_simdcall
 221 abs(Simd4Float x)
 222 {
 223     return {
 224                _mm_andnot_ps( _mm_set1_ps(GMX_FLOAT_NEGZERO), x.simdInternal_ )
 225     };
 226 }
 227
 228 static inline Simd4Float gmx_simdcall
 229 max(Simd4Float a, Simd4Float b)
 230 {
 231     return {
 232                _mm_max_ps(a.simdInternal_, b.simdInternal_)
 233     };
 234 }
 235
 236 static inline Simd4Float gmx_simdcall
 237 min(Simd4Float a, Simd4Float b)
 238 {
 239     return {
 240                _mm_min_ps(a.simdInternal_, b.simdInternal_)
 241     };
 242 }
 243
 244 // Override for SSE4.1 and higher
 245 #if GMX_SIMD_X86_SSE2
 246 static inline Simd4Float gmx_simdcall
 247 round(Simd4Float x)
 248 {
 249     return {
 250                _mm_cvtepi32_ps( _mm_cvtps_epi32(x.simdInternal_) )
 251     };
 252 }
 253
 254 static inline Simd4Float gmx_simdcall
 255 trunc(Simd4Float x)
 256 {
 257     return {
 258                _mm_cvtepi32_ps( _mm_cvttps_epi32(x.simdInternal_) )
 259     };
 260 }
 261
 262 static inline float gmx_simdcall
 263 dotProduct(Simd4Float a, Simd4Float b)
 264 {
 265     __m128 c, d;
 266     c = _mm_mul_ps(a.simdInternal_, b.simdInternal_);
 267     d = _mm_add_ps(c, _mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 1, 2, 1)));
 268     d = _mm_add_ps(d, _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 2, 3, 2)));
 269     return *reinterpret_cast<float *>(&d);
 270 }
 271 #endif
 272
 273 static inline void gmx_simdcall
 274 transpose(Simd4Float * v0, Simd4Float * v1,
 275           Simd4Float * v2, Simd4Float * v3)
 276 {
 277     _MM_TRANSPOSE4_PS(v0->simdInternal_, v1->simdInternal_, v2->simdInternal_, v3->simdInternal_);
 278 }
 279
 280 static inline Simd4FBool gmx_simdcall
 281 operator==(Simd4Float a, Simd4Float b)
 282 {
 283     return {
 284                _mm_cmpeq_ps(a.simdInternal_, b.simdInternal_)
 285     };
 286 }
 287
 288 static inline Simd4FBool gmx_simdcall
 289 operator!=(Simd4Float a, Simd4Float b)
 290 {
 291     return {
 292                _mm_cmpneq_ps(a.simdInternal_, b.simdInternal_)
 293     };
 294 }
 295
 296 static inline Simd4FBool gmx_simdcall
 297 operator<(Simd4Float a, Simd4Float b)
 298 {
 299     return {
 300                _mm_cmplt_ps(a.simdInternal_, b.simdInternal_)
 301     };
 302 }
 303
 304 static inline Simd4FBool gmx_simdcall
 305 operator<=(Simd4Float a, Simd4Float b)
 306 {
 307     return {
 308                _mm_cmple_ps(a.simdInternal_, b.simdInternal_)
 309     };
 310 }
 311
 312 static inline Simd4FBool gmx_simdcall
 313 operator&&(Simd4FBool a, Simd4FBool b)
 314 {
 315     return {
 316                _mm_and_ps(a.simdInternal_, b.simdInternal_)
 317     };
 318 }
 319
 320 static inline Simd4FBool gmx_simdcall
 321 operator||(Simd4FBool a, Simd4FBool b)
 322 {
 323     return {
 324                _mm_or_ps(a.simdInternal_, b.simdInternal_)
 325     };
 326 }
 327
 328 static inline bool gmx_simdcall
 329 anyTrue(Simd4FBool a) { return _mm_movemask_ps(a.simdInternal_) != 0; }
 330
 331 static inline Simd4Float gmx_simdcall
 332 selectByMask(Simd4Float a, Simd4FBool mask)
 333 {
 334     return {
 335                _mm_and_ps(a.simdInternal_, mask.simdInternal_)
 336     };
 337 }
 338
 339 static inline Simd4Float gmx_simdcall
 340 selectByNotMask(Simd4Float a, Simd4FBool mask)
 341 {
 342     return {
 343                _mm_andnot_ps(mask.simdInternal_, a.simdInternal_)
 344     };
 345 }
 346
 347 // Override for SSE4.1 and higher
 348 #if GMX_SIMD_X86_SSE2
 349 static inline Simd4Float gmx_simdcall
 350 blend(Simd4Float a, Simd4Float b, Simd4FBool sel)
 351 {
 352     return {
 353                _mm_or_ps(_mm_andnot_ps(sel.simdInternal_, a.simdInternal_), _mm_and_ps(sel.simdInternal_, b.simdInternal_))
 354     };
 355 }
 356 #endif
 357
 358 // Override for AVX-128-FMA and higher
 359 #if GMX_SIMD_X86_SSE2 || GMX_SIMD_X86_SSE4_1
 360 static inline float gmx_simdcall
 361 reduce(Simd4Float a)
 362 {
 363     __m128 b;
 364     b = _mm_add_ps(a.simdInternal_, _mm_shuffle_ps(a.simdInternal_, a.simdInternal_, _MM_SHUFFLE(1, 0, 3, 2)));
 365     b = _mm_add_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 3, 2, 1)));
 366     return *reinterpret_cast<float *>(&b);
 367 }
 368 #endif
 369
 370 }      // namespace gmx
 371
 372 #endif // GMX_SIMD_IMPL_X86_SSE2_SIMD4_FLOAT_H