src/gromacs/simd/impl_x86_avx_256/impl_x86_avx_256_simd4_double.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2014,2015,2017, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #ifndef GMX_SIMD_IMPL_X86_AVX_256_SIMD4_DOUBLE_H
  37 #define GMX_SIMD_IMPL_X86_AVX_256_SIMD4_DOUBLE_H
  38
  39 #include "config.h"
  40
  41 #include <cassert>
  42 #include <cstddef>
  43
  44 #include <immintrin.h>
  45
  46 namespace gmx
  47 {
  48
  49 class Simd4Double
  50 {
  51     public:
  52         Simd4Double() {}
  53
  54         Simd4Double(double d) : simdInternal_(_mm256_set1_pd(d)) {}
  55
  56         // Internal utility constructor to simplify return statements
  57         Simd4Double(__m256d simd) : simdInternal_(simd) {}
  58
  59         __m256d  simdInternal_;
  60 };
  61
  62 class Simd4DBool
  63 {
  64     public:
  65         Simd4DBool() {}
  66
  67         //! \brief Construct from scalar bool
  68         Simd4DBool(bool b) : simdInternal_(_mm256_castsi256_pd(_mm256_set1_epi32( b ? 0xFFFFFFFF : 0))) {}
  69
  70         // Internal utility constructor to simplify return statements
  71         Simd4DBool(__m256d simd) : simdInternal_(simd) {}
  72
  73         __m256d  simdInternal_;
  74 };
  75
  76 static inline Simd4Double gmx_simdcall
  77 load4(const double *m)
  78 {
  79     assert(std::size_t(m) % 32 == 0);
  80     return {
  81                _mm256_load_pd(m)
  82     };
  83 }
  84
  85 static inline void gmx_simdcall
  86 store4(double *m, Simd4Double a)
  87 {
  88     assert(std::size_t(m) % 32 == 0);
  89     _mm256_store_pd(m, a.simdInternal_);
  90 }
  91
  92 static inline Simd4Double gmx_simdcall
  93 load4U(const double *m)
  94 {
  95     return {
  96                _mm256_loadu_pd(m)
  97     };
  98 }
  99
 100 static inline void gmx_simdcall
 101 store4U(double *m, Simd4Double a)
 102 {
 103     _mm256_storeu_pd(m, a.simdInternal_);
 104 }
 105
 106 static inline Simd4Double gmx_simdcall
 107 simd4SetZeroD()
 108 {
 109     return {
 110                _mm256_setzero_pd()
 111     };
 112 }
 113
 114 static inline Simd4Double gmx_simdcall
 115 operator&(Simd4Double a, Simd4Double b)
 116 {
 117     return {
 118                _mm256_and_pd(a.simdInternal_, b.simdInternal_)
 119     };
 120 }
 121
 122 static inline Simd4Double gmx_simdcall
 123 andNot(Simd4Double a, Simd4Double b)
 124 {
 125     return {
 126                _mm256_andnot_pd(a.simdInternal_, b.simdInternal_)
 127     };
 128 }
 129
 130 static inline Simd4Double gmx_simdcall
 131 operator|(Simd4Double a, Simd4Double b)
 132 {
 133     return {
 134                _mm256_or_pd(a.simdInternal_, b.simdInternal_)
 135     };
 136 }
 137
 138 static inline Simd4Double gmx_simdcall
 139 operator^(Simd4Double a, Simd4Double b)
 140 {
 141     return {
 142                _mm256_xor_pd(a.simdInternal_, b.simdInternal_)
 143     };
 144 }
 145
 146 static inline Simd4Double gmx_simdcall
 147 operator+(Simd4Double a, Simd4Double b)
 148 {
 149     return {
 150                _mm256_add_pd(a.simdInternal_, b.simdInternal_)
 151     };
 152 }
 153
 154 static inline Simd4Double gmx_simdcall
 155 operator-(Simd4Double a, Simd4Double b)
 156 {
 157     return {
 158                _mm256_sub_pd(a.simdInternal_, b.simdInternal_)
 159     };
 160 }
 161
 162 static inline Simd4Double gmx_simdcall
 163 operator-(Simd4Double x)
 164 {
 165     return {
 166                _mm256_xor_pd(x.simdInternal_, _mm256_set1_pd(GMX_DOUBLE_NEGZERO))
 167     };
 168 }
 169
 170 static inline Simd4Double gmx_simdcall
 171 operator*(Simd4Double a, Simd4Double b)
 172 {
 173     return {
 174                _mm256_mul_pd(a.simdInternal_, b.simdInternal_)
 175     };
 176 }
 177
// Non-fused (separate multiply and add/subtract) versions for plain AVX-256;
// these are overridden for AVX2 and higher.
 179 #if GMX_SIMD_X86_AVX_256
 180 static inline Simd4Double gmx_simdcall
 181 fma(Simd4Double a, Simd4Double b, Simd4Double c)
 182 {
 183     return {
 184                _mm256_add_pd(_mm256_mul_pd(a.simdInternal_, b.simdInternal_), c.simdInternal_)
 185     };
 186 }
 187
 188 static inline Simd4Double gmx_simdcall
 189 fms(Simd4Double a, Simd4Double b, Simd4Double c)
 190 {
 191     return {
 192                _mm256_sub_pd(_mm256_mul_pd(a.simdInternal_, b.simdInternal_), c.simdInternal_)
 193     };
 194 }
 195
 196 static inline Simd4Double gmx_simdcall
 197 fnma(Simd4Double a, Simd4Double b, Simd4Double c)
 198 {
 199     return {
 200                _mm256_sub_pd(c.simdInternal_, _mm256_mul_pd(a.simdInternal_, b.simdInternal_))
 201     };
 202 }
 203
 204 static inline Simd4Double gmx_simdcall
 205 fnms(Simd4Double a, Simd4Double b, Simd4Double c)
 206 {
 207     return {
 208                _mm256_sub_pd(_mm256_setzero_pd(), _mm256_add_pd(_mm256_mul_pd(a.simdInternal_, b.simdInternal_), c.simdInternal_))
 209     };
 210 }
 211 #endif
 212
 213 static inline Simd4Double gmx_simdcall
 214 rsqrt(Simd4Double x)
 215 {
 216     return {
 217                _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(x.simdInternal_)))
 218     };
 219 }
 220
 221 static inline Simd4Double gmx_simdcall
 222 abs(Simd4Double x)
 223 {
 224     return {
 225                _mm256_andnot_pd( _mm256_set1_pd(GMX_DOUBLE_NEGZERO), x.simdInternal_ )
 226     };
 227 }
 228
 229 static inline Simd4Double gmx_simdcall
 230 max(Simd4Double a, Simd4Double b)
 231 {
 232     return {
 233                _mm256_max_pd(a.simdInternal_, b.simdInternal_)
 234     };
 235 }
 236
 237 static inline Simd4Double gmx_simdcall
 238 min(Simd4Double a, Simd4Double b)
 239 {
 240     return {
 241                _mm256_min_pd(a.simdInternal_, b.simdInternal_)
 242     };
 243 }
 244
 245 static inline Simd4Double gmx_simdcall
 246 round(Simd4Double x)
 247 {
 248     return {
 249                _mm256_round_pd(x.simdInternal_, _MM_FROUND_NINT)
 250     };
 251 }
 252
 253 static inline Simd4Double gmx_simdcall
 254 trunc(Simd4Double x)
 255 {
 256     return {
 257                _mm256_round_pd(x.simdInternal_, _MM_FROUND_TRUNC)
 258     };
 259 }
 260
 261 static inline double gmx_simdcall
 262 dotProduct(Simd4Double a, Simd4Double b)
 263 {
 264     __m128d tmp1, tmp2;
 265     a.simdInternal_  = _mm256_mul_pd(a.simdInternal_, b.simdInternal_);
 266     tmp1             = _mm256_castpd256_pd128(a.simdInternal_);
 267     tmp2             = _mm256_extractf128_pd(a.simdInternal_, 0x1);
 268
 269     tmp1 = _mm_add_pd(tmp1, _mm_permute_pd(tmp1, _MM_SHUFFLE2(0, 1)));
 270     tmp1 = _mm_add_pd(tmp1, tmp2);
 271     return *reinterpret_cast<double *>(&tmp1);
 272 }
 273
 274 static inline void gmx_simdcall
 275 transpose(Simd4Double * v0, Simd4Double * v1,
 276           Simd4Double * v2, Simd4Double * v3)
 277 {
 278     __m256d t1, t2, t3, t4;
 279     t1                = _mm256_unpacklo_pd(v0->simdInternal_, v1->simdInternal_);
 280     t2                = _mm256_unpackhi_pd(v0->simdInternal_, v1->simdInternal_);
 281     t3                = _mm256_unpacklo_pd(v2->simdInternal_, v3->simdInternal_);
 282     t4                = _mm256_unpackhi_pd(v2->simdInternal_, v3->simdInternal_);
 283     v0->simdInternal_ = _mm256_permute2f128_pd(t1, t3, 0x20);
 284     v1->simdInternal_ = _mm256_permute2f128_pd(t2, t4, 0x20);
 285     v2->simdInternal_ = _mm256_permute2f128_pd(t1, t3, 0x31);
 286     v3->simdInternal_ = _mm256_permute2f128_pd(t2, t4, 0x31);
 287 }
 288
 289 static inline Simd4DBool gmx_simdcall
 290 operator==(Simd4Double a, Simd4Double b)
 291 {
 292     return {
 293                _mm256_cmp_pd(a.simdInternal_, b.simdInternal_, _CMP_EQ_OQ)
 294     };
 295 }
 296
 297 static inline Simd4DBool gmx_simdcall
 298 operator!=(Simd4Double a, Simd4Double b)
 299 {
 300     return {
 301                _mm256_cmp_pd(a.simdInternal_, b.simdInternal_, _CMP_NEQ_OQ)
 302     };
 303 }
 304
 305 static inline Simd4DBool gmx_simdcall
 306 operator<(Simd4Double a, Simd4Double b)
 307 {
 308     return {
 309                _mm256_cmp_pd(a.simdInternal_, b.simdInternal_, _CMP_LT_OQ)
 310     };
 311 }
 312
 313 static inline Simd4DBool gmx_simdcall
 314 operator<=(Simd4Double a, Simd4Double b)
 315 {
 316     return {
 317                _mm256_cmp_pd(a.simdInternal_, b.simdInternal_, _CMP_LE_OQ)
 318     };
 319 }
 320
 321 static inline Simd4DBool gmx_simdcall
 322 operator&&(Simd4DBool a, Simd4DBool b)
 323 {
 324     return {
 325                _mm256_and_pd(a.simdInternal_, b.simdInternal_)
 326     };
 327 }
 328
 329 static inline Simd4DBool gmx_simdcall
 330 operator||(Simd4DBool a, Simd4DBool b)
 331 {
 332     return {
 333                _mm256_or_pd(a.simdInternal_, b.simdInternal_)
 334     };
 335 }
 336
 337 static inline bool gmx_simdcall
 338 anyTrue(Simd4DBool a) { return _mm256_movemask_pd(a.simdInternal_) != 0; }
 339
 340 static inline Simd4Double gmx_simdcall
 341 selectByMask(Simd4Double a, Simd4DBool mask)
 342 {
 343     return {
 344                _mm256_and_pd(a.simdInternal_, mask.simdInternal_)
 345     };
 346 }
 347
 348 static inline Simd4Double gmx_simdcall
 349 selectByNotMask(Simd4Double a, Simd4DBool mask)
 350 {
 351     return {
 352                _mm256_andnot_pd(mask.simdInternal_, a.simdInternal_)
 353     };
 354 }
 355
 356 static inline Simd4Double gmx_simdcall
 357 blend(Simd4Double a, Simd4Double b, Simd4DBool sel)
 358 {
 359     return {
 360                _mm256_blendv_pd(a.simdInternal_, b.simdInternal_, sel.simdInternal_)
 361     };
 362 }
 363
 364 static inline double gmx_simdcall
 365 reduce(Simd4Double a)
 366 {
 367     __m128d a0, a1;
 368     // test with shuffle & add as an alternative to hadd later
 369     a.simdInternal_ = _mm256_hadd_pd(a.simdInternal_, a.simdInternal_);
 370     a0              = _mm256_castpd256_pd128(a.simdInternal_);
 371     a1              = _mm256_extractf128_pd(a.simdInternal_, 0x1);
 372     a0              = _mm_add_sd(a0, a1);
 373     return *reinterpret_cast<double *>(&a0);
 374 }
 375
 376 }      // namespace gmx
 377
 378 #endif // GMX_SIMD_IMPL_X86_AVX_256_SIMD4_DOUBLE_H