src/gromacs/simd/impl_x86_mic/impl_x86_mic_simd4_double.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2014,2015,2017, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #ifndef GMX_SIMD_IMPL_X86_MIC_SIMD4_DOUBLE_H
  37 #define GMX_SIMD_IMPL_X86_MIC_SIMD4_DOUBLE_H
  38
#include "config.h"

#include <cassert>
#include <cstddef>

#include <immintrin.h>

#include "gromacs/utility/basedefinitions.h"

#include "impl_x86_mic_simd_double.h"
  48
  49 namespace gmx
  50 {
  51
  52 class Simd4Double
  53 {
  54     public:
  55         Simd4Double() {}
  56
  57         Simd4Double(double d) : simdInternal_(_mm512_set1_pd(d)) {}
  58
  59         // Internal utility constructor to simplify return statements
  60         Simd4Double(__m512d simd) : simdInternal_(simd) {}
  61
  62         __m512d  simdInternal_;
  63 };
  64
  65 class Simd4DBool
  66 {
  67     public:
  68         Simd4DBool() {}
  69
  70         // Internal utility constructor to simplify return statements
  71         Simd4DBool(__mmask16 simd) : simdInternal_(simd) {}
  72
  73         __mmask16  simdInternal_;
  74 };
  75
  76 static inline Simd4Double gmx_simdcall
  77 load4(const double *m)
  78 {
  79     assert(size_t(m) % 32 == 0);
  80     return {
  81                _mm512_mask_extload_pd(_mm512_undefined_pd(), _mm512_int2mask(0xF), m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE)
  82     };
  83 }
  84
  85 static inline void gmx_simdcall
  86 store4(double *m, Simd4Double a)
  87 {
  88     assert(size_t(m) % 32 == 0);
  89     _mm512_mask_packstorelo_pd(m, _mm512_int2mask(0xF), a.simdInternal_);
  90 }
  91
  92 static inline Simd4Double gmx_simdcall
  93 load4U(const double *m)
  94 {
  95     return {
  96                _mm512_mask_loadunpackhi_pd(_mm512_mask_loadunpacklo_pd(_mm512_undefined_pd(), _mm512_int2mask(0xF), m), _mm512_int2mask(0xF), m+8)
  97     };
  98 }
  99
 100 static inline void gmx_simdcall
 101 store4U(double *m, Simd4Double a)
 102 {
 103     _mm512_mask_packstorelo_pd(m, _mm512_int2mask(0xF), a.simdInternal_);
 104     _mm512_mask_packstorehi_pd(m+8, _mm512_int2mask(0xF), a.simdInternal_);
 105 }
 106
 107 static inline Simd4Double gmx_simdcall
 108 simd4SetZeroD()
 109 {
 110     return {
 111                _mm512_setzero_pd()
 112     };
 113 }
 114
 115 static inline Simd4Double gmx_simdcall
 116 operator&(Simd4Double a, Simd4Double b)
 117 {
 118     return {
 119                _mm512_castsi512_pd(_mm512_mask_and_epi32(_mm512_undefined_epi32(), _mm512_int2mask(0x00FF), _mm512_castpd_si512(a.simdInternal_),
 120                                                          _mm512_castpd_si512(b.simdInternal_)))
 121     };
 122 }
 123
 124 static inline Simd4Double gmx_simdcall
 125 andNot(Simd4Double a, Simd4Double b)
 126 {
 127     return {
 128                _mm512_castsi512_pd(_mm512_mask_andnot_epi32(_mm512_undefined_epi32(), _mm512_int2mask(0x00FF), _mm512_castpd_si512(a.simdInternal_),
 129                                                             _mm512_castpd_si512(b.simdInternal_)))
 130     };
 131 }
 132
 133 static inline Simd4Double gmx_simdcall
 134 operator|(Simd4Double a, Simd4Double b)
 135 {
 136     return {
 137                _mm512_castsi512_pd(_mm512_mask_or_epi32(_mm512_undefined_epi32(), _mm512_int2mask(0x00FF), _mm512_castpd_si512(a.simdInternal_),
 138                                                         _mm512_castpd_si512(b.simdInternal_)))
 139     };
 140 }
 141
 142 static inline Simd4Double gmx_simdcall
 143 operator^(Simd4Double a, Simd4Double b)
 144 {
 145     return {
 146                _mm512_castsi512_pd(_mm512_mask_xor_epi32(_mm512_undefined_epi32(), _mm512_int2mask(0x00FF), _mm512_castpd_si512(a.simdInternal_),
 147                                                          _mm512_castpd_si512(b.simdInternal_)))
 148     };
 149 }
 150
 151 static inline Simd4Double gmx_simdcall
 152 operator+(Simd4Double a, Simd4Double b)
 153 {
 154     return {
 155                _mm512_mask_add_pd(_mm512_undefined_pd(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_)
 156     };
 157 }
 158
 159 static inline Simd4Double gmx_simdcall
 160 operator-(Simd4Double a, Simd4Double b)
 161 {
 162     return {
 163                _mm512_mask_sub_pd(_mm512_undefined_pd(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_)
 164     };
 165 }
 166
 167 static inline Simd4Double gmx_simdcall
 168 operator-(Simd4Double x)
 169 {
 170     return {
 171                _mm512_mask_addn_pd(_mm512_undefined_pd(), _mm512_int2mask(0xF), x.simdInternal_, _mm512_setzero_pd())
 172     };
 173 }
 174
 175 static inline Simd4Double gmx_simdcall
 176 operator*(Simd4Double a, Simd4Double b)
 177 {
 178     return {
 179                _mm512_mask_mul_pd(_mm512_undefined_pd(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_)
 180     };
 181 }
 182
 183 static inline Simd4Double gmx_simdcall
 184 fma(Simd4Double a, Simd4Double b, Simd4Double c)
 185 {
 186     return {
 187                _mm512_mask_fmadd_pd(a.simdInternal_, _mm512_int2mask(0xF), b.simdInternal_, c.simdInternal_)
 188     };
 189 }
 190
 191 static inline Simd4Double gmx_simdcall
 192 fms(Simd4Double a, Simd4Double b, Simd4Double c)
 193 {
 194     return {
 195                _mm512_mask_fmsub_pd(a.simdInternal_, _mm512_int2mask(0xF), b.simdInternal_, c.simdInternal_)
 196     };
 197 }
 198
 199 static inline Simd4Double gmx_simdcall
 200 fnma(Simd4Double a, Simd4Double b, Simd4Double c)
 201 {
 202     return {
 203                _mm512_mask_fnmadd_pd(a.simdInternal_, _mm512_int2mask(0xF), b.simdInternal_, c.simdInternal_)
 204     };
 205 }
 206
 207 static inline Simd4Double gmx_simdcall
 208 fnms(Simd4Double a, Simd4Double b, Simd4Double c)
 209 {
 210     return {
 211                _mm512_mask_fnmsub_pd(a.simdInternal_, _mm512_int2mask(0xF), b.simdInternal_, c.simdInternal_)
 212     };
 213 }
 214
 215 static inline Simd4Double gmx_simdcall
 216 rsqrt(Simd4Double x)
 217 {
 218     return {
 219                _mm512_mask_cvtpslo_pd(_mm512_undefined_pd(),
 220                                       _mm512_int2mask(0xF),
 221                                       _mm512_mask_rsqrt23_ps(_mm512_undefined_ps(),
 222                                                              _mm512_int2mask(0xF),
 223                                                              _mm512_mask_cvtpd_pslo(_mm512_undefined_ps(),
 224                                                                                     _mm512_int2mask(0xF), x.simdInternal_)))
 225     };
 226 }
 227
 228 static inline Simd4Double gmx_simdcall
 229 abs(Simd4Double x)
 230 {
 231     return {
 232                _mm512_castsi512_pd(_mm512_mask_andnot_epi32(_mm512_undefined_epi32(), _mm512_int2mask(0x00FF),
 233                                                             _mm512_castpd_si512(_mm512_set1_pd(GMX_DOUBLE_NEGZERO)),
 234                                                             _mm512_castpd_si512(x.simdInternal_)))
 235
 236     };
 237 }
 238
 239 static inline Simd4Double gmx_simdcall
 240 max(Simd4Double a, Simd4Double b)
 241 {
 242     return {
 243                _mm512_mask_gmax_pd(_mm512_undefined_pd(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_)
 244     };
 245 }
 246
 247 static inline Simd4Double gmx_simdcall
 248 min(Simd4Double a, Simd4Double b)
 249 {
 250     return {
 251                _mm512_mask_gmin_pd(_mm512_undefined_pd(), _mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_)
 252     };
 253 }
 254
 255 static inline Simd4Double gmx_simdcall
 256 round(Simd4Double x)
 257 {
 258     return {
 259                _mm512_mask_roundfxpnt_adjust_pd(_mm512_undefined_pd(), _mm512_int2mask(0xF), x.simdInternal_, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE)
 260     };
 261 }
 262
 263 static inline Simd4Double gmx_simdcall
 264 trunc(Simd4Double x)
 265 {
 266     return {
 267                _mm512_mask_roundfxpnt_adjust_pd(_mm512_undefined_pd(), _mm512_int2mask(0xF), x.simdInternal_, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE)
 268     };
 269 }
 270
 271 static inline double gmx_simdcall
 272 dotProduct(Simd4Double a, Simd4Double b)
 273 {
 274     return _mm512_mask_reduce_add_pd(_mm512_int2mask(7),
 275                                      _mm512_mask_mul_pd(_mm512_undefined_pd(), _mm512_int2mask(7),
 276                                                         a.simdInternal_, b.simdInternal_));
 277 }
 278
 279 static inline void gmx_simdcall
 280 transpose(Simd4Double * v0, Simd4Double * v1,
 281           Simd4Double * v2, Simd4Double * v3)
 282 {
 283     __m512i t0 = _mm512_mask_permute4f128_epi32(_mm512_castpd_si512(v0->simdInternal_), 0xFF00,
 284                                                 _mm512_castpd_si512(v1->simdInternal_), _MM_PERM_BABA);
 285     __m512i t1 = _mm512_mask_permute4f128_epi32(_mm512_castpd_si512(v2->simdInternal_), 0xFF00,
 286                                                 _mm512_castpd_si512(v3->simdInternal_), _MM_PERM_BABA);
 287
 288     t0 = _mm512_permutevar_epi32(_mm512_set_epi32(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0), t0);
 289     t1 = _mm512_permutevar_epi32(_mm512_set_epi32(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0), t1);
 290
 291     v0->simdInternal_ = _mm512_mask_swizzle_pd(_mm512_castsi512_pd(t0), _mm512_int2mask(0xCC),
 292                                                _mm512_castsi512_pd(t1), _MM_SWIZ_REG_BADC);
 293     v1->simdInternal_ = _mm512_mask_swizzle_pd(_mm512_castsi512_pd(t1), _mm512_int2mask(0x33),
 294                                                _mm512_castsi512_pd(t0), _MM_SWIZ_REG_BADC);
 295
 296     v2->simdInternal_ = _mm512_castps_pd(_mm512_permute4f128_ps(_mm512_castpd_ps(v0->simdInternal_), _MM_PERM_DCDC));
 297     v3->simdInternal_ = _mm512_castps_pd(_mm512_permute4f128_ps(_mm512_castpd_ps(v1->simdInternal_), _MM_PERM_DCDC));
 298 }
 299
// Note on the predicates used by the comparison operators below:
// icc-16 complains about "Illegal value of immediate argument to intrinsic"
// unless we use
// 1) Ordered-quiet for ==
// 2) Unordered-quiet for !=
// 3) Ordered-signaling for < and <=
 306
 307 static inline Simd4DBool gmx_simdcall
 308 operator==(Simd4Double a, Simd4Double b)
 309 {
 310     return {
 311                _mm512_mask_cmp_pd_mask(_mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_, _CMP_EQ_OQ)
 312     };
 313 }
 314
 315 static inline Simd4DBool gmx_simdcall
 316 operator!=(Simd4Double a, Simd4Double b)
 317 {
 318     return {
 319                _mm512_mask_cmp_pd_mask(_mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_, _CMP_NEQ_UQ)
 320     };
 321 }
 322
 323 static inline Simd4DBool gmx_simdcall
 324 operator<(Simd4Double a, Simd4Double b)
 325 {
 326     return {
 327                _mm512_mask_cmp_pd_mask(_mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_, _CMP_LT_OS)
 328     };
 329 }
 330
 331 static inline Simd4DBool gmx_simdcall
 332 operator<=(Simd4Double a, Simd4Double b)
 333 {
 334     return {
 335                _mm512_mask_cmp_pd_mask(_mm512_int2mask(0xF), a.simdInternal_, b.simdInternal_, _CMP_LE_OS)
 336     };
 337 }
 338
 339 static inline Simd4DBool gmx_simdcall
 340 operator&&(Simd4DBool a, Simd4DBool b)
 341 {
 342     return {
 343                _mm512_kand(a.simdInternal_, b.simdInternal_)
 344     };
 345 }
 346
 347 static inline Simd4DBool gmx_simdcall
 348 operator||(Simd4DBool a, Simd4DBool b)
 349 {
 350     return {
 351                _mm512_kor(a.simdInternal_, b.simdInternal_)
 352     };
 353 }
 354
 355 static inline bool gmx_simdcall
 356 anyTrue(Simd4DBool a)
 357 {
 358     return (_mm512_mask2int(a.simdInternal_) & 0xF) != 0;
 359 }
 360
 361 static inline Simd4Double gmx_simdcall
 362 selectByMask(Simd4Double a, Simd4DBool m)
 363 {
 364     return {
 365                _mm512_mask_mov_pd(_mm512_setzero_pd(), m.simdInternal_, a.simdInternal_)
 366     };
 367 }
 368
 369 static inline Simd4Double gmx_simdcall
 370 selectByNotMask(Simd4Double a, Simd4DBool m)
 371 {
 372     return {
 373                _mm512_mask_mov_pd(_mm512_setzero_pd(), _mm512_knot(m.simdInternal_), a.simdInternal_)
 374     };
 375 }
 376
 377 static inline Simd4Double gmx_simdcall
 378 blend(Simd4Double a, Simd4Double b, Simd4DBool sel)
 379 {
 380     return {
 381                _mm512_mask_blend_pd(sel.simdInternal_, a.simdInternal_, b.simdInternal_)
 382     };
 383 }
 384
 385 static inline double gmx_simdcall
 386 reduce(Simd4Double a)
 387 {
 388     return _mm512_mask_reduce_add_pd(_mm512_int2mask(0xF), a.simdInternal_);
 389 }
 390
 391 }      // namespace gmx
 392
 393 #endif // GMX_SIMD_IMPL_X86_MIC_SIMD4_DOUBLE_H