src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_simd_double.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2014,2015,2016,2017,2019,2020, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #ifndef GMX_SIMD_IMPL_ARM_NEON_ASIMD_SIMD_DOUBLE_H
  37 #define GMX_SIMD_IMPL_ARM_NEON_ASIMD_SIMD_DOUBLE_H
  38
  39 #include "config.h"
  40
  41 #include <cassert>
  42
  43 #include <arm_neon.h>
  44
  45 #include "gromacs/math/utilities.h"
  46
  47 #include "impl_arm_neon_asimd_simd_float.h"
  48
  49 namespace gmx
  50 {
  51
  52 class SimdDouble
  53 {
  54 public:
  55     SimdDouble() {}
  56
  57     SimdDouble(double d) : simdInternal_(vdupq_n_f64(d)) {}
  58
  59     // Internal utility constructor to simplify return statements
  60     SimdDouble(float64x2_t simd) : simdInternal_(simd) {}
  61
  62     float64x2_t simdInternal_;
  63 };
  64
  65 class SimdDInt32
  66 {
  67 public:
  68     SimdDInt32() {}
  69
  70     SimdDInt32(std::int32_t i) : simdInternal_(vdup_n_s32(i)) {}
  71
  72     // Internal utility constructor to simplify return statements
  73     SimdDInt32(int32x2_t simd) : simdInternal_(simd) {}
  74
  75     int32x2_t simdInternal_;
  76 };
  77
  78 class SimdDBool
  79 {
  80 public:
  81     SimdDBool() {}
  82
  83     SimdDBool(bool b) : simdInternal_(vdupq_n_u64(b ? 0xFFFFFFFFFFFFFFFF : 0)) {}
  84
  85     // Internal utility constructor to simplify return statements
  86     SimdDBool(uint64x2_t simd) : simdInternal_(simd) {}
  87
  88     uint64x2_t simdInternal_;
  89 };
  90
  91 class SimdDIBool
  92 {
  93 public:
  94     SimdDIBool() {}
  95
  96     SimdDIBool(bool b) : simdInternal_(vdup_n_u32(b ? 0xFFFFFFFF : 0)) {}
  97
  98     // Internal utility constructor to simplify return statements
  99     SimdDIBool(uint32x2_t simd) : simdInternal_(simd) {}
 100
 101     uint32x2_t simdInternal_;
 102 };
 103
 104 static inline SimdDouble gmx_simdcall simdLoad(const double* m, SimdDoubleTag = {})
 105 {
 106     assert(std::size_t(m) % 16 == 0);
 107     return { vld1q_f64(m) };
 108 }
 109
 110 static inline void gmx_simdcall store(double* m, SimdDouble a)
 111 {
 112     assert(std::size_t(m) % 16 == 0);
 113     vst1q_f64(m, a.simdInternal_);
 114 }
 115
 116 static inline SimdDouble gmx_simdcall simdLoadU(const double* m, SimdDoubleTag = {})
 117 {
 118     return { vld1q_f64(m) };
 119 }
 120
 121 static inline void gmx_simdcall storeU(double* m, SimdDouble a)
 122 {
 123     vst1q_f64(m, a.simdInternal_);
 124 }
 125
 126 static inline SimdDouble gmx_simdcall setZeroD()
 127 {
 128     return { vdupq_n_f64(0.0) };
 129 }
 130
 131 static inline SimdDInt32 gmx_simdcall simdLoad(const std::int32_t* m, SimdDInt32Tag)
 132 {
 133     assert(std::size_t(m) % 8 == 0);
 134     return { vld1_s32(m) };
 135 }
 136
 137 static inline void gmx_simdcall store(std::int32_t* m, SimdDInt32 a)
 138 {
 139     assert(std::size_t(m) % 8 == 0);
 140     vst1_s32(m, a.simdInternal_);
 141 }
 142
 143 static inline SimdDInt32 gmx_simdcall simdLoadU(const std::int32_t* m, SimdDInt32Tag)
 144 {
 145     return { vld1_s32(m) };
 146 }
 147
 148 static inline void gmx_simdcall storeU(std::int32_t* m, SimdDInt32 a)
 149 {
 150     vst1_s32(m, a.simdInternal_);
 151 }
 152
 153 static inline SimdDInt32 gmx_simdcall setZeroDI()
 154 {
 155     return { vdup_n_s32(0) };
 156 }
 157
 158 template<int index>
 159 gmx_simdcall static inline std::int32_t extract(SimdDInt32 a)
 160 {
 161     return vget_lane_s32(a.simdInternal_, index);
 162 }
 163
 164 static inline SimdDouble gmx_simdcall operator&(SimdDouble a, SimdDouble b)
 165 {
 166     return { float64x2_t(vandq_s64(int64x2_t(a.simdInternal_), int64x2_t(b.simdInternal_))) };
 167 }
 168
 169 static inline SimdDouble gmx_simdcall andNot(SimdDouble a, SimdDouble b)
 170 {
 171     return { float64x2_t(vbicq_s64(int64x2_t(b.simdInternal_), int64x2_t(a.simdInternal_))) };
 172 }
 173
 174 static inline SimdDouble gmx_simdcall operator|(SimdDouble a, SimdDouble b)
 175 {
 176     return { float64x2_t(vorrq_s64(int64x2_t(a.simdInternal_), int64x2_t(b.simdInternal_))) };
 177 }
 178
 179 static inline SimdDouble gmx_simdcall operator^(SimdDouble a, SimdDouble b)
 180 {
 181     return { float64x2_t(veorq_s64(int64x2_t(a.simdInternal_), int64x2_t(b.simdInternal_))) };
 182 }
 183
 184 static inline SimdDouble gmx_simdcall operator+(SimdDouble a, SimdDouble b)
 185 {
 186     return { vaddq_f64(a.simdInternal_, b.simdInternal_) };
 187 }
 188
 189 static inline SimdDouble gmx_simdcall operator-(SimdDouble a, SimdDouble b)
 190 {
 191     return { vsubq_f64(a.simdInternal_, b.simdInternal_) };
 192 }
 193
 194 static inline SimdDouble gmx_simdcall operator-(SimdDouble x)
 195 {
 196     return { vnegq_f64(x.simdInternal_) };
 197 }
 198
 199 static inline SimdDouble gmx_simdcall operator*(SimdDouble a, SimdDouble b)
 200 {
 201     return { vmulq_f64(a.simdInternal_, b.simdInternal_) };
 202 }
 203
 204 static inline SimdDouble gmx_simdcall fma(SimdDouble a, SimdDouble b, SimdDouble c)
 205 {
 206     return { vfmaq_f64(c.simdInternal_, b.simdInternal_, a.simdInternal_) };
 207 }
 208
 209 static inline SimdDouble gmx_simdcall fms(SimdDouble a, SimdDouble b, SimdDouble c)
 210 {
 211     return { vnegq_f64(vfmsq_f64(c.simdInternal_, b.simdInternal_, a.simdInternal_)) };
 212 }
 213
 214 static inline SimdDouble gmx_simdcall fnma(SimdDouble a, SimdDouble b, SimdDouble c)
 215 {
 216     return { vfmsq_f64(c.simdInternal_, b.simdInternal_, a.simdInternal_) };
 217 }
 218
 219 static inline SimdDouble gmx_simdcall fnms(SimdDouble a, SimdDouble b, SimdDouble c)
 220 {
 221     return { vnegq_f64(vfmaq_f64(c.simdInternal_, b.simdInternal_, a.simdInternal_)) };
 222 }
 223
 224 static inline SimdDouble gmx_simdcall rsqrt(SimdDouble x)
 225 {
 226     return { vrsqrteq_f64(x.simdInternal_) };
 227 }
 228
 229 static inline SimdDouble gmx_simdcall rsqrtIter(SimdDouble lu, SimdDouble x)
 230 {
 231     return { vmulq_f64(lu.simdInternal_,
 232                        vrsqrtsq_f64(vmulq_f64(lu.simdInternal_, lu.simdInternal_), x.simdInternal_)) };
 233 }
 234
 235 static inline SimdDouble gmx_simdcall rcp(SimdDouble x)
 236 {
 237     return { vrecpeq_f64(x.simdInternal_) };
 238 }
 239
 240 static inline SimdDouble gmx_simdcall rcpIter(SimdDouble lu, SimdDouble x)
 241 {
 242     return { vmulq_f64(lu.simdInternal_, vrecpsq_f64(lu.simdInternal_, x.simdInternal_)) };
 243 }
 244
 245 static inline SimdDouble gmx_simdcall maskAdd(SimdDouble a, SimdDouble b, SimdDBool m)
 246 {
 247     float64x2_t addend = float64x2_t(vandq_u64(uint64x2_t(b.simdInternal_), m.simdInternal_));
 248
 249     return { vaddq_f64(a.simdInternal_, addend) };
 250 }
 251
 252 static inline SimdDouble gmx_simdcall maskzMul(SimdDouble a, SimdDouble b, SimdDBool m)
 253 {
 254     float64x2_t prod = vmulq_f64(a.simdInternal_, b.simdInternal_);
 255     return { float64x2_t(vandq_u64(uint64x2_t(prod), m.simdInternal_)) };
 256 }
 257
 258 static inline SimdDouble gmx_simdcall maskzFma(SimdDouble a, SimdDouble b, SimdDouble c, SimdDBool m)
 259 {
 260     float64x2_t prod = vfmaq_f64(c.simdInternal_, b.simdInternal_, a.simdInternal_);
 261
 262     return { float64x2_t(vandq_u64(uint64x2_t(prod), m.simdInternal_)) };
 263 }
 264
 265 static inline SimdDouble gmx_simdcall maskzRsqrt(SimdDouble x, SimdDBool m)
 266 {
 267     // The result will always be correct since we mask the result with m, but
 268     // for debug builds we also want to make sure not to generate FP exceptions
 269 #ifndef NDEBUG
 270     x.simdInternal_ = vbslq_f64(m.simdInternal_, x.simdInternal_, vdupq_n_f64(1.0));
 271 #endif
 272     return { float64x2_t(vandq_u64(uint64x2_t(vrsqrteq_f64(x.simdInternal_)), m.simdInternal_)) };
 273 }
 274
 275 static inline SimdDouble gmx_simdcall maskzRcp(SimdDouble x, SimdDBool m)
 276 {
 277     // The result will always be correct since we mask the result with m, but
 278     // for debug builds we also want to make sure not to generate FP exceptions
 279 #ifndef NDEBUG
 280     x.simdInternal_ = vbslq_f64(m.simdInternal_, x.simdInternal_, vdupq_n_f64(1.0));
 281 #endif
 282     return { float64x2_t(vandq_u64(uint64x2_t(vrecpeq_f64(x.simdInternal_)), m.simdInternal_)) };
 283 }
 284
 285 static inline SimdDouble gmx_simdcall abs(SimdDouble x)
 286 {
 287     return { vabsq_f64(x.simdInternal_) };
 288 }
 289
 290 static inline SimdDouble gmx_simdcall max(SimdDouble a, SimdDouble b)
 291 {
 292     return { vmaxq_f64(a.simdInternal_, b.simdInternal_) };
 293 }
 294
 295 static inline SimdDouble gmx_simdcall min(SimdDouble a, SimdDouble b)
 296 {
 297     return { vminq_f64(a.simdInternal_, b.simdInternal_) };
 298 }
 299
 300 static inline SimdDouble gmx_simdcall round(SimdDouble x)
 301 {
 302     return { vrndnq_f64(x.simdInternal_) };
 303 }
 304
 305 static inline SimdDouble gmx_simdcall trunc(SimdDouble x)
 306 {
 307     return { vrndq_f64(x.simdInternal_) };
 308 }
 309
 310 template<MathOptimization opt = MathOptimization::Safe>
 311 static inline SimdDouble frexp(SimdDouble value, SimdDInt32* exponent)
 312 {
 313     const float64x2_t exponentMask = float64x2_t(vdupq_n_s64(0x7FF0000000000000LL));
 314     const float64x2_t mantissaMask = float64x2_t(vdupq_n_s64(0x800FFFFFFFFFFFFFLL));
 315
 316     const int64x2_t exponentBias = vdupq_n_s64(1022); // add 1 to make our definition identical to frexp()
 317     const float64x2_t half = vdupq_n_f64(0.5);
 318     int64x2_t         iExponent;
 319
 320     iExponent               = vandq_s64(int64x2_t(value.simdInternal_), int64x2_t(exponentMask));
 321     iExponent               = vsubq_s64(vshrq_n_s64(iExponent, 52), exponentBias);
 322     exponent->simdInternal_ = vmovn_s64(iExponent);
 323
 324     return { float64x2_t(vorrq_s64(
 325             vandq_s64(int64x2_t(value.simdInternal_), int64x2_t(mantissaMask)), int64x2_t(half))) };
 326 }
 327
 328 template<MathOptimization opt = MathOptimization::Safe>
 329 static inline SimdDouble ldexp(SimdDouble value, SimdDInt32 exponent)
 330 {
 331     const int32x2_t exponentBias = vdup_n_s32(1023);
 332     int32x2_t       iExponent    = vadd_s32(exponent.simdInternal_, exponentBias);
 333     int64x2_t       iExponent64;
 334
 335     if (opt == MathOptimization::Safe)
 336     {
 337         // Make sure biased argument is not negative
 338         iExponent = vmax_s32(iExponent, vdup_n_s32(0));
 339     }
 340
 341     iExponent64 = vmovl_s32(iExponent);
 342     iExponent64 = vshlq_n_s64(iExponent64, 52);
 343
 344     return { vmulq_f64(value.simdInternal_, float64x2_t(iExponent64)) };
 345 }
 346
 347 static inline double gmx_simdcall reduce(SimdDouble a)
 348 {
 349     float64x2_t b = vpaddq_f64(a.simdInternal_, a.simdInternal_);
 350     return vgetq_lane_f64(b, 0);
 351 }
 352
 353 static inline SimdDBool gmx_simdcall operator==(SimdDouble a, SimdDouble b)
 354 {
 355     return { vceqq_f64(a.simdInternal_, b.simdInternal_) };
 356 }
 357
 358 static inline SimdDBool gmx_simdcall operator!=(SimdDouble a, SimdDouble b)
 359 {
 360     return { vreinterpretq_u64_u32(
 361             vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(a.simdInternal_, b.simdInternal_)))) };
 362 }
 363
 364 static inline SimdDBool gmx_simdcall operator<(SimdDouble a, SimdDouble b)
 365 {
 366     return { vcltq_f64(a.simdInternal_, b.simdInternal_) };
 367 }
 368
 369 static inline SimdDBool gmx_simdcall operator<=(SimdDouble a, SimdDouble b)
 370 {
 371     return { vcleq_f64(a.simdInternal_, b.simdInternal_) };
 372 }
 373
 374 static inline SimdDBool gmx_simdcall testBits(SimdDouble a)
 375 {
 376     return { vtstq_s64(int64x2_t(a.simdInternal_), int64x2_t(a.simdInternal_)) };
 377 }
 378
 379 static inline SimdDBool gmx_simdcall operator&&(SimdDBool a, SimdDBool b)
 380 {
 381     return { vandq_u64(a.simdInternal_, b.simdInternal_) };
 382 }
 383
 384 static inline SimdDBool gmx_simdcall operator||(SimdDBool a, SimdDBool b)
 385 {
 386     return { vorrq_u64(a.simdInternal_, b.simdInternal_) };
 387 }
 388
 389 static inline bool gmx_simdcall anyTrue(SimdDBool a)
 390 {
 391     return (vmaxvq_u32((uint32x4_t)(a.simdInternal_)) != 0);
 392 }
 393
 394 static inline SimdDouble gmx_simdcall selectByMask(SimdDouble a, SimdDBool m)
 395 {
 396     return { float64x2_t(vandq_u64(uint64x2_t(a.simdInternal_), m.simdInternal_)) };
 397 }
 398
 399 static inline SimdDouble gmx_simdcall selectByNotMask(SimdDouble a, SimdDBool m)
 400 {
 401     return { float64x2_t(vbicq_u64(uint64x2_t(a.simdInternal_), m.simdInternal_)) };
 402 }
 403
 404 static inline SimdDouble gmx_simdcall blend(SimdDouble a, SimdDouble b, SimdDBool sel)
 405 {
 406     return { vbslq_f64(sel.simdInternal_, b.simdInternal_, a.simdInternal_) };
 407 }
 408
 409 static inline SimdDInt32 gmx_simdcall operator&(SimdDInt32 a, SimdDInt32 b)
 410 {
 411     return { vand_s32(a.simdInternal_, b.simdInternal_) };
 412 }
 413
 414 static inline SimdDInt32 gmx_simdcall andNot(SimdDInt32 a, SimdDInt32 b)
 415 {
 416     return { vbic_s32(b.simdInternal_, a.simdInternal_) };
 417 }
 418
 419 static inline SimdDInt32 gmx_simdcall operator|(SimdDInt32 a, SimdDInt32 b)
 420 {
 421     return { vorr_s32(a.simdInternal_, b.simdInternal_) };
 422 }
 423
 424 static inline SimdDInt32 gmx_simdcall operator^(SimdDInt32 a, SimdDInt32 b)
 425 {
 426     return { veor_s32(a.simdInternal_, b.simdInternal_) };
 427 }
 428
 429 static inline SimdDInt32 gmx_simdcall operator+(SimdDInt32 a, SimdDInt32 b)
 430 {
 431     return { vadd_s32(a.simdInternal_, b.simdInternal_) };
 432 }
 433
 434 static inline SimdDInt32 gmx_simdcall operator-(SimdDInt32 a, SimdDInt32 b)
 435 {
 436     return { vsub_s32(a.simdInternal_, b.simdInternal_) };
 437 }
 438
 439 static inline SimdDInt32 gmx_simdcall operator*(SimdDInt32 a, SimdDInt32 b)
 440 {
 441     return { vmul_s32(a.simdInternal_, b.simdInternal_) };
 442 }
 443
 444 static inline SimdDIBool gmx_simdcall operator==(SimdDInt32 a, SimdDInt32 b)
 445 {
 446     return { vceq_s32(a.simdInternal_, b.simdInternal_) };
 447 }
 448
 449 static inline SimdDIBool gmx_simdcall testBits(SimdDInt32 a)
 450 {
 451     return { vtst_s32(a.simdInternal_, a.simdInternal_) };
 452 }
 453
 454 static inline SimdDIBool gmx_simdcall operator<(SimdDInt32 a, SimdDInt32 b)
 455 {
 456     return { vclt_s32(a.simdInternal_, b.simdInternal_) };
 457 }
 458
 459 static inline SimdDIBool gmx_simdcall operator&&(SimdDIBool a, SimdDIBool b)
 460 {
 461     return { vand_u32(a.simdInternal_, b.simdInternal_) };
 462 }
 463
 464 static inline SimdDIBool gmx_simdcall operator||(SimdDIBool a, SimdDIBool b)
 465 {
 466     return { vorr_u32(a.simdInternal_, b.simdInternal_) };
 467 }
 468
 469 static inline bool gmx_simdcall anyTrue(SimdDIBool a)
 470 {
 471     return (vmaxv_u32(a.simdInternal_) != 0);
 472 }
 473
 474 static inline SimdDInt32 gmx_simdcall selectByMask(SimdDInt32 a, SimdDIBool m)
 475 {
 476     return { vand_s32(a.simdInternal_, vreinterpret_s32_u32(m.simdInternal_)) };
 477 }
 478
 479 static inline SimdDInt32 gmx_simdcall selectByNotMask(SimdDInt32 a, SimdDIBool m)
 480 {
 481     return { vbic_s32(a.simdInternal_, vreinterpret_s32_u32(m.simdInternal_)) };
 482 }
 483
 484 static inline SimdDInt32 gmx_simdcall blend(SimdDInt32 a, SimdDInt32 b, SimdDIBool sel)
 485 {
 486     return { vbsl_s32(sel.simdInternal_, b.simdInternal_, a.simdInternal_) };
 487 }
 488
 489 static inline SimdDInt32 gmx_simdcall cvtR2I(SimdDouble a)
 490 {
 491     return { vmovn_s64(vcvtnq_s64_f64(a.simdInternal_)) };
 492 }
 493
 494 static inline SimdDInt32 gmx_simdcall cvttR2I(SimdDouble a)
 495 {
 496     return { vmovn_s64(vcvtq_s64_f64(a.simdInternal_)) };
 497 }
 498
 499 static inline SimdDouble gmx_simdcall cvtI2R(SimdDInt32 a)
 500 {
 501     return { vcvtq_f64_s64(vmovl_s32(a.simdInternal_)) };
 502 }
 503
 504 static inline SimdDIBool gmx_simdcall cvtB2IB(SimdDBool a)
 505 {
 506     return { vqmovn_u64(a.simdInternal_) };
 507 }
 508
 509 static inline SimdDBool gmx_simdcall cvtIB2B(SimdDIBool a)
 510 {
 511     return { vorrq_u64(vmovl_u32(a.simdInternal_), vshlq_n_u64(vmovl_u32(a.simdInternal_), 32)) };
 512 }
 513
 514 static inline void gmx_simdcall cvtF2DD(SimdFloat f, SimdDouble* d0, SimdDouble* d1)
 515 {
 516     d0->simdInternal_ = vcvt_f64_f32(vget_low_f32(f.simdInternal_));
 517     d1->simdInternal_ = vcvt_high_f64_f32(f.simdInternal_);
 518 }
 519
 520 static inline SimdFloat gmx_simdcall cvtDD2F(SimdDouble d0, SimdDouble d1)
 521 {
 522     return { vcvt_high_f32_f64(vcvt_f32_f64(d0.simdInternal_), d1.simdInternal_) };
 523 }
 524
 525 } // namespace gmx
 526
 527 #endif // GMX_SIMD_IMPL_ARM_NEON_ASIMD_SIMD_DOUBLE_H