src/gromacs/simd/impl_reference/impl_reference_simd_float.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2014,2015,2016,2017,2019,2020, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #ifndef GMX_SIMD_IMPL_REFERENCE_SIMD_FLOAT_H
  37 #define GMX_SIMD_IMPL_REFERENCE_SIMD_FLOAT_H
  38
  39 /*! \libinternal \file
  40  *
  41  * \brief Reference implementation, SIMD single precision.
  42
  43  * \author Erik Lindahl <erik.lindahl@scilifelab.se>
  44  *
  45  * \ingroup module_simd
  46  */
  47
  48 #include "config.h"
  49
#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstring>
  54
  55 #include <algorithm>
  56 #include <array>
  57
  58 #include "gromacs/math/utilities.h"
  59
  60 #include "impl_reference_definitions.h"
  61
  62 namespace gmx
  63 {
  64
  65 /*! \cond libapi */
  66 /*! \addtogroup module_simd */
  67 /*! \{ */
  68
  69 /*! \name SIMD implementation data types and built-in conversions between types
  70  * \{
  71  */
  72
  73 /*! \libinternal \brief Float SIMD variable. Available if GMX_SIMD_HAVE_FLOAT is 1.
  74  *
  75  * \note This variable cannot be placed inside other structures or classes, since
  76  *       some compilers (including at least clang-3.7) appear to lose the
  77  *       alignment. This is likely particularly severe when allocating such
  78  *       memory on the heap, but it occurs for stack structures too.
  79  */
  80 class SimdFloat
  81 {
  82 public:
  83     SimdFloat() {}
  84
  85     //! \brief Construct from scalar
  86     SimdFloat(float f) { simdInternal_.fill(f); }
  87
  88     /*! \brief Internal SIMD data. Implementation dependent, don't touch.
  89      *
  90      * This has to be public to enable usage in combination with static inline
  91      * functions, but it should never, EVER, be accessed by any code outside
  92      * the corresponding implementation directory since the type will depend
  93      * on the architecture.
  94      */
  95     std::array<float, GMX_SIMD_FLOAT_WIDTH> simdInternal_;
  96 };
  97
  98 /*! \libinternal \brief Integer SIMD variable type to use for conversions to/from float.
  99  *
 100  * This is also the widest integer SIMD type. Available if GMX_SIMD_HAVE_FLOAT is 1.
 101  *
 102  * \note The integer SIMD type will always be available, but on architectures
 103  * that do not have any real integer SIMD support it might be defined as the
 104  * floating-point type. This will work fine, since there are separate defines
 105  * for whether the implementation can actually do any operations on integer
 106  * SIMD types.
 107  * \note This variable cannot be placed inside other structures or classes, since
 108  *       some compilers (including at least clang-3.7) appear to lose the
 109  *       alignment. This is likely particularly severe when allocating such
 110  *       memory on the heap, but it occurs for stack structures too.
 111  */
 112 class SimdFInt32
 113 {
 114 public:
 115     SimdFInt32() {}
 116
 117     //! \brief Construct from scalar
 118     SimdFInt32(std::int32_t i) { simdInternal_.fill(i); }
 119
 120     /*! \brief Internal SIMD data. Implementation dependent, don't touch.
 121      *
 122      * This has to be public to enable usage in combination with static inline
 123      * functions, but it should never, EVER, be accessed by any code outside
 124      * the corresponding implementation directory since the type will depend
 125      * on the architecture.
 126      */
 127     std::array<std::int32_t, GMX_SIMD_FINT32_WIDTH> simdInternal_;
 128 };
 129
 130 /*! \libinternal \brief Boolean type for float SIMD data.
 131  *
 132  *  Available if GMX_SIMD_HAVE_FLOAT is 1.
 133  *
 134  * \note This variable cannot be placed inside other structures or classes, since
 135  *       some compilers (including at least clang-3.7) appear to lose the
 136  *       alignment. This is likely particularly severe when allocating such
 137  *       memory on the heap, but it occurs for stack structures too.
 138  */
 139 class SimdFBool
 140 {
 141 public:
 142     SimdFBool() {}
 143
 144     //! \brief Construct from scalar
 145     SimdFBool(bool b) { simdInternal_.fill(b); }
 146
 147     /*! \brief Internal SIMD data. Implementation dependent, don't touch.
 148      *
 149      * This has to be public to enable usage in combination with static inline
 150      * functions, but it should never, EVER, be accessed by any code outside
 151      * the corresponding implementation directory since the type will depend
 152      * on the architecture.
 153      */
 154     std::array<bool, GMX_SIMD_FLOAT_WIDTH> simdInternal_;
 155 };
 156
 157 /*! \libinternal \brief Boolean type for integer datatypes corresponding to float SIMD.
 158  *
 159  * Available if GMX_SIMD_HAVE_FINT32_ARITHMETICS is 1.
 160  *
 161  * \note This variable cannot be placed inside other structures or classes, since
 162  *       some compilers (including at least clang-3.7) appear to lose the
 163  *       alignment. This is likely particularly severe when allocating such
 164  *       memory on the heap, but it occurs for stack structures too.
 165  */
 166 class SimdFIBool
 167 {
 168 public:
 169     SimdFIBool() {}
 170
 171     //! \brief Construct from scalar
 172     SimdFIBool(bool b) { simdInternal_.fill(b); }
 173
 174     /*! \brief Internal SIMD data. Implementation dependent, don't touch.
 175      *
 176      * This has to be public to enable usage in combination with static inline
 177      * functions, but it should never, EVER, be accessed by any code outside
 178      * the corresponding implementation directory since the type will depend
 179      * on the architecture.
 180      */
 181     std::array<bool, GMX_SIMD_FINT32_WIDTH> simdInternal_;
 182 };
 183
 184 /*! \}
 185  *
 186  * \name SIMD implementation load/store operations for single precision floating point
 187  * \{
 188  */
 189
 190 /*! \brief Load \ref GMX_SIMD_FLOAT_WIDTH float numbers from aligned memory.
 191  *
 192  * \param m Pointer to memory aligned to the SIMD width.
 193  * \return SIMD variable with data loaded.
 194  */
 195 static inline SimdFloat gmx_simdcall simdLoad(const float* m, SimdFloatTag = {})
 196 {
 197     SimdFloat a;
 198
 199     assert(std::size_t(m) % (a.simdInternal_.size() * sizeof(float)) == 0);
 200
 201     std::copy(m, m + a.simdInternal_.size(), a.simdInternal_.begin());
 202     return a;
 203 }
 204
 205 /*! \brief Store the contents of SIMD float variable to aligned memory m.
 206  *
 207  * \param[out] m Pointer to memory, aligned to SIMD width.
 208  * \param a SIMD variable to store
 209  */
 210 static inline void gmx_simdcall store(float* m, SimdFloat a)
 211 {
 212     assert(std::size_t(m) % (a.simdInternal_.size() * sizeof(float)) == 0);
 213
 214     std::copy(a.simdInternal_.begin(), a.simdInternal_.end(), m);
 215 }
 216
 217 /*! \brief Load SIMD float from unaligned memory.
 218  *
 219  * Available if \ref GMX_SIMD_HAVE_LOADU is 1.
 220  *
 221  * \param m Pointer to memory, no alignment requirement.
 222  * \return SIMD variable with data loaded.
 223  */
 224 static inline SimdFloat gmx_simdcall simdLoadU(const float* m, SimdFloatTag = {})
 225 {
 226     SimdFloat a;
 227     std::copy(m, m + a.simdInternal_.size(), a.simdInternal_.begin());
 228     return a;
 229 }
 230
 231 /*! \brief Store SIMD float to unaligned memory.
 232  *
 233  * Available if \ref GMX_SIMD_HAVE_STOREU is 1.
 234  *
 235  * \param[out] m Pointer to memory, no alignment requirement.
 236  * \param a SIMD variable to store.
 237  */
 238 static inline void gmx_simdcall storeU(float* m, SimdFloat a)
 239 {
 240     std::copy(a.simdInternal_.begin(), a.simdInternal_.end(), m);
 241 }
 242
 243 /*! \brief Set all SIMD float variable elements to 0.0.
 244  *
 245  * You should typically just call \ref gmx::setZero(), which uses proxy objects
 246  * internally to handle all types rather than adding the suffix used here.
 247  *
 248  * \return SIMD 0.0F
 249  */
 250 static inline SimdFloat gmx_simdcall setZeroF()
 251 {
 252     return SimdFloat(0.0F);
 253 }
 254
 255 /*! \} */
 256
 257
 258 /*!
 259  * \name SIMD implementation load/store operations for integers (corresponding to float)
 260  * \{
 261  */
 262
 263 /*! \brief Load aligned SIMD integer data, width corresponds to \ref gmx::SimdFloat.
 264  *
 265  * You should typically just call \ref gmx::load(), which uses proxy objects
 266  * internally to handle all types rather than adding the suffix used here.
 267  *
 268  * \param m Pointer to memory, aligned to (float) integer SIMD width.
 269  * \return SIMD integer variable.
 270  */
 271 static inline SimdFInt32 gmx_simdcall simdLoad(const std::int32_t* m, SimdFInt32Tag)
 272 {
 273     SimdFInt32 a;
 274
 275     assert(std::size_t(m) % (a.simdInternal_.size() * sizeof(std::int32_t)) == 0);
 276
 277     std::copy(m, m + a.simdInternal_.size(), a.simdInternal_.begin());
 278     return a;
 279 };
 280
 281 /*! \brief Store aligned SIMD integer data, width corresponds to \ref gmx::SimdFloat.
 282  *
 283  * \param m Memory aligned to (float) integer SIMD width.
 284  * \param a SIMD variable to store.
 285  */
 286 static inline void gmx_simdcall store(std::int32_t* m, SimdFInt32 a)
 287 {
 288     assert(std::size_t(m) % (a.simdInternal_.size() * sizeof(std::int32_t)) == 0);
 289
 290     std::copy(a.simdInternal_.begin(), a.simdInternal_.end(), m);
 291 };
 292
 293 /*! \brief Load unaligned integer SIMD data, width corresponds to \ref gmx::SimdFloat.
 294  *
 295  * You should typically just call \ref gmx::loadU(), which uses proxy objects
 296  * internally to handle all types rather than adding the suffix used here.
 297  *
 298  * Available if \ref GMX_SIMD_HAVE_LOADU is 1.
 299  *
 300  * \param m Pointer to memory, no alignment requirements.
 301  * \return SIMD integer variable.
 302  */
 303 static inline SimdFInt32 gmx_simdcall simdLoadU(const std::int32_t* m, SimdFInt32Tag)
 304 {
 305     SimdFInt32 a;
 306     std::copy(m, m + a.simdInternal_.size(), a.simdInternal_.begin());
 307     return a;
 308 }
 309
 310 /*! \brief Store unaligned SIMD integer data, width corresponds to \ref gmx::SimdFloat.
 311  *
 312  * Available if \ref GMX_SIMD_HAVE_STOREU is 1.
 313  *
 314  * \param m Memory pointer, no alignment requirements.
 315  * \param a SIMD variable to store.
 316  */
 317 static inline void gmx_simdcall storeU(std::int32_t* m, SimdFInt32 a)
 318 {
 319     std::copy(a.simdInternal_.begin(), a.simdInternal_.end(), m);
 320 }
 321
 322 /*! \brief Set all SIMD (float) integer variable elements to 0.
 323  *
 324  * You should typically just call \ref gmx::setZero(), which uses proxy objects
 325  * internally to handle all types rather than adding the suffix used here.
 326  *
 327  * \return SIMD 0
 328  */
 329 static inline SimdFInt32 gmx_simdcall setZeroFI()
 330 {
 331     return SimdFInt32(0);
 332 }
 333
 334 /*! \brief Extract element with index i from \ref gmx::SimdFInt32.
 335  *
 336  * Available if \ref GMX_SIMD_HAVE_FINT32_EXTRACT is 1.
 337  *
 338  * \tparam index Compile-time constant, position to extract (first position is 0)
 339  * \param  a     SIMD variable from which to extract value.
 340  * \return Single integer from position index in SIMD variable.
 341  */
 342 template<int index>
 343 static inline std::int32_t gmx_simdcall extract(SimdFInt32 a)
 344 {
 345     return a.simdInternal_[index];
 346 }
 347
 348 /*! \}
 349  *
 350  * \name SIMD implementation single precision floating-point bitwise logical operations
 351  * \{
 352  */
 353
 354 /*! \brief Bitwise and for two SIMD float variables.
 355  *
 356  * Supported if \ref GMX_SIMD_HAVE_LOGICAL is 1.
 357  *
 358  * \param a data1
 359  * \param b data2
 360  * \return data1 & data2
 361  */
 362 static inline SimdFloat gmx_simdcall operator&(SimdFloat a, SimdFloat b)
 363 {
 364     SimdFloat res;
 365
 366     union {
 367         float        r;
 368         std::int32_t i;
 369     } conv1, conv2;
 370
 371     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 372     {
 373         conv1.r              = a.simdInternal_[i];
 374         conv2.r              = b.simdInternal_[i];
 375         conv1.i              = conv1.i & conv2.i;
 376         res.simdInternal_[i] = conv1.r;
 377     }
 378     return res;
 379 }
 380
 381 /*! \brief Bitwise andnot for SIMD float.
 382  *
 383  * Available if \ref GMX_SIMD_HAVE_LOGICAL is 1.
 384  *
 385  * \param a data1
 386  * \param b data2
 387  * \return (~data1) & data2
 388  */
 389 static inline SimdFloat gmx_simdcall andNot(SimdFloat a, SimdFloat b)
 390 {
 391     SimdFloat res;
 392
 393     union {
 394         float        r;
 395         std::int32_t i;
 396     } conv1, conv2;
 397
 398     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 399     {
 400         conv1.r              = a.simdInternal_[i];
 401         conv2.r              = b.simdInternal_[i];
 402         conv1.i              = ~conv1.i & conv2.i;
 403         res.simdInternal_[i] = conv1.r;
 404     }
 405     return res;
 406 }
 407
 408 /*! \brief Bitwise or for SIMD float.
 409  *
 410  * Available if \ref GMX_SIMD_HAVE_LOGICAL is 1.
 411  *
 412  * \param a data1
 413  * \param b data2
 414  * \return data1 | data2
 415  */
 416 static inline SimdFloat gmx_simdcall operator|(SimdFloat a, SimdFloat b)
 417 {
 418     SimdFloat res;
 419
 420     union {
 421         float        r;
 422         std::int32_t i;
 423     } conv1, conv2;
 424
 425     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 426     {
 427         conv1.r              = a.simdInternal_[i];
 428         conv2.r              = b.simdInternal_[i];
 429         conv1.i              = conv1.i | conv2.i;
 430         res.simdInternal_[i] = conv1.r;
 431     }
 432     return res;
 433 }
 434
 435 /*! \brief Bitwise xor for SIMD float.
 436  *
 437  * Available if \ref GMX_SIMD_HAVE_LOGICAL is 1.
 438  *
 439  * \param a data1
 440  * \param b data2
 441  * \return data1 ^ data2
 442  */
 443 static inline SimdFloat gmx_simdcall operator^(SimdFloat a, SimdFloat b)
 444 {
 445     SimdFloat res;
 446
 447     union {
 448         float        r;
 449         std::int32_t i;
 450     } conv1, conv2;
 451
 452     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 453     {
 454         conv1.r              = a.simdInternal_[i];
 455         conv2.r              = b.simdInternal_[i];
 456         conv1.i              = conv1.i ^ conv2.i;
 457         res.simdInternal_[i] = conv1.r;
 458     }
 459     return res;
 460 }
 461
 462 /*! \}
 463  *
 464  * \name SIMD implementation single precision floating-point arithmetics
 465  * \{
 466  */
 467
 468 /*! \brief Add two float SIMD variables.
 469  *
 470  * \param a term1
 471  * \param b term2
 472  * \return a+b
 473  */
 474 static inline SimdFloat gmx_simdcall operator+(SimdFloat a, SimdFloat b)
 475 {
 476     SimdFloat res;
 477
 478     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 479     {
 480         res.simdInternal_[i] = a.simdInternal_[i] + b.simdInternal_[i];
 481     }
 482     return res;
 483 }
 484
 485 /*! \brief Subtract two float SIMD variables.
 486  *
 487  * \param a term1
 488  * \param b term2
 489  * \return a-b
 490  */
 491 static inline SimdFloat gmx_simdcall operator-(SimdFloat a, SimdFloat b)
 492 {
 493     SimdFloat res;
 494
 495     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 496     {
 497         res.simdInternal_[i] = a.simdInternal_[i] - b.simdInternal_[i];
 498     }
 499     return res;
 500 }
 501
 502 /*! \brief SIMD single precision negate.
 503  *
 504  * \param a SIMD double precision value
 505  * \return -a
 506  */
 507 static inline SimdFloat gmx_simdcall operator-(SimdFloat a)
 508 {
 509     SimdFloat res;
 510
 511     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 512     {
 513         res.simdInternal_[i] = -a.simdInternal_[i];
 514     }
 515     return res;
 516 }
 517
 518 /*! \brief Multiply two float SIMD variables.
 519  *
 520  * \param a factor1
 521  * \param b factor2
 522  * \return a*b.
 523  */
 524 static inline SimdFloat gmx_simdcall operator*(SimdFloat a, SimdFloat b)
 525 {
 526     SimdFloat res;
 527
 528     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 529     {
 530         res.simdInternal_[i] = a.simdInternal_[i] * b.simdInternal_[i];
 531     }
 532     return res;
 533 }
 534
 535 /*! \brief SIMD float Fused-multiply-add. Result is a*b+c.
 536  *
 537  * \param a factor1
 538  * \param b factor2
 539  * \param c term
 540  * \return a*b+c
 541  */
 542 static inline SimdFloat gmx_simdcall fma(SimdFloat a, SimdFloat b, SimdFloat c)
 543 {
 544     return a * b + c;
 545 }
 546
 547 /*! \brief SIMD float Fused-multiply-subtract. Result is a*b-c.
 548  *
 549  * \param a factor1
 550  * \param b factor2
 551  * \param c term
 552  * \return a*b-c
 553  */
 554 static inline SimdFloat gmx_simdcall fms(SimdFloat a, SimdFloat b, SimdFloat c)
 555 {
 556     return a * b - c;
 557 }
 558
 559 /*! \brief SIMD float Fused-negated-multiply-add. Result is -a*b+c.
 560  *
 561  * \param a factor1
 562  * \param b factor2
 563  * \param c term
 564  * \return -a*b+c
 565  */
 566 static inline SimdFloat gmx_simdcall fnma(SimdFloat a, SimdFloat b, SimdFloat c)
 567 {
 568     return c - a * b;
 569 }
 570
 571 /*! \brief SIMD float Fused-negated-multiply-subtract. Result is -a*b-c.
 572  *
 573  * \param a factor1
 574  * \param b factor2
 575  * \param c term
 576  * \return -a*b-c
 577  */
 578 static inline SimdFloat gmx_simdcall fnms(SimdFloat a, SimdFloat b, SimdFloat c)
 579 {
 580     return -a * b - c;
 581 }
 582
 583 /*! \brief SIMD float 1.0/sqrt(x) lookup.
 584  *
 585  * This is a low-level instruction that should only be called from routines
 586  * implementing the inverse square root in simd_math.h.
 587  *
 588  * \param x Argument, x>0
 589  * \return Approximation of 1/sqrt(x), accuracy is \ref GMX_SIMD_RSQRT_BITS.
 590  */
 591 static inline SimdFloat gmx_simdcall rsqrt(SimdFloat x)
 592 {
 593     SimdFloat res;
 594
 595     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 596     {
 597         res.simdInternal_[i] = 1.0F / std::sqrt(x.simdInternal_[i]);
 598     }
 599     return res;
 600 };
 601
 602 /*! \brief SIMD float 1.0/x lookup.
 603  *
 604  * This is a low-level instruction that should only be called from routines
 605  * implementing the reciprocal in simd_math.h.
 606  *
 607  * \param x Argument, x!=0
 608  * \return Approximation of 1/x, accuracy is \ref GMX_SIMD_RCP_BITS.
 609  */
 610 static inline SimdFloat gmx_simdcall rcp(SimdFloat x)
 611 {
 612     SimdFloat res;
 613
 614     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 615     {
 616         res.simdInternal_[i] = 1.0F / x.simdInternal_[i];
 617     }
 618     return res;
 619 };
 620
 621 /*! \brief Add two float SIMD variables, masked version.
 622  *
 623  * \param a term1
 624  * \param b term2
 625  * \param m mask
 626  * \return a+b where mask is true, a otherwise.
 627  */
 628 static inline SimdFloat gmx_simdcall maskAdd(SimdFloat a, SimdFloat b, SimdFBool m)
 629 {
 630     SimdFloat res;
 631
 632     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 633     {
 634         res.simdInternal_[i] = a.simdInternal_[i] + (m.simdInternal_[i] ? b.simdInternal_[i] : 0.0F);
 635     }
 636     return res;
 637 }
 638
 639 /*! \brief Multiply two float SIMD variables, masked version.
 640  *
 641  * \param a factor1
 642  * \param b factor2
 643  * \param m mask
 644  * \return a*b where mask is true, 0.0 otherwise.
 645  */
 646 static inline SimdFloat gmx_simdcall maskzMul(SimdFloat a, SimdFloat b, SimdFBool m)
 647 {
 648     SimdFloat res;
 649
 650     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 651     {
 652         res.simdInternal_[i] = m.simdInternal_[i] ? (a.simdInternal_[i] * b.simdInternal_[i]) : 0.0F;
 653     }
 654     return res;
 655 }
 656
 657 /*! \brief SIMD float fused multiply-add, masked version.
 658  *
 659  * \param a factor1
 660  * \param b factor2
 661  * \param c term
 662  * \param m mask
 663  * \return a*b+c where mask is true, 0.0 otherwise.
 664  */
 665 static inline SimdFloat gmx_simdcall maskzFma(SimdFloat a, SimdFloat b, SimdFloat c, SimdFBool m)
 666 {
 667     SimdFloat res;
 668
 669     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 670     {
 671         res.simdInternal_[i] =
 672                 m.simdInternal_[i] ? (a.simdInternal_[i] * b.simdInternal_[i] + c.simdInternal_[i]) : 0.0F;
 673     }
 674     return res;
 675 }
 676
 677 /*! \brief SIMD float 1.0/sqrt(x) lookup, masked version.
 678  *
 679  * This is a low-level instruction that should only be called from routines
 680  * implementing the inverse square root in simd_math.h.
 681  *
 682  * \param x Argument, x>0 for entries where mask is true.
 683  * \param m Mask
 684  * \return Approximation of 1/sqrt(x), accuracy is \ref GMX_SIMD_RSQRT_BITS.
 685  *         The result for masked-out entries will be 0.0.
 686  */
 687 static inline SimdFloat gmx_simdcall maskzRsqrt(SimdFloat x, SimdFBool m)
 688 {
 689     SimdFloat res;
 690
 691     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 692     {
 693         res.simdInternal_[i] = (m.simdInternal_[i] != 0) ? 1.0F / std::sqrt(x.simdInternal_[i]) : 0.0F;
 694     }
 695     return res;
 696 }
 697
 698 /*! \brief SIMD float 1.0/x lookup, masked version.
 699  *
 700  * This is a low-level instruction that should only be called from routines
 701  * implementing the reciprocal in simd_math.h.
 702  *
 703  * \param x Argument, x>0 for entries where mask is true.
 704  * \param m Mask
 705  * \return Approximation of 1/x, accuracy is \ref GMX_SIMD_RCP_BITS.
 706  *         The result for masked-out entries will be 0.0.
 707  */
 708 static inline SimdFloat gmx_simdcall maskzRcp(SimdFloat x, SimdFBool m)
 709 {
 710     SimdFloat res;
 711
 712     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 713     {
 714         res.simdInternal_[i] = (m.simdInternal_[i] != 0) ? 1.0F / x.simdInternal_[i] : 0.0F;
 715     }
 716     return res;
 717 }
 718
 719 /*! \brief SIMD float Floating-point abs().
 720  *
 721  * \param a any floating point values
 722  * \return abs(a) for each element.
 723  */
 724 static inline SimdFloat gmx_simdcall abs(SimdFloat a)
 725 {
 726     SimdFloat res;
 727
 728     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 729     {
 730         res.simdInternal_[i] = std::abs(a.simdInternal_[i]);
 731     }
 732     return res;
 733 }
 734
 735 /*! \brief Set each SIMD float element to the largest from two variables.
 736  *
 737  * \param a Any floating-point value
 738  * \param b Any floating-point value
 739  * \return max(a,b) for each element.
 740  */
 741 static inline SimdFloat gmx_simdcall max(SimdFloat a, SimdFloat b)
 742 {
 743     SimdFloat res;
 744
 745     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 746     {
 747         res.simdInternal_[i] = std::max(a.simdInternal_[i], b.simdInternal_[i]);
 748     }
 749     return res;
 750 }
 751
 752 /*! \brief Set each SIMD float element to the smallest from two variables.
 753  *
 754  * \param a Any floating-point value
 755  * \param b Any floating-point value
 756  * \return min(a,b) for each element.
 757  */
 758 static inline SimdFloat gmx_simdcall min(SimdFloat a, SimdFloat b)
 759 {
 760     SimdFloat res;
 761
 762     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 763     {
 764         res.simdInternal_[i] = std::min(a.simdInternal_[i], b.simdInternal_[i]);
 765     }
 766     return res;
 767 }
 768
 769 /*! \brief SIMD float round to nearest integer value (in floating-point format).
 770  *
 771  * \param a Any floating-point value
 772  * \return The nearest integer, represented in floating-point format.
 773  *
 774  * \note Round mode is implementation defined. The only guarantee is that it
 775  * is consistent between rounding functions (round, cvtR2I).
 776  */
 777 static inline SimdFloat gmx_simdcall round(SimdFloat a)
 778 {
 779     SimdFloat res;
 780
 781     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 782     {
 783         res.simdInternal_[i] = std::round(a.simdInternal_[i]);
 784     }
 785     return res;
 786 }
 787
 788 /*! \brief Truncate SIMD float, i.e. round towards zero - common hardware instruction.
 789  *
 790  * \param a Any floating-point value
 791  * \return Integer rounded towards zero, represented in floating-point format.
 792  *
 793  * \note This is truncation towards zero, not floor(). The reason for this
 794  * is that truncation is virtually always present as a dedicated hardware
 795  * instruction, but floor() frequently isn't.
 796  */
 797 static inline SimdFloat gmx_simdcall trunc(SimdFloat a)
 798 {
 799     SimdFloat res;
 800
 801     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 802     {
 803         res.simdInternal_[i] = std::trunc(a.simdInternal_[i]);
 804     }
 805     return res;
 806 }
 807
 808 /*! \brief Extract (integer) exponent and fraction from single precision SIMD.
 809  *
 810  * \tparam      opt       By default this function behaves like the standard
 811  *                        library such that frexp(+-0,exp) returns +-0 and
 812  *                        stores 0 in the exponent when value is 0. If you
 813  *                        know the argument is always nonzero, you can set
 814  *                        the template parameter to MathOptimization::Unsafe
 815  *                        to make it slightly faster.
 816  *
 817  * \param       value     Floating-point value to extract from
 818  * \param[out]  exponent  Returned exponent of value, integer SIMD format.
 819  * \return      Fraction of value, floating-point SIMD format.
 820  */
 821 template<MathOptimization opt = MathOptimization::Safe>
 822 static inline SimdFloat gmx_simdcall frexp(SimdFloat value, SimdFInt32* exponent)
 823 {
 824     SimdFloat fraction;
 825
 826     for (std::size_t i = 0; i < fraction.simdInternal_.size(); i++)
 827     {
 828         fraction.simdInternal_[i] = std::frexp(value.simdInternal_[i], &exponent->simdInternal_[i]);
 829     }
 830     return fraction;
 831 }
 832
 833 /*! \brief Multiply a SIMD float value by the number 2 raised to an exp power.
 834  *
 835  * \tparam opt By default, this routine will return zero for input arguments
 836  *             that are so small they cannot be reproduced in the current
 837  *             precision. If the unsafe math optimization template parameter
 838  *             setting is used, these tests are skipped, and the result will
 839  *             be undefined (possible even NaN). This might happen below -127
 840  *             in single precision or -1023 in double, although some
 841  *             might use denormal support to extend the range.
 842  *
 843  * \param value Floating-point number to multiply with new exponent
 844  * \param exponent Integer that will not overflow as 2^exponent.
 845  * \return value*2^exponent
 846  */
 847 template<MathOptimization opt = MathOptimization::Safe>
 848 static inline SimdFloat gmx_simdcall ldexp(SimdFloat value, SimdFInt32 exponent)
 849 {
 850     SimdFloat res;
 851
 852     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 853     {
 854         // std::ldexp already takes care of clamping arguments, so we do not
 855         // need to do anything in the reference implementation
 856         res.simdInternal_[i] = std::ldexp(value.simdInternal_[i], exponent.simdInternal_[i]);
 857     }
 858     return res;
 859 }
 860
 861 /*! \brief Return sum of all elements in SIMD float variable.
 862  *
 863  * \param a SIMD variable to reduce/sum.
 864  * \return The sum of all elements in the argument variable.
 865  *
 866  */
 867 static inline float gmx_simdcall reduce(SimdFloat a)
 868 {
 869     float sum = 0.0F;
 870
 871     for (std::size_t i = 0; i < a.simdInternal_.size(); i++)
 872     {
 873         sum += a.simdInternal_[i];
 874     }
 875     return sum;
 876 }
 877
 878 /*! \}
 879  *
 880  * \name SIMD implementation single precision floating-point comparisons, boolean, selection.
 881  * \{
 882  */
 883
 884 /*! \brief SIMD a==b for single SIMD.
 885  *
 886  * \param a value1
 887  * \param b value2
 888  * \return Each element of the boolean will be set to true if a==b.
 889  *
 890  * Beware that exact floating-point comparisons are difficult.
 891  */
 892 static inline SimdFBool gmx_simdcall operator==(SimdFloat a, SimdFloat b)
 893 {
 894     SimdFBool res;
 895
 896     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 897     {
 898         res.simdInternal_[i] = (a.simdInternal_[i] == b.simdInternal_[i]);
 899     }
 900     return res;
 901 }
 902
 903 /*! \brief SIMD a!=b for single SIMD.
 904  *
 905  * \param a value1
 906  * \param b value2
 907  * \return Each element of the boolean will be set to true if a!=b.
 908  *
 909  * Beware that exact floating-point comparisons are difficult.
 910  */
 911 static inline SimdFBool gmx_simdcall operator!=(SimdFloat a, SimdFloat b)
 912 {
 913     SimdFBool res;
 914
 915     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 916     {
 917         res.simdInternal_[i] = (a.simdInternal_[i] != b.simdInternal_[i]);
 918     }
 919     return res;
 920 }
 921
 922 /*! \brief SIMD a<b for single SIMD.
 923  *
 924  * \param a value1
 925  * \param b value2
 926  * \return Each element of the boolean will be set to true if a<b.
 927  */
 928 static inline SimdFBool gmx_simdcall operator<(SimdFloat a, SimdFloat b)
 929 {
 930     SimdFBool res;
 931
 932     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 933     {
 934         res.simdInternal_[i] = (a.simdInternal_[i] < b.simdInternal_[i]);
 935     }
 936     return res;
 937 }
 938
 939 /*! \brief SIMD a<=b for single SIMD.
 940  *
 941  * \param a value1
 942  * \param b value2
 943  * \return Each element of the boolean will be set to true if a<=b.
 944  */
 945 static inline SimdFBool gmx_simdcall operator<=(SimdFloat a, SimdFloat b)
 946 {
 947     SimdFBool res;
 948
 949     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 950     {
 951         res.simdInternal_[i] = (a.simdInternal_[i] <= b.simdInternal_[i]);
 952     }
 953     return res;
 954 }
 955
 956 /*! \brief Return true if any bits are set in the single precision SIMD.
 957  *
 958  * This function is used to handle bitmasks, mainly for exclusions in the
 959  * inner kernels. Note that it will return true even for -0.0F (sign bit set),
 960  * so it is not identical to not-equal.
 961  *
 962  * \param a value
 963  * \return Each element of the boolean will be true if any bit in a is nonzero.
 964  */
 965 static inline SimdFBool gmx_simdcall testBits(SimdFloat a)
 966 {
 967     SimdFBool res;
 968
 969     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 970     {
 971         union {
 972             std::uint32_t i;
 973             float         f;
 974         } conv;
 975
 976         conv.f               = a.simdInternal_[i];
 977         res.simdInternal_[i] = (conv.i != 0);
 978     }
 979     return res;
 980 }
 981
 982 /*! \brief Logical \a and on single precision SIMD booleans.
 983  *
 984  * \param a logical vars 1
 985  * \param b logical vars 2
 986  * \return For each element, the result boolean is true if a \& b are true.
 987  *
 988  * \note This is not necessarily a bitwise operation - the storage format
 989  * of booleans is implementation-dependent.
 990  */
 991 static inline SimdFBool gmx_simdcall operator&&(SimdFBool a, SimdFBool b)
 992 {
 993     SimdFBool res;
 994
 995     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 996     {
 997         res.simdInternal_[i] = (a.simdInternal_[i] && b.simdInternal_[i]);
 998     }
 999     return res;
1000 }
1001
1002 /*! \brief Logical \a or on single precision SIMD booleans.
1003  *
1004  * \param a logical vars 1
1005  * \param b logical vars 2
1006  * \return For each element, the result boolean is true if a or b is true.
1007  *
1008  * Note that this is not necessarily a bitwise operation - the storage format
1009  * of booleans is implementation-dependent.
1010  *
1011  \ */
1012 static inline SimdFBool gmx_simdcall operator||(SimdFBool a, SimdFBool b)
1013 {
1014     SimdFBool res;
1015
1016     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
1017     {
1018         res.simdInternal_[i] = (a.simdInternal_[i] || b.simdInternal_[i]);
1019     }
1020     return res;
1021 }
1022
1023 /*! \brief Returns non-zero if any of the boolean in SIMD a is True, otherwise 0.
1024  *
1025  * \param a Logical variable.
1026  * \return true if any element in a is true, otherwise false.
1027  *
1028  * The actual return value for truth will depend on the architecture,
1029  * so any non-zero value is considered truth.
1030  */
1031 static inline bool gmx_simdcall anyTrue(SimdFBool a)
1032 {
1033     bool res = false;
1034
1035     for (std::size_t i = 0; i < a.simdInternal_.size(); i++)
1036     {
1037         res = res || a.simdInternal_[i];
1038     }
1039     return res;
1040 }
1041
1042 /*! \brief Select from single precision SIMD variable where boolean is true.
1043  *
1044  * \param a Floating-point variable to select from
1045  * \param mask Boolean selector
1046  * \return  For each element, a is selected for true, 0 for false.
1047  */
1048 static inline SimdFloat gmx_simdcall selectByMask(SimdFloat a, SimdFBool mask)
1049 {
1050     SimdFloat res;
1051
1052     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
1053     {
1054         res.simdInternal_[i] = mask.simdInternal_[i] ? a.simdInternal_[i] : 0.0F;
1055     }
1056     return res;
1057 }
1058
1059 /*! \brief Select from single precision SIMD variable where boolean is false.
1060  *
1061  * \param a Floating-point variable to select from
1062  * \param mask Boolean selector
1063  * \return  For each element, a is selected for false, 0 for true (sic).
1064  */
1065 static inline SimdFloat gmx_simdcall selectByNotMask(SimdFloat a, SimdFBool mask)
1066 {
1067     SimdFloat res;
1068
1069     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
1070     {
1071         res.simdInternal_[i] = mask.simdInternal_[i] ? 0.0F : a.simdInternal_[i];
1072     }
1073     return res;
1074 }
1075
1076 /*! \brief Vector-blend SIMD float selection.
1077  *
1078  * \param a First source
1079  * \param b Second source
1080  * \param sel Boolean selector
1081  * \return For each element, select b if sel is true, a otherwise.
1082  */
1083 static inline SimdFloat gmx_simdcall blend(SimdFloat a, SimdFloat b, SimdFBool sel)
1084 {
1085     SimdFloat res;
1086
1087     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
1088     {
1089         res.simdInternal_[i] = sel.simdInternal_[i] ? b.simdInternal_[i] : a.simdInternal_[i];
1090     }
1091     return res;
1092 }
1093
1094 /*! \}
1095  *
1096  * \name SIMD implementation integer (corresponding to float) bitwise logical operations
1097  * \{
1098  */
1099
1100 /*! \brief Integer SIMD bitwise and.
1101  *
1102  * Available if \ref GMX_SIMD_HAVE_FINT32_LOGICAL is 1.
1103  *
1104  * \note You can \a not use this operation directly to select based on a boolean
1105  * SIMD variable, since booleans are separate from integer SIMD. If that
1106  * is what you need, have a look at \ref gmx::selectByMask instead.
1107  *
1108  * \param a first integer SIMD
1109  * \param b second integer SIMD
1110  * \return a \& b (bitwise and)
1111  */
1112 static inline SimdFInt32 gmx_simdcall operator&(SimdFInt32 a, SimdFInt32 b)
1113 {
1114     SimdFInt32 res;
1115
1116     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
1117     {
1118         res.simdInternal_[i] = a.simdInternal_[i] & b.simdInternal_[i];
1119     }
1120     return res;
1121 }
1122
1123 /*! \brief Integer SIMD bitwise not/complement.
1124  *
1125  * Available if \ref GMX_SIMD_HAVE_FINT32_LOGICAL is 1.
1126  *
1127  * \note You can \a not use this operation directly to select based on a boolean
1128  * SIMD variable, since booleans are separate from integer SIMD. If that
1129  * is what you need, have a look at \ref gmx::selectByMask instead.
1130  *
1131  * \param a integer SIMD
1132  * \param b integer SIMD
1133  * \return (~a) & b
1134  */
1135 static inline SimdFInt32 gmx_simdcall andNot(SimdFInt32 a, SimdFInt32 b)
1136 {
1137     SimdFInt32 res;
1138
1139     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
1140     {
1141         res.simdInternal_[i] = ~a.simdInternal_[i] & b.simdInternal_[i];
1142     }
1143     return res;
1144 }
1145
1146 /*! \brief Integer SIMD bitwise or.
1147  *
1148  * Available if \ref GMX_SIMD_HAVE_FINT32_LOGICAL is 1.
1149  *
1150  * \param a first integer SIMD
1151  * \param b second integer SIMD
1152  * \return a \| b (bitwise or)
1153  */
1154 static inline SimdFInt32 gmx_simdcall operator|(SimdFInt32 a, SimdFInt32 b)
1155 {
1156     SimdFInt32 res;
1157
1158     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
1159     {
1160         res.simdInternal_[i] = a.simdInternal_[i] | b.simdInternal_[i];
1161     }
1162     return res;
1163 }
1164
1165 /*! \brief Integer SIMD bitwise xor.
1166  *
1167  * Available if \ref GMX_SIMD_HAVE_FINT32_LOGICAL is 1.
1168  *
1169  * \param a first integer SIMD
1170  * \param b second integer SIMD
1171  * \return a ^ b (bitwise xor)
1172  */
1173 static inline SimdFInt32 gmx_simdcall operator^(SimdFInt32 a, SimdFInt32 b)
1174 {
1175     SimdFInt32 res;
1176
1177     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
1178     {
1179         res.simdInternal_[i] = a.simdInternal_[i] ^ b.simdInternal_[i];
1180     }
1181     return res;
1182 }
1183
1184 /*! \}
1185  *
1186  * \name SIMD implementation integer (corresponding to float) arithmetics
1187  * \{
1188  */
1189
1190 /*! \brief Add SIMD integers.
1191  *
1192  * This routine is only available if \ref GMX_SIMD_HAVE_FINT32_ARITHMETICS (single)
1193  *  or \ref GMX_SIMD_HAVE_DINT32_ARITHMETICS (double) is 1.
1194  *
1195  * \param a term1
1196  * \param b term2
1197  * \return a+b
1198  */
1199 static inline SimdFInt32 gmx_simdcall operator+(SimdFInt32 a, SimdFInt32 b)
1200 {
1201     SimdFInt32 res;
1202
1203     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
1204     {
1205         res.simdInternal_[i] = a.simdInternal_[i] + b.simdInternal_[i];
1206     }
1207     return res;
1208 }
1209
1210 /*! \brief Subtract SIMD integers.
1211  *
1212  * This routine is only available if \ref GMX_SIMD_HAVE_FINT32_ARITHMETICS (single)
1213  *  or \ref GMX_SIMD_HAVE_DINT32_ARITHMETICS (double) is 1.
1214  *
1215  * \param a term1
1216  * \param b term2
1217  * \return a-b
1218  */
1219 static inline SimdFInt32 gmx_simdcall operator-(SimdFInt32 a, SimdFInt32 b)
1220 {
1221     SimdFInt32 res;
1222
1223     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
1224     {
1225         res.simdInternal_[i] = a.simdInternal_[i] - b.simdInternal_[i];
1226     }
1227     return res;
1228 }
1229
1230 /*! \brief Multiply SIMD integers.
1231  *
1232  * This routine is only available if \ref GMX_SIMD_HAVE_FINT32_ARITHMETICS (single)
1233  *  or \ref GMX_SIMD_HAVE_DINT32_ARITHMETICS (double) is 1.
1234  *
1235  * \param a factor1
1236  * \param b factor2
1237  * \return a*b.
1238  *
1239  * \note Only the low 32 bits are retained, so this can overflow.
1240  */
1241 static inline SimdFInt32 gmx_simdcall operator*(SimdFInt32 a, SimdFInt32 b)
1242 {
1243     SimdFInt32 res;
1244
1245     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
1246     {
1247         res.simdInternal_[i] = a.simdInternal_[i] * b.simdInternal_[i];
1248     }
1249     return res;
1250 }
1251
1252 /*! \}
1253  *
1254  * \name SIMD implementation integer (corresponding to float) comparisons, boolean, selection
1255  * \{
1256  */
1257
1258 /*! \brief Equality comparison of two integers corresponding to float values.
1259  *
1260  * Available if \ref GMX_SIMD_HAVE_FINT32_ARITHMETICS is 1.
1261  *
1262  * \param a SIMD integer1
1263  * \param b SIMD integer2
1264  * \return SIMD integer boolean with true for elements where a==b
1265  */
1266 static inline SimdFIBool gmx_simdcall operator==(SimdFInt32 a, SimdFInt32 b)
1267 {
1268     SimdFIBool res;
1269
1270     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
1271     {
1272         res.simdInternal_[i] = (a.simdInternal_[i] == b.simdInternal_[i]);
1273     }
1274     return res;
1275 }
1276
1277 /*! \brief Less-than comparison of two SIMD integers corresponding to float values.
1278  *
1279  * Available if \ref GMX_SIMD_HAVE_FINT32_ARITHMETICS is 1.
1280  *
1281  * \param a SIMD integer1
1282  * \param b SIMD integer2
1283  * \return SIMD integer boolean with true for elements where a<b
1284  */
1285 static inline SimdFIBool gmx_simdcall operator<(SimdFInt32 a, SimdFInt32 b)
1286 {
1287     SimdFIBool res;
1288
1289     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
1290     {
1291         res.simdInternal_[i] = (a.simdInternal_[i] < b.simdInternal_[i]);
1292     }
1293     return res;
1294 }
1295
1296 /*! \brief Check if any bit is set in each element
1297  *
1298  * Available if \ref GMX_SIMD_HAVE_FINT32_ARITHMETICS is 1.
1299  *
1300  * \param a SIMD integer
1301  * \return SIMD integer boolean with true for elements where any bit is set
1302  */
1303 static inline SimdFIBool gmx_simdcall testBits(SimdFInt32 a)
1304 {
1305     SimdFIBool res;
1306
1307     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
1308     {
1309         res.simdInternal_[i] = (a.simdInternal_[i] != 0);
1310     }
1311     return res;
1312 }
1313
1314 /*! \brief Logical AND on SimdFIBool.
1315  *
1316  * Available if \ref GMX_SIMD_HAVE_FINT32_ARITHMETICS is 1.
1317  *
1318  * \param a SIMD boolean 1
1319  * \param b SIMD boolean 2
1320  * \return True for elements where both a and b are true.
1321  */
1322 static inline SimdFIBool gmx_simdcall operator&&(SimdFIBool a, SimdFIBool b)
1323 {
1324     SimdFIBool res;
1325
1326     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
1327     {
1328         res.simdInternal_[i] = (a.simdInternal_[i] && b.simdInternal_[i]);
1329     }
1330     return res;
1331 }
1332
1333 /*! \brief Logical OR on SimdFIBool.
1334  *
1335  * Available if \ref GMX_SIMD_HAVE_FINT32_ARITHMETICS is 1.
1336  *
1337  * \param a SIMD boolean 1
1338  * \param b SIMD boolean 2
1339  * \return True for elements where both a and b are true.
1340  */
1341 static inline SimdFIBool gmx_simdcall operator||(SimdFIBool a, SimdFIBool b)
1342 {
1343     SimdFIBool res;
1344
1345     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
1346     {
1347         res.simdInternal_[i] = (a.simdInternal_[i] || b.simdInternal_[i]);
1348     }
1349     return res;
1350 }
1351
1352 /*! \brief Returns true if any of the boolean in x is True, otherwise 0.
1353  *
1354  * Available if \ref GMX_SIMD_HAVE_FINT32_ARITHMETICS is 1.
1355  *
1356  * The actual return value for "any true" will depend on the architecture.
1357  * Any non-zero value should be considered truth.
1358  *
1359  * \param a SIMD boolean
1360  * \return True if any of the elements in a is true, otherwise 0.
1361  */
1362 static inline bool gmx_simdcall anyTrue(SimdFIBool a)
1363 {
1364     bool res = false;
1365
1366     for (std::size_t i = 0; i < a.simdInternal_.size(); i++)
1367     {
1368         res = res || a.simdInternal_[i];
1369     }
1370     return res;
1371 }
1372
1373 /*! \brief Select from \ref gmx::SimdFInt32 variable where boolean is true.
1374  *
1375  * Available if \ref GMX_SIMD_HAVE_FINT32_ARITHMETICS is 1.
1376  *
1377  * \param a SIMD integer to select from
1378  * \param mask Boolean selector
1379  * \return Elements from a where sel is true, 0 otherwise.
1380  */
1381 static inline SimdFInt32 gmx_simdcall selectByMask(SimdFInt32 a, SimdFIBool mask)
1382 {
1383     SimdFInt32 res;
1384
1385     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
1386     {
1387         res.simdInternal_[i] = mask.simdInternal_[i] ? a.simdInternal_[i] : 0.0F;
1388     }
1389     return res;
1390 }
1391
1392 /*! \brief Select from \ref gmx::SimdFInt32 variable where boolean is false.
1393  *
1394  * Available if \ref GMX_SIMD_HAVE_FINT32_ARITHMETICS is 1.
1395  *
1396  * \param a SIMD integer to select from
1397  * \param mask Boolean selector
1398  * \return Elements from a where sel is false, 0 otherwise (sic).
1399  */
1400 static inline SimdFInt32 gmx_simdcall selectByNotMask(SimdFInt32 a, SimdFIBool mask)
1401 {
1402     SimdFInt32 res;
1403
1404     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
1405     {
1406         res.simdInternal_[i] = mask.simdInternal_[i] ? 0.0F : a.simdInternal_[i];
1407     }
1408     return res;
1409 }
1410
1411 /*! \brief Vector-blend SIMD integer selection.
1412  *
1413  * Available if \ref GMX_SIMD_HAVE_FINT32_ARITHMETICS is 1.
1414  *
1415  * \param a First source
1416  * \param b Second source
1417  * \param sel Boolean selector
1418  * \return For each element, select b if sel is true, a otherwise.
1419  */
1420 static inline SimdFInt32 gmx_simdcall blend(SimdFInt32 a, SimdFInt32 b, SimdFIBool sel)
1421 {
1422     SimdFInt32 res;
1423
1424     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
1425     {
1426         res.simdInternal_[i] = sel.simdInternal_[i] ? b.simdInternal_[i] : a.simdInternal_[i];
1427     }
1428     return res;
1429 }
1430
1431 /*! \}
1432  *
1433  * \name SIMD implementation conversion operations
1434  * \{
1435  */
1436
1437 /*! \brief Round single precision floating point to integer.
1438  *
1439  * \param a SIMD floating-point
1440  * \return SIMD integer, rounded to nearest integer.
1441  *
1442  * \note Round mode is implementation defined. The only guarantee is that it
1443  * is consistent between rounding functions (round, cvtR2I).
1444  */
1445 static inline SimdFInt32 gmx_simdcall cvtR2I(SimdFloat a)
1446 {
1447     SimdFInt32 b;
1448
1449     for (std::size_t i = 0; i < b.simdInternal_.size(); i++)
1450     {
1451         b.simdInternal_[i] = std::round(a.simdInternal_[i]);
1452     }
1453     return b;
1454 };
1455
1456 /*! \brief Truncate single precision floating point to integer.
1457  *
1458  * \param a SIMD floating-point
1459  * \return SIMD integer, truncated to nearest integer.
1460  */
1461 static inline SimdFInt32 gmx_simdcall cvttR2I(SimdFloat a)
1462 {
1463     SimdFInt32 b;
1464
1465     for (std::size_t i = 0; i < b.simdInternal_.size(); i++)
1466     {
1467         b.simdInternal_[i] = std::trunc(a.simdInternal_[i]);
1468     }
1469     return b;
1470 };
1471
1472 /*! \brief Convert integer to single precision floating point.
1473  *
1474  * \param a SIMD integer
1475  * \return SIMD floating-point
1476  */
1477 static inline SimdFloat gmx_simdcall cvtI2R(SimdFInt32 a)
1478 {
1479     SimdFloat b;
1480
1481     for (std::size_t i = 0; i < b.simdInternal_.size(); i++)
1482     {
1483         b.simdInternal_[i] = a.simdInternal_[i];
1484     }
1485     return b;
1486 };
1487
1488 /*! \brief Convert from single precision boolean to corresponding integer boolean
1489  *
1490  * \param a SIMD floating-point boolean
1491  * \return SIMD integer boolean
1492  */
1493 static inline SimdFIBool gmx_simdcall cvtB2IB(SimdFBool a)
1494 {
1495     SimdFIBool b;
1496
1497     for (std::size_t i = 0; i < b.simdInternal_.size(); i++)
1498     {
1499         b.simdInternal_[i] = a.simdInternal_[i];
1500     }
1501     return b;
1502 };
1503
1504 /*! \brief Convert from integer boolean to corresponding single precision boolean
1505  *
1506  * \param a SIMD integer boolean
1507  * \return SIMD floating-point boolean
1508  */
1509 static inline SimdFBool gmx_simdcall cvtIB2B(SimdFIBool a)
1510 {
1511     SimdFBool b;
1512
1513     for (std::size_t i = 0; i < b.simdInternal_.size(); i++)
1514     {
1515         b.simdInternal_[i] = a.simdInternal_[i];
1516     }
1517     return b;
1518 };
1519
1520 /*! \} */
1521
1522 /*! \} */
1523 /*! \endcond */
1524
1525 } // namespace gmx
1526
1527 #endif // GMX_SIMD_IMPL_REFERENCE_SIMD_FLOAT_H