src/gromacs/simd/simd_math.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015,2017,2018,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 #ifndef GMX_SIMD_SIMD_MATH_H
  36 #define GMX_SIMD_SIMD_MATH_H
  37
  38 /*! \libinternal \file
  39  *
  40  * \brief Math functions for SIMD datatypes.
  41  *
  42  * \attention This file is generic for all SIMD architectures, so you cannot
  43  * assume that any of the optional SIMD features (as defined in simd.h) are
  44  * present. In particular, this means you cannot assume support for integers,
  45  * logical operations (neither on floating-point nor integer values), shifts,
  46  * and the architecture might only have SIMD for either float or double.
  47  * Second, to keep this file clean and general, any additions to this file
  48  * must work for all possible SIMD architectures in both single and double
  49  * precision (if they support it), and you cannot make any assumptions about
  50  * SIMD width.
  51  *
  52  * \author Erik Lindahl <erik.lindahl@scilifelab.se>
  53  *
  54  * \inlibraryapi
  55  * \ingroup module_simd
  56  */
  57
  58 #include "config.h"
  59
  60 #include <cmath>
  61
  62 #include <limits>
  63
  64 #include "gromacs/math/utilities.h"
  65 #include "gromacs/simd/simd.h"
  66 #include "gromacs/utility/basedefinitions.h"
  67 #include "gromacs/utility/real.h"
  68
  69 namespace gmx
  70 {
  71
  72 #if GMX_SIMD
  73
  74 /*! \cond libapi */
  75 /*! \addtogroup module_simd */
  76 /*! \{ */
  77
  78 /*! \name Implementation accuracy settings
  79  *  \{
  80  */
  81
  82 /*! \} */
  83
  84 #if GMX_SIMD_HAVE_FLOAT
  85
  86 /*! \name Single precision SIMD math functions
  87  *
  88  *  \note In most cases you should use the real-precision functions instead.
  89  *  \{
  90  */
  91
  92 /****************************************
  93  * SINGLE PRECISION SIMD MATH FUNCTIONS *
  94  ****************************************/
  95
  96 #if !GMX_SIMD_HAVE_NATIVE_COPYSIGN_FLOAT
  97 /*! \brief Composes floating point value with the magnitude of x and the sign of y.
  98  *
  99  * \param x Values to set sign for
 100  * \param y Values used to set sign
 101  * \return  Magnitude of x, sign of y
 102  */
 103 static inline SimdFloat gmx_simdcall
 104 copysign(SimdFloat x, SimdFloat y)
 105 {
 106 #if GMX_SIMD_HAVE_LOGICAL
 107     return abs(x) | ( SimdFloat(GMX_FLOAT_NEGZERO) & y );
 108 #else
 109     return blend(abs(x), -abs(x), y < setZero());
 110 #endif
 111 }
 112 #endif
 113
 114 #if !GMX_SIMD_HAVE_NATIVE_RSQRT_ITER_FLOAT
 115 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD float.
 116  *
 117  * This is a low-level routine that should only be used by SIMD math routine
 118  * that evaluates the inverse square root.
 119  *
 120  *  \param lu Approximation of 1/sqrt(x), typically obtained from lookup.
 121  *  \param x  The reference (starting) value x for which we want 1/sqrt(x).
 122  *  \return   An improved approximation with roughly twice as many bits of accuracy.
 123  */
 124 static inline SimdFloat gmx_simdcall
 125 rsqrtIter(SimdFloat lu, SimdFloat x)
 126 {
 127     SimdFloat tmp1 = x*lu;
 128     SimdFloat tmp2 = SimdFloat(-0.5F)*lu;
 129     tmp1 = fma(tmp1, lu, SimdFloat(-3.0F));
 130     return tmp1*tmp2;
 131 }
 132 #endif
 133
 134 /*! \brief Calculate 1/sqrt(x) for SIMD float.
 135  *
 136  *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
 137  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
 138  *           For the single precision implementation this is obviously always
 139  *           true for positive values, but for double precision it adds an
 140  *           extra restriction since the first lookup step might have to be
 141  *           performed in single precision on some architectures. Note that the
 142  *           responsibility for checking falls on you - this routine does not
 143  *           check arguments.
 144  *
 145  *  \return 1/sqrt(x). Result is undefined if your argument was invalid.
 146  */
 147 static inline SimdFloat gmx_simdcall
 148 invsqrt(SimdFloat x)
 149 {
 150     SimdFloat lu = rsqrt(x);
 151 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
 152     lu = rsqrtIter(lu, x);
 153 #endif
 154 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
 155     lu = rsqrtIter(lu, x);
 156 #endif
 157 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
 158     lu = rsqrtIter(lu, x);
 159 #endif
 160     return lu;
 161 }
 162
 163 /*! \brief Calculate 1/sqrt(x) for two SIMD floats.
 164  *
 165  * \param x0  First set of arguments, x0 must be in single range (see below).
 166  * \param x1  Second set of arguments, x1 must be in single range (see below).
 167  * \param[out] out0  Result 1/sqrt(x0)
 168  * \param[out] out1  Result 1/sqrt(x1)
 169  *
 170  *  In particular for double precision we can sometimes calculate square root
 171  *  pairs slightly faster by using single precision until the very last step.
 172  *
 173  * \note Both arguments must be larger than GMX_FLOAT_MIN and smaller than
 174  *       GMX_FLOAT_MAX, i.e. within the range of single precision.
 175  *       For the single precision implementation this is obviously always
 176  *       true for positive values, but for double precision it adds an
 177  *       extra restriction since the first lookup step might have to be
 178  *       performed in single precision on some architectures. Note that the
 179  *       responsibility for checking falls on you - this routine does not
 180  *       check arguments.
 181  */
 182 static inline void gmx_simdcall
 183 invsqrtPair(SimdFloat x0,    SimdFloat x1,
 184             SimdFloat *out0, SimdFloat *out1)
 185 {
 186     *out0 = invsqrt(x0);
 187     *out1 = invsqrt(x1);
 188 }
 189
 190 #if !GMX_SIMD_HAVE_NATIVE_RCP_ITER_FLOAT
 191 /*! \brief Perform one Newton-Raphson iteration to improve 1/x for SIMD float.
 192  *
 193  * This is a low-level routine that should only be used by SIMD math routine
 194  * that evaluates the reciprocal.
 195  *
 196  *  \param lu Approximation of 1/x, typically obtained from lookup.
 197  *  \param x  The reference (starting) value x for which we want 1/x.
 198  *  \return   An improved approximation with roughly twice as many bits of accuracy.
 199  */
 200 static inline SimdFloat gmx_simdcall
 201 rcpIter(SimdFloat lu, SimdFloat x)
 202 {
 203     return lu*fnma(lu, x, SimdFloat(2.0F));
 204 }
 205 #endif
 206
 207 /*! \brief Calculate 1/x for SIMD float.
 208  *
 209  *  \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
 210  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
 211  *           For the single precision implementation this is obviously always
 212  *           true for positive values, but for double precision it adds an
 213  *           extra restriction since the first lookup step might have to be
 214  *           performed in single precision on some architectures. Note that the
 215  *           responsibility for checking falls on you - this routine does not
 216  *           check arguments.
 217  *
 218  *  \return 1/x. Result is undefined if your argument was invalid.
 219  */
 220 static inline SimdFloat gmx_simdcall
 221 inv(SimdFloat x)
 222 {
 223     SimdFloat lu = rcp(x);
 224 #if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
 225     lu = rcpIter(lu, x);
 226 #endif
 227 #if (GMX_SIMD_RCP_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
 228     lu = rcpIter(lu, x);
 229 #endif
 230 #if (GMX_SIMD_RCP_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
 231     lu = rcpIter(lu, x);
 232 #endif
 233     return lu;
 234 }
 235
 236 /*! \brief Division for SIMD floats
 237  *
 238  * \param nom    Nominator
 239  * \param denom  Denominator, with magnitude in range (GMX_FLOAT_MIN,GMX_FLOAT_MAX).
 240  *               For single precision this is equivalent to a nonzero argument,
 241  *               but in double precision it adds an extra restriction since
 242  *               the first lookup step might have to be performed in single
 243  *               precision on some architectures. Note that the responsibility
 244  *               for checking falls on you - this routine does not check arguments.
 245  *
 246  * \return nom/denom
 247  *
 248  * \note This function does not use any masking to avoid problems with
 249  *       zero values in the denominator.
 250  */
 251 static inline SimdFloat gmx_simdcall
 252 operator/(SimdFloat nom, SimdFloat denom)
 253 {
 254     return nom*inv(denom);
 255 }
 256
 257 /*! \brief Calculate 1/sqrt(x) for masked entries of SIMD float.
 258  *
 259  *  This routine only evaluates 1/sqrt(x) for elements for which mask is true.
 260  *  Illegal values in the masked-out elements will not lead to
 261  *  floating-point exceptions.
 262  *
 263  *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
 264  *           GMX_FLOAT_MAX for masked-in entries.
 265  *           See \ref invsqrt for the discussion about argument restrictions.
 266  *  \param m Mask
 267  *  \return 1/sqrt(x). Result is undefined if your argument was invalid or
 268  *          entry was not masked, and 0.0 for masked-out entries.
 269  */
 270 static inline SimdFloat
 271 maskzInvsqrt(SimdFloat x, SimdFBool m)
 272 {
 273     SimdFloat lu = maskzRsqrt(x, m);
 274 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
 275     lu = rsqrtIter(lu, x);
 276 #endif
 277 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
 278     lu = rsqrtIter(lu, x);
 279 #endif
 280 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
 281     lu = rsqrtIter(lu, x);
 282 #endif
 283     return lu;
 284 }
 285
 286 /*! \brief Calculate 1/x for SIMD float, masked version.
 287  *
 288  *  \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
 289  *           GMX_FLOAT_MAX for masked-in entries.
 290  *           See \ref invsqrt for the discussion about argument restrictions.
 291  *  \param m Mask
 292  *  \return 1/x for elements where m is true, or 0.0 for masked-out entries.
 293  */
 294 static inline SimdFloat gmx_simdcall
 295 maskzInv(SimdFloat x, SimdFBool m)
 296 {
 297     SimdFloat lu = maskzRcp(x, m);
 298 #if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
 299     lu = rcpIter(lu, x);
 300 #endif
 301 #if (GMX_SIMD_RCP_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
 302     lu = rcpIter(lu, x);
 303 #endif
 304 #if (GMX_SIMD_RCP_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
 305     lu = rcpIter(lu, x);
 306 #endif
 307     return lu;
 308 }
 309
 310 /*! \brief Calculate sqrt(x) for SIMD floats
 311  *
 312  *  \tparam opt By default, this function checks if the input value is 0.0
 313  *              and masks this to return the correct result. If you are certain
 314  *              your argument will never be zero, and you know you need to save
 315  *              every single cycle you can, you can alternatively call the
 316  *              function as sqrt<MathOptimization::Unsafe>(x).
 317  *
 318  *  \param  x   Argument that must be in range 0 <=x <= GMX_FLOAT_MAX, since the
 319  *              lookup step often has to be implemented in single precision.
 320  *              Arguments smaller than GMX_FLOAT_MIN will always lead to a zero
 321  *              result, even in double precision. If you are using the unsafe
 322  *              math optimization parameter, the argument must be in the range
 323  *              GMX_FLOAT_MIN <= x <= GMX_FLOAT_MAX.
 324  *
 325  *  \return sqrt(x). The result is undefined if the input value does not fall
 326  *          in the allowed range specified for the argument.
 327  */
 328 template <MathOptimization opt = MathOptimization::Safe>
 329 static inline SimdFloat gmx_simdcall
 330 sqrt(SimdFloat x)
 331 {
 332     if (opt == MathOptimization::Safe)
 333     {
 334         SimdFloat  res = maskzInvsqrt(x, setZero() < x);
 335         return res*x;
 336     }
 337     else
 338     {
 339         return x * invsqrt(x);
 340     }
 341 }
 342
 343 #if !GMX_SIMD_HAVE_NATIVE_LOG_FLOAT
 344 /*! \brief SIMD float log(x). This is the natural logarithm.
 345  *
 346  * \param x Argument, should be >0.
 347  * \result The natural logarithm of x. Undefined if argument is invalid.
 348  */
 349 static inline SimdFloat gmx_simdcall
 350 log(SimdFloat x)
 351 {
 352     const SimdFloat  one(1.0F);
 353     const SimdFloat  two(2.0F);
 354     const SimdFloat  invsqrt2(1.0F/std::sqrt(2.0F));
 355     const SimdFloat  corr(0.693147180559945286226764F);
 356     const SimdFloat  CL9(0.2371599674224853515625F);
 357     const SimdFloat  CL7(0.285279005765914916992188F);
 358     const SimdFloat  CL5(0.400005519390106201171875F);
 359     const SimdFloat  CL3(0.666666567325592041015625F);
 360     const SimdFloat  CL1(2.0F);
 361     SimdFloat        fExp, x2, p;
 362     SimdFBool        m;
 363     SimdFInt32       iExp;
 364
 365     x     = frexp(x, &iExp);
 366     fExp  = cvtI2R(iExp);
 367
 368     m     = x < invsqrt2;
 369     // Adjust to non-IEEE format for x<1/sqrt(2): exponent -= 1, mantissa *= 2.0
 370     fExp  = fExp - selectByMask(one, m);
 371     x     = x * blend(one, two, m);
 372
 373     x     = (x-one) * inv( x+one );
 374     x2    = x * x;
 375
 376     p     = fma(CL9, x2, CL7);
 377     p     = fma(p, x2, CL5);
 378     p     = fma(p, x2, CL3);
 379     p     = fma(p, x2, CL1);
 380     p     = fma(p, x, corr*fExp);
 381
 382     return p;
 383 }
 384 #endif
 385
 386 #if !GMX_SIMD_HAVE_NATIVE_EXP2_FLOAT
 387 /*! \brief SIMD float 2^x
 388  *
 389  * \tparam opt If this is changed from the default (safe) into the unsafe
 390  *             option, input values that would otherwise lead to zero-clamped
 391  *             results are not allowed and will lead to undefined results.
 392  *
 393  * \param x Argument. For the default (safe) function version this can be
 394  *          arbitrarily small value, but the routine might clamp the result to
 395  *          zero for arguments that would produce subnormal IEEE754-2008 results.
 396  *          This corresponds to inputs below -126 in single or -1022 in double,
 397  *          and it might overflow for arguments reaching 127 (single) or
 398  *          1023 (double). If you enable the unsafe math optimization,
 399  *          very small arguments will not necessarily be zero-clamped, but
 400  *          can produce undefined results.
 401  *
 402  * \result 2^x. The result is undefined for very large arguments that cause
 403  *          internal floating-point overflow. If unsafe optimizations are enabled,
 404  *          this is also true for very small values.
 405  *
 406  * \note    The definition range of this function is just-so-slightly smaller
 407  *          than the allowed IEEE exponents for many architectures. This is due
 408  *          to the implementation, which will hopefully improve in the future.
 409  *
 410  * \warning You cannot rely on this implementation returning inf for arguments
 411  *          that cause overflow. If you have some very large
 412  *          values and need to rely on getting a valid numerical output,
 413  *          take the minimum of your variable and the largest valid argument
 414  *          before calling this routine.
 415  */
 416 template <MathOptimization opt = MathOptimization::Safe>
 417 static inline SimdFloat gmx_simdcall
 418 exp2(SimdFloat x)
 419 {
 420     const SimdFloat  CC6(0.0001534581200287996416911311F);
 421     const SimdFloat  CC5(0.001339993121934088894618990F);
 422     const SimdFloat  CC4(0.009618488957115180159497841F);
 423     const SimdFloat  CC3(0.05550328776964726865751735F);
 424     const SimdFloat  CC2(0.2402264689063408646490722F);
 425     const SimdFloat  CC1(0.6931472057372680777553816F);
 426     const SimdFloat  one(1.0F);
 427
 428     SimdFloat        intpart;
 429     SimdFloat        fexppart;
 430     SimdFloat        p;
 431
 432     // Large negative values are valid arguments to exp2(), so there are two
 433     // things we need to account for:
 434     // 1. When the exponents reaches -127, the (biased) exponent field will be
 435     //    zero and we can no longer multiply with it. There are special IEEE
 436     //    formats to handle this range, but for now we have to accept that
 437     //    we cannot handle those arguments. If input value becomes even more
 438     //    negative, it will start to loop and we would end up with invalid
 439     //    exponents. Thus, we need to limit or mask this.
 440     // 2. For VERY large negative values, we will have problems that the
 441     //    subtraction to get the fractional part loses accuracy, and then we
 442     //    can end up with overflows in the polynomial.
 443     //
 444     // For now, we handle this by forwarding the math optimization setting to
 445     // ldexp, where the routine will return zero for very small arguments.
 446     //
 447     // However, before doing that we need to make sure we do not call cvtR2I
 448     // with an argument that is so negative it cannot be converted to an integer.
 449     if (opt == MathOptimization::Safe)
 450     {
 451         x         = max(x, SimdFloat(std::numeric_limits<std::int32_t>::lowest()));
 452     }
 453
 454     fexppart  = ldexp<opt>(one, cvtR2I(x));
 455     intpart   = round(x);
 456     x         = x - intpart;
 457
 458     p         = fma(CC6, x, CC5);
 459     p         = fma(p, x, CC4);
 460     p         = fma(p, x, CC3);
 461     p         = fma(p, x, CC2);
 462     p         = fma(p, x, CC1);
 463     p         = fma(p, x, one);
 464     x         = p * fexppart;
 465     return x;
 466 }
 467 #endif
 468
 469 #if !GMX_SIMD_HAVE_NATIVE_EXP_FLOAT
 470 /*! \brief SIMD float exp(x).
 471  *
 472  * In addition to scaling the argument for 2^x this routine correctly does
 473  * extended precision arithmetics to improve accuracy.
 474  *
 475  * \tparam opt If this is changed from the default (safe) into the unsafe
 476  *             option, input values that would otherwise lead to zero-clamped
 477  *             results are not allowed and will lead to undefined results.
 478  *
 479  * \param x Argument. For the default (safe) function version this can be
 480  *          arbitrarily small value, but the routine might clamp the result to
 481  *          zero for arguments that would produce subnormal IEEE754-2008 results.
 482  *          This corresponds to input arguments reaching
 483  *          -126*ln(2)=-87.3 in single, or -1022*ln(2)=-708.4 (double).
 484  *          Similarly, it might overflow for arguments reaching
 485  *          127*ln(2)=88.0 (single) or 1023*ln(2)=709.1 (double). If the
 486  *          unsafe math optimizations are enabled, small input values that would
 487  *          result in zero-clamped output are not allowed.
 488  *
 489  * \result exp(x). Overflowing arguments are likely to either return 0 or inf,
 490  *         depending on the underlying implementation. If unsafe optimizations
 491  *         are enabled, this is also true for very small values.
 492  *
 493  * \note    The definition range of this function is just-so-slightly smaller
 494  *          than the allowed IEEE exponents for many architectures. This is due
 495  *          to the implementation, which will hopefully improve in the future.
 496  *
 497  * \warning You cannot rely on this implementation returning inf for arguments
 498  *          that cause overflow. If you have some very large
 499  *          values and need to rely on getting a valid numerical output,
 500  *          take the minimum of your variable and the largest valid argument
 501  *          before calling this routine.
 502  */
 503 template <MathOptimization opt = MathOptimization::Safe>
 504 static inline SimdFloat gmx_simdcall
 505 exp(SimdFloat x)
 506 {
 507     const SimdFloat  argscale(1.44269504088896341F);
 508     const SimdFloat  invargscale0(-0.693145751953125F);
 509     const SimdFloat  invargscale1(-1.428606765330187045e-06F);
 510     const SimdFloat  CC4(0.00136324646882712841033936F);
 511     const SimdFloat  CC3(0.00836596917361021041870117F);
 512     const SimdFloat  CC2(0.0416710823774337768554688F);
 513     const SimdFloat  CC1(0.166665524244308471679688F);
 514     const SimdFloat  CC0(0.499999850988388061523438F);
 515     const SimdFloat  one(1.0F);
 516     SimdFloat        fexppart;
 517     SimdFloat        intpart;
 518     SimdFloat        y, p;
 519
 520     // Large negative values are valid arguments to exp2(), so there are two
 521     // things we need to account for:
 522     // 1. When the exponents reaches -127, the (biased) exponent field will be
 523     //    zero and we can no longer multiply with it. There are special IEEE
 524     //    formats to handle this range, but for now we have to accept that
 525     //    we cannot handle those arguments. If input value becomes even more
 526     //    negative, it will start to loop and we would end up with invalid
 527     //    exponents. Thus, we need to limit or mask this.
 528     // 2. For VERY large negative values, we will have problems that the
 529     //    subtraction to get the fractional part loses accuracy, and then we
 530     //    can end up with overflows in the polynomial.
 531     //
 532     // For now, we handle this by forwarding the math optimization setting to
 533     // ldexp, where the routine will return zero for very small arguments.
 534     //
 535     // However, before doing that we need to make sure we do not call cvtR2I
 536     // with an argument that is so negative it cannot be converted to an integer
 537     // after being multiplied by argscale.
 538
 539     if (opt == MathOptimization::Safe)
 540     {
 541         x         = max(x, SimdFloat(std::numeric_limits<std::int32_t>::lowest())/argscale);
 542     }
 543
 544     y         = x * argscale;
 545
 546
 547     fexppart  = ldexp<opt>(one, cvtR2I(y));
 548     intpart   = round(y);
 549
 550     // Extended precision arithmetics
 551     x         = fma(invargscale0, intpart, x);
 552     x         = fma(invargscale1, intpart, x);
 553
 554     p         = fma(CC4, x, CC3);
 555     p         = fma(p, x, CC2);
 556     p         = fma(p, x, CC1);
 557     p         = fma(p, x, CC0);
 558     p         = fma(x*x, p, x);
 559     x         = fma(p, fexppart, fexppart);
 560     return x;
 561 }
 562 #endif
 563
 564 /*! \brief SIMD float erf(x).
 565  *
 566  * \param x The value to calculate erf(x) for.
 567  * \result erf(x)
 568  *
 569  * This routine achieves very close to full precision, but we do not care about
 570  * the last bit or the subnormal result range.
 571  */
 572 static inline SimdFloat gmx_simdcall
 573 erf(SimdFloat x)
 574 {
 575     // Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1]
 576     const SimdFloat  CA6(7.853861353153693e-5F);
 577     const SimdFloat  CA5(-8.010193625184903e-4F);
 578     const SimdFloat  CA4(5.188327685732524e-3F);
 579     const SimdFloat  CA3(-2.685381193529856e-2F);
 580     const SimdFloat  CA2(1.128358514861418e-1F);
 581     const SimdFloat  CA1(-3.761262582423300e-1F);
 582     const SimdFloat  CA0(1.128379165726710F);
 583     // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2]
 584     const SimdFloat  CB9(-0.0018629930017603923F);
 585     const SimdFloat  CB8(0.003909821287598495F);
 586     const SimdFloat  CB7(-0.0052094582210355615F);
 587     const SimdFloat  CB6(0.005685614362160572F);
 588     const SimdFloat  CB5(-0.0025367682853477272F);
 589     const SimdFloat  CB4(-0.010199799682318782F);
 590     const SimdFloat  CB3(0.04369575504816542F);
 591     const SimdFloat  CB2(-0.11884063474674492F);
 592     const SimdFloat  CB1(0.2732120154030589F);
 593     const SimdFloat  CB0(0.42758357702025784F);
 594     // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19]
 595     const SimdFloat  CC10(-0.0445555913112064F);
 596     const SimdFloat  CC9(0.21376355144663348F);
 597     const SimdFloat  CC8(-0.3473187200259257F);
 598     const SimdFloat  CC7(0.016690861551248114F);
 599     const SimdFloat  CC6(0.7560973182491192F);
 600     const SimdFloat  CC5(-1.2137903600145787F);
 601     const SimdFloat  CC4(0.8411872321232948F);
 602     const SimdFloat  CC3(-0.08670413896296343F);
 603     const SimdFloat  CC2(-0.27124782687240334F);
 604     const SimdFloat  CC1(-0.0007502488047806069F);
 605     const SimdFloat  CC0(0.5642114853803148F);
 606     const SimdFloat  one(1.0F);
 607     const SimdFloat  two(2.0F);
 608
 609     SimdFloat        x2, x4, y;
 610     SimdFloat        t, t2, w, w2;
 611     SimdFloat        pA0, pA1, pB0, pB1, pC0, pC1;
 612     SimdFloat        expmx2;
 613     SimdFloat        res_erf, res_erfc, res;
 614     SimdFBool        m, maskErf;
 615
 616     // Calculate erf()
 617     x2   = x * x;
 618     x4   = x2 * x2;
 619
 620     pA0  = fma(CA6, x4, CA4);
 621     pA1  = fma(CA5, x4, CA3);
 622     pA0  = fma(pA0, x4, CA2);
 623     pA1  = fma(pA1, x4, CA1);
 624     pA0  = pA0*x4;
 625     pA0  = fma(pA1, x2, pA0);
 626     // Constant term must come last for precision reasons
 627     pA0  = pA0+CA0;
 628
 629     res_erf = x*pA0;
 630
 631     // Calculate erfc
 632     y       = abs(x);
 633     maskErf = SimdFloat(0.75F) <= y;
 634     t       = maskzInv(y, maskErf);
 635     w       = t-one;
 636     t2      = t*t;
 637     w2      = w*w;
 638
 639     // No need for a floating-point sieve here (as in erfc), since erf()
 640     // will never return values that are extremely small for large args.
 641     expmx2  = exp( -y*y );
 642
 643     pB1  = fma(CB9, w2, CB7);
 644     pB0  = fma(CB8, w2, CB6);
 645     pB1  = fma(pB1, w2, CB5);
 646     pB0  = fma(pB0, w2, CB4);
 647     pB1  = fma(pB1, w2, CB3);
 648     pB0  = fma(pB0, w2, CB2);
 649     pB1  = fma(pB1, w2, CB1);
 650     pB0  = fma(pB0, w2, CB0);
 651     pB0  = fma(pB1, w, pB0);
 652
 653     pC0  = fma(CC10, t2, CC8);
 654     pC1  = fma(CC9, t2, CC7);
 655     pC0  = fma(pC0, t2, CC6);
 656     pC1  = fma(pC1, t2, CC5);
 657     pC0  = fma(pC0, t2, CC4);
 658     pC1  = fma(pC1, t2, CC3);
 659     pC0  = fma(pC0, t2, CC2);
 660     pC1  = fma(pC1, t2, CC1);
 661
 662     pC0  = fma(pC0, t2, CC0);
 663     pC0  = fma(pC1, t, pC0);
 664     pC0  = pC0*t;
 665
 666     // Select pB0 or pC0 for erfc()
 667     m        = two < y;
 668     res_erfc = blend(pB0, pC0, m);
 669     res_erfc = res_erfc * expmx2;
 670
 671     // erfc(x<0) = 2-erfc(|x|)
 672     m        = x < setZero();
 673     res_erfc = blend(res_erfc, two-res_erfc, m);
 674
 675     // Select erf() or erfc()
 676     res  = blend(res_erf, one-res_erfc, maskErf);
 677
 678     return res;
 679 }
 680
 681 /*! \brief SIMD float erfc(x).
 682  *
 683  * \param x The value to calculate erfc(x) for.
 684  * \result erfc(x)
 685  *
 686  * This routine achieves full precision (bar the last bit) over most of the
 687  * input range, but for large arguments where the result is getting close
 688  * to the minimum representable numbers we accept slightly larger errors
 689  * (think results that are in the ballpark of 10^-30 for single precision)
 690  * since that is not relevant for MD.
 691  */
 692 static inline SimdFloat gmx_simdcall
 693 erfc(SimdFloat x)
 694 {
 695     // Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1]
 696     const SimdFloat  CA6(7.853861353153693e-5F);
 697     const SimdFloat  CA5(-8.010193625184903e-4F);
 698     const SimdFloat  CA4(5.188327685732524e-3F);
 699     const SimdFloat  CA3(-2.685381193529856e-2F);
 700     const SimdFloat  CA2(1.128358514861418e-1F);
 701     const SimdFloat  CA1(-3.761262582423300e-1F);
 702     const SimdFloat  CA0(1.128379165726710F);
 703     // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2]
 704     const SimdFloat  CB9(-0.0018629930017603923F);
 705     const SimdFloat  CB8(0.003909821287598495F);
 706     const SimdFloat  CB7(-0.0052094582210355615F);
 707     const SimdFloat  CB6(0.005685614362160572F);
 708     const SimdFloat  CB5(-0.0025367682853477272F);
 709     const SimdFloat  CB4(-0.010199799682318782F);
 710     const SimdFloat  CB3(0.04369575504816542F);
 711     const SimdFloat  CB2(-0.11884063474674492F);
 712     const SimdFloat  CB1(0.2732120154030589F);
 713     const SimdFloat  CB0(0.42758357702025784F);
 714     // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19]
 715     const SimdFloat  CC10(-0.0445555913112064F);
 716     const SimdFloat  CC9(0.21376355144663348F);
 717     const SimdFloat  CC8(-0.3473187200259257F);
 718     const SimdFloat  CC7(0.016690861551248114F);
 719     const SimdFloat  CC6(0.7560973182491192F);
 720     const SimdFloat  CC5(-1.2137903600145787F);
 721     const SimdFloat  CC4(0.8411872321232948F);
 722     const SimdFloat  CC3(-0.08670413896296343F);
 723     const SimdFloat  CC2(-0.27124782687240334F);
 724     const SimdFloat  CC1(-0.0007502488047806069F);
 725     const SimdFloat  CC0(0.5642114853803148F);
 726     // Coefficients for expansion of exp(x) in [0,0.1]
 727     // CD0 and CD1 are both 1.0, so no need to declare them separately
 728     const SimdFloat  CD2(0.5000066608081202F);
 729     const SimdFloat  CD3(0.1664795422874624F);
 730     const SimdFloat  CD4(0.04379839977652482F);
 731     const SimdFloat  one(1.0F);
 732     const SimdFloat  two(2.0F);
 733
 734     /* We need to use a small trick here, since we cannot assume all SIMD
 735      * architectures support integers, and the flag we want (0xfffff000) would
 736      * evaluate to NaN (i.e., it cannot be expressed as a floating-point num).
 737      * Instead, we represent the flags 0xf0f0f000 and 0x0f0f0000 as valid
 738      * fp numbers, and perform a logical or. Since the expression is constant,
 739      * we can at least hope it is evaluated at compile-time.
 740      */
 741 #if GMX_SIMD_HAVE_LOGICAL
 742     const SimdFloat         sieve(SimdFloat(-5.965323564e+29F) | SimdFloat(7.05044434e-30F));
 743 #else
 744     const int               isieve   = 0xFFFFF000;
 745     alignas(GMX_SIMD_ALIGNMENT) float  mem[GMX_SIMD_FLOAT_WIDTH];
 746
 747     union {
 748         float f; int i;
 749     } conv;
 750     int                     i;
 751 #endif
 752
 753     SimdFloat        x2, x4, y;
 754     SimdFloat        q, z, t, t2, w, w2;
 755     SimdFloat        pA0, pA1, pB0, pB1, pC0, pC1;
 756     SimdFloat        expmx2, corr;
 757     SimdFloat        res_erf, res_erfc, res;
 758     SimdFBool        m, msk_erf;
 759
 760     // Calculate erf()
 761     x2     = x * x;
 762     x4     = x2 * x2;
 763
 764     pA0  = fma(CA6, x4, CA4);
 765     pA1  = fma(CA5, x4, CA3);
 766     pA0  = fma(pA0, x4, CA2);
 767     pA1  = fma(pA1, x4, CA1);
 768     pA1  = pA1 * x2;
 769     pA0  = fma(pA0, x4, pA1);
 770     // Constant term must come last for precision reasons
 771     pA0  = pA0 + CA0;
 772
 773     res_erf = x * pA0;
 774
 775     // Calculate erfc
 776     y       = abs(x);
 777     msk_erf = SimdFloat(0.75F) <= y;
 778     t       = maskzInv(y, msk_erf);
 779     w       = t - one;
 780     t2      = t * t;
 781     w2      = w * w;
 782     /*
 783      * We cannot simply calculate exp(-y2) directly in single precision, since
 784      * that will lose a couple of bits of precision due to the multiplication.
 785      * Instead, we introduce y=z+w, where the last 12 bits of precision are in w.
 786      * Then we get exp(-y2) = exp(-z2)*exp((z-y)*(z+y)).
 787      *
 788      * The only drawback with this is that it requires TWO separate exponential
 789      * evaluations, which would be horrible performance-wise. However, the argument
 790      * for the second exp() call is always small, so there we simply use a
 791      * low-order minimax expansion on [0,0.1].
 792      *
 793      * However, this neat idea requires support for logical ops (and) on
 794      * FP numbers, which some vendors decided isn't necessary in their SIMD
 795      * instruction sets (Hi, IBM VSX!). In principle we could use some tricks
 796      * in double, but we still need memory as a backup when that is not available,
 797      * and this case is rare enough that we go directly there...
 798      */
 799 #if GMX_SIMD_HAVE_LOGICAL
 800     z       = y & sieve;
 801 #else
 802     store(mem, y);
 803     for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
 804     {
 805         conv.f  = mem[i];
 806         conv.i  = conv.i & isieve;
 807         mem[i]  = conv.f;
 808     }
 809     z = load<SimdFloat>(mem);
 810 #endif
 811     q       = (z-y) * (z+y);
 812     corr    = fma(CD4, q, CD3);
 813     corr    = fma(corr, q, CD2);
 814     corr    = fma(corr, q, one);
 815     corr    = fma(corr, q, one);
 816
 817     expmx2  = exp( -z*z );
 818     expmx2  = expmx2 * corr;
 819
 820     pB1  = fma(CB9, w2, CB7);
 821     pB0  = fma(CB8, w2, CB6);
 822     pB1  = fma(pB1, w2, CB5);
 823     pB0  = fma(pB0, w2, CB4);
 824     pB1  = fma(pB1, w2, CB3);
 825     pB0  = fma(pB0, w2, CB2);
 826     pB1  = fma(pB1, w2, CB1);
 827     pB0  = fma(pB0, w2, CB0);
 828     pB0  = fma(pB1, w, pB0);
 829
 830     pC0  = fma(CC10, t2, CC8);
 831     pC1  = fma(CC9, t2, CC7);
 832     pC0  = fma(pC0, t2, CC6);
 833     pC1  = fma(pC1, t2, CC5);
 834     pC0  = fma(pC0, t2, CC4);
 835     pC1  = fma(pC1, t2, CC3);
 836     pC0  = fma(pC0, t2, CC2);
 837     pC1  = fma(pC1, t2, CC1);
 838
 839     pC0  = fma(pC0, t2, CC0);
 840     pC0  = fma(pC1, t, pC0);
 841     pC0  = pC0 * t;
 842
 843     // Select pB0 or pC0 for erfc()
 844     m        = two < y;
 845     res_erfc = blend(pB0, pC0, m);
 846     res_erfc = res_erfc * expmx2;
 847
 848     // erfc(x<0) = 2-erfc(|x|)
 849     m        = x < setZero();
 850     res_erfc = blend(res_erfc, two-res_erfc, m);
 851
 852     // Select erf() or erfc()
 853     res  = blend(one-res_erf, res_erfc, msk_erf);
 854
 855     return res;
 856 }
 857
 858 /*! \brief SIMD float sin \& cos.
 859  *
 860  * \param x The argument to evaluate sin/cos for
 861  * \param[out] sinval Sin(x)
 862  * \param[out] cosval Cos(x)
 863  *
 864  * This version achieves close to machine precision, but for very large
 865  * magnitudes of the argument we inherently begin to lose accuracy due to the
 866  * argument reduction, despite using extended precision arithmetics internally.
 867  */
 868 static inline void gmx_simdcall
 869 sincos(SimdFloat x, SimdFloat *sinval, SimdFloat *cosval)
 870 {
 871     // Constants to subtract Pi/4*x from y while minimizing precision loss
 872     const SimdFloat  argred0(-1.5703125);
 873     const SimdFloat  argred1(-4.83751296997070312500e-04F);
 874     const SimdFloat  argred2(-7.54953362047672271729e-08F);
 875     const SimdFloat  argred3(-2.56334406825708960298e-12F);
 876     const SimdFloat  two_over_pi(static_cast<float>(2.0F/M_PI));
 877     const SimdFloat  const_sin2(-1.9515295891e-4F);
 878     const SimdFloat  const_sin1( 8.3321608736e-3F);
 879     const SimdFloat  const_sin0(-1.6666654611e-1F);
 880     const SimdFloat  const_cos2( 2.443315711809948e-5F);
 881     const SimdFloat  const_cos1(-1.388731625493765e-3F);
 882     const SimdFloat  const_cos0( 4.166664568298827e-2F);
 883     const SimdFloat  half(0.5F);
 884     const SimdFloat  one(1.0F);
 885     SimdFloat        ssign, csign;
 886     SimdFloat        x2, y, z, psin, pcos, sss, ccc;
 887     SimdFBool        m;
 888
 889 #if GMX_SIMD_HAVE_FINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
 890     const SimdFInt32 ione(1);
 891     const SimdFInt32 itwo(2);
 892     SimdFInt32       iy;
 893
 894     z       = x * two_over_pi;
 895     iy      = cvtR2I(z);
 896     y       = round(z);
 897
 898     m       = cvtIB2B((iy & ione) == SimdFInt32(0));
 899     ssign   = selectByMask(SimdFloat(GMX_FLOAT_NEGZERO), cvtIB2B((iy & itwo) == itwo));
 900     csign   = selectByMask(SimdFloat(GMX_FLOAT_NEGZERO), cvtIB2B(((iy+ione) & itwo) == itwo));
 901 #else
 902     const SimdFloat  quarter(0.25f);
 903     const SimdFloat  minusquarter(-0.25f);
 904     SimdFloat        q;
 905     SimdFBool        m1, m2, m3;
 906
 907     /* The most obvious way to find the arguments quadrant in the unit circle
 908      * to calculate the sign is to use integer arithmetic, but that is not
 909      * present in all SIMD implementations. As an alternative, we have devised a
 910      * pure floating-point algorithm that uses truncation for argument reduction
 911      * so that we get a new value 0<=q<1 over the unit circle, and then
 912      * do floating-point comparisons with fractions. This is likely to be
 913      * slightly slower (~10%) due to the longer latencies of floating-point, so
 914      * we only use it when integer SIMD arithmetic is not present.
 915      */
 916     ssign   = x;
 917     x       = abs(x);
 918     // It is critical that half-way cases are rounded down
 919     z       = fma(x, two_over_pi, half);
 920     y       = trunc(z);
 921     q       = z * quarter;
 922     q       = q - trunc(q);
 923     /* z now starts at 0.0 for x=-pi/4 (although neg. values cannot occur), and
 924      * then increased by 1.0 as x increases by 2*Pi, when it resets to 0.0.
 925      * This removes the 2*Pi periodicity without using any integer arithmetic.
 926      * First check if y had the value 2 or 3, set csign if true.
 927      */
 928     q       = q - half;
 929     /* If we have logical operations we can work directly on the signbit, which
 930      * saves instructions. Otherwise we need to represent signs as +1.0/-1.0.
 931      * Thus, if you are altering defines to debug alternative code paths, the
 932      * two GMX_SIMD_HAVE_LOGICAL sections in this routine must either both be
 933      * active or inactive - you will get errors if only one is used.
 934      */
 935 #    if GMX_SIMD_HAVE_LOGICAL
 936     ssign   = ssign & SimdFloat(GMX_FLOAT_NEGZERO);
 937     csign   = andNot(q, SimdFloat(GMX_FLOAT_NEGZERO));
 938     ssign   = ssign ^ csign;
 939 #    else
 940     ssign   = copysign(SimdFloat(1.0f), ssign);
 941     csign   = copysign(SimdFloat(1.0f), q);
 942     csign   = -csign;
 943     ssign   = ssign * csign;    // swap ssign if csign was set.
 944 #    endif
 945     // Check if y had value 1 or 3 (remember we subtracted 0.5 from q)
 946     m1      = (q < minusquarter);
 947     m2      = (setZero() <= q);
 948     m3      = (q < quarter);
 949     m2      = m2 && m3;
 950     m       = m1 || m2;
 951     // where mask is FALSE, swap sign.
 952     csign   = csign * blend(SimdFloat(-1.0f), one, m);
 953 #endif
 954     x       = fma(y, argred0, x);
 955     x       = fma(y, argred1, x);
 956     x       = fma(y, argred2, x);
 957     x       = fma(y, argred3, x);
 958     x2      = x * x;
 959
 960     psin    = fma(const_sin2, x2, const_sin1);
 961     psin    = fma(psin, x2, const_sin0);
 962     psin    = fma(psin, x * x2, x);
 963     pcos    = fma(const_cos2, x2, const_cos1);
 964     pcos    = fma(pcos, x2, const_cos0);
 965     pcos    = fms(pcos, x2, half);
 966     pcos    = fma(pcos, x2, one);
 967
 968     sss     = blend(pcos, psin, m);
 969     ccc     = blend(psin, pcos, m);
 970     // See comment for GMX_SIMD_HAVE_LOGICAL section above.
 971 #if GMX_SIMD_HAVE_LOGICAL
 972     *sinval = sss ^ ssign;
 973     *cosval = ccc ^ csign;
 974 #else
 975     *sinval = sss * ssign;
 976     *cosval = ccc * csign;
 977 #endif
 978 }
 979
 980 /*! \brief SIMD float sin(x).
 981  *
 982  * \param x The argument to evaluate sin for
 983  * \result Sin(x)
 984  *
 985  * \attention Do NOT call both sin & cos if you need both results, since each of them
 986  * will then call \ref sincos and waste a factor 2 in performance.
 987  */
 988 static inline SimdFloat gmx_simdcall
 989 sin(SimdFloat x)
 990 {
 991     SimdFloat s, c;
 992     sincos(x, &s, &c);
 993     return s;
 994 }
 995
 996 /*! \brief SIMD float cos(x).
 997  *
 998  * \param x The argument to evaluate cos for
 999  * \result Cos(x)
1000  *
1001  * \attention Do NOT call both sin & cos if you need both results, since each of them
1002  * will then call \ref sincos and waste a factor 2 in performance.
1003  */
1004 static inline SimdFloat gmx_simdcall
1005 cos(SimdFloat x)
1006 {
1007     SimdFloat s, c;
1008     sincos(x, &s, &c);
1009     return c;
1010 }
1011
1012 /*! \brief SIMD float tan(x).
1013  *
1014  * \param x The argument to evaluate tan for
1015  * \result Tan(x)
1016  */
1017 static inline SimdFloat gmx_simdcall
1018 tan(SimdFloat x)
1019 {
1020     const SimdFloat  argred0(-1.5703125);
1021     const SimdFloat  argred1(-4.83751296997070312500e-04F);
1022     const SimdFloat  argred2(-7.54953362047672271729e-08F);
1023     const SimdFloat  argred3(-2.56334406825708960298e-12F);
1024     const SimdFloat  two_over_pi(static_cast<float>(2.0F/M_PI));
1025     const SimdFloat  CT6(0.009498288995810566122993911);
1026     const SimdFloat  CT5(0.002895755790837379295226923);
1027     const SimdFloat  CT4(0.02460087336161924491836265);
1028     const SimdFloat  CT3(0.05334912882656359828045988);
1029     const SimdFloat  CT2(0.1333989091464957704418495);
1030     const SimdFloat  CT1(0.3333307599244198227797507);
1031
1032     SimdFloat        x2, p, y, z;
1033     SimdFBool        m;
1034
1035 #if GMX_SIMD_HAVE_FINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
1036     SimdFInt32  iy;
1037     SimdFInt32  ione(1);
1038
1039     z       = x * two_over_pi;
1040     iy      = cvtR2I(z);
1041     y       = round(z);
1042     m       = cvtIB2B((iy & ione) == ione);
1043
1044     x       = fma(y, argred0, x);
1045     x       = fma(y, argred1, x);
1046     x       = fma(y, argred2, x);
1047     x       = fma(y, argred3, x);
1048     x       = selectByMask(SimdFloat(GMX_FLOAT_NEGZERO), m) ^ x;
1049 #else
1050     const SimdFloat  quarter(0.25f);
1051     const SimdFloat  half(0.5f);
1052     const SimdFloat  threequarter(0.75f);
1053     SimdFloat        w, q;
1054     SimdFBool        m1, m2, m3;
1055
1056     w       = abs(x);
1057     z       = fma(w, two_over_pi, half);
1058     y       = trunc(z);
1059     q       = z * quarter;
1060     q       = q - trunc(q);
1061     m1      = quarter <= q;
1062     m2      = q < half;
1063     m3      = threequarter <= q;
1064     m1      = m1 && m2;
1065     m       = m1 || m3;
1066     w       = fma(y, argred0, w);
1067     w       = fma(y, argred1, w);
1068     w       = fma(y, argred2, w);
1069     w       = fma(y, argred3, w);
1070     w       = blend(w, -w, m);
1071     x       = w * copysign( SimdFloat(1.0), x );
1072 #endif
1073     x2      = x * x;
1074     p       = fma(CT6, x2, CT5);
1075     p       = fma(p, x2, CT4);
1076     p       = fma(p, x2, CT3);
1077     p       = fma(p, x2, CT2);
1078     p       = fma(p, x2, CT1);
1079     p       = fma(x2 * p, x, x);
1080
1081     p       = blend( p, maskzInv(p, m), m);
1082     return p;
1083 }
1084
1085 /*! \brief SIMD float asin(x).
1086  *
1087  * \param x The argument to evaluate asin for
1088  * \result Asin(x)
1089  */
1090 static inline SimdFloat gmx_simdcall
1091 asin(SimdFloat x)
1092 {
1093     const SimdFloat limitlow(1e-4F);
1094     const SimdFloat half(0.5F);
1095     const SimdFloat one(1.0F);
1096     const SimdFloat halfpi(static_cast<float>(M_PI/2.0F));
1097     const SimdFloat CC5(4.2163199048E-2F);
1098     const SimdFloat CC4(2.4181311049E-2F);
1099     const SimdFloat CC3(4.5470025998E-2F);
1100     const SimdFloat CC2(7.4953002686E-2F);
1101     const SimdFloat CC1(1.6666752422E-1F);
1102     SimdFloat       xabs;
1103     SimdFloat       z, z1, z2, q, q1, q2;
1104     SimdFloat       pA, pB;
1105     SimdFBool       m, m2;
1106
1107     xabs  = abs(x);
1108     m     = half < xabs;
1109     z1    = half * (one-xabs);
1110     m2    = xabs < one;
1111     q1    = z1 * maskzInvsqrt(z1, m2);
1112     q2    = xabs;
1113     z2    = q2 * q2;
1114     z     = blend(z2, z1, m);
1115     q     = blend(q2, q1, m);
1116
1117     z2    = z * z;
1118     pA    = fma(CC5, z2, CC3);
1119     pB    = fma(CC4, z2, CC2);
1120     pA    = fma(pA, z2, CC1);
1121     pA    = pA * z;
1122     z     = fma(pB, z2, pA);
1123     z     = fma(z, q, q);
1124     q2    = halfpi - z;
1125     q2    = q2 - z;
1126     z     = blend(z, q2, m);
1127
1128     m     = limitlow < xabs;
1129     z     = blend( xabs, z, m );
1130     z     = copysign(z, x);
1131
1132     return z;
1133 }
1134
1135 /*! \brief SIMD float acos(x).
1136  *
1137  * \param x The argument to evaluate acos for
1138  * \result Acos(x)
1139  */
1140 static inline SimdFloat gmx_simdcall
1141 acos(SimdFloat x)
1142 {
1143     const SimdFloat one(1.0F);
1144     const SimdFloat half(0.5F);
1145     const SimdFloat pi(static_cast<float>(M_PI));
1146     const SimdFloat halfpi(static_cast<float>(M_PI/2.0F));
1147     SimdFloat       xabs;
1148     SimdFloat       z, z1, z2, z3;
1149     SimdFBool       m1, m2, m3;
1150
1151     xabs  = abs(x);
1152     m1    = half < xabs;
1153     m2    = setZero() < x;
1154
1155     z     = fnma(half, xabs, half);
1156     m3    = xabs <  one;
1157     z     = z * maskzInvsqrt(z, m3);
1158     z     = blend(x, z, m1);
1159     z     = asin(z);
1160
1161     z2    = z + z;
1162     z1    = pi - z2;
1163     z3    = halfpi - z;
1164     z     = blend(z1, z2, m2);
1165     z     = blend(z3, z, m1);
1166
1167     return z;
1168 }
1169
1170 /*! \brief SIMD float asin(x).
1171  *
1172  * \param x The argument to evaluate atan for
1173  * \result Atan(x), same argument/value range as standard math library.
1174  */
1175 static inline SimdFloat gmx_simdcall
1176 atan(SimdFloat x)
1177 {
1178     const SimdFloat halfpi(static_cast<float>(M_PI/2.0F));
1179     const SimdFloat CA17(0.002823638962581753730774F);
1180     const SimdFloat CA15(-0.01595690287649631500244F);
1181     const SimdFloat CA13(0.04250498861074447631836F);
1182     const SimdFloat CA11(-0.07489009201526641845703F);
1183     const SimdFloat CA9 (0.1063479334115982055664F);
1184     const SimdFloat CA7 (-0.1420273631811141967773F);
1185     const SimdFloat CA5 (0.1999269574880599975585F);
1186     const SimdFloat CA3 (-0.3333310186862945556640F);
1187     const SimdFloat one (1.0F);
1188     SimdFloat       x2, x3, x4, pA, pB;
1189     SimdFBool       m, m2;
1190
1191     m     = x < setZero();
1192     x     = abs(x);
1193     m2    = one < x;
1194     x     = blend(x, maskzInv(x, m2), m2);
1195
1196     x2    = x * x;
1197     x3    = x2 * x;
1198     x4    = x2 * x2;
1199     pA    = fma(CA17, x4, CA13);
1200     pB    = fma(CA15, x4, CA11);
1201     pA    = fma(pA, x4, CA9);
1202     pB    = fma(pB, x4, CA7);
1203     pA    = fma(pA, x4, CA5);
1204     pB    = fma(pB, x4, CA3);
1205     pA    = fma(pA, x2, pB);
1206     pA    = fma(pA, x3, x);
1207
1208     pA    = blend(pA, halfpi-pA, m2);
1209     pA    = blend(pA, -pA, m);
1210
1211     return pA;
1212 }
1213
1214 /*! \brief SIMD float atan2(y,x).
1215  *
1216  * \param y Y component of vector, any quartile
1217  * \param x X component of vector, any quartile
1218  * \result Atan(y,x), same argument/value range as standard math library.
1219  *
1220  * \note This routine should provide correct results for all finite
1221  * non-zero or positive-zero arguments. However, negative zero arguments will
1222  * be treated as positive zero, which means the return value will deviate from
1223  * the standard math library atan2(y,x) for those cases. That should not be
1224  * of any concern in Gromacs, and in particular it will not affect calculations
1225  * of angles from vectors.
1226  */
1227 static inline SimdFloat gmx_simdcall
1228 atan2(SimdFloat y, SimdFloat x)
1229 {
1230     const SimdFloat pi(static_cast<float>(M_PI));
1231     const SimdFloat halfpi(static_cast<float>(M_PI/2.0));
1232     SimdFloat       xinv, p, aoffset;
1233     SimdFBool       mask_xnz, mask_ynz, mask_xlt0, mask_ylt0;
1234
1235     mask_xnz  = x != setZero();
1236     mask_ynz  = y != setZero();
1237     mask_xlt0 = x < setZero();
1238     mask_ylt0 = y < setZero();
1239
1240     aoffset   = selectByNotMask(halfpi, mask_xnz);
1241     aoffset   = selectByMask(aoffset, mask_ynz);
1242
1243     aoffset   = blend(aoffset, pi, mask_xlt0);
1244     aoffset   = blend(aoffset, -aoffset, mask_ylt0);
1245
1246     xinv      = maskzInv(x, mask_xnz);
1247     p         = y * xinv;
1248     p         = atan(p);
1249     p         = p + aoffset;
1250
1251     return p;
1252 }
1253
1254 /*! \brief Calculate the force correction due to PME analytically in SIMD float.
1255  *
1256  * \param z2 \f$(r \beta)^2\f$ - see below for details.
1257  * \result Correction factor to coulomb force - see below for details.
1258  *
1259  * This routine is meant to enable analytical evaluation of the
1260  * direct-space PME electrostatic force to avoid tables.
1261  *
1262  * The direct-space potential should be \f$ \mbox{erfc}(\beta r)/r\f$, but there
1263  * are some problems evaluating that:
1264  *
1265  * First, the error function is difficult (read: expensive) to
1266  * approxmiate accurately for intermediate to large arguments, and
1267  * this happens already in ranges of \f$(\beta r)\f$ that occur in simulations.
1268  * Second, we now try to avoid calculating potentials in Gromacs but
1269  * use forces directly.
1270  *
1271  * We can simply things slight by noting that the PME part is really
1272  * a correction to the normal Coulomb force since \f$\mbox{erfc}(z)=1-\mbox{erf}(z)\f$, i.e.
1273  * \f[
1274  * V = \frac{1}{r} - \frac{\mbox{erf}(\beta r)}{r}
1275  * \f]
1276  * The first term we already have from the inverse square root, so
1277  * that we can leave out of this routine.
1278  *
1279  * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
1280  * the argument \f$beta r\f$ will be in the range 0.15 to ~4, which is
1281  * the range used for the minimax fit. Use your favorite plotting program
1282  * to realize how well-behaved \f$\frac{\mbox{erf}(z)}{z}\f$ is in this range!
1283  *
1284  * We approximate \f$f(z)=\mbox{erf}(z)/z\f$ with a rational minimax polynomial.
1285  * However, it turns out it is more efficient to approximate \f$f(z)/z\f$ and
1286  * then only use even powers. This is another minor optimization, since
1287  * we actually \a want \f$f(z)/z\f$, because it is going to be multiplied by
1288  * the vector between the two atoms to get the vectorial force. The
1289  * fastest flops are the ones we can avoid calculating!
1290  *
1291  * So, here's how it should be used:
1292  *
1293  * 1. Calculate \f$r^2\f$.
1294  * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=(\beta r)^2\f$.
1295  * 3. Evaluate this routine with \f$z^2\f$ as the argument.
1296  * 4. The return value is the expression:
1297  *
1298  * \f[
1299  *    \frac{2 \exp{-z^2}}{\sqrt{\pi} z^2}-\frac{\mbox{erf}(z)}{z^3}
1300  * \f]
1301  *
1302  * 5. Multiply the entire expression by \f$\beta^3\f$. This will get you
1303  *
1304  *  \f[
1305  *    \frac{2 \beta^3 \exp(-z^2)}{\sqrt{\pi} z^2} - \frac{\beta^3 \mbox{erf}(z)}{z^3}
1306  *  \f]
1307  *
1308  *    or, switching back to \f$r\f$ (since \f$z=r \beta\f$):
1309  *
1310  *  \f[
1311  *    \frac{2 \beta \exp(-r^2 \beta^2)}{\sqrt{\pi} r^2} - \frac{\mbox{erf}(r \beta)}{r^3}
1312  *  \f]
1313  *
1314  *    With a bit of math exercise you should be able to confirm that
1315  *    this is exactly
1316  *
1317  *  \f[
1318  *   \frac{\frac{d}{dr}\left( \frac{\mbox{erf}(\beta r)}{r} \right)}{r}
1319  *  \f]
1320  *
1321  * 6. Add the result to \f$r^{-3}\f$, multiply by the product of the charges,
1322  *    and you have your force (divided by \f$r\f$). A final multiplication
1323  *    with the vector connecting the two particles and you have your
1324  *    vectorial force to add to the particles.
1325  *
1326  * This approximation achieves an error slightly lower than 1e-6
1327  * in single precision and 1e-11 in double precision
1328  * for arguments smaller than 16 (\f$\beta r \leq 4 \f$);
1329  * when added to \f$1/r\f$ the error will be insignificant.
1330  * For \f$\beta r \geq 7206\f$ the return value can be inf or NaN.
1331  *
1332  */
1333 static inline SimdFloat gmx_simdcall
1334 pmeForceCorrection(SimdFloat z2)
1335 {
1336     const SimdFloat  FN6(-1.7357322914161492954e-8F);
1337     const SimdFloat  FN5(1.4703624142580877519e-6F);
1338     const SimdFloat  FN4(-0.000053401640219807709149F);
1339     const SimdFloat  FN3(0.0010054721316683106153F);
1340     const SimdFloat  FN2(-0.019278317264888380590F);
1341     const SimdFloat  FN1(0.069670166153766424023F);
1342     const SimdFloat  FN0(-0.75225204789749321333F);
1343
1344     const SimdFloat  FD4(0.0011193462567257629232F);
1345     const SimdFloat  FD3(0.014866955030185295499F);
1346     const SimdFloat  FD2(0.11583842382862377919F);
1347     const SimdFloat  FD1(0.50736591960530292870F);
1348     const SimdFloat  FD0(1.0F);
1349
1350     SimdFloat        z4;
1351     SimdFloat        polyFN0, polyFN1, polyFD0, polyFD1;
1352
1353     z4             = z2 * z2;
1354
1355     polyFD0        = fma(FD4, z4, FD2);
1356     polyFD1        = fma(FD3, z4, FD1);
1357     polyFD0        = fma(polyFD0, z4, FD0);
1358     polyFD0        = fma(polyFD1, z2, polyFD0);
1359
1360     polyFD0        = inv(polyFD0);
1361
1362     polyFN0        = fma(FN6, z4, FN4);
1363     polyFN1        = fma(FN5, z4, FN3);
1364     polyFN0        = fma(polyFN0, z4, FN2);
1365     polyFN1        = fma(polyFN1, z4, FN1);
1366     polyFN0        = fma(polyFN0, z4, FN0);
1367     polyFN0        = fma(polyFN1, z2, polyFN0);
1368
1369     return polyFN0 * polyFD0;
1370 }
1371
1372
1373
1374 /*! \brief Calculate the potential correction due to PME analytically in SIMD float.
1375  *
1376  * \param z2 \f$(r \beta)^2\f$ - see below for details.
1377  * \result Correction factor to coulomb potential - see below for details.
1378  *
1379  * See \ref pmeForceCorrection for details about the approximation.
1380  *
1381  * This routine calculates \f$\mbox{erf}(z)/z\f$, although you should provide \f$z^2\f$
1382  * as the input argument.
1383  *
1384  * Here's how it should be used:
1385  *
1386  * 1. Calculate \f$r^2\f$.
1387  * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=\beta^2*r^2\f$.
1388  * 3. Evaluate this routine with z^2 as the argument.
1389  * 4. The return value is the expression:
1390  *
1391  *  \f[
1392  *   \frac{\mbox{erf}(z)}{z}
1393  *  \f]
1394  *
1395  * 5. Multiply the entire expression by beta and switching back to \f$r\f$ (since \f$z=r \beta\f$):
1396  *
1397  *  \f[
1398  *    \frac{\mbox{erf}(r \beta)}{r}
1399  *  \f]
1400  *
1401  * 6. Subtract the result from \f$1/r\f$, multiply by the product of the charges,
1402  *    and you have your potential.
1403  *
1404  * This approximation achieves an error slightly lower than 1e-6
1405  * in single precision and 4e-11 in double precision
1406  * for arguments smaller than 16 (\f$ 0.15 \leq \beta r \leq 4 \f$);
1407  * for \f$ \beta r \leq 0.15\f$ the error can be twice as high;
1408  * when added to \f$1/r\f$ the error will be insignificant.
1409  * For \f$\beta r \geq 7142\f$ the return value can be inf or NaN.
1410  */
1411 static inline SimdFloat gmx_simdcall
1412 pmePotentialCorrection(SimdFloat z2)
1413 {
1414     const SimdFloat  VN6(1.9296833005951166339e-8F);
1415     const SimdFloat  VN5(-1.4213390571557850962e-6F);
1416     const SimdFloat  VN4(0.000041603292906656984871F);
1417     const SimdFloat  VN3(-0.00013134036773265025626F);
1418     const SimdFloat  VN2(0.038657983986041781264F);
1419     const SimdFloat  VN1(0.11285044772717598220F);
1420     const SimdFloat  VN0(1.1283802385263030286F);
1421
1422     const SimdFloat  VD3(0.0066752224023576045451F);
1423     const SimdFloat  VD2(0.078647795836373922256F);
1424     const SimdFloat  VD1(0.43336185284710920150F);
1425     const SimdFloat  VD0(1.0F);
1426
1427     SimdFloat        z4;
1428     SimdFloat        polyVN0, polyVN1, polyVD0, polyVD1;
1429
1430     z4             = z2 * z2;
1431
1432     polyVD1        = fma(VD3, z4, VD1);
1433     polyVD0        = fma(VD2, z4, VD0);
1434     polyVD0        = fma(polyVD1, z2, polyVD0);
1435
1436     polyVD0        = inv(polyVD0);
1437
1438     polyVN0        = fma(VN6, z4, VN4);
1439     polyVN1        = fma(VN5, z4, VN3);
1440     polyVN0        = fma(polyVN0, z4, VN2);
1441     polyVN1        = fma(polyVN1, z4, VN1);
1442     polyVN0        = fma(polyVN0, z4, VN0);
1443     polyVN0        = fma(polyVN1, z2, polyVN0);
1444
1445     return polyVN0 * polyVD0;
1446 }
1447 #endif
1448
1449 /*! \} */
1450
1451 #if GMX_SIMD_HAVE_DOUBLE
1452
1453
1454 /*! \name Double precision SIMD math functions
1455  *
1456  *  \note In most cases you should use the real-precision functions instead.
1457  *  \{
1458  */
1459
1460 /****************************************
1461  * DOUBLE PRECISION SIMD MATH FUNCTIONS *
1462  ****************************************/
1463
1464 #if !GMX_SIMD_HAVE_NATIVE_COPYSIGN_DOUBLE
1465 /*! \brief Composes floating point value with the magnitude of x and the sign of y.
1466  *
1467  * \param x Values to set sign for
1468  * \param y Values used to set sign
1469  * \return  Magnitude of x, sign of y
1470  */
1471 static inline SimdDouble gmx_simdcall
1472 copysign(SimdDouble x, SimdDouble y)
1473 {
1474 #if GMX_SIMD_HAVE_LOGICAL
1475     return abs(x) | (SimdDouble(GMX_DOUBLE_NEGZERO) & y);
1476 #else
1477     return blend(abs(x), -abs(x), (y < setZero()));
1478 #endif
1479 }
1480 #endif
1481
1482 #if !GMX_SIMD_HAVE_NATIVE_RSQRT_ITER_DOUBLE
1483 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD double.
1484  *
1485  * This is a low-level routine that should only be used by SIMD math routine
1486  * that evaluates the inverse square root.
1487  *
1488  *  \param lu Approximation of 1/sqrt(x), typically obtained from lookup.
1489  *  \param x  The reference (starting) value x for which we want 1/sqrt(x).
1490  *  \return   An improved approximation with roughly twice as many bits of accuracy.
1491  */
1492 static inline SimdDouble gmx_simdcall
1493 rsqrtIter(SimdDouble lu, SimdDouble x)
1494 {
1495     SimdDouble tmp1 = x*lu;
1496     SimdDouble tmp2 = SimdDouble(-0.5)*lu;
1497     tmp1 = fma(tmp1, lu, SimdDouble(-3.0));
1498     return tmp1*tmp2;
1499 }
1500 #endif
1501
1502 /*! \brief Calculate 1/sqrt(x) for SIMD double.
1503  *
1504  *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
1505  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
1506  *           For the single precision implementation this is obviously always
1507  *           true for positive values, but for double precision it adds an
1508  *           extra restriction since the first lookup step might have to be
1509  *           performed in single precision on some architectures. Note that the
1510  *           responsibility for checking falls on you - this routine does not
1511  *           check arguments.
1512  *
1513  *  \return 1/sqrt(x). Result is undefined if your argument was invalid.
1514  */
1515 static inline SimdDouble gmx_simdcall
1516 invsqrt(SimdDouble x)
1517 {
1518     SimdDouble lu = rsqrt(x);
1519 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1520     lu = rsqrtIter(lu, x);
1521 #endif
1522 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1523     lu = rsqrtIter(lu, x);
1524 #endif
1525 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1526     lu = rsqrtIter(lu, x);
1527 #endif
1528 #if (GMX_SIMD_RSQRT_BITS*8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1529     lu = rsqrtIter(lu, x);
1530 #endif
1531     return lu;
1532 }
1533
1534 /*! \brief Calculate 1/sqrt(x) for two SIMD doubles.
1535  *
1536  * \param x0  First set of arguments, x0 must be in single range (see below).
1537  * \param x1  Second set of arguments, x1 must be in single range (see below).
1538  * \param[out] out0  Result 1/sqrt(x0)
1539  * \param[out] out1  Result 1/sqrt(x1)
1540  *
1541  *  In particular for double precision we can sometimes calculate square root
1542  *  pairs slightly faster by using single precision until the very last step.
1543  *
1544  * \note Both arguments must be larger than GMX_FLOAT_MIN and smaller than
1545  *       GMX_FLOAT_MAX, i.e. within the range of single precision.
1546  *       For the single precision implementation this is obviously always
1547  *       true for positive values, but for double precision it adds an
1548  *       extra restriction since the first lookup step might have to be
1549  *       performed in single precision on some architectures. Note that the
1550  *       responsibility for checking falls on you - this routine does not
1551  *       check arguments.
1552  */
1553 static inline void gmx_simdcall
1554 invsqrtPair(SimdDouble x0,    SimdDouble x1,
1555             SimdDouble *out0, SimdDouble *out1)
1556 {
1557 #if GMX_SIMD_HAVE_FLOAT && (GMX_SIMD_FLOAT_WIDTH == 2*GMX_SIMD_DOUBLE_WIDTH) && (GMX_SIMD_RSQRT_BITS < 22)
1558     SimdFloat  xf  = cvtDD2F(x0, x1);
1559     SimdFloat  luf = rsqrt(xf);
1560     SimdDouble lu0, lu1;
1561     // Intermediate target is single - mantissa+1 bits
1562 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
1563     luf = rsqrtIter(luf, xf);
1564 #endif
1565 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
1566     luf = rsqrtIter(luf, xf);
1567 #endif
1568 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
1569     luf = rsqrtIter(luf, xf);
1570 #endif
1571     cvtF2DD(luf, &lu0, &lu1);
1572     // Last iteration(s) performed in double - if we had 22 bits, this gets us to 44 (~1e-15)
1573 #if (GMX_SIMD_ACCURACY_BITS_SINGLE < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1574     lu0 = rsqrtIter(lu0, x0);
1575     lu1 = rsqrtIter(lu1, x1);
1576 #endif
1577 #if (GMX_SIMD_ACCURACY_BITS_SINGLE*2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1578     lu0 = rsqrtIter(lu0, x0);
1579     lu1 = rsqrtIter(lu1, x1);
1580 #endif
1581     *out0 = lu0;
1582     *out1 = lu1;
1583 #else
1584     *out0 = invsqrt(x0);
1585     *out1 = invsqrt(x1);
1586 #endif
1587 }
1588
1589 #if !GMX_SIMD_HAVE_NATIVE_RCP_ITER_DOUBLE
1590 /*! \brief Perform one Newton-Raphson iteration to improve 1/x for SIMD double.
1591  *
1592  * This is a low-level routine that should only be used by SIMD math routine
1593  * that evaluates the reciprocal.
1594  *
1595  *  \param lu Approximation of 1/x, typically obtained from lookup.
1596  *  \param x  The reference (starting) value x for which we want 1/x.
1597  *  \return   An improved approximation with roughly twice as many bits of accuracy.
1598  */
1599 static inline SimdDouble gmx_simdcall
1600 rcpIter(SimdDouble lu, SimdDouble x)
1601 {
1602     return lu*fnma(lu, x, SimdDouble(2.0));
1603 }
1604 #endif
1605
1606 /*! \brief Calculate 1/x for SIMD double.
1607  *
1608  *  \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
1609  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
1610  *           For the single precision implementation this is obviously always
1611  *           true for positive values, but for double precision it adds an
1612  *           extra restriction since the first lookup step might have to be
1613  *           performed in single precision on some architectures. Note that the
1614  *           responsibility for checking falls on you - this routine does not
1615  *           check arguments.
1616  *
1617  *  \return 1/x. Result is undefined if your argument was invalid.
1618  */
1619 static inline SimdDouble gmx_simdcall
1620 inv(SimdDouble x)
1621 {
1622     SimdDouble lu = rcp(x);
1623 #if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1624     lu = rcpIter(lu, x);
1625 #endif
1626 #if (GMX_SIMD_RCP_BITS*2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1627     lu = rcpIter(lu, x);
1628 #endif
1629 #if (GMX_SIMD_RCP_BITS*4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1630     lu = rcpIter(lu, x);
1631 #endif
1632 #if (GMX_SIMD_RCP_BITS*8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1633     lu = rcpIter(lu, x);
1634 #endif
1635     return lu;
1636 }
1637
1638 /*! \brief Division for SIMD doubles
1639  *
1640  * \param nom    Nominator
1641  * \param denom  Denominator, with magnitude in range (GMX_FLOAT_MIN,GMX_FLOAT_MAX).
1642  *               For single precision this is equivalent to a nonzero argument,
1643  *               but in double precision it adds an extra restriction since
1644  *               the first lookup step might have to be performed in single
1645  *               precision on some architectures. Note that the responsibility
1646  *               for checking falls on you - this routine does not check arguments.
1647  *
1648  * \return nom/denom
1649  *
1650  * \note This function does not use any masking to avoid problems with
1651  *       zero values in the denominator.
1652  */
1653 static inline SimdDouble gmx_simdcall
1654 operator/(SimdDouble nom, SimdDouble denom)
1655 {
1656     return nom*inv(denom);
1657 }
1658
1659
1660 /*! \brief Calculate 1/sqrt(x) for masked entries of SIMD double.
1661  *
1662  *  This routine only evaluates 1/sqrt(x) for elements for which mask is true.
1663  *  Illegal values in the masked-out elements will not lead to
1664  *  floating-point exceptions.
1665  *
1666  *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
1667  *           GMX_FLOAT_MAX for masked-in entries.
1668  *           See \ref invsqrt for the discussion about argument restrictions.
1669  *  \param m Mask
1670  *  \return 1/sqrt(x). Result is undefined if your argument was invalid or
1671  *          entry was not masked, and 0.0 for masked-out entries.
1672  */
1673 static inline SimdDouble
1674 maskzInvsqrt(SimdDouble x, SimdDBool m)
1675 {
1676     SimdDouble lu = maskzRsqrt(x, m);
1677 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1678     lu = rsqrtIter(lu, x);
1679 #endif
1680 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1681     lu = rsqrtIter(lu, x);
1682 #endif
1683 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1684     lu = rsqrtIter(lu, x);
1685 #endif
1686 #if (GMX_SIMD_RSQRT_BITS*8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1687     lu = rsqrtIter(lu, x);
1688 #endif
1689     return lu;
1690 }
1691
1692 /*! \brief Calculate 1/x for SIMD double, masked version.
1693  *
1694  *  \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
1695  *           GMX_FLOAT_MAX for masked-in entries.
1696  *           See \ref invsqrt for the discussion about argument restrictions.
1697  *  \param m Mask
1698  *  \return 1/x for elements where m is true, or 0.0 for masked-out entries.
1699  */
1700 static inline SimdDouble gmx_simdcall
1701 maskzInv(SimdDouble x, SimdDBool m)
1702 {
1703     SimdDouble lu = maskzRcp(x, m);
1704 #if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1705     lu = rcpIter(lu, x);
1706 #endif
1707 #if (GMX_SIMD_RCP_BITS*2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1708     lu = rcpIter(lu, x);
1709 #endif
1710 #if (GMX_SIMD_RCP_BITS*4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1711     lu = rcpIter(lu, x);
1712 #endif
1713 #if (GMX_SIMD_RCP_BITS*8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1714     lu = rcpIter(lu, x);
1715 #endif
1716     return lu;
1717 }
1718
1719
1720 /*! \brief Calculate sqrt(x) for SIMD doubles.
1721  *
1722  *  \copydetails sqrt(SimdFloat)
1723  */
1724 template <MathOptimization opt = MathOptimization::Safe>
1725 static inline SimdDouble gmx_simdcall
1726 sqrt(SimdDouble x)
1727 {
1728     if (opt == MathOptimization::Safe)
1729     {
1730         // As we might use a float version of rsqrt, we mask out small values
1731         SimdDouble res = maskzInvsqrt(x, SimdDouble(GMX_FLOAT_MIN) < x);
1732         return res*x;
1733     }
1734     else
1735     {
1736         return x * invsqrt(x);
1737     }
1738 }
1739
1740 #if !GMX_SIMD_HAVE_NATIVE_LOG_DOUBLE
1741 /*! \brief SIMD double log(x). This is the natural logarithm.
1742  *
1743  * \param x Argument, should be >0.
1744  * \result The natural logarithm of x. Undefined if argument is invalid.
1745  */
1746 static inline SimdDouble gmx_simdcall
1747 log(SimdDouble x)
1748 {
1749     const SimdDouble  one(1.0);
1750     const SimdDouble  two(2.0);
1751     const SimdDouble  invsqrt2(1.0/std::sqrt(2.0));
1752     const SimdDouble  corr(0.693147180559945286226764);
1753     const SimdDouble  CL15(0.148197055177935105296783);
1754     const SimdDouble  CL13(0.153108178020442575739679);
1755     const SimdDouble  CL11(0.181837339521549679055568);
1756     const SimdDouble  CL9(0.22222194152736701733275);
1757     const SimdDouble  CL7(0.285714288030134544449368);
1758     const SimdDouble  CL5(0.399999999989941956712869);
1759     const SimdDouble  CL3(0.666666666666685503450651);
1760     const SimdDouble  CL1(2.0);
1761     SimdDouble        fExp, x2, p;
1762     SimdDBool         m;
1763     SimdDInt32        iExp;
1764
1765     x     = frexp(x, &iExp);
1766     fExp  = cvtI2R(iExp);
1767
1768     m     = x < invsqrt2;
1769     // Adjust to non-IEEE format for x<1/sqrt(2): exponent -= 1, mantissa *= 2.0
1770     fExp  = fExp - selectByMask(one, m);
1771     x     = x * blend(one, two, m);
1772
1773     x     = (x-one) * inv( x+one );
1774     x2    = x * x;
1775
1776     p     = fma(CL15, x2, CL13);
1777     p     = fma(p, x2, CL11);
1778     p     = fma(p, x2, CL9);
1779     p     = fma(p, x2, CL7);
1780     p     = fma(p, x2, CL5);
1781     p     = fma(p, x2, CL3);
1782     p     = fma(p, x2, CL1);
1783     p     = fma(p, x, corr * fExp);
1784
1785     return p;
1786 }
1787 #endif
1788
1789 #if !GMX_SIMD_HAVE_NATIVE_EXP2_DOUBLE
1790 /*! \brief SIMD double 2^x.
1791  *
1792  * \copydetails exp2(SimdFloat)
1793  */
1794 template <MathOptimization opt = MathOptimization::Safe>
1795 static inline SimdDouble gmx_simdcall
1796 exp2(SimdDouble x)
1797 {
1798     const SimdDouble  CE11(4.435280790452730022081181e-10);
1799     const SimdDouble  CE10(7.074105630863314448024247e-09);
1800     const SimdDouble  CE9(1.017819803432096698472621e-07);
1801     const SimdDouble  CE8(1.321543308956718799557863e-06);
1802     const SimdDouble  CE7(0.00001525273348995851746990884);
1803     const SimdDouble  CE6(0.0001540353046251466849082632);
1804     const SimdDouble  CE5(0.001333355814678995257307880);
1805     const SimdDouble  CE4(0.009618129107588335039176502);
1806     const SimdDouble  CE3(0.05550410866481992147457793);
1807     const SimdDouble  CE2(0.2402265069591015620470894);
1808     const SimdDouble  CE1(0.6931471805599453304615075);
1809     const SimdDouble  one(1.0);
1810
1811     SimdDouble        intpart;
1812     SimdDouble        fexppart;
1813     SimdDouble        p;
1814
1815     // Large negative values are valid arguments to exp2(), so there are two
1816     // things we need to account for:
1817     // 1. When the exponents reaches -1023, the (biased) exponent field will be
1818     //    zero and we can no longer multiply with it. There are special IEEE
1819     //    formats to handle this range, but for now we have to accept that
1820     //    we cannot handle those arguments. If input value becomes even more
1821     //    negative, it will start to loop and we would end up with invalid
1822     //    exponents. Thus, we need to limit or mask this.
1823     // 2. For VERY large negative values, we will have problems that the
1824     //    subtraction to get the fractional part loses accuracy, and then we
1825     //    can end up with overflows in the polynomial.
1826     //
1827     // For now, we handle this by forwarding the math optimization setting to
1828     // ldexp, where the routine will return zero for very small arguments.
1829     //
1830     // However, before doing that we need to make sure we do not call cvtR2I
1831     // with an argument that is so negative it cannot be converted to an integer.
1832     if (opt == MathOptimization::Safe)
1833     {
1834         x         = max(x, SimdDouble(std::numeric_limits<std::int32_t>::lowest()));
1835     }
1836
1837     fexppart  = ldexp<opt>(one, cvtR2I(x));
1838     intpart   = round(x);
1839     x         = x - intpart;
1840
1841     p         = fma(CE11, x, CE10);
1842     p         = fma(p, x, CE9);
1843     p         = fma(p, x, CE8);
1844     p         = fma(p, x, CE7);
1845     p         = fma(p, x, CE6);
1846     p         = fma(p, x, CE5);
1847     p         = fma(p, x, CE4);
1848     p         = fma(p, x, CE3);
1849     p         = fma(p, x, CE2);
1850     p         = fma(p, x, CE1);
1851     p         = fma(p, x, one);
1852     x         = p * fexppart;
1853     return x;
1854 }
1855 #endif
1856
1857 #if !GMX_SIMD_HAVE_NATIVE_EXP_DOUBLE
1858 /*! \brief SIMD double exp(x).
1859  *
1860  * \copydetails exp(SimdFloat)
1861  */
1862 template <MathOptimization opt = MathOptimization::Safe>
1863 static inline SimdDouble gmx_simdcall
1864 exp(SimdDouble x)
1865 {
1866     const SimdDouble  argscale(1.44269504088896340735992468100);
1867     const SimdDouble  invargscale0(-0.69314718055966295651160180568695068359375);
1868     const SimdDouble  invargscale1(-2.8235290563031577122588448175013436025525412068e-13);
1869     const SimdDouble  CE12(2.078375306791423699350304e-09);
1870     const SimdDouble  CE11(2.518173854179933105218635e-08);
1871     const SimdDouble  CE10(2.755842049600488770111608e-07);
1872     const SimdDouble  CE9(2.755691815216689746619849e-06);
1873     const SimdDouble  CE8(2.480158383706245033920920e-05);
1874     const SimdDouble  CE7(0.0001984127043518048611841321);
1875     const SimdDouble  CE6(0.001388888889360258341755930);
1876     const SimdDouble  CE5(0.008333333332907368102819109);
1877     const SimdDouble  CE4(0.04166666666663836745814631);
1878     const SimdDouble  CE3(0.1666666666666796929434570);
1879     const SimdDouble  CE2(0.5);
1880     const SimdDouble  one(1.0);
1881     SimdDouble        fexppart;
1882     SimdDouble        intpart;
1883     SimdDouble        y, p;
1884
1885     // Large negative values are valid arguments to exp2(), so there are two
1886     // things we need to account for:
1887     // 1. When the exponents reaches -1023, the (biased) exponent field will be
1888     //    zero and we can no longer multiply with it. There are special IEEE
1889     //    formats to handle this range, but for now we have to accept that
1890     //    we cannot handle those arguments. If input value becomes even more
1891     //    negative, it will start to loop and we would end up with invalid
1892     //    exponents. Thus, we need to limit or mask this.
1893     // 2. For VERY large negative values, we will have problems that the
1894     //    subtraction to get the fractional part loses accuracy, and then we
1895     //    can end up with overflows in the polynomial.
1896     //
1897     // For now, we handle this by forwarding the math optimization setting to
1898     // ldexp, where the routine will return zero for very small arguments.
1899     //
1900     // However, before doing that we need to make sure we do not call cvtR2I
1901     // with an argument that is so negative it cannot be converted to an integer
1902     // after being multiplied by argscale.
1903
1904     if (opt == MathOptimization::Safe)
1905     {
1906         x         = max(x, SimdDouble(std::numeric_limits<std::int32_t>::lowest())/argscale);
1907     }
1908
1909     y         = x * argscale;
1910
1911     fexppart  = ldexp<opt>(one, cvtR2I(y));
1912     intpart   = round(y);
1913
1914     // Extended precision arithmetics
1915     x         = fma(invargscale0, intpart, x);
1916     x         = fma(invargscale1, intpart, x);
1917
1918     p         = fma(CE12, x, CE11);
1919     p         = fma(p, x, CE10);
1920     p         = fma(p, x, CE9);
1921     p         = fma(p, x, CE8);
1922     p         = fma(p, x, CE7);
1923     p         = fma(p, x, CE6);
1924     p         = fma(p, x, CE5);
1925     p         = fma(p, x, CE4);
1926     p         = fma(p, x, CE3);
1927     p         = fma(p, x, CE2);
1928     p         = fma(p, x * x, x);
1929     x         = fma(p, fexppart, fexppart);
1930
1931     return x;
1932 }
1933 #endif
1934
1935 /*! \brief SIMD double erf(x).
1936  *
1937  * \param x The value to calculate erf(x) for.
1938  * \result erf(x)
1939  *
1940  * This routine achieves very close to full precision, but we do not care about
1941  * the last bit or the subnormal result range.
1942  */
1943 static inline SimdDouble gmx_simdcall
1944 erf(SimdDouble x)
1945 {
1946     // Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75]
1947     const SimdDouble CAP4(-0.431780540597889301512e-4);
1948     const SimdDouble CAP3(-0.00578562306260059236059);
1949     const SimdDouble CAP2(-0.028593586920219752446);
1950     const SimdDouble CAP1(-0.315924962948621698209);
1951     const SimdDouble CAP0(0.14952975608477029151);
1952
1953     const SimdDouble CAQ5(-0.374089300177174709737e-5);
1954     const SimdDouble CAQ4(0.00015126584532155383535);
1955     const SimdDouble CAQ3(0.00536692680669480725423);
1956     const SimdDouble CAQ2(0.0668686825594046122636);
1957     const SimdDouble CAQ1(0.402604990869284362773);
1958     // CAQ0 == 1.0
1959     const SimdDouble CAoffset(0.9788494110107421875);
1960
1961     // Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5]
1962     const SimdDouble CBP6(2.49650423685462752497647637088e-10);
1963     const SimdDouble CBP5(0.00119770193298159629350136085658);
1964     const SimdDouble CBP4(0.0164944422378370965881008942733);
1965     const SimdDouble CBP3(0.0984581468691775932063932439252);
1966     const SimdDouble CBP2(0.317364595806937763843589437418);
1967     const SimdDouble CBP1(0.554167062641455850932670067075);
1968     const SimdDouble CBP0(0.427583576155807163756925301060);
1969     const SimdDouble CBQ7(0.00212288829699830145976198384930);
1970     const SimdDouble CBQ6(0.0334810979522685300554606393425);
1971     const SimdDouble CBQ5(0.2361713785181450957579508850717);
1972     const SimdDouble CBQ4(0.955364736493055670530981883072);
1973     const SimdDouble CBQ3(2.36815675631420037315349279199);
1974     const SimdDouble CBQ2(3.55261649184083035537184223542);
1975     const SimdDouble CBQ1(2.93501136050160872574376997993);
1976     // CBQ0 == 1.0
1977
1978     // Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf]
1979     const SimdDouble CCP6(-2.8175401114513378771);
1980     const SimdDouble CCP5(-3.22729451764143718517);
1981     const SimdDouble CCP4(-2.5518551727311523996);
1982     const SimdDouble CCP3(-0.687717681153649930619);
1983     const SimdDouble CCP2(-0.212652252872804219852);
1984     const SimdDouble CCP1(0.0175389834052493308818);
1985     const SimdDouble CCP0(0.00628057170626964891937);
1986
1987     const SimdDouble CCQ6(5.48409182238641741584);
1988     const SimdDouble CCQ5(13.5064170191802889145);
1989     const SimdDouble CCQ4(22.9367376522880577224);
1990     const SimdDouble CCQ3(15.930646027911794143);
1991     const SimdDouble CCQ2(11.0567237927800161565);
1992     const SimdDouble CCQ1(2.79257750980575282228);
1993     // CCQ0 == 1.0
1994     const SimdDouble CCoffset(0.5579090118408203125);
1995
1996     const SimdDouble one(1.0);
1997     const SimdDouble two(2.0);
1998
1999     SimdDouble       xabs, x2, x4, t, t2, w, w2;
2000     SimdDouble       PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
2001     SimdDouble       PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
2002     SimdDouble       PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
2003     SimdDouble       res_erf, res_erfcB, res_erfcC, res_erfc, res;
2004     SimdDouble       expmx2;
2005     SimdDBool        mask, mask_erf, notmask_erf;
2006
2007     // Calculate erf()
2008     xabs        = abs(x);
2009     mask_erf    = (xabs < one);
2010     notmask_erf = (one <= xabs);
2011     x2          = x * x;
2012     x4          = x2 * x2;
2013
2014     PolyAP0  = fma(CAP4, x4, CAP2);
2015     PolyAP1  = fma(CAP3, x4, CAP1);
2016     PolyAP0  = fma(PolyAP0, x4, CAP0);
2017     PolyAP0  = fma(PolyAP1, x2, PolyAP0);
2018
2019     PolyAQ1  = fma(CAQ5, x4, CAQ3);
2020     PolyAQ0  = fma(CAQ4, x4, CAQ2);
2021     PolyAQ1  = fma(PolyAQ1, x4, CAQ1);
2022     PolyAQ0  = fma(PolyAQ0, x4, one);
2023     PolyAQ0  = fma(PolyAQ1, x2, PolyAQ0);
2024
2025     res_erf  = PolyAP0 * maskzInv(PolyAQ0, mask_erf);
2026     res_erf  = CAoffset + res_erf;
2027     res_erf  = x * res_erf;
2028
2029     // Calculate erfc() in range [1,4.5]
2030     t       = xabs - one;
2031     t2      = t * t;
2032
2033     PolyBP0  = fma(CBP6, t2, CBP4);
2034     PolyBP1  = fma(CBP5, t2, CBP3);
2035     PolyBP0  = fma(PolyBP0, t2, CBP2);
2036     PolyBP1  = fma(PolyBP1, t2, CBP1);
2037     PolyBP0  = fma(PolyBP0, t2, CBP0);
2038     PolyBP0  = fma(PolyBP1, t, PolyBP0);
2039
2040     PolyBQ1 = fma(CBQ7, t2, CBQ5);
2041     PolyBQ0 = fma(CBQ6, t2, CBQ4);
2042     PolyBQ1 = fma(PolyBQ1, t2, CBQ3);
2043     PolyBQ0 = fma(PolyBQ0, t2, CBQ2);
2044     PolyBQ1 = fma(PolyBQ1, t2, CBQ1);
2045     PolyBQ0 = fma(PolyBQ0, t2, one);
2046     PolyBQ0 = fma(PolyBQ1, t, PolyBQ0);
2047
2048     // The denominator polynomial can be zero outside the range
2049     res_erfcB = PolyBP0 * maskzInv(PolyBQ0, notmask_erf);
2050
2051     res_erfcB = res_erfcB * xabs;
2052
2053     // Calculate erfc() in range [4.5,inf]
2054     w       = maskzInv(xabs, notmask_erf);
2055     w2      = w * w;
2056
2057     PolyCP0  = fma(CCP6, w2, CCP4);
2058     PolyCP1  = fma(CCP5, w2, CCP3);
2059     PolyCP0  = fma(PolyCP0, w2, CCP2);
2060     PolyCP1  = fma(PolyCP1, w2, CCP1);
2061     PolyCP0  = fma(PolyCP0, w2, CCP0);
2062     PolyCP0  = fma(PolyCP1, w, PolyCP0);
2063
2064     PolyCQ0  = fma(CCQ6, w2, CCQ4);
2065     PolyCQ1  = fma(CCQ5, w2, CCQ3);
2066     PolyCQ0  = fma(PolyCQ0, w2, CCQ2);
2067     PolyCQ1  = fma(PolyCQ1, w2, CCQ1);
2068     PolyCQ0  = fma(PolyCQ0, w2, one);
2069     PolyCQ0  = fma(PolyCQ1, w, PolyCQ0);
2070
2071     expmx2   = exp( -x2 );
2072
2073     // The denominator polynomial can be zero outside the range
2074     res_erfcC = PolyCP0 * maskzInv(PolyCQ0, notmask_erf);
2075     res_erfcC = res_erfcC + CCoffset;
2076     res_erfcC = res_erfcC * w;
2077
2078     mask     = (SimdDouble(4.5) < xabs);
2079     res_erfc = blend(res_erfcB, res_erfcC, mask);
2080
2081     res_erfc = res_erfc * expmx2;
2082
2083     // erfc(x<0) = 2-erfc(|x|)
2084     mask     = (x < setZero());
2085     res_erfc = blend(res_erfc, two - res_erfc, mask);
2086
2087     // Select erf() or erfc()
2088     res  = blend(one - res_erfc, res_erf, mask_erf);
2089
2090     return res;
2091 }
2092
2093 /*! \brief SIMD double erfc(x).
2094  *
2095  * \param x The value to calculate erfc(x) for.
2096  * \result erfc(x)
2097  *
2098  * This routine achieves full precision (bar the last bit) over most of the
2099  * input range, but for large arguments where the result is getting close
2100  * to the minimum representable numbers we accept slightly larger errors
2101  * (think results that are in the ballpark of 10^-200 for double)
2102  * since that is not relevant for MD.
2103  */
2104 static inline SimdDouble gmx_simdcall
2105 erfc(SimdDouble x)
2106 {
2107     // Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75]
2108     const SimdDouble CAP4(-0.431780540597889301512e-4);
2109     const SimdDouble CAP3(-0.00578562306260059236059);
2110     const SimdDouble CAP2(-0.028593586920219752446);
2111     const SimdDouble CAP1(-0.315924962948621698209);
2112     const SimdDouble CAP0(0.14952975608477029151);
2113
2114     const SimdDouble CAQ5(-0.374089300177174709737e-5);
2115     const SimdDouble CAQ4(0.00015126584532155383535);
2116     const SimdDouble CAQ3(0.00536692680669480725423);
2117     const SimdDouble CAQ2(0.0668686825594046122636);
2118     const SimdDouble CAQ1(0.402604990869284362773);
2119     // CAQ0 == 1.0
2120     const SimdDouble CAoffset(0.9788494110107421875);
2121
2122     // Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5]
2123     const SimdDouble CBP6(2.49650423685462752497647637088e-10);
2124     const SimdDouble CBP5(0.00119770193298159629350136085658);
2125     const SimdDouble CBP4(0.0164944422378370965881008942733);
2126     const SimdDouble CBP3(0.0984581468691775932063932439252);
2127     const SimdDouble CBP2(0.317364595806937763843589437418);
2128     const SimdDouble CBP1(0.554167062641455850932670067075);
2129     const SimdDouble CBP0(0.427583576155807163756925301060);
2130     const SimdDouble CBQ7(0.00212288829699830145976198384930);
2131     const SimdDouble CBQ6(0.0334810979522685300554606393425);
2132     const SimdDouble CBQ5(0.2361713785181450957579508850717);
2133     const SimdDouble CBQ4(0.955364736493055670530981883072);
2134     const SimdDouble CBQ3(2.36815675631420037315349279199);
2135     const SimdDouble CBQ2(3.55261649184083035537184223542);
2136     const SimdDouble CBQ1(2.93501136050160872574376997993);
2137     // CBQ0 == 1.0
2138
2139     // Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf]
2140     const SimdDouble CCP6(-2.8175401114513378771);
2141     const SimdDouble CCP5(-3.22729451764143718517);
2142     const SimdDouble CCP4(-2.5518551727311523996);
2143     const SimdDouble CCP3(-0.687717681153649930619);
2144     const SimdDouble CCP2(-0.212652252872804219852);
2145     const SimdDouble CCP1(0.0175389834052493308818);
2146     const SimdDouble CCP0(0.00628057170626964891937);
2147
2148     const SimdDouble CCQ6(5.48409182238641741584);
2149     const SimdDouble CCQ5(13.5064170191802889145);
2150     const SimdDouble CCQ4(22.9367376522880577224);
2151     const SimdDouble CCQ3(15.930646027911794143);
2152     const SimdDouble CCQ2(11.0567237927800161565);
2153     const SimdDouble CCQ1(2.79257750980575282228);
2154     // CCQ0 == 1.0
2155     const SimdDouble CCoffset(0.5579090118408203125);
2156
2157     const SimdDouble one(1.0);
2158     const SimdDouble two(2.0);
2159
2160     SimdDouble       xabs, x2, x4, t, t2, w, w2;
2161     SimdDouble       PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
2162     SimdDouble       PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
2163     SimdDouble       PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
2164     SimdDouble       res_erf, res_erfcB, res_erfcC, res_erfc, res;
2165     SimdDouble       expmx2;
2166     SimdDBool        mask, mask_erf, notmask_erf;
2167
2168     // Calculate erf()
2169     xabs        = abs(x);
2170     mask_erf    = (xabs < one);
2171     notmask_erf = (one <= xabs);
2172     x2          = x * x;
2173     x4          = x2 * x2;
2174
2175     PolyAP0  = fma(CAP4, x4, CAP2);
2176     PolyAP1  = fma(CAP3, x4, CAP1);
2177     PolyAP0  = fma(PolyAP0, x4, CAP0);
2178     PolyAP0  = fma(PolyAP1, x2, PolyAP0);
2179     PolyAQ1  = fma(CAQ5, x4, CAQ3);
2180     PolyAQ0  = fma(CAQ4, x4, CAQ2);
2181     PolyAQ1  = fma(PolyAQ1, x4, CAQ1);
2182     PolyAQ0  = fma(PolyAQ0, x4, one);
2183     PolyAQ0  = fma(PolyAQ1, x2, PolyAQ0);
2184
2185     res_erf  = PolyAP0 * maskzInv(PolyAQ0, mask_erf);
2186     res_erf  = CAoffset + res_erf;
2187     res_erf  = x * res_erf;
2188
2189     // Calculate erfc() in range [1,4.5]
2190     t       = xabs - one;
2191     t2      = t * t;
2192
2193     PolyBP0  = fma(CBP6, t2, CBP4);
2194     PolyBP1  = fma(CBP5, t2, CBP3);
2195     PolyBP0  = fma(PolyBP0, t2, CBP2);
2196     PolyBP1  = fma(PolyBP1, t2, CBP1);
2197     PolyBP0  = fma(PolyBP0, t2, CBP0);
2198     PolyBP0  = fma(PolyBP1, t, PolyBP0);
2199
2200     PolyBQ1 = fma(CBQ7, t2, CBQ5);
2201     PolyBQ0 = fma(CBQ6, t2, CBQ4);
2202     PolyBQ1 = fma(PolyBQ1, t2, CBQ3);
2203     PolyBQ0 = fma(PolyBQ0, t2, CBQ2);
2204     PolyBQ1 = fma(PolyBQ1, t2, CBQ1);
2205     PolyBQ0 = fma(PolyBQ0, t2, one);
2206     PolyBQ0 = fma(PolyBQ1, t, PolyBQ0);
2207
2208     // The denominator polynomial can be zero outside the range
2209     res_erfcB = PolyBP0 * maskzInv(PolyBQ0, notmask_erf);
2210
2211     res_erfcB = res_erfcB * xabs;
2212
2213     // Calculate erfc() in range [4.5,inf]
2214     w       = maskzInv(xabs, xabs != setZero());
2215     w2      = w * w;
2216
2217     PolyCP0  = fma(CCP6, w2, CCP4);
2218     PolyCP1  = fma(CCP5, w2, CCP3);
2219     PolyCP0  = fma(PolyCP0, w2, CCP2);
2220     PolyCP1  = fma(PolyCP1, w2, CCP1);
2221     PolyCP0  = fma(PolyCP0, w2, CCP0);
2222     PolyCP0  = fma(PolyCP1, w, PolyCP0);
2223
2224     PolyCQ0  = fma(CCQ6, w2, CCQ4);
2225     PolyCQ1  = fma(CCQ5, w2, CCQ3);
2226     PolyCQ0  = fma(PolyCQ0, w2, CCQ2);
2227     PolyCQ1  = fma(PolyCQ1, w2, CCQ1);
2228     PolyCQ0  = fma(PolyCQ0, w2, one);
2229     PolyCQ0  = fma(PolyCQ1, w, PolyCQ0);
2230
2231     expmx2   = exp( -x2 );
2232
2233     // The denominator polynomial can be zero outside the range
2234     res_erfcC = PolyCP0 * maskzInv(PolyCQ0, notmask_erf);
2235     res_erfcC = res_erfcC + CCoffset;
2236     res_erfcC = res_erfcC * w;
2237
2238     mask     = (SimdDouble(4.5) < xabs);
2239     res_erfc = blend(res_erfcB, res_erfcC, mask);
2240
2241     res_erfc = res_erfc * expmx2;
2242
2243     // erfc(x<0) = 2-erfc(|x|)
2244     mask     = (x < setZero());
2245     res_erfc = blend(res_erfc, two - res_erfc, mask);
2246
2247     // Select erf() or erfc()
2248     res  = blend(res_erfc, one - res_erf, mask_erf);
2249
2250     return res;
2251 }
2252
2253 /*! \brief SIMD double sin \& cos.
2254  *
2255  * \param x The argument to evaluate sin/cos for
2256  * \param[out] sinval Sin(x)
2257  * \param[out] cosval Cos(x)
2258  *
2259  * This version achieves close to machine precision, but for very large
2260  * magnitudes of the argument we inherently begin to lose accuracy due to the
2261  * argument reduction, despite using extended precision arithmetics internally.
2262  */
2263 static inline void gmx_simdcall
2264 sincos(SimdDouble x, SimdDouble *sinval, SimdDouble *cosval)
2265 {
2266     // Constants to subtract Pi/4*x from y while minimizing precision loss
2267     const SimdDouble  argred0(-2*0.78539816290140151978);
2268     const SimdDouble  argred1(-2*4.9604678871439933374e-10);
2269     const SimdDouble  argred2(-2*1.1258708853173288931e-18);
2270     const SimdDouble  argred3(-2*1.7607799325916000908e-27);
2271     const SimdDouble  two_over_pi(2.0/M_PI);
2272     const SimdDouble  const_sin5( 1.58938307283228937328511e-10);
2273     const SimdDouble  const_sin4(-2.50506943502539773349318e-08);
2274     const SimdDouble  const_sin3( 2.75573131776846360512547e-06);
2275     const SimdDouble  const_sin2(-0.000198412698278911770864914);
2276     const SimdDouble  const_sin1( 0.0083333333333191845961746);
2277     const SimdDouble  const_sin0(-0.166666666666666130709393);
2278
2279     const SimdDouble  const_cos7(-1.13615350239097429531523e-11);
2280     const SimdDouble  const_cos6( 2.08757471207040055479366e-09);
2281     const SimdDouble  const_cos5(-2.75573144028847567498567e-07);
2282     const SimdDouble  const_cos4( 2.48015872890001867311915e-05);
2283     const SimdDouble  const_cos3(-0.00138888888888714019282329);
2284     const SimdDouble  const_cos2( 0.0416666666666665519592062);
2285     const SimdDouble  half(0.5);
2286     const SimdDouble  one(1.0);
2287     SimdDouble        ssign, csign;
2288     SimdDouble        x2, y, z, psin, pcos, sss, ccc;
2289     SimdDBool         mask;
2290 #if GMX_SIMD_HAVE_DINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
2291     const SimdDInt32  ione(1);
2292     const SimdDInt32  itwo(2);
2293     SimdDInt32        iy;
2294
2295     z       = x * two_over_pi;
2296     iy      = cvtR2I(z);
2297     y       = round(z);
2298
2299     mask    = cvtIB2B((iy & ione) == setZero());
2300     ssign   = selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO), cvtIB2B((iy & itwo) == itwo));
2301     csign   = selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO), cvtIB2B(((iy + ione) & itwo) == itwo));
2302 #else
2303     const SimdDouble  quarter(0.25);
2304     const SimdDouble  minusquarter(-0.25);
2305     SimdDouble        q;
2306     SimdDBool         m1, m2, m3;
2307
2308     /* The most obvious way to find the arguments quadrant in the unit circle
2309      * to calculate the sign is to use integer arithmetic, but that is not
2310      * present in all SIMD implementations. As an alternative, we have devised a
2311      * pure floating-point algorithm that uses truncation for argument reduction
2312      * so that we get a new value 0<=q<1 over the unit circle, and then
2313      * do floating-point comparisons with fractions. This is likely to be
2314      * slightly slower (~10%) due to the longer latencies of floating-point, so
2315      * we only use it when integer SIMD arithmetic is not present.
2316      */
2317     ssign   = x;
2318     x       = abs(x);
2319     // It is critical that half-way cases are rounded down
2320     z       = fma(x, two_over_pi, half);
2321     y       = trunc(z);
2322     q       = z * quarter;
2323     q       = q - trunc(q);
2324     /* z now starts at 0.0 for x=-pi/4 (although neg. values cannot occur), and
2325      * then increased by 1.0 as x increases by 2*Pi, when it resets to 0.0.
2326      * This removes the 2*Pi periodicity without using any integer arithmetic.
2327      * First check if y had the value 2 or 3, set csign if true.
2328      */
2329     q       = q - half;
2330     /* If we have logical operations we can work directly on the signbit, which
2331      * saves instructions. Otherwise we need to represent signs as +1.0/-1.0.
2332      * Thus, if you are altering defines to debug alternative code paths, the
2333      * two GMX_SIMD_HAVE_LOGICAL sections in this routine must either both be
2334      * active or inactive - you will get errors if only one is used.
2335      */
2336 #    if GMX_SIMD_HAVE_LOGICAL
2337     ssign   = ssign & SimdDouble(GMX_DOUBLE_NEGZERO);
2338     csign   = andNot(q, SimdDouble(GMX_DOUBLE_NEGZERO));
2339     ssign   = ssign ^ csign;
2340 #    else
2341     ssign   = copysign(SimdDouble(1.0), ssign);
2342     csign   = copysign(SimdDouble(1.0), q);
2343     csign   = -csign;
2344     ssign   = ssign * csign;    // swap ssign if csign was set.
2345 #    endif
2346     // Check if y had value 1 or 3 (remember we subtracted 0.5 from q)
2347     m1      = (q < minusquarter);
2348     m2      = (setZero() <= q);
2349     m3      = (q < quarter);
2350     m2      = m2 && m3;
2351     mask    = m1 || m2;
2352     // where mask is FALSE, swap sign.
2353     csign   = csign * blend(SimdDouble(-1.0), one, mask);
2354 #endif
2355     x       = fma(y, argred0, x);
2356     x       = fma(y, argred1, x);
2357     x       = fma(y, argred2, x);
2358     x       = fma(y, argred3, x);
2359     x2      = x * x;
2360
2361     psin    = fma(const_sin5, x2, const_sin4);
2362     psin    = fma(psin, x2, const_sin3);
2363     psin    = fma(psin, x2, const_sin2);
2364     psin    = fma(psin, x2, const_sin1);
2365     psin    = fma(psin, x2, const_sin0);
2366     psin    = fma(psin, x2 * x, x);
2367
2368     pcos    = fma(const_cos7, x2, const_cos6);
2369     pcos    = fma(pcos, x2, const_cos5);
2370     pcos    = fma(pcos, x2, const_cos4);
2371     pcos    = fma(pcos, x2, const_cos3);
2372     pcos    = fma(pcos, x2, const_cos2);
2373     pcos    = fms(pcos, x2, half);
2374     pcos    = fma(pcos, x2, one);
2375
2376     sss     = blend(pcos, psin, mask);
2377     ccc     = blend(psin, pcos, mask);
2378     // See comment for GMX_SIMD_HAVE_LOGICAL section above.
2379 #if GMX_SIMD_HAVE_LOGICAL
2380     *sinval = sss ^ ssign;
2381     *cosval = ccc ^ csign;
2382 #else
2383     *sinval = sss * ssign;
2384     *cosval = ccc * csign;
2385 #endif
2386 }
2387
2388 /*! \brief SIMD double sin(x).
2389  *
2390  * \param x The argument to evaluate sin for
2391  * \result Sin(x)
2392  *
2393  * \attention Do NOT call both sin & cos if you need both results, since each of them
2394  * will then call \ref sincos and waste a factor 2 in performance.
2395  */
2396 static inline SimdDouble gmx_simdcall
2397 sin(SimdDouble x)
2398 {
2399     SimdDouble s, c;
2400     sincos(x, &s, &c);
2401     return s;
2402 }
2403
2404 /*! \brief SIMD double cos(x).
2405  *
2406  * \param x The argument to evaluate cos for
2407  * \result Cos(x)
2408  *
2409  * \attention Do NOT call both sin & cos if you need both results, since each of them
2410  * will then call \ref sincos and waste a factor 2 in performance.
2411  */
2412 static inline SimdDouble gmx_simdcall
2413 cos(SimdDouble x)
2414 {
2415     SimdDouble s, c;
2416     sincos(x, &s, &c);
2417     return c;
2418 }
2419
2420 /*! \brief SIMD double tan(x).
2421  *
2422  * \param x The argument to evaluate tan for
2423  * \result Tan(x)
2424  */
2425 static inline SimdDouble gmx_simdcall
2426 tan(SimdDouble x)
2427 {
2428     const SimdDouble  argred0(-2*0.78539816290140151978);
2429     const SimdDouble  argred1(-2*4.9604678871439933374e-10);
2430     const SimdDouble  argred2(-2*1.1258708853173288931e-18);
2431     const SimdDouble  argred3(-2*1.7607799325916000908e-27);
2432     const SimdDouble  two_over_pi(2.0/M_PI);
2433     const SimdDouble  CT15(1.01419718511083373224408e-05);
2434     const SimdDouble  CT14(-2.59519791585924697698614e-05);
2435     const SimdDouble  CT13(5.23388081915899855325186e-05);
2436     const SimdDouble  CT12(-3.05033014433946488225616e-05);
2437     const SimdDouble  CT11(7.14707504084242744267497e-05);
2438     const SimdDouble  CT10(8.09674518280159187045078e-05);
2439     const SimdDouble  CT9(0.000244884931879331847054404);
2440     const SimdDouble  CT8(0.000588505168743587154904506);
2441     const SimdDouble  CT7(0.00145612788922812427978848);
2442     const SimdDouble  CT6(0.00359208743836906619142924);
2443     const SimdDouble  CT5(0.00886323944362401618113356);
2444     const SimdDouble  CT4(0.0218694882853846389592078);
2445     const SimdDouble  CT3(0.0539682539781298417636002);
2446     const SimdDouble  CT2(0.133333333333125941821962);
2447     const SimdDouble  CT1(0.333333333333334980164153);
2448
2449     SimdDouble        x2, p, y, z;
2450     SimdDBool         m;
2451
2452 #if GMX_SIMD_HAVE_DINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
2453     SimdDInt32  iy;
2454     SimdDInt32  ione(1);
2455
2456     z       = x * two_over_pi;
2457     iy      = cvtR2I(z);
2458     y       = round(z);
2459     m       = cvtIB2B((iy & ione) == ione);
2460
2461     x       = fma(y, argred0, x);
2462     x       = fma(y, argred1, x);
2463     x       = fma(y, argred2, x);
2464     x       = fma(y, argred3, x);
2465     x       = selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO), m) ^ x;
2466 #else
2467     const SimdDouble  quarter(0.25);
2468     const SimdDouble  half(0.5);
2469     const SimdDouble  threequarter(0.75);
2470     SimdDouble        w, q;
2471     SimdDBool         m1, m2, m3;
2472
2473     w       = abs(x);
2474     z       = fma(w, two_over_pi, half);
2475     y       = trunc(z);
2476     q       = z * quarter;
2477     q       = q - trunc(q);
2478     m1      = (quarter <= q);
2479     m2      = (q < half);
2480     m3      = (threequarter <= q);
2481     m1      = m1 && m2;
2482     m       = m1 || m3;
2483     w       = fma(y, argred0, w);
2484     w       = fma(y, argred1, w);
2485     w       = fma(y, argred2, w);
2486     w       = fma(y, argred3, w);
2487
2488     w       = blend(w, -w, m);
2489     x       = w * copysign( SimdDouble(1.0), x );
2490 #endif
2491     x2      = x * x;
2492     p       = fma(CT15, x2, CT14);
2493     p       = fma(p, x2, CT13);
2494     p       = fma(p, x2, CT12);
2495     p       = fma(p, x2, CT11);
2496     p       = fma(p, x2, CT10);
2497     p       = fma(p, x2, CT9);
2498     p       = fma(p, x2, CT8);
2499     p       = fma(p, x2, CT7);
2500     p       = fma(p, x2, CT6);
2501     p       = fma(p, x2, CT5);
2502     p       = fma(p, x2, CT4);
2503     p       = fma(p, x2, CT3);
2504     p       = fma(p, x2, CT2);
2505     p       = fma(p, x2, CT1);
2506     p       = fma(x2, p * x, x);
2507
2508     p       = blend( p, maskzInv(p, m), m);
2509     return p;
2510 }
2511
2512 /*! \brief SIMD double asin(x).
2513  *
2514  * \param x The argument to evaluate asin for
2515  * \result Asin(x)
2516  */
2517 static inline SimdDouble gmx_simdcall
2518 asin(SimdDouble x)
2519 {
2520     // Same algorithm as cephes library
2521     const SimdDouble limit1(0.625);
2522     const SimdDouble limit2(1e-8);
2523     const SimdDouble one(1.0);
2524     const SimdDouble quarterpi(M_PI/4.0);
2525     const SimdDouble morebits(6.123233995736765886130e-17);
2526
2527     const SimdDouble P5(4.253011369004428248960e-3);
2528     const SimdDouble P4(-6.019598008014123785661e-1);
2529     const SimdDouble P3(5.444622390564711410273e0);
2530     const SimdDouble P2(-1.626247967210700244449e1);
2531     const SimdDouble P1(1.956261983317594739197e1);
2532     const SimdDouble P0(-8.198089802484824371615e0);
2533
2534     const SimdDouble Q4(-1.474091372988853791896e1);
2535     const SimdDouble Q3(7.049610280856842141659e1);
2536     const SimdDouble Q2(-1.471791292232726029859e2);
2537     const SimdDouble Q1(1.395105614657485689735e2);
2538     const SimdDouble Q0(-4.918853881490881290097e1);
2539
2540     const SimdDouble R4(2.967721961301243206100e-3);
2541     const SimdDouble R3(-5.634242780008963776856e-1);
2542     const SimdDouble R2(6.968710824104713396794e0);
2543     const SimdDouble R1(-2.556901049652824852289e1);
2544     const SimdDouble R0(2.853665548261061424989e1);
2545
2546     const SimdDouble S3(-2.194779531642920639778e1);
2547     const SimdDouble S2(1.470656354026814941758e2);
2548     const SimdDouble S1(-3.838770957603691357202e2);
2549     const SimdDouble S0(3.424398657913078477438e2);
2550
2551     SimdDouble       xabs;
2552     SimdDouble       zz, ww, z, q, w, zz2, ww2;
2553     SimdDouble       PA, PB;
2554     SimdDouble       QA, QB;
2555     SimdDouble       RA, RB;
2556     SimdDouble       SA, SB;
2557     SimdDouble       nom, denom;
2558     SimdDBool        mask, mask2;
2559
2560     xabs  = abs(x);
2561
2562     mask  = (limit1 < xabs);
2563
2564     zz    = one - xabs;
2565     ww    = xabs * xabs;
2566     zz2   = zz * zz;
2567     ww2   = ww * ww;
2568
2569     // R
2570     RA    = fma(R4, zz2, R2);
2571     RB    = fma(R3, zz2, R1);
2572     RA    = fma(RA, zz2, R0);
2573     RA    = fma(RB, zz, RA);
2574
2575     // S, SA = zz2
2576     SB    = fma(S3, zz2, S1);
2577     SA    = zz2 +  S2;
2578     SA    = fma(SA, zz2, S0);
2579     SA    = fma(SB, zz, SA);
2580
2581     // P
2582     PA    = fma(P5, ww2, P3);
2583     PB    = fma(P4, ww2, P2);
2584     PA    = fma(PA, ww2, P1);
2585     PB    = fma(PB, ww2, P0);
2586     PA    = fma(PA, ww, PB);
2587
2588     // Q, QA = ww2
2589     QB    = fma(Q4, ww2, Q2);
2590     QA    = ww2 + Q3;
2591     QA    = fma(QA, ww2, Q1);
2592     QB    = fma(QB, ww2, Q0);
2593     QA    = fma(QA, ww, QB);
2594
2595     RA    = RA * zz;
2596     PA    = PA * ww;
2597
2598     nom   = blend( PA, RA, mask );
2599     denom = blend( QA, SA, mask );
2600
2601     mask2 = (limit2 < xabs);
2602     q     = nom * maskzInv(denom, mask2);
2603
2604     zz    = zz + zz;
2605     zz    = sqrt(zz);
2606     z     = quarterpi - zz;
2607     zz    = fms(zz, q, morebits);
2608     z     = z - zz;
2609     z     = z + quarterpi;
2610
2611     w     = xabs * q;
2612     w     = w + xabs;
2613
2614     z     = blend( w, z, mask );
2615
2616     z     = blend( xabs, z, mask2 );
2617
2618     z = copysign(z, x);
2619
2620     return z;
2621 }
2622
2623 /*! \brief SIMD double acos(x).
2624  *
2625  * \param x The argument to evaluate acos for
2626  * \result Acos(x)
2627  */
2628 static inline SimdDouble gmx_simdcall
2629 acos(SimdDouble x)
2630 {
2631     const SimdDouble one(1.0);
2632     const SimdDouble half(0.5);
2633     const SimdDouble quarterpi0(7.85398163397448309616e-1);
2634     const SimdDouble quarterpi1(6.123233995736765886130e-17);
2635
2636     SimdDBool        mask1;
2637     SimdDouble       z, z1, z2;
2638
2639     mask1 = (half < x);
2640     z1    = half * (one - x);
2641     z1    = sqrt(z1);
2642     z     = blend( x, z1, mask1 );
2643
2644     z     = asin(z);
2645
2646     z1    = z + z;
2647
2648     z2    = quarterpi0 - z;
2649     z2    = z2 + quarterpi1;
2650     z2    = z2 + quarterpi0;
2651
2652     z     = blend(z2, z1, mask1);
2653
2654     return z;
2655 }
2656
2657 /*! \brief SIMD double asin(x).
2658  *
2659  * \param x The argument to evaluate atan for
2660  * \result Atan(x), same argument/value range as standard math library.
2661  */
2662 static inline SimdDouble gmx_simdcall
2663 atan(SimdDouble x)
2664 {
2665     // Same algorithm as cephes library
2666     const SimdDouble limit1(0.66);
2667     const SimdDouble limit2(2.41421356237309504880);
2668     const SimdDouble quarterpi(M_PI/4.0);
2669     const SimdDouble halfpi(M_PI/2.0);
2670     const SimdDouble mone(-1.0);
2671     const SimdDouble morebits1(0.5*6.123233995736765886130E-17);
2672     const SimdDouble morebits2(6.123233995736765886130E-17);
2673
2674     const SimdDouble P4(-8.750608600031904122785E-1);
2675     const SimdDouble P3(-1.615753718733365076637E1);
2676     const SimdDouble P2(-7.500855792314704667340E1);
2677     const SimdDouble P1(-1.228866684490136173410E2);
2678     const SimdDouble P0(-6.485021904942025371773E1);
2679
2680     const SimdDouble Q4(2.485846490142306297962E1);
2681     const SimdDouble Q3(1.650270098316988542046E2);
2682     const SimdDouble Q2(4.328810604912902668951E2);
2683     const SimdDouble Q1(4.853903996359136964868E2);
2684     const SimdDouble Q0(1.945506571482613964425E2);
2685
2686     SimdDouble       y, xabs, t1, t2;
2687     SimdDouble       z, z2;
2688     SimdDouble       P_A, P_B, Q_A, Q_B;
2689     SimdDBool        mask1, mask2;
2690
2691     xabs   = abs(x);
2692
2693     mask1  = (limit1 < xabs);
2694     mask2  = (limit2 < xabs);
2695
2696     t1     = (xabs + mone) * maskzInv(xabs - mone, mask1);
2697     t2     = mone * maskzInv(xabs, mask2);
2698
2699     y      = selectByMask(quarterpi, mask1);
2700     y      = blend(y, halfpi, mask2);
2701     xabs   = blend(xabs, t1, mask1);
2702     xabs   = blend(xabs, t2, mask2);
2703
2704     z      = xabs * xabs;
2705     z2     = z * z;
2706
2707     P_A    = fma(P4, z2, P2);
2708     P_B    = fma(P3, z2, P1);
2709     P_A    = fma(P_A, z2, P0);
2710     P_A    = fma(P_B, z, P_A);
2711
2712     // Q_A = z2
2713     Q_B    = fma(Q4, z2, Q2);
2714     Q_A    = z2 + Q3;
2715     Q_A    = fma(Q_A, z2, Q1);
2716     Q_B    = fma(Q_B, z2, Q0);
2717     Q_A    = fma(Q_A, z, Q_B);
2718
2719     z      = z * P_A;
2720     z      = z * inv(Q_A);
2721     z      = fma(z, xabs, xabs);
2722
2723     t1     = selectByMask(morebits1, mask1);
2724     t1     = blend(t1, morebits2, mask2);
2725
2726     z      = z + t1;
2727     y      = y + z;
2728
2729     y      = copysign(y, x);
2730
2731     return y;
2732 }
2733
2734 /*! \brief SIMD double atan2(y,x).
2735  *
2736  * \param y Y component of vector, any quartile
2737  * \param x X component of vector, any quartile
2738  * \result Atan(y,x), same argument/value range as standard math library.
2739  *
2740  * \note This routine should provide correct results for all finite
2741  * non-zero or positive-zero arguments. However, negative zero arguments will
2742  * be treated as positive zero, which means the return value will deviate from
2743  * the standard math library atan2(y,x) for those cases. That should not be
2744  * of any concern in Gromacs, and in particular it will not affect calculations
2745  * of angles from vectors.
2746  */
2747 static inline SimdDouble gmx_simdcall
2748 atan2(SimdDouble y, SimdDouble x)
2749 {
2750     const SimdDouble pi(M_PI);
2751     const SimdDouble halfpi(M_PI/2.0);
2752     SimdDouble       xinv, p, aoffset;
2753     SimdDBool        mask_xnz, mask_ynz, mask_xlt0, mask_ylt0;
2754
2755     mask_xnz  = x != setZero();
2756     mask_ynz  = y != setZero();
2757     mask_xlt0 = (x < setZero());
2758     mask_ylt0 = (y < setZero());
2759
2760     aoffset   = selectByNotMask(halfpi, mask_xnz);
2761     aoffset   = selectByMask(aoffset, mask_ynz);
2762
2763     aoffset   = blend(aoffset, pi, mask_xlt0);
2764     aoffset   = blend(aoffset, -aoffset, mask_ylt0);
2765
2766     xinv      = maskzInv(x, mask_xnz);
2767     p         = y * xinv;
2768     p         = atan(p);
2769     p         = p + aoffset;
2770
2771     return p;
2772 }
2773
2774
2775 /*! \brief Calculate the force correction due to PME analytically in SIMD double.
2776  *
2777  * \param z2 This should be the value \f$(r \beta)^2\f$, where r is your
2778  *           interaction distance and beta the ewald splitting parameters.
2779  * \result Correction factor to coulomb force.
2780  *
2781  * This routine is meant to enable analytical evaluation of the
2782  * direct-space PME electrostatic force to avoid tables. For details, see the
2783  * single precision function.
2784  */
2785 static inline SimdDouble gmx_simdcall
2786 pmeForceCorrection(SimdDouble z2)
2787 {
2788     const SimdDouble  FN10(-8.0072854618360083154e-14);
2789     const SimdDouble  FN9(1.1859116242260148027e-11);
2790     const SimdDouble  FN8(-8.1490406329798423616e-10);
2791     const SimdDouble  FN7(3.4404793543907847655e-8);
2792     const SimdDouble  FN6(-9.9471420832602741006e-7);
2793     const SimdDouble  FN5(0.000020740315999115847456);
2794     const SimdDouble  FN4(-0.00031991745139313364005);
2795     const SimdDouble  FN3(0.0035074449373659008203);
2796     const SimdDouble  FN2(-0.031750380176100813405);
2797     const SimdDouble  FN1(0.13884101728898463426);
2798     const SimdDouble  FN0(-0.75225277815249618847);
2799
2800     const SimdDouble  FD5(0.000016009278224355026701);
2801     const SimdDouble  FD4(0.00051055686934806966046);
2802     const SimdDouble  FD3(0.0081803507497974289008);
2803     const SimdDouble  FD2(0.077181146026670287235);
2804     const SimdDouble  FD1(0.41543303143712535988);
2805     const SimdDouble  FD0(1.0);
2806
2807     SimdDouble        z4;
2808     SimdDouble        polyFN0, polyFN1, polyFD0, polyFD1;
2809
2810     z4             = z2 * z2;
2811
2812     polyFD1        = fma(FD5, z4, FD3);
2813     polyFD1        = fma(polyFD1, z4, FD1);
2814     polyFD1        = polyFD1 * z2;
2815     polyFD0        = fma(FD4, z4, FD2);
2816     polyFD0        = fma(polyFD0, z4, FD0);
2817     polyFD0        = polyFD0 + polyFD1;
2818
2819     polyFD0        = inv(polyFD0);
2820
2821     polyFN0        = fma(FN10, z4, FN8);
2822     polyFN0        = fma(polyFN0, z4, FN6);
2823     polyFN0        = fma(polyFN0, z4, FN4);
2824     polyFN0        = fma(polyFN0, z4, FN2);
2825     polyFN0        = fma(polyFN0, z4, FN0);
2826     polyFN1        = fma(FN9, z4, FN7);
2827     polyFN1        = fma(polyFN1, z4, FN5);
2828     polyFN1        = fma(polyFN1, z4, FN3);
2829     polyFN1        = fma(polyFN1, z4, FN1);
2830     polyFN0        = fma(polyFN1, z2, polyFN0);
2831
2832
2833     return polyFN0 * polyFD0;
2834 }
2835
2836
2837
2838 /*! \brief Calculate the potential correction due to PME analytically in SIMD double.
2839  *
2840  * \param z2 This should be the value \f$(r \beta)^2\f$, where r is your
2841  *           interaction distance and beta the ewald splitting parameters.
2842  * \result Correction factor to coulomb force.
2843  *
2844  * This routine is meant to enable analytical evaluation of the
2845  * direct-space PME electrostatic potential to avoid tables. For details, see the
2846  * single precision function.
2847  */
2848 static inline SimdDouble gmx_simdcall
2849 pmePotentialCorrection(SimdDouble z2)
2850 {
2851     const SimdDouble  VN9(-9.3723776169321855475e-13);
2852     const SimdDouble  VN8(1.2280156762674215741e-10);
2853     const SimdDouble  VN7(-7.3562157912251309487e-9);
2854     const SimdDouble  VN6(2.6215886208032517509e-7);
2855     const SimdDouble  VN5(-4.9532491651265819499e-6);
2856     const SimdDouble  VN4(0.00025907400778966060389);
2857     const SimdDouble  VN3(0.0010585044856156469792);
2858     const SimdDouble  VN2(0.045247661136833092885);
2859     const SimdDouble  VN1(0.11643931522926034421);
2860     const SimdDouble  VN0(1.1283791671726767970);
2861
2862     const SimdDouble  VD5(0.000021784709867336150342);
2863     const SimdDouble  VD4(0.00064293662010911388448);
2864     const SimdDouble  VD3(0.0096311444822588683504);
2865     const SimdDouble  VD2(0.085608012351550627051);
2866     const SimdDouble  VD1(0.43652499166614811084);
2867     const SimdDouble  VD0(1.0);
2868
2869     SimdDouble        z4;
2870     SimdDouble        polyVN0, polyVN1, polyVD0, polyVD1;
2871
2872     z4             = z2 * z2;
2873
2874     polyVD1        = fma(VD5, z4, VD3);
2875     polyVD0        = fma(VD4, z4, VD2);
2876     polyVD1        = fma(polyVD1, z4, VD1);
2877     polyVD0        = fma(polyVD0, z4, VD0);
2878     polyVD0        = fma(polyVD1, z2, polyVD0);
2879
2880     polyVD0        = inv(polyVD0);
2881
2882     polyVN1        = fma(VN9, z4, VN7);
2883     polyVN0        = fma(VN8, z4, VN6);
2884     polyVN1        = fma(polyVN1, z4, VN5);
2885     polyVN0        = fma(polyVN0, z4, VN4);
2886     polyVN1        = fma(polyVN1, z4, VN3);
2887     polyVN0        = fma(polyVN0, z4, VN2);
2888     polyVN1        = fma(polyVN1, z4, VN1);
2889     polyVN0        = fma(polyVN0, z4, VN0);
2890     polyVN0        = fma(polyVN1, z2, polyVN0);
2891
2892     return polyVN0 * polyVD0;
2893 }
2894
2895 /*! \} */
2896
2897
2898 /*! \name SIMD math functions for double prec. data, single prec. accuracy
2899  *
2900  *  \note In some cases we do not need full double accuracy of individual
2901  *        SIMD math functions, although the data is stored in double precision
2902  *        SIMD registers. This might be the case for special algorithms, or
2903  *        if the architecture does not support single precision.
2904  *        Since the full double precision evaluation of math functions
2905  *        typically require much more expensive polynomial approximations
2906  *        these functions implement the algorithms used in the single precision
2907  *        SIMD math functions, but they operate on double precision
2908  *        SIMD variables.
2909  *
2910  *  \{
2911  */
2912
2913 /*********************************************************************
2914  * SIMD MATH FUNCTIONS WITH DOUBLE PREC. DATA, SINGLE PREC. ACCURACY *
2915  *********************************************************************/
2916
2917 /*! \brief Calculate 1/sqrt(x) for SIMD double, but in single accuracy.
2918  *
2919  *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
2920  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
2921  *           For the single precision implementation this is obviously always
2922  *           true for positive values, but for double precision it adds an
2923  *           extra restriction since the first lookup step might have to be
2924  *           performed in single precision on some architectures. Note that the
2925  *           responsibility for checking falls on you - this routine does not
2926  *           check arguments.
2927  *
2928  *  \return 1/sqrt(x). Result is undefined if your argument was invalid.
2929  */
2930 static inline SimdDouble gmx_simdcall
2931 invsqrtSingleAccuracy(SimdDouble x)
2932 {
2933     SimdDouble lu = rsqrt(x);
2934 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
2935     lu = rsqrtIter(lu, x);
2936 #endif
2937 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
2938     lu = rsqrtIter(lu, x);
2939 #endif
2940 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
2941     lu = rsqrtIter(lu, x);
2942 #endif
2943     return lu;
2944 }
2945
2946 /*! \brief 1/sqrt(x) for masked-in entries of SIMD double, but in single accuracy.
2947  *
2948  *  This routine only evaluates 1/sqrt(x) for elements for which mask is true.
2949  *  Illegal values in the masked-out elements will not lead to
2950  *  floating-point exceptions.
2951  *
2952  *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
2953  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
2954  *           For the single precision implementation this is obviously always
2955  *           true for positive values, but for double precision it adds an
2956  *           extra restriction since the first lookup step might have to be
2957  *           performed in single precision on some architectures. Note that the
2958  *           responsibility for checking falls on you - this routine does not
2959  *           check arguments.
2960  *
2961  *  \param m Mask
2962  *  \return 1/sqrt(x). Result is undefined if your argument was invalid or
2963  *          entry was not masked, and 0.0 for masked-out entries.
2964  */
2965 static inline SimdDouble
2966 maskzInvsqrtSingleAccuracy(SimdDouble x, SimdDBool m)
2967 {
2968     SimdDouble lu = maskzRsqrt(x, m);
2969 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
2970     lu = rsqrtIter(lu, x);
2971 #endif
2972 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
2973     lu = rsqrtIter(lu, x);
2974 #endif
2975 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
2976     lu = rsqrtIter(lu, x);
2977 #endif
2978     return lu;
2979 }
2980
2981 /*! \brief Calculate 1/sqrt(x) for two SIMD doubles, but single accuracy.
2982  *
2983  * \param x0  First set of arguments, x0 must be in single range (see below).
2984  * \param x1  Second set of arguments, x1 must be in single range (see below).
2985  * \param[out] out0  Result 1/sqrt(x0)
2986  * \param[out] out1  Result 1/sqrt(x1)
2987  *
2988  *  In particular for double precision we can sometimes calculate square root
2989  *  pairs slightly faster by using single precision until the very last step.
2990  *
2991  * \note Both arguments must be larger than GMX_FLOAT_MIN and smaller than
2992  *       GMX_FLOAT_MAX, i.e. within the range of single precision.
2993  *       For the single precision implementation this is obviously always
2994  *       true for positive values, but for double precision it adds an
2995  *       extra restriction since the first lookup step might have to be
2996  *       performed in single precision on some architectures. Note that the
2997  *       responsibility for checking falls on you - this routine does not
2998  *       check arguments.
2999  */
3000 static inline void gmx_simdcall
3001 invsqrtPairSingleAccuracy(SimdDouble x0,    SimdDouble x1,
3002                           SimdDouble *out0, SimdDouble *out1)
3003 {
3004 #if GMX_SIMD_HAVE_FLOAT && (GMX_SIMD_FLOAT_WIDTH == 2*GMX_SIMD_DOUBLE_WIDTH) && (GMX_SIMD_RSQRT_BITS < 22)
3005     SimdFloat  xf  = cvtDD2F(x0, x1);
3006     SimdFloat  luf = rsqrt(xf);
3007     SimdDouble lu0, lu1;
3008     // Intermediate target is single - mantissa+1 bits
3009 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
3010     luf = rsqrtIter(luf, xf);
3011 #endif
3012 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3013     luf = rsqrtIter(luf, xf);
3014 #endif
3015 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3016     luf = rsqrtIter(luf, xf);
3017 #endif
3018     cvtF2DD(luf, &lu0, &lu1);
3019     // We now have single-precision accuracy values in lu0/lu1
3020     *out0 = lu0;
3021     *out1 = lu1;
3022 #else
3023     *out0 = invsqrtSingleAccuracy(x0);
3024     *out1 = invsqrtSingleAccuracy(x1);
3025 #endif
3026 }
3027
3028 /*! \brief Calculate 1/x for SIMD double, but in single accuracy.
3029  *
3030  *  \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
3031  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
3032  *           For the single precision implementation this is obviously always
3033  *           true for positive values, but for double precision it adds an
3034  *           extra restriction since the first lookup step might have to be
3035  *           performed in single precision on some architectures. Note that the
3036  *           responsibility for checking falls on you - this routine does not
3037  *           check arguments.
3038  *
3039  *  \return 1/x. Result is undefined if your argument was invalid.
3040  */
3041 static inline SimdDouble gmx_simdcall
3042 invSingleAccuracy(SimdDouble x)
3043 {
3044     SimdDouble lu = rcp(x);
3045 #if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
3046     lu = rcpIter(lu, x);
3047 #endif
3048 #if (GMX_SIMD_RCP_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3049     lu = rcpIter(lu, x);
3050 #endif
3051 #if (GMX_SIMD_RCP_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3052     lu = rcpIter(lu, x);
3053 #endif
3054     return lu;
3055 }
3056
3057 /*! \brief 1/x for masked entries of SIMD double, single accuracy.
3058  *
3059  *  \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
3060  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
3061  *           For the single precision implementation this is obviously always
3062  *           true for positive values, but for double precision it adds an
3063  *           extra restriction since the first lookup step might have to be
3064  *           performed in single precision on some architectures. Note that the
3065  *           responsibility for checking falls on you - this routine does not
3066  *           check arguments.
3067  *
3068  *  \param m Mask
3069  *  \return 1/x for elements where m is true, or 0.0 for masked-out entries.
3070  */
3071 static inline SimdDouble gmx_simdcall
3072 maskzInvSingleAccuracy(SimdDouble x, SimdDBool m)
3073 {
3074     SimdDouble lu = maskzRcp(x, m);
3075 #if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
3076     lu = rcpIter(lu, x);
3077 #endif
3078 #if (GMX_SIMD_RCP_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3079     lu = rcpIter(lu, x);
3080 #endif
3081 #if (GMX_SIMD_RCP_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3082     lu = rcpIter(lu, x);
3083 #endif
3084     return lu;
3085 }
3086
3087
3088 /*! \brief Calculate sqrt(x) (correct for 0.0) for SIMD double, with single accuracy.
3089  *
3090  *  \copydetails sqrt(SimdFloat)
3091  */
3092 template <MathOptimization opt = MathOptimization::Safe>
3093 static inline SimdDouble gmx_simdcall
3094 sqrtSingleAccuracy(SimdDouble x)
3095 {
3096     if (opt == MathOptimization::Safe)
3097     {
3098         SimdDouble res = maskzInvsqrt(x, SimdDouble(GMX_FLOAT_MIN) < x);
3099         return res*x;
3100     }
3101     else
3102     {
3103         return x * invsqrtSingleAccuracy(x);
3104     }
3105 }
3106
3107
3108 /*! \brief SIMD log(x). Double precision SIMD data, single accuracy.
3109  *
3110  * \param x Argument, should be >0.
3111  * \result The natural logarithm of x. Undefined if argument is invalid.
3112  */
3113 static inline SimdDouble gmx_simdcall
3114 logSingleAccuracy(SimdDouble x)
3115 {
3116     const SimdDouble  one(1.0);
3117     const SimdDouble  two(2.0);
3118     const SimdDouble  sqrt2(std::sqrt(2.0));
3119     const SimdDouble  corr(0.693147180559945286226764);
3120     const SimdDouble  CL9(0.2371599674224853515625);
3121     const SimdDouble  CL7(0.285279005765914916992188);
3122     const SimdDouble  CL5(0.400005519390106201171875);
3123     const SimdDouble  CL3(0.666666567325592041015625);
3124     const SimdDouble  CL1(2.0);
3125     SimdDouble        fexp, x2, p;
3126     SimdDInt32        iexp;
3127     SimdDBool         mask;
3128
3129     x     = frexp(x, &iexp);
3130     fexp  = cvtI2R(iexp);
3131
3132     mask  = (x < sqrt2);
3133     // Adjust to non-IEEE format for x<sqrt(2): exponent -= 1, mantissa *= 2.0
3134     fexp  = fexp - selectByMask(one, mask);
3135     x     = x * blend(one, two, mask);
3136
3137     x     = (x - one) * invSingleAccuracy( x + one );
3138     x2    = x * x;
3139
3140     p     = fma(CL9, x2, CL7);
3141     p     = fma(p, x2, CL5);
3142     p     = fma(p, x2, CL3);
3143     p     = fma(p, x2, CL1);
3144     p     = fma(p, x, corr * fexp);
3145
3146     return p;
3147 }
3148
3149 /*! \brief SIMD 2^x. Double precision SIMD, single accuracy.
3150  *
3151  * \copydetails exp2(SimdFloat)
3152  */
3153 template <MathOptimization opt = MathOptimization::Safe>
3154 static inline SimdDouble gmx_simdcall
3155 exp2SingleAccuracy(SimdDouble x)
3156 {
3157     const SimdDouble  CC6(0.0001534581200287996416911311);
3158     const SimdDouble  CC5(0.001339993121934088894618990);
3159     const SimdDouble  CC4(0.009618488957115180159497841);
3160     const SimdDouble  CC3(0.05550328776964726865751735);
3161     const SimdDouble  CC2(0.2402264689063408646490722);
3162     const SimdDouble  CC1(0.6931472057372680777553816);
3163     const SimdDouble  one(1.0);
3164
3165     SimdDouble        intpart;
3166     SimdDouble        p;
3167     SimdDInt32        ix;
3168
3169     // Large negative values are valid arguments to exp2(), so there are two
3170     // things we need to account for:
3171     // 1. When the exponents reaches -1023, the (biased) exponent field will be
3172     //    zero and we can no longer multiply with it. There are special IEEE
3173     //    formats to handle this range, but for now we have to accept that
3174     //    we cannot handle those arguments. If input value becomes even more
3175     //    negative, it will start to loop and we would end up with invalid
3176     //    exponents. Thus, we need to limit or mask this.
3177     // 2. For VERY large negative values, we will have problems that the
3178     //    subtraction to get the fractional part loses accuracy, and then we
3179     //    can end up with overflows in the polynomial.
3180     //
3181     // For now, we handle this by forwarding the math optimization setting to
3182     // ldexp, where the routine will return zero for very small arguments.
3183     //
3184     // However, before doing that we need to make sure we do not call cvtR2I
3185     // with an argument that is so negative it cannot be converted to an integer.
3186     if (opt == MathOptimization::Safe)
3187     {
3188         x         = max(x, SimdDouble(std::numeric_limits<std::int32_t>::lowest()));
3189     }
3190
3191     ix        = cvtR2I(x);
3192     intpart   = round(x);
3193     x         = x - intpart;
3194
3195     p         = fma(CC6, x, CC5);
3196     p         = fma(p, x, CC4);
3197     p         = fma(p, x, CC3);
3198     p         = fma(p, x, CC2);
3199     p         = fma(p, x, CC1);
3200     p         = fma(p, x, one);
3201     x         = ldexp<opt>(p, ix);
3202
3203     return x;
3204 }
3205
3206
3207
3208 /*! \brief SIMD exp(x). Double precision SIMD, single accuracy.
3209  *
3210  * \copydetails exp(SimdFloat)
3211  */
3212 template <MathOptimization opt = MathOptimization::Safe>
3213 static inline SimdDouble gmx_simdcall
3214 expSingleAccuracy(SimdDouble x)
3215 {
3216     const SimdDouble  argscale(1.44269504088896341);
3217     // Lower bound: Clamp args that would lead to an IEEE fp exponent below -1023.
3218     const SimdDouble  smallArgLimit(-709.0895657128);
3219     const SimdDouble  invargscale(-0.69314718055994528623);
3220     const SimdDouble  CC4(0.00136324646882712841033936);
3221     const SimdDouble  CC3(0.00836596917361021041870117);
3222     const SimdDouble  CC2(0.0416710823774337768554688);
3223     const SimdDouble  CC1(0.166665524244308471679688);
3224     const SimdDouble  CC0(0.499999850988388061523438);
3225     const SimdDouble  one(1.0);
3226     SimdDouble        intpart;
3227     SimdDouble        y, p;
3228     SimdDInt32        iy;
3229
3230     // Large negative values are valid arguments to exp2(), so there are two
3231     // things we need to account for:
3232     // 1. When the exponents reaches -1023, the (biased) exponent field will be
3233     //    zero and we can no longer multiply with it. There are special IEEE
3234     //    formats to handle this range, but for now we have to accept that
3235     //    we cannot handle those arguments. If input value becomes even more
3236     //    negative, it will start to loop and we would end up with invalid
3237     //    exponents. Thus, we need to limit or mask this.
3238     // 2. For VERY large negative values, we will have problems that the
3239     //    subtraction to get the fractional part loses accuracy, and then we
3240     //    can end up with overflows in the polynomial.
3241     //
3242     // For now, we handle this by forwarding the math optimization setting to
3243     // ldexp, where the routine will return zero for very small arguments.
3244     //
3245     // However, before doing that we need to make sure we do not call cvtR2I
3246     // with an argument that is so negative it cannot be converted to an integer
3247     // after being multiplied by argscale.
3248
3249     if (opt == MathOptimization::Safe)
3250     {
3251         x         = max(x, SimdDouble(std::numeric_limits<std::int32_t>::lowest())/argscale);
3252     }
3253
3254     y         = x * argscale;
3255
3256     iy        = cvtR2I(y);
3257     intpart   = round(y);        // use same rounding algorithm here
3258
3259     // Extended precision arithmetics not needed since
3260     // we have double precision and only need single accuracy.
3261     x         = fma(invargscale, intpart, x);
3262
3263     p         = fma(CC4, x, CC3);
3264     p         = fma(p, x, CC2);
3265     p         = fma(p, x, CC1);
3266     p         = fma(p, x, CC0);
3267     p         = fma(x*x, p, x);
3268     p         = p + one;
3269     x         = ldexp<opt>(p, iy);
3270     return x;
3271 }
3272
3273
3274 /*! \brief SIMD erf(x). Double precision SIMD data, single accuracy.
3275  *
3276  * \param x The value to calculate erf(x) for.
3277  * \result erf(x)
3278  *
3279  * This routine achieves very close to single precision, but we do not care about
3280  * the last bit or the subnormal result range.
3281  */
3282 static inline SimdDouble gmx_simdcall
3283 erfSingleAccuracy(SimdDouble x)
3284 {
3285     // Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1]
3286     const SimdDouble  CA6(7.853861353153693e-5);
3287     const SimdDouble  CA5(-8.010193625184903e-4);
3288     const SimdDouble  CA4(5.188327685732524e-3);
3289     const SimdDouble  CA3(-2.685381193529856e-2);
3290     const SimdDouble  CA2(1.128358514861418e-1);
3291     const SimdDouble  CA1(-3.761262582423300e-1);
3292     const SimdDouble  CA0(1.128379165726710);
3293     // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2]
3294     const SimdDouble  CB9(-0.0018629930017603923);
3295     const SimdDouble  CB8(0.003909821287598495);
3296     const SimdDouble  CB7(-0.0052094582210355615);
3297     const SimdDouble  CB6(0.005685614362160572);
3298     const SimdDouble  CB5(-0.0025367682853477272);
3299     const SimdDouble  CB4(-0.010199799682318782);
3300     const SimdDouble  CB3(0.04369575504816542);
3301     const SimdDouble  CB2(-0.11884063474674492);
3302     const SimdDouble  CB1(0.2732120154030589);
3303     const SimdDouble  CB0(0.42758357702025784);
3304     // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19]
3305     const SimdDouble  CC10(-0.0445555913112064);
3306     const SimdDouble  CC9(0.21376355144663348);
3307     const SimdDouble  CC8(-0.3473187200259257);
3308     const SimdDouble  CC7(0.016690861551248114);
3309     const SimdDouble  CC6(0.7560973182491192);
3310     const SimdDouble  CC5(-1.2137903600145787);
3311     const SimdDouble  CC4(0.8411872321232948);
3312     const SimdDouble  CC3(-0.08670413896296343);
3313     const SimdDouble  CC2(-0.27124782687240334);
3314     const SimdDouble  CC1(-0.0007502488047806069);
3315     const SimdDouble  CC0(0.5642114853803148);
3316     const SimdDouble  one(1.0);
3317     const SimdDouble  two(2.0);
3318
3319     SimdDouble        x2, x4, y;
3320     SimdDouble        t, t2, w, w2;
3321     SimdDouble        pA0, pA1, pB0, pB1, pC0, pC1;
3322     SimdDouble        expmx2;
3323     SimdDouble        res_erf, res_erfc, res;
3324     SimdDBool         mask, msk_erf;
3325
3326     // Calculate erf()
3327     x2   = x * x;
3328     x4   = x2 * x2;
3329
3330     pA0  = fma(CA6, x4, CA4);
3331     pA1  = fma(CA5, x4, CA3);
3332     pA0  = fma(pA0, x4, CA2);
3333     pA1  = fma(pA1, x4, CA1);
3334     pA0  = pA0 * x4;
3335     pA0  = fma(pA1, x2, pA0);
3336     // Constant term must come last for precision reasons
3337     pA0  = pA0 + CA0;
3338
3339     res_erf = x * pA0;
3340
3341     // Calculate erfc
3342     y       = abs(x);
3343     msk_erf = (SimdDouble(0.75) <= y);
3344     t       = maskzInvSingleAccuracy(y, msk_erf);
3345     w       = t - one;
3346     t2      = t * t;
3347     w2      = w * w;
3348
3349     expmx2  = expSingleAccuracy( -y*y );
3350
3351     pB1  = fma(CB9, w2, CB7);
3352     pB0  = fma(CB8, w2, CB6);
3353     pB1  = fma(pB1, w2, CB5);
3354     pB0  = fma(pB0, w2, CB4);
3355     pB1  = fma(pB1, w2, CB3);
3356     pB0  = fma(pB0, w2, CB2);
3357     pB1  = fma(pB1, w2, CB1);
3358     pB0  = fma(pB0, w2, CB0);
3359     pB0  = fma(pB1, w, pB0);
3360
3361     pC0  = fma(CC10, t2, CC8);
3362     pC1  = fma(CC9, t2, CC7);
3363     pC0  = fma(pC0, t2, CC6);
3364     pC1  = fma(pC1, t2, CC5);
3365     pC0  = fma(pC0, t2, CC4);
3366     pC1  = fma(pC1, t2, CC3);
3367     pC0  = fma(pC0, t2, CC2);
3368     pC1  = fma(pC1, t2, CC1);
3369
3370     pC0  = fma(pC0, t2, CC0);
3371     pC0  = fma(pC1, t, pC0);
3372     pC0  = pC0 * t;
3373
3374     // Select pB0 or pC0 for erfc()
3375     mask     = (two < y);
3376     res_erfc = blend(pB0, pC0, mask);
3377     res_erfc = res_erfc * expmx2;
3378
3379     // erfc(x<0) = 2-erfc(|x|)
3380     mask     = (x < setZero());
3381     res_erfc = blend(res_erfc, two - res_erfc, mask);
3382
3383     // Select erf() or erfc()
3384     mask = (y < SimdDouble(0.75));
3385     res  = blend(one - res_erfc, res_erf, mask);
3386
3387     return res;
3388 }
3389
3390 /*! \brief SIMD erfc(x). Double precision SIMD data, single accuracy.
3391  *
3392  * \param x The value to calculate erfc(x) for.
3393  * \result erfc(x)
3394  *
3395  * This routine achieves singleprecision (bar the last bit) over most of the
3396  * input range, but for large arguments where the result is getting close
3397  * to the minimum representable numbers we accept slightly larger errors
3398  * (think results that are in the ballpark of 10^-30) since that is not
3399  * relevant for MD.
3400  */
3401 static inline SimdDouble gmx_simdcall
3402 erfcSingleAccuracy(SimdDouble x)
3403 {
3404     // Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1]
3405     const SimdDouble  CA6(7.853861353153693e-5);
3406     const SimdDouble  CA5(-8.010193625184903e-4);
3407     const SimdDouble  CA4(5.188327685732524e-3);
3408     const SimdDouble  CA3(-2.685381193529856e-2);
3409     const SimdDouble  CA2(1.128358514861418e-1);
3410     const SimdDouble  CA1(-3.761262582423300e-1);
3411     const SimdDouble  CA0(1.128379165726710);
3412     // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2]
3413     const SimdDouble  CB9(-0.0018629930017603923);
3414     const SimdDouble  CB8(0.003909821287598495);
3415     const SimdDouble  CB7(-0.0052094582210355615);
3416     const SimdDouble  CB6(0.005685614362160572);
3417     const SimdDouble  CB5(-0.0025367682853477272);
3418     const SimdDouble  CB4(-0.010199799682318782);
3419     const SimdDouble  CB3(0.04369575504816542);
3420     const SimdDouble  CB2(-0.11884063474674492);
3421     const SimdDouble  CB1(0.2732120154030589);
3422     const SimdDouble  CB0(0.42758357702025784);
3423     // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19]
3424     const SimdDouble  CC10(-0.0445555913112064);
3425     const SimdDouble  CC9(0.21376355144663348);
3426     const SimdDouble  CC8(-0.3473187200259257);
3427     const SimdDouble  CC7(0.016690861551248114);
3428     const SimdDouble  CC6(0.7560973182491192);
3429     const SimdDouble  CC5(-1.2137903600145787);
3430     const SimdDouble  CC4(0.8411872321232948);
3431     const SimdDouble  CC3(-0.08670413896296343);
3432     const SimdDouble  CC2(-0.27124782687240334);
3433     const SimdDouble  CC1(-0.0007502488047806069);
3434     const SimdDouble  CC0(0.5642114853803148);
3435     const SimdDouble  one(1.0);
3436     const SimdDouble  two(2.0);
3437
3438     SimdDouble        x2, x4, y;
3439     SimdDouble        t, t2, w, w2;
3440     SimdDouble        pA0, pA1, pB0, pB1, pC0, pC1;
3441     SimdDouble        expmx2;
3442     SimdDouble        res_erf, res_erfc, res;
3443     SimdDBool         mask, msk_erf;
3444
3445     // Calculate erf()
3446     x2     = x * x;
3447     x4     = x2 * x2;
3448
3449     pA0  = fma(CA6, x4, CA4);
3450     pA1  = fma(CA5, x4, CA3);
3451     pA0  = fma(pA0, x4, CA2);
3452     pA1  = fma(pA1, x4, CA1);
3453     pA1  = pA1 * x2;
3454     pA0  = fma(pA0, x4, pA1);
3455     // Constant term must come last for precision reasons
3456     pA0  = pA0 + CA0;
3457
3458     res_erf = x * pA0;
3459
3460     // Calculate erfc
3461     y       = abs(x);
3462     msk_erf = (SimdDouble(0.75) <= y);
3463     t       = maskzInvSingleAccuracy(y, msk_erf);
3464     w       = t - one;
3465     t2      = t * t;
3466     w2      = w * w;
3467
3468     expmx2  = expSingleAccuracy( -y*y );
3469
3470     pB1  = fma(CB9, w2, CB7);
3471     pB0  = fma(CB8, w2, CB6);
3472     pB1  = fma(pB1, w2, CB5);
3473     pB0  = fma(pB0, w2, CB4);
3474     pB1  = fma(pB1, w2, CB3);
3475     pB0  = fma(pB0, w2, CB2);
3476     pB1  = fma(pB1, w2, CB1);
3477     pB0  = fma(pB0, w2, CB0);
3478     pB0  = fma(pB1, w, pB0);
3479
3480     pC0  = fma(CC10, t2, CC8);
3481     pC1  = fma(CC9, t2, CC7);
3482     pC0  = fma(pC0, t2, CC6);
3483     pC1  = fma(pC1, t2, CC5);
3484     pC0  = fma(pC0, t2, CC4);
3485     pC1  = fma(pC1, t2, CC3);
3486     pC0  = fma(pC0, t2, CC2);
3487     pC1  = fma(pC1, t2, CC1);
3488
3489     pC0  = fma(pC0, t2, CC0);
3490     pC0  = fma(pC1, t, pC0);
3491     pC0  = pC0 * t;
3492
3493     // Select pB0 or pC0 for erfc()
3494     mask     = (two < y);
3495     res_erfc = blend(pB0, pC0, mask);
3496     res_erfc = res_erfc * expmx2;
3497
3498     // erfc(x<0) = 2-erfc(|x|)
3499     mask     = (x < setZero());
3500     res_erfc = blend(res_erfc, two - res_erfc, mask);
3501
3502     // Select erf() or erfc()
3503     mask = (y < SimdDouble(0.75));
3504     res  = blend(res_erfc, one - res_erf, mask);
3505
3506     return res;
3507 }
3508
3509 /*! \brief SIMD sin \& cos. Double precision SIMD data, single accuracy.
3510  *
3511  * \param x The argument to evaluate sin/cos for
3512  * \param[out] sinval Sin(x)
3513  * \param[out] cosval Cos(x)
3514  */
3515 static inline void gmx_simdcall
3516 sinCosSingleAccuracy(SimdDouble x, SimdDouble *sinval, SimdDouble *cosval)
3517 {
3518     // Constants to subtract Pi/4*x from y while minimizing precision loss
3519     const SimdDouble  argred0(2*0.78539816290140151978);
3520     const SimdDouble  argred1(2*4.9604678871439933374e-10);
3521     const SimdDouble  argred2(2*1.1258708853173288931e-18);
3522     const SimdDouble  two_over_pi(2.0/M_PI);
3523     const SimdDouble  const_sin2(-1.9515295891e-4);
3524     const SimdDouble  const_sin1( 8.3321608736e-3);
3525     const SimdDouble  const_sin0(-1.6666654611e-1);
3526     const SimdDouble  const_cos2( 2.443315711809948e-5);
3527     const SimdDouble  const_cos1(-1.388731625493765e-3);
3528     const SimdDouble  const_cos0( 4.166664568298827e-2);
3529
3530     const SimdDouble  half(0.5);
3531     const SimdDouble  one(1.0);
3532     SimdDouble        ssign, csign;
3533     SimdDouble        x2, y, z, psin, pcos, sss, ccc;
3534     SimdDBool         mask;
3535
3536 #if GMX_SIMD_HAVE_DINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
3537     const SimdDInt32  ione(1);
3538     const SimdDInt32  itwo(2);
3539     SimdDInt32        iy;
3540
3541     z       = x * two_over_pi;
3542     iy      = cvtR2I(z);
3543     y       = round(z);
3544
3545     mask    = cvtIB2B((iy & ione) == setZero());
3546     ssign   = selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO), cvtIB2B((iy & itwo) == itwo));
3547     csign   = selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO), cvtIB2B(((iy + ione) & itwo) == itwo));
3548 #else
3549     const SimdDouble  quarter(0.25);
3550     const SimdDouble  minusquarter(-0.25);
3551     SimdDouble        q;
3552     SimdDBool         m1, m2, m3;
3553
3554     /* The most obvious way to find the arguments quadrant in the unit circle
3555      * to calculate the sign is to use integer arithmetic, but that is not
3556      * present in all SIMD implementations. As an alternative, we have devised a
3557      * pure floating-point algorithm that uses truncation for argument reduction
3558      * so that we get a new value 0<=q<1 over the unit circle, and then
3559      * do floating-point comparisons with fractions. This is likely to be
3560      * slightly slower (~10%) due to the longer latencies of floating-point, so
3561      * we only use it when integer SIMD arithmetic is not present.
3562      */
3563     ssign   = x;
3564     x       = abs(x);
3565     // It is critical that half-way cases are rounded down
3566     z       = fma(x, two_over_pi, half);
3567     y       = trunc(z);
3568     q       = z * quarter;
3569     q       = q - trunc(q);
3570     /* z now starts at 0.0 for x=-pi/4 (although neg. values cannot occur), and
3571      * then increased by 1.0 as x increases by 2*Pi, when it resets to 0.0.
3572      * This removes the 2*Pi periodicity without using any integer arithmetic.
3573      * First check if y had the value 2 or 3, set csign if true.
3574      */
3575     q       = q - half;
3576     /* If we have logical operations we can work directly on the signbit, which
3577      * saves instructions. Otherwise we need to represent signs as +1.0/-1.0.
3578      * Thus, if you are altering defines to debug alternative code paths, the
3579      * two GMX_SIMD_HAVE_LOGICAL sections in this routine must either both be
3580      * active or inactive - you will get errors if only one is used.
3581      */
3582 #    if GMX_SIMD_HAVE_LOGICAL
3583     ssign   = ssign & SimdDouble(GMX_DOUBLE_NEGZERO);
3584     csign   = andNot(q, SimdDouble(GMX_DOUBLE_NEGZERO));
3585     ssign   = ssign ^ csign;
3586 #    else
3587     ssign   = copysign(SimdDouble(1.0), ssign);
3588     csign   = copysign(SimdDouble(1.0), q);
3589     csign   = -csign;
3590     ssign   = ssign * csign;    // swap ssign if csign was set.
3591 #    endif
3592     // Check if y had value 1 or 3 (remember we subtracted 0.5 from q)
3593     m1      = (q < minusquarter);
3594     m2      = (setZero() <= q);
3595     m3      = (q < quarter);
3596     m2      = m2 && m3;
3597     mask    = m1 || m2;
3598     // where mask is FALSE, swap sign.
3599     csign   = csign * blend(SimdDouble(-1.0), one, mask);
3600 #endif
3601     x       = fnma(y, argred0, x);
3602     x       = fnma(y, argred1, x);
3603     x       = fnma(y, argred2, x);
3604     x2      = x * x;
3605
3606     psin    = fma(const_sin2, x2, const_sin1);
3607     psin    = fma(psin, x2, const_sin0);
3608     psin    = fma(psin, x * x2, x);
3609     pcos    = fma(const_cos2, x2, const_cos1);
3610     pcos    = fma(pcos, x2, const_cos0);
3611     pcos    = fms(pcos, x2, half);
3612     pcos    = fma(pcos, x2, one);
3613
3614     sss     = blend(pcos, psin, mask);
3615     ccc     = blend(psin, pcos, mask);
3616     // See comment for GMX_SIMD_HAVE_LOGICAL section above.
3617 #if GMX_SIMD_HAVE_LOGICAL
3618     *sinval = sss ^ ssign;
3619     *cosval = ccc ^ csign;
3620 #else
3621     *sinval = sss * ssign;
3622     *cosval = ccc * csign;
3623 #endif
3624 }
3625
3626 /*! \brief SIMD sin(x). Double precision SIMD data, single accuracy.
3627  *
3628  * \param x The argument to evaluate sin for
3629  * \result Sin(x)
3630  *
3631  * \attention Do NOT call both sin & cos if you need both results, since each of them
3632  * will then call \ref sincos and waste a factor 2 in performance.
3633  */
3634 static inline SimdDouble gmx_simdcall
3635 sinSingleAccuracy(SimdDouble x)
3636 {
3637     SimdDouble s, c;
3638     sinCosSingleAccuracy(x, &s, &c);
3639     return s;
3640 }
3641
3642 /*! \brief SIMD cos(x). Double precision SIMD data, single accuracy.
3643  *
3644  * \param x The argument to evaluate cos for
3645  * \result Cos(x)
3646  *
3647  * \attention Do NOT call both sin & cos if you need both results, since each of them
3648  * will then call \ref sincos and waste a factor 2 in performance.
3649  */
3650 static inline SimdDouble gmx_simdcall
3651 cosSingleAccuracy(SimdDouble x)
3652 {
3653     SimdDouble s, c;
3654     sinCosSingleAccuracy(x, &s, &c);
3655     return c;
3656 }
3657
3658 /*! \brief SIMD tan(x). Double precision SIMD data, single accuracy.
3659  *
3660  * \param x The argument to evaluate tan for
3661  * \result Tan(x)
3662  */
3663 static inline SimdDouble gmx_simdcall
3664 tanSingleAccuracy(SimdDouble x)
3665 {
3666     const SimdDouble  argred0(2*0.78539816290140151978);
3667     const SimdDouble  argred1(2*4.9604678871439933374e-10);
3668     const SimdDouble  argred2(2*1.1258708853173288931e-18);
3669     const SimdDouble  two_over_pi(2.0/M_PI);
3670     const SimdDouble  CT6(0.009498288995810566122993911);
3671     const SimdDouble  CT5(0.002895755790837379295226923);
3672     const SimdDouble  CT4(0.02460087336161924491836265);
3673     const SimdDouble  CT3(0.05334912882656359828045988);
3674     const SimdDouble  CT2(0.1333989091464957704418495);
3675     const SimdDouble  CT1(0.3333307599244198227797507);
3676
3677     SimdDouble        x2, p, y, z;
3678     SimdDBool         mask;
3679
3680 #if GMX_SIMD_HAVE_DINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
3681     SimdDInt32  iy;
3682     SimdDInt32  ione(1);
3683
3684     z       = x * two_over_pi;
3685     iy      = cvtR2I(z);
3686     y       = round(z);
3687     mask    = cvtIB2B((iy & ione) == ione);
3688
3689     x       = fnma(y, argred0, x);
3690     x       = fnma(y, argred1, x);
3691     x       = fnma(y, argred2, x);
3692     x       = selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO), mask) ^ x;
3693 #else
3694     const SimdDouble  quarter(0.25);
3695     const SimdDouble  half(0.5);
3696     const SimdDouble  threequarter(0.75);
3697     SimdDouble        w, q;
3698     SimdDBool         m1, m2, m3;
3699
3700     w       = abs(x);
3701     z       = fma(w, two_over_pi, half);
3702     y       = trunc(z);
3703     q       = z * quarter;
3704     q       = q - trunc(q);
3705     m1      = (quarter <= q);
3706     m2      = (q < half);
3707     m3      = (threequarter <= q);
3708     m1      = m1 && m2;
3709     mask    = m1 || m3;
3710     w       = fnma(y, argred0, w);
3711     w       = fnma(y, argred1, w);
3712     w       = fnma(y, argred2, w);
3713
3714     w       = blend(w, -w, mask);
3715     x       = w * copysign( SimdDouble(1.0), x );
3716 #endif
3717     x2      = x * x;
3718     p       = fma(CT6, x2, CT5);
3719     p       = fma(p, x2, CT4);
3720     p       = fma(p, x2, CT3);
3721     p       = fma(p, x2, CT2);
3722     p       = fma(p, x2, CT1);
3723     p       = fma(x2, p * x, x);
3724
3725     p       = blend( p, maskzInvSingleAccuracy(p, mask), mask);
3726     return p;
3727 }
3728
3729 /*! \brief SIMD asin(x). Double precision SIMD data, single accuracy.
3730  *
3731  * \param x The argument to evaluate asin for
3732  * \result Asin(x)
3733  */
3734 static inline SimdDouble gmx_simdcall
3735 asinSingleAccuracy(SimdDouble x)
3736 {
3737     const SimdDouble limitlow(1e-4);
3738     const SimdDouble half(0.5);
3739     const SimdDouble one(1.0);
3740     const SimdDouble halfpi(M_PI/2.0);
3741     const SimdDouble CC5(4.2163199048E-2);
3742     const SimdDouble CC4(2.4181311049E-2);
3743     const SimdDouble CC3(4.5470025998E-2);
3744     const SimdDouble CC2(7.4953002686E-2);
3745     const SimdDouble CC1(1.6666752422E-1);
3746     SimdDouble       xabs;
3747     SimdDouble       z, z1, z2, q, q1, q2;
3748     SimdDouble       pA, pB;
3749     SimdDBool        mask, mask2;
3750
3751     xabs  = abs(x);
3752     mask  = (half < xabs);
3753     z1    = half * (one - xabs);
3754     mask2 = (xabs < one);
3755     q1    = z1 * maskzInvsqrtSingleAccuracy(z1, mask2);
3756     q2    = xabs;
3757     z2    = q2 * q2;
3758     z     = blend(z2, z1, mask);
3759     q     = blend(q2, q1, mask);
3760
3761     z2    = z * z;
3762     pA    = fma(CC5, z2, CC3);
3763     pB    = fma(CC4, z2, CC2);
3764     pA    = fma(pA, z2, CC1);
3765     pA    = pA * z;
3766     z     = fma(pB, z2, pA);
3767     z     = fma(z, q, q);
3768     q2    = halfpi - z;
3769     q2    = q2 - z;
3770     z     = blend(z, q2, mask);
3771
3772     mask  = (limitlow < xabs);
3773     z     = blend( xabs, z, mask );
3774     z     = copysign(z, x);
3775
3776     return z;
3777 }
3778
3779 /*! \brief SIMD acos(x). Double precision SIMD data, single accuracy.
3780  *
3781  * \param x The argument to evaluate acos for
3782  * \result Acos(x)
3783  */
3784 static inline SimdDouble gmx_simdcall
3785 acosSingleAccuracy(SimdDouble x)
3786 {
3787     const SimdDouble one(1.0);
3788     const SimdDouble half(0.5);
3789     const SimdDouble pi(M_PI);
3790     const SimdDouble halfpi(M_PI/2.0);
3791     SimdDouble       xabs;
3792     SimdDouble       z, z1, z2, z3;
3793     SimdDBool        mask1, mask2, mask3;
3794
3795     xabs  = abs(x);
3796     mask1 = (half < xabs);
3797     mask2 = (setZero() < x);
3798
3799     z     = half * (one - xabs);
3800     mask3 = (xabs < one);
3801     z     = z * maskzInvsqrtSingleAccuracy(z, mask3);
3802     z     = blend(x, z, mask1);
3803     z     = asinSingleAccuracy(z);
3804
3805     z2    = z + z;
3806     z1    = pi - z2;
3807     z3    = halfpi - z;
3808     z     = blend(z1, z2, mask2);
3809     z     = blend(z3, z, mask1);
3810
3811     return z;
3812 }
3813
3814 /*! \brief SIMD asin(x). Double precision SIMD data, single accuracy.
3815  *
3816  * \param x The argument to evaluate atan for
3817  * \result Atan(x), same argument/value range as standard math library.
3818  */
3819 static inline SimdDouble gmx_simdcall
3820 atanSingleAccuracy(SimdDouble x)
3821 {
3822     const SimdDouble halfpi(M_PI/2);
3823     const SimdDouble CA17(0.002823638962581753730774);
3824     const SimdDouble CA15(-0.01595690287649631500244);
3825     const SimdDouble CA13(0.04250498861074447631836);
3826     const SimdDouble CA11(-0.07489009201526641845703);
3827     const SimdDouble CA9(0.1063479334115982055664);
3828     const SimdDouble CA7(-0.1420273631811141967773);
3829     const SimdDouble CA5(0.1999269574880599975585);
3830     const SimdDouble CA3(-0.3333310186862945556640);
3831     SimdDouble       x2, x3, x4, pA, pB;
3832     SimdDBool        mask, mask2;
3833
3834     mask  = (x < setZero());
3835     x     = abs(x);
3836     mask2 = (SimdDouble(1.0) < x);
3837     x     = blend(x, maskzInvSingleAccuracy(x, mask2), mask2);
3838
3839     x2    = x * x;
3840     x3    = x2 * x;
3841     x4    = x2 * x2;
3842     pA    = fma(CA17, x4, CA13);
3843     pB    = fma(CA15, x4, CA11);
3844     pA    = fma(pA, x4, CA9);
3845     pB    = fma(pB, x4, CA7);
3846     pA    = fma(pA, x4, CA5);
3847     pB    = fma(pB, x4, CA3);
3848     pA    = fma(pA, x2, pB);
3849     pA    = fma(pA, x3, x);
3850
3851     pA    = blend(pA, halfpi - pA, mask2);
3852     pA    = blend(pA, -pA, mask);
3853
3854     return pA;
3855 }
3856
3857 /*! \brief SIMD atan2(y,x). Double precision SIMD data, single accuracy.
3858  *
3859  * \param y Y component of vector, any quartile
3860  * \param x X component of vector, any quartile
3861  * \result Atan(y,x), same argument/value range as standard math library.
3862  *
3863  * \note This routine should provide correct results for all finite
3864  * non-zero or positive-zero arguments. However, negative zero arguments will
3865  * be treated as positive zero, which means the return value will deviate from
3866  * the standard math library atan2(y,x) for those cases. That should not be
3867  * of any concern in Gromacs, and in particular it will not affect calculations
3868  * of angles from vectors.
3869  */
3870 static inline SimdDouble gmx_simdcall
3871 atan2SingleAccuracy(SimdDouble y, SimdDouble x)
3872 {
3873     const SimdDouble pi(M_PI);
3874     const SimdDouble halfpi(M_PI/2.0);
3875     SimdDouble       xinv, p, aoffset;
3876     SimdDBool        mask_xnz, mask_ynz, mask_xlt0, mask_ylt0;
3877
3878     mask_xnz  = x != setZero();
3879     mask_ynz  = y != setZero();
3880     mask_xlt0 = (x < setZero());
3881     mask_ylt0 = (y < setZero());
3882
3883     aoffset   = selectByNotMask(halfpi, mask_xnz);
3884     aoffset   = selectByMask(aoffset, mask_ynz);
3885
3886     aoffset   = blend(aoffset, pi, mask_xlt0);
3887     aoffset   = blend(aoffset, -aoffset, mask_ylt0);
3888
3889     xinv      = maskzInvSingleAccuracy(x, mask_xnz);
3890     p         = y * xinv;
3891     p         = atanSingleAccuracy(p);
3892     p         = p + aoffset;
3893
3894     return p;
3895 }
3896
3897 /*! \brief Analytical PME force correction, double SIMD data, single accuracy.
3898  *
3899  * \param z2 \f$(r \beta)^2\f$ - see below for details.
3900  * \result Correction factor to coulomb force - see below for details.
3901  *
3902  * This routine is meant to enable analytical evaluation of the
3903  * direct-space PME electrostatic force to avoid tables.
3904  *
3905  * The direct-space potential should be \f$ \mbox{erfc}(\beta r)/r\f$, but there
3906  * are some problems evaluating that:
3907  *
3908  * First, the error function is difficult (read: expensive) to
3909  * approxmiate accurately for intermediate to large arguments, and
3910  * this happens already in ranges of \f$(\beta r)\f$ that occur in simulations.
3911  * Second, we now try to avoid calculating potentials in Gromacs but
3912  * use forces directly.
3913  *
3914  * We can simply things slight by noting that the PME part is really
3915  * a correction to the normal Coulomb force since \f$\mbox{erfc}(z)=1-\mbox{erf}(z)\f$, i.e.
3916  * \f[
3917  * V = \frac{1}{r} - \frac{\mbox{erf}(\beta r)}{r}
3918  * \f]
3919  * The first term we already have from the inverse square root, so
3920  * that we can leave out of this routine.
3921  *
3922  * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
3923  * the argument \f$beta r\f$ will be in the range 0.15 to ~4. Use your
3924  * favorite plotting program to realize how well-behaved \f$\frac{\mbox{erf}(z)}{z}\f$ is
3925  * in this range!
3926  *
3927  * We approximate \f$f(z)=\mbox{erf}(z)/z\f$ with a rational minimax polynomial.
3928  * However, it turns out it is more efficient to approximate \f$f(z)/z\f$ and
3929  * then only use even powers. This is another minor optimization, since
3930  * we actually \a want \f$f(z)/z\f$, because it is going to be multiplied by
3931  * the vector between the two atoms to get the vectorial force. The
3932  * fastest flops are the ones we can avoid calculating!
3933  *
3934  * So, here's how it should be used:
3935  *
3936  * 1. Calculate \f$r^2\f$.
3937  * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=(\beta r)^2\f$.
3938  * 3. Evaluate this routine with \f$z^2\f$ as the argument.
3939  * 4. The return value is the expression:
3940  *
3941  * \f[
3942  *    \frac{2 \exp{-z^2}}{\sqrt{\pi} z^2}-\frac{\mbox{erf}(z)}{z^3}
3943  * \f]
3944  *
3945  * 5. Multiply the entire expression by \f$\beta^3\f$. This will get you
3946  *
3947  *  \f[
3948  *    \frac{2 \beta^3 \exp(-z^2)}{\sqrt{\pi} z^2} - \frac{\beta^3 \mbox{erf}(z)}{z^3}
3949  *  \f]
3950  *
3951  *    or, switching back to \f$r\f$ (since \f$z=r \beta\f$):
3952  *
3953  *  \f[
3954  *    \frac{2 \beta \exp(-r^2 \beta^2)}{\sqrt{\pi} r^2} - \frac{\mbox{erf}(r \beta)}{r^3}
3955  *  \f]
3956  *
3957  *    With a bit of math exercise you should be able to confirm that
3958  *    this is exactly
3959  *
3960  *  \f[
3961  *   \frac{\frac{d}{dr}\left( \frac{\mbox{erf}(\beta r)}{r} \right)}{r}
3962  *  \f]
3963  *
3964  * 6. Add the result to \f$r^{-3}\f$, multiply by the product of the charges,
3965  *    and you have your force (divided by \f$r\f$). A final multiplication
3966  *    with the vector connecting the two particles and you have your
3967  *    vectorial force to add to the particles.
3968  *
3969  * This approximation achieves an accuracy slightly lower than 1e-6; when
3970  * added to \f$1/r\f$ the error will be insignificant.
3971  *
3972  */
3973 static inline SimdDouble gmx_simdcall
3974 pmeForceCorrectionSingleAccuracy(SimdDouble z2)
3975 {
3976     const SimdDouble  FN6(-1.7357322914161492954e-8);
3977     const SimdDouble  FN5(1.4703624142580877519e-6);
3978     const SimdDouble  FN4(-0.000053401640219807709149);
3979     const SimdDouble  FN3(0.0010054721316683106153);
3980     const SimdDouble  FN2(-0.019278317264888380590);
3981     const SimdDouble  FN1(0.069670166153766424023);
3982     const SimdDouble  FN0(-0.75225204789749321333);
3983
3984     const SimdDouble  FD4(0.0011193462567257629232);
3985     const SimdDouble  FD3(0.014866955030185295499);
3986     const SimdDouble  FD2(0.11583842382862377919);
3987     const SimdDouble  FD1(0.50736591960530292870);
3988     const SimdDouble  FD0(1.0);
3989
3990     SimdDouble        z4;
3991     SimdDouble        polyFN0, polyFN1, polyFD0, polyFD1;
3992
3993     z4             = z2 * z2;
3994
3995     polyFD0        = fma(FD4, z4, FD2);
3996     polyFD1        = fma(FD3, z4, FD1);
3997     polyFD0        = fma(polyFD0, z4, FD0);
3998     polyFD0        = fma(polyFD1, z2, polyFD0);
3999
4000     polyFD0        = invSingleAccuracy(polyFD0);
4001
4002     polyFN0        = fma(FN6, z4, FN4);
4003     polyFN1        = fma(FN5, z4, FN3);
4004     polyFN0        = fma(polyFN0, z4, FN2);
4005     polyFN1        = fma(polyFN1, z4, FN1);
4006     polyFN0        = fma(polyFN0, z4, FN0);
4007     polyFN0        = fma(polyFN1, z2, polyFN0);
4008
4009     return polyFN0 * polyFD0;
4010 }
4011
4012
4013
4014 /*! \brief Analytical PME potential correction, double SIMD data, single accuracy.
4015  *
4016  * \param z2 \f$(r \beta)^2\f$ - see below for details.
4017  * \result Correction factor to coulomb potential - see below for details.
4018  *
4019  * This routine calculates \f$\mbox{erf}(z)/z\f$, although you should provide \f$z^2\f$
4020  * as the input argument.
4021  *
4022  * Here's how it should be used:
4023  *
4024  * 1. Calculate \f$r^2\f$.
4025  * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=\beta^2*r^2\f$.
4026  * 3. Evaluate this routine with z^2 as the argument.
4027  * 4. The return value is the expression:
4028  *
4029  *  \f[
4030  *   \frac{\mbox{erf}(z)}{z}
4031  *  \f]
4032  *
4033  * 5. Multiply the entire expression by beta and switching back to \f$r\f$ (since \f$z=r \beta\f$):
4034  *
4035  *  \f[
4036  *    \frac{\mbox{erf}(r \beta)}{r}
4037  *  \f]
4038  *
4039  * 6. Subtract the result from \f$1/r\f$, multiply by the product of the charges,
4040  *    and you have your potential.
4041  *
4042  * This approximation achieves an accuracy slightly lower than 1e-6; when
4043  * added to \f$1/r\f$ the error will be insignificant.
4044  */
4045 static inline SimdDouble gmx_simdcall
4046 pmePotentialCorrectionSingleAccuracy(SimdDouble z2)
4047 {
4048     const SimdDouble  VN6(1.9296833005951166339e-8);
4049     const SimdDouble  VN5(-1.4213390571557850962e-6);
4050     const SimdDouble  VN4(0.000041603292906656984871);
4051     const SimdDouble  VN3(-0.00013134036773265025626);
4052     const SimdDouble  VN2(0.038657983986041781264);
4053     const SimdDouble  VN1(0.11285044772717598220);
4054     const SimdDouble  VN0(1.1283802385263030286);
4055
4056     const SimdDouble  VD3(0.0066752224023576045451);
4057     const SimdDouble  VD2(0.078647795836373922256);
4058     const SimdDouble  VD1(0.43336185284710920150);
4059     const SimdDouble  VD0(1.0);
4060
4061     SimdDouble        z4;
4062     SimdDouble        polyVN0, polyVN1, polyVD0, polyVD1;
4063
4064     z4             = z2 * z2;
4065
4066     polyVD1        = fma(VD3, z4, VD1);
4067     polyVD0        = fma(VD2, z4, VD0);
4068     polyVD0        = fma(polyVD1, z2, polyVD0);
4069
4070     polyVD0        = invSingleAccuracy(polyVD0);
4071
4072     polyVN0        = fma(VN6, z4, VN4);
4073     polyVN1        = fma(VN5, z4, VN3);
4074     polyVN0        = fma(polyVN0, z4, VN2);
4075     polyVN1        = fma(polyVN1, z4, VN1);
4076     polyVN0        = fma(polyVN0, z4, VN0);
4077     polyVN0        = fma(polyVN1, z2, polyVN0);
4078
4079     return polyVN0 * polyVD0;
4080 }
4081
4082 #endif
4083
4084
4085 /*! \name SIMD4 math functions
4086  *
4087  * \note Only a subset of the math functions are implemented for SIMD4.
4088  *  \{
4089  */
4090
4091
4092 #if GMX_SIMD4_HAVE_FLOAT
4093
4094 /*************************************************************************
4095  * SINGLE PRECISION SIMD4 MATH FUNCTIONS - JUST A SMALL SUBSET SUPPORTED *
4096  *************************************************************************/
4097
4098 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD4 float.
4099  *
4100  * This is a low-level routine that should only be used by SIMD math routine
4101  * that evaluates the inverse square root.
4102  *
4103  *  \param lu Approximation of 1/sqrt(x), typically obtained from lookup.
4104  *  \param x  The reference (starting) value x for which we want 1/sqrt(x).
4105  *  \return   An improved approximation with roughly twice as many bits of accuracy.
4106  */
4107 static inline Simd4Float gmx_simdcall
4108 rsqrtIter(Simd4Float lu, Simd4Float x)
4109 {
4110     Simd4Float tmp1 = x*lu;
4111     Simd4Float tmp2 = Simd4Float(-0.5F)*lu;
4112     tmp1 = fma(tmp1, lu, Simd4Float(-3.0F));
4113     return tmp1*tmp2;
4114 }
4115
4116 /*! \brief Calculate 1/sqrt(x) for SIMD4 float.
4117  *
4118  *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
4119  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
4120  *           For the single precision implementation this is obviously always
4121  *           true for positive values, but for double precision it adds an
4122  *           extra restriction since the first lookup step might have to be
4123  *           performed in single precision on some architectures. Note that the
4124  *           responsibility for checking falls on you - this routine does not
4125  *           check arguments.
4126  *  \return  1/sqrt(x). Result is undefined if your argument was invalid.
4127  */
4128 static inline Simd4Float gmx_simdcall
4129 invsqrt(Simd4Float x)
4130 {
4131     Simd4Float lu = rsqrt(x);
4132 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
4133     lu = rsqrtIter(lu, x);
4134 #endif
4135 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
4136     lu = rsqrtIter(lu, x);
4137 #endif
4138 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
4139     lu = rsqrtIter(lu, x);
4140 #endif
4141     return lu;
4142 }
4143
4144
4145 #endif // GMX_SIMD4_HAVE_FLOAT
4146
4147
4148
4149 #if GMX_SIMD4_HAVE_DOUBLE
4150 /*************************************************************************
4151  * DOUBLE PRECISION SIMD4 MATH FUNCTIONS - JUST A SMALL SUBSET SUPPORTED *
4152  *************************************************************************/
4153
4154 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD4 double.
4155  *
4156  * This is a low-level routine that should only be used by SIMD math routine
4157  * that evaluates the inverse square root.
4158  *
4159  *  \param lu Approximation of 1/sqrt(x), typically obtained from lookup.
4160  *  \param x  The reference (starting) value x for which we want 1/sqrt(x).
4161  *  \return   An improved approximation with roughly twice as many bits of accuracy.
4162  */
4163 static inline Simd4Double gmx_simdcall
4164 rsqrtIter(Simd4Double lu, Simd4Double x)
4165 {
4166     Simd4Double tmp1 = x*lu;
4167     Simd4Double tmp2 = Simd4Double(-0.5F)*lu;
4168     tmp1             = fma(tmp1, lu, Simd4Double(-3.0F));
4169     return tmp1*tmp2;
4170 }
4171
4172 /*! \brief Calculate 1/sqrt(x) for SIMD4 double.
4173  *
4174  *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
4175  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
4176  *           For the single precision implementation this is obviously always
4177  *           true for positive values, but for double precision it adds an
4178  *           extra restriction since the first lookup step might have to be
4179  *           performed in single precision on some architectures. Note that the
4180  *           responsibility for checking falls on you - this routine does not
4181  *           check arguments.
4182  *  \return  1/sqrt(x). Result is undefined if your argument was invalid.
4183  */
4184 static inline Simd4Double gmx_simdcall
4185 invsqrt(Simd4Double x)
4186 {
4187     Simd4Double lu = rsqrt(x);
4188 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
4189     lu = rsqrtIter(lu, x);
4190 #endif
4191 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
4192     lu = rsqrtIter(lu, x);
4193 #endif
4194 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
4195     lu = rsqrtIter(lu, x);
4196 #endif
4197 #if (GMX_SIMD_RSQRT_BITS*8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
4198     lu = rsqrtIter(lu, x);
4199 #endif
4200     return lu;
4201 }
4202
4203
4204 /**********************************************************************
4205  * SIMD4 MATH FUNCTIONS WITH DOUBLE PREC. DATA, SINGLE PREC. ACCURACY *
4206  **********************************************************************/
4207
4208 /*! \brief Calculate 1/sqrt(x) for SIMD4 double, but in single accuracy.
4209  *
4210  *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
4211  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
4212  *           For the single precision implementation this is obviously always
4213  *           true for positive values, but for double precision it adds an
4214  *           extra restriction since the first lookup step might have to be
4215  *           performed in single precision on some architectures. Note that the
4216  *           responsibility for checking falls on you - this routine does not
4217  *           check arguments.
4218  *  \return  1/sqrt(x). Result is undefined if your argument was invalid.
4219  */
4220 static inline Simd4Double gmx_simdcall
4221 invsqrtSingleAccuracy(Simd4Double x)
4222 {
4223     Simd4Double lu = rsqrt(x);
4224 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
4225     lu = rsqrtIter(lu, x);
4226 #endif
4227 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
4228     lu = rsqrtIter(lu, x);
4229 #endif
4230 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
4231     lu = rsqrtIter(lu, x);
4232 #endif
4233     return lu;
4234 }
4235
4236
4237
4238 #endif // GMX_SIMD4_HAVE_DOUBLE
4239
4240 /*! \} */
4241
4242 #if GMX_SIMD_HAVE_FLOAT
4243 /*! \brief Calculate 1/sqrt(x) for SIMD float, only targeting single accuracy.
4244  *
4245  *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
4246  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
4247  *           For the single precision implementation this is obviously always
4248  *           true for positive values, but for double precision it adds an
4249  *           extra restriction since the first lookup step might have to be
4250  *           performed in single precision on some architectures. Note that the
4251  *           responsibility for checking falls on you - this routine does not
4252  *           check arguments.
4253  *  \return  1/sqrt(x). Result is undefined if your argument was invalid.
4254  */
4255 static inline SimdFloat gmx_simdcall
4256 invsqrtSingleAccuracy(SimdFloat x)
4257 {
4258     return invsqrt(x);
4259 }
4260
4261 /*! \brief Calculate 1/sqrt(x) for masked SIMD floats, only targeting single accuracy.
4262  *
4263  *  This routine only evaluates 1/sqrt(x) for elements for which mask is true.
4264  *  Illegal values in the masked-out elements will not lead to
4265  *  floating-point exceptions.
4266  *
4267  *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
4268  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
4269  *           For the single precision implementation this is obviously always
4270  *           true for positive values, but for double precision it adds an
4271  *           extra restriction since the first lookup step might have to be
4272  *           performed in single precision on some architectures. Note that the
4273  *           responsibility for checking falls on you - this routine does not
4274  *           check arguments.
4275  *  \param m Mask
4276  *  \return  1/sqrt(x). Result is undefined if your argument was invalid or
4277  *           entry was not masked, and 0.0 for masked-out entries.
4278  */
4279 static inline SimdFloat
4280 maskzInvsqrtSingleAccuracy(SimdFloat x, SimdFBool m)
4281 {
4282     return maskzInvsqrt(x, m);
4283 }
4284
4285 /*! \brief Calculate 1/sqrt(x) for two SIMD floats, only targeting single accuracy.
4286  *
4287  * \param x0  First set of arguments, x0 must be in single range (see below).
4288  * \param x1  Second set of arguments, x1 must be in single range (see below).
4289  * \param[out] out0  Result 1/sqrt(x0)
4290  * \param[out] out1  Result 1/sqrt(x1)
4291  *
4292  *  In particular for double precision we can sometimes calculate square root
4293  *  pairs slightly faster by using single precision until the very last step.
4294  *
4295  * \note Both arguments must be larger than GMX_FLOAT_MIN and smaller than
4296  *       GMX_FLOAT_MAX, i.e. within the range of single precision.
4297  *       For the single precision implementation this is obviously always
4298  *       true for positive values, but for double precision it adds an
4299  *       extra restriction since the first lookup step might have to be
4300  *       performed in single precision on some architectures. Note that the
4301  *       responsibility for checking falls on you - this routine does not
4302  *       check arguments.
4303  */
4304 static inline void gmx_simdcall
4305 invsqrtPairSingleAccuracy(SimdFloat x0,    SimdFloat x1,
4306                           SimdFloat *out0, SimdFloat *out1)
4307 {
4308     return invsqrtPair(x0, x1, out0, out1);
4309 }
4310
4311 /*! \brief Calculate 1/x for SIMD float, only targeting single accuracy.
4312  *
4313  *  \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
4314  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
4315  *           For the single precision implementation this is obviously always
4316  *           true for positive values, but for double precision it adds an
4317  *           extra restriction since the first lookup step might have to be
4318  *           performed in single precision on some architectures. Note that the
4319  *           responsibility for checking falls on you - this routine does not
4320  *           check arguments.
4321  *  \return  1/x. Result is undefined if your argument was invalid.
4322  */
4323 static inline SimdFloat gmx_simdcall
4324 invSingleAccuracy(SimdFloat x)
4325 {
4326     return inv(x);
4327 }
4328
4329
4330 /*! \brief Calculate 1/x for masked SIMD floats, only targeting single accuracy.
4331  *
4332  *  \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
4333  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
4334  *           For the single precision implementation this is obviously always
4335  *           true for positive values, but for double precision it adds an
4336  *           extra restriction since the first lookup step might have to be
4337  *           performed in single precision on some architectures. Note that the
4338  *           responsibility for checking falls on you - this routine does not
4339  *           check arguments.
4340  *  \param m Mask
4341  *  \return  1/x for elements where m is true, or 0.0 for masked-out entries.
4342  */
4343 static inline SimdFloat
4344 maskzInvSingleAccuracy(SimdFloat x, SimdFBool m)
4345 {
4346     return maskzInv(x, m);
4347 }
4348
4349 /*! \brief Calculate sqrt(x) for SIMD float, always targeting single accuracy.
4350  *
4351  * \copydetails sqrt(SimdFloat)
4352  */
4353 template <MathOptimization opt = MathOptimization::Safe>
4354 static inline SimdFloat gmx_simdcall
4355 sqrtSingleAccuracy(SimdFloat x)
4356 {
4357     return sqrt<opt>(x);
4358 }
4359
4360 /*! \brief SIMD float log(x), only targeting single accuracy. This is the natural logarithm.
4361  *
4362  * \param x Argument, should be >0.
4363  * \result The natural logarithm of x. Undefined if argument is invalid.
4364  */
4365 static inline SimdFloat gmx_simdcall
4366 logSingleAccuracy(SimdFloat x)
4367 {
4368     return log(x);
4369 }
4370
4371 /*! \brief SIMD float 2^x, only targeting single accuracy.
4372  *
4373  * \copydetails exp2(SimdFloat)
4374  */
4375 template <MathOptimization opt = MathOptimization::Safe>
4376 static inline SimdFloat gmx_simdcall
4377 exp2SingleAccuracy(SimdFloat x)
4378 {
4379     return exp2<opt>(x);
4380 }
4381
4382 /*! \brief SIMD float e^x, only targeting single accuracy.
4383  *
4384  * \copydetails exp(SimdFloat)
4385  */
4386 template <MathOptimization opt = MathOptimization::Safe>
4387 static inline SimdFloat gmx_simdcall
4388 expSingleAccuracy(SimdFloat x)
4389 {
4390     return exp<opt>(x);
4391 }
4392
4393
4394 /*! \brief SIMD float erf(x), only targeting single accuracy.
4395  *
4396  * \param x The value to calculate erf(x) for.
4397  * \result erf(x)
4398  *
4399  * This routine achieves very close to single precision, but we do not care about
4400  * the last bit or the subnormal result range.
4401  */
4402 static inline SimdFloat gmx_simdcall
4403 erfSingleAccuracy(SimdFloat x)
4404 {
4405     return erf(x);
4406 }
4407
4408 /*! \brief SIMD float erfc(x), only targeting single accuracy.
4409  *
4410  * \param x The value to calculate erfc(x) for.
4411  * \result erfc(x)
4412  *
4413  * This routine achieves singleprecision (bar the last bit) over most of the
4414  * input range, but for large arguments where the result is getting close
4415  * to the minimum representable numbers we accept slightly larger errors
4416  * (think results that are in the ballpark of 10^-30) since that is not
4417  * relevant for MD.
4418  */
4419 static inline SimdFloat gmx_simdcall
4420 erfcSingleAccuracy(SimdFloat x)
4421 {
4422     return erfc(x);
4423 }
4424
4425 /*! \brief SIMD float sin \& cos, only targeting single accuracy.
4426  *
4427  * \param x The argument to evaluate sin/cos for
4428  * \param[out] sinval Sin(x)
4429  * \param[out] cosval Cos(x)
4430  */
4431 static inline void gmx_simdcall
4432 sinCosSingleAccuracy(SimdFloat x, SimdFloat *sinval, SimdFloat *cosval)
4433 {
4434     sincos(x, sinval, cosval);
4435 }
4436
4437 /*! \brief SIMD float sin(x), only targeting single accuracy.
4438  *
4439  * \param x The argument to evaluate sin for
4440  * \result Sin(x)
4441  *
4442  * \attention Do NOT call both sin & cos if you need both results, since each of them
4443  * will then call \ref sincos and waste a factor 2 in performance.
4444  */
4445 static inline SimdFloat gmx_simdcall
4446 sinSingleAccuracy(SimdFloat x)
4447 {
4448     return sin(x);
4449 }
4450
4451 /*! \brief SIMD float cos(x), only targeting single accuracy.
4452  *
4453  * \param x The argument to evaluate cos for
4454  * \result Cos(x)
4455  *
4456  * \attention Do NOT call both sin & cos if you need both results, since each of them
4457  * will then call \ref sincos and waste a factor 2 in performance.
4458  */
4459 static inline SimdFloat gmx_simdcall
4460 cosSingleAccuracy(SimdFloat x)
4461 {
4462     return cos(x);
4463 }
4464
4465 /*! \brief SIMD float tan(x), only targeting single accuracy.
4466  *
4467  * \param x The argument to evaluate tan for
4468  * \result Tan(x)
4469  */
4470 static inline SimdFloat gmx_simdcall
4471 tanSingleAccuracy(SimdFloat x)
4472 {
4473     return tan(x);
4474 }
4475
4476 /*! \brief SIMD float asin(x), only targeting single accuracy.
4477  *
4478  * \param x The argument to evaluate asin for
4479  * \result Asin(x)
4480  */
4481 static inline SimdFloat gmx_simdcall
4482 asinSingleAccuracy(SimdFloat x)
4483 {
4484     return asin(x);
4485 }
4486
4487 /*! \brief SIMD float acos(x), only targeting single accuracy.
4488  *
4489  * \param x The argument to evaluate acos for
4490  * \result Acos(x)
4491  */
4492 static inline SimdFloat gmx_simdcall
4493 acosSingleAccuracy(SimdFloat x)
4494 {
4495     return acos(x);
4496 }
4497
4498 /*! \brief SIMD float atan(x), only targeting single accuracy.
4499  *
4500  * \param x The argument to evaluate atan for
4501  * \result Atan(x), same argument/value range as standard math library.
4502  */
4503 static inline SimdFloat gmx_simdcall
4504 atanSingleAccuracy(SimdFloat x)
4505 {
4506     return atan(x);
4507 }
4508
4509 /*! \brief SIMD float atan2(y,x), only targeting single accuracy.
4510  *
4511  * \param y Y component of vector, any quartile
4512  * \param x X component of vector, any quartile
4513  * \result Atan(y,x), same argument/value range as standard math library.
4514  *
4515  * \note This routine should provide correct results for all finite
4516  * non-zero or positive-zero arguments. However, negative zero arguments will
4517  * be treated as positive zero, which means the return value will deviate from
4518  * the standard math library atan2(y,x) for those cases. That should not be
4519  * of any concern in Gromacs, and in particular it will not affect calculations
4520  * of angles from vectors.
4521  */
4522 static inline SimdFloat gmx_simdcall
4523 atan2SingleAccuracy(SimdFloat y, SimdFloat x)
4524 {
4525     return atan2(y, x);
4526 }
4527
4528 /*! \brief SIMD Analytic PME force correction, only targeting single accuracy.
4529  *
4530  * \param z2 \f$(r \beta)^2\f$ - see default single precision version for details.
4531  * \result Correction factor to coulomb force.
4532  */
4533 static inline SimdFloat gmx_simdcall
4534 pmeForceCorrectionSingleAccuracy(SimdFloat z2)
4535 {
4536     return pmeForceCorrection(z2);
4537 }
4538
4539 /*! \brief SIMD Analytic PME potential correction, only targeting single accuracy.
4540  *
4541  * \param z2 \f$(r \beta)^2\f$ - see default single precision version for details.
4542  * \result Correction factor to coulomb force.
4543  */
4544 static inline SimdFloat gmx_simdcall
4545 pmePotentialCorrectionSingleAccuracy(SimdFloat z2)
4546 {
4547     return pmePotentialCorrection(z2);
4548 }
4549 #endif // GMX_SIMD_HAVE_FLOAT
4550
4551 #if GMX_SIMD4_HAVE_FLOAT
4552 /*! \brief Calculate 1/sqrt(x) for SIMD4 float, only targeting single accuracy.
4553  *
4554  *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
4555  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
4556  *           For the single precision implementation this is obviously always
4557  *           true for positive values, but for double precision it adds an
4558  *           extra restriction since the first lookup step might have to be
4559  *           performed in single precision on some architectures. Note that the
4560  *           responsibility for checking falls on you - this routine does not
4561  *           check arguments.
4562  *  \return 1/sqrt(x). Result is undefined if your argument was invalid.
4563  */
4564 static inline Simd4Float gmx_simdcall
4565 invsqrtSingleAccuracy(Simd4Float x)
4566 {
4567     return invsqrt(x);
4568 }
4569 #endif // GMX_SIMD4_HAVE_FLOAT
4570
4571 /*! \}   end of addtogroup module_simd */
4572 /*! \endcond  end of condition libabl */
4573
4574 #endif // GMX_SIMD
4575
4576 }      // namespace gmx
4577
4578 #endif // GMX_SIMD_SIMD_MATH_H