src/gromacs/simd/simd_math.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 #ifndef GMX_SIMD_SIMD_MATH_H_
  36 #define GMX_SIMD_SIMD_MATH_H_
  37
  38 /*! \libinternal \file
  39  *
  40  * \brief Math functions for SIMD datatypes.
  41  *
  42  * \attention This file is generic for all SIMD architectures, so you cannot
  43  * assume that any of the optional SIMD features (as defined in simd.h) are
  44  * present. In particular, this means you cannot assume support for integers,
  45  * logical operations (neither on floating-point nor integer values), shifts,
  46  * and the architecture might only have SIMD for either float or double.
  47  * Second, to keep this file clean and general, any additions to this file
  48  * must work for all possible SIMD architectures in both single and double
  49  * precision (if they support it), and you cannot make any assumptions about
  50  * SIMD width.
  51  *
  52  * \author Erik Lindahl <erik.lindahl@scilifelab.se>
  53  *
  54  * \inlibraryapi
  55  * \ingroup module_simd
  56  */
  57
  58 #include "config.h"
  59
  60 #include <math.h>
  61
  62 #include "gromacs/math/utilities.h"
  63 #include "gromacs/simd/simd.h"
  64 #include "gromacs/utility/real.h"
  65
  66 /*! \cond libapi */
  67 /*! \addtogroup module_simd */
  68 /*! \{ */
  69
  70 /*! \name Implementation accuracy settings
  71  *  \{
  72  */
  73
  74 /*! \} */
  75
  76 #ifdef GMX_SIMD_HAVE_FLOAT
  77
  78 /*! \name Single precision SIMD math functions
  79  *
  80  *  \note In most cases you should use the real-precision functions instead.
  81  *  \{
  82  */
  83
  84 /****************************************
  85  * SINGLE PRECISION SIMD MATH FUNCTIONS *
  86  ****************************************/
  87
  88 /*! \brief SIMD float utility to sum a+b+c+d.
  89  *
  90  * You should normally call the real-precision routine \ref gmx_simd_sum4_r.
  91  *
  92  * \param a term 1 (multiple values)
  93  * \param b term 2 (multiple values)
  94  * \param c term 3 (multiple values)
  95  * \param d term 4 (multiple values)
  96  * \return sum of terms 1-4 (multiple values)
  97  */
  98 static gmx_inline gmx_simd_float_t gmx_simdcall
  99 gmx_simd_sum4_f(gmx_simd_float_t a, gmx_simd_float_t b,
 100                 gmx_simd_float_t c, gmx_simd_float_t d)
 101 {
 102     return gmx_simd_add_f(gmx_simd_add_f(a, b), gmx_simd_add_f(c, d));
 103 }
 104
 105 /*! \brief Return -a if b is negative, SIMD float.
 106  *
 107  * You should normally call the real-precision routine \ref gmx_simd_xor_sign_r.
 108  *
 109  * \param a Values to set sign for
 110  * \param b Values used to set sign
 111  * \return if b is negative, the sign of a will be changed.
 112  *
 113  * This is equivalent to doing an xor operation on a with the sign bit of b,
 114  * with the exception that negative zero is not considered to be negative
 115  * on architectures where \ref GMX_SIMD_HAVE_LOGICAL is not set.
 116  */
 117 static gmx_inline gmx_simd_float_t gmx_simdcall
 118 gmx_simd_xor_sign_f(gmx_simd_float_t a, gmx_simd_float_t b)
 119 {
 120 #ifdef GMX_SIMD_HAVE_LOGICAL
 121     return gmx_simd_xor_f(a, gmx_simd_and_f(gmx_simd_set1_f(GMX_FLOAT_NEGZERO), b));
 122 #else
 123     return gmx_simd_blendv_f(a, gmx_simd_fneg_f(a), gmx_simd_cmplt_f(b, gmx_simd_setzero_f()));
 124 #endif
 125 }
 126
 127 #ifndef gmx_simd_rsqrt_iter_f
 128 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD float.
 129  *
 130  * This is a low-level routine that should only be used by SIMD math routine
 131  * that evaluates the inverse square root.
 132  *
 133  *  \param lu Approximation of 1/sqrt(x), typically obtained from lookup.
 134  *  \param x  The reference (starting) value x for which we want 1/sqrt(x).
 135  *  \return   An improved approximation with roughly twice as many bits of accuracy.
 136  */
 137 static gmx_inline gmx_simd_float_t gmx_simdcall
 138 gmx_simd_rsqrt_iter_f(gmx_simd_float_t lu, gmx_simd_float_t x)
 139 {
 140 #    ifdef GMX_SIMD_HAVE_FMA
 141     return gmx_simd_fmadd_f(gmx_simd_fnmadd_f(x, gmx_simd_mul_f(lu, lu), gmx_simd_set1_f(1.0f)), gmx_simd_mul_f(lu, gmx_simd_set1_f(0.5f)), lu);
 142 #    else
 143     return gmx_simd_mul_f(gmx_simd_set1_f(0.5f), gmx_simd_mul_f(gmx_simd_sub_f(gmx_simd_set1_f(3.0f), gmx_simd_mul_f(gmx_simd_mul_f(lu, lu), x)), lu));
 144 #    endif
 145 }
 146 #endif
 147
 148 /*! \brief Calculate 1/sqrt(x) for SIMD float.
 149  *
 150  * You should normally call the real-precision routine \ref gmx_simd_invsqrt_r.
 151  *
 152  *  \param x Argument that must be >0. This routine does not check arguments.
 153  *  \return 1/sqrt(x). Result is undefined if your argument was invalid.
 154  */
 155 static gmx_inline gmx_simd_float_t gmx_simdcall
 156 gmx_simd_invsqrt_f(gmx_simd_float_t x)
 157 {
 158     gmx_simd_float_t lu = gmx_simd_rsqrt_f(x);
 159 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
 160     lu = gmx_simd_rsqrt_iter_f(lu, x);
 161 #endif
 162 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
 163     lu = gmx_simd_rsqrt_iter_f(lu, x);
 164 #endif
 165 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
 166     lu = gmx_simd_rsqrt_iter_f(lu, x);
 167 #endif
 168     return lu;
 169 }
 170
 171 /*! \brief Calculate 1/sqrt(x) for masked entries of SIMD float.
 172  *
 173  * Identical to gmx_simd_invsqrt_f but avoids fp-exception for non-masked entries.
 174  * The result for the non-masked entries is undefined and the user has to use blend
 175  * with the same mask to obtain a defined result.
 176  *
 177  *  \param x Argument that must be >0 for masked entries
 178  *  \param m Masked entries
 179  *  \return 1/sqrt(x). Result is undefined if your argument was invalid or entry was not masked.
 180  */
 181 static gmx_inline gmx_simd_float_t
 182 gmx_simd_invsqrt_maskfpe_f(gmx_simd_float_t x, gmx_simd_fbool_t gmx_unused m)
 183 {
 184 #ifdef NDEBUG
 185     return gmx_simd_invsqrt_f(x);
 186 #else
 187     return gmx_simd_invsqrt_f(gmx_simd_blendv_f(gmx_simd_set1_f(1.0f), x, m));
 188 #endif
 189 }
 190
 191 /*! \brief Calculate 1/sqrt(x) for non-masked entries of SIMD float.
 192  *
 193  * Identical to gmx_simd_invsqrt_f but avoids fp-exception for masked entries.
 194  * The result for the non-masked entries is undefined and the user has to use blend
 195  * with the same mask to obtain a defined result.
 196  *
 197  *  \param x Argument that must be >0 for non-masked entries
 198  *  \param m Masked entries
 199  *  \return 1/sqrt(x). Result is undefined if your argument was invalid or entry was masked.
 200  */
 201 static gmx_inline gmx_simd_float_t
 202 gmx_simd_invsqrt_notmaskfpe_f(gmx_simd_float_t x, gmx_simd_fbool_t gmx_unused m)
 203 {
 204 #ifdef NDEBUG
 205     return gmx_simd_invsqrt_f(x);
 206 #else
 207     return gmx_simd_invsqrt_f(gmx_simd_blendv_f(x, gmx_simd_set1_f(1.0f), m));
 208 #endif
 209 }
 210
 211 /*! \brief Calculate 1/sqrt(x) for two SIMD floats.
 212  *
 213  * You should normally call the real-precision routine \ref gmx_simd_invsqrt_pair_r.
 214  *
 215  * \param x0  First set of arguments, x0 must be positive - no argument checking.
 216  * \param x1  Second set of arguments, x1 must be positive - no argument checking.
 217  * \param[out] out0  Result 1/sqrt(x0)
 218  * \param[out] out1  Result 1/sqrt(x1)
 219  *
 220  *  In particular for double precision we can sometimes calculate square root
 221  *  pairs slightly faster by using single precision until the very last step.
 222  */
 223 static gmx_inline void gmx_simdcall
 224 gmx_simd_invsqrt_pair_f(gmx_simd_float_t x0,    gmx_simd_float_t x1,
 225                         gmx_simd_float_t *out0, gmx_simd_float_t *out1)
 226 {
 227     *out0 = gmx_simd_invsqrt_f(x0);
 228     *out1 = gmx_simd_invsqrt_f(x1);
 229 }
 230
 231 #ifndef gmx_simd_rcp_iter_f
 232 /*! \brief Perform one Newton-Raphson iteration to improve 1/x for SIMD float.
 233  *
 234  * This is a low-level routine that should only be used by SIMD math routine
 235  * that evaluates the reciprocal.
 236  *
 237  *  \param lu Approximation of 1/x, typically obtained from lookup.
 238  *  \param x  The reference (starting) value x for which we want 1/x.
 239  *  \return   An improved approximation with roughly twice as many bits of accuracy.
 240  */
 241 static gmx_inline gmx_simd_float_t gmx_simdcall
 242 gmx_simd_rcp_iter_f(gmx_simd_float_t lu, gmx_simd_float_t x)
 243 {
 244     return gmx_simd_mul_f(lu, gmx_simd_fnmadd_f(lu, x, gmx_simd_set1_f(2.0f)));
 245 }
 246 #endif
 247
 248 /*! \brief Calculate 1/x for SIMD float.
 249  *
 250  * You should normally call the real-precision routine \ref gmx_simd_inv_r.
 251  *
 252  *  \param x Argument that must be nonzero. This routine does not check arguments.
 253  *  \return 1/x. Result is undefined if your argument was invalid.
 254  */
 255 static gmx_inline gmx_simd_float_t gmx_simdcall
 256 gmx_simd_inv_f(gmx_simd_float_t x)
 257 {
 258     gmx_simd_float_t lu = gmx_simd_rcp_f(x);
 259 #if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
 260     lu = gmx_simd_rcp_iter_f(lu, x);
 261 #endif
 262 #if (GMX_SIMD_RCP_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
 263     lu = gmx_simd_rcp_iter_f(lu, x);
 264 #endif
 265 #if (GMX_SIMD_RCP_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
 266     lu = gmx_simd_rcp_iter_f(lu, x);
 267 #endif
 268     return lu;
 269 }
 270
 271 /*! \brief Calculate 1/x for masked entries of SIMD float.
 272  *
 273  * Identical to gmx_simd_inv_f but avoids fp-exception for non-masked entries.
 274  * The result for the non-masked entries is undefined and the user has to use blend
 275  * with the same mask to obtain a defined result.
 276  *
 277  *  \param x Argument that must be nonzero for masked entries
 278  *  \param m Masked entries
 279  *  \return 1/x. Result is undefined if your argument was invalid or entry was not masked.
 280  */
 281 static gmx_inline gmx_simd_float_t
 282 gmx_simd_inv_maskfpe_f(gmx_simd_float_t x, gmx_simd_fbool_t gmx_unused m)
 283 {
 284 #ifdef NDEBUG
 285     return gmx_simd_inv_f(x);
 286 #else
 287     return gmx_simd_inv_f(gmx_simd_blendv_f(gmx_simd_set1_f(1.0f), x, m));
 288 #endif
 289 }
 290
 291
 292 /*! \brief Calculate 1/x for non-masked entries of SIMD float.
 293  *
 294  * Identical to gmx_simd_inv_f but avoids fp-exception for masked entries.
 295  * The result for the non-masked entries is undefined and the user has to use blend
 296  * with the same mask to obtain a defined result.
 297  *
 298  *  \param x Argument that must be nonzero for non-masked entries
 299  *  \param m Masked entries
 300  *  \return 1/x. Result is undefined if your argument was invalid or entry was masked.
 301  */
 302 static gmx_inline gmx_simd_float_t
 303 gmx_simd_inv_notmaskfpe_f(gmx_simd_float_t x, gmx_simd_fbool_t gmx_unused m)
 304 {
 305 #ifdef NDEBUG
 306     return gmx_simd_inv_f(x);
 307 #else
 308     return gmx_simd_inv_f(gmx_simd_blendv_f(x, gmx_simd_set1_f(1.0f), m));
 309 #endif
 310 }
 311
 312 /*! \brief Calculate sqrt(x) correctly for SIMD floats, including argument 0.0.
 313  *
 314  * You should normally call the real-precision routine \ref gmx_simd_sqrt_r.
 315  *
 316  *  \param x Argument that must be >=0.
 317  *  \return sqrt(x). If x=0, the result will correctly be set to 0.
 318  *          The result is undefined if the input value is negative.
 319  */
 320 static gmx_inline gmx_simd_float_t gmx_simdcall
 321 gmx_simd_sqrt_f(gmx_simd_float_t x)
 322 {
 323     gmx_simd_fbool_t  mask;
 324     gmx_simd_float_t  res;
 325
 326     mask = gmx_simd_cmpeq_f(x, gmx_simd_setzero_f());
 327     res  = gmx_simd_blendnotzero_f(gmx_simd_invsqrt_notmaskfpe_f(x, mask), mask);
 328     return gmx_simd_mul_f(res, x);
 329 }
 330
 331 /*! \brief SIMD float log(x). This is the natural logarithm.
 332  *
 333  * You should normally call the real-precision routine \ref gmx_simd_log_r.
 334  *
 335  * \param x Argument, should be >0.
 336  * \result The natural logarithm of x. Undefined if argument is invalid.
 337  */
 338 #ifndef gmx_simd_log_f
 339 static gmx_inline gmx_simd_float_t gmx_simdcall
 340 gmx_simd_log_f(gmx_simd_float_t x)
 341 {
 342     const gmx_simd_float_t  half       = gmx_simd_set1_f(0.5f);
 343     const gmx_simd_float_t  one        = gmx_simd_set1_f(1.0f);
 344     const gmx_simd_float_t  sqrt2      = gmx_simd_set1_f(sqrt(2.0f));
 345     const gmx_simd_float_t  corr       = gmx_simd_set1_f(0.693147180559945286226764f);
 346     const gmx_simd_float_t  CL9        = gmx_simd_set1_f(0.2371599674224853515625f);
 347     const gmx_simd_float_t  CL7        = gmx_simd_set1_f(0.285279005765914916992188f);
 348     const gmx_simd_float_t  CL5        = gmx_simd_set1_f(0.400005519390106201171875f);
 349     const gmx_simd_float_t  CL3        = gmx_simd_set1_f(0.666666567325592041015625f);
 350     const gmx_simd_float_t  CL1        = gmx_simd_set1_f(2.0f);
 351     gmx_simd_float_t        fexp, x2, p;
 352     gmx_simd_fbool_t        mask;
 353
 354     fexp  = gmx_simd_get_exponent_f(x);
 355     x     = gmx_simd_get_mantissa_f(x);
 356
 357     mask  = gmx_simd_cmplt_f(sqrt2, x);
 358     /* Adjust to non-IEEE format for x>sqrt(2): exponent += 1, mantissa *= 0.5 */
 359     fexp  = gmx_simd_add_f(fexp, gmx_simd_blendzero_f(one, mask));
 360     x     = gmx_simd_mul_f(x, gmx_simd_blendv_f(one, half, mask));
 361
 362     x     = gmx_simd_mul_f( gmx_simd_sub_f(x, one), gmx_simd_inv_f( gmx_simd_add_f(x, one) ) );
 363     x2    = gmx_simd_mul_f(x, x);
 364
 365     p     = gmx_simd_fmadd_f(CL9, x2, CL7);
 366     p     = gmx_simd_fmadd_f(p, x2, CL5);
 367     p     = gmx_simd_fmadd_f(p, x2, CL3);
 368     p     = gmx_simd_fmadd_f(p, x2, CL1);
 369     p     = gmx_simd_fmadd_f(p, x, gmx_simd_mul_f(corr, fexp));
 370
 371     return p;
 372 }
 373 #endif
 374
 375 #ifndef gmx_simd_exp2_f
 376 /*! \brief SIMD float 2^x.
 377  *
 378  * You should normally call the real-precision routine \ref gmx_simd_exp2_r.
 379  *
 380  * \param x Argument.
 381  * \result 2^x. Undefined if input argument caused overflow.
 382  */
 383 static gmx_inline gmx_simd_float_t gmx_simdcall
 384 gmx_simd_exp2_f(gmx_simd_float_t x)
 385 {
 386     /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
 387     const gmx_simd_float_t  arglimit = gmx_simd_set1_f(126.0f);
 388     const gmx_simd_float_t  CC6      = gmx_simd_set1_f(0.0001534581200287996416911311);
 389     const gmx_simd_float_t  CC5      = gmx_simd_set1_f(0.001339993121934088894618990);
 390     const gmx_simd_float_t  CC4      = gmx_simd_set1_f(0.009618488957115180159497841);
 391     const gmx_simd_float_t  CC3      = gmx_simd_set1_f(0.05550328776964726865751735);
 392     const gmx_simd_float_t  CC2      = gmx_simd_set1_f(0.2402264689063408646490722);
 393     const gmx_simd_float_t  CC1      = gmx_simd_set1_f(0.6931472057372680777553816);
 394     const gmx_simd_float_t  one      = gmx_simd_set1_f(1.0f);
 395
 396     gmx_simd_float_t        fexppart;
 397     gmx_simd_float_t        intpart;
 398     gmx_simd_float_t        p;
 399     gmx_simd_fbool_t        valuemask;
 400
 401     fexppart  = gmx_simd_set_exponent_f(x);
 402     intpart   = gmx_simd_round_f(x);
 403     valuemask = gmx_simd_cmple_f(gmx_simd_fabs_f(x), arglimit);
 404     fexppart  = gmx_simd_blendzero_f(fexppart, valuemask);
 405     x         = gmx_simd_sub_f(x, intpart);
 406
 407     p         = gmx_simd_fmadd_f(CC6, x, CC5);
 408     p         = gmx_simd_fmadd_f(p, x, CC4);
 409     p         = gmx_simd_fmadd_f(p, x, CC3);
 410     p         = gmx_simd_fmadd_f(p, x, CC2);
 411     p         = gmx_simd_fmadd_f(p, x, CC1);
 412     p         = gmx_simd_fmadd_f(p, x, one);
 413     x         = gmx_simd_mul_f(p, fexppart);
 414     return x;
 415 }
 416 #endif
 417
 418 #ifndef gmx_simd_exp_f
 419 /*! \brief SIMD float exp(x).
 420  *
 421  * You should normally call the real-precision routine \ref gmx_simd_exp_r.
 422  *
 423  * In addition to scaling the argument for 2^x this routine correctly does
 424  * extended precision arithmetics to improve accuracy.
 425  *
 426  * \param x Argument.
 427  * \result exp(x). Undefined if input argument caused overflow,
 428  * which can happen if abs(x) \> 7e13.
 429  */
 430 static gmx_inline gmx_simd_float_t gmx_simdcall
 431 gmx_simd_exp_f(gmx_simd_float_t x)
 432 {
 433     const gmx_simd_float_t  argscale     = gmx_simd_set1_f(1.44269504088896341f);
 434     /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
 435     const gmx_simd_float_t  arglimit     = gmx_simd_set1_f(126.0f);
 436     const gmx_simd_float_t  invargscale0 = gmx_simd_set1_f(-0.693145751953125f);
 437     const gmx_simd_float_t  invargscale1 = gmx_simd_set1_f(-1.428606765330187045e-06f);
 438     const gmx_simd_float_t  CC4          = gmx_simd_set1_f(0.00136324646882712841033936f);
 439     const gmx_simd_float_t  CC3          = gmx_simd_set1_f(0.00836596917361021041870117f);
 440     const gmx_simd_float_t  CC2          = gmx_simd_set1_f(0.0416710823774337768554688f);
 441     const gmx_simd_float_t  CC1          = gmx_simd_set1_f(0.166665524244308471679688f);
 442     const gmx_simd_float_t  CC0          = gmx_simd_set1_f(0.499999850988388061523438f);
 443     const gmx_simd_float_t  one          = gmx_simd_set1_f(1.0f);
 444     gmx_simd_float_t        fexppart;
 445     gmx_simd_float_t        intpart;
 446     gmx_simd_float_t        y, p;
 447     gmx_simd_fbool_t        valuemask;
 448
 449     y         = gmx_simd_mul_f(x, argscale);
 450     fexppart  = gmx_simd_set_exponent_f(y);  /* rounds to nearest int internally */
 451     intpart   = gmx_simd_round_f(y);         /* use same rounding algorithm here */
 452     valuemask = gmx_simd_cmple_f(gmx_simd_fabs_f(y), arglimit);
 453     fexppart  = gmx_simd_blendzero_f(fexppart, valuemask);
 454
 455     /* Extended precision arithmetics */
 456     x         = gmx_simd_fmadd_f(invargscale0, intpart, x);
 457     x         = gmx_simd_fmadd_f(invargscale1, intpart, x);
 458
 459     p         = gmx_simd_fmadd_f(CC4, x, CC3);
 460     p         = gmx_simd_fmadd_f(p, x, CC2);
 461     p         = gmx_simd_fmadd_f(p, x, CC1);
 462     p         = gmx_simd_fmadd_f(p, x, CC0);
 463     p         = gmx_simd_fmadd_f(gmx_simd_mul_f(x, x), p, x);
 464     p         = gmx_simd_add_f(p, one);
 465     x         = gmx_simd_mul_f(p, fexppart);
 466     return x;
 467 }
 468 #endif
 469
 470 /*! \brief SIMD float erf(x).
 471  *
 472  * You should normally call the real-precision routine \ref gmx_simd_erf_r.
 473  *
 474  * \param x The value to calculate erf(x) for.
 475  * \result erf(x)
 476  *
 477  * This routine achieves very close to full precision, but we do not care about
 478  * the last bit or the subnormal result range.
 479  */
 480 static gmx_inline gmx_simd_float_t gmx_simdcall
 481 gmx_simd_erf_f(gmx_simd_float_t x)
 482 {
 483     /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
 484     const gmx_simd_float_t  CA6      = gmx_simd_set1_f(7.853861353153693e-5f);
 485     const gmx_simd_float_t  CA5      = gmx_simd_set1_f(-8.010193625184903e-4f);
 486     const gmx_simd_float_t  CA4      = gmx_simd_set1_f(5.188327685732524e-3f);
 487     const gmx_simd_float_t  CA3      = gmx_simd_set1_f(-2.685381193529856e-2f);
 488     const gmx_simd_float_t  CA2      = gmx_simd_set1_f(1.128358514861418e-1f);
 489     const gmx_simd_float_t  CA1      = gmx_simd_set1_f(-3.761262582423300e-1f);
 490     const gmx_simd_float_t  CA0      = gmx_simd_set1_f(1.128379165726710f);
 491     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
 492     const gmx_simd_float_t  CB9      = gmx_simd_set1_f(-0.0018629930017603923f);
 493     const gmx_simd_float_t  CB8      = gmx_simd_set1_f(0.003909821287598495f);
 494     const gmx_simd_float_t  CB7      = gmx_simd_set1_f(-0.0052094582210355615f);
 495     const gmx_simd_float_t  CB6      = gmx_simd_set1_f(0.005685614362160572f);
 496     const gmx_simd_float_t  CB5      = gmx_simd_set1_f(-0.0025367682853477272f);
 497     const gmx_simd_float_t  CB4      = gmx_simd_set1_f(-0.010199799682318782f);
 498     const gmx_simd_float_t  CB3      = gmx_simd_set1_f(0.04369575504816542f);
 499     const gmx_simd_float_t  CB2      = gmx_simd_set1_f(-0.11884063474674492f);
 500     const gmx_simd_float_t  CB1      = gmx_simd_set1_f(0.2732120154030589f);
 501     const gmx_simd_float_t  CB0      = gmx_simd_set1_f(0.42758357702025784f);
 502     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
 503     const gmx_simd_float_t  CC10     = gmx_simd_set1_f(-0.0445555913112064f);
 504     const gmx_simd_float_t  CC9      = gmx_simd_set1_f(0.21376355144663348f);
 505     const gmx_simd_float_t  CC8      = gmx_simd_set1_f(-0.3473187200259257f);
 506     const gmx_simd_float_t  CC7      = gmx_simd_set1_f(0.016690861551248114f);
 507     const gmx_simd_float_t  CC6      = gmx_simd_set1_f(0.7560973182491192f);
 508     const gmx_simd_float_t  CC5      = gmx_simd_set1_f(-1.2137903600145787f);
 509     const gmx_simd_float_t  CC4      = gmx_simd_set1_f(0.8411872321232948f);
 510     const gmx_simd_float_t  CC3      = gmx_simd_set1_f(-0.08670413896296343f);
 511     const gmx_simd_float_t  CC2      = gmx_simd_set1_f(-0.27124782687240334f);
 512     const gmx_simd_float_t  CC1      = gmx_simd_set1_f(-0.0007502488047806069f);
 513     const gmx_simd_float_t  CC0      = gmx_simd_set1_f(0.5642114853803148f);
 514     const gmx_simd_float_t  one      = gmx_simd_set1_f(1.0f);
 515     const gmx_simd_float_t  two      = gmx_simd_set1_f(2.0f);
 516
 517     gmx_simd_float_t        x2, x4, y;
 518     gmx_simd_float_t        t, t2, w, w2;
 519     gmx_simd_float_t        pA0, pA1, pB0, pB1, pC0, pC1;
 520     gmx_simd_float_t        expmx2;
 521     gmx_simd_float_t        res_erf, res_erfc, res;
 522     gmx_simd_fbool_t        mask, msk_erf;
 523
 524     /* Calculate erf() */
 525     x2   = gmx_simd_mul_f(x, x);
 526     x4   = gmx_simd_mul_f(x2, x2);
 527
 528     pA0  = gmx_simd_fmadd_f(CA6, x4, CA4);
 529     pA1  = gmx_simd_fmadd_f(CA5, x4, CA3);
 530     pA0  = gmx_simd_fmadd_f(pA0, x4, CA2);
 531     pA1  = gmx_simd_fmadd_f(pA1, x4, CA1);
 532     pA0  = gmx_simd_mul_f(pA0, x4);
 533     pA0  = gmx_simd_fmadd_f(pA1, x2, pA0);
 534     /* Constant term must come last for precision reasons */
 535     pA0  = gmx_simd_add_f(pA0, CA0);
 536
 537     res_erf = gmx_simd_mul_f(x, pA0);
 538
 539     /* Calculate erfc */
 540     y       = gmx_simd_fabs_f(x);
 541     msk_erf = gmx_simd_cmplt_f(y, gmx_simd_set1_f(0.75f));
 542     t       = gmx_simd_inv_notmaskfpe_f(y, msk_erf);
 543     w       = gmx_simd_sub_f(t, one);
 544     t2      = gmx_simd_mul_f(t, t);
 545     w2      = gmx_simd_mul_f(w, w);
 546
 547     /* No need for a floating-point sieve here (as in erfc), since erf()
 548      * will never return values that are extremely small for large args.
 549      */
 550     expmx2  = gmx_simd_exp_f( gmx_simd_fneg_f( gmx_simd_mul_f(y, y)));
 551
 552     pB1  = gmx_simd_fmadd_f(CB9, w2, CB7);
 553     pB0  = gmx_simd_fmadd_f(CB8, w2, CB6);
 554     pB1  = gmx_simd_fmadd_f(pB1, w2, CB5);
 555     pB0  = gmx_simd_fmadd_f(pB0, w2, CB4);
 556     pB1  = gmx_simd_fmadd_f(pB1, w2, CB3);
 557     pB0  = gmx_simd_fmadd_f(pB0, w2, CB2);
 558     pB1  = gmx_simd_fmadd_f(pB1, w2, CB1);
 559     pB0  = gmx_simd_fmadd_f(pB0, w2, CB0);
 560     pB0  = gmx_simd_fmadd_f(pB1, w, pB0);
 561
 562     pC0  = gmx_simd_fmadd_f(CC10, t2, CC8);
 563     pC1  = gmx_simd_fmadd_f(CC9, t2, CC7);
 564     pC0  = gmx_simd_fmadd_f(pC0, t2, CC6);
 565     pC1  = gmx_simd_fmadd_f(pC1, t2, CC5);
 566     pC0  = gmx_simd_fmadd_f(pC0, t2, CC4);
 567     pC1  = gmx_simd_fmadd_f(pC1, t2, CC3);
 568     pC0  = gmx_simd_fmadd_f(pC0, t2, CC2);
 569     pC1  = gmx_simd_fmadd_f(pC1, t2, CC1);
 570
 571     pC0  = gmx_simd_fmadd_f(pC0, t2, CC0);
 572     pC0  = gmx_simd_fmadd_f(pC1, t, pC0);
 573     pC0  = gmx_simd_mul_f(pC0, t);
 574
 575     /* SELECT pB0 or pC0 for erfc() */
 576     mask     = gmx_simd_cmplt_f(two, y);
 577     res_erfc = gmx_simd_blendv_f(pB0, pC0, mask);
 578     res_erfc = gmx_simd_mul_f(res_erfc, expmx2);
 579
 580     /* erfc(x<0) = 2-erfc(|x|) */
 581     mask     = gmx_simd_cmplt_f(x, gmx_simd_setzero_f());
 582     res_erfc = gmx_simd_blendv_f(res_erfc, gmx_simd_sub_f(two, res_erfc), mask);
 583
 584     /* Select erf() or erfc() */
 585     res  = gmx_simd_blendv_f(gmx_simd_sub_f(one, res_erfc), res_erf, msk_erf);
 586
 587     return res;
 588 }
 589
 590 /*! \brief SIMD float erfc(x).
 591  *
 592  * You should normally call the real-precision routine \ref gmx_simd_erfc_r.
 593  *
 594  * \param x The value to calculate erfc(x) for.
 595  * \result erfc(x)
 596  *
 597  * This routine achieves full precision (bar the last bit) over most of the
 598  * input range, but for large arguments where the result is getting close
 599  * to the minimum representable numbers we accept slightly larger errors
 600  * (think results that are in the ballpark of 10^-30 for single precision,
 601  * or 10^-200 for double) since that is not relevant for MD.
 602  */
 603 static gmx_inline gmx_simd_float_t gmx_simdcall
 604 gmx_simd_erfc_f(gmx_simd_float_t x)
 605 {
 606     /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
 607     const gmx_simd_float_t  CA6      = gmx_simd_set1_f(7.853861353153693e-5f);
 608     const gmx_simd_float_t  CA5      = gmx_simd_set1_f(-8.010193625184903e-4f);
 609     const gmx_simd_float_t  CA4      = gmx_simd_set1_f(5.188327685732524e-3f);
 610     const gmx_simd_float_t  CA3      = gmx_simd_set1_f(-2.685381193529856e-2f);
 611     const gmx_simd_float_t  CA2      = gmx_simd_set1_f(1.128358514861418e-1f);
 612     const gmx_simd_float_t  CA1      = gmx_simd_set1_f(-3.761262582423300e-1f);
 613     const gmx_simd_float_t  CA0      = gmx_simd_set1_f(1.128379165726710f);
 614     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
 615     const gmx_simd_float_t  CB9      = gmx_simd_set1_f(-0.0018629930017603923f);
 616     const gmx_simd_float_t  CB8      = gmx_simd_set1_f(0.003909821287598495f);
 617     const gmx_simd_float_t  CB7      = gmx_simd_set1_f(-0.0052094582210355615f);
 618     const gmx_simd_float_t  CB6      = gmx_simd_set1_f(0.005685614362160572f);
 619     const gmx_simd_float_t  CB5      = gmx_simd_set1_f(-0.0025367682853477272f);
 620     const gmx_simd_float_t  CB4      = gmx_simd_set1_f(-0.010199799682318782f);
 621     const gmx_simd_float_t  CB3      = gmx_simd_set1_f(0.04369575504816542f);
 622     const gmx_simd_float_t  CB2      = gmx_simd_set1_f(-0.11884063474674492f);
 623     const gmx_simd_float_t  CB1      = gmx_simd_set1_f(0.2732120154030589f);
 624     const gmx_simd_float_t  CB0      = gmx_simd_set1_f(0.42758357702025784f);
 625     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
 626     const gmx_simd_float_t  CC10     = gmx_simd_set1_f(-0.0445555913112064f);
 627     const gmx_simd_float_t  CC9      = gmx_simd_set1_f(0.21376355144663348f);
 628     const gmx_simd_float_t  CC8      = gmx_simd_set1_f(-0.3473187200259257f);
 629     const gmx_simd_float_t  CC7      = gmx_simd_set1_f(0.016690861551248114f);
 630     const gmx_simd_float_t  CC6      = gmx_simd_set1_f(0.7560973182491192f);
 631     const gmx_simd_float_t  CC5      = gmx_simd_set1_f(-1.2137903600145787f);
 632     const gmx_simd_float_t  CC4      = gmx_simd_set1_f(0.8411872321232948f);
 633     const gmx_simd_float_t  CC3      = gmx_simd_set1_f(-0.08670413896296343f);
 634     const gmx_simd_float_t  CC2      = gmx_simd_set1_f(-0.27124782687240334f);
 635     const gmx_simd_float_t  CC1      = gmx_simd_set1_f(-0.0007502488047806069f);
 636     const gmx_simd_float_t  CC0      = gmx_simd_set1_f(0.5642114853803148f);
 637     /* Coefficients for expansion of exp(x) in [0,0.1] */
 638     /* CD0 and CD1 are both 1.0, so no need to declare them separately */
 639     const gmx_simd_float_t  CD2      = gmx_simd_set1_f(0.5000066608081202f);
 640     const gmx_simd_float_t  CD3      = gmx_simd_set1_f(0.1664795422874624f);
 641     const gmx_simd_float_t  CD4      = gmx_simd_set1_f(0.04379839977652482f);
 642     const gmx_simd_float_t  one      = gmx_simd_set1_f(1.0f);
 643     const gmx_simd_float_t  two      = gmx_simd_set1_f(2.0f);
 644
 645     /* We need to use a small trick here, since we cannot assume all SIMD
 646      * architectures support integers, and the flag we want (0xfffff000) would
 647      * evaluate to NaN (i.e., it cannot be expressed as a floating-point num).
 648      * Instead, we represent the flags 0xf0f0f000 and 0x0f0f0000 as valid
 649      * fp numbers, and perform a logical or. Since the expression is constant,
 650      * we can at least hope it is evaluated at compile-time.
 651      */
 652 #ifdef GMX_SIMD_HAVE_LOGICAL
 653     const gmx_simd_float_t  sieve    = gmx_simd_or_f(gmx_simd_set1_f(-5.965323564e+29f), gmx_simd_set1_f(7.05044434e-30f));
 654 #else
 655     const int               isieve   = 0xFFFFF000;
 656     float                   mem[GMX_SIMD_REAL_WIDTH*2];
 657     float *                 pmem = gmx_simd_align_f(mem);
 658     union {
 659         float f; int i;
 660     } conv;
 661     int                     i;
 662 #endif
 663
 664     gmx_simd_float_t        x2, x4, y;
 665     gmx_simd_float_t        q, z, t, t2, w, w2;
 666     gmx_simd_float_t        pA0, pA1, pB0, pB1, pC0, pC1;
 667     gmx_simd_float_t        expmx2, corr;
 668     gmx_simd_float_t        res_erf, res_erfc, res;
 669     gmx_simd_fbool_t        mask, msk_erf;
 670
 671     /* Calculate erf() */
 672     x2     = gmx_simd_mul_f(x, x);
 673     x4     = gmx_simd_mul_f(x2, x2);
 674
 675     pA0  = gmx_simd_fmadd_f(CA6, x4, CA4);
 676     pA1  = gmx_simd_fmadd_f(CA5, x4, CA3);
 677     pA0  = gmx_simd_fmadd_f(pA0, x4, CA2);
 678     pA1  = gmx_simd_fmadd_f(pA1, x4, CA1);
 679     pA1  = gmx_simd_mul_f(pA1, x2);
 680     pA0  = gmx_simd_fmadd_f(pA0, x4, pA1);
 681     /* Constant term must come last for precision reasons */
 682     pA0  = gmx_simd_add_f(pA0, CA0);
 683
 684     res_erf = gmx_simd_mul_f(x, pA0);
 685
 686     /* Calculate erfc */
 687     y       = gmx_simd_fabs_f(x);
 688     msk_erf = gmx_simd_cmplt_f(y, gmx_simd_set1_f(0.75f));
 689     t       = gmx_simd_inv_notmaskfpe_f(y, msk_erf);
 690     w       = gmx_simd_sub_f(t, one);
 691     t2      = gmx_simd_mul_f(t, t);
 692     w2      = gmx_simd_mul_f(w, w);
 693     /*
 694      * We cannot simply calculate exp(-y2) directly in single precision, since
 695      * that will lose a couple of bits of precision due to the multiplication.
 696      * Instead, we introduce y=z+w, where the last 12 bits of precision are in w.
 697      * Then we get exp(-y2) = exp(-z2)*exp((z-y)*(z+y)).
 698      *
 699      * The only drawback with this is that it requires TWO separate exponential
 700      * evaluations, which would be horrible performance-wise. However, the argument
 701      * for the second exp() call is always small, so there we simply use a
 702      * low-order minimax expansion on [0,0.1].
 703      *
 704      * However, this neat idea requires support for logical ops (and) on
 705      * FP numbers, which some vendors decided isn't necessary in their SIMD
 706      * instruction sets (Hi, IBM VSX!). In principle we could use some tricks
 707      * in double, but we still need memory as a backup when that is not available,
 708      * and this case is rare enough that we go directly there...
 709      */
 710 #ifdef GMX_SIMD_HAVE_LOGICAL
 711     z       = gmx_simd_and_f(y, sieve);
 712 #else
 713     gmx_simd_store_f(pmem, y);
 714     for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
 715     {
 716         conv.f  = pmem[i];
 717         conv.i  = conv.i & isieve;
 718         pmem[i] = conv.f;
 719     }
 720     z = gmx_simd_load_f(pmem);
 721 #endif
 722     q       = gmx_simd_mul_f( gmx_simd_sub_f(z, y), gmx_simd_add_f(z, y) );
 723     corr    = gmx_simd_fmadd_f(CD4, q, CD3);
 724     corr    = gmx_simd_fmadd_f(corr, q, CD2);
 725     corr    = gmx_simd_fmadd_f(corr, q, one);
 726     corr    = gmx_simd_fmadd_f(corr, q, one);
 727
 728     expmx2  = gmx_simd_exp_f( gmx_simd_fneg_f( gmx_simd_mul_f(z, z) ) );
 729     expmx2  = gmx_simd_mul_f(expmx2, corr);
 730
 731     pB1  = gmx_simd_fmadd_f(CB9, w2, CB7);
 732     pB0  = gmx_simd_fmadd_f(CB8, w2, CB6);
 733     pB1  = gmx_simd_fmadd_f(pB1, w2, CB5);
 734     pB0  = gmx_simd_fmadd_f(pB0, w2, CB4);
 735     pB1  = gmx_simd_fmadd_f(pB1, w2, CB3);
 736     pB0  = gmx_simd_fmadd_f(pB0, w2, CB2);
 737     pB1  = gmx_simd_fmadd_f(pB1, w2, CB1);
 738     pB0  = gmx_simd_fmadd_f(pB0, w2, CB0);
 739     pB0  = gmx_simd_fmadd_f(pB1, w, pB0);
 740
 741     pC0  = gmx_simd_fmadd_f(CC10, t2, CC8);
 742     pC1  = gmx_simd_fmadd_f(CC9, t2, CC7);
 743     pC0  = gmx_simd_fmadd_f(pC0, t2, CC6);
 744     pC1  = gmx_simd_fmadd_f(pC1, t2, CC5);
 745     pC0  = gmx_simd_fmadd_f(pC0, t2, CC4);
 746     pC1  = gmx_simd_fmadd_f(pC1, t2, CC3);
 747     pC0  = gmx_simd_fmadd_f(pC0, t2, CC2);
 748     pC1  = gmx_simd_fmadd_f(pC1, t2, CC1);
 749
 750     pC0  = gmx_simd_fmadd_f(pC0, t2, CC0);
 751     pC0  = gmx_simd_fmadd_f(pC1, t, pC0);
 752     pC0  = gmx_simd_mul_f(pC0, t);
 753
 754     /* SELECT pB0 or pC0 for erfc() */
 755     mask     = gmx_simd_cmplt_f(two, y);
 756     res_erfc = gmx_simd_blendv_f(pB0, pC0, mask);
 757     res_erfc = gmx_simd_mul_f(res_erfc, expmx2);
 758
 759     /* erfc(x<0) = 2-erfc(|x|) */
 760     mask     = gmx_simd_cmplt_f(x, gmx_simd_setzero_f());
 761     res_erfc = gmx_simd_blendv_f(res_erfc, gmx_simd_sub_f(two, res_erfc), mask);
 762
 763     /* Select erf() or erfc() */
 764     res  = gmx_simd_blendv_f(res_erfc, gmx_simd_sub_f(one, res_erf), msk_erf);
 765
 766     return res;
 767 }
 768
 769 /*! \brief SIMD float sin \& cos.
 770  *
 771  * You should normally call the real-precision routine \ref gmx_simd_sincos_r.
 772  *
 773  * \param x The argument to evaluate sin/cos for
 774  * \param[out] sinval Sin(x)
 775  * \param[out] cosval Cos(x)
 776  *
 777  * This version achieves close to machine precision, but for very large
 778  * magnitudes of the argument we inherently begin to lose accuracy due to the
 779  * argument reduction, despite using extended precision arithmetics internally.
 780  */
 781 static gmx_inline void gmx_simdcall
 782 gmx_simd_sincos_f(gmx_simd_float_t x, gmx_simd_float_t *sinval, gmx_simd_float_t *cosval)
 783 {
 784     /* Constants to subtract Pi/4*x from y while minimizing precision loss */
 785     const gmx_simd_float_t  argred0         = gmx_simd_set1_f(-1.5703125);
 786     const gmx_simd_float_t  argred1         = gmx_simd_set1_f(-4.83751296997070312500e-04f);
 787     const gmx_simd_float_t  argred2         = gmx_simd_set1_f(-7.54953362047672271729e-08f);
 788     const gmx_simd_float_t  argred3         = gmx_simd_set1_f(-2.56334406825708960298e-12f);
 789     const gmx_simd_float_t  two_over_pi     = gmx_simd_set1_f(2.0f/M_PI);
 790     const gmx_simd_float_t  const_sin2      = gmx_simd_set1_f(-1.9515295891e-4f);
 791     const gmx_simd_float_t  const_sin1      = gmx_simd_set1_f( 8.3321608736e-3f);
 792     const gmx_simd_float_t  const_sin0      = gmx_simd_set1_f(-1.6666654611e-1f);
 793     const gmx_simd_float_t  const_cos2      = gmx_simd_set1_f( 2.443315711809948e-5f);
 794     const gmx_simd_float_t  const_cos1      = gmx_simd_set1_f(-1.388731625493765e-3f);
 795     const gmx_simd_float_t  const_cos0      = gmx_simd_set1_f( 4.166664568298827e-2f);
 796     const gmx_simd_float_t  half            = gmx_simd_set1_f(0.5f);
 797     const gmx_simd_float_t  one             = gmx_simd_set1_f(1.0f);
 798     gmx_simd_float_t        ssign, csign;
 799     gmx_simd_float_t        x2, y, z, psin, pcos, sss, ccc;
 800     gmx_simd_fbool_t        mask;
 801 #if (defined GMX_SIMD_HAVE_FINT32) && (defined GMX_SIMD_HAVE_FINT32_ARITHMETICS) && (defined GMX_SIMD_HAVE_LOGICAL)
 802     const gmx_simd_fint32_t ione            = gmx_simd_set1_fi(1);
 803     const gmx_simd_fint32_t itwo            = gmx_simd_set1_fi(2);
 804     gmx_simd_fint32_t       iy;
 805
 806     z       = gmx_simd_mul_f(x, two_over_pi);
 807     iy      = gmx_simd_cvt_f2i(z);
 808     y       = gmx_simd_round_f(z);
 809
 810     mask    = gmx_simd_cvt_fib2fb(gmx_simd_cmpeq_fi(gmx_simd_and_fi(iy, ione), gmx_simd_setzero_fi()));
 811     ssign   = gmx_simd_blendzero_f(gmx_simd_set1_f(GMX_FLOAT_NEGZERO), gmx_simd_cvt_fib2fb(gmx_simd_cmpeq_fi(gmx_simd_and_fi(iy, itwo), itwo)));
 812     csign   = gmx_simd_blendzero_f(gmx_simd_set1_f(GMX_FLOAT_NEGZERO), gmx_simd_cvt_fib2fb(gmx_simd_cmpeq_fi(gmx_simd_and_fi(gmx_simd_add_fi(iy, ione), itwo), itwo)));
 813 #else
 814     const gmx_simd_float_t  quarter         = gmx_simd_set1_f(0.25f);
 815     const gmx_simd_float_t  minusquarter    = gmx_simd_set1_f(-0.25f);
 816     gmx_simd_float_t        q;
 817     gmx_simd_fbool_t        m1, m2, m3;
 818
 819     /* The most obvious way to find the arguments quadrant in the unit circle
 820      * to calculate the sign is to use integer arithmetic, but that is not
 821      * present in all SIMD implementations. As an alternative, we have devised a
 822      * pure floating-point algorithm that uses truncation for argument reduction
 823      * so that we get a new value 0<=q<1 over the unit circle, and then
 824      * do floating-point comparisons with fractions. This is likely to be
 825      * slightly slower (~10%) due to the longer latencies of floating-point, so
 826      * we only use it when integer SIMD arithmetic is not present.
 827      */
 828     ssign   = x;
 829     x       = gmx_simd_fabs_f(x);
 830     /* It is critical that half-way cases are rounded down */
 831     z       = gmx_simd_fmadd_f(x, two_over_pi, half);
 832     y       = gmx_simd_trunc_f(z);
 833     q       = gmx_simd_mul_f(z, quarter);
 834     q       = gmx_simd_sub_f(q, gmx_simd_trunc_f(q));
 835     /* z now starts at 0.0 for x=-pi/4 (although neg. values cannot occur), and
 836      * then increased by 1.0 as x increases by 2*Pi, when it resets to 0.0.
 837      * This removes the 2*Pi periodicity without using any integer arithmetic.
 838      * First check if y had the value 2 or 3, set csign if true.
 839      */
 840     q       = gmx_simd_sub_f(q, half);
 841     /* If we have logical operations we can work directly on the signbit, which
 842      * saves instructions. Otherwise we need to represent signs as +1.0/-1.0.
 843      * Thus, if you are altering defines to debug alternative code paths, the
 844      * two GMX_SIMD_HAVE_LOGICAL sections in this routine must either both be
 845      * active or inactive - you will get errors if only one is used.
 846      */
 847 #    ifdef GMX_SIMD_HAVE_LOGICAL
 848     ssign   = gmx_simd_and_f(ssign, gmx_simd_set1_f(GMX_FLOAT_NEGZERO));
 849     csign   = gmx_simd_andnot_f(q, gmx_simd_set1_f(GMX_FLOAT_NEGZERO));
 850     ssign   = gmx_simd_xor_f(ssign, csign);
 851 #    else
 852     csign   = gmx_simd_xor_sign_f(gmx_simd_set1_f(-1.0f), q);
 853     // ALT: csign = gmx_simd_fneg_f(gmx_simd_copysign(gmx_simd_set1_f(1.0),q));
 854
 855     ssign   = gmx_simd_xor_sign_f(ssign, csign);    /* swap ssign if csign was set. */
 856 #    endif
 857     /* Check if y had value 1 or 3 (remember we subtracted 0.5 from q) */
 858     m1      = gmx_simd_cmplt_f(q, minusquarter);
 859     m2      = gmx_simd_cmple_f(gmx_simd_setzero_f(), q);
 860     m3      = gmx_simd_cmplt_f(q, quarter);
 861     m2      = gmx_simd_and_fb(m2, m3);
 862     mask    = gmx_simd_or_fb(m1, m2);
 863     /* where mask is FALSE, set sign. */
 864     csign   = gmx_simd_xor_sign_f(csign, gmx_simd_blendv_f(gmx_simd_set1_f(-1.0f), one, mask));
 865 #endif
 866     x       = gmx_simd_fmadd_f(y, argred0, x);
 867     x       = gmx_simd_fmadd_f(y, argred1, x);
 868     x       = gmx_simd_fmadd_f(y, argred2, x);
 869     x       = gmx_simd_fmadd_f(y, argred3, x);
 870     x2      = gmx_simd_mul_f(x, x);
 871
 872     psin    = gmx_simd_fmadd_f(const_sin2, x2, const_sin1);
 873     psin    = gmx_simd_fmadd_f(psin, x2, const_sin0);
 874     psin    = gmx_simd_fmadd_f(psin, gmx_simd_mul_f(x, x2), x);
 875     pcos    = gmx_simd_fmadd_f(const_cos2, x2, const_cos1);
 876     pcos    = gmx_simd_fmadd_f(pcos, x2, const_cos0);
 877     pcos    = gmx_simd_fmsub_f(pcos, x2, half);
 878     pcos    = gmx_simd_fmadd_f(pcos, x2, one);
 879
 880     sss     = gmx_simd_blendv_f(pcos, psin, mask);
 881     ccc     = gmx_simd_blendv_f(psin, pcos, mask);
 882     /* See comment for GMX_SIMD_HAVE_LOGICAL section above. */
 883 #ifdef GMX_SIMD_HAVE_LOGICAL
 884     *sinval = gmx_simd_xor_f(sss, ssign);
 885     *cosval = gmx_simd_xor_f(ccc, csign);
 886 #else
 887     *sinval = gmx_simd_xor_sign_f(sss, ssign);
 888     *cosval = gmx_simd_xor_sign_f(ccc, csign);
 889 #endif
 890 }
 891
 892 /*! \brief SIMD float sin(x).
 893  *
 894  * You should normally call the real-precision routine \ref gmx_simd_sin_r.
 895  *
 896  * \param x The argument to evaluate sin for
 897  * \result Sin(x)
 898  *
 899  * \attention Do NOT call both sin & cos if you need both results, since each of them
 900  * will then call \ref gmx_simd_sincos_r and waste a factor 2 in performance.
 901  */
 902 static gmx_inline gmx_simd_float_t gmx_simdcall
 903 gmx_simd_sin_f(gmx_simd_float_t x)
 904 {
 905     gmx_simd_float_t s, c;
 906     gmx_simd_sincos_f(x, &s, &c);
 907     return s;
 908 }
 909
 910 /*! \brief SIMD float cos(x).
 911  *
 912  * You should normally call the real-precision routine \ref gmx_simd_cos_r.
 913  *
 914  * \param x The argument to evaluate cos for
 915  * \result Cos(x)
 916  *
 917  * \attention Do NOT call both sin & cos if you need both results, since each of them
 918  * will then call \ref gmx_simd_sincos_r and waste a factor 2 in performance.
 919  */
 920 static gmx_inline gmx_simd_float_t gmx_simdcall
 921 gmx_simd_cos_f(gmx_simd_float_t x)
 922 {
 923     gmx_simd_float_t s, c;
 924     gmx_simd_sincos_f(x, &s, &c);
 925     return c;
 926 }
 927
 928 /*! \brief SIMD float tan(x).
 929  *
 930  * You should normally call the real-precision routine \ref gmx_simd_tan_r.
 931  *
 932  * \param x The argument to evaluate tan for
 933  * \result Tan(x)
 934  */
 935 static gmx_inline gmx_simd_float_t gmx_simdcall
 936 gmx_simd_tan_f(gmx_simd_float_t x)
 937 {
 938     const gmx_simd_float_t  argred0         = gmx_simd_set1_f(-1.5703125);
 939     const gmx_simd_float_t  argred1         = gmx_simd_set1_f(-4.83751296997070312500e-04f);
 940     const gmx_simd_float_t  argred2         = gmx_simd_set1_f(-7.54953362047672271729e-08f);
 941     const gmx_simd_float_t  argred3         = gmx_simd_set1_f(-2.56334406825708960298e-12f);
 942     const gmx_simd_float_t  two_over_pi     = gmx_simd_set1_f(2.0f/M_PI);
 943     const gmx_simd_float_t  CT6             = gmx_simd_set1_f(0.009498288995810566122993911);
 944     const gmx_simd_float_t  CT5             = gmx_simd_set1_f(0.002895755790837379295226923);
 945     const gmx_simd_float_t  CT4             = gmx_simd_set1_f(0.02460087336161924491836265);
 946     const gmx_simd_float_t  CT3             = gmx_simd_set1_f(0.05334912882656359828045988);
 947     const gmx_simd_float_t  CT2             = gmx_simd_set1_f(0.1333989091464957704418495);
 948     const gmx_simd_float_t  CT1             = gmx_simd_set1_f(0.3333307599244198227797507);
 949
 950     gmx_simd_float_t        x2, p, y, z;
 951     gmx_simd_fbool_t        mask;
 952
 953 #if (defined GMX_SIMD_HAVE_FINT32) && (defined GMX_SIMD_HAVE_FINT32_ARITHMETICS) && (defined GMX_SIMD_HAVE_LOGICAL)
 954     gmx_simd_fint32_t  iy;
 955     gmx_simd_fint32_t  ione = gmx_simd_set1_fi(1);
 956
 957     z       = gmx_simd_mul_f(x, two_over_pi);
 958     iy      = gmx_simd_cvt_f2i(z);
 959     y       = gmx_simd_round_f(z);
 960     mask    = gmx_simd_cvt_fib2fb(gmx_simd_cmpeq_fi(gmx_simd_and_fi(iy, ione), ione));
 961
 962     x       = gmx_simd_fmadd_f(y, argred0, x);
 963     x       = gmx_simd_fmadd_f(y, argred1, x);
 964     x       = gmx_simd_fmadd_f(y, argred2, x);
 965     x       = gmx_simd_fmadd_f(y, argred3, x);
 966     x       = gmx_simd_xor_f(gmx_simd_blendzero_f(gmx_simd_set1_f(GMX_FLOAT_NEGZERO), mask), x);
 967 #else
 968     const gmx_simd_float_t  quarter         = gmx_simd_set1_f(0.25f);
 969     const gmx_simd_float_t  half            = gmx_simd_set1_f(0.5f);
 970     const gmx_simd_float_t  threequarter    = gmx_simd_set1_f(0.75f);
 971     gmx_simd_float_t        w, q;
 972     gmx_simd_fbool_t        m1, m2, m3;
 973
 974     w       = gmx_simd_fabs_f(x);
 975     z       = gmx_simd_fmadd_f(w, two_over_pi, half);
 976     y       = gmx_simd_trunc_f(z);
 977     q       = gmx_simd_mul_f(z, quarter);
 978     q       = gmx_simd_sub_f(q, gmx_simd_trunc_f(q));
 979     m1      = gmx_simd_cmple_f(quarter, q);
 980     m2      = gmx_simd_cmplt_f(q, half);
 981     m3      = gmx_simd_cmple_f(threequarter, q);
 982     m1      = gmx_simd_and_fb(m1, m2);
 983     mask    = gmx_simd_or_fb(m1, m3);
 984     w       = gmx_simd_fmadd_f(y, argred0, w);
 985     w       = gmx_simd_fmadd_f(y, argred1, w);
 986     w       = gmx_simd_fmadd_f(y, argred2, w);
 987     w       = gmx_simd_fmadd_f(y, argred3, w);
 988
 989     w       = gmx_simd_blendv_f(w, gmx_simd_fneg_f(w), mask);
 990     x       = gmx_simd_xor_sign_f(w, x);
 991 #endif
 992     x2      = gmx_simd_mul_f(x, x);
 993     p       = gmx_simd_fmadd_f(CT6, x2, CT5);
 994     p       = gmx_simd_fmadd_f(p, x2, CT4);
 995     p       = gmx_simd_fmadd_f(p, x2, CT3);
 996     p       = gmx_simd_fmadd_f(p, x2, CT2);
 997     p       = gmx_simd_fmadd_f(p, x2, CT1);
 998     p       = gmx_simd_fmadd_f(x2, gmx_simd_mul_f(p, x), x);
 999
1000     p       = gmx_simd_blendv_f( p, gmx_simd_inv_maskfpe_f(p, mask), mask);
1001     return p;
1002 }
1003
1004 /*! \brief SIMD float asin(x).
1005  *
1006  * You should normally call the real-precision routine \ref gmx_simd_asin_r.
1007  *
1008  * \param x The argument to evaluate asin for
1009  * \result Asin(x)
1010  */
1011 static gmx_inline gmx_simd_float_t gmx_simdcall
1012 gmx_simd_asin_f(gmx_simd_float_t x)
1013 {
1014     const gmx_simd_float_t limitlow   = gmx_simd_set1_f(1e-4f);
1015     const gmx_simd_float_t half       = gmx_simd_set1_f(0.5f);
1016     const gmx_simd_float_t one        = gmx_simd_set1_f(1.0f);
1017     const gmx_simd_float_t halfpi     = gmx_simd_set1_f((float)M_PI/2.0f);
1018     const gmx_simd_float_t CC5        = gmx_simd_set1_f(4.2163199048E-2f);
1019     const gmx_simd_float_t CC4        = gmx_simd_set1_f(2.4181311049E-2f);
1020     const gmx_simd_float_t CC3        = gmx_simd_set1_f(4.5470025998E-2f);
1021     const gmx_simd_float_t CC2        = gmx_simd_set1_f(7.4953002686E-2f);
1022     const gmx_simd_float_t CC1        = gmx_simd_set1_f(1.6666752422E-1f);
1023     gmx_simd_float_t       xabs;
1024     gmx_simd_float_t       z, z1, z2, q, q1, q2;
1025     gmx_simd_float_t       pA, pB;
1026     gmx_simd_fbool_t       mask, mask2;
1027
1028     xabs  = gmx_simd_fabs_f(x);
1029     mask  = gmx_simd_cmplt_f(half, xabs);
1030     z1    = gmx_simd_mul_f(half, gmx_simd_sub_f(one, xabs));
1031     mask2 = gmx_simd_cmpeq_f(xabs, one);
1032     q1    = gmx_simd_mul_f(z1, gmx_simd_invsqrt_notmaskfpe_f(z1, mask2));
1033     q1    = gmx_simd_blendnotzero_f(q1, mask2);
1034     q2    = xabs;
1035     z2    = gmx_simd_mul_f(q2, q2);
1036     z     = gmx_simd_blendv_f(z2, z1, mask);
1037     q     = gmx_simd_blendv_f(q2, q1, mask);
1038
1039     z2    = gmx_simd_mul_f(z, z);
1040     pA    = gmx_simd_fmadd_f(CC5, z2, CC3);
1041     pB    = gmx_simd_fmadd_f(CC4, z2, CC2);
1042     pA    = gmx_simd_fmadd_f(pA, z2, CC1);
1043     pA    = gmx_simd_mul_f(pA, z);
1044     z     = gmx_simd_fmadd_f(pB, z2, pA);
1045     z     = gmx_simd_fmadd_f(z, q, q);
1046     q2    = gmx_simd_sub_f(halfpi, z);
1047     q2    = gmx_simd_sub_f(q2, z);
1048     z     = gmx_simd_blendv_f(z, q2, mask);
1049
1050     mask  = gmx_simd_cmplt_f(limitlow, xabs);
1051     z     = gmx_simd_blendv_f( xabs, z, mask );
1052     z     = gmx_simd_xor_sign_f(z, x);
1053
1054     return z;
1055 }
1056
1057 /*! \brief SIMD float acos(x).
1058  *
1059  * You should normally call the real-precision routine \ref gmx_simd_acos_r.
1060  *
1061  * \param x The argument to evaluate acos for
1062  * \result Acos(x)
1063  */
1064 static gmx_inline gmx_simd_float_t gmx_simdcall
1065 gmx_simd_acos_f(gmx_simd_float_t x)
1066 {
1067     const gmx_simd_float_t one       = gmx_simd_set1_f(1.0f);
1068     const gmx_simd_float_t half      = gmx_simd_set1_f(0.5f);
1069     const gmx_simd_float_t pi        = gmx_simd_set1_f((float)M_PI);
1070     const gmx_simd_float_t halfpi    = gmx_simd_set1_f((float)M_PI/2.0f);
1071     gmx_simd_float_t       xabs;
1072     gmx_simd_float_t       z, z1, z2, z3;
1073     gmx_simd_fbool_t       mask1, mask2, mask3;
1074
1075     xabs  = gmx_simd_fabs_f(x);
1076     mask1 = gmx_simd_cmplt_f(half, xabs);
1077     mask2 = gmx_simd_cmplt_f(gmx_simd_setzero_f(), x);
1078
1079     z     = gmx_simd_mul_f(half, gmx_simd_sub_f(one, xabs));
1080     mask3 = gmx_simd_cmpeq_f(xabs, one);
1081     z     = gmx_simd_mul_f(z, gmx_simd_invsqrt_notmaskfpe_f(z, mask3));
1082     z     = gmx_simd_blendnotzero_f(z, mask3);
1083     z     = gmx_simd_blendv_f(x, z, mask1);
1084     z     = gmx_simd_asin_f(z);
1085
1086     z2    = gmx_simd_add_f(z, z);
1087     z1    = gmx_simd_sub_f(pi, z2);
1088     z3    = gmx_simd_sub_f(halfpi, z);
1089     z     = gmx_simd_blendv_f(z1, z2, mask2);
1090     z     = gmx_simd_blendv_f(z3, z, mask1);
1091
1092     return z;
1093 }
1094
1095 /*! \brief SIMD float asin(x).
1096  *
1097  * You should normally call the real-precision routine \ref gmx_simd_atan_r.
1098  *
1099  * \param x The argument to evaluate atan for
1100  * \result Atan(x), same argument/value range as standard math library.
1101  */
1102 static gmx_inline gmx_simd_float_t gmx_simdcall
1103 gmx_simd_atan_f(gmx_simd_float_t x)
1104 {
1105     const gmx_simd_float_t halfpi    = gmx_simd_set1_f(M_PI/2);
1106     const gmx_simd_float_t CA17      = gmx_simd_set1_f(0.002823638962581753730774f);
1107     const gmx_simd_float_t CA15      = gmx_simd_set1_f(-0.01595690287649631500244f);
1108     const gmx_simd_float_t CA13      = gmx_simd_set1_f(0.04250498861074447631836f);
1109     const gmx_simd_float_t CA11      = gmx_simd_set1_f(-0.07489009201526641845703f);
1110     const gmx_simd_float_t CA9       = gmx_simd_set1_f(0.1063479334115982055664f);
1111     const gmx_simd_float_t CA7       = gmx_simd_set1_f(-0.1420273631811141967773f);
1112     const gmx_simd_float_t CA5       = gmx_simd_set1_f(0.1999269574880599975585f);
1113     const gmx_simd_float_t CA3       = gmx_simd_set1_f(-0.3333310186862945556640f);
1114     const gmx_simd_float_t one       = gmx_simd_set1_f(1.0f);
1115     gmx_simd_float_t       x2, x3, x4, pA, pB;
1116     gmx_simd_fbool_t       mask, mask2;
1117
1118     mask  = gmx_simd_cmplt_f(x, gmx_simd_setzero_f());
1119     x     = gmx_simd_fabs_f(x);
1120     mask2 = gmx_simd_cmplt_f(one, x);
1121     x     = gmx_simd_blendv_f(x, gmx_simd_inv_maskfpe_f(x, mask2), mask2);
1122
1123     x2    = gmx_simd_mul_f(x, x);
1124     x3    = gmx_simd_mul_f(x2, x);
1125     x4    = gmx_simd_mul_f(x2, x2);
1126     pA    = gmx_simd_fmadd_f(CA17, x4, CA13);
1127     pB    = gmx_simd_fmadd_f(CA15, x4, CA11);
1128     pA    = gmx_simd_fmadd_f(pA, x4, CA9);
1129     pB    = gmx_simd_fmadd_f(pB, x4, CA7);
1130     pA    = gmx_simd_fmadd_f(pA, x4, CA5);
1131     pB    = gmx_simd_fmadd_f(pB, x4, CA3);
1132     pA    = gmx_simd_fmadd_f(pA, x2, pB);
1133     pA    = gmx_simd_fmadd_f(pA, x3, x);
1134
1135     pA    = gmx_simd_blendv_f(pA, gmx_simd_sub_f(halfpi, pA), mask2);
1136     pA    = gmx_simd_blendv_f(pA, gmx_simd_fneg_f(pA), mask);
1137
1138     return pA;
1139 }
1140
1141 /*! \brief SIMD float atan2(y,x).
1142  *
1143  * You should normally call the real-precision routine \ref gmx_simd_atan2_r.
1144  *
1145  * \param y Y component of vector, any quartile
1146  * \param x X component of vector, any quartile
1147  * \result Atan(y,x), same argument/value range as standard math library.
1148  *
1149  * \note This routine should provide correct results for all finite
1150  * non-zero or positive-zero arguments. However, negative zero arguments will
1151  * be treated as positive zero, which means the return value will deviate from
1152  * the standard math library atan2(y,x) for those cases. That should not be
1153  * of any concern in Gromacs, and in particular it will not affect calculations
1154  * of angles from vectors.
1155  */
1156 static gmx_inline gmx_simd_float_t gmx_simdcall
1157 gmx_simd_atan2_f(gmx_simd_float_t y, gmx_simd_float_t x)
1158 {
1159     const gmx_simd_float_t pi          = gmx_simd_set1_f(M_PI);
1160     const gmx_simd_float_t halfpi      = gmx_simd_set1_f(M_PI/2.0);
1161     gmx_simd_float_t       xinv, p, aoffset;
1162     gmx_simd_fbool_t       mask_x0, mask_y0, mask_xlt0, mask_ylt0;
1163
1164     mask_x0   = gmx_simd_cmpeq_f(x, gmx_simd_setzero_f());
1165     mask_y0   = gmx_simd_cmpeq_f(y, gmx_simd_setzero_f());
1166     mask_xlt0 = gmx_simd_cmplt_f(x, gmx_simd_setzero_f());
1167     mask_ylt0 = gmx_simd_cmplt_f(y, gmx_simd_setzero_f());
1168
1169     aoffset   = gmx_simd_blendzero_f(halfpi, mask_x0);
1170     aoffset   = gmx_simd_blendnotzero_f(aoffset, mask_y0);
1171
1172     aoffset   = gmx_simd_blendv_f(aoffset, pi, mask_xlt0);
1173     aoffset   = gmx_simd_blendv_f(aoffset, gmx_simd_fneg_f(aoffset), mask_ylt0);
1174
1175     xinv      = gmx_simd_blendnotzero_f(gmx_simd_inv_notmaskfpe_f(x, mask_x0), mask_x0);
1176     p         = gmx_simd_mul_f(y, xinv);
1177     p         = gmx_simd_atan_f(p);
1178     p         = gmx_simd_add_f(p, aoffset);
1179
1180     return p;
1181 }
1182
1183 /*! \brief Calculate the force correction due to PME analytically in SIMD float.
1184  *
1185  * You should normally call the real-precision routine \ref gmx_simd_pmecorrF_r.
1186  *
1187  * \param z2 \f$(r \beta)^2\f$ - see below for details.
1188  * \result Correction factor to coulomb force - see below for details.
1189  *
1190  * This routine is meant to enable analytical evaluation of the
1191  * direct-space PME electrostatic force to avoid tables.
1192  *
1193  * The direct-space potential should be \f$ \mbox{erfc}(\beta r)/r\f$, but there
1194  * are some problems evaluating that:
1195  *
1196  * First, the error function is difficult (read: expensive) to
1197  * approxmiate accurately for intermediate to large arguments, and
1198  * this happens already in ranges of \f$(\beta r)\f$ that occur in simulations.
1199  * Second, we now try to avoid calculating potentials in Gromacs but
1200  * use forces directly.
1201  *
1202  * We can simply things slight by noting that the PME part is really
1203  * a correction to the normal Coulomb force since \f$\mbox{erfc}(z)=1-\mbox{erf}(z)\f$, i.e.
1204  * \f[
1205  * V = \frac{1}{r} - \frac{\mbox{erf}(\beta r)}{r}
1206  * \f]
1207  * The first term we already have from the inverse square root, so
1208  * that we can leave out of this routine.
1209  *
1210  * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
1211  * the argument \f$beta r\f$ will be in the range 0.15 to ~4, which is
1212  * the range used for the minimax fit. Use your favorite plotting program
1213  * to realize how well-behaved \f$\frac{\mbox{erf}(z)}{z}\f$ is in this range!
1214  *
1215  * We approximate \f$f(z)=\mbox{erf}(z)/z\f$ with a rational minimax polynomial.
1216  * However, it turns out it is more efficient to approximate \f$f(z)/z\f$ and
1217  * then only use even powers. This is another minor optimization, since
1218  * we actually \a want \f$f(z)/z\f$, because it is going to be multiplied by
1219  * the vector between the two atoms to get the vectorial force. The
1220  * fastest flops are the ones we can avoid calculating!
1221  *
1222  * So, here's how it should be used:
1223  *
1224  * 1. Calculate \f$r^2\f$.
1225  * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=(\beta r)^2\f$.
1226  * 3. Evaluate this routine with \f$z^2\f$ as the argument.
1227  * 4. The return value is the expression:
1228  *
1229  * \f[
1230  *    \frac{2 \exp{-z^2}}{\sqrt{\pi} z^2}-\frac{\mbox{erf}(z)}{z^3}
1231  * \f]
1232  *
1233  * 5. Multiply the entire expression by \f$\beta^3\f$. This will get you
1234  *
1235  *  \f[
1236  *    \frac{2 \beta^3 \exp(-z^2)}{\sqrt{\pi} z^2} - \frac{\beta^3 \mbox{erf}(z)}{z^3}
1237  *  \f]
1238  *
1239  *    or, switching back to \f$r\f$ (since \f$z=r \beta\f$):
1240  *
1241  *  \f[
1242  *    \frac{2 \beta \exp(-r^2 \beta^2)}{\sqrt{\pi} r^2} - \frac{\mbox{erf}(r \beta)}{r^3}
1243  *  \f]
1244  *
1245  *    With a bit of math exercise you should be able to confirm that
1246  *    this is exactly
1247  *
1248  *  \f[
1249  *   \frac{\frac{d}{dr}\left( \frac{\mbox{erf}(\beta r)}{r} \right)}{r}
1250  *  \f]
1251  *
1252  * 6. Add the result to \f$r^{-3}\f$, multiply by the product of the charges,
1253  *    and you have your force (divided by \f$r\f$). A final multiplication
1254  *    with the vector connecting the two particles and you have your
1255  *    vectorial force to add to the particles.
1256  *
1257  * This approximation achieves an error slightly lower than 1e-6
1258  * in single precision and 1e-11 in double precision
1259  * for arguments smaller than 16 (\f$\beta r \leq 4 \f$);
1260  * when added to \f$1/r\f$ the error will be insignificant.
1261  * For \f$\beta r \geq 7206\f$ the return value can be inf or NaN.
1262  *
1263  */
1264 static gmx_inline gmx_simd_float_t gmx_simdcall
1265 gmx_simd_pmecorrF_f(gmx_simd_float_t z2)
1266 {
1267     const gmx_simd_float_t  FN6      = gmx_simd_set1_f(-1.7357322914161492954e-8f);
1268     const gmx_simd_float_t  FN5      = gmx_simd_set1_f(1.4703624142580877519e-6f);
1269     const gmx_simd_float_t  FN4      = gmx_simd_set1_f(-0.000053401640219807709149f);
1270     const gmx_simd_float_t  FN3      = gmx_simd_set1_f(0.0010054721316683106153f);
1271     const gmx_simd_float_t  FN2      = gmx_simd_set1_f(-0.019278317264888380590f);
1272     const gmx_simd_float_t  FN1      = gmx_simd_set1_f(0.069670166153766424023f);
1273     const gmx_simd_float_t  FN0      = gmx_simd_set1_f(-0.75225204789749321333f);
1274
1275     const gmx_simd_float_t  FD4      = gmx_simd_set1_f(0.0011193462567257629232f);
1276     const gmx_simd_float_t  FD3      = gmx_simd_set1_f(0.014866955030185295499f);
1277     const gmx_simd_float_t  FD2      = gmx_simd_set1_f(0.11583842382862377919f);
1278     const gmx_simd_float_t  FD1      = gmx_simd_set1_f(0.50736591960530292870f);
1279     const gmx_simd_float_t  FD0      = gmx_simd_set1_f(1.0f);
1280
1281     gmx_simd_float_t        z4;
1282     gmx_simd_float_t        polyFN0, polyFN1, polyFD0, polyFD1;
1283
1284     z4             = gmx_simd_mul_f(z2, z2);
1285
1286     polyFD0        = gmx_simd_fmadd_f(FD4, z4, FD2);
1287     polyFD1        = gmx_simd_fmadd_f(FD3, z4, FD1);
1288     polyFD0        = gmx_simd_fmadd_f(polyFD0, z4, FD0);
1289     polyFD0        = gmx_simd_fmadd_f(polyFD1, z2, polyFD0);
1290
1291     polyFD0        = gmx_simd_inv_f(polyFD0);
1292
1293     polyFN0        = gmx_simd_fmadd_f(FN6, z4, FN4);
1294     polyFN1        = gmx_simd_fmadd_f(FN5, z4, FN3);
1295     polyFN0        = gmx_simd_fmadd_f(polyFN0, z4, FN2);
1296     polyFN1        = gmx_simd_fmadd_f(polyFN1, z4, FN1);
1297     polyFN0        = gmx_simd_fmadd_f(polyFN0, z4, FN0);
1298     polyFN0        = gmx_simd_fmadd_f(polyFN1, z2, polyFN0);
1299
1300     return gmx_simd_mul_f(polyFN0, polyFD0);
1301 }
1302
1303
1304
1305 /*! \brief Calculate the potential correction due to PME analytically in SIMD float.
1306  *
1307  * You should normally call the real-precision routine \ref gmx_simd_pmecorrV_r.
1308  *
1309  * \param z2 \f$(r \beta)^2\f$ - see below for details.
1310  * \result Correction factor to coulomb potential - see below for details.
1311  *
1312  * See \ref gmx_simd_pmecorrF_f for details about the approximation.
1313  *
1314  * This routine calculates \f$\mbox{erf}(z)/z\f$, although you should provide \f$z^2\f$
1315  * as the input argument.
1316  *
1317  * Here's how it should be used:
1318  *
1319  * 1. Calculate \f$r^2\f$.
1320  * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=\beta^2*r^2\f$.
1321  * 3. Evaluate this routine with z^2 as the argument.
1322  * 4. The return value is the expression:
1323  *
1324  *  \f[
1325  *   \frac{\mbox{erf}(z)}{z}
1326  *  \f]
1327  *
1328  * 5. Multiply the entire expression by beta and switching back to \f$r\f$ (since \f$z=r \beta\f$):
1329  *
1330  *  \f[
1331  *    \frac{\mbox{erf}(r \beta)}{r}
1332  *  \f]
1333  *
1334  * 6. Subtract the result from \f$1/r\f$, multiply by the product of the charges,
1335  *    and you have your potential.
1336  *
1337  * This approximation achieves an error slightly lower than 1e-6
1338  * in single precision and 4e-11 in double precision
1339  * for arguments smaller than 16 (\f$ 0.15 \leq \beta r \leq 4 \f$);
1340  * for \f$ \beta r \leq 0.15\f$ the error can be twice as high;
1341  * when added to \f$1/r\f$ the error will be insignificant.
1342  * For \f$\beta r \geq 7142\f$ the return value can be inf or NaN.
1343  */
1344 static gmx_inline gmx_simd_float_t gmx_simdcall
1345 gmx_simd_pmecorrV_f(gmx_simd_float_t z2)
1346 {
1347     const gmx_simd_float_t  VN6      = gmx_simd_set1_f(1.9296833005951166339e-8f);
1348     const gmx_simd_float_t  VN5      = gmx_simd_set1_f(-1.4213390571557850962e-6f);
1349     const gmx_simd_float_t  VN4      = gmx_simd_set1_f(0.000041603292906656984871f);
1350     const gmx_simd_float_t  VN3      = gmx_simd_set1_f(-0.00013134036773265025626f);
1351     const gmx_simd_float_t  VN2      = gmx_simd_set1_f(0.038657983986041781264f);
1352     const gmx_simd_float_t  VN1      = gmx_simd_set1_f(0.11285044772717598220f);
1353     const gmx_simd_float_t  VN0      = gmx_simd_set1_f(1.1283802385263030286f);
1354
1355     const gmx_simd_float_t  VD3      = gmx_simd_set1_f(0.0066752224023576045451f);
1356     const gmx_simd_float_t  VD2      = gmx_simd_set1_f(0.078647795836373922256f);
1357     const gmx_simd_float_t  VD1      = gmx_simd_set1_f(0.43336185284710920150f);
1358     const gmx_simd_float_t  VD0      = gmx_simd_set1_f(1.0f);
1359
1360     gmx_simd_float_t        z4;
1361     gmx_simd_float_t        polyVN0, polyVN1, polyVD0, polyVD1;
1362
1363     z4             = gmx_simd_mul_f(z2, z2);
1364
1365     polyVD1        = gmx_simd_fmadd_f(VD3, z4, VD1);
1366     polyVD0        = gmx_simd_fmadd_f(VD2, z4, VD0);
1367     polyVD0        = gmx_simd_fmadd_f(polyVD1, z2, polyVD0);
1368
1369     polyVD0        = gmx_simd_inv_f(polyVD0);
1370
1371     polyVN0        = gmx_simd_fmadd_f(VN6, z4, VN4);
1372     polyVN1        = gmx_simd_fmadd_f(VN5, z4, VN3);
1373     polyVN0        = gmx_simd_fmadd_f(polyVN0, z4, VN2);
1374     polyVN1        = gmx_simd_fmadd_f(polyVN1, z4, VN1);
1375     polyVN0        = gmx_simd_fmadd_f(polyVN0, z4, VN0);
1376     polyVN0        = gmx_simd_fmadd_f(polyVN1, z2, polyVN0);
1377
1378     return gmx_simd_mul_f(polyVN0, polyVD0);
1379 }
1380 #endif
1381
1382 /*! \} */
1383
1384 #ifdef GMX_SIMD_HAVE_DOUBLE
1385
1386 /*! \name Double precision SIMD math functions
1387  *
1388  *  \note In most cases you should use the real-precision functions instead.
1389  *  \{
1390  */
1391
1392 /****************************************
1393  * DOUBLE PRECISION SIMD MATH FUNCTIONS *
1394  ****************************************/
1395
1396 /*! \brief SIMD utility function to sum a+b+c+d for SIMD doubles.
1397  *
1398  * \copydetails gmx_simd_sum4_f
1399  */
1400 static gmx_inline gmx_simd_double_t gmx_simdcall
1401 gmx_simd_sum4_d(gmx_simd_double_t a, gmx_simd_double_t b,
1402                 gmx_simd_double_t c, gmx_simd_double_t d)
1403 {
1404     return gmx_simd_add_d(gmx_simd_add_d(a, b), gmx_simd_add_d(c, d));
1405 }
1406
1407 /*! \brief Return -a if b is negative, SIMD double.
1408  *
1409  * You should normally call the real-precision routine \ref gmx_simd_xor_sign_r.
1410  *
1411  * \param a Values to set sign for
1412  * \param b Values used to set sign
1413  * \return if b is negative, the sign of a will be changed.
1414  *
1415  * This is equivalent to doing an xor operation on a with the sign bit of b,
1416  * with the exception that negative zero is not considered to be negative
1417  * on architectures where \ref GMX_SIMD_HAVE_LOGICAL is not set.
1418  */
1419 static gmx_inline gmx_simd_double_t gmx_simdcall
1420 gmx_simd_xor_sign_d(gmx_simd_double_t a, gmx_simd_double_t b)
1421 {
1422 #ifdef GMX_SIMD_HAVE_LOGICAL
1423     return gmx_simd_xor_d(a, gmx_simd_and_d(gmx_simd_set1_d(GMX_DOUBLE_NEGZERO), b));
1424 #else
1425     return gmx_simd_blendv_d(a, gmx_simd_fneg_d(a), gmx_simd_cmplt_d(b, gmx_simd_setzero_d()));
1426 #endif
1427 }
1428
1429 #ifndef gmx_simd_rsqrt_iter_d
1430 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD double.
1431  *
1432  * \copydetails gmx_simd_rsqrt_iter_f
1433  */
1434 static gmx_inline gmx_simd_double_t gmx_simdcall
1435 gmx_simd_rsqrt_iter_d(gmx_simd_double_t lu, gmx_simd_double_t x)
1436 {
1437 #ifdef GMX_SIMD_HAVE_FMA
1438     return gmx_simd_fmadd_d(gmx_simd_fnmadd_d(x, gmx_simd_mul_d(lu, lu), gmx_simd_set1_d(1.0)), gmx_simd_mul_d(lu, gmx_simd_set1_d(0.5)), lu);
1439 #else
1440     return gmx_simd_mul_d(gmx_simd_set1_d(0.5), gmx_simd_mul_d(gmx_simd_sub_d(gmx_simd_set1_d(3.0), gmx_simd_mul_d(gmx_simd_mul_d(lu, lu), x)), lu));
1441 #endif
1442 }
1443 #endif
1444
1445 /*! \brief Calculate 1/sqrt(x) for SIMD double
1446  *
1447  * \copydetails gmx_simd_invsqrt_f
1448  */
1449 static gmx_inline gmx_simd_double_t gmx_simdcall
1450 gmx_simd_invsqrt_d(gmx_simd_double_t x)
1451 {
1452     gmx_simd_double_t lu = gmx_simd_rsqrt_d(x);
1453 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1454     lu = gmx_simd_rsqrt_iter_d(lu, x);
1455 #endif
1456 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1457     lu = gmx_simd_rsqrt_iter_d(lu, x);
1458 #endif
1459 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1460     lu = gmx_simd_rsqrt_iter_d(lu, x);
1461 #endif
1462 #if (GMX_SIMD_RSQRT_BITS*8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1463     lu = gmx_simd_rsqrt_iter_d(lu, x);
1464 #endif
1465     return lu;
1466 }
1467
1468 /*! \brief Calculate 1/sqrt(x) for masked entries of SIMD double.
1469  *
1470  * \copydetails gmx_simd_invsqrt_maskfpe_f
1471  */
1472 static gmx_inline gmx_simd_double_t
1473 gmx_simd_invsqrt_maskfpe_d(gmx_simd_double_t x, gmx_simd_dbool_t gmx_unused m)
1474 {
1475 #ifdef NDEBUG
1476     return gmx_simd_invsqrt_d(x);
1477 #else
1478     return gmx_simd_invsqrt_d(gmx_simd_blendv_d(gmx_simd_set1_d(1.0), x, m));
1479 #endif
1480 }
1481
1482 /*! \brief Calculate 1/sqrt(x) for non-masked entries of SIMD double.
1483  *
1484  * \copydetails gmx_simd_invsqrt_notmaskfpe_f
1485  */
1486 static gmx_inline gmx_simd_double_t
1487 gmx_simd_invsqrt_notmaskfpe_d(gmx_simd_double_t x, gmx_simd_dbool_t gmx_unused m)
1488 {
1489 #ifdef NDEBUG
1490     return gmx_simd_invsqrt_d(x);
1491 #else
1492     return gmx_simd_invsqrt_d(gmx_simd_blendv_d(x, gmx_simd_set1_d(1.0), m));
1493 #endif
1494 }
1495
1496 /*! \brief Calculate 1/sqrt(x) for two SIMD doubles.
1497  *
1498  * \copydetails gmx_simd_invsqrt_pair_f
1499  */
1500 static gmx_inline void gmx_simdcall
1501 gmx_simd_invsqrt_pair_d(gmx_simd_double_t x0,    gmx_simd_double_t x1,
1502                         gmx_simd_double_t *out0, gmx_simd_double_t *out1)
1503 {
1504 #if (defined GMX_SIMD_HAVE_FLOAT) && (GMX_SIMD_FLOAT_WIDTH == 2*GMX_SIMD_DOUBLE_WIDTH) && (GMX_SIMD_RSQRT_BITS < 22)
1505     gmx_simd_float_t  xf  = gmx_simd_cvt_dd2f(x0, x1);
1506     gmx_simd_float_t  luf = gmx_simd_rsqrt_f(xf);
1507     gmx_simd_double_t lu0, lu1;
1508     /* Intermediate target is single - mantissa+1 bits */
1509 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
1510     luf = gmx_simd_rsqrt_iter_f(luf, xf);
1511 #endif
1512 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
1513     luf = gmx_simd_rsqrt_iter_f(luf, xf);
1514 #endif
1515 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
1516     luf = gmx_simd_rsqrt_iter_f(luf, xf);
1517 #endif
1518     gmx_simd_cvt_f2dd(luf, &lu0, &lu1);
1519     /* Last iteration(s) performed in double - if we had 22 bits, this gets us to 44 (~1e-15) */
1520 #if (GMX_SIMD_ACCURACY_BITS_SINGLE < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1521     lu0 = gmx_simd_rsqrt_iter_d(lu0, x0);
1522     lu1 = gmx_simd_rsqrt_iter_d(lu1, x1);
1523 #endif
1524 #if (GMX_SIMD_ACCURACY_BITS_SINGLE*2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1525     lu0 = gmx_simd_rsqrt_iter_d(lu0, x0);
1526     lu1 = gmx_simd_rsqrt_iter_d(lu1, x1);
1527 #endif
1528     *out0 = lu0;
1529     *out1 = lu1;
1530 #else
1531     *out0 = gmx_simd_invsqrt_d(x0);
1532     *out1 = gmx_simd_invsqrt_d(x1);
1533 #endif
1534 }
1535
1536 #ifndef gmx_simd_rcp_iter_d
1537 /*! \brief Perform one Newton-Raphson iteration to improve 1/x for SIMD double.
1538  *
1539  * \copydetails gmx_simd_rcp_iter_f
1540  */
1541 static gmx_inline gmx_simd_double_t gmx_simdcall
1542 gmx_simd_rcp_iter_d(gmx_simd_double_t lu, gmx_simd_double_t x)
1543 {
1544     return gmx_simd_mul_d(lu, gmx_simd_fnmadd_d(lu, x, gmx_simd_set1_d(2.0)));
1545 }
1546 #endif
1547
1548 /*! \brief Calculate 1/x for SIMD double.
1549  *
1550  * \copydetails gmx_simd_inv_f
1551  */
1552 static gmx_inline gmx_simd_double_t gmx_simdcall
1553 gmx_simd_inv_d(gmx_simd_double_t x)
1554 {
1555     gmx_simd_double_t lu = gmx_simd_rcp_d(x);
1556 #if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1557     lu = gmx_simd_rcp_iter_d(lu, x);
1558 #endif
1559 #if (GMX_SIMD_RCP_BITS*2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1560     lu = gmx_simd_rcp_iter_d(lu, x);
1561 #endif
1562 #if (GMX_SIMD_RCP_BITS*4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1563     lu = gmx_simd_rcp_iter_d(lu, x);
1564 #endif
1565 #if (GMX_SIMD_RCP_BITS*8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1566     lu = gmx_simd_rcp_iter_d(lu, x);
1567 #endif
1568     return lu;
1569 }
1570
1571 /*! \brief Calculate 1/x for masked entries of SIMD double.
1572  *
1573  * \copydetails gmx_simd_inv_maskfpe_f
1574  */
1575 static gmx_inline gmx_simd_double_t
1576 gmx_simd_inv_maskfpe_d(gmx_simd_double_t x, gmx_simd_dbool_t gmx_unused m)
1577 {
1578 #ifdef NDEBUG
1579     return gmx_simd_inv_d(x);
1580 #else
1581     return gmx_simd_inv_d(gmx_simd_blendv_d(gmx_simd_set1_d(1.0), x, m));
1582 #endif
1583 }
1584
1585 /*! \brief Calculate 1/x for non-masked entries of SIMD double.
1586  *
1587  * \copydetails gmx_simd_inv_notmaskfpe_f
1588  */
1589 static gmx_inline gmx_simd_double_t
1590 gmx_simd_inv_notmaskfpe_d(gmx_simd_double_t x, gmx_simd_dbool_t gmx_unused m)
1591 {
1592 #ifdef NDEBUG
1593     return gmx_simd_inv_d(x);
1594 #else
1595     return gmx_simd_inv_d(gmx_simd_blendv_d(x, gmx_simd_set1_d(1.0), m));
1596 #endif
1597 }
1598
1599 /*! \brief Calculate sqrt(x) correctly for SIMD doubles, including argument 0.0.
1600  *
1601  * \copydetails gmx_simd_sqrt_f
1602  */
1603 static gmx_inline gmx_simd_double_t gmx_simdcall
1604 gmx_simd_sqrt_d(gmx_simd_double_t x)
1605 {
1606     gmx_simd_dbool_t   mask;
1607     gmx_simd_double_t  res;
1608
1609     mask = gmx_simd_cmpeq_d(x, gmx_simd_setzero_d());
1610     res  = gmx_simd_blendnotzero_d(gmx_simd_invsqrt_notmaskfpe_d(x, mask), mask);
1611     return gmx_simd_mul_d(res, x);
1612 }
1613
1614 /*! \brief SIMD double log(x). This is the natural logarithm.
1615  *
1616  * \copydetails gmx_simd_log_f
1617  */
1618 static gmx_inline gmx_simd_double_t gmx_simdcall
1619 gmx_simd_log_d(gmx_simd_double_t x)
1620 {
1621     const gmx_simd_double_t  half       = gmx_simd_set1_d(0.5);
1622     const gmx_simd_double_t  one        = gmx_simd_set1_d(1.0);
1623     const gmx_simd_double_t  sqrt2      = gmx_simd_set1_d(sqrt(2.0));
1624     const gmx_simd_double_t  corr       = gmx_simd_set1_d(0.693147180559945286226764);
1625     const gmx_simd_double_t  CL15       = gmx_simd_set1_d(0.148197055177935105296783);
1626     const gmx_simd_double_t  CL13       = gmx_simd_set1_d(0.153108178020442575739679);
1627     const gmx_simd_double_t  CL11       = gmx_simd_set1_d(0.181837339521549679055568);
1628     const gmx_simd_double_t  CL9        = gmx_simd_set1_d(0.22222194152736701733275);
1629     const gmx_simd_double_t  CL7        = gmx_simd_set1_d(0.285714288030134544449368);
1630     const gmx_simd_double_t  CL5        = gmx_simd_set1_d(0.399999999989941956712869);
1631     const gmx_simd_double_t  CL3        = gmx_simd_set1_d(0.666666666666685503450651);
1632     const gmx_simd_double_t  CL1        = gmx_simd_set1_d(2.0);
1633     gmx_simd_double_t        fexp, x2, p;
1634     gmx_simd_dbool_t         mask;
1635
1636     fexp  = gmx_simd_get_exponent_d(x);
1637     x     = gmx_simd_get_mantissa_d(x);
1638
1639     mask  = gmx_simd_cmplt_d(sqrt2, x);
1640     /* Adjust to non-IEEE format for x>sqrt(2): exponent += 1, mantissa *= 0.5 */
1641     fexp  = gmx_simd_add_d(fexp, gmx_simd_blendzero_d(one, mask));
1642     x     = gmx_simd_mul_d(x, gmx_simd_blendv_d(one, half, mask));
1643
1644     x     = gmx_simd_mul_d( gmx_simd_sub_d(x, one), gmx_simd_inv_d( gmx_simd_add_d(x, one) ) );
1645     x2    = gmx_simd_mul_d(x, x);
1646
1647     p     = gmx_simd_fmadd_d(CL15, x2, CL13);
1648     p     = gmx_simd_fmadd_d(p, x2, CL11);
1649     p     = gmx_simd_fmadd_d(p, x2, CL9);
1650     p     = gmx_simd_fmadd_d(p, x2, CL7);
1651     p     = gmx_simd_fmadd_d(p, x2, CL5);
1652     p     = gmx_simd_fmadd_d(p, x2, CL3);
1653     p     = gmx_simd_fmadd_d(p, x2, CL1);
1654     p     = gmx_simd_fmadd_d(p, x, gmx_simd_mul_d(corr, fexp));
1655
1656     return p;
1657 }
1658
1659 /*! \brief SIMD double 2^x.
1660  *
1661  * \copydetails gmx_simd_exp2_f
1662  */
1663 static gmx_inline gmx_simd_double_t gmx_simdcall
1664 gmx_simd_exp2_d(gmx_simd_double_t x)
1665 {
1666     const gmx_simd_double_t  arglimit      = gmx_simd_set1_d(1022.0);
1667     const gmx_simd_double_t  CE11          = gmx_simd_set1_d(4.435280790452730022081181e-10);
1668     const gmx_simd_double_t  CE10          = gmx_simd_set1_d(7.074105630863314448024247e-09);
1669     const gmx_simd_double_t  CE9           = gmx_simd_set1_d(1.017819803432096698472621e-07);
1670     const gmx_simd_double_t  CE8           = gmx_simd_set1_d(1.321543308956718799557863e-06);
1671     const gmx_simd_double_t  CE7           = gmx_simd_set1_d(0.00001525273348995851746990884);
1672     const gmx_simd_double_t  CE6           = gmx_simd_set1_d(0.0001540353046251466849082632);
1673     const gmx_simd_double_t  CE5           = gmx_simd_set1_d(0.001333355814678995257307880);
1674     const gmx_simd_double_t  CE4           = gmx_simd_set1_d(0.009618129107588335039176502);
1675     const gmx_simd_double_t  CE3           = gmx_simd_set1_d(0.05550410866481992147457793);
1676     const gmx_simd_double_t  CE2           = gmx_simd_set1_d(0.2402265069591015620470894);
1677     const gmx_simd_double_t  CE1           = gmx_simd_set1_d(0.6931471805599453304615075);
1678     const gmx_simd_double_t  one           = gmx_simd_set1_d(1.0);
1679     gmx_simd_double_t        fexppart;
1680     gmx_simd_double_t        intpart;
1681     gmx_simd_double_t        p;
1682     gmx_simd_dbool_t         valuemask;
1683
1684     fexppart  = gmx_simd_set_exponent_d(x);  /* rounds to nearest int internally */
1685     intpart   = gmx_simd_round_d(x);         /* use same rounding mode here */
1686     valuemask = gmx_simd_cmple_d(gmx_simd_fabs_d(x), arglimit);
1687     fexppart  = gmx_simd_blendzero_d(fexppart, valuemask);
1688     x         = gmx_simd_sub_d(x, intpart);
1689
1690     p         = gmx_simd_fmadd_d(CE11, x, CE10);
1691     p         = gmx_simd_fmadd_d(p, x, CE9);
1692     p         = gmx_simd_fmadd_d(p, x, CE8);
1693     p         = gmx_simd_fmadd_d(p, x, CE7);
1694     p         = gmx_simd_fmadd_d(p, x, CE6);
1695     p         = gmx_simd_fmadd_d(p, x, CE5);
1696     p         = gmx_simd_fmadd_d(p, x, CE4);
1697     p         = gmx_simd_fmadd_d(p, x, CE3);
1698     p         = gmx_simd_fmadd_d(p, x, CE2);
1699     p         = gmx_simd_fmadd_d(p, x, CE1);
1700     p         = gmx_simd_fmadd_d(p, x, one);
1701     x         = gmx_simd_mul_d(p, fexppart);
1702     return x;
1703 }
1704
1705 /*! \brief SIMD double exp(x).
1706  *
1707  * \copydetails gmx_simd_exp_f
1708  */
1709 static gmx_inline gmx_simd_double_t gmx_simdcall
1710 gmx_simd_exp_d(gmx_simd_double_t x)
1711 {
1712     const gmx_simd_double_t  argscale      = gmx_simd_set1_d(1.44269504088896340735992468100);
1713     const gmx_simd_double_t  arglimit      = gmx_simd_set1_d(1022.0);
1714     const gmx_simd_double_t  invargscale0  = gmx_simd_set1_d(-0.69314718055966295651160180568695068359375);
1715     const gmx_simd_double_t  invargscale1  = gmx_simd_set1_d(-2.8235290563031577122588448175013436025525412068e-13);
1716     const gmx_simd_double_t  CE12          = gmx_simd_set1_d(2.078375306791423699350304e-09);
1717     const gmx_simd_double_t  CE11          = gmx_simd_set1_d(2.518173854179933105218635e-08);
1718     const gmx_simd_double_t  CE10          = gmx_simd_set1_d(2.755842049600488770111608e-07);
1719     const gmx_simd_double_t  CE9           = gmx_simd_set1_d(2.755691815216689746619849e-06);
1720     const gmx_simd_double_t  CE8           = gmx_simd_set1_d(2.480158383706245033920920e-05);
1721     const gmx_simd_double_t  CE7           = gmx_simd_set1_d(0.0001984127043518048611841321);
1722     const gmx_simd_double_t  CE6           = gmx_simd_set1_d(0.001388888889360258341755930);
1723     const gmx_simd_double_t  CE5           = gmx_simd_set1_d(0.008333333332907368102819109);
1724     const gmx_simd_double_t  CE4           = gmx_simd_set1_d(0.04166666666663836745814631);
1725     const gmx_simd_double_t  CE3           = gmx_simd_set1_d(0.1666666666666796929434570);
1726     const gmx_simd_double_t  CE2           = gmx_simd_set1_d(0.5);
1727     const gmx_simd_double_t  one           = gmx_simd_set1_d(1.0);
1728     gmx_simd_double_t        fexppart;
1729     gmx_simd_double_t        intpart;
1730     gmx_simd_double_t        y, p;
1731     gmx_simd_dbool_t         valuemask;
1732
1733     y         = gmx_simd_mul_d(x, argscale);
1734     fexppart  = gmx_simd_set_exponent_d(y);  /* rounds to nearest int internally */
1735     intpart   = gmx_simd_round_d(y);         /* use same rounding mode here */
1736     valuemask = gmx_simd_cmple_d(gmx_simd_fabs_d(y), arglimit);
1737     fexppart  = gmx_simd_blendzero_d(fexppart, valuemask);
1738
1739     /* Extended precision arithmetics */
1740     x         = gmx_simd_fmadd_d(invargscale0, intpart, x);
1741     x         = gmx_simd_fmadd_d(invargscale1, intpart, x);
1742
1743     p         = gmx_simd_fmadd_d(CE12, x, CE11);
1744     p         = gmx_simd_fmadd_d(p, x, CE10);
1745     p         = gmx_simd_fmadd_d(p, x, CE9);
1746     p         = gmx_simd_fmadd_d(p, x, CE8);
1747     p         = gmx_simd_fmadd_d(p, x, CE7);
1748     p         = gmx_simd_fmadd_d(p, x, CE6);
1749     p         = gmx_simd_fmadd_d(p, x, CE5);
1750     p         = gmx_simd_fmadd_d(p, x, CE4);
1751     p         = gmx_simd_fmadd_d(p, x, CE3);
1752     p         = gmx_simd_fmadd_d(p, x, CE2);
1753     p         = gmx_simd_fmadd_d(p, gmx_simd_mul_d(x, x), gmx_simd_add_d(x, one));
1754     x         = gmx_simd_mul_d(p, fexppart);
1755     return x;
1756 }
1757
1758 /*! \brief SIMD double erf(x).
1759  *
1760  * \copydetails gmx_simd_erf_f
1761  */
1762 static gmx_inline gmx_simd_double_t gmx_simdcall
1763 gmx_simd_erf_d(gmx_simd_double_t x)
1764 {
1765     /* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */
1766     const gmx_simd_double_t CAP4      = gmx_simd_set1_d(-0.431780540597889301512e-4);
1767     const gmx_simd_double_t CAP3      = gmx_simd_set1_d(-0.00578562306260059236059);
1768     const gmx_simd_double_t CAP2      = gmx_simd_set1_d(-0.028593586920219752446);
1769     const gmx_simd_double_t CAP1      = gmx_simd_set1_d(-0.315924962948621698209);
1770     const gmx_simd_double_t CAP0      = gmx_simd_set1_d(0.14952975608477029151);
1771
1772     const gmx_simd_double_t CAQ5      = gmx_simd_set1_d(-0.374089300177174709737e-5);
1773     const gmx_simd_double_t CAQ4      = gmx_simd_set1_d(0.00015126584532155383535);
1774     const gmx_simd_double_t CAQ3      = gmx_simd_set1_d(0.00536692680669480725423);
1775     const gmx_simd_double_t CAQ2      = gmx_simd_set1_d(0.0668686825594046122636);
1776     const gmx_simd_double_t CAQ1      = gmx_simd_set1_d(0.402604990869284362773);
1777     /* CAQ0 == 1.0 */
1778     const gmx_simd_double_t CAoffset  = gmx_simd_set1_d(0.9788494110107421875);
1779
1780     /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5] */
1781     const gmx_simd_double_t CBP6      = gmx_simd_set1_d(2.49650423685462752497647637088e-10);
1782     const gmx_simd_double_t CBP5      = gmx_simd_set1_d(0.00119770193298159629350136085658);
1783     const gmx_simd_double_t CBP4      = gmx_simd_set1_d(0.0164944422378370965881008942733);
1784     const gmx_simd_double_t CBP3      = gmx_simd_set1_d(0.0984581468691775932063932439252);
1785     const gmx_simd_double_t CBP2      = gmx_simd_set1_d(0.317364595806937763843589437418);
1786     const gmx_simd_double_t CBP1      = gmx_simd_set1_d(0.554167062641455850932670067075);
1787     const gmx_simd_double_t CBP0      = gmx_simd_set1_d(0.427583576155807163756925301060);
1788     const gmx_simd_double_t CBQ7      = gmx_simd_set1_d(0.00212288829699830145976198384930);
1789     const gmx_simd_double_t CBQ6      = gmx_simd_set1_d(0.0334810979522685300554606393425);
1790     const gmx_simd_double_t CBQ5      = gmx_simd_set1_d(0.2361713785181450957579508850717);
1791     const gmx_simd_double_t CBQ4      = gmx_simd_set1_d(0.955364736493055670530981883072);
1792     const gmx_simd_double_t CBQ3      = gmx_simd_set1_d(2.36815675631420037315349279199);
1793     const gmx_simd_double_t CBQ2      = gmx_simd_set1_d(3.55261649184083035537184223542);
1794     const gmx_simd_double_t CBQ1      = gmx_simd_set1_d(2.93501136050160872574376997993);
1795     /* CBQ0 == 1.0 */
1796
1797     /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf] */
1798     const gmx_simd_double_t CCP6      = gmx_simd_set1_d(-2.8175401114513378771);
1799     const gmx_simd_double_t CCP5      = gmx_simd_set1_d(-3.22729451764143718517);
1800     const gmx_simd_double_t CCP4      = gmx_simd_set1_d(-2.5518551727311523996);
1801     const gmx_simd_double_t CCP3      = gmx_simd_set1_d(-0.687717681153649930619);
1802     const gmx_simd_double_t CCP2      = gmx_simd_set1_d(-0.212652252872804219852);
1803     const gmx_simd_double_t CCP1      = gmx_simd_set1_d(0.0175389834052493308818);
1804     const gmx_simd_double_t CCP0      = gmx_simd_set1_d(0.00628057170626964891937);
1805
1806     const gmx_simd_double_t CCQ6      = gmx_simd_set1_d(5.48409182238641741584);
1807     const gmx_simd_double_t CCQ5      = gmx_simd_set1_d(13.5064170191802889145);
1808     const gmx_simd_double_t CCQ4      = gmx_simd_set1_d(22.9367376522880577224);
1809     const gmx_simd_double_t CCQ3      = gmx_simd_set1_d(15.930646027911794143);
1810     const gmx_simd_double_t CCQ2      = gmx_simd_set1_d(11.0567237927800161565);
1811     const gmx_simd_double_t CCQ1      = gmx_simd_set1_d(2.79257750980575282228);
1812     /* CCQ0 == 1.0 */
1813     const gmx_simd_double_t CCoffset  = gmx_simd_set1_d(0.5579090118408203125);
1814
1815     const gmx_simd_double_t one       = gmx_simd_set1_d(1.0);
1816     const gmx_simd_double_t two       = gmx_simd_set1_d(2.0);
1817
1818     gmx_simd_double_t       xabs, x2, x4, t, t2, w, w2;
1819     gmx_simd_double_t       PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
1820     gmx_simd_double_t       PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
1821     gmx_simd_double_t       PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
1822     gmx_simd_double_t       res_erf, res_erfcB, res_erfcC, res_erfc, res;
1823     gmx_simd_double_t       expmx2;
1824     gmx_simd_dbool_t        mask, mask_erf;
1825
1826     /* Calculate erf() */
1827     xabs     = gmx_simd_fabs_d(x);
1828     mask_erf = gmx_simd_cmplt_d(xabs, one);
1829     x2       = gmx_simd_mul_d(x, x);
1830     x4       = gmx_simd_mul_d(x2, x2);
1831
1832     PolyAP0  = gmx_simd_mul_d(CAP4, x4);
1833     PolyAP1  = gmx_simd_mul_d(CAP3, x4);
1834     PolyAP0  = gmx_simd_add_d(PolyAP0, CAP2);
1835     PolyAP1  = gmx_simd_add_d(PolyAP1, CAP1);
1836     PolyAP0  = gmx_simd_mul_d(PolyAP0, x4);
1837     PolyAP1  = gmx_simd_mul_d(PolyAP1, x2);
1838     PolyAP0  = gmx_simd_add_d(PolyAP0, CAP0);
1839     PolyAP0  = gmx_simd_add_d(PolyAP0, PolyAP1);
1840
1841     PolyAQ1  = gmx_simd_mul_d(CAQ5, x4);
1842     PolyAQ0  = gmx_simd_mul_d(CAQ4, x4);
1843     PolyAQ1  = gmx_simd_add_d(PolyAQ1, CAQ3);
1844     PolyAQ0  = gmx_simd_add_d(PolyAQ0, CAQ2);
1845     PolyAQ1  = gmx_simd_mul_d(PolyAQ1, x4);
1846     PolyAQ0  = gmx_simd_mul_d(PolyAQ0, x4);
1847     PolyAQ1  = gmx_simd_add_d(PolyAQ1, CAQ1);
1848     PolyAQ0  = gmx_simd_add_d(PolyAQ0, one);
1849     PolyAQ1  = gmx_simd_mul_d(PolyAQ1, x2);
1850     PolyAQ0  = gmx_simd_add_d(PolyAQ0, PolyAQ1);
1851
1852     res_erf  = gmx_simd_mul_d(PolyAP0, gmx_simd_inv_maskfpe_d(PolyAQ0, mask_erf));
1853     res_erf  = gmx_simd_add_d(CAoffset, res_erf);
1854     res_erf  = gmx_simd_mul_d(x, res_erf);
1855
1856     /* Calculate erfc() in range [1,4.5] */
1857     t       = gmx_simd_sub_d(xabs, one);
1858     t2      = gmx_simd_mul_d(t, t);
1859
1860     PolyBP0  = gmx_simd_mul_d(CBP6, t2);
1861     PolyBP1  = gmx_simd_mul_d(CBP5, t2);
1862     PolyBP0  = gmx_simd_add_d(PolyBP0, CBP4);
1863     PolyBP1  = gmx_simd_add_d(PolyBP1, CBP3);
1864     PolyBP0  = gmx_simd_mul_d(PolyBP0, t2);
1865     PolyBP1  = gmx_simd_mul_d(PolyBP1, t2);
1866     PolyBP0  = gmx_simd_add_d(PolyBP0, CBP2);
1867     PolyBP1  = gmx_simd_add_d(PolyBP1, CBP1);
1868     PolyBP0  = gmx_simd_mul_d(PolyBP0, t2);
1869     PolyBP1  = gmx_simd_mul_d(PolyBP1, t);
1870     PolyBP0  = gmx_simd_add_d(PolyBP0, CBP0);
1871     PolyBP0  = gmx_simd_add_d(PolyBP0, PolyBP1);
1872
1873     PolyBQ1 = gmx_simd_mul_d(CBQ7, t2);
1874     PolyBQ0 = gmx_simd_mul_d(CBQ6, t2);
1875     PolyBQ1 = gmx_simd_add_d(PolyBQ1, CBQ5);
1876     PolyBQ0 = gmx_simd_add_d(PolyBQ0, CBQ4);
1877     PolyBQ1 = gmx_simd_mul_d(PolyBQ1, t2);
1878     PolyBQ0 = gmx_simd_mul_d(PolyBQ0, t2);
1879     PolyBQ1 = gmx_simd_add_d(PolyBQ1, CBQ3);
1880     PolyBQ0 = gmx_simd_add_d(PolyBQ0, CBQ2);
1881     PolyBQ1 = gmx_simd_mul_d(PolyBQ1, t2);
1882     PolyBQ0 = gmx_simd_mul_d(PolyBQ0, t2);
1883     PolyBQ1 = gmx_simd_add_d(PolyBQ1, CBQ1);
1884     PolyBQ0 = gmx_simd_add_d(PolyBQ0, one);
1885     PolyBQ1 = gmx_simd_mul_d(PolyBQ1, t);
1886     PolyBQ0 = gmx_simd_add_d(PolyBQ0, PolyBQ1);
1887
1888     res_erfcB = gmx_simd_mul_d(PolyBP0, gmx_simd_inv_notmaskfpe_d(PolyBQ0, mask_erf));
1889
1890     res_erfcB = gmx_simd_mul_d(res_erfcB, xabs);
1891
1892     /* Calculate erfc() in range [4.5,inf] */
1893     w       = gmx_simd_inv_notmaskfpe_d(xabs, mask_erf);
1894     w2      = gmx_simd_mul_d(w, w);
1895
1896     PolyCP0  = gmx_simd_mul_d(CCP6, w2);
1897     PolyCP1  = gmx_simd_mul_d(CCP5, w2);
1898     PolyCP0  = gmx_simd_add_d(PolyCP0, CCP4);
1899     PolyCP1  = gmx_simd_add_d(PolyCP1, CCP3);
1900     PolyCP0  = gmx_simd_mul_d(PolyCP0, w2);
1901     PolyCP1  = gmx_simd_mul_d(PolyCP1, w2);
1902     PolyCP0  = gmx_simd_add_d(PolyCP0, CCP2);
1903     PolyCP1  = gmx_simd_add_d(PolyCP1, CCP1);
1904     PolyCP0  = gmx_simd_mul_d(PolyCP0, w2);
1905     PolyCP1  = gmx_simd_mul_d(PolyCP1, w);
1906     PolyCP0  = gmx_simd_add_d(PolyCP0, CCP0);
1907     PolyCP0  = gmx_simd_add_d(PolyCP0, PolyCP1);
1908
1909     PolyCQ0  = gmx_simd_mul_d(CCQ6, w2);
1910     PolyCQ1  = gmx_simd_mul_d(CCQ5, w2);
1911     PolyCQ0  = gmx_simd_add_d(PolyCQ0, CCQ4);
1912     PolyCQ1  = gmx_simd_add_d(PolyCQ1, CCQ3);
1913     PolyCQ0  = gmx_simd_mul_d(PolyCQ0, w2);
1914     PolyCQ1  = gmx_simd_mul_d(PolyCQ1, w2);
1915     PolyCQ0  = gmx_simd_add_d(PolyCQ0, CCQ2);
1916     PolyCQ1  = gmx_simd_add_d(PolyCQ1, CCQ1);
1917     PolyCQ0  = gmx_simd_mul_d(PolyCQ0, w2);
1918     PolyCQ1  = gmx_simd_mul_d(PolyCQ1, w);
1919     PolyCQ0  = gmx_simd_add_d(PolyCQ0, one);
1920     PolyCQ0  = gmx_simd_add_d(PolyCQ0, PolyCQ1);
1921
1922     expmx2   = gmx_simd_exp_d( gmx_simd_fneg_d(x2) );
1923
1924     res_erfcC = gmx_simd_mul_d(PolyCP0, gmx_simd_inv_notmaskfpe_d(PolyCQ0, mask_erf));
1925     res_erfcC = gmx_simd_add_d(res_erfcC, CCoffset);
1926     res_erfcC = gmx_simd_mul_d(res_erfcC, w);
1927
1928     mask     = gmx_simd_cmplt_d(gmx_simd_set1_d(4.5), xabs);
1929     res_erfc = gmx_simd_blendv_d(res_erfcB, res_erfcC, mask);
1930
1931     res_erfc = gmx_simd_mul_d(res_erfc, expmx2);
1932
1933     /* erfc(x<0) = 2-erfc(|x|) */
1934     mask     = gmx_simd_cmplt_d(x, gmx_simd_setzero_d());
1935     res_erfc = gmx_simd_blendv_d(res_erfc, gmx_simd_sub_d(two, res_erfc), mask);
1936
1937     /* Select erf() or erfc() */
1938     res  = gmx_simd_blendv_d(gmx_simd_sub_d(one, res_erfc), res_erf, mask_erf);
1939
1940     return res;
1941 }
1942
1943 /*! \brief SIMD double erfc(x).
1944  *
1945  * \copydetails gmx_simd_erfc_f
1946  */
1947 static gmx_inline gmx_simd_double_t gmx_simdcall
1948 gmx_simd_erfc_d(gmx_simd_double_t x)
1949 {
1950     /* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */
1951     const gmx_simd_double_t CAP4      = gmx_simd_set1_d(-0.431780540597889301512e-4);
1952     const gmx_simd_double_t CAP3      = gmx_simd_set1_d(-0.00578562306260059236059);
1953     const gmx_simd_double_t CAP2      = gmx_simd_set1_d(-0.028593586920219752446);
1954     const gmx_simd_double_t CAP1      = gmx_simd_set1_d(-0.315924962948621698209);
1955     const gmx_simd_double_t CAP0      = gmx_simd_set1_d(0.14952975608477029151);
1956
1957     const gmx_simd_double_t CAQ5      = gmx_simd_set1_d(-0.374089300177174709737e-5);
1958     const gmx_simd_double_t CAQ4      = gmx_simd_set1_d(0.00015126584532155383535);
1959     const gmx_simd_double_t CAQ3      = gmx_simd_set1_d(0.00536692680669480725423);
1960     const gmx_simd_double_t CAQ2      = gmx_simd_set1_d(0.0668686825594046122636);
1961     const gmx_simd_double_t CAQ1      = gmx_simd_set1_d(0.402604990869284362773);
1962     /* CAQ0 == 1.0 */
1963     const gmx_simd_double_t CAoffset  = gmx_simd_set1_d(0.9788494110107421875);
1964
1965     /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5] */
1966     const gmx_simd_double_t CBP6      = gmx_simd_set1_d(2.49650423685462752497647637088e-10);
1967     const gmx_simd_double_t CBP5      = gmx_simd_set1_d(0.00119770193298159629350136085658);
1968     const gmx_simd_double_t CBP4      = gmx_simd_set1_d(0.0164944422378370965881008942733);
1969     const gmx_simd_double_t CBP3      = gmx_simd_set1_d(0.0984581468691775932063932439252);
1970     const gmx_simd_double_t CBP2      = gmx_simd_set1_d(0.317364595806937763843589437418);
1971     const gmx_simd_double_t CBP1      = gmx_simd_set1_d(0.554167062641455850932670067075);
1972     const gmx_simd_double_t CBP0      = gmx_simd_set1_d(0.427583576155807163756925301060);
1973     const gmx_simd_double_t CBQ7      = gmx_simd_set1_d(0.00212288829699830145976198384930);
1974     const gmx_simd_double_t CBQ6      = gmx_simd_set1_d(0.0334810979522685300554606393425);
1975     const gmx_simd_double_t CBQ5      = gmx_simd_set1_d(0.2361713785181450957579508850717);
1976     const gmx_simd_double_t CBQ4      = gmx_simd_set1_d(0.955364736493055670530981883072);
1977     const gmx_simd_double_t CBQ3      = gmx_simd_set1_d(2.36815675631420037315349279199);
1978     const gmx_simd_double_t CBQ2      = gmx_simd_set1_d(3.55261649184083035537184223542);
1979     const gmx_simd_double_t CBQ1      = gmx_simd_set1_d(2.93501136050160872574376997993);
1980     /* CBQ0 == 1.0 */
1981
1982     /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf] */
1983     const gmx_simd_double_t CCP6      = gmx_simd_set1_d(-2.8175401114513378771);
1984     const gmx_simd_double_t CCP5      = gmx_simd_set1_d(-3.22729451764143718517);
1985     const gmx_simd_double_t CCP4      = gmx_simd_set1_d(-2.5518551727311523996);
1986     const gmx_simd_double_t CCP3      = gmx_simd_set1_d(-0.687717681153649930619);
1987     const gmx_simd_double_t CCP2      = gmx_simd_set1_d(-0.212652252872804219852);
1988     const gmx_simd_double_t CCP1      = gmx_simd_set1_d(0.0175389834052493308818);
1989     const gmx_simd_double_t CCP0      = gmx_simd_set1_d(0.00628057170626964891937);
1990
1991     const gmx_simd_double_t CCQ6      = gmx_simd_set1_d(5.48409182238641741584);
1992     const gmx_simd_double_t CCQ5      = gmx_simd_set1_d(13.5064170191802889145);
1993     const gmx_simd_double_t CCQ4      = gmx_simd_set1_d(22.9367376522880577224);
1994     const gmx_simd_double_t CCQ3      = gmx_simd_set1_d(15.930646027911794143);
1995     const gmx_simd_double_t CCQ2      = gmx_simd_set1_d(11.0567237927800161565);
1996     const gmx_simd_double_t CCQ1      = gmx_simd_set1_d(2.79257750980575282228);
1997     /* CCQ0 == 1.0 */
1998     const gmx_simd_double_t CCoffset  = gmx_simd_set1_d(0.5579090118408203125);
1999
2000     const gmx_simd_double_t one       = gmx_simd_set1_d(1.0);
2001     const gmx_simd_double_t two       = gmx_simd_set1_d(2.0);
2002
2003     gmx_simd_double_t       xabs, x2, x4, t, t2, w, w2;
2004     gmx_simd_double_t       PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
2005     gmx_simd_double_t       PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
2006     gmx_simd_double_t       PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
2007     gmx_simd_double_t       res_erf, res_erfcB, res_erfcC, res_erfc, res;
2008     gmx_simd_double_t       expmx2;
2009     gmx_simd_dbool_t        mask, mask_erf;
2010
2011     /* Calculate erf() */
2012     xabs     = gmx_simd_fabs_d(x);
2013     mask_erf = gmx_simd_cmplt_d(xabs, one);
2014     x2       = gmx_simd_mul_d(x, x);
2015     x4       = gmx_simd_mul_d(x2, x2);
2016
2017     PolyAP0  = gmx_simd_mul_d(CAP4, x4);
2018     PolyAP1  = gmx_simd_mul_d(CAP3, x4);
2019     PolyAP0  = gmx_simd_add_d(PolyAP0, CAP2);
2020     PolyAP1  = gmx_simd_add_d(PolyAP1, CAP1);
2021     PolyAP0  = gmx_simd_mul_d(PolyAP0, x4);
2022     PolyAP1  = gmx_simd_mul_d(PolyAP1, x2);
2023     PolyAP0  = gmx_simd_add_d(PolyAP0, CAP0);
2024     PolyAP0  = gmx_simd_add_d(PolyAP0, PolyAP1);
2025
2026     PolyAQ1  = gmx_simd_mul_d(CAQ5, x4);
2027     PolyAQ0  = gmx_simd_mul_d(CAQ4, x4);
2028     PolyAQ1  = gmx_simd_add_d(PolyAQ1, CAQ3);
2029     PolyAQ0  = gmx_simd_add_d(PolyAQ0, CAQ2);
2030     PolyAQ1  = gmx_simd_mul_d(PolyAQ1, x4);
2031     PolyAQ0  = gmx_simd_mul_d(PolyAQ0, x4);
2032     PolyAQ1  = gmx_simd_add_d(PolyAQ1, CAQ1);
2033     PolyAQ0  = gmx_simd_add_d(PolyAQ0, one);
2034     PolyAQ1  = gmx_simd_mul_d(PolyAQ1, x2);
2035     PolyAQ0  = gmx_simd_add_d(PolyAQ0, PolyAQ1);
2036
2037     res_erf  = gmx_simd_mul_d(PolyAP0, gmx_simd_inv_maskfpe_d(PolyAQ0, mask_erf));
2038     res_erf  = gmx_simd_add_d(CAoffset, res_erf);
2039     res_erf  = gmx_simd_mul_d(x, res_erf);
2040
2041     /* Calculate erfc() in range [1,4.5] */
2042     t       = gmx_simd_sub_d(xabs, one);
2043     t2      = gmx_simd_mul_d(t, t);
2044
2045     PolyBP0  = gmx_simd_mul_d(CBP6, t2);
2046     PolyBP1  = gmx_simd_mul_d(CBP5, t2);
2047     PolyBP0  = gmx_simd_add_d(PolyBP0, CBP4);
2048     PolyBP1  = gmx_simd_add_d(PolyBP1, CBP3);
2049     PolyBP0  = gmx_simd_mul_d(PolyBP0, t2);
2050     PolyBP1  = gmx_simd_mul_d(PolyBP1, t2);
2051     PolyBP0  = gmx_simd_add_d(PolyBP0, CBP2);
2052     PolyBP1  = gmx_simd_add_d(PolyBP1, CBP1);
2053     PolyBP0  = gmx_simd_mul_d(PolyBP0, t2);
2054     PolyBP1  = gmx_simd_mul_d(PolyBP1, t);
2055     PolyBP0  = gmx_simd_add_d(PolyBP0, CBP0);
2056     PolyBP0  = gmx_simd_add_d(PolyBP0, PolyBP1);
2057
2058     PolyBQ1 = gmx_simd_mul_d(CBQ7, t2);
2059     PolyBQ0 = gmx_simd_mul_d(CBQ6, t2);
2060     PolyBQ1 = gmx_simd_add_d(PolyBQ1, CBQ5);
2061     PolyBQ0 = gmx_simd_add_d(PolyBQ0, CBQ4);
2062     PolyBQ1 = gmx_simd_mul_d(PolyBQ1, t2);
2063     PolyBQ0 = gmx_simd_mul_d(PolyBQ0, t2);
2064     PolyBQ1 = gmx_simd_add_d(PolyBQ1, CBQ3);
2065     PolyBQ0 = gmx_simd_add_d(PolyBQ0, CBQ2);
2066     PolyBQ1 = gmx_simd_mul_d(PolyBQ1, t2);
2067     PolyBQ0 = gmx_simd_mul_d(PolyBQ0, t2);
2068     PolyBQ1 = gmx_simd_add_d(PolyBQ1, CBQ1);
2069     PolyBQ0 = gmx_simd_add_d(PolyBQ0, one);
2070     PolyBQ1 = gmx_simd_mul_d(PolyBQ1, t);
2071     PolyBQ0 = gmx_simd_add_d(PolyBQ0, PolyBQ1);
2072
2073     res_erfcB = gmx_simd_mul_d(PolyBP0, gmx_simd_inv_notmaskfpe_d(PolyBQ0, mask_erf));
2074
2075     res_erfcB = gmx_simd_mul_d(res_erfcB, xabs);
2076
2077     /* Calculate erfc() in range [4.5,inf] */
2078     w       = gmx_simd_inv_notmaskfpe_d(xabs, mask_erf);
2079     w2      = gmx_simd_mul_d(w, w);
2080
2081     PolyCP0  = gmx_simd_mul_d(CCP6, w2);
2082     PolyCP1  = gmx_simd_mul_d(CCP5, w2);
2083     PolyCP0  = gmx_simd_add_d(PolyCP0, CCP4);
2084     PolyCP1  = gmx_simd_add_d(PolyCP1, CCP3);
2085     PolyCP0  = gmx_simd_mul_d(PolyCP0, w2);
2086     PolyCP1  = gmx_simd_mul_d(PolyCP1, w2);
2087     PolyCP0  = gmx_simd_add_d(PolyCP0, CCP2);
2088     PolyCP1  = gmx_simd_add_d(PolyCP1, CCP1);
2089     PolyCP0  = gmx_simd_mul_d(PolyCP0, w2);
2090     PolyCP1  = gmx_simd_mul_d(PolyCP1, w);
2091     PolyCP0  = gmx_simd_add_d(PolyCP0, CCP0);
2092     PolyCP0  = gmx_simd_add_d(PolyCP0, PolyCP1);
2093
2094     PolyCQ0  = gmx_simd_mul_d(CCQ6, w2);
2095     PolyCQ1  = gmx_simd_mul_d(CCQ5, w2);
2096     PolyCQ0  = gmx_simd_add_d(PolyCQ0, CCQ4);
2097     PolyCQ1  = gmx_simd_add_d(PolyCQ1, CCQ3);
2098     PolyCQ0  = gmx_simd_mul_d(PolyCQ0, w2);
2099     PolyCQ1  = gmx_simd_mul_d(PolyCQ1, w2);
2100     PolyCQ0  = gmx_simd_add_d(PolyCQ0, CCQ2);
2101     PolyCQ1  = gmx_simd_add_d(PolyCQ1, CCQ1);
2102     PolyCQ0  = gmx_simd_mul_d(PolyCQ0, w2);
2103     PolyCQ1  = gmx_simd_mul_d(PolyCQ1, w);
2104     PolyCQ0  = gmx_simd_add_d(PolyCQ0, one);
2105     PolyCQ0  = gmx_simd_add_d(PolyCQ0, PolyCQ1);
2106
2107     expmx2   = gmx_simd_exp_d( gmx_simd_fneg_d(x2) );
2108
2109     res_erfcC = gmx_simd_mul_d(PolyCP0, gmx_simd_inv_notmaskfpe_d(PolyCQ0, mask_erf));
2110     res_erfcC = gmx_simd_add_d(res_erfcC, CCoffset);
2111     res_erfcC = gmx_simd_mul_d(res_erfcC, w);
2112
2113     mask     = gmx_simd_cmplt_d(gmx_simd_set1_d(4.5), xabs);
2114     res_erfc = gmx_simd_blendv_d(res_erfcB, res_erfcC, mask);
2115
2116     res_erfc = gmx_simd_mul_d(res_erfc, expmx2);
2117
2118     /* erfc(x<0) = 2-erfc(|x|) */
2119     mask     = gmx_simd_cmplt_d(x, gmx_simd_setzero_d());
2120     res_erfc = gmx_simd_blendv_d(res_erfc, gmx_simd_sub_d(two, res_erfc), mask);
2121
2122     /* Select erf() or erfc() */
2123     res  = gmx_simd_blendv_d(res_erfc, gmx_simd_sub_d(one, res_erf), mask_erf);
2124
2125     return res;
2126 }
2127
2128 /*! \brief SIMD double sin \& cos.
2129  *
2130  * \copydetails gmx_simd_sincos_f
2131  */
2132 static gmx_inline void gmx_simdcall
2133 gmx_simd_sincos_d(gmx_simd_double_t x, gmx_simd_double_t *sinval, gmx_simd_double_t *cosval)
2134 {
2135     /* Constants to subtract Pi/4*x from y while minimizing precision loss */
2136     const gmx_simd_double_t  argred0         = gmx_simd_set1_d(-2*0.78539816290140151978);
2137     const gmx_simd_double_t  argred1         = gmx_simd_set1_d(-2*4.9604678871439933374e-10);
2138     const gmx_simd_double_t  argred2         = gmx_simd_set1_d(-2*1.1258708853173288931e-18);
2139     const gmx_simd_double_t  argred3         = gmx_simd_set1_d(-2*1.7607799325916000908e-27);
2140     const gmx_simd_double_t  two_over_pi     = gmx_simd_set1_d(2.0/M_PI);
2141     const gmx_simd_double_t  const_sin5      = gmx_simd_set1_d( 1.58938307283228937328511e-10);
2142     const gmx_simd_double_t  const_sin4      = gmx_simd_set1_d(-2.50506943502539773349318e-08);
2143     const gmx_simd_double_t  const_sin3      = gmx_simd_set1_d( 2.75573131776846360512547e-06);
2144     const gmx_simd_double_t  const_sin2      = gmx_simd_set1_d(-0.000198412698278911770864914);
2145     const gmx_simd_double_t  const_sin1      = gmx_simd_set1_d( 0.0083333333333191845961746);
2146     const gmx_simd_double_t  const_sin0      = gmx_simd_set1_d(-0.166666666666666130709393);
2147
2148     const gmx_simd_double_t  const_cos7      = gmx_simd_set1_d(-1.13615350239097429531523e-11);
2149     const gmx_simd_double_t  const_cos6      = gmx_simd_set1_d( 2.08757471207040055479366e-09);
2150     const gmx_simd_double_t  const_cos5      = gmx_simd_set1_d(-2.75573144028847567498567e-07);
2151     const gmx_simd_double_t  const_cos4      = gmx_simd_set1_d( 2.48015872890001867311915e-05);
2152     const gmx_simd_double_t  const_cos3      = gmx_simd_set1_d(-0.00138888888888714019282329);
2153     const gmx_simd_double_t  const_cos2      = gmx_simd_set1_d( 0.0416666666666665519592062);
2154     const gmx_simd_double_t  half            = gmx_simd_set1_d(0.5);
2155     const gmx_simd_double_t  one             = gmx_simd_set1_d(1.0);
2156     gmx_simd_double_t        ssign, csign;
2157     gmx_simd_double_t        x2, y, z, psin, pcos, sss, ccc;
2158     gmx_simd_dbool_t         mask;
2159 #if (defined GMX_SIMD_HAVE_DINT32) && (defined GMX_SIMD_HAVE_DINT32_ARITHMETICS) && (defined GMX_SIMD_HAVE_LOGICAL)
2160     const gmx_simd_dint32_t  ione            = gmx_simd_set1_di(1);
2161     const gmx_simd_dint32_t  itwo            = gmx_simd_set1_di(2);
2162     gmx_simd_dint32_t        iy;
2163
2164     z       = gmx_simd_mul_d(x, two_over_pi);
2165     iy      = gmx_simd_cvt_d2i(z);
2166     y       = gmx_simd_round_d(z);
2167
2168     mask    = gmx_simd_cvt_dib2db(gmx_simd_cmpeq_di(gmx_simd_and_di(iy, ione), gmx_simd_setzero_di()));
2169     ssign   = gmx_simd_blendzero_d(gmx_simd_set1_d(GMX_DOUBLE_NEGZERO), gmx_simd_cvt_dib2db(gmx_simd_cmpeq_di(gmx_simd_and_di(iy, itwo), itwo)));
2170     csign   = gmx_simd_blendzero_d(gmx_simd_set1_d(GMX_DOUBLE_NEGZERO), gmx_simd_cvt_dib2db(gmx_simd_cmpeq_di(gmx_simd_and_di(gmx_simd_add_di(iy, ione), itwo), itwo)));
2171 #else
2172     const gmx_simd_double_t  quarter         = gmx_simd_set1_d(0.25);
2173     const gmx_simd_double_t  minusquarter    = gmx_simd_set1_d(-0.25);
2174     gmx_simd_double_t        q;
2175     gmx_simd_dbool_t         m1, m2, m3;
2176
2177     /* The most obvious way to find the arguments quadrant in the unit circle
2178      * to calculate the sign is to use integer arithmetic, but that is not
2179      * present in all SIMD implementations. As an alternative, we have devised a
2180      * pure floating-point algorithm that uses truncation for argument reduction
2181      * so that we get a new value 0<=q<1 over the unit circle, and then
2182      * do floating-point comparisons with fractions. This is likely to be
2183      * slightly slower (~10%) due to the longer latencies of floating-point, so
2184      * we only use it when integer SIMD arithmetic is not present.
2185      */
2186     ssign   = x;
2187     x       = gmx_simd_fabs_d(x);
2188     /* It is critical that half-way cases are rounded down */
2189     z       = gmx_simd_fmadd_d(x, two_over_pi, half);
2190     y       = gmx_simd_trunc_d(z);
2191     q       = gmx_simd_mul_d(z, quarter);
2192     q       = gmx_simd_sub_d(q, gmx_simd_trunc_d(q));
2193     /* z now starts at 0.0 for x=-pi/4 (although neg. values cannot occur), and
2194      * then increased by 1.0 as x increases by 2*Pi, when it resets to 0.0.
2195      * This removes the 2*Pi periodicity without using any integer arithmetic.
2196      * First check if y had the value 2 or 3, set csign if true.
2197      */
2198     q       = gmx_simd_sub_d(q, half);
2199     /* If we have logical operations we can work directly on the signbit, which
2200      * saves instructions. Otherwise we need to represent signs as +1.0/-1.0.
2201      * Thus, if you are altering defines to debug alternative code paths, the
2202      * two GMX_SIMD_HAVE_LOGICAL sections in this routine must either both be
2203      * active or inactive - you will get errors if only one is used.
2204      */
2205 #    ifdef GMX_SIMD_HAVE_LOGICAL
2206     ssign   = gmx_simd_and_d(ssign, gmx_simd_set1_d(GMX_DOUBLE_NEGZERO));
2207     csign   = gmx_simd_andnot_d(q, gmx_simd_set1_d(GMX_DOUBLE_NEGZERO));
2208     ssign   = gmx_simd_xor_d(ssign, csign);
2209 #    else
2210     csign   = gmx_simd_xor_sign_d(gmx_simd_set1_d(-1.0), q);
2211     ssign   = gmx_simd_xor_sign_d(ssign, csign);    /* swap ssign if csign was set. */
2212 #    endif
2213     /* Check if y had value 1 or 3 (remember we subtracted 0.5 from q) */
2214     m1      = gmx_simd_cmplt_d(q, minusquarter);
2215     m2      = gmx_simd_cmple_d(gmx_simd_setzero_d(), q);
2216     m3      = gmx_simd_cmplt_d(q, quarter);
2217     m2      = gmx_simd_and_db(m2, m3);
2218     mask    = gmx_simd_or_db(m1, m2);
2219     /* where mask is FALSE, set sign. */
2220     csign   = gmx_simd_xor_sign_d(csign, gmx_simd_blendv_d(gmx_simd_set1_d(-1.0), one, mask));
2221 #endif
2222     x       = gmx_simd_fmadd_d(y, argred0, x);
2223     x       = gmx_simd_fmadd_d(y, argred1, x);
2224     x       = gmx_simd_fmadd_d(y, argred2, x);
2225     x       = gmx_simd_fmadd_d(y, argred3, x);
2226     x2      = gmx_simd_mul_d(x, x);
2227
2228     psin    = gmx_simd_fmadd_d(const_sin5, x2, const_sin4);
2229     psin    = gmx_simd_fmadd_d(psin, x2, const_sin3);
2230     psin    = gmx_simd_fmadd_d(psin, x2, const_sin2);
2231     psin    = gmx_simd_fmadd_d(psin, x2, const_sin1);
2232     psin    = gmx_simd_fmadd_d(psin, x2, const_sin0);
2233     psin    = gmx_simd_fmadd_d(psin, gmx_simd_mul_d(x2, x), x);
2234
2235     pcos    = gmx_simd_fmadd_d(const_cos7, x2, const_cos6);
2236     pcos    = gmx_simd_fmadd_d(pcos, x2, const_cos5);
2237     pcos    = gmx_simd_fmadd_d(pcos, x2, const_cos4);
2238     pcos    = gmx_simd_fmadd_d(pcos, x2, const_cos3);
2239     pcos    = gmx_simd_fmadd_d(pcos, x2, const_cos2);
2240     pcos    = gmx_simd_fmsub_d(pcos, x2, half);
2241     pcos    = gmx_simd_fmadd_d(pcos, x2, one);
2242
2243     sss     = gmx_simd_blendv_d(pcos, psin, mask);
2244     ccc     = gmx_simd_blendv_d(psin, pcos, mask);
2245     /* See comment for GMX_SIMD_HAVE_LOGICAL section above. */
2246 #ifdef GMX_SIMD_HAVE_LOGICAL
2247     *sinval = gmx_simd_xor_d(sss, ssign);
2248     *cosval = gmx_simd_xor_d(ccc, csign);
2249 #else
2250     *sinval = gmx_simd_xor_sign_d(sss, ssign);
2251     *cosval = gmx_simd_xor_sign_d(ccc, csign);
2252 #endif
2253 }
2254
2255 /*! \brief SIMD double sin(x).
2256  *
2257  * \copydetails gmx_simd_sin_f
2258  */
2259 static gmx_inline gmx_simd_double_t gmx_simdcall
2260 gmx_simd_sin_d(gmx_simd_double_t x)
2261 {
2262     gmx_simd_double_t s, c;
2263     gmx_simd_sincos_d(x, &s, &c);
2264     return s;
2265 }
2266
2267 /*! \brief SIMD double cos(x).
2268  *
2269  * \copydetails gmx_simd_cos_f
2270  */
2271 static gmx_inline gmx_simd_double_t gmx_simdcall
2272 gmx_simd_cos_d(gmx_simd_double_t x)
2273 {
2274     gmx_simd_double_t s, c;
2275     gmx_simd_sincos_d(x, &s, &c);
2276     return c;
2277 }
2278
2279 /*! \brief SIMD double tan(x).
2280  *
2281  * \copydetails gmx_simd_tan_f
2282  */
2283 static gmx_inline gmx_simd_double_t gmx_simdcall
2284 gmx_simd_tan_d(gmx_simd_double_t x)
2285 {
2286     const gmx_simd_double_t  argred0         = gmx_simd_set1_d(-2*0.78539816290140151978);
2287     const gmx_simd_double_t  argred1         = gmx_simd_set1_d(-2*4.9604678871439933374e-10);
2288     const gmx_simd_double_t  argred2         = gmx_simd_set1_d(-2*1.1258708853173288931e-18);
2289     const gmx_simd_double_t  argred3         = gmx_simd_set1_d(-2*1.7607799325916000908e-27);
2290     const gmx_simd_double_t  two_over_pi     = gmx_simd_set1_d(2.0/M_PI);
2291     const gmx_simd_double_t  CT15            = gmx_simd_set1_d(1.01419718511083373224408e-05);
2292     const gmx_simd_double_t  CT14            = gmx_simd_set1_d(-2.59519791585924697698614e-05);
2293     const gmx_simd_double_t  CT13            = gmx_simd_set1_d(5.23388081915899855325186e-05);
2294     const gmx_simd_double_t  CT12            = gmx_simd_set1_d(-3.05033014433946488225616e-05);
2295     const gmx_simd_double_t  CT11            = gmx_simd_set1_d(7.14707504084242744267497e-05);
2296     const gmx_simd_double_t  CT10            = gmx_simd_set1_d(8.09674518280159187045078e-05);
2297     const gmx_simd_double_t  CT9             = gmx_simd_set1_d(0.000244884931879331847054404);
2298     const gmx_simd_double_t  CT8             = gmx_simd_set1_d(0.000588505168743587154904506);
2299     const gmx_simd_double_t  CT7             = gmx_simd_set1_d(0.00145612788922812427978848);
2300     const gmx_simd_double_t  CT6             = gmx_simd_set1_d(0.00359208743836906619142924);
2301     const gmx_simd_double_t  CT5             = gmx_simd_set1_d(0.00886323944362401618113356);
2302     const gmx_simd_double_t  CT4             = gmx_simd_set1_d(0.0218694882853846389592078);
2303     const gmx_simd_double_t  CT3             = gmx_simd_set1_d(0.0539682539781298417636002);
2304     const gmx_simd_double_t  CT2             = gmx_simd_set1_d(0.133333333333125941821962);
2305     const gmx_simd_double_t  CT1             = gmx_simd_set1_d(0.333333333333334980164153);
2306
2307     gmx_simd_double_t        x2, p, y, z;
2308     gmx_simd_dbool_t         mask;
2309
2310 #if (defined GMX_SIMD_HAVE_DINT32) && (defined GMX_SIMD_HAVE_DINT32_ARITHMETICS) && (defined GMX_SIMD_HAVE_LOGICAL)
2311     gmx_simd_dint32_t  iy;
2312     gmx_simd_dint32_t  ione = gmx_simd_set1_di(1);
2313
2314     z       = gmx_simd_mul_d(x, two_over_pi);
2315     iy      = gmx_simd_cvt_d2i(z);
2316     y       = gmx_simd_round_d(z);
2317     mask    = gmx_simd_cvt_dib2db(gmx_simd_cmpeq_di(gmx_simd_and_di(iy, ione), ione));
2318
2319     x       = gmx_simd_fmadd_d(y, argred0, x);
2320     x       = gmx_simd_fmadd_d(y, argred1, x);
2321     x       = gmx_simd_fmadd_d(y, argred2, x);
2322     x       = gmx_simd_fmadd_d(y, argred3, x);
2323     x       = gmx_simd_xor_d(gmx_simd_blendzero_d(gmx_simd_set1_d(GMX_DOUBLE_NEGZERO), mask), x);
2324 #else
2325     const gmx_simd_double_t  quarter         = gmx_simd_set1_d(0.25);
2326     const gmx_simd_double_t  half            = gmx_simd_set1_d(0.5);
2327     const gmx_simd_double_t  threequarter    = gmx_simd_set1_d(0.75);
2328     gmx_simd_double_t        w, q;
2329     gmx_simd_dbool_t         m1, m2, m3;
2330
2331     w       = gmx_simd_fabs_d(x);
2332     z       = gmx_simd_fmadd_d(w, two_over_pi, half);
2333     y       = gmx_simd_trunc_d(z);
2334     q       = gmx_simd_mul_d(z, quarter);
2335     q       = gmx_simd_sub_d(q, gmx_simd_trunc_d(q));
2336     m1      = gmx_simd_cmple_d(quarter, q);
2337     m2      = gmx_simd_cmplt_d(q, half);
2338     m3      = gmx_simd_cmple_d(threequarter, q);
2339     m1      = gmx_simd_and_db(m1, m2);
2340     mask    = gmx_simd_or_db(m1, m3);
2341     w       = gmx_simd_fmadd_d(y, argred0, w);
2342     w       = gmx_simd_fmadd_d(y, argred1, w);
2343     w       = gmx_simd_fmadd_d(y, argred2, w);
2344     w       = gmx_simd_fmadd_d(y, argred3, w);
2345
2346     w       = gmx_simd_blendv_d(w, gmx_simd_fneg_d(w), mask);
2347     x       = gmx_simd_xor_sign_d(w, x);
2348 #endif
2349     x2      = gmx_simd_mul_d(x, x);
2350     p       = gmx_simd_fmadd_d(CT15, x2, CT14);
2351     p       = gmx_simd_fmadd_d(p, x2, CT13);
2352     p       = gmx_simd_fmadd_d(p, x2, CT12);
2353     p       = gmx_simd_fmadd_d(p, x2, CT11);
2354     p       = gmx_simd_fmadd_d(p, x2, CT10);
2355     p       = gmx_simd_fmadd_d(p, x2, CT9);
2356     p       = gmx_simd_fmadd_d(p, x2, CT8);
2357     p       = gmx_simd_fmadd_d(p, x2, CT7);
2358     p       = gmx_simd_fmadd_d(p, x2, CT6);
2359     p       = gmx_simd_fmadd_d(p, x2, CT5);
2360     p       = gmx_simd_fmadd_d(p, x2, CT4);
2361     p       = gmx_simd_fmadd_d(p, x2, CT3);
2362     p       = gmx_simd_fmadd_d(p, x2, CT2);
2363     p       = gmx_simd_fmadd_d(p, x2, CT1);
2364     p       = gmx_simd_fmadd_d(x2, gmx_simd_mul_d(p, x), x);
2365
2366     p       = gmx_simd_blendv_d( p, gmx_simd_inv_maskfpe_d(p, mask), mask);
2367     return p;
2368 }
2369
2370 /*! \brief SIMD double asin(x).
2371  *
2372  * \copydetails gmx_simd_asin_f
2373  */
2374 static gmx_inline gmx_simd_double_t gmx_simdcall
2375 gmx_simd_asin_d(gmx_simd_double_t x)
2376 {
2377     /* Same algorithm as cephes library */
2378     const gmx_simd_double_t limit1    = gmx_simd_set1_d(0.625);
2379     const gmx_simd_double_t limit2    = gmx_simd_set1_d(1e-8);
2380     const gmx_simd_double_t one       = gmx_simd_set1_d(1.0);
2381     const gmx_simd_double_t quarterpi = gmx_simd_set1_d(M_PI/4.0);
2382     const gmx_simd_double_t morebits  = gmx_simd_set1_d(6.123233995736765886130e-17);
2383
2384     const gmx_simd_double_t P5        = gmx_simd_set1_d(4.253011369004428248960e-3);
2385     const gmx_simd_double_t P4        = gmx_simd_set1_d(-6.019598008014123785661e-1);
2386     const gmx_simd_double_t P3        = gmx_simd_set1_d(5.444622390564711410273e0);
2387     const gmx_simd_double_t P2        = gmx_simd_set1_d(-1.626247967210700244449e1);
2388     const gmx_simd_double_t P1        = gmx_simd_set1_d(1.956261983317594739197e1);
2389     const gmx_simd_double_t P0        = gmx_simd_set1_d(-8.198089802484824371615e0);
2390
2391     const gmx_simd_double_t Q4        = gmx_simd_set1_d(-1.474091372988853791896e1);
2392     const gmx_simd_double_t Q3        = gmx_simd_set1_d(7.049610280856842141659e1);
2393     const gmx_simd_double_t Q2        = gmx_simd_set1_d(-1.471791292232726029859e2);
2394     const gmx_simd_double_t Q1        = gmx_simd_set1_d(1.395105614657485689735e2);
2395     const gmx_simd_double_t Q0        = gmx_simd_set1_d(-4.918853881490881290097e1);
2396
2397     const gmx_simd_double_t R4        = gmx_simd_set1_d(2.967721961301243206100e-3);
2398     const gmx_simd_double_t R3        = gmx_simd_set1_d(-5.634242780008963776856e-1);
2399     const gmx_simd_double_t R2        = gmx_simd_set1_d(6.968710824104713396794e0);
2400     const gmx_simd_double_t R1        = gmx_simd_set1_d(-2.556901049652824852289e1);
2401     const gmx_simd_double_t R0        = gmx_simd_set1_d(2.853665548261061424989e1);
2402
2403     const gmx_simd_double_t S3        = gmx_simd_set1_d(-2.194779531642920639778e1);
2404     const gmx_simd_double_t S2        = gmx_simd_set1_d(1.470656354026814941758e2);
2405     const gmx_simd_double_t S1        = gmx_simd_set1_d(-3.838770957603691357202e2);
2406     const gmx_simd_double_t S0        = gmx_simd_set1_d(3.424398657913078477438e2);
2407
2408     gmx_simd_double_t       xabs;
2409     gmx_simd_double_t       zz, ww, z, q, w, zz2, ww2;
2410     gmx_simd_double_t       PA, PB;
2411     gmx_simd_double_t       QA, QB;
2412     gmx_simd_double_t       RA, RB;
2413     gmx_simd_double_t       SA, SB;
2414     gmx_simd_double_t       nom, denom;
2415     gmx_simd_dbool_t        mask, mask2;
2416
2417     xabs  = gmx_simd_fabs_d(x);
2418
2419     mask  = gmx_simd_cmplt_d(limit1, xabs);
2420
2421     zz    = gmx_simd_sub_d(one, xabs);
2422     ww    = gmx_simd_mul_d(xabs, xabs);
2423     zz2   = gmx_simd_mul_d(zz, zz);
2424     ww2   = gmx_simd_mul_d(ww, ww);
2425
2426     /* R */
2427     RA    = gmx_simd_mul_d(R4, zz2);
2428     RB    = gmx_simd_mul_d(R3, zz2);
2429     RA    = gmx_simd_add_d(RA, R2);
2430     RB    = gmx_simd_add_d(RB, R1);
2431     RA    = gmx_simd_mul_d(RA, zz2);
2432     RB    = gmx_simd_mul_d(RB, zz);
2433     RA    = gmx_simd_add_d(RA, R0);
2434     RA    = gmx_simd_add_d(RA, RB);
2435
2436     /* S, SA = zz2 */
2437     SB    = gmx_simd_mul_d(S3, zz2);
2438     SA    = gmx_simd_add_d(zz2, S2);
2439     SB    = gmx_simd_add_d(SB, S1);
2440     SA    = gmx_simd_mul_d(SA, zz2);
2441     SB    = gmx_simd_mul_d(SB, zz);
2442     SA    = gmx_simd_add_d(SA, S0);
2443     SA    = gmx_simd_add_d(SA, SB);
2444
2445     /* P */
2446     PA    = gmx_simd_mul_d(P5, ww2);
2447     PB    = gmx_simd_mul_d(P4, ww2);
2448     PA    = gmx_simd_add_d(PA, P3);
2449     PB    = gmx_simd_add_d(PB, P2);
2450     PA    = gmx_simd_mul_d(PA, ww2);
2451     PB    = gmx_simd_mul_d(PB, ww2);
2452     PA    = gmx_simd_add_d(PA, P1);
2453     PB    = gmx_simd_add_d(PB, P0);
2454     PA    = gmx_simd_mul_d(PA, ww);
2455     PA    = gmx_simd_add_d(PA, PB);
2456
2457     /* Q, QA = ww2 */
2458     QB    = gmx_simd_mul_d(Q4, ww2);
2459     QA    = gmx_simd_add_d(ww2, Q3);
2460     QB    = gmx_simd_add_d(QB, Q2);
2461     QA    = gmx_simd_mul_d(QA, ww2);
2462     QB    = gmx_simd_mul_d(QB, ww2);
2463     QA    = gmx_simd_add_d(QA, Q1);
2464     QB    = gmx_simd_add_d(QB, Q0);
2465     QA    = gmx_simd_mul_d(QA, ww);
2466     QA    = gmx_simd_add_d(QA, QB);
2467
2468     RA    = gmx_simd_mul_d(RA, zz);
2469     PA    = gmx_simd_mul_d(PA, ww);
2470
2471     nom   = gmx_simd_blendv_d( PA, RA, mask );
2472     denom = gmx_simd_blendv_d( QA, SA, mask );
2473
2474     mask2 = gmx_simd_cmplt_d(limit2, xabs);
2475     q     = gmx_simd_mul_d( nom, gmx_simd_inv_maskfpe_d(denom, mask2) );
2476
2477     zz    = gmx_simd_add_d(zz, zz);
2478     zz    = gmx_simd_sqrt_d(zz);
2479     z     = gmx_simd_sub_d(quarterpi, zz);
2480     zz    = gmx_simd_mul_d(zz, q);
2481     zz    = gmx_simd_sub_d(zz, morebits);
2482     z     = gmx_simd_sub_d(z, zz);
2483     z     = gmx_simd_add_d(z, quarterpi);
2484
2485     w     = gmx_simd_mul_d(xabs, q);
2486     w     = gmx_simd_add_d(w, xabs);
2487
2488     z     = gmx_simd_blendv_d( w, z, mask );
2489
2490     z     = gmx_simd_blendv_d( xabs, z, mask2 );
2491
2492     z = gmx_simd_xor_sign_d(z, x);
2493
2494     return z;
2495 }
2496
2497 /*! \brief SIMD double acos(x).
2498  *
2499  * \copydetails gmx_simd_acos_f
2500  */
2501 static gmx_inline gmx_simd_double_t gmx_simdcall
2502 gmx_simd_acos_d(gmx_simd_double_t x)
2503 {
2504     const gmx_simd_double_t one        = gmx_simd_set1_d(1.0);
2505     const gmx_simd_double_t half       = gmx_simd_set1_d(0.5);
2506     const gmx_simd_double_t quarterpi0 = gmx_simd_set1_d(7.85398163397448309616e-1);
2507     const gmx_simd_double_t quarterpi1 = gmx_simd_set1_d(6.123233995736765886130e-17);
2508
2509     gmx_simd_dbool_t        mask1;
2510     gmx_simd_double_t       z, z1, z2;
2511
2512     mask1 = gmx_simd_cmplt_d(half, x);
2513     z1    = gmx_simd_mul_d(half, gmx_simd_sub_d(one, x));
2514     z1    = gmx_simd_sqrt_d(z1);
2515     z     = gmx_simd_blendv_d( x, z1, mask1 );
2516
2517     z     = gmx_simd_asin_d(z);
2518
2519     z1    = gmx_simd_add_d(z, z);
2520
2521     z2    = gmx_simd_sub_d(quarterpi0, z);
2522     z2    = gmx_simd_add_d(z2, quarterpi1);
2523     z2    = gmx_simd_add_d(z2, quarterpi0);
2524
2525     z     = gmx_simd_blendv_d(z2, z1, mask1);
2526
2527     return z;
2528 }
2529
2530 /*! \brief SIMD double atan(x).
2531  *
2532  * \copydetails gmx_simd_atan_f
2533  */
2534 static gmx_inline gmx_simd_double_t gmx_simdcall
2535 gmx_simd_atan_d(gmx_simd_double_t x)
2536 {
2537     /* Same algorithm as cephes library */
2538     const gmx_simd_double_t limit1    = gmx_simd_set1_d(0.66);
2539     const gmx_simd_double_t limit2    = gmx_simd_set1_d(2.41421356237309504880);
2540     const gmx_simd_double_t quarterpi = gmx_simd_set1_d(M_PI/4.0);
2541     const gmx_simd_double_t halfpi    = gmx_simd_set1_d(M_PI/2.0);
2542     const gmx_simd_double_t mone      = gmx_simd_set1_d(-1.0);
2543     const gmx_simd_double_t morebits1 = gmx_simd_set1_d(0.5*6.123233995736765886130E-17);
2544     const gmx_simd_double_t morebits2 = gmx_simd_set1_d(6.123233995736765886130E-17);
2545
2546     const gmx_simd_double_t P4        = gmx_simd_set1_d(-8.750608600031904122785E-1);
2547     const gmx_simd_double_t P3        = gmx_simd_set1_d(-1.615753718733365076637E1);
2548     const gmx_simd_double_t P2        = gmx_simd_set1_d(-7.500855792314704667340E1);
2549     const gmx_simd_double_t P1        = gmx_simd_set1_d(-1.228866684490136173410E2);
2550     const gmx_simd_double_t P0        = gmx_simd_set1_d(-6.485021904942025371773E1);
2551
2552     const gmx_simd_double_t Q4        = gmx_simd_set1_d(2.485846490142306297962E1);
2553     const gmx_simd_double_t Q3        = gmx_simd_set1_d(1.650270098316988542046E2);
2554     const gmx_simd_double_t Q2        = gmx_simd_set1_d(4.328810604912902668951E2);
2555     const gmx_simd_double_t Q1        = gmx_simd_set1_d(4.853903996359136964868E2);
2556     const gmx_simd_double_t Q0        = gmx_simd_set1_d(1.945506571482613964425E2);
2557
2558     gmx_simd_double_t       y, xabs, t1, t2;
2559     gmx_simd_double_t       z, z2;
2560     gmx_simd_double_t       P_A, P_B, Q_A, Q_B;
2561     gmx_simd_dbool_t        mask1, mask2;
2562
2563     xabs   = gmx_simd_fabs_d(x);
2564
2565     mask1  = gmx_simd_cmplt_d(limit1, xabs);
2566     mask2  = gmx_simd_cmplt_d(limit2, xabs);
2567
2568     t1     = gmx_simd_mul_d(gmx_simd_add_d(xabs, mone),
2569                             gmx_simd_inv_maskfpe_d(gmx_simd_sub_d(xabs, mone), mask1));
2570     t2     = gmx_simd_mul_d(mone, gmx_simd_inv_maskfpe_d(xabs, mask2));
2571
2572     y      = gmx_simd_blendzero_d(quarterpi, mask1);
2573     y      = gmx_simd_blendv_d(y, halfpi, mask2);
2574     xabs   = gmx_simd_blendv_d(xabs, t1, mask1);
2575     xabs   = gmx_simd_blendv_d(xabs, t2, mask2);
2576
2577     z      = gmx_simd_mul_d(xabs, xabs);
2578     z2     = gmx_simd_mul_d(z, z);
2579
2580     P_A    = gmx_simd_mul_d(P4, z2);
2581     P_B    = gmx_simd_mul_d(P3, z2);
2582     P_A    = gmx_simd_add_d(P_A, P2);
2583     P_B    = gmx_simd_add_d(P_B, P1);
2584     P_A    = gmx_simd_mul_d(P_A, z2);
2585     P_B    = gmx_simd_mul_d(P_B, z);
2586     P_A    = gmx_simd_add_d(P_A, P0);
2587     P_A    = gmx_simd_add_d(P_A, P_B);
2588
2589     /* Q_A = z2 */
2590     Q_B    = gmx_simd_mul_d(Q4, z2);
2591     Q_A    = gmx_simd_add_d(z2, Q3);
2592     Q_B    = gmx_simd_add_d(Q_B, Q2);
2593     Q_A    = gmx_simd_mul_d(Q_A, z2);
2594     Q_B    = gmx_simd_mul_d(Q_B, z2);
2595     Q_A    = gmx_simd_add_d(Q_A, Q1);
2596     Q_B    = gmx_simd_add_d(Q_B, Q0);
2597     Q_A    = gmx_simd_mul_d(Q_A, z);
2598     Q_A    = gmx_simd_add_d(Q_A, Q_B);
2599
2600     z      = gmx_simd_mul_d(z, P_A);
2601     z      = gmx_simd_mul_d(z, gmx_simd_inv_d(Q_A));
2602     z      = gmx_simd_mul_d(z, xabs);
2603     z      = gmx_simd_add_d(z, xabs);
2604
2605     t1     = gmx_simd_blendzero_d(morebits1, mask1);
2606     t1     = gmx_simd_blendv_d(t1, morebits2, mask2);
2607
2608     z      = gmx_simd_add_d(z, t1);
2609     y      = gmx_simd_add_d(y, z);
2610
2611     y      = gmx_simd_xor_sign_d(y, x);
2612
2613     return y;
2614 }
2615
2616 /*! \brief SIMD double atan2(y,x).
2617  *
2618  * \copydetails gmx_simd_atan2_f
2619  */
2620 static gmx_inline gmx_simd_double_t gmx_simdcall
2621 gmx_simd_atan2_d(gmx_simd_double_t y, gmx_simd_double_t x)
2622 {
2623     const gmx_simd_double_t pi          = gmx_simd_set1_d(M_PI);
2624     const gmx_simd_double_t halfpi      = gmx_simd_set1_d(M_PI/2.0);
2625     gmx_simd_double_t       xinv, p, aoffset;
2626     gmx_simd_dbool_t        mask_x0, mask_y0, mask_xlt0, mask_ylt0;
2627
2628     mask_x0   = gmx_simd_cmpeq_d(x, gmx_simd_setzero_d());
2629     mask_y0   = gmx_simd_cmpeq_d(y, gmx_simd_setzero_d());
2630     mask_xlt0 = gmx_simd_cmplt_d(x, gmx_simd_setzero_d());
2631     mask_ylt0 = gmx_simd_cmplt_d(y, gmx_simd_setzero_d());
2632
2633     aoffset   = gmx_simd_blendzero_d(halfpi, mask_x0);
2634     aoffset   = gmx_simd_blendnotzero_d(aoffset, mask_y0);
2635
2636     aoffset   = gmx_simd_blendv_d(aoffset, pi, mask_xlt0);
2637     aoffset   = gmx_simd_blendv_d(aoffset, gmx_simd_fneg_d(aoffset), mask_ylt0);
2638
2639     xinv      = gmx_simd_blendnotzero_d(gmx_simd_inv_notmaskfpe_d(x, mask_x0), mask_x0);
2640     p         = gmx_simd_mul_d(y, xinv);
2641     p         = gmx_simd_atan_d(p);
2642     p         = gmx_simd_add_d(p, aoffset);
2643
2644     return p;
2645 }
2646
2647
2648 /*! \brief Calculate the force correction due to PME analytically for SIMD double.
2649  *
2650  * \copydetails gmx_simd_pmecorrF_f
2651  */
2652 static gmx_inline gmx_simd_double_t gmx_simdcall
2653 gmx_simd_pmecorrF_d(gmx_simd_double_t z2)
2654 {
2655     const gmx_simd_double_t  FN10     = gmx_simd_set1_d(-8.0072854618360083154e-14);
2656     const gmx_simd_double_t  FN9      = gmx_simd_set1_d(1.1859116242260148027e-11);
2657     const gmx_simd_double_t  FN8      = gmx_simd_set1_d(-8.1490406329798423616e-10);
2658     const gmx_simd_double_t  FN7      = gmx_simd_set1_d(3.4404793543907847655e-8);
2659     const gmx_simd_double_t  FN6      = gmx_simd_set1_d(-9.9471420832602741006e-7);
2660     const gmx_simd_double_t  FN5      = gmx_simd_set1_d(0.000020740315999115847456);
2661     const gmx_simd_double_t  FN4      = gmx_simd_set1_d(-0.00031991745139313364005);
2662     const gmx_simd_double_t  FN3      = gmx_simd_set1_d(0.0035074449373659008203);
2663     const gmx_simd_double_t  FN2      = gmx_simd_set1_d(-0.031750380176100813405);
2664     const gmx_simd_double_t  FN1      = gmx_simd_set1_d(0.13884101728898463426);
2665     const gmx_simd_double_t  FN0      = gmx_simd_set1_d(-0.75225277815249618847);
2666
2667     const gmx_simd_double_t  FD5      = gmx_simd_set1_d(0.000016009278224355026701);
2668     const gmx_simd_double_t  FD4      = gmx_simd_set1_d(0.00051055686934806966046);
2669     const gmx_simd_double_t  FD3      = gmx_simd_set1_d(0.0081803507497974289008);
2670     const gmx_simd_double_t  FD2      = gmx_simd_set1_d(0.077181146026670287235);
2671     const gmx_simd_double_t  FD1      = gmx_simd_set1_d(0.41543303143712535988);
2672     const gmx_simd_double_t  FD0      = gmx_simd_set1_d(1.0);
2673
2674     gmx_simd_double_t        z4;
2675     gmx_simd_double_t        polyFN0, polyFN1, polyFD0, polyFD1;
2676
2677     z4             = gmx_simd_mul_d(z2, z2);
2678
2679     polyFD1        = gmx_simd_fmadd_d(FD5, z4, FD3);
2680     polyFD1        = gmx_simd_fmadd_d(polyFD1, z4, FD1);
2681     polyFD1        = gmx_simd_mul_d(polyFD1, z2);
2682     polyFD0        = gmx_simd_fmadd_d(FD4, z4, FD2);
2683     polyFD0        = gmx_simd_fmadd_d(polyFD0, z4, FD0);
2684     polyFD0        = gmx_simd_add_d(polyFD0, polyFD1);
2685
2686     polyFD0        = gmx_simd_inv_d(polyFD0);
2687
2688     polyFN0        = gmx_simd_fmadd_d(FN10, z4, FN8);
2689     polyFN0        = gmx_simd_fmadd_d(polyFN0, z4, FN6);
2690     polyFN0        = gmx_simd_fmadd_d(polyFN0, z4, FN4);
2691     polyFN0        = gmx_simd_fmadd_d(polyFN0, z4, FN2);
2692     polyFN0        = gmx_simd_fmadd_d(polyFN0, z4, FN0);
2693     polyFN1        = gmx_simd_fmadd_d(FN9, z4, FN7);
2694     polyFN1        = gmx_simd_fmadd_d(polyFN1, z4, FN5);
2695     polyFN1        = gmx_simd_fmadd_d(polyFN1, z4, FN3);
2696     polyFN1        = gmx_simd_fmadd_d(polyFN1, z4, FN1);
2697     polyFN0        = gmx_simd_fmadd_d(polyFN1, z2, polyFN0);
2698
2699
2700     return gmx_simd_mul_d(polyFN0, polyFD0);
2701 }
2702
2703
2704
2705 /*! \brief Calculate the potential correction due to PME analytically for SIMD double.
2706  *
2707  * \copydetails gmx_simd_pmecorrV_f
2708  */
2709 static gmx_inline gmx_simd_double_t gmx_simdcall
2710 gmx_simd_pmecorrV_d(gmx_simd_double_t z2)
2711 {
2712     const gmx_simd_double_t  VN9      = gmx_simd_set1_d(-9.3723776169321855475e-13);
2713     const gmx_simd_double_t  VN8      = gmx_simd_set1_d(1.2280156762674215741e-10);
2714     const gmx_simd_double_t  VN7      = gmx_simd_set1_d(-7.3562157912251309487e-9);
2715     const gmx_simd_double_t  VN6      = gmx_simd_set1_d(2.6215886208032517509e-7);
2716     const gmx_simd_double_t  VN5      = gmx_simd_set1_d(-4.9532491651265819499e-6);
2717     const gmx_simd_double_t  VN4      = gmx_simd_set1_d(0.00025907400778966060389);
2718     const gmx_simd_double_t  VN3      = gmx_simd_set1_d(0.0010585044856156469792);
2719     const gmx_simd_double_t  VN2      = gmx_simd_set1_d(0.045247661136833092885);
2720     const gmx_simd_double_t  VN1      = gmx_simd_set1_d(0.11643931522926034421);
2721     const gmx_simd_double_t  VN0      = gmx_simd_set1_d(1.1283791671726767970);
2722
2723     const gmx_simd_double_t  VD5      = gmx_simd_set1_d(0.000021784709867336150342);
2724     const gmx_simd_double_t  VD4      = gmx_simd_set1_d(0.00064293662010911388448);
2725     const gmx_simd_double_t  VD3      = gmx_simd_set1_d(0.0096311444822588683504);
2726     const gmx_simd_double_t  VD2      = gmx_simd_set1_d(0.085608012351550627051);
2727     const gmx_simd_double_t  VD1      = gmx_simd_set1_d(0.43652499166614811084);
2728     const gmx_simd_double_t  VD0      = gmx_simd_set1_d(1.0);
2729
2730     gmx_simd_double_t        z4;
2731     gmx_simd_double_t        polyVN0, polyVN1, polyVD0, polyVD1;
2732
2733     z4             = gmx_simd_mul_d(z2, z2);
2734
2735     polyVD1        = gmx_simd_fmadd_d(VD5, z4, VD3);
2736     polyVD0        = gmx_simd_fmadd_d(VD4, z4, VD2);
2737     polyVD1        = gmx_simd_fmadd_d(polyVD1, z4, VD1);
2738     polyVD0        = gmx_simd_fmadd_d(polyVD0, z4, VD0);
2739     polyVD0        = gmx_simd_fmadd_d(polyVD1, z2, polyVD0);
2740
2741     polyVD0        = gmx_simd_inv_d(polyVD0);
2742
2743     polyVN1        = gmx_simd_fmadd_d(VN9, z4, VN7);
2744     polyVN0        = gmx_simd_fmadd_d(VN8, z4, VN6);
2745     polyVN1        = gmx_simd_fmadd_d(polyVN1, z4, VN5);
2746     polyVN0        = gmx_simd_fmadd_d(polyVN0, z4, VN4);
2747     polyVN1        = gmx_simd_fmadd_d(polyVN1, z4, VN3);
2748     polyVN0        = gmx_simd_fmadd_d(polyVN0, z4, VN2);
2749     polyVN1        = gmx_simd_fmadd_d(polyVN1, z4, VN1);
2750     polyVN0        = gmx_simd_fmadd_d(polyVN0, z4, VN0);
2751     polyVN0        = gmx_simd_fmadd_d(polyVN1, z2, polyVN0);
2752
2753     return gmx_simd_mul_d(polyVN0, polyVD0);
2754 }
2755
2756 /*! \} */
2757
2758
2759 /*! \name SIMD math functions for double prec. data, single prec. accuracy
2760  *
2761  *  \note In some cases we do not need full double accuracy of individual
2762  *        SIMD math functions, although the data is stored in double precision
2763  *        SIMD registers. This might be the case for special algorithms, or
2764  *        if the architecture does not support single precision.
2765  *        Since the full double precision evaluation of math functions
2766  *        typically require much more expensive polynomial approximations
2767  *        these functions implement the algorithms used in the single precision
2768  *        SIMD math functions, but they operate on double precision
2769  *        SIMD variables.
2770  *
2771  *  \note You should normally not use these functions directly, but the
2772  *        real-precision wrappers instead. When Gromacs is compiled in single
2773  *        precision, those will be aliases to the normal single precision
2774  *        SIMD math functions.
2775  *  \{
2776  */
2777
2778 /*********************************************************************
2779  * SIMD MATH FUNCTIONS WITH DOUBLE PREC. DATA, SINGLE PREC. ACCURACY *
2780  *********************************************************************/
2781
2782 /*! \brief Calculate 1/sqrt(x) for SIMD double, but in single accuracy.
2783  *
2784  * You should normally call the real-precision routine
2785  * \ref gmx_simd_invsqrt_singleaccuracy_r.
2786  *
2787  *  \param x Argument that must be >0. This routine does not check arguments.
2788  *  \return 1/sqrt(x). Result is undefined if your argument was invalid.
2789  */
2790 static gmx_inline gmx_simd_double_t gmx_simdcall
2791 gmx_simd_invsqrt_singleaccuracy_d(gmx_simd_double_t x)
2792 {
2793     gmx_simd_double_t lu = gmx_simd_rsqrt_d(x);
2794 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
2795     lu = gmx_simd_rsqrt_iter_d(lu, x);
2796 #endif
2797 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
2798     lu = gmx_simd_rsqrt_iter_d(lu, x);
2799 #endif
2800 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
2801     lu = gmx_simd_rsqrt_iter_d(lu, x);
2802 #endif
2803     return lu;
2804 }
2805
2806 /*! \brief 1/sqrt(x) for masked entries of SIMD double, but in single accuracy.
2807  *
2808  * \copydetails gmx_simd_invsqrt_maskfpe_f
2809  */
2810 static gmx_inline gmx_simd_double_t
2811 gmx_simd_invsqrt_maskfpe_singleaccuracy_d(gmx_simd_double_t x, gmx_simd_dbool_t gmx_unused m)
2812 {
2813 #ifdef NDEBUG
2814     return gmx_simd_invsqrt_singleaccuracy_d(x);
2815 #else
2816     return gmx_simd_invsqrt_singleaccuracy_d(gmx_simd_blendv_d(gmx_simd_set1_d(1.0), x, m));
2817 #endif
2818 }
2819
2820 /*! \brief 1/sqrt(x) for non-masked entries of SIMD double, in single accuracy.
2821  *
2822  * \copydetails gmx_simd_invsqrt_notmaskfpe_f
2823  */
2824 static gmx_inline gmx_simd_double_t
2825 gmx_simd_invsqrt_notmaskfpe_singleaccuracy_d(gmx_simd_double_t x, gmx_simd_dbool_t gmx_unused m)
2826 {
2827 #ifdef NDEBUG
2828     return gmx_simd_invsqrt_singleaccuracy_d(x);
2829 #else
2830     return gmx_simd_invsqrt_singleaccuracy_d(gmx_simd_blendv_d(x, gmx_simd_set1_d(1.0), m));
2831 #endif
2832 }
2833
2834 /*! \brief Calculate 1/sqrt(x) for two SIMD doubles, but single accuracy.
2835  *
2836  * You should normally call the real-precision routine
2837  * \ref gmx_simd_invsqrt_pair_singleaccuracy_r.
2838  *
2839  * \param x0  First set of arguments, x0 must be positive - no argument checking.
2840  * \param x1  Second set of arguments, x1 must be positive - no argument checking.
2841  * \param[out] out0  Result 1/sqrt(x0)
2842  * \param[out] out1  Result 1/sqrt(x1)
2843  *
2844  *  In particular for double precision we can sometimes calculate square root
2845  *  pairs slightly faster by using single precision until the very last step.
2846  */
2847 static gmx_inline void gmx_simdcall
2848 gmx_simd_invsqrt_pair_singleaccuracy_d(gmx_simd_double_t x0,    gmx_simd_double_t x1,
2849                                        gmx_simd_double_t *out0, gmx_simd_double_t *out1)
2850 {
2851 #if (defined GMX_SIMD_HAVE_FLOAT) && (GMX_SIMD_FLOAT_WIDTH == 2*GMX_SIMD_DOUBLE_WIDTH) && (GMX_SIMD_RSQRT_BITS < 22)
2852     gmx_simd_float_t  xf  = gmx_simd_cvt_dd2f(x0, x1);
2853     gmx_simd_float_t  luf = gmx_simd_rsqrt_f(xf);
2854     gmx_simd_double_t lu0, lu1;
2855     /* Intermediate target is single - mantissa+1 bits */
2856 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
2857     luf = gmx_simd_rsqrt_iter_f(luf, xf);
2858 #endif
2859 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
2860     luf = gmx_simd_rsqrt_iter_f(luf, xf);
2861 #endif
2862 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
2863     luf = gmx_simd_rsqrt_iter_f(luf, xf);
2864 #endif
2865     gmx_simd_cvt_f2dd(luf, &lu0, &lu1);
2866     /* We now have single-precision accuracy values in lu0/lu1 */
2867     *out0 = lu0;
2868     *out1 = lu1;
2869 #else
2870     *out0 = gmx_simd_invsqrt_singleaccuracy_d(x0);
2871     *out1 = gmx_simd_invsqrt_singleaccuracy_d(x1);
2872 #endif
2873 }
2874
2875
2876 /*! \brief Calculate 1/x for SIMD double, but in single accuracy.
2877  *
2878  * You should normally call the real-precision routine
2879  * \ref gmx_simd_inv_singleaccuracy_r.
2880  *
2881  *  \param x Argument that must be nonzero. This routine does not check arguments.
2882  *  \return 1/x. Result is undefined if your argument was invalid.
2883  */
2884 static gmx_inline gmx_simd_double_t gmx_simdcall
2885 gmx_simd_inv_singleaccuracy_d(gmx_simd_double_t x)
2886 {
2887     gmx_simd_double_t lu = gmx_simd_rcp_d(x);
2888 #if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
2889     lu = gmx_simd_rcp_iter_d(lu, x);
2890 #endif
2891 #if (GMX_SIMD_RCP_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
2892     lu = gmx_simd_rcp_iter_d(lu, x);
2893 #endif
2894 #if (GMX_SIMD_RCP_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
2895     lu = gmx_simd_rcp_iter_d(lu, x);
2896 #endif
2897     return lu;
2898 }
2899
2900 /*! \brief 1/x for masked entries of SIMD double, single accuracy.
2901  *
2902  * \copydetails gmx_simd_inv_maskfpe_f
2903  */
2904 static gmx_inline gmx_simd_double_t
2905 gmx_simd_inv_maskfpe_singleaccuracy_d(gmx_simd_double_t x, gmx_simd_dbool_t gmx_unused m)
2906 {
2907 #ifdef NDEBUG
2908     return gmx_simd_inv_singleaccuracy_d(x);
2909 #else
2910     return gmx_simd_inv_singleaccuracy_d(gmx_simd_blendv_d(gmx_simd_set1_d(1.0), x, m));
2911 #endif
2912 }
2913
2914 /*! \brief 1/x for non-masked entries of SIMD double, single accuracy.
2915  *
2916  * \copydetails gmx_simd_inv_notmaskfpe_f
2917  */
2918 static gmx_inline gmx_simd_double_t
2919 gmx_simd_inv_notmaskfpe_singleaccuracy_d(gmx_simd_double_t x, gmx_simd_dbool_t gmx_unused m)
2920 {
2921 #ifdef NDEBUG
2922     return gmx_simd_inv_singleaccuracy_d(x);
2923 #else
2924     return gmx_simd_inv_singleaccuracy_d(gmx_simd_blendv_d(x, gmx_simd_set1_d(1.0), m));
2925 #endif
2926 }
2927
2928 /*! \brief Calculate sqrt(x) (correct for 0.0) for SIMD double, single accuracy.
2929  *
2930  * You should normally call the real-precision routine \ref gmx_simd_sqrt_r.
2931  *
2932  *  \param x Argument that must be >=0.
2933  *  \return sqrt(x). If x=0, the result will correctly be set to 0.
2934  *          The result is undefined if the input value is negative.
2935  */
2936 static gmx_inline gmx_simd_double_t gmx_simdcall
2937 gmx_simd_sqrt_singleaccuracy_d(gmx_simd_double_t x)
2938 {
2939     gmx_simd_dbool_t   mask;
2940     gmx_simd_double_t  res;
2941
2942     mask = gmx_simd_cmpeq_d(x, gmx_simd_setzero_d());
2943     res  = gmx_simd_blendnotzero_d(gmx_simd_invsqrt_notmaskfpe_singleaccuracy_d(x, mask), mask);
2944     return gmx_simd_mul_d(res, x);
2945 }
2946
2947 /*! \brief SIMD log(x). Double precision SIMD data, single accuracy.
2948  *
2949  * You should normally call the real-precision routine
2950  * \ref gmx_simd_log_singleaccuracy_r.
2951  *
2952  * \param x Argument, should be >0.
2953  * \result The natural logarithm of x. Undefined if argument is invalid.
2954  */
2955 #ifndef gmx_simd_log_singleaccuracy_d
2956 static gmx_inline gmx_simd_double_t gmx_simdcall
2957 gmx_simd_log_singleaccuracy_d(gmx_simd_double_t x)
2958 {
2959     const gmx_simd_double_t  half       = gmx_simd_set1_d(0.5);
2960     const gmx_simd_double_t  one        = gmx_simd_set1_d(1.0);
2961     const gmx_simd_double_t  sqrt2      = gmx_simd_set1_d(sqrt(2.0));
2962     const gmx_simd_double_t  corr       = gmx_simd_set1_d(0.693147180559945286226764);
2963     const gmx_simd_double_t  CL9        = gmx_simd_set1_d(0.2371599674224853515625);
2964     const gmx_simd_double_t  CL7        = gmx_simd_set1_d(0.285279005765914916992188);
2965     const gmx_simd_double_t  CL5        = gmx_simd_set1_d(0.400005519390106201171875);
2966     const gmx_simd_double_t  CL3        = gmx_simd_set1_d(0.666666567325592041015625);
2967     const gmx_simd_double_t  CL1        = gmx_simd_set1_d(2.0);
2968     gmx_simd_double_t        fexp, x2, p;
2969     gmx_simd_dbool_t         mask;
2970
2971     fexp  = gmx_simd_get_exponent_d(x);
2972     x     = gmx_simd_get_mantissa_d(x);
2973
2974     mask  = gmx_simd_cmplt_d(sqrt2, x);
2975     /* Adjust to non-IEEE format for x>sqrt(2): exponent += 1, mantissa *= 0.5 */
2976     fexp  = gmx_simd_add_d(fexp, gmx_simd_blendzero_d(one, mask));
2977     x     = gmx_simd_mul_d(x, gmx_simd_blendv_d(one, half, mask));
2978
2979     x     = gmx_simd_mul_d( gmx_simd_sub_d(x, one), gmx_simd_inv_singleaccuracy_d( gmx_simd_add_d(x, one) ) );
2980     x2    = gmx_simd_mul_d(x, x);
2981
2982     p     = gmx_simd_fmadd_d(CL9, x2, CL7);
2983     p     = gmx_simd_fmadd_d(p, x2, CL5);
2984     p     = gmx_simd_fmadd_d(p, x2, CL3);
2985     p     = gmx_simd_fmadd_d(p, x2, CL1);
2986     p     = gmx_simd_fmadd_d(p, x, gmx_simd_mul_d(corr, fexp));
2987
2988     return p;
2989 }
2990 #endif
2991
2992 #ifndef gmx_simd_exp2_singleaccuracy_d
2993 /*! \brief SIMD 2^x. Double precision SIMD data, single accuracy.
2994  *
2995  * You should normally call the real-precision routine
2996  * \ref gmx_simd_exp2_singleaccuracy_r.
2997  *
2998  * \param x Argument.
2999  * \result 2^x. Undefined if input argument caused overflow.
3000  */
3001 static gmx_inline gmx_simd_double_t gmx_simdcall
3002 gmx_simd_exp2_singleaccuracy_d(gmx_simd_double_t x)
3003 {
3004     /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
3005     const gmx_simd_double_t  arglimit = gmx_simd_set1_d(126.0);
3006     const gmx_simd_double_t  CC6      = gmx_simd_set1_d(0.0001534581200287996416911311);
3007     const gmx_simd_double_t  CC5      = gmx_simd_set1_d(0.001339993121934088894618990);
3008     const gmx_simd_double_t  CC4      = gmx_simd_set1_d(0.009618488957115180159497841);
3009     const gmx_simd_double_t  CC3      = gmx_simd_set1_d(0.05550328776964726865751735);
3010     const gmx_simd_double_t  CC2      = gmx_simd_set1_d(0.2402264689063408646490722);
3011     const gmx_simd_double_t  CC1      = gmx_simd_set1_d(0.6931472057372680777553816);
3012     const gmx_simd_double_t  one      = gmx_simd_set1_d(1.0);
3013
3014     gmx_simd_double_t        fexppart;
3015     gmx_simd_double_t        intpart;
3016     gmx_simd_double_t        p;
3017     gmx_simd_dbool_t         valuemask;
3018
3019     fexppart  = gmx_simd_set_exponent_d(x);
3020     intpart   = gmx_simd_round_d(x);
3021     valuemask = gmx_simd_cmple_d(gmx_simd_fabs_d(x), arglimit);
3022     fexppart  = gmx_simd_blendzero_d(fexppart, valuemask);
3023     x         = gmx_simd_sub_d(x, intpart);
3024
3025     p         = gmx_simd_fmadd_d(CC6, x, CC5);
3026     p         = gmx_simd_fmadd_d(p, x, CC4);
3027     p         = gmx_simd_fmadd_d(p, x, CC3);
3028     p         = gmx_simd_fmadd_d(p, x, CC2);
3029     p         = gmx_simd_fmadd_d(p, x, CC1);
3030     p         = gmx_simd_fmadd_d(p, x, one);
3031     x         = gmx_simd_mul_d(p, fexppart);
3032     return x;
3033 }
3034 #endif
3035
3036 #ifndef gmx_simd_exp_singleaccuracy_d
3037 /*! \brief SIMD exp(x). Double precision SIMD data, single accuracy.
3038  *
3039  * You should normally call the real-precision routine
3040  * \ref gmx_simd_exp_singleaccuracy_r.
3041  *
3042  * \param x Argument.
3043  * \result exp(x). Undefined if input argument caused overflow.
3044  */
3045 static gmx_inline gmx_simd_double_t gmx_simdcall
3046 gmx_simd_exp_singleaccuracy_d(gmx_simd_double_t x)
3047 {
3048     const gmx_simd_double_t  argscale     = gmx_simd_set1_d(1.44269504088896341);
3049     /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
3050     const gmx_simd_double_t  arglimit     = gmx_simd_set1_d(126.0);
3051     const gmx_simd_double_t  invargscale  = gmx_simd_set1_d(0.69314718055994528623);
3052     const gmx_simd_double_t  CC4          = gmx_simd_set1_d(0.00136324646882712841033936);
3053     const gmx_simd_double_t  CC3          = gmx_simd_set1_d(0.00836596917361021041870117);
3054     const gmx_simd_double_t  CC2          = gmx_simd_set1_d(0.0416710823774337768554688);
3055     const gmx_simd_double_t  CC1          = gmx_simd_set1_d(0.166665524244308471679688);
3056     const gmx_simd_double_t  CC0          = gmx_simd_set1_d(0.499999850988388061523438);
3057     const gmx_simd_double_t  one          = gmx_simd_set1_d(1.0);
3058     gmx_simd_double_t        fexppart;
3059     gmx_simd_double_t        intpart;
3060     gmx_simd_double_t        y, p;
3061     gmx_simd_dbool_t         valuemask;
3062
3063     y         = gmx_simd_mul_d(x, argscale);
3064     fexppart  = gmx_simd_set_exponent_d(y);  /* rounds to nearest int internally */
3065     intpart   = gmx_simd_round_d(y);         /* use same rounding algorithm here */
3066     valuemask = gmx_simd_cmple_d(gmx_simd_fabs_d(y), arglimit);
3067     fexppart  = gmx_simd_blendzero_d(fexppart, valuemask);
3068
3069     /* Extended precision arithmetics not needed since
3070      * we have double precision and only need single accuracy.
3071      */
3072     x         = gmx_simd_fnmadd_d(invargscale, intpart, x);
3073
3074     p         = gmx_simd_fmadd_d(CC4, x, CC3);
3075     p         = gmx_simd_fmadd_d(p, x, CC2);
3076     p         = gmx_simd_fmadd_d(p, x, CC1);
3077     p         = gmx_simd_fmadd_d(p, x, CC0);
3078     p         = gmx_simd_fmadd_d(gmx_simd_mul_d(x, x), p, x);
3079     p         = gmx_simd_add_d(p, one);
3080     x         = gmx_simd_mul_d(p, fexppart);
3081     return x;
3082 }
3083 #endif
3084
3085 /*! \brief SIMD erf(x). Double precision SIMD data, single accuracy.
3086  *
3087  * You should normally call the real-precision routine
3088  * \ref gmx_simd_erf_singleaccuracy_r.
3089  *
3090  * \param x The value to calculate erf(x) for.
3091  * \result erf(x)
3092  *
3093  * This routine achieves very close to single precision, but we do not care about
3094  * the last bit or the subnormal result range.
3095  */
3096 static gmx_inline gmx_simd_double_t gmx_simdcall
3097 gmx_simd_erf_singleaccuracy_d(gmx_simd_double_t x)
3098 {
3099     /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
3100     const gmx_simd_double_t  CA6      = gmx_simd_set1_d(7.853861353153693e-5);
3101     const gmx_simd_double_t  CA5      = gmx_simd_set1_d(-8.010193625184903e-4);
3102     const gmx_simd_double_t  CA4      = gmx_simd_set1_d(5.188327685732524e-3);
3103     const gmx_simd_double_t  CA3      = gmx_simd_set1_d(-2.685381193529856e-2);
3104     const gmx_simd_double_t  CA2      = gmx_simd_set1_d(1.128358514861418e-1);
3105     const gmx_simd_double_t  CA1      = gmx_simd_set1_d(-3.761262582423300e-1);
3106     const gmx_simd_double_t  CA0      = gmx_simd_set1_d(1.128379165726710);
3107     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
3108     const gmx_simd_double_t  CB9      = gmx_simd_set1_d(-0.0018629930017603923);
3109     const gmx_simd_double_t  CB8      = gmx_simd_set1_d(0.003909821287598495);
3110     const gmx_simd_double_t  CB7      = gmx_simd_set1_d(-0.0052094582210355615);
3111     const gmx_simd_double_t  CB6      = gmx_simd_set1_d(0.005685614362160572);
3112     const gmx_simd_double_t  CB5      = gmx_simd_set1_d(-0.0025367682853477272);
3113     const gmx_simd_double_t  CB4      = gmx_simd_set1_d(-0.010199799682318782);
3114     const gmx_simd_double_t  CB3      = gmx_simd_set1_d(0.04369575504816542);
3115     const gmx_simd_double_t  CB2      = gmx_simd_set1_d(-0.11884063474674492);
3116     const gmx_simd_double_t  CB1      = gmx_simd_set1_d(0.2732120154030589);
3117     const gmx_simd_double_t  CB0      = gmx_simd_set1_d(0.42758357702025784);
3118     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
3119     const gmx_simd_double_t  CC10     = gmx_simd_set1_d(-0.0445555913112064);
3120     const gmx_simd_double_t  CC9      = gmx_simd_set1_d(0.21376355144663348);
3121     const gmx_simd_double_t  CC8      = gmx_simd_set1_d(-0.3473187200259257);
3122     const gmx_simd_double_t  CC7      = gmx_simd_set1_d(0.016690861551248114);
3123     const gmx_simd_double_t  CC6      = gmx_simd_set1_d(0.7560973182491192);
3124     const gmx_simd_double_t  CC5      = gmx_simd_set1_d(-1.2137903600145787);
3125     const gmx_simd_double_t  CC4      = gmx_simd_set1_d(0.8411872321232948);
3126     const gmx_simd_double_t  CC3      = gmx_simd_set1_d(-0.08670413896296343);
3127     const gmx_simd_double_t  CC2      = gmx_simd_set1_d(-0.27124782687240334);
3128     const gmx_simd_double_t  CC1      = gmx_simd_set1_d(-0.0007502488047806069);
3129     const gmx_simd_double_t  CC0      = gmx_simd_set1_d(0.5642114853803148);
3130     const gmx_simd_double_t  one      = gmx_simd_set1_d(1.0);
3131     const gmx_simd_double_t  two      = gmx_simd_set1_d(2.0);
3132
3133     gmx_simd_double_t        x2, x4, y;
3134     gmx_simd_double_t        t, t2, w, w2;
3135     gmx_simd_double_t        pA0, pA1, pB0, pB1, pC0, pC1;
3136     gmx_simd_double_t        expmx2;
3137     gmx_simd_double_t        res_erf, res_erfc, res;
3138     gmx_simd_dbool_t         mask, msk_erf;
3139
3140     /* Calculate erf() */
3141     x2   = gmx_simd_mul_d(x, x);
3142     x4   = gmx_simd_mul_d(x2, x2);
3143
3144     pA0  = gmx_simd_fmadd_d(CA6, x4, CA4);
3145     pA1  = gmx_simd_fmadd_d(CA5, x4, CA3);
3146     pA0  = gmx_simd_fmadd_d(pA0, x4, CA2);
3147     pA1  = gmx_simd_fmadd_d(pA1, x4, CA1);
3148     pA0  = gmx_simd_mul_d(pA0, x4);
3149     pA0  = gmx_simd_fmadd_d(pA1, x2, pA0);
3150     /* Constant term must come last for precision reasons */
3151     pA0  = gmx_simd_add_d(pA0, CA0);
3152
3153     res_erf = gmx_simd_mul_d(x, pA0);
3154
3155     /* Calculate erfc */
3156     y       = gmx_simd_fabs_d(x);
3157     msk_erf = gmx_simd_cmplt_d(y, gmx_simd_set1_d(0.75));
3158     t       = gmx_simd_inv_notmaskfpe_singleaccuracy_d(y, msk_erf);
3159     w       = gmx_simd_sub_d(t, one);
3160     t2      = gmx_simd_mul_d(t, t);
3161     w2      = gmx_simd_mul_d(w, w);
3162
3163     expmx2  = gmx_simd_exp_singleaccuracy_d( gmx_simd_fneg_d( gmx_simd_mul_d(y, y)));
3164
3165     pB1  = gmx_simd_fmadd_d(CB9, w2, CB7);
3166     pB0  = gmx_simd_fmadd_d(CB8, w2, CB6);
3167     pB1  = gmx_simd_fmadd_d(pB1, w2, CB5);
3168     pB0  = gmx_simd_fmadd_d(pB0, w2, CB4);
3169     pB1  = gmx_simd_fmadd_d(pB1, w2, CB3);
3170     pB0  = gmx_simd_fmadd_d(pB0, w2, CB2);
3171     pB1  = gmx_simd_fmadd_d(pB1, w2, CB1);
3172     pB0  = gmx_simd_fmadd_d(pB0, w2, CB0);
3173     pB0  = gmx_simd_fmadd_d(pB1, w, pB0);
3174
3175     pC0  = gmx_simd_fmadd_d(CC10, t2, CC8);
3176     pC1  = gmx_simd_fmadd_d(CC9, t2, CC7);
3177     pC0  = gmx_simd_fmadd_d(pC0, t2, CC6);
3178     pC1  = gmx_simd_fmadd_d(pC1, t2, CC5);
3179     pC0  = gmx_simd_fmadd_d(pC0, t2, CC4);
3180     pC1  = gmx_simd_fmadd_d(pC1, t2, CC3);
3181     pC0  = gmx_simd_fmadd_d(pC0, t2, CC2);
3182     pC1  = gmx_simd_fmadd_d(pC1, t2, CC1);
3183
3184     pC0  = gmx_simd_fmadd_d(pC0, t2, CC0);
3185     pC0  = gmx_simd_fmadd_d(pC1, t, pC0);
3186     pC0  = gmx_simd_mul_d(pC0, t);
3187
3188     /* SELECT pB0 or pC0 for erfc() */
3189     mask     = gmx_simd_cmplt_d(two, y);
3190     res_erfc = gmx_simd_blendv_d(pB0, pC0, mask);
3191     res_erfc = gmx_simd_mul_d(res_erfc, expmx2);
3192
3193     /* erfc(x<0) = 2-erfc(|x|) */
3194     mask     = gmx_simd_cmplt_d(x, gmx_simd_setzero_d());
3195     res_erfc = gmx_simd_blendv_d(res_erfc, gmx_simd_sub_d(two, res_erfc), mask);
3196
3197     /* Select erf() or erfc() */
3198     mask = gmx_simd_cmplt_d(y, gmx_simd_set1_d(0.75));
3199     res  = gmx_simd_blendv_d(gmx_simd_sub_d(one, res_erfc), res_erf, mask);
3200
3201     return res;
3202 }
3203
3204 /*! \brief SIMD erfc(x). Double precision SIMD data, single accuracy.
3205  *
3206  * You should normally call the real-precision routine
3207  * \ref gmx_simd_erfc_singleaccuracy_r.
3208  *
3209  * \param x The value to calculate erfc(x) for.
3210  * \result erfc(x)
3211  *
3212  * This routine achieves singleprecision (bar the last bit) over most of the
3213  * input range, but for large arguments where the result is getting close
3214  * to the minimum representable numbers we accept slightly larger errors
3215  * (think results that are in the ballpark of 10^-30) since that is not
3216  * relevant for MD.
3217  */
3218 static gmx_inline gmx_simd_double_t gmx_simdcall
3219 gmx_simd_erfc_singleaccuracy_d(gmx_simd_double_t x)
3220 {
3221     /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
3222     const gmx_simd_double_t  CA6      = gmx_simd_set1_d(7.853861353153693e-5);
3223     const gmx_simd_double_t  CA5      = gmx_simd_set1_d(-8.010193625184903e-4);
3224     const gmx_simd_double_t  CA4      = gmx_simd_set1_d(5.188327685732524e-3);
3225     const gmx_simd_double_t  CA3      = gmx_simd_set1_d(-2.685381193529856e-2);
3226     const gmx_simd_double_t  CA2      = gmx_simd_set1_d(1.128358514861418e-1);
3227     const gmx_simd_double_t  CA1      = gmx_simd_set1_d(-3.761262582423300e-1);
3228     const gmx_simd_double_t  CA0      = gmx_simd_set1_d(1.128379165726710);
3229     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
3230     const gmx_simd_double_t  CB9      = gmx_simd_set1_d(-0.0018629930017603923);
3231     const gmx_simd_double_t  CB8      = gmx_simd_set1_d(0.003909821287598495);
3232     const gmx_simd_double_t  CB7      = gmx_simd_set1_d(-0.0052094582210355615);
3233     const gmx_simd_double_t  CB6      = gmx_simd_set1_d(0.005685614362160572);
3234     const gmx_simd_double_t  CB5      = gmx_simd_set1_d(-0.0025367682853477272);
3235     const gmx_simd_double_t  CB4      = gmx_simd_set1_d(-0.010199799682318782);
3236     const gmx_simd_double_t  CB3      = gmx_simd_set1_d(0.04369575504816542);
3237     const gmx_simd_double_t  CB2      = gmx_simd_set1_d(-0.11884063474674492);
3238     const gmx_simd_double_t  CB1      = gmx_simd_set1_d(0.2732120154030589);
3239     const gmx_simd_double_t  CB0      = gmx_simd_set1_d(0.42758357702025784);
3240     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
3241     const gmx_simd_double_t  CC10     = gmx_simd_set1_d(-0.0445555913112064);
3242     const gmx_simd_double_t  CC9      = gmx_simd_set1_d(0.21376355144663348);
3243     const gmx_simd_double_t  CC8      = gmx_simd_set1_d(-0.3473187200259257);
3244     const gmx_simd_double_t  CC7      = gmx_simd_set1_d(0.016690861551248114);
3245     const gmx_simd_double_t  CC6      = gmx_simd_set1_d(0.7560973182491192);
3246     const gmx_simd_double_t  CC5      = gmx_simd_set1_d(-1.2137903600145787);
3247     const gmx_simd_double_t  CC4      = gmx_simd_set1_d(0.8411872321232948);
3248     const gmx_simd_double_t  CC3      = gmx_simd_set1_d(-0.08670413896296343);
3249     const gmx_simd_double_t  CC2      = gmx_simd_set1_d(-0.27124782687240334);
3250     const gmx_simd_double_t  CC1      = gmx_simd_set1_d(-0.0007502488047806069);
3251     const gmx_simd_double_t  CC0      = gmx_simd_set1_d(0.5642114853803148);
3252     const gmx_simd_double_t  one      = gmx_simd_set1_d(1.0);
3253     const gmx_simd_double_t  two      = gmx_simd_set1_d(2.0);
3254
3255     gmx_simd_double_t        x2, x4, y;
3256     gmx_simd_double_t        t, t2, w, w2;
3257     gmx_simd_double_t        pA0, pA1, pB0, pB1, pC0, pC1;
3258     gmx_simd_double_t        expmx2;
3259     gmx_simd_double_t        res_erf, res_erfc, res;
3260     gmx_simd_dbool_t         mask, msk_erf;
3261
3262     /* Calculate erf() */
3263     x2     = gmx_simd_mul_d(x, x);
3264     x4     = gmx_simd_mul_d(x2, x2);
3265
3266     pA0  = gmx_simd_fmadd_d(CA6, x4, CA4);
3267     pA1  = gmx_simd_fmadd_d(CA5, x4, CA3);
3268     pA0  = gmx_simd_fmadd_d(pA0, x4, CA2);
3269     pA1  = gmx_simd_fmadd_d(pA1, x4, CA1);
3270     pA1  = gmx_simd_mul_d(pA1, x2);
3271     pA0  = gmx_simd_fmadd_d(pA0, x4, pA1);
3272     /* Constant term must come last for precision reasons */
3273     pA0  = gmx_simd_add_d(pA0, CA0);
3274
3275     res_erf = gmx_simd_mul_d(x, pA0);
3276
3277     /* Calculate erfc */
3278     y       = gmx_simd_fabs_d(x);
3279     msk_erf = gmx_simd_cmplt_d(y, gmx_simd_set1_d(0.75));
3280     t       = gmx_simd_inv_notmaskfpe_singleaccuracy_d(y, msk_erf);
3281     w       = gmx_simd_sub_d(t, one);
3282     t2      = gmx_simd_mul_d(t, t);
3283     w2      = gmx_simd_mul_d(w, w);
3284
3285     expmx2  = gmx_simd_exp_singleaccuracy_d( gmx_simd_fneg_d( gmx_simd_mul_d(y, y) ) );
3286
3287     pB1  = gmx_simd_fmadd_d(CB9, w2, CB7);
3288     pB0  = gmx_simd_fmadd_d(CB8, w2, CB6);
3289     pB1  = gmx_simd_fmadd_d(pB1, w2, CB5);
3290     pB0  = gmx_simd_fmadd_d(pB0, w2, CB4);
3291     pB1  = gmx_simd_fmadd_d(pB1, w2, CB3);
3292     pB0  = gmx_simd_fmadd_d(pB0, w2, CB2);
3293     pB1  = gmx_simd_fmadd_d(pB1, w2, CB1);
3294     pB0  = gmx_simd_fmadd_d(pB0, w2, CB0);
3295     pB0  = gmx_simd_fmadd_d(pB1, w, pB0);
3296
3297     pC0  = gmx_simd_fmadd_d(CC10, t2, CC8);
3298     pC1  = gmx_simd_fmadd_d(CC9, t2, CC7);
3299     pC0  = gmx_simd_fmadd_d(pC0, t2, CC6);
3300     pC1  = gmx_simd_fmadd_d(pC1, t2, CC5);
3301     pC0  = gmx_simd_fmadd_d(pC0, t2, CC4);
3302     pC1  = gmx_simd_fmadd_d(pC1, t2, CC3);
3303     pC0  = gmx_simd_fmadd_d(pC0, t2, CC2);
3304     pC1  = gmx_simd_fmadd_d(pC1, t2, CC1);
3305
3306     pC0  = gmx_simd_fmadd_d(pC0, t2, CC0);
3307     pC0  = gmx_simd_fmadd_d(pC1, t, pC0);
3308     pC0  = gmx_simd_mul_d(pC0, t);
3309
3310     /* SELECT pB0 or pC0 for erfc() */
3311     mask     = gmx_simd_cmplt_d(two, y);
3312     res_erfc = gmx_simd_blendv_d(pB0, pC0, mask);
3313     res_erfc = gmx_simd_mul_d(res_erfc, expmx2);
3314
3315     /* erfc(x<0) = 2-erfc(|x|) */
3316     mask     = gmx_simd_cmplt_d(x, gmx_simd_setzero_d());
3317     res_erfc = gmx_simd_blendv_d(res_erfc, gmx_simd_sub_d(two, res_erfc), mask);
3318
3319     /* Select erf() or erfc() */
3320     mask = gmx_simd_cmplt_d(y, gmx_simd_set1_d(0.75));
3321     res  = gmx_simd_blendv_d(res_erfc, gmx_simd_sub_d(one, res_erf), mask);
3322
3323     return res;
3324 }
3325
3326 /*! \brief SIMD sin \& cos. Double precision SIMD data, single accuracy.
3327  *
3328  * You should normally call the real-precision routine
3329  * \ref gmx_simd_sincos_singleaccuracy_r.
3330  *
3331  * \param x The argument to evaluate sin/cos for
3332  * \param[out] sinval Sin(x)
3333  * \param[out] cosval Cos(x)
3334  *
3335  */
3336 static gmx_inline void gmx_simdcall
3337 gmx_simd_sincos_singleaccuracy_d(gmx_simd_double_t x, gmx_simd_double_t *sinval, gmx_simd_double_t *cosval)
3338 {
3339     /* Constants to subtract Pi/4*x from y while minimizing precision loss */
3340     const gmx_simd_double_t  argred0         = gmx_simd_set1_d(2*0.78539816290140151978);
3341     const gmx_simd_double_t  argred1         = gmx_simd_set1_d(2*4.9604678871439933374e-10);
3342     const gmx_simd_double_t  argred2         = gmx_simd_set1_d(2*1.1258708853173288931e-18);
3343     const gmx_simd_double_t  two_over_pi     = gmx_simd_set1_d(2.0/M_PI);
3344     const gmx_simd_double_t  const_sin2      = gmx_simd_set1_d(-1.9515295891e-4);
3345     const gmx_simd_double_t  const_sin1      = gmx_simd_set1_d( 8.3321608736e-3);
3346     const gmx_simd_double_t  const_sin0      = gmx_simd_set1_d(-1.6666654611e-1);
3347     const gmx_simd_double_t  const_cos2      = gmx_simd_set1_d( 2.443315711809948e-5);
3348     const gmx_simd_double_t  const_cos1      = gmx_simd_set1_d(-1.388731625493765e-3);
3349     const gmx_simd_double_t  const_cos0      = gmx_simd_set1_d( 4.166664568298827e-2);
3350
3351     const gmx_simd_double_t  half            = gmx_simd_set1_d(0.5);
3352     const gmx_simd_double_t  one             = gmx_simd_set1_d(1.0);
3353     gmx_simd_double_t        ssign, csign;
3354     gmx_simd_double_t        x2, y, z, psin, pcos, sss, ccc;
3355     gmx_simd_dbool_t         mask;
3356 #if (defined GMX_SIMD_HAVE_FINT32) && (defined GMX_SIMD_HAVE_FINT32_ARITHMETICS) && (defined GMX_SIMD_HAVE_LOGICAL)
3357     const gmx_simd_dint32_t  ione            = gmx_simd_set1_di(1);
3358     const gmx_simd_dint32_t  itwo            = gmx_simd_set1_di(2);
3359     gmx_simd_dint32_t        iy;
3360
3361     z       = gmx_simd_mul_d(x, two_over_pi);
3362     iy      = gmx_simd_cvt_d2i(z);
3363     y       = gmx_simd_round_d(z);
3364
3365     mask    = gmx_simd_cvt_dib2db(gmx_simd_cmpeq_di(gmx_simd_and_di(iy, ione), gmx_simd_setzero_di()));
3366     ssign   = gmx_simd_blendzero_d(gmx_simd_set1_d(-0.0), gmx_simd_cvt_dib2db(gmx_simd_cmpeq_di(gmx_simd_and_di(iy, itwo), itwo)));
3367     csign   = gmx_simd_blendzero_d(gmx_simd_set1_d(-0.0), gmx_simd_cvt_dib2db(gmx_simd_cmpeq_di(gmx_simd_and_di(gmx_simd_add_di(iy, ione), itwo), itwo)));
3368 #else
3369     const gmx_simd_double_t  quarter         = gmx_simd_set1_d(0.25);
3370     const gmx_simd_double_t  minusquarter    = gmx_simd_set1_d(-0.25);
3371     gmx_simd_double_t        q;
3372     gmx_simd_dbool_t         m1, m2, m3;
3373
3374     /* The most obvious way to find the arguments quadrant in the unit circle
3375      * to calculate the sign is to use integer arithmetic, but that is not
3376      * present in all SIMD implementations. As an alternative, we have devised a
3377      * pure floating-point algorithm that uses truncation for argument reduction
3378      * so that we get a new value 0<=q<1 over the unit circle, and then
3379      * do floating-point comparisons with fractions. This is likely to be
3380      * slightly slower (~10%) due to the longer latencies of floating-point, so
3381      * we only use it when integer SIMD arithmetic is not present.
3382      */
3383     ssign   = x;
3384     x       = gmx_simd_fabs_d(x);
3385     /* It is critical that half-way cases are rounded down */
3386     z       = gmx_simd_fmadd_d(x, two_over_pi, half);
3387     y       = gmx_simd_trunc_d(z);
3388     q       = gmx_simd_mul_d(z, quarter);
3389     q       = gmx_simd_sub_d(q, gmx_simd_trunc_d(q));
3390     /* z now starts at 0.0 for x=-pi/4 (although neg. values cannot occur), and
3391      * then increased by 1.0 as x increases by 2*Pi, when it resets to 0.0.
3392      * This removes the 2*Pi periodicity without using any integer arithmetic.
3393      * First check if y had the value 2 or 3, set csign if true.
3394      */
3395     q       = gmx_simd_sub_d(q, half);
3396     /* If we have logical operations we can work directly on the signbit, which
3397      * saves instructions. Otherwise we need to represent signs as +1.0/-1.0.
3398      * Thus, if you are altering defines to debug alternative code paths, the
3399      * two GMX_SIMD_HAVE_LOGICAL sections in this routine must either both be
3400      * active or inactive - you will get errors if only one is used.
3401      */
3402 #    ifdef GMX_SIMD_HAVE_LOGICAL
3403     ssign   = gmx_simd_and_d(ssign, gmx_simd_set1_d(-0.0));
3404     csign   = gmx_simd_andnot_d(q, gmx_simd_set1_d(-0.0));
3405     ssign   = gmx_simd_xor_d(ssign, csign);
3406 #    else
3407     csign   = gmx_simd_xor_sign_d(gmx_simd_set1_d(-1.0), q);
3408
3409     ssign   = gmx_simd_xor_sign_d(ssign, csign);    /* swap ssign if csign was set. */
3410 #    endif
3411     /* Check if y had value 1 or 3 (remember we subtracted 0.5 from q) */
3412     m1      = gmx_simd_cmplt_d(q, minusquarter);
3413     m2      = gmx_simd_cmple_d(gmx_simd_setzero_d(), q);
3414     m3      = gmx_simd_cmplt_d(q, quarter);
3415     m2      = gmx_simd_and_db(m2, m3);
3416     mask    = gmx_simd_or_db(m1, m2);
3417     /* where mask is FALSE, set sign. */
3418     csign   = gmx_simd_xor_sign_d(csign, gmx_simd_blendv_d(gmx_simd_set1_d(-1.0), one, mask));
3419 #endif
3420     x       = gmx_simd_fnmadd_d(y, argred0, x);
3421     x       = gmx_simd_fnmadd_d(y, argred1, x);
3422     x       = gmx_simd_fnmadd_d(y, argred2, x);
3423     x2      = gmx_simd_mul_d(x, x);
3424
3425     psin    = gmx_simd_fmadd_d(const_sin2, x2, const_sin1);
3426     psin    = gmx_simd_fmadd_d(psin, x2, const_sin0);
3427     psin    = gmx_simd_fmadd_d(psin, gmx_simd_mul_d(x, x2), x);
3428     pcos    = gmx_simd_fmadd_d(const_cos2, x2, const_cos1);
3429     pcos    = gmx_simd_fmadd_d(pcos, x2, const_cos0);
3430     pcos    = gmx_simd_fmsub_d(pcos, x2, half);
3431     pcos    = gmx_simd_fmadd_d(pcos, x2, one);
3432
3433     sss     = gmx_simd_blendv_d(pcos, psin, mask);
3434     ccc     = gmx_simd_blendv_d(psin, pcos, mask);
3435     /* See comment for GMX_SIMD_HAVE_LOGICAL section above. */
3436 #ifdef GMX_SIMD_HAVE_LOGICAL
3437     *sinval = gmx_simd_xor_d(sss, ssign);
3438     *cosval = gmx_simd_xor_d(ccc, csign);
3439 #else
3440     *sinval = gmx_simd_xor_sign_d(sss, ssign);
3441     *cosval = gmx_simd_xor_sign_d(ccc, csign);
3442 #endif
3443 }
3444
3445 /*! \brief SIMD sin(x). Double precision SIMD data, single accuracy.
3446  *
3447  * You should normally call the real-precision routine
3448  * \ref gmx_simd_sin_singleaccuracy_r.
3449  *
3450  * \param x The argument to evaluate sin for
3451  * \result Sin(x)
3452  *
3453  * \attention Do NOT call both sin & cos if you need both results, since each of them
3454  * will then call \ref gmx_simd_sincos_r and waste a factor 2 in performance.
3455  */
3456 static gmx_inline gmx_simd_double_t gmx_simdcall
3457 gmx_simd_sin_singleaccuracy_d(gmx_simd_double_t x)
3458 {
3459     gmx_simd_double_t s, c;
3460     gmx_simd_sincos_singleaccuracy_d(x, &s, &c);
3461     return s;
3462 }
3463
3464 /*! \brief SIMD cos(x). Double precision SIMD data, single accuracy.
3465  *
3466  * You should normally call the real-precision routine
3467  * \ref gmx_simd_cos_singleaccuracy_r.
3468  *
3469  * \param x The argument to evaluate cos for
3470  * \result Cos(x)
3471  *
3472  * \attention Do NOT call both sin & cos if you need both results, since each of them
3473  * will then call \ref gmx_simd_sincos_r and waste a factor 2 in performance.
3474  */
3475 static gmx_inline gmx_simd_double_t gmx_simdcall
3476 gmx_simd_cos_singleaccuracy_d(gmx_simd_double_t x)
3477 {
3478     gmx_simd_double_t s, c;
3479     gmx_simd_sincos_singleaccuracy_d(x, &s, &c);
3480     return c;
3481 }
3482
3483 /*! \brief SIMD tan(x). Double precision SIMD data, single accuracy.
3484  *
3485  * You should normally call the real-precision routine
3486  * \ref gmx_simd_tan_singleaccuracy_r.
3487  *
3488  * \param x The argument to evaluate tan for
3489  * \result Tan(x)
3490  */
3491 static gmx_inline gmx_simd_double_t gmx_simdcall
3492 gmx_simd_tan_singleaccuracy_d(gmx_simd_double_t x)
3493 {
3494     const gmx_simd_double_t  argred0         = gmx_simd_set1_d(2*0.78539816290140151978);
3495     const gmx_simd_double_t  argred1         = gmx_simd_set1_d(2*4.9604678871439933374e-10);
3496     const gmx_simd_double_t  argred2         = gmx_simd_set1_d(2*1.1258708853173288931e-18);
3497     const gmx_simd_double_t  two_over_pi     = gmx_simd_set1_d(2.0/M_PI);
3498     const gmx_simd_double_t  CT6             = gmx_simd_set1_d(0.009498288995810566122993911);
3499     const gmx_simd_double_t  CT5             = gmx_simd_set1_d(0.002895755790837379295226923);
3500     const gmx_simd_double_t  CT4             = gmx_simd_set1_d(0.02460087336161924491836265);
3501     const gmx_simd_double_t  CT3             = gmx_simd_set1_d(0.05334912882656359828045988);
3502     const gmx_simd_double_t  CT2             = gmx_simd_set1_d(0.1333989091464957704418495);
3503     const gmx_simd_double_t  CT1             = gmx_simd_set1_d(0.3333307599244198227797507);
3504
3505     gmx_simd_double_t        x2, p, y, z;
3506     gmx_simd_dbool_t         mask;
3507
3508 #if (defined GMX_SIMD_HAVE_FINT32) && (defined GMX_SIMD_HAVE_FINT32_ARITHMETICS) && (defined GMX_SIMD_HAVE_LOGICAL)
3509     gmx_simd_dint32_t  iy;
3510     gmx_simd_dint32_t  ione = gmx_simd_set1_di(1);
3511
3512     z       = gmx_simd_mul_d(x, two_over_pi);
3513     iy      = gmx_simd_cvt_d2i(z);
3514     y       = gmx_simd_round_d(z);
3515     mask    = gmx_simd_cvt_dib2db(gmx_simd_cmpeq_di(gmx_simd_and_di(iy, ione), ione));
3516
3517     x       = gmx_simd_fnmadd_d(y, argred0, x);
3518     x       = gmx_simd_fnmadd_d(y, argred1, x);
3519     x       = gmx_simd_fnmadd_d(y, argred2, x);
3520     x       = gmx_simd_xor_d(gmx_simd_blendzero_d(gmx_simd_set1_d(-0.0), mask), x);
3521 #else
3522     const gmx_simd_double_t  quarter         = gmx_simd_set1_d(0.25);
3523     const gmx_simd_double_t  half            = gmx_simd_set1_d(0.5);
3524     const gmx_simd_double_t  threequarter    = gmx_simd_set1_d(0.75);
3525     gmx_simd_double_t        w, q;
3526     gmx_simd_dbool_t         m1, m2, m3;
3527
3528     w       = gmx_simd_fabs_d(x);
3529     z       = gmx_simd_fmadd_d(w, two_over_pi, half);
3530     y       = gmx_simd_trunc_d(z);
3531     q       = gmx_simd_mul_d(z, quarter);
3532     q       = gmx_simd_sub_d(q, gmx_simd_trunc_d(q));
3533     m1      = gmx_simd_cmple_d(quarter, q);
3534     m2      = gmx_simd_cmplt_d(q, half);
3535     m3      = gmx_simd_cmple_d(threequarter, q);
3536     m1      = gmx_simd_and_db(m1, m2);
3537     mask    = gmx_simd_or_db(m1, m3);
3538     w       = gmx_simd_fnmadd_d(y, argred0, w);
3539     w       = gmx_simd_fnmadd_d(y, argred1, w);
3540     w       = gmx_simd_fnmadd_d(y, argred2, w);
3541
3542     w       = gmx_simd_blendv_d(w, gmx_simd_fneg_d(w), mask);
3543     x       = gmx_simd_xor_sign_d(w, x);
3544 #endif
3545     x2      = gmx_simd_mul_d(x, x);
3546     p       = gmx_simd_fmadd_d(CT6, x2, CT5);
3547     p       = gmx_simd_fmadd_d(p, x2, CT4);
3548     p       = gmx_simd_fmadd_d(p, x2, CT3);
3549     p       = gmx_simd_fmadd_d(p, x2, CT2);
3550     p       = gmx_simd_fmadd_d(p, x2, CT1);
3551     p       = gmx_simd_fmadd_d(x2, gmx_simd_mul_d(p, x), x);
3552
3553     p       = gmx_simd_blendv_d( p, gmx_simd_inv_maskfpe_singleaccuracy_d(p, mask), mask);
3554     return p;
3555 }
3556
3557 /*! \brief SIMD asin(x). Double precision SIMD data, single accuracy.
3558  *
3559  * You should normally call the real-precision routine
3560  * \ref gmx_simd_asin_singleaccuracy_r.
3561  *
3562  * \param x The argument to evaluate asin for
3563  * \result Asin(x)
3564  */
3565 static gmx_inline gmx_simd_double_t gmx_simdcall
3566 gmx_simd_asin_singleaccuracy_d(gmx_simd_double_t x)
3567 {
3568     const gmx_simd_double_t limitlow   = gmx_simd_set1_d(1e-4);
3569     const gmx_simd_double_t half       = gmx_simd_set1_d(0.5);
3570     const gmx_simd_double_t one        = gmx_simd_set1_d(1.0);
3571     const gmx_simd_double_t halfpi     = gmx_simd_set1_d(M_PI/2.0);
3572     const gmx_simd_double_t CC5        = gmx_simd_set1_d(4.2163199048E-2);
3573     const gmx_simd_double_t CC4        = gmx_simd_set1_d(2.4181311049E-2);
3574     const gmx_simd_double_t CC3        = gmx_simd_set1_d(4.5470025998E-2);
3575     const gmx_simd_double_t CC2        = gmx_simd_set1_d(7.4953002686E-2);
3576     const gmx_simd_double_t CC1        = gmx_simd_set1_d(1.6666752422E-1);
3577     gmx_simd_double_t       xabs;
3578     gmx_simd_double_t       z, z1, z2, q, q1, q2;
3579     gmx_simd_double_t       pA, pB;
3580     gmx_simd_dbool_t        mask, mask2;
3581
3582     xabs  = gmx_simd_fabs_d(x);
3583     mask  = gmx_simd_cmplt_d(half, xabs);
3584     z1    = gmx_simd_mul_d(half, gmx_simd_sub_d(one, xabs));
3585     mask2 = gmx_simd_cmpeq_d(xabs, one);
3586     q1    = gmx_simd_mul_d(z1, gmx_simd_invsqrt_notmaskfpe_singleaccuracy_d(z1, mask2));
3587     q1    = gmx_simd_blendnotzero_d(q1, gmx_simd_cmpeq_d(xabs, one));
3588     q2    = xabs;
3589     z2    = gmx_simd_mul_d(q2, q2);
3590     z     = gmx_simd_blendv_d(z2, z1, mask);
3591     q     = gmx_simd_blendv_d(q2, q1, mask);
3592
3593     z2    = gmx_simd_mul_d(z, z);
3594     pA    = gmx_simd_fmadd_d(CC5, z2, CC3);
3595     pB    = gmx_simd_fmadd_d(CC4, z2, CC2);
3596     pA    = gmx_simd_fmadd_d(pA, z2, CC1);
3597     pA    = gmx_simd_mul_d(pA, z);
3598     z     = gmx_simd_fmadd_d(pB, z2, pA);
3599     z     = gmx_simd_fmadd_d(z, q, q);
3600     q2    = gmx_simd_sub_d(halfpi, z);
3601     q2    = gmx_simd_sub_d(q2, z);
3602     z     = gmx_simd_blendv_d(z, q2, mask);
3603
3604     mask  = gmx_simd_cmplt_d(limitlow, xabs);
3605     z     = gmx_simd_blendv_d( xabs, z, mask );
3606     z     = gmx_simd_xor_sign_d(z, x);
3607
3608     return z;
3609 }
3610
3611 /*! \brief SIMD acos(x). Double precision SIMD data, single accuracy.
3612  *
3613  * You should normally call the real-precision routine
3614  * \ref gmx_simd_acos_singleaccuracy_r.
3615  *
3616  * \param x The argument to evaluate acos for
3617  * \result Acos(x)
3618  */
3619 static gmx_inline gmx_simd_double_t gmx_simdcall
3620 gmx_simd_acos_singleaccuracy_d(gmx_simd_double_t x)
3621 {
3622     const gmx_simd_double_t one       = gmx_simd_set1_d(1.0);
3623     const gmx_simd_double_t half      = gmx_simd_set1_d(0.5);
3624     const gmx_simd_double_t pi        = gmx_simd_set1_d(M_PI);
3625     const gmx_simd_double_t halfpi    = gmx_simd_set1_d(M_PI/2.0);
3626     gmx_simd_double_t       xabs;
3627     gmx_simd_double_t       z, z1, z2, z3;
3628     gmx_simd_dbool_t        mask1, mask2, mask3;
3629
3630     xabs  = gmx_simd_fabs_d(x);
3631     mask1 = gmx_simd_cmplt_d(half, xabs);
3632     mask2 = gmx_simd_cmplt_d(gmx_simd_setzero_d(), x);
3633
3634     z     = gmx_simd_mul_d(half, gmx_simd_sub_d(one, xabs));
3635     mask3 = gmx_simd_cmpeq_d(xabs, one);
3636     z     = gmx_simd_mul_d(z, gmx_simd_invsqrt_notmaskfpe_singleaccuracy_d(z, mask3));
3637     z     = gmx_simd_blendnotzero_d(z, gmx_simd_cmpeq_d(xabs, one));
3638     z     = gmx_simd_blendv_d(x, z, mask1);
3639     z     = gmx_simd_asin_singleaccuracy_d(z);
3640
3641     z2    = gmx_simd_add_d(z, z);
3642     z1    = gmx_simd_sub_d(pi, z2);
3643     z3    = gmx_simd_sub_d(halfpi, z);
3644     z     = gmx_simd_blendv_d(z1, z2, mask2);
3645     z     = gmx_simd_blendv_d(z3, z, mask1);
3646
3647     return z;
3648 }
3649
3650 /*! \brief SIMD asin(x). Double precision SIMD data, single accuracy.
3651  *
3652  * You should normally call the real-precision routine
3653  * \ref gmx_simd_atan_singleaccuracy_r.
3654  *
3655  * \param x The argument to evaluate atan for
3656  * \result Atan(x), same argument/value range as standard math library.
3657  */
3658 static gmx_inline gmx_simd_double_t gmx_simdcall
3659 gmx_simd_atan_singleaccuracy_d(gmx_simd_double_t x)
3660 {
3661     const gmx_simd_double_t halfpi    = gmx_simd_set1_d(M_PI/2);
3662     const gmx_simd_double_t CA17      = gmx_simd_set1_d(0.002823638962581753730774);
3663     const gmx_simd_double_t CA15      = gmx_simd_set1_d(-0.01595690287649631500244);
3664     const gmx_simd_double_t CA13      = gmx_simd_set1_d(0.04250498861074447631836);
3665     const gmx_simd_double_t CA11      = gmx_simd_set1_d(-0.07489009201526641845703);
3666     const gmx_simd_double_t CA9       = gmx_simd_set1_d(0.1063479334115982055664);
3667     const gmx_simd_double_t CA7       = gmx_simd_set1_d(-0.1420273631811141967773);
3668     const gmx_simd_double_t CA5       = gmx_simd_set1_d(0.1999269574880599975585);
3669     const gmx_simd_double_t CA3       = gmx_simd_set1_d(-0.3333310186862945556640);
3670     gmx_simd_double_t       x2, x3, x4, pA, pB;
3671     gmx_simd_dbool_t        mask, mask2;
3672
3673     mask  = gmx_simd_cmplt_d(x, gmx_simd_setzero_d());
3674     x     = gmx_simd_fabs_d(x);
3675     mask2 = gmx_simd_cmplt_d(gmx_simd_set1_d(1.0), x);
3676     x     = gmx_simd_blendv_d(x, gmx_simd_inv_maskfpe_singleaccuracy_d(x, mask2), mask2);
3677
3678     x2    = gmx_simd_mul_d(x, x);
3679     x3    = gmx_simd_mul_d(x2, x);
3680     x4    = gmx_simd_mul_d(x2, x2);
3681     pA    = gmx_simd_fmadd_d(CA17, x4, CA13);
3682     pB    = gmx_simd_fmadd_d(CA15, x4, CA11);
3683     pA    = gmx_simd_fmadd_d(pA, x4, CA9);
3684     pB    = gmx_simd_fmadd_d(pB, x4, CA7);
3685     pA    = gmx_simd_fmadd_d(pA, x4, CA5);
3686     pB    = gmx_simd_fmadd_d(pB, x4, CA3);
3687     pA    = gmx_simd_fmadd_d(pA, x2, pB);
3688     pA    = gmx_simd_fmadd_d(pA, x3, x);
3689
3690     pA    = gmx_simd_blendv_d(pA, gmx_simd_sub_d(halfpi, pA), mask2);
3691     pA    = gmx_simd_blendv_d(pA, gmx_simd_fneg_d(pA), mask);
3692
3693     return pA;
3694 }
3695
3696 /*! \brief SIMD atan2(y,x). Double precision SIMD data, single accuracy.
3697  *
3698  * You should normally call the real-precision routine
3699  * \ref gmx_simd_atan2_singleaccuracy_r.
3700  *
3701  * \param y Y component of vector, any quartile
3702  * \param x X component of vector, any quartile
3703  * \result Atan(y,x), same argument/value range as standard math library.
3704  *
3705  * \note This routine should provide correct results for all finite
3706  * non-zero or positive-zero arguments. However, negative zero arguments will
3707  * be treated as positive zero, which means the return value will deviate from
3708  * the standard math library atan2(y,x) for those cases. That should not be
3709  * of any concern in Gromacs, and in particular it will not affect calculations
3710  * of angles from vectors.
3711  */
3712 static gmx_inline gmx_simd_double_t gmx_simdcall
3713 gmx_simd_atan2_singleaccuracy_d(gmx_simd_double_t y, gmx_simd_double_t x)
3714 {
3715     const gmx_simd_double_t pi          = gmx_simd_set1_d(M_PI);
3716     const gmx_simd_double_t halfpi      = gmx_simd_set1_d(M_PI/2.0);
3717     gmx_simd_double_t       xinv, p, aoffset;
3718     gmx_simd_dbool_t        mask_x0, mask_y0, mask_xlt0, mask_ylt0;
3719
3720     mask_x0   = gmx_simd_cmpeq_d(x, gmx_simd_setzero_d());
3721     mask_y0   = gmx_simd_cmpeq_d(y, gmx_simd_setzero_d());
3722     mask_xlt0 = gmx_simd_cmplt_d(x, gmx_simd_setzero_d());
3723     mask_ylt0 = gmx_simd_cmplt_d(y, gmx_simd_setzero_d());
3724
3725     aoffset   = gmx_simd_blendzero_d(halfpi, mask_x0);
3726     aoffset   = gmx_simd_blendnotzero_d(aoffset, mask_y0);
3727
3728     aoffset   = gmx_simd_blendv_d(aoffset, pi, mask_xlt0);
3729     aoffset   = gmx_simd_blendv_d(aoffset, gmx_simd_fneg_d(aoffset), mask_ylt0);
3730
3731     xinv      = gmx_simd_blendnotzero_d(gmx_simd_inv_notmaskfpe_singleaccuracy_d(x, mask_x0), mask_x0);
3732     p         = gmx_simd_mul_d(y, xinv);
3733     p         = gmx_simd_atan_singleaccuracy_d(p);
3734     p         = gmx_simd_add_d(p, aoffset);
3735
3736     return p;
3737 }
3738
3739 /*! \brief Analytical PME force correction, double SIMD data, single accuracy.
3740  *
3741  * You should normally call the real-precision routine
3742  * \ref gmx_simd_pmecorrF_singleaccuracy_r.
3743  *
3744  * \param z2 \f$(r \beta)^2\f$ - see below for details.
3745  * \result Correction factor to coulomb force - see below for details.
3746  *
3747  * This routine is meant to enable analytical evaluation of the
3748  * direct-space PME electrostatic force to avoid tables.
3749  *
3750  * The direct-space potential should be \f$ \mbox{erfc}(\beta r)/r\f$, but there
3751  * are some problems evaluating that:
3752  *
3753  * First, the error function is difficult (read: expensive) to
3754  * approxmiate accurately for intermediate to large arguments, and
3755  * this happens already in ranges of \f$(\beta r)\f$ that occur in simulations.
3756  * Second, we now try to avoid calculating potentials in Gromacs but
3757  * use forces directly.
3758  *
3759  * We can simply things slight by noting that the PME part is really
3760  * a correction to the normal Coulomb force since \f$\mbox{erfc}(z)=1-\mbox{erf}(z)\f$, i.e.
3761  * \f[
3762  * V = \frac{1}{r} - \frac{\mbox{erf}(\beta r)}{r}
3763  * \f]
3764  * The first term we already have from the inverse square root, so
3765  * that we can leave out of this routine.
3766  *
3767  * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
3768  * the argument \f$beta r\f$ will be in the range 0.15 to ~4. Use your
3769  * favorite plotting program to realize how well-behaved \f$\frac{\mbox{erf}(z)}{z}\f$ is
3770  * in this range!
3771  *
3772  * We approximate \f$f(z)=\mbox{erf}(z)/z\f$ with a rational minimax polynomial.
3773  * However, it turns out it is more efficient to approximate \f$f(z)/z\f$ and
3774  * then only use even powers. This is another minor optimization, since
3775  * we actually \a want \f$f(z)/z\f$, because it is going to be multiplied by
3776  * the vector between the two atoms to get the vectorial force. The
3777  * fastest flops are the ones we can avoid calculating!
3778  *
3779  * So, here's how it should be used:
3780  *
3781  * 1. Calculate \f$r^2\f$.
3782  * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=(\beta r)^2\f$.
3783  * 3. Evaluate this routine with \f$z^2\f$ as the argument.
3784  * 4. The return value is the expression:
3785  *
3786  * \f[
3787  *    \frac{2 \exp{-z^2}}{\sqrt{\pi} z^2}-\frac{\mbox{erf}(z)}{z^3}
3788  * \f]
3789  *
3790  * 5. Multiply the entire expression by \f$\beta^3\f$. This will get you
3791  *
3792  *  \f[
3793  *    \frac{2 \beta^3 \exp(-z^2)}{\sqrt{\pi} z^2} - \frac{\beta^3 \mbox{erf}(z)}{z^3}
3794  *  \f]
3795  *
3796  *    or, switching back to \f$r\f$ (since \f$z=r \beta\f$):
3797  *
3798  *  \f[
3799  *    \frac{2 \beta \exp(-r^2 \beta^2)}{\sqrt{\pi} r^2} - \frac{\mbox{erf}(r \beta)}{r^3}
3800  *  \f]
3801  *
3802  *    With a bit of math exercise you should be able to confirm that
3803  *    this is exactly
3804  *
3805  *  \f[
3806  *   \frac{\frac{d}{dr}\left( \frac{\mbox{erf}(\beta r)}{r} \right)}{r}
3807  *  \f]
3808  *
3809  * 6. Add the result to \f$r^{-3}\f$, multiply by the product of the charges,
3810  *    and you have your force (divided by \f$r\f$). A final multiplication
3811  *    with the vector connecting the two particles and you have your
3812  *    vectorial force to add to the particles.
3813  *
3814  * This approximation achieves an accuracy slightly lower than 1e-6; when
3815  * added to \f$1/r\f$ the error will be insignificant.
3816  *
3817  */
3818 static gmx_simd_double_t gmx_simdcall
3819 gmx_simd_pmecorrF_singleaccuracy_d(gmx_simd_double_t z2)
3820 {
3821     const gmx_simd_double_t  FN6      = gmx_simd_set1_d(-1.7357322914161492954e-8);
3822     const gmx_simd_double_t  FN5      = gmx_simd_set1_d(1.4703624142580877519e-6);
3823     const gmx_simd_double_t  FN4      = gmx_simd_set1_d(-0.000053401640219807709149);
3824     const gmx_simd_double_t  FN3      = gmx_simd_set1_d(0.0010054721316683106153);
3825     const gmx_simd_double_t  FN2      = gmx_simd_set1_d(-0.019278317264888380590);
3826     const gmx_simd_double_t  FN1      = gmx_simd_set1_d(0.069670166153766424023);
3827     const gmx_simd_double_t  FN0      = gmx_simd_set1_d(-0.75225204789749321333);
3828
3829     const gmx_simd_double_t  FD4      = gmx_simd_set1_d(0.0011193462567257629232);
3830     const gmx_simd_double_t  FD3      = gmx_simd_set1_d(0.014866955030185295499);
3831     const gmx_simd_double_t  FD2      = gmx_simd_set1_d(0.11583842382862377919);
3832     const gmx_simd_double_t  FD1      = gmx_simd_set1_d(0.50736591960530292870);
3833     const gmx_simd_double_t  FD0      = gmx_simd_set1_d(1.0);
3834
3835     gmx_simd_double_t        z4;
3836     gmx_simd_double_t        polyFN0, polyFN1, polyFD0, polyFD1;
3837
3838     z4             = gmx_simd_mul_d(z2, z2);
3839
3840     polyFD0        = gmx_simd_fmadd_d(FD4, z4, FD2);
3841     polyFD1        = gmx_simd_fmadd_d(FD3, z4, FD1);
3842     polyFD0        = gmx_simd_fmadd_d(polyFD0, z4, FD0);
3843     polyFD0        = gmx_simd_fmadd_d(polyFD1, z2, polyFD0);
3844
3845     polyFD0        = gmx_simd_inv_singleaccuracy_d(polyFD0);
3846
3847     polyFN0        = gmx_simd_fmadd_d(FN6, z4, FN4);
3848     polyFN1        = gmx_simd_fmadd_d(FN5, z4, FN3);
3849     polyFN0        = gmx_simd_fmadd_d(polyFN0, z4, FN2);
3850     polyFN1        = gmx_simd_fmadd_d(polyFN1, z4, FN1);
3851     polyFN0        = gmx_simd_fmadd_d(polyFN0, z4, FN0);
3852     polyFN0        = gmx_simd_fmadd_d(polyFN1, z2, polyFN0);
3853
3854     return gmx_simd_mul_d(polyFN0, polyFD0);
3855 }
3856
3857
3858
3859 /*! \brief Analytical PME potential correction, double SIMD data, single accuracy.
3860  *
3861  * You should normally call the real-precision routine
3862  * \ref gmx_simd_pmecorrV_singleaccuracy_r.
3863  *
3864  * \param z2 \f$(r \beta)^2\f$ - see below for details.
3865  * \result Correction factor to coulomb potential - see below for details.
3866  *
3867  * See \ref gmx_simd_pmecorrF_f for details about the approximation.
3868  *
3869  * This routine calculates \f$\mbox{erf}(z)/z\f$, although you should provide \f$z^2\f$
3870  * as the input argument.
3871  *
3872  * Here's how it should be used:
3873  *
3874  * 1. Calculate \f$r^2\f$.
3875  * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=\beta^2*r^2\f$.
3876  * 3. Evaluate this routine with z^2 as the argument.
3877  * 4. The return value is the expression:
3878  *
3879  *  \f[
3880  *   \frac{\mbox{erf}(z)}{z}
3881  *  \f]
3882  *
3883  * 5. Multiply the entire expression by beta and switching back to \f$r\f$ (since \f$z=r \beta\f$):
3884  *
3885  *  \f[
3886  *    \frac{\mbox{erf}(r \beta)}{r}
3887  *  \f]
3888  *
3889  * 6. Subtract the result from \f$1/r\f$, multiply by the product of the charges,
3890  *    and you have your potential.
3891  *
3892  * This approximation achieves an accuracy slightly lower than 1e-6; when
3893  * added to \f$1/r\f$ the error will be insignificant.
3894  */
3895 static gmx_simd_double_t gmx_simdcall
3896 gmx_simd_pmecorrV_singleaccuracy_d(gmx_simd_double_t z2)
3897 {
3898     const gmx_simd_double_t  VN6      = gmx_simd_set1_d(1.9296833005951166339e-8);
3899     const gmx_simd_double_t  VN5      = gmx_simd_set1_d(-1.4213390571557850962e-6);
3900     const gmx_simd_double_t  VN4      = gmx_simd_set1_d(0.000041603292906656984871);
3901     const gmx_simd_double_t  VN3      = gmx_simd_set1_d(-0.00013134036773265025626);
3902     const gmx_simd_double_t  VN2      = gmx_simd_set1_d(0.038657983986041781264);
3903     const gmx_simd_double_t  VN1      = gmx_simd_set1_d(0.11285044772717598220);
3904     const gmx_simd_double_t  VN0      = gmx_simd_set1_d(1.1283802385263030286);
3905
3906     const gmx_simd_double_t  VD3      = gmx_simd_set1_d(0.0066752224023576045451);
3907     const gmx_simd_double_t  VD2      = gmx_simd_set1_d(0.078647795836373922256);
3908     const gmx_simd_double_t  VD1      = gmx_simd_set1_d(0.43336185284710920150);
3909     const gmx_simd_double_t  VD0      = gmx_simd_set1_d(1.0);
3910
3911     gmx_simd_double_t        z4;
3912     gmx_simd_double_t        polyVN0, polyVN1, polyVD0, polyVD1;
3913
3914     z4             = gmx_simd_mul_d(z2, z2);
3915
3916     polyVD1        = gmx_simd_fmadd_d(VD3, z4, VD1);
3917     polyVD0        = gmx_simd_fmadd_d(VD2, z4, VD0);
3918     polyVD0        = gmx_simd_fmadd_d(polyVD1, z2, polyVD0);
3919
3920     polyVD0        = gmx_simd_inv_singleaccuracy_d(polyVD0);
3921
3922     polyVN0        = gmx_simd_fmadd_d(VN6, z4, VN4);
3923     polyVN1        = gmx_simd_fmadd_d(VN5, z4, VN3);
3924     polyVN0        = gmx_simd_fmadd_d(polyVN0, z4, VN2);
3925     polyVN1        = gmx_simd_fmadd_d(polyVN1, z4, VN1);
3926     polyVN0        = gmx_simd_fmadd_d(polyVN0, z4, VN0);
3927     polyVN0        = gmx_simd_fmadd_d(polyVN1, z2, polyVN0);
3928
3929     return gmx_simd_mul_d(polyVN0, polyVD0);
3930 }
3931
3932 #endif
3933
3934
3935 /*! \name SIMD4 math functions
3936  *
3937  * \note Only a subset of the math functions are implemented for SIMD4.
3938  *  \{
3939  */
3940
3941
3942 #ifdef GMX_SIMD4_HAVE_FLOAT
3943
3944 /*************************************************************************
3945  * SINGLE PRECISION SIMD4 MATH FUNCTIONS - JUST A SMALL SUBSET SUPPORTED *
3946  *************************************************************************/
3947
3948 /*! \brief SIMD4 utility function to sum a+b+c+d for SIMD4 floats.
3949  *
3950  * \copydetails gmx_simd_sum4_f
3951  */
3952 static gmx_inline gmx_simd4_float_t gmx_simdcall
3953 gmx_simd4_sum4_f(gmx_simd4_float_t a, gmx_simd4_float_t b,
3954                  gmx_simd4_float_t c, gmx_simd4_float_t d)
3955 {
3956     return gmx_simd4_add_f(gmx_simd4_add_f(a, b), gmx_simd4_add_f(c, d));
3957 }
3958
3959 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD4 float.
3960  *
3961  * \copydetails gmx_simd_rsqrt_iter_f
3962  */
3963 static gmx_inline gmx_simd4_float_t gmx_simdcall
3964 gmx_simd4_rsqrt_iter_f(gmx_simd4_float_t lu, gmx_simd4_float_t x)
3965 {
3966 #    ifdef GMX_SIMD_HAVE_FMA
3967     return gmx_simd4_fmadd_f(gmx_simd4_fnmadd_f(x, gmx_simd4_mul_f(lu, lu), gmx_simd4_set1_f(1.0f)), gmx_simd4_mul_f(lu, gmx_simd4_set1_f(0.5f)), lu);
3968 #    else
3969     return gmx_simd4_mul_f(gmx_simd4_set1_f(0.5f), gmx_simd4_mul_f(gmx_simd4_sub_f(gmx_simd4_set1_f(3.0f), gmx_simd4_mul_f(gmx_simd4_mul_f(lu, lu), x)), lu));
3970 #    endif
3971 }
3972
3973 /*! \brief Calculate 1/sqrt(x) for SIMD4 float.
3974  *
3975  * \copydetails gmx_simd_invsqrt_f
3976  */
3977 static gmx_inline gmx_simd4_float_t gmx_simdcall
3978 gmx_simd4_invsqrt_f(gmx_simd4_float_t x)
3979 {
3980     gmx_simd4_float_t lu = gmx_simd4_rsqrt_f(x);
3981 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
3982     lu = gmx_simd4_rsqrt_iter_f(lu, x);
3983 #endif
3984 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3985     lu = gmx_simd4_rsqrt_iter_f(lu, x);
3986 #endif
3987 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3988     lu = gmx_simd4_rsqrt_iter_f(lu, x);
3989 #endif
3990     return lu;
3991 }
3992
3993 #endif /* GMX_SIMD4_HAVE_FLOAT */
3994
3995
3996
3997 #ifdef GMX_SIMD4_HAVE_DOUBLE
3998 /*************************************************************************
3999  * DOUBLE PRECISION SIMD4 MATH FUNCTIONS - JUST A SMALL SUBSET SUPPORTED *
4000  *************************************************************************/
4001
4002
4003 /*! \brief SIMD4 utility function to sum a+b+c+d for SIMD4 doubles.
4004  *
4005  * \copydetails gmx_simd_sum4_f
4006  */
4007 static gmx_inline gmx_simd4_double_t gmx_simdcall
4008 gmx_simd4_sum4_d(gmx_simd4_double_t a, gmx_simd4_double_t b,
4009                  gmx_simd4_double_t c, gmx_simd4_double_t d)
4010 {
4011     return gmx_simd4_add_d(gmx_simd4_add_d(a, b), gmx_simd4_add_d(c, d));
4012 }
4013
4014 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD4 double.
4015  *
4016  * \copydetails gmx_simd_rsqrt_iter_f
4017  */
4018 static gmx_inline gmx_simd4_double_t gmx_simdcall
4019 gmx_simd4_rsqrt_iter_d(gmx_simd4_double_t lu, gmx_simd4_double_t x)
4020 {
4021 #ifdef GMX_SIMD_HAVE_FMA
4022     return gmx_simd4_fmadd_d(gmx_simd4_fnmadd_d(x, gmx_simd4_mul_d(lu, lu), gmx_simd4_set1_d(1.0)), gmx_simd4_mul_d(lu, gmx_simd4_set1_d(0.5)), lu);
4023 #else
4024     return gmx_simd4_mul_d(gmx_simd4_set1_d(0.5), gmx_simd4_mul_d(gmx_simd4_sub_d(gmx_simd4_set1_d(3.0), gmx_simd4_mul_d(gmx_simd4_mul_d(lu, lu), x)), lu));
4025 #endif
4026 }
4027
4028 /*! \brief Calculate 1/sqrt(x) for SIMD4 double.
4029  *
4030  * \copydetails gmx_simd_invsqrt_f
4031  */
4032 static gmx_inline gmx_simd4_double_t gmx_simdcall
4033 gmx_simd4_invsqrt_d(gmx_simd4_double_t x)
4034 {
4035     gmx_simd4_double_t lu = gmx_simd4_rsqrt_d(x);
4036 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
4037     lu = gmx_simd4_rsqrt_iter_d(lu, x);
4038 #endif
4039 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
4040     lu = gmx_simd4_rsqrt_iter_d(lu, x);
4041 #endif
4042 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
4043     lu = gmx_simd4_rsqrt_iter_d(lu, x);
4044 #endif
4045 #if (GMX_SIMD_RSQRT_BITS*8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
4046     lu = gmx_simd4_rsqrt_iter_d(lu, x);
4047 #endif
4048     return lu;
4049 }
4050
4051 /**********************************************************************
4052  * SIMD4 MATH FUNCTIONS WITH DOUBLE PREC. DATA, SINGLE PREC. ACCURACY *
4053  **********************************************************************/
4054
4055 /*! \brief Calculate 1/sqrt(x) for SIMD4 double, but in single accuracy.
4056  *
4057  * \copydetails gmx_simd_invsqrt_singleaccuracy_d
4058  */
4059 static gmx_inline gmx_simd4_double_t gmx_simdcall
4060 gmx_simd4_invsqrt_singleaccuracy_d(gmx_simd4_double_t x)
4061 {
4062     gmx_simd4_double_t lu = gmx_simd4_rsqrt_d(x);
4063 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
4064     lu = gmx_simd4_rsqrt_iter_d(lu, x);
4065 #endif
4066 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
4067     lu = gmx_simd4_rsqrt_iter_d(lu, x);
4068 #endif
4069 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
4070     lu = gmx_simd4_rsqrt_iter_d(lu, x);
4071 #endif
4072     return lu;
4073 }
4074
4075
4076 #endif /* GMX_SIMD4_HAVE_DOUBLE */
4077
4078 /*! \} */
4079
4080
4081 /* Set defines based on default Gromacs precision */
4082 #ifdef GMX_DOUBLE
4083 /* Documentation in single branch below */
4084
4085 #    define gmx_simd_sum4_r           gmx_simd_sum4_d
4086 #    define gmx_simd_xor_sign_r       gmx_simd_xor_sign_d
4087 #    define gmx_simd4_sum4_r          gmx_simd4_sum4_d
4088
4089 /* On hardware that only supports double precision SIMD it is possible to use
4090  * the faster _singleaccuracy_d routines everywhere by setting the requested SIMD
4091  * accuracy to single precision.
4092  */
4093 #if (GMX_SIMD_ACCURACY_BITS_DOUBLE > GMX_SIMD_ACCURACY_BITS_SINGLE)
4094
4095 #        define gmx_simd_invsqrt_r        gmx_simd_invsqrt_d
4096 #        define gmx_simd_invsqrt_pair_r   gmx_simd_invsqrt_pair_d
4097 #        define gmx_simd_sqrt_r           gmx_simd_sqrt_d
4098 #        define gmx_simd_inv_r            gmx_simd_inv_d
4099 #        define gmx_simd_log_r            gmx_simd_log_d
4100 #        define gmx_simd_exp2_r           gmx_simd_exp2_d
4101 #        define gmx_simd_exp_r            gmx_simd_exp_d
4102 #        define gmx_simd_erf_r            gmx_simd_erf_d
4103 #        define gmx_simd_erfc_r           gmx_simd_erfc_d
4104 #        define gmx_simd_sincos_r         gmx_simd_sincos_d
4105 #        define gmx_simd_sin_r            gmx_simd_sin_d
4106 #        define gmx_simd_cos_r            gmx_simd_cos_d
4107 #        define gmx_simd_tan_r            gmx_simd_tan_d
4108 #        define gmx_simd_asin_r           gmx_simd_asin_d
4109 #        define gmx_simd_acos_r           gmx_simd_acos_d
4110 #        define gmx_simd_atan_r           gmx_simd_atan_d
4111 #        define gmx_simd_atan2_r          gmx_simd_atan2_d
4112 #        define gmx_simd_pmecorrF_r       gmx_simd_pmecorrF_d
4113 #        define gmx_simd_pmecorrV_r       gmx_simd_pmecorrV_d
4114 #        define gmx_simd4_invsqrt_r       gmx_simd4_invsqrt_d
4115
4116 #else
4117
4118 #        define gmx_simd_invsqrt_r        gmx_simd_invsqrt_singleaccuracy_d
4119 #        define gmx_simd_invsqrt_pair_r   gmx_simd_invsqrt_pair_singleaccuracy_d
4120 #        define gmx_simd_sqrt_r           gmx_simd_sqrt_singleaccuracy_d
4121 #        define gmx_simd_inv_r            gmx_simd_inv_singleaccuracy_d
4122 #        define gmx_simd_log_r            gmx_simd_log_singleaccuracy_d
4123 #        define gmx_simd_exp2_r           gmx_simd_exp2_singleaccuracy_d
4124 #        define gmx_simd_exp_r            gmx_simd_exp_singleaccuracy_d
4125 #        define gmx_simd_erf_r            gmx_simd_erf_singleaccuracy_d
4126 #        define gmx_simd_erfc_r           gmx_simd_erfc_singleaccuracy_d
4127 #        define gmx_simd_sincos_r         gmx_simd_sincos_singleaccuracy_d
4128 #        define gmx_simd_sin_r            gmx_simd_sin_singleaccuracy_d
4129 #        define gmx_simd_cos_r            gmx_simd_cos_singleaccuracy_d
4130 #        define gmx_simd_tan_r            gmx_simd_tan_singleaccuracy_d
4131 #        define gmx_simd_asin_r           gmx_simd_asin_singleaccuracy_d
4132 #        define gmx_simd_acos_r           gmx_simd_acos_singleaccuracy_d
4133 #        define gmx_simd_atan_r           gmx_simd_atan_singleaccuracy_d
4134 #        define gmx_simd_atan2_r          gmx_simd_atan2_singleaccuracy_d
4135 #        define gmx_simd_pmecorrF_r       gmx_simd_pmecorrF_singleaccuracy_d
4136 #        define gmx_simd_pmecorrV_r       gmx_simd_pmecorrV_singleaccuracy_d
4137 #        define gmx_simd4_invsqrt_r       gmx_simd4_invsqrt_singleaccuracy_d
4138
4139 #endif
4140
4141 #    define gmx_simd_invsqrt_singleaccuracy_r        gmx_simd_invsqrt_singleaccuracy_d
4142 #    define gmx_simd_invsqrt_pair_singleaccuracy_r   gmx_simd_invsqrt_pair_singleaccuracy_d
4143 #    define gmx_simd_sqrt_singleaccuracy_r           gmx_simd_sqrt_singleaccuracy_d
4144 #    define gmx_simd_inv_singleaccuracy_r            gmx_simd_inv_singleaccuracy_d
4145 #    define gmx_simd_log_singleaccuracy_r            gmx_simd_log_singleaccuracy_d
4146 #    define gmx_simd_exp2_singleaccuracy_r           gmx_simd_exp2_singleaccuracy_d
4147 #    define gmx_simd_exp_singleaccuracy_r            gmx_simd_exp_singleaccuracy_d
4148 #    define gmx_simd_erf_singleaccuracy_r            gmx_simd_erf_singleaccuracy_d
4149 #    define gmx_simd_erfc_singleaccuracy_r           gmx_simd_erfc_singleaccuracy_d
4150 #    define gmx_simd_sincos_singleaccuracy_r         gmx_simd_sincos_singleaccuracy_d
4151 #    define gmx_simd_sin_singleaccuracy_r            gmx_simd_sin_singleaccuracy_d
4152 #    define gmx_simd_cos_singleaccuracy_r            gmx_simd_cos_singleaccuracy_d
4153 #    define gmx_simd_tan_singleaccuracy_r            gmx_simd_tan_singleaccuracy_d
4154 #    define gmx_simd_asin_singleaccuracy_r           gmx_simd_asin_singleaccuracy_d
4155 #    define gmx_simd_acos_singleaccuracy_r           gmx_simd_acos_singleaccuracy_d
4156 #    define gmx_simd_atan_singleaccuracy_r           gmx_simd_atan_singleaccuracy_d
4157 #    define gmx_simd_atan2_singleaccuracy_r          gmx_simd_atan2_singleaccuracy_d
4158 #    define gmx_simd_pmecorrF_singleaccuracy_r       gmx_simd_pmecorrF_singleaccuracy_d
4159 #    define gmx_simd_pmecorrV_singleaccuracy_r       gmx_simd_pmecorrV_singleaccuracy_d
4160 #    define gmx_simd4_invsqrt_singleaccuracy_r       gmx_simd4_invsqrt_singleaccuracy_d
4161
4162 #else /* GMX_DOUBLE */
4163
4164 /*! \name Real-precision SIMD math functions
4165  *
4166  *  These are the ones you should typically call in Gromacs.
4167  * \{
4168  */
4169
4170 /*! \brief SIMD utility function to sum a+b+c+d for SIMD reals.
4171  *
4172  * \copydetails gmx_simd_sum4_f
4173  */
4174 #    define gmx_simd_sum4_r           gmx_simd_sum4_f
4175
4176 /*! \brief Return -a if b is negative, SIMD real.
4177  *
4178  * \copydetails gmx_simd_xor_sign_f
4179  */
4180 #    define gmx_simd_xor_sign_r       gmx_simd_xor_sign_f
4181
4182 /*! \brief Calculate 1/sqrt(x) for SIMD real.
4183  *
4184  * \copydetails gmx_simd_invsqrt_f
4185  */
4186 #    define gmx_simd_invsqrt_r        gmx_simd_invsqrt_f
4187
4188 /*! \brief Calculate 1/sqrt(x) for two SIMD reals.
4189  *
4190  * \copydetails gmx_simd_invsqrt_pair_f
4191  */
4192 #    define gmx_simd_invsqrt_pair_r   gmx_simd_invsqrt_pair_f
4193
4194 /*! \brief Calculate sqrt(x) correctly for SIMD real, including argument 0.0.
4195  *
4196  * \copydetails gmx_simd_sqrt_f
4197  */
4198 #    define gmx_simd_sqrt_r           gmx_simd_sqrt_f
4199
4200 /*! \brief Calculate 1/x for SIMD real.
4201  *
4202  * \copydetails gmx_simd_inv_f
4203  */
4204 #    define gmx_simd_inv_r            gmx_simd_inv_f
4205
4206 /*! \brief SIMD real log(x). This is the natural logarithm.
4207  *
4208  * \copydetails gmx_simd_log_f
4209  */
4210 #    define gmx_simd_log_r            gmx_simd_log_f
4211
4212 /*! \brief SIMD real 2^x.
4213  *
4214  * \copydetails gmx_simd_exp2_f
4215  */
4216 #    define gmx_simd_exp2_r           gmx_simd_exp2_f
4217
4218 /*! \brief SIMD real e^x.
4219  *
4220  * \copydetails gmx_simd_exp_f
4221  */
4222 #    define gmx_simd_exp_r            gmx_simd_exp_f
4223
4224 /*! \brief SIMD real erf(x).
4225  *
4226  * \copydetails gmx_simd_erf_f
4227  */
4228 #    define gmx_simd_erf_r            gmx_simd_erf_f
4229
4230 /*! \brief SIMD real erfc(x).
4231  *
4232  * \copydetails gmx_simd_erfc_f
4233  */
4234 #    define gmx_simd_erfc_r           gmx_simd_erfc_f
4235
4236 /*! \brief SIMD real sin \& cos.
4237  *
4238  * \copydetails gmx_simd_sincos_f
4239  */
4240 #    define gmx_simd_sincos_r         gmx_simd_sincos_f
4241
4242 /*! \brief SIMD real sin(x).
4243  *
4244  * \copydetails gmx_simd_sin_f
4245  */
4246 #    define gmx_simd_sin_r            gmx_simd_sin_f
4247
4248 /*! \brief SIMD real cos(x).
4249  *
4250  * \copydetails gmx_simd_cos_f
4251  */
4252 #    define gmx_simd_cos_r            gmx_simd_cos_f
4253
4254 /*! \brief SIMD real tan(x).
4255  *
4256  * \copydetails gmx_simd_tan_f
4257  */
4258 #    define gmx_simd_tan_r            gmx_simd_tan_f
4259
4260 /*! \brief SIMD real asin(x).
4261  *
4262  * \copydetails gmx_simd_asin_f
4263  */
4264 #    define gmx_simd_asin_r           gmx_simd_asin_f
4265
4266 /*! \brief SIMD real acos(x).
4267  *
4268  * \copydetails gmx_simd_acos_f
4269  */
4270 #    define gmx_simd_acos_r           gmx_simd_acos_f
4271
4272 /*! \brief SIMD real atan(x).
4273  *
4274  * \copydetails gmx_simd_atan_f
4275  */
4276 #    define gmx_simd_atan_r           gmx_simd_atan_f
4277
4278 /*! \brief SIMD real atan2(y,x).
4279  *
4280  * \copydetails gmx_simd_atan2_f
4281  */
4282 #    define gmx_simd_atan2_r          gmx_simd_atan2_f
4283
4284 /*! \brief SIMD Analytic PME force correction.
4285  *
4286  * \copydetails gmx_simd_pmecorrF_f
4287  */
4288 #    define gmx_simd_pmecorrF_r       gmx_simd_pmecorrF_f
4289
4290 /*! \brief SIMD Analytic PME potential correction.
4291  *
4292  * \copydetails gmx_simd_pmecorrV_f
4293  */
4294 #    define gmx_simd_pmecorrV_r       gmx_simd_pmecorrV_f
4295
4296 /*! \brief Calculate 1/sqrt(x) for SIMD, only targeting single accuracy.
4297  *
4298  * \copydetails gmx_simd_invsqrt_r
4299  *
4300  * \note This is a performance-targeted function that only achieves single
4301  *       precision accuracy, even when the SIMD data is double precision.
4302  */
4303 #    define gmx_simd_invsqrt_singleaccuracy_r        gmx_simd_invsqrt_f
4304
4305 /*! \brief Calculate 1/sqrt(x) for SIMD pair, only targeting single accuracy.
4306  *
4307  * \copydetails gmx_simd_invsqrt_pair_r
4308  *
4309  * \note This is a performance-targeted function that only achieves single
4310  *       precision accuracy, even when the SIMD data is double precision.
4311  */
4312 #    define gmx_simd_invsqrt_pair_singleaccuracy_r   gmx_simd_invsqrt_pair_f
4313
4314 /*! \brief Calculate sqrt(x), only targeting single accuracy.
4315  *
4316  * \copydetails gmx_simd_sqrt_r
4317  *
4318  * \note This is a performance-targeted function that only achieves single
4319  *       precision accuracy, even when the SIMD data is double precision.
4320  */
4321 #    define gmx_simd_sqrt_singleaccuracy_r           gmx_simd_sqrt_f
4322
4323 /*! \brief Calculate 1/x for SIMD real, only targeting single accuracy.
4324  *
4325  * \copydetails gmx_simd_inv_r
4326  *
4327  * \note This is a performance-targeted function that only achieves single
4328  *       precision accuracy, even when the SIMD data is double precision.
4329  */
4330 #    define gmx_simd_inv_singleaccuracy_r            gmx_simd_inv_f
4331
4332 /*! \brief SIMD real log(x), only targeting single accuracy.
4333  *
4334  * \copydetails gmx_simd_log_r
4335  *
4336  * \note This is a performance-targeted function that only achieves single
4337  *       precision accuracy, even when the SIMD data is double precision.
4338  */
4339 #    define gmx_simd_log_singleaccuracy_r            gmx_simd_log_f
4340
4341 /*! \brief SIMD real 2^x, only targeting single accuracy.
4342  *
4343  * \copydetails gmx_simd_exp2_r
4344  *
4345  * \note This is a performance-targeted function that only achieves single
4346  *       precision accuracy, even when the SIMD data is double precision.
4347  */
4348 #    define gmx_simd_exp2_singleaccuracy_r           gmx_simd_exp2_f
4349
4350 /*! \brief SIMD real e^x, only targeting single accuracy.
4351  *
4352  * \copydetails gmx_simd_exp_r
4353  *
4354  * \note This is a performance-targeted function that only achieves single
4355  *       precision accuracy, even when the SIMD data is double precision.
4356  */
4357 #    define gmx_simd_exp_singleaccuracy_r            gmx_simd_exp_f
4358
4359 /*! \brief SIMD real erf(x), only targeting single accuracy.
4360  *
4361  * \copydetails gmx_simd_erf_r
4362  *
4363  * \note This is a performance-targeted function that only achieves single
4364  *       precision accuracy, even when the SIMD data is double precision.
4365  */
4366 #    define gmx_simd_erf_singleaccuracy_r            gmx_simd_erf_f
4367
4368 /*! \brief SIMD real erfc(x), only targeting single accuracy.
4369  *
4370  * \copydetails gmx_simd_erfc_r
4371  *
4372  * \note This is a performance-targeted function that only achieves single
4373  *       precision accuracy, even when the SIMD data is double precision.
4374  */
4375 #    define gmx_simd_erfc_singleaccuracy_r           gmx_simd_erfc_f
4376
4377 /*! \brief SIMD real sin \& cos, only targeting single accuracy.
4378  *
4379  * \copydetails gmx_simd_sincos_r
4380  *
4381  * \note This is a performance-targeted function that only achieves single
4382  *       precision accuracy, even when the SIMD data is double precision.
4383  */
4384 #    define gmx_simd_sincos_singleaccuracy_r         gmx_simd_sincos_f
4385
4386 /*! \brief SIMD real sin(x), only targeting single accuracy.
4387  *
4388  * \copydetails gmx_simd_sin_r
4389  *
4390  * \note This is a performance-targeted function that only achieves single
4391  *       precision accuracy, even when the SIMD data is double precision.
4392  */
4393 #    define gmx_simd_sin_singleaccuracy_r            gmx_simd_sin_f
4394
4395 /*! \brief SIMD real cos(x), only targeting single accuracy.
4396  *
4397  * \copydetails gmx_simd_cos_r
4398  *
4399  * \note This is a performance-targeted function that only achieves single
4400  *       precision accuracy, even when the SIMD data is double precision.
4401  */
4402 #    define gmx_simd_cos_singleaccuracy_r            gmx_simd_cos_f
4403
4404 /*! \brief SIMD real tan(x), only targeting single accuracy.
4405  *
4406  * \copydetails gmx_simd_tan_r
4407  *
4408  * \note This is a performance-targeted function that only achieves single
4409  *       precision accuracy, even when the SIMD data is double precision.
4410  */
4411 #    define gmx_simd_tan_singleaccuracy_r            gmx_simd_tan_f
4412
4413 /*! \brief SIMD real asin(x), only targeting single accuracy.
4414  *
4415  * \copydetails gmx_simd_asin_r
4416  *
4417  * \note This is a performance-targeted function that only achieves single
4418  *       precision accuracy, even when the SIMD data is double precision.
4419  */
4420 #    define gmx_simd_asin_singleaccuracy_r           gmx_simd_asin_f
4421
4422 /*! \brief SIMD real acos(x), only targeting single accuracy.
4423  *
4424  * \copydetails gmx_simd_acos_r
4425  *
4426  * \note This is a performance-targeted function that only achieves single
4427  *       precision accuracy, even when the SIMD data is double precision.
4428  */
4429 #    define gmx_simd_acos_singleaccuracy_r           gmx_simd_acos_f
4430
4431 /*! \brief SIMD real atan(x), only targeting single accuracy.
4432  *
4433  * \copydetails gmx_simd_atan_r
4434  *
4435  * \note This is a performance-targeted function that only achieves single
4436  *       precision accuracy, even when the SIMD data is double precision.
4437  */
4438 #    define gmx_simd_atan_singleaccuracy_r           gmx_simd_atan_f
4439
4440 /*! \brief SIMD real atan2(y,x), only targeting single accuracy.
4441  *
4442  * \copydetails gmx_simd_atan2_r
4443  *
4444  * \note This is a performance-targeted function that only achieves single
4445  *       precision accuracy, even when the SIMD data is double precision.
4446  */
4447 #    define gmx_simd_atan2_singleaccuracy_r          gmx_simd_atan2_f
4448
4449 /*! \brief SIMD Analytic PME force corr., only targeting single accuracy.
4450  *
4451  * \copydetails gmx_simd_pmecorrF_r
4452  *
4453  * \note This is a performance-targeted function that only achieves single
4454  *       precision accuracy, even when the SIMD data is double precision.
4455  */
4456 #    define gmx_simd_pmecorrF_singleaccuracy_r       gmx_simd_pmecorrF_f
4457
4458 /*! \brief SIMD Analytic PME potential corr., only targeting single accuracy.
4459  *
4460  * \copydetails gmx_simd_pmecorrV_r
4461  *
4462  * \note This is a performance-targeted function that only achieves single
4463  *       precision accuracy, even when the SIMD data is double precision.
4464  */
4465 #    define gmx_simd_pmecorrV_singleaccuracy_r       gmx_simd_pmecorrV_f
4466
4467
4468 /*! \}
4469  * \name SIMD4 math functions
4470  * \{
4471  */
4472
4473 /*! \brief SIMD4 utility function to sum a+b+c+d for SIMD4 reals.
4474  *
4475  * \copydetails gmx_simd_sum4_f
4476  */
4477 #    define gmx_simd4_sum4_r          gmx_simd4_sum4_f
4478
4479 /*! \brief Calculate 1/sqrt(x) for SIMD4 real.
4480  *
4481  * \copydetails gmx_simd_invsqrt_f
4482  */
4483 #    define gmx_simd4_invsqrt_r       gmx_simd4_invsqrt_f
4484
4485 /*! \brief 1/sqrt(x) for SIMD4 real. Single accuracy, even for double prec.
4486  *
4487  * \copydetails gmx_simd4_invsqrt_r
4488  *
4489  * \note This is a performance-targeted function that only achieves single
4490  *       precision accuracy, even when the SIMD data is double precision.
4491  */
4492 #    define gmx_simd4_invsqrt_singleaccuracy_r       gmx_simd4_invsqrt_f
4493
4494 /*! \} */
4495
4496 #endif /* GMX_DOUBLE */
4497
4498 /*! \} */
4499 /*! \endcond */
4500
4501 #endif /* GMX_SIMD_SIMD_MATH_H_ */