Removed simple.h from nb_kernel_sse4_1_XX
[gromacs.git] / src / gromacs / simd / simd_math.h
blob2daf49e33da7507197ff0e37a23195c2ecad2d1c
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
35 #ifndef GMX_SIMD_SIMD_MATH_H_
36 #define GMX_SIMD_SIMD_MATH_H_
38 /*! \libinternal \file
40 * \brief Math functions for SIMD datatypes.
42 * \attention This file is generic for all SIMD architectures, so you cannot
43 * assume that any of the optional SIMD features (as defined in simd.h) are
44 * present. In particular, this means you cannot assume support for integers,
45 * logical operations (neither on floating-point nor integer values), shifts,
46 * and the architecture might only have SIMD for either float or double.
47 * Second, to keep this file clean and general, any additions to this file
48 * must work for all possible SIMD architectures in both single and double
49 * precision (if they support it), and you cannot make any assumptions about
50 * SIMD width.
52 * \author Erik Lindahl <erik.lindahl@scilifelab.se>
54 * \inlibraryapi
55 * \ingroup module_simd
58 #include "config.h"
60 #include <math.h>
62 #include "gromacs/math/utilities.h"
63 #include "gromacs/simd/simd.h"
64 #include "gromacs/utility/real.h"
66 /*! \cond libapi */
67 /*! \addtogroup module_simd */
68 /*! \{ */
70 /*! \name Implementation accuracy settings
71 * \{
74 /*! \} */
76 #ifdef GMX_SIMD_HAVE_FLOAT
78 /*! \name Single precision SIMD math functions
80 * \note In most cases you should use the real-precision functions instead.
81 * \{
84 /****************************************
85 * SINGLE PRECISION SIMD MATH FUNCTIONS *
86 ****************************************/
88 /*! \brief SIMD float utility to sum a+b+c+d.
90 * You should normally call the real-precision routine \ref gmx_simd_sum4_r.
92 * \param a term 1 (multiple values)
93 * \param b term 2 (multiple values)
94 * \param c term 3 (multiple values)
95 * \param d term 4 (multiple values)
96 * \return sum of terms 1-4 (multiple values)
98 static gmx_inline gmx_simd_float_t gmx_simdcall
99 gmx_simd_sum4_f(gmx_simd_float_t a, gmx_simd_float_t b,
100 gmx_simd_float_t c, gmx_simd_float_t d)
102 return gmx_simd_add_f(gmx_simd_add_f(a, b), gmx_simd_add_f(c, d));
105 /*! \brief Return -a if b is negative, SIMD float.
107 * You should normally call the real-precision routine \ref gmx_simd_xor_sign_r.
109 * \param a Values to set sign for
110 * \param b Values used to set sign
111 * \return if b is negative, the sign of a will be changed.
113 * This is equivalent to doing an xor operation on a with the sign bit of b,
114 * with the exception that negative zero is not considered to be negative
115 * on architectures where \ref GMX_SIMD_HAVE_LOGICAL is not set.
117 static gmx_inline gmx_simd_float_t gmx_simdcall
118 gmx_simd_xor_sign_f(gmx_simd_float_t a, gmx_simd_float_t b)
120 #ifdef GMX_SIMD_HAVE_LOGICAL
121 return gmx_simd_xor_f(a, gmx_simd_and_f(gmx_simd_set1_f(GMX_FLOAT_NEGZERO), b));
122 #else
123 return gmx_simd_blendv_f(a, gmx_simd_fneg_f(a), gmx_simd_cmplt_f(b, gmx_simd_setzero_f()));
124 #endif
127 #ifndef gmx_simd_rsqrt_iter_f
128 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD float.
130 * This is a low-level routine that should only be used by SIMD math routine
131 * that evaluates the inverse square root.
133 * \param lu Approximation of 1/sqrt(x), typically obtained from lookup.
134 * \param x The reference (starting) value x for which we want 1/sqrt(x).
135 * \return An improved approximation with roughly twice as many bits of accuracy.
137 static gmx_inline gmx_simd_float_t gmx_simdcall
138 gmx_simd_rsqrt_iter_f(gmx_simd_float_t lu, gmx_simd_float_t x)
140 # ifdef GMX_SIMD_HAVE_FMA
141 return gmx_simd_fmadd_f(gmx_simd_fnmadd_f(x, gmx_simd_mul_f(lu, lu), gmx_simd_set1_f(1.0f)), gmx_simd_mul_f(lu, gmx_simd_set1_f(0.5f)), lu);
142 # else
143 return gmx_simd_mul_f(gmx_simd_set1_f(0.5f), gmx_simd_mul_f(gmx_simd_sub_f(gmx_simd_set1_f(3.0f), gmx_simd_mul_f(gmx_simd_mul_f(lu, lu), x)), lu));
144 # endif
146 #endif
148 /*! \brief Calculate 1/sqrt(x) for SIMD float.
150 * You should normally call the real-precision routine \ref gmx_simd_invsqrt_r.
152 * \param x Argument that must be >0. This routine does not check arguments.
153 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
155 static gmx_inline gmx_simd_float_t gmx_simdcall
156 gmx_simd_invsqrt_f(gmx_simd_float_t x)
158 gmx_simd_float_t lu = gmx_simd_rsqrt_f(x);
159 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
160 lu = gmx_simd_rsqrt_iter_f(lu, x);
161 #endif
162 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
163 lu = gmx_simd_rsqrt_iter_f(lu, x);
164 #endif
165 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
166 lu = gmx_simd_rsqrt_iter_f(lu, x);
167 #endif
168 return lu;
171 /*! \brief Calculate 1/sqrt(x) for masked entries of SIMD float.
173 * Identical to gmx_simd_invsqrt_f but avoids fp-exception for non-masked entries.
174 * The result for the non-masked entries is undefined and the user has to use blend
175 * with the same mask to obtain a defined result.
177 * \param x Argument that must be >0 for masked entries
178 * \param m Masked entries
179 * \return 1/sqrt(x). Result is undefined if your argument was invalid or entry was not masked.
181 static gmx_inline gmx_simd_float_t
182 gmx_simd_invsqrt_maskfpe_f(gmx_simd_float_t x, gmx_simd_fbool_t gmx_unused m)
184 #ifdef NDEBUG
185 return gmx_simd_invsqrt_f(x);
186 #else
187 return gmx_simd_invsqrt_f(gmx_simd_blendv_f(gmx_simd_set1_f(1.0f), x, m));
188 #endif
191 /*! \brief Calculate 1/sqrt(x) for non-masked entries of SIMD float.
193 * Identical to gmx_simd_invsqrt_f but avoids fp-exception for masked entries.
194 * The result for the non-masked entries is undefined and the user has to use blend
195 * with the same mask to obtain a defined result.
197 * \param x Argument that must be >0 for non-masked entries
198 * \param m Masked entries
199 * \return 1/sqrt(x). Result is undefined if your argument was invalid or entry was masked.
201 static gmx_inline gmx_simd_float_t
202 gmx_simd_invsqrt_notmaskfpe_f(gmx_simd_float_t x, gmx_simd_fbool_t gmx_unused m)
204 #ifdef NDEBUG
205 return gmx_simd_invsqrt_f(x);
206 #else
207 return gmx_simd_invsqrt_f(gmx_simd_blendv_f(x, gmx_simd_set1_f(1.0f), m));
208 #endif
211 /*! \brief Calculate 1/sqrt(x) for two SIMD floats.
213 * You should normally call the real-precision routine \ref gmx_simd_invsqrt_pair_r.
215 * \param x0 First set of arguments, x0 must be positive - no argument checking.
216 * \param x1 Second set of arguments, x1 must be positive - no argument checking.
217 * \param[out] out0 Result 1/sqrt(x0)
218 * \param[out] out1 Result 1/sqrt(x1)
220 * In particular for double precision we can sometimes calculate square root
221 * pairs slightly faster by using single precision until the very last step.
223 static gmx_inline void gmx_simdcall
224 gmx_simd_invsqrt_pair_f(gmx_simd_float_t x0, gmx_simd_float_t x1,
225 gmx_simd_float_t *out0, gmx_simd_float_t *out1)
227 *out0 = gmx_simd_invsqrt_f(x0);
228 *out1 = gmx_simd_invsqrt_f(x1);
231 #ifndef gmx_simd_rcp_iter_f
232 /*! \brief Perform one Newton-Raphson iteration to improve 1/x for SIMD float.
234 * This is a low-level routine that should only be used by SIMD math routine
235 * that evaluates the reciprocal.
237 * \param lu Approximation of 1/x, typically obtained from lookup.
238 * \param x The reference (starting) value x for which we want 1/x.
239 * \return An improved approximation with roughly twice as many bits of accuracy.
241 static gmx_inline gmx_simd_float_t gmx_simdcall
242 gmx_simd_rcp_iter_f(gmx_simd_float_t lu, gmx_simd_float_t x)
244 return gmx_simd_mul_f(lu, gmx_simd_fnmadd_f(lu, x, gmx_simd_set1_f(2.0f)));
246 #endif
248 /*! \brief Calculate 1/x for SIMD float.
250 * You should normally call the real-precision routine \ref gmx_simd_inv_r.
252 * \param x Argument that must be nonzero. This routine does not check arguments.
253 * \return 1/x. Result is undefined if your argument was invalid.
255 static gmx_inline gmx_simd_float_t gmx_simdcall
256 gmx_simd_inv_f(gmx_simd_float_t x)
258 gmx_simd_float_t lu = gmx_simd_rcp_f(x);
259 #if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
260 lu = gmx_simd_rcp_iter_f(lu, x);
261 #endif
262 #if (GMX_SIMD_RCP_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
263 lu = gmx_simd_rcp_iter_f(lu, x);
264 #endif
265 #if (GMX_SIMD_RCP_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
266 lu = gmx_simd_rcp_iter_f(lu, x);
267 #endif
268 return lu;
271 /*! \brief Calculate 1/x for masked entries of SIMD float.
273 * Identical to gmx_simd_inv_f but avoids fp-exception for non-masked entries.
274 * The result for the non-masked entries is undefined and the user has to use blend
275 * with the same mask to obtain a defined result.
277 * \param x Argument that must be nonzero for masked entries
278 * \param m Masked entries
279 * \return 1/x. Result is undefined if your argument was invalid or entry was not masked.
281 static gmx_inline gmx_simd_float_t
282 gmx_simd_inv_maskfpe_f(gmx_simd_float_t x, gmx_simd_fbool_t gmx_unused m)
284 #ifdef NDEBUG
285 return gmx_simd_inv_f(x);
286 #else
287 return gmx_simd_inv_f(gmx_simd_blendv_f(gmx_simd_set1_f(1.0f), x, m));
288 #endif
292 /*! \brief Calculate 1/x for non-masked entries of SIMD float.
294 * Identical to gmx_simd_inv_f but avoids fp-exception for masked entries.
295 * The result for the non-masked entries is undefined and the user has to use blend
296 * with the same mask to obtain a defined result.
298 * \param x Argument that must be nonzero for non-masked entries
299 * \param m Masked entries
300 * \return 1/x. Result is undefined if your argument was invalid or entry was masked.
302 static gmx_inline gmx_simd_float_t
303 gmx_simd_inv_notmaskfpe_f(gmx_simd_float_t x, gmx_simd_fbool_t gmx_unused m)
305 #ifdef NDEBUG
306 return gmx_simd_inv_f(x);
307 #else
308 return gmx_simd_inv_f(gmx_simd_blendv_f(x, gmx_simd_set1_f(1.0f), m));
309 #endif
312 /*! \brief Calculate sqrt(x) correctly for SIMD floats, including argument 0.0.
314 * You should normally call the real-precision routine \ref gmx_simd_sqrt_r.
316 * \param x Argument that must be >=0.
317 * \return sqrt(x). If x=0, the result will correctly be set to 0.
318 * The result is undefined if the input value is negative.
320 static gmx_inline gmx_simd_float_t gmx_simdcall
321 gmx_simd_sqrt_f(gmx_simd_float_t x)
323 gmx_simd_fbool_t mask;
324 gmx_simd_float_t res;
326 mask = gmx_simd_cmpeq_f(x, gmx_simd_setzero_f());
327 res = gmx_simd_blendnotzero_f(gmx_simd_invsqrt_notmaskfpe_f(x, mask), mask);
328 return gmx_simd_mul_f(res, x);
331 /*! \brief SIMD float log(x). This is the natural logarithm.
333 * You should normally call the real-precision routine \ref gmx_simd_log_r.
335 * \param x Argument, should be >0.
336 * \result The natural logarithm of x. Undefined if argument is invalid.
338 #ifndef gmx_simd_log_f
339 static gmx_inline gmx_simd_float_t gmx_simdcall
340 gmx_simd_log_f(gmx_simd_float_t x)
342 const gmx_simd_float_t half = gmx_simd_set1_f(0.5f);
343 const gmx_simd_float_t one = gmx_simd_set1_f(1.0f);
344 const gmx_simd_float_t sqrt2 = gmx_simd_set1_f(sqrt(2.0f));
345 const gmx_simd_float_t corr = gmx_simd_set1_f(0.693147180559945286226764f);
346 const gmx_simd_float_t CL9 = gmx_simd_set1_f(0.2371599674224853515625f);
347 const gmx_simd_float_t CL7 = gmx_simd_set1_f(0.285279005765914916992188f);
348 const gmx_simd_float_t CL5 = gmx_simd_set1_f(0.400005519390106201171875f);
349 const gmx_simd_float_t CL3 = gmx_simd_set1_f(0.666666567325592041015625f);
350 const gmx_simd_float_t CL1 = gmx_simd_set1_f(2.0f);
351 gmx_simd_float_t fexp, x2, p;
352 gmx_simd_fbool_t mask;
354 fexp = gmx_simd_get_exponent_f(x);
355 x = gmx_simd_get_mantissa_f(x);
357 mask = gmx_simd_cmplt_f(sqrt2, x);
358 /* Adjust to non-IEEE format for x>sqrt(2): exponent += 1, mantissa *= 0.5 */
359 fexp = gmx_simd_add_f(fexp, gmx_simd_blendzero_f(one, mask));
360 x = gmx_simd_mul_f(x, gmx_simd_blendv_f(one, half, mask));
362 x = gmx_simd_mul_f( gmx_simd_sub_f(x, one), gmx_simd_inv_f( gmx_simd_add_f(x, one) ) );
363 x2 = gmx_simd_mul_f(x, x);
365 p = gmx_simd_fmadd_f(CL9, x2, CL7);
366 p = gmx_simd_fmadd_f(p, x2, CL5);
367 p = gmx_simd_fmadd_f(p, x2, CL3);
368 p = gmx_simd_fmadd_f(p, x2, CL1);
369 p = gmx_simd_fmadd_f(p, x, gmx_simd_mul_f(corr, fexp));
371 return p;
373 #endif
375 #ifndef gmx_simd_exp2_f
376 /*! \brief SIMD float 2^x.
378 * You should normally call the real-precision routine \ref gmx_simd_exp2_r.
380 * \param x Argument.
381 * \result 2^x. Undefined if input argument caused overflow.
383 static gmx_inline gmx_simd_float_t gmx_simdcall
384 gmx_simd_exp2_f(gmx_simd_float_t x)
386 /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
387 const gmx_simd_float_t arglimit = gmx_simd_set1_f(126.0f);
388 const gmx_simd_float_t CC6 = gmx_simd_set1_f(0.0001534581200287996416911311);
389 const gmx_simd_float_t CC5 = gmx_simd_set1_f(0.001339993121934088894618990);
390 const gmx_simd_float_t CC4 = gmx_simd_set1_f(0.009618488957115180159497841);
391 const gmx_simd_float_t CC3 = gmx_simd_set1_f(0.05550328776964726865751735);
392 const gmx_simd_float_t CC2 = gmx_simd_set1_f(0.2402264689063408646490722);
393 const gmx_simd_float_t CC1 = gmx_simd_set1_f(0.6931472057372680777553816);
394 const gmx_simd_float_t one = gmx_simd_set1_f(1.0f);
396 gmx_simd_float_t fexppart;
397 gmx_simd_float_t intpart;
398 gmx_simd_float_t p;
399 gmx_simd_fbool_t valuemask;
401 fexppart = gmx_simd_set_exponent_f(x);
402 intpart = gmx_simd_round_f(x);
403 valuemask = gmx_simd_cmple_f(gmx_simd_fabs_f(x), arglimit);
404 fexppart = gmx_simd_blendzero_f(fexppart, valuemask);
405 x = gmx_simd_sub_f(x, intpart);
407 p = gmx_simd_fmadd_f(CC6, x, CC5);
408 p = gmx_simd_fmadd_f(p, x, CC4);
409 p = gmx_simd_fmadd_f(p, x, CC3);
410 p = gmx_simd_fmadd_f(p, x, CC2);
411 p = gmx_simd_fmadd_f(p, x, CC1);
412 p = gmx_simd_fmadd_f(p, x, one);
413 x = gmx_simd_mul_f(p, fexppart);
414 return x;
416 #endif
418 #ifndef gmx_simd_exp_f
419 /*! \brief SIMD float exp(x).
421 * You should normally call the real-precision routine \ref gmx_simd_exp_r.
423 * In addition to scaling the argument for 2^x this routine correctly does
424 * extended precision arithmetics to improve accuracy.
426 * \param x Argument.
427 * \result exp(x). Undefined if input argument caused overflow,
428 * which can happen if abs(x) \> 7e13.
430 static gmx_inline gmx_simd_float_t gmx_simdcall
431 gmx_simd_exp_f(gmx_simd_float_t x)
433 const gmx_simd_float_t argscale = gmx_simd_set1_f(1.44269504088896341f);
434 /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
435 const gmx_simd_float_t arglimit = gmx_simd_set1_f(126.0f);
436 const gmx_simd_float_t invargscale0 = gmx_simd_set1_f(-0.693145751953125f);
437 const gmx_simd_float_t invargscale1 = gmx_simd_set1_f(-1.428606765330187045e-06f);
438 const gmx_simd_float_t CC4 = gmx_simd_set1_f(0.00136324646882712841033936f);
439 const gmx_simd_float_t CC3 = gmx_simd_set1_f(0.00836596917361021041870117f);
440 const gmx_simd_float_t CC2 = gmx_simd_set1_f(0.0416710823774337768554688f);
441 const gmx_simd_float_t CC1 = gmx_simd_set1_f(0.166665524244308471679688f);
442 const gmx_simd_float_t CC0 = gmx_simd_set1_f(0.499999850988388061523438f);
443 const gmx_simd_float_t one = gmx_simd_set1_f(1.0f);
444 gmx_simd_float_t fexppart;
445 gmx_simd_float_t intpart;
446 gmx_simd_float_t y, p;
447 gmx_simd_fbool_t valuemask;
449 y = gmx_simd_mul_f(x, argscale);
450 fexppart = gmx_simd_set_exponent_f(y); /* rounds to nearest int internally */
451 intpart = gmx_simd_round_f(y); /* use same rounding algorithm here */
452 valuemask = gmx_simd_cmple_f(gmx_simd_fabs_f(y), arglimit);
453 fexppart = gmx_simd_blendzero_f(fexppart, valuemask);
455 /* Extended precision arithmetics */
456 x = gmx_simd_fmadd_f(invargscale0, intpart, x);
457 x = gmx_simd_fmadd_f(invargscale1, intpart, x);
459 p = gmx_simd_fmadd_f(CC4, x, CC3);
460 p = gmx_simd_fmadd_f(p, x, CC2);
461 p = gmx_simd_fmadd_f(p, x, CC1);
462 p = gmx_simd_fmadd_f(p, x, CC0);
463 p = gmx_simd_fmadd_f(gmx_simd_mul_f(x, x), p, x);
464 p = gmx_simd_add_f(p, one);
465 x = gmx_simd_mul_f(p, fexppart);
466 return x;
468 #endif
470 /*! \brief SIMD float erf(x).
472 * You should normally call the real-precision routine \ref gmx_simd_erf_r.
474 * \param x The value to calculate erf(x) for.
475 * \result erf(x)
477 * This routine achieves very close to full precision, but we do not care about
478 * the last bit or the subnormal result range.
480 static gmx_inline gmx_simd_float_t gmx_simdcall
481 gmx_simd_erf_f(gmx_simd_float_t x)
483 /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
484 const gmx_simd_float_t CA6 = gmx_simd_set1_f(7.853861353153693e-5f);
485 const gmx_simd_float_t CA5 = gmx_simd_set1_f(-8.010193625184903e-4f);
486 const gmx_simd_float_t CA4 = gmx_simd_set1_f(5.188327685732524e-3f);
487 const gmx_simd_float_t CA3 = gmx_simd_set1_f(-2.685381193529856e-2f);
488 const gmx_simd_float_t CA2 = gmx_simd_set1_f(1.128358514861418e-1f);
489 const gmx_simd_float_t CA1 = gmx_simd_set1_f(-3.761262582423300e-1f);
490 const gmx_simd_float_t CA0 = gmx_simd_set1_f(1.128379165726710f);
491 /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
492 const gmx_simd_float_t CB9 = gmx_simd_set1_f(-0.0018629930017603923f);
493 const gmx_simd_float_t CB8 = gmx_simd_set1_f(0.003909821287598495f);
494 const gmx_simd_float_t CB7 = gmx_simd_set1_f(-0.0052094582210355615f);
495 const gmx_simd_float_t CB6 = gmx_simd_set1_f(0.005685614362160572f);
496 const gmx_simd_float_t CB5 = gmx_simd_set1_f(-0.0025367682853477272f);
497 const gmx_simd_float_t CB4 = gmx_simd_set1_f(-0.010199799682318782f);
498 const gmx_simd_float_t CB3 = gmx_simd_set1_f(0.04369575504816542f);
499 const gmx_simd_float_t CB2 = gmx_simd_set1_f(-0.11884063474674492f);
500 const gmx_simd_float_t CB1 = gmx_simd_set1_f(0.2732120154030589f);
501 const gmx_simd_float_t CB0 = gmx_simd_set1_f(0.42758357702025784f);
502 /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
503 const gmx_simd_float_t CC10 = gmx_simd_set1_f(-0.0445555913112064f);
504 const gmx_simd_float_t CC9 = gmx_simd_set1_f(0.21376355144663348f);
505 const gmx_simd_float_t CC8 = gmx_simd_set1_f(-0.3473187200259257f);
506 const gmx_simd_float_t CC7 = gmx_simd_set1_f(0.016690861551248114f);
507 const gmx_simd_float_t CC6 = gmx_simd_set1_f(0.7560973182491192f);
508 const gmx_simd_float_t CC5 = gmx_simd_set1_f(-1.2137903600145787f);
509 const gmx_simd_float_t CC4 = gmx_simd_set1_f(0.8411872321232948f);
510 const gmx_simd_float_t CC3 = gmx_simd_set1_f(-0.08670413896296343f);
511 const gmx_simd_float_t CC2 = gmx_simd_set1_f(-0.27124782687240334f);
512 const gmx_simd_float_t CC1 = gmx_simd_set1_f(-0.0007502488047806069f);
513 const gmx_simd_float_t CC0 = gmx_simd_set1_f(0.5642114853803148f);
514 const gmx_simd_float_t one = gmx_simd_set1_f(1.0f);
515 const gmx_simd_float_t two = gmx_simd_set1_f(2.0f);
517 gmx_simd_float_t x2, x4, y;
518 gmx_simd_float_t t, t2, w, w2;
519 gmx_simd_float_t pA0, pA1, pB0, pB1, pC0, pC1;
520 gmx_simd_float_t expmx2;
521 gmx_simd_float_t res_erf, res_erfc, res;
522 gmx_simd_fbool_t mask, msk_erf;
524 /* Calculate erf() */
525 x2 = gmx_simd_mul_f(x, x);
526 x4 = gmx_simd_mul_f(x2, x2);
528 pA0 = gmx_simd_fmadd_f(CA6, x4, CA4);
529 pA1 = gmx_simd_fmadd_f(CA5, x4, CA3);
530 pA0 = gmx_simd_fmadd_f(pA0, x4, CA2);
531 pA1 = gmx_simd_fmadd_f(pA1, x4, CA1);
532 pA0 = gmx_simd_mul_f(pA0, x4);
533 pA0 = gmx_simd_fmadd_f(pA1, x2, pA0);
534 /* Constant term must come last for precision reasons */
535 pA0 = gmx_simd_add_f(pA0, CA0);
537 res_erf = gmx_simd_mul_f(x, pA0);
539 /* Calculate erfc */
540 y = gmx_simd_fabs_f(x);
541 msk_erf = gmx_simd_cmplt_f(y, gmx_simd_set1_f(0.75f));
542 t = gmx_simd_inv_notmaskfpe_f(y, msk_erf);
543 w = gmx_simd_sub_f(t, one);
544 t2 = gmx_simd_mul_f(t, t);
545 w2 = gmx_simd_mul_f(w, w);
547 /* No need for a floating-point sieve here (as in erfc), since erf()
548 * will never return values that are extremely small for large args.
550 expmx2 = gmx_simd_exp_f( gmx_simd_fneg_f( gmx_simd_mul_f(y, y)));
552 pB1 = gmx_simd_fmadd_f(CB9, w2, CB7);
553 pB0 = gmx_simd_fmadd_f(CB8, w2, CB6);
554 pB1 = gmx_simd_fmadd_f(pB1, w2, CB5);
555 pB0 = gmx_simd_fmadd_f(pB0, w2, CB4);
556 pB1 = gmx_simd_fmadd_f(pB1, w2, CB3);
557 pB0 = gmx_simd_fmadd_f(pB0, w2, CB2);
558 pB1 = gmx_simd_fmadd_f(pB1, w2, CB1);
559 pB0 = gmx_simd_fmadd_f(pB0, w2, CB0);
560 pB0 = gmx_simd_fmadd_f(pB1, w, pB0);
562 pC0 = gmx_simd_fmadd_f(CC10, t2, CC8);
563 pC1 = gmx_simd_fmadd_f(CC9, t2, CC7);
564 pC0 = gmx_simd_fmadd_f(pC0, t2, CC6);
565 pC1 = gmx_simd_fmadd_f(pC1, t2, CC5);
566 pC0 = gmx_simd_fmadd_f(pC0, t2, CC4);
567 pC1 = gmx_simd_fmadd_f(pC1, t2, CC3);
568 pC0 = gmx_simd_fmadd_f(pC0, t2, CC2);
569 pC1 = gmx_simd_fmadd_f(pC1, t2, CC1);
571 pC0 = gmx_simd_fmadd_f(pC0, t2, CC0);
572 pC0 = gmx_simd_fmadd_f(pC1, t, pC0);
573 pC0 = gmx_simd_mul_f(pC0, t);
575 /* SELECT pB0 or pC0 for erfc() */
576 mask = gmx_simd_cmplt_f(two, y);
577 res_erfc = gmx_simd_blendv_f(pB0, pC0, mask);
578 res_erfc = gmx_simd_mul_f(res_erfc, expmx2);
580 /* erfc(x<0) = 2-erfc(|x|) */
581 mask = gmx_simd_cmplt_f(x, gmx_simd_setzero_f());
582 res_erfc = gmx_simd_blendv_f(res_erfc, gmx_simd_sub_f(two, res_erfc), mask);
584 /* Select erf() or erfc() */
585 res = gmx_simd_blendv_f(gmx_simd_sub_f(one, res_erfc), res_erf, msk_erf);
587 return res;
590 /*! \brief SIMD float erfc(x).
592 * You should normally call the real-precision routine \ref gmx_simd_erfc_r.
594 * \param x The value to calculate erfc(x) for.
595 * \result erfc(x)
597 * This routine achieves full precision (bar the last bit) over most of the
598 * input range, but for large arguments where the result is getting close
599 * to the minimum representable numbers we accept slightly larger errors
600 * (think results that are in the ballpark of 10^-30 for single precision,
601 * or 10^-200 for double) since that is not relevant for MD.
603 static gmx_inline gmx_simd_float_t gmx_simdcall
604 gmx_simd_erfc_f(gmx_simd_float_t x)
606 /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
607 const gmx_simd_float_t CA6 = gmx_simd_set1_f(7.853861353153693e-5f);
608 const gmx_simd_float_t CA5 = gmx_simd_set1_f(-8.010193625184903e-4f);
609 const gmx_simd_float_t CA4 = gmx_simd_set1_f(5.188327685732524e-3f);
610 const gmx_simd_float_t CA3 = gmx_simd_set1_f(-2.685381193529856e-2f);
611 const gmx_simd_float_t CA2 = gmx_simd_set1_f(1.128358514861418e-1f);
612 const gmx_simd_float_t CA1 = gmx_simd_set1_f(-3.761262582423300e-1f);
613 const gmx_simd_float_t CA0 = gmx_simd_set1_f(1.128379165726710f);
614 /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
615 const gmx_simd_float_t CB9 = gmx_simd_set1_f(-0.0018629930017603923f);
616 const gmx_simd_float_t CB8 = gmx_simd_set1_f(0.003909821287598495f);
617 const gmx_simd_float_t CB7 = gmx_simd_set1_f(-0.0052094582210355615f);
618 const gmx_simd_float_t CB6 = gmx_simd_set1_f(0.005685614362160572f);
619 const gmx_simd_float_t CB5 = gmx_simd_set1_f(-0.0025367682853477272f);
620 const gmx_simd_float_t CB4 = gmx_simd_set1_f(-0.010199799682318782f);
621 const gmx_simd_float_t CB3 = gmx_simd_set1_f(0.04369575504816542f);
622 const gmx_simd_float_t CB2 = gmx_simd_set1_f(-0.11884063474674492f);
623 const gmx_simd_float_t CB1 = gmx_simd_set1_f(0.2732120154030589f);
624 const gmx_simd_float_t CB0 = gmx_simd_set1_f(0.42758357702025784f);
625 /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
626 const gmx_simd_float_t CC10 = gmx_simd_set1_f(-0.0445555913112064f);
627 const gmx_simd_float_t CC9 = gmx_simd_set1_f(0.21376355144663348f);
628 const gmx_simd_float_t CC8 = gmx_simd_set1_f(-0.3473187200259257f);
629 const gmx_simd_float_t CC7 = gmx_simd_set1_f(0.016690861551248114f);
630 const gmx_simd_float_t CC6 = gmx_simd_set1_f(0.7560973182491192f);
631 const gmx_simd_float_t CC5 = gmx_simd_set1_f(-1.2137903600145787f);
632 const gmx_simd_float_t CC4 = gmx_simd_set1_f(0.8411872321232948f);
633 const gmx_simd_float_t CC3 = gmx_simd_set1_f(-0.08670413896296343f);
634 const gmx_simd_float_t CC2 = gmx_simd_set1_f(-0.27124782687240334f);
635 const gmx_simd_float_t CC1 = gmx_simd_set1_f(-0.0007502488047806069f);
636 const gmx_simd_float_t CC0 = gmx_simd_set1_f(0.5642114853803148f);
637 /* Coefficients for expansion of exp(x) in [0,0.1] */
638 /* CD0 and CD1 are both 1.0, so no need to declare them separately */
639 const gmx_simd_float_t CD2 = gmx_simd_set1_f(0.5000066608081202f);
640 const gmx_simd_float_t CD3 = gmx_simd_set1_f(0.1664795422874624f);
641 const gmx_simd_float_t CD4 = gmx_simd_set1_f(0.04379839977652482f);
642 const gmx_simd_float_t one = gmx_simd_set1_f(1.0f);
643 const gmx_simd_float_t two = gmx_simd_set1_f(2.0f);
645 /* We need to use a small trick here, since we cannot assume all SIMD
646 * architectures support integers, and the flag we want (0xfffff000) would
647 * evaluate to NaN (i.e., it cannot be expressed as a floating-point num).
648 * Instead, we represent the flags 0xf0f0f000 and 0x0f0f0000 as valid
649 * fp numbers, and perform a logical or. Since the expression is constant,
650 * we can at least hope it is evaluated at compile-time.
652 #ifdef GMX_SIMD_HAVE_LOGICAL
653 const gmx_simd_float_t sieve = gmx_simd_or_f(gmx_simd_set1_f(-5.965323564e+29f), gmx_simd_set1_f(7.05044434e-30f));
654 #else
655 const int isieve = 0xFFFFF000;
656 float mem[GMX_SIMD_REAL_WIDTH*2];
657 float * pmem = gmx_simd_align_f(mem);
658 union {
659 float f; int i;
660 } conv;
661 int i;
662 #endif
664 gmx_simd_float_t x2, x4, y;
665 gmx_simd_float_t q, z, t, t2, w, w2;
666 gmx_simd_float_t pA0, pA1, pB0, pB1, pC0, pC1;
667 gmx_simd_float_t expmx2, corr;
668 gmx_simd_float_t res_erf, res_erfc, res;
669 gmx_simd_fbool_t mask, msk_erf;
671 /* Calculate erf() */
672 x2 = gmx_simd_mul_f(x, x);
673 x4 = gmx_simd_mul_f(x2, x2);
675 pA0 = gmx_simd_fmadd_f(CA6, x4, CA4);
676 pA1 = gmx_simd_fmadd_f(CA5, x4, CA3);
677 pA0 = gmx_simd_fmadd_f(pA0, x4, CA2);
678 pA1 = gmx_simd_fmadd_f(pA1, x4, CA1);
679 pA1 = gmx_simd_mul_f(pA1, x2);
680 pA0 = gmx_simd_fmadd_f(pA0, x4, pA1);
681 /* Constant term must come last for precision reasons */
682 pA0 = gmx_simd_add_f(pA0, CA0);
684 res_erf = gmx_simd_mul_f(x, pA0);
686 /* Calculate erfc */
687 y = gmx_simd_fabs_f(x);
688 msk_erf = gmx_simd_cmplt_f(y, gmx_simd_set1_f(0.75f));
689 t = gmx_simd_inv_notmaskfpe_f(y, msk_erf);
690 w = gmx_simd_sub_f(t, one);
691 t2 = gmx_simd_mul_f(t, t);
692 w2 = gmx_simd_mul_f(w, w);
694 * We cannot simply calculate exp(-y2) directly in single precision, since
695 * that will lose a couple of bits of precision due to the multiplication.
696 * Instead, we introduce y=z+w, where the last 12 bits of precision are in w.
697 * Then we get exp(-y2) = exp(-z2)*exp((z-y)*(z+y)).
699 * The only drawback with this is that it requires TWO separate exponential
700 * evaluations, which would be horrible performance-wise. However, the argument
701 * for the second exp() call is always small, so there we simply use a
702 * low-order minimax expansion on [0,0.1].
704 * However, this neat idea requires support for logical ops (and) on
705 * FP numbers, which some vendors decided isn't necessary in their SIMD
706 * instruction sets (Hi, IBM VSX!). In principle we could use some tricks
707 * in double, but we still need memory as a backup when that is not available,
708 * and this case is rare enough that we go directly there...
710 #ifdef GMX_SIMD_HAVE_LOGICAL
711 z = gmx_simd_and_f(y, sieve);
712 #else
713 gmx_simd_store_f(pmem, y);
714 for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
716 conv.f = pmem[i];
717 conv.i = conv.i & isieve;
718 pmem[i] = conv.f;
720 z = gmx_simd_load_f(pmem);
721 #endif
722 q = gmx_simd_mul_f( gmx_simd_sub_f(z, y), gmx_simd_add_f(z, y) );
723 corr = gmx_simd_fmadd_f(CD4, q, CD3);
724 corr = gmx_simd_fmadd_f(corr, q, CD2);
725 corr = gmx_simd_fmadd_f(corr, q, one);
726 corr = gmx_simd_fmadd_f(corr, q, one);
728 expmx2 = gmx_simd_exp_f( gmx_simd_fneg_f( gmx_simd_mul_f(z, z) ) );
729 expmx2 = gmx_simd_mul_f(expmx2, corr);
731 pB1 = gmx_simd_fmadd_f(CB9, w2, CB7);
732 pB0 = gmx_simd_fmadd_f(CB8, w2, CB6);
733 pB1 = gmx_simd_fmadd_f(pB1, w2, CB5);
734 pB0 = gmx_simd_fmadd_f(pB0, w2, CB4);
735 pB1 = gmx_simd_fmadd_f(pB1, w2, CB3);
736 pB0 = gmx_simd_fmadd_f(pB0, w2, CB2);
737 pB1 = gmx_simd_fmadd_f(pB1, w2, CB1);
738 pB0 = gmx_simd_fmadd_f(pB0, w2, CB0);
739 pB0 = gmx_simd_fmadd_f(pB1, w, pB0);
741 pC0 = gmx_simd_fmadd_f(CC10, t2, CC8);
742 pC1 = gmx_simd_fmadd_f(CC9, t2, CC7);
743 pC0 = gmx_simd_fmadd_f(pC0, t2, CC6);
744 pC1 = gmx_simd_fmadd_f(pC1, t2, CC5);
745 pC0 = gmx_simd_fmadd_f(pC0, t2, CC4);
746 pC1 = gmx_simd_fmadd_f(pC1, t2, CC3);
747 pC0 = gmx_simd_fmadd_f(pC0, t2, CC2);
748 pC1 = gmx_simd_fmadd_f(pC1, t2, CC1);
750 pC0 = gmx_simd_fmadd_f(pC0, t2, CC0);
751 pC0 = gmx_simd_fmadd_f(pC1, t, pC0);
752 pC0 = gmx_simd_mul_f(pC0, t);
754 /* SELECT pB0 or pC0 for erfc() */
755 mask = gmx_simd_cmplt_f(two, y);
756 res_erfc = gmx_simd_blendv_f(pB0, pC0, mask);
757 res_erfc = gmx_simd_mul_f(res_erfc, expmx2);
759 /* erfc(x<0) = 2-erfc(|x|) */
760 mask = gmx_simd_cmplt_f(x, gmx_simd_setzero_f());
761 res_erfc = gmx_simd_blendv_f(res_erfc, gmx_simd_sub_f(two, res_erfc), mask);
763 /* Select erf() or erfc() */
764 res = gmx_simd_blendv_f(res_erfc, gmx_simd_sub_f(one, res_erf), msk_erf);
766 return res;
769 /*! \brief SIMD float sin \& cos.
771 * You should normally call the real-precision routine \ref gmx_simd_sincos_r.
773 * \param x The argument to evaluate sin/cos for
774 * \param[out] sinval Sin(x)
775 * \param[out] cosval Cos(x)
777 * This version achieves close to machine precision, but for very large
778 * magnitudes of the argument we inherently begin to lose accuracy due to the
779 * argument reduction, despite using extended precision arithmetics internally.
781 static gmx_inline void gmx_simdcall
782 gmx_simd_sincos_f(gmx_simd_float_t x, gmx_simd_float_t *sinval, gmx_simd_float_t *cosval)
784 /* Constants to subtract Pi/4*x from y while minimizing precision loss */
785 const gmx_simd_float_t argred0 = gmx_simd_set1_f(-1.5703125);
786 const gmx_simd_float_t argred1 = gmx_simd_set1_f(-4.83751296997070312500e-04f);
787 const gmx_simd_float_t argred2 = gmx_simd_set1_f(-7.54953362047672271729e-08f);
788 const gmx_simd_float_t argred3 = gmx_simd_set1_f(-2.56334406825708960298e-12f);
789 const gmx_simd_float_t two_over_pi = gmx_simd_set1_f(2.0f/M_PI);
790 const gmx_simd_float_t const_sin2 = gmx_simd_set1_f(-1.9515295891e-4f);
791 const gmx_simd_float_t const_sin1 = gmx_simd_set1_f( 8.3321608736e-3f);
792 const gmx_simd_float_t const_sin0 = gmx_simd_set1_f(-1.6666654611e-1f);
793 const gmx_simd_float_t const_cos2 = gmx_simd_set1_f( 2.443315711809948e-5f);
794 const gmx_simd_float_t const_cos1 = gmx_simd_set1_f(-1.388731625493765e-3f);
795 const gmx_simd_float_t const_cos0 = gmx_simd_set1_f( 4.166664568298827e-2f);
796 const gmx_simd_float_t half = gmx_simd_set1_f(0.5f);
797 const gmx_simd_float_t one = gmx_simd_set1_f(1.0f);
798 gmx_simd_float_t ssign, csign;
799 gmx_simd_float_t x2, y, z, psin, pcos, sss, ccc;
800 gmx_simd_fbool_t mask;
801 #if (defined GMX_SIMD_HAVE_FINT32) && (defined GMX_SIMD_HAVE_FINT32_ARITHMETICS) && (defined GMX_SIMD_HAVE_LOGICAL)
802 const gmx_simd_fint32_t ione = gmx_simd_set1_fi(1);
803 const gmx_simd_fint32_t itwo = gmx_simd_set1_fi(2);
804 gmx_simd_fint32_t iy;
806 z = gmx_simd_mul_f(x, two_over_pi);
807 iy = gmx_simd_cvt_f2i(z);
808 y = gmx_simd_round_f(z);
810 mask = gmx_simd_cvt_fib2fb(gmx_simd_cmpeq_fi(gmx_simd_and_fi(iy, ione), gmx_simd_setzero_fi()));
811 ssign = gmx_simd_blendzero_f(gmx_simd_set1_f(GMX_FLOAT_NEGZERO), gmx_simd_cvt_fib2fb(gmx_simd_cmpeq_fi(gmx_simd_and_fi(iy, itwo), itwo)));
812 csign = gmx_simd_blendzero_f(gmx_simd_set1_f(GMX_FLOAT_NEGZERO), gmx_simd_cvt_fib2fb(gmx_simd_cmpeq_fi(gmx_simd_and_fi(gmx_simd_add_fi(iy, ione), itwo), itwo)));
813 #else
814 const gmx_simd_float_t quarter = gmx_simd_set1_f(0.25f);
815 const gmx_simd_float_t minusquarter = gmx_simd_set1_f(-0.25f);
816 gmx_simd_float_t q;
817 gmx_simd_fbool_t m1, m2, m3;
819 /* The most obvious way to find the arguments quadrant in the unit circle
820 * to calculate the sign is to use integer arithmetic, but that is not
821 * present in all SIMD implementations. As an alternative, we have devised a
822 * pure floating-point algorithm that uses truncation for argument reduction
823 * so that we get a new value 0<=q<1 over the unit circle, and then
824 * do floating-point comparisons with fractions. This is likely to be
825 * slightly slower (~10%) due to the longer latencies of floating-point, so
826 * we only use it when integer SIMD arithmetic is not present.
828 ssign = x;
829 x = gmx_simd_fabs_f(x);
830 /* It is critical that half-way cases are rounded down */
831 z = gmx_simd_fmadd_f(x, two_over_pi, half);
832 y = gmx_simd_trunc_f(z);
833 q = gmx_simd_mul_f(z, quarter);
834 q = gmx_simd_sub_f(q, gmx_simd_trunc_f(q));
835 /* z now starts at 0.0 for x=-pi/4 (although neg. values cannot occur), and
836 * then increased by 1.0 as x increases by 2*Pi, when it resets to 0.0.
837 * This removes the 2*Pi periodicity without using any integer arithmetic.
838 * First check if y had the value 2 or 3, set csign if true.
840 q = gmx_simd_sub_f(q, half);
841 /* If we have logical operations we can work directly on the signbit, which
842 * saves instructions. Otherwise we need to represent signs as +1.0/-1.0.
843 * Thus, if you are altering defines to debug alternative code paths, the
844 * two GMX_SIMD_HAVE_LOGICAL sections in this routine must either both be
845 * active or inactive - you will get errors if only one is used.
847 # ifdef GMX_SIMD_HAVE_LOGICAL
848 ssign = gmx_simd_and_f(ssign, gmx_simd_set1_f(GMX_FLOAT_NEGZERO));
849 csign = gmx_simd_andnot_f(q, gmx_simd_set1_f(GMX_FLOAT_NEGZERO));
850 ssign = gmx_simd_xor_f(ssign, csign);
851 # else
852 csign = gmx_simd_xor_sign_f(gmx_simd_set1_f(-1.0f), q);
853 // ALT: csign = gmx_simd_fneg_f(gmx_simd_copysign(gmx_simd_set1_f(1.0),q));
855 ssign = gmx_simd_xor_sign_f(ssign, csign); /* swap ssign if csign was set. */
856 # endif
857 /* Check if y had value 1 or 3 (remember we subtracted 0.5 from q) */
858 m1 = gmx_simd_cmplt_f(q, minusquarter);
859 m2 = gmx_simd_cmple_f(gmx_simd_setzero_f(), q);
860 m3 = gmx_simd_cmplt_f(q, quarter);
861 m2 = gmx_simd_and_fb(m2, m3);
862 mask = gmx_simd_or_fb(m1, m2);
863 /* where mask is FALSE, set sign. */
864 csign = gmx_simd_xor_sign_f(csign, gmx_simd_blendv_f(gmx_simd_set1_f(-1.0f), one, mask));
865 #endif
866 x = gmx_simd_fmadd_f(y, argred0, x);
867 x = gmx_simd_fmadd_f(y, argred1, x);
868 x = gmx_simd_fmadd_f(y, argred2, x);
869 x = gmx_simd_fmadd_f(y, argred3, x);
870 x2 = gmx_simd_mul_f(x, x);
872 psin = gmx_simd_fmadd_f(const_sin2, x2, const_sin1);
873 psin = gmx_simd_fmadd_f(psin, x2, const_sin0);
874 psin = gmx_simd_fmadd_f(psin, gmx_simd_mul_f(x, x2), x);
875 pcos = gmx_simd_fmadd_f(const_cos2, x2, const_cos1);
876 pcos = gmx_simd_fmadd_f(pcos, x2, const_cos0);
877 pcos = gmx_simd_fmsub_f(pcos, x2, half);
878 pcos = gmx_simd_fmadd_f(pcos, x2, one);
880 sss = gmx_simd_blendv_f(pcos, psin, mask);
881 ccc = gmx_simd_blendv_f(psin, pcos, mask);
882 /* See comment for GMX_SIMD_HAVE_LOGICAL section above. */
883 #ifdef GMX_SIMD_HAVE_LOGICAL
884 *sinval = gmx_simd_xor_f(sss, ssign);
885 *cosval = gmx_simd_xor_f(ccc, csign);
886 #else
887 *sinval = gmx_simd_xor_sign_f(sss, ssign);
888 *cosval = gmx_simd_xor_sign_f(ccc, csign);
889 #endif
892 /*! \brief SIMD float sin(x).
894 * You should normally call the real-precision routine \ref gmx_simd_sin_r.
896 * \param x The argument to evaluate sin for
897 * \result Sin(x)
899 * \attention Do NOT call both sin & cos if you need both results, since each of them
900 * will then call \ref gmx_simd_sincos_r and waste a factor 2 in performance.
902 static gmx_inline gmx_simd_float_t gmx_simdcall
903 gmx_simd_sin_f(gmx_simd_float_t x)
905 gmx_simd_float_t s, c;
906 gmx_simd_sincos_f(x, &s, &c);
907 return s;
910 /*! \brief SIMD float cos(x).
912 * You should normally call the real-precision routine \ref gmx_simd_cos_r.
914 * \param x The argument to evaluate cos for
915 * \result Cos(x)
917 * \attention Do NOT call both sin & cos if you need both results, since each of them
918 * will then call \ref gmx_simd_sincos_r and waste a factor 2 in performance.
920 static gmx_inline gmx_simd_float_t gmx_simdcall
921 gmx_simd_cos_f(gmx_simd_float_t x)
923 gmx_simd_float_t s, c;
924 gmx_simd_sincos_f(x, &s, &c);
925 return c;
928 /*! \brief SIMD float tan(x).
930 * You should normally call the real-precision routine \ref gmx_simd_tan_r.
932 * \param x The argument to evaluate tan for
933 * \result Tan(x)
935 static gmx_inline gmx_simd_float_t gmx_simdcall
936 gmx_simd_tan_f(gmx_simd_float_t x)
938 const gmx_simd_float_t argred0 = gmx_simd_set1_f(-1.5703125);
939 const gmx_simd_float_t argred1 = gmx_simd_set1_f(-4.83751296997070312500e-04f);
940 const gmx_simd_float_t argred2 = gmx_simd_set1_f(-7.54953362047672271729e-08f);
941 const gmx_simd_float_t argred3 = gmx_simd_set1_f(-2.56334406825708960298e-12f);
942 const gmx_simd_float_t two_over_pi = gmx_simd_set1_f(2.0f/M_PI);
943 const gmx_simd_float_t CT6 = gmx_simd_set1_f(0.009498288995810566122993911);
944 const gmx_simd_float_t CT5 = gmx_simd_set1_f(0.002895755790837379295226923);
945 const gmx_simd_float_t CT4 = gmx_simd_set1_f(0.02460087336161924491836265);
946 const gmx_simd_float_t CT3 = gmx_simd_set1_f(0.05334912882656359828045988);
947 const gmx_simd_float_t CT2 = gmx_simd_set1_f(0.1333989091464957704418495);
948 const gmx_simd_float_t CT1 = gmx_simd_set1_f(0.3333307599244198227797507);
950 gmx_simd_float_t x2, p, y, z;
951 gmx_simd_fbool_t mask;
953 #if (defined GMX_SIMD_HAVE_FINT32) && (defined GMX_SIMD_HAVE_FINT32_ARITHMETICS) && (defined GMX_SIMD_HAVE_LOGICAL)
954 gmx_simd_fint32_t iy;
955 gmx_simd_fint32_t ione = gmx_simd_set1_fi(1);
957 z = gmx_simd_mul_f(x, two_over_pi);
958 iy = gmx_simd_cvt_f2i(z);
959 y = gmx_simd_round_f(z);
960 mask = gmx_simd_cvt_fib2fb(gmx_simd_cmpeq_fi(gmx_simd_and_fi(iy, ione), ione));
962 x = gmx_simd_fmadd_f(y, argred0, x);
963 x = gmx_simd_fmadd_f(y, argred1, x);
964 x = gmx_simd_fmadd_f(y, argred2, x);
965 x = gmx_simd_fmadd_f(y, argred3, x);
966 x = gmx_simd_xor_f(gmx_simd_blendzero_f(gmx_simd_set1_f(GMX_FLOAT_NEGZERO), mask), x);
967 #else
968 const gmx_simd_float_t quarter = gmx_simd_set1_f(0.25f);
969 const gmx_simd_float_t half = gmx_simd_set1_f(0.5f);
970 const gmx_simd_float_t threequarter = gmx_simd_set1_f(0.75f);
971 gmx_simd_float_t w, q;
972 gmx_simd_fbool_t m1, m2, m3;
974 w = gmx_simd_fabs_f(x);
975 z = gmx_simd_fmadd_f(w, two_over_pi, half);
976 y = gmx_simd_trunc_f(z);
977 q = gmx_simd_mul_f(z, quarter);
978 q = gmx_simd_sub_f(q, gmx_simd_trunc_f(q));
979 m1 = gmx_simd_cmple_f(quarter, q);
980 m2 = gmx_simd_cmplt_f(q, half);
981 m3 = gmx_simd_cmple_f(threequarter, q);
982 m1 = gmx_simd_and_fb(m1, m2);
983 mask = gmx_simd_or_fb(m1, m3);
984 w = gmx_simd_fmadd_f(y, argred0, w);
985 w = gmx_simd_fmadd_f(y, argred1, w);
986 w = gmx_simd_fmadd_f(y, argred2, w);
987 w = gmx_simd_fmadd_f(y, argred3, w);
989 w = gmx_simd_blendv_f(w, gmx_simd_fneg_f(w), mask);
990 x = gmx_simd_xor_sign_f(w, x);
991 #endif
992 x2 = gmx_simd_mul_f(x, x);
993 p = gmx_simd_fmadd_f(CT6, x2, CT5);
994 p = gmx_simd_fmadd_f(p, x2, CT4);
995 p = gmx_simd_fmadd_f(p, x2, CT3);
996 p = gmx_simd_fmadd_f(p, x2, CT2);
997 p = gmx_simd_fmadd_f(p, x2, CT1);
998 p = gmx_simd_fmadd_f(x2, gmx_simd_mul_f(p, x), x);
1000 p = gmx_simd_blendv_f( p, gmx_simd_inv_maskfpe_f(p, mask), mask);
1001 return p;
1004 /*! \brief SIMD float asin(x).
1006 * You should normally call the real-precision routine \ref gmx_simd_asin_r.
1008 * \param x The argument to evaluate asin for
1009 * \result Asin(x)
1011 static gmx_inline gmx_simd_float_t gmx_simdcall
1012 gmx_simd_asin_f(gmx_simd_float_t x)
1014 const gmx_simd_float_t limitlow = gmx_simd_set1_f(1e-4f);
1015 const gmx_simd_float_t half = gmx_simd_set1_f(0.5f);
1016 const gmx_simd_float_t one = gmx_simd_set1_f(1.0f);
1017 const gmx_simd_float_t halfpi = gmx_simd_set1_f((float)M_PI/2.0f);
1018 const gmx_simd_float_t CC5 = gmx_simd_set1_f(4.2163199048E-2f);
1019 const gmx_simd_float_t CC4 = gmx_simd_set1_f(2.4181311049E-2f);
1020 const gmx_simd_float_t CC3 = gmx_simd_set1_f(4.5470025998E-2f);
1021 const gmx_simd_float_t CC2 = gmx_simd_set1_f(7.4953002686E-2f);
1022 const gmx_simd_float_t CC1 = gmx_simd_set1_f(1.6666752422E-1f);
1023 gmx_simd_float_t xabs;
1024 gmx_simd_float_t z, z1, z2, q, q1, q2;
1025 gmx_simd_float_t pA, pB;
1026 gmx_simd_fbool_t mask, mask2;
1028 xabs = gmx_simd_fabs_f(x);
1029 mask = gmx_simd_cmplt_f(half, xabs);
1030 z1 = gmx_simd_mul_f(half, gmx_simd_sub_f(one, xabs));
1031 mask2 = gmx_simd_cmpeq_f(xabs, one);
1032 q1 = gmx_simd_mul_f(z1, gmx_simd_invsqrt_notmaskfpe_f(z1, mask2));
1033 q1 = gmx_simd_blendnotzero_f(q1, mask2);
1034 q2 = xabs;
1035 z2 = gmx_simd_mul_f(q2, q2);
1036 z = gmx_simd_blendv_f(z2, z1, mask);
1037 q = gmx_simd_blendv_f(q2, q1, mask);
1039 z2 = gmx_simd_mul_f(z, z);
1040 pA = gmx_simd_fmadd_f(CC5, z2, CC3);
1041 pB = gmx_simd_fmadd_f(CC4, z2, CC2);
1042 pA = gmx_simd_fmadd_f(pA, z2, CC1);
1043 pA = gmx_simd_mul_f(pA, z);
1044 z = gmx_simd_fmadd_f(pB, z2, pA);
1045 z = gmx_simd_fmadd_f(z, q, q);
1046 q2 = gmx_simd_sub_f(halfpi, z);
1047 q2 = gmx_simd_sub_f(q2, z);
1048 z = gmx_simd_blendv_f(z, q2, mask);
1050 mask = gmx_simd_cmplt_f(limitlow, xabs);
1051 z = gmx_simd_blendv_f( xabs, z, mask );
1052 z = gmx_simd_xor_sign_f(z, x);
1054 return z;
1057 /*! \brief SIMD float acos(x).
1059 * You should normally call the real-precision routine \ref gmx_simd_acos_r.
1061 * \param x The argument to evaluate acos for
1062 * \result Acos(x)
1064 static gmx_inline gmx_simd_float_t gmx_simdcall
1065 gmx_simd_acos_f(gmx_simd_float_t x)
1067 const gmx_simd_float_t one = gmx_simd_set1_f(1.0f);
1068 const gmx_simd_float_t half = gmx_simd_set1_f(0.5f);
1069 const gmx_simd_float_t pi = gmx_simd_set1_f((float)M_PI);
1070 const gmx_simd_float_t halfpi = gmx_simd_set1_f((float)M_PI/2.0f);
1071 gmx_simd_float_t xabs;
1072 gmx_simd_float_t z, z1, z2, z3;
1073 gmx_simd_fbool_t mask1, mask2, mask3;
1075 xabs = gmx_simd_fabs_f(x);
1076 mask1 = gmx_simd_cmplt_f(half, xabs);
1077 mask2 = gmx_simd_cmplt_f(gmx_simd_setzero_f(), x);
1079 z = gmx_simd_mul_f(half, gmx_simd_sub_f(one, xabs));
1080 mask3 = gmx_simd_cmpeq_f(xabs, one);
1081 z = gmx_simd_mul_f(z, gmx_simd_invsqrt_notmaskfpe_f(z, mask3));
1082 z = gmx_simd_blendnotzero_f(z, mask3);
1083 z = gmx_simd_blendv_f(x, z, mask1);
1084 z = gmx_simd_asin_f(z);
1086 z2 = gmx_simd_add_f(z, z);
1087 z1 = gmx_simd_sub_f(pi, z2);
1088 z3 = gmx_simd_sub_f(halfpi, z);
1089 z = gmx_simd_blendv_f(z1, z2, mask2);
1090 z = gmx_simd_blendv_f(z3, z, mask1);
1092 return z;
1095 /*! \brief SIMD float asin(x).
1097 * You should normally call the real-precision routine \ref gmx_simd_atan_r.
1099 * \param x The argument to evaluate atan for
1100 * \result Atan(x), same argument/value range as standard math library.
1102 static gmx_inline gmx_simd_float_t gmx_simdcall
1103 gmx_simd_atan_f(gmx_simd_float_t x)
1105 const gmx_simd_float_t halfpi = gmx_simd_set1_f(M_PI/2);
1106 const gmx_simd_float_t CA17 = gmx_simd_set1_f(0.002823638962581753730774f);
1107 const gmx_simd_float_t CA15 = gmx_simd_set1_f(-0.01595690287649631500244f);
1108 const gmx_simd_float_t CA13 = gmx_simd_set1_f(0.04250498861074447631836f);
1109 const gmx_simd_float_t CA11 = gmx_simd_set1_f(-0.07489009201526641845703f);
1110 const gmx_simd_float_t CA9 = gmx_simd_set1_f(0.1063479334115982055664f);
1111 const gmx_simd_float_t CA7 = gmx_simd_set1_f(-0.1420273631811141967773f);
1112 const gmx_simd_float_t CA5 = gmx_simd_set1_f(0.1999269574880599975585f);
1113 const gmx_simd_float_t CA3 = gmx_simd_set1_f(-0.3333310186862945556640f);
1114 const gmx_simd_float_t one = gmx_simd_set1_f(1.0f);
1115 gmx_simd_float_t x2, x3, x4, pA, pB;
1116 gmx_simd_fbool_t mask, mask2;
1118 mask = gmx_simd_cmplt_f(x, gmx_simd_setzero_f());
1119 x = gmx_simd_fabs_f(x);
1120 mask2 = gmx_simd_cmplt_f(one, x);
1121 x = gmx_simd_blendv_f(x, gmx_simd_inv_maskfpe_f(x, mask2), mask2);
1123 x2 = gmx_simd_mul_f(x, x);
1124 x3 = gmx_simd_mul_f(x2, x);
1125 x4 = gmx_simd_mul_f(x2, x2);
1126 pA = gmx_simd_fmadd_f(CA17, x4, CA13);
1127 pB = gmx_simd_fmadd_f(CA15, x4, CA11);
1128 pA = gmx_simd_fmadd_f(pA, x4, CA9);
1129 pB = gmx_simd_fmadd_f(pB, x4, CA7);
1130 pA = gmx_simd_fmadd_f(pA, x4, CA5);
1131 pB = gmx_simd_fmadd_f(pB, x4, CA3);
1132 pA = gmx_simd_fmadd_f(pA, x2, pB);
1133 pA = gmx_simd_fmadd_f(pA, x3, x);
1135 pA = gmx_simd_blendv_f(pA, gmx_simd_sub_f(halfpi, pA), mask2);
1136 pA = gmx_simd_blendv_f(pA, gmx_simd_fneg_f(pA), mask);
1138 return pA;
1141 /*! \brief SIMD float atan2(y,x).
1143 * You should normally call the real-precision routine \ref gmx_simd_atan2_r.
1145 * \param y Y component of vector, any quartile
1146 * \param x X component of vector, any quartile
1147 * \result Atan(y,x), same argument/value range as standard math library.
1149 * \note This routine should provide correct results for all finite
1150 * non-zero or positive-zero arguments. However, negative zero arguments will
1151 * be treated as positive zero, which means the return value will deviate from
1152 * the standard math library atan2(y,x) for those cases. That should not be
1153 * of any concern in Gromacs, and in particular it will not affect calculations
1154 * of angles from vectors.
1156 static gmx_inline gmx_simd_float_t gmx_simdcall
1157 gmx_simd_atan2_f(gmx_simd_float_t y, gmx_simd_float_t x)
1159 const gmx_simd_float_t pi = gmx_simd_set1_f(M_PI);
1160 const gmx_simd_float_t halfpi = gmx_simd_set1_f(M_PI/2.0);
1161 gmx_simd_float_t xinv, p, aoffset;
1162 gmx_simd_fbool_t mask_x0, mask_y0, mask_xlt0, mask_ylt0;
1164 mask_x0 = gmx_simd_cmpeq_f(x, gmx_simd_setzero_f());
1165 mask_y0 = gmx_simd_cmpeq_f(y, gmx_simd_setzero_f());
1166 mask_xlt0 = gmx_simd_cmplt_f(x, gmx_simd_setzero_f());
1167 mask_ylt0 = gmx_simd_cmplt_f(y, gmx_simd_setzero_f());
1169 aoffset = gmx_simd_blendzero_f(halfpi, mask_x0);
1170 aoffset = gmx_simd_blendnotzero_f(aoffset, mask_y0);
1172 aoffset = gmx_simd_blendv_f(aoffset, pi, mask_xlt0);
1173 aoffset = gmx_simd_blendv_f(aoffset, gmx_simd_fneg_f(aoffset), mask_ylt0);
1175 xinv = gmx_simd_blendnotzero_f(gmx_simd_inv_notmaskfpe_f(x, mask_x0), mask_x0);
1176 p = gmx_simd_mul_f(y, xinv);
1177 p = gmx_simd_atan_f(p);
1178 p = gmx_simd_add_f(p, aoffset);
1180 return p;
1183 /*! \brief Calculate the force correction due to PME analytically in SIMD float.
1185 * You should normally call the real-precision routine \ref gmx_simd_pmecorrF_r.
1187 * \param z2 \f$(r \beta)^2\f$ - see below for details.
1188 * \result Correction factor to coulomb force - see below for details.
1190 * This routine is meant to enable analytical evaluation of the
1191 * direct-space PME electrostatic force to avoid tables.
1193 * The direct-space potential should be \f$ \mbox{erfc}(\beta r)/r\f$, but there
1194 * are some problems evaluating that:
1196 * First, the error function is difficult (read: expensive) to
1197 * approxmiate accurately for intermediate to large arguments, and
1198 * this happens already in ranges of \f$(\beta r)\f$ that occur in simulations.
1199 * Second, we now try to avoid calculating potentials in Gromacs but
1200 * use forces directly.
1202 * We can simply things slight by noting that the PME part is really
1203 * a correction to the normal Coulomb force since \f$\mbox{erfc}(z)=1-\mbox{erf}(z)\f$, i.e.
1204 * \f[
1205 * V = \frac{1}{r} - \frac{\mbox{erf}(\beta r)}{r}
1206 * \f]
1207 * The first term we already have from the inverse square root, so
1208 * that we can leave out of this routine.
1210 * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
1211 * the argument \f$beta r\f$ will be in the range 0.15 to ~4, which is
1212 * the range used for the minimax fit. Use your favorite plotting program
1213 * to realize how well-behaved \f$\frac{\mbox{erf}(z)}{z}\f$ is in this range!
1215 * We approximate \f$f(z)=\mbox{erf}(z)/z\f$ with a rational minimax polynomial.
1216 * However, it turns out it is more efficient to approximate \f$f(z)/z\f$ and
1217 * then only use even powers. This is another minor optimization, since
1218 * we actually \a want \f$f(z)/z\f$, because it is going to be multiplied by
1219 * the vector between the two atoms to get the vectorial force. The
1220 * fastest flops are the ones we can avoid calculating!
1222 * So, here's how it should be used:
1224 * 1. Calculate \f$r^2\f$.
1225 * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=(\beta r)^2\f$.
1226 * 3. Evaluate this routine with \f$z^2\f$ as the argument.
1227 * 4. The return value is the expression:
1229 * \f[
1230 * \frac{2 \exp{-z^2}}{\sqrt{\pi} z^2}-\frac{\mbox{erf}(z)}{z^3}
1231 * \f]
1233 * 5. Multiply the entire expression by \f$\beta^3\f$. This will get you
1235 * \f[
1236 * \frac{2 \beta^3 \exp(-z^2)}{\sqrt{\pi} z^2} - \frac{\beta^3 \mbox{erf}(z)}{z^3}
1237 * \f]
1239 * or, switching back to \f$r\f$ (since \f$z=r \beta\f$):
1241 * \f[
1242 * \frac{2 \beta \exp(-r^2 \beta^2)}{\sqrt{\pi} r^2} - \frac{\mbox{erf}(r \beta)}{r^3}
1243 * \f]
1245 * With a bit of math exercise you should be able to confirm that
1246 * this is exactly
1248 * \f[
1249 * \frac{\frac{d}{dr}\left( \frac{\mbox{erf}(\beta r)}{r} \right)}{r}
1250 * \f]
1252 * 6. Add the result to \f$r^{-3}\f$, multiply by the product of the charges,
1253 * and you have your force (divided by \f$r\f$). A final multiplication
1254 * with the vector connecting the two particles and you have your
1255 * vectorial force to add to the particles.
1257 * This approximation achieves an error slightly lower than 1e-6
1258 * in single precision and 1e-11 in double precision
1259 * for arguments smaller than 16 (\f$\beta r \leq 4 \f$);
1260 * when added to \f$1/r\f$ the error will be insignificant.
1261 * For \f$\beta r \geq 7206\f$ the return value can be inf or NaN.
1264 static gmx_inline gmx_simd_float_t gmx_simdcall
1265 gmx_simd_pmecorrF_f(gmx_simd_float_t z2)
1267 const gmx_simd_float_t FN6 = gmx_simd_set1_f(-1.7357322914161492954e-8f);
1268 const gmx_simd_float_t FN5 = gmx_simd_set1_f(1.4703624142580877519e-6f);
1269 const gmx_simd_float_t FN4 = gmx_simd_set1_f(-0.000053401640219807709149f);
1270 const gmx_simd_float_t FN3 = gmx_simd_set1_f(0.0010054721316683106153f);
1271 const gmx_simd_float_t FN2 = gmx_simd_set1_f(-0.019278317264888380590f);
1272 const gmx_simd_float_t FN1 = gmx_simd_set1_f(0.069670166153766424023f);
1273 const gmx_simd_float_t FN0 = gmx_simd_set1_f(-0.75225204789749321333f);
1275 const gmx_simd_float_t FD4 = gmx_simd_set1_f(0.0011193462567257629232f);
1276 const gmx_simd_float_t FD3 = gmx_simd_set1_f(0.014866955030185295499f);
1277 const gmx_simd_float_t FD2 = gmx_simd_set1_f(0.11583842382862377919f);
1278 const gmx_simd_float_t FD1 = gmx_simd_set1_f(0.50736591960530292870f);
1279 const gmx_simd_float_t FD0 = gmx_simd_set1_f(1.0f);
1281 gmx_simd_float_t z4;
1282 gmx_simd_float_t polyFN0, polyFN1, polyFD0, polyFD1;
1284 z4 = gmx_simd_mul_f(z2, z2);
1286 polyFD0 = gmx_simd_fmadd_f(FD4, z4, FD2);
1287 polyFD1 = gmx_simd_fmadd_f(FD3, z4, FD1);
1288 polyFD0 = gmx_simd_fmadd_f(polyFD0, z4, FD0);
1289 polyFD0 = gmx_simd_fmadd_f(polyFD1, z2, polyFD0);
1291 polyFD0 = gmx_simd_inv_f(polyFD0);
1293 polyFN0 = gmx_simd_fmadd_f(FN6, z4, FN4);
1294 polyFN1 = gmx_simd_fmadd_f(FN5, z4, FN3);
1295 polyFN0 = gmx_simd_fmadd_f(polyFN0, z4, FN2);
1296 polyFN1 = gmx_simd_fmadd_f(polyFN1, z4, FN1);
1297 polyFN0 = gmx_simd_fmadd_f(polyFN0, z4, FN0);
1298 polyFN0 = gmx_simd_fmadd_f(polyFN1, z2, polyFN0);
1300 return gmx_simd_mul_f(polyFN0, polyFD0);
1305 /*! \brief Calculate the potential correction due to PME analytically in SIMD float.
1307 * You should normally call the real-precision routine \ref gmx_simd_pmecorrV_r.
1309 * \param z2 \f$(r \beta)^2\f$ - see below for details.
1310 * \result Correction factor to coulomb potential - see below for details.
1312 * See \ref gmx_simd_pmecorrF_f for details about the approximation.
1314 * This routine calculates \f$\mbox{erf}(z)/z\f$, although you should provide \f$z^2\f$
1315 * as the input argument.
1317 * Here's how it should be used:
1319 * 1. Calculate \f$r^2\f$.
1320 * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=\beta^2*r^2\f$.
1321 * 3. Evaluate this routine with z^2 as the argument.
1322 * 4. The return value is the expression:
1324 * \f[
1325 * \frac{\mbox{erf}(z)}{z}
1326 * \f]
1328 * 5. Multiply the entire expression by beta and switching back to \f$r\f$ (since \f$z=r \beta\f$):
1330 * \f[
1331 * \frac{\mbox{erf}(r \beta)}{r}
1332 * \f]
1334 * 6. Subtract the result from \f$1/r\f$, multiply by the product of the charges,
1335 * and you have your potential.
1337 * This approximation achieves an error slightly lower than 1e-6
1338 * in single precision and 4e-11 in double precision
1339 * for arguments smaller than 16 (\f$ 0.15 \leq \beta r \leq 4 \f$);
1340 * for \f$ \beta r \leq 0.15\f$ the error can be twice as high;
1341 * when added to \f$1/r\f$ the error will be insignificant.
1342 * For \f$\beta r \geq 7142\f$ the return value can be inf or NaN.
1344 static gmx_inline gmx_simd_float_t gmx_simdcall
1345 gmx_simd_pmecorrV_f(gmx_simd_float_t z2)
1347 const gmx_simd_float_t VN6 = gmx_simd_set1_f(1.9296833005951166339e-8f);
1348 const gmx_simd_float_t VN5 = gmx_simd_set1_f(-1.4213390571557850962e-6f);
1349 const gmx_simd_float_t VN4 = gmx_simd_set1_f(0.000041603292906656984871f);
1350 const gmx_simd_float_t VN3 = gmx_simd_set1_f(-0.00013134036773265025626f);
1351 const gmx_simd_float_t VN2 = gmx_simd_set1_f(0.038657983986041781264f);
1352 const gmx_simd_float_t VN1 = gmx_simd_set1_f(0.11285044772717598220f);
1353 const gmx_simd_float_t VN0 = gmx_simd_set1_f(1.1283802385263030286f);
1355 const gmx_simd_float_t VD3 = gmx_simd_set1_f(0.0066752224023576045451f);
1356 const gmx_simd_float_t VD2 = gmx_simd_set1_f(0.078647795836373922256f);
1357 const gmx_simd_float_t VD1 = gmx_simd_set1_f(0.43336185284710920150f);
1358 const gmx_simd_float_t VD0 = gmx_simd_set1_f(1.0f);
1360 gmx_simd_float_t z4;
1361 gmx_simd_float_t polyVN0, polyVN1, polyVD0, polyVD1;
1363 z4 = gmx_simd_mul_f(z2, z2);
1365 polyVD1 = gmx_simd_fmadd_f(VD3, z4, VD1);
1366 polyVD0 = gmx_simd_fmadd_f(VD2, z4, VD0);
1367 polyVD0 = gmx_simd_fmadd_f(polyVD1, z2, polyVD0);
1369 polyVD0 = gmx_simd_inv_f(polyVD0);
1371 polyVN0 = gmx_simd_fmadd_f(VN6, z4, VN4);
1372 polyVN1 = gmx_simd_fmadd_f(VN5, z4, VN3);
1373 polyVN0 = gmx_simd_fmadd_f(polyVN0, z4, VN2);
1374 polyVN1 = gmx_simd_fmadd_f(polyVN1, z4, VN1);
1375 polyVN0 = gmx_simd_fmadd_f(polyVN0, z4, VN0);
1376 polyVN0 = gmx_simd_fmadd_f(polyVN1, z2, polyVN0);
1378 return gmx_simd_mul_f(polyVN0, polyVD0);
1380 #endif
1382 /*! \} */
1384 #ifdef GMX_SIMD_HAVE_DOUBLE
1386 /*! \name Double precision SIMD math functions
1388 * \note In most cases you should use the real-precision functions instead.
1389 * \{
1392 /****************************************
1393 * DOUBLE PRECISION SIMD MATH FUNCTIONS *
1394 ****************************************/
1396 /*! \brief SIMD utility function to sum a+b+c+d for SIMD doubles.
1398 * \copydetails gmx_simd_sum4_f
1400 static gmx_inline gmx_simd_double_t gmx_simdcall
1401 gmx_simd_sum4_d(gmx_simd_double_t a, gmx_simd_double_t b,
1402 gmx_simd_double_t c, gmx_simd_double_t d)
1404 return gmx_simd_add_d(gmx_simd_add_d(a, b), gmx_simd_add_d(c, d));
1407 /*! \brief Return -a if b is negative, SIMD double.
1409 * You should normally call the real-precision routine \ref gmx_simd_xor_sign_r.
1411 * \param a Values to set sign for
1412 * \param b Values used to set sign
1413 * \return if b is negative, the sign of a will be changed.
1415 * This is equivalent to doing an xor operation on a with the sign bit of b,
1416 * with the exception that negative zero is not considered to be negative
1417 * on architectures where \ref GMX_SIMD_HAVE_LOGICAL is not set.
1419 static gmx_inline gmx_simd_double_t gmx_simdcall
1420 gmx_simd_xor_sign_d(gmx_simd_double_t a, gmx_simd_double_t b)
1422 #ifdef GMX_SIMD_HAVE_LOGICAL
1423 return gmx_simd_xor_d(a, gmx_simd_and_d(gmx_simd_set1_d(GMX_DOUBLE_NEGZERO), b));
1424 #else
1425 return gmx_simd_blendv_d(a, gmx_simd_fneg_d(a), gmx_simd_cmplt_d(b, gmx_simd_setzero_d()));
1426 #endif
1429 #ifndef gmx_simd_rsqrt_iter_d
1430 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD double.
1432 * \copydetails gmx_simd_rsqrt_iter_f
1434 static gmx_inline gmx_simd_double_t gmx_simdcall
1435 gmx_simd_rsqrt_iter_d(gmx_simd_double_t lu, gmx_simd_double_t x)
1437 #ifdef GMX_SIMD_HAVE_FMA
1438 return gmx_simd_fmadd_d(gmx_simd_fnmadd_d(x, gmx_simd_mul_d(lu, lu), gmx_simd_set1_d(1.0)), gmx_simd_mul_d(lu, gmx_simd_set1_d(0.5)), lu);
1439 #else
1440 return gmx_simd_mul_d(gmx_simd_set1_d(0.5), gmx_simd_mul_d(gmx_simd_sub_d(gmx_simd_set1_d(3.0), gmx_simd_mul_d(gmx_simd_mul_d(lu, lu), x)), lu));
1441 #endif
1443 #endif
1445 /*! \brief Calculate 1/sqrt(x) for SIMD double
1447 * \copydetails gmx_simd_invsqrt_f
1449 static gmx_inline gmx_simd_double_t gmx_simdcall
1450 gmx_simd_invsqrt_d(gmx_simd_double_t x)
1452 gmx_simd_double_t lu = gmx_simd_rsqrt_d(x);
1453 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1454 lu = gmx_simd_rsqrt_iter_d(lu, x);
1455 #endif
1456 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1457 lu = gmx_simd_rsqrt_iter_d(lu, x);
1458 #endif
1459 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1460 lu = gmx_simd_rsqrt_iter_d(lu, x);
1461 #endif
1462 #if (GMX_SIMD_RSQRT_BITS*8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1463 lu = gmx_simd_rsqrt_iter_d(lu, x);
1464 #endif
1465 return lu;
1468 /*! \brief Calculate 1/sqrt(x) for masked entries of SIMD double.
1470 * \copydetails gmx_simd_invsqrt_maskfpe_f
1472 static gmx_inline gmx_simd_double_t
1473 gmx_simd_invsqrt_maskfpe_d(gmx_simd_double_t x, gmx_simd_dbool_t gmx_unused m)
1475 #ifdef NDEBUG
1476 return gmx_simd_invsqrt_d(x);
1477 #else
1478 return gmx_simd_invsqrt_d(gmx_simd_blendv_d(gmx_simd_set1_d(1.0), x, m));
1479 #endif
1482 /*! \brief Calculate 1/sqrt(x) for non-masked entries of SIMD double.
1484 * \copydetails gmx_simd_invsqrt_notmaskfpe_f
1486 static gmx_inline gmx_simd_double_t
1487 gmx_simd_invsqrt_notmaskfpe_d(gmx_simd_double_t x, gmx_simd_dbool_t gmx_unused m)
1489 #ifdef NDEBUG
1490 return gmx_simd_invsqrt_d(x);
1491 #else
1492 return gmx_simd_invsqrt_d(gmx_simd_blendv_d(x, gmx_simd_set1_d(1.0), m));
1493 #endif
1496 /*! \brief Calculate 1/sqrt(x) for two SIMD doubles.
1498 * \copydetails gmx_simd_invsqrt_pair_f
1500 static gmx_inline void gmx_simdcall
1501 gmx_simd_invsqrt_pair_d(gmx_simd_double_t x0, gmx_simd_double_t x1,
1502 gmx_simd_double_t *out0, gmx_simd_double_t *out1)
1504 #if (defined GMX_SIMD_HAVE_FLOAT) && (GMX_SIMD_FLOAT_WIDTH == 2*GMX_SIMD_DOUBLE_WIDTH) && (GMX_SIMD_RSQRT_BITS < 22)
1505 gmx_simd_float_t xf = gmx_simd_cvt_dd2f(x0, x1);
1506 gmx_simd_float_t luf = gmx_simd_rsqrt_f(xf);
1507 gmx_simd_double_t lu0, lu1;
1508 /* Intermediate target is single - mantissa+1 bits */
1509 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
1510 luf = gmx_simd_rsqrt_iter_f(luf, xf);
1511 #endif
1512 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
1513 luf = gmx_simd_rsqrt_iter_f(luf, xf);
1514 #endif
1515 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
1516 luf = gmx_simd_rsqrt_iter_f(luf, xf);
1517 #endif
1518 gmx_simd_cvt_f2dd(luf, &lu0, &lu1);
1519 /* Last iteration(s) performed in double - if we had 22 bits, this gets us to 44 (~1e-15) */
1520 #if (GMX_SIMD_ACCURACY_BITS_SINGLE < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1521 lu0 = gmx_simd_rsqrt_iter_d(lu0, x0);
1522 lu1 = gmx_simd_rsqrt_iter_d(lu1, x1);
1523 #endif
1524 #if (GMX_SIMD_ACCURACY_BITS_SINGLE*2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1525 lu0 = gmx_simd_rsqrt_iter_d(lu0, x0);
1526 lu1 = gmx_simd_rsqrt_iter_d(lu1, x1);
1527 #endif
1528 *out0 = lu0;
1529 *out1 = lu1;
1530 #else
1531 *out0 = gmx_simd_invsqrt_d(x0);
1532 *out1 = gmx_simd_invsqrt_d(x1);
1533 #endif
1536 #ifndef gmx_simd_rcp_iter_d
1537 /*! \brief Perform one Newton-Raphson iteration to improve 1/x for SIMD double.
1539 * \copydetails gmx_simd_rcp_iter_f
1541 static gmx_inline gmx_simd_double_t gmx_simdcall
1542 gmx_simd_rcp_iter_d(gmx_simd_double_t lu, gmx_simd_double_t x)
1544 return gmx_simd_mul_d(lu, gmx_simd_fnmadd_d(lu, x, gmx_simd_set1_d(2.0)));
1546 #endif
1548 /*! \brief Calculate 1/x for SIMD double.
1550 * \copydetails gmx_simd_inv_f
1552 static gmx_inline gmx_simd_double_t gmx_simdcall
1553 gmx_simd_inv_d(gmx_simd_double_t x)
1555 gmx_simd_double_t lu = gmx_simd_rcp_d(x);
1556 #if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1557 lu = gmx_simd_rcp_iter_d(lu, x);
1558 #endif
1559 #if (GMX_SIMD_RCP_BITS*2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1560 lu = gmx_simd_rcp_iter_d(lu, x);
1561 #endif
1562 #if (GMX_SIMD_RCP_BITS*4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1563 lu = gmx_simd_rcp_iter_d(lu, x);
1564 #endif
1565 #if (GMX_SIMD_RCP_BITS*8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1566 lu = gmx_simd_rcp_iter_d(lu, x);
1567 #endif
1568 return lu;
1571 /*! \brief Calculate 1/x for masked entries of SIMD double.
1573 * \copydetails gmx_simd_inv_maskfpe_f
1575 static gmx_inline gmx_simd_double_t
1576 gmx_simd_inv_maskfpe_d(gmx_simd_double_t x, gmx_simd_dbool_t gmx_unused m)
1578 #ifdef NDEBUG
1579 return gmx_simd_inv_d(x);
1580 #else
1581 return gmx_simd_inv_d(gmx_simd_blendv_d(gmx_simd_set1_d(1.0), x, m));
1582 #endif
1585 /*! \brief Calculate 1/x for non-masked entries of SIMD double.
1587 * \copydetails gmx_simd_inv_notmaskfpe_f
1589 static gmx_inline gmx_simd_double_t
1590 gmx_simd_inv_notmaskfpe_d(gmx_simd_double_t x, gmx_simd_dbool_t gmx_unused m)
1592 #ifdef NDEBUG
1593 return gmx_simd_inv_d(x);
1594 #else
1595 return gmx_simd_inv_d(gmx_simd_blendv_d(x, gmx_simd_set1_d(1.0), m));
1596 #endif
1599 /*! \brief Calculate sqrt(x) correctly for SIMD doubles, including argument 0.0.
1601 * \copydetails gmx_simd_sqrt_f
1603 static gmx_inline gmx_simd_double_t gmx_simdcall
1604 gmx_simd_sqrt_d(gmx_simd_double_t x)
1606 gmx_simd_dbool_t mask;
1607 gmx_simd_double_t res;
1609 mask = gmx_simd_cmpeq_d(x, gmx_simd_setzero_d());
1610 res = gmx_simd_blendnotzero_d(gmx_simd_invsqrt_notmaskfpe_d(x, mask), mask);
1611 return gmx_simd_mul_d(res, x);
1614 /*! \brief SIMD double log(x). This is the natural logarithm.
1616 * \copydetails gmx_simd_log_f
1618 static gmx_inline gmx_simd_double_t gmx_simdcall
1619 gmx_simd_log_d(gmx_simd_double_t x)
1621 const gmx_simd_double_t half = gmx_simd_set1_d(0.5);
1622 const gmx_simd_double_t one = gmx_simd_set1_d(1.0);
1623 const gmx_simd_double_t sqrt2 = gmx_simd_set1_d(sqrt(2.0));
1624 const gmx_simd_double_t corr = gmx_simd_set1_d(0.693147180559945286226764);
1625 const gmx_simd_double_t CL15 = gmx_simd_set1_d(0.148197055177935105296783);
1626 const gmx_simd_double_t CL13 = gmx_simd_set1_d(0.153108178020442575739679);
1627 const gmx_simd_double_t CL11 = gmx_simd_set1_d(0.181837339521549679055568);
1628 const gmx_simd_double_t CL9 = gmx_simd_set1_d(0.22222194152736701733275);
1629 const gmx_simd_double_t CL7 = gmx_simd_set1_d(0.285714288030134544449368);
1630 const gmx_simd_double_t CL5 = gmx_simd_set1_d(0.399999999989941956712869);
1631 const gmx_simd_double_t CL3 = gmx_simd_set1_d(0.666666666666685503450651);
1632 const gmx_simd_double_t CL1 = gmx_simd_set1_d(2.0);
1633 gmx_simd_double_t fexp, x2, p;
1634 gmx_simd_dbool_t mask;
1636 fexp = gmx_simd_get_exponent_d(x);
1637 x = gmx_simd_get_mantissa_d(x);
1639 mask = gmx_simd_cmplt_d(sqrt2, x);
1640 /* Adjust to non-IEEE format for x>sqrt(2): exponent += 1, mantissa *= 0.5 */
1641 fexp = gmx_simd_add_d(fexp, gmx_simd_blendzero_d(one, mask));
1642 x = gmx_simd_mul_d(x, gmx_simd_blendv_d(one, half, mask));
1644 x = gmx_simd_mul_d( gmx_simd_sub_d(x, one), gmx_simd_inv_d( gmx_simd_add_d(x, one) ) );
1645 x2 = gmx_simd_mul_d(x, x);
1647 p = gmx_simd_fmadd_d(CL15, x2, CL13);
1648 p = gmx_simd_fmadd_d(p, x2, CL11);
1649 p = gmx_simd_fmadd_d(p, x2, CL9);
1650 p = gmx_simd_fmadd_d(p, x2, CL7);
1651 p = gmx_simd_fmadd_d(p, x2, CL5);
1652 p = gmx_simd_fmadd_d(p, x2, CL3);
1653 p = gmx_simd_fmadd_d(p, x2, CL1);
1654 p = gmx_simd_fmadd_d(p, x, gmx_simd_mul_d(corr, fexp));
1656 return p;
1659 /*! \brief SIMD double 2^x.
1661 * \copydetails gmx_simd_exp2_f
1663 static gmx_inline gmx_simd_double_t gmx_simdcall
1664 gmx_simd_exp2_d(gmx_simd_double_t x)
1666 const gmx_simd_double_t arglimit = gmx_simd_set1_d(1022.0);
1667 const gmx_simd_double_t CE11 = gmx_simd_set1_d(4.435280790452730022081181e-10);
1668 const gmx_simd_double_t CE10 = gmx_simd_set1_d(7.074105630863314448024247e-09);
1669 const gmx_simd_double_t CE9 = gmx_simd_set1_d(1.017819803432096698472621e-07);
1670 const gmx_simd_double_t CE8 = gmx_simd_set1_d(1.321543308956718799557863e-06);
1671 const gmx_simd_double_t CE7 = gmx_simd_set1_d(0.00001525273348995851746990884);
1672 const gmx_simd_double_t CE6 = gmx_simd_set1_d(0.0001540353046251466849082632);
1673 const gmx_simd_double_t CE5 = gmx_simd_set1_d(0.001333355814678995257307880);
1674 const gmx_simd_double_t CE4 = gmx_simd_set1_d(0.009618129107588335039176502);
1675 const gmx_simd_double_t CE3 = gmx_simd_set1_d(0.05550410866481992147457793);
1676 const gmx_simd_double_t CE2 = gmx_simd_set1_d(0.2402265069591015620470894);
1677 const gmx_simd_double_t CE1 = gmx_simd_set1_d(0.6931471805599453304615075);
1678 const gmx_simd_double_t one = gmx_simd_set1_d(1.0);
1679 gmx_simd_double_t fexppart;
1680 gmx_simd_double_t intpart;
1681 gmx_simd_double_t p;
1682 gmx_simd_dbool_t valuemask;
1684 fexppart = gmx_simd_set_exponent_d(x); /* rounds to nearest int internally */
1685 intpart = gmx_simd_round_d(x); /* use same rounding mode here */
1686 valuemask = gmx_simd_cmple_d(gmx_simd_fabs_d(x), arglimit);
1687 fexppart = gmx_simd_blendzero_d(fexppart, valuemask);
1688 x = gmx_simd_sub_d(x, intpart);
1690 p = gmx_simd_fmadd_d(CE11, x, CE10);
1691 p = gmx_simd_fmadd_d(p, x, CE9);
1692 p = gmx_simd_fmadd_d(p, x, CE8);
1693 p = gmx_simd_fmadd_d(p, x, CE7);
1694 p = gmx_simd_fmadd_d(p, x, CE6);
1695 p = gmx_simd_fmadd_d(p, x, CE5);
1696 p = gmx_simd_fmadd_d(p, x, CE4);
1697 p = gmx_simd_fmadd_d(p, x, CE3);
1698 p = gmx_simd_fmadd_d(p, x, CE2);
1699 p = gmx_simd_fmadd_d(p, x, CE1);
1700 p = gmx_simd_fmadd_d(p, x, one);
1701 x = gmx_simd_mul_d(p, fexppart);
1702 return x;
1705 /*! \brief SIMD double exp(x).
1707 * \copydetails gmx_simd_exp_f
1709 static gmx_inline gmx_simd_double_t gmx_simdcall
1710 gmx_simd_exp_d(gmx_simd_double_t x)
1712 const gmx_simd_double_t argscale = gmx_simd_set1_d(1.44269504088896340735992468100);
1713 const gmx_simd_double_t arglimit = gmx_simd_set1_d(1022.0);
1714 const gmx_simd_double_t invargscale0 = gmx_simd_set1_d(-0.69314718055966295651160180568695068359375);
1715 const gmx_simd_double_t invargscale1 = gmx_simd_set1_d(-2.8235290563031577122588448175013436025525412068e-13);
1716 const gmx_simd_double_t CE12 = gmx_simd_set1_d(2.078375306791423699350304e-09);
1717 const gmx_simd_double_t CE11 = gmx_simd_set1_d(2.518173854179933105218635e-08);
1718 const gmx_simd_double_t CE10 = gmx_simd_set1_d(2.755842049600488770111608e-07);
1719 const gmx_simd_double_t CE9 = gmx_simd_set1_d(2.755691815216689746619849e-06);
1720 const gmx_simd_double_t CE8 = gmx_simd_set1_d(2.480158383706245033920920e-05);
1721 const gmx_simd_double_t CE7 = gmx_simd_set1_d(0.0001984127043518048611841321);
1722 const gmx_simd_double_t CE6 = gmx_simd_set1_d(0.001388888889360258341755930);
1723 const gmx_simd_double_t CE5 = gmx_simd_set1_d(0.008333333332907368102819109);
1724 const gmx_simd_double_t CE4 = gmx_simd_set1_d(0.04166666666663836745814631);
1725 const gmx_simd_double_t CE3 = gmx_simd_set1_d(0.1666666666666796929434570);
1726 const gmx_simd_double_t CE2 = gmx_simd_set1_d(0.5);
1727 const gmx_simd_double_t one = gmx_simd_set1_d(1.0);
1728 gmx_simd_double_t fexppart;
1729 gmx_simd_double_t intpart;
1730 gmx_simd_double_t y, p;
1731 gmx_simd_dbool_t valuemask;
1733 y = gmx_simd_mul_d(x, argscale);
1734 fexppart = gmx_simd_set_exponent_d(y); /* rounds to nearest int internally */
1735 intpart = gmx_simd_round_d(y); /* use same rounding mode here */
1736 valuemask = gmx_simd_cmple_d(gmx_simd_fabs_d(y), arglimit);
1737 fexppart = gmx_simd_blendzero_d(fexppart, valuemask);
1739 /* Extended precision arithmetics */
1740 x = gmx_simd_fmadd_d(invargscale0, intpart, x);
1741 x = gmx_simd_fmadd_d(invargscale1, intpart, x);
1743 p = gmx_simd_fmadd_d(CE12, x, CE11);
1744 p = gmx_simd_fmadd_d(p, x, CE10);
1745 p = gmx_simd_fmadd_d(p, x, CE9);
1746 p = gmx_simd_fmadd_d(p, x, CE8);
1747 p = gmx_simd_fmadd_d(p, x, CE7);
1748 p = gmx_simd_fmadd_d(p, x, CE6);
1749 p = gmx_simd_fmadd_d(p, x, CE5);
1750 p = gmx_simd_fmadd_d(p, x, CE4);
1751 p = gmx_simd_fmadd_d(p, x, CE3);
1752 p = gmx_simd_fmadd_d(p, x, CE2);
1753 p = gmx_simd_fmadd_d(p, gmx_simd_mul_d(x, x), gmx_simd_add_d(x, one));
1754 x = gmx_simd_mul_d(p, fexppart);
1755 return x;
1758 /*! \brief SIMD double erf(x).
1760 * \copydetails gmx_simd_erf_f
1762 static gmx_inline gmx_simd_double_t gmx_simdcall
1763 gmx_simd_erf_d(gmx_simd_double_t x)
1765 /* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */
1766 const gmx_simd_double_t CAP4 = gmx_simd_set1_d(-0.431780540597889301512e-4);
1767 const gmx_simd_double_t CAP3 = gmx_simd_set1_d(-0.00578562306260059236059);
1768 const gmx_simd_double_t CAP2 = gmx_simd_set1_d(-0.028593586920219752446);
1769 const gmx_simd_double_t CAP1 = gmx_simd_set1_d(-0.315924962948621698209);
1770 const gmx_simd_double_t CAP0 = gmx_simd_set1_d(0.14952975608477029151);
1772 const gmx_simd_double_t CAQ5 = gmx_simd_set1_d(-0.374089300177174709737e-5);
1773 const gmx_simd_double_t CAQ4 = gmx_simd_set1_d(0.00015126584532155383535);
1774 const gmx_simd_double_t CAQ3 = gmx_simd_set1_d(0.00536692680669480725423);
1775 const gmx_simd_double_t CAQ2 = gmx_simd_set1_d(0.0668686825594046122636);
1776 const gmx_simd_double_t CAQ1 = gmx_simd_set1_d(0.402604990869284362773);
1777 /* CAQ0 == 1.0 */
1778 const gmx_simd_double_t CAoffset = gmx_simd_set1_d(0.9788494110107421875);
1780 /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5] */
1781 const gmx_simd_double_t CBP6 = gmx_simd_set1_d(2.49650423685462752497647637088e-10);
1782 const gmx_simd_double_t CBP5 = gmx_simd_set1_d(0.00119770193298159629350136085658);
1783 const gmx_simd_double_t CBP4 = gmx_simd_set1_d(0.0164944422378370965881008942733);
1784 const gmx_simd_double_t CBP3 = gmx_simd_set1_d(0.0984581468691775932063932439252);
1785 const gmx_simd_double_t CBP2 = gmx_simd_set1_d(0.317364595806937763843589437418);
1786 const gmx_simd_double_t CBP1 = gmx_simd_set1_d(0.554167062641455850932670067075);
1787 const gmx_simd_double_t CBP0 = gmx_simd_set1_d(0.427583576155807163756925301060);
1788 const gmx_simd_double_t CBQ7 = gmx_simd_set1_d(0.00212288829699830145976198384930);
1789 const gmx_simd_double_t CBQ6 = gmx_simd_set1_d(0.0334810979522685300554606393425);
1790 const gmx_simd_double_t CBQ5 = gmx_simd_set1_d(0.2361713785181450957579508850717);
1791 const gmx_simd_double_t CBQ4 = gmx_simd_set1_d(0.955364736493055670530981883072);
1792 const gmx_simd_double_t CBQ3 = gmx_simd_set1_d(2.36815675631420037315349279199);
1793 const gmx_simd_double_t CBQ2 = gmx_simd_set1_d(3.55261649184083035537184223542);
1794 const gmx_simd_double_t CBQ1 = gmx_simd_set1_d(2.93501136050160872574376997993);
1795 /* CBQ0 == 1.0 */
1797 /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf] */
1798 const gmx_simd_double_t CCP6 = gmx_simd_set1_d(-2.8175401114513378771);
1799 const gmx_simd_double_t CCP5 = gmx_simd_set1_d(-3.22729451764143718517);
1800 const gmx_simd_double_t CCP4 = gmx_simd_set1_d(-2.5518551727311523996);
1801 const gmx_simd_double_t CCP3 = gmx_simd_set1_d(-0.687717681153649930619);
1802 const gmx_simd_double_t CCP2 = gmx_simd_set1_d(-0.212652252872804219852);
1803 const gmx_simd_double_t CCP1 = gmx_simd_set1_d(0.0175389834052493308818);
1804 const gmx_simd_double_t CCP0 = gmx_simd_set1_d(0.00628057170626964891937);
1806 const gmx_simd_double_t CCQ6 = gmx_simd_set1_d(5.48409182238641741584);
1807 const gmx_simd_double_t CCQ5 = gmx_simd_set1_d(13.5064170191802889145);
1808 const gmx_simd_double_t CCQ4 = gmx_simd_set1_d(22.9367376522880577224);
1809 const gmx_simd_double_t CCQ3 = gmx_simd_set1_d(15.930646027911794143);
1810 const gmx_simd_double_t CCQ2 = gmx_simd_set1_d(11.0567237927800161565);
1811 const gmx_simd_double_t CCQ1 = gmx_simd_set1_d(2.79257750980575282228);
1812 /* CCQ0 == 1.0 */
1813 const gmx_simd_double_t CCoffset = gmx_simd_set1_d(0.5579090118408203125);
1815 const gmx_simd_double_t one = gmx_simd_set1_d(1.0);
1816 const gmx_simd_double_t two = gmx_simd_set1_d(2.0);
1818 gmx_simd_double_t xabs, x2, x4, t, t2, w, w2;
1819 gmx_simd_double_t PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
1820 gmx_simd_double_t PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
1821 gmx_simd_double_t PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
1822 gmx_simd_double_t res_erf, res_erfcB, res_erfcC, res_erfc, res;
1823 gmx_simd_double_t expmx2;
1824 gmx_simd_dbool_t mask, mask_erf;
1826 /* Calculate erf() */
1827 xabs = gmx_simd_fabs_d(x);
1828 mask_erf = gmx_simd_cmplt_d(xabs, one);
1829 x2 = gmx_simd_mul_d(x, x);
1830 x4 = gmx_simd_mul_d(x2, x2);
1832 PolyAP0 = gmx_simd_mul_d(CAP4, x4);
1833 PolyAP1 = gmx_simd_mul_d(CAP3, x4);
1834 PolyAP0 = gmx_simd_add_d(PolyAP0, CAP2);
1835 PolyAP1 = gmx_simd_add_d(PolyAP1, CAP1);
1836 PolyAP0 = gmx_simd_mul_d(PolyAP0, x4);
1837 PolyAP1 = gmx_simd_mul_d(PolyAP1, x2);
1838 PolyAP0 = gmx_simd_add_d(PolyAP0, CAP0);
1839 PolyAP0 = gmx_simd_add_d(PolyAP0, PolyAP1);
1841 PolyAQ1 = gmx_simd_mul_d(CAQ5, x4);
1842 PolyAQ0 = gmx_simd_mul_d(CAQ4, x4);
1843 PolyAQ1 = gmx_simd_add_d(PolyAQ1, CAQ3);
1844 PolyAQ0 = gmx_simd_add_d(PolyAQ0, CAQ2);
1845 PolyAQ1 = gmx_simd_mul_d(PolyAQ1, x4);
1846 PolyAQ0 = gmx_simd_mul_d(PolyAQ0, x4);
1847 PolyAQ1 = gmx_simd_add_d(PolyAQ1, CAQ1);
1848 PolyAQ0 = gmx_simd_add_d(PolyAQ0, one);
1849 PolyAQ1 = gmx_simd_mul_d(PolyAQ1, x2);
1850 PolyAQ0 = gmx_simd_add_d(PolyAQ0, PolyAQ1);
1852 res_erf = gmx_simd_mul_d(PolyAP0, gmx_simd_inv_maskfpe_d(PolyAQ0, mask_erf));
1853 res_erf = gmx_simd_add_d(CAoffset, res_erf);
1854 res_erf = gmx_simd_mul_d(x, res_erf);
1856 /* Calculate erfc() in range [1,4.5] */
1857 t = gmx_simd_sub_d(xabs, one);
1858 t2 = gmx_simd_mul_d(t, t);
1860 PolyBP0 = gmx_simd_mul_d(CBP6, t2);
1861 PolyBP1 = gmx_simd_mul_d(CBP5, t2);
1862 PolyBP0 = gmx_simd_add_d(PolyBP0, CBP4);
1863 PolyBP1 = gmx_simd_add_d(PolyBP1, CBP3);
1864 PolyBP0 = gmx_simd_mul_d(PolyBP0, t2);
1865 PolyBP1 = gmx_simd_mul_d(PolyBP1, t2);
1866 PolyBP0 = gmx_simd_add_d(PolyBP0, CBP2);
1867 PolyBP1 = gmx_simd_add_d(PolyBP1, CBP1);
1868 PolyBP0 = gmx_simd_mul_d(PolyBP0, t2);
1869 PolyBP1 = gmx_simd_mul_d(PolyBP1, t);
1870 PolyBP0 = gmx_simd_add_d(PolyBP0, CBP0);
1871 PolyBP0 = gmx_simd_add_d(PolyBP0, PolyBP1);
1873 PolyBQ1 = gmx_simd_mul_d(CBQ7, t2);
1874 PolyBQ0 = gmx_simd_mul_d(CBQ6, t2);
1875 PolyBQ1 = gmx_simd_add_d(PolyBQ1, CBQ5);
1876 PolyBQ0 = gmx_simd_add_d(PolyBQ0, CBQ4);
1877 PolyBQ1 = gmx_simd_mul_d(PolyBQ1, t2);
1878 PolyBQ0 = gmx_simd_mul_d(PolyBQ0, t2);
1879 PolyBQ1 = gmx_simd_add_d(PolyBQ1, CBQ3);
1880 PolyBQ0 = gmx_simd_add_d(PolyBQ0, CBQ2);
1881 PolyBQ1 = gmx_simd_mul_d(PolyBQ1, t2);
1882 PolyBQ0 = gmx_simd_mul_d(PolyBQ0, t2);
1883 PolyBQ1 = gmx_simd_add_d(PolyBQ1, CBQ1);
1884 PolyBQ0 = gmx_simd_add_d(PolyBQ0, one);
1885 PolyBQ1 = gmx_simd_mul_d(PolyBQ1, t);
1886 PolyBQ0 = gmx_simd_add_d(PolyBQ0, PolyBQ1);
1888 res_erfcB = gmx_simd_mul_d(PolyBP0, gmx_simd_inv_notmaskfpe_d(PolyBQ0, mask_erf));
1890 res_erfcB = gmx_simd_mul_d(res_erfcB, xabs);
1892 /* Calculate erfc() in range [4.5,inf] */
1893 w = gmx_simd_inv_notmaskfpe_d(xabs, mask_erf);
1894 w2 = gmx_simd_mul_d(w, w);
1896 PolyCP0 = gmx_simd_mul_d(CCP6, w2);
1897 PolyCP1 = gmx_simd_mul_d(CCP5, w2);
1898 PolyCP0 = gmx_simd_add_d(PolyCP0, CCP4);
1899 PolyCP1 = gmx_simd_add_d(PolyCP1, CCP3);
1900 PolyCP0 = gmx_simd_mul_d(PolyCP0, w2);
1901 PolyCP1 = gmx_simd_mul_d(PolyCP1, w2);
1902 PolyCP0 = gmx_simd_add_d(PolyCP0, CCP2);
1903 PolyCP1 = gmx_simd_add_d(PolyCP1, CCP1);
1904 PolyCP0 = gmx_simd_mul_d(PolyCP0, w2);
1905 PolyCP1 = gmx_simd_mul_d(PolyCP1, w);
1906 PolyCP0 = gmx_simd_add_d(PolyCP0, CCP0);
1907 PolyCP0 = gmx_simd_add_d(PolyCP0, PolyCP1);
1909 PolyCQ0 = gmx_simd_mul_d(CCQ6, w2);
1910 PolyCQ1 = gmx_simd_mul_d(CCQ5, w2);
1911 PolyCQ0 = gmx_simd_add_d(PolyCQ0, CCQ4);
1912 PolyCQ1 = gmx_simd_add_d(PolyCQ1, CCQ3);
1913 PolyCQ0 = gmx_simd_mul_d(PolyCQ0, w2);
1914 PolyCQ1 = gmx_simd_mul_d(PolyCQ1, w2);
1915 PolyCQ0 = gmx_simd_add_d(PolyCQ0, CCQ2);
1916 PolyCQ1 = gmx_simd_add_d(PolyCQ1, CCQ1);
1917 PolyCQ0 = gmx_simd_mul_d(PolyCQ0, w2);
1918 PolyCQ1 = gmx_simd_mul_d(PolyCQ1, w);
1919 PolyCQ0 = gmx_simd_add_d(PolyCQ0, one);
1920 PolyCQ0 = gmx_simd_add_d(PolyCQ0, PolyCQ1);
1922 expmx2 = gmx_simd_exp_d( gmx_simd_fneg_d(x2) );
1924 res_erfcC = gmx_simd_mul_d(PolyCP0, gmx_simd_inv_notmaskfpe_d(PolyCQ0, mask_erf));
1925 res_erfcC = gmx_simd_add_d(res_erfcC, CCoffset);
1926 res_erfcC = gmx_simd_mul_d(res_erfcC, w);
1928 mask = gmx_simd_cmplt_d(gmx_simd_set1_d(4.5), xabs);
1929 res_erfc = gmx_simd_blendv_d(res_erfcB, res_erfcC, mask);
1931 res_erfc = gmx_simd_mul_d(res_erfc, expmx2);
1933 /* erfc(x<0) = 2-erfc(|x|) */
1934 mask = gmx_simd_cmplt_d(x, gmx_simd_setzero_d());
1935 res_erfc = gmx_simd_blendv_d(res_erfc, gmx_simd_sub_d(two, res_erfc), mask);
1937 /* Select erf() or erfc() */
1938 res = gmx_simd_blendv_d(gmx_simd_sub_d(one, res_erfc), res_erf, mask_erf);
1940 return res;
1943 /*! \brief SIMD double erfc(x).
1945 * \copydetails gmx_simd_erfc_f
1947 static gmx_inline gmx_simd_double_t gmx_simdcall
1948 gmx_simd_erfc_d(gmx_simd_double_t x)
1950 /* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */
1951 const gmx_simd_double_t CAP4 = gmx_simd_set1_d(-0.431780540597889301512e-4);
1952 const gmx_simd_double_t CAP3 = gmx_simd_set1_d(-0.00578562306260059236059);
1953 const gmx_simd_double_t CAP2 = gmx_simd_set1_d(-0.028593586920219752446);
1954 const gmx_simd_double_t CAP1 = gmx_simd_set1_d(-0.315924962948621698209);
1955 const gmx_simd_double_t CAP0 = gmx_simd_set1_d(0.14952975608477029151);
1957 const gmx_simd_double_t CAQ5 = gmx_simd_set1_d(-0.374089300177174709737e-5);
1958 const gmx_simd_double_t CAQ4 = gmx_simd_set1_d(0.00015126584532155383535);
1959 const gmx_simd_double_t CAQ3 = gmx_simd_set1_d(0.00536692680669480725423);
1960 const gmx_simd_double_t CAQ2 = gmx_simd_set1_d(0.0668686825594046122636);
1961 const gmx_simd_double_t CAQ1 = gmx_simd_set1_d(0.402604990869284362773);
1962 /* CAQ0 == 1.0 */
1963 const gmx_simd_double_t CAoffset = gmx_simd_set1_d(0.9788494110107421875);
1965 /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5] */
1966 const gmx_simd_double_t CBP6 = gmx_simd_set1_d(2.49650423685462752497647637088e-10);
1967 const gmx_simd_double_t CBP5 = gmx_simd_set1_d(0.00119770193298159629350136085658);
1968 const gmx_simd_double_t CBP4 = gmx_simd_set1_d(0.0164944422378370965881008942733);
1969 const gmx_simd_double_t CBP3 = gmx_simd_set1_d(0.0984581468691775932063932439252);
1970 const gmx_simd_double_t CBP2 = gmx_simd_set1_d(0.317364595806937763843589437418);
1971 const gmx_simd_double_t CBP1 = gmx_simd_set1_d(0.554167062641455850932670067075);
1972 const gmx_simd_double_t CBP0 = gmx_simd_set1_d(0.427583576155807163756925301060);
1973 const gmx_simd_double_t CBQ7 = gmx_simd_set1_d(0.00212288829699830145976198384930);
1974 const gmx_simd_double_t CBQ6 = gmx_simd_set1_d(0.0334810979522685300554606393425);
1975 const gmx_simd_double_t CBQ5 = gmx_simd_set1_d(0.2361713785181450957579508850717);
1976 const gmx_simd_double_t CBQ4 = gmx_simd_set1_d(0.955364736493055670530981883072);
1977 const gmx_simd_double_t CBQ3 = gmx_simd_set1_d(2.36815675631420037315349279199);
1978 const gmx_simd_double_t CBQ2 = gmx_simd_set1_d(3.55261649184083035537184223542);
1979 const gmx_simd_double_t CBQ1 = gmx_simd_set1_d(2.93501136050160872574376997993);
1980 /* CBQ0 == 1.0 */
1982 /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf] */
1983 const gmx_simd_double_t CCP6 = gmx_simd_set1_d(-2.8175401114513378771);
1984 const gmx_simd_double_t CCP5 = gmx_simd_set1_d(-3.22729451764143718517);
1985 const gmx_simd_double_t CCP4 = gmx_simd_set1_d(-2.5518551727311523996);
1986 const gmx_simd_double_t CCP3 = gmx_simd_set1_d(-0.687717681153649930619);
1987 const gmx_simd_double_t CCP2 = gmx_simd_set1_d(-0.212652252872804219852);
1988 const gmx_simd_double_t CCP1 = gmx_simd_set1_d(0.0175389834052493308818);
1989 const gmx_simd_double_t CCP0 = gmx_simd_set1_d(0.00628057170626964891937);
1991 const gmx_simd_double_t CCQ6 = gmx_simd_set1_d(5.48409182238641741584);
1992 const gmx_simd_double_t CCQ5 = gmx_simd_set1_d(13.5064170191802889145);
1993 const gmx_simd_double_t CCQ4 = gmx_simd_set1_d(22.9367376522880577224);
1994 const gmx_simd_double_t CCQ3 = gmx_simd_set1_d(15.930646027911794143);
1995 const gmx_simd_double_t CCQ2 = gmx_simd_set1_d(11.0567237927800161565);
1996 const gmx_simd_double_t CCQ1 = gmx_simd_set1_d(2.79257750980575282228);
1997 /* CCQ0 == 1.0 */
1998 const gmx_simd_double_t CCoffset = gmx_simd_set1_d(0.5579090118408203125);
2000 const gmx_simd_double_t one = gmx_simd_set1_d(1.0);
2001 const gmx_simd_double_t two = gmx_simd_set1_d(2.0);
2003 gmx_simd_double_t xabs, x2, x4, t, t2, w, w2;
2004 gmx_simd_double_t PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
2005 gmx_simd_double_t PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
2006 gmx_simd_double_t PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
2007 gmx_simd_double_t res_erf, res_erfcB, res_erfcC, res_erfc, res;
2008 gmx_simd_double_t expmx2;
2009 gmx_simd_dbool_t mask, mask_erf;
2011 /* Calculate erf() */
2012 xabs = gmx_simd_fabs_d(x);
2013 mask_erf = gmx_simd_cmplt_d(xabs, one);
2014 x2 = gmx_simd_mul_d(x, x);
2015 x4 = gmx_simd_mul_d(x2, x2);
2017 PolyAP0 = gmx_simd_mul_d(CAP4, x4);
2018 PolyAP1 = gmx_simd_mul_d(CAP3, x4);
2019 PolyAP0 = gmx_simd_add_d(PolyAP0, CAP2);
2020 PolyAP1 = gmx_simd_add_d(PolyAP1, CAP1);
2021 PolyAP0 = gmx_simd_mul_d(PolyAP0, x4);
2022 PolyAP1 = gmx_simd_mul_d(PolyAP1, x2);
2023 PolyAP0 = gmx_simd_add_d(PolyAP0, CAP0);
2024 PolyAP0 = gmx_simd_add_d(PolyAP0, PolyAP1);
2026 PolyAQ1 = gmx_simd_mul_d(CAQ5, x4);
2027 PolyAQ0 = gmx_simd_mul_d(CAQ4, x4);
2028 PolyAQ1 = gmx_simd_add_d(PolyAQ1, CAQ3);
2029 PolyAQ0 = gmx_simd_add_d(PolyAQ0, CAQ2);
2030 PolyAQ1 = gmx_simd_mul_d(PolyAQ1, x4);
2031 PolyAQ0 = gmx_simd_mul_d(PolyAQ0, x4);
2032 PolyAQ1 = gmx_simd_add_d(PolyAQ1, CAQ1);
2033 PolyAQ0 = gmx_simd_add_d(PolyAQ0, one);
2034 PolyAQ1 = gmx_simd_mul_d(PolyAQ1, x2);
2035 PolyAQ0 = gmx_simd_add_d(PolyAQ0, PolyAQ1);
2037 res_erf = gmx_simd_mul_d(PolyAP0, gmx_simd_inv_maskfpe_d(PolyAQ0, mask_erf));
2038 res_erf = gmx_simd_add_d(CAoffset, res_erf);
2039 res_erf = gmx_simd_mul_d(x, res_erf);
2041 /* Calculate erfc() in range [1,4.5] */
2042 t = gmx_simd_sub_d(xabs, one);
2043 t2 = gmx_simd_mul_d(t, t);
2045 PolyBP0 = gmx_simd_mul_d(CBP6, t2);
2046 PolyBP1 = gmx_simd_mul_d(CBP5, t2);
2047 PolyBP0 = gmx_simd_add_d(PolyBP0, CBP4);
2048 PolyBP1 = gmx_simd_add_d(PolyBP1, CBP3);
2049 PolyBP0 = gmx_simd_mul_d(PolyBP0, t2);
2050 PolyBP1 = gmx_simd_mul_d(PolyBP1, t2);
2051 PolyBP0 = gmx_simd_add_d(PolyBP0, CBP2);
2052 PolyBP1 = gmx_simd_add_d(PolyBP1, CBP1);
2053 PolyBP0 = gmx_simd_mul_d(PolyBP0, t2);
2054 PolyBP1 = gmx_simd_mul_d(PolyBP1, t);
2055 PolyBP0 = gmx_simd_add_d(PolyBP0, CBP0);
2056 PolyBP0 = gmx_simd_add_d(PolyBP0, PolyBP1);
2058 PolyBQ1 = gmx_simd_mul_d(CBQ7, t2);
2059 PolyBQ0 = gmx_simd_mul_d(CBQ6, t2);
2060 PolyBQ1 = gmx_simd_add_d(PolyBQ1, CBQ5);
2061 PolyBQ0 = gmx_simd_add_d(PolyBQ0, CBQ4);
2062 PolyBQ1 = gmx_simd_mul_d(PolyBQ1, t2);
2063 PolyBQ0 = gmx_simd_mul_d(PolyBQ0, t2);
2064 PolyBQ1 = gmx_simd_add_d(PolyBQ1, CBQ3);
2065 PolyBQ0 = gmx_simd_add_d(PolyBQ0, CBQ2);
2066 PolyBQ1 = gmx_simd_mul_d(PolyBQ1, t2);
2067 PolyBQ0 = gmx_simd_mul_d(PolyBQ0, t2);
2068 PolyBQ1 = gmx_simd_add_d(PolyBQ1, CBQ1);
2069 PolyBQ0 = gmx_simd_add_d(PolyBQ0, one);
2070 PolyBQ1 = gmx_simd_mul_d(PolyBQ1, t);
2071 PolyBQ0 = gmx_simd_add_d(PolyBQ0, PolyBQ1);
2073 res_erfcB = gmx_simd_mul_d(PolyBP0, gmx_simd_inv_notmaskfpe_d(PolyBQ0, mask_erf));
2075 res_erfcB = gmx_simd_mul_d(res_erfcB, xabs);
2077 /* Calculate erfc() in range [4.5,inf] */
2078 w = gmx_simd_inv_notmaskfpe_d(xabs, mask_erf);
2079 w2 = gmx_simd_mul_d(w, w);
2081 PolyCP0 = gmx_simd_mul_d(CCP6, w2);
2082 PolyCP1 = gmx_simd_mul_d(CCP5, w2);
2083 PolyCP0 = gmx_simd_add_d(PolyCP0, CCP4);
2084 PolyCP1 = gmx_simd_add_d(PolyCP1, CCP3);
2085 PolyCP0 = gmx_simd_mul_d(PolyCP0, w2);
2086 PolyCP1 = gmx_simd_mul_d(PolyCP1, w2);
2087 PolyCP0 = gmx_simd_add_d(PolyCP0, CCP2);
2088 PolyCP1 = gmx_simd_add_d(PolyCP1, CCP1);
2089 PolyCP0 = gmx_simd_mul_d(PolyCP0, w2);
2090 PolyCP1 = gmx_simd_mul_d(PolyCP1, w);
2091 PolyCP0 = gmx_simd_add_d(PolyCP0, CCP0);
2092 PolyCP0 = gmx_simd_add_d(PolyCP0, PolyCP1);
2094 PolyCQ0 = gmx_simd_mul_d(CCQ6, w2);
2095 PolyCQ1 = gmx_simd_mul_d(CCQ5, w2);
2096 PolyCQ0 = gmx_simd_add_d(PolyCQ0, CCQ4);
2097 PolyCQ1 = gmx_simd_add_d(PolyCQ1, CCQ3);
2098 PolyCQ0 = gmx_simd_mul_d(PolyCQ0, w2);
2099 PolyCQ1 = gmx_simd_mul_d(PolyCQ1, w2);
2100 PolyCQ0 = gmx_simd_add_d(PolyCQ0, CCQ2);
2101 PolyCQ1 = gmx_simd_add_d(PolyCQ1, CCQ1);
2102 PolyCQ0 = gmx_simd_mul_d(PolyCQ0, w2);
2103 PolyCQ1 = gmx_simd_mul_d(PolyCQ1, w);
2104 PolyCQ0 = gmx_simd_add_d(PolyCQ0, one);
2105 PolyCQ0 = gmx_simd_add_d(PolyCQ0, PolyCQ1);
2107 expmx2 = gmx_simd_exp_d( gmx_simd_fneg_d(x2) );
2109 res_erfcC = gmx_simd_mul_d(PolyCP0, gmx_simd_inv_notmaskfpe_d(PolyCQ0, mask_erf));
2110 res_erfcC = gmx_simd_add_d(res_erfcC, CCoffset);
2111 res_erfcC = gmx_simd_mul_d(res_erfcC, w);
2113 mask = gmx_simd_cmplt_d(gmx_simd_set1_d(4.5), xabs);
2114 res_erfc = gmx_simd_blendv_d(res_erfcB, res_erfcC, mask);
2116 res_erfc = gmx_simd_mul_d(res_erfc, expmx2);
2118 /* erfc(x<0) = 2-erfc(|x|) */
2119 mask = gmx_simd_cmplt_d(x, gmx_simd_setzero_d());
2120 res_erfc = gmx_simd_blendv_d(res_erfc, gmx_simd_sub_d(two, res_erfc), mask);
2122 /* Select erf() or erfc() */
2123 res = gmx_simd_blendv_d(res_erfc, gmx_simd_sub_d(one, res_erf), mask_erf);
2125 return res;
2128 /*! \brief SIMD double sin \& cos.
2130 * \copydetails gmx_simd_sincos_f
2132 static gmx_inline void gmx_simdcall
2133 gmx_simd_sincos_d(gmx_simd_double_t x, gmx_simd_double_t *sinval, gmx_simd_double_t *cosval)
2135 /* Constants to subtract Pi/4*x from y while minimizing precision loss */
2136 const gmx_simd_double_t argred0 = gmx_simd_set1_d(-2*0.78539816290140151978);
2137 const gmx_simd_double_t argred1 = gmx_simd_set1_d(-2*4.9604678871439933374e-10);
2138 const gmx_simd_double_t argred2 = gmx_simd_set1_d(-2*1.1258708853173288931e-18);
2139 const gmx_simd_double_t argred3 = gmx_simd_set1_d(-2*1.7607799325916000908e-27);
2140 const gmx_simd_double_t two_over_pi = gmx_simd_set1_d(2.0/M_PI);
2141 const gmx_simd_double_t const_sin5 = gmx_simd_set1_d( 1.58938307283228937328511e-10);
2142 const gmx_simd_double_t const_sin4 = gmx_simd_set1_d(-2.50506943502539773349318e-08);
2143 const gmx_simd_double_t const_sin3 = gmx_simd_set1_d( 2.75573131776846360512547e-06);
2144 const gmx_simd_double_t const_sin2 = gmx_simd_set1_d(-0.000198412698278911770864914);
2145 const gmx_simd_double_t const_sin1 = gmx_simd_set1_d( 0.0083333333333191845961746);
2146 const gmx_simd_double_t const_sin0 = gmx_simd_set1_d(-0.166666666666666130709393);
2148 const gmx_simd_double_t const_cos7 = gmx_simd_set1_d(-1.13615350239097429531523e-11);
2149 const gmx_simd_double_t const_cos6 = gmx_simd_set1_d( 2.08757471207040055479366e-09);
2150 const gmx_simd_double_t const_cos5 = gmx_simd_set1_d(-2.75573144028847567498567e-07);
2151 const gmx_simd_double_t const_cos4 = gmx_simd_set1_d( 2.48015872890001867311915e-05);
2152 const gmx_simd_double_t const_cos3 = gmx_simd_set1_d(-0.00138888888888714019282329);
2153 const gmx_simd_double_t const_cos2 = gmx_simd_set1_d( 0.0416666666666665519592062);
2154 const gmx_simd_double_t half = gmx_simd_set1_d(0.5);
2155 const gmx_simd_double_t one = gmx_simd_set1_d(1.0);
2156 gmx_simd_double_t ssign, csign;
2157 gmx_simd_double_t x2, y, z, psin, pcos, sss, ccc;
2158 gmx_simd_dbool_t mask;
2159 #if (defined GMX_SIMD_HAVE_DINT32) && (defined GMX_SIMD_HAVE_DINT32_ARITHMETICS) && (defined GMX_SIMD_HAVE_LOGICAL)
2160 const gmx_simd_dint32_t ione = gmx_simd_set1_di(1);
2161 const gmx_simd_dint32_t itwo = gmx_simd_set1_di(2);
2162 gmx_simd_dint32_t iy;
2164 z = gmx_simd_mul_d(x, two_over_pi);
2165 iy = gmx_simd_cvt_d2i(z);
2166 y = gmx_simd_round_d(z);
2168 mask = gmx_simd_cvt_dib2db(gmx_simd_cmpeq_di(gmx_simd_and_di(iy, ione), gmx_simd_setzero_di()));
2169 ssign = gmx_simd_blendzero_d(gmx_simd_set1_d(GMX_DOUBLE_NEGZERO), gmx_simd_cvt_dib2db(gmx_simd_cmpeq_di(gmx_simd_and_di(iy, itwo), itwo)));
2170 csign = gmx_simd_blendzero_d(gmx_simd_set1_d(GMX_DOUBLE_NEGZERO), gmx_simd_cvt_dib2db(gmx_simd_cmpeq_di(gmx_simd_and_di(gmx_simd_add_di(iy, ione), itwo), itwo)));
2171 #else
2172 const gmx_simd_double_t quarter = gmx_simd_set1_d(0.25);
2173 const gmx_simd_double_t minusquarter = gmx_simd_set1_d(-0.25);
2174 gmx_simd_double_t q;
2175 gmx_simd_dbool_t m1, m2, m3;
2177 /* The most obvious way to find the arguments quadrant in the unit circle
2178 * to calculate the sign is to use integer arithmetic, but that is not
2179 * present in all SIMD implementations. As an alternative, we have devised a
2180 * pure floating-point algorithm that uses truncation for argument reduction
2181 * so that we get a new value 0<=q<1 over the unit circle, and then
2182 * do floating-point comparisons with fractions. This is likely to be
2183 * slightly slower (~10%) due to the longer latencies of floating-point, so
2184 * we only use it when integer SIMD arithmetic is not present.
2186 ssign = x;
2187 x = gmx_simd_fabs_d(x);
2188 /* It is critical that half-way cases are rounded down */
2189 z = gmx_simd_fmadd_d(x, two_over_pi, half);
2190 y = gmx_simd_trunc_d(z);
2191 q = gmx_simd_mul_d(z, quarter);
2192 q = gmx_simd_sub_d(q, gmx_simd_trunc_d(q));
2193 /* z now starts at 0.0 for x=-pi/4 (although neg. values cannot occur), and
2194 * then increased by 1.0 as x increases by 2*Pi, when it resets to 0.0.
2195 * This removes the 2*Pi periodicity without using any integer arithmetic.
2196 * First check if y had the value 2 or 3, set csign if true.
2198 q = gmx_simd_sub_d(q, half);
2199 /* If we have logical operations we can work directly on the signbit, which
2200 * saves instructions. Otherwise we need to represent signs as +1.0/-1.0.
2201 * Thus, if you are altering defines to debug alternative code paths, the
2202 * two GMX_SIMD_HAVE_LOGICAL sections in this routine must either both be
2203 * active or inactive - you will get errors if only one is used.
2205 # ifdef GMX_SIMD_HAVE_LOGICAL
2206 ssign = gmx_simd_and_d(ssign, gmx_simd_set1_d(GMX_DOUBLE_NEGZERO));
2207 csign = gmx_simd_andnot_d(q, gmx_simd_set1_d(GMX_DOUBLE_NEGZERO));
2208 ssign = gmx_simd_xor_d(ssign, csign);
2209 # else
2210 csign = gmx_simd_xor_sign_d(gmx_simd_set1_d(-1.0), q);
2211 ssign = gmx_simd_xor_sign_d(ssign, csign); /* swap ssign if csign was set. */
2212 # endif
2213 /* Check if y had value 1 or 3 (remember we subtracted 0.5 from q) */
2214 m1 = gmx_simd_cmplt_d(q, minusquarter);
2215 m2 = gmx_simd_cmple_d(gmx_simd_setzero_d(), q);
2216 m3 = gmx_simd_cmplt_d(q, quarter);
2217 m2 = gmx_simd_and_db(m2, m3);
2218 mask = gmx_simd_or_db(m1, m2);
2219 /* where mask is FALSE, set sign. */
2220 csign = gmx_simd_xor_sign_d(csign, gmx_simd_blendv_d(gmx_simd_set1_d(-1.0), one, mask));
2221 #endif
2222 x = gmx_simd_fmadd_d(y, argred0, x);
2223 x = gmx_simd_fmadd_d(y, argred1, x);
2224 x = gmx_simd_fmadd_d(y, argred2, x);
2225 x = gmx_simd_fmadd_d(y, argred3, x);
2226 x2 = gmx_simd_mul_d(x, x);
2228 psin = gmx_simd_fmadd_d(const_sin5, x2, const_sin4);
2229 psin = gmx_simd_fmadd_d(psin, x2, const_sin3);
2230 psin = gmx_simd_fmadd_d(psin, x2, const_sin2);
2231 psin = gmx_simd_fmadd_d(psin, x2, const_sin1);
2232 psin = gmx_simd_fmadd_d(psin, x2, const_sin0);
2233 psin = gmx_simd_fmadd_d(psin, gmx_simd_mul_d(x2, x), x);
2235 pcos = gmx_simd_fmadd_d(const_cos7, x2, const_cos6);
2236 pcos = gmx_simd_fmadd_d(pcos, x2, const_cos5);
2237 pcos = gmx_simd_fmadd_d(pcos, x2, const_cos4);
2238 pcos = gmx_simd_fmadd_d(pcos, x2, const_cos3);
2239 pcos = gmx_simd_fmadd_d(pcos, x2, const_cos2);
2240 pcos = gmx_simd_fmsub_d(pcos, x2, half);
2241 pcos = gmx_simd_fmadd_d(pcos, x2, one);
2243 sss = gmx_simd_blendv_d(pcos, psin, mask);
2244 ccc = gmx_simd_blendv_d(psin, pcos, mask);
2245 /* See comment for GMX_SIMD_HAVE_LOGICAL section above. */
2246 #ifdef GMX_SIMD_HAVE_LOGICAL
2247 *sinval = gmx_simd_xor_d(sss, ssign);
2248 *cosval = gmx_simd_xor_d(ccc, csign);
2249 #else
2250 *sinval = gmx_simd_xor_sign_d(sss, ssign);
2251 *cosval = gmx_simd_xor_sign_d(ccc, csign);
2252 #endif
2255 /*! \brief SIMD double sin(x).
2257 * \copydetails gmx_simd_sin_f
2259 static gmx_inline gmx_simd_double_t gmx_simdcall
2260 gmx_simd_sin_d(gmx_simd_double_t x)
2262 gmx_simd_double_t s, c;
2263 gmx_simd_sincos_d(x, &s, &c);
2264 return s;
2267 /*! \brief SIMD double cos(x).
2269 * \copydetails gmx_simd_cos_f
2271 static gmx_inline gmx_simd_double_t gmx_simdcall
2272 gmx_simd_cos_d(gmx_simd_double_t x)
2274 gmx_simd_double_t s, c;
2275 gmx_simd_sincos_d(x, &s, &c);
2276 return c;
2279 /*! \brief SIMD double tan(x).
2281 * \copydetails gmx_simd_tan_f
2283 static gmx_inline gmx_simd_double_t gmx_simdcall
2284 gmx_simd_tan_d(gmx_simd_double_t x)
2286 const gmx_simd_double_t argred0 = gmx_simd_set1_d(-2*0.78539816290140151978);
2287 const gmx_simd_double_t argred1 = gmx_simd_set1_d(-2*4.9604678871439933374e-10);
2288 const gmx_simd_double_t argred2 = gmx_simd_set1_d(-2*1.1258708853173288931e-18);
2289 const gmx_simd_double_t argred3 = gmx_simd_set1_d(-2*1.7607799325916000908e-27);
2290 const gmx_simd_double_t two_over_pi = gmx_simd_set1_d(2.0/M_PI);
2291 const gmx_simd_double_t CT15 = gmx_simd_set1_d(1.01419718511083373224408e-05);
2292 const gmx_simd_double_t CT14 = gmx_simd_set1_d(-2.59519791585924697698614e-05);
2293 const gmx_simd_double_t CT13 = gmx_simd_set1_d(5.23388081915899855325186e-05);
2294 const gmx_simd_double_t CT12 = gmx_simd_set1_d(-3.05033014433946488225616e-05);
2295 const gmx_simd_double_t CT11 = gmx_simd_set1_d(7.14707504084242744267497e-05);
2296 const gmx_simd_double_t CT10 = gmx_simd_set1_d(8.09674518280159187045078e-05);
2297 const gmx_simd_double_t CT9 = gmx_simd_set1_d(0.000244884931879331847054404);
2298 const gmx_simd_double_t CT8 = gmx_simd_set1_d(0.000588505168743587154904506);
2299 const gmx_simd_double_t CT7 = gmx_simd_set1_d(0.00145612788922812427978848);
2300 const gmx_simd_double_t CT6 = gmx_simd_set1_d(0.00359208743836906619142924);
2301 const gmx_simd_double_t CT5 = gmx_simd_set1_d(0.00886323944362401618113356);
2302 const gmx_simd_double_t CT4 = gmx_simd_set1_d(0.0218694882853846389592078);
2303 const gmx_simd_double_t CT3 = gmx_simd_set1_d(0.0539682539781298417636002);
2304 const gmx_simd_double_t CT2 = gmx_simd_set1_d(0.133333333333125941821962);
2305 const gmx_simd_double_t CT1 = gmx_simd_set1_d(0.333333333333334980164153);
2307 gmx_simd_double_t x2, p, y, z;
2308 gmx_simd_dbool_t mask;
2310 #if (defined GMX_SIMD_HAVE_DINT32) && (defined GMX_SIMD_HAVE_DINT32_ARITHMETICS) && (defined GMX_SIMD_HAVE_LOGICAL)
2311 gmx_simd_dint32_t iy;
2312 gmx_simd_dint32_t ione = gmx_simd_set1_di(1);
2314 z = gmx_simd_mul_d(x, two_over_pi);
2315 iy = gmx_simd_cvt_d2i(z);
2316 y = gmx_simd_round_d(z);
2317 mask = gmx_simd_cvt_dib2db(gmx_simd_cmpeq_di(gmx_simd_and_di(iy, ione), ione));
2319 x = gmx_simd_fmadd_d(y, argred0, x);
2320 x = gmx_simd_fmadd_d(y, argred1, x);
2321 x = gmx_simd_fmadd_d(y, argred2, x);
2322 x = gmx_simd_fmadd_d(y, argred3, x);
2323 x = gmx_simd_xor_d(gmx_simd_blendzero_d(gmx_simd_set1_d(GMX_DOUBLE_NEGZERO), mask), x);
2324 #else
2325 const gmx_simd_double_t quarter = gmx_simd_set1_d(0.25);
2326 const gmx_simd_double_t half = gmx_simd_set1_d(0.5);
2327 const gmx_simd_double_t threequarter = gmx_simd_set1_d(0.75);
2328 gmx_simd_double_t w, q;
2329 gmx_simd_dbool_t m1, m2, m3;
2331 w = gmx_simd_fabs_d(x);
2332 z = gmx_simd_fmadd_d(w, two_over_pi, half);
2333 y = gmx_simd_trunc_d(z);
2334 q = gmx_simd_mul_d(z, quarter);
2335 q = gmx_simd_sub_d(q, gmx_simd_trunc_d(q));
2336 m1 = gmx_simd_cmple_d(quarter, q);
2337 m2 = gmx_simd_cmplt_d(q, half);
2338 m3 = gmx_simd_cmple_d(threequarter, q);
2339 m1 = gmx_simd_and_db(m1, m2);
2340 mask = gmx_simd_or_db(m1, m3);
2341 w = gmx_simd_fmadd_d(y, argred0, w);
2342 w = gmx_simd_fmadd_d(y, argred1, w);
2343 w = gmx_simd_fmadd_d(y, argred2, w);
2344 w = gmx_simd_fmadd_d(y, argred3, w);
2346 w = gmx_simd_blendv_d(w, gmx_simd_fneg_d(w), mask);
2347 x = gmx_simd_xor_sign_d(w, x);
2348 #endif
2349 x2 = gmx_simd_mul_d(x, x);
2350 p = gmx_simd_fmadd_d(CT15, x2, CT14);
2351 p = gmx_simd_fmadd_d(p, x2, CT13);
2352 p = gmx_simd_fmadd_d(p, x2, CT12);
2353 p = gmx_simd_fmadd_d(p, x2, CT11);
2354 p = gmx_simd_fmadd_d(p, x2, CT10);
2355 p = gmx_simd_fmadd_d(p, x2, CT9);
2356 p = gmx_simd_fmadd_d(p, x2, CT8);
2357 p = gmx_simd_fmadd_d(p, x2, CT7);
2358 p = gmx_simd_fmadd_d(p, x2, CT6);
2359 p = gmx_simd_fmadd_d(p, x2, CT5);
2360 p = gmx_simd_fmadd_d(p, x2, CT4);
2361 p = gmx_simd_fmadd_d(p, x2, CT3);
2362 p = gmx_simd_fmadd_d(p, x2, CT2);
2363 p = gmx_simd_fmadd_d(p, x2, CT1);
2364 p = gmx_simd_fmadd_d(x2, gmx_simd_mul_d(p, x), x);
2366 p = gmx_simd_blendv_d( p, gmx_simd_inv_maskfpe_d(p, mask), mask);
2367 return p;
2370 /*! \brief SIMD double asin(x).
2372 * \copydetails gmx_simd_asin_f
2374 static gmx_inline gmx_simd_double_t gmx_simdcall
2375 gmx_simd_asin_d(gmx_simd_double_t x)
2377 /* Same algorithm as cephes library */
2378 const gmx_simd_double_t limit1 = gmx_simd_set1_d(0.625);
2379 const gmx_simd_double_t limit2 = gmx_simd_set1_d(1e-8);
2380 const gmx_simd_double_t one = gmx_simd_set1_d(1.0);
2381 const gmx_simd_double_t quarterpi = gmx_simd_set1_d(M_PI/4.0);
2382 const gmx_simd_double_t morebits = gmx_simd_set1_d(6.123233995736765886130e-17);
2384 const gmx_simd_double_t P5 = gmx_simd_set1_d(4.253011369004428248960e-3);
2385 const gmx_simd_double_t P4 = gmx_simd_set1_d(-6.019598008014123785661e-1);
2386 const gmx_simd_double_t P3 = gmx_simd_set1_d(5.444622390564711410273e0);
2387 const gmx_simd_double_t P2 = gmx_simd_set1_d(-1.626247967210700244449e1);
2388 const gmx_simd_double_t P1 = gmx_simd_set1_d(1.956261983317594739197e1);
2389 const gmx_simd_double_t P0 = gmx_simd_set1_d(-8.198089802484824371615e0);
2391 const gmx_simd_double_t Q4 = gmx_simd_set1_d(-1.474091372988853791896e1);
2392 const gmx_simd_double_t Q3 = gmx_simd_set1_d(7.049610280856842141659e1);
2393 const gmx_simd_double_t Q2 = gmx_simd_set1_d(-1.471791292232726029859e2);
2394 const gmx_simd_double_t Q1 = gmx_simd_set1_d(1.395105614657485689735e2);
2395 const gmx_simd_double_t Q0 = gmx_simd_set1_d(-4.918853881490881290097e1);
2397 const gmx_simd_double_t R4 = gmx_simd_set1_d(2.967721961301243206100e-3);
2398 const gmx_simd_double_t R3 = gmx_simd_set1_d(-5.634242780008963776856e-1);
2399 const gmx_simd_double_t R2 = gmx_simd_set1_d(6.968710824104713396794e0);
2400 const gmx_simd_double_t R1 = gmx_simd_set1_d(-2.556901049652824852289e1);
2401 const gmx_simd_double_t R0 = gmx_simd_set1_d(2.853665548261061424989e1);
2403 const gmx_simd_double_t S3 = gmx_simd_set1_d(-2.194779531642920639778e1);
2404 const gmx_simd_double_t S2 = gmx_simd_set1_d(1.470656354026814941758e2);
2405 const gmx_simd_double_t S1 = gmx_simd_set1_d(-3.838770957603691357202e2);
2406 const gmx_simd_double_t S0 = gmx_simd_set1_d(3.424398657913078477438e2);
2408 gmx_simd_double_t xabs;
2409 gmx_simd_double_t zz, ww, z, q, w, zz2, ww2;
2410 gmx_simd_double_t PA, PB;
2411 gmx_simd_double_t QA, QB;
2412 gmx_simd_double_t RA, RB;
2413 gmx_simd_double_t SA, SB;
2414 gmx_simd_double_t nom, denom;
2415 gmx_simd_dbool_t mask, mask2;
2417 xabs = gmx_simd_fabs_d(x);
2419 mask = gmx_simd_cmplt_d(limit1, xabs);
2421 zz = gmx_simd_sub_d(one, xabs);
2422 ww = gmx_simd_mul_d(xabs, xabs);
2423 zz2 = gmx_simd_mul_d(zz, zz);
2424 ww2 = gmx_simd_mul_d(ww, ww);
2426 /* R */
2427 RA = gmx_simd_mul_d(R4, zz2);
2428 RB = gmx_simd_mul_d(R3, zz2);
2429 RA = gmx_simd_add_d(RA, R2);
2430 RB = gmx_simd_add_d(RB, R1);
2431 RA = gmx_simd_mul_d(RA, zz2);
2432 RB = gmx_simd_mul_d(RB, zz);
2433 RA = gmx_simd_add_d(RA, R0);
2434 RA = gmx_simd_add_d(RA, RB);
2436 /* S, SA = zz2 */
2437 SB = gmx_simd_mul_d(S3, zz2);
2438 SA = gmx_simd_add_d(zz2, S2);
2439 SB = gmx_simd_add_d(SB, S1);
2440 SA = gmx_simd_mul_d(SA, zz2);
2441 SB = gmx_simd_mul_d(SB, zz);
2442 SA = gmx_simd_add_d(SA, S0);
2443 SA = gmx_simd_add_d(SA, SB);
2445 /* P */
2446 PA = gmx_simd_mul_d(P5, ww2);
2447 PB = gmx_simd_mul_d(P4, ww2);
2448 PA = gmx_simd_add_d(PA, P3);
2449 PB = gmx_simd_add_d(PB, P2);
2450 PA = gmx_simd_mul_d(PA, ww2);
2451 PB = gmx_simd_mul_d(PB, ww2);
2452 PA = gmx_simd_add_d(PA, P1);
2453 PB = gmx_simd_add_d(PB, P0);
2454 PA = gmx_simd_mul_d(PA, ww);
2455 PA = gmx_simd_add_d(PA, PB);
2457 /* Q, QA = ww2 */
2458 QB = gmx_simd_mul_d(Q4, ww2);
2459 QA = gmx_simd_add_d(ww2, Q3);
2460 QB = gmx_simd_add_d(QB, Q2);
2461 QA = gmx_simd_mul_d(QA, ww2);
2462 QB = gmx_simd_mul_d(QB, ww2);
2463 QA = gmx_simd_add_d(QA, Q1);
2464 QB = gmx_simd_add_d(QB, Q0);
2465 QA = gmx_simd_mul_d(QA, ww);
2466 QA = gmx_simd_add_d(QA, QB);
2468 RA = gmx_simd_mul_d(RA, zz);
2469 PA = gmx_simd_mul_d(PA, ww);
2471 nom = gmx_simd_blendv_d( PA, RA, mask );
2472 denom = gmx_simd_blendv_d( QA, SA, mask );
2474 mask2 = gmx_simd_cmplt_d(limit2, xabs);
2475 q = gmx_simd_mul_d( nom, gmx_simd_inv_maskfpe_d(denom, mask2) );
2477 zz = gmx_simd_add_d(zz, zz);
2478 zz = gmx_simd_sqrt_d(zz);
2479 z = gmx_simd_sub_d(quarterpi, zz);
2480 zz = gmx_simd_mul_d(zz, q);
2481 zz = gmx_simd_sub_d(zz, morebits);
2482 z = gmx_simd_sub_d(z, zz);
2483 z = gmx_simd_add_d(z, quarterpi);
2485 w = gmx_simd_mul_d(xabs, q);
2486 w = gmx_simd_add_d(w, xabs);
2488 z = gmx_simd_blendv_d( w, z, mask );
2490 z = gmx_simd_blendv_d( xabs, z, mask2 );
2492 z = gmx_simd_xor_sign_d(z, x);
2494 return z;
2497 /*! \brief SIMD double acos(x).
2499 * \copydetails gmx_simd_acos_f
2501 static gmx_inline gmx_simd_double_t gmx_simdcall
2502 gmx_simd_acos_d(gmx_simd_double_t x)
2504 const gmx_simd_double_t one = gmx_simd_set1_d(1.0);
2505 const gmx_simd_double_t half = gmx_simd_set1_d(0.5);
2506 const gmx_simd_double_t quarterpi0 = gmx_simd_set1_d(7.85398163397448309616e-1);
2507 const gmx_simd_double_t quarterpi1 = gmx_simd_set1_d(6.123233995736765886130e-17);
2509 gmx_simd_dbool_t mask1;
2510 gmx_simd_double_t z, z1, z2;
2512 mask1 = gmx_simd_cmplt_d(half, x);
2513 z1 = gmx_simd_mul_d(half, gmx_simd_sub_d(one, x));
2514 z1 = gmx_simd_sqrt_d(z1);
2515 z = gmx_simd_blendv_d( x, z1, mask1 );
2517 z = gmx_simd_asin_d(z);
2519 z1 = gmx_simd_add_d(z, z);
2521 z2 = gmx_simd_sub_d(quarterpi0, z);
2522 z2 = gmx_simd_add_d(z2, quarterpi1);
2523 z2 = gmx_simd_add_d(z2, quarterpi0);
2525 z = gmx_simd_blendv_d(z2, z1, mask1);
2527 return z;
2530 /*! \brief SIMD double atan(x).
2532 * \copydetails gmx_simd_atan_f
2534 static gmx_inline gmx_simd_double_t gmx_simdcall
2535 gmx_simd_atan_d(gmx_simd_double_t x)
2537 /* Same algorithm as cephes library */
2538 const gmx_simd_double_t limit1 = gmx_simd_set1_d(0.66);
2539 const gmx_simd_double_t limit2 = gmx_simd_set1_d(2.41421356237309504880);
2540 const gmx_simd_double_t quarterpi = gmx_simd_set1_d(M_PI/4.0);
2541 const gmx_simd_double_t halfpi = gmx_simd_set1_d(M_PI/2.0);
2542 const gmx_simd_double_t mone = gmx_simd_set1_d(-1.0);
2543 const gmx_simd_double_t morebits1 = gmx_simd_set1_d(0.5*6.123233995736765886130E-17);
2544 const gmx_simd_double_t morebits2 = gmx_simd_set1_d(6.123233995736765886130E-17);
2546 const gmx_simd_double_t P4 = gmx_simd_set1_d(-8.750608600031904122785E-1);
2547 const gmx_simd_double_t P3 = gmx_simd_set1_d(-1.615753718733365076637E1);
2548 const gmx_simd_double_t P2 = gmx_simd_set1_d(-7.500855792314704667340E1);
2549 const gmx_simd_double_t P1 = gmx_simd_set1_d(-1.228866684490136173410E2);
2550 const gmx_simd_double_t P0 = gmx_simd_set1_d(-6.485021904942025371773E1);
2552 const gmx_simd_double_t Q4 = gmx_simd_set1_d(2.485846490142306297962E1);
2553 const gmx_simd_double_t Q3 = gmx_simd_set1_d(1.650270098316988542046E2);
2554 const gmx_simd_double_t Q2 = gmx_simd_set1_d(4.328810604912902668951E2);
2555 const gmx_simd_double_t Q1 = gmx_simd_set1_d(4.853903996359136964868E2);
2556 const gmx_simd_double_t Q0 = gmx_simd_set1_d(1.945506571482613964425E2);
2558 gmx_simd_double_t y, xabs, t1, t2;
2559 gmx_simd_double_t z, z2;
2560 gmx_simd_double_t P_A, P_B, Q_A, Q_B;
2561 gmx_simd_dbool_t mask1, mask2;
2563 xabs = gmx_simd_fabs_d(x);
2565 mask1 = gmx_simd_cmplt_d(limit1, xabs);
2566 mask2 = gmx_simd_cmplt_d(limit2, xabs);
2568 t1 = gmx_simd_mul_d(gmx_simd_add_d(xabs, mone),
2569 gmx_simd_inv_maskfpe_d(gmx_simd_sub_d(xabs, mone), mask1));
2570 t2 = gmx_simd_mul_d(mone, gmx_simd_inv_maskfpe_d(xabs, mask2));
2572 y = gmx_simd_blendzero_d(quarterpi, mask1);
2573 y = gmx_simd_blendv_d(y, halfpi, mask2);
2574 xabs = gmx_simd_blendv_d(xabs, t1, mask1);
2575 xabs = gmx_simd_blendv_d(xabs, t2, mask2);
2577 z = gmx_simd_mul_d(xabs, xabs);
2578 z2 = gmx_simd_mul_d(z, z);
2580 P_A = gmx_simd_mul_d(P4, z2);
2581 P_B = gmx_simd_mul_d(P3, z2);
2582 P_A = gmx_simd_add_d(P_A, P2);
2583 P_B = gmx_simd_add_d(P_B, P1);
2584 P_A = gmx_simd_mul_d(P_A, z2);
2585 P_B = gmx_simd_mul_d(P_B, z);
2586 P_A = gmx_simd_add_d(P_A, P0);
2587 P_A = gmx_simd_add_d(P_A, P_B);
2589 /* Q_A = z2 */
2590 Q_B = gmx_simd_mul_d(Q4, z2);
2591 Q_A = gmx_simd_add_d(z2, Q3);
2592 Q_B = gmx_simd_add_d(Q_B, Q2);
2593 Q_A = gmx_simd_mul_d(Q_A, z2);
2594 Q_B = gmx_simd_mul_d(Q_B, z2);
2595 Q_A = gmx_simd_add_d(Q_A, Q1);
2596 Q_B = gmx_simd_add_d(Q_B, Q0);
2597 Q_A = gmx_simd_mul_d(Q_A, z);
2598 Q_A = gmx_simd_add_d(Q_A, Q_B);
2600 z = gmx_simd_mul_d(z, P_A);
2601 z = gmx_simd_mul_d(z, gmx_simd_inv_d(Q_A));
2602 z = gmx_simd_mul_d(z, xabs);
2603 z = gmx_simd_add_d(z, xabs);
2605 t1 = gmx_simd_blendzero_d(morebits1, mask1);
2606 t1 = gmx_simd_blendv_d(t1, morebits2, mask2);
2608 z = gmx_simd_add_d(z, t1);
2609 y = gmx_simd_add_d(y, z);
2611 y = gmx_simd_xor_sign_d(y, x);
2613 return y;
2616 /*! \brief SIMD double atan2(y,x).
2618 * \copydetails gmx_simd_atan2_f
2620 static gmx_inline gmx_simd_double_t gmx_simdcall
2621 gmx_simd_atan2_d(gmx_simd_double_t y, gmx_simd_double_t x)
2623 const gmx_simd_double_t pi = gmx_simd_set1_d(M_PI);
2624 const gmx_simd_double_t halfpi = gmx_simd_set1_d(M_PI/2.0);
2625 gmx_simd_double_t xinv, p, aoffset;
2626 gmx_simd_dbool_t mask_x0, mask_y0, mask_xlt0, mask_ylt0;
2628 mask_x0 = gmx_simd_cmpeq_d(x, gmx_simd_setzero_d());
2629 mask_y0 = gmx_simd_cmpeq_d(y, gmx_simd_setzero_d());
2630 mask_xlt0 = gmx_simd_cmplt_d(x, gmx_simd_setzero_d());
2631 mask_ylt0 = gmx_simd_cmplt_d(y, gmx_simd_setzero_d());
2633 aoffset = gmx_simd_blendzero_d(halfpi, mask_x0);
2634 aoffset = gmx_simd_blendnotzero_d(aoffset, mask_y0);
2636 aoffset = gmx_simd_blendv_d(aoffset, pi, mask_xlt0);
2637 aoffset = gmx_simd_blendv_d(aoffset, gmx_simd_fneg_d(aoffset), mask_ylt0);
2639 xinv = gmx_simd_blendnotzero_d(gmx_simd_inv_notmaskfpe_d(x, mask_x0), mask_x0);
2640 p = gmx_simd_mul_d(y, xinv);
2641 p = gmx_simd_atan_d(p);
2642 p = gmx_simd_add_d(p, aoffset);
2644 return p;
2648 /*! \brief Calculate the force correction due to PME analytically for SIMD double.
2650 * \copydetails gmx_simd_pmecorrF_f
2652 static gmx_inline gmx_simd_double_t gmx_simdcall
2653 gmx_simd_pmecorrF_d(gmx_simd_double_t z2)
2655 const gmx_simd_double_t FN10 = gmx_simd_set1_d(-8.0072854618360083154e-14);
2656 const gmx_simd_double_t FN9 = gmx_simd_set1_d(1.1859116242260148027e-11);
2657 const gmx_simd_double_t FN8 = gmx_simd_set1_d(-8.1490406329798423616e-10);
2658 const gmx_simd_double_t FN7 = gmx_simd_set1_d(3.4404793543907847655e-8);
2659 const gmx_simd_double_t FN6 = gmx_simd_set1_d(-9.9471420832602741006e-7);
2660 const gmx_simd_double_t FN5 = gmx_simd_set1_d(0.000020740315999115847456);
2661 const gmx_simd_double_t FN4 = gmx_simd_set1_d(-0.00031991745139313364005);
2662 const gmx_simd_double_t FN3 = gmx_simd_set1_d(0.0035074449373659008203);
2663 const gmx_simd_double_t FN2 = gmx_simd_set1_d(-0.031750380176100813405);
2664 const gmx_simd_double_t FN1 = gmx_simd_set1_d(0.13884101728898463426);
2665 const gmx_simd_double_t FN0 = gmx_simd_set1_d(-0.75225277815249618847);
2667 const gmx_simd_double_t FD5 = gmx_simd_set1_d(0.000016009278224355026701);
2668 const gmx_simd_double_t FD4 = gmx_simd_set1_d(0.00051055686934806966046);
2669 const gmx_simd_double_t FD3 = gmx_simd_set1_d(0.0081803507497974289008);
2670 const gmx_simd_double_t FD2 = gmx_simd_set1_d(0.077181146026670287235);
2671 const gmx_simd_double_t FD1 = gmx_simd_set1_d(0.41543303143712535988);
2672 const gmx_simd_double_t FD0 = gmx_simd_set1_d(1.0);
2674 gmx_simd_double_t z4;
2675 gmx_simd_double_t polyFN0, polyFN1, polyFD0, polyFD1;
2677 z4 = gmx_simd_mul_d(z2, z2);
2679 polyFD1 = gmx_simd_fmadd_d(FD5, z4, FD3);
2680 polyFD1 = gmx_simd_fmadd_d(polyFD1, z4, FD1);
2681 polyFD1 = gmx_simd_mul_d(polyFD1, z2);
2682 polyFD0 = gmx_simd_fmadd_d(FD4, z4, FD2);
2683 polyFD0 = gmx_simd_fmadd_d(polyFD0, z4, FD0);
2684 polyFD0 = gmx_simd_add_d(polyFD0, polyFD1);
2686 polyFD0 = gmx_simd_inv_d(polyFD0);
2688 polyFN0 = gmx_simd_fmadd_d(FN10, z4, FN8);
2689 polyFN0 = gmx_simd_fmadd_d(polyFN0, z4, FN6);
2690 polyFN0 = gmx_simd_fmadd_d(polyFN0, z4, FN4);
2691 polyFN0 = gmx_simd_fmadd_d(polyFN0, z4, FN2);
2692 polyFN0 = gmx_simd_fmadd_d(polyFN0, z4, FN0);
2693 polyFN1 = gmx_simd_fmadd_d(FN9, z4, FN7);
2694 polyFN1 = gmx_simd_fmadd_d(polyFN1, z4, FN5);
2695 polyFN1 = gmx_simd_fmadd_d(polyFN1, z4, FN3);
2696 polyFN1 = gmx_simd_fmadd_d(polyFN1, z4, FN1);
2697 polyFN0 = gmx_simd_fmadd_d(polyFN1, z2, polyFN0);
2700 return gmx_simd_mul_d(polyFN0, polyFD0);
2705 /*! \brief Calculate the potential correction due to PME analytically for SIMD double.
2707 * \copydetails gmx_simd_pmecorrV_f
2709 static gmx_inline gmx_simd_double_t gmx_simdcall
2710 gmx_simd_pmecorrV_d(gmx_simd_double_t z2)
2712 const gmx_simd_double_t VN9 = gmx_simd_set1_d(-9.3723776169321855475e-13);
2713 const gmx_simd_double_t VN8 = gmx_simd_set1_d(1.2280156762674215741e-10);
2714 const gmx_simd_double_t VN7 = gmx_simd_set1_d(-7.3562157912251309487e-9);
2715 const gmx_simd_double_t VN6 = gmx_simd_set1_d(2.6215886208032517509e-7);
2716 const gmx_simd_double_t VN5 = gmx_simd_set1_d(-4.9532491651265819499e-6);
2717 const gmx_simd_double_t VN4 = gmx_simd_set1_d(0.00025907400778966060389);
2718 const gmx_simd_double_t VN3 = gmx_simd_set1_d(0.0010585044856156469792);
2719 const gmx_simd_double_t VN2 = gmx_simd_set1_d(0.045247661136833092885);
2720 const gmx_simd_double_t VN1 = gmx_simd_set1_d(0.11643931522926034421);
2721 const gmx_simd_double_t VN0 = gmx_simd_set1_d(1.1283791671726767970);
2723 const gmx_simd_double_t VD5 = gmx_simd_set1_d(0.000021784709867336150342);
2724 const gmx_simd_double_t VD4 = gmx_simd_set1_d(0.00064293662010911388448);
2725 const gmx_simd_double_t VD3 = gmx_simd_set1_d(0.0096311444822588683504);
2726 const gmx_simd_double_t VD2 = gmx_simd_set1_d(0.085608012351550627051);
2727 const gmx_simd_double_t VD1 = gmx_simd_set1_d(0.43652499166614811084);
2728 const gmx_simd_double_t VD0 = gmx_simd_set1_d(1.0);
2730 gmx_simd_double_t z4;
2731 gmx_simd_double_t polyVN0, polyVN1, polyVD0, polyVD1;
2733 z4 = gmx_simd_mul_d(z2, z2);
2735 polyVD1 = gmx_simd_fmadd_d(VD5, z4, VD3);
2736 polyVD0 = gmx_simd_fmadd_d(VD4, z4, VD2);
2737 polyVD1 = gmx_simd_fmadd_d(polyVD1, z4, VD1);
2738 polyVD0 = gmx_simd_fmadd_d(polyVD0, z4, VD0);
2739 polyVD0 = gmx_simd_fmadd_d(polyVD1, z2, polyVD0);
2741 polyVD0 = gmx_simd_inv_d(polyVD0);
2743 polyVN1 = gmx_simd_fmadd_d(VN9, z4, VN7);
2744 polyVN0 = gmx_simd_fmadd_d(VN8, z4, VN6);
2745 polyVN1 = gmx_simd_fmadd_d(polyVN1, z4, VN5);
2746 polyVN0 = gmx_simd_fmadd_d(polyVN0, z4, VN4);
2747 polyVN1 = gmx_simd_fmadd_d(polyVN1, z4, VN3);
2748 polyVN0 = gmx_simd_fmadd_d(polyVN0, z4, VN2);
2749 polyVN1 = gmx_simd_fmadd_d(polyVN1, z4, VN1);
2750 polyVN0 = gmx_simd_fmadd_d(polyVN0, z4, VN0);
2751 polyVN0 = gmx_simd_fmadd_d(polyVN1, z2, polyVN0);
2753 return gmx_simd_mul_d(polyVN0, polyVD0);
2756 /*! \} */
2759 /*! \name SIMD math functions for double prec. data, single prec. accuracy
2761 * \note In some cases we do not need full double accuracy of individual
2762 * SIMD math functions, although the data is stored in double precision
2763 * SIMD registers. This might be the case for special algorithms, or
2764 * if the architecture does not support single precision.
2765 * Since the full double precision evaluation of math functions
2766 * typically require much more expensive polynomial approximations
2767 * these functions implement the algorithms used in the single precision
2768 * SIMD math functions, but they operate on double precision
2769 * SIMD variables.
2771 * \note You should normally not use these functions directly, but the
2772 * real-precision wrappers instead. When Gromacs is compiled in single
2773 * precision, those will be aliases to the normal single precision
2774 * SIMD math functions.
2775 * \{
2778 /*********************************************************************
2779 * SIMD MATH FUNCTIONS WITH DOUBLE PREC. DATA, SINGLE PREC. ACCURACY *
2780 *********************************************************************/
2782 /*! \brief Calculate 1/sqrt(x) for SIMD double, but in single accuracy.
2784 * You should normally call the real-precision routine
2785 * \ref gmx_simd_invsqrt_singleaccuracy_r.
2787 * \param x Argument that must be >0. This routine does not check arguments.
2788 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
2790 static gmx_inline gmx_simd_double_t gmx_simdcall
2791 gmx_simd_invsqrt_singleaccuracy_d(gmx_simd_double_t x)
2793 gmx_simd_double_t lu = gmx_simd_rsqrt_d(x);
2794 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
2795 lu = gmx_simd_rsqrt_iter_d(lu, x);
2796 #endif
2797 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
2798 lu = gmx_simd_rsqrt_iter_d(lu, x);
2799 #endif
2800 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
2801 lu = gmx_simd_rsqrt_iter_d(lu, x);
2802 #endif
2803 return lu;
2806 /*! \brief 1/sqrt(x) for masked entries of SIMD double, but in single accuracy.
2808 * \copydetails gmx_simd_invsqrt_maskfpe_f
2810 static gmx_inline gmx_simd_double_t
2811 gmx_simd_invsqrt_maskfpe_singleaccuracy_d(gmx_simd_double_t x, gmx_simd_dbool_t gmx_unused m)
2813 #ifdef NDEBUG
2814 return gmx_simd_invsqrt_singleaccuracy_d(x);
2815 #else
2816 return gmx_simd_invsqrt_singleaccuracy_d(gmx_simd_blendv_d(gmx_simd_set1_d(1.0), x, m));
2817 #endif
2820 /*! \brief 1/sqrt(x) for non-masked entries of SIMD double, in single accuracy.
2822 * \copydetails gmx_simd_invsqrt_notmaskfpe_f
2824 static gmx_inline gmx_simd_double_t
2825 gmx_simd_invsqrt_notmaskfpe_singleaccuracy_d(gmx_simd_double_t x, gmx_simd_dbool_t gmx_unused m)
2827 #ifdef NDEBUG
2828 return gmx_simd_invsqrt_singleaccuracy_d(x);
2829 #else
2830 return gmx_simd_invsqrt_singleaccuracy_d(gmx_simd_blendv_d(x, gmx_simd_set1_d(1.0), m));
2831 #endif
2834 /*! \brief Calculate 1/sqrt(x) for two SIMD doubles, but single accuracy.
2836 * You should normally call the real-precision routine
2837 * \ref gmx_simd_invsqrt_pair_singleaccuracy_r.
2839 * \param x0 First set of arguments, x0 must be positive - no argument checking.
2840 * \param x1 Second set of arguments, x1 must be positive - no argument checking.
2841 * \param[out] out0 Result 1/sqrt(x0)
2842 * \param[out] out1 Result 1/sqrt(x1)
2844 * In particular for double precision we can sometimes calculate square root
2845 * pairs slightly faster by using single precision until the very last step.
2847 static gmx_inline void gmx_simdcall
2848 gmx_simd_invsqrt_pair_singleaccuracy_d(gmx_simd_double_t x0, gmx_simd_double_t x1,
2849 gmx_simd_double_t *out0, gmx_simd_double_t *out1)
2851 #if (defined GMX_SIMD_HAVE_FLOAT) && (GMX_SIMD_FLOAT_WIDTH == 2*GMX_SIMD_DOUBLE_WIDTH) && (GMX_SIMD_RSQRT_BITS < 22)
2852 gmx_simd_float_t xf = gmx_simd_cvt_dd2f(x0, x1);
2853 gmx_simd_float_t luf = gmx_simd_rsqrt_f(xf);
2854 gmx_simd_double_t lu0, lu1;
2855 /* Intermediate target is single - mantissa+1 bits */
2856 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
2857 luf = gmx_simd_rsqrt_iter_f(luf, xf);
2858 #endif
2859 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
2860 luf = gmx_simd_rsqrt_iter_f(luf, xf);
2861 #endif
2862 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
2863 luf = gmx_simd_rsqrt_iter_f(luf, xf);
2864 #endif
2865 gmx_simd_cvt_f2dd(luf, &lu0, &lu1);
2866 /* We now have single-precision accuracy values in lu0/lu1 */
2867 *out0 = lu0;
2868 *out1 = lu1;
2869 #else
2870 *out0 = gmx_simd_invsqrt_singleaccuracy_d(x0);
2871 *out1 = gmx_simd_invsqrt_singleaccuracy_d(x1);
2872 #endif
2876 /*! \brief Calculate 1/x for SIMD double, but in single accuracy.
2878 * You should normally call the real-precision routine
2879 * \ref gmx_simd_inv_singleaccuracy_r.
2881 * \param x Argument that must be nonzero. This routine does not check arguments.
2882 * \return 1/x. Result is undefined if your argument was invalid.
2884 static gmx_inline gmx_simd_double_t gmx_simdcall
2885 gmx_simd_inv_singleaccuracy_d(gmx_simd_double_t x)
2887 gmx_simd_double_t lu = gmx_simd_rcp_d(x);
2888 #if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
2889 lu = gmx_simd_rcp_iter_d(lu, x);
2890 #endif
2891 #if (GMX_SIMD_RCP_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
2892 lu = gmx_simd_rcp_iter_d(lu, x);
2893 #endif
2894 #if (GMX_SIMD_RCP_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
2895 lu = gmx_simd_rcp_iter_d(lu, x);
2896 #endif
2897 return lu;
2900 /*! \brief 1/x for masked entries of SIMD double, single accuracy.
2902 * \copydetails gmx_simd_inv_maskfpe_f
2904 static gmx_inline gmx_simd_double_t
2905 gmx_simd_inv_maskfpe_singleaccuracy_d(gmx_simd_double_t x, gmx_simd_dbool_t gmx_unused m)
2907 #ifdef NDEBUG
2908 return gmx_simd_inv_singleaccuracy_d(x);
2909 #else
2910 return gmx_simd_inv_singleaccuracy_d(gmx_simd_blendv_d(gmx_simd_set1_d(1.0), x, m));
2911 #endif
2914 /*! \brief 1/x for non-masked entries of SIMD double, single accuracy.
2916 * \copydetails gmx_simd_inv_notmaskfpe_f
2918 static gmx_inline gmx_simd_double_t
2919 gmx_simd_inv_notmaskfpe_singleaccuracy_d(gmx_simd_double_t x, gmx_simd_dbool_t gmx_unused m)
2921 #ifdef NDEBUG
2922 return gmx_simd_inv_singleaccuracy_d(x);
2923 #else
2924 return gmx_simd_inv_singleaccuracy_d(gmx_simd_blendv_d(x, gmx_simd_set1_d(1.0), m));
2925 #endif
2928 /*! \brief Calculate sqrt(x) (correct for 0.0) for SIMD double, single accuracy.
2930 * You should normally call the real-precision routine \ref gmx_simd_sqrt_r.
2932 * \param x Argument that must be >=0.
2933 * \return sqrt(x). If x=0, the result will correctly be set to 0.
2934 * The result is undefined if the input value is negative.
2936 static gmx_inline gmx_simd_double_t gmx_simdcall
2937 gmx_simd_sqrt_singleaccuracy_d(gmx_simd_double_t x)
2939 gmx_simd_dbool_t mask;
2940 gmx_simd_double_t res;
2942 mask = gmx_simd_cmpeq_d(x, gmx_simd_setzero_d());
2943 res = gmx_simd_blendnotzero_d(gmx_simd_invsqrt_notmaskfpe_singleaccuracy_d(x, mask), mask);
2944 return gmx_simd_mul_d(res, x);
2947 /*! \brief SIMD log(x). Double precision SIMD data, single accuracy.
2949 * You should normally call the real-precision routine
2950 * \ref gmx_simd_log_singleaccuracy_r.
2952 * \param x Argument, should be >0.
2953 * \result The natural logarithm of x. Undefined if argument is invalid.
2955 #ifndef gmx_simd_log_singleaccuracy_d
2956 static gmx_inline gmx_simd_double_t gmx_simdcall
2957 gmx_simd_log_singleaccuracy_d(gmx_simd_double_t x)
2959 const gmx_simd_double_t half = gmx_simd_set1_d(0.5);
2960 const gmx_simd_double_t one = gmx_simd_set1_d(1.0);
2961 const gmx_simd_double_t sqrt2 = gmx_simd_set1_d(sqrt(2.0));
2962 const gmx_simd_double_t corr = gmx_simd_set1_d(0.693147180559945286226764);
2963 const gmx_simd_double_t CL9 = gmx_simd_set1_d(0.2371599674224853515625);
2964 const gmx_simd_double_t CL7 = gmx_simd_set1_d(0.285279005765914916992188);
2965 const gmx_simd_double_t CL5 = gmx_simd_set1_d(0.400005519390106201171875);
2966 const gmx_simd_double_t CL3 = gmx_simd_set1_d(0.666666567325592041015625);
2967 const gmx_simd_double_t CL1 = gmx_simd_set1_d(2.0);
2968 gmx_simd_double_t fexp, x2, p;
2969 gmx_simd_dbool_t mask;
2971 fexp = gmx_simd_get_exponent_d(x);
2972 x = gmx_simd_get_mantissa_d(x);
2974 mask = gmx_simd_cmplt_d(sqrt2, x);
2975 /* Adjust to non-IEEE format for x>sqrt(2): exponent += 1, mantissa *= 0.5 */
2976 fexp = gmx_simd_add_d(fexp, gmx_simd_blendzero_d(one, mask));
2977 x = gmx_simd_mul_d(x, gmx_simd_blendv_d(one, half, mask));
2979 x = gmx_simd_mul_d( gmx_simd_sub_d(x, one), gmx_simd_inv_singleaccuracy_d( gmx_simd_add_d(x, one) ) );
2980 x2 = gmx_simd_mul_d(x, x);
2982 p = gmx_simd_fmadd_d(CL9, x2, CL7);
2983 p = gmx_simd_fmadd_d(p, x2, CL5);
2984 p = gmx_simd_fmadd_d(p, x2, CL3);
2985 p = gmx_simd_fmadd_d(p, x2, CL1);
2986 p = gmx_simd_fmadd_d(p, x, gmx_simd_mul_d(corr, fexp));
2988 return p;
2990 #endif
2992 #ifndef gmx_simd_exp2_singleaccuracy_d
2993 /*! \brief SIMD 2^x. Double precision SIMD data, single accuracy.
2995 * You should normally call the real-precision routine
2996 * \ref gmx_simd_exp2_singleaccuracy_r.
2998 * \param x Argument.
2999 * \result 2^x. Undefined if input argument caused overflow.
3001 static gmx_inline gmx_simd_double_t gmx_simdcall
3002 gmx_simd_exp2_singleaccuracy_d(gmx_simd_double_t x)
3004 /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
3005 const gmx_simd_double_t arglimit = gmx_simd_set1_d(126.0);
3006 const gmx_simd_double_t CC6 = gmx_simd_set1_d(0.0001534581200287996416911311);
3007 const gmx_simd_double_t CC5 = gmx_simd_set1_d(0.001339993121934088894618990);
3008 const gmx_simd_double_t CC4 = gmx_simd_set1_d(0.009618488957115180159497841);
3009 const gmx_simd_double_t CC3 = gmx_simd_set1_d(0.05550328776964726865751735);
3010 const gmx_simd_double_t CC2 = gmx_simd_set1_d(0.2402264689063408646490722);
3011 const gmx_simd_double_t CC1 = gmx_simd_set1_d(0.6931472057372680777553816);
3012 const gmx_simd_double_t one = gmx_simd_set1_d(1.0);
3014 gmx_simd_double_t fexppart;
3015 gmx_simd_double_t intpart;
3016 gmx_simd_double_t p;
3017 gmx_simd_dbool_t valuemask;
3019 fexppart = gmx_simd_set_exponent_d(x);
3020 intpart = gmx_simd_round_d(x);
3021 valuemask = gmx_simd_cmple_d(gmx_simd_fabs_d(x), arglimit);
3022 fexppart = gmx_simd_blendzero_d(fexppart, valuemask);
3023 x = gmx_simd_sub_d(x, intpart);
3025 p = gmx_simd_fmadd_d(CC6, x, CC5);
3026 p = gmx_simd_fmadd_d(p, x, CC4);
3027 p = gmx_simd_fmadd_d(p, x, CC3);
3028 p = gmx_simd_fmadd_d(p, x, CC2);
3029 p = gmx_simd_fmadd_d(p, x, CC1);
3030 p = gmx_simd_fmadd_d(p, x, one);
3031 x = gmx_simd_mul_d(p, fexppart);
3032 return x;
3034 #endif
3036 #ifndef gmx_simd_exp_singleaccuracy_d
3037 /*! \brief SIMD exp(x). Double precision SIMD data, single accuracy.
3039 * You should normally call the real-precision routine
3040 * \ref gmx_simd_exp_singleaccuracy_r.
3042 * \param x Argument.
3043 * \result exp(x). Undefined if input argument caused overflow.
3045 static gmx_inline gmx_simd_double_t gmx_simdcall
3046 gmx_simd_exp_singleaccuracy_d(gmx_simd_double_t x)
3048 const gmx_simd_double_t argscale = gmx_simd_set1_d(1.44269504088896341);
3049 /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
3050 const gmx_simd_double_t arglimit = gmx_simd_set1_d(126.0);
3051 const gmx_simd_double_t invargscale = gmx_simd_set1_d(0.69314718055994528623);
3052 const gmx_simd_double_t CC4 = gmx_simd_set1_d(0.00136324646882712841033936);
3053 const gmx_simd_double_t CC3 = gmx_simd_set1_d(0.00836596917361021041870117);
3054 const gmx_simd_double_t CC2 = gmx_simd_set1_d(0.0416710823774337768554688);
3055 const gmx_simd_double_t CC1 = gmx_simd_set1_d(0.166665524244308471679688);
3056 const gmx_simd_double_t CC0 = gmx_simd_set1_d(0.499999850988388061523438);
3057 const gmx_simd_double_t one = gmx_simd_set1_d(1.0);
3058 gmx_simd_double_t fexppart;
3059 gmx_simd_double_t intpart;
3060 gmx_simd_double_t y, p;
3061 gmx_simd_dbool_t valuemask;
3063 y = gmx_simd_mul_d(x, argscale);
3064 fexppart = gmx_simd_set_exponent_d(y); /* rounds to nearest int internally */
3065 intpart = gmx_simd_round_d(y); /* use same rounding algorithm here */
3066 valuemask = gmx_simd_cmple_d(gmx_simd_fabs_d(y), arglimit);
3067 fexppart = gmx_simd_blendzero_d(fexppart, valuemask);
3069 /* Extended precision arithmetics not needed since
3070 * we have double precision and only need single accuracy.
3072 x = gmx_simd_fnmadd_d(invargscale, intpart, x);
3074 p = gmx_simd_fmadd_d(CC4, x, CC3);
3075 p = gmx_simd_fmadd_d(p, x, CC2);
3076 p = gmx_simd_fmadd_d(p, x, CC1);
3077 p = gmx_simd_fmadd_d(p, x, CC0);
3078 p = gmx_simd_fmadd_d(gmx_simd_mul_d(x, x), p, x);
3079 p = gmx_simd_add_d(p, one);
3080 x = gmx_simd_mul_d(p, fexppart);
3081 return x;
3083 #endif
3085 /*! \brief SIMD erf(x). Double precision SIMD data, single accuracy.
3087 * You should normally call the real-precision routine
3088 * \ref gmx_simd_erf_singleaccuracy_r.
3090 * \param x The value to calculate erf(x) for.
3091 * \result erf(x)
3093 * This routine achieves very close to single precision, but we do not care about
3094 * the last bit or the subnormal result range.
3096 static gmx_inline gmx_simd_double_t gmx_simdcall
3097 gmx_simd_erf_singleaccuracy_d(gmx_simd_double_t x)
3099 /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
3100 const gmx_simd_double_t CA6 = gmx_simd_set1_d(7.853861353153693e-5);
3101 const gmx_simd_double_t CA5 = gmx_simd_set1_d(-8.010193625184903e-4);
3102 const gmx_simd_double_t CA4 = gmx_simd_set1_d(5.188327685732524e-3);
3103 const gmx_simd_double_t CA3 = gmx_simd_set1_d(-2.685381193529856e-2);
3104 const gmx_simd_double_t CA2 = gmx_simd_set1_d(1.128358514861418e-1);
3105 const gmx_simd_double_t CA1 = gmx_simd_set1_d(-3.761262582423300e-1);
3106 const gmx_simd_double_t CA0 = gmx_simd_set1_d(1.128379165726710);
3107 /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
3108 const gmx_simd_double_t CB9 = gmx_simd_set1_d(-0.0018629930017603923);
3109 const gmx_simd_double_t CB8 = gmx_simd_set1_d(0.003909821287598495);
3110 const gmx_simd_double_t CB7 = gmx_simd_set1_d(-0.0052094582210355615);
3111 const gmx_simd_double_t CB6 = gmx_simd_set1_d(0.005685614362160572);
3112 const gmx_simd_double_t CB5 = gmx_simd_set1_d(-0.0025367682853477272);
3113 const gmx_simd_double_t CB4 = gmx_simd_set1_d(-0.010199799682318782);
3114 const gmx_simd_double_t CB3 = gmx_simd_set1_d(0.04369575504816542);
3115 const gmx_simd_double_t CB2 = gmx_simd_set1_d(-0.11884063474674492);
3116 const gmx_simd_double_t CB1 = gmx_simd_set1_d(0.2732120154030589);
3117 const gmx_simd_double_t CB0 = gmx_simd_set1_d(0.42758357702025784);
3118 /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
3119 const gmx_simd_double_t CC10 = gmx_simd_set1_d(-0.0445555913112064);
3120 const gmx_simd_double_t CC9 = gmx_simd_set1_d(0.21376355144663348);
3121 const gmx_simd_double_t CC8 = gmx_simd_set1_d(-0.3473187200259257);
3122 const gmx_simd_double_t CC7 = gmx_simd_set1_d(0.016690861551248114);
3123 const gmx_simd_double_t CC6 = gmx_simd_set1_d(0.7560973182491192);
3124 const gmx_simd_double_t CC5 = gmx_simd_set1_d(-1.2137903600145787);
3125 const gmx_simd_double_t CC4 = gmx_simd_set1_d(0.8411872321232948);
3126 const gmx_simd_double_t CC3 = gmx_simd_set1_d(-0.08670413896296343);
3127 const gmx_simd_double_t CC2 = gmx_simd_set1_d(-0.27124782687240334);
3128 const gmx_simd_double_t CC1 = gmx_simd_set1_d(-0.0007502488047806069);
3129 const gmx_simd_double_t CC0 = gmx_simd_set1_d(0.5642114853803148);
3130 const gmx_simd_double_t one = gmx_simd_set1_d(1.0);
3131 const gmx_simd_double_t two = gmx_simd_set1_d(2.0);
3133 gmx_simd_double_t x2, x4, y;
3134 gmx_simd_double_t t, t2, w, w2;
3135 gmx_simd_double_t pA0, pA1, pB0, pB1, pC0, pC1;
3136 gmx_simd_double_t expmx2;
3137 gmx_simd_double_t res_erf, res_erfc, res;
3138 gmx_simd_dbool_t mask, msk_erf;
3140 /* Calculate erf() */
3141 x2 = gmx_simd_mul_d(x, x);
3142 x4 = gmx_simd_mul_d(x2, x2);
3144 pA0 = gmx_simd_fmadd_d(CA6, x4, CA4);
3145 pA1 = gmx_simd_fmadd_d(CA5, x4, CA3);
3146 pA0 = gmx_simd_fmadd_d(pA0, x4, CA2);
3147 pA1 = gmx_simd_fmadd_d(pA1, x4, CA1);
3148 pA0 = gmx_simd_mul_d(pA0, x4);
3149 pA0 = gmx_simd_fmadd_d(pA1, x2, pA0);
3150 /* Constant term must come last for precision reasons */
3151 pA0 = gmx_simd_add_d(pA0, CA0);
3153 res_erf = gmx_simd_mul_d(x, pA0);
3155 /* Calculate erfc */
3156 y = gmx_simd_fabs_d(x);
3157 msk_erf = gmx_simd_cmplt_d(y, gmx_simd_set1_d(0.75));
3158 t = gmx_simd_inv_notmaskfpe_singleaccuracy_d(y, msk_erf);
3159 w = gmx_simd_sub_d(t, one);
3160 t2 = gmx_simd_mul_d(t, t);
3161 w2 = gmx_simd_mul_d(w, w);
3163 expmx2 = gmx_simd_exp_singleaccuracy_d( gmx_simd_fneg_d( gmx_simd_mul_d(y, y)));
3165 pB1 = gmx_simd_fmadd_d(CB9, w2, CB7);
3166 pB0 = gmx_simd_fmadd_d(CB8, w2, CB6);
3167 pB1 = gmx_simd_fmadd_d(pB1, w2, CB5);
3168 pB0 = gmx_simd_fmadd_d(pB0, w2, CB4);
3169 pB1 = gmx_simd_fmadd_d(pB1, w2, CB3);
3170 pB0 = gmx_simd_fmadd_d(pB0, w2, CB2);
3171 pB1 = gmx_simd_fmadd_d(pB1, w2, CB1);
3172 pB0 = gmx_simd_fmadd_d(pB0, w2, CB0);
3173 pB0 = gmx_simd_fmadd_d(pB1, w, pB0);
3175 pC0 = gmx_simd_fmadd_d(CC10, t2, CC8);
3176 pC1 = gmx_simd_fmadd_d(CC9, t2, CC7);
3177 pC0 = gmx_simd_fmadd_d(pC0, t2, CC6);
3178 pC1 = gmx_simd_fmadd_d(pC1, t2, CC5);
3179 pC0 = gmx_simd_fmadd_d(pC0, t2, CC4);
3180 pC1 = gmx_simd_fmadd_d(pC1, t2, CC3);
3181 pC0 = gmx_simd_fmadd_d(pC0, t2, CC2);
3182 pC1 = gmx_simd_fmadd_d(pC1, t2, CC1);
3184 pC0 = gmx_simd_fmadd_d(pC0, t2, CC0);
3185 pC0 = gmx_simd_fmadd_d(pC1, t, pC0);
3186 pC0 = gmx_simd_mul_d(pC0, t);
3188 /* SELECT pB0 or pC0 for erfc() */
3189 mask = gmx_simd_cmplt_d(two, y);
3190 res_erfc = gmx_simd_blendv_d(pB0, pC0, mask);
3191 res_erfc = gmx_simd_mul_d(res_erfc, expmx2);
3193 /* erfc(x<0) = 2-erfc(|x|) */
3194 mask = gmx_simd_cmplt_d(x, gmx_simd_setzero_d());
3195 res_erfc = gmx_simd_blendv_d(res_erfc, gmx_simd_sub_d(two, res_erfc), mask);
3197 /* Select erf() or erfc() */
3198 mask = gmx_simd_cmplt_d(y, gmx_simd_set1_d(0.75));
3199 res = gmx_simd_blendv_d(gmx_simd_sub_d(one, res_erfc), res_erf, mask);
3201 return res;
3204 /*! \brief SIMD erfc(x). Double precision SIMD data, single accuracy.
3206 * You should normally call the real-precision routine
3207 * \ref gmx_simd_erfc_singleaccuracy_r.
3209 * \param x The value to calculate erfc(x) for.
3210 * \result erfc(x)
3212 * This routine achieves singleprecision (bar the last bit) over most of the
3213 * input range, but for large arguments where the result is getting close
3214 * to the minimum representable numbers we accept slightly larger errors
3215 * (think results that are in the ballpark of 10^-30) since that is not
3216 * relevant for MD.
3218 static gmx_inline gmx_simd_double_t gmx_simdcall
3219 gmx_simd_erfc_singleaccuracy_d(gmx_simd_double_t x)
3221 /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
3222 const gmx_simd_double_t CA6 = gmx_simd_set1_d(7.853861353153693e-5);
3223 const gmx_simd_double_t CA5 = gmx_simd_set1_d(-8.010193625184903e-4);
3224 const gmx_simd_double_t CA4 = gmx_simd_set1_d(5.188327685732524e-3);
3225 const gmx_simd_double_t CA3 = gmx_simd_set1_d(-2.685381193529856e-2);
3226 const gmx_simd_double_t CA2 = gmx_simd_set1_d(1.128358514861418e-1);
3227 const gmx_simd_double_t CA1 = gmx_simd_set1_d(-3.761262582423300e-1);
3228 const gmx_simd_double_t CA0 = gmx_simd_set1_d(1.128379165726710);
3229 /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
3230 const gmx_simd_double_t CB9 = gmx_simd_set1_d(-0.0018629930017603923);
3231 const gmx_simd_double_t CB8 = gmx_simd_set1_d(0.003909821287598495);
3232 const gmx_simd_double_t CB7 = gmx_simd_set1_d(-0.0052094582210355615);
3233 const gmx_simd_double_t CB6 = gmx_simd_set1_d(0.005685614362160572);
3234 const gmx_simd_double_t CB5 = gmx_simd_set1_d(-0.0025367682853477272);
3235 const gmx_simd_double_t CB4 = gmx_simd_set1_d(-0.010199799682318782);
3236 const gmx_simd_double_t CB3 = gmx_simd_set1_d(0.04369575504816542);
3237 const gmx_simd_double_t CB2 = gmx_simd_set1_d(-0.11884063474674492);
3238 const gmx_simd_double_t CB1 = gmx_simd_set1_d(0.2732120154030589);
3239 const gmx_simd_double_t CB0 = gmx_simd_set1_d(0.42758357702025784);
3240 /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
3241 const gmx_simd_double_t CC10 = gmx_simd_set1_d(-0.0445555913112064);
3242 const gmx_simd_double_t CC9 = gmx_simd_set1_d(0.21376355144663348);
3243 const gmx_simd_double_t CC8 = gmx_simd_set1_d(-0.3473187200259257);
3244 const gmx_simd_double_t CC7 = gmx_simd_set1_d(0.016690861551248114);
3245 const gmx_simd_double_t CC6 = gmx_simd_set1_d(0.7560973182491192);
3246 const gmx_simd_double_t CC5 = gmx_simd_set1_d(-1.2137903600145787);
3247 const gmx_simd_double_t CC4 = gmx_simd_set1_d(0.8411872321232948);
3248 const gmx_simd_double_t CC3 = gmx_simd_set1_d(-0.08670413896296343);
3249 const gmx_simd_double_t CC2 = gmx_simd_set1_d(-0.27124782687240334);
3250 const gmx_simd_double_t CC1 = gmx_simd_set1_d(-0.0007502488047806069);
3251 const gmx_simd_double_t CC0 = gmx_simd_set1_d(0.5642114853803148);
3252 const gmx_simd_double_t one = gmx_simd_set1_d(1.0);
3253 const gmx_simd_double_t two = gmx_simd_set1_d(2.0);
3255 gmx_simd_double_t x2, x4, y;
3256 gmx_simd_double_t t, t2, w, w2;
3257 gmx_simd_double_t pA0, pA1, pB0, pB1, pC0, pC1;
3258 gmx_simd_double_t expmx2;
3259 gmx_simd_double_t res_erf, res_erfc, res;
3260 gmx_simd_dbool_t mask, msk_erf;
3262 /* Calculate erf() */
3263 x2 = gmx_simd_mul_d(x, x);
3264 x4 = gmx_simd_mul_d(x2, x2);
3266 pA0 = gmx_simd_fmadd_d(CA6, x4, CA4);
3267 pA1 = gmx_simd_fmadd_d(CA5, x4, CA3);
3268 pA0 = gmx_simd_fmadd_d(pA0, x4, CA2);
3269 pA1 = gmx_simd_fmadd_d(pA1, x4, CA1);
3270 pA1 = gmx_simd_mul_d(pA1, x2);
3271 pA0 = gmx_simd_fmadd_d(pA0, x4, pA1);
3272 /* Constant term must come last for precision reasons */
3273 pA0 = gmx_simd_add_d(pA0, CA0);
3275 res_erf = gmx_simd_mul_d(x, pA0);
3277 /* Calculate erfc */
3278 y = gmx_simd_fabs_d(x);
3279 msk_erf = gmx_simd_cmplt_d(y, gmx_simd_set1_d(0.75));
3280 t = gmx_simd_inv_notmaskfpe_singleaccuracy_d(y, msk_erf);
3281 w = gmx_simd_sub_d(t, one);
3282 t2 = gmx_simd_mul_d(t, t);
3283 w2 = gmx_simd_mul_d(w, w);
3285 expmx2 = gmx_simd_exp_singleaccuracy_d( gmx_simd_fneg_d( gmx_simd_mul_d(y, y) ) );
3287 pB1 = gmx_simd_fmadd_d(CB9, w2, CB7);
3288 pB0 = gmx_simd_fmadd_d(CB8, w2, CB6);
3289 pB1 = gmx_simd_fmadd_d(pB1, w2, CB5);
3290 pB0 = gmx_simd_fmadd_d(pB0, w2, CB4);
3291 pB1 = gmx_simd_fmadd_d(pB1, w2, CB3);
3292 pB0 = gmx_simd_fmadd_d(pB0, w2, CB2);
3293 pB1 = gmx_simd_fmadd_d(pB1, w2, CB1);
3294 pB0 = gmx_simd_fmadd_d(pB0, w2, CB0);
3295 pB0 = gmx_simd_fmadd_d(pB1, w, pB0);
3297 pC0 = gmx_simd_fmadd_d(CC10, t2, CC8);
3298 pC1 = gmx_simd_fmadd_d(CC9, t2, CC7);
3299 pC0 = gmx_simd_fmadd_d(pC0, t2, CC6);
3300 pC1 = gmx_simd_fmadd_d(pC1, t2, CC5);
3301 pC0 = gmx_simd_fmadd_d(pC0, t2, CC4);
3302 pC1 = gmx_simd_fmadd_d(pC1, t2, CC3);
3303 pC0 = gmx_simd_fmadd_d(pC0, t2, CC2);
3304 pC1 = gmx_simd_fmadd_d(pC1, t2, CC1);
3306 pC0 = gmx_simd_fmadd_d(pC0, t2, CC0);
3307 pC0 = gmx_simd_fmadd_d(pC1, t, pC0);
3308 pC0 = gmx_simd_mul_d(pC0, t);
3310 /* SELECT pB0 or pC0 for erfc() */
3311 mask = gmx_simd_cmplt_d(two, y);
3312 res_erfc = gmx_simd_blendv_d(pB0, pC0, mask);
3313 res_erfc = gmx_simd_mul_d(res_erfc, expmx2);
3315 /* erfc(x<0) = 2-erfc(|x|) */
3316 mask = gmx_simd_cmplt_d(x, gmx_simd_setzero_d());
3317 res_erfc = gmx_simd_blendv_d(res_erfc, gmx_simd_sub_d(two, res_erfc), mask);
3319 /* Select erf() or erfc() */
3320 mask = gmx_simd_cmplt_d(y, gmx_simd_set1_d(0.75));
3321 res = gmx_simd_blendv_d(res_erfc, gmx_simd_sub_d(one, res_erf), mask);
3323 return res;
3326 /*! \brief SIMD sin \& cos. Double precision SIMD data, single accuracy.
3328 * You should normally call the real-precision routine
3329 * \ref gmx_simd_sincos_singleaccuracy_r.
3331 * \param x The argument to evaluate sin/cos for
3332 * \param[out] sinval Sin(x)
3333 * \param[out] cosval Cos(x)
3336 static gmx_inline void gmx_simdcall
3337 gmx_simd_sincos_singleaccuracy_d(gmx_simd_double_t x, gmx_simd_double_t *sinval, gmx_simd_double_t *cosval)
3339 /* Constants to subtract Pi/4*x from y while minimizing precision loss */
3340 const gmx_simd_double_t argred0 = gmx_simd_set1_d(2*0.78539816290140151978);
3341 const gmx_simd_double_t argred1 = gmx_simd_set1_d(2*4.9604678871439933374e-10);
3342 const gmx_simd_double_t argred2 = gmx_simd_set1_d(2*1.1258708853173288931e-18);
3343 const gmx_simd_double_t two_over_pi = gmx_simd_set1_d(2.0/M_PI);
3344 const gmx_simd_double_t const_sin2 = gmx_simd_set1_d(-1.9515295891e-4);
3345 const gmx_simd_double_t const_sin1 = gmx_simd_set1_d( 8.3321608736e-3);
3346 const gmx_simd_double_t const_sin0 = gmx_simd_set1_d(-1.6666654611e-1);
3347 const gmx_simd_double_t const_cos2 = gmx_simd_set1_d( 2.443315711809948e-5);
3348 const gmx_simd_double_t const_cos1 = gmx_simd_set1_d(-1.388731625493765e-3);
3349 const gmx_simd_double_t const_cos0 = gmx_simd_set1_d( 4.166664568298827e-2);
3351 const gmx_simd_double_t half = gmx_simd_set1_d(0.5);
3352 const gmx_simd_double_t one = gmx_simd_set1_d(1.0);
3353 gmx_simd_double_t ssign, csign;
3354 gmx_simd_double_t x2, y, z, psin, pcos, sss, ccc;
3355 gmx_simd_dbool_t mask;
3356 #if (defined GMX_SIMD_HAVE_FINT32) && (defined GMX_SIMD_HAVE_FINT32_ARITHMETICS) && (defined GMX_SIMD_HAVE_LOGICAL)
3357 const gmx_simd_dint32_t ione = gmx_simd_set1_di(1);
3358 const gmx_simd_dint32_t itwo = gmx_simd_set1_di(2);
3359 gmx_simd_dint32_t iy;
3361 z = gmx_simd_mul_d(x, two_over_pi);
3362 iy = gmx_simd_cvt_d2i(z);
3363 y = gmx_simd_round_d(z);
3365 mask = gmx_simd_cvt_dib2db(gmx_simd_cmpeq_di(gmx_simd_and_di(iy, ione), gmx_simd_setzero_di()));
3366 ssign = gmx_simd_blendzero_d(gmx_simd_set1_d(-0.0), gmx_simd_cvt_dib2db(gmx_simd_cmpeq_di(gmx_simd_and_di(iy, itwo), itwo)));
3367 csign = gmx_simd_blendzero_d(gmx_simd_set1_d(-0.0), gmx_simd_cvt_dib2db(gmx_simd_cmpeq_di(gmx_simd_and_di(gmx_simd_add_di(iy, ione), itwo), itwo)));
3368 #else
3369 const gmx_simd_double_t quarter = gmx_simd_set1_d(0.25);
3370 const gmx_simd_double_t minusquarter = gmx_simd_set1_d(-0.25);
3371 gmx_simd_double_t q;
3372 gmx_simd_dbool_t m1, m2, m3;
3374 /* The most obvious way to find the arguments quadrant in the unit circle
3375 * to calculate the sign is to use integer arithmetic, but that is not
3376 * present in all SIMD implementations. As an alternative, we have devised a
3377 * pure floating-point algorithm that uses truncation for argument reduction
3378 * so that we get a new value 0<=q<1 over the unit circle, and then
3379 * do floating-point comparisons with fractions. This is likely to be
3380 * slightly slower (~10%) due to the longer latencies of floating-point, so
3381 * we only use it when integer SIMD arithmetic is not present.
3383 ssign = x;
3384 x = gmx_simd_fabs_d(x);
3385 /* It is critical that half-way cases are rounded down */
3386 z = gmx_simd_fmadd_d(x, two_over_pi, half);
3387 y = gmx_simd_trunc_d(z);
3388 q = gmx_simd_mul_d(z, quarter);
3389 q = gmx_simd_sub_d(q, gmx_simd_trunc_d(q));
3390 /* z now starts at 0.0 for x=-pi/4 (although neg. values cannot occur), and
3391 * then increased by 1.0 as x increases by 2*Pi, when it resets to 0.0.
3392 * This removes the 2*Pi periodicity without using any integer arithmetic.
3393 * First check if y had the value 2 or 3, set csign if true.
3395 q = gmx_simd_sub_d(q, half);
3396 /* If we have logical operations we can work directly on the signbit, which
3397 * saves instructions. Otherwise we need to represent signs as +1.0/-1.0.
3398 * Thus, if you are altering defines to debug alternative code paths, the
3399 * two GMX_SIMD_HAVE_LOGICAL sections in this routine must either both be
3400 * active or inactive - you will get errors if only one is used.
3402 # ifdef GMX_SIMD_HAVE_LOGICAL
3403 ssign = gmx_simd_and_d(ssign, gmx_simd_set1_d(-0.0));
3404 csign = gmx_simd_andnot_d(q, gmx_simd_set1_d(-0.0));
3405 ssign = gmx_simd_xor_d(ssign, csign);
3406 # else
3407 csign = gmx_simd_xor_sign_d(gmx_simd_set1_d(-1.0), q);
3409 ssign = gmx_simd_xor_sign_d(ssign, csign); /* swap ssign if csign was set. */
3410 # endif
3411 /* Check if y had value 1 or 3 (remember we subtracted 0.5 from q) */
3412 m1 = gmx_simd_cmplt_d(q, minusquarter);
3413 m2 = gmx_simd_cmple_d(gmx_simd_setzero_d(), q);
3414 m3 = gmx_simd_cmplt_d(q, quarter);
3415 m2 = gmx_simd_and_db(m2, m3);
3416 mask = gmx_simd_or_db(m1, m2);
3417 /* where mask is FALSE, set sign. */
3418 csign = gmx_simd_xor_sign_d(csign, gmx_simd_blendv_d(gmx_simd_set1_d(-1.0), one, mask));
3419 #endif
3420 x = gmx_simd_fnmadd_d(y, argred0, x);
3421 x = gmx_simd_fnmadd_d(y, argred1, x);
3422 x = gmx_simd_fnmadd_d(y, argred2, x);
3423 x2 = gmx_simd_mul_d(x, x);
3425 psin = gmx_simd_fmadd_d(const_sin2, x2, const_sin1);
3426 psin = gmx_simd_fmadd_d(psin, x2, const_sin0);
3427 psin = gmx_simd_fmadd_d(psin, gmx_simd_mul_d(x, x2), x);
3428 pcos = gmx_simd_fmadd_d(const_cos2, x2, const_cos1);
3429 pcos = gmx_simd_fmadd_d(pcos, x2, const_cos0);
3430 pcos = gmx_simd_fmsub_d(pcos, x2, half);
3431 pcos = gmx_simd_fmadd_d(pcos, x2, one);
3433 sss = gmx_simd_blendv_d(pcos, psin, mask);
3434 ccc = gmx_simd_blendv_d(psin, pcos, mask);
3435 /* See comment for GMX_SIMD_HAVE_LOGICAL section above. */
3436 #ifdef GMX_SIMD_HAVE_LOGICAL
3437 *sinval = gmx_simd_xor_d(sss, ssign);
3438 *cosval = gmx_simd_xor_d(ccc, csign);
3439 #else
3440 *sinval = gmx_simd_xor_sign_d(sss, ssign);
3441 *cosval = gmx_simd_xor_sign_d(ccc, csign);
3442 #endif
3445 /*! \brief SIMD sin(x). Double precision SIMD data, single accuracy.
3447 * You should normally call the real-precision routine
3448 * \ref gmx_simd_sin_singleaccuracy_r.
3450 * \param x The argument to evaluate sin for
3451 * \result Sin(x)
3453 * \attention Do NOT call both sin & cos if you need both results, since each of them
3454 * will then call \ref gmx_simd_sincos_r and waste a factor 2 in performance.
3456 static gmx_inline gmx_simd_double_t gmx_simdcall
3457 gmx_simd_sin_singleaccuracy_d(gmx_simd_double_t x)
3459 gmx_simd_double_t s, c;
3460 gmx_simd_sincos_singleaccuracy_d(x, &s, &c);
3461 return s;
3464 /*! \brief SIMD cos(x). Double precision SIMD data, single accuracy.
3466 * You should normally call the real-precision routine
3467 * \ref gmx_simd_cos_singleaccuracy_r.
3469 * \param x The argument to evaluate cos for
3470 * \result Cos(x)
3472 * \attention Do NOT call both sin & cos if you need both results, since each of them
3473 * will then call \ref gmx_simd_sincos_r and waste a factor 2 in performance.
3475 static gmx_inline gmx_simd_double_t gmx_simdcall
3476 gmx_simd_cos_singleaccuracy_d(gmx_simd_double_t x)
3478 gmx_simd_double_t s, c;
3479 gmx_simd_sincos_singleaccuracy_d(x, &s, &c);
3480 return c;
3483 /*! \brief SIMD tan(x). Double precision SIMD data, single accuracy.
3485 * You should normally call the real-precision routine
3486 * \ref gmx_simd_tan_singleaccuracy_r.
3488 * \param x The argument to evaluate tan for
3489 * \result Tan(x)
3491 static gmx_inline gmx_simd_double_t gmx_simdcall
3492 gmx_simd_tan_singleaccuracy_d(gmx_simd_double_t x)
3494 const gmx_simd_double_t argred0 = gmx_simd_set1_d(2*0.78539816290140151978);
3495 const gmx_simd_double_t argred1 = gmx_simd_set1_d(2*4.9604678871439933374e-10);
3496 const gmx_simd_double_t argred2 = gmx_simd_set1_d(2*1.1258708853173288931e-18);
3497 const gmx_simd_double_t two_over_pi = gmx_simd_set1_d(2.0/M_PI);
3498 const gmx_simd_double_t CT6 = gmx_simd_set1_d(0.009498288995810566122993911);
3499 const gmx_simd_double_t CT5 = gmx_simd_set1_d(0.002895755790837379295226923);
3500 const gmx_simd_double_t CT4 = gmx_simd_set1_d(0.02460087336161924491836265);
3501 const gmx_simd_double_t CT3 = gmx_simd_set1_d(0.05334912882656359828045988);
3502 const gmx_simd_double_t CT2 = gmx_simd_set1_d(0.1333989091464957704418495);
3503 const gmx_simd_double_t CT1 = gmx_simd_set1_d(0.3333307599244198227797507);
3505 gmx_simd_double_t x2, p, y, z;
3506 gmx_simd_dbool_t mask;
3508 #if (defined GMX_SIMD_HAVE_FINT32) && (defined GMX_SIMD_HAVE_FINT32_ARITHMETICS) && (defined GMX_SIMD_HAVE_LOGICAL)
3509 gmx_simd_dint32_t iy;
3510 gmx_simd_dint32_t ione = gmx_simd_set1_di(1);
3512 z = gmx_simd_mul_d(x, two_over_pi);
3513 iy = gmx_simd_cvt_d2i(z);
3514 y = gmx_simd_round_d(z);
3515 mask = gmx_simd_cvt_dib2db(gmx_simd_cmpeq_di(gmx_simd_and_di(iy, ione), ione));
3517 x = gmx_simd_fnmadd_d(y, argred0, x);
3518 x = gmx_simd_fnmadd_d(y, argred1, x);
3519 x = gmx_simd_fnmadd_d(y, argred2, x);
3520 x = gmx_simd_xor_d(gmx_simd_blendzero_d(gmx_simd_set1_d(-0.0), mask), x);
3521 #else
3522 const gmx_simd_double_t quarter = gmx_simd_set1_d(0.25);
3523 const gmx_simd_double_t half = gmx_simd_set1_d(0.5);
3524 const gmx_simd_double_t threequarter = gmx_simd_set1_d(0.75);
3525 gmx_simd_double_t w, q;
3526 gmx_simd_dbool_t m1, m2, m3;
3528 w = gmx_simd_fabs_d(x);
3529 z = gmx_simd_fmadd_d(w, two_over_pi, half);
3530 y = gmx_simd_trunc_d(z);
3531 q = gmx_simd_mul_d(z, quarter);
3532 q = gmx_simd_sub_d(q, gmx_simd_trunc_d(q));
3533 m1 = gmx_simd_cmple_d(quarter, q);
3534 m2 = gmx_simd_cmplt_d(q, half);
3535 m3 = gmx_simd_cmple_d(threequarter, q);
3536 m1 = gmx_simd_and_db(m1, m2);
3537 mask = gmx_simd_or_db(m1, m3);
3538 w = gmx_simd_fnmadd_d(y, argred0, w);
3539 w = gmx_simd_fnmadd_d(y, argred1, w);
3540 w = gmx_simd_fnmadd_d(y, argred2, w);
3542 w = gmx_simd_blendv_d(w, gmx_simd_fneg_d(w), mask);
3543 x = gmx_simd_xor_sign_d(w, x);
3544 #endif
3545 x2 = gmx_simd_mul_d(x, x);
3546 p = gmx_simd_fmadd_d(CT6, x2, CT5);
3547 p = gmx_simd_fmadd_d(p, x2, CT4);
3548 p = gmx_simd_fmadd_d(p, x2, CT3);
3549 p = gmx_simd_fmadd_d(p, x2, CT2);
3550 p = gmx_simd_fmadd_d(p, x2, CT1);
3551 p = gmx_simd_fmadd_d(x2, gmx_simd_mul_d(p, x), x);
3553 p = gmx_simd_blendv_d( p, gmx_simd_inv_maskfpe_singleaccuracy_d(p, mask), mask);
3554 return p;
3557 /*! \brief SIMD asin(x). Double precision SIMD data, single accuracy.
3559 * You should normally call the real-precision routine
3560 * \ref gmx_simd_asin_singleaccuracy_r.
3562 * \param x The argument to evaluate asin for
3563 * \result Asin(x)
3565 static gmx_inline gmx_simd_double_t gmx_simdcall
3566 gmx_simd_asin_singleaccuracy_d(gmx_simd_double_t x)
3568 const gmx_simd_double_t limitlow = gmx_simd_set1_d(1e-4);
3569 const gmx_simd_double_t half = gmx_simd_set1_d(0.5);
3570 const gmx_simd_double_t one = gmx_simd_set1_d(1.0);
3571 const gmx_simd_double_t halfpi = gmx_simd_set1_d(M_PI/2.0);
3572 const gmx_simd_double_t CC5 = gmx_simd_set1_d(4.2163199048E-2);
3573 const gmx_simd_double_t CC4 = gmx_simd_set1_d(2.4181311049E-2);
3574 const gmx_simd_double_t CC3 = gmx_simd_set1_d(4.5470025998E-2);
3575 const gmx_simd_double_t CC2 = gmx_simd_set1_d(7.4953002686E-2);
3576 const gmx_simd_double_t CC1 = gmx_simd_set1_d(1.6666752422E-1);
3577 gmx_simd_double_t xabs;
3578 gmx_simd_double_t z, z1, z2, q, q1, q2;
3579 gmx_simd_double_t pA, pB;
3580 gmx_simd_dbool_t mask, mask2;
3582 xabs = gmx_simd_fabs_d(x);
3583 mask = gmx_simd_cmplt_d(half, xabs);
3584 z1 = gmx_simd_mul_d(half, gmx_simd_sub_d(one, xabs));
3585 mask2 = gmx_simd_cmpeq_d(xabs, one);
3586 q1 = gmx_simd_mul_d(z1, gmx_simd_invsqrt_notmaskfpe_singleaccuracy_d(z1, mask2));
3587 q1 = gmx_simd_blendnotzero_d(q1, gmx_simd_cmpeq_d(xabs, one));
3588 q2 = xabs;
3589 z2 = gmx_simd_mul_d(q2, q2);
3590 z = gmx_simd_blendv_d(z2, z1, mask);
3591 q = gmx_simd_blendv_d(q2, q1, mask);
3593 z2 = gmx_simd_mul_d(z, z);
3594 pA = gmx_simd_fmadd_d(CC5, z2, CC3);
3595 pB = gmx_simd_fmadd_d(CC4, z2, CC2);
3596 pA = gmx_simd_fmadd_d(pA, z2, CC1);
3597 pA = gmx_simd_mul_d(pA, z);
3598 z = gmx_simd_fmadd_d(pB, z2, pA);
3599 z = gmx_simd_fmadd_d(z, q, q);
3600 q2 = gmx_simd_sub_d(halfpi, z);
3601 q2 = gmx_simd_sub_d(q2, z);
3602 z = gmx_simd_blendv_d(z, q2, mask);
3604 mask = gmx_simd_cmplt_d(limitlow, xabs);
3605 z = gmx_simd_blendv_d( xabs, z, mask );
3606 z = gmx_simd_xor_sign_d(z, x);
3608 return z;
3611 /*! \brief SIMD acos(x). Double precision SIMD data, single accuracy.
3613 * You should normally call the real-precision routine
3614 * \ref gmx_simd_acos_singleaccuracy_r.
3616 * \param x The argument to evaluate acos for
3617 * \result Acos(x)
3619 static gmx_inline gmx_simd_double_t gmx_simdcall
3620 gmx_simd_acos_singleaccuracy_d(gmx_simd_double_t x)
3622 const gmx_simd_double_t one = gmx_simd_set1_d(1.0);
3623 const gmx_simd_double_t half = gmx_simd_set1_d(0.5);
3624 const gmx_simd_double_t pi = gmx_simd_set1_d(M_PI);
3625 const gmx_simd_double_t halfpi = gmx_simd_set1_d(M_PI/2.0);
3626 gmx_simd_double_t xabs;
3627 gmx_simd_double_t z, z1, z2, z3;
3628 gmx_simd_dbool_t mask1, mask2, mask3;
3630 xabs = gmx_simd_fabs_d(x);
3631 mask1 = gmx_simd_cmplt_d(half, xabs);
3632 mask2 = gmx_simd_cmplt_d(gmx_simd_setzero_d(), x);
3634 z = gmx_simd_mul_d(half, gmx_simd_sub_d(one, xabs));
3635 mask3 = gmx_simd_cmpeq_d(xabs, one);
3636 z = gmx_simd_mul_d(z, gmx_simd_invsqrt_notmaskfpe_singleaccuracy_d(z, mask3));
3637 z = gmx_simd_blendnotzero_d(z, gmx_simd_cmpeq_d(xabs, one));
3638 z = gmx_simd_blendv_d(x, z, mask1);
3639 z = gmx_simd_asin_singleaccuracy_d(z);
3641 z2 = gmx_simd_add_d(z, z);
3642 z1 = gmx_simd_sub_d(pi, z2);
3643 z3 = gmx_simd_sub_d(halfpi, z);
3644 z = gmx_simd_blendv_d(z1, z2, mask2);
3645 z = gmx_simd_blendv_d(z3, z, mask1);
3647 return z;
3650 /*! \brief SIMD asin(x). Double precision SIMD data, single accuracy.
3652 * You should normally call the real-precision routine
3653 * \ref gmx_simd_atan_singleaccuracy_r.
3655 * \param x The argument to evaluate atan for
3656 * \result Atan(x), same argument/value range as standard math library.
3658 static gmx_inline gmx_simd_double_t gmx_simdcall
3659 gmx_simd_atan_singleaccuracy_d(gmx_simd_double_t x)
3661 const gmx_simd_double_t halfpi = gmx_simd_set1_d(M_PI/2);
3662 const gmx_simd_double_t CA17 = gmx_simd_set1_d(0.002823638962581753730774);
3663 const gmx_simd_double_t CA15 = gmx_simd_set1_d(-0.01595690287649631500244);
3664 const gmx_simd_double_t CA13 = gmx_simd_set1_d(0.04250498861074447631836);
3665 const gmx_simd_double_t CA11 = gmx_simd_set1_d(-0.07489009201526641845703);
3666 const gmx_simd_double_t CA9 = gmx_simd_set1_d(0.1063479334115982055664);
3667 const gmx_simd_double_t CA7 = gmx_simd_set1_d(-0.1420273631811141967773);
3668 const gmx_simd_double_t CA5 = gmx_simd_set1_d(0.1999269574880599975585);
3669 const gmx_simd_double_t CA3 = gmx_simd_set1_d(-0.3333310186862945556640);
3670 gmx_simd_double_t x2, x3, x4, pA, pB;
3671 gmx_simd_dbool_t mask, mask2;
3673 mask = gmx_simd_cmplt_d(x, gmx_simd_setzero_d());
3674 x = gmx_simd_fabs_d(x);
3675 mask2 = gmx_simd_cmplt_d(gmx_simd_set1_d(1.0), x);
3676 x = gmx_simd_blendv_d(x, gmx_simd_inv_maskfpe_singleaccuracy_d(x, mask2), mask2);
3678 x2 = gmx_simd_mul_d(x, x);
3679 x3 = gmx_simd_mul_d(x2, x);
3680 x4 = gmx_simd_mul_d(x2, x2);
3681 pA = gmx_simd_fmadd_d(CA17, x4, CA13);
3682 pB = gmx_simd_fmadd_d(CA15, x4, CA11);
3683 pA = gmx_simd_fmadd_d(pA, x4, CA9);
3684 pB = gmx_simd_fmadd_d(pB, x4, CA7);
3685 pA = gmx_simd_fmadd_d(pA, x4, CA5);
3686 pB = gmx_simd_fmadd_d(pB, x4, CA3);
3687 pA = gmx_simd_fmadd_d(pA, x2, pB);
3688 pA = gmx_simd_fmadd_d(pA, x3, x);
3690 pA = gmx_simd_blendv_d(pA, gmx_simd_sub_d(halfpi, pA), mask2);
3691 pA = gmx_simd_blendv_d(pA, gmx_simd_fneg_d(pA), mask);
3693 return pA;
3696 /*! \brief SIMD atan2(y,x). Double precision SIMD data, single accuracy.
3698 * You should normally call the real-precision routine
3699 * \ref gmx_simd_atan2_singleaccuracy_r.
3701 * \param y Y component of vector, any quartile
3702 * \param x X component of vector, any quartile
3703 * \result Atan(y,x), same argument/value range as standard math library.
3705 * \note This routine should provide correct results for all finite
3706 * non-zero or positive-zero arguments. However, negative zero arguments will
3707 * be treated as positive zero, which means the return value will deviate from
3708 * the standard math library atan2(y,x) for those cases. That should not be
3709 * of any concern in Gromacs, and in particular it will not affect calculations
3710 * of angles from vectors.
3712 static gmx_inline gmx_simd_double_t gmx_simdcall
3713 gmx_simd_atan2_singleaccuracy_d(gmx_simd_double_t y, gmx_simd_double_t x)
3715 const gmx_simd_double_t pi = gmx_simd_set1_d(M_PI);
3716 const gmx_simd_double_t halfpi = gmx_simd_set1_d(M_PI/2.0);
3717 gmx_simd_double_t xinv, p, aoffset;
3718 gmx_simd_dbool_t mask_x0, mask_y0, mask_xlt0, mask_ylt0;
3720 mask_x0 = gmx_simd_cmpeq_d(x, gmx_simd_setzero_d());
3721 mask_y0 = gmx_simd_cmpeq_d(y, gmx_simd_setzero_d());
3722 mask_xlt0 = gmx_simd_cmplt_d(x, gmx_simd_setzero_d());
3723 mask_ylt0 = gmx_simd_cmplt_d(y, gmx_simd_setzero_d());
3725 aoffset = gmx_simd_blendzero_d(halfpi, mask_x0);
3726 aoffset = gmx_simd_blendnotzero_d(aoffset, mask_y0);
3728 aoffset = gmx_simd_blendv_d(aoffset, pi, mask_xlt0);
3729 aoffset = gmx_simd_blendv_d(aoffset, gmx_simd_fneg_d(aoffset), mask_ylt0);
3731 xinv = gmx_simd_blendnotzero_d(gmx_simd_inv_notmaskfpe_singleaccuracy_d(x, mask_x0), mask_x0);
3732 p = gmx_simd_mul_d(y, xinv);
3733 p = gmx_simd_atan_singleaccuracy_d(p);
3734 p = gmx_simd_add_d(p, aoffset);
3736 return p;
3739 /*! \brief Analytical PME force correction, double SIMD data, single accuracy.
3741 * You should normally call the real-precision routine
3742 * \ref gmx_simd_pmecorrF_singleaccuracy_r.
3744 * \param z2 \f$(r \beta)^2\f$ - see below for details.
3745 * \result Correction factor to coulomb force - see below for details.
3747 * This routine is meant to enable analytical evaluation of the
3748 * direct-space PME electrostatic force to avoid tables.
3750 * The direct-space potential should be \f$ \mbox{erfc}(\beta r)/r\f$, but there
3751 * are some problems evaluating that:
3753 * First, the error function is difficult (read: expensive) to
3754 * approxmiate accurately for intermediate to large arguments, and
3755 * this happens already in ranges of \f$(\beta r)\f$ that occur in simulations.
3756 * Second, we now try to avoid calculating potentials in Gromacs but
3757 * use forces directly.
3759 * We can simply things slight by noting that the PME part is really
3760 * a correction to the normal Coulomb force since \f$\mbox{erfc}(z)=1-\mbox{erf}(z)\f$, i.e.
3761 * \f[
3762 * V = \frac{1}{r} - \frac{\mbox{erf}(\beta r)}{r}
3763 * \f]
3764 * The first term we already have from the inverse square root, so
3765 * that we can leave out of this routine.
3767 * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
3768 * the argument \f$beta r\f$ will be in the range 0.15 to ~4. Use your
3769 * favorite plotting program to realize how well-behaved \f$\frac{\mbox{erf}(z)}{z}\f$ is
3770 * in this range!
3772 * We approximate \f$f(z)=\mbox{erf}(z)/z\f$ with a rational minimax polynomial.
3773 * However, it turns out it is more efficient to approximate \f$f(z)/z\f$ and
3774 * then only use even powers. This is another minor optimization, since
3775 * we actually \a want \f$f(z)/z\f$, because it is going to be multiplied by
3776 * the vector between the two atoms to get the vectorial force. The
3777 * fastest flops are the ones we can avoid calculating!
3779 * So, here's how it should be used:
3781 * 1. Calculate \f$r^2\f$.
3782 * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=(\beta r)^2\f$.
3783 * 3. Evaluate this routine with \f$z^2\f$ as the argument.
3784 * 4. The return value is the expression:
3786 * \f[
3787 * \frac{2 \exp{-z^2}}{\sqrt{\pi} z^2}-\frac{\mbox{erf}(z)}{z^3}
3788 * \f]
3790 * 5. Multiply the entire expression by \f$\beta^3\f$. This will get you
3792 * \f[
3793 * \frac{2 \beta^3 \exp(-z^2)}{\sqrt{\pi} z^2} - \frac{\beta^3 \mbox{erf}(z)}{z^3}
3794 * \f]
3796 * or, switching back to \f$r\f$ (since \f$z=r \beta\f$):
3798 * \f[
3799 * \frac{2 \beta \exp(-r^2 \beta^2)}{\sqrt{\pi} r^2} - \frac{\mbox{erf}(r \beta)}{r^3}
3800 * \f]
3802 * With a bit of math exercise you should be able to confirm that
3803 * this is exactly
3805 * \f[
3806 * \frac{\frac{d}{dr}\left( \frac{\mbox{erf}(\beta r)}{r} \right)}{r}
3807 * \f]
3809 * 6. Add the result to \f$r^{-3}\f$, multiply by the product of the charges,
3810 * and you have your force (divided by \f$r\f$). A final multiplication
3811 * with the vector connecting the two particles and you have your
3812 * vectorial force to add to the particles.
3814 * This approximation achieves an accuracy slightly lower than 1e-6; when
3815 * added to \f$1/r\f$ the error will be insignificant.
3818 static gmx_simd_double_t gmx_simdcall
3819 gmx_simd_pmecorrF_singleaccuracy_d(gmx_simd_double_t z2)
3821 const gmx_simd_double_t FN6 = gmx_simd_set1_d(-1.7357322914161492954e-8);
3822 const gmx_simd_double_t FN5 = gmx_simd_set1_d(1.4703624142580877519e-6);
3823 const gmx_simd_double_t FN4 = gmx_simd_set1_d(-0.000053401640219807709149);
3824 const gmx_simd_double_t FN3 = gmx_simd_set1_d(0.0010054721316683106153);
3825 const gmx_simd_double_t FN2 = gmx_simd_set1_d(-0.019278317264888380590);
3826 const gmx_simd_double_t FN1 = gmx_simd_set1_d(0.069670166153766424023);
3827 const gmx_simd_double_t FN0 = gmx_simd_set1_d(-0.75225204789749321333);
3829 const gmx_simd_double_t FD4 = gmx_simd_set1_d(0.0011193462567257629232);
3830 const gmx_simd_double_t FD3 = gmx_simd_set1_d(0.014866955030185295499);
3831 const gmx_simd_double_t FD2 = gmx_simd_set1_d(0.11583842382862377919);
3832 const gmx_simd_double_t FD1 = gmx_simd_set1_d(0.50736591960530292870);
3833 const gmx_simd_double_t FD0 = gmx_simd_set1_d(1.0);
3835 gmx_simd_double_t z4;
3836 gmx_simd_double_t polyFN0, polyFN1, polyFD0, polyFD1;
3838 z4 = gmx_simd_mul_d(z2, z2);
3840 polyFD0 = gmx_simd_fmadd_d(FD4, z4, FD2);
3841 polyFD1 = gmx_simd_fmadd_d(FD3, z4, FD1);
3842 polyFD0 = gmx_simd_fmadd_d(polyFD0, z4, FD0);
3843 polyFD0 = gmx_simd_fmadd_d(polyFD1, z2, polyFD0);
3845 polyFD0 = gmx_simd_inv_singleaccuracy_d(polyFD0);
3847 polyFN0 = gmx_simd_fmadd_d(FN6, z4, FN4);
3848 polyFN1 = gmx_simd_fmadd_d(FN5, z4, FN3);
3849 polyFN0 = gmx_simd_fmadd_d(polyFN0, z4, FN2);
3850 polyFN1 = gmx_simd_fmadd_d(polyFN1, z4, FN1);
3851 polyFN0 = gmx_simd_fmadd_d(polyFN0, z4, FN0);
3852 polyFN0 = gmx_simd_fmadd_d(polyFN1, z2, polyFN0);
3854 return gmx_simd_mul_d(polyFN0, polyFD0);
3859 /*! \brief Analytical PME potential correction, double SIMD data, single accuracy.
3861 * You should normally call the real-precision routine
3862 * \ref gmx_simd_pmecorrV_singleaccuracy_r.
3864 * \param z2 \f$(r \beta)^2\f$ - see below for details.
3865 * \result Correction factor to coulomb potential - see below for details.
3867 * See \ref gmx_simd_pmecorrF_f for details about the approximation.
3869 * This routine calculates \f$\mbox{erf}(z)/z\f$, although you should provide \f$z^2\f$
3870 * as the input argument.
3872 * Here's how it should be used:
3874 * 1. Calculate \f$r^2\f$.
3875 * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=\beta^2*r^2\f$.
3876 * 3. Evaluate this routine with z^2 as the argument.
3877 * 4. The return value is the expression:
3879 * \f[
3880 * \frac{\mbox{erf}(z)}{z}
3881 * \f]
3883 * 5. Multiply the entire expression by beta and switching back to \f$r\f$ (since \f$z=r \beta\f$):
3885 * \f[
3886 * \frac{\mbox{erf}(r \beta)}{r}
3887 * \f]
3889 * 6. Subtract the result from \f$1/r\f$, multiply by the product of the charges,
3890 * and you have your potential.
3892 * This approximation achieves an accuracy slightly lower than 1e-6; when
3893 * added to \f$1/r\f$ the error will be insignificant.
3895 static gmx_simd_double_t gmx_simdcall
3896 gmx_simd_pmecorrV_singleaccuracy_d(gmx_simd_double_t z2)
3898 const gmx_simd_double_t VN6 = gmx_simd_set1_d(1.9296833005951166339e-8);
3899 const gmx_simd_double_t VN5 = gmx_simd_set1_d(-1.4213390571557850962e-6);
3900 const gmx_simd_double_t VN4 = gmx_simd_set1_d(0.000041603292906656984871);
3901 const gmx_simd_double_t VN3 = gmx_simd_set1_d(-0.00013134036773265025626);
3902 const gmx_simd_double_t VN2 = gmx_simd_set1_d(0.038657983986041781264);
3903 const gmx_simd_double_t VN1 = gmx_simd_set1_d(0.11285044772717598220);
3904 const gmx_simd_double_t VN0 = gmx_simd_set1_d(1.1283802385263030286);
3906 const gmx_simd_double_t VD3 = gmx_simd_set1_d(0.0066752224023576045451);
3907 const gmx_simd_double_t VD2 = gmx_simd_set1_d(0.078647795836373922256);
3908 const gmx_simd_double_t VD1 = gmx_simd_set1_d(0.43336185284710920150);
3909 const gmx_simd_double_t VD0 = gmx_simd_set1_d(1.0);
3911 gmx_simd_double_t z4;
3912 gmx_simd_double_t polyVN0, polyVN1, polyVD0, polyVD1;
3914 z4 = gmx_simd_mul_d(z2, z2);
3916 polyVD1 = gmx_simd_fmadd_d(VD3, z4, VD1);
3917 polyVD0 = gmx_simd_fmadd_d(VD2, z4, VD0);
3918 polyVD0 = gmx_simd_fmadd_d(polyVD1, z2, polyVD0);
3920 polyVD0 = gmx_simd_inv_singleaccuracy_d(polyVD0);
3922 polyVN0 = gmx_simd_fmadd_d(VN6, z4, VN4);
3923 polyVN1 = gmx_simd_fmadd_d(VN5, z4, VN3);
3924 polyVN0 = gmx_simd_fmadd_d(polyVN0, z4, VN2);
3925 polyVN1 = gmx_simd_fmadd_d(polyVN1, z4, VN1);
3926 polyVN0 = gmx_simd_fmadd_d(polyVN0, z4, VN0);
3927 polyVN0 = gmx_simd_fmadd_d(polyVN1, z2, polyVN0);
3929 return gmx_simd_mul_d(polyVN0, polyVD0);
3932 #endif
3935 /*! \name SIMD4 math functions
3937 * \note Only a subset of the math functions are implemented for SIMD4.
3938 * \{
3942 #ifdef GMX_SIMD4_HAVE_FLOAT
3944 /*************************************************************************
3945 * SINGLE PRECISION SIMD4 MATH FUNCTIONS - JUST A SMALL SUBSET SUPPORTED *
3946 *************************************************************************/
3948 /*! \brief SIMD4 utility function to sum a+b+c+d for SIMD4 floats.
3950 * \copydetails gmx_simd_sum4_f
3952 static gmx_inline gmx_simd4_float_t gmx_simdcall
3953 gmx_simd4_sum4_f(gmx_simd4_float_t a, gmx_simd4_float_t b,
3954 gmx_simd4_float_t c, gmx_simd4_float_t d)
3956 return gmx_simd4_add_f(gmx_simd4_add_f(a, b), gmx_simd4_add_f(c, d));
3959 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD4 float.
3961 * \copydetails gmx_simd_rsqrt_iter_f
3963 static gmx_inline gmx_simd4_float_t gmx_simdcall
3964 gmx_simd4_rsqrt_iter_f(gmx_simd4_float_t lu, gmx_simd4_float_t x)
3966 # ifdef GMX_SIMD_HAVE_FMA
3967 return gmx_simd4_fmadd_f(gmx_simd4_fnmadd_f(x, gmx_simd4_mul_f(lu, lu), gmx_simd4_set1_f(1.0f)), gmx_simd4_mul_f(lu, gmx_simd4_set1_f(0.5f)), lu);
3968 # else
3969 return gmx_simd4_mul_f(gmx_simd4_set1_f(0.5f), gmx_simd4_mul_f(gmx_simd4_sub_f(gmx_simd4_set1_f(3.0f), gmx_simd4_mul_f(gmx_simd4_mul_f(lu, lu), x)), lu));
3970 # endif
3973 /*! \brief Calculate 1/sqrt(x) for SIMD4 float.
3975 * \copydetails gmx_simd_invsqrt_f
3977 static gmx_inline gmx_simd4_float_t gmx_simdcall
3978 gmx_simd4_invsqrt_f(gmx_simd4_float_t x)
3980 gmx_simd4_float_t lu = gmx_simd4_rsqrt_f(x);
3981 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
3982 lu = gmx_simd4_rsqrt_iter_f(lu, x);
3983 #endif
3984 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3985 lu = gmx_simd4_rsqrt_iter_f(lu, x);
3986 #endif
3987 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3988 lu = gmx_simd4_rsqrt_iter_f(lu, x);
3989 #endif
3990 return lu;
3993 #endif /* GMX_SIMD4_HAVE_FLOAT */
3997 #ifdef GMX_SIMD4_HAVE_DOUBLE
3998 /*************************************************************************
3999 * DOUBLE PRECISION SIMD4 MATH FUNCTIONS - JUST A SMALL SUBSET SUPPORTED *
4000 *************************************************************************/
4003 /*! \brief SIMD4 utility function to sum a+b+c+d for SIMD4 doubles.
4005 * \copydetails gmx_simd_sum4_f
4007 static gmx_inline gmx_simd4_double_t gmx_simdcall
4008 gmx_simd4_sum4_d(gmx_simd4_double_t a, gmx_simd4_double_t b,
4009 gmx_simd4_double_t c, gmx_simd4_double_t d)
4011 return gmx_simd4_add_d(gmx_simd4_add_d(a, b), gmx_simd4_add_d(c, d));
4014 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD4 double.
4016 * \copydetails gmx_simd_rsqrt_iter_f
4018 static gmx_inline gmx_simd4_double_t gmx_simdcall
4019 gmx_simd4_rsqrt_iter_d(gmx_simd4_double_t lu, gmx_simd4_double_t x)
4021 #ifdef GMX_SIMD_HAVE_FMA
4022 return gmx_simd4_fmadd_d(gmx_simd4_fnmadd_d(x, gmx_simd4_mul_d(lu, lu), gmx_simd4_set1_d(1.0)), gmx_simd4_mul_d(lu, gmx_simd4_set1_d(0.5)), lu);
4023 #else
4024 return gmx_simd4_mul_d(gmx_simd4_set1_d(0.5), gmx_simd4_mul_d(gmx_simd4_sub_d(gmx_simd4_set1_d(3.0), gmx_simd4_mul_d(gmx_simd4_mul_d(lu, lu), x)), lu));
4025 #endif
4028 /*! \brief Calculate 1/sqrt(x) for SIMD4 double.
4030 * \copydetails gmx_simd_invsqrt_f
4032 static gmx_inline gmx_simd4_double_t gmx_simdcall
4033 gmx_simd4_invsqrt_d(gmx_simd4_double_t x)
4035 gmx_simd4_double_t lu = gmx_simd4_rsqrt_d(x);
4036 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
4037 lu = gmx_simd4_rsqrt_iter_d(lu, x);
4038 #endif
4039 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
4040 lu = gmx_simd4_rsqrt_iter_d(lu, x);
4041 #endif
4042 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
4043 lu = gmx_simd4_rsqrt_iter_d(lu, x);
4044 #endif
4045 #if (GMX_SIMD_RSQRT_BITS*8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
4046 lu = gmx_simd4_rsqrt_iter_d(lu, x);
4047 #endif
4048 return lu;
4051 /**********************************************************************
4052 * SIMD4 MATH FUNCTIONS WITH DOUBLE PREC. DATA, SINGLE PREC. ACCURACY *
4053 **********************************************************************/
4055 /*! \brief Calculate 1/sqrt(x) for SIMD4 double, but in single accuracy.
4057 * \copydetails gmx_simd_invsqrt_singleaccuracy_d
4059 static gmx_inline gmx_simd4_double_t gmx_simdcall
4060 gmx_simd4_invsqrt_singleaccuracy_d(gmx_simd4_double_t x)
4062 gmx_simd4_double_t lu = gmx_simd4_rsqrt_d(x);
4063 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
4064 lu = gmx_simd4_rsqrt_iter_d(lu, x);
4065 #endif
4066 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
4067 lu = gmx_simd4_rsqrt_iter_d(lu, x);
4068 #endif
4069 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
4070 lu = gmx_simd4_rsqrt_iter_d(lu, x);
4071 #endif
4072 return lu;
4076 #endif /* GMX_SIMD4_HAVE_DOUBLE */
4078 /*! \} */
4081 /* Set defines based on default Gromacs precision */
4082 #ifdef GMX_DOUBLE
4083 /* Documentation in single branch below */
4085 # define gmx_simd_sum4_r gmx_simd_sum4_d
4086 # define gmx_simd_xor_sign_r gmx_simd_xor_sign_d
4087 # define gmx_simd4_sum4_r gmx_simd4_sum4_d
4089 /* On hardware that only supports double precision SIMD it is possible to use
4090 * the faster _singleaccuracy_d routines everywhere by setting the requested SIMD
4091 * accuracy to single precision.
4093 #if (GMX_SIMD_ACCURACY_BITS_DOUBLE > GMX_SIMD_ACCURACY_BITS_SINGLE)
4095 # define gmx_simd_invsqrt_r gmx_simd_invsqrt_d
4096 # define gmx_simd_invsqrt_pair_r gmx_simd_invsqrt_pair_d
4097 # define gmx_simd_sqrt_r gmx_simd_sqrt_d
4098 # define gmx_simd_inv_r gmx_simd_inv_d
4099 # define gmx_simd_log_r gmx_simd_log_d
4100 # define gmx_simd_exp2_r gmx_simd_exp2_d
4101 # define gmx_simd_exp_r gmx_simd_exp_d
4102 # define gmx_simd_erf_r gmx_simd_erf_d
4103 # define gmx_simd_erfc_r gmx_simd_erfc_d
4104 # define gmx_simd_sincos_r gmx_simd_sincos_d
4105 # define gmx_simd_sin_r gmx_simd_sin_d
4106 # define gmx_simd_cos_r gmx_simd_cos_d
4107 # define gmx_simd_tan_r gmx_simd_tan_d
4108 # define gmx_simd_asin_r gmx_simd_asin_d
4109 # define gmx_simd_acos_r gmx_simd_acos_d
4110 # define gmx_simd_atan_r gmx_simd_atan_d
4111 # define gmx_simd_atan2_r gmx_simd_atan2_d
4112 # define gmx_simd_pmecorrF_r gmx_simd_pmecorrF_d
4113 # define gmx_simd_pmecorrV_r gmx_simd_pmecorrV_d
4114 # define gmx_simd4_invsqrt_r gmx_simd4_invsqrt_d
4116 #else
4118 # define gmx_simd_invsqrt_r gmx_simd_invsqrt_singleaccuracy_d
4119 # define gmx_simd_invsqrt_pair_r gmx_simd_invsqrt_pair_singleaccuracy_d
4120 # define gmx_simd_sqrt_r gmx_simd_sqrt_singleaccuracy_d
4121 # define gmx_simd_inv_r gmx_simd_inv_singleaccuracy_d
4122 # define gmx_simd_log_r gmx_simd_log_singleaccuracy_d
4123 # define gmx_simd_exp2_r gmx_simd_exp2_singleaccuracy_d
4124 # define gmx_simd_exp_r gmx_simd_exp_singleaccuracy_d
4125 # define gmx_simd_erf_r gmx_simd_erf_singleaccuracy_d
4126 # define gmx_simd_erfc_r gmx_simd_erfc_singleaccuracy_d
4127 # define gmx_simd_sincos_r gmx_simd_sincos_singleaccuracy_d
4128 # define gmx_simd_sin_r gmx_simd_sin_singleaccuracy_d
4129 # define gmx_simd_cos_r gmx_simd_cos_singleaccuracy_d
4130 # define gmx_simd_tan_r gmx_simd_tan_singleaccuracy_d
4131 # define gmx_simd_asin_r gmx_simd_asin_singleaccuracy_d
4132 # define gmx_simd_acos_r gmx_simd_acos_singleaccuracy_d
4133 # define gmx_simd_atan_r gmx_simd_atan_singleaccuracy_d
4134 # define gmx_simd_atan2_r gmx_simd_atan2_singleaccuracy_d
4135 # define gmx_simd_pmecorrF_r gmx_simd_pmecorrF_singleaccuracy_d
4136 # define gmx_simd_pmecorrV_r gmx_simd_pmecorrV_singleaccuracy_d
4137 # define gmx_simd4_invsqrt_r gmx_simd4_invsqrt_singleaccuracy_d
4139 #endif
4141 # define gmx_simd_invsqrt_singleaccuracy_r gmx_simd_invsqrt_singleaccuracy_d
4142 # define gmx_simd_invsqrt_pair_singleaccuracy_r gmx_simd_invsqrt_pair_singleaccuracy_d
4143 # define gmx_simd_sqrt_singleaccuracy_r gmx_simd_sqrt_singleaccuracy_d
4144 # define gmx_simd_inv_singleaccuracy_r gmx_simd_inv_singleaccuracy_d
4145 # define gmx_simd_log_singleaccuracy_r gmx_simd_log_singleaccuracy_d
4146 # define gmx_simd_exp2_singleaccuracy_r gmx_simd_exp2_singleaccuracy_d
4147 # define gmx_simd_exp_singleaccuracy_r gmx_simd_exp_singleaccuracy_d
4148 # define gmx_simd_erf_singleaccuracy_r gmx_simd_erf_singleaccuracy_d
4149 # define gmx_simd_erfc_singleaccuracy_r gmx_simd_erfc_singleaccuracy_d
4150 # define gmx_simd_sincos_singleaccuracy_r gmx_simd_sincos_singleaccuracy_d
4151 # define gmx_simd_sin_singleaccuracy_r gmx_simd_sin_singleaccuracy_d
4152 # define gmx_simd_cos_singleaccuracy_r gmx_simd_cos_singleaccuracy_d
4153 # define gmx_simd_tan_singleaccuracy_r gmx_simd_tan_singleaccuracy_d
4154 # define gmx_simd_asin_singleaccuracy_r gmx_simd_asin_singleaccuracy_d
4155 # define gmx_simd_acos_singleaccuracy_r gmx_simd_acos_singleaccuracy_d
4156 # define gmx_simd_atan_singleaccuracy_r gmx_simd_atan_singleaccuracy_d
4157 # define gmx_simd_atan2_singleaccuracy_r gmx_simd_atan2_singleaccuracy_d
4158 # define gmx_simd_pmecorrF_singleaccuracy_r gmx_simd_pmecorrF_singleaccuracy_d
4159 # define gmx_simd_pmecorrV_singleaccuracy_r gmx_simd_pmecorrV_singleaccuracy_d
4160 # define gmx_simd4_invsqrt_singleaccuracy_r gmx_simd4_invsqrt_singleaccuracy_d
4162 #else /* GMX_DOUBLE */
4164 /*! \name Real-precision SIMD math functions
4166 * These are the ones you should typically call in Gromacs.
4167 * \{
4170 /*! \brief SIMD utility function to sum a+b+c+d for SIMD reals.
4172 * \copydetails gmx_simd_sum4_f
4174 # define gmx_simd_sum4_r gmx_simd_sum4_f
4176 /*! \brief Return -a if b is negative, SIMD real.
4178 * \copydetails gmx_simd_xor_sign_f
4180 # define gmx_simd_xor_sign_r gmx_simd_xor_sign_f
4182 /*! \brief Calculate 1/sqrt(x) for SIMD real.
4184 * \copydetails gmx_simd_invsqrt_f
4186 # define gmx_simd_invsqrt_r gmx_simd_invsqrt_f
4188 /*! \brief Calculate 1/sqrt(x) for two SIMD reals.
4190 * \copydetails gmx_simd_invsqrt_pair_f
4192 # define gmx_simd_invsqrt_pair_r gmx_simd_invsqrt_pair_f
4194 /*! \brief Calculate sqrt(x) correctly for SIMD real, including argument 0.0.
4196 * \copydetails gmx_simd_sqrt_f
4198 # define gmx_simd_sqrt_r gmx_simd_sqrt_f
4200 /*! \brief Calculate 1/x for SIMD real.
4202 * \copydetails gmx_simd_inv_f
4204 # define gmx_simd_inv_r gmx_simd_inv_f
4206 /*! \brief SIMD real log(x). This is the natural logarithm.
4208 * \copydetails gmx_simd_log_f
4210 # define gmx_simd_log_r gmx_simd_log_f
4212 /*! \brief SIMD real 2^x.
4214 * \copydetails gmx_simd_exp2_f
4216 # define gmx_simd_exp2_r gmx_simd_exp2_f
4218 /*! \brief SIMD real e^x.
4220 * \copydetails gmx_simd_exp_f
4222 # define gmx_simd_exp_r gmx_simd_exp_f
4224 /*! \brief SIMD real erf(x).
4226 * \copydetails gmx_simd_erf_f
4228 # define gmx_simd_erf_r gmx_simd_erf_f
4230 /*! \brief SIMD real erfc(x).
4232 * \copydetails gmx_simd_erfc_f
4234 # define gmx_simd_erfc_r gmx_simd_erfc_f
4236 /*! \brief SIMD real sin \& cos.
4238 * \copydetails gmx_simd_sincos_f
4240 # define gmx_simd_sincos_r gmx_simd_sincos_f
4242 /*! \brief SIMD real sin(x).
4244 * \copydetails gmx_simd_sin_f
4246 # define gmx_simd_sin_r gmx_simd_sin_f
4248 /*! \brief SIMD real cos(x).
4250 * \copydetails gmx_simd_cos_f
4252 # define gmx_simd_cos_r gmx_simd_cos_f
4254 /*! \brief SIMD real tan(x).
4256 * \copydetails gmx_simd_tan_f
4258 # define gmx_simd_tan_r gmx_simd_tan_f
4260 /*! \brief SIMD real asin(x).
4262 * \copydetails gmx_simd_asin_f
4264 # define gmx_simd_asin_r gmx_simd_asin_f
4266 /*! \brief SIMD real acos(x).
4268 * \copydetails gmx_simd_acos_f
4270 # define gmx_simd_acos_r gmx_simd_acos_f
4272 /*! \brief SIMD real atan(x).
4274 * \copydetails gmx_simd_atan_f
4276 # define gmx_simd_atan_r gmx_simd_atan_f
4278 /*! \brief SIMD real atan2(y,x).
4280 * \copydetails gmx_simd_atan2_f
4282 # define gmx_simd_atan2_r gmx_simd_atan2_f
4284 /*! \brief SIMD Analytic PME force correction.
4286 * \copydetails gmx_simd_pmecorrF_f
4288 # define gmx_simd_pmecorrF_r gmx_simd_pmecorrF_f
4290 /*! \brief SIMD Analytic PME potential correction.
4292 * \copydetails gmx_simd_pmecorrV_f
4294 # define gmx_simd_pmecorrV_r gmx_simd_pmecorrV_f
4296 /*! \brief Calculate 1/sqrt(x) for SIMD, only targeting single accuracy.
4298 * \copydetails gmx_simd_invsqrt_r
4300 * \note This is a performance-targeted function that only achieves single
4301 * precision accuracy, even when the SIMD data is double precision.
4303 # define gmx_simd_invsqrt_singleaccuracy_r gmx_simd_invsqrt_f
4305 /*! \brief Calculate 1/sqrt(x) for SIMD pair, only targeting single accuracy.
4307 * \copydetails gmx_simd_invsqrt_pair_r
4309 * \note This is a performance-targeted function that only achieves single
4310 * precision accuracy, even when the SIMD data is double precision.
4312 # define gmx_simd_invsqrt_pair_singleaccuracy_r gmx_simd_invsqrt_pair_f
4314 /*! \brief Calculate sqrt(x), only targeting single accuracy.
4316 * \copydetails gmx_simd_sqrt_r
4318 * \note This is a performance-targeted function that only achieves single
4319 * precision accuracy, even when the SIMD data is double precision.
4321 # define gmx_simd_sqrt_singleaccuracy_r gmx_simd_sqrt_f
4323 /*! \brief Calculate 1/x for SIMD real, only targeting single accuracy.
4325 * \copydetails gmx_simd_inv_r
4327 * \note This is a performance-targeted function that only achieves single
4328 * precision accuracy, even when the SIMD data is double precision.
4330 # define gmx_simd_inv_singleaccuracy_r gmx_simd_inv_f
4332 /*! \brief SIMD real log(x), only targeting single accuracy.
4334 * \copydetails gmx_simd_log_r
4336 * \note This is a performance-targeted function that only achieves single
4337 * precision accuracy, even when the SIMD data is double precision.
4339 # define gmx_simd_log_singleaccuracy_r gmx_simd_log_f
4341 /*! \brief SIMD real 2^x, only targeting single accuracy.
4343 * \copydetails gmx_simd_exp2_r
4345 * \note This is a performance-targeted function that only achieves single
4346 * precision accuracy, even when the SIMD data is double precision.
4348 # define gmx_simd_exp2_singleaccuracy_r gmx_simd_exp2_f
4350 /*! \brief SIMD real e^x, only targeting single accuracy.
4352 * \copydetails gmx_simd_exp_r
4354 * \note This is a performance-targeted function that only achieves single
4355 * precision accuracy, even when the SIMD data is double precision.
4357 # define gmx_simd_exp_singleaccuracy_r gmx_simd_exp_f
4359 /*! \brief SIMD real erf(x), only targeting single accuracy.
4361 * \copydetails gmx_simd_erf_r
4363 * \note This is a performance-targeted function that only achieves single
4364 * precision accuracy, even when the SIMD data is double precision.
4366 # define gmx_simd_erf_singleaccuracy_r gmx_simd_erf_f
4368 /*! \brief SIMD real erfc(x), only targeting single accuracy.
4370 * \copydetails gmx_simd_erfc_r
4372 * \note This is a performance-targeted function that only achieves single
4373 * precision accuracy, even when the SIMD data is double precision.
4375 # define gmx_simd_erfc_singleaccuracy_r gmx_simd_erfc_f
4377 /*! \brief SIMD real sin \& cos, only targeting single accuracy.
4379 * \copydetails gmx_simd_sincos_r
4381 * \note This is a performance-targeted function that only achieves single
4382 * precision accuracy, even when the SIMD data is double precision.
4384 # define gmx_simd_sincos_singleaccuracy_r gmx_simd_sincos_f
4386 /*! \brief SIMD real sin(x), only targeting single accuracy.
4388 * \copydetails gmx_simd_sin_r
4390 * \note This is a performance-targeted function that only achieves single
4391 * precision accuracy, even when the SIMD data is double precision.
4393 # define gmx_simd_sin_singleaccuracy_r gmx_simd_sin_f
4395 /*! \brief SIMD real cos(x), only targeting single accuracy.
4397 * \copydetails gmx_simd_cos_r
4399 * \note This is a performance-targeted function that only achieves single
4400 * precision accuracy, even when the SIMD data is double precision.
4402 # define gmx_simd_cos_singleaccuracy_r gmx_simd_cos_f
4404 /*! \brief SIMD real tan(x), only targeting single accuracy.
4406 * \copydetails gmx_simd_tan_r
4408 * \note This is a performance-targeted function that only achieves single
4409 * precision accuracy, even when the SIMD data is double precision.
4411 # define gmx_simd_tan_singleaccuracy_r gmx_simd_tan_f
4413 /*! \brief SIMD real asin(x), only targeting single accuracy.
4415 * \copydetails gmx_simd_asin_r
4417 * \note This is a performance-targeted function that only achieves single
4418 * precision accuracy, even when the SIMD data is double precision.
4420 # define gmx_simd_asin_singleaccuracy_r gmx_simd_asin_f
4422 /*! \brief SIMD real acos(x), only targeting single accuracy.
4424 * \copydetails gmx_simd_acos_r
4426 * \note This is a performance-targeted function that only achieves single
4427 * precision accuracy, even when the SIMD data is double precision.
4429 # define gmx_simd_acos_singleaccuracy_r gmx_simd_acos_f
4431 /*! \brief SIMD real atan(x), only targeting single accuracy.
4433 * \copydetails gmx_simd_atan_r
4435 * \note This is a performance-targeted function that only achieves single
4436 * precision accuracy, even when the SIMD data is double precision.
4438 # define gmx_simd_atan_singleaccuracy_r gmx_simd_atan_f
4440 /*! \brief SIMD real atan2(y,x), only targeting single accuracy.
4442 * \copydetails gmx_simd_atan2_r
4444 * \note This is a performance-targeted function that only achieves single
4445 * precision accuracy, even when the SIMD data is double precision.
4447 # define gmx_simd_atan2_singleaccuracy_r gmx_simd_atan2_f
4449 /*! \brief SIMD Analytic PME force corr., only targeting single accuracy.
4451 * \copydetails gmx_simd_pmecorrF_r
4453 * \note This is a performance-targeted function that only achieves single
4454 * precision accuracy, even when the SIMD data is double precision.
4456 # define gmx_simd_pmecorrF_singleaccuracy_r gmx_simd_pmecorrF_f
4458 /*! \brief SIMD Analytic PME potential corr., only targeting single accuracy.
4460 * \copydetails gmx_simd_pmecorrV_r
4462 * \note This is a performance-targeted function that only achieves single
4463 * precision accuracy, even when the SIMD data is double precision.
4465 # define gmx_simd_pmecorrV_singleaccuracy_r gmx_simd_pmecorrV_f
4468 /*! \}
4469 * \name SIMD4 math functions
4470 * \{
4473 /*! \brief SIMD4 utility function to sum a+b+c+d for SIMD4 reals.
4475 * \copydetails gmx_simd_sum4_f
4477 # define gmx_simd4_sum4_r gmx_simd4_sum4_f
4479 /*! \brief Calculate 1/sqrt(x) for SIMD4 real.
4481 * \copydetails gmx_simd_invsqrt_f
4483 # define gmx_simd4_invsqrt_r gmx_simd4_invsqrt_f
4485 /*! \brief 1/sqrt(x) for SIMD4 real. Single accuracy, even for double prec.
4487 * \copydetails gmx_simd4_invsqrt_r
4489 * \note This is a performance-targeted function that only achieves single
4490 * precision accuracy, even when the SIMD data is double precision.
4492 # define gmx_simd4_invsqrt_singleaccuracy_r gmx_simd4_invsqrt_f
4494 /*! \} */
4496 #endif /* GMX_DOUBLE */
4498 /*! \} */
4499 /*! \endcond */
4501 #endif /* GMX_SIMD_SIMD_MATH_H_ */