include/gmx_math_x86_avx_256_double.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013, by the GROMACS development team, led by
   5  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
   6  * others, as listed in the AUTHORS file in the top-level source
   7  * directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 #ifndef _gmx_math_x86_avx_256_double_h_
  36 #define _gmx_math_x86_avx_256_double_h_
  37
  38 #include "gmx_x86_avx_256.h"
  39
  40
  41
  42 #ifndef M_PI
  43 #  define M_PI 3.14159265358979323846264338327950288
  44 #endif
  45
  46
  47 /************************
  48  *                      *
  49  * Simple math routines *
  50  *                      *
  51  ************************/
  52
  53 /* 1.0/sqrt(x), 256 bit wide */
  54 static gmx_inline __m256d
  55 gmx_mm256_invsqrt_pd(__m256d x)
  56 {
  57     const __m256d half  = _mm256_set1_pd(0.5);
  58     const __m256d three = _mm256_set1_pd(3.0);
  59
  60     /* Lookup instruction only exists in single precision, convert back and forth... */
  61     __m256d lu = _mm256_cvtps_pd(_mm_rsqrt_ps( _mm256_cvtpd_ps(x)));
  62
  63     lu = _mm256_mul_pd(half, _mm256_mul_pd(_mm256_sub_pd(three, _mm256_mul_pd(_mm256_mul_pd(lu, lu), x)), lu));
  64     return _mm256_mul_pd(half, _mm256_mul_pd(_mm256_sub_pd(three, _mm256_mul_pd(_mm256_mul_pd(lu, lu), x)), lu));
  65 }
  66
  67 /* 1.0/sqrt(x), done for a pair of arguments to improve throughput */
  68 static void
  69 gmx_mm256_invsqrt_pair_pd(__m256d x1, __m256d x2, __m256d *invsqrt1, __m256d *invsqrt2)
  70 {
  71     const __m256d half   = _mm256_set1_pd(0.5);
  72     const __m256d three  = _mm256_set1_pd(3.0);
  73     const __m256  halff  = _mm256_set1_ps(0.5f);
  74     const __m256  threef = _mm256_set1_ps(3.0f);
  75
  76     __m256        xf, luf;
  77     __m256d       lu1, lu2;
  78
  79     /* Do first N-R step in float for 2x throughput */
  80     xf  = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm256_cvtpd_ps(x1)), _mm256_cvtpd_ps(x2), 0x1);
  81     luf = _mm256_rsqrt_ps(xf);
  82
  83     luf = _mm256_mul_ps(halff, _mm256_mul_ps(_mm256_sub_ps(threef, _mm256_mul_ps(_mm256_mul_ps(luf, luf), xf)), luf));
  84
  85     lu2 = _mm256_cvtps_pd(_mm256_extractf128_ps(luf, 0x1));
  86     lu1 = _mm256_cvtps_pd(_mm256_castps256_ps128(luf));
  87
  88     *invsqrt1 = _mm256_mul_pd(half, _mm256_mul_pd(_mm256_sub_pd(three, _mm256_mul_pd(_mm256_mul_pd(lu1, lu1), x1)), lu1));
  89     *invsqrt2 = _mm256_mul_pd(half, _mm256_mul_pd(_mm256_sub_pd(three, _mm256_mul_pd(_mm256_mul_pd(lu2, lu2), x2)), lu2));
  90 }
  91
  92 /* 1.0/sqrt(x), 128 bit wide */
  93 static gmx_inline __m128d
  94 gmx_mm_invsqrt_pd(__m128d x)
  95 {
  96     const __m128d half  = _mm_set1_pd(0.5);
  97     const __m128d three = _mm_set1_pd(3.0);
  98
  99     /* Lookup instruction only exists in single precision, convert back and forth... */
 100     __m128d lu = _mm_cvtps_pd(_mm_rsqrt_ps( _mm_cvtpd_ps(x)));
 101
 102     lu = _mm_mul_pd(half, _mm_mul_pd(_mm_sub_pd(three, _mm_mul_pd(_mm_mul_pd(lu, lu), x)), lu));
 103     return _mm_mul_pd(half, _mm_mul_pd(_mm_sub_pd(three, _mm_mul_pd(_mm_mul_pd(lu, lu), x)), lu));
 104 }
 105
 106 /* 1.0/sqrt(x), done for two pairs to improve throughput */
 107 static void
 108 gmx_mm_invsqrt_pair_pd(__m128d x1, __m128d x2, __m128d *invsqrt1, __m128d *invsqrt2)
 109 {
 110     const __m128d half   = _mm_set1_pd(0.5);
 111     const __m128d three  = _mm_set1_pd(3.0);
 112     const __m128  halff  = _mm_set1_ps(0.5f);
 113     const __m128  threef = _mm_set1_ps(3.0f);
 114
 115     __m128        xf, luf;
 116     __m128d       lu1, lu2;
 117
 118     /* Do first N-R step in float for 2x throughput */
 119     xf  = _mm_shuffle_ps(_mm_cvtpd_ps(x1), _mm_cvtpd_ps(x2), _MM_SHUFFLE(1, 0, 1, 0));
 120     luf = _mm_rsqrt_ps(xf);
 121     luf = _mm_mul_ps(halff, _mm_mul_ps(_mm_sub_ps(threef, _mm_mul_ps(_mm_mul_ps(luf, luf), xf)), luf));
 122
 123     lu2 = _mm_cvtps_pd(_mm_shuffle_ps(luf, luf, _MM_SHUFFLE(3, 2, 3, 2)));
 124     lu1 = _mm_cvtps_pd(luf);
 125
 126     *invsqrt1 = _mm_mul_pd(half, _mm_mul_pd(_mm_sub_pd(three, _mm_mul_pd(_mm_mul_pd(lu1, lu1), x1)), lu1));
 127     *invsqrt2 = _mm_mul_pd(half, _mm_mul_pd(_mm_sub_pd(three, _mm_mul_pd(_mm_mul_pd(lu2, lu2), x2)), lu2));
 128 }
 129
 130 /* sqrt(x) (256 bit)- Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
 131 static gmx_inline __m256d
 132 gmx_mm256_sqrt_pd(__m256d x)
 133 {
 134     __m256d mask;
 135     __m256d res;
 136
 137     mask = _mm256_cmp_pd(x, _mm256_setzero_pd(), _CMP_EQ_OQ);
 138     res  = _mm256_andnot_pd(mask, gmx_mm256_invsqrt_pd(x));
 139
 140     res  = _mm256_mul_pd(x, res);
 141
 142     return res;
 143 }
 144
 145 /* sqrt(x) (128 bit) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
 146 static gmx_inline __m128d
 147 gmx_mm_sqrt_pd(__m128d x)
 148 {
 149     __m128d mask;
 150     __m128d res;
 151
 152     mask = _mm_cmpeq_pd(x, _mm_setzero_pd());
 153     res  = _mm_andnot_pd(mask, gmx_mm_invsqrt_pd(x));
 154
 155     res  = _mm_mul_pd(x, res);
 156
 157     return res;
 158 }
 159
 160
 161 /* 1.0/x, 256 bit wide */
 162 static gmx_inline __m256d
 163 gmx_mm256_inv_pd(__m256d x)
 164 {
 165     const __m256d two  = _mm256_set1_pd(2.0);
 166
 167     /* Lookup instruction only exists in single precision, convert back and forth... */
 168     __m256d lu = _mm256_cvtps_pd(_mm_rcp_ps( _mm256_cvtpd_ps(x)));
 169
 170     /* Perform two N-R steps for double precision */
 171     lu         = _mm256_mul_pd(lu, _mm256_sub_pd(two, _mm256_mul_pd(x, lu)));
 172     return _mm256_mul_pd(lu, _mm256_sub_pd(two, _mm256_mul_pd(x, lu)));
 173 }
 174
 175 /* 1.0/x, 128 bit */
 176 static gmx_inline __m128d
 177 gmx_mm_inv_pd(__m128d x)
 178 {
 179     const __m128d two  = _mm_set1_pd(2.0);
 180
 181     /* Lookup instruction only exists in single precision, convert back and forth... */
 182     __m128d lu = _mm_cvtps_pd(_mm_rcp_ps( _mm_cvtpd_ps(x)));
 183
 184     /* Perform two N-R steps for double precision */
 185     lu         = _mm_mul_pd(lu, _mm_sub_pd(two, _mm_mul_pd(x, lu)));
 186     return _mm_mul_pd(lu, _mm_sub_pd(two, _mm_mul_pd(x, lu)));
 187 }
 188
 189
 190 static gmx_inline __m256d
 191 gmx_mm256_abs_pd(__m256d x)
 192 {
 193     const __m256d signmask  = _mm256_castsi256_pd( _mm256_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF,
 194                                                                     0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
 195
 196     return _mm256_and_pd(x, signmask);
 197 }
 198
 199 static gmx_inline __m128d
 200 gmx_mm_abs_pd(__m128d x)
 201 {
 202     const __m128d signmask  = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
 203
 204     return _mm_and_pd(x, signmask);
 205 }
 206
 207
 208 /*
 209  * 2^x function, 256 bit
 210  *
 211  * The 2^w term is calculated from a (6,0)-th order (no denominator) Minimax polynomia on the interval
 212  * [-0.5,0.5].
 213  *
 214  * The approximation on [-0.5,0.5] is a rational Padé approximation, 1+2*P(x^2)/(Q(x^2)-P(x^2)),
 215  * according to the same algorithm as used in the Cephes/netlib math routines.
 216  */
 217 static __m256d
 218 gmx_mm256_exp2_pd(__m256d x)
 219 {
 220     /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
 221     const __m256d arglimit = _mm256_set1_pd(1022.0);
 222     const __m128i expbase  = _mm_set1_epi32(1023);
 223
 224     const __m256d P2       = _mm256_set1_pd(2.30933477057345225087e-2);
 225     const __m256d P1       = _mm256_set1_pd(2.02020656693165307700e1);
 226     const __m256d P0       = _mm256_set1_pd(1.51390680115615096133e3);
 227     /* Q2 == 1.0 */
 228     const __m256d Q1       = _mm256_set1_pd(2.33184211722314911771e2);
 229     const __m256d Q0       = _mm256_set1_pd(4.36821166879210612817e3);
 230     const __m256d one      = _mm256_set1_pd(1.0);
 231     const __m256d two      = _mm256_set1_pd(2.0);
 232
 233     __m256d       valuemask;
 234     __m256i       iexppart;
 235     __m128i       iexppart128a, iexppart128b;
 236     __m256d       fexppart;
 237     __m256d       intpart;
 238     __m256d       z, z2;
 239     __m256d       PolyP, PolyQ;
 240
 241     iexppart128a  = _mm256_cvtpd_epi32(x);
 242     intpart       = _mm256_round_pd(x, _MM_FROUND_TO_NEAREST_INT);
 243
 244     /* Add exponent bias */
 245     iexppart128a   = _mm_add_epi32(iexppart128a, expbase);
 246
 247     /* We now want to shift the exponent 52 positions left, but to achieve this we need
 248      * to separate the 128-bit register data into two registers (4x64-bit > 128bit)
 249      * shift them, and then merge into a single __m256d.
 250      * Elements 0/1 should end up in iexppart128a, and 2/3 in iexppart128b.
 251      * It doesnt matter what we put in the 2nd/4th position, since that data will be
 252      * shifted out and replaced with zeros.
 253      */
 254     iexppart128b   = _mm_shuffle_epi32(iexppart128a, _MM_SHUFFLE(3, 3, 2, 2));
 255     iexppart128a   = _mm_shuffle_epi32(iexppart128a, _MM_SHUFFLE(1, 1, 0, 0));
 256
 257     iexppart128b   = _mm_slli_epi64(iexppart128b, 52);
 258     iexppart128a   = _mm_slli_epi64(iexppart128a, 52);
 259
 260     iexppart  = _mm256_castsi128_si256(iexppart128a);
 261     iexppart  = _mm256_insertf128_si256(iexppart, iexppart128b, 0x1);
 262
 263     valuemask = _mm256_cmp_pd(arglimit, gmx_mm256_abs_pd(x), _CMP_GE_OQ);
 264     fexppart  = _mm256_and_pd(valuemask, _mm256_castsi256_pd(iexppart));
 265
 266     z         = _mm256_sub_pd(x, intpart);
 267
 268     z2        = _mm256_mul_pd(z, z);
 269
 270     PolyP     = _mm256_mul_pd(P2, z2);
 271     PolyP     = _mm256_add_pd(PolyP, P1);
 272     PolyQ     = _mm256_add_pd(z2, Q1);
 273     PolyP     = _mm256_mul_pd(PolyP, z2);
 274     PolyQ     = _mm256_mul_pd(PolyQ, z2);
 275     PolyP     = _mm256_add_pd(PolyP, P0);
 276     PolyQ     = _mm256_add_pd(PolyQ, Q0);
 277     PolyP     = _mm256_mul_pd(PolyP, z);
 278
 279     z         = _mm256_mul_pd(PolyP, gmx_mm256_inv_pd(_mm256_sub_pd(PolyQ, PolyP)));
 280     z         = _mm256_add_pd(one, _mm256_mul_pd(two, z));
 281
 282     z         = _mm256_mul_pd(z, fexppart);
 283
 284     return z;
 285 }
 286
 287 /* 2^x, 128 bit */
 288 static __m128d
 289 gmx_mm_exp2_pd(__m128d x)
 290 {
 291     /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
 292     const __m128d arglimit = _mm_set1_pd(1022.0);
 293     const __m128i expbase  = _mm_set1_epi32(1023);
 294
 295     const __m128d P2       = _mm_set1_pd(2.30933477057345225087e-2);
 296     const __m128d P1       = _mm_set1_pd(2.02020656693165307700e1);
 297     const __m128d P0       = _mm_set1_pd(1.51390680115615096133e3);
 298     /* Q2 == 1.0 */
 299     const __m128d Q1       = _mm_set1_pd(2.33184211722314911771e2);
 300     const __m128d Q0       = _mm_set1_pd(4.36821166879210612817e3);
 301     const __m128d one      = _mm_set1_pd(1.0);
 302     const __m128d two      = _mm_set1_pd(2.0);
 303
 304     __m128d       valuemask;
 305     __m128i       iexppart;
 306     __m128d       fexppart;
 307     __m128d       intpart;
 308     __m128d       z, z2;
 309     __m128d       PolyP, PolyQ;
 310
 311     iexppart  = _mm_cvtpd_epi32(x);
 312     intpart   = _mm_round_pd(x, _MM_FROUND_TO_NEAREST_INT);
 313
 314     /* The two lowest elements of iexppart now contains 32-bit numbers with a correctly biased exponent.
 315      * To be able to shift it into the exponent for a double precision number we first need to
 316      * shuffle so that the lower half contains the first element, and the upper half the second.
 317      * This should really be done as a zero-extension, but since the next instructions will shift
 318      * the registers left by 52 bits it doesn't matter what we put there - it will be shifted out.
 319      * (thus we just use element 2 from iexppart).
 320      */
 321     iexppart  = _mm_shuffle_epi32(iexppart, _MM_SHUFFLE(2, 1, 2, 0));
 322
 323     /* Do the shift operation on the 64-bit registers */
 324     iexppart  = _mm_add_epi32(iexppart, expbase);
 325     iexppart  = _mm_slli_epi64(iexppart, 52);
 326
 327     valuemask = _mm_cmpge_pd(arglimit, gmx_mm_abs_pd(x));
 328     fexppart  = _mm_and_pd(valuemask, gmx_mm_castsi128_pd(iexppart));
 329
 330     z         = _mm_sub_pd(x, intpart);
 331     z2        = _mm_mul_pd(z, z);
 332
 333     PolyP     = _mm_mul_pd(P2, z2);
 334     PolyP     = _mm_add_pd(PolyP, P1);
 335     PolyQ     = _mm_add_pd(z2, Q1);
 336     PolyP     = _mm_mul_pd(PolyP, z2);
 337     PolyQ     = _mm_mul_pd(PolyQ, z2);
 338     PolyP     = _mm_add_pd(PolyP, P0);
 339     PolyQ     = _mm_add_pd(PolyQ, Q0);
 340     PolyP     = _mm_mul_pd(PolyP, z);
 341
 342     z         = _mm_mul_pd(PolyP, gmx_mm_inv_pd(_mm_sub_pd(PolyQ, PolyP)));
 343     z         = _mm_add_pd(one, _mm_mul_pd(two, z));
 344
 345     z         = _mm_mul_pd(z, fexppart);
 346
 347     return z;
 348 }
 349
 350
 351 /* Exponential function, 256 bit. This could be calculated from 2^x as Exp(x)=2^(y),
 352  * where y=log2(e)*x, but there will then be a small rounding error since we lose
 353  * some precision due to the multiplication. This will then be magnified a lot by
 354  * the exponential.
 355  *
 356  * Instead, we calculate the fractional part directly as a Padé approximation of
 357  * Exp(z) on [-0.5,0.5]. We use extended precision arithmetics to calculate the fraction
 358  * remaining after 2^y, which avoids the precision-loss.
 359  */
 360 static __m256d
 361 gmx_mm256_exp_pd(__m256d exparg)
 362 {
 363     const __m256d argscale = _mm256_set1_pd(1.4426950408889634073599);
 364     /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
 365     const __m256d arglimit = _mm256_set1_pd(1022.0);
 366     const __m128i expbase  = _mm_set1_epi32(1023);
 367
 368     const __m256d invargscale0  = _mm256_set1_pd(6.93145751953125e-1);
 369     const __m256d invargscale1  = _mm256_set1_pd(1.42860682030941723212e-6);
 370
 371     const __m256d P2       = _mm256_set1_pd(1.26177193074810590878e-4);
 372     const __m256d P1       = _mm256_set1_pd(3.02994407707441961300e-2);
 373     /* P0 == 1.0 */
 374     const __m256d Q3       = _mm256_set1_pd(3.00198505138664455042E-6);
 375     const __m256d Q2       = _mm256_set1_pd(2.52448340349684104192E-3);
 376     const __m256d Q1       = _mm256_set1_pd(2.27265548208155028766E-1);
 377     /* Q0 == 2.0 */
 378     const __m256d one      = _mm256_set1_pd(1.0);
 379     const __m256d two      = _mm256_set1_pd(2.0);
 380
 381     __m256d       valuemask;
 382     __m256i       iexppart;
 383     __m128i       iexppart128a, iexppart128b;
 384     __m256d       fexppart;
 385     __m256d       intpart;
 386     __m256d       x, z, z2;
 387     __m256d       PolyP, PolyQ;
 388
 389     x             = _mm256_mul_pd(exparg, argscale);
 390
 391     iexppart128a  = _mm256_cvtpd_epi32(x);
 392     intpart       = _mm256_round_pd(x, _MM_FROUND_TO_NEAREST_INT);
 393
 394     /* Add exponent bias */
 395     iexppart128a   = _mm_add_epi32(iexppart128a, expbase);
 396
 397     /* We now want to shift the exponent 52 positions left, but to achieve this we need
 398      * to separate the 128-bit register data into two registers (4x64-bit > 128bit)
 399      * shift them, and then merge into a single __m256d.
 400      * Elements 0/1 should end up in iexppart128a, and 2/3 in iexppart128b.
 401      * It doesnt matter what we put in the 2nd/4th position, since that data will be
 402      * shifted out and replaced with zeros.
 403      */
 404     iexppart128b   = _mm_shuffle_epi32(iexppart128a, _MM_SHUFFLE(3, 3, 2, 2));
 405     iexppart128a   = _mm_shuffle_epi32(iexppart128a, _MM_SHUFFLE(1, 1, 0, 0));
 406
 407     iexppart128b   = _mm_slli_epi64(iexppart128b, 52);
 408     iexppart128a   = _mm_slli_epi64(iexppart128a, 52);
 409
 410     iexppart  = _mm256_castsi128_si256(iexppart128a);
 411     iexppart  = _mm256_insertf128_si256(iexppart, iexppart128b, 0x1);
 412
 413     valuemask = _mm256_cmp_pd(arglimit, gmx_mm256_abs_pd(x), _CMP_GE_OQ);
 414     fexppart  = _mm256_and_pd(valuemask, _mm256_castsi256_pd(iexppart));
 415
 416     z         = _mm256_sub_pd(exparg, _mm256_mul_pd(invargscale0, intpart));
 417     z         = _mm256_sub_pd(z, _mm256_mul_pd(invargscale1, intpart));
 418
 419     z2        = _mm256_mul_pd(z, z);
 420
 421     PolyQ     = _mm256_mul_pd(Q3, z2);
 422     PolyQ     = _mm256_add_pd(PolyQ, Q2);
 423     PolyP     = _mm256_mul_pd(P2, z2);
 424     PolyQ     = _mm256_mul_pd(PolyQ, z2);
 425     PolyP     = _mm256_add_pd(PolyP, P1);
 426     PolyQ     = _mm256_add_pd(PolyQ, Q1);
 427     PolyP     = _mm256_mul_pd(PolyP, z2);
 428     PolyQ     = _mm256_mul_pd(PolyQ, z2);
 429     PolyP     = _mm256_add_pd(PolyP, one);
 430     PolyQ     = _mm256_add_pd(PolyQ, two);
 431
 432     PolyP     = _mm256_mul_pd(PolyP, z);
 433
 434     z         = _mm256_mul_pd(PolyP, gmx_mm256_inv_pd(_mm256_sub_pd(PolyQ, PolyP)));
 435     z         = _mm256_add_pd(one, _mm256_mul_pd(two, z));
 436
 437     z         = _mm256_mul_pd(z, fexppart);
 438
 439     return z;
 440 }
 441
 442 /* exp(), 128 bit */
 443 static __m128d
 444 gmx_mm_exp_pd(__m128d exparg)
 445 {
 446     const __m128d argscale = _mm_set1_pd(1.4426950408889634073599);
 447     /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
 448     const __m128d arglimit = _mm_set1_pd(1022.0);
 449     const __m128i expbase  = _mm_set1_epi32(1023);
 450
 451     const __m128d invargscale0  = _mm_set1_pd(6.93145751953125e-1);
 452     const __m128d invargscale1  = _mm_set1_pd(1.42860682030941723212e-6);
 453
 454     const __m128d P2       = _mm_set1_pd(1.26177193074810590878e-4);
 455     const __m128d P1       = _mm_set1_pd(3.02994407707441961300e-2);
 456     /* P0 == 1.0 */
 457     const __m128d Q3       = _mm_set1_pd(3.00198505138664455042E-6);
 458     const __m128d Q2       = _mm_set1_pd(2.52448340349684104192E-3);
 459     const __m128d Q1       = _mm_set1_pd(2.27265548208155028766E-1);
 460     /* Q0 == 2.0 */
 461     const __m128d one      = _mm_set1_pd(1.0);
 462     const __m128d two      = _mm_set1_pd(2.0);
 463
 464     __m128d       valuemask;
 465     __m128i       iexppart;
 466     __m128d       fexppart;
 467     __m128d       intpart;
 468     __m128d       x, z, z2;
 469     __m128d       PolyP, PolyQ;
 470
 471     x             = _mm_mul_pd(exparg, argscale);
 472
 473     iexppart  = _mm_cvtpd_epi32(x);
 474     intpart   = _mm_round_pd(x, _MM_FROUND_TO_NEAREST_INT);
 475
 476     /* The two lowest elements of iexppart now contains 32-bit numbers with a correctly biased exponent.
 477      * To be able to shift it into the exponent for a double precision number we first need to
 478      * shuffle so that the lower half contains the first element, and the upper half the second.
 479      * This should really be done as a zero-extension, but since the next instructions will shift
 480      * the registers left by 52 bits it doesn't matter what we put there - it will be shifted out.
 481      * (thus we just use element 2 from iexppart).
 482      */
 483     iexppart  = _mm_shuffle_epi32(iexppart, _MM_SHUFFLE(2, 1, 2, 0));
 484
 485     /* Do the shift operation on the 64-bit registers */
 486     iexppart  = _mm_add_epi32(iexppart, expbase);
 487     iexppart  = _mm_slli_epi64(iexppart, 52);
 488
 489     valuemask = _mm_cmpge_pd(arglimit, gmx_mm_abs_pd(x));
 490     fexppart  = _mm_and_pd(valuemask, gmx_mm_castsi128_pd(iexppart));
 491
 492     z         = _mm_sub_pd(exparg, _mm_mul_pd(invargscale0, intpart));
 493     z         = _mm_sub_pd(z, _mm_mul_pd(invargscale1, intpart));
 494
 495     z2        = _mm_mul_pd(z, z);
 496
 497     PolyQ     = _mm_mul_pd(Q3, z2);
 498     PolyQ     = _mm_add_pd(PolyQ, Q2);
 499     PolyP     = _mm_mul_pd(P2, z2);
 500     PolyQ     = _mm_mul_pd(PolyQ, z2);
 501     PolyP     = _mm_add_pd(PolyP, P1);
 502     PolyQ     = _mm_add_pd(PolyQ, Q1);
 503     PolyP     = _mm_mul_pd(PolyP, z2);
 504     PolyQ     = _mm_mul_pd(PolyQ, z2);
 505     PolyP     = _mm_add_pd(PolyP, one);
 506     PolyQ     = _mm_add_pd(PolyQ, two);
 507
 508     PolyP     = _mm_mul_pd(PolyP, z);
 509
 510     z         = _mm_mul_pd(PolyP, gmx_mm_inv_pd(_mm_sub_pd(PolyQ, PolyP)));
 511     z         = _mm_add_pd(one, _mm_mul_pd(two, z));
 512
 513     z         = _mm_mul_pd(z, fexppart);
 514
 515     return z;
 516 }
 517
 518
 519 static __m256d
 520 gmx_mm256_log_pd(__m256d x)
 521 {
 522     /* Same algorithm as cephes library */
 523     const __m256d expmask    = _mm256_castsi256_pd( _mm256_set_epi32(0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000,
 524                                                                      0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000) );
 525
 526     const __m128i expbase_m1 = _mm_set1_epi32(1023-1); /* We want non-IEEE format */
 527
 528     const __m256d half       = _mm256_set1_pd(0.5);
 529     const __m256d one        = _mm256_set1_pd(1.0);
 530     const __m256d two        = _mm256_set1_pd(2.0);
 531     const __m256d invsq2     = _mm256_set1_pd(1.0/sqrt(2.0));
 532
 533     const __m256d corr1      = _mm256_set1_pd(-2.121944400546905827679e-4);
 534     const __m256d corr2      = _mm256_set1_pd(0.693359375);
 535
 536     const __m256d P5         = _mm256_set1_pd(1.01875663804580931796e-4);
 537     const __m256d P4         = _mm256_set1_pd(4.97494994976747001425e-1);
 538     const __m256d P3         = _mm256_set1_pd(4.70579119878881725854e0);
 539     const __m256d P2         = _mm256_set1_pd(1.44989225341610930846e1);
 540     const __m256d P1         = _mm256_set1_pd(1.79368678507819816313e1);
 541     const __m256d P0         = _mm256_set1_pd(7.70838733755885391666e0);
 542
 543     const __m256d Q4         = _mm256_set1_pd(1.12873587189167450590e1);
 544     const __m256d Q3         = _mm256_set1_pd(4.52279145837532221105e1);
 545     const __m256d Q2         = _mm256_set1_pd(8.29875266912776603211e1);
 546     const __m256d Q1         = _mm256_set1_pd(7.11544750618563894466e1);
 547     const __m256d Q0         = _mm256_set1_pd(2.31251620126765340583e1);
 548
 549     const __m256d R2         = _mm256_set1_pd(-7.89580278884799154124e-1);
 550     const __m256d R1         = _mm256_set1_pd(1.63866645699558079767e1);
 551     const __m256d R0         = _mm256_set1_pd(-6.41409952958715622951e1);
 552
 553     const __m256d S2         = _mm256_set1_pd(-3.56722798256324312549E1);
 554     const __m256d S1         = _mm256_set1_pd(3.12093766372244180303E2);
 555     const __m256d S0         = _mm256_set1_pd(-7.69691943550460008604E2);
 556
 557     __m256d       fexp;
 558     __m256i       iexp;
 559     __m128i       iexp128a, iexp128b;
 560
 561     __m256d       mask1, mask2;
 562     __m256d       corr, t1, t2, q;
 563     __m256d       zA, yA, xA, zB, yB, xB, z;
 564     __m256d       polyR, polyS;
 565     __m256d       polyP1, polyP2, polyQ1, polyQ2;
 566
 567     /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
 568     fexp     = _mm256_and_pd(x, expmask);
 569
 570     iexp     = _mm256_castpd_si256(fexp);
 571     iexp128b = _mm256_extractf128_si256(iexp, 0x1);
 572     iexp128a = _mm256_castsi256_si128(iexp);
 573
 574     iexp128a  = _mm_srli_epi64(iexp128a, 52);
 575     iexp128b  = _mm_srli_epi64(iexp128b, 52);
 576     /* Merge into a single register */
 577     iexp128a  = _mm_shuffle_epi32(iexp128a, _MM_SHUFFLE(1, 1, 2, 0));
 578     iexp128b  = _mm_shuffle_epi32(iexp128b, _MM_SHUFFLE(2, 0, 1, 1));
 579     iexp128a  = _mm_or_si128(iexp128a, iexp128b);
 580     iexp128a  = _mm_sub_epi32(iexp128a, expbase_m1);
 581
 582     fexp      = _mm256_cvtepi32_pd(iexp128a);
 583
 584     x         = _mm256_andnot_pd(expmask, x); /* Get mantissa */
 585     x         = _mm256_or_pd(x, one);
 586     x         = _mm256_mul_pd(x, half);
 587
 588     mask1     = _mm256_cmp_pd(gmx_mm256_abs_pd(fexp), two, _CMP_GT_OQ);
 589     mask2     = _mm256_cmp_pd(x, invsq2, _CMP_LT_OQ);
 590
 591     fexp      = _mm256_sub_pd(fexp, _mm256_and_pd(mask2, one));
 592
 593     /* If mask1 is set ('A') */
 594     zA     = _mm256_sub_pd(x, half);
 595     t1     = _mm256_blendv_pd( zA, x, mask2 );
 596     zA     = _mm256_sub_pd(t1, half);
 597     t2     = _mm256_blendv_pd( x, zA, mask2 );
 598     yA     = _mm256_mul_pd(half, _mm256_add_pd(t2, one));
 599
 600     xA     = _mm256_mul_pd(zA, gmx_mm256_inv_pd(yA));
 601     zA     = _mm256_mul_pd(xA, xA);
 602
 603     /* EVALUATE POLY */
 604     polyR  = _mm256_mul_pd(R2, zA);
 605     polyR  = _mm256_add_pd(polyR, R1);
 606     polyR  = _mm256_mul_pd(polyR, zA);
 607     polyR  = _mm256_add_pd(polyR, R0);
 608
 609     polyS  = _mm256_add_pd(zA, S2);
 610     polyS  = _mm256_mul_pd(polyS, zA);
 611     polyS  = _mm256_add_pd(polyS, S1);
 612     polyS  = _mm256_mul_pd(polyS, zA);
 613     polyS  = _mm256_add_pd(polyS, S0);
 614
 615     q      = _mm256_mul_pd(polyR, gmx_mm256_inv_pd(polyS));
 616     zA     = _mm256_mul_pd(_mm256_mul_pd(xA, zA), q);
 617
 618     zA     = _mm256_add_pd(zA, _mm256_mul_pd(corr1, fexp));
 619     zA     = _mm256_add_pd(zA, xA);
 620     zA     = _mm256_add_pd(zA, _mm256_mul_pd(corr2, fexp));
 621
 622     /* If mask1 is not set ('B') */
 623     corr   = _mm256_and_pd(mask2, x);
 624     xB     = _mm256_add_pd(x, corr);
 625     xB     = _mm256_sub_pd(xB, one);
 626     zB     = _mm256_mul_pd(xB, xB);
 627
 628     polyP1 = _mm256_mul_pd(P5, zB);
 629     polyP2 = _mm256_mul_pd(P4, zB);
 630     polyP1 = _mm256_add_pd(polyP1, P3);
 631     polyP2 = _mm256_add_pd(polyP2, P2);
 632     polyP1 = _mm256_mul_pd(polyP1, zB);
 633     polyP2 = _mm256_mul_pd(polyP2, zB);
 634     polyP1 = _mm256_add_pd(polyP1, P1);
 635     polyP2 = _mm256_add_pd(polyP2, P0);
 636     polyP1 = _mm256_mul_pd(polyP1, xB);
 637     polyP1 = _mm256_add_pd(polyP1, polyP2);
 638
 639     polyQ2 = _mm256_mul_pd(Q4, zB);
 640     polyQ1 = _mm256_add_pd(zB, Q3);
 641     polyQ2 = _mm256_add_pd(polyQ2, Q2);
 642     polyQ1 = _mm256_mul_pd(polyQ1, zB);
 643     polyQ2 = _mm256_mul_pd(polyQ2, zB);
 644     polyQ1 = _mm256_add_pd(polyQ1, Q1);
 645     polyQ2 = _mm256_add_pd(polyQ2, Q0);
 646     polyQ1 = _mm256_mul_pd(polyQ1, xB);
 647     polyQ1 = _mm256_add_pd(polyQ1, polyQ2);
 648
 649     fexp   = _mm256_and_pd(fexp, _mm256_cmp_pd(fexp, _mm256_setzero_pd(), _CMP_NEQ_OQ));
 650
 651     q      = _mm256_mul_pd(polyP1, gmx_mm256_inv_pd(polyQ1));
 652     yB     = _mm256_mul_pd(_mm256_mul_pd(xB, zB), q);
 653
 654     yB     = _mm256_add_pd(yB, _mm256_mul_pd(corr1, fexp));
 655     yB     = _mm256_sub_pd(yB, _mm256_mul_pd(half, zB));
 656     zB     = _mm256_add_pd(xB, yB);
 657     zB     = _mm256_add_pd(zB, _mm256_mul_pd(corr2, fexp));
 658
 659     z      = _mm256_blendv_pd( zB, zA, mask1 );
 660
 661     return z;
 662 }
 663
 664 static __m128d
 665 gmx_mm_log_pd(__m128d x)
 666 {
 667     /* Same algorithm as cephes library */
 668     const __m128d expmask    = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000) );
 669
 670     const __m128i expbase_m1 = _mm_set1_epi32(1023-1); /* We want non-IEEE format */
 671
 672     const __m128d half       = _mm_set1_pd(0.5);
 673     const __m128d one        = _mm_set1_pd(1.0);
 674     const __m128d two        = _mm_set1_pd(2.0);
 675     const __m128d invsq2     = _mm_set1_pd(1.0/sqrt(2.0));
 676
 677     const __m128d corr1      = _mm_set1_pd(-2.121944400546905827679e-4);
 678     const __m128d corr2      = _mm_set1_pd(0.693359375);
 679
 680     const __m128d P5         = _mm_set1_pd(1.01875663804580931796e-4);
 681     const __m128d P4         = _mm_set1_pd(4.97494994976747001425e-1);
 682     const __m128d P3         = _mm_set1_pd(4.70579119878881725854e0);
 683     const __m128d P2         = _mm_set1_pd(1.44989225341610930846e1);
 684     const __m128d P1         = _mm_set1_pd(1.79368678507819816313e1);
 685     const __m128d P0         = _mm_set1_pd(7.70838733755885391666e0);
 686
 687     const __m128d Q4         = _mm_set1_pd(1.12873587189167450590e1);
 688     const __m128d Q3         = _mm_set1_pd(4.52279145837532221105e1);
 689     const __m128d Q2         = _mm_set1_pd(8.29875266912776603211e1);
 690     const __m128d Q1         = _mm_set1_pd(7.11544750618563894466e1);
 691     const __m128d Q0         = _mm_set1_pd(2.31251620126765340583e1);
 692
 693     const __m128d R2         = _mm_set1_pd(-7.89580278884799154124e-1);
 694     const __m128d R1         = _mm_set1_pd(1.63866645699558079767e1);
 695     const __m128d R0         = _mm_set1_pd(-6.41409952958715622951e1);
 696
 697     const __m128d S2         = _mm_set1_pd(-3.56722798256324312549E1);
 698     const __m128d S1         = _mm_set1_pd(3.12093766372244180303E2);
 699     const __m128d S0         = _mm_set1_pd(-7.69691943550460008604E2);
 700
 701     __m128d       fexp;
 702     __m128i       iexp;
 703
 704     __m128d       mask1, mask2;
 705     __m128d       corr, t1, t2, q;
 706     __m128d       zA, yA, xA, zB, yB, xB, z;
 707     __m128d       polyR, polyS;
 708     __m128d       polyP1, polyP2, polyQ1, polyQ2;
 709
 710     /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
 711     fexp   = _mm_and_pd(x, expmask);
 712     iexp   = gmx_mm_castpd_si128(fexp);
 713     iexp   = _mm_srli_epi64(iexp, 52);
 714     iexp   = _mm_sub_epi32(iexp, expbase_m1);
 715     iexp   = _mm_shuffle_epi32(iexp, _MM_SHUFFLE(1, 1, 2, 0) );
 716     fexp   = _mm_cvtepi32_pd(iexp);
 717
 718     x      = _mm_andnot_pd(expmask, x);
 719     x      = _mm_or_pd(x, one);
 720     x      = _mm_mul_pd(x, half);
 721
 722     mask1     = _mm_cmpgt_pd(gmx_mm_abs_pd(fexp), two);
 723     mask2     = _mm_cmplt_pd(x, invsq2);
 724
 725     fexp   = _mm_sub_pd(fexp, _mm_and_pd(mask2, one));
 726
 727     /* If mask1 is set ('A') */
 728     zA     = _mm_sub_pd(x, half);
 729     t1     = _mm_blendv_pd( zA, x, mask2 );
 730     zA     = _mm_sub_pd(t1, half);
 731     t2     = _mm_blendv_pd( x, zA, mask2 );
 732     yA     = _mm_mul_pd(half, _mm_add_pd(t2, one));
 733
 734     xA     = _mm_mul_pd(zA, gmx_mm_inv_pd(yA));
 735     zA     = _mm_mul_pd(xA, xA);
 736
 737     /* EVALUATE POLY */
 738     polyR  = _mm_mul_pd(R2, zA);
 739     polyR  = _mm_add_pd(polyR, R1);
 740     polyR  = _mm_mul_pd(polyR, zA);
 741     polyR  = _mm_add_pd(polyR, R0);
 742
 743     polyS  = _mm_add_pd(zA, S2);
 744     polyS  = _mm_mul_pd(polyS, zA);
 745     polyS  = _mm_add_pd(polyS, S1);
 746     polyS  = _mm_mul_pd(polyS, zA);
 747     polyS  = _mm_add_pd(polyS, S0);
 748
 749     q      = _mm_mul_pd(polyR, gmx_mm_inv_pd(polyS));
 750     zA     = _mm_mul_pd(_mm_mul_pd(xA, zA), q);
 751
 752     zA     = _mm_add_pd(zA, _mm_mul_pd(corr1, fexp));
 753     zA     = _mm_add_pd(zA, xA);
 754     zA     = _mm_add_pd(zA, _mm_mul_pd(corr2, fexp));
 755
 756     /* If mask1 is not set ('B') */
 757     corr   = _mm_and_pd(mask2, x);
 758     xB     = _mm_add_pd(x, corr);
 759     xB     = _mm_sub_pd(xB, one);
 760     zB     = _mm_mul_pd(xB, xB);
 761
 762     polyP1 = _mm_mul_pd(P5, zB);
 763     polyP2 = _mm_mul_pd(P4, zB);
 764     polyP1 = _mm_add_pd(polyP1, P3);
 765     polyP2 = _mm_add_pd(polyP2, P2);
 766     polyP1 = _mm_mul_pd(polyP1, zB);
 767     polyP2 = _mm_mul_pd(polyP2, zB);
 768     polyP1 = _mm_add_pd(polyP1, P1);
 769     polyP2 = _mm_add_pd(polyP2, P0);
 770     polyP1 = _mm_mul_pd(polyP1, xB);
 771     polyP1 = _mm_add_pd(polyP1, polyP2);
 772
 773     polyQ2 = _mm_mul_pd(Q4, zB);
 774     polyQ1 = _mm_add_pd(zB, Q3);
 775     polyQ2 = _mm_add_pd(polyQ2, Q2);
 776     polyQ1 = _mm_mul_pd(polyQ1, zB);
 777     polyQ2 = _mm_mul_pd(polyQ2, zB);
 778     polyQ1 = _mm_add_pd(polyQ1, Q1);
 779     polyQ2 = _mm_add_pd(polyQ2, Q0);
 780     polyQ1 = _mm_mul_pd(polyQ1, xB);
 781     polyQ1 = _mm_add_pd(polyQ1, polyQ2);
 782
 783     fexp   = _mm_and_pd(fexp, _mm_cmpneq_pd(fexp, _mm_setzero_pd()));
 784
 785     q      = _mm_mul_pd(polyP1, gmx_mm_inv_pd(polyQ1));
 786     yB     = _mm_mul_pd(_mm_mul_pd(xB, zB), q);
 787
 788     yB     = _mm_add_pd(yB, _mm_mul_pd(corr1, fexp));
 789     yB     = _mm_sub_pd(yB, _mm_mul_pd(half, zB));
 790     zB     = _mm_add_pd(xB, yB);
 791     zB     = _mm_add_pd(zB, _mm_mul_pd(corr2, fexp));
 792
 793     z      = _mm_blendv_pd( zB, zA, mask1 );
 794
 795     return z;
 796 }
 797
 798
 799 static __m256d
 800 gmx_mm256_erf_pd(__m256d x)
 801 {
 802     /* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */
 803     const __m256d CAP4      = _mm256_set1_pd(-0.431780540597889301512e-4);
 804     const __m256d CAP3      = _mm256_set1_pd(-0.00578562306260059236059);
 805     const __m256d CAP2      = _mm256_set1_pd(-0.028593586920219752446);
 806     const __m256d CAP1      = _mm256_set1_pd(-0.315924962948621698209);
 807     const __m256d CAP0      = _mm256_set1_pd(0.14952975608477029151);
 808
 809     const __m256d CAQ5      = _mm256_set1_pd(-0.374089300177174709737e-5);
 810     const __m256d CAQ4      = _mm256_set1_pd(0.00015126584532155383535);
 811     const __m256d CAQ3      = _mm256_set1_pd(0.00536692680669480725423);
 812     const __m256d CAQ2      = _mm256_set1_pd(0.0668686825594046122636);
 813     const __m256d CAQ1      = _mm256_set1_pd(0.402604990869284362773);
 814     /* CAQ0 == 1.0 */
 815     const __m256d CAoffset  = _mm256_set1_pd(0.9788494110107421875);
 816
 817     /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5] */
 818     const __m256d CBP6      = _mm256_set1_pd(2.49650423685462752497647637088e-10);
 819     const __m256d CBP5      = _mm256_set1_pd(0.00119770193298159629350136085658);
 820     const __m256d CBP4      = _mm256_set1_pd(0.0164944422378370965881008942733);
 821     const __m256d CBP3      = _mm256_set1_pd(0.0984581468691775932063932439252);
 822     const __m256d CBP2      = _mm256_set1_pd(0.317364595806937763843589437418);
 823     const __m256d CBP1      = _mm256_set1_pd(0.554167062641455850932670067075);
 824     const __m256d CBP0      = _mm256_set1_pd(0.427583576155807163756925301060);
 825     const __m256d CBQ7      = _mm256_set1_pd(0.00212288829699830145976198384930);
 826     const __m256d CBQ6      = _mm256_set1_pd(0.0334810979522685300554606393425);
 827     const __m256d CBQ5      = _mm256_set1_pd(0.2361713785181450957579508850717);
 828     const __m256d CBQ4      = _mm256_set1_pd(0.955364736493055670530981883072);
 829     const __m256d CBQ3      = _mm256_set1_pd(2.36815675631420037315349279199);
 830     const __m256d CBQ2      = _mm256_set1_pd(3.55261649184083035537184223542);
 831     const __m256d CBQ1      = _mm256_set1_pd(2.93501136050160872574376997993);
 832     /* CBQ0 == 1.0 */
 833
 834     /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf] */
 835     const __m256d CCP6      = _mm256_set1_pd(-2.8175401114513378771);
 836     const __m256d CCP5      = _mm256_set1_pd(-3.22729451764143718517);
 837     const __m256d CCP4      = _mm256_set1_pd(-2.5518551727311523996);
 838     const __m256d CCP3      = _mm256_set1_pd(-0.687717681153649930619);
 839     const __m256d CCP2      = _mm256_set1_pd(-0.212652252872804219852);
 840     const __m256d CCP1      = _mm256_set1_pd(0.0175389834052493308818);
 841     const __m256d CCP0      = _mm256_set1_pd(0.00628057170626964891937);
 842
 843     const __m256d CCQ6      = _mm256_set1_pd(5.48409182238641741584);
 844     const __m256d CCQ5      = _mm256_set1_pd(13.5064170191802889145);
 845     const __m256d CCQ4      = _mm256_set1_pd(22.9367376522880577224);
 846     const __m256d CCQ3      = _mm256_set1_pd(15.930646027911794143);
 847     const __m256d CCQ2      = _mm256_set1_pd(11.0567237927800161565);
 848     const __m256d CCQ1      = _mm256_set1_pd(2.79257750980575282228);
 849     /* CCQ0 == 1.0 */
 850     const __m256d CCoffset  = _mm256_set1_pd(0.5579090118408203125);
 851
 852     const __m256d one       = _mm256_set1_pd(1.0);
 853     const __m256d two       = _mm256_set1_pd(2.0);
 854
 855     const __m256d signbit   = _mm256_castsi256_pd( _mm256_set_epi32(0x80000000, 0x00000000, 0x80000000, 0x00000000,
 856                                                                     0x80000000, 0x00000000, 0x80000000, 0x00000000) );
 857
 858     __m256d xabs, x2, x4, t, t2, w, w2;
 859     __m256d PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
 860     __m256d PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
 861     __m256d PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
 862     __m256d res_erf, res_erfcB, res_erfcC, res_erfc, res;
 863     __m256d mask, expmx2;
 864
 865     /* Calculate erf() */
 866     xabs     = gmx_mm256_abs_pd(x);
 867     x2       = _mm256_mul_pd(x, x);
 868     x4       = _mm256_mul_pd(x2, x2);
 869
 870     PolyAP0  = _mm256_mul_pd(CAP4, x4);
 871     PolyAP1  = _mm256_mul_pd(CAP3, x4);
 872     PolyAP0  = _mm256_add_pd(PolyAP0, CAP2);
 873     PolyAP1  = _mm256_add_pd(PolyAP1, CAP1);
 874     PolyAP0  = _mm256_mul_pd(PolyAP0, x4);
 875     PolyAP1  = _mm256_mul_pd(PolyAP1, x2);
 876     PolyAP0  = _mm256_add_pd(PolyAP0, CAP0);
 877     PolyAP0  = _mm256_add_pd(PolyAP0, PolyAP1);
 878
 879     PolyAQ1  = _mm256_mul_pd(CAQ5, x4);
 880     PolyAQ0  = _mm256_mul_pd(CAQ4, x4);
 881     PolyAQ1  = _mm256_add_pd(PolyAQ1, CAQ3);
 882     PolyAQ0  = _mm256_add_pd(PolyAQ0, CAQ2);
 883     PolyAQ1  = _mm256_mul_pd(PolyAQ1, x4);
 884     PolyAQ0  = _mm256_mul_pd(PolyAQ0, x4);
 885     PolyAQ1  = _mm256_add_pd(PolyAQ1, CAQ1);
 886     PolyAQ0  = _mm256_add_pd(PolyAQ0, one);
 887     PolyAQ1  = _mm256_mul_pd(PolyAQ1, x2);
 888     PolyAQ0  = _mm256_add_pd(PolyAQ0, PolyAQ1);
 889
 890     res_erf  = _mm256_mul_pd(PolyAP0, gmx_mm256_inv_pd(PolyAQ0));
 891     res_erf  = _mm256_add_pd(CAoffset, res_erf);
 892     res_erf  = _mm256_mul_pd(x, res_erf);
 893
 894     /* Calculate erfc() in range [1,4.5] */
 895     t       = _mm256_sub_pd(xabs, one);
 896     t2      = _mm256_mul_pd(t, t);
 897
 898     PolyBP0  = _mm256_mul_pd(CBP6, t2);
 899     PolyBP1  = _mm256_mul_pd(CBP5, t2);
 900     PolyBP0  = _mm256_add_pd(PolyBP0, CBP4);
 901     PolyBP1  = _mm256_add_pd(PolyBP1, CBP3);
 902     PolyBP0  = _mm256_mul_pd(PolyBP0, t2);
 903     PolyBP1  = _mm256_mul_pd(PolyBP1, t2);
 904     PolyBP0  = _mm256_add_pd(PolyBP0, CBP2);
 905     PolyBP1  = _mm256_add_pd(PolyBP1, CBP1);
 906     PolyBP0  = _mm256_mul_pd(PolyBP0, t2);
 907     PolyBP1  = _mm256_mul_pd(PolyBP1, t);
 908     PolyBP0  = _mm256_add_pd(PolyBP0, CBP0);
 909     PolyBP0  = _mm256_add_pd(PolyBP0, PolyBP1);
 910
 911     PolyBQ1 = _mm256_mul_pd(CBQ7, t2);
 912     PolyBQ0 = _mm256_mul_pd(CBQ6, t2);
 913     PolyBQ1 = _mm256_add_pd(PolyBQ1, CBQ5);
 914     PolyBQ0 = _mm256_add_pd(PolyBQ0, CBQ4);
 915     PolyBQ1 = _mm256_mul_pd(PolyBQ1, t2);
 916     PolyBQ0 = _mm256_mul_pd(PolyBQ0, t2);
 917     PolyBQ1 = _mm256_add_pd(PolyBQ1, CBQ3);
 918     PolyBQ0 = _mm256_add_pd(PolyBQ0, CBQ2);
 919     PolyBQ1 = _mm256_mul_pd(PolyBQ1, t2);
 920     PolyBQ0 = _mm256_mul_pd(PolyBQ0, t2);
 921     PolyBQ1 = _mm256_add_pd(PolyBQ1, CBQ1);
 922     PolyBQ0 = _mm256_add_pd(PolyBQ0, one);
 923     PolyBQ1 = _mm256_mul_pd(PolyBQ1, t);
 924     PolyBQ0 = _mm256_add_pd(PolyBQ0, PolyBQ1);
 925
 926     res_erfcB = _mm256_mul_pd(PolyBP0, gmx_mm256_inv_pd(PolyBQ0));
 927
 928     res_erfcB = _mm256_mul_pd(res_erfcB, xabs);
 929
 930     /* Calculate erfc() in range [4.5,inf] */
 931     w       = gmx_mm256_inv_pd(xabs);
 932     w2      = _mm256_mul_pd(w, w);
 933
 934     PolyCP0  = _mm256_mul_pd(CCP6, w2);
 935     PolyCP1  = _mm256_mul_pd(CCP5, w2);
 936     PolyCP0  = _mm256_add_pd(PolyCP0, CCP4);
 937     PolyCP1  = _mm256_add_pd(PolyCP1, CCP3);
 938     PolyCP0  = _mm256_mul_pd(PolyCP0, w2);
 939     PolyCP1  = _mm256_mul_pd(PolyCP1, w2);
 940     PolyCP0  = _mm256_add_pd(PolyCP0, CCP2);
 941     PolyCP1  = _mm256_add_pd(PolyCP1, CCP1);
 942     PolyCP0  = _mm256_mul_pd(PolyCP0, w2);
 943     PolyCP1  = _mm256_mul_pd(PolyCP1, w);
 944     PolyCP0  = _mm256_add_pd(PolyCP0, CCP0);
 945     PolyCP0  = _mm256_add_pd(PolyCP0, PolyCP1);
 946
 947     PolyCQ0  = _mm256_mul_pd(CCQ6, w2);
 948     PolyCQ1  = _mm256_mul_pd(CCQ5, w2);
 949     PolyCQ0  = _mm256_add_pd(PolyCQ0, CCQ4);
 950     PolyCQ1  = _mm256_add_pd(PolyCQ1, CCQ3);
 951     PolyCQ0  = _mm256_mul_pd(PolyCQ0, w2);
 952     PolyCQ1  = _mm256_mul_pd(PolyCQ1, w2);
 953     PolyCQ0  = _mm256_add_pd(PolyCQ0, CCQ2);
 954     PolyCQ1  = _mm256_add_pd(PolyCQ1, CCQ1);
 955     PolyCQ0  = _mm256_mul_pd(PolyCQ0, w2);
 956     PolyCQ1  = _mm256_mul_pd(PolyCQ1, w);
 957     PolyCQ0  = _mm256_add_pd(PolyCQ0, one);
 958     PolyCQ0  = _mm256_add_pd(PolyCQ0, PolyCQ1);
 959
 960     expmx2   = gmx_mm256_exp_pd( _mm256_or_pd(signbit, x2) );
 961
 962     res_erfcC = _mm256_mul_pd(PolyCP0, gmx_mm256_inv_pd(PolyCQ0));
 963     res_erfcC = _mm256_add_pd(res_erfcC, CCoffset);
 964     res_erfcC = _mm256_mul_pd(res_erfcC, w);
 965
 966     mask     = _mm256_cmp_pd(xabs, _mm256_set1_pd(4.5), _CMP_GT_OQ);
 967     res_erfc = _mm256_blendv_pd(res_erfcB, res_erfcC, mask);
 968
 969     res_erfc = _mm256_mul_pd(res_erfc, expmx2);
 970
 971     /* erfc(x<0) = 2-erfc(|x|) */
 972     mask     = _mm256_cmp_pd(x, _mm256_setzero_pd(), _CMP_LT_OQ);
 973     res_erfc = _mm256_blendv_pd(res_erfc, _mm256_sub_pd(two, res_erfc), mask);
 974
 975     /* Select erf() or erfc() */
 976     mask = _mm256_cmp_pd(xabs, one, _CMP_LT_OQ);
 977     res  = _mm256_blendv_pd(_mm256_sub_pd(one, res_erfc), res_erf, mask);
 978
 979     return res;
 980 }
 981
 982 static __m128d
 983 gmx_mm_erf_pd(__m128d x)
 984 {
 985     /* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */
 986     const __m128d CAP4      = _mm_set1_pd(-0.431780540597889301512e-4);
 987     const __m128d CAP3      = _mm_set1_pd(-0.00578562306260059236059);
 988     const __m128d CAP2      = _mm_set1_pd(-0.028593586920219752446);
 989     const __m128d CAP1      = _mm_set1_pd(-0.315924962948621698209);
 990     const __m128d CAP0      = _mm_set1_pd(0.14952975608477029151);
 991
 992     const __m128d CAQ5      = _mm_set1_pd(-0.374089300177174709737e-5);
 993     const __m128d CAQ4      = _mm_set1_pd(0.00015126584532155383535);
 994     const __m128d CAQ3      = _mm_set1_pd(0.00536692680669480725423);
 995     const __m128d CAQ2      = _mm_set1_pd(0.0668686825594046122636);
 996     const __m128d CAQ1      = _mm_set1_pd(0.402604990869284362773);
 997     /* CAQ0 == 1.0 */
 998     const __m128d CAoffset  = _mm_set1_pd(0.9788494110107421875);
 999
1000     /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5] */
1001     const __m128d CBP6      = _mm_set1_pd(2.49650423685462752497647637088e-10);
1002     const __m128d CBP5      = _mm_set1_pd(0.00119770193298159629350136085658);
1003     const __m128d CBP4      = _mm_set1_pd(0.0164944422378370965881008942733);
1004     const __m128d CBP3      = _mm_set1_pd(0.0984581468691775932063932439252);
1005     const __m128d CBP2      = _mm_set1_pd(0.317364595806937763843589437418);
1006     const __m128d CBP1      = _mm_set1_pd(0.554167062641455850932670067075);
1007     const __m128d CBP0      = _mm_set1_pd(0.427583576155807163756925301060);
1008     const __m128d CBQ7      = _mm_set1_pd(0.00212288829699830145976198384930);
1009     const __m128d CBQ6      = _mm_set1_pd(0.0334810979522685300554606393425);
1010     const __m128d CBQ5      = _mm_set1_pd(0.2361713785181450957579508850717);
1011     const __m128d CBQ4      = _mm_set1_pd(0.955364736493055670530981883072);
1012     const __m128d CBQ3      = _mm_set1_pd(2.36815675631420037315349279199);
1013     const __m128d CBQ2      = _mm_set1_pd(3.55261649184083035537184223542);
1014     const __m128d CBQ1      = _mm_set1_pd(2.93501136050160872574376997993);
1015     /* CBQ0 == 1.0 */
1016
1017     /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf] */
1018     const __m128d CCP6      = _mm_set1_pd(-2.8175401114513378771);
1019     const __m128d CCP5      = _mm_set1_pd(-3.22729451764143718517);
1020     const __m128d CCP4      = _mm_set1_pd(-2.5518551727311523996);
1021     const __m128d CCP3      = _mm_set1_pd(-0.687717681153649930619);
1022     const __m128d CCP2      = _mm_set1_pd(-0.212652252872804219852);
1023     const __m128d CCP1      = _mm_set1_pd(0.0175389834052493308818);
1024     const __m128d CCP0      = _mm_set1_pd(0.00628057170626964891937);
1025
1026     const __m128d CCQ6      = _mm_set1_pd(5.48409182238641741584);
1027     const __m128d CCQ5      = _mm_set1_pd(13.5064170191802889145);
1028     const __m128d CCQ4      = _mm_set1_pd(22.9367376522880577224);
1029     const __m128d CCQ3      = _mm_set1_pd(15.930646027911794143);
1030     const __m128d CCQ2      = _mm_set1_pd(11.0567237927800161565);
1031     const __m128d CCQ1      = _mm_set1_pd(2.79257750980575282228);
1032     /* CCQ0 == 1.0 */
1033     const __m128d CCoffset  = _mm_set1_pd(0.5579090118408203125);
1034
1035     const __m128d one       = _mm_set1_pd(1.0);
1036     const __m128d two       = _mm_set1_pd(2.0);
1037
1038     const __m128d signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000, 0x00000000, 0x80000000, 0x00000000) );
1039
1040     __m128d       xabs, x2, x4, t, t2, w, w2;
1041     __m128d       PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
1042     __m128d       PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
1043     __m128d       PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
1044     __m128d       res_erf, res_erfcB, res_erfcC, res_erfc, res;
1045     __m128d       mask, expmx2;
1046
1047     /* Calculate erf() */
1048     xabs     = gmx_mm_abs_pd(x);
1049     x2       = _mm_mul_pd(x, x);
1050     x4       = _mm_mul_pd(x2, x2);
1051
1052     PolyAP0  = _mm_mul_pd(CAP4, x4);
1053     PolyAP1  = _mm_mul_pd(CAP3, x4);
1054     PolyAP0  = _mm_add_pd(PolyAP0, CAP2);
1055     PolyAP1  = _mm_add_pd(PolyAP1, CAP1);
1056     PolyAP0  = _mm_mul_pd(PolyAP0, x4);
1057     PolyAP1  = _mm_mul_pd(PolyAP1, x2);
1058     PolyAP0  = _mm_add_pd(PolyAP0, CAP0);
1059     PolyAP0  = _mm_add_pd(PolyAP0, PolyAP1);
1060
1061     PolyAQ1  = _mm_mul_pd(CAQ5, x4);
1062     PolyAQ0  = _mm_mul_pd(CAQ4, x4);
1063     PolyAQ1  = _mm_add_pd(PolyAQ1, CAQ3);
1064     PolyAQ0  = _mm_add_pd(PolyAQ0, CAQ2);
1065     PolyAQ1  = _mm_mul_pd(PolyAQ1, x4);
1066     PolyAQ0  = _mm_mul_pd(PolyAQ0, x4);
1067     PolyAQ1  = _mm_add_pd(PolyAQ1, CAQ1);
1068     PolyAQ0  = _mm_add_pd(PolyAQ0, one);
1069     PolyAQ1  = _mm_mul_pd(PolyAQ1, x2);
1070     PolyAQ0  = _mm_add_pd(PolyAQ0, PolyAQ1);
1071
1072     res_erf  = _mm_mul_pd(PolyAP0, gmx_mm_inv_pd(PolyAQ0));
1073     res_erf  = _mm_add_pd(CAoffset, res_erf);
1074     res_erf  = _mm_mul_pd(x, res_erf);
1075
1076     /* Calculate erfc() in range [1,4.5] */
1077     t       = _mm_sub_pd(xabs, one);
1078     t2      = _mm_mul_pd(t, t);
1079
1080     PolyBP0  = _mm_mul_pd(CBP6, t2);
1081     PolyBP1  = _mm_mul_pd(CBP5, t2);
1082     PolyBP0  = _mm_add_pd(PolyBP0, CBP4);
1083     PolyBP1  = _mm_add_pd(PolyBP1, CBP3);
1084     PolyBP0  = _mm_mul_pd(PolyBP0, t2);
1085     PolyBP1  = _mm_mul_pd(PolyBP1, t2);
1086     PolyBP0  = _mm_add_pd(PolyBP0, CBP2);
1087     PolyBP1  = _mm_add_pd(PolyBP1, CBP1);
1088     PolyBP0  = _mm_mul_pd(PolyBP0, t2);
1089     PolyBP1  = _mm_mul_pd(PolyBP1, t);
1090     PolyBP0  = _mm_add_pd(PolyBP0, CBP0);
1091     PolyBP0  = _mm_add_pd(PolyBP0, PolyBP1);
1092
1093     PolyBQ1 = _mm_mul_pd(CBQ7, t2);
1094     PolyBQ0 = _mm_mul_pd(CBQ6, t2);
1095     PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ5);
1096     PolyBQ0 = _mm_add_pd(PolyBQ0, CBQ4);
1097     PolyBQ1 = _mm_mul_pd(PolyBQ1, t2);
1098     PolyBQ0 = _mm_mul_pd(PolyBQ0, t2);
1099     PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ3);
1100     PolyBQ0 = _mm_add_pd(PolyBQ0, CBQ2);
1101     PolyBQ1 = _mm_mul_pd(PolyBQ1, t2);
1102     PolyBQ0 = _mm_mul_pd(PolyBQ0, t2);
1103     PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ1);
1104     PolyBQ0 = _mm_add_pd(PolyBQ0, one);
1105     PolyBQ1 = _mm_mul_pd(PolyBQ1, t);
1106     PolyBQ0 = _mm_add_pd(PolyBQ0, PolyBQ1);
1107
1108     res_erfcB = _mm_mul_pd(PolyBP0, gmx_mm_inv_pd(PolyBQ0));
1109
1110     res_erfcB = _mm_mul_pd(res_erfcB, xabs);
1111
1112     /* Calculate erfc() in range [4.5,inf] */
1113     w       = gmx_mm_inv_pd(xabs);
1114     w2      = _mm_mul_pd(w, w);
1115
1116     PolyCP0  = _mm_mul_pd(CCP6, w2);
1117     PolyCP1  = _mm_mul_pd(CCP5, w2);
1118     PolyCP0  = _mm_add_pd(PolyCP0, CCP4);
1119     PolyCP1  = _mm_add_pd(PolyCP1, CCP3);
1120     PolyCP0  = _mm_mul_pd(PolyCP0, w2);
1121     PolyCP1  = _mm_mul_pd(PolyCP1, w2);
1122     PolyCP0  = _mm_add_pd(PolyCP0, CCP2);
1123     PolyCP1  = _mm_add_pd(PolyCP1, CCP1);
1124     PolyCP0  = _mm_mul_pd(PolyCP0, w2);
1125     PolyCP1  = _mm_mul_pd(PolyCP1, w);
1126     PolyCP0  = _mm_add_pd(PolyCP0, CCP0);
1127     PolyCP0  = _mm_add_pd(PolyCP0, PolyCP1);
1128
1129     PolyCQ0  = _mm_mul_pd(CCQ6, w2);
1130     PolyCQ1  = _mm_mul_pd(CCQ5, w2);
1131     PolyCQ0  = _mm_add_pd(PolyCQ0, CCQ4);
1132     PolyCQ1  = _mm_add_pd(PolyCQ1, CCQ3);
1133     PolyCQ0  = _mm_mul_pd(PolyCQ0, w2);
1134     PolyCQ1  = _mm_mul_pd(PolyCQ1, w2);
1135     PolyCQ0  = _mm_add_pd(PolyCQ0, CCQ2);
1136     PolyCQ1  = _mm_add_pd(PolyCQ1, CCQ1);
1137     PolyCQ0  = _mm_mul_pd(PolyCQ0, w2);
1138     PolyCQ1  = _mm_mul_pd(PolyCQ1, w);
1139     PolyCQ0  = _mm_add_pd(PolyCQ0, one);
1140     PolyCQ0  = _mm_add_pd(PolyCQ0, PolyCQ1);
1141
1142     expmx2   = gmx_mm_exp_pd( _mm_or_pd(signbit, x2) );
1143
1144     res_erfcC = _mm_mul_pd(PolyCP0, gmx_mm_inv_pd(PolyCQ0));
1145     res_erfcC = _mm_add_pd(res_erfcC, CCoffset);
1146     res_erfcC = _mm_mul_pd(res_erfcC, w);
1147
1148     mask     = _mm_cmpgt_pd(xabs, _mm_set1_pd(4.5));
1149     res_erfc = _mm_blendv_pd(res_erfcB, res_erfcC, mask);
1150
1151     res_erfc = _mm_mul_pd(res_erfc, expmx2);
1152
1153     /* erfc(x<0) = 2-erfc(|x|) */
1154     mask     = _mm_cmplt_pd(x, _mm_setzero_pd());
1155     res_erfc = _mm_blendv_pd(res_erfc, _mm_sub_pd(two, res_erfc), mask);
1156
1157     /* Select erf() or erfc() */
1158     mask = _mm_cmplt_pd(xabs, one);
1159     res  = _mm_blendv_pd(_mm_sub_pd(one, res_erfc), res_erf, mask);
1160
1161     return res;
1162 }
1163
1164
1165 static __m256d
1166 gmx_mm256_erfc_pd(__m256d x)
1167 {
1168     /* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */
1169     const __m256d CAP4      = _mm256_set1_pd(-0.431780540597889301512e-4);
1170     const __m256d CAP3      = _mm256_set1_pd(-0.00578562306260059236059);
1171     const __m256d CAP2      = _mm256_set1_pd(-0.028593586920219752446);
1172     const __m256d CAP1      = _mm256_set1_pd(-0.315924962948621698209);
1173     const __m256d CAP0      = _mm256_set1_pd(0.14952975608477029151);
1174
1175     const __m256d CAQ5      = _mm256_set1_pd(-0.374089300177174709737e-5);
1176     const __m256d CAQ4      = _mm256_set1_pd(0.00015126584532155383535);
1177     const __m256d CAQ3      = _mm256_set1_pd(0.00536692680669480725423);
1178     const __m256d CAQ2      = _mm256_set1_pd(0.0668686825594046122636);
1179     const __m256d CAQ1      = _mm256_set1_pd(0.402604990869284362773);
1180     /* CAQ0 == 1.0 */
1181     const __m256d CAoffset  = _mm256_set1_pd(0.9788494110107421875);
1182
1183     /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5] */
1184     const __m256d CBP6      = _mm256_set1_pd(2.49650423685462752497647637088e-10);
1185     const __m256d CBP5      = _mm256_set1_pd(0.00119770193298159629350136085658);
1186     const __m256d CBP4      = _mm256_set1_pd(0.0164944422378370965881008942733);
1187     const __m256d CBP3      = _mm256_set1_pd(0.0984581468691775932063932439252);
1188     const __m256d CBP2      = _mm256_set1_pd(0.317364595806937763843589437418);
1189     const __m256d CBP1      = _mm256_set1_pd(0.554167062641455850932670067075);
1190     const __m256d CBP0      = _mm256_set1_pd(0.427583576155807163756925301060);
1191     const __m256d CBQ7      = _mm256_set1_pd(0.00212288829699830145976198384930);
1192     const __m256d CBQ6      = _mm256_set1_pd(0.0334810979522685300554606393425);
1193     const __m256d CBQ5      = _mm256_set1_pd(0.2361713785181450957579508850717);
1194     const __m256d CBQ4      = _mm256_set1_pd(0.955364736493055670530981883072);
1195     const __m256d CBQ3      = _mm256_set1_pd(2.36815675631420037315349279199);
1196     const __m256d CBQ2      = _mm256_set1_pd(3.55261649184083035537184223542);
1197     const __m256d CBQ1      = _mm256_set1_pd(2.93501136050160872574376997993);
1198     /* CBQ0 == 1.0 */
1199
1200     /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf] */
1201     const __m256d CCP6      = _mm256_set1_pd(-2.8175401114513378771);
1202     const __m256d CCP5      = _mm256_set1_pd(-3.22729451764143718517);
1203     const __m256d CCP4      = _mm256_set1_pd(-2.5518551727311523996);
1204     const __m256d CCP3      = _mm256_set1_pd(-0.687717681153649930619);
1205     const __m256d CCP2      = _mm256_set1_pd(-0.212652252872804219852);
1206     const __m256d CCP1      = _mm256_set1_pd(0.0175389834052493308818);
1207     const __m256d CCP0      = _mm256_set1_pd(0.00628057170626964891937);
1208
1209     const __m256d CCQ6      = _mm256_set1_pd(5.48409182238641741584);
1210     const __m256d CCQ5      = _mm256_set1_pd(13.5064170191802889145);
1211     const __m256d CCQ4      = _mm256_set1_pd(22.9367376522880577224);
1212     const __m256d CCQ3      = _mm256_set1_pd(15.930646027911794143);
1213     const __m256d CCQ2      = _mm256_set1_pd(11.0567237927800161565);
1214     const __m256d CCQ1      = _mm256_set1_pd(2.79257750980575282228);
1215     /* CCQ0 == 1.0 */
1216     const __m256d CCoffset  = _mm256_set1_pd(0.5579090118408203125);
1217
1218     const __m256d one       = _mm256_set1_pd(1.0);
1219     const __m256d two       = _mm256_set1_pd(2.0);
1220
1221     const __m256d signbit   = _mm256_castsi256_pd( _mm256_set_epi32(0x80000000, 0x00000000, 0x80000000, 0x00000000,
1222                                                                     0x80000000, 0x00000000, 0x80000000, 0x00000000) );
1223
1224     __m256d xabs, x2, x4, t, t2, w, w2;
1225     __m256d PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
1226     __m256d PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
1227     __m256d PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
1228     __m256d res_erf, res_erfcB, res_erfcC, res_erfc, res;
1229     __m256d mask, expmx2;
1230
1231     /* Calculate erf() */
1232     xabs     = gmx_mm256_abs_pd(x);
1233     x2       = _mm256_mul_pd(x, x);
1234     x4       = _mm256_mul_pd(x2, x2);
1235
1236     PolyAP0  = _mm256_mul_pd(CAP4, x4);
1237     PolyAP1  = _mm256_mul_pd(CAP3, x4);
1238     PolyAP0  = _mm256_add_pd(PolyAP0, CAP2);
1239     PolyAP1  = _mm256_add_pd(PolyAP1, CAP1);
1240     PolyAP0  = _mm256_mul_pd(PolyAP0, x4);
1241     PolyAP1  = _mm256_mul_pd(PolyAP1, x2);
1242     PolyAP0  = _mm256_add_pd(PolyAP0, CAP0);
1243     PolyAP0  = _mm256_add_pd(PolyAP0, PolyAP1);
1244
1245     PolyAQ1  = _mm256_mul_pd(CAQ5, x4);
1246     PolyAQ0  = _mm256_mul_pd(CAQ4, x4);
1247     PolyAQ1  = _mm256_add_pd(PolyAQ1, CAQ3);
1248     PolyAQ0  = _mm256_add_pd(PolyAQ0, CAQ2);
1249     PolyAQ1  = _mm256_mul_pd(PolyAQ1, x4);
1250     PolyAQ0  = _mm256_mul_pd(PolyAQ0, x4);
1251     PolyAQ1  = _mm256_add_pd(PolyAQ1, CAQ1);
1252     PolyAQ0  = _mm256_add_pd(PolyAQ0, one);
1253     PolyAQ1  = _mm256_mul_pd(PolyAQ1, x2);
1254     PolyAQ0  = _mm256_add_pd(PolyAQ0, PolyAQ1);
1255
1256     res_erf  = _mm256_mul_pd(PolyAP0, gmx_mm256_inv_pd(PolyAQ0));
1257     res_erf  = _mm256_add_pd(CAoffset, res_erf);
1258     res_erf  = _mm256_mul_pd(x, res_erf);
1259
1260     /* Calculate erfc() in range [1,4.5] */
1261     t       = _mm256_sub_pd(xabs, one);
1262     t2      = _mm256_mul_pd(t, t);
1263
1264     PolyBP0  = _mm256_mul_pd(CBP6, t2);
1265     PolyBP1  = _mm256_mul_pd(CBP5, t2);
1266     PolyBP0  = _mm256_add_pd(PolyBP0, CBP4);
1267     PolyBP1  = _mm256_add_pd(PolyBP1, CBP3);
1268     PolyBP0  = _mm256_mul_pd(PolyBP0, t2);
1269     PolyBP1  = _mm256_mul_pd(PolyBP1, t2);
1270     PolyBP0  = _mm256_add_pd(PolyBP0, CBP2);
1271     PolyBP1  = _mm256_add_pd(PolyBP1, CBP1);
1272     PolyBP0  = _mm256_mul_pd(PolyBP0, t2);
1273     PolyBP1  = _mm256_mul_pd(PolyBP1, t);
1274     PolyBP0  = _mm256_add_pd(PolyBP0, CBP0);
1275     PolyBP0  = _mm256_add_pd(PolyBP0, PolyBP1);
1276
1277     PolyBQ1 = _mm256_mul_pd(CBQ7, t2);
1278     PolyBQ0 = _mm256_mul_pd(CBQ6, t2);
1279     PolyBQ1 = _mm256_add_pd(PolyBQ1, CBQ5);
1280     PolyBQ0 = _mm256_add_pd(PolyBQ0, CBQ4);
1281     PolyBQ1 = _mm256_mul_pd(PolyBQ1, t2);
1282     PolyBQ0 = _mm256_mul_pd(PolyBQ0, t2);
1283     PolyBQ1 = _mm256_add_pd(PolyBQ1, CBQ3);
1284     PolyBQ0 = _mm256_add_pd(PolyBQ0, CBQ2);
1285     PolyBQ1 = _mm256_mul_pd(PolyBQ1, t2);
1286     PolyBQ0 = _mm256_mul_pd(PolyBQ0, t2);
1287     PolyBQ1 = _mm256_add_pd(PolyBQ1, CBQ1);
1288     PolyBQ0 = _mm256_add_pd(PolyBQ0, one);
1289     PolyBQ1 = _mm256_mul_pd(PolyBQ1, t);
1290     PolyBQ0 = _mm256_add_pd(PolyBQ0, PolyBQ1);
1291
1292     res_erfcB = _mm256_mul_pd(PolyBP0, gmx_mm256_inv_pd(PolyBQ0));
1293
1294     res_erfcB = _mm256_mul_pd(res_erfcB, xabs);
1295
1296     /* Calculate erfc() in range [4.5,inf] */
1297     w       = gmx_mm256_inv_pd(xabs);
1298     w2      = _mm256_mul_pd(w, w);
1299
1300     PolyCP0  = _mm256_mul_pd(CCP6, w2);
1301     PolyCP1  = _mm256_mul_pd(CCP5, w2);
1302     PolyCP0  = _mm256_add_pd(PolyCP0, CCP4);
1303     PolyCP1  = _mm256_add_pd(PolyCP1, CCP3);
1304     PolyCP0  = _mm256_mul_pd(PolyCP0, w2);
1305     PolyCP1  = _mm256_mul_pd(PolyCP1, w2);
1306     PolyCP0  = _mm256_add_pd(PolyCP0, CCP2);
1307     PolyCP1  = _mm256_add_pd(PolyCP1, CCP1);
1308     PolyCP0  = _mm256_mul_pd(PolyCP0, w2);
1309     PolyCP1  = _mm256_mul_pd(PolyCP1, w);
1310     PolyCP0  = _mm256_add_pd(PolyCP0, CCP0);
1311     PolyCP0  = _mm256_add_pd(PolyCP0, PolyCP1);
1312
1313     PolyCQ0  = _mm256_mul_pd(CCQ6, w2);
1314     PolyCQ1  = _mm256_mul_pd(CCQ5, w2);
1315     PolyCQ0  = _mm256_add_pd(PolyCQ0, CCQ4);
1316     PolyCQ1  = _mm256_add_pd(PolyCQ1, CCQ3);
1317     PolyCQ0  = _mm256_mul_pd(PolyCQ0, w2);
1318     PolyCQ1  = _mm256_mul_pd(PolyCQ1, w2);
1319     PolyCQ0  = _mm256_add_pd(PolyCQ0, CCQ2);
1320     PolyCQ1  = _mm256_add_pd(PolyCQ1, CCQ1);
1321     PolyCQ0  = _mm256_mul_pd(PolyCQ0, w2);
1322     PolyCQ1  = _mm256_mul_pd(PolyCQ1, w);
1323     PolyCQ0  = _mm256_add_pd(PolyCQ0, one);
1324     PolyCQ0  = _mm256_add_pd(PolyCQ0, PolyCQ1);
1325
1326     expmx2   = gmx_mm256_exp_pd( _mm256_or_pd(signbit, x2) );
1327
1328     res_erfcC = _mm256_mul_pd(PolyCP0, gmx_mm256_inv_pd(PolyCQ0));
1329     res_erfcC = _mm256_add_pd(res_erfcC, CCoffset);
1330     res_erfcC = _mm256_mul_pd(res_erfcC, w);
1331
1332     mask     = _mm256_cmp_pd(xabs, _mm256_set1_pd(4.5), _CMP_GT_OQ);
1333     res_erfc = _mm256_blendv_pd(res_erfcB, res_erfcC, mask);
1334
1335     res_erfc = _mm256_mul_pd(res_erfc, expmx2);
1336
1337     /* erfc(x<0) = 2-erfc(|x|) */
1338     mask     = _mm256_cmp_pd(x, _mm256_setzero_pd(), _CMP_LT_OQ);
1339     res_erfc = _mm256_blendv_pd(res_erfc, _mm256_sub_pd(two, res_erfc), mask);
1340
1341     /* Select erf() or erfc() */
1342     mask = _mm256_cmp_pd(xabs, one, _CMP_LT_OQ);
1343     res  = _mm256_blendv_pd(res_erfc, _mm256_sub_pd(one, res_erf), mask);
1344
1345     return res;
1346 }
1347
1348
1349 static __m128d
1350 gmx_mm_erfc_pd(__m128d x)
1351 {
1352     /* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */
1353     const __m128d CAP4      = _mm_set1_pd(-0.431780540597889301512e-4);
1354     const __m128d CAP3      = _mm_set1_pd(-0.00578562306260059236059);
1355     const __m128d CAP2      = _mm_set1_pd(-0.028593586920219752446);
1356     const __m128d CAP1      = _mm_set1_pd(-0.315924962948621698209);
1357     const __m128d CAP0      = _mm_set1_pd(0.14952975608477029151);
1358
1359     const __m128d CAQ5      = _mm_set1_pd(-0.374089300177174709737e-5);
1360     const __m128d CAQ4      = _mm_set1_pd(0.00015126584532155383535);
1361     const __m128d CAQ3      = _mm_set1_pd(0.00536692680669480725423);
1362     const __m128d CAQ2      = _mm_set1_pd(0.0668686825594046122636);
1363     const __m128d CAQ1      = _mm_set1_pd(0.402604990869284362773);
1364     /* CAQ0 == 1.0 */
1365     const __m128d CAoffset  = _mm_set1_pd(0.9788494110107421875);
1366
1367     /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5] */
1368     const __m128d CBP6      = _mm_set1_pd(2.49650423685462752497647637088e-10);
1369     const __m128d CBP5      = _mm_set1_pd(0.00119770193298159629350136085658);
1370     const __m128d CBP4      = _mm_set1_pd(0.0164944422378370965881008942733);
1371     const __m128d CBP3      = _mm_set1_pd(0.0984581468691775932063932439252);
1372     const __m128d CBP2      = _mm_set1_pd(0.317364595806937763843589437418);
1373     const __m128d CBP1      = _mm_set1_pd(0.554167062641455850932670067075);
1374     const __m128d CBP0      = _mm_set1_pd(0.427583576155807163756925301060);
1375     const __m128d CBQ7      = _mm_set1_pd(0.00212288829699830145976198384930);
1376     const __m128d CBQ6      = _mm_set1_pd(0.0334810979522685300554606393425);
1377     const __m128d CBQ5      = _mm_set1_pd(0.2361713785181450957579508850717);
1378     const __m128d CBQ4      = _mm_set1_pd(0.955364736493055670530981883072);
1379     const __m128d CBQ3      = _mm_set1_pd(2.36815675631420037315349279199);
1380     const __m128d CBQ2      = _mm_set1_pd(3.55261649184083035537184223542);
1381     const __m128d CBQ1      = _mm_set1_pd(2.93501136050160872574376997993);
1382     /* CBQ0 == 1.0 */
1383
1384     /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf] */
1385     const __m128d CCP6      = _mm_set1_pd(-2.8175401114513378771);
1386     const __m128d CCP5      = _mm_set1_pd(-3.22729451764143718517);
1387     const __m128d CCP4      = _mm_set1_pd(-2.5518551727311523996);
1388     const __m128d CCP3      = _mm_set1_pd(-0.687717681153649930619);
1389     const __m128d CCP2      = _mm_set1_pd(-0.212652252872804219852);
1390     const __m128d CCP1      = _mm_set1_pd(0.0175389834052493308818);
1391     const __m128d CCP0      = _mm_set1_pd(0.00628057170626964891937);
1392
1393     const __m128d CCQ6      = _mm_set1_pd(5.48409182238641741584);
1394     const __m128d CCQ5      = _mm_set1_pd(13.5064170191802889145);
1395     const __m128d CCQ4      = _mm_set1_pd(22.9367376522880577224);
1396     const __m128d CCQ3      = _mm_set1_pd(15.930646027911794143);
1397     const __m128d CCQ2      = _mm_set1_pd(11.0567237927800161565);
1398     const __m128d CCQ1      = _mm_set1_pd(2.79257750980575282228);
1399     /* CCQ0 == 1.0 */
1400     const __m128d CCoffset  = _mm_set1_pd(0.5579090118408203125);
1401
1402     const __m128d one       = _mm_set1_pd(1.0);
1403     const __m128d two       = _mm_set1_pd(2.0);
1404
1405     const __m128d signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000, 0x00000000, 0x80000000, 0x00000000) );
1406
1407     __m128d       xabs, x2, x4, t, t2, w, w2;
1408     __m128d       PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
1409     __m128d       PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
1410     __m128d       PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
1411     __m128d       res_erf, res_erfcB, res_erfcC, res_erfc, res;
1412     __m128d       mask, expmx2;
1413
1414     /* Calculate erf() */
1415     xabs     = gmx_mm_abs_pd(x);
1416     x2       = _mm_mul_pd(x, x);
1417     x4       = _mm_mul_pd(x2, x2);
1418
1419     PolyAP0  = _mm_mul_pd(CAP4, x4);
1420     PolyAP1  = _mm_mul_pd(CAP3, x4);
1421     PolyAP0  = _mm_add_pd(PolyAP0, CAP2);
1422     PolyAP1  = _mm_add_pd(PolyAP1, CAP1);
1423     PolyAP0  = _mm_mul_pd(PolyAP0, x4);
1424     PolyAP1  = _mm_mul_pd(PolyAP1, x2);
1425     PolyAP0  = _mm_add_pd(PolyAP0, CAP0);
1426     PolyAP0  = _mm_add_pd(PolyAP0, PolyAP1);
1427
1428     PolyAQ1  = _mm_mul_pd(CAQ5, x4);
1429     PolyAQ0  = _mm_mul_pd(CAQ4, x4);
1430     PolyAQ1  = _mm_add_pd(PolyAQ1, CAQ3);
1431     PolyAQ0  = _mm_add_pd(PolyAQ0, CAQ2);
1432     PolyAQ1  = _mm_mul_pd(PolyAQ1, x4);
1433     PolyAQ0  = _mm_mul_pd(PolyAQ0, x4);
1434     PolyAQ1  = _mm_add_pd(PolyAQ1, CAQ1);
1435     PolyAQ0  = _mm_add_pd(PolyAQ0, one);
1436     PolyAQ1  = _mm_mul_pd(PolyAQ1, x2);
1437     PolyAQ0  = _mm_add_pd(PolyAQ0, PolyAQ1);
1438
1439     res_erf  = _mm_mul_pd(PolyAP0, gmx_mm_inv_pd(PolyAQ0));
1440     res_erf  = _mm_add_pd(CAoffset, res_erf);
1441     res_erf  = _mm_mul_pd(x, res_erf);
1442
1443     /* Calculate erfc() in range [1,4.5] */
1444     t       = _mm_sub_pd(xabs, one);
1445     t2      = _mm_mul_pd(t, t);
1446
1447     PolyBP0  = _mm_mul_pd(CBP6, t2);
1448     PolyBP1  = _mm_mul_pd(CBP5, t2);
1449     PolyBP0  = _mm_add_pd(PolyBP0, CBP4);
1450     PolyBP1  = _mm_add_pd(PolyBP1, CBP3);
1451     PolyBP0  = _mm_mul_pd(PolyBP0, t2);
1452     PolyBP1  = _mm_mul_pd(PolyBP1, t2);
1453     PolyBP0  = _mm_add_pd(PolyBP0, CBP2);
1454     PolyBP1  = _mm_add_pd(PolyBP1, CBP1);
1455     PolyBP0  = _mm_mul_pd(PolyBP0, t2);
1456     PolyBP1  = _mm_mul_pd(PolyBP1, t);
1457     PolyBP0  = _mm_add_pd(PolyBP0, CBP0);
1458     PolyBP0  = _mm_add_pd(PolyBP0, PolyBP1);
1459
1460     PolyBQ1 = _mm_mul_pd(CBQ7, t2);
1461     PolyBQ0 = _mm_mul_pd(CBQ6, t2);
1462     PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ5);
1463     PolyBQ0 = _mm_add_pd(PolyBQ0, CBQ4);
1464     PolyBQ1 = _mm_mul_pd(PolyBQ1, t2);
1465     PolyBQ0 = _mm_mul_pd(PolyBQ0, t2);
1466     PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ3);
1467     PolyBQ0 = _mm_add_pd(PolyBQ0, CBQ2);
1468     PolyBQ1 = _mm_mul_pd(PolyBQ1, t2);
1469     PolyBQ0 = _mm_mul_pd(PolyBQ0, t2);
1470     PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ1);
1471     PolyBQ0 = _mm_add_pd(PolyBQ0, one);
1472     PolyBQ1 = _mm_mul_pd(PolyBQ1, t);
1473     PolyBQ0 = _mm_add_pd(PolyBQ0, PolyBQ1);
1474
1475     res_erfcB = _mm_mul_pd(PolyBP0, gmx_mm_inv_pd(PolyBQ0));
1476
1477     res_erfcB = _mm_mul_pd(res_erfcB, xabs);
1478
1479     /* Calculate erfc() in range [4.5,inf] */
1480     w       = gmx_mm_inv_pd(xabs);
1481     w2      = _mm_mul_pd(w, w);
1482
1483     PolyCP0  = _mm_mul_pd(CCP6, w2);
1484     PolyCP1  = _mm_mul_pd(CCP5, w2);
1485     PolyCP0  = _mm_add_pd(PolyCP0, CCP4);
1486     PolyCP1  = _mm_add_pd(PolyCP1, CCP3);
1487     PolyCP0  = _mm_mul_pd(PolyCP0, w2);
1488     PolyCP1  = _mm_mul_pd(PolyCP1, w2);
1489     PolyCP0  = _mm_add_pd(PolyCP0, CCP2);
1490     PolyCP1  = _mm_add_pd(PolyCP1, CCP1);
1491     PolyCP0  = _mm_mul_pd(PolyCP0, w2);
1492     PolyCP1  = _mm_mul_pd(PolyCP1, w);
1493     PolyCP0  = _mm_add_pd(PolyCP0, CCP0);
1494     PolyCP0  = _mm_add_pd(PolyCP0, PolyCP1);
1495
1496     PolyCQ0  = _mm_mul_pd(CCQ6, w2);
1497     PolyCQ1  = _mm_mul_pd(CCQ5, w2);
1498     PolyCQ0  = _mm_add_pd(PolyCQ0, CCQ4);
1499     PolyCQ1  = _mm_add_pd(PolyCQ1, CCQ3);
1500     PolyCQ0  = _mm_mul_pd(PolyCQ0, w2);
1501     PolyCQ1  = _mm_mul_pd(PolyCQ1, w2);
1502     PolyCQ0  = _mm_add_pd(PolyCQ0, CCQ2);
1503     PolyCQ1  = _mm_add_pd(PolyCQ1, CCQ1);
1504     PolyCQ0  = _mm_mul_pd(PolyCQ0, w2);
1505     PolyCQ1  = _mm_mul_pd(PolyCQ1, w);
1506     PolyCQ0  = _mm_add_pd(PolyCQ0, one);
1507     PolyCQ0  = _mm_add_pd(PolyCQ0, PolyCQ1);
1508
1509     expmx2   = gmx_mm_exp_pd( _mm_or_pd(signbit, x2) );
1510
1511     res_erfcC = _mm_mul_pd(PolyCP0, gmx_mm_inv_pd(PolyCQ0));
1512     res_erfcC = _mm_add_pd(res_erfcC, CCoffset);
1513     res_erfcC = _mm_mul_pd(res_erfcC, w);
1514
1515     mask     = _mm_cmpgt_pd(xabs, _mm_set1_pd(4.5));
1516     res_erfc = _mm_blendv_pd(res_erfcB, res_erfcC, mask);
1517
1518     res_erfc = _mm_mul_pd(res_erfc, expmx2);
1519
1520     /* erfc(x<0) = 2-erfc(|x|) */
1521     mask     = _mm_cmplt_pd(x, _mm_setzero_pd());
1522     res_erfc = _mm_blendv_pd(res_erfc, _mm_sub_pd(two, res_erfc), mask);
1523
1524     /* Select erf() or erfc() */
1525     mask = _mm_cmplt_pd(xabs, one);
1526     res  = _mm_blendv_pd(res_erfc, _mm_sub_pd(one, res_erf), mask);
1527
1528     return res;
1529 }
1530
1531
1532 /* Calculate the force correction due to PME analytically.
1533  *
1534  * This routine is meant to enable analytical evaluation of the
1535  * direct-space PME electrostatic force to avoid tables.
1536  *
1537  * The direct-space potential should be Erfc(beta*r)/r, but there
1538  * are some problems evaluating that:
1539  *
1540  * First, the error function is difficult (read: expensive) to
1541  * approxmiate accurately for intermediate to large arguments, and
1542  * this happens already in ranges of beta*r that occur in simulations.
1543  * Second, we now try to avoid calculating potentials in Gromacs but
1544  * use forces directly.
1545  *
1546  * We can simply things slight by noting that the PME part is really
1547  * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
1548  *
1549  * V= 1/r - Erf(beta*r)/r
1550  *
1551  * The first term we already have from the inverse square root, so
1552  * that we can leave out of this routine.
1553  *
1554  * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
1555  * the argument beta*r will be in the range 0.15 to ~4. Use your
1556  * favorite plotting program to realize how well-behaved Erf(z)/z is
1557  * in this range!
1558  *
1559  * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
1560  * However, it turns out it is more efficient to approximate f(z)/z and
1561  * then only use even powers. This is another minor optimization, since
1562  * we actually WANT f(z)/z, because it is going to be multiplied by
1563  * the vector between the two atoms to get the vectorial force. The
1564  * fastest flops are the ones we can avoid calculating!
1565  *
1566  * So, here's how it should be used:
1567  *
1568  * 1. Calculate r^2.
1569  * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
1570  * 3. Evaluate this routine with z^2 as the argument.
1571  * 4. The return value is the expression:
1572  *
1573  *
1574  *       2*exp(-z^2)     erf(z)
1575  *       ------------ - --------
1576  *       sqrt(Pi)*z^2      z^3
1577  *
1578  * 5. Multiply the entire expression by beta^3. This will get you
1579  *
1580  *       beta^3*2*exp(-z^2)     beta^3*erf(z)
1581  *       ------------------  - ---------------
1582  *          sqrt(Pi)*z^2            z^3
1583  *
1584  *    or, switching back to r (z=r*beta):
1585  *
1586  *       2*beta*exp(-r^2*beta^2)   erf(r*beta)
1587  *       ----------------------- - -----------
1588  *            sqrt(Pi)*r^2            r^3
1589  *
1590  *
1591  *    With a bit of math exercise you should be able to confirm that
1592  *    this is exactly D[Erf[beta*r]/r,r] divided by r another time.
1593  *
1594  * 6. Add the result to 1/r^3, multiply by the product of the charges,
1595  *    and you have your force (divided by r). A final multiplication
1596  *    with the vector connecting the two particles and you have your
1597  *    vectorial force to add to the particles.
1598  *
1599  */
1600 static __m256d
1601 gmx_mm256_pmecorrF_pd(__m256d z2)
1602 {
1603     const __m256d  FN10     = _mm256_set1_pd(-8.0072854618360083154e-14);
1604     const __m256d  FN9      = _mm256_set1_pd(1.1859116242260148027e-11);
1605     const __m256d  FN8      = _mm256_set1_pd(-8.1490406329798423616e-10);
1606     const __m256d  FN7      = _mm256_set1_pd(3.4404793543907847655e-8);
1607     const __m256d  FN6      = _mm256_set1_pd(-9.9471420832602741006e-7);
1608     const __m256d  FN5      = _mm256_set1_pd(0.000020740315999115847456);
1609     const __m256d  FN4      = _mm256_set1_pd(-0.00031991745139313364005);
1610     const __m256d  FN3      = _mm256_set1_pd(0.0035074449373659008203);
1611     const __m256d  FN2      = _mm256_set1_pd(-0.031750380176100813405);
1612     const __m256d  FN1      = _mm256_set1_pd(0.13884101728898463426);
1613     const __m256d  FN0      = _mm256_set1_pd(-0.75225277815249618847);
1614
1615     const __m256d  FD5      = _mm256_set1_pd(0.000016009278224355026701);
1616     const __m256d  FD4      = _mm256_set1_pd(0.00051055686934806966046);
1617     const __m256d  FD3      = _mm256_set1_pd(0.0081803507497974289008);
1618     const __m256d  FD2      = _mm256_set1_pd(0.077181146026670287235);
1619     const __m256d  FD1      = _mm256_set1_pd(0.41543303143712535988);
1620     const __m256d  FD0      = _mm256_set1_pd(1.0);
1621
1622     __m256d        z4;
1623     __m256d        polyFN0, polyFN1, polyFD0, polyFD1;
1624
1625     z4             = _mm256_mul_pd(z2, z2);
1626
1627     polyFD1        = _mm256_mul_pd(FD5, z4);
1628     polyFD0        = _mm256_mul_pd(FD4, z4);
1629     polyFD1        = _mm256_add_pd(polyFD1, FD3);
1630     polyFD0        = _mm256_add_pd(polyFD0, FD2);
1631     polyFD1        = _mm256_mul_pd(polyFD1, z4);
1632     polyFD0        = _mm256_mul_pd(polyFD0, z4);
1633     polyFD1        = _mm256_add_pd(polyFD1, FD1);
1634     polyFD0        = _mm256_add_pd(polyFD0, FD0);
1635     polyFD1        = _mm256_mul_pd(polyFD1, z2);
1636     polyFD0        = _mm256_add_pd(polyFD0, polyFD1);
1637
1638     polyFD0        = gmx_mm256_inv_pd(polyFD0);
1639
1640     polyFN0        = _mm256_mul_pd(FN10, z4);
1641     polyFN1        = _mm256_mul_pd(FN9, z4);
1642     polyFN0        = _mm256_add_pd(polyFN0, FN8);
1643     polyFN1        = _mm256_add_pd(polyFN1, FN7);
1644     polyFN0        = _mm256_mul_pd(polyFN0, z4);
1645     polyFN1        = _mm256_mul_pd(polyFN1, z4);
1646     polyFN0        = _mm256_add_pd(polyFN0, FN6);
1647     polyFN1        = _mm256_add_pd(polyFN1, FN5);
1648     polyFN0        = _mm256_mul_pd(polyFN0, z4);
1649     polyFN1        = _mm256_mul_pd(polyFN1, z4);
1650     polyFN0        = _mm256_add_pd(polyFN0, FN4);
1651     polyFN1        = _mm256_add_pd(polyFN1, FN3);
1652     polyFN0        = _mm256_mul_pd(polyFN0, z4);
1653     polyFN1        = _mm256_mul_pd(polyFN1, z4);
1654     polyFN0        = _mm256_add_pd(polyFN0, FN2);
1655     polyFN1        = _mm256_add_pd(polyFN1, FN1);
1656     polyFN0        = _mm256_mul_pd(polyFN0, z4);
1657     polyFN1        = _mm256_mul_pd(polyFN1, z2);
1658     polyFN0        = _mm256_add_pd(polyFN0, FN0);
1659     polyFN0        = _mm256_add_pd(polyFN0, polyFN1);
1660
1661     return _mm256_mul_pd(polyFN0, polyFD0);
1662 }
1663
1664
1665 static __m128d
1666 gmx_mm_pmecorrF_pd(__m128d z2)
1667 {
1668     const __m128d  FN10     = _mm_set1_pd(-8.0072854618360083154e-14);
1669     const __m128d  FN9      = _mm_set1_pd(1.1859116242260148027e-11);
1670     const __m128d  FN8      = _mm_set1_pd(-8.1490406329798423616e-10);
1671     const __m128d  FN7      = _mm_set1_pd(3.4404793543907847655e-8);
1672     const __m128d  FN6      = _mm_set1_pd(-9.9471420832602741006e-7);
1673     const __m128d  FN5      = _mm_set1_pd(0.000020740315999115847456);
1674     const __m128d  FN4      = _mm_set1_pd(-0.00031991745139313364005);
1675     const __m128d  FN3      = _mm_set1_pd(0.0035074449373659008203);
1676     const __m128d  FN2      = _mm_set1_pd(-0.031750380176100813405);
1677     const __m128d  FN1      = _mm_set1_pd(0.13884101728898463426);
1678     const __m128d  FN0      = _mm_set1_pd(-0.75225277815249618847);
1679
1680     const __m128d  FD5      = _mm_set1_pd(0.000016009278224355026701);
1681     const __m128d  FD4      = _mm_set1_pd(0.00051055686934806966046);
1682     const __m128d  FD3      = _mm_set1_pd(0.0081803507497974289008);
1683     const __m128d  FD2      = _mm_set1_pd(0.077181146026670287235);
1684     const __m128d  FD1      = _mm_set1_pd(0.41543303143712535988);
1685     const __m128d  FD0      = _mm_set1_pd(1.0);
1686
1687     __m128d        z4;
1688     __m128d        polyFN0, polyFN1, polyFD0, polyFD1;
1689
1690     z4             = _mm_mul_pd(z2, z2);
1691
1692     polyFD1        = _mm_mul_pd(FD5, z4);
1693     polyFD0        = _mm_mul_pd(FD4, z4);
1694     polyFD1        = _mm_add_pd(polyFD1, FD3);
1695     polyFD0        = _mm_add_pd(polyFD0, FD2);
1696     polyFD1        = _mm_mul_pd(polyFD1, z4);
1697     polyFD0        = _mm_mul_pd(polyFD0, z4);
1698     polyFD1        = _mm_add_pd(polyFD1, FD1);
1699     polyFD0        = _mm_add_pd(polyFD0, FD0);
1700     polyFD1        = _mm_mul_pd(polyFD1, z2);
1701     polyFD0        = _mm_add_pd(polyFD0, polyFD1);
1702
1703     polyFD0        = gmx_mm_inv_pd(polyFD0);
1704
1705     polyFN0        = _mm_mul_pd(FN10, z4);
1706     polyFN1        = _mm_mul_pd(FN9, z4);
1707     polyFN0        = _mm_add_pd(polyFN0, FN8);
1708     polyFN1        = _mm_add_pd(polyFN1, FN7);
1709     polyFN0        = _mm_mul_pd(polyFN0, z4);
1710     polyFN1        = _mm_mul_pd(polyFN1, z4);
1711     polyFN0        = _mm_add_pd(polyFN0, FN6);
1712     polyFN1        = _mm_add_pd(polyFN1, FN5);
1713     polyFN0        = _mm_mul_pd(polyFN0, z4);
1714     polyFN1        = _mm_mul_pd(polyFN1, z4);
1715     polyFN0        = _mm_add_pd(polyFN0, FN4);
1716     polyFN1        = _mm_add_pd(polyFN1, FN3);
1717     polyFN0        = _mm_mul_pd(polyFN0, z4);
1718     polyFN1        = _mm_mul_pd(polyFN1, z4);
1719     polyFN0        = _mm_add_pd(polyFN0, FN2);
1720     polyFN1        = _mm_add_pd(polyFN1, FN1);
1721     polyFN0        = _mm_mul_pd(polyFN0, z4);
1722     polyFN1        = _mm_mul_pd(polyFN1, z2);
1723     polyFN0        = _mm_add_pd(polyFN0, FN0);
1724     polyFN0        = _mm_add_pd(polyFN0, polyFN1);
1725
1726     return _mm_mul_pd(polyFN0, polyFD0);
1727 }
1728
1729
1730
1731
1732 /* Calculate the potential correction due to PME analytically.
1733  *
1734  * See gmx_mm256_pmecorrF_ps() for details about the approximation.
1735  *
1736  * This routine calculates Erf(z)/z, although you should provide z^2
1737  * as the input argument.
1738  *
1739  * Here's how it should be used:
1740  *
1741  * 1. Calculate r^2.
1742  * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
1743  * 3. Evaluate this routine with z^2 as the argument.
1744  * 4. The return value is the expression:
1745  *
1746  *
1747  *        erf(z)
1748  *       --------
1749  *          z
1750  *
1751  * 5. Multiply the entire expression by beta and switching back to r (z=r*beta):
1752  *
1753  *       erf(r*beta)
1754  *       -----------
1755  *           r
1756  *
1757  * 6. Subtract the result from 1/r, multiply by the product of the charges,
1758  *    and you have your potential.
1759  *
1760  */
1761 static __m256d
1762 gmx_mm256_pmecorrV_pd(__m256d z2)
1763 {
1764     const __m256d  VN9      = _mm256_set1_pd(-9.3723776169321855475e-13);
1765     const __m256d  VN8      = _mm256_set1_pd(1.2280156762674215741e-10);
1766     const __m256d  VN7      = _mm256_set1_pd(-7.3562157912251309487e-9);
1767     const __m256d  VN6      = _mm256_set1_pd(2.6215886208032517509e-7);
1768     const __m256d  VN5      = _mm256_set1_pd(-4.9532491651265819499e-6);
1769     const __m256d  VN4      = _mm256_set1_pd(0.00025907400778966060389);
1770     const __m256d  VN3      = _mm256_set1_pd(0.0010585044856156469792);
1771     const __m256d  VN2      = _mm256_set1_pd(0.045247661136833092885);
1772     const __m256d  VN1      = _mm256_set1_pd(0.11643931522926034421);
1773     const __m256d  VN0      = _mm256_set1_pd(1.1283791671726767970);
1774
1775     const __m256d  VD5      = _mm256_set1_pd(0.000021784709867336150342);
1776     const __m256d  VD4      = _mm256_set1_pd(0.00064293662010911388448);
1777     const __m256d  VD3      = _mm256_set1_pd(0.0096311444822588683504);
1778     const __m256d  VD2      = _mm256_set1_pd(0.085608012351550627051);
1779     const __m256d  VD1      = _mm256_set1_pd(0.43652499166614811084);
1780     const __m256d  VD0      = _mm256_set1_pd(1.0);
1781
1782     __m256d        z4;
1783     __m256d        polyVN0, polyVN1, polyVD0, polyVD1;
1784
1785     z4             = _mm256_mul_pd(z2, z2);
1786
1787     polyVD1        = _mm256_mul_pd(VD5, z4);
1788     polyVD0        = _mm256_mul_pd(VD4, z4);
1789     polyVD1        = _mm256_add_pd(polyVD1, VD3);
1790     polyVD0        = _mm256_add_pd(polyVD0, VD2);
1791     polyVD1        = _mm256_mul_pd(polyVD1, z4);
1792     polyVD0        = _mm256_mul_pd(polyVD0, z4);
1793     polyVD1        = _mm256_add_pd(polyVD1, VD1);
1794     polyVD0        = _mm256_add_pd(polyVD0, VD0);
1795     polyVD1        = _mm256_mul_pd(polyVD1, z2);
1796     polyVD0        = _mm256_add_pd(polyVD0, polyVD1);
1797
1798     polyVD0        = gmx_mm256_inv_pd(polyVD0);
1799
1800     polyVN1        = _mm256_mul_pd(VN9, z4);
1801     polyVN0        = _mm256_mul_pd(VN8, z4);
1802     polyVN1        = _mm256_add_pd(polyVN1, VN7);
1803     polyVN0        = _mm256_add_pd(polyVN0, VN6);
1804     polyVN1        = _mm256_mul_pd(polyVN1, z4);
1805     polyVN0        = _mm256_mul_pd(polyVN0, z4);
1806     polyVN1        = _mm256_add_pd(polyVN1, VN5);
1807     polyVN0        = _mm256_add_pd(polyVN0, VN4);
1808     polyVN1        = _mm256_mul_pd(polyVN1, z4);
1809     polyVN0        = _mm256_mul_pd(polyVN0, z4);
1810     polyVN1        = _mm256_add_pd(polyVN1, VN3);
1811     polyVN0        = _mm256_add_pd(polyVN0, VN2);
1812     polyVN1        = _mm256_mul_pd(polyVN1, z4);
1813     polyVN0        = _mm256_mul_pd(polyVN0, z4);
1814     polyVN1        = _mm256_add_pd(polyVN1, VN1);
1815     polyVN0        = _mm256_add_pd(polyVN0, VN0);
1816     polyVN1        = _mm256_mul_pd(polyVN1, z2);
1817     polyVN0        = _mm256_add_pd(polyVN0, polyVN1);
1818
1819     return _mm256_mul_pd(polyVN0, polyVD0);
1820 }
1821
1822
1823 static __m128d
1824 gmx_mm_pmecorrV_pd(__m128d z2)
1825 {
1826     const __m128d  VN9      = _mm_set1_pd(-9.3723776169321855475e-13);
1827     const __m128d  VN8      = _mm_set1_pd(1.2280156762674215741e-10);
1828     const __m128d  VN7      = _mm_set1_pd(-7.3562157912251309487e-9);
1829     const __m128d  VN6      = _mm_set1_pd(2.6215886208032517509e-7);
1830     const __m128d  VN5      = _mm_set1_pd(-4.9532491651265819499e-6);
1831     const __m128d  VN4      = _mm_set1_pd(0.00025907400778966060389);
1832     const __m128d  VN3      = _mm_set1_pd(0.0010585044856156469792);
1833     const __m128d  VN2      = _mm_set1_pd(0.045247661136833092885);
1834     const __m128d  VN1      = _mm_set1_pd(0.11643931522926034421);
1835     const __m128d  VN0      = _mm_set1_pd(1.1283791671726767970);
1836
1837     const __m128d  VD5      = _mm_set1_pd(0.000021784709867336150342);
1838     const __m128d  VD4      = _mm_set1_pd(0.00064293662010911388448);
1839     const __m128d  VD3      = _mm_set1_pd(0.0096311444822588683504);
1840     const __m128d  VD2      = _mm_set1_pd(0.085608012351550627051);
1841     const __m128d  VD1      = _mm_set1_pd(0.43652499166614811084);
1842     const __m128d  VD0      = _mm_set1_pd(1.0);
1843
1844     __m128d        z4;
1845     __m128d        polyVN0, polyVN1, polyVD0, polyVD1;
1846
1847     z4             = _mm_mul_pd(z2, z2);
1848
1849     polyVD1        = _mm_mul_pd(VD5, z4);
1850     polyVD0        = _mm_mul_pd(VD4, z4);
1851     polyVD1        = _mm_add_pd(polyVD1, VD3);
1852     polyVD0        = _mm_add_pd(polyVD0, VD2);
1853     polyVD1        = _mm_mul_pd(polyVD1, z4);
1854     polyVD0        = _mm_mul_pd(polyVD0, z4);
1855     polyVD1        = _mm_add_pd(polyVD1, VD1);
1856     polyVD0        = _mm_add_pd(polyVD0, VD0);
1857     polyVD1        = _mm_mul_pd(polyVD1, z2);
1858     polyVD0        = _mm_add_pd(polyVD0, polyVD1);
1859
1860     polyVD0        = gmx_mm_inv_pd(polyVD0);
1861
1862     polyVN1        = _mm_mul_pd(VN9, z4);
1863     polyVN0        = _mm_mul_pd(VN8, z4);
1864     polyVN1        = _mm_add_pd(polyVN1, VN7);
1865     polyVN0        = _mm_add_pd(polyVN0, VN6);
1866     polyVN1        = _mm_mul_pd(polyVN1, z4);
1867     polyVN0        = _mm_mul_pd(polyVN0, z4);
1868     polyVN1        = _mm_add_pd(polyVN1, VN5);
1869     polyVN0        = _mm_add_pd(polyVN0, VN4);
1870     polyVN1        = _mm_mul_pd(polyVN1, z4);
1871     polyVN0        = _mm_mul_pd(polyVN0, z4);
1872     polyVN1        = _mm_add_pd(polyVN1, VN3);
1873     polyVN0        = _mm_add_pd(polyVN0, VN2);
1874     polyVN1        = _mm_mul_pd(polyVN1, z4);
1875     polyVN0        = _mm_mul_pd(polyVN0, z4);
1876     polyVN1        = _mm_add_pd(polyVN1, VN1);
1877     polyVN0        = _mm_add_pd(polyVN0, VN0);
1878     polyVN1        = _mm_mul_pd(polyVN1, z2);
1879     polyVN0        = _mm_add_pd(polyVN0, polyVN1);
1880
1881     return _mm_mul_pd(polyVN0, polyVD0);
1882 }
1883
1884
1885
1886 static int
1887 gmx_mm256_sincos_pd(__m256d  x,
1888                     __m256d *sinval,
1889                     __m256d *cosval)
1890 {
1891 #ifdef _MSC_VER
1892     __declspec(align(16))
1893     const double sintable[34] =
1894     {
1895         1.00000000000000000e+00, 0.00000000000000000e+00,
1896         9.95184726672196929e-01, 9.80171403295606036e-02,
1897         9.80785280403230431e-01, 1.95090322016128248e-01,
1898         9.56940335732208824e-01, 2.90284677254462331e-01,
1899         9.23879532511286738e-01, 3.82683432365089782e-01,
1900         8.81921264348355050e-01, 4.71396736825997642e-01,
1901         8.31469612302545236e-01, 5.55570233019602178e-01,
1902         7.73010453362736993e-01, 6.34393284163645488e-01,
1903         7.07106781186547573e-01, 7.07106781186547462e-01,
1904         6.34393284163645599e-01, 7.73010453362736882e-01,
1905         5.55570233019602289e-01, 8.31469612302545125e-01,
1906         4.71396736825997809e-01, 8.81921264348354939e-01,
1907         3.82683432365089837e-01, 9.23879532511286738e-01,
1908         2.90284677254462276e-01, 9.56940335732208935e-01,
1909         1.95090322016128304e-01, 9.80785280403230431e-01,
1910         9.80171403295607702e-02, 9.95184726672196818e-01,
1911         0.0, 1.00000000000000000e+00
1912     };
1913 #else
1914     const __m128d sintable[17] =
1915     {
1916         _mm_set_pd( 0.0, 1.0 ),
1917         _mm_set_pd( sin(  1.0 * (M_PI/2.0) / 16.0), cos(  1.0 * (M_PI/2.0) / 16.0) ),
1918         _mm_set_pd( sin(  2.0 * (M_PI/2.0) / 16.0), cos(  2.0 * (M_PI/2.0) / 16.0) ),
1919         _mm_set_pd( sin(  3.0 * (M_PI/2.0) / 16.0), cos(  3.0 * (M_PI/2.0) / 16.0) ),
1920         _mm_set_pd( sin(  4.0 * (M_PI/2.0) / 16.0), cos(  4.0 * (M_PI/2.0) / 16.0) ),
1921         _mm_set_pd( sin(  5.0 * (M_PI/2.0) / 16.0), cos(  5.0 * (M_PI/2.0) / 16.0) ),
1922         _mm_set_pd( sin(  6.0 * (M_PI/2.0) / 16.0), cos(  6.0 * (M_PI/2.0) / 16.0) ),
1923         _mm_set_pd( sin(  7.0 * (M_PI/2.0) / 16.0), cos(  7.0 * (M_PI/2.0) / 16.0) ),
1924         _mm_set_pd( sin(  8.0 * (M_PI/2.0) / 16.0), cos(  8.0 * (M_PI/2.0) / 16.0) ),
1925         _mm_set_pd( sin(  9.0 * (M_PI/2.0) / 16.0), cos(  9.0 * (M_PI/2.0) / 16.0) ),
1926         _mm_set_pd( sin( 10.0 * (M_PI/2.0) / 16.0), cos( 10.0 * (M_PI/2.0) / 16.0) ),
1927         _mm_set_pd( sin( 11.0 * (M_PI/2.0) / 16.0), cos( 11.0 * (M_PI/2.0) / 16.0) ),
1928         _mm_set_pd( sin( 12.0 * (M_PI/2.0) / 16.0), cos( 12.0 * (M_PI/2.0) / 16.0) ),
1929         _mm_set_pd( sin( 13.0 * (M_PI/2.0) / 16.0), cos( 13.0 * (M_PI/2.0) / 16.0) ),
1930         _mm_set_pd( sin( 14.0 * (M_PI/2.0) / 16.0), cos( 14.0 * (M_PI/2.0) / 16.0) ),
1931         _mm_set_pd( sin( 15.0 * (M_PI/2.0) / 16.0), cos( 15.0 * (M_PI/2.0) / 16.0) ),
1932         _mm_set_pd(  1.0, 0.0 )
1933     };
1934 #endif
1935
1936     const __m256d signmask    = _mm256_castsi256_pd( _mm256_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF,
1937                                                                       0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
1938     const __m128i signbit_epi32  = _mm_set1_epi32(0x80000000);
1939
1940     const __m256d tabscale      = _mm256_set1_pd(32.0/M_PI);
1941     const __m256d invtabscale0  = _mm256_set1_pd(9.81747508049011230469e-02);
1942     const __m256d invtabscale1  = _mm256_set1_pd(1.96197799156550576057e-08);
1943     const __m128i ione          = _mm_set1_epi32(1);
1944     const __m128i i32           = _mm_set1_epi32(32);
1945     const __m128i i16           = _mm_set1_epi32(16);
1946     const __m128i tabmask       = _mm_set1_epi32(0x3F);
1947     const __m256d sinP7         = _mm256_set1_pd(-1.0/5040.0);
1948     const __m256d sinP5         = _mm256_set1_pd(1.0/120.0);
1949     const __m256d sinP3         = _mm256_set1_pd(-1.0/6.0);
1950     const __m256d sinP1         = _mm256_set1_pd(1.0);
1951
1952     const __m256d cosP6         = _mm256_set1_pd(-1.0/720.0);
1953     const __m256d cosP4         = _mm256_set1_pd(1.0/24.0);
1954     const __m256d cosP2         = _mm256_set1_pd(-1.0/2.0);
1955     const __m256d cosP0         = _mm256_set1_pd(1.0);
1956
1957     __m256d       scalex;
1958     __m128i       tabidx, corridx;
1959     __m256d       xabs, z, z2, polySin, polyCos;
1960     __m256d       xpoint;
1961     __m256d       t1, t2, t3, t4;
1962
1963     __m256d       sinpoint, cospoint;
1964     __m256d       xsign, ssign, csign;
1965     __m128i       imask, sswapsign, cswapsign;
1966     __m256d       minusone;
1967
1968     xsign    = _mm256_andnot_pd(signmask, x);
1969     xabs     = _mm256_and_pd(x, signmask);
1970
1971     scalex   = _mm256_mul_pd(tabscale, xabs);
1972     tabidx   = _mm256_cvtpd_epi32(scalex);
1973
1974     xpoint   = _mm256_round_pd(scalex, _MM_FROUND_TO_NEAREST_INT);
1975
1976     /* Extended precision arithmetics */
1977     z        = _mm256_sub_pd(xabs, _mm256_mul_pd(invtabscale0, xpoint));
1978     z        = _mm256_sub_pd(z, _mm256_mul_pd(invtabscale1, xpoint));
1979
1980     /* Range reduction to 0..2*Pi */
1981     tabidx   = _mm_and_si128(tabidx, tabmask);
1982
1983     /* tabidx is now in range [0,..,64] */
1984     imask     = _mm_cmpgt_epi32(tabidx, i32);
1985     sswapsign = imask;
1986     cswapsign = imask;
1987     corridx   = _mm_and_si128(imask, i32);
1988     tabidx    = _mm_sub_epi32(tabidx, corridx);
1989
1990     /* tabidx is now in range [0..32] */
1991     imask     = _mm_cmpgt_epi32(tabidx, i16);
1992     cswapsign = _mm_xor_si128(cswapsign, imask);
1993     corridx   = _mm_sub_epi32(i32, tabidx);
1994     tabidx    = _mm_blendv_epi8(tabidx, corridx, imask);
1995     /* tabidx is now in range [0..16] */
1996     ssign     = _mm256_cvtepi32_pd( _mm_or_si128( sswapsign, ione ) );
1997     csign     = _mm256_cvtepi32_pd( _mm_or_si128( cswapsign, ione ) );
1998
1999     /* First lookup into table */
2000 #ifdef _MSC_VER
2001     t1       = _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm_load_pd(sintable + 2*_mm_extract_epi32(tabidx, 0))),
2002                                     _mm_load_pd(sintable + 2*_mm_extract_epi32(tabidx, 2)), 0x1);
2003     t2       = _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm_load_pd(sintable + 2*_mm_extract_epi32(tabidx, 1))),
2004                                     _mm_load_pd(sintable + 2*_mm_extract_epi32(tabidx, 3)), 0x1);
2005 #else
2006     t1       = _mm256_insertf128_pd(_mm256_castpd128_pd256(sintable[_mm_extract_epi32(tabidx, 0)]),
2007                                     sintable[_mm_extract_epi32(tabidx, 2)], 0x1);
2008     t2       = _mm256_insertf128_pd(_mm256_castpd128_pd256(sintable[_mm_extract_epi32(tabidx, 1)]),
2009                                     sintable[_mm_extract_epi32(tabidx, 3)], 0x1);
2010 #endif
2011
2012     sinpoint  = _mm256_unpackhi_pd(t1, t2);
2013     cospoint  = _mm256_unpacklo_pd(t1, t2);
2014
2015     sinpoint = _mm256_mul_pd(sinpoint, ssign);
2016     cospoint = _mm256_mul_pd(cospoint, csign);
2017
2018     z2       = _mm256_mul_pd(z, z);
2019
2020     polySin  = _mm256_mul_pd(sinP7, z2);
2021     polySin  = _mm256_add_pd(polySin, sinP5);
2022     polySin  = _mm256_mul_pd(polySin, z2);
2023     polySin  = _mm256_add_pd(polySin, sinP3);
2024     polySin  = _mm256_mul_pd(polySin, z2);
2025     polySin  = _mm256_add_pd(polySin, sinP1);
2026     polySin  = _mm256_mul_pd(polySin, z);
2027
2028     polyCos  = _mm256_mul_pd(cosP6, z2);
2029     polyCos  = _mm256_add_pd(polyCos, cosP4);
2030     polyCos  = _mm256_mul_pd(polyCos, z2);
2031     polyCos  = _mm256_add_pd(polyCos, cosP2);
2032     polyCos  = _mm256_mul_pd(polyCos, z2);
2033     polyCos  = _mm256_add_pd(polyCos, cosP0);
2034
2035     *sinval  = _mm256_xor_pd(_mm256_add_pd( _mm256_mul_pd(sinpoint, polyCos), _mm256_mul_pd(cospoint, polySin) ), xsign);
2036     *cosval  = _mm256_sub_pd( _mm256_mul_pd(cospoint, polyCos), _mm256_mul_pd(sinpoint, polySin) );
2037
2038     return 0;
2039 }
2040
2041 static int
2042 gmx_mm_sincos_pd(__m128d  x,
2043                  __m128d *sinval,
2044                  __m128d *cosval)
2045 {
2046 #ifdef _MSC_VER
2047     __declspec(align(16))
2048     const double sintable[34] =
2049     {
2050         1.00000000000000000e+00, 0.00000000000000000e+00,
2051         9.95184726672196929e-01, 9.80171403295606036e-02,
2052         9.80785280403230431e-01, 1.95090322016128248e-01,
2053         9.56940335732208824e-01, 2.90284677254462331e-01,
2054         9.23879532511286738e-01, 3.82683432365089782e-01,
2055         8.81921264348355050e-01, 4.71396736825997642e-01,
2056         8.31469612302545236e-01, 5.55570233019602178e-01,
2057         7.73010453362736993e-01, 6.34393284163645488e-01,
2058         7.07106781186547573e-01, 7.07106781186547462e-01,
2059         6.34393284163645599e-01, 7.73010453362736882e-01,
2060         5.55570233019602289e-01, 8.31469612302545125e-01,
2061         4.71396736825997809e-01, 8.81921264348354939e-01,
2062         3.82683432365089837e-01, 9.23879532511286738e-01,
2063         2.90284677254462276e-01, 9.56940335732208935e-01,
2064         1.95090322016128304e-01, 9.80785280403230431e-01,
2065         9.80171403295607702e-02, 9.95184726672196818e-01,
2066         0.0, 1.00000000000000000e+00
2067     };
2068 #else
2069     const __m128d sintable[17] =
2070     {
2071         _mm_set_pd( 0.0, 1.0 ),
2072         _mm_set_pd( sin(  1.0 * (M_PI/2.0) / 16.0), cos(  1.0 * (M_PI/2.0) / 16.0) ),
2073         _mm_set_pd( sin(  2.0 * (M_PI/2.0) / 16.0), cos(  2.0 * (M_PI/2.0) / 16.0) ),
2074         _mm_set_pd( sin(  3.0 * (M_PI/2.0) / 16.0), cos(  3.0 * (M_PI/2.0) / 16.0) ),
2075         _mm_set_pd( sin(  4.0 * (M_PI/2.0) / 16.0), cos(  4.0 * (M_PI/2.0) / 16.0) ),
2076         _mm_set_pd( sin(  5.0 * (M_PI/2.0) / 16.0), cos(  5.0 * (M_PI/2.0) / 16.0) ),
2077         _mm_set_pd( sin(  6.0 * (M_PI/2.0) / 16.0), cos(  6.0 * (M_PI/2.0) / 16.0) ),
2078         _mm_set_pd( sin(  7.0 * (M_PI/2.0) / 16.0), cos(  7.0 * (M_PI/2.0) / 16.0) ),
2079         _mm_set_pd( sin(  8.0 * (M_PI/2.0) / 16.0), cos(  8.0 * (M_PI/2.0) / 16.0) ),
2080         _mm_set_pd( sin(  9.0 * (M_PI/2.0) / 16.0), cos(  9.0 * (M_PI/2.0) / 16.0) ),
2081         _mm_set_pd( sin( 10.0 * (M_PI/2.0) / 16.0), cos( 10.0 * (M_PI/2.0) / 16.0) ),
2082         _mm_set_pd( sin( 11.0 * (M_PI/2.0) / 16.0), cos( 11.0 * (M_PI/2.0) / 16.0) ),
2083         _mm_set_pd( sin( 12.0 * (M_PI/2.0) / 16.0), cos( 12.0 * (M_PI/2.0) / 16.0) ),
2084         _mm_set_pd( sin( 13.0 * (M_PI/2.0) / 16.0), cos( 13.0 * (M_PI/2.0) / 16.0) ),
2085         _mm_set_pd( sin( 14.0 * (M_PI/2.0) / 16.0), cos( 14.0 * (M_PI/2.0) / 16.0) ),
2086         _mm_set_pd( sin( 15.0 * (M_PI/2.0) / 16.0), cos( 15.0 * (M_PI/2.0) / 16.0) ),
2087         _mm_set_pd(  1.0, 0.0 )
2088     };
2089 #endif
2090
2091     const __m128d signmask       = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
2092     const __m128i signbit_epi32  = _mm_set1_epi32(0x80000000);
2093
2094     const __m128d tabscale      = _mm_set1_pd(32.0/M_PI);
2095     const __m128d invtabscale0  = _mm_set1_pd(9.81747508049011230469e-02);
2096     const __m128d invtabscale1  = _mm_set1_pd(1.96197799156550576057e-08);
2097     const __m128i ione          = _mm_set1_epi32(1);
2098     const __m128i i32           = _mm_set1_epi32(32);
2099     const __m128i i16           = _mm_set1_epi32(16);
2100     const __m128i tabmask       = _mm_set1_epi32(0x3F);
2101     const __m128d sinP7         = _mm_set1_pd(-1.0/5040.0);
2102     const __m128d sinP5         = _mm_set1_pd(1.0/120.0);
2103     const __m128d sinP3         = _mm_set1_pd(-1.0/6.0);
2104     const __m128d sinP1         = _mm_set1_pd(1.0);
2105
2106     const __m128d cosP6         = _mm_set1_pd(-1.0/720.0);
2107     const __m128d cosP4         = _mm_set1_pd(1.0/24.0);
2108     const __m128d cosP2         = _mm_set1_pd(-1.0/2.0);
2109     const __m128d cosP0         = _mm_set1_pd(1.0);
2110
2111     __m128d       scalex;
2112     __m128i       tabidx, corridx;
2113     __m128d       xabs, z, z2, polySin, polyCos;
2114     __m128d       xpoint;
2115     __m128d       ypoint0, ypoint1;
2116
2117     __m128d       sinpoint, cospoint;
2118     __m128d       xsign, ssign, csign;
2119     __m128i       imask, sswapsign, cswapsign;
2120     __m128d       minusone;
2121
2122     xsign    = _mm_andnot_pd(signmask, x);
2123     xabs     = _mm_and_pd(x, signmask);
2124
2125     scalex   = _mm_mul_pd(tabscale, xabs);
2126     tabidx   = _mm_cvtpd_epi32(scalex);
2127
2128     xpoint   = _mm_round_pd(scalex, _MM_FROUND_TO_NEAREST_INT);
2129
2130     /* Extended precision arithmetics */
2131     z        = _mm_sub_pd(xabs, _mm_mul_pd(invtabscale0, xpoint));
2132     z        = _mm_sub_pd(z, _mm_mul_pd(invtabscale1, xpoint));
2133
2134     /* Range reduction to 0..2*Pi */
2135     tabidx   = _mm_and_si128(tabidx, tabmask);
2136
2137     /* tabidx is now in range [0,..,64] */
2138     imask     = _mm_cmpgt_epi32(tabidx, i32);
2139     sswapsign = imask;
2140     cswapsign = imask;
2141     corridx   = _mm_and_si128(imask, i32);
2142     tabidx    = _mm_sub_epi32(tabidx, corridx);
2143
2144     /* tabidx is now in range [0..32] */
2145     imask     = _mm_cmpgt_epi32(tabidx, i16);
2146     cswapsign = _mm_xor_si128(cswapsign, imask);
2147     corridx   = _mm_sub_epi32(i32, tabidx);
2148     tabidx    = _mm_blendv_epi8(tabidx, corridx, imask);
2149     /* tabidx is now in range [0..16] */
2150     ssign     = _mm_cvtepi32_pd( _mm_or_si128( sswapsign, ione ) );
2151     csign     = _mm_cvtepi32_pd( _mm_or_si128( cswapsign, ione ) );
2152
2153 #ifdef _MSC_VER
2154     ypoint0  = _mm_load_pd(sintable + 2*_mm_extract_epi32(tabidx, 0));
2155     ypoint1  = _mm_load_pd(sintable + 2*_mm_extract_epi32(tabidx, 1));
2156 #else
2157     ypoint0  = sintable[_mm_extract_epi32(tabidx, 0)];
2158     ypoint1  = sintable[_mm_extract_epi32(tabidx, 1)];
2159 #endif
2160     sinpoint = _mm_unpackhi_pd(ypoint0, ypoint1);
2161     cospoint = _mm_unpacklo_pd(ypoint0, ypoint1);
2162
2163     sinpoint = _mm_mul_pd(sinpoint, ssign);
2164     cospoint = _mm_mul_pd(cospoint, csign);
2165
2166     z2       = _mm_mul_pd(z, z);
2167
2168     polySin  = _mm_mul_pd(sinP7, z2);
2169     polySin  = _mm_add_pd(polySin, sinP5);
2170     polySin  = _mm_mul_pd(polySin, z2);
2171     polySin  = _mm_add_pd(polySin, sinP3);
2172     polySin  = _mm_mul_pd(polySin, z2);
2173     polySin  = _mm_add_pd(polySin, sinP1);
2174     polySin  = _mm_mul_pd(polySin, z);
2175
2176     polyCos  = _mm_mul_pd(cosP6, z2);
2177     polyCos  = _mm_add_pd(polyCos, cosP4);
2178     polyCos  = _mm_mul_pd(polyCos, z2);
2179     polyCos  = _mm_add_pd(polyCos, cosP2);
2180     polyCos  = _mm_mul_pd(polyCos, z2);
2181     polyCos  = _mm_add_pd(polyCos, cosP0);
2182
2183     *sinval  = _mm_xor_pd(_mm_add_pd( _mm_mul_pd(sinpoint, polyCos), _mm_mul_pd(cospoint, polySin) ), xsign);
2184     *cosval  = _mm_sub_pd( _mm_mul_pd(cospoint, polyCos), _mm_mul_pd(sinpoint, polySin) );
2185
2186     return 0;
2187 }
2188
2189
2190 /*
2191  * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
2192  * will then call the sincos() routine and waste a factor 2 in performance!
2193  */
2194 static __m256d
2195 gmx_mm256_sin_pd(__m256d x)
2196 {
2197     __m256d s, c;
2198     gmx_mm256_sincos_pd(x, &s, &c);
2199     return s;
2200 }
2201
2202 static __m128d
2203 gmx_mm_sin_pd(__m128d x)
2204 {
2205     __m128d s, c;
2206     gmx_mm_sincos_pd(x, &s, &c);
2207     return s;
2208 }
2209
2210 /*
2211  * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
2212  * will then call the sincos() routine and waste a factor 2 in performance!
2213  */
2214 static __m256d
2215 gmx_mm256_cos_pd(__m256d x)
2216 {
2217     __m256d s, c;
2218     gmx_mm256_sincos_pd(x, &s, &c);
2219     return c;
2220 }
2221
2222 static __m128d
2223 gmx_mm_cos_pd(__m128d x)
2224 {
2225     __m128d s, c;
2226     gmx_mm_sincos_pd(x, &s, &c);
2227     return c;
2228 }
2229
2230
2231 static __m256d
2232 gmx_mm256_tan_pd(__m256d x)
2233 {
2234     __m256d sinval, cosval;
2235     __m256d tanval;
2236
2237     gmx_mm256_sincos_pd(x, &sinval, &cosval);
2238
2239     tanval = _mm256_mul_pd(sinval, gmx_mm256_inv_pd(cosval));
2240
2241     return tanval;
2242 }
2243
2244 static __m128d
2245 gmx_mm_tan_pd(__m128d x)
2246 {
2247     __m128d sinval, cosval;
2248     __m128d tanval;
2249
2250     gmx_mm_sincos_pd(x, &sinval, &cosval);
2251
2252     tanval = _mm_mul_pd(sinval, gmx_mm_inv_pd(cosval));
2253
2254     return tanval;
2255 }
2256
2257
2258 static __m256d
2259 gmx_mm256_asin_pd(__m256d x)
2260 {
2261     /* Same algorithm as cephes library */
2262     const __m256d signmask  = _mm256_castsi256_pd( _mm256_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF,
2263                                                                     0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
2264     const __m256d limit1    = _mm256_set1_pd(0.625);
2265     const __m256d limit2    = _mm256_set1_pd(1e-8);
2266     const __m256d one       = _mm256_set1_pd(1.0);
2267     const __m256d halfpi    = _mm256_set1_pd(M_PI/2.0);
2268     const __m256d quarterpi = _mm256_set1_pd(M_PI/4.0);
2269     const __m256d morebits  = _mm256_set1_pd(6.123233995736765886130e-17);
2270
2271     const __m256d P5        = _mm256_set1_pd(4.253011369004428248960e-3);
2272     const __m256d P4        = _mm256_set1_pd(-6.019598008014123785661e-1);
2273     const __m256d P3        = _mm256_set1_pd(5.444622390564711410273e0);
2274     const __m256d P2        = _mm256_set1_pd(-1.626247967210700244449e1);
2275     const __m256d P1        = _mm256_set1_pd(1.956261983317594739197e1);
2276     const __m256d P0        = _mm256_set1_pd(-8.198089802484824371615e0);
2277
2278     const __m256d Q4        = _mm256_set1_pd(-1.474091372988853791896e1);
2279     const __m256d Q3        = _mm256_set1_pd(7.049610280856842141659e1);
2280     const __m256d Q2        = _mm256_set1_pd(-1.471791292232726029859e2);
2281     const __m256d Q1        = _mm256_set1_pd(1.395105614657485689735e2);
2282     const __m256d Q0        = _mm256_set1_pd(-4.918853881490881290097e1);
2283
2284     const __m256d R4        = _mm256_set1_pd(2.967721961301243206100e-3);
2285     const __m256d R3        = _mm256_set1_pd(-5.634242780008963776856e-1);
2286     const __m256d R2        = _mm256_set1_pd(6.968710824104713396794e0);
2287     const __m256d R1        = _mm256_set1_pd(-2.556901049652824852289e1);
2288     const __m256d R0        = _mm256_set1_pd(2.853665548261061424989e1);
2289
2290     const __m256d S3        = _mm256_set1_pd(-2.194779531642920639778e1);
2291     const __m256d S2        = _mm256_set1_pd(1.470656354026814941758e2);
2292     const __m256d S1        = _mm256_set1_pd(-3.838770957603691357202e2);
2293     const __m256d S0        = _mm256_set1_pd(3.424398657913078477438e2);
2294
2295     __m256d       sign;
2296     __m256d       mask;
2297     __m256d       xabs;
2298     __m256d       zz, ww, z, q, w, y, zz2, ww2;
2299     __m256d       PA, PB;
2300     __m256d       QA, QB;
2301     __m256d       RA, RB;
2302     __m256d       SA, SB;
2303     __m256d       nom, denom;
2304
2305     sign  = _mm256_andnot_pd(signmask, x);
2306     xabs  = _mm256_and_pd(x, signmask);
2307
2308     mask  = _mm256_cmp_pd(xabs, limit1, _CMP_GT_OQ);
2309
2310     zz    = _mm256_sub_pd(one, xabs);
2311     ww    = _mm256_mul_pd(xabs, xabs);
2312     zz2   = _mm256_mul_pd(zz, zz);
2313     ww2   = _mm256_mul_pd(ww, ww);
2314
2315     /* R */
2316     RA    = _mm256_mul_pd(R4, zz2);
2317     RB    = _mm256_mul_pd(R3, zz2);
2318     RA    = _mm256_add_pd(RA, R2);
2319     RB    = _mm256_add_pd(RB, R1);
2320     RA    = _mm256_mul_pd(RA, zz2);
2321     RB    = _mm256_mul_pd(RB, zz);
2322     RA    = _mm256_add_pd(RA, R0);
2323     RA    = _mm256_add_pd(RA, RB);
2324
2325     /* S, SA = zz2 */
2326     SB    = _mm256_mul_pd(S3, zz2);
2327     SA    = _mm256_add_pd(zz2, S2);
2328     SB    = _mm256_add_pd(SB, S1);
2329     SA    = _mm256_mul_pd(SA, zz2);
2330     SB    = _mm256_mul_pd(SB, zz);
2331     SA    = _mm256_add_pd(SA, S0);
2332     SA    = _mm256_add_pd(SA, SB);
2333
2334     /* P */
2335     PA    = _mm256_mul_pd(P5, ww2);
2336     PB    = _mm256_mul_pd(P4, ww2);
2337     PA    = _mm256_add_pd(PA, P3);
2338     PB    = _mm256_add_pd(PB, P2);
2339     PA    = _mm256_mul_pd(PA, ww2);
2340     PB    = _mm256_mul_pd(PB, ww2);
2341     PA    = _mm256_add_pd(PA, P1);
2342     PB    = _mm256_add_pd(PB, P0);
2343     PA    = _mm256_mul_pd(PA, ww);
2344     PA    = _mm256_add_pd(PA, PB);
2345
2346     /* Q, QA = ww2 */
2347     QB    = _mm256_mul_pd(Q4, ww2);
2348     QA    = _mm256_add_pd(ww2, Q3);
2349     QB    = _mm256_add_pd(QB, Q2);
2350     QA    = _mm256_mul_pd(QA, ww2);
2351     QB    = _mm256_mul_pd(QB, ww2);
2352     QA    = _mm256_add_pd(QA, Q1);
2353     QB    = _mm256_add_pd(QB, Q0);
2354     QA    = _mm256_mul_pd(QA, ww);
2355     QA    = _mm256_add_pd(QA, QB);
2356
2357     RA    = _mm256_mul_pd(RA, zz);
2358     PA    = _mm256_mul_pd(PA, ww);
2359
2360     nom   = _mm256_blendv_pd( PA, RA, mask );
2361     denom = _mm256_blendv_pd( QA, SA, mask );
2362
2363     q     = _mm256_mul_pd( nom, gmx_mm256_inv_pd(denom) );
2364
2365     zz    = _mm256_add_pd(zz, zz);
2366     zz    = gmx_mm256_sqrt_pd(zz);
2367     z     = _mm256_sub_pd(quarterpi, zz);
2368     zz    = _mm256_mul_pd(zz, q);
2369     zz    = _mm256_sub_pd(zz, morebits);
2370     z     = _mm256_sub_pd(z, zz);
2371     z     = _mm256_add_pd(z, quarterpi);
2372
2373     w     = _mm256_mul_pd(xabs, q);
2374     w     = _mm256_add_pd(w, xabs);
2375
2376     z     = _mm256_blendv_pd( w, z, mask );
2377
2378     mask  = _mm256_cmp_pd(xabs, limit2, _CMP_GT_OQ);
2379     z     = _mm256_blendv_pd( xabs, z, mask );
2380
2381     z = _mm256_xor_pd(z, sign);
2382
2383     return z;
2384 }
2385
2386 static __m128d
2387 gmx_mm_asin_pd(__m128d x)
2388 {
2389     /* Same algorithm as cephes library */
2390     const __m128d signmask  = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
2391     const __m128d limit1    = _mm_set1_pd(0.625);
2392     const __m128d limit2    = _mm_set1_pd(1e-8);
2393     const __m128d one       = _mm_set1_pd(1.0);
2394     const __m128d halfpi    = _mm_set1_pd(M_PI/2.0);
2395     const __m128d quarterpi = _mm_set1_pd(M_PI/4.0);
2396     const __m128d morebits  = _mm_set1_pd(6.123233995736765886130e-17);
2397
2398     const __m128d P5        = _mm_set1_pd(4.253011369004428248960e-3);
2399     const __m128d P4        = _mm_set1_pd(-6.019598008014123785661e-1);
2400     const __m128d P3        = _mm_set1_pd(5.444622390564711410273e0);
2401     const __m128d P2        = _mm_set1_pd(-1.626247967210700244449e1);
2402     const __m128d P1        = _mm_set1_pd(1.956261983317594739197e1);
2403     const __m128d P0        = _mm_set1_pd(-8.198089802484824371615e0);
2404
2405     const __m128d Q4        = _mm_set1_pd(-1.474091372988853791896e1);
2406     const __m128d Q3        = _mm_set1_pd(7.049610280856842141659e1);
2407     const __m128d Q2        = _mm_set1_pd(-1.471791292232726029859e2);
2408     const __m128d Q1        = _mm_set1_pd(1.395105614657485689735e2);
2409     const __m128d Q0        = _mm_set1_pd(-4.918853881490881290097e1);
2410
2411     const __m128d R4        = _mm_set1_pd(2.967721961301243206100e-3);
2412     const __m128d R3        = _mm_set1_pd(-5.634242780008963776856e-1);
2413     const __m128d R2        = _mm_set1_pd(6.968710824104713396794e0);
2414     const __m128d R1        = _mm_set1_pd(-2.556901049652824852289e1);
2415     const __m128d R0        = _mm_set1_pd(2.853665548261061424989e1);
2416
2417     const __m128d S3        = _mm_set1_pd(-2.194779531642920639778e1);
2418     const __m128d S2        = _mm_set1_pd(1.470656354026814941758e2);
2419     const __m128d S1        = _mm_set1_pd(-3.838770957603691357202e2);
2420     const __m128d S0        = _mm_set1_pd(3.424398657913078477438e2);
2421
2422     __m128d       sign;
2423     __m128d       mask;
2424     __m128d       xabs;
2425     __m128d       zz, ww, z, q, w, y, zz2, ww2;
2426     __m128d       PA, PB;
2427     __m128d       QA, QB;
2428     __m128d       RA, RB;
2429     __m128d       SA, SB;
2430     __m128d       nom, denom;
2431
2432     sign  = _mm_andnot_pd(signmask, x);
2433     xabs  = _mm_and_pd(x, signmask);
2434
2435     mask  = _mm_cmpgt_pd(xabs, limit1);
2436
2437     zz    = _mm_sub_pd(one, xabs);
2438     ww    = _mm_mul_pd(xabs, xabs);
2439     zz2   = _mm_mul_pd(zz, zz);
2440     ww2   = _mm_mul_pd(ww, ww);
2441
2442     /* R */
2443     RA    = _mm_mul_pd(R4, zz2);
2444     RB    = _mm_mul_pd(R3, zz2);
2445     RA    = _mm_add_pd(RA, R2);
2446     RB    = _mm_add_pd(RB, R1);
2447     RA    = _mm_mul_pd(RA, zz2);
2448     RB    = _mm_mul_pd(RB, zz);
2449     RA    = _mm_add_pd(RA, R0);
2450     RA    = _mm_add_pd(RA, RB);
2451
2452     /* S, SA = zz2 */
2453     SB    = _mm_mul_pd(S3, zz2);
2454     SA    = _mm_add_pd(zz2, S2);
2455     SB    = _mm_add_pd(SB, S1);
2456     SA    = _mm_mul_pd(SA, zz2);
2457     SB    = _mm_mul_pd(SB, zz);
2458     SA    = _mm_add_pd(SA, S0);
2459     SA    = _mm_add_pd(SA, SB);
2460
2461     /* P */
2462     PA    = _mm_mul_pd(P5, ww2);
2463     PB    = _mm_mul_pd(P4, ww2);
2464     PA    = _mm_add_pd(PA, P3);
2465     PB    = _mm_add_pd(PB, P2);
2466     PA    = _mm_mul_pd(PA, ww2);
2467     PB    = _mm_mul_pd(PB, ww2);
2468     PA    = _mm_add_pd(PA, P1);
2469     PB    = _mm_add_pd(PB, P0);
2470     PA    = _mm_mul_pd(PA, ww);
2471     PA    = _mm_add_pd(PA, PB);
2472
2473     /* Q, QA = ww2 */
2474     QB    = _mm_mul_pd(Q4, ww2);
2475     QA    = _mm_add_pd(ww2, Q3);
2476     QB    = _mm_add_pd(QB, Q2);
2477     QA    = _mm_mul_pd(QA, ww2);
2478     QB    = _mm_mul_pd(QB, ww2);
2479     QA    = _mm_add_pd(QA, Q1);
2480     QB    = _mm_add_pd(QB, Q0);
2481     QA    = _mm_mul_pd(QA, ww);
2482     QA    = _mm_add_pd(QA, QB);
2483
2484     RA    = _mm_mul_pd(RA, zz);
2485     PA    = _mm_mul_pd(PA, ww);
2486
2487     nom   = _mm_blendv_pd( PA, RA, mask );
2488     denom = _mm_blendv_pd( QA, SA, mask );
2489
2490     q     = _mm_mul_pd( nom, gmx_mm_inv_pd(denom) );
2491
2492     zz    = _mm_add_pd(zz, zz);
2493     zz    = gmx_mm_sqrt_pd(zz);
2494     z     = _mm_sub_pd(quarterpi, zz);
2495     zz    = _mm_mul_pd(zz, q);
2496     zz    = _mm_sub_pd(zz, morebits);
2497     z     = _mm_sub_pd(z, zz);
2498     z     = _mm_add_pd(z, quarterpi);
2499
2500     w     = _mm_mul_pd(xabs, q);
2501     w     = _mm_add_pd(w, xabs);
2502
2503     z     = _mm_blendv_pd( w, z, mask );
2504
2505     mask  = _mm_cmpgt_pd(xabs, limit2);
2506     z     = _mm_blendv_pd( xabs, z, mask );
2507
2508     z = _mm_xor_pd(z, sign);
2509
2510     return z;
2511 }
2512
2513
2514 static __m256d
2515 gmx_mm256_acos_pd(__m256d x)
2516 {
2517     const __m256d signmask  = _mm256_castsi256_pd( _mm256_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF,
2518                                                                     0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
2519     const __m256d one        = _mm256_set1_pd(1.0);
2520     const __m256d half       = _mm256_set1_pd(0.5);
2521     const __m256d pi         = _mm256_set1_pd(M_PI);
2522     const __m256d quarterpi0 = _mm256_set1_pd(7.85398163397448309616e-1);
2523     const __m256d quarterpi1 = _mm256_set1_pd(6.123233995736765886130e-17);
2524
2525
2526     __m256d mask1;
2527
2528     __m256d z, z1, z2;
2529
2530     mask1 = _mm256_cmp_pd(x, half, _CMP_GT_OQ);
2531     z1    = _mm256_mul_pd(half, _mm256_sub_pd(one, x));
2532     z1    = gmx_mm256_sqrt_pd(z1);
2533     z     = _mm256_blendv_pd( x, z1, mask1 );
2534
2535     z     = gmx_mm256_asin_pd(z);
2536
2537     z1    = _mm256_add_pd(z, z);
2538
2539     z2    = _mm256_sub_pd(quarterpi0, z);
2540     z2    = _mm256_add_pd(z2, quarterpi1);
2541     z2    = _mm256_add_pd(z2, quarterpi0);
2542
2543     z     = _mm256_blendv_pd(z2, z1, mask1);
2544
2545     return z;
2546 }
2547
2548 static __m128d
2549 gmx_mm_acos_pd(__m128d x)
2550 {
2551     const __m128d signmask   = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
2552     const __m128d one        = _mm_set1_pd(1.0);
2553     const __m128d half       = _mm_set1_pd(0.5);
2554     const __m128d pi         = _mm_set1_pd(M_PI);
2555     const __m128d quarterpi0 = _mm_set1_pd(7.85398163397448309616e-1);
2556     const __m128d quarterpi1 = _mm_set1_pd(6.123233995736765886130e-17);
2557
2558
2559     __m128d mask1;
2560
2561     __m128d z, z1, z2;
2562
2563     mask1 = _mm_cmpgt_pd(x, half);
2564     z1    = _mm_mul_pd(half, _mm_sub_pd(one, x));
2565     z1    = gmx_mm_sqrt_pd(z1);
2566     z     = _mm_blendv_pd( x, z1, mask1 );
2567
2568     z     = gmx_mm_asin_pd(z);
2569
2570     z1    = _mm_add_pd(z, z);
2571
2572     z2    = _mm_sub_pd(quarterpi0, z);
2573     z2    = _mm_add_pd(z2, quarterpi1);
2574     z2    = _mm_add_pd(z2, quarterpi0);
2575
2576     z     = _mm_blendv_pd(z2, z1, mask1);
2577
2578     return z;
2579 }
2580
2581
2582 static __m256d
2583 gmx_mm256_atan_pd(__m256d x)
2584 {
2585     /* Same algorithm as cephes library */
2586     const __m256d signmask  = _mm256_castsi256_pd( _mm256_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF,
2587                                                                     0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
2588     const __m256d limit1    = _mm256_set1_pd(0.66);
2589     const __m256d limit2    = _mm256_set1_pd(2.41421356237309504880);
2590     const __m256d quarterpi = _mm256_set1_pd(M_PI/4.0);
2591     const __m256d halfpi    = _mm256_set1_pd(M_PI/2.0);
2592     const __m256d mone      = _mm256_set1_pd(-1.0);
2593     const __m256d morebits1 = _mm256_set1_pd(0.5*6.123233995736765886130E-17);
2594     const __m256d morebits2 = _mm256_set1_pd(6.123233995736765886130E-17);
2595
2596     const __m256d P4        = _mm256_set1_pd(-8.750608600031904122785E-1);
2597     const __m256d P3        = _mm256_set1_pd(-1.615753718733365076637E1);
2598     const __m256d P2        = _mm256_set1_pd(-7.500855792314704667340E1);
2599     const __m256d P1        = _mm256_set1_pd(-1.228866684490136173410E2);
2600     const __m256d P0        = _mm256_set1_pd(-6.485021904942025371773E1);
2601
2602     const __m256d Q4        = _mm256_set1_pd(2.485846490142306297962E1);
2603     const __m256d Q3        = _mm256_set1_pd(1.650270098316988542046E2);
2604     const __m256d Q2        = _mm256_set1_pd(4.328810604912902668951E2);
2605     const __m256d Q1        = _mm256_set1_pd(4.853903996359136964868E2);
2606     const __m256d Q0        = _mm256_set1_pd(1.945506571482613964425E2);
2607
2608     __m256d       sign;
2609     __m256d       mask1, mask2;
2610     __m256d       y, t1, t2;
2611     __m256d       z, z2;
2612     __m256d       P_A, P_B, Q_A, Q_B;
2613
2614     sign   = _mm256_andnot_pd(signmask, x);
2615     x      = _mm256_and_pd(x, signmask);
2616
2617     mask1  = _mm256_cmp_pd(x, limit1, _CMP_GT_OQ);
2618     mask2  = _mm256_cmp_pd(x, limit2, _CMP_GT_OQ);
2619
2620     t1     = _mm256_mul_pd(_mm256_add_pd(x, mone), gmx_mm256_inv_pd(_mm256_sub_pd(x, mone)));
2621     t2     = _mm256_mul_pd(mone, gmx_mm256_inv_pd(x));
2622
2623     y      = _mm256_and_pd(mask1, quarterpi);
2624     y      = _mm256_or_pd( _mm256_and_pd(mask2, halfpi), _mm256_andnot_pd(mask2, y) );
2625
2626     x      = _mm256_or_pd( _mm256_and_pd(mask1, t1), _mm256_andnot_pd(mask1, x) );
2627     x      = _mm256_or_pd( _mm256_and_pd(mask2, t2), _mm256_andnot_pd(mask2, x) );
2628
2629     z      = _mm256_mul_pd(x, x);
2630     z2     = _mm256_mul_pd(z, z);
2631
2632     P_A    = _mm256_mul_pd(P4, z2);
2633     P_B    = _mm256_mul_pd(P3, z2);
2634     P_A    = _mm256_add_pd(P_A, P2);
2635     P_B    = _mm256_add_pd(P_B, P1);
2636     P_A    = _mm256_mul_pd(P_A, z2);
2637     P_B    = _mm256_mul_pd(P_B, z);
2638     P_A    = _mm256_add_pd(P_A, P0);
2639     P_A    = _mm256_add_pd(P_A, P_B);
2640
2641     /* Q_A = z2 */
2642     Q_B    = _mm256_mul_pd(Q4, z2);
2643     Q_A    = _mm256_add_pd(z2, Q3);
2644     Q_B    = _mm256_add_pd(Q_B, Q2);
2645     Q_A    = _mm256_mul_pd(Q_A, z2);
2646     Q_B    = _mm256_mul_pd(Q_B, z2);
2647     Q_A    = _mm256_add_pd(Q_A, Q1);
2648     Q_B    = _mm256_add_pd(Q_B, Q0);
2649     Q_A    = _mm256_mul_pd(Q_A, z);
2650     Q_A    = _mm256_add_pd(Q_A, Q_B);
2651
2652     z      = _mm256_mul_pd(z, P_A);
2653     z      = _mm256_mul_pd(z, gmx_mm256_inv_pd(Q_A));
2654     z      = _mm256_mul_pd(z, x);
2655     z      = _mm256_add_pd(z, x);
2656
2657     t1     = _mm256_and_pd(mask1, morebits1);
2658     t1     = _mm256_or_pd( _mm256_and_pd(mask2, morebits2), _mm256_andnot_pd(mask2, t1) );
2659
2660     z      = _mm256_add_pd(z, t1);
2661     y      = _mm256_add_pd(y, z);
2662
2663     y      = _mm256_xor_pd(y, sign);
2664
2665     return y;
2666 }
2667
2668 static __m128d
2669 gmx_mm_atan_pd(__m128d x)
2670 {
2671     /* Same algorithm as cephes library */
2672     const __m128d signmask  = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
2673     const __m128d limit1    = _mm_set1_pd(0.66);
2674     const __m128d limit2    = _mm_set1_pd(2.41421356237309504880);
2675     const __m128d quarterpi = _mm_set1_pd(M_PI/4.0);
2676     const __m128d halfpi    = _mm_set1_pd(M_PI/2.0);
2677     const __m128d mone      = _mm_set1_pd(-1.0);
2678     const __m128d morebits1 = _mm_set1_pd(0.5*6.123233995736765886130E-17);
2679     const __m128d morebits2 = _mm_set1_pd(6.123233995736765886130E-17);
2680
2681     const __m128d P4        = _mm_set1_pd(-8.750608600031904122785E-1);
2682     const __m128d P3        = _mm_set1_pd(-1.615753718733365076637E1);
2683     const __m128d P2        = _mm_set1_pd(-7.500855792314704667340E1);
2684     const __m128d P1        = _mm_set1_pd(-1.228866684490136173410E2);
2685     const __m128d P0        = _mm_set1_pd(-6.485021904942025371773E1);
2686
2687     const __m128d Q4        = _mm_set1_pd(2.485846490142306297962E1);
2688     const __m128d Q3        = _mm_set1_pd(1.650270098316988542046E2);
2689     const __m128d Q2        = _mm_set1_pd(4.328810604912902668951E2);
2690     const __m128d Q1        = _mm_set1_pd(4.853903996359136964868E2);
2691     const __m128d Q0        = _mm_set1_pd(1.945506571482613964425E2);
2692
2693     __m128d       sign;
2694     __m128d       mask1, mask2;
2695     __m128d       y, t1, t2;
2696     __m128d       z, z2;
2697     __m128d       P_A, P_B, Q_A, Q_B;
2698
2699     sign   = _mm_andnot_pd(signmask, x);
2700     x      = _mm_and_pd(x, signmask);
2701
2702     mask1  = _mm_cmpgt_pd(x, limit1);
2703     mask2  = _mm_cmpgt_pd(x, limit2);
2704
2705     t1     = _mm_mul_pd(_mm_add_pd(x, mone), gmx_mm_inv_pd(_mm_sub_pd(x, mone)));
2706     t2     = _mm_mul_pd(mone, gmx_mm_inv_pd(x));
2707
2708     y      = _mm_and_pd(mask1, quarterpi);
2709     y      = _mm_or_pd( _mm_and_pd(mask2, halfpi), _mm_andnot_pd(mask2, y) );
2710
2711     x      = _mm_or_pd( _mm_and_pd(mask1, t1), _mm_andnot_pd(mask1, x) );
2712     x      = _mm_or_pd( _mm_and_pd(mask2, t2), _mm_andnot_pd(mask2, x) );
2713
2714     z      = _mm_mul_pd(x, x);
2715     z2     = _mm_mul_pd(z, z);
2716
2717     P_A    = _mm_mul_pd(P4, z2);
2718     P_B    = _mm_mul_pd(P3, z2);
2719     P_A    = _mm_add_pd(P_A, P2);
2720     P_B    = _mm_add_pd(P_B, P1);
2721     P_A    = _mm_mul_pd(P_A, z2);
2722     P_B    = _mm_mul_pd(P_B, z);
2723     P_A    = _mm_add_pd(P_A, P0);
2724     P_A    = _mm_add_pd(P_A, P_B);
2725
2726     /* Q_A = z2 */
2727     Q_B    = _mm_mul_pd(Q4, z2);
2728     Q_A    = _mm_add_pd(z2, Q3);
2729     Q_B    = _mm_add_pd(Q_B, Q2);
2730     Q_A    = _mm_mul_pd(Q_A, z2);
2731     Q_B    = _mm_mul_pd(Q_B, z2);
2732     Q_A    = _mm_add_pd(Q_A, Q1);
2733     Q_B    = _mm_add_pd(Q_B, Q0);
2734     Q_A    = _mm_mul_pd(Q_A, z);
2735     Q_A    = _mm_add_pd(Q_A, Q_B);
2736
2737     z      = _mm_mul_pd(z, P_A);
2738     z      = _mm_mul_pd(z, gmx_mm_inv_pd(Q_A));
2739     z      = _mm_mul_pd(z, x);
2740     z      = _mm_add_pd(z, x);
2741
2742     t1     = _mm_and_pd(mask1, morebits1);
2743     t1     = _mm_or_pd( _mm_and_pd(mask2, morebits2), _mm_andnot_pd(mask2, t1) );
2744
2745     z      = _mm_add_pd(z, t1);
2746     y      = _mm_add_pd(y, z);
2747
2748     y      = _mm_xor_pd(y, sign);
2749
2750     return y;
2751 }
2752
2753
2754
2755 static __m256d
2756 gmx_mm256_atan2_pd(__m256d y, __m256d x)
2757 {
2758     const __m256d pi          = _mm256_set1_pd(M_PI);
2759     const __m256d minuspi     = _mm256_set1_pd(-M_PI);
2760     const __m256d halfpi      = _mm256_set1_pd(M_PI/2.0);
2761     const __m256d minushalfpi = _mm256_set1_pd(-M_PI/2.0);
2762
2763     __m256d       z, z1, z3, z4;
2764     __m256d       w;
2765     __m256d       maskx_lt, maskx_eq;
2766     __m256d       masky_lt, masky_eq;
2767     __m256d       mask1, mask2, mask3, mask4, maskall;
2768
2769     maskx_lt  = _mm256_cmp_pd(x, _mm256_setzero_pd(), _CMP_LT_OQ);
2770     masky_lt  = _mm256_cmp_pd(y, _mm256_setzero_pd(), _CMP_LT_OQ);
2771     maskx_eq  = _mm256_cmp_pd(x, _mm256_setzero_pd(), _CMP_EQ_OQ);
2772     masky_eq  = _mm256_cmp_pd(y, _mm256_setzero_pd(), _CMP_EQ_OQ);
2773
2774     z         = _mm256_mul_pd(y, gmx_mm256_inv_pd(x));
2775     z         = gmx_mm256_atan_pd(z);
2776
2777     mask1     = _mm256_and_pd(maskx_eq, masky_lt);
2778     mask2     = _mm256_andnot_pd(maskx_lt, masky_eq);
2779     mask3     = _mm256_andnot_pd( _mm256_or_pd(masky_lt, masky_eq), maskx_eq);
2780     mask4     = _mm256_and_pd(masky_eq, maskx_lt);
2781
2782     maskall   = _mm256_or_pd( _mm256_or_pd(mask1, mask2), _mm256_or_pd(mask3, mask4) );
2783
2784     z         = _mm256_andnot_pd(maskall, z);
2785     z1        = _mm256_and_pd(mask1, minushalfpi);
2786     z3        = _mm256_and_pd(mask3, halfpi);
2787     z4        = _mm256_and_pd(mask4, pi);
2788
2789     z         = _mm256_or_pd( _mm256_or_pd(z, z1), _mm256_or_pd(z3, z4) );
2790
2791     w         = _mm256_blendv_pd(pi, minuspi, masky_lt);
2792     w         = _mm256_and_pd(w, maskx_lt);
2793
2794     w         = _mm256_andnot_pd(maskall, w);
2795
2796     z         = _mm256_add_pd(z, w);
2797
2798     return z;
2799 }
2800
2801 static __m128d
2802 gmx_mm_atan2_pd(__m128d y, __m128d x)
2803 {
2804     const __m128d pi          = _mm_set1_pd(M_PI);
2805     const __m128d minuspi     = _mm_set1_pd(-M_PI);
2806     const __m128d halfpi      = _mm_set1_pd(M_PI/2.0);
2807     const __m128d minushalfpi = _mm_set1_pd(-M_PI/2.0);
2808
2809     __m128d       z, z1, z3, z4;
2810     __m128d       w;
2811     __m128d       maskx_lt, maskx_eq;
2812     __m128d       masky_lt, masky_eq;
2813     __m128d       mask1, mask2, mask3, mask4, maskall;
2814
2815     maskx_lt  = _mm_cmplt_pd(x, _mm_setzero_pd());
2816     masky_lt  = _mm_cmplt_pd(y, _mm_setzero_pd());
2817     maskx_eq  = _mm_cmpeq_pd(x, _mm_setzero_pd());
2818     masky_eq  = _mm_cmpeq_pd(y, _mm_setzero_pd());
2819
2820     z         = _mm_mul_pd(y, gmx_mm_inv_pd(x));
2821     z         = gmx_mm_atan_pd(z);
2822
2823     mask1     = _mm_and_pd(maskx_eq, masky_lt);
2824     mask2     = _mm_andnot_pd(maskx_lt, masky_eq);
2825     mask3     = _mm_andnot_pd( _mm_or_pd(masky_lt, masky_eq), maskx_eq);
2826     mask4     = _mm_and_pd(masky_eq, maskx_lt);
2827
2828     maskall   = _mm_or_pd( _mm_or_pd(mask1, mask2), _mm_or_pd(mask3, mask4) );
2829
2830     z         = _mm_andnot_pd(maskall, z);
2831     z1        = _mm_and_pd(mask1, minushalfpi);
2832     z3        = _mm_and_pd(mask3, halfpi);
2833     z4        = _mm_and_pd(mask4, pi);
2834
2835     z         = _mm_or_pd( _mm_or_pd(z, z1), _mm_or_pd(z3, z4) );
2836
2837     w         = _mm_blendv_pd(pi, minuspi, masky_lt);
2838     w         = _mm_and_pd(w, maskx_lt);
2839
2840     w         = _mm_andnot_pd(maskall, w);
2841
2842     z         = _mm_add_pd(z, w);
2843
2844     return z;
2845 }
2846
2847 #endif /* _gmx_math_x86_avx_256_double_h_ */