/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 *
 * This file is part of GROMACS.
 * Copyright (c) 2012-
 *
 * Written by the Gromacs development team under coordination of
 * David van der Spoel, Berk Hess, and Erik Lindahl.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org
 *
 * And Hey:
 * Gnomes, ROck Monsters And Chili Sauce
 */
#ifndef _gmx_math_x86_avx_256_single_h_
#define _gmx_math_x86_avx_256_single_h_

#include "gmx_x86_avx_256.h"

#include <math.h>

#ifndef M_PI
#  define M_PI 3.14159265358979323846264338327950288
#endif


/************************
 *                      *
 * Simple math routines *
 *                      *
 ************************/

/* 1.0/sqrt(x), 256-bit wide version */
static gmx_inline __m256
gmx_mm256_invsqrt_ps(__m256 x)
{
    const __m256 half  = _mm256_set1_ps(0.5f);
    const __m256 three = _mm256_set1_ps(3.0f);

    __m256 lu = _mm256_rsqrt_ps(x);

    return _mm256_mul_ps(half,_mm256_mul_ps(_mm256_sub_ps(three,_mm256_mul_ps(_mm256_mul_ps(lu,lu),x)),lu));
}
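
/* Usage sketch (illustrative only; the helper name below is hypothetical and not part
 * of this header): given a vector of squared distances rsq, the rsqrt+Newton-Raphson
 * routine above directly yields 1/r, and 1/r^2 follows from one extra multiply.
 */
static gmx_inline void
gmx_mm256_example_rinv_rinvsq(__m256 rsq, __m256 *rinv, __m256 *rinvsq)
{
    __m256 lu = gmx_mm256_invsqrt_ps(rsq);   /* 1/sqrt(rsq) = 1/r */

    *rinv   = lu;
    *rinvsq = _mm256_mul_ps(lu,lu);          /* (1/r)^2 = 1/r^2   */
}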

/* 1.0/sqrt(x), 128-bit wide version */
static gmx_inline __m128
gmx_mm_invsqrt_ps(__m128 x)
{
    const __m128 half  = _mm_set_ps(0.5,0.5,0.5,0.5);
    const __m128 three = _mm_set_ps(3.0,3.0,3.0,3.0);

    __m128 lu = _mm_rsqrt_ps(x);

    return _mm_mul_ps(half,_mm_mul_ps(_mm_sub_ps(three,_mm_mul_ps(_mm_mul_ps(lu,lu),x)),lu));
}


/* sqrt(x) (256 bit) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
static gmx_inline __m256
gmx_mm256_sqrt_ps(__m256 x)
{
    __m256 mask;
    __m256 res;

    mask = _mm256_cmp_ps(x,_mm256_setzero_ps(),_CMP_EQ_OQ);
    res  = _mm256_andnot_ps(mask,gmx_mm256_invsqrt_ps(x));

    res  = _mm256_mul_ps(x,res);

    return res;
}

/* sqrt(x) (128 bit) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
static gmx_inline __m128
gmx_mm_sqrt_ps(__m128 x)
{
    __m128 mask;
    __m128 res;

    mask = _mm_cmpeq_ps(x,_mm_setzero_ps());
    res  = _mm_andnot_ps(mask,gmx_mm_invsqrt_ps(x));

    res  = _mm_mul_ps(x,res);

    return res;
}


/* 1.0/x, 256-bit wide */
static gmx_inline __m256
gmx_mm256_inv_ps(__m256 x)
{
    const __m256 two = _mm256_set1_ps(2.0f);

    __m256 lu = _mm256_rcp_ps(x);

    return _mm256_mul_ps(lu,_mm256_sub_ps(two,_mm256_mul_ps(lu,x)));
}

/* 1.0/x, 128-bit wide */
static gmx_inline __m128
gmx_mm_inv_ps(__m128 x)
{
    const __m128 two = _mm_set_ps(2.0f,2.0f,2.0f,2.0f);

    __m128 lu = _mm_rcp_ps(x);

    return _mm_mul_ps(lu,_mm_sub_ps(two,_mm_mul_ps(lu,x)));
}


static gmx_inline __m256
gmx_mm256_abs_ps(__m256 x)
{
    const __m256 signmask = _mm256_castsi256_ps( _mm256_set1_epi32(0x7FFFFFFF) );

    return _mm256_and_ps(x,signmask);
}

static gmx_inline __m128
gmx_mm_abs_ps(__m128 x)
{
    const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );

    return _mm_and_ps(x,signmask);
}


static __m256
gmx_mm256_log_ps(__m256 x)
{
    const __m256 expmask     = _mm256_castsi256_ps( _mm256_set1_epi32(0x7F800000) );
    const __m128i expbase_m1 = _mm_set1_epi32(127-1); /* We want non-IEEE format */
    const __m256 half        = _mm256_set1_ps(0.5f);
    const __m256 one         = _mm256_set1_ps(1.0f);
    const __m256 invsq2      = _mm256_set1_ps(1.0f/sqrt(2.0f));
    const __m256 corr1       = _mm256_set1_ps(-2.12194440e-4f);
    const __m256 corr2       = _mm256_set1_ps(0.693359375f);

    const __m256 CA_1        = _mm256_set1_ps(0.070376836292f);
    const __m256 CB_0        = _mm256_set1_ps(1.6714950086782716f);
    const __m256 CB_1        = _mm256_set1_ps(-2.452088066061482f);
    const __m256 CC_0        = _mm256_set1_ps(1.5220770854701728f);
    const __m256 CC_1        = _mm256_set1_ps(-1.3422238433233642f);
    const __m256 CD_0        = _mm256_set1_ps(1.386218787509749f);
    const __m256 CD_1        = _mm256_set1_ps(0.35075468953796346f);
    const __m256 CE_0        = _mm256_set1_ps(1.3429983063133937f);
    const __m256 CE_1        = _mm256_set1_ps(1.807420826584643f);

    __m256  fexp,fexp1;
    __m256i iexp;
    __m128i iexp128a,iexp128b;
    __m256  mask;
    __m256i imask;
    __m128i imask128a,imask128b;
    __m256  x1,x2;
    __m256  y;
    __m256  pA,pB,pC,pD,pE,tB,tC,tD,tE;

    /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
    fexp = _mm256_and_ps(x,expmask);
    iexp = _mm256_castps_si256(fexp);

    iexp128b = _mm256_extractf128_si256(iexp,0x1);
    iexp128a = _mm256_castsi256_si128(iexp);

    iexp128a = _mm_srli_epi32(iexp128a,23);
    iexp128b = _mm_srli_epi32(iexp128b,23);
    iexp128a = _mm_sub_epi32(iexp128a,expbase_m1);
    iexp128b = _mm_sub_epi32(iexp128b,expbase_m1);

    x = _mm256_andnot_ps(expmask,x);
    x = _mm256_or_ps(x,one);
    x = _mm256_mul_ps(x,half);

    mask = _mm256_cmp_ps(x,invsq2,_CMP_LT_OQ);

    x = _mm256_add_ps(x,_mm256_and_ps(mask,x));
    x = _mm256_sub_ps(x,one);

    imask = _mm256_castps_si256(mask);

    imask128b = _mm256_extractf128_si256(imask,0x1);
    imask128a = _mm256_castsi256_si128(imask);

    iexp128a = _mm_add_epi32(iexp128a,imask128a);
    iexp128b = _mm_add_epi32(iexp128b,imask128b);

    iexp = _mm256_castsi128_si256(iexp128a);
    iexp = _mm256_insertf128_si256(iexp,iexp128b,0x1);

    x2 = _mm256_mul_ps(x,x);

    pA = _mm256_mul_ps(CA_1,x);
    pB = _mm256_mul_ps(CB_1,x);
    pC = _mm256_mul_ps(CC_1,x);
    pD = _mm256_mul_ps(CD_1,x);
    pE = _mm256_mul_ps(CE_1,x);
    tB = _mm256_add_ps(CB_0,x2);
    tC = _mm256_add_ps(CC_0,x2);
    tD = _mm256_add_ps(CD_0,x2);
    tE = _mm256_add_ps(CE_0,x2);
    pB = _mm256_add_ps(pB,tB);
    pC = _mm256_add_ps(pC,tC);
    pD = _mm256_add_ps(pD,tD);
    pE = _mm256_add_ps(pE,tE);

    pA = _mm256_mul_ps(pA,pB);
    pC = _mm256_mul_ps(pC,pD);
    pE = _mm256_mul_ps(pE,x2);
    pA = _mm256_mul_ps(pA,pC);
    y  = _mm256_mul_ps(pA,pE);

    fexp = _mm256_cvtepi32_ps(iexp);
    y    = _mm256_add_ps(y,_mm256_mul_ps(fexp,corr1));

    y  = _mm256_sub_ps(y, _mm256_mul_ps(half,x2));
    x2 = _mm256_add_ps(x,y);

    x2 = _mm256_add_ps(x2,_mm256_mul_ps(fexp,corr2));

    return x2;
}
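
/* Usage sketch (illustrative only; the helper name below is hypothetical and not part
 * of this header): gmx_mm256_log_ps() returns the natural logarithm, so logarithms to
 * other bases only need one extra multiply, e.g. log2(x) = ln(x)/ln(2).
 */
static gmx_inline __m256
gmx_mm256_example_log2_ps(__m256 x)
{
    const __m256 invln2 = _mm256_set1_ps(1.44269504088896341f);  /* 1/ln(2) */

    return _mm256_mul_ps(gmx_mm256_log_ps(x),invln2);
}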

static __m128
gmx_mm_log_ps(__m128 x)
{
    /* Same algorithm as cephes library */
    const __m128 expmask     = gmx_mm_castsi128_ps( _mm_set_epi32(0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000) );
    const __m128i expbase_m1 = _mm_set1_epi32(127-1); /* We want non-IEEE format */
    const __m128 half        = _mm_set1_ps(0.5f);
    const __m128 one         = _mm_set1_ps(1.0f);
    const __m128 invsq2      = _mm_set1_ps(1.0f/sqrt(2.0f));
    const __m128 corr1       = _mm_set1_ps(-2.12194440e-4f);
    const __m128 corr2       = _mm_set1_ps(0.693359375f);

    const __m128 CA_1        = _mm_set1_ps(0.070376836292f);
    const __m128 CB_0        = _mm_set1_ps(1.6714950086782716f);
    const __m128 CB_1        = _mm_set1_ps(-2.452088066061482f);
    const __m128 CC_0        = _mm_set1_ps(1.5220770854701728f);
    const __m128 CC_1        = _mm_set1_ps(-1.3422238433233642f);
    const __m128 CD_0        = _mm_set1_ps(1.386218787509749f);
    const __m128 CD_1        = _mm_set1_ps(0.35075468953796346f);
    const __m128 CE_0        = _mm_set1_ps(1.3429983063133937f);
    const __m128 CE_1        = _mm_set1_ps(1.807420826584643f);

    __m128  fexp,fexp1;
    __m128i iexp;
    __m128  mask;
    __m128  x1,x2;
    __m128  y;
    __m128  pA,pB,pC,pD,pE,tB,tC,tD,tE;

    /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
    fexp = _mm_and_ps(x,expmask);
    iexp = gmx_mm_castps_si128(fexp);
    iexp = _mm_srli_epi32(iexp,23);
    iexp = _mm_sub_epi32(iexp,expbase_m1);

    x = _mm_andnot_ps(expmask,x);
    x = _mm_or_ps(x,one);
    x = _mm_mul_ps(x,half);

    mask = _mm_cmplt_ps(x,invsq2);

    x    = _mm_add_ps(x,_mm_and_ps(mask,x));
    x    = _mm_sub_ps(x,one);
    iexp = _mm_add_epi32(iexp,gmx_mm_castps_si128(mask)); /* 0xFFFFFFFF = -1 as int */

    x2 = _mm_mul_ps(x,x);

    pA = _mm_mul_ps(CA_1,x);
    pB = _mm_mul_ps(CB_1,x);
    pC = _mm_mul_ps(CC_1,x);
    pD = _mm_mul_ps(CD_1,x);
    pE = _mm_mul_ps(CE_1,x);
    tB = _mm_add_ps(CB_0,x2);
    tC = _mm_add_ps(CC_0,x2);
    tD = _mm_add_ps(CD_0,x2);
    tE = _mm_add_ps(CE_0,x2);
    pB = _mm_add_ps(pB,tB);
    pC = _mm_add_ps(pC,tC);
    pD = _mm_add_ps(pD,tD);
    pE = _mm_add_ps(pE,tE);

    pA = _mm_mul_ps(pA,pB);
    pC = _mm_mul_ps(pC,pD);
    pE = _mm_mul_ps(pE,x2);
    pA = _mm_mul_ps(pA,pC);
    y  = _mm_mul_ps(pA,pE);

    fexp = _mm_cvtepi32_ps(iexp);
    y    = _mm_add_ps(y,_mm_mul_ps(fexp,corr1));

    y  = _mm_sub_ps(y, _mm_mul_ps(half,x2));
    x2 = _mm_add_ps(x,y);

    x2 = _mm_add_ps(x2,_mm_mul_ps(fexp,corr2));

    return x2;
}

/* 2^x function, 256-bit wide
 *
 * The 2^w term is calculated from a (6,0)-th order (no denominator) minimax polynomial on the interval
 * [-0.5,0.5]. The coefficients were derived in Mathematica using the command:
 *
 * MiniMaxApproximation[(2^x), {x, {-0.5, 0.5}, 6, 0}, WorkingPrecision -> 15]
 *
 * The largest-magnitude exponent we can represent in IEEE single-precision binary format
 * is 2^-126 for small numbers and 2^127 for large ones. To avoid wrap-around problems, we set the
 * result to zero if the argument falls outside this range. For small numbers this is just fine, but
 * for large numbers you could be fancy and return the smallest/largest IEEE single-precision
 * number instead. That would take a few extra cycles and not really help, since something is
 * wrong if you are using single precision to work with numbers that cannot really be represented
 * in single precision.
 *
 * The accuracy is at least 23 bits.
 */
static __m256
gmx_mm256_exp2_ps(__m256 x)
{
    /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
    const __m256 arglimit = _mm256_set1_ps(126.0f);

    const __m128i expbase = _mm_set1_epi32(127);
    const __m256 CC6      = _mm256_set1_ps(1.535336188319500E-004);
    const __m256 CC5      = _mm256_set1_ps(1.339887440266574E-003);
    const __m256 CC4      = _mm256_set1_ps(9.618437357674640E-003);
    const __m256 CC3      = _mm256_set1_ps(5.550332471162809E-002);
    const __m256 CC2      = _mm256_set1_ps(2.402264791363012E-001);
    const __m256 CC1      = _mm256_set1_ps(6.931472028550421E-001);
    const __m256 CC0      = _mm256_set1_ps(1.0f);

    __m256  p0,p1;
    __m256  valuemask;
    __m256i iexppart;
    __m128i iexppart128a,iexppart128b;
    __m256  fexppart;
    __m256  intpart;
    __m256  x2;

    iexppart = _mm256_cvtps_epi32(x);
    intpart  = _mm256_round_ps(x,_MM_FROUND_TO_NEAREST_INT);

    iexppart128b = _mm256_extractf128_si256(iexppart,0x1);
    iexppart128a = _mm256_castsi256_si128(iexppart);

    iexppart128a = _mm_slli_epi32(_mm_add_epi32(iexppart128a,expbase),23);
    iexppart128b = _mm_slli_epi32(_mm_add_epi32(iexppart128b,expbase),23);

    iexppart  = _mm256_castsi128_si256(iexppart128a);
    iexppart  = _mm256_insertf128_si256(iexppart,iexppart128b,0x1);
    valuemask = _mm256_cmp_ps(arglimit,gmx_mm256_abs_ps(x),_CMP_GE_OQ);
    fexppart  = _mm256_and_ps(valuemask,_mm256_castsi256_ps(iexppart));

    x  = _mm256_sub_ps(x,intpart);
    x2 = _mm256_mul_ps(x,x);

    p0 = _mm256_mul_ps(CC6,x2);
    p1 = _mm256_mul_ps(CC5,x2);
    p0 = _mm256_add_ps(p0,CC4);
    p1 = _mm256_add_ps(p1,CC3);
    p0 = _mm256_mul_ps(p0,x2);
    p1 = _mm256_mul_ps(p1,x2);
    p0 = _mm256_add_ps(p0,CC2);
    p1 = _mm256_add_ps(p1,CC1);
    p0 = _mm256_mul_ps(p0,x2);
    p1 = _mm256_mul_ps(p1,x);
    p0 = _mm256_add_ps(p0,CC0);
    p0 = _mm256_add_ps(p0,p1);
    x  = _mm256_mul_ps(p0,fexppart);

    return x;
}
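
/* Usage sketch (illustrative only; the helper name is hypothetical and not part of
 * this header): a table-free pow() for strictly positive bases can be composed from
 * the routines above, since a^b = 2^(b*log2(a)) = 2^(b*ln(a)/ln(2)).
 */
static gmx_inline __m256
gmx_mm256_example_pow_ps(__m256 a, __m256 b)
{
    const __m256 invln2 = _mm256_set1_ps(1.44269504088896341f);  /* 1/ln(2) */
    __m256 y;

    y = _mm256_mul_ps(gmx_mm256_log_ps(a),invln2);   /* log2(a)       */
    y = _mm256_mul_ps(y,b);                          /* b*log2(a)     */

    return gmx_mm256_exp2_ps(y);                     /* 2^(b*log2(a)) */
}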

/* 2^x, 128 bit wide */
static __m128
gmx_mm_exp2_ps(__m128 x)
{
    /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
    const __m128 arglimit = _mm_set1_ps(126.0f);

    const __m128i expbase = _mm_set1_epi32(127);
    const __m128 CA6      = _mm_set1_ps(1.535336188319500E-004);
    const __m128 CA5      = _mm_set1_ps(1.339887440266574E-003);
    const __m128 CA4      = _mm_set1_ps(9.618437357674640E-003);
    const __m128 CA3      = _mm_set1_ps(5.550332471162809E-002);
    const __m128 CA2      = _mm_set1_ps(2.402264791363012E-001);
    const __m128 CA1      = _mm_set1_ps(6.931472028550421E-001);
    const __m128 CA0      = _mm_set1_ps(1.0f);

    __m128  valuemask;
    __m128i iexppart;
    __m128  fexppart;
    __m128  intpart;
    __m128  x2;
    __m128  p0,p1;

    iexppart  = _mm_cvtps_epi32(x);
    intpart   = _mm_round_ps(x,_MM_FROUND_TO_NEAREST_INT);
    iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart,expbase),23);
    valuemask = _mm_cmpge_ps(arglimit,gmx_mm_abs_ps(x));
    fexppart  = _mm_and_ps(valuemask,gmx_mm_castsi128_ps(iexppart));

    x  = _mm_sub_ps(x,intpart);
    x2 = _mm_mul_ps(x,x);

    p0 = _mm_mul_ps(CA6,x2);
    p1 = _mm_mul_ps(CA5,x2);
    p0 = _mm_add_ps(p0,CA4);
    p1 = _mm_add_ps(p1,CA3);
    p0 = _mm_mul_ps(p0,x2);
    p1 = _mm_mul_ps(p1,x2);
    p0 = _mm_add_ps(p0,CA2);
    p1 = _mm_add_ps(p1,CA1);
    p0 = _mm_mul_ps(p0,x2);
    p1 = _mm_mul_ps(p1,x);
    p0 = _mm_add_ps(p0,CA0);
    p0 = _mm_add_ps(p0,p1);
    x  = _mm_mul_ps(p0,fexppart);

    return x;
}

/* Exponential function, 256 bit wide. This could be calculated from 2^x as Exp(x)=2^(y),
 * where y=log2(e)*x, but there will then be a small rounding error since we lose some
 * precision due to the multiplication. This will then be magnified a lot by the exponential.
 *
 * Instead, we calculate the fractional part directly as a minimax approximation of
 * Exp(z) on [-0.5,0.5]. We use extended precision arithmetics to calculate the fraction
 * remaining after 2^y, which avoids the precision-loss.
 * The final result is correct to within 1 LSB over the entire argument range.
 */
static __m256
gmx_mm256_exp_ps(__m256 exparg)
{
    const __m256 argscale = _mm256_set1_ps(1.44269504088896341f);
    /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
    const __m256 arglimit = _mm256_set1_ps(126.0f);
    const __m128i expbase = _mm_set1_epi32(127);

    const __m256 invargscale0 = _mm256_set1_ps(0.693359375f);
    const __m256 invargscale1 = _mm256_set1_ps(-2.12194440e-4f);

    const __m256 CE5 = _mm256_set1_ps(1.9875691500e-4f);
    const __m256 CE4 = _mm256_set1_ps(1.3981999507e-3f);
    const __m256 CE3 = _mm256_set1_ps(8.3334519073e-3f);
    const __m256 CE2 = _mm256_set1_ps(4.1665795894e-2f);
    const __m256 CE1 = _mm256_set1_ps(1.6666665459e-1f);
    const __m256 CE0 = _mm256_set1_ps(5.0000001201e-1f);
    const __m256 one = _mm256_set1_ps(1.0f);

    __m256  exparg2,exp2arg;
    __m256  pE0,pE1;
    __m256  valuemask;
    __m256i iexppart;
    __m128i iexppart128a,iexppart128b;
    __m256  fexppart;
    __m256  intpart;

    exp2arg = _mm256_mul_ps(exparg,argscale);

    iexppart = _mm256_cvtps_epi32(exp2arg);
    intpart  = _mm256_round_ps(exp2arg,_MM_FROUND_TO_NEAREST_INT);

    iexppart128b = _mm256_extractf128_si256(iexppart,0x1);
    iexppart128a = _mm256_castsi256_si128(iexppart);

    iexppart128a = _mm_slli_epi32(_mm_add_epi32(iexppart128a,expbase),23);
    iexppart128b = _mm_slli_epi32(_mm_add_epi32(iexppart128b,expbase),23);

    iexppart  = _mm256_castsi128_si256(iexppart128a);
    iexppart  = _mm256_insertf128_si256(iexppart,iexppart128b,0x1);
    valuemask = _mm256_cmp_ps(arglimit,gmx_mm256_abs_ps(exp2arg),_CMP_GE_OQ);
    fexppart  = _mm256_and_ps(valuemask,_mm256_castsi256_ps(iexppart));

    /* Extended precision arithmetics */
    exparg = _mm256_sub_ps(exparg,_mm256_mul_ps(invargscale0,intpart));
    exparg = _mm256_sub_ps(exparg,_mm256_mul_ps(invargscale1,intpart));

    exparg2 = _mm256_mul_ps(exparg,exparg);

    pE1 = _mm256_mul_ps(CE5,exparg2);
    pE0 = _mm256_mul_ps(CE4,exparg2);
    pE1 = _mm256_add_ps(pE1,CE3);
    pE0 = _mm256_add_ps(pE0,CE2);
    pE1 = _mm256_mul_ps(pE1,exparg2);
    pE0 = _mm256_mul_ps(pE0,exparg2);
    pE1 = _mm256_add_ps(pE1,CE1);
    pE0 = _mm256_add_ps(pE0,CE0);
    pE1 = _mm256_mul_ps(pE1,exparg);
    pE0 = _mm256_add_ps(pE0,pE1);
    pE0 = _mm256_mul_ps(pE0,exparg2);
    exparg = _mm256_add_ps(exparg,one);
    exparg = _mm256_add_ps(exparg,pE0);

    exparg = _mm256_mul_ps(exparg,fexppart);

    return exparg;
}
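
/* Usage sketch (illustrative only; the helper name is hypothetical and not part of
 * this header): a vectorized Gaussian weight exp(-a*r^2) for eight squared distances
 * at once. The xor with the sign bit negates the argument, similar to the sign-bit
 * trick used in the erf()/erfc() routines further down.
 */
static gmx_inline __m256
gmx_mm256_example_gaussian_ps(__m256 r2, float a)
{
    const __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
    __m256 arg;

    arg = _mm256_mul_ps(_mm256_set1_ps(a),r2);   /* a*r^2       */
    arg = _mm256_xor_ps(arg,signbit);            /* -(a*r^2)    */

    return gmx_mm256_exp_ps(arg);                /* exp(-a*r^2) */
}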

/* exp(), 128 bit wide. */
static __m128
gmx_mm_exp_ps(__m128 x)
{
    const __m128 argscale = _mm_set1_ps(1.44269504088896341f);
    /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
    const __m128 arglimit = _mm_set1_ps(126.0f);
    const __m128i expbase = _mm_set1_epi32(127);

    const __m128 invargscale0 = _mm_set1_ps(0.693359375f);
    const __m128 invargscale1 = _mm_set1_ps(-2.12194440e-4f);

    const __m128 CC5 = _mm_set1_ps(1.9875691500e-4f);
    const __m128 CC4 = _mm_set1_ps(1.3981999507e-3f);
    const __m128 CC3 = _mm_set1_ps(8.3334519073e-3f);
    const __m128 CC2 = _mm_set1_ps(4.1665795894e-2f);
    const __m128 CC1 = _mm_set1_ps(1.6666665459e-1f);
    const __m128 CC0 = _mm_set1_ps(5.0000001201e-1f);
    const __m128 one = _mm_set1_ps(1.0f);

    __m128  y,x2;
    __m128  p0,p1;
    __m128  valuemask;
    __m128i iexppart;
    __m128  fexppart;
    __m128  intpart;

    y = _mm_mul_ps(x,argscale);

    iexppart = _mm_cvtps_epi32(y);
    intpart  = _mm_round_ps(y,_MM_FROUND_TO_NEAREST_INT);

    iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart,expbase),23);
    valuemask = _mm_cmpge_ps(arglimit,gmx_mm_abs_ps(y));
    fexppart  = _mm_and_ps(valuemask,gmx_mm_castsi128_ps(iexppart));

    /* Extended precision arithmetics */
    x = _mm_sub_ps(x,_mm_mul_ps(invargscale0,intpart));
    x = _mm_sub_ps(x,_mm_mul_ps(invargscale1,intpart));

    x2 = _mm_mul_ps(x,x);

    p1 = _mm_mul_ps(CC5,x2);
    p0 = _mm_mul_ps(CC4,x2);
    p1 = _mm_add_ps(p1,CC3);
    p0 = _mm_add_ps(p0,CC2);
    p1 = _mm_mul_ps(p1,x2);
    p0 = _mm_mul_ps(p0,x2);
    p1 = _mm_add_ps(p1,CC1);
    p0 = _mm_add_ps(p0,CC0);
    p1 = _mm_mul_ps(p1,x);
    p0 = _mm_add_ps(p0,p1);
    p0 = _mm_mul_ps(p0,x2);
    x  = _mm_add_ps(x,one);
    x  = _mm_add_ps(x,p0);

    x = _mm_mul_ps(x,fexppart);

    return x;
}

/* FULL precision erf(), 256-bit wide. Only errors in LSB */
static __m256
gmx_mm256_erf_ps(__m256 x)
{
    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
    const __m256 CA6 = _mm256_set1_ps(7.853861353153693e-5f);
    const __m256 CA5 = _mm256_set1_ps(-8.010193625184903e-4f);
    const __m256 CA4 = _mm256_set1_ps(5.188327685732524e-3f);
    const __m256 CA3 = _mm256_set1_ps(-2.685381193529856e-2f);
    const __m256 CA2 = _mm256_set1_ps(1.128358514861418e-1f);
    const __m256 CA1 = _mm256_set1_ps(-3.761262582423300e-1f);
    const __m256 CA0 = _mm256_set1_ps(1.128379165726710f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
    const __m256 CB9 = _mm256_set1_ps(-0.0018629930017603923f);
    const __m256 CB8 = _mm256_set1_ps(0.003909821287598495f);
    const __m256 CB7 = _mm256_set1_ps(-0.0052094582210355615f);
    const __m256 CB6 = _mm256_set1_ps(0.005685614362160572f);
    const __m256 CB5 = _mm256_set1_ps(-0.0025367682853477272f);
    const __m256 CB4 = _mm256_set1_ps(-0.010199799682318782f);
    const __m256 CB3 = _mm256_set1_ps(0.04369575504816542f);
    const __m256 CB2 = _mm256_set1_ps(-0.11884063474674492f);
    const __m256 CB1 = _mm256_set1_ps(0.2732120154030589f);
    const __m256 CB0 = _mm256_set1_ps(0.42758357702025784f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
    const __m256 CC10 = _mm256_set1_ps(-0.0445555913112064f);
    const __m256 CC9  = _mm256_set1_ps(0.21376355144663348f);
    const __m256 CC8  = _mm256_set1_ps(-0.3473187200259257f);
    const __m256 CC7  = _mm256_set1_ps(0.016690861551248114f);
    const __m256 CC6  = _mm256_set1_ps(0.7560973182491192f);
    const __m256 CC5  = _mm256_set1_ps(-1.2137903600145787f);
    const __m256 CC4  = _mm256_set1_ps(0.8411872321232948f);
    const __m256 CC3  = _mm256_set1_ps(-0.08670413896296343f);
    const __m256 CC2  = _mm256_set1_ps(-0.27124782687240334f);
    const __m256 CC1  = _mm256_set1_ps(-0.0007502488047806069f);
    const __m256 CC0  = _mm256_set1_ps(0.5642114853803148f);

    /* Coefficients for expansion of exp(x) in [0,0.1] */
    /* CD0 and CD1 are both 1.0, so no need to declare them separately */
    const __m256 CD2 = _mm256_set1_ps(0.5000066608081202f);
    const __m256 CD3 = _mm256_set1_ps(0.1664795422874624f);
    const __m256 CD4 = _mm256_set1_ps(0.04379839977652482f);

    const __m256 sieve   = _mm256_castsi256_ps( _mm256_set1_epi32(0xfffff000) );
    const __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
    const __m256 one     = _mm256_set1_ps(1.0f);
    const __m256 two     = _mm256_set1_ps(2.0f);

    __m256 x2,x4,y;
    __m256 z,q,t,t2,w,w2;
    __m256 pA0,pA1,pB0,pB1,pC0,pC1;
    __m256 expmx2,corr;
    __m256 res_erf,res_erfc,res;
    __m256 mask;

    /* Calculate erf() */
    x2 = _mm256_mul_ps(x,x);
    x4 = _mm256_mul_ps(x2,x2);

    pA0 = _mm256_mul_ps(CA6,x4);
    pA1 = _mm256_mul_ps(CA5,x4);
    pA0 = _mm256_add_ps(pA0,CA4);
    pA1 = _mm256_add_ps(pA1,CA3);
    pA0 = _mm256_mul_ps(pA0,x4);
    pA1 = _mm256_mul_ps(pA1,x4);
    pA0 = _mm256_add_ps(pA0,CA2);
    pA1 = _mm256_add_ps(pA1,CA1);
    pA0 = _mm256_mul_ps(pA0,x4);
    pA1 = _mm256_mul_ps(pA1,x2);
    pA0 = _mm256_add_ps(pA0,pA1);
    pA0 = _mm256_add_ps(pA0,CA0);

    res_erf = _mm256_mul_ps(x,pA0);

    /* Calculate erfc */

    y  = gmx_mm256_abs_ps(x);
    t  = gmx_mm256_inv_ps(y);
    w  = _mm256_sub_ps(t,one);
    t2 = _mm256_mul_ps(t,t);
    w2 = _mm256_mul_ps(w,w);
    /*
     * We cannot simply calculate exp(-x2) directly in single precision, since
     * that will lose a couple of bits of precision due to the multiplication.
     * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
     * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
     *
     * The only drawback with this is that it requires TWO separate exponential
     * evaluations, which would be horrible performance-wise. However, the argument
     * for the second exp() call is always small, so there we simply use a
     * low-order minimax expansion on [0,0.1].
     */

    z = _mm256_and_ps(y,sieve);
    q = _mm256_mul_ps( _mm256_sub_ps(z,y) , _mm256_add_ps(z,y) );

    corr = _mm256_mul_ps(CD4,q);
    corr = _mm256_add_ps(corr,CD3);
    corr = _mm256_mul_ps(corr,q);
    corr = _mm256_add_ps(corr,CD2);
    corr = _mm256_mul_ps(corr,q);
    corr = _mm256_add_ps(corr,one);
    corr = _mm256_mul_ps(corr,q);
    corr = _mm256_add_ps(corr,one);

    expmx2 = gmx_mm256_exp_ps( _mm256_or_ps( signbit , _mm256_mul_ps(z,z) ) );
    expmx2 = _mm256_mul_ps(expmx2,corr);

    pB1 = _mm256_mul_ps(CB9,w2);
    pB0 = _mm256_mul_ps(CB8,w2);
    pB1 = _mm256_add_ps(pB1,CB7);
    pB0 = _mm256_add_ps(pB0,CB6);
    pB1 = _mm256_mul_ps(pB1,w2);
    pB0 = _mm256_mul_ps(pB0,w2);
    pB1 = _mm256_add_ps(pB1,CB5);
    pB0 = _mm256_add_ps(pB0,CB4);
    pB1 = _mm256_mul_ps(pB1,w2);
    pB0 = _mm256_mul_ps(pB0,w2);
    pB1 = _mm256_add_ps(pB1,CB3);
    pB0 = _mm256_add_ps(pB0,CB2);
    pB1 = _mm256_mul_ps(pB1,w2);
    pB0 = _mm256_mul_ps(pB0,w2);
    pB1 = _mm256_add_ps(pB1,CB1);
    pB1 = _mm256_mul_ps(pB1,w);
    pB0 = _mm256_add_ps(pB0,pB1);
    pB0 = _mm256_add_ps(pB0,CB0);

    pC0 = _mm256_mul_ps(CC10,t2);
    pC1 = _mm256_mul_ps(CC9,t2);
    pC0 = _mm256_add_ps(pC0,CC8);
    pC1 = _mm256_add_ps(pC1,CC7);
    pC0 = _mm256_mul_ps(pC0,t2);
    pC1 = _mm256_mul_ps(pC1,t2);
    pC0 = _mm256_add_ps(pC0,CC6);
    pC1 = _mm256_add_ps(pC1,CC5);
    pC0 = _mm256_mul_ps(pC0,t2);
    pC1 = _mm256_mul_ps(pC1,t2);
    pC0 = _mm256_add_ps(pC0,CC4);
    pC1 = _mm256_add_ps(pC1,CC3);
    pC0 = _mm256_mul_ps(pC0,t2);
    pC1 = _mm256_mul_ps(pC1,t2);
    pC0 = _mm256_add_ps(pC0,CC2);
    pC1 = _mm256_add_ps(pC1,CC1);
    pC0 = _mm256_mul_ps(pC0,t2);
    pC1 = _mm256_mul_ps(pC1,t);
    pC0 = _mm256_add_ps(pC0,pC1);
    pC0 = _mm256_add_ps(pC0,CC0);
    pC0 = _mm256_mul_ps(pC0,t);

    /* SELECT pB0 or pC0 for erfc() */
    mask     = _mm256_cmp_ps(two,y,_CMP_LT_OQ);
    res_erfc = _mm256_blendv_ps(pB0,pC0,mask);
    res_erfc = _mm256_mul_ps(res_erfc,expmx2);

    /* erfc(x<0) = 2-erfc(|x|) */
    mask     = _mm256_cmp_ps(x,_mm256_setzero_ps(),_CMP_LT_OQ);
    res_erfc = _mm256_blendv_ps(res_erfc,_mm256_sub_ps(two,res_erfc),mask);

    /* Select erf() or erfc() */
    mask = _mm256_cmp_ps(y,_mm256_set1_ps(0.75f),_CMP_LT_OQ);
    res  = _mm256_blendv_ps(_mm256_sub_ps(one,res_erfc),res_erf,mask);

    return res;
}

/* erf(), 128 bit wide */
static __m128
gmx_mm_erf_ps(__m128 x)
{
    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
    const __m128 CA6 = _mm_set1_ps(7.853861353153693e-5f);
    const __m128 CA5 = _mm_set1_ps(-8.010193625184903e-4f);
    const __m128 CA4 = _mm_set1_ps(5.188327685732524e-3f);
    const __m128 CA3 = _mm_set1_ps(-2.685381193529856e-2f);
    const __m128 CA2 = _mm_set1_ps(1.128358514861418e-1f);
    const __m128 CA1 = _mm_set1_ps(-3.761262582423300e-1f);
    const __m128 CA0 = _mm_set1_ps(1.128379165726710f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
    const __m128 CB9 = _mm_set1_ps(-0.0018629930017603923f);
    const __m128 CB8 = _mm_set1_ps(0.003909821287598495f);
    const __m128 CB7 = _mm_set1_ps(-0.0052094582210355615f);
    const __m128 CB6 = _mm_set1_ps(0.005685614362160572f);
    const __m128 CB5 = _mm_set1_ps(-0.0025367682853477272f);
    const __m128 CB4 = _mm_set1_ps(-0.010199799682318782f);
    const __m128 CB3 = _mm_set1_ps(0.04369575504816542f);
    const __m128 CB2 = _mm_set1_ps(-0.11884063474674492f);
    const __m128 CB1 = _mm_set1_ps(0.2732120154030589f);
    const __m128 CB0 = _mm_set1_ps(0.42758357702025784f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
    const __m128 CC10 = _mm_set1_ps(-0.0445555913112064f);
    const __m128 CC9  = _mm_set1_ps(0.21376355144663348f);
    const __m128 CC8  = _mm_set1_ps(-0.3473187200259257f);
    const __m128 CC7  = _mm_set1_ps(0.016690861551248114f);
    const __m128 CC6  = _mm_set1_ps(0.7560973182491192f);
    const __m128 CC5  = _mm_set1_ps(-1.2137903600145787f);
    const __m128 CC4  = _mm_set1_ps(0.8411872321232948f);
    const __m128 CC3  = _mm_set1_ps(-0.08670413896296343f);
    const __m128 CC2  = _mm_set1_ps(-0.27124782687240334f);
    const __m128 CC1  = _mm_set1_ps(-0.0007502488047806069f);
    const __m128 CC0  = _mm_set1_ps(0.5642114853803148f);

    /* Coefficients for expansion of exp(x) in [0,0.1] */
    /* CD0 and CD1 are both 1.0, so no need to declare them separately */
    const __m128 CD2 = _mm_set1_ps(0.5000066608081202f);
    const __m128 CD3 = _mm_set1_ps(0.1664795422874624f);
    const __m128 CD4 = _mm_set1_ps(0.04379839977652482f);

    const __m128 sieve   = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
    const __m128 signbit = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
    const __m128 one     = _mm_set1_ps(1.0f);
    const __m128 two     = _mm_set1_ps(2.0f);

    __m128 x2,x4,y;
    __m128 z,q,t,t2,w,w2;
    __m128 pA0,pA1,pB0,pB1,pC0,pC1;
    __m128 expmx2,corr;
    __m128 res_erf,res_erfc,res;
    __m128 mask;

    /* Calculate erf() */
    x2 = _mm_mul_ps(x,x);
    x4 = _mm_mul_ps(x2,x2);

    pA0 = _mm_mul_ps(CA6,x4);
    pA1 = _mm_mul_ps(CA5,x4);
    pA0 = _mm_add_ps(pA0,CA4);
    pA1 = _mm_add_ps(pA1,CA3);
    pA0 = _mm_mul_ps(pA0,x4);
    pA1 = _mm_mul_ps(pA1,x4);
    pA0 = _mm_add_ps(pA0,CA2);
    pA1 = _mm_add_ps(pA1,CA1);
    pA0 = _mm_mul_ps(pA0,x4);
    pA1 = _mm_mul_ps(pA1,x2);
    pA0 = _mm_add_ps(pA0,pA1);
    pA0 = _mm_add_ps(pA0,CA0);

    res_erf = _mm_mul_ps(x,pA0);

    /* Calculate erfc */

    y  = gmx_mm_abs_ps(x);
    t  = gmx_mm_inv_ps(y);
    w  = _mm_sub_ps(t,one);
    t2 = _mm_mul_ps(t,t);
    w2 = _mm_mul_ps(w,w);
    /*
     * We cannot simply calculate exp(-x2) directly in single precision, since
     * that will lose a couple of bits of precision due to the multiplication.
     * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
     * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
     *
     * The only drawback with this is that it requires TWO separate exponential
     * evaluations, which would be horrible performance-wise. However, the argument
     * for the second exp() call is always small, so there we simply use a
     * low-order minimax expansion on [0,0.1].
     */

    z = _mm_and_ps(y,sieve);
    q = _mm_mul_ps( _mm_sub_ps(z,y) , _mm_add_ps(z,y) );

    corr = _mm_mul_ps(CD4,q);
    corr = _mm_add_ps(corr,CD3);
    corr = _mm_mul_ps(corr,q);
    corr = _mm_add_ps(corr,CD2);
    corr = _mm_mul_ps(corr,q);
    corr = _mm_add_ps(corr,one);
    corr = _mm_mul_ps(corr,q);
    corr = _mm_add_ps(corr,one);

    expmx2 = gmx_mm_exp_ps( _mm_or_ps( signbit , _mm_mul_ps(z,z) ) );
    expmx2 = _mm_mul_ps(expmx2,corr);

    pB1 = _mm_mul_ps(CB9,w2);
    pB0 = _mm_mul_ps(CB8,w2);
    pB1 = _mm_add_ps(pB1,CB7);
    pB0 = _mm_add_ps(pB0,CB6);
    pB1 = _mm_mul_ps(pB1,w2);
    pB0 = _mm_mul_ps(pB0,w2);
    pB1 = _mm_add_ps(pB1,CB5);
    pB0 = _mm_add_ps(pB0,CB4);
    pB1 = _mm_mul_ps(pB1,w2);
    pB0 = _mm_mul_ps(pB0,w2);
    pB1 = _mm_add_ps(pB1,CB3);
    pB0 = _mm_add_ps(pB0,CB2);
    pB1 = _mm_mul_ps(pB1,w2);
    pB0 = _mm_mul_ps(pB0,w2);
    pB1 = _mm_add_ps(pB1,CB1);
    pB1 = _mm_mul_ps(pB1,w);
    pB0 = _mm_add_ps(pB0,pB1);
    pB0 = _mm_add_ps(pB0,CB0);

    pC0 = _mm_mul_ps(CC10,t2);
    pC1 = _mm_mul_ps(CC9,t2);
    pC0 = _mm_add_ps(pC0,CC8);
    pC1 = _mm_add_ps(pC1,CC7);
    pC0 = _mm_mul_ps(pC0,t2);
    pC1 = _mm_mul_ps(pC1,t2);
    pC0 = _mm_add_ps(pC0,CC6);
    pC1 = _mm_add_ps(pC1,CC5);
    pC0 = _mm_mul_ps(pC0,t2);
    pC1 = _mm_mul_ps(pC1,t2);
    pC0 = _mm_add_ps(pC0,CC4);
    pC1 = _mm_add_ps(pC1,CC3);
    pC0 = _mm_mul_ps(pC0,t2);
    pC1 = _mm_mul_ps(pC1,t2);
    pC0 = _mm_add_ps(pC0,CC2);
    pC1 = _mm_add_ps(pC1,CC1);
    pC0 = _mm_mul_ps(pC0,t2);
    pC1 = _mm_mul_ps(pC1,t);
    pC0 = _mm_add_ps(pC0,pC1);
    pC0 = _mm_add_ps(pC0,CC0);
    pC0 = _mm_mul_ps(pC0,t);

    /* SELECT pB0 or pC0 for erfc() */
    mask     = _mm_cmplt_ps(two,y);
    res_erfc = _mm_blendv_ps(pB0,pC0,mask);
    res_erfc = _mm_mul_ps(res_erfc,expmx2);

    /* erfc(x<0) = 2-erfc(|x|) */
    mask     = _mm_cmplt_ps(x,_mm_setzero_ps());
    res_erfc = _mm_blendv_ps(res_erfc,_mm_sub_ps(two,res_erfc),mask);

    /* Select erf() or erfc() */
    mask = _mm_cmplt_ps(y,_mm_set1_ps(0.75f));
    res  = _mm_blendv_ps(_mm_sub_ps(one,res_erfc),res_erf,mask);

    return res;
}

/* FULL precision erfc(), 256 bit wide. Only errors in LSB */
static __m256
gmx_mm256_erfc_ps(__m256 x)
{
    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
    const __m256 CA6 = _mm256_set1_ps(7.853861353153693e-5f);
    const __m256 CA5 = _mm256_set1_ps(-8.010193625184903e-4f);
    const __m256 CA4 = _mm256_set1_ps(5.188327685732524e-3f);
    const __m256 CA3 = _mm256_set1_ps(-2.685381193529856e-2f);
    const __m256 CA2 = _mm256_set1_ps(1.128358514861418e-1f);
    const __m256 CA1 = _mm256_set1_ps(-3.761262582423300e-1f);
    const __m256 CA0 = _mm256_set1_ps(1.128379165726710f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
    const __m256 CB9 = _mm256_set1_ps(-0.0018629930017603923f);
    const __m256 CB8 = _mm256_set1_ps(0.003909821287598495f);
    const __m256 CB7 = _mm256_set1_ps(-0.0052094582210355615f);
    const __m256 CB6 = _mm256_set1_ps(0.005685614362160572f);
    const __m256 CB5 = _mm256_set1_ps(-0.0025367682853477272f);
    const __m256 CB4 = _mm256_set1_ps(-0.010199799682318782f);
    const __m256 CB3 = _mm256_set1_ps(0.04369575504816542f);
    const __m256 CB2 = _mm256_set1_ps(-0.11884063474674492f);
    const __m256 CB1 = _mm256_set1_ps(0.2732120154030589f);
    const __m256 CB0 = _mm256_set1_ps(0.42758357702025784f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
    const __m256 CC10 = _mm256_set1_ps(-0.0445555913112064f);
    const __m256 CC9  = _mm256_set1_ps(0.21376355144663348f);
    const __m256 CC8  = _mm256_set1_ps(-0.3473187200259257f);
    const __m256 CC7  = _mm256_set1_ps(0.016690861551248114f);
    const __m256 CC6  = _mm256_set1_ps(0.7560973182491192f);
    const __m256 CC5  = _mm256_set1_ps(-1.2137903600145787f);
    const __m256 CC4  = _mm256_set1_ps(0.8411872321232948f);
    const __m256 CC3  = _mm256_set1_ps(-0.08670413896296343f);
    const __m256 CC2  = _mm256_set1_ps(-0.27124782687240334f);
    const __m256 CC1  = _mm256_set1_ps(-0.0007502488047806069f);
    const __m256 CC0  = _mm256_set1_ps(0.5642114853803148f);

    /* Coefficients for expansion of exp(x) in [0,0.1] */
    /* CD0 and CD1 are both 1.0, so no need to declare them separately */
    const __m256 CD2 = _mm256_set1_ps(0.5000066608081202f);
    const __m256 CD3 = _mm256_set1_ps(0.1664795422874624f);
    const __m256 CD4 = _mm256_set1_ps(0.04379839977652482f);

    const __m256 sieve   = _mm256_castsi256_ps( _mm256_set1_epi32(0xfffff000) );
    const __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
    const __m256 one     = _mm256_set1_ps(1.0f);
    const __m256 two     = _mm256_set1_ps(2.0f);

    __m256 x2,x4,y;
    __m256 z,q,t,t2,w,w2;
    __m256 pA0,pA1,pB0,pB1,pC0,pC1;
    __m256 expmx2,corr;
    __m256 res_erf,res_erfc,res;
    __m256 mask;

    /* Calculate erf() */
    x2 = _mm256_mul_ps(x,x);
    x4 = _mm256_mul_ps(x2,x2);

    pA0 = _mm256_mul_ps(CA6,x4);
    pA1 = _mm256_mul_ps(CA5,x4);
    pA0 = _mm256_add_ps(pA0,CA4);
    pA1 = _mm256_add_ps(pA1,CA3);
    pA0 = _mm256_mul_ps(pA0,x4);
    pA1 = _mm256_mul_ps(pA1,x4);
    pA0 = _mm256_add_ps(pA0,CA2);
    pA1 = _mm256_add_ps(pA1,CA1);
    pA0 = _mm256_mul_ps(pA0,x4);
    pA1 = _mm256_mul_ps(pA1,x2);
    pA0 = _mm256_add_ps(pA0,pA1);
    pA0 = _mm256_add_ps(pA0,CA0);

    res_erf = _mm256_mul_ps(x,pA0);

    /* Calculate erfc */
    y  = gmx_mm256_abs_ps(x);
    t  = gmx_mm256_inv_ps(y);
    w  = _mm256_sub_ps(t,one);
    t2 = _mm256_mul_ps(t,t);
    w2 = _mm256_mul_ps(w,w);
    /*
     * We cannot simply calculate exp(-x2) directly in single precision, since
     * that will lose a couple of bits of precision due to the multiplication.
     * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
     * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
     *
     * The only drawback with this is that it requires TWO separate exponential
     * evaluations, which would be horrible performance-wise. However, the argument
     * for the second exp() call is always small, so there we simply use a
     * low-order minimax expansion on [0,0.1].
     */

    z = _mm256_and_ps(y,sieve);
    q = _mm256_mul_ps( _mm256_sub_ps(z,y) , _mm256_add_ps(z,y) );

    corr = _mm256_mul_ps(CD4,q);
    corr = _mm256_add_ps(corr,CD3);
    corr = _mm256_mul_ps(corr,q);
    corr = _mm256_add_ps(corr,CD2);
    corr = _mm256_mul_ps(corr,q);
    corr = _mm256_add_ps(corr,one);
    corr = _mm256_mul_ps(corr,q);
    corr = _mm256_add_ps(corr,one);

    expmx2 = gmx_mm256_exp_ps( _mm256_or_ps( signbit , _mm256_mul_ps(z,z) ) );
    expmx2 = _mm256_mul_ps(expmx2,corr);

    pB1 = _mm256_mul_ps(CB9,w2);
    pB0 = _mm256_mul_ps(CB8,w2);
    pB1 = _mm256_add_ps(pB1,CB7);
    pB0 = _mm256_add_ps(pB0,CB6);
    pB1 = _mm256_mul_ps(pB1,w2);
    pB0 = _mm256_mul_ps(pB0,w2);
    pB1 = _mm256_add_ps(pB1,CB5);
    pB0 = _mm256_add_ps(pB0,CB4);
    pB1 = _mm256_mul_ps(pB1,w2);
    pB0 = _mm256_mul_ps(pB0,w2);
    pB1 = _mm256_add_ps(pB1,CB3);
    pB0 = _mm256_add_ps(pB0,CB2);
    pB1 = _mm256_mul_ps(pB1,w2);
    pB0 = _mm256_mul_ps(pB0,w2);
    pB1 = _mm256_add_ps(pB1,CB1);
    pB1 = _mm256_mul_ps(pB1,w);
    pB0 = _mm256_add_ps(pB0,pB1);
    pB0 = _mm256_add_ps(pB0,CB0);

    pC0 = _mm256_mul_ps(CC10,t2);
    pC1 = _mm256_mul_ps(CC9,t2);
    pC0 = _mm256_add_ps(pC0,CC8);
    pC1 = _mm256_add_ps(pC1,CC7);
    pC0 = _mm256_mul_ps(pC0,t2);
    pC1 = _mm256_mul_ps(pC1,t2);
    pC0 = _mm256_add_ps(pC0,CC6);
    pC1 = _mm256_add_ps(pC1,CC5);
    pC0 = _mm256_mul_ps(pC0,t2);
    pC1 = _mm256_mul_ps(pC1,t2);
    pC0 = _mm256_add_ps(pC0,CC4);
    pC1 = _mm256_add_ps(pC1,CC3);
    pC0 = _mm256_mul_ps(pC0,t2);
    pC1 = _mm256_mul_ps(pC1,t2);
    pC0 = _mm256_add_ps(pC0,CC2);
    pC1 = _mm256_add_ps(pC1,CC1);
    pC0 = _mm256_mul_ps(pC0,t2);
    pC1 = _mm256_mul_ps(pC1,t);
    pC0 = _mm256_add_ps(pC0,pC1);
    pC0 = _mm256_add_ps(pC0,CC0);
    pC0 = _mm256_mul_ps(pC0,t);

    /* SELECT pB0 or pC0 for erfc() */
    mask     = _mm256_cmp_ps(two,y,_CMP_LT_OQ);
    res_erfc = _mm256_blendv_ps(pB0,pC0,mask);
    res_erfc = _mm256_mul_ps(res_erfc,expmx2);

    /* erfc(x<0) = 2-erfc(|x|) */
    mask     = _mm256_cmp_ps(x,_mm256_setzero_ps(),_CMP_LT_OQ);
    res_erfc = _mm256_blendv_ps(res_erfc,_mm256_sub_ps(two,res_erfc),mask);

    /* Select erf() or erfc() */
    mask = _mm256_cmp_ps(y,_mm256_set1_ps(0.75f),_CMP_LT_OQ);
    res  = _mm256_blendv_ps(res_erfc,_mm256_sub_ps(one,res_erf),mask);

    return res;
}
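
/* Usage sketch (illustrative only; the helper name is hypothetical and not part of
 * this header): the classic real-space Ewald pair potential qq*erfc(beta*r)/r,
 * evaluated table-free from the routines above for eight pairs at once.
 */
static gmx_inline __m256
gmx_mm256_example_ewald_pot_ps(__m256 rsq, float beta, __m256 qq)
{
    const __m256 vbeta = _mm256_set1_ps(beta);
    __m256 rinv,r,verfc;

    rinv  = gmx_mm256_invsqrt_ps(rsq);                   /* 1/r                */
    r     = _mm256_mul_ps(rsq,rinv);                     /* r = r^2*(1/r)      */
    verfc = gmx_mm256_erfc_ps(_mm256_mul_ps(vbeta,r));   /* erfc(beta*r)       */

    return _mm256_mul_ps(qq,_mm256_mul_ps(verfc,rinv));  /* qq*erfc(beta*r)/r  */
}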

/* erfc(), 128 bit wide */
static __m128
gmx_mm_erfc_ps(__m128 x)
{
    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
    const __m128 CA6 = _mm_set1_ps(7.853861353153693e-5f);
    const __m128 CA5 = _mm_set1_ps(-8.010193625184903e-4f);
    const __m128 CA4 = _mm_set1_ps(5.188327685732524e-3f);
    const __m128 CA3 = _mm_set1_ps(-2.685381193529856e-2f);
    const __m128 CA2 = _mm_set1_ps(1.128358514861418e-1f);
    const __m128 CA1 = _mm_set1_ps(-3.761262582423300e-1f);
    const __m128 CA0 = _mm_set1_ps(1.128379165726710f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
    const __m128 CB9 = _mm_set1_ps(-0.0018629930017603923f);
    const __m128 CB8 = _mm_set1_ps(0.003909821287598495f);
    const __m128 CB7 = _mm_set1_ps(-0.0052094582210355615f);
    const __m128 CB6 = _mm_set1_ps(0.005685614362160572f);
    const __m128 CB5 = _mm_set1_ps(-0.0025367682853477272f);
    const __m128 CB4 = _mm_set1_ps(-0.010199799682318782f);
    const __m128 CB3 = _mm_set1_ps(0.04369575504816542f);
    const __m128 CB2 = _mm_set1_ps(-0.11884063474674492f);
    const __m128 CB1 = _mm_set1_ps(0.2732120154030589f);
    const __m128 CB0 = _mm_set1_ps(0.42758357702025784f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
    const __m128 CC10 = _mm_set1_ps(-0.0445555913112064f);
    const __m128 CC9  = _mm_set1_ps(0.21376355144663348f);
    const __m128 CC8  = _mm_set1_ps(-0.3473187200259257f);
    const __m128 CC7  = _mm_set1_ps(0.016690861551248114f);
    const __m128 CC6  = _mm_set1_ps(0.7560973182491192f);
    const __m128 CC5  = _mm_set1_ps(-1.2137903600145787f);
    const __m128 CC4  = _mm_set1_ps(0.8411872321232948f);
    const __m128 CC3  = _mm_set1_ps(-0.08670413896296343f);
    const __m128 CC2  = _mm_set1_ps(-0.27124782687240334f);
    const __m128 CC1  = _mm_set1_ps(-0.0007502488047806069f);
    const __m128 CC0  = _mm_set1_ps(0.5642114853803148f);

    /* Coefficients for expansion of exp(x) in [0,0.1] */
    /* CD0 and CD1 are both 1.0, so no need to declare them separately */
    const __m128 CD2 = _mm_set1_ps(0.5000066608081202f);
    const __m128 CD3 = _mm_set1_ps(0.1664795422874624f);
    const __m128 CD4 = _mm_set1_ps(0.04379839977652482f);

    const __m128 sieve   = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
    const __m128 signbit = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
    const __m128 one     = _mm_set1_ps(1.0f);
    const __m128 two     = _mm_set1_ps(2.0f);

    __m128 x2,x4,y;
    __m128 z,q,t,t2,w,w2;
    __m128 pA0,pA1,pB0,pB1,pC0,pC1;
    __m128 expmx2,corr;
    __m128 res_erf,res_erfc,res;
    __m128 mask;

    /* Calculate erf() */
    x2 = _mm_mul_ps(x,x);
    x4 = _mm_mul_ps(x2,x2);

    pA0 = _mm_mul_ps(CA6,x4);
    pA1 = _mm_mul_ps(CA5,x4);
    pA0 = _mm_add_ps(pA0,CA4);
    pA1 = _mm_add_ps(pA1,CA3);
    pA0 = _mm_mul_ps(pA0,x4);
    pA1 = _mm_mul_ps(pA1,x4);
    pA0 = _mm_add_ps(pA0,CA2);
    pA1 = _mm_add_ps(pA1,CA1);
    pA0 = _mm_mul_ps(pA0,x4);
    pA1 = _mm_mul_ps(pA1,x2);
    pA0 = _mm_add_ps(pA0,pA1);
    pA0 = _mm_add_ps(pA0,CA0);

    res_erf = _mm_mul_ps(x,pA0);

    /* Calculate erfc */
    y  = gmx_mm_abs_ps(x);
    t  = gmx_mm_inv_ps(y);
    w  = _mm_sub_ps(t,one);
    t2 = _mm_mul_ps(t,t);
    w2 = _mm_mul_ps(w,w);
    /*
     * We cannot simply calculate exp(-x2) directly in single precision, since
     * that will lose a couple of bits of precision due to the multiplication.
     * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
     * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
     *
     * The only drawback with this is that it requires TWO separate exponential
     * evaluations, which would be horrible performance-wise. However, the argument
     * for the second exp() call is always small, so there we simply use a
     * low-order minimax expansion on [0,0.1].
     */

    z = _mm_and_ps(y,sieve);
    q = _mm_mul_ps( _mm_sub_ps(z,y) , _mm_add_ps(z,y) );

    corr = _mm_mul_ps(CD4,q);
    corr = _mm_add_ps(corr,CD3);
    corr = _mm_mul_ps(corr,q);
    corr = _mm_add_ps(corr,CD2);
    corr = _mm_mul_ps(corr,q);
    corr = _mm_add_ps(corr,one);
    corr = _mm_mul_ps(corr,q);
    corr = _mm_add_ps(corr,one);

    expmx2 = gmx_mm_exp_ps( _mm_or_ps( signbit , _mm_mul_ps(z,z) ) );
    expmx2 = _mm_mul_ps(expmx2,corr);

    pB1 = _mm_mul_ps(CB9,w2);
    pB0 = _mm_mul_ps(CB8,w2);
    pB1 = _mm_add_ps(pB1,CB7);
    pB0 = _mm_add_ps(pB0,CB6);
    pB1 = _mm_mul_ps(pB1,w2);
    pB0 = _mm_mul_ps(pB0,w2);
    pB1 = _mm_add_ps(pB1,CB5);
    pB0 = _mm_add_ps(pB0,CB4);
    pB1 = _mm_mul_ps(pB1,w2);
    pB0 = _mm_mul_ps(pB0,w2);
    pB1 = _mm_add_ps(pB1,CB3);
    pB0 = _mm_add_ps(pB0,CB2);
    pB1 = _mm_mul_ps(pB1,w2);
    pB0 = _mm_mul_ps(pB0,w2);
    pB1 = _mm_add_ps(pB1,CB1);
    pB1 = _mm_mul_ps(pB1,w);
    pB0 = _mm_add_ps(pB0,pB1);
    pB0 = _mm_add_ps(pB0,CB0);

    pC0 = _mm_mul_ps(CC10,t2);
    pC1 = _mm_mul_ps(CC9,t2);
    pC0 = _mm_add_ps(pC0,CC8);
    pC1 = _mm_add_ps(pC1,CC7);
    pC0 = _mm_mul_ps(pC0,t2);
    pC1 = _mm_mul_ps(pC1,t2);
    pC0 = _mm_add_ps(pC0,CC6);
    pC1 = _mm_add_ps(pC1,CC5);
    pC0 = _mm_mul_ps(pC0,t2);
    pC1 = _mm_mul_ps(pC1,t2);
    pC0 = _mm_add_ps(pC0,CC4);
    pC1 = _mm_add_ps(pC1,CC3);
    pC0 = _mm_mul_ps(pC0,t2);
    pC1 = _mm_mul_ps(pC1,t2);
    pC0 = _mm_add_ps(pC0,CC2);
    pC1 = _mm_add_ps(pC1,CC1);
    pC0 = _mm_mul_ps(pC0,t2);
    pC1 = _mm_mul_ps(pC1,t);
    pC0 = _mm_add_ps(pC0,pC1);
    pC0 = _mm_add_ps(pC0,CC0);
    pC0 = _mm_mul_ps(pC0,t);

    /* SELECT pB0 or pC0 for erfc() */
    mask     = _mm_cmplt_ps(two,y);
    res_erfc = _mm_blendv_ps(pB0,pC0,mask);
    res_erfc = _mm_mul_ps(res_erfc,expmx2);

    /* erfc(x<0) = 2-erfc(|x|) */
    mask     = _mm_cmplt_ps(x,_mm_setzero_ps());
    res_erfc = _mm_blendv_ps(res_erfc,_mm_sub_ps(two,res_erfc),mask);

    /* Select erf() or erfc() */
    mask = _mm_cmplt_ps(y,_mm_set1_ps(0.75f));
    res  = _mm_blendv_ps(res_erfc,_mm_sub_ps(one,res_erf),mask);

    return res;
}

/* Calculate the force correction due to PME analytically.
 *
 * This routine is meant to enable analytical evaluation of the
 * direct-space PME electrostatic force to avoid tables.
 *
 * The direct-space potential should be Erfc(beta*r)/r, but there
 * are some problems evaluating that:
 *
 * First, the error function is difficult (read: expensive) to
 * approximate accurately for intermediate to large arguments, and
 * this happens already in ranges of beta*r that occur in simulations.
 * Second, we now try to avoid calculating potentials in Gromacs but
 * use forces directly.
 *
 * We can simplify things slightly by noting that the PME part is really
 * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
 *
 * V= 1/r - Erf(beta*r)/r
 *
 * The first term we already have from the inverse square root, so
 * that we can leave out of this routine.
 *
 * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
 * the argument beta*r will be in the range 0.15 to ~4. Use your
 * favorite plotting program to realize how well-behaved Erf(z)/z is
 * in this range!
 *
 * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
 * However, it turns out it is more efficient to approximate f(z)/z and
 * then only use even powers. This is another minor optimization, since
 * we actually WANT f(z)/z, because it is going to be multiplied by
 * the vector between the two atoms to get the vectorial force. The
 * fastest flops are the ones we can avoid calculating!
 *
 * So, here's how it should be used:
 *
 * 1. Calculate r^2.
 * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
 * 3. Evaluate this routine with z^2 as the argument.
 * 4. The return value is the expression:
 *
 *       2*exp(-z^2)     erf(z)
 *       ------------ - --------
 *       sqrt(Pi)*z^2      z^3
 *
 * 5. Multiply the entire expression by beta^3. This will get you
 *
 *       beta^3*2*exp(-z^2)     beta^3*erf(z)
 *       ------------------  -  -------------
 *          sqrt(Pi)*z^2             z^3
 *
 * or, switching back to r (z=r*beta):
 *
 *       2*beta*exp(-r^2*beta^2)   erf(r*beta)
 *       ----------------------- - -----------
 *            sqrt(Pi)*r^2              r^3
 *
 *    With a bit of math exercise you should be able to confirm that
 *    this is exactly D[Erf[beta*r]/r,r] divided by r another time.
 *
 * 6. Add the result to 1/r^3, multiply by the product of the charges,
 *    and you have your force (divided by r). A final multiplication
 *    with the vector connecting the two particles and you have your
 *    vectorial force to add to the particles.
 */
static __m256
gmx_mm256_pmecorrF_ps(__m256 z2)
{
    const __m256 FN6 = _mm256_set1_ps(-1.7357322914161492954e-8f);
    const __m256 FN5 = _mm256_set1_ps(1.4703624142580877519e-6f);
    const __m256 FN4 = _mm256_set1_ps(-0.000053401640219807709149f);
    const __m256 FN3 = _mm256_set1_ps(0.0010054721316683106153f);
    const __m256 FN2 = _mm256_set1_ps(-0.019278317264888380590f);
    const __m256 FN1 = _mm256_set1_ps(0.069670166153766424023f);
    const __m256 FN0 = _mm256_set1_ps(-0.75225204789749321333f);

    const __m256 FD4 = _mm256_set1_ps(0.0011193462567257629232f);
    const __m256 FD3 = _mm256_set1_ps(0.014866955030185295499f);
    const __m256 FD2 = _mm256_set1_ps(0.11583842382862377919f);
    const __m256 FD1 = _mm256_set1_ps(0.50736591960530292870f);
    const __m256 FD0 = _mm256_set1_ps(1.0f);

    __m256 z4;
    __m256 polyFN0,polyFN1,polyFD0,polyFD1;

    z4 = _mm256_mul_ps(z2,z2);

    polyFD0 = _mm256_mul_ps(FD4,z4);
    polyFD1 = _mm256_mul_ps(FD3,z4);
    polyFD0 = _mm256_add_ps(polyFD0,FD2);
    polyFD1 = _mm256_add_ps(polyFD1,FD1);
    polyFD0 = _mm256_mul_ps(polyFD0,z4);
    polyFD1 = _mm256_mul_ps(polyFD1,z2);
    polyFD0 = _mm256_add_ps(polyFD0,FD0);
    polyFD0 = _mm256_add_ps(polyFD0,polyFD1);

    polyFD0 = gmx_mm256_inv_ps(polyFD0);

    polyFN0 = _mm256_mul_ps(FN6,z4);
    polyFN1 = _mm256_mul_ps(FN5,z4);
    polyFN0 = _mm256_add_ps(polyFN0,FN4);
    polyFN1 = _mm256_add_ps(polyFN1,FN3);
    polyFN0 = _mm256_mul_ps(polyFN0,z4);
    polyFN1 = _mm256_mul_ps(polyFN1,z4);
    polyFN0 = _mm256_add_ps(polyFN0,FN2);
    polyFN1 = _mm256_add_ps(polyFN1,FN1);
    polyFN0 = _mm256_mul_ps(polyFN0,z4);
    polyFN1 = _mm256_mul_ps(polyFN1,z2);
    polyFN0 = _mm256_add_ps(polyFN0,FN0);
    polyFN0 = _mm256_add_ps(polyFN0,polyFN1);

    return _mm256_mul_ps(polyFN0,polyFD0);
}
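
/* Usage sketch (illustrative only; the helper name is hypothetical and not part of
 * this header), following steps 1-6 in the comment above gmx_mm256_pmecorrF_ps():
 * given squared distances rsq, the Ewald parameter beta and charge products qq,
 * compute the scalar f such that the vectorial force is f times the connecting vector.
 */
static gmx_inline __m256
gmx_mm256_example_pme_fscal_ps(__m256 rsq, float beta, __m256 qq)
{
    const __m256 vbeta  = _mm256_set1_ps(beta);
    const __m256 vbeta3 = _mm256_set1_ps(beta*beta*beta);
    __m256 z2,corr,rinv,rinvsq,rinvcub;

    rinv    = gmx_mm256_invsqrt_ps(rsq);                        /* 1/r               */
    rinvsq  = _mm256_mul_ps(rinv,rinv);                         /* 1/r^2             */
    rinvcub = _mm256_mul_ps(rinvsq,rinv);                       /* 1/r^3             */

    z2      = _mm256_mul_ps(rsq,_mm256_mul_ps(vbeta,vbeta));    /* z^2 = beta^2*r^2  */
    corr    = _mm256_mul_ps(vbeta3,gmx_mm256_pmecorrF_ps(z2));  /* beta^3*F(z^2)     */

    /* f = qq*(1/r^3 + correction); multiply by the distance vector to get the force */
    return _mm256_mul_ps(qq,_mm256_add_ps(rinvcub,corr));
}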

static __m128
gmx_mm_pmecorrF_ps(__m128 z2)
{
    const __m128 FN6 = _mm_set1_ps(-1.7357322914161492954e-8f);
    const __m128 FN5 = _mm_set1_ps(1.4703624142580877519e-6f);
    const __m128 FN4 = _mm_set1_ps(-0.000053401640219807709149f);
    const __m128 FN3 = _mm_set1_ps(0.0010054721316683106153f);
    const __m128 FN2 = _mm_set1_ps(-0.019278317264888380590f);
    const __m128 FN1 = _mm_set1_ps(0.069670166153766424023f);
    const __m128 FN0 = _mm_set1_ps(-0.75225204789749321333f);

    const __m128 FD4 = _mm_set1_ps(0.0011193462567257629232f);
    const __m128 FD3 = _mm_set1_ps(0.014866955030185295499f);
    const __m128 FD2 = _mm_set1_ps(0.11583842382862377919f);
    const __m128 FD1 = _mm_set1_ps(0.50736591960530292870f);
    const __m128 FD0 = _mm_set1_ps(1.0f);

    __m128 z4;
    __m128 polyFN0,polyFN1,polyFD0,polyFD1;

    z4 = _mm_mul_ps(z2,z2);

    polyFD0 = _mm_mul_ps(FD4,z4);
    polyFD1 = _mm_mul_ps(FD3,z4);
    polyFD0 = _mm_add_ps(polyFD0,FD2);
    polyFD1 = _mm_add_ps(polyFD1,FD1);
    polyFD0 = _mm_mul_ps(polyFD0,z4);
    polyFD1 = _mm_mul_ps(polyFD1,z2);
    polyFD0 = _mm_add_ps(polyFD0,FD0);
    polyFD0 = _mm_add_ps(polyFD0,polyFD1);

    polyFD0 = gmx_mm_inv_ps(polyFD0);

    polyFN0 = _mm_mul_ps(FN6,z4);
    polyFN1 = _mm_mul_ps(FN5,z4);
    polyFN0 = _mm_add_ps(polyFN0,FN4);
    polyFN1 = _mm_add_ps(polyFN1,FN3);
    polyFN0 = _mm_mul_ps(polyFN0,z4);
    polyFN1 = _mm_mul_ps(polyFN1,z4);
    polyFN0 = _mm_add_ps(polyFN0,FN2);
    polyFN1 = _mm_add_ps(polyFN1,FN1);
    polyFN0 = _mm_mul_ps(polyFN0,z4);
    polyFN1 = _mm_mul_ps(polyFN1,z2);
    polyFN0 = _mm_add_ps(polyFN0,FN0);
    polyFN0 = _mm_add_ps(polyFN0,polyFN1);

    return _mm_mul_ps(polyFN0,polyFD0);
}

/* Calculate the potential correction due to PME analytically.
 *
 * See gmx_mm256_pmecorrF_ps() for details about the approximation.
 *
 * This routine calculates Erf(z)/z, although you should provide z^2
 * as the input argument.
 *
 * Here's how it should be used:
 *
 * 1. Calculate r^2.
 * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
 * 3. Evaluate this routine with z^2 as the argument.
 * 4. The return value is the expression:
 *
 *        erf(z)
 *       --------
 *          z
 *
 * 5. Multiply the entire expression by beta and, switching back to r (z=r*beta), you get:
 *
 *       erf(r*beta)
 *       -----------
 *            r
 *
 * 6. Subtract the result from 1/r, multiply by the product of the charges,
 *    and you have your potential.
 */
static __m256
gmx_mm256_pmecorrV_ps(__m256 z2)
{
    const __m256 VN6 = _mm256_set1_ps(1.9296833005951166339e-8f);
    const __m256 VN5 = _mm256_set1_ps(-1.4213390571557850962e-6f);
    const __m256 VN4 = _mm256_set1_ps(0.000041603292906656984871f);
    const __m256 VN3 = _mm256_set1_ps(-0.00013134036773265025626f);
    const __m256 VN2 = _mm256_set1_ps(0.038657983986041781264f);
    const __m256 VN1 = _mm256_set1_ps(0.11285044772717598220f);
    const __m256 VN0 = _mm256_set1_ps(1.1283802385263030286f);

    const __m256 VD3 = _mm256_set1_ps(0.0066752224023576045451f);
    const __m256 VD2 = _mm256_set1_ps(0.078647795836373922256f);
    const __m256 VD1 = _mm256_set1_ps(0.43336185284710920150f);
    const __m256 VD0 = _mm256_set1_ps(1.0f);

    __m256 z4;
    __m256 polyVN0,polyVN1,polyVD0,polyVD1;

    z4 = _mm256_mul_ps(z2,z2);

    polyVD1 = _mm256_mul_ps(VD3,z4);
    polyVD0 = _mm256_mul_ps(VD2,z4);
    polyVD1 = _mm256_add_ps(polyVD1,VD1);
    polyVD0 = _mm256_add_ps(polyVD0,VD0);
    polyVD1 = _mm256_mul_ps(polyVD1,z2);
    polyVD0 = _mm256_add_ps(polyVD0,polyVD1);

    polyVD0 = gmx_mm256_inv_ps(polyVD0);

    polyVN0 = _mm256_mul_ps(VN6,z4);
    polyVN1 = _mm256_mul_ps(VN5,z4);
    polyVN0 = _mm256_add_ps(polyVN0,VN4);
    polyVN1 = _mm256_add_ps(polyVN1,VN3);
    polyVN0 = _mm256_mul_ps(polyVN0,z4);
    polyVN1 = _mm256_mul_ps(polyVN1,z4);
    polyVN0 = _mm256_add_ps(polyVN0,VN2);
    polyVN1 = _mm256_add_ps(polyVN1,VN1);
    polyVN0 = _mm256_mul_ps(polyVN0,z4);
    polyVN1 = _mm256_mul_ps(polyVN1,z2);
    polyVN0 = _mm256_add_ps(polyVN0,VN0);
    polyVN0 = _mm256_add_ps(polyVN0,polyVN1);

    return _mm256_mul_ps(polyVN0,polyVD0);
}
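
/* Usage sketch (illustrative only; the helper name is hypothetical and not part of
 * this header), following the steps in the comment above gmx_mm256_pmecorrV_ps():
 * the direct-space PME potential for charge products qq is qq*(1/r - beta*V(z^2))
 * with z^2 = beta^2*r^2, since beta*erf(z)/z = erf(beta*r)/r.
 */
static gmx_inline __m256
gmx_mm256_example_pme_pot_ps(__m256 rsq, float beta, __m256 qq)
{
    const __m256 vbeta = _mm256_set1_ps(beta);
    __m256 rinv,z2,vcorr;

    rinv  = gmx_mm256_invsqrt_ps(rsq);                        /* 1/r               */
    z2    = _mm256_mul_ps(rsq,_mm256_mul_ps(vbeta,vbeta));    /* z^2 = beta^2*r^2  */
    vcorr = _mm256_mul_ps(vbeta,gmx_mm256_pmecorrV_ps(z2));   /* erf(beta*r)/r     */

    return _mm256_mul_ps(qq,_mm256_sub_ps(rinv,vcorr));       /* qq*(1/r - erf(beta*r)/r) */
}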

static __m128
gmx_mm_pmecorrV_ps(__m128 z2)
{
    const __m128 VN6 = _mm_set1_ps(1.9296833005951166339e-8f);
    const __m128 VN5 = _mm_set1_ps(-1.4213390571557850962e-6f);
    const __m128 VN4 = _mm_set1_ps(0.000041603292906656984871f);
    const __m128 VN3 = _mm_set1_ps(-0.00013134036773265025626f);
    const __m128 VN2 = _mm_set1_ps(0.038657983986041781264f);
    const __m128 VN1 = _mm_set1_ps(0.11285044772717598220f);
    const __m128 VN0 = _mm_set1_ps(1.1283802385263030286f);

    const __m128 VD3 = _mm_set1_ps(0.0066752224023576045451f);
    const __m128 VD2 = _mm_set1_ps(0.078647795836373922256f);
    const __m128 VD1 = _mm_set1_ps(0.43336185284710920150f);
    const __m128 VD0 = _mm_set1_ps(1.0f);

    __m128 z4;
    __m128 polyVN0,polyVN1,polyVD0,polyVD1;

    z4 = _mm_mul_ps(z2,z2);

    polyVD1 = _mm_mul_ps(VD3,z4);
    polyVD0 = _mm_mul_ps(VD2,z4);
    polyVD1 = _mm_add_ps(polyVD1,VD1);
    polyVD0 = _mm_add_ps(polyVD0,VD0);
    polyVD1 = _mm_mul_ps(polyVD1,z2);
    polyVD0 = _mm_add_ps(polyVD0,polyVD1);

    polyVD0 = gmx_mm_inv_ps(polyVD0);

    polyVN0 = _mm_mul_ps(VN6,z4);
    polyVN1 = _mm_mul_ps(VN5,z4);
    polyVN0 = _mm_add_ps(polyVN0,VN4);
    polyVN1 = _mm_add_ps(polyVN1,VN3);
    polyVN0 = _mm_mul_ps(polyVN0,z4);
    polyVN1 = _mm_mul_ps(polyVN1,z4);
    polyVN0 = _mm_add_ps(polyVN0,VN2);
    polyVN1 = _mm_add_ps(polyVN1,VN1);
    polyVN0 = _mm_mul_ps(polyVN0,z4);
    polyVN1 = _mm_mul_ps(polyVN1,z2);
    polyVN0 = _mm_add_ps(polyVN0,VN0);
    polyVN0 = _mm_add_ps(polyVN0,polyVN1);

    return _mm_mul_ps(polyVN0,polyVD0);
}
1531 static int
1532 gmx_mm256_sincos_ps(__m256 x,
1533 __m256 *sinval,
1534 __m256 *cosval)
1536 const __m256 two_over_pi = _mm256_set1_ps(2.0f/(float)M_PI);
1537 const __m256 half = _mm256_set1_ps(0.5f);
1538 const __m256 one = _mm256_set1_ps(1.0f);
1539 const __m256 zero = _mm256_setzero_ps();
1541 const __m128i ione = _mm_set1_epi32(1);
1543 const __m256 mask_one = _mm256_castsi256_ps(_mm256_set1_epi32(1));
1544 const __m256 mask_two = _mm256_castsi256_ps(_mm256_set1_epi32(2));
1545 const __m256 mask_three = _mm256_castsi256_ps(_mm256_set1_epi32(3));
1547 const __m256 CA1 = _mm256_set1_ps(1.5703125f);
1548 const __m256 CA2 = _mm256_set1_ps(4.837512969970703125e-4f);
1549 const __m256 CA3 = _mm256_set1_ps(7.54978995489188216e-8f);
1551 const __m256 CC0 = _mm256_set1_ps(-0.0013602249f);
1552 const __m256 CC1 = _mm256_set1_ps(0.0416566950f);
1553 const __m256 CC2 = _mm256_set1_ps(-0.4999990225f);
1554 const __m256 CS0 = _mm256_set1_ps(-0.0001950727f);
1555 const __m256 CS1 = _mm256_set1_ps(0.0083320758f);
1556 const __m256 CS2 = _mm256_set1_ps(-0.1666665247f);
1558 const __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1560 __m256 y,y2;
1561 __m256 z;
1562 __m256i iz;
1563 __m128i iz_high,iz_low;
1564 __m256 offset_sin,offset_cos;
1565 __m256 mask_sin,mask_cos;
1566 __m256 tmp1,tmp2;
1567 __m256 tmp_sin,tmp_cos;
1569 y = _mm256_mul_ps(x,two_over_pi);
1570 y = _mm256_add_ps(y,_mm256_or_ps(_mm256_and_ps(y,signbit),half));
1572 iz = _mm256_cvttps_epi32(y);
1573 z = _mm256_round_ps(y,_MM_FROUND_TO_ZERO);
1575 offset_sin = _mm256_and_ps(_mm256_castsi256_ps(iz),mask_three);
1577 iz_high = _mm256_extractf128_si256(iz,0x1);
1578 iz_low = _mm256_castsi256_si128(iz);
1579 iz_low = _mm_add_epi32(iz_low,ione);
1580 iz_high = _mm_add_epi32(iz_high,ione);
1581 iz = _mm256_castsi128_si256(iz_low);
1582 iz = _mm256_insertf128_si256(iz,iz_high,0x1);
1583 offset_cos = _mm256_castsi256_ps(iz);
1585 /* Extended precision arithmethic to achieve full precision */
1586 y = _mm256_mul_ps(z,CA1);
1587 tmp1 = _mm256_mul_ps(z,CA2);
1588 tmp2 = _mm256_mul_ps(z,CA3);
1589 y = _mm256_sub_ps(x,y);
1590 y = _mm256_sub_ps(y,tmp1);
1591 y = _mm256_sub_ps(y,tmp2);
1593 y2 = _mm256_mul_ps(y,y);
1595 tmp1 = _mm256_mul_ps(CC0,y2);
1596 tmp1 = _mm256_add_ps(tmp1,CC1);
1597 tmp2 = _mm256_mul_ps(CS0,y2);
1598 tmp2 = _mm256_add_ps(tmp2,CS1);
1599 tmp1 = _mm256_mul_ps(tmp1,y2);
1600 tmp1 = _mm256_add_ps(tmp1,CC2);
1601 tmp2 = _mm256_mul_ps(tmp2,y2);
1602 tmp2 = _mm256_add_ps(tmp2,CS2);
1604 tmp1 = _mm256_mul_ps(tmp1,y2);
1605 tmp1 = _mm256_add_ps(tmp1,one);
1607 tmp2 = _mm256_mul_ps(tmp2,_mm256_mul_ps(y,y2));
1608 tmp2 = _mm256_add_ps(tmp2,y);
1610 #ifdef __INTEL_COMPILER
1611 /* Intel Compiler version 12.1.3 20120130 is buggy if optimization is enabled unless we cast explicitly! */
1612 mask_sin = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(offset_sin,mask_one))),zero,_CMP_EQ_OQ);
1613 mask_cos = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(offset_cos,mask_one))),zero,_CMP_EQ_OQ);
1614 #else
1615 mask_sin = _mm256_cmp_ps( _mm256_and_ps(offset_sin,mask_one), zero, _CMP_EQ_OQ);
1616 mask_cos = _mm256_cmp_ps( _mm256_and_ps(offset_cos,mask_one), zero, _CMP_EQ_OQ);
1617 #endif
1618 tmp_sin = _mm256_blendv_ps(tmp1,tmp2,mask_sin);
1619 tmp_cos = _mm256_blendv_ps(tmp1,tmp2,mask_cos);
1621 tmp1 = _mm256_xor_ps(signbit,tmp_sin);
1622 tmp2 = _mm256_xor_ps(signbit,tmp_cos);
1624 #ifdef __INTEL_COMPILER
1625 /* Intel Compiler version 12.1.3 20120130 miscompiles this comparison when optimization is enabled unless we do the conversion explicitly! */
1626 mask_sin = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(offset_sin,mask_two))),zero,_CMP_EQ_OQ);
1627 mask_cos = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(offset_cos,mask_two))),zero,_CMP_EQ_OQ);
1628 #else
1629 mask_sin = _mm256_cmp_ps( _mm256_and_ps(offset_sin,mask_two), zero, _CMP_EQ_OQ);
1630 mask_cos = _mm256_cmp_ps( _mm256_and_ps(offset_cos,mask_two), zero, _CMP_EQ_OQ);
1632 #endif
1633 *sinval = _mm256_blendv_ps(tmp1,tmp_sin,mask_sin);
1634 *cosval = _mm256_blendv_ps(tmp2,tmp_cos,mask_cos);
1636 return 0;
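/* Scalar sketch of the reduction performed by gmx_mm256_sincos_ps() above: x is
 * written as x = k*(pi/2) + y with |y| <= pi/4, where pi/2 is split into
 * CA1+CA2+CA3 (Cody-Waite style) for extra precision, and the quadrant k mod 4
 * selects and possibly negates the core sine/cosine polynomials. The helper name
 * and the plain-float arithmetic are illustrative assumptions, not part of the
 * SIMD interface.
 */
static void
gmx_sincosf_sketch(float x, float *sinval, float *cosval)
{
    const float CA1 = 1.5703125f, CA2 = 4.837512969970703125e-4f, CA3 = 7.54978995489188216e-8f;
    int         k   = (int)(x*(2.0f/(float)M_PI) + (x >= 0.0f ? 0.5f : -0.5f)); /* nearest multiple of pi/2 */
    float       z   = (float)k;
    float       y   = ((x - z*CA1) - z*CA2) - z*CA3;  /* reduced argument, |y| <= pi/4 */
    float       y2  = y*y;
    /* Core approximations on [-pi/4,pi/4], same coefficients as above */
    float       s   = ((-0.0001950727f*y2 + 0.0083320758f)*y2 - 0.1666665247f)*y2*y + y;
    float       c   = ((-0.0013602249f*y2 + 0.0416566950f)*y2 - 0.4999990225f)*y2 + 1.0f;
    int         ks  = k & 3;
    int         kc  = (k + 1) & 3;

    *sinval = (ks & 1) ? ((ks & 2) ? -c : c) : ((ks & 2) ? -s : s);
    *cosval = (kc & 1) ? ((kc & 2) ? -c : c) : ((kc & 2) ? -s : s);
}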
1639 static int
1640 gmx_mm_sincos_ps(__m128 x,
1641 __m128 *sinval,
1642 __m128 *cosval)
1644 const __m128 two_over_pi = _mm_set1_ps(2.0f/(float)M_PI);
1645 const __m128 half = _mm_set1_ps(0.5f);
1646 const __m128 one = _mm_set1_ps(1.0f);
1648 const __m128i izero = _mm_set1_epi32(0);
1649 const __m128i ione = _mm_set1_epi32(1);
1650 const __m128i itwo = _mm_set1_epi32(2);
1651 const __m128i ithree = _mm_set1_epi32(3);
1652 const __m128 signbit = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1654 const __m128 CA1 = _mm_set1_ps(1.5703125f);
1655 const __m128 CA2 = _mm_set1_ps(4.837512969970703125e-4f);
1656 const __m128 CA3 = _mm_set1_ps(7.54978995489188216e-8f);
1658 const __m128 CC0 = _mm_set1_ps(-0.0013602249f);
1659 const __m128 CC1 = _mm_set1_ps(0.0416566950f);
1660 const __m128 CC2 = _mm_set1_ps(-0.4999990225f);
1661 const __m128 CS0 = _mm_set1_ps(-0.0001950727f);
1662 const __m128 CS1 = _mm_set1_ps(0.0083320758f);
1663 const __m128 CS2 = _mm_set1_ps(-0.1666665247f);
1665 __m128 y,y2;
1666 __m128 z;
1667 __m128i iz;
1668 __m128i offset_sin,offset_cos;
1669 __m128 tmp1,tmp2;
1670 __m128 mask_sin,mask_cos;
1671 __m128 tmp_sin,tmp_cos;
1673 y = _mm_mul_ps(x,two_over_pi);
1674 y = _mm_add_ps(y,_mm_or_ps(_mm_and_ps(y,signbit),half));
1676 iz = _mm_cvttps_epi32(y);
1677 z = _mm_round_ps(y,_MM_FROUND_TO_ZERO);
1679 offset_sin = _mm_and_si128(iz,ithree);
1680 offset_cos = _mm_add_epi32(iz,ione);
1682 /* Extended precision arithmetic to achieve full precision */
1683 y = _mm_mul_ps(z,CA1);
1684 tmp1 = _mm_mul_ps(z,CA2);
1685 tmp2 = _mm_mul_ps(z,CA3);
1686 y = _mm_sub_ps(x,y);
1687 y = _mm_sub_ps(y,tmp1);
1688 y = _mm_sub_ps(y,tmp2);
1690 y2 = _mm_mul_ps(y,y);
1692 tmp1 = _mm_mul_ps(CC0,y2);
1693 tmp1 = _mm_add_ps(tmp1,CC1);
1694 tmp2 = _mm_mul_ps(CS0,y2);
1695 tmp2 = _mm_add_ps(tmp2,CS1);
1696 tmp1 = _mm_mul_ps(tmp1,y2);
1697 tmp1 = _mm_add_ps(tmp1,CC2);
1698 tmp2 = _mm_mul_ps(tmp2,y2);
1699 tmp2 = _mm_add_ps(tmp2,CS2);
1701 tmp1 = _mm_mul_ps(tmp1,y2);
1702 tmp1 = _mm_add_ps(tmp1,one);
1704 tmp2 = _mm_mul_ps(tmp2,_mm_mul_ps(y,y2));
1705 tmp2 = _mm_add_ps(tmp2,y);
1707 mask_sin = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin,ione), izero));
1708 mask_cos = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos,ione), izero));
1710 tmp_sin = _mm_blendv_ps(tmp1,tmp2,mask_sin);
1711 tmp_cos = _mm_blendv_ps(tmp1,tmp2,mask_cos);
1713 mask_sin = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin,itwo), izero));
1714 mask_cos = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos,itwo), izero));
1716 tmp1 = _mm_xor_ps(signbit,tmp_sin);
1717 tmp2 = _mm_xor_ps(signbit,tmp_cos);
1719 *sinval = _mm_blendv_ps(tmp1,tmp_sin,mask_sin);
1720 *cosval = _mm_blendv_ps(tmp2,tmp_cos,mask_cos);
1722 return 0;
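/* Illustrative usage sketch (the helper below and its array-based interface are
 * assumptions made for demonstration, not part of the original API): both the
 * sine and the cosine of four packed angles are obtained with a single call.
 */
static void
gmx_mm_sincos4_example(const float angle[4], float sine[4], float cosine[4])
{
    __m128 s, c;

    gmx_mm_sincos_ps(_mm_loadu_ps(angle), &s, &c);
    _mm_storeu_ps(sine,   s);
    _mm_storeu_ps(cosine, c);
}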
1729 * IMPORTANT: If you need both the sine and the cosine, call sincos() once rather than
1730 * sin() and cos() separately; each wrapper runs the full sincos() routine, so separate calls waste a factor 2 in performance!
1732 static __m256
1733 gmx_mm256_sin_ps(__m256 x)
1735 __m256 s,c;
1736 gmx_mm256_sincos_ps(x,&s,&c);
1737 return s;
1740 static __m128
1741 gmx_mm_sin_ps(__m128 x)
1743 __m128 s,c;
1744 gmx_mm_sincos_ps(x,&s,&c);
1745 return s;
1750 * IMPORTANT: If you need both the sine and the cosine, call sincos() once rather than
1751 * sin() and cos() separately; each wrapper runs the full sincos() routine, so separate calls waste a factor 2 in performance!
1753 static __m256
1754 gmx_mm256_cos_ps(__m256 x)
1756 __m256 s,c;
1757 gmx_mm256_sincos_ps(x,&s,&c);
1758 return c;
1761 static __m128
1762 gmx_mm_cos_ps(__m128 x)
1764 __m128 s,c;
1765 gmx_mm_sincos_ps(x,&s,&c);
1766 return c;
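/* Usage note for the wrappers above (hypothetical variable names): when both
 * results are needed, prefer a single sincos call,
 *
 *     __m256 s, c;
 *     gmx_mm256_sincos_ps(x, &s, &c);
 *
 * over the pair s = gmx_mm256_sin_ps(x); c = gmx_mm256_cos_ps(x);, which runs the
 * full sincos evaluation twice.
 */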
1770 static __m256
1771 gmx_mm256_tan_ps(__m256 x)
1773 __m256 sinval,cosval;
1774 __m256 tanval;
1776 gmx_mm256_sincos_ps(x,&sinval,&cosval);
1778 tanval = _mm256_mul_ps(sinval,gmx_mm256_inv_ps(cosval));
1780 return tanval;
1783 static __m128
1784 gmx_mm_tan_ps(__m128 x)
1786 __m128 sinval,cosval;
1787 __m128 tanval;
1789 gmx_mm_sincos_ps(x,&sinval,&cosval);
1791 tanval = _mm_mul_ps(sinval,gmx_mm_inv_ps(cosval));
1793 return tanval;
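/* Usage sketch (hypothetical variables): if the sine and cosine are needed in
 * addition to the tangent, call sincos once and form the ratio directly, exactly
 * as gmx_mm_tan_ps() does internally with the Newton-Raphson refined reciprocal:
 *
 *     __m128 s, c, t;
 *     gmx_mm_sincos_ps(x, &s, &c);
 *     t = _mm_mul_ps(s, gmx_mm_inv_ps(c));
 */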
1797 static __m256
1798 gmx_mm256_asin_ps(__m256 x)
1800 const __m256 signmask = _mm256_castsi256_ps( _mm256_set1_epi32(0x7FFFFFFF) );
1801 const __m256 limitlow = _mm256_set1_ps(1e-4f);
1802 const __m256 half = _mm256_set1_ps(0.5f);
1803 const __m256 one = _mm256_set1_ps(1.0f);
1804 const __m256 halfpi = _mm256_set1_ps((float)M_PI/2.0f);
1806 const __m256 CC5 = _mm256_set1_ps(4.2163199048E-2f);
1807 const __m256 CC4 = _mm256_set1_ps(2.4181311049E-2f);
1808 const __m256 CC3 = _mm256_set1_ps(4.5470025998E-2f);
1809 const __m256 CC2 = _mm256_set1_ps(7.4953002686E-2f);
1810 const __m256 CC1 = _mm256_set1_ps(1.6666752422E-1f);
1812 __m256 sign;
1813 __m256 mask;
1814 __m256 xabs;
1815 __m256 z,z1,z2,q,q1,q2;
1816 __m256 pA,pB;
1818 sign = _mm256_andnot_ps(signmask,x);
1819 xabs = _mm256_and_ps(x,signmask);
1821 mask = _mm256_cmp_ps(xabs,half,_CMP_GT_OQ);
1823 z1 = _mm256_mul_ps(half, _mm256_sub_ps(one,xabs));
1824 q1 = _mm256_mul_ps(z1,gmx_mm256_invsqrt_ps(z1));
1825 q1 = _mm256_andnot_ps(_mm256_cmp_ps(xabs,one,_CMP_EQ_OQ),q1);
1827 q2 = xabs;
1828 z2 = _mm256_mul_ps(q2,q2);
1830 z = _mm256_blendv_ps(z2,z1,mask);
1831 q = _mm256_blendv_ps(q2,q1,mask);
1833 z2 = _mm256_mul_ps(z,z);
1835 pA = _mm256_mul_ps(CC5,z2);
1836 pB = _mm256_mul_ps(CC4,z2);
1838 pA = _mm256_add_ps(pA,CC3);
1839 pB = _mm256_add_ps(pB,CC2);
1841 pA = _mm256_mul_ps(pA,z2);
1842 pB = _mm256_mul_ps(pB,z2);
1844 pA = _mm256_add_ps(pA,CC1);
1845 pA = _mm256_mul_ps(pA,z);
1847 z = _mm256_add_ps(pA,pB);
1848 z = _mm256_mul_ps(z,q);
1849 z = _mm256_add_ps(z,q);
1851 q2 = _mm256_sub_ps(halfpi,z);
1852 q2 = _mm256_sub_ps(q2,z);
1854 z = _mm256_blendv_ps(z,q2,mask);
1856 mask = _mm256_cmp_ps(xabs,limitlow,_CMP_GT_OQ);
1857 z = _mm256_blendv_ps(xabs,z,mask);
1859 z = _mm256_xor_ps(z,sign);
1861 return z;
1864 static __m128
1865 gmx_mm_asin_ps(__m128 x)
1867 /* Same algorithm as cephes library */
1868 const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
1869 const __m128 limitlow = _mm_set1_ps(1e-4f);
1870 const __m128 half = _mm_set1_ps(0.5f);
1871 const __m128 one = _mm_set1_ps(1.0f);
1872 const __m128 halfpi = _mm_set1_ps(M_PI/2.0f);
1874 const __m128 CC5 = _mm_set1_ps(4.2163199048E-2f);
1875 const __m128 CC4 = _mm_set1_ps(2.4181311049E-2f);
1876 const __m128 CC3 = _mm_set1_ps(4.5470025998E-2f);
1877 const __m128 CC2 = _mm_set1_ps(7.4953002686E-2f);
1878 const __m128 CC1 = _mm_set1_ps(1.6666752422E-1f);
1880 __m128 sign;
1881 __m128 mask;
1882 __m128 xabs;
1883 __m128 z,z1,z2,q,q1,q2;
1884 __m128 pA,pB;
1886 sign = _mm_andnot_ps(signmask,x);
1887 xabs = _mm_and_ps(x,signmask);
1889 mask = _mm_cmpgt_ps(xabs,half);
1891 z1 = _mm_mul_ps(half, _mm_sub_ps(one,xabs));
1892 q1 = _mm_mul_ps(z1,gmx_mm_invsqrt_ps(z1));
1893 q1 = _mm_andnot_ps(_mm_cmpeq_ps(xabs,one),q1);
1895 q2 = xabs;
1896 z2 = _mm_mul_ps(q2,q2);
1898 z = _mm_or_ps( _mm_and_ps(mask,z1) , _mm_andnot_ps(mask,z2) );
1899 q = _mm_or_ps( _mm_and_ps(mask,q1) , _mm_andnot_ps(mask,q2) );
1901 z2 = _mm_mul_ps(z,z);
1903 pA = _mm_mul_ps(CC5,z2);
1904 pB = _mm_mul_ps(CC4,z2);
1906 pA = _mm_add_ps(pA,CC3);
1907 pB = _mm_add_ps(pB,CC2);
1909 pA = _mm_mul_ps(pA,z2);
1910 pB = _mm_mul_ps(pB,z2);
1912 pA = _mm_add_ps(pA,CC1);
1913 pA = _mm_mul_ps(pA,z);
1915 z = _mm_add_ps(pA,pB);
1916 z = _mm_mul_ps(z,q);
1917 z = _mm_add_ps(z,q);
1919 q2 = _mm_sub_ps(halfpi,z);
1920 q2 = _mm_sub_ps(q2,z);
1922 z = _mm_or_ps( _mm_and_ps(mask,q2) , _mm_andnot_ps(mask,z) );
1924 mask = _mm_cmpgt_ps(xabs,limitlow);
1925 z = _mm_or_ps( _mm_and_ps(mask,z) , _mm_andnot_ps(mask,xabs) );
1927 z = _mm_xor_ps(z,sign);
1929 return z;
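/* Scalar sketch of the cephes-style arcsine used above. The helper name is an
 * assumption, and sqrtf() (assumed available from <math.h>) stands in for the
 * z*invsqrt(z) product and the |x|==1 masking done in the SIMD code.
 */
static float
gmx_asinf_sketch(float x)
{
    float a = (x < 0.0f) ? -x : x;
    float w, q, p;

    if (a > 0.5f)
    {
        /* asin(a) = pi/2 - 2*asin(sqrt((1-a)/2)) */
        w = 0.5f*(1.0f - a);
        q = sqrtf(w);
    }
    else
    {
        /* asin(a) = a + a*P(a^2) for |a| <= 0.5 */
        w = a*a;
        q = a;
    }
    p = ((((4.2163199048e-2f*w + 2.4181311049e-2f)*w + 4.5470025998e-2f)*w
          + 7.4953002686e-2f)*w + 1.6666752422e-1f)*w;
    p = q + q*p;
    if (a > 0.5f)
    {
        p = 0.5f*(float)M_PI - 2.0f*p;
    }
    if (a <= 1e-4f)
    {
        p = a;    /* below this limit asin(a) ~= a, as in the limitlow branch above */
    }
    return (x < 0.0f) ? -p : p;
}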
1933 static __m256
1934 gmx_mm256_acos_ps(__m256 x)
1936 const __m256 signmask = _mm256_castsi256_ps( _mm256_set1_epi32(0x7FFFFFFF) );
1937 const __m256 one_ps = _mm256_set1_ps(1.0f);
1938 const __m256 half_ps = _mm256_set1_ps(0.5f);
1939 const __m256 pi_ps = _mm256_set1_ps((float)M_PI);
1940 const __m256 halfpi_ps = _mm256_set1_ps((float)M_PI/2.0f);
1942 __m256 mask1;
1943 __m256 mask2;
1944 __m256 xabs;
1945 __m256 z,z1,z2,z3;
1947 xabs = _mm256_and_ps(x,signmask);
1948 mask1 = _mm256_cmp_ps(xabs,half_ps,_CMP_GT_OQ);
1949 mask2 = _mm256_cmp_ps(x,_mm256_setzero_ps(),_CMP_GT_OQ);
1951 z = _mm256_mul_ps(half_ps,_mm256_sub_ps(one_ps,xabs));
1952 z = _mm256_mul_ps(z,gmx_mm256_invsqrt_ps(z));
1953 z = _mm256_andnot_ps(_mm256_cmp_ps(xabs,one_ps,_CMP_EQ_OQ),z);
1955 z = _mm256_blendv_ps(x,z,mask1);
1956 z = gmx_mm256_asin_ps(z);
1958 z2 = _mm256_add_ps(z,z);
1959 z1 = _mm256_sub_ps(pi_ps,z2);
1960 z3 = _mm256_sub_ps(halfpi_ps,z);
1962 z = _mm256_blendv_ps(z1,z2,mask2);
1963 z = _mm256_blendv_ps(z3,z,mask1);
1965 return z;
1968 static __m128
1969 gmx_mm_acos_ps(__m128 x)
1971 const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
1972 const __m128 one_ps = _mm_set1_ps(1.0f);
1973 const __m128 half_ps = _mm_set1_ps(0.5f);
1974 const __m128 pi_ps = _mm_set1_ps(M_PI);
1975 const __m128 halfpi_ps = _mm_set1_ps(M_PI/2.0f);
1977 __m128 mask1;
1978 __m128 mask2;
1979 __m128 xabs;
1980 __m128 z,z1,z2,z3;
1982 xabs = _mm_and_ps(x,signmask);
1983 mask1 = _mm_cmpgt_ps(xabs,half_ps);
1984 mask2 = _mm_cmpgt_ps(x,_mm_setzero_ps());
1986 z = _mm_mul_ps(half_ps,_mm_sub_ps(one_ps,xabs));
1987 z = _mm_mul_ps(z,gmx_mm_invsqrt_ps(z));
1988 z = _mm_andnot_ps(_mm_cmpeq_ps(xabs,one_ps),z);
1990 z = _mm_blendv_ps(x,z,mask1);
1991 z = gmx_mm_asin_ps(z);
1993 z2 = _mm_add_ps(z,z);
1994 z1 = _mm_sub_ps(pi_ps,z2);
1995 z3 = _mm_sub_ps(halfpi_ps,z);
1997 z = _mm_blendv_ps(z1,z2,mask2);
1998 z = _mm_blendv_ps(z3,z,mask1);
2000 return z;
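/* Scalar sketch of the arccosine construction above, expressed through the
 * arcsine sketch (gmx_asinf_sketch and the use of sqrtf() are illustrative
 * assumptions; the SIMD code uses its own invsqrt-based asin).
 */
static float
gmx_acosf_sketch(float x)
{
    float a = (x < 0.0f) ? -x : x;

    if (a > 0.5f)
    {
        /* acos(|x|) = 2*asin(sqrt((1-|x|)/2)), reflected around pi for negative x */
        float s = gmx_asinf_sketch(sqrtf(0.5f*(1.0f - a)));
        return (x > 0.0f) ? 2.0f*s : (float)M_PI - 2.0f*s;
    }
    /* for |x| <= 0.5 simply use acos(x) = pi/2 - asin(x) */
    return 0.5f*(float)M_PI - gmx_asinf_sketch(x);
}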
2004 static __m256
2005 gmx_mm256_atan_ps(__m256 x)
2007 const __m256 signmask = _mm256_castsi256_ps( _mm256_set1_epi32(0x7FFFFFFF) );
2008 const __m256 limit1 = _mm256_set1_ps(0.414213562373095f);
2009 const __m256 limit2 = _mm256_set1_ps(2.414213562373095f);
2010 const __m256 quarterpi = _mm256_set1_ps(0.785398163397448f);
2011 const __m256 halfpi = _mm256_set1_ps(1.570796326794896f);
2012 const __m256 mone = _mm256_set1_ps(-1.0f);
2013 const __m256 CC3 = _mm256_set1_ps(-3.33329491539E-1f);
2014 const __m256 CC5 = _mm256_set1_ps(1.99777106478E-1f);
2015 const __m256 CC7 = _mm256_set1_ps(-1.38776856032E-1f);
2016 const __m256 CC9 = _mm256_set1_ps(8.05374449538e-2f);
2018 __m256 sign;
2019 __m256 mask1,mask2;
2020 __m256 y,z1,z2;
2021 __m256 x2,x4;
2022 __m256 sum1,sum2;
2024 sign = _mm256_andnot_ps(signmask,x);
2025 x = _mm256_and_ps(x,signmask);
2027 mask1 = _mm256_cmp_ps(x,limit1,_CMP_GT_OQ);
2028 mask2 = _mm256_cmp_ps(x,limit2,_CMP_GT_OQ);
2030 z1 = _mm256_mul_ps(_mm256_add_ps(x,mone),gmx_mm256_inv_ps(_mm256_sub_ps(x,mone)));
2031 z2 = _mm256_mul_ps(mone,gmx_mm256_inv_ps(x));
2033 y = _mm256_and_ps(mask1,quarterpi);
2034 y = _mm256_blendv_ps(y,halfpi,mask2);
2036 x = _mm256_blendv_ps(x,z1,mask1);
2037 x = _mm256_blendv_ps(x,z2,mask2);
2039 x2 = _mm256_mul_ps(x,x);
2040 x4 = _mm256_mul_ps(x2,x2);
2042 sum1 = _mm256_mul_ps(CC9,x4);
2043 sum2 = _mm256_mul_ps(CC7,x4);
2044 sum1 = _mm256_add_ps(sum1,CC5);
2045 sum2 = _mm256_add_ps(sum2,CC3);
2046 sum1 = _mm256_mul_ps(sum1,x4);
2047 sum2 = _mm256_mul_ps(sum2,x2);
2049 sum1 = _mm256_add_ps(sum1,sum2);
2050 sum1 = _mm256_sub_ps(sum1,mone);
2051 sum1 = _mm256_mul_ps(sum1,x);
2052 y = _mm256_add_ps(y,sum1);
2054 y = _mm256_xor_ps(y,sign);
2056 return y;
2059 static __m128
2060 gmx_mm_atan_ps(__m128 x)
2062 /* Same algorithm as cephes library */
2063 const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
2064 const __m128 limit1 = _mm_set1_ps(0.414213562373095f);
2065 const __m128 limit2 = _mm_set1_ps(2.414213562373095f);
2066 const __m128 quarterpi = _mm_set1_ps(0.785398163397448f);
2067 const __m128 halfpi = _mm_set1_ps(1.570796326794896f);
2068 const __m128 mone = _mm_set1_ps(-1.0f);
2069 const __m128 CC3 = _mm_set1_ps(-3.33329491539E-1f);
2070 const __m128 CC5 = _mm_set1_ps(1.99777106478E-1f);
2071 const __m128 CC7 = _mm_set1_ps(-1.38776856032E-1f);
2072 const __m128 CC9 = _mm_set1_ps(8.05374449538e-2f);
2074 __m128 sign;
2075 __m128 mask1,mask2;
2076 __m128 y,z1,z2;
2077 __m128 x2,x4;
2078 __m128 sum1,sum2;
2080 sign = _mm_andnot_ps(signmask,x);
2081 x = _mm_and_ps(x,signmask);
2083 mask1 = _mm_cmpgt_ps(x,limit1);
2084 mask2 = _mm_cmpgt_ps(x,limit2);
2086 z1 = _mm_mul_ps(_mm_add_ps(x,mone),gmx_mm_inv_ps(_mm_sub_ps(x,mone)));
2087 z2 = _mm_mul_ps(mone,gmx_mm_inv_ps(x));
2089 y = _mm_and_ps(mask1,quarterpi);
2090 y = _mm_blendv_ps(y,halfpi,mask2);
2092 x = _mm_blendv_ps(x,z1,mask1);
2093 x = _mm_blendv_ps(x,z2,mask2);
2095 x2 = _mm_mul_ps(x,x);
2096 x4 = _mm_mul_ps(x2,x2);
2098 sum1 = _mm_mul_ps(CC9,x4);
2099 sum2 = _mm_mul_ps(CC7,x4);
2100 sum1 = _mm_add_ps(sum1,CC5);
2101 sum2 = _mm_add_ps(sum2,CC3);
2102 sum1 = _mm_mul_ps(sum1,x4);
2103 sum2 = _mm_mul_ps(sum2,x2);
2105 sum1 = _mm_add_ps(sum1,sum2);
2106 sum1 = _mm_sub_ps(sum1,mone);
2107 sum1 = _mm_mul_ps(sum1,x);
2108 y = _mm_add_ps(y,sum1);
2110 y = _mm_xor_ps(y,sign);
2112 return y;
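/* Scalar sketch of the cephes-style arctangent range reduction used above (the
 * helper name is an assumption): arguments above tan(3*pi/8) are folded with
 * atan(a) = pi/2 - atan(1/a), arguments above tan(pi/8) with
 * atan(a) = pi/4 + atan((a-1)/(a+1)), and the remainder is handled by the odd
 * polynomial with the same coefficients as above.
 */
static float
gmx_atanf_sketch(float x)
{
    float sign = (x < 0.0f) ? -1.0f : 1.0f;
    float a    = sign*x;
    float y    = 0.0f;
    float a2;

    if (a > 2.414213562373095f)        /* tan(3*pi/8) */
    {
        y = 1.570796326794896f;
        a = -1.0f/a;
    }
    else if (a > 0.414213562373095f)   /* tan(pi/8) */
    {
        y = 0.785398163397448f;
        a = (a - 1.0f)/(a + 1.0f);
    }
    a2 = a*a;
    y += ((((8.05374449538e-2f*a2 - 1.38776856032e-1f)*a2
            + 1.99777106478e-1f)*a2 - 3.33329491539e-1f)*a2 + 1.0f)*a;

    return sign*y;
}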
2116 static __m256
2117 gmx_mm256_atan2_ps(__m256 y, __m256 x)
2119 const __m256 pi = _mm256_set1_ps( (float) M_PI);
2120 const __m256 minuspi = _mm256_set1_ps( (float) -M_PI);
2121 const __m256 halfpi = _mm256_set1_ps( (float) M_PI/2.0f);
2122 const __m256 minushalfpi = _mm256_set1_ps( (float) -M_PI/2.0f);
2124 __m256 z,z1,z3,z4;
2125 __m256 w;
2126 __m256 maskx_lt,maskx_eq;
2127 __m256 masky_lt,masky_eq;
2128 __m256 mask1,mask2,mask3,mask4,maskall;
2130 maskx_lt = _mm256_cmp_ps(x,_mm256_setzero_ps(),_CMP_LT_OQ);
2131 masky_lt = _mm256_cmp_ps(y,_mm256_setzero_ps(),_CMP_LT_OQ);
2132 maskx_eq = _mm256_cmp_ps(x,_mm256_setzero_ps(),_CMP_EQ_OQ);
2133 masky_eq = _mm256_cmp_ps(y,_mm256_setzero_ps(),_CMP_EQ_OQ);
2135 z = _mm256_mul_ps(y,gmx_mm256_inv_ps(x));
2136 z = gmx_mm256_atan_ps(z);
2138 mask1 = _mm256_and_ps(maskx_eq,masky_lt);
2139 mask2 = _mm256_andnot_ps(maskx_lt,masky_eq);
2140 mask3 = _mm256_andnot_ps( _mm256_or_ps(masky_lt,masky_eq) , maskx_eq);
2141 mask4 = _mm256_and_ps(maskx_lt,masky_eq);
2142 maskall = _mm256_or_ps( _mm256_or_ps(mask1,mask2), _mm256_or_ps(mask3,mask4) );
2144 z = _mm256_andnot_ps(maskall,z);
2145 z1 = _mm256_and_ps(mask1,minushalfpi);
2146 z3 = _mm256_and_ps(mask3,halfpi);
2147 z4 = _mm256_and_ps(mask4,pi);
2149 z = _mm256_or_ps( _mm256_or_ps(z,z1), _mm256_or_ps(z3,z4) );
2151 w = _mm256_blendv_ps(pi,minuspi,masky_lt);
2152 w = _mm256_and_ps(w,maskx_lt);
2154 w = _mm256_andnot_ps(maskall,w);
2156 z = _mm256_add_ps(z,w);
2158 return z;
2161 static __m128
2162 gmx_mm_atan2_ps(__m128 y, __m128 x)
2164 const __m128 pi = _mm_set1_ps(M_PI);
2165 const __m128 minuspi = _mm_set1_ps(-M_PI);
2166 const __m128 halfpi = _mm_set1_ps(M_PI/2.0);
2167 const __m128 minushalfpi = _mm_set1_ps(-M_PI/2.0);
2169 __m128 z,z1,z3,z4;
2170 __m128 w;
2171 __m128 maskx_lt,maskx_eq;
2172 __m128 masky_lt,masky_eq;
2173 __m128 mask1,mask2,mask3,mask4,maskall;
2175 maskx_lt = _mm_cmplt_ps(x,_mm_setzero_ps());
2176 masky_lt = _mm_cmplt_ps(y,_mm_setzero_ps());
2177 maskx_eq = _mm_cmpeq_ps(x,_mm_setzero_ps());
2178 masky_eq = _mm_cmpeq_ps(y,_mm_setzero_ps());
2180 z = _mm_mul_ps(y,gmx_mm_inv_ps(x));
2181 z = gmx_mm_atan_ps(z);
2183 mask1 = _mm_and_ps(maskx_eq,masky_lt);
2184 mask2 = _mm_andnot_ps(maskx_lt,masky_eq);
2185 mask3 = _mm_andnot_ps( _mm_or_ps(masky_lt,masky_eq) , maskx_eq);
2186 mask4 = _mm_and_ps(masky_eq,maskx_lt);
2188 maskall = _mm_or_ps( _mm_or_ps(mask1,mask2), _mm_or_ps(mask3,mask4) );
2190 z = _mm_andnot_ps(maskall,z);
2191 z1 = _mm_and_ps(mask1,minushalfpi);
2192 z3 = _mm_and_ps(mask3,halfpi);
2193 z4 = _mm_and_ps(mask4,pi);
2195 z = _mm_or_ps( _mm_or_ps(z,z1), _mm_or_ps(z3,z4) );
2197 mask1 = _mm_andnot_ps(masky_lt,maskx_lt);
2198 mask2 = _mm_and_ps(maskx_lt,masky_lt);
2200 w = _mm_or_ps( _mm_and_ps(mask1,pi), _mm_and_ps(mask2,minuspi) );
2201 w = _mm_andnot_ps(maskall,w);
2203 z = _mm_add_ps(z,w);
2205 return z;
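/* Scalar sketch of the quadrant handling in the atan2 routines above (helper
 * names are assumptions): the x==0 and y==0 lanes get the same fixed values as
 * the masked special cases, and for x<0 the principal atan(y/x) is shifted by
 * +/- pi according to the sign of y.
 */
static float
gmx_atan2f_sketch(float y, float x)
{
    if (x == 0.0f)
    {
        return (y > 0.0f) ?  0.5f*(float)M_PI :
               (y < 0.0f) ? -0.5f*(float)M_PI : 0.0f;
    }
    if (y == 0.0f)
    {
        return (x < 0.0f) ? (float)M_PI : 0.0f;
    }
    return gmx_atanf_sketch(y/x)
           + ((x < 0.0f) ? ((y < 0.0f) ? -(float)M_PI : (float)M_PI) : 0.0f);
}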
2208 #endif /* _gmx_math_x86_avx_256_single_h_ */