/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 *
 * This file is part of GROMACS.
 * Copyright (c) 2012-
 *
 * Written by the Gromacs development team under coordination of
 * David van der Spoel, Berk Hess, and Erik Lindahl.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org
 *
 * And Hey:
 * Gnomes, ROck Monsters And Chili Sauce
 */
#ifndef _gmx_math_x86_avx_256_single_h_
#define _gmx_math_x86_avx_256_single_h_

#include "gmx_x86_avx_256.h"

#include <math.h>

#ifndef M_PI
#  define M_PI 3.14159265358979323846264338327950288
#endif


/************************
 *                      *
 * Simple math routines *
 *                      *
 ************************/

/* 1.0/sqrt(x), 256-bit wide version */
static gmx_inline __m256
gmx_mm256_invsqrt_ps(__m256 x)
{
    const __m256 half  = _mm256_set1_ps(0.5f);
    const __m256 three = _mm256_set1_ps(3.0f);

    __m256 lu = _mm256_rsqrt_ps(x);

    return _mm256_mul_ps(half,_mm256_mul_ps(_mm256_sub_ps(three,_mm256_mul_ps(_mm256_mul_ps(lu,lu),x)),lu));
}
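
/* Usage sketch (illustrative only; the helper name below is hypothetical and not part
 * of this header): given a vector of squared distances rsq, the rsqrt+Newton-Raphson
 * routine above directly yields 1/r, and 1/r^2 follows from one extra multiply.
 */
static gmx_inline void
gmx_mm256_example_rinv_rinvsq(__m256 rsq, __m256 *rinv, __m256 *rinvsq)
{
    __m256 lu = gmx_mm256_invsqrt_ps(rsq);   /* 1/sqrt(rsq) = 1/r */

    *rinv   = lu;
    *rinvsq = _mm256_mul_ps(lu,lu);          /* (1/r)^2 = 1/r^2   */
}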

/* 1.0/sqrt(x), 128-bit wide version */
static gmx_inline __m128
gmx_mm_invsqrt_ps(__m128 x)
{
    const __m128 half  = _mm_set_ps(0.5,0.5,0.5,0.5);
    const __m128 three = _mm_set_ps(3.0,3.0,3.0,3.0);

    __m128 lu = _mm_rsqrt_ps(x);

    return _mm_mul_ps(half,_mm_mul_ps(_mm_sub_ps(three,_mm_mul_ps(_mm_mul_ps(lu,lu),x)),lu));
}


/* sqrt(x) (256 bit) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
static gmx_inline __m256
gmx_mm256_sqrt_ps(__m256 x)
{
    __m256 mask;
    __m256 res;

    mask = _mm256_cmp_ps(x,_mm256_setzero_ps(),_CMP_EQ_OQ);
    res  = _mm256_andnot_ps(mask,gmx_mm256_invsqrt_ps(x));

    res  = _mm256_mul_ps(x,res);

    return res;
}

/* sqrt(x) (128 bit) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
static gmx_inline __m128
gmx_mm_sqrt_ps(__m128 x)
{
    __m128 mask;
    __m128 res;

    mask = _mm_cmpeq_ps(x,_mm_setzero_ps());
    res  = _mm_andnot_ps(mask,gmx_mm_invsqrt_ps(x));

    res  = _mm_mul_ps(x,res);

    return res;
}


/* 1.0/x, 256-bit wide */
static gmx_inline __m256
gmx_mm256_inv_ps(__m256 x)
{
    const __m256 two = _mm256_set1_ps(2.0f);

    __m256 lu = _mm256_rcp_ps(x);

    return _mm256_mul_ps(lu,_mm256_sub_ps(two,_mm256_mul_ps(lu,x)));
}

/* 1.0/x, 128-bit wide */
static gmx_inline __m128
gmx_mm_inv_ps(__m128 x)
{
    const __m128 two = _mm_set_ps(2.0f,2.0f,2.0f,2.0f);

    __m128 lu = _mm_rcp_ps(x);

    return _mm_mul_ps(lu,_mm_sub_ps(two,_mm_mul_ps(lu,x)));
}


static gmx_inline __m256
gmx_mm256_abs_ps(__m256 x)
{
    const __m256 signmask = _mm256_castsi256_ps( _mm256_set1_epi32(0x7FFFFFFF) );

    return _mm256_and_ps(x,signmask);
}

static gmx_inline __m128
gmx_mm_abs_ps(__m128 x)
{
    const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );

    return _mm_and_ps(x,signmask);
}


static __m256
gmx_mm256_log_ps(__m256 x)
{
    const __m256 expmask     = _mm256_castsi256_ps( _mm256_set1_epi32(0x7F800000) );
    const __m128i expbase_m1 = _mm_set1_epi32(127-1); /* We want non-IEEE format */
    const __m256 half        = _mm256_set1_ps(0.5f);
    const __m256 one         = _mm256_set1_ps(1.0f);
    const __m256 invsq2      = _mm256_set1_ps(1.0f/sqrt(2.0f));
    const __m256 corr1       = _mm256_set1_ps(-2.12194440e-4f);
    const __m256 corr2       = _mm256_set1_ps(0.693359375f);

    const __m256 CA_1        = _mm256_set1_ps(0.070376836292f);
    const __m256 CB_0        = _mm256_set1_ps(1.6714950086782716f);
    const __m256 CB_1        = _mm256_set1_ps(-2.452088066061482f);
    const __m256 CC_0        = _mm256_set1_ps(1.5220770854701728f);
    const __m256 CC_1        = _mm256_set1_ps(-1.3422238433233642f);
    const __m256 CD_0        = _mm256_set1_ps(1.386218787509749f);
    const __m256 CD_1        = _mm256_set1_ps(0.35075468953796346f);
    const __m256 CE_0        = _mm256_set1_ps(1.3429983063133937f);
    const __m256 CE_1        = _mm256_set1_ps(1.807420826584643f);

    __m256  fexp,fexp1;
    __m256i iexp;
    __m128i iexp128a,iexp128b;
    __m256  mask;
    __m256i imask;
    __m128i imask128a,imask128b;
    __m256  x1,x2;
    __m256  y;
    __m256  pA,pB,pC,pD,pE,tB,tC,tD,tE;

    /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
    fexp = _mm256_and_ps(x,expmask);
    iexp = _mm256_castps_si256(fexp);

    iexp128b = _mm256_extractf128_si256(iexp,0x1);
    iexp128a = _mm256_castsi256_si128(iexp);

    iexp128a = _mm_srli_epi32(iexp128a,23);
    iexp128b = _mm_srli_epi32(iexp128b,23);
    iexp128a = _mm_sub_epi32(iexp128a,expbase_m1);
    iexp128b = _mm_sub_epi32(iexp128b,expbase_m1);

    x = _mm256_andnot_ps(expmask,x);
    x = _mm256_or_ps(x,one);
    x = _mm256_mul_ps(x,half);

    mask = _mm256_cmp_ps(x,invsq2,_CMP_LT_OQ);

    x = _mm256_add_ps(x,_mm256_and_ps(mask,x));
    x = _mm256_sub_ps(x,one);

    imask = _mm256_castps_si256(mask);

    imask128b = _mm256_extractf128_si256(imask,0x1);
    imask128a = _mm256_castsi256_si128(imask);

    iexp128a = _mm_add_epi32(iexp128a,imask128a);
    iexp128b = _mm_add_epi32(iexp128b,imask128b);

    iexp = _mm256_castsi128_si256(iexp128a);
    iexp = _mm256_insertf128_si256(iexp,iexp128b,0x1);

    x2 = _mm256_mul_ps(x,x);

    pA = _mm256_mul_ps(CA_1,x);
    pB = _mm256_mul_ps(CB_1,x);
    pC = _mm256_mul_ps(CC_1,x);
    pD = _mm256_mul_ps(CD_1,x);
    pE = _mm256_mul_ps(CE_1,x);
    tB = _mm256_add_ps(CB_0,x2);
    tC = _mm256_add_ps(CC_0,x2);
    tD = _mm256_add_ps(CD_0,x2);
    tE = _mm256_add_ps(CE_0,x2);
    pB = _mm256_add_ps(pB,tB);
    pC = _mm256_add_ps(pC,tC);
    pD = _mm256_add_ps(pD,tD);
    pE = _mm256_add_ps(pE,tE);

    pA = _mm256_mul_ps(pA,pB);
    pC = _mm256_mul_ps(pC,pD);
    pE = _mm256_mul_ps(pE,x2);
    pA = _mm256_mul_ps(pA,pC);
    y  = _mm256_mul_ps(pA,pE);

    fexp = _mm256_cvtepi32_ps(iexp);
    y    = _mm256_add_ps(y,_mm256_mul_ps(fexp,corr1));

    y  = _mm256_sub_ps(y, _mm256_mul_ps(half,x2));
    x2 = _mm256_add_ps(x,y);

    x2 = _mm256_add_ps(x2,_mm256_mul_ps(fexp,corr2));

    return x2;
}
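
/* Usage sketch (illustrative only; the helper name below is hypothetical and not part
 * of this header): gmx_mm256_log_ps() returns the natural logarithm, so logarithms to
 * other bases only need one extra multiply, e.g. log2(x) = ln(x)/ln(2).
 */
static gmx_inline __m256
gmx_mm256_example_log2_ps(__m256 x)
{
    const __m256 invln2 = _mm256_set1_ps(1.44269504088896341f);  /* 1/ln(2) */

    return _mm256_mul_ps(gmx_mm256_log_ps(x),invln2);
}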

static __m128
gmx_mm_log_ps(__m128 x)
{
    /* Same algorithm as cephes library */
    const __m128 expmask     = gmx_mm_castsi128_ps( _mm_set_epi32(0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000) );
    const __m128i expbase_m1 = _mm_set1_epi32(127-1); /* We want non-IEEE format */
    const __m128 half        = _mm_set1_ps(0.5f);
    const __m128 one         = _mm_set1_ps(1.0f);
    const __m128 invsq2      = _mm_set1_ps(1.0f/sqrt(2.0f));
    const __m128 corr1       = _mm_set1_ps(-2.12194440e-4f);
    const __m128 corr2       = _mm_set1_ps(0.693359375f);

    const __m128 CA_1        = _mm_set1_ps(0.070376836292f);
    const __m128 CB_0        = _mm_set1_ps(1.6714950086782716f);
    const __m128 CB_1        = _mm_set1_ps(-2.452088066061482f);
    const __m128 CC_0        = _mm_set1_ps(1.5220770854701728f);
    const __m128 CC_1        = _mm_set1_ps(-1.3422238433233642f);
    const __m128 CD_0        = _mm_set1_ps(1.386218787509749f);
    const __m128 CD_1        = _mm_set1_ps(0.35075468953796346f);
    const __m128 CE_0        = _mm_set1_ps(1.3429983063133937f);
    const __m128 CE_1        = _mm_set1_ps(1.807420826584643f);

    __m128  fexp,fexp1;
    __m128i iexp;
    __m128  mask;
    __m128  x1,x2;
    __m128  y;
    __m128  pA,pB,pC,pD,pE,tB,tC,tD,tE;

    /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
    fexp = _mm_and_ps(x,expmask);
    iexp = gmx_mm_castps_si128(fexp);
    iexp = _mm_srli_epi32(iexp,23);
    iexp = _mm_sub_epi32(iexp,expbase_m1);

    x = _mm_andnot_ps(expmask,x);
    x = _mm_or_ps(x,one);
    x = _mm_mul_ps(x,half);

    mask = _mm_cmplt_ps(x,invsq2);

    x    = _mm_add_ps(x,_mm_and_ps(mask,x));
    x    = _mm_sub_ps(x,one);
    iexp = _mm_add_epi32(iexp,gmx_mm_castps_si128(mask)); /* 0xFFFFFFFF = -1 as int */

    x2 = _mm_mul_ps(x,x);

    pA = _mm_mul_ps(CA_1,x);
    pB = _mm_mul_ps(CB_1,x);
    pC = _mm_mul_ps(CC_1,x);
    pD = _mm_mul_ps(CD_1,x);
    pE = _mm_mul_ps(CE_1,x);
    tB = _mm_add_ps(CB_0,x2);
    tC = _mm_add_ps(CC_0,x2);
    tD = _mm_add_ps(CD_0,x2);
    tE = _mm_add_ps(CE_0,x2);
    pB = _mm_add_ps(pB,tB);
    pC = _mm_add_ps(pC,tC);
    pD = _mm_add_ps(pD,tD);
    pE = _mm_add_ps(pE,tE);

    pA = _mm_mul_ps(pA,pB);
    pC = _mm_mul_ps(pC,pD);
    pE = _mm_mul_ps(pE,x2);
    pA = _mm_mul_ps(pA,pC);
    y  = _mm_mul_ps(pA,pE);

    fexp = _mm_cvtepi32_ps(iexp);
    y    = _mm_add_ps(y,_mm_mul_ps(fexp,corr1));

    y  = _mm_sub_ps(y, _mm_mul_ps(half,x2));
    x2 = _mm_add_ps(x,y);

    x2 = _mm_add_ps(x2,_mm_mul_ps(fexp,corr2));

    return x2;
}

/* 2^x function, 256-bit wide
 *
 * The 2^w term is calculated from a (6,0)-th order (no denominator) minimax polynomial on the interval
 * [-0.5,0.5]. The coefficients were derived in Mathematica using the command:
 *
 * MiniMaxApproximation[(2^x), {x, {-0.5, 0.5}, 6, 0}, WorkingPrecision -> 15]
 *
 * The largest-magnitude exponent we can represent in IEEE single-precision binary format
 * is 2^-126 for small numbers and 2^127 for large ones. To avoid wrap-around problems, we set the
 * result to zero if the argument falls outside this range. For small numbers this is just fine, but
 * for large numbers you could be fancy and return the smallest/largest IEEE single-precision
 * number instead. That would take a few extra cycles and not really help, since something is
 * wrong if you are using single precision to work with numbers that cannot really be represented
 * in single precision.
 *
 * The accuracy is at least 23 bits.
 */
static __m256
gmx_mm256_exp2_ps(__m256 x)
{
    /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
    const __m256 arglimit = _mm256_set1_ps(126.0f);

    const __m128i expbase = _mm_set1_epi32(127);
    const __m256 CC6      = _mm256_set1_ps(1.535336188319500E-004);
    const __m256 CC5      = _mm256_set1_ps(1.339887440266574E-003);
    const __m256 CC4      = _mm256_set1_ps(9.618437357674640E-003);
    const __m256 CC3      = _mm256_set1_ps(5.550332471162809E-002);
    const __m256 CC2      = _mm256_set1_ps(2.402264791363012E-001);
    const __m256 CC1      = _mm256_set1_ps(6.931472028550421E-001);
    const __m256 CC0      = _mm256_set1_ps(1.0f);

    __m256  p0,p1;
    __m256  valuemask;
    __m256i iexppart;
    __m128i iexppart128a,iexppart128b;
    __m256  fexppart;
    __m256  intpart;
    __m256  x2;

    iexppart = _mm256_cvtps_epi32(x);
    intpart  = _mm256_round_ps(x,_MM_FROUND_TO_NEAREST_INT);

    iexppart128b = _mm256_extractf128_si256(iexppart,0x1);
    iexppart128a = _mm256_castsi256_si128(iexppart);

    iexppart128a = _mm_slli_epi32(_mm_add_epi32(iexppart128a,expbase),23);
    iexppart128b = _mm_slli_epi32(_mm_add_epi32(iexppart128b,expbase),23);

    iexppart  = _mm256_castsi128_si256(iexppart128a);
    iexppart  = _mm256_insertf128_si256(iexppart,iexppart128b,0x1);
    valuemask = _mm256_cmp_ps(arglimit,gmx_mm256_abs_ps(x),_CMP_GE_OQ);
    fexppart  = _mm256_and_ps(valuemask,_mm256_castsi256_ps(iexppart));

    x  = _mm256_sub_ps(x,intpart);
    x2 = _mm256_mul_ps(x,x);

    p0 = _mm256_mul_ps(CC6,x2);
    p1 = _mm256_mul_ps(CC5,x2);
    p0 = _mm256_add_ps(p0,CC4);
    p1 = _mm256_add_ps(p1,CC3);
    p0 = _mm256_mul_ps(p0,x2);
    p1 = _mm256_mul_ps(p1,x2);
    p0 = _mm256_add_ps(p0,CC2);
    p1 = _mm256_add_ps(p1,CC1);
    p0 = _mm256_mul_ps(p0,x2);
    p1 = _mm256_mul_ps(p1,x);
    p0 = _mm256_add_ps(p0,CC0);
    p0 = _mm256_add_ps(p0,p1);
    x  = _mm256_mul_ps(p0,fexppart);

    return x;
}
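
/* Usage sketch (illustrative only; the helper name is hypothetical and not part of
 * this header): a table-free pow() for strictly positive bases can be composed from
 * the routines above, since a^b = 2^(b*log2(a)) = 2^(b*ln(a)/ln(2)).
 */
static gmx_inline __m256
gmx_mm256_example_pow_ps(__m256 a, __m256 b)
{
    const __m256 invln2 = _mm256_set1_ps(1.44269504088896341f);  /* 1/ln(2) */
    __m256 y;

    y = _mm256_mul_ps(gmx_mm256_log_ps(a),invln2);   /* log2(a)       */
    y = _mm256_mul_ps(y,b);                          /* b*log2(a)     */

    return gmx_mm256_exp2_ps(y);                     /* 2^(b*log2(a)) */
}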

/* 2^x, 128 bit wide */
static __m128
gmx_mm_exp2_ps(__m128 x)
{
    /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
    const __m128 arglimit = _mm_set1_ps(126.0f);

    const __m128i expbase = _mm_set1_epi32(127);
    const __m128 CA6      = _mm_set1_ps(1.535336188319500E-004);
    const __m128 CA5      = _mm_set1_ps(1.339887440266574E-003);
    const __m128 CA4      = _mm_set1_ps(9.618437357674640E-003);
    const __m128 CA3      = _mm_set1_ps(5.550332471162809E-002);
    const __m128 CA2      = _mm_set1_ps(2.402264791363012E-001);
    const __m128 CA1      = _mm_set1_ps(6.931472028550421E-001);
    const __m128 CA0      = _mm_set1_ps(1.0f);

    __m128  valuemask;
    __m128i iexppart;
    __m128  fexppart;
    __m128  intpart;
    __m128  x2;
    __m128  p0,p1;

    iexppart  = _mm_cvtps_epi32(x);
    intpart   = _mm_round_ps(x,_MM_FROUND_TO_NEAREST_INT);
    iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart,expbase),23);
    valuemask = _mm_cmpge_ps(arglimit,gmx_mm_abs_ps(x));
    fexppart  = _mm_and_ps(valuemask,gmx_mm_castsi128_ps(iexppart));

    x  = _mm_sub_ps(x,intpart);
    x2 = _mm_mul_ps(x,x);

    p0 = _mm_mul_ps(CA6,x2);
    p1 = _mm_mul_ps(CA5,x2);
    p0 = _mm_add_ps(p0,CA4);
    p1 = _mm_add_ps(p1,CA3);
    p0 = _mm_mul_ps(p0,x2);
    p1 = _mm_mul_ps(p1,x2);
    p0 = _mm_add_ps(p0,CA2);
    p1 = _mm_add_ps(p1,CA1);
    p0 = _mm_mul_ps(p0,x2);
    p1 = _mm_mul_ps(p1,x);
    p0 = _mm_add_ps(p0,CA0);
    p0 = _mm_add_ps(p0,p1);
    x  = _mm_mul_ps(p0,fexppart);

    return x;
}

/* Exponential function, 256 bit wide. This could be calculated from 2^x as Exp(x)=2^(y),
 * where y=log2(e)*x, but there will then be a small rounding error since we lose some
 * precision due to the multiplication. This will then be magnified a lot by the exponential.
 *
 * Instead, we calculate the fractional part directly as a minimax approximation of
 * Exp(z) on [-0.5,0.5]. We use extended precision arithmetics to calculate the fraction
 * remaining after 2^y, which avoids the precision-loss.
 * The final result is correct to within 1 LSB over the entire argument range.
 */
static __m256
gmx_mm256_exp_ps(__m256 exparg)
{
    const __m256 argscale = _mm256_set1_ps(1.44269504088896341f);
    /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
    const __m256 arglimit = _mm256_set1_ps(126.0f);
    const __m128i expbase = _mm_set1_epi32(127);

    const __m256 invargscale0 = _mm256_set1_ps(0.693359375f);
    const __m256 invargscale1 = _mm256_set1_ps(-2.12194440e-4f);

    const __m256 CE5 = _mm256_set1_ps(1.9875691500e-4f);
    const __m256 CE4 = _mm256_set1_ps(1.3981999507e-3f);
    const __m256 CE3 = _mm256_set1_ps(8.3334519073e-3f);
    const __m256 CE2 = _mm256_set1_ps(4.1665795894e-2f);
    const __m256 CE1 = _mm256_set1_ps(1.6666665459e-1f);
    const __m256 CE0 = _mm256_set1_ps(5.0000001201e-1f);
    const __m256 one = _mm256_set1_ps(1.0f);

    __m256  exparg2,exp2arg;
    __m256  pE0,pE1;
    __m256  valuemask;
    __m256i iexppart;
    __m128i iexppart128a,iexppart128b;
    __m256  fexppart;
    __m256  intpart;

    exp2arg = _mm256_mul_ps(exparg,argscale);

    iexppart = _mm256_cvtps_epi32(exp2arg);
    intpart  = _mm256_round_ps(exp2arg,_MM_FROUND_TO_NEAREST_INT);

    iexppart128b = _mm256_extractf128_si256(iexppart,0x1);
    iexppart128a = _mm256_castsi256_si128(iexppart);

    iexppart128a = _mm_slli_epi32(_mm_add_epi32(iexppart128a,expbase),23);
    iexppart128b = _mm_slli_epi32(_mm_add_epi32(iexppart128b,expbase),23);

    iexppart  = _mm256_castsi128_si256(iexppart128a);
    iexppart  = _mm256_insertf128_si256(iexppart,iexppart128b,0x1);
    valuemask = _mm256_cmp_ps(arglimit,gmx_mm256_abs_ps(exp2arg),_CMP_GE_OQ);
    fexppart  = _mm256_and_ps(valuemask,_mm256_castsi256_ps(iexppart));

    /* Extended precision arithmetics */
    exparg = _mm256_sub_ps(exparg,_mm256_mul_ps(invargscale0,intpart));
    exparg = _mm256_sub_ps(exparg,_mm256_mul_ps(invargscale1,intpart));

    exparg2 = _mm256_mul_ps(exparg,exparg);

    pE1 = _mm256_mul_ps(CE5,exparg2);
    pE0 = _mm256_mul_ps(CE4,exparg2);
    pE1 = _mm256_add_ps(pE1,CE3);
    pE0 = _mm256_add_ps(pE0,CE2);
    pE1 = _mm256_mul_ps(pE1,exparg2);
    pE0 = _mm256_mul_ps(pE0,exparg2);
    pE1 = _mm256_add_ps(pE1,CE1);
    pE0 = _mm256_add_ps(pE0,CE0);
    pE1 = _mm256_mul_ps(pE1,exparg);
    pE0 = _mm256_add_ps(pE0,pE1);
    pE0 = _mm256_mul_ps(pE0,exparg2);
    exparg = _mm256_add_ps(exparg,one);
    exparg = _mm256_add_ps(exparg,pE0);

    exparg = _mm256_mul_ps(exparg,fexppart);

    return exparg;
}
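
/* Usage sketch (illustrative only; the helper name is hypothetical and not part of
 * this header): a vectorized Gaussian weight exp(-a*r^2) for eight squared distances
 * at once. The xor with the sign bit negates the argument, similar to the sign-bit
 * trick used in the erf()/erfc() routines further down.
 */
static gmx_inline __m256
gmx_mm256_example_gaussian_ps(__m256 r2, float a)
{
    const __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
    __m256 arg;

    arg = _mm256_mul_ps(_mm256_set1_ps(a),r2);   /* a*r^2       */
    arg = _mm256_xor_ps(arg,signbit);            /* -(a*r^2)    */

    return gmx_mm256_exp_ps(arg);                /* exp(-a*r^2) */
}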

/* exp(), 128 bit wide. */
static __m128
gmx_mm_exp_ps(__m128 x)
{
    const __m128 argscale = _mm_set1_ps(1.44269504088896341f);
    /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
    const __m128 arglimit = _mm_set1_ps(126.0f);
    const __m128i expbase = _mm_set1_epi32(127);

    const __m128 invargscale0 = _mm_set1_ps(0.693359375f);
    const __m128 invargscale1 = _mm_set1_ps(-2.12194440e-4f);

    const __m128 CC5 = _mm_set1_ps(1.9875691500e-4f);
    const __m128 CC4 = _mm_set1_ps(1.3981999507e-3f);
    const __m128 CC3 = _mm_set1_ps(8.3334519073e-3f);
    const __m128 CC2 = _mm_set1_ps(4.1665795894e-2f);
    const __m128 CC1 = _mm_set1_ps(1.6666665459e-1f);
    const __m128 CC0 = _mm_set1_ps(5.0000001201e-1f);
    const __m128 one = _mm_set1_ps(1.0f);

    __m128  y,x2;
    __m128  p0,p1;
    __m128  valuemask;
    __m128i iexppart;
    __m128  fexppart;
    __m128  intpart;

    y = _mm_mul_ps(x,argscale);

    iexppart = _mm_cvtps_epi32(y);
    intpart  = _mm_round_ps(y,_MM_FROUND_TO_NEAREST_INT);

    iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart,expbase),23);
    valuemask = _mm_cmpge_ps(arglimit,gmx_mm_abs_ps(y));
    fexppart  = _mm_and_ps(valuemask,gmx_mm_castsi128_ps(iexppart));

    /* Extended precision arithmetics */
    x = _mm_sub_ps(x,_mm_mul_ps(invargscale0,intpart));
    x = _mm_sub_ps(x,_mm_mul_ps(invargscale1,intpart));

    x2 = _mm_mul_ps(x,x);

    p1 = _mm_mul_ps(CC5,x2);
    p0 = _mm_mul_ps(CC4,x2);
    p1 = _mm_add_ps(p1,CC3);
    p0 = _mm_add_ps(p0,CC2);
    p1 = _mm_mul_ps(p1,x2);
    p0 = _mm_mul_ps(p0,x2);
    p1 = _mm_add_ps(p1,CC1);
    p0 = _mm_add_ps(p0,CC0);
    p1 = _mm_mul_ps(p1,x);
    p0 = _mm_add_ps(p0,p1);
    p0 = _mm_mul_ps(p0,x2);
    x  = _mm_add_ps(x,one);
    x  = _mm_add_ps(x,p0);

    x = _mm_mul_ps(x,fexppart);

    return x;
}

/* FULL precision erf(), 256-bit wide. Only errors in LSB */
static __m256
gmx_mm256_erf_ps(__m256 x)
{
    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
    const __m256 CA6 = _mm256_set1_ps(7.853861353153693e-5f);
    const __m256 CA5 = _mm256_set1_ps(-8.010193625184903e-4f);
    const __m256 CA4 = _mm256_set1_ps(5.188327685732524e-3f);
    const __m256 CA3 = _mm256_set1_ps(-2.685381193529856e-2f);
    const __m256 CA2 = _mm256_set1_ps(1.128358514861418e-1f);
    const __m256 CA1 = _mm256_set1_ps(-3.761262582423300e-1f);
    const __m256 CA0 = _mm256_set1_ps(1.128379165726710f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
    const __m256 CB9 = _mm256_set1_ps(-0.0018629930017603923f);
    const __m256 CB8 = _mm256_set1_ps(0.003909821287598495f);
    const __m256 CB7 = _mm256_set1_ps(-0.0052094582210355615f);
    const __m256 CB6 = _mm256_set1_ps(0.005685614362160572f);
    const __m256 CB5 = _mm256_set1_ps(-0.0025367682853477272f);
    const __m256 CB4 = _mm256_set1_ps(-0.010199799682318782f);
    const __m256 CB3 = _mm256_set1_ps(0.04369575504816542f);
    const __m256 CB2 = _mm256_set1_ps(-0.11884063474674492f);
    const __m256 CB1 = _mm256_set1_ps(0.2732120154030589f);
    const __m256 CB0 = _mm256_set1_ps(0.42758357702025784f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
    const __m256 CC10 = _mm256_set1_ps(-0.0445555913112064f);
    const __m256 CC9  = _mm256_set1_ps(0.21376355144663348f);
    const __m256 CC8  = _mm256_set1_ps(-0.3473187200259257f);
    const __m256 CC7  = _mm256_set1_ps(0.016690861551248114f);
    const __m256 CC6  = _mm256_set1_ps(0.7560973182491192f);
    const __m256 CC5  = _mm256_set1_ps(-1.2137903600145787f);
    const __m256 CC4  = _mm256_set1_ps(0.8411872321232948f);
    const __m256 CC3  = _mm256_set1_ps(-0.08670413896296343f);
    const __m256 CC2  = _mm256_set1_ps(-0.27124782687240334f);
    const __m256 CC1  = _mm256_set1_ps(-0.0007502488047806069f);
    const __m256 CC0  = _mm256_set1_ps(0.5642114853803148f);

    /* Coefficients for expansion of exp(x) in [0,0.1] */
    /* CD0 and CD1 are both 1.0, so no need to declare them separately */
    const __m256 CD2 = _mm256_set1_ps(0.5000066608081202f);
    const __m256 CD3 = _mm256_set1_ps(0.1664795422874624f);
    const __m256 CD4 = _mm256_set1_ps(0.04379839977652482f);

    const __m256 sieve   = _mm256_castsi256_ps( _mm256_set1_epi32(0xfffff000) );
    const __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
    const __m256 one     = _mm256_set1_ps(1.0f);
    const __m256 two     = _mm256_set1_ps(2.0f);

    __m256 x2,x4,y;
    __m256 z,q,t,t2,w,w2;
    __m256 pA0,pA1,pB0,pB1,pC0,pC1;
    __m256 expmx2,corr;
    __m256 res_erf,res_erfc,res;
    __m256 mask;

    /* Calculate erf() */
    x2 = _mm256_mul_ps(x,x);
    x4 = _mm256_mul_ps(x2,x2);

    pA0 = _mm256_mul_ps(CA6,x4);
    pA1 = _mm256_mul_ps(CA5,x4);
    pA0 = _mm256_add_ps(pA0,CA4);
    pA1 = _mm256_add_ps(pA1,CA3);
    pA0 = _mm256_mul_ps(pA0,x4);
    pA1 = _mm256_mul_ps(pA1,x4);
    pA0 = _mm256_add_ps(pA0,CA2);
    pA1 = _mm256_add_ps(pA1,CA1);
    pA0 = _mm256_mul_ps(pA0,x4);
    pA1 = _mm256_mul_ps(pA1,x2);
    pA0 = _mm256_add_ps(pA0,pA1);
    pA0 = _mm256_add_ps(pA0,CA0);

    res_erf = _mm256_mul_ps(x,pA0);

    /* Calculate erfc */

    y  = gmx_mm256_abs_ps(x);
    t  = gmx_mm256_inv_ps(y);
    w  = _mm256_sub_ps(t,one);
    t2 = _mm256_mul_ps(t,t);
    w2 = _mm256_mul_ps(w,w);
    /*
     * We cannot simply calculate exp(-x2) directly in single precision, since
     * that will lose a couple of bits of precision due to the multiplication.
     * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
     * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
     *
     * The only drawback with this is that it requires TWO separate exponential
     * evaluations, which would be horrible performance-wise. However, the argument
     * for the second exp() call is always small, so there we simply use a
     * low-order minimax expansion on [0,0.1].
     */

    z = _mm256_and_ps(y,sieve);
    q = _mm256_mul_ps( _mm256_sub_ps(z,y) , _mm256_add_ps(z,y) );

    corr = _mm256_mul_ps(CD4,q);
    corr = _mm256_add_ps(corr,CD3);
    corr = _mm256_mul_ps(corr,q);
    corr = _mm256_add_ps(corr,CD2);
    corr = _mm256_mul_ps(corr,q);
    corr = _mm256_add_ps(corr,one);
    corr = _mm256_mul_ps(corr,q);
    corr = _mm256_add_ps(corr,one);

    expmx2 = gmx_mm256_exp_ps( _mm256_or_ps( signbit , _mm256_mul_ps(z,z) ) );
    expmx2 = _mm256_mul_ps(expmx2,corr);

    pB1 = _mm256_mul_ps(CB9,w2);
    pB0 = _mm256_mul_ps(CB8,w2);
    pB1 = _mm256_add_ps(pB1,CB7);
    pB0 = _mm256_add_ps(pB0,CB6);
    pB1 = _mm256_mul_ps(pB1,w2);
    pB0 = _mm256_mul_ps(pB0,w2);
    pB1 = _mm256_add_ps(pB1,CB5);
    pB0 = _mm256_add_ps(pB0,CB4);
    pB1 = _mm256_mul_ps(pB1,w2);
    pB0 = _mm256_mul_ps(pB0,w2);
    pB1 = _mm256_add_ps(pB1,CB3);
    pB0 = _mm256_add_ps(pB0,CB2);
    pB1 = _mm256_mul_ps(pB1,w2);
    pB0 = _mm256_mul_ps(pB0,w2);
    pB1 = _mm256_add_ps(pB1,CB1);
    pB1 = _mm256_mul_ps(pB1,w);
    pB0 = _mm256_add_ps(pB0,pB1);
    pB0 = _mm256_add_ps(pB0,CB0);

    pC0 = _mm256_mul_ps(CC10,t2);
    pC1 = _mm256_mul_ps(CC9,t2);
    pC0 = _mm256_add_ps(pC0,CC8);
    pC1 = _mm256_add_ps(pC1,CC7);
    pC0 = _mm256_mul_ps(pC0,t2);
    pC1 = _mm256_mul_ps(pC1,t2);
    pC0 = _mm256_add_ps(pC0,CC6);
    pC1 = _mm256_add_ps(pC1,CC5);
    pC0 = _mm256_mul_ps(pC0,t2);
    pC1 = _mm256_mul_ps(pC1,t2);
    pC0 = _mm256_add_ps(pC0,CC4);
    pC1 = _mm256_add_ps(pC1,CC3);
    pC0 = _mm256_mul_ps(pC0,t2);
    pC1 = _mm256_mul_ps(pC1,t2);
    pC0 = _mm256_add_ps(pC0,CC2);
    pC1 = _mm256_add_ps(pC1,CC1);
    pC0 = _mm256_mul_ps(pC0,t2);
    pC1 = _mm256_mul_ps(pC1,t);
    pC0 = _mm256_add_ps(pC0,pC1);
    pC0 = _mm256_add_ps(pC0,CC0);
    pC0 = _mm256_mul_ps(pC0,t);

    /* SELECT pB0 or pC0 for erfc() */
    mask     = _mm256_cmp_ps(two,y,_CMP_LT_OQ);
    res_erfc = _mm256_blendv_ps(pB0,pC0,mask);
    res_erfc = _mm256_mul_ps(res_erfc,expmx2);

    /* erfc(x<0) = 2-erfc(|x|) */
    mask     = _mm256_cmp_ps(x,_mm256_setzero_ps(),_CMP_LT_OQ);
    res_erfc = _mm256_blendv_ps(res_erfc,_mm256_sub_ps(two,res_erfc),mask);

    /* Select erf() or erfc() */
    mask = _mm256_cmp_ps(y,_mm256_set1_ps(0.75f),_CMP_LT_OQ);
    res  = _mm256_blendv_ps(_mm256_sub_ps(one,res_erfc),res_erf,mask);

    return res;
}

/* erf(), 128 bit wide */
static __m128
gmx_mm_erf_ps(__m128 x)
{
    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
    const __m128 CA6 = _mm_set1_ps(7.853861353153693e-5f);
    const __m128 CA5 = _mm_set1_ps(-8.010193625184903e-4f);
    const __m128 CA4 = _mm_set1_ps(5.188327685732524e-3f);
    const __m128 CA3 = _mm_set1_ps(-2.685381193529856e-2f);
    const __m128 CA2 = _mm_set1_ps(1.128358514861418e-1f);
    const __m128 CA1 = _mm_set1_ps(-3.761262582423300e-1f);
    const __m128 CA0 = _mm_set1_ps(1.128379165726710f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
    const __m128 CB9 = _mm_set1_ps(-0.0018629930017603923f);
    const __m128 CB8 = _mm_set1_ps(0.003909821287598495f);
    const __m128 CB7 = _mm_set1_ps(-0.0052094582210355615f);
    const __m128 CB6 = _mm_set1_ps(0.005685614362160572f);
    const __m128 CB5 = _mm_set1_ps(-0.0025367682853477272f);
    const __m128 CB4 = _mm_set1_ps(-0.010199799682318782f);
    const __m128 CB3 = _mm_set1_ps(0.04369575504816542f);
    const __m128 CB2 = _mm_set1_ps(-0.11884063474674492f);
    const __m128 CB1 = _mm_set1_ps(0.2732120154030589f);
    const __m128 CB0 = _mm_set1_ps(0.42758357702025784f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
    const __m128 CC10 = _mm_set1_ps(-0.0445555913112064f);
    const __m128 CC9  = _mm_set1_ps(0.21376355144663348f);
    const __m128 CC8  = _mm_set1_ps(-0.3473187200259257f);
    const __m128 CC7  = _mm_set1_ps(0.016690861551248114f);
    const __m128 CC6  = _mm_set1_ps(0.7560973182491192f);
    const __m128 CC5  = _mm_set1_ps(-1.2137903600145787f);
    const __m128 CC4  = _mm_set1_ps(0.8411872321232948f);
    const __m128 CC3  = _mm_set1_ps(-0.08670413896296343f);
    const __m128 CC2  = _mm_set1_ps(-0.27124782687240334f);
    const __m128 CC1  = _mm_set1_ps(-0.0007502488047806069f);
    const __m128 CC0  = _mm_set1_ps(0.5642114853803148f);

    /* Coefficients for expansion of exp(x) in [0,0.1] */
    /* CD0 and CD1 are both 1.0, so no need to declare them separately */
    const __m128 CD2 = _mm_set1_ps(0.5000066608081202f);
    const __m128 CD3 = _mm_set1_ps(0.1664795422874624f);
    const __m128 CD4 = _mm_set1_ps(0.04379839977652482f);

    const __m128 sieve   = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
    const __m128 signbit = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
    const __m128 one     = _mm_set1_ps(1.0f);
    const __m128 two     = _mm_set1_ps(2.0f);

    __m128 x2,x4,y;
    __m128 z,q,t,t2,w,w2;
    __m128 pA0,pA1,pB0,pB1,pC0,pC1;
    __m128 expmx2,corr;
    __m128 res_erf,res_erfc,res;
    __m128 mask;

    /* Calculate erf() */
    x2 = _mm_mul_ps(x,x);
    x4 = _mm_mul_ps(x2,x2);

    pA0 = _mm_mul_ps(CA6,x4);
    pA1 = _mm_mul_ps(CA5,x4);
    pA0 = _mm_add_ps(pA0,CA4);
    pA1 = _mm_add_ps(pA1,CA3);
    pA0 = _mm_mul_ps(pA0,x4);
    pA1 = _mm_mul_ps(pA1,x4);
    pA0 = _mm_add_ps(pA0,CA2);
    pA1 = _mm_add_ps(pA1,CA1);
    pA0 = _mm_mul_ps(pA0,x4);
    pA1 = _mm_mul_ps(pA1,x2);
    pA0 = _mm_add_ps(pA0,pA1);
    pA0 = _mm_add_ps(pA0,CA0);

    res_erf = _mm_mul_ps(x,pA0);

    /* Calculate erfc */

    y  = gmx_mm_abs_ps(x);
    t  = gmx_mm_inv_ps(y);
    w  = _mm_sub_ps(t,one);
    t2 = _mm_mul_ps(t,t);
    w2 = _mm_mul_ps(w,w);
    /*
     * We cannot simply calculate exp(-x2) directly in single precision, since
     * that will lose a couple of bits of precision due to the multiplication.
     * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
     * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
     *
     * The only drawback with this is that it requires TWO separate exponential
     * evaluations, which would be horrible performance-wise. However, the argument
     * for the second exp() call is always small, so there we simply use a
     * low-order minimax expansion on [0,0.1].
     */

    z = _mm_and_ps(y,sieve);
    q = _mm_mul_ps( _mm_sub_ps(z,y) , _mm_add_ps(z,y) );

    corr = _mm_mul_ps(CD4,q);
    corr = _mm_add_ps(corr,CD3);
    corr = _mm_mul_ps(corr,q);
    corr = _mm_add_ps(corr,CD2);
    corr = _mm_mul_ps(corr,q);
    corr = _mm_add_ps(corr,one);
    corr = _mm_mul_ps(corr,q);
    corr = _mm_add_ps(corr,one);

    expmx2 = gmx_mm_exp_ps( _mm_or_ps( signbit , _mm_mul_ps(z,z) ) );
    expmx2 = _mm_mul_ps(expmx2,corr);

    pB1 = _mm_mul_ps(CB9,w2);
    pB0 = _mm_mul_ps(CB8,w2);
    pB1 = _mm_add_ps(pB1,CB7);
    pB0 = _mm_add_ps(pB0,CB6);
    pB1 = _mm_mul_ps(pB1,w2);
    pB0 = _mm_mul_ps(pB0,w2);
    pB1 = _mm_add_ps(pB1,CB5);
    pB0 = _mm_add_ps(pB0,CB4);
    pB1 = _mm_mul_ps(pB1,w2);
    pB0 = _mm_mul_ps(pB0,w2);
    pB1 = _mm_add_ps(pB1,CB3);
    pB0 = _mm_add_ps(pB0,CB2);
    pB1 = _mm_mul_ps(pB1,w2);
    pB0 = _mm_mul_ps(pB0,w2);
    pB1 = _mm_add_ps(pB1,CB1);
    pB1 = _mm_mul_ps(pB1,w);
    pB0 = _mm_add_ps(pB0,pB1);
    pB0 = _mm_add_ps(pB0,CB0);

    pC0 = _mm_mul_ps(CC10,t2);
    pC1 = _mm_mul_ps(CC9,t2);
    pC0 = _mm_add_ps(pC0,CC8);
    pC1 = _mm_add_ps(pC1,CC7);
    pC0 = _mm_mul_ps(pC0,t2);
    pC1 = _mm_mul_ps(pC1,t2);
    pC0 = _mm_add_ps(pC0,CC6);
    pC1 = _mm_add_ps(pC1,CC5);
    pC0 = _mm_mul_ps(pC0,t2);
    pC1 = _mm_mul_ps(pC1,t2);
    pC0 = _mm_add_ps(pC0,CC4);
    pC1 = _mm_add_ps(pC1,CC3);
    pC0 = _mm_mul_ps(pC0,t2);
    pC1 = _mm_mul_ps(pC1,t2);
    pC0 = _mm_add_ps(pC0,CC2);
    pC1 = _mm_add_ps(pC1,CC1);
    pC0 = _mm_mul_ps(pC0,t2);
    pC1 = _mm_mul_ps(pC1,t);
    pC0 = _mm_add_ps(pC0,pC1);
    pC0 = _mm_add_ps(pC0,CC0);
    pC0 = _mm_mul_ps(pC0,t);

    /* SELECT pB0 or pC0 for erfc() */
    mask     = _mm_cmplt_ps(two,y);
    res_erfc = _mm_blendv_ps(pB0,pC0,mask);
    res_erfc = _mm_mul_ps(res_erfc,expmx2);

    /* erfc(x<0) = 2-erfc(|x|) */
    mask     = _mm_cmplt_ps(x,_mm_setzero_ps());
    res_erfc = _mm_blendv_ps(res_erfc,_mm_sub_ps(two,res_erfc),mask);

    /* Select erf() or erfc() */
    mask = _mm_cmplt_ps(y,_mm_set1_ps(0.75f));
    res  = _mm_blendv_ps(_mm_sub_ps(one,res_erfc),res_erf,mask);

    return res;
}

/* FULL precision erfc(), 256 bit wide. Only errors in LSB */
static __m256
gmx_mm256_erfc_ps(__m256 x)
{
    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
    const __m256 CA6 = _mm256_set1_ps(7.853861353153693e-5f);
    const __m256 CA5 = _mm256_set1_ps(-8.010193625184903e-4f);
    const __m256 CA4 = _mm256_set1_ps(5.188327685732524e-3f);
    const __m256 CA3 = _mm256_set1_ps(-2.685381193529856e-2f);
    const __m256 CA2 = _mm256_set1_ps(1.128358514861418e-1f);
    const __m256 CA1 = _mm256_set1_ps(-3.761262582423300e-1f);
    const __m256 CA0 = _mm256_set1_ps(1.128379165726710f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
    const __m256 CB9 = _mm256_set1_ps(-0.0018629930017603923f);
    const __m256 CB8 = _mm256_set1_ps(0.003909821287598495f);
    const __m256 CB7 = _mm256_set1_ps(-0.0052094582210355615f);
    const __m256 CB6 = _mm256_set1_ps(0.005685614362160572f);
    const __m256 CB5 = _mm256_set1_ps(-0.0025367682853477272f);
    const __m256 CB4 = _mm256_set1_ps(-0.010199799682318782f);
    const __m256 CB3 = _mm256_set1_ps(0.04369575504816542f);
    const __m256 CB2 = _mm256_set1_ps(-0.11884063474674492f);
    const __m256 CB1 = _mm256_set1_ps(0.2732120154030589f);
    const __m256 CB0 = _mm256_set1_ps(0.42758357702025784f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
    const __m256 CC10 = _mm256_set1_ps(-0.0445555913112064f);
    const __m256 CC9  = _mm256_set1_ps(0.21376355144663348f);
    const __m256 CC8  = _mm256_set1_ps(-0.3473187200259257f);
    const __m256 CC7  = _mm256_set1_ps(0.016690861551248114f);
    const __m256 CC6  = _mm256_set1_ps(0.7560973182491192f);
    const __m256 CC5  = _mm256_set1_ps(-1.2137903600145787f);
    const __m256 CC4  = _mm256_set1_ps(0.8411872321232948f);
    const __m256 CC3  = _mm256_set1_ps(-0.08670413896296343f);
    const __m256 CC2  = _mm256_set1_ps(-0.27124782687240334f);
    const __m256 CC1  = _mm256_set1_ps(-0.0007502488047806069f);
    const __m256 CC0  = _mm256_set1_ps(0.5642114853803148f);

    /* Coefficients for expansion of exp(x) in [0,0.1] */
    /* CD0 and CD1 are both 1.0, so no need to declare them separately */
    const __m256 CD2 = _mm256_set1_ps(0.5000066608081202f);
    const __m256 CD3 = _mm256_set1_ps(0.1664795422874624f);
    const __m256 CD4 = _mm256_set1_ps(0.04379839977652482f);

    const __m256 sieve   = _mm256_castsi256_ps( _mm256_set1_epi32(0xfffff000) );
    const __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
    const __m256 one     = _mm256_set1_ps(1.0f);
    const __m256 two     = _mm256_set1_ps(2.0f);

    __m256 x2,x4,y;
    __m256 z,q,t,t2,w,w2;
    __m256 pA0,pA1,pB0,pB1,pC0,pC1;
    __m256 expmx2,corr;
    __m256 res_erf,res_erfc,res;
    __m256 mask;

    /* Calculate erf() */
    x2 = _mm256_mul_ps(x,x);
    x4 = _mm256_mul_ps(x2,x2);

    pA0 = _mm256_mul_ps(CA6,x4);
    pA1 = _mm256_mul_ps(CA5,x4);
    pA0 = _mm256_add_ps(pA0,CA4);
    pA1 = _mm256_add_ps(pA1,CA3);
    pA0 = _mm256_mul_ps(pA0,x4);
    pA1 = _mm256_mul_ps(pA1,x4);
    pA0 = _mm256_add_ps(pA0,CA2);
    pA1 = _mm256_add_ps(pA1,CA1);
    pA0 = _mm256_mul_ps(pA0,x4);
    pA1 = _mm256_mul_ps(pA1,x2);
    pA0 = _mm256_add_ps(pA0,pA1);
    pA0 = _mm256_add_ps(pA0,CA0);

    res_erf = _mm256_mul_ps(x,pA0);

    /* Calculate erfc */
    y  = gmx_mm256_abs_ps(x);
    t  = gmx_mm256_inv_ps(y);
    w  = _mm256_sub_ps(t,one);
    t2 = _mm256_mul_ps(t,t);
    w2 = _mm256_mul_ps(w,w);
    /*
     * We cannot simply calculate exp(-x2) directly in single precision, since
     * that will lose a couple of bits of precision due to the multiplication.
     * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
     * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
     *
     * The only drawback with this is that it requires TWO separate exponential
     * evaluations, which would be horrible performance-wise. However, the argument
     * for the second exp() call is always small, so there we simply use a
     * low-order minimax expansion on [0,0.1].
     */

    z = _mm256_and_ps(y,sieve);
    q = _mm256_mul_ps( _mm256_sub_ps(z,y) , _mm256_add_ps(z,y) );

    corr = _mm256_mul_ps(CD4,q);
    corr = _mm256_add_ps(corr,CD3);
    corr = _mm256_mul_ps(corr,q);
    corr = _mm256_add_ps(corr,CD2);
    corr = _mm256_mul_ps(corr,q);
    corr = _mm256_add_ps(corr,one);
    corr = _mm256_mul_ps(corr,q);
    corr = _mm256_add_ps(corr,one);

    expmx2 = gmx_mm256_exp_ps( _mm256_or_ps( signbit , _mm256_mul_ps(z,z) ) );
    expmx2 = _mm256_mul_ps(expmx2,corr);

    pB1 = _mm256_mul_ps(CB9,w2);
    pB0 = _mm256_mul_ps(CB8,w2);
    pB1 = _mm256_add_ps(pB1,CB7);
    pB0 = _mm256_add_ps(pB0,CB6);
    pB1 = _mm256_mul_ps(pB1,w2);
    pB0 = _mm256_mul_ps(pB0,w2);
    pB1 = _mm256_add_ps(pB1,CB5);
    pB0 = _mm256_add_ps(pB0,CB4);
    pB1 = _mm256_mul_ps(pB1,w2);
    pB0 = _mm256_mul_ps(pB0,w2);
    pB1 = _mm256_add_ps(pB1,CB3);
    pB0 = _mm256_add_ps(pB0,CB2);
    pB1 = _mm256_mul_ps(pB1,w2);
    pB0 = _mm256_mul_ps(pB0,w2);
    pB1 = _mm256_add_ps(pB1,CB1);
    pB1 = _mm256_mul_ps(pB1,w);
    pB0 = _mm256_add_ps(pB0,pB1);
    pB0 = _mm256_add_ps(pB0,CB0);

    pC0 = _mm256_mul_ps(CC10,t2);
    pC1 = _mm256_mul_ps(CC9,t2);
    pC0 = _mm256_add_ps(pC0,CC8);
    pC1 = _mm256_add_ps(pC1,CC7);
    pC0 = _mm256_mul_ps(pC0,t2);
    pC1 = _mm256_mul_ps(pC1,t2);
    pC0 = _mm256_add_ps(pC0,CC6);
    pC1 = _mm256_add_ps(pC1,CC5);
    pC0 = _mm256_mul_ps(pC0,t2);
    pC1 = _mm256_mul_ps(pC1,t2);
    pC0 = _mm256_add_ps(pC0,CC4);
    pC1 = _mm256_add_ps(pC1,CC3);
    pC0 = _mm256_mul_ps(pC0,t2);
    pC1 = _mm256_mul_ps(pC1,t2);
    pC0 = _mm256_add_ps(pC0,CC2);
    pC1 = _mm256_add_ps(pC1,CC1);
    pC0 = _mm256_mul_ps(pC0,t2);
    pC1 = _mm256_mul_ps(pC1,t);
    pC0 = _mm256_add_ps(pC0,pC1);
    pC0 = _mm256_add_ps(pC0,CC0);
    pC0 = _mm256_mul_ps(pC0,t);

    /* SELECT pB0 or pC0 for erfc() */
    mask     = _mm256_cmp_ps(two,y,_CMP_LT_OQ);
    res_erfc = _mm256_blendv_ps(pB0,pC0,mask);
    res_erfc = _mm256_mul_ps(res_erfc,expmx2);

    /* erfc(x<0) = 2-erfc(|x|) */
    mask     = _mm256_cmp_ps(x,_mm256_setzero_ps(),_CMP_LT_OQ);
    res_erfc = _mm256_blendv_ps(res_erfc,_mm256_sub_ps(two,res_erfc),mask);

    /* Select erf() or erfc() */
    mask = _mm256_cmp_ps(y,_mm256_set1_ps(0.75f),_CMP_LT_OQ);
    res  = _mm256_blendv_ps(res_erfc,_mm256_sub_ps(one,res_erf),mask);

    return res;
}
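
/* Usage sketch (illustrative only; the helper name is hypothetical and not part of
 * this header): the classic real-space Ewald pair potential qq*erfc(beta*r)/r,
 * evaluated table-free from the routines above for eight pairs at once.
 */
static gmx_inline __m256
gmx_mm256_example_ewald_pot_ps(__m256 rsq, float beta, __m256 qq)
{
    const __m256 vbeta = _mm256_set1_ps(beta);
    __m256 rinv,r,verfc;

    rinv  = gmx_mm256_invsqrt_ps(rsq);                   /* 1/r                */
    r     = _mm256_mul_ps(rsq,rinv);                     /* r = r^2*(1/r)      */
    verfc = gmx_mm256_erfc_ps(_mm256_mul_ps(vbeta,r));   /* erfc(beta*r)       */

    return _mm256_mul_ps(qq,_mm256_mul_ps(verfc,rinv));  /* qq*erfc(beta*r)/r  */
}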

/* erfc(), 128 bit wide */
static __m128
gmx_mm_erfc_ps(__m128 x)
{
    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
    const __m128 CA6 = _mm_set1_ps(7.853861353153693e-5f);
    const __m128 CA5 = _mm_set1_ps(-8.010193625184903e-4f);
    const __m128 CA4 = _mm_set1_ps(5.188327685732524e-3f);
    const __m128 CA3 = _mm_set1_ps(-2.685381193529856e-2f);
    const __m128 CA2 = _mm_set1_ps(1.128358514861418e-1f);
    const __m128 CA1 = _mm_set1_ps(-3.761262582423300e-1f);
    const __m128 CA0 = _mm_set1_ps(1.128379165726710f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
    const __m128 CB9 = _mm_set1_ps(-0.0018629930017603923f);
    const __m128 CB8 = _mm_set1_ps(0.003909821287598495f);
    const __m128 CB7 = _mm_set1_ps(-0.0052094582210355615f);
    const __m128 CB6 = _mm_set1_ps(0.005685614362160572f);
    const __m128 CB5 = _mm_set1_ps(-0.0025367682853477272f);
    const __m128 CB4 = _mm_set1_ps(-0.010199799682318782f);
    const __m128 CB3 = _mm_set1_ps(0.04369575504816542f);
    const __m128 CB2 = _mm_set1_ps(-0.11884063474674492f);
    const __m128 CB1 = _mm_set1_ps(0.2732120154030589f);
    const __m128 CB0 = _mm_set1_ps(0.42758357702025784f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
    const __m128 CC10 = _mm_set1_ps(-0.0445555913112064f);
    const __m128 CC9  = _mm_set1_ps(0.21376355144663348f);
    const __m128 CC8  = _mm_set1_ps(-0.3473187200259257f);
    const __m128 CC7  = _mm_set1_ps(0.016690861551248114f);
    const __m128 CC6  = _mm_set1_ps(0.7560973182491192f);
    const __m128 CC5  = _mm_set1_ps(-1.2137903600145787f);
    const __m128 CC4  = _mm_set1_ps(0.8411872321232948f);
    const __m128 CC3  = _mm_set1_ps(-0.08670413896296343f);
    const __m128 CC2  = _mm_set1_ps(-0.27124782687240334f);
    const __m128 CC1  = _mm_set1_ps(-0.0007502488047806069f);
    const __m128 CC0  = _mm_set1_ps(0.5642114853803148f);

    /* Coefficients for expansion of exp(x) in [0,0.1] */
    /* CD0 and CD1 are both 1.0, so no need to declare them separately */
    const __m128 CD2 = _mm_set1_ps(0.5000066608081202f);
    const __m128 CD3 = _mm_set1_ps(0.1664795422874624f);
    const __m128 CD4 = _mm_set1_ps(0.04379839977652482f);

    const __m128 sieve   = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
    const __m128 signbit = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
    const __m128 one     = _mm_set1_ps(1.0f);
    const __m128 two     = _mm_set1_ps(2.0f);

    __m128 x2,x4,y;
    __m128 z,q,t,t2,w,w2;
    __m128 pA0,pA1,pB0,pB1,pC0,pC1;
    __m128 expmx2,corr;
    __m128 res_erf,res_erfc,res;
    __m128 mask;

    /* Calculate erf() */
    x2 = _mm_mul_ps(x,x);
    x4 = _mm_mul_ps(x2,x2);

    pA0 = _mm_mul_ps(CA6,x4);
    pA1 = _mm_mul_ps(CA5,x4);
    pA0 = _mm_add_ps(pA0,CA4);
    pA1 = _mm_add_ps(pA1,CA3);
    pA0 = _mm_mul_ps(pA0,x4);
    pA1 = _mm_mul_ps(pA1,x4);
    pA0 = _mm_add_ps(pA0,CA2);
    pA1 = _mm_add_ps(pA1,CA1);
    pA0 = _mm_mul_ps(pA0,x4);
    pA1 = _mm_mul_ps(pA1,x2);
    pA0 = _mm_add_ps(pA0,pA1);
    pA0 = _mm_add_ps(pA0,CA0);

    res_erf = _mm_mul_ps(x,pA0);

    /* Calculate erfc */
    y  = gmx_mm_abs_ps(x);
    t  = gmx_mm_inv_ps(y);
    w  = _mm_sub_ps(t,one);
    t2 = _mm_mul_ps(t,t);
    w2 = _mm_mul_ps(w,w);
    /*
     * We cannot simply calculate exp(-x2) directly in single precision, since
     * that will lose a couple of bits of precision due to the multiplication.
     * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
     * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
     *
     * The only drawback with this is that it requires TWO separate exponential
     * evaluations, which would be horrible performance-wise. However, the argument
     * for the second exp() call is always small, so there we simply use a
     * low-order minimax expansion on [0,0.1].
     */

    z = _mm_and_ps(y,sieve);
    q = _mm_mul_ps( _mm_sub_ps(z,y) , _mm_add_ps(z,y) );

    corr = _mm_mul_ps(CD4,q);
    corr = _mm_add_ps(corr,CD3);
    corr = _mm_mul_ps(corr,q);
    corr = _mm_add_ps(corr,CD2);
    corr = _mm_mul_ps(corr,q);
    corr = _mm_add_ps(corr,one);
    corr = _mm_mul_ps(corr,q);
    corr = _mm_add_ps(corr,one);

    expmx2 = gmx_mm_exp_ps( _mm_or_ps( signbit , _mm_mul_ps(z,z) ) );
    expmx2 = _mm_mul_ps(expmx2,corr);

    pB1 = _mm_mul_ps(CB9,w2);
    pB0 = _mm_mul_ps(CB8,w2);
    pB1 = _mm_add_ps(pB1,CB7);
    pB0 = _mm_add_ps(pB0,CB6);
    pB1 = _mm_mul_ps(pB1,w2);
    pB0 = _mm_mul_ps(pB0,w2);
    pB1 = _mm_add_ps(pB1,CB5);
    pB0 = _mm_add_ps(pB0,CB4);
    pB1 = _mm_mul_ps(pB1,w2);
    pB0 = _mm_mul_ps(pB0,w2);
    pB1 = _mm_add_ps(pB1,CB3);
    pB0 = _mm_add_ps(pB0,CB2);
    pB1 = _mm_mul_ps(pB1,w2);
    pB0 = _mm_mul_ps(pB0,w2);
    pB1 = _mm_add_ps(pB1,CB1);
    pB1 = _mm_mul_ps(pB1,w);
    pB0 = _mm_add_ps(pB0,pB1);
    pB0 = _mm_add_ps(pB0,CB0);

    pC0 = _mm_mul_ps(CC10,t2);
    pC1 = _mm_mul_ps(CC9,t2);
    pC0 = _mm_add_ps(pC0,CC8);
    pC1 = _mm_add_ps(pC1,CC7);
    pC0 = _mm_mul_ps(pC0,t2);
    pC1 = _mm_mul_ps(pC1,t2);
    pC0 = _mm_add_ps(pC0,CC6);
    pC1 = _mm_add_ps(pC1,CC5);
    pC0 = _mm_mul_ps(pC0,t2);
    pC1 = _mm_mul_ps(pC1,t2);
    pC0 = _mm_add_ps(pC0,CC4);
    pC1 = _mm_add_ps(pC1,CC3);
    pC0 = _mm_mul_ps(pC0,t2);
    pC1 = _mm_mul_ps(pC1,t2);
    pC0 = _mm_add_ps(pC0,CC2);
    pC1 = _mm_add_ps(pC1,CC1);
    pC0 = _mm_mul_ps(pC0,t2);
    pC1 = _mm_mul_ps(pC1,t);
    pC0 = _mm_add_ps(pC0,pC1);
    pC0 = _mm_add_ps(pC0,CC0);
    pC0 = _mm_mul_ps(pC0,t);

    /* SELECT pB0 or pC0 for erfc() */
    mask     = _mm_cmplt_ps(two,y);
    res_erfc = _mm_blendv_ps(pB0,pC0,mask);
    res_erfc = _mm_mul_ps(res_erfc,expmx2);

    /* erfc(x<0) = 2-erfc(|x|) */
    mask     = _mm_cmplt_ps(x,_mm_setzero_ps());
    res_erfc = _mm_blendv_ps(res_erfc,_mm_sub_ps(two,res_erfc),mask);

    /* Select erf() or erfc() */
    mask = _mm_cmplt_ps(y,_mm_set1_ps(0.75f));
    res  = _mm_blendv_ps(res_erfc,_mm_sub_ps(one,res_erf),mask);

    return res;
}

/* Calculate the force correction due to PME analytically.
 *
 * This routine is meant to enable analytical evaluation of the
 * direct-space PME electrostatic force to avoid tables.
 *
 * The direct-space potential should be Erfc(beta*r)/r, but there
 * are some problems evaluating that:
 *
 * First, the error function is difficult (read: expensive) to
 * approximate accurately for intermediate to large arguments, and
 * this happens already in ranges of beta*r that occur in simulations.
 * Second, we now try to avoid calculating potentials in Gromacs but
 * use forces directly.
 *
 * We can simplify things slightly by noting that the PME part is really
 * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
 *
 * V= 1/r - Erf(beta*r)/r
 *
 * The first term we already have from the inverse square root, so
 * that we can leave out of this routine.
 *
 * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
 * the argument beta*r will be in the range 0.15 to ~4. Use your
 * favorite plotting program to realize how well-behaved Erf(z)/z is
 * in this range!
 *
 * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
 * However, it turns out it is more efficient to approximate f(z)/z and
 * then only use even powers. This is another minor optimization, since
 * we actually WANT f(z)/z, because it is going to be multiplied by
 * the vector between the two atoms to get the vectorial force. The
 * fastest flops are the ones we can avoid calculating!
 *
 * So, here's how it should be used:
 *
 * 1. Calculate r^2.
 * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
 * 3. Evaluate this routine with z^2 as the argument.
 * 4. The return value is the expression:
 *
 *       2*exp(-z^2)     erf(z)
 *       ------------ - --------
 *       sqrt(Pi)*z^2      z^3
 *
 * 5. Multiply the entire expression by beta^3. This will get you
 *
 *       beta^3*2*exp(-z^2)     beta^3*erf(z)
 *       ------------------  -  -------------
 *          sqrt(Pi)*z^2             z^3
 *
 * or, switching back to r (z=r*beta):
 *
 *       2*beta*exp(-r^2*beta^2)   erf(r*beta)
 *       ----------------------- - -----------
 *            sqrt(Pi)*r^2              r^3
 *
 *    With a bit of math exercise you should be able to confirm that
 *    this is exactly D[Erf[beta*r]/r,r] divided by r another time.
 *
 * 6. Add the result to 1/r^3, multiply by the product of the charges,
 *    and you have your force (divided by r). A final multiplication
 *    with the vector connecting the two particles and you have your
 *    vectorial force to add to the particles.
 */
static __m256
gmx_mm256_pmecorrF_ps(__m256 z2)
{
    const __m256 FN6 = _mm256_set1_ps(-1.7357322914161492954e-8f);
    const __m256 FN5 = _mm256_set1_ps(1.4703624142580877519e-6f);
    const __m256 FN4 = _mm256_set1_ps(-0.000053401640219807709149f);
    const __m256 FN3 = _mm256_set1_ps(0.0010054721316683106153f);
    const __m256 FN2 = _mm256_set1_ps(-0.019278317264888380590f);
    const __m256 FN1 = _mm256_set1_ps(0.069670166153766424023f);
    const __m256 FN0 = _mm256_set1_ps(-0.75225204789749321333f);

    const __m256 FD4 = _mm256_set1_ps(0.0011193462567257629232f);
    const __m256 FD3 = _mm256_set1_ps(0.014866955030185295499f);
    const __m256 FD2 = _mm256_set1_ps(0.11583842382862377919f);
    const __m256 FD1 = _mm256_set1_ps(0.50736591960530292870f);
    const __m256 FD0 = _mm256_set1_ps(1.0f);

    __m256 z4;
    __m256 polyFN0,polyFN1,polyFD0,polyFD1;

    z4 = _mm256_mul_ps(z2,z2);

    polyFD0 = _mm256_mul_ps(FD4,z4);
    polyFD1 = _mm256_mul_ps(FD3,z4);
    polyFD0 = _mm256_add_ps(polyFD0,FD2);
    polyFD1 = _mm256_add_ps(polyFD1,FD1);
    polyFD0 = _mm256_mul_ps(polyFD0,z4);
    polyFD1 = _mm256_mul_ps(polyFD1,z2);
    polyFD0 = _mm256_add_ps(polyFD0,FD0);
    polyFD0 = _mm256_add_ps(polyFD0,polyFD1);

    polyFD0 = gmx_mm256_inv_ps(polyFD0);

    polyFN0 = _mm256_mul_ps(FN6,z4);
    polyFN1 = _mm256_mul_ps(FN5,z4);
    polyFN0 = _mm256_add_ps(polyFN0,FN4);
    polyFN1 = _mm256_add_ps(polyFN1,FN3);
    polyFN0 = _mm256_mul_ps(polyFN0,z4);
    polyFN1 = _mm256_mul_ps(polyFN1,z4);
    polyFN0 = _mm256_add_ps(polyFN0,FN2);
    polyFN1 = _mm256_add_ps(polyFN1,FN1);
    polyFN0 = _mm256_mul_ps(polyFN0,z4);
    polyFN1 = _mm256_mul_ps(polyFN1,z2);
    polyFN0 = _mm256_add_ps(polyFN0,FN0);
    polyFN0 = _mm256_add_ps(polyFN0,polyFN1);

    return _mm256_mul_ps(polyFN0,polyFD0);
}
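
/* Usage sketch (illustrative only; the helper name is hypothetical and not part of
 * this header), following steps 1-6 in the comment above gmx_mm256_pmecorrF_ps():
 * given squared distances rsq, the Ewald parameter beta and charge products qq,
 * compute the scalar f such that the vectorial force is f times the connecting vector.
 */
static gmx_inline __m256
gmx_mm256_example_pme_fscal_ps(__m256 rsq, float beta, __m256 qq)
{
    const __m256 vbeta  = _mm256_set1_ps(beta);
    const __m256 vbeta3 = _mm256_set1_ps(beta*beta*beta);
    __m256 z2,corr,rinv,rinvsq,rinvcub;

    rinv    = gmx_mm256_invsqrt_ps(rsq);                        /* 1/r               */
    rinvsq  = _mm256_mul_ps(rinv,rinv);                         /* 1/r^2             */
    rinvcub = _mm256_mul_ps(rinvsq,rinv);                       /* 1/r^3             */

    z2      = _mm256_mul_ps(rsq,_mm256_mul_ps(vbeta,vbeta));    /* z^2 = beta^2*r^2  */
    corr    = _mm256_mul_ps(vbeta3,gmx_mm256_pmecorrF_ps(z2));  /* beta^3*F(z^2)     */

    /* f = qq*(1/r^3 + correction); multiply by the distance vector to get the force */
    return _mm256_mul_ps(qq,_mm256_add_ps(rinvcub,corr));
}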

static __m128
gmx_mm_pmecorrF_ps(__m128 z2)
{
    const __m128 FN6 = _mm_set1_ps(-1.7357322914161492954e-8f);
    const __m128 FN5 = _mm_set1_ps(1.4703624142580877519e-6f);
    const __m128 FN4 = _mm_set1_ps(-0.000053401640219807709149f);
    const __m128 FN3 = _mm_set1_ps(0.0010054721316683106153f);
    const __m128 FN2 = _mm_set1_ps(-0.019278317264888380590f);
    const __m128 FN1 = _mm_set1_ps(0.069670166153766424023f);
    const __m128 FN0 = _mm_set1_ps(-0.75225204789749321333f);

    const __m128 FD4 = _mm_set1_ps(0.0011193462567257629232f);
    const __m128 FD3 = _mm_set1_ps(0.014866955030185295499f);
    const __m128 FD2 = _mm_set1_ps(0.11583842382862377919f);
    const __m128 FD1 = _mm_set1_ps(0.50736591960530292870f);
    const __m128 FD0 = _mm_set1_ps(1.0f);

    __m128 z4;
    __m128 polyFN0,polyFN1,polyFD0,polyFD1;

    z4 = _mm_mul_ps(z2,z2);

    polyFD0 = _mm_mul_ps(FD4,z4);
    polyFD1 = _mm_mul_ps(FD3,z4);
    polyFD0 = _mm_add_ps(polyFD0,FD2);
    polyFD1 = _mm_add_ps(polyFD1,FD1);
    polyFD0 = _mm_mul_ps(polyFD0,z4);
    polyFD1 = _mm_mul_ps(polyFD1,z2);
    polyFD0 = _mm_add_ps(polyFD0,FD0);
    polyFD0 = _mm_add_ps(polyFD0,polyFD1);

    polyFD0 = gmx_mm_inv_ps(polyFD0);

    polyFN0 = _mm_mul_ps(FN6,z4);
    polyFN1 = _mm_mul_ps(FN5,z4);
    polyFN0 = _mm_add_ps(polyFN0,FN4);
    polyFN1 = _mm_add_ps(polyFN1,FN3);
    polyFN0 = _mm_mul_ps(polyFN0,z4);
    polyFN1 = _mm_mul_ps(polyFN1,z4);
    polyFN0 = _mm_add_ps(polyFN0,FN2);
    polyFN1 = _mm_add_ps(polyFN1,FN1);
    polyFN0 = _mm_mul_ps(polyFN0,z4);
    polyFN1 = _mm_mul_ps(polyFN1,z2);
    polyFN0 = _mm_add_ps(polyFN0,FN0);
    polyFN0 = _mm_add_ps(polyFN0,polyFN1);

    return _mm_mul_ps(polyFN0,polyFD0);
}

/* Calculate the potential correction due to PME analytically.
 *
 * See gmx_mm256_pmecorrF_ps() for details about the approximation.
 *
 * This routine calculates Erf(z)/z, although you should provide z^2
 * as the input argument.
 *
 * Here's how it should be used:
 *
 * 1. Calculate r^2.
 * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
 * 3. Evaluate this routine with z^2 as the argument.
 * 4. The return value is the expression:
 *
 *        erf(z)
 *       --------
 *          z
 *
 * 5. Multiply the entire expression by beta and, switching back to r (z=r*beta), you get:
 *
 *       erf(r*beta)
 *       -----------
 *            r
 *
 * 6. Subtract the result from 1/r, multiply by the product of the charges,
 *    and you have your potential.
 */
static __m256
gmx_mm256_pmecorrV_ps(__m256 z2)
{
    const __m256 VN6 = _mm256_set1_ps(1.9296833005951166339e-8f);
    const __m256 VN5 = _mm256_set1_ps(-1.4213390571557850962e-6f);
    const __m256 VN4 = _mm256_set1_ps(0.000041603292906656984871f);
    const __m256 VN3 = _mm256_set1_ps(-0.00013134036773265025626f);
    const __m256 VN2 = _mm256_set1_ps(0.038657983986041781264f);
    const __m256 VN1 = _mm256_set1_ps(0.11285044772717598220f);
    const __m256 VN0 = _mm256_set1_ps(1.1283802385263030286f);

    const __m256 VD3 = _mm256_set1_ps(0.0066752224023576045451f);
    const __m256 VD2 = _mm256_set1_ps(0.078647795836373922256f);
    const __m256 VD1 = _mm256_set1_ps(0.43336185284710920150f);
    const __m256 VD0 = _mm256_set1_ps(1.0f);

    __m256 z4;
    __m256 polyVN0,polyVN1,polyVD0,polyVD1;

    z4 = _mm256_mul_ps(z2,z2);

    polyVD1 = _mm256_mul_ps(VD3,z4);
    polyVD0 = _mm256_mul_ps(VD2,z4);
    polyVD1 = _mm256_add_ps(polyVD1,VD1);
    polyVD0 = _mm256_add_ps(polyVD0,VD0);
    polyVD1 = _mm256_mul_ps(polyVD1,z2);
    polyVD0 = _mm256_add_ps(polyVD0,polyVD1);

    polyVD0 = gmx_mm256_inv_ps(polyVD0);

    polyVN0 = _mm256_mul_ps(VN6,z4);
    polyVN1 = _mm256_mul_ps(VN5,z4);
    polyVN0 = _mm256_add_ps(polyVN0,VN4);
    polyVN1 = _mm256_add_ps(polyVN1,VN3);
    polyVN0 = _mm256_mul_ps(polyVN0,z4);
    polyVN1 = _mm256_mul_ps(polyVN1,z4);
    polyVN0 = _mm256_add_ps(polyVN0,VN2);
    polyVN1 = _mm256_add_ps(polyVN1,VN1);
    polyVN0 = _mm256_mul_ps(polyVN0,z4);
    polyVN1 = _mm256_mul_ps(polyVN1,z2);
    polyVN0 = _mm256_add_ps(polyVN0,VN0);
    polyVN0 = _mm256_add_ps(polyVN0,polyVN1);

    return _mm256_mul_ps(polyVN0,polyVD0);
}
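
/* Usage sketch (illustrative only; the helper name is hypothetical and not part of
 * this header), following the steps in the comment above gmx_mm256_pmecorrV_ps():
 * the direct-space PME potential for charge products qq is qq*(1/r - beta*V(z^2))
 * with z^2 = beta^2*r^2, since beta*erf(z)/z = erf(beta*r)/r.
 */
static gmx_inline __m256
gmx_mm256_example_pme_pot_ps(__m256 rsq, float beta, __m256 qq)
{
    const __m256 vbeta = _mm256_set1_ps(beta);
    __m256 rinv,z2,vcorr;

    rinv  = gmx_mm256_invsqrt_ps(rsq);                        /* 1/r               */
    z2    = _mm256_mul_ps(rsq,_mm256_mul_ps(vbeta,vbeta));    /* z^2 = beta^2*r^2  */
    vcorr = _mm256_mul_ps(vbeta,gmx_mm256_pmecorrV_ps(z2));   /* erf(beta*r)/r     */

    return _mm256_mul_ps(qq,_mm256_sub_ps(rinv,vcorr));       /* qq*(1/r - erf(beta*r)/r) */
}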

static __m128
gmx_mm_pmecorrV_ps(__m128 z2)
{
    const __m128 VN6 = _mm_set1_ps(1.9296833005951166339e-8f);
    const __m128 VN5 = _mm_set1_ps(-1.4213390571557850962e-6f);
    const __m128 VN4 = _mm_set1_ps(0.000041603292906656984871f);
    const __m128 VN3 = _mm_set1_ps(-0.00013134036773265025626f);
    const __m128 VN2 = _mm_set1_ps(0.038657983986041781264f);
    const __m128 VN1 = _mm_set1_ps(0.11285044772717598220f);
    const __m128 VN0 = _mm_set1_ps(1.1283802385263030286f);

    const __m128 VD3 = _mm_set1_ps(0.0066752224023576045451f);
    const __m128 VD2 = _mm_set1_ps(0.078647795836373922256f);
    const __m128 VD1 = _mm_set1_ps(0.43336185284710920150f);
    const __m128 VD0 = _mm_set1_ps(1.0f);

    __m128 z4;
    __m128 polyVN0,polyVN1,polyVD0,polyVD1;

    z4 = _mm_mul_ps(z2,z2);

    polyVD1 = _mm_mul_ps(VD3,z4);
    polyVD0 = _mm_mul_ps(VD2,z4);
    polyVD1 = _mm_add_ps(polyVD1,VD1);
    polyVD0 = _mm_add_ps(polyVD0,VD0);
    polyVD1 = _mm_mul_ps(polyVD1,z2);
    polyVD0 = _mm_add_ps(polyVD0,polyVD1);

    polyVD0 = gmx_mm_inv_ps(polyVD0);

    polyVN0 = _mm_mul_ps(VN6,z4);
    polyVN1 = _mm_mul_ps(VN5,z4);
    polyVN0 = _mm_add_ps(polyVN0,VN4);
    polyVN1 = _mm_add_ps(polyVN1,VN3);
    polyVN0 = _mm_mul_ps(polyVN0,z4);
    polyVN1 = _mm_mul_ps(polyVN1,z4);
    polyVN0 = _mm_add_ps(polyVN0,VN2);
    polyVN1 = _mm_add_ps(polyVN1,VN1);
    polyVN0 = _mm_mul_ps(polyVN0,z4);
    polyVN1 = _mm_mul_ps(polyVN1,z2);
    polyVN0 = _mm_add_ps(polyVN0,VN0);
    polyVN0 = _mm_add_ps(polyVN0,polyVN1);

    return _mm_mul_ps(polyVN0,polyVD0);
}
1531 static int
1532 gmx_mm256_sincos_ps(__m256 x,
1533 __m256 *sinval,
1534 __m256 *cosval)
1536 const __m256 two_over_pi = _mm256_set1_ps(2.0f/(float)M_PI);
1537 const __m256 half = _mm256_set1_ps(0.5f);
1538 const __m256 one = _mm256_set1_ps(1.0f);
1539 const __m256 zero = _mm256_setzero_ps();
1541 const __m128i ione = _mm_set1_epi32(1);
1543 const __m256 mask_one = _mm256_castsi256_ps(_mm256_set1_epi32(1));
1544 const __m256 mask_two = _mm256_castsi256_ps(_mm256_set1_epi32(2));
1545 const __m256 mask_three = _mm256_castsi256_ps(_mm256_set1_epi32(3));
1547 const __m256 CA1 = _mm256_set1_ps(1.5703125f);
1548 const __m256 CA2 = _mm256_set1_ps(4.837512969970703125e-4f);
1549 const __m256 CA3 = _mm256_set1_ps(7.54978995489188216e-8f);
1551 const __m256 CC0 = _mm256_set1_ps(-0.0013602249f);
1552 const __m256 CC1 = _mm256_set1_ps(0.0416566950f);
1553 const __m256 CC2 = _mm256_set1_ps(-0.4999990225f);
1554 const __m256 CS0 = _mm256_set1_ps(-0.0001950727f);
1555 const __m256 CS1 = _mm256_set1_ps(0.0083320758f);
1556 const __m256 CS2 = _mm256_set1_ps(-0.1666665247f);
1558 const __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1560 __m256 y,y2;
1561 __m256 z;
1562 __m256i iz;
1563 __m128i iz_high,iz_low;
1564 __m256 offset_sin,offset_cos;
1565 __m256 mask_sin,mask_cos;
1566 __m256 tmp1,tmp2;
1567 __m256 tmp_sin,tmp_cos;
1569 y = _mm256_mul_ps(x,two_over_pi);
1570 y = _mm256_add_ps(y,_mm256_or_ps(_mm256_and_ps(y,signbit),half));
1572 iz = _mm256_cvttps_epi32(y);
1573 z = _mm256_round_ps(y,_MM_FROUND_TO_ZERO);
1575 offset_sin = _mm256_and_ps(_mm256_castsi256_ps(iz),mask_three);
1577 iz_high = _mm256_extractf128_si256(iz,0x1);
1578 iz_low = _mm256_castsi256_si128(iz);
1579 iz_low = _mm_add_epi32(iz_low,ione);
1580 iz_high = _mm_add_epi32(iz_high,ione);
1581 iz = _mm256_castsi128_si256(iz_low);
1582 iz = _mm256_insertf128_si256(iz,iz_high,0x1);
1583 offset_cos = _mm256_castsi256_ps(iz);
1585 /* Extended precision arithmethic to achieve full precision */
1586 y = _mm256_mul_ps(z,CA1);
1587 tmp1 = _mm256_mul_ps(z,CA2);
1588 tmp2 = _mm256_mul_ps(z,CA3);
1589 y = _mm256_sub_ps(x,y);
1590 y = _mm256_sub_ps(y,tmp1);
1591 y = _mm256_sub_ps(y,tmp2);
1593 y2 = _mm256_mul_ps(y,y);
1595 tmp1 = _mm256_mul_ps(CC0,y2);
1596 tmp1 = _mm256_add_ps(tmp1,CC1);
1597 tmp2 = _mm256_mul_ps(CS0,y2);
1598 tmp2 = _mm256_add_ps(tmp2,CS1);
1599 tmp1 = _mm256_mul_ps(tmp1,y2);
1600 tmp1 = _mm256_add_ps(tmp1,CC2);
1601 tmp2 = _mm256_mul_ps(tmp2,y2);
1602 tmp2 = _mm256_add_ps(tmp2,CS2);
1604 tmp1 = _mm256_mul_ps(tmp1,y2);
1605 tmp1 = _mm256_add_ps(tmp1,one);
1607 tmp2 = _mm256_mul_ps(tmp2,_mm256_mul_ps(y,y2));
1608 tmp2 = _mm256_add_ps(tmp2,y);
1610 #ifdef __INTEL_COMPILER
1611 /* Intel Compiler version 12.1.3 20120130 is buggy if optimization is enabled unless we cast explicitly! */
1612 mask_sin = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(offset_sin,mask_one))),zero,_CMP_EQ_OQ);
1613 mask_cos = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(offset_cos,mask_one))),zero,_CMP_EQ_OQ);
1614 #else
1615 mask_sin = _mm256_cmp_ps( _mm256_and_ps(offset_sin,mask_one), zero, _CMP_EQ_OQ);
1616 mask_cos = _mm256_cmp_ps( _mm256_and_ps(offset_cos,mask_one), zero, _CMP_EQ_OQ);
1617 #endif
1618 tmp_sin = _mm256_blendv_ps(tmp1,tmp2,mask_sin);
1619 tmp_cos = _mm256_blendv_ps(tmp1,tmp2,mask_cos);
1621 tmp1 = _mm256_xor_ps(signbit,tmp_sin);
1622 tmp2 = _mm256_xor_ps(signbit,tmp_cos);
1624 #ifdef __INTEL_COMPILER
1625 /* Intel Compiler version 12.1.3 20120130 miscompiles this comparison when optimization is enabled unless we do the conversion explicitly! */
1626 mask_sin = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(offset_sin,mask_two))),zero,_CMP_EQ_OQ);
1627 mask_cos = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(offset_cos,mask_two))),zero,_CMP_EQ_OQ);
1628 #else
1629 mask_sin = _mm256_cmp_ps( _mm256_and_ps(offset_sin,mask_two), zero, _CMP_EQ_OQ);
1630 mask_cos = _mm256_cmp_ps( _mm256_and_ps(offset_cos,mask_two), zero, _CMP_EQ_OQ);
1632 #endif
1633 *sinval = _mm256_blendv_ps(tmp1,tmp_sin,mask_sin);
1634 *cosval = _mm256_blendv_ps(tmp2,tmp_cos,mask_cos);
1636 return 0;
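/* Scalar sketch of the reduction performed by gmx_mm256_sincos_ps() above: x is
 * written as x = k*(pi/2) + y with |y| <= pi/4, where pi/2 is split into
 * CA1+CA2+CA3 (Cody-Waite style) for extra precision, and the quadrant k mod 4
 * selects and possibly negates the core sine/cosine polynomials. The helper name
 * and the plain-float arithmetic are illustrative assumptions, not part of the
 * SIMD interface.
 */
static void
gmx_sincosf_sketch(float x, float *sinval, float *cosval)
{
    const float CA1 = 1.5703125f, CA2 = 4.837512969970703125e-4f, CA3 = 7.54978995489188216e-8f;
    int         k   = (int)(x*(2.0f/(float)M_PI) + (x >= 0.0f ? 0.5f : -0.5f)); /* nearest multiple of pi/2 */
    float       z   = (float)k;
    float       y   = ((x - z*CA1) - z*CA2) - z*CA3;  /* reduced argument, |y| <= pi/4 */
    float       y2  = y*y;
    /* Core approximations on [-pi/4,pi/4], same coefficients as above */
    float       s   = ((-0.0001950727f*y2 + 0.0083320758f)*y2 - 0.1666665247f)*y2*y + y;
    float       c   = ((-0.0013602249f*y2 + 0.0416566950f)*y2 - 0.4999990225f)*y2 + 1.0f;
    int         ks  = k & 3;
    int         kc  = (k + 1) & 3;

    *sinval = (ks & 1) ? ((ks & 2) ? -c : c) : ((ks & 2) ? -s : s);
    *cosval = (kc & 1) ? ((kc & 2) ? -c : c) : ((kc & 2) ? -s : s);
}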
1639 static int
1640 gmx_mm_sincos_ps(__m128 x,
1641 __m128 *sinval,
1642 __m128 *cosval)
1644 const __m128 two_over_pi = _mm_set1_ps(2.0f/(float)M_PI);
1645 const __m128 half = _mm_set1_ps(0.5f);
1646 const __m128 one = _mm_set1_ps(1.0f);
1648 const __m128i izero = _mm_set1_epi32(0);
1649 const __m128i ione = _mm_set1_epi32(1);
1650 const __m128i itwo = _mm_set1_epi32(2);
1651 const __m128i ithree = _mm_set1_epi32(3);
1652 const __m128 signbit = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1654 const __m128 CA1 = _mm_set1_ps(1.5703125f);
1655 const __m128 CA2 = _mm_set1_ps(4.837512969970703125e-4f);
1656 const __m128 CA3 = _mm_set1_ps(7.54978995489188216e-8f);
1658 const __m128 CC0 = _mm_set1_ps(-0.0013602249f);
1659 const __m128 CC1 = _mm_set1_ps(0.0416566950f);
1660 const __m128 CC2 = _mm_set1_ps(-0.4999990225f);
1661 const __m128 CS0 = _mm_set1_ps(-0.0001950727f);
1662 const __m128 CS1 = _mm_set1_ps(0.0083320758f);
1663 const __m128 CS2 = _mm_set1_ps(-0.1666665247f);
1665 __m128 y,y2;
1666 __m128 z;
1667 __m128i iz;
1668 __m128i offset_sin,offset_cos;
1669 __m128 tmp1,tmp2;
1670 __m128 mask_sin,mask_cos;
1671 __m128 tmp_sin,tmp_cos;
1673 y = _mm_mul_ps(x,two_over_pi);
1674 y = _mm_add_ps(y,_mm_or_ps(_mm_and_ps(y,signbit),half));
1676 iz = _mm_cvttps_epi32(y);
1677 z = _mm_round_ps(y,_MM_FROUND_TO_ZERO);
1679 offset_sin = _mm_and_si128(iz,ithree);
1680 offset_cos = _mm_add_epi32(iz,ione);
1682 /* Extended precision arithmetic to achieve full precision */
1683 y = _mm_mul_ps(z,CA1);
1684 tmp1 = _mm_mul_ps(z,CA2);
1685 tmp2 = _mm_mul_ps(z,CA3);
1686 y = _mm_sub_ps(x,y);
1687 y = _mm_sub_ps(y,tmp1);
1688 y = _mm_sub_ps(y,tmp2);
1690 y2 = _mm_mul_ps(y,y);
1692 tmp1 = _mm_mul_ps(CC0,y2);
1693 tmp1 = _mm_add_ps(tmp1,CC1);
1694 tmp2 = _mm_mul_ps(CS0,y2);
1695 tmp2 = _mm_add_ps(tmp2,CS1);
1696 tmp1 = _mm_mul_ps(tmp1,y2);
1697 tmp1 = _mm_add_ps(tmp1,CC2);
1698 tmp2 = _mm_mul_ps(tmp2,y2);
1699 tmp2 = _mm_add_ps(tmp2,CS2);
1701 tmp1 = _mm_mul_ps(tmp1,y2);
1702 tmp1 = _mm_add_ps(tmp1,one);
1704 tmp2 = _mm_mul_ps(tmp2,_mm_mul_ps(y,y2));
1705 tmp2 = _mm_add_ps(tmp2,y);
1707 mask_sin = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin,ione), izero));
1708 mask_cos = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos,ione), izero));
1710 tmp_sin = _mm_blendv_ps(tmp1,tmp2,mask_sin);
1711 tmp_cos = _mm_blendv_ps(tmp1,tmp2,mask_cos);
1713 mask_sin = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin,itwo), izero));
1714 mask_cos = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos,itwo), izero));
1716 tmp1 = _mm_xor_ps(signbit,tmp_sin);
1717 tmp2 = _mm_xor_ps(signbit,tmp_cos);
1719 *sinval = _mm_blendv_ps(tmp1,tmp_sin,mask_sin);
1720 *cosval = _mm_blendv_ps(tmp2,tmp_cos,mask_cos);
1722 return 0;
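/* Illustrative usage sketch (the helper below and its array-based interface are
 * assumptions made for demonstration, not part of the original API): both the
 * sine and the cosine of four packed angles are obtained with a single call.
 */
static void
gmx_mm_sincos4_example(const float angle[4], float sine[4], float cosine[4])
{
    __m128 s, c;

    gmx_mm_sincos_ps(_mm_loadu_ps(angle), &s, &c);
    _mm_storeu_ps(sine,   s);
    _mm_storeu_ps(cosine, c);
}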
1729 * IMPORTANT: If you need both the sine and the cosine, call sincos() once rather than
1730 * sin() and cos() separately; each wrapper runs the full sincos() routine, so separate calls waste a factor 2 in performance!
1732 static __m256
1733 gmx_mm256_sin_ps(__m256 x)
1735 __m256 s,c;
1736 gmx_mm256_sincos_ps(x,&s,&c);
1737 return s;
1740 static __m128
1741 gmx_mm_sin_ps(__m128 x)
1743 __m128 s,c;
1744 gmx_mm_sincos_ps(x,&s,&c);
1745 return s;
1750 * IMPORTANT: If you need both the sine and the cosine, call sincos() once rather than
1751 * sin() and cos() separately; each wrapper runs the full sincos() routine, so separate calls waste a factor 2 in performance!
1753 static __m256
1754 gmx_mm256_cos_ps(__m256 x)
1756 __m256 s,c;
1757 gmx_mm256_sincos_ps(x,&s,&c);
1758 return c;
1761 static __m128
1762 gmx_mm_cos_ps(__m128 x)
1764 __m128 s,c;
1765 gmx_mm_sincos_ps(x,&s,&c);
1766 return c;
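/* Usage note for the wrappers above (hypothetical variable names): when both
 * results are needed, prefer a single sincos call,
 *
 *     __m256 s, c;
 *     gmx_mm256_sincos_ps(x, &s, &c);
 *
 * over the pair s = gmx_mm256_sin_ps(x); c = gmx_mm256_cos_ps(x);, which runs the
 * full sincos evaluation twice.
 */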
1770 static __m256
1771 gmx_mm256_tan_ps(__m256 x)
1773 __m256 sinval,cosval;
1774 __m256 tanval;
1776 gmx_mm256_sincos_ps(x,&sinval,&cosval);
1778 tanval = _mm256_mul_ps(sinval,gmx_mm256_inv_ps(cosval));
1780 return tanval;
1783 static __m128
1784 gmx_mm_tan_ps(__m128 x)
1786 __m128 sinval,cosval;
1787 __m128 tanval;
1789 gmx_mm_sincos_ps(x,&sinval,&cosval);
1791 tanval = _mm_mul_ps(sinval,gmx_mm_inv_ps(cosval));
1793 return tanval;
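/* Usage sketch (hypothetical variables): if the sine and cosine are needed in
 * addition to the tangent, call sincos once and form the ratio directly, exactly
 * as gmx_mm_tan_ps() does internally with the Newton-Raphson refined reciprocal:
 *
 *     __m128 s, c, t;
 *     gmx_mm_sincos_ps(x, &s, &c);
 *     t = _mm_mul_ps(s, gmx_mm_inv_ps(c));
 */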
1797 static __m256
1798 gmx_mm256_asin_ps(__m256 x)
1800 const __m256 signmask = _mm256_castsi256_ps( _mm256_set1_epi32(0x7FFFFFFF) );
1801 const __m256 limitlow = _mm256_set1_ps(1e-4f);
1802 const __m256 half = _mm256_set1_ps(0.5f);
1803 const __m256 one = _mm256_set1_ps(1.0f);
1804 const __m256 halfpi = _mm256_set1_ps((float)M_PI/2.0f);
1806 const __m256 CC5 = _mm256_set1_ps(4.2163199048E-2f);
1807 const __m256 CC4 = _mm256_set1_ps(2.4181311049E-2f);
1808 const __m256 CC3 = _mm256_set1_ps(4.5470025998E-2f);
1809 const __m256 CC2 = _mm256_set1_ps(7.4953002686E-2f);
1810 const __m256 CC1 = _mm256_set1_ps(1.6666752422E-1f);
1812 __m256 sign;
1813 __m256 mask;
1814 __m256 xabs;
1815 __m256 z,z1,z2,q,q1,q2;
1816 __m256 pA,pB;
1818 sign = _mm256_andnot_ps(signmask,x);
1819 xabs = _mm256_and_ps(x,signmask);
1821 mask = _mm256_cmp_ps(xabs,half,_CMP_GT_OQ);
1823 z1 = _mm256_mul_ps(half, _mm256_sub_ps(one,xabs));
1824 q1 = _mm256_mul_ps(z1,gmx_mm256_invsqrt_ps(z1));
1825 q1 = _mm256_andnot_ps(_mm256_cmp_ps(xabs,one,_CMP_EQ_OQ),q1);
1827 q2 = xabs;
1828 z2 = _mm256_mul_ps(q2,q2);
1830 z = _mm256_blendv_ps(z2,z1,mask);
1831 q = _mm256_blendv_ps(q2,q1,mask);
1833 z2 = _mm256_mul_ps(z,z);
1835 pA = _mm256_mul_ps(CC5,z2);
1836 pB = _mm256_mul_ps(CC4,z2);
1838 pA = _mm256_add_ps(pA,CC3);
1839 pB = _mm256_add_ps(pB,CC2);
1841 pA = _mm256_mul_ps(pA,z2);
1842 pB = _mm256_mul_ps(pB,z2);
1844 pA = _mm256_add_ps(pA,CC1);
1845 pA = _mm256_mul_ps(pA,z);
1847 z = _mm256_add_ps(pA,pB);
1848 z = _mm256_mul_ps(z,q);
1849 z = _mm256_add_ps(z,q);
1851 q2 = _mm256_sub_ps(halfpi,z);
1852 q2 = _mm256_sub_ps(q2,z);
1854 z = _mm256_blendv_ps(z,q2,mask);
1856 mask = _mm256_cmp_ps(xabs,limitlow,_CMP_GT_OQ);
1857 z = _mm256_blendv_ps(xabs,z,mask);
1859 z = _mm256_xor_ps(z,sign);
1861 return z;
1864 static __m128
1865 gmx_mm_asin_ps(__m128 x)
1867 /* Same algorithm as cephes library */
1868 const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
1869 const __m128 limitlow = _mm_set1_ps(1e-4f);
1870 const __m128 half = _mm_set1_ps(0.5f);
1871 const __m128 one = _mm_set1_ps(1.0f);
1872 const __m128 halfpi = _mm_set1_ps(M_PI/2.0f);
1874 const __m128 CC5 = _mm_set1_ps(4.2163199048E-2f);
1875 const __m128 CC4 = _mm_set1_ps(2.4181311049E-2f);
1876 const __m128 CC3 = _mm_set1_ps(4.5470025998E-2f);
1877 const __m128 CC2 = _mm_set1_ps(7.4953002686E-2f);
1878 const __m128 CC1 = _mm_set1_ps(1.6666752422E-1f);
1880 __m128 sign;
1881 __m128 mask;
1882 __m128 xabs;
1883 __m128 z,z1,z2,q,q1,q2;
1884 __m128 pA,pB;
1886 sign = _mm_andnot_ps(signmask,x);
1887 xabs = _mm_and_ps(x,signmask);
1889 mask = _mm_cmpgt_ps(xabs,half);
1891 z1 = _mm_mul_ps(half, _mm_sub_ps(one,xabs));
1892 q1 = _mm_mul_ps(z1,gmx_mm_invsqrt_ps(z1));
1893 q1 = _mm_andnot_ps(_mm_cmpeq_ps(xabs,one),q1);
1895 q2 = xabs;
1896 z2 = _mm_mul_ps(q2,q2);
1898 z = _mm_or_ps( _mm_and_ps(mask,z1) , _mm_andnot_ps(mask,z2) );
1899 q = _mm_or_ps( _mm_and_ps(mask,q1) , _mm_andnot_ps(mask,q2) );
1901 z2 = _mm_mul_ps(z,z);
1903 pA = _mm_mul_ps(CC5,z2);
1904 pB = _mm_mul_ps(CC4,z2);
1906 pA = _mm_add_ps(pA,CC3);
1907 pB = _mm_add_ps(pB,CC2);
1909 pA = _mm_mul_ps(pA,z2);
1910 pB = _mm_mul_ps(pB,z2);
1912 pA = _mm_add_ps(pA,CC1);
1913 pA = _mm_mul_ps(pA,z);
1915 z = _mm_add_ps(pA,pB);
1916 z = _mm_mul_ps(z,q);
1917 z = _mm_add_ps(z,q);
1919 q2 = _mm_sub_ps(halfpi,z);
1920 q2 = _mm_sub_ps(q2,z);
1922 z = _mm_or_ps( _mm_and_ps(mask,q2) , _mm_andnot_ps(mask,z) );
1924 mask = _mm_cmpgt_ps(xabs,limitlow);
1925 z = _mm_or_ps( _mm_and_ps(mask,z) , _mm_andnot_ps(mask,xabs) );
1927 z = _mm_xor_ps(z,sign);
1929 return z;
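/* Scalar sketch of the cephes-style arcsine used above. The helper name is an
 * assumption, and sqrtf() (assumed available from <math.h>) stands in for the
 * z*invsqrt(z) product and the |x|==1 masking done in the SIMD code.
 */
static float
gmx_asinf_sketch(float x)
{
    float a = (x < 0.0f) ? -x : x;
    float w, q, p;

    if (a > 0.5f)
    {
        /* asin(a) = pi/2 - 2*asin(sqrt((1-a)/2)) */
        w = 0.5f*(1.0f - a);
        q = sqrtf(w);
    }
    else
    {
        /* asin(a) = a + a*P(a^2) for |a| <= 0.5 */
        w = a*a;
        q = a;
    }
    p = ((((4.2163199048e-2f*w + 2.4181311049e-2f)*w + 4.5470025998e-2f)*w
          + 7.4953002686e-2f)*w + 1.6666752422e-1f)*w;
    p = q + q*p;
    if (a > 0.5f)
    {
        p = 0.5f*(float)M_PI - 2.0f*p;
    }
    if (a <= 1e-4f)
    {
        p = a;    /* below this limit asin(a) ~= a, as in the limitlow branch above */
    }
    return (x < 0.0f) ? -p : p;
}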
1933 static __m256
1934 gmx_mm256_acos_ps(__m256 x)
1936 const __m256 signmask = _mm256_castsi256_ps( _mm256_set1_epi32(0x7FFFFFFF) );
1937 const __m256 one_ps = _mm256_set1_ps(1.0f);
1938 const __m256 half_ps = _mm256_set1_ps(0.5f);
1939 const __m256 pi_ps = _mm256_set1_ps((float)M_PI);
1940 const __m256 halfpi_ps = _mm256_set1_ps((float)M_PI/2.0f);
1942 __m256 mask1;
1943 __m256 mask2;
1944 __m256 xabs;
1945 __m256 z,z1,z2,z3;
1947 xabs = _mm256_and_ps(x,signmask);
1948 mask1 = _mm256_cmp_ps(xabs,half_ps,_CMP_GT_OQ);
1949 mask2 = _mm256_cmp_ps(x,_mm256_setzero_ps(),_CMP_GT_OQ);
1951 z = _mm256_mul_ps(half_ps,_mm256_sub_ps(one_ps,xabs));
1952 z = _mm256_mul_ps(z,gmx_mm256_invsqrt_ps(z));
1953 z = _mm256_andnot_ps(_mm256_cmp_ps(xabs,one_ps,_CMP_EQ_OQ),z);
1955 z = _mm256_blendv_ps(x,z,mask1);
1956 z = gmx_mm256_asin_ps(z);
1958 z2 = _mm256_add_ps(z,z);
1959 z1 = _mm256_sub_ps(pi_ps,z2);
1960 z3 = _mm256_sub_ps(halfpi_ps,z);
1962 z = _mm256_blendv_ps(z1,z2,mask2);
1963 z = _mm256_blendv_ps(z3,z,mask1);
1965 return z;
1968 static __m128
1969 gmx_mm_acos_ps(__m128 x)
1971 const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
1972 const __m128 one_ps = _mm_set1_ps(1.0f);
1973 const __m128 half_ps = _mm_set1_ps(0.5f);
1974 const __m128 pi_ps = _mm_set1_ps(M_PI);
1975 const __m128 halfpi_ps = _mm_set1_ps(M_PI/2.0f);
1977 __m128 mask1;
1978 __m128 mask2;
1979 __m128 xabs;
1980 __m128 z,z1,z2,z3;
1982 xabs = _mm_and_ps(x,signmask);
1983 mask1 = _mm_cmpgt_ps(xabs,half_ps);
1984 mask2 = _mm_cmpgt_ps(x,_mm_setzero_ps());
1986 z = _mm_mul_ps(half_ps,_mm_sub_ps(one_ps,xabs));
1987 z = _mm_mul_ps(z,gmx_mm_invsqrt_ps(z));
1988 z = _mm_andnot_ps(_mm_cmpeq_ps(xabs,one_ps),z);
1990 z = _mm_blendv_ps(x,z,mask1);
1991 z = gmx_mm_asin_ps(z);
1993 z2 = _mm_add_ps(z,z);
1994 z1 = _mm_sub_ps(pi_ps,z2);
1995 z3 = _mm_sub_ps(halfpi_ps,z);
1997 z = _mm_blendv_ps(z1,z2,mask2);
1998 z = _mm_blendv_ps(z3,z,mask1);
2000 return z;
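/* Scalar sketch of the arccosine construction above, expressed through the
 * arcsine sketch (gmx_asinf_sketch and the use of sqrtf() are illustrative
 * assumptions; the SIMD code uses its own invsqrt-based asin).
 */
static float
gmx_acosf_sketch(float x)
{
    float a = (x < 0.0f) ? -x : x;

    if (a > 0.5f)
    {
        /* acos(|x|) = 2*asin(sqrt((1-|x|)/2)), reflected around pi for negative x */
        float s = gmx_asinf_sketch(sqrtf(0.5f*(1.0f - a)));
        return (x > 0.0f) ? 2.0f*s : (float)M_PI - 2.0f*s;
    }
    /* for |x| <= 0.5 simply use acos(x) = pi/2 - asin(x) */
    return 0.5f*(float)M_PI - gmx_asinf_sketch(x);
}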
2004 static __m256
2005 gmx_mm256_atan_ps(__m256 x)
2007 const __m256 signmask = _mm256_castsi256_ps( _mm256_set1_epi32(0x7FFFFFFF) );
2008 const __m256 limit1 = _mm256_set1_ps(0.414213562373095f);
2009 const __m256 limit2 = _mm256_set1_ps(2.414213562373095f);
2010 const __m256 quarterpi = _mm256_set1_ps(0.785398163397448f);
2011 const __m256 halfpi = _mm256_set1_ps(1.570796326794896f);
2012 const __m256 mone = _mm256_set1_ps(-1.0f);
2013 const __m256 CC3 = _mm256_set1_ps(-3.33329491539E-1f);
2014 const __m256 CC5 = _mm256_set1_ps(1.99777106478E-1f);
2015 const __m256 CC7 = _mm256_set1_ps(-1.38776856032E-1f);
2016 const __m256 CC9 = _mm256_set1_ps(8.05374449538e-2f);
2018 __m256 sign;
2019 __m256 mask1,mask2;
2020 __m256 y,z1,z2;
2021 __m256 x2,x4;
2022 __m256 sum1,sum2;
2024 sign = _mm256_andnot_ps(signmask,x);
2025 x = _mm256_and_ps(x,signmask);
2027 mask1 = _mm256_cmp_ps(x,limit1,_CMP_GT_OQ);
2028 mask2 = _mm256_cmp_ps(x,limit2,_CMP_GT_OQ);
2030 z1 = _mm256_mul_ps(_mm256_add_ps(x,mone),gmx_mm256_inv_ps(_mm256_sub_ps(x,mone)));
2031 z2 = _mm256_mul_ps(mone,gmx_mm256_inv_ps(x));
2033 y = _mm256_and_ps(mask1,quarterpi);
2034 y = _mm256_blendv_ps(y,halfpi,mask2);
2036 x = _mm256_blendv_ps(x,z1,mask1);
2037 x = _mm256_blendv_ps(x,z2,mask2);
2039 x2 = _mm256_mul_ps(x,x);
2040 x4 = _mm256_mul_ps(x2,x2);
2042 sum1 = _mm256_mul_ps(CC9,x4);
2043 sum2 = _mm256_mul_ps(CC7,x4);
2044 sum1 = _mm256_add_ps(sum1,CC5);
2045 sum2 = _mm256_add_ps(sum2,CC3);
2046 sum1 = _mm256_mul_ps(sum1,x4);
2047 sum2 = _mm256_mul_ps(sum2,x2);
2049 sum1 = _mm256_add_ps(sum1,sum2);
2050 sum1 = _mm256_sub_ps(sum1,mone);
2051 sum1 = _mm256_mul_ps(sum1,x);
2052 y = _mm256_add_ps(y,sum1);
2054 y = _mm256_xor_ps(y,sign);
2056 return y;
2059 static __m128
2060 gmx_mm_atan_ps(__m128 x)
2062 /* Same algorithm as cephes library */
2063 const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
2064 const __m128 limit1 = _mm_set1_ps(0.414213562373095f);
2065 const __m128 limit2 = _mm_set1_ps(2.414213562373095f);
2066 const __m128 quarterpi = _mm_set1_ps(0.785398163397448f);
2067 const __m128 halfpi = _mm_set1_ps(1.570796326794896f);
2068 const __m128 mone = _mm_set1_ps(-1.0f);
2069 const __m128 CC3 = _mm_set1_ps(-3.33329491539E-1f);
2070 const __m128 CC5 = _mm_set1_ps(1.99777106478E-1f);
2071 const __m128 CC7 = _mm_set1_ps(-1.38776856032E-1f);
2072 const __m128 CC9 = _mm_set1_ps(8.05374449538e-2f);
2074 __m128 sign;
2075 __m128 mask1,mask2;
2076 __m128 y,z1,z2;
2077 __m128 x2,x4;
2078 __m128 sum1,sum2;
2080 sign = _mm_andnot_ps(signmask,x);
2081 x = _mm_and_ps(x,signmask);
2083 mask1 = _mm_cmpgt_ps(x,limit1);
2084 mask2 = _mm_cmpgt_ps(x,limit2);
2086 z1 = _mm_mul_ps(_mm_add_ps(x,mone),gmx_mm_inv_ps(_mm_sub_ps(x,mone)));
2087 z2 = _mm_mul_ps(mone,gmx_mm_inv_ps(x));
2089 y = _mm_and_ps(mask1,quarterpi);
2090 y = _mm_blendv_ps(y,halfpi,mask2);
2092 x = _mm_blendv_ps(x,z1,mask1);
2093 x = _mm_blendv_ps(x,z2,mask2);
2095 x2 = _mm_mul_ps(x,x);
2096 x4 = _mm_mul_ps(x2,x2);
2098 sum1 = _mm_mul_ps(CC9,x4);
2099 sum2 = _mm_mul_ps(CC7,x4);
2100 sum1 = _mm_add_ps(sum1,CC5);
2101 sum2 = _mm_add_ps(sum2,CC3);
2102 sum1 = _mm_mul_ps(sum1,x4);
2103 sum2 = _mm_mul_ps(sum2,x2);
2105 sum1 = _mm_add_ps(sum1,sum2);
2106 sum1 = _mm_sub_ps(sum1,mone);
2107 sum1 = _mm_mul_ps(sum1,x);
2108 y = _mm_add_ps(y,sum1);
2110 y = _mm_xor_ps(y,sign);
2112 return y;
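/* Scalar sketch of the cephes-style arctangent range reduction used above (the
 * helper name is an assumption): arguments above tan(3*pi/8) are folded with
 * atan(a) = pi/2 - atan(1/a), arguments above tan(pi/8) with
 * atan(a) = pi/4 + atan((a-1)/(a+1)), and the remainder is handled by the odd
 * polynomial with the same coefficients as above.
 */
static float
gmx_atanf_sketch(float x)
{
    float sign = (x < 0.0f) ? -1.0f : 1.0f;
    float a    = sign*x;
    float y    = 0.0f;
    float a2;

    if (a > 2.414213562373095f)        /* tan(3*pi/8) */
    {
        y = 1.570796326794896f;
        a = -1.0f/a;
    }
    else if (a > 0.414213562373095f)   /* tan(pi/8) */
    {
        y = 0.785398163397448f;
        a = (a - 1.0f)/(a + 1.0f);
    }
    a2 = a*a;
    y += ((((8.05374449538e-2f*a2 - 1.38776856032e-1f)*a2
            + 1.99777106478e-1f)*a2 - 3.33329491539e-1f)*a2 + 1.0f)*a;

    return sign*y;
}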
2116 static __m256
2117 gmx_mm256_atan2_ps(__m256 y, __m256 x)
2119 const __m256 pi = _mm256_set1_ps( (float) M_PI);
2120 const __m256 minuspi = _mm256_set1_ps( (float) -M_PI);
2121 const __m256 halfpi = _mm256_set1_ps( (float) M_PI/2.0f);
2122 const __m256 minushalfpi = _mm256_set1_ps( (float) -M_PI/2.0f);
2124 __m256 z,z1,z3,z4;
2125 __m256 w;
2126 __m256 maskx_lt,maskx_eq;
2127 __m256 masky_lt,masky_eq;
2128 __m256 mask1,mask2,mask3,mask4,maskall;
2130 maskx_lt = _mm256_cmp_ps(x,_mm256_setzero_ps(),_CMP_LT_OQ);
2131 masky_lt = _mm256_cmp_ps(y,_mm256_setzero_ps(),_CMP_LT_OQ);
2132 maskx_eq = _mm256_cmp_ps(x,_mm256_setzero_ps(),_CMP_EQ_OQ);
2133 masky_eq = _mm256_cmp_ps(y,_mm256_setzero_ps(),_CMP_EQ_OQ);
2135 z = _mm256_mul_ps(y,gmx_mm256_inv_ps(x));
2136 z = gmx_mm256_atan_ps(z);
2138 mask1 = _mm256_and_ps(maskx_eq,masky_lt);
2139 mask2 = _mm256_andnot_ps(maskx_lt,masky_eq);
2140 mask3 = _mm256_andnot_ps( _mm256_or_ps(masky_lt,masky_eq) , maskx_eq);
2141 mask4 = _mm256_and_ps(maskx_lt,masky_eq);
2142 maskall = _mm256_or_ps( _mm256_or_ps(mask1,mask2), _mm256_or_ps(mask3,mask4) );
2144 z = _mm256_andnot_ps(maskall,z);
2145 z1 = _mm256_and_ps(mask1,minushalfpi);
2146 z3 = _mm256_and_ps(mask3,halfpi);
2147 z4 = _mm256_and_ps(mask4,pi);
2149 z = _mm256_or_ps( _mm256_or_ps(z,z1), _mm256_or_ps(z3,z4) );
2151 w = _mm256_blendv_ps(pi,minuspi,masky_lt);
2152 w = _mm256_and_ps(w,maskx_lt);
2154 w = _mm256_andnot_ps(maskall,w);
2156 z = _mm256_add_ps(z,w);
2158 return z;
2161 static __m128
2162 gmx_mm_atan2_ps(__m128 y, __m128 x)
2164 const __m128 pi = _mm_set1_ps(M_PI);
2165 const __m128 minuspi = _mm_set1_ps(-M_PI);
2166 const __m128 halfpi = _mm_set1_ps(M_PI/2.0);
2167 const __m128 minushalfpi = _mm_set1_ps(-M_PI/2.0);
2169 __m128 z,z1,z3,z4;
2170 __m128 w;
2171 __m128 maskx_lt,maskx_eq;
2172 __m128 masky_lt,masky_eq;
2173 __m128 mask1,mask2,mask3,mask4,maskall;
2175 maskx_lt = _mm_cmplt_ps(x,_mm_setzero_ps());
2176 masky_lt = _mm_cmplt_ps(y,_mm_setzero_ps());
2177 maskx_eq = _mm_cmpeq_ps(x,_mm_setzero_ps());
2178 masky_eq = _mm_cmpeq_ps(y,_mm_setzero_ps());
2180 z = _mm_mul_ps(y,gmx_mm_inv_ps(x));
2181 z = gmx_mm_atan_ps(z);
2183 mask1 = _mm_and_ps(maskx_eq,masky_lt);
2184 mask2 = _mm_andnot_ps(maskx_lt,masky_eq);
2185 mask3 = _mm_andnot_ps( _mm_or_ps(masky_lt,masky_eq) , maskx_eq);
2186 mask4 = _mm_and_ps(masky_eq,maskx_lt);
2188 maskall = _mm_or_ps( _mm_or_ps(mask1,mask2), _mm_or_ps(mask3,mask4) );
2190 z = _mm_andnot_ps(maskall,z);
2191 z1 = _mm_and_ps(mask1,minushalfpi);
2192 z3 = _mm_and_ps(mask3,halfpi);
2193 z4 = _mm_and_ps(mask4,pi);
2195 z = _mm_or_ps( _mm_or_ps(z,z1), _mm_or_ps(z3,z4) );
2197 mask1 = _mm_andnot_ps(masky_lt,maskx_lt);
2198 mask2 = _mm_and_ps(maskx_lt,masky_lt);
2200 w = _mm_or_ps( _mm_and_ps(mask1,pi), _mm_and_ps(mask2,minuspi) );
2201 w = _mm_andnot_ps(maskall,w);
2203 z = _mm_add_ps(z,w);
2205 return z;
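/* Scalar sketch of the quadrant handling in the atan2 routines above (helper
 * names are assumptions): the x==0 and y==0 lanes get the same fixed values as
 * the masked special cases, and for x<0 the principal atan(y/x) is shifted by
 * +/- pi according to the sign of y.
 */
static float
gmx_atan2f_sketch(float y, float x)
{
    if (x == 0.0f)
    {
        return (y > 0.0f) ?  0.5f*(float)M_PI :
               (y < 0.0f) ? -0.5f*(float)M_PI : 0.0f;
    }
    if (y == 0.0f)
    {
        return (x < 0.0f) ? (float)M_PI : 0.0f;
    }
    return gmx_atanf_sketch(y/x)
           + ((x < 0.0f) ? ((y < 0.0f) ? -(float)M_PI : (float)M_PI) : 0.0f);
}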
2208 #endif /* _gmx_math_x86_avx_256_single_h_ */