1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013, by the GROMACS development team, led by
5 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
6 * others, as listed in the AUTHORS file in the top-level source
7 * directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
35 #ifndef _gmx_math_x86_avx_256_single_h_
36 #define _gmx_math_x86_avx_256_single_h_
38 #include <math.h>
40 #include "gmx_x86_avx_256.h"
42 #ifndef M_PI
43 # define M_PI 3.14159265358979323846264338327950288
44 #endif
48 /************************
49 * *
50 * Simple math routines *
51 * *
52 ************************/
54 /* 1.0/sqrt(x), 256-bit wide version */
55 static gmx_inline __m256
56 gmx_mm256_invsqrt_ps(__m256 x)
58 const __m256 half = _mm256_set1_ps(0.5f);
59 const __m256 three = _mm256_set1_ps(3.0f);
61 __m256 lu = _mm256_rsqrt_ps(x);
63 return _mm256_mul_ps(half, _mm256_mul_ps(_mm256_sub_ps(three, _mm256_mul_ps(_mm256_mul_ps(lu, lu), x)), lu));
66 /* 1.0/sqrt(x), 128-bit wide version */
67 static gmx_inline __m128
68 gmx_mm_invsqrt_ps(__m128 x)
70 const __m128 half = _mm_set_ps(0.5, 0.5, 0.5, 0.5);
71 const __m128 three = _mm_set_ps(3.0, 3.0, 3.0, 3.0);
73 __m128 lu = _mm_rsqrt_ps(x);
75 return _mm_mul_ps(half, _mm_mul_ps(_mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(lu, lu), x)), lu));
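/* Illustrative addition (not part of the original header): the two invsqrt
 * routines above refine the roughly 12-bit estimate from _mm256_rsqrt_ps() /
 * _mm_rsqrt_ps() with one Newton-Raphson step, lu' = 0.5*lu*(3 - x*lu*lu),
 * which brings the result close to full single precision. A minimal,
 * hypothetical usage sketch (the function name and test data are made up):
 */
static void
example_gmx_mm_invsqrt_usage(void)
{
    float  in[4] = { 1.0f, 4.0f, 9.0f, 16.0f };
    float  out[4];
    __m128 x     = _mm_loadu_ps(in);
    __m128 rinv  = gmx_mm_invsqrt_ps(x);   /* approx 1/sqrt(in[i]): 1.0, 0.5, 0.333..., 0.25 */

    _mm_storeu_ps(out, rinv);
}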
79 /* sqrt(x) (256 bit) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
80 static gmx_inline __m256
81 gmx_mm256_sqrt_ps(__m256 x)
83 __m256 mask;
84 __m256 res;
86 mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_EQ_OQ);
87 res = _mm256_andnot_ps(mask, gmx_mm256_invsqrt_ps(x));
89 res = _mm256_mul_ps(x, res);
91 return res;
94 /* sqrt(x) (128 bit) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
95 static gmx_inline __m128
96 gmx_mm_sqrt_ps(__m128 x)
98 __m128 mask;
99 __m128 res;
101 mask = _mm_cmpeq_ps(x, _mm_setzero_ps());
102 res = _mm_andnot_ps(mask, gmx_mm_invsqrt_ps(x));
104 res = _mm_mul_ps(x, res);
106 return res;
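/* Illustrative addition (not part of the original header): the comparison mask
 * in the two sqrt routines above zeroes the result for x == 0, because
 * invsqrt(0) is +inf and 0*inf would otherwise produce NaN. A hypothetical
 * scalar analogue of the same guard:
 */
static float
example_scalar_sqrt_via_invsqrt(float x)
{
    /* sqrt(x) computed as x*(1/sqrt(x)), with the x == 0 case masked to zero */
    return (x == 0.0f) ? 0.0f : x * (1.0f / sqrtf(x));
}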
110 /* 1.0/x, 256-bit wide */
111 static gmx_inline __m256
112 gmx_mm256_inv_ps(__m256 x)
114 const __m256 two = _mm256_set1_ps(2.0f);
116 __m256 lu = _mm256_rcp_ps(x);
118 return _mm256_mul_ps(lu, _mm256_sub_ps(two, _mm256_mul_ps(lu, x)));
121 /* 1.0/x, 128-bit wide */
122 static gmx_inline __m128
123 gmx_mm_inv_ps(__m128 x)
125 const __m128 two = _mm_set_ps(2.0f, 2.0f, 2.0f, 2.0f);
127 __m128 lu = _mm_rcp_ps(x);
129 return _mm_mul_ps(lu, _mm_sub_ps(two, _mm_mul_ps(lu, x)));
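/* Illustrative addition (not part of the original header): the reciprocal
 * routines above refine the roughly 12-bit _mm256_rcp_ps()/_mm_rcp_ps()
 * estimate with one Newton-Raphson step, lu' = lu*(2 - lu*x). A hypothetical
 * scalar sketch of that single iteration, starting from an estimate lu:
 */
static float
example_scalar_rcp_refine(float x, float lu)
{
    /* one Newton-Raphson step for f(y) = 1/y - x, converging towards 1/x */
    return lu * (2.0f - lu * x);
}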
133 static gmx_inline __m256
134 gmx_mm256_abs_ps(__m256 x)
136 const __m256 signmask = _mm256_castsi256_ps( _mm256_set1_epi32(0x7FFFFFFF) );
138 return _mm256_and_ps(x, signmask);
141 static gmx_inline __m128
142 gmx_mm_abs_ps(__m128 x)
144 const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
146 return _mm_and_ps(x, signmask);
150 static __m256
151 gmx_mm256_log_ps(__m256 x)
153 const __m256 expmask = _mm256_castsi256_ps( _mm256_set1_epi32(0x7F800000) );
154 const __m128i expbase_m1 = _mm_set1_epi32(127-1); /* We want non-IEEE format */
155 const __m256 half = _mm256_set1_ps(0.5f);
156 const __m256 one = _mm256_set1_ps(1.0f);
157 const __m256 invsq2 = _mm256_set1_ps(1.0f/sqrt(2.0f));
158 const __m256 corr1 = _mm256_set1_ps(-2.12194440e-4f);
159 const __m256 corr2 = _mm256_set1_ps(0.693359375f);
161 const __m256 CA_1 = _mm256_set1_ps(0.070376836292f);
162 const __m256 CB_0 = _mm256_set1_ps(1.6714950086782716f);
163 const __m256 CB_1 = _mm256_set1_ps(-2.452088066061482f);
164 const __m256 CC_0 = _mm256_set1_ps(1.5220770854701728f);
165 const __m256 CC_1 = _mm256_set1_ps(-1.3422238433233642f);
166 const __m256 CD_0 = _mm256_set1_ps(1.386218787509749f);
167 const __m256 CD_1 = _mm256_set1_ps(0.35075468953796346f);
168 const __m256 CE_0 = _mm256_set1_ps(1.3429983063133937f);
169 const __m256 CE_1 = _mm256_set1_ps(1.807420826584643f);
171 __m256 fexp, fexp1;
172 __m256i iexp;
173 __m128i iexp128a, iexp128b;
174 __m256 mask;
175 __m256i imask;
176 __m128i imask128a, imask128b;
177 __m256 x1, x2;
178 __m256 y;
179 __m256 pA, pB, pC, pD, pE, tB, tC, tD, tE;
181 /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
182 fexp = _mm256_and_ps(x, expmask);
183 iexp = _mm256_castps_si256(fexp);
185 iexp128b = _mm256_extractf128_si256(iexp, 0x1);
186 iexp128a = _mm256_castsi256_si128(iexp);
188 iexp128a = _mm_srli_epi32(iexp128a, 23);
189 iexp128b = _mm_srli_epi32(iexp128b, 23);
190 iexp128a = _mm_sub_epi32(iexp128a, expbase_m1);
191 iexp128b = _mm_sub_epi32(iexp128b, expbase_m1);
193 x = _mm256_andnot_ps(expmask, x);
194 x = _mm256_or_ps(x, one);
195 x = _mm256_mul_ps(x, half);
197 mask = _mm256_cmp_ps(x, invsq2, _CMP_LT_OQ);
199 x = _mm256_add_ps(x, _mm256_and_ps(mask, x));
200 x = _mm256_sub_ps(x, one);
202 imask = _mm256_castps_si256(mask);
204 imask128b = _mm256_extractf128_si256(imask, 0x1);
205 imask128a = _mm256_castsi256_si128(imask);
207 iexp128a = _mm_add_epi32(iexp128a, imask128a);
208 iexp128b = _mm_add_epi32(iexp128b, imask128b);
210 iexp = _mm256_castsi128_si256(iexp128a);
211 iexp = _mm256_insertf128_si256(iexp, iexp128b, 0x1);
213 x2 = _mm256_mul_ps(x, x);
215 pA = _mm256_mul_ps(CA_1, x);
216 pB = _mm256_mul_ps(CB_1, x);
217 pC = _mm256_mul_ps(CC_1, x);
218 pD = _mm256_mul_ps(CD_1, x);
219 pE = _mm256_mul_ps(CE_1, x);
220 tB = _mm256_add_ps(CB_0, x2);
221 tC = _mm256_add_ps(CC_0, x2);
222 tD = _mm256_add_ps(CD_0, x2);
223 tE = _mm256_add_ps(CE_0, x2);
224 pB = _mm256_add_ps(pB, tB);
225 pC = _mm256_add_ps(pC, tC);
226 pD = _mm256_add_ps(pD, tD);
227 pE = _mm256_add_ps(pE, tE);
229 pA = _mm256_mul_ps(pA, pB);
230 pC = _mm256_mul_ps(pC, pD);
231 pE = _mm256_mul_ps(pE, x2);
232 pA = _mm256_mul_ps(pA, pC);
233 y = _mm256_mul_ps(pA, pE);
235 fexp = _mm256_cvtepi32_ps(iexp);
236 y = _mm256_add_ps(y, _mm256_mul_ps(fexp, corr1));
238 y = _mm256_sub_ps(y, _mm256_mul_ps(half, x2));
239 x2 = _mm256_add_ps(x, y);
241 x2 = _mm256_add_ps(x2, _mm256_mul_ps(fexp, corr2));
243 return x2;
247 static __m128
248 gmx_mm_log_ps(__m128 x)
250 /* Same algorithm as cephes library */
251 const __m128 expmask = gmx_mm_castsi128_ps( _mm_set_epi32(0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000) );
252 const __m128i expbase_m1 = _mm_set1_epi32(127-1); /* We want non-IEEE format */
253 const __m128 half = _mm_set1_ps(0.5f);
254 const __m128 one = _mm_set1_ps(1.0f);
255 const __m128 invsq2 = _mm_set1_ps(1.0f/sqrt(2.0f));
256 const __m128 corr1 = _mm_set1_ps(-2.12194440e-4f);
257 const __m128 corr2 = _mm_set1_ps(0.693359375f);
259 const __m128 CA_1 = _mm_set1_ps(0.070376836292f);
260 const __m128 CB_0 = _mm_set1_ps(1.6714950086782716f);
261 const __m128 CB_1 = _mm_set1_ps(-2.452088066061482f);
262 const __m128 CC_0 = _mm_set1_ps(1.5220770854701728f);
263 const __m128 CC_1 = _mm_set1_ps(-1.3422238433233642f);
264 const __m128 CD_0 = _mm_set1_ps(1.386218787509749f);
265 const __m128 CD_1 = _mm_set1_ps(0.35075468953796346f);
266 const __m128 CE_0 = _mm_set1_ps(1.3429983063133937f);
267 const __m128 CE_1 = _mm_set1_ps(1.807420826584643f);
269 __m128 fexp, fexp1;
270 __m128i iexp;
271 __m128 mask;
272 __m128 x1, x2;
273 __m128 y;
274 __m128 pA, pB, pC, pD, pE, tB, tC, tD, tE;
276 /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
277 fexp = _mm_and_ps(x, expmask);
278 iexp = gmx_mm_castps_si128(fexp);
279 iexp = _mm_srli_epi32(iexp, 23);
280 iexp = _mm_sub_epi32(iexp, expbase_m1);
282 x = _mm_andnot_ps(expmask, x);
283 x = _mm_or_ps(x, one);
284 x = _mm_mul_ps(x, half);
286 mask = _mm_cmplt_ps(x, invsq2);
288 x = _mm_add_ps(x, _mm_and_ps(mask, x));
289 x = _mm_sub_ps(x, one);
290 iexp = _mm_add_epi32(iexp, gmx_mm_castps_si128(mask)); /* 0xFFFFFFFF = -1 as int */
292 x2 = _mm_mul_ps(x, x);
294 pA = _mm_mul_ps(CA_1, x);
295 pB = _mm_mul_ps(CB_1, x);
296 pC = _mm_mul_ps(CC_1, x);
297 pD = _mm_mul_ps(CD_1, x);
298 pE = _mm_mul_ps(CE_1, x);
299 tB = _mm_add_ps(CB_0, x2);
300 tC = _mm_add_ps(CC_0, x2);
301 tD = _mm_add_ps(CD_0, x2);
302 tE = _mm_add_ps(CE_0, x2);
303 pB = _mm_add_ps(pB, tB);
304 pC = _mm_add_ps(pC, tC);
305 pD = _mm_add_ps(pD, tD);
306 pE = _mm_add_ps(pE, tE);
308 pA = _mm_mul_ps(pA, pB);
309 pC = _mm_mul_ps(pC, pD);
310 pE = _mm_mul_ps(pE, x2);
311 pA = _mm_mul_ps(pA, pC);
312 y = _mm_mul_ps(pA, pE);
314 fexp = _mm_cvtepi32_ps(iexp);
315 y = _mm_add_ps(y, _mm_mul_ps(fexp, corr1));
317 y = _mm_sub_ps(y, _mm_mul_ps(half, x2));
318 x2 = _mm_add_ps(x, y);
320 x2 = _mm_add_ps(x2, _mm_mul_ps(fexp, corr2));
322 return x2;
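/* Illustrative addition (not part of the original header): a hypothetical
 * scalar sketch of the Cephes-style range reduction used by the two log
 * routines above. The argument is split as arg = m * 2^e with m in [0.5,1);
 * when m < 1/sqrt(2) the mantissa is doubled and the exponent decremented so
 * that the polynomial argument m-1 stays small, and ln(2) is applied as the
 * two constants 0.693359375 and -2.12194440e-4 to limit rounding error.
 * libm's logf() stands in for the polynomial itself:
 */
static float
example_scalar_log_reduction(float arg)
{
    int   e;
    float m = frexpf(arg, &e);       /* arg = m * 2^e, with m in [0.5,1) */

    if (m < 0.70710678f)             /* m < 1/sqrt(2): shift one power of two */
    {
        m *= 2.0f;
        e -= 1;
    }
    /* The SIMD code evaluates a minimax polynomial in (m-1) here; ln(2) is
     * added in two parts, exactly as with corr1/corr2 above.
     */
    return (logf(m) + (float)e * (-2.12194440e-4f)) + (float)e * 0.693359375f;
}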
327 * 2^x function, 256-bit wide
329  * The 2^w term is calculated from a (6,0)-th order (no denominator) minimax polynomial on the interval
330  * [-0.5,0.5]. The coefficients were derived in Mathematica using the command:
332 * MiniMaxApproximation[(2^x), {x, {-0.5, 0.5}, 6, 0}, WorkingPrecision -> 15]
334 * The largest-magnitude exponent we can represent in IEEE single-precision binary format
335 * is 2^-126 for small numbers and 2^127 for large ones. To avoid wrap-around problems, we set the
336 * result to zero if the argument falls outside this range. For small numbers this is just fine, but
337 * for large numbers you could be fancy and return the smallest/largest IEEE single-precision
338 * number instead. That would take a few extra cycles and not really help, since something is
339 * wrong if you are using single precision to work with numbers that cannot really be represented
340 * in single precision.
342 * The accuracy is at least 23 bits.
344 static __m256
345 gmx_mm256_exp2_ps(__m256 x)
347 /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
348 const __m256 arglimit = _mm256_set1_ps(126.0f);
350 const __m128i expbase = _mm_set1_epi32(127);
351 const __m256 CC6 = _mm256_set1_ps(1.535336188319500E-004);
352 const __m256 CC5 = _mm256_set1_ps(1.339887440266574E-003);
353 const __m256 CC4 = _mm256_set1_ps(9.618437357674640E-003);
354 const __m256 CC3 = _mm256_set1_ps(5.550332471162809E-002);
355 const __m256 CC2 = _mm256_set1_ps(2.402264791363012E-001);
356 const __m256 CC1 = _mm256_set1_ps(6.931472028550421E-001);
357 const __m256 CC0 = _mm256_set1_ps(1.0f);
359 __m256 p0, p1;
360 __m256 valuemask;
361 __m256i iexppart;
362 __m128i iexppart128a, iexppart128b;
363 __m256 fexppart;
364 __m256 intpart;
365 __m256 x2;
368 iexppart = _mm256_cvtps_epi32(x);
369 intpart = _mm256_round_ps(x, _MM_FROUND_TO_NEAREST_INT);
371 iexppart128b = _mm256_extractf128_si256(iexppart, 0x1);
372 iexppart128a = _mm256_castsi256_si128(iexppart);
374 iexppart128a = _mm_slli_epi32(_mm_add_epi32(iexppart128a, expbase), 23);
375 iexppart128b = _mm_slli_epi32(_mm_add_epi32(iexppart128b, expbase), 23);
377 iexppart = _mm256_castsi128_si256(iexppart128a);
378 iexppart = _mm256_insertf128_si256(iexppart, iexppart128b, 0x1);
379 valuemask = _mm256_cmp_ps(arglimit, gmx_mm256_abs_ps(x), _CMP_GE_OQ);
380 fexppart = _mm256_and_ps(valuemask, _mm256_castsi256_ps(iexppart));
382 x = _mm256_sub_ps(x, intpart);
383 x2 = _mm256_mul_ps(x, x);
385 p0 = _mm256_mul_ps(CC6, x2);
386 p1 = _mm256_mul_ps(CC5, x2);
387 p0 = _mm256_add_ps(p0, CC4);
388 p1 = _mm256_add_ps(p1, CC3);
389 p0 = _mm256_mul_ps(p0, x2);
390 p1 = _mm256_mul_ps(p1, x2);
391 p0 = _mm256_add_ps(p0, CC2);
392 p1 = _mm256_add_ps(p1, CC1);
393 p0 = _mm256_mul_ps(p0, x2);
394 p1 = _mm256_mul_ps(p1, x);
395 p0 = _mm256_add_ps(p0, CC0);
396 p0 = _mm256_add_ps(p0, p1);
397 x = _mm256_mul_ps(p0, fexppart);
399 return x;
403 /* 2^x, 128 bit wide */
404 static __m128
405 gmx_mm_exp2_ps(__m128 x)
407 /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
408 const __m128 arglimit = _mm_set1_ps(126.0f);
410 const __m128i expbase = _mm_set1_epi32(127);
411 const __m128 CA6 = _mm_set1_ps(1.535336188319500E-004);
412 const __m128 CA5 = _mm_set1_ps(1.339887440266574E-003);
413 const __m128 CA4 = _mm_set1_ps(9.618437357674640E-003);
414 const __m128 CA3 = _mm_set1_ps(5.550332471162809E-002);
415 const __m128 CA2 = _mm_set1_ps(2.402264791363012E-001);
416 const __m128 CA1 = _mm_set1_ps(6.931472028550421E-001);
417 const __m128 CA0 = _mm_set1_ps(1.0f);
419 __m128 valuemask;
420 __m128i iexppart;
421 __m128 fexppart;
422 __m128 intpart;
423 __m128 x2;
424 __m128 p0, p1;
426 iexppart = _mm_cvtps_epi32(x);
427 intpart = _mm_round_ps(x, _MM_FROUND_TO_NEAREST_INT);
428 iexppart = _mm_slli_epi32(_mm_add_epi32(iexppart, expbase), 23);
429 valuemask = _mm_cmpge_ps(arglimit, gmx_mm_abs_ps(x));
430 fexppart = _mm_and_ps(valuemask, gmx_mm_castsi128_ps(iexppart));
432 x = _mm_sub_ps(x, intpart);
433 x2 = _mm_mul_ps(x, x);
435 p0 = _mm_mul_ps(CA6, x2);
436 p1 = _mm_mul_ps(CA5, x2);
437 p0 = _mm_add_ps(p0, CA4);
438 p1 = _mm_add_ps(p1, CA3);
439 p0 = _mm_mul_ps(p0, x2);
440 p1 = _mm_mul_ps(p1, x2);
441 p0 = _mm_add_ps(p0, CA2);
442 p1 = _mm_add_ps(p1, CA1);
443 p0 = _mm_mul_ps(p0, x2);
444 p1 = _mm_mul_ps(p1, x);
445 p0 = _mm_add_ps(p0, CA0);
446 p0 = _mm_add_ps(p0, p1);
447 x = _mm_mul_ps(p0, fexppart);
449 return x;
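/* Illustrative addition (not part of the original header): a hypothetical
 * scalar sketch of the 2^x decomposition used above. The argument is split
 * into an integer part n = round(x) and a remainder w = x - n in [-0.5,0.5];
 * 2^n is built directly in the IEEE-754 exponent field as (n+127)<<23, and
 * 2^w comes from the minimax polynomial. exp2f() from C99 libm stands in for
 * that polynomial, and n is assumed to stay within the |x| <= 126 range
 * enforced by arglimit above:
 */
static float
example_scalar_exp2_split(float x)
{
    int   n = (int)floorf(x + 0.5f);     /* round to nearest integer */
    float w = x - (float)n;              /* remainder in [-0.5,0.5]  */
    union { int i; float f; } pow2n;

    pow2n.i = (n + 127) << 23;           /* 2^n assembled as an IEEE-754 float */

    return pow2n.f * exp2f(w);           /* polynomial replaced by libm here   */
}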
453 /* Exponential function, 256 bit wide. This could be calculated from 2^x as Exp(x)=2^(y),
454 * where y=log2(e)*x, but there will then be a small rounding error since we lose some
455 * precision due to the multiplication. This will then be magnified a lot by the exponential.
457 * Instead, we calculate the fractional part directly as a minimax approximation of
458  * Exp(z) on [-0.5,0.5]. We use extended precision arithmetic to calculate the fraction
459  * remaining after 2^y, which avoids the precision loss.
460 * The final result is correct to within 1 LSB over the entire argument range.
462 static __m256
463 gmx_mm256_exp_ps(__m256 exparg)
465 const __m256 argscale = _mm256_set1_ps(1.44269504088896341f);
466 /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
467 const __m256 arglimit = _mm256_set1_ps(126.0f);
468 const __m128i expbase = _mm_set1_epi32(127);
470 const __m256 invargscale0 = _mm256_set1_ps(0.693359375f);
471 const __m256 invargscale1 = _mm256_set1_ps(-2.12194440e-4f);
473 const __m256 CE5 = _mm256_set1_ps(1.9875691500e-4f);
474 const __m256 CE4 = _mm256_set1_ps(1.3981999507e-3f);
475 const __m256 CE3 = _mm256_set1_ps(8.3334519073e-3f);
476 const __m256 CE2 = _mm256_set1_ps(4.1665795894e-2f);
477 const __m256 CE1 = _mm256_set1_ps(1.6666665459e-1f);
478 const __m256 CE0 = _mm256_set1_ps(5.0000001201e-1f);
479 const __m256 one = _mm256_set1_ps(1.0f);
481 __m256 exparg2, exp2arg;
482 __m256 pE0, pE1;
483 __m256 valuemask;
484 __m256i iexppart;
485 __m128i iexppart128a, iexppart128b;
486 __m256 fexppart;
487 __m256 intpart;
489 exp2arg = _mm256_mul_ps(exparg, argscale);
491 iexppart = _mm256_cvtps_epi32(exp2arg);
492 intpart = _mm256_round_ps(exp2arg, _MM_FROUND_TO_NEAREST_INT);
494 iexppart128b = _mm256_extractf128_si256(iexppart, 0x1);
495 iexppart128a = _mm256_castsi256_si128(iexppart);
497 iexppart128a = _mm_slli_epi32(_mm_add_epi32(iexppart128a, expbase), 23);
498 iexppart128b = _mm_slli_epi32(_mm_add_epi32(iexppart128b, expbase), 23);
500 iexppart = _mm256_castsi128_si256(iexppart128a);
501 iexppart = _mm256_insertf128_si256(iexppart, iexppart128b, 0x1);
502 valuemask = _mm256_cmp_ps(arglimit, gmx_mm256_abs_ps(exp2arg), _CMP_GE_OQ);
503 fexppart = _mm256_and_ps(valuemask, _mm256_castsi256_ps(iexppart));
505         /* Extended precision arithmetic */
506 exparg = _mm256_sub_ps(exparg, _mm256_mul_ps(invargscale0, intpart));
507 exparg = _mm256_sub_ps(exparg, _mm256_mul_ps(invargscale1, intpart));
509 exparg2 = _mm256_mul_ps(exparg, exparg);
511 pE1 = _mm256_mul_ps(CE5, exparg2);
512 pE0 = _mm256_mul_ps(CE4, exparg2);
513 pE1 = _mm256_add_ps(pE1, CE3);
514 pE0 = _mm256_add_ps(pE0, CE2);
515 pE1 = _mm256_mul_ps(pE1, exparg2);
516 pE0 = _mm256_mul_ps(pE0, exparg2);
517 pE1 = _mm256_add_ps(pE1, CE1);
518 pE0 = _mm256_add_ps(pE0, CE0);
519 pE1 = _mm256_mul_ps(pE1, exparg);
520 pE0 = _mm256_add_ps(pE0, pE1);
521 pE0 = _mm256_mul_ps(pE0, exparg2);
522 exparg = _mm256_add_ps(exparg, one);
523 exparg = _mm256_add_ps(exparg, pE0);
525 exparg = _mm256_mul_ps(exparg, fexppart);
527 return exparg;
531 /* exp(), 128 bit wide. */
532 static __m128
533 gmx_mm_exp_ps(__m128 x)
535 const __m128 argscale = _mm_set1_ps(1.44269504088896341f);
536 /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
537 const __m128 arglimit = _mm_set1_ps(126.0f);
538 const __m128i expbase = _mm_set1_epi32(127);
540 const __m128 invargscale0 = _mm_set1_ps(0.693359375f);
541 const __m128 invargscale1 = _mm_set1_ps(-2.12194440e-4f);
543 const __m128 CC5 = _mm_set1_ps(1.9875691500e-4f);
544 const __m128 CC4 = _mm_set1_ps(1.3981999507e-3f);
545 const __m128 CC3 = _mm_set1_ps(8.3334519073e-3f);
546 const __m128 CC2 = _mm_set1_ps(4.1665795894e-2f);
547 const __m128 CC1 = _mm_set1_ps(1.6666665459e-1f);
548 const __m128 CC0 = _mm_set1_ps(5.0000001201e-1f);
549 const __m128 one = _mm_set1_ps(1.0f);
551 __m128 y, x2;
552 __m128 p0, p1;
553 __m128 valuemask;
554 __m128i iexppart;
555 __m128 fexppart;
556 __m128 intpart;
558 y = _mm_mul_ps(x, argscale);
560 iexppart = _mm_cvtps_epi32(y);
561 intpart = _mm_round_ps(y, _MM_FROUND_TO_NEAREST_INT);
563 iexppart = _mm_slli_epi32(_mm_add_epi32(iexppart, expbase), 23);
564 valuemask = _mm_cmpge_ps(arglimit, gmx_mm_abs_ps(y));
565 fexppart = _mm_and_ps(valuemask, gmx_mm_castsi128_ps(iexppart));
567     /* Extended precision arithmetic */
568 x = _mm_sub_ps(x, _mm_mul_ps(invargscale0, intpart));
569 x = _mm_sub_ps(x, _mm_mul_ps(invargscale1, intpart));
571 x2 = _mm_mul_ps(x, x);
573 p1 = _mm_mul_ps(CC5, x2);
574 p0 = _mm_mul_ps(CC4, x2);
575 p1 = _mm_add_ps(p1, CC3);
576 p0 = _mm_add_ps(p0, CC2);
577 p1 = _mm_mul_ps(p1, x2);
578 p0 = _mm_mul_ps(p0, x2);
579 p1 = _mm_add_ps(p1, CC1);
580 p0 = _mm_add_ps(p0, CC0);
581 p1 = _mm_mul_ps(p1, x);
582 p0 = _mm_add_ps(p0, p1);
583 p0 = _mm_mul_ps(p0, x2);
584 x = _mm_add_ps(x, one);
585 x = _mm_add_ps(x, p0);
587 x = _mm_mul_ps(x, fexppart);
589 return x;
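/* Illustrative addition (not part of the original header): a hypothetical
 * scalar sketch of the argument reduction used by the exp() routines above.
 * Rather than evaluating 2^(x*log2(e)) directly, the integer
 * n = round(x*log2(e)) is removed from x as x - n*ln(2), with ln(2) split
 * into the two constants 0.693359375 and -2.12194440e-4 so the subtraction
 * loses almost no precision. expf() from libm stands in for the minimax
 * polynomial on the small reduced argument:
 */
static float
example_scalar_exp_reduction(float x)
{
    int   n = (int)floorf(x * 1.44269504088896341f + 0.5f);  /* round(x*log2(e)) */
    float r = x;
    union { int i; float f; } pow2n;

    r -= (float)n * 0.693359375f;        /* high part of ln(2) */
    r -= (float)n * (-2.12194440e-4f);   /* low part of ln(2)  */

    pow2n.i = (n + 127) << 23;           /* 2^n via the exponent field */

    return pow2n.f * expf(r);            /* polynomial replaced by libm */
}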
594 /* FULL precision erf(), 256-bit wide. Only errors in LSB */
595 static __m256
596 gmx_mm256_erf_ps(__m256 x)
598 /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
599 const __m256 CA6 = _mm256_set1_ps(7.853861353153693e-5f);
600 const __m256 CA5 = _mm256_set1_ps(-8.010193625184903e-4f);
601 const __m256 CA4 = _mm256_set1_ps(5.188327685732524e-3f);
602 const __m256 CA3 = _mm256_set1_ps(-2.685381193529856e-2f);
603 const __m256 CA2 = _mm256_set1_ps(1.128358514861418e-1f);
604 const __m256 CA1 = _mm256_set1_ps(-3.761262582423300e-1f);
605 const __m256 CA0 = _mm256_set1_ps(1.128379165726710f);
606 /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
607 const __m256 CB9 = _mm256_set1_ps(-0.0018629930017603923f);
608 const __m256 CB8 = _mm256_set1_ps(0.003909821287598495f);
609 const __m256 CB7 = _mm256_set1_ps(-0.0052094582210355615f);
610 const __m256 CB6 = _mm256_set1_ps(0.005685614362160572f);
611 const __m256 CB5 = _mm256_set1_ps(-0.0025367682853477272f);
612 const __m256 CB4 = _mm256_set1_ps(-0.010199799682318782f);
613 const __m256 CB3 = _mm256_set1_ps(0.04369575504816542f);
614 const __m256 CB2 = _mm256_set1_ps(-0.11884063474674492f);
615 const __m256 CB1 = _mm256_set1_ps(0.2732120154030589f);
616 const __m256 CB0 = _mm256_set1_ps(0.42758357702025784f);
617 /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
618 const __m256 CC10 = _mm256_set1_ps(-0.0445555913112064f);
619 const __m256 CC9 = _mm256_set1_ps(0.21376355144663348f);
620 const __m256 CC8 = _mm256_set1_ps(-0.3473187200259257f);
621 const __m256 CC7 = _mm256_set1_ps(0.016690861551248114f);
622 const __m256 CC6 = _mm256_set1_ps(0.7560973182491192f);
623 const __m256 CC5 = _mm256_set1_ps(-1.2137903600145787f);
624 const __m256 CC4 = _mm256_set1_ps(0.8411872321232948f);
625 const __m256 CC3 = _mm256_set1_ps(-0.08670413896296343f);
626 const __m256 CC2 = _mm256_set1_ps(-0.27124782687240334f);
627 const __m256 CC1 = _mm256_set1_ps(-0.0007502488047806069f);
628 const __m256 CC0 = _mm256_set1_ps(0.5642114853803148f);
630 /* Coefficients for expansion of exp(x) in [0,0.1] */
631 /* CD0 and CD1 are both 1.0, so no need to declare them separately */
632 const __m256 CD2 = _mm256_set1_ps(0.5000066608081202f);
633 const __m256 CD3 = _mm256_set1_ps(0.1664795422874624f);
634 const __m256 CD4 = _mm256_set1_ps(0.04379839977652482f);
636 const __m256 sieve = _mm256_castsi256_ps( _mm256_set1_epi32(0xfffff000) );
637 const __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
638 const __m256 one = _mm256_set1_ps(1.0f);
639 const __m256 two = _mm256_set1_ps(2.0f);
641 __m256 x2, x4, y;
642 __m256 z, q, t, t2, w, w2;
643 __m256 pA0, pA1, pB0, pB1, pC0, pC1;
644 __m256 expmx2, corr;
645 __m256 res_erf, res_erfc, res;
646 __m256 mask;
648 /* Calculate erf() */
649 x2 = _mm256_mul_ps(x, x);
650 x4 = _mm256_mul_ps(x2, x2);
652 pA0 = _mm256_mul_ps(CA6, x4);
653 pA1 = _mm256_mul_ps(CA5, x4);
654 pA0 = _mm256_add_ps(pA0, CA4);
655 pA1 = _mm256_add_ps(pA1, CA3);
656 pA0 = _mm256_mul_ps(pA0, x4);
657 pA1 = _mm256_mul_ps(pA1, x4);
658 pA0 = _mm256_add_ps(pA0, CA2);
659 pA1 = _mm256_add_ps(pA1, CA1);
660 pA0 = _mm256_mul_ps(pA0, x4);
661 pA1 = _mm256_mul_ps(pA1, x2);
662 pA0 = _mm256_add_ps(pA0, pA1);
663 pA0 = _mm256_add_ps(pA0, CA0);
665 res_erf = _mm256_mul_ps(x, pA0);
667 /* Calculate erfc */
669 y = gmx_mm256_abs_ps(x);
670 t = gmx_mm256_inv_ps(y);
671 w = _mm256_sub_ps(t, one);
672 t2 = _mm256_mul_ps(t, t);
673 w2 = _mm256_mul_ps(w, w);
675 * We cannot simply calculate exp(-x2) directly in single precision, since
676 * that will lose a couple of bits of precision due to the multiplication.
677 * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
678 * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
680 * The only drawback with this is that it requires TWO separate exponential
681 * evaluations, which would be horrible performance-wise. However, the argument
682 * for the second exp() call is always small, so there we simply use a
683 * low-order minimax expansion on [0,0.1].
686 z = _mm256_and_ps(y, sieve);
687 q = _mm256_mul_ps( _mm256_sub_ps(z, y), _mm256_add_ps(z, y) );
689 corr = _mm256_mul_ps(CD4, q);
690 corr = _mm256_add_ps(corr, CD3);
691 corr = _mm256_mul_ps(corr, q);
692 corr = _mm256_add_ps(corr, CD2);
693 corr = _mm256_mul_ps(corr, q);
694 corr = _mm256_add_ps(corr, one);
695 corr = _mm256_mul_ps(corr, q);
696 corr = _mm256_add_ps(corr, one);
698 expmx2 = gmx_mm256_exp_ps( _mm256_or_ps( signbit, _mm256_mul_ps(z, z) ) );
699 expmx2 = _mm256_mul_ps(expmx2, corr);
701 pB1 = _mm256_mul_ps(CB9, w2);
702 pB0 = _mm256_mul_ps(CB8, w2);
703 pB1 = _mm256_add_ps(pB1, CB7);
704 pB0 = _mm256_add_ps(pB0, CB6);
705 pB1 = _mm256_mul_ps(pB1, w2);
706 pB0 = _mm256_mul_ps(pB0, w2);
707 pB1 = _mm256_add_ps(pB1, CB5);
708 pB0 = _mm256_add_ps(pB0, CB4);
709 pB1 = _mm256_mul_ps(pB1, w2);
710 pB0 = _mm256_mul_ps(pB0, w2);
711 pB1 = _mm256_add_ps(pB1, CB3);
712 pB0 = _mm256_add_ps(pB0, CB2);
713 pB1 = _mm256_mul_ps(pB1, w2);
714 pB0 = _mm256_mul_ps(pB0, w2);
715 pB1 = _mm256_add_ps(pB1, CB1);
716 pB1 = _mm256_mul_ps(pB1, w);
717 pB0 = _mm256_add_ps(pB0, pB1);
718 pB0 = _mm256_add_ps(pB0, CB0);
720 pC0 = _mm256_mul_ps(CC10, t2);
721 pC1 = _mm256_mul_ps(CC9, t2);
722 pC0 = _mm256_add_ps(pC0, CC8);
723 pC1 = _mm256_add_ps(pC1, CC7);
724 pC0 = _mm256_mul_ps(pC0, t2);
725 pC1 = _mm256_mul_ps(pC1, t2);
726 pC0 = _mm256_add_ps(pC0, CC6);
727 pC1 = _mm256_add_ps(pC1, CC5);
728 pC0 = _mm256_mul_ps(pC0, t2);
729 pC1 = _mm256_mul_ps(pC1, t2);
730 pC0 = _mm256_add_ps(pC0, CC4);
731 pC1 = _mm256_add_ps(pC1, CC3);
732 pC0 = _mm256_mul_ps(pC0, t2);
733 pC1 = _mm256_mul_ps(pC1, t2);
734 pC0 = _mm256_add_ps(pC0, CC2);
735 pC1 = _mm256_add_ps(pC1, CC1);
736 pC0 = _mm256_mul_ps(pC0, t2);
737 pC1 = _mm256_mul_ps(pC1, t);
738 pC0 = _mm256_add_ps(pC0, pC1);
739 pC0 = _mm256_add_ps(pC0, CC0);
740 pC0 = _mm256_mul_ps(pC0, t);
742 /* SELECT pB0 or pC0 for erfc() */
743 mask = _mm256_cmp_ps(two, y, _CMP_LT_OQ);
744 res_erfc = _mm256_blendv_ps(pB0, pC0, mask);
745 res_erfc = _mm256_mul_ps(res_erfc, expmx2);
747 /* erfc(x<0) = 2-erfc(|x|) */
748 mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LT_OQ);
749 res_erfc = _mm256_blendv_ps(res_erfc, _mm256_sub_ps(two, res_erfc), mask);
751 /* Select erf() or erfc() */
752 mask = _mm256_cmp_ps(y, _mm256_set1_ps(0.75f), _CMP_LT_OQ);
753 res = _mm256_blendv_ps(_mm256_sub_ps(one, res_erfc), res_erf, mask);
755 return res;
759 /* erf(), 128 bit wide */
760 static __m128
761 gmx_mm_erf_ps(__m128 x)
763 /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
764 const __m128 CA6 = _mm_set1_ps(7.853861353153693e-5f);
765 const __m128 CA5 = _mm_set1_ps(-8.010193625184903e-4f);
766 const __m128 CA4 = _mm_set1_ps(5.188327685732524e-3f);
767 const __m128 CA3 = _mm_set1_ps(-2.685381193529856e-2f);
768 const __m128 CA2 = _mm_set1_ps(1.128358514861418e-1f);
769 const __m128 CA1 = _mm_set1_ps(-3.761262582423300e-1f);
770 const __m128 CA0 = _mm_set1_ps(1.128379165726710f);
771 /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
772 const __m128 CB9 = _mm_set1_ps(-0.0018629930017603923f);
773 const __m128 CB8 = _mm_set1_ps(0.003909821287598495f);
774 const __m128 CB7 = _mm_set1_ps(-0.0052094582210355615f);
775 const __m128 CB6 = _mm_set1_ps(0.005685614362160572f);
776 const __m128 CB5 = _mm_set1_ps(-0.0025367682853477272f);
777 const __m128 CB4 = _mm_set1_ps(-0.010199799682318782f);
778 const __m128 CB3 = _mm_set1_ps(0.04369575504816542f);
779 const __m128 CB2 = _mm_set1_ps(-0.11884063474674492f);
780 const __m128 CB1 = _mm_set1_ps(0.2732120154030589f);
781 const __m128 CB0 = _mm_set1_ps(0.42758357702025784f);
782 /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
783 const __m128 CC10 = _mm_set1_ps(-0.0445555913112064f);
784 const __m128 CC9 = _mm_set1_ps(0.21376355144663348f);
785 const __m128 CC8 = _mm_set1_ps(-0.3473187200259257f);
786 const __m128 CC7 = _mm_set1_ps(0.016690861551248114f);
787 const __m128 CC6 = _mm_set1_ps(0.7560973182491192f);
788 const __m128 CC5 = _mm_set1_ps(-1.2137903600145787f);
789 const __m128 CC4 = _mm_set1_ps(0.8411872321232948f);
790 const __m128 CC3 = _mm_set1_ps(-0.08670413896296343f);
791 const __m128 CC2 = _mm_set1_ps(-0.27124782687240334f);
792 const __m128 CC1 = _mm_set1_ps(-0.0007502488047806069f);
793 const __m128 CC0 = _mm_set1_ps(0.5642114853803148f);
795 /* Coefficients for expansion of exp(x) in [0,0.1] */
796 /* CD0 and CD1 are both 1.0, so no need to declare them separately */
797 const __m128 CD2 = _mm_set1_ps(0.5000066608081202f);
798 const __m128 CD3 = _mm_set1_ps(0.1664795422874624f);
799 const __m128 CD4 = _mm_set1_ps(0.04379839977652482f);
801 const __m128 sieve = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
802 const __m128 signbit = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
803 const __m128 one = _mm_set1_ps(1.0f);
804 const __m128 two = _mm_set1_ps(2.0f);
806 __m128 x2, x4, y;
807 __m128 z, q, t, t2, w, w2;
808 __m128 pA0, pA1, pB0, pB1, pC0, pC1;
809 __m128 expmx2, corr;
810 __m128 res_erf, res_erfc, res;
811 __m128 mask;
813 /* Calculate erf() */
814 x2 = _mm_mul_ps(x, x);
815 x4 = _mm_mul_ps(x2, x2);
817 pA0 = _mm_mul_ps(CA6, x4);
818 pA1 = _mm_mul_ps(CA5, x4);
819 pA0 = _mm_add_ps(pA0, CA4);
820 pA1 = _mm_add_ps(pA1, CA3);
821 pA0 = _mm_mul_ps(pA0, x4);
822 pA1 = _mm_mul_ps(pA1, x4);
823 pA0 = _mm_add_ps(pA0, CA2);
824 pA1 = _mm_add_ps(pA1, CA1);
825 pA0 = _mm_mul_ps(pA0, x4);
826 pA1 = _mm_mul_ps(pA1, x2);
827 pA0 = _mm_add_ps(pA0, pA1);
828 pA0 = _mm_add_ps(pA0, CA0);
830 res_erf = _mm_mul_ps(x, pA0);
832 /* Calculate erfc */
834 y = gmx_mm_abs_ps(x);
835 t = gmx_mm_inv_ps(y);
836 w = _mm_sub_ps(t, one);
837 t2 = _mm_mul_ps(t, t);
838 w2 = _mm_mul_ps(w, w);
840 * We cannot simply calculate exp(-x2) directly in single precision, since
841 * that will lose a couple of bits of precision due to the multiplication.
842 * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
843 * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
845 * The only drawback with this is that it requires TWO separate exponential
846 * evaluations, which would be horrible performance-wise. However, the argument
847 * for the second exp() call is always small, so there we simply use a
848 * low-order minimax expansion on [0,0.1].
851 z = _mm_and_ps(y, sieve);
852 q = _mm_mul_ps( _mm_sub_ps(z, y), _mm_add_ps(z, y) );
854 corr = _mm_mul_ps(CD4, q);
855 corr = _mm_add_ps(corr, CD3);
856 corr = _mm_mul_ps(corr, q);
857 corr = _mm_add_ps(corr, CD2);
858 corr = _mm_mul_ps(corr, q);
859 corr = _mm_add_ps(corr, one);
860 corr = _mm_mul_ps(corr, q);
861 corr = _mm_add_ps(corr, one);
863 expmx2 = gmx_mm_exp_ps( _mm_or_ps( signbit, _mm_mul_ps(z, z) ) );
864 expmx2 = _mm_mul_ps(expmx2, corr);
866 pB1 = _mm_mul_ps(CB9, w2);
867 pB0 = _mm_mul_ps(CB8, w2);
868 pB1 = _mm_add_ps(pB1, CB7);
869 pB0 = _mm_add_ps(pB0, CB6);
870 pB1 = _mm_mul_ps(pB1, w2);
871 pB0 = _mm_mul_ps(pB0, w2);
872 pB1 = _mm_add_ps(pB1, CB5);
873 pB0 = _mm_add_ps(pB0, CB4);
874 pB1 = _mm_mul_ps(pB1, w2);
875 pB0 = _mm_mul_ps(pB0, w2);
876 pB1 = _mm_add_ps(pB1, CB3);
877 pB0 = _mm_add_ps(pB0, CB2);
878 pB1 = _mm_mul_ps(pB1, w2);
879 pB0 = _mm_mul_ps(pB0, w2);
880 pB1 = _mm_add_ps(pB1, CB1);
881 pB1 = _mm_mul_ps(pB1, w);
882 pB0 = _mm_add_ps(pB0, pB1);
883 pB0 = _mm_add_ps(pB0, CB0);
885 pC0 = _mm_mul_ps(CC10, t2);
886 pC1 = _mm_mul_ps(CC9, t2);
887 pC0 = _mm_add_ps(pC0, CC8);
888 pC1 = _mm_add_ps(pC1, CC7);
889 pC0 = _mm_mul_ps(pC0, t2);
890 pC1 = _mm_mul_ps(pC1, t2);
891 pC0 = _mm_add_ps(pC0, CC6);
892 pC1 = _mm_add_ps(pC1, CC5);
893 pC0 = _mm_mul_ps(pC0, t2);
894 pC1 = _mm_mul_ps(pC1, t2);
895 pC0 = _mm_add_ps(pC0, CC4);
896 pC1 = _mm_add_ps(pC1, CC3);
897 pC0 = _mm_mul_ps(pC0, t2);
898 pC1 = _mm_mul_ps(pC1, t2);
899 pC0 = _mm_add_ps(pC0, CC2);
900 pC1 = _mm_add_ps(pC1, CC1);
901 pC0 = _mm_mul_ps(pC0, t2);
902 pC1 = _mm_mul_ps(pC1, t);
903 pC0 = _mm_add_ps(pC0, pC1);
904 pC0 = _mm_add_ps(pC0, CC0);
905 pC0 = _mm_mul_ps(pC0, t);
907 /* SELECT pB0 or pC0 for erfc() */
908 mask = _mm_cmplt_ps(two, y);
909 res_erfc = _mm_blendv_ps(pB0, pC0, mask);
910 res_erfc = _mm_mul_ps(res_erfc, expmx2);
912 /* erfc(x<0) = 2-erfc(|x|) */
913 mask = _mm_cmplt_ps(x, _mm_setzero_ps());
914 res_erfc = _mm_blendv_ps(res_erfc, _mm_sub_ps(two, res_erfc), mask);
916 /* Select erf() or erfc() */
917 mask = _mm_cmplt_ps(y, _mm_set1_ps(0.75f));
918 res = _mm_blendv_ps(_mm_sub_ps(one, res_erfc), res_erf, mask);
920 return res;
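/* Illustrative addition (not part of the original header): a hypothetical
 * scalar sketch of the exp(-x^2) splitting trick used by the erf()/erfc()
 * routines above. The argument y is truncated to z by zeroing its low
 * mantissa bits (the "sieve" mask), so z*z is essentially exact in single
 * precision; the tiny remainder q = (z-y)*(z+y) = z^2-y^2 is handled by the
 * low-order CD polynomial. Here expf() stands in for both factors:
 */
static float
example_scalar_expmx2_split(float y)
{
    union { unsigned int i; float f; } u;
    float z, q;

    u.f  = y;
    u.i &= 0xfffff000u;             /* keep only the high mantissa bits */
    z    = u.f;
    q    = (z - y) * (z + y);       /* z^2 - y^2, a very small correction */

    return expf(-z * z) * expf(q);  /* equals expf(-y*y) up to rounding */
}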
926 /* FULL precision erfc(), 256 bit wide. Only errors in LSB */
927 static __m256
928 gmx_mm256_erfc_ps(__m256 x)
930 /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
931 const __m256 CA6 = _mm256_set1_ps(7.853861353153693e-5f);
932 const __m256 CA5 = _mm256_set1_ps(-8.010193625184903e-4f);
933 const __m256 CA4 = _mm256_set1_ps(5.188327685732524e-3f);
934 const __m256 CA3 = _mm256_set1_ps(-2.685381193529856e-2f);
935 const __m256 CA2 = _mm256_set1_ps(1.128358514861418e-1f);
936 const __m256 CA1 = _mm256_set1_ps(-3.761262582423300e-1f);
937 const __m256 CA0 = _mm256_set1_ps(1.128379165726710f);
938 /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
939 const __m256 CB9 = _mm256_set1_ps(-0.0018629930017603923f);
940 const __m256 CB8 = _mm256_set1_ps(0.003909821287598495f);
941 const __m256 CB7 = _mm256_set1_ps(-0.0052094582210355615f);
942 const __m256 CB6 = _mm256_set1_ps(0.005685614362160572f);
943 const __m256 CB5 = _mm256_set1_ps(-0.0025367682853477272f);
944 const __m256 CB4 = _mm256_set1_ps(-0.010199799682318782f);
945 const __m256 CB3 = _mm256_set1_ps(0.04369575504816542f);
946 const __m256 CB2 = _mm256_set1_ps(-0.11884063474674492f);
947 const __m256 CB1 = _mm256_set1_ps(0.2732120154030589f);
948 const __m256 CB0 = _mm256_set1_ps(0.42758357702025784f);
949 /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
950 const __m256 CC10 = _mm256_set1_ps(-0.0445555913112064f);
951 const __m256 CC9 = _mm256_set1_ps(0.21376355144663348f);
952 const __m256 CC8 = _mm256_set1_ps(-0.3473187200259257f);
953 const __m256 CC7 = _mm256_set1_ps(0.016690861551248114f);
954 const __m256 CC6 = _mm256_set1_ps(0.7560973182491192f);
955 const __m256 CC5 = _mm256_set1_ps(-1.2137903600145787f);
956 const __m256 CC4 = _mm256_set1_ps(0.8411872321232948f);
957 const __m256 CC3 = _mm256_set1_ps(-0.08670413896296343f);
958 const __m256 CC2 = _mm256_set1_ps(-0.27124782687240334f);
959 const __m256 CC1 = _mm256_set1_ps(-0.0007502488047806069f);
960 const __m256 CC0 = _mm256_set1_ps(0.5642114853803148f);
962 /* Coefficients for expansion of exp(x) in [0,0.1] */
963 /* CD0 and CD1 are both 1.0, so no need to declare them separately */
964 const __m256 CD2 = _mm256_set1_ps(0.5000066608081202f);
965 const __m256 CD3 = _mm256_set1_ps(0.1664795422874624f);
966 const __m256 CD4 = _mm256_set1_ps(0.04379839977652482f);
968 const __m256 sieve = _mm256_castsi256_ps( _mm256_set1_epi32(0xfffff000) );
969 const __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
970 const __m256 one = _mm256_set1_ps(1.0f);
971 const __m256 two = _mm256_set1_ps(2.0f);
973 __m256 x2, x4, y;
974 __m256 z, q, t, t2, w, w2;
975 __m256 pA0, pA1, pB0, pB1, pC0, pC1;
976 __m256 expmx2, corr;
977 __m256 res_erf, res_erfc, res;
978 __m256 mask;
980 /* Calculate erf() */
981 x2 = _mm256_mul_ps(x, x);
982 x4 = _mm256_mul_ps(x2, x2);
984 pA0 = _mm256_mul_ps(CA6, x4);
985 pA1 = _mm256_mul_ps(CA5, x4);
986 pA0 = _mm256_add_ps(pA0, CA4);
987 pA1 = _mm256_add_ps(pA1, CA3);
988 pA0 = _mm256_mul_ps(pA0, x4);
989 pA1 = _mm256_mul_ps(pA1, x4);
990 pA0 = _mm256_add_ps(pA0, CA2);
991 pA1 = _mm256_add_ps(pA1, CA1);
992 pA0 = _mm256_mul_ps(pA0, x4);
993 pA1 = _mm256_mul_ps(pA1, x2);
994 pA0 = _mm256_add_ps(pA0, pA1);
995 pA0 = _mm256_add_ps(pA0, CA0);
997 res_erf = _mm256_mul_ps(x, pA0);
999 /* Calculate erfc */
1000 y = gmx_mm256_abs_ps(x);
1001 t = gmx_mm256_inv_ps(y);
1002 w = _mm256_sub_ps(t, one);
1003 t2 = _mm256_mul_ps(t, t);
1004 w2 = _mm256_mul_ps(w, w);
1006 * We cannot simply calculate exp(-x2) directly in single precision, since
1007 * that will lose a couple of bits of precision due to the multiplication.
1008 * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
1009 * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
1011 * The only drawback with this is that it requires TWO separate exponential
1012 * evaluations, which would be horrible performance-wise. However, the argument
1013 * for the second exp() call is always small, so there we simply use a
1014 * low-order minimax expansion on [0,0.1].
1017 z = _mm256_and_ps(y, sieve);
1018 q = _mm256_mul_ps( _mm256_sub_ps(z, y), _mm256_add_ps(z, y) );
1020 corr = _mm256_mul_ps(CD4, q);
1021 corr = _mm256_add_ps(corr, CD3);
1022 corr = _mm256_mul_ps(corr, q);
1023 corr = _mm256_add_ps(corr, CD2);
1024 corr = _mm256_mul_ps(corr, q);
1025 corr = _mm256_add_ps(corr, one);
1026 corr = _mm256_mul_ps(corr, q);
1027 corr = _mm256_add_ps(corr, one);
1029 expmx2 = gmx_mm256_exp_ps( _mm256_or_ps( signbit, _mm256_mul_ps(z, z) ) );
1030 expmx2 = _mm256_mul_ps(expmx2, corr);
1032 pB1 = _mm256_mul_ps(CB9, w2);
1033 pB0 = _mm256_mul_ps(CB8, w2);
1034 pB1 = _mm256_add_ps(pB1, CB7);
1035 pB0 = _mm256_add_ps(pB0, CB6);
1036 pB1 = _mm256_mul_ps(pB1, w2);
1037 pB0 = _mm256_mul_ps(pB0, w2);
1038 pB1 = _mm256_add_ps(pB1, CB5);
1039 pB0 = _mm256_add_ps(pB0, CB4);
1040 pB1 = _mm256_mul_ps(pB1, w2);
1041 pB0 = _mm256_mul_ps(pB0, w2);
1042 pB1 = _mm256_add_ps(pB1, CB3);
1043 pB0 = _mm256_add_ps(pB0, CB2);
1044 pB1 = _mm256_mul_ps(pB1, w2);
1045 pB0 = _mm256_mul_ps(pB0, w2);
1046 pB1 = _mm256_add_ps(pB1, CB1);
1047 pB1 = _mm256_mul_ps(pB1, w);
1048 pB0 = _mm256_add_ps(pB0, pB1);
1049 pB0 = _mm256_add_ps(pB0, CB0);
1051 pC0 = _mm256_mul_ps(CC10, t2);
1052 pC1 = _mm256_mul_ps(CC9, t2);
1053 pC0 = _mm256_add_ps(pC0, CC8);
1054 pC1 = _mm256_add_ps(pC1, CC7);
1055 pC0 = _mm256_mul_ps(pC0, t2);
1056 pC1 = _mm256_mul_ps(pC1, t2);
1057 pC0 = _mm256_add_ps(pC0, CC6);
1058 pC1 = _mm256_add_ps(pC1, CC5);
1059 pC0 = _mm256_mul_ps(pC0, t2);
1060 pC1 = _mm256_mul_ps(pC1, t2);
1061 pC0 = _mm256_add_ps(pC0, CC4);
1062 pC1 = _mm256_add_ps(pC1, CC3);
1063 pC0 = _mm256_mul_ps(pC0, t2);
1064 pC1 = _mm256_mul_ps(pC1, t2);
1065 pC0 = _mm256_add_ps(pC0, CC2);
1066 pC1 = _mm256_add_ps(pC1, CC1);
1067 pC0 = _mm256_mul_ps(pC0, t2);
1068 pC1 = _mm256_mul_ps(pC1, t);
1069 pC0 = _mm256_add_ps(pC0, pC1);
1070 pC0 = _mm256_add_ps(pC0, CC0);
1071 pC0 = _mm256_mul_ps(pC0, t);
1073 /* SELECT pB0 or pC0 for erfc() */
1074 mask = _mm256_cmp_ps(two, y, _CMP_LT_OQ);
1075 res_erfc = _mm256_blendv_ps(pB0, pC0, mask);
1076 res_erfc = _mm256_mul_ps(res_erfc, expmx2);
1078 /* erfc(x<0) = 2-erfc(|x|) */
1079 mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LT_OQ);
1080 res_erfc = _mm256_blendv_ps(res_erfc, _mm256_sub_ps(two, res_erfc), mask);
1082 /* Select erf() or erfc() */
1083 mask = _mm256_cmp_ps(y, _mm256_set1_ps(0.75f), _CMP_LT_OQ);
1084 res = _mm256_blendv_ps(res_erfc, _mm256_sub_ps(one, res_erf), mask);
1086 return res;
1090 /* erfc(), 128 bit wide */
1091 static __m128
1092 gmx_mm_erfc_ps(__m128 x)
1094 /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
1095 const __m128 CA6 = _mm_set1_ps(7.853861353153693e-5f);
1096 const __m128 CA5 = _mm_set1_ps(-8.010193625184903e-4f);
1097 const __m128 CA4 = _mm_set1_ps(5.188327685732524e-3f);
1098 const __m128 CA3 = _mm_set1_ps(-2.685381193529856e-2f);
1099 const __m128 CA2 = _mm_set1_ps(1.128358514861418e-1f);
1100 const __m128 CA1 = _mm_set1_ps(-3.761262582423300e-1f);
1101 const __m128 CA0 = _mm_set1_ps(1.128379165726710f);
1102 /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
1103 const __m128 CB9 = _mm_set1_ps(-0.0018629930017603923f);
1104 const __m128 CB8 = _mm_set1_ps(0.003909821287598495f);
1105 const __m128 CB7 = _mm_set1_ps(-0.0052094582210355615f);
1106 const __m128 CB6 = _mm_set1_ps(0.005685614362160572f);
1107 const __m128 CB5 = _mm_set1_ps(-0.0025367682853477272f);
1108 const __m128 CB4 = _mm_set1_ps(-0.010199799682318782f);
1109 const __m128 CB3 = _mm_set1_ps(0.04369575504816542f);
1110 const __m128 CB2 = _mm_set1_ps(-0.11884063474674492f);
1111 const __m128 CB1 = _mm_set1_ps(0.2732120154030589f);
1112 const __m128 CB0 = _mm_set1_ps(0.42758357702025784f);
1113 /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
1114 const __m128 CC10 = _mm_set1_ps(-0.0445555913112064f);
1115 const __m128 CC9 = _mm_set1_ps(0.21376355144663348f);
1116 const __m128 CC8 = _mm_set1_ps(-0.3473187200259257f);
1117 const __m128 CC7 = _mm_set1_ps(0.016690861551248114f);
1118 const __m128 CC6 = _mm_set1_ps(0.7560973182491192f);
1119 const __m128 CC5 = _mm_set1_ps(-1.2137903600145787f);
1120 const __m128 CC4 = _mm_set1_ps(0.8411872321232948f);
1121 const __m128 CC3 = _mm_set1_ps(-0.08670413896296343f);
1122 const __m128 CC2 = _mm_set1_ps(-0.27124782687240334f);
1123 const __m128 CC1 = _mm_set1_ps(-0.0007502488047806069f);
1124 const __m128 CC0 = _mm_set1_ps(0.5642114853803148f);
1126 /* Coefficients for expansion of exp(x) in [0,0.1] */
1127 /* CD0 and CD1 are both 1.0, so no need to declare them separately */
1128 const __m128 CD2 = _mm_set1_ps(0.5000066608081202f);
1129 const __m128 CD3 = _mm_set1_ps(0.1664795422874624f);
1130 const __m128 CD4 = _mm_set1_ps(0.04379839977652482f);
1132 const __m128 sieve = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
1133 const __m128 signbit = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1134 const __m128 one = _mm_set1_ps(1.0f);
1135 const __m128 two = _mm_set1_ps(2.0f);
1137 __m128 x2, x4, y;
1138 __m128 z, q, t, t2, w, w2;
1139 __m128 pA0, pA1, pB0, pB1, pC0, pC1;
1140 __m128 expmx2, corr;
1141 __m128 res_erf, res_erfc, res;
1142 __m128 mask;
1144 /* Calculate erf() */
1145 x2 = _mm_mul_ps(x, x);
1146 x4 = _mm_mul_ps(x2, x2);
1148 pA0 = _mm_mul_ps(CA6, x4);
1149 pA1 = _mm_mul_ps(CA5, x4);
1150 pA0 = _mm_add_ps(pA0, CA4);
1151 pA1 = _mm_add_ps(pA1, CA3);
1152 pA0 = _mm_mul_ps(pA0, x4);
1153 pA1 = _mm_mul_ps(pA1, x4);
1154 pA0 = _mm_add_ps(pA0, CA2);
1155 pA1 = _mm_add_ps(pA1, CA1);
1156 pA0 = _mm_mul_ps(pA0, x4);
1157 pA1 = _mm_mul_ps(pA1, x2);
1158 pA0 = _mm_add_ps(pA0, pA1);
1159 pA0 = _mm_add_ps(pA0, CA0);
1161 res_erf = _mm_mul_ps(x, pA0);
1163 /* Calculate erfc */
1164 y = gmx_mm_abs_ps(x);
1165 t = gmx_mm_inv_ps(y);
1166 w = _mm_sub_ps(t, one);
1167 t2 = _mm_mul_ps(t, t);
1168 w2 = _mm_mul_ps(w, w);
1170 * We cannot simply calculate exp(-x2) directly in single precision, since
1171 * that will lose a couple of bits of precision due to the multiplication.
1172 * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
1173 * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
1175 * The only drawback with this is that it requires TWO separate exponential
1176 * evaluations, which would be horrible performance-wise. However, the argument
1177 * for the second exp() call is always small, so there we simply use a
1178 * low-order minimax expansion on [0,0.1].
1181 z = _mm_and_ps(y, sieve);
1182 q = _mm_mul_ps( _mm_sub_ps(z, y), _mm_add_ps(z, y) );
1184 corr = _mm_mul_ps(CD4, q);
1185 corr = _mm_add_ps(corr, CD3);
1186 corr = _mm_mul_ps(corr, q);
1187 corr = _mm_add_ps(corr, CD2);
1188 corr = _mm_mul_ps(corr, q);
1189 corr = _mm_add_ps(corr, one);
1190 corr = _mm_mul_ps(corr, q);
1191 corr = _mm_add_ps(corr, one);
1193 expmx2 = gmx_mm_exp_ps( _mm_or_ps( signbit, _mm_mul_ps(z, z) ) );
1194 expmx2 = _mm_mul_ps(expmx2, corr);
1196 pB1 = _mm_mul_ps(CB9, w2);
1197 pB0 = _mm_mul_ps(CB8, w2);
1198 pB1 = _mm_add_ps(pB1, CB7);
1199 pB0 = _mm_add_ps(pB0, CB6);
1200 pB1 = _mm_mul_ps(pB1, w2);
1201 pB0 = _mm_mul_ps(pB0, w2);
1202 pB1 = _mm_add_ps(pB1, CB5);
1203 pB0 = _mm_add_ps(pB0, CB4);
1204 pB1 = _mm_mul_ps(pB1, w2);
1205 pB0 = _mm_mul_ps(pB0, w2);
1206 pB1 = _mm_add_ps(pB1, CB3);
1207 pB0 = _mm_add_ps(pB0, CB2);
1208 pB1 = _mm_mul_ps(pB1, w2);
1209 pB0 = _mm_mul_ps(pB0, w2);
1210 pB1 = _mm_add_ps(pB1, CB1);
1211 pB1 = _mm_mul_ps(pB1, w);
1212 pB0 = _mm_add_ps(pB0, pB1);
1213 pB0 = _mm_add_ps(pB0, CB0);
1215 pC0 = _mm_mul_ps(CC10, t2);
1216 pC1 = _mm_mul_ps(CC9, t2);
1217 pC0 = _mm_add_ps(pC0, CC8);
1218 pC1 = _mm_add_ps(pC1, CC7);
1219 pC0 = _mm_mul_ps(pC0, t2);
1220 pC1 = _mm_mul_ps(pC1, t2);
1221 pC0 = _mm_add_ps(pC0, CC6);
1222 pC1 = _mm_add_ps(pC1, CC5);
1223 pC0 = _mm_mul_ps(pC0, t2);
1224 pC1 = _mm_mul_ps(pC1, t2);
1225 pC0 = _mm_add_ps(pC0, CC4);
1226 pC1 = _mm_add_ps(pC1, CC3);
1227 pC0 = _mm_mul_ps(pC0, t2);
1228 pC1 = _mm_mul_ps(pC1, t2);
1229 pC0 = _mm_add_ps(pC0, CC2);
1230 pC1 = _mm_add_ps(pC1, CC1);
1231 pC0 = _mm_mul_ps(pC0, t2);
1232 pC1 = _mm_mul_ps(pC1, t);
1233 pC0 = _mm_add_ps(pC0, pC1);
1234 pC0 = _mm_add_ps(pC0, CC0);
1235 pC0 = _mm_mul_ps(pC0, t);
1237 /* SELECT pB0 or pC0 for erfc() */
1238 mask = _mm_cmplt_ps(two, y);
1239 res_erfc = _mm_blendv_ps(pB0, pC0, mask);
1240 res_erfc = _mm_mul_ps(res_erfc, expmx2);
1242 /* erfc(x<0) = 2-erfc(|x|) */
1243 mask = _mm_cmplt_ps(x, _mm_setzero_ps());
1244 res_erfc = _mm_blendv_ps(res_erfc, _mm_sub_ps(two, res_erfc), mask);
1246 /* Select erf() or erfc() */
1247 mask = _mm_cmplt_ps(y, _mm_set1_ps(0.75f));
1248 res = _mm_blendv_ps(res_erfc, _mm_sub_ps(one, res_erf), mask);
1250 return res;
1255 /* Calculate the force correction due to PME analytically.
1257 * This routine is meant to enable analytical evaluation of the
1258 * direct-space PME electrostatic force to avoid tables.
1260 * The direct-space potential should be Erfc(beta*r)/r, but there
1261 * are some problems evaluating that:
1263 * First, the error function is difficult (read: expensive) to
1264  * approximate accurately for intermediate to large arguments, and
1265  * this happens already in ranges of beta*r that occur in simulations.
1266  * Second, we now try to avoid calculating potentials in Gromacs and
1267  * instead use forces directly.
1269  * We can simplify things slightly by noting that the PME part is really
1270 * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
1272 * V= 1/r - Erf(beta*r)/r
1274 * The first term we already have from the inverse square root, so
1275 * that we can leave out of this routine.
1277 * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
1278 * the argument beta*r will be in the range 0.15 to ~4. Use your
1279 * favorite plotting program to realize how well-behaved Erf(z)/z is
1280 * in this range!
1282 * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
1283 * However, it turns out it is more efficient to approximate f(z)/z and
1284 * then only use even powers. This is another minor optimization, since
1285 * we actually WANT f(z)/z, because it is going to be multiplied by
1286 * the vector between the two atoms to get the vectorial force. The
1287 * fastest flops are the ones we can avoid calculating!
1289 * So, here's how it should be used:
1291 * 1. Calculate r^2.
1292 * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
1293 * 3. Evaluate this routine with z^2 as the argument.
1294 * 4. The return value is the expression:
1297 * 2*exp(-z^2) erf(z)
1298 * ------------ - --------
1299 * sqrt(Pi)*z^2 z^3
1301 * 5. Multiply the entire expression by beta^3. This will get you
1303 * beta^3*2*exp(-z^2) beta^3*erf(z)
1304 * ------------------ - ---------------
1305 * sqrt(Pi)*z^2 z^3
1307 * or, switching back to r (z=r*beta):
1309 * 2*beta*exp(-r^2*beta^2) erf(r*beta)
1310 * ----------------------- - -----------
1311 * sqrt(Pi)*r^2 r^3
1314  * With a bit of algebra you should be able to confirm that
1315  * this is exactly D[Erf[beta*r]/r,r] divided by r once more.
1317 * 6. Add the result to 1/r^3, multiply by the product of the charges,
1318 * and you have your force (divided by r). A final multiplication
1319 * with the vector connecting the two particles and you have your
1320  * vectorial force to add to the particles. A worked usage sketch follows the two force-correction routines below.
1323 static __m256
1324 gmx_mm256_pmecorrF_ps(__m256 z2)
1326 const __m256 FN6 = _mm256_set1_ps(-1.7357322914161492954e-8f);
1327 const __m256 FN5 = _mm256_set1_ps(1.4703624142580877519e-6f);
1328 const __m256 FN4 = _mm256_set1_ps(-0.000053401640219807709149f);
1329 const __m256 FN3 = _mm256_set1_ps(0.0010054721316683106153f);
1330 const __m256 FN2 = _mm256_set1_ps(-0.019278317264888380590f);
1331 const __m256 FN1 = _mm256_set1_ps(0.069670166153766424023f);
1332 const __m256 FN0 = _mm256_set1_ps(-0.75225204789749321333f);
1334 const __m256 FD4 = _mm256_set1_ps(0.0011193462567257629232f);
1335 const __m256 FD3 = _mm256_set1_ps(0.014866955030185295499f);
1336 const __m256 FD2 = _mm256_set1_ps(0.11583842382862377919f);
1337 const __m256 FD1 = _mm256_set1_ps(0.50736591960530292870f);
1338 const __m256 FD0 = _mm256_set1_ps(1.0f);
1340 __m256 z4;
1341 __m256 polyFN0, polyFN1, polyFD0, polyFD1;
1343 z4 = _mm256_mul_ps(z2, z2);
1345 polyFD0 = _mm256_mul_ps(FD4, z4);
1346 polyFD1 = _mm256_mul_ps(FD3, z4);
1347 polyFD0 = _mm256_add_ps(polyFD0, FD2);
1348 polyFD1 = _mm256_add_ps(polyFD1, FD1);
1349 polyFD0 = _mm256_mul_ps(polyFD0, z4);
1350 polyFD1 = _mm256_mul_ps(polyFD1, z2);
1351 polyFD0 = _mm256_add_ps(polyFD0, FD0);
1352 polyFD0 = _mm256_add_ps(polyFD0, polyFD1);
1354 polyFD0 = gmx_mm256_inv_ps(polyFD0);
1356 polyFN0 = _mm256_mul_ps(FN6, z4);
1357 polyFN1 = _mm256_mul_ps(FN5, z4);
1358 polyFN0 = _mm256_add_ps(polyFN0, FN4);
1359 polyFN1 = _mm256_add_ps(polyFN1, FN3);
1360 polyFN0 = _mm256_mul_ps(polyFN0, z4);
1361 polyFN1 = _mm256_mul_ps(polyFN1, z4);
1362 polyFN0 = _mm256_add_ps(polyFN0, FN2);
1363 polyFN1 = _mm256_add_ps(polyFN1, FN1);
1364 polyFN0 = _mm256_mul_ps(polyFN0, z4);
1365 polyFN1 = _mm256_mul_ps(polyFN1, z2);
1366 polyFN0 = _mm256_add_ps(polyFN0, FN0);
1367 polyFN0 = _mm256_add_ps(polyFN0, polyFN1);
1369 return _mm256_mul_ps(polyFN0, polyFD0);
1373 static __m128
1374 gmx_mm_pmecorrF_ps(__m128 z2)
1376 const __m128 FN6 = _mm_set1_ps(-1.7357322914161492954e-8f);
1377 const __m128 FN5 = _mm_set1_ps(1.4703624142580877519e-6f);
1378 const __m128 FN4 = _mm_set1_ps(-0.000053401640219807709149f);
1379 const __m128 FN3 = _mm_set1_ps(0.0010054721316683106153f);
1380 const __m128 FN2 = _mm_set1_ps(-0.019278317264888380590f);
1381 const __m128 FN1 = _mm_set1_ps(0.069670166153766424023f);
1382 const __m128 FN0 = _mm_set1_ps(-0.75225204789749321333f);
1384 const __m128 FD4 = _mm_set1_ps(0.0011193462567257629232f);
1385 const __m128 FD3 = _mm_set1_ps(0.014866955030185295499f);
1386 const __m128 FD2 = _mm_set1_ps(0.11583842382862377919f);
1387 const __m128 FD1 = _mm_set1_ps(0.50736591960530292870f);
1388 const __m128 FD0 = _mm_set1_ps(1.0f);
1390 __m128 z4;
1391 __m128 polyFN0, polyFN1, polyFD0, polyFD1;
1393 z4 = _mm_mul_ps(z2, z2);
1395 polyFD0 = _mm_mul_ps(FD4, z4);
1396 polyFD1 = _mm_mul_ps(FD3, z4);
1397 polyFD0 = _mm_add_ps(polyFD0, FD2);
1398 polyFD1 = _mm_add_ps(polyFD1, FD1);
1399 polyFD0 = _mm_mul_ps(polyFD0, z4);
1400 polyFD1 = _mm_mul_ps(polyFD1, z2);
1401 polyFD0 = _mm_add_ps(polyFD0, FD0);
1402 polyFD0 = _mm_add_ps(polyFD0, polyFD1);
1404 polyFD0 = gmx_mm_inv_ps(polyFD0);
1406 polyFN0 = _mm_mul_ps(FN6, z4);
1407 polyFN1 = _mm_mul_ps(FN5, z4);
1408 polyFN0 = _mm_add_ps(polyFN0, FN4);
1409 polyFN1 = _mm_add_ps(polyFN1, FN3);
1410 polyFN0 = _mm_mul_ps(polyFN0, z4);
1411 polyFN1 = _mm_mul_ps(polyFN1, z4);
1412 polyFN0 = _mm_add_ps(polyFN0, FN2);
1413 polyFN1 = _mm_add_ps(polyFN1, FN1);
1414 polyFN0 = _mm_mul_ps(polyFN0, z4);
1415 polyFN1 = _mm_mul_ps(polyFN1, z2);
1416 polyFN0 = _mm_add_ps(polyFN0, FN0);
1417 polyFN0 = _mm_add_ps(polyFN0, polyFN1);
1419 return _mm_mul_ps(polyFN0, polyFD0);
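/* Illustrative addition (not part of the original header): a hypothetical
 * sketch of the six usage steps documented above, for the 256-bit
 * force-correction routine. rsq holds r^2 for eight pairs, rinv the matching
 * 1/r (e.g. from gmx_mm256_invsqrt_ps), beta is the Ewald parameter and qq
 * the product of the charges; all names here are made up for illustration:
 */
static __m256
example_mm256_pme_force(__m256 rsq, __m256 rinv, __m256 beta, __m256 qq)
{
    __m256 beta2 = _mm256_mul_ps(beta, beta);
    __m256 beta3 = _mm256_mul_ps(beta2, beta);
    __m256 z2    = _mm256_mul_ps(beta2, rsq);                /* step 2: z^2 = beta^2*r^2 */
    __m256 corr  = gmx_mm256_pmecorrF_ps(z2);                /* steps 3-4                */
    __m256 rinv3 = _mm256_mul_ps(_mm256_mul_ps(rinv, rinv), rinv);
    __m256 fscal;

    corr  = _mm256_mul_ps(corr, beta3);                      /* step 5                     */
    fscal = _mm256_mul_ps(qq, _mm256_add_ps(rinv3, corr));   /* step 6: force divided by r */

    return fscal;  /* multiply by the distance vector to obtain the vectorial force */
}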
1424 /* Calculate the potential correction due to PME analytically.
1426 * See gmx_mm256_pmecorrF_ps() for details about the approximation.
1428 * This routine calculates Erf(z)/z, although you should provide z^2
1429 * as the input argument.
1431 * Here's how it should be used:
1433 * 1. Calculate r^2.
1434 * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
1435 * 3. Evaluate this routine with z^2 as the argument.
1436 * 4. The return value is the expression:
1439  *            erf(z)
1440  *           --------
1441  *              z
1443  * 5. Multiply the entire expression by beta and, switching back to r (z=r*beta), you get:
1445  *       erf(r*beta)
1446  *       -----------
1447  *            r
1449 * 6. Subtract the result from 1/r, multiply by the product of the charges,
1450  *    and you have your potential. A worked usage sketch follows the two routines below.
1452 static __m256
1453 gmx_mm256_pmecorrV_ps(__m256 z2)
1455 const __m256 VN6 = _mm256_set1_ps(1.9296833005951166339e-8f);
1456 const __m256 VN5 = _mm256_set1_ps(-1.4213390571557850962e-6f);
1457 const __m256 VN4 = _mm256_set1_ps(0.000041603292906656984871f);
1458 const __m256 VN3 = _mm256_set1_ps(-0.00013134036773265025626f);
1459 const __m256 VN2 = _mm256_set1_ps(0.038657983986041781264f);
1460 const __m256 VN1 = _mm256_set1_ps(0.11285044772717598220f);
1461 const __m256 VN0 = _mm256_set1_ps(1.1283802385263030286f);
1463 const __m256 VD3 = _mm256_set1_ps(0.0066752224023576045451f);
1464 const __m256 VD2 = _mm256_set1_ps(0.078647795836373922256f);
1465 const __m256 VD1 = _mm256_set1_ps(0.43336185284710920150f);
1466 const __m256 VD0 = _mm256_set1_ps(1.0f);
1468 __m256 z4;
1469 __m256 polyVN0, polyVN1, polyVD0, polyVD1;
1471 z4 = _mm256_mul_ps(z2, z2);
1473 polyVD1 = _mm256_mul_ps(VD3, z4);
1474 polyVD0 = _mm256_mul_ps(VD2, z4);
1475 polyVD1 = _mm256_add_ps(polyVD1, VD1);
1476 polyVD0 = _mm256_add_ps(polyVD0, VD0);
1477 polyVD1 = _mm256_mul_ps(polyVD1, z2);
1478 polyVD0 = _mm256_add_ps(polyVD0, polyVD1);
1480 polyVD0 = gmx_mm256_inv_ps(polyVD0);
1482 polyVN0 = _mm256_mul_ps(VN6, z4);
1483 polyVN1 = _mm256_mul_ps(VN5, z4);
1484 polyVN0 = _mm256_add_ps(polyVN0, VN4);
1485 polyVN1 = _mm256_add_ps(polyVN1, VN3);
1486 polyVN0 = _mm256_mul_ps(polyVN0, z4);
1487 polyVN1 = _mm256_mul_ps(polyVN1, z4);
1488 polyVN0 = _mm256_add_ps(polyVN0, VN2);
1489 polyVN1 = _mm256_add_ps(polyVN1, VN1);
1490 polyVN0 = _mm256_mul_ps(polyVN0, z4);
1491 polyVN1 = _mm256_mul_ps(polyVN1, z2);
1492 polyVN0 = _mm256_add_ps(polyVN0, VN0);
1493 polyVN0 = _mm256_add_ps(polyVN0, polyVN1);
1495 return _mm256_mul_ps(polyVN0, polyVD0);
1499 static __m128
1500 gmx_mm_pmecorrV_ps(__m128 z2)
1502 const __m128 VN6 = _mm_set1_ps(1.9296833005951166339e-8f);
1503 const __m128 VN5 = _mm_set1_ps(-1.4213390571557850962e-6f);
1504 const __m128 VN4 = _mm_set1_ps(0.000041603292906656984871f);
1505 const __m128 VN3 = _mm_set1_ps(-0.00013134036773265025626f);
1506 const __m128 VN2 = _mm_set1_ps(0.038657983986041781264f);
1507 const __m128 VN1 = _mm_set1_ps(0.11285044772717598220f);
1508 const __m128 VN0 = _mm_set1_ps(1.1283802385263030286f);
1510 const __m128 VD3 = _mm_set1_ps(0.0066752224023576045451f);
1511 const __m128 VD2 = _mm_set1_ps(0.078647795836373922256f);
1512 const __m128 VD1 = _mm_set1_ps(0.43336185284710920150f);
1513 const __m128 VD0 = _mm_set1_ps(1.0f);
1515 __m128 z4;
1516 __m128 polyVN0, polyVN1, polyVD0, polyVD1;
1518 z4 = _mm_mul_ps(z2, z2);
1520 polyVD1 = _mm_mul_ps(VD3, z4);
1521 polyVD0 = _mm_mul_ps(VD2, z4);
1522 polyVD1 = _mm_add_ps(polyVD1, VD1);
1523 polyVD0 = _mm_add_ps(polyVD0, VD0);
1524 polyVD1 = _mm_mul_ps(polyVD1, z2);
1525 polyVD0 = _mm_add_ps(polyVD0, polyVD1);
1527 polyVD0 = gmx_mm_inv_ps(polyVD0);
1529 polyVN0 = _mm_mul_ps(VN6, z4);
1530 polyVN1 = _mm_mul_ps(VN5, z4);
1531 polyVN0 = _mm_add_ps(polyVN0, VN4);
1532 polyVN1 = _mm_add_ps(polyVN1, VN3);
1533 polyVN0 = _mm_mul_ps(polyVN0, z4);
1534 polyVN1 = _mm_mul_ps(polyVN1, z4);
1535 polyVN0 = _mm_add_ps(polyVN0, VN2);
1536 polyVN1 = _mm_add_ps(polyVN1, VN1);
1537 polyVN0 = _mm_mul_ps(polyVN0, z4);
1538 polyVN1 = _mm_mul_ps(polyVN1, z2);
1539 polyVN0 = _mm_add_ps(polyVN0, VN0);
1540 polyVN0 = _mm_add_ps(polyVN0, polyVN1);
1542 return _mm_mul_ps(polyVN0, polyVD0);
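/* Illustrative addition (not part of the original header): a hypothetical
 * sketch of the usage steps documented above for the 256-bit potential
 * correction. As before, rsq is r^2, rinv is 1/r, beta the Ewald parameter
 * and qq the charge product; the names are made up for illustration:
 */
static __m256
example_mm256_pme_potential(__m256 rsq, __m256 rinv, __m256 beta, __m256 qq)
{
    __m256 z2   = _mm256_mul_ps(_mm256_mul_ps(beta, beta), rsq);  /* step 2        */
    __m256 corr = gmx_mm256_pmecorrV_ps(z2);                      /* erf(z)/z      */

    corr = _mm256_mul_ps(corr, beta);                             /* erf(r*beta)/r */

    return _mm256_mul_ps(qq, _mm256_sub_ps(rinv, corr));          /* step 6        */
}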
static int
gmx_mm256_sincos_ps(__m256 x,
                    __m256 *sinval,
                    __m256 *cosval)
{
    const __m256 two_over_pi = _mm256_set1_ps(2.0f/(float)M_PI);
    const __m256 half = _mm256_set1_ps(0.5f);
    const __m256 one = _mm256_set1_ps(1.0f);
    const __m256 zero = _mm256_setzero_ps();

    const __m128i ione = _mm_set1_epi32(1);

    const __m256 mask_one = _mm256_castsi256_ps(_mm256_set1_epi32(1));
    const __m256 mask_two = _mm256_castsi256_ps(_mm256_set1_epi32(2));
    const __m256 mask_three = _mm256_castsi256_ps(_mm256_set1_epi32(3));

    const __m256 CA1 = _mm256_set1_ps(1.5703125f);
    const __m256 CA2 = _mm256_set1_ps(4.837512969970703125e-4f);
    const __m256 CA3 = _mm256_set1_ps(7.54978995489188216e-8f);

    const __m256 CC0 = _mm256_set1_ps(-0.0013602249f);
    const __m256 CC1 = _mm256_set1_ps(0.0416566950f);
    const __m256 CC2 = _mm256_set1_ps(-0.4999990225f);
    const __m256 CS0 = _mm256_set1_ps(-0.0001950727f);
    const __m256 CS1 = _mm256_set1_ps(0.0083320758f);
    const __m256 CS2 = _mm256_set1_ps(-0.1666665247f);

    const __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );

    __m256 y, y2;
    __m256 z;
    __m256i iz;
    __m128i iz_high, iz_low;
    __m256 offset_sin, offset_cos;
    __m256 mask_sin, mask_cos;
    __m256 tmp1, tmp2;
    __m256 tmp_sin, tmp_cos;

    y = _mm256_mul_ps(x, two_over_pi);
    y = _mm256_add_ps(y, _mm256_or_ps(_mm256_and_ps(y, signbit), half));

    iz = _mm256_cvttps_epi32(y);
    z = _mm256_round_ps(y, _MM_FROUND_TO_ZERO);

    offset_sin = _mm256_and_ps(_mm256_castsi256_ps(iz), mask_three);

    iz_high = _mm256_extractf128_si256(iz, 0x1);
    iz_low = _mm256_castsi256_si128(iz);
    iz_low = _mm_add_epi32(iz_low, ione);
    iz_high = _mm_add_epi32(iz_high, ione);
    iz = _mm256_castsi128_si256(iz_low);
    iz = _mm256_insertf128_si256(iz, iz_high, 0x1);
    offset_cos = _mm256_castsi256_ps(iz);

    /* Extended precision arithmetic to achieve full precision */
    y = _mm256_mul_ps(z, CA1);
    tmp1 = _mm256_mul_ps(z, CA2);
    tmp2 = _mm256_mul_ps(z, CA3);
    y = _mm256_sub_ps(x, y);
    y = _mm256_sub_ps(y, tmp1);
    y = _mm256_sub_ps(y, tmp2);

    y2 = _mm256_mul_ps(y, y);

    tmp1 = _mm256_mul_ps(CC0, y2);
    tmp1 = _mm256_add_ps(tmp1, CC1);
    tmp2 = _mm256_mul_ps(CS0, y2);
    tmp2 = _mm256_add_ps(tmp2, CS1);
    tmp1 = _mm256_mul_ps(tmp1, y2);
    tmp1 = _mm256_add_ps(tmp1, CC2);
    tmp2 = _mm256_mul_ps(tmp2, y2);
    tmp2 = _mm256_add_ps(tmp2, CS2);

    tmp1 = _mm256_mul_ps(tmp1, y2);
    tmp1 = _mm256_add_ps(tmp1, one);

    tmp2 = _mm256_mul_ps(tmp2, _mm256_mul_ps(y, y2));
    tmp2 = _mm256_add_ps(tmp2, y);

#ifdef __INTEL_COMPILER
    /* Intel Compiler version 12.1.3 20120130 is buggy if optimization is enabled unless we cast explicitly! */
    mask_sin = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(offset_sin, mask_one))), zero, _CMP_EQ_OQ);
    mask_cos = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(offset_cos, mask_one))), zero, _CMP_EQ_OQ);
#else
    mask_sin = _mm256_cmp_ps( _mm256_and_ps(offset_sin, mask_one), zero, _CMP_EQ_OQ);
    mask_cos = _mm256_cmp_ps( _mm256_and_ps(offset_cos, mask_one), zero, _CMP_EQ_OQ);
#endif
    tmp_sin = _mm256_blendv_ps(tmp1, tmp2, mask_sin);
    tmp_cos = _mm256_blendv_ps(tmp1, tmp2, mask_cos);

    tmp1 = _mm256_xor_ps(signbit, tmp_sin);
    tmp2 = _mm256_xor_ps(signbit, tmp_cos);

#ifdef __INTEL_COMPILER
    /* Intel Compiler version 12.1.3 20120130 is buggy if optimization is enabled unless we cast explicitly! */
    mask_sin = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(offset_sin, mask_two))), zero, _CMP_EQ_OQ);
    mask_cos = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(offset_cos, mask_two))), zero, _CMP_EQ_OQ);
#else
    mask_sin = _mm256_cmp_ps( _mm256_and_ps(offset_sin, mask_two), zero, _CMP_EQ_OQ);
    mask_cos = _mm256_cmp_ps( _mm256_and_ps(offset_cos, mask_two), zero, _CMP_EQ_OQ);
#endif
    *sinval = _mm256_blendv_ps(tmp1, tmp_sin, mask_sin);
    *cosval = _mm256_blendv_ps(tmp2, tmp_cos, mask_cos);

    return 0;
}
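/* 128-bit (SSE) version of sincos. The reduction and the polynomials are the
 * same as in the 256-bit routine above; the structural difference is that the
 * quadrant offsets stay in integer registers (offset_sin = iz & 3,
 * offset_cos = iz + 1) and are tested with _mm_cmpeq_epi32, since full-width
 * integer operations are available for 128-bit vectors.
 */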
static int
gmx_mm_sincos_ps(__m128 x,
                 __m128 *sinval,
                 __m128 *cosval)
{
    const __m128 two_over_pi = _mm_set1_ps(2.0/M_PI);
    const __m128 half = _mm_set1_ps(0.5);
    const __m128 one = _mm_set1_ps(1.0);

    const __m128i izero = _mm_set1_epi32(0);
    const __m128i ione = _mm_set1_epi32(1);
    const __m128i itwo = _mm_set1_epi32(2);
    const __m128i ithree = _mm_set1_epi32(3);
    const __m128 signbit = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );

    const __m128 CA1 = _mm_set1_ps(1.5703125f);
    const __m128 CA2 = _mm_set1_ps(4.837512969970703125e-4f);
    const __m128 CA3 = _mm_set1_ps(7.54978995489188216e-8f);

    const __m128 CC0 = _mm_set1_ps(-0.0013602249f);
    const __m128 CC1 = _mm_set1_ps(0.0416566950f);
    const __m128 CC2 = _mm_set1_ps(-0.4999990225f);
    const __m128 CS0 = _mm_set1_ps(-0.0001950727f);
    const __m128 CS1 = _mm_set1_ps(0.0083320758f);
    const __m128 CS2 = _mm_set1_ps(-0.1666665247f);

    __m128 y, y2;
    __m128 z;
    __m128i iz;
    __m128i offset_sin, offset_cos;
    __m128 tmp1, tmp2;
    __m128 mask_sin, mask_cos;
    __m128 tmp_sin, tmp_cos;

    y = _mm_mul_ps(x, two_over_pi);
    y = _mm_add_ps(y, _mm_or_ps(_mm_and_ps(y, signbit), half));

    iz = _mm_cvttps_epi32(y);
    z = _mm_round_ps(y, _MM_FROUND_TO_ZERO);

    offset_sin = _mm_and_si128(iz, ithree);
    offset_cos = _mm_add_epi32(iz, ione);

    /* Extended precision arithmetic to achieve full precision */
    y = _mm_mul_ps(z, CA1);
    tmp1 = _mm_mul_ps(z, CA2);
    tmp2 = _mm_mul_ps(z, CA3);
    y = _mm_sub_ps(x, y);
    y = _mm_sub_ps(y, tmp1);
    y = _mm_sub_ps(y, tmp2);

    y2 = _mm_mul_ps(y, y);

    tmp1 = _mm_mul_ps(CC0, y2);
    tmp1 = _mm_add_ps(tmp1, CC1);
    tmp2 = _mm_mul_ps(CS0, y2);
    tmp2 = _mm_add_ps(tmp2, CS1);
    tmp1 = _mm_mul_ps(tmp1, y2);
    tmp1 = _mm_add_ps(tmp1, CC2);
    tmp2 = _mm_mul_ps(tmp2, y2);
    tmp2 = _mm_add_ps(tmp2, CS2);

    tmp1 = _mm_mul_ps(tmp1, y2);
    tmp1 = _mm_add_ps(tmp1, one);

    tmp2 = _mm_mul_ps(tmp2, _mm_mul_ps(y, y2));
    tmp2 = _mm_add_ps(tmp2, y);

    mask_sin = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin, ione), izero));
    mask_cos = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos, ione), izero));

    tmp_sin = _mm_blendv_ps(tmp1, tmp2, mask_sin);
    tmp_cos = _mm_blendv_ps(tmp1, tmp2, mask_cos);

    mask_sin = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin, itwo), izero));
    mask_cos = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos, itwo), izero));

    tmp1 = _mm_xor_ps(signbit, tmp_sin);
    tmp2 = _mm_xor_ps(signbit, tmp_cos);

    *sinval = _mm_blendv_ps(tmp1, tmp_sin, mask_sin);
    *cosval = _mm_blendv_ps(tmp2, tmp_cos, mask_cos);

    return 0;
}
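/* Usage sketch for the sincos routines above (illustrative only; the variable
 * names are hypothetical and not part of this header):
 *
 *   __m256 theta = _mm256_set1_ps(0.25f*(float)M_PI);
 *   __m256 s, c;
 *   gmx_mm256_sincos_ps(theta, &s, &c);    // one call gives both results
 *
 * As the note below stresses, prefer a single sincos call over separate
 * gmx_mm256_sin_ps() and gmx_mm256_cos_ps() calls when both values are needed.
 */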
/*
 * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
 * will then call the sincos() routine and waste a factor 2 in performance!
 */
static __m256
gmx_mm256_sin_ps(__m256 x)
{
    __m256 s, c;
    gmx_mm256_sincos_ps(x, &s, &c);
    return s;
}

static __m128
gmx_mm_sin_ps(__m128 x)
{
    __m128 s, c;
    gmx_mm_sincos_ps(x, &s, &c);
    return s;
}
/*
 * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
 * will then call the sincos() routine and waste a factor 2 in performance!
 */
static __m256
gmx_mm256_cos_ps(__m256 x)
{
    __m256 s, c;
    gmx_mm256_sincos_ps(x, &s, &c);
    return c;
}

static __m128
gmx_mm_cos_ps(__m128 x)
{
    __m128 s, c;
    gmx_mm_sincos_ps(x, &s, &c);
    return c;
}
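/* tan(x), evaluated below as sin(x)*(1/cos(x)) using one sincos call plus the
 * reciprocal routine defined earlier in this header. Note that no special
 * handling is done near the poles of tan, where cos(x) approaches zero.
 */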
static __m256
gmx_mm256_tan_ps(__m256 x)
{
    __m256 sinval, cosval;
    __m256 tanval;

    gmx_mm256_sincos_ps(x, &sinval, &cosval);

    tanval = _mm256_mul_ps(sinval, gmx_mm256_inv_ps(cosval));

    return tanval;
}

static __m128
gmx_mm_tan_ps(__m128 x)
{
    __m128 sinval, cosval;
    __m128 tanval;

    gmx_mm_sincos_ps(x, &sinval, &cosval);

    tanval = _mm_mul_ps(sinval, gmx_mm_inv_ps(cosval));

    return tanval;
}
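/* asin(x), following the same cephes-style scheme as the 128-bit version
 * further down: for |x| > 0.5 the identity
 * asin(x) = pi/2 - 2*asin(sqrt((1-|x|)/2)) is used, otherwise a polynomial in
 * x*x is applied to x directly. Inputs with |x| <= 1e-4 simply return x, and
 * the sign of the argument is restored at the end with an xor.
 */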
static __m256
gmx_mm256_asin_ps(__m256 x)
{
    const __m256 signmask = _mm256_castsi256_ps( _mm256_set1_epi32(0x7FFFFFFF) );
    const __m256 limitlow = _mm256_set1_ps(1e-4f);
    const __m256 half = _mm256_set1_ps(0.5f);
    const __m256 one = _mm256_set1_ps(1.0f);
    const __m256 halfpi = _mm256_set1_ps((float)M_PI/2.0f);

    const __m256 CC5 = _mm256_set1_ps(4.2163199048E-2f);
    const __m256 CC4 = _mm256_set1_ps(2.4181311049E-2f);
    const __m256 CC3 = _mm256_set1_ps(4.5470025998E-2f);
    const __m256 CC2 = _mm256_set1_ps(7.4953002686E-2f);
    const __m256 CC1 = _mm256_set1_ps(1.6666752422E-1f);

    __m256 sign;
    __m256 mask;
    __m256 xabs;
    __m256 z, z1, z2, q, q1, q2;
    __m256 pA, pB;

    sign = _mm256_andnot_ps(signmask, x);
    xabs = _mm256_and_ps(x, signmask);

    mask = _mm256_cmp_ps(xabs, half, _CMP_GT_OQ);

    z1 = _mm256_mul_ps(half, _mm256_sub_ps(one, xabs));
    q1 = _mm256_mul_ps(z1, gmx_mm256_invsqrt_ps(z1));
    q1 = _mm256_andnot_ps(_mm256_cmp_ps(xabs, one, _CMP_EQ_OQ), q1);

    q2 = xabs;
    z2 = _mm256_mul_ps(q2, q2);

    z = _mm256_blendv_ps(z2, z1, mask);
    q = _mm256_blendv_ps(q2, q1, mask);

    z2 = _mm256_mul_ps(z, z);

    pA = _mm256_mul_ps(CC5, z2);
    pB = _mm256_mul_ps(CC4, z2);

    pA = _mm256_add_ps(pA, CC3);
    pB = _mm256_add_ps(pB, CC2);

    pA = _mm256_mul_ps(pA, z2);
    pB = _mm256_mul_ps(pB, z2);

    pA = _mm256_add_ps(pA, CC1);
    pA = _mm256_mul_ps(pA, z);

    z = _mm256_add_ps(pA, pB);
    z = _mm256_mul_ps(z, q);
    z = _mm256_add_ps(z, q);

    q2 = _mm256_sub_ps(halfpi, z);
    q2 = _mm256_sub_ps(q2, z);

    z = _mm256_blendv_ps(z, q2, mask);

    mask = _mm256_cmp_ps(xabs, limitlow, _CMP_GT_OQ);
    z = _mm256_blendv_ps(xabs, z, mask);

    z = _mm256_xor_ps(z, sign);

    return z;
}
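/* 128-bit asin. Same algorithm as the 256-bit version above; the lane
 * selection is done with _mm_and_ps/_mm_andnot_ps/_mm_or_ps combinations
 * instead of blend instructions, but the per-lane result is equivalent.
 */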
static __m128
gmx_mm_asin_ps(__m128 x)
{
    /* Same algorithm as cephes library */
    const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
    const __m128 limitlow = _mm_set1_ps(1e-4f);
    const __m128 half = _mm_set1_ps(0.5f);
    const __m128 one = _mm_set1_ps(1.0f);
    const __m128 halfpi = _mm_set1_ps(M_PI/2.0f);

    const __m128 CC5 = _mm_set1_ps(4.2163199048E-2f);
    const __m128 CC4 = _mm_set1_ps(2.4181311049E-2f);
    const __m128 CC3 = _mm_set1_ps(4.5470025998E-2f);
    const __m128 CC2 = _mm_set1_ps(7.4953002686E-2f);
    const __m128 CC1 = _mm_set1_ps(1.6666752422E-1f);

    __m128 sign;
    __m128 mask;
    __m128 xabs;
    __m128 z, z1, z2, q, q1, q2;
    __m128 pA, pB;

    sign = _mm_andnot_ps(signmask, x);
    xabs = _mm_and_ps(x, signmask);

    mask = _mm_cmpgt_ps(xabs, half);

    z1 = _mm_mul_ps(half, _mm_sub_ps(one, xabs));
    q1 = _mm_mul_ps(z1, gmx_mm_invsqrt_ps(z1));
    q1 = _mm_andnot_ps(_mm_cmpeq_ps(xabs, one), q1);

    q2 = xabs;
    z2 = _mm_mul_ps(q2, q2);

    z = _mm_or_ps( _mm_and_ps(mask, z1), _mm_andnot_ps(mask, z2) );
    q = _mm_or_ps( _mm_and_ps(mask, q1), _mm_andnot_ps(mask, q2) );

    z2 = _mm_mul_ps(z, z);

    pA = _mm_mul_ps(CC5, z2);
    pB = _mm_mul_ps(CC4, z2);

    pA = _mm_add_ps(pA, CC3);
    pB = _mm_add_ps(pB, CC2);

    pA = _mm_mul_ps(pA, z2);
    pB = _mm_mul_ps(pB, z2);

    pA = _mm_add_ps(pA, CC1);
    pA = _mm_mul_ps(pA, z);

    z = _mm_add_ps(pA, pB);
    z = _mm_mul_ps(z, q);
    z = _mm_add_ps(z, q);

    q2 = _mm_sub_ps(halfpi, z);
    q2 = _mm_sub_ps(q2, z);

    z = _mm_or_ps( _mm_and_ps(mask, q2), _mm_andnot_ps(mask, z) );

    mask = _mm_cmpgt_ps(xabs, limitlow);
    z = _mm_or_ps( _mm_and_ps(mask, z), _mm_andnot_ps(mask, xabs) );

    z = _mm_xor_ps(z, sign);

    return z;
}
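/* acos(x), built on asin: for |x| > 0.5 the argument is first mapped to
 * sqrt((1-|x|)/2) and the asin of that value is doubled, giving
 * acos(x) = 2*asin(sqrt((1-x)/2)) for x > 0 and pi minus that value for x < 0;
 * for |x| <= 0.5 the simpler acos(x) = pi/2 - asin(x) is used.
 */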
static __m256
gmx_mm256_acos_ps(__m256 x)
{
    const __m256 signmask = _mm256_castsi256_ps( _mm256_set1_epi32(0x7FFFFFFF) );
    const __m256 one_ps = _mm256_set1_ps(1.0f);
    const __m256 half_ps = _mm256_set1_ps(0.5f);
    const __m256 pi_ps = _mm256_set1_ps((float)M_PI);
    const __m256 halfpi_ps = _mm256_set1_ps((float)M_PI/2.0f);

    __m256 mask1;
    __m256 mask2;
    __m256 xabs;
    __m256 z, z1, z2, z3;

    xabs = _mm256_and_ps(x, signmask);
    mask1 = _mm256_cmp_ps(xabs, half_ps, _CMP_GT_OQ);
    mask2 = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_GT_OQ);

    z = _mm256_mul_ps(half_ps, _mm256_sub_ps(one_ps, xabs));
    z = _mm256_mul_ps(z, gmx_mm256_invsqrt_ps(z));
    z = _mm256_andnot_ps(_mm256_cmp_ps(xabs, one_ps, _CMP_EQ_OQ), z);

    z = _mm256_blendv_ps(x, z, mask1);
    z = gmx_mm256_asin_ps(z);

    z2 = _mm256_add_ps(z, z);
    z1 = _mm256_sub_ps(pi_ps, z2);
    z3 = _mm256_sub_ps(halfpi_ps, z);

    z = _mm256_blendv_ps(z1, z2, mask2);
    z = _mm256_blendv_ps(z3, z, mask1);

    return z;
}
static __m128
gmx_mm_acos_ps(__m128 x)
{
    const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
    const __m128 one_ps = _mm_set1_ps(1.0f);
    const __m128 half_ps = _mm_set1_ps(0.5f);
    const __m128 pi_ps = _mm_set1_ps(M_PI);
    const __m128 halfpi_ps = _mm_set1_ps(M_PI/2.0f);

    __m128 mask1;
    __m128 mask2;
    __m128 xabs;
    __m128 z, z1, z2, z3;

    xabs = _mm_and_ps(x, signmask);
    mask1 = _mm_cmpgt_ps(xabs, half_ps);
    mask2 = _mm_cmpgt_ps(x, _mm_setzero_ps());

    z = _mm_mul_ps(half_ps, _mm_sub_ps(one_ps, xabs));
    z = _mm_mul_ps(z, gmx_mm_invsqrt_ps(z));
    z = _mm_andnot_ps(_mm_cmpeq_ps(xabs, one_ps), z);

    z = _mm_blendv_ps(x, z, mask1);
    z = gmx_mm_asin_ps(z);

    z2 = _mm_add_ps(z, z);
    z1 = _mm_sub_ps(pi_ps, z2);
    z3 = _mm_sub_ps(halfpi_ps, z);

    z = _mm_blendv_ps(z1, z2, mask2);
    z = _mm_blendv_ps(z3, z, mask1);

    return z;
}
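/* atan(x), cephes-style range reduction (the same scheme as the 128-bit
 * version below): arguments above limit2 = tan(3*pi/8) use
 * atan(x) = pi/2 - atan(1/x), arguments between limit1 = tan(pi/8) and limit2
 * use atan(x) = pi/4 + atan((x-1)/(x+1)), and the reduced argument is fed to a
 * degree-9 odd polynomial (coefficients CC3..CC9). The sign of x is restored
 * at the end.
 */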
static __m256
gmx_mm256_atan_ps(__m256 x)
{
    const __m256 signmask = _mm256_castsi256_ps( _mm256_set1_epi32(0x7FFFFFFF) );
    const __m256 limit1 = _mm256_set1_ps(0.414213562373095f);
    const __m256 limit2 = _mm256_set1_ps(2.414213562373095f);
    const __m256 quarterpi = _mm256_set1_ps(0.785398163397448f);
    const __m256 halfpi = _mm256_set1_ps(1.570796326794896f);
    const __m256 mone = _mm256_set1_ps(-1.0f);
    const __m256 CC3 = _mm256_set1_ps(-3.33329491539E-1f);
    const __m256 CC5 = _mm256_set1_ps(1.99777106478E-1f);
    const __m256 CC7 = _mm256_set1_ps(-1.38776856032E-1f);
    const __m256 CC9 = _mm256_set1_ps(8.05374449538e-2f);

    __m256 sign;
    __m256 mask1, mask2;
    __m256 y, z1, z2;
    __m256 x2, x4;
    __m256 sum1, sum2;

    sign = _mm256_andnot_ps(signmask, x);
    x = _mm256_and_ps(x, signmask);

    mask1 = _mm256_cmp_ps(x, limit1, _CMP_GT_OQ);
    mask2 = _mm256_cmp_ps(x, limit2, _CMP_GT_OQ);

    z1 = _mm256_mul_ps(_mm256_add_ps(x, mone), gmx_mm256_inv_ps(_mm256_sub_ps(x, mone)));
    z2 = _mm256_mul_ps(mone, gmx_mm256_inv_ps(x));

    y = _mm256_and_ps(mask1, quarterpi);
    y = _mm256_blendv_ps(y, halfpi, mask2);

    x = _mm256_blendv_ps(x, z1, mask1);
    x = _mm256_blendv_ps(x, z2, mask2);

    x2 = _mm256_mul_ps(x, x);
    x4 = _mm256_mul_ps(x2, x2);

    sum1 = _mm256_mul_ps(CC9, x4);
    sum2 = _mm256_mul_ps(CC7, x4);
    sum1 = _mm256_add_ps(sum1, CC5);
    sum2 = _mm256_add_ps(sum2, CC3);
    sum1 = _mm256_mul_ps(sum1, x4);
    sum2 = _mm256_mul_ps(sum2, x2);

    sum1 = _mm256_add_ps(sum1, sum2);
    sum1 = _mm256_sub_ps(sum1, mone);
    sum1 = _mm256_mul_ps(sum1, x);
    y = _mm256_add_ps(y, sum1);

    y = _mm256_xor_ps(y, sign);

    return y;
}
static __m128
gmx_mm_atan_ps(__m128 x)
{
    /* Same algorithm as cephes library */
    const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
    const __m128 limit1 = _mm_set1_ps(0.414213562373095f);
    const __m128 limit2 = _mm_set1_ps(2.414213562373095f);
    const __m128 quarterpi = _mm_set1_ps(0.785398163397448f);
    const __m128 halfpi = _mm_set1_ps(1.570796326794896f);
    const __m128 mone = _mm_set1_ps(-1.0f);
    const __m128 CC3 = _mm_set1_ps(-3.33329491539E-1f);
    const __m128 CC5 = _mm_set1_ps(1.99777106478E-1f);
    const __m128 CC7 = _mm_set1_ps(-1.38776856032E-1f);
    const __m128 CC9 = _mm_set1_ps(8.05374449538e-2f);

    __m128 sign;
    __m128 mask1, mask2;
    __m128 y, z1, z2;
    __m128 x2, x4;
    __m128 sum1, sum2;

    sign = _mm_andnot_ps(signmask, x);
    x = _mm_and_ps(x, signmask);

    mask1 = _mm_cmpgt_ps(x, limit1);
    mask2 = _mm_cmpgt_ps(x, limit2);

    z1 = _mm_mul_ps(_mm_add_ps(x, mone), gmx_mm_inv_ps(_mm_sub_ps(x, mone)));
    z2 = _mm_mul_ps(mone, gmx_mm_inv_ps(x));

    y = _mm_and_ps(mask1, quarterpi);
    y = _mm_blendv_ps(y, halfpi, mask2);

    x = _mm_blendv_ps(x, z1, mask1);
    x = _mm_blendv_ps(x, z2, mask2);

    x2 = _mm_mul_ps(x, x);
    x4 = _mm_mul_ps(x2, x2);

    sum1 = _mm_mul_ps(CC9, x4);
    sum2 = _mm_mul_ps(CC7, x4);
    sum1 = _mm_add_ps(sum1, CC5);
    sum2 = _mm_add_ps(sum2, CC3);
    sum1 = _mm_mul_ps(sum1, x4);
    sum2 = _mm_mul_ps(sum2, x2);

    sum1 = _mm_add_ps(sum1, sum2);
    sum1 = _mm_sub_ps(sum1, mone);
    sum1 = _mm_mul_ps(sum1, x);
    y = _mm_add_ps(y, sum1);

    y = _mm_xor_ps(y, sign);

    return y;
}
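/* atan2(y, x): the bulk of the work is atan(y/x) plus a quadrant correction of
 * +/- pi when x < 0 (sign chosen to follow y). Lanes with x == 0 or y == 0 are
 * handled separately through masks: x == 0 gives +/- pi/2 according to the
 * sign of y, y == 0 gives 0 for x >= 0 and pi for x < 0, and atan2(0, 0)
 * returns 0. The atan(y/x) value in those special lanes is discarded, so the
 * 1/x computed for x == 0 never reaches the result.
 */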
static __m256
gmx_mm256_atan2_ps(__m256 y, __m256 x)
{
    const __m256 pi = _mm256_set1_ps( (float) M_PI);
    const __m256 minuspi = _mm256_set1_ps( (float) -M_PI);
    const __m256 halfpi = _mm256_set1_ps( (float) M_PI/2.0f);
    const __m256 minushalfpi = _mm256_set1_ps( (float) -M_PI/2.0f);

    __m256 z, z1, z3, z4;
    __m256 w;
    __m256 maskx_lt, maskx_eq;
    __m256 masky_lt, masky_eq;
    __m256 mask1, mask2, mask3, mask4, maskall;

    maskx_lt = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LT_OQ);
    masky_lt = _mm256_cmp_ps(y, _mm256_setzero_ps(), _CMP_LT_OQ);
    maskx_eq = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_EQ_OQ);
    masky_eq = _mm256_cmp_ps(y, _mm256_setzero_ps(), _CMP_EQ_OQ);

    z = _mm256_mul_ps(y, gmx_mm256_inv_ps(x));
    z = gmx_mm256_atan_ps(z);

    mask1 = _mm256_and_ps(maskx_eq, masky_lt);
    mask2 = _mm256_andnot_ps(maskx_lt, masky_eq);
    mask3 = _mm256_andnot_ps( _mm256_or_ps(masky_lt, masky_eq), maskx_eq);
    mask4 = _mm256_and_ps(maskx_lt, masky_eq);
    maskall = _mm256_or_ps( _mm256_or_ps(mask1, mask2), _mm256_or_ps(mask3, mask4) );

    z = _mm256_andnot_ps(maskall, z);
    z1 = _mm256_and_ps(mask1, minushalfpi);
    z3 = _mm256_and_ps(mask3, halfpi);
    z4 = _mm256_and_ps(mask4, pi);

    z = _mm256_or_ps( _mm256_or_ps(z, z1), _mm256_or_ps(z3, z4) );

    w = _mm256_blendv_ps(pi, minuspi, masky_lt);
    w = _mm256_and_ps(w, maskx_lt);

    w = _mm256_andnot_ps(maskall, w);

    z = _mm256_add_ps(z, w);

    return z;
}
static __m128
gmx_mm_atan2_ps(__m128 y, __m128 x)
{
    const __m128 pi = _mm_set1_ps(M_PI);
    const __m128 minuspi = _mm_set1_ps(-M_PI);
    const __m128 halfpi = _mm_set1_ps(M_PI/2.0);
    const __m128 minushalfpi = _mm_set1_ps(-M_PI/2.0);

    __m128 z, z1, z3, z4;
    __m128 w;
    __m128 maskx_lt, maskx_eq;
    __m128 masky_lt, masky_eq;
    __m128 mask1, mask2, mask3, mask4, maskall;

    maskx_lt = _mm_cmplt_ps(x, _mm_setzero_ps());
    masky_lt = _mm_cmplt_ps(y, _mm_setzero_ps());
    maskx_eq = _mm_cmpeq_ps(x, _mm_setzero_ps());
    masky_eq = _mm_cmpeq_ps(y, _mm_setzero_ps());

    z = _mm_mul_ps(y, gmx_mm_inv_ps(x));
    z = gmx_mm_atan_ps(z);

    mask1 = _mm_and_ps(maskx_eq, masky_lt);
    mask2 = _mm_andnot_ps(maskx_lt, masky_eq);
    mask3 = _mm_andnot_ps( _mm_or_ps(masky_lt, masky_eq), maskx_eq);
    mask4 = _mm_and_ps(masky_eq, maskx_lt);

    maskall = _mm_or_ps( _mm_or_ps(mask1, mask2), _mm_or_ps(mask3, mask4) );

    z = _mm_andnot_ps(maskall, z);
    z1 = _mm_and_ps(mask1, minushalfpi);
    z3 = _mm_and_ps(mask3, halfpi);
    z4 = _mm_and_ps(mask4, pi);

    z = _mm_or_ps( _mm_or_ps(z, z1), _mm_or_ps(z3, z4) );

    mask1 = _mm_andnot_ps(masky_lt, maskx_lt);
    mask2 = _mm_and_ps(maskx_lt, masky_lt);

    w = _mm_or_ps( _mm_and_ps(mask1, pi), _mm_and_ps(mask2, minuspi) );
    w = _mm_andnot_ps(maskall, w);

    z = _mm_add_ps(z, w);

    return z;
}
#endif /* _gmx_math_x86_avx_256_single_h_ */