/* [gromacs.git] include/gmx_math_x86_sse2_single.h */
/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013, by the GROMACS development team, led by
 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
 * others, as listed in the AUTHORS file in the top-level source
 * directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef _gmx_math_x86_sse2_single_h_
#define _gmx_math_x86_sse2_single_h_

#include <stdio.h>
#include <math.h>

#include "gmx_x86_sse2.h"

#ifndef M_PI
#    define M_PI 3.14159265358979323846264338327950288
#endif
/************************
 *                      *
 * Simple math routines *
 *                      *
 ************************/

/* 1.0/sqrt(x) */
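/* _mm_rsqrt_ps() only provides a ~12-bit estimate lu, so the return expression
 * below refines it with one Newton-Raphson iteration, lu <- 0.5*lu*(3 - x*lu*lu),
 * which brings the result close to full single precision.
 */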
static gmx_inline __m128
gmx_mm_invsqrt_ps(__m128 x)
{
    const __m128 half  = _mm_set_ps(0.5, 0.5, 0.5, 0.5);
    const __m128 three = _mm_set_ps(3.0, 3.0, 3.0, 3.0);

    __m128       lu    = _mm_rsqrt_ps(x);

    return _mm_mul_ps(half, _mm_mul_ps(_mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(lu, lu), x)), lu));
}
/* sqrt(x) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
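/* The zero-comparison mask below zeroes the 1/sqrt(x) result for x == 0, so that
 * sqrt(0) returns 0 instead of the NaN that 0 * (1/sqrt(0)) would otherwise give.
 */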
static gmx_inline __m128
gmx_mm_sqrt_ps(__m128 x)
{
    __m128 mask;
    __m128 res;

    mask = _mm_cmpeq_ps(x, _mm_setzero_ps());
    res  = _mm_andnot_ps(mask, gmx_mm_invsqrt_ps(x));

    res  = _mm_mul_ps(x, res);

    return res;
}
/* 1.0/x */
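/* As above, _mm_rcp_ps() gives only a ~12-bit estimate; one Newton-Raphson step,
 * lu <- lu*(2 - x*lu), refines it to nearly full single precision.
 */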
static gmx_inline __m128
gmx_mm_inv_ps(__m128 x)
{
    const __m128 two = _mm_set_ps(2.0f, 2.0f, 2.0f, 2.0f);

    __m128       lu  = _mm_rcp_ps(x);

    return _mm_mul_ps(lu, _mm_sub_ps(two, _mm_mul_ps(lu, x)));
}
static gmx_inline __m128
gmx_mm_abs_ps(__m128 x)
{
    const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );

    return _mm_and_ps(x, signmask);
}
static __m128
gmx_mm_log_ps(__m128 x)
{
    /* Same algorithm as cephes library */
    const __m128  expmask    = gmx_mm_castsi128_ps( _mm_set_epi32(0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000) );
    const __m128i expbase_m1 = _mm_set1_epi32(127-1); /* We want non-IEEE format */
    const __m128  half       = _mm_set1_ps(0.5f);
    const __m128  one        = _mm_set1_ps(1.0f);
    const __m128  invsq2     = _mm_set1_ps(1.0f/sqrt(2.0f));
    const __m128  corr1      = _mm_set1_ps(-2.12194440e-4f);
    const __m128  corr2      = _mm_set1_ps(0.693359375f);

    const __m128  CA_1       = _mm_set1_ps(0.070376836292f);
    const __m128  CB_0       = _mm_set1_ps(1.6714950086782716f);
    const __m128  CB_1       = _mm_set1_ps(-2.452088066061482f);
    const __m128  CC_0       = _mm_set1_ps(1.5220770854701728f);
    const __m128  CC_1       = _mm_set1_ps(-1.3422238433233642f);
    const __m128  CD_0       = _mm_set1_ps(1.386218787509749f);
    const __m128  CD_1       = _mm_set1_ps(0.35075468953796346f);
    const __m128  CE_0       = _mm_set1_ps(1.3429983063133937f);
    const __m128  CE_1       = _mm_set1_ps(1.807420826584643f);

    __m128        fexp, fexp1;
    __m128i       iexp;
    __m128        mask;
    __m128        x1, x2;
    __m128        y;
    __m128        pA, pB, pC, pD, pE, tB, tC, tD, tE;

    /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
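    /* The pieces recombine as log(x) = log(mantissa) + exponent*log(2), where
     * log(2) is split into corr2 + corr1 (0.693359375 - 2.12194440e-4) so the
     * exponent contribution can be added back in two steps with extra precision.
     */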
    fexp  = _mm_and_ps(x, expmask);
    iexp  = gmx_mm_castps_si128(fexp);
    iexp  = _mm_srli_epi32(iexp, 23);
    iexp  = _mm_sub_epi32(iexp, expbase_m1);

    x     = _mm_andnot_ps(expmask, x);
    x     = _mm_or_ps(x, one);
    x     = _mm_mul_ps(x, half);

    mask  = _mm_cmplt_ps(x, invsq2);

    x     = _mm_add_ps(x, _mm_and_ps(mask, x));
    x     = _mm_sub_ps(x, one);
    iexp  = _mm_add_epi32(iexp, gmx_mm_castps_si128(mask)); /* 0xFFFFFFFF = -1 as int */

    x2    = _mm_mul_ps(x, x);

    pA    = _mm_mul_ps(CA_1, x);
    pB    = _mm_mul_ps(CB_1, x);
    pC    = _mm_mul_ps(CC_1, x);
    pD    = _mm_mul_ps(CD_1, x);
    pE    = _mm_mul_ps(CE_1, x);
    tB    = _mm_add_ps(CB_0, x2);
    tC    = _mm_add_ps(CC_0, x2);
    tD    = _mm_add_ps(CD_0, x2);
    tE    = _mm_add_ps(CE_0, x2);
    pB    = _mm_add_ps(pB, tB);
    pC    = _mm_add_ps(pC, tC);
    pD    = _mm_add_ps(pD, tD);
    pE    = _mm_add_ps(pE, tE);

    pA    = _mm_mul_ps(pA, pB);
    pC    = _mm_mul_ps(pC, pD);
    pE    = _mm_mul_ps(pE, x2);
    pA    = _mm_mul_ps(pA, pC);
    y     = _mm_mul_ps(pA, pE);

    fexp  = _mm_cvtepi32_ps(iexp);
    y     = _mm_add_ps(y, _mm_mul_ps(fexp, corr1));

    y     = _mm_sub_ps(y, _mm_mul_ps(half, x2));
    x2    = _mm_add_ps(x, y);

    x2    = _mm_add_ps(x2, _mm_mul_ps(fexp, corr2));

    return x2;
}
/* 2^x function.
 *
 * The 2^w term is calculated from a (6,0)-th order (no denominator) minimax polynomial on the interval
 * [-0.5,0.5]. The coefficients were derived in Mathematica using the command:
 *
 * MiniMaxApproximation[(2^x), {x, {-0.5, 0.5}, 6, 0}, WorkingPrecision -> 15]
 *
 * The largest-magnitude exponent we can represent in IEEE single-precision binary format
 * is 2^-126 for small numbers and 2^127 for large ones. To avoid wrap-around problems, we set the
 * result to zero if the argument falls outside this range. For small numbers this is just fine, but
 * for large numbers you could be fancy and return the smallest/largest IEEE single-precision
 * number instead. That would take a few extra cycles and not really help, since something is
 * wrong if you are using single precision to work with numbers that cannot really be represented
 * in single precision.
 *
 * The accuracy is at least 23 bits.
 */
static __m128
gmx_mm_exp2_ps(__m128 x)
{
    /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
    const __m128  arglimit = _mm_set1_ps(126.0f);

    const __m128i expbase  = _mm_set1_epi32(127);
    const __m128  CA6      = _mm_set1_ps(1.535336188319500E-004);
    const __m128  CA5      = _mm_set1_ps(1.339887440266574E-003);
    const __m128  CA4      = _mm_set1_ps(9.618437357674640E-003);
    const __m128  CA3      = _mm_set1_ps(5.550332471162809E-002);
    const __m128  CA2      = _mm_set1_ps(2.402264791363012E-001);
    const __m128  CA1      = _mm_set1_ps(6.931472028550421E-001);
    const __m128  CA0      = _mm_set1_ps(1.0f);

    __m128        valuemask;
    __m128i       iexppart;
    __m128        fexppart;
    __m128        intpart;
    __m128        x2;
    __m128        p0, p1;

    iexppart  = _mm_cvtps_epi32(x);
    intpart   = _mm_cvtepi32_ps(iexppart);
    iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart, expbase), 23);
    valuemask = _mm_cmpge_ps(arglimit, gmx_mm_abs_ps(x));
    fexppart  = _mm_and_ps(valuemask, gmx_mm_castsi128_ps(iexppart));

    x         = _mm_sub_ps(x, intpart);
    x2        = _mm_mul_ps(x, x);

    p0        = _mm_mul_ps(CA6, x2);
    p1        = _mm_mul_ps(CA5, x2);
    p0        = _mm_add_ps(p0, CA4);
    p1        = _mm_add_ps(p1, CA3);
    p0        = _mm_mul_ps(p0, x2);
    p1        = _mm_mul_ps(p1, x2);
    p0        = _mm_add_ps(p0, CA2);
    p1        = _mm_add_ps(p1, CA1);
    p0        = _mm_mul_ps(p0, x2);
    p1        = _mm_mul_ps(p1, x);
    p0        = _mm_add_ps(p0, CA0);
    p0        = _mm_add_ps(p0, p1);
    x         = _mm_mul_ps(p0, fexppart);

    return x;
}
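/* Illustrative usage sketch (hypothetical helper, not part of the original
 * header): evaluate 2^x for four packed single-precision values stored in plain
 * arrays. The array handling and the function name are assumptions made for the
 * example only.
 */
static void
gmx_mm_exp2_usage_example(const float xin[4], float xout[4])
{
    __m128 x = _mm_loadu_ps(xin);            /* unaligned load of the four arguments     */
    _mm_storeu_ps(xout, gmx_mm_exp2_ps(x));  /* 2^x; arguments with |x| > 126 return 0.0 */
}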
/* Exponential function. This could be calculated from 2^x as Exp(x)=2^(y), where y=log2(e)*x,
 * but there will then be a small rounding error since we lose some precision due to the
 * multiplication. This will then be magnified a lot by the exponential.
 *
 * Instead, we calculate the fractional part directly as a minimax approximation of
 * Exp(z) on [-0.5,0.5]. We use extended precision arithmetic to calculate the fraction
 * remaining after 2^y, which avoids the precision loss.
 * The final result is correct to within 1 LSB over the entire argument range.
 */
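/* Concretely, the reduction below computes y = x*log2(e), rounds y to intpart,
 * and then subtracts intpart*ln(2) from x with ln(2) split into
 * invargscale0 + invargscale1 (0.693359375 - 2.12194440e-4), so the remainder
 * fed to the polynomial keeps the extra bits of precision.
 */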
static __m128
gmx_mm_exp_ps(__m128 x)
{
    const __m128  argscale     = _mm_set1_ps(1.44269504088896341f);
    /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
    const __m128  arglimit     = _mm_set1_ps(126.0f);
    const __m128i expbase      = _mm_set1_epi32(127);

    const __m128  invargscale0 = _mm_set1_ps(0.693359375f);
    const __m128  invargscale1 = _mm_set1_ps(-2.12194440e-4f);

    const __m128  CC5          = _mm_set1_ps(1.9875691500e-4f);
    const __m128  CC4          = _mm_set1_ps(1.3981999507e-3f);
    const __m128  CC3          = _mm_set1_ps(8.3334519073e-3f);
    const __m128  CC2          = _mm_set1_ps(4.1665795894e-2f);
    const __m128  CC1          = _mm_set1_ps(1.6666665459e-1f);
    const __m128  CC0          = _mm_set1_ps(5.0000001201e-1f);
    const __m128  one          = _mm_set1_ps(1.0f);

    __m128        y, x2;
    __m128        p0, p1;
    __m128        valuemask;
    __m128i       iexppart;
    __m128        fexppart;
    __m128        intpart;

    y         = _mm_mul_ps(x, argscale);

    iexppart  = _mm_cvtps_epi32(y);
    intpart   = _mm_cvtepi32_ps(iexppart);

    iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart, expbase), 23);
    valuemask = _mm_cmpge_ps(arglimit, gmx_mm_abs_ps(y));
    fexppart  = _mm_and_ps(valuemask, gmx_mm_castsi128_ps(iexppart));

    /* Extended precision arithmetic */
    x         = _mm_sub_ps(x, _mm_mul_ps(invargscale0, intpart));
    x         = _mm_sub_ps(x, _mm_mul_ps(invargscale1, intpart));

    x2        = _mm_mul_ps(x, x);

    p1        = _mm_mul_ps(CC5, x2);
    p0        = _mm_mul_ps(CC4, x2);
    p1        = _mm_add_ps(p1, CC3);
    p0        = _mm_add_ps(p0, CC2);
    p1        = _mm_mul_ps(p1, x2);
    p0        = _mm_mul_ps(p0, x2);
    p1        = _mm_add_ps(p1, CC1);
    p0        = _mm_add_ps(p0, CC0);
    p1        = _mm_mul_ps(p1, x);
    p0        = _mm_add_ps(p0, p1);
    p0        = _mm_mul_ps(p0, x2);
    x         = _mm_add_ps(x, one);
    x         = _mm_add_ps(x, p0);

    x         = _mm_mul_ps(x, fexppart);

    return x;
}
/* FULL precision. Only errors in LSB */
static __m128
gmx_mm_erf_ps(__m128 x)
{
    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
    const __m128 CA6      = _mm_set1_ps(7.853861353153693e-5f);
    const __m128 CA5      = _mm_set1_ps(-8.010193625184903e-4f);
    const __m128 CA4      = _mm_set1_ps(5.188327685732524e-3f);
    const __m128 CA3      = _mm_set1_ps(-2.685381193529856e-2f);
    const __m128 CA2      = _mm_set1_ps(1.128358514861418e-1f);
    const __m128 CA1      = _mm_set1_ps(-3.761262582423300e-1f);
    const __m128 CA0      = _mm_set1_ps(1.128379165726710f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
    const __m128 CB9      = _mm_set1_ps(-0.0018629930017603923f);
    const __m128 CB8      = _mm_set1_ps(0.003909821287598495f);
    const __m128 CB7      = _mm_set1_ps(-0.0052094582210355615f);
    const __m128 CB6      = _mm_set1_ps(0.005685614362160572f);
    const __m128 CB5      = _mm_set1_ps(-0.0025367682853477272f);
    const __m128 CB4      = _mm_set1_ps(-0.010199799682318782f);
    const __m128 CB3      = _mm_set1_ps(0.04369575504816542f);
    const __m128 CB2      = _mm_set1_ps(-0.11884063474674492f);
    const __m128 CB1      = _mm_set1_ps(0.2732120154030589f);
    const __m128 CB0      = _mm_set1_ps(0.42758357702025784f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
    const __m128 CC10     = _mm_set1_ps(-0.0445555913112064f);
    const __m128 CC9      = _mm_set1_ps(0.21376355144663348f);
    const __m128 CC8      = _mm_set1_ps(-0.3473187200259257f);
    const __m128 CC7      = _mm_set1_ps(0.016690861551248114f);
    const __m128 CC6      = _mm_set1_ps(0.7560973182491192f);
    const __m128 CC5      = _mm_set1_ps(-1.2137903600145787f);
    const __m128 CC4      = _mm_set1_ps(0.8411872321232948f);
    const __m128 CC3      = _mm_set1_ps(-0.08670413896296343f);
    const __m128 CC2      = _mm_set1_ps(-0.27124782687240334f);
    const __m128 CC1      = _mm_set1_ps(-0.0007502488047806069f);
    const __m128 CC0      = _mm_set1_ps(0.5642114853803148f);

    /* Coefficients for expansion of exp(x) in [0,0.1] */
    /* CD0 and CD1 are both 1.0, so no need to declare them separately */
    const __m128 CD2      = _mm_set1_ps(0.5000066608081202f);
    const __m128 CD3      = _mm_set1_ps(0.1664795422874624f);
    const __m128 CD4      = _mm_set1_ps(0.04379839977652482f);

    const __m128 sieve    = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
    const __m128 signbit  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
    const __m128 one      = _mm_set1_ps(1.0f);
    const __m128 two      = _mm_set1_ps(2.0f);

    __m128       x2, x4, y;
    __m128       z, q, t, t2, w, w2;
    __m128       pA0, pA1, pB0, pB1, pC0, pC1;
    __m128       expmx2, corr;
    __m128       res_erf, res_erfc, res;
    __m128       mask;

    /* Calculate erf() */
    x2   = _mm_mul_ps(x, x);
    x4   = _mm_mul_ps(x2, x2);

    pA0  = _mm_mul_ps(CA6, x4);
    pA1  = _mm_mul_ps(CA5, x4);
    pA0  = _mm_add_ps(pA0, CA4);
    pA1  = _mm_add_ps(pA1, CA3);
    pA0  = _mm_mul_ps(pA0, x4);
    pA1  = _mm_mul_ps(pA1, x4);
    pA0  = _mm_add_ps(pA0, CA2);
    pA1  = _mm_add_ps(pA1, CA1);
    pA0  = _mm_mul_ps(pA0, x4);
    pA1  = _mm_mul_ps(pA1, x2);
    pA0  = _mm_add_ps(pA0, pA1);
    pA0  = _mm_add_ps(pA0, CA0);

    res_erf = _mm_mul_ps(x, pA0);

    /* Calculate erfc */

    y    = gmx_mm_abs_ps(x);
    t    = gmx_mm_inv_ps(y);
    w    = _mm_sub_ps(t, one);
    t2   = _mm_mul_ps(t, t);
    w2   = _mm_mul_ps(w, w);
    /*
     * We cannot simply calculate exp(-x2) directly in single precision, since
     * that will lose a couple of bits of precision due to the multiplication.
     * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
     * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
     *
     * The only drawback with this is that it requires TWO separate exponential
     * evaluations, which would be horrible performance-wise. However, the argument
     * for the second exp() call is always small, so there we simply use a
     * low-order minimax expansion on [0,0.1].
     */

    z    = _mm_and_ps(y, sieve);
    q    = _mm_mul_ps( _mm_sub_ps(z, y), _mm_add_ps(z, y) );

    corr = _mm_mul_ps(CD4, q);
    corr = _mm_add_ps(corr, CD3);
    corr = _mm_mul_ps(corr, q);
    corr = _mm_add_ps(corr, CD2);
    corr = _mm_mul_ps(corr, q);
    corr = _mm_add_ps(corr, one);
    corr = _mm_mul_ps(corr, q);
    corr = _mm_add_ps(corr, one);

    expmx2 = gmx_mm_exp_ps( _mm_or_ps( signbit, _mm_mul_ps(z, z) ) );
    expmx2 = _mm_mul_ps(expmx2, corr);

    pB1  = _mm_mul_ps(CB9, w2);
    pB0  = _mm_mul_ps(CB8, w2);
    pB1  = _mm_add_ps(pB1, CB7);
    pB0  = _mm_add_ps(pB0, CB6);
    pB1  = _mm_mul_ps(pB1, w2);
    pB0  = _mm_mul_ps(pB0, w2);
    pB1  = _mm_add_ps(pB1, CB5);
    pB0  = _mm_add_ps(pB0, CB4);
    pB1  = _mm_mul_ps(pB1, w2);
    pB0  = _mm_mul_ps(pB0, w2);
    pB1  = _mm_add_ps(pB1, CB3);
    pB0  = _mm_add_ps(pB0, CB2);
    pB1  = _mm_mul_ps(pB1, w2);
    pB0  = _mm_mul_ps(pB0, w2);
    pB1  = _mm_add_ps(pB1, CB1);
    pB1  = _mm_mul_ps(pB1, w);
    pB0  = _mm_add_ps(pB0, pB1);
    pB0  = _mm_add_ps(pB0, CB0);

    pC0  = _mm_mul_ps(CC10, t2);
    pC1  = _mm_mul_ps(CC9, t2);
    pC0  = _mm_add_ps(pC0, CC8);
    pC1  = _mm_add_ps(pC1, CC7);
    pC0  = _mm_mul_ps(pC0, t2);
    pC1  = _mm_mul_ps(pC1, t2);
    pC0  = _mm_add_ps(pC0, CC6);
    pC1  = _mm_add_ps(pC1, CC5);
    pC0  = _mm_mul_ps(pC0, t2);
    pC1  = _mm_mul_ps(pC1, t2);
    pC0  = _mm_add_ps(pC0, CC4);
    pC1  = _mm_add_ps(pC1, CC3);
    pC0  = _mm_mul_ps(pC0, t2);
    pC1  = _mm_mul_ps(pC1, t2);
    pC0  = _mm_add_ps(pC0, CC2);
    pC1  = _mm_add_ps(pC1, CC1);
    pC0  = _mm_mul_ps(pC0, t2);
    pC1  = _mm_mul_ps(pC1, t);
    pC0  = _mm_add_ps(pC0, pC1);
    pC0  = _mm_add_ps(pC0, CC0);
    pC0  = _mm_mul_ps(pC0, t);

    /* SELECT pB0 or pC0 for erfc() */
    mask     = _mm_cmplt_ps(two, y);
    res_erfc = _mm_or_ps(_mm_andnot_ps(mask, pB0), _mm_and_ps(mask, pC0));
    res_erfc = _mm_mul_ps(res_erfc, expmx2);

    /* erfc(x<0) = 2-erfc(|x|) */
    mask     = _mm_cmplt_ps(x, _mm_setzero_ps());
    res_erfc = _mm_or_ps(_mm_andnot_ps(mask, res_erfc),
                         _mm_and_ps(mask, _mm_sub_ps(two, res_erfc)));

    /* Select erf() or erfc() */
    mask = _mm_cmplt_ps(y, _mm_set1_ps(0.75f));
    res  = _mm_or_ps(_mm_andnot_ps(mask, _mm_sub_ps(one, res_erfc)), _mm_and_ps(mask, res_erf));

    return res;
}
/* FULL precision. Only errors in LSB */
static __m128
gmx_mm_erfc_ps(__m128 x)
{
    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
    const __m128 CA6      = _mm_set1_ps(7.853861353153693e-5f);
    const __m128 CA5      = _mm_set1_ps(-8.010193625184903e-4f);
    const __m128 CA4      = _mm_set1_ps(5.188327685732524e-3f);
    const __m128 CA3      = _mm_set1_ps(-2.685381193529856e-2f);
    const __m128 CA2      = _mm_set1_ps(1.128358514861418e-1f);
    const __m128 CA1      = _mm_set1_ps(-3.761262582423300e-1f);
    const __m128 CA0      = _mm_set1_ps(1.128379165726710f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
    const __m128 CB9      = _mm_set1_ps(-0.0018629930017603923f);
    const __m128 CB8      = _mm_set1_ps(0.003909821287598495f);
    const __m128 CB7      = _mm_set1_ps(-0.0052094582210355615f);
    const __m128 CB6      = _mm_set1_ps(0.005685614362160572f);
    const __m128 CB5      = _mm_set1_ps(-0.0025367682853477272f);
    const __m128 CB4      = _mm_set1_ps(-0.010199799682318782f);
    const __m128 CB3      = _mm_set1_ps(0.04369575504816542f);
    const __m128 CB2      = _mm_set1_ps(-0.11884063474674492f);
    const __m128 CB1      = _mm_set1_ps(0.2732120154030589f);
    const __m128 CB0      = _mm_set1_ps(0.42758357702025784f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
    const __m128 CC10     = _mm_set1_ps(-0.0445555913112064f);
    const __m128 CC9      = _mm_set1_ps(0.21376355144663348f);
    const __m128 CC8      = _mm_set1_ps(-0.3473187200259257f);
    const __m128 CC7      = _mm_set1_ps(0.016690861551248114f);
    const __m128 CC6      = _mm_set1_ps(0.7560973182491192f);
    const __m128 CC5      = _mm_set1_ps(-1.2137903600145787f);
    const __m128 CC4      = _mm_set1_ps(0.8411872321232948f);
    const __m128 CC3      = _mm_set1_ps(-0.08670413896296343f);
    const __m128 CC2      = _mm_set1_ps(-0.27124782687240334f);
    const __m128 CC1      = _mm_set1_ps(-0.0007502488047806069f);
    const __m128 CC0      = _mm_set1_ps(0.5642114853803148f);

    /* Coefficients for expansion of exp(x) in [0,0.1] */
    /* CD0 and CD1 are both 1.0, so no need to declare them separately */
    const __m128 CD2      = _mm_set1_ps(0.5000066608081202f);
    const __m128 CD3      = _mm_set1_ps(0.1664795422874624f);
    const __m128 CD4      = _mm_set1_ps(0.04379839977652482f);

    const __m128 sieve    = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
    const __m128 signbit  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
    const __m128 one      = _mm_set1_ps(1.0f);
    const __m128 two      = _mm_set1_ps(2.0f);

    __m128       x2, x4, y;
    __m128       z, q, t, t2, w, w2;
    __m128       pA0, pA1, pB0, pB1, pC0, pC1;
    __m128       expmx2, corr;
    __m128       res_erf, res_erfc, res;
    __m128       mask;

    /* Calculate erf() */
    x2   = _mm_mul_ps(x, x);
    x4   = _mm_mul_ps(x2, x2);

    pA0  = _mm_mul_ps(CA6, x4);
    pA1  = _mm_mul_ps(CA5, x4);
    pA0  = _mm_add_ps(pA0, CA4);
    pA1  = _mm_add_ps(pA1, CA3);
    pA0  = _mm_mul_ps(pA0, x4);
    pA1  = _mm_mul_ps(pA1, x4);
    pA0  = _mm_add_ps(pA0, CA2);
    pA1  = _mm_add_ps(pA1, CA1);
    pA0  = _mm_mul_ps(pA0, x4);
    pA1  = _mm_mul_ps(pA1, x2);
    pA0  = _mm_add_ps(pA0, pA1);
    pA0  = _mm_add_ps(pA0, CA0);

    res_erf = _mm_mul_ps(x, pA0);

    /* Calculate erfc */
    y    = gmx_mm_abs_ps(x);
    t    = gmx_mm_inv_ps(y);
    w    = _mm_sub_ps(t, one);
    t2   = _mm_mul_ps(t, t);
    w2   = _mm_mul_ps(w, w);
    /*
     * We cannot simply calculate exp(-x2) directly in single precision, since
     * that will lose a couple of bits of precision due to the multiplication.
     * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
     * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
     *
     * The only drawback with this is that it requires TWO separate exponential
     * evaluations, which would be horrible performance-wise. However, the argument
     * for the second exp() call is always small, so there we simply use a
     * low-order minimax expansion on [0,0.1].
     */

    z    = _mm_and_ps(y, sieve);
    q    = _mm_mul_ps( _mm_sub_ps(z, y), _mm_add_ps(z, y) );

    corr = _mm_mul_ps(CD4, q);
    corr = _mm_add_ps(corr, CD3);
    corr = _mm_mul_ps(corr, q);
    corr = _mm_add_ps(corr, CD2);
    corr = _mm_mul_ps(corr, q);
    corr = _mm_add_ps(corr, one);
    corr = _mm_mul_ps(corr, q);
    corr = _mm_add_ps(corr, one);

    expmx2 = gmx_mm_exp_ps( _mm_or_ps( signbit, _mm_mul_ps(z, z) ) );
    expmx2 = _mm_mul_ps(expmx2, corr);

    pB1  = _mm_mul_ps(CB9, w2);
    pB0  = _mm_mul_ps(CB8, w2);
    pB1  = _mm_add_ps(pB1, CB7);
    pB0  = _mm_add_ps(pB0, CB6);
    pB1  = _mm_mul_ps(pB1, w2);
    pB0  = _mm_mul_ps(pB0, w2);
    pB1  = _mm_add_ps(pB1, CB5);
    pB0  = _mm_add_ps(pB0, CB4);
    pB1  = _mm_mul_ps(pB1, w2);
    pB0  = _mm_mul_ps(pB0, w2);
    pB1  = _mm_add_ps(pB1, CB3);
    pB0  = _mm_add_ps(pB0, CB2);
    pB1  = _mm_mul_ps(pB1, w2);
    pB0  = _mm_mul_ps(pB0, w2);
    pB1  = _mm_add_ps(pB1, CB1);
    pB1  = _mm_mul_ps(pB1, w);
    pB0  = _mm_add_ps(pB0, pB1);
    pB0  = _mm_add_ps(pB0, CB0);

    pC0  = _mm_mul_ps(CC10, t2);
    pC1  = _mm_mul_ps(CC9, t2);
    pC0  = _mm_add_ps(pC0, CC8);
    pC1  = _mm_add_ps(pC1, CC7);
    pC0  = _mm_mul_ps(pC0, t2);
    pC1  = _mm_mul_ps(pC1, t2);
    pC0  = _mm_add_ps(pC0, CC6);
    pC1  = _mm_add_ps(pC1, CC5);
    pC0  = _mm_mul_ps(pC0, t2);
    pC1  = _mm_mul_ps(pC1, t2);
    pC0  = _mm_add_ps(pC0, CC4);
    pC1  = _mm_add_ps(pC1, CC3);
    pC0  = _mm_mul_ps(pC0, t2);
    pC1  = _mm_mul_ps(pC1, t2);
    pC0  = _mm_add_ps(pC0, CC2);
    pC1  = _mm_add_ps(pC1, CC1);
    pC0  = _mm_mul_ps(pC0, t2);
    pC1  = _mm_mul_ps(pC1, t);
    pC0  = _mm_add_ps(pC0, pC1);
    pC0  = _mm_add_ps(pC0, CC0);
    pC0  = _mm_mul_ps(pC0, t);

    /* SELECT pB0 or pC0 for erfc() */
    mask     = _mm_cmplt_ps(two, y);
    res_erfc = _mm_or_ps(_mm_andnot_ps(mask, pB0), _mm_and_ps(mask, pC0));
    res_erfc = _mm_mul_ps(res_erfc, expmx2);

    /* erfc(x<0) = 2-erfc(|x|) */
    mask     = _mm_cmplt_ps(x, _mm_setzero_ps());
    res_erfc = _mm_or_ps(_mm_andnot_ps(mask, res_erfc), _mm_and_ps(mask, _mm_sub_ps(two, res_erfc)));

    /* Select erf() or erfc() */
    mask = _mm_cmplt_ps(y, _mm_set1_ps(0.75f));
    res  = _mm_or_ps(_mm_andnot_ps(mask, res_erfc), _mm_and_ps(mask, _mm_sub_ps(one, res_erf)));

    return res;
}
/* Calculate the force correction due to PME analytically.
 *
 * This routine is meant to enable analytical evaluation of the
 * direct-space PME electrostatic force to avoid tables.
 *
 * The direct-space potential should be Erfc(beta*r)/r, but there
 * are some problems evaluating that:
 *
 * First, the error function is difficult (read: expensive) to
 * approximate accurately for intermediate to large arguments, and
 * this happens already in ranges of beta*r that occur in simulations.
 * Second, we now try to avoid calculating potentials in Gromacs but
 * use forces directly.
 *
 * We can simplify things slightly by noting that the PME part is really
 * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
 *
 *     V = 1/r - Erf(beta*r)/r
 *
 * The first term we already have from the inverse square root, so
 * we can leave it out of this routine.
 *
 * For PME tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
 * the argument beta*r will be in the range 0.15 to ~4. Use your
 * favorite plotting program to realize how well-behaved Erf(z)/z is
 * in this range!
 *
 * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
 * However, it turns out it is more efficient to approximate f(z)/z and
 * then only use even powers. This is another minor optimization, since
 * we actually WANT f(z)/z, because it is going to be multiplied by
 * the vector between the two atoms to get the vectorial force. The
 * fastest flops are the ones we can avoid calculating!
 *
 * So, here's how it should be used:
 *
 * 1. Calculate r^2.
 * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
 * 3. Evaluate this routine with z^2 as the argument.
 * 4. The return value is the expression:
 *
 *        2*exp(-z^2)     erf(z)
 *       ------------ -  --------
 *       sqrt(Pi)*z^2       z^3
 *
 * 5. Multiply the entire expression by beta^3. This will get you
 *
 *       beta^3*2*exp(-z^2)     beta^3*erf(z)
 *       ------------------ -  ---------------
 *          sqrt(Pi)*z^2             z^3
 *
 *    or, switching back to r (z=r*beta):
 *
 *       2*beta*exp(-r^2*beta^2)     erf(r*beta)
 *       ----------------------- -  -----------
 *            sqrt(Pi)*r^2               r^3
 *
 *    With a bit of math exercise you should be able to confirm that
 *    this is exactly D[Erf[beta*r]/r,r] divided by r another time
 *    (the derivative is written out in the note just below this comment).
 *
 * 6. Add the result to 1/r^3, multiply by the product of the charges,
 *    and you have your force (divided by r). A final multiplication
 *    with the vector connecting the two particles and you have your
 *    vectorial force to add to the particles.
 *    A short usage sketch in intrinsics follows the function definition below.
 */
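/* For reference, the math exercise in step 5 works out as
 *
 *     d/dr [ Erf(beta*r)/r ] = 2*beta*exp(-beta^2*r^2)/(sqrt(Pi)*r) - Erf(beta*r)/r^2
 *
 * and dividing once more by r gives exactly the beta^3-scaled expression above.
 */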
static gmx_inline __m128
gmx_mm_pmecorrF_ps(__m128 z2)
{
    const __m128  FN6      = _mm_set1_ps(-1.7357322914161492954e-8f);
    const __m128  FN5      = _mm_set1_ps(1.4703624142580877519e-6f);
    const __m128  FN4      = _mm_set1_ps(-0.000053401640219807709149f);
    const __m128  FN3      = _mm_set1_ps(0.0010054721316683106153f);
    const __m128  FN2      = _mm_set1_ps(-0.019278317264888380590f);
    const __m128  FN1      = _mm_set1_ps(0.069670166153766424023f);
    const __m128  FN0      = _mm_set1_ps(-0.75225204789749321333f);

    const __m128  FD4      = _mm_set1_ps(0.0011193462567257629232f);
    const __m128  FD3      = _mm_set1_ps(0.014866955030185295499f);
    const __m128  FD2      = _mm_set1_ps(0.11583842382862377919f);
    const __m128  FD1      = _mm_set1_ps(0.50736591960530292870f);
    const __m128  FD0      = _mm_set1_ps(1.0f);

    __m128        z4;
    __m128        polyFN0, polyFN1, polyFD0, polyFD1;

    z4       = _mm_mul_ps(z2, z2);

    polyFD0  = _mm_mul_ps(FD4, z4);
    polyFD1  = _mm_mul_ps(FD3, z4);
    polyFD0  = _mm_add_ps(polyFD0, FD2);
    polyFD1  = _mm_add_ps(polyFD1, FD1);
    polyFD0  = _mm_mul_ps(polyFD0, z4);
    polyFD1  = _mm_mul_ps(polyFD1, z2);
    polyFD0  = _mm_add_ps(polyFD0, FD0);
    polyFD0  = _mm_add_ps(polyFD0, polyFD1);

    polyFD0  = gmx_mm_inv_ps(polyFD0);

    polyFN0  = _mm_mul_ps(FN6, z4);
    polyFN1  = _mm_mul_ps(FN5, z4);
    polyFN0  = _mm_add_ps(polyFN0, FN4);
    polyFN1  = _mm_add_ps(polyFN1, FN3);
    polyFN0  = _mm_mul_ps(polyFN0, z4);
    polyFN1  = _mm_mul_ps(polyFN1, z4);
    polyFN0  = _mm_add_ps(polyFN0, FN2);
    polyFN1  = _mm_add_ps(polyFN1, FN1);
    polyFN0  = _mm_mul_ps(polyFN0, z4);
    polyFN1  = _mm_mul_ps(polyFN1, z2);
    polyFN0  = _mm_add_ps(polyFN0, FN0);
    polyFN0  = _mm_add_ps(polyFN0, polyFN1);

    return _mm_mul_ps(polyFN0, polyFD0);
}
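/* Illustrative usage sketch (hypothetical helper, not part of the original
 * header): the six steps from the comment above written out in intrinsics.
 * The function name, the beta argument, and leaving out the charge product
 * are assumptions made for the example only.
 */
static gmx_inline __m128
gmx_mm_pmecorrF_usage_example(__m128 rsq, float beta)
{
    __m128 beta_ps = _mm_set1_ps(beta);
    __m128 beta2   = _mm_mul_ps(beta_ps, beta_ps);
    __m128 beta3   = _mm_mul_ps(beta2, beta_ps);
    __m128 z2      = _mm_mul_ps(beta2, rsq);       /* z^2 = beta^2*r^2                        */
    __m128 corr    = gmx_mm_pmecorrF_ps(z2);       /* 2*exp(-z^2)/(sqrt(Pi)*z^2) - erf(z)/z^3 */
    __m128 rinv    = gmx_mm_invsqrt_ps(rsq);       /* 1/r                                     */
    __m128 rinv3   = _mm_mul_ps(rinv, _mm_mul_ps(rinv, rinv));

    /* F/r, still to be scaled by the charge product and the connecting vector */
    return _mm_add_ps(rinv3, _mm_mul_ps(beta3, corr));
}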
/* Calculate the potential correction due to PME analytically.
 *
 * See gmx_mm_pmecorrF_ps() above for details about the approximation.
 *
 * This routine calculates Erf(z)/z, although you should provide z^2
 * as the input argument.
 *
 * Here's how it should be used:
 *
 * 1. Calculate r^2.
 * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
 * 3. Evaluate this routine with z^2 as the argument.
 * 4. The return value is the expression:
 *
 *        erf(z)
 *       --------
 *          z
 *
 * 5. Multiply the entire expression by beta and switch back to r (z=r*beta):
 *
 *       erf(r*beta)
 *       -----------
 *            r
 *
 * 6. Subtract the result from 1/r, multiply by the product of the charges,
 *    and you have your potential.
 *    A short usage sketch in intrinsics follows the function definition below.
 */
static gmx_inline __m128
gmx_mm_pmecorrV_ps(__m128 z2)
{
    const __m128  VN6      = _mm_set1_ps(1.9296833005951166339e-8f);
    const __m128  VN5      = _mm_set1_ps(-1.4213390571557850962e-6f);
    const __m128  VN4      = _mm_set1_ps(0.000041603292906656984871f);
    const __m128  VN3      = _mm_set1_ps(-0.00013134036773265025626f);
    const __m128  VN2      = _mm_set1_ps(0.038657983986041781264f);
    const __m128  VN1      = _mm_set1_ps(0.11285044772717598220f);
    const __m128  VN0      = _mm_set1_ps(1.1283802385263030286f);

    const __m128  VD3      = _mm_set1_ps(0.0066752224023576045451f);
    const __m128  VD2      = _mm_set1_ps(0.078647795836373922256f);
    const __m128  VD1      = _mm_set1_ps(0.43336185284710920150f);
    const __m128  VD0      = _mm_set1_ps(1.0f);

    __m128        z4;
    __m128        polyVN0, polyVN1, polyVD0, polyVD1;

    z4       = _mm_mul_ps(z2, z2);

    polyVD1  = _mm_mul_ps(VD3, z4);
    polyVD0  = _mm_mul_ps(VD2, z4);
    polyVD1  = _mm_add_ps(polyVD1, VD1);
    polyVD0  = _mm_add_ps(polyVD0, VD0);
    polyVD1  = _mm_mul_ps(polyVD1, z2);
    polyVD0  = _mm_add_ps(polyVD0, polyVD1);

    polyVD0  = gmx_mm_inv_ps(polyVD0);

    polyVN0  = _mm_mul_ps(VN6, z4);
    polyVN1  = _mm_mul_ps(VN5, z4);
    polyVN0  = _mm_add_ps(polyVN0, VN4);
    polyVN1  = _mm_add_ps(polyVN1, VN3);
    polyVN0  = _mm_mul_ps(polyVN0, z4);
    polyVN1  = _mm_mul_ps(polyVN1, z4);
    polyVN0  = _mm_add_ps(polyVN0, VN2);
    polyVN1  = _mm_add_ps(polyVN1, VN1);
    polyVN0  = _mm_mul_ps(polyVN0, z4);
    polyVN1  = _mm_mul_ps(polyVN1, z2);
    polyVN0  = _mm_add_ps(polyVN0, VN0);
    polyVN0  = _mm_add_ps(polyVN0, polyVN1);

    return _mm_mul_ps(polyVN0, polyVD0);
}
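/* Illustrative usage sketch (hypothetical helper, not part of the original
 * header): the steps from the comment above in intrinsics form; the function
 * name and leaving out the charge product are assumptions for the example.
 */
static gmx_inline __m128
gmx_mm_pmecorrV_usage_example(__m128 rsq, float beta)
{
    __m128 beta_ps = _mm_set1_ps(beta);
    __m128 z2      = _mm_mul_ps(_mm_mul_ps(beta_ps, beta_ps), rsq); /* z^2 = beta^2*r^2 */
    __m128 corr    = gmx_mm_pmecorrV_ps(z2);                        /* erf(z)/z         */
    __m128 rinv    = gmx_mm_invsqrt_ps(rsq);                        /* 1/r              */

    /* V, still to be scaled by the charge product: 1/r - erf(beta*r)/r */
    return _mm_sub_ps(rinv, _mm_mul_ps(beta_ps, corr));
}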
static int
gmx_mm_sincos_ps(__m128  x,
                 __m128 *sinval,
                 __m128 *cosval)
{
    const __m128  two_over_pi = _mm_set1_ps(2.0/M_PI);
    const __m128  half        = _mm_set1_ps(0.5);
    const __m128  one         = _mm_set1_ps(1.0);

    const __m128i izero       = _mm_set1_epi32(0);
    const __m128i ione        = _mm_set1_epi32(1);
    const __m128i itwo        = _mm_set1_epi32(2);
    const __m128i ithree      = _mm_set1_epi32(3);
    const __m128  signbit     = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );

    const __m128  CA1         = _mm_set1_ps(1.5703125f);
    const __m128  CA2         = _mm_set1_ps(4.837512969970703125e-4f);
    const __m128  CA3         = _mm_set1_ps(7.54978995489188216e-8f);

    const __m128  CC0         = _mm_set1_ps(-0.0013602249f);
    const __m128  CC1         = _mm_set1_ps(0.0416566950f);
    const __m128  CC2         = _mm_set1_ps(-0.4999990225f);
    const __m128  CS0         = _mm_set1_ps(-0.0001950727f);
    const __m128  CS1         = _mm_set1_ps(0.0083320758f);
    const __m128  CS2         = _mm_set1_ps(-0.1666665247f);

    __m128        y, y2;
    __m128        z;
    __m128i       iz;
    __m128i       offset_sin, offset_cos;
    __m128        tmp1, tmp2;
    __m128        mask_sin, mask_cos;
    __m128        tmp_sin, tmp_cos;

    y          = _mm_mul_ps(x, two_over_pi);
    y          = _mm_add_ps(y, _mm_or_ps(_mm_and_ps(y, signbit), half));

    iz         = _mm_cvttps_epi32(y);
    z          = _mm_cvtepi32_ps(iz);

    offset_sin = _mm_and_si128(iz, ithree);
    offset_cos = _mm_add_epi32(iz, ione);

    /* Extended precision arithmetic to achieve full precision */
    y          = _mm_mul_ps(z, CA1);
    tmp1       = _mm_mul_ps(z, CA2);
    tmp2       = _mm_mul_ps(z, CA3);
    y          = _mm_sub_ps(x, y);
    y          = _mm_sub_ps(y, tmp1);
    y          = _mm_sub_ps(y, tmp2);

    y2         = _mm_mul_ps(y, y);

    tmp1       = _mm_mul_ps(CC0, y2);
    tmp1       = _mm_add_ps(tmp1, CC1);
    tmp2       = _mm_mul_ps(CS0, y2);
    tmp2       = _mm_add_ps(tmp2, CS1);
    tmp1       = _mm_mul_ps(tmp1, y2);
    tmp1       = _mm_add_ps(tmp1, CC2);
    tmp2       = _mm_mul_ps(tmp2, y2);
    tmp2       = _mm_add_ps(tmp2, CS2);

    tmp1       = _mm_mul_ps(tmp1, y2);
    tmp1       = _mm_add_ps(tmp1, one);

    tmp2       = _mm_mul_ps(tmp2, _mm_mul_ps(y, y2));
    tmp2       = _mm_add_ps(tmp2, y);

    mask_sin   = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin, ione), izero));
    mask_cos   = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos, ione), izero));

    tmp_sin    = _mm_or_ps( _mm_andnot_ps(mask_sin, tmp1), _mm_and_ps(mask_sin, tmp2) );
    tmp_cos    = _mm_or_ps( _mm_andnot_ps(mask_cos, tmp1), _mm_and_ps(mask_cos, tmp2) );

    mask_sin   = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin, itwo), izero));
    mask_cos   = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos, itwo), izero));

    tmp1       = _mm_xor_ps(signbit, tmp_sin);
    tmp2       = _mm_xor_ps(signbit, tmp_cos);

    *sinval    = _mm_or_ps( _mm_andnot_ps(mask_sin, tmp1), _mm_and_ps(mask_sin, tmp_sin) );
    *cosval    = _mm_or_ps( _mm_andnot_ps(mask_cos, tmp2), _mm_and_ps(mask_cos, tmp_cos) );

    return 0;
}
/*
 * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
 * will then call the sincos() routine and waste a factor 2 in performance!
 */
static __m128
gmx_mm_sin_ps(__m128 x)
{
    __m128 s, c;
    gmx_mm_sincos_ps(x, &s, &c);
    return s;
}
/*
 * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
 * will then call the sincos() routine and waste a factor 2 in performance!
 */
static __m128
gmx_mm_cos_ps(__m128 x)
{
    __m128 s, c;
    gmx_mm_sincos_ps(x, &s, &c);
    return c;
}
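/* Illustrative usage sketch (hypothetical helper, not part of the original
 * header): when both sine and cosine of the same arguments are needed, call
 * gmx_mm_sincos_ps() once instead of gmx_mm_sin_ps() plus gmx_mm_cos_ps(), so
 * the range reduction is done only once. The array handling and the function
 * name are assumptions made for the example.
 */
static void
gmx_mm_sincos_usage_example(const float ang[4], float sinout[4], float cosout[4])
{
    __m128 s, c;

    gmx_mm_sincos_ps(_mm_loadu_ps(ang), &s, &c);
    _mm_storeu_ps(sinout, s);
    _mm_storeu_ps(cosout, c);
}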
static __m128
gmx_mm_tan_ps(__m128 x)
{
    __m128 sinval, cosval;
    __m128 tanval;

    gmx_mm_sincos_ps(x, &sinval, &cosval);

    tanval = _mm_mul_ps(sinval, gmx_mm_inv_ps(cosval));

    return tanval;
}
static __m128
gmx_mm_asin_ps(__m128 x)
{
    /* Same algorithm as cephes library */
    const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
    const __m128 limitlow  = _mm_set1_ps(1e-4f);
    const __m128 half      = _mm_set1_ps(0.5f);
    const __m128 one       = _mm_set1_ps(1.0f);
    const __m128 halfpi    = _mm_set1_ps(M_PI/2.0f);

    const __m128 CC5       = _mm_set1_ps(4.2163199048E-2f);
    const __m128 CC4       = _mm_set1_ps(2.4181311049E-2f);
    const __m128 CC3       = _mm_set1_ps(4.5470025998E-2f);
    const __m128 CC2       = _mm_set1_ps(7.4953002686E-2f);
    const __m128 CC1       = _mm_set1_ps(1.6666752422E-1f);

    __m128       sign;
    __m128       mask;
    __m128       xabs;
    __m128       z, z1, z2, q, q1, q2;
    __m128       pA, pB;

    sign  = _mm_andnot_ps(signmask, x);
    xabs  = _mm_and_ps(x, signmask);

    mask  = _mm_cmpgt_ps(xabs, half);

    z1    = _mm_mul_ps(half, _mm_sub_ps(one, xabs));
    q1    = _mm_mul_ps(z1, gmx_mm_invsqrt_ps(z1));
    q1    = _mm_andnot_ps(_mm_cmpeq_ps(xabs, one), q1);

    q2    = xabs;
    z2    = _mm_mul_ps(q2, q2);

    z     = _mm_or_ps( _mm_and_ps(mask, z1), _mm_andnot_ps(mask, z2) );
    q     = _mm_or_ps( _mm_and_ps(mask, q1), _mm_andnot_ps(mask, q2) );

    z2    = _mm_mul_ps(z, z);

    pA    = _mm_mul_ps(CC5, z2);
    pB    = _mm_mul_ps(CC4, z2);

    pA    = _mm_add_ps(pA, CC3);
    pB    = _mm_add_ps(pB, CC2);

    pA    = _mm_mul_ps(pA, z2);
    pB    = _mm_mul_ps(pB, z2);

    pA    = _mm_add_ps(pA, CC1);
    pA    = _mm_mul_ps(pA, z);

    z     = _mm_add_ps(pA, pB);
    z     = _mm_mul_ps(z, q);
    z     = _mm_add_ps(z, q);

    q2    = _mm_sub_ps(halfpi, z);
    q2    = _mm_sub_ps(q2, z);

    z     = _mm_or_ps( _mm_and_ps(mask, q2), _mm_andnot_ps(mask, z) );

    mask  = _mm_cmpgt_ps(xabs, limitlow);
    z     = _mm_or_ps( _mm_and_ps(mask, z), _mm_andnot_ps(mask, xabs) );

    z     = _mm_xor_ps(z, sign);

    return z;
}
static __m128
gmx_mm_acos_ps(__m128 x)
{
    const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
    const __m128 one_ps    = _mm_set1_ps(1.0f);
    const __m128 half_ps   = _mm_set1_ps(0.5f);
    const __m128 pi_ps     = _mm_set1_ps(M_PI);
    const __m128 halfpi_ps = _mm_set1_ps(M_PI/2.0f);

    __m128       mask1;
    __m128       mask2;
    __m128       xabs;
    __m128       z, z1, z2, z3;

    xabs  = _mm_and_ps(x, signmask);
    mask1 = _mm_cmpgt_ps(xabs, half_ps);
    mask2 = _mm_cmpgt_ps(x, _mm_setzero_ps());

    z     = _mm_mul_ps(half_ps, _mm_sub_ps(one_ps, xabs));
    z     = _mm_mul_ps(z, gmx_mm_invsqrt_ps(z));
    z     = _mm_andnot_ps(_mm_cmpeq_ps(xabs, one_ps), z);

    z     = _mm_or_ps( _mm_and_ps(mask1, z), _mm_andnot_ps(mask1, x) );
    z     = gmx_mm_asin_ps(z);

    z2    = _mm_add_ps(z, z);
    z1    = _mm_sub_ps(pi_ps, z2);
    z3    = _mm_sub_ps(halfpi_ps, z);

    z     = _mm_or_ps( _mm_and_ps(mask2, z2), _mm_andnot_ps(mask2, z1) );
    z     = _mm_or_ps( _mm_and_ps(mask1, z), _mm_andnot_ps(mask1, z3) );

    return z;
}
static __m128
gmx_mm_atan_ps(__m128 x)
{
    /* Same algorithm as cephes library */
    const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
    const __m128 limit1    = _mm_set1_ps(0.414213562373095f);
    const __m128 limit2    = _mm_set1_ps(2.414213562373095f);
    const __m128 quarterpi = _mm_set1_ps(0.785398163397448f);
    const __m128 halfpi    = _mm_set1_ps(1.570796326794896f);
    const __m128 mone      = _mm_set1_ps(-1.0f);
    const __m128 CC3       = _mm_set1_ps(-3.33329491539E-1f);
    const __m128 CC5       = _mm_set1_ps(1.99777106478E-1f);
    const __m128 CC7       = _mm_set1_ps(-1.38776856032E-1);
    const __m128 CC9       = _mm_set1_ps(8.05374449538e-2f);

    __m128       sign;
    __m128       mask1, mask2;
    __m128       y, z1, z2;
    __m128       x2, x4;
    __m128       sum1, sum2;

    sign  = _mm_andnot_ps(signmask, x);
    x     = _mm_and_ps(x, signmask);

    mask1 = _mm_cmpgt_ps(x, limit1);
    mask2 = _mm_cmpgt_ps(x, limit2);

    z1    = _mm_mul_ps(_mm_add_ps(x, mone), gmx_mm_inv_ps(_mm_sub_ps(x, mone)));
    z2    = _mm_mul_ps(mone, gmx_mm_inv_ps(x));

    y     = _mm_and_ps(mask1, quarterpi);
    y     = _mm_or_ps( _mm_and_ps(mask2, halfpi), _mm_andnot_ps(mask2, y) );

    x     = _mm_or_ps( _mm_and_ps(mask1, z1), _mm_andnot_ps(mask1, x) );
    x     = _mm_or_ps( _mm_and_ps(mask2, z2), _mm_andnot_ps(mask2, x) );

    x2    = _mm_mul_ps(x, x);
    x4    = _mm_mul_ps(x2, x2);

    sum1  = _mm_mul_ps(CC9, x4);
    sum2  = _mm_mul_ps(CC7, x4);
    sum1  = _mm_add_ps(sum1, CC5);
    sum2  = _mm_add_ps(sum2, CC3);
    sum1  = _mm_mul_ps(sum1, x4);
    sum2  = _mm_mul_ps(sum2, x2);

    sum1  = _mm_add_ps(sum1, sum2);
    sum1  = _mm_sub_ps(sum1, mone);
    sum1  = _mm_mul_ps(sum1, x);
    y     = _mm_add_ps(y, sum1);

    y     = _mm_xor_ps(y, sign);

    return y;
}
static __m128
gmx_mm_atan2_ps(__m128 y, __m128 x)
{
    const __m128 pi          = _mm_set1_ps(M_PI);
    const __m128 minuspi     = _mm_set1_ps(-M_PI);
    const __m128 halfpi      = _mm_set1_ps(M_PI/2.0);
    const __m128 minushalfpi = _mm_set1_ps(-M_PI/2.0);

    __m128       z, z1, z3, z4;
    __m128       w;
    __m128       maskx_lt, maskx_eq;
    __m128       masky_lt, masky_eq;
    __m128       mask1, mask2, mask3, mask4, maskall;

    maskx_lt = _mm_cmplt_ps(x, _mm_setzero_ps());
    masky_lt = _mm_cmplt_ps(y, _mm_setzero_ps());
    maskx_eq = _mm_cmpeq_ps(x, _mm_setzero_ps());
    masky_eq = _mm_cmpeq_ps(y, _mm_setzero_ps());

    z        = _mm_mul_ps(y, gmx_mm_inv_ps(x));
    z        = gmx_mm_atan_ps(z);

    mask1    = _mm_and_ps(maskx_eq, masky_lt);
    mask2    = _mm_andnot_ps(maskx_lt, masky_eq);
    mask3    = _mm_andnot_ps( _mm_or_ps(masky_lt, masky_eq), maskx_eq);
    mask4    = _mm_and_ps(masky_eq, maskx_lt);

    maskall  = _mm_or_ps( _mm_or_ps(mask1, mask2), _mm_or_ps(mask3, mask4) );

    z        = _mm_andnot_ps(maskall, z);
    z1       = _mm_and_ps(mask1, minushalfpi);
    z3       = _mm_and_ps(mask3, halfpi);
    z4       = _mm_and_ps(mask4, pi);

    z        = _mm_or_ps( _mm_or_ps(z, z1), _mm_or_ps(z3, z4) );

    mask1    = _mm_andnot_ps(masky_lt, maskx_lt);
    mask2    = _mm_and_ps(maskx_lt, masky_lt);

    w        = _mm_or_ps( _mm_and_ps(mask1, pi), _mm_and_ps(mask2, minuspi) );
    w        = _mm_andnot_ps(maskall, w);

    z        = _mm_add_ps(z, w);

    return z;
}
#endif /* _gmx_math_x86_sse2_single_h_ */