/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013, by the GROMACS development team, led by
 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
 * others, as listed in the AUTHORS file in the top-level source
 * directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef _gmx_math_x86_avx_128_fma_single_h_
#define _gmx_math_x86_avx_128_fma_single_h_

#include <immintrin.h> /* AVX */
#ifdef HAVE_X86INTRIN_H
#include <x86intrin.h> /* FMA */
#endif
#ifdef HAVE_INTRIN_H
#include <intrin.h> /* FMA MSVC */
#endif

#include <math.h>

#include "gmx_x86_avx_128_fma.h"

#ifndef M_PI
#define M_PI 3.14159265358979323846264338327950288
#endif
/************************
 *                      *
 * Simple math routines *
 *                      *
 ************************/
/* 1.0/sqrt(x) */
static gmx_inline __m128
gmx_mm_invsqrt_ps(__m128 x)
{
    const __m128 half = _mm_set1_ps(0.5);
    const __m128 one  = _mm_set1_ps(1.0);

    __m128       lu   = _mm_rsqrt_ps(x);

    return _mm_macc_ps(_mm_nmacc_ps(x, _mm_mul_ps(lu, lu), one), _mm_mul_ps(lu, half), lu);
}
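
/*
 * Usage sketch (illustrative only, not part of the original header): refine
 * four packed reciprocal square roots in one call. The buffer names r2 and
 * rinv are hypothetical.
 */
static gmx_inline void
gmx_mm_example_invsqrt(const float r2[4], float rinv[4])
{
    __m128 x = _mm_loadu_ps(r2);               /* four squared distances      */

    _mm_storeu_ps(rinv, gmx_mm_invsqrt_ps(x)); /* rsqrt plus one Newton step  */
}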
/* sqrt(x) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
static gmx_inline __m128
gmx_mm_sqrt_ps(__m128 x)
{
    __m128 mask;
    __m128 res;

    mask = _mm_cmp_ps(x, _mm_setzero_ps(), _CMP_EQ_OQ);
    res  = _mm_andnot_ps(mask, gmx_mm_invsqrt_ps(x));

    res  = _mm_mul_ps(x, res);

    return res;
}
/* 1.0/x */
static gmx_inline __m128
gmx_mm_inv_ps(__m128 x)
{
    const __m128 two = _mm_set1_ps(2.0);

    __m128       lu  = _mm_rcp_ps(x);

    return _mm_mul_ps(lu, _mm_nmacc_ps(lu, x, two));
}
static gmx_inline __m128
gmx_mm_abs_ps(__m128 x)
{
    const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );

    return _mm_and_ps(x, signmask);
}
static __m128
gmx_mm_log_ps(__m128 x)
{
    /* Same algorithm as cephes library */
    const __m128  expmask    = gmx_mm_castsi128_ps( _mm_set_epi32(0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000) );
    const __m128i expbase_m1 = _mm_set1_epi32(127-1); /* We want non-IEEE format */
    const __m128  half       = _mm_set1_ps(0.5f);
    const __m128  one        = _mm_set1_ps(1.0f);
    const __m128  invsq2     = _mm_set1_ps(1.0f/sqrt(2.0f));
    const __m128  corr1      = _mm_set1_ps(-2.12194440e-4f);
    const __m128  corr2      = _mm_set1_ps(0.693359375f);

    const __m128  CA_1       = _mm_set1_ps(0.070376836292f);
    const __m128  CB_0       = _mm_set1_ps(1.6714950086782716f);
    const __m128  CB_1       = _mm_set1_ps(-2.452088066061482f);
    const __m128  CC_0       = _mm_set1_ps(1.5220770854701728f);
    const __m128  CC_1       = _mm_set1_ps(-1.3422238433233642f);
    const __m128  CD_0       = _mm_set1_ps(1.386218787509749f);
    const __m128  CD_1       = _mm_set1_ps(0.35075468953796346f);
    const __m128  CE_0       = _mm_set1_ps(1.3429983063133937f);
    const __m128  CE_1       = _mm_set1_ps(1.807420826584643f);

    __m128        fexp, fexp1;
    __m128i       iexp;
    __m128        mask;
    __m128        x1, x2;
    __m128        y;
    __m128        pA, pB, pC, pD, pE, tB, tC, tD, tE;

    /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
    fexp = _mm_and_ps(x, expmask);
    iexp = gmx_mm_castps_si128(fexp);
    iexp = _mm_srli_epi32(iexp, 23);
    iexp = _mm_sub_epi32(iexp, expbase_m1);

    x    = _mm_andnot_ps(expmask, x);
    x    = _mm_or_ps(x, one);
    x    = _mm_mul_ps(x, half);

    mask = _mm_cmp_ps(x, invsq2, _CMP_LT_OQ);

    x    = _mm_add_ps(x, _mm_and_ps(mask, x));
    x    = _mm_sub_ps(x, one);
    iexp = _mm_add_epi32(iexp, gmx_mm_castps_si128(mask)); /* 0xFFFFFFFF = -1 as int */

    x2   = _mm_mul_ps(x, x);

    pA   = _mm_mul_ps(CA_1, x);

    pB   = _mm_add_ps(x, CB_1);
    pC   = _mm_add_ps(x, CC_1);
    pD   = _mm_add_ps(x, CD_1);
    pE   = _mm_add_ps(x, CE_1);

    pB   = _mm_macc_ps(x, pB, CB_0);
    pC   = _mm_macc_ps(x, pC, CC_0);
    pD   = _mm_macc_ps(x, pD, CD_0);
    pE   = _mm_macc_ps(x, pE, CE_0);

    pA   = _mm_mul_ps(pA, pB);
    pC   = _mm_mul_ps(pC, pD);
    pE   = _mm_mul_ps(pE, x2);
    pA   = _mm_mul_ps(pA, pC);
    y    = _mm_mul_ps(pA, pE);

    fexp = _mm_cvtepi32_ps(iexp);
    y    = _mm_macc_ps(fexp, corr1, y);
    y    = _mm_nmacc_ps(half, x2, y);

    x2   = _mm_add_ps(x, y);
    x2   = _mm_macc_ps(fexp, corr2, x2);

    return x2;
}
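
/*
 * Usage sketch (illustrative only): packed natural logarithm. For the inputs
 * below the stored results should be close to ln(0.5)=-0.6931, ln(1)=0,
 * ln(2)=0.6931 and ln(10)=2.3026.
 */
static gmx_inline void
gmx_mm_example_log(float out[4])
{
    __m128 x = _mm_set_ps(10.0f, 2.0f, 1.0f, 0.5f); /* elements listed high to low */

    _mm_storeu_ps(out, gmx_mm_log_ps(x));
}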
/* 2^x function.
 *
 * The 2^w term is calculated from a (6,0)-th order (no denominator) minimax polynomial on the interval
 * [-0.5,0.5]. The coefficients were derived in Mathematica using the command:
 *
 * MiniMaxApproximation[(2^x), {x, {-0.5, 0.5}, 6, 0}, WorkingPrecision -> 15]
 *
 * The largest-magnitude exponent we can represent in IEEE single-precision binary format
 * is 2^-126 for small numbers and 2^127 for large ones. To avoid wrap-around problems, we set the
 * result to zero if the argument falls outside this range. For small numbers this is just fine, but
 * for large numbers you could be fancy and return the smallest/largest IEEE single-precision
 * number instead. That would take a few extra cycles and not really help, since something is
 * wrong if you are using single precision to work with numbers that cannot really be represented
 * in single precision.
 *
 * The accuracy is at least 23 bits.
 */
static __m128
gmx_mm_exp2_ps(__m128 x)
{
    /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
    const __m128  arglimit = _mm_set1_ps(126.0f);

    const __m128i expbase  = _mm_set1_epi32(127);
    const __m128  CA6      = _mm_set1_ps(1.535336188319500E-004);
    const __m128  CA5      = _mm_set1_ps(1.339887440266574E-003);
    const __m128  CA4      = _mm_set1_ps(9.618437357674640E-003);
    const __m128  CA3      = _mm_set1_ps(5.550332471162809E-002);
    const __m128  CA2      = _mm_set1_ps(2.402264791363012E-001);
    const __m128  CA1      = _mm_set1_ps(6.931472028550421E-001);
    const __m128  CA0      = _mm_set1_ps(1.0f);

    __m128        valuemask;
    __m128i       iexppart;
    __m128        fexppart;
    __m128        intpart;
    __m128        x2;
    __m128        p0, p1;

    iexppart  = _mm_cvtps_epi32(x);
    intpart   = _mm_round_ps(x, _MM_FROUND_TO_NEAREST_INT);
    iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart, expbase), 23);
    valuemask = _mm_cmp_ps(arglimit, gmx_mm_abs_ps(x), _CMP_GE_OQ);
    fexppart  = _mm_and_ps(valuemask, gmx_mm_castsi128_ps(iexppart));

    x         = _mm_sub_ps(x, intpart);
    x2        = _mm_mul_ps(x, x);

    p0        = _mm_macc_ps(CA6, x2, CA4);
    p1        = _mm_macc_ps(CA5, x2, CA3);
    p0        = _mm_macc_ps(p0, x2, CA2);
    p1        = _mm_macc_ps(p1, x2, CA1);
    p0        = _mm_macc_ps(p0, x2, CA0);
    p0        = _mm_macc_ps(p1, x, p0);
    x         = _mm_mul_ps(p0, fexppart);

    return x;
}
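
/*
 * Usage sketch (illustrative only): packed 2^x. For the inputs below the
 * stored results should be approximately 0.25, 1.0, 1.4142 and 8.0.
 */
static gmx_inline void
gmx_mm_example_exp2(float out[4])
{
    __m128 x = _mm_set_ps(3.0f, 0.5f, 0.0f, -2.0f); /* elements listed high to low */

    _mm_storeu_ps(out, gmx_mm_exp2_ps(x));
}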
/* Exponential function. This could be calculated from 2^x as Exp(x)=2^(y), where y=log2(e)*x,
 * but there will then be a small rounding error since we lose some precision due to the
 * multiplication. This will then be magnified a lot by the exponential.
 *
 * Instead, we calculate the fractional part directly as a minimax approximation of
 * Exp(z) on [-0.5,0.5]. We use extended-precision arithmetic to calculate the fraction
 * remaining after 2^y, which avoids the precision loss.
 * The final result is correct to within 1 LSB over the entire argument range.
 */
static __m128
gmx_mm_exp_ps(__m128 x)
{
    const __m128  argscale     = _mm_set1_ps(1.44269504088896341f);
    /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
    const __m128  arglimit     = _mm_set1_ps(126.0f);
    const __m128i expbase      = _mm_set1_epi32(127);

    const __m128  invargscale0 = _mm_set1_ps(0.693359375f);
    const __m128  invargscale1 = _mm_set1_ps(-2.12194440e-4f);

    const __m128  CC5          = _mm_set1_ps(1.9875691500e-4f);
    const __m128  CC4          = _mm_set1_ps(1.3981999507e-3f);
    const __m128  CC3          = _mm_set1_ps(8.3334519073e-3f);
    const __m128  CC2          = _mm_set1_ps(4.1665795894e-2f);
    const __m128  CC1          = _mm_set1_ps(1.6666665459e-1f);
    const __m128  CC0          = _mm_set1_ps(5.0000001201e-1f);
    const __m128  one          = _mm_set1_ps(1.0f);

    __m128        y, x2;
    __m128        p0, p1;
    __m128        valuemask;
    __m128i       iexppart;
    __m128        fexppart;
    __m128        intpart;

    y         = _mm_mul_ps(x, argscale);

    iexppart  = _mm_cvtps_epi32(y);
    intpart   = _mm_round_ps(y, _MM_FROUND_TO_NEAREST_INT);

    iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart, expbase), 23);
    valuemask = _mm_cmp_ps(arglimit, gmx_mm_abs_ps(y), _CMP_GE_OQ);
    fexppart  = _mm_and_ps(valuemask, gmx_mm_castsi128_ps(iexppart));

    /* Extended-precision arithmetic */
    x         = _mm_nmacc_ps(invargscale0, intpart, x);
    x         = _mm_nmacc_ps(invargscale1, intpart, x);

    x2        = _mm_mul_ps(x, x);

    p1        = _mm_macc_ps(CC5, x2, CC3);
    p0        = _mm_macc_ps(CC4, x2, CC2);
    p1        = _mm_macc_ps(p1, x2, CC1);
    p0        = _mm_macc_ps(p0, x2, CC0);
    p0        = _mm_macc_ps(p1, x, p0);
    p0        = _mm_macc_ps(p0, x2, one);

    x         = _mm_add_ps(x, p0);

    x         = _mm_mul_ps(x, fexppart);

    return x;
}
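
/*
 * Usage sketch (illustrative only): a Boltzmann-style factor exp(-dE/kT) for
 * four packed energy differences. The names dE, kT and out are hypothetical.
 */
static gmx_inline void
gmx_mm_example_exp(const float dE[4], float kT, float out[4])
{
    __m128 arg = _mm_mul_ps(_mm_loadu_ps(dE), _mm_set1_ps(-1.0f/kT));

    _mm_storeu_ps(out, gmx_mm_exp_ps(arg));
}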
/* FULL precision. Only errors in LSB */
static __m128
gmx_mm_erf_ps(__m128 x)
{
    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
    const __m128 CA6     = _mm_set1_ps(7.853861353153693e-5f);
    const __m128 CA5     = _mm_set1_ps(-8.010193625184903e-4f);
    const __m128 CA4     = _mm_set1_ps(5.188327685732524e-3f);
    const __m128 CA3     = _mm_set1_ps(-2.685381193529856e-2f);
    const __m128 CA2     = _mm_set1_ps(1.128358514861418e-1f);
    const __m128 CA1     = _mm_set1_ps(-3.761262582423300e-1f);
    const __m128 CA0     = _mm_set1_ps(1.128379165726710f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
    const __m128 CB9     = _mm_set1_ps(-0.0018629930017603923f);
    const __m128 CB8     = _mm_set1_ps(0.003909821287598495f);
    const __m128 CB7     = _mm_set1_ps(-0.0052094582210355615f);
    const __m128 CB6     = _mm_set1_ps(0.005685614362160572f);
    const __m128 CB5     = _mm_set1_ps(-0.0025367682853477272f);
    const __m128 CB4     = _mm_set1_ps(-0.010199799682318782f);
    const __m128 CB3     = _mm_set1_ps(0.04369575504816542f);
    const __m128 CB2     = _mm_set1_ps(-0.11884063474674492f);
    const __m128 CB1     = _mm_set1_ps(0.2732120154030589f);
    const __m128 CB0     = _mm_set1_ps(0.42758357702025784f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
    const __m128 CC10    = _mm_set1_ps(-0.0445555913112064f);
    const __m128 CC9     = _mm_set1_ps(0.21376355144663348f);
    const __m128 CC8     = _mm_set1_ps(-0.3473187200259257f);
    const __m128 CC7     = _mm_set1_ps(0.016690861551248114f);
    const __m128 CC6     = _mm_set1_ps(0.7560973182491192f);
    const __m128 CC5     = _mm_set1_ps(-1.2137903600145787f);
    const __m128 CC4     = _mm_set1_ps(0.8411872321232948f);
    const __m128 CC3     = _mm_set1_ps(-0.08670413896296343f);
    const __m128 CC2     = _mm_set1_ps(-0.27124782687240334f);
    const __m128 CC1     = _mm_set1_ps(-0.0007502488047806069f);
    const __m128 CC0     = _mm_set1_ps(0.5642114853803148f);

    /* Coefficients for expansion of exp(x) in [0,0.1] */
    /* CD0 and CD1 are both 1.0, so no need to declare them separately */
    const __m128 CD2     = _mm_set1_ps(0.5000066608081202f);
    const __m128 CD3     = _mm_set1_ps(0.1664795422874624f);
    const __m128 CD4     = _mm_set1_ps(0.04379839977652482f);

    const __m128 sieve   = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
    const __m128 signbit = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
    const __m128 one     = _mm_set1_ps(1.0f);
    const __m128 two     = _mm_set1_ps(2.0f);

    __m128       x2, x4, y;
    __m128       z, q, t, t2, w, w2;
    __m128       pA0, pA1, pB0, pB1, pC0, pC1;
    __m128       expmx2, corr;
    __m128       res_erf, res_erfc, res;
    __m128       mask;

    /* Calculate erf() */
    x2      = _mm_mul_ps(x, x);
    x4      = _mm_mul_ps(x2, x2);

    pA0     = _mm_macc_ps(CA6, x4, CA4);
    pA1     = _mm_macc_ps(CA5, x4, CA3);
    pA0     = _mm_macc_ps(pA0, x4, CA2);
    pA1     = _mm_macc_ps(pA1, x4, CA1);
    pA0     = _mm_mul_ps(pA0, x4);
    pA0     = _mm_macc_ps(pA1, x2, pA0);
    /* Constant term must come last for precision reasons */
    pA0     = _mm_add_ps(pA0, CA0);

    res_erf = _mm_mul_ps(x, pA0);

    /* Calculate erfc */
    y       = gmx_mm_abs_ps(x);
    t       = gmx_mm_inv_ps(y);
    w       = _mm_sub_ps(t, one);
    t2      = _mm_mul_ps(t, t);
    w2      = _mm_mul_ps(w, w);
    /*
     * We cannot simply calculate exp(-x2) directly in single precision, since
     * that will lose a couple of bits of precision due to the multiplication.
     * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
     * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
     *
     * The only drawback with this is that it requires TWO separate exponential
     * evaluations, which would be horrible performance-wise. However, the argument
     * for the second exp() call is always small, so there we simply use a
     * low-order minimax expansion on [0,0.1].
     */
    z       = _mm_and_ps(y, sieve);
    q       = _mm_mul_ps( _mm_sub_ps(z, y), _mm_add_ps(z, y) );

    corr    = _mm_macc_ps(CD4, q, CD3);
    corr    = _mm_macc_ps(corr, q, CD2);
    corr    = _mm_macc_ps(corr, q, one);
    corr    = _mm_macc_ps(corr, q, one);

    expmx2  = gmx_mm_exp_ps( _mm_or_ps( signbit, _mm_mul_ps(z, z) ) );
    expmx2  = _mm_mul_ps(expmx2, corr);

    pB1     = _mm_macc_ps(CB9, w2, CB7);
    pB0     = _mm_macc_ps(CB8, w2, CB6);
    pB1     = _mm_macc_ps(pB1, w2, CB5);
    pB0     = _mm_macc_ps(pB0, w2, CB4);
    pB1     = _mm_macc_ps(pB1, w2, CB3);
    pB0     = _mm_macc_ps(pB0, w2, CB2);
    pB1     = _mm_macc_ps(pB1, w2, CB1);
    pB0     = _mm_macc_ps(pB0, w2, CB0);
    pB0     = _mm_macc_ps(pB1, w, pB0);

    pC0     = _mm_macc_ps(CC10, t2, CC8);
    pC1     = _mm_macc_ps(CC9, t2, CC7);
    pC0     = _mm_macc_ps(pC0, t2, CC6);
    pC1     = _mm_macc_ps(pC1, t2, CC5);
    pC0     = _mm_macc_ps(pC0, t2, CC4);
    pC1     = _mm_macc_ps(pC1, t2, CC3);
    pC0     = _mm_macc_ps(pC0, t2, CC2);
    pC1     = _mm_macc_ps(pC1, t2, CC1);

    pC0     = _mm_macc_ps(pC0, t2, CC0);
    pC0     = _mm_macc_ps(pC1, t, pC0);
    pC0     = _mm_mul_ps(pC0, t);

    /* SELECT pB0 or pC0 for erfc() */
    mask     = _mm_cmp_ps(two, y, _CMP_LT_OQ);
    res_erfc = _mm_blendv_ps(pB0, pC0, mask);
    res_erfc = _mm_mul_ps(res_erfc, expmx2);

    /* erfc(x<0) = 2-erfc(|x|) */
    mask     = _mm_cmp_ps(x, _mm_setzero_ps(), _CMP_LT_OQ);
    res_erfc = _mm_blendv_ps(res_erfc, _mm_sub_ps(two, res_erfc), mask);

    /* Select erf() or erfc() */
    mask = _mm_cmp_ps(y, _mm_set1_ps(0.75f), _CMP_LT_OQ);
    res  = _mm_blendv_ps(_mm_sub_ps(one, res_erfc), res_erf, mask);

    return res;
}
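
/*
 * Usage sketch (illustrative only): packed erf(). For the inputs below the
 * stored results should be close to erf(0)=0, erf(0.5)=0.5205, erf(1)=0.8427
 * and erf(3)=0.99998.
 */
static gmx_inline void
gmx_mm_example_erf(float out[4])
{
    __m128 x = _mm_set_ps(3.0f, 1.0f, 0.5f, 0.0f); /* elements listed high to low */

    _mm_storeu_ps(out, gmx_mm_erf_ps(x));
}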
/* FULL precision. Only errors in LSB */
static __m128
gmx_mm_erfc_ps(__m128 x)
{
    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
    const __m128 CA6     = _mm_set1_ps(7.853861353153693e-5f);
    const __m128 CA5     = _mm_set1_ps(-8.010193625184903e-4f);
    const __m128 CA4     = _mm_set1_ps(5.188327685732524e-3f);
    const __m128 CA3     = _mm_set1_ps(-2.685381193529856e-2f);
    const __m128 CA2     = _mm_set1_ps(1.128358514861418e-1f);
    const __m128 CA1     = _mm_set1_ps(-3.761262582423300e-1f);
    const __m128 CA0     = _mm_set1_ps(1.128379165726710f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
    const __m128 CB9     = _mm_set1_ps(-0.0018629930017603923f);
    const __m128 CB8     = _mm_set1_ps(0.003909821287598495f);
    const __m128 CB7     = _mm_set1_ps(-0.0052094582210355615f);
    const __m128 CB6     = _mm_set1_ps(0.005685614362160572f);
    const __m128 CB5     = _mm_set1_ps(-0.0025367682853477272f);
    const __m128 CB4     = _mm_set1_ps(-0.010199799682318782f);
    const __m128 CB3     = _mm_set1_ps(0.04369575504816542f);
    const __m128 CB2     = _mm_set1_ps(-0.11884063474674492f);
    const __m128 CB1     = _mm_set1_ps(0.2732120154030589f);
    const __m128 CB0     = _mm_set1_ps(0.42758357702025784f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
    const __m128 CC10    = _mm_set1_ps(-0.0445555913112064f);
    const __m128 CC9     = _mm_set1_ps(0.21376355144663348f);
    const __m128 CC8     = _mm_set1_ps(-0.3473187200259257f);
    const __m128 CC7     = _mm_set1_ps(0.016690861551248114f);
    const __m128 CC6     = _mm_set1_ps(0.7560973182491192f);
    const __m128 CC5     = _mm_set1_ps(-1.2137903600145787f);
    const __m128 CC4     = _mm_set1_ps(0.8411872321232948f);
    const __m128 CC3     = _mm_set1_ps(-0.08670413896296343f);
    const __m128 CC2     = _mm_set1_ps(-0.27124782687240334f);
    const __m128 CC1     = _mm_set1_ps(-0.0007502488047806069f);
    const __m128 CC0     = _mm_set1_ps(0.5642114853803148f);

    /* Coefficients for expansion of exp(x) in [0,0.1] */
    /* CD0 and CD1 are both 1.0, so no need to declare them separately */
    const __m128 CD2     = _mm_set1_ps(0.5000066608081202f);
    const __m128 CD3     = _mm_set1_ps(0.1664795422874624f);
    const __m128 CD4     = _mm_set1_ps(0.04379839977652482f);

    const __m128 sieve   = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
    const __m128 signbit = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
    const __m128 one     = _mm_set1_ps(1.0f);
    const __m128 two     = _mm_set1_ps(2.0f);

    __m128       x2, x4, y;
    __m128       z, q, t, t2, w, w2;
    __m128       pA0, pA1, pB0, pB1, pC0, pC1;
    __m128       expmx2, corr;
    __m128       res_erf, res_erfc, res;
    __m128       mask;

    /* Calculate erf() */
    x2      = _mm_mul_ps(x, x);
    x4      = _mm_mul_ps(x2, x2);

    pA0     = _mm_macc_ps(CA6, x4, CA4);
    pA1     = _mm_macc_ps(CA5, x4, CA3);
    pA0     = _mm_macc_ps(pA0, x4, CA2);
    pA1     = _mm_macc_ps(pA1, x4, CA1);
    pA1     = _mm_mul_ps(pA1, x2);
    pA0     = _mm_macc_ps(pA0, x4, pA1);
    /* Constant term must come last for precision reasons */
    pA0     = _mm_add_ps(pA0, CA0);

    res_erf = _mm_mul_ps(x, pA0);

    /* Calculate erfc */
    y       = gmx_mm_abs_ps(x);
    t       = gmx_mm_inv_ps(y);
    w       = _mm_sub_ps(t, one);
    t2      = _mm_mul_ps(t, t);
    w2      = _mm_mul_ps(w, w);
    /*
     * We cannot simply calculate exp(-x2) directly in single precision, since
     * that will lose a couple of bits of precision due to the multiplication.
     * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
     * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
     *
     * The only drawback with this is that it requires TWO separate exponential
     * evaluations, which would be horrible performance-wise. However, the argument
     * for the second exp() call is always small, so there we simply use a
     * low-order minimax expansion on [0,0.1].
     */
    z       = _mm_and_ps(y, sieve);
    q       = _mm_mul_ps( _mm_sub_ps(z, y), _mm_add_ps(z, y) );

    corr    = _mm_macc_ps(CD4, q, CD3);
    corr    = _mm_macc_ps(corr, q, CD2);
    corr    = _mm_macc_ps(corr, q, one);
    corr    = _mm_macc_ps(corr, q, one);

    expmx2  = gmx_mm_exp_ps( _mm_or_ps( signbit, _mm_mul_ps(z, z) ) );
    expmx2  = _mm_mul_ps(expmx2, corr);

    pB1     = _mm_macc_ps(CB9, w2, CB7);
    pB0     = _mm_macc_ps(CB8, w2, CB6);
    pB1     = _mm_macc_ps(pB1, w2, CB5);
    pB0     = _mm_macc_ps(pB0, w2, CB4);
    pB1     = _mm_macc_ps(pB1, w2, CB3);
    pB0     = _mm_macc_ps(pB0, w2, CB2);
    pB1     = _mm_macc_ps(pB1, w2, CB1);
    pB0     = _mm_macc_ps(pB0, w2, CB0);
    pB0     = _mm_macc_ps(pB1, w, pB0);

    pC0     = _mm_macc_ps(CC10, t2, CC8);
    pC1     = _mm_macc_ps(CC9, t2, CC7);
    pC0     = _mm_macc_ps(pC0, t2, CC6);
    pC1     = _mm_macc_ps(pC1, t2, CC5);
    pC0     = _mm_macc_ps(pC0, t2, CC4);
    pC1     = _mm_macc_ps(pC1, t2, CC3);
    pC0     = _mm_macc_ps(pC0, t2, CC2);
    pC1     = _mm_macc_ps(pC1, t2, CC1);

    pC0     = _mm_macc_ps(pC0, t2, CC0);
    pC0     = _mm_macc_ps(pC1, t, pC0);
    pC0     = _mm_mul_ps(pC0, t);

    /* SELECT pB0 or pC0 for erfc() */
    mask     = _mm_cmp_ps(two, y, _CMP_LT_OQ);
    res_erfc = _mm_blendv_ps(pB0, pC0, mask);
    res_erfc = _mm_mul_ps(res_erfc, expmx2);

    /* erfc(x<0) = 2-erfc(|x|) */
    mask     = _mm_cmp_ps(x, _mm_setzero_ps(), _CMP_LT_OQ);
    res_erfc = _mm_blendv_ps(res_erfc, _mm_sub_ps(two, res_erfc), mask);

    /* Select erf() or erfc() */
    mask = _mm_cmp_ps(y, _mm_set1_ps(0.75f), _CMP_LT_OQ);
    res  = _mm_blendv_ps(res_erfc, _mm_sub_ps(one, res_erf), mask);

    return res;
}
/* Calculate the force correction due to PME analytically.
 *
 * This routine is meant to enable analytical evaluation of the
 * direct-space PME electrostatic force to avoid tables.
 *
 * The direct-space potential should be Erfc(beta*r)/r, but there
 * are some problems evaluating that:
 *
 * First, the error function is difficult (read: expensive) to
 * approximate accurately for intermediate to large arguments, and
 * this happens already in ranges of beta*r that occur in simulations.
 * Second, we now try to avoid calculating potentials in Gromacs and
 * use forces directly.
 *
 * We can simplify things slightly by noting that the PME part is really
 * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
 *
 * V = 1/r - Erf(beta*r)/r
 *
 * The first term we already have from the inverse square root, so
 * we can leave it out of this routine.
 *
 * For PME tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
 * the argument beta*r will be in the range 0.15 to ~4. Use your
 * favorite plotting program to realize how well-behaved Erf(z)/z is
 * in this range!
 *
 * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
 * However, it turns out it is more efficient to approximate f(z)/z and
 * then only use even powers. This is another minor optimization, since
 * we actually WANT f(z)/z, because it is going to be multiplied by
 * the vector between the two atoms to get the vectorial force. The
 * fastest flops are the ones we can avoid calculating!
 *
 * So, here's how it should be used:
 *
 * 1. Calculate r^2.
 * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
 * 3. Evaluate this routine with z^2 as the argument.
 * 4. The return value is the expression:
 *
 *        2*exp(-z^2)     erf(z)
 *       ------------ -  --------
 *       sqrt(Pi)*z^2      z^3
 *
 * 5. Multiply the entire expression by beta^3. This will get you
 *
 *       beta^3*2*exp(-z^2)    beta^3*erf(z)
 *       ------------------ - ---------------
 *          sqrt(Pi)*z^2            z^3
 *
 *    or, switching back to r (z=r*beta):
 *
 *       2*beta*exp(-r^2*beta^2)   erf(r*beta)
 *       ----------------------- - -----------
 *            sqrt(Pi)*r^2             r^3
 *
 *    With a bit of math exercise you should be able to confirm that
 *    this is exactly D[Erf[beta*r]/r,r] divided by r one more time.
 *
 * 6. Add the result to 1/r^3, multiply by the product of the charges,
 *    and you have your force (divided by r). A final multiplication
 *    with the vector connecting the two particles and you have your
 *    vectorial force to add to the particles.
 */
static __m128
gmx_mm_pmecorrF_ps(__m128 z2)
{
    const __m128 FN6 = _mm_set1_ps(-1.7357322914161492954e-8f);
    const __m128 FN5 = _mm_set1_ps(1.4703624142580877519e-6f);
    const __m128 FN4 = _mm_set1_ps(-0.000053401640219807709149f);
    const __m128 FN3 = _mm_set1_ps(0.0010054721316683106153f);
    const __m128 FN2 = _mm_set1_ps(-0.019278317264888380590f);
    const __m128 FN1 = _mm_set1_ps(0.069670166153766424023f);
    const __m128 FN0 = _mm_set1_ps(-0.75225204789749321333f);

    const __m128 FD4 = _mm_set1_ps(0.0011193462567257629232f);
    const __m128 FD3 = _mm_set1_ps(0.014866955030185295499f);
    const __m128 FD2 = _mm_set1_ps(0.11583842382862377919f);
    const __m128 FD1 = _mm_set1_ps(0.50736591960530292870f);
    const __m128 FD0 = _mm_set1_ps(1.0f);

    __m128       z4;
    __m128       polyFN0, polyFN1, polyFD0, polyFD1;

    z4      = _mm_mul_ps(z2, z2);

    polyFD0 = _mm_macc_ps(FD4, z4, FD2);
    polyFD1 = _mm_macc_ps(FD3, z4, FD1);
    polyFD0 = _mm_macc_ps(polyFD0, z4, FD0);
    polyFD0 = _mm_macc_ps(polyFD1, z2, polyFD0);

    polyFD0 = gmx_mm_inv_ps(polyFD0);

    polyFN0 = _mm_macc_ps(FN6, z4, FN4);
    polyFN1 = _mm_macc_ps(FN5, z4, FN3);
    polyFN0 = _mm_macc_ps(polyFN0, z4, FN2);
    polyFN1 = _mm_macc_ps(polyFN1, z4, FN1);
    polyFN0 = _mm_macc_ps(polyFN0, z4, FN0);
    polyFN0 = _mm_macc_ps(polyFN1, z2, polyFN0);

    return _mm_mul_ps(polyFN0, polyFD0);
}
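
/*
 * Worked usage sketch (illustrative only) following steps 1-6 above: the
 * scalar force factor F/r for four particle pairs. The names beta, qq
 * (charge product), r2 and fscal are hypothetical.
 */
static gmx_inline void
gmx_mm_example_pme_force(float beta, float qq, const float r2[4], float fscal[4])
{
    __m128 rsq    = _mm_loadu_ps(r2);                        /* step 1: r^2      */
    __m128 rinv   = gmx_mm_invsqrt_ps(rsq);                  /* 1/r              */
    __m128 rinvsq = _mm_mul_ps(rinv, rinv);                  /* 1/r^2            */
    __m128 z2     = _mm_mul_ps(_mm_set1_ps(beta*beta), rsq); /* step 2: z^2      */
    __m128 corr   = gmx_mm_pmecorrF_ps(z2);                  /* steps 3-4        */
    __m128 f;

    corr = _mm_mul_ps(corr, _mm_set1_ps(beta*beta*beta));    /* step 5: *beta^3  */
    /* Step 6: add 1/r^3 = rinv/r^2 and scale by the charge product */
    f    = _mm_mul_ps(_mm_set1_ps(qq),
                      _mm_add_ps(corr, _mm_mul_ps(rinv, rinvsq)));
    _mm_storeu_ps(fscal, f);
}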
/* Calculate the potential correction due to PME analytically.
 *
 * See gmx_mm_pmecorrF_ps() for details about the approximation.
 *
 * This routine calculates Erf(z)/z, although you should provide z^2
 * as the input argument.
 *
 * Here's how it should be used:
 *
 * 1. Calculate r^2.
 * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
 * 3. Evaluate this routine with z^2 as the argument.
 * 4. The return value is the expression:
 *
 *        erf(z)
 *       --------
 *          z
 *
 * 5. Multiply the entire expression by beta and switch back to r (z=r*beta):
 *
 *       erf(r*beta)
 *       -----------
 *            r
 *
 * 6. Subtract the result from 1/r, multiply by the product of the charges,
 *    and you have your potential.
 */
static __m128
gmx_mm_pmecorrV_ps(__m128 z2)
{
    const __m128 VN6 = _mm_set1_ps(1.9296833005951166339e-8f);
    const __m128 VN5 = _mm_set1_ps(-1.4213390571557850962e-6f);
    const __m128 VN4 = _mm_set1_ps(0.000041603292906656984871f);
    const __m128 VN3 = _mm_set1_ps(-0.00013134036773265025626f);
    const __m128 VN2 = _mm_set1_ps(0.038657983986041781264f);
    const __m128 VN1 = _mm_set1_ps(0.11285044772717598220f);
    const __m128 VN0 = _mm_set1_ps(1.1283802385263030286f);

    const __m128 VD3 = _mm_set1_ps(0.0066752224023576045451f);
    const __m128 VD2 = _mm_set1_ps(0.078647795836373922256f);
    const __m128 VD1 = _mm_set1_ps(0.43336185284710920150f);
    const __m128 VD0 = _mm_set1_ps(1.0f);

    __m128       z4;
    __m128       polyVN0, polyVN1, polyVD0, polyVD1;

    z4      = _mm_mul_ps(z2, z2);

    polyVD1 = _mm_macc_ps(VD3, z4, VD1);
    polyVD0 = _mm_macc_ps(VD2, z4, VD0);
    polyVD0 = _mm_macc_ps(polyVD1, z2, polyVD0);

    polyVD0 = gmx_mm_inv_ps(polyVD0);

    polyVN0 = _mm_macc_ps(VN6, z4, VN4);
    polyVN1 = _mm_macc_ps(VN5, z4, VN3);
    polyVN0 = _mm_macc_ps(polyVN0, z4, VN2);
    polyVN1 = _mm_macc_ps(polyVN1, z4, VN1);
    polyVN0 = _mm_macc_ps(polyVN0, z4, VN0);
    polyVN0 = _mm_macc_ps(polyVN1, z2, polyVN0);

    return _mm_mul_ps(polyVN0, polyVD0);
}
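
/*
 * Usage sketch (illustrative only) following the steps above: the
 * PME-corrected potential qq*erfc(beta*r)/r for four pairs, using
 * erfc(beta*r)/r = 1/r - erf(beta*r)/r. The names are hypothetical.
 */
static gmx_inline void
gmx_mm_example_pme_potential(float beta, float qq, const float r2[4], float vcoul[4])
{
    __m128 rsq  = _mm_loadu_ps(r2);
    __m128 rinv = gmx_mm_invsqrt_ps(rsq);                          /* 1/r              */
    __m128 z2   = _mm_mul_ps(_mm_set1_ps(beta*beta), rsq);         /* z^2 = beta^2*r^2 */
    __m128 corr = _mm_mul_ps(_mm_set1_ps(beta),
                             gmx_mm_pmecorrV_ps(z2));              /* erf(beta*r)/r    */

    _mm_storeu_ps(vcoul, _mm_mul_ps(_mm_set1_ps(qq), _mm_sub_ps(rinv, corr)));
}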
static int
gmx_mm_sincos_ps(__m128 x,
                 __m128 *sinval,
                 __m128 *cosval)
{
    const __m128  two_over_pi = _mm_set1_ps(2.0/M_PI);
    const __m128  half        = _mm_set1_ps(0.5);
    const __m128  one         = _mm_set1_ps(1.0);

    const __m128i izero       = _mm_set1_epi32(0);
    const __m128i ione        = _mm_set1_epi32(1);
    const __m128i itwo        = _mm_set1_epi32(2);
    const __m128i ithree      = _mm_set1_epi32(3);
    const __m128  signbit     = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );

    const __m128  CA1         = _mm_set1_ps(1.5703125f);
    const __m128  CA2         = _mm_set1_ps(4.837512969970703125e-4f);
    const __m128  CA3         = _mm_set1_ps(7.54978995489188216e-8f);

    const __m128  CC0         = _mm_set1_ps(-0.0013602249f);
    const __m128  CC1         = _mm_set1_ps(0.0416566950f);
    const __m128  CC2         = _mm_set1_ps(-0.4999990225f);
    const __m128  CS0         = _mm_set1_ps(-0.0001950727f);
    const __m128  CS1         = _mm_set1_ps(0.0083320758f);
    const __m128  CS2         = _mm_set1_ps(-0.1666665247f);

    __m128        y, y2;
    __m128        z;
    __m128i       iz;
    __m128i       offset_sin, offset_cos;
    __m128        tmp1, tmp2;
    __m128        mask_sin, mask_cos;
    __m128        tmp_sin, tmp_cos;

    y          = _mm_mul_ps(x, two_over_pi);
    y          = _mm_add_ps(y, _mm_or_ps(_mm_and_ps(y, signbit), half));

    iz         = _mm_cvttps_epi32(y);
    z          = _mm_round_ps(y, _MM_FROUND_TO_ZERO);

    offset_sin = _mm_and_si128(iz, ithree);
    offset_cos = _mm_add_epi32(iz, ione);

    /* Extended-precision arithmetic to achieve full precision */
    y          = _mm_nmacc_ps(z, CA1, x);
    y          = _mm_nmacc_ps(z, CA2, y);
    y          = _mm_nmacc_ps(z, CA3, y);

    y2         = _mm_mul_ps(y, y);

    tmp1       = _mm_macc_ps(CC0, y2, CC1);
    tmp2       = _mm_macc_ps(CS0, y2, CS1);
    tmp1       = _mm_macc_ps(tmp1, y2, CC2);
    tmp2       = _mm_macc_ps(tmp2, y2, CS2);

    tmp1       = _mm_macc_ps(tmp1, y2, one);

    tmp2       = _mm_macc_ps(tmp2, _mm_mul_ps(y, y2), y);

    mask_sin   = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin, ione), izero));
    mask_cos   = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos, ione), izero));

    tmp_sin    = _mm_blendv_ps(tmp1, tmp2, mask_sin);
    tmp_cos    = _mm_blendv_ps(tmp1, tmp2, mask_cos);

    mask_sin   = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin, itwo), izero));
    mask_cos   = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos, itwo), izero));

    tmp1       = _mm_xor_ps(signbit, tmp_sin);
    tmp2       = _mm_xor_ps(signbit, tmp_cos);

    *sinval    = _mm_blendv_ps(tmp1, tmp_sin, mask_sin);
    *cosval    = _mm_blendv_ps(tmp2, tmp_cos, mask_cos);

    return 0;
}
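
/*
 * Usage sketch (illustrative only): one sincos() call gives both results,
 * which is what the note below recommends over separate sin/cos calls. The
 * array names are hypothetical.
 */
static gmx_inline void
gmx_mm_example_sincos(const float angle[4], float s[4], float c[4])
{
    __m128 sv, cv;

    gmx_mm_sincos_ps(_mm_loadu_ps(angle), &sv, &cv);
    _mm_storeu_ps(s, sv);
    _mm_storeu_ps(c, cv);
}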
/*
 * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
 * will then call the sincos() routine and waste a factor 2 in performance!
 */
static __m128
gmx_mm_sin_ps(__m128 x)
{
    __m128 s, c;
    gmx_mm_sincos_ps(x, &s, &c);
    return s;
}
/*
 * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
 * will then call the sincos() routine and waste a factor 2 in performance!
 */
static __m128
gmx_mm_cos_ps(__m128 x)
{
    __m128 s, c;
    gmx_mm_sincos_ps(x, &s, &c);
    return c;
}
static __m128
gmx_mm_tan_ps(__m128 x)
{
    __m128 sinval, cosval;
    __m128 tanval;

    gmx_mm_sincos_ps(x, &sinval, &cosval);

    tanval = _mm_mul_ps(sinval, gmx_mm_inv_ps(cosval));

    return tanval;
}
static __m128
gmx_mm_asin_ps(__m128 x)
{
    /* Same algorithm as cephes library */
    const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
    const __m128 limitlow = _mm_set1_ps(1e-4f);
    const __m128 half     = _mm_set1_ps(0.5f);
    const __m128 one      = _mm_set1_ps(1.0f);
    const __m128 halfpi   = _mm_set1_ps(M_PI/2.0f);

    const __m128 CC5      = _mm_set1_ps(4.2163199048E-2f);
    const __m128 CC4      = _mm_set1_ps(2.4181311049E-2f);
    const __m128 CC3      = _mm_set1_ps(4.5470025998E-2f);
    const __m128 CC2      = _mm_set1_ps(7.4953002686E-2f);
    const __m128 CC1      = _mm_set1_ps(1.6666752422E-1f);

    __m128       sign;
    __m128       mask;
    __m128       xabs;
    __m128       z, z1, z2, q, q1, q2;
    __m128       pA, pB;

    sign  = _mm_andnot_ps(signmask, x);
    xabs  = _mm_and_ps(x, signmask);

    mask  = _mm_cmp_ps(xabs, half, _CMP_GT_OQ);

    z1    = _mm_mul_ps(half, _mm_sub_ps(one, xabs));
    q1    = _mm_mul_ps(z1, gmx_mm_invsqrt_ps(z1));
    q1    = _mm_andnot_ps(_mm_cmp_ps(xabs, one, _CMP_EQ_OQ), q1);

    q2    = xabs;
    z2    = _mm_mul_ps(q2, q2);

    z     = _mm_or_ps( _mm_and_ps(mask, z1), _mm_andnot_ps(mask, z2) );
    q     = _mm_or_ps( _mm_and_ps(mask, q1), _mm_andnot_ps(mask, q2) );

    z2    = _mm_mul_ps(z, z);

    pA    = _mm_macc_ps(CC5, z2, CC3);
    pB    = _mm_macc_ps(CC4, z2, CC2);

    pA    = _mm_macc_ps(pA, z2, CC1);
    pA    = _mm_mul_ps(pA, z);

    z     = _mm_macc_ps(pB, z2, pA);

    z     = _mm_macc_ps(z, q, q);

    q2    = _mm_sub_ps(halfpi, z);
    q2    = _mm_sub_ps(q2, z);

    z     = _mm_or_ps( _mm_and_ps(mask, q2), _mm_andnot_ps(mask, z) );

    mask  = _mm_cmp_ps(xabs, limitlow, _CMP_GT_OQ);
    z     = _mm_or_ps( _mm_and_ps(mask, z), _mm_andnot_ps(mask, xabs) );

    z     = _mm_xor_ps(z, sign);

    return z;
}
static __m128
gmx_mm_acos_ps(__m128 x)
{
    const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
    const __m128 one_ps    = _mm_set1_ps(1.0f);
    const __m128 half_ps   = _mm_set1_ps(0.5f);
    const __m128 pi_ps     = _mm_set1_ps(M_PI);
    const __m128 halfpi_ps = _mm_set1_ps(M_PI/2.0f);

    __m128       mask1;
    __m128       mask2;
    __m128       xabs;
    __m128       z, z1, z2, z3;

    xabs  = _mm_and_ps(x, signmask);
    mask1 = _mm_cmp_ps(xabs, half_ps, _CMP_GT_OQ);
    mask2 = _mm_cmp_ps(x, _mm_setzero_ps(), _CMP_GT_OQ);

    z     = _mm_mul_ps(half_ps, _mm_sub_ps(one_ps, xabs));
    z     = _mm_mul_ps(z, gmx_mm_invsqrt_ps(z));
    z     = _mm_andnot_ps(_mm_cmp_ps(xabs, one_ps, _CMP_EQ_OQ), z);

    z     = _mm_blendv_ps(x, z, mask1);
    z     = gmx_mm_asin_ps(z);

    z2    = _mm_add_ps(z, z);
    z1    = _mm_sub_ps(pi_ps, z2);
    z3    = _mm_sub_ps(halfpi_ps, z);

    z     = _mm_blendv_ps(z1, z2, mask2);
    z     = _mm_blendv_ps(z3, z, mask1);

    return z;
}
static __m128
gmx_mm_atan_ps(__m128 x)
{
    /* Same algorithm as cephes library */
    const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
    const __m128 limit1    = _mm_set1_ps(0.414213562373095f);
    const __m128 limit2    = _mm_set1_ps(2.414213562373095f);
    const __m128 quarterpi = _mm_set1_ps(0.785398163397448f);
    const __m128 halfpi    = _mm_set1_ps(1.570796326794896f);
    const __m128 mone      = _mm_set1_ps(-1.0f);
    const __m128 CC3       = _mm_set1_ps(-3.33329491539E-1f);
    const __m128 CC5       = _mm_set1_ps(1.99777106478E-1f);
    const __m128 CC7       = _mm_set1_ps(-1.38776856032E-1f);
    const __m128 CC9       = _mm_set1_ps(8.05374449538e-2f);

    __m128       sign;
    __m128       mask1, mask2;
    __m128       y, z1, z2;
    __m128       x2, x4;
    __m128       sum1, sum2;

    sign  = _mm_andnot_ps(signmask, x);
    x     = _mm_and_ps(x, signmask);

    mask1 = _mm_cmp_ps(x, limit1, _CMP_GT_OQ);
    mask2 = _mm_cmp_ps(x, limit2, _CMP_GT_OQ);

    z1    = _mm_mul_ps(_mm_add_ps(x, mone), gmx_mm_inv_ps(_mm_sub_ps(x, mone)));
    z2    = _mm_mul_ps(mone, gmx_mm_inv_ps(x));

    y     = _mm_and_ps(mask1, quarterpi);
    y     = _mm_blendv_ps(y, halfpi, mask2);

    x     = _mm_blendv_ps(x, z1, mask1);
    x     = _mm_blendv_ps(x, z2, mask2);

    x2    = _mm_mul_ps(x, x);
    x4    = _mm_mul_ps(x2, x2);

    sum1  = _mm_macc_ps(CC9, x4, CC5);
    sum2  = _mm_macc_ps(CC7, x4, CC3);
    sum1  = _mm_mul_ps(sum1, x4);
    sum1  = _mm_macc_ps(sum2, x2, sum1);

    sum1  = _mm_sub_ps(sum1, mone);
    y     = _mm_macc_ps(sum1, x, y);

    y     = _mm_xor_ps(y, sign);

    return y;
}
static __m128
gmx_mm_atan2_ps(__m128 y, __m128 x)
{
    const __m128 pi          = _mm_set1_ps(M_PI);
    const __m128 minuspi     = _mm_set1_ps(-M_PI);
    const __m128 halfpi      = _mm_set1_ps(M_PI/2.0);
    const __m128 minushalfpi = _mm_set1_ps(-M_PI/2.0);

    __m128       z, z1, z3, z4;
    __m128       w;
    __m128       maskx_lt, maskx_eq;
    __m128       masky_lt, masky_eq;
    __m128       mask1, mask2, mask3, mask4, maskall;

    maskx_lt = _mm_cmp_ps(x, _mm_setzero_ps(), _CMP_LT_OQ);
    masky_lt = _mm_cmp_ps(y, _mm_setzero_ps(), _CMP_LT_OQ);
    maskx_eq = _mm_cmp_ps(x, _mm_setzero_ps(), _CMP_EQ_OQ);
    masky_eq = _mm_cmp_ps(y, _mm_setzero_ps(), _CMP_EQ_OQ);

    z        = _mm_mul_ps(y, gmx_mm_inv_ps(x));
    z        = gmx_mm_atan_ps(z);

    mask1    = _mm_and_ps(maskx_eq, masky_lt);
    mask2    = _mm_andnot_ps(maskx_lt, masky_eq);
    mask3    = _mm_andnot_ps( _mm_or_ps(masky_lt, masky_eq), maskx_eq);
    mask4    = _mm_and_ps(masky_eq, maskx_lt);

    maskall  = _mm_or_ps( _mm_or_ps(mask1, mask2), _mm_or_ps(mask3, mask4) );

    z        = _mm_andnot_ps(maskall, z);
    z1       = _mm_and_ps(mask1, minushalfpi);
    z3       = _mm_and_ps(mask3, halfpi);
    z4       = _mm_and_ps(mask4, pi);

    z        = _mm_or_ps( _mm_or_ps(z, z1), _mm_or_ps(z3, z4) );

    mask1    = _mm_andnot_ps(masky_lt, maskx_lt);
    mask2    = _mm_and_ps(maskx_lt, masky_lt);

    w        = _mm_or_ps( _mm_and_ps(mask1, pi), _mm_and_ps(mask2, minuspi) );
    w        = _mm_andnot_ps(maskall, w);

    z        = _mm_add_ps(z, w);

    return z;
}
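
/*
 * Usage sketch (illustrative only): packed atan2, one value per quadrant.
 * The stored results should be approximately pi/4, 3*pi/4, -3*pi/4 and -pi/4.
 */
static gmx_inline void
gmx_mm_example_atan2(float out[4])
{
    __m128 y = _mm_set_ps(-1.0f, -1.0f, 1.0f, 1.0f); /* elements listed high to low */
    __m128 x = _mm_set_ps( 1.0f, -1.0f, -1.0f, 1.0f);

    _mm_storeu_ps(out, gmx_mm_atan2_ps(y, x));
}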
#endif /* _gmx_math_x86_avx_128_fma_single_h_ */