2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013, by the GROMACS development team, led by
5 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
6 * others, as listed in the AUTHORS file in the top-level source
7 * directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
35 #ifndef _gmx_math_x86_avx_128_fma_single_h_
36 #define _gmx_math_x86_avx_128_fma_single_h_
38 #include <immintrin.h> /* AVX */
39 #ifdef HAVE_X86INTRIN_H
40 #include <x86intrin.h> /* FMA */
43 #include <intrin.h> /* FMA MSVC */
48 #include "gmx_x86_avx_128_fma.h"
52 # define M_PI 3.14159265358979323846264338327950288
58 /************************
60 * Simple math routines *
62 ************************/
65 static gmx_inline __m128
66 gmx_mm_invsqrt_ps(__m128 x
)
68 const __m128 half
= _mm_set1_ps(0.5);
69 const __m128 one
= _mm_set1_ps(1.0);
71 __m128 lu
= _mm_rsqrt_ps(x
);
73 return _mm_macc_ps(_mm_nmacc_ps(x
, _mm_mul_ps(lu
, lu
), one
), _mm_mul_ps(lu
, half
), lu
);
76 /* sqrt(x) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
77 static gmx_inline __m128
78 gmx_mm_sqrt_ps(__m128 x
)
83 mask
= _mm_cmp_ps(x
, _mm_setzero_ps(), _CMP_EQ_OQ
);
84 res
= _mm_andnot_ps(mask
, gmx_mm_invsqrt_ps(x
));
86 res
= _mm_mul_ps(x
, res
);
92 static gmx_inline __m128
93 gmx_mm_inv_ps(__m128 x
)
95 const __m128 two
= _mm_set1_ps(2.0);
97 __m128 lu
= _mm_rcp_ps(x
);
99 return _mm_mul_ps(lu
, _mm_nmacc_ps(lu
, x
, two
));
102 static gmx_inline __m128
103 gmx_mm_abs_ps(__m128 x
)
105 const __m128 signmask
= gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
107 return _mm_and_ps(x
, signmask
);
111 gmx_mm_log_ps(__m128 x
)
113 /* Same algorithm as cephes library */
114 const __m128 expmask
= gmx_mm_castsi128_ps( _mm_set_epi32(0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000) );
115 const __m128i expbase_m1
= _mm_set1_epi32(127-1); /* We want non-IEEE format */
116 const __m128 half
= _mm_set1_ps(0.5f
);
117 const __m128 one
= _mm_set1_ps(1.0f
);
118 const __m128 invsq2
= _mm_set1_ps(1.0f
/sqrt(2.0f
));
119 const __m128 corr1
= _mm_set1_ps(-2.12194440e-4f
);
120 const __m128 corr2
= _mm_set1_ps(0.693359375f
);
122 const __m128 CA_1
= _mm_set1_ps(0.070376836292f
);
123 const __m128 CB_0
= _mm_set1_ps(1.6714950086782716f
);
124 const __m128 CB_1
= _mm_set1_ps(-2.452088066061482f
);
125 const __m128 CC_0
= _mm_set1_ps(1.5220770854701728f
);
126 const __m128 CC_1
= _mm_set1_ps(-1.3422238433233642f
);
127 const __m128 CD_0
= _mm_set1_ps(1.386218787509749f
);
128 const __m128 CD_1
= _mm_set1_ps(0.35075468953796346f
);
129 const __m128 CE_0
= _mm_set1_ps(1.3429983063133937f
);
130 const __m128 CE_1
= _mm_set1_ps(1.807420826584643f
);
137 __m128 pA
, pB
, pC
, pD
, pE
, tB
, tC
, tD
, tE
;
139 /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
140 fexp
= _mm_and_ps(x
, expmask
);
141 iexp
= gmx_mm_castps_si128(fexp
);
142 iexp
= _mm_srli_epi32(iexp
, 23);
143 iexp
= _mm_sub_epi32(iexp
, expbase_m1
);
145 x
= _mm_andnot_ps(expmask
, x
);
146 x
= _mm_or_ps(x
, one
);
147 x
= _mm_mul_ps(x
, half
);
149 mask
= _mm_cmp_ps(x
, invsq2
, _CMP_LT_OQ
);
151 x
= _mm_add_ps(x
, _mm_and_ps(mask
, x
));
152 x
= _mm_sub_ps(x
, one
);
153 iexp
= _mm_add_epi32(iexp
, gmx_mm_castps_si128(mask
)); /* 0xFFFFFFFF = -1 as int */
155 x2
= _mm_mul_ps(x
, x
);
157 pA
= _mm_mul_ps(CA_1
, x
);
159 pB
= _mm_add_ps(x
, CB_1
);
160 pC
= _mm_add_ps(x
, CC_1
);
161 pD
= _mm_add_ps(x
, CD_1
);
162 pE
= _mm_add_ps(x
, CE_1
);
164 pB
= _mm_macc_ps(x
, pB
, CB_0
);
165 pC
= _mm_macc_ps(x
, pC
, CC_0
);
166 pD
= _mm_macc_ps(x
, pD
, CD_0
);
167 pE
= _mm_macc_ps(x
, pE
, CE_0
);
169 pA
= _mm_mul_ps(pA
, pB
);
170 pC
= _mm_mul_ps(pC
, pD
);
171 pE
= _mm_mul_ps(pE
, x2
);
172 pA
= _mm_mul_ps(pA
, pC
);
173 y
= _mm_mul_ps(pA
, pE
);
175 fexp
= _mm_cvtepi32_ps(iexp
);
176 y
= _mm_macc_ps(fexp
, corr1
, y
);
177 y
= _mm_nmacc_ps(half
, x2
, y
);
179 x2
= _mm_add_ps(x
, y
);
180 x2
= _mm_macc_ps(fexp
, corr2
, x2
);
189 * The 2^w term is calculated from a (6,0)-th order (no denominator) Minimax polynomia on the interval
190 * [-0.5,0.5]. The coefficiencts of this was derived in Mathematica using the command:
192 * MiniMaxApproximation[(2^x), {x, {-0.5, 0.5}, 6, 0}, WorkingPrecision -> 15]
194 * The largest-magnitude exponent we can represent in IEEE single-precision binary format
195 * is 2^-126 for small numbers and 2^127 for large ones. To avoid wrap-around problems, we set the
196 * result to zero if the argument falls outside this range. For small numbers this is just fine, but
197 * for large numbers you could be fancy and return the smallest/largest IEEE single-precision
198 * number instead. That would take a few extra cycles and not really help, since something is
199 * wrong if you are using single precision to work with numbers that cannot really be represented
200 * in single precision.
202 * The accuracy is at least 23 bits.
205 gmx_mm_exp2_ps(__m128 x
)
207 /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
208 const __m128 arglimit
= _mm_set1_ps(126.0f
);
210 const __m128i expbase
= _mm_set1_epi32(127);
211 const __m128 CA6
= _mm_set1_ps(1.535336188319500E-004);
212 const __m128 CA5
= _mm_set1_ps(1.339887440266574E-003);
213 const __m128 CA4
= _mm_set1_ps(9.618437357674640E-003);
214 const __m128 CA3
= _mm_set1_ps(5.550332471162809E-002);
215 const __m128 CA2
= _mm_set1_ps(2.402264791363012E-001);
216 const __m128 CA1
= _mm_set1_ps(6.931472028550421E-001);
217 const __m128 CA0
= _mm_set1_ps(1.0f
);
226 iexppart
= _mm_cvtps_epi32(x
);
227 intpart
= _mm_round_ps(x
, _MM_FROUND_TO_NEAREST_INT
);
228 iexppart
= _mm_slli_epi32(_mm_add_epi32(iexppart
, expbase
), 23);
229 valuemask
= _mm_cmp_ps(arglimit
, gmx_mm_abs_ps(x
), _CMP_GE_OQ
);
230 fexppart
= _mm_and_ps(valuemask
, gmx_mm_castsi128_ps(iexppart
));
232 x
= _mm_sub_ps(x
, intpart
);
233 x2
= _mm_mul_ps(x
, x
);
235 p0
= _mm_macc_ps(CA6
, x2
, CA4
);
236 p1
= _mm_macc_ps(CA5
, x2
, CA3
);
237 p0
= _mm_macc_ps(p0
, x2
, CA2
);
238 p1
= _mm_macc_ps(p1
, x2
, CA1
);
239 p0
= _mm_macc_ps(p0
, x2
, CA0
);
240 p0
= _mm_macc_ps(p1
, x
, p0
);
241 x
= _mm_mul_ps(p0
, fexppart
);
247 /* Exponential function. This could be calculated from 2^x as Exp(x)=2^(y), where y=log2(e)*x,
248 * but there will then be a small rounding error since we lose some precision due to the
249 * multiplication. This will then be magnified a lot by the exponential.
251 * Instead, we calculate the fractional part directly as a minimax approximation of
252 * Exp(z) on [-0.5,0.5]. We use extended precision arithmetics to calculate the fraction
253 * remaining after 2^y, which avoids the precision-loss.
254 * The final result is correct to within 1 LSB over the entire argument range.
257 gmx_mm_exp_ps(__m128 x
)
259 const __m128 argscale
= _mm_set1_ps(1.44269504088896341f
);
260 /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
261 const __m128 arglimit
= _mm_set1_ps(126.0f
);
262 const __m128i expbase
= _mm_set1_epi32(127);
264 const __m128 invargscale0
= _mm_set1_ps(0.693359375f
);
265 const __m128 invargscale1
= _mm_set1_ps(-2.12194440e-4f
);
267 const __m128 CC5
= _mm_set1_ps(1.9875691500e-4f
);
268 const __m128 CC4
= _mm_set1_ps(1.3981999507e-3f
);
269 const __m128 CC3
= _mm_set1_ps(8.3334519073e-3f
);
270 const __m128 CC2
= _mm_set1_ps(4.1665795894e-2f
);
271 const __m128 CC1
= _mm_set1_ps(1.6666665459e-1f
);
272 const __m128 CC0
= _mm_set1_ps(5.0000001201e-1f
);
273 const __m128 one
= _mm_set1_ps(1.0f
);
282 y
= _mm_mul_ps(x
, argscale
);
284 iexppart
= _mm_cvtps_epi32(y
);
285 intpart
= _mm_round_ps(y
, _MM_FROUND_TO_NEAREST_INT
);
287 iexppart
= _mm_slli_epi32(_mm_add_epi32(iexppart
, expbase
), 23);
288 valuemask
= _mm_cmp_ps(arglimit
, gmx_mm_abs_ps(y
), _CMP_GE_OQ
);
289 fexppart
= _mm_and_ps(valuemask
, gmx_mm_castsi128_ps(iexppart
));
291 /* Extended precision arithmetics */
292 x
= _mm_nmacc_ps(invargscale0
, intpart
, x
);
293 x
= _mm_nmacc_ps(invargscale1
, intpart
, x
);
295 x2
= _mm_mul_ps(x
, x
);
297 p1
= _mm_macc_ps(CC5
, x2
, CC3
);
298 p0
= _mm_macc_ps(CC4
, x2
, CC2
);
299 p1
= _mm_macc_ps(p1
, x2
, CC1
);
300 p0
= _mm_macc_ps(p0
, x2
, CC0
);
301 p0
= _mm_macc_ps(p1
, x
, p0
);
302 p0
= _mm_macc_ps(p0
, x2
, one
);
304 x
= _mm_add_ps(x
, p0
);
306 x
= _mm_mul_ps(x
, fexppart
);
311 /* FULL precision. Only errors in LSB */
313 gmx_mm_erf_ps(__m128 x
)
315 /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
316 const __m128 CA6
= _mm_set1_ps(7.853861353153693e-5f
);
317 const __m128 CA5
= _mm_set1_ps(-8.010193625184903e-4f
);
318 const __m128 CA4
= _mm_set1_ps(5.188327685732524e-3f
);
319 const __m128 CA3
= _mm_set1_ps(-2.685381193529856e-2f
);
320 const __m128 CA2
= _mm_set1_ps(1.128358514861418e-1f
);
321 const __m128 CA1
= _mm_set1_ps(-3.761262582423300e-1f
);
322 const __m128 CA0
= _mm_set1_ps(1.128379165726710f
);
323 /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
324 const __m128 CB9
= _mm_set1_ps(-0.0018629930017603923f
);
325 const __m128 CB8
= _mm_set1_ps(0.003909821287598495f
);
326 const __m128 CB7
= _mm_set1_ps(-0.0052094582210355615f
);
327 const __m128 CB6
= _mm_set1_ps(0.005685614362160572f
);
328 const __m128 CB5
= _mm_set1_ps(-0.0025367682853477272f
);
329 const __m128 CB4
= _mm_set1_ps(-0.010199799682318782f
);
330 const __m128 CB3
= _mm_set1_ps(0.04369575504816542f
);
331 const __m128 CB2
= _mm_set1_ps(-0.11884063474674492f
);
332 const __m128 CB1
= _mm_set1_ps(0.2732120154030589f
);
333 const __m128 CB0
= _mm_set1_ps(0.42758357702025784f
);
334 /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
335 const __m128 CC10
= _mm_set1_ps(-0.0445555913112064f
);
336 const __m128 CC9
= _mm_set1_ps(0.21376355144663348f
);
337 const __m128 CC8
= _mm_set1_ps(-0.3473187200259257f
);
338 const __m128 CC7
= _mm_set1_ps(0.016690861551248114f
);
339 const __m128 CC6
= _mm_set1_ps(0.7560973182491192f
);
340 const __m128 CC5
= _mm_set1_ps(-1.2137903600145787f
);
341 const __m128 CC4
= _mm_set1_ps(0.8411872321232948f
);
342 const __m128 CC3
= _mm_set1_ps(-0.08670413896296343f
);
343 const __m128 CC2
= _mm_set1_ps(-0.27124782687240334f
);
344 const __m128 CC1
= _mm_set1_ps(-0.0007502488047806069f
);
345 const __m128 CC0
= _mm_set1_ps(0.5642114853803148f
);
347 /* Coefficients for expansion of exp(x) in [0,0.1] */
348 /* CD0 and CD1 are both 1.0, so no need to declare them separately */
349 const __m128 CD2
= _mm_set1_ps(0.5000066608081202f
);
350 const __m128 CD3
= _mm_set1_ps(0.1664795422874624f
);
351 const __m128 CD4
= _mm_set1_ps(0.04379839977652482f
);
353 const __m128 sieve
= gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
354 const __m128 signbit
= gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
355 const __m128 one
= _mm_set1_ps(1.0f
);
356 const __m128 two
= _mm_set1_ps(2.0f
);
359 __m128 z
, q
, t
, t2
, w
, w2
;
360 __m128 pA0
, pA1
, pB0
, pB1
, pC0
, pC1
;
362 __m128 res_erf
, res_erfc
, res
;
365 /* Calculate erf() */
366 x2
= _mm_mul_ps(x
, x
);
367 x4
= _mm_mul_ps(x2
, x2
);
369 pA0
= _mm_macc_ps(CA6
, x4
, CA4
);
370 pA1
= _mm_macc_ps(CA5
, x4
, CA3
);
371 pA0
= _mm_macc_ps(pA0
, x4
, CA2
);
372 pA1
= _mm_macc_ps(pA1
, x4
, CA1
);
373 pA0
= _mm_mul_ps(pA0
, x4
);
374 pA0
= _mm_macc_ps(pA1
, x2
, pA0
);
375 /* Constant term must come last for precision reasons */
376 pA0
= _mm_add_ps(pA0
, CA0
);
378 res_erf
= _mm_mul_ps(x
, pA0
);
382 y
= gmx_mm_abs_ps(x
);
383 t
= gmx_mm_inv_ps(y
);
384 w
= _mm_sub_ps(t
, one
);
385 t2
= _mm_mul_ps(t
, t
);
386 w2
= _mm_mul_ps(w
, w
);
388 * We cannot simply calculate exp(-x2) directly in single precision, since
389 * that will lose a couple of bits of precision due to the multiplication.
390 * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
391 * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
393 * The only drawback with this is that it requires TWO separate exponential
394 * evaluations, which would be horrible performance-wise. However, the argument
395 * for the second exp() call is always small, so there we simply use a
396 * low-order minimax expansion on [0,0.1].
399 z
= _mm_and_ps(y
, sieve
);
400 q
= _mm_mul_ps( _mm_sub_ps(z
, y
), _mm_add_ps(z
, y
) );
402 corr
= _mm_macc_ps(CD4
, q
, CD3
);
403 corr
= _mm_macc_ps(corr
, q
, CD2
);
404 corr
= _mm_macc_ps(corr
, q
, one
);
405 corr
= _mm_macc_ps(corr
, q
, one
);
407 expmx2
= gmx_mm_exp_ps( _mm_or_ps( signbit
, _mm_mul_ps(z
, z
) ) );
408 expmx2
= _mm_mul_ps(expmx2
, corr
);
410 pB1
= _mm_macc_ps(CB9
, w2
, CB7
);
411 pB0
= _mm_macc_ps(CB8
, w2
, CB6
);
412 pB1
= _mm_macc_ps(pB1
, w2
, CB5
);
413 pB0
= _mm_macc_ps(pB0
, w2
, CB4
);
414 pB1
= _mm_macc_ps(pB1
, w2
, CB3
);
415 pB0
= _mm_macc_ps(pB0
, w2
, CB2
);
416 pB1
= _mm_macc_ps(pB1
, w2
, CB1
);
417 pB0
= _mm_macc_ps(pB0
, w2
, CB0
);
418 pB0
= _mm_macc_ps(pB1
, w
, pB0
);
420 pC0
= _mm_macc_ps(CC10
, t2
, CC8
);
421 pC1
= _mm_macc_ps(CC9
, t2
, CC7
);
422 pC0
= _mm_macc_ps(pC0
, t2
, CC6
);
423 pC1
= _mm_macc_ps(pC1
, t2
, CC5
);
424 pC0
= _mm_macc_ps(pC0
, t2
, CC4
);
425 pC1
= _mm_macc_ps(pC1
, t2
, CC3
);
426 pC0
= _mm_macc_ps(pC0
, t2
, CC2
);
427 pC1
= _mm_macc_ps(pC1
, t2
, CC1
);
429 pC0
= _mm_macc_ps(pC0
, t2
, CC0
);
430 pC0
= _mm_macc_ps(pC1
, t
, pC0
);
431 pC0
= _mm_mul_ps(pC0
, t
);
433 /* SELECT pB0 or pC0 for erfc() */
434 mask
= _mm_cmp_ps(two
, y
, _CMP_LT_OQ
);
435 res_erfc
= _mm_blendv_ps(pB0
, pC0
, mask
);
436 res_erfc
= _mm_mul_ps(res_erfc
, expmx2
);
438 /* erfc(x<0) = 2-erfc(|x|) */
439 mask
= _mm_cmp_ps(x
, _mm_setzero_ps(), _CMP_LT_OQ
);
440 res_erfc
= _mm_blendv_ps(res_erfc
, _mm_sub_ps(two
, res_erfc
), mask
);
442 /* Select erf() or erfc() */
443 mask
= _mm_cmp_ps(y
, _mm_set1_ps(0.75f
), _CMP_LT_OQ
);
444 res
= _mm_blendv_ps(_mm_sub_ps(one
, res_erfc
), res_erf
, mask
);
450 /* FULL precision. Only errors in LSB */
452 gmx_mm_erfc_ps(__m128 x
)
454 /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
455 const __m128 CA6
= _mm_set1_ps(7.853861353153693e-5f
);
456 const __m128 CA5
= _mm_set1_ps(-8.010193625184903e-4f
);
457 const __m128 CA4
= _mm_set1_ps(5.188327685732524e-3f
);
458 const __m128 CA3
= _mm_set1_ps(-2.685381193529856e-2f
);
459 const __m128 CA2
= _mm_set1_ps(1.128358514861418e-1f
);
460 const __m128 CA1
= _mm_set1_ps(-3.761262582423300e-1f
);
461 const __m128 CA0
= _mm_set1_ps(1.128379165726710f
);
462 /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
463 const __m128 CB9
= _mm_set1_ps(-0.0018629930017603923f
);
464 const __m128 CB8
= _mm_set1_ps(0.003909821287598495f
);
465 const __m128 CB7
= _mm_set1_ps(-0.0052094582210355615f
);
466 const __m128 CB6
= _mm_set1_ps(0.005685614362160572f
);
467 const __m128 CB5
= _mm_set1_ps(-0.0025367682853477272f
);
468 const __m128 CB4
= _mm_set1_ps(-0.010199799682318782f
);
469 const __m128 CB3
= _mm_set1_ps(0.04369575504816542f
);
470 const __m128 CB2
= _mm_set1_ps(-0.11884063474674492f
);
471 const __m128 CB1
= _mm_set1_ps(0.2732120154030589f
);
472 const __m128 CB0
= _mm_set1_ps(0.42758357702025784f
);
473 /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
474 const __m128 CC10
= _mm_set1_ps(-0.0445555913112064f
);
475 const __m128 CC9
= _mm_set1_ps(0.21376355144663348f
);
476 const __m128 CC8
= _mm_set1_ps(-0.3473187200259257f
);
477 const __m128 CC7
= _mm_set1_ps(0.016690861551248114f
);
478 const __m128 CC6
= _mm_set1_ps(0.7560973182491192f
);
479 const __m128 CC5
= _mm_set1_ps(-1.2137903600145787f
);
480 const __m128 CC4
= _mm_set1_ps(0.8411872321232948f
);
481 const __m128 CC3
= _mm_set1_ps(-0.08670413896296343f
);
482 const __m128 CC2
= _mm_set1_ps(-0.27124782687240334f
);
483 const __m128 CC1
= _mm_set1_ps(-0.0007502488047806069f
);
484 const __m128 CC0
= _mm_set1_ps(0.5642114853803148f
);
486 /* Coefficients for expansion of exp(x) in [0,0.1] */
487 /* CD0 and CD1 are both 1.0, so no need to declare them separately */
488 const __m128 CD2
= _mm_set1_ps(0.5000066608081202f
);
489 const __m128 CD3
= _mm_set1_ps(0.1664795422874624f
);
490 const __m128 CD4
= _mm_set1_ps(0.04379839977652482f
);
492 const __m128 sieve
= gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
493 const __m128 signbit
= gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
494 const __m128 one
= _mm_set1_ps(1.0f
);
495 const __m128 two
= _mm_set1_ps(2.0f
);
498 __m128 z
, q
, t
, t2
, w
, w2
;
499 __m128 pA0
, pA1
, pB0
, pB1
, pC0
, pC1
;
501 __m128 res_erf
, res_erfc
, res
;
504 /* Calculate erf() */
505 x2
= _mm_mul_ps(x
, x
);
506 x4
= _mm_mul_ps(x2
, x2
);
508 pA0
= _mm_macc_ps(CA6
, x4
, CA4
);
509 pA1
= _mm_macc_ps(CA5
, x4
, CA3
);
510 pA0
= _mm_macc_ps(pA0
, x4
, CA2
);
511 pA1
= _mm_macc_ps(pA1
, x4
, CA1
);
512 pA1
= _mm_mul_ps(pA1
, x2
);
513 pA0
= _mm_macc_ps(pA0
, x4
, pA1
);
514 /* Constant term must come last for precision reasons */
515 pA0
= _mm_add_ps(pA0
, CA0
);
517 res_erf
= _mm_mul_ps(x
, pA0
);
520 y
= gmx_mm_abs_ps(x
);
521 t
= gmx_mm_inv_ps(y
);
522 w
= _mm_sub_ps(t
, one
);
523 t2
= _mm_mul_ps(t
, t
);
524 w2
= _mm_mul_ps(w
, w
);
526 * We cannot simply calculate exp(-x2) directly in single precision, since
527 * that will lose a couple of bits of precision due to the multiplication.
528 * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
529 * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
531 * The only drawback with this is that it requires TWO separate exponential
532 * evaluations, which would be horrible performance-wise. However, the argument
533 * for the second exp() call is always small, so there we simply use a
534 * low-order minimax expansion on [0,0.1].
537 z
= _mm_and_ps(y
, sieve
);
538 q
= _mm_mul_ps( _mm_sub_ps(z
, y
), _mm_add_ps(z
, y
) );
540 corr
= _mm_macc_ps(CD4
, q
, CD3
);
541 corr
= _mm_macc_ps(corr
, q
, CD2
);
542 corr
= _mm_macc_ps(corr
, q
, one
);
543 corr
= _mm_macc_ps(corr
, q
, one
);
545 expmx2
= gmx_mm_exp_ps( _mm_or_ps( signbit
, _mm_mul_ps(z
, z
) ) );
546 expmx2
= _mm_mul_ps(expmx2
, corr
);
548 pB1
= _mm_macc_ps(CB9
, w2
, CB7
);
549 pB0
= _mm_macc_ps(CB8
, w2
, CB6
);
550 pB1
= _mm_macc_ps(pB1
, w2
, CB5
);
551 pB0
= _mm_macc_ps(pB0
, w2
, CB4
);
552 pB1
= _mm_macc_ps(pB1
, w2
, CB3
);
553 pB0
= _mm_macc_ps(pB0
, w2
, CB2
);
554 pB1
= _mm_macc_ps(pB1
, w2
, CB1
);
555 pB0
= _mm_macc_ps(pB0
, w2
, CB0
);
556 pB0
= _mm_macc_ps(pB1
, w
, pB0
);
558 pC0
= _mm_macc_ps(CC10
, t2
, CC8
);
559 pC1
= _mm_macc_ps(CC9
, t2
, CC7
);
560 pC0
= _mm_macc_ps(pC0
, t2
, CC6
);
561 pC1
= _mm_macc_ps(pC1
, t2
, CC5
);
562 pC0
= _mm_macc_ps(pC0
, t2
, CC4
);
563 pC1
= _mm_macc_ps(pC1
, t2
, CC3
);
564 pC0
= _mm_macc_ps(pC0
, t2
, CC2
);
565 pC1
= _mm_macc_ps(pC1
, t2
, CC1
);
567 pC0
= _mm_macc_ps(pC0
, t2
, CC0
);
568 pC0
= _mm_macc_ps(pC1
, t
, pC0
);
569 pC0
= _mm_mul_ps(pC0
, t
);
571 /* SELECT pB0 or pC0 for erfc() */
572 mask
= _mm_cmp_ps(two
, y
, _CMP_LT_OQ
);
573 res_erfc
= _mm_blendv_ps(pB0
, pC0
, mask
);
574 res_erfc
= _mm_mul_ps(res_erfc
, expmx2
);
576 /* erfc(x<0) = 2-erfc(|x|) */
577 mask
= _mm_cmp_ps(x
, _mm_setzero_ps(), _CMP_LT_OQ
);
578 res_erfc
= _mm_blendv_ps(res_erfc
, _mm_sub_ps(two
, res_erfc
), mask
);
580 /* Select erf() or erfc() */
581 mask
= _mm_cmp_ps(y
, _mm_set1_ps(0.75f
), _CMP_LT_OQ
);
582 res
= _mm_blendv_ps(res_erfc
, _mm_sub_ps(one
, res_erf
), mask
);
588 /* Calculate the force correction due to PME analytically.
590 * This routine is meant to enable analytical evaluation of the
591 * direct-space PME electrostatic force to avoid tables.
593 * The direct-space potential should be Erfc(beta*r)/r, but there
594 * are some problems evaluating that:
596 * First, the error function is difficult (read: expensive) to
597 * approxmiate accurately for intermediate to large arguments, and
598 * this happens already in ranges of beta*r that occur in simulations.
599 * Second, we now try to avoid calculating potentials in Gromacs but
600 * use forces directly.
602 * We can simply things slight by noting that the PME part is really
603 * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
605 * V= 1/r - Erf(beta*r)/r
607 * The first term we already have from the inverse square root, so
608 * that we can leave out of this routine.
610 * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
611 * the argument beta*r will be in the range 0.15 to ~4. Use your
612 * favorite plotting program to realize how well-behaved Erf(z)/z is
615 * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
616 * However, it turns out it is more efficient to approximate f(z)/z and
617 * then only use even powers. This is another minor optimization, since
618 * we actually WANT f(z)/z, because it is going to be multiplied by
619 * the vector between the two atoms to get the vectorial force. The
620 * fastest flops are the ones we can avoid calculating!
622 * So, here's how it should be used:
625 * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
626 * 3. Evaluate this routine with z^2 as the argument.
627 * 4. The return value is the expression:
631 * ------------ - --------
634 * 5. Multiply the entire expression by beta^3. This will get you
636 * beta^3*2*exp(-z^2) beta^3*erf(z)
637 * ------------------ - ---------------
640 * or, switching back to r (z=r*beta):
642 * 2*beta*exp(-r^2*beta^2) erf(r*beta)
643 * ----------------------- - -----------
647 * With a bit of math exercise you should be able to confirm that
648 * this is exactly D[Erf[beta*r]/r,r] divided by r another time.
650 * 6. Add the result to 1/r^3, multiply by the product of the charges,
651 * and you have your force (divided by r). A final multiplication
652 * with the vector connecting the two particles and you have your
653 * vectorial force to add to the particles.
657 gmx_mm_pmecorrF_ps(__m128 z2
)
659 const __m128 FN6
= _mm_set1_ps(-1.7357322914161492954e-8f
);
660 const __m128 FN5
= _mm_set1_ps(1.4703624142580877519e-6f
);
661 const __m128 FN4
= _mm_set1_ps(-0.000053401640219807709149f
);
662 const __m128 FN3
= _mm_set1_ps(0.0010054721316683106153f
);
663 const __m128 FN2
= _mm_set1_ps(-0.019278317264888380590f
);
664 const __m128 FN1
= _mm_set1_ps(0.069670166153766424023f
);
665 const __m128 FN0
= _mm_set1_ps(-0.75225204789749321333f
);
667 const __m128 FD4
= _mm_set1_ps(0.0011193462567257629232f
);
668 const __m128 FD3
= _mm_set1_ps(0.014866955030185295499f
);
669 const __m128 FD2
= _mm_set1_ps(0.11583842382862377919f
);
670 const __m128 FD1
= _mm_set1_ps(0.50736591960530292870f
);
671 const __m128 FD0
= _mm_set1_ps(1.0f
);
674 __m128 polyFN0
, polyFN1
, polyFD0
, polyFD1
;
676 z4
= _mm_mul_ps(z2
, z2
);
678 polyFD0
= _mm_macc_ps(FD4
, z4
, FD2
);
679 polyFD1
= _mm_macc_ps(FD3
, z4
, FD1
);
680 polyFD0
= _mm_macc_ps(polyFD0
, z4
, FD0
);
681 polyFD0
= _mm_macc_ps(polyFD1
, z2
, polyFD0
);
683 polyFD0
= gmx_mm_inv_ps(polyFD0
);
685 polyFN0
= _mm_macc_ps(FN6
, z4
, FN4
);
686 polyFN1
= _mm_macc_ps(FN5
, z4
, FN3
);
687 polyFN0
= _mm_macc_ps(polyFN0
, z4
, FN2
);
688 polyFN1
= _mm_macc_ps(polyFN1
, z4
, FN1
);
689 polyFN0
= _mm_macc_ps(polyFN0
, z4
, FN0
);
690 polyFN0
= _mm_macc_ps(polyFN1
, z2
, polyFN0
);
692 return _mm_mul_ps(polyFN0
, polyFD0
);
698 /* Calculate the potential correction due to PME analytically.
700 * See gmx_mm256_pmecorrF_ps() for details about the approximation.
702 * This routine calculates Erf(z)/z, although you should provide z^2
703 * as the input argument.
705 * Here's how it should be used:
708 * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
709 * 3. Evaluate this routine with z^2 as the argument.
710 * 4. The return value is the expression:
717 * 5. Multiply the entire expression by beta and switching back to r (z=r*beta):
723 * 6. Add the result to 1/r, multiply by the product of the charges,
724 * and you have your potential.
727 gmx_mm_pmecorrV_ps(__m128 z2
)
729 const __m128 VN6
= _mm_set1_ps(1.9296833005951166339e-8f
);
730 const __m128 VN5
= _mm_set1_ps(-1.4213390571557850962e-6f
);
731 const __m128 VN4
= _mm_set1_ps(0.000041603292906656984871f
);
732 const __m128 VN3
= _mm_set1_ps(-0.00013134036773265025626f
);
733 const __m128 VN2
= _mm_set1_ps(0.038657983986041781264f
);
734 const __m128 VN1
= _mm_set1_ps(0.11285044772717598220f
);
735 const __m128 VN0
= _mm_set1_ps(1.1283802385263030286f
);
737 const __m128 VD3
= _mm_set1_ps(0.0066752224023576045451f
);
738 const __m128 VD2
= _mm_set1_ps(0.078647795836373922256f
);
739 const __m128 VD1
= _mm_set1_ps(0.43336185284710920150f
);
740 const __m128 VD0
= _mm_set1_ps(1.0f
);
743 __m128 polyVN0
, polyVN1
, polyVD0
, polyVD1
;
745 z4
= _mm_mul_ps(z2
, z2
);
747 polyVD1
= _mm_macc_ps(VD3
, z4
, VD1
);
748 polyVD0
= _mm_macc_ps(VD2
, z4
, VD0
);
749 polyVD0
= _mm_macc_ps(polyVD1
, z2
, polyVD0
);
751 polyVD0
= gmx_mm_inv_ps(polyVD0
);
753 polyVN0
= _mm_macc_ps(VN6
, z4
, VN4
);
754 polyVN1
= _mm_macc_ps(VN5
, z4
, VN3
);
755 polyVN0
= _mm_macc_ps(polyVN0
, z4
, VN2
);
756 polyVN1
= _mm_macc_ps(polyVN1
, z4
, VN1
);
757 polyVN0
= _mm_macc_ps(polyVN0
, z4
, VN0
);
758 polyVN0
= _mm_macc_ps(polyVN1
, z2
, polyVN0
);
760 return _mm_mul_ps(polyVN0
, polyVD0
);
766 gmx_mm_sincos_ps(__m128 x
,
770 const __m128 two_over_pi
= _mm_set1_ps(2.0/M_PI
);
771 const __m128 half
= _mm_set1_ps(0.5);
772 const __m128 one
= _mm_set1_ps(1.0);
774 const __m128i izero
= _mm_set1_epi32(0);
775 const __m128i ione
= _mm_set1_epi32(1);
776 const __m128i itwo
= _mm_set1_epi32(2);
777 const __m128i ithree
= _mm_set1_epi32(3);
778 const __m128 signbit
= gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
780 const __m128 CA1
= _mm_set1_ps(1.5703125f
);
781 const __m128 CA2
= _mm_set1_ps(4.837512969970703125e-4f
);
782 const __m128 CA3
= _mm_set1_ps(7.54978995489188216e-8f
);
784 const __m128 CC0
= _mm_set1_ps(-0.0013602249f
);
785 const __m128 CC1
= _mm_set1_ps(0.0416566950f
);
786 const __m128 CC2
= _mm_set1_ps(-0.4999990225f
);
787 const __m128 CS0
= _mm_set1_ps(-0.0001950727f
);
788 const __m128 CS1
= _mm_set1_ps(0.0083320758f
);
789 const __m128 CS2
= _mm_set1_ps(-0.1666665247f
);
794 __m128i offset_sin
, offset_cos
;
796 __m128 mask_sin
, mask_cos
;
797 __m128 tmp_sin
, tmp_cos
;
799 y
= _mm_mul_ps(x
, two_over_pi
);
800 y
= _mm_add_ps(y
, _mm_or_ps(_mm_and_ps(y
, signbit
), half
));
802 iz
= _mm_cvttps_epi32(y
);
803 z
= _mm_round_ps(y
, _MM_FROUND_TO_ZERO
);
805 offset_sin
= _mm_and_si128(iz
, ithree
);
806 offset_cos
= _mm_add_epi32(iz
, ione
);
808 /* Extended precision arithmethic to achieve full precision */
809 y
= _mm_nmacc_ps(z
, CA1
, x
);
810 y
= _mm_nmacc_ps(z
, CA2
, y
);
811 y
= _mm_nmacc_ps(z
, CA3
, y
);
813 y2
= _mm_mul_ps(y
, y
);
815 tmp1
= _mm_macc_ps(CC0
, y2
, CC1
);
816 tmp2
= _mm_macc_ps(CS0
, y2
, CS1
);
817 tmp1
= _mm_macc_ps(tmp1
, y2
, CC2
);
818 tmp2
= _mm_macc_ps(tmp2
, y2
, CS2
);
820 tmp1
= _mm_macc_ps(tmp1
, y2
, one
);
822 tmp2
= _mm_macc_ps(tmp2
, _mm_mul_ps(y
, y2
), y
);
824 mask_sin
= gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin
, ione
), izero
));
825 mask_cos
= gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos
, ione
), izero
));
827 tmp_sin
= _mm_blendv_ps(tmp1
, tmp2
, mask_sin
);
828 tmp_cos
= _mm_blendv_ps(tmp1
, tmp2
, mask_cos
);
830 mask_sin
= gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin
, itwo
), izero
));
831 mask_cos
= gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos
, itwo
), izero
));
833 tmp1
= _mm_xor_ps(signbit
, tmp_sin
);
834 tmp2
= _mm_xor_ps(signbit
, tmp_cos
);
836 *sinval
= _mm_blendv_ps(tmp1
, tmp_sin
, mask_sin
);
837 *cosval
= _mm_blendv_ps(tmp2
, tmp_cos
, mask_cos
);
843 * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
844 * will then call the sincos() routine and waste a factor 2 in performance!
847 gmx_mm_sin_ps(__m128 x
)
850 gmx_mm_sincos_ps(x
, &s
, &c
);
855 * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
856 * will then call the sincos() routine and waste a factor 2 in performance!
859 gmx_mm_cos_ps(__m128 x
)
862 gmx_mm_sincos_ps(x
, &s
, &c
);
868 gmx_mm_tan_ps(__m128 x
)
870 __m128 sinval
, cosval
;
873 gmx_mm_sincos_ps(x
, &sinval
, &cosval
);
875 tanval
= _mm_mul_ps(sinval
, gmx_mm_inv_ps(cosval
));
882 gmx_mm_asin_ps(__m128 x
)
884 /* Same algorithm as cephes library */
885 const __m128 signmask
= gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
886 const __m128 limitlow
= _mm_set1_ps(1e-4f
);
887 const __m128 half
= _mm_set1_ps(0.5f
);
888 const __m128 one
= _mm_set1_ps(1.0f
);
889 const __m128 halfpi
= _mm_set1_ps(M_PI
/2.0f
);
891 const __m128 CC5
= _mm_set1_ps(4.2163199048E-2f
);
892 const __m128 CC4
= _mm_set1_ps(2.4181311049E-2f
);
893 const __m128 CC3
= _mm_set1_ps(4.5470025998E-2f
);
894 const __m128 CC2
= _mm_set1_ps(7.4953002686E-2f
);
895 const __m128 CC1
= _mm_set1_ps(1.6666752422E-1f
);
900 __m128 z
, z1
, z2
, q
, q1
, q2
;
903 sign
= _mm_andnot_ps(signmask
, x
);
904 xabs
= _mm_and_ps(x
, signmask
);
906 mask
= _mm_cmp_ps(xabs
, half
, _CMP_GT_OQ
);
908 z1
= _mm_mul_ps(half
, _mm_sub_ps(one
, xabs
));
909 q1
= _mm_mul_ps(z1
, gmx_mm_invsqrt_ps(z1
));
910 q1
= _mm_andnot_ps(_mm_cmp_ps(xabs
, one
, _CMP_EQ_OQ
), q1
);
913 z2
= _mm_mul_ps(q2
, q2
);
915 z
= _mm_or_ps( _mm_and_ps(mask
, z1
), _mm_andnot_ps(mask
, z2
) );
916 q
= _mm_or_ps( _mm_and_ps(mask
, q1
), _mm_andnot_ps(mask
, q2
) );
918 z2
= _mm_mul_ps(z
, z
);
920 pA
= _mm_macc_ps(CC5
, z2
, CC3
);
921 pB
= _mm_macc_ps(CC4
, z2
, CC2
);
923 pA
= _mm_macc_ps(pA
, z2
, CC1
);
924 pA
= _mm_mul_ps(pA
, z
);
926 z
= _mm_macc_ps(pB
, z2
, pA
);
928 z
= _mm_macc_ps(z
, q
, q
);
930 q2
= _mm_sub_ps(halfpi
, z
);
931 q2
= _mm_sub_ps(q2
, z
);
933 z
= _mm_or_ps( _mm_and_ps(mask
, q2
), _mm_andnot_ps(mask
, z
) );
935 mask
= _mm_cmp_ps(xabs
, limitlow
, _CMP_GT_OQ
);
936 z
= _mm_or_ps( _mm_and_ps(mask
, z
), _mm_andnot_ps(mask
, xabs
) );
938 z
= _mm_xor_ps(z
, sign
);
945 gmx_mm_acos_ps(__m128 x
)
947 const __m128 signmask
= gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
948 const __m128 one_ps
= _mm_set1_ps(1.0f
);
949 const __m128 half_ps
= _mm_set1_ps(0.5f
);
950 const __m128 pi_ps
= _mm_set1_ps(M_PI
);
951 const __m128 halfpi_ps
= _mm_set1_ps(M_PI
/2.0f
);
956 __m128 z
, z1
, z2
, z3
;
958 xabs
= _mm_and_ps(x
, signmask
);
959 mask1
= _mm_cmp_ps(xabs
, half_ps
, _CMP_GT_OQ
);
960 mask2
= _mm_cmp_ps(x
, _mm_setzero_ps(), _CMP_GT_OQ
);
962 z
= _mm_mul_ps(half_ps
, _mm_sub_ps(one_ps
, xabs
));
963 z
= _mm_mul_ps(z
, gmx_mm_invsqrt_ps(z
));
964 z
= _mm_andnot_ps(_mm_cmp_ps(xabs
, one_ps
, _CMP_EQ_OQ
), z
);
966 z
= _mm_blendv_ps(x
, z
, mask1
);
967 z
= gmx_mm_asin_ps(z
);
969 z2
= _mm_add_ps(z
, z
);
970 z1
= _mm_sub_ps(pi_ps
, z2
);
971 z3
= _mm_sub_ps(halfpi_ps
, z
);
973 z
= _mm_blendv_ps(z1
, z2
, mask2
);
974 z
= _mm_blendv_ps(z3
, z
, mask1
);
981 gmx_mm_atan_ps(__m128 x
)
983 /* Same algorithm as cephes library */
984 const __m128 signmask
= gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
985 const __m128 limit1
= _mm_set1_ps(0.414213562373095f
);
986 const __m128 limit2
= _mm_set1_ps(2.414213562373095f
);
987 const __m128 quarterpi
= _mm_set1_ps(0.785398163397448f
);
988 const __m128 halfpi
= _mm_set1_ps(1.570796326794896f
);
989 const __m128 mone
= _mm_set1_ps(-1.0f
);
990 const __m128 CC3
= _mm_set1_ps(-3.33329491539E-1f
);
991 const __m128 CC5
= _mm_set1_ps(1.99777106478E-1f
);
992 const __m128 CC7
= _mm_set1_ps(-1.38776856032E-1);
993 const __m128 CC9
= _mm_set1_ps(8.05374449538e-2f
);
1001 sign
= _mm_andnot_ps(signmask
, x
);
1002 x
= _mm_and_ps(x
, signmask
);
1004 mask1
= _mm_cmp_ps(x
, limit1
, _CMP_GT_OQ
);
1005 mask2
= _mm_cmp_ps(x
, limit2
, _CMP_GT_OQ
);
1007 z1
= _mm_mul_ps(_mm_add_ps(x
, mone
), gmx_mm_inv_ps(_mm_sub_ps(x
, mone
)));
1008 z2
= _mm_mul_ps(mone
, gmx_mm_inv_ps(x
));
1010 y
= _mm_and_ps(mask1
, quarterpi
);
1011 y
= _mm_blendv_ps(y
, halfpi
, mask2
);
1013 x
= _mm_blendv_ps(x
, z1
, mask1
);
1014 x
= _mm_blendv_ps(x
, z2
, mask2
);
1016 x2
= _mm_mul_ps(x
, x
);
1017 x4
= _mm_mul_ps(x2
, x2
);
1019 sum1
= _mm_macc_ps(CC9
, x4
, CC5
);
1020 sum2
= _mm_macc_ps(CC7
, x4
, CC3
);
1021 sum1
= _mm_mul_ps(sum1
, x4
);
1022 sum1
= _mm_macc_ps(sum2
, x2
, sum1
);
1024 sum1
= _mm_sub_ps(sum1
, mone
);
1025 y
= _mm_macc_ps(sum1
, x
, y
);
1027 y
= _mm_xor_ps(y
, sign
);
1034 gmx_mm_atan2_ps(__m128 y
, __m128 x
)
1036 const __m128 pi
= _mm_set1_ps(M_PI
);
1037 const __m128 minuspi
= _mm_set1_ps(-M_PI
);
1038 const __m128 halfpi
= _mm_set1_ps(M_PI
/2.0);
1039 const __m128 minushalfpi
= _mm_set1_ps(-M_PI
/2.0);
1041 __m128 z
, z1
, z3
, z4
;
1043 __m128 maskx_lt
, maskx_eq
;
1044 __m128 masky_lt
, masky_eq
;
1045 __m128 mask1
, mask2
, mask3
, mask4
, maskall
;
1047 maskx_lt
= _mm_cmp_ps(x
, _mm_setzero_ps(), _CMP_LT_OQ
);
1048 masky_lt
= _mm_cmp_ps(y
, _mm_setzero_ps(), _CMP_LT_OQ
);
1049 maskx_eq
= _mm_cmp_ps(x
, _mm_setzero_ps(), _CMP_EQ_OQ
);
1050 masky_eq
= _mm_cmp_ps(y
, _mm_setzero_ps(), _CMP_EQ_OQ
);
1052 z
= _mm_mul_ps(y
, gmx_mm_inv_ps(x
));
1053 z
= gmx_mm_atan_ps(z
);
1055 mask1
= _mm_and_ps(maskx_eq
, masky_lt
);
1056 mask2
= _mm_andnot_ps(maskx_lt
, masky_eq
);
1057 mask3
= _mm_andnot_ps( _mm_or_ps(masky_lt
, masky_eq
), maskx_eq
);
1058 mask4
= _mm_and_ps(masky_eq
, maskx_lt
);
1060 maskall
= _mm_or_ps( _mm_or_ps(mask1
, mask2
), _mm_or_ps(mask3
, mask4
) );
1062 z
= _mm_andnot_ps(maskall
, z
);
1063 z1
= _mm_and_ps(mask1
, minushalfpi
);
1064 z3
= _mm_and_ps(mask3
, halfpi
);
1065 z4
= _mm_and_ps(mask4
, pi
);
1067 z
= _mm_or_ps( _mm_or_ps(z
, z1
), _mm_or_ps(z3
, z4
) );
1069 mask1
= _mm_andnot_ps(masky_lt
, maskx_lt
);
1070 mask2
= _mm_and_ps(maskx_lt
, masky_lt
);
1072 w
= _mm_or_ps( _mm_and_ps(mask1
, pi
), _mm_and_ps(mask2
, minuspi
) );
1073 w
= _mm_andnot_ps(maskall
, w
);
1075 z
= _mm_add_ps(z
, w
);
1082 #endif /* _gmx_math_x86_avx_128_fma_single_h_ */