1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
4 * This file is part of GROMACS.
7 * Written by the Gromacs development team under coordination of
8 * David van der Spoel, Berk Hess, and Erik Lindahl.
10 * This library is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public License
12 * as published by the Free Software Foundation; either version 2
13 * of the License, or (at your option) any later version.
15 * To help us fund GROMACS development, we humbly ask that you cite
16 * the research papers on the package. Check out http://www.gromacs.org
19 * Gnomes, ROck Monsters And Chili Sauce
21 #ifndef _gmx_math_x86_avx_128_fma_single_h_
22 #define _gmx_math_x86_avx_128_fma_single_h_
24 #include <immintrin.h> /* AVX */
25 #ifdef HAVE_X86INTRIN_H
26 #include <x86intrin.h> /* FMA */
31 #include "gmx_x86_avx_128_fma.h"
35 # define M_PI 3.14159265358979323846264338327950288
41 /************************
43 * Simple math routines *
45 ************************/
48 static gmx_inline __m128
49 gmx_mm_invsqrt_ps(__m128 x
)
51 const __m128 half
= _mm_set_ps(0.5,0.5,0.5,0.5);
52 const __m128 three
= _mm_set_ps(3.0,3.0,3.0,3.0);
54 __m128 lu
= _mm_rsqrt_ps(x
);
56 return _mm_mul_ps(_mm_mul_ps(half
,lu
),_mm_nmacc_ps(_mm_mul_ps(lu
,lu
),x
,three
));
59 /* sqrt(x) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
60 static gmx_inline __m128
61 gmx_mm_sqrt_ps(__m128 x
)
66 mask
= _mm_cmp_ps(x
,_mm_setzero_ps(),_CMP_EQ_OQ
);
67 res
= _mm_andnot_ps(mask
,gmx_mm_invsqrt_ps(x
));
69 res
= _mm_mul_ps(x
,res
);
75 static gmx_inline __m128
76 gmx_mm_inv_ps(__m128 x
)
78 const __m128 two
= _mm_set_ps(2.0f
,2.0f
,2.0f
,2.0f
);
80 __m128 lu
= _mm_rcp_ps(x
);
82 return _mm_mul_ps(lu
,_mm_nmacc_ps(lu
,x
,two
));
85 static gmx_inline __m128
86 gmx_mm_abs_ps(__m128 x
)
88 const __m128 signmask
= gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
90 return _mm_and_ps(x
,signmask
);
94 gmx_mm_log_ps(__m128 x
)
96 /* Same algorithm as cephes library */
97 const __m128 expmask
= gmx_mm_castsi128_ps( _mm_set_epi32(0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000) );
98 const __m128i expbase_m1
= _mm_set1_epi32(127-1); /* We want non-IEEE format */
99 const __m128 half
= _mm_set1_ps(0.5f
);
100 const __m128 one
= _mm_set1_ps(1.0f
);
101 const __m128 invsq2
= _mm_set1_ps(1.0f
/sqrt(2.0f
));
102 const __m128 corr1
= _mm_set1_ps(-2.12194440e-4f
);
103 const __m128 corr2
= _mm_set1_ps(0.693359375f
);
105 const __m128 CA_1
= _mm_set1_ps(0.070376836292f
);
106 const __m128 CB_0
= _mm_set1_ps(1.6714950086782716f
);
107 const __m128 CB_1
= _mm_set1_ps(-2.452088066061482f
);
108 const __m128 CC_0
= _mm_set1_ps(1.5220770854701728f
);
109 const __m128 CC_1
= _mm_set1_ps(-1.3422238433233642f
);
110 const __m128 CD_0
= _mm_set1_ps(1.386218787509749f
);
111 const __m128 CD_1
= _mm_set1_ps(0.35075468953796346f
);
112 const __m128 CE_0
= _mm_set1_ps(1.3429983063133937f
);
113 const __m128 CE_1
= _mm_set1_ps(1.807420826584643f
);
120 __m128 pA
,pB
,pC
,pD
,pE
,tB
,tC
,tD
,tE
;
122 /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
123 fexp
= _mm_and_ps(x
,expmask
);
124 iexp
= gmx_mm_castps_si128(fexp
);
125 iexp
= _mm_srli_epi32(iexp
,23);
126 iexp
= _mm_sub_epi32(iexp
,expbase_m1
);
128 x
= _mm_andnot_ps(expmask
,x
);
129 x
= _mm_or_ps(x
,one
);
130 x
= _mm_mul_ps(x
,half
);
132 mask
= _mm_cmp_ps(x
,invsq2
,_CMP_LT_OQ
);
134 x
= _mm_add_ps(x
,_mm_and_ps(mask
,x
));
135 x
= _mm_sub_ps(x
,one
);
136 iexp
= _mm_add_epi32(iexp
,gmx_mm_castps_si128(mask
)); /* 0xFFFFFFFF = -1 as int */
138 x2
= _mm_mul_ps(x
,x
);
140 pA
= _mm_mul_ps(CA_1
,x
);
142 pB
= _mm_add_ps(x
,CB_1
);
143 pC
= _mm_add_ps(x
,CC_1
);
144 pD
= _mm_add_ps(x
,CD_1
);
145 pE
= _mm_add_ps(x
,CE_1
);
147 pB
= _mm_macc_ps(x
,pB
,CB_0
);
148 pC
= _mm_macc_ps(x
,pC
,CC_0
);
149 pD
= _mm_macc_ps(x
,pD
,CD_0
);
150 pE
= _mm_macc_ps(x
,pE
,CE_0
);
152 pA
= _mm_mul_ps(pA
,pB
);
153 pC
= _mm_mul_ps(pC
,pD
);
154 pE
= _mm_mul_ps(pE
,x2
);
155 pA
= _mm_mul_ps(pA
,pC
);
156 y
= _mm_mul_ps(pA
,pE
);
158 fexp
= _mm_cvtepi32_ps(iexp
);
159 y
= _mm_macc_ps(fexp
,corr1
,y
);
160 y
= _mm_nmacc_ps(half
,x2
,y
);
162 x2
= _mm_add_ps(x
,y
);
163 x2
= _mm_macc_ps(fexp
,corr2
,x2
);
172 * The 2^w term is calculated from a (6,0)-th order (no denominator) Minimax polynomia on the interval
173 * [-0.5,0.5]. The coefficiencts of this was derived in Mathematica using the command:
175 * MiniMaxApproximation[(2^x), {x, {-0.5, 0.5}, 6, 0}, WorkingPrecision -> 15]
177 * The largest-magnitude exponent we can represent in IEEE single-precision binary format
178 * is 2^-126 for small numbers and 2^127 for large ones. To avoid wrap-around problems, we set the
179 * result to zero if the argument falls outside this range. For small numbers this is just fine, but
180 * for large numbers you could be fancy and return the smallest/largest IEEE single-precision
181 * number instead. That would take a few extra cycles and not really help, since something is
182 * wrong if you are using single precision to work with numbers that cannot really be represented
183 * in single precision.
185 * The accuracy is at least 23 bits.
188 gmx_mm_exp2_ps(__m128 x
)
190 /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
191 const __m128 arglimit
= _mm_set1_ps(126.0f
);
193 const __m128i expbase
= _mm_set1_epi32(127);
194 const __m128 CA6
= _mm_set1_ps(1.535336188319500E-004);
195 const __m128 CA5
= _mm_set1_ps(1.339887440266574E-003);
196 const __m128 CA4
= _mm_set1_ps(9.618437357674640E-003);
197 const __m128 CA3
= _mm_set1_ps(5.550332471162809E-002);
198 const __m128 CA2
= _mm_set1_ps(2.402264791363012E-001);
199 const __m128 CA1
= _mm_set1_ps(6.931472028550421E-001);
200 const __m128 CA0
= _mm_set1_ps(1.0f
);
209 iexppart
= _mm_cvtps_epi32(x
);
210 intpart
= _mm_round_ps(x
,_MM_FROUND_TO_NEAREST_INT
);
211 iexppart
= _mm_slli_epi32(_mm_add_epi32(iexppart
,expbase
),23);
212 valuemask
= _mm_cmp_ps(arglimit
,gmx_mm_abs_ps(x
),_CMP_GE_OQ
);
213 fexppart
= _mm_and_ps(valuemask
,gmx_mm_castsi128_ps(iexppart
));
215 x
= _mm_sub_ps(x
,intpart
);
216 x2
= _mm_mul_ps(x
,x
);
218 p0
= _mm_macc_ps(CA6
,x2
,CA4
);
219 p1
= _mm_macc_ps(CA5
,x2
,CA3
);
220 p0
= _mm_macc_ps(p0
,x2
,CA2
);
221 p1
= _mm_macc_ps(p1
,x2
,CA1
);
222 p0
= _mm_macc_ps(p0
,x2
,CA0
);
223 p0
= _mm_macc_ps(p1
,x
,p0
);
224 x
= _mm_mul_ps(p0
,fexppart
);
230 /* Exponential function. This could be calculated from 2^x as Exp(x)=2^(y), where y=log2(e)*x,
231 * but there will then be a small rounding error since we lose some precision due to the
232 * multiplication. This will then be magnified a lot by the exponential.
234 * Instead, we calculate the fractional part directly as a minimax approximation of
235 * Exp(z) on [-0.5,0.5]. We use extended precision arithmetics to calculate the fraction
236 * remaining after 2^y, which avoids the precision-loss.
237 * The final result is correct to within 1 LSB over the entire argument range.
240 gmx_mm_exp_ps(__m128 x
)
242 const __m128 argscale
= _mm_set1_ps(1.44269504088896341f
);
243 /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
244 const __m128 arglimit
= _mm_set1_ps(126.0f
);
245 const __m128i expbase
= _mm_set1_epi32(127);
247 const __m128 invargscale0
= _mm_set1_ps(0.693359375f
);
248 const __m128 invargscale1
= _mm_set1_ps(-2.12194440e-4f
);
250 const __m128 CC5
= _mm_set1_ps(1.9875691500e-4f
);
251 const __m128 CC4
= _mm_set1_ps(1.3981999507e-3f
);
252 const __m128 CC3
= _mm_set1_ps(8.3334519073e-3f
);
253 const __m128 CC2
= _mm_set1_ps(4.1665795894e-2f
);
254 const __m128 CC1
= _mm_set1_ps(1.6666665459e-1f
);
255 const __m128 CC0
= _mm_set1_ps(5.0000001201e-1f
);
256 const __m128 one
= _mm_set1_ps(1.0f
);
265 y
= _mm_mul_ps(x
,argscale
);
267 iexppart
= _mm_cvtps_epi32(y
);
268 intpart
= _mm_round_ps(y
,_MM_FROUND_TO_NEAREST_INT
);
270 iexppart
= _mm_slli_epi32(_mm_add_epi32(iexppart
,expbase
),23);
271 valuemask
= _mm_cmp_ps(arglimit
,gmx_mm_abs_ps(y
),_CMP_GE_OQ
);
272 fexppart
= _mm_and_ps(valuemask
,gmx_mm_castsi128_ps(iexppart
));
274 /* Extended precision arithmetics */
275 x
= _mm_nmacc_ps(invargscale0
,intpart
,x
);
276 x
= _mm_nmacc_ps(invargscale1
,intpart
,x
);
278 x2
= _mm_mul_ps(x
,x
);
280 p1
= _mm_macc_ps(CC5
,x2
,CC3
);
281 p0
= _mm_macc_ps(CC4
,x2
,CC2
);
282 p1
= _mm_macc_ps(p1
,x2
,CC1
);
283 p0
= _mm_macc_ps(p0
,x2
,CC0
);
284 p0
= _mm_macc_ps(p1
,x
,p0
);
285 p0
= _mm_macc_ps(p0
,x2
,one
);
287 x
= _mm_add_ps(x
,p0
);
289 x
= _mm_mul_ps(x
,fexppart
);
294 /* FULL precision. Only errors in LSB */
296 gmx_mm_erf_ps(__m128 x
)
298 /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
299 const __m128 CA6
= _mm_set1_ps(7.853861353153693e-5f
);
300 const __m128 CA5
= _mm_set1_ps(-8.010193625184903e-4f
);
301 const __m128 CA4
= _mm_set1_ps(5.188327685732524e-3f
);
302 const __m128 CA3
= _mm_set1_ps(-2.685381193529856e-2f
);
303 const __m128 CA2
= _mm_set1_ps(1.128358514861418e-1f
);
304 const __m128 CA1
= _mm_set1_ps(-3.761262582423300e-1f
);
305 const __m128 CA0
= _mm_set1_ps(1.128379165726710f
);
306 /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
307 const __m128 CB9
= _mm_set1_ps(-0.0018629930017603923f
);
308 const __m128 CB8
= _mm_set1_ps(0.003909821287598495f
);
309 const __m128 CB7
= _mm_set1_ps(-0.0052094582210355615f
);
310 const __m128 CB6
= _mm_set1_ps(0.005685614362160572f
);
311 const __m128 CB5
= _mm_set1_ps(-0.0025367682853477272f
);
312 const __m128 CB4
= _mm_set1_ps(-0.010199799682318782f
);
313 const __m128 CB3
= _mm_set1_ps(0.04369575504816542f
);
314 const __m128 CB2
= _mm_set1_ps(-0.11884063474674492f
);
315 const __m128 CB1
= _mm_set1_ps(0.2732120154030589f
);
316 const __m128 CB0
= _mm_set1_ps(0.42758357702025784f
);
317 /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
318 const __m128 CC10
= _mm_set1_ps(-0.0445555913112064f
);
319 const __m128 CC9
= _mm_set1_ps(0.21376355144663348f
);
320 const __m128 CC8
= _mm_set1_ps(-0.3473187200259257f
);
321 const __m128 CC7
= _mm_set1_ps(0.016690861551248114f
);
322 const __m128 CC6
= _mm_set1_ps(0.7560973182491192f
);
323 const __m128 CC5
= _mm_set1_ps(-1.2137903600145787f
);
324 const __m128 CC4
= _mm_set1_ps(0.8411872321232948f
);
325 const __m128 CC3
= _mm_set1_ps(-0.08670413896296343f
);
326 const __m128 CC2
= _mm_set1_ps(-0.27124782687240334f
);
327 const __m128 CC1
= _mm_set1_ps(-0.0007502488047806069f
);
328 const __m128 CC0
= _mm_set1_ps(0.5642114853803148f
);
330 /* Coefficients for expansion of exp(x) in [0,0.1] */
331 /* CD0 and CD1 are both 1.0, so no need to declare them separately */
332 const __m128 CD2
= _mm_set1_ps(0.5000066608081202f
);
333 const __m128 CD3
= _mm_set1_ps(0.1664795422874624f
);
334 const __m128 CD4
= _mm_set1_ps(0.04379839977652482f
);
336 const __m128 sieve
= gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
337 const __m128 signbit
= gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
338 const __m128 one
= _mm_set1_ps(1.0f
);
339 const __m128 two
= _mm_set1_ps(2.0f
);
342 __m128 z
,q
,t
,t2
,w
,w2
;
343 __m128 pA0
,pA1
,pB0
,pB1
,pC0
,pC1
;
345 __m128 res_erf
,res_erfc
,res
;
348 /* Calculate erf() */
349 x2
= _mm_mul_ps(x
,x
);
350 x4
= _mm_mul_ps(x2
,x2
);
352 pA0
= _mm_macc_ps(CA6
,x4
,CA4
);
353 pA1
= _mm_macc_ps(CA5
,x4
,CA3
);
354 pA0
= _mm_macc_ps(pA0
,x4
,CA2
);
355 pA1
= _mm_macc_ps(pA1
,x4
,CA1
);
356 pA0
= _mm_mul_ps(pA0
,x4
);
357 pA0
= _mm_macc_ps(pA1
,x2
,pA0
);
358 /* Constant term must come last for precision reasons */
359 pA0
= _mm_add_ps(pA0
,CA0
);
361 res_erf
= _mm_mul_ps(x
,pA0
);
365 y
= gmx_mm_abs_ps(x
);
366 t
= gmx_mm_inv_ps(y
);
367 w
= _mm_sub_ps(t
,one
);
368 t2
= _mm_mul_ps(t
,t
);
369 w2
= _mm_mul_ps(w
,w
);
371 * We cannot simply calculate exp(-x2) directly in single precision, since
372 * that will lose a couple of bits of precision due to the multiplication.
373 * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
374 * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
376 * The only drawback with this is that it requires TWO separate exponential
377 * evaluations, which would be horrible performance-wise. However, the argument
378 * for the second exp() call is always small, so there we simply use a
379 * low-order minimax expansion on [0,0.1].
382 z
= _mm_and_ps(y
,sieve
);
383 q
= _mm_mul_ps( _mm_sub_ps(z
,y
) , _mm_add_ps(z
,y
) );
385 corr
= _mm_macc_ps(CD4
,q
,CD3
);
386 corr
= _mm_macc_ps(corr
,q
,CD2
);
387 corr
= _mm_macc_ps(corr
,q
,one
);
388 corr
= _mm_macc_ps(corr
,q
,one
);
390 expmx2
= gmx_mm_exp_ps( _mm_or_ps( signbit
, _mm_mul_ps(z
,z
) ) );
391 expmx2
= _mm_mul_ps(expmx2
,corr
);
393 pB1
= _mm_macc_ps(CB9
,w2
,CB7
);
394 pB0
= _mm_macc_ps(CB8
,w2
,CB6
);
395 pB1
= _mm_macc_ps(pB1
,w2
,CB5
);
396 pB0
= _mm_macc_ps(pB0
,w2
,CB4
);
397 pB1
= _mm_macc_ps(pB1
,w2
,CB3
);
398 pB0
= _mm_macc_ps(pB0
,w2
,CB2
);
399 pB1
= _mm_macc_ps(pB1
,w2
,CB1
);
400 pB0
= _mm_macc_ps(pB0
,w2
,CB0
);
401 pB0
= _mm_macc_ps(pB1
,w
,pB0
);
403 pC0
= _mm_macc_ps(CC10
,t2
,CC8
);
404 pC1
= _mm_macc_ps(CC9
,t2
,CC7
);
405 pC0
= _mm_macc_ps(pC0
,t2
,CC6
);
406 pC1
= _mm_macc_ps(pC1
,t2
,CC5
);
407 pC0
= _mm_macc_ps(pC0
,t2
,CC4
);
408 pC1
= _mm_macc_ps(pC1
,t2
,CC3
);
409 pC0
= _mm_macc_ps(pC0
,t2
,CC2
);
410 pC1
= _mm_macc_ps(pC1
,t2
,CC1
);
412 pC0
= _mm_macc_ps(pC0
,t2
,CC0
);
413 pC0
= _mm_macc_ps(pC1
,t
,pC0
);
414 pC0
= _mm_mul_ps(pC0
,t
);
416 /* SELECT pB0 or pC0 for erfc() */
417 mask
= _mm_cmp_ps(two
,y
,_CMP_LT_OQ
);
418 res_erfc
= _mm_blendv_ps(pB0
,pC0
,mask
);
419 res_erfc
= _mm_mul_ps(res_erfc
,expmx2
);
421 /* erfc(x<0) = 2-erfc(|x|) */
422 mask
= _mm_cmp_ps(x
,_mm_setzero_ps(),_CMP_LT_OQ
);
423 res_erfc
= _mm_blendv_ps(res_erfc
,_mm_sub_ps(two
,res_erfc
),mask
);
425 /* Select erf() or erfc() */
426 mask
= _mm_cmp_ps(y
,_mm_set1_ps(0.75f
),_CMP_LT_OQ
);
427 res
= _mm_blendv_ps(_mm_sub_ps(one
,res_erfc
),res_erf
,mask
);
433 /* FULL precision. Only errors in LSB */
435 gmx_mm_erfc_ps(__m128 x
)
437 /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
438 const __m128 CA6
= _mm_set1_ps(7.853861353153693e-5f
);
439 const __m128 CA5
= _mm_set1_ps(-8.010193625184903e-4f
);
440 const __m128 CA4
= _mm_set1_ps(5.188327685732524e-3f
);
441 const __m128 CA3
= _mm_set1_ps(-2.685381193529856e-2f
);
442 const __m128 CA2
= _mm_set1_ps(1.128358514861418e-1f
);
443 const __m128 CA1
= _mm_set1_ps(-3.761262582423300e-1f
);
444 const __m128 CA0
= _mm_set1_ps(1.128379165726710f
);
445 /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
446 const __m128 CB9
= _mm_set1_ps(-0.0018629930017603923f
);
447 const __m128 CB8
= _mm_set1_ps(0.003909821287598495f
);
448 const __m128 CB7
= _mm_set1_ps(-0.0052094582210355615f
);
449 const __m128 CB6
= _mm_set1_ps(0.005685614362160572f
);
450 const __m128 CB5
= _mm_set1_ps(-0.0025367682853477272f
);
451 const __m128 CB4
= _mm_set1_ps(-0.010199799682318782f
);
452 const __m128 CB3
= _mm_set1_ps(0.04369575504816542f
);
453 const __m128 CB2
= _mm_set1_ps(-0.11884063474674492f
);
454 const __m128 CB1
= _mm_set1_ps(0.2732120154030589f
);
455 const __m128 CB0
= _mm_set1_ps(0.42758357702025784f
);
456 /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
457 const __m128 CC10
= _mm_set1_ps(-0.0445555913112064f
);
458 const __m128 CC9
= _mm_set1_ps(0.21376355144663348f
);
459 const __m128 CC8
= _mm_set1_ps(-0.3473187200259257f
);
460 const __m128 CC7
= _mm_set1_ps(0.016690861551248114f
);
461 const __m128 CC6
= _mm_set1_ps(0.7560973182491192f
);
462 const __m128 CC5
= _mm_set1_ps(-1.2137903600145787f
);
463 const __m128 CC4
= _mm_set1_ps(0.8411872321232948f
);
464 const __m128 CC3
= _mm_set1_ps(-0.08670413896296343f
);
465 const __m128 CC2
= _mm_set1_ps(-0.27124782687240334f
);
466 const __m128 CC1
= _mm_set1_ps(-0.0007502488047806069f
);
467 const __m128 CC0
= _mm_set1_ps(0.5642114853803148f
);
469 /* Coefficients for expansion of exp(x) in [0,0.1] */
470 /* CD0 and CD1 are both 1.0, so no need to declare them separately */
471 const __m128 CD2
= _mm_set1_ps(0.5000066608081202f
);
472 const __m128 CD3
= _mm_set1_ps(0.1664795422874624f
);
473 const __m128 CD4
= _mm_set1_ps(0.04379839977652482f
);
475 const __m128 sieve
= gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
476 const __m128 signbit
= gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
477 const __m128 one
= _mm_set1_ps(1.0f
);
478 const __m128 two
= _mm_set1_ps(2.0f
);
481 __m128 z
,q
,t
,t2
,w
,w2
;
482 __m128 pA0
,pA1
,pB0
,pB1
,pC0
,pC1
;
484 __m128 res_erf
,res_erfc
,res
;
487 /* Calculate erf() */
488 x2
= _mm_mul_ps(x
,x
);
489 x4
= _mm_mul_ps(x2
,x2
);
491 pA0
= _mm_macc_ps(CA6
,x4
,CA4
);
492 pA1
= _mm_macc_ps(CA5
,x4
,CA3
);
493 pA0
= _mm_macc_ps(pA0
,x4
,CA2
);
494 pA1
= _mm_macc_ps(pA1
,x4
,CA1
);
495 pA1
= _mm_mul_ps(pA1
,x2
);
496 pA0
= _mm_macc_ps(pA0
,x4
,pA1
);
497 /* Constant term must come last for precision reasons */
498 pA0
= _mm_add_ps(pA0
,CA0
);
500 res_erf
= _mm_mul_ps(x
,pA0
);
503 y
= gmx_mm_abs_ps(x
);
504 t
= gmx_mm_inv_ps(y
);
505 w
= _mm_sub_ps(t
,one
);
506 t2
= _mm_mul_ps(t
,t
);
507 w2
= _mm_mul_ps(w
,w
);
509 * We cannot simply calculate exp(-x2) directly in single precision, since
510 * that will lose a couple of bits of precision due to the multiplication.
511 * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
512 * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
514 * The only drawback with this is that it requires TWO separate exponential
515 * evaluations, which would be horrible performance-wise. However, the argument
516 * for the second exp() call is always small, so there we simply use a
517 * low-order minimax expansion on [0,0.1].
520 z
= _mm_and_ps(y
,sieve
);
521 q
= _mm_mul_ps( _mm_sub_ps(z
,y
) , _mm_add_ps(z
,y
) );
523 corr
= _mm_macc_ps(CD4
,q
,CD3
);
524 corr
= _mm_macc_ps(corr
,q
,CD2
);
525 corr
= _mm_macc_ps(corr
,q
,one
);
526 corr
= _mm_macc_ps(corr
,q
,one
);
528 expmx2
= gmx_mm_exp_ps( _mm_or_ps( signbit
, _mm_mul_ps(z
,z
) ) );
529 expmx2
= _mm_mul_ps(expmx2
,corr
);
531 pB1
= _mm_macc_ps(CB9
,w2
,CB7
);
532 pB0
= _mm_macc_ps(CB8
,w2
,CB6
);
533 pB1
= _mm_macc_ps(pB1
,w2
,CB5
);
534 pB0
= _mm_macc_ps(pB0
,w2
,CB4
);
535 pB1
= _mm_macc_ps(pB1
,w2
,CB3
);
536 pB0
= _mm_macc_ps(pB0
,w2
,CB2
);
537 pB1
= _mm_macc_ps(pB1
,w2
,CB1
);
538 pB0
= _mm_macc_ps(pB0
,w2
,CB0
);
539 pB0
= _mm_macc_ps(pB1
,w
,pB0
);
541 pC0
= _mm_macc_ps(CC10
,t2
,CC8
);
542 pC1
= _mm_macc_ps(CC9
,t2
,CC7
);
543 pC0
= _mm_macc_ps(pC0
,t2
,CC6
);
544 pC1
= _mm_macc_ps(pC1
,t2
,CC5
);
545 pC0
= _mm_macc_ps(pC0
,t2
,CC4
);
546 pC1
= _mm_macc_ps(pC1
,t2
,CC3
);
547 pC0
= _mm_macc_ps(pC0
,t2
,CC2
);
548 pC1
= _mm_macc_ps(pC1
,t2
,CC1
);
550 pC0
= _mm_macc_ps(pC0
,t2
,CC0
);
551 pC0
= _mm_macc_ps(pC1
,t
,pC0
);
552 pC0
= _mm_mul_ps(pC0
,t
);
554 /* SELECT pB0 or pC0 for erfc() */
555 mask
= _mm_cmp_ps(two
,y
,_CMP_LT_OQ
);
556 res_erfc
= _mm_blendv_ps(pB0
,pC0
,mask
);
557 res_erfc
= _mm_mul_ps(res_erfc
,expmx2
);
559 /* erfc(x<0) = 2-erfc(|x|) */
560 mask
= _mm_cmp_ps(x
,_mm_setzero_ps(),_CMP_LT_OQ
);
561 res_erfc
= _mm_blendv_ps(res_erfc
,_mm_sub_ps(two
,res_erfc
),mask
);
563 /* Select erf() or erfc() */
564 mask
= _mm_cmp_ps(y
,_mm_set1_ps(0.75f
),_CMP_LT_OQ
);
565 res
= _mm_blendv_ps(res_erfc
,_mm_sub_ps(one
,res_erf
),mask
);
571 /* Calculate the force correction due to PME analytically.
573 * This routine is meant to enable analytical evaluation of the
574 * direct-space PME electrostatic force to avoid tables.
576 * The direct-space potential should be Erfc(beta*r)/r, but there
577 * are some problems evaluating that:
579 * First, the error function is difficult (read: expensive) to
580 * approxmiate accurately for intermediate to large arguments, and
581 * this happens already in ranges of beta*r that occur in simulations.
582 * Second, we now try to avoid calculating potentials in Gromacs but
583 * use forces directly.
585 * We can simply things slight by noting that the PME part is really
586 * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
588 * V= 1/r - Erf(beta*r)/r
590 * The first term we already have from the inverse square root, so
591 * that we can leave out of this routine.
593 * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
594 * the argument beta*r will be in the range 0.15 to ~4. Use your
595 * favorite plotting program to realize how well-behaved Erf(z)/z is
598 * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
599 * However, it turns out it is more efficient to approximate f(z)/z and
600 * then only use even powers. This is another minor optimization, since
601 * we actually WANT f(z)/z, because it is going to be multiplied by
602 * the vector between the two atoms to get the vectorial force. The
603 * fastest flops are the ones we can avoid calculating!
605 * So, here's how it should be used:
608 * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
609 * 3. Evaluate this routine with z^2 as the argument.
610 * 4. The return value is the expression:
614 * ------------ - --------
617 * 5. Multiply the entire expression by beta^3. This will get you
619 * beta^3*2*exp(-z^2) beta^3*erf(z)
620 * ------------------ - ---------------
623 * or, switching back to r (z=r*beta):
625 * 2*beta*exp(-r^2*beta^2) erf(r*beta)
626 * ----------------------- - -----------
630 * With a bit of math exercise you should be able to confirm that
631 * this is exactly D[Erf[beta*r]/r,r] divided by r another time.
633 * 6. Add the result to 1/r^3, multiply by the product of the charges,
634 * and you have your force (divided by r). A final multiplication
635 * with the vector connecting the two particles and you have your
636 * vectorial force to add to the particles.
640 gmx_mm_pmecorrF_ps(__m128 z2
)
642 const __m128 FN6
= _mm_set1_ps(-1.7357322914161492954e-8f
);
643 const __m128 FN5
= _mm_set1_ps(1.4703624142580877519e-6f
);
644 const __m128 FN4
= _mm_set1_ps(-0.000053401640219807709149f
);
645 const __m128 FN3
= _mm_set1_ps(0.0010054721316683106153f
);
646 const __m128 FN2
= _mm_set1_ps(-0.019278317264888380590f
);
647 const __m128 FN1
= _mm_set1_ps(0.069670166153766424023f
);
648 const __m128 FN0
= _mm_set1_ps(-0.75225204789749321333f
);
650 const __m128 FD4
= _mm_set1_ps(0.0011193462567257629232f
);
651 const __m128 FD3
= _mm_set1_ps(0.014866955030185295499f
);
652 const __m128 FD2
= _mm_set1_ps(0.11583842382862377919f
);
653 const __m128 FD1
= _mm_set1_ps(0.50736591960530292870f
);
654 const __m128 FD0
= _mm_set1_ps(1.0f
);
657 __m128 polyFN0
,polyFN1
,polyFD0
,polyFD1
;
659 z4
= _mm_mul_ps(z2
,z2
);
661 polyFD0
= _mm_macc_ps(FD4
,z4
,FD2
);
662 polyFD1
= _mm_macc_ps(FD3
,z4
,FD1
);
663 polyFD0
= _mm_macc_ps(polyFD0
,z4
,FD0
);
664 polyFD0
= _mm_macc_ps(polyFD1
,z2
,polyFD0
);
666 polyFD0
= gmx_mm_inv_ps(polyFD0
);
668 polyFN0
= _mm_macc_ps(FN6
,z4
,FN4
);
669 polyFN1
= _mm_macc_ps(FN5
,z4
,FN3
);
670 polyFN0
= _mm_macc_ps(polyFN0
,z4
,FN2
);
671 polyFN1
= _mm_macc_ps(polyFN1
,z4
,FN1
);
672 polyFN0
= _mm_macc_ps(polyFN0
,z4
,FN0
);
673 polyFN0
= _mm_macc_ps(polyFN1
,z2
,polyFN0
);
675 return _mm_mul_ps(polyFN0
,polyFD0
);
681 /* Calculate the potential correction due to PME analytically.
683 * See gmx_mm256_pmecorrF_ps() for details about the approximation.
685 * This routine calculates Erf(z)/z, although you should provide z^2
686 * as the input argument.
688 * Here's how it should be used:
691 * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
692 * 3. Evaluate this routine with z^2 as the argument.
693 * 4. The return value is the expression:
700 * 5. Multiply the entire expression by beta and switching back to r (z=r*beta):
706 * 6. Add the result to 1/r, multiply by the product of the charges,
707 * and you have your potential.
710 gmx_mm_pmecorrV_ps(__m128 z2
)
712 const __m128 VN6
= _mm_set1_ps(1.9296833005951166339e-8f
);
713 const __m128 VN5
= _mm_set1_ps(-1.4213390571557850962e-6f
);
714 const __m128 VN4
= _mm_set1_ps(0.000041603292906656984871f
);
715 const __m128 VN3
= _mm_set1_ps(-0.00013134036773265025626f
);
716 const __m128 VN2
= _mm_set1_ps(0.038657983986041781264f
);
717 const __m128 VN1
= _mm_set1_ps(0.11285044772717598220f
);
718 const __m128 VN0
= _mm_set1_ps(1.1283802385263030286f
);
720 const __m128 VD3
= _mm_set1_ps(0.0066752224023576045451f
);
721 const __m128 VD2
= _mm_set1_ps(0.078647795836373922256f
);
722 const __m128 VD1
= _mm_set1_ps(0.43336185284710920150f
);
723 const __m128 VD0
= _mm_set1_ps(1.0f
);
726 __m128 polyVN0
,polyVN1
,polyVD0
,polyVD1
;
728 z4
= _mm_mul_ps(z2
,z2
);
730 polyVD1
= _mm_macc_ps(VD3
,z4
,VD1
);
731 polyVD0
= _mm_macc_ps(VD2
,z4
,VD0
);
732 polyVD0
= _mm_macc_ps(polyVD1
,z2
,polyVD0
);
734 polyVD0
= gmx_mm_inv_ps(polyVD0
);
736 polyVN0
= _mm_macc_ps(VN6
,z4
,VN4
);
737 polyVN1
= _mm_macc_ps(VN5
,z4
,VN3
);
738 polyVN0
= _mm_macc_ps(polyVN0
,z4
,VN2
);
739 polyVN1
= _mm_macc_ps(polyVN1
,z4
,VN1
);
740 polyVN0
= _mm_macc_ps(polyVN0
,z4
,VN0
);
741 polyVN0
= _mm_macc_ps(polyVN1
,z2
,polyVN0
);
743 return _mm_mul_ps(polyVN0
,polyVD0
);
749 gmx_mm_sincos_ps(__m128 x
,
753 const __m128 two_over_pi
= _mm_set1_ps(2.0/M_PI
);
754 const __m128 half
= _mm_set1_ps(0.5);
755 const __m128 one
= _mm_set1_ps(1.0);
757 const __m128i izero
= _mm_set1_epi32(0);
758 const __m128i ione
= _mm_set1_epi32(1);
759 const __m128i itwo
= _mm_set1_epi32(2);
760 const __m128i ithree
= _mm_set1_epi32(3);
761 const __m128 signbit
= gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
763 const __m128 CA1
= _mm_set1_ps(1.5703125f
);
764 const __m128 CA2
= _mm_set1_ps(4.837512969970703125e-4f
);
765 const __m128 CA3
= _mm_set1_ps(7.54978995489188216e-8f
);
767 const __m128 CC0
= _mm_set1_ps(-0.0013602249f
);
768 const __m128 CC1
= _mm_set1_ps(0.0416566950f
);
769 const __m128 CC2
= _mm_set1_ps(-0.4999990225f
);
770 const __m128 CS0
= _mm_set1_ps(-0.0001950727f
);
771 const __m128 CS1
= _mm_set1_ps(0.0083320758f
);
772 const __m128 CS2
= _mm_set1_ps(-0.1666665247f
);
777 __m128i offset_sin
,offset_cos
;
779 __m128 mask_sin
,mask_cos
;
780 __m128 tmp_sin
,tmp_cos
;
782 y
= _mm_mul_ps(x
,two_over_pi
);
783 y
= _mm_add_ps(y
,_mm_or_ps(_mm_and_ps(y
,signbit
),half
));
785 iz
= _mm_cvttps_epi32(y
);
786 z
= _mm_round_ps(y
,_MM_FROUND_TO_ZERO
);
788 offset_sin
= _mm_and_si128(iz
,ithree
);
789 offset_cos
= _mm_add_epi32(iz
,ione
);
791 /* Extended precision arithmethic to achieve full precision */
792 y
= _mm_nmacc_ps(z
,CA1
,x
);
793 y
= _mm_nmacc_ps(z
,CA2
,y
);
794 y
= _mm_nmacc_ps(z
,CA3
,y
);
796 y2
= _mm_mul_ps(y
,y
);
798 tmp1
= _mm_macc_ps(CC0
,y2
,CC1
);
799 tmp2
= _mm_macc_ps(CS0
,y2
,CS1
);
800 tmp1
= _mm_macc_ps(tmp1
,y2
,CC2
);
801 tmp2
= _mm_macc_ps(tmp2
,y2
,CS2
);
803 tmp1
= _mm_macc_ps(tmp1
,y2
,one
);
805 tmp2
= _mm_macc_ps(tmp2
,_mm_mul_ps(y
,y2
),y
);
807 mask_sin
= gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin
,ione
), izero
));
808 mask_cos
= gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos
,ione
), izero
));
810 tmp_sin
= _mm_blendv_ps(tmp1
,tmp2
,mask_sin
);
811 tmp_cos
= _mm_blendv_ps(tmp1
,tmp2
,mask_cos
);
813 mask_sin
= gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin
,itwo
), izero
));
814 mask_cos
= gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos
,itwo
), izero
));
816 tmp1
= _mm_xor_ps(signbit
,tmp_sin
);
817 tmp2
= _mm_xor_ps(signbit
,tmp_cos
);
819 *sinval
= _mm_blendv_ps(tmp1
,tmp_sin
,mask_sin
);
820 *cosval
= _mm_blendv_ps(tmp2
,tmp_cos
,mask_cos
);
/* sin(x) for all four SP lanes.
 *
 * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
 * will then call the sincos() routine and waste a factor 2 in performance!
 */
static __m128
gmx_mm_sin_ps(__m128 x)
{
    __m128 s,c;
    gmx_mm_sincos_ps(x,&s,&c);
    return s;
}
/* cos(x) for all four SP lanes.
 *
 * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
 * will then call the sincos() routine and waste a factor 2 in performance!
 */
static __m128
gmx_mm_cos_ps(__m128 x)
{
    __m128 s,c;
    gmx_mm_sincos_ps(x,&s,&c);
    return c;
}
851 gmx_mm_tan_ps(__m128 x
)
853 __m128 sinval
,cosval
;
856 gmx_mm_sincos_ps(x
,&sinval
,&cosval
);
858 tanval
= _mm_mul_ps(sinval
,gmx_mm_inv_ps(cosval
));
865 gmx_mm_asin_ps(__m128 x
)
867 /* Same algorithm as cephes library */
868 const __m128 signmask
= gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
869 const __m128 limitlow
= _mm_set1_ps(1e-4f
);
870 const __m128 half
= _mm_set1_ps(0.5f
);
871 const __m128 one
= _mm_set1_ps(1.0f
);
872 const __m128 halfpi
= _mm_set1_ps(M_PI
/2.0f
);
874 const __m128 CC5
= _mm_set1_ps(4.2163199048E-2f
);
875 const __m128 CC4
= _mm_set1_ps(2.4181311049E-2f
);
876 const __m128 CC3
= _mm_set1_ps(4.5470025998E-2f
);
877 const __m128 CC2
= _mm_set1_ps(7.4953002686E-2f
);
878 const __m128 CC1
= _mm_set1_ps(1.6666752422E-1f
);
883 __m128 z
,z1
,z2
,q
,q1
,q2
;
886 sign
= _mm_andnot_ps(signmask
,x
);
887 xabs
= _mm_and_ps(x
,signmask
);
889 mask
= _mm_cmp_ps(xabs
,half
,_CMP_GT_OQ
);
891 z1
= _mm_mul_ps(half
, _mm_sub_ps(one
,xabs
));
892 q1
= _mm_mul_ps(z1
,gmx_mm_invsqrt_ps(z1
));
893 q1
= _mm_andnot_ps(_mm_cmp_ps(xabs
,one
,_CMP_EQ_OQ
),q1
);
896 z2
= _mm_mul_ps(q2
,q2
);
898 z
= _mm_or_ps( _mm_and_ps(mask
,z1
) , _mm_andnot_ps(mask
,z2
) );
899 q
= _mm_or_ps( _mm_and_ps(mask
,q1
) , _mm_andnot_ps(mask
,q2
) );
901 z2
= _mm_mul_ps(z
,z
);
903 pA
= _mm_macc_ps(CC5
,z2
,CC3
);
904 pB
= _mm_macc_ps(CC4
,z2
,CC2
);
906 pA
= _mm_macc_ps(pA
,z2
,CC1
);
907 pA
= _mm_mul_ps(pA
,z
);
909 z
= _mm_macc_ps(pB
,z2
,pA
);
911 z
= _mm_macc_ps(z
,q
,q
);
913 q2
= _mm_sub_ps(halfpi
,z
);
914 q2
= _mm_sub_ps(q2
,z
);
916 z
= _mm_or_ps( _mm_and_ps(mask
,q2
) , _mm_andnot_ps(mask
,z
) );
918 mask
= _mm_cmp_ps(xabs
,limitlow
,_CMP_GT_OQ
);
919 z
= _mm_or_ps( _mm_and_ps(mask
,z
) , _mm_andnot_ps(mask
,xabs
) );
921 z
= _mm_xor_ps(z
,sign
);
928 gmx_mm_acos_ps(__m128 x
)
930 const __m128 signmask
= gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
931 const __m128 one_ps
= _mm_set1_ps(1.0f
);
932 const __m128 half_ps
= _mm_set1_ps(0.5f
);
933 const __m128 pi_ps
= _mm_set1_ps(M_PI
);
934 const __m128 halfpi_ps
= _mm_set1_ps(M_PI
/2.0f
);
941 xabs
= _mm_and_ps(x
,signmask
);
942 mask1
= _mm_cmp_ps(xabs
,half_ps
,_CMP_GT_OQ
);
943 mask2
= _mm_cmp_ps(x
,_mm_setzero_ps(),_CMP_GT_OQ
);
945 z
= _mm_mul_ps(half_ps
,_mm_sub_ps(one_ps
,xabs
));
946 z
= _mm_mul_ps(z
,gmx_mm_invsqrt_ps(z
));
947 z
= _mm_andnot_ps(_mm_cmp_ps(xabs
,one_ps
,_CMP_EQ_OQ
),z
);
949 z
= _mm_blendv_ps(x
,z
,mask1
);
950 z
= gmx_mm_asin_ps(z
);
952 z2
= _mm_add_ps(z
,z
);
953 z1
= _mm_sub_ps(pi_ps
,z2
);
954 z3
= _mm_sub_ps(halfpi_ps
,z
);
956 z
= _mm_blendv_ps(z1
,z2
,mask2
);
957 z
= _mm_blendv_ps(z3
,z
,mask1
);
964 gmx_mm_atan_ps(__m128 x
)
966 /* Same algorithm as cephes library */
967 const __m128 signmask
= gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
968 const __m128 limit1
= _mm_set1_ps(0.414213562373095f
);
969 const __m128 limit2
= _mm_set1_ps(2.414213562373095f
);
970 const __m128 quarterpi
= _mm_set1_ps(0.785398163397448f
);
971 const __m128 halfpi
= _mm_set1_ps(1.570796326794896f
);
972 const __m128 mone
= _mm_set1_ps(-1.0f
);
973 const __m128 CC3
= _mm_set1_ps(-3.33329491539E-1f
);
974 const __m128 CC5
= _mm_set1_ps(1.99777106478E-1f
);
975 const __m128 CC7
= _mm_set1_ps(-1.38776856032E-1);
976 const __m128 CC9
= _mm_set1_ps(8.05374449538e-2f
);
984 sign
= _mm_andnot_ps(signmask
,x
);
985 x
= _mm_and_ps(x
,signmask
);
987 mask1
= _mm_cmp_ps(x
,limit1
,_CMP_GT_OQ
);
988 mask2
= _mm_cmp_ps(x
,limit2
,_CMP_GT_OQ
);
990 z1
= _mm_mul_ps(_mm_add_ps(x
,mone
),gmx_mm_inv_ps(_mm_sub_ps(x
,mone
)));
991 z2
= _mm_mul_ps(mone
,gmx_mm_inv_ps(x
));
993 y
= _mm_and_ps(mask1
,quarterpi
);
994 y
= _mm_blendv_ps(y
,halfpi
,mask2
);
996 x
= _mm_blendv_ps(x
,z1
,mask1
);
997 x
= _mm_blendv_ps(x
,z2
,mask2
);
999 x2
= _mm_mul_ps(x
,x
);
1000 x4
= _mm_mul_ps(x2
,x2
);
1002 sum1
= _mm_macc_ps(CC9
,x4
,CC5
);
1003 sum2
= _mm_macc_ps(CC7
,x4
,CC3
);
1004 sum1
= _mm_mul_ps(sum1
,x4
);
1005 sum1
= _mm_macc_ps(sum2
,x2
,sum1
);
1007 sum1
= _mm_sub_ps(sum1
,mone
);
1008 y
= _mm_macc_ps(sum1
,x
,y
);
1010 y
= _mm_xor_ps(y
,sign
);
1017 gmx_mm_atan2_ps(__m128 y
, __m128 x
)
1019 const __m128 pi
= _mm_set1_ps(M_PI
);
1020 const __m128 minuspi
= _mm_set1_ps(-M_PI
);
1021 const __m128 halfpi
= _mm_set1_ps(M_PI
/2.0);
1022 const __m128 minushalfpi
= _mm_set1_ps(-M_PI
/2.0);
1026 __m128 maskx_lt
,maskx_eq
;
1027 __m128 masky_lt
,masky_eq
;
1028 __m128 mask1
,mask2
,mask3
,mask4
,maskall
;
1030 maskx_lt
= _mm_cmp_ps(x
,_mm_setzero_ps(),_CMP_LT_OQ
);
1031 masky_lt
= _mm_cmp_ps(y
,_mm_setzero_ps(),_CMP_LT_OQ
);
1032 maskx_eq
= _mm_cmp_ps(x
,_mm_setzero_ps(),_CMP_EQ_OQ
);
1033 masky_eq
= _mm_cmp_ps(y
,_mm_setzero_ps(),_CMP_EQ_OQ
);
1035 z
= _mm_mul_ps(y
,gmx_mm_inv_ps(x
));
1036 z
= gmx_mm_atan_ps(z
);
1038 mask1
= _mm_and_ps(maskx_eq
,masky_lt
);
1039 mask2
= _mm_andnot_ps(maskx_lt
,masky_eq
);
1040 mask3
= _mm_andnot_ps( _mm_or_ps(masky_lt
,masky_eq
) , maskx_eq
);
1041 mask4
= _mm_and_ps(masky_eq
,maskx_lt
);
1043 maskall
= _mm_or_ps( _mm_or_ps(mask1
,mask2
), _mm_or_ps(mask3
,mask4
) );
1045 z
= _mm_andnot_ps(maskall
,z
);
1046 z1
= _mm_and_ps(mask1
,minushalfpi
);
1047 z3
= _mm_and_ps(mask3
,halfpi
);
1048 z4
= _mm_and_ps(mask4
,pi
);
1050 z
= _mm_or_ps( _mm_or_ps(z
,z1
), _mm_or_ps(z3
,z4
) );
1052 mask1
= _mm_andnot_ps(masky_lt
,maskx_lt
);
1053 mask2
= _mm_and_ps(maskx_lt
,masky_lt
);
1055 w
= _mm_or_ps( _mm_and_ps(mask1
,pi
), _mm_and_ps(mask2
,minuspi
) );
1056 w
= _mm_andnot_ps(maskall
,w
);
1058 z
= _mm_add_ps(z
,w
);
1065 #endif /* _gmx_math_x86_avx_128_fma_single_h_ */