/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 *
 * This file is part of GROMACS.
 *
 * Written by the Gromacs development team under coordination of
 * David van der Spoel, Berk Hess, and Erik Lindahl.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org
 *
 * And Hey:
 * Gnomes, ROck Monsters And Chili Sauce
 */
#ifndef _gmx_math_x86_avx_128_fma_double_h_
#define _gmx_math_x86_avx_128_fma_double_h_

#include <math.h>

#include "gmx_x86_avx_128_fma.h"

#ifndef M_PI
#  define M_PI 3.14159265358979323846264338327950288
#endif
/************************
 *                      *
 * Simple math routines *
 *                      *
 ************************/

static gmx_inline __m128d
gmx_mm_invsqrt_pd(__m128d x)
{
    const __m128d half  = _mm_set1_pd(0.5);
    const __m128d three = _mm_set1_pd(3.0);

    /* Lookup instruction only exists in single precision, convert back and forth... */
    __m128d lu = _mm_cvtps_pd(_mm_rsqrt_ps( _mm_cvtpd_ps(x)));

    lu = _mm_mul_pd(_mm_mul_pd(half,lu),_mm_nmacc_pd(_mm_mul_pd(lu,lu),x,three));
    return _mm_mul_pd(_mm_mul_pd(half,lu),_mm_nmacc_pd(_mm_mul_pd(lu,lu),x,three));
}
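/* Implementation note: _mm_nmacc_pd(a,b,c) computes c - a*b, so each of the two statements at
 * the end of gmx_mm_invsqrt_pd() is one Newton-Raphson iteration lu' = 0.5*lu*(3 - x*lu*lu)
 * applied to the ~12-bit _mm_rsqrt_ps() estimate; two iterations bring the result close to
 * full double precision.
 */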
/* 1.0/sqrt(x), done for a pair of arguments to improve throughput */
static gmx_inline void
gmx_mm_invsqrt_pair_pd(__m128d x1, __m128d x2,
                       __m128d *invsqrt1, __m128d *invsqrt2)
{
    const __m128d half   = _mm_set1_pd(0.5);
    const __m128d three  = _mm_set1_pd(3.0);
    const __m128  halff  = _mm_set1_ps(0.5f);
    const __m128  threef = _mm_set1_ps(3.0f);

    __m128  xf,luf;
    __m128d lu1,lu2;

    /* Do first N-R step in float for 2x throughput */
    xf  = _mm_shuffle_ps(_mm_cvtpd_ps(x1),_mm_cvtpd_ps(x2),_MM_SHUFFLE(1,0,1,0));
    luf = _mm_rsqrt_ps(xf);

    luf = _mm_mul_ps(_mm_mul_ps(halff,luf),_mm_nmacc_ps(_mm_mul_ps(luf,luf),xf,threef));

    lu2 = _mm_cvtps_pd(_mm_shuffle_ps(luf,luf,_MM_SHUFFLE(3,2,3,2)));
    lu1 = _mm_cvtps_pd(luf);

    *invsqrt1 = _mm_mul_pd(_mm_mul_pd(half,lu1),_mm_nmacc_pd(_mm_mul_pd(lu1,lu1),x1,three));
    *invsqrt2 = _mm_mul_pd(_mm_mul_pd(half,lu2),_mm_nmacc_pd(_mm_mul_pd(lu2,lu2),x2,three));
}
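/* Usage sketch for the pair version; the variable names below are purely illustrative and not
 * part of this header:
 *
 *     __m128d rsq0, rsq1, rinv0, rinv1;
 *     rsq0 = _mm_set_pd(r2b, r2a);
 *     rsq1 = _mm_set_pd(r2d, r2c);
 *     gmx_mm_invsqrt_pair_pd(rsq0, rsq1, &rinv0, &rinv1);
 *
 * The first N-R iteration runs on all four values packed into one __m128, which is what gives
 * the throughput advantage; only the final iteration is done per pair in double precision.
 */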
/* sqrt(x) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
static gmx_inline __m128d
gmx_mm_sqrt_pd(__m128d x)
{
    __m128d mask;
    __m128d res;

    mask = _mm_cmpeq_pd(x,_mm_setzero_pd());
    res  = _mm_andnot_pd(mask,gmx_mm_invsqrt_pd(x));

    res  = _mm_mul_pd(x,res);

    return res;
}
static gmx_inline __m128d
gmx_mm_inv_pd(__m128d x)
{
    const __m128d two = _mm_set1_pd(2.0);

    /* Lookup instruction only exists in single precision, convert back and forth... */
    __m128d lu = _mm_cvtps_pd(_mm_rcp_ps( _mm_cvtpd_ps(x)));

    /* Perform two N-R steps for double precision */
    lu = _mm_mul_pd(lu,_mm_nmacc_pd(lu,x,two));
    return _mm_mul_pd(lu,_mm_nmacc_pd(lu,x,two));
}
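/* Implementation note: _mm_nmacc_pd(lu,x,two) computes 2 - x*lu, so each of the two statements
 * above is one Newton-Raphson iteration lu' = lu*(2 - x*lu) on the _mm_rcp_ps() estimate - the
 * "two N-R steps for double precision" mentioned in the comment.
 */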
static gmx_inline __m128d
gmx_mm_abs_pd(__m128d x)
{
    const __m128d signmask = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );

    return _mm_and_pd(x,signmask);
}
/* Calculate 2^x.
 *
 * The 2^w term is calculated from a (6,0)-th order (no denominator) minimax polynomial on the interval
 * [-0.5,0.5].
 *
 * The approximation on [-0.5,0.5] is a rational Padé approximation, 1+2*P(x^2)/(Q(x^2)-P(x^2)),
 * according to the same algorithm as used in the Cephes/netlib math routines.
 */
static gmx_inline __m128d
gmx_mm_exp2_pd(__m128d x)
{
    /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
    const __m128d arglimit = _mm_set1_pd(1022.0);
    const __m128i expbase  = _mm_set1_epi32(1023);

    const __m128d P2       = _mm_set1_pd(2.30933477057345225087e-2);
    const __m128d P1       = _mm_set1_pd(2.02020656693165307700e1);
    const __m128d P0       = _mm_set1_pd(1.51390680115615096133e3);
    const __m128d Q1       = _mm_set1_pd(2.33184211722314911771e2);
    const __m128d Q0       = _mm_set1_pd(4.36821166879210612817e3);
    const __m128d one      = _mm_set1_pd(1.0);
    const __m128d two      = _mm_set1_pd(2.0);

    __m128d valuemask;
    __m128i iexppart;
    __m128d fexppart;
    __m128d intpart;
    __m128d z,z2;
    __m128d PolyP,PolyQ;

    iexppart  = _mm_cvtpd_epi32(x);
    intpart   = _mm_round_pd(x,_MM_FROUND_TO_NEAREST_INT);

    /* The two lowest elements of iexppart now contain 32-bit numbers with a correctly biased exponent.
     * To be able to shift it into the exponent for a double precision number we first need to
     * shuffle so that the lower half contains the first element, and the upper half the second.
     * This should really be done as a zero-extension, but since the next instructions will shift
     * the registers left by 52 bits it doesn't matter what we put there - it will be shifted out.
     * (thus we just use element 2 from iexppart).
     */
    iexppart  = _mm_shuffle_epi32(iexppart,_MM_SHUFFLE(2,1,2,0));

    /* Do the shift operation on the 64-bit registers */
    iexppart  = _mm_add_epi32(iexppart,expbase);
    iexppart  = _mm_slli_epi64(iexppart,52);

    valuemask = _mm_cmpge_pd(arglimit,gmx_mm_abs_pd(x));
    fexppart  = _mm_and_pd(valuemask,gmx_mm_castsi128_pd(iexppart));

    z         = _mm_sub_pd(x,intpart);
    z2        = _mm_mul_pd(z,z);

    PolyP     = _mm_macc_pd(P2,z2,P1);
    PolyQ     = _mm_add_pd(z2,Q1);
    PolyP     = _mm_macc_pd(PolyP,z2,P0);
    PolyQ     = _mm_macc_pd(PolyQ,z2,Q0);
    PolyP     = _mm_mul_pd(PolyP,z);

    z         = _mm_mul_pd(PolyP,gmx_mm_inv_pd(_mm_sub_pd(PolyQ,PolyP)));
    z         = _mm_macc_pd(two,z,one);

    z         = _mm_mul_pd(z,fexppart);

    return z;
}
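/* How gmx_mm_exp2_pd() assembles its result: 2^x = 2^intpart * 2^z with z = x - intpart in
 * [-0.5,0.5]. The integer part becomes a power of two by adding the exponent bias (1023) and
 * shifting it into bits 52-62 of each 64-bit lane; the fractional factor is the Padé form
 * 1 + 2*z*P(z^2)/(Q(z^2) - z*P(z^2)) evaluated with FMA. Lanes with |x| > 1022 fail the
 * valuemask test, so fexppart (and therefore the result) is simply 0.0 there.
 */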
/* Exponential function. This could be calculated from 2^x as Exp(x)=2^(y), where y=log2(e)*x,
 * but there will then be a small rounding error since we lose some precision due to the
 * multiplication. This will then be magnified a lot by the exponential.
 *
 * Instead, we calculate the fractional part directly as a Padé approximation of
 * Exp(z) on [-0.5,0.5]. We use extended precision arithmetic to calculate the fraction
 * remaining after 2^y, which avoids the precision loss.
 */
static gmx_inline __m128d
gmx_mm_exp_pd(__m128d exparg)
{
    const __m128d argscale     = _mm_set1_pd(1.4426950408889634073599);
    /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
    const __m128d arglimit     = _mm_set1_pd(1022.0);
    const __m128i expbase      = _mm_set1_epi32(1023);

    const __m128d invargscale0 = _mm_set1_pd(6.93145751953125e-1);
    const __m128d invargscale1 = _mm_set1_pd(1.42860682030941723212e-6);

    const __m128d P2           = _mm_set1_pd(1.26177193074810590878e-4);
    const __m128d P1           = _mm_set1_pd(3.02994407707441961300e-2);
    /* P0 == 1.0 */
    const __m128d Q3           = _mm_set1_pd(3.00198505138664455042E-6);
    const __m128d Q2           = _mm_set1_pd(2.52448340349684104192E-3);
    const __m128d Q1           = _mm_set1_pd(2.27265548208155028766E-1);
    /* Q0 == 2.0 */
    const __m128d one          = _mm_set1_pd(1.0);
    const __m128d two          = _mm_set1_pd(2.0);

    __m128d valuemask;
    __m128i iexppart;
    __m128d fexppart;
    __m128d intpart;
    __m128d x,z,z2;
    __m128d PolyP,PolyQ;

    x         = _mm_mul_pd(exparg,argscale);

    iexppart  = _mm_cvtpd_epi32(x);
    intpart   = _mm_round_pd(x,_MM_FROUND_TO_NEAREST_INT);

    /* The two lowest elements of iexppart now contain 32-bit numbers with a correctly biased exponent.
     * To be able to shift it into the exponent for a double precision number we first need to
     * shuffle so that the lower half contains the first element, and the upper half the second.
     * This should really be done as a zero-extension, but since the next instructions will shift
     * the registers left by 52 bits it doesn't matter what we put there - it will be shifted out.
     * (thus we just use element 2 from iexppart).
     */
    iexppart  = _mm_shuffle_epi32(iexppart,_MM_SHUFFLE(2,1,2,0));

    /* Do the shift operation on the 64-bit registers */
    iexppart  = _mm_add_epi32(iexppart,expbase);
    iexppart  = _mm_slli_epi64(iexppart,52);

    valuemask = _mm_cmpge_pd(arglimit,gmx_mm_abs_pd(x));
    fexppart  = _mm_and_pd(valuemask,gmx_mm_castsi128_pd(iexppart));

    z         = _mm_sub_pd(exparg,_mm_mul_pd(invargscale0,intpart));
    z         = _mm_sub_pd(z,_mm_mul_pd(invargscale1,intpart));

    z2        = _mm_mul_pd(z,z);

    PolyQ     = _mm_macc_pd(Q3,z2,Q2);
    PolyP     = _mm_macc_pd(P2,z2,P1);
    PolyQ     = _mm_macc_pd(PolyQ,z2,Q1);

    PolyP     = _mm_macc_pd(PolyP,z2,one);
    PolyQ     = _mm_macc_pd(PolyQ,z2,two);

    PolyP     = _mm_mul_pd(PolyP,z);

    z         = _mm_mul_pd(PolyP,gmx_mm_inv_pd(_mm_sub_pd(PolyQ,PolyP)));
    z         = _mm_macc_pd(two,z,one);

    z         = _mm_mul_pd(z,fexppart);

    return z;
}
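/* Argument reduction in gmx_mm_exp_pd(): x = exparg*log2(e) is rounded to intpart, and the
 * remainder is recomputed from exparg as z = exparg - intpart*ln(2), with ln(2) split into
 * invargscale0 + invargscale1 (0.693145751953125 + 1.4286068e-6), so the two subtractions act
 * as extended-precision arithmetic. This is the "fraction remaining after 2^y" mentioned in the
 * comment above; the Padé evaluation then only ever sees |z| of at most about 0.35.
 */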
static gmx_inline __m128d
gmx_mm_log_pd(__m128d x)
{
    /* Same algorithm as cephes library */
    const __m128d expmask    = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000) );

    const __m128i expbase_m1 = _mm_set1_epi32(1023-1); /* We want non-IEEE format */

    const __m128d half       = _mm_set1_pd(0.5);
    const __m128d one        = _mm_set1_pd(1.0);
    const __m128d two        = _mm_set1_pd(2.0);
    const __m128d invsq2     = _mm_set1_pd(1.0/sqrt(2.0));

    const __m128d corr1      = _mm_set1_pd(-2.121944400546905827679e-4);
    const __m128d corr2      = _mm_set1_pd(0.693359375);

    const __m128d P5         = _mm_set1_pd(1.01875663804580931796e-4);
    const __m128d P4         = _mm_set1_pd(4.97494994976747001425e-1);
    const __m128d P3         = _mm_set1_pd(4.70579119878881725854e0);
    const __m128d P2         = _mm_set1_pd(1.44989225341610930846e1);
    const __m128d P1         = _mm_set1_pd(1.79368678507819816313e1);
    const __m128d P0         = _mm_set1_pd(7.70838733755885391666e0);

    const __m128d Q4         = _mm_set1_pd(1.12873587189167450590e1);
    const __m128d Q3         = _mm_set1_pd(4.52279145837532221105e1);
    const __m128d Q2         = _mm_set1_pd(8.29875266912776603211e1);
    const __m128d Q1         = _mm_set1_pd(7.11544750618563894466e1);
    const __m128d Q0         = _mm_set1_pd(2.31251620126765340583e1);

    const __m128d R2         = _mm_set1_pd(-7.89580278884799154124e-1);
    const __m128d R1         = _mm_set1_pd(1.63866645699558079767e1);
    const __m128d R0         = _mm_set1_pd(-6.41409952958715622951e1);

    const __m128d S2         = _mm_set1_pd(-3.56722798256324312549E1);
    const __m128d S1         = _mm_set1_pd(3.12093766372244180303E2);
    const __m128d S0         = _mm_set1_pd(-7.69691943550460008604E2);

    __m128d fexp;
    __m128i iexp;

    __m128d mask1,mask2;
    __m128d corr,t1,t2,q;
    __m128d zA,yA,xA,zB,yB,xB,z;
    __m128d polyR,polyS;
    __m128d polyP1,polyP2,polyQ1,polyQ2;

    /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
    fexp   = _mm_and_pd(x,expmask);
    iexp   = gmx_mm_castpd_si128(fexp);
    iexp   = _mm_srli_epi64(iexp,52);
    iexp   = _mm_sub_epi32(iexp,expbase_m1);
    iexp   = _mm_shuffle_epi32(iexp, _MM_SHUFFLE(1,1,2,0) );
    fexp   = _mm_cvtepi32_pd(iexp);

    x      = _mm_andnot_pd(expmask,x);
    x      = _mm_or_pd(x,one);
    x      = _mm_mul_pd(x,half);

    mask1  = _mm_cmpgt_pd(gmx_mm_abs_pd(fexp),two);
    mask2  = _mm_cmplt_pd(x,invsq2);

    fexp   = _mm_sub_pd(fexp,_mm_and_pd(mask2,one));

    /* If mask1 is set ('A') */
    zA     = _mm_sub_pd(x,half);
    t1     = _mm_blendv_pd( zA,x,mask2 );
    zA     = _mm_sub_pd(t1,half);
    t2     = _mm_blendv_pd( x,zA,mask2 );
    yA     = _mm_mul_pd(half,_mm_add_pd(t2,one));

    xA     = _mm_mul_pd(zA,gmx_mm_inv_pd(yA));
    zA     = _mm_mul_pd(xA,xA);

    polyR  = _mm_macc_pd(R2,zA,R1);
    polyR  = _mm_macc_pd(polyR,zA,R0);

    polyS  = _mm_add_pd(zA,S2);
    polyS  = _mm_macc_pd(polyS,zA,S1);
    polyS  = _mm_macc_pd(polyS,zA,S0);

    q      = _mm_mul_pd(polyR,gmx_mm_inv_pd(polyS));
    zA     = _mm_mul_pd(_mm_mul_pd(xA,zA),q);

    zA     = _mm_macc_pd(corr1,fexp,zA);
    zA     = _mm_add_pd(zA,xA);
    zA     = _mm_macc_pd(corr2,fexp,zA);

    /* If mask1 is not set ('B') */
    corr   = _mm_and_pd(mask2,x);
    xB     = _mm_add_pd(x,corr);
    xB     = _mm_sub_pd(xB,one);
    zB     = _mm_mul_pd(xB,xB);

    polyP1 = _mm_macc_pd(P5,zB,P3);
    polyP2 = _mm_macc_pd(P4,zB,P2);
    polyP1 = _mm_macc_pd(polyP1,zB,P1);
    polyP2 = _mm_macc_pd(polyP2,zB,P0);
    polyP1 = _mm_macc_pd(polyP1,xB,polyP2);

    polyQ2 = _mm_macc_pd(Q4,zB,Q2);
    polyQ1 = _mm_add_pd(zB,Q3);
    polyQ1 = _mm_macc_pd(polyQ1,zB,Q1);
    polyQ2 = _mm_macc_pd(polyQ2,zB,Q0);
    polyQ1 = _mm_macc_pd(polyQ1,xB,polyQ2);

    fexp   = _mm_and_pd(fexp,_mm_cmpneq_pd(fexp,_mm_setzero_pd()));

    q      = _mm_mul_pd(polyP1,gmx_mm_inv_pd(polyQ1));
    yB     = _mm_macc_pd(_mm_mul_pd(xB,zB),q,_mm_mul_pd(corr1,fexp));

    yB     = _mm_nmacc_pd(half,zB,yB);
    zB     = _mm_add_pd(xB,yB);
    zB     = _mm_macc_pd(corr2,fexp,zB);

    z      = _mm_blendv_pd( zB,zA,mask1 );

    return z;
}
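/* Structure of gmx_mm_log_pd(): x is split into 2^e * m with m in [0.5,1) by masking the IEEE
 * exponent field, so log(x) = log(m) + e*log(2). The 'A' branch (selected by mask1 when |e| > 2)
 * and the 'B' branch use different rational approximations for the mantissa part, and e*log(2)
 * is added in two pieces, corr2 + corr1 (0.693359375 - 2.1219444e-4 ~= ln(2)), to avoid losing
 * the low bits of the exponent contribution.
 */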
static gmx_inline __m128d
gmx_mm_erf_pd(__m128d x)
{
    /* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */
    const __m128d CAP4     = _mm_set1_pd(-0.431780540597889301512e-4);
    const __m128d CAP3     = _mm_set1_pd(-0.00578562306260059236059);
    const __m128d CAP2     = _mm_set1_pd(-0.028593586920219752446);
    const __m128d CAP1     = _mm_set1_pd(-0.315924962948621698209);
    const __m128d CAP0     = _mm_set1_pd(0.14952975608477029151);

    const __m128d CAQ5     = _mm_set1_pd(-0.374089300177174709737e-5);
    const __m128d CAQ4     = _mm_set1_pd(0.00015126584532155383535);
    const __m128d CAQ3     = _mm_set1_pd(0.00536692680669480725423);
    const __m128d CAQ2     = _mm_set1_pd(0.0668686825594046122636);
    const __m128d CAQ1     = _mm_set1_pd(0.402604990869284362773);
    /* CAQ0 == 1.0 */
    const __m128d CAoffset = _mm_set1_pd(0.9788494110107421875);

    /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5] */
    const __m128d CBP6     = _mm_set1_pd(2.49650423685462752497647637088e-10);
    const __m128d CBP5     = _mm_set1_pd(0.00119770193298159629350136085658);
    const __m128d CBP4     = _mm_set1_pd(0.0164944422378370965881008942733);
    const __m128d CBP3     = _mm_set1_pd(0.0984581468691775932063932439252);
    const __m128d CBP2     = _mm_set1_pd(0.317364595806937763843589437418);
    const __m128d CBP1     = _mm_set1_pd(0.554167062641455850932670067075);
    const __m128d CBP0     = _mm_set1_pd(0.427583576155807163756925301060);
    const __m128d CBQ7     = _mm_set1_pd(0.00212288829699830145976198384930);
    const __m128d CBQ6     = _mm_set1_pd(0.0334810979522685300554606393425);
    const __m128d CBQ5     = _mm_set1_pd(0.2361713785181450957579508850717);
    const __m128d CBQ4     = _mm_set1_pd(0.955364736493055670530981883072);
    const __m128d CBQ3     = _mm_set1_pd(2.36815675631420037315349279199);
    const __m128d CBQ2     = _mm_set1_pd(3.55261649184083035537184223542);
    const __m128d CBQ1     = _mm_set1_pd(2.93501136050160872574376997993);
    /* CBQ0 == 1.0 */

    /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf] */
    const __m128d CCP6     = _mm_set1_pd(-2.8175401114513378771);
    const __m128d CCP5     = _mm_set1_pd(-3.22729451764143718517);
    const __m128d CCP4     = _mm_set1_pd(-2.5518551727311523996);
    const __m128d CCP3     = _mm_set1_pd(-0.687717681153649930619);
    const __m128d CCP2     = _mm_set1_pd(-0.212652252872804219852);
    const __m128d CCP1     = _mm_set1_pd(0.0175389834052493308818);
    const __m128d CCP0     = _mm_set1_pd(0.00628057170626964891937);

    const __m128d CCQ6     = _mm_set1_pd(5.48409182238641741584);
    const __m128d CCQ5     = _mm_set1_pd(13.5064170191802889145);
    const __m128d CCQ4     = _mm_set1_pd(22.9367376522880577224);
    const __m128d CCQ3     = _mm_set1_pd(15.930646027911794143);
    const __m128d CCQ2     = _mm_set1_pd(11.0567237927800161565);
    const __m128d CCQ1     = _mm_set1_pd(2.79257750980575282228);
    /* CCQ0 == 1.0 */
    const __m128d CCoffset = _mm_set1_pd(0.5579090118408203125);

    const __m128d one      = _mm_set1_pd(1.0);
    const __m128d two      = _mm_set1_pd(2.0);

    const __m128d signbit  = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );

    __m128d xabs,x2,x4,t,t2,w,w2;
    __m128d PolyAP0,PolyAP1,PolyAQ0,PolyAQ1;
    __m128d PolyBP0,PolyBP1,PolyBQ0,PolyBQ1;
    __m128d PolyCP0,PolyCP1,PolyCQ0,PolyCQ1;
    __m128d res_erf,res_erfcB,res_erfcC,res_erfc,res;
    __m128d mask,expmx2;

    /* Calculate erf() */
    xabs     = gmx_mm_abs_pd(x);
    x2       = _mm_mul_pd(x,x);
    x4       = _mm_mul_pd(x2,x2);

    PolyAP0  = _mm_macc_pd(CAP4,x4,CAP2);
    PolyAP1  = _mm_macc_pd(CAP3,x4,CAP1);
    PolyAP0  = _mm_macc_pd(PolyAP0,x4,CAP0);
    PolyAP0  = _mm_macc_pd(PolyAP1,x2,PolyAP0);

    PolyAQ1  = _mm_macc_pd(CAQ5,x4,CAQ3);
    PolyAQ0  = _mm_macc_pd(CAQ4,x4,CAQ2);
    PolyAQ1  = _mm_macc_pd(PolyAQ1,x4,CAQ1);
    PolyAQ0  = _mm_macc_pd(PolyAQ0,x4,one);
    PolyAQ0  = _mm_macc_pd(PolyAQ1,x2,PolyAQ0);

    res_erf  = _mm_macc_pd(PolyAP0,gmx_mm_inv_pd(PolyAQ0),CAoffset);
    res_erf  = _mm_mul_pd(x,res_erf);

    /* Calculate erfc() in range [1,4.5] */
    t        = _mm_sub_pd(xabs,one);
    t2       = _mm_mul_pd(t,t);

    PolyBP0  = _mm_macc_pd(CBP6,t2,CBP4);
    PolyBP1  = _mm_macc_pd(CBP5,t2,CBP3);
    PolyBP0  = _mm_macc_pd(PolyBP0,t2,CBP2);
    PolyBP1  = _mm_macc_pd(PolyBP1,t2,CBP1);
    PolyBP0  = _mm_macc_pd(PolyBP0,t2,CBP0);
    PolyBP0  = _mm_macc_pd(PolyBP1,t,PolyBP0);

    PolyBQ1  = _mm_macc_pd(CBQ7,t2,CBQ5);
    PolyBQ0  = _mm_macc_pd(CBQ6,t2,CBQ4);
    PolyBQ1  = _mm_macc_pd(PolyBQ1,t2,CBQ3);
    PolyBQ0  = _mm_macc_pd(PolyBQ0,t2,CBQ2);
    PolyBQ1  = _mm_macc_pd(PolyBQ1,t2,CBQ1);
    PolyBQ0  = _mm_macc_pd(PolyBQ0,t2,one);
    PolyBQ0  = _mm_macc_pd(PolyBQ1,t,PolyBQ0);

    res_erfcB = _mm_mul_pd(PolyBP0,gmx_mm_inv_pd(PolyBQ0));

    res_erfcB = _mm_mul_pd(res_erfcB,xabs);

    /* Calculate erfc() in range [4.5,inf] */
    w        = gmx_mm_inv_pd(xabs);
    w2       = _mm_mul_pd(w,w);

    PolyCP0  = _mm_macc_pd(CCP6,w2,CCP4);
    PolyCP1  = _mm_macc_pd(CCP5,w2,CCP3);
    PolyCP0  = _mm_macc_pd(PolyCP0,w2,CCP2);
    PolyCP1  = _mm_macc_pd(PolyCP1,w2,CCP1);
    PolyCP0  = _mm_macc_pd(PolyCP0,w2,CCP0);
    PolyCP0  = _mm_macc_pd(PolyCP1,w,PolyCP0);

    PolyCQ0  = _mm_macc_pd(CCQ6,w2,CCQ4);
    PolyCQ1  = _mm_macc_pd(CCQ5,w2,CCQ3);
    PolyCQ0  = _mm_macc_pd(PolyCQ0,w2,CCQ2);
    PolyCQ1  = _mm_macc_pd(PolyCQ1,w2,CCQ1);
    PolyCQ0  = _mm_macc_pd(PolyCQ0,w2,one);
    PolyCQ0  = _mm_macc_pd(PolyCQ1,w,PolyCQ0);

    expmx2   = gmx_mm_exp_pd( _mm_or_pd(signbit, x2) );

    res_erfcC = _mm_macc_pd(PolyCP0,gmx_mm_inv_pd(PolyCQ0),CCoffset);
    res_erfcC = _mm_mul_pd(res_erfcC,w);

    mask      = _mm_cmpgt_pd(xabs,_mm_set1_pd(4.5));
    res_erfc  = _mm_blendv_pd(res_erfcB,res_erfcC,mask);

    res_erfc  = _mm_mul_pd(res_erfc,expmx2);

    /* erfc(x<0) = 2-erfc(|x|) */
    mask      = _mm_cmplt_pd(x,_mm_setzero_pd());
    res_erfc  = _mm_blendv_pd(res_erfc,_mm_sub_pd(two,res_erfc),mask);

    /* Select erf() or erfc() */
    mask      = _mm_cmplt_pd(xabs,one);
    res       = _mm_blendv_pd(_mm_sub_pd(one,res_erfc),res_erf,mask);

    return res;
}
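/* gmx_mm_erf_pd() evaluates three regimes for every lane and blends them at the end: the
 * [-0.75,0.75] erf() form x*(CAoffset + P(x^2)/Q(x^2)), the [1,4.5] erfc() form in t = |x|-1,
 * and the large-argument erfc() form in w = 1/|x|. The exp(-x^2) factor needed by the erfc()
 * branches is obtained by OR-ing the sign bit into x2 before calling gmx_mm_exp_pd(), and
 * erfc() for negative arguments is folded back in as 2 - erfc(|x|) before the final selection.
 */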
static gmx_inline __m128d
gmx_mm_erfc_pd(__m128d x)
{
    /* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */
    const __m128d CAP4     = _mm_set1_pd(-0.431780540597889301512e-4);
    const __m128d CAP3     = _mm_set1_pd(-0.00578562306260059236059);
    const __m128d CAP2     = _mm_set1_pd(-0.028593586920219752446);
    const __m128d CAP1     = _mm_set1_pd(-0.315924962948621698209);
    const __m128d CAP0     = _mm_set1_pd(0.14952975608477029151);

    const __m128d CAQ5     = _mm_set1_pd(-0.374089300177174709737e-5);
    const __m128d CAQ4     = _mm_set1_pd(0.00015126584532155383535);
    const __m128d CAQ3     = _mm_set1_pd(0.00536692680669480725423);
    const __m128d CAQ2     = _mm_set1_pd(0.0668686825594046122636);
    const __m128d CAQ1     = _mm_set1_pd(0.402604990869284362773);
    /* CAQ0 == 1.0 */
    const __m128d CAoffset = _mm_set1_pd(0.9788494110107421875);

    /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5] */
    const __m128d CBP6     = _mm_set1_pd(2.49650423685462752497647637088e-10);
    const __m128d CBP5     = _mm_set1_pd(0.00119770193298159629350136085658);
    const __m128d CBP4     = _mm_set1_pd(0.0164944422378370965881008942733);
    const __m128d CBP3     = _mm_set1_pd(0.0984581468691775932063932439252);
    const __m128d CBP2     = _mm_set1_pd(0.317364595806937763843589437418);
    const __m128d CBP1     = _mm_set1_pd(0.554167062641455850932670067075);
    const __m128d CBP0     = _mm_set1_pd(0.427583576155807163756925301060);
    const __m128d CBQ7     = _mm_set1_pd(0.00212288829699830145976198384930);
    const __m128d CBQ6     = _mm_set1_pd(0.0334810979522685300554606393425);
    const __m128d CBQ5     = _mm_set1_pd(0.2361713785181450957579508850717);
    const __m128d CBQ4     = _mm_set1_pd(0.955364736493055670530981883072);
    const __m128d CBQ3     = _mm_set1_pd(2.36815675631420037315349279199);
    const __m128d CBQ2     = _mm_set1_pd(3.55261649184083035537184223542);
    const __m128d CBQ1     = _mm_set1_pd(2.93501136050160872574376997993);
    /* CBQ0 == 1.0 */

    /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf] */
    const __m128d CCP6     = _mm_set1_pd(-2.8175401114513378771);
    const __m128d CCP5     = _mm_set1_pd(-3.22729451764143718517);
    const __m128d CCP4     = _mm_set1_pd(-2.5518551727311523996);
    const __m128d CCP3     = _mm_set1_pd(-0.687717681153649930619);
    const __m128d CCP2     = _mm_set1_pd(-0.212652252872804219852);
    const __m128d CCP1     = _mm_set1_pd(0.0175389834052493308818);
    const __m128d CCP0     = _mm_set1_pd(0.00628057170626964891937);

    const __m128d CCQ6     = _mm_set1_pd(5.48409182238641741584);
    const __m128d CCQ5     = _mm_set1_pd(13.5064170191802889145);
    const __m128d CCQ4     = _mm_set1_pd(22.9367376522880577224);
    const __m128d CCQ3     = _mm_set1_pd(15.930646027911794143);
    const __m128d CCQ2     = _mm_set1_pd(11.0567237927800161565);
    const __m128d CCQ1     = _mm_set1_pd(2.79257750980575282228);
    /* CCQ0 == 1.0 */
    const __m128d CCoffset = _mm_set1_pd(0.5579090118408203125);

    const __m128d one      = _mm_set1_pd(1.0);
    const __m128d two      = _mm_set1_pd(2.0);

    const __m128d signbit  = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );

    __m128d xabs,x2,x4,t,t2,w,w2;
    __m128d PolyAP0,PolyAP1,PolyAQ0,PolyAQ1;
    __m128d PolyBP0,PolyBP1,PolyBQ0,PolyBQ1;
    __m128d PolyCP0,PolyCP1,PolyCQ0,PolyCQ1;
    __m128d res_erf,res_erfcB,res_erfcC,res_erfc,res;
    __m128d mask,expmx2;

    /* Calculate erf() */
    xabs     = gmx_mm_abs_pd(x);
    x2       = _mm_mul_pd(x,x);
    x4       = _mm_mul_pd(x2,x2);

    PolyAP0  = _mm_macc_pd(CAP4,x4,CAP2);
    PolyAP1  = _mm_macc_pd(CAP3,x4,CAP1);
    PolyAP0  = _mm_macc_pd(PolyAP0,x4,CAP0);
    PolyAP0  = _mm_macc_pd(PolyAP1,x2,PolyAP0);

    PolyAQ1  = _mm_macc_pd(CAQ5,x4,CAQ3);
    PolyAQ0  = _mm_macc_pd(CAQ4,x4,CAQ2);
    PolyAQ1  = _mm_macc_pd(PolyAQ1,x4,CAQ1);
    PolyAQ0  = _mm_macc_pd(PolyAQ0,x4,one);
    PolyAQ0  = _mm_macc_pd(PolyAQ1,x2,PolyAQ0);

    res_erf  = _mm_macc_pd(PolyAP0,gmx_mm_inv_pd(PolyAQ0),CAoffset);
    res_erf  = _mm_mul_pd(x,res_erf);

    /* Calculate erfc() in range [1,4.5] */
    t        = _mm_sub_pd(xabs,one);
    t2       = _mm_mul_pd(t,t);

    PolyBP0  = _mm_macc_pd(CBP6,t2,CBP4);
    PolyBP1  = _mm_macc_pd(CBP5,t2,CBP3);
    PolyBP0  = _mm_macc_pd(PolyBP0,t2,CBP2);
    PolyBP1  = _mm_macc_pd(PolyBP1,t2,CBP1);
    PolyBP0  = _mm_macc_pd(PolyBP0,t2,CBP0);
    PolyBP0  = _mm_macc_pd(PolyBP1,t,PolyBP0);

    PolyBQ1  = _mm_macc_pd(CBQ7,t2,CBQ5);
    PolyBQ0  = _mm_macc_pd(CBQ6,t2,CBQ4);
    PolyBQ1  = _mm_macc_pd(PolyBQ1,t2,CBQ3);
    PolyBQ0  = _mm_macc_pd(PolyBQ0,t2,CBQ2);
    PolyBQ1  = _mm_macc_pd(PolyBQ1,t2,CBQ1);
    PolyBQ0  = _mm_macc_pd(PolyBQ0,t2,one);
    PolyBQ0  = _mm_macc_pd(PolyBQ1,t,PolyBQ0);

    res_erfcB = _mm_mul_pd(PolyBP0,gmx_mm_inv_pd(PolyBQ0));

    res_erfcB = _mm_mul_pd(res_erfcB,xabs);

    /* Calculate erfc() in range [4.5,inf] */
    w        = gmx_mm_inv_pd(xabs);
    w2       = _mm_mul_pd(w,w);

    PolyCP0  = _mm_macc_pd(CCP6,w2,CCP4);
    PolyCP1  = _mm_macc_pd(CCP5,w2,CCP3);
    PolyCP0  = _mm_macc_pd(PolyCP0,w2,CCP2);
    PolyCP1  = _mm_macc_pd(PolyCP1,w2,CCP1);
    PolyCP0  = _mm_macc_pd(PolyCP0,w2,CCP0);
    PolyCP0  = _mm_macc_pd(PolyCP1,w,PolyCP0);

    PolyCQ0  = _mm_macc_pd(CCQ6,w2,CCQ4);
    PolyCQ1  = _mm_macc_pd(CCQ5,w2,CCQ3);
    PolyCQ0  = _mm_macc_pd(PolyCQ0,w2,CCQ2);
    PolyCQ1  = _mm_macc_pd(PolyCQ1,w2,CCQ1);
    PolyCQ0  = _mm_macc_pd(PolyCQ0,w2,one);
    PolyCQ0  = _mm_macc_pd(PolyCQ1,w,PolyCQ0);

    expmx2   = gmx_mm_exp_pd( _mm_or_pd(signbit, x2) );

    res_erfcC = _mm_macc_pd(PolyCP0,gmx_mm_inv_pd(PolyCQ0),CCoffset);
    res_erfcC = _mm_mul_pd(res_erfcC,w);

    mask      = _mm_cmpgt_pd(xabs,_mm_set1_pd(4.5));
    res_erfc  = _mm_blendv_pd(res_erfcB,res_erfcC,mask);

    res_erfc  = _mm_mul_pd(res_erfc,expmx2);

    /* erfc(x<0) = 2-erfc(|x|) */
    mask      = _mm_cmplt_pd(x,_mm_setzero_pd());
    res_erfc  = _mm_blendv_pd(res_erfc,_mm_sub_pd(two,res_erfc),mask);

    /* Select erf() or erfc() */
    mask      = _mm_cmplt_pd(xabs,one);
    res       = _mm_blendv_pd(res_erfc,_mm_sub_pd(one,res_erf),mask);

    return res;
}
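/* gmx_mm_erfc_pd() uses the same three approximations and masks as gmx_mm_erf_pd() above; only
 * the final blend differs, returning erfc(x) directly and forming 1 - erf(x) for the lanes with
 * |x| < 1 instead of the other way around.
 */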
static gmx_inline void
gmx_mm_sincos_pd(__m128d x,
                 __m128d *sinval,
                 __m128d *cosval)
{
#ifdef _MSC_VER
    __declspec(align(16))
    const double sintable[34] =
    {
        1.00000000000000000e+00 , 0.00000000000000000e+00 ,
        9.95184726672196929e-01 , 9.80171403295606036e-02 ,
        9.80785280403230431e-01 , 1.95090322016128248e-01 ,
        9.56940335732208824e-01 , 2.90284677254462331e-01 ,
        9.23879532511286738e-01 , 3.82683432365089782e-01 ,
        8.81921264348355050e-01 , 4.71396736825997642e-01 ,
        8.31469612302545236e-01 , 5.55570233019602178e-01 ,
        7.73010453362736993e-01 , 6.34393284163645488e-01 ,
        7.07106781186547573e-01 , 7.07106781186547462e-01 ,
        6.34393284163645599e-01 , 7.73010453362736882e-01 ,
        5.55570233019602289e-01 , 8.31469612302545125e-01 ,
        4.71396736825997809e-01 , 8.81921264348354939e-01 ,
        3.82683432365089837e-01 , 9.23879532511286738e-01 ,
        2.90284677254462276e-01 , 9.56940335732208935e-01 ,
        1.95090322016128304e-01 , 9.80785280403230431e-01 ,
        9.80171403295607702e-02 , 9.95184726672196818e-01 ,
        0.0 , 1.00000000000000000e+00
    };
#else
    const __m128d sintable[17] =
    {
        _mm_set_pd( 0.0 , 1.0 ),
        _mm_set_pd( sin(  1.0 * (M_PI/2.0) / 16.0) , cos(  1.0 * (M_PI/2.0) / 16.0) ),
        _mm_set_pd( sin(  2.0 * (M_PI/2.0) / 16.0) , cos(  2.0 * (M_PI/2.0) / 16.0) ),
        _mm_set_pd( sin(  3.0 * (M_PI/2.0) / 16.0) , cos(  3.0 * (M_PI/2.0) / 16.0) ),
        _mm_set_pd( sin(  4.0 * (M_PI/2.0) / 16.0) , cos(  4.0 * (M_PI/2.0) / 16.0) ),
        _mm_set_pd( sin(  5.0 * (M_PI/2.0) / 16.0) , cos(  5.0 * (M_PI/2.0) / 16.0) ),
        _mm_set_pd( sin(  6.0 * (M_PI/2.0) / 16.0) , cos(  6.0 * (M_PI/2.0) / 16.0) ),
        _mm_set_pd( sin(  7.0 * (M_PI/2.0) / 16.0) , cos(  7.0 * (M_PI/2.0) / 16.0) ),
        _mm_set_pd( sin(  8.0 * (M_PI/2.0) / 16.0) , cos(  8.0 * (M_PI/2.0) / 16.0) ),
        _mm_set_pd( sin(  9.0 * (M_PI/2.0) / 16.0) , cos(  9.0 * (M_PI/2.0) / 16.0) ),
        _mm_set_pd( sin( 10.0 * (M_PI/2.0) / 16.0) , cos( 10.0 * (M_PI/2.0) / 16.0) ),
        _mm_set_pd( sin( 11.0 * (M_PI/2.0) / 16.0) , cos( 11.0 * (M_PI/2.0) / 16.0) ),
        _mm_set_pd( sin( 12.0 * (M_PI/2.0) / 16.0) , cos( 12.0 * (M_PI/2.0) / 16.0) ),
        _mm_set_pd( sin( 13.0 * (M_PI/2.0) / 16.0) , cos( 13.0 * (M_PI/2.0) / 16.0) ),
        _mm_set_pd( sin( 14.0 * (M_PI/2.0) / 16.0) , cos( 14.0 * (M_PI/2.0) / 16.0) ),
        _mm_set_pd( sin( 15.0 * (M_PI/2.0) / 16.0) , cos( 15.0 * (M_PI/2.0) / 16.0) ),
        _mm_set_pd( 1.0 , 0.0 )
    };
#endif

    const __m128d signmask      = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF) );
    const __m128i signbit_epi32 = _mm_set1_epi32(0x80000000);

    const __m128d tabscale      = _mm_set1_pd(32.0/M_PI);
    const __m128d invtabscale0  = _mm_set1_pd(9.81747508049011230469e-02);
    const __m128d invtabscale1  = _mm_set1_pd(1.96197799156550576057e-08);
    const __m128i ione          = _mm_set1_epi32(1);
    const __m128i i32           = _mm_set1_epi32(32);
    const __m128i i16           = _mm_set1_epi32(16);
    const __m128i tabmask       = _mm_set1_epi32(0x3F);
    const __m128d sinP7         = _mm_set1_pd(-1.0/5040.0);
    const __m128d sinP5         = _mm_set1_pd(1.0/120.0);
    const __m128d sinP3         = _mm_set1_pd(-1.0/6.0);
    const __m128d sinP1         = _mm_set1_pd(1.0);

    const __m128d cosP6         = _mm_set1_pd(-1.0/720.0);
    const __m128d cosP4         = _mm_set1_pd(1.0/24.0);
    const __m128d cosP2         = _mm_set1_pd(-1.0/2.0);
    const __m128d cosP0         = _mm_set1_pd(1.0);

    __m128d scalex;
    __m128i tabidx,corridx;
    __m128d xabs,z,z2,polySin,polyCos;
    __m128d xpoint;
    __m128d ypoint0,ypoint1;

    __m128d sinpoint,cospoint;
    __m128d xsign,ssign,csign;
    __m128i imask,sswapsign,cswapsign;

    xsign    = _mm_andnot_pd(signmask,x);
    xabs     = _mm_and_pd(x,signmask);

    scalex   = _mm_mul_pd(tabscale,xabs);
    tabidx   = _mm_cvtpd_epi32(scalex);

    xpoint   = _mm_round_pd(scalex,_MM_FROUND_TO_NEAREST_INT);

    /* Extended precision arithmetics */
    z        = _mm_nmacc_pd(invtabscale0,xpoint,xabs);
    z        = _mm_nmacc_pd(invtabscale1,xpoint,z);

    /* Range reduction to 0..2*Pi */
    tabidx   = _mm_and_si128(tabidx,tabmask);

    /* tabidx is now in range [0,..,64] */
    imask     = _mm_cmpgt_epi32(tabidx,i32);
    sswapsign = imask;
    cswapsign = imask;
    corridx   = _mm_and_si128(imask,i32);
    tabidx    = _mm_sub_epi32(tabidx,corridx);

    /* tabidx is now in range [0..32] */
    imask     = _mm_cmpgt_epi32(tabidx,i16);
    cswapsign = _mm_xor_si128(cswapsign,imask);
    corridx   = _mm_sub_epi32(i32,tabidx);
    tabidx    = _mm_blendv_epi8(tabidx,corridx,imask);
    /* tabidx is now in range [0..16] */
    ssign     = _mm_cvtepi32_pd( _mm_or_si128( sswapsign , ione ) );
    csign     = _mm_cvtepi32_pd( _mm_or_si128( cswapsign , ione ) );

#ifdef _MSC_VER
    ypoint0  = _mm_load_pd(sintable + 2*_mm_extract_epi32(tabidx,0));
    ypoint1  = _mm_load_pd(sintable + 2*_mm_extract_epi32(tabidx,1));
#else
    ypoint0  = sintable[_mm_extract_epi32(tabidx,0)];
    ypoint1  = sintable[_mm_extract_epi32(tabidx,1)];
#endif
    sinpoint = _mm_unpackhi_pd(ypoint0,ypoint1);
    cospoint = _mm_unpacklo_pd(ypoint0,ypoint1);

    sinpoint = _mm_mul_pd(sinpoint,ssign);
    cospoint = _mm_mul_pd(cospoint,csign);

    z2       = _mm_mul_pd(z,z);

    polySin  = _mm_macc_pd(sinP7,z2,sinP5);
    polySin  = _mm_macc_pd(polySin,z2,sinP3);
    polySin  = _mm_macc_pd(polySin,z2,sinP1);
    polySin  = _mm_mul_pd(polySin,z);

    polyCos  = _mm_macc_pd(cosP6,z2,cosP4);
    polyCos  = _mm_macc_pd(polyCos,z2,cosP2);
    polyCos  = _mm_macc_pd(polyCos,z2,cosP0);

    *sinval  = _mm_xor_pd(_mm_add_pd( _mm_mul_pd(sinpoint,polyCos) , _mm_mul_pd(cospoint,polySin) ),xsign);
    *cosval  = _mm_sub_pd( _mm_mul_pd(cospoint,polyCos) , _mm_mul_pd(sinpoint,polySin) );
}
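/* Range reduction in gmx_mm_sincos_pd(): the argument is scaled by 32/pi and rounded, giving an
 * index into the quarter-wave table of sin/cos pairs spaced (pi/2)/16 apart, plus a residual z
 * recovered with the split constant invtabscale0 + invtabscale1 ~= pi/32 for extended precision.
 * The folding with i32/i16 and the sswapsign/cswapsign masks maps a full period onto table
 * entries [0..16] while fixing up the signs. The last two statements are the angle-addition
 * identities sin(a+z) = sin(a)cos(z) + cos(a)sin(z) and cos(a+z) = cos(a)cos(z) - sin(a)sin(z),
 * with the short polynomials in z supplying sin(z) and cos(z) for the small residual.
 */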
/*
 * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
 * will then call the sincos() routine and waste a factor 2 in performance!
 */
static gmx_inline __m128d
gmx_mm_sin_pd(__m128d x)
{
    __m128d s,c;
    gmx_mm_sincos_pd(x,&s,&c);
    return s;
}
/*
 * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
 * will then call the sincos() routine and waste a factor 2 in performance!
 */
static gmx_inline __m128d
gmx_mm_cos_pd(__m128d x)
{
    __m128d s,c;
    gmx_mm_sincos_pd(x,&s,&c);
    return c;
}
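/* Usage sketch when both values are needed (purely illustrative caller code, following the
 * warning above):
 *
 *     __m128d theta, s, c;
 *     theta = _mm_set1_pd(0.3);
 *     gmx_mm_sincos_pd(theta, &s, &c);
 *
 * This costs essentially the same as a single gmx_mm_sin_pd() or gmx_mm_cos_pd() call.
 */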
static gmx_inline __m128d
gmx_mm_tan_pd(__m128d x)
{
    __m128d sinval,cosval;
    __m128d tanval;

    gmx_mm_sincos_pd(x,&sinval,&cosval);

    tanval = _mm_mul_pd(sinval,gmx_mm_inv_pd(cosval));

    return tanval;
}
static gmx_inline __m128d
gmx_mm_asin_pd(__m128d x)
{
    /* Same algorithm as cephes library */
    const __m128d signmask  = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF) );
    const __m128d limit1    = _mm_set1_pd(0.625);
    const __m128d limit2    = _mm_set1_pd(1e-8);
    const __m128d one       = _mm_set1_pd(1.0);
    const __m128d halfpi    = _mm_set1_pd(M_PI/2.0);
    const __m128d quarterpi = _mm_set1_pd(M_PI/4.0);
    const __m128d morebits  = _mm_set1_pd(6.123233995736765886130e-17);

    const __m128d P5        = _mm_set1_pd(4.253011369004428248960e-3);
    const __m128d P4        = _mm_set1_pd(-6.019598008014123785661e-1);
    const __m128d P3        = _mm_set1_pd(5.444622390564711410273e0);
    const __m128d P2        = _mm_set1_pd(-1.626247967210700244449e1);
    const __m128d P1        = _mm_set1_pd(1.956261983317594739197e1);
    const __m128d P0        = _mm_set1_pd(-8.198089802484824371615e0);

    const __m128d Q4        = _mm_set1_pd(-1.474091372988853791896e1);
    const __m128d Q3        = _mm_set1_pd(7.049610280856842141659e1);
    const __m128d Q2        = _mm_set1_pd(-1.471791292232726029859e2);
    const __m128d Q1        = _mm_set1_pd(1.395105614657485689735e2);
    const __m128d Q0        = _mm_set1_pd(-4.918853881490881290097e1);

    const __m128d R4        = _mm_set1_pd(2.967721961301243206100e-3);
    const __m128d R3        = _mm_set1_pd(-5.634242780008963776856e-1);
    const __m128d R2        = _mm_set1_pd(6.968710824104713396794e0);
    const __m128d R1        = _mm_set1_pd(-2.556901049652824852289e1);
    const __m128d R0        = _mm_set1_pd(2.853665548261061424989e1);

    const __m128d S3        = _mm_set1_pd(-2.194779531642920639778e1);
    const __m128d S2        = _mm_set1_pd(1.470656354026814941758e2);
    const __m128d S1        = _mm_set1_pd(-3.838770957603691357202e2);
    const __m128d S0        = _mm_set1_pd(3.424398657913078477438e2);

    __m128d sign;
    __m128d mask;
    __m128d xabs;
    __m128d zz,ww,z,q,w,y,zz2,ww2;
    __m128d PA,PB;
    __m128d QA,QB;
    __m128d RA,RB;
    __m128d SA,SB;
    __m128d nom,denom;

    sign  = _mm_andnot_pd(signmask,x);
    xabs  = _mm_and_pd(x,signmask);

    mask  = _mm_cmpgt_pd(xabs,limit1);

    zz    = _mm_sub_pd(one,xabs);
    ww    = _mm_mul_pd(xabs,xabs);
    zz2   = _mm_mul_pd(zz,zz);
    ww2   = _mm_mul_pd(ww,ww);

    RA    = _mm_macc_pd(R4,zz2,R2);
    RB    = _mm_macc_pd(R3,zz2,R1);
    RA    = _mm_macc_pd(RA,zz2,R0);
    RA    = _mm_macc_pd(RB,zz,RA);

    SB    = _mm_macc_pd(S3,zz2,S1);
    SA    = _mm_add_pd(zz2,S2);
    SA    = _mm_macc_pd(SA,zz2,S0);
    SA    = _mm_macc_pd(SB,zz,SA);

    PA    = _mm_macc_pd(P5,ww2,P3);
    PB    = _mm_macc_pd(P4,ww2,P2);
    PA    = _mm_macc_pd(PA,ww2,P1);
    PB    = _mm_macc_pd(PB,ww2,P0);
    PA    = _mm_macc_pd(PA,ww,PB);

    QB    = _mm_macc_pd(Q4,ww2,Q2);
    QA    = _mm_add_pd(ww2,Q3);
    QA    = _mm_macc_pd(QA,ww2,Q1);
    QB    = _mm_macc_pd(QB,ww2,Q0);
    QA    = _mm_macc_pd(QA,ww,QB);

    RA    = _mm_mul_pd(RA,zz);
    PA    = _mm_mul_pd(PA,ww);

    nom   = _mm_blendv_pd( PA,RA,mask );
    denom = _mm_blendv_pd( QA,SA,mask );

    q     = _mm_mul_pd( nom, gmx_mm_inv_pd(denom) );

    zz    = _mm_add_pd(zz,zz);
    zz    = gmx_mm_sqrt_pd(zz);
    z     = _mm_sub_pd(quarterpi,zz);
    zz    = _mm_mul_pd(zz,q);
    zz    = _mm_sub_pd(zz,morebits);
    z     = _mm_sub_pd(z,zz);
    z     = _mm_add_pd(z,quarterpi);

    w     = _mm_macc_pd(xabs,q,xabs);

    z     = _mm_blendv_pd( w,z,mask );

    mask  = _mm_cmpgt_pd(xabs,limit2);
    z     = _mm_blendv_pd( xabs,z,mask );

    z     = _mm_xor_pd(z,sign);

    return z;
}
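/* Branch selection in gmx_mm_asin_pd(): lanes with |x| <= 0.625 use the rational approximation
 * in ww = x^2 (the PA/QA pair), while lanes with |x| > 0.625 work in zz = 1 - |x| with the RA/SA
 * pair and essentially the identity asin(|x|) = pi/2 - 2*asin(sqrt((1-|x|)/2)), which is why
 * quarterpi is added twice and sqrt(2*zz) is formed. The morebits constant restores the low bits
 * of pi/2 lost by splitting it into two quarterpi terms, and arguments with |x| <= 1e-8 are
 * returned unchanged.
 */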
static gmx_inline __m128d
gmx_mm_acos_pd(__m128d x)
{
    const __m128d signmask   = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF) );
    const __m128d one        = _mm_set1_pd(1.0);
    const __m128d half       = _mm_set1_pd(0.5);
    const __m128d pi         = _mm_set1_pd(M_PI);
    const __m128d quarterpi0 = _mm_set1_pd(7.85398163397448309616e-1);
    const __m128d quarterpi1 = _mm_set1_pd(6.123233995736765886130e-17);

    __m128d mask1;
    __m128d z,z1,z2;

    mask1 = _mm_cmpgt_pd(x,half);
    z1    = _mm_mul_pd(half,_mm_sub_pd(one,x));
    z1    = gmx_mm_sqrt_pd(z1);
    z     = _mm_blendv_pd( x,z1,mask1 );

    z     = gmx_mm_asin_pd(z);

    z1    = _mm_add_pd(z,z);

    z2    = _mm_sub_pd(quarterpi0,z);
    z2    = _mm_add_pd(z2,quarterpi1);
    z2    = _mm_add_pd(z2,quarterpi0);

    z     = _mm_blendv_pd(z2,z1,mask1);

    return z;
}
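/* gmx_mm_acos_pd() is built on gmx_mm_asin_pd(): for x > 0.5 it uses
 * acos(x) = 2*asin(sqrt((1-x)/2)) (the z1 path), otherwise acos(x) = pi/2 - asin(x), where pi/2
 * is applied as quarterpi0 + quarterpi1 + quarterpi0 to keep the extra low-order bits.
 */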
static gmx_inline __m128d
gmx_mm_atan_pd(__m128d x)
{
    /* Same algorithm as cephes library */
    const __m128d signmask  = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF) );
    const __m128d limit1    = _mm_set1_pd(0.66);
    const __m128d limit2    = _mm_set1_pd(2.41421356237309504880);
    const __m128d quarterpi = _mm_set1_pd(M_PI/4.0);
    const __m128d halfpi    = _mm_set1_pd(M_PI/2.0);
    const __m128d mone      = _mm_set1_pd(-1.0);
    const __m128d morebits1 = _mm_set1_pd(0.5*6.123233995736765886130E-17);
    const __m128d morebits2 = _mm_set1_pd(6.123233995736765886130E-17);

    const __m128d P4        = _mm_set1_pd(-8.750608600031904122785E-1);
    const __m128d P3        = _mm_set1_pd(-1.615753718733365076637E1);
    const __m128d P2        = _mm_set1_pd(-7.500855792314704667340E1);
    const __m128d P1        = _mm_set1_pd(-1.228866684490136173410E2);
    const __m128d P0        = _mm_set1_pd(-6.485021904942025371773E1);

    const __m128d Q4        = _mm_set1_pd(2.485846490142306297962E1);
    const __m128d Q3        = _mm_set1_pd(1.650270098316988542046E2);
    const __m128d Q2        = _mm_set1_pd(4.328810604912902668951E2);
    const __m128d Q1        = _mm_set1_pd(4.853903996359136964868E2);
    const __m128d Q0        = _mm_set1_pd(1.945506571482613964425E2);

    __m128d sign;
    __m128d mask1,mask2;
    __m128d y,t1,t2;
    __m128d z,z2;
    __m128d P_A,P_B,Q_A,Q_B;

    sign  = _mm_andnot_pd(signmask,x);
    x     = _mm_and_pd(x,signmask);

    mask1 = _mm_cmpgt_pd(x,limit1);
    mask2 = _mm_cmpgt_pd(x,limit2);

    t1    = _mm_mul_pd(_mm_add_pd(x,mone),gmx_mm_inv_pd(_mm_sub_pd(x,mone)));
    t2    = _mm_mul_pd(mone,gmx_mm_inv_pd(x));

    y     = _mm_and_pd(mask1,quarterpi);
    y     = _mm_or_pd( _mm_and_pd(mask2,halfpi) , _mm_andnot_pd(mask2,y) );

    x     = _mm_or_pd( _mm_and_pd(mask1,t1) , _mm_andnot_pd(mask1,x) );
    x     = _mm_or_pd( _mm_and_pd(mask2,t2) , _mm_andnot_pd(mask2,x) );

    z     = _mm_mul_pd(x,x);
    z2    = _mm_mul_pd(z,z);

    P_A   = _mm_macc_pd(P4,z2,P2);
    P_B   = _mm_macc_pd(P3,z2,P1);
    P_A   = _mm_macc_pd(P_A,z2,P0);
    P_A   = _mm_macc_pd(P_B,z,P_A);

    Q_B   = _mm_macc_pd(Q4,z2,Q2);
    Q_A   = _mm_add_pd(z2,Q3);
    Q_A   = _mm_macc_pd(Q_A,z2,Q1);
    Q_B   = _mm_macc_pd(Q_B,z2,Q0);
    Q_A   = _mm_macc_pd(Q_A,z,Q_B);

    z     = _mm_mul_pd(z,P_A);
    z     = _mm_mul_pd(z,gmx_mm_inv_pd(Q_A));
    z     = _mm_macc_pd(z,x,x);

    t1    = _mm_and_pd(mask1,morebits1);
    t1    = _mm_or_pd( _mm_and_pd(mask2,morebits2) , _mm_andnot_pd(mask2,t1) );

    z     = _mm_add_pd(z,t1);
    y     = _mm_add_pd(y,z);

    y     = _mm_xor_pd(y,sign);

    return y;
}
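/* Range reduction in gmx_mm_atan_pd(): after taking |x|, lanes with x > 0.66 use
 * atan(x) = pi/4 + atan((x-1)/(x+1)) and lanes with x > 2.414... (= tan(3*pi/8)) use
 * atan(x) = pi/2 - atan(1/x); t1, t2 and the quarterpi/halfpi offsets implement exactly these
 * identities, with morebits1/morebits2 supplying the low-order bits of the offsets. The rational
 * core atan(x) ~= x + x^3*P(x^2)/Q(x^2) therefore only ever sees arguments of magnitude below
 * about 0.66.
 */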
static gmx_inline __m128d
gmx_mm_atan2_pd(__m128d y, __m128d x)
{
    const __m128d pi          = _mm_set1_pd(M_PI);
    const __m128d minuspi     = _mm_set1_pd(-M_PI);
    const __m128d halfpi      = _mm_set1_pd(M_PI/2.0);
    const __m128d minushalfpi = _mm_set1_pd(-M_PI/2.0);

    __m128d z,z1,z3,z4;
    __m128d w;
    __m128d maskx_lt,maskx_eq;
    __m128d masky_lt,masky_eq;
    __m128d mask1,mask2,mask3,mask4,maskall;

    maskx_lt  = _mm_cmplt_pd(x,_mm_setzero_pd());
    masky_lt  = _mm_cmplt_pd(y,_mm_setzero_pd());
    maskx_eq  = _mm_cmpeq_pd(x,_mm_setzero_pd());
    masky_eq  = _mm_cmpeq_pd(y,_mm_setzero_pd());

    z         = _mm_mul_pd(y,gmx_mm_inv_pd(x));
    z         = gmx_mm_atan_pd(z);

    mask1     = _mm_and_pd(maskx_eq,masky_lt);
    mask2     = _mm_andnot_pd(maskx_lt,masky_eq);
    mask3     = _mm_andnot_pd( _mm_or_pd(masky_lt,masky_eq) , maskx_eq);
    mask4     = _mm_and_pd(masky_eq,maskx_lt);

    maskall   = _mm_or_pd( _mm_or_pd(mask1,mask2), _mm_or_pd(mask3,mask4) );

    z         = _mm_andnot_pd(maskall,z);
    z1        = _mm_and_pd(mask1,minushalfpi);
    z3        = _mm_and_pd(mask3,halfpi);
    z4        = _mm_and_pd(mask4,pi);

    z         = _mm_or_pd( _mm_or_pd(z,z1), _mm_or_pd(z3,z4) );

    w         = _mm_blendv_pd(pi,minuspi,masky_lt);
    w         = _mm_and_pd(w,maskx_lt);

    w         = _mm_andnot_pd(maskall,w);

    z         = _mm_add_pd(z,w);

    return z;
}
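/* Special-case handling in gmx_mm_atan2_pd(): mask1-mask4 flag the lanes where plain atan(y/x)
 * is not usable (x==0 with y<0, y==0 with x>=0, x==0 with y>0, and y==0 with x<0), and those
 * lanes are overwritten with -pi/2, 0, +pi/2 and pi respectively. For the remaining lanes with
 * x<0, pi (or -pi when y<0) is added to the atan(y/x) result to select the correct quadrant.
 * Everything is done with masked selects, so all lanes always go through gmx_mm_atan_pd().
 */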
#endif /*_gmx_math_x86_avx_128_fma_double_h_ */