/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 *
 * This file is part of GROMACS.
 *
 * Written by the Gromacs development team under coordination of
 * David van der Spoel, Berk Hess, and Erik Lindahl.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org
 *
 * And Hey:
 * Gnomes, ROck Monsters And Chili Sauce
 */
#ifndef _gmx_math_x86_sse4_1_single_h_
#define _gmx_math_x86_sse4_1_single_h_
#include <math.h>

#include "gmx_x86_sse4_1.h"

#ifndef M_PI
#  define M_PI 3.14159265358979323846264338327950288
#endif
/************************
 *                      *
 * Simple math routines *
 *                      *
 ************************/
/* 1.0/sqrt(x), with one Newton-Raphson refinement of the hardware rsqrt estimate */
static gmx_inline __m128
gmx_mm_invsqrt_ps(__m128 x)
{
    const __m128 half  = _mm_set_ps(0.5, 0.5, 0.5, 0.5);
    const __m128 three = _mm_set_ps(3.0, 3.0, 3.0, 3.0);

    __m128 lu = _mm_rsqrt_ps(x);

    return _mm_mul_ps(half, _mm_mul_ps(_mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(lu, lu), x)), lu));
}
/* sqrt(x) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
static gmx_inline __m128
gmx_mm_sqrt_ps(__m128 x)
{
    __m128 mask;
    __m128 res;

    /* Zero the result where x==0, so the inf from invsqrt(0) never propagates */
    mask = _mm_cmpeq_ps(x, _mm_setzero_ps());
    res  = _mm_andnot_ps(mask, gmx_mm_invsqrt_ps(x));

    res  = _mm_mul_ps(x, res);

    return res;
}
/* 1.0/x, with one Newton-Raphson refinement of the hardware rcp estimate */
static gmx_inline __m128
gmx_mm_inv_ps(__m128 x)
{
    const __m128 two = _mm_set_ps(2.0f, 2.0f, 2.0f, 2.0f);

    __m128 lu = _mm_rcp_ps(x);

    return _mm_mul_ps(lu, _mm_sub_ps(two, _mm_mul_ps(lu, x)));
}
static gmx_inline __m128
gmx_mm_abs_ps(__m128 x)
{
    const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );

    return _mm_and_ps(x, signmask);
}
static gmx_inline __m128
gmx_mm_log_ps(__m128 x)
{
    /* Same algorithm as cephes library */
    const __m128  expmask    = gmx_mm_castsi128_ps( _mm_set_epi32(0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000) );
    const __m128i expbase_m1 = _mm_set1_epi32(127-1); /* We want non-IEEE format */
    const __m128  half       = _mm_set1_ps(0.5f);
    const __m128  one        = _mm_set1_ps(1.0f);
    const __m128  invsq2     = _mm_set1_ps(1.0f/sqrt(2.0f));
    const __m128  corr1      = _mm_set1_ps(-2.12194440e-4f);
    const __m128  corr2      = _mm_set1_ps(0.693359375f);

    const __m128  CA_1       = _mm_set1_ps(0.070376836292f);
    const __m128  CB_0       = _mm_set1_ps(1.6714950086782716f);
    const __m128  CB_1       = _mm_set1_ps(-2.452088066061482f);
    const __m128  CC_0       = _mm_set1_ps(1.5220770854701728f);
    const __m128  CC_1       = _mm_set1_ps(-1.3422238433233642f);
    const __m128  CD_0       = _mm_set1_ps(1.386218787509749f);
    const __m128  CD_1       = _mm_set1_ps(0.35075468953796346f);
    const __m128  CE_0       = _mm_set1_ps(1.3429983063133937f);
    const __m128  CE_1       = _mm_set1_ps(1.807420826584643f);

    __m128  fexp, x2, y;
    __m128  mask;
    __m128i iexp;
    __m128  pA, pB, pC, pD, pE, tB, tC, tD, tE;

    /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
    fexp  = _mm_and_ps(x, expmask);
    iexp  = gmx_mm_castps_si128(fexp);
    iexp  = _mm_srli_epi32(iexp, 23);
    iexp  = _mm_sub_epi32(iexp, expbase_m1);

    x     = _mm_andnot_ps(expmask, x);
    x     = _mm_or_ps(x, one);
    x     = _mm_mul_ps(x, half);

    mask  = _mm_cmplt_ps(x, invsq2);

    x     = _mm_add_ps(x, _mm_and_ps(mask, x));
    x     = _mm_sub_ps(x, one);
    iexp  = _mm_add_epi32(iexp, gmx_mm_castps_si128(mask)); /* 0xFFFFFFFF = -1 as int */

    x2    = _mm_mul_ps(x, x);

    pA    = _mm_mul_ps(CA_1, x);
    pB    = _mm_mul_ps(CB_1, x);
    pC    = _mm_mul_ps(CC_1, x);
    pD    = _mm_mul_ps(CD_1, x);
    pE    = _mm_mul_ps(CE_1, x);
    tB    = _mm_add_ps(CB_0, x2);
    tC    = _mm_add_ps(CC_0, x2);
    tD    = _mm_add_ps(CD_0, x2);
    tE    = _mm_add_ps(CE_0, x2);
    pB    = _mm_add_ps(pB, tB);
    pC    = _mm_add_ps(pC, tC);
    pD    = _mm_add_ps(pD, tD);
    pE    = _mm_add_ps(pE, tE);

    pA    = _mm_mul_ps(pA, pB);
    pC    = _mm_mul_ps(pC, pD);
    pE    = _mm_mul_ps(pE, x2);
    pA    = _mm_mul_ps(pA, pC);
    y     = _mm_mul_ps(pA, pE);

    fexp  = _mm_cvtepi32_ps(iexp);
    y     = _mm_add_ps(y, _mm_mul_ps(fexp, corr1));

    y     = _mm_sub_ps(y, _mm_mul_ps(half, x2));
    x2    = _mm_add_ps(x, y);

    x2    = _mm_add_ps(x2, _mm_mul_ps(fexp, corr2));

    return x2;
}
/*
 * The 2^w term is calculated from a (6,0)-th order (no denominator) Minimax polynomial on the interval
 * [-0.5,0.5]. The coefficients of this polynomial were derived in Mathematica using the command:
 *
 * MiniMaxApproximation[(2^x), {x, {-0.5, 0.5}, 6, 0}, WorkingPrecision -> 15]
 *
 * The largest-magnitude exponent we can represent in IEEE single-precision binary format
 * is 2^-126 for small numbers and 2^127 for large ones. To avoid wrap-around problems, we set the
 * result to zero if the argument falls outside this range. For small numbers this is just fine, but
 * for large numbers you could be fancy and return the smallest/largest IEEE single-precision
 * number instead. That would take a few extra cycles and not really help, since something is
 * wrong if you are using single precision to work with numbers that cannot really be represented
 * in single precision.
 *
 * The accuracy is at least 23 bits.
 */
static gmx_inline __m128
gmx_mm_exp2_ps(__m128 x)
{
    /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
    const __m128  arglimit = _mm_set1_ps(126.0f);

    const __m128i expbase  = _mm_set1_epi32(127);
    const __m128  CA6      = _mm_set1_ps(1.535336188319500E-004);
    const __m128  CA5      = _mm_set1_ps(1.339887440266574E-003);
    const __m128  CA4      = _mm_set1_ps(9.618437357674640E-003);
    const __m128  CA3      = _mm_set1_ps(5.550332471162809E-002);
    const __m128  CA2      = _mm_set1_ps(2.402264791363012E-001);
    const __m128  CA1      = _mm_set1_ps(6.931472028550421E-001);
    const __m128  CA0      = _mm_set1_ps(1.0f);

    __m128  valuemask;
    __m128i iexppart;
    __m128  fexppart;
    __m128  intpart;
    __m128  x2;
    __m128  p0, p1;

    iexppart  = _mm_cvtps_epi32(x);
    intpart   = _mm_round_ps(x, _MM_FROUND_TO_NEAREST_INT);
    iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart, expbase), 23);
    valuemask = _mm_cmpge_ps(arglimit, gmx_mm_abs_ps(x));
    fexppart  = _mm_and_ps(valuemask, gmx_mm_castsi128_ps(iexppart));

    x         = _mm_sub_ps(x, intpart);
    x2        = _mm_mul_ps(x, x);

    p0        = _mm_mul_ps(CA6, x2);
    p1        = _mm_mul_ps(CA5, x2);
    p0        = _mm_add_ps(p0, CA4);
    p1        = _mm_add_ps(p1, CA3);
    p0        = _mm_mul_ps(p0, x2);
    p1        = _mm_mul_ps(p1, x2);
    p0        = _mm_add_ps(p0, CA2);
    p1        = _mm_add_ps(p1, CA1);
    p0        = _mm_mul_ps(p0, x2);
    p1        = _mm_mul_ps(p1, x);
    p0        = _mm_add_ps(p0, CA0);
    p0        = _mm_add_ps(p0, p1);
    x         = _mm_mul_ps(p0, fexppart);

    return x;
}
/* Exponential function. This could be calculated from 2^x as Exp(x)=2^(y), where y=log2(e)*x,
 * but there will then be a small rounding error since we lose some precision due to the
 * multiplication. This will then be magnified a lot by the exponential.
 *
 * Instead, we calculate the fractional part directly as a minimax approximation of
 * Exp(z) on [-0.5,0.5]. We use extended precision arithmetic to calculate the fraction
 * remaining after 2^y, which avoids the precision-loss.
 * The final result is correct to within 1 LSB over the entire argument range.
 */
static gmx_inline __m128
gmx_mm_exp_ps(__m128 x)
{
    const __m128  argscale     = _mm_set1_ps(1.44269504088896341f);
    /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
    const __m128  arglimit     = _mm_set1_ps(126.0f);
    const __m128i expbase      = _mm_set1_epi32(127);

    const __m128  invargscale0 = _mm_set1_ps(0.693359375f);
    const __m128  invargscale1 = _mm_set1_ps(-2.12194440e-4f);

    const __m128  CC5          = _mm_set1_ps(1.9875691500e-4f);
    const __m128  CC4          = _mm_set1_ps(1.3981999507e-3f);
    const __m128  CC3          = _mm_set1_ps(8.3334519073e-3f);
    const __m128  CC2          = _mm_set1_ps(4.1665795894e-2f);
    const __m128  CC1          = _mm_set1_ps(1.6666665459e-1f);
    const __m128  CC0          = _mm_set1_ps(5.0000001201e-1f);
    const __m128  one          = _mm_set1_ps(1.0f);

    __m128  y, x2;
    __m128i iexppart;
    __m128  fexppart, intpart, valuemask;
    __m128  p0, p1;

    y         = _mm_mul_ps(x, argscale);

    iexppart  = _mm_cvtps_epi32(y);
    intpart   = _mm_round_ps(y, _MM_FROUND_TO_NEAREST_INT);

    iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart, expbase), 23);
    valuemask = _mm_cmpge_ps(arglimit, gmx_mm_abs_ps(y));
    fexppart  = _mm_and_ps(valuemask, gmx_mm_castsi128_ps(iexppart));

    /* Extended precision arithmetic */
    x         = _mm_sub_ps(x, _mm_mul_ps(invargscale0, intpart));
    x         = _mm_sub_ps(x, _mm_mul_ps(invargscale1, intpart));

    x2        = _mm_mul_ps(x, x);

    p1        = _mm_mul_ps(CC5, x2);
    p0        = _mm_mul_ps(CC4, x2);
    p1        = _mm_add_ps(p1, CC3);
    p0        = _mm_add_ps(p0, CC2);
    p1        = _mm_mul_ps(p1, x2);
    p0        = _mm_mul_ps(p0, x2);
    p1        = _mm_add_ps(p1, CC1);
    p0        = _mm_add_ps(p0, CC0);
    p1        = _mm_mul_ps(p1, x);
    p0        = _mm_add_ps(p0, p1);
    p0        = _mm_mul_ps(p0, x2);
    x         = _mm_add_ps(x, one);
    x         = _mm_add_ps(x, p0);

    x         = _mm_mul_ps(x, fexppart);

    return x;
}
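
/* Usage sketch (illustrative helper, not part of the original header): four
 * Boltzmann factors exp(-E/kT) evaluated at once; 'energy' and 'kT' are
 * assumed to be in matching units.
 */
static gmx_inline __m128
gmx_mm_exp_ps_example(__m128 energy, float kT)
{
    __m128 arg = _mm_mul_ps(energy, _mm_set1_ps(-1.0f/kT));

    return gmx_mm_exp_ps(arg);
}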
/* FULL precision. Only errors in LSB */
static gmx_inline __m128
gmx_mm_erf_ps(__m128 x)
{
    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
    const __m128  CA6      = _mm_set1_ps(7.853861353153693e-5f);
    const __m128  CA5      = _mm_set1_ps(-8.010193625184903e-4f);
    const __m128  CA4      = _mm_set1_ps(5.188327685732524e-3f);
    const __m128  CA3      = _mm_set1_ps(-2.685381193529856e-2f);
    const __m128  CA2      = _mm_set1_ps(1.128358514861418e-1f);
    const __m128  CA1      = _mm_set1_ps(-3.761262582423300e-1f);
    const __m128  CA0      = _mm_set1_ps(1.128379165726710f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
    const __m128  CB9      = _mm_set1_ps(-0.0018629930017603923f);
    const __m128  CB8      = _mm_set1_ps(0.003909821287598495f);
    const __m128  CB7      = _mm_set1_ps(-0.0052094582210355615f);
    const __m128  CB6      = _mm_set1_ps(0.005685614362160572f);
    const __m128  CB5      = _mm_set1_ps(-0.0025367682853477272f);
    const __m128  CB4      = _mm_set1_ps(-0.010199799682318782f);
    const __m128  CB3      = _mm_set1_ps(0.04369575504816542f);
    const __m128  CB2      = _mm_set1_ps(-0.11884063474674492f);
    const __m128  CB1      = _mm_set1_ps(0.2732120154030589f);
    const __m128  CB0      = _mm_set1_ps(0.42758357702025784f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
    const __m128  CC10     = _mm_set1_ps(-0.0445555913112064f);
    const __m128  CC9      = _mm_set1_ps(0.21376355144663348f);
    const __m128  CC8      = _mm_set1_ps(-0.3473187200259257f);
    const __m128  CC7      = _mm_set1_ps(0.016690861551248114f);
    const __m128  CC6      = _mm_set1_ps(0.7560973182491192f);
    const __m128  CC5      = _mm_set1_ps(-1.2137903600145787f);
    const __m128  CC4      = _mm_set1_ps(0.8411872321232948f);
    const __m128  CC3      = _mm_set1_ps(-0.08670413896296343f);
    const __m128  CC2      = _mm_set1_ps(-0.27124782687240334f);
    const __m128  CC1      = _mm_set1_ps(-0.0007502488047806069f);
    const __m128  CC0      = _mm_set1_ps(0.5642114853803148f);

    /* Coefficients for expansion of exp(x) in [0,0.1] */
    /* CD0 and CD1 are both 1.0, so no need to declare them separately */
    const __m128  CD2      = _mm_set1_ps(0.5000066608081202f);
    const __m128  CD3      = _mm_set1_ps(0.1664795422874624f);
    const __m128  CD4      = _mm_set1_ps(0.04379839977652482f);

    const __m128  sieve    = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
    const __m128  signbit  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
    const __m128  one      = _mm_set1_ps(1.0f);
    const __m128  two      = _mm_set1_ps(2.0f);

    __m128 x2, x4, y;
    __m128 z, q, t, t2, w, w2;
    __m128 mask, expmx2, corr;
    __m128 pA0, pA1, pB0, pB1, pC0, pC1;
    __m128 res_erf, res_erfc, res;

    /* Calculate erf() */
    x2     = _mm_mul_ps(x, x);
    x4     = _mm_mul_ps(x2, x2);

    pA0    = _mm_mul_ps(CA6, x4);
    pA1    = _mm_mul_ps(CA5, x4);
    pA0    = _mm_add_ps(pA0, CA4);
    pA1    = _mm_add_ps(pA1, CA3);
    pA0    = _mm_mul_ps(pA0, x4);
    pA1    = _mm_mul_ps(pA1, x4);
    pA0    = _mm_add_ps(pA0, CA2);
    pA1    = _mm_add_ps(pA1, CA1);
    pA0    = _mm_mul_ps(pA0, x4);
    pA1    = _mm_mul_ps(pA1, x2);
    pA0    = _mm_add_ps(pA0, pA1);
    pA0    = _mm_add_ps(pA0, CA0);

    res_erf = _mm_mul_ps(x, pA0);

    /* Calculate erfc */
    y      = gmx_mm_abs_ps(x);
    t      = gmx_mm_inv_ps(y);
    w      = _mm_sub_ps(t, one);
    t2     = _mm_mul_ps(t, t);
    w2     = _mm_mul_ps(w, w);
    /*
     * We cannot simply calculate exp(-x2) directly in single precision, since
     * that will lose a couple of bits of precision due to the multiplication.
     * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
     * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
     *
     * The only drawback with this is that it requires TWO separate exponential
     * evaluations, which would be horrible performance-wise. However, the argument
     * for the second exp() call is always small, so there we simply use a
     * low-order minimax expansion on [0,0.1].
     */

    z      = _mm_and_ps(y, sieve);
    q      = _mm_mul_ps( _mm_sub_ps(z, y), _mm_add_ps(z, y) );

    corr   = _mm_mul_ps(CD4, q);
    corr   = _mm_add_ps(corr, CD3);
    corr   = _mm_mul_ps(corr, q);
    corr   = _mm_add_ps(corr, CD2);
    corr   = _mm_mul_ps(corr, q);
    corr   = _mm_add_ps(corr, one);
    corr   = _mm_mul_ps(corr, q);
    corr   = _mm_add_ps(corr, one);

    expmx2 = gmx_mm_exp_ps( _mm_or_ps( signbit, _mm_mul_ps(z, z) ) );
    expmx2 = _mm_mul_ps(expmx2, corr);

    pB1    = _mm_mul_ps(CB9, w2);
    pB0    = _mm_mul_ps(CB8, w2);
    pB1    = _mm_add_ps(pB1, CB7);
    pB0    = _mm_add_ps(pB0, CB6);
    pB1    = _mm_mul_ps(pB1, w2);
    pB0    = _mm_mul_ps(pB0, w2);
    pB1    = _mm_add_ps(pB1, CB5);
    pB0    = _mm_add_ps(pB0, CB4);
    pB1    = _mm_mul_ps(pB1, w2);
    pB0    = _mm_mul_ps(pB0, w2);
    pB1    = _mm_add_ps(pB1, CB3);
    pB0    = _mm_add_ps(pB0, CB2);
    pB1    = _mm_mul_ps(pB1, w2);
    pB0    = _mm_mul_ps(pB0, w2);
    pB1    = _mm_add_ps(pB1, CB1);
    pB1    = _mm_mul_ps(pB1, w);
    pB0    = _mm_add_ps(pB0, pB1);
    pB0    = _mm_add_ps(pB0, CB0);

    pC0    = _mm_mul_ps(CC10, t2);
    pC1    = _mm_mul_ps(CC9, t2);
    pC0    = _mm_add_ps(pC0, CC8);
    pC1    = _mm_add_ps(pC1, CC7);
    pC0    = _mm_mul_ps(pC0, t2);
    pC1    = _mm_mul_ps(pC1, t2);
    pC0    = _mm_add_ps(pC0, CC6);
    pC1    = _mm_add_ps(pC1, CC5);
    pC0    = _mm_mul_ps(pC0, t2);
    pC1    = _mm_mul_ps(pC1, t2);
    pC0    = _mm_add_ps(pC0, CC4);
    pC1    = _mm_add_ps(pC1, CC3);
    pC0    = _mm_mul_ps(pC0, t2);
    pC1    = _mm_mul_ps(pC1, t2);
    pC0    = _mm_add_ps(pC0, CC2);
    pC1    = _mm_add_ps(pC1, CC1);
    pC0    = _mm_mul_ps(pC0, t2);
    pC1    = _mm_mul_ps(pC1, t);
    pC0    = _mm_add_ps(pC0, pC1);
    pC0    = _mm_add_ps(pC0, CC0);
    pC0    = _mm_mul_ps(pC0, t);

    /* SELECT pB0 or pC0 for erfc() */
    mask     = _mm_cmplt_ps(two, y);
    res_erfc = _mm_blendv_ps(pB0, pC0, mask);
    res_erfc = _mm_mul_ps(res_erfc, expmx2);

    /* erfc(x<0) = 2-erfc(|x|) */
    mask     = _mm_cmplt_ps(x, _mm_setzero_ps());
    res_erfc = _mm_blendv_ps(res_erfc, _mm_sub_ps(two, res_erfc), mask);

    /* Select erf() or erfc() */
    mask     = _mm_cmplt_ps(y, _mm_set1_ps(0.75f));
    res      = _mm_blendv_ps(_mm_sub_ps(one, res_erfc), res_erf, mask);

    return res;
}
/* FULL precision. Only errors in LSB */
static gmx_inline __m128
gmx_mm_erfc_ps(__m128 x)
{
    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
    const __m128  CA6      = _mm_set1_ps(7.853861353153693e-5f);
    const __m128  CA5      = _mm_set1_ps(-8.010193625184903e-4f);
    const __m128  CA4      = _mm_set1_ps(5.188327685732524e-3f);
    const __m128  CA3      = _mm_set1_ps(-2.685381193529856e-2f);
    const __m128  CA2      = _mm_set1_ps(1.128358514861418e-1f);
    const __m128  CA1      = _mm_set1_ps(-3.761262582423300e-1f);
    const __m128  CA0      = _mm_set1_ps(1.128379165726710f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
    const __m128  CB9      = _mm_set1_ps(-0.0018629930017603923f);
    const __m128  CB8      = _mm_set1_ps(0.003909821287598495f);
    const __m128  CB7      = _mm_set1_ps(-0.0052094582210355615f);
    const __m128  CB6      = _mm_set1_ps(0.005685614362160572f);
    const __m128  CB5      = _mm_set1_ps(-0.0025367682853477272f);
    const __m128  CB4      = _mm_set1_ps(-0.010199799682318782f);
    const __m128  CB3      = _mm_set1_ps(0.04369575504816542f);
    const __m128  CB2      = _mm_set1_ps(-0.11884063474674492f);
    const __m128  CB1      = _mm_set1_ps(0.2732120154030589f);
    const __m128  CB0      = _mm_set1_ps(0.42758357702025784f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
    const __m128  CC10     = _mm_set1_ps(-0.0445555913112064f);
    const __m128  CC9      = _mm_set1_ps(0.21376355144663348f);
    const __m128  CC8      = _mm_set1_ps(-0.3473187200259257f);
    const __m128  CC7      = _mm_set1_ps(0.016690861551248114f);
    const __m128  CC6      = _mm_set1_ps(0.7560973182491192f);
    const __m128  CC5      = _mm_set1_ps(-1.2137903600145787f);
    const __m128  CC4      = _mm_set1_ps(0.8411872321232948f);
    const __m128  CC3      = _mm_set1_ps(-0.08670413896296343f);
    const __m128  CC2      = _mm_set1_ps(-0.27124782687240334f);
    const __m128  CC1      = _mm_set1_ps(-0.0007502488047806069f);
    const __m128  CC0      = _mm_set1_ps(0.5642114853803148f);

    /* Coefficients for expansion of exp(x) in [0,0.1] */
    /* CD0 and CD1 are both 1.0, so no need to declare them separately */
    const __m128  CD2      = _mm_set1_ps(0.5000066608081202f);
    const __m128  CD3      = _mm_set1_ps(0.1664795422874624f);
    const __m128  CD4      = _mm_set1_ps(0.04379839977652482f);

    const __m128  sieve    = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
    const __m128  signbit  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
    const __m128  one      = _mm_set1_ps(1.0f);
    const __m128  two      = _mm_set1_ps(2.0f);

    __m128 x2, x4, y;
    __m128 z, q, t, t2, w, w2;
    __m128 mask, expmx2, corr;
    __m128 pA0, pA1, pB0, pB1, pC0, pC1;
    __m128 res_erf, res_erfc, res;

    /* Calculate erf() */
    x2     = _mm_mul_ps(x, x);
    x4     = _mm_mul_ps(x2, x2);

    pA0    = _mm_mul_ps(CA6, x4);
    pA1    = _mm_mul_ps(CA5, x4);
    pA0    = _mm_add_ps(pA0, CA4);
    pA1    = _mm_add_ps(pA1, CA3);
    pA0    = _mm_mul_ps(pA0, x4);
    pA1    = _mm_mul_ps(pA1, x4);
    pA0    = _mm_add_ps(pA0, CA2);
    pA1    = _mm_add_ps(pA1, CA1);
    pA0    = _mm_mul_ps(pA0, x4);
    pA1    = _mm_mul_ps(pA1, x2);
    pA0    = _mm_add_ps(pA0, pA1);
    pA0    = _mm_add_ps(pA0, CA0);

    res_erf = _mm_mul_ps(x, pA0);

    /* Calculate erfc */
    y      = gmx_mm_abs_ps(x);
    t      = gmx_mm_inv_ps(y);
    w      = _mm_sub_ps(t, one);
    t2     = _mm_mul_ps(t, t);
    w2     = _mm_mul_ps(w, w);
    /*
     * We cannot simply calculate exp(-x2) directly in single precision, since
     * that will lose a couple of bits of precision due to the multiplication.
     * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
     * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
     *
     * The only drawback with this is that it requires TWO separate exponential
     * evaluations, which would be horrible performance-wise. However, the argument
     * for the second exp() call is always small, so there we simply use a
     * low-order minimax expansion on [0,0.1].
     */

    z      = _mm_and_ps(y, sieve);
    q      = _mm_mul_ps( _mm_sub_ps(z, y), _mm_add_ps(z, y) );

    corr   = _mm_mul_ps(CD4, q);
    corr   = _mm_add_ps(corr, CD3);
    corr   = _mm_mul_ps(corr, q);
    corr   = _mm_add_ps(corr, CD2);
    corr   = _mm_mul_ps(corr, q);
    corr   = _mm_add_ps(corr, one);
    corr   = _mm_mul_ps(corr, q);
    corr   = _mm_add_ps(corr, one);

    expmx2 = gmx_mm_exp_ps( _mm_or_ps( signbit, _mm_mul_ps(z, z) ) );
    expmx2 = _mm_mul_ps(expmx2, corr);

    pB1    = _mm_mul_ps(CB9, w2);
    pB0    = _mm_mul_ps(CB8, w2);
    pB1    = _mm_add_ps(pB1, CB7);
    pB0    = _mm_add_ps(pB0, CB6);
    pB1    = _mm_mul_ps(pB1, w2);
    pB0    = _mm_mul_ps(pB0, w2);
    pB1    = _mm_add_ps(pB1, CB5);
    pB0    = _mm_add_ps(pB0, CB4);
    pB1    = _mm_mul_ps(pB1, w2);
    pB0    = _mm_mul_ps(pB0, w2);
    pB1    = _mm_add_ps(pB1, CB3);
    pB0    = _mm_add_ps(pB0, CB2);
    pB1    = _mm_mul_ps(pB1, w2);
    pB0    = _mm_mul_ps(pB0, w2);
    pB1    = _mm_add_ps(pB1, CB1);
    pB1    = _mm_mul_ps(pB1, w);
    pB0    = _mm_add_ps(pB0, pB1);
    pB0    = _mm_add_ps(pB0, CB0);

    pC0    = _mm_mul_ps(CC10, t2);
    pC1    = _mm_mul_ps(CC9, t2);
    pC0    = _mm_add_ps(pC0, CC8);
    pC1    = _mm_add_ps(pC1, CC7);
    pC0    = _mm_mul_ps(pC0, t2);
    pC1    = _mm_mul_ps(pC1, t2);
    pC0    = _mm_add_ps(pC0, CC6);
    pC1    = _mm_add_ps(pC1, CC5);
    pC0    = _mm_mul_ps(pC0, t2);
    pC1    = _mm_mul_ps(pC1, t2);
    pC0    = _mm_add_ps(pC0, CC4);
    pC1    = _mm_add_ps(pC1, CC3);
    pC0    = _mm_mul_ps(pC0, t2);
    pC1    = _mm_mul_ps(pC1, t2);
    pC0    = _mm_add_ps(pC0, CC2);
    pC1    = _mm_add_ps(pC1, CC1);
    pC0    = _mm_mul_ps(pC0, t2);
    pC1    = _mm_mul_ps(pC1, t);
    pC0    = _mm_add_ps(pC0, pC1);
    pC0    = _mm_add_ps(pC0, CC0);
    pC0    = _mm_mul_ps(pC0, t);

    /* SELECT pB0 or pC0 for erfc() */
    mask     = _mm_cmplt_ps(two, y);
    res_erfc = _mm_blendv_ps(pB0, pC0, mask);
    res_erfc = _mm_mul_ps(res_erfc, expmx2);

    /* erfc(x<0) = 2-erfc(|x|) */
    mask     = _mm_cmplt_ps(x, _mm_setzero_ps());
    res_erfc = _mm_blendv_ps(res_erfc, _mm_sub_ps(two, res_erfc), mask);

    /* Select erf() or erfc() */
    mask     = _mm_cmplt_ps(y, _mm_set1_ps(0.75f));
    res      = _mm_blendv_ps(res_erfc, _mm_sub_ps(one, res_erf), mask);

    return res;
}
/* Calculate the force correction due to PME analytically.
 *
 * This routine is meant to enable analytical evaluation of the
 * direct-space PME electrostatic force to avoid tables.
 *
 * The direct-space potential should be Erfc(beta*r)/r, but there
 * are some problems evaluating that:
 *
 * First, the error function is difficult (read: expensive) to
 * approximate accurately for intermediate to large arguments, and
 * this happens already in ranges of beta*r that occur in simulations.
 * Second, we now try to avoid calculating potentials in Gromacs but
 * use forces directly.
 *
 * We can simplify things slightly by noting that the PME part is really
 * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
 *
 * V = 1/r - Erf(beta*r)/r
 *
 * The first term we already have from the inverse square root, so
 * that we can leave out of this routine.
 *
 * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
 * the argument beta*r will be in the range 0.15 to ~4. Use your
 * favorite plotting program to realize how well-behaved Erf(z)/z is
 * in this range!
 *
 * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
 * However, it turns out it is more efficient to approximate f(z)/z and
 * then only use even powers. This is another minor optimization, since
 * we actually WANT f(z)/z, because it is going to be multiplied by
 * the vector between the two atoms to get the vectorial force. The
 * fastest flops are the ones we can avoid calculating!
 *
 * So, here's how it should be used:
 *
 * 1. Calculate r^2.
 * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
 * 3. Evaluate this routine with z^2 as the argument.
 * 4. The return value is the expression:
 *
 *        2*exp(-z^2)     erf(z)
 *       ------------- - --------
 *       sqrt(Pi)*z^2      z^3
 *
 * 5. Multiply the entire expression by beta^3. This will get you
 *
 *       beta^3*2*exp(-z^2)     beta^3*erf(z)
 *       ------------------  - ---------------
 *           sqrt(Pi)*z^2            z^3
 *
 *    or, switching back to r (z=r*beta):
 *
 *       2*beta*exp(-r^2*beta^2)   erf(r*beta)
 *       ----------------------- - -----------
 *            sqrt(Pi)*r^2             r^3
 *
 *    With a bit of math exercise you should be able to confirm that
 *    this is exactly D[Erf[beta*r]/r,r] divided by r another time.
 *
 * 6. Add the result to 1/r^3, multiply by the product of the charges,
 *    and you have your force (divided by r). A final multiplication
 *    with the vector connecting the two particles and you have your
 *    vectorial force to add to the particles.
 */
static gmx_inline __m128
gmx_mm_pmecorrF_ps(__m128 z2)
{
    const __m128  FN6      = _mm_set1_ps(-1.7357322914161492954e-8f);
    const __m128  FN5      = _mm_set1_ps(1.4703624142580877519e-6f);
    const __m128  FN4      = _mm_set1_ps(-0.000053401640219807709149f);
    const __m128  FN3      = _mm_set1_ps(0.0010054721316683106153f);
    const __m128  FN2      = _mm_set1_ps(-0.019278317264888380590f);
    const __m128  FN1      = _mm_set1_ps(0.069670166153766424023f);
    const __m128  FN0      = _mm_set1_ps(-0.75225204789749321333f);

    const __m128  FD4      = _mm_set1_ps(0.0011193462567257629232f);
    const __m128  FD3      = _mm_set1_ps(0.014866955030185295499f);
    const __m128  FD2      = _mm_set1_ps(0.11583842382862377919f);
    const __m128  FD1      = _mm_set1_ps(0.50736591960530292870f);
    const __m128  FD0      = _mm_set1_ps(1.0f);

    __m128 z4;
    __m128 polyFN0, polyFN1, polyFD0, polyFD1;

    z4       = _mm_mul_ps(z2, z2);

    polyFD0  = _mm_mul_ps(FD4, z4);
    polyFD1  = _mm_mul_ps(FD3, z4);
    polyFD0  = _mm_add_ps(polyFD0, FD2);
    polyFD1  = _mm_add_ps(polyFD1, FD1);
    polyFD0  = _mm_mul_ps(polyFD0, z4);
    polyFD1  = _mm_mul_ps(polyFD1, z2);
    polyFD0  = _mm_add_ps(polyFD0, FD0);
    polyFD0  = _mm_add_ps(polyFD0, polyFD1);

    polyFD0  = gmx_mm_inv_ps(polyFD0);

    polyFN0  = _mm_mul_ps(FN6, z4);
    polyFN1  = _mm_mul_ps(FN5, z4);
    polyFN0  = _mm_add_ps(polyFN0, FN4);
    polyFN1  = _mm_add_ps(polyFN1, FN3);
    polyFN0  = _mm_mul_ps(polyFN0, z4);
    polyFN1  = _mm_mul_ps(polyFN1, z4);
    polyFN0  = _mm_add_ps(polyFN0, FN2);
    polyFN1  = _mm_add_ps(polyFN1, FN1);
    polyFN0  = _mm_mul_ps(polyFN0, z4);
    polyFN1  = _mm_mul_ps(polyFN1, z2);
    polyFN0  = _mm_add_ps(polyFN0, FN0);
    polyFN0  = _mm_add_ps(polyFN0, polyFN1);

    return _mm_mul_ps(polyFN0, polyFD0);
}
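
/* Usage sketch following steps 1-6 above (the helper name and its arguments
 * are illustrative assumptions, not part of the original header). rsq holds
 * r^2 for four pairs, qq the products of the charges, and beta the Ewald
 * parameter. The return value is the scalar force divided by r; multiply it
 * by the distance vector to obtain the vectorial force.
 */
static gmx_inline __m128
gmx_mm_pmecorrF_ps_example(__m128 rsq, __m128 qq, float beta)
{
    const __m128 beta2 = _mm_set1_ps(beta*beta);
    const __m128 beta3 = _mm_set1_ps(beta*beta*beta);

    __m128 rinv   = gmx_mm_invsqrt_ps(rsq);        /* 1/r */
    __m128 rinvsq = _mm_mul_ps(rinv, rinv);        /* 1/r^2 */
    __m128 z2     = _mm_mul_ps(beta2, rsq);        /* steps 1-2: z^2 = beta^2*r^2 */
    __m128 corr   = gmx_mm_pmecorrF_ps(z2);        /* steps 3-4 */
    __m128 fscal  = _mm_mul_ps(rinv, rinvsq);      /* 1/r^3 */

    /* Steps 5-6: add beta^3 times the correction to 1/r^3, scale by charges */
    fscal = _mm_add_ps(fscal, _mm_mul_ps(beta3, corr));

    return _mm_mul_ps(qq, fscal);
}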
/* Calculate the potential correction due to PME analytically.
 *
 * See gmx_mm_pmecorrF_ps() for details about the approximation.
 *
 * This routine calculates Erf(z)/z, although you should provide z^2
 * as the input argument.
 *
 * Here's how it should be used:
 *
 * 1. Calculate r^2.
 * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
 * 3. Evaluate this routine with z^2 as the argument.
 * 4. The return value is the expression:
 *
 *        erf(z)
 *       --------
 *          z
 *
 * 5. Multiply the entire expression by beta and switch back to r (z=r*beta):
 *
 *       erf(r*beta)
 *       -----------
 *           r
 *
 * 6. Subtract the result from 1/r, multiply by the product of the charges,
 *    and you have your potential.
 */
static gmx_inline __m128
gmx_mm_pmecorrV_ps(__m128 z2)
{
    const __m128  VN6      = _mm_set1_ps(1.9296833005951166339e-8f);
    const __m128  VN5      = _mm_set1_ps(-1.4213390571557850962e-6f);
    const __m128  VN4      = _mm_set1_ps(0.000041603292906656984871f);
    const __m128  VN3      = _mm_set1_ps(-0.00013134036773265025626f);
    const __m128  VN2      = _mm_set1_ps(0.038657983986041781264f);
    const __m128  VN1      = _mm_set1_ps(0.11285044772717598220f);
    const __m128  VN0      = _mm_set1_ps(1.1283802385263030286f);

    const __m128  VD3      = _mm_set1_ps(0.0066752224023576045451f);
    const __m128  VD2      = _mm_set1_ps(0.078647795836373922256f);
    const __m128  VD1      = _mm_set1_ps(0.43336185284710920150f);
    const __m128  VD0      = _mm_set1_ps(1.0f);

    __m128 z4;
    __m128 polyVN0, polyVN1, polyVD0, polyVD1;

    z4       = _mm_mul_ps(z2, z2);

    polyVD1  = _mm_mul_ps(VD3, z4);
    polyVD0  = _mm_mul_ps(VD2, z4);
    polyVD1  = _mm_add_ps(polyVD1, VD1);
    polyVD0  = _mm_add_ps(polyVD0, VD0);
    polyVD1  = _mm_mul_ps(polyVD1, z2);
    polyVD0  = _mm_add_ps(polyVD0, polyVD1);

    polyVD0  = gmx_mm_inv_ps(polyVD0);

    polyVN0  = _mm_mul_ps(VN6, z4);
    polyVN1  = _mm_mul_ps(VN5, z4);
    polyVN0  = _mm_add_ps(polyVN0, VN4);
    polyVN1  = _mm_add_ps(polyVN1, VN3);
    polyVN0  = _mm_mul_ps(polyVN0, z4);
    polyVN1  = _mm_mul_ps(polyVN1, z4);
    polyVN0  = _mm_add_ps(polyVN0, VN2);
    polyVN1  = _mm_add_ps(polyVN1, VN1);
    polyVN0  = _mm_mul_ps(polyVN0, z4);
    polyVN1  = _mm_mul_ps(polyVN1, z2);
    polyVN0  = _mm_add_ps(polyVN0, VN0);
    polyVN0  = _mm_add_ps(polyVN0, polyVN1);

    return _mm_mul_ps(polyVN0, polyVD0);
}
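
/* Usage sketch following steps 1-6 above (illustrative only; the helper name
 * and its arguments are assumptions). Returns the direct-space potential
 * qq*erfc(beta*r)/r for four pairs at once.
 */
static gmx_inline __m128
gmx_mm_pmecorrV_ps_example(__m128 rsq, __m128 qq, float beta)
{
    const __m128 vbeta = _mm_set1_ps(beta);

    __m128 rinv = gmx_mm_invsqrt_ps(rsq);                   /* 1/r */
    __m128 z2   = _mm_mul_ps(_mm_set1_ps(beta*beta), rsq);  /* steps 1-2 */
    __m128 corr = gmx_mm_pmecorrV_ps(z2);                   /* steps 3-4: erf(z)/z */

    /* Steps 5-6: V = qq*(1/r - beta*erf(z)/z) = qq*erfc(beta*r)/r */
    return _mm_mul_ps(qq, _mm_sub_ps(rinv, _mm_mul_ps(vbeta, corr)));
}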
/* Compute sin(x) and cos(x) simultaneously */
static gmx_inline void
gmx_mm_sincos_ps(__m128 x,
                 __m128 *sinval,
                 __m128 *cosval)
{
    const __m128  two_over_pi = _mm_set1_ps(2.0/M_PI);
    const __m128  half        = _mm_set1_ps(0.5);
    const __m128  one         = _mm_set1_ps(1.0);

    const __m128i izero       = _mm_set1_epi32(0);
    const __m128i ione        = _mm_set1_epi32(1);
    const __m128i itwo        = _mm_set1_epi32(2);
    const __m128i ithree      = _mm_set1_epi32(3);
    const __m128  signbit     = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );

    const __m128  CA1         = _mm_set1_ps(1.5703125f);
    const __m128  CA2         = _mm_set1_ps(4.837512969970703125e-4f);
    const __m128  CA3         = _mm_set1_ps(7.54978995489188216e-8f);

    const __m128  CC0         = _mm_set1_ps(-0.0013602249f);
    const __m128  CC1         = _mm_set1_ps(0.0416566950f);
    const __m128  CC2         = _mm_set1_ps(-0.4999990225f);
    const __m128  CS0         = _mm_set1_ps(-0.0001950727f);
    const __m128  CS1         = _mm_set1_ps(0.0083320758f);
    const __m128  CS2         = _mm_set1_ps(-0.1666665247f);

    __m128  y, y2;
    __m128  z;
    __m128i iz;
    __m128i offset_sin, offset_cos;
    __m128  tmp1, tmp2;
    __m128  mask_sin, mask_cos;
    __m128  tmp_sin, tmp_cos;

    y          = _mm_mul_ps(x, two_over_pi);
    y          = _mm_add_ps(y, _mm_or_ps(_mm_and_ps(y, signbit), half));

    iz         = _mm_cvttps_epi32(y);
    z          = _mm_round_ps(y, _MM_FROUND_TO_ZERO);

    offset_sin = _mm_and_si128(iz, ithree);
    offset_cos = _mm_add_epi32(iz, ione);

    /* Extended precision arithmetic to achieve full precision */
    y          = _mm_mul_ps(z, CA1);
    tmp1       = _mm_mul_ps(z, CA2);
    tmp2       = _mm_mul_ps(z, CA3);
    y          = _mm_sub_ps(x, y);
    y          = _mm_sub_ps(y, tmp1);
    y          = _mm_sub_ps(y, tmp2);

    y2         = _mm_mul_ps(y, y);

    tmp1       = _mm_mul_ps(CC0, y2);
    tmp1       = _mm_add_ps(tmp1, CC1);
    tmp2       = _mm_mul_ps(CS0, y2);
    tmp2       = _mm_add_ps(tmp2, CS1);
    tmp1       = _mm_mul_ps(tmp1, y2);
    tmp1       = _mm_add_ps(tmp1, CC2);
    tmp2       = _mm_mul_ps(tmp2, y2);
    tmp2       = _mm_add_ps(tmp2, CS2);

    tmp1       = _mm_mul_ps(tmp1, y2);
    tmp1       = _mm_add_ps(tmp1, one);

    tmp2       = _mm_mul_ps(tmp2, _mm_mul_ps(y, y2));
    tmp2       = _mm_add_ps(tmp2, y);

    mask_sin   = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin, ione), izero));
    mask_cos   = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos, ione), izero));

    tmp_sin    = _mm_blendv_ps(tmp1, tmp2, mask_sin);
    tmp_cos    = _mm_blendv_ps(tmp1, tmp2, mask_cos);

    mask_sin   = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin, itwo), izero));
    mask_cos   = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos, itwo), izero));

    tmp1       = _mm_xor_ps(signbit, tmp_sin);
    tmp2       = _mm_xor_ps(signbit, tmp_cos);

    *sinval    = _mm_blendv_ps(tmp1, tmp_sin, mask_sin);
    *cosval    = _mm_blendv_ps(tmp2, tmp_cos, mask_cos);
}
/*
 * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
 * will then call the sincos() routine and waste a factor 2 in performance!
 */
static gmx_inline __m128
gmx_mm_sin_ps(__m128 x)
{
    __m128 s, c;
    gmx_mm_sincos_ps(x, &s, &c);
    return s;
}
/*
 * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
 * will then call the sincos() routine and waste a factor 2 in performance!
 */
static gmx_inline __m128
gmx_mm_cos_ps(__m128 x)
{
    __m128 s, c;
    gmx_mm_sincos_ps(x, &s, &c);
    return c;
}
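
/* Sketch of the recommended pattern (helper name illustrative): when both
 * sin(x) and cos(x) are needed, e.g. for a 2D rotation, call
 * gmx_mm_sincos_ps() once rather than gmx_mm_sin_ps() plus gmx_mm_cos_ps().
 */
static gmx_inline void
gmx_mm_sincos_ps_example(__m128 angle, __m128 *vx, __m128 *vy)
{
    __m128 s, c, xnew;

    gmx_mm_sincos_ps(angle, &s, &c);    /* one evaluation gives both */

    xnew = _mm_sub_ps(_mm_mul_ps(c, *vx), _mm_mul_ps(s, *vy));
    *vy  = _mm_add_ps(_mm_mul_ps(s, *vx), _mm_mul_ps(c, *vy));
    *vx  = xnew;
}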
static gmx_inline __m128
gmx_mm_tan_ps(__m128 x)
{
    __m128 sinval, cosval;
    __m128 tanval;

    gmx_mm_sincos_ps(x, &sinval, &cosval);

    tanval = _mm_mul_ps(sinval, gmx_mm_inv_ps(cosval));

    return tanval;
}
static gmx_inline __m128
gmx_mm_asin_ps(__m128 x)
{
    /* Same algorithm as cephes library */
    const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
    const __m128 limitlow = _mm_set1_ps(1e-4f);
    const __m128 half     = _mm_set1_ps(0.5f);
    const __m128 one      = _mm_set1_ps(1.0f);
    const __m128 halfpi   = _mm_set1_ps(M_PI/2.0f);

    const __m128 CC5      = _mm_set1_ps(4.2163199048E-2f);
    const __m128 CC4      = _mm_set1_ps(2.4181311049E-2f);
    const __m128 CC3      = _mm_set1_ps(4.5470025998E-2f);
    const __m128 CC2      = _mm_set1_ps(7.4953002686E-2f);
    const __m128 CC1      = _mm_set1_ps(1.6666752422E-1f);

    __m128 sign;
    __m128 mask;
    __m128 xabs;
    __m128 z, z1, z2, q, q1, q2;
    __m128 pA, pB;

    sign  = _mm_andnot_ps(signmask, x);
    xabs  = _mm_and_ps(x, signmask);

    mask  = _mm_cmpgt_ps(xabs, half);

    z1    = _mm_mul_ps(half, _mm_sub_ps(one, xabs));
    q1    = _mm_mul_ps(z1, gmx_mm_invsqrt_ps(z1));
    q1    = _mm_andnot_ps(_mm_cmpeq_ps(xabs, one), q1);

    q2    = xabs;
    z2    = _mm_mul_ps(q2, q2);

    z     = _mm_or_ps( _mm_and_ps(mask, z1), _mm_andnot_ps(mask, z2) );
    q     = _mm_or_ps( _mm_and_ps(mask, q1), _mm_andnot_ps(mask, q2) );

    z2    = _mm_mul_ps(z, z);

    pA    = _mm_mul_ps(CC5, z2);
    pB    = _mm_mul_ps(CC4, z2);

    pA    = _mm_add_ps(pA, CC3);
    pB    = _mm_add_ps(pB, CC2);

    pA    = _mm_mul_ps(pA, z2);
    pB    = _mm_mul_ps(pB, z2);

    pA    = _mm_add_ps(pA, CC1);
    pA    = _mm_mul_ps(pA, z);

    z     = _mm_add_ps(pA, pB);
    z     = _mm_mul_ps(z, q);
    z     = _mm_add_ps(z, q);

    q2    = _mm_sub_ps(halfpi, z);
    q2    = _mm_sub_ps(q2, z);

    z     = _mm_or_ps( _mm_and_ps(mask, q2), _mm_andnot_ps(mask, z) );

    mask  = _mm_cmpgt_ps(xabs, limitlow);
    z     = _mm_or_ps( _mm_and_ps(mask, z), _mm_andnot_ps(mask, xabs) );

    z     = _mm_xor_ps(z, sign);

    return z;
}
static gmx_inline __m128
gmx_mm_acos_ps(__m128 x)
{
    const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
    const __m128 one_ps    = _mm_set1_ps(1.0f);
    const __m128 half_ps   = _mm_set1_ps(0.5f);
    const __m128 pi_ps     = _mm_set1_ps(M_PI);
    const __m128 halfpi_ps = _mm_set1_ps(M_PI/2.0f);

    __m128 mask1;
    __m128 mask2;
    __m128 xabs;
    __m128 z, z1, z2, z3;

    xabs  = _mm_and_ps(x, signmask);
    mask1 = _mm_cmpgt_ps(xabs, half_ps);
    mask2 = _mm_cmpgt_ps(x, _mm_setzero_ps());

    z     = _mm_mul_ps(half_ps, _mm_sub_ps(one_ps, xabs));
    z     = _mm_mul_ps(z, gmx_mm_invsqrt_ps(z));
    z     = _mm_andnot_ps(_mm_cmpeq_ps(xabs, one_ps), z);

    z     = _mm_blendv_ps(x, z, mask1);
    z     = gmx_mm_asin_ps(z);

    z2    = _mm_add_ps(z, z);
    z1    = _mm_sub_ps(pi_ps, z2);
    z3    = _mm_sub_ps(halfpi_ps, z);

    z     = _mm_blendv_ps(z1, z2, mask2);
    z     = _mm_blendv_ps(z3, z, mask1);

    return z;
}
static gmx_inline __m128
gmx_mm_atan_ps(__m128 x)
{
    /* Same algorithm as cephes library */
    const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
    const __m128 limit1    = _mm_set1_ps(0.414213562373095f);
    const __m128 limit2    = _mm_set1_ps(2.414213562373095f);
    const __m128 quarterpi = _mm_set1_ps(0.785398163397448f);
    const __m128 halfpi    = _mm_set1_ps(1.570796326794896f);
    const __m128 mone      = _mm_set1_ps(-1.0f);
    const __m128 CC3       = _mm_set1_ps(-3.33329491539E-1f);
    const __m128 CC5       = _mm_set1_ps(1.99777106478E-1f);
    const __m128 CC7       = _mm_set1_ps(-1.38776856032E-1f);
    const __m128 CC9       = _mm_set1_ps(8.05374449538e-2f);

    __m128 sign;
    __m128 mask1, mask2;
    __m128 y, z1, z2;
    __m128 x2, x4;
    __m128 sum1, sum2;

    sign  = _mm_andnot_ps(signmask, x);
    x     = _mm_and_ps(x, signmask);

    mask1 = _mm_cmpgt_ps(x, limit1);
    mask2 = _mm_cmpgt_ps(x, limit2);

    z1    = _mm_mul_ps(_mm_add_ps(x, mone), gmx_mm_inv_ps(_mm_sub_ps(x, mone)));
    z2    = _mm_mul_ps(mone, gmx_mm_inv_ps(x));

    y     = _mm_and_ps(mask1, quarterpi);
    y     = _mm_blendv_ps(y, halfpi, mask2);

    x     = _mm_blendv_ps(x, z1, mask1);
    x     = _mm_blendv_ps(x, z2, mask2);

    x2    = _mm_mul_ps(x, x);
    x4    = _mm_mul_ps(x2, x2);

    sum1  = _mm_mul_ps(CC9, x4);
    sum2  = _mm_mul_ps(CC7, x4);
    sum1  = _mm_add_ps(sum1, CC5);
    sum2  = _mm_add_ps(sum2, CC3);
    sum1  = _mm_mul_ps(sum1, x4);
    sum2  = _mm_mul_ps(sum2, x2);

    sum1  = _mm_add_ps(sum1, sum2);
    sum1  = _mm_sub_ps(sum1, mone);
    sum1  = _mm_mul_ps(sum1, x);
    y     = _mm_add_ps(y, sum1);

    y     = _mm_xor_ps(y, sign);

    return y;
}
static gmx_inline __m128
gmx_mm_atan2_ps(__m128 y, __m128 x)
{
    const __m128 pi          = _mm_set1_ps(M_PI);
    const __m128 minuspi     = _mm_set1_ps(-M_PI);
    const __m128 halfpi      = _mm_set1_ps(M_PI/2.0);
    const __m128 minushalfpi = _mm_set1_ps(-M_PI/2.0);

    __m128 z, z1, z3, z4;
    __m128 w;
    __m128 maskx_lt, maskx_eq;
    __m128 masky_lt, masky_eq;
    __m128 mask1, mask2, mask3, mask4, maskall;

    maskx_lt = _mm_cmplt_ps(x, _mm_setzero_ps());
    masky_lt = _mm_cmplt_ps(y, _mm_setzero_ps());
    maskx_eq = _mm_cmpeq_ps(x, _mm_setzero_ps());
    masky_eq = _mm_cmpeq_ps(y, _mm_setzero_ps());

    z        = _mm_mul_ps(y, gmx_mm_inv_ps(x));
    z        = gmx_mm_atan_ps(z);

    mask1    = _mm_and_ps(maskx_eq, masky_lt);
    mask2    = _mm_andnot_ps(maskx_lt, masky_eq);
    mask3    = _mm_andnot_ps( _mm_or_ps(masky_lt, masky_eq), maskx_eq);
    mask4    = _mm_and_ps(masky_eq, maskx_lt);

    maskall  = _mm_or_ps( _mm_or_ps(mask1, mask2), _mm_or_ps(mask3, mask4) );

    z        = _mm_andnot_ps(maskall, z);
    z1       = _mm_and_ps(mask1, minushalfpi);
    z3       = _mm_and_ps(mask3, halfpi);
    z4       = _mm_and_ps(mask4, pi);

    z        = _mm_or_ps( _mm_or_ps(z, z1), _mm_or_ps(z3, z4) );

    mask1    = _mm_andnot_ps(masky_lt, maskx_lt);
    mask2    = _mm_and_ps(maskx_lt, masky_lt);

    w        = _mm_or_ps( _mm_and_ps(mask1, pi), _mm_and_ps(mask2, minuspi) );
    w        = _mm_andnot_ps(maskall, w);

    z        = _mm_add_ps(z, w);

    return z;
}
1165 #endif /* _gmx_math_x86_sse4_1_single_h_ */