/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 *
 * This file is part of GROMACS.
 *
 * Written by the Gromacs development team under coordination of
 * David van der Spoel, Berk Hess, and Erik Lindahl.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org
 *
 * And Hey:
 * Gnomes, ROck Monsters And Chili Sauce
 */
#ifndef _gmx_math_x86_avx_256_single_h_
#define _gmx_math_x86_avx_256_single_h_

#include <math.h>

#include "gmx_x86_avx_256.h"

#ifndef M_PI
#    define M_PI 3.14159265358979323846264338327950288
#endif
/************************
 *                      *
 * Simple math routines *
 *                      *
 ************************/
/* 1.0/sqrt(x), 256-bit wide version */
static gmx_inline __m256
gmx_mm256_invsqrt_ps(__m256 x)
{
    /* Refine the hardware rsqrt estimate with one Newton-Raphson iteration */
    const __m256 half  = _mm256_set1_ps(0.5f);
    const __m256 three = _mm256_set1_ps(3.0f);

    __m256 lu = _mm256_rsqrt_ps(x);

    return _mm256_mul_ps(half,_mm256_mul_ps(_mm256_sub_ps(three,_mm256_mul_ps(_mm256_mul_ps(lu,lu),x)),lu));
}
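
/* Minimal usage sketch (illustration only; this helper and the buffer layout
 * are assumptions, not part of the original header): compute 1/r for eight
 * squared distances held in plain float arrays.
 */
static gmx_inline void
gmx_mm256_invsqrt_example(const float r2[8], float rinv[8])
{
    __m256 x = _mm256_loadu_ps(r2);        /* load eight r^2 values               */
    __m256 y = gmx_mm256_invsqrt_ps(x);    /* rsqrt estimate + one N-R iteration  */
    _mm256_storeu_ps(rinv,y);              /* store the eight 1/r results         */
}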
/* 1.0/sqrt(x), 128-bit wide version */
static gmx_inline __m128
gmx_mm_invsqrt_ps(__m128 x)
{
    const __m128 half  = _mm_set_ps(0.5,0.5,0.5,0.5);
    const __m128 three = _mm_set_ps(3.0,3.0,3.0,3.0);

    __m128 lu = _mm_rsqrt_ps(x);

    return _mm_mul_ps(half,_mm_mul_ps(_mm_sub_ps(three,_mm_mul_ps(_mm_mul_ps(lu,lu),x)),lu));
}
/* sqrt(x) (256 bit) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
static gmx_inline __m256
gmx_mm256_sqrt_ps(__m256 x)
{
    __m256 mask;
    __m256 res;

    /* Mask out x==0 lanes so sqrt(0.0) returns 0.0 instead of 0*Inf = NaN */
    mask = _mm256_cmp_ps(x,_mm256_setzero_ps(),_CMP_EQ_OQ);
    res  = _mm256_andnot_ps(mask,gmx_mm256_invsqrt_ps(x));

    res  = _mm256_mul_ps(x,res);

    return res;
}
/* sqrt(x) (128 bit) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
static gmx_inline __m128
gmx_mm_sqrt_ps(__m128 x)
{
    __m128 mask;
    __m128 res;

    mask = _mm_cmpeq_ps(x,_mm_setzero_ps());
    res  = _mm_andnot_ps(mask,gmx_mm_invsqrt_ps(x));

    res  = _mm_mul_ps(x,res);

    return res;
}
/* 1.0/x, 256-bit wide */
static gmx_inline __m256
gmx_mm256_inv_ps(__m256 x)
{
    const __m256 two = _mm256_set1_ps(2.0f);

    /* Refine the hardware rcp estimate with one Newton-Raphson iteration */
    __m256 lu = _mm256_rcp_ps(x);

    return _mm256_mul_ps(lu,_mm256_sub_ps(two,_mm256_mul_ps(lu,x)));
}
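
/* Scalar reference for the Newton-Raphson refinement used above (illustration
 * only, not part of the original header): starting from an estimate lu of 1/x,
 * one iteration lu <- lu*(2 - lu*x) roughly doubles the number of correct bits,
 * taking the ~12-bit rcp/rsqrt hardware estimates close to single precision.
 */
static gmx_inline float
gmx_scalar_inv_refine_example(float x, float lu)
{
    return lu*(2.0f - lu*x);   /* one Newton-Raphson step for f(y) = 1/y - x */
}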
/* 1.0/x, 128-bit wide */
static gmx_inline __m128
gmx_mm_inv_ps(__m128 x)
{
    const __m128 two = _mm_set_ps(2.0f,2.0f,2.0f,2.0f);

    __m128 lu = _mm_rcp_ps(x);

    return _mm_mul_ps(lu,_mm_sub_ps(two,_mm_mul_ps(lu,x)));
}
static gmx_inline __m256
gmx_mm256_abs_ps(__m256 x)
{
    const __m256 signmask  = _mm256_castsi256_ps( _mm256_set1_epi32(0x7FFFFFFF) );

    return _mm256_and_ps(x,signmask);
}
static gmx_inline __m128
gmx_mm_abs_ps(__m128 x)
{
    const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );

    return _mm_and_ps(x,signmask);
}
/* Natural logarithm, 256-bit wide. Same algorithm as the cephes library. */
static gmx_inline __m256
gmx_mm256_log_ps(__m256 x)
{
    const __m256  expmask    = _mm256_castsi256_ps( _mm256_set1_epi32(0x7F800000) );
    const __m128i expbase_m1 = _mm_set1_epi32(127-1); /* We want non-IEEE format */
    const __m256  half       = _mm256_set1_ps(0.5f);
    const __m256  one        = _mm256_set1_ps(1.0f);
    const __m256  invsq2     = _mm256_set1_ps(1.0f/sqrt(2.0f));
    const __m256  corr1      = _mm256_set1_ps(-2.12194440e-4f);
    const __m256  corr2      = _mm256_set1_ps(0.693359375f);

    const __m256  CA_1       = _mm256_set1_ps(0.070376836292f);
    const __m256  CB_0       = _mm256_set1_ps(1.6714950086782716f);
    const __m256  CB_1       = _mm256_set1_ps(-2.452088066061482f);
    const __m256  CC_0       = _mm256_set1_ps(1.5220770854701728f);
    const __m256  CC_1       = _mm256_set1_ps(-1.3422238433233642f);
    const __m256  CD_0       = _mm256_set1_ps(1.386218787509749f);
    const __m256  CD_1       = _mm256_set1_ps(0.35075468953796346f);
    const __m256  CE_0       = _mm256_set1_ps(1.3429983063133937f);
    const __m256  CE_1       = _mm256_set1_ps(1.807420826584643f);

    __m256  fexp,mask;
    __m256i iexp,imask;
    __m128i iexp128a,iexp128b;
    __m128i imask128a,imask128b;
    __m256  x2;
    __m256  y;
    __m256  pA,pB,pC,pD,pE,tB,tC,tD,tE;

    /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
    fexp      = _mm256_and_ps(x,expmask);
    iexp      = _mm256_castps_si256(fexp);

    iexp128b  = _mm256_extractf128_si256(iexp,0x1);
    iexp128a  = _mm256_castsi256_si128(iexp);

    iexp128a  = _mm_srli_epi32(iexp128a,23);
    iexp128b  = _mm_srli_epi32(iexp128b,23);
    iexp128a  = _mm_sub_epi32(iexp128a,expbase_m1);
    iexp128b  = _mm_sub_epi32(iexp128b,expbase_m1);

    x         = _mm256_andnot_ps(expmask,x);
    x         = _mm256_or_ps(x,one);
    x         = _mm256_mul_ps(x,half);

    mask      = _mm256_cmp_ps(x,invsq2,_CMP_LT_OQ);

    x         = _mm256_add_ps(x,_mm256_and_ps(mask,x));
    x         = _mm256_sub_ps(x,one);

    imask     = _mm256_castps_si256(mask);

    imask128b = _mm256_extractf128_si256(imask,0x1);
    imask128a = _mm256_castsi256_si128(imask);

    iexp128a  = _mm_add_epi32(iexp128a,imask128a);
    iexp128b  = _mm_add_epi32(iexp128b,imask128b);

    iexp      = _mm256_castsi128_si256(iexp128a);
    iexp      = _mm256_insertf128_si256(iexp,iexp128b,0x1);

    x2        = _mm256_mul_ps(x,x);

    pA        = _mm256_mul_ps(CA_1,x);
    pB        = _mm256_mul_ps(CB_1,x);
    pC        = _mm256_mul_ps(CC_1,x);
    pD        = _mm256_mul_ps(CD_1,x);
    pE        = _mm256_mul_ps(CE_1,x);
    tB        = _mm256_add_ps(CB_0,x2);
    tC        = _mm256_add_ps(CC_0,x2);
    tD        = _mm256_add_ps(CD_0,x2);
    tE        = _mm256_add_ps(CE_0,x2);
    pB        = _mm256_add_ps(pB,tB);
    pC        = _mm256_add_ps(pC,tC);
    pD        = _mm256_add_ps(pD,tD);
    pE        = _mm256_add_ps(pE,tE);

    pA        = _mm256_mul_ps(pA,pB);
    pC        = _mm256_mul_ps(pC,pD);
    pE        = _mm256_mul_ps(pE,x2);
    pA        = _mm256_mul_ps(pA,pC);
    y         = _mm256_mul_ps(pA,pE);

    fexp      = _mm256_cvtepi32_ps(iexp);
    y         = _mm256_add_ps(y,_mm256_mul_ps(fexp,corr1));

    y         = _mm256_sub_ps(y, _mm256_mul_ps(half,x2));
    x2        = _mm256_add_ps(x,y);

    x2        = _mm256_add_ps(x2,_mm256_mul_ps(fexp,corr2));

    return x2;
}
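
/* Quick self-check sketch (illustration only, not part of the original header):
 * evaluate the vectorized log for eight inputs and return the maximum absolute
 * deviation from logf() in <math.h>. Handy when porting this header.
 */
static gmx_inline float
gmx_mm256_log_maxerr_example(const float in[8])
{
    float  out[8];
    float  maxerr = 0.0f;
    int    i;
    __m256 v = _mm256_loadu_ps(in);

    _mm256_storeu_ps(out,gmx_mm256_log_ps(v));

    for(i=0;i<8;i++)
    {
        float err = fabsf(out[i]-logf(in[i]));
        maxerr    = (err>maxerr) ? err : maxerr;
    }
    return maxerr;
}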
/* Natural logarithm, 128-bit wide */
static gmx_inline __m128
gmx_mm_log_ps(__m128 x)
{
    /* Same algorithm as cephes library */
    const __m128  expmask    = gmx_mm_castsi128_ps( _mm_set_epi32(0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000) );
    const __m128i expbase_m1 = _mm_set1_epi32(127-1); /* We want non-IEEE format */
    const __m128  half       = _mm_set1_ps(0.5f);
    const __m128  one        = _mm_set1_ps(1.0f);
    const __m128  invsq2     = _mm_set1_ps(1.0f/sqrt(2.0f));
    const __m128  corr1      = _mm_set1_ps(-2.12194440e-4f);
    const __m128  corr2      = _mm_set1_ps(0.693359375f);

    const __m128  CA_1       = _mm_set1_ps(0.070376836292f);
    const __m128  CB_0       = _mm_set1_ps(1.6714950086782716f);
    const __m128  CB_1       = _mm_set1_ps(-2.452088066061482f);
    const __m128  CC_0       = _mm_set1_ps(1.5220770854701728f);
    const __m128  CC_1       = _mm_set1_ps(-1.3422238433233642f);
    const __m128  CD_0       = _mm_set1_ps(1.386218787509749f);
    const __m128  CD_1       = _mm_set1_ps(0.35075468953796346f);
    const __m128  CE_0       = _mm_set1_ps(1.3429983063133937f);
    const __m128  CE_1       = _mm_set1_ps(1.807420826584643f);

    __m128  fexp,mask;
    __m128i iexp;
    __m128  x2;
    __m128  y;
    __m128  pA,pB,pC,pD,pE,tB,tC,tD,tE;

    /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
    fexp   = _mm_and_ps(x,expmask);
    iexp   = gmx_mm_castps_si128(fexp);
    iexp   = _mm_srli_epi32(iexp,23);
    iexp   = _mm_sub_epi32(iexp,expbase_m1);

    x      = _mm_andnot_ps(expmask,x);
    x      = _mm_or_ps(x,one);
    x      = _mm_mul_ps(x,half);

    mask   = _mm_cmplt_ps(x,invsq2);

    x      = _mm_add_ps(x,_mm_and_ps(mask,x));
    x      = _mm_sub_ps(x,one);
    iexp   = _mm_add_epi32(iexp,gmx_mm_castps_si128(mask)); /* 0xFFFFFFFF = -1 as int */

    x2     = _mm_mul_ps(x,x);

    pA     = _mm_mul_ps(CA_1,x);
    pB     = _mm_mul_ps(CB_1,x);
    pC     = _mm_mul_ps(CC_1,x);
    pD     = _mm_mul_ps(CD_1,x);
    pE     = _mm_mul_ps(CE_1,x);
    tB     = _mm_add_ps(CB_0,x2);
    tC     = _mm_add_ps(CC_0,x2);
    tD     = _mm_add_ps(CD_0,x2);
    tE     = _mm_add_ps(CE_0,x2);
    pB     = _mm_add_ps(pB,tB);
    pC     = _mm_add_ps(pC,tC);
    pD     = _mm_add_ps(pD,tD);
    pE     = _mm_add_ps(pE,tE);

    pA     = _mm_mul_ps(pA,pB);
    pC     = _mm_mul_ps(pC,pD);
    pE     = _mm_mul_ps(pE,x2);
    pA     = _mm_mul_ps(pA,pC);
    y      = _mm_mul_ps(pA,pE);

    fexp   = _mm_cvtepi32_ps(iexp);
    y      = _mm_add_ps(y,_mm_mul_ps(fexp,corr1));

    y      = _mm_sub_ps(y, _mm_mul_ps(half,x2));
    x2     = _mm_add_ps(x,y);

    x2     = _mm_add_ps(x2,_mm_mul_ps(fexp,corr2));

    return x2;
}
/* 2^x function, 256-bit wide
 *
 * The 2^w term is calculated from a (6,0)-th order (no denominator) minimax polynomial on the interval
 * [-0.5,0.5]. The coefficients were derived in Mathematica using the command:
 *
 * MiniMaxApproximation[(2^x), {x, {-0.5, 0.5}, 6, 0}, WorkingPrecision -> 15]
 *
 * The largest-magnitude exponent we can represent in IEEE single-precision binary format
 * is 2^-126 for small numbers and 2^127 for large ones. To avoid wrap-around problems, we set the
 * result to zero if the argument falls outside this range. For small numbers this is just fine, but
 * for large numbers you could be fancy and return the smallest/largest IEEE single-precision
 * number instead. That would take a few extra cycles and not really help, since something is
 * wrong if you are using single precision to work with numbers that cannot really be represented
 * in single precision.
 *
 * The accuracy is at least 23 bits.
 */
static gmx_inline __m256
gmx_mm256_exp2_ps(__m256 x)
{
    /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
    const __m256  arglimit = _mm256_set1_ps(126.0f);

    const __m128i expbase  = _mm_set1_epi32(127);
    const __m256  CC6      = _mm256_set1_ps(1.535336188319500E-004);
    const __m256  CC5      = _mm256_set1_ps(1.339887440266574E-003);
    const __m256  CC4      = _mm256_set1_ps(9.618437357674640E-003);
    const __m256  CC3      = _mm256_set1_ps(5.550332471162809E-002);
    const __m256  CC2      = _mm256_set1_ps(2.402264791363012E-001);
    const __m256  CC1      = _mm256_set1_ps(6.931472028550421E-001);
    const __m256  CC0      = _mm256_set1_ps(1.0f);

    __m256  valuemask,fexppart,intpart,x2,p0,p1;
    __m256i iexppart;
    __m128i iexppart128a,iexppart128b;

    iexppart  = _mm256_cvtps_epi32(x);
    intpart   = _mm256_round_ps(x,_MM_FROUND_TO_NEAREST_INT);

    iexppart128b = _mm256_extractf128_si256(iexppart,0x1);
    iexppart128a = _mm256_castsi256_si128(iexppart);

    iexppart128a = _mm_slli_epi32(_mm_add_epi32(iexppart128a,expbase),23);
    iexppart128b = _mm_slli_epi32(_mm_add_epi32(iexppart128b,expbase),23);

    iexppart  = _mm256_castsi128_si256(iexppart128a);
    iexppart  = _mm256_insertf128_si256(iexppart,iexppart128b,0x1);
    valuemask = _mm256_cmp_ps(arglimit,gmx_mm256_abs_ps(x),_CMP_GE_OQ);
    fexppart  = _mm256_and_ps(valuemask,_mm256_castsi256_ps(iexppart));

    x         = _mm256_sub_ps(x,intpart);
    x2        = _mm256_mul_ps(x,x);

    p0        = _mm256_mul_ps(CC6,x2);
    p1        = _mm256_mul_ps(CC5,x2);
    p0        = _mm256_add_ps(p0,CC4);
    p1        = _mm256_add_ps(p1,CC3);
    p0        = _mm256_mul_ps(p0,x2);
    p1        = _mm256_mul_ps(p1,x2);
    p0        = _mm256_add_ps(p0,CC2);
    p1        = _mm256_add_ps(p1,CC1);
    p0        = _mm256_mul_ps(p0,x2);
    p1        = _mm256_mul_ps(p1,x);
    p0        = _mm256_add_ps(p0,CC0);
    p0        = _mm256_add_ps(p0,p1);
    x         = _mm256_mul_ps(p0,fexppart);

    return x;
}
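
/* Usage sketch (illustration only, not part of the original header): evaluate
 * 2^x for eight arguments. Arguments with |x| > 126 are flushed to zero by the
 * valuemask logic above, so results are comparable to exp2f() only inside that
 * range.
 */
static gmx_inline void
gmx_mm256_exp2_example(const float in[8], float out[8])
{
    __m256 v = _mm256_loadu_ps(in);
    _mm256_storeu_ps(out,gmx_mm256_exp2_ps(v));   /* out[i] ~ exp2f(in[i]) for |in[i]| <= 126 */
}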
/* 2^x, 128 bit wide */
static gmx_inline __m128
gmx_mm_exp2_ps(__m128 x)
{
    /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
    const __m128  arglimit = _mm_set1_ps(126.0f);

    const __m128i expbase  = _mm_set1_epi32(127);
    const __m128  CA6      = _mm_set1_ps(1.535336188319500E-004);
    const __m128  CA5      = _mm_set1_ps(1.339887440266574E-003);
    const __m128  CA4      = _mm_set1_ps(9.618437357674640E-003);
    const __m128  CA3      = _mm_set1_ps(5.550332471162809E-002);
    const __m128  CA2      = _mm_set1_ps(2.402264791363012E-001);
    const __m128  CA1      = _mm_set1_ps(6.931472028550421E-001);
    const __m128  CA0      = _mm_set1_ps(1.0f);

    __m128  valuemask,fexppart,intpart,x2,p0,p1;
    __m128i iexppart;

    iexppart  = _mm_cvtps_epi32(x);
    intpart   = _mm_round_ps(x,_MM_FROUND_TO_NEAREST_INT);
    iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart,expbase),23);
    valuemask = _mm_cmpge_ps(arglimit,gmx_mm_abs_ps(x));
    fexppart  = _mm_and_ps(valuemask,gmx_mm_castsi128_ps(iexppart));

    x         = _mm_sub_ps(x,intpart);
    x2        = _mm_mul_ps(x,x);

    p0        = _mm_mul_ps(CA6,x2);
    p1        = _mm_mul_ps(CA5,x2);
    p0        = _mm_add_ps(p0,CA4);
    p1        = _mm_add_ps(p1,CA3);
    p0        = _mm_mul_ps(p0,x2);
    p1        = _mm_mul_ps(p1,x2);
    p0        = _mm_add_ps(p0,CA2);
    p1        = _mm_add_ps(p1,CA1);
    p0        = _mm_mul_ps(p0,x2);
    p1        = _mm_mul_ps(p1,x);
    p0        = _mm_add_ps(p0,CA0);
    p0        = _mm_add_ps(p0,p1);
    x         = _mm_mul_ps(p0,fexppart);

    return x;
}
/* Exponential function, 256 bit wide. This could be calculated from 2^x as Exp(x)=2^(y),
 * where y=log2(e)*x, but there will then be a small rounding error since we lose some
 * precision due to the multiplication. This will then be magnified a lot by the exponential.
 *
 * Instead, we calculate the fractional part directly as a minimax approximation of
 * Exp(z) on [-0.5,0.5]. We use extended precision arithmetic to calculate the fraction
 * remaining after 2^y, which avoids the precision loss.
 * The final result is correct to within 1 LSB over the entire argument range.
 */
static gmx_inline __m256
gmx_mm256_exp_ps(__m256 exparg)
{
    const __m256  argscale     = _mm256_set1_ps(1.44269504088896341f);
    /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
    const __m256  arglimit     = _mm256_set1_ps(126.0f);
    const __m128i expbase      = _mm_set1_epi32(127);

    const __m256  invargscale0 = _mm256_set1_ps(0.693359375f);
    const __m256  invargscale1 = _mm256_set1_ps(-2.12194440e-4f);

    const __m256  CE5          = _mm256_set1_ps(1.9875691500e-4f);
    const __m256  CE4          = _mm256_set1_ps(1.3981999507e-3f);
    const __m256  CE3          = _mm256_set1_ps(8.3334519073e-3f);
    const __m256  CE2          = _mm256_set1_ps(4.1665795894e-2f);
    const __m256  CE1          = _mm256_set1_ps(1.6666665459e-1f);
    const __m256  CE0          = _mm256_set1_ps(5.0000001201e-1f);
    const __m256  one          = _mm256_set1_ps(1.0f);

    __m256  exparg2,exp2arg;
    __m256  pE0,pE1;
    __m256  valuemask,fexppart,intpart;
    __m256i iexppart;
    __m128i iexppart128a,iexppart128b;

    exp2arg   = _mm256_mul_ps(exparg,argscale);

    iexppart  = _mm256_cvtps_epi32(exp2arg);
    intpart   = _mm256_round_ps(exp2arg,_MM_FROUND_TO_NEAREST_INT);

    iexppart128b = _mm256_extractf128_si256(iexppart,0x1);
    iexppart128a = _mm256_castsi256_si128(iexppart);

    iexppart128a = _mm_slli_epi32(_mm_add_epi32(iexppart128a,expbase),23);
    iexppart128b = _mm_slli_epi32(_mm_add_epi32(iexppart128b,expbase),23);

    iexppart  = _mm256_castsi128_si256(iexppart128a);
    iexppart  = _mm256_insertf128_si256(iexppart,iexppart128b,0x1);
    valuemask = _mm256_cmp_ps(arglimit,gmx_mm256_abs_ps(exp2arg),_CMP_GE_OQ);
    fexppart  = _mm256_and_ps(valuemask,_mm256_castsi256_ps(iexppart));

    /* Extended precision arithmetic */
    exparg    = _mm256_sub_ps(exparg,_mm256_mul_ps(invargscale0,intpart));
    exparg    = _mm256_sub_ps(exparg,_mm256_mul_ps(invargscale1,intpart));

    exparg2   = _mm256_mul_ps(exparg,exparg);

    pE1       = _mm256_mul_ps(CE5,exparg2);
    pE0       = _mm256_mul_ps(CE4,exparg2);
    pE1       = _mm256_add_ps(pE1,CE3);
    pE0       = _mm256_add_ps(pE0,CE2);
    pE1       = _mm256_mul_ps(pE1,exparg2);
    pE0       = _mm256_mul_ps(pE0,exparg2);
    pE1       = _mm256_add_ps(pE1,CE1);
    pE0       = _mm256_add_ps(pE0,CE0);
    pE1       = _mm256_mul_ps(pE1,exparg);
    pE0       = _mm256_add_ps(pE0,pE1);
    pE0       = _mm256_mul_ps(pE0,exparg2);
    exparg    = _mm256_add_ps(exparg,one);
    exparg    = _mm256_add_ps(exparg,pE0);

    exparg    = _mm256_mul_ps(exparg,fexppart);

    return exparg;
}
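
/* Usage sketch (illustration only, not part of the original header): exp() of
 * eight packed floats, comparable to expf() for arguments where
 * |x*log2(e)| <= 126 (larger-magnitude arguments are flushed to zero).
 */
static gmx_inline void
gmx_mm256_exp_example(const float in[8], float out[8])
{
    _mm256_storeu_ps(out,gmx_mm256_exp_ps(_mm256_loadu_ps(in)));
}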
/* exp(), 128 bit wide. */
static gmx_inline __m128
gmx_mm_exp_ps(__m128 x)
{
    const __m128  argscale     = _mm_set1_ps(1.44269504088896341f);
    /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
    const __m128  arglimit     = _mm_set1_ps(126.0f);
    const __m128i expbase      = _mm_set1_epi32(127);

    const __m128  invargscale0 = _mm_set1_ps(0.693359375f);
    const __m128  invargscale1 = _mm_set1_ps(-2.12194440e-4f);

    const __m128  CC5          = _mm_set1_ps(1.9875691500e-4f);
    const __m128  CC4          = _mm_set1_ps(1.3981999507e-3f);
    const __m128  CC3          = _mm_set1_ps(8.3334519073e-3f);
    const __m128  CC2          = _mm_set1_ps(4.1665795894e-2f);
    const __m128  CC1          = _mm_set1_ps(1.6666665459e-1f);
    const __m128  CC0          = _mm_set1_ps(5.0000001201e-1f);
    const __m128  one          = _mm_set1_ps(1.0f);

    __m128  y,x2;
    __m128  p0,p1;
    __m128  valuemask,fexppart,intpart;
    __m128i iexppart;

    y         = _mm_mul_ps(x,argscale);

    iexppart  = _mm_cvtps_epi32(y);
    intpart   = _mm_round_ps(y,_MM_FROUND_TO_NEAREST_INT);

    iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart,expbase),23);
    valuemask = _mm_cmpge_ps(arglimit,gmx_mm_abs_ps(y));
    fexppart  = _mm_and_ps(valuemask,gmx_mm_castsi128_ps(iexppart));

    /* Extended precision arithmetic */
    x         = _mm_sub_ps(x,_mm_mul_ps(invargscale0,intpart));
    x         = _mm_sub_ps(x,_mm_mul_ps(invargscale1,intpart));

    x2        = _mm_mul_ps(x,x);

    p1        = _mm_mul_ps(CC5,x2);
    p0        = _mm_mul_ps(CC4,x2);
    p1        = _mm_add_ps(p1,CC3);
    p0        = _mm_add_ps(p0,CC2);
    p1        = _mm_mul_ps(p1,x2);
    p0        = _mm_mul_ps(p0,x2);
    p1        = _mm_add_ps(p1,CC1);
    p0        = _mm_add_ps(p0,CC0);
    p1        = _mm_mul_ps(p1,x);
    p0        = _mm_add_ps(p0,p1);
    p0        = _mm_mul_ps(p0,x2);
    x         = _mm_add_ps(x,one);
    x         = _mm_add_ps(x,p0);

    x         = _mm_mul_ps(x,fexppart);

    return x;
}
/* FULL precision erf(), 256-bit wide. Only errors in LSB */
static gmx_inline __m256
gmx_mm256_erf_ps(__m256 x)
{
    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
    const __m256  CA6      = _mm256_set1_ps(7.853861353153693e-5f);
    const __m256  CA5      = _mm256_set1_ps(-8.010193625184903e-4f);
    const __m256  CA4      = _mm256_set1_ps(5.188327685732524e-3f);
    const __m256  CA3      = _mm256_set1_ps(-2.685381193529856e-2f);
    const __m256  CA2      = _mm256_set1_ps(1.128358514861418e-1f);
    const __m256  CA1      = _mm256_set1_ps(-3.761262582423300e-1f);
    const __m256  CA0      = _mm256_set1_ps(1.128379165726710f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
    const __m256  CB9      = _mm256_set1_ps(-0.0018629930017603923f);
    const __m256  CB8      = _mm256_set1_ps(0.003909821287598495f);
    const __m256  CB7      = _mm256_set1_ps(-0.0052094582210355615f);
    const __m256  CB6      = _mm256_set1_ps(0.005685614362160572f);
    const __m256  CB5      = _mm256_set1_ps(-0.0025367682853477272f);
    const __m256  CB4      = _mm256_set1_ps(-0.010199799682318782f);
    const __m256  CB3      = _mm256_set1_ps(0.04369575504816542f);
    const __m256  CB2      = _mm256_set1_ps(-0.11884063474674492f);
    const __m256  CB1      = _mm256_set1_ps(0.2732120154030589f);
    const __m256  CB0      = _mm256_set1_ps(0.42758357702025784f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
    const __m256  CC10     = _mm256_set1_ps(-0.0445555913112064f);
    const __m256  CC9      = _mm256_set1_ps(0.21376355144663348f);
    const __m256  CC8      = _mm256_set1_ps(-0.3473187200259257f);
    const __m256  CC7      = _mm256_set1_ps(0.016690861551248114f);
    const __m256  CC6      = _mm256_set1_ps(0.7560973182491192f);
    const __m256  CC5      = _mm256_set1_ps(-1.2137903600145787f);
    const __m256  CC4      = _mm256_set1_ps(0.8411872321232948f);
    const __m256  CC3      = _mm256_set1_ps(-0.08670413896296343f);
    const __m256  CC2      = _mm256_set1_ps(-0.27124782687240334f);
    const __m256  CC1      = _mm256_set1_ps(-0.0007502488047806069f);
    const __m256  CC0      = _mm256_set1_ps(0.5642114853803148f);

    /* Coefficients for expansion of exp(x) in [0,0.1] */
    /* CD0 and CD1 are both 1.0, so no need to declare them separately */
    const __m256  CD2      = _mm256_set1_ps(0.5000066608081202f);
    const __m256  CD3      = _mm256_set1_ps(0.1664795422874624f);
    const __m256  CD4      = _mm256_set1_ps(0.04379839977652482f);

    const __m256  sieve    = _mm256_castsi256_ps( _mm256_set1_epi32(0xfffff000) );
    const __m256  signbit  = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
    const __m256  one      = _mm256_set1_ps(1.0f);
    const __m256  two      = _mm256_set1_ps(2.0f);

    __m256 x2,x4,y;
    __m256 z,q,t,t2,w,w2;
    __m256 pA0,pA1,pB0,pB1,pC0,pC1;
    __m256 expmx2,corr;
    __m256 res_erf,res_erfc,res;
    __m256 mask;

    /* Calculate erf() */
    x2   = _mm256_mul_ps(x,x);
    x4   = _mm256_mul_ps(x2,x2);

    pA0  = _mm256_mul_ps(CA6,x4);
    pA1  = _mm256_mul_ps(CA5,x4);
    pA0  = _mm256_add_ps(pA0,CA4);
    pA1  = _mm256_add_ps(pA1,CA3);
    pA0  = _mm256_mul_ps(pA0,x4);
    pA1  = _mm256_mul_ps(pA1,x4);
    pA0  = _mm256_add_ps(pA0,CA2);
    pA1  = _mm256_add_ps(pA1,CA1);
    pA0  = _mm256_mul_ps(pA0,x4);
    pA1  = _mm256_mul_ps(pA1,x2);
    pA0  = _mm256_add_ps(pA0,pA1);
    pA0  = _mm256_add_ps(pA0,CA0);

    res_erf = _mm256_mul_ps(x,pA0);

    /* Calculate erfc */
    y    = gmx_mm256_abs_ps(x);
    t    = gmx_mm256_inv_ps(y);
    w    = _mm256_sub_ps(t,one);
    t2   = _mm256_mul_ps(t,t);
    w2   = _mm256_mul_ps(w,w);
    /*
     * We cannot simply calculate exp(-x2) directly in single precision, since
     * that will lose a couple of bits of precision due to the multiplication.
     * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
     * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
     *
     * The only drawback with this is that it requires TWO separate exponential
     * evaluations, which would be horrible performance-wise. However, the argument
     * for the second exp() call is always small, so there we simply use a
     * low-order minimax expansion on [0,0.1].
     */
    z       = _mm256_and_ps(y,sieve);
    q       = _mm256_mul_ps( _mm256_sub_ps(z,y) , _mm256_add_ps(z,y) );

    corr    = _mm256_mul_ps(CD4,q);
    corr    = _mm256_add_ps(corr,CD3);
    corr    = _mm256_mul_ps(corr,q);
    corr    = _mm256_add_ps(corr,CD2);
    corr    = _mm256_mul_ps(corr,q);
    corr    = _mm256_add_ps(corr,one);
    corr    = _mm256_mul_ps(corr,q);
    corr    = _mm256_add_ps(corr,one);

    expmx2  = gmx_mm256_exp_ps( _mm256_or_ps( signbit , _mm256_mul_ps(z,z) ) );
    expmx2  = _mm256_mul_ps(expmx2,corr);

    pB1  = _mm256_mul_ps(CB9,w2);
    pB0  = _mm256_mul_ps(CB8,w2);
    pB1  = _mm256_add_ps(pB1,CB7);
    pB0  = _mm256_add_ps(pB0,CB6);
    pB1  = _mm256_mul_ps(pB1,w2);
    pB0  = _mm256_mul_ps(pB0,w2);
    pB1  = _mm256_add_ps(pB1,CB5);
    pB0  = _mm256_add_ps(pB0,CB4);
    pB1  = _mm256_mul_ps(pB1,w2);
    pB0  = _mm256_mul_ps(pB0,w2);
    pB1  = _mm256_add_ps(pB1,CB3);
    pB0  = _mm256_add_ps(pB0,CB2);
    pB1  = _mm256_mul_ps(pB1,w2);
    pB0  = _mm256_mul_ps(pB0,w2);
    pB1  = _mm256_add_ps(pB1,CB1);
    pB1  = _mm256_mul_ps(pB1,w);
    pB0  = _mm256_add_ps(pB0,pB1);
    pB0  = _mm256_add_ps(pB0,CB0);

    pC0  = _mm256_mul_ps(CC10,t2);
    pC1  = _mm256_mul_ps(CC9,t2);
    pC0  = _mm256_add_ps(pC0,CC8);
    pC1  = _mm256_add_ps(pC1,CC7);
    pC0  = _mm256_mul_ps(pC0,t2);
    pC1  = _mm256_mul_ps(pC1,t2);
    pC0  = _mm256_add_ps(pC0,CC6);
    pC1  = _mm256_add_ps(pC1,CC5);
    pC0  = _mm256_mul_ps(pC0,t2);
    pC1  = _mm256_mul_ps(pC1,t2);
    pC0  = _mm256_add_ps(pC0,CC4);
    pC1  = _mm256_add_ps(pC1,CC3);
    pC0  = _mm256_mul_ps(pC0,t2);
    pC1  = _mm256_mul_ps(pC1,t2);
    pC0  = _mm256_add_ps(pC0,CC2);
    pC1  = _mm256_add_ps(pC1,CC1);
    pC0  = _mm256_mul_ps(pC0,t2);
    pC1  = _mm256_mul_ps(pC1,t);
    pC0  = _mm256_add_ps(pC0,pC1);
    pC0  = _mm256_add_ps(pC0,CC0);
    pC0  = _mm256_mul_ps(pC0,t);

    /* SELECT pB0 or pC0 for erfc() */
    mask     = _mm256_cmp_ps(two,y,_CMP_LT_OQ);
    res_erfc = _mm256_blendv_ps(pB0,pC0,mask);
    res_erfc = _mm256_mul_ps(res_erfc,expmx2);

    /* erfc(x<0) = 2-erfc(|x|) */
    mask     = _mm256_cmp_ps(x,_mm256_setzero_ps(),_CMP_LT_OQ);
    res_erfc = _mm256_blendv_ps(res_erfc,_mm256_sub_ps(two,res_erfc),mask);

    /* Select erf() or erfc() */
    mask = _mm256_cmp_ps(y,_mm256_set1_ps(0.75f),_CMP_LT_OQ);
    res  = _mm256_blendv_ps(_mm256_sub_ps(one,res_erfc),res_erf,mask);

    return res;
}
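
/* Usage sketch (illustration only, not part of the original header): packed
 * erf() for eight floats, directly comparable to erff() from <math.h>.
 */
static gmx_inline void
gmx_mm256_erf_example(const float in[8], float out[8])
{
    _mm256_storeu_ps(out,gmx_mm256_erf_ps(_mm256_loadu_ps(in)));
}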
/* erf(), 128 bit wide */
static gmx_inline __m128
gmx_mm_erf_ps(__m128 x)
{
    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
    const __m128  CA6      = _mm_set1_ps(7.853861353153693e-5f);
    const __m128  CA5      = _mm_set1_ps(-8.010193625184903e-4f);
    const __m128  CA4      = _mm_set1_ps(5.188327685732524e-3f);
    const __m128  CA3      = _mm_set1_ps(-2.685381193529856e-2f);
    const __m128  CA2      = _mm_set1_ps(1.128358514861418e-1f);
    const __m128  CA1      = _mm_set1_ps(-3.761262582423300e-1f);
    const __m128  CA0      = _mm_set1_ps(1.128379165726710f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
    const __m128  CB9      = _mm_set1_ps(-0.0018629930017603923f);
    const __m128  CB8      = _mm_set1_ps(0.003909821287598495f);
    const __m128  CB7      = _mm_set1_ps(-0.0052094582210355615f);
    const __m128  CB6      = _mm_set1_ps(0.005685614362160572f);
    const __m128  CB5      = _mm_set1_ps(-0.0025367682853477272f);
    const __m128  CB4      = _mm_set1_ps(-0.010199799682318782f);
    const __m128  CB3      = _mm_set1_ps(0.04369575504816542f);
    const __m128  CB2      = _mm_set1_ps(-0.11884063474674492f);
    const __m128  CB1      = _mm_set1_ps(0.2732120154030589f);
    const __m128  CB0      = _mm_set1_ps(0.42758357702025784f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
    const __m128  CC10     = _mm_set1_ps(-0.0445555913112064f);
    const __m128  CC9      = _mm_set1_ps(0.21376355144663348f);
    const __m128  CC8      = _mm_set1_ps(-0.3473187200259257f);
    const __m128  CC7      = _mm_set1_ps(0.016690861551248114f);
    const __m128  CC6      = _mm_set1_ps(0.7560973182491192f);
    const __m128  CC5      = _mm_set1_ps(-1.2137903600145787f);
    const __m128  CC4      = _mm_set1_ps(0.8411872321232948f);
    const __m128  CC3      = _mm_set1_ps(-0.08670413896296343f);
    const __m128  CC2      = _mm_set1_ps(-0.27124782687240334f);
    const __m128  CC1      = _mm_set1_ps(-0.0007502488047806069f);
    const __m128  CC0      = _mm_set1_ps(0.5642114853803148f);

    /* Coefficients for expansion of exp(x) in [0,0.1] */
    /* CD0 and CD1 are both 1.0, so no need to declare them separately */
    const __m128  CD2      = _mm_set1_ps(0.5000066608081202f);
    const __m128  CD3      = _mm_set1_ps(0.1664795422874624f);
    const __m128  CD4      = _mm_set1_ps(0.04379839977652482f);

    const __m128  sieve    = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
    const __m128  signbit  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
    const __m128  one      = _mm_set1_ps(1.0f);
    const __m128  two      = _mm_set1_ps(2.0f);

    __m128 x2,x4,y;
    __m128 z,q,t,t2,w,w2;
    __m128 pA0,pA1,pB0,pB1,pC0,pC1;
    __m128 expmx2,corr;
    __m128 res_erf,res_erfc,res;
    __m128 mask;

    /* Calculate erf() */
    x2   = _mm_mul_ps(x,x);
    x4   = _mm_mul_ps(x2,x2);

    pA0  = _mm_mul_ps(CA6,x4);
    pA1  = _mm_mul_ps(CA5,x4);
    pA0  = _mm_add_ps(pA0,CA4);
    pA1  = _mm_add_ps(pA1,CA3);
    pA0  = _mm_mul_ps(pA0,x4);
    pA1  = _mm_mul_ps(pA1,x4);
    pA0  = _mm_add_ps(pA0,CA2);
    pA1  = _mm_add_ps(pA1,CA1);
    pA0  = _mm_mul_ps(pA0,x4);
    pA1  = _mm_mul_ps(pA1,x2);
    pA0  = _mm_add_ps(pA0,pA1);
    pA0  = _mm_add_ps(pA0,CA0);

    res_erf = _mm_mul_ps(x,pA0);

    /* Calculate erfc */
    y    = gmx_mm_abs_ps(x);
    t    = gmx_mm_inv_ps(y);
    w    = _mm_sub_ps(t,one);
    t2   = _mm_mul_ps(t,t);
    w2   = _mm_mul_ps(w,w);
    /*
     * We cannot simply calculate exp(-x2) directly in single precision, since
     * that will lose a couple of bits of precision due to the multiplication.
     * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
     * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
     *
     * The only drawback with this is that it requires TWO separate exponential
     * evaluations, which would be horrible performance-wise. However, the argument
     * for the second exp() call is always small, so there we simply use a
     * low-order minimax expansion on [0,0.1].
     */
    z       = _mm_and_ps(y,sieve);
    q       = _mm_mul_ps( _mm_sub_ps(z,y) , _mm_add_ps(z,y) );

    corr    = _mm_mul_ps(CD4,q);
    corr    = _mm_add_ps(corr,CD3);
    corr    = _mm_mul_ps(corr,q);
    corr    = _mm_add_ps(corr,CD2);
    corr    = _mm_mul_ps(corr,q);
    corr    = _mm_add_ps(corr,one);
    corr    = _mm_mul_ps(corr,q);
    corr    = _mm_add_ps(corr,one);

    expmx2  = gmx_mm_exp_ps( _mm_or_ps( signbit , _mm_mul_ps(z,z) ) );
    expmx2  = _mm_mul_ps(expmx2,corr);

    pB1  = _mm_mul_ps(CB9,w2);
    pB0  = _mm_mul_ps(CB8,w2);
    pB1  = _mm_add_ps(pB1,CB7);
    pB0  = _mm_add_ps(pB0,CB6);
    pB1  = _mm_mul_ps(pB1,w2);
    pB0  = _mm_mul_ps(pB0,w2);
    pB1  = _mm_add_ps(pB1,CB5);
    pB0  = _mm_add_ps(pB0,CB4);
    pB1  = _mm_mul_ps(pB1,w2);
    pB0  = _mm_mul_ps(pB0,w2);
    pB1  = _mm_add_ps(pB1,CB3);
    pB0  = _mm_add_ps(pB0,CB2);
    pB1  = _mm_mul_ps(pB1,w2);
    pB0  = _mm_mul_ps(pB0,w2);
    pB1  = _mm_add_ps(pB1,CB1);
    pB1  = _mm_mul_ps(pB1,w);
    pB0  = _mm_add_ps(pB0,pB1);
    pB0  = _mm_add_ps(pB0,CB0);

    pC0  = _mm_mul_ps(CC10,t2);
    pC1  = _mm_mul_ps(CC9,t2);
    pC0  = _mm_add_ps(pC0,CC8);
    pC1  = _mm_add_ps(pC1,CC7);
    pC0  = _mm_mul_ps(pC0,t2);
    pC1  = _mm_mul_ps(pC1,t2);
    pC0  = _mm_add_ps(pC0,CC6);
    pC1  = _mm_add_ps(pC1,CC5);
    pC0  = _mm_mul_ps(pC0,t2);
    pC1  = _mm_mul_ps(pC1,t2);
    pC0  = _mm_add_ps(pC0,CC4);
    pC1  = _mm_add_ps(pC1,CC3);
    pC0  = _mm_mul_ps(pC0,t2);
    pC1  = _mm_mul_ps(pC1,t2);
    pC0  = _mm_add_ps(pC0,CC2);
    pC1  = _mm_add_ps(pC1,CC1);
    pC0  = _mm_mul_ps(pC0,t2);
    pC1  = _mm_mul_ps(pC1,t);
    pC0  = _mm_add_ps(pC0,pC1);
    pC0  = _mm_add_ps(pC0,CC0);
    pC0  = _mm_mul_ps(pC0,t);

    /* SELECT pB0 or pC0 for erfc() */
    mask     = _mm_cmplt_ps(two,y);
    res_erfc = _mm_blendv_ps(pB0,pC0,mask);
    res_erfc = _mm_mul_ps(res_erfc,expmx2);

    /* erfc(x<0) = 2-erfc(|x|) */
    mask     = _mm_cmplt_ps(x,_mm_setzero_ps());
    res_erfc = _mm_blendv_ps(res_erfc,_mm_sub_ps(two,res_erfc),mask);

    /* Select erf() or erfc() */
    mask = _mm_cmplt_ps(y,_mm_set1_ps(0.75f));
    res  = _mm_blendv_ps(_mm_sub_ps(one,res_erfc),res_erf,mask);

    return res;
}
/* FULL precision erfc(), 256 bit wide. Only errors in LSB */
static gmx_inline __m256
gmx_mm256_erfc_ps(__m256 x)
{
    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
    const __m256  CA6      = _mm256_set1_ps(7.853861353153693e-5f);
    const __m256  CA5      = _mm256_set1_ps(-8.010193625184903e-4f);
    const __m256  CA4      = _mm256_set1_ps(5.188327685732524e-3f);
    const __m256  CA3      = _mm256_set1_ps(-2.685381193529856e-2f);
    const __m256  CA2      = _mm256_set1_ps(1.128358514861418e-1f);
    const __m256  CA1      = _mm256_set1_ps(-3.761262582423300e-1f);
    const __m256  CA0      = _mm256_set1_ps(1.128379165726710f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
    const __m256  CB9      = _mm256_set1_ps(-0.0018629930017603923f);
    const __m256  CB8      = _mm256_set1_ps(0.003909821287598495f);
    const __m256  CB7      = _mm256_set1_ps(-0.0052094582210355615f);
    const __m256  CB6      = _mm256_set1_ps(0.005685614362160572f);
    const __m256  CB5      = _mm256_set1_ps(-0.0025367682853477272f);
    const __m256  CB4      = _mm256_set1_ps(-0.010199799682318782f);
    const __m256  CB3      = _mm256_set1_ps(0.04369575504816542f);
    const __m256  CB2      = _mm256_set1_ps(-0.11884063474674492f);
    const __m256  CB1      = _mm256_set1_ps(0.2732120154030589f);
    const __m256  CB0      = _mm256_set1_ps(0.42758357702025784f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
    const __m256  CC10     = _mm256_set1_ps(-0.0445555913112064f);
    const __m256  CC9      = _mm256_set1_ps(0.21376355144663348f);
    const __m256  CC8      = _mm256_set1_ps(-0.3473187200259257f);
    const __m256  CC7      = _mm256_set1_ps(0.016690861551248114f);
    const __m256  CC6      = _mm256_set1_ps(0.7560973182491192f);
    const __m256  CC5      = _mm256_set1_ps(-1.2137903600145787f);
    const __m256  CC4      = _mm256_set1_ps(0.8411872321232948f);
    const __m256  CC3      = _mm256_set1_ps(-0.08670413896296343f);
    const __m256  CC2      = _mm256_set1_ps(-0.27124782687240334f);
    const __m256  CC1      = _mm256_set1_ps(-0.0007502488047806069f);
    const __m256  CC0      = _mm256_set1_ps(0.5642114853803148f);

    /* Coefficients for expansion of exp(x) in [0,0.1] */
    /* CD0 and CD1 are both 1.0, so no need to declare them separately */
    const __m256  CD2      = _mm256_set1_ps(0.5000066608081202f);
    const __m256  CD3      = _mm256_set1_ps(0.1664795422874624f);
    const __m256  CD4      = _mm256_set1_ps(0.04379839977652482f);

    const __m256  sieve    = _mm256_castsi256_ps( _mm256_set1_epi32(0xfffff000) );
    const __m256  signbit  = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
    const __m256  one      = _mm256_set1_ps(1.0f);
    const __m256  two      = _mm256_set1_ps(2.0f);

    __m256 x2,x4,y;
    __m256 z,q,t,t2,w,w2;
    __m256 pA0,pA1,pB0,pB1,pC0,pC1;
    __m256 expmx2,corr;
    __m256 res_erf,res_erfc,res;
    __m256 mask;

    /* Calculate erf() */
    x2   = _mm256_mul_ps(x,x);
    x4   = _mm256_mul_ps(x2,x2);

    pA0  = _mm256_mul_ps(CA6,x4);
    pA1  = _mm256_mul_ps(CA5,x4);
    pA0  = _mm256_add_ps(pA0,CA4);
    pA1  = _mm256_add_ps(pA1,CA3);
    pA0  = _mm256_mul_ps(pA0,x4);
    pA1  = _mm256_mul_ps(pA1,x4);
    pA0  = _mm256_add_ps(pA0,CA2);
    pA1  = _mm256_add_ps(pA1,CA1);
    pA0  = _mm256_mul_ps(pA0,x4);
    pA1  = _mm256_mul_ps(pA1,x2);
    pA0  = _mm256_add_ps(pA0,pA1);
    pA0  = _mm256_add_ps(pA0,CA0);

    res_erf = _mm256_mul_ps(x,pA0);

    /* Calculate erfc */
    y    = gmx_mm256_abs_ps(x);
    t    = gmx_mm256_inv_ps(y);
    w    = _mm256_sub_ps(t,one);
    t2   = _mm256_mul_ps(t,t);
    w2   = _mm256_mul_ps(w,w);
    /*
     * We cannot simply calculate exp(-x2) directly in single precision, since
     * that will lose a couple of bits of precision due to the multiplication.
     * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
     * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
     *
     * The only drawback with this is that it requires TWO separate exponential
     * evaluations, which would be horrible performance-wise. However, the argument
     * for the second exp() call is always small, so there we simply use a
     * low-order minimax expansion on [0,0.1].
     */
    z       = _mm256_and_ps(y,sieve);
    q       = _mm256_mul_ps( _mm256_sub_ps(z,y) , _mm256_add_ps(z,y) );

    corr    = _mm256_mul_ps(CD4,q);
    corr    = _mm256_add_ps(corr,CD3);
    corr    = _mm256_mul_ps(corr,q);
    corr    = _mm256_add_ps(corr,CD2);
    corr    = _mm256_mul_ps(corr,q);
    corr    = _mm256_add_ps(corr,one);
    corr    = _mm256_mul_ps(corr,q);
    corr    = _mm256_add_ps(corr,one);

    expmx2  = gmx_mm256_exp_ps( _mm256_or_ps( signbit , _mm256_mul_ps(z,z) ) );
    expmx2  = _mm256_mul_ps(expmx2,corr);

    pB1  = _mm256_mul_ps(CB9,w2);
    pB0  = _mm256_mul_ps(CB8,w2);
    pB1  = _mm256_add_ps(pB1,CB7);
    pB0  = _mm256_add_ps(pB0,CB6);
    pB1  = _mm256_mul_ps(pB1,w2);
    pB0  = _mm256_mul_ps(pB0,w2);
    pB1  = _mm256_add_ps(pB1,CB5);
    pB0  = _mm256_add_ps(pB0,CB4);
    pB1  = _mm256_mul_ps(pB1,w2);
    pB0  = _mm256_mul_ps(pB0,w2);
    pB1  = _mm256_add_ps(pB1,CB3);
    pB0  = _mm256_add_ps(pB0,CB2);
    pB1  = _mm256_mul_ps(pB1,w2);
    pB0  = _mm256_mul_ps(pB0,w2);
    pB1  = _mm256_add_ps(pB1,CB1);
    pB1  = _mm256_mul_ps(pB1,w);
    pB0  = _mm256_add_ps(pB0,pB1);
    pB0  = _mm256_add_ps(pB0,CB0);

    pC0  = _mm256_mul_ps(CC10,t2);
    pC1  = _mm256_mul_ps(CC9,t2);
    pC0  = _mm256_add_ps(pC0,CC8);
    pC1  = _mm256_add_ps(pC1,CC7);
    pC0  = _mm256_mul_ps(pC0,t2);
    pC1  = _mm256_mul_ps(pC1,t2);
    pC0  = _mm256_add_ps(pC0,CC6);
    pC1  = _mm256_add_ps(pC1,CC5);
    pC0  = _mm256_mul_ps(pC0,t2);
    pC1  = _mm256_mul_ps(pC1,t2);
    pC0  = _mm256_add_ps(pC0,CC4);
    pC1  = _mm256_add_ps(pC1,CC3);
    pC0  = _mm256_mul_ps(pC0,t2);
    pC1  = _mm256_mul_ps(pC1,t2);
    pC0  = _mm256_add_ps(pC0,CC2);
    pC1  = _mm256_add_ps(pC1,CC1);
    pC0  = _mm256_mul_ps(pC0,t2);
    pC1  = _mm256_mul_ps(pC1,t);
    pC0  = _mm256_add_ps(pC0,pC1);
    pC0  = _mm256_add_ps(pC0,CC0);
    pC0  = _mm256_mul_ps(pC0,t);

    /* SELECT pB0 or pC0 for erfc() */
    mask     = _mm256_cmp_ps(two,y,_CMP_LT_OQ);
    res_erfc = _mm256_blendv_ps(pB0,pC0,mask);
    res_erfc = _mm256_mul_ps(res_erfc,expmx2);

    /* erfc(x<0) = 2-erfc(|x|) */
    mask     = _mm256_cmp_ps(x,_mm256_setzero_ps(),_CMP_LT_OQ);
    res_erfc = _mm256_blendv_ps(res_erfc,_mm256_sub_ps(two,res_erfc),mask);

    /* Select erf() or erfc() */
    mask = _mm256_cmp_ps(y,_mm256_set1_ps(0.75f),_CMP_LT_OQ);
    res  = _mm256_blendv_ps(res_erfc,_mm256_sub_ps(one,res_erf),mask);

    return res;
}
/* erfc(), 128 bit wide */
static gmx_inline __m128
gmx_mm_erfc_ps(__m128 x)
{
    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
    const __m128  CA6      = _mm_set1_ps(7.853861353153693e-5f);
    const __m128  CA5      = _mm_set1_ps(-8.010193625184903e-4f);
    const __m128  CA4      = _mm_set1_ps(5.188327685732524e-3f);
    const __m128  CA3      = _mm_set1_ps(-2.685381193529856e-2f);
    const __m128  CA2      = _mm_set1_ps(1.128358514861418e-1f);
    const __m128  CA1      = _mm_set1_ps(-3.761262582423300e-1f);
    const __m128  CA0      = _mm_set1_ps(1.128379165726710f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
    const __m128  CB9      = _mm_set1_ps(-0.0018629930017603923f);
    const __m128  CB8      = _mm_set1_ps(0.003909821287598495f);
    const __m128  CB7      = _mm_set1_ps(-0.0052094582210355615f);
    const __m128  CB6      = _mm_set1_ps(0.005685614362160572f);
    const __m128  CB5      = _mm_set1_ps(-0.0025367682853477272f);
    const __m128  CB4      = _mm_set1_ps(-0.010199799682318782f);
    const __m128  CB3      = _mm_set1_ps(0.04369575504816542f);
    const __m128  CB2      = _mm_set1_ps(-0.11884063474674492f);
    const __m128  CB1      = _mm_set1_ps(0.2732120154030589f);
    const __m128  CB0      = _mm_set1_ps(0.42758357702025784f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
    const __m128  CC10     = _mm_set1_ps(-0.0445555913112064f);
    const __m128  CC9      = _mm_set1_ps(0.21376355144663348f);
    const __m128  CC8      = _mm_set1_ps(-0.3473187200259257f);
    const __m128  CC7      = _mm_set1_ps(0.016690861551248114f);
    const __m128  CC6      = _mm_set1_ps(0.7560973182491192f);
    const __m128  CC5      = _mm_set1_ps(-1.2137903600145787f);
    const __m128  CC4      = _mm_set1_ps(0.8411872321232948f);
    const __m128  CC3      = _mm_set1_ps(-0.08670413896296343f);
    const __m128  CC2      = _mm_set1_ps(-0.27124782687240334f);
    const __m128  CC1      = _mm_set1_ps(-0.0007502488047806069f);
    const __m128  CC0      = _mm_set1_ps(0.5642114853803148f);

    /* Coefficients for expansion of exp(x) in [0,0.1] */
    /* CD0 and CD1 are both 1.0, so no need to declare them separately */
    const __m128  CD2      = _mm_set1_ps(0.5000066608081202f);
    const __m128  CD3      = _mm_set1_ps(0.1664795422874624f);
    const __m128  CD4      = _mm_set1_ps(0.04379839977652482f);

    const __m128  sieve    = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
    const __m128  signbit  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
    const __m128  one      = _mm_set1_ps(1.0f);
    const __m128  two      = _mm_set1_ps(2.0f);

    __m128 x2,x4,y;
    __m128 z,q,t,t2,w,w2;
    __m128 pA0,pA1,pB0,pB1,pC0,pC1;
    __m128 expmx2,corr;
    __m128 res_erf,res_erfc,res;
    __m128 mask;

    /* Calculate erf() */
    x2   = _mm_mul_ps(x,x);
    x4   = _mm_mul_ps(x2,x2);

    pA0  = _mm_mul_ps(CA6,x4);
    pA1  = _mm_mul_ps(CA5,x4);
    pA0  = _mm_add_ps(pA0,CA4);
    pA1  = _mm_add_ps(pA1,CA3);
    pA0  = _mm_mul_ps(pA0,x4);
    pA1  = _mm_mul_ps(pA1,x4);
    pA0  = _mm_add_ps(pA0,CA2);
    pA1  = _mm_add_ps(pA1,CA1);
    pA0  = _mm_mul_ps(pA0,x4);
    pA1  = _mm_mul_ps(pA1,x2);
    pA0  = _mm_add_ps(pA0,pA1);
    pA0  = _mm_add_ps(pA0,CA0);

    res_erf = _mm_mul_ps(x,pA0);

    /* Calculate erfc */
    y    = gmx_mm_abs_ps(x);
    t    = gmx_mm_inv_ps(y);
    w    = _mm_sub_ps(t,one);
    t2   = _mm_mul_ps(t,t);
    w2   = _mm_mul_ps(w,w);
    /*
     * We cannot simply calculate exp(-x2) directly in single precision, since
     * that will lose a couple of bits of precision due to the multiplication.
     * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
     * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
     *
     * The only drawback with this is that it requires TWO separate exponential
     * evaluations, which would be horrible performance-wise. However, the argument
     * for the second exp() call is always small, so there we simply use a
     * low-order minimax expansion on [0,0.1].
     */
    z       = _mm_and_ps(y,sieve);
    q       = _mm_mul_ps( _mm_sub_ps(z,y) , _mm_add_ps(z,y) );

    corr    = _mm_mul_ps(CD4,q);
    corr    = _mm_add_ps(corr,CD3);
    corr    = _mm_mul_ps(corr,q);
    corr    = _mm_add_ps(corr,CD2);
    corr    = _mm_mul_ps(corr,q);
    corr    = _mm_add_ps(corr,one);
    corr    = _mm_mul_ps(corr,q);
    corr    = _mm_add_ps(corr,one);

    expmx2  = gmx_mm_exp_ps( _mm_or_ps( signbit , _mm_mul_ps(z,z) ) );
    expmx2  = _mm_mul_ps(expmx2,corr);

    pB1  = _mm_mul_ps(CB9,w2);
    pB0  = _mm_mul_ps(CB8,w2);
    pB1  = _mm_add_ps(pB1,CB7);
    pB0  = _mm_add_ps(pB0,CB6);
    pB1  = _mm_mul_ps(pB1,w2);
    pB0  = _mm_mul_ps(pB0,w2);
    pB1  = _mm_add_ps(pB1,CB5);
    pB0  = _mm_add_ps(pB0,CB4);
    pB1  = _mm_mul_ps(pB1,w2);
    pB0  = _mm_mul_ps(pB0,w2);
    pB1  = _mm_add_ps(pB1,CB3);
    pB0  = _mm_add_ps(pB0,CB2);
    pB1  = _mm_mul_ps(pB1,w2);
    pB0  = _mm_mul_ps(pB0,w2);
    pB1  = _mm_add_ps(pB1,CB1);
    pB1  = _mm_mul_ps(pB1,w);
    pB0  = _mm_add_ps(pB0,pB1);
    pB0  = _mm_add_ps(pB0,CB0);

    pC0  = _mm_mul_ps(CC10,t2);
    pC1  = _mm_mul_ps(CC9,t2);
    pC0  = _mm_add_ps(pC0,CC8);
    pC1  = _mm_add_ps(pC1,CC7);
    pC0  = _mm_mul_ps(pC0,t2);
    pC1  = _mm_mul_ps(pC1,t2);
    pC0  = _mm_add_ps(pC0,CC6);
    pC1  = _mm_add_ps(pC1,CC5);
    pC0  = _mm_mul_ps(pC0,t2);
    pC1  = _mm_mul_ps(pC1,t2);
    pC0  = _mm_add_ps(pC0,CC4);
    pC1  = _mm_add_ps(pC1,CC3);
    pC0  = _mm_mul_ps(pC0,t2);
    pC1  = _mm_mul_ps(pC1,t2);
    pC0  = _mm_add_ps(pC0,CC2);
    pC1  = _mm_add_ps(pC1,CC1);
    pC0  = _mm_mul_ps(pC0,t2);
    pC1  = _mm_mul_ps(pC1,t);
    pC0  = _mm_add_ps(pC0,pC1);
    pC0  = _mm_add_ps(pC0,CC0);
    pC0  = _mm_mul_ps(pC0,t);

    /* SELECT pB0 or pC0 for erfc() */
    mask     = _mm_cmplt_ps(two,y);
    res_erfc = _mm_blendv_ps(pB0,pC0,mask);
    res_erfc = _mm_mul_ps(res_erfc,expmx2);

    /* erfc(x<0) = 2-erfc(|x|) */
    mask     = _mm_cmplt_ps(x,_mm_setzero_ps());
    res_erfc = _mm_blendv_ps(res_erfc,_mm_sub_ps(two,res_erfc),mask);

    /* Select erf() or erfc() */
    mask = _mm_cmplt_ps(y,_mm_set1_ps(0.75f));
    res  = _mm_blendv_ps(res_erfc,_mm_sub_ps(one,res_erf),mask);

    return res;
}
/* Calculate the force correction due to PME analytically.
 *
 * This routine is meant to enable analytical evaluation of the
 * direct-space PME electrostatic force to avoid tables.
 *
 * The direct-space potential should be Erfc(beta*r)/r, but there
 * are some problems evaluating that:
 *
 * First, the error function is difficult (read: expensive) to
 * approximate accurately for intermediate to large arguments, and
 * this happens already in ranges of beta*r that occur in simulations.
 * Second, we now try to avoid calculating potentials in Gromacs but
 * use forces directly.
 *
 * We can simplify things slightly by noting that the PME part is really
 * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
 *
 *     V = 1/r - Erf(beta*r)/r
 *
 * The first term we already have from the inverse square root, so
 * that we can leave out of this routine.
 *
 * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
 * the argument beta*r will be in the range 0.15 to ~4. Use your
 * favorite plotting program to realize how well-behaved Erf(z)/z is
 * in this range!
 *
 * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
 * However, it turns out it is more efficient to approximate f(z)/z and
 * then only use even powers. This is another minor optimization, since
 * we actually WANT f(z)/z, because it is going to be multiplied by
 * the vector between the two atoms to get the vectorial force. The
 * fastest flops are the ones we can avoid calculating!
 *
 * So, here's how it should be used:
 *
 * 1. Calculate r^2.
 * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
 * 3. Evaluate this routine with z^2 as the argument.
 * 4. The return value is the expression:
 *
 *        2*exp(-z^2)     erf(z)
 *       ------------- - --------
 *       sqrt(pi)*z^2       z^3
 *
 * 5. Multiply the entire expression by beta^3. This will get you
 *
 *       beta^3*2*exp(-z^2)     beta^3*erf(z)
 *       ------------------  - ---------------
 *          sqrt(pi)*z^2             z^3
 *
 *    or, switching back to r (z=r*beta):
 *
 *       2*beta*exp(-r^2*beta^2)   erf(r*beta)
 *       ----------------------- - -----------
 *            sqrt(pi)*r^2             r^3
 *
 *    With a bit of math exercise you should be able to confirm that
 *    this is exactly D[Erf[beta*r]/r,r] divided by r another time.
 *
 * 6. Add the result to 1/r^3, multiply by the product of the charges,
 *    and you have your force (divided by r). A final multiplication
 *    with the vector connecting the two particles and you have your
 *    vectorial force to add to the particles.
 */
static gmx_inline __m256
gmx_mm256_pmecorrF_ps(__m256 z2)
{
    const __m256  FN6      = _mm256_set1_ps(-1.7357322914161492954e-8f);
    const __m256  FN5      = _mm256_set1_ps(1.4703624142580877519e-6f);
    const __m256  FN4      = _mm256_set1_ps(-0.000053401640219807709149f);
    const __m256  FN3      = _mm256_set1_ps(0.0010054721316683106153f);
    const __m256  FN2      = _mm256_set1_ps(-0.019278317264888380590f);
    const __m256  FN1      = _mm256_set1_ps(0.069670166153766424023f);
    const __m256  FN0      = _mm256_set1_ps(-0.75225204789749321333f);

    const __m256  FD4      = _mm256_set1_ps(0.0011193462567257629232f);
    const __m256  FD3      = _mm256_set1_ps(0.014866955030185295499f);
    const __m256  FD2      = _mm256_set1_ps(0.11583842382862377919f);
    const __m256  FD1      = _mm256_set1_ps(0.50736591960530292870f);
    const __m256  FD0      = _mm256_set1_ps(1.0f);

    __m256 z4;
    __m256 polyFN0,polyFN1,polyFD0,polyFD1;

    z4       = _mm256_mul_ps(z2,z2);

    polyFD0  = _mm256_mul_ps(FD4,z4);
    polyFD1  = _mm256_mul_ps(FD3,z4);
    polyFD0  = _mm256_add_ps(polyFD0,FD2);
    polyFD1  = _mm256_add_ps(polyFD1,FD1);
    polyFD0  = _mm256_mul_ps(polyFD0,z4);
    polyFD1  = _mm256_mul_ps(polyFD1,z2);
    polyFD0  = _mm256_add_ps(polyFD0,FD0);
    polyFD0  = _mm256_add_ps(polyFD0,polyFD1);

    polyFD0  = gmx_mm256_inv_ps(polyFD0);

    polyFN0  = _mm256_mul_ps(FN6,z4);
    polyFN1  = _mm256_mul_ps(FN5,z4);
    polyFN0  = _mm256_add_ps(polyFN0,FN4);
    polyFN1  = _mm256_add_ps(polyFN1,FN3);
    polyFN0  = _mm256_mul_ps(polyFN0,z4);
    polyFN1  = _mm256_mul_ps(polyFN1,z4);
    polyFN0  = _mm256_add_ps(polyFN0,FN2);
    polyFN1  = _mm256_add_ps(polyFN1,FN1);
    polyFN0  = _mm256_mul_ps(polyFN0,z4);
    polyFN1  = _mm256_mul_ps(polyFN1,z2);
    polyFN0  = _mm256_add_ps(polyFN0,FN0);
    polyFN0  = _mm256_add_ps(polyFN0,polyFN1);

    return _mm256_mul_ps(polyFN0,polyFD0);
}
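
/* Sketch of steps 1-6 from the comment above (illustration only, not part of
 * the original header). The parameters are assumptions: beta2/beta3 hold
 * beta^2 and beta^3 broadcast to all lanes, qq the product of the charges.
 * Returns the force divided by r for eight particle pairs at once.
 */
static gmx_inline __m256
gmx_mm256_pme_fscal_example(__m256 r2, __m256 beta2, __m256 beta3, __m256 qq)
{
    __m256 z2    = _mm256_mul_ps(beta2,r2);                       /* step 2: z^2 = beta^2*r^2 */
    __m256 corr  = gmx_mm256_pmecorrF_ps(z2);                     /* steps 3-4                */
    __m256 rinv  = gmx_mm256_invsqrt_ps(r2);
    __m256 rinv3 = _mm256_mul_ps(rinv,_mm256_mul_ps(rinv,rinv));  /* 1/r^3                    */
    /* steps 5-6: f/r = qq*(1/r^3 + beta^3*corr) */
    return _mm256_mul_ps(qq,_mm256_add_ps(rinv3,_mm256_mul_ps(beta3,corr)));
}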
/* 128-bit wide version of the PME force correction above */
static gmx_inline __m128
gmx_mm_pmecorrF_ps(__m128 z2)
{
    const __m128  FN6      = _mm_set1_ps(-1.7357322914161492954e-8f);
    const __m128  FN5      = _mm_set1_ps(1.4703624142580877519e-6f);
    const __m128  FN4      = _mm_set1_ps(-0.000053401640219807709149f);
    const __m128  FN3      = _mm_set1_ps(0.0010054721316683106153f);
    const __m128  FN2      = _mm_set1_ps(-0.019278317264888380590f);
    const __m128  FN1      = _mm_set1_ps(0.069670166153766424023f);
    const __m128  FN0      = _mm_set1_ps(-0.75225204789749321333f);

    const __m128  FD4      = _mm_set1_ps(0.0011193462567257629232f);
    const __m128  FD3      = _mm_set1_ps(0.014866955030185295499f);
    const __m128  FD2      = _mm_set1_ps(0.11583842382862377919f);
    const __m128  FD1      = _mm_set1_ps(0.50736591960530292870f);
    const __m128  FD0      = _mm_set1_ps(1.0f);

    __m128 z4;
    __m128 polyFN0,polyFN1,polyFD0,polyFD1;

    z4       = _mm_mul_ps(z2,z2);

    polyFD0  = _mm_mul_ps(FD4,z4);
    polyFD1  = _mm_mul_ps(FD3,z4);
    polyFD0  = _mm_add_ps(polyFD0,FD2);
    polyFD1  = _mm_add_ps(polyFD1,FD1);
    polyFD0  = _mm_mul_ps(polyFD0,z4);
    polyFD1  = _mm_mul_ps(polyFD1,z2);
    polyFD0  = _mm_add_ps(polyFD0,FD0);
    polyFD0  = _mm_add_ps(polyFD0,polyFD1);

    polyFD0  = gmx_mm_inv_ps(polyFD0);

    polyFN0  = _mm_mul_ps(FN6,z4);
    polyFN1  = _mm_mul_ps(FN5,z4);
    polyFN0  = _mm_add_ps(polyFN0,FN4);
    polyFN1  = _mm_add_ps(polyFN1,FN3);
    polyFN0  = _mm_mul_ps(polyFN0,z4);
    polyFN1  = _mm_mul_ps(polyFN1,z4);
    polyFN0  = _mm_add_ps(polyFN0,FN2);
    polyFN1  = _mm_add_ps(polyFN1,FN1);
    polyFN0  = _mm_mul_ps(polyFN0,z4);
    polyFN1  = _mm_mul_ps(polyFN1,z2);
    polyFN0  = _mm_add_ps(polyFN0,FN0);
    polyFN0  = _mm_add_ps(polyFN0,polyFN1);

    return _mm_mul_ps(polyFN0,polyFD0);
}
/* Calculate the potential correction due to PME analytically.
 *
 * See gmx_mm256_pmecorrF_ps() for details about the approximation.
 *
 * This routine calculates Erf(z)/z, although you should provide z^2
 * as the input argument.
 *
 * Here's how it should be used:
 *
 * 1. Calculate r^2.
 * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
 * 3. Evaluate this routine with z^2 as the argument.
 * 4. The return value is the expression:
 *
 *        erf(z)
 *       --------
 *           z
 *
 * 5. Multiply the entire expression by beta, switching back to r (z=r*beta):
 *
 *       erf(r*beta)
 *       -----------
 *            r
 *
 * 6. Subtract the result from 1/r, multiply by the product of the charges,
 *    and you have your potential.
 */
static gmx_inline __m256
gmx_mm256_pmecorrV_ps(__m256 z2)
{
    const __m256  VN6      = _mm256_set1_ps(1.9296833005951166339e-8f);
    const __m256  VN5      = _mm256_set1_ps(-1.4213390571557850962e-6f);
    const __m256  VN4      = _mm256_set1_ps(0.000041603292906656984871f);
    const __m256  VN3      = _mm256_set1_ps(-0.00013134036773265025626f);
    const __m256  VN2      = _mm256_set1_ps(0.038657983986041781264f);
    const __m256  VN1      = _mm256_set1_ps(0.11285044772717598220f);
    const __m256  VN0      = _mm256_set1_ps(1.1283802385263030286f);

    const __m256  VD3      = _mm256_set1_ps(0.0066752224023576045451f);
    const __m256  VD2      = _mm256_set1_ps(0.078647795836373922256f);
    const __m256  VD1      = _mm256_set1_ps(0.43336185284710920150f);
    const __m256  VD0      = _mm256_set1_ps(1.0f);

    __m256 z4;
    __m256 polyVN0,polyVN1,polyVD0,polyVD1;

    z4       = _mm256_mul_ps(z2,z2);

    polyVD1  = _mm256_mul_ps(VD3,z4);
    polyVD0  = _mm256_mul_ps(VD2,z4);
    polyVD1  = _mm256_add_ps(polyVD1,VD1);
    polyVD0  = _mm256_add_ps(polyVD0,VD0);
    polyVD1  = _mm256_mul_ps(polyVD1,z2);
    polyVD0  = _mm256_add_ps(polyVD0,polyVD1);

    polyVD0  = gmx_mm256_inv_ps(polyVD0);

    polyVN0  = _mm256_mul_ps(VN6,z4);
    polyVN1  = _mm256_mul_ps(VN5,z4);
    polyVN0  = _mm256_add_ps(polyVN0,VN4);
    polyVN1  = _mm256_add_ps(polyVN1,VN3);
    polyVN0  = _mm256_mul_ps(polyVN0,z4);
    polyVN1  = _mm256_mul_ps(polyVN1,z4);
    polyVN0  = _mm256_add_ps(polyVN0,VN2);
    polyVN1  = _mm256_add_ps(polyVN1,VN1);
    polyVN0  = _mm256_mul_ps(polyVN0,z4);
    polyVN1  = _mm256_mul_ps(polyVN1,z2);
    polyVN0  = _mm256_add_ps(polyVN0,VN0);
    polyVN0  = _mm256_add_ps(polyVN0,polyVN1);

    return _mm256_mul_ps(polyVN0,polyVD0);
}
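
/* Sketch of the recipe from the comment above (illustration only, not part of
 * the original header). The parameters are assumptions: beta2/beta hold
 * beta^2 and beta broadcast to all lanes, qq the product of the charges.
 * Returns the direct-space potential qq*erfc(beta*r)/r for eight pairs.
 */
static gmx_inline __m256
gmx_mm256_pme_vcoul_example(__m256 r2, __m256 beta2, __m256 beta, __m256 qq)
{
    __m256 z2   = _mm256_mul_ps(beta2,r2);                 /* z^2 = beta^2*r^2 */
    __m256 corr = gmx_mm256_pmecorrV_ps(z2);               /* erf(z)/z         */
    __m256 rinv = gmx_mm256_invsqrt_ps(r2);                /* 1/r              */
    /* qq*(1/r - beta*erf(z)/z) = qq*(1/r - erf(beta*r)/r) = qq*erfc(beta*r)/r */
    return _mm256_mul_ps(qq,_mm256_sub_ps(rinv,_mm256_mul_ps(beta,corr)));
}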
/* 128-bit wide version of the PME potential correction above */
static gmx_inline __m128
gmx_mm_pmecorrV_ps(__m128 z2)
{
    const __m128  VN6      = _mm_set1_ps(1.9296833005951166339e-8f);
    const __m128  VN5      = _mm_set1_ps(-1.4213390571557850962e-6f);
    const __m128  VN4      = _mm_set1_ps(0.000041603292906656984871f);
    const __m128  VN3      = _mm_set1_ps(-0.00013134036773265025626f);
    const __m128  VN2      = _mm_set1_ps(0.038657983986041781264f);
    const __m128  VN1      = _mm_set1_ps(0.11285044772717598220f);
    const __m128  VN0      = _mm_set1_ps(1.1283802385263030286f);

    const __m128  VD3      = _mm_set1_ps(0.0066752224023576045451f);
    const __m128  VD2      = _mm_set1_ps(0.078647795836373922256f);
    const __m128  VD1      = _mm_set1_ps(0.43336185284710920150f);
    const __m128  VD0      = _mm_set1_ps(1.0f);

    __m128 z4;
    __m128 polyVN0,polyVN1,polyVD0,polyVD1;

    z4       = _mm_mul_ps(z2,z2);

    polyVD1  = _mm_mul_ps(VD3,z4);
    polyVD0  = _mm_mul_ps(VD2,z4);
    polyVD1  = _mm_add_ps(polyVD1,VD1);
    polyVD0  = _mm_add_ps(polyVD0,VD0);
    polyVD1  = _mm_mul_ps(polyVD1,z2);
    polyVD0  = _mm_add_ps(polyVD0,polyVD1);

    polyVD0  = gmx_mm_inv_ps(polyVD0);

    polyVN0  = _mm_mul_ps(VN6,z4);
    polyVN1  = _mm_mul_ps(VN5,z4);
    polyVN0  = _mm_add_ps(polyVN0,VN4);
    polyVN1  = _mm_add_ps(polyVN1,VN3);
    polyVN0  = _mm_mul_ps(polyVN0,z4);
    polyVN1  = _mm_mul_ps(polyVN1,z4);
    polyVN0  = _mm_add_ps(polyVN0,VN2);
    polyVN1  = _mm_add_ps(polyVN1,VN1);
    polyVN0  = _mm_mul_ps(polyVN0,z4);
    polyVN1  = _mm_mul_ps(polyVN1,z2);
    polyVN0  = _mm_add_ps(polyVN0,VN0);
    polyVN0  = _mm_add_ps(polyVN0,polyVN1);

    return _mm_mul_ps(polyVN0,polyVD0);
}
/* Evaluate sin and cos of x simultaneously, 256-bit wide */
static gmx_inline void
gmx_mm256_sincos_ps(__m256  x,
                    __m256 *sinval,
                    __m256 *cosval)
{
    const __m256 two_over_pi = _mm256_set1_ps(2.0f/(float)M_PI);
    const __m256 half        = _mm256_set1_ps(0.5f);
    const __m256 one         = _mm256_set1_ps(1.0f);
    const __m256 zero        = _mm256_setzero_ps();

    const __m128i ione       = _mm_set1_epi32(1);

    const __m256 mask_one    = _mm256_castsi256_ps(_mm256_set1_epi32(1));
    const __m256 mask_two    = _mm256_castsi256_ps(_mm256_set1_epi32(2));
    const __m256 mask_three  = _mm256_castsi256_ps(_mm256_set1_epi32(3));

    const __m256 CA1         = _mm256_set1_ps(1.5703125f);
    const __m256 CA2         = _mm256_set1_ps(4.837512969970703125e-4f);
    const __m256 CA3         = _mm256_set1_ps(7.54978995489188216e-8f);

    const __m256 CC0         = _mm256_set1_ps(-0.0013602249f);
    const __m256 CC1         = _mm256_set1_ps(0.0416566950f);
    const __m256 CC2         = _mm256_set1_ps(-0.4999990225f);
    const __m256 CS0         = _mm256_set1_ps(-0.0001950727f);
    const __m256 CS1         = _mm256_set1_ps(0.0083320758f);
    const __m256 CS2         = _mm256_set1_ps(-0.1666665247f);

    const __m256 signbit     = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );

    __m256  y,y2;
    __m256  z;
    __m256i iz;
    __m128i iz_high,iz_low;
    __m256  offset_sin,offset_cos;
    __m256  mask_sin,mask_cos;
    __m256  tmp1,tmp2;
    __m256  tmp_sin,tmp_cos;

    y               = _mm256_mul_ps(x,two_over_pi);
    y               = _mm256_add_ps(y,_mm256_or_ps(_mm256_and_ps(y,signbit),half));

    iz              = _mm256_cvttps_epi32(y);
    z               = _mm256_round_ps(y,_MM_FROUND_TO_ZERO);

    offset_sin      = _mm256_and_ps(_mm256_castsi256_ps(iz),mask_three);

    iz_high         = _mm256_extractf128_si256(iz,0x1);
    iz_low          = _mm256_castsi256_si128(iz);
    iz_low          = _mm_add_epi32(iz_low,ione);
    iz_high         = _mm_add_epi32(iz_high,ione);
    iz              = _mm256_castsi128_si256(iz_low);
    iz              = _mm256_insertf128_si256(iz,iz_high,0x1);
    offset_cos      = _mm256_castsi256_ps(iz);

    /* Extended precision arithmetic to achieve full precision */
    y               = _mm256_mul_ps(z,CA1);
    tmp1            = _mm256_mul_ps(z,CA2);
    tmp2            = _mm256_mul_ps(z,CA3);
    y               = _mm256_sub_ps(x,y);
    y               = _mm256_sub_ps(y,tmp1);
    y               = _mm256_sub_ps(y,tmp2);

    y2              = _mm256_mul_ps(y,y);

    tmp1            = _mm256_mul_ps(CC0,y2);
    tmp1            = _mm256_add_ps(tmp1,CC1);
    tmp2            = _mm256_mul_ps(CS0,y2);
    tmp2            = _mm256_add_ps(tmp2,CS1);
    tmp1            = _mm256_mul_ps(tmp1,y2);
    tmp1            = _mm256_add_ps(tmp1,CC2);
    tmp2            = _mm256_mul_ps(tmp2,y2);
    tmp2            = _mm256_add_ps(tmp2,CS2);

    tmp1            = _mm256_mul_ps(tmp1,y2);
    tmp1            = _mm256_add_ps(tmp1,one);

    tmp2            = _mm256_mul_ps(tmp2,_mm256_mul_ps(y,y2));
    tmp2            = _mm256_add_ps(tmp2,y);

#ifdef __INTEL_COMPILER
    /* Intel Compiler version 12.1.3 20120130 is buggy if optimization is enabled unless we cast explicitly! */
    mask_sin        = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(offset_sin,mask_one))),zero,_CMP_EQ_OQ);
    mask_cos        = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(offset_cos,mask_one))),zero,_CMP_EQ_OQ);
#else
    mask_sin        = _mm256_cmp_ps( _mm256_and_ps(offset_sin,mask_one), zero, _CMP_EQ_OQ);
    mask_cos        = _mm256_cmp_ps( _mm256_and_ps(offset_cos,mask_one), zero, _CMP_EQ_OQ);
#endif
    tmp_sin         = _mm256_blendv_ps(tmp1,tmp2,mask_sin);
    tmp_cos         = _mm256_blendv_ps(tmp1,tmp2,mask_cos);

    tmp1            = _mm256_xor_ps(signbit,tmp_sin);
    tmp2            = _mm256_xor_ps(signbit,tmp_cos);

#ifdef __INTEL_COMPILER
    /* Intel Compiler version 12.1.3 20120130 is buggy if optimization is enabled unless we cast explicitly! */
    mask_sin        = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(offset_sin,mask_two))),zero,_CMP_EQ_OQ);
    mask_cos        = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(offset_cos,mask_two))),zero,_CMP_EQ_OQ);
#else
    mask_sin        = _mm256_cmp_ps( _mm256_and_ps(offset_sin,mask_two), zero, _CMP_EQ_OQ);
    mask_cos        = _mm256_cmp_ps( _mm256_and_ps(offset_cos,mask_two), zero, _CMP_EQ_OQ);
#endif

    *sinval         = _mm256_blendv_ps(tmp1,tmp_sin,mask_sin);
    *cosval         = _mm256_blendv_ps(tmp2,tmp_cos,mask_cos);
}
/* sin(x) and cos(x) computed together, 128-bit wide version */
static void
gmx_mm_sincos_ps(__m128 x,
                 __m128 *sinval,
                 __m128 *cosval)
{
    const __m128 two_over_pi = _mm_set1_ps(2.0/M_PI);
    const __m128 half        = _mm_set1_ps(0.5);
    const __m128 one         = _mm_set1_ps(1.0);

    const __m128i izero      = _mm_set1_epi32(0);
    const __m128i ione       = _mm_set1_epi32(1);
    const __m128i itwo       = _mm_set1_epi32(2);
    const __m128i ithree     = _mm_set1_epi32(3);
    const __m128  signbit    = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );

    const __m128 CA1         = _mm_set1_ps(1.5703125f);
    const __m128 CA2         = _mm_set1_ps(4.837512969970703125e-4f);
    const __m128 CA3         = _mm_set1_ps(7.54978995489188216e-8f);

    const __m128 CC0         = _mm_set1_ps(-0.0013602249f);
    const __m128 CC1         = _mm_set1_ps(0.0416566950f);
    const __m128 CC2         = _mm_set1_ps(-0.4999990225f);
    const __m128 CS0         = _mm_set1_ps(-0.0001950727f);
    const __m128 CS1         = _mm_set1_ps(0.0083320758f);
    const __m128 CS2         = _mm_set1_ps(-0.1666665247f);

    __m128  y,y2;
    __m128  z;
    __m128i iz;
    __m128i offset_sin,offset_cos;
    __m128  tmp1,tmp2;
    __m128  mask_sin,mask_cos;
    __m128  tmp_sin,tmp_cos;

    y          = _mm_mul_ps(x,two_over_pi);
    y          = _mm_add_ps(y,_mm_or_ps(_mm_and_ps(y,signbit),half));

    iz         = _mm_cvttps_epi32(y);
    z          = _mm_round_ps(y,_MM_FROUND_TO_ZERO);

    offset_sin = _mm_and_si128(iz,ithree);
    offset_cos = _mm_add_epi32(iz,ione);

    /* Extended precision arithmetic to achieve full precision */
    y     = _mm_mul_ps(z,CA1);
    tmp1  = _mm_mul_ps(z,CA2);
    tmp2  = _mm_mul_ps(z,CA3);
    y     = _mm_sub_ps(x,y);
    y     = _mm_sub_ps(y,tmp1);
    y     = _mm_sub_ps(y,tmp2);

    y2    = _mm_mul_ps(y,y);

    tmp1  = _mm_mul_ps(CC0,y2);
    tmp1  = _mm_add_ps(tmp1,CC1);
    tmp2  = _mm_mul_ps(CS0,y2);
    tmp2  = _mm_add_ps(tmp2,CS1);
    tmp1  = _mm_mul_ps(tmp1,y2);
    tmp1  = _mm_add_ps(tmp1,CC2);
    tmp2  = _mm_mul_ps(tmp2,y2);
    tmp2  = _mm_add_ps(tmp2,CS2);

    tmp1  = _mm_mul_ps(tmp1,y2);
    tmp1  = _mm_add_ps(tmp1,one);

    tmp2  = _mm_mul_ps(tmp2,_mm_mul_ps(y,y2));
    tmp2  = _mm_add_ps(tmp2,y);

    mask_sin = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin,ione), izero));
    mask_cos = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos,ione), izero));

    tmp_sin  = _mm_blendv_ps(tmp1,tmp2,mask_sin);
    tmp_cos  = _mm_blendv_ps(tmp1,tmp2,mask_cos);

    mask_sin = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin,itwo), izero));
    mask_cos = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos,itwo), izero));

    tmp1     = _mm_xor_ps(signbit,tmp_sin);
    tmp2     = _mm_xor_ps(signbit,tmp_cos);

    *sinval  = _mm_blendv_ps(tmp1,tmp_sin,mask_sin);
    *cosval  = _mm_blendv_ps(tmp2,tmp_cos,mask_cos);
}

/*
 * sin(x), 256- and 128-bit wide versions.
 *
 * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
 * will then call the sincos() routine and waste a factor 2 in performance!
 */
static __m256
gmx_mm256_sin_ps(__m256 x)
{
    __m256 s,c;
    gmx_mm256_sincos_ps(x,&s,&c);
    return s;
}

static __m128
gmx_mm_sin_ps(__m128 x)
{
    __m128 s,c;
    gmx_mm_sincos_ps(x,&s,&c);
    return s;
}

/*
 * cos(x), 256- and 128-bit wide versions.
 *
 * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
 * will then call the sincos() routine and waste a factor 2 in performance!
 */
static __m256
gmx_mm256_cos_ps(__m256 x)
{
    __m256 s,c;
    gmx_mm256_sincos_ps(x,&s,&c);
    return c;
}

static __m128
gmx_mm_cos_ps(__m128 x)
{
    __m128 s,c;
    gmx_mm_sincos_ps(x,&s,&c);
    return c;
}

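/* Usage sketch (the helper name below is hypothetical, not part of this header): when both
 * the sine and cosine of the same argument are needed, call the sincos routine once instead
 * of gmx_mm256_sin_ps() followed by gmx_mm256_cos_ps(), which would perform the argument
 * reduction and polynomial evaluation twice.
 */
static gmx_inline __m256
gmx_mm256_sin_plus_cos_example(__m256 x)
{
    __m256 s,c;

    /* One call yields both results */
    gmx_mm256_sincos_ps(x,&s,&c);

    return _mm256_add_ps(s,c);
}
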
/* tan(x), 256-bit wide version */
static __m256
gmx_mm256_tan_ps(__m256 x)
{
    __m256 sinval,cosval;
    __m256 tanval;

    gmx_mm256_sincos_ps(x,&sinval,&cosval);

    tanval = _mm256_mul_ps(sinval,gmx_mm256_inv_ps(cosval));

    return tanval;
}

/* tan(x), 128-bit wide version */
static __m128
gmx_mm_tan_ps(__m128 x)
{
    __m128 sinval,cosval;
    __m128 tanval;

    gmx_mm_sincos_ps(x,&sinval,&cosval);

    tanval = _mm_mul_ps(sinval,gmx_mm_inv_ps(cosval));

    return tanval;
}

/* asin(x), 256-bit wide version */
static __m256
gmx_mm256_asin_ps(__m256 x)
{
    /* Same algorithm as cephes library */
    const __m256 signmask  = _mm256_castsi256_ps( _mm256_set1_epi32(0x7FFFFFFF) );
    const __m256 limitlow  = _mm256_set1_ps(1e-4f);
    const __m256 half      = _mm256_set1_ps(0.5f);
    const __m256 one       = _mm256_set1_ps(1.0f);
    const __m256 halfpi    = _mm256_set1_ps((float)M_PI/2.0f);

    const __m256 CC5       = _mm256_set1_ps(4.2163199048E-2f);
    const __m256 CC4       = _mm256_set1_ps(2.4181311049E-2f);
    const __m256 CC3       = _mm256_set1_ps(4.5470025998E-2f);
    const __m256 CC2       = _mm256_set1_ps(7.4953002686E-2f);
    const __m256 CC1       = _mm256_set1_ps(1.6666752422E-1f);

    __m256 sign;
    __m256 mask;
    __m256 xabs;
    __m256 z,z1,z2,q,q1,q2;
    __m256 pA,pB;

    sign  = _mm256_andnot_ps(signmask,x);
    xabs  = _mm256_and_ps(x,signmask);

    mask  = _mm256_cmp_ps(xabs,half,_CMP_GT_OQ);

    z1    = _mm256_mul_ps(half, _mm256_sub_ps(one,xabs));
    q1    = _mm256_mul_ps(z1,gmx_mm256_invsqrt_ps(z1));
    q1    = _mm256_andnot_ps(_mm256_cmp_ps(xabs,one,_CMP_EQ_OQ),q1);

    q2    = xabs;
    z2    = _mm256_mul_ps(q2,q2);

    z     = _mm256_blendv_ps(z2,z1,mask);
    q     = _mm256_blendv_ps(q2,q1,mask);

    z2    = _mm256_mul_ps(z,z);

    pA    = _mm256_mul_ps(CC5,z2);
    pB    = _mm256_mul_ps(CC4,z2);

    pA    = _mm256_add_ps(pA,CC3);
    pB    = _mm256_add_ps(pB,CC2);

    pA    = _mm256_mul_ps(pA,z2);
    pB    = _mm256_mul_ps(pB,z2);

    pA    = _mm256_add_ps(pA,CC1);
    pA    = _mm256_mul_ps(pA,z);

    z     = _mm256_add_ps(pA,pB);
    z     = _mm256_mul_ps(z,q);
    z     = _mm256_add_ps(z,q);

    q2    = _mm256_sub_ps(halfpi,z);
    q2    = _mm256_sub_ps(q2,z);

    z     = _mm256_blendv_ps(z,q2,mask);

    mask  = _mm256_cmp_ps(xabs,limitlow,_CMP_GT_OQ);
    z     = _mm256_blendv_ps(xabs,z,mask);

    z     = _mm256_xor_ps(z,sign);

    return z;
}

/* asin(x), 128-bit wide version */
static __m128
gmx_mm_asin_ps(__m128 x)
{
    /* Same algorithm as cephes library */
    const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
    const __m128 limitlow  = _mm_set1_ps(1e-4f);
    const __m128 half      = _mm_set1_ps(0.5f);
    const __m128 one       = _mm_set1_ps(1.0f);
    const __m128 halfpi    = _mm_set1_ps(M_PI/2.0f);

    const __m128 CC5       = _mm_set1_ps(4.2163199048E-2f);
    const __m128 CC4       = _mm_set1_ps(2.4181311049E-2f);
    const __m128 CC3       = _mm_set1_ps(4.5470025998E-2f);
    const __m128 CC2       = _mm_set1_ps(7.4953002686E-2f);
    const __m128 CC1       = _mm_set1_ps(1.6666752422E-1f);

    __m128 sign;
    __m128 mask;
    __m128 xabs;
    __m128 z,z1,z2,q,q1,q2;
    __m128 pA,pB;

    sign  = _mm_andnot_ps(signmask,x);
    xabs  = _mm_and_ps(x,signmask);

    mask  = _mm_cmpgt_ps(xabs,half);

    z1    = _mm_mul_ps(half, _mm_sub_ps(one,xabs));
    q1    = _mm_mul_ps(z1,gmx_mm_invsqrt_ps(z1));
    q1    = _mm_andnot_ps(_mm_cmpeq_ps(xabs,one),q1);

    q2    = xabs;
    z2    = _mm_mul_ps(q2,q2);

    z     = _mm_or_ps( _mm_and_ps(mask,z1) , _mm_andnot_ps(mask,z2) );
    q     = _mm_or_ps( _mm_and_ps(mask,q1) , _mm_andnot_ps(mask,q2) );

    z2    = _mm_mul_ps(z,z);

    pA    = _mm_mul_ps(CC5,z2);
    pB    = _mm_mul_ps(CC4,z2);

    pA    = _mm_add_ps(pA,CC3);
    pB    = _mm_add_ps(pB,CC2);

    pA    = _mm_mul_ps(pA,z2);
    pB    = _mm_mul_ps(pB,z2);

    pA    = _mm_add_ps(pA,CC1);
    pA    = _mm_mul_ps(pA,z);

    z     = _mm_add_ps(pA,pB);
    z     = _mm_mul_ps(z,q);
    z     = _mm_add_ps(z,q);

    q2    = _mm_sub_ps(halfpi,z);
    q2    = _mm_sub_ps(q2,z);

    z     = _mm_or_ps( _mm_and_ps(mask,q2) , _mm_andnot_ps(mask,z) );

    mask  = _mm_cmpgt_ps(xabs,limitlow);
    z     = _mm_or_ps( _mm_and_ps(mask,z) , _mm_andnot_ps(mask,xabs) );

    z     = _mm_xor_ps(z,sign);

    return z;
}

/* acos(x), 256-bit wide version */
static __m256
gmx_mm256_acos_ps(__m256 x)
{
    const __m256 signmask  = _mm256_castsi256_ps( _mm256_set1_epi32(0x7FFFFFFF) );
    const __m256 one_ps    = _mm256_set1_ps(1.0f);
    const __m256 half_ps   = _mm256_set1_ps(0.5f);
    const __m256 pi_ps     = _mm256_set1_ps((float)M_PI);
    const __m256 halfpi_ps = _mm256_set1_ps((float)M_PI/2.0f);

    __m256 mask1;
    __m256 mask2;
    __m256 xabs;
    __m256 z,z1,z2,z3;

    xabs  = _mm256_and_ps(x,signmask);
    mask1 = _mm256_cmp_ps(xabs,half_ps,_CMP_GT_OQ);
    mask2 = _mm256_cmp_ps(x,_mm256_setzero_ps(),_CMP_GT_OQ);

    z     = _mm256_mul_ps(half_ps,_mm256_sub_ps(one_ps,xabs));
    z     = _mm256_mul_ps(z,gmx_mm256_invsqrt_ps(z));
    z     = _mm256_andnot_ps(_mm256_cmp_ps(xabs,one_ps,_CMP_EQ_OQ),z);

    z     = _mm256_blendv_ps(x,z,mask1);
    z     = gmx_mm256_asin_ps(z);

    z2    = _mm256_add_ps(z,z);
    z1    = _mm256_sub_ps(pi_ps,z2);
    z3    = _mm256_sub_ps(halfpi_ps,z);

    z     = _mm256_blendv_ps(z1,z2,mask2);
    z     = _mm256_blendv_ps(z3,z,mask1);

    return z;
}

/* acos(x), 128-bit wide version */
static __m128
gmx_mm_acos_ps(__m128 x)
{
    const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
    const __m128 one_ps    = _mm_set1_ps(1.0f);
    const __m128 half_ps   = _mm_set1_ps(0.5f);
    const __m128 pi_ps     = _mm_set1_ps(M_PI);
    const __m128 halfpi_ps = _mm_set1_ps(M_PI/2.0f);

    __m128 mask1;
    __m128 mask2;
    __m128 xabs;
    __m128 z,z1,z2,z3;

    xabs  = _mm_and_ps(x,signmask);
    mask1 = _mm_cmpgt_ps(xabs,half_ps);
    mask2 = _mm_cmpgt_ps(x,_mm_setzero_ps());

    z     = _mm_mul_ps(half_ps,_mm_sub_ps(one_ps,xabs));
    z     = _mm_mul_ps(z,gmx_mm_invsqrt_ps(z));
    z     = _mm_andnot_ps(_mm_cmpeq_ps(xabs,one_ps),z);

    z     = _mm_blendv_ps(x,z,mask1);
    z     = gmx_mm_asin_ps(z);

    z2    = _mm_add_ps(z,z);
    z1    = _mm_sub_ps(pi_ps,z2);
    z3    = _mm_sub_ps(halfpi_ps,z);

    z     = _mm_blendv_ps(z1,z2,mask2);
    z     = _mm_blendv_ps(z3,z,mask1);

    return z;
}

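/* Usage sketch (the helper name below is hypothetical, not part of this header): given the
 * cosine of an angle in each of eight lanes, e.g. from a dot product of two normalized
 * vectors, recover the angle in radians. The input is clamped to [-1,1] to guard against
 * rounding drift in the dot product before acos() is evaluated.
 */
static gmx_inline __m256
gmx_mm256_angle_from_cosine_example(__m256 costheta)
{
    const __m256 one  = _mm256_set1_ps(1.0f);
    const __m256 mone = _mm256_set1_ps(-1.0f);

    /* Clamp to the valid acos() domain */
    costheta = _mm256_min_ps(costheta,one);
    costheta = _mm256_max_ps(costheta,mone);

    return gmx_mm256_acos_ps(costheta);
}
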
/* atan(x), 256-bit wide version */
static __m256
gmx_mm256_atan_ps(__m256 x)
{
    /* Same algorithm as cephes library */
    const __m256 signmask  = _mm256_castsi256_ps( _mm256_set1_epi32(0x7FFFFFFF) );
    const __m256 limit1    = _mm256_set1_ps(0.414213562373095f);
    const __m256 limit2    = _mm256_set1_ps(2.414213562373095f);
    const __m256 quarterpi = _mm256_set1_ps(0.785398163397448f);
    const __m256 halfpi    = _mm256_set1_ps(1.570796326794896f);
    const __m256 mone      = _mm256_set1_ps(-1.0f);
    const __m256 CC3       = _mm256_set1_ps(-3.33329491539E-1f);
    const __m256 CC5       = _mm256_set1_ps(1.99777106478E-1f);
    const __m256 CC7       = _mm256_set1_ps(-1.38776856032E-1f);
    const __m256 CC9       = _mm256_set1_ps(8.05374449538e-2f);

    __m256 sign;
    __m256 mask1,mask2;
    __m256 y,z1,z2;
    __m256 x2,x4;
    __m256 sum1,sum2;

    sign  = _mm256_andnot_ps(signmask,x);
    x     = _mm256_and_ps(x,signmask);

    mask1 = _mm256_cmp_ps(x,limit1,_CMP_GT_OQ);
    mask2 = _mm256_cmp_ps(x,limit2,_CMP_GT_OQ);

    z1    = _mm256_mul_ps(_mm256_add_ps(x,mone),gmx_mm256_inv_ps(_mm256_sub_ps(x,mone)));
    z2    = _mm256_mul_ps(mone,gmx_mm256_inv_ps(x));

    y     = _mm256_and_ps(mask1,quarterpi);
    y     = _mm256_blendv_ps(y,halfpi,mask2);

    x     = _mm256_blendv_ps(x,z1,mask1);
    x     = _mm256_blendv_ps(x,z2,mask2);

    x2    = _mm256_mul_ps(x,x);
    x4    = _mm256_mul_ps(x2,x2);

    sum1  = _mm256_mul_ps(CC9,x4);
    sum2  = _mm256_mul_ps(CC7,x4);
    sum1  = _mm256_add_ps(sum1,CC5);
    sum2  = _mm256_add_ps(sum2,CC3);
    sum1  = _mm256_mul_ps(sum1,x4);
    sum2  = _mm256_mul_ps(sum2,x2);

    sum1  = _mm256_add_ps(sum1,sum2);
    sum1  = _mm256_sub_ps(sum1,mone);
    sum1  = _mm256_mul_ps(sum1,x);
    y     = _mm256_add_ps(y,sum1);

    y     = _mm256_xor_ps(y,sign);

    return y;
}

/* atan(x), 128-bit wide version */
static __m128
gmx_mm_atan_ps(__m128 x)
{
    /* Same algorithm as cephes library */
    const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
    const __m128 limit1    = _mm_set1_ps(0.414213562373095f);
    const __m128 limit2    = _mm_set1_ps(2.414213562373095f);
    const __m128 quarterpi = _mm_set1_ps(0.785398163397448f);
    const __m128 halfpi    = _mm_set1_ps(1.570796326794896f);
    const __m128 mone      = _mm_set1_ps(-1.0f);
    const __m128 CC3       = _mm_set1_ps(-3.33329491539E-1f);
    const __m128 CC5       = _mm_set1_ps(1.99777106478E-1f);
    const __m128 CC7       = _mm_set1_ps(-1.38776856032E-1f);
    const __m128 CC9       = _mm_set1_ps(8.05374449538e-2f);

    __m128 sign;
    __m128 mask1,mask2;
    __m128 y,z1,z2;
    __m128 x2,x4;
    __m128 sum1,sum2;

    sign  = _mm_andnot_ps(signmask,x);
    x     = _mm_and_ps(x,signmask);

    mask1 = _mm_cmpgt_ps(x,limit1);
    mask2 = _mm_cmpgt_ps(x,limit2);

    z1    = _mm_mul_ps(_mm_add_ps(x,mone),gmx_mm_inv_ps(_mm_sub_ps(x,mone)));
    z2    = _mm_mul_ps(mone,gmx_mm_inv_ps(x));

    y     = _mm_and_ps(mask1,quarterpi);
    y     = _mm_blendv_ps(y,halfpi,mask2);

    x     = _mm_blendv_ps(x,z1,mask1);
    x     = _mm_blendv_ps(x,z2,mask2);

    x2    = _mm_mul_ps(x,x);
    x4    = _mm_mul_ps(x2,x2);

    sum1  = _mm_mul_ps(CC9,x4);
    sum2  = _mm_mul_ps(CC7,x4);
    sum1  = _mm_add_ps(sum1,CC5);
    sum2  = _mm_add_ps(sum2,CC3);
    sum1  = _mm_mul_ps(sum1,x4);
    sum2  = _mm_mul_ps(sum2,x2);

    sum1  = _mm_add_ps(sum1,sum2);
    sum1  = _mm_sub_ps(sum1,mone);
    sum1  = _mm_mul_ps(sum1,x);
    y     = _mm_add_ps(y,sum1);

    y     = _mm_xor_ps(y,sign);

    return y;
}

/* atan2(y,x), 256-bit wide version */
static __m256
gmx_mm256_atan2_ps(__m256 y, __m256 x)
{
    const __m256 pi          = _mm256_set1_ps( (float) M_PI);
    const __m256 minuspi     = _mm256_set1_ps( (float) -M_PI);
    const __m256 halfpi      = _mm256_set1_ps( (float) M_PI/2.0f);
    const __m256 minushalfpi = _mm256_set1_ps( (float) -M_PI/2.0f);

    __m256 z,z1,z3,z4;
    __m256 w;
    __m256 maskx_lt,maskx_eq;
    __m256 masky_lt,masky_eq;
    __m256 mask1,mask2,mask3,mask4,maskall;

    maskx_lt  = _mm256_cmp_ps(x,_mm256_setzero_ps(),_CMP_LT_OQ);
    masky_lt  = _mm256_cmp_ps(y,_mm256_setzero_ps(),_CMP_LT_OQ);
    maskx_eq  = _mm256_cmp_ps(x,_mm256_setzero_ps(),_CMP_EQ_OQ);
    masky_eq  = _mm256_cmp_ps(y,_mm256_setzero_ps(),_CMP_EQ_OQ);

    z         = _mm256_mul_ps(y,gmx_mm256_inv_ps(x));
    z         = gmx_mm256_atan_ps(z);

    mask1     = _mm256_and_ps(maskx_eq,masky_lt);
    mask2     = _mm256_andnot_ps(maskx_lt,masky_eq);
    mask3     = _mm256_andnot_ps( _mm256_or_ps(masky_lt,masky_eq) , maskx_eq);
    mask4     = _mm256_and_ps(maskx_lt,masky_eq);

    maskall   = _mm256_or_ps( _mm256_or_ps(mask1,mask2) , _mm256_or_ps(mask3,mask4) );

    z         = _mm256_andnot_ps(maskall,z);
    z1        = _mm256_and_ps(mask1,minushalfpi);
    z3        = _mm256_and_ps(mask3,halfpi);
    z4        = _mm256_and_ps(mask4,pi);

    z         = _mm256_or_ps( _mm256_or_ps(z,z1) , _mm256_or_ps(z3,z4) );

    w         = _mm256_blendv_ps(pi,minuspi,masky_lt);
    w         = _mm256_and_ps(w,maskx_lt);

    w         = _mm256_andnot_ps(maskall,w);

    z         = _mm256_add_ps(z,w);

    return z;
}

/* atan2(y,x), 128-bit wide version */
static __m128
gmx_mm_atan2_ps(__m128 y, __m128 x)
{
    const __m128 pi          = _mm_set1_ps(M_PI);
    const __m128 minuspi     = _mm_set1_ps(-M_PI);
    const __m128 halfpi      = _mm_set1_ps(M_PI/2.0);
    const __m128 minushalfpi = _mm_set1_ps(-M_PI/2.0);

    __m128 z,z1,z3,z4;
    __m128 w;
    __m128 maskx_lt,maskx_eq;
    __m128 masky_lt,masky_eq;
    __m128 mask1,mask2,mask3,mask4,maskall;

    maskx_lt  = _mm_cmplt_ps(x,_mm_setzero_ps());
    masky_lt  = _mm_cmplt_ps(y,_mm_setzero_ps());
    maskx_eq  = _mm_cmpeq_ps(x,_mm_setzero_ps());
    masky_eq  = _mm_cmpeq_ps(y,_mm_setzero_ps());

    z         = _mm_mul_ps(y,gmx_mm_inv_ps(x));
    z         = gmx_mm_atan_ps(z);

    mask1     = _mm_and_ps(maskx_eq,masky_lt);
    mask2     = _mm_andnot_ps(maskx_lt,masky_eq);
    mask3     = _mm_andnot_ps( _mm_or_ps(masky_lt,masky_eq) , maskx_eq);
    mask4     = _mm_and_ps(masky_eq,maskx_lt);

    maskall   = _mm_or_ps( _mm_or_ps(mask1,mask2) , _mm_or_ps(mask3,mask4) );

    z         = _mm_andnot_ps(maskall,z);
    z1        = _mm_and_ps(mask1,minushalfpi);
    z3        = _mm_and_ps(mask3,halfpi);
    z4        = _mm_and_ps(mask4,pi);

    z         = _mm_or_ps( _mm_or_ps(z,z1) , _mm_or_ps(z3,z4) );

    mask1     = _mm_andnot_ps(masky_lt,maskx_lt);
    mask2     = _mm_and_ps(maskx_lt,masky_lt);

    w         = _mm_or_ps( _mm_and_ps(mask1,pi) , _mm_and_ps(mask2,minuspi) );
    w         = _mm_andnot_ps(maskall,w);

    z         = _mm_add_ps(z,w);

    return z;
}

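/* Usage sketch (the helper name below is hypothetical, not part of this header): convert
 * eight 2D points from Cartesian (x,y) to polar (r,theta) coordinates, analogous to the
 * scalar sqrtf(x*x+y*y) and atan2f(y,x). The x==0 and y==0 special cases are handled
 * inside gmx_mm256_atan2_ps().
 */
static gmx_inline void
gmx_mm256_cartesian_to_polar_example(__m256 px, __m256 py, __m256 *r, __m256 *theta)
{
    /* r = sqrt(x^2+y^2), theta = atan2(y,x), eight points at a time */
    *r     = gmx_mm256_sqrt_ps(_mm256_add_ps(_mm256_mul_ps(px,px),_mm256_mul_ps(py,py)));
    *theta = gmx_mm256_atan2_ps(py,px);
}
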
#endif /* _gmx_math_x86_avx_256_single_h_ */