/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 *
 * This file is part of GROMACS.
 * Copyright (c) 2012-
 *
 * Written by the Gromacs development team under coordination of
 * David van der Spoel, Berk Hess, and Erik Lindahl.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org
 *
 * And Hey:
 * Gnomes, ROck Monsters And Chili Sauce
 */
#ifndef _gmx_math_x86_sse4_1_single_h_
#define _gmx_math_x86_sse4_1_single_h_

#include <stdio.h>
#include <math.h>

#include "gmx_x86_sse4_1.h"

#ifndef M_PI
#  define M_PI 3.14159265358979323846264338327950288
#endif
/************************
 *                      *
 * Simple math routines *
 *                      *
 ************************/
/* 1.0/sqrt(x) */
static gmx_inline __m128
gmx_mm_invsqrt_ps(__m128 x)
{
    const __m128 half  = _mm_set_ps(0.5, 0.5, 0.5, 0.5);
    const __m128 three = _mm_set_ps(3.0, 3.0, 3.0, 3.0);

    /* One Newton-Raphson iteration to refine the low-precision (~12-bit) hardware rsqrt estimate */
    __m128 lu = _mm_rsqrt_ps(x);

    return _mm_mul_ps(half, _mm_mul_ps(_mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(lu, lu), x)), lu));
}
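/* Usage sketch (an illustrative addition, not part of the original header): turning four
 * packed r^2 values into 1/r, as a non-bonded kernel would. The array names below are
 * hypothetical and assume this header has been included.
 *
 *     float  r2[4] = { 1.0f, 2.0f, 4.0f, 9.0f };
 *     float  rinv[4];
 *
 *     __m128 x = _mm_loadu_ps(r2);
 *     __m128 y = gmx_mm_invsqrt_ps(x);    gives approximately { 1.0, 0.7071, 0.5, 0.3333 }
 *     _mm_storeu_ps(rinv, y);
 */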
/* sqrt(x) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
static gmx_inline __m128
gmx_mm_sqrt_ps(__m128 x)
{
    __m128 mask;
    __m128 res;

    /* Mask out x==0 so that sqrt(0.0) returns 0.0 rather than a non-finite value */
    mask = _mm_cmpeq_ps(x, _mm_setzero_ps());
    res  = _mm_andnot_ps(mask, gmx_mm_invsqrt_ps(x));

    /* sqrt(x) = x * 1/sqrt(x) */
    res  = _mm_mul_ps(x, res);

    return res;
}
/* 1.0/x */
static gmx_inline __m128
gmx_mm_inv_ps(__m128 x)
{
    const __m128 two = _mm_set_ps(2.0f, 2.0f, 2.0f, 2.0f);

    /* One Newton-Raphson iteration to refine the low-precision (~12-bit) hardware rcp estimate */
    __m128 lu = _mm_rcp_ps(x);

    return _mm_mul_ps(lu, _mm_sub_ps(two, _mm_mul_ps(lu, x)));
}
static gmx_inline __m128
gmx_mm_abs_ps(__m128 x)
{
    const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );

    return _mm_and_ps(x, signmask);
}
static __m128
gmx_mm_log_ps(__m128 x)
{
    /* Same algorithm as cephes library */
    const __m128  expmask    = gmx_mm_castsi128_ps( _mm_set_epi32(0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000) );
    const __m128i expbase_m1 = _mm_set1_epi32(127-1); /* We want non-IEEE format */
    const __m128  half       = _mm_set1_ps(0.5f);
    const __m128  one        = _mm_set1_ps(1.0f);
    const __m128  invsq2     = _mm_set1_ps(1.0f/sqrt(2.0f));
    const __m128  corr1      = _mm_set1_ps(-2.12194440e-4f);
    const __m128  corr2      = _mm_set1_ps(0.693359375f);

    const __m128  CA_1       = _mm_set1_ps(0.070376836292f);
    const __m128  CB_0       = _mm_set1_ps(1.6714950086782716f);
    const __m128  CB_1       = _mm_set1_ps(-2.452088066061482f);
    const __m128  CC_0       = _mm_set1_ps(1.5220770854701728f);
    const __m128  CC_1       = _mm_set1_ps(-1.3422238433233642f);
    const __m128  CD_0       = _mm_set1_ps(1.386218787509749f);
    const __m128  CD_1       = _mm_set1_ps(0.35075468953796346f);
    const __m128  CE_0       = _mm_set1_ps(1.3429983063133937f);
    const __m128  CE_1       = _mm_set1_ps(1.807420826584643f);

    __m128  fexp, fexp1;
    __m128i iexp;
    __m128  mask;
    __m128  x1, x2;
    __m128  y;
    __m128  pA, pB, pC, pD, pE, tB, tC, tD, tE;

    /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
    fexp = _mm_and_ps(x, expmask);
    iexp = gmx_mm_castps_si128(fexp);
    iexp = _mm_srli_epi32(iexp, 23);
    iexp = _mm_sub_epi32(iexp, expbase_m1);

    x    = _mm_andnot_ps(expmask, x);
    x    = _mm_or_ps(x, one);
    x    = _mm_mul_ps(x, half);

    mask = _mm_cmplt_ps(x, invsq2);

    x    = _mm_add_ps(x, _mm_and_ps(mask, x));
    x    = _mm_sub_ps(x, one);
    iexp = _mm_add_epi32(iexp, gmx_mm_castps_si128(mask)); /* 0xFFFFFFFF = -1 as int */

    x2   = _mm_mul_ps(x, x);

    pA   = _mm_mul_ps(CA_1, x);
    pB   = _mm_mul_ps(CB_1, x);
    pC   = _mm_mul_ps(CC_1, x);
    pD   = _mm_mul_ps(CD_1, x);
    pE   = _mm_mul_ps(CE_1, x);
    tB   = _mm_add_ps(CB_0, x2);
    tC   = _mm_add_ps(CC_0, x2);
    tD   = _mm_add_ps(CD_0, x2);
    tE   = _mm_add_ps(CE_0, x2);
    pB   = _mm_add_ps(pB, tB);
    pC   = _mm_add_ps(pC, tC);
    pD   = _mm_add_ps(pD, tD);
    pE   = _mm_add_ps(pE, tE);

    pA   = _mm_mul_ps(pA, pB);
    pC   = _mm_mul_ps(pC, pD);
    pE   = _mm_mul_ps(pE, x2);
    pA   = _mm_mul_ps(pA, pC);
    y    = _mm_mul_ps(pA, pE);

    fexp = _mm_cvtepi32_ps(iexp);
    y    = _mm_add_ps(y, _mm_mul_ps(fexp, corr1));

    y    = _mm_sub_ps(y, _mm_mul_ps(half, x2));
    x2   = _mm_add_ps(x, y);

    x2   = _mm_add_ps(x2, _mm_mul_ps(fexp, corr2));

    return x2;
}
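/* Usage sketch (an illustrative addition, not part of the original header): the SIMD
 * natural log for four positive arguments, with rough reference values for comparison
 * against logf(). Array names are hypothetical; like logf(), the routine expects
 * strictly positive input.
 *
 *     float in[4] = { 0.5f, 1.0f, 2.718282f, 10.0f };
 *     float out[4];
 *
 *     _mm_storeu_ps(out, gmx_mm_log_ps(_mm_loadu_ps(in)));
 *     out is approximately { -0.6931, 0.0, 1.0, 2.3026 }
 */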
/* 2^x function.
 *
 * The 2^w term is calculated from a (6,0)-th order (no denominator) minimax polynomial on the interval
 * [-0.5,0.5]. The coefficients were derived in Mathematica using the command:
 *
 * MiniMaxApproximation[(2^x), {x, {-0.5, 0.5}, 6, 0}, WorkingPrecision -> 15]
 *
 * The largest-magnitude exponent we can represent in IEEE single-precision binary format
 * is 2^-126 for small numbers and 2^127 for large ones. To avoid wrap-around problems, we set the
 * result to zero if the argument falls outside this range. For small numbers this is just fine, but
 * for large numbers you could be fancy and return the smallest/largest IEEE single-precision
 * number instead. That would take a few extra cycles and not really help, since something is
 * wrong if you are using single precision to work with numbers that cannot really be represented
 * in single precision.
 *
 * The accuracy is at least 23 bits.
 */
static __m128
gmx_mm_exp2_ps(__m128 x)
{
    /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
    const __m128  arglimit = _mm_set1_ps(126.0f);

    const __m128i expbase  = _mm_set1_epi32(127);
    const __m128  CA6      = _mm_set1_ps(1.535336188319500E-004);
    const __m128  CA5      = _mm_set1_ps(1.339887440266574E-003);
    const __m128  CA4      = _mm_set1_ps(9.618437357674640E-003);
    const __m128  CA3      = _mm_set1_ps(5.550332471162809E-002);
    const __m128  CA2      = _mm_set1_ps(2.402264791363012E-001);
    const __m128  CA1      = _mm_set1_ps(6.931472028550421E-001);
    const __m128  CA0      = _mm_set1_ps(1.0f);

    __m128  valuemask;
    __m128i iexppart;
    __m128  fexppart;
    __m128  intpart;
    __m128  x2;
    __m128  p0, p1;

    iexppart  = _mm_cvtps_epi32(x);
    intpart   = _mm_round_ps(x, _MM_FROUND_TO_NEAREST_INT);
    iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart, expbase), 23);
    valuemask = _mm_cmpge_ps(arglimit, gmx_mm_abs_ps(x));
    fexppart  = _mm_and_ps(valuemask, gmx_mm_castsi128_ps(iexppart));

    x  = _mm_sub_ps(x, intpart);
    x2 = _mm_mul_ps(x, x);

    p0 = _mm_mul_ps(CA6, x2);
    p1 = _mm_mul_ps(CA5, x2);
    p0 = _mm_add_ps(p0, CA4);
    p1 = _mm_add_ps(p1, CA3);
    p0 = _mm_mul_ps(p0, x2);
    p1 = _mm_mul_ps(p1, x2);
    p0 = _mm_add_ps(p0, CA2);
    p1 = _mm_add_ps(p1, CA1);
    p0 = _mm_mul_ps(p0, x2);
    p1 = _mm_mul_ps(p1, x);
    p0 = _mm_add_ps(p0, CA0);
    p0 = _mm_add_ps(p0, p1);
    x  = _mm_mul_ps(p0, fexppart);

    return x;
}
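/* Usage sketch (an illustrative addition, not part of the original header): 2^x for four
 * packed arguments. As described above, arguments with |x| > 126 return 0.0 by design.
 * Array names are hypothetical.
 *
 *     float in[4] = { -1.0f, 0.0f, 0.5f, 10.0f };
 *     float out[4];
 *
 *     _mm_storeu_ps(out, gmx_mm_exp2_ps(_mm_loadu_ps(in)));
 *     out is approximately { 0.5, 1.0, 1.4142, 1024.0 }
 */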
/* Exponential function. This could be calculated from 2^x as Exp(x)=2^(y), where y=log2(e)*x,
 * but there would then be a small rounding error since we lose some precision due to the
 * multiplication. This error would then be magnified a lot by the exponential.
 *
 * Instead, we calculate the fractional part directly as a minimax approximation of
 * Exp(z) on [-0.5,0.5]. We use extended precision arithmetic to calculate the fraction
 * remaining after 2^y, which avoids the precision loss.
 * The final result is correct to within 1 LSB over the entire argument range.
 */
static __m128
gmx_mm_exp_ps(__m128 x)
{
    const __m128  argscale = _mm_set1_ps(1.44269504088896341f);
    /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
    const __m128  arglimit = _mm_set1_ps(126.0f);
    const __m128i expbase  = _mm_set1_epi32(127);

    const __m128  invargscale0 = _mm_set1_ps(0.693359375f);
    const __m128  invargscale1 = _mm_set1_ps(-2.12194440e-4f);

    const __m128  CC5 = _mm_set1_ps(1.9875691500e-4f);
    const __m128  CC4 = _mm_set1_ps(1.3981999507e-3f);
    const __m128  CC3 = _mm_set1_ps(8.3334519073e-3f);
    const __m128  CC2 = _mm_set1_ps(4.1665795894e-2f);
    const __m128  CC1 = _mm_set1_ps(1.6666665459e-1f);
    const __m128  CC0 = _mm_set1_ps(5.0000001201e-1f);
    const __m128  one = _mm_set1_ps(1.0f);

    __m128  y, x2;
    __m128  p0, p1;
    __m128  valuemask;
    __m128i iexppart;
    __m128  fexppart;
    __m128  intpart;

    y = _mm_mul_ps(x, argscale);

    iexppart  = _mm_cvtps_epi32(y);
    intpart   = _mm_round_ps(y, _MM_FROUND_TO_NEAREST_INT);

    iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart, expbase), 23);
    valuemask = _mm_cmpge_ps(arglimit, gmx_mm_abs_ps(y));
    fexppart  = _mm_and_ps(valuemask, gmx_mm_castsi128_ps(iexppart));

    /* Extended precision arithmetic */
    x = _mm_sub_ps(x, _mm_mul_ps(invargscale0, intpart));
    x = _mm_sub_ps(x, _mm_mul_ps(invargscale1, intpart));

    x2 = _mm_mul_ps(x, x);

    p1 = _mm_mul_ps(CC5, x2);
    p0 = _mm_mul_ps(CC4, x2);
    p1 = _mm_add_ps(p1, CC3);
    p0 = _mm_add_ps(p0, CC2);
    p1 = _mm_mul_ps(p1, x2);
    p0 = _mm_mul_ps(p0, x2);
    p1 = _mm_add_ps(p1, CC1);
    p0 = _mm_add_ps(p0, CC0);
    p1 = _mm_mul_ps(p1, x);
    p0 = _mm_add_ps(p0, p1);
    p0 = _mm_mul_ps(p0, x2);
    x  = _mm_add_ps(x, one);
    x  = _mm_add_ps(x, p0);

    x  = _mm_mul_ps(x, fexppart);

    return x;
}
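/* Usage sketch (an illustrative addition, not part of the original header): a
 * Boltzmann-type factor exp(-dE/kT) evaluated for four energies at once. All names and
 * the kT value below are hypothetical and only serve to show the call pattern.
 *
 *     const float kT    = 2.494f;                  roughly kJ/mol at 300 K
 *     float       dE[4] = { 0.0f, 1.0f, 5.0f, 20.0f };
 *     float       w[4];
 *
 *     __m128 arg = _mm_mul_ps(_mm_loadu_ps(dE), _mm_set1_ps(-1.0f/kT));
 *     _mm_storeu_ps(w, gmx_mm_exp_ps(arg));
 *     w is approximately { 1.0, 0.67, 0.13, 0.00033 }
 */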
/* FULL precision. Only errors in LSB */
static __m128
gmx_mm_erf_ps(__m128 x)
{
    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
    const __m128 CA6 = _mm_set1_ps(7.853861353153693e-5f);
    const __m128 CA5 = _mm_set1_ps(-8.010193625184903e-4f);
    const __m128 CA4 = _mm_set1_ps(5.188327685732524e-3f);
    const __m128 CA3 = _mm_set1_ps(-2.685381193529856e-2f);
    const __m128 CA2 = _mm_set1_ps(1.128358514861418e-1f);
    const __m128 CA1 = _mm_set1_ps(-3.761262582423300e-1f);
    const __m128 CA0 = _mm_set1_ps(1.128379165726710f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
    const __m128 CB9 = _mm_set1_ps(-0.0018629930017603923f);
    const __m128 CB8 = _mm_set1_ps(0.003909821287598495f);
    const __m128 CB7 = _mm_set1_ps(-0.0052094582210355615f);
    const __m128 CB6 = _mm_set1_ps(0.005685614362160572f);
    const __m128 CB5 = _mm_set1_ps(-0.0025367682853477272f);
    const __m128 CB4 = _mm_set1_ps(-0.010199799682318782f);
    const __m128 CB3 = _mm_set1_ps(0.04369575504816542f);
    const __m128 CB2 = _mm_set1_ps(-0.11884063474674492f);
    const __m128 CB1 = _mm_set1_ps(0.2732120154030589f);
    const __m128 CB0 = _mm_set1_ps(0.42758357702025784f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
    const __m128 CC10 = _mm_set1_ps(-0.0445555913112064f);
    const __m128 CC9  = _mm_set1_ps(0.21376355144663348f);
    const __m128 CC8  = _mm_set1_ps(-0.3473187200259257f);
    const __m128 CC7  = _mm_set1_ps(0.016690861551248114f);
    const __m128 CC6  = _mm_set1_ps(0.7560973182491192f);
    const __m128 CC5  = _mm_set1_ps(-1.2137903600145787f);
    const __m128 CC4  = _mm_set1_ps(0.8411872321232948f);
    const __m128 CC3  = _mm_set1_ps(-0.08670413896296343f);
    const __m128 CC2  = _mm_set1_ps(-0.27124782687240334f);
    const __m128 CC1  = _mm_set1_ps(-0.0007502488047806069f);
    const __m128 CC0  = _mm_set1_ps(0.5642114853803148f);

    /* Coefficients for expansion of exp(x) in [0,0.1] */
    /* CD0 and CD1 are both 1.0, so no need to declare them separately */
    const __m128 CD2 = _mm_set1_ps(0.5000066608081202f);
    const __m128 CD3 = _mm_set1_ps(0.1664795422874624f);
    const __m128 CD4 = _mm_set1_ps(0.04379839977652482f);

    const __m128 sieve   = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
    const __m128 signbit = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
    const __m128 one     = _mm_set1_ps(1.0f);
    const __m128 two     = _mm_set1_ps(2.0f);

    __m128 x2, x4, y;
    __m128 z, q, t, t2, w, w2;
    __m128 pA0, pA1, pB0, pB1, pC0, pC1;
    __m128 expmx2, corr;
    __m128 res_erf, res_erfc, res;
    __m128 mask;

    /* Calculate erf() */
    x2 = _mm_mul_ps(x, x);
    x4 = _mm_mul_ps(x2, x2);

    pA0 = _mm_mul_ps(CA6, x4);
    pA1 = _mm_mul_ps(CA5, x4);
    pA0 = _mm_add_ps(pA0, CA4);
    pA1 = _mm_add_ps(pA1, CA3);
    pA0 = _mm_mul_ps(pA0, x4);
    pA1 = _mm_mul_ps(pA1, x4);
    pA0 = _mm_add_ps(pA0, CA2);
    pA1 = _mm_add_ps(pA1, CA1);
    pA0 = _mm_mul_ps(pA0, x4);
    pA1 = _mm_mul_ps(pA1, x2);
    pA0 = _mm_add_ps(pA0, pA1);
    pA0 = _mm_add_ps(pA0, CA0);

    res_erf = _mm_mul_ps(x, pA0);

    /* Calculate erfc */
    y  = gmx_mm_abs_ps(x);
    t  = gmx_mm_inv_ps(y);
    w  = _mm_sub_ps(t, one);
    t2 = _mm_mul_ps(t, t);
    w2 = _mm_mul_ps(w, w);

    /*
     * We cannot simply calculate exp(-x2) directly in single precision, since
     * that will lose a couple of bits of precision due to the multiplication.
     * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
     * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
     *
     * The only drawback with this is that it requires TWO separate exponential
     * evaluations, which would be horrible performance-wise. However, the argument
     * for the second exp() call is always small, so there we simply use a
     * low-order minimax expansion on [0,0.1].
     */

    z = _mm_and_ps(y, sieve);
    q = _mm_mul_ps( _mm_sub_ps(z, y), _mm_add_ps(z, y) );

    corr = _mm_mul_ps(CD4, q);
    corr = _mm_add_ps(corr, CD3);
    corr = _mm_mul_ps(corr, q);
    corr = _mm_add_ps(corr, CD2);
    corr = _mm_mul_ps(corr, q);
    corr = _mm_add_ps(corr, one);
    corr = _mm_mul_ps(corr, q);
    corr = _mm_add_ps(corr, one);

    expmx2 = gmx_mm_exp_ps( _mm_or_ps(signbit, _mm_mul_ps(z, z)) );
    expmx2 = _mm_mul_ps(expmx2, corr);

    pB1 = _mm_mul_ps(CB9, w2);
    pB0 = _mm_mul_ps(CB8, w2);
    pB1 = _mm_add_ps(pB1, CB7);
    pB0 = _mm_add_ps(pB0, CB6);
    pB1 = _mm_mul_ps(pB1, w2);
    pB0 = _mm_mul_ps(pB0, w2);
    pB1 = _mm_add_ps(pB1, CB5);
    pB0 = _mm_add_ps(pB0, CB4);
    pB1 = _mm_mul_ps(pB1, w2);
    pB0 = _mm_mul_ps(pB0, w2);
    pB1 = _mm_add_ps(pB1, CB3);
    pB0 = _mm_add_ps(pB0, CB2);
    pB1 = _mm_mul_ps(pB1, w2);
    pB0 = _mm_mul_ps(pB0, w2);
    pB1 = _mm_add_ps(pB1, CB1);
    pB1 = _mm_mul_ps(pB1, w);
    pB0 = _mm_add_ps(pB0, pB1);
    pB0 = _mm_add_ps(pB0, CB0);

    pC0 = _mm_mul_ps(CC10, t2);
    pC1 = _mm_mul_ps(CC9, t2);
    pC0 = _mm_add_ps(pC0, CC8);
    pC1 = _mm_add_ps(pC1, CC7);
    pC0 = _mm_mul_ps(pC0, t2);
    pC1 = _mm_mul_ps(pC1, t2);
    pC0 = _mm_add_ps(pC0, CC6);
    pC1 = _mm_add_ps(pC1, CC5);
    pC0 = _mm_mul_ps(pC0, t2);
    pC1 = _mm_mul_ps(pC1, t2);
    pC0 = _mm_add_ps(pC0, CC4);
    pC1 = _mm_add_ps(pC1, CC3);
    pC0 = _mm_mul_ps(pC0, t2);
    pC1 = _mm_mul_ps(pC1, t2);
    pC0 = _mm_add_ps(pC0, CC2);
    pC1 = _mm_add_ps(pC1, CC1);
    pC0 = _mm_mul_ps(pC0, t2);
    pC1 = _mm_mul_ps(pC1, t);
    pC0 = _mm_add_ps(pC0, pC1);
    pC0 = _mm_add_ps(pC0, CC0);
    pC0 = _mm_mul_ps(pC0, t);

    /* SELECT pB0 or pC0 for erfc() */
    mask     = _mm_cmplt_ps(two, y);
    res_erfc = _mm_blendv_ps(pB0, pC0, mask);
    res_erfc = _mm_mul_ps(res_erfc, expmx2);

    /* erfc(x<0) = 2-erfc(|x|) */
    mask     = _mm_cmplt_ps(x, _mm_setzero_ps());
    res_erfc = _mm_blendv_ps(res_erfc, _mm_sub_ps(two, res_erfc), mask);

    /* Select erf() or erfc() */
    mask = _mm_cmplt_ps(y, _mm_set1_ps(0.75f));
    res  = _mm_blendv_ps(_mm_sub_ps(one, res_erfc), res_erf, mask);

    return res;
}
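/* Usage sketch (an illustrative addition, not part of the original header): erf() for four
 * packed arguments, with rough reference values. Array names are hypothetical.
 *
 *     float in[4] = { -1.0f, 0.0f, 0.5f, 2.0f };
 *     float out[4];
 *
 *     _mm_storeu_ps(out, gmx_mm_erf_ps(_mm_loadu_ps(in)));
 *     out is approximately { -0.8427, 0.0, 0.5205, 0.9953 }
 */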
/* FULL precision. Only errors in LSB */
static __m128
gmx_mm_erfc_ps(__m128 x)
{
    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
    const __m128 CA6 = _mm_set1_ps(7.853861353153693e-5f);
    const __m128 CA5 = _mm_set1_ps(-8.010193625184903e-4f);
    const __m128 CA4 = _mm_set1_ps(5.188327685732524e-3f);
    const __m128 CA3 = _mm_set1_ps(-2.685381193529856e-2f);
    const __m128 CA2 = _mm_set1_ps(1.128358514861418e-1f);
    const __m128 CA1 = _mm_set1_ps(-3.761262582423300e-1f);
    const __m128 CA0 = _mm_set1_ps(1.128379165726710f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
    const __m128 CB9 = _mm_set1_ps(-0.0018629930017603923f);
    const __m128 CB8 = _mm_set1_ps(0.003909821287598495f);
    const __m128 CB7 = _mm_set1_ps(-0.0052094582210355615f);
    const __m128 CB6 = _mm_set1_ps(0.005685614362160572f);
    const __m128 CB5 = _mm_set1_ps(-0.0025367682853477272f);
    const __m128 CB4 = _mm_set1_ps(-0.010199799682318782f);
    const __m128 CB3 = _mm_set1_ps(0.04369575504816542f);
    const __m128 CB2 = _mm_set1_ps(-0.11884063474674492f);
    const __m128 CB1 = _mm_set1_ps(0.2732120154030589f);
    const __m128 CB0 = _mm_set1_ps(0.42758357702025784f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
    const __m128 CC10 = _mm_set1_ps(-0.0445555913112064f);
    const __m128 CC9  = _mm_set1_ps(0.21376355144663348f);
    const __m128 CC8  = _mm_set1_ps(-0.3473187200259257f);
    const __m128 CC7  = _mm_set1_ps(0.016690861551248114f);
    const __m128 CC6  = _mm_set1_ps(0.7560973182491192f);
    const __m128 CC5  = _mm_set1_ps(-1.2137903600145787f);
    const __m128 CC4  = _mm_set1_ps(0.8411872321232948f);
    const __m128 CC3  = _mm_set1_ps(-0.08670413896296343f);
    const __m128 CC2  = _mm_set1_ps(-0.27124782687240334f);
    const __m128 CC1  = _mm_set1_ps(-0.0007502488047806069f);
    const __m128 CC0  = _mm_set1_ps(0.5642114853803148f);

    /* Coefficients for expansion of exp(x) in [0,0.1] */
    /* CD0 and CD1 are both 1.0, so no need to declare them separately */
    const __m128 CD2 = _mm_set1_ps(0.5000066608081202f);
    const __m128 CD3 = _mm_set1_ps(0.1664795422874624f);
    const __m128 CD4 = _mm_set1_ps(0.04379839977652482f);

    const __m128 sieve   = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
    const __m128 signbit = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
    const __m128 one     = _mm_set1_ps(1.0f);
    const __m128 two     = _mm_set1_ps(2.0f);

    __m128 x2, x4, y;
    __m128 z, q, t, t2, w, w2;
    __m128 pA0, pA1, pB0, pB1, pC0, pC1;
    __m128 expmx2, corr;
    __m128 res_erf, res_erfc, res;
    __m128 mask;

    /* Calculate erf() */
    x2 = _mm_mul_ps(x, x);
    x4 = _mm_mul_ps(x2, x2);

    pA0 = _mm_mul_ps(CA6, x4);
    pA1 = _mm_mul_ps(CA5, x4);
    pA0 = _mm_add_ps(pA0, CA4);
    pA1 = _mm_add_ps(pA1, CA3);
    pA0 = _mm_mul_ps(pA0, x4);
    pA1 = _mm_mul_ps(pA1, x4);
    pA0 = _mm_add_ps(pA0, CA2);
    pA1 = _mm_add_ps(pA1, CA1);
    pA0 = _mm_mul_ps(pA0, x4);
    pA1 = _mm_mul_ps(pA1, x2);
    pA0 = _mm_add_ps(pA0, pA1);
    pA0 = _mm_add_ps(pA0, CA0);

    res_erf = _mm_mul_ps(x, pA0);

    /* Calculate erfc */
    y  = gmx_mm_abs_ps(x);
    t  = gmx_mm_inv_ps(y);
    w  = _mm_sub_ps(t, one);
    t2 = _mm_mul_ps(t, t);
    w2 = _mm_mul_ps(w, w);

    /*
     * We cannot simply calculate exp(-x2) directly in single precision, since
     * that will lose a couple of bits of precision due to the multiplication.
     * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
     * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
     *
     * The only drawback with this is that it requires TWO separate exponential
     * evaluations, which would be horrible performance-wise. However, the argument
     * for the second exp() call is always small, so there we simply use a
     * low-order minimax expansion on [0,0.1].
     */

    z = _mm_and_ps(y, sieve);
    q = _mm_mul_ps( _mm_sub_ps(z, y), _mm_add_ps(z, y) );

    corr = _mm_mul_ps(CD4, q);
    corr = _mm_add_ps(corr, CD3);
    corr = _mm_mul_ps(corr, q);
    corr = _mm_add_ps(corr, CD2);
    corr = _mm_mul_ps(corr, q);
    corr = _mm_add_ps(corr, one);
    corr = _mm_mul_ps(corr, q);
    corr = _mm_add_ps(corr, one);

    expmx2 = gmx_mm_exp_ps( _mm_or_ps(signbit, _mm_mul_ps(z, z)) );
    expmx2 = _mm_mul_ps(expmx2, corr);

    pB1 = _mm_mul_ps(CB9, w2);
    pB0 = _mm_mul_ps(CB8, w2);
    pB1 = _mm_add_ps(pB1, CB7);
    pB0 = _mm_add_ps(pB0, CB6);
    pB1 = _mm_mul_ps(pB1, w2);
    pB0 = _mm_mul_ps(pB0, w2);
    pB1 = _mm_add_ps(pB1, CB5);
    pB0 = _mm_add_ps(pB0, CB4);
    pB1 = _mm_mul_ps(pB1, w2);
    pB0 = _mm_mul_ps(pB0, w2);
    pB1 = _mm_add_ps(pB1, CB3);
    pB0 = _mm_add_ps(pB0, CB2);
    pB1 = _mm_mul_ps(pB1, w2);
    pB0 = _mm_mul_ps(pB0, w2);
    pB1 = _mm_add_ps(pB1, CB1);
    pB1 = _mm_mul_ps(pB1, w);
    pB0 = _mm_add_ps(pB0, pB1);
    pB0 = _mm_add_ps(pB0, CB0);

    pC0 = _mm_mul_ps(CC10, t2);
    pC1 = _mm_mul_ps(CC9, t2);
    pC0 = _mm_add_ps(pC0, CC8);
    pC1 = _mm_add_ps(pC1, CC7);
    pC0 = _mm_mul_ps(pC0, t2);
    pC1 = _mm_mul_ps(pC1, t2);
    pC0 = _mm_add_ps(pC0, CC6);
    pC1 = _mm_add_ps(pC1, CC5);
    pC0 = _mm_mul_ps(pC0, t2);
    pC1 = _mm_mul_ps(pC1, t2);
    pC0 = _mm_add_ps(pC0, CC4);
    pC1 = _mm_add_ps(pC1, CC3);
    pC0 = _mm_mul_ps(pC0, t2);
    pC1 = _mm_mul_ps(pC1, t2);
    pC0 = _mm_add_ps(pC0, CC2);
    pC1 = _mm_add_ps(pC1, CC1);
    pC0 = _mm_mul_ps(pC0, t2);
    pC1 = _mm_mul_ps(pC1, t);
    pC0 = _mm_add_ps(pC0, pC1);
    pC0 = _mm_add_ps(pC0, CC0);
    pC0 = _mm_mul_ps(pC0, t);

    /* SELECT pB0 or pC0 for erfc() */
    mask     = _mm_cmplt_ps(two, y);
    res_erfc = _mm_blendv_ps(pB0, pC0, mask);
    res_erfc = _mm_mul_ps(res_erfc, expmx2);

    /* erfc(x<0) = 2-erfc(|x|) */
    mask     = _mm_cmplt_ps(x, _mm_setzero_ps());
    res_erfc = _mm_blendv_ps(res_erfc, _mm_sub_ps(two, res_erfc), mask);

    /* Select erf() or erfc() */
    mask = _mm_cmplt_ps(y, _mm_set1_ps(0.75f));
    res  = _mm_blendv_ps(res_erfc, _mm_sub_ps(one, res_erf), mask);

    return res;
}
/* Calculate the force correction due to PME analytically.
 *
 * This routine is meant to enable analytical evaluation of the
 * direct-space PME electrostatic force to avoid tables.
 *
 * The direct-space potential should be Erfc(beta*r)/r, but there
 * are some problems evaluating that:
 *
 * First, the error function is difficult (read: expensive) to
 * approximate accurately for intermediate to large arguments, and
 * this happens already in ranges of beta*r that occur in simulations.
 * Second, we now try to avoid calculating potentials in Gromacs but
 * use forces directly.
 *
 * We can simplify things slightly by noting that the PME part is really
 * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
 *
 *     V = 1/r - Erf(beta*r)/r
 *
 * The first term we already have from the inverse square root, so
 * we can leave it out of this routine.
 *
 * For PME tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
 * the argument beta*r will be in the range 0.15 to ~4. Use your
 * favorite plotting program to realize how well-behaved Erf(z)/z is
 * in this range!
 *
 * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
 * However, it turns out it is more efficient to approximate f(z)/z and
 * then only use even powers. This is another minor optimization, since
 * we actually WANT f(z)/z, because it is going to be multiplied by
 * the vector between the two atoms to get the vectorial force. The
 * fastest flops are the ones we can avoid calculating!
 *
 * So, here's how it should be used (a compact code sketch follows after
 * this routine):
 *
 * 1. Calculate r^2.
 * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
 * 3. Evaluate this routine with z^2 as the argument.
 * 4. The return value is the expression:
 *
 *        2*exp(-z^2)     erf(z)
 *        ------------ - --------
 *        sqrt(Pi)*z^2      z^3
 *
 * 5. Multiply the entire expression by beta^3. This will get you
 *
 *        beta^3*2*exp(-z^2)     beta^3*erf(z)
 *        ------------------  -  -------------
 *           sqrt(Pi)*z^2             z^3
 *
 *    or, switching back to r (z=r*beta):
 *
 *        2*beta*exp(-r^2*beta^2)     erf(r*beta)
 *        ----------------------- - -------------
 *             sqrt(Pi)*r^2               r^3
 *
 *    With a bit of math you can confirm that this is exactly
 *    D[Erf[beta*r]/r,r] divided by r once more.
 *
 * 6. Add the result to 1/r^3, multiply by the product of the charges,
 *    and you have your force (divided by r). A final multiplication
 *    with the vector connecting the two particles and you have your
 *    vectorial force to add to the particles.
 */
static gmx_inline __m128
gmx_mm_pmecorrF_ps(__m128 z2)
{
    const __m128 FN6 = _mm_set1_ps(-1.7357322914161492954e-8f);
    const __m128 FN5 = _mm_set1_ps(1.4703624142580877519e-6f);
    const __m128 FN4 = _mm_set1_ps(-0.000053401640219807709149f);
    const __m128 FN3 = _mm_set1_ps(0.0010054721316683106153f);
    const __m128 FN2 = _mm_set1_ps(-0.019278317264888380590f);
    const __m128 FN1 = _mm_set1_ps(0.069670166153766424023f);
    const __m128 FN0 = _mm_set1_ps(-0.75225204789749321333f);

    const __m128 FD4 = _mm_set1_ps(0.0011193462567257629232f);
    const __m128 FD3 = _mm_set1_ps(0.014866955030185295499f);
    const __m128 FD2 = _mm_set1_ps(0.11583842382862377919f);
    const __m128 FD1 = _mm_set1_ps(0.50736591960530292870f);
    const __m128 FD0 = _mm_set1_ps(1.0f);

    __m128 z4;
    __m128 polyFN0, polyFN1, polyFD0, polyFD1;

    z4 = _mm_mul_ps(z2, z2);

    polyFD0 = _mm_mul_ps(FD4, z4);
    polyFD1 = _mm_mul_ps(FD3, z4);
    polyFD0 = _mm_add_ps(polyFD0, FD2);
    polyFD1 = _mm_add_ps(polyFD1, FD1);
    polyFD0 = _mm_mul_ps(polyFD0, z4);
    polyFD1 = _mm_mul_ps(polyFD1, z2);
    polyFD0 = _mm_add_ps(polyFD0, FD0);
    polyFD0 = _mm_add_ps(polyFD0, polyFD1);

    polyFD0 = gmx_mm_inv_ps(polyFD0);

    polyFN0 = _mm_mul_ps(FN6, z4);
    polyFN1 = _mm_mul_ps(FN5, z4);
    polyFN0 = _mm_add_ps(polyFN0, FN4);
    polyFN1 = _mm_add_ps(polyFN1, FN3);
    polyFN0 = _mm_mul_ps(polyFN0, z4);
    polyFN1 = _mm_mul_ps(polyFN1, z4);
    polyFN0 = _mm_add_ps(polyFN0, FN2);
    polyFN1 = _mm_add_ps(polyFN1, FN1);
    polyFN0 = _mm_mul_ps(polyFN0, z4);
    polyFN1 = _mm_mul_ps(polyFN1, z2);
    polyFN0 = _mm_add_ps(polyFN0, FN0);
    polyFN0 = _mm_add_ps(polyFN0, polyFN1);

    return _mm_mul_ps(polyFN0, polyFD0);
}
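/* Usage sketch (an illustrative addition, not part of the original header): the six-step
 * recipe above written out for one SIMD width of particle pairs. All variable names
 * (rsq, rinv, beta, qq, dx/dy/dz, fscal, fx/fy/fz) are hypothetical; rinv would normally
 * come from gmx_mm_invsqrt_ps(rsq).
 *
 *     __m128 beta2   = _mm_mul_ps(beta, beta);
 *     __m128 z2      = _mm_mul_ps(beta2, rsq);              steps 1-2: z^2 = beta^2*r^2
 *     __m128 pmecorr = gmx_mm_pmecorrF_ps(z2);              steps 3-4
 *     __m128 beta3   = _mm_mul_ps(beta2, beta);
 *     pmecorr        = _mm_mul_ps(pmecorr, beta3);          step 5
 *
 *     __m128 rinv3   = _mm_mul_ps(rinv, _mm_mul_ps(rinv, rinv));
 *     __m128 fscal   = _mm_mul_ps(qq, _mm_add_ps(rinv3, pmecorr));   step 6: force/r
 *
 *     fx = _mm_mul_ps(fscal, dx);
 *     fy = _mm_mul_ps(fscal, dy);
 *     fz = _mm_mul_ps(fscal, dz);
 */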
/* Calculate the potential correction due to PME analytically.
 *
 * See gmx_mm_pmecorrF_ps() for details about the approximation.
 *
 * This routine calculates Erf(z)/z, although you should provide z^2
 * as the input argument.
 *
 * Here's how it should be used (a compact code sketch follows after
 * this routine):
 *
 * 1. Calculate r^2.
 * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
 * 3. Evaluate this routine with z^2 as the argument.
 * 4. The return value is the expression:
 *
 *        erf(z)
 *        ------
 *           z
 *
 * 5. Multiply the entire expression by beta and switch back to r (z=r*beta):
 *
 *        erf(r*beta)
 *        -----------
 *             r
 *
 * 6. Subtract the result from 1/r, multiply by the product of the charges,
 *    and you have your potential (recall from the force-correction comment
 *    above that V = 1/r - Erf(beta*r)/r).
 */
static gmx_inline __m128
gmx_mm_pmecorrV_ps(__m128 z2)
{
    const __m128 VN6 = _mm_set1_ps(1.9296833005951166339e-8f);
    const __m128 VN5 = _mm_set1_ps(-1.4213390571557850962e-6f);
    const __m128 VN4 = _mm_set1_ps(0.000041603292906656984871f);
    const __m128 VN3 = _mm_set1_ps(-0.00013134036773265025626f);
    const __m128 VN2 = _mm_set1_ps(0.038657983986041781264f);
    const __m128 VN1 = _mm_set1_ps(0.11285044772717598220f);
    const __m128 VN0 = _mm_set1_ps(1.1283802385263030286f);

    const __m128 VD3 = _mm_set1_ps(0.0066752224023576045451f);
    const __m128 VD2 = _mm_set1_ps(0.078647795836373922256f);
    const __m128 VD1 = _mm_set1_ps(0.43336185284710920150f);
    const __m128 VD0 = _mm_set1_ps(1.0f);

    __m128 z4;
    __m128 polyVN0, polyVN1, polyVD0, polyVD1;

    z4 = _mm_mul_ps(z2, z2);

    polyVD1 = _mm_mul_ps(VD3, z4);
    polyVD0 = _mm_mul_ps(VD2, z4);
    polyVD1 = _mm_add_ps(polyVD1, VD1);
    polyVD0 = _mm_add_ps(polyVD0, VD0);
    polyVD1 = _mm_mul_ps(polyVD1, z2);
    polyVD0 = _mm_add_ps(polyVD0, polyVD1);

    polyVD0 = gmx_mm_inv_ps(polyVD0);

    polyVN0 = _mm_mul_ps(VN6, z4);
    polyVN1 = _mm_mul_ps(VN5, z4);
    polyVN0 = _mm_add_ps(polyVN0, VN4);
    polyVN1 = _mm_add_ps(polyVN1, VN3);
    polyVN0 = _mm_mul_ps(polyVN0, z4);
    polyVN1 = _mm_mul_ps(polyVN1, z4);
    polyVN0 = _mm_add_ps(polyVN0, VN2);
    polyVN1 = _mm_add_ps(polyVN1, VN1);
    polyVN0 = _mm_mul_ps(polyVN0, z4);
    polyVN1 = _mm_mul_ps(polyVN1, z2);
    polyVN0 = _mm_add_ps(polyVN0, VN0);
    polyVN0 = _mm_add_ps(polyVN0, polyVN1);

    return _mm_mul_ps(polyVN0, polyVD0);
}
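/* Usage sketch (an illustrative addition, not part of the original header): the potential
 * analogue of the force recipe above. Variable names (rsq, rinv, beta, qq, vcoul) are
 * hypothetical.
 *
 *     __m128 z2    = _mm_mul_ps(_mm_mul_ps(beta, beta), rsq);
 *     __m128 corrV = _mm_mul_ps(gmx_mm_pmecorrV_ps(z2), beta);   beta*erf(z)/z = erf(beta*r)/r
 *     __m128 vcoul = _mm_mul_ps(qq, _mm_sub_ps(rinv, corrV));    qq*(1/r - erf(beta*r)/r)
 */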
static int
gmx_mm_sincos_ps(__m128 x,
                 __m128 *sinval,
                 __m128 *cosval)
{
    const __m128  two_over_pi = _mm_set1_ps(2.0/M_PI);
    const __m128  half        = _mm_set1_ps(0.5);
    const __m128  one         = _mm_set1_ps(1.0);

    const __m128i izero       = _mm_set1_epi32(0);
    const __m128i ione        = _mm_set1_epi32(1);
    const __m128i itwo        = _mm_set1_epi32(2);
    const __m128i ithree      = _mm_set1_epi32(3);
    const __m128  signbit     = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );

    const __m128  CA1         = _mm_set1_ps(1.5703125f);
    const __m128  CA2         = _mm_set1_ps(4.837512969970703125e-4f);
    const __m128  CA3         = _mm_set1_ps(7.54978995489188216e-8f);

    const __m128  CC0         = _mm_set1_ps(-0.0013602249f);
    const __m128  CC1         = _mm_set1_ps(0.0416566950f);
    const __m128  CC2         = _mm_set1_ps(-0.4999990225f);
    const __m128  CS0         = _mm_set1_ps(-0.0001950727f);
    const __m128  CS1         = _mm_set1_ps(0.0083320758f);
    const __m128  CS2         = _mm_set1_ps(-0.1666665247f);

    __m128  y, y2;
    __m128  z;
    __m128i iz;
    __m128i offset_sin, offset_cos;
    __m128  tmp1, tmp2;
    __m128  mask_sin, mask_cos;
    __m128  tmp_sin, tmp_cos;

    y = _mm_mul_ps(x, two_over_pi);
    y = _mm_add_ps(y, _mm_or_ps(_mm_and_ps(y, signbit), half));

    iz = _mm_cvttps_epi32(y);
    z  = _mm_round_ps(y, _MM_FROUND_TO_ZERO);

    offset_sin = _mm_and_si128(iz, ithree);
    offset_cos = _mm_add_epi32(iz, ione);

    /* Extended precision arithmetic to achieve full precision */
    y    = _mm_mul_ps(z, CA1);
    tmp1 = _mm_mul_ps(z, CA2);
    tmp2 = _mm_mul_ps(z, CA3);
    y    = _mm_sub_ps(x, y);
    y    = _mm_sub_ps(y, tmp1);
    y    = _mm_sub_ps(y, tmp2);

    y2 = _mm_mul_ps(y, y);

    tmp1 = _mm_mul_ps(CC0, y2);
    tmp1 = _mm_add_ps(tmp1, CC1);
    tmp2 = _mm_mul_ps(CS0, y2);
    tmp2 = _mm_add_ps(tmp2, CS1);
    tmp1 = _mm_mul_ps(tmp1, y2);
    tmp1 = _mm_add_ps(tmp1, CC2);
    tmp2 = _mm_mul_ps(tmp2, y2);
    tmp2 = _mm_add_ps(tmp2, CS2);

    tmp1 = _mm_mul_ps(tmp1, y2);
    tmp1 = _mm_add_ps(tmp1, one);

    tmp2 = _mm_mul_ps(tmp2, _mm_mul_ps(y, y2));
    tmp2 = _mm_add_ps(tmp2, y);

    mask_sin = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin, ione), izero));
    mask_cos = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos, ione), izero));

    tmp_sin = _mm_blendv_ps(tmp1, tmp2, mask_sin);
    tmp_cos = _mm_blendv_ps(tmp1, tmp2, mask_cos);

    mask_sin = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin, itwo), izero));
    mask_cos = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos, itwo), izero));

    tmp1 = _mm_xor_ps(signbit, tmp_sin);
    tmp2 = _mm_xor_ps(signbit, tmp_cos);

    *sinval = _mm_blendv_ps(tmp1, tmp_sin, mask_sin);
    *cosval = _mm_blendv_ps(tmp2, tmp_cos, mask_cos);

    return 0;
}
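/* Usage sketch (an illustrative addition, not part of the original header): sine and
 * cosine of four angles in one call, which is roughly twice as fast as calling
 * gmx_mm_sin_ps() and gmx_mm_cos_ps() separately. Names are hypothetical.
 *
 *     float  angle[4] = { 0.0f, 0.5235988f, 1.5707963f, 3.1415927f };   0, pi/6, pi/2, pi
 *     float  s[4], c[4];
 *     __m128 sv, cv;
 *
 *     gmx_mm_sincos_ps(_mm_loadu_ps(angle), &sv, &cv);
 *     _mm_storeu_ps(s, sv);   approximately { 0.0, 0.5,   1.0,  0.0 }
 *     _mm_storeu_ps(c, cv);   approximately { 1.0, 0.866, 0.0, -1.0 }
 */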
/*
 * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
 * will then call the sincos() routine and waste a factor 2 in performance!
 */
static __m128
gmx_mm_sin_ps(__m128 x)
{
    __m128 s, c;
    gmx_mm_sincos_ps(x, &s, &c);
    return s;
}
/*
 * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
 * will then call the sincos() routine and waste a factor 2 in performance!
 */
static __m128
gmx_mm_cos_ps(__m128 x)
{
    __m128 s, c;
    gmx_mm_sincos_ps(x, &s, &c);
    return c;
}
static __m128
gmx_mm_tan_ps(__m128 x)
{
    __m128 sinval, cosval;
    __m128 tanval;

    gmx_mm_sincos_ps(x, &sinval, &cosval);

    tanval = _mm_mul_ps(sinval, gmx_mm_inv_ps(cosval));

    return tanval;
}
static __m128
gmx_mm_asin_ps(__m128 x)
{
    /* Same algorithm as cephes library */
    const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
    const __m128 limitlow = _mm_set1_ps(1e-4f);
    const __m128 half     = _mm_set1_ps(0.5f);
    const __m128 one      = _mm_set1_ps(1.0f);
    const __m128 halfpi   = _mm_set1_ps(M_PI/2.0f);

    const __m128 CC5      = _mm_set1_ps(4.2163199048E-2f);
    const __m128 CC4      = _mm_set1_ps(2.4181311049E-2f);
    const __m128 CC3      = _mm_set1_ps(4.5470025998E-2f);
    const __m128 CC2      = _mm_set1_ps(7.4953002686E-2f);
    const __m128 CC1      = _mm_set1_ps(1.6666752422E-1f);

    __m128 sign;
    __m128 mask;
    __m128 xabs;
    __m128 z, z1, z2, q, q1, q2;
    __m128 pA, pB;

    sign = _mm_andnot_ps(signmask, x);
    xabs = _mm_and_ps(x, signmask);

    mask = _mm_cmpgt_ps(xabs, half);

    z1 = _mm_mul_ps(half, _mm_sub_ps(one, xabs));
    q1 = _mm_mul_ps(z1, gmx_mm_invsqrt_ps(z1));
    q1 = _mm_andnot_ps(_mm_cmpeq_ps(xabs, one), q1);

    q2 = xabs;
    z2 = _mm_mul_ps(q2, q2);

    z = _mm_or_ps( _mm_and_ps(mask, z1), _mm_andnot_ps(mask, z2) );
    q = _mm_or_ps( _mm_and_ps(mask, q1), _mm_andnot_ps(mask, q2) );

    z2 = _mm_mul_ps(z, z);

    pA = _mm_mul_ps(CC5, z2);
    pB = _mm_mul_ps(CC4, z2);

    pA = _mm_add_ps(pA, CC3);
    pB = _mm_add_ps(pB, CC2);

    pA = _mm_mul_ps(pA, z2);
    pB = _mm_mul_ps(pB, z2);

    pA = _mm_add_ps(pA, CC1);
    pA = _mm_mul_ps(pA, z);

    z = _mm_add_ps(pA, pB);
    z = _mm_mul_ps(z, q);
    z = _mm_add_ps(z, q);

    q2 = _mm_sub_ps(halfpi, z);
    q2 = _mm_sub_ps(q2, z);

    z = _mm_or_ps( _mm_and_ps(mask, q2), _mm_andnot_ps(mask, z) );

    mask = _mm_cmpgt_ps(xabs, limitlow);
    z    = _mm_or_ps( _mm_and_ps(mask, z), _mm_andnot_ps(mask, xabs) );

    z = _mm_xor_ps(z, sign);

    return z;
}
static __m128
gmx_mm_acos_ps(__m128 x)
{
    const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
    const __m128 one_ps    = _mm_set1_ps(1.0f);
    const __m128 half_ps   = _mm_set1_ps(0.5f);
    const __m128 pi_ps     = _mm_set1_ps(M_PI);
    const __m128 halfpi_ps = _mm_set1_ps(M_PI/2.0f);

    __m128 mask1;
    __m128 mask2;
    __m128 xabs;
    __m128 z, z1, z2, z3;

    xabs  = _mm_and_ps(x, signmask);
    mask1 = _mm_cmpgt_ps(xabs, half_ps);
    mask2 = _mm_cmpgt_ps(x, _mm_setzero_ps());

    z = _mm_mul_ps(half_ps, _mm_sub_ps(one_ps, xabs));
    z = _mm_mul_ps(z, gmx_mm_invsqrt_ps(z));
    z = _mm_andnot_ps(_mm_cmpeq_ps(xabs, one_ps), z);

    z = _mm_blendv_ps(x, z, mask1);
    z = gmx_mm_asin_ps(z);

    z2 = _mm_add_ps(z, z);
    z1 = _mm_sub_ps(pi_ps, z2);
    z3 = _mm_sub_ps(halfpi_ps, z);

    z = _mm_blendv_ps(z1, z2, mask2);
    z = _mm_blendv_ps(z3, z, mask1);

    return z;
}
static __m128
gmx_mm_atan_ps(__m128 x)
{
    /* Same algorithm as cephes library */
    const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
    const __m128 limit1    = _mm_set1_ps(0.414213562373095f);
    const __m128 limit2    = _mm_set1_ps(2.414213562373095f);
    const __m128 quarterpi = _mm_set1_ps(0.785398163397448f);
    const __m128 halfpi    = _mm_set1_ps(1.570796326794896f);
    const __m128 mone      = _mm_set1_ps(-1.0f);
    const __m128 CC3       = _mm_set1_ps(-3.33329491539E-1f);
    const __m128 CC5       = _mm_set1_ps(1.99777106478E-1f);
    const __m128 CC7       = _mm_set1_ps(-1.38776856032E-1f);
    const __m128 CC9       = _mm_set1_ps(8.05374449538e-2f);

    __m128 sign;
    __m128 mask1, mask2;
    __m128 y, z1, z2;
    __m128 x2, x4;
    __m128 sum1, sum2;

    sign = _mm_andnot_ps(signmask, x);
    x    = _mm_and_ps(x, signmask);

    mask1 = _mm_cmpgt_ps(x, limit1);
    mask2 = _mm_cmpgt_ps(x, limit2);

    z1 = _mm_mul_ps(_mm_add_ps(x, mone), gmx_mm_inv_ps(_mm_sub_ps(x, mone)));
    z2 = _mm_mul_ps(mone, gmx_mm_inv_ps(x));

    y = _mm_and_ps(mask1, quarterpi);
    y = _mm_blendv_ps(y, halfpi, mask2);

    x = _mm_blendv_ps(x, z1, mask1);
    x = _mm_blendv_ps(x, z2, mask2);

    x2 = _mm_mul_ps(x, x);
    x4 = _mm_mul_ps(x2, x2);

    sum1 = _mm_mul_ps(CC9, x4);
    sum2 = _mm_mul_ps(CC7, x4);
    sum1 = _mm_add_ps(sum1, CC5);
    sum2 = _mm_add_ps(sum2, CC3);
    sum1 = _mm_mul_ps(sum1, x4);
    sum2 = _mm_mul_ps(sum2, x2);

    sum1 = _mm_add_ps(sum1, sum2);
    sum1 = _mm_sub_ps(sum1, mone);
    sum1 = _mm_mul_ps(sum1, x);
    y    = _mm_add_ps(y, sum1);

    y = _mm_xor_ps(y, sign);

    return y;
}
static __m128
gmx_mm_atan2_ps(__m128 y, __m128 x)
{
    const __m128 pi          = _mm_set1_ps(M_PI);
    const __m128 minuspi     = _mm_set1_ps(-M_PI);
    const __m128 halfpi      = _mm_set1_ps(M_PI/2.0);
    const __m128 minushalfpi = _mm_set1_ps(-M_PI/2.0);

    __m128 z, z1, z3, z4;
    __m128 w;
    __m128 maskx_lt, maskx_eq;
    __m128 masky_lt, masky_eq;
    __m128 mask1, mask2, mask3, mask4, maskall;

    maskx_lt = _mm_cmplt_ps(x, _mm_setzero_ps());
    masky_lt = _mm_cmplt_ps(y, _mm_setzero_ps());
    maskx_eq = _mm_cmpeq_ps(x, _mm_setzero_ps());
    masky_eq = _mm_cmpeq_ps(y, _mm_setzero_ps());

    z = _mm_mul_ps(y, gmx_mm_inv_ps(x));
    z = gmx_mm_atan_ps(z);

    mask1 = _mm_and_ps(maskx_eq, masky_lt);
    mask2 = _mm_andnot_ps(maskx_lt, masky_eq);
    mask3 = _mm_andnot_ps( _mm_or_ps(masky_lt, masky_eq), maskx_eq);
    mask4 = _mm_and_ps(masky_eq, maskx_lt);

    maskall = _mm_or_ps( _mm_or_ps(mask1, mask2), _mm_or_ps(mask3, mask4) );

    z  = _mm_andnot_ps(maskall, z);
    z1 = _mm_and_ps(mask1, minushalfpi);
    z3 = _mm_and_ps(mask3, halfpi);
    z4 = _mm_and_ps(mask4, pi);

    z = _mm_or_ps( _mm_or_ps(z, z1), _mm_or_ps(z3, z4) );

    mask1 = _mm_andnot_ps(masky_lt, maskx_lt);
    mask2 = _mm_and_ps(maskx_lt, masky_lt);

    w = _mm_or_ps( _mm_and_ps(mask1, pi), _mm_and_ps(mask2, minuspi) );
    w = _mm_andnot_ps(maskall, w);

    z = _mm_add_ps(z, w);

    return z;
}
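/* Usage sketch (an illustrative addition, not part of the original header): converting
 * four (x,y) pairs to polar angles, following the atan2(y,x) argument order of libm.
 * Names are hypothetical.
 *
 *     float xv[4]  = { 1.0f, 0.0f, -1.0f,  1.0f };
 *     float yv[4]  = { 0.0f, 1.0f,  0.0f, -1.0f };
 *     float phi[4];
 *
 *     _mm_storeu_ps(phi, gmx_mm_atan2_ps(_mm_loadu_ps(yv), _mm_loadu_ps(xv)));
 *     phi is approximately { 0.0, 1.5708, 3.1416, -0.7854 }
 */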
#endif /* _gmx_math_x86_sse4_1_single_h_ */