/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 *
 * This file is part of GROMACS.
 * Copyright (c) 2012-
 *
 * Written by the Gromacs development team under coordination of
 * David van der Spoel, Berk Hess, and Erik Lindahl.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org
 *
 * And Hey:
 * Gnomes, ROck Monsters And Chili Sauce
 */
#ifndef _gmx_math_x86_avx_128_fma_single_h_
#define _gmx_math_x86_avx_128_fma_single_h_

#include <immintrin.h> /* AVX */
#ifdef HAVE_X86INTRIN_H
#include <x86intrin.h> /* FMA */
#endif

#include <math.h>

#include "gmx_x86_avx_128_fma.h"


#ifndef M_PI
#  define M_PI 3.14159265358979323846264338327950288
#endif


/************************
 *                      *
 * Simple math routines *
 *                      *
 ************************/
/* 1.0/sqrt(x) */
static gmx_inline __m128
gmx_mm_invsqrt_ps(__m128 x)
{
    const __m128 half  = _mm_set_ps(0.5,0.5,0.5,0.5);
    const __m128 three = _mm_set_ps(3.0,3.0,3.0,3.0);

    /* Hardware rsqrt lookup (~11 bits), refined by one Newton-Raphson step:
     * lu' = 0.5*lu*(3 - x*lu*lu)
     */
    __m128 lu = _mm_rsqrt_ps(x);

    return _mm_mul_ps(_mm_mul_ps(half,lu),_mm_nmacc_ps(_mm_mul_ps(lu,lu),x,three));
}
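
/* Illustrative usage sketch, not part of the original header: obtaining 1/r and
 * 1/r^2 from a packed squared distance, the typical pattern in nonbonded kernels.
 * The function and variable names below are hypothetical.
 */
static gmx_inline void
gmx_mm_example_rinv_sketch(__m128 rsq, __m128 *rinv, __m128 *rinvsq)
{
    *rinv   = gmx_mm_invsqrt_ps(rsq);     /* 1/r   = 1/sqrt(r^2) */
    *rinvsq = _mm_mul_ps(*rinv,*rinv);    /* 1/r^2 = (1/r)^2     */
}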

/* sqrt(x) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
static gmx_inline __m128
gmx_mm_sqrt_ps(__m128 x)
{
    __m128 mask;
    __m128 res;

    /* Mask out x==0 lanes so that 0*invsqrt(0) does not produce 0*inf=NaN */
    mask = _mm_cmp_ps(x,_mm_setzero_ps(),_CMP_EQ_OQ);
    res  = _mm_andnot_ps(mask,gmx_mm_invsqrt_ps(x));

    res  = _mm_mul_ps(x,res);

    return res;
}

/* 1.0/x */
static gmx_inline __m128
gmx_mm_inv_ps(__m128 x)
{
    const __m128 two = _mm_set_ps(2.0f,2.0f,2.0f,2.0f);

    __m128 lu = _mm_rcp_ps(x);

    return _mm_mul_ps(lu,_mm_nmacc_ps(lu,x,two));
}

static gmx_inline __m128
gmx_mm_abs_ps(__m128 x)
{
    const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );

    return _mm_and_ps(x,signmask);
}

static __m128
gmx_mm_log_ps(__m128 x)
{
    /* Same algorithm as cephes library */
    const __m128  expmask    = gmx_mm_castsi128_ps( _mm_set_epi32(0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000) );
    const __m128i expbase_m1 = _mm_set1_epi32(127-1); /* We want non-IEEE format */
    const __m128  half       = _mm_set1_ps(0.5f);
    const __m128  one        = _mm_set1_ps(1.0f);
    const __m128  invsq2     = _mm_set1_ps(1.0f/sqrt(2.0f));
    const __m128  corr1      = _mm_set1_ps(-2.12194440e-4f);
    const __m128  corr2      = _mm_set1_ps(0.693359375f);

    const __m128  CA_1       = _mm_set1_ps(0.070376836292f);
    const __m128  CB_0       = _mm_set1_ps(1.6714950086782716f);
    const __m128  CB_1       = _mm_set1_ps(-2.452088066061482f);
    const __m128  CC_0       = _mm_set1_ps(1.5220770854701728f);
    const __m128  CC_1       = _mm_set1_ps(-1.3422238433233642f);
    const __m128  CD_0       = _mm_set1_ps(1.386218787509749f);
    const __m128  CD_1       = _mm_set1_ps(0.35075468953796346f);
    const __m128  CE_0       = _mm_set1_ps(1.3429983063133937f);
    const __m128  CE_1       = _mm_set1_ps(1.807420826584643f);

    __m128  fexp,fexp1;
    __m128i iexp;
    __m128  mask;
    __m128  x1,x2;
    __m128  y;
    __m128  pA,pB,pC,pD,pE,tB,tC,tD,tE;

    /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
    fexp  = _mm_and_ps(x,expmask);
    iexp  = gmx_mm_castps_si128(fexp);
    iexp  = _mm_srli_epi32(iexp,23);
    iexp  = _mm_sub_epi32(iexp,expbase_m1);

    x     = _mm_andnot_ps(expmask,x);
    x     = _mm_or_ps(x,one);
    x     = _mm_mul_ps(x,half);

    mask  = _mm_cmp_ps(x,invsq2,_CMP_LT_OQ);

    x     = _mm_add_ps(x,_mm_and_ps(mask,x));
    x     = _mm_sub_ps(x,one);
    iexp  = _mm_add_epi32(iexp,gmx_mm_castps_si128(mask)); /* 0xFFFFFFFF = -1 as int */

    x2    = _mm_mul_ps(x,x);

    pA    = _mm_mul_ps(CA_1,x);

    pB    = _mm_add_ps(x,CB_1);
    pC    = _mm_add_ps(x,CC_1);
    pD    = _mm_add_ps(x,CD_1);
    pE    = _mm_add_ps(x,CE_1);

    pB    = _mm_macc_ps(x,pB,CB_0);
    pC    = _mm_macc_ps(x,pC,CC_0);
    pD    = _mm_macc_ps(x,pD,CD_0);
    pE    = _mm_macc_ps(x,pE,CE_0);

    pA    = _mm_mul_ps(pA,pB);
    pC    = _mm_mul_ps(pC,pD);
    pE    = _mm_mul_ps(pE,x2);
    pA    = _mm_mul_ps(pA,pC);
    y     = _mm_mul_ps(pA,pE);

    fexp  = _mm_cvtepi32_ps(iexp);
    y     = _mm_macc_ps(fexp,corr1,y);

    y     = _mm_nmacc_ps(half,x2,y);

    x2    = _mm_add_ps(x,y);
    x2    = _mm_macc_ps(fexp,corr2,x2);

    return x2;
}

/* 2^x function.
 *
 * The 2^w term is calculated from a (6,0)-th order (no denominator) Minimax polynomial on the interval
 * [-0.5,0.5]. Its coefficients were derived in Mathematica using the command:
 *
 * MiniMaxApproximation[(2^x), {x, {-0.5, 0.5}, 6, 0}, WorkingPrecision -> 15]
 *
 * The largest-magnitude exponent we can represent in IEEE single-precision binary format
 * is 2^-126 for small numbers and 2^127 for large ones. To avoid wrap-around problems, we set the
 * result to zero if the argument falls outside this range. For small numbers this is just fine, but
 * for large numbers you could be fancy and return the smallest/largest IEEE single-precision
 * number instead. That would take a few extra cycles and not really help, since something is
 * wrong if you are using single precision to work with numbers that cannot really be represented
 * in single precision.
 *
 * The accuracy is at least 23 bits.
 */
static __m128
gmx_mm_exp2_ps(__m128 x)
{
    /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
    const __m128  arglimit = _mm_set1_ps(126.0f);

    const __m128i expbase  = _mm_set1_epi32(127);
    const __m128  CA6      = _mm_set1_ps(1.535336188319500E-004);
    const __m128  CA5      = _mm_set1_ps(1.339887440266574E-003);
    const __m128  CA4      = _mm_set1_ps(9.618437357674640E-003);
    const __m128  CA3      = _mm_set1_ps(5.550332471162809E-002);
    const __m128  CA2      = _mm_set1_ps(2.402264791363012E-001);
    const __m128  CA1      = _mm_set1_ps(6.931472028550421E-001);
    const __m128  CA0      = _mm_set1_ps(1.0f);

    __m128  valuemask;
    __m128i iexppart;
    __m128  fexppart;
    __m128  intpart;
    __m128  x2;
    __m128  p0,p1;

    iexppart  = _mm_cvtps_epi32(x);
    intpart   = _mm_round_ps(x,_MM_FROUND_TO_NEAREST_INT);
    iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart,expbase),23);
    valuemask = _mm_cmp_ps(arglimit,gmx_mm_abs_ps(x),_CMP_GE_OQ);
    fexppart  = _mm_and_ps(valuemask,gmx_mm_castsi128_ps(iexppart));

    x         = _mm_sub_ps(x,intpart);
    x2        = _mm_mul_ps(x,x);

    p0        = _mm_macc_ps(CA6,x2,CA4);
    p1        = _mm_macc_ps(CA5,x2,CA3);
    p0        = _mm_macc_ps(p0,x2,CA2);
    p1        = _mm_macc_ps(p1,x2,CA1);
    p0        = _mm_macc_ps(p0,x2,CA0);
    p0        = _mm_macc_ps(p1,x,p0);
    x         = _mm_mul_ps(p0,fexppart);

    return x;
}
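
/* Illustrative usage sketch, not part of the original header: evaluating 2^x for
 * four packed arguments loaded from memory. Arguments with |x| > 126 return 0.0,
 * as described in the comment above. The array and function names are hypothetical.
 */
static gmx_inline void
gmx_mm_example_exp2_sketch(const float in[4], float out[4])
{
    __m128 x;

    x = _mm_loadu_ps(in);       /* four packed single-precision arguments */
    x = gmx_mm_exp2_ps(x);      /* 2^x, clamped to 0.0 outside the range  */
    _mm_storeu_ps(out,x);
}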

/* Exponential function. This could be calculated from 2^x as Exp(x)=2^(y), where y=log2(e)*x,
 * but there will then be a small rounding error since we lose some precision due to the
 * multiplication. This will then be magnified a lot by the exponential.
 *
 * Instead, we calculate the fractional part directly as a minimax approximation of
 * Exp(z) on [-0.5,0.5]. We use extended-precision arithmetic to calculate the fraction
 * remaining after 2^y, which avoids the precision loss.
 * The final result is correct to within 1 LSB over the entire argument range.
 */
static __m128
gmx_mm_exp_ps(__m128 x)
{
    const __m128  argscale     = _mm_set1_ps(1.44269504088896341f);
    /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
    const __m128  arglimit     = _mm_set1_ps(126.0f);
    const __m128i expbase      = _mm_set1_epi32(127);

    const __m128  invargscale0 = _mm_set1_ps(0.693359375f);
    const __m128  invargscale1 = _mm_set1_ps(-2.12194440e-4f);

    const __m128  CC5          = _mm_set1_ps(1.9875691500e-4f);
    const __m128  CC4          = _mm_set1_ps(1.3981999507e-3f);
    const __m128  CC3          = _mm_set1_ps(8.3334519073e-3f);
    const __m128  CC2          = _mm_set1_ps(4.1665795894e-2f);
    const __m128  CC1          = _mm_set1_ps(1.6666665459e-1f);
    const __m128  CC0          = _mm_set1_ps(5.0000001201e-1f);
    const __m128  one          = _mm_set1_ps(1.0f);

    __m128  y,x2;
    __m128  p0,p1;
    __m128  valuemask;
    __m128i iexppart;
    __m128  fexppart;
    __m128  intpart;

    y         = _mm_mul_ps(x,argscale);

    iexppart  = _mm_cvtps_epi32(y);
    intpart   = _mm_round_ps(y,_MM_FROUND_TO_NEAREST_INT);

    iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart,expbase),23);
    valuemask = _mm_cmp_ps(arglimit,gmx_mm_abs_ps(y),_CMP_GE_OQ);
    fexppart  = _mm_and_ps(valuemask,gmx_mm_castsi128_ps(iexppart));

    /* Extended-precision arithmetic */
    x         = _mm_nmacc_ps(invargscale0,intpart,x);
    x         = _mm_nmacc_ps(invargscale1,intpart,x);

    x2        = _mm_mul_ps(x,x);

    p1        = _mm_macc_ps(CC5,x2,CC3);
    p0        = _mm_macc_ps(CC4,x2,CC2);
    p1        = _mm_macc_ps(p1,x2,CC1);
    p0        = _mm_macc_ps(p0,x2,CC0);
    p0        = _mm_macc_ps(p1,x,p0);
    p0        = _mm_macc_ps(p0,x2,one);

    x         = _mm_add_ps(x,p0);

    x         = _mm_mul_ps(x,fexppart);

    return x;
}
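
/* Illustrative usage sketch, not part of the original header: four Boltzmann
 * factors exp(-E/kT) evaluated with a single vectorized call. All names here
 * are hypothetical.
 */
static gmx_inline __m128
gmx_mm_example_boltzmann_sketch(__m128 energy, __m128 kT)
{
    __m128 arg;

    /* -E/kT for each lane, then one call to the packed exponential */
    arg = _mm_mul_ps(_mm_set1_ps(-1.0f),_mm_mul_ps(energy,gmx_mm_inv_ps(kT)));

    return gmx_mm_exp_ps(arg);
}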

/* FULL precision. Only errors in LSB */
static __m128
gmx_mm_erf_ps(__m128 x)
{
    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
    const __m128  CA6      = _mm_set1_ps(7.853861353153693e-5f);
    const __m128  CA5      = _mm_set1_ps(-8.010193625184903e-4f);
    const __m128  CA4      = _mm_set1_ps(5.188327685732524e-3f);
    const __m128  CA3      = _mm_set1_ps(-2.685381193529856e-2f);
    const __m128  CA2      = _mm_set1_ps(1.128358514861418e-1f);
    const __m128  CA1      = _mm_set1_ps(-3.761262582423300e-1f);
    const __m128  CA0      = _mm_set1_ps(1.128379165726710f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
    const __m128  CB9      = _mm_set1_ps(-0.0018629930017603923f);
    const __m128  CB8      = _mm_set1_ps(0.003909821287598495f);
    const __m128  CB7      = _mm_set1_ps(-0.0052094582210355615f);
    const __m128  CB6      = _mm_set1_ps(0.005685614362160572f);
    const __m128  CB5      = _mm_set1_ps(-0.0025367682853477272f);
    const __m128  CB4      = _mm_set1_ps(-0.010199799682318782f);
    const __m128  CB3      = _mm_set1_ps(0.04369575504816542f);
    const __m128  CB2      = _mm_set1_ps(-0.11884063474674492f);
    const __m128  CB1      = _mm_set1_ps(0.2732120154030589f);
    const __m128  CB0      = _mm_set1_ps(0.42758357702025784f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
    const __m128  CC10     = _mm_set1_ps(-0.0445555913112064f);
    const __m128  CC9      = _mm_set1_ps(0.21376355144663348f);
    const __m128  CC8      = _mm_set1_ps(-0.3473187200259257f);
    const __m128  CC7      = _mm_set1_ps(0.016690861551248114f);
    const __m128  CC6      = _mm_set1_ps(0.7560973182491192f);
    const __m128  CC5      = _mm_set1_ps(-1.2137903600145787f);
    const __m128  CC4      = _mm_set1_ps(0.8411872321232948f);
    const __m128  CC3      = _mm_set1_ps(-0.08670413896296343f);
    const __m128  CC2      = _mm_set1_ps(-0.27124782687240334f);
    const __m128  CC1      = _mm_set1_ps(-0.0007502488047806069f);
    const __m128  CC0      = _mm_set1_ps(0.5642114853803148f);

    /* Coefficients for expansion of exp(x) in [0,0.1] */
    /* CD0 and CD1 are both 1.0, so no need to declare them separately */
    const __m128  CD2      = _mm_set1_ps(0.5000066608081202f);
    const __m128  CD3      = _mm_set1_ps(0.1664795422874624f);
    const __m128  CD4      = _mm_set1_ps(0.04379839977652482f);

    const __m128  sieve    = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
    const __m128  signbit  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
    const __m128  one      = _mm_set1_ps(1.0f);
    const __m128  two      = _mm_set1_ps(2.0f);

    __m128 x2,x4,y;
    __m128 z,q,t,t2,w,w2;
    __m128 pA0,pA1,pB0,pB1,pC0,pC1;
    __m128 expmx2,corr;
    __m128 res_erf,res_erfc,res;
    __m128 mask;

    /* Calculate erf() */
    x2   = _mm_mul_ps(x,x);
    x4   = _mm_mul_ps(x2,x2);

    pA0  = _mm_macc_ps(CA6,x4,CA4);
    pA1  = _mm_macc_ps(CA5,x4,CA3);
    pA0  = _mm_macc_ps(pA0,x4,CA2);
    pA1  = _mm_macc_ps(pA1,x4,CA1);
    pA0  = _mm_mul_ps(pA0,x4);
    pA0  = _mm_macc_ps(pA1,x2,pA0);
    /* Constant term must come last for precision reasons */
    pA0  = _mm_add_ps(pA0,CA0);

    res_erf = _mm_mul_ps(x,pA0);

    /* Calculate erfc */

    y    = gmx_mm_abs_ps(x);
    t    = gmx_mm_inv_ps(y);
    w    = _mm_sub_ps(t,one);
    t2   = _mm_mul_ps(t,t);
    w2   = _mm_mul_ps(w,w);
    /*
     * We cannot simply calculate exp(-x2) directly in single precision, since
     * that will lose a couple of bits of precision due to the multiplication.
     * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
     * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
     *
     * The only drawback with this is that it requires TWO separate exponential
     * evaluations, which would be horrible performance-wise. However, the argument
     * for the second exp() call is always small, so there we simply use a
     * low-order minimax expansion on [0,0.1].
     */

    z       = _mm_and_ps(y,sieve);
    q       = _mm_mul_ps( _mm_sub_ps(z,y) , _mm_add_ps(z,y) );

    corr    = _mm_macc_ps(CD4,q,CD3);
    corr    = _mm_macc_ps(corr,q,CD2);
    corr    = _mm_macc_ps(corr,q,one);
    corr    = _mm_macc_ps(corr,q,one);

    expmx2  = gmx_mm_exp_ps( _mm_or_ps( signbit , _mm_mul_ps(z,z) ) );
    expmx2  = _mm_mul_ps(expmx2,corr);

    pB1  = _mm_macc_ps(CB9,w2,CB7);
    pB0  = _mm_macc_ps(CB8,w2,CB6);
    pB1  = _mm_macc_ps(pB1,w2,CB5);
    pB0  = _mm_macc_ps(pB0,w2,CB4);
    pB1  = _mm_macc_ps(pB1,w2,CB3);
    pB0  = _mm_macc_ps(pB0,w2,CB2);
    pB1  = _mm_macc_ps(pB1,w2,CB1);
    pB0  = _mm_macc_ps(pB0,w2,CB0);
    pB0  = _mm_macc_ps(pB1,w,pB0);

    pC0  = _mm_macc_ps(CC10,t2,CC8);
    pC1  = _mm_macc_ps(CC9,t2,CC7);
    pC0  = _mm_macc_ps(pC0,t2,CC6);
    pC1  = _mm_macc_ps(pC1,t2,CC5);
    pC0  = _mm_macc_ps(pC0,t2,CC4);
    pC1  = _mm_macc_ps(pC1,t2,CC3);
    pC0  = _mm_macc_ps(pC0,t2,CC2);
    pC1  = _mm_macc_ps(pC1,t2,CC1);

    pC0  = _mm_macc_ps(pC0,t2,CC0);
    pC0  = _mm_macc_ps(pC1,t,pC0);
    pC0  = _mm_mul_ps(pC0,t);

    /* SELECT pB0 or pC0 for erfc() */
    mask     = _mm_cmp_ps(two,y,_CMP_LT_OQ);
    res_erfc = _mm_blendv_ps(pB0,pC0,mask);
    res_erfc = _mm_mul_ps(res_erfc,expmx2);

    /* erfc(x<0) = 2-erfc(|x|) */
    mask     = _mm_cmp_ps(x,_mm_setzero_ps(),_CMP_LT_OQ);
    res_erfc = _mm_blendv_ps(res_erfc,_mm_sub_ps(two,res_erfc),mask);

    /* Select erf() or erfc() */
    mask = _mm_cmp_ps(y,_mm_set1_ps(0.75f),_CMP_LT_OQ);
    res  = _mm_blendv_ps(_mm_sub_ps(one,res_erfc),res_erf,mask);

    return res;
}

/* FULL precision. Only errors in LSB */
static __m128
gmx_mm_erfc_ps(__m128 x)
{
    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
    const __m128  CA6      = _mm_set1_ps(7.853861353153693e-5f);
    const __m128  CA5      = _mm_set1_ps(-8.010193625184903e-4f);
    const __m128  CA4      = _mm_set1_ps(5.188327685732524e-3f);
    const __m128  CA3      = _mm_set1_ps(-2.685381193529856e-2f);
    const __m128  CA2      = _mm_set1_ps(1.128358514861418e-1f);
    const __m128  CA1      = _mm_set1_ps(-3.761262582423300e-1f);
    const __m128  CA0      = _mm_set1_ps(1.128379165726710f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
    const __m128  CB9      = _mm_set1_ps(-0.0018629930017603923f);
    const __m128  CB8      = _mm_set1_ps(0.003909821287598495f);
    const __m128  CB7      = _mm_set1_ps(-0.0052094582210355615f);
    const __m128  CB6      = _mm_set1_ps(0.005685614362160572f);
    const __m128  CB5      = _mm_set1_ps(-0.0025367682853477272f);
    const __m128  CB4      = _mm_set1_ps(-0.010199799682318782f);
    const __m128  CB3      = _mm_set1_ps(0.04369575504816542f);
    const __m128  CB2      = _mm_set1_ps(-0.11884063474674492f);
    const __m128  CB1      = _mm_set1_ps(0.2732120154030589f);
    const __m128  CB0      = _mm_set1_ps(0.42758357702025784f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
    const __m128  CC10     = _mm_set1_ps(-0.0445555913112064f);
    const __m128  CC9      = _mm_set1_ps(0.21376355144663348f);
    const __m128  CC8      = _mm_set1_ps(-0.3473187200259257f);
    const __m128  CC7      = _mm_set1_ps(0.016690861551248114f);
    const __m128  CC6      = _mm_set1_ps(0.7560973182491192f);
    const __m128  CC5      = _mm_set1_ps(-1.2137903600145787f);
    const __m128  CC4      = _mm_set1_ps(0.8411872321232948f);
    const __m128  CC3      = _mm_set1_ps(-0.08670413896296343f);
    const __m128  CC2      = _mm_set1_ps(-0.27124782687240334f);
    const __m128  CC1      = _mm_set1_ps(-0.0007502488047806069f);
    const __m128  CC0      = _mm_set1_ps(0.5642114853803148f);

    /* Coefficients for expansion of exp(x) in [0,0.1] */
    /* CD0 and CD1 are both 1.0, so no need to declare them separately */
    const __m128  CD2      = _mm_set1_ps(0.5000066608081202f);
    const __m128  CD3      = _mm_set1_ps(0.1664795422874624f);
    const __m128  CD4      = _mm_set1_ps(0.04379839977652482f);

    const __m128  sieve    = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
    const __m128  signbit  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
    const __m128  one      = _mm_set1_ps(1.0f);
    const __m128  two      = _mm_set1_ps(2.0f);

    __m128 x2,x4,y;
    __m128 z,q,t,t2,w,w2;
    __m128 pA0,pA1,pB0,pB1,pC0,pC1;
    __m128 expmx2,corr;
    __m128 res_erf,res_erfc,res;
    __m128 mask;

    /* Calculate erf() */
    x2   = _mm_mul_ps(x,x);
    x4   = _mm_mul_ps(x2,x2);

    pA0  = _mm_macc_ps(CA6,x4,CA4);
    pA1  = _mm_macc_ps(CA5,x4,CA3);
    pA0  = _mm_macc_ps(pA0,x4,CA2);
    pA1  = _mm_macc_ps(pA1,x4,CA1);
    pA1  = _mm_mul_ps(pA1,x2);
    pA0  = _mm_macc_ps(pA0,x4,pA1);
    /* Constant term must come last for precision reasons */
    pA0  = _mm_add_ps(pA0,CA0);

    res_erf = _mm_mul_ps(x,pA0);

    /* Calculate erfc */
    y    = gmx_mm_abs_ps(x);
    t    = gmx_mm_inv_ps(y);
    w    = _mm_sub_ps(t,one);
    t2   = _mm_mul_ps(t,t);
    w2   = _mm_mul_ps(w,w);
    /*
     * We cannot simply calculate exp(-x2) directly in single precision, since
     * that will lose a couple of bits of precision due to the multiplication.
     * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
     * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
     *
     * The only drawback with this is that it requires TWO separate exponential
     * evaluations, which would be horrible performance-wise. However, the argument
     * for the second exp() call is always small, so there we simply use a
     * low-order minimax expansion on [0,0.1].
     */

    z       = _mm_and_ps(y,sieve);
    q       = _mm_mul_ps( _mm_sub_ps(z,y) , _mm_add_ps(z,y) );

    corr    = _mm_macc_ps(CD4,q,CD3);
    corr    = _mm_macc_ps(corr,q,CD2);
    corr    = _mm_macc_ps(corr,q,one);
    corr    = _mm_macc_ps(corr,q,one);

    expmx2  = gmx_mm_exp_ps( _mm_or_ps( signbit , _mm_mul_ps(z,z) ) );
    expmx2  = _mm_mul_ps(expmx2,corr);

    pB1  = _mm_macc_ps(CB9,w2,CB7);
    pB0  = _mm_macc_ps(CB8,w2,CB6);
    pB1  = _mm_macc_ps(pB1,w2,CB5);
    pB0  = _mm_macc_ps(pB0,w2,CB4);
    pB1  = _mm_macc_ps(pB1,w2,CB3);
    pB0  = _mm_macc_ps(pB0,w2,CB2);
    pB1  = _mm_macc_ps(pB1,w2,CB1);
    pB0  = _mm_macc_ps(pB0,w2,CB0);
    pB0  = _mm_macc_ps(pB1,w,pB0);

    pC0  = _mm_macc_ps(CC10,t2,CC8);
    pC1  = _mm_macc_ps(CC9,t2,CC7);
    pC0  = _mm_macc_ps(pC0,t2,CC6);
    pC1  = _mm_macc_ps(pC1,t2,CC5);
    pC0  = _mm_macc_ps(pC0,t2,CC4);
    pC1  = _mm_macc_ps(pC1,t2,CC3);
    pC0  = _mm_macc_ps(pC0,t2,CC2);
    pC1  = _mm_macc_ps(pC1,t2,CC1);

    pC0  = _mm_macc_ps(pC0,t2,CC0);
    pC0  = _mm_macc_ps(pC1,t,pC0);
    pC0  = _mm_mul_ps(pC0,t);

    /* SELECT pB0 or pC0 for erfc() */
    mask     = _mm_cmp_ps(two,y,_CMP_LT_OQ);
    res_erfc = _mm_blendv_ps(pB0,pC0,mask);
    res_erfc = _mm_mul_ps(res_erfc,expmx2);

    /* erfc(x<0) = 2-erfc(|x|) */
    mask     = _mm_cmp_ps(x,_mm_setzero_ps(),_CMP_LT_OQ);
    res_erfc = _mm_blendv_ps(res_erfc,_mm_sub_ps(two,res_erfc),mask);

    /* Select erf() or erfc() */
    mask = _mm_cmp_ps(y,_mm_set1_ps(0.75f),_CMP_LT_OQ);
    res  = _mm_blendv_ps(res_erfc,_mm_sub_ps(one,res_erf),mask);

    return res;
}
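
/* Illustrative usage sketch, not part of the original header: the screened
 * direct-space Coulomb potential qq*erfc(beta*r)/r, which is the form the PME
 * comments below refer to. All parameter names here are hypothetical.
 */
static gmx_inline __m128
gmx_mm_example_ewald_pot_sketch(__m128 rsq, __m128 beta, __m128 qq)
{
    __m128 rinv,r,erfcval;

    rinv    = gmx_mm_invsqrt_ps(rsq);                  /* 1/r               */
    r       = _mm_mul_ps(rsq,rinv);                    /* r = r^2 * 1/r     */
    erfcval = gmx_mm_erfc_ps(_mm_mul_ps(beta,r));      /* erfc(beta*r)      */

    return _mm_mul_ps(qq,_mm_mul_ps(erfcval,rinv));    /* qq*erfc(beta*r)/r */
}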

/* Calculate the force correction due to PME analytically.
 *
 * This routine is meant to enable analytical evaluation of the
 * direct-space PME electrostatic force to avoid tables.
 *
 * The direct-space potential should be Erfc(beta*r)/r, but there
 * are some problems evaluating that:
 *
 * First, the error function is difficult (read: expensive) to
 * approximate accurately for intermediate to large arguments, and
 * this happens already in ranges of beta*r that occur in simulations.
 * Second, we now try to avoid calculating potentials in Gromacs but
 * use forces directly.
 *
 * We can simplify things slightly by noting that the PME part is really
 * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
 *
 * V = 1/r - Erf(beta*r)/r
 *
 * The first term we already have from the inverse square root, so
 * that we can leave out of this routine.
 *
 * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
 * the argument beta*r will be in the range 0.15 to ~4. Use your
 * favorite plotting program to realize how well-behaved Erf(z)/z is
 * in this range!
 *
 * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
 * However, it turns out it is more efficient to approximate f(z)/z and
 * then only use even powers. This is another minor optimization, since
 * we actually WANT f(z)/z, because it is going to be multiplied by
 * the vector between the two atoms to get the vectorial force. The
 * fastest flops are the ones we can avoid calculating!
 *
 * So, here's how it should be used:
 *
 * 1. Calculate r^2.
 * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
 * 3. Evaluate this routine with z^2 as the argument.
 * 4. The return value is the expression:
 *
 *       2*exp(-z^2)     erf(z)
 *       ------------ - --------
 *       sqrt(Pi)*z^2      z^3
 *
 * 5. Multiply the entire expression by beta^3. This will get you
 *
 *       beta^3*2*exp(-z^2)     beta^3*erf(z)
 *       ------------------ - ---------------
 *          sqrt(Pi)*z^2            z^3
 *
 *    or, switching back to r (z=r*beta):
 *
 *       2*beta*exp(-r^2*beta^2)   erf(r*beta)
 *       ----------------------- - -----------
 *            sqrt(Pi)*r^2             r^3
 *
 *    With a bit of math exercise you should be able to confirm that
 *    this is exactly D[Erf[beta*r]/r,r] divided by r another time.
 *
 * 6. Add the result to 1/r^3, multiply by the product of the charges,
 *    and you have your force (divided by r). A final multiplication
 *    with the vector connecting the two particles and you have your
 *    vectorial force to add to the particles.
 */
static __m128
gmx_mm_pmecorrF_ps(__m128 z2)
{
    const __m128  FN6      = _mm_set1_ps(-1.7357322914161492954e-8f);
    const __m128  FN5      = _mm_set1_ps(1.4703624142580877519e-6f);
    const __m128  FN4      = _mm_set1_ps(-0.000053401640219807709149f);
    const __m128  FN3      = _mm_set1_ps(0.0010054721316683106153f);
    const __m128  FN2      = _mm_set1_ps(-0.019278317264888380590f);
    const __m128  FN1      = _mm_set1_ps(0.069670166153766424023f);
    const __m128  FN0      = _mm_set1_ps(-0.75225204789749321333f);

    const __m128  FD4      = _mm_set1_ps(0.0011193462567257629232f);
    const __m128  FD3      = _mm_set1_ps(0.014866955030185295499f);
    const __m128  FD2      = _mm_set1_ps(0.11583842382862377919f);
    const __m128  FD1      = _mm_set1_ps(0.50736591960530292870f);
    const __m128  FD0      = _mm_set1_ps(1.0f);

    __m128 z4;
    __m128 polyFN0,polyFN1,polyFD0,polyFD1;

    z4       = _mm_mul_ps(z2,z2);

    polyFD0  = _mm_macc_ps(FD4,z4,FD2);
    polyFD1  = _mm_macc_ps(FD3,z4,FD1);
    polyFD0  = _mm_macc_ps(polyFD0,z4,FD0);
    polyFD0  = _mm_macc_ps(polyFD1,z2,polyFD0);

    polyFD0  = gmx_mm_inv_ps(polyFD0);

    polyFN0  = _mm_macc_ps(FN6,z4,FN4);
    polyFN1  = _mm_macc_ps(FN5,z4,FN3);
    polyFN0  = _mm_macc_ps(polyFN0,z4,FN2);
    polyFN1  = _mm_macc_ps(polyFN1,z4,FN1);
    polyFN0  = _mm_macc_ps(polyFN0,z4,FN0);
    polyFN0  = _mm_macc_ps(polyFN1,z2,polyFN0);

    return _mm_mul_ps(polyFN0,polyFD0);
}
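
/* Illustrative usage sketch, not part of the original header, following the six
 * steps in the comment documenting gmx_mm_pmecorrF_ps() above: the scalar force
 * F/r for the full direct-space PME interaction. Parameter names are hypothetical.
 */
static gmx_inline __m128
gmx_mm_example_pme_force_sketch(__m128 rsq, __m128 beta, __m128 qq)
{
    __m128 rinv,rinvsq,z2,beta3,fscal;

    rinv   = gmx_mm_invsqrt_ps(rsq);                   /* 1/r                */
    rinvsq = _mm_mul_ps(rinv,rinv);                    /* 1/r^2              */
    z2     = _mm_mul_ps(_mm_mul_ps(beta,beta),rsq);    /* steps 1-2: z^2     */
    beta3  = _mm_mul_ps(beta,_mm_mul_ps(beta,beta));   /* beta^3 for step 5  */

    /* steps 3-6: F/r = qq * ( 1/r^3 + beta^3*pmecorrF(z^2) ) */
    fscal  = _mm_macc_ps(beta3,gmx_mm_pmecorrF_ps(z2),_mm_mul_ps(rinv,rinvsq));

    return _mm_mul_ps(qq,fscal);
}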

/* Calculate the potential correction due to PME analytically.
 *
 * See gmx_mm_pmecorrF_ps() for details about the approximation.
 *
 * This routine calculates Erf(z)/z, although you should provide z^2
 * as the input argument.
 *
 * Here's how it should be used:
 *
 * 1. Calculate r^2.
 * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
 * 3. Evaluate this routine with z^2 as the argument.
 * 4. The return value is the expression:
 *
 *        erf(z)
 *       --------
 *           z
 *
 * 5. Multiply the entire expression by beta and switch back to r (z=r*beta):
 *
 *       erf(r*beta)
 *       -----------
 *            r
 *
 * 6. Subtract the result from 1/r, multiply by the product of the charges,
 *    and you have your potential.
 */
static __m128
gmx_mm_pmecorrV_ps(__m128 z2)
{
    const __m128  VN6      = _mm_set1_ps(1.9296833005951166339e-8f);
    const __m128  VN5      = _mm_set1_ps(-1.4213390571557850962e-6f);
    const __m128  VN4      = _mm_set1_ps(0.000041603292906656984871f);
    const __m128  VN3      = _mm_set1_ps(-0.00013134036773265025626f);
    const __m128  VN2      = _mm_set1_ps(0.038657983986041781264f);
    const __m128  VN1      = _mm_set1_ps(0.11285044772717598220f);
    const __m128  VN0      = _mm_set1_ps(1.1283802385263030286f);

    const __m128  VD3      = _mm_set1_ps(0.0066752224023576045451f);
    const __m128  VD2      = _mm_set1_ps(0.078647795836373922256f);
    const __m128  VD1      = _mm_set1_ps(0.43336185284710920150f);
    const __m128  VD0      = _mm_set1_ps(1.0f);

    __m128 z4;
    __m128 polyVN0,polyVN1,polyVD0,polyVD1;

    z4       = _mm_mul_ps(z2,z2);

    polyVD1  = _mm_macc_ps(VD3,z4,VD1);
    polyVD0  = _mm_macc_ps(VD2,z4,VD0);
    polyVD0  = _mm_macc_ps(polyVD1,z2,polyVD0);

    polyVD0  = gmx_mm_inv_ps(polyVD0);

    polyVN0  = _mm_macc_ps(VN6,z4,VN4);
    polyVN1  = _mm_macc_ps(VN5,z4,VN3);
    polyVN0  = _mm_macc_ps(polyVN0,z4,VN2);
    polyVN1  = _mm_macc_ps(polyVN1,z4,VN1);
    polyVN0  = _mm_macc_ps(polyVN0,z4,VN0);
    polyVN0  = _mm_macc_ps(polyVN1,z2,polyVN0);

    return _mm_mul_ps(polyVN0,polyVD0);
}
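
/* Illustrative usage sketch, not part of the original header, following the steps
 * in the comment documenting gmx_mm_pmecorrV_ps() above: the direct-space PME
 * potential qq*erfc(beta*r)/r obtained as qq*( 1/r - beta*pmecorrV(z^2) ).
 * Parameter names are hypothetical.
 */
static gmx_inline __m128
gmx_mm_example_pme_pot_sketch(__m128 rsq, __m128 beta, __m128 qq)
{
    __m128 rinv,z2;

    rinv = gmx_mm_invsqrt_ps(rsq);                     /* 1/r            */
    z2   = _mm_mul_ps(_mm_mul_ps(beta,beta),rsq);      /* steps 1-2: z^2 */

    /* steps 3-6: V = qq * ( 1/r - beta*pmecorrV(z^2) ) */
    return _mm_mul_ps(qq,_mm_nmacc_ps(beta,gmx_mm_pmecorrV_ps(z2),rinv));
}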

static int
gmx_mm_sincos_ps(__m128 x,
                 __m128 *sinval,
                 __m128 *cosval)
{
    const __m128 two_over_pi = _mm_set1_ps(2.0/M_PI);
    const __m128 half        = _mm_set1_ps(0.5);
    const __m128 one         = _mm_set1_ps(1.0);

    const __m128i izero      = _mm_set1_epi32(0);
    const __m128i ione       = _mm_set1_epi32(1);
    const __m128i itwo       = _mm_set1_epi32(2);
    const __m128i ithree     = _mm_set1_epi32(3);
    const __m128  signbit    = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );

    const __m128 CA1         = _mm_set1_ps(1.5703125f);
    const __m128 CA2         = _mm_set1_ps(4.837512969970703125e-4f);
    const __m128 CA3         = _mm_set1_ps(7.54978995489188216e-8f);

    const __m128 CC0         = _mm_set1_ps(-0.0013602249f);
    const __m128 CC1         = _mm_set1_ps(0.0416566950f);
    const __m128 CC2         = _mm_set1_ps(-0.4999990225f);
    const __m128 CS0         = _mm_set1_ps(-0.0001950727f);
    const __m128 CS1         = _mm_set1_ps(0.0083320758f);
    const __m128 CS2         = _mm_set1_ps(-0.1666665247f);

    __m128  y,y2;
    __m128  z;
    __m128i iz;
    __m128i offset_sin,offset_cos;
    __m128  tmp1,tmp2;
    __m128  mask_sin,mask_cos;
    __m128  tmp_sin,tmp_cos;

    y          = _mm_mul_ps(x,two_over_pi);
    y          = _mm_add_ps(y,_mm_or_ps(_mm_and_ps(y,signbit),half));

    iz         = _mm_cvttps_epi32(y);
    z          = _mm_round_ps(y,_MM_FROUND_TO_ZERO);

    offset_sin = _mm_and_si128(iz,ithree);
    offset_cos = _mm_add_epi32(iz,ione);

    /* Extended-precision arithmetic to achieve full precision */
    y          = _mm_nmacc_ps(z,CA1,x);
    y          = _mm_nmacc_ps(z,CA2,y);
    y          = _mm_nmacc_ps(z,CA3,y);

    y2         = _mm_mul_ps(y,y);

    tmp1       = _mm_macc_ps(CC0,y2,CC1);
    tmp2       = _mm_macc_ps(CS0,y2,CS1);
    tmp1       = _mm_macc_ps(tmp1,y2,CC2);
    tmp2       = _mm_macc_ps(tmp2,y2,CS2);

    tmp1       = _mm_macc_ps(tmp1,y2,one);

    tmp2       = _mm_macc_ps(tmp2,_mm_mul_ps(y,y2),y);

    mask_sin   = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin,ione), izero));
    mask_cos   = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos,ione), izero));

    tmp_sin    = _mm_blendv_ps(tmp1,tmp2,mask_sin);
    tmp_cos    = _mm_blendv_ps(tmp1,tmp2,mask_cos);

    mask_sin   = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin,itwo), izero));
    mask_cos   = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos,itwo), izero));

    tmp1       = _mm_xor_ps(signbit,tmp_sin);
    tmp2       = _mm_xor_ps(signbit,tmp_cos);

    *sinval    = _mm_blendv_ps(tmp1,tmp_sin,mask_sin);
    *cosval    = _mm_blendv_ps(tmp2,tmp_cos,mask_cos);

    return 0;
}

/*
 * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
 * will then call the sincos() routine and waste a factor 2 in performance!
 */
static __m128
gmx_mm_sin_ps(__m128 x)
{
    __m128 s,c;
    gmx_mm_sincos_ps(x,&s,&c);
    return s;
}

/*
 * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
 * will then call the sincos() routine and waste a factor 2 in performance!
 */
static __m128
gmx_mm_cos_ps(__m128 x)
{
    __m128 s,c;
    gmx_mm_sincos_ps(x,&s,&c);
    return c;
}
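
/* Illustrative usage sketch, not part of the original header: when both the sine
 * and the cosine of the same argument are needed (e.g. a 2D rotation), call
 * gmx_mm_sincos_ps() once instead of gmx_mm_sin_ps() plus gmx_mm_cos_ps(), as the
 * comments above advise. The function and variable names are hypothetical.
 */
static gmx_inline void
gmx_mm_example_rotate2d_sketch(__m128 angle, __m128 *px, __m128 *py)
{
    __m128 s,c,xnew,ynew;

    gmx_mm_sincos_ps(angle,&s,&c);                    /* both results from one call */

    xnew = _mm_nmacc_ps(s,*py,_mm_mul_ps(c,*px));     /* x' = cos(a)*x - sin(a)*y   */
    ynew = _mm_macc_ps(s,*px,_mm_mul_ps(c,*py));      /* y' = sin(a)*x + cos(a)*y   */

    *px  = xnew;
    *py  = ynew;
}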

static __m128
gmx_mm_tan_ps(__m128 x)
{
    __m128 sinval,cosval;
    __m128 tanval;

    gmx_mm_sincos_ps(x,&sinval,&cosval);

    tanval = _mm_mul_ps(sinval,gmx_mm_inv_ps(cosval));

    return tanval;
}

static __m128
gmx_mm_asin_ps(__m128 x)
{
    /* Same algorithm as cephes library */
    const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
    const __m128 limitlow = _mm_set1_ps(1e-4f);
    const __m128 half     = _mm_set1_ps(0.5f);
    const __m128 one      = _mm_set1_ps(1.0f);
    const __m128 halfpi   = _mm_set1_ps(M_PI/2.0f);

    const __m128 CC5      = _mm_set1_ps(4.2163199048E-2f);
    const __m128 CC4      = _mm_set1_ps(2.4181311049E-2f);
    const __m128 CC3      = _mm_set1_ps(4.5470025998E-2f);
    const __m128 CC2      = _mm_set1_ps(7.4953002686E-2f);
    const __m128 CC1      = _mm_set1_ps(1.6666752422E-1f);

    __m128 sign;
    __m128 mask;
    __m128 xabs;
    __m128 z,z1,z2,q,q1,q2;
    __m128 pA,pB;

    sign  = _mm_andnot_ps(signmask,x);
    xabs  = _mm_and_ps(x,signmask);

    mask  = _mm_cmp_ps(xabs,half,_CMP_GT_OQ);

    z1    = _mm_mul_ps(half, _mm_sub_ps(one,xabs));
    q1    = _mm_mul_ps(z1,gmx_mm_invsqrt_ps(z1));
    q1    = _mm_andnot_ps(_mm_cmp_ps(xabs,one,_CMP_EQ_OQ),q1);

    q2    = xabs;
    z2    = _mm_mul_ps(q2,q2);

    z     = _mm_or_ps( _mm_and_ps(mask,z1) , _mm_andnot_ps(mask,z2) );
    q     = _mm_or_ps( _mm_and_ps(mask,q1) , _mm_andnot_ps(mask,q2) );

    z2    = _mm_mul_ps(z,z);

    pA    = _mm_macc_ps(CC5,z2,CC3);
    pB    = _mm_macc_ps(CC4,z2,CC2);

    pA    = _mm_macc_ps(pA,z2,CC1);
    pA    = _mm_mul_ps(pA,z);

    z     = _mm_macc_ps(pB,z2,pA);

    z     = _mm_macc_ps(z,q,q);

    q2    = _mm_sub_ps(halfpi,z);
    q2    = _mm_sub_ps(q2,z);

    z     = _mm_or_ps( _mm_and_ps(mask,q2) , _mm_andnot_ps(mask,z) );

    mask  = _mm_cmp_ps(xabs,limitlow,_CMP_GT_OQ);
    z     = _mm_or_ps( _mm_and_ps(mask,z) , _mm_andnot_ps(mask,xabs) );

    z     = _mm_xor_ps(z,sign);

    return z;
}

static __m128
gmx_mm_acos_ps(__m128 x)
{
    const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
    const __m128 one_ps    = _mm_set1_ps(1.0f);
    const __m128 half_ps   = _mm_set1_ps(0.5f);
    const __m128 pi_ps     = _mm_set1_ps(M_PI);
    const __m128 halfpi_ps = _mm_set1_ps(M_PI/2.0f);

    __m128 mask1;
    __m128 mask2;
    __m128 xabs;
    __m128 z,z1,z2,z3;

    xabs  = _mm_and_ps(x,signmask);
    mask1 = _mm_cmp_ps(xabs,half_ps,_CMP_GT_OQ);
    mask2 = _mm_cmp_ps(x,_mm_setzero_ps(),_CMP_GT_OQ);

    z     = _mm_mul_ps(half_ps,_mm_sub_ps(one_ps,xabs));
    z     = _mm_mul_ps(z,gmx_mm_invsqrt_ps(z));
    z     = _mm_andnot_ps(_mm_cmp_ps(xabs,one_ps,_CMP_EQ_OQ),z);

    z     = _mm_blendv_ps(x,z,mask1);
    z     = gmx_mm_asin_ps(z);

    z2    = _mm_add_ps(z,z);
    z1    = _mm_sub_ps(pi_ps,z2);
    z3    = _mm_sub_ps(halfpi_ps,z);

    z     = _mm_blendv_ps(z1,z2,mask2);
    z     = _mm_blendv_ps(z3,z,mask1);

    return z;
}

static __m128
gmx_mm_atan_ps(__m128 x)
{
    /* Same algorithm as cephes library */
    const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
    const __m128 limit1    = _mm_set1_ps(0.414213562373095f);
    const __m128 limit2    = _mm_set1_ps(2.414213562373095f);
    const __m128 quarterpi = _mm_set1_ps(0.785398163397448f);
    const __m128 halfpi    = _mm_set1_ps(1.570796326794896f);
    const __m128 mone      = _mm_set1_ps(-1.0f);
    const __m128 CC3       = _mm_set1_ps(-3.33329491539E-1f);
    const __m128 CC5       = _mm_set1_ps(1.99777106478E-1f);
    const __m128 CC7       = _mm_set1_ps(-1.38776856032E-1);
    const __m128 CC9       = _mm_set1_ps(8.05374449538e-2f);

    __m128 sign;
    __m128 mask1,mask2;
    __m128 y,z1,z2;
    __m128 x2,x4;
    __m128 sum1,sum2;

    sign  = _mm_andnot_ps(signmask,x);
    x     = _mm_and_ps(x,signmask);

    mask1 = _mm_cmp_ps(x,limit1,_CMP_GT_OQ);
    mask2 = _mm_cmp_ps(x,limit2,_CMP_GT_OQ);

    z1    = _mm_mul_ps(_mm_add_ps(x,mone),gmx_mm_inv_ps(_mm_sub_ps(x,mone)));
    z2    = _mm_mul_ps(mone,gmx_mm_inv_ps(x));

    y     = _mm_and_ps(mask1,quarterpi);
    y     = _mm_blendv_ps(y,halfpi,mask2);

    x     = _mm_blendv_ps(x,z1,mask1);
    x     = _mm_blendv_ps(x,z2,mask2);

    x2    = _mm_mul_ps(x,x);
    x4    = _mm_mul_ps(x2,x2);

    sum1  = _mm_macc_ps(CC9,x4,CC5);
    sum2  = _mm_macc_ps(CC7,x4,CC3);
    sum1  = _mm_mul_ps(sum1,x4);
    sum1  = _mm_macc_ps(sum2,x2,sum1);

    sum1  = _mm_sub_ps(sum1,mone);
    y     = _mm_macc_ps(sum1,x,y);

    y     = _mm_xor_ps(y,sign);

    return y;
}

static __m128
gmx_mm_atan2_ps(__m128 y, __m128 x)
{
    const __m128 pi          = _mm_set1_ps(M_PI);
    const __m128 minuspi     = _mm_set1_ps(-M_PI);
    const __m128 halfpi      = _mm_set1_ps(M_PI/2.0);
    const __m128 minushalfpi = _mm_set1_ps(-M_PI/2.0);

    __m128 z,z1,z3,z4;
    __m128 w;
    __m128 maskx_lt,maskx_eq;
    __m128 masky_lt,masky_eq;
    __m128 mask1,mask2,mask3,mask4,maskall;

    maskx_lt = _mm_cmp_ps(x,_mm_setzero_ps(),_CMP_LT_OQ);
    masky_lt = _mm_cmp_ps(y,_mm_setzero_ps(),_CMP_LT_OQ);
    maskx_eq = _mm_cmp_ps(x,_mm_setzero_ps(),_CMP_EQ_OQ);
    masky_eq = _mm_cmp_ps(y,_mm_setzero_ps(),_CMP_EQ_OQ);

    z        = _mm_mul_ps(y,gmx_mm_inv_ps(x));
    z        = gmx_mm_atan_ps(z);

    mask1    = _mm_and_ps(maskx_eq,masky_lt);
    mask2    = _mm_andnot_ps(maskx_lt,masky_eq);
    mask3    = _mm_andnot_ps( _mm_or_ps(masky_lt,masky_eq) , maskx_eq);
    mask4    = _mm_and_ps(masky_eq,maskx_lt);

    maskall  = _mm_or_ps( _mm_or_ps(mask1,mask2), _mm_or_ps(mask3,mask4) );

    z        = _mm_andnot_ps(maskall,z);
    z1       = _mm_and_ps(mask1,minushalfpi);
    z3       = _mm_and_ps(mask3,halfpi);
    z4       = _mm_and_ps(mask4,pi);

    z        = _mm_or_ps( _mm_or_ps(z,z1), _mm_or_ps(z3,z4) );

    mask1    = _mm_andnot_ps(masky_lt,maskx_lt);
    mask2    = _mm_and_ps(maskx_lt,masky_lt);

    w        = _mm_or_ps( _mm_and_ps(mask1,pi), _mm_and_ps(mask2,minuspi) );
    w        = _mm_andnot_ps(maskall,w);

    z        = _mm_add_ps(z,w);

    return z;
}


#endif /* _gmx_math_x86_avx_128_fma_single_h_ */