[gromacs.git] / include / gmx_sse2_double.h
1 /*
2 * This source code is part of
4 * G R O M A C S
6 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
7 * Copyright (c) 2001-2009, The GROMACS Development Team
9 * Gromacs is a library for molecular simulation and trajectory analysis,
10 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
11 * a full list of developers and information, check out http://www.gromacs.org
13 * This program is free software; you can redistribute it and/or modify it under
14 * the terms of the GNU Lesser General Public License as published by the Free
15 * Software Foundation; either version 2 of the License, or (at your option) any
16 * later version.
17 * As a special exception, you may use this file as part of a free software
18 * library without restriction. Specifically, if other files instantiate
19 * templates or use macros or inline functions from this file, or you compile
20 * this file and link it with other files to produce an executable, this
21 * file does not by itself cause the resulting executable to be covered by
22 * the GNU Lesser General Public License.
24 * In plain-speak: do not worry about classes/macros/templates either - only
25 * changes to the library have to be LGPL, not an application linking with it.
27 * To help fund GROMACS development, we humbly ask that you cite
28 * the papers people have written on it - you can find them on the website!
29 */
30 #ifdef HAVE_CONFIG_H
31 #include <config.h>
32 #endif
34 /* We require SSE2 now! */
36 #include <math.h>
37 #include <xmmintrin.h> /* SSE */
38 #include <emmintrin.h> /* SSE2 */
39 #ifdef GMX_SSE3
40 # include <pmmintrin.h> /* SSE3 */
41 #endif
42 #ifdef GMX_SSE4
43 # include <smmintrin.h> /* SSE4.1 */
44 #endif
47 #include <stdio.h>
49 /***************************************************
50 * *
51 * COMPILER RANT WARNING: *
52 * *
53 * Ideally, this header would be filled with *
54 * simple static inline functions. Unfortunately, *
55 * many vendors provide really braindead compilers *
56 * that either cannot handle more than 1-2 SSE *
57 * function parameters, and some cannot handle *
58 * pointers to SSE __m128 datatypes as parameters *
59 * at all. Thus, for portability we have had to *
60 * implement all but the simplest routines as *
61 * macros instead... *
62 * *
63 ***************************************************/
66 /***************************************************
67 * *
68 * Wrappers/replacements for some instructions *
69 * not available in all SSE versions. *
70 * *
71 ***************************************************/
73 #ifdef GMX_SSE4
74 # define gmx_mm_extract_epi32(x, imm) _mm_extract_epi32(x,imm)
75 #else
76 # define gmx_mm_extract_epi32(x, imm) _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))
77 #endif
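/* Illustrative usage sketch (not part of the original header; the function
 * name gmx_example_extract_epi32 is made up). The SSE2 fallback above shifts
 * the register right by 4*imm bytes with _mm_srli_si128, so element 'imm'
 * ends up in the lowest 32 bits, which _mm_cvtsi128_si32 then returns.
 */
static void
gmx_example_extract_epi32(void)
{
    __m128i v = _mm_set_epi32(40, 30, 20, 10);               /* elements 3,2,1,0 */
    printf("element 2 = %d\n", gmx_mm_extract_epi32(v, 2));  /* prints 30 */
}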
79 /*
80 * Some compilers require a cast to change the interpretation
81 * of a register from FP to Int and vice versa, and not all of
82 * them provide instructions to do this. Roll our own wrappers...
83 */
85 #if (defined (_MSC_VER) || defined(__INTEL_COMPILER))
86 # define gmx_mm_castsi128_pd(a) _mm_castsi128_pd(a)
87 # define gmx_mm_castpd_si128(a) _mm_castpd_si128(a)
88 # define gmx_mm_castpd_pd128(a) (a)
89 #elif defined(__GNUC__)
90 # define gmx_mm_castsi128_pd(a) ((__m128d)(a))
91 # define gmx_mm_castpd_si128(a) ((__m128i)(a))
92 # define gmx_mm_castpd_pd128(a) ((__m128d)(a))
93 #else
94 static __m128d gmx_mm_castsi128_pd(__m128i a) { return *(__m128d *) &a; }
95 static __m128i gmx_mm_castpd_si128(__m128d a) { return *(__m128i *) &a; }
96 static __m128d gmx_mm_castpd_pd128(__m128d a) { return *(__m128d *) &a; }
97 #endif
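/* Illustrative sketch (not part of the original header; gmx_example_cast is
 * a made-up name). The cast wrappers reinterpret register bits without any
 * conversion; here the integer bit pattern of 1.0 (0x3FF0000000000000) is
 * viewed as a double. gmx_mm_exp_pd() below relies on exactly this trick to
 * turn a shifted exponent field into a floating-point value.
 */
static void
gmx_example_cast(void)
{
    double  d[2];
    __m128i bits = _mm_set_epi32(0x3FF00000, 0, 0x3FF00000, 0); /* 1.0, twice */
    _mm_storeu_pd(d, gmx_mm_castsi128_pd(bits));
    printf("reinterpreted: %g %g\n", d[0], d[1]);               /* prints 1 1 */
}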
99 #define gmx_mm_extract_epi64(x, imm) _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))
103 static inline void
104 gmx_printxmm_pd(const char *s,__m128d xmm)
106 double f[2];
108 _mm_storeu_pd(f,xmm);
109 printf("%s: %15.10g %15.10g\n",s,f[0],f[1]);
113 static inline __m128d
114 gmx_mm_inv_pd(__m128d x)
116 const __m128d two = _mm_set1_pd(2.0);
118 /* Lookup instruction only exists in single precision, convert back and forth... */
119 __m128d lu = _mm_cvtps_pd(_mm_rcp_ps( _mm_cvtpd_ps(x)));
121 /* Perform two N-R steps for double precision */
122 lu = _mm_mul_pd(lu,_mm_sub_pd(two,_mm_mul_pd(x,lu)));
123 return _mm_mul_pd(lu,_mm_sub_pd(two,_mm_mul_pd(x,lu)));
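/* Scalar sketch (illustration only; gmx_example_inv_scalar is a made-up name)
 * of the Newton-Raphson refinement used above: starting from a low-precision
 * estimate y ~= 1/x, each step y' = y*(2 - x*y) roughly doubles the number of
 * correct bits, so two steps take the ~12-bit rcpps estimate to roughly 48
 * bits, i.e. close to full double precision.
 */
static double
gmx_example_inv_scalar(double x)
{
    double y = (double)(1.0f/(float)x);  /* stand-in for the rcpps lookup */
    y = y*(2.0 - x*y);                   /* first N-R step  */
    y = y*(2.0 - x*y);                   /* second N-R step */
    return y;
}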
128 static inline __m128d
129 gmx_mm_invsqrt_pd(__m128d x)
131 const __m128d half = _mm_set1_pd(0.5);
132 const __m128d three = _mm_set1_pd(3.0);
134 /* Lookup instruction only exists in single precision, convert back and forth... */
135 __m128d lu = _mm_cvtps_pd(_mm_rsqrt_ps( _mm_cvtpd_ps(x)));
137 lu = _mm_mul_pd(half,_mm_mul_pd(_mm_sub_pd(three,_mm_mul_pd(_mm_mul_pd(lu,lu),x)),lu));
138 return _mm_mul_pd(half,_mm_mul_pd(_mm_sub_pd(three,_mm_mul_pd(_mm_mul_pd(lu,lu),x)),lu));
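/* Scalar sketch (illustration only; gmx_example_invsqrt_scalar is a made-up
 * name): the Newton-Raphson step used above for 1/sqrt(x) is
 * y' = 0.5*y*(3 - x*y*y); two steps applied to the ~12-bit rsqrtps estimate
 * give roughly 48 bits of accuracy.
 */
static double
gmx_example_invsqrt_scalar(double x)
{
    double y = (double)(1.0f/sqrtf((float)x));  /* stand-in for the rsqrtps lookup */
    y = 0.5*y*(3.0 - x*y*y);
    y = 0.5*y*(3.0 - x*y*y);
    return y;
}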
141 static inline __m128d
142 gmx_mm_sqrt_pd(__m128d x)
144 __m128d mask;
145 __m128d res;
147 mask = _mm_cmpeq_pd(x,_mm_setzero_pd());
148 res = _mm_andnot_pd(mask,gmx_mm_invsqrt_pd(x));
150 res = _mm_mul_pd(x,res);
152 return res;
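/* Usage sketch (illustration only; gmx_example_sqrt_pd is a made-up name):
 * gmx_mm_sqrt_pd() computes x*invsqrt(x), and the cmpeq/andnot mask above
 * forces the result to 0.0 for x==0.0, where invsqrt alone would return +inf
 * and the product would be NaN.
 */
static void
gmx_example_sqrt_pd(void)
{
    double  r[2];
    __m128d x = _mm_set_pd(4.0, 0.0);    /* element 1 = 4.0, element 0 = 0.0 */
    _mm_storeu_pd(r, gmx_mm_sqrt_pd(x));
    printf("sqrt: %g %g\n", r[0], r[1]); /* prints 0 2 */
}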
156 /*
157 * Exponential function.
159 * Exp(x) is calculated from the relation Exp(x)=2^(y), where y=log2(e)*x
160 * Thus, the contents of this routine are mostly about calculating 2^y.
162 * This is done by separating y=z+w, where z=[y] is an integer. For technical reasons it is easiest
163 * for us to round to the _nearest_ integer and have w in [-0.5,0.5] rather than always rounding down.
164 * (It was not until SSE4 that there was an efficient operation to do rounding towards -infinity).
166 * With this we get 2^y=2^z*2^w
168 * Since we have IEEE fp representation, we can easily calculate 2^z by adding the FP exponent bias
169 * (1023 in double), and shifting the integer to the exponent field of the FP number (52 bits up).
171 * The 2^w term is calculated from a (10,0)-th order (no denominator) minimax polynomial on the interval
172 * [-0.5,0.5]. The coefficients were derived in Mathematica using the command:
174 * MiniMaxApproximation[(2^x), {x, {-0.5, 0.5}, 10, 0}, WorkingPrecision -> 20]
176 * The lowest exponent we can represent in IEEE double-precision binary format is 2^-1022; below that
177 * it will wrap around and lead to very large positive numbers. This corresponds to a lower bound
178 * on the argument for exp(x) of roughly -708.39. For smaller arguments the return value will be 0.0.
180 * There appears to be a slight loss of precision for large arguments (~250), where the largest relative
181 * error reaches ~2e-14. However, since the actual value for that argument is around 1E100, it might
182 * not matter for typical workloads. This is likely caused by the polynomial evaluation,
183 * and the only way around would then be a table-based version, which I haven't managed to get the
184 * same performance from.
186 * The _average_ accuracy is about 51 bits in the range [-20,20], and the worst roughly 1 bit worse.
187 */
188 static __m128d
189 gmx_mm_exp_pd(__m128d x)
191 const __m128d argscale = _mm_set1_pd(1.442695040888963387);
192 /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -1022. */
193 const __m128d arglimit = _mm_set1_pd(-1022.0/1.442695040888963387);
194 const __m128i expbase = _mm_set1_epi32(1023);
196 const __m128d CA0 = _mm_set1_pd(7.0372789822689374920e-9);
197 const __m128d CB0 = _mm_set1_pd(89.491964762085371);
198 const __m128d CB1 = _mm_set1_pd(-9.7373870675164587);
199 const __m128d CC0 = _mm_set1_pd(51.247261867992408);
200 const __m128d CC1 = _mm_set1_pd(-0.184020268133945);
201 const __m128d CD0 = _mm_set1_pd(36.82070153762337);
202 const __m128d CD1 = _mm_set1_pd(5.416849282638991);
203 const __m128d CE0 = _mm_set1_pd(30.34003452248759);
204 const __m128d CE1 = _mm_set1_pd(8.726173289493301);
205 const __m128d CF0 = _mm_set1_pd(27.73526969472330);
206 const __m128d CF1 = _mm_set1_pd(10.284755658866532);
208 __m128d valuemask;
209 __m128i iexppart;
210 __m128d fexppart;
211 __m128d intpart;
212 __m128d z,z2;
213 __m128d factB,factC,factD,factE,factF;
215 z = _mm_mul_pd(x,argscale);
216 iexppart = _mm_cvtpd_epi32(z);
218 #ifdef GMX_SSE4
219 /* This reduces latency and speeds up the code by roughly 5% when supported */
220 intpart = _mm_round_pd(z,0);
221 #else
222 intpart = _mm_cvtepi32_pd(iexppart);
223 #endif
224 /* The two lowest elements of iexppart now contain 32-bit numbers with a correctly biased exponent.
225 * To be able to shift them into the exponent for a double-precision number we first need to
226 * shuffle so that the lower half contains the first element, and the upper half the second.
227 * This should really be done as a zero-extension, but since the next instructions will shift
228 * the registers left by 52 bits it doesn't matter what we put there - it will be shifted out.
229 * (thus we just use element 2 from iexppart).
230 */
231 iexppart = _mm_shuffle_epi32(iexppart,_MM_SHUFFLE(2,1,2,0));
233 /* Do the shift operation on the 64-bit registers */
234 iexppart = _mm_add_epi32(iexppart,expbase);
235 iexppart = _mm_slli_epi64(iexppart,52);
236 valuemask = _mm_cmpgt_pd(x,arglimit);
238 z = _mm_sub_pd(z,intpart);
239 z2 = _mm_mul_pd(z,z);
241 fexppart = _mm_and_pd(valuemask,gmx_mm_castsi128_pd(iexppart));
243 /* Since SSE floating-point has relatively high latency it is faster to do
244 * factorized polynomial summation with independent terms than using alternating add/multiply, i.e.
245 * p(z) = A0 * (B0 + B1*z + z^2) * (C0 + C1*z + z^2) * (D0 + D1*z + z^2) * (E0 + E1*z + z^2) * (F0 + F1*z + z^2)
246 */
248 factB = _mm_add_pd(CB0,_mm_mul_pd(CB1,z) );
249 factB = _mm_add_pd(factB,z2);
250 factC = _mm_add_pd(CC0,_mm_mul_pd(CC1,z) );
251 factC = _mm_add_pd(factC,z2);
252 factD = _mm_add_pd(CD0,_mm_mul_pd(CD1,z) );
253 factD = _mm_add_pd(factD,z2);
254 factE = _mm_add_pd(CE0,_mm_mul_pd(CE1,z) );
255 factE = _mm_add_pd(factE,z2);
256 factF = _mm_add_pd(CF0,_mm_mul_pd(CF1,z) );
257 factF = _mm_add_pd(factF,z2);
259 z = _mm_mul_pd(CA0,fexppart);
260 factB = _mm_mul_pd(factB,factC);
261 factD = _mm_mul_pd(factD,factE);
262 z = _mm_mul_pd(z,factF);
263 factB = _mm_mul_pd(factB,factD);
264 z = _mm_mul_pd(z,factB);
266 /* Currently uses 32 actual (real, not including casts) SSE instructions */
267 return z;
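/* Scalar sketch (illustration only; gmx_example_exp_scalar is a made-up name)
 * of the decomposition used above: exp(x) = 2^y with y = x*log2(e), split as
 * y = z + w with z = round(y), 2^z built by writing the biased exponent
 * z+1023 directly into bits 52..62 of an IEEE double, and 2^w taken from the
 * degree-10 minimax polynomial on [-0.5,0.5]. For brevity this sketch calls
 * exp2(w) instead of the polynomial and omits the underflow clamping done
 * with arglimit/valuemask in the SIMD code.
 */
static double
gmx_example_exp_scalar(double x)
{
    double    y = x*1.442695040888963387;                   /* y = x*log2(e)    */
    long long z = (long long)(y >= 0 ? y + 0.5 : y - 0.5);  /* round to nearest */
    double    w = y - (double)z;                            /* w in [-0.5,0.5]  */
    union { double d; unsigned long long u; } twoz;
    twoz.u = (unsigned long long)(z + 1023) << 52;          /* 2^z from exponent bits */
    return twoz.d * exp2(w);                                /* 2^z * 2^w        */
}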
270 static __m128d
271 gmx_mm_log_pd(__m128d x)
273 /* Same algorithm as cephes library */
274 const __m128d expmask = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000) );
276 const __m128i expbase_m1 = _mm_set1_epi32(1023-1); /* We want non-IEEE format */
277 const __m128d half = _mm_set1_pd(0.5);
278 const __m128d one = _mm_set1_pd(1.0);
279 const __m128i itwo = _mm_set1_epi32(2);
280 const __m128d invsq2 = _mm_set1_pd(1.0/sqrt(2.0));
282 const __m128d corr1 = _mm_set1_pd(-2.121944400546905827679e-4);
283 const __m128d corr2 = _mm_set1_pd(0.693359375);
285 const __m128d P5 = _mm_set1_pd(1.01875663804580931796E-4);
286 const __m128d P4 = _mm_set1_pd(4.97494994976747001425E-1);
287 const __m128d P3 = _mm_set1_pd(4.70579119878881725854E0);
288 const __m128d P2 = _mm_set1_pd(1.44989225341610930846E1);
289 const __m128d P1 = _mm_set1_pd(1.79368678507819816313E1);
290 const __m128d P0 = _mm_set1_pd(7.70838733755885391666E0);
292 const __m128d Q4 = _mm_set1_pd(1.12873587189167450590E1);
293 const __m128d Q3 = _mm_set1_pd(4.52279145837532221105E1);
294 const __m128d Q2 = _mm_set1_pd(8.29875266912776603211E1);
295 const __m128d Q1 = _mm_set1_pd(7.11544750618563894466E1);
296 const __m128d Q0 = _mm_set1_pd(2.31251620126765340583E1);
298 const __m128d R2 = _mm_set1_pd(-7.89580278884799154124E-1);
299 const __m128d R1 = _mm_set1_pd(1.63866645699558079767E1);
300 const __m128d R0 = _mm_set1_pd(-6.41409952958715622951E1);
302 const __m128d S2 = _mm_set1_pd(-3.56722798256324312549E1);
303 const __m128d S1 = _mm_set1_pd(3.12093766372244180303E2);
304 const __m128d S0 = _mm_set1_pd(-7.69691943550460008604E2);
306 __m128d fexp;
307 __m128i iexp,iexp1,signbit,iexpabs;
308 __m128i imask1;
309 __m128d mask1,mask2;
310 __m128d corr,t1,t2,q;
311 __m128d zA,yA,xA,zB,yB,xB,z;
312 __m128d polyR,polyS;
313 __m128d polyP1,polyP2,polyQ1,polyQ2;
315 /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
316 fexp = _mm_and_pd(x,expmask);
317 iexp = gmx_mm_castpd_si128(fexp);
318 iexp = _mm_srli_epi64(iexp,52);
319 iexp = _mm_sub_epi32(iexp,expbase_m1);
320 iexp = _mm_shuffle_epi32(iexp, _MM_SHUFFLE(1,1,2,0) );
322 x = _mm_andnot_pd(expmask,x);
323 x = _mm_or_pd(x,one);
324 x = _mm_mul_pd(x,half);
326 signbit = _mm_srai_epi32(iexp,31);
327 iexpabs = _mm_sub_epi32(_mm_xor_si128(iexp,signbit),signbit);
329 imask1 = _mm_cmpgt_epi32( iexpabs, itwo );
330 mask2 = _mm_cmplt_pd(x,invsq2);
332 fexp = _mm_cvtepi32_pd(iexp);
333 corr = _mm_and_pd(mask2,one);
334 fexp = _mm_sub_pd(fexp,corr);
336 /* If mask1 is set ('A') */
337 zA = _mm_sub_pd(x,half);
338 t1 = _mm_or_pd( _mm_andnot_pd(mask2,zA), _mm_and_pd(mask2,x) );
339 zA = _mm_sub_pd(t1,half);
340 t2 = _mm_or_pd( _mm_andnot_pd(mask2,x), _mm_and_pd(mask2,zA) );
341 yA = _mm_mul_pd(half,_mm_add_pd(t2,one));
343 xA = _mm_mul_pd(zA,gmx_mm_inv_pd(yA));
344 zA = _mm_mul_pd(xA,xA);
346 /* EVALUATE POLY */
347 polyR = _mm_mul_pd(R2,zA);
348 polyR = _mm_add_pd(polyR,R1);
349 polyR = _mm_mul_pd(polyR,zA);
350 polyR = _mm_add_pd(polyR,R0);
352 polyS = _mm_add_pd(zA,S2);
353 polyS = _mm_mul_pd(polyS,zA);
354 polyS = _mm_add_pd(polyS,S1);
355 polyS = _mm_mul_pd(polyS,zA);
356 polyS = _mm_add_pd(polyS,S0);
358 q = _mm_mul_pd(polyR,gmx_mm_inv_pd(polyS));
359 zA = _mm_mul_pd(_mm_mul_pd(xA,zA),q);
361 zA = _mm_add_pd(zA,_mm_mul_pd(corr1,fexp));
362 zA = _mm_add_pd(zA,xA);
363 zA = _mm_add_pd(zA,_mm_mul_pd(corr2,fexp));
366 /* If mask1 is not set ('B') */
367 corr = _mm_and_pd(mask2,x);
368 xB = _mm_add_pd(x,corr);
369 xB = _mm_sub_pd(xB,one);
370 zB = _mm_mul_pd(xB,xB);
372 polyP1 = _mm_mul_pd(P5,zB);
373 polyP2 = _mm_mul_pd(P4,zB);
374 polyP1 = _mm_add_pd(polyP1,P3);
375 polyP2 = _mm_add_pd(polyP2,P2);
376 polyP1 = _mm_mul_pd(polyP1,zB);
377 polyP2 = _mm_mul_pd(polyP2,zB);
378 polyP1 = _mm_add_pd(polyP1,P1);
379 polyP2 = _mm_add_pd(polyP2,P0);
380 polyP1 = _mm_mul_pd(polyP1,xB);
381 polyP1 = _mm_add_pd(polyP1,polyP2);
383 polyQ2 = _mm_mul_pd(Q4,zB);
384 polyQ1 = _mm_add_pd(zB,Q3);
385 polyQ2 = _mm_add_pd(polyQ2,Q2);
386 polyQ1 = _mm_mul_pd(polyQ1,zB);
387 polyQ2 = _mm_mul_pd(polyQ2,zB);
388 polyQ1 = _mm_add_pd(polyQ1,Q1);
389 polyQ2 = _mm_add_pd(polyQ2,Q0);
390 polyQ1 = _mm_mul_pd(polyQ1,xB);
391 polyQ1 = _mm_add_pd(polyQ1,polyQ2);
393 q = _mm_mul_pd(polyP1,gmx_mm_inv_pd(polyQ1));
394 yB = _mm_mul_pd(_mm_mul_pd(xB,zB),q);
396 yB = _mm_add_pd(yB,_mm_mul_pd(corr1,fexp));
397 yB = _mm_sub_pd(yB,_mm_mul_pd(half,zB));
398 zB = _mm_add_pd(xB,yB);
399 zB = _mm_add_pd(zB,_mm_mul_pd(corr2,fexp));
402 mask1 = gmx_mm_castsi128_pd( _mm_shuffle_epi32(imask1, _MM_SHUFFLE(1,1,0,0)) );
403 z = _mm_or_pd( _mm_and_pd(mask1,zA), _mm_andnot_pd(mask1,zB) );
405 return z;
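/* Usage sketch (illustration only; gmx_example_log_pd is a made-up name):
 * evaluate gmx_mm_log_pd() on two values at once and compare against libm.
 */
static void
gmx_example_log_pd(void)
{
    double  r[2];
    __m128d x = _mm_set_pd(2.5, 0.1);   /* element 1 = 2.5, element 0 = 0.1 */
    _mm_storeu_pd(r, gmx_mm_log_pd(x));
    printf("log(0.1) = %g (ref %g), log(2.5) = %g (ref %g)\n",
           r[0], log(0.1), r[1], log(2.5));
}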
409 static int
410 gmx_mm_sincos_pd(__m128d x,
411 __m128d *sinval,
412 __m128d *cosval)
414 #ifdef _MSC_VER
415 __declspec(align(16))
416 const double sintable[34] =
418 1.00000000000000000e+00 , 0.00000000000000000e+00 ,
419 9.95184726672196929e-01 , 9.80171403295606036e-02 ,
420 9.80785280403230431e-01 , 1.95090322016128248e-01 ,
421 9.56940335732208824e-01 , 2.90284677254462331e-01 ,
422 9.23879532511286738e-01 , 3.82683432365089782e-01 ,
423 8.81921264348355050e-01 , 4.71396736825997642e-01 ,
424 8.31469612302545236e-01 , 5.55570233019602178e-01 ,
425 7.73010453362736993e-01 , 6.34393284163645488e-01 ,
426 7.07106781186547573e-01 , 7.07106781186547462e-01 ,
427 6.34393284163645599e-01 , 7.73010453362736882e-01 ,
428 5.55570233019602289e-01 , 8.31469612302545125e-01 ,
429 4.71396736825997809e-01 , 8.81921264348354939e-01 ,
430 3.82683432365089837e-01 , 9.23879532511286738e-01 ,
431 2.90284677254462276e-01 , 9.56940335732208935e-01 ,
432 1.95090322016128304e-01 , 9.80785280403230431e-01 ,
433 9.80171403295607702e-02 , 9.95184726672196818e-01 ,
434 6.12323399573676604e-17 , 1.00000000000000000e+00
436 #else
437 const __m128d sintable[17] =
439 _mm_set_pd( sin( 0.0 * (M_PI/2.0) / 16.0) , cos( 0.0 * (M_PI/2.0) / 16.0) ),
440 _mm_set_pd( sin( 1.0 * (M_PI/2.0) / 16.0) , cos( 1.0 * (M_PI/2.0) / 16.0) ),
441 _mm_set_pd( sin( 2.0 * (M_PI/2.0) / 16.0) , cos( 2.0 * (M_PI/2.0) / 16.0) ),
442 _mm_set_pd( sin( 3.0 * (M_PI/2.0) / 16.0) , cos( 3.0 * (M_PI/2.0) / 16.0) ),
443 _mm_set_pd( sin( 4.0 * (M_PI/2.0) / 16.0) , cos( 4.0 * (M_PI/2.0) / 16.0) ),
444 _mm_set_pd( sin( 5.0 * (M_PI/2.0) / 16.0) , cos( 5.0 * (M_PI/2.0) / 16.0) ),
445 _mm_set_pd( sin( 6.0 * (M_PI/2.0) / 16.0) , cos( 6.0 * (M_PI/2.0) / 16.0) ),
446 _mm_set_pd( sin( 7.0 * (M_PI/2.0) / 16.0) , cos( 7.0 * (M_PI/2.0) / 16.0) ),
447 _mm_set_pd( sin( 8.0 * (M_PI/2.0) / 16.0) , cos( 8.0 * (M_PI/2.0) / 16.0) ),
448 _mm_set_pd( sin( 9.0 * (M_PI/2.0) / 16.0) , cos( 9.0 * (M_PI/2.0) / 16.0) ),
449 _mm_set_pd( sin( 10.0 * (M_PI/2.0) / 16.0) , cos( 10.0 * (M_PI/2.0) / 16.0) ),
450 _mm_set_pd( sin( 11.0 * (M_PI/2.0) / 16.0) , cos( 11.0 * (M_PI/2.0) / 16.0) ),
451 _mm_set_pd( sin( 12.0 * (M_PI/2.0) / 16.0) , cos( 12.0 * (M_PI/2.0) / 16.0) ),
452 _mm_set_pd( sin( 13.0 * (M_PI/2.0) / 16.0) , cos( 13.0 * (M_PI/2.0) / 16.0) ),
453 _mm_set_pd( sin( 14.0 * (M_PI/2.0) / 16.0) , cos( 14.0 * (M_PI/2.0) / 16.0) ),
454 _mm_set_pd( sin( 15.0 * (M_PI/2.0) / 16.0) , cos( 15.0 * (M_PI/2.0) / 16.0) ),
455 _mm_set_pd( sin( 16.0 * (M_PI/2.0) / 16.0) , cos( 16.0 * (M_PI/2.0) / 16.0) )
457 #endif
459 const __m128d signmask = _mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF) );
460 const __m128d tabscale = _mm_set1_pd(32.0/M_PI);
461 const __m128d invtabscale = _mm_set1_pd(M_PI/32.0);
462 const __m128d one = _mm_set1_pd(1.0);
463 const __m128i i32 = _mm_set1_epi32(32);
464 const __m128i i16 = _mm_set1_epi32(16);
465 const __m128i tabmask = _mm_set1_epi32(0x3F);
466 const __m128d sinP7 = _mm_set1_pd(-1.9835958374985742404167983310657359E-4);
467 const __m128d sinP5 = _mm_set1_pd(8.3333330133863188710910904896670286347068944E-3);
468 const __m128d sinP3 = _mm_set1_pd(-1.66666666666049927649121240304808461825E-1);
469 const __m128d sinP1 = _mm_set1_pd(9.99999999999999814240922423058227089729828E-1);
471 const __m128d cosP6 = _mm_set1_pd(-1.3884108697213500852322439746988228E-3);
472 const __m128d cosP4 = _mm_set1_pd(4.16666637872444585215198198619627956E-2);
473 const __m128d cosP2 = _mm_set1_pd(-4.999999999944495616220671189143471E-1);
474 const __m128d cosP0 = _mm_set1_pd(9.999999999999983282334075742852867E-1);
476 __m128d scalex;
477 __m128i tabidx,corridx;
478 __m128d xabs,z,z2,polySin,polyCos;
479 __m128d xpoint;
480 __m128d ypoint0,ypoint1;
481 __m128d sinpoint,cospoint;
482 __m128d xsign,ssign,csign;
483 __m128i imask,sswapsign,cswapsign;
484 __m128d minusone;
486 xsign = _mm_andnot_pd(signmask,x);
487 xabs = _mm_and_pd(x,signmask);
489 scalex = _mm_mul_pd(tabscale,xabs);
490 tabidx = _mm_cvtpd_epi32(scalex);
491 xpoint = _mm_cvtepi32_pd(tabidx);
492 xpoint = _mm_mul_pd(xpoint,invtabscale);
493 z = _mm_sub_pd(xabs, xpoint);
495 /* Range reduction to 0..2*Pi */
496 tabidx = _mm_and_si128(tabidx,tabmask);
498 /* tabidx is now in range [0,..,63] */
499 imask = _mm_cmpgt_epi32(tabidx,i32);
500 sswapsign = imask;
501 cswapsign = imask;
502 corridx = _mm_and_si128(imask,i32);
503 tabidx = _mm_sub_epi32(tabidx,corridx);
505 /* tabidx is now in range [0..32] */
506 imask = _mm_cmpgt_epi32(tabidx,i16);
507 cswapsign = _mm_xor_si128(cswapsign,imask);
508 corridx = _mm_sub_epi32(i32,tabidx);
509 tabidx = _mm_or_si128( _mm_and_si128(imask,corridx), _mm_andnot_si128(imask,tabidx) );
511 /* tabidx is now in range [0..16] */
512 sswapsign = _mm_shuffle_epi32(sswapsign,_MM_SHUFFLE(2,2,0,0));
513 cswapsign = _mm_shuffle_epi32(cswapsign,_MM_SHUFFLE(2,2,0,0));
514 minusone = _mm_sub_pd(_mm_setzero_pd(),one);
516 ssign = _mm_or_pd(_mm_and_pd( _mm_castsi128_pd(sswapsign),minusone ),
517 _mm_andnot_pd( _mm_castsi128_pd(sswapsign),one ));
518 csign = _mm_or_pd(_mm_and_pd( _mm_castsi128_pd(cswapsign),minusone ),
519 _mm_andnot_pd( _mm_castsi128_pd(cswapsign),one ));
521 /* First lookup into table */
522 #ifdef _MSC_VER
523 ypoint0 = _mm_load_pd(sintable + 2*gmx_mm_extract_epi32(tabidx,0));
524 ypoint1 = _mm_load_pd(sintable + 2*gmx_mm_extract_epi32(tabidx,1));
525 #else
526 ypoint0 = sintable[gmx_mm_extract_epi32(tabidx,0)];
527 ypoint1 = sintable[gmx_mm_extract_epi32(tabidx,1)];
528 #endif
529 sinpoint = _mm_unpackhi_pd(ypoint0,ypoint1);
530 cospoint = _mm_unpacklo_pd(ypoint0,ypoint1);
532 sinpoint = _mm_mul_pd(sinpoint,ssign);
533 cospoint = _mm_mul_pd(cospoint,csign);
535 z2 = _mm_mul_pd(z,z);
537 polySin = _mm_mul_pd(sinP7,z2);
538 polySin = _mm_add_pd(polySin,sinP5);
539 polySin = _mm_mul_pd(polySin,z2);
540 polySin = _mm_add_pd(polySin,sinP3);
541 polySin = _mm_mul_pd(polySin,z2);
542 polySin = _mm_add_pd(polySin,sinP1);
543 polySin = _mm_mul_pd(polySin,z);
545 polyCos = _mm_mul_pd(cosP6,z2);
546 polyCos = _mm_add_pd(polyCos,cosP4);
547 polyCos = _mm_mul_pd(polyCos,z2);
548 polyCos = _mm_add_pd(polyCos,cosP2);
549 polyCos = _mm_mul_pd(polyCos,z2);
550 polyCos = _mm_add_pd(polyCos,cosP0);
552 *sinval = _mm_xor_pd(_mm_add_pd( _mm_mul_pd(sinpoint,polyCos) , _mm_mul_pd(cospoint,polySin) ),xsign);
553 *cosval = _mm_sub_pd( _mm_mul_pd(cospoint,polyCos) , _mm_mul_pd(sinpoint,polySin) );
555 return 0;
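/* Usage sketch (illustration only; gmx_example_sincos_pd is a made-up name):
 * gmx_mm_sincos_pd() looks up sin/cos at the nearest table point x0 (a
 * multiple of pi/32) and reconstructs sin(|x|) = sin(x0)cos(z)+cos(x0)sin(z)
 * and cos(|x|) = cos(x0)cos(z)-sin(x0)sin(z) with z = |x|-x0 from the short
 * polynomials above; the sign of x is restored for the sine at the end.
 */
static void
gmx_example_sincos_pd(void)
{
    double  s[2],c[2];
    __m128d sinval,cosval;
    __m128d x = _mm_set_pd(1.0, -0.3);  /* element 1 = 1.0, element 0 = -0.3 */
    gmx_mm_sincos_pd(x,&sinval,&cosval);
    _mm_storeu_pd(s,sinval);
    _mm_storeu_pd(c,cosval);
    printf("sin(-0.3)=%g cos(-0.3)=%g  sin(1.0)=%g cos(1.0)=%g\n",
           s[0], c[0], s[1], c[1]);
}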
560 static __m128d
561 gmx_mm_tan_pd(__m128d x)
563 __m128d sinval,cosval;
564 __m128d tanval;
566 gmx_mm_sincos_pd(x,&sinval,&cosval);
568 tanval = _mm_mul_pd(sinval,gmx_mm_inv_pd(cosval));
570 return tanval;
575 static __m128d
576 gmx_mm_asin_pd(__m128d x)
578 /* Same algorithm as cephes library */
579 const __m128d signmask = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF) );
580 const __m128d limit1 = _mm_set1_pd(0.625);
581 const __m128d limit2 = _mm_set1_pd(1e-8);
582 const __m128d one = _mm_set1_pd(1.0);
583 const __m128d halfpi = _mm_set1_pd(M_PI/2.0);
584 const __m128d quarterpi = _mm_set1_pd(M_PI/4.0);
585 const __m128d morebits = _mm_set1_pd(6.123233995736765886130E-17);
587 const __m128d P5 = _mm_set1_pd(4.253011369004428248960E-3);
588 const __m128d P4 = _mm_set1_pd(-6.019598008014123785661E-1);
589 const __m128d P3 = _mm_set1_pd(5.444622390564711410273E0);
590 const __m128d P2 = _mm_set1_pd(-1.626247967210700244449E1);
591 const __m128d P1 = _mm_set1_pd(1.956261983317594739197E1);
592 const __m128d P0 = _mm_set1_pd(-8.198089802484824371615E0);
594 const __m128d Q4 = _mm_set1_pd(-1.474091372988853791896E1);
595 const __m128d Q3 = _mm_set1_pd(7.049610280856842141659E1);
596 const __m128d Q2 = _mm_set1_pd(-1.471791292232726029859E2);
597 const __m128d Q1 = _mm_set1_pd(1.395105614657485689735E2);
598 const __m128d Q0 = _mm_set1_pd(-4.918853881490881290097E1);
600 const __m128d R4 = _mm_set1_pd(2.967721961301243206100E-3);
601 const __m128d R3 = _mm_set1_pd(-5.634242780008963776856E-1);
602 const __m128d R2 = _mm_set1_pd(6.968710824104713396794E0);
603 const __m128d R1 = _mm_set1_pd(-2.556901049652824852289E1);
604 const __m128d R0 = _mm_set1_pd(2.853665548261061424989E1);
606 const __m128d S3 = _mm_set1_pd(-2.194779531642920639778E1);
607 const __m128d S2 = _mm_set1_pd(1.470656354026814941758E2);
608 const __m128d S1 = _mm_set1_pd(-3.838770957603691357202E2);
609 const __m128d S0 = _mm_set1_pd(3.424398657913078477438E2);
611 __m128d sign;
612 __m128d mask;
613 __m128d xabs;
614 __m128d zz,ww,z,q,w,y,zz2,ww2;
615 __m128d PA,PB;
616 __m128d QA,QB;
617 __m128d RA,RB;
618 __m128d SA,SB;
619 __m128d nom,denom;
621 sign = _mm_andnot_pd(signmask,x);
622 xabs = _mm_and_pd(x,signmask);
624 mask = _mm_cmpgt_pd(xabs,limit1);
626 zz = _mm_sub_pd(one,xabs);
627 ww = _mm_mul_pd(xabs,xabs);
628 zz2 = _mm_mul_pd(zz,zz);
629 ww2 = _mm_mul_pd(ww,ww);
631 /* R */
632 RA = _mm_mul_pd(R4,zz2);
633 RB = _mm_mul_pd(R3,zz2);
634 RA = _mm_add_pd(RA,R2);
635 RB = _mm_add_pd(RB,R1);
636 RA = _mm_mul_pd(RA,zz2);
637 RB = _mm_mul_pd(RB,zz);
638 RA = _mm_add_pd(RA,R0);
639 RA = _mm_add_pd(RA,RB);
641 /* S, SA = zz2 */
642 SB = _mm_mul_pd(S3,zz2);
643 SA = _mm_add_pd(zz2,S2);
644 SB = _mm_add_pd(SB,S1);
645 SA = _mm_mul_pd(SA,zz2);
646 SB = _mm_mul_pd(SB,zz);
647 SA = _mm_add_pd(SA,S0);
648 SA = _mm_add_pd(SA,SB);
650 /* P */
651 PA = _mm_mul_pd(P5,ww2);
652 PB = _mm_mul_pd(P4,ww2);
653 PA = _mm_add_pd(PA,P3);
654 PB = _mm_add_pd(PB,P2);
655 PA = _mm_mul_pd(PA,ww2);
656 PB = _mm_mul_pd(PB,ww2);
657 PA = _mm_add_pd(PA,P1);
658 PB = _mm_add_pd(PB,P0);
659 PA = _mm_mul_pd(PA,ww);
660 PA = _mm_add_pd(PA,PB);
662 /* Q, QA = ww2 */
663 QB = _mm_mul_pd(Q4,ww2);
664 QA = _mm_add_pd(ww2,Q3);
665 QB = _mm_add_pd(QB,Q2);
666 QA = _mm_mul_pd(QA,ww2);
667 QB = _mm_mul_pd(QB,ww2);
668 QA = _mm_add_pd(QA,Q1);
669 QB = _mm_add_pd(QB,Q0);
670 QA = _mm_mul_pd(QA,ww);
671 QA = _mm_add_pd(QA,QB);
673 RA = _mm_mul_pd(RA,zz);
674 PA = _mm_mul_pd(PA,ww);
676 nom = _mm_or_pd( _mm_and_pd(mask,RA) , _mm_andnot_pd(mask,PA) );
677 denom = _mm_or_pd( _mm_and_pd(mask,SA) , _mm_andnot_pd(mask,QA) );
679 q = _mm_mul_pd( nom, gmx_mm_inv_pd(denom) );
681 zz = _mm_add_pd(zz,zz);
682 zz = gmx_mm_sqrt_pd(zz);
683 z = _mm_sub_pd(quarterpi,zz);
684 zz = _mm_mul_pd(zz,q);
685 zz = _mm_sub_pd(zz,morebits);
686 z = _mm_sub_pd(z,zz);
687 z = _mm_add_pd(z,quarterpi);
689 w = _mm_mul_pd(xabs,q);
690 w = _mm_add_pd(w,xabs);
692 z = _mm_or_pd( _mm_and_pd(mask,z) , _mm_andnot_pd(mask,w) );
694 mask = _mm_cmpgt_pd(xabs,limit2);
695 z = _mm_or_pd( _mm_and_pd(mask,z) , _mm_andnot_pd(mask,xabs) );
697 z = _mm_xor_pd(z,sign);
699 return z;
703 static __m128d
704 gmx_mm_acos_pd(__m128d x)
706 const __m128d signmask = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF) );
707 const __m128d one_pd = _mm_set1_pd(1.0);
708 const __m128d half_pd = _mm_set1_pd(0.5);
709 const __m128d pi_pd = _mm_set1_pd(M_PI);
710 const __m128d halfpi_pd = _mm_set1_pd(M_PI/2.0);
712 __m128d mask1;
713 __m128d mask2;
714 __m128d xabs;
715 __m128d z,z1,z2,z3;
717 xabs = _mm_and_pd(x,signmask);
718 mask1 = _mm_cmpgt_pd(xabs,half_pd);
719 mask2 = _mm_cmpgt_pd(x,_mm_setzero_pd());
721 z = _mm_mul_pd(half_pd,_mm_sub_pd(one_pd,xabs));
722 z = _mm_mul_pd(z,gmx_mm_invsqrt_pd(z));
723 z = _mm_andnot_pd(_mm_cmpeq_pd(xabs,one_pd),z);
725 z = _mm_or_pd( _mm_and_pd(mask1,z) , _mm_andnot_pd(mask1,x) );
726 z = gmx_mm_asin_pd(z);
728 z2 = _mm_add_pd(z,z);
729 z1 = _mm_sub_pd(pi_pd,z2);
730 z3 = _mm_sub_pd(halfpi_pd,z);
732 z = _mm_or_pd( _mm_and_pd(mask2,z2) , _mm_andnot_pd(mask2,z1) );
733 z = _mm_or_pd( _mm_and_pd(mask1,z) , _mm_andnot_pd(mask1,z3) );
735 return z;
738 static __m128d
739 gmx_mm_atan_pd(__m128d x)
741 /* Same algorithm as cephes library */
742 const __m128d signmask = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF) );
743 const __m128d limit1 = _mm_set1_pd(0.66);
744 const __m128d limit2 = _mm_set1_pd(2.41421356237309504880);
745 const __m128d quarterpi = _mm_set1_pd(M_PI/4.0);
746 const __m128d halfpi = _mm_set1_pd(M_PI/2.0);
747 const __m128d mone = _mm_set1_pd(-1.0);
748 const __m128d morebits1 = _mm_set1_pd(0.5*6.123233995736765886130E-17);
749 const __m128d morebits2 = _mm_set1_pd(6.123233995736765886130E-17);
751 const __m128d P4 = _mm_set1_pd(-8.750608600031904122785E-1);
752 const __m128d P3 = _mm_set1_pd(-1.615753718733365076637E1);
753 const __m128d P2 = _mm_set1_pd(-7.500855792314704667340E1);
754 const __m128d P1 = _mm_set1_pd(-1.228866684490136173410E2);
755 const __m128d P0 = _mm_set1_pd(-6.485021904942025371773E1);
757 const __m128d Q4 = _mm_set1_pd(2.485846490142306297962E1);
758 const __m128d Q3 = _mm_set1_pd(1.650270098316988542046E2);
759 const __m128d Q2 = _mm_set1_pd(4.328810604912902668951E2);
760 const __m128d Q1 = _mm_set1_pd(4.853903996359136964868E2);
761 const __m128d Q0 = _mm_set1_pd(1.945506571482613964425E2);
763 __m128d sign;
764 __m128d mask1,mask2;
765 __m128d y,t1,t2;
766 __m128d z,z2;
767 __m128d P_A,P_B,Q_A,Q_B;
769 sign = _mm_andnot_pd(signmask,x);
770 x = _mm_and_pd(x,signmask);
772 mask1 = _mm_cmpgt_pd(x,limit1);
773 mask2 = _mm_cmpgt_pd(x,limit2);
775 t1 = _mm_mul_pd(_mm_add_pd(x,mone),gmx_mm_inv_pd(_mm_sub_pd(x,mone)));
776 t2 = _mm_mul_pd(mone,gmx_mm_inv_pd(x));
778 y = _mm_and_pd(mask1,quarterpi);
779 y = _mm_or_pd( _mm_and_pd(mask2,halfpi) , _mm_andnot_pd(mask2,y) );
781 x = _mm_or_pd( _mm_and_pd(mask1,t1) , _mm_andnot_pd(mask1,x) );
782 x = _mm_or_pd( _mm_and_pd(mask2,t2) , _mm_andnot_pd(mask2,x) );
784 z = _mm_mul_pd(x,x);
785 z2 = _mm_mul_pd(z,z);
787 P_A = _mm_mul_pd(P4,z2);
788 P_B = _mm_mul_pd(P3,z2);
789 P_A = _mm_add_pd(P_A,P2);
790 P_B = _mm_add_pd(P_B,P1);
791 P_A = _mm_mul_pd(P_A,z2);
792 P_B = _mm_mul_pd(P_B,z);
793 P_A = _mm_add_pd(P_A,P0);
794 P_A = _mm_add_pd(P_A,P_B);
796 /* Q_A = z2 */
797 Q_B = _mm_mul_pd(Q4,z2);
798 Q_A = _mm_add_pd(z2,Q3);
799 Q_B = _mm_add_pd(Q_B,Q2);
800 Q_A = _mm_mul_pd(Q_A,z2);
801 Q_B = _mm_mul_pd(Q_B,z2);
802 Q_A = _mm_add_pd(Q_A,Q1);
803 Q_B = _mm_add_pd(Q_B,Q0);
804 Q_A = _mm_mul_pd(Q_A,z);
805 Q_A = _mm_add_pd(Q_A,Q_B);
807 z = _mm_mul_pd(z,P_A);
808 z = _mm_mul_pd(z,gmx_mm_inv_pd(Q_A));
809 z = _mm_mul_pd(z,x);
810 z = _mm_add_pd(z,x);
812 t1 = _mm_and_pd(mask1,morebits1);
813 t1 = _mm_or_pd( _mm_and_pd(mask2,morebits2) , _mm_andnot_pd(mask2,t1) );
815 z = _mm_add_pd(z,t1);
816 y = _mm_add_pd(y,z);
818 y = _mm_xor_pd(y,sign);
820 return y;
823 static __m128d
824 gmx_mm_atan2_pd(__m128d y, __m128d x)
826 const __m128d pi = _mm_set1_pd(M_PI);
827 const __m128d minuspi = _mm_set1_pd(-M_PI);
828 const __m128d halfpi = _mm_set1_pd(M_PI/2.0);
829 const __m128d minushalfpi = _mm_set1_pd(-M_PI/2.0);
831 __m128d z,z1,z3,z4;
832 __m128d w;
833 __m128d maskx_lt,maskx_eq;
834 __m128d masky_lt,masky_eq;
835 __m128d mask1,mask2,mask3,mask4,maskall;
837 maskx_lt = _mm_cmplt_pd(x,_mm_setzero_pd());
838 masky_lt = _mm_cmplt_pd(y,_mm_setzero_pd());
839 maskx_eq = _mm_cmpeq_pd(x,_mm_setzero_pd());
840 masky_eq = _mm_cmpeq_pd(y,_mm_setzero_pd());
842 z = _mm_mul_pd(y,gmx_mm_inv_pd(x));
843 z = gmx_mm_atan_pd(z);
845 mask1 = _mm_and_pd(maskx_eq,masky_lt);
846 mask2 = _mm_andnot_pd(maskx_lt,masky_eq);
847 mask3 = _mm_andnot_pd( _mm_or_pd(masky_lt,masky_eq) , maskx_eq);
848 mask4 = _mm_and_pd(masky_eq,maskx_lt);
850 maskall = _mm_or_pd( _mm_or_pd(mask1,mask2), _mm_or_pd(mask3,mask4) );
852 z = _mm_andnot_pd(maskall,z);
853 z1 = _mm_and_pd(mask1,minushalfpi);
854 z3 = _mm_and_pd(mask3,halfpi);
855 z4 = _mm_and_pd(mask4,pi);
857 z = _mm_or_pd( _mm_or_pd(z,z1), _mm_or_pd(z3,z4) );
859 mask1 = _mm_andnot_pd(masky_lt,maskx_lt);
860 mask2 = _mm_and_pd(maskx_lt,masky_lt);
862 w = _mm_or_pd( _mm_and_pd(mask1,pi), _mm_and_pd(mask2,minuspi) );
863 w = _mm_andnot_pd(maskall,w);
865 z = _mm_add_pd(z,w);
867 return z;
871 #define GMX_MM_TRANSPOSE2_PD(row0, row1) { \
872 __m128d __gmx_t1 = row0; \
873 row0 = _mm_unpacklo_pd(row0,row1); \
874 row1 = _mm_unpackhi_pd(__gmx_t1,row1); \
875 }
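/* Usage sketch (illustration only; gmx_example_transpose2_pd is a made-up
 * name): GMX_MM_TRANSPOSE2_PD() swaps the cross elements of two registers,
 * i.e. {a0,a1},{b0,b1} becomes {a0,b0},{a1,b1}.
 */
static void
gmx_example_transpose2_pd(void)
{
    __m128d row0 = _mm_set_pd(2.0, 1.0);  /* {1,2} */
    __m128d row1 = _mm_set_pd(4.0, 3.0);  /* {3,4} */
    GMX_MM_TRANSPOSE2_PD(row0,row1);
    gmx_printxmm_pd("row0",row0);         /* prints 1 3 */
    gmx_printxmm_pd("row1",row1);         /* prints 2 4 */
}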
878 #define GMX_MM_LOAD_1JCOORD_1ATOM_PD(ptr1,jx1,jy1,jz1) { \
879 jx1 = _mm_loadu_pd(ptr1); \
880 jy1 = _mm_unpackhi_pd(jx1,jx1); \
881 jz1 = _mm_load_sd(ptr1+2); \
885 #define GMX_MM_LOAD_1JCOORD_2ATOMS_PD(ptr1,jx1,jy1,jz1,jx2,jy2,jz2) { \
886 jx1 = _mm_loadu_pd(ptr1); \
887 jy1 = _mm_unpackhi_pd(jx1,jx1); \
888 jz1 = _mm_loadu_pd(ptr1+2); \
889 jx2 = _mm_unpackhi_pd(jz1,jz1); \
890 jy2 = _mm_loadu_pd(ptr1+4); \
891 jz2 = _mm_unpackhi_pd(jy2,jy2); \
895 #define GMX_MM_LOAD_1JCOORD_3ATOMS_PD(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
896 jx1 = _mm_loadu_pd(ptr1); \
897 jy1 = _mm_unpackhi_pd(jx1,jx1); \
898 jz1 = _mm_loadu_pd(ptr1+2); \
899 jx2 = _mm_unpackhi_pd(jz1,jz1); \
900 jy2 = _mm_loadu_pd(ptr1+4); \
901 jz2 = _mm_unpackhi_pd(jy2,jy2); \
902 jx3 = _mm_loadu_pd(ptr1+6); \
903 jy3 = _mm_unpackhi_pd(jx3,jx3); \
904 jz3 = _mm_load_sd(ptr1+8); \
908 #define GMX_MM_LOAD_1JCOORD_4ATOMS_PD(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
909 jx1 = _mm_loadu_pd(ptr1); \
910 jy1 = _mm_unpackhi_pd(jx1,jx1); \
911 jz1 = _mm_loadu_pd(ptr1+2); \
912 jx2 = _mm_unpackhi_pd(jz1,jz1); \
913 jy2 = _mm_loadu_pd(ptr1+4); \
914 jz2 = _mm_unpackhi_pd(jy2,jy2); \
915 jx3 = _mm_loadu_pd(ptr1+6); \
916 jy3 = _mm_unpackhi_pd(jx3,jx3); \
917 jz3 = _mm_loadu_pd(ptr1+8); \
918 jx4 = _mm_unpackhi_pd(jz3,jz3); \
919 jy4 = _mm_loadu_pd(ptr1+10); \
920 jz4 = _mm_unpackhi_pd(jy4,jy4); \
923 #define GMX_MM_LOAD_2JCOORD_1ATOM_PD(ptr1,ptr2,jx1,jy1,jz1) { \
924 __m128d _tmp1; \
925 _tmp1 = _mm_loadu_pd(ptr1); \
926 jy1 = _mm_loadu_pd(ptr2); \
927 jz1 = _mm_load_sd(ptr1+2); \
928 jx1 = _mm_unpacklo_pd(_tmp1,jy1); \
929 jy1 = _mm_unpackhi_pd(_tmp1,jy1); \
930 jz1 = _mm_loadh_pd(jz1,ptr2+2); \
931 }
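/* Usage sketch (illustration only; gmx_example_load_2jcoord is a made-up
 * name): the 2JCOORD macros gather the same coordinate of two different
 * j atoms into the two lanes of one register, i.e. jx1 = {x(ptr1),x(ptr2)},
 * jy1 = {y(ptr1),y(ptr2)}, jz1 = {z(ptr1),z(ptr2)}.
 */
static void
gmx_example_load_2jcoord(void)
{
    double  xjA[3] = { 1.0, 2.0, 3.0 };
    double  xjB[3] = { 4.0, 5.0, 6.0 };
    __m128d jx1,jy1,jz1;
    GMX_MM_LOAD_2JCOORD_1ATOM_PD(xjA,xjB,jx1,jy1,jz1);
    gmx_printxmm_pd("x",jx1);   /* prints 1 4 */
    gmx_printxmm_pd("y",jy1);   /* prints 2 5 */
    gmx_printxmm_pd("z",jz1);   /* prints 3 6 */
}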
934 #define GMX_MM_LOAD_2JCOORD_2ATOMS_PD(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2) { \
935 __m128d _tmp1, _tmp2,_tmp3; \
936 _tmp1 = _mm_loadu_pd(ptr1); \
937 jy1 = _mm_loadu_pd(ptr2); \
938 _tmp2 = _mm_loadu_pd(ptr1+2); \
939 jx2 = _mm_loadu_pd(ptr2+2); \
940 _tmp3 = _mm_loadu_pd(ptr1+4); \
941 jz2 = _mm_loadu_pd(ptr2+4); \
942 jx1 = _mm_unpacklo_pd(_tmp1,jy1); \
943 jy1 = _mm_unpackhi_pd(_tmp1,jy1); \
944 jz1 = _mm_unpacklo_pd(_tmp2,jx2); \
945 jx2 = _mm_unpackhi_pd(_tmp2,jx2); \
946 jy2 = _mm_unpacklo_pd(_tmp3,jz2); \
947 jz2 = _mm_unpackhi_pd(_tmp3,jz2); \
951 #define GMX_MM_LOAD_2JCOORD_3ATOMS_PD(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
952 __m128d _tmp1, _tmp2, _tmp3, _tmp4, _tmp5; \
953 _tmp1 = _mm_loadu_pd(ptr1); \
954 jy1 = _mm_loadu_pd(ptr2); \
955 _tmp2 = _mm_loadu_pd(ptr1+2); \
956 jx2 = _mm_loadu_pd(ptr2+2); \
957 _tmp3 = _mm_loadu_pd(ptr1+4); \
958 jz2 = _mm_loadu_pd(ptr2+4); \
959 _tmp4 = _mm_loadu_pd(ptr1+6); \
960 jy3 = _mm_loadu_pd(ptr2+6); \
961 jz3 = _mm_load_sd(ptr1+8); \
962 _tmp5 = _mm_load_sd(ptr2+8); \
963 jx1 = _mm_unpacklo_pd(_tmp1,jy1); \
964 jy1 = _mm_unpackhi_pd(_tmp1,jy1); \
965 jz1 = _mm_unpacklo_pd(_tmp2,jx2); \
966 jx2 = _mm_unpackhi_pd(_tmp2,jx2); \
967 jy2 = _mm_unpacklo_pd(_tmp3,jz2); \
968 jz2 = _mm_unpackhi_pd(_tmp3,jz2); \
969 jx3 = _mm_unpacklo_pd(_tmp4,jy3); \
970 jy3 = _mm_unpackhi_pd(_tmp4,jy3); \
971 jz3 = _mm_unpacklo_pd(jz2,_tmp5); \
975 #define GMX_MM_LOAD_2JCOORD_4ATOMS_PD(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
976 __m128d _tmp1, _tmp2,_tmp3, _tmp4, _tmp5, _tmp6; \
977 _tmp1 = _mm_loadu_pd(ptr1); \
978 jy1 = _mm_loadu_pd(ptr2); \
979 _tmp2 = _mm_loadu_pd(ptr1+2); \
980 jx2 = _mm_loadu_pd(ptr2+2); \
981 _tmp3 = _mm_loadu_pd(ptr1+4); \
982 jz2 = _mm_loadu_pd(ptr2+4); \
983 _tmp4 = _mm_loadu_pd(ptr1+6); \
984 jy3 = _mm_loadu_pd(ptr2+6); \
985 _tmp5 = _mm_loadu_pd(ptr1+8); \
986 jx4 = _mm_loadu_pd(ptr2+8); \
987 _tmp6 = _mm_loadu_pd(ptr1+10); \
988 jz4 = _mm_loadu_pd(ptr2+10); \
989 jx1 = _mm_unpacklo_pd(_tmp1,jy1); \
990 jy1 = _mm_unpackhi_pd(_tmp1,jy1); \
991 jz1 = _mm_unpacklo_pd(_tmp2,jx2); \
992 jx2 = _mm_unpackhi_pd(_tmp2,jx2); \
993 jy2 = _mm_unpacklo_pd(_tmp3,jz2); \
994 jz2 = _mm_unpackhi_pd(_tmp3,jz2); \
995 jx3 = _mm_unpacklo_pd(_tmp4,jy3); \
996 jy3 = _mm_unpackhi_pd(_tmp4,jy3); \
997 jz3 = _mm_unpacklo_pd(_tmp5,jx4); \
998 jx4 = _mm_unpackhi_pd(_tmp5,jx4); \
999 jy4 = _mm_unpacklo_pd(_tmp6,jz4); \
1000 jz4 = _mm_unpackhi_pd(_tmp6,jz4); \
1004 #define GMX_MM_UPDATE_1JCOORD_1ATOM_PD(ptr1,jx1,jy1,jz1) { \
1005 jx1 = _mm_unpacklo_pd(jx1,jy1); \
1006 _mm_storeu_pd(ptr1, _mm_add_pd( _mm_loadu_pd(ptr1), jx1 )); \
1007 _mm_store_sd(ptr1+2, _mm_add_sd( _mm_load_sd(ptr1+2), jz1)); \
1011 #define GMX_MM_UPDATE_1JCOORD_2ATOMS_PD(ptr1,jx1,jy1,jz1,jx2,jy2,jz2) { \
1012 jx1 = _mm_unpacklo_pd(jx1,jy1); \
1013 jz1 = _mm_unpacklo_pd(jz1,jx2); \
1014 jy2 = _mm_unpacklo_pd(jy2,jz2); \
1015 _mm_storeu_pd(ptr1, _mm_add_pd( _mm_loadu_pd(ptr1), jx1 )); \
1016 _mm_storeu_pd(ptr1+2, _mm_add_pd( _mm_loadu_pd(ptr1+2), jz1 )); \
1017 _mm_storeu_pd(ptr1+4, _mm_add_pd( _mm_loadu_pd(ptr1+4), jy2 )); \
1021 #define GMX_MM_UPDATE_1JCOORD_3ATOMS_PD(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
1022 jx1 = _mm_unpacklo_pd(jx1,jy1); \
1023 jz1 = _mm_unpacklo_pd(jz1,jx2); \
1024 jy2 = _mm_unpacklo_pd(jy2,jz2); \
1025 jx3 = _mm_unpacklo_pd(jx3,jy3); \
1026 _mm_storeu_pd(ptr1, _mm_add_pd( _mm_loadu_pd(ptr1), jx1 )); \
1027 _mm_storeu_pd(ptr1+2, _mm_add_pd( _mm_loadu_pd(ptr1+2), jz1 )); \
1028 _mm_storeu_pd(ptr1+4, _mm_add_pd( _mm_loadu_pd(ptr1+4), jy2 )); \
1029 _mm_storeu_pd(ptr1+6, _mm_add_pd( _mm_loadu_pd(ptr1+6), jx3 )); \
1030 _mm_store_sd(ptr1+8, _mm_add_pd( _mm_load_sd(ptr1+8), jz3 )); \
1034 #define GMX_MM_UPDATE_1JCOORD_4ATOMS_PD(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
1035 jx1 = _mm_unpacklo_pd(jx1,jy1); \
1036 jz1 = _mm_unpacklo_pd(jz1,jx2); \
1037 jy2 = _mm_unpacklo_pd(jy2,jz2); \
1038 jx3 = _mm_unpacklo_pd(jx3,jy3); \
1039 jz3 = _mm_unpacklo_pd(jz3,jx4); \
1040 jy4 = _mm_unpacklo_pd(jy4,jz4); \
1041 _mm_storeu_pd(ptr1, _mm_add_pd( _mm_loadu_pd(ptr1), jx1 )); \
1042 _mm_storeu_pd(ptr1+2, _mm_add_pd( _mm_loadu_pd(ptr1+2), jz1 )); \
1043 _mm_storeu_pd(ptr1+4, _mm_add_pd( _mm_loadu_pd(ptr1+4), jy2 )); \
1044 _mm_storeu_pd(ptr1+6, _mm_add_pd( _mm_loadu_pd(ptr1+6), jx3 )); \
1045 _mm_storeu_pd(ptr1+8, _mm_add_pd( _mm_loadu_pd(ptr1+8), jz3 )); \
1046 _mm_storeu_pd(ptr1+10, _mm_add_pd( _mm_loadu_pd(ptr1+10), jy4 )); \
1050 #define GMX_MM_UPDATE_2JCOORD_1ATOM_PD(ptr1,ptr2,jx1,jy1,jz1) { \
1051 __m128d _tmp1,_tmp2,_tmp3; \
1052 _tmp1 = _mm_unpacklo_pd(jx1,jy1); \
1053 _tmp2 = _mm_unpackhi_pd(jx1,jy1); \
1054 _tmp3 = _mm_unpackhi_pd(jz1,jz1); \
1055 _mm_storeu_pd(ptr1, _mm_add_pd( _mm_loadu_pd(ptr1), _tmp1 )); \
1056 _mm_storeu_pd(ptr2, _mm_add_pd( _mm_loadu_pd(ptr2), _tmp2 )); \
1057 _mm_store_sd(ptr1+2, _mm_add_pd( _mm_load_sd(ptr1+2), jz1 )); \
1058 _mm_store_sd(ptr2+2, _mm_add_sd( _mm_load_sd(ptr2+2), _tmp3 )); \
1062 #define GMX_MM_UPDATE_2JCOORD_2ATOMS_PD(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2) { \
1063 __m128d _tmp1,_tmp2,_tmp3; \
1064 _tmp1 = _mm_unpacklo_pd(jx1,jy1); \
1065 jy1 = _mm_unpackhi_pd(jx1,jy1); \
1066 _tmp2 = _mm_unpacklo_pd(jz1,jx2); \
1067 jx2 = _mm_unpackhi_pd(jz1,jx2); \
1068 _tmp3 = _mm_unpacklo_pd(jy2,jz2); \
1069 jz2 = _mm_unpackhi_pd(jy2,jz2); \
1070 _mm_storeu_pd(ptr1, _mm_add_pd( _mm_loadu_pd(ptr1), _tmp1 )); \
1071 _mm_storeu_pd(ptr2, _mm_add_pd( _mm_loadu_pd(ptr2), jy1 )); \
1072 _mm_storeu_pd(ptr1+2, _mm_add_pd( _mm_loadu_pd(ptr1+2), _tmp2 )); \
1073 _mm_storeu_pd(ptr2+2, _mm_add_pd( _mm_loadu_pd(ptr2+2), jx2 )); \
1074 _mm_storeu_pd(ptr1+4, _mm_add_pd( _mm_loadu_pd(ptr1+4), _tmp3 )); \
1075 _mm_storeu_pd(ptr2+4, _mm_add_pd( _mm_loadu_pd(ptr2+4), jz2 )); \
1079 #define GMX_MM_UPDATE_2JCOORD_3ATOMS_PD(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
1080 __m128d _tmp1,_tmp2,_tmp3,_tmp4,_tmp5; \
1081 _tmp1 = _mm_unpacklo_pd(jx1,jy1); \
1082 jy1 = _mm_unpackhi_pd(jx1,jy1); \
1083 _tmp2 = _mm_unpacklo_pd(jz1,jx2); \
1084 jx2 = _mm_unpackhi_pd(jz1,jx2); \
1085 _tmp3 = _mm_unpacklo_pd(jy2,jz2); \
1086 jz2 = _mm_unpackhi_pd(jy2,jz2); \
1087 _tmp4 = _mm_unpacklo_pd(jx3,jy3); \
1088 jy3 = _mm_unpackhi_pd(jx3,jy3); \
1089 _tmp5 = _mm_unpackhi_pd(jz3,jz3); \
1090 _mm_storeu_pd(ptr1, _mm_add_pd( _mm_loadu_pd(ptr1), _tmp1 )); \
1091 _mm_storeu_pd(ptr2, _mm_add_pd( _mm_loadu_pd(ptr2), jy1 )); \
1092 _mm_storeu_pd(ptr1+2, _mm_add_pd( _mm_loadu_pd(ptr1+2), _tmp2 )); \
1093 _mm_storeu_pd(ptr2+2, _mm_add_pd( _mm_loadu_pd(ptr2+2), jx2 )); \
1094 _mm_storeu_pd(ptr1+4, _mm_add_pd( _mm_loadu_pd(ptr1+4), _tmp3 )); \
1095 _mm_storeu_pd(ptr2+4, _mm_add_pd( _mm_loadu_pd(ptr2+4), jz2 )); \
1096 _mm_storeu_pd(ptr1+6, _mm_add_pd( _mm_loadu_pd(ptr1+6), _tmp4 )); \
1097 _mm_storeu_pd(ptr2+6, _mm_add_pd( _mm_loadu_pd(ptr2+6), jy3 )); \
1098 _mm_store_sd(ptr1+8, _mm_add_sd( _mm_load_sd(ptr1+8), jz3 )); \
1099 _mm_store_sd(ptr2+8, _mm_add_sd( _mm_load_sd(ptr2+8), _tmp5 )); \
1103 #define GMX_MM_UPDATE_2JCOORD_4ATOMS_PD(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
1104 __m128d _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6; \
1105 _tmp1 = _mm_unpacklo_pd(jx1,jy1); \
1106 jy1 = _mm_unpackhi_pd(jx1,jy1); \
1107 _tmp2 = _mm_unpacklo_pd(jz1,jx2); \
1108 jx2 = _mm_unpackhi_pd(jz1,jx2); \
1109 _tmp3 = _mm_unpacklo_pd(jy2,jz2); \
1110 jz2 = _mm_unpackhi_pd(jy2,jz2); \
1111 _tmp4 = _mm_unpacklo_pd(jx3,jy3); \
1112 jy3 = _mm_unpackhi_pd(jx3,jy3); \
1113 _tmp5 = _mm_unpacklo_pd(jz3,jx4); \
1114 jx4 = _mm_unpackhi_pd(jz3,jx4); \
1115 _tmp6 = _mm_unpacklo_pd(jy4,jz4); \
1116 jz4 = _mm_unpackhi_pd(jy4,jz4); \
1117 _mm_storeu_pd(ptr1, _mm_add_pd( _mm_loadu_pd(ptr1), _tmp1 )); \
1118 _mm_storeu_pd(ptr2, _mm_add_pd( _mm_loadu_pd(ptr2), jy1 )); \
1119 _mm_storeu_pd(ptr1+2, _mm_add_pd( _mm_loadu_pd(ptr1+2), _tmp2 )); \
1120 _mm_storeu_pd(ptr2+2, _mm_add_pd( _mm_loadu_pd(ptr2+2), jx2 )); \
1121 _mm_storeu_pd(ptr1+4, _mm_add_pd( _mm_loadu_pd(ptr1+4), _tmp3 )); \
1122 _mm_storeu_pd(ptr2+4, _mm_add_pd( _mm_loadu_pd(ptr2+4), jz2 )); \
1123 _mm_storeu_pd(ptr1+6, _mm_add_pd( _mm_loadu_pd(ptr1+6), _tmp4 )); \
1124 _mm_storeu_pd(ptr2+6, _mm_add_pd( _mm_loadu_pd(ptr2+6), jy3 )); \
1125 _mm_storeu_pd(ptr1+8, _mm_add_pd( _mm_loadu_pd(ptr1+8), _tmp5 )); \
1126 _mm_storeu_pd(ptr2+8, _mm_add_pd( _mm_loadu_pd(ptr2+8), jx4 )); \
1127 _mm_storeu_pd(ptr1+10,_mm_add_pd( _mm_loadu_pd(ptr1+10),_tmp6 )); \
1128 _mm_storeu_pd(ptr2+10,_mm_add_pd( _mm_loadu_pd(ptr2+10), jz4 )); \
1132 static inline __m128d
1133 gmx_mm_scalarprod_pd(__m128d x, __m128d y, __m128d z)
1135 return _mm_add_pd(_mm_add_pd(_mm_mul_pd(x,x),_mm_mul_pd(y,y)),_mm_mul_pd(z,z));
1138 static inline __m128d
1139 gmx_mm_calc_rsq_pd(__m128d dx, __m128d dy, __m128d dz)
1141 return _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx), _mm_mul_pd(dy,dy) ), _mm_mul_pd(dz,dz) );
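/* Usage sketch (illustration only; gmx_example_rsq_rinv is a made-up name):
 * the typical kernel pattern built from the helpers above - squared distance
 * for two particle pairs at once, followed by 1/r from gmx_mm_invsqrt_pd().
 */
static void
gmx_example_rsq_rinv(void)
{
    __m128d dx   = _mm_set_pd(1.0, 3.0);
    __m128d dy   = _mm_set_pd(2.0, 0.0);
    __m128d dz   = _mm_set_pd(2.0, 4.0);
    __m128d rsq  = gmx_mm_calc_rsq_pd(dx,dy,dz);  /* dx*dx+dy*dy+dz*dz */
    __m128d rinv = gmx_mm_invsqrt_pd(rsq);        /* 1/r               */
    gmx_printxmm_pd("r", _mm_mul_pd(rsq,rinv));   /* prints 5 3        */
}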
1146 static inline void
1147 gmx_mm_update_iforce_1atom_pd(__m128d *fix1, __m128d *fiy1, __m128d *fiz1,
1148 double *fptr,
1149 double *fshiftptr)
1151 __m128d t1,t2,t3;
1153 #ifdef GMX_SSE3
1154 *fix1 = _mm_hadd_pd(*fix1,*fiy1);
1155 /* fiz1 is fine as it is */
1156 #else
1157 /* SSE2 */
1158 /* transpose data */
1159 t1 = *fix1;
1160 *fix1 = _mm_unpacklo_pd(*fix1,*fiy1); /* y0 x0 */
1161 *fiy1 = _mm_unpackhi_pd(t1,*fiy1); /* y1 x1 */
1163 *fix1 = _mm_add_pd(*fix1,*fiy1);
1164 *fiz1 = _mm_add_sd( *fiz1, _mm_unpackhi_pd(*fiz1,*fiz1 ));
1165 #endif
1166 _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), *fix1 ));
1167 _mm_store_sd( fptr+2, _mm_add_sd( _mm_load_sd(fptr+2), *fiz1 ));
1169 _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), *fix1 ));
1170 _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), *fiz1 ));
1173 static inline void
1174 gmx_mm_update_iforce_2atoms_pd(__m128d *fix1, __m128d *fiy1, __m128d *fiz1,
1175 __m128d *fix2, __m128d *fiy2, __m128d *fiz2,
1176 double *fptr,
1177 double *fshiftptr)
1179 __m128d t1;
1181 #ifdef GMX_SSE3
1182 *fix1 = _mm_hadd_pd(*fix1,*fiy1);
1183 *fiz1 = _mm_hadd_pd(*fiz1,*fix2);
1184 *fiy2 = _mm_hadd_pd(*fiy2,*fiz2);
1185 #else
1186 /* SSE2 */
1187 /* transpose data */
1188 GMX_MM_TRANSPOSE2_PD(*fix1,*fiy1);
1189 GMX_MM_TRANSPOSE2_PD(*fiz1,*fix2);
1190 GMX_MM_TRANSPOSE2_PD(*fiy2,*fiz2);
1192 *fix1 = _mm_add_pd(*fix1,*fiy1);
1193 *fiz1 = _mm_add_pd(*fiz1,*fix2);
1194 *fiy2 = _mm_add_pd(*fiy2,*fiz2);
1195 #endif
1196 _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), *fix1 ));
1197 _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), *fiz1 ));
1198 _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), *fiy2 ));
1200 t1 = _mm_shuffle_pd(*fiz1,*fiy2,_MM_SHUFFLE2(0,1));
1201 *fix1 = _mm_add_pd(*fix1,t1); /* x and y sums */
1202 *fiz1 = _mm_add_sd(*fiz1, _mm_unpackhi_pd(*fiy2,*fiy2)); /* z sum */
1204 _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), *fix1 ));
1205 _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), *fiz1 ));
1210 static inline void
1211 gmx_mm_update_iforce_3atoms_pd(__m128d *fix1, __m128d *fiy1, __m128d *fiz1,
1212 __m128d *fix2, __m128d *fiy2, __m128d *fiz2,
1213 __m128d *fix3, __m128d *fiy3, __m128d *fiz3,
1214 double *fptr,
1215 double *fshiftptr)
1217 __m128d t1,t2;
1219 #ifdef GMX_SSE3
1220 *fix1 = _mm_hadd_pd(*fix1,*fiy1);
1221 *fiz1 = _mm_hadd_pd(*fiz1,*fix2);
1222 *fiy2 = _mm_hadd_pd(*fiy2,*fiz2);
1223 *fix3 = _mm_hadd_pd(*fix3,*fiy3);
1224 /* fiz3 is fine as it is */
1225 #else
1226 /* SSE2 */
1227 /* transpose data */
1228 GMX_MM_TRANSPOSE2_PD(*fix1,*fiy1);
1229 GMX_MM_TRANSPOSE2_PD(*fiz1,*fix2);
1230 GMX_MM_TRANSPOSE2_PD(*fiy2,*fiz2);
1231 t1 = *fix3;
1232 *fix3 = _mm_unpacklo_pd(*fix3,*fiy3); /* y0 x0 */
1233 *fiy3 = _mm_unpackhi_pd(t1,*fiy3); /* y1 x1 */
1235 *fix1 = _mm_add_pd(*fix1,*fiy1);
1236 *fiz1 = _mm_add_pd(*fiz1,*fix2);
1237 *fiy2 = _mm_add_pd(*fiy2,*fiz2);
1239 *fix3 = _mm_add_pd(*fix3,*fiy3);
1240 *fiz3 = _mm_add_sd( *fiz3, _mm_unpackhi_pd(*fiz3,*fiz3));
1241 #endif
1242 _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), *fix1 ));
1243 _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), *fiz1 ));
1244 _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), *fiy2 ));
1245 _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), *fix3 ));
1246 _mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), *fiz3 ));
1248 *fix1 = _mm_add_pd(*fix1,*fix3);
1249 t1 = _mm_shuffle_pd(*fiz1,*fiy2,_MM_SHUFFLE2(0,1));
1250 *fix1 = _mm_add_pd(*fix1,t1); /* x and y sums */
1252 t2 = _mm_shuffle_pd(*fiy2,*fiy2,_MM_SHUFFLE2(1,1));
1253 *fiz1 = _mm_add_sd(*fiz1,*fiz3);
1254 *fiz1 = _mm_add_sd(*fiz1,t2); /* z sum */
1256 _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), *fix1 ));
1257 _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), *fiz1 ));
1262 static inline void
1263 gmx_mm_update_iforce_4atoms_pd(__m128d *fix1, __m128d *fiy1, __m128d *fiz1,
1264 __m128d *fix2, __m128d *fiy2, __m128d *fiz2,
1265 __m128d *fix3, __m128d *fiy3, __m128d *fiz3,
1266 __m128d *fix4, __m128d *fiy4, __m128d *fiz4,
1267 double *fptr,
1268 double *fshiftptr)
1270 __m128d t1,t2;
1272 #ifdef GMX_SSE3
1273 *fix1 = _mm_hadd_pd(*fix1,*fiy1);
1274 *fiz1 = _mm_hadd_pd(*fiz1,*fix2);
1275 *fiy2 = _mm_hadd_pd(*fiy2,*fiz2);
1276 *fix3 = _mm_hadd_pd(*fix3,*fiy3);
1277 *fiz3 = _mm_hadd_pd(*fiz3,*fix4);
1278 *fiy4 = _mm_hadd_pd(*fiy4,*fiz4);
1279 #else
1280 /* SSE2 */
1281 /* transpose data */
1282 GMX_MM_TRANSPOSE2_PD(*fix1,*fiy1);
1283 GMX_MM_TRANSPOSE2_PD(*fiz1,*fix2);
1284 GMX_MM_TRANSPOSE2_PD(*fiy2,*fiz2);
1285 GMX_MM_TRANSPOSE2_PD(*fix3,*fiy3);
1286 GMX_MM_TRANSPOSE2_PD(*fiz3,*fix4);
1287 GMX_MM_TRANSPOSE2_PD(*fiy4,*fiz4);
1289 *fix1 = _mm_add_pd(*fix1,*fiy1);
1290 *fiz1 = _mm_add_pd(*fiz1,*fix2);
1291 *fiy2 = _mm_add_pd(*fiy2,*fiz2);
1292 *fix3 = _mm_add_pd(*fix3,*fiy3);
1293 *fiz3 = _mm_add_pd(*fiz3,*fix4);
1294 *fiy4 = _mm_add_pd(*fiy4,*fiz4);
1295 #endif
1296 _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), *fix1 ));
1297 _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), *fiz1 ));
1298 _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), *fiy2 ));
1299 _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), *fix3 ));
1300 _mm_storeu_pd( fptr+8, _mm_add_pd( _mm_loadu_pd(fptr+8), *fiz3 ));
1301 _mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), *fiy4 ));
1303 t1 = _mm_shuffle_pd(*fiz1,*fiy2,_MM_SHUFFLE2(0,1));
1304 *fix1 = _mm_add_pd(*fix1,t1);
1305 t2 = _mm_shuffle_pd(*fiz3,*fiy4,_MM_SHUFFLE2(0,1));
1306 *fix3 = _mm_add_pd(*fix3,t2);
1307 *fix1 = _mm_add_pd(*fix1,*fix3); /* x and y sums */
1310 *fiz1 = _mm_add_sd(*fiz1, _mm_unpackhi_pd(*fiy2,*fiy2));
1311 *fiz3 = _mm_add_sd(*fiz3, _mm_unpackhi_pd(*fiy4,*fiy4));
1312 *fiz1 = _mm_add_sd(*fiz1,*fiz3); /* z sum */
1315 _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), *fix1 ));
1316 _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), *fiz1 ));
1320 static inline void
1321 gmx_mm_update_1pot_pd(__m128d pot1, double *ptr1)
1323 #ifdef GMX_SSE3
1324 pot1 = _mm_hadd_pd(pot1,pot1);
1325 #else
1326 /* SSE2 */
1327 pot1 = _mm_add_pd(pot1, _mm_unpackhi_pd(pot1,pot1));
1328 #endif
1329 _mm_store_sd(ptr1,_mm_add_sd(pot1,_mm_load_sd(ptr1)));
1332 static inline void
1333 gmx_mm_update_2pot_pd(__m128d pot1, double *ptr1, __m128d pot2, double *ptr2)
1335 #ifdef GMX_SSE3
1336 pot1 = _mm_hadd_pd(pot1,pot2);
1337 #else
1338 /* SSE2 */
1339 GMX_MM_TRANSPOSE2_PD(pot1,pot2);
1340 pot1 = _mm_add_pd(pot1,pot2);
1341 #endif
1342 pot2 = _mm_unpackhi_pd(pot1,pot1);
1344 _mm_store_sd(ptr1,_mm_add_sd(pot1,_mm_load_sd(ptr1)));
1345 _mm_store_sd(ptr2,_mm_add_sd(pot2,_mm_load_sd(ptr2)));
1346 }
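/* Usage sketch (illustration only; gmx_example_update_2pot is a made-up
 * name): sum the two lanes of two energy accumulators into two separate
 * scalar totals with gmx_mm_update_2pot_pd().
 */
static void
gmx_example_update_2pot(void)
{
    double  vctot = 0.0, vvdwtot = 0.0;
    __m128d vcoul = _mm_set_pd(1.5, 2.5);    /* per-lane Coulomb energies */
    __m128d vvdw  = _mm_set_pd(0.25, 0.75);  /* per-lane LJ energies      */
    gmx_mm_update_2pot_pd(vcoul,&vctot,vvdw,&vvdwtot);
    printf("vctot = %g  vvdwtot = %g\n", vctot, vvdwtot);  /* prints 4 and 1 */
}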