/*
 *                This source code is part of
 *
 *                 G   R   O   M   A   C   S
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2009, The GROMACS Development Team
 *
 * Gromacs is a library for molecular simulation and trajectory analysis,
 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 * a full list of developers and information, check out http://www.gromacs.org
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option) any
 * later version.
 *
 * As a special exception, you may use this file as part of a free software
 * library without restriction.  Specifically, if other files instantiate
 * templates or use macros or inline functions from this file, or you compile
 * this file and link it with other files to produce an executable, this
 * file does not by itself cause the resulting executable to be covered by
 * the GNU Lesser General Public License.
 *
 * In plain-speak: do not worry about classes/macros/templates either - only
 * changes to the library have to be LGPL, not an application linking with it.
 *
 * To help fund GROMACS development, we humbly ask that you cite
 * the papers people have written on it - you can find them on the website!
 */
/* We require SSE2 now! */

#include <math.h>
#include <stdio.h>

#include <xmmintrin.h> /* SSE */
#include <emmintrin.h> /* SSE2 */
#ifdef GMX_SSE3
#  include <pmmintrin.h> /* SSE3 */
#endif
#ifdef GMX_SSE4
#  include <smmintrin.h> /* SSE4.1 */
#endif

#include "types/simple.h"
/***************************************************
 *                                                 *
 *             COMPILER RANT WARNING:              *
 *                                                 *
 * Ideally, this header would be filled with       *
 * simple static inline functions. Unfortunately,  *
 * many vendors provide really braindead compilers *
 * that either cannot handle more than 1-2 SSE     *
 * function parameters, or cannot handle pointers  *
 * to SSE __m128 datatypes as parameters at all.   *
 * Thus, for portability we have had to implement  *
 * all but the simplest routines as macros...      *
 *                                                 *
 ***************************************************/
/***************************************************
 *                                                 *
 * Wrappers/replacements for some instructions     *
 * not available in all SSE versions.              *
 *                                                 *
 ***************************************************/
#ifdef GMX_SSE4
#  define gmx_mm_extract_epi32(x, imm) _mm_extract_epi32(x,imm)
#else
#  define gmx_mm_extract_epi32(x, imm) _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))
#endif
/*
 * Some compilers require a cast to change the interpretation
 * of a register from FP to Int and vice versa, and not all of
 * them provide instructions to do this. Roll our own wrappers...
 */
#if (defined (_MSC_VER) || defined(__INTEL_COMPILER))
#  define gmx_mm_castsi128_pd(a) _mm_castsi128_pd(a)
#  define gmx_mm_castpd_si128(a) _mm_castpd_si128(a)
#  define gmx_mm_castpd_pd128(a) (a)
#elif defined(__GNUC__)
#  define gmx_mm_castsi128_pd(a) ((__m128d)(a))
#  define gmx_mm_castpd_si128(a) ((__m128i)(a))
#  define gmx_mm_castpd_pd128(a) ((__m128d)(a))
#else
static __m128d gmx_mm_castsi128_pd(__m128i a) { return *(__m128d *) &a; }
static __m128i gmx_mm_castpd_si128(__m128d a) { return *(__m128i *) &a; }
static __m128d gmx_mm_castpd_pd128(__m128d a) { return *(__m128d *) &a; }
#endif
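
/* Illustrative sketch (added for clarity, not part of the original header):
 * the cast wrappers reinterpret bits without any conversion, which is what
 * you want when manipulating IEEE fields directly. For example, doubling a
 * normal value by adding 1 to its biased exponent:
 *
 *   __m128i bits = gmx_mm_castpd_si128(x);
 *   bits         = _mm_add_epi64(bits, _mm_set1_epi64x(1LL << 52));
 *   x            = gmx_mm_castsi128_pd(bits);
 *
 * (_mm_set1_epi64x may be missing on very old compilers; it is used here
 * purely for illustration.)
 */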
#define gmx_mm_extract_epi64(x, imm) _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))
static inline void
gmx_printxmm_pd(const char *s,__m128d xmm)
{
    double f[2];

    _mm_storeu_pd(f,xmm);
    printf("%s: %15.10g %15.10g\n",s,f[0],f[1]);
}
static inline __m128d
gmx_mm_inv_pd(__m128d x)
{
    const __m128d two = _mm_set1_pd(2.0);

    /* Lookup instruction only exists in single precision, convert back and forth... */
    __m128d lu = _mm_cvtps_pd(_mm_rcp_ps( _mm_cvtpd_ps(x)));

    /* Perform two N-R steps for double precision */
    lu = _mm_mul_pd(lu,_mm_sub_pd(two,_mm_mul_pd(x,lu)));
    return _mm_mul_pd(lu,_mm_sub_pd(two,_mm_mul_pd(x,lu)));
}
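
/* Note + usage sketch (added for clarity): the single-precision rcp lookup
 * gives ~12 correct bits; each Newton-Raphson step lu' = lu*(2 - x*lu)
 * roughly doubles that, so two steps yield ~48 bits - close to, but not
 * quite, full double precision. For example:
 *
 *   __m128d r    = _mm_set_pd(4.0, 2.0);    // elements {2.0, 4.0}
 *   __m128d rinv = gmx_mm_inv_pd(r);        // ~{0.5, 0.25}
 */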
static inline __m128d
gmx_mm_invsqrt_pd(__m128d x)
{
    const __m128d half  = _mm_set1_pd(0.5);
    const __m128d three = _mm_set1_pd(3.0);

    /* Lookup instruction only exists in single precision, convert back and forth... */
    __m128d lu = _mm_cvtps_pd(_mm_rsqrt_ps( _mm_cvtpd_ps(x)));

    lu = _mm_mul_pd(half,_mm_mul_pd(_mm_sub_pd(three,_mm_mul_pd(_mm_mul_pd(lu,lu),x)),lu));
    return _mm_mul_pd(half,_mm_mul_pd(_mm_sub_pd(three,_mm_mul_pd(_mm_mul_pd(lu,lu),x)),lu));
}
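
/* Note (added for clarity): as in gmx_mm_inv_pd, the ~12-bit rsqrt lookup is
 * refined by Newton-Raphson, here lu' = 0.5*lu*(3 - x*lu*lu); two steps give
 * ~48 bits of accuracy. E.g. gmx_mm_invsqrt_pd(_mm_set1_pd(4.0)) is ~0.5 in
 * both elements.
 */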
static inline __m128d
gmx_mm_sqrt_pd(__m128d x)
{
    __m128d mask;
    __m128d res;

    /* sqrt(x) = x * invsqrt(x); mask out elements where x==0 */
    mask = _mm_cmpeq_pd(x,_mm_setzero_pd());
    res  = _mm_andnot_pd(mask,gmx_mm_invsqrt_pd(x));

    res  = _mm_mul_pd(x,res);

    return res;
}
/*
 * Exponential function.
 *
 * Exp(x) is calculated from the relation exp(x) = 2^y, where y = log2(e)*x.
 * Thus, the content of this routine is mostly about calculating 2^y.
 *
 * This is done by separating y = z + w, where z = [y] is an integer. For technical reasons it is easiest
 * for us to round to the _nearest_ integer and have w in [-0.5,0.5] rather than always rounding down.
 * (Not until SSE4.1 was there an efficient operation for rounding towards -infinity.)
 *
 * With this we get 2^y = 2^z * 2^w.
 *
 * Since we have IEEE fp representation, we can easily calculate 2^z by adding the FP exponent bias
 * (1023 in double), and shifting the integer to the exponent field of the FP number (52 bits up).
 *
 * The 2^w term is calculated from a (10,0)-th order (no denominator) minimax polynomial on the interval
 * [-0.5,0.5]. The coefficients were derived in Mathematica using the command:
 *
 * MiniMaxApproximation[(2^x), {x, {-0.5, 0.5}, 10, 0}, WorkingPrecision -> 20]
 *
 * The lowest exponent we can represent in the IEEE double-precision binary format is 2^-1022; below that
 * it will wrap around and lead to very large positive numbers. This corresponds to a lower bound
 * on the argument for exp(x) of roughly -708.39. For smaller arguments the return value will be 0.0.
 *
 * There appears to be a slight loss of precision for large arguments (~250), where the largest relative
 * error reaches ~2e-14. However, since the actual value for that argument is around 1e100, it might
 * not matter for typical workloads. This is likely caused by the polynomial evaluation,
 * and the only way around would then be a table-based version, which I haven't managed to get the
 * same performance from.
 *
 * The _average_ accuracy is about 51 bits in the range [-20,20], and the worst case is roughly 1 bit worse.
 */
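
/* Worked example (added for clarity): for x = 1 we get y = log2(e) ~ 1.4427,
 * which splits into z = 1 (nearest integer) and w ~ 0.4427. Then 2^z = 2
 * comes straight from the exponent field (bias 1023 + 1, shifted up 52 bits),
 * and 2^w ~ 1.3591 comes from the polynomial, giving 2 * 1.3591 ~ 2.71828 = e.
 */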
static inline __m128d
gmx_mm_exp_pd(__m128d x)
{
    const __m128d argscale = _mm_set1_pd(1.442695040888963387);
    /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -1022. */
    const __m128d arglimit = _mm_set1_pd(-1022.0/1.442695040888963387);
    const __m128i expbase  = _mm_set1_epi32(1023);

    const __m128d CA0 = _mm_set1_pd(7.0372789822689374920e-9);
    const __m128d CB0 = _mm_set1_pd(89.491964762085371);
    const __m128d CB1 = _mm_set1_pd(-9.7373870675164587);
    const __m128d CC0 = _mm_set1_pd(51.247261867992408);
    const __m128d CC1 = _mm_set1_pd(-0.184020268133945);
    const __m128d CD0 = _mm_set1_pd(36.82070153762337);
    const __m128d CD1 = _mm_set1_pd(5.416849282638991);
    const __m128d CE0 = _mm_set1_pd(30.34003452248759);
    const __m128d CE1 = _mm_set1_pd(8.726173289493301);
    const __m128d CF0 = _mm_set1_pd(27.73526969472330);
    const __m128d CF1 = _mm_set1_pd(10.284755658866532);

    __m128d valuemask;
    __m128i iexppart;
    __m128d fexppart;
    __m128d intpart;
    __m128d z,z2;
    __m128d factB,factC,factD,factE,factF;

    z        = _mm_mul_pd(x,argscale);
    iexppart = _mm_cvtpd_epi32(z);

#ifdef GMX_SSE4
    /* This reduces latency and speeds up the code by roughly 5% when supported */
    intpart  = _mm_round_pd(z,0);
#else
    intpart  = _mm_cvtepi32_pd(iexppart);
#endif

    /* The two lowest elements of iexppart now contain 32-bit numbers with a correctly biased exponent.
     * To be able to shift it into the exponent for a double precision number we first need to
     * shuffle so that the lower half contains the first element, and the upper half the second.
     * This should really be done as a zero-extension, but since the next instructions will shift
     * the registers left by 52 bits it doesn't matter what we put there - it will be shifted out.
     * (Thus we just use element 2 from iexppart.)
     */
    iexppart  = _mm_shuffle_epi32(iexppart,_MM_SHUFFLE(2,1,2,0));

    /* Do the shift operation on the 64-bit registers */
    iexppart  = _mm_add_epi32(iexppart,expbase);
    iexppart  = _mm_slli_epi64(iexppart,52);
    valuemask = _mm_cmpgt_pd(x,arglimit);

    z         = _mm_sub_pd(z,intpart);
    z2        = _mm_mul_pd(z,z);

    fexppart  = _mm_and_pd(valuemask,gmx_mm_castsi128_pd(iexppart));

    /* Since SSE floating-point has relatively high latency it is faster to do
     * factorized polynomial summation with independent terms than using alternating add/multiply, i.e.
     * p(z) = A0 * (B0 + z) * (C0 + C1*z + z^2) * (D0 + D1*z + z^2) * (E0 + E1*z + z^2) * (F0 + F1*z + z^2)
     */
    factB = _mm_add_pd(CB0,_mm_mul_pd(CB1,z) );
    factB = _mm_add_pd(factB,z2);
    factC = _mm_add_pd(CC0,_mm_mul_pd(CC1,z) );
    factC = _mm_add_pd(factC,z2);
    factD = _mm_add_pd(CD0,_mm_mul_pd(CD1,z) );
    factD = _mm_add_pd(factD,z2);
    factE = _mm_add_pd(CE0,_mm_mul_pd(CE1,z) );
    factE = _mm_add_pd(factE,z2);
    factF = _mm_add_pd(CF0,_mm_mul_pd(CF1,z) );
    factF = _mm_add_pd(factF,z2);

    z     = _mm_mul_pd(CA0,fexppart);
    factB = _mm_mul_pd(factB,factC);
    factD = _mm_mul_pd(factD,factE);
    z     = _mm_mul_pd(z,factF);
    factB = _mm_mul_pd(factB,factD);
    z     = _mm_mul_pd(z,factB);

    /* Currently uses 32 actual (real, not including casts) SSE instructions */
    return z;
}
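
/* Usage sketch (added; not part of the original API): summing exp() over an
 * even-length array, two doubles per iteration:
 *
 *   static double sum_exp(const double *a, int n)   // n assumed even
 *   {
 *       __m128d acc = _mm_setzero_pd();
 *       int     i;
 *       for (i = 0; i < n; i += 2)
 *       {
 *           acc = _mm_add_pd(acc, gmx_mm_exp_pd(_mm_loadu_pd(a+i)));
 *       }
 *       acc = _mm_add_sd(acc, _mm_unpackhi_pd(acc,acc));
 *       return _mm_cvtsd_f64(acc);
 *   }
 */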
static inline __m128d
gmx_mm_log_pd(__m128d x)
{
    /* Same algorithm as cephes library */
    const __m128d expmask    = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000) );

    const __m128i expbase_m1 = _mm_set1_epi32(1023-1); /* We want non-IEEE format */
    const __m128d half       = _mm_set1_pd(0.5);
    const __m128d one        = _mm_set1_pd(1.0);
    const __m128i itwo       = _mm_set1_epi32(2);
    const __m128d invsq2     = _mm_set1_pd(1.0/sqrt(2.0));

    const __m128d corr1      = _mm_set1_pd(-2.121944400546905827679e-4);
    const __m128d corr2      = _mm_set1_pd(0.693359375);

    const __m128d P5         = _mm_set1_pd(1.01875663804580931796E-4);
    const __m128d P4         = _mm_set1_pd(4.97494994976747001425E-1);
    const __m128d P3         = _mm_set1_pd(4.70579119878881725854E0);
    const __m128d P2         = _mm_set1_pd(1.44989225341610930846E1);
    const __m128d P1         = _mm_set1_pd(1.79368678507819816313E1);
    const __m128d P0         = _mm_set1_pd(7.70838733755885391666E0);

    const __m128d Q4         = _mm_set1_pd(1.12873587189167450590E1);
    const __m128d Q3         = _mm_set1_pd(4.52279145837532221105E1);
    const __m128d Q2         = _mm_set1_pd(8.29875266912776603211E1);
    const __m128d Q1         = _mm_set1_pd(7.11544750618563894466E1);
    const __m128d Q0         = _mm_set1_pd(2.31251620126765340583E1);

    const __m128d R2         = _mm_set1_pd(-7.89580278884799154124E-1);
    const __m128d R1         = _mm_set1_pd(1.63866645699558079767E1);
    const __m128d R0         = _mm_set1_pd(-6.41409952958715622951E1);

    const __m128d S2         = _mm_set1_pd(-3.56722798256324312549E1);
    const __m128d S1         = _mm_set1_pd(3.12093766372244180303E2);
    const __m128d S0         = _mm_set1_pd(-7.69691943550460008604E2);

    __m128d fexp;
    __m128i iexp,iexp1,signbit,iexpabs;
    __m128i imask1;
    __m128d mask1,mask2;
    __m128d corr,t1,t2,q;
    __m128d zA,yA,xA,zB,yB,xB,z;
    __m128d polyR,polyS;
    __m128d polyP1,polyP2,polyQ1,polyQ2;

    /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
    fexp   = _mm_and_pd(x,expmask);
    iexp   = gmx_mm_castpd_si128(fexp);
    iexp   = _mm_srli_epi64(iexp,52);
    iexp   = _mm_sub_epi32(iexp,expbase_m1);
    iexp   = _mm_shuffle_epi32(iexp, _MM_SHUFFLE(1,1,2,0) );

    x      = _mm_andnot_pd(expmask,x);
    x      = _mm_or_pd(x,one);
    x      = _mm_mul_pd(x,half);

    signbit = _mm_srai_epi32(iexp,31);
    iexpabs = _mm_sub_epi32(_mm_xor_si128(iexp,signbit),signbit);

    imask1 = _mm_cmpgt_epi32( iexpabs, itwo );
    mask2  = _mm_cmplt_pd(x,invsq2);

    fexp   = _mm_cvtepi32_pd(iexp);
    corr   = _mm_and_pd(mask2,one);
    fexp   = _mm_sub_pd(fexp,corr);

    /* If mask1 is set ('A') */
    zA     = _mm_sub_pd(x,half);
    t1     = _mm_or_pd( _mm_andnot_pd(mask2,zA), _mm_and_pd(mask2,x) );
    zA     = _mm_sub_pd(t1,half);
    t2     = _mm_or_pd( _mm_andnot_pd(mask2,x), _mm_and_pd(mask2,zA) );
    yA     = _mm_mul_pd(half,_mm_add_pd(t2,one));

    xA     = _mm_mul_pd(zA,gmx_mm_inv_pd(yA));
    zA     = _mm_mul_pd(xA,xA);

    polyR  = _mm_mul_pd(R2,zA);
    polyR  = _mm_add_pd(polyR,R1);
    polyR  = _mm_mul_pd(polyR,zA);
    polyR  = _mm_add_pd(polyR,R0);

    polyS  = _mm_add_pd(zA,S2);
    polyS  = _mm_mul_pd(polyS,zA);
    polyS  = _mm_add_pd(polyS,S1);
    polyS  = _mm_mul_pd(polyS,zA);
    polyS  = _mm_add_pd(polyS,S0);

    q      = _mm_mul_pd(polyR,gmx_mm_inv_pd(polyS));
    zA     = _mm_mul_pd(_mm_mul_pd(xA,zA),q);

    zA     = _mm_add_pd(zA,_mm_mul_pd(corr1,fexp));
    zA     = _mm_add_pd(zA,xA);
    zA     = _mm_add_pd(zA,_mm_mul_pd(corr2,fexp));

    /* If mask1 is not set ('B') */
    corr   = _mm_and_pd(mask2,x);
    xB     = _mm_add_pd(x,corr);
    xB     = _mm_sub_pd(xB,one);
    zB     = _mm_mul_pd(xB,xB);

    polyP1 = _mm_mul_pd(P5,zB);
    polyP2 = _mm_mul_pd(P4,zB);
    polyP1 = _mm_add_pd(polyP1,P3);
    polyP2 = _mm_add_pd(polyP2,P2);
    polyP1 = _mm_mul_pd(polyP1,zB);
    polyP2 = _mm_mul_pd(polyP2,zB);
    polyP1 = _mm_add_pd(polyP1,P1);
    polyP2 = _mm_add_pd(polyP2,P0);
    polyP1 = _mm_mul_pd(polyP1,xB);
    polyP1 = _mm_add_pd(polyP1,polyP2);

    polyQ2 = _mm_mul_pd(Q4,zB);
    polyQ1 = _mm_add_pd(zB,Q3);
    polyQ2 = _mm_add_pd(polyQ2,Q2);
    polyQ1 = _mm_mul_pd(polyQ1,zB);
    polyQ2 = _mm_mul_pd(polyQ2,zB);
    polyQ1 = _mm_add_pd(polyQ1,Q1);
    polyQ2 = _mm_add_pd(polyQ2,Q0);
    polyQ1 = _mm_mul_pd(polyQ1,xB);
    polyQ1 = _mm_add_pd(polyQ1,polyQ2);

    q      = _mm_mul_pd(polyP1,gmx_mm_inv_pd(polyQ1));
    yB     = _mm_mul_pd(_mm_mul_pd(xB,zB),q);

    yB     = _mm_add_pd(yB,_mm_mul_pd(corr1,fexp));
    yB     = _mm_sub_pd(yB,_mm_mul_pd(half,zB));
    zB     = _mm_add_pd(xB,yB);
    zB     = _mm_add_pd(zB,_mm_mul_pd(corr2,fexp));

    /* Select the 'A' or 'B' result for each element */
    mask1  = gmx_mm_castsi128_pd( _mm_shuffle_epi32(imask1, _MM_SHUFFLE(1,1,0,0)) );
    z      = _mm_or_pd( _mm_and_pd(mask1,zA), _mm_andnot_pd(mask1,zB) );

    return z;
}
static inline void
gmx_mm_sincos_pd(__m128d x,
                 __m128d *sinval,
                 __m128d *cosval)
{
#ifdef _MSC_VER
    __declspec(align(16))
    const double sintable[34] =
    {
        1.00000000000000000e+00 , 0.00000000000000000e+00 ,
        9.95184726672196929e-01 , 9.80171403295606036e-02 ,
        9.80785280403230431e-01 , 1.95090322016128248e-01 ,
        9.56940335732208824e-01 , 2.90284677254462331e-01 ,
        9.23879532511286738e-01 , 3.82683432365089782e-01 ,
        8.81921264348355050e-01 , 4.71396736825997642e-01 ,
        8.31469612302545236e-01 , 5.55570233019602178e-01 ,
        7.73010453362736993e-01 , 6.34393284163645488e-01 ,
        7.07106781186547573e-01 , 7.07106781186547462e-01 ,
        6.34393284163645599e-01 , 7.73010453362736882e-01 ,
        5.55570233019602289e-01 , 8.31469612302545125e-01 ,
        4.71396736825997809e-01 , 8.81921264348354939e-01 ,
        3.82683432365089837e-01 , 9.23879532511286738e-01 ,
        2.90284677254462276e-01 , 9.56940335732208935e-01 ,
        1.95090322016128304e-01 , 9.80785280403230431e-01 ,
        9.80171403295607702e-02 , 9.95184726672196818e-01 ,
        6.12323399573676604e-17 , 1.00000000000000000e+00
    };
#else
    const __m128d sintable[17] =
    {
        _mm_set_pd( sin(  0.0 * (M_PI/2.0) / 16.0) , cos(  0.0 * (M_PI/2.0) / 16.0) ),
        _mm_set_pd( sin(  1.0 * (M_PI/2.0) / 16.0) , cos(  1.0 * (M_PI/2.0) / 16.0) ),
        _mm_set_pd( sin(  2.0 * (M_PI/2.0) / 16.0) , cos(  2.0 * (M_PI/2.0) / 16.0) ),
        _mm_set_pd( sin(  3.0 * (M_PI/2.0) / 16.0) , cos(  3.0 * (M_PI/2.0) / 16.0) ),
        _mm_set_pd( sin(  4.0 * (M_PI/2.0) / 16.0) , cos(  4.0 * (M_PI/2.0) / 16.0) ),
        _mm_set_pd( sin(  5.0 * (M_PI/2.0) / 16.0) , cos(  5.0 * (M_PI/2.0) / 16.0) ),
        _mm_set_pd( sin(  6.0 * (M_PI/2.0) / 16.0) , cos(  6.0 * (M_PI/2.0) / 16.0) ),
        _mm_set_pd( sin(  7.0 * (M_PI/2.0) / 16.0) , cos(  7.0 * (M_PI/2.0) / 16.0) ),
        _mm_set_pd( sin(  8.0 * (M_PI/2.0) / 16.0) , cos(  8.0 * (M_PI/2.0) / 16.0) ),
        _mm_set_pd( sin(  9.0 * (M_PI/2.0) / 16.0) , cos(  9.0 * (M_PI/2.0) / 16.0) ),
        _mm_set_pd( sin( 10.0 * (M_PI/2.0) / 16.0) , cos( 10.0 * (M_PI/2.0) / 16.0) ),
        _mm_set_pd( sin( 11.0 * (M_PI/2.0) / 16.0) , cos( 11.0 * (M_PI/2.0) / 16.0) ),
        _mm_set_pd( sin( 12.0 * (M_PI/2.0) / 16.0) , cos( 12.0 * (M_PI/2.0) / 16.0) ),
        _mm_set_pd( sin( 13.0 * (M_PI/2.0) / 16.0) , cos( 13.0 * (M_PI/2.0) / 16.0) ),
        _mm_set_pd( sin( 14.0 * (M_PI/2.0) / 16.0) , cos( 14.0 * (M_PI/2.0) / 16.0) ),
        _mm_set_pd( sin( 15.0 * (M_PI/2.0) / 16.0) , cos( 15.0 * (M_PI/2.0) / 16.0) ),
        _mm_set_pd( sin( 16.0 * (M_PI/2.0) / 16.0) , cos( 16.0 * (M_PI/2.0) / 16.0) )
    };
#endif

    const __m128d signmask    = _mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF) );
    const __m128d tabscale    = _mm_set1_pd(32.0/M_PI);
    const __m128d invtabscale = _mm_set1_pd(M_PI/32.0);
    const __m128d one         = _mm_set1_pd(1.0);
    const __m128i i32         = _mm_set1_epi32(32);
    const __m128i i16         = _mm_set1_epi32(16);
    const __m128i tabmask     = _mm_set1_epi32(0x3F);
    const __m128d sinP7       = _mm_set1_pd(-1.9835958374985742404167983310657359E-4);
    const __m128d sinP5       = _mm_set1_pd(8.3333330133863188710910904896670286347068944E-3);
    const __m128d sinP3       = _mm_set1_pd(-1.66666666666049927649121240304808461825E-1);
    const __m128d sinP1       = _mm_set1_pd(9.99999999999999814240922423058227089729828E-1);

    const __m128d cosP6       = _mm_set1_pd(-1.3884108697213500852322439746988228E-3);
    const __m128d cosP4       = _mm_set1_pd(4.16666637872444585215198198619627956E-2);
    const __m128d cosP2       = _mm_set1_pd(-4.999999999944495616220671189143471E-1);
    const __m128d cosP0       = _mm_set1_pd(9.999999999999983282334075742852867E-1);

    __m128d scalex;
    __m128i tabidx,corridx;
    __m128d xabs,z,z2,polySin,polyCos;
    __m128d xpoint;
    __m128d ypoint0,ypoint1;
    __m128d sinpoint,cospoint;
    __m128d xsign,ssign,csign;
    __m128i imask,sswapsign,cswapsign;
    __m128d minusone;

    xsign  = _mm_andnot_pd(signmask,x);
    xabs   = _mm_and_pd(x,signmask);

    scalex = _mm_mul_pd(tabscale,xabs);
    tabidx = _mm_cvtpd_epi32(scalex);
    xpoint = _mm_cvtepi32_pd(tabidx);
    xpoint = _mm_mul_pd(xpoint,invtabscale);
    z      = _mm_sub_pd(xabs, xpoint);

    /* Range reduction to 0..2*Pi */
    tabidx = _mm_and_si128(tabidx,tabmask);

    /* tabidx is now in range [0,..,64] */
    imask     = _mm_cmpgt_epi32(tabidx,i32);
    sswapsign = imask;
    cswapsign = imask;
    corridx   = _mm_and_si128(imask,i32);
    tabidx    = _mm_sub_epi32(tabidx,corridx);

    /* tabidx is now in range [0..32] */
    imask     = _mm_cmpgt_epi32(tabidx,i16);
    cswapsign = _mm_xor_si128(cswapsign,imask);
    corridx   = _mm_sub_epi32(i32,tabidx);
    tabidx    = _mm_or_si128( _mm_and_si128(imask,corridx), _mm_andnot_si128(imask,tabidx) );

    /* tabidx is now in range [0..16] */
    sswapsign = _mm_shuffle_epi32(sswapsign,_MM_SHUFFLE(1,1,0,0));
    cswapsign = _mm_shuffle_epi32(cswapsign,_MM_SHUFFLE(1,1,0,0));
    minusone  = _mm_sub_pd(_mm_setzero_pd(),one);

    ssign     = _mm_or_pd(_mm_and_pd( _mm_castsi128_pd(sswapsign),minusone ),
                          _mm_andnot_pd( _mm_castsi128_pd(sswapsign),one ));
    csign     = _mm_or_pd(_mm_and_pd( _mm_castsi128_pd(cswapsign),minusone ),
                          _mm_andnot_pd( _mm_castsi128_pd(cswapsign),one ));

    /* First lookup into table */
#ifdef _MSC_VER
    ypoint0 = _mm_load_pd(sintable + 2*gmx_mm_extract_epi32(tabidx,0));
    ypoint1 = _mm_load_pd(sintable + 2*gmx_mm_extract_epi32(tabidx,1));
#else
    ypoint0 = sintable[gmx_mm_extract_epi32(tabidx,0)];
    ypoint1 = sintable[gmx_mm_extract_epi32(tabidx,1)];
#endif
    sinpoint = _mm_unpackhi_pd(ypoint0,ypoint1);
    cospoint = _mm_unpacklo_pd(ypoint0,ypoint1);

    sinpoint = _mm_mul_pd(sinpoint,ssign);
    cospoint = _mm_mul_pd(cospoint,csign);

    z2      = _mm_mul_pd(z,z);

    polySin = _mm_mul_pd(sinP7,z2);
    polySin = _mm_add_pd(polySin,sinP5);
    polySin = _mm_mul_pd(polySin,z2);
    polySin = _mm_add_pd(polySin,sinP3);
    polySin = _mm_mul_pd(polySin,z2);
    polySin = _mm_add_pd(polySin,sinP1);
    polySin = _mm_mul_pd(polySin,z);

    polyCos = _mm_mul_pd(cosP6,z2);
    polyCos = _mm_add_pd(polyCos,cosP4);
    polyCos = _mm_mul_pd(polyCos,z2);
    polyCos = _mm_add_pd(polyCos,cosP2);
    polyCos = _mm_mul_pd(polyCos,z2);
    polyCos = _mm_add_pd(polyCos,cosP0);

    *sinval = _mm_xor_pd(_mm_add_pd( _mm_mul_pd(sinpoint,polyCos) , _mm_mul_pd(cospoint,polySin) ),xsign);
    *cosval = _mm_sub_pd( _mm_mul_pd(cospoint,polyCos) , _mm_mul_pd(sinpoint,polySin) );
}
static inline __m128d
gmx_mm_tan_pd(__m128d x)
{
    __m128d sinval,cosval;
    __m128d tanval;

    gmx_mm_sincos_pd(x,&sinval,&cosval);

    tanval = _mm_mul_pd(sinval,gmx_mm_inv_pd(cosval));

    return tanval;
}
static inline __m128d
gmx_mm_asin_pd(__m128d x)
{
    /* Same algorithm as cephes library */
    const __m128d signmask  = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF) );
    const __m128d limit1    = _mm_set1_pd(0.625);
    const __m128d limit2    = _mm_set1_pd(1e-8);
    const __m128d one       = _mm_set1_pd(1.0);
    const __m128d halfpi    = _mm_set1_pd(M_PI/2.0);
    const __m128d quarterpi = _mm_set1_pd(M_PI/4.0);
    const __m128d morebits  = _mm_set1_pd(6.123233995736765886130E-17);

    const __m128d P5        = _mm_set1_pd(4.253011369004428248960E-3);
    const __m128d P4        = _mm_set1_pd(-6.019598008014123785661E-1);
    const __m128d P3        = _mm_set1_pd(5.444622390564711410273E0);
    const __m128d P2        = _mm_set1_pd(-1.626247967210700244449E1);
    const __m128d P1        = _mm_set1_pd(1.956261983317594739197E1);
    const __m128d P0        = _mm_set1_pd(-8.198089802484824371615E0);

    const __m128d Q4        = _mm_set1_pd(-1.474091372988853791896E1);
    const __m128d Q3        = _mm_set1_pd(7.049610280856842141659E1);
    const __m128d Q2        = _mm_set1_pd(-1.471791292232726029859E2);
    const __m128d Q1        = _mm_set1_pd(1.395105614657485689735E2);
    const __m128d Q0        = _mm_set1_pd(-4.918853881490881290097E1);

    const __m128d R4        = _mm_set1_pd(2.967721961301243206100E-3);
    const __m128d R3        = _mm_set1_pd(-5.634242780008963776856E-1);
    const __m128d R2        = _mm_set1_pd(6.968710824104713396794E0);
    const __m128d R1        = _mm_set1_pd(-2.556901049652824852289E1);
    const __m128d R0        = _mm_set1_pd(2.853665548261061424989E1);

    const __m128d S3        = _mm_set1_pd(-2.194779531642920639778E1);
    const __m128d S2        = _mm_set1_pd(1.470656354026814941758E2);
    const __m128d S1        = _mm_set1_pd(-3.838770957603691357202E2);
    const __m128d S0        = _mm_set1_pd(3.424398657913078477438E2);

    __m128d sign;
    __m128d mask;
    __m128d xabs;
    __m128d zz,ww,z,q,w,y,zz2,ww2;
    __m128d PA,PB;
    __m128d QA,QB;
    __m128d RA,RB;
    __m128d SA,SB;
    __m128d nom,denom;

    sign  = _mm_andnot_pd(signmask,x);
    xabs  = _mm_and_pd(x,signmask);

    mask  = _mm_cmpgt_pd(xabs,limit1);

    zz    = _mm_sub_pd(one,xabs);
    ww    = _mm_mul_pd(xabs,xabs);
    zz2   = _mm_mul_pd(zz,zz);
    ww2   = _mm_mul_pd(ww,ww);

    /* R */
    RA    = _mm_mul_pd(R4,zz2);
    RB    = _mm_mul_pd(R3,zz2);
    RA    = _mm_add_pd(RA,R2);
    RB    = _mm_add_pd(RB,R1);
    RA    = _mm_mul_pd(RA,zz2);
    RB    = _mm_mul_pd(RB,zz);
    RA    = _mm_add_pd(RA,R0);
    RA    = _mm_add_pd(RA,RB);

    /* S */
    SB    = _mm_mul_pd(S3,zz2);
    SA    = _mm_add_pd(zz2,S2);
    SB    = _mm_add_pd(SB,S1);
    SA    = _mm_mul_pd(SA,zz2);
    SB    = _mm_mul_pd(SB,zz);
    SA    = _mm_add_pd(SA,S0);
    SA    = _mm_add_pd(SA,SB);

    /* P */
    PA    = _mm_mul_pd(P5,ww2);
    PB    = _mm_mul_pd(P4,ww2);
    PA    = _mm_add_pd(PA,P3);
    PB    = _mm_add_pd(PB,P2);
    PA    = _mm_mul_pd(PA,ww2);
    PB    = _mm_mul_pd(PB,ww2);
    PA    = _mm_add_pd(PA,P1);
    PB    = _mm_add_pd(PB,P0);
    PA    = _mm_mul_pd(PA,ww);
    PA    = _mm_add_pd(PA,PB);

    /* Q */
    QB    = _mm_mul_pd(Q4,ww2);
    QA    = _mm_add_pd(ww2,Q3);
    QB    = _mm_add_pd(QB,Q2);
    QA    = _mm_mul_pd(QA,ww2);
    QB    = _mm_mul_pd(QB,ww2);
    QA    = _mm_add_pd(QA,Q1);
    QB    = _mm_add_pd(QB,Q0);
    QA    = _mm_mul_pd(QA,ww);
    QA    = _mm_add_pd(QA,QB);

    RA    = _mm_mul_pd(RA,zz);
    PA    = _mm_mul_pd(PA,ww);

    nom   = _mm_or_pd( _mm_and_pd(mask,RA) , _mm_andnot_pd(mask,PA) );
    denom = _mm_or_pd( _mm_and_pd(mask,SA) , _mm_andnot_pd(mask,QA) );

    q     = _mm_mul_pd( nom, gmx_mm_inv_pd(denom) );

    zz    = _mm_add_pd(zz,zz);
    zz    = gmx_mm_sqrt_pd(zz);
    z     = _mm_sub_pd(quarterpi,zz);
    zz    = _mm_mul_pd(zz,q);
    zz    = _mm_sub_pd(zz,morebits);
    z     = _mm_sub_pd(z,zz);
    z     = _mm_add_pd(z,quarterpi);

    w     = _mm_mul_pd(xabs,q);
    w     = _mm_add_pd(w,xabs);

    z     = _mm_or_pd( _mm_and_pd(mask,z) , _mm_andnot_pd(mask,w) );

    mask  = _mm_cmpgt_pd(xabs,limit2);
    z     = _mm_or_pd( _mm_and_pd(mask,z) , _mm_andnot_pd(mask,xabs) );

    z     = _mm_xor_pd(z,sign);

    return z;
}
static inline __m128d
gmx_mm_acos_pd(__m128d x)
{
    const __m128d signmask  = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF) );
    const __m128d one_pd    = _mm_set1_pd(1.0);
    const __m128d half_pd   = _mm_set1_pd(0.5);
    const __m128d pi_pd     = _mm_set1_pd(M_PI);
    const __m128d halfpi_pd = _mm_set1_pd(M_PI/2.0);

    __m128d mask1;
    __m128d mask2;
    __m128d xabs;
    __m128d z,z1,z2,z3;

    xabs  = _mm_and_pd(x,signmask);
    mask1 = _mm_cmpgt_pd(xabs,half_pd);
    mask2 = _mm_cmpgt_pd(x,_mm_setzero_pd());

    z     = _mm_mul_pd(half_pd,_mm_sub_pd(one_pd,xabs));
    z     = _mm_mul_pd(z,gmx_mm_invsqrt_pd(z));
    z     = _mm_andnot_pd(_mm_cmpeq_pd(xabs,one_pd),z);

    z     = _mm_or_pd( _mm_and_pd(mask1,z) , _mm_andnot_pd(mask1,x) );
    z     = gmx_mm_asin_pd(z);

    z2    = _mm_add_pd(z,z);
    z1    = _mm_sub_pd(pi_pd,z2);
    z3    = _mm_sub_pd(halfpi_pd,z);

    z     = _mm_or_pd( _mm_and_pd(mask2,z2) , _mm_andnot_pd(mask2,z1) );
    z     = _mm_or_pd( _mm_and_pd(mask1,z) , _mm_andnot_pd(mask1,z3) );

    return z;
}
static inline __m128d
gmx_mm_atan_pd(__m128d x)
{
    /* Same algorithm as cephes library */
    const __m128d signmask  = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF) );
    const __m128d limit1    = _mm_set1_pd(0.66);
    const __m128d limit2    = _mm_set1_pd(2.41421356237309504880);
    const __m128d quarterpi = _mm_set1_pd(M_PI/4.0);
    const __m128d halfpi    = _mm_set1_pd(M_PI/2.0);
    const __m128d mone      = _mm_set1_pd(-1.0);
    const __m128d morebits1 = _mm_set1_pd(0.5*6.123233995736765886130E-17);
    const __m128d morebits2 = _mm_set1_pd(6.123233995736765886130E-17);

    const __m128d P4        = _mm_set1_pd(-8.750608600031904122785E-1);
    const __m128d P3        = _mm_set1_pd(-1.615753718733365076637E1);
    const __m128d P2        = _mm_set1_pd(-7.500855792314704667340E1);
    const __m128d P1        = _mm_set1_pd(-1.228866684490136173410E2);
    const __m128d P0        = _mm_set1_pd(-6.485021904942025371773E1);

    const __m128d Q4        = _mm_set1_pd(2.485846490142306297962E1);
    const __m128d Q3        = _mm_set1_pd(1.650270098316988542046E2);
    const __m128d Q2        = _mm_set1_pd(4.328810604912902668951E2);
    const __m128d Q1        = _mm_set1_pd(4.853903996359136964868E2);
    const __m128d Q0        = _mm_set1_pd(1.945506571482613964425E2);

    __m128d sign;
    __m128d mask1,mask2;
    __m128d y,t1,t2;
    __m128d z,z2;
    __m128d P_A,P_B,Q_A,Q_B;

    sign  = _mm_andnot_pd(signmask,x);
    x     = _mm_and_pd(x,signmask);

    mask1 = _mm_cmpgt_pd(x,limit1);
    mask2 = _mm_cmpgt_pd(x,limit2);

    t1    = _mm_mul_pd(_mm_add_pd(x,mone),gmx_mm_inv_pd(_mm_sub_pd(x,mone)));
    t2    = _mm_mul_pd(mone,gmx_mm_inv_pd(x));

    y     = _mm_and_pd(mask1,quarterpi);
    y     = _mm_or_pd( _mm_and_pd(mask2,halfpi) , _mm_andnot_pd(mask2,y) );

    x     = _mm_or_pd( _mm_and_pd(mask1,t1) , _mm_andnot_pd(mask1,x) );
    x     = _mm_or_pd( _mm_and_pd(mask2,t2) , _mm_andnot_pd(mask2,x) );

    z     = _mm_mul_pd(x,x);
    z2    = _mm_mul_pd(z,z);

    P_A   = _mm_mul_pd(P4,z2);
    P_B   = _mm_mul_pd(P3,z2);
    P_A   = _mm_add_pd(P_A,P2);
    P_B   = _mm_add_pd(P_B,P1);
    P_A   = _mm_mul_pd(P_A,z2);
    P_B   = _mm_mul_pd(P_B,z);
    P_A   = _mm_add_pd(P_A,P0);
    P_A   = _mm_add_pd(P_A,P_B);

    Q_B   = _mm_mul_pd(Q4,z2);
    Q_A   = _mm_add_pd(z2,Q3);
    Q_B   = _mm_add_pd(Q_B,Q2);
    Q_A   = _mm_mul_pd(Q_A,z2);
    Q_B   = _mm_mul_pd(Q_B,z2);
    Q_A   = _mm_add_pd(Q_A,Q1);
    Q_B   = _mm_add_pd(Q_B,Q0);
    Q_A   = _mm_mul_pd(Q_A,z);
    Q_A   = _mm_add_pd(Q_A,Q_B);

    z     = _mm_mul_pd(z,P_A);
    z     = _mm_mul_pd(z,gmx_mm_inv_pd(Q_A));
    z     = _mm_mul_pd(z,x);
    z     = _mm_add_pd(z,x);

    t1    = _mm_and_pd(mask1,morebits1);
    t1    = _mm_or_pd( _mm_and_pd(mask2,morebits2) , _mm_andnot_pd(mask2,t1) );

    z     = _mm_add_pd(z,t1);
    y     = _mm_add_pd(y,z);

    y     = _mm_xor_pd(y,sign);

    return y;
}
static inline __m128d
gmx_mm_atan2_pd(__m128d y, __m128d x)
{
    const __m128d pi          = _mm_set1_pd(M_PI);
    const __m128d minuspi     = _mm_set1_pd(-M_PI);
    const __m128d halfpi      = _mm_set1_pd(M_PI/2.0);
    const __m128d minushalfpi = _mm_set1_pd(-M_PI/2.0);

    __m128d z,z1,z3,z4;
    __m128d w;
    __m128d maskx_lt,maskx_eq;
    __m128d masky_lt,masky_eq;
    __m128d mask1,mask2,mask3,mask4,maskall;

    maskx_lt = _mm_cmplt_pd(x,_mm_setzero_pd());
    masky_lt = _mm_cmplt_pd(y,_mm_setzero_pd());
    maskx_eq = _mm_cmpeq_pd(x,_mm_setzero_pd());
    masky_eq = _mm_cmpeq_pd(y,_mm_setzero_pd());

    z        = _mm_mul_pd(y,gmx_mm_inv_pd(x));
    z        = gmx_mm_atan_pd(z);

    mask1    = _mm_and_pd(maskx_eq,masky_lt);
    mask2    = _mm_andnot_pd(maskx_lt,masky_eq);
    mask3    = _mm_andnot_pd( _mm_or_pd(masky_lt,masky_eq) , maskx_eq);
    mask4    = _mm_and_pd(masky_eq,maskx_lt);

    maskall  = _mm_or_pd( _mm_or_pd(mask1,mask2), _mm_or_pd(mask3,mask4) );

    z        = _mm_andnot_pd(maskall,z);
    z1       = _mm_and_pd(mask1,minushalfpi);
    z3       = _mm_and_pd(mask3,halfpi);
    z4       = _mm_and_pd(mask4,pi);

    z        = _mm_or_pd( _mm_or_pd(z,z1), _mm_or_pd(z3,z4) );

    mask1    = _mm_andnot_pd(masky_lt,maskx_lt);
    mask2    = _mm_and_pd(maskx_lt,masky_lt);

    w        = _mm_or_pd( _mm_and_pd(mask1,pi), _mm_and_pd(mask2,minuspi) );
    w        = _mm_andnot_pd(maskall,w);

    z        = _mm_add_pd(z,w);

    return z;
}
#define GMX_MM_TRANSPOSE2_PD(row0, row1) {               \
    __m128d __gmx_t1 = row0;                             \
    row0 = _mm_unpacklo_pd(row0,row1);                   \
    row1 = _mm_unpackhi_pd(__gmx_t1,row1);               \
}
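
/* Illustration (added): GMX_MM_TRANSPOSE2_PD turns two row vectors into two
 * column vectors, i.e. with row0 = {a0,a1} and row1 = {b0,b1}:
 *
 *   GMX_MM_TRANSPOSE2_PD(row0,row1);
 *   // row0 = {a0,b0}, row1 = {a1,b1}
 */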
#define GMX_MM_LOAD_2VALUES_PD(ptr1,ptr2,xmm1)           \
{                                                        \
    __m128d _txmm2;                                      \
    xmm1   = _mm_load_sd(ptr1);                          \
    _txmm2 = _mm_load_sd(ptr2);                          \
    xmm1   = _mm_unpacklo_pd(xmm1,_txmm2);               \
}

#define GMX_MM_LOAD_1VALUE_PD(ptr1,xmm1)                 \
{                                                        \
    xmm1 = _mm_load_sd(ptr1);                            \
}

#define GMX_MM_STORE_2VALUES_PD(ptr1,ptr2,xmm1)          \
{                                                        \
    __m128d _txmm2;                                      \
    _txmm2 = _mm_unpackhi_pd(xmm1,xmm1);                 \
    _mm_store_sd(ptr1,xmm1);                             \
    _mm_store_sd(ptr2,_txmm2);                           \
}

#define GMX_MM_STORE_1VALUE_PD(ptr1,xmm1)                \
{                                                        \
    _mm_store_sd(ptr1,xmm1);                             \
}

#define GMX_MM_INCREMENT_2VALUES_PD(ptr1,ptr2,xmm1)      \
{                                                        \
    __m128d _tincr;                                      \
    GMX_MM_LOAD_2VALUES_PD(ptr1,ptr2,_tincr);            \
    _tincr = _mm_add_pd(_tincr,xmm1);                    \
    GMX_MM_STORE_2VALUES_PD(ptr1,ptr2,_tincr);           \
}

#define GMX_MM_INCREMENT_1VALUE_PD(ptr1,xmm1)            \
{                                                        \
    __m128d _tincr;                                      \
    GMX_MM_LOAD_1VALUE_PD(ptr1,_tincr);                  \
    _tincr = _mm_add_sd(_tincr,xmm1);                    \
    GMX_MM_STORE_1VALUE_PD(ptr1,_tincr);                 \
}

#define GMX_MM_LOAD_2PAIRS_PD(ptr1,ptr2,c6,c12)          \
{                                                        \
    __m128d _tmp1,_tmp2;                                 \
    _tmp1 = _mm_load_pd(ptr1);                           \
    _tmp2 = _mm_load_pd(ptr2);                           \
    c6    = _mm_unpacklo_pd(_tmp1,_tmp2);                \
    c12   = _mm_unpackhi_pd(_tmp1,_tmp2);                \
}

#define GMX_MM_LOAD_1PAIR_PD(ptr1,c6,c12)                \
{                                                        \
    c6  = _mm_load_sd(ptr1);                             \
    c12 = _mm_load_sd(ptr1+1);                           \
}
/* Routines to load 1-4 rvecs from 1-2 places.
 * We mainly use these to load coordinates. The extra routines
 * are very efficient for the water-water loops, since we e.g.
 * know that a TIP4P water has 4 atoms, so we should load 12 doubles from each pointer and shuffle.
 */
#define GMX_MM_LOAD_1RVEC_1POINTER_PD(ptr1,jx1,jy1,jz1) { \
    jx1 = _mm_load_sd(ptr1);                              \
    jy1 = _mm_load_sd((ptr1)+1);                          \
    jz1 = _mm_load_sd((ptr1)+2);                          \
}
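
/* Usage sketch (added): loading the coordinates of atom j from a flat x/y/z
 * coordinate array (the names x and j below are illustrative):
 *
 *   __m128d jx1,jy1,jz1;
 *   GMX_MM_LOAD_1RVEC_1POINTER_PD(x+3*j, jx1,jy1,jz1);
 *   // jx1/jy1/jz1 each hold one coordinate in their low element
 */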
#define GMX_MM_LOAD_2RVECS_1POINTER_PD(ptr1,jx1,jy1,jz1,jx2,jy2,jz2) { \
    jx1 = _mm_load_sd(ptr1);                              \
    jy1 = _mm_load_sd((ptr1)+1);                          \
    jz1 = _mm_load_sd((ptr1)+2);                          \
    jx2 = _mm_load_sd((ptr1)+3);                          \
    jy2 = _mm_load_sd((ptr1)+4);                          \
    jz2 = _mm_load_sd((ptr1)+5);                          \
}

#define GMX_MM_LOAD_3RVECS_1POINTER_PD(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    jx1 = _mm_load_sd(ptr1);                              \
    jy1 = _mm_load_sd((ptr1)+1);                          \
    jz1 = _mm_load_sd((ptr1)+2);                          \
    jx2 = _mm_load_sd((ptr1)+3);                          \
    jy2 = _mm_load_sd((ptr1)+4);                          \
    jz2 = _mm_load_sd((ptr1)+5);                          \
    jx3 = _mm_load_sd((ptr1)+6);                          \
    jy3 = _mm_load_sd((ptr1)+7);                          \
    jz3 = _mm_load_sd((ptr1)+8);                          \
}

#define GMX_MM_LOAD_4RVECS_1POINTER_PD(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    jx1 = _mm_load_sd(ptr1);                              \
    jy1 = _mm_load_sd((ptr1)+1);                          \
    jz1 = _mm_load_sd((ptr1)+2);                          \
    jx2 = _mm_load_sd((ptr1)+3);                          \
    jy2 = _mm_load_sd((ptr1)+4);                          \
    jz2 = _mm_load_sd((ptr1)+5);                          \
    jx3 = _mm_load_sd((ptr1)+6);                          \
    jy3 = _mm_load_sd((ptr1)+7);                          \
    jz3 = _mm_load_sd((ptr1)+8);                          \
    jx4 = _mm_load_sd((ptr1)+9);                          \
    jy4 = _mm_load_sd((ptr1)+10);                         \
    jz4 = _mm_load_sd((ptr1)+11);                         \
}
#define GMX_MM_LOAD_1RVEC_2POINTERS_PD(ptr1,ptr2,jx1,jy1,jz1) { \
    __m128d _tmp1;                                        \
    _tmp1 = _mm_loadu_pd(ptr1);                           \
    jy1   = _mm_loadu_pd(ptr2);                           \
    jz1   = _mm_load_sd(ptr1+2);                          \
    jx1   = _mm_unpacklo_pd(_tmp1,jy1);                   \
    jy1   = _mm_unpackhi_pd(_tmp1,jy1);                   \
    jz1   = _mm_loadh_pd(jz1,ptr2+2);                     \
}

#define GMX_MM_LOAD_2RVECS_2POINTERS_PD(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128d _tmp1, _tmp2, _tmp3;                          \
    _tmp1 = _mm_loadu_pd(ptr1);                           \
    jy1   = _mm_loadu_pd(ptr2);                           \
    _tmp2 = _mm_loadu_pd(ptr1+2);                         \
    jx2   = _mm_loadu_pd(ptr2+2);                         \
    _tmp3 = _mm_loadu_pd(ptr1+4);                         \
    jz2   = _mm_loadu_pd(ptr2+4);                         \
    jx1   = _mm_unpacklo_pd(_tmp1,jy1);                   \
    jy1   = _mm_unpackhi_pd(_tmp1,jy1);                   \
    jz1   = _mm_unpacklo_pd(_tmp2,jx2);                   \
    jx2   = _mm_unpackhi_pd(_tmp2,jx2);                   \
    jy2   = _mm_unpacklo_pd(_tmp3,jz2);                   \
    jz2   = _mm_unpackhi_pd(_tmp3,jz2);                   \
}

#define GMX_MM_LOAD_3RVECS_2POINTERS_PD(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128d _tmp1, _tmp2, _tmp3, _tmp4, _tmp5;            \
    _tmp1 = _mm_loadu_pd(ptr1);                           \
    jy1   = _mm_loadu_pd(ptr2);                           \
    _tmp2 = _mm_loadu_pd(ptr1+2);                         \
    jx2   = _mm_loadu_pd(ptr2+2);                         \
    _tmp3 = _mm_loadu_pd(ptr1+4);                         \
    jz2   = _mm_loadu_pd(ptr2+4);                         \
    _tmp4 = _mm_loadu_pd(ptr1+6);                         \
    jy3   = _mm_loadu_pd(ptr2+6);                         \
    jz3   = _mm_load_sd(ptr1+8);                          \
    _tmp5 = _mm_load_sd(ptr2+8);                          \
    jx1   = _mm_unpacklo_pd(_tmp1,jy1);                   \
    jy1   = _mm_unpackhi_pd(_tmp1,jy1);                   \
    jz1   = _mm_unpacklo_pd(_tmp2,jx2);                   \
    jx2   = _mm_unpackhi_pd(_tmp2,jx2);                   \
    jy2   = _mm_unpacklo_pd(_tmp3,jz2);                   \
    jz2   = _mm_unpackhi_pd(_tmp3,jz2);                   \
    jx3   = _mm_unpacklo_pd(_tmp4,jy3);                   \
    jy3   = _mm_unpackhi_pd(_tmp4,jy3);                   \
    jz3   = _mm_unpacklo_pd(jz3,_tmp5);                   \
}

#define GMX_MM_LOAD_4RVECS_2POINTERS_PD(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128d _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6;     \
    _tmp1 = _mm_loadu_pd(ptr1);                           \
    jy1   = _mm_loadu_pd(ptr2);                           \
    _tmp2 = _mm_loadu_pd(ptr1+2);                         \
    jx2   = _mm_loadu_pd(ptr2+2);                         \
    _tmp3 = _mm_loadu_pd(ptr1+4);                         \
    jz2   = _mm_loadu_pd(ptr2+4);                         \
    _tmp4 = _mm_loadu_pd(ptr1+6);                         \
    jy3   = _mm_loadu_pd(ptr2+6);                         \
    _tmp5 = _mm_loadu_pd(ptr1+8);                         \
    jx4   = _mm_loadu_pd(ptr2+8);                         \
    _tmp6 = _mm_loadu_pd(ptr1+10);                        \
    jz4   = _mm_loadu_pd(ptr2+10);                        \
    jx1   = _mm_unpacklo_pd(_tmp1,jy1);                   \
    jy1   = _mm_unpackhi_pd(_tmp1,jy1);                   \
    jz1   = _mm_unpacklo_pd(_tmp2,jx2);                   \
    jx2   = _mm_unpackhi_pd(_tmp2,jx2);                   \
    jy2   = _mm_unpacklo_pd(_tmp3,jz2);                   \
    jz2   = _mm_unpackhi_pd(_tmp3,jz2);                   \
    jx3   = _mm_unpacklo_pd(_tmp4,jy3);                   \
    jy3   = _mm_unpackhi_pd(_tmp4,jy3);                   \
    jz3   = _mm_unpacklo_pd(_tmp5,jx4);                   \
    jx4   = _mm_unpackhi_pd(_tmp5,jx4);                   \
    jy4   = _mm_unpacklo_pd(_tmp6,jz4);                   \
    jz4   = _mm_unpackhi_pd(_tmp6,jz4);                   \
}
#define GMX_MM_INCREMENT_1RVEC_1POINTER_PD(ptr1,jx1,jy1,jz1) { \
    jx1 = _mm_unpacklo_pd(jx1,jy1);                         \
    _mm_storeu_pd(ptr1,  _mm_add_pd( _mm_loadu_pd(ptr1),  jx1 )); \
    _mm_store_sd(ptr1+2, _mm_add_sd( _mm_load_sd(ptr1+2), jz1 )); \
}

#define GMX_MM_INCREMENT_2RVECS_1POINTER_PD(ptr1,jx1,jy1,jz1,jx2,jy2,jz2) { \
    jx1 = _mm_unpacklo_pd(jx1,jy1);                         \
    jz1 = _mm_unpacklo_pd(jz1,jx2);                         \
    jy2 = _mm_unpacklo_pd(jy2,jz2);                         \
    _mm_storeu_pd(ptr1,   _mm_add_pd( _mm_loadu_pd(ptr1),   jx1 )); \
    _mm_storeu_pd(ptr1+2, _mm_add_pd( _mm_loadu_pd(ptr1+2), jz1 )); \
    _mm_storeu_pd(ptr1+4, _mm_add_pd( _mm_loadu_pd(ptr1+4), jy2 )); \
}

#define GMX_MM_INCREMENT_3RVECS_1POINTER_PD(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    jx1 = _mm_unpacklo_pd(jx1,jy1);                         \
    jz1 = _mm_unpacklo_pd(jz1,jx2);                         \
    jy2 = _mm_unpacklo_pd(jy2,jz2);                         \
    jx3 = _mm_unpacklo_pd(jx3,jy3);                         \
    _mm_storeu_pd(ptr1,   _mm_add_pd( _mm_loadu_pd(ptr1),   jx1 )); \
    _mm_storeu_pd(ptr1+2, _mm_add_pd( _mm_loadu_pd(ptr1+2), jz1 )); \
    _mm_storeu_pd(ptr1+4, _mm_add_pd( _mm_loadu_pd(ptr1+4), jy2 )); \
    _mm_storeu_pd(ptr1+6, _mm_add_pd( _mm_loadu_pd(ptr1+6), jx3 )); \
    _mm_store_sd(ptr1+8,  _mm_add_sd( _mm_load_sd(ptr1+8),  jz3 )); \
}

#define GMX_MM_INCREMENT_4RVECS_1POINTER_PD(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    jx1 = _mm_unpacklo_pd(jx1,jy1);                         \
    jz1 = _mm_unpacklo_pd(jz1,jx2);                         \
    jy2 = _mm_unpacklo_pd(jy2,jz2);                         \
    jx3 = _mm_unpacklo_pd(jx3,jy3);                         \
    jz3 = _mm_unpacklo_pd(jz3,jx4);                         \
    jy4 = _mm_unpacklo_pd(jy4,jz4);                         \
    _mm_storeu_pd(ptr1,    _mm_add_pd( _mm_loadu_pd(ptr1),    jx1 )); \
    _mm_storeu_pd(ptr1+2,  _mm_add_pd( _mm_loadu_pd(ptr1+2),  jz1 )); \
    _mm_storeu_pd(ptr1+4,  _mm_add_pd( _mm_loadu_pd(ptr1+4),  jy2 )); \
    _mm_storeu_pd(ptr1+6,  _mm_add_pd( _mm_loadu_pd(ptr1+6),  jx3 )); \
    _mm_storeu_pd(ptr1+8,  _mm_add_pd( _mm_loadu_pd(ptr1+8),  jz3 )); \
    _mm_storeu_pd(ptr1+10, _mm_add_pd( _mm_loadu_pd(ptr1+10), jy4 )); \
}

#define GMX_MM_INCREMENT_1RVEC_2POINTERS_PD(ptr1,ptr2,jx1,jy1,jz1) { \
    __m128d _tmp1,_tmp2,_tmp3;                              \
    _tmp1 = _mm_unpacklo_pd(jx1,jy1);                       \
    _tmp2 = _mm_unpackhi_pd(jx1,jy1);                       \
    _tmp3 = _mm_unpackhi_pd(jz1,jz1);                       \
    _mm_storeu_pd(ptr1,  _mm_add_pd( _mm_loadu_pd(ptr1), _tmp1 )); \
    _mm_storeu_pd(ptr2,  _mm_add_pd( _mm_loadu_pd(ptr2), _tmp2 )); \
    _mm_store_sd(ptr1+2, _mm_add_sd( _mm_load_sd(ptr1+2), jz1 )); \
    _mm_store_sd(ptr2+2, _mm_add_sd( _mm_load_sd(ptr2+2), _tmp3 )); \
}

#define GMX_MM_INCREMENT_2RVECS_2POINTERS_PD(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128d _tmp1,_tmp2,_tmp3;                              \
    _tmp1 = _mm_unpacklo_pd(jx1,jy1);                       \
    jy1   = _mm_unpackhi_pd(jx1,jy1);                       \
    _tmp2 = _mm_unpacklo_pd(jz1,jx2);                       \
    jx2   = _mm_unpackhi_pd(jz1,jx2);                       \
    _tmp3 = _mm_unpacklo_pd(jy2,jz2);                       \
    jz2   = _mm_unpackhi_pd(jy2,jz2);                       \
    _mm_storeu_pd(ptr1,   _mm_add_pd( _mm_loadu_pd(ptr1),   _tmp1 )); \
    _mm_storeu_pd(ptr2,   _mm_add_pd( _mm_loadu_pd(ptr2),   jy1 )); \
    _mm_storeu_pd(ptr1+2, _mm_add_pd( _mm_loadu_pd(ptr1+2), _tmp2 )); \
    _mm_storeu_pd(ptr2+2, _mm_add_pd( _mm_loadu_pd(ptr2+2), jx2 )); \
    _mm_storeu_pd(ptr1+4, _mm_add_pd( _mm_loadu_pd(ptr1+4), _tmp3 )); \
    _mm_storeu_pd(ptr2+4, _mm_add_pd( _mm_loadu_pd(ptr2+4), jz2 )); \
}

#define GMX_MM_INCREMENT_3RVECS_2POINTERS_PD(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128d _tmp1,_tmp2,_tmp3,_tmp4,_tmp5;                  \
    _tmp1 = _mm_unpacklo_pd(jx1,jy1);                       \
    jy1   = _mm_unpackhi_pd(jx1,jy1);                       \
    _tmp2 = _mm_unpacklo_pd(jz1,jx2);                       \
    jx2   = _mm_unpackhi_pd(jz1,jx2);                       \
    _tmp3 = _mm_unpacklo_pd(jy2,jz2);                       \
    jz2   = _mm_unpackhi_pd(jy2,jz2);                       \
    _tmp4 = _mm_unpacklo_pd(jx3,jy3);                       \
    jy3   = _mm_unpackhi_pd(jx3,jy3);                       \
    _tmp5 = _mm_unpackhi_pd(jz3,jz3);                       \
    _mm_storeu_pd(ptr1,   _mm_add_pd( _mm_loadu_pd(ptr1),   _tmp1 )); \
    _mm_storeu_pd(ptr2,   _mm_add_pd( _mm_loadu_pd(ptr2),   jy1 )); \
    _mm_storeu_pd(ptr1+2, _mm_add_pd( _mm_loadu_pd(ptr1+2), _tmp2 )); \
    _mm_storeu_pd(ptr2+2, _mm_add_pd( _mm_loadu_pd(ptr2+2), jx2 )); \
    _mm_storeu_pd(ptr1+4, _mm_add_pd( _mm_loadu_pd(ptr1+4), _tmp3 )); \
    _mm_storeu_pd(ptr2+4, _mm_add_pd( _mm_loadu_pd(ptr2+4), jz2 )); \
    _mm_storeu_pd(ptr1+6, _mm_add_pd( _mm_loadu_pd(ptr1+6), _tmp4 )); \
    _mm_storeu_pd(ptr2+6, _mm_add_pd( _mm_loadu_pd(ptr2+6), jy3 )); \
    _mm_store_sd(ptr1+8,  _mm_add_sd( _mm_load_sd(ptr1+8),  jz3 )); \
    _mm_store_sd(ptr2+8,  _mm_add_sd( _mm_load_sd(ptr2+8),  _tmp5 )); \
}

#define GMX_MM_INCREMENT_4RVECS_2POINTERS_PD(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128d _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6;            \
    _tmp1 = _mm_unpacklo_pd(jx1,jy1);                       \
    jy1   = _mm_unpackhi_pd(jx1,jy1);                       \
    _tmp2 = _mm_unpacklo_pd(jz1,jx2);                       \
    jx2   = _mm_unpackhi_pd(jz1,jx2);                       \
    _tmp3 = _mm_unpacklo_pd(jy2,jz2);                       \
    jz2   = _mm_unpackhi_pd(jy2,jz2);                       \
    _tmp4 = _mm_unpacklo_pd(jx3,jy3);                       \
    jy3   = _mm_unpackhi_pd(jx3,jy3);                       \
    _tmp5 = _mm_unpacklo_pd(jz3,jx4);                       \
    jx4   = _mm_unpackhi_pd(jz3,jx4);                       \
    _tmp6 = _mm_unpacklo_pd(jy4,jz4);                       \
    jz4   = _mm_unpackhi_pd(jy4,jz4);                       \
    _mm_storeu_pd(ptr1,    _mm_add_pd( _mm_loadu_pd(ptr1),    _tmp1 )); \
    _mm_storeu_pd(ptr2,    _mm_add_pd( _mm_loadu_pd(ptr2),    jy1 )); \
    _mm_storeu_pd(ptr1+2,  _mm_add_pd( _mm_loadu_pd(ptr1+2),  _tmp2 )); \
    _mm_storeu_pd(ptr2+2,  _mm_add_pd( _mm_loadu_pd(ptr2+2),  jx2 )); \
    _mm_storeu_pd(ptr1+4,  _mm_add_pd( _mm_loadu_pd(ptr1+4),  _tmp3 )); \
    _mm_storeu_pd(ptr2+4,  _mm_add_pd( _mm_loadu_pd(ptr2+4),  jz2 )); \
    _mm_storeu_pd(ptr1+6,  _mm_add_pd( _mm_loadu_pd(ptr1+6),  _tmp4 )); \
    _mm_storeu_pd(ptr2+6,  _mm_add_pd( _mm_loadu_pd(ptr2+6),  jy3 )); \
    _mm_storeu_pd(ptr1+8,  _mm_add_pd( _mm_loadu_pd(ptr1+8),  _tmp5 )); \
    _mm_storeu_pd(ptr2+8,  _mm_add_pd( _mm_loadu_pd(ptr2+8),  jx4 )); \
    _mm_storeu_pd(ptr1+10, _mm_add_pd( _mm_loadu_pd(ptr1+10), _tmp6 )); \
    _mm_storeu_pd(ptr2+10, _mm_add_pd( _mm_loadu_pd(ptr2+10), jz4 )); \
}
#define GMX_MM_DECREMENT_1RVEC_1POINTER_PD(ptr1,jx1,jy1,jz1) { \
    jx1 = _mm_unpacklo_pd(jx1,jy1);                         \
    _mm_storeu_pd(ptr1,  _mm_sub_pd( _mm_loadu_pd(ptr1),  jx1 )); \
    _mm_store_sd(ptr1+2, _mm_sub_sd( _mm_load_sd(ptr1+2), jz1 )); \
}

#define GMX_MM_DECREMENT_2RVECS_1POINTER_PD(ptr1,jx1,jy1,jz1,jx2,jy2,jz2) { \
    jx1 = _mm_unpacklo_pd(jx1,jy1);                         \
    jz1 = _mm_unpacklo_pd(jz1,jx2);                         \
    jy2 = _mm_unpacklo_pd(jy2,jz2);                         \
    _mm_storeu_pd(ptr1,   _mm_sub_pd( _mm_loadu_pd(ptr1),   jx1 )); \
    _mm_storeu_pd(ptr1+2, _mm_sub_pd( _mm_loadu_pd(ptr1+2), jz1 )); \
    _mm_storeu_pd(ptr1+4, _mm_sub_pd( _mm_loadu_pd(ptr1+4), jy2 )); \
}

#define GMX_MM_DECREMENT_3RVECS_1POINTER_PD(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    jx1 = _mm_unpacklo_pd(jx1,jy1);                         \
    jz1 = _mm_unpacklo_pd(jz1,jx2);                         \
    jy2 = _mm_unpacklo_pd(jy2,jz2);                         \
    jx3 = _mm_unpacklo_pd(jx3,jy3);                         \
    _mm_storeu_pd(ptr1,   _mm_sub_pd( _mm_loadu_pd(ptr1),   jx1 )); \
    _mm_storeu_pd(ptr1+2, _mm_sub_pd( _mm_loadu_pd(ptr1+2), jz1 )); \
    _mm_storeu_pd(ptr1+4, _mm_sub_pd( _mm_loadu_pd(ptr1+4), jy2 )); \
    _mm_storeu_pd(ptr1+6, _mm_sub_pd( _mm_loadu_pd(ptr1+6), jx3 )); \
    _mm_store_sd(ptr1+8,  _mm_sub_sd( _mm_load_sd(ptr1+8),  jz3 )); \
}

#define GMX_MM_DECREMENT_4RVECS_1POINTER_PD(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    jx1 = _mm_unpacklo_pd(jx1,jy1);                         \
    jz1 = _mm_unpacklo_pd(jz1,jx2);                         \
    jy2 = _mm_unpacklo_pd(jy2,jz2);                         \
    jx3 = _mm_unpacklo_pd(jx3,jy3);                         \
    jz3 = _mm_unpacklo_pd(jz3,jx4);                         \
    jy4 = _mm_unpacklo_pd(jy4,jz4);                         \
    _mm_storeu_pd(ptr1,    _mm_sub_pd( _mm_loadu_pd(ptr1),    jx1 )); \
    _mm_storeu_pd(ptr1+2,  _mm_sub_pd( _mm_loadu_pd(ptr1+2),  jz1 )); \
    _mm_storeu_pd(ptr1+4,  _mm_sub_pd( _mm_loadu_pd(ptr1+4),  jy2 )); \
    _mm_storeu_pd(ptr1+6,  _mm_sub_pd( _mm_loadu_pd(ptr1+6),  jx3 )); \
    _mm_storeu_pd(ptr1+8,  _mm_sub_pd( _mm_loadu_pd(ptr1+8),  jz3 )); \
    _mm_storeu_pd(ptr1+10, _mm_sub_pd( _mm_loadu_pd(ptr1+10), jy4 )); \
}

#define GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(ptr1,ptr2,jx1,jy1,jz1) { \
    __m128d _tmp1,_tmp2,_tmp3;                              \
    _tmp1 = _mm_unpacklo_pd(jx1,jy1);                       \
    _tmp2 = _mm_unpackhi_pd(jx1,jy1);                       \
    _tmp3 = _mm_unpackhi_pd(jz1,jz1);                       \
    _mm_storeu_pd(ptr1,  _mm_sub_pd( _mm_loadu_pd(ptr1), _tmp1 )); \
    _mm_storeu_pd(ptr2,  _mm_sub_pd( _mm_loadu_pd(ptr2), _tmp2 )); \
    _mm_store_sd(ptr1+2, _mm_sub_sd( _mm_load_sd(ptr1+2), jz1 )); \
    _mm_store_sd(ptr2+2, _mm_sub_sd( _mm_load_sd(ptr2+2), _tmp3 )); \
}

#define GMX_MM_DECREMENT_2RVECS_2POINTERS_PD(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128d _tmp1,_tmp2,_tmp3;                              \
    _tmp1 = _mm_unpacklo_pd(jx1,jy1);                       \
    jy1   = _mm_unpackhi_pd(jx1,jy1);                       \
    _tmp2 = _mm_unpacklo_pd(jz1,jx2);                       \
    jx2   = _mm_unpackhi_pd(jz1,jx2);                       \
    _tmp3 = _mm_unpacklo_pd(jy2,jz2);                       \
    jz2   = _mm_unpackhi_pd(jy2,jz2);                       \
    _mm_storeu_pd(ptr1,   _mm_sub_pd( _mm_loadu_pd(ptr1),   _tmp1 )); \
    _mm_storeu_pd(ptr2,   _mm_sub_pd( _mm_loadu_pd(ptr2),   jy1 )); \
    _mm_storeu_pd(ptr1+2, _mm_sub_pd( _mm_loadu_pd(ptr1+2), _tmp2 )); \
    _mm_storeu_pd(ptr2+2, _mm_sub_pd( _mm_loadu_pd(ptr2+2), jx2 )); \
    _mm_storeu_pd(ptr1+4, _mm_sub_pd( _mm_loadu_pd(ptr1+4), _tmp3 )); \
    _mm_storeu_pd(ptr2+4, _mm_sub_pd( _mm_loadu_pd(ptr2+4), jz2 )); \
}

#define GMX_MM_DECREMENT_3RVECS_2POINTERS_PD(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128d _tmp1,_tmp2,_tmp3,_tmp4,_tmp5;                  \
    _tmp1 = _mm_unpacklo_pd(jx1,jy1);                       \
    jy1   = _mm_unpackhi_pd(jx1,jy1);                       \
    _tmp2 = _mm_unpacklo_pd(jz1,jx2);                       \
    jx2   = _mm_unpackhi_pd(jz1,jx2);                       \
    _tmp3 = _mm_unpacklo_pd(jy2,jz2);                       \
    jz2   = _mm_unpackhi_pd(jy2,jz2);                       \
    _tmp4 = _mm_unpacklo_pd(jx3,jy3);                       \
    jy3   = _mm_unpackhi_pd(jx3,jy3);                       \
    _tmp5 = _mm_unpackhi_pd(jz3,jz3);                       \
    _mm_storeu_pd(ptr1,   _mm_sub_pd( _mm_loadu_pd(ptr1),   _tmp1 )); \
    _mm_storeu_pd(ptr2,   _mm_sub_pd( _mm_loadu_pd(ptr2),   jy1 )); \
    _mm_storeu_pd(ptr1+2, _mm_sub_pd( _mm_loadu_pd(ptr1+2), _tmp2 )); \
    _mm_storeu_pd(ptr2+2, _mm_sub_pd( _mm_loadu_pd(ptr2+2), jx2 )); \
    _mm_storeu_pd(ptr1+4, _mm_sub_pd( _mm_loadu_pd(ptr1+4), _tmp3 )); \
    _mm_storeu_pd(ptr2+4, _mm_sub_pd( _mm_loadu_pd(ptr2+4), jz2 )); \
    _mm_storeu_pd(ptr1+6, _mm_sub_pd( _mm_loadu_pd(ptr1+6), _tmp4 )); \
    _mm_storeu_pd(ptr2+6, _mm_sub_pd( _mm_loadu_pd(ptr2+6), jy3 )); \
    _mm_store_sd(ptr1+8,  _mm_sub_sd( _mm_load_sd(ptr1+8),  jz3 )); \
    _mm_store_sd(ptr2+8,  _mm_sub_sd( _mm_load_sd(ptr2+8),  _tmp5 )); \
}

#define GMX_MM_DECREMENT_4RVECS_2POINTERS_PD(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128d _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6;            \
    _tmp1 = _mm_unpacklo_pd(jx1,jy1);                       \
    jy1   = _mm_unpackhi_pd(jx1,jy1);                       \
    _tmp2 = _mm_unpacklo_pd(jz1,jx2);                       \
    jx2   = _mm_unpackhi_pd(jz1,jx2);                       \
    _tmp3 = _mm_unpacklo_pd(jy2,jz2);                       \
    jz2   = _mm_unpackhi_pd(jy2,jz2);                       \
    _tmp4 = _mm_unpacklo_pd(jx3,jy3);                       \
    jy3   = _mm_unpackhi_pd(jx3,jy3);                       \
    _tmp5 = _mm_unpacklo_pd(jz3,jx4);                       \
    jx4   = _mm_unpackhi_pd(jz3,jx4);                       \
    _tmp6 = _mm_unpacklo_pd(jy4,jz4);                       \
    jz4   = _mm_unpackhi_pd(jy4,jz4);                       \
    _mm_storeu_pd(ptr1,    _mm_sub_pd( _mm_loadu_pd(ptr1),    _tmp1 )); \
    _mm_storeu_pd(ptr2,    _mm_sub_pd( _mm_loadu_pd(ptr2),    jy1 )); \
    _mm_storeu_pd(ptr1+2,  _mm_sub_pd( _mm_loadu_pd(ptr1+2),  _tmp2 )); \
    _mm_storeu_pd(ptr2+2,  _mm_sub_pd( _mm_loadu_pd(ptr2+2),  jx2 )); \
    _mm_storeu_pd(ptr1+4,  _mm_sub_pd( _mm_loadu_pd(ptr1+4),  _tmp3 )); \
    _mm_storeu_pd(ptr2+4,  _mm_sub_pd( _mm_loadu_pd(ptr2+4),  jz2 )); \
    _mm_storeu_pd(ptr1+6,  _mm_sub_pd( _mm_loadu_pd(ptr1+6),  _tmp4 )); \
    _mm_storeu_pd(ptr2+6,  _mm_sub_pd( _mm_loadu_pd(ptr2+6),  jy3 )); \
    _mm_storeu_pd(ptr1+8,  _mm_sub_pd( _mm_loadu_pd(ptr1+8),  _tmp5 )); \
    _mm_storeu_pd(ptr2+8,  _mm_sub_pd( _mm_loadu_pd(ptr2+8),  jx4 )); \
    _mm_storeu_pd(ptr1+10, _mm_sub_pd( _mm_loadu_pd(ptr1+10), _tmp6 )); \
    _mm_storeu_pd(ptr2+10, _mm_sub_pd( _mm_loadu_pd(ptr2+10), jz4 )); \
}
static inline __m128d
gmx_mm_scalarprod_pd(__m128d x, __m128d y, __m128d z)
{
    return _mm_add_pd(_mm_add_pd(_mm_mul_pd(x,x),_mm_mul_pd(y,y)),_mm_mul_pd(z,z));
}
static inline __m128d
gmx_mm_calc_rsq_pd(__m128d dx, __m128d dy, __m128d dz)
{
    return _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx), _mm_mul_pd(dy,dy) ), _mm_mul_pd(dz,dz) );
}
static inline void
gmx_mm_update_iforce_1atom_pd(__m128d *fix1, __m128d *fiy1, __m128d *fiz1,
                              double *fptr,
                              double *fshiftptr)
{
    __m128d t1;

#ifdef GMX_SSE3
    *fix1 = _mm_hadd_pd(*fix1,*fiy1);
    /* fiz1 is fine as it is */
#else
    /* SSE2 */
    /* transpose data */
    t1    = *fix1;
    *fix1 = _mm_unpacklo_pd(*fix1,*fiy1); /* y0 x0 */
    *fiy1 = _mm_unpackhi_pd(t1,*fiy1);    /* y1 x1 */

    *fix1 = _mm_add_pd(*fix1,*fiy1);
    *fiz1 = _mm_add_sd( *fiz1, _mm_unpackhi_pd(*fiz1,*fiz1));
#endif
    _mm_storeu_pd( fptr,   _mm_add_pd( _mm_loadu_pd(fptr),  *fix1 ));
    _mm_store_sd(  fptr+2, _mm_add_sd( _mm_load_sd(fptr+2), *fiz1 ));

    _mm_storeu_pd( fshiftptr,   _mm_add_pd( _mm_loadu_pd(fshiftptr),  *fix1 ));
    _mm_store_sd(  fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), *fiz1 ));
}
static inline void
gmx_mm_update_iforce_2atoms_pd(__m128d *fix1, __m128d *fiy1, __m128d *fiz1,
                               __m128d *fix2, __m128d *fiy2, __m128d *fiz2,
                               double *fptr,
                               double *fshiftptr)
{
    __m128d t1;

#ifdef GMX_SSE3
    *fix1 = _mm_hadd_pd(*fix1,*fiy1);
    *fiz1 = _mm_hadd_pd(*fiz1,*fix2);
    *fiy2 = _mm_hadd_pd(*fiy2,*fiz2);
#else
    /* SSE2 */
    /* transpose data */
    GMX_MM_TRANSPOSE2_PD(*fix1,*fiy1);
    GMX_MM_TRANSPOSE2_PD(*fiz1,*fix2);
    GMX_MM_TRANSPOSE2_PD(*fiy2,*fiz2);

    *fix1 = _mm_add_pd(*fix1,*fiy1);
    *fiz1 = _mm_add_pd(*fiz1,*fix2);
    *fiy2 = _mm_add_pd(*fiy2,*fiz2);
#endif
    _mm_storeu_pd( fptr,   _mm_add_pd( _mm_loadu_pd(fptr),   *fix1 ));
    _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), *fiz1 ));
    _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), *fiy2 ));

    t1    = _mm_shuffle_pd(*fiz1,*fiy2,_MM_SHUFFLE2(0,1));
    *fix1 = _mm_add_pd(*fix1,t1);                             /* x and y sums */
    *fiz1 = _mm_add_sd(*fiz1, _mm_unpackhi_pd(*fiy2,*fiy2));  /* z sum */

    _mm_storeu_pd( fshiftptr,   _mm_add_pd( _mm_loadu_pd(fshiftptr),  *fix1 ));
    _mm_store_sd(  fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), *fiz1 ));
}
static inline void
gmx_mm_update_iforce_3atoms_pd(__m128d *fix1, __m128d *fiy1, __m128d *fiz1,
                               __m128d *fix2, __m128d *fiy2, __m128d *fiz2,
                               __m128d *fix3, __m128d *fiy3, __m128d *fiz3,
                               double *fptr,
                               double *fshiftptr)
{
    __m128d t1,t2;

#ifdef GMX_SSE3
    *fix1 = _mm_hadd_pd(*fix1,*fiy1);
    *fiz1 = _mm_hadd_pd(*fiz1,*fix2);
    *fiy2 = _mm_hadd_pd(*fiy2,*fiz2);
    *fix3 = _mm_hadd_pd(*fix3,*fiy3);
    /* fiz3 is fine as it is */
#else
    /* SSE2 */
    /* transpose data */
    GMX_MM_TRANSPOSE2_PD(*fix1,*fiy1);
    GMX_MM_TRANSPOSE2_PD(*fiz1,*fix2);
    GMX_MM_TRANSPOSE2_PD(*fiy2,*fiz2);

    t1    = *fix3;
    *fix3 = _mm_unpacklo_pd(*fix3,*fiy3); /* y0 x0 */
    *fiy3 = _mm_unpackhi_pd(t1,*fiy3);    /* y1 x1 */

    *fix1 = _mm_add_pd(*fix1,*fiy1);
    *fiz1 = _mm_add_pd(*fiz1,*fix2);
    *fiy2 = _mm_add_pd(*fiy2,*fiz2);

    *fix3 = _mm_add_pd(*fix3,*fiy3);
    *fiz3 = _mm_add_sd( *fiz3, _mm_unpackhi_pd(*fiz3,*fiz3));
#endif
    _mm_storeu_pd( fptr,   _mm_add_pd( _mm_loadu_pd(fptr),   *fix1 ));
    _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), *fiz1 ));
    _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), *fiy2 ));
    _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), *fix3 ));
    _mm_store_sd(  fptr+8, _mm_add_sd( _mm_load_sd(fptr+8),  *fiz3 ));

    *fix1 = _mm_add_pd(*fix1,*fix3);
    t1    = _mm_shuffle_pd(*fiz1,*fiy2,_MM_SHUFFLE2(0,1));
    *fix1 = _mm_add_pd(*fix1,t1); /* x and y sums */

    t2    = _mm_shuffle_pd(*fiy2,*fiy2,_MM_SHUFFLE2(1,1));
    *fiz1 = _mm_add_sd(*fiz1,*fiz3);
    *fiz1 = _mm_add_sd(*fiz1,t2); /* z sum */

    _mm_storeu_pd( fshiftptr,   _mm_add_pd( _mm_loadu_pd(fshiftptr),  *fix1 ));
    _mm_store_sd(  fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), *fiz1 ));
}
static inline void
gmx_mm_update_iforce_4atoms_pd(__m128d *fix1, __m128d *fiy1, __m128d *fiz1,
                               __m128d *fix2, __m128d *fiy2, __m128d *fiz2,
                               __m128d *fix3, __m128d *fiy3, __m128d *fiz3,
                               __m128d *fix4, __m128d *fiy4, __m128d *fiz4,
                               double *fptr,
                               double *fshiftptr)
{
    __m128d t1,t2;

#ifdef GMX_SSE3
    *fix1 = _mm_hadd_pd(*fix1,*fiy1);
    *fiz1 = _mm_hadd_pd(*fiz1,*fix2);
    *fiy2 = _mm_hadd_pd(*fiy2,*fiz2);
    *fix3 = _mm_hadd_pd(*fix3,*fiy3);
    *fiz3 = _mm_hadd_pd(*fiz3,*fix4);
    *fiy4 = _mm_hadd_pd(*fiy4,*fiz4);
#else
    /* SSE2 */
    /* transpose data */
    GMX_MM_TRANSPOSE2_PD(*fix1,*fiy1);
    GMX_MM_TRANSPOSE2_PD(*fiz1,*fix2);
    GMX_MM_TRANSPOSE2_PD(*fiy2,*fiz2);
    GMX_MM_TRANSPOSE2_PD(*fix3,*fiy3);
    GMX_MM_TRANSPOSE2_PD(*fiz3,*fix4);
    GMX_MM_TRANSPOSE2_PD(*fiy4,*fiz4);

    *fix1 = _mm_add_pd(*fix1,*fiy1);
    *fiz1 = _mm_add_pd(*fiz1,*fix2);
    *fiy2 = _mm_add_pd(*fiy2,*fiz2);
    *fix3 = _mm_add_pd(*fix3,*fiy3);
    *fiz3 = _mm_add_pd(*fiz3,*fix4);
    *fiy4 = _mm_add_pd(*fiy4,*fiz4);
#endif
    _mm_storeu_pd( fptr,    _mm_add_pd( _mm_loadu_pd(fptr),    *fix1 ));
    _mm_storeu_pd( fptr+2,  _mm_add_pd( _mm_loadu_pd(fptr+2),  *fiz1 ));
    _mm_storeu_pd( fptr+4,  _mm_add_pd( _mm_loadu_pd(fptr+4),  *fiy2 ));
    _mm_storeu_pd( fptr+6,  _mm_add_pd( _mm_loadu_pd(fptr+6),  *fix3 ));
    _mm_storeu_pd( fptr+8,  _mm_add_pd( _mm_loadu_pd(fptr+8),  *fiz3 ));
    _mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), *fiy4 ));

    t1    = _mm_shuffle_pd(*fiz1,*fiy2,_MM_SHUFFLE2(0,1));
    *fix1 = _mm_add_pd(*fix1,t1);
    t2    = _mm_shuffle_pd(*fiz3,*fiy4,_MM_SHUFFLE2(0,1));
    *fix3 = _mm_add_pd(*fix3,t2);
    *fix1 = _mm_add_pd(*fix1,*fix3); /* x and y sums */

    *fiz1 = _mm_add_sd(*fiz1, _mm_unpackhi_pd(*fiy2,*fiy2));
    *fiz3 = _mm_add_sd(*fiz3, _mm_unpackhi_pd(*fiy4,*fiy4));
    *fiz1 = _mm_add_sd(*fiz1,*fiz3); /* z sum */

    _mm_storeu_pd( fshiftptr,   _mm_add_pd( _mm_loadu_pd(fshiftptr),  *fix1 ));
    _mm_store_sd(  fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), *fiz1 ));
}
static inline void
gmx_mm_update_1pot_pd(__m128d pot1, double *ptr1)
{
#ifdef GMX_SSE3
    pot1 = _mm_hadd_pd(pot1,pot1);
#else
    /* SSE2 */
    pot1 = _mm_add_pd(pot1, _mm_unpackhi_pd(pot1,pot1));
#endif
    _mm_store_sd(ptr1,_mm_add_sd(pot1,_mm_load_sd(ptr1)));
}
static inline void
gmx_mm_update_2pot_pd(__m128d pot1, double *ptr1, __m128d pot2, double *ptr2)
{
#ifdef GMX_SSE3
    pot1 = _mm_hadd_pd(pot1,pot2);
#else
    /* SSE2 */
    GMX_MM_TRANSPOSE2_PD(pot1,pot2);
    pot1 = _mm_add_pd(pot1,pot2);
#endif
    pot2 = _mm_unpackhi_pd(pot1,pot1);

    _mm_store_sd(ptr1,_mm_add_sd(pot1,_mm_load_sd(ptr1)));
    _mm_store_sd(ptr2,_mm_add_sd(pot2,_mm_load_sd(ptr2)));
}