Merge branch 'release-4-5-patches' of git.gromacs.org:gromacs into release-4-5-patches
[gromacs/rigid-bodies.git] / include / gmx_sse2_single.h
blob889b57a661e4b3c447919451063a4f2facf8d436
1 /*
2 * This source code is part of
4 * G R O M A C S
6 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
7 * Copyright (c) 2001-2009, The GROMACS Development Team
9 * Gromacs is a library for molecular simulation and trajectory analysis,
10 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
11 * a full list of developers and information, check out http://www.gromacs.org
13 * This program is free software; you can redistribute it and/or modify it under
14 * the terms of the GNU Lesser General Public License as published by the Free
15 * Software Foundation; either version 2 of the License, or (at your option) any
16 * later version.
17 * As a special exception, you may use this file as part of a free software
18 * library without restriction. Specifically, if other files instantiate
19 * templates or use macros or inline functions from this file, or you compile
20 * this file and link it with other files to produce an executable, this
21 * file does not by itself cause the resulting executable to be covered by
22 * the GNU Lesser General Public License.
24 * In plain-speak: do not worry about classes/macros/templates either - only
25 * changes to the library have to be LGPL, not an application linking with it.
27 * To help fund GROMACS development, we humbly ask that you cite
28 * the papers people have written on it - you can find them on the website!
30 #ifndef _gmx_sse2_single_h_
31 #define _gmx_sse2_single_h_
33 /* We require SSE2 now! */
35 #include <math.h>
38 #include <xmmintrin.h> /* SSE */
39 #include <emmintrin.h> /* SSE2 */
41 #ifdef GMX_SSE3
42 # include <pmmintrin.h> /* SSE3 */
43 #endif
44 #ifdef GMX_SSE4
45 # include <smmintrin.h> /* SSE4.1 */
46 #endif
48 #include <stdio.h>
50 #include "types/simple.h"
53 /***************************************************
54 * *
55 * COMPILER RANT WARNING: *
56 * *
57 * Ideally, this header would be filled with *
58 * simple static inline functions. Unfortunately, *
59 * many vendors provide really braindead compilers *
60 * that either cannot handle more than 1-2 SSE *
61 * function parameters, and some cannot handle *
62 * pointers to SSE __m128 datatypes as parameters *
63 * at all. Thus, for portability we have had to *
64 * implement all but the simplest routines as *
65 * macros instead... *
66 * *
67 ***************************************************/
70 /***************************************************
71 * *
72 * Wrappers/replacements for some instructions *
73 * not available in all SSE versions. *
74 * *
75 ***************************************************/
/* Extract 32-bit lane number imm (a compile-time constant, 0..3) from the
 * integer register x. With SSE4.1 the dedicated instruction is used;
 * otherwise the register is byte-shifted right by 4*imm so the wanted lane
 * ends up in position 0, from where it is read as an int.
 */
#ifdef GMX_SSE4
#  define gmx_mm_extract_epi32(x, imm) _mm_extract_epi32((x), (imm))
#else
#  define gmx_mm_extract_epi32(x, imm) _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))
#endif
/*
 * Some compilers require a cast to change the interpretation
 * of a register from FP to Int and vice versa, and not all of
 * them provide instructions to do this. Roll our own wrappers...
 *
 * None of the wrappers converts values; they only reinterpret the 128 bits.
 */
#if (defined (_MSC_VER) || defined(__INTEL_COMPILER))
#  define gmx_mm_castsi128_ps(a) _mm_castsi128_ps(a)
#  define gmx_mm_castps_si128(a) _mm_castps_si128(a)
#  define gmx_mm_castps_ps128(a) (a)
#elif defined(__GNUC__)
#  define gmx_mm_castsi128_ps(a) ((__m128)(a))
#  define gmx_mm_castps_si128(a) ((__m128i)(a))
#  define gmx_mm_castps_ps128(a) ((__m128)(a))
#else
/* Generic fallback: reinterpret through a pointer to the by-value argument. */
static __m128  gmx_mm_castsi128_ps(__m128i a) { return *(__m128 *)  &a; }
static __m128i gmx_mm_castps_si128(__m128 a)  { return *(__m128i *) &a; }
static __m128  gmx_mm_castps_ps128(__m128 a)  { return *(__m128 *)  &a; }
#endif
/* IO functions, just for debugging */

/* Print the label s followed by the four single-precision lanes of xmm
 * (lane 0 first) on one line of stdout.
 */
static void
printxmm(const char *s,__m128 xmm)
{
    float f[4];

    /* Unaligned store, since f has no guaranteed 16-byte alignment */
    _mm_storeu_ps(f,xmm);
    printf("%s: %8.5g %8.5g %8.5g %8.5g\n",s,f[0],f[1],f[2],f[3]);
}
/* Print the label s followed by the sum of the four single-precision
 * lanes of xmm on one line of stdout.
 */
static void
printxmmsum(const char *s,__m128 xmm)
{
    float f[4];

    _mm_storeu_ps(f,xmm);
    printf("%s (sum): %15.10g\n",s,f[0]+f[1]+f[2]+f[3]);
}
/* Print the label s followed by the four 32-bit integer lanes of xmmi
 * (lane 0 first) on one line of stdout.
 */
static void
printxmmi(const char *s,__m128i xmmi)
{
    int i[4];

    _mm_storeu_si128((__m128i *)i,xmmi);
    printf("%10s: %2d %2d %2d %2d\n",s,i[0],i[1],i[2],i[3]);
}
/* Report whether the overflow flag in the SSE control/status register
 * (MXCSR) is set, and clear it if so.
 *
 * Returns 1 if the flag was set (it is cleared before returning), 0 otherwise.
 * All other MXCSR bits are written back unchanged.
 */
static int gmx_mm_check_and_reset_overflow(void)
{
    /* The overflow flag is bit 3 in the register */
    const unsigned int overflow_bit = 0x0008u;
    unsigned int       mxcsr        = _mm_getcsr();

    if ((mxcsr & overflow_bit) == 0)
    {
        return 0;
    }

    /* Set the overflow flag to zero */
    _mm_setcsr(mxcsr & ~overflow_bit);

    return 1;
}
160 /************************
162 * Simple math routines *
164 ************************/
/* Approximate 1/sqrt(x) for each of the four lanes of x.
 *
 * Starts from the hardware estimate lu = _mm_rsqrt_ps(x) and refines it with
 * one Newton-Raphson step: 0.5 * lu * (3 - x*lu*lu).
 * No special handling of zero or negative input is done here.
 */
static inline __m128
gmx_mm_invsqrt_ps(__m128 x)
{
    const __m128 half  = _mm_set1_ps(0.5f);
    const __m128 three = _mm_set1_ps(3.0f);

    __m128 lu = _mm_rsqrt_ps(x);

    return _mm_mul_ps(half,_mm_mul_ps(_mm_sub_ps(three,_mm_mul_ps(_mm_mul_ps(lu,lu),x)),lu));
}
177 static inline __m128
178 gmx_mm_sqrt_ps(__m128 x)
180 __m128 mask;
181 __m128 res;
183 mask = _mm_cmpeq_ps(x,_mm_setzero_ps());
184 res = _mm_andnot_ps(mask,gmx_mm_invsqrt_ps(x));
186 res = _mm_mul_ps(x,res);
188 return res;
/* Approximate 1/x for each of the four lanes of x.
 *
 * Starts from the hardware estimate lu = _mm_rcp_ps(x) and refines it with
 * one Newton-Raphson step: lu * (2 - lu*x).
 * No special handling of x == 0 is done here.
 */
static inline __m128
gmx_mm_inv_ps(__m128 x)
{
    const __m128 two = _mm_set1_ps(2.0f);

    __m128 lu = _mm_rcp_ps(x);

    return _mm_mul_ps(lu,_mm_sub_ps(two,_mm_mul_ps(lu,x)));
}
/* Per-lane squared length of the vectors (dx,dy,dz): (dx*dx + dy*dy) + dz*dz. */
static inline __m128
gmx_mm_calc_rsq_ps(__m128 dx, __m128 dy, __m128 dz)
{
    const __m128 xx = _mm_mul_ps(dx,dx);
    const __m128 yy = _mm_mul_ps(dy,dy);
    const __m128 zz = _mm_mul_ps(dz,dz);

    return _mm_add_ps(_mm_add_ps(xx,yy),zz);
}
/* Normal sum of four xmm registers, evaluated pairwise as (t0+t1)+(t2+t3) */
#define gmx_mm_sum4_ps(t0,t1,t2,t3) _mm_add_ps(_mm_add_ps((t0),(t1)),_mm_add_ps((t2),(t3)))
211 static __m128
212 gmx_mm_log_ps(__m128 x)
214 /* Same algorithm as cephes library */
215 const __m128 expmask = gmx_mm_castsi128_ps( _mm_set_epi32(0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000) );
216 const __m128i expbase_m1 = _mm_set1_epi32(127-1); /* We want non-IEEE format */
217 const __m128 half = _mm_set1_ps(0.5f);
218 const __m128 one = _mm_set1_ps(1.0f);
219 const __m128 invsq2 = _mm_set1_ps(1.0f/sqrt(2.0f));
220 const __m128 corr1 = _mm_set1_ps(-2.12194440e-4f);
221 const __m128 corr2 = _mm_set1_ps(0.693359375f);
223 const __m128 CA_1 = _mm_set1_ps(0.070376836292f);
224 const __m128 CB_0 = _mm_set1_ps(1.6714950086782716f);
225 const __m128 CB_1 = _mm_set1_ps(-2.452088066061482f);
226 const __m128 CC_0 = _mm_set1_ps(1.5220770854701728f);
227 const __m128 CC_1 = _mm_set1_ps(-1.3422238433233642f);
228 const __m128 CD_0 = _mm_set1_ps(1.386218787509749f);
229 const __m128 CD_1 = _mm_set1_ps(0.35075468953796346f);
230 const __m128 CE_0 = _mm_set1_ps(1.3429983063133937f);
231 const __m128 CE_1 = _mm_set1_ps(1.807420826584643f);
233 __m128 fexp,fexp1;
234 __m128i iexp;
235 __m128 mask;
236 __m128 x1,x2;
237 __m128 y;
238 __m128 pA,pB,pC,pD,pE,tB,tC,tD,tE;
240 /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
241 fexp = _mm_and_ps(x,expmask);
242 iexp = gmx_mm_castps_si128(fexp);
243 iexp = _mm_srli_epi32(iexp,23);
244 iexp = _mm_sub_epi32(iexp,expbase_m1);
246 x = _mm_andnot_ps(expmask,x);
247 x = _mm_or_ps(x,one);
248 x = _mm_mul_ps(x,half);
250 mask = _mm_cmplt_ps(x,invsq2);
252 x = _mm_add_ps(x,_mm_and_ps(mask,x));
253 x = _mm_sub_ps(x,one);
254 iexp = _mm_add_epi32(iexp,gmx_mm_castps_si128(mask)); /* 0xFFFFFFFF = -1 as int */
256 x2 = _mm_mul_ps(x,x);
258 pA = _mm_mul_ps(CA_1,x);
259 pB = _mm_mul_ps(CB_1,x);
260 pC = _mm_mul_ps(CC_1,x);
261 pD = _mm_mul_ps(CD_1,x);
262 pE = _mm_mul_ps(CE_1,x);
263 tB = _mm_add_ps(CB_0,x2);
264 tC = _mm_add_ps(CC_0,x2);
265 tD = _mm_add_ps(CD_0,x2);
266 tE = _mm_add_ps(CE_0,x2);
267 pB = _mm_add_ps(pB,tB);
268 pC = _mm_add_ps(pC,tC);
269 pD = _mm_add_ps(pD,tD);
270 pE = _mm_add_ps(pE,tE);
272 pA = _mm_mul_ps(pA,pB);
273 pC = _mm_mul_ps(pC,pD);
274 pE = _mm_mul_ps(pE,x2);
275 pA = _mm_mul_ps(pA,pC);
276 y = _mm_mul_ps(pA,pE);
278 fexp = _mm_cvtepi32_ps(iexp);
279 y = _mm_add_ps(y,_mm_mul_ps(fexp,corr1));
281 y = _mm_sub_ps(y, _mm_mul_ps(half,x2));
282 x2 = _mm_add_ps(x,y);
284 x2 = _mm_add_ps(x2,_mm_mul_ps(fexp,corr2));
286 return x2;
291 * Exponential function.
293 * Exp(x) is calculate from the relation Exp(x)=2^(y), where y=log2(e)*x
294 * Thus, the contents of this routine is mostly about calculating 2^y.
296 * This is done by separating y=z+w, where z=[y] is an integer. For technical reasons it is easiest
297 * for us to round to the _nearest_ integer and have w in [-0.5,0.5] rather than always rounding down.
298 * (It is not until SSE4 there was an efficient operation to do rounding towards -infinity).
300 * With this we get 2^y=2^z*2^w
302 * Since we have IEEE fp representation, we can easily calculate 2^z by adding the FP exponent bias
303 * (127 in single), and shifting the integer to the exponent field of the FP number (23 bits up).
305 * The 2^w term is calculated from a (5,0)-th order (no denominator) Minimax polynomia on the interval
306 * [-0.5,0.5]. The coefficiencts of this was derived in Mathematica using the command:
308 * MiniMaxApproximation[(2^x), {x, {-0.5, 0.5}, 5, 0}, WorkingPrecision -> 15]
310 * The lowest exponent we can represent in IEEE single-precision binary format is 2^-126; below that
311 * it will wrap around and lead to very large positive numbers. This corresponds to a lower bound
312 * on the argument for exp(x) of roughly -87.33. For smaller arguments the return value will be 0.0.
314 * There appears to be a slight loss of precision for large arguments (~50), where the largest relative
315 * error reaches ~3e-6. However, since the actual value for that argument is around 10^21, it might
316 * not matter for typical single precision workloads. This is likely caused by the polynomial evaluation,
317 * and the only way around would then be a table-based version, which I haven't managed to get the
318 * same performance from.
320 * The _average_ accuracy is 22.7 bits in the range [-10,10], and the worst roughly 1 bit worse.
322 static __m128
323 gmx_mm_exp_ps(__m128 x)
325 const __m128 argscale = _mm_set1_ps(1.442695040888963f);
326 /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
327 const __m128 arglimit = _mm_set1_ps(-126.0f/1.442695040888963f);
329 const __m128i expbase = _mm_set1_epi32(127);
330 const __m128 CA0 = _mm_set1_ps(0.00132764719920600f);
331 const __m128 CB0 = _mm_set1_ps(3.17196359322f);
332 const __m128 CC0 = _mm_set1_ps(20.36135752425f);
333 const __m128 CC1 = _mm_set1_ps(-0.681627790451f);
334 const __m128 CD0 = _mm_set1_ps(11.66225206128f);
335 const __m128 CD1 = _mm_set1_ps(4.79739947827f);
338 __m128 valuemask;
339 __m128i iexppart;
340 __m128 fexppart;
341 __m128 intpart;
342 __m128 z,z2;
343 __m128 factB,factC,factD;
345 z = _mm_mul_ps(x,argscale);
346 iexppart = _mm_cvtps_epi32(z);
347 #if GMX_SSE4
348 /* This reduces latency and speeds up the code by roughly 5% when supported */
349 intpart = _mm_round_ps(z,0);
350 #else
351 intpart = _mm_cvtepi32_ps(iexppart);
352 #endif
353 iexppart = _mm_slli_epi32(_mm_add_epi32(iexppart,expbase),23);
354 valuemask = _mm_cmpgt_ps(x,arglimit);
356 z = _mm_sub_ps(z,intpart);
357 z2 = _mm_mul_ps(z,z);
359 fexppart = _mm_and_ps(valuemask,gmx_mm_castsi128_ps(iexppart));
361 /* Since SSE floating-point has relatively high latency it is faster to do
362 * factorized polynomial summation with independent terms than using alternating add/multiply, i.e.
363 * p(z) = A0 * (B0 + z) * (C0 + C1*z + z^2) * (D0 + D1*z + z^2)
365 factB = _mm_add_ps(CB0,z);
366 factC = _mm_add_ps(CC0,_mm_mul_ps(CC1,z) );
367 factC = _mm_add_ps(factC,z2);
368 factD = _mm_add_ps(CD0,_mm_mul_ps(CD1,z) );
369 factD = _mm_add_ps(factD,z2);
371 z = _mm_mul_ps(CA0,fexppart);
372 factB = _mm_mul_ps(factB,factC);
373 z = _mm_mul_ps(z,factD);
374 z = _mm_mul_ps(z,factB);
376 /* Currently uses 22 actual (real, not including casts) SSE instructions */
377 return z;
382 static int
383 gmx_mm_sincos_ps(__m128 x,
384 __m128 *sinval,
385 __m128 *cosval)
387 const __m128 _sincosf_two_over_pi = _mm_set_ps(2.0/M_PI,2.0/M_PI,2.0/M_PI,2.0/M_PI);
388 const __m128 _sincosf_half = _mm_set_ps(0.5,0.5,0.5,0.5);
389 const __m128 _sincosf_one = _mm_set_ps(1.0,1.0,1.0,1.0);
391 const __m128i _sincosf_izero = _mm_set1_epi32(0);
392 const __m128i _sincosf_ione = _mm_set1_epi32(1);
393 const __m128i _sincosf_itwo = _mm_set1_epi32(2);
394 const __m128i _sincosf_ithree = _mm_set1_epi32(3);
396 const __m128 _sincosf_kc1 = _mm_set_ps(1.57079625129,1.57079625129,1.57079625129,1.57079625129);
397 const __m128 _sincosf_kc2 = _mm_set_ps(7.54978995489e-8,7.54978995489e-8,7.54978995489e-8,7.54978995489e-8);
398 const __m128 _sincosf_cc0 = _mm_set_ps(-0.0013602249,-0.0013602249,-0.0013602249,-0.0013602249);
399 const __m128 _sincosf_cc1 = _mm_set_ps(0.0416566950,0.0416566950,0.0416566950,0.0416566950);
400 const __m128 _sincosf_cc2 = _mm_set_ps(-0.4999990225,-0.4999990225,-0.4999990225,-0.4999990225);
401 const __m128 _sincosf_sc0 = _mm_set_ps(-0.0001950727,-0.0001950727,-0.0001950727,-0.0001950727);
402 const __m128 _sincosf_sc1 = _mm_set_ps(0.0083320758,0.0083320758,0.0083320758,0.0083320758);
403 const __m128 _sincosf_sc2 = _mm_set_ps(-0.1666665247,-0.1666665247,-0.1666665247,-0.1666665247);
405 __m128 _sincosf_signbit = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
406 __m128 _sincosf_tiny = gmx_mm_castsi128_ps( _mm_set1_epi32(0x3e400000) );
408 __m128 _sincosf_xl;
409 __m128 _sincosf_xl2;
410 __m128 _sincosf_xl3;
411 __m128 _sincosf_qf;
412 __m128 _sincosf_absxl;
413 __m128 _sincosf_p1;
414 __m128 _sincosf_cx;
415 __m128 _sincosf_sx;
416 __m128 _sincosf_ts;
417 __m128 _sincosf_tc;
418 __m128 _sincosf_tsn;
419 __m128 _sincosf_tcn;
420 __m128i _sincosf_q;
421 __m128i _sincosf_offsetSin;
422 __m128i _sincosf_offsetCos;
423 __m128 _sincosf_sinMask;
424 __m128 _sincosf_cosMask;
425 __m128 _sincosf_isTiny;
426 __m128 _sincosf_ct0;
427 __m128 _sincosf_ct1;
428 __m128 _sincosf_ct2;
429 __m128 _sincosf_st1;
430 __m128 _sincosf_st2;
432 _sincosf_xl = _mm_mul_ps(x,_sincosf_two_over_pi);
434 _sincosf_xl = _mm_add_ps(_sincosf_xl,_mm_or_ps(_mm_and_ps(_sincosf_xl,_sincosf_signbit),_sincosf_half));
436 _sincosf_q = _mm_cvttps_epi32(_sincosf_xl);
437 _sincosf_qf = _mm_cvtepi32_ps(_sincosf_q);
439 _sincosf_offsetSin = _mm_and_si128(_sincosf_q,_sincosf_ithree);
440 _sincosf_offsetCos = _mm_add_epi32(_sincosf_offsetSin,_sincosf_ione);
442 _sincosf_p1 = _mm_mul_ps(_sincosf_qf,_sincosf_kc1);
443 _sincosf_xl = _mm_mul_ps(_sincosf_qf,_sincosf_kc2);
444 _sincosf_p1 = _mm_sub_ps(x,_sincosf_p1);
445 _sincosf_xl = _mm_sub_ps(_sincosf_p1,_sincosf_xl);
447 _sincosf_absxl = _mm_andnot_ps(_sincosf_signbit,_sincosf_xl);
448 _sincosf_isTiny = _mm_cmpgt_ps(_sincosf_tiny,_sincosf_absxl);
450 _sincosf_xl2 = _mm_mul_ps(_sincosf_xl,_sincosf_xl);
451 _sincosf_xl3 = _mm_mul_ps(_sincosf_xl2,_sincosf_xl);
453 _sincosf_ct1 = _mm_mul_ps(_sincosf_cc0,_sincosf_xl2);
454 _sincosf_ct1 = _mm_add_ps(_sincosf_ct1,_sincosf_cc1);
455 _sincosf_st1 = _mm_mul_ps(_sincosf_sc0,_sincosf_xl2);
456 _sincosf_st1 = _mm_add_ps(_sincosf_st1,_sincosf_sc1);
457 _sincosf_ct2 = _mm_mul_ps(_sincosf_ct1,_sincosf_xl2);
458 _sincosf_ct2 = _mm_add_ps(_sincosf_ct2,_sincosf_cc2);
459 _sincosf_st2 = _mm_mul_ps(_sincosf_st1,_sincosf_xl2);
460 _sincosf_st2 = _mm_add_ps(_sincosf_st2,_sincosf_sc2);
462 _sincosf_cx = _mm_mul_ps(_sincosf_ct2,_sincosf_xl2);
463 _sincosf_cx = _mm_add_ps(_sincosf_cx,_sincosf_one);
465 _sincosf_sx = _mm_mul_ps(_sincosf_st2,_sincosf_xl3);
466 _sincosf_sx = _mm_add_ps(_sincosf_sx,_sincosf_xl);
468 _sincosf_sinMask = gmx_mm_castsi128_ps( _mm_cmpeq_epi32( _mm_and_si128(_sincosf_offsetSin,_sincosf_ione), _sincosf_izero) );
469 _sincosf_cosMask = gmx_mm_castsi128_ps( _mm_cmpeq_epi32( _mm_and_si128(_sincosf_offsetCos,_sincosf_ione), _sincosf_izero) );
471 _sincosf_ts = _mm_or_ps( _mm_and_ps(_sincosf_sinMask,_sincosf_sx) , _mm_andnot_ps(_sincosf_sinMask,_sincosf_cx) );
472 _sincosf_tc = _mm_or_ps( _mm_and_ps(_sincosf_cosMask,_sincosf_sx) , _mm_andnot_ps(_sincosf_cosMask,_sincosf_cx) );
474 _sincosf_sinMask = gmx_mm_castsi128_ps( _mm_cmpeq_epi32( _mm_and_si128(_sincosf_offsetSin,_sincosf_itwo), _sincosf_izero) );
475 _sincosf_tsn = _mm_xor_ps(_sincosf_signbit,_sincosf_ts);
476 _sincosf_ts = _mm_or_ps( _mm_and_ps(_sincosf_sinMask,_sincosf_ts) , _mm_andnot_ps(_sincosf_sinMask,_sincosf_tsn) );
478 _sincosf_cosMask = gmx_mm_castsi128_ps( _mm_cmpeq_epi32( _mm_and_si128(_sincosf_offsetCos,_sincosf_itwo), _sincosf_izero) );
479 _sincosf_tcn = _mm_xor_ps(_sincosf_signbit,_sincosf_tc);
480 _sincosf_tc = _mm_or_ps( _mm_and_ps(_sincosf_cosMask,_sincosf_tc) , _mm_andnot_ps(_sincosf_cosMask,_sincosf_tcn) );
482 *sinval = _sincosf_ts;
483 *cosval = _sincosf_tc;
485 return 0;
488 static __m128
489 gmx_mm_tan_ps(__m128 x)
491 __m128 sinval,cosval;
492 __m128 tanval;
494 gmx_mm_sincos_ps(x,&sinval,&cosval);
496 tanval = _mm_mul_ps(sinval,gmx_mm_inv_ps(cosval));
498 return tanval;
502 static __m128
503 gmx_mm_asin_ps(__m128 x)
505 /* Same algorithm as cephes library */
506 const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
507 const __m128 limitlow = _mm_set1_ps(1e-4f);
508 const __m128 half = _mm_set1_ps(0.5f);
509 const __m128 one = _mm_set1_ps(1.0f);
510 const __m128 halfpi = _mm_set1_ps(M_PI/2.0f);
512 const __m128 CC5 = _mm_set1_ps(4.2163199048E-2f);
513 const __m128 CC4 = _mm_set1_ps(2.4181311049E-2f);
514 const __m128 CC3 = _mm_set1_ps(4.5470025998E-2f);
515 const __m128 CC2 = _mm_set1_ps(7.4953002686E-2f);
516 const __m128 CC1 = _mm_set1_ps(1.6666752422E-1f);
518 __m128 sign;
519 __m128 mask;
520 __m128 xabs;
521 __m128 z,z1,z2,q,q1,q2;
522 __m128 pA,pB;
524 sign = _mm_andnot_ps(signmask,x);
525 xabs = _mm_and_ps(x,signmask);
527 mask = _mm_cmpgt_ps(xabs,half);
529 z1 = _mm_mul_ps(half, _mm_sub_ps(one,xabs));
530 q1 = _mm_mul_ps(z1,gmx_mm_invsqrt_ps(z1));
531 q1 = _mm_andnot_ps(_mm_cmpeq_ps(xabs,one),q1);
533 q2 = xabs;
534 z2 = _mm_mul_ps(q2,q2);
536 z = _mm_or_ps( _mm_and_ps(mask,z1) , _mm_andnot_ps(mask,z2) );
537 q = _mm_or_ps( _mm_and_ps(mask,q1) , _mm_andnot_ps(mask,q2) );
539 z2 = _mm_mul_ps(z,z);
541 pA = _mm_mul_ps(CC5,z2);
542 pB = _mm_mul_ps(CC4,z2);
544 pA = _mm_add_ps(pA,CC3);
545 pB = _mm_add_ps(pB,CC2);
547 pA = _mm_mul_ps(pA,z2);
548 pB = _mm_mul_ps(pB,z2);
550 pA = _mm_add_ps(pA,CC1);
551 pA = _mm_mul_ps(pA,z);
553 z = _mm_add_ps(pA,pB);
554 z = _mm_mul_ps(z,q);
555 z = _mm_add_ps(z,q);
557 q2 = _mm_sub_ps(halfpi,z);
558 q2 = _mm_sub_ps(q2,z);
560 z = _mm_or_ps( _mm_and_ps(mask,q2) , _mm_andnot_ps(mask,z) );
562 mask = _mm_cmpgt_ps(xabs,limitlow);
563 z = _mm_or_ps( _mm_and_ps(mask,z) , _mm_andnot_ps(mask,xabs) );
565 z = _mm_xor_ps(z,sign);
567 return z;
571 static __m128
572 gmx_mm_acos_ps(__m128 x)
574 const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
575 const __m128 one_ps = _mm_set1_ps(1.0f);
576 const __m128 half_ps = _mm_set1_ps(0.5f);
577 const __m128 pi_ps = _mm_set1_ps(M_PI);
578 const __m128 halfpi_ps = _mm_set1_ps(M_PI/2.0f);
580 __m128 mask1;
581 __m128 mask2;
582 __m128 xabs;
583 __m128 z,z1,z2,z3;
585 xabs = _mm_and_ps(x,signmask);
586 mask1 = _mm_cmpgt_ps(xabs,half_ps);
587 mask2 = _mm_cmpgt_ps(x,_mm_setzero_ps());
589 z = _mm_mul_ps(half_ps,_mm_sub_ps(one_ps,xabs));
590 z = _mm_mul_ps(z,gmx_mm_invsqrt_ps(z));
591 z = _mm_andnot_ps(_mm_cmpeq_ps(xabs,one_ps),z);
593 z = _mm_or_ps( _mm_and_ps(mask1,z) , _mm_andnot_ps(mask1,x) );
594 z = gmx_mm_asin_ps(z);
596 z2 = _mm_add_ps(z,z);
597 z1 = _mm_sub_ps(pi_ps,z2);
598 z3 = _mm_sub_ps(halfpi_ps,z);
600 z = _mm_or_ps( _mm_and_ps(mask2,z2) , _mm_andnot_ps(mask2,z1) );
601 z = _mm_or_ps( _mm_and_ps(mask1,z) , _mm_andnot_ps(mask1,z3) );
603 return z;
607 static __m128
608 gmx_mm_atan_ps(__m128 x)
610 /* Same algorithm as cephes library */
611 const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
612 const __m128 limit1 = _mm_set1_ps(0.414213562373095f);
613 const __m128 limit2 = _mm_set1_ps(2.414213562373095f);
614 const __m128 quarterpi = _mm_set1_ps(0.785398163397448f);
615 const __m128 halfpi = _mm_set1_ps(1.570796326794896f);
616 const __m128 mone = _mm_set1_ps(-1.0f);
617 const __m128 CC3 = _mm_set1_ps(-3.33329491539E-1f);
618 const __m128 CC5 = _mm_set1_ps(1.99777106478E-1f);
619 const __m128 CC7 = _mm_set1_ps(-1.38776856032E-1);
620 const __m128 CC9 = _mm_set1_ps(8.05374449538e-2f);
622 __m128 sign;
623 __m128 mask1,mask2;
624 __m128 y,z1,z2;
625 __m128 x2,x4;
626 __m128 sum1,sum2;
628 sign = _mm_andnot_ps(signmask,x);
629 x = _mm_and_ps(x,signmask);
631 mask1 = _mm_cmpgt_ps(x,limit1);
632 mask2 = _mm_cmpgt_ps(x,limit2);
634 z1 = _mm_mul_ps(_mm_add_ps(x,mone),gmx_mm_inv_ps(_mm_sub_ps(x,mone)));
635 z2 = _mm_mul_ps(mone,gmx_mm_inv_ps(x));
637 y = _mm_and_ps(mask1,quarterpi);
638 y = _mm_or_ps( _mm_and_ps(mask2,halfpi) , _mm_andnot_ps(mask2,y) );
640 x = _mm_or_ps( _mm_and_ps(mask1,z1) , _mm_andnot_ps(mask1,x) );
641 x = _mm_or_ps( _mm_and_ps(mask2,z2) , _mm_andnot_ps(mask2,x) );
643 x2 = _mm_mul_ps(x,x);
644 x4 = _mm_mul_ps(x2,x2);
646 sum1 = _mm_mul_ps(CC9,x4);
647 sum2 = _mm_mul_ps(CC7,x4);
648 sum1 = _mm_add_ps(sum1,CC5);
649 sum2 = _mm_add_ps(sum2,CC3);
650 sum1 = _mm_mul_ps(sum1,x4);
651 sum2 = _mm_mul_ps(sum2,x2);
653 sum1 = _mm_add_ps(sum1,sum2);
654 sum1 = _mm_sub_ps(sum1,mone);
655 sum1 = _mm_mul_ps(sum1,x);
656 y = _mm_add_ps(y,sum1);
658 y = _mm_xor_ps(y,sign);
660 return y;
664 static __m128
665 gmx_mm_atan2_ps(__m128 y, __m128 x)
667 const __m128 pi = _mm_set1_ps(M_PI);
668 const __m128 minuspi = _mm_set1_ps(-M_PI);
669 const __m128 halfpi = _mm_set1_ps(M_PI/2.0);
670 const __m128 minushalfpi = _mm_set1_ps(-M_PI/2.0);
672 __m128 z,z1,z3,z4;
673 __m128 w;
674 __m128 maskx_lt,maskx_eq;
675 __m128 masky_lt,masky_eq;
676 __m128 mask1,mask2,mask3,mask4,maskall;
678 maskx_lt = _mm_cmplt_ps(x,_mm_setzero_ps());
679 masky_lt = _mm_cmplt_ps(y,_mm_setzero_ps());
680 maskx_eq = _mm_cmpeq_ps(x,_mm_setzero_ps());
681 masky_eq = _mm_cmpeq_ps(y,_mm_setzero_ps());
683 z = _mm_mul_ps(y,gmx_mm_inv_ps(x));
684 z = gmx_mm_atan_ps(z);
686 mask1 = _mm_and_ps(maskx_eq,masky_lt);
687 mask2 = _mm_andnot_ps(maskx_lt,masky_eq);
688 mask3 = _mm_andnot_ps( _mm_or_ps(masky_lt,masky_eq) , maskx_eq);
689 mask4 = _mm_and_ps(masky_eq,maskx_lt);
691 maskall = _mm_or_ps( _mm_or_ps(mask1,mask2), _mm_or_ps(mask3,mask4) );
693 z = _mm_andnot_ps(maskall,z);
694 z1 = _mm_and_ps(mask1,minushalfpi);
695 z3 = _mm_and_ps(mask3,halfpi);
696 z4 = _mm_and_ps(mask4,pi);
698 z = _mm_or_ps( _mm_or_ps(z,z1), _mm_or_ps(z3,z4) );
700 mask1 = _mm_andnot_ps(masky_lt,maskx_lt);
701 mask2 = _mm_and_ps(maskx_lt,masky_lt);
703 w = _mm_or_ps( _mm_and_ps(mask1,pi), _mm_and_ps(mask2,minuspi) );
704 w = _mm_andnot_ps(maskall,w);
706 z = _mm_add_ps(z,w);
708 return z;
/* Load a single value from 1-4 places, merge into xmm register.
 *
 * After the macro, lane k of xmm1 holds *ptr(k+1); lanes without a pointer
 * are zero. The macros expand to a brace-enclosed block (not do/while), so
 * they can be used with or without a trailing semicolon as before.
 */

#define GMX_MM_LOAD_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,xmm1) \
{                                                        \
    __m128 _txmm2,_txmm3,_txmm4;                         \
    xmm1   = _mm_load_ss(ptr1);                          \
    _txmm2 = _mm_load_ss(ptr2);                          \
    _txmm3 = _mm_load_ss(ptr3);                          \
    _txmm4 = _mm_load_ss(ptr4);                          \
    xmm1   = _mm_unpacklo_ps(xmm1,_txmm3);               \
    _txmm2 = _mm_unpacklo_ps(_txmm2,_txmm4);             \
    xmm1   = _mm_unpacklo_ps(xmm1,_txmm2);               \
}

#define GMX_MM_LOAD_3VALUES_PS(ptr1,ptr2,ptr3,xmm1)      \
{                                                        \
    __m128 _txmm2,_txmm3;                                \
    xmm1   = _mm_load_ss(ptr1);                          \
    _txmm2 = _mm_load_ss(ptr2);                          \
    _txmm3 = _mm_load_ss(ptr3);                          \
    xmm1   = _mm_unpacklo_ps(xmm1,_txmm3);               \
    xmm1   = _mm_unpacklo_ps(xmm1,_txmm2);               \
}

#define GMX_MM_LOAD_2VALUES_PS(ptr1,ptr2,xmm1)           \
{                                                        \
    __m128 _txmm2;                                       \
    xmm1   = _mm_load_ss(ptr1);                          \
    _txmm2 = _mm_load_ss(ptr2);                          \
    xmm1   = _mm_unpacklo_ps(xmm1,_txmm2);               \
}

#define GMX_MM_LOAD_1VALUE_PS(ptr1,xmm1)                 \
{                                                        \
    xmm1   = _mm_load_ss(ptr1);                          \
}
/* Store data in an xmm register into 1-4 different places.
 *
 * Lane k of xmm1 is written to *ptr(k+1). xmm1 itself is not modified, but
 * it is evaluated more than once, so pass a plain variable.
 */
#define GMX_MM_STORE_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,xmm1)        \
{                                                                \
    __m128 _txmm2,_txmm3,_txmm4;                                 \
    _txmm3 = _mm_movehl_ps(_mm_setzero_ps(),xmm1);               \
    _txmm2 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1));     \
    _txmm4 = _mm_shuffle_ps(_txmm3,_txmm3,_MM_SHUFFLE(1,1,1,1)); \
    _mm_store_ss(ptr1,xmm1);                                     \
    _mm_store_ss(ptr2,_txmm2);                                   \
    _mm_store_ss(ptr3,_txmm3);                                   \
    _mm_store_ss(ptr4,_txmm4);                                   \
}

#define GMX_MM_STORE_3VALUES_PS(ptr1,ptr2,ptr3,xmm1)             \
{                                                                \
    __m128 _txmm2,_txmm3;                                        \
    _txmm3 = _mm_movehl_ps(_mm_setzero_ps(),xmm1);               \
    _txmm2 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1));     \
    _mm_store_ss(ptr1,xmm1);                                     \
    _mm_store_ss(ptr2,_txmm2);                                   \
    _mm_store_ss(ptr3,_txmm3);                                   \
}

#define GMX_MM_STORE_2VALUES_PS(ptr1,ptr2,xmm1)                  \
{                                                                \
    __m128 _txmm2;                                               \
    _txmm2 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1));     \
    _mm_store_ss(ptr1,xmm1);                                     \
    _mm_store_ss(ptr2,_txmm2);                                   \
}

#define GMX_MM_STORE_1VALUE_PS(ptr1,xmm1)                        \
{                                                                \
    _mm_store_ss(ptr1,xmm1);                                     \
}
/* Similar to store, but increments value in memory.
 *
 * Each macro gathers the current values with the matching GMX_MM_LOAD_*
 * macro, adds xmm1 (and xmm2 for the 8-value version) lane by lane, and
 * scatters the sums back with the matching GMX_MM_STORE_* macro.
 */
#define GMX_MM_INCREMENT_8VALUES_PS(ptr1,ptr2,ptr3,ptr4,ptr5,ptr6,ptr7,ptr8,xmm1,xmm2) \
{                                                                \
    __m128 _tincr1,_tincr2;                                      \
    GMX_MM_LOAD_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,_tincr1);         \
    GMX_MM_LOAD_4VALUES_PS(ptr5,ptr6,ptr7,ptr8,_tincr2);         \
    _tincr1 = _mm_add_ps(_tincr1,xmm1);                          \
    _tincr2 = _mm_add_ps(_tincr2,xmm2);                          \
    GMX_MM_STORE_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,_tincr1);        \
    GMX_MM_STORE_4VALUES_PS(ptr5,ptr6,ptr7,ptr8,_tincr2);        \
}

#define GMX_MM_INCREMENT_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,xmm1)    \
{                                                                \
    __m128 _tincr;                                               \
    GMX_MM_LOAD_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,_tincr);          \
    _tincr = _mm_add_ps(_tincr,xmm1);                            \
    GMX_MM_STORE_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,_tincr);         \
}

#define GMX_MM_INCREMENT_3VALUES_PS(ptr1,ptr2,ptr3,xmm1)         \
{                                                                \
    __m128 _tincr;                                               \
    GMX_MM_LOAD_3VALUES_PS(ptr1,ptr2,ptr3,_tincr);               \
    _tincr = _mm_add_ps(_tincr,xmm1);                            \
    GMX_MM_STORE_3VALUES_PS(ptr1,ptr2,ptr3,_tincr);              \
}

#define GMX_MM_INCREMENT_2VALUES_PS(ptr1,ptr2,xmm1)              \
{                                                                \
    __m128 _tincr;                                               \
    GMX_MM_LOAD_2VALUES_PS(ptr1,ptr2,_tincr);                    \
    _tincr = _mm_add_ps(_tincr,xmm1);                            \
    GMX_MM_STORE_2VALUES_PS(ptr1,ptr2,_tincr);                   \
}

/* The single-value version adds only lane 0 (scalar add) */
#define GMX_MM_INCREMENT_1VALUE_PS(ptr1,xmm1)                    \
{                                                                \
    __m128 _tincr;                                               \
    GMX_MM_LOAD_1VALUE_PS(ptr1,_tincr);                          \
    _tincr = _mm_add_ss(_tincr,xmm1);                            \
    GMX_MM_STORE_1VALUE_PS(ptr1,_tincr);                         \
}
/* Routines to load pairs from 1-4 places, put in two separate xmm registers. Useful to load LJ parameters! */

/* Each pointer addresses two consecutive floats. On exit lane k of c6 holds
 * ptr(k+1)[0] and lane k of c12 holds ptr(k+1)[1].
 */
#define GMX_MM_LOAD_4PAIRS_PS(ptr1,ptr2,ptr3,ptr4,c6,c12)             \
{                                                                     \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4;                                   \
    _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(const __m64 *)(ptr1));     \
    _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(const __m64 *)(ptr2));     \
    _tmp3 = _mm_loadl_pi(_mm_setzero_ps(),(const __m64 *)(ptr3));     \
    _tmp4 = _mm_loadl_pi(_mm_setzero_ps(),(const __m64 *)(ptr4));     \
    _tmp1 = _mm_unpacklo_ps(_tmp1,_tmp3);                             \
    _tmp2 = _mm_unpacklo_ps(_tmp2,_tmp4);                             \
    c6    = _mm_unpacklo_ps(_tmp1,_tmp2);                             \
    c12   = _mm_unpackhi_ps(_tmp1,_tmp2);                             \
}
/* Load three pairs of consecutive floats. Lanes 0-2 of c6 receive ptrN[0],
 * lanes 0-2 of c12 receive ptrN[1]; lane 3 of both is zero.
 */
#define GMX_MM_LOAD_3PAIRS_PS(ptr1,ptr2,ptr3,c6,c12)                  \
{                                                                     \
    __m128 _tmp1,_tmp2,_tmp3;                                         \
    _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(const __m64 *)(ptr1));     \
    _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(const __m64 *)(ptr2));     \
    _tmp3 = _mm_loadl_pi(_mm_setzero_ps(),(const __m64 *)(ptr3));     \
    _tmp1 = _mm_unpacklo_ps(_tmp1,_tmp3);                             \
    _tmp2 = _mm_unpacklo_ps(_tmp2,_mm_setzero_ps());                  \
    c6    = _mm_unpacklo_ps(_tmp1,_tmp2);                             \
    c12   = _mm_unpackhi_ps(_tmp1,_tmp2);                             \
}
/* Load two pairs of consecutive floats. Lanes 0-1 of c6 receive ptrN[0] and
 * lanes 0-1 of c12 receive ptrN[1].
 *
 * Lanes 2-3 of c6 hold a copy of the second elements (a side effect of the
 * unpack); lanes 2-3 of c12 are zero. c12 is write-only here: the previous
 * form merged into the caller's c12, reading it before it was ever assigned.
 */
#define GMX_MM_LOAD_2PAIRS_PS(ptr1,ptr2,c6,c12)                       \
{                                                                     \
    __m128 _tmp1,_tmp2;                                               \
    _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(const __m64 *)(ptr1));     \
    _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(const __m64 *)(ptr2));     \
    c6    = _mm_unpacklo_ps(_tmp1,_tmp2);                             \
    c12   = _mm_movehl_ps(_mm_setzero_ps(),c6);                       \
}
/* Load one pair of consecutive floats: lane 0 of c6 receives ptr1[0] and
 * lane 0 of c12 receives ptr1[1]; the other lanes are zero.
 * ptr1 is parenthesised so that any pointer expression can be passed.
 */
#define GMX_MM_LOAD_1PAIR_PS(ptr1,c6,c12)  \
{                                          \
    c6    = _mm_load_ss(ptr1);             \
    c12   = _mm_load_ss((ptr1)+1);         \
}
/* Routines to load 1-4 rvecs from 1-4 places.
 * We mainly use these to load coordinates. The extra routines
 * are very efficient for the water-water loops, since we e.g.
 * know that a TIP4p water has 4 atoms, so we should load 12 floats+shuffle.
 */

/* Single-pointer versions: consecutive floats ptr1[0], ptr1[1], ... are each
 * loaded into lane 0 of their own register (remaining lanes zero), in the
 * order jx1,jy1,jz1,jx2,...
 */
#define GMX_MM_LOAD_1RVEC_1POINTER_PS(ptr1,jx1,jy1,jz1) { \
    jx1 = _mm_load_ss(ptr1);                              \
    jy1 = _mm_load_ss((ptr1)+1);                          \
    jz1 = _mm_load_ss((ptr1)+2);                          \
}

#define GMX_MM_LOAD_2RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2) { \
    jx1 = _mm_load_ss(ptr1);                              \
    jy1 = _mm_load_ss((ptr1)+1);                          \
    jz1 = _mm_load_ss((ptr1)+2);                          \
    jx2 = _mm_load_ss((ptr1)+3);                          \
    jy2 = _mm_load_ss((ptr1)+4);                          \
    jz2 = _mm_load_ss((ptr1)+5);                          \
}

#define GMX_MM_LOAD_3RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    jx1 = _mm_load_ss(ptr1);                              \
    jy1 = _mm_load_ss((ptr1)+1);                          \
    jz1 = _mm_load_ss((ptr1)+2);                          \
    jx2 = _mm_load_ss((ptr1)+3);                          \
    jy2 = _mm_load_ss((ptr1)+4);                          \
    jz2 = _mm_load_ss((ptr1)+5);                          \
    jx3 = _mm_load_ss((ptr1)+6);                          \
    jy3 = _mm_load_ss((ptr1)+7);                          \
    jz3 = _mm_load_ss((ptr1)+8);                          \
}

#define GMX_MM_LOAD_4RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    jx1 = _mm_load_ss(ptr1);                              \
    jy1 = _mm_load_ss((ptr1)+1);                          \
    jz1 = _mm_load_ss((ptr1)+2);                          \
    jx2 = _mm_load_ss((ptr1)+3);                          \
    jy2 = _mm_load_ss((ptr1)+4);                          \
    jz2 = _mm_load_ss((ptr1)+5);                          \
    jx3 = _mm_load_ss((ptr1)+6);                          \
    jy3 = _mm_load_ss((ptr1)+7);                          \
    jz3 = _mm_load_ss((ptr1)+8);                          \
    jx4 = _mm_load_ss((ptr1)+9);                          \
    jy4 = _mm_load_ss((ptr1)+10);                         \
    jz4 = _mm_load_ss((ptr1)+11);                         \
}
/* Load one rvec (3 floats) from each of two pointers and transpose, so that
 * lanes 0-1 of jx1/jy1/jz1 hold the x/y/z of ptr1 and ptr2 respectively.
 * Lanes 2-3 of the outputs carry no meaningful data.
 *
 * jz1 is derived from jy1 alone (the z values sit in its upper half), so no
 * output register is read before it has been written.
 */
#define GMX_MM_LOAD_1RVEC_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1) {   \
    __m128 _tmp1,_tmp2;                                           \
    _tmp1 = _mm_load_ss(ptr1);                                    \
    _tmp2 = _mm_load_ss(ptr2);                                    \
    _tmp1 = _mm_loadh_pi(_tmp1,(const __m64 *)((ptr1)+1));        \
    _tmp2 = _mm_loadh_pi(_tmp2,(const __m64 *)((ptr2)+1));        \
    jx1   = _mm_unpacklo_ps(_tmp1,_tmp2);                         \
    jy1   = _mm_unpackhi_ps(_tmp1,_tmp2);                         \
    jz1   = _mm_movehl_ps(jy1,jy1);                               \
}
/* Load two rvecs (6 consecutive floats) from each of two pointers and
 * transpose: lanes 0-1 of every output hold that coordinate for ptr1 and
 * ptr2. Lanes 2-3 of the outputs carry no meaningful data.
 */
#define GMX_MM_LOAD_2RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1, _tmp2;                                                \
    _tmp1 = _mm_loadu_ps(ptr1);                                         \
    jy1   = _mm_loadu_ps(ptr2);                                         \
    jy2   = _mm_loadl_pi(_mm_setzero_ps(),(const __m64 *)((ptr1)+4));   \
    _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(const __m64 *)((ptr2)+4));   \
    jx1   = _mm_unpacklo_ps(_tmp1,jy1);                                 \
    jz1   = _mm_unpackhi_ps(_tmp1,jy1);                                 \
    jy2   = _mm_unpacklo_ps(jy2,_tmp2);                                 \
    jy1   = _mm_movehl_ps(jx1,jx1);                                     \
    jx2   = _mm_movehl_ps(jz1,jz1);                                     \
    jz2   = _mm_movehl_ps(jy2,jy2);                                     \
}
/* Load three rvecs (9 consecutive floats) from each of two pointers and
 * transpose: lanes 0-1 of every output hold that coordinate for ptr1 and
 * ptr2. Lanes 2-3 of the outputs carry no meaningful data.
 */
#define GMX_MM_LOAD_3RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1, _tmp2, _tmp3;                 \
    _tmp1 = _mm_loadu_ps(ptr1);                 \
    jy1   = _mm_loadu_ps(ptr2);                 \
    _tmp2 = _mm_loadu_ps((ptr1)+4);             \
    jz2   = _mm_loadu_ps((ptr2)+4);             \
    jz3   = _mm_load_ss((ptr1)+8);              \
    _tmp3 = _mm_load_ss((ptr2)+8);              \
    jx1   = _mm_unpacklo_ps(_tmp1,jy1);         \
    jz1   = _mm_unpackhi_ps(_tmp1,jy1);         \
    jy2   = _mm_unpacklo_ps(_tmp2,jz2);         \
    jx3   = _mm_unpackhi_ps(_tmp2,jz2);         \
    jy1   = _mm_movehl_ps(jx1,jx1);             \
    jx2   = _mm_movehl_ps(jz1,jz1);             \
    jz2   = _mm_movehl_ps(jy2,jy2);             \
    jy3   = _mm_movehl_ps(jx3,jx3);             \
    jz3   = _mm_unpacklo_ps(jz3,_tmp3);         \
}
/* Load four rvecs (12 consecutive floats) from each of two pointers and
 * transpose: lanes 0-1 of every output hold that coordinate for ptr1 and
 * ptr2. Lanes 2-3 of the outputs carry no meaningful data.
 */
#define GMX_MM_LOAD_4RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128 _tmp1, _tmp2, _tmp3,_tmp4;           \
    _tmp1 = _mm_loadu_ps(ptr1);                 \
    jy1   = _mm_loadu_ps(ptr2);                 \
    _tmp2 = _mm_loadu_ps((ptr1)+4);             \
    jz2   = _mm_loadu_ps((ptr2)+4);             \
    _tmp3 = _mm_loadu_ps((ptr1)+8);             \
    _tmp4 = _mm_loadu_ps((ptr2)+8);             \
    jx1   = _mm_unpacklo_ps(_tmp1,jy1);         \
    jz1   = _mm_unpackhi_ps(_tmp1,jy1);         \
    jy2   = _mm_unpacklo_ps(_tmp2,jz2);         \
    jx3   = _mm_unpackhi_ps(_tmp2,jz2);         \
    jz3   = _mm_unpacklo_ps(_tmp3,_tmp4);       \
    jy4   = _mm_unpackhi_ps(_tmp3,_tmp4);       \
    jy1   = _mm_movehl_ps(jx1,jx1);             \
    jx2   = _mm_movehl_ps(jz1,jz1);             \
    jz2   = _mm_movehl_ps(jy2,jy2);             \
    jy3   = _mm_movehl_ps(jx3,jx3);             \
    jx4   = _mm_movehl_ps(jz3,jz3);             \
    jz4   = _mm_movehl_ps(jy4,jy4);             \
}
/* Load one rvec (3 floats) from each of three pointers and transpose:
 * lanes 0-2 of jx1/jy1/jz1 hold the x/y/z of ptr1, ptr2 and ptr3.
 * Lane 3 of jx1 is zero; lane 3 of jy1 and jz1 repeats the ptr3 value.
 */
#define GMX_MM_LOAD_1RVEC_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1) { \
    __m128 _tmp1,_tmp3,_tmp4;                                \
    jx1   = _mm_load_ss(ptr1);                               \
    jy1   = _mm_load_ss(ptr2);                               \
    jz1   = _mm_load_ss(ptr3);                               \
    jx1   = _mm_loadh_pi(jx1,(const __m64 *)((ptr1)+1));     \
    jy1   = _mm_loadh_pi(jy1,(const __m64 *)((ptr2)+1));     \
    jz1   = _mm_loadh_pi(jz1,(const __m64 *)((ptr3)+1));     \
    _tmp1 = _mm_unpacklo_ps(jx1,jy1);                        \
    _tmp3 = _mm_unpackhi_ps(jx1,jy1);                        \
    _tmp4 = _mm_unpackhi_ps(jz1,jz1);                        \
    jx1   = _mm_movelh_ps(_tmp1,jz1);                        \
    jy1   = _mm_movelh_ps(_tmp3,_tmp4);                      \
    jz1   = _mm_movehl_ps(_tmp4,_tmp3);                      \
}
/* Load two rvecs (six consecutive floats) through each of three pointers and
 * transpose them into per-coordinate SSE registers.
 *
 * ptr1..ptr3 : float pointers; ptr[0..5] is read from each with unaligned loads.
 * jx1 .. jz2 : __m128 lvalues, overwritten. Element p (p = 0,1,2) receives the
 *              value read through ptr(p+1); element 3 is zero.
 *
 * The first four floats go through a 4x4 transpose whose fourth row is zero;
 * the remaining y2/z2 pair is interleaved by hand.
 */
#define GMX_MM_LOAD_2RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1, _tmp2; \
    jx1 = _mm_loadu_ps(ptr1); \
    jy1 = _mm_loadu_ps(ptr2); \
    jz1 = _mm_loadu_ps(ptr3); \
    jx2 = _mm_setzero_ps(); \
    _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2); \
    _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4)); \
    jz2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2+4)); \
    _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3+4)); \
    _tmp1 = _mm_unpacklo_ps(_tmp1,_tmp2); \
    jz2 = _mm_unpacklo_ps(jz2,_mm_setzero_ps()); \
    jy2 = _mm_unpacklo_ps(_tmp1,jz2); \
    jz2 = _mm_unpackhi_ps(_tmp1,jz2); \
}
/* Load three rvecs (nine consecutive floats) through each of three pointers
 * and transpose them into per-coordinate SSE registers.
 *
 * ptr1..ptr3 : float pointers; ptr[0..8] is read from each with unaligned loads.
 * jx1 .. jz3 : __m128 lvalues, overwritten. Element p (p = 0,1,2) receives the
 *              value read through ptr(p+1); element 3 is zero.
 *
 * Floats 0-3 and 4-7 each go through a 4x4 transpose with a zero fourth row;
 * the ninth float (z3) is gathered with scalar loads and unpacks.
 */
#define GMX_MM_LOAD_3RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1, _tmp2; \
    jx1 = _mm_loadu_ps(ptr1); \
    jy1 = _mm_loadu_ps(ptr2); \
    jz1 = _mm_loadu_ps(ptr3); \
    jx2 = _mm_setzero_ps(); \
    _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2); \
    jy2 = _mm_loadu_ps(ptr1+4); \
    jz2 = _mm_loadu_ps(ptr2+4); \
    jx3 = _mm_loadu_ps(ptr3+4); \
    jy3 = _mm_setzero_ps(); \
    _MM_TRANSPOSE4_PS(jy2,jz2,jx3,jy3); \
    jz3 = _mm_load_ss(ptr1+8); \
    _tmp1 = _mm_load_ss(ptr2+8); \
    _tmp2 = _mm_load_ss(ptr3+8); \
    jz3 = _mm_unpacklo_ps(jz3,_tmp2); \
    _tmp1 = _mm_unpacklo_ps(_tmp1,_mm_setzero_ps()); \
    jz3 = _mm_unpacklo_ps(jz3,_tmp1); \
}
/* Load four rvecs (twelve consecutive floats) through each of three pointers
 * and transpose them into per-coordinate SSE registers.
 *
 * ptr1..ptr3 : float pointers; ptr[0..11] is read from each with unaligned loads.
 * jx1 .. jz4 : __m128 lvalues, overwritten. Element p (p = 0,1,2) receives the
 *              value read through ptr(p+1); element 3 is zero.
 *
 * Implemented as three 4x4 transposes, each with a zeroed fourth row.
 */
#define GMX_MM_LOAD_4RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    jx1 = _mm_loadu_ps(ptr1); \
    jy1 = _mm_loadu_ps(ptr2); \
    jz1 = _mm_loadu_ps(ptr3); \
    jx2 = _mm_setzero_ps(); \
    _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2); \
    jy2 = _mm_loadu_ps(ptr1+4); \
    jz2 = _mm_loadu_ps(ptr2+4); \
    jx3 = _mm_loadu_ps(ptr3+4); \
    jy3 = _mm_setzero_ps(); \
    _MM_TRANSPOSE4_PS(jy2,jz2,jx3,jy3); \
    jz3 = _mm_loadu_ps(ptr1+8); \
    jx4 = _mm_loadu_ps(ptr2+8); \
    jy4 = _mm_loadu_ps(ptr3+8); \
    jz4 = _mm_setzero_ps(); \
    _MM_TRANSPOSE4_PS(jz3,jx4,jy4,jz4); \
}
/* Load one rvec (three consecutive floats) through each of four pointers and
 * transpose them into per-coordinate SSE registers.
 *
 * ptr1..ptr4    : float pointers; exactly ptr[0..2] is read from each
 *                 (one scalar load plus one 64-bit load, no over-read).
 * jx1, jy1, jz1 : __m128 lvalues, overwritten. Element p (p = 0..3) receives
 *                 the value read through ptr(p+1).
 *
 * jy1 and jz1 temporarily hold the raw data of ptr3 and ptr4 before they get
 * their final contents, so the statement order matters.
 */
#define GMX_MM_LOAD_1RVEC_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1) { \
    __m128 _tmp1, _tmp2, _tmp3, _tmp4, _tmp5; \
    jx1 = _mm_load_ss(ptr1); \
    _tmp1 = _mm_load_ss(ptr2); \
    jy1 = _mm_load_ss(ptr3); \
    jz1 = _mm_load_ss(ptr4); \
    jx1 = _mm_loadh_pi(jx1,(__m64 *)(ptr1+1)); \
    _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr2+1)); \
    jy1 = _mm_loadh_pi(jy1,(__m64 *)(ptr3+1)); \
    jz1 = _mm_loadh_pi(jz1,(__m64 *)(ptr4+1)); \
    _tmp2 = _mm_unpacklo_ps(jx1,_tmp1); \
    _tmp3 = _mm_unpacklo_ps(jy1,jz1); \
    _tmp4 = _mm_unpackhi_ps(jx1,_tmp1); \
    _tmp5 = _mm_unpackhi_ps(jy1,jz1); \
    jx1 = _mm_movelh_ps(_tmp2,_tmp3); \
    jy1 = _mm_movelh_ps(_tmp4,_tmp5); \
    jz1 = _mm_movehl_ps(_tmp5,_tmp4); \
}
/* Load two rvecs (six consecutive floats) through each of four pointers and
 * transpose them into per-coordinate SSE registers.
 *
 * ptr1..ptr4 : float pointers; ptr[0..5] is read from each with unaligned loads.
 * jx1 .. jz2 : __m128 lvalues, overwritten. Element p (p = 0..3) receives the
 *              value read through ptr(p+1).
 *
 * Floats 0-3 go through a 4x4 transpose; the trailing y2/z2 pair of each
 * pointer is loaded as 64 bits and interleaved with unpack operations.
 */
#define GMX_MM_LOAD_2RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1, _tmp2; \
    jx1 = _mm_loadu_ps(ptr1); \
    jy1 = _mm_loadu_ps(ptr2); \
    jz1 = _mm_loadu_ps(ptr3); \
    jx2 = _mm_loadu_ps(ptr4); \
    _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2); \
    jy2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4)); \
    jz2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2+4)); \
    _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3+4)); \
    _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr4+4)); \
    _tmp1 = _mm_unpacklo_ps(jy2,_tmp1); \
    _tmp2 = _mm_unpacklo_ps(jz2,_tmp2); \
    jy2 = _mm_unpacklo_ps(_tmp1,_tmp2); \
    jz2 = _mm_unpackhi_ps(_tmp1,_tmp2); \
}
/* Load three rvecs (nine consecutive floats) through each of four pointers and
 * transpose them into per-coordinate SSE registers.
 *
 * ptr1..ptr4 : float pointers; ptr[0..8] is read from each with unaligned loads.
 * jx1 .. jz3 : __m128 lvalues, overwritten. Element p (p = 0..3) receives the
 *              value read through ptr(p+1).
 *
 * Floats 0-3 and 4-7 each go through a 4x4 transpose; the ninth float (z3) is
 * gathered with scalar loads and unpacks.
 */
#define GMX_MM_LOAD_3RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1, _tmp2, _tmp3; \
    jx1 = _mm_loadu_ps(ptr1); \
    jy1 = _mm_loadu_ps(ptr2); \
    jz1 = _mm_loadu_ps(ptr3); \
    jx2 = _mm_loadu_ps(ptr4); \
    _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2); \
    jy2 = _mm_loadu_ps(ptr1+4); \
    jz2 = _mm_loadu_ps(ptr2+4); \
    jx3 = _mm_loadu_ps(ptr3+4); \
    jy3 = _mm_loadu_ps(ptr4+4); \
    _MM_TRANSPOSE4_PS(jy2,jz2,jx3,jy3); \
    jz3 = _mm_load_ss(ptr1+8); \
    _tmp1 = _mm_load_ss(ptr2+8); \
    _tmp2 = _mm_load_ss(ptr3+8); \
    _tmp3 = _mm_load_ss(ptr4+8); \
    jz3 = _mm_unpacklo_ps(jz3,_tmp2); \
    _tmp1 = _mm_unpacklo_ps(_tmp1,_tmp3); \
    jz3 = _mm_unpacklo_ps(jz3,_tmp1); \
}
/* Load four rvecs (twelve consecutive floats) through each of four pointers
 * and transpose them into per-coordinate SSE registers.
 *
 * ptr1..ptr4 : float pointers; ptr[0..11] is read from each with unaligned loads.
 * jx1 .. jz4 : __m128 lvalues, overwritten. Element p (p = 0..3) receives the
 *              value read through ptr(p+1).
 *
 * Implemented as three independent 4x4 transposes.
 */
#define GMX_MM_LOAD_4RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    jx1 = _mm_loadu_ps(ptr1); \
    jy1 = _mm_loadu_ps(ptr2); \
    jz1 = _mm_loadu_ps(ptr3); \
    jx2 = _mm_loadu_ps(ptr4); \
    _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2); \
    jy2 = _mm_loadu_ps(ptr1+4); \
    jz2 = _mm_loadu_ps(ptr2+4); \
    jx3 = _mm_loadu_ps(ptr3+4); \
    jy3 = _mm_loadu_ps(ptr4+4); \
    _MM_TRANSPOSE4_PS(jy2,jz2,jx3,jy3); \
    jz3 = _mm_loadu_ps(ptr1+8); \
    jx4 = _mm_loadu_ps(ptr2+8); \
    jy4 = _mm_loadu_ps(ptr3+8); \
    jz4 = _mm_loadu_ps(ptr4+8); \
    _MM_TRANSPOSE4_PS(jz3,jx4,jy4,jz4); \
}
/* Routines to increment rvecs in memory, typically used for j-particle force updates */
/* Add one rvec held in element 0 of three SSE registers to memory.
 *
 * ptr1          : float pointer; ptr1[0..2] is read, incremented and written
 *                 back (no access beyond the three floats).
 * jx1, jy1, jz1 : __m128 lvalues; element 0 of each is the x/y/z increment.
 *                 jx1 and jy1 are used as scratch and are modified.
 */
#define GMX_MM_INCREMENT_1RVEC_1POINTER_PS(ptr1,jx1,jy1,jz1) { \
    __m128 _tmp1; \
    jy1 = _mm_unpacklo_ps(jy1,jz1); \
    jx1 = _mm_movelh_ps(jx1,jy1); \
    _tmp1 = _mm_load_ss(ptr1); \
    _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr1+1)); \
    _tmp1 = _mm_add_ps(_tmp1,jx1); \
    _mm_store_ss(ptr1,_tmp1); \
    _mm_storeh_pi((__m64 *)(ptr1+1),_tmp1); \
}
/* Add two rvecs held in element 0 of six SSE registers to memory.
 *
 * ptr1       : float pointer; ptr1[0..5] is read, incremented and written back.
 * jx1 .. jz2 : __m128 lvalues; element 0 of each is the increment for the
 *              matching coordinate. jx1, jz1 and jy2 are used as scratch and
 *              are modified.
 */
#define GMX_MM_INCREMENT_2RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1, _tmp2; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4)); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    jx1 = _mm_movelh_ps(jx1,jz1); \
    _tmp1 = _mm_add_ps(_tmp1,jx1); \
    _tmp2 = _mm_add_ps(_tmp2,jy2); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storel_pi((__m64 *)(ptr1+4),_tmp2); \
}
/* Add three rvecs held in element 0 of nine SSE registers to memory.
 *
 * ptr1       : float pointer; ptr1[0..8] is read, incremented and written back.
 * jx1 .. jz3 : __m128 lvalues; element 0 of each is the increment for the
 *              matching coordinate. jx1, jz1, jy2 and jx3 are used as scratch
 *              and are modified.
 */
#define GMX_MM_INCREMENT_3RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1, _tmp2, _tmp3; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr1+4); \
    _tmp3 = _mm_load_ss(ptr1+8); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    jx3 = _mm_unpacklo_ps(jx3,jy3); \
    jx1 = _mm_movelh_ps(jx1,jz1); \
    jy2 = _mm_movelh_ps(jy2,jx3); \
    _tmp1 = _mm_add_ps(_tmp1,jx1); \
    _tmp2 = _mm_add_ps(_tmp2,jy2); \
    _tmp3 = _mm_add_ss(_tmp3,jz3); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr1+4,_tmp2); \
    _mm_store_ss(ptr1+8,_tmp3); \
}
/* Add four rvecs held in element 0 of twelve SSE registers to memory.
 *
 * ptr1       : float pointer; ptr1[0..11] is read, incremented and written back.
 * jx1 .. jz4 : __m128 lvalues; element 0 of each is the increment for the
 *              matching coordinate. jx1, jz1, jy2, jx3, jz3 and jy4 are used
 *              as scratch and are modified.
 */
#define GMX_MM_INCREMENT_4RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128 _tmp1, _tmp2, _tmp3; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr1+4); \
    _tmp3 = _mm_loadu_ps(ptr1+8); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    jx3 = _mm_unpacklo_ps(jx3,jy3); \
    jz3 = _mm_unpacklo_ps(jz3,jx4); \
    jy4 = _mm_unpacklo_ps(jy4,jz4); \
    jx1 = _mm_movelh_ps(jx1,jz1); \
    jy2 = _mm_movelh_ps(jy2,jx3); \
    jz3 = _mm_movelh_ps(jz3,jy4); \
    _tmp1 = _mm_add_ps(_tmp1,jx1); \
    _tmp2 = _mm_add_ps(_tmp2,jy2); \
    _tmp3 = _mm_add_ps(_tmp3,jz3); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr1+4,_tmp2); \
    _mm_storeu_ps(ptr1+8,_tmp3); \
}
/* Add one rvec per pointer to memory behind two pointers.
 *
 * ptr1, ptr2    : float pointers; ptr[0..2] of each is read, incremented and
 *                 written back (no access beyond the three floats).
 * jx1, jy1, jz1 : __m128 lvalues; element 0 is the increment for ptr1 and
 *                 element 1 the increment for ptr2. jx1 is used as scratch
 *                 and is modified.
 */
#define GMX_MM_INCREMENT_1RVEC_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1) { \
    __m128 _tmp1, _tmp2, _tmp3, _tmp4; \
    _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1)); \
    _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr2)); \
    _tmp2 = _mm_load_ss(ptr1+2); \
    _tmp3 = _mm_load_ss(ptr2+2); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    _tmp4 = _mm_shuffle_ps(jz1,jz1,_MM_SHUFFLE(0,0,0,1)); \
    _tmp1 = _mm_add_ps(_tmp1,jx1); \
    _mm_storel_pi((__m64 *)(ptr1),_tmp1); \
    _mm_storeh_pi((__m64 *)(ptr2),_tmp1); \
    _mm_store_ss(ptr1+2,_mm_add_ss(_tmp2,jz1)); \
    _mm_store_ss(ptr2+2,_mm_add_ss(_tmp3,_tmp4)); \
}
/* Add two rvecs per pointer to memory behind two pointers.
 *
 * ptr1, ptr2 : float pointers; ptr[0..5] of each is read, incremented and
 *              written back.
 * jx1 .. jz2 : __m128 lvalues; element 0 is the increment for ptr1 and
 *              element 1 the increment for ptr2. jx1, jz1 and jy2 are used as
 *              scratch and are modified.
 */
#define GMX_MM_INCREMENT_2RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1, _tmp2, _tmp3, _tmp4, _tmp5; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr2); \
    _tmp3 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4)); \
    _tmp3 = _mm_loadh_pi(_tmp3,(__m64 *)(ptr2+4)); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    _tmp4 = _mm_movelh_ps(jx1,jz1); \
    _tmp5 = _mm_movehl_ps(jz1,jx1); \
    _tmp1 = _mm_add_ps(_tmp1,_tmp4); \
    _tmp2 = _mm_add_ps(_tmp2,_tmp5); \
    _tmp3 = _mm_add_ps(_tmp3,jy2); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr2,_tmp2); \
    _mm_storel_pi((__m64 *)(ptr1+4),_tmp3); \
    _mm_storeh_pi((__m64 *)(ptr2+4),_tmp3); \
}
/* Add three rvecs per pointer to memory behind two pointers.
 *
 * ptr1, ptr2 : float pointers; ptr[0..8] of each is read, incremented and
 *              written back.
 * jx1 .. jz3 : __m128 lvalues; element 0 is the increment for ptr1 and
 *              element 1 the increment for ptr2. jx1, jz1, jy2 and jx3 are
 *              used as scratch and are modified.
 */
#define GMX_MM_INCREMENT_3RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, _tmp8, _tmp9, _tmp10, _tmp11; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr1+4); \
    _tmp3 = _mm_load_ss(ptr1+8); \
    _tmp4 = _mm_loadu_ps(ptr2); \
    _tmp5 = _mm_loadu_ps(ptr2+4); \
    _tmp6 = _mm_load_ss(ptr2+8); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    jx3 = _mm_unpacklo_ps(jx3,jy3); \
    _tmp7 = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1)); \
    _tmp8 = _mm_movelh_ps(jx1,jz1); \
    _tmp9 = _mm_movehl_ps(jz1,jx1); \
    _tmp10 = _mm_movelh_ps(jy2,jx3); \
    _tmp11 = _mm_movehl_ps(jx3,jy2); \
    _tmp1 = _mm_add_ps(_tmp1,_tmp8); \
    _tmp2 = _mm_add_ps(_tmp2,_tmp10); \
    _tmp3 = _mm_add_ss(_tmp3,jz3); \
    _tmp4 = _mm_add_ps(_tmp4,_tmp9); \
    _tmp5 = _mm_add_ps(_tmp5,_tmp11); \
    _tmp6 = _mm_add_ss(_tmp6,_tmp7); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr1+4,_tmp2); \
    _mm_store_ss(ptr1+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps(ptr2+4,_tmp5); \
    _mm_store_ss(ptr2+8,_tmp6); \
}
/* Add four rvecs per pointer to memory behind two pointers.
 *
 * ptr1, ptr2 : float pointers; ptr[0..11] of each is read, incremented and
 *              written back.
 * jx1 .. jz4 : __m128 lvalues; element 0 is the increment for ptr1 and
 *              element 1 the increment for ptr2. jx1, jz1, jy2, jx3, jz3 and
 *              jy4 are used as scratch and are modified.
 */
#define GMX_MM_INCREMENT_4RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128 _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp8, _tmp9, _tmp10, _tmp11, _tmp12, _tmp13; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr1+4); \
    _tmp3 = _mm_loadu_ps(ptr1+8); \
    _tmp4 = _mm_loadu_ps(ptr2); \
    _tmp5 = _mm_loadu_ps(ptr2+4); \
    _tmp6 = _mm_loadu_ps(ptr2+8); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    jx3 = _mm_unpacklo_ps(jx3,jy3); \
    jz3 = _mm_unpacklo_ps(jz3,jx4); \
    jy4 = _mm_unpacklo_ps(jy4,jz4); \
    _tmp8 = _mm_movelh_ps(jx1,jz1); \
    _tmp9 = _mm_movehl_ps(jz1,jx1); \
    _tmp10 = _mm_movelh_ps(jy2,jx3); \
    _tmp11 = _mm_movehl_ps(jx3,jy2); \
    _tmp12 = _mm_movelh_ps(jz3,jy4); \
    _tmp13 = _mm_movehl_ps(jy4,jz3); \
    _tmp1 = _mm_add_ps(_tmp1,_tmp8); \
    _tmp2 = _mm_add_ps(_tmp2,_tmp10); \
    _tmp3 = _mm_add_ps(_tmp3,_tmp12); \
    _tmp4 = _mm_add_ps(_tmp4,_tmp9); \
    _tmp5 = _mm_add_ps(_tmp5,_tmp11); \
    _tmp6 = _mm_add_ps(_tmp6,_tmp13); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr1+4,_tmp2); \
    _mm_storeu_ps(ptr1+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps(ptr2+4,_tmp5); \
    _mm_storeu_ps(ptr2+8,_tmp6); \
}
/* Add one rvec per pointer to memory behind three pointers.
 *
 * ptr1..ptr3    : float pointers; ptr[0..2] of each is read, incremented and
 *                 written back (no access beyond the three floats).
 * jx1, jy1, jz1 : __m128 lvalues; element p (p = 0,1,2) is the increment for
 *                 ptr(p+1). jx1 is used as scratch and is modified.
 *
 * Each per-pointer vector is laid out as [x, unused, y, z] so that a scalar
 * store plus a high 64-bit store writes exactly three floats.
 */
#define GMX_MM_INCREMENT_1RVEC_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1) { \
    __m128 _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7; \
    _tmp1 = _mm_load_ss(ptr1); \
    _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr1+1)); \
    _tmp2 = _mm_load_ss(ptr2); \
    _tmp2 = _mm_loadh_pi(_tmp2,(__m64 *)(ptr2+1)); \
    _tmp3 = _mm_load_ss(ptr3); \
    _tmp3 = _mm_loadh_pi(_tmp3,(__m64 *)(ptr3+1)); \
    _tmp4 = _mm_unpacklo_ps(jy1,jz1); \
    _tmp5 = _mm_unpackhi_ps(jy1,jz1); \
    _tmp6 = _mm_shuffle_ps(jx1,_tmp4,_MM_SHUFFLE(3,2,0,1)); \
    _tmp7 = _mm_shuffle_ps(jx1,jx1,_MM_SHUFFLE(0,0,0,2)); \
    jx1 = _mm_movelh_ps(jx1,_tmp4); \
    _tmp7 = _mm_movelh_ps(_tmp7,_tmp5); \
    _tmp1 = _mm_add_ps(_tmp1,jx1); \
    _tmp2 = _mm_add_ps(_tmp2,_tmp6); \
    _tmp3 = _mm_add_ps(_tmp3,_tmp7); \
    _mm_store_ss(ptr1,_tmp1); \
    _mm_storeh_pi((__m64 *)(ptr1+1),_tmp1); \
    _mm_store_ss(ptr2,_tmp2); \
    _mm_storeh_pi((__m64 *)(ptr2+1),_tmp2); \
    _mm_store_ss(ptr3,_tmp3); \
    _mm_storeh_pi((__m64 *)(ptr3+1),_tmp3); \
}
/* Add two rvecs per pointer to memory behind three pointers.
 *
 * ptr1..ptr3 : float pointers; ptr[0..5] of each is read, incremented and
 *              written back.
 * jx1 .. jz2 : __m128 lvalues; element p (p = 0,1,2) is the increment for
 *              ptr(p+1). jx1, jz1 and jy2 are used as scratch and are modified.
 *
 * The 64-bit loads of the trailing y2/z2 pairs start from a zeroed register
 * (as the sibling macros do) rather than from the not-yet-initialised
 * temporaries, so no indeterminate lanes are read or fed into the additions.
 */
#define GMX_MM_INCREMENT_2RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, _tmp8, _tmp9, _tmp10; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr2); \
    _tmp3 = _mm_loadu_ps(ptr3); \
    _tmp4 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4)); \
    _tmp4 = _mm_loadh_pi(_tmp4,(__m64 *)(ptr2+4)); \
    _tmp5 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3+4)); \
    _tmp6 = _mm_unpackhi_ps(jx1,jy1); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    _tmp7 = _mm_unpackhi_ps(jz1,jx2); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    _tmp8 = _mm_unpackhi_ps(jy2,jz2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    _tmp9 = _mm_movelh_ps(jx1,jz1); \
    _tmp10 = _mm_movehl_ps(jz1,jx1); \
    _tmp6 = _mm_movelh_ps(_tmp6,_tmp7); \
    _tmp1 = _mm_add_ps(_tmp1,_tmp9); \
    _tmp2 = _mm_add_ps(_tmp2,_tmp10); \
    _tmp3 = _mm_add_ps(_tmp3,_tmp6); \
    _tmp4 = _mm_add_ps(_tmp4,jy2); \
    _tmp5 = _mm_add_ps(_tmp5,_tmp8); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr2,_tmp2); \
    _mm_storeu_ps(ptr3,_tmp3); \
    _mm_storel_pi((__m64 *)(ptr1+4),_tmp4); \
    _mm_storeh_pi((__m64 *)(ptr2+4),_tmp4); \
    _mm_storel_pi((__m64 *)(ptr3+4),_tmp5); \
}
/* Add three rvecs per pointer to memory behind three pointers.
 *
 * ptr1..ptr3 : float pointers; ptr[0..8] of each is read, incremented and
 *              written back.
 * jx1 .. jz3 : __m128 lvalues; element p (p = 0,1,2) is the increment for
 *              ptr(p+1). jx1, jz1, jy2 and jx3 are used as scratch and are
 *              modified.
 */
#define GMX_MM_INCREMENT_3RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, _tmp8, _tmp9, _tmp10; \
    __m128 _tmp11, _tmp12, _tmp13, _tmp14, _tmp15, _tmp16, _tmp17, _tmp18, _tmp19; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr1+4); \
    _tmp3 = _mm_load_ss(ptr1+8); \
    _tmp4 = _mm_loadu_ps(ptr2); \
    _tmp5 = _mm_loadu_ps(ptr2+4); \
    _tmp6 = _mm_load_ss(ptr2+8); \
    _tmp7 = _mm_loadu_ps(ptr3); \
    _tmp8 = _mm_loadu_ps(ptr3+4); \
    _tmp9 = _mm_load_ss(ptr3+8); \
    _tmp10 = _mm_unpackhi_ps(jx1,jy1); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    _tmp11 = _mm_unpackhi_ps(jz1,jx2); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    _tmp12 = _mm_unpackhi_ps(jy2,jz2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    _tmp13 = _mm_unpackhi_ps(jx3,jy3); \
    jx3 = _mm_unpacklo_ps(jx3,jy3); \
    _tmp14 = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1)); \
    _tmp15 = _mm_movehl_ps(jz3,jz3); \
    _tmp16 = _mm_movelh_ps(jx1,jz1); \
    _tmp17 = _mm_movehl_ps(jz1,jx1); \
    _tmp10 = _mm_movelh_ps(_tmp10,_tmp11); \
    _tmp18 = _mm_movelh_ps(jy2,jx3); \
    _tmp19 = _mm_movehl_ps(jx3,jy2); \
    _tmp12 = _mm_movelh_ps(_tmp12,_tmp13); \
    _tmp1 = _mm_add_ps(_tmp1,_tmp16); \
    _tmp2 = _mm_add_ps(_tmp2,_tmp18); \
    _tmp3 = _mm_add_ss(_tmp3,jz3); \
    _tmp4 = _mm_add_ps(_tmp4,_tmp17); \
    _tmp5 = _mm_add_ps(_tmp5,_tmp19); \
    _tmp6 = _mm_add_ss(_tmp6,_tmp14); \
    _tmp7 = _mm_add_ps(_tmp7,_tmp10); \
    _tmp8 = _mm_add_ps(_tmp8,_tmp12); \
    _tmp9 = _mm_add_ss(_tmp9,_tmp15); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr1+4,_tmp2); \
    _mm_store_ss(ptr1+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps(ptr2+4,_tmp5); \
    _mm_store_ss(ptr2+8,_tmp6); \
    _mm_storeu_ps(ptr3,_tmp7); \
    _mm_storeu_ps(ptr3+4,_tmp8); \
    _mm_store_ss(ptr3+8,_tmp9); \
}
/* Add four rvecs per pointer to memory behind three pointers.
 *
 * ptr1..ptr3 : float pointers; ptr[0..11] of each is read, incremented and
 *              written back.
 * jx1 .. jz4 : __m128 lvalues; element p (p = 0,1,2) is the increment for
 *              ptr(p+1). jx1, jz1, jy2, jx3, jz3 and jy4 are used as scratch
 *              and are modified.
 */
#define GMX_MM_INCREMENT_4RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128 _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, _tmp8, _tmp9, _tmp10, _tmp11; \
    __m128 _tmp12, _tmp13, _tmp14, _tmp15, _tmp16, _tmp17, _tmp18, _tmp19, _tmp20, _tmp21; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr1+4); \
    _tmp3 = _mm_loadu_ps(ptr1+8); \
    _tmp4 = _mm_loadu_ps(ptr2); \
    _tmp5 = _mm_loadu_ps(ptr2+4); \
    _tmp6 = _mm_loadu_ps(ptr2+8); \
    _tmp7 = _mm_loadu_ps(ptr3); \
    _tmp8 = _mm_loadu_ps(ptr3+4); \
    _tmp9 = _mm_loadu_ps(ptr3+8); \
    _tmp10 = _mm_unpackhi_ps(jx1,jy1); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    _tmp11 = _mm_unpackhi_ps(jz1,jx2); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    _tmp12 = _mm_unpackhi_ps(jy2,jz2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    _tmp13 = _mm_unpackhi_ps(jx3,jy3); \
    jx3 = _mm_unpacklo_ps(jx3,jy3); \
    _tmp14 = _mm_unpackhi_ps(jz3,jx4); \
    jz3 = _mm_unpacklo_ps(jz3,jx4); \
    _tmp15 = _mm_unpackhi_ps(jy4,jz4); \
    jy4 = _mm_unpacklo_ps(jy4,jz4); \
    _tmp16 = _mm_movelh_ps(jx1,jz1); \
    _tmp17 = _mm_movehl_ps(jz1,jx1); \
    _tmp10 = _mm_movelh_ps(_tmp10,_tmp11); \
    _tmp18 = _mm_movelh_ps(jy2,jx3); \
    _tmp19 = _mm_movehl_ps(jx3,jy2); \
    _tmp12 = _mm_movelh_ps(_tmp12,_tmp13); \
    _tmp20 = _mm_movelh_ps(jz3,jy4); \
    _tmp21 = _mm_movehl_ps(jy4,jz3); \
    _tmp14 = _mm_movelh_ps(_tmp14,_tmp15); \
    _tmp1 = _mm_add_ps(_tmp1,_tmp16); \
    _tmp2 = _mm_add_ps(_tmp2,_tmp18); \
    _tmp3 = _mm_add_ps(_tmp3,_tmp20); \
    _tmp4 = _mm_add_ps(_tmp4,_tmp17); \
    _tmp5 = _mm_add_ps(_tmp5,_tmp19); \
    _tmp6 = _mm_add_ps(_tmp6,_tmp21); \
    _tmp7 = _mm_add_ps(_tmp7,_tmp10); \
    _tmp8 = _mm_add_ps(_tmp8,_tmp12); \
    _tmp9 = _mm_add_ps(_tmp9,_tmp14); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr1+4,_tmp2); \
    _mm_storeu_ps(ptr1+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps(ptr2+4,_tmp5); \
    _mm_storeu_ps(ptr2+8,_tmp6); \
    _mm_storeu_ps(ptr3,_tmp7); \
    _mm_storeu_ps(ptr3+4,_tmp8); \
    _mm_storeu_ps(ptr3+8,_tmp9); \
}
/* Add one rvec per pointer to memory behind four pointers.
 *
 * ptr1..ptr4    : float pointers; ptr[0..2] of each is read, incremented and
 *                 written back (no access beyond the three floats).
 * jx1, jy1, jz1 : __m128 values; element p (p = 0..3) is the increment for
 *                 ptr(p+1). They are only read, not modified.
 *
 * Each per-pointer vector is laid out as [x, unused, y, z] so that a scalar
 * store plus a high 64-bit store writes exactly three floats.
 */
#define GMX_MM_INCREMENT_1RVEC_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1) { \
    __m128 _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, _tmp8, _tmp9, _tmp10; \
    _tmp1 = _mm_load_ss(ptr1); \
    _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr1+1)); \
    _tmp2 = _mm_load_ss(ptr2); \
    _tmp2 = _mm_loadh_pi(_tmp2,(__m64 *)(ptr2+1)); \
    _tmp3 = _mm_load_ss(ptr3); \
    _tmp3 = _mm_loadh_pi(_tmp3,(__m64 *)(ptr3+1)); \
    _tmp4 = _mm_load_ss(ptr4); \
    _tmp4 = _mm_loadh_pi(_tmp4,(__m64 *)(ptr4+1)); \
    _tmp5 = _mm_unpacklo_ps(jy1,jz1); \
    _tmp6 = _mm_unpackhi_ps(jy1,jz1); \
    _tmp7 = _mm_shuffle_ps(jx1,_tmp5,_MM_SHUFFLE(1,0,0,0)); \
    _tmp8 = _mm_shuffle_ps(jx1,_tmp5,_MM_SHUFFLE(3,2,0,1)); \
    _tmp9 = _mm_shuffle_ps(jx1,_tmp6,_MM_SHUFFLE(1,0,0,2)); \
    _tmp10 = _mm_shuffle_ps(jx1,_tmp6,_MM_SHUFFLE(3,2,0,3)); \
    _tmp1 = _mm_add_ps(_tmp1,_tmp7); \
    _tmp2 = _mm_add_ps(_tmp2,_tmp8); \
    _tmp3 = _mm_add_ps(_tmp3,_tmp9); \
    _tmp4 = _mm_add_ps(_tmp4,_tmp10); \
    _mm_store_ss(ptr1,_tmp1); \
    _mm_storeh_pi((__m64 *)(ptr1+1),_tmp1); \
    _mm_store_ss(ptr2,_tmp2); \
    _mm_storeh_pi((__m64 *)(ptr2+1),_tmp2); \
    _mm_store_ss(ptr3,_tmp3); \
    _mm_storeh_pi((__m64 *)(ptr3+1),_tmp3); \
    _mm_store_ss(ptr4,_tmp4); \
    _mm_storeh_pi((__m64 *)(ptr4+1),_tmp4); \
}
/* Add two rvecs per pointer to memory behind four pointers.
 *
 * ptr1..ptr4 : float pointers; ptr[0..5] of each is read, incremented and
 *              written back.
 * jx1 .. jz2 : __m128 lvalues; element p (p = 0..3) is the increment for
 *              ptr(p+1). jx1, jz1 and jy2 are used as scratch and are modified.
 *
 * The trailing y2/z2 pairs of two pointers share one register (low half /
 * high half) so they can be updated with a single addition.
 */
#define GMX_MM_INCREMENT_2RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, _tmp8, _tmp9, _tmp10, _tmp11, _tmp12, _tmp13; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr2); \
    _tmp3 = _mm_loadu_ps(ptr3); \
    _tmp4 = _mm_loadu_ps(ptr4); \
    _tmp5 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4)); \
    _tmp5 = _mm_loadh_pi(_tmp5,(__m64 *)(ptr2+4)); \
    _tmp6 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3+4)); \
    _tmp6 = _mm_loadh_pi(_tmp6,(__m64 *)(ptr4+4)); \
    _tmp7 = _mm_unpackhi_ps(jx1,jy1); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    _tmp8 = _mm_unpackhi_ps(jz1,jx2); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    _tmp9 = _mm_unpackhi_ps(jy2,jz2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    _tmp10 = _mm_movelh_ps(jx1,jz1); \
    _tmp11 = _mm_movehl_ps(jz1,jx1); \
    _tmp12 = _mm_movelh_ps(_tmp7,_tmp8); \
    _tmp13 = _mm_movehl_ps(_tmp8,_tmp7); \
    _tmp1 = _mm_add_ps(_tmp1,_tmp10); \
    _tmp2 = _mm_add_ps(_tmp2,_tmp11); \
    _tmp3 = _mm_add_ps(_tmp3,_tmp12); \
    _tmp4 = _mm_add_ps(_tmp4,_tmp13); \
    _tmp5 = _mm_add_ps(_tmp5,jy2); \
    _tmp6 = _mm_add_ps(_tmp6,_tmp9); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr2,_tmp2); \
    _mm_storeu_ps(ptr3,_tmp3); \
    _mm_storeu_ps(ptr4,_tmp4); \
    _mm_storel_pi((__m64 *)(ptr1+4),_tmp5); \
    _mm_storeh_pi((__m64 *)(ptr2+4),_tmp5); \
    _mm_storel_pi((__m64 *)(ptr3+4),_tmp6); \
    _mm_storeh_pi((__m64 *)(ptr4+4),_tmp6); \
}
/* Add three rvecs per pointer to memory behind four pointers.
 *
 * ptr1..ptr4 : float pointers; ptr[0..8] of each is read, incremented and
 *              written back.
 * jx1 .. jz3 : __m128 lvalues; element p (p = 0..3) is the increment for
 *              ptr(p+1). jx1, jz1, jy2 and jx3 are used as scratch and are
 *              modified.
 *
 * The ninth float (z3) of each pointer is handled with scalar add/store after
 * moving the relevant element of jz3 into element 0.
 */
#define GMX_MM_INCREMENT_3RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, _tmp8, _tmp9, _tmp10; \
    __m128 _tmp11, _tmp12, _tmp13, _tmp14, _tmp15, _tmp16, _tmp17, _tmp18, _tmp19; \
    __m128 _tmp20, _tmp21, _tmp22, _tmp23, _tmp24, _tmp25; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr1+4); \
    _tmp3 = _mm_load_ss(ptr1+8); \
    _tmp4 = _mm_loadu_ps(ptr2); \
    _tmp5 = _mm_loadu_ps(ptr2+4); \
    _tmp6 = _mm_load_ss(ptr2+8); \
    _tmp7 = _mm_loadu_ps(ptr3); \
    _tmp8 = _mm_loadu_ps(ptr3+4); \
    _tmp9 = _mm_load_ss(ptr3+8); \
    _tmp10 = _mm_loadu_ps(ptr4); \
    _tmp11 = _mm_loadu_ps(ptr4+4); \
    _tmp12 = _mm_load_ss(ptr4+8); \
    _tmp13 = _mm_unpackhi_ps(jx1,jy1); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    _tmp14 = _mm_unpackhi_ps(jz1,jx2); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    _tmp15 = _mm_unpackhi_ps(jy2,jz2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    _tmp16 = _mm_unpackhi_ps(jx3,jy3); \
    jx3 = _mm_unpacklo_ps(jx3,jy3); \
    _tmp17 = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1)); \
    _tmp18 = _mm_movehl_ps(jz3,jz3); \
    _tmp19 = _mm_shuffle_ps(_tmp18,_tmp18,_MM_SHUFFLE(0,0,0,1)); \
    _tmp20 = _mm_movelh_ps(jx1,jz1); \
    _tmp21 = _mm_movehl_ps(jz1,jx1); \
    _tmp22 = _mm_movelh_ps(_tmp13,_tmp14); \
    _tmp14 = _mm_movehl_ps(_tmp14,_tmp13); \
    _tmp23 = _mm_movelh_ps(jy2,jx3); \
    _tmp24 = _mm_movehl_ps(jx3,jy2); \
    _tmp25 = _mm_movelh_ps(_tmp15,_tmp16); \
    _tmp16 = _mm_movehl_ps(_tmp16,_tmp15); \
    _tmp1 = _mm_add_ps(_tmp1,_tmp20); \
    _tmp2 = _mm_add_ps(_tmp2,_tmp23); \
    _tmp3 = _mm_add_ss(_tmp3,jz3); \
    _tmp4 = _mm_add_ps(_tmp4,_tmp21); \
    _tmp5 = _mm_add_ps(_tmp5,_tmp24); \
    _tmp6 = _mm_add_ss(_tmp6,_tmp17); \
    _tmp7 = _mm_add_ps(_tmp7,_tmp22); \
    _tmp8 = _mm_add_ps(_tmp8,_tmp25); \
    _tmp9 = _mm_add_ss(_tmp9,_tmp18); \
    _tmp10 = _mm_add_ps(_tmp10,_tmp14); \
    _tmp11 = _mm_add_ps(_tmp11,_tmp16); \
    _tmp12 = _mm_add_ss(_tmp12,_tmp19); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr1+4,_tmp2); \
    _mm_store_ss(ptr1+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps(ptr2+4,_tmp5); \
    _mm_store_ss(ptr2+8,_tmp6); \
    _mm_storeu_ps(ptr3,_tmp7); \
    _mm_storeu_ps(ptr3+4,_tmp8); \
    _mm_store_ss(ptr3+8,_tmp9); \
    _mm_storeu_ps(ptr4,_tmp10); \
    _mm_storeu_ps(ptr4+4,_tmp11); \
    _mm_store_ss(ptr4+8,_tmp12); \
}
/* Add four rvecs per pointer to memory behind four pointers.
 *
 * ptr1..ptr4 : float pointers; ptr[0..11] of each is read, incremented and
 *              written back.
 * jx1 .. jz4 : __m128 lvalues; element p (p = 0..3) is the increment for
 *              ptr(p+1). jx1, jz1, jy2, jx3, jz3 and jy4 are used as scratch
 *              and are modified (jz1, jx3 and jy4 end up holding the packed
 *              increments for ptr2).
 */
#define GMX_MM_INCREMENT_4RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128 _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, _tmp8, _tmp9, _tmp10, _tmp11; \
    __m128 _tmp12, _tmp13, _tmp14, _tmp15, _tmp16, _tmp17, _tmp18, _tmp19, _tmp20, _tmp21, _tmp22; \
    __m128 _tmp23, _tmp24; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr1+4); \
    _tmp3 = _mm_loadu_ps(ptr1+8); \
    _tmp4 = _mm_loadu_ps(ptr2); \
    _tmp5 = _mm_loadu_ps(ptr2+4); \
    _tmp6 = _mm_loadu_ps(ptr2+8); \
    _tmp7 = _mm_loadu_ps(ptr3); \
    _tmp8 = _mm_loadu_ps(ptr3+4); \
    _tmp9 = _mm_loadu_ps(ptr3+8); \
    _tmp10 = _mm_loadu_ps(ptr4); \
    _tmp11 = _mm_loadu_ps(ptr4+4); \
    _tmp12 = _mm_loadu_ps(ptr4+8); \
    _tmp13 = _mm_unpackhi_ps(jx1,jy1); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    _tmp14 = _mm_unpackhi_ps(jz1,jx2); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    _tmp15 = _mm_unpackhi_ps(jy2,jz2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    _tmp16 = _mm_unpackhi_ps(jx3,jy3); \
    jx3 = _mm_unpacklo_ps(jx3,jy3); \
    _tmp17 = _mm_unpackhi_ps(jz3,jx4); \
    jz3 = _mm_unpacklo_ps(jz3,jx4); \
    _tmp18 = _mm_unpackhi_ps(jy4,jz4); \
    jy4 = _mm_unpacklo_ps(jy4,jz4); \
    _tmp19 = _mm_movelh_ps(jx1,jz1); \
    jz1 = _mm_movehl_ps(jz1,jx1); \
    _tmp20 = _mm_movelh_ps(_tmp13,_tmp14); \
    _tmp14 = _mm_movehl_ps(_tmp14,_tmp13); \
    _tmp21 = _mm_movelh_ps(jy2,jx3); \
    jx3 = _mm_movehl_ps(jx3,jy2); \
    _tmp22 = _mm_movelh_ps(_tmp15,_tmp16); \
    _tmp16 = _mm_movehl_ps(_tmp16,_tmp15); \
    _tmp23 = _mm_movelh_ps(jz3,jy4); \
    jy4 = _mm_movehl_ps(jy4,jz3); \
    _tmp24 = _mm_movelh_ps(_tmp17,_tmp18); \
    _tmp18 = _mm_movehl_ps(_tmp18,_tmp17); \
    _tmp1 = _mm_add_ps(_tmp1,_tmp19); \
    _tmp2 = _mm_add_ps(_tmp2,_tmp21); \
    _tmp3 = _mm_add_ps(_tmp3,_tmp23); \
    _tmp4 = _mm_add_ps(_tmp4,jz1); \
    _tmp5 = _mm_add_ps(_tmp5,jx3); \
    _tmp6 = _mm_add_ps(_tmp6,jy4); \
    _tmp7 = _mm_add_ps(_tmp7,_tmp20); \
    _tmp8 = _mm_add_ps(_tmp8,_tmp22); \
    _tmp9 = _mm_add_ps(_tmp9,_tmp24); \
    _tmp10 = _mm_add_ps(_tmp10,_tmp14); \
    _tmp11 = _mm_add_ps(_tmp11,_tmp16); \
    _tmp12 = _mm_add_ps(_tmp12,_tmp18); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr1+4,_tmp2); \
    _mm_storeu_ps(ptr1+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps(ptr2+4,_tmp5); \
    _mm_storeu_ps(ptr2+8,_tmp6); \
    _mm_storeu_ps(ptr3,_tmp7); \
    _mm_storeu_ps(ptr3+4,_tmp8); \
    _mm_storeu_ps(ptr3+8,_tmp9); \
    _mm_storeu_ps(ptr4,_tmp10); \
    _mm_storeu_ps(ptr4+4,_tmp11); \
    _mm_storeu_ps(ptr4+8,_tmp12); \
}
/* Subtract one rvec held in element 0 of three SSE registers from memory.
 *
 * ptr1          : float pointer; ptr1[0..2] is read, decremented and written
 *                 back (no access beyond the three floats).
 * jx1, jy1, jz1 : __m128 lvalues; element 0 of each is the x/y/z amount to
 *                 subtract. jx1 and jy1 are used as scratch and are modified.
 */
#define GMX_MM_DECREMENT_1RVEC_1POINTER_PS(ptr1,jx1,jy1,jz1) { \
    __m128 _tmp1; \
    jy1 = _mm_unpacklo_ps(jy1,jz1); \
    jx1 = _mm_movelh_ps(jx1,jy1); \
    _tmp1 = _mm_load_ss(ptr1); \
    _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr1+1)); \
    _tmp1 = _mm_sub_ps(_tmp1,jx1); \
    _mm_store_ss(ptr1,_tmp1); \
    _mm_storeh_pi((__m64 *)(ptr1+1),_tmp1); \
}
/* Subtract two rvecs held in element 0 of six SSE registers from memory.
 *
 * ptr1       : float pointer; ptr1[0..5] is read, decremented and written back.
 * jx1 .. jz2 : __m128 lvalues; element 0 of each is the amount to subtract
 *              from the matching coordinate. jx1, jz1 and jy2 are used as
 *              scratch and are modified.
 */
#define GMX_MM_DECREMENT_2RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1, _tmp2; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4)); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    jx1 = _mm_movelh_ps(jx1,jz1); \
    _tmp1 = _mm_sub_ps(_tmp1,jx1); \
    _tmp2 = _mm_sub_ps(_tmp2,jy2); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storel_pi((__m64 *)(ptr1+4),_tmp2); \
}
/* Subtract three rvecs held in element 0 of nine SSE registers from memory.
 *
 * ptr1       : float pointer; ptr1[0..8] is read, decremented and written back.
 * jx1 .. jz3 : __m128 lvalues; element 0 of each is the amount to subtract
 *              from the matching coordinate. jx1, jz1, jy2 and jx3 are used
 *              as scratch and are modified.
 */
#define GMX_MM_DECREMENT_3RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1, _tmp2, _tmp3; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr1+4); \
    _tmp3 = _mm_load_ss(ptr1+8); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    jx3 = _mm_unpacklo_ps(jx3,jy3); \
    jx1 = _mm_movelh_ps(jx1,jz1); \
    jy2 = _mm_movelh_ps(jy2,jx3); \
    _tmp1 = _mm_sub_ps(_tmp1,jx1); \
    _tmp2 = _mm_sub_ps(_tmp2,jy2); \
    _tmp3 = _mm_sub_ss(_tmp3,jz3); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr1+4,_tmp2); \
    _mm_store_ss(ptr1+8,_tmp3); \
}
/* Subtract four rvecs held in element 0 of twelve SSE registers from memory.
 *
 * ptr1       : float pointer; ptr1[0..11] is read, decremented and written back.
 * jx1 .. jz4 : __m128 lvalues; element 0 of each is the amount to subtract
 *              from the matching coordinate. jx1, jz1, jy2, jx3, jz3 and jy4
 *              are used as scratch and are modified.
 */
#define GMX_MM_DECREMENT_4RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128 _tmp1, _tmp2, _tmp3; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr1+4); \
    _tmp3 = _mm_loadu_ps(ptr1+8); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    jx3 = _mm_unpacklo_ps(jx3,jy3); \
    jz3 = _mm_unpacklo_ps(jz3,jx4); \
    jy4 = _mm_unpacklo_ps(jy4,jz4); \
    jx1 = _mm_movelh_ps(jx1,jz1); \
    jy2 = _mm_movelh_ps(jy2,jx3); \
    jz3 = _mm_movelh_ps(jz3,jy4); \
    _tmp1 = _mm_sub_ps(_tmp1,jx1); \
    _tmp2 = _mm_sub_ps(_tmp2,jy2); \
    _tmp3 = _mm_sub_ps(_tmp3,jz3); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr1+4,_tmp2); \
    _mm_storeu_ps(ptr1+8,_tmp3); \
}
/* Subtract one rvec per pointer from memory behind two pointers.
 *
 * ptr1, ptr2    : float pointers; ptr[0..2] of each is read, decremented and
 *                 written back (no access beyond the three floats).
 * jx1, jy1, jz1 : __m128 lvalues; element 0 is the amount for ptr1 and
 *                 element 1 the amount for ptr2. jx1 is used as scratch and
 *                 is modified.
 */
#define GMX_MM_DECREMENT_1RVEC_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1) { \
    __m128 _tmp1, _tmp2, _tmp3, _tmp4; \
    _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1)); \
    _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr2)); \
    _tmp2 = _mm_load_ss(ptr1+2); \
    _tmp3 = _mm_load_ss(ptr2+2); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    _tmp4 = _mm_shuffle_ps(jz1,jz1,_MM_SHUFFLE(0,0,0,1)); \
    _tmp1 = _mm_sub_ps(_tmp1,jx1); \
    _mm_storel_pi((__m64 *)(ptr1),_tmp1); \
    _mm_storeh_pi((__m64 *)(ptr2),_tmp1); \
    _mm_store_ss(ptr1+2,_mm_sub_ss(_tmp2,jz1)); \
    _mm_store_ss(ptr2+2,_mm_sub_ss(_tmp3,_tmp4)); \
}
/* Subtract two rvecs per pointer from memory behind two pointers.
 *
 * ptr1, ptr2 : float pointers; ptr[0..5] of each is read, decremented and
 *              written back.
 * jx1 .. jz2 : __m128 lvalues; element 0 is the amount for ptr1 and element 1
 *              the amount for ptr2. jx1, jz1 and jy2 are used as scratch and
 *              are modified.
 */
#define GMX_MM_DECREMENT_2RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1, _tmp2, _tmp3, _tmp4, _tmp5; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr2); \
    _tmp3 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4)); \
    _tmp3 = _mm_loadh_pi(_tmp3,(__m64 *)(ptr2+4)); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    _tmp4 = _mm_movelh_ps(jx1,jz1); \
    _tmp5 = _mm_movehl_ps(jz1,jx1); \
    _tmp1 = _mm_sub_ps(_tmp1,_tmp4); \
    _tmp2 = _mm_sub_ps(_tmp2,_tmp5); \
    _tmp3 = _mm_sub_ps(_tmp3,jy2); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr2,_tmp2); \
    _mm_storel_pi((__m64 *)(ptr1+4),_tmp3); \
    _mm_storeh_pi((__m64 *)(ptr2+4),_tmp3); \
}
/* Subtract three rvecs per pointer from memory behind two pointers.
 *
 * ptr1, ptr2 : float pointers; ptr[0..8] of each is read, decremented and
 *              written back.
 * jx1 .. jz3 : __m128 lvalues; element 0 is the amount for ptr1 and element 1
 *              the amount for ptr2. jx1, jz1, jy2 and jx3 are used as scratch
 *              and are modified.
 */
#define GMX_MM_DECREMENT_3RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, _tmp8, _tmp9, _tmp10, _tmp11; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr1+4); \
    _tmp3 = _mm_load_ss(ptr1+8); \
    _tmp4 = _mm_loadu_ps(ptr2); \
    _tmp5 = _mm_loadu_ps(ptr2+4); \
    _tmp6 = _mm_load_ss(ptr2+8); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    jx3 = _mm_unpacklo_ps(jx3,jy3); \
    _tmp7 = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1)); \
    _tmp8 = _mm_movelh_ps(jx1,jz1); \
    _tmp9 = _mm_movehl_ps(jz1,jx1); \
    _tmp10 = _mm_movelh_ps(jy2,jx3); \
    _tmp11 = _mm_movehl_ps(jx3,jy2); \
    _tmp1 = _mm_sub_ps(_tmp1,_tmp8); \
    _tmp2 = _mm_sub_ps(_tmp2,_tmp10); \
    _tmp3 = _mm_sub_ss(_tmp3,jz3); \
    _tmp4 = _mm_sub_ps(_tmp4,_tmp9); \
    _tmp5 = _mm_sub_ps(_tmp5,_tmp11); \
    _tmp6 = _mm_sub_ss(_tmp6,_tmp7); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr1+4,_tmp2); \
    _mm_store_ss(ptr1+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps(ptr2+4,_tmp5); \
    _mm_store_ss(ptr2+8,_tmp6); \
}
/* Subtract four rvecs per pointer from memory behind two pointers.
 *
 * ptr1, ptr2 : float pointers; ptr[0..11] of each is read, decremented and
 *              written back.
 * jx1 .. jz4 : __m128 lvalues; element 0 is the amount for ptr1 and element 1
 *              the amount for ptr2. jx1, jz1, jy2, jx3, jz3 and jy4 are used
 *              as scratch and are modified.
 */
#define GMX_MM_DECREMENT_4RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128 _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp8, _tmp9, _tmp10, _tmp11, _tmp12, _tmp13; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr1+4); \
    _tmp3 = _mm_loadu_ps(ptr1+8); \
    _tmp4 = _mm_loadu_ps(ptr2); \
    _tmp5 = _mm_loadu_ps(ptr2+4); \
    _tmp6 = _mm_loadu_ps(ptr2+8); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    jx3 = _mm_unpacklo_ps(jx3,jy3); \
    jz3 = _mm_unpacklo_ps(jz3,jx4); \
    jy4 = _mm_unpacklo_ps(jy4,jz4); \
    _tmp8 = _mm_movelh_ps(jx1,jz1); \
    _tmp9 = _mm_movehl_ps(jz1,jx1); \
    _tmp10 = _mm_movelh_ps(jy2,jx3); \
    _tmp11 = _mm_movehl_ps(jx3,jy2); \
    _tmp12 = _mm_movelh_ps(jz3,jy4); \
    _tmp13 = _mm_movehl_ps(jy4,jz3); \
    _tmp1 = _mm_sub_ps(_tmp1,_tmp8); \
    _tmp2 = _mm_sub_ps(_tmp2,_tmp10); \
    _tmp3 = _mm_sub_ps(_tmp3,_tmp12); \
    _tmp4 = _mm_sub_ps(_tmp4,_tmp9); \
    _tmp5 = _mm_sub_ps(_tmp5,_tmp11); \
    _tmp6 = _mm_sub_ps(_tmp6,_tmp13); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr1+4,_tmp2); \
    _mm_storeu_ps(ptr1+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps(ptr2+4,_tmp5); \
    _mm_storeu_ps(ptr2+8,_tmp6); \
}
/* Decrement one rvec (three consecutive floats) in memory for three pointers.
 *
 * Lane k (k = 0,1,2) of jx1/jy1/jz1 belongs to ptr1/ptr2/ptr3; lane 3 is
 * ignored. For each pointer, ptr[0..2] is decremented by (jx1,jy1,jz1) of
 * that pointer's lane. Each rvec is handled as one scalar (x) plus one
 * 64-bit pair (y,z), so exactly three floats per pointer are touched.
 *
 * Side effect: jx1 is overwritten.
 */
#define GMX_MM_DECREMENT_1RVEC_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1) \
{ \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7; \
    /* Memory layout in each register: (x 0 y z) */ \
    _tmp1 = _mm_load_ss(ptr1); \
    _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)((ptr1)+1)); \
    _tmp2 = _mm_load_ss(ptr2); \
    _tmp2 = _mm_loadh_pi(_tmp2,(__m64 *)((ptr2)+1)); \
    _tmp3 = _mm_load_ss(ptr3); \
    _tmp3 = _mm_loadh_pi(_tmp3,(__m64 *)((ptr3)+1)); \
    _tmp4 = _mm_unpacklo_ps(jy1,jz1);   /* y0 z0 y1 z1 */ \
    _tmp5 = _mm_unpackhi_ps(jy1,jz1);   /* y2 z2 y3 z3 */ \
    /* Build (x - y z) per pointer; lane 1 is a don't-care, never stored */ \
    _tmp6 = _mm_shuffle_ps(jx1,_tmp4,_MM_SHUFFLE(3,2,0,1)); \
    _tmp7 = _mm_shuffle_ps(jx1,jx1,_MM_SHUFFLE(0,0,0,2)); \
    jx1   = _mm_movelh_ps(jx1,_tmp4); \
    _tmp7 = _mm_movelh_ps(_tmp7,_tmp5); \
    _tmp1 = _mm_sub_ps(_tmp1,jx1); \
    _tmp2 = _mm_sub_ps(_tmp2,_tmp6); \
    _tmp3 = _mm_sub_ps(_tmp3,_tmp7); \
    _mm_store_ss(ptr1,_tmp1); \
    _mm_storeh_pi((__m64 *)((ptr1)+1),_tmp1); \
    _mm_store_ss(ptr2,_tmp2); \
    _mm_storeh_pi((__m64 *)((ptr2)+1),_tmp2); \
    _mm_store_ss(ptr3,_tmp3); \
    _mm_storeh_pi((__m64 *)((ptr3)+1),_tmp3); \
}
/* Decrement two rvecs (six consecutive floats) in memory for three pointers.
 *
 * Lane k (k = 0,1,2) of every j register belongs to ptr1/ptr2/ptr3; lane 3
 * is ignored. For each pointer, ptr[0..5] is decremented by
 * (jx1,jy1,jz1,jx2,jy2,jz2) of that pointer's lane. Each pointer is handled
 * as one unaligned 4-float access plus one 64-bit pair, so exactly six
 * floats per pointer are touched.
 *
 * Side effect: jx1, jz1 and jy2 are overwritten with interleaved data.
 */
#define GMX_MM_DECREMENT_2RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2) \
{ \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
    _tmp1  = _mm_loadu_ps(ptr1); \
    _tmp2  = _mm_loadu_ps(ptr2); \
    _tmp3  = _mm_loadu_ps(ptr3); \
    /* Start the pair loads from a zeroed register so that every lane is */ \
    /* defined, matching the four-pointer variant of this macro.         */ \
    _tmp4  = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr1)+4)); \
    _tmp4  = _mm_loadh_pi(_tmp4,(__m64 *)((ptr2)+4)); \
    _tmp5  = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr3)+4)); \
    /* Interleave neighbouring components; the unpackhi results carry */ \
    /* the lane-2 (ptr3) data.                                        */ \
    _tmp6  = _mm_unpackhi_ps(jx1,jy1); \
    jx1    = _mm_unpacklo_ps(jx1,jy1); \
    _tmp7  = _mm_unpackhi_ps(jz1,jx2); \
    jz1    = _mm_unpacklo_ps(jz1,jx2); \
    _tmp8  = _mm_unpackhi_ps(jy2,jz2); \
    jy2    = _mm_unpacklo_ps(jy2,jz2); \
    _tmp9  = _mm_movelh_ps(jx1,jz1);      /* ptr1: x1 y1 z1 x2 */ \
    _tmp10 = _mm_movehl_ps(jz1,jx1);      /* ptr2: x1 y1 z1 x2 */ \
    _tmp6  = _mm_movelh_ps(_tmp6,_tmp7);  /* ptr3: x1 y1 z1 x2 */ \
    _tmp1  = _mm_sub_ps(_tmp1,_tmp9); \
    _tmp2  = _mm_sub_ps(_tmp2,_tmp10); \
    _tmp3  = _mm_sub_ps(_tmp3,_tmp6); \
    _tmp4  = _mm_sub_ps(_tmp4,jy2);       /* (y2 z2) of ptr1 | ptr2 */ \
    _tmp5  = _mm_sub_ps(_tmp5,_tmp8);     /* (y2 z2) of ptr3 in the low half */ \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr2,_tmp2); \
    _mm_storeu_ps(ptr3,_tmp3); \
    _mm_storel_pi((__m64 *)((ptr1)+4),_tmp4); \
    _mm_storeh_pi((__m64 *)((ptr2)+4),_tmp4); \
    _mm_storel_pi((__m64 *)((ptr3)+4),_tmp5); \
}
/* Decrement three rvecs (nine consecutive floats) in memory for three pointers.
 *
 * Lane k (k = 0,1,2) of every j register belongs to ptr1/ptr2/ptr3; lane 3
 * is ignored. For each pointer, ptr[0..8] is decremented by
 * (jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) of that pointer's lane.
 * Only unaligned and scalar loads/stores are used, and exactly nine floats
 * per pointer are touched.
 *
 * Side effect: jx1, jz1, jy2 and jx3 are overwritten with interleaved data.
 */
#define GMX_MM_DECREMENT_3RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) \
{ \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
    __m128 _tmp11,_tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19; \
    /* Current memory contents: 4 + 4 + 1 floats per pointer */ \
    _tmp1  = _mm_loadu_ps(ptr1); \
    _tmp2  = _mm_loadu_ps((ptr1)+4); \
    _tmp3  = _mm_load_ss((ptr1)+8); \
    _tmp4  = _mm_loadu_ps(ptr2); \
    _tmp5  = _mm_loadu_ps((ptr2)+4); \
    _tmp6  = _mm_load_ss((ptr2)+8); \
    _tmp7  = _mm_loadu_ps(ptr3); \
    _tmp8  = _mm_loadu_ps((ptr3)+4); \
    _tmp9  = _mm_load_ss((ptr3)+8); \
    /* Interleave neighbouring components; unpackhi keeps lanes 2-3 */ \
    _tmp10 = _mm_unpackhi_ps(jx1,jy1); \
    jx1    = _mm_unpacklo_ps(jx1,jy1); \
    _tmp11 = _mm_unpackhi_ps(jz1,jx2); \
    jz1    = _mm_unpacklo_ps(jz1,jx2); \
    _tmp12 = _mm_unpackhi_ps(jy2,jz2); \
    jy2    = _mm_unpacklo_ps(jy2,jz2); \
    _tmp13 = _mm_unpackhi_ps(jx3,jy3); \
    jx3    = _mm_unpacklo_ps(jx3,jy3); \
    /* Lane 1 and lane 2 of jz3 moved down to lane 0 for the scalar updates */ \
    _tmp14 = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1)); \
    _tmp15 = _mm_movehl_ps(jz3,jz3); \
    _tmp16 = _mm_movelh_ps(jx1,jz1);        /* ptr1: x1 y1 z1 x2 */ \
    _tmp17 = _mm_movehl_ps(jz1,jx1);        /* ptr2: x1 y1 z1 x2 */ \
    _tmp10 = _mm_movelh_ps(_tmp10,_tmp11);  /* ptr3: x1 y1 z1 x2 */ \
    _tmp18 = _mm_movelh_ps(jy2,jx3);        /* ptr1: y2 z2 x3 y3 */ \
    _tmp19 = _mm_movehl_ps(jx3,jy2);        /* ptr2: y2 z2 x3 y3 */ \
    _tmp12 = _mm_movelh_ps(_tmp12,_tmp13);  /* ptr3: y2 z2 x3 y3 */ \
    _tmp1  = _mm_sub_ps(_tmp1,_tmp16); \
    _tmp2  = _mm_sub_ps(_tmp2,_tmp18); \
    _tmp3  = _mm_sub_ss(_tmp3,jz3); \
    _tmp4  = _mm_sub_ps(_tmp4,_tmp17); \
    _tmp5  = _mm_sub_ps(_tmp5,_tmp19); \
    _tmp6  = _mm_sub_ss(_tmp6,_tmp14); \
    _tmp7  = _mm_sub_ps(_tmp7,_tmp10); \
    _tmp8  = _mm_sub_ps(_tmp8,_tmp12); \
    _tmp9  = _mm_sub_ss(_tmp9,_tmp15); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps((ptr1)+4,_tmp2); \
    _mm_store_ss((ptr1)+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps((ptr2)+4,_tmp5); \
    _mm_store_ss((ptr2)+8,_tmp6); \
    _mm_storeu_ps(ptr3,_tmp7); \
    _mm_storeu_ps((ptr3)+4,_tmp8); \
    _mm_store_ss((ptr3)+8,_tmp9); \
}
/* Decrement four rvecs (twelve consecutive floats) in memory for three pointers.
 *
 * Lane k (k = 0,1,2) of every j register belongs to ptr1/ptr2/ptr3; lane 3
 * is ignored. For each pointer, ptr[0..11] is decremented by
 * (jx1,jy1,jz1, ... ,jx4,jy4,jz4) of that pointer's lane.
 * Only unaligned loads/stores are used.
 *
 * Side effect: jx1, jz1, jy2, jx3, jz3 and jy4 are overwritten with
 * interleaved data.
 */
#define GMX_MM_DECREMENT_4RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) \
{ \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11; \
    __m128 _tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19,_tmp20,_tmp21; \
    /* Current memory contents: 3 x 4 floats per pointer */ \
    _tmp1  = _mm_loadu_ps(ptr1); \
    _tmp2  = _mm_loadu_ps((ptr1)+4); \
    _tmp3  = _mm_loadu_ps((ptr1)+8); \
    _tmp4  = _mm_loadu_ps(ptr2); \
    _tmp5  = _mm_loadu_ps((ptr2)+4); \
    _tmp6  = _mm_loadu_ps((ptr2)+8); \
    _tmp7  = _mm_loadu_ps(ptr3); \
    _tmp8  = _mm_loadu_ps((ptr3)+4); \
    _tmp9  = _mm_loadu_ps((ptr3)+8); \
    /* Interleave neighbouring components; unpackhi keeps lanes 2-3 */ \
    _tmp10 = _mm_unpackhi_ps(jx1,jy1); \
    jx1    = _mm_unpacklo_ps(jx1,jy1); \
    _tmp11 = _mm_unpackhi_ps(jz1,jx2); \
    jz1    = _mm_unpacklo_ps(jz1,jx2); \
    _tmp12 = _mm_unpackhi_ps(jy2,jz2); \
    jy2    = _mm_unpacklo_ps(jy2,jz2); \
    _tmp13 = _mm_unpackhi_ps(jx3,jy3); \
    jx3    = _mm_unpacklo_ps(jx3,jy3); \
    _tmp14 = _mm_unpackhi_ps(jz3,jx4); \
    jz3    = _mm_unpacklo_ps(jz3,jx4); \
    _tmp15 = _mm_unpackhi_ps(jy4,jz4); \
    jy4    = _mm_unpacklo_ps(jy4,jz4); \
    _tmp16 = _mm_movelh_ps(jx1,jz1);        /* ptr1: x1 y1 z1 x2 */ \
    _tmp17 = _mm_movehl_ps(jz1,jx1);        /* ptr2: x1 y1 z1 x2 */ \
    _tmp10 = _mm_movelh_ps(_tmp10,_tmp11);  /* ptr3: x1 y1 z1 x2 */ \
    _tmp18 = _mm_movelh_ps(jy2,jx3);        /* ptr1: y2 z2 x3 y3 */ \
    _tmp19 = _mm_movehl_ps(jx3,jy2);        /* ptr2: y2 z2 x3 y3 */ \
    _tmp12 = _mm_movelh_ps(_tmp12,_tmp13);  /* ptr3: y2 z2 x3 y3 */ \
    _tmp20 = _mm_movelh_ps(jz3,jy4);        /* ptr1: z3 x4 y4 z4 */ \
    _tmp21 = _mm_movehl_ps(jy4,jz3);        /* ptr2: z3 x4 y4 z4 */ \
    _tmp14 = _mm_movelh_ps(_tmp14,_tmp15);  /* ptr3: z3 x4 y4 z4 */ \
    _tmp1  = _mm_sub_ps(_tmp1,_tmp16); \
    _tmp2  = _mm_sub_ps(_tmp2,_tmp18); \
    _tmp3  = _mm_sub_ps(_tmp3,_tmp20); \
    _tmp4  = _mm_sub_ps(_tmp4,_tmp17); \
    _tmp5  = _mm_sub_ps(_tmp5,_tmp19); \
    _tmp6  = _mm_sub_ps(_tmp6,_tmp21); \
    _tmp7  = _mm_sub_ps(_tmp7,_tmp10); \
    _tmp8  = _mm_sub_ps(_tmp8,_tmp12); \
    _tmp9  = _mm_sub_ps(_tmp9,_tmp14); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps((ptr1)+4,_tmp2); \
    _mm_storeu_ps((ptr1)+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps((ptr2)+4,_tmp5); \
    _mm_storeu_ps((ptr2)+8,_tmp6); \
    _mm_storeu_ps(ptr3,_tmp7); \
    _mm_storeu_ps((ptr3)+4,_tmp8); \
    _mm_storeu_ps((ptr3)+8,_tmp9); \
}
/* Decrement one rvec (three consecutive floats) in memory for four pointers.
 *
 * Lane k (k = 0..3) of jx1/jy1/jz1 belongs to ptr1..ptr4. For each pointer,
 * ptr[0..2] is decremented by (jx1,jy1,jz1) of that pointer's lane. Each
 * rvec is handled as one scalar (x) plus one 64-bit pair (y,z), so exactly
 * three floats per pointer are touched.
 *
 * The j registers are only read, not modified.
 */
#define GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1) \
{ \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
    /* Memory layout in each register: (x 0 y z) */ \
    _tmp1  = _mm_load_ss(ptr1); \
    _tmp1  = _mm_loadh_pi(_tmp1,(__m64 *)((ptr1)+1)); \
    _tmp2  = _mm_load_ss(ptr2); \
    _tmp2  = _mm_loadh_pi(_tmp2,(__m64 *)((ptr2)+1)); \
    _tmp3  = _mm_load_ss(ptr3); \
    _tmp3  = _mm_loadh_pi(_tmp3,(__m64 *)((ptr3)+1)); \
    _tmp4  = _mm_load_ss(ptr4); \
    _tmp4  = _mm_loadh_pi(_tmp4,(__m64 *)((ptr4)+1)); \
    _tmp5  = _mm_unpacklo_ps(jy1,jz1);   /* y0 z0 y1 z1 */ \
    _tmp6  = _mm_unpackhi_ps(jy1,jz1);   /* y2 z2 y3 z3 */ \
    /* Build (x - y z) per pointer; lane 1 is a don't-care, never stored */ \
    _tmp7  = _mm_shuffle_ps(jx1,_tmp5,_MM_SHUFFLE(1,0,0,0)); \
    _tmp8  = _mm_shuffle_ps(jx1,_tmp5,_MM_SHUFFLE(3,2,0,1)); \
    _tmp9  = _mm_shuffle_ps(jx1,_tmp6,_MM_SHUFFLE(1,0,0,2)); \
    _tmp10 = _mm_shuffle_ps(jx1,_tmp6,_MM_SHUFFLE(3,2,0,3)); \
    _tmp1  = _mm_sub_ps(_tmp1,_tmp7); \
    _tmp2  = _mm_sub_ps(_tmp2,_tmp8); \
    _tmp3  = _mm_sub_ps(_tmp3,_tmp9); \
    _tmp4  = _mm_sub_ps(_tmp4,_tmp10); \
    _mm_store_ss(ptr1,_tmp1); \
    _mm_storeh_pi((__m64 *)((ptr1)+1),_tmp1); \
    _mm_store_ss(ptr2,_tmp2); \
    _mm_storeh_pi((__m64 *)((ptr2)+1),_tmp2); \
    _mm_store_ss(ptr3,_tmp3); \
    _mm_storeh_pi((__m64 *)((ptr3)+1),_tmp3); \
    _mm_store_ss(ptr4,_tmp4); \
    _mm_storeh_pi((__m64 *)((ptr4)+1),_tmp4); \
}
/* Decrement two rvecs (six consecutive floats) in memory for four pointers.
 *
 * Lane k (k = 0..3) of every j register belongs to ptr1..ptr4. For each
 * pointer, ptr[0..5] is decremented by (jx1,jy1,jz1,jx2,jy2,jz2) of that
 * pointer's lane. Each pointer is handled as one unaligned 4-float access
 * plus one 64-bit pair, so exactly six floats per pointer are touched.
 *
 * Side effect: jx1, jz1 and jy2 are overwritten with interleaved data.
 */
#define GMX_MM_DECREMENT_2RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2) \
{ \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11,_tmp12,_tmp13; \
    _tmp1  = _mm_loadu_ps(ptr1); \
    _tmp2  = _mm_loadu_ps(ptr2); \
    _tmp3  = _mm_loadu_ps(ptr3); \
    _tmp4  = _mm_loadu_ps(ptr4); \
    /* (y2 z2) pairs: ptr1|ptr2 share _tmp5, ptr3|ptr4 share _tmp6 */ \
    _tmp5  = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr1)+4)); \
    _tmp5  = _mm_loadh_pi(_tmp5,(__m64 *)((ptr2)+4)); \
    _tmp6  = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr3)+4)); \
    _tmp6  = _mm_loadh_pi(_tmp6,(__m64 *)((ptr4)+4)); \
    /* Interleave neighbouring components; unpackhi keeps lanes 2-3 */ \
    _tmp7  = _mm_unpackhi_ps(jx1,jy1); \
    jx1    = _mm_unpacklo_ps(jx1,jy1); \
    _tmp8  = _mm_unpackhi_ps(jz1,jx2); \
    jz1    = _mm_unpacklo_ps(jz1,jx2); \
    _tmp9  = _mm_unpackhi_ps(jy2,jz2); \
    jy2    = _mm_unpacklo_ps(jy2,jz2); \
    _tmp10 = _mm_movelh_ps(jx1,jz1);      /* ptr1: x1 y1 z1 x2 */ \
    _tmp11 = _mm_movehl_ps(jz1,jx1);      /* ptr2: x1 y1 z1 x2 */ \
    _tmp12 = _mm_movelh_ps(_tmp7,_tmp8);  /* ptr3: x1 y1 z1 x2 */ \
    _tmp13 = _mm_movehl_ps(_tmp8,_tmp7);  /* ptr4: x1 y1 z1 x2 */ \
    _tmp1  = _mm_sub_ps(_tmp1,_tmp10); \
    _tmp2  = _mm_sub_ps(_tmp2,_tmp11); \
    _tmp3  = _mm_sub_ps(_tmp3,_tmp12); \
    _tmp4  = _mm_sub_ps(_tmp4,_tmp13); \
    _tmp5  = _mm_sub_ps(_tmp5,jy2); \
    _tmp6  = _mm_sub_ps(_tmp6,_tmp9); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr2,_tmp2); \
    _mm_storeu_ps(ptr3,_tmp3); \
    _mm_storeu_ps(ptr4,_tmp4); \
    _mm_storel_pi((__m64 *)((ptr1)+4),_tmp5); \
    _mm_storeh_pi((__m64 *)((ptr2)+4),_tmp5); \
    _mm_storel_pi((__m64 *)((ptr3)+4),_tmp6); \
    _mm_storeh_pi((__m64 *)((ptr4)+4),_tmp6); \
}
/* Decrement three rvecs (nine consecutive floats) in memory for four pointers.
 *
 * Lane k (k = 0..3) of every j register belongs to ptr1..ptr4. For each
 * pointer, ptr[0..8] is decremented by (jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3)
 * of that pointer's lane. Only unaligned and scalar loads/stores are used,
 * and exactly nine floats per pointer are touched.
 *
 * Side effect: jx1, jz1, jy2 and jx3 are overwritten with interleaved data.
 */
#define GMX_MM_DECREMENT_3RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) \
{ \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
    __m128 _tmp11,_tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19; \
    __m128 _tmp20,_tmp21,_tmp22,_tmp23,_tmp24,_tmp25; \
    /* Current memory contents: 4 + 4 + 1 floats per pointer */ \
    _tmp1  = _mm_loadu_ps(ptr1); \
    _tmp2  = _mm_loadu_ps((ptr1)+4); \
    _tmp3  = _mm_load_ss((ptr1)+8); \
    _tmp4  = _mm_loadu_ps(ptr2); \
    _tmp5  = _mm_loadu_ps((ptr2)+4); \
    _tmp6  = _mm_load_ss((ptr2)+8); \
    _tmp7  = _mm_loadu_ps(ptr3); \
    _tmp8  = _mm_loadu_ps((ptr3)+4); \
    _tmp9  = _mm_load_ss((ptr3)+8); \
    _tmp10 = _mm_loadu_ps(ptr4); \
    _tmp11 = _mm_loadu_ps((ptr4)+4); \
    _tmp12 = _mm_load_ss((ptr4)+8); \
    /* Interleave neighbouring components; unpackhi keeps lanes 2-3 */ \
    _tmp13 = _mm_unpackhi_ps(jx1,jy1); \
    jx1    = _mm_unpacklo_ps(jx1,jy1); \
    _tmp14 = _mm_unpackhi_ps(jz1,jx2); \
    jz1    = _mm_unpacklo_ps(jz1,jx2); \
    _tmp15 = _mm_unpackhi_ps(jy2,jz2); \
    jy2    = _mm_unpacklo_ps(jy2,jz2); \
    _tmp16 = _mm_unpackhi_ps(jx3,jy3); \
    jx3    = _mm_unpacklo_ps(jx3,jy3); \
    /* Lanes 1, 2 and 3 of jz3 moved down to lane 0 for the scalar updates */ \
    _tmp17 = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1)); \
    _tmp18 = _mm_movehl_ps(jz3,jz3); \
    _tmp19 = _mm_shuffle_ps(_tmp18,_tmp18,_MM_SHUFFLE(0,0,0,1)); \
    _tmp20 = _mm_movelh_ps(jx1,jz1);        /* ptr1: x1 y1 z1 x2 */ \
    _tmp21 = _mm_movehl_ps(jz1,jx1);        /* ptr2: x1 y1 z1 x2 */ \
    _tmp22 = _mm_movelh_ps(_tmp13,_tmp14);  /* ptr3: x1 y1 z1 x2 */ \
    _tmp14 = _mm_movehl_ps(_tmp14,_tmp13);  /* ptr4: x1 y1 z1 x2 */ \
    _tmp23 = _mm_movelh_ps(jy2,jx3);        /* ptr1: y2 z2 x3 y3 */ \
    _tmp24 = _mm_movehl_ps(jx3,jy2);        /* ptr2: y2 z2 x3 y3 */ \
    _tmp25 = _mm_movelh_ps(_tmp15,_tmp16);  /* ptr3: y2 z2 x3 y3 */ \
    _tmp16 = _mm_movehl_ps(_tmp16,_tmp15);  /* ptr4: y2 z2 x3 y3 */ \
    _tmp1  = _mm_sub_ps(_tmp1,_tmp20); \
    _tmp2  = _mm_sub_ps(_tmp2,_tmp23); \
    _tmp3  = _mm_sub_ss(_tmp3,jz3); \
    _tmp4  = _mm_sub_ps(_tmp4,_tmp21); \
    _tmp5  = _mm_sub_ps(_tmp5,_tmp24); \
    _tmp6  = _mm_sub_ss(_tmp6,_tmp17); \
    _tmp7  = _mm_sub_ps(_tmp7,_tmp22); \
    _tmp8  = _mm_sub_ps(_tmp8,_tmp25); \
    _tmp9  = _mm_sub_ss(_tmp9,_tmp18); \
    _tmp10 = _mm_sub_ps(_tmp10,_tmp14); \
    _tmp11 = _mm_sub_ps(_tmp11,_tmp16); \
    _tmp12 = _mm_sub_ss(_tmp12,_tmp19); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps((ptr1)+4,_tmp2); \
    _mm_store_ss((ptr1)+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps((ptr2)+4,_tmp5); \
    _mm_store_ss((ptr2)+8,_tmp6); \
    _mm_storeu_ps(ptr3,_tmp7); \
    _mm_storeu_ps((ptr3)+4,_tmp8); \
    _mm_store_ss((ptr3)+8,_tmp9); \
    _mm_storeu_ps(ptr4,_tmp10); \
    _mm_storeu_ps((ptr4)+4,_tmp11); \
    _mm_store_ss((ptr4)+8,_tmp12); \
}
/* Decrement four rvecs (twelve consecutive floats) in memory for four pointers.
 *
 * Lane k (k = 0..3) of every j register belongs to ptr1..ptr4. For each
 * pointer, ptr[0..11] is decremented by (jx1,jy1,jz1, ... ,jx4,jy4,jz4) of
 * that pointer's lane; in effect this is a 12x4 -> 4x12 transpose followed
 * by a subtraction from memory. Only unaligned loads/stores are used.
 *
 * Side effect: jx1, jz1, jy2, jx3, jz3 and jy4 are overwritten with
 * interleaved data.
 */
#define GMX_MM_DECREMENT_4RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) \
{ \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11; \
    __m128 _tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19,_tmp20,_tmp21,_tmp22; \
    __m128 _tmp23,_tmp24; \
    /* Current memory contents: 3 x 4 floats per pointer */ \
    _tmp1  = _mm_loadu_ps(ptr1); \
    _tmp2  = _mm_loadu_ps((ptr1)+4); \
    _tmp3  = _mm_loadu_ps((ptr1)+8); \
    _tmp4  = _mm_loadu_ps(ptr2); \
    _tmp5  = _mm_loadu_ps((ptr2)+4); \
    _tmp6  = _mm_loadu_ps((ptr2)+8); \
    _tmp7  = _mm_loadu_ps(ptr3); \
    _tmp8  = _mm_loadu_ps((ptr3)+4); \
    _tmp9  = _mm_loadu_ps((ptr3)+8); \
    _tmp10 = _mm_loadu_ps(ptr4); \
    _tmp11 = _mm_loadu_ps((ptr4)+4); \
    _tmp12 = _mm_loadu_ps((ptr4)+8); \
    /* Interleave neighbouring components; unpackhi keeps lanes 2-3 */ \
    _tmp13 = _mm_unpackhi_ps(jx1,jy1); \
    jx1    = _mm_unpacklo_ps(jx1,jy1); \
    _tmp14 = _mm_unpackhi_ps(jz1,jx2); \
    jz1    = _mm_unpacklo_ps(jz1,jx2); \
    _tmp15 = _mm_unpackhi_ps(jy2,jz2); \
    jy2    = _mm_unpacklo_ps(jy2,jz2); \
    _tmp16 = _mm_unpackhi_ps(jx3,jy3); \
    jx3    = _mm_unpacklo_ps(jx3,jy3); \
    _tmp17 = _mm_unpackhi_ps(jz3,jx4); \
    jz3    = _mm_unpacklo_ps(jz3,jx4); \
    _tmp18 = _mm_unpackhi_ps(jy4,jz4); \
    jy4    = _mm_unpacklo_ps(jy4,jz4); \
    _tmp19 = _mm_movelh_ps(jx1,jz1);        /* ptr1: x1 y1 z1 x2 */ \
    jz1    = _mm_movehl_ps(jz1,jx1);        /* ptr2: x1 y1 z1 x2 */ \
    _tmp20 = _mm_movelh_ps(_tmp13,_tmp14);  /* ptr3: x1 y1 z1 x2 */ \
    _tmp14 = _mm_movehl_ps(_tmp14,_tmp13);  /* ptr4: x1 y1 z1 x2 */ \
    _tmp21 = _mm_movelh_ps(jy2,jx3);        /* ptr1: y2 z2 x3 y3 */ \
    jx3    = _mm_movehl_ps(jx3,jy2);        /* ptr2: y2 z2 x3 y3 */ \
    _tmp22 = _mm_movelh_ps(_tmp15,_tmp16);  /* ptr3: y2 z2 x3 y3 */ \
    _tmp16 = _mm_movehl_ps(_tmp16,_tmp15);  /* ptr4: y2 z2 x3 y3 */ \
    _tmp23 = _mm_movelh_ps(jz3,jy4);        /* ptr1: z3 x4 y4 z4 */ \
    jy4    = _mm_movehl_ps(jy4,jz3);        /* ptr2: z3 x4 y4 z4 */ \
    _tmp24 = _mm_movelh_ps(_tmp17,_tmp18);  /* ptr3: z3 x4 y4 z4 */ \
    _tmp18 = _mm_movehl_ps(_tmp18,_tmp17);  /* ptr4: z3 x4 y4 z4 */ \
    _tmp1  = _mm_sub_ps(_tmp1,_tmp19); \
    _tmp2  = _mm_sub_ps(_tmp2,_tmp21); \
    _tmp3  = _mm_sub_ps(_tmp3,_tmp23); \
    _tmp4  = _mm_sub_ps(_tmp4,jz1); \
    _tmp5  = _mm_sub_ps(_tmp5,jx3); \
    _tmp6  = _mm_sub_ps(_tmp6,jy4); \
    _tmp7  = _mm_sub_ps(_tmp7,_tmp20); \
    _tmp8  = _mm_sub_ps(_tmp8,_tmp22); \
    _tmp9  = _mm_sub_ps(_tmp9,_tmp24); \
    _tmp10 = _mm_sub_ps(_tmp10,_tmp14); \
    _tmp11 = _mm_sub_ps(_tmp11,_tmp16); \
    _tmp12 = _mm_sub_ps(_tmp12,_tmp18); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps((ptr1)+4,_tmp2); \
    _mm_storeu_ps((ptr1)+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps((ptr2)+4,_tmp5); \
    _mm_storeu_ps((ptr2)+8,_tmp6); \
    _mm_storeu_ps(ptr3,_tmp7); \
    _mm_storeu_ps((ptr3)+4,_tmp8); \
    _mm_storeu_ps((ptr3)+8,_tmp9); \
    _mm_storeu_ps(ptr4,_tmp10); \
    _mm_storeu_ps((ptr4)+4,_tmp11); \
    _mm_storeu_ps((ptr4)+8,_tmp12); \
}
/* Routine to be called with rswitch/rcut at the beginning of a kernel
 * to set up the six coefficients used for analytic 5th order switch
 * calculations.
 *
 * With D = rcut - rswitch the outputs are
 *   switch_C3 = -10/D^3,  switch_C4 =  15/D^4,  switch_C5 =  -6/D^5
 *   switch_D2 = -30/D^3,  switch_D3 =  60/D^4,  switch_D4 = -30/D^5
 * i.e. the D coefficients are the C coefficients multiplied by their
 * polynomial power. 1/D is obtained from gmx_mm_inv_ps().
 *
 * All arguments are __m128 values; the six switch_* arguments must be
 * assignable. Local names carry a _swsetup_ prefix so that they cannot
 * capture caller variables with short names such as "d".
 */
#define GMX_MM_SETUP_SWITCH5_PS(rswitch,rcut,switch_C3,switch_C4,switch_C5,switch_D2,switch_D3,switch_D4) \
{ \
    const __m128 _swsetup_cm6  = _mm_set1_ps( -6.0f); \
    const __m128 _swsetup_cm10 = _mm_set1_ps(-10.0f); \
    const __m128 _swsetup_c15  = _mm_set1_ps( 15.0f); \
    const __m128 _swsetup_cm30 = _mm_set1_ps(-30.0f); \
    const __m128 _swsetup_c60  = _mm_set1_ps( 60.0f); \
    __m128 _swsetup_d,_swsetup_dinv,_swsetup_dinv2,_swsetup_dinv3; \
    __m128 _swsetup_dinv4,_swsetup_dinv5; \
    /* Powers of 1/(rcut-rswitch) */ \
    _swsetup_d     = _mm_sub_ps(rcut,rswitch); \
    _swsetup_dinv  = gmx_mm_inv_ps(_swsetup_d); \
    _swsetup_dinv2 = _mm_mul_ps(_swsetup_dinv,_swsetup_dinv); \
    _swsetup_dinv3 = _mm_mul_ps(_swsetup_dinv2,_swsetup_dinv); \
    _swsetup_dinv4 = _mm_mul_ps(_swsetup_dinv2,_swsetup_dinv2); \
    _swsetup_dinv5 = _mm_mul_ps(_swsetup_dinv3,_swsetup_dinv2); \
    /* Switch function coefficients */ \
    switch_C3 = _mm_mul_ps(_swsetup_cm10,_swsetup_dinv3); \
    switch_C4 = _mm_mul_ps(_swsetup_c15,_swsetup_dinv4); \
    switch_C5 = _mm_mul_ps(_swsetup_cm6,_swsetup_dinv5); \
    /* Coefficients of the derivative of the switch function */ \
    switch_D2 = _mm_mul_ps(_swsetup_cm30,_swsetup_dinv3); \
    switch_D3 = _mm_mul_ps(_swsetup_c60,_swsetup_dinv4); \
    switch_D4 = _mm_mul_ps(_swsetup_cm30,_swsetup_dinv5); \
}
/* Evaluate the 5th order switch function and its derivative for four
 * distances at once.
 *
 * r is clamped to [rswitch,rcut] and d = r - rswitch. The macro then assigns
 *   sw  = 1 + sw_C3*d^3 + sw_C4*d^4 + sw_C5*d^5
 *   dsw =     sw_D2*d^2 + sw_D3*d^3 + sw_D4*d^4
 * using Horner evaluation, so sw is 1 at and below rswitch. The sw_*
 * coefficients are passed in by the caller.
 *
 * All arguments are __m128 values; sw and dsw must be assignable. Local
 * names carry a _sw_ prefix so that caller variables called "d" or "d2"
 * can safely be passed as arguments.
 */
#define GMX_MM_EVALUATE_SWITCH5_PS(r,rswitch,rcut,sw,dsw,sw_C3,sw_C4,sw_C5,sw_D2,sw_D3,sw_D4) \
{ \
    const __m128 _sw_one = _mm_set1_ps(1.0f); \
    __m128 _sw_d,_sw_d2; \
    /* Clamp the distance to the switching interval and shift to its start */ \
    _sw_d  = _mm_max_ps(r,rswitch); \
    _sw_d  = _mm_min_ps(_sw_d,rcut); \
    _sw_d  = _mm_sub_ps(_sw_d,rswitch); \
    _sw_d2 = _mm_mul_ps(_sw_d,_sw_d); \
    /* Horner evaluation of both polynomials, interleaved */ \
    sw     = _mm_mul_ps(_sw_d,sw_C5); \
    dsw    = _mm_mul_ps(_sw_d,sw_D4); \
    sw     = _mm_add_ps(sw,sw_C4); \
    dsw    = _mm_add_ps(dsw,sw_D3); \
    sw     = _mm_mul_ps(sw,_sw_d); \
    dsw    = _mm_mul_ps(dsw,_sw_d); \
    sw     = _mm_add_ps(sw,sw_C3); \
    dsw    = _mm_add_ps(dsw,sw_D2); \
    sw     = _mm_mul_ps(sw,_mm_mul_ps(_sw_d,_sw_d2)); \
    dsw    = _mm_mul_ps(dsw,_sw_d2); \
    sw     = _mm_add_ps(sw,_sw_one); \
}
/* Reduce the SIMD force accumulators of one i-atom and add them to memory.
 *
 * The four lanes of *fix1, *fiy1 and *fiz1 are summed horizontally. The
 * three sums are added to fptr[0..2] and also to fshiftptr[0..2]. Exactly
 * three floats are read and written at each pointer; no alignment is needed.
 *
 * The accumulators are used as scratch space and hold no meaningful force
 * data afterwards.
 */
static inline void
gmx_mm_update_iforce_1atom_ps(__m128 *fix1, __m128 *fiy1, __m128 *fiz1,
                              float *fptr,
                              float *fshiftptr)
{
    __m128 t2,t3;

#ifdef GMX_SSE3
    *fix1 = _mm_hadd_ps(*fix1,*fix1);
    *fiy1 = _mm_hadd_ps(*fiy1,*fiz1);

    *fix1 = _mm_hadd_ps(*fix1,*fiy1); /* fiz1 fiy1 fix1 fix1 */
#else
    /* SSE2: transpose with fix1 used for two rows, then add the rows */
    {
        __m128 t1 = *fix1;

        _MM_TRANSPOSE4_PS(*fix1,t1,*fiy1,*fiz1);
        *fix1 = _mm_add_ps(_mm_add_ps(*fix1,t1), _mm_add_ps(*fiy1,*fiz1));
    }
#endif
    /* Load memory as (x 0 y z) so that it lines up with (fix fix fiy fiz) */
    t2 = _mm_load_ss(fptr);
    t2 = _mm_loadh_pi(t2,(__m64 *)(fptr+1));
    t3 = _mm_load_ss(fshiftptr);
    t3 = _mm_loadh_pi(t3,(__m64 *)(fshiftptr+1));

    t2 = _mm_add_ps(t2,*fix1);
    t3 = _mm_add_ps(t3,*fix1);

    /* Lane 1 is a duplicate and is never stored */
    _mm_store_ss(fptr,t2);
    _mm_storeh_pi((__m64 *)(fptr+1),t2);
    _mm_store_ss(fshiftptr,t3);
    _mm_storeh_pi((__m64 *)(fshiftptr+1),t3);
}
/* Reduce the SIMD force accumulators of two i-atoms and add them to memory.
 *
 * The four lanes of each accumulator are summed horizontally. The six sums
 * (x1,y1,z1,x2,y2,z2) are added to fptr[0..5], and the per-component total
 * over both atoms is added to fshiftptr[0..2]. Only unaligned, 64-bit and
 * scalar loads/stores are used.
 *
 * The accumulators are used as scratch space and hold no meaningful force
 * data afterwards.
 */
static inline void
gmx_mm_update_iforce_2atoms_ps(__m128 *fix1, __m128 *fiy1, __m128 *fiz1,
                               __m128 *fix2, __m128 *fiy2, __m128 *fiz2,
                               float *fptr,
                               float *fshiftptr)
{
    __m128 t1,t2,t4;

#ifdef GMX_SSE3
    *fix1 = _mm_hadd_ps(*fix1,*fiy1);
    *fiz1 = _mm_hadd_ps(*fiz1,*fix2);
    *fiy2 = _mm_hadd_ps(*fiy2,*fiz2);

    *fix1 = _mm_hadd_ps(*fix1,*fiz1); /* fix2 fiz1 fiy1 fix1 */
    *fiy2 = _mm_hadd_ps(*fiy2,*fiy2); /*  -    -   fiz2 fiy2 */
#else
    /* SSE2 */
    /* transpose data */
    _MM_TRANSPOSE4_PS(*fix1,*fiy1,*fiz1,*fix2);
    t1 = _mm_unpacklo_ps(*fiy2,*fiz2);
    t2 = _mm_unpackhi_ps(*fiy2,*fiz2);

    *fix1 = _mm_add_ps(_mm_add_ps(*fix1,*fiy1), _mm_add_ps(*fiz1,*fix2));
    t1    = _mm_add_ps(t1,t2);
    t2    = _mm_movehl_ps(t2,t1);
    *fiy2 = _mm_add_ps(t1,t2);
#endif
    _mm_storeu_ps(fptr, _mm_add_ps(*fix1,_mm_loadu_ps(fptr)));
    /* Load the (y2 z2) pair into a zeroed register: t1 has no defined value
     * yet in the SSE3 path, so it must not be used as the merge source.
     */
    t1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(fptr+4));
    _mm_storel_pi((__m64 *)(fptr+4), _mm_add_ps(*fiy2,t1));

    /* Shift force is kept as (y x - z), listed from the highest lane down */
    t4 = _mm_load_ss(fshiftptr+2);
    t4 = _mm_loadh_pi(t4,(__m64 *)(fshiftptr));

    t1 = _mm_shuffle_ps(*fix1,*fiy2,_MM_SHUFFLE(0,0,3,2)); /* fiy2  -   fix2 fiz1 */
    t1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,1,0,0));       /* fiy2 fix2  -   fiz1 */
    t2 = _mm_shuffle_ps(*fiy2,*fix1,_MM_SHUFFLE(1,0,0,1)); /* fiy1 fix1  -   fiz2 */

    t1 = _mm_add_ps(t1,t2);
    t1 = _mm_add_ps(t1,t4); /* y x - z */

    _mm_store_ss(fshiftptr+2,t1);
    _mm_storeh_pi((__m64 *)(fshiftptr),t1);
}
/* Reduce the SIMD force accumulators of three i-atoms and add them to memory.
 *
 * The four lanes of each accumulator are summed horizontally. The nine sums
 * (x1,y1,z1,x2,y2,z2,x3,y3,z3) are added to fptr[0..8], and the
 * per-component total over the three atoms is added to fshiftptr[0..2].
 * Only unaligned, 64-bit and scalar loads/stores are used.
 *
 * The accumulators are used as scratch space and hold no meaningful force
 * data afterwards.
 */
static inline void
gmx_mm_update_iforce_3atoms_ps(__m128 *fix1, __m128 *fiy1, __m128 *fiz1,
                               __m128 *fix2, __m128 *fiy2, __m128 *fiz2,
                               __m128 *fix3, __m128 *fiy3, __m128 *fiz3,
                               float *fptr,
                               float *fshiftptr)
{
    __m128 t1,t2,t3,t4;

#ifdef GMX_SSE3
    *fix1 = _mm_hadd_ps(*fix1,*fiy1);
    *fiz1 = _mm_hadd_ps(*fiz1,*fix2);
    *fiy2 = _mm_hadd_ps(*fiy2,*fiz2);
    *fix3 = _mm_hadd_ps(*fix3,*fiy3);
    *fiz3 = _mm_hadd_ps(*fiz3,*fiz3);

    *fix1 = _mm_hadd_ps(*fix1,*fiz1); /* fix2 fiz1 fiy1 fix1 */
    *fiy2 = _mm_hadd_ps(*fiy2,*fix3); /* fiy3 fix3 fiz2 fiy2 */
    *fiz3 = _mm_hadd_ps(*fiz3,*fiz3); /*  -    -    -   fiz3 */
#else
    /* SSE2 */
    /* transpose data */
    _MM_TRANSPOSE4_PS(*fix1,*fiy1,*fiz1,*fix2);
    _MM_TRANSPOSE4_PS(*fiy2,*fiz2,*fix3,*fiy3);
    /* Bring lanes 1, 2 and 3 of fiz3 down to lane 0 for a scalar sum */
    t2 = _mm_movehl_ps(_mm_setzero_ps(),*fiz3);
    t1 = _mm_shuffle_ps(*fiz3,*fiz3,_MM_SHUFFLE(0,0,0,1));
    t3 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(0,0,0,1));

    *fix1 = _mm_add_ps(_mm_add_ps(*fix1,*fiy1), _mm_add_ps(*fiz1,*fix2));
    *fiy2 = _mm_add_ps(_mm_add_ps(*fiy2,*fiz2), _mm_add_ps(*fix3,*fiy3));
    *fiz3 = _mm_add_ss(_mm_add_ps(*fiz3,t1), _mm_add_ps(t2,t3));
#endif
    _mm_storeu_ps(fptr,   _mm_add_ps(*fix1,_mm_loadu_ps(fptr)));
    _mm_storeu_ps(fptr+4, _mm_add_ps(*fiy2,_mm_loadu_ps(fptr+4)));
    _mm_store_ss (fptr+8, _mm_add_ss(*fiz3,_mm_load_ss(fptr+8)));

    /* Shift force is kept as (y x - z), listed from the highest lane down */
    t4 = _mm_load_ss(fshiftptr+2);
    t4 = _mm_loadh_pi(t4,(__m64 *)(fshiftptr));

    t1 = _mm_shuffle_ps(*fiz3,*fix1,_MM_SHUFFLE(1,0,0,0)); /* fiy1 fix1  -   fiz3 */
    t2 = _mm_shuffle_ps(*fix1,*fiy2,_MM_SHUFFLE(3,2,2,2)); /* fiy3 fix3  -   fiz1 */
    t3 = _mm_shuffle_ps(*fiy2,*fix1,_MM_SHUFFLE(3,3,0,1)); /* fix2 fix2 fiy2 fiz2 */
    t3 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(1,2,0,0));       /* fiy2 fix2  -   fiz2 */

    t1 = _mm_add_ps(t1,t2);
    t3 = _mm_add_ps(t3,t4);
    t1 = _mm_add_ps(t1,t3); /* y x - z */

    _mm_store_ss(fshiftptr+2,t1);
    _mm_storeh_pi((__m64 *)(fshiftptr),t1);
}
/* Reduce the SIMD force accumulators of four i-atoms and add them to memory.
 *
 * The four lanes of each accumulator are summed horizontally. The twelve
 * sums (x1,y1,z1, ... ,x4,y4,z4) are added to fptr[0..11], and the
 * per-component total over the four atoms is added to fshiftptr[0..2].
 * Only unaligned, 64-bit and scalar loads/stores are used.
 *
 * The accumulators are used as scratch space and hold no meaningful force
 * data afterwards.
 */
static inline void
gmx_mm_update_iforce_4atoms_ps(__m128 *fix1, __m128 *fiy1, __m128 *fiz1,
                               __m128 *fix2, __m128 *fiy2, __m128 *fiz2,
                               __m128 *fix3, __m128 *fiy3, __m128 *fiz3,
                               __m128 *fix4, __m128 *fiy4, __m128 *fiz4,
                               float *fptr,
                               float *fshiftptr)
{
    __m128 t1,t2,t3,t4,t5;

#ifdef GMX_SSE3
    *fix1 = _mm_hadd_ps(*fix1,*fiy1);
    *fiz1 = _mm_hadd_ps(*fiz1,*fix2);
    *fiy2 = _mm_hadd_ps(*fiy2,*fiz2);
    *fix3 = _mm_hadd_ps(*fix3,*fiy3);
    *fiz3 = _mm_hadd_ps(*fiz3,*fix4);
    *fiy4 = _mm_hadd_ps(*fiy4,*fiz4);

    *fix1 = _mm_hadd_ps(*fix1,*fiz1); /* fix2 fiz1 fiy1 fix1 */
    *fiy2 = _mm_hadd_ps(*fiy2,*fix3); /* fiy3 fix3 fiz2 fiy2 */
    *fiz3 = _mm_hadd_ps(*fiz3,*fiy4); /* fiz4 fiy4 fix4 fiz3 */
#else
    /* SSE2 */
    /* transpose data */
    _MM_TRANSPOSE4_PS(*fix1,*fiy1,*fiz1,*fix2);
    _MM_TRANSPOSE4_PS(*fiy2,*fiz2,*fix3,*fiy3);
    _MM_TRANSPOSE4_PS(*fiz3,*fix4,*fiy4,*fiz4);

    *fix1 = _mm_add_ps(_mm_add_ps(*fix1,*fiy1), _mm_add_ps(*fiz1,*fix2));
    *fiy2 = _mm_add_ps(_mm_add_ps(*fiy2,*fiz2), _mm_add_ps(*fix3,*fiy3));
    *fiz3 = _mm_add_ps(_mm_add_ps(*fiz3,*fix4), _mm_add_ps(*fiy4,*fiz4));
#endif
    _mm_storeu_ps(fptr,   _mm_add_ps(*fix1,_mm_loadu_ps(fptr)));
    _mm_storeu_ps(fptr+4, _mm_add_ps(*fiy2,_mm_loadu_ps(fptr+4)));
    _mm_storeu_ps(fptr+8, _mm_add_ps(*fiz3,_mm_loadu_ps(fptr+8)));

    /* Shift force is kept as (y x - z), listed from the highest lane down */
    t5 = _mm_load_ss(fshiftptr+2);
    t5 = _mm_loadh_pi(t5,(__m64 *)(fshiftptr));

    t1 = _mm_shuffle_ps(*fix1,*fix1,_MM_SHUFFLE(1,0,2,2)); /* fiy1 fix1  -   fiz1 */
    t2 = _mm_shuffle_ps(*fiy2,*fiy2,_MM_SHUFFLE(3,2,1,1)); /* fiy3 fix3  -   fiz2 */
    t3 = _mm_shuffle_ps(*fiz3,*fiz3,_MM_SHUFFLE(2,1,0,0)); /* fiy4 fix4  -   fiz3 */
    t4 = _mm_shuffle_ps(*fix1,*fiy2,_MM_SHUFFLE(0,0,3,3)); /* fiy2 fiy2 fix2 fix2 */
    t4 = _mm_shuffle_ps(*fiz3,t4,_MM_SHUFFLE(2,0,3,3));    /* fiy2 fix2  -   fiz4 */

    t1 = _mm_add_ps(t1,t2);
    t3 = _mm_add_ps(t3,t4);
    t1 = _mm_add_ps(t1,t3); /* y x - z */
    t5 = _mm_add_ps(t5,t1);

    _mm_store_ss(fshiftptr+2,t5);
    _mm_storeh_pi((__m64 *)(fshiftptr),t5);
}
/* Potential-energy reductions.
 *
 * GMX_MM_UPDATE_1POT_PS(pot1,ptr1) sums the four lanes of pot1 and adds the
 * result to the single float at ptr1.
 * GMX_MM_UPDATE_2POT_PS(pot1,ptr1,pot2,ptr2) does the same for two
 * accumulators and two destinations at once.
 *
 * The pot arguments must be assignable __m128 variables; they are used as
 * scratch space and hold no meaningful data afterwards.
 */
#ifdef GMX_SSE3

#define GMX_MM_UPDATE_1POT_PS(pot1,ptr1) \
{ \
    pot1 = _mm_hadd_ps(pot1,pot1); \
    pot1 = _mm_hadd_ps(pot1,pot1); \
    _mm_store_ss(ptr1,_mm_add_ss(pot1,_mm_load_ss(ptr1))); \
}

#define GMX_MM_UPDATE_2POT_PS(pot1,ptr1,pot2,ptr2) \
{ \
    pot1 = _mm_hadd_ps(pot1,pot2); \
    pot1 = _mm_hadd_ps(pot1,pot1); \
    /* lane 0 now holds the pot1 sum and lane 1 the pot2 sum */ \
    pot2 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(0,0,0,1)); \
    _mm_store_ss(ptr1,_mm_add_ss(pot1,_mm_load_ss(ptr1))); \
    _mm_store_ss(ptr2,_mm_add_ss(pot2,_mm_load_ss(ptr2))); \
}

#else

#define GMX_MM_UPDATE_1POT_PS(pot1,ptr1) \
{ \
    pot1 = _mm_add_ps(pot1,_mm_movehl_ps(pot1,pot1)); \
    pot1 = _mm_add_ps(pot1,_mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(0,0,0,1))); \
    _mm_store_ss(ptr1,_mm_add_ss(pot1,_mm_load_ss(ptr1))); \
}

#define GMX_MM_UPDATE_2POT_PS(pot1,ptr1,pot2,ptr2) \
{ \
    __m128 _updt1,_updt2; \
    _updt1 = _mm_movehl_ps(pot2,pot1);   /* 2d 2c 1d 1c */ \
    _updt2 = _mm_movelh_ps(pot1,pot2);   /* 2b 2a 1b 1a */ \
    _updt1 = _mm_add_ps(_updt1,_updt2);  /*  2  2  1  1 */ \
    _updt2 = _mm_shuffle_ps(_updt1,_updt1,_MM_SHUFFLE(3,3,1,1)); \
    pot1   = _mm_add_ps(_updt1,_updt2);  /*  -  2  -  1 */ \
    pot2   = _mm_movehl_ps(_updt2,pot1); /*  -  -  -  2 */ \
    _mm_store_ss(ptr1,_mm_add_ss(pot1,_mm_load_ss(ptr1))); \
    _mm_store_ss(ptr2,_mm_add_ss(pot2,_mm_load_ss(ptr2))); \
}

#endif
/* Sum the four lanes of each of four accumulators and add every sum to the
 * single float at the matching pointer (pot1 -> ptr1, ... , pot4 -> ptr4).
 *
 * The pot arguments must be assignable __m128 variables; they are used as
 * scratch space and hold no meaningful data afterwards.
 */
#define GMX_MM_UPDATE_4POT_PS(pot1,ptr1,pot2,ptr2,pot3,ptr3,pot4,ptr4) \
{ \
    /* After the transpose, adding the rows leaves sum k in lane k of pot1 */ \
    _MM_TRANSPOSE4_PS(pot1,pot2,pot3,pot4); \
    pot1 = _mm_add_ps(_mm_add_ps(pot1,pot2),_mm_add_ps(pot3,pot4)); \
    /* Broadcast lanes 1-3 so that each sum is available in lane 0 */ \
    pot2 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(1,1,1,1)); \
    pot3 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(2,2,2,2)); \
    pot4 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(3,3,3,3)); \
    _mm_store_ss(ptr1,_mm_add_ss(pot1,_mm_load_ss(ptr1))); \
    _mm_store_ss(ptr2,_mm_add_ss(pot2,_mm_load_ss(ptr2))); \
    _mm_store_ss(ptr3,_mm_add_ss(pot3,_mm_load_ss(ptr3))); \
    _mm_store_ss(ptr4,_mm_add_ss(pot4,_mm_load_ss(ptr4))); \
}
2521 #endif /* _gmx_sse2_single_h_ */