2 * This source code is part of
6 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
7 * Copyright (c) 2001-2009, The GROMACS Development Team
9 * Gromacs is a library for molecular simulation and trajectory analysis,
10 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
11 * a full list of developers and information, check out http://www.gromacs.org
13 * This program is free software; you can redistribute it and/or modify it under
14 * the terms of the GNU Lesser General Public License as published by the Free
15 * Software Foundation; either version 2 of the License, or (at your option) any
17 * As a special exception, you may use this file as part of a free software
18 * library without restriction. Specifically, if other files instantiate
19 * templates or use macros or inline functions from this file, or you compile
20 * this file and link it with other files to produce an executable, this
21 * file does not by itself cause the resulting executable to be covered by
22 * the GNU Lesser General Public License.
24 * In plain-speak: do not worry about classes/macros/templates either - only
25 * changes to the library have to be LGPL, not an application linking with it.
27 * To help fund GROMACS development, we humbly ask that you cite
28 * the papers people have written on it - you can find them on the website!
34 #ifndef _gmx_sse2_single_h_
35 #define _gmx_sse2_single_h_
37 /* We require SSE2 now! */
42 #include <xmmintrin.h> /* SSE */
43 #include <emmintrin.h> /* SSE2 */
46 # include <pmmintrin.h> /* SSE3 */
49 # include <smmintrin.h> /* SSE4.1 */
54 /***************************************************
56 * COMPILER RANT WARNING: *
58 * Ideally, this header would be filled with *
59 * simple static inline functions. Unfortunately, *
60 * many vendors provide really braindead compilers *
61 * that either cannot handle more than 1-2 SSE *
62 * function parameters, and some cannot handle *
63 * pointers to SSE __m128 datatypes as parameters *
64 * at all. Thus, for portability we have had to *
65 * implement all but the simplest routines as *
68 ***************************************************/
71 /***************************************************
73 * Wrappers/replacements for some instructions *
74 * not available in all SSE versions. *
76 ***************************************************/
/* gmx_mm_extract_epi32(x, imm): read 32-bit element number imm out of x.
 * Two alternative definitions are listed. The first forwards directly to
 * _mm_extract_epi32. The second shifts x right by 4*imm bytes and returns
 * the lowest 32 bits with _mm_cvtsi128_si32.
 * NOTE(review): the preprocessor conditional that chooses between the two
 * definitions is not visible in this listing - confirm against the full header.
 */
79 # define gmx_mm_extract_epi32(x, imm) _mm_extract_epi32(x,imm)
81 # define gmx_mm_extract_epi32(x, imm) _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))
85 * Some compilers require a cast to change the interpretation
86 * of a register from FP to Int and vice versa, and not all of
87 * them provide instructions to do this. Roll our own wrappers...
/* Register reinterpretation wrappers (no value conversion):
 *   gmx_mm_castsi128_ps : integer register -> float register
 *   gmx_mm_castps_si128 : float register   -> integer register
 *   gmx_mm_castps_ps128 : float register   -> float register
 * For MSVC and the Intel compiler the first two map to the _mm_cast*
 * intrinsics; for GCC they are plain casts between the vector types.
 * NOTE(review): the fallback branch and the closing of this conditional are
 * not visible in this listing.
 */
90 #if (defined (_MSC_VER) || defined(__INTEL_COMPILER))
91 # define gmx_mm_castsi128_ps(a) _mm_castsi128_ps(a)
92 # define gmx_mm_castps_si128(a) _mm_castps_si128(a)
93 # define gmx_mm_castps_ps128(a) (a)
94 #elif defined(__GNUC__)
95 # define gmx_mm_castsi128_ps(a) ((__m128)(a))
96 # define gmx_mm_castps_si128(a) ((__m128i)(a))
97 # define gmx_mm_castps_ps128(a) ((__m128)(a))
/* Reinterpret the 128 bits of an integer register as four packed floats.
 * No value conversion takes place. The bits travel through a union rather
 * than a dereferenced pointer cast, so the result does not depend on the
 * compiler tolerating access through an incompatible pointer type.
 */
static __m128
gmx_mm_castsi128_ps(__m128i a)
{
    union { __m128i as_int; __m128 as_float; } pun;

    pun.as_int = a;
    return pun.as_float;
}
/* Reinterpret the 128 bits of a float register as an integer register.
 * No value conversion takes place. The bits travel through a union rather
 * than a dereferenced pointer cast, so the result does not depend on the
 * compiler tolerating access through an incompatible pointer type.
 */
static __m128i
gmx_mm_castps_si128(__m128 a)
{
    union { __m128 as_float; __m128i as_int; } pun;

    pun.as_float = a;
    return pun.as_int;
}
/* Identity "cast" from a float register to a float register. It exists so
 * that code can be written uniformly with the other cast wrappers; the
 * argument is returned unchanged.
 */
static __m128
gmx_mm_castps_ps128(__m128 a)
{
    return a;
}
106 /* IO functions, just for debugging */
/* Debug helper: print the four single-precision elements of xmm to stdout,
 * lowest element first, preceded by the label s.
 */
static void
printxmm(const char *s, __m128 xmm)
{
    float f[4];

    /* Unaligned store, since f has no particular alignment. */
    _mm_storeu_ps(f, xmm);
    printf("%s: %8.5g %8.5g %8.5g %8.5g\n", s, f[0], f[1], f[2], f[3]);
}
/* Debug helper: print the sum of the four single-precision elements of xmm
 * to stdout, preceded by the label s.
 */
static void
printxmmsum(const char *s, __m128 xmm)
{
    float f[4];

    /* Unaligned store, since f has no particular alignment. */
    _mm_storeu_ps(f, xmm);
    printf("%s (sum): %15.10g\n", s, f[0]+f[1]+f[2]+f[3]);
}
/* Debug helper: print the four 32-bit integer elements of xmmi to stdout,
 * lowest element first, preceded by the label s.
 */
static void
printxmmi(const char *s, __m128i xmmi)
{
    int i[4];

    /* Unaligned store, since i has no particular alignment. */
    _mm_storeu_si128((__m128i *)i, xmmi);
    printf("%10s: %2d %2d %2d %2d\n", s, i[0], i[1], i[2], i[3]);
}
138 /************************
140 * Simple math routines *
142 ************************/
/* 1/sqrt(x) for each of the four elements of x.
 *
 * Starts from the hardware estimate lu = _mm_rsqrt_ps(x) and applies one
 * Newton-Raphson refinement step:
 *     result = 0.5 * lu * (3 - lu*lu*x)
 * No special handling of x == 0 is done here.
 */
static inline __m128
gmx_mm_invsqrt_ps(__m128 x)
{
    const __m128 half  = _mm_set1_ps(0.5f);
    const __m128 three = _mm_set1_ps(3.0f);

    __m128 lu = _mm_rsqrt_ps(x);

    return _mm_mul_ps(half, _mm_mul_ps(_mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(lu, lu), x)), lu));
}
156 gmx_mm_sqrt_ps(__m128 x
)
161 mask
= _mm_cmpeq_ps(x
,_mm_setzero_ps());
162 res
= _mm_andnot_ps(mask
,gmx_mm_invsqrt_ps(x
));
164 res
= _mm_mul_ps(x
,res
);
/* 1/x for each of the four elements of x.
 *
 * Starts from the hardware estimate lu = _mm_rcp_ps(x) and applies one
 * Newton-Raphson refinement step:
 *     result = lu * (2 - lu*x)
 */
static inline __m128
gmx_mm_inv_ps(__m128 x)
{
    const __m128 two = _mm_set1_ps(2.0f);

    __m128 lu = _mm_rcp_ps(x);

    return _mm_mul_ps(lu, _mm_sub_ps(two, _mm_mul_ps(lu, x)));
}
/* Element-wise squared length dx*dx + dy*dy + dz*dz of four vectors whose
 * x, y and z components are held in dx, dy and dz respectively.
 */
static inline __m128
gmx_mm_calc_rsq_ps(__m128 dx, __m128 dy, __m128 dz)
{
    /* Same summation order as (dx^2 + dy^2) + dz^2. */
    return _mm_add_ps(_mm_add_ps(_mm_mul_ps(dx, dx), _mm_mul_ps(dy, dy)), _mm_mul_ps(dz, dz));
}
/* gmx_mm_sum4_ps(t0,t1,t2,t3): element-wise sum of four registers, evaluated
 * as the two independent partial sums (t0+t1) and (t2+t3) added together.
 */
186 /* Normal sum of four xmm registers */
187 #define gmx_mm_sum4_ps(t0,t1,t2,t3) _mm_add_ps(_mm_add_ps(t0,t1),_mm_add_ps(t2,t3))
/* Logarithm of each element of x (per the comment below: same algorithm as
 * the cephes library).
 *
 * Outline of the visible code:
 *  - the exponent field is masked out of x, shifted down 23 bits and has 126
 *    subtracted from it, giving an integer exponent;
 *  - the remaining mantissa bits are combined with 1.0 and halved, giving a
 *    value in [0.5,1); where that value is below 1/sqrt(2) it is doubled and
 *    the exponent is decremented; then 1 is subtracted;
 *  - a factorised polynomial in that reduced argument is evaluated;
 *  - the exponent, multiplied by corr1 and by corr2, is added back
 *    (corr1 + corr2 is approximately ln 2).
 * NOTE(review): the return type, braces, several local declarations and the
 * return statement are not visible in this listing; the last value computed
 * is x2.
 * NOTE(review): sqrt() is called on a float literal, so the double-precision
 * routine from <math.h> is presumably what is used - confirm the include.
 */
190 gmx_mm_log_ps(__m128 x
)
192 /* Same algorithm as cephes library */
193 const __m128 expmask
= gmx_mm_castsi128_ps( _mm_set_epi32(0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000) );
194 const __m128i expbase_m1
= _mm_set1_epi32(127-1); /* We want non-IEEE format */
195 const __m128 half
= _mm_set1_ps(0.5f
);
196 const __m128 one
= _mm_set1_ps(1.0f
);
197 const __m128 invsq2
= _mm_set1_ps(1.0f
/sqrt(2.0f
));
198 const __m128 corr1
= _mm_set1_ps(-2.12194440e-4f
);
199 const __m128 corr2
= _mm_set1_ps(0.693359375f
);
201 const __m128 CA_1
= _mm_set1_ps(0.070376836292f
);
202 const __m128 CB_0
= _mm_set1_ps(1.6714950086782716f
);
203 const __m128 CB_1
= _mm_set1_ps(-2.452088066061482f
);
204 const __m128 CC_0
= _mm_set1_ps(1.5220770854701728f
);
205 const __m128 CC_1
= _mm_set1_ps(-1.3422238433233642f
);
206 const __m128 CD_0
= _mm_set1_ps(1.386218787509749f
);
207 const __m128 CD_1
= _mm_set1_ps(0.35075468953796346f
);
208 const __m128 CE_0
= _mm_set1_ps(1.3429983063133937f
);
209 const __m128 CE_1
= _mm_set1_ps(1.807420826584643f
);
216 __m128 pA
,pB
,pC
,pD
,pE
,tB
,tC
,tD
,tE
;
218 /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
219 fexp
= _mm_and_ps(x
,expmask
);
220 iexp
= gmx_mm_castps_si128(fexp
);
221 iexp
= _mm_srli_epi32(iexp
,23);
222 iexp
= _mm_sub_epi32(iexp
,expbase_m1
);
224 x
= _mm_andnot_ps(expmask
,x
);
225 x
= _mm_or_ps(x
,one
);
226 x
= _mm_mul_ps(x
,half
);
228 mask
= _mm_cmplt_ps(x
,invsq2
);
230 x
= _mm_add_ps(x
,_mm_and_ps(mask
,x
));
231 x
= _mm_sub_ps(x
,one
);
232 iexp
= _mm_add_epi32(iexp
,gmx_mm_castps_si128(mask
)); /* 0xFFFFFFFF = -1 as int */
234 x2
= _mm_mul_ps(x
,x
);
236 pA
= _mm_mul_ps(CA_1
,x
);
237 pB
= _mm_mul_ps(CB_1
,x
);
238 pC
= _mm_mul_ps(CC_1
,x
);
239 pD
= _mm_mul_ps(CD_1
,x
);
240 pE
= _mm_mul_ps(CE_1
,x
);
241 tB
= _mm_add_ps(CB_0
,x2
);
242 tC
= _mm_add_ps(CC_0
,x2
);
243 tD
= _mm_add_ps(CD_0
,x2
);
244 tE
= _mm_add_ps(CE_0
,x2
);
245 pB
= _mm_add_ps(pB
,tB
);
246 pC
= _mm_add_ps(pC
,tC
);
247 pD
= _mm_add_ps(pD
,tD
);
248 pE
= _mm_add_ps(pE
,tE
);
250 pA
= _mm_mul_ps(pA
,pB
);
251 pC
= _mm_mul_ps(pC
,pD
);
252 pE
= _mm_mul_ps(pE
,x2
);
253 pA
= _mm_mul_ps(pA
,pC
);
254 y
= _mm_mul_ps(pA
,pE
);
256 fexp
= _mm_cvtepi32_ps(iexp
);
257 y
= _mm_add_ps(y
,_mm_mul_ps(fexp
,corr1
));
259 y
= _mm_sub_ps(y
, _mm_mul_ps(half
,x2
));
260 x2
= _mm_add_ps(x
,y
);
262 x2
= _mm_add_ps(x2
,_mm_mul_ps(fexp
,corr2
));
269 * Exponential function.
271 * Exp(x) is calculated from the relation Exp(x)=2^(y), where y=log2(e)*x
272 * Thus, the contents of this routine is mostly about calculating 2^y.
274 * This is done by separating y=z+w, where z=[y] is an integer. For technical reasons it is easiest
275 * for us to round to the _nearest_ integer and have w in [-0.5,0.5] rather than always rounding down.
276 * (It is not until SSE4 there was an efficient operation to do rounding towards -infinity).
278 * With this we get 2^y=2^z*2^w
280 * Since we have IEEE fp representation, we can easily calculate 2^z by adding the FP exponent bias
281 * (127 in single), and shifting the integer to the exponent field of the FP number (23 bits up).
283 * The 2^w term is calculated from a (5,0)-th order (no denominator) Minimax polynomial on the interval
284 * [-0.5,0.5]. The coefficients of this polynomial were derived in Mathematica using the command:
286 * MiniMaxApproximation[(2^x), {x, {-0.5, 0.5}, 5, 0}, WorkingPrecision -> 15]
288 * The lowest exponent we can represent in IEEE single-precision binary format is 2^-126; below that
289 * it will wrap around and lead to very large positive numbers. This corresponds to a lower bound
290 * on the argument for exp(x) of roughly -87.33. For smaller arguments the return value will be 0.0.
292 * There appears to be a slight loss of precision for large arguments (~50), where the largest relative
293 * error reaches ~3e-6. However, since the actual value for that argument is around 10^21, it might
294 * not matter for typical single precision workloads. This is likely caused by the polynomial evaluation,
295 * and the only way around would then be a table-based version, which I haven't managed to get the
296 * same performance from.
298 * The _average_ accuracy is 22.7 bits in the range [-10,10], and the worst roughly 1 bit worse.
/* exp(x) for each element of x, evaluated as 2^(x*log2(e)).
 *
 * Outline of the visible code:
 *  - z = x * argscale, where argscale is log2(e);
 *  - z is converted to an integer (iexppart) and the same integer as a float
 *    (intpart); the fractional remainder z - intpart feeds the polynomial;
 *  - 2^integer is built by adding the bias 127 and shifting left 23 bits,
 *    then masked to zero for elements where x is not greater than arglimit;
 *  - the result is CA0 * 2^integer * (CB0+z) * (CC0+CC1*z+z^2) * (CD0+CD1*z+z^2).
 * NOTE(review): two alternative assignments to intpart are listed (one with
 * _mm_round_ps, one with _mm_cvtepi32_ps); the conditional that selects
 * between them is not visible in this listing.
 * NOTE(review): the return type, braces, some local declarations and the
 * return statement are not visible here; the last value computed is z.
 */
301 gmx_mm_exp_ps(__m128 x
)
303 const __m128 argscale
= _mm_set1_ps(1.442695040888963f
);
304 /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
305 const __m128 arglimit
= _mm_set1_ps(-126.0f
/1.442695040888963f
);
307 const __m128i expbase
= _mm_set1_epi32(127);
308 const __m128 CA0
= _mm_set1_ps(0.00132764719920600f
);
309 const __m128 CB0
= _mm_set1_ps(3.17196359322f
);
310 const __m128 CC0
= _mm_set1_ps(20.36135752425f
);
311 const __m128 CC1
= _mm_set1_ps(-0.681627790451f
);
312 const __m128 CD0
= _mm_set1_ps(11.66225206128f
);
313 const __m128 CD1
= _mm_set1_ps(4.79739947827f
);
321 __m128 factB
,factC
,factD
;
323 z
= _mm_mul_ps(x
,argscale
);
324 iexppart
= _mm_cvtps_epi32(z
);
326 /* This reduces latency and speeds up the code by roughly 5% when supported */
327 intpart
= _mm_round_ps(z
,0);
329 intpart
= _mm_cvtepi32_ps(iexppart
);
331 iexppart
= _mm_slli_epi32(_mm_add_epi32(iexppart
,expbase
),23);
332 valuemask
= _mm_cmpgt_ps(x
,arglimit
);
334 z
= _mm_sub_ps(z
,intpart
);
335 z2
= _mm_mul_ps(z
,z
);
337 fexppart
= _mm_and_ps(valuemask
,gmx_mm_castsi128_ps(iexppart
));
339 /* Since SSE floating-point has relatively high latency it is faster to do
340 * factorized polynomial summation with independent terms than using alternating add/multiply, i.e.
341 * p(z) = A0 * (B0 + z) * (C0 + C1*z + z^2) * (D0 + D1*z + z^2)
343 factB
= _mm_add_ps(CB0
,z
);
344 factC
= _mm_add_ps(CC0
,_mm_mul_ps(CC1
,z
) );
345 factC
= _mm_add_ps(factC
,z2
);
346 factD
= _mm_add_ps(CD0
,_mm_mul_ps(CD1
,z
) );
347 factD
= _mm_add_ps(factD
,z2
);
349 z
= _mm_mul_ps(CA0
,fexppart
);
350 factB
= _mm_mul_ps(factB
,factC
);
351 z
= _mm_mul_ps(z
,factD
);
352 z
= _mm_mul_ps(z
,factB
);
354 /* Currently uses 22 actual (real, not including casts) SSE instructions */
/* Sine and cosine of each element of x, written through the pointers
 * sinval and cosval.
 *
 * Outline of the visible code:
 *  - x is scaled by 2/pi and rounded to the nearest integer q (by adding a
 *    sign-matched 0.5 and truncating);
 *  - the reduced argument is x - q*kc1 - q*kc2 (kc1 + kc2 approximates pi/2);
 *  - polynomials in the reduced argument give a sine-like value (sx) and a
 *    cosine-like value (cx);
 *  - bit 0 of (q & 3), respectively of (q & 3) + 1, selects sx or cx for each
 *    output, and bit 1 selects whether the sign is flipped.
 * NOTE(review): the remaining parameters of the signature, the return type,
 * braces, most local declarations and any return statement are not visible
 * in this listing.
 * NOTE(review): _sincosf_tiny, _sincosf_absxl and _sincosf_isTiny are
 * computed but not used in the lines visible here.
 * NOTE(review): M_PI is not part of ISO C; this relies on <math.h> exposing
 * it - confirm for strict-conformance builds and MSVC.
 */
361 gmx_mm_sincos_ps(__m128 x
,
365 const __m128 _sincosf_two_over_pi
= {2.0/M_PI
,2.0/M_PI
,2.0/M_PI
,2.0/M_PI
};
366 const __m128 _sincosf_half
= {0.5,0.5,0.5,0.5};
367 const __m128 _sincosf_one
= {1.0,1.0,1.0,1.0};
369 const __m128i _sincosf_izero
= _mm_set1_epi32(0);
370 const __m128i _sincosf_ione
= _mm_set1_epi32(1);
371 const __m128i _sincosf_itwo
= _mm_set1_epi32(2);
372 const __m128i _sincosf_ithree
= _mm_set1_epi32(3);
374 const __m128 _sincosf_kc1
= {1.57079625129,1.57079625129,1.57079625129,1.57079625129};
375 const __m128 _sincosf_kc2
= {7.54978995489e-8,7.54978995489e-8,7.54978995489e-8,7.54978995489e-8};
376 const __m128 _sincosf_cc0
= {-0.0013602249,-0.0013602249,-0.0013602249,-0.0013602249};
377 const __m128 _sincosf_cc1
= {0.0416566950,0.0416566950,0.0416566950,0.0416566950};
378 const __m128 _sincosf_cc2
= {-0.4999990225,-0.4999990225,-0.4999990225,-0.4999990225};
379 const __m128 _sincosf_sc0
= {-0.0001950727,-0.0001950727,-0.0001950727,-0.0001950727};
380 const __m128 _sincosf_sc1
= {0.0083320758,0.0083320758,0.0083320758,0.0083320758};
381 const __m128 _sincosf_sc2
= {-0.1666665247,-0.1666665247,-0.1666665247,-0.1666665247};
383 __m128 _sincosf_signbit
= gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
384 __m128 _sincosf_tiny
= gmx_mm_castsi128_ps( _mm_set1_epi32(0x3e400000) );
390 __m128 _sincosf_absxl
;
399 __m128i _sincosf_offsetSin
;
400 __m128i _sincosf_offsetCos
;
401 __m128 _sincosf_sinMask
;
402 __m128 _sincosf_cosMask
;
403 __m128 _sincosf_isTiny
;
410 _sincosf_xl
= _mm_mul_ps(x
,_sincosf_two_over_pi
);
412 _sincosf_xl
= _mm_add_ps(_sincosf_xl
,_mm_or_ps(_mm_and_ps(_sincosf_xl
,_sincosf_signbit
),_sincosf_half
));
414 _sincosf_q
= _mm_cvttps_epi32(_sincosf_xl
);
415 _sincosf_qf
= _mm_cvtepi32_ps(_sincosf_q
);
417 _sincosf_offsetSin
= _mm_and_si128(_sincosf_q
,_sincosf_ithree
);
418 _sincosf_offsetCos
= _mm_add_epi32(_sincosf_offsetSin
,_sincosf_ione
);
420 _sincosf_p1
= _mm_mul_ps(_sincosf_qf
,_sincosf_kc1
);
421 _sincosf_xl
= _mm_mul_ps(_sincosf_qf
,_sincosf_kc2
);
422 _sincosf_p1
= _mm_sub_ps(x
,_sincosf_p1
);
423 _sincosf_xl
= _mm_sub_ps(_sincosf_p1
,_sincosf_xl
);
425 _sincosf_absxl
= _mm_andnot_ps(_sincosf_signbit
,_sincosf_xl
);
426 _sincosf_isTiny
= _mm_cmpgt_ps(_sincosf_tiny
,_sincosf_absxl
);
428 _sincosf_xl2
= _mm_mul_ps(_sincosf_xl
,_sincosf_xl
);
429 _sincosf_xl3
= _mm_mul_ps(_sincosf_xl2
,_sincosf_xl
);
431 _sincosf_ct1
= _mm_mul_ps(_sincosf_cc0
,_sincosf_xl2
);
432 _sincosf_ct1
= _mm_add_ps(_sincosf_ct1
,_sincosf_cc1
);
433 _sincosf_st1
= _mm_mul_ps(_sincosf_sc0
,_sincosf_xl2
);
434 _sincosf_st1
= _mm_add_ps(_sincosf_st1
,_sincosf_sc1
);
435 _sincosf_ct2
= _mm_mul_ps(_sincosf_ct1
,_sincosf_xl2
);
436 _sincosf_ct2
= _mm_add_ps(_sincosf_ct2
,_sincosf_cc2
);
437 _sincosf_st2
= _mm_mul_ps(_sincosf_st1
,_sincosf_xl2
);
438 _sincosf_st2
= _mm_add_ps(_sincosf_st2
,_sincosf_sc2
);
440 _sincosf_cx
= _mm_mul_ps(_sincosf_ct2
,_sincosf_xl2
);
441 _sincosf_cx
= _mm_add_ps(_sincosf_cx
,_sincosf_one
);
443 _sincosf_sx
= _mm_mul_ps(_sincosf_st2
,_sincosf_xl3
);
444 _sincosf_sx
= _mm_add_ps(_sincosf_sx
,_sincosf_xl
);
446 _sincosf_sinMask
= gmx_mm_castsi128_ps( _mm_cmpeq_epi32( _mm_and_si128(_sincosf_offsetSin
,_sincosf_ione
), _sincosf_izero
) );
447 _sincosf_cosMask
= gmx_mm_castsi128_ps( _mm_cmpeq_epi32( _mm_and_si128(_sincosf_offsetCos
,_sincosf_ione
), _sincosf_izero
) );
449 _sincosf_ts
= _mm_or_ps( _mm_and_ps(_sincosf_sinMask
,_sincosf_sx
) , _mm_andnot_ps(_sincosf_sinMask
,_sincosf_cx
) );
450 _sincosf_tc
= _mm_or_ps( _mm_and_ps(_sincosf_cosMask
,_sincosf_sx
) , _mm_andnot_ps(_sincosf_cosMask
,_sincosf_cx
) );
452 _sincosf_sinMask
= gmx_mm_castsi128_ps( _mm_cmpeq_epi32( _mm_and_si128(_sincosf_offsetSin
,_sincosf_itwo
), _sincosf_izero
) );
453 _sincosf_tsn
= _mm_xor_ps(_sincosf_signbit
,_sincosf_ts
);
454 _sincosf_ts
= _mm_or_ps( _mm_and_ps(_sincosf_sinMask
,_sincosf_ts
) , _mm_andnot_ps(_sincosf_sinMask
,_sincosf_tsn
) );
456 _sincosf_cosMask
= gmx_mm_castsi128_ps( _mm_cmpeq_epi32( _mm_and_si128(_sincosf_offsetCos
,_sincosf_itwo
), _sincosf_izero
) );
457 _sincosf_tcn
= _mm_xor_ps(_sincosf_signbit
,_sincosf_tc
);
458 _sincosf_tc
= _mm_or_ps( _mm_and_ps(_sincosf_cosMask
,_sincosf_tc
) , _mm_andnot_ps(_sincosf_cosMask
,_sincosf_tcn
) );
460 *sinval
= _sincosf_ts
;
461 *cosval
= _sincosf_tc
;
467 gmx_mm_tan_ps(__m128 x
)
469 __m128 sinval
,cosval
;
472 gmx_mm_sincos_ps(x
,&sinval
,&cosval
);
474 tanval
= _mm_mul_ps(sinval
,gmx_mm_inv_ps(cosval
));
/* Arcsine of each element of x (per the comment below: same algorithm as the
 * cephes library).
 *
 * Outline of the visible code:
 *  - the sign bit is split off and the computation proceeds on |x|;
 *  - for |x| > 0.5 the polynomial argument is z1 = 0.5*(1-|x|) with
 *    q1 = sqrt(z1) (forced to 0 where |x| == 1); otherwise it is built from q2;
 *  - a polynomial with coefficients CC1..CC5 is evaluated;
 *  - for |x| > 0.5 the result is halfpi - 2*z, otherwise z;
 *  - where |x| is not greater than limitlow (1e-4) the result is |x| itself;
 *  - the original sign is restored with an XOR.
 * NOTE(review): the return type, braces, several declarations, the
 * assignment that initialises q2, the statements between the polynomial sum
 * and the halfpi subtraction, and the return statement are not visible in
 * this listing.
 * NOTE(review): M_PI is not part of ISO C; this relies on <math.h> exposing it.
 */
481 gmx_mm_asin_ps(__m128 x
)
483 /* Same algorithm as cephes library */
484 const __m128 signmask
= gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
485 const __m128 limitlow
= _mm_set1_ps(1e-4f
);
486 const __m128 half
= _mm_set1_ps(0.5f
);
487 const __m128 one
= _mm_set1_ps(1.0f
);
488 const __m128 halfpi
= _mm_set1_ps(M_PI
/2.0f
);
490 const __m128 CC5
= _mm_set1_ps(4.2163199048E-2f
);
491 const __m128 CC4
= _mm_set1_ps(2.4181311049E-2f
);
492 const __m128 CC3
= _mm_set1_ps(4.5470025998E-2f
);
493 const __m128 CC2
= _mm_set1_ps(7.4953002686E-2f
);
494 const __m128 CC1
= _mm_set1_ps(1.6666752422E-1f
);
499 __m128 z
,z1
,z2
,q
,q1
,q2
;
502 sign
= _mm_andnot_ps(signmask
,x
);
503 xabs
= _mm_and_ps(x
,signmask
);
505 mask
= _mm_cmpgt_ps(xabs
,half
);
507 z1
= _mm_mul_ps(half
, _mm_sub_ps(one
,xabs
));
508 q1
= _mm_mul_ps(z1
,gmx_mm_invsqrt_ps(z1
));
509 q1
= _mm_andnot_ps(_mm_cmpeq_ps(xabs
,one
),q1
);
512 z2
= _mm_mul_ps(q2
,q2
);
514 z
= _mm_or_ps( _mm_and_ps(mask
,z1
) , _mm_andnot_ps(mask
,z2
) );
515 q
= _mm_or_ps( _mm_and_ps(mask
,q1
) , _mm_andnot_ps(mask
,q2
) );
517 z2
= _mm_mul_ps(z
,z
);
519 pA
= _mm_mul_ps(CC5
,z2
);
520 pB
= _mm_mul_ps(CC4
,z2
);
522 pA
= _mm_add_ps(pA
,CC3
);
523 pB
= _mm_add_ps(pB
,CC2
);
525 pA
= _mm_mul_ps(pA
,z2
);
526 pB
= _mm_mul_ps(pB
,z2
);
528 pA
= _mm_add_ps(pA
,CC1
);
529 pA
= _mm_mul_ps(pA
,z
);
531 z
= _mm_add_ps(pA
,pB
);
535 q2
= _mm_sub_ps(halfpi
,z
);
536 q2
= _mm_sub_ps(q2
,z
);
538 z
= _mm_or_ps( _mm_and_ps(mask
,q2
) , _mm_andnot_ps(mask
,z
) );
540 mask
= _mm_cmpgt_ps(xabs
,limitlow
);
541 z
= _mm_or_ps( _mm_and_ps(mask
,z
) , _mm_andnot_ps(mask
,xabs
) );
543 z
= _mm_xor_ps(z
,sign
);
/* Arccosine of each element of x, built on gmx_mm_asin_ps.
 *
 * Outline of the visible code:
 *  - mask1 marks elements with |x| > 0.5, mask2 marks elements with x > 0;
 *  - where mask1 is set, asin is taken of sqrt(0.5*(1-|x|)) (forced to 0
 *    where |x| == 1); elsewhere asin is taken of x itself;
 *  - with z the asin result: where mask1 is set the answer is 2*z for x > 0
 *    and pi - 2*z otherwise; where mask1 is clear the answer is halfpi - z.
 * NOTE(review): the return type, braces, local declarations and the return
 * statement are not visible in this listing.
 * NOTE(review): M_PI is not part of ISO C; this relies on <math.h> exposing it.
 */
550 gmx_mm_acos_ps(__m128 x
)
552 const __m128 signmask
= gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
553 const __m128 one_ps
= _mm_set1_ps(1.0f
);
554 const __m128 half_ps
= _mm_set1_ps(0.5f
);
555 const __m128 pi_ps
= _mm_set1_ps(M_PI
);
556 const __m128 halfpi_ps
= _mm_set1_ps(M_PI
/2.0f
);
563 xabs
= _mm_and_ps(x
,signmask
);
564 mask1
= _mm_cmpgt_ps(xabs
,half_ps
);
565 mask2
= _mm_cmpgt_ps(x
,_mm_setzero_ps());
567 z
= _mm_mul_ps(half_ps
,_mm_sub_ps(one_ps
,xabs
));
568 z
= _mm_mul_ps(z
,gmx_mm_invsqrt_ps(z
));
569 z
= _mm_andnot_ps(_mm_cmpeq_ps(xabs
,one_ps
),z
);
571 z
= _mm_or_ps( _mm_and_ps(mask1
,z
) , _mm_andnot_ps(mask1
,x
) );
572 z
= gmx_mm_asin_ps(z
);
574 z2
= _mm_add_ps(z
,z
);
575 z1
= _mm_sub_ps(pi_ps
,z2
);
576 z3
= _mm_sub_ps(halfpi_ps
,z
);
578 z
= _mm_or_ps( _mm_and_ps(mask2
,z2
) , _mm_andnot_ps(mask2
,z1
) );
579 z
= _mm_or_ps( _mm_and_ps(mask1
,z
) , _mm_andnot_ps(mask1
,z3
) );
/* Arctangent of each element of x (per the comment below: same algorithm as
 * the cephes library).
 *
 * Outline of the visible code:
 *  - the sign bit is split off and the computation proceeds on |x|;
 *  - for |x| > limit2 (2.414...) the argument becomes -1/|x| with offset
 *    halfpi; otherwise, for |x| > limit1 (0.414...), it becomes
 *    (|x|-1)/(|x|+1) with offset quarterpi; otherwise it stays |x| with
 *    offset 0;
 *  - an odd polynomial with coefficients CC3, CC5, CC7, CC9 is evaluated in
 *    the reduced argument and added to the offset;
 *  - the original sign is restored with an XOR.
 * NOTE(review): the return type, braces, local declarations and the return
 * statement are not visible in this listing; the last value computed is y.
 * NOTE(review): the CC7 literal has no 'f' suffix, unlike its neighbours.
 */
586 gmx_mm_atan_ps(__m128 x
)
588 /* Same algorithm as cephes library */
589 const __m128 signmask
= gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
590 const __m128 limit1
= _mm_set1_ps(0.414213562373095f
);
591 const __m128 limit2
= _mm_set1_ps(2.414213562373095f
);
592 const __m128 quarterpi
= _mm_set1_ps(0.785398163397448f
);
593 const __m128 halfpi
= _mm_set1_ps(1.570796326794896f
);
594 const __m128 mone
= _mm_set1_ps(-1.0f
);
595 const __m128 CC3
= _mm_set1_ps(-3.33329491539E-1f
);
596 const __m128 CC5
= _mm_set1_ps(1.99777106478E-1f
);
597 const __m128 CC7
= _mm_set1_ps(-1.38776856032E-1);
598 const __m128 CC9
= _mm_set1_ps(8.05374449538e-2f
);
606 sign
= _mm_andnot_ps(signmask
,x
);
607 x
= _mm_and_ps(x
,signmask
);
609 mask1
= _mm_cmpgt_ps(x
,limit1
);
610 mask2
= _mm_cmpgt_ps(x
,limit2
);
612 z1
= _mm_mul_ps(_mm_add_ps(x
,mone
),gmx_mm_inv_ps(_mm_sub_ps(x
,mone
)));
613 z2
= _mm_mul_ps(mone
,gmx_mm_inv_ps(x
));
615 y
= _mm_and_ps(mask1
,quarterpi
);
616 y
= _mm_or_ps( _mm_and_ps(mask2
,halfpi
) , _mm_andnot_ps(mask2
,y
) );
618 x
= _mm_or_ps( _mm_and_ps(mask1
,z1
) , _mm_andnot_ps(mask1
,x
) );
619 x
= _mm_or_ps( _mm_and_ps(mask2
,z2
) , _mm_andnot_ps(mask2
,x
) );
621 x2
= _mm_mul_ps(x
,x
);
622 x4
= _mm_mul_ps(x2
,x2
);
624 sum1
= _mm_mul_ps(CC9
,x4
);
625 sum2
= _mm_mul_ps(CC7
,x4
);
626 sum1
= _mm_add_ps(sum1
,CC5
);
627 sum2
= _mm_add_ps(sum2
,CC3
);
628 sum1
= _mm_mul_ps(sum1
,x4
);
629 sum2
= _mm_mul_ps(sum2
,x2
);
631 sum1
= _mm_add_ps(sum1
,sum2
);
632 sum1
= _mm_sub_ps(sum1
,mone
);
633 sum1
= _mm_mul_ps(sum1
,x
);
634 y
= _mm_add_ps(y
,sum1
);
636 y
= _mm_xor_ps(y
,sign
);
/* Two-argument arctangent of (y, x) for each element.
 *
 * Outline of the visible code:
 *  - z = atan(y * (1/x)) is computed for every element;
 *  - four masks pick out the axis cases: x == 0 with y < 0 (value
 *    minushalfpi), y == 0 with x not negative (value 0), x == 0 with y > 0
 *    (value halfpi), and y == 0 with x < 0 (value pi); for those elements z
 *    is replaced by the listed constant;
 *  - a quadrant correction w is prepared: pi where x < 0 and y is not
 *    negative, minuspi where x < 0 and y < 0, and zero for the axis cases.
 * NOTE(review): the return type, braces, some declarations, the statement
 * that combines z with w, and the return statement are not visible in this
 * listing.
 * NOTE(review): M_PI is not part of ISO C; this relies on <math.h> exposing it.
 */
643 gmx_mm_atan2_ps(__m128 y
, __m128 x
)
645 const __m128 pi
= _mm_set1_ps(M_PI
);
646 const __m128 minuspi
= _mm_set1_ps(-M_PI
);
647 const __m128 halfpi
= _mm_set1_ps(M_PI
/2.0);
648 const __m128 minushalfpi
= _mm_set1_ps(-M_PI
/2.0);
652 __m128 maskx_lt
,maskx_eq
;
653 __m128 masky_lt
,masky_eq
;
654 __m128 mask1
,mask2
,mask3
,mask4
,maskall
;
656 maskx_lt
= _mm_cmplt_ps(x
,_mm_setzero_ps());
657 masky_lt
= _mm_cmplt_ps(y
,_mm_setzero_ps());
658 maskx_eq
= _mm_cmpeq_ps(x
,_mm_setzero_ps());
659 masky_eq
= _mm_cmpeq_ps(y
,_mm_setzero_ps());
661 z
= _mm_mul_ps(y
,gmx_mm_inv_ps(x
));
662 z
= gmx_mm_atan_ps(z
);
664 mask1
= _mm_and_ps(maskx_eq
,masky_lt
);
665 mask2
= _mm_andnot_ps(maskx_lt
,masky_eq
);
666 mask3
= _mm_andnot_ps( _mm_or_ps(masky_lt
,masky_eq
) , maskx_eq
);
667 mask4
= _mm_and_ps(masky_eq
,maskx_lt
);
669 maskall
= _mm_or_ps( _mm_or_ps(mask1
,mask2
), _mm_or_ps(mask3
,mask4
) );
671 z
= _mm_andnot_ps(maskall
,z
);
672 z1
= _mm_and_ps(mask1
,minushalfpi
);
673 z3
= _mm_and_ps(mask3
,halfpi
);
674 z4
= _mm_and_ps(mask4
,pi
);
676 z
= _mm_or_ps( _mm_or_ps(z
,z1
), _mm_or_ps(z3
,z4
) );
678 mask1
= _mm_andnot_ps(masky_lt
,maskx_lt
);
679 mask2
= _mm_and_ps(maskx_lt
,masky_lt
);
681 w
= _mm_or_ps( _mm_and_ps(mask1
,pi
), _mm_and_ps(mask2
,minuspi
) );
682 w
= _mm_andnot_ps(maskall
,w
);
689 /* Load a single value from 1-4 places, merge into xmm register */
/* Load one float from each of ptr1..ptr4 and pack them into xmm1 so that
 * element k of xmm1 holds the value read through ptr(k+1).
 */
691 #define GMX_MM_LOAD_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,xmm1) \
693 __m128 _txmm2,_txmm3,_txmm4; \
694 xmm1 = _mm_load_ss(ptr1); \
695 _txmm2 = _mm_load_ss(ptr2); \
696 _txmm3 = _mm_load_ss(ptr3); \
697 _txmm4 = _mm_load_ss(ptr4); \
698 xmm1 = _mm_unpacklo_ps(xmm1,_txmm3); \
699 _txmm2 = _mm_unpacklo_ps(_txmm2,_txmm4); \
700 xmm1 = _mm_unpacklo_ps(xmm1,_txmm2); \
/* Load one float from each of ptr1..ptr3 into elements 0..2 of xmm1;
 * element 3 ends up zero.
 */
704 #define GMX_MM_LOAD_3VALUES_PS(ptr1,ptr2,ptr3,xmm1) \
706 __m128 _txmm2,_txmm3; \
707 xmm1 = _mm_load_ss(ptr1); \
708 _txmm2 = _mm_load_ss(ptr2); \
709 _txmm3 = _mm_load_ss(ptr3); \
710 xmm1 = _mm_unpacklo_ps(xmm1,_txmm3); \
711 xmm1 = _mm_unpacklo_ps(xmm1,_txmm2); \
/* Load one float from each of ptr1 and ptr2 into elements 0 and 1 of xmm1;
 * elements 2 and 3 end up zero.
 * NOTE(review): the declaration of _txmm2 is not visible in this listing.
 */
715 #define GMX_MM_LOAD_2VALUES_PS(ptr1,ptr2,xmm1) \
718 xmm1 = _mm_load_ss(ptr1); \
719 _txmm2 = _mm_load_ss(ptr2); \
720 xmm1 = _mm_unpacklo_ps(xmm1,_txmm2); \
/* Load a single float from ptr1 into element 0 of xmm1 (the scalar load
 * zeroes the other three elements).
 */
724 #define GMX_MM_LOAD_1VALUE_PS(ptr1,xmm1) \
726 xmm1 = _mm_load_ss(ptr1); \
729 /* Store data in an xmm register into 1-4 different places */
/* Scatter the four elements of xmm1: element k is stored through ptr(k+1).
 * xmm1 itself is not modified.
 */
730 #define GMX_MM_STORE_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,xmm1) \
732 __m128 _txmm2,_txmm3,_txmm4; \
733 _txmm3 = _mm_movehl_ps(_mm_setzero_ps(),xmm1); \
734 _txmm2 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1)); \
735 _txmm4 = _mm_shuffle_ps(_txmm3,_txmm3,_MM_SHUFFLE(1,1,1,1)); \
736 _mm_store_ss(ptr1,xmm1); \
737 _mm_store_ss(ptr2,_txmm2); \
738 _mm_store_ss(ptr3,_txmm3); \
739 _mm_store_ss(ptr4,_txmm4); \
/* Store elements 0, 1 and 2 of xmm1 through ptr1, ptr2 and ptr3
 * respectively; element 3 is ignored.
 */
743 #define GMX_MM_STORE_3VALUES_PS(ptr1,ptr2,ptr3,xmm1) \
745 __m128 _txmm2,_txmm3; \
746 _txmm3 = _mm_movehl_ps(_mm_setzero_ps(),xmm1); \
747 _txmm2 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1)); \
748 _mm_store_ss(ptr1,xmm1); \
749 _mm_store_ss(ptr2,_txmm2); \
750 _mm_store_ss(ptr3,_txmm3); \
/* Store elements 0 and 1 of xmm1 through ptr1 and ptr2 respectively.
 * NOTE(review): the declaration of _txmm2 is not visible in this listing.
 */
754 #define GMX_MM_STORE_2VALUES_PS(ptr1,ptr2,xmm1) \
757 _txmm2 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1)); \
758 _mm_store_ss(ptr1,xmm1); \
759 _mm_store_ss(ptr2,_txmm2); \
/* Store element 0 of xmm1 through ptr1. */
763 #define GMX_MM_STORE_1VALUE_PS(ptr1,xmm1) \
765 _mm_store_ss(ptr1,xmm1); \
769 /* Similar to store, but increments value in memory */
/* Read-modify-write of eight scattered floats: the values at ptr1..ptr4 are
 * incremented by the elements of xmm1 and the values at ptr5..ptr8 by the
 * elements of xmm2, using the 4-value load and store macros.
 */
770 #define GMX_MM_INCREMENT_8VALUES_PS(ptr1,ptr2,ptr3,ptr4,ptr5,ptr6,ptr7,ptr8,xmm1,xmm2) \
772 __m128 _tincr1,_tincr2; \
773 GMX_MM_LOAD_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,_tincr1); \
774 GMX_MM_LOAD_4VALUES_PS(ptr5,ptr6,ptr7,ptr8,_tincr2); \
775 _tincr1 = _mm_add_ps(_tincr1,xmm1); \
776 _tincr2 = _mm_add_ps(_tincr2,xmm2); \
777 GMX_MM_STORE_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,_tincr1); \
778 GMX_MM_STORE_4VALUES_PS(ptr5,ptr6,ptr7,ptr8,_tincr2); \
/* Increment the floats at ptr1..ptr4 by elements 0..3 of xmm1.
 * NOTE(review): the declaration of _tincr is not visible in this listing.
 */
781 #define GMX_MM_INCREMENT_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,xmm1) \
784 GMX_MM_LOAD_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,_tincr); \
785 _tincr = _mm_add_ps(_tincr,xmm1); \
786 GMX_MM_STORE_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,_tincr); \
/* Increment the floats at ptr1..ptr3 by elements 0..2 of xmm1.
 * NOTE(review): the declaration of _tincr is not visible in this listing.
 */
789 #define GMX_MM_INCREMENT_3VALUES_PS(ptr1,ptr2,ptr3,xmm1) \
792 GMX_MM_LOAD_3VALUES_PS(ptr1,ptr2,ptr3,_tincr); \
793 _tincr = _mm_add_ps(_tincr,xmm1); \
794 GMX_MM_STORE_3VALUES_PS(ptr1,ptr2,ptr3,_tincr); \
/* Increment the floats at ptr1 and ptr2 by elements 0 and 1 of xmm1.
 * NOTE(review): the declaration of _tincr is not visible in this listing.
 */
797 #define GMX_MM_INCREMENT_2VALUES_PS(ptr1,ptr2,xmm1) \
800 GMX_MM_LOAD_2VALUES_PS(ptr1,ptr2,_tincr); \
801 _tincr = _mm_add_ps(_tincr,xmm1); \
802 GMX_MM_STORE_2VALUES_PS(ptr1,ptr2,_tincr); \
/* Increment the float at ptr1 by element 0 of xmm1 (scalar add; the other
 * elements of xmm1 do not contribute).
 * NOTE(review): the declaration of _tincr is not visible in this listing.
 */
805 #define GMX_MM_INCREMENT_1VALUE_PS(ptr1,xmm1) \
808 GMX_MM_LOAD_1VALUE_PS(ptr1,_tincr); \
809 _tincr = _mm_add_ss(_tincr,xmm1); \
810 GMX_MM_STORE_1VALUE_PS(ptr1,_tincr); \
815 /* Routines to load pairs from 1-4 places, put in two separate xmm registers. Useful to load LJ parameters! */
/* Load two consecutive floats from each of ptr1..ptr4 and de-interleave
 * them: c6 receives the first float of every pair, c12 the second, with
 * element k coming from ptr(k+1).
 */
816 #define GMX_MM_LOAD_4PAIRS_PS(ptr1,ptr2,ptr3,ptr4,c6,c12) \
818 __m128 _tmp1,_tmp2,_tmp3,_tmp4; \
819 _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1)); \
820 _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2)); \
821 _tmp3 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3)); \
822 _tmp4 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr4)); \
823 _tmp1 = _mm_unpacklo_ps(_tmp1,_tmp3); \
824 _tmp2 = _mm_unpacklo_ps(_tmp2,_tmp4); \
825 c6 = _mm_unpacklo_ps(_tmp1,_tmp2); \
826 c12 = _mm_unpackhi_ps(_tmp1,_tmp2); \
/* Load two consecutive floats from each of ptr1..ptr3: elements 0..2 of c6
 * receive the first float of each pair and elements 0..2 of c12 the second;
 * element 3 of both is zero.
 */
829 #define GMX_MM_LOAD_3PAIRS_PS(ptr1,ptr2,ptr3,c6,c12) \
831 __m128 _tmp1,_tmp2,_tmp3; \
832 _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1)); \
833 _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2)); \
834 _tmp3 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3)); \
835 _tmp1 = _mm_unpacklo_ps(_tmp1,_tmp3); \
836 _tmp2 = _mm_unpacklo_ps(_tmp2,_mm_setzero_ps()); \
837 c6 = _mm_unpacklo_ps(_tmp1,_tmp2); \
838 c12 = _mm_unpackhi_ps(_tmp1,_tmp2); \
/* Load two consecutive floats from each of ptr1 and ptr2. Elements 0 and 1
 * of c6 receive the first float of each pair; elements 0 and 1 of c12 the
 * second. Elements 2 and 3 of c6 also hold the second floats, and elements
 * 2 and 3 of c12 keep whatever c12 contained before the macro ran.
 */
842 #define GMX_MM_LOAD_2PAIRS_PS(ptr1,ptr2,c6,c12) \
844 __m128 _tmp1,_tmp2; \
845 _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1)); \
846 _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2)); \
847 c6 = _mm_unpacklo_ps(_tmp1,_tmp2); \
848 c12 = _mm_movehl_ps(c12,c6); \
/* Load two consecutive floats starting at ptr1: the first goes to element 0
 * of c6 and the second to element 0 of c12 (the scalar loads zero the other
 * elements). ptr1 is parenthesised before the offset is added so that an
 * expression argument, e.g. a conditional, addresses the intended element.
 */
#define GMX_MM_LOAD_1PAIR_PS(ptr1,c6,c12) \
{ \
    c6  = _mm_load_ss(ptr1); \
    c12 = _mm_load_ss((ptr1)+1); \
}
858 /* Routines to load 1-4 rvecs from 1-4 places.
859 * We mainly use these to load coordinates. The extra routines
860 * are very efficient for the water-water loops, since we e.g.
861 * know that a TIP4p water has 4 atoms, so we should load 12 floats+shuffle.
/* Load three consecutive floats starting at ptr1 into element 0 of jx1, jy1
 * and jz1 respectively (scalar loads; the other elements are zeroed).
 */
863 #define GMX_MM_LOAD_1RVEC_1POINTER_PS(ptr1,jx1,jy1,jz1) { \
864 jx1 = _mm_load_ss(ptr1); \
865 jy1 = _mm_load_ss((ptr1)+1); \
866 jz1 = _mm_load_ss((ptr1)+2); \
/* Load six consecutive floats starting at ptr1 into element 0 of
 * jx1, jy1, jz1, jx2, jy2, jz2, in that order.
 */
869 #define GMX_MM_LOAD_2RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2) { \
870 jx1 = _mm_load_ss(ptr1); \
871 jy1 = _mm_load_ss((ptr1)+1); \
872 jz1 = _mm_load_ss((ptr1)+2); \
873 jx2 = _mm_load_ss((ptr1)+3); \
874 jy2 = _mm_load_ss((ptr1)+4); \
875 jz2 = _mm_load_ss((ptr1)+5); \
/* Load nine consecutive floats starting at ptr1 into element 0 of
 * jx1, jy1, jz1, jx2, jy2, jz2, jx3, jy3, jz3, in that order.
 */
879 #define GMX_MM_LOAD_3RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
880 jx1 = _mm_load_ss(ptr1); \
881 jy1 = _mm_load_ss((ptr1)+1); \
882 jz1 = _mm_load_ss((ptr1)+2); \
883 jx2 = _mm_load_ss((ptr1)+3); \
884 jy2 = _mm_load_ss((ptr1)+4); \
885 jz2 = _mm_load_ss((ptr1)+5); \
886 jx3 = _mm_load_ss((ptr1)+6); \
887 jy3 = _mm_load_ss((ptr1)+7); \
888 jz3 = _mm_load_ss((ptr1)+8); \
/* Load twelve consecutive floats starting at ptr1 into element 0 of
 * jx1, jy1, jz1, ..., jx4, jy4, jz4, in that order.
 */
892 #define GMX_MM_LOAD_4RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
893 jx1 = _mm_load_ss(ptr1); \
894 jy1 = _mm_load_ss((ptr1)+1); \
895 jz1 = _mm_load_ss((ptr1)+2); \
896 jx2 = _mm_load_ss((ptr1)+3); \
897 jy2 = _mm_load_ss((ptr1)+4); \
898 jz2 = _mm_load_ss((ptr1)+5); \
899 jx3 = _mm_load_ss((ptr1)+6); \
900 jy3 = _mm_load_ss((ptr1)+7); \
901 jz3 = _mm_load_ss((ptr1)+8); \
902 jx4 = _mm_load_ss((ptr1)+9); \
903 jy4 = _mm_load_ss((ptr1)+10); \
904 jz4 = _mm_load_ss((ptr1)+11); \
/* Load one 3-float vector from each of ptr1 and ptr2 and transpose:
 * elements 0 and 1 of jx1, jy1 and jz1 hold the x, y and z components read
 * from ptr1 and ptr2 respectively. The upper two elements of the outputs
 * are not meaningful. jz1 is derived from jy1 alone, so the previous
 * contents of the caller's jz1 are never read.
 */
#define GMX_MM_LOAD_1RVEC_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1) { \
    __m128 _tmp1,_tmp2; \
    _tmp1 = _mm_load_ss(ptr1); \
    _tmp2 = _mm_load_ss(ptr2); \
    _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)((ptr1)+1)); \
    _tmp2 = _mm_loadh_pi(_tmp2,(__m64 *)((ptr2)+1)); \
    jx1   = _mm_unpacklo_ps(_tmp1,_tmp2); \
    jy1   = _mm_unpackhi_ps(_tmp1,_tmp2); \
    jz1   = _mm_movehl_ps(jy1,jy1); \
}
/* Load six consecutive floats (two 3-float vectors) from each of ptr1 and
 * ptr2 and transpose: elements 0 and 1 of every output hold the named
 * component read from ptr1 and ptr2 respectively. The upper two elements of
 * the outputs carry leftover data from the shuffles.
 */
920 #define GMX_MM_LOAD_2RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2) { \
921 __m128 _tmp1, _tmp2; \
922 _tmp1 = _mm_loadu_ps(ptr1); \
923 jy1 = _mm_loadu_ps(ptr2); \
924 jy2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4)); \
925 _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2+4)); \
926 jx1 = _mm_unpacklo_ps(_tmp1,jy1); \
927 jz1 = _mm_unpackhi_ps(_tmp1,jy1); \
928 jy2 = _mm_unpacklo_ps(jy2,_tmp2); \
929 jy1 = _mm_movehl_ps(jx1,jx1); \
930 jx2 = _mm_movehl_ps(jz1,jz1); \
931 jz2 = _mm_movehl_ps(jy2,jy2); \
/* Load nine consecutive floats (three 3-float vectors) from each of ptr1 and
 * ptr2 and transpose: elements 0 and 1 of every output hold the named
 * component read from ptr1 and ptr2 respectively. The upper two elements of
 * the outputs are not meaningful.
 */
935 #define GMX_MM_LOAD_3RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
936 __m128 _tmp1, _tmp2, _tmp3; \
937 _tmp1 = _mm_loadu_ps(ptr1); \
938 jy1 = _mm_loadu_ps(ptr2); \
939 _tmp2 = _mm_loadu_ps(ptr1+4); \
940 jz2 = _mm_loadu_ps(ptr2+4); \
941 jz3 = _mm_load_ss(ptr1+8); \
942 _tmp3 = _mm_load_ss(ptr2+8); \
943 jx1 = _mm_unpacklo_ps(_tmp1,jy1); \
944 jz1 = _mm_unpackhi_ps(_tmp1,jy1); \
945 jy2 = _mm_unpacklo_ps(_tmp2,jz2); \
946 jx3 = _mm_unpackhi_ps(_tmp2,jz2); \
947 jy1 = _mm_movehl_ps(jx1,jx1); \
948 jx2 = _mm_movehl_ps(jz1,jz1); \
949 jz2 = _mm_movehl_ps(jy2,jy2); \
950 jy3 = _mm_movehl_ps(jx3,jx3); \
951 jz3 = _mm_unpacklo_ps(jz3,_tmp3); \
/* Load twelve consecutive floats (four 3-float vectors) from each of ptr1
 * and ptr2 and transpose: elements 0 and 1 of every output hold the named
 * component read from ptr1 and ptr2 respectively. The upper two elements of
 * the outputs are not meaningful.
 */
955 #define GMX_MM_LOAD_4RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
956 __m128 _tmp1, _tmp2, _tmp3,_tmp4; \
957 _tmp1 = _mm_loadu_ps(ptr1); \
958 jy1 = _mm_loadu_ps(ptr2); \
959 _tmp2 = _mm_loadu_ps(ptr1+4); \
960 jz2 = _mm_loadu_ps(ptr2+4); \
961 _tmp3 = _mm_loadu_ps(ptr1+8); \
962 _tmp4 = _mm_loadu_ps(ptr2+8); \
963 jx1 = _mm_unpacklo_ps(_tmp1,jy1); \
964 jz1 = _mm_unpackhi_ps(_tmp1,jy1); \
965 jy2 = _mm_unpacklo_ps(_tmp2,jz2); \
966 jx3 = _mm_unpackhi_ps(_tmp2,jz2); \
967 jz3 = _mm_unpacklo_ps(_tmp3,_tmp4); \
968 jy4 = _mm_unpackhi_ps(_tmp3,_tmp4); \
969 jy1 = _mm_movehl_ps(jx1,jx1); \
970 jx2 = _mm_movehl_ps(jz1,jz1); \
971 jz2 = _mm_movehl_ps(jy2,jy2); \
972 jy3 = _mm_movehl_ps(jx3,jx3); \
973 jx4 = _mm_movehl_ps(jz3,jz3); \
974 jz4 = _mm_movehl_ps(jy4,jy4); \
/* Load one 3-float vector from each of ptr1..ptr3 and transpose: elements
 * 0..2 of jx1, jy1 and jz1 hold the x, y and z components read from ptr1,
 * ptr2 and ptr3 respectively. Element 3 of the outputs is not meaningful.
 */
978 #define GMX_MM_LOAD_1RVEC_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1) { \
979 __m128 _tmp1,_tmp3,_tmp4; \
980 jx1 = _mm_load_ss(ptr1); \
981 jy1 = _mm_load_ss(ptr2); \
982 jz1 = _mm_load_ss(ptr3); \
983 jx1 = _mm_loadh_pi(jx1,(__m64 *)(ptr1+1)); \
984 jy1 = _mm_loadh_pi(jy1,(__m64 *)(ptr2+1)); \
985 jz1 = _mm_loadh_pi(jz1,(__m64 *)(ptr3+1)); \
986 _tmp1 = _mm_unpacklo_ps(jx1,jy1); \
987 _tmp3 = _mm_unpackhi_ps(jx1,jy1); \
988 _tmp4 = _mm_unpackhi_ps(jz1,jz1); \
989 jx1 = _mm_movelh_ps(_tmp1,jz1); \
990 jy1 = _mm_movelh_ps(_tmp3,_tmp4); \
991 jz1 = _mm_movehl_ps(_tmp4,_tmp3); \
/* Load six consecutive floats (two 3-float vectors) from each of ptr1..ptr3
 * and transpose: elements 0..2 of every output hold the named component read
 * from ptr1, ptr2 and ptr3 respectively; element 3 is zero. The first four
 * floats per pointer go through _MM_TRANSPOSE4_PS with a zero fourth row;
 * the remaining two are merged with unpack operations.
 */
995 #define GMX_MM_LOAD_2RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2) { \
996 __m128 _tmp1, _tmp2; \
997 jx1 = _mm_loadu_ps(ptr1); \
998 jy1 = _mm_loadu_ps(ptr2); \
999 jz1 = _mm_loadu_ps(ptr3); \
1000 jx2 = _mm_setzero_ps(); \
1001 _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2); \
1002 _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4)); \
1003 jz2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2+4)); \
1004 _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3+4)); \
1005 _tmp1 = _mm_unpacklo_ps(_tmp1,_tmp2); \
1006 jz2 = _mm_unpacklo_ps(jz2,_mm_setzero_ps()); \
1007 jy2 = _mm_unpacklo_ps(_tmp1,jz2); \
1008 jz2 = _mm_unpackhi_ps(_tmp1,jz2); \
/* Load nine consecutive floats (three 3-float vectors) from each of
 * ptr1..ptr3 and transpose: elements 0..2 of every output hold the named
 * component read from ptr1, ptr2 and ptr3 respectively; element 3 is zero.
 * Floats 0-3 and 4-7 per pointer go through _MM_TRANSPOSE4_PS with a zero
 * fourth row; float 8 is merged with unpack operations.
 */
1012 #define GMX_MM_LOAD_3RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
1013 __m128 _tmp1, _tmp2; \
1014 jx1 = _mm_loadu_ps(ptr1); \
1015 jy1 = _mm_loadu_ps(ptr2); \
1016 jz1 = _mm_loadu_ps(ptr3); \
1017 jx2 = _mm_setzero_ps(); \
1018 _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2); \
1019 jy2 = _mm_loadu_ps(ptr1+4); \
1020 jz2 = _mm_loadu_ps(ptr2+4); \
1021 jx3 = _mm_loadu_ps(ptr3+4); \
1022 jy3 = _mm_setzero_ps(); \
1023 _MM_TRANSPOSE4_PS(jy2,jz2,jx3,jy3); \
1024 jz3 = _mm_load_ss(ptr1+8); \
1025 _tmp1 = _mm_load_ss(ptr2+8); \
1026 _tmp2 = _mm_load_ss(ptr3+8); \
1027 jz3 = _mm_unpacklo_ps(jz3,_tmp2); \
1028 _tmp1 = _mm_unpacklo_ps(_tmp1,_mm_setzero_ps()); \
1029 jz3 = _mm_unpacklo_ps(jz3,_tmp1); \
/* Load twelve consecutive floats (four 3-float vectors) from each of
 * ptr1..ptr3 and transpose with three _MM_TRANSPOSE4_PS calls, each using a
 * zero fourth row: elements 0..2 of every output hold the named component
 * read from ptr1, ptr2 and ptr3 respectively; element 3 is zero.
 */
1033 #define GMX_MM_LOAD_4RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
1034 jx1 = _mm_loadu_ps(ptr1); \
1035 jy1 = _mm_loadu_ps(ptr2); \
1036 jz1 = _mm_loadu_ps(ptr3); \
1037 jx2 = _mm_setzero_ps(); \
1038 _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2); \
1039 jy2 = _mm_loadu_ps(ptr1+4); \
1040 jz2 = _mm_loadu_ps(ptr2+4); \
1041 jx3 = _mm_loadu_ps(ptr3+4); \
1042 jy3 = _mm_setzero_ps(); \
1043 _MM_TRANSPOSE4_PS(jy2,jz2,jx3,jy3); \
1044 jz3 = _mm_loadu_ps(ptr1+8); \
1045 jx4 = _mm_loadu_ps(ptr2+8); \
1046 jy4 = _mm_loadu_ps(ptr3+8); \
1047 jz4 = _mm_setzero_ps(); \
1048 _MM_TRANSPOSE4_PS(jz3,jx4,jy4,jz4); \
/* Load one rvec (three floats) from each of four float pointers and
 * transpose them into SIMD registers.
 *
 * ptr1..ptr4 : source pointers; exactly three floats are read from each
 *              (one scalar load plus one 64-bit load at offset 1), with no
 *              alignment requirement beyond that of float. They are
 *              expanded more than once, so they must be free of side effects.
 * jx1,jy1,jz1: assignable __m128 outputs. Lane k of jx1/jy1/jz1 receives
 *              float 0/1/2 of ptr(k+1).
 */
#define GMX_MM_LOAD_1RVEC_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5; \
    jx1   = _mm_load_ss(ptr1); \
    _tmp1 = _mm_load_ss(ptr2); \
    jy1   = _mm_load_ss(ptr3); \
    jz1   = _mm_load_ss(ptr4); \
    jx1   = _mm_loadh_pi(jx1,(__m64 *)((ptr1)+1));     /* (x, 0, y, z) per pointer */ \
    _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)((ptr2)+1)); \
    jy1   = _mm_loadh_pi(jy1,(__m64 *)((ptr3)+1)); \
    jz1   = _mm_loadh_pi(jz1,(__m64 *)((ptr4)+1)); \
    _tmp2 = _mm_unpacklo_ps(jx1,_tmp1); \
    _tmp3 = _mm_unpacklo_ps(jy1,jz1); \
    _tmp4 = _mm_unpackhi_ps(jx1,_tmp1); \
    _tmp5 = _mm_unpackhi_ps(jy1,jz1); \
    jx1   = _mm_movelh_ps(_tmp2,_tmp3); \
    jy1   = _mm_movelh_ps(_tmp4,_tmp5); \
    jz1   = _mm_movehl_ps(_tmp5,_tmp4); \
}
/* Load two consecutive rvecs (six floats) from each of four float pointers
 * and transpose them into SIMD registers.
 *
 * ptr1..ptr4 : source pointers; unaligned loads are used and exactly six
 *              floats are read from each. They are expanded more than once,
 *              so they must be free of side effects.
 * jx1..jz2   : assignable __m128 outputs. Lane k of the i-th output (order
 *              jx1, jy1, jz1, jx2, jy2, jz2) receives float i of ptr(k+1).
 */
#define GMX_MM_LOAD_2RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1, _tmp2; \
    jx1   = _mm_loadu_ps(ptr1); \
    jy1   = _mm_loadu_ps(ptr2); \
    jz1   = _mm_loadu_ps(ptr3); \
    jx2   = _mm_loadu_ps(ptr4); \
    _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2); \
    jy2   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr1)+4)); \
    jz2   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr2)+4)); \
    _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr3)+4)); \
    _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr4)+4)); \
    _tmp1 = _mm_unpacklo_ps(jy2,_tmp1); \
    _tmp2 = _mm_unpacklo_ps(jz2,_tmp2); \
    jy2   = _mm_unpacklo_ps(_tmp1,_tmp2); \
    jz2   = _mm_unpackhi_ps(_tmp1,_tmp2); \
}
/* Load three consecutive rvecs (nine floats) from each of four float
 * pointers and transpose them into SIMD registers.
 *
 * ptr1..ptr4 : source pointers; unaligned loads are used and exactly nine
 *              floats are read from each. They are expanded more than once,
 *              so they must be free of side effects.
 * jx1..jz3   : assignable __m128 outputs. Lane k of the i-th output (order
 *              jx1, jy1, jz1, jx2, ..., jz3) receives float i of ptr(k+1).
 */
#define GMX_MM_LOAD_3RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1, _tmp2, _tmp3; \
    jx1   = _mm_loadu_ps(ptr1); \
    jy1   = _mm_loadu_ps(ptr2); \
    jz1   = _mm_loadu_ps(ptr3); \
    jx2   = _mm_loadu_ps(ptr4); \
    _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2); \
    jy2   = _mm_loadu_ps((ptr1)+4); \
    jz2   = _mm_loadu_ps((ptr2)+4); \
    jx3   = _mm_loadu_ps((ptr3)+4); \
    jy3   = _mm_loadu_ps((ptr4)+4); \
    _MM_TRANSPOSE4_PS(jy2,jz2,jx3,jy3); \
    jz3   = _mm_load_ss((ptr1)+8); /* ninth float of each pointer */ \
    _tmp1 = _mm_load_ss((ptr2)+8); \
    _tmp2 = _mm_load_ss((ptr3)+8); \
    _tmp3 = _mm_load_ss((ptr4)+8); \
    jz3   = _mm_unpacklo_ps(jz3,_tmp2); \
    _tmp1 = _mm_unpacklo_ps(_tmp1,_tmp3); \
    jz3   = _mm_unpacklo_ps(jz3,_tmp1); \
}
/* Load four consecutive rvecs (twelve floats) from each of four float
 * pointers and transpose them into SIMD registers.
 *
 * ptr1..ptr4 : source pointers; unaligned loads are used and exactly twelve
 *              floats are read from each. They are expanded more than once,
 *              so they must be free of side effects.
 * jx1..jz4   : assignable __m128 outputs. Lane k of the i-th output (order
 *              jx1, jy1, jz1, jx2, ..., jz4) receives float i of ptr(k+1).
 *
 * Implemented as three independent 4x4 transposes, one per group of four
 * floats.
 */
#define GMX_MM_LOAD_4RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    jx1 = _mm_loadu_ps(ptr1); \
    jy1 = _mm_loadu_ps(ptr2); \
    jz1 = _mm_loadu_ps(ptr3); \
    jx2 = _mm_loadu_ps(ptr4); \
    _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2); \
    jy2 = _mm_loadu_ps((ptr1)+4); \
    jz2 = _mm_loadu_ps((ptr2)+4); \
    jx3 = _mm_loadu_ps((ptr3)+4); \
    jy3 = _mm_loadu_ps((ptr4)+4); \
    _MM_TRANSPOSE4_PS(jy2,jz2,jx3,jy3); \
    jz3 = _mm_loadu_ps((ptr1)+8); \
    jx4 = _mm_loadu_ps((ptr2)+8); \
    jy4 = _mm_loadu_ps((ptr3)+8); \
    jz4 = _mm_loadu_ps((ptr4)+8); \
    _MM_TRANSPOSE4_PS(jz3,jx4,jy4,jz4); \
}
/* Routines to increment rvecs in memory, typically used for j particle force updates */
/* Add lane 0 of (jx1,jy1,jz1) to the three floats at ptr1[0..2].
 *
 * ptr1        : destination; exactly three floats are read and written
 *               (scalar access at offset 0, 64-bit access at offset 1).
 *               Expanded more than once, so it must be free of side effects.
 * jx1,jy1,jz1 : __m128 variables; only lane 0 contributes. jx1 and jy1 are
 *               overwritten as scratch.
 */
#define GMX_MM_INCREMENT_1RVEC_1POINTER_PS(ptr1,jx1,jy1,jz1) { \
    __m128 _tmp1; \
    jy1   = _mm_unpacklo_ps(jy1,jz1); \
    jx1   = _mm_movelh_ps(jx1,jy1);   /* (x, -, y, z): matches the load layout below */ \
    _tmp1 = _mm_load_ss(ptr1); \
    _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)((ptr1)+1)); \
    _tmp1 = _mm_add_ps(_tmp1,jx1); \
    _mm_store_ss(ptr1,_tmp1); \
    _mm_storeh_pi((__m64 *)((ptr1)+1),_tmp1); \
}
/* Add lane 0 of (jx1,jy1,jz1,jx2,jy2,jz2) to the six floats at ptr1[0..5].
 *
 * ptr1     : destination; exactly six floats are read and written using
 *            unaligned accesses. Expanded more than once, so it must be
 *            free of side effects.
 * jx1..jz2 : __m128 variables; only lane 0 contributes. jx1, jz1 and jy2
 *            are overwritten as scratch.
 */
#define GMX_MM_INCREMENT_2RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1, _tmp2; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr1)+4)); \
    jx1   = _mm_unpacklo_ps(jx1,jy1); \
    jz1   = _mm_unpacklo_ps(jz1,jx2); \
    jy2   = _mm_unpacklo_ps(jy2,jz2); \
    jx1   = _mm_movelh_ps(jx1,jz1); \
    _tmp1 = _mm_add_ps(_tmp1,jx1); \
    _tmp2 = _mm_add_ps(_tmp2,jy2); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storel_pi((__m64 *)((ptr1)+4),_tmp2); \
}
/* Add lane 0 of (jx1,...,jz3) to the nine floats at ptr1[0..8].
 *
 * ptr1     : destination; exactly nine floats are read and written using
 *            unaligned accesses. Expanded more than once, so it must be
 *            free of side effects.
 * jx1..jz3 : __m128 variables; only lane 0 contributes. jx1, jz1, jy2 and
 *            jx3 are overwritten as scratch.
 */
#define GMX_MM_INCREMENT_3RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1, _tmp2, _tmp3; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps((ptr1)+4); \
    _tmp3 = _mm_load_ss((ptr1)+8); \
    jx1   = _mm_unpacklo_ps(jx1,jy1); \
    jz1   = _mm_unpacklo_ps(jz1,jx2); \
    jy2   = _mm_unpacklo_ps(jy2,jz2); \
    jx3   = _mm_unpacklo_ps(jx3,jy3); \
    jx1   = _mm_movelh_ps(jx1,jz1); \
    jy2   = _mm_movelh_ps(jy2,jx3); \
    _tmp1 = _mm_add_ps(_tmp1,jx1); \
    _tmp2 = _mm_add_ps(_tmp2,jy2); \
    _tmp3 = _mm_add_ss(_tmp3,jz3); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps((ptr1)+4,_tmp2); \
    _mm_store_ss((ptr1)+8,_tmp3); \
}
/* Add lane 0 of (jx1,...,jz4) to the twelve floats at ptr1[0..11].
 *
 * ptr1     : destination; exactly twelve floats are read and written using
 *            unaligned accesses. Expanded more than once, so it must be
 *            free of side effects.
 * jx1..jz4 : __m128 variables; only lane 0 contributes. jx1, jz1, jy2, jx3,
 *            jz3 and jy4 are overwritten as scratch.
 */
#define GMX_MM_INCREMENT_4RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128 _tmp1, _tmp2, _tmp3; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps((ptr1)+4); \
    _tmp3 = _mm_loadu_ps((ptr1)+8); \
    jx1   = _mm_unpacklo_ps(jx1,jy1); \
    jz1   = _mm_unpacklo_ps(jz1,jx2); \
    jy2   = _mm_unpacklo_ps(jy2,jz2); \
    jx3   = _mm_unpacklo_ps(jx3,jy3); \
    jz3   = _mm_unpacklo_ps(jz3,jx4); \
    jy4   = _mm_unpacklo_ps(jy4,jz4); \
    jx1   = _mm_movelh_ps(jx1,jz1); \
    jy2   = _mm_movelh_ps(jy2,jx3); \
    jz3   = _mm_movelh_ps(jz3,jy4); \
    _tmp1 = _mm_add_ps(_tmp1,jx1); \
    _tmp2 = _mm_add_ps(_tmp2,jy2); \
    _tmp3 = _mm_add_ps(_tmp3,jz3); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps((ptr1)+4,_tmp2); \
    _mm_storeu_ps((ptr1)+8,_tmp3); \
}
/* Add lane 0 of (jx1,jy1,jz1) to ptr1[0..2] and lane 1 to ptr2[0..2].
 *
 * ptr1,ptr2   : destinations; exactly three floats are read and written at
 *               each. Both are loaded before either is stored, so the two
 *               ranges must not overlap. Expanded more than once, so they
 *               must be free of side effects.
 * jx1,jy1,jz1 : __m128 variables; lanes 0 and 1 contribute. jx1 is
 *               overwritten as scratch.
 */
#define GMX_MM_INCREMENT_1RVEC_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4; \
    _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1)); \
    _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr2)); \
    _tmp2 = _mm_load_ss((ptr1)+2); \
    _tmp3 = _mm_load_ss((ptr2)+2); \
    jx1   = _mm_unpacklo_ps(jx1,jy1); \
    _tmp4 = _mm_shuffle_ps(jz1,jz1,_MM_SHUFFLE(0,0,0,1)); /* lane 1 of jz1 moved to lane 0 */ \
    _tmp1 = _mm_add_ps(_tmp1,jx1); \
    _mm_storel_pi((__m64 *)(ptr1),_tmp1); \
    _mm_storeh_pi((__m64 *)(ptr2),_tmp1); \
    _mm_store_ss((ptr1)+2,_mm_add_ss(_tmp2,jz1)); \
    _mm_store_ss((ptr2)+2,_mm_add_ss(_tmp3,_tmp4)); \
}
/* Add lane 0 of (jx1,...,jz2) to ptr1[0..5] and lane 1 to ptr2[0..5].
 *
 * ptr1,ptr2 : destinations; exactly six floats are read and written at
 *             each, using unaligned accesses. Both are loaded before either
 *             is stored, so the two ranges must not overlap. Expanded more
 *             than once, so they must be free of side effects.
 * jx1..jz2  : __m128 variables; lanes 0 and 1 contribute. jx1, jz1 and jy2
 *             are overwritten as scratch.
 */
#define GMX_MM_INCREMENT_2RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr2); \
    _tmp3 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr1)+4)); \
    _tmp3 = _mm_loadh_pi(_tmp3,(__m64 *)((ptr2)+4)); \
    jx1   = _mm_unpacklo_ps(jx1,jy1); \
    jz1   = _mm_unpacklo_ps(jz1,jx2); \
    jy2   = _mm_unpacklo_ps(jy2,jz2); \
    _tmp4 = _mm_movelh_ps(jx1,jz1); /* floats 0..3 for ptr1 */ \
    _tmp5 = _mm_movehl_ps(jz1,jx1); /* floats 0..3 for ptr2 */ \
    _tmp1 = _mm_add_ps(_tmp1,_tmp4); \
    _tmp2 = _mm_add_ps(_tmp2,_tmp5); \
    _tmp3 = _mm_add_ps(_tmp3,jy2); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr2,_tmp2); \
    _mm_storel_pi((__m64 *)((ptr1)+4),_tmp3); \
    _mm_storeh_pi((__m64 *)((ptr2)+4),_tmp3); \
}
/* Add lane 0 of (jx1,...,jz3) to ptr1[0..8] and lane 1 to ptr2[0..8].
 *
 * ptr1,ptr2 : destinations; exactly nine floats are read and written at
 *             each, using unaligned accesses. Both are loaded before either
 *             is stored, so the two ranges must not overlap. Expanded more
 *             than once, so they must be free of side effects.
 * jx1..jz3  : __m128 variables; lanes 0 and 1 contribute. jx1, jz1, jy2 and
 *             jx3 are overwritten as scratch.
 */
#define GMX_MM_INCREMENT_3RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11; \
    _tmp1  = _mm_loadu_ps(ptr1); \
    _tmp2  = _mm_loadu_ps((ptr1)+4); \
    _tmp3  = _mm_load_ss((ptr1)+8); \
    _tmp4  = _mm_loadu_ps(ptr2); \
    _tmp5  = _mm_loadu_ps((ptr2)+4); \
    _tmp6  = _mm_load_ss((ptr2)+8); \
    jx1    = _mm_unpacklo_ps(jx1,jy1); \
    jz1    = _mm_unpacklo_ps(jz1,jx2); \
    jy2    = _mm_unpacklo_ps(jy2,jz2); \
    jx3    = _mm_unpacklo_ps(jx3,jy3); \
    _tmp7  = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1)); /* lane 1 of jz3 moved to lane 0 */ \
    _tmp8  = _mm_movelh_ps(jx1,jz1); \
    _tmp9  = _mm_movehl_ps(jz1,jx1); \
    _tmp10 = _mm_movelh_ps(jy2,jx3); \
    _tmp11 = _mm_movehl_ps(jx3,jy2); \
    _tmp1  = _mm_add_ps(_tmp1,_tmp8); \
    _tmp2  = _mm_add_ps(_tmp2,_tmp10); \
    _tmp3  = _mm_add_ss(_tmp3,jz3); \
    _tmp4  = _mm_add_ps(_tmp4,_tmp9); \
    _tmp5  = _mm_add_ps(_tmp5,_tmp11); \
    _tmp6  = _mm_add_ss(_tmp6,_tmp7); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps((ptr1)+4,_tmp2); \
    _mm_store_ss((ptr1)+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps((ptr2)+4,_tmp5); \
    _mm_store_ss((ptr2)+8,_tmp6); \
}
/* Add lane 0 of (jx1,...,jz4) to ptr1[0..11] and lane 1 to ptr2[0..11].
 *
 * ptr1,ptr2 : destinations; exactly twelve floats are read and written at
 *             each, using unaligned accesses. Both are loaded before either
 *             is stored, so the two ranges must not overlap. Expanded more
 *             than once, so they must be free of side effects.
 * jx1..jz4  : __m128 variables; lanes 0 and 1 contribute. jx1, jz1, jy2,
 *             jx3, jz3 and jy4 are overwritten as scratch.
 */
#define GMX_MM_INCREMENT_4RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp8,_tmp9,_tmp10,_tmp11,_tmp12,_tmp13; \
    _tmp1  = _mm_loadu_ps(ptr1); \
    _tmp2  = _mm_loadu_ps((ptr1)+4); \
    _tmp3  = _mm_loadu_ps((ptr1)+8); \
    _tmp4  = _mm_loadu_ps(ptr2); \
    _tmp5  = _mm_loadu_ps((ptr2)+4); \
    _tmp6  = _mm_loadu_ps((ptr2)+8); \
    jx1    = _mm_unpacklo_ps(jx1,jy1); \
    jz1    = _mm_unpacklo_ps(jz1,jx2); \
    jy2    = _mm_unpacklo_ps(jy2,jz2); \
    jx3    = _mm_unpacklo_ps(jx3,jy3); \
    jz3    = _mm_unpacklo_ps(jz3,jx4); \
    jy4    = _mm_unpacklo_ps(jy4,jz4); \
    _tmp8  = _mm_movelh_ps(jx1,jz1); \
    _tmp9  = _mm_movehl_ps(jz1,jx1); \
    _tmp10 = _mm_movelh_ps(jy2,jx3); \
    _tmp11 = _mm_movehl_ps(jx3,jy2); \
    _tmp12 = _mm_movelh_ps(jz3,jy4); \
    _tmp13 = _mm_movehl_ps(jy4,jz3); \
    _tmp1  = _mm_add_ps(_tmp1,_tmp8); \
    _tmp2  = _mm_add_ps(_tmp2,_tmp10); \
    _tmp3  = _mm_add_ps(_tmp3,_tmp12); \
    _tmp4  = _mm_add_ps(_tmp4,_tmp9); \
    _tmp5  = _mm_add_ps(_tmp5,_tmp11); \
    _tmp6  = _mm_add_ps(_tmp6,_tmp13); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps((ptr1)+4,_tmp2); \
    _mm_storeu_ps((ptr1)+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps((ptr2)+4,_tmp5); \
    _mm_storeu_ps((ptr2)+8,_tmp6); \
}
/* Add lane k (k=0..2) of (jx1,jy1,jz1) to the three floats at ptr(k+1)[0..2].
 *
 * ptr1..ptr3  : destinations; exactly three floats are read and written at
 *               each. All are loaded before any is stored, so the ranges
 *               must not overlap. Expanded more than once, so they must be
 *               free of side effects.
 * jx1,jy1,jz1 : __m128 variables; lanes 0..2 contribute. jx1 is overwritten
 *               as scratch.
 *
 * Memory is handled as (x, -, y, z): lane 1 of each working register is
 * never stored.
 */
#define GMX_MM_INCREMENT_1RVEC_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7; \
    _tmp1 = _mm_load_ss(ptr1); \
    _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)((ptr1)+1)); \
    _tmp2 = _mm_load_ss(ptr2); \
    _tmp2 = _mm_loadh_pi(_tmp2,(__m64 *)((ptr2)+1)); \
    _tmp3 = _mm_load_ss(ptr3); \
    _tmp3 = _mm_loadh_pi(_tmp3,(__m64 *)((ptr3)+1)); \
    _tmp4 = _mm_unpacklo_ps(jy1,jz1); \
    _tmp5 = _mm_unpackhi_ps(jy1,jz1); \
    _tmp6 = _mm_shuffle_ps(jx1,_tmp4,_MM_SHUFFLE(3,2,0,1)); /* update for ptr2 */ \
    _tmp7 = _mm_shuffle_ps(jx1,jx1,_MM_SHUFFLE(0,0,0,2)); \
    jx1   = _mm_movelh_ps(jx1,_tmp4);                       /* update for ptr1 */ \
    _tmp7 = _mm_movelh_ps(_tmp7,_tmp5);                     /* update for ptr3 */ \
    _tmp1 = _mm_add_ps(_tmp1,jx1); \
    _tmp2 = _mm_add_ps(_tmp2,_tmp6); \
    _tmp3 = _mm_add_ps(_tmp3,_tmp7); \
    _mm_store_ss(ptr1,_tmp1); \
    _mm_storeh_pi((__m64 *)((ptr1)+1),_tmp1); \
    _mm_store_ss(ptr2,_tmp2); \
    _mm_storeh_pi((__m64 *)((ptr2)+1),_tmp2); \
    _mm_store_ss(ptr3,_tmp3); \
    _mm_storeh_pi((__m64 *)((ptr3)+1),_tmp3); \
}
/* Add lane k (k=0..2) of (jx1,...,jz2) to the six floats at ptr(k+1)[0..5].
 *
 * ptr1..ptr3 : destinations; exactly six floats are read and written at
 *              each, using unaligned accesses. All are loaded before any is
 *              stored, so the ranges must not overlap. Expanded more than
 *              once, so they must be free of side effects.
 * jx1..jz2   : __m128 variables; lanes 0..2 contribute. jx1, jz1 and jy2
 *              are overwritten as scratch.
 *
 * The 64-bit partial loads start from a zeroed register, so no
 * uninitialized __m128 value is ever read or fed into the additions.
 */
#define GMX_MM_INCREMENT_2RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
    _tmp1  = _mm_loadu_ps(ptr1); \
    _tmp2  = _mm_loadu_ps(ptr2); \
    _tmp3  = _mm_loadu_ps(ptr3); \
    _tmp4  = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr1)+4)); \
    _tmp4  = _mm_loadh_pi(_tmp4,(__m64 *)((ptr2)+4)); \
    _tmp5  = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr3)+4)); /* upper half unused, kept zero */ \
    _tmp6  = _mm_unpackhi_ps(jx1,jy1); \
    jx1    = _mm_unpacklo_ps(jx1,jy1); \
    _tmp7  = _mm_unpackhi_ps(jz1,jx2); \
    jz1    = _mm_unpacklo_ps(jz1,jx2); \
    _tmp8  = _mm_unpackhi_ps(jy2,jz2); \
    jy2    = _mm_unpacklo_ps(jy2,jz2); \
    _tmp9  = _mm_movelh_ps(jx1,jz1);     /* floats 0..3 for ptr1 */ \
    _tmp10 = _mm_movehl_ps(jz1,jx1);     /* floats 0..3 for ptr2 */ \
    _tmp6  = _mm_movelh_ps(_tmp6,_tmp7); /* floats 0..3 for ptr3 */ \
    _tmp1  = _mm_add_ps(_tmp1,_tmp9); \
    _tmp2  = _mm_add_ps(_tmp2,_tmp10); \
    _tmp3  = _mm_add_ps(_tmp3,_tmp6); \
    _tmp4  = _mm_add_ps(_tmp4,jy2); \
    _tmp5  = _mm_add_ps(_tmp5,_tmp8); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr2,_tmp2); \
    _mm_storeu_ps(ptr3,_tmp3); \
    _mm_storel_pi((__m64 *)((ptr1)+4),_tmp4); \
    _mm_storeh_pi((__m64 *)((ptr2)+4),_tmp4); \
    _mm_storel_pi((__m64 *)((ptr3)+4),_tmp5); \
}
/* Add lane k (k=0..2) of (jx1,...,jz3) to the nine floats at ptr(k+1)[0..8].
 *
 * ptr1..ptr3 : destinations; exactly nine floats are read and written at
 *              each, using unaligned accesses. All are loaded before any is
 *              stored, so the ranges must not overlap. Expanded more than
 *              once, so they must be free of side effects.
 * jx1..jz3   : __m128 variables; lanes 0..2 contribute. jx1, jz1, jy2 and
 *              jx3 are overwritten as scratch.
 */
#define GMX_MM_INCREMENT_3RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
    __m128 _tmp11,_tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19; \
    _tmp1  = _mm_loadu_ps(ptr1); \
    _tmp2  = _mm_loadu_ps((ptr1)+4); \
    _tmp3  = _mm_load_ss((ptr1)+8); \
    _tmp4  = _mm_loadu_ps(ptr2); \
    _tmp5  = _mm_loadu_ps((ptr2)+4); \
    _tmp6  = _mm_load_ss((ptr2)+8); \
    _tmp7  = _mm_loadu_ps(ptr3); \
    _tmp8  = _mm_loadu_ps((ptr3)+4); \
    _tmp9  = _mm_load_ss((ptr3)+8); \
    _tmp10 = _mm_unpackhi_ps(jx1,jy1); \
    jx1    = _mm_unpacklo_ps(jx1,jy1); \
    _tmp11 = _mm_unpackhi_ps(jz1,jx2); \
    jz1    = _mm_unpacklo_ps(jz1,jx2); \
    _tmp12 = _mm_unpackhi_ps(jy2,jz2); \
    jy2    = _mm_unpacklo_ps(jy2,jz2); \
    _tmp13 = _mm_unpackhi_ps(jx3,jy3); \
    jx3    = _mm_unpacklo_ps(jx3,jy3); \
    _tmp14 = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1)); /* lane 1 of jz3 moved to lane 0 */ \
    _tmp15 = _mm_movehl_ps(jz3,jz3);                       /* lane 2 of jz3 moved to lane 0 */ \
    _tmp16 = _mm_movelh_ps(jx1,jz1); \
    _tmp17 = _mm_movehl_ps(jz1,jx1); \
    _tmp10 = _mm_movelh_ps(_tmp10,_tmp11); \
    _tmp18 = _mm_movelh_ps(jy2,jx3); \
    _tmp19 = _mm_movehl_ps(jx3,jy2); \
    _tmp12 = _mm_movelh_ps(_tmp12,_tmp13); \
    _tmp1  = _mm_add_ps(_tmp1,_tmp16); \
    _tmp2  = _mm_add_ps(_tmp2,_tmp18); \
    _tmp3  = _mm_add_ss(_tmp3,jz3); \
    _tmp4  = _mm_add_ps(_tmp4,_tmp17); \
    _tmp5  = _mm_add_ps(_tmp5,_tmp19); \
    _tmp6  = _mm_add_ss(_tmp6,_tmp14); \
    _tmp7  = _mm_add_ps(_tmp7,_tmp10); \
    _tmp8  = _mm_add_ps(_tmp8,_tmp12); \
    _tmp9  = _mm_add_ss(_tmp9,_tmp15); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps((ptr1)+4,_tmp2); \
    _mm_store_ss((ptr1)+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps((ptr2)+4,_tmp5); \
    _mm_store_ss((ptr2)+8,_tmp6); \
    _mm_storeu_ps(ptr3,_tmp7); \
    _mm_storeu_ps((ptr3)+4,_tmp8); \
    _mm_store_ss((ptr3)+8,_tmp9); \
}
/* Add lane k (k=0..2) of (jx1,...,jz4) to the twelve floats at
 * ptr(k+1)[0..11].
 *
 * ptr1..ptr3 : destinations; exactly twelve floats are read and written at
 *              each, using unaligned accesses. All are loaded before any is
 *              stored, so the ranges must not overlap. Expanded more than
 *              once, so they must be free of side effects.
 * jx1..jz4   : __m128 variables; lanes 0..2 contribute. jx1, jz1, jy2, jx3,
 *              jz3 and jy4 are overwritten as scratch.
 */
#define GMX_MM_INCREMENT_4RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11; \
    __m128 _tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19,_tmp20,_tmp21; \
    _tmp1  = _mm_loadu_ps(ptr1); \
    _tmp2  = _mm_loadu_ps((ptr1)+4); \
    _tmp3  = _mm_loadu_ps((ptr1)+8); \
    _tmp4  = _mm_loadu_ps(ptr2); \
    _tmp5  = _mm_loadu_ps((ptr2)+4); \
    _tmp6  = _mm_loadu_ps((ptr2)+8); \
    _tmp7  = _mm_loadu_ps(ptr3); \
    _tmp8  = _mm_loadu_ps((ptr3)+4); \
    _tmp9  = _mm_loadu_ps((ptr3)+8); \
    _tmp10 = _mm_unpackhi_ps(jx1,jy1); \
    jx1    = _mm_unpacklo_ps(jx1,jy1); \
    _tmp11 = _mm_unpackhi_ps(jz1,jx2); \
    jz1    = _mm_unpacklo_ps(jz1,jx2); \
    _tmp12 = _mm_unpackhi_ps(jy2,jz2); \
    jy2    = _mm_unpacklo_ps(jy2,jz2); \
    _tmp13 = _mm_unpackhi_ps(jx3,jy3); \
    jx3    = _mm_unpacklo_ps(jx3,jy3); \
    _tmp14 = _mm_unpackhi_ps(jz3,jx4); \
    jz3    = _mm_unpacklo_ps(jz3,jx4); \
    _tmp15 = _mm_unpackhi_ps(jy4,jz4); \
    jy4    = _mm_unpacklo_ps(jy4,jz4); \
    _tmp16 = _mm_movelh_ps(jx1,jz1); \
    _tmp17 = _mm_movehl_ps(jz1,jx1); \
    _tmp10 = _mm_movelh_ps(_tmp10,_tmp11); \
    _tmp18 = _mm_movelh_ps(jy2,jx3); \
    _tmp19 = _mm_movehl_ps(jx3,jy2); \
    _tmp12 = _mm_movelh_ps(_tmp12,_tmp13); \
    _tmp20 = _mm_movelh_ps(jz3,jy4); \
    _tmp21 = _mm_movehl_ps(jy4,jz3); \
    _tmp14 = _mm_movelh_ps(_tmp14,_tmp15); \
    _tmp1  = _mm_add_ps(_tmp1,_tmp16); \
    _tmp2  = _mm_add_ps(_tmp2,_tmp18); \
    _tmp3  = _mm_add_ps(_tmp3,_tmp20); \
    _tmp4  = _mm_add_ps(_tmp4,_tmp17); \
    _tmp5  = _mm_add_ps(_tmp5,_tmp19); \
    _tmp6  = _mm_add_ps(_tmp6,_tmp21); \
    _tmp7  = _mm_add_ps(_tmp7,_tmp10); \
    _tmp8  = _mm_add_ps(_tmp8,_tmp12); \
    _tmp9  = _mm_add_ps(_tmp9,_tmp14); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps((ptr1)+4,_tmp2); \
    _mm_storeu_ps((ptr1)+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps((ptr2)+4,_tmp5); \
    _mm_storeu_ps((ptr2)+8,_tmp6); \
    _mm_storeu_ps(ptr3,_tmp7); \
    _mm_storeu_ps((ptr3)+4,_tmp8); \
    _mm_storeu_ps((ptr3)+8,_tmp9); \
}
/* Add lane k (k=0..3) of (jx1,jy1,jz1) to the three floats at ptr(k+1)[0..2].
 *
 * ptr1..ptr4  : destinations; exactly three floats are read and written at
 *               each. All are loaded before any is stored, so the ranges
 *               must not overlap. Expanded more than once, so they must be
 *               free of side effects.
 * jx1,jy1,jz1 : __m128 values; all four lanes contribute. None of them is
 *               modified by this macro.
 *
 * Memory is handled as (x, -, y, z): lane 1 of each working register is
 * never stored.
 */
#define GMX_MM_INCREMENT_1RVEC_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
    _tmp1  = _mm_load_ss(ptr1); \
    _tmp1  = _mm_loadh_pi(_tmp1,(__m64 *)((ptr1)+1)); \
    _tmp2  = _mm_load_ss(ptr2); \
    _tmp2  = _mm_loadh_pi(_tmp2,(__m64 *)((ptr2)+1)); \
    _tmp3  = _mm_load_ss(ptr3); \
    _tmp3  = _mm_loadh_pi(_tmp3,(__m64 *)((ptr3)+1)); \
    _tmp4  = _mm_load_ss(ptr4); \
    _tmp4  = _mm_loadh_pi(_tmp4,(__m64 *)((ptr4)+1)); \
    _tmp5  = _mm_unpacklo_ps(jy1,jz1); \
    _tmp6  = _mm_unpackhi_ps(jy1,jz1); \
    _tmp7  = _mm_shuffle_ps(jx1,_tmp5,_MM_SHUFFLE(1,0,0,0)); /* update for ptr1 */ \
    _tmp8  = _mm_shuffle_ps(jx1,_tmp5,_MM_SHUFFLE(3,2,0,1)); /* update for ptr2 */ \
    _tmp9  = _mm_shuffle_ps(jx1,_tmp6,_MM_SHUFFLE(1,0,0,2)); /* update for ptr3 */ \
    _tmp10 = _mm_shuffle_ps(jx1,_tmp6,_MM_SHUFFLE(3,2,0,3)); /* update for ptr4 */ \
    _tmp1  = _mm_add_ps(_tmp1,_tmp7); \
    _tmp2  = _mm_add_ps(_tmp2,_tmp8); \
    _tmp3  = _mm_add_ps(_tmp3,_tmp9); \
    _tmp4  = _mm_add_ps(_tmp4,_tmp10); \
    _mm_store_ss(ptr1,_tmp1); \
    _mm_storeh_pi((__m64 *)((ptr1)+1),_tmp1); \
    _mm_store_ss(ptr2,_tmp2); \
    _mm_storeh_pi((__m64 *)((ptr2)+1),_tmp2); \
    _mm_store_ss(ptr3,_tmp3); \
    _mm_storeh_pi((__m64 *)((ptr3)+1),_tmp3); \
    _mm_store_ss(ptr4,_tmp4); \
    _mm_storeh_pi((__m64 *)((ptr4)+1),_tmp4); \
}
/* Add lane k (k=0..3) of (jx1,...,jz2) to the six floats at ptr(k+1)[0..5].
 *
 * ptr1..ptr4 : destinations; exactly six floats are read and written at
 *              each, using unaligned accesses. All are loaded before any is
 *              stored, so the ranges must not overlap. Expanded more than
 *              once, so they must be free of side effects.
 * jx1..jz2   : __m128 variables; all four lanes contribute. jx1, jz1 and
 *              jy2 are overwritten as scratch.
 */
#define GMX_MM_INCREMENT_2RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11,_tmp12,_tmp13; \
    _tmp1  = _mm_loadu_ps(ptr1); \
    _tmp2  = _mm_loadu_ps(ptr2); \
    _tmp3  = _mm_loadu_ps(ptr3); \
    _tmp4  = _mm_loadu_ps(ptr4); \
    _tmp5  = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr1)+4)); \
    _tmp5  = _mm_loadh_pi(_tmp5,(__m64 *)((ptr2)+4)); \
    _tmp6  = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr3)+4)); \
    _tmp6  = _mm_loadh_pi(_tmp6,(__m64 *)((ptr4)+4)); \
    _tmp7  = _mm_unpackhi_ps(jx1,jy1); \
    jx1    = _mm_unpacklo_ps(jx1,jy1); \
    _tmp8  = _mm_unpackhi_ps(jz1,jx2); \
    jz1    = _mm_unpacklo_ps(jz1,jx2); \
    _tmp9  = _mm_unpackhi_ps(jy2,jz2); \
    jy2    = _mm_unpacklo_ps(jy2,jz2); \
    _tmp10 = _mm_movelh_ps(jx1,jz1); \
    _tmp11 = _mm_movehl_ps(jz1,jx1); \
    _tmp12 = _mm_movelh_ps(_tmp7,_tmp8); \
    _tmp13 = _mm_movehl_ps(_tmp8,_tmp7); \
    _tmp1  = _mm_add_ps(_tmp1,_tmp10); \
    _tmp2  = _mm_add_ps(_tmp2,_tmp11); \
    _tmp3  = _mm_add_ps(_tmp3,_tmp12); \
    _tmp4  = _mm_add_ps(_tmp4,_tmp13); \
    _tmp5  = _mm_add_ps(_tmp5,jy2); \
    _tmp6  = _mm_add_ps(_tmp6,_tmp9); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr2,_tmp2); \
    _mm_storeu_ps(ptr3,_tmp3); \
    _mm_storeu_ps(ptr4,_tmp4); \
    _mm_storel_pi((__m64 *)((ptr1)+4),_tmp5); \
    _mm_storeh_pi((__m64 *)((ptr2)+4),_tmp5); \
    _mm_storel_pi((__m64 *)((ptr3)+4),_tmp6); \
    _mm_storeh_pi((__m64 *)((ptr4)+4),_tmp6); \
}
/* Add lane k (k=0..3) of (jx1,...,jz3) to the nine floats at ptr(k+1)[0..8].
 *
 * ptr1..ptr4 : destinations; exactly nine floats are read and written at
 *              each, using unaligned accesses. All are loaded before any is
 *              stored, so the ranges must not overlap. Expanded more than
 *              once, so they must be free of side effects.
 * jx1..jz3   : __m128 variables; all four lanes contribute. jx1, jz1, jy2
 *              and jx3 are overwritten as scratch.
 */
#define GMX_MM_INCREMENT_3RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
    __m128 _tmp11,_tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19; \
    __m128 _tmp20,_tmp21,_tmp22,_tmp23,_tmp24,_tmp25; \
    _tmp1  = _mm_loadu_ps(ptr1); \
    _tmp2  = _mm_loadu_ps((ptr1)+4); \
    _tmp3  = _mm_load_ss((ptr1)+8); \
    _tmp4  = _mm_loadu_ps(ptr2); \
    _tmp5  = _mm_loadu_ps((ptr2)+4); \
    _tmp6  = _mm_load_ss((ptr2)+8); \
    _tmp7  = _mm_loadu_ps(ptr3); \
    _tmp8  = _mm_loadu_ps((ptr3)+4); \
    _tmp9  = _mm_load_ss((ptr3)+8); \
    _tmp10 = _mm_loadu_ps(ptr4); \
    _tmp11 = _mm_loadu_ps((ptr4)+4); \
    _tmp12 = _mm_load_ss((ptr4)+8); \
    _tmp13 = _mm_unpackhi_ps(jx1,jy1); \
    jx1    = _mm_unpacklo_ps(jx1,jy1); \
    _tmp14 = _mm_unpackhi_ps(jz1,jx2); \
    jz1    = _mm_unpacklo_ps(jz1,jx2); \
    _tmp15 = _mm_unpackhi_ps(jy2,jz2); \
    jy2    = _mm_unpacklo_ps(jy2,jz2); \
    _tmp16 = _mm_unpackhi_ps(jx3,jy3); \
    jx3    = _mm_unpacklo_ps(jx3,jy3); \
    _tmp17 = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1));       /* lane 1 of jz3 moved to lane 0 */ \
    _tmp18 = _mm_movehl_ps(jz3,jz3);                             /* lane 2 of jz3 moved to lane 0 */ \
    _tmp19 = _mm_shuffle_ps(_tmp18,_tmp18,_MM_SHUFFLE(0,0,0,1)); /* lane 3 of jz3 moved to lane 0 */ \
    _tmp20 = _mm_movelh_ps(jx1,jz1); \
    _tmp21 = _mm_movehl_ps(jz1,jx1); \
    _tmp22 = _mm_movelh_ps(_tmp13,_tmp14); \
    _tmp14 = _mm_movehl_ps(_tmp14,_tmp13); \
    _tmp23 = _mm_movelh_ps(jy2,jx3); \
    _tmp24 = _mm_movehl_ps(jx3,jy2); \
    _tmp25 = _mm_movelh_ps(_tmp15,_tmp16); \
    _tmp16 = _mm_movehl_ps(_tmp16,_tmp15); \
    _tmp1  = _mm_add_ps(_tmp1,_tmp20); \
    _tmp2  = _mm_add_ps(_tmp2,_tmp23); \
    _tmp3  = _mm_add_ss(_tmp3,jz3); \
    _tmp4  = _mm_add_ps(_tmp4,_tmp21); \
    _tmp5  = _mm_add_ps(_tmp5,_tmp24); \
    _tmp6  = _mm_add_ss(_tmp6,_tmp17); \
    _tmp7  = _mm_add_ps(_tmp7,_tmp22); \
    _tmp8  = _mm_add_ps(_tmp8,_tmp25); \
    _tmp9  = _mm_add_ss(_tmp9,_tmp18); \
    _tmp10 = _mm_add_ps(_tmp10,_tmp14); \
    _tmp11 = _mm_add_ps(_tmp11,_tmp16); \
    _tmp12 = _mm_add_ss(_tmp12,_tmp19); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps((ptr1)+4,_tmp2); \
    _mm_store_ss((ptr1)+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps((ptr2)+4,_tmp5); \
    _mm_store_ss((ptr2)+8,_tmp6); \
    _mm_storeu_ps(ptr3,_tmp7); \
    _mm_storeu_ps((ptr3)+4,_tmp8); \
    _mm_store_ss((ptr3)+8,_tmp9); \
    _mm_storeu_ps(ptr4,_tmp10); \
    _mm_storeu_ps((ptr4)+4,_tmp11); \
    _mm_store_ss((ptr4)+8,_tmp12); \
}
/* Add lane k (k=0..3) of (jx1,...,jz4) to the twelve floats at
 * ptr(k+1)[0..11].
 *
 * ptr1..ptr4 : destinations; exactly twelve floats are read and written at
 *              each, using unaligned accesses. All are loaded before any is
 *              stored, so the ranges must not overlap. Expanded more than
 *              once, so they must be free of side effects.
 * jx1..jz4   : __m128 variables; all four lanes contribute. jx1, jz1, jy2,
 *              jx3, jz3 and jy4 are overwritten as scratch.
 */
#define GMX_MM_INCREMENT_4RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11; \
    __m128 _tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19,_tmp20,_tmp21,_tmp22; \
    __m128 _tmp23,_tmp24; \
    _tmp1  = _mm_loadu_ps(ptr1); \
    _tmp2  = _mm_loadu_ps((ptr1)+4); \
    _tmp3  = _mm_loadu_ps((ptr1)+8); \
    _tmp4  = _mm_loadu_ps(ptr2); \
    _tmp5  = _mm_loadu_ps((ptr2)+4); \
    _tmp6  = _mm_loadu_ps((ptr2)+8); \
    _tmp7  = _mm_loadu_ps(ptr3); \
    _tmp8  = _mm_loadu_ps((ptr3)+4); \
    _tmp9  = _mm_loadu_ps((ptr3)+8); \
    _tmp10 = _mm_loadu_ps(ptr4); \
    _tmp11 = _mm_loadu_ps((ptr4)+4); \
    _tmp12 = _mm_loadu_ps((ptr4)+8); \
    _tmp13 = _mm_unpackhi_ps(jx1,jy1); \
    jx1    = _mm_unpacklo_ps(jx1,jy1); \
    _tmp14 = _mm_unpackhi_ps(jz1,jx2); \
    jz1    = _mm_unpacklo_ps(jz1,jx2); \
    _tmp15 = _mm_unpackhi_ps(jy2,jz2); \
    jy2    = _mm_unpacklo_ps(jy2,jz2); \
    _tmp16 = _mm_unpackhi_ps(jx3,jy3); \
    jx3    = _mm_unpacklo_ps(jx3,jy3); \
    _tmp17 = _mm_unpackhi_ps(jz3,jx4); \
    jz3    = _mm_unpacklo_ps(jz3,jx4); \
    _tmp18 = _mm_unpackhi_ps(jy4,jz4); \
    jy4    = _mm_unpacklo_ps(jy4,jz4); \
    _tmp19 = _mm_movelh_ps(jx1,jz1); \
    jz1    = _mm_movehl_ps(jz1,jx1); \
    _tmp20 = _mm_movelh_ps(_tmp13,_tmp14); \
    _tmp14 = _mm_movehl_ps(_tmp14,_tmp13); \
    _tmp21 = _mm_movelh_ps(jy2,jx3); \
    jx3    = _mm_movehl_ps(jx3,jy2); \
    _tmp22 = _mm_movelh_ps(_tmp15,_tmp16); \
    _tmp16 = _mm_movehl_ps(_tmp16,_tmp15); \
    _tmp23 = _mm_movelh_ps(jz3,jy4); \
    jy4    = _mm_movehl_ps(jy4,jz3); \
    _tmp24 = _mm_movelh_ps(_tmp17,_tmp18); \
    _tmp18 = _mm_movehl_ps(_tmp18,_tmp17); \
    _tmp1  = _mm_add_ps(_tmp1,_tmp19); \
    _tmp2  = _mm_add_ps(_tmp2,_tmp21); \
    _tmp3  = _mm_add_ps(_tmp3,_tmp23); \
    _tmp4  = _mm_add_ps(_tmp4,jz1); \
    _tmp5  = _mm_add_ps(_tmp5,jx3); \
    _tmp6  = _mm_add_ps(_tmp6,jy4); \
    _tmp7  = _mm_add_ps(_tmp7,_tmp20); \
    _tmp8  = _mm_add_ps(_tmp8,_tmp22); \
    _tmp9  = _mm_add_ps(_tmp9,_tmp24); \
    _tmp10 = _mm_add_ps(_tmp10,_tmp14); \
    _tmp11 = _mm_add_ps(_tmp11,_tmp16); \
    _tmp12 = _mm_add_ps(_tmp12,_tmp18); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps((ptr1)+4,_tmp2); \
    _mm_storeu_ps((ptr1)+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps((ptr2)+4,_tmp5); \
    _mm_storeu_ps((ptr2)+8,_tmp6); \
    _mm_storeu_ps(ptr3,_tmp7); \
    _mm_storeu_ps((ptr3)+4,_tmp8); \
    _mm_storeu_ps((ptr3)+8,_tmp9); \
    _mm_storeu_ps(ptr4,_tmp10); \
    _mm_storeu_ps((ptr4)+4,_tmp11); \
    _mm_storeu_ps((ptr4)+8,_tmp12); \
}
/* Subtract lane 0 of (jx1,jy1,jz1) from the three floats at ptr1[0..2].
 *
 * ptr1        : destination; exactly three floats are read and written
 *               (scalar access at offset 0, 64-bit access at offset 1).
 *               Expanded more than once, so it must be free of side effects.
 * jx1,jy1,jz1 : __m128 variables; only lane 0 contributes. jx1 and jy1 are
 *               overwritten as scratch.
 */
#define GMX_MM_DECREMENT_1RVEC_1POINTER_PS(ptr1,jx1,jy1,jz1) { \
    __m128 _tmp1; \
    jy1   = _mm_unpacklo_ps(jy1,jz1); \
    jx1   = _mm_movelh_ps(jx1,jy1);   /* (x, -, y, z): matches the load layout below */ \
    _tmp1 = _mm_load_ss(ptr1); \
    _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)((ptr1)+1)); \
    _tmp1 = _mm_sub_ps(_tmp1,jx1); \
    _mm_store_ss(ptr1,_tmp1); \
    _mm_storeh_pi((__m64 *)((ptr1)+1),_tmp1); \
}
/* Subtract lane 0 of (jx1,jy1,jz1,jx2,jy2,jz2) from the six floats at
 * ptr1[0..5].
 *
 * ptr1     : destination; exactly six floats are read and written using
 *            unaligned accesses. Expanded more than once, so it must be
 *            free of side effects.
 * jx1..jz2 : __m128 variables; only lane 0 contributes. jx1, jz1 and jy2
 *            are overwritten as scratch.
 */
#define GMX_MM_DECREMENT_2RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1, _tmp2; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr1)+4)); \
    jx1   = _mm_unpacklo_ps(jx1,jy1); \
    jz1   = _mm_unpacklo_ps(jz1,jx2); \
    jy2   = _mm_unpacklo_ps(jy2,jz2); \
    jx1   = _mm_movelh_ps(jx1,jz1); \
    _tmp1 = _mm_sub_ps(_tmp1,jx1); \
    _tmp2 = _mm_sub_ps(_tmp2,jy2); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storel_pi((__m64 *)((ptr1)+4),_tmp2); \
}
/* Subtract lane 0 of (jx1,...,jz3) from the nine floats at ptr1[0..8].
 *
 * ptr1     : destination; exactly nine floats are read and written using
 *            unaligned accesses. Expanded more than once, so it must be
 *            free of side effects.
 * jx1..jz3 : __m128 variables; only lane 0 contributes. jx1, jz1, jy2 and
 *            jx3 are overwritten as scratch.
 */
#define GMX_MM_DECREMENT_3RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1, _tmp2, _tmp3; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps((ptr1)+4); \
    _tmp3 = _mm_load_ss((ptr1)+8); \
    jx1   = _mm_unpacklo_ps(jx1,jy1); \
    jz1   = _mm_unpacklo_ps(jz1,jx2); \
    jy2   = _mm_unpacklo_ps(jy2,jz2); \
    jx3   = _mm_unpacklo_ps(jx3,jy3); \
    jx1   = _mm_movelh_ps(jx1,jz1); \
    jy2   = _mm_movelh_ps(jy2,jx3); \
    _tmp1 = _mm_sub_ps(_tmp1,jx1); \
    _tmp2 = _mm_sub_ps(_tmp2,jy2); \
    _tmp3 = _mm_sub_ss(_tmp3,jz3); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps((ptr1)+4,_tmp2); \
    _mm_store_ss((ptr1)+8,_tmp3); \
}
/* Subtract lane 0 of (jx1,...,jz4) from the twelve floats at ptr1[0..11].
 *
 * ptr1     : destination; exactly twelve floats are read and written using
 *            unaligned accesses. Expanded more than once, so it must be
 *            free of side effects.
 * jx1..jz4 : __m128 variables; only lane 0 contributes. jx1, jz1, jy2, jx3,
 *            jz3 and jy4 are overwritten as scratch.
 */
#define GMX_MM_DECREMENT_4RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128 _tmp1, _tmp2, _tmp3; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps((ptr1)+4); \
    _tmp3 = _mm_loadu_ps((ptr1)+8); \
    jx1   = _mm_unpacklo_ps(jx1,jy1); \
    jz1   = _mm_unpacklo_ps(jz1,jx2); \
    jy2   = _mm_unpacklo_ps(jy2,jz2); \
    jx3   = _mm_unpacklo_ps(jx3,jy3); \
    jz3   = _mm_unpacklo_ps(jz3,jx4); \
    jy4   = _mm_unpacklo_ps(jy4,jz4); \
    jx1   = _mm_movelh_ps(jx1,jz1); \
    jy2   = _mm_movelh_ps(jy2,jx3); \
    jz3   = _mm_movelh_ps(jz3,jy4); \
    _tmp1 = _mm_sub_ps(_tmp1,jx1); \
    _tmp2 = _mm_sub_ps(_tmp2,jy2); \
    _tmp3 = _mm_sub_ps(_tmp3,jz3); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps((ptr1)+4,_tmp2); \
    _mm_storeu_ps((ptr1)+8,_tmp3); \
}
/* Subtract lane 0 of (jx1,jy1,jz1) from ptr1[0..2] and lane 1 from
 * ptr2[0..2].
 *
 * ptr1,ptr2   : destinations; exactly three floats are read and written at
 *               each. Both are loaded before either is stored, so the two
 *               ranges must not overlap. Expanded more than once, so they
 *               must be free of side effects.
 * jx1,jy1,jz1 : __m128 variables; lanes 0 and 1 contribute. jx1 is
 *               overwritten as scratch.
 */
#define GMX_MM_DECREMENT_1RVEC_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4; \
    _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1)); \
    _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr2)); \
    _tmp2 = _mm_load_ss((ptr1)+2); \
    _tmp3 = _mm_load_ss((ptr2)+2); \
    jx1   = _mm_unpacklo_ps(jx1,jy1); \
    _tmp4 = _mm_shuffle_ps(jz1,jz1,_MM_SHUFFLE(0,0,0,1)); /* lane 1 of jz1 moved to lane 0 */ \
    _tmp1 = _mm_sub_ps(_tmp1,jx1); \
    _mm_storel_pi((__m64 *)(ptr1),_tmp1); \
    _mm_storeh_pi((__m64 *)(ptr2),_tmp1); \
    _mm_store_ss((ptr1)+2,_mm_sub_ss(_tmp2,jz1)); \
    _mm_store_ss((ptr2)+2,_mm_sub_ss(_tmp3,_tmp4)); \
}
/* Subtract lane 0 of (jx1,...,jz2) from ptr1[0..5] and lane 1 from
 * ptr2[0..5].
 *
 * ptr1,ptr2 : destinations; exactly six floats are read and written at
 *             each, using unaligned accesses. Both are loaded before either
 *             is stored, so the two ranges must not overlap. Expanded more
 *             than once, so they must be free of side effects.
 * jx1..jz2  : __m128 variables; lanes 0 and 1 contribute. jx1, jz1 and jy2
 *             are overwritten as scratch.
 */
#define GMX_MM_DECREMENT_2RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr2); \
    _tmp3 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr1)+4)); \
    _tmp3 = _mm_loadh_pi(_tmp3,(__m64 *)((ptr2)+4)); \
    jx1   = _mm_unpacklo_ps(jx1,jy1); \
    jz1   = _mm_unpacklo_ps(jz1,jx2); \
    jy2   = _mm_unpacklo_ps(jy2,jz2); \
    _tmp4 = _mm_movelh_ps(jx1,jz1); /* floats 0..3 for ptr1 */ \
    _tmp5 = _mm_movehl_ps(jz1,jx1); /* floats 0..3 for ptr2 */ \
    _tmp1 = _mm_sub_ps(_tmp1,_tmp4); \
    _tmp2 = _mm_sub_ps(_tmp2,_tmp5); \
    _tmp3 = _mm_sub_ps(_tmp3,jy2); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr2,_tmp2); \
    _mm_storel_pi((__m64 *)((ptr1)+4),_tmp3); \
    _mm_storeh_pi((__m64 *)((ptr2)+4),_tmp3); \
}
/* Subtract lane 0 of (jx1,...,jz3) from ptr1[0..8] and lane 1 from
 * ptr2[0..8].
 *
 * ptr1,ptr2 : destinations; exactly nine floats are read and written at
 *             each, using unaligned accesses. Both are loaded before either
 *             is stored, so the two ranges must not overlap. Expanded more
 *             than once, so they must be free of side effects.
 * jx1..jz3  : __m128 variables; lanes 0 and 1 contribute. jx1, jz1, jy2 and
 *             jx3 are overwritten as scratch.
 */
#define GMX_MM_DECREMENT_3RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11; \
    _tmp1  = _mm_loadu_ps(ptr1); \
    _tmp2  = _mm_loadu_ps((ptr1)+4); \
    _tmp3  = _mm_load_ss((ptr1)+8); \
    _tmp4  = _mm_loadu_ps(ptr2); \
    _tmp5  = _mm_loadu_ps((ptr2)+4); \
    _tmp6  = _mm_load_ss((ptr2)+8); \
    jx1    = _mm_unpacklo_ps(jx1,jy1); \
    jz1    = _mm_unpacklo_ps(jz1,jx2); \
    jy2    = _mm_unpacklo_ps(jy2,jz2); \
    jx3    = _mm_unpacklo_ps(jx3,jy3); \
    _tmp7  = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1)); /* lane 1 of jz3 moved to lane 0 */ \
    _tmp8  = _mm_movelh_ps(jx1,jz1); \
    _tmp9  = _mm_movehl_ps(jz1,jx1); \
    _tmp10 = _mm_movelh_ps(jy2,jx3); \
    _tmp11 = _mm_movehl_ps(jx3,jy2); \
    _tmp1  = _mm_sub_ps(_tmp1,_tmp8); \
    _tmp2  = _mm_sub_ps(_tmp2,_tmp10); \
    _tmp3  = _mm_sub_ss(_tmp3,jz3); \
    _tmp4  = _mm_sub_ps(_tmp4,_tmp9); \
    _tmp5  = _mm_sub_ps(_tmp5,_tmp11); \
    _tmp6  = _mm_sub_ss(_tmp6,_tmp7); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps((ptr1)+4,_tmp2); \
    _mm_store_ss((ptr1)+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps((ptr2)+4,_tmp5); \
    _mm_store_ss((ptr2)+8,_tmp6); \
}
/* Subtract lane 0 of (jx1,...,jz4) from ptr1[0..11] and lane 1 from
 * ptr2[0..11].
 *
 * ptr1,ptr2 : destinations; exactly twelve floats are read and written at
 *             each, using unaligned accesses. Both are loaded before either
 *             is stored, so the two ranges must not overlap. Expanded more
 *             than once, so they must be free of side effects.
 * jx1..jz4  : __m128 variables; lanes 0 and 1 contribute. jx1, jz1, jy2,
 *             jx3, jz3 and jy4 are overwritten as scratch.
 */
#define GMX_MM_DECREMENT_4RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp8,_tmp9,_tmp10,_tmp11,_tmp12,_tmp13; \
    _tmp1  = _mm_loadu_ps(ptr1); \
    _tmp2  = _mm_loadu_ps((ptr1)+4); \
    _tmp3  = _mm_loadu_ps((ptr1)+8); \
    _tmp4  = _mm_loadu_ps(ptr2); \
    _tmp5  = _mm_loadu_ps((ptr2)+4); \
    _tmp6  = _mm_loadu_ps((ptr2)+8); \
    jx1    = _mm_unpacklo_ps(jx1,jy1); \
    jz1    = _mm_unpacklo_ps(jz1,jx2); \
    jy2    = _mm_unpacklo_ps(jy2,jz2); \
    jx3    = _mm_unpacklo_ps(jx3,jy3); \
    jz3    = _mm_unpacklo_ps(jz3,jx4); \
    jy4    = _mm_unpacklo_ps(jy4,jz4); \
    _tmp8  = _mm_movelh_ps(jx1,jz1); \
    _tmp9  = _mm_movehl_ps(jz1,jx1); \
    _tmp10 = _mm_movelh_ps(jy2,jx3); \
    _tmp11 = _mm_movehl_ps(jx3,jy2); \
    _tmp12 = _mm_movelh_ps(jz3,jy4); \
    _tmp13 = _mm_movehl_ps(jy4,jz3); \
    _tmp1  = _mm_sub_ps(_tmp1,_tmp8); \
    _tmp2  = _mm_sub_ps(_tmp2,_tmp10); \
    _tmp3  = _mm_sub_ps(_tmp3,_tmp12); \
    _tmp4  = _mm_sub_ps(_tmp4,_tmp9); \
    _tmp5  = _mm_sub_ps(_tmp5,_tmp11); \
    _tmp6  = _mm_sub_ps(_tmp6,_tmp13); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps((ptr1)+4,_tmp2); \
    _mm_storeu_ps((ptr1)+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps((ptr2)+4,_tmp5); \
    _mm_storeu_ps((ptr2)+8,_tmp6); \
}
/* Subtract lane k (k=0..2) of (jx1,jy1,jz1) from the three floats at
 * ptr(k+1)[0..2].
 *
 * ptr1..ptr3  : destinations; exactly three floats are read and written at
 *               each. All are loaded before any is stored, so the ranges
 *               must not overlap. Expanded more than once, so they must be
 *               free of side effects.
 * jx1,jy1,jz1 : __m128 variables; lanes 0..2 contribute. jx1 is overwritten
 *               as scratch.
 *
 * Memory is handled as (x, -, y, z): lane 1 of each working register is
 * never stored.
 */
#define GMX_MM_DECREMENT_1RVEC_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7; \
    _tmp1 = _mm_load_ss(ptr1); \
    _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)((ptr1)+1)); \
    _tmp2 = _mm_load_ss(ptr2); \
    _tmp2 = _mm_loadh_pi(_tmp2,(__m64 *)((ptr2)+1)); \
    _tmp3 = _mm_load_ss(ptr3); \
    _tmp3 = _mm_loadh_pi(_tmp3,(__m64 *)((ptr3)+1)); \
    _tmp4 = _mm_unpacklo_ps(jy1,jz1); \
    _tmp5 = _mm_unpackhi_ps(jy1,jz1); \
    _tmp6 = _mm_shuffle_ps(jx1,_tmp4,_MM_SHUFFLE(3,2,0,1)); /* update for ptr2 */ \
    _tmp7 = _mm_shuffle_ps(jx1,jx1,_MM_SHUFFLE(0,0,0,2)); \
    jx1   = _mm_movelh_ps(jx1,_tmp4);                       /* update for ptr1 */ \
    _tmp7 = _mm_movelh_ps(_tmp7,_tmp5);                     /* update for ptr3 */ \
    _tmp1 = _mm_sub_ps(_tmp1,jx1); \
    _tmp2 = _mm_sub_ps(_tmp2,_tmp6); \
    _tmp3 = _mm_sub_ps(_tmp3,_tmp7); \
    _mm_store_ss(ptr1,_tmp1); \
    _mm_storeh_pi((__m64 *)((ptr1)+1),_tmp1); \
    _mm_store_ss(ptr2,_tmp2); \
    _mm_storeh_pi((__m64 *)((ptr2)+1),_tmp2); \
    _mm_store_ss(ptr3,_tmp3); \
    _mm_storeh_pi((__m64 *)((ptr3)+1),_tmp3); \
}
/* Subtract lane k (k=0..2) of (jx1,...,jz2) from the six floats at
 * ptr(k+1)[0..5].
 *
 * ptr1..ptr3 : destinations; exactly six floats are read and written at
 *              each, using unaligned accesses. All are loaded before any is
 *              stored, so the ranges must not overlap. Expanded more than
 *              once, so they must be free of side effects.
 * jx1..jz2   : __m128 variables; lanes 0..2 contribute. jx1, jz1 and jy2
 *              are overwritten as scratch.
 *
 * The 64-bit partial loads start from a zeroed register, so no
 * uninitialized __m128 value is ever read or fed into the subtractions.
 */
#define GMX_MM_DECREMENT_2RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
    _tmp1  = _mm_loadu_ps(ptr1); \
    _tmp2  = _mm_loadu_ps(ptr2); \
    _tmp3  = _mm_loadu_ps(ptr3); \
    _tmp4  = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr1)+4)); \
    _tmp4  = _mm_loadh_pi(_tmp4,(__m64 *)((ptr2)+4)); \
    _tmp5  = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr3)+4)); /* upper half unused, kept zero */ \
    _tmp6  = _mm_unpackhi_ps(jx1,jy1); \
    jx1    = _mm_unpacklo_ps(jx1,jy1); \
    _tmp7  = _mm_unpackhi_ps(jz1,jx2); \
    jz1    = _mm_unpacklo_ps(jz1,jx2); \
    _tmp8  = _mm_unpackhi_ps(jy2,jz2); \
    jy2    = _mm_unpacklo_ps(jy2,jz2); \
    _tmp9  = _mm_movelh_ps(jx1,jz1);     /* floats 0..3 for ptr1 */ \
    _tmp10 = _mm_movehl_ps(jz1,jx1);     /* floats 0..3 for ptr2 */ \
    _tmp6  = _mm_movelh_ps(_tmp6,_tmp7); /* floats 0..3 for ptr3 */ \
    _tmp1  = _mm_sub_ps(_tmp1,_tmp9); \
    _tmp2  = _mm_sub_ps(_tmp2,_tmp10); \
    _tmp3  = _mm_sub_ps(_tmp3,_tmp6); \
    _tmp4  = _mm_sub_ps(_tmp4,jy2); \
    _tmp5  = _mm_sub_ps(_tmp5,_tmp8); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr2,_tmp2); \
    _mm_storeu_ps(ptr3,_tmp3); \
    _mm_storel_pi((__m64 *)((ptr1)+4),_tmp4); \
    _mm_storeh_pi((__m64 *)((ptr2)+4),_tmp4); \
    _mm_storel_pi((__m64 *)((ptr3)+4),_tmp5); \
}
/* Subtract three consecutive rvecs (nine floats) from each of three addresses.
 *
 * Lane k (k = 0..2) of the nine input vectors is subtracted from
 * ptr(k+1)[0..8]; lane 3 of the inputs is not used.
 * Side effect: jx1, jz1, jy2 and jx3 are overwritten and used as scratch.
 * Lane listings in the comments run from lane 0 to lane 3; the suffixes
 * a/b/c/d denote input lanes 0/1/2/3 (a,b,c go to ptr1,ptr2,ptr3).
 */
#define GMX_MM_DECREMENT_3RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _f1a,_f1b,_f1c,_f2a,_f2b,_f2c,_f3a,_f3b,_f3c;                         \
    __m128 _xy1_c,_zx_c,_yz2_c,_xy3_c,_z3_b,_z3_c;                               \
    /* Current memory contents: floats 0-3, 4-7 and the single float 8 */        \
    _f1a   = _mm_loadu_ps(ptr1);                                                 \
    _f1b   = _mm_loadu_ps(ptr1+4);                                               \
    _f1c   = _mm_load_ss(ptr1+8);                                                \
    _f2a   = _mm_loadu_ps(ptr2);                                                 \
    _f2b   = _mm_loadu_ps(ptr2+4);                                               \
    _f2c   = _mm_load_ss(ptr2+8);                                                \
    _f3a   = _mm_loadu_ps(ptr3);                                                 \
    _f3b   = _mm_loadu_ps(ptr3+4);                                               \
    _f3c   = _mm_load_ss(ptr3+8);                                                \
    /* Pairwise interleave; the low halves reuse the input registers */          \
    _xy1_c = _mm_unpackhi_ps(jx1,jy1);          /* x1c y1c x1d y1d */            \
    jx1    = _mm_unpacklo_ps(jx1,jy1);          /* x1a y1a x1b y1b */            \
    _zx_c  = _mm_unpackhi_ps(jz1,jx2);          /* z1c x2c z1d x2d */            \
    jz1    = _mm_unpacklo_ps(jz1,jx2);          /* z1a x2a z1b x2b */            \
    _yz2_c = _mm_unpackhi_ps(jy2,jz2);          /* y2c z2c y2d z2d */            \
    jy2    = _mm_unpacklo_ps(jy2,jz2);          /* y2a z2a y2b z2b */            \
    _xy3_c = _mm_unpackhi_ps(jx3,jy3);          /* x3c y3c x3d y3d */            \
    jx3    = _mm_unpacklo_ps(jx3,jy3);          /* x3a y3a x3b y3b */            \
    _z3_b  = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1)); /* lane 0 = z3b */    \
    _z3_c  = _mm_movehl_ps(jz3,jz3);                       /* lane 0 = z3c */    \
    /* Combine 64-bit halves into per-pointer rows and subtract */               \
    _f1a   = _mm_sub_ps(_f1a,_mm_movelh_ps(jx1,jz1));                            \
    _f1b   = _mm_sub_ps(_f1b,_mm_movelh_ps(jy2,jx3));                            \
    _f1c   = _mm_sub_ss(_f1c,jz3);                                               \
    _f2a   = _mm_sub_ps(_f2a,_mm_movehl_ps(jz1,jx1));                            \
    _f2b   = _mm_sub_ps(_f2b,_mm_movehl_ps(jx3,jy2));                            \
    _f2c   = _mm_sub_ss(_f2c,_z3_b);                                             \
    _f3a   = _mm_sub_ps(_f3a,_mm_movelh_ps(_xy1_c,_zx_c));                       \
    _f3b   = _mm_sub_ps(_f3b,_mm_movelh_ps(_yz2_c,_xy3_c));                      \
    _f3c   = _mm_sub_ss(_f3c,_z3_c);                                             \
    _mm_storeu_ps(ptr1,_f1a);                                                    \
    _mm_storeu_ps(ptr1+4,_f1b);                                                  \
    _mm_store_ss(ptr1+8,_f1c);                                                   \
    _mm_storeu_ps(ptr2,_f2a);                                                    \
    _mm_storeu_ps(ptr2+4,_f2b);                                                  \
    _mm_store_ss(ptr2+8,_f2c);                                                   \
    _mm_storeu_ps(ptr3,_f3a);                                                    \
    _mm_storeu_ps(ptr3+4,_f3b);                                                  \
    _mm_store_ss(ptr3+8,_f3c);                                                   \
}
/* Subtract four consecutive rvecs (twelve floats) from each of three addresses.
 *
 * Lane k (k = 0..2) of the twelve input vectors is subtracted from
 * ptr(k+1)[0..11]; lane 3 of the inputs is not used.
 * Side effect: jx1, jz1, jy2, jx3, jz3 and jy4 are overwritten (they end up
 * holding the low-half interleaves) and serve as scratch space.
 * Lane listings in the comments run from lane 0 to lane 3; the suffixes
 * a/b/c/d denote input lanes 0/1/2/3 (a,b,c go to ptr1,ptr2,ptr3).
 */
#define GMX_MM_DECREMENT_4RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128 _f1a,_f1b,_f1c,_f2a,_f2b,_f2c,_f3a,_f3b,_f3c;                         \
    __m128 _xy1_c,_zx12_c,_yz2_c,_xy3_c,_zx34_c,_yz4_c;                          \
    /* Current memory contents, four floats at a time */                         \
    _f1a    = _mm_loadu_ps(ptr1);                                                \
    _f1b    = _mm_loadu_ps(ptr1+4);                                              \
    _f1c    = _mm_loadu_ps(ptr1+8);                                              \
    _f2a    = _mm_loadu_ps(ptr2);                                                \
    _f2b    = _mm_loadu_ps(ptr2+4);                                              \
    _f2c    = _mm_loadu_ps(ptr2+8);                                              \
    _f3a    = _mm_loadu_ps(ptr3);                                                \
    _f3b    = _mm_loadu_ps(ptr3+4);                                              \
    _f3c    = _mm_loadu_ps(ptr3+8);                                              \
    /* Pairwise interleave; the low halves reuse the input registers */          \
    _xy1_c  = _mm_unpackhi_ps(jx1,jy1);         /* x1c y1c x1d y1d */            \
    jx1     = _mm_unpacklo_ps(jx1,jy1);         /* x1a y1a x1b y1b */            \
    _zx12_c = _mm_unpackhi_ps(jz1,jx2);         /* z1c x2c z1d x2d */            \
    jz1     = _mm_unpacklo_ps(jz1,jx2);         /* z1a x2a z1b x2b */            \
    _yz2_c  = _mm_unpackhi_ps(jy2,jz2);         /* y2c z2c y2d z2d */            \
    jy2     = _mm_unpacklo_ps(jy2,jz2);         /* y2a z2a y2b z2b */            \
    _xy3_c  = _mm_unpackhi_ps(jx3,jy3);         /* x3c y3c x3d y3d */            \
    jx3     = _mm_unpacklo_ps(jx3,jy3);         /* x3a y3a x3b y3b */            \
    _zx34_c = _mm_unpackhi_ps(jz3,jx4);         /* z3c x4c z3d x4d */            \
    jz3     = _mm_unpacklo_ps(jz3,jx4);         /* z3a x4a z3b x4b */            \
    _yz4_c  = _mm_unpackhi_ps(jy4,jz4);         /* y4c z4c y4d z4d */            \
    jy4     = _mm_unpacklo_ps(jy4,jz4);         /* y4a z4a y4b z4b */            \
    /* Combine 64-bit halves into per-pointer rows and subtract */               \
    _f1a    = _mm_sub_ps(_f1a,_mm_movelh_ps(jx1,jz1));                           \
    _f1b    = _mm_sub_ps(_f1b,_mm_movelh_ps(jy2,jx3));                           \
    _f1c    = _mm_sub_ps(_f1c,_mm_movelh_ps(jz3,jy4));                           \
    _f2a    = _mm_sub_ps(_f2a,_mm_movehl_ps(jz1,jx1));                           \
    _f2b    = _mm_sub_ps(_f2b,_mm_movehl_ps(jx3,jy2));                           \
    _f2c    = _mm_sub_ps(_f2c,_mm_movehl_ps(jy4,jz3));                           \
    _f3a    = _mm_sub_ps(_f3a,_mm_movelh_ps(_xy1_c,_zx12_c));                    \
    _f3b    = _mm_sub_ps(_f3b,_mm_movelh_ps(_yz2_c,_xy3_c));                     \
    _f3c    = _mm_sub_ps(_f3c,_mm_movelh_ps(_zx34_c,_yz4_c));                    \
    _mm_storeu_ps(ptr1,_f1a);                                                    \
    _mm_storeu_ps(ptr1+4,_f1b);                                                  \
    _mm_storeu_ps(ptr1+8,_f1c);                                                  \
    _mm_storeu_ps(ptr2,_f2a);                                                    \
    _mm_storeu_ps(ptr2+4,_f2b);                                                  \
    _mm_storeu_ps(ptr2+8,_f2c);                                                  \
    _mm_storeu_ps(ptr3,_f3a);                                                    \
    _mm_storeu_ps(ptr3+4,_f3b);                                                  \
    _mm_storeu_ps(ptr3+8,_f3c);                                                  \
}
/* Subtract one rvec (x,y,z) from each of four independent float triplets.
 *
 * Lane k (k = 0..3) of jx1/jy1/jz1 holds the x/y/z value that is removed
 * from ptr1/ptr2/ptr3/ptr4 respectively. Exactly three floats are read and
 * written through each pointer. The input vectors are not modified.
 * Lane listings in the comments below run from lane 0 to lane 3.
 */
#define GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1) {   \
    __m128 _mem1,_mem2,_mem3,_mem4,_yz_ab,_yz_cd;                                \
    /* Fetch each triplet as (f[0], 0, f[1], f[2]) */                            \
    _mem1  = _mm_loadh_pi(_mm_load_ss(ptr1),(__m64 *)(ptr1+1));                  \
    _mem2  = _mm_loadh_pi(_mm_load_ss(ptr2),(__m64 *)(ptr2+1));                  \
    _mem3  = _mm_loadh_pi(_mm_load_ss(ptr3),(__m64 *)(ptr3+1));                  \
    _mem4  = _mm_loadh_pi(_mm_load_ss(ptr4),(__m64 *)(ptr4+1));                  \
    _yz_ab = _mm_unpacklo_ps(jy1,jz1);             /* y0 z0 y1 z1 */             \
    _yz_cd = _mm_unpackhi_ps(jy1,jz1);             /* y2 z2 y3 z3 */             \
    /* Each shuffle builds (x_k, filler, y_k, z_k); lane 1 is never stored */    \
    _mem1  = _mm_sub_ps(_mem1,_mm_shuffle_ps(jx1,_yz_ab,_MM_SHUFFLE(1,0,0,0)));  \
    _mem2  = _mm_sub_ps(_mem2,_mm_shuffle_ps(jx1,_yz_ab,_MM_SHUFFLE(3,2,0,1)));  \
    _mem3  = _mm_sub_ps(_mem3,_mm_shuffle_ps(jx1,_yz_cd,_MM_SHUFFLE(1,0,0,2)));  \
    _mem4  = _mm_sub_ps(_mem4,_mm_shuffle_ps(jx1,_yz_cd,_MM_SHUFFLE(3,2,0,3)));  \
    _mm_store_ss(ptr1,_mem1);                                                    \
    _mm_storeh_pi((__m64 *)(ptr1+1),_mem1);                                      \
    _mm_store_ss(ptr2,_mem2);                                                    \
    _mm_storeh_pi((__m64 *)(ptr2+1),_mem2);                                      \
    _mm_store_ss(ptr3,_mem3);                                                    \
    _mm_storeh_pi((__m64 *)(ptr3+1),_mem3);                                      \
    _mm_store_ss(ptr4,_mem4);                                                    \
    _mm_storeh_pi((__m64 *)(ptr4+1),_mem4);                                      \
}
/* Subtract two consecutive rvecs (six floats) from each of four addresses.
 *
 * Lane k (k = 0..3) of jx1,jy1,jz1,jx2,jy2,jz2 is subtracted from
 * ptr(k+1)[0..5].
 * Side effect: jx1, jz1 and jy2 are overwritten and used as scratch space.
 * Lane listings in the comments run from lane 0 to lane 3; the suffixes
 * a/b/c/d denote the values destined for ptr1/ptr2/ptr3/ptr4.
 */
#define GMX_MM_DECREMENT_2RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _f1,_f2,_f3,_f4,_tail12,_tail34,_xy_cd,_zx_cd,_yz_cd;                 \
    /* First four floats of every destination */                                 \
    _f1     = _mm_loadu_ps(ptr1);                                                \
    _f2     = _mm_loadu_ps(ptr2);                                                \
    _f3     = _mm_loadu_ps(ptr3);                                                \
    _f4     = _mm_loadu_ps(ptr4);                                                \
    /* Remaining two floats, two destinations packed per register */             \
    _tail12 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4));                  \
    _tail12 = _mm_loadh_pi(_tail12,(__m64 *)(ptr2+4));                           \
    _tail34 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3+4));                  \
    _tail34 = _mm_loadh_pi(_tail34,(__m64 *)(ptr4+4));                           \
    _xy_cd  = _mm_unpackhi_ps(jx1,jy1);         /* x1c y1c x1d y1d */            \
    jx1     = _mm_unpacklo_ps(jx1,jy1);         /* x1a y1a x1b y1b */            \
    _zx_cd  = _mm_unpackhi_ps(jz1,jx2);         /* z1c x2c z1d x2d */            \
    jz1     = _mm_unpacklo_ps(jz1,jx2);         /* z1a x2a z1b x2b */            \
    _yz_cd  = _mm_unpackhi_ps(jy2,jz2);         /* y2c z2c y2d z2d */            \
    jy2     = _mm_unpacklo_ps(jy2,jz2);         /* y2a z2a y2b z2b */            \
    _f1     = _mm_sub_ps(_f1,_mm_movelh_ps(jx1,jz1));                            \
    _f2     = _mm_sub_ps(_f2,_mm_movehl_ps(jz1,jx1));                            \
    _f3     = _mm_sub_ps(_f3,_mm_movelh_ps(_xy_cd,_zx_cd));                      \
    _f4     = _mm_sub_ps(_f4,_mm_movehl_ps(_zx_cd,_xy_cd));                      \
    _tail12 = _mm_sub_ps(_tail12,jy2);                                           \
    _tail34 = _mm_sub_ps(_tail34,_yz_cd);                                        \
    _mm_storeu_ps(ptr1,_f1);                                                     \
    _mm_storeu_ps(ptr2,_f2);                                                     \
    _mm_storeu_ps(ptr3,_f3);                                                     \
    _mm_storeu_ps(ptr4,_f4);                                                     \
    _mm_storel_pi((__m64 *)(ptr1+4),_tail12);                                    \
    _mm_storeh_pi((__m64 *)(ptr2+4),_tail12);                                    \
    _mm_storel_pi((__m64 *)(ptr3+4),_tail34);                                    \
    _mm_storeh_pi((__m64 *)(ptr4+4),_tail34);                                    \
}
/* Subtract three consecutive rvecs (nine floats) from each of four addresses.
 *
 * Lane k (k = 0..3) of the nine input vectors is subtracted from
 * ptr(k+1)[0..8].
 * Side effect: jx1, jz1, jy2 and jx3 are overwritten and used as scratch.
 * Lane listings in the comments run from lane 0 to lane 3; the suffixes
 * a/b/c/d denote the values destined for ptr1/ptr2/ptr3/ptr4.
 */
#define GMX_MM_DECREMENT_3RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _f1a,_f1b,_f1c,_f2a,_f2b,_f2c;                                        \
    __m128 _f3a,_f3b,_f3c,_f4a,_f4b,_f4c;                                        \
    __m128 _xy1_cd,_zx_cd,_yz2_cd,_xy3_cd,_z3_b,_z3_c,_z3_d;                     \
    /* Current memory contents: floats 0-3, 4-7 and the single float 8 */        \
    _f1a    = _mm_loadu_ps(ptr1);                                                \
    _f1b    = _mm_loadu_ps(ptr1+4);                                              \
    _f1c    = _mm_load_ss(ptr1+8);                                               \
    _f2a    = _mm_loadu_ps(ptr2);                                                \
    _f2b    = _mm_loadu_ps(ptr2+4);                                              \
    _f2c    = _mm_load_ss(ptr2+8);                                               \
    _f3a    = _mm_loadu_ps(ptr3);                                                \
    _f3b    = _mm_loadu_ps(ptr3+4);                                              \
    _f3c    = _mm_load_ss(ptr3+8);                                               \
    _f4a    = _mm_loadu_ps(ptr4);                                                \
    _f4b    = _mm_loadu_ps(ptr4+4);                                              \
    _f4c    = _mm_load_ss(ptr4+8);                                               \
    /* Pairwise interleave; the low halves reuse the input registers */          \
    _xy1_cd = _mm_unpackhi_ps(jx1,jy1);         /* x1c y1c x1d y1d */            \
    jx1     = _mm_unpacklo_ps(jx1,jy1);         /* x1a y1a x1b y1b */            \
    _zx_cd  = _mm_unpackhi_ps(jz1,jx2);         /* z1c x2c z1d x2d */            \
    jz1     = _mm_unpacklo_ps(jz1,jx2);         /* z1a x2a z1b x2b */            \
    _yz2_cd = _mm_unpackhi_ps(jy2,jz2);         /* y2c z2c y2d z2d */            \
    jy2     = _mm_unpacklo_ps(jy2,jz2);         /* y2a z2a y2b z2b */            \
    _xy3_cd = _mm_unpackhi_ps(jx3,jy3);         /* x3c y3c x3d y3d */            \
    jx3     = _mm_unpacklo_ps(jx3,jy3);         /* x3a y3a x3b y3b */            \
    /* Bring z3 of lanes 1, 2 and 3 down to lane 0 for the scalar subtracts */   \
    _z3_b   = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1));                      \
    _z3_c   = _mm_movehl_ps(jz3,jz3);                                            \
    _z3_d   = _mm_shuffle_ps(_z3_c,_z3_c,_MM_SHUFFLE(0,0,0,1));                  \
    _f1a    = _mm_sub_ps(_f1a,_mm_movelh_ps(jx1,jz1));                           \
    _f1b    = _mm_sub_ps(_f1b,_mm_movelh_ps(jy2,jx3));                           \
    _f1c    = _mm_sub_ss(_f1c,jz3);                                              \
    _f2a    = _mm_sub_ps(_f2a,_mm_movehl_ps(jz1,jx1));                           \
    _f2b    = _mm_sub_ps(_f2b,_mm_movehl_ps(jx3,jy2));                           \
    _f2c    = _mm_sub_ss(_f2c,_z3_b);                                            \
    _f3a    = _mm_sub_ps(_f3a,_mm_movelh_ps(_xy1_cd,_zx_cd));                    \
    _f3b    = _mm_sub_ps(_f3b,_mm_movelh_ps(_yz2_cd,_xy3_cd));                   \
    _f3c    = _mm_sub_ss(_f3c,_z3_c);                                            \
    _f4a    = _mm_sub_ps(_f4a,_mm_movehl_ps(_zx_cd,_xy1_cd));                    \
    _f4b    = _mm_sub_ps(_f4b,_mm_movehl_ps(_xy3_cd,_yz2_cd));                   \
    _f4c    = _mm_sub_ss(_f4c,_z3_d);                                            \
    _mm_storeu_ps(ptr1,_f1a);                                                    \
    _mm_storeu_ps(ptr1+4,_f1b);                                                  \
    _mm_store_ss(ptr1+8,_f1c);                                                   \
    _mm_storeu_ps(ptr2,_f2a);                                                    \
    _mm_storeu_ps(ptr2+4,_f2b);                                                  \
    _mm_store_ss(ptr2+8,_f2c);                                                   \
    _mm_storeu_ps(ptr3,_f3a);                                                    \
    _mm_storeu_ps(ptr3+4,_f3b);                                                  \
    _mm_store_ss(ptr3+8,_f3c);                                                   \
    _mm_storeu_ps(ptr4,_f4a);                                                    \
    _mm_storeu_ps(ptr4+4,_f4b);                                                  \
    _mm_store_ss(ptr4+8,_f4c);                                                   \
}
/* Subtract four consecutive rvecs (twelve floats) from each of four addresses.
 *
 * Lane k (k = 0..3) of the twelve input vectors is subtracted from
 * ptr(k+1)[0..11].
 * Side effect: jx1, jz1, jy2, jx3, jz3 and jy4 are overwritten and used as
 * scratch space (jz1, jx3 and jy4 end up holding the rows subtracted from
 * ptr2).
 * Lane listings in the comments run from lane 0 to lane 3; the suffixes
 * a/b/c/d denote the values destined for ptr1/ptr2/ptr3/ptr4.
 */
#define GMX_MM_DECREMENT_4RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128 _f1a,_f1b,_f1c,_f2a,_f2b,_f2c;                                        \
    __m128 _f3a,_f3b,_f3c,_f4a,_f4b,_f4c;                                        \
    __m128 _xy1_cd,_zx12_cd,_yz2_cd,_xy3_cd,_zx34_cd,_yz4_cd;                    \
    /* Current memory contents, four floats at a time */                         \
    _f1a     = _mm_loadu_ps(ptr1);                                               \
    _f1b     = _mm_loadu_ps(ptr1+4);                                             \
    _f1c     = _mm_loadu_ps(ptr1+8);                                             \
    _f2a     = _mm_loadu_ps(ptr2);                                               \
    _f2b     = _mm_loadu_ps(ptr2+4);                                             \
    _f2c     = _mm_loadu_ps(ptr2+8);                                             \
    _f3a     = _mm_loadu_ps(ptr3);                                               \
    _f3b     = _mm_loadu_ps(ptr3+4);                                             \
    _f3c     = _mm_loadu_ps(ptr3+8);                                             \
    _f4a     = _mm_loadu_ps(ptr4);                                               \
    _f4b     = _mm_loadu_ps(ptr4+4);                                             \
    _f4c     = _mm_loadu_ps(ptr4+8);                                             \
    /* Pairwise interleave; the low halves reuse the input registers */          \
    _xy1_cd  = _mm_unpackhi_ps(jx1,jy1);        /* x1c y1c x1d y1d */            \
    jx1      = _mm_unpacklo_ps(jx1,jy1);        /* x1a y1a x1b y1b */            \
    _zx12_cd = _mm_unpackhi_ps(jz1,jx2);        /* z1c x2c z1d x2d */            \
    jz1      = _mm_unpacklo_ps(jz1,jx2);        /* z1a x2a z1b x2b */            \
    _yz2_cd  = _mm_unpackhi_ps(jy2,jz2);        /* y2c z2c y2d z2d */            \
    jy2      = _mm_unpacklo_ps(jy2,jz2);        /* y2a z2a y2b z2b */            \
    _xy3_cd  = _mm_unpackhi_ps(jx3,jy3);        /* x3c y3c x3d y3d */            \
    jx3      = _mm_unpacklo_ps(jx3,jy3);        /* x3a y3a x3b y3b */            \
    _zx34_cd = _mm_unpackhi_ps(jz3,jx4);        /* z3c x4c z3d x4d */            \
    jz3      = _mm_unpacklo_ps(jz3,jx4);        /* z3a x4a z3b x4b */            \
    _yz4_cd  = _mm_unpackhi_ps(jy4,jz4);        /* y4c z4c y4d z4d */            \
    jy4      = _mm_unpacklo_ps(jy4,jz4);        /* y4a z4a y4b z4b */            \
    /* ptr1 rows must be formed before jz1/jx3/jy4 are reused for ptr2 rows */   \
    _f1a     = _mm_sub_ps(_f1a,_mm_movelh_ps(jx1,jz1));                          \
    _f1b     = _mm_sub_ps(_f1b,_mm_movelh_ps(jy2,jx3));                          \
    _f1c     = _mm_sub_ps(_f1c,_mm_movelh_ps(jz3,jy4));                          \
    jz1      = _mm_movehl_ps(jz1,jx1);          /* x1b y1b z1b x2b */            \
    jx3      = _mm_movehl_ps(jx3,jy2);          /* y2b z2b x3b y3b */            \
    jy4      = _mm_movehl_ps(jy4,jz3);          /* z3b x4b y4b z4b */            \
    _f2a     = _mm_sub_ps(_f2a,jz1);                                             \
    _f2b     = _mm_sub_ps(_f2b,jx3);                                             \
    _f2c     = _mm_sub_ps(_f2c,jy4);                                             \
    _f3a     = _mm_sub_ps(_f3a,_mm_movelh_ps(_xy1_cd,_zx12_cd));                 \
    _f3b     = _mm_sub_ps(_f3b,_mm_movelh_ps(_yz2_cd,_xy3_cd));                  \
    _f3c     = _mm_sub_ps(_f3c,_mm_movelh_ps(_zx34_cd,_yz4_cd));                 \
    _f4a     = _mm_sub_ps(_f4a,_mm_movehl_ps(_zx12_cd,_xy1_cd));                 \
    _f4b     = _mm_sub_ps(_f4b,_mm_movehl_ps(_xy3_cd,_yz2_cd));                  \
    _f4c     = _mm_sub_ps(_f4c,_mm_movehl_ps(_yz4_cd,_zx34_cd));                 \
    _mm_storeu_ps(ptr1,_f1a);                                                    \
    _mm_storeu_ps(ptr1+4,_f1b);                                                  \
    _mm_storeu_ps(ptr1+8,_f1c);                                                  \
    _mm_storeu_ps(ptr2,_f2a);                                                    \
    _mm_storeu_ps(ptr2+4,_f2b);                                                  \
    _mm_storeu_ps(ptr2+8,_f2c);                                                  \
    _mm_storeu_ps(ptr3,_f3a);                                                    \
    _mm_storeu_ps(ptr3+4,_f3b);                                                  \
    _mm_storeu_ps(ptr3+8,_f3c);                                                  \
    _mm_storeu_ps(ptr4,_f4a);                                                    \
    _mm_storeu_ps(ptr4+4,_f4b);                                                  \
    _mm_storeu_ps(ptr4+8,_f4c);                                                  \
}
/* Call once with rswitch/rcut at the beginning of a kernel to precompute the
 * six coefficients used by the analytic 5th-order switch evaluation.
 *
 * With w = gmx_mm_inv_ps(rcut - rswitch) the outputs are
 *   switch_C3 = -10*w^3   switch_C4 = 15*w^4   switch_C5 =  -6*w^5
 *   switch_D2 = -30*w^3   switch_D3 = 60*w^4   switch_D4 = -30*w^5
 * NOTE(review): gmx_mm_inv_ps is defined elsewhere in this file; presumably
 * it returns the reciprocal of its argument - confirm.
 */
#define GMX_MM_SETUP_SWITCH5_PS(rswitch,rcut,switch_C3,switch_C4,switch_C5,switch_D2,switch_D3,switch_D4) { \
    __m128 _sws_w,_sws_w2,_sws_w3,_sws_w4,_sws_w5;                               \
    /* Powers of w; the operand pairing fixes the rounding of w^4 and w^5 */     \
    _sws_w    = gmx_mm_inv_ps(_mm_sub_ps(rcut,rswitch));                         \
    _sws_w2   = _mm_mul_ps(_sws_w,_sws_w);                                       \
    _sws_w3   = _mm_mul_ps(_sws_w2,_sws_w);                                      \
    _sws_w4   = _mm_mul_ps(_sws_w2,_sws_w2);                                     \
    _sws_w5   = _mm_mul_ps(_sws_w3,_sws_w2);                                     \
    switch_C3 = _mm_mul_ps(_mm_set1_ps(-10.0f),_sws_w3);                         \
    switch_C4 = _mm_mul_ps(_mm_set1_ps( 15.0f),_sws_w4);                         \
    switch_C5 = _mm_mul_ps(_mm_set1_ps( -6.0f),_sws_w5);                         \
    switch_D2 = _mm_mul_ps(_mm_set1_ps(-30.0f),_sws_w3);                         \
    switch_D3 = _mm_mul_ps(_mm_set1_ps( 60.0f),_sws_w4);                         \
    switch_D4 = _mm_mul_ps(_mm_set1_ps(-30.0f),_sws_w5);                         \
}
/* Evaluate the 5th-order switch function and a companion polynomial.
 *
 * With d = min(max(r,rswitch),rcut) - rswitch the macro assigns
 *   sw  = 1 + d^3 * (sw_C3 + d*(sw_C4 + d*sw_C5))
 *   dsw =     d^2 * (sw_D2 + d*(sw_D3 + d*sw_D4))
 * so that for r <= rswitch the result is sw = 1, dsw = 0.
 * NOTE(review): dsw is presumably the derivative of sw with respect to r
 * when the coefficients come from GMX_MM_SETUP_SWITCH5_PS - confirm.
 *
 * The clamped distance and its square live in block-local temporaries, so
 * an expansion neither requires nor overwrites caller variables.
 */
#define GMX_MM_EVALUATE_SWITCH5_PS(r,rswitch,rcut,sw,dsw,sw_C3,sw_C4,sw_C5,sw_D2,sw_D3,sw_D4) { \
    const __m128 _sw_one = _mm_set1_ps(1.0f);                                    \
    __m128 _sw_d,_sw_d2;                                                         \
    /* Clamp r into [rswitch,rcut] and measure from the switch start */          \
    _sw_d  = _mm_max_ps(r,rswitch);                                              \
    _sw_d  = _mm_min_ps(_sw_d,rcut);                                             \
    _sw_d  = _mm_sub_ps(_sw_d,rswitch);                                          \
    _sw_d2 = _mm_mul_ps(_sw_d,_sw_d);                                            \
    /* Horner evaluation of both polynomials */                                  \
    sw     = _mm_mul_ps(_sw_d,sw_C5);                                            \
    dsw    = _mm_mul_ps(_sw_d,sw_D4);                                            \
    sw     = _mm_add_ps(sw,sw_C4);                                               \
    dsw    = _mm_add_ps(dsw,sw_D3);                                              \
    sw     = _mm_mul_ps(sw,_sw_d);                                               \
    dsw    = _mm_mul_ps(dsw,_sw_d);                                              \
    sw     = _mm_add_ps(sw,sw_C3);                                               \
    dsw    = _mm_add_ps(dsw,sw_D2);                                              \
    sw     = _mm_mul_ps(sw,_mm_mul_ps(_sw_d,_sw_d2));                            \
    dsw    = _mm_mul_ps(dsw,_sw_d2);                                             \
    sw     = _mm_add_ps(sw,_sw_one);                                             \
}
/* gmx_mm_update_iforce_1atom_ps: reduce the force accumulators *fix1, *fiy1
 * and *fiz1 and add the result to the three floats at fptr and to the three
 * floats at fshiftptr. The accumulators are overwritten in the process.
 *
 * NOTE(review): the return type, the remaining parameters (fptr and
 * fshiftptr are used below but not declared in this excerpt), the local
 * declarations of t1..t3, the braces and any preprocessor conditionals are
 * not visible here. The _mm_hadd_ps sequence and the _MM_TRANSPOSE4_PS
 * sequence look like two alternative reduction paths (presumably SSE3 versus
 * plain SSE2) - confirm against the complete file.
 */
2255 gmx_mm_update_iforce_1atom_ps(__m128
*fix1
, __m128
*fiy1
, __m128
*fiz1
,
/* Reduction by horizontal adds; the result ends up in *fix1 */
2262 *fix1
= _mm_hadd_ps(*fix1
,*fix1
);
2263 *fiy1
= _mm_hadd_ps(*fiy1
,*fiz1
);
2265 *fix1
= _mm_hadd_ps(*fix1
,*fiy1
); /* fiz1 fiy1 fix1 fix1 */
2268 /* transpose data */
/* Reduction by transpose followed by adding the four rows into *fix1 */
2270 _MM_TRANSPOSE4_PS(*fix1
,t1
,*fiy1
,*fiz1
);
2271 *fix1
= _mm_add_ps(_mm_add_ps(*fix1
,t1
), _mm_add_ps(*fiy1
,*fiz1
));
/* Load three floats from each destination as (p[0], 0, p[1], p[2]) */
2273 t2
= _mm_load_ss(fptr
);
2274 t2
= _mm_loadh_pi(t2
,(__m64
*)(fptr
+1));
2275 t3
= _mm_load_ss(fshiftptr
);
2276 t3
= _mm_loadh_pi(t3
,(__m64
*)(fshiftptr
+1));
/* Accumulate the reduced force into both */
2278 t2
= _mm_add_ps(t2
,*fix1
);
2279 t3
= _mm_add_ps(t3
,*fix1
);
/* Write back lane 0 and lanes 2-3; lane 1 is never stored */
2281 _mm_store_ss(fptr
,t2
);
2282 _mm_storeh_pi((__m64
*)(fptr
+1),t2
);
2283 _mm_store_ss(fshiftptr
,t3
);
2284 _mm_storeh_pi((__m64
*)(fshiftptr
+1),t3
);
/* gmx_mm_update_iforce_2atoms_ps: reduce the six force accumulators of two
 * atoms, add the results to the six floats at fptr, and add the per-component
 * total over both atoms to the three floats at fshiftptr. The accumulators
 * are overwritten in the process.
 *
 * NOTE(review): the return type, the remaining parameters (fptr and
 * fshiftptr are used below but not declared in this excerpt), the local
 * declarations of t1, t2 and t4, the braces and any preprocessor conditionals
 * are not visible here. The _mm_hadd_ps sequence and the transpose sequence
 * look like two alternative reduction paths (presumably SSE3 versus plain
 * SSE2) - confirm against the complete file.
 */
2288 gmx_mm_update_iforce_2atoms_ps(__m128
*fix1
, __m128
*fiy1
, __m128
*fiz1
,
2289 __m128
*fix2
, __m128
*fiy2
, __m128
*fiz2
,
/* Reduction by horizontal adds: results land in *fix1 and *fiy2 */
2296 *fix1
= _mm_hadd_ps(*fix1
,*fiy1
);
2297 *fiz1
= _mm_hadd_ps(*fiz1
,*fix2
);
2298 *fiy2
= _mm_hadd_ps(*fiy2
,*fiz2
);
2300 *fix1
= _mm_hadd_ps(*fix1
,*fiz1
); /* fix2 fiz1 fiy1 fix1 */
2301 *fiy2
= _mm_hadd_ps(*fiy2
,*fiy2
); /* - - fiz2 fiy2 */
2304 /* transpose data */
/* Reduction by transpose and row sums: results land in *fix1 and *fiy2 */
2305 _MM_TRANSPOSE4_PS(*fix1
,*fiy1
,*fiz1
,*fix2
);
2306 t1
= _mm_unpacklo_ps(*fiy2
,*fiz2
);
2307 t2
= _mm_unpackhi_ps(*fiy2
,*fiz2
);
2309 *fix1
= _mm_add_ps(_mm_add_ps(*fix1
,*fiy1
), _mm_add_ps(*fiz1
,*fix2
));
2310 t1
= _mm_add_ps(t1
,t2
);
2311 t2
= _mm_movehl_ps(t2
,t1
);
2312 *fiy2
= _mm_add_ps(t1
,t2
);
/* Add to the six floats at fptr: four via *fix1, the last two via *fiy2 */
2314 _mm_storeu_ps(fptr
, _mm_add_ps(*fix1
,_mm_loadu_ps(fptr
) ));
2315 t1
= _mm_loadl_pi(t1
,(__m64
*)(fptr
+4));
2316 _mm_storel_pi((__m64
*)(fptr
+4), _mm_add_ps(*fiy2
,t1
));
/* Load the three floats at fshiftptr as (s[2], 0, s[0], s[1]) */
2318 t4
= _mm_load_ss(fshiftptr
+2);
2319 t4
= _mm_loadh_pi(t4
,(__m64
*)(fshiftptr
));
/* Rearrange both atoms' sums into the same (z, -, x, y) lane layout */
2321 t1
= _mm_shuffle_ps(*fix1
,*fiy2
,_MM_SHUFFLE(0,0,3,2)); /* fiy2 - fix2 fiz1 */
2322 t1
= _mm_shuffle_ps(t1
,t1
,_MM_SHUFFLE(3,1,0,0)); /* fiy2 fix2 - fiz1 */
2323 t2
= _mm_shuffle_ps(*fiy2
,*fix1
,_MM_SHUFFLE(1,0,0,1)); /* fiy1 fix1 - fiz2 */
2325 t1
= _mm_add_ps(t1
,t2
);
2326 t1
= _mm_add_ps(t1
,t4
); /* y x - z */
/* Write back lane 0 to fshiftptr[2] and lanes 2-3 to fshiftptr[0..1] */
2328 _mm_store_ss(fshiftptr
+2,t1
);
2329 _mm_storeh_pi((__m64
*)(fshiftptr
),t1
);
/* gmx_mm_update_iforce_3atoms_ps: reduce the nine force accumulators of
 * three atoms, add the results to the nine floats at fptr, and add the
 * per-component total over the three atoms to the three floats at fshiftptr.
 * The accumulators are overwritten in the process.
 *
 * NOTE(review): the return type, the remaining parameters (fptr and
 * fshiftptr are used below but not declared in this excerpt), the local
 * declarations of t1..t4, the braces and any preprocessor conditionals are
 * not visible here. The _mm_hadd_ps sequence and the transpose sequence look
 * like two alternative reduction paths (presumably SSE3 versus plain SSE2) -
 * confirm against the complete file.
 */
2335 gmx_mm_update_iforce_3atoms_ps(__m128
*fix1
, __m128
*fiy1
, __m128
*fiz1
,
2336 __m128
*fix2
, __m128
*fiy2
, __m128
*fiz2
,
2337 __m128
*fix3
, __m128
*fiy3
, __m128
*fiz3
,
/* Reduction by horizontal adds: results land in *fix1, *fiy2 and *fiz3 */
2344 *fix1
= _mm_hadd_ps(*fix1
,*fiy1
);
2345 *fiz1
= _mm_hadd_ps(*fiz1
,*fix2
);
2346 *fiy2
= _mm_hadd_ps(*fiy2
,*fiz2
);
2347 *fix3
= _mm_hadd_ps(*fix3
,*fiy3
);
2348 *fiz3
= _mm_hadd_ps(*fiz3
,*fiz3
);
2350 *fix1
= _mm_hadd_ps(*fix1
,*fiz1
); /* fix2 fiz1 fiy1 fix1 */
2351 *fiy2
= _mm_hadd_ps(*fiy2
,*fix3
); /* fiy3 fix3 fiz2 fiy2 */
2352 *fiz3
= _mm_hadd_ps(*fiz3
,*fiz3
); /* - - - fiz3 */
2355 /* transpose data */
/* Reduction by transpose and row sums: results land in *fix1, *fiy2, *fiz3 */
2356 _MM_TRANSPOSE4_PS(*fix1
,*fiy1
,*fiz1
,*fix2
);
2357 _MM_TRANSPOSE4_PS(*fiy2
,*fiz2
,*fix3
,*fiy3
);
2358 t2
= _mm_movehl_ps(_mm_setzero_ps(),*fiz3
);
2359 t1
= _mm_shuffle_ps(*fiz3
,*fiz3
,_MM_SHUFFLE(0,0,0,1));
2360 t3
= _mm_shuffle_ps(t2
,t2
,_MM_SHUFFLE(0,0,0,1));
2362 *fix1
= _mm_add_ps(_mm_add_ps(*fix1
,*fiy1
), _mm_add_ps(*fiz1
,*fix2
));
2363 *fiy2
= _mm_add_ps(_mm_add_ps(*fiy2
,*fiz2
), _mm_add_ps(*fix3
,*fiy3
));
2364 *fiz3
= _mm_add_ss(_mm_add_ps(*fiz3
,t1
) , _mm_add_ps(t2
,t3
));
/* Add to the nine floats at fptr: 4 + 4 full-width, then one scalar */
2366 _mm_storeu_ps(fptr
, _mm_add_ps(*fix1
,_mm_loadu_ps(fptr
) ));
2367 _mm_storeu_ps(fptr
+4,_mm_add_ps(*fiy2
,_mm_loadu_ps(fptr
+4)));
2368 _mm_store_ss (fptr
+8,_mm_add_ss(*fiz3
,_mm_load_ss(fptr
+8) ));
/* Load the three floats at fshiftptr as (s[2], 0, s[0], s[1]) */
2370 t4
= _mm_load_ss(fshiftptr
+2);
2371 t4
= _mm_loadh_pi(t4
,(__m64
*)(fshiftptr
));
/* Rearrange each atom's sums into the same (z, -, x, y) lane layout */
2373 t1
= _mm_shuffle_ps(*fiz3
,*fix1
,_MM_SHUFFLE(1,0,0,0)); /* fiy1 fix1 - fiz3 */
2374 t2
= _mm_shuffle_ps(*fix1
,*fiy2
,_MM_SHUFFLE(3,2,2,2)); /* fiy3 fix3 - fiz1 */
2375 t3
= _mm_shuffle_ps(*fiy2
,*fix1
,_MM_SHUFFLE(3,3,0,1)); /* fix2 fix2 fiy2 fiz2 */
2376 t3
= _mm_shuffle_ps(t3
,t3
,_MM_SHUFFLE(1,2,0,0)); /* fiy2 fix2 - fiz2 */
2378 t1
= _mm_add_ps(t1
,t2
);
2379 t3
= _mm_add_ps(t3
,t4
);
2380 t1
= _mm_add_ps(t1
,t3
); /* y x - z */
/* Write back lane 0 to fshiftptr[2] and lanes 2-3 to fshiftptr[0..1] */
2382 _mm_store_ss(fshiftptr
+2,t1
);
2383 _mm_storeh_pi((__m64
*)(fshiftptr
),t1
);
/* gmx_mm_update_iforce_4atoms_ps: reduce the twelve force accumulators of
 * four atoms, add the results to the twelve floats at fptr, and add the
 * per-component total over the four atoms to the three floats at fshiftptr.
 * The accumulators are overwritten in the process.
 *
 * NOTE(review): the return type, the remaining parameters (fptr and
 * fshiftptr are used below but not declared in this excerpt), the braces and
 * any preprocessor conditionals are not visible here. The _mm_hadd_ps
 * sequence and the transpose sequence look like two alternative reduction
 * paths (presumably SSE3 versus plain SSE2) - confirm against the complete
 * file.
 */
2388 gmx_mm_update_iforce_4atoms_ps(__m128
*fix1
, __m128
*fiy1
, __m128
*fiz1
,
2389 __m128
*fix2
, __m128
*fiy2
, __m128
*fiz2
,
2390 __m128
*fix3
, __m128
*fiy3
, __m128
*fiz3
,
2391 __m128
*fix4
, __m128
*fiy4
, __m128
*fiz4
,
2395 __m128 t1
,t2
,t3
,t4
,t5
;
/* Reduction by horizontal adds: results land in *fix1, *fiy2 and *fiz3 */
2398 *fix1
= _mm_hadd_ps(*fix1
,*fiy1
);
2399 *fiz1
= _mm_hadd_ps(*fiz1
,*fix2
);
2400 *fiy2
= _mm_hadd_ps(*fiy2
,*fiz2
);
2401 *fix3
= _mm_hadd_ps(*fix3
,*fiy3
);
2402 *fiz3
= _mm_hadd_ps(*fiz3
,*fix4
);
2403 *fiy4
= _mm_hadd_ps(*fiy4
,*fiz4
);
2405 *fix1
= _mm_hadd_ps(*fix1
,*fiz1
); /* fix2 fiz1 fiy1 fix1 */
2406 *fiy2
= _mm_hadd_ps(*fiy2
,*fix3
); /* fiy3 fix3 fiz2 fiy2 */
2407 *fiz3
= _mm_hadd_ps(*fiz3
,*fiy4
); /* fiz4 fiy4 fix4 fiz3 */
2410 /* transpose data */
/* Reduction by transpose and row sums: results land in *fix1, *fiy2, *fiz3 */
2411 _MM_TRANSPOSE4_PS(*fix1
,*fiy1
,*fiz1
,*fix2
);
2412 _MM_TRANSPOSE4_PS(*fiy2
,*fiz2
,*fix3
,*fiy3
);
2413 _MM_TRANSPOSE4_PS(*fiz3
,*fix4
,*fiy4
,*fiz4
);
2415 *fix1
= _mm_add_ps(_mm_add_ps(*fix1
,*fiy1
), _mm_add_ps(*fiz1
,*fix2
));
2416 *fiy2
= _mm_add_ps(_mm_add_ps(*fiy2
,*fiz2
), _mm_add_ps(*fix3
,*fiy3
));
2417 *fiz3
= _mm_add_ps(_mm_add_ps(*fiz3
,*fix4
), _mm_add_ps(*fiy4
,*fiz4
));
/* Add to the twelve floats at fptr, four at a time */
2419 _mm_storeu_ps(fptr
, _mm_add_ps(*fix1
,_mm_loadu_ps(fptr
) ));
2420 _mm_storeu_ps(fptr
+4,_mm_add_ps(*fiy2
,_mm_loadu_ps(fptr
+4)));
2421 _mm_storeu_ps(fptr
+8,_mm_add_ps(*fiz3
,_mm_loadu_ps(fptr
+8)));
/* Load the three floats at fshiftptr as (s[2], 0, s[0], s[1]) */
2423 t5
= _mm_load_ss(fshiftptr
+2);
2424 t5
= _mm_loadh_pi(t5
,(__m64
*)(fshiftptr
));
/* Rearrange each atom's sums into the same (z, -, x, y) lane layout */
2426 t1
= _mm_shuffle_ps(*fix1
,*fix1
,_MM_SHUFFLE(1,0,2,2)); /* fiy1 fix1 - fiz1 */
2427 t2
= _mm_shuffle_ps(*fiy2
,*fiy2
,_MM_SHUFFLE(3,2,1,1)); /* fiy3 fix3 - fiz2 */
2428 t3
= _mm_shuffle_ps(*fiz3
,*fiz3
,_MM_SHUFFLE(2,1,0,0)); /* fiy4 fix4 - fiz3 */
2429 t4
= _mm_shuffle_ps(*fix1
,*fiy2
,_MM_SHUFFLE(0,0,3,3)); /* fiy2 fiy2 fix2 fix2 */
2430 t4
= _mm_shuffle_ps(*fiz3
,t4
,_MM_SHUFFLE(2,0,3,3)); /* fiy2 fix2 - fiz4 */
2432 t1
= _mm_add_ps(t1
,t2
);
2433 t3
= _mm_add_ps(t3
,t4
);
2434 t1
= _mm_add_ps(t1
,t3
); /* y x - z */
2435 t5
= _mm_add_ps(t5
,t1
);
/* Write back lane 0 to fshiftptr[2] and lanes 2-3 to fshiftptr[0..1] */
2437 _mm_store_ss(fshiftptr
+2,t5
);
2438 _mm_storeh_pi((__m64
*)(fshiftptr
),t5
);
/* Add the sum of the four lanes of pot1 to the float at ptr1, using
 * horizontal adds (_mm_hadd_ps).
 * Side effect: pot1 is overwritten; afterwards every lane holds the sum.
 * NOTE(review): a second definition of this macro without _mm_hadd_ps
 * follows later in the file; presumably a preprocessor conditional that is
 * not visible in this excerpt selects between them - confirm.
 */
#define GMX_MM_UPDATE_1POT_PS(pot1,ptr1)                                         \
{                                                                                \
    __m128 _pot_pairs = _mm_hadd_ps(pot1,pot1);   /* p0+p1 p2+p3 p0+p1 p2+p3 */  \
    pot1 = _mm_hadd_ps(_pot_pairs,_pot_pairs);    /* total in every lane */      \
    _mm_store_ss(ptr1,_mm_add_ss(pot1,_mm_load_ss(ptr1)));                       \
}
/* Add the lane sum of pot1 to the float at ptr1 and the lane sum of pot2 to
 * the float at ptr2, using horizontal adds (_mm_hadd_ps).
 * Side effect: pot1 and pot2 are overwritten; afterwards lane 0 of each
 * holds its own sum.
 * NOTE(review): a second definition of this macro without _mm_hadd_ps
 * follows later in the file; presumably a preprocessor conditional that is
 * not visible in this excerpt selects between them - confirm.
 */
#define GMX_MM_UPDATE_2POT_PS(pot1,ptr1,pot2,ptr2)                               \
{                                                                                \
    /* Pair sums of pot1 in lanes 0-1, of pot2 in lanes 2-3, then fold */        \
    pot1 = _mm_hadd_ps(pot1,pot2);                                               \
    pot1 = _mm_hadd_ps(pot1,pot1);                /* sum1 sum2 sum1 sum2 */      \
    pot2 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(0,0,0,1)); /* lane 0 = sum2 */   \
    _mm_store_ss(ptr1,_mm_add_ss(pot1,_mm_load_ss(ptr1)));                       \
    _mm_store_ss(ptr2,_mm_add_ss(pot2,_mm_load_ss(ptr2)));                       \
}
/* Add the sum of the four lanes of pot1 to the float at ptr1, using only
 * shuffles and vertical adds.
 * Side effect: pot1 is overwritten; only lane 0 (the full sum) is meaningful
 * afterwards.
 * NOTE(review): an earlier definition of this macro based on _mm_hadd_ps
 * exists in the file; presumably a preprocessor conditional that is not
 * visible in this excerpt selects between them - confirm.
 */
#define GMX_MM_UPDATE_1POT_PS(pot1,ptr1)                                         \
{                                                                                \
    __m128 _pot_fold;                                                            \
    _pot_fold = _mm_movehl_ps(pot1,pot1);         /* p2 p3 p2 p3 */              \
    pot1      = _mm_add_ps(pot1,_pot_fold);       /* lanes 0-1: p0+p2 p1+p3 */   \
    _pot_fold = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(0,0,0,1));                  \
    pot1      = _mm_add_ps(pot1,_pot_fold);       /* lane 0: full sum */         \
    _mm_store_ss(ptr1,_mm_add_ss(pot1,_mm_load_ss(ptr1)));                       \
}
/* Add the lane sum of pot1 to the float at ptr1 and the lane sum of pot2 to
 * the float at ptr2, using only shuffles and vertical adds.
 * Side effect: pot1 and pot2 are overwritten; afterwards lane 0 of each
 * holds its own sum.
 * Lane listings in the comments run from lane 0 to lane 3; "1a" is lane 0
 * of pot1, "2d" is lane 3 of pot2.
 * NOTE(review): an earlier definition of this macro based on _mm_hadd_ps
 * exists in the file; presumably a preprocessor conditional that is not
 * visible in this excerpt selects between them - confirm.
 */
#define GMX_MM_UPDATE_2POT_PS(pot1,ptr1,pot2,ptr2)                               \
{                                                                                \
    /* The first temporary was declared under a misspelt name, so the */         \
    /* macro could not be expanded; both names now match their uses.  */         \
    __m128 _updt1,_updt2;                                                        \
    _updt1 = _mm_movehl_ps(pot2,pot1);            /* 1c 1d 2c 2d */              \
    _updt2 = _mm_movelh_ps(pot1,pot2);            /* 1a 1b 2a 2b */              \
    _updt1 = _mm_add_ps(_updt1,_updt2);           /* two partial sums each */    \
    _updt2 = _mm_shuffle_ps(_updt1,_updt1,_MM_SHUFFLE(3,3,1,1));                 \
    pot1   = _mm_add_ps(_updt1,_updt2);           /* sum1 - sum2 - */            \
    pot2   = _mm_movehl_ps(_updt2,pot1);          /* sum2 - - - */               \
    _mm_store_ss(ptr1,_mm_add_ss(pot1,_mm_load_ss(ptr1)));                       \
    _mm_store_ss(ptr2,_mm_add_ss(pot2,_mm_load_ss(ptr2)));                       \
}
/* Add the lane sum of each of pot1..pot4 to the float at ptr1..ptr4.
 * Side effect: all four pot arguments are overwritten. Afterwards pot1
 * holds (sum1 sum2 sum3 sum4) in lanes 0..3 and pot2/pot3/pot4 hold their
 * own sum broadcast to every lane.
 */
#define GMX_MM_UPDATE_4POT_PS(pot1,ptr1,pot2,ptr2,pot3,ptr3,pot4,ptr4)           \
{                                                                                \
    __m128 _pot_sum12,_pot_sum34;                                                \
    /* After the transpose, row k holds lane k of every input */                 \
    _MM_TRANSPOSE4_PS(pot1,pot2,pot3,pot4);                                      \
    _pot_sum12 = _mm_add_ps(pot1,pot2);                                          \
    _pot_sum34 = _mm_add_ps(pot3,pot4);                                          \
    pot1 = _mm_add_ps(_pot_sum12,_pot_sum34);     /* sum1 sum2 sum3 sum4 */      \
    /* Broadcast each remaining sum so that it is available in lane 0 */         \
    pot2 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(1,1,1,1));                       \
    pot3 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(2,2,2,2));                       \
    pot4 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(3,3,3,3));                       \
    _mm_store_ss(ptr1,_mm_add_ss(pot1,_mm_load_ss(ptr1)));                       \
    _mm_store_ss(ptr2,_mm_add_ss(pot2,_mm_load_ss(ptr2)));                       \
    _mm_store_ss(ptr3,_mm_add_ss(pot3,_mm_load_ss(ptr3)));                       \
    _mm_store_ss(ptr4,_mm_add_ss(pot4,_mm_load_ss(ptr4)));                       \
}
2499 #endif /* _gmx_sse2_single_h_ */