2 * This source code is part of
6 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
7 * Copyright (c) 2001-2009, The GROMACS Development Team
9 * Gromacs is a library for molecular simulation and trajectory analysis,
10 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
11 * a full list of developers and information, check out http://www.gromacs.org
13 * This program is free software; you can redistribute it and/or modify it under
14 * the terms of the GNU Lesser General Public License as published by the Free
15 * Software Foundation; either version 2 of the License, or (at your option) any
17 * As a special exception, you may use this file as part of a free software
18 * library without restriction. Specifically, if other files instantiate
19 * templates or use macros or inline functions from this file, or you compile
20 * this file and link it with other files to produce an executable, this
21 * file does not by itself cause the resulting executable to be covered by
22 * the GNU Lesser General Public License.
24 * In plain-speak: do not worry about classes/macros/templates either - only
25 * changes to the library have to be LGPL, not an application linking with it.
27 * To help fund GROMACS development, we humbly ask that you cite
28 * the papers people have written on it - you can find them on the website!
34 /* We require SSE2 now! */
39 #include <xmmintrin.h> /* SSE */
40 #include <emmintrin.h> /* SSE2 */
43 # include <pmmintrin.h> /* SSE3 */
46 # include <smmintrin.h> /* SSE4.1 */
51 /***************************************************
53 * COMPILER RANT WARNING: *
55 * Ideally, this header would be filled with *
56 * simple static inline functions. Unfortunately, *
57 * many vendors provide really braindead compilers *
58 * that either cannot handle more than 1-2 SSE *
59 * function parameters, and some cannot handle *
60 * pointers to SSE __m128 datatypes as parameters *
61 * at all. Thus, for portability we have had to *
62 * implement all but the simplest routines as *
65 ***************************************************/
68 /***************************************************
70 * Wrappers/replacements for some instructions *
71 * not available in all SSE versions. *
73 ***************************************************/
/* gmx_mm_extract_epi32(x, imm): read 32-bit integer element number imm from x.
 * Two alternative definitions follow. NOTE(review): the preprocessor
 * conditional that selects between them is not visible in this part of the
 * file - confirm before relying on either one.
 * Variant 1 forwards directly to the _mm_extract_epi32 intrinsic. */
76 # define gmx_mm_extract_epi32(x, imm) _mm_extract_epi32(x,imm)
/* Variant 2 shifts x right by 4*imm bytes and returns the low 32 bits. */
78 # define gmx_mm_extract_epi32(x, imm) _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))
/*
 * Some compilers require a cast to change the interpretation
 * of a register from FP to Int and vice versa, and not all of
 * them provide instructions to do this. Roll our own wrappers...
 */
/* Wrappers that change only the type of a 128-bit value (float vector <->
 * integer vector); the bits themselves are left as they are.
 * MSVC and the Intel compiler: use the cast intrinsics; the float-to-float
 * "cast" is the identity. */
87 #if (defined (_MSC_VER) || defined(__INTEL_COMPILER))
88 # define gmx_mm_castsi128_ps(a) _mm_castsi128_ps(a)
89 # define gmx_mm_castps_si128(a) _mm_castps_si128(a)
90 # define gmx_mm_castps_ps128(a) (a)
/* GCC-compatible compilers: a plain C cast between the vector types. */
91 #elif defined(__GNUC__)
92 # define gmx_mm_castsi128_ps(a) ((__m128)(a))
93 # define gmx_mm_castps_si128(a) ((__m128i)(a))
94 # define gmx_mm_castps_ps128(a) ((__m128)(a))
/* Fallback for compilers without a native cast: reinterpret the 128 bits of
 * an integer vector as four packed floats (no value conversion).
 * The bits go through a union instead of a pointer cast, so the result does
 * not depend on the compiler tolerating an aliasing violation. */
static __m128
gmx_mm_castsi128_ps(__m128i a)
{
    union { __m128i i; __m128 f; } bits;

    bits.i = a;
    return bits.f;
}
/* Fallback for compilers without a native cast: reinterpret the 128 bits of
 * four packed floats as an integer vector (no value conversion).
 * The bits go through a union instead of a pointer cast, so the result does
 * not depend on the compiler tolerating an aliasing violation. */
static __m128i
gmx_mm_castps_si128(__m128 a)
{
    union { __m128 f; __m128i i; } bits;

    bits.f = a;
    return bits.i;
}
/* Fallback float-to-float "cast": source and destination types are the same,
 * so the argument is simply returned. */
static __m128
gmx_mm_castps_ps128(__m128 a)
{
    return a;
}
103 /* IO functions, just for debugging */
/* Debug helper: print the label s followed by the four floats held in xmm,
 * lowest element first, on one line of stdout. */
static void
printxmm(const char *s, __m128 xmm)
{
    float f[4];

    /* Unaligned store: the local buffer has no 16-byte alignment guarantee. */
    _mm_storeu_ps(f, xmm);
    printf("%s: %8.5g %8.5g %8.5g %8.5g\n", s, f[0], f[1], f[2], f[3]);
}
/* Debug helper: print the label s followed by the sum of the four floats
 * held in xmm on one line of stdout. */
static void
printxmmsum(const char *s, __m128 xmm)
{
    float f[4];

    /* Unaligned store: the local buffer has no 16-byte alignment guarantee. */
    _mm_storeu_ps(f, xmm);
    printf("%s (sum): %15.10g\n", s, f[0]+f[1]+f[2]+f[3]);
}
/* Debug helper: print the label s followed by the four 32-bit integers held
 * in xmmi, lowest element first, on one line of stdout. */
static void
printxmmi(const char *s, __m128i xmmi)
{
    int i[4];

    /* Unaligned store: the local buffer has no 16-byte alignment guarantee. */
    _mm_storeu_si128((__m128i *)i, xmmi);
    printf("%10s: %2d %2d %2d %2d\n", s, i[0], i[1], i[2], i[3]);
}
135 /************************
137 * Simple math routines *
139 ************************/
/* Returns 1/sqrt(x) for each of the four floats in x.
 * The hardware estimate from _mm_rsqrt_ps is refined with one Newton-Raphson
 * step: 0.5 * lu * (3 - lu*lu*x). The argument is not range-checked. */
static inline __m128
gmx_mm_invsqrt_ps(__m128 x)
{
    const __m128 half  = _mm_set1_ps(0.5f);
    const __m128 three = _mm_set1_ps(3.0f);
    __m128       lu    = _mm_rsqrt_ps(x);

    return _mm_mul_ps(half,_mm_mul_ps(_mm_sub_ps(three,_mm_mul_ps(_mm_mul_ps(lu,lu),x)),lu));
}
/* Returns 1/x for each of the four floats in x.
 * The hardware estimate from _mm_rcp_ps is refined with one Newton-Raphson
 * step: lu * (2 - lu*x). The argument is not range-checked. */
static inline __m128
gmx_mm_inv_ps(__m128 x)
{
    const __m128 two = _mm_set1_ps(2.0f);
    __m128       lu  = _mm_rcp_ps(x);

    return _mm_mul_ps(lu,_mm_sub_ps(two,_mm_mul_ps(lu,x)));
}
/* Returns dx*dx + dy*dy + dz*dz element by element, i.e. the squared length
 * of four (dx,dy,dz) vectors stored one component per register. */
static inline __m128
gmx_mm_calc_rsq_ps(__m128 dx, __m128 dy, __m128 dz)
{
    __m128 xx_plus_yy = _mm_add_ps(_mm_mul_ps(dx,dx), _mm_mul_ps(dy,dy));

    return _mm_add_ps(xx_plus_yy, _mm_mul_ps(dz,dz));
}
169 /* Normal sum of four xmm registers */
/* Returns the element-wise sum (t0 + t1) + (t2 + t3) of four registers. */
static inline __m128
gmx_mm_sum4_ps(__m128 t0, __m128 t1, __m128 t2, __m128 t3)
{
    t0 = _mm_add_ps(t0,t1);
    t2 = _mm_add_ps(t2,t3);
    return _mm_add_ps(t0,t2);
}
180 gmx_mm_log_ps(__m128 x
)
182 const __m128 exp_ps
= gmx_mm_castsi128_ps( _mm_set_epi32(0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000) );
183 const __m128 one_ps
= gmx_mm_castsi128_ps( _mm_set_epi32(0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000) );
184 const __m128 off_ps
= gmx_mm_castsi128_ps( _mm_set_epi32(0x3FBF8000, 0x3FBF8000, 0x3FBF8000, 0x3FBF8000) );
185 const __m128 mant_ps
= gmx_mm_castsi128_ps( _mm_set_epi32(0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF) );
186 const __m128 base_ps
= gmx_mm_castsi128_ps( _mm_set_epi32(0x43800000, 0x43800000, 0x43800000, 0x43800000) );
187 const __m128 loge_ps
= gmx_mm_castsi128_ps( _mm_set_epi32(0x3F317218, 0x3F317218, 0x3F317218, 0x3F317218) );
189 const __m128 D5
= gmx_mm_castsi128_ps( _mm_set_epi32(0xBD0D0CC5, 0xBD0D0CC5, 0xBD0D0CC5, 0xBD0D0CC5) );
190 const __m128 D4
= gmx_mm_castsi128_ps( _mm_set_epi32(0x3EA2ECDD, 0x3EA2ECDD, 0x3EA2ECDD, 0x3EA2ECDD) );
191 const __m128 D3
= gmx_mm_castsi128_ps( _mm_set_epi32(0xBF9dA2C9, 0xBF9dA2C9, 0xBF9dA2C9, 0xBF9dA2C9) );
192 const __m128 D2
= gmx_mm_castsi128_ps( _mm_set_epi32(0x4026537B, 0x4026537B, 0x4026537B, 0x4026537B) );
193 const __m128 D1
= gmx_mm_castsi128_ps( _mm_set_epi32(0xC054bFAD, 0xC054bFAD, 0xC054bFAD, 0xC054bFAD) );
194 const __m128 D0
= gmx_mm_castsi128_ps( _mm_set_epi32(0x4047691A, 0x4047691A, 0x4047691A, 0x4047691A) );
196 __m128 xmm0
,xmm1
,xmm2
;
200 xmm1
= _mm_and_ps(xmm1
, exp_ps
);
201 xmm1
= gmx_mm_castsi128_ps( _mm_srli_epi32( gmx_mm_castps_si128(xmm1
),8) );
203 xmm1
= _mm_or_ps(xmm1
, one_ps
);
204 xmm1
= _mm_sub_ps(xmm1
, off_ps
);
206 xmm1
= _mm_mul_ps(xmm1
, base_ps
);
207 xmm0
= _mm_and_ps(xmm0
, mant_ps
);
208 xmm0
= _mm_or_ps(xmm0
, one_ps
);
210 xmm2
= _mm_mul_ps(xmm0
, D5
);
211 xmm2
= _mm_add_ps(xmm2
, D4
);
212 xmm2
= _mm_mul_ps(xmm2
,xmm0
);
213 xmm2
= _mm_add_ps(xmm2
, D3
);
214 xmm2
= _mm_mul_ps(xmm2
,xmm0
);
215 xmm2
= _mm_add_ps(xmm2
, D2
);
216 xmm2
= _mm_mul_ps(xmm2
,xmm0
);
217 xmm2
= _mm_add_ps(xmm2
, D1
);
218 xmm2
= _mm_mul_ps(xmm2
,xmm0
);
219 xmm2
= _mm_add_ps(xmm2
, D0
);
220 xmm0
= _mm_sub_ps(xmm0
, one_ps
);
221 xmm0
= _mm_mul_ps(xmm0
,xmm2
);
222 xmm1
= _mm_add_ps(xmm1
,xmm0
);
225 x
= _mm_mul_ps(x
, loge_ps
);
231 /* This exp-routine has a relative precision of:
232 * 2^-22.33 bits (essentially single precision :-)
233 * WARNING: no check against over or underflows (x beyond +-87)
236 gmx_mm_exp_ps(__m128 x
)
238 const __m128i half
= _mm_set_epi32(0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000); // 0.5e+0f
239 const __m128i base
= _mm_set_epi32(0x0000007F, 0x0000007F, 0x0000007F, 0x0000007F); // 127
240 const __m128i CC
= _mm_set_epi32(0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B); // log2(e)
242 const __m128i D5
= _mm_set_epi32(0x3AF61905, 0x3AF61905, 0x3AF61905, 0x3AF61905); // 1.8775767e-3f
243 const __m128i D4
= _mm_set_epi32(0x3C134806, 0x3C134806, 0x3C134806, 0x3C134806); // 8.9893397e-3f
244 const __m128i D3
= _mm_set_epi32(0x3D64AA23, 0x3D64AA23, 0x3D64AA23, 0x3D64AA23); // 5.5826318e-2f
245 const __m128i D2
= _mm_set_epi32(0x3E75EAD4, 0x3E75EAD4, 0x3E75EAD4, 0x3E75EAD4); // 2.4015361e-1f
246 const __m128i D1
= _mm_set_epi32(0x3F31727B, 0x3F31727B, 0x3F31727B, 0x3F31727B); // 6.9315308e-1f
247 const __m128i D0
= _mm_set_epi32(0x3F7FFFFF, 0x3F7FFFFF, 0x3F7FFFFF, 0x3F7FFFFF); // 9.9999994e-1f
252 xmm0
= _mm_mul_ps(x
,gmx_mm_castsi128_ps(CC
));
253 xmm1
= _mm_sub_ps(xmm0
,gmx_mm_castsi128_ps(half
));
254 xmm2
= _mm_cvtps_epi32(xmm1
);
255 xmm1
= _mm_cvtepi32_ps(xmm2
);
257 xmm2
= _mm_add_epi32(xmm2
,gmx_mm_castps_si128(base
));
258 xmm2
= _mm_slli_epi32(xmm2
,23);
260 xmm0
= _mm_sub_ps(xmm0
,xmm1
);
261 xmm1
= _mm_mul_ps(xmm0
,gmx_mm_castsi128_ps(D5
));
262 xmm1
= _mm_add_ps(xmm1
,gmx_mm_castsi128_ps(D4
));
263 xmm1
= _mm_mul_ps(xmm1
,xmm0
);
264 xmm1
= _mm_add_ps(xmm1
,gmx_mm_castsi128_ps(D3
));
265 xmm1
= _mm_mul_ps(xmm1
,xmm0
);
266 xmm1
= _mm_add_ps(xmm1
,gmx_mm_castsi128_ps(D2
));
267 xmm1
= _mm_mul_ps(xmm1
,xmm0
);
268 xmm1
= _mm_add_ps(xmm1
,gmx_mm_castsi128_ps(D1
));
269 xmm1
= _mm_mul_ps(xmm1
,xmm0
);
270 xmm1
= _mm_add_ps(xmm1
,gmx_mm_castsi128_ps(D0
));
271 xmm1
= _mm_mul_ps(xmm1
,gmx_mm_castsi128_ps(xmm2
));
273 /* 18 instructions currently */
278 /* Same as gmx_mm_exp_ps, but has a lower bound check, such that it can
279 * be safely called with x < -87.33.
280 * WARNING: no check against overflows (x > 87)
283 gmx_mm_exp_ps_lbc(__m128 x
)
285 const __m128i lim
= _mm_set_epi32(0xC2AE0000, 0xC2AE0000, 0xC2AE0000, 0xC2AE0000); // -87
286 const __m128i half
= _mm_set_epi32(0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000); // 0.5e+0f
287 const __m128i base
= _mm_set_epi32(0x0000007F, 0x0000007F, 0x0000007F, 0x0000007F); // 127
288 const __m128i CC
= _mm_set_epi32(0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B); // log2(e)
290 const __m128i D5
= _mm_set_epi32(0x3AF61905, 0x3AF61905, 0x3AF61905, 0x3AF61905); // 1.8775767e-3f
291 const __m128i D4
= _mm_set_epi32(0x3C134806, 0x3C134806, 0x3C134806, 0x3C134806); // 8.9893397e-3f
292 const __m128i D3
= _mm_set_epi32(0x3D64AA23, 0x3D64AA23, 0x3D64AA23, 0x3D64AA23); // 5.5826318e-2f
293 const __m128i D2
= _mm_set_epi32(0x3E75EAD4, 0x3E75EAD4, 0x3E75EAD4, 0x3E75EAD4); // 2.4015361e-1f
294 const __m128i D1
= _mm_set_epi32(0x3F31727B, 0x3F31727B, 0x3F31727B, 0x3F31727B); // 6.9315308e-1f
295 const __m128i D0
= _mm_set_epi32(0x3F7FFFFF, 0x3F7FFFFF, 0x3F7FFFFF, 0x3F7FFFFF); // 9.9999994e-1f
300 xmm1
= _mm_max_ps(x
,gmx_mm_castsi128_ps(lim
)); /* x<-87 gives exp(-87) */
301 xmm0
= _mm_mul_ps(xmm1
,gmx_mm_castsi128_ps(CC
));
302 xmm1
= _mm_sub_ps(xmm0
,gmx_mm_castsi128_ps(half
));
303 xmm2
= _mm_cvtps_epi32(xmm1
);
304 xmm1
= _mm_cvtepi32_ps(xmm2
);
306 xmm2
= _mm_add_epi32(xmm2
,gmx_mm_castps_si128(base
));
307 xmm2
= _mm_slli_epi32(xmm2
,23);
309 xmm0
= _mm_sub_ps(xmm0
,xmm1
);
310 xmm1
= _mm_mul_ps(xmm0
,gmx_mm_castsi128_ps(D5
));
311 xmm1
= _mm_add_ps(xmm1
,gmx_mm_castsi128_ps(D4
));
312 xmm1
= _mm_mul_ps(xmm1
,xmm0
);
313 xmm1
= _mm_add_ps(xmm1
,gmx_mm_castsi128_ps(D3
));
314 xmm1
= _mm_mul_ps(xmm1
,xmm0
);
315 xmm1
= _mm_add_ps(xmm1
,gmx_mm_castsi128_ps(D2
));
316 xmm1
= _mm_mul_ps(xmm1
,xmm0
);
317 xmm1
= _mm_add_ps(xmm1
,gmx_mm_castsi128_ps(D1
));
318 xmm1
= _mm_mul_ps(xmm1
,xmm0
);
319 xmm1
= _mm_add_ps(xmm1
,gmx_mm_castsi128_ps(D0
));
320 xmm1
= _mm_mul_ps(xmm1
,gmx_mm_castsi128_ps(xmm2
));
322 /* 19 instructions currently + pipeline latenct after max_ps */
/* GMX_MM_SINCOS_PS(x,sinval,cosval): evaluate sine and cosine of the four
 * packed floats x and assign the results to sinval and cosval.
 *
 * Steps visible in the body:
 *  1. x is scaled by 2/pi, +-0.5 (with the sign of the scaled value) is added
 *     and the result truncated, giving a signed integer q and its float copy.
 *  2. q*kc1 and q*kc2 are subtracted from x in two steps, leaving the reduced
 *     argument xl.
 *  3. Two polynomials in xl*xl are evaluated: cx (cc0..cc2, plus one) and
 *     sx (sc0..sc2, times xl^3, plus xl).
 *  4. offsetSin = q & 3 and offsetCos = offsetSin + 1 choose, per element,
 *     between sx and cx (bit 0) and whether the sign is flipped (bit 1).
 *
 * Notes for callers:
 *  - x is expanded twice, so pass an expression without side effects.
 *  - The body needs M_PI and gmx_mm_castsi128_ps to be defined.
 *  - _sincosf_isTiny is computed but not read by any statement shown here.
 *  - NOTE(review): the lines that open and close the macro's block are not
 *    visible in this part of the file.
 */
327 #define GMX_MM_SINCOS_PS(x,sinval,cosval) \
329 const __m128 _sincosf_two_over_pi = {2.0/M_PI,2.0/M_PI,2.0/M_PI,2.0/M_PI}; \
330 const __m128 _sincosf_half = {0.5,0.5,0.5,0.5}; \
331 const __m128 _sincosf_one = {1.0,1.0,1.0,1.0}; \
333 const __m128i _sincosf_izero = _mm_set1_epi32(0); \
334 const __m128i _sincosf_ione = _mm_set1_epi32(1); \
335 const __m128i _sincosf_itwo = _mm_set1_epi32(2); \
336 const __m128i _sincosf_ithree = _mm_set1_epi32(3); \
338 const __m128 _sincosf_kc1 = {1.57079625129,1.57079625129,1.57079625129,1.57079625129}; \
339 const __m128 _sincosf_kc2 = {7.54978995489e-8,7.54978995489e-8,7.54978995489e-8,7.54978995489e-8}; \
340 const __m128 _sincosf_cc0 = {-0.0013602249,-0.0013602249,-0.0013602249,-0.0013602249}; \
341 const __m128 _sincosf_cc1 = {0.0416566950,0.0416566950,0.0416566950,0.0416566950}; \
342 const __m128 _sincosf_cc2 = {-0.4999990225,-0.4999990225,-0.4999990225,-0.4999990225}; \
343 const __m128 _sincosf_sc0 = {-0.0001950727,-0.0001950727,-0.0001950727,-0.0001950727}; \
344 const __m128 _sincosf_sc1 = {0.0083320758,0.0083320758,0.0083320758,0.0083320758}; \
345 const __m128 _sincosf_sc2 = {-0.1666665247,-0.1666665247,-0.1666665247,-0.1666665247}; \
347 __m128 _sincosf_signbit = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) ); \
348 __m128 _sincosf_tiny = gmx_mm_castsi128_ps( _mm_set1_epi32(0x3e400000) ); \
350 __m128 _sincosf_xl; \
351 __m128 _sincosf_xl2; \
352 __m128 _sincosf_xl3; \
353 __m128 _sincosf_qf; \
354 __m128 _sincosf_absxl; \
355 __m128 _sincosf_p1; \
356 __m128 _sincosf_cx; \
357 __m128 _sincosf_sx; \
358 __m128 _sincosf_ts; \
359 __m128 _sincosf_tc; \
360 __m128 _sincosf_tsn; \
361 __m128 _sincosf_tcn; \
362 __m128i _sincosf_q; \
363 __m128i _sincosf_offsetSin; \
364 __m128i _sincosf_offsetCos; \
365 __m128 _sincosf_sinMask; \
366 __m128 _sincosf_cosMask; \
367 __m128 _sincosf_isTiny; \
368 __m128 _sincosf_ct0; \
369 __m128 _sincosf_ct1; \
370 __m128 _sincosf_ct2; \
371 __m128 _sincosf_st1; \
372 __m128 _sincosf_st2; \
374 _sincosf_xl = _mm_mul_ps(x,_sincosf_two_over_pi); \
376 _sincosf_xl = _mm_add_ps(_sincosf_xl,_mm_or_ps(_mm_and_ps(_sincosf_xl,_sincosf_signbit),_sincosf_half)); \
378 _sincosf_q = _mm_cvttps_epi32(_sincosf_xl); \
379 _sincosf_qf = _mm_cvtepi32_ps(_sincosf_q); \
381 _sincosf_offsetSin = _mm_and_si128(_sincosf_q,_sincosf_ithree); \
382 _sincosf_offsetCos = _mm_add_epi32(_sincosf_offsetSin,_sincosf_ione); \
384 _sincosf_p1 = _mm_mul_ps(_sincosf_qf,_sincosf_kc1); \
385 _sincosf_xl = _mm_mul_ps(_sincosf_qf,_sincosf_kc2); \
386 _sincosf_p1 = _mm_sub_ps(x,_sincosf_p1); \
387 _sincosf_xl = _mm_sub_ps(_sincosf_p1,_sincosf_xl); \
389 _sincosf_absxl = _mm_andnot_ps(_sincosf_signbit,_sincosf_xl); \
390 _sincosf_isTiny = _mm_cmpgt_ps(_sincosf_tiny,_sincosf_absxl); \
392 _sincosf_xl2 = _mm_mul_ps(_sincosf_xl,_sincosf_xl); \
393 _sincosf_xl3 = _mm_mul_ps(_sincosf_xl2,_sincosf_xl); \
395 _sincosf_ct1 = _mm_mul_ps(_sincosf_cc0,_sincosf_xl2); \
396 _sincosf_ct1 = _mm_add_ps(_sincosf_ct1,_sincosf_cc1); \
397 _sincosf_st1 = _mm_mul_ps(_sincosf_sc0,_sincosf_xl2); \
398 _sincosf_st1 = _mm_add_ps(_sincosf_st1,_sincosf_sc1); \
399 _sincosf_ct2 = _mm_mul_ps(_sincosf_ct1,_sincosf_xl2); \
400 _sincosf_ct2 = _mm_add_ps(_sincosf_ct2,_sincosf_cc2); \
401 _sincosf_st2 = _mm_mul_ps(_sincosf_st1,_sincosf_xl2); \
402 _sincosf_st2 = _mm_add_ps(_sincosf_st2,_sincosf_sc2); \
404 _sincosf_cx = _mm_mul_ps(_sincosf_ct2,_sincosf_xl2); \
405 _sincosf_cx = _mm_add_ps(_sincosf_cx,_sincosf_one); \
407 _sincosf_sx = _mm_mul_ps(_sincosf_st2,_sincosf_xl3); \
408 _sincosf_sx = _mm_add_ps(_sincosf_sx,_sincosf_xl); \
410 _sincosf_sinMask = gmx_mm_castsi128_ps( _mm_cmpeq_epi32( _mm_and_si128(_sincosf_offsetSin,_sincosf_ione), _sincosf_izero) ); \
411 _sincosf_cosMask = gmx_mm_castsi128_ps( _mm_cmpeq_epi32( _mm_and_si128(_sincosf_offsetCos,_sincosf_ione), _sincosf_izero) ); \
413 _sincosf_ts = _mm_or_ps( _mm_and_ps(_sincosf_sinMask,_sincosf_sx) , _mm_andnot_ps(_sincosf_sinMask,_sincosf_cx) ); \
414 _sincosf_tc = _mm_or_ps( _mm_and_ps(_sincosf_cosMask,_sincosf_sx) , _mm_andnot_ps(_sincosf_cosMask,_sincosf_cx) ); \
416 _sincosf_sinMask = gmx_mm_castsi128_ps( _mm_cmpeq_epi32( _mm_and_si128(_sincosf_offsetSin,_sincosf_itwo), _sincosf_izero) );\
417 _sincosf_tsn = _mm_xor_ps(_sincosf_signbit,_sincosf_ts); \
418 _sincosf_ts = _mm_or_ps( _mm_and_ps(_sincosf_sinMask,_sincosf_ts) , _mm_andnot_ps(_sincosf_sinMask,_sincosf_tsn) ); \
420 _sincosf_cosMask = gmx_mm_castsi128_ps( _mm_cmpeq_epi32( _mm_and_si128(_sincosf_offsetCos,_sincosf_itwo), _sincosf_izero) ); \
421 _sincosf_tcn = _mm_xor_ps(_sincosf_signbit,_sincosf_tc); \
422 _sincosf_tc = _mm_or_ps( _mm_and_ps(_sincosf_cosMask,_sincosf_tc) , _mm_andnot_ps(_sincosf_cosMask,_sincosf_tcn) ); \
424 sinval = _sincosf_ts; \
425 cosval = _sincosf_tc; \
430 /* Load a single value from 1-4 places, merge into xmm register */
/* Gather one float from each of four addresses:
 * xmm1 = { *ptr1, *ptr2, *ptr3, *ptr4 } (lowest element first).
 * Expands to a braced block; xmm1 must be an assignable __m128. */
#define GMX_MM_LOAD_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,xmm1) \
{ \
    __m128 _txmm2,_txmm3,_txmm4; \
    xmm1   = _mm_load_ss(ptr1); \
    _txmm2 = _mm_load_ss(ptr2); \
    _txmm3 = _mm_load_ss(ptr3); \
    _txmm4 = _mm_load_ss(ptr4); \
    xmm1   = _mm_unpacklo_ps(xmm1,_txmm3); \
    _txmm2 = _mm_unpacklo_ps(_txmm2,_txmm4); \
    xmm1   = _mm_unpacklo_ps(xmm1,_txmm2); \
}
/* Gather one float from each of three addresses:
 * xmm1 = { *ptr1, *ptr2, *ptr3, 0 } (lowest element first).
 * Expands to a braced block; xmm1 must be an assignable __m128. */
#define GMX_MM_LOAD_3VALUES_PS(ptr1,ptr2,ptr3,xmm1) \
{ \
    __m128 _txmm2,_txmm3; \
    xmm1   = _mm_load_ss(ptr1); \
    _txmm2 = _mm_load_ss(ptr2); \
    _txmm3 = _mm_load_ss(ptr3); \
    xmm1   = _mm_unpacklo_ps(xmm1,_txmm3); \
    xmm1   = _mm_unpacklo_ps(xmm1,_txmm2); \
}
/* Gather one float from each of two addresses:
 * xmm1 = { *ptr1, *ptr2, 0, 0 } (lowest element first).
 * Expands to a braced block; xmm1 must be an assignable __m128. */
#define GMX_MM_LOAD_2VALUES_PS(ptr1,ptr2,xmm1) \
{ \
    __m128 _txmm2; \
    xmm1   = _mm_load_ss(ptr1); \
    _txmm2 = _mm_load_ss(ptr2); \
    xmm1   = _mm_unpacklo_ps(xmm1,_txmm2); \
}
/* Load a single float: xmm1 = { *ptr1, 0, 0, 0 }.
 * Expands to a braced block; xmm1 must be an assignable __m128. */
#define GMX_MM_LOAD_1VALUE_PS(ptr1,xmm1) \
{ \
    xmm1 = _mm_load_ss(ptr1); \
}
470 /* Store data in an xmm register into 1-4 different places */
/* Scatter the four floats of xmm1 to four addresses:
 * *ptr1 = element 0, *ptr2 = element 1, *ptr3 = element 2, *ptr4 = element 3.
 * xmm1 itself is not modified. Expands to a braced block. */
#define GMX_MM_STORE_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,xmm1) \
{ \
    __m128 _txmm2,_txmm3,_txmm4; \
    _txmm3 = _mm_movehl_ps(_mm_setzero_ps(),xmm1); \
    _txmm2 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1)); \
    _txmm4 = _mm_shuffle_ps(_txmm3,_txmm3,_MM_SHUFFLE(1,1,1,1)); \
    _mm_store_ss(ptr1,xmm1); \
    _mm_store_ss(ptr2,_txmm2); \
    _mm_store_ss(ptr3,_txmm3); \
    _mm_store_ss(ptr4,_txmm4); \
}
/* Scatter the three lowest floats of xmm1 to three addresses:
 * *ptr1 = element 0, *ptr2 = element 1, *ptr3 = element 2.
 * xmm1 itself is not modified. Expands to a braced block. */
#define GMX_MM_STORE_3VALUES_PS(ptr1,ptr2,ptr3,xmm1) \
{ \
    __m128 _txmm2,_txmm3; \
    _txmm3 = _mm_movehl_ps(_mm_setzero_ps(),xmm1); \
    _txmm2 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1)); \
    _mm_store_ss(ptr1,xmm1); \
    _mm_store_ss(ptr2,_txmm2); \
    _mm_store_ss(ptr3,_txmm3); \
}
/* Scatter the two lowest floats of xmm1 to two addresses:
 * *ptr1 = element 0, *ptr2 = element 1.
 * xmm1 itself is not modified. Expands to a braced block. */
#define GMX_MM_STORE_2VALUES_PS(ptr1,ptr2,xmm1) \
{ \
    __m128 _txmm2; \
    _txmm2 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1)); \
    _mm_store_ss(ptr1,xmm1); \
    _mm_store_ss(ptr2,_txmm2); \
}
/* Store the lowest float of xmm1 to *ptr1. Expands to a braced block. */
#define GMX_MM_STORE_1VALUE_PS(ptr1,xmm1) \
{ \
    _mm_store_ss(ptr1,xmm1); \
}
510 /* Similar to store, but increments value in memory */
/* Add the four floats of xmm1 to *ptr1..*ptr4 and the four floats of xmm2 to
 * *ptr5..*ptr8 (element k goes to the k-th pointer of its group).
 * Both groups are loaded before either is stored.
 * Built on GMX_MM_LOAD_4VALUES_PS / GMX_MM_STORE_4VALUES_PS. */
#define GMX_MM_INCREMENT_8VALUES_PS(ptr1,ptr2,ptr3,ptr4,ptr5,ptr6,ptr7,ptr8,xmm1,xmm2) \
{ \
    __m128 _tincr1,_tincr2; \
    GMX_MM_LOAD_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,_tincr1); \
    GMX_MM_LOAD_4VALUES_PS(ptr5,ptr6,ptr7,ptr8,_tincr2); \
    _tincr1 = _mm_add_ps(_tincr1,xmm1); \
    _tincr2 = _mm_add_ps(_tincr2,xmm2); \
    GMX_MM_STORE_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,_tincr1); \
    GMX_MM_STORE_4VALUES_PS(ptr5,ptr6,ptr7,ptr8,_tincr2); \
}
/* Add element k of xmm1 to the float at the k-th pointer (ptr1..ptr4).
 * Built on GMX_MM_LOAD_4VALUES_PS / GMX_MM_STORE_4VALUES_PS. */
#define GMX_MM_INCREMENT_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,xmm1) \
{ \
    __m128 _tincr; \
    GMX_MM_LOAD_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,_tincr); \
    _tincr = _mm_add_ps(_tincr,xmm1); \
    GMX_MM_STORE_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,_tincr); \
}
/* Add element k of xmm1 to the float at the k-th pointer (ptr1..ptr3).
 * Built on GMX_MM_LOAD_3VALUES_PS / GMX_MM_STORE_3VALUES_PS. */
#define GMX_MM_INCREMENT_3VALUES_PS(ptr1,ptr2,ptr3,xmm1) \
{ \
    __m128 _tincr; \
    GMX_MM_LOAD_3VALUES_PS(ptr1,ptr2,ptr3,_tincr); \
    _tincr = _mm_add_ps(_tincr,xmm1); \
    GMX_MM_STORE_3VALUES_PS(ptr1,ptr2,ptr3,_tincr); \
}
/* Add element k of xmm1 to the float at the k-th pointer (ptr1, ptr2).
 * Built on GMX_MM_LOAD_2VALUES_PS / GMX_MM_STORE_2VALUES_PS. */
#define GMX_MM_INCREMENT_2VALUES_PS(ptr1,ptr2,xmm1) \
{ \
    __m128 _tincr; \
    GMX_MM_LOAD_2VALUES_PS(ptr1,ptr2,_tincr); \
    _tincr = _mm_add_ps(_tincr,xmm1); \
    GMX_MM_STORE_2VALUES_PS(ptr1,ptr2,_tincr); \
}
/* Add the lowest float of xmm1 to *ptr1 (scalar add, upper elements unused).
 * Built on GMX_MM_LOAD_1VALUE_PS / GMX_MM_STORE_1VALUE_PS. */
#define GMX_MM_INCREMENT_1VALUE_PS(ptr1,xmm1) \
{ \
    __m128 _tincr; \
    GMX_MM_LOAD_1VALUE_PS(ptr1,_tincr); \
    _tincr = _mm_add_ss(_tincr,xmm1); \
    GMX_MM_STORE_1VALUE_PS(ptr1,_tincr); \
}
556 /* Routines to load pairs from 1-4 places, put in two separate xmm registers. Useful to load LJ parameters! */
/* Load two consecutive floats from each of four addresses and split them:
 * c6  = { ptr1[0], ptr2[0], ptr3[0], ptr4[0] }
 * c12 = { ptr1[1], ptr2[1], ptr3[1], ptr4[1] }
 * Expands to a braced block; c6 and c12 must be assignable __m128. */
#define GMX_MM_LOAD_4PAIRS_PS(ptr1,ptr2,ptr3,ptr4,c6,c12) \
{ \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4; \
    _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1)); \
    _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2)); \
    _tmp3 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3)); \
    _tmp4 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr4)); \
    _tmp1 = _mm_unpacklo_ps(_tmp1,_tmp3); \
    _tmp2 = _mm_unpacklo_ps(_tmp2,_tmp4); \
    c6    = _mm_unpacklo_ps(_tmp1,_tmp2); \
    c12   = _mm_unpackhi_ps(_tmp1,_tmp2); \
}
/* Load two consecutive floats from each of three addresses and split them:
 * c6  = { ptr1[0], ptr2[0], ptr3[0], 0 }
 * c12 = { ptr1[1], ptr2[1], ptr3[1], 0 }
 * Expands to a braced block; c6 and c12 must be assignable __m128. */
#define GMX_MM_LOAD_3PAIRS_PS(ptr1,ptr2,ptr3,c6,c12) \
{ \
    __m128 _tmp1,_tmp2,_tmp3; \
    _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1)); \
    _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2)); \
    _tmp3 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3)); \
    _tmp1 = _mm_unpacklo_ps(_tmp1,_tmp3); \
    _tmp2 = _mm_unpacklo_ps(_tmp2,_mm_setzero_ps()); \
    c6    = _mm_unpacklo_ps(_tmp1,_tmp2); \
    c12   = _mm_unpackhi_ps(_tmp1,_tmp2); \
}
/* Load two consecutive floats from each of two addresses and split them.
 * Only the two lowest elements of each output are meaningful:
 * c6  = { ptr1[0], ptr2[0], ... }
 * c12 = { ptr1[1], ptr2[1], ... }
 * c12 is built from c6 alone, so the previous content of c12 is never read
 * and c12 may be passed in uninitialised.
 * Expands to a braced block; c6 and c12 must be assignable __m128. */
#define GMX_MM_LOAD_2PAIRS_PS(ptr1,ptr2,c6,c12) \
{ \
    __m128 _tmp1,_tmp2; \
    _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1)); \
    _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2)); \
    c6    = _mm_unpacklo_ps(_tmp1,_tmp2); \
    c12   = _mm_movehl_ps(c6,c6); \
}
/* Load two consecutive floats from one address into the lowest elements of
 * two registers: c6 = { ptr1[0], 0, 0, 0 }, c12 = { ptr1[1], 0, 0, 0 }.
 * Expands to a braced block; c6 and c12 must be assignable __m128. */
#define GMX_MM_LOAD_1PAIR_PS(ptr1,c6,c12) \
{ \
    c6  = _mm_load_ss(ptr1); \
    c12 = _mm_load_ss((ptr1)+1); \
}
/* Routines to load 1-4 rvecs from 1-4 places.
 * We mainly use these to load coordinates. The extra routines
 * are very efficient for the water-water loops, since we e.g.
 * know that a TIP4p water has 4 atoms, so we should load 12 floats+shuffle.
 */
/* One rvec from one address: ptr1[0], ptr1[1], ptr1[2] go to the lowest
 * element of jx1, jy1, jz1; the other elements are zero. */
#define GMX_MM_LOAD_1RVEC_1POINTER_PS(ptr1,jx1,jy1,jz1) { \
    jx1 = _mm_load_ss(ptr1); \
    jy1 = _mm_load_ss((ptr1)+1); \
    jz1 = _mm_load_ss((ptr1)+2); \
}
/* Two consecutive rvecs from one address: ptr1[0..5] go, in order, to the
 * lowest element of jx1, jy1, jz1, jx2, jy2, jz2; other elements are zero. */
#define GMX_MM_LOAD_2RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2) { \
    jx1 = _mm_load_ss(ptr1); \
    jy1 = _mm_load_ss((ptr1)+1); \
    jz1 = _mm_load_ss((ptr1)+2); \
    jx2 = _mm_load_ss((ptr1)+3); \
    jy2 = _mm_load_ss((ptr1)+4); \
    jz2 = _mm_load_ss((ptr1)+5); \
}
/* Three consecutive rvecs from one address: ptr1[0..8] go, in order, to the
 * lowest element of jx1, jy1, jz1, ..., jz3; other elements are zero. */
#define GMX_MM_LOAD_3RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    jx1 = _mm_load_ss(ptr1); \
    jy1 = _mm_load_ss((ptr1)+1); \
    jz1 = _mm_load_ss((ptr1)+2); \
    jx2 = _mm_load_ss((ptr1)+3); \
    jy2 = _mm_load_ss((ptr1)+4); \
    jz2 = _mm_load_ss((ptr1)+5); \
    jx3 = _mm_load_ss((ptr1)+6); \
    jy3 = _mm_load_ss((ptr1)+7); \
    jz3 = _mm_load_ss((ptr1)+8); \
}
/* Four consecutive rvecs from one address: ptr1[0..11] go, in order, to the
 * lowest element of jx1, jy1, jz1, ..., jz4; other elements are zero. */
#define GMX_MM_LOAD_4RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    jx1 = _mm_load_ss(ptr1); \
    jy1 = _mm_load_ss((ptr1)+1); \
    jz1 = _mm_load_ss((ptr1)+2); \
    jx2 = _mm_load_ss((ptr1)+3); \
    jy2 = _mm_load_ss((ptr1)+4); \
    jz2 = _mm_load_ss((ptr1)+5); \
    jx3 = _mm_load_ss((ptr1)+6); \
    jy3 = _mm_load_ss((ptr1)+7); \
    jz3 = _mm_load_ss((ptr1)+8); \
    jx4 = _mm_load_ss((ptr1)+9); \
    jy4 = _mm_load_ss((ptr1)+10); \
    jz4 = _mm_load_ss((ptr1)+11); \
}
/* One rvec from each of two addresses, transposed: element 0 of jx1/jy1/jz1
 * holds x/y/z from ptr1 and element 1 holds x/y/z from ptr2. The two upper
 * elements of each output are not meaningful.
 * jz1 is derived from jy1 only, so the previous content of jz1 is never
 * read and it may be passed in uninitialised. */
#define GMX_MM_LOAD_1RVEC_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1) { \
    __m128 _tmp1,_tmp2; \
    _tmp1 = _mm_load_ss(ptr1); \
    _tmp2 = _mm_load_ss(ptr2); \
    _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)((ptr1)+1)); \
    _tmp2 = _mm_loadh_pi(_tmp2,(__m64 *)((ptr2)+1)); \
    jx1   = _mm_unpacklo_ps(_tmp1,_tmp2); \
    jy1   = _mm_unpackhi_ps(_tmp1,_tmp2); \
    jz1   = _mm_movehl_ps(jy1,jy1); \
}
/* Two consecutive rvecs (6 floats) from each of two addresses, transposed:
 * element 0 of every output comes from ptr1 and element 1 from ptr2. The two
 * upper elements of each output are not meaningful. */
#define GMX_MM_LOAD_2RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1, _tmp2; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    jy1   = _mm_loadu_ps(ptr2); \
    jy2   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr1)+4)); \
    _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr2)+4)); \
    jx1   = _mm_unpacklo_ps(_tmp1,jy1); \
    jz1   = _mm_unpackhi_ps(_tmp1,jy1); \
    jy2   = _mm_unpacklo_ps(jy2,_tmp2); \
    jy1   = _mm_movehl_ps(jx1,jx1); \
    jx2   = _mm_movehl_ps(jz1,jz1); \
    jz2   = _mm_movehl_ps(jy2,jy2); \
}
/* Three consecutive rvecs (9 floats) from each of two addresses, transposed:
 * element 0 of every output comes from ptr1 and element 1 from ptr2. The two
 * upper elements of each output are not meaningful. */
#define GMX_MM_LOAD_3RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1, _tmp2, _tmp3; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    jy1   = _mm_loadu_ps(ptr2); \
    _tmp2 = _mm_loadu_ps((ptr1)+4); \
    jz2   = _mm_loadu_ps((ptr2)+4); \
    jz3   = _mm_load_ss((ptr1)+8); \
    _tmp3 = _mm_load_ss((ptr2)+8); \
    jx1   = _mm_unpacklo_ps(_tmp1,jy1); \
    jz1   = _mm_unpackhi_ps(_tmp1,jy1); \
    jy2   = _mm_unpacklo_ps(_tmp2,jz2); \
    jx3   = _mm_unpackhi_ps(_tmp2,jz2); \
    jy1   = _mm_movehl_ps(jx1,jx1); \
    jx2   = _mm_movehl_ps(jz1,jz1); \
    jz2   = _mm_movehl_ps(jy2,jy2); \
    jy3   = _mm_movehl_ps(jx3,jx3); \
    jz3   = _mm_unpacklo_ps(jz3,_tmp3); \
}
/* Four consecutive rvecs (12 floats) from each of two addresses, transposed:
 * element 0 of every output comes from ptr1 and element 1 from ptr2. The two
 * upper elements of each output are not meaningful. */
#define GMX_MM_LOAD_4RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128 _tmp1, _tmp2, _tmp3,_tmp4; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    jy1   = _mm_loadu_ps(ptr2); \
    _tmp2 = _mm_loadu_ps((ptr1)+4); \
    jz2   = _mm_loadu_ps((ptr2)+4); \
    _tmp3 = _mm_loadu_ps((ptr1)+8); \
    _tmp4 = _mm_loadu_ps((ptr2)+8); \
    jx1   = _mm_unpacklo_ps(_tmp1,jy1); \
    jz1   = _mm_unpackhi_ps(_tmp1,jy1); \
    jy2   = _mm_unpacklo_ps(_tmp2,jz2); \
    jx3   = _mm_unpackhi_ps(_tmp2,jz2); \
    jz3   = _mm_unpacklo_ps(_tmp3,_tmp4); \
    jy4   = _mm_unpackhi_ps(_tmp3,_tmp4); \
    jy1   = _mm_movehl_ps(jx1,jx1); \
    jx2   = _mm_movehl_ps(jz1,jz1); \
    jz2   = _mm_movehl_ps(jy2,jy2); \
    jy3   = _mm_movehl_ps(jx3,jx3); \
    jx4   = _mm_movehl_ps(jz3,jz3); \
    jz4   = _mm_movehl_ps(jy4,jy4); \
}
/* One rvec from each of three addresses, transposed: elements 0, 1, 2 of
 * jx1/jy1/jz1 hold x/y/z from ptr1, ptr2, ptr3. Element 3 is not meaningful
 * (zero in jx1, a copy of element 2 in jy1 and jz1). */
#define GMX_MM_LOAD_1RVEC_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1) { \
    __m128 _tmp1,_tmp3,_tmp4; \
    jx1   = _mm_load_ss(ptr1); \
    jy1   = _mm_load_ss(ptr2); \
    jz1   = _mm_load_ss(ptr3); \
    jx1   = _mm_loadh_pi(jx1,(__m64 *)((ptr1)+1)); \
    jy1   = _mm_loadh_pi(jy1,(__m64 *)((ptr2)+1)); \
    jz1   = _mm_loadh_pi(jz1,(__m64 *)((ptr3)+1)); \
    _tmp1 = _mm_unpacklo_ps(jx1,jy1); \
    _tmp3 = _mm_unpackhi_ps(jx1,jy1); \
    _tmp4 = _mm_unpackhi_ps(jz1,jz1); \
    jx1   = _mm_movelh_ps(_tmp1,jz1); \
    jy1   = _mm_movelh_ps(_tmp3,_tmp4); \
    jz1   = _mm_movehl_ps(_tmp4,_tmp3); \
}
/* Two consecutive rvecs (6 floats) from each of three addresses, transposed:
 * elements 0, 1, 2 of every output come from ptr1, ptr2, ptr3 and element 3
 * is zero. */
#define GMX_MM_LOAD_2RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1, _tmp2; \
    jx1   = _mm_loadu_ps(ptr1); \
    jy1   = _mm_loadu_ps(ptr2); \
    jz1   = _mm_loadu_ps(ptr3); \
    jx2   = _mm_setzero_ps(); \
    _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2); \
    _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr1)+4)); \
    jz2   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr2)+4)); \
    _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr3)+4)); \
    _tmp1 = _mm_unpacklo_ps(_tmp1,_tmp2); \
    jz2   = _mm_unpacklo_ps(jz2,_mm_setzero_ps()); \
    jy2   = _mm_unpacklo_ps(_tmp1,jz2); \
    jz2   = _mm_unpackhi_ps(_tmp1,jz2); \
}
/* Three consecutive rvecs (9 floats) from each of three addresses,
 * transposed: elements 0, 1, 2 of every output come from ptr1, ptr2, ptr3
 * and element 3 is zero. */
#define GMX_MM_LOAD_3RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1, _tmp2; \
    jx1   = _mm_loadu_ps(ptr1); \
    jy1   = _mm_loadu_ps(ptr2); \
    jz1   = _mm_loadu_ps(ptr3); \
    jx2   = _mm_setzero_ps(); \
    _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2); \
    jy2   = _mm_loadu_ps((ptr1)+4); \
    jz2   = _mm_loadu_ps((ptr2)+4); \
    jx3   = _mm_loadu_ps((ptr3)+4); \
    jy3   = _mm_setzero_ps(); \
    _MM_TRANSPOSE4_PS(jy2,jz2,jx3,jy3); \
    jz3   = _mm_load_ss((ptr1)+8); \
    _tmp1 = _mm_load_ss((ptr2)+8); \
    _tmp2 = _mm_load_ss((ptr3)+8); \
    jz3   = _mm_unpacklo_ps(jz3,_tmp2); \
    _tmp1 = _mm_unpacklo_ps(_tmp1,_mm_setzero_ps()); \
    jz3   = _mm_unpacklo_ps(jz3,_tmp1); \
}
/* Four consecutive rvecs (12 floats) from each of three addresses,
 * transposed with three 4x4 transposes whose fourth row is zero: elements
 * 0, 1, 2 of every output come from ptr1, ptr2, ptr3 and element 3 is zero. */
#define GMX_MM_LOAD_4RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    jx1 = _mm_loadu_ps(ptr1); \
    jy1 = _mm_loadu_ps(ptr2); \
    jz1 = _mm_loadu_ps(ptr3); \
    jx2 = _mm_setzero_ps(); \
    _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2); \
    jy2 = _mm_loadu_ps((ptr1)+4); \
    jz2 = _mm_loadu_ps((ptr2)+4); \
    jx3 = _mm_loadu_ps((ptr3)+4); \
    jy3 = _mm_setzero_ps(); \
    _MM_TRANSPOSE4_PS(jy2,jz2,jx3,jy3); \
    jz3 = _mm_loadu_ps((ptr1)+8); \
    jx4 = _mm_loadu_ps((ptr2)+8); \
    jy4 = _mm_loadu_ps((ptr3)+8); \
    jz4 = _mm_setzero_ps(); \
    _MM_TRANSPOSE4_PS(jz3,jx4,jy4,jz4); \
}
/* One rvec from each of four addresses, transposed: element k of
 * jx1/jy1/jz1 holds x/y/z read from the (k+1)-th pointer. */
#define GMX_MM_LOAD_1RVEC_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5; \
    jx1   = _mm_load_ss(ptr1); \
    _tmp1 = _mm_load_ss(ptr2); \
    jy1   = _mm_load_ss(ptr3); \
    jz1   = _mm_load_ss(ptr4); \
    jx1   = _mm_loadh_pi(jx1,(__m64 *)((ptr1)+1)); \
    _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)((ptr2)+1)); \
    jy1   = _mm_loadh_pi(jy1,(__m64 *)((ptr3)+1)); \
    jz1   = _mm_loadh_pi(jz1,(__m64 *)((ptr4)+1)); \
    _tmp2 = _mm_unpacklo_ps(jx1,_tmp1); \
    _tmp3 = _mm_unpacklo_ps(jy1,jz1); \
    _tmp4 = _mm_unpackhi_ps(jx1,_tmp1); \
    _tmp5 = _mm_unpackhi_ps(jy1,jz1); \
    jx1   = _mm_movelh_ps(_tmp2,_tmp3); \
    jy1   = _mm_movelh_ps(_tmp4,_tmp5); \
    jz1   = _mm_movehl_ps(_tmp5,_tmp4); \
}
/* Two consecutive rvecs (6 floats) from each of four addresses, transposed:
 * element k of every output comes from the (k+1)-th pointer. */
#define GMX_MM_LOAD_2RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1, _tmp2; \
    jx1   = _mm_loadu_ps(ptr1); \
    jy1   = _mm_loadu_ps(ptr2); \
    jz1   = _mm_loadu_ps(ptr3); \
    jx2   = _mm_loadu_ps(ptr4); \
    _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2); \
    jy2   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr1)+4)); \
    jz2   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr2)+4)); \
    _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr3)+4)); \
    _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr4)+4)); \
    _tmp1 = _mm_unpacklo_ps(jy2,_tmp1); \
    _tmp2 = _mm_unpacklo_ps(jz2,_tmp2); \
    jy2   = _mm_unpacklo_ps(_tmp1,_tmp2); \
    jz2   = _mm_unpackhi_ps(_tmp1,_tmp2); \
}
/* Three consecutive rvecs (9 floats) from each of four addresses,
 * transposed: element k of every output comes from the (k+1)-th pointer. */
#define GMX_MM_LOAD_3RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1, _tmp2, _tmp3; \
    jx1   = _mm_loadu_ps(ptr1); \
    jy1   = _mm_loadu_ps(ptr2); \
    jz1   = _mm_loadu_ps(ptr3); \
    jx2   = _mm_loadu_ps(ptr4); \
    _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2); \
    jy2   = _mm_loadu_ps((ptr1)+4); \
    jz2   = _mm_loadu_ps((ptr2)+4); \
    jx3   = _mm_loadu_ps((ptr3)+4); \
    jy3   = _mm_loadu_ps((ptr4)+4); \
    _MM_TRANSPOSE4_PS(jy2,jz2,jx3,jy3); \
    jz3   = _mm_load_ss((ptr1)+8); \
    _tmp1 = _mm_load_ss((ptr2)+8); \
    _tmp2 = _mm_load_ss((ptr3)+8); \
    _tmp3 = _mm_load_ss((ptr4)+8); \
    jz3   = _mm_unpacklo_ps(jz3,_tmp2); \
    _tmp1 = _mm_unpacklo_ps(_tmp1,_tmp3); \
    jz3   = _mm_unpacklo_ps(jz3,_tmp1); \
}
/* Four consecutive rvecs (12 floats) from each of four addresses, transposed
 * with three 4x4 transposes: element k of every output comes from the
 * (k+1)-th pointer. */
#define GMX_MM_LOAD_4RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    jx1 = _mm_loadu_ps(ptr1); \
    jy1 = _mm_loadu_ps(ptr2); \
    jz1 = _mm_loadu_ps(ptr3); \
    jx2 = _mm_loadu_ps(ptr4); \
    _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2); \
    jy2 = _mm_loadu_ps((ptr1)+4); \
    jz2 = _mm_loadu_ps((ptr2)+4); \
    jx3 = _mm_loadu_ps((ptr3)+4); \
    jy3 = _mm_loadu_ps((ptr4)+4); \
    _MM_TRANSPOSE4_PS(jy2,jz2,jx3,jy3); \
    jz3 = _mm_loadu_ps((ptr1)+8); \
    jx4 = _mm_loadu_ps((ptr2)+8); \
    jy4 = _mm_loadu_ps((ptr3)+8); \
    jz4 = _mm_loadu_ps((ptr4)+8); \
    _MM_TRANSPOSE4_PS(jz3,jx4,jy4,jz4); \
}
/* Routines to increment rvecs in memory, typically used for j particle force updates */
/* Add one rvec to memory: ptr1[0..2] += lowest element of jx1, jy1, jz1.
 * jx1 and jy1 are used as scratch and are overwritten. */
#define GMX_MM_INCREMENT_1RVEC_1POINTER_PS(ptr1,jx1,jy1,jz1) { \
    __m128 _tmp1; \
    jy1   = _mm_unpacklo_ps(jy1,jz1); \
    jx1   = _mm_movelh_ps(jx1,jy1); \
    _tmp1 = _mm_load_ss(ptr1); \
    _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)((ptr1)+1)); \
    _tmp1 = _mm_add_ps(_tmp1,jx1); \
    _mm_store_ss(ptr1,_tmp1); \
    _mm_storeh_pi((__m64 *)((ptr1)+1),_tmp1); \
}
/* Add two rvecs to memory: ptr1[0..5] += lowest element of jx1, jy1, jz1,
 * jx2, jy2, jz2 (in that order).
 * jx1, jz1 and jy2 are used as scratch and are overwritten. */
#define GMX_MM_INCREMENT_2RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1, _tmp2; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr1)+4)); \
    jx1   = _mm_unpacklo_ps(jx1,jy1); \
    jz1   = _mm_unpacklo_ps(jz1,jx2); \
    jy2   = _mm_unpacklo_ps(jy2,jz2); \
    jx1   = _mm_movelh_ps(jx1,jz1); \
    _tmp1 = _mm_add_ps(_tmp1,jx1); \
    _tmp2 = _mm_add_ps(_tmp2,jy2); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storel_pi((__m64 *)((ptr1)+4),_tmp2); \
}
/* Add three rvecs to memory: ptr1[0..8] += lowest element of jx1, jy1, jz1,
 * ..., jz3 (in that order).
 * jx1, jz1, jy2 and jx3 are used as scratch and are overwritten. */
#define GMX_MM_INCREMENT_3RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1, _tmp2, _tmp3; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps((ptr1)+4); \
    _tmp3 = _mm_load_ss((ptr1)+8); \
    jx1   = _mm_unpacklo_ps(jx1,jy1); \
    jz1   = _mm_unpacklo_ps(jz1,jx2); \
    jy2   = _mm_unpacklo_ps(jy2,jz2); \
    jx3   = _mm_unpacklo_ps(jx3,jy3); \
    jx1   = _mm_movelh_ps(jx1,jz1); \
    jy2   = _mm_movelh_ps(jy2,jx3); \
    _tmp1 = _mm_add_ps(_tmp1,jx1); \
    _tmp2 = _mm_add_ps(_tmp2,jy2); \
    _tmp3 = _mm_add_ss(_tmp3,jz3); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps((ptr1)+4,_tmp2); \
    _mm_store_ss((ptr1)+8,_tmp3); \
}
/* Add four rvecs to memory: ptr1[0..11] += lowest element of jx1, jy1, jz1,
 * ..., jz4 (in that order).
 * jx1, jz1, jy2, jx3, jz3 and jy4 are used as scratch and are overwritten. */
#define GMX_MM_INCREMENT_4RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128 _tmp1, _tmp2, _tmp3; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps((ptr1)+4); \
    _tmp3 = _mm_loadu_ps((ptr1)+8); \
    jx1   = _mm_unpacklo_ps(jx1,jy1); \
    jz1   = _mm_unpacklo_ps(jz1,jx2); \
    jy2   = _mm_unpacklo_ps(jy2,jz2); \
    jx3   = _mm_unpacklo_ps(jx3,jy3); \
    jz3   = _mm_unpacklo_ps(jz3,jx4); \
    jy4   = _mm_unpacklo_ps(jy4,jz4); \
    jx1   = _mm_movelh_ps(jx1,jz1); \
    jy2   = _mm_movelh_ps(jy2,jx3); \
    jz3   = _mm_movelh_ps(jz3,jy4); \
    _tmp1 = _mm_add_ps(_tmp1,jx1); \
    _tmp2 = _mm_add_ps(_tmp2,jy2); \
    _tmp3 = _mm_add_ps(_tmp3,jz3); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps((ptr1)+4,_tmp2); \
    _mm_storeu_ps((ptr1)+8,_tmp3); \
}
/* Add one x/y/z triplet to each of two destinations.
 *
 * Lane 0 of jx1,jy1,jz1 is added to ptr1[0..2] and lane 1 to ptr2[0..2];
 * lanes 2 and 3 are ignored.  jx1 is overwritten with scratch values.
 * Exactly three floats per pointer are read and written.
 */
#define GMX_MM_INCREMENT_1RVEC_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4; \
    _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1)); \
    _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr2)); /* x,y of ptr1 | x,y of ptr2 */ \
    _tmp2 = _mm_load_ss((ptr1)+2); \
    _tmp3 = _mm_load_ss((ptr2)+2); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); /* x[0] y[0] x[1] y[1] */ \
    _tmp4 = _mm_shuffle_ps(jz1,jz1,_MM_SHUFFLE(0,0,0,1)); /* z[1] moved to lane 0 */ \
    _tmp1 = _mm_add_ps(_tmp1,jx1); \
    _mm_storel_pi((__m64 *)(ptr1),_tmp1); \
    _mm_storeh_pi((__m64 *)(ptr2),_tmp1); \
    _mm_store_ss((ptr1)+2,_mm_add_ss(_tmp2,jz1)); \
    _mm_store_ss((ptr2)+2,_mm_add_ss(_tmp3,_tmp4)); \
}
/* Add two consecutive x/y/z triplets (6 floats) to each of two destinations.
 *
 * Lane 0 of jx1..jz2 is added to ptr1[0..5] and lane 1 to ptr2[0..5];
 * lanes 2 and 3 are ignored.  jx1, jz1 and jy2 are overwritten with
 * scratch values.  Exactly six floats per pointer are read and written.
 */
#define GMX_MM_INCREMENT_2RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr2); \
    _tmp3 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr1)+4)); \
    _tmp3 = _mm_loadh_pi(_tmp3,(__m64 *)((ptr2)+4)); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); /* y2 z2 for ptr1 | y2 z2 for ptr2 */ \
    _tmp4 = _mm_movelh_ps(jx1,jz1); /* x1 y1 z1 x2 for ptr1 */ \
    _tmp5 = _mm_movehl_ps(jz1,jx1); /* x1 y1 z1 x2 for ptr2 */ \
    _tmp1 = _mm_add_ps(_tmp1,_tmp4); \
    _tmp2 = _mm_add_ps(_tmp2,_tmp5); \
    _tmp3 = _mm_add_ps(_tmp3,jy2); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr2,_tmp2); \
    _mm_storel_pi((__m64 *)((ptr1)+4),_tmp3); \
    _mm_storeh_pi((__m64 *)((ptr2)+4),_tmp3); \
}
/* Add three consecutive x/y/z triplets (9 floats) to each of two destinations.
 *
 * Lane 0 of jx1..jz3 is added to ptr1[0..8] and lane 1 to ptr2[0..8];
 * lanes 2 and 3 are ignored.  jx1, jz1, jy2 and jx3 are overwritten with
 * scratch values.  Exactly nine floats per pointer are read and written.
 */
#define GMX_MM_INCREMENT_3RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps((ptr1)+4); \
    _tmp3 = _mm_load_ss((ptr1)+8); \
    _tmp4 = _mm_loadu_ps(ptr2); \
    _tmp5 = _mm_loadu_ps((ptr2)+4); \
    _tmp6 = _mm_load_ss((ptr2)+8); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    jx3 = _mm_unpacklo_ps(jx3,jy3); \
    _tmp7 = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1)); /* z3 of lane 1 moved to lane 0 */ \
    _tmp8 = _mm_movelh_ps(jx1,jz1); /* x1 y1 z1 x2 for ptr1 */ \
    _tmp9 = _mm_movehl_ps(jz1,jx1); /* x1 y1 z1 x2 for ptr2 */ \
    _tmp10 = _mm_movelh_ps(jy2,jx3); /* y2 z2 x3 y3 for ptr1 */ \
    _tmp11 = _mm_movehl_ps(jx3,jy2); /* y2 z2 x3 y3 for ptr2 */ \
    _tmp1 = _mm_add_ps(_tmp1,_tmp8); \
    _tmp2 = _mm_add_ps(_tmp2,_tmp10); \
    _tmp3 = _mm_add_ss(_tmp3,jz3); \
    _tmp4 = _mm_add_ps(_tmp4,_tmp9); \
    _tmp5 = _mm_add_ps(_tmp5,_tmp11); \
    _tmp6 = _mm_add_ss(_tmp6,_tmp7); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps((ptr1)+4,_tmp2); \
    _mm_store_ss((ptr1)+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps((ptr2)+4,_tmp5); \
    _mm_store_ss((ptr2)+8,_tmp6); \
}
/* Add four consecutive x/y/z triplets (12 floats) to each of two destinations.
 *
 * Lane 0 of jx1..jz4 is added to ptr1[0..11] and lane 1 to ptr2[0..11];
 * lanes 2 and 3 are ignored.  jx1, jz1, jy2, jx3, jz3 and jy4 are
 * overwritten with scratch values.  Memory is accessed with three
 * unaligned 4-float loads and stores per pointer.
 */
#define GMX_MM_INCREMENT_4RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp8,_tmp9,_tmp10,_tmp11,_tmp12,_tmp13; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps((ptr1)+4); \
    _tmp3 = _mm_loadu_ps((ptr1)+8); \
    _tmp4 = _mm_loadu_ps(ptr2); \
    _tmp5 = _mm_loadu_ps((ptr2)+4); \
    _tmp6 = _mm_loadu_ps((ptr2)+8); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    jx3 = _mm_unpacklo_ps(jx3,jy3); \
    jz3 = _mm_unpacklo_ps(jz3,jx4); \
    jy4 = _mm_unpacklo_ps(jy4,jz4); \
    _tmp8 = _mm_movelh_ps(jx1,jz1); /* x1 y1 z1 x2 for ptr1 */ \
    _tmp9 = _mm_movehl_ps(jz1,jx1); /* x1 y1 z1 x2 for ptr2 */ \
    _tmp10 = _mm_movelh_ps(jy2,jx3); /* y2 z2 x3 y3 for ptr1 */ \
    _tmp11 = _mm_movehl_ps(jx3,jy2); /* y2 z2 x3 y3 for ptr2 */ \
    _tmp12 = _mm_movelh_ps(jz3,jy4); /* z3 x4 y4 z4 for ptr1 */ \
    _tmp13 = _mm_movehl_ps(jy4,jz3); /* z3 x4 y4 z4 for ptr2 */ \
    _tmp1 = _mm_add_ps(_tmp1,_tmp8); \
    _tmp2 = _mm_add_ps(_tmp2,_tmp10); \
    _tmp3 = _mm_add_ps(_tmp3,_tmp12); \
    _tmp4 = _mm_add_ps(_tmp4,_tmp9); \
    _tmp5 = _mm_add_ps(_tmp5,_tmp11); \
    _tmp6 = _mm_add_ps(_tmp6,_tmp13); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps((ptr1)+4,_tmp2); \
    _mm_storeu_ps((ptr1)+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps((ptr2)+4,_tmp5); \
    _mm_storeu_ps((ptr2)+8,_tmp6); \
}
/* Add one x/y/z triplet to each of three destinations.
 *
 * Lane 0 of jx1,jy1,jz1 is added to ptr1[0..2], lane 1 to ptr2[0..2] and
 * lane 2 to ptr3[0..2]; lane 3 is ignored.  jx1 is overwritten with
 * scratch values.  Exactly three floats per pointer are read and written.
 */
#define GMX_MM_INCREMENT_1RVEC_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7; \
    _tmp1 = _mm_load_ss(ptr1); \
    _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)((ptr1)+1)); /* p0 0 p1 p2 */ \
    _tmp2 = _mm_load_ss(ptr2); \
    _tmp2 = _mm_loadh_pi(_tmp2,(__m64 *)((ptr2)+1)); \
    _tmp3 = _mm_load_ss(ptr3); \
    _tmp3 = _mm_loadh_pi(_tmp3,(__m64 *)((ptr3)+1)); \
    _tmp4 = _mm_unpacklo_ps(jy1,jz1); /* y[0] z[0] y[1] z[1] */ \
    _tmp5 = _mm_unpackhi_ps(jy1,jz1); /* y[2] z[2] y[3] z[3] */ \
    _tmp6 = _mm_shuffle_ps(jx1,_tmp4,_MM_SHUFFLE(3,2,0,1)); /* x[1] - y[1] z[1] */ \
    _tmp7 = _mm_shuffle_ps(jx1,jx1,_MM_SHUFFLE(0,0,0,2)); /* x[2] in lane 0 */ \
    jx1 = _mm_movelh_ps(jx1,_tmp4); /* x[0] - y[0] z[0] */ \
    _tmp7 = _mm_movelh_ps(_tmp7,_tmp5); /* x[2] - y[2] z[2] */ \
    _tmp1 = _mm_add_ps(_tmp1,jx1); \
    _tmp2 = _mm_add_ps(_tmp2,_tmp6); \
    _tmp3 = _mm_add_ps(_tmp3,_tmp7); \
    _mm_store_ss(ptr1,_tmp1); \
    _mm_storeh_pi((__m64 *)((ptr1)+1),_tmp1); \
    _mm_store_ss(ptr2,_tmp2); \
    _mm_storeh_pi((__m64 *)((ptr2)+1),_tmp2); \
    _mm_store_ss(ptr3,_tmp3); \
    _mm_storeh_pi((__m64 *)((ptr3)+1),_tmp3); \
}
/* Add two consecutive x/y/z triplets (6 floats) to each of three destinations.
 *
 * Lane 0 of jx1..jz2 is added to ptr1[0..5], lane 1 to ptr2[0..5] and
 * lane 2 to ptr3[0..5]; lane 3 is ignored.  jx1, jz1 and jy2 are
 * overwritten with scratch values.  Exactly six floats per pointer are
 * read and written.
 *
 * The half-register loads start from a zeroed register so that no
 * uninitialised temporary is ever read.
 */
#define GMX_MM_INCREMENT_2RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr2); \
    _tmp3 = _mm_loadu_ps(ptr3); \
    _tmp4 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr1)+4)); \
    _tmp4 = _mm_loadh_pi(_tmp4,(__m64 *)((ptr2)+4)); \
    _tmp5 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr3)+4)); \
    _tmp6 = _mm_unpackhi_ps(jx1,jy1); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    _tmp7 = _mm_unpackhi_ps(jz1,jx2); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    _tmp8 = _mm_unpackhi_ps(jy2,jz2); /* low half: y2 z2 for ptr3 */ \
    jy2 = _mm_unpacklo_ps(jy2,jz2); /* y2 z2 for ptr1 | y2 z2 for ptr2 */ \
    _tmp9 = _mm_movelh_ps(jx1,jz1); /* x1 y1 z1 x2 for ptr1 */ \
    _tmp10 = _mm_movehl_ps(jz1,jx1); /* x1 y1 z1 x2 for ptr2 */ \
    _tmp6 = _mm_movelh_ps(_tmp6,_tmp7); /* x1 y1 z1 x2 for ptr3 */ \
    _tmp1 = _mm_add_ps(_tmp1,_tmp9); \
    _tmp2 = _mm_add_ps(_tmp2,_tmp10); \
    _tmp3 = _mm_add_ps(_tmp3,_tmp6); \
    _tmp4 = _mm_add_ps(_tmp4,jy2); \
    _tmp5 = _mm_add_ps(_tmp5,_tmp8); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr2,_tmp2); \
    _mm_storeu_ps(ptr3,_tmp3); \
    _mm_storel_pi((__m64 *)((ptr1)+4),_tmp4); \
    _mm_storeh_pi((__m64 *)((ptr2)+4),_tmp4); \
    _mm_storel_pi((__m64 *)((ptr3)+4),_tmp5); \
}
/* Add three consecutive x/y/z triplets (9 floats) to each of three destinations.
 *
 * Lane 0 of jx1..jz3 is added to ptr1[0..8], lane 1 to ptr2[0..8] and
 * lane 2 to ptr3[0..8]; lane 3 is ignored.  jx1, jz1, jy2 and jx3 are
 * overwritten with scratch values.  Exactly nine floats per pointer are
 * read and written.
 */
#define GMX_MM_INCREMENT_3RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
    __m128 _tmp11,_tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps((ptr1)+4); \
    _tmp3 = _mm_load_ss((ptr1)+8); \
    _tmp4 = _mm_loadu_ps(ptr2); \
    _tmp5 = _mm_loadu_ps((ptr2)+4); \
    _tmp6 = _mm_load_ss((ptr2)+8); \
    _tmp7 = _mm_loadu_ps(ptr3); \
    _tmp8 = _mm_loadu_ps((ptr3)+4); \
    _tmp9 = _mm_load_ss((ptr3)+8); \
    _tmp10 = _mm_unpackhi_ps(jx1,jy1); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    _tmp11 = _mm_unpackhi_ps(jz1,jx2); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    _tmp12 = _mm_unpackhi_ps(jy2,jz2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    _tmp13 = _mm_unpackhi_ps(jx3,jy3); \
    jx3 = _mm_unpacklo_ps(jx3,jy3); \
    _tmp14 = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1)); /* z3 of lane 1 in lane 0 */ \
    _tmp15 = _mm_movehl_ps(jz3,jz3); /* z3 of lane 2 in lane 0 */ \
    _tmp16 = _mm_movelh_ps(jx1,jz1); /* x1 y1 z1 x2 for ptr1 */ \
    _tmp17 = _mm_movehl_ps(jz1,jx1); /* x1 y1 z1 x2 for ptr2 */ \
    _tmp10 = _mm_movelh_ps(_tmp10,_tmp11); /* x1 y1 z1 x2 for ptr3 */ \
    _tmp18 = _mm_movelh_ps(jy2,jx3); /* y2 z2 x3 y3 for ptr1 */ \
    _tmp19 = _mm_movehl_ps(jx3,jy2); /* y2 z2 x3 y3 for ptr2 */ \
    _tmp12 = _mm_movelh_ps(_tmp12,_tmp13); /* y2 z2 x3 y3 for ptr3 */ \
    _tmp1 = _mm_add_ps(_tmp1,_tmp16); \
    _tmp2 = _mm_add_ps(_tmp2,_tmp18); \
    _tmp3 = _mm_add_ss(_tmp3,jz3); \
    _tmp4 = _mm_add_ps(_tmp4,_tmp17); \
    _tmp5 = _mm_add_ps(_tmp5,_tmp19); \
    _tmp6 = _mm_add_ss(_tmp6,_tmp14); \
    _tmp7 = _mm_add_ps(_tmp7,_tmp10); \
    _tmp8 = _mm_add_ps(_tmp8,_tmp12); \
    _tmp9 = _mm_add_ss(_tmp9,_tmp15); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps((ptr1)+4,_tmp2); \
    _mm_store_ss((ptr1)+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps((ptr2)+4,_tmp5); \
    _mm_store_ss((ptr2)+8,_tmp6); \
    _mm_storeu_ps(ptr3,_tmp7); \
    _mm_storeu_ps((ptr3)+4,_tmp8); \
    _mm_store_ss((ptr3)+8,_tmp9); \
}
/* Add four consecutive x/y/z triplets (12 floats) to each of three destinations.
 *
 * Lane 0 of jx1..jz4 is added to ptr1[0..11], lane 1 to ptr2[0..11] and
 * lane 2 to ptr3[0..11]; lane 3 is ignored.  jx1, jz1, jy2, jx3, jz3 and
 * jy4 are overwritten with scratch values.  Memory is accessed with three
 * unaligned 4-float loads and stores per pointer.
 */
#define GMX_MM_INCREMENT_4RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11; \
    __m128 _tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19,_tmp20,_tmp21; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps((ptr1)+4); \
    _tmp3 = _mm_loadu_ps((ptr1)+8); \
    _tmp4 = _mm_loadu_ps(ptr2); \
    _tmp5 = _mm_loadu_ps((ptr2)+4); \
    _tmp6 = _mm_loadu_ps((ptr2)+8); \
    _tmp7 = _mm_loadu_ps(ptr3); \
    _tmp8 = _mm_loadu_ps((ptr3)+4); \
    _tmp9 = _mm_loadu_ps((ptr3)+8); \
    _tmp10 = _mm_unpackhi_ps(jx1,jy1); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    _tmp11 = _mm_unpackhi_ps(jz1,jx2); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    _tmp12 = _mm_unpackhi_ps(jy2,jz2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    _tmp13 = _mm_unpackhi_ps(jx3,jy3); \
    jx3 = _mm_unpacklo_ps(jx3,jy3); \
    _tmp14 = _mm_unpackhi_ps(jz3,jx4); \
    jz3 = _mm_unpacklo_ps(jz3,jx4); \
    _tmp15 = _mm_unpackhi_ps(jy4,jz4); \
    jy4 = _mm_unpacklo_ps(jy4,jz4); \
    _tmp16 = _mm_movelh_ps(jx1,jz1); /* x1 y1 z1 x2 for ptr1 */ \
    _tmp17 = _mm_movehl_ps(jz1,jx1); /* x1 y1 z1 x2 for ptr2 */ \
    _tmp10 = _mm_movelh_ps(_tmp10,_tmp11); /* x1 y1 z1 x2 for ptr3 */ \
    _tmp18 = _mm_movelh_ps(jy2,jx3); /* y2 z2 x3 y3 for ptr1 */ \
    _tmp19 = _mm_movehl_ps(jx3,jy2); /* y2 z2 x3 y3 for ptr2 */ \
    _tmp12 = _mm_movelh_ps(_tmp12,_tmp13); /* y2 z2 x3 y3 for ptr3 */ \
    _tmp20 = _mm_movelh_ps(jz3,jy4); /* z3 x4 y4 z4 for ptr1 */ \
    _tmp21 = _mm_movehl_ps(jy4,jz3); /* z3 x4 y4 z4 for ptr2 */ \
    _tmp14 = _mm_movelh_ps(_tmp14,_tmp15); /* z3 x4 y4 z4 for ptr3 */ \
    _tmp1 = _mm_add_ps(_tmp1,_tmp16); \
    _tmp2 = _mm_add_ps(_tmp2,_tmp18); \
    _tmp3 = _mm_add_ps(_tmp3,_tmp20); \
    _tmp4 = _mm_add_ps(_tmp4,_tmp17); \
    _tmp5 = _mm_add_ps(_tmp5,_tmp19); \
    _tmp6 = _mm_add_ps(_tmp6,_tmp21); \
    _tmp7 = _mm_add_ps(_tmp7,_tmp10); \
    _tmp8 = _mm_add_ps(_tmp8,_tmp12); \
    _tmp9 = _mm_add_ps(_tmp9,_tmp14); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps((ptr1)+4,_tmp2); \
    _mm_storeu_ps((ptr1)+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps((ptr2)+4,_tmp5); \
    _mm_storeu_ps((ptr2)+8,_tmp6); \
    _mm_storeu_ps(ptr3,_tmp7); \
    _mm_storeu_ps((ptr3)+4,_tmp8); \
    _mm_storeu_ps((ptr3)+8,_tmp9); \
}
/* Add one x/y/z triplet to each of four destinations.
 *
 * Lane k of jx1,jy1,jz1 is added to the three floats at ptr(k+1), for
 * k = 0..3.  None of the j arguments is modified.  Exactly three floats
 * per pointer are read and written.
 */
#define GMX_MM_INCREMENT_1RVEC_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
    _tmp1 = _mm_load_ss(ptr1); \
    _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)((ptr1)+1)); /* p0 0 p1 p2 */ \
    _tmp2 = _mm_load_ss(ptr2); \
    _tmp2 = _mm_loadh_pi(_tmp2,(__m64 *)((ptr2)+1)); \
    _tmp3 = _mm_load_ss(ptr3); \
    _tmp3 = _mm_loadh_pi(_tmp3,(__m64 *)((ptr3)+1)); \
    _tmp4 = _mm_load_ss(ptr4); \
    _tmp4 = _mm_loadh_pi(_tmp4,(__m64 *)((ptr4)+1)); \
    _tmp5 = _mm_unpacklo_ps(jy1,jz1); /* y[0] z[0] y[1] z[1] */ \
    _tmp6 = _mm_unpackhi_ps(jy1,jz1); /* y[2] z[2] y[3] z[3] */ \
    _tmp7 = _mm_shuffle_ps(jx1,_tmp5,_MM_SHUFFLE(1,0,0,0)); /* x[0] - y[0] z[0] */ \
    _tmp8 = _mm_shuffle_ps(jx1,_tmp5,_MM_SHUFFLE(3,2,0,1)); /* x[1] - y[1] z[1] */ \
    _tmp9 = _mm_shuffle_ps(jx1,_tmp6,_MM_SHUFFLE(1,0,0,2)); /* x[2] - y[2] z[2] */ \
    _tmp10 = _mm_shuffle_ps(jx1,_tmp6,_MM_SHUFFLE(3,2,0,3)); /* x[3] - y[3] z[3] */ \
    _tmp1 = _mm_add_ps(_tmp1,_tmp7); \
    _tmp2 = _mm_add_ps(_tmp2,_tmp8); \
    _tmp3 = _mm_add_ps(_tmp3,_tmp9); \
    _tmp4 = _mm_add_ps(_tmp4,_tmp10); \
    _mm_store_ss(ptr1,_tmp1); \
    _mm_storeh_pi((__m64 *)((ptr1)+1),_tmp1); \
    _mm_store_ss(ptr2,_tmp2); \
    _mm_storeh_pi((__m64 *)((ptr2)+1),_tmp2); \
    _mm_store_ss(ptr3,_tmp3); \
    _mm_storeh_pi((__m64 *)((ptr3)+1),_tmp3); \
    _mm_store_ss(ptr4,_tmp4); \
    _mm_storeh_pi((__m64 *)((ptr4)+1),_tmp4); \
}
/* Add two consecutive x/y/z triplets (6 floats) to each of four destinations.
 *
 * Lane k of jx1..jz2 is added to ptr(k+1)[0..5], for k = 0..3.
 * jx1, jz1 and jy2 are overwritten with scratch values.  Exactly six
 * floats per pointer are read and written.
 */
#define GMX_MM_INCREMENT_2RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11,_tmp12,_tmp13; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr2); \
    _tmp3 = _mm_loadu_ps(ptr3); \
    _tmp4 = _mm_loadu_ps(ptr4); \
    _tmp5 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr1)+4)); \
    _tmp5 = _mm_loadh_pi(_tmp5,(__m64 *)((ptr2)+4)); \
    _tmp6 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr3)+4)); \
    _tmp6 = _mm_loadh_pi(_tmp6,(__m64 *)((ptr4)+4)); \
    _tmp7 = _mm_unpackhi_ps(jx1,jy1); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    _tmp8 = _mm_unpackhi_ps(jz1,jx2); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    _tmp9 = _mm_unpackhi_ps(jy2,jz2); /* y2 z2 for ptr3 | y2 z2 for ptr4 */ \
    jy2 = _mm_unpacklo_ps(jy2,jz2); /* y2 z2 for ptr1 | y2 z2 for ptr2 */ \
    _tmp10 = _mm_movelh_ps(jx1,jz1); /* x1 y1 z1 x2 for ptr1 */ \
    _tmp11 = _mm_movehl_ps(jz1,jx1); /* x1 y1 z1 x2 for ptr2 */ \
    _tmp12 = _mm_movelh_ps(_tmp7,_tmp8); /* x1 y1 z1 x2 for ptr3 */ \
    _tmp13 = _mm_movehl_ps(_tmp8,_tmp7); /* x1 y1 z1 x2 for ptr4 */ \
    _tmp1 = _mm_add_ps(_tmp1,_tmp10); \
    _tmp2 = _mm_add_ps(_tmp2,_tmp11); \
    _tmp3 = _mm_add_ps(_tmp3,_tmp12); \
    _tmp4 = _mm_add_ps(_tmp4,_tmp13); \
    _tmp5 = _mm_add_ps(_tmp5,jy2); \
    _tmp6 = _mm_add_ps(_tmp6,_tmp9); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr2,_tmp2); \
    _mm_storeu_ps(ptr3,_tmp3); \
    _mm_storeu_ps(ptr4,_tmp4); \
    _mm_storel_pi((__m64 *)((ptr1)+4),_tmp5); \
    _mm_storeh_pi((__m64 *)((ptr2)+4),_tmp5); \
    _mm_storel_pi((__m64 *)((ptr3)+4),_tmp6); \
    _mm_storeh_pi((__m64 *)((ptr4)+4),_tmp6); \
}
/* Add three consecutive x/y/z triplets (9 floats) to each of four destinations.
 *
 * Lane k of jx1..jz3 is added to ptr(k+1)[0..8], for k = 0..3.
 * jx1, jz1, jy2 and jx3 are overwritten with scratch values.  Exactly
 * nine floats per pointer are read and written.
 */
#define GMX_MM_INCREMENT_3RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
    __m128 _tmp11,_tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19; \
    __m128 _tmp20,_tmp21,_tmp22,_tmp23,_tmp24,_tmp25; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps((ptr1)+4); \
    _tmp3 = _mm_load_ss((ptr1)+8); \
    _tmp4 = _mm_loadu_ps(ptr2); \
    _tmp5 = _mm_loadu_ps((ptr2)+4); \
    _tmp6 = _mm_load_ss((ptr2)+8); \
    _tmp7 = _mm_loadu_ps(ptr3); \
    _tmp8 = _mm_loadu_ps((ptr3)+4); \
    _tmp9 = _mm_load_ss((ptr3)+8); \
    _tmp10 = _mm_loadu_ps(ptr4); \
    _tmp11 = _mm_loadu_ps((ptr4)+4); \
    _tmp12 = _mm_load_ss((ptr4)+8); \
    _tmp13 = _mm_unpackhi_ps(jx1,jy1); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    _tmp14 = _mm_unpackhi_ps(jz1,jx2); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    _tmp15 = _mm_unpackhi_ps(jy2,jz2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    _tmp16 = _mm_unpackhi_ps(jx3,jy3); \
    jx3 = _mm_unpacklo_ps(jx3,jy3); \
    _tmp17 = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1)); /* z3 of lane 1 in lane 0 */ \
    _tmp18 = _mm_movehl_ps(jz3,jz3); /* z3 of lane 2 in lane 0 */ \
    _tmp19 = _mm_shuffle_ps(_tmp18,_tmp18,_MM_SHUFFLE(0,0,0,1)); /* z3 of lane 3 in lane 0 */ \
    _tmp20 = _mm_movelh_ps(jx1,jz1); /* x1 y1 z1 x2 for ptr1 */ \
    _tmp21 = _mm_movehl_ps(jz1,jx1); /* x1 y1 z1 x2 for ptr2 */ \
    _tmp22 = _mm_movelh_ps(_tmp13,_tmp14); /* x1 y1 z1 x2 for ptr3 */ \
    _tmp14 = _mm_movehl_ps(_tmp14,_tmp13); /* x1 y1 z1 x2 for ptr4 */ \
    _tmp23 = _mm_movelh_ps(jy2,jx3); /* y2 z2 x3 y3 for ptr1 */ \
    _tmp24 = _mm_movehl_ps(jx3,jy2); /* y2 z2 x3 y3 for ptr2 */ \
    _tmp25 = _mm_movelh_ps(_tmp15,_tmp16); /* y2 z2 x3 y3 for ptr3 */ \
    _tmp16 = _mm_movehl_ps(_tmp16,_tmp15); /* y2 z2 x3 y3 for ptr4 */ \
    _tmp1 = _mm_add_ps(_tmp1,_tmp20); \
    _tmp2 = _mm_add_ps(_tmp2,_tmp23); \
    _tmp3 = _mm_add_ss(_tmp3,jz3); \
    _tmp4 = _mm_add_ps(_tmp4,_tmp21); \
    _tmp5 = _mm_add_ps(_tmp5,_tmp24); \
    _tmp6 = _mm_add_ss(_tmp6,_tmp17); \
    _tmp7 = _mm_add_ps(_tmp7,_tmp22); \
    _tmp8 = _mm_add_ps(_tmp8,_tmp25); \
    _tmp9 = _mm_add_ss(_tmp9,_tmp18); \
    _tmp10 = _mm_add_ps(_tmp10,_tmp14); \
    _tmp11 = _mm_add_ps(_tmp11,_tmp16); \
    _tmp12 = _mm_add_ss(_tmp12,_tmp19); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps((ptr1)+4,_tmp2); \
    _mm_store_ss((ptr1)+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps((ptr2)+4,_tmp5); \
    _mm_store_ss((ptr2)+8,_tmp6); \
    _mm_storeu_ps(ptr3,_tmp7); \
    _mm_storeu_ps((ptr3)+4,_tmp8); \
    _mm_store_ss((ptr3)+8,_tmp9); \
    _mm_storeu_ps(ptr4,_tmp10); \
    _mm_storeu_ps((ptr4)+4,_tmp11); \
    _mm_store_ss((ptr4)+8,_tmp12); \
}
/* Add four consecutive x/y/z triplets (12 floats) to each of four destinations.
 *
 * Lane k of jx1..jz4 is added to ptr(k+1)[0..11], for k = 0..3.
 * jx1, jz1, jy2, jx3, jz3 and jy4 are overwritten with scratch values.
 * Memory is accessed with three unaligned 4-float loads and stores per
 * pointer.
 */
#define GMX_MM_INCREMENT_4RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11; \
    __m128 _tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19,_tmp20,_tmp21,_tmp22; \
    __m128 _tmp23,_tmp24; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps((ptr1)+4); \
    _tmp3 = _mm_loadu_ps((ptr1)+8); \
    _tmp4 = _mm_loadu_ps(ptr2); \
    _tmp5 = _mm_loadu_ps((ptr2)+4); \
    _tmp6 = _mm_loadu_ps((ptr2)+8); \
    _tmp7 = _mm_loadu_ps(ptr3); \
    _tmp8 = _mm_loadu_ps((ptr3)+4); \
    _tmp9 = _mm_loadu_ps((ptr3)+8); \
    _tmp10 = _mm_loadu_ps(ptr4); \
    _tmp11 = _mm_loadu_ps((ptr4)+4); \
    _tmp12 = _mm_loadu_ps((ptr4)+8); \
    _tmp13 = _mm_unpackhi_ps(jx1,jy1); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    _tmp14 = _mm_unpackhi_ps(jz1,jx2); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    _tmp15 = _mm_unpackhi_ps(jy2,jz2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    _tmp16 = _mm_unpackhi_ps(jx3,jy3); \
    jx3 = _mm_unpacklo_ps(jx3,jy3); \
    _tmp17 = _mm_unpackhi_ps(jz3,jx4); \
    jz3 = _mm_unpacklo_ps(jz3,jx4); \
    _tmp18 = _mm_unpackhi_ps(jy4,jz4); \
    jy4 = _mm_unpacklo_ps(jy4,jz4); \
    _tmp19 = _mm_movelh_ps(jx1,jz1); /* x1 y1 z1 x2 for ptr1 */ \
    jz1 = _mm_movehl_ps(jz1,jx1); /* x1 y1 z1 x2 for ptr2 */ \
    _tmp20 = _mm_movelh_ps(_tmp13,_tmp14); /* x1 y1 z1 x2 for ptr3 */ \
    _tmp14 = _mm_movehl_ps(_tmp14,_tmp13); /* x1 y1 z1 x2 for ptr4 */ \
    _tmp21 = _mm_movelh_ps(jy2,jx3); /* y2 z2 x3 y3 for ptr1 */ \
    jx3 = _mm_movehl_ps(jx3,jy2); /* y2 z2 x3 y3 for ptr2 */ \
    _tmp22 = _mm_movelh_ps(_tmp15,_tmp16); /* y2 z2 x3 y3 for ptr3 */ \
    _tmp16 = _mm_movehl_ps(_tmp16,_tmp15); /* y2 z2 x3 y3 for ptr4 */ \
    _tmp23 = _mm_movelh_ps(jz3,jy4); /* z3 x4 y4 z4 for ptr1 */ \
    jy4 = _mm_movehl_ps(jy4,jz3); /* z3 x4 y4 z4 for ptr2 */ \
    _tmp24 = _mm_movelh_ps(_tmp17,_tmp18); /* z3 x4 y4 z4 for ptr3 */ \
    _tmp18 = _mm_movehl_ps(_tmp18,_tmp17); /* z3 x4 y4 z4 for ptr4 */ \
    _tmp1 = _mm_add_ps(_tmp1,_tmp19); \
    _tmp2 = _mm_add_ps(_tmp2,_tmp21); \
    _tmp3 = _mm_add_ps(_tmp3,_tmp23); \
    _tmp4 = _mm_add_ps(_tmp4,jz1); \
    _tmp5 = _mm_add_ps(_tmp5,jx3); \
    _tmp6 = _mm_add_ps(_tmp6,jy4); \
    _tmp7 = _mm_add_ps(_tmp7,_tmp20); \
    _tmp8 = _mm_add_ps(_tmp8,_tmp22); \
    _tmp9 = _mm_add_ps(_tmp9,_tmp24); \
    _tmp10 = _mm_add_ps(_tmp10,_tmp14); \
    _tmp11 = _mm_add_ps(_tmp11,_tmp16); \
    _tmp12 = _mm_add_ps(_tmp12,_tmp18); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps((ptr1)+4,_tmp2); \
    _mm_storeu_ps((ptr1)+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps((ptr2)+4,_tmp5); \
    _mm_storeu_ps((ptr2)+8,_tmp6); \
    _mm_storeu_ps(ptr3,_tmp7); \
    _mm_storeu_ps((ptr3)+4,_tmp8); \
    _mm_storeu_ps((ptr3)+8,_tmp9); \
    _mm_storeu_ps(ptr4,_tmp10); \
    _mm_storeu_ps((ptr4)+4,_tmp11); \
    _mm_storeu_ps((ptr4)+8,_tmp12); \
}
/* Subtract one x/y/z triplet from the three consecutive floats at ptr1.
 *
 * Lane 0 of jx1, jy1 and jz1 is subtracted from ptr1[0], ptr1[1] and
 * ptr1[2]; the remaining lanes are ignored.  jx1 and jy1 are overwritten
 * with scratch values.  Only scalar/half-register accesses are used, so
 * exactly three floats are read and written.
 */
#define GMX_MM_DECREMENT_1RVEC_1POINTER_PS(ptr1,jx1,jy1,jz1) { \
    __m128 _tmp1; \
    jy1 = _mm_unpacklo_ps(jy1,jz1); /* y z y z */ \
    jx1 = _mm_movelh_ps(jx1,jy1); /* x - y z (lane 1 is don't-care) */ \
    _tmp1 = _mm_load_ss(ptr1); \
    _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)((ptr1)+1)); /* p0 0 p1 p2 */ \
    _tmp1 = _mm_sub_ps(_tmp1,jx1); \
    _mm_store_ss(ptr1,_tmp1); \
    _mm_storeh_pi((__m64 *)((ptr1)+1),_tmp1); \
}
/* Subtract two consecutive x/y/z triplets (6 floats) from memory at ptr1.
 *
 * Lane 0 of jx1,jy1,jz1,jx2,jy2,jz2 is subtracted from ptr1[0..5]; the
 * other lanes are ignored.  jx1, jz1 and jy2 are overwritten with scratch
 * values.  Exactly six floats are read and written.
 */
#define GMX_MM_DECREMENT_2RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1, _tmp2; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr1)+4)); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); /* low half: y2 z2 */ \
    jx1 = _mm_movelh_ps(jx1,jz1); /* x1 y1 z1 x2 */ \
    _tmp1 = _mm_sub_ps(_tmp1,jx1); \
    _tmp2 = _mm_sub_ps(_tmp2,jy2); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storel_pi((__m64 *)((ptr1)+4),_tmp2); \
}
/* Subtract three consecutive x/y/z triplets (9 floats) from memory at ptr1.
 *
 * Lane 0 of jx1..jz3 is subtracted from ptr1[0..8]; the other lanes are
 * ignored.  jx1, jz1, jy2 and jx3 are overwritten with scratch values.
 * Exactly nine floats are read and written.
 */
#define GMX_MM_DECREMENT_3RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1, _tmp2, _tmp3; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps((ptr1)+4); \
    _tmp3 = _mm_load_ss((ptr1)+8); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    jx3 = _mm_unpacklo_ps(jx3,jy3); \
    jx1 = _mm_movelh_ps(jx1,jz1); /* x1 y1 z1 x2 */ \
    jy2 = _mm_movelh_ps(jy2,jx3); /* y2 z2 x3 y3 */ \
    _tmp1 = _mm_sub_ps(_tmp1,jx1); \
    _tmp2 = _mm_sub_ps(_tmp2,jy2); \
    _tmp3 = _mm_sub_ss(_tmp3,jz3); /* z3 */ \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps((ptr1)+4,_tmp2); \
    _mm_store_ss((ptr1)+8,_tmp3); \
}
/* Subtract four consecutive x/y/z triplets (12 floats) from memory at ptr1.
 *
 * Lane 0 of jx1..jz4 is subtracted from ptr1[0..11]; the other lanes are
 * ignored.  jx1, jz1, jy2, jx3, jz3 and jy4 are overwritten with scratch
 * values.  Memory is accessed with three unaligned 4-float loads and stores.
 */
#define GMX_MM_DECREMENT_4RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128 _tmp1, _tmp2, _tmp3; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps((ptr1)+4); \
    _tmp3 = _mm_loadu_ps((ptr1)+8); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    jx3 = _mm_unpacklo_ps(jx3,jy3); \
    jz3 = _mm_unpacklo_ps(jz3,jx4); \
    jy4 = _mm_unpacklo_ps(jy4,jz4); \
    jx1 = _mm_movelh_ps(jx1,jz1); /* x1 y1 z1 x2 */ \
    jy2 = _mm_movelh_ps(jy2,jx3); /* y2 z2 x3 y3 */ \
    jz3 = _mm_movelh_ps(jz3,jy4); /* z3 x4 y4 z4 */ \
    _tmp1 = _mm_sub_ps(_tmp1,jx1); \
    _tmp2 = _mm_sub_ps(_tmp2,jy2); \
    _tmp3 = _mm_sub_ps(_tmp3,jz3); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps((ptr1)+4,_tmp2); \
    _mm_storeu_ps((ptr1)+8,_tmp3); \
}
/* Subtract one x/y/z triplet from each of two destinations.
 *
 * Lane 0 of jx1,jy1,jz1 is subtracted from ptr1[0..2] and lane 1 from
 * ptr2[0..2]; lanes 2 and 3 are ignored.  jx1 is overwritten with scratch
 * values.  Exactly three floats per pointer are read and written.
 */
#define GMX_MM_DECREMENT_1RVEC_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4; \
    _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1)); \
    _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr2)); /* x,y of ptr1 | x,y of ptr2 */ \
    _tmp2 = _mm_load_ss((ptr1)+2); \
    _tmp3 = _mm_load_ss((ptr2)+2); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); /* x[0] y[0] x[1] y[1] */ \
    _tmp4 = _mm_shuffle_ps(jz1,jz1,_MM_SHUFFLE(0,0,0,1)); /* z[1] moved to lane 0 */ \
    _tmp1 = _mm_sub_ps(_tmp1,jx1); \
    _mm_storel_pi((__m64 *)(ptr1),_tmp1); \
    _mm_storeh_pi((__m64 *)(ptr2),_tmp1); \
    _mm_store_ss((ptr1)+2,_mm_sub_ss(_tmp2,jz1)); \
    _mm_store_ss((ptr2)+2,_mm_sub_ss(_tmp3,_tmp4)); \
}
/* Subtract two consecutive x/y/z triplets (6 floats) from each of two destinations.
 *
 * Lane 0 of jx1..jz2 is subtracted from ptr1[0..5] and lane 1 from
 * ptr2[0..5]; lanes 2 and 3 are ignored.  jx1, jz1 and jy2 are
 * overwritten with scratch values.  Exactly six floats per pointer are
 * read and written.
 */
#define GMX_MM_DECREMENT_2RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr2); \
    _tmp3 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr1)+4)); \
    _tmp3 = _mm_loadh_pi(_tmp3,(__m64 *)((ptr2)+4)); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); /* y2 z2 for ptr1 | y2 z2 for ptr2 */ \
    _tmp4 = _mm_movelh_ps(jx1,jz1); /* x1 y1 z1 x2 for ptr1 */ \
    _tmp5 = _mm_movehl_ps(jz1,jx1); /* x1 y1 z1 x2 for ptr2 */ \
    _tmp1 = _mm_sub_ps(_tmp1,_tmp4); \
    _tmp2 = _mm_sub_ps(_tmp2,_tmp5); \
    _tmp3 = _mm_sub_ps(_tmp3,jy2); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr2,_tmp2); \
    _mm_storel_pi((__m64 *)((ptr1)+4),_tmp3); \
    _mm_storeh_pi((__m64 *)((ptr2)+4),_tmp3); \
}
/* Subtract three consecutive x/y/z triplets (9 floats) from each of two destinations.
 *
 * Lane 0 of jx1..jz3 is subtracted from ptr1[0..8] and lane 1 from
 * ptr2[0..8]; lanes 2 and 3 are ignored.  jx1, jz1, jy2 and jx3 are
 * overwritten with scratch values.  Exactly nine floats per pointer are
 * read and written.
 */
#define GMX_MM_DECREMENT_3RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps((ptr1)+4); \
    _tmp3 = _mm_load_ss((ptr1)+8); \
    _tmp4 = _mm_loadu_ps(ptr2); \
    _tmp5 = _mm_loadu_ps((ptr2)+4); \
    _tmp6 = _mm_load_ss((ptr2)+8); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    jx3 = _mm_unpacklo_ps(jx3,jy3); \
    _tmp7 = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1)); /* z3 of lane 1 moved to lane 0 */ \
    _tmp8 = _mm_movelh_ps(jx1,jz1); /* x1 y1 z1 x2 for ptr1 */ \
    _tmp9 = _mm_movehl_ps(jz1,jx1); /* x1 y1 z1 x2 for ptr2 */ \
    _tmp10 = _mm_movelh_ps(jy2,jx3); /* y2 z2 x3 y3 for ptr1 */ \
    _tmp11 = _mm_movehl_ps(jx3,jy2); /* y2 z2 x3 y3 for ptr2 */ \
    _tmp1 = _mm_sub_ps(_tmp1,_tmp8); \
    _tmp2 = _mm_sub_ps(_tmp2,_tmp10); \
    _tmp3 = _mm_sub_ss(_tmp3,jz3); \
    _tmp4 = _mm_sub_ps(_tmp4,_tmp9); \
    _tmp5 = _mm_sub_ps(_tmp5,_tmp11); \
    _tmp6 = _mm_sub_ss(_tmp6,_tmp7); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps((ptr1)+4,_tmp2); \
    _mm_store_ss((ptr1)+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps((ptr2)+4,_tmp5); \
    _mm_store_ss((ptr2)+8,_tmp6); \
}
/* Subtract four consecutive x/y/z triplets (12 floats) from each of two destinations.
 *
 * Lane 0 of jx1..jz4 is subtracted from ptr1[0..11] and lane 1 from
 * ptr2[0..11]; lanes 2 and 3 are ignored.  jx1, jz1, jy2, jx3, jz3 and
 * jy4 are overwritten with scratch values.  Memory is accessed with three
 * unaligned 4-float loads and stores per pointer.
 */
#define GMX_MM_DECREMENT_4RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp8,_tmp9,_tmp10,_tmp11,_tmp12,_tmp13; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps((ptr1)+4); \
    _tmp3 = _mm_loadu_ps((ptr1)+8); \
    _tmp4 = _mm_loadu_ps(ptr2); \
    _tmp5 = _mm_loadu_ps((ptr2)+4); \
    _tmp6 = _mm_loadu_ps((ptr2)+8); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    jx3 = _mm_unpacklo_ps(jx3,jy3); \
    jz3 = _mm_unpacklo_ps(jz3,jx4); \
    jy4 = _mm_unpacklo_ps(jy4,jz4); \
    _tmp8 = _mm_movelh_ps(jx1,jz1); /* x1 y1 z1 x2 for ptr1 */ \
    _tmp9 = _mm_movehl_ps(jz1,jx1); /* x1 y1 z1 x2 for ptr2 */ \
    _tmp10 = _mm_movelh_ps(jy2,jx3); /* y2 z2 x3 y3 for ptr1 */ \
    _tmp11 = _mm_movehl_ps(jx3,jy2); /* y2 z2 x3 y3 for ptr2 */ \
    _tmp12 = _mm_movelh_ps(jz3,jy4); /* z3 x4 y4 z4 for ptr1 */ \
    _tmp13 = _mm_movehl_ps(jy4,jz3); /* z3 x4 y4 z4 for ptr2 */ \
    _tmp1 = _mm_sub_ps(_tmp1,_tmp8); \
    _tmp2 = _mm_sub_ps(_tmp2,_tmp10); \
    _tmp3 = _mm_sub_ps(_tmp3,_tmp12); \
    _tmp4 = _mm_sub_ps(_tmp4,_tmp9); \
    _tmp5 = _mm_sub_ps(_tmp5,_tmp11); \
    _tmp6 = _mm_sub_ps(_tmp6,_tmp13); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps((ptr1)+4,_tmp2); \
    _mm_storeu_ps((ptr1)+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps((ptr2)+4,_tmp5); \
    _mm_storeu_ps((ptr2)+8,_tmp6); \
}
/* Subtract one x/y/z triplet from each of three destinations.
 *
 * Lane 0 of jx1,jy1,jz1 is subtracted from ptr1[0..2], lane 1 from
 * ptr2[0..2] and lane 2 from ptr3[0..2]; lane 3 is ignored.  jx1 is
 * overwritten with scratch values.  Exactly three floats per pointer are
 * read and written.
 */
#define GMX_MM_DECREMENT_1RVEC_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7; \
    _tmp1 = _mm_load_ss(ptr1); \
    _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)((ptr1)+1)); /* p0 0 p1 p2 */ \
    _tmp2 = _mm_load_ss(ptr2); \
    _tmp2 = _mm_loadh_pi(_tmp2,(__m64 *)((ptr2)+1)); \
    _tmp3 = _mm_load_ss(ptr3); \
    _tmp3 = _mm_loadh_pi(_tmp3,(__m64 *)((ptr3)+1)); \
    _tmp4 = _mm_unpacklo_ps(jy1,jz1); /* y[0] z[0] y[1] z[1] */ \
    _tmp5 = _mm_unpackhi_ps(jy1,jz1); /* y[2] z[2] y[3] z[3] */ \
    _tmp6 = _mm_shuffle_ps(jx1,_tmp4,_MM_SHUFFLE(3,2,0,1)); /* x[1] - y[1] z[1] */ \
    _tmp7 = _mm_shuffle_ps(jx1,jx1,_MM_SHUFFLE(0,0,0,2)); /* x[2] in lane 0 */ \
    jx1 = _mm_movelh_ps(jx1,_tmp4); /* x[0] - y[0] z[0] */ \
    _tmp7 = _mm_movelh_ps(_tmp7,_tmp5); /* x[2] - y[2] z[2] */ \
    _tmp1 = _mm_sub_ps(_tmp1,jx1); \
    _tmp2 = _mm_sub_ps(_tmp2,_tmp6); \
    _tmp3 = _mm_sub_ps(_tmp3,_tmp7); \
    _mm_store_ss(ptr1,_tmp1); \
    _mm_storeh_pi((__m64 *)((ptr1)+1),_tmp1); \
    _mm_store_ss(ptr2,_tmp2); \
    _mm_storeh_pi((__m64 *)((ptr2)+1),_tmp2); \
    _mm_store_ss(ptr3,_tmp3); \
    _mm_storeh_pi((__m64 *)((ptr3)+1),_tmp3); \
}
/* Subtract two consecutive x/y/z triplets (6 floats) from each of three destinations.
 *
 * Lane 0 of jx1..jz2 is subtracted from ptr1[0..5], lane 1 from
 * ptr2[0..5] and lane 2 from ptr3[0..5]; lane 3 is ignored.  jx1, jz1 and
 * jy2 are overwritten with scratch values.  Exactly six floats per pointer
 * are read and written.
 *
 * The half-register loads start from a zeroed register so that no
 * uninitialised temporary is ever read.
 */
#define GMX_MM_DECREMENT_2RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr2); \
    _tmp3 = _mm_loadu_ps(ptr3); \
    _tmp4 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr1)+4)); \
    _tmp4 = _mm_loadh_pi(_tmp4,(__m64 *)((ptr2)+4)); \
    _tmp5 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr3)+4)); \
    _tmp6 = _mm_unpackhi_ps(jx1,jy1); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    _tmp7 = _mm_unpackhi_ps(jz1,jx2); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    _tmp8 = _mm_unpackhi_ps(jy2,jz2); /* low half: y2 z2 for ptr3 */ \
    jy2 = _mm_unpacklo_ps(jy2,jz2); /* y2 z2 for ptr1 | y2 z2 for ptr2 */ \
    _tmp9 = _mm_movelh_ps(jx1,jz1); /* x1 y1 z1 x2 for ptr1 */ \
    _tmp10 = _mm_movehl_ps(jz1,jx1); /* x1 y1 z1 x2 for ptr2 */ \
    _tmp6 = _mm_movelh_ps(_tmp6,_tmp7); /* x1 y1 z1 x2 for ptr3 */ \
    _tmp1 = _mm_sub_ps(_tmp1,_tmp9); \
    _tmp2 = _mm_sub_ps(_tmp2,_tmp10); \
    _tmp3 = _mm_sub_ps(_tmp3,_tmp6); \
    _tmp4 = _mm_sub_ps(_tmp4,jy2); \
    _tmp5 = _mm_sub_ps(_tmp5,_tmp8); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr2,_tmp2); \
    _mm_storeu_ps(ptr3,_tmp3); \
    _mm_storel_pi((__m64 *)((ptr1)+4),_tmp4); \
    _mm_storeh_pi((__m64 *)((ptr2)+4),_tmp4); \
    _mm_storel_pi((__m64 *)((ptr3)+4),_tmp5); \
}
/* Subtract three consecutive x/y/z triplets (9 floats) from each of three destinations.
 *
 * Lane 0 of jx1..jz3 is subtracted from ptr1[0..8], lane 1 from
 * ptr2[0..8] and lane 2 from ptr3[0..8]; lane 3 is ignored.  jx1, jz1,
 * jy2 and jx3 are overwritten with scratch values.  Exactly nine floats
 * per pointer are read and written.
 */
#define GMX_MM_DECREMENT_3RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
    __m128 _tmp11,_tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps((ptr1)+4); \
    _tmp3 = _mm_load_ss((ptr1)+8); \
    _tmp4 = _mm_loadu_ps(ptr2); \
    _tmp5 = _mm_loadu_ps((ptr2)+4); \
    _tmp6 = _mm_load_ss((ptr2)+8); \
    _tmp7 = _mm_loadu_ps(ptr3); \
    _tmp8 = _mm_loadu_ps((ptr3)+4); \
    _tmp9 = _mm_load_ss((ptr3)+8); \
    _tmp10 = _mm_unpackhi_ps(jx1,jy1); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    _tmp11 = _mm_unpackhi_ps(jz1,jx2); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    _tmp12 = _mm_unpackhi_ps(jy2,jz2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    _tmp13 = _mm_unpackhi_ps(jx3,jy3); \
    jx3 = _mm_unpacklo_ps(jx3,jy3); \
    _tmp14 = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1)); /* z3 of lane 1 in lane 0 */ \
    _tmp15 = _mm_movehl_ps(jz3,jz3); /* z3 of lane 2 in lane 0 */ \
    _tmp16 = _mm_movelh_ps(jx1,jz1); /* x1 y1 z1 x2 for ptr1 */ \
    _tmp17 = _mm_movehl_ps(jz1,jx1); /* x1 y1 z1 x2 for ptr2 */ \
    _tmp10 = _mm_movelh_ps(_tmp10,_tmp11); /* x1 y1 z1 x2 for ptr3 */ \
    _tmp18 = _mm_movelh_ps(jy2,jx3); /* y2 z2 x3 y3 for ptr1 */ \
    _tmp19 = _mm_movehl_ps(jx3,jy2); /* y2 z2 x3 y3 for ptr2 */ \
    _tmp12 = _mm_movelh_ps(_tmp12,_tmp13); /* y2 z2 x3 y3 for ptr3 */ \
    _tmp1 = _mm_sub_ps(_tmp1,_tmp16); \
    _tmp2 = _mm_sub_ps(_tmp2,_tmp18); \
    _tmp3 = _mm_sub_ss(_tmp3,jz3); \
    _tmp4 = _mm_sub_ps(_tmp4,_tmp17); \
    _tmp5 = _mm_sub_ps(_tmp5,_tmp19); \
    _tmp6 = _mm_sub_ss(_tmp6,_tmp14); \
    _tmp7 = _mm_sub_ps(_tmp7,_tmp10); \
    _tmp8 = _mm_sub_ps(_tmp8,_tmp12); \
    _tmp9 = _mm_sub_ss(_tmp9,_tmp15); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps((ptr1)+4,_tmp2); \
    _mm_store_ss((ptr1)+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps((ptr2)+4,_tmp5); \
    _mm_store_ss((ptr2)+8,_tmp6); \
    _mm_storeu_ps(ptr3,_tmp7); \
    _mm_storeu_ps((ptr3)+4,_tmp8); \
    _mm_store_ss((ptr3)+8,_tmp9); \
}
/* Subtract four consecutive x/y/z triplets (12 floats) from each of three destinations.
 *
 * Lane 0 of jx1..jz4 is subtracted from ptr1[0..11], lane 1 from
 * ptr2[0..11] and lane 2 from ptr3[0..11]; lane 3 is ignored.  jx1, jz1,
 * jy2, jx3, jz3 and jy4 are overwritten with scratch values.  Memory is
 * accessed with three unaligned 4-float loads and stores per pointer.
 */
#define GMX_MM_DECREMENT_4RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11; \
    __m128 _tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19,_tmp20,_tmp21; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps((ptr1)+4); \
    _tmp3 = _mm_loadu_ps((ptr1)+8); \
    _tmp4 = _mm_loadu_ps(ptr2); \
    _tmp5 = _mm_loadu_ps((ptr2)+4); \
    _tmp6 = _mm_loadu_ps((ptr2)+8); \
    _tmp7 = _mm_loadu_ps(ptr3); \
    _tmp8 = _mm_loadu_ps((ptr3)+4); \
    _tmp9 = _mm_loadu_ps((ptr3)+8); \
    _tmp10 = _mm_unpackhi_ps(jx1,jy1); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    _tmp11 = _mm_unpackhi_ps(jz1,jx2); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    _tmp12 = _mm_unpackhi_ps(jy2,jz2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    _tmp13 = _mm_unpackhi_ps(jx3,jy3); \
    jx3 = _mm_unpacklo_ps(jx3,jy3); \
    _tmp14 = _mm_unpackhi_ps(jz3,jx4); \
    jz3 = _mm_unpacklo_ps(jz3,jx4); \
    _tmp15 = _mm_unpackhi_ps(jy4,jz4); \
    jy4 = _mm_unpacklo_ps(jy4,jz4); \
    _tmp16 = _mm_movelh_ps(jx1,jz1); /* x1 y1 z1 x2 for ptr1 */ \
    _tmp17 = _mm_movehl_ps(jz1,jx1); /* x1 y1 z1 x2 for ptr2 */ \
    _tmp10 = _mm_movelh_ps(_tmp10,_tmp11); /* x1 y1 z1 x2 for ptr3 */ \
    _tmp18 = _mm_movelh_ps(jy2,jx3); /* y2 z2 x3 y3 for ptr1 */ \
    _tmp19 = _mm_movehl_ps(jx3,jy2); /* y2 z2 x3 y3 for ptr2 */ \
    _tmp12 = _mm_movelh_ps(_tmp12,_tmp13); /* y2 z2 x3 y3 for ptr3 */ \
    _tmp20 = _mm_movelh_ps(jz3,jy4); /* z3 x4 y4 z4 for ptr1 */ \
    _tmp21 = _mm_movehl_ps(jy4,jz3); /* z3 x4 y4 z4 for ptr2 */ \
    _tmp14 = _mm_movelh_ps(_tmp14,_tmp15); /* z3 x4 y4 z4 for ptr3 */ \
    _tmp1 = _mm_sub_ps(_tmp1,_tmp16); \
    _tmp2 = _mm_sub_ps(_tmp2,_tmp18); \
    _tmp3 = _mm_sub_ps(_tmp3,_tmp20); \
    _tmp4 = _mm_sub_ps(_tmp4,_tmp17); \
    _tmp5 = _mm_sub_ps(_tmp5,_tmp19); \
    _tmp6 = _mm_sub_ps(_tmp6,_tmp21); \
    _tmp7 = _mm_sub_ps(_tmp7,_tmp10); \
    _tmp8 = _mm_sub_ps(_tmp8,_tmp12); \
    _tmp9 = _mm_sub_ps(_tmp9,_tmp14); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps((ptr1)+4,_tmp2); \
    _mm_storeu_ps((ptr1)+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps((ptr2)+4,_tmp5); \
    _mm_storeu_ps((ptr2)+8,_tmp6); \
    _mm_storeu_ps(ptr3,_tmp7); \
    _mm_storeu_ps((ptr3)+4,_tmp8); \
    _mm_storeu_ps((ptr3)+8,_tmp9); \
}
/* Subtract one x/y/z triplet from each of four destinations.
 *
 * Lane k of jx1,jy1,jz1 is subtracted from the three floats at ptr(k+1),
 * for k = 0..3.  None of the j arguments is modified.  Exactly three
 * floats per pointer are read and written.
 */
#define GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
    _tmp1 = _mm_load_ss(ptr1); \
    _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)((ptr1)+1)); /* p0 0 p1 p2 */ \
    _tmp2 = _mm_load_ss(ptr2); \
    _tmp2 = _mm_loadh_pi(_tmp2,(__m64 *)((ptr2)+1)); \
    _tmp3 = _mm_load_ss(ptr3); \
    _tmp3 = _mm_loadh_pi(_tmp3,(__m64 *)((ptr3)+1)); \
    _tmp4 = _mm_load_ss(ptr4); \
    _tmp4 = _mm_loadh_pi(_tmp4,(__m64 *)((ptr4)+1)); \
    _tmp5 = _mm_unpacklo_ps(jy1,jz1); /* y[0] z[0] y[1] z[1] */ \
    _tmp6 = _mm_unpackhi_ps(jy1,jz1); /* y[2] z[2] y[3] z[3] */ \
    _tmp7 = _mm_shuffle_ps(jx1,_tmp5,_MM_SHUFFLE(1,0,0,0)); /* x[0] - y[0] z[0] */ \
    _tmp8 = _mm_shuffle_ps(jx1,_tmp5,_MM_SHUFFLE(3,2,0,1)); /* x[1] - y[1] z[1] */ \
    _tmp9 = _mm_shuffle_ps(jx1,_tmp6,_MM_SHUFFLE(1,0,0,2)); /* x[2] - y[2] z[2] */ \
    _tmp10 = _mm_shuffle_ps(jx1,_tmp6,_MM_SHUFFLE(3,2,0,3)); /* x[3] - y[3] z[3] */ \
    _tmp1 = _mm_sub_ps(_tmp1,_tmp7); \
    _tmp2 = _mm_sub_ps(_tmp2,_tmp8); \
    _tmp3 = _mm_sub_ps(_tmp3,_tmp9); \
    _tmp4 = _mm_sub_ps(_tmp4,_tmp10); \
    _mm_store_ss(ptr1,_tmp1); \
    _mm_storeh_pi((__m64 *)((ptr1)+1),_tmp1); \
    _mm_store_ss(ptr2,_tmp2); \
    _mm_storeh_pi((__m64 *)((ptr2)+1),_tmp2); \
    _mm_store_ss(ptr3,_tmp3); \
    _mm_storeh_pi((__m64 *)((ptr3)+1),_tmp3); \
    _mm_store_ss(ptr4,_tmp4); \
    _mm_storeh_pi((__m64 *)((ptr4)+1),_tmp4); \
}
/* Subtract two rvecs per element from four independent memory locations.
 * For N = 1..4 the six floats at ptrN are decremented by element N-1 of
 * jx1,jy1,jz1,jx2,jy2,jz2 (in that order).
 * All loads are issued before the first store.
 * NOTE: the arguments jx1, jz1 and jy2 are overwritten with intermediate data.
 */
#define GMX_MM_DECREMENT_2RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11,_tmp12,_tmp13; \
    /* floats 0-3 of every location */ \
    _tmp1  = _mm_loadu_ps(ptr1); \
    _tmp2  = _mm_loadu_ps(ptr2); \
    _tmp3  = _mm_loadu_ps(ptr3); \
    _tmp4  = _mm_loadu_ps(ptr4); \
    /* floats 4-5, two locations packed per register */ \
    _tmp5  = _mm_loadh_pi(_mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4)),(__m64 *)(ptr2+4)); \
    _tmp6  = _mm_loadh_pi(_mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3+4)),(__m64 *)(ptr4+4)); \
    /* transpose the six input registers into per-element order */ \
    _tmp7  = _mm_unpackhi_ps(jx1,jy1); \
    jx1    = _mm_unpacklo_ps(jx1,jy1); \
    _tmp8  = _mm_unpackhi_ps(jz1,jx2); \
    jz1    = _mm_unpacklo_ps(jz1,jx2); \
    _tmp9  = _mm_unpackhi_ps(jy2,jz2); \
    jy2    = _mm_unpacklo_ps(jy2,jz2); \
    _tmp10 = _mm_movelh_ps(jx1,jz1); \
    _tmp11 = _mm_movehl_ps(jz1,jx1); \
    _tmp12 = _mm_movelh_ps(_tmp7,_tmp8); \
    _tmp13 = _mm_movehl_ps(_tmp8,_tmp7); \
    _tmp5  = _mm_sub_ps(_tmp5,jy2); \
    _tmp6  = _mm_sub_ps(_tmp6,_tmp9); \
    /* subtract and write back */ \
    _mm_storeu_ps(ptr1,_mm_sub_ps(_tmp1,_tmp10)); \
    _mm_storeu_ps(ptr2,_mm_sub_ps(_tmp2,_tmp11)); \
    _mm_storeu_ps(ptr3,_mm_sub_ps(_tmp3,_tmp12)); \
    _mm_storeu_ps(ptr4,_mm_sub_ps(_tmp4,_tmp13)); \
    _mm_storel_pi((__m64 *)(ptr1+4),_tmp5); \
    _mm_storeh_pi((__m64 *)(ptr2+4),_tmp5); \
    _mm_storel_pi((__m64 *)(ptr3+4),_tmp6); \
    _mm_storeh_pi((__m64 *)(ptr4+4),_tmp6); \
}
/* Subtract three rvecs per element from four independent memory locations.
 * For N = 1..4 the nine floats at ptrN are decremented by element N-1 of
 * jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3 (in that order).
 * All loads are issued before the first store.
 * NOTE: the arguments jx1, jz1, jy2 and jx3 are overwritten with intermediate data.
 */
#define GMX_MM_DECREMENT_3RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
    __m128 _tmp11,_tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19; \
    __m128 _tmp20,_tmp21,_tmp22,_tmp23,_tmp24,_tmp25; \
    /* read floats 0-3, 4-7 and 8 of every location */ \
    _tmp1  = _mm_loadu_ps(ptr1); \
    _tmp2  = _mm_loadu_ps(ptr1+4); \
    _tmp3  = _mm_load_ss(ptr1+8); \
    _tmp4  = _mm_loadu_ps(ptr2); \
    _tmp5  = _mm_loadu_ps(ptr2+4); \
    _tmp6  = _mm_load_ss(ptr2+8); \
    _tmp7  = _mm_loadu_ps(ptr3); \
    _tmp8  = _mm_loadu_ps(ptr3+4); \
    _tmp9  = _mm_load_ss(ptr3+8); \
    _tmp10 = _mm_loadu_ps(ptr4); \
    _tmp11 = _mm_loadu_ps(ptr4+4); \
    _tmp12 = _mm_load_ss(ptr4+8); \
    /* first transpose stage: interleave neighbouring input registers */ \
    _tmp13 = _mm_unpackhi_ps(jx1,jy1); \
    jx1    = _mm_unpacklo_ps(jx1,jy1); \
    _tmp14 = _mm_unpackhi_ps(jz1,jx2); \
    jz1    = _mm_unpacklo_ps(jz1,jx2); \
    _tmp15 = _mm_unpackhi_ps(jy2,jz2); \
    jy2    = _mm_unpacklo_ps(jy2,jz2); \
    _tmp16 = _mm_unpackhi_ps(jx3,jy3); \
    jx3    = _mm_unpacklo_ps(jx3,jy3); \
    /* bring elements 1, 2 and 3 of jz3 down to position 0 */ \
    _tmp17 = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1)); \
    _tmp18 = _mm_movehl_ps(jz3,jz3); \
    _tmp19 = _mm_shuffle_ps(_tmp18,_tmp18,_MM_SHUFFLE(0,0,0,1)); \
    /* second transpose stage */ \
    _tmp20 = _mm_movelh_ps(jx1,jz1); \
    _tmp21 = _mm_movehl_ps(jz1,jx1); \
    _tmp22 = _mm_movelh_ps(_tmp13,_tmp14); \
    _tmp14 = _mm_movehl_ps(_tmp14,_tmp13); \
    _tmp23 = _mm_movelh_ps(jy2,jx3); \
    _tmp24 = _mm_movehl_ps(jx3,jy2); \
    _tmp25 = _mm_movelh_ps(_tmp15,_tmp16); \
    _tmp16 = _mm_movehl_ps(_tmp16,_tmp15); \
    /* subtract and write back */ \
    _mm_storeu_ps(ptr1,  _mm_sub_ps(_tmp1,_tmp20)); \
    _mm_storeu_ps(ptr1+4,_mm_sub_ps(_tmp2,_tmp23)); \
    _mm_store_ss(ptr1+8, _mm_sub_ss(_tmp3,jz3)); \
    _mm_storeu_ps(ptr2,  _mm_sub_ps(_tmp4,_tmp21)); \
    _mm_storeu_ps(ptr2+4,_mm_sub_ps(_tmp5,_tmp24)); \
    _mm_store_ss(ptr2+8, _mm_sub_ss(_tmp6,_tmp17)); \
    _mm_storeu_ps(ptr3,  _mm_sub_ps(_tmp7,_tmp22)); \
    _mm_storeu_ps(ptr3+4,_mm_sub_ps(_tmp8,_tmp25)); \
    _mm_store_ss(ptr3+8, _mm_sub_ss(_tmp9,_tmp18)); \
    _mm_storeu_ps(ptr4,  _mm_sub_ps(_tmp10,_tmp14)); \
    _mm_storeu_ps(ptr4+4,_mm_sub_ps(_tmp11,_tmp16)); \
    _mm_store_ss(ptr4+8, _mm_sub_ss(_tmp12,_tmp19)); \
}
/* Subtract four rvecs per element from four independent memory locations.
 * For N = 1..4 the twelve floats at ptrN are decremented by element N-1 of
 * jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4 (in that order).
 * All loads are issued before the first store.
 * NOTE: the arguments jx1, jz1, jy2, jx3, jz3 and jy4 are overwritten with
 * intermediate data.
 */
#define GMX_MM_DECREMENT_4RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11; \
    __m128 _tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19,_tmp20,_tmp21,_tmp22; \
    __m128 _tmp23,_tmp24; \
    /* read floats 0-3, 4-7 and 8-11 of every location */ \
    _tmp1  = _mm_loadu_ps(ptr1); \
    _tmp2  = _mm_loadu_ps(ptr1+4); \
    _tmp3  = _mm_loadu_ps(ptr1+8); \
    _tmp4  = _mm_loadu_ps(ptr2); \
    _tmp5  = _mm_loadu_ps(ptr2+4); \
    _tmp6  = _mm_loadu_ps(ptr2+8); \
    _tmp7  = _mm_loadu_ps(ptr3); \
    _tmp8  = _mm_loadu_ps(ptr3+4); \
    _tmp9  = _mm_loadu_ps(ptr3+8); \
    _tmp10 = _mm_loadu_ps(ptr4); \
    _tmp11 = _mm_loadu_ps(ptr4+4); \
    _tmp12 = _mm_loadu_ps(ptr4+8); \
    /* first transpose stage: interleave neighbouring input registers */ \
    _tmp13 = _mm_unpackhi_ps(jx1,jy1); \
    jx1    = _mm_unpacklo_ps(jx1,jy1); \
    _tmp14 = _mm_unpackhi_ps(jz1,jx2); \
    jz1    = _mm_unpacklo_ps(jz1,jx2); \
    _tmp15 = _mm_unpackhi_ps(jy2,jz2); \
    jy2    = _mm_unpacklo_ps(jy2,jz2); \
    _tmp16 = _mm_unpackhi_ps(jx3,jy3); \
    jx3    = _mm_unpacklo_ps(jx3,jy3); \
    _tmp17 = _mm_unpackhi_ps(jz3,jx4); \
    jz3    = _mm_unpacklo_ps(jz3,jx4); \
    _tmp18 = _mm_unpackhi_ps(jy4,jz4); \
    jy4    = _mm_unpacklo_ps(jy4,jz4); \
    /* second transpose stage */ \
    _tmp19 = _mm_movelh_ps(jx1,jz1); \
    jz1    = _mm_movehl_ps(jz1,jx1); \
    _tmp20 = _mm_movelh_ps(_tmp13,_tmp14); \
    _tmp14 = _mm_movehl_ps(_tmp14,_tmp13); \
    _tmp21 = _mm_movelh_ps(jy2,jx3); \
    jx3    = _mm_movehl_ps(jx3,jy2); \
    _tmp22 = _mm_movelh_ps(_tmp15,_tmp16); \
    _tmp16 = _mm_movehl_ps(_tmp16,_tmp15); \
    _tmp23 = _mm_movelh_ps(jz3,jy4); \
    jy4    = _mm_movehl_ps(jy4,jz3); \
    _tmp24 = _mm_movelh_ps(_tmp17,_tmp18); \
    _tmp18 = _mm_movehl_ps(_tmp18,_tmp17); \
    /* subtract and write back */ \
    _mm_storeu_ps(ptr1,  _mm_sub_ps(_tmp1,_tmp19)); \
    _mm_storeu_ps(ptr1+4,_mm_sub_ps(_tmp2,_tmp21)); \
    _mm_storeu_ps(ptr1+8,_mm_sub_ps(_tmp3,_tmp23)); \
    _mm_storeu_ps(ptr2,  _mm_sub_ps(_tmp4,jz1)); \
    _mm_storeu_ps(ptr2+4,_mm_sub_ps(_tmp5,jx3)); \
    _mm_storeu_ps(ptr2+8,_mm_sub_ps(_tmp6,jy4)); \
    _mm_storeu_ps(ptr3,  _mm_sub_ps(_tmp7,_tmp20)); \
    _mm_storeu_ps(ptr3+4,_mm_sub_ps(_tmp8,_tmp22)); \
    _mm_storeu_ps(ptr3+8,_mm_sub_ps(_tmp9,_tmp24)); \
    _mm_storeu_ps(ptr4,  _mm_sub_ps(_tmp10,_tmp14)); \
    _mm_storeu_ps(ptr4+4,_mm_sub_ps(_tmp11,_tmp16)); \
    _mm_storeu_ps(ptr4+8,_mm_sub_ps(_tmp12,_tmp18)); \
}
/* Routine to be called with rswitch/rcut at the beginning of a kernel
 * to set up the six constants used for analytic 5th order switch calculations.
 *
 * With d = rcut - rswitch and 1/d taken from gmx_mm_inv_ps(), the outputs are
 *   switch_C3 = -10/d^3    switch_C4 = 15/d^4    switch_C5 =  -6/d^5
 *   switch_D2 = -30/d^3    switch_D3 = 60/d^4    switch_D4 = -30/d^5
 *
 * Every macro-local name carries the _swsetup_ prefix so that it cannot
 * shadow a caller variable (for instance one called "d") that is passed in
 * as rswitch or rcut.
 */
#define GMX_MM_SETUP_SWITCH5_PS(rswitch,rcut,switch_C3,switch_C4,switch_C5,switch_D2,switch_D3,switch_D4) { \
    const __m128  _swsetup_cm6  = { -6.0, -6.0, -6.0, -6.0}; \
    const __m128  _swsetup_cm10 = {-10.0,-10.0,-10.0,-10.0}; \
    const __m128  _swsetup_c15  = { 15.0, 15.0, 15.0, 15.0}; \
    const __m128  _swsetup_cm30 = {-30.0,-30.0,-30.0,-30.0}; \
    const __m128  _swsetup_c60  = { 60.0, 60.0, 60.0, 60.0}; \
    \
    __m128 _swsetup_d,_swsetup_dinv,_swsetup_dinv2,_swsetup_dinv3,_swsetup_dinv4,_swsetup_dinv5; \
    \
    _swsetup_d     = _mm_sub_ps((rcut),(rswitch)); \
    _swsetup_dinv  = gmx_mm_inv_ps(_swsetup_d); \
    _swsetup_dinv2 = _mm_mul_ps(_swsetup_dinv,_swsetup_dinv); \
    _swsetup_dinv3 = _mm_mul_ps(_swsetup_dinv2,_swsetup_dinv); \
    _swsetup_dinv4 = _mm_mul_ps(_swsetup_dinv2,_swsetup_dinv2); \
    _swsetup_dinv5 = _mm_mul_ps(_swsetup_dinv3,_swsetup_dinv2); \
    \
    switch_C3 = _mm_mul_ps(_swsetup_cm10,_swsetup_dinv3); \
    switch_C4 = _mm_mul_ps(_swsetup_c15,_swsetup_dinv4); \
    switch_C5 = _mm_mul_ps(_swsetup_cm6,_swsetup_dinv5); \
    switch_D2 = _mm_mul_ps(_swsetup_cm30,_swsetup_dinv3); \
    switch_D3 = _mm_mul_ps(_swsetup_c60,_swsetup_dinv4); \
    switch_D4 = _mm_mul_ps(_swsetup_cm30,_swsetup_dinv5); \
}
/* Evaluate the 5th order switch polynomial and its derivative polynomial.
 *
 *   d   = min(max(r,rswitch),rcut) - rswitch
 *   sw  = 1 + d^3*(sw_C3 + d*(sw_C4 + d*sw_C5))
 *   dsw =     d^2*(sw_D2 + d*(sw_D3 + d*sw_D4))
 *
 * sw and dsw must be assignable __m128 variables; they receive the results.
 * The macro-local temporaries carry the _sw_ prefix so that they cannot
 * shadow a caller variable (for instance one called "d" or "d2") that is
 * passed in as an argument.
 */
#define GMX_MM_EVALUATE_SWITCH5_PS(r,rswitch,rcut,sw,dsw,sw_C3,sw_C4,sw_C5,sw_D2,sw_D3,sw_D4) { \
    const __m128  _sw_one = {  1.0,  1.0,  1.0,  1.0}; \
    __m128 _sw_d,_sw_d2; \
    /* clamp r to [rswitch,rcut] and measure from rswitch */ \
    _sw_d  = _mm_max_ps((r),(rswitch)); \
    _sw_d  = _mm_min_ps(_sw_d,(rcut)); \
    _sw_d  = _mm_sub_ps(_sw_d,(rswitch)); \
    _sw_d2 = _mm_mul_ps(_sw_d,_sw_d); \
    /* Horner evaluation of both polynomials */ \
    sw     = _mm_mul_ps(_sw_d,(sw_C5)); \
    dsw    = _mm_mul_ps(_sw_d,(sw_D4)); \
    sw     = _mm_add_ps(sw,(sw_C4)); \
    dsw    = _mm_add_ps(dsw,(sw_D3)); \
    sw     = _mm_mul_ps(sw,_sw_d); \
    dsw    = _mm_mul_ps(dsw,_sw_d); \
    sw     = _mm_add_ps(sw,(sw_C3)); \
    dsw    = _mm_add_ps(dsw,(sw_D2)); \
    sw     = _mm_mul_ps(sw,_mm_mul_ps(_sw_d,_sw_d2)); \
    dsw    = _mm_mul_ps(dsw,_sw_d2); \
    sw     = _mm_add_ps(sw,_sw_one); \
}
/* Plain Coulomb interaction: vcoul = qq*rinv is accumulated into *vctot.
 * Returns fscaltmp (identical to vcoul here), multiply with rinvsq to get fscal!
 */
static inline __m128
gmx_mm_interaction_coulomb_ps(__m128 rinv, __m128 qq, __m128 *vctot)
{
    const __m128 vcoul = _mm_mul_ps(qq,rinv);

    *vctot = _mm_add_ps(*vctot,vcoul);

    return vcoul;
}
/* Potential-only plain Coulomb: accumulates qq*rinv into *vctot. */
static inline void
gmx_mm_interaction_coulomb_noforce_ps(__m128 rinv, __m128 qq, __m128 *vctot)
{
    const __m128 vcoul = _mm_mul_ps(qq,rinv);

    *vctot = _mm_add_ps(*vctot,vcoul);
}
/* Reaction-field Coulomb interaction.
 *   vcoul = qq*(rinv + krf*rsq - crf)  is accumulated into *vctot.
 * Returns fscaltmp = qq*(rinv - 2*krf*rsq), multiply with rinvsq to get fscal!
 */
static inline __m128
gmx_mm_interaction_coulombrf_ps(const __m128 rinv, const __m128 rsq, const __m128 krf, const __m128 crf, const __m128 qq, __m128 *vctot)
{
    const __m128 two  = _mm_set1_ps(2.0f);
    const __m128 krsq = _mm_mul_ps(krf,rsq);
    __m128       vcoul,fscaltmp;

    vcoul    = _mm_add_ps(rinv,krsq);
    vcoul    = _mm_sub_ps(vcoul,crf);
    vcoul    = _mm_mul_ps(qq,vcoul);
    *vctot   = _mm_add_ps(*vctot,vcoul);

    fscaltmp = _mm_sub_ps(rinv,_mm_mul_ps(two,krsq));

    return _mm_mul_ps(qq,fscaltmp);
}
/* Potential-only reaction-field Coulomb:
 * accumulates qq*(rinv + krf*rsq - crf) into *vctot.
 */
static inline void
gmx_mm_interaction_coulombrf_noforce_ps(__m128 rinv, __m128 rsq, __m128 krf, __m128 crf, __m128 qq, __m128 *vctot)
{
    const __m128 krsq = _mm_mul_ps(krf,rsq);
    __m128       vcoul;

    vcoul  = _mm_add_ps(rinv,krsq);
    vcoul  = _mm_sub_ps(vcoul,crf);
    vcoul  = _mm_mul_ps(qq,vcoul);
    *vctot = _mm_add_ps(*vctot,vcoul);
}
/* Lennard-Jones interaction from rinvsq.
 *   rinvsix = rinvsq^3,  vvdw6 = c6*rinvsix,  vvdw12 = c12*rinvsix^2
 *   vvdw12 - vvdw6 is accumulated into *vvdwtot.
 * Returns fscaltmp = 12*vvdw12 - 6*vvdw6, multiply with rinvsq to get fscal!
 */
static inline __m128
gmx_mm_int_lj_ps(__m128 rinvsq, __m128 c6, __m128 c12, __m128 *vvdwtot)
{
    const __m128 six    = _mm_set1_ps(6.0f);
    const __m128 twelve = _mm_set1_ps(12.0f);
    __m128       rinvsix,rinvtwelve,vvdw6,vvdw12;

    rinvsix    = _mm_mul_ps(rinvsq,rinvsq);
    rinvsix    = _mm_mul_ps(rinvsix,rinvsq);
    rinvtwelve = _mm_mul_ps(rinvsix,rinvsix);

    vvdw6      = _mm_mul_ps(c6,rinvsix);
    vvdw12     = _mm_mul_ps(c12,rinvtwelve);
    *vvdwtot   = _mm_add_ps(*vvdwtot,_mm_sub_ps(vvdw12,vvdw6));

    vvdw12     = _mm_mul_ps(twelve,vvdw12);
    vvdw6      = _mm_mul_ps(six,vvdw6);

    return _mm_sub_ps(vvdw12,vvdw6);
}
/* Potential-only Lennard-Jones:
 * accumulates c12*rinvsq^6 - c6*rinvsq^3 into *vvdwtot.
 */
static inline void
gmx_mm_int_lj_potonly_ps(__m128 rinvsq, __m128 c6, __m128 c12, __m128 *vvdwtot)
{
    __m128 rinvsix,rinvtwelve,vvdw6,vvdw12;

    rinvsix    = _mm_mul_ps(rinvsq,rinvsq);
    rinvsix    = _mm_mul_ps(rinvsix,rinvsq);
    rinvtwelve = _mm_mul_ps(rinvsix,rinvsix);

    vvdw6      = _mm_mul_ps(c6,rinvsix);
    vvdw12     = _mm_mul_ps(c12,rinvtwelve);
    *vvdwtot   = _mm_add_ps(*vvdwtot,_mm_sub_ps(vvdw12,vvdw6));
}
/* Tabulated Coulomb interaction, all four elements active.
 *
 * The table index is the truncated value of r*tabscale and eps is the
 * remainder. For each element four consecutive floats (Y,F,G,H) are fetched
 * with an aligned load from VFtab + 4*index, and
 *   vcoul = qq*(Y + eps*(F + eps*G + eps^2*H))
 * is accumulated into *vctot.
 *
 * Returns qq*(F + 2*eps*G + 3*eps^2*H)*tabscale.
 * Return force should be multiplied by -rinv to get fscal.
 */
static inline __m128
gmx_mm_int_4_table_coulomb_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 *vctot)
{
    __m128  rt,eps,eps2,Y,F,G,H,Geps,Heps2,Fp,FF,vcoul;
    __m128i n0;
    int     n_a,n_b,n_c,n_d;

    /* Split the scaled distance into integer index and remainder */
    rt    = _mm_mul_ps(r,tabscale);
    n0    = _mm_cvttps_epi32(rt);
    eps   = _mm_sub_ps(rt,_mm_cvtepi32_ps(n0));
    eps2  = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a   = gmx_mm_extract_epi32(n0,0);
    n_b   = gmx_mm_extract_epi32(n0,1);
    n_c   = gmx_mm_extract_epi32(n0,2);
    n_d   = gmx_mm_extract_epi32(n0,3);

    /* One record per element, then transpose so Y/F/G/H each span the elements */
    Y     = _mm_load_ps(VFtab + 4*n_a);
    F     = _mm_load_ps(VFtab + 4*n_b);
    G     = _mm_load_ps(VFtab + 4*n_c);
    H     = _mm_load_ps(VFtab + 4*n_d);
    _MM_TRANSPOSE4_PS(Y,F,G,H);

    Heps2 = _mm_mul_ps(H,eps2);
    Geps  = _mm_mul_ps(G,eps);
    Fp    = _mm_add_ps(F,_mm_add_ps(Geps,Heps2));

    vcoul  = _mm_mul_ps(qq,_mm_add_ps(Y,_mm_mul_ps(eps,Fp)));
    *vctot = _mm_add_ps(*vctot,vcoul);

    FF    = _mm_add_ps(Fp,_mm_add_ps(Geps,_mm_add_ps(Heps2,Heps2)));
    FF    = _mm_mul_ps(qq,FF);

    return _mm_mul_ps(FF,tabscale);
}
/* Tabulated Lennard-Jones interaction, all four elements active.
 *
 * For a few cases, like TIP4p waters, there are particles with LJ-only
 * interactions in a loop where the table data might contain both coulomb and
 * LJ. To handle this case, we use an offset value of 0 if the data is an
 * LJ-only table, and 1 if it is actually a mixed coul+lj table.
 *
 * Each table point occupies 4*(offset+2) floats. The record scaled by c6
 * (Yd,Fd,Gd,Hd) starts 4*offset floats into the point and the record scaled
 * by c12 (Yr,Fr,Gr,Hr) follows four floats later. The sum of both potentials
 * is accumulated into *vvdwtot.
 *
 * Return force should be multiplied by -rinv to get fscal.
 */
static inline __m128
gmx_mm_int_4_table_lj_ps(__m128 r, __m128 tabscale, float * VFtab, int offset, __m128 c6, __m128 c12, __m128 *vvdwtot)
{
    __m128       rt,eps,eps2,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
    __m128i      n0;
    int          n_a,n_b,n_c,n_d;
    int          stride,first;
    const float *tab_a,*tab_b,*tab_c,*tab_d;

    /* Split the scaled distance into integer index and remainder */
    rt     = _mm_mul_ps(r,tabscale);
    n0     = _mm_cvttps_epi32(rt);
    eps    = _mm_sub_ps(rt,_mm_cvtepi32_ps(n0));
    eps2   = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a    = gmx_mm_extract_epi32(n0,0);
    n_b    = gmx_mm_extract_epi32(n0,1);
    n_c    = gmx_mm_extract_epi32(n0,2);
    n_d    = gmx_mm_extract_epi32(n0,3);

    stride = 4*(offset+2);   /* floats per table point           */
    first  = 4*offset;       /* floats in front of the c6 record */
    tab_a  = VFtab + stride*n_a + first;
    tab_b  = VFtab + stride*n_b + first;
    tab_c  = VFtab + stride*n_c + first;
    tab_d  = VFtab + stride*n_d + first;

    Yd     = _mm_load_ps(tab_a);
    Fd     = _mm_load_ps(tab_b);
    Gd     = _mm_load_ps(tab_c);
    Hd     = _mm_load_ps(tab_d);
    Yr     = _mm_load_ps(tab_a + 4);
    Fr     = _mm_load_ps(tab_b + 4);
    Gr     = _mm_load_ps(tab_c + 4);
    Hr     = _mm_load_ps(tab_d + 4);
    _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
    _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);

    Hd     = _mm_mul_ps(Hd,eps2);                  /* Heps2 */
    Gd     = _mm_mul_ps(Gd,eps);                   /* Geps  */
    Fd     = _mm_add_ps(Fd,_mm_add_ps(Gd,Hd));     /* Fp    */
    Hr     = _mm_mul_ps(Hr,eps2);                  /* Heps2 */
    Gr     = _mm_mul_ps(Gr,eps);                   /* Geps  */
    Fr     = _mm_add_ps(Fr,_mm_add_ps(Gr,Hr));     /* Fp    */

    vvdw6    = _mm_mul_ps(c6, _mm_add_ps(Yd,_mm_mul_ps(eps,Fd)));
    vvdw12   = _mm_mul_ps(c12,_mm_add_ps(Yr,_mm_mul_ps(eps,Fr)));
    *vvdwtot = _mm_add_ps(*vvdwtot,_mm_add_ps(vvdw6,vvdw12));

    Fd     = _mm_mul_ps(c6, _mm_add_ps(Fd,_mm_add_ps(Gd,_mm_add_ps(Hd,Hd))));
    Fr     = _mm_mul_ps(c12,_mm_add_ps(Fr,_mm_add_ps(Gr,_mm_add_ps(Hr,Hr))));

    return _mm_mul_ps(_mm_add_ps(Fd,Fr),tabscale);
}
/* Tabulated Coulomb + Lennard-Jones interaction, all four elements active.
 *
 * Each table point occupies 12 floats: the record scaled by qq (Yc,Fc,Gc,Hc)
 * at +0, the record scaled by c6 (Yd,..) at +4 and the record scaled by c12
 * (Yr,..) at +8. The Coulomb potential is accumulated into *vctot and the sum
 * of the two LJ potentials into *vvdwtot.
 *
 * Return force should be multiplied by -rinv to get fscal.
 */
static inline __m128
gmx_mm_int_4_table_coulomb_and_lj_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 c6, __m128 c12,
                                     __m128 *vctot, __m128 *vvdwtot)
{
    __m128       rt,eps,eps2,vcoul,Yc,Fc,Gc,Hc,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
    __m128i      n0;
    int          n_a,n_b,n_c,n_d;
    const float *tab_a,*tab_b,*tab_c,*tab_d;

    /* Split the scaled distance into integer index and remainder */
    rt     = _mm_mul_ps(r,tabscale);
    n0     = _mm_cvttps_epi32(rt);
    eps    = _mm_sub_ps(rt,_mm_cvtepi32_ps(n0));
    eps2   = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a    = gmx_mm_extract_epi32(n0,0);
    n_b    = gmx_mm_extract_epi32(n0,1);
    n_c    = gmx_mm_extract_epi32(n0,2);
    n_d    = gmx_mm_extract_epi32(n0,3);

    tab_a  = VFtab + 12*n_a;
    tab_b  = VFtab + 12*n_b;
    tab_c  = VFtab + 12*n_c;
    tab_d  = VFtab + 12*n_d;

    Yc     = _mm_load_ps(tab_a);
    Fc     = _mm_load_ps(tab_b);
    Gc     = _mm_load_ps(tab_c);
    Hc     = _mm_load_ps(tab_d);
    Yd     = _mm_load_ps(tab_a + 4);
    Fd     = _mm_load_ps(tab_b + 4);
    Gd     = _mm_load_ps(tab_c + 4);
    Hd     = _mm_load_ps(tab_d + 4);
    Yr     = _mm_load_ps(tab_a + 8);
    Fr     = _mm_load_ps(tab_b + 8);
    Gr     = _mm_load_ps(tab_c + 8);
    Hr     = _mm_load_ps(tab_d + 8);
    _MM_TRANSPOSE4_PS(Yc,Fc,Gc,Hc);
    _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
    _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);

    Hc     = _mm_mul_ps(Hc,eps2);                  /* Heps2 */
    Gc     = _mm_mul_ps(Gc,eps);                   /* Geps  */
    Fc     = _mm_add_ps(Fc,_mm_add_ps(Gc,Hc));     /* Fp    */
    Hd     = _mm_mul_ps(Hd,eps2);                  /* Heps2 */
    Gd     = _mm_mul_ps(Gd,eps);                   /* Geps  */
    Fd     = _mm_add_ps(Fd,_mm_add_ps(Gd,Hd));     /* Fp    */
    Hr     = _mm_mul_ps(Hr,eps2);                  /* Heps2 */
    Gr     = _mm_mul_ps(Gr,eps);                   /* Geps  */
    Fr     = _mm_add_ps(Fr,_mm_add_ps(Gr,Hr));     /* Fp    */

    vcoul    = _mm_mul_ps(qq,_mm_add_ps(Yc,_mm_mul_ps(eps,Fc)));
    *vctot   = _mm_add_ps(*vctot,vcoul);

    vvdw6    = _mm_mul_ps(c6, _mm_add_ps(Yd,_mm_mul_ps(eps,Fd)));
    vvdw12   = _mm_mul_ps(c12,_mm_add_ps(Yr,_mm_mul_ps(eps,Fr)));
    *vvdwtot = _mm_add_ps(*vvdwtot,_mm_add_ps(vvdw6,vvdw12));

    Fc     = _mm_mul_ps(qq, _mm_add_ps(Fc,_mm_add_ps(Gc,_mm_add_ps(Hc,Hc))));
    Fd     = _mm_mul_ps(c6, _mm_add_ps(Fd,_mm_add_ps(Gd,_mm_add_ps(Hd,Hd))));
    Fr     = _mm_mul_ps(c12,_mm_add_ps(Fr,_mm_add_ps(Gr,_mm_add_ps(Hr,Hr))));

    return _mm_mul_ps(_mm_add_ps(Fc,_mm_add_ps(Fd,Fr)),tabscale);
}
/* Tabulated Coulomb interaction, first three elements active.
 *
 * Works like the four-element version, but only the indices of elements
 * 0-2 are used for table loads; the fourth record is all zeros.
 *   vcoul = qq*(Y + eps*(F + eps*G + eps^2*H))  is accumulated into *vctot.
 *
 * Returns qq*(F + 2*eps*G + 3*eps^2*H)*tabscale.
 * Return force should be multiplied by -rinv to get fscal.
 */
static inline __m128
gmx_mm_int_3_table_coulomb_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 *vctot)
{
    __m128  rt,eps,eps2,Y,F,G,H,Geps,Heps2,Fp,FF,vcoul;
    __m128i n0;
    int     n_a,n_b,n_c;

    /* Split the scaled distance into integer index and remainder */
    rt    = _mm_mul_ps(r,tabscale);
    n0    = _mm_cvttps_epi32(rt);
    eps   = _mm_sub_ps(rt,_mm_cvtepi32_ps(n0));
    eps2  = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a   = gmx_mm_extract_epi32(n0,0);
    n_b   = gmx_mm_extract_epi32(n0,1);
    n_c   = gmx_mm_extract_epi32(n0,2);

    Y     = _mm_load_ps(VFtab + 4*n_a);
    F     = _mm_load_ps(VFtab + 4*n_b);
    G     = _mm_load_ps(VFtab + 4*n_c);
    H     = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Y,F,G,H);

    Heps2 = _mm_mul_ps(H,eps2);
    Geps  = _mm_mul_ps(G,eps);
    Fp    = _mm_add_ps(F,_mm_add_ps(Geps,Heps2));

    vcoul  = _mm_mul_ps(qq,_mm_add_ps(Y,_mm_mul_ps(eps,Fp)));
    *vctot = _mm_add_ps(*vctot,vcoul);

    FF    = _mm_add_ps(Fp,_mm_add_ps(Geps,_mm_add_ps(Heps2,Heps2)));
    FF    = _mm_mul_ps(qq,FF);

    return _mm_mul_ps(FF,tabscale);
}
/* Tabulated Lennard-Jones interaction, first three elements active.
 *
 * For a few cases, like TIP4p waters, there are particles with LJ-only
 * interactions in a loop where the table data might contain both coulomb and
 * LJ. To handle this case, we use an offset value of 0 if the data is an
 * LJ-only table, and 1 if it is actually a mixed coul+lj table.
 *
 * Each table point occupies 4*(offset+2) floats. The record scaled by c6
 * starts 4*offset floats into the point (the same addressing as
 * gmx_mm_int_4_table_lj_ps), and the record scaled by c12 follows four floats
 * later. Skipping 4*offset rather than offset floats keeps every address a
 * multiple of four floats from VFtab, as the aligned loads require, and
 * selects the LJ records instead of the tail of the first record when
 * offset is 1.
 *
 * Only the indices of elements 0-2 are used for table loads; the fourth
 * record is all zeros. The sum of both potentials is accumulated into
 * *vvdwtot.
 *
 * Return force should be multiplied by -rinv to get fscal.
 */
static inline __m128
gmx_mm_int_3_table_lj_ps(__m128 r, __m128 tabscale, float * VFtab, int offset, __m128 c6, __m128 c12, __m128 *vvdwtot)
{
    __m128       rt,eps,eps2,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
    __m128i      n0;
    int          n_a,n_b,n_c;
    int          stride,first;
    const float *tab_a,*tab_b,*tab_c;

    /* Split the scaled distance into integer index and remainder */
    rt     = _mm_mul_ps(r,tabscale);
    n0     = _mm_cvttps_epi32(rt);
    eps    = _mm_sub_ps(rt,_mm_cvtepi32_ps(n0));
    eps2   = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a    = gmx_mm_extract_epi32(n0,0);
    n_b    = gmx_mm_extract_epi32(n0,1);
    n_c    = gmx_mm_extract_epi32(n0,2);

    stride = 4*(offset+2);   /* floats per table point           */
    first  = 4*offset;       /* floats in front of the c6 record */
    tab_a  = VFtab + stride*n_a + first;
    tab_b  = VFtab + stride*n_b + first;
    tab_c  = VFtab + stride*n_c + first;

    Yd     = _mm_load_ps(tab_a);
    Fd     = _mm_load_ps(tab_b);
    Gd     = _mm_load_ps(tab_c);
    Hd     = _mm_setzero_ps();
    Yr     = _mm_load_ps(tab_a + 4);
    Fr     = _mm_load_ps(tab_b + 4);
    Gr     = _mm_load_ps(tab_c + 4);
    Hr     = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
    _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);

    Hd     = _mm_mul_ps(Hd,eps2);                  /* Heps2 */
    Gd     = _mm_mul_ps(Gd,eps);                   /* Geps  */
    Fd     = _mm_add_ps(Fd,_mm_add_ps(Gd,Hd));     /* Fp    */
    Hr     = _mm_mul_ps(Hr,eps2);                  /* Heps2 */
    Gr     = _mm_mul_ps(Gr,eps);                   /* Geps  */
    Fr     = _mm_add_ps(Fr,_mm_add_ps(Gr,Hr));     /* Fp    */

    vvdw6    = _mm_mul_ps(c6, _mm_add_ps(Yd,_mm_mul_ps(eps,Fd)));
    vvdw12   = _mm_mul_ps(c12,_mm_add_ps(Yr,_mm_mul_ps(eps,Fr)));
    *vvdwtot = _mm_add_ps(*vvdwtot,_mm_add_ps(vvdw6,vvdw12));

    Fd     = _mm_mul_ps(c6, _mm_add_ps(Fd,_mm_add_ps(Gd,_mm_add_ps(Hd,Hd))));
    Fr     = _mm_mul_ps(c12,_mm_add_ps(Fr,_mm_add_ps(Gr,_mm_add_ps(Hr,Hr))));

    return _mm_mul_ps(_mm_add_ps(Fd,Fr),tabscale);
}
/* Tabulated Coulomb + Lennard-Jones interaction, first three elements active.
 *
 * Each table point occupies 12 floats: the record scaled by qq at +0, the
 * record scaled by c6 at +4 and the record scaled by c12 at +8. Only the
 * indices of elements 0-2 are used for table loads; the fourth record of
 * every group is all zeros. The Coulomb potential is accumulated into *vctot
 * and the sum of the two LJ potentials into *vvdwtot.
 *
 * Return force should be multiplied by -rinv to get fscal.
 */
static inline __m128
gmx_mm_int_3_table_coulomb_and_lj_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 c6, __m128 c12,
                                     __m128 *vctot, __m128 *vvdwtot)
{
    __m128       rt,eps,eps2,vcoul,Yc,Fc,Gc,Hc,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
    __m128i      n0;
    int          n_a,n_b,n_c;
    const float *tab_a,*tab_b,*tab_c;

    /* Split the scaled distance into integer index and remainder */
    rt     = _mm_mul_ps(r,tabscale);
    n0     = _mm_cvttps_epi32(rt);
    eps    = _mm_sub_ps(rt,_mm_cvtepi32_ps(n0));
    eps2   = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a    = gmx_mm_extract_epi32(n0,0);
    n_b    = gmx_mm_extract_epi32(n0,1);
    n_c    = gmx_mm_extract_epi32(n0,2);

    tab_a  = VFtab + 12*n_a;
    tab_b  = VFtab + 12*n_b;
    tab_c  = VFtab + 12*n_c;

    Yc     = _mm_load_ps(tab_a);
    Fc     = _mm_load_ps(tab_b);
    Gc     = _mm_load_ps(tab_c);
    Hc     = _mm_setzero_ps();
    Yd     = _mm_load_ps(tab_a + 4);
    Fd     = _mm_load_ps(tab_b + 4);
    Gd     = _mm_load_ps(tab_c + 4);
    Hd     = _mm_setzero_ps();
    Yr     = _mm_load_ps(tab_a + 8);
    Fr     = _mm_load_ps(tab_b + 8);
    Gr     = _mm_load_ps(tab_c + 8);
    Hr     = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Yc,Fc,Gc,Hc);
    _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
    _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);

    Hc     = _mm_mul_ps(Hc,eps2);                  /* Heps2 */
    Gc     = _mm_mul_ps(Gc,eps);                   /* Geps  */
    Fc     = _mm_add_ps(Fc,_mm_add_ps(Gc,Hc));     /* Fp    */
    Hd     = _mm_mul_ps(Hd,eps2);                  /* Heps2 */
    Gd     = _mm_mul_ps(Gd,eps);                   /* Geps  */
    Fd     = _mm_add_ps(Fd,_mm_add_ps(Gd,Hd));     /* Fp    */
    Hr     = _mm_mul_ps(Hr,eps2);                  /* Heps2 */
    Gr     = _mm_mul_ps(Gr,eps);                   /* Geps  */
    Fr     = _mm_add_ps(Fr,_mm_add_ps(Gr,Hr));     /* Fp    */

    vcoul    = _mm_mul_ps(qq,_mm_add_ps(Yc,_mm_mul_ps(eps,Fc)));
    *vctot   = _mm_add_ps(*vctot,vcoul);

    vvdw6    = _mm_mul_ps(c6, _mm_add_ps(Yd,_mm_mul_ps(eps,Fd)));
    vvdw12   = _mm_mul_ps(c12,_mm_add_ps(Yr,_mm_mul_ps(eps,Fr)));
    *vvdwtot = _mm_add_ps(*vvdwtot,_mm_add_ps(vvdw6,vvdw12));

    Fc     = _mm_mul_ps(qq, _mm_add_ps(Fc,_mm_add_ps(Gc,_mm_add_ps(Hc,Hc))));
    Fd     = _mm_mul_ps(c6, _mm_add_ps(Fd,_mm_add_ps(Gd,_mm_add_ps(Hd,Hd))));
    Fr     = _mm_mul_ps(c12,_mm_add_ps(Fr,_mm_add_ps(Gr,_mm_add_ps(Hr,Hr))));

    return _mm_mul_ps(_mm_add_ps(Fc,_mm_add_ps(Fd,Fr)),tabscale);
}
/* Tabulated Coulomb interaction, first two elements active.
 *
 * Works like the four-element version, but only the indices of elements
 * 0-1 are used for table loads; the third and fourth records are all zeros.
 *   vcoul = qq*(Y + eps*(F + eps*G + eps^2*H))  is accumulated into *vctot.
 *
 * Returns qq*(F + 2*eps*G + 3*eps^2*H)*tabscale.
 * Return force should be multiplied by -rinv to get fscal.
 */
static inline __m128
gmx_mm_int_2_table_coulomb_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 *vctot)
{
    __m128  rt,eps,eps2,Y,F,G,H,Geps,Heps2,Fp,FF,vcoul;
    __m128i n0;
    int     n_a,n_b;

    /* Split the scaled distance into integer index and remainder */
    rt    = _mm_mul_ps(r,tabscale);
    n0    = _mm_cvttps_epi32(rt);
    eps   = _mm_sub_ps(rt,_mm_cvtepi32_ps(n0));
    eps2  = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a   = gmx_mm_extract_epi32(n0,0);
    n_b   = gmx_mm_extract_epi32(n0,1);

    Y     = _mm_load_ps(VFtab + 4*n_a);
    F     = _mm_load_ps(VFtab + 4*n_b);
    G     = _mm_setzero_ps();
    H     = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Y,F,G,H);

    Heps2 = _mm_mul_ps(H,eps2);
    Geps  = _mm_mul_ps(G,eps);
    Fp    = _mm_add_ps(F,_mm_add_ps(Geps,Heps2));

    vcoul  = _mm_mul_ps(qq,_mm_add_ps(Y,_mm_mul_ps(eps,Fp)));
    *vctot = _mm_add_ps(*vctot,vcoul);

    FF    = _mm_add_ps(Fp,_mm_add_ps(Geps,_mm_add_ps(Heps2,Heps2)));
    FF    = _mm_mul_ps(qq,FF);

    return _mm_mul_ps(FF,tabscale);
}
/* Tabulated Lennard-Jones interaction, first two elements active.
 *
 * For a few cases, like TIP4p waters, there are particles with LJ-only
 * interactions in a loop where the table data might contain both coulomb and
 * LJ. To handle this case, we use an offset value of 0 if the data is an
 * LJ-only table, and 1 if it is actually a mixed coul+lj table.
 *
 * Each table point occupies 4*(offset+2) floats. The record scaled by c6
 * starts 4*offset floats into the point (the same addressing as
 * gmx_mm_int_4_table_lj_ps), and the record scaled by c12 follows four floats
 * later. Skipping 4*offset rather than offset floats keeps every address a
 * multiple of four floats from VFtab, as the aligned loads require, and
 * selects the LJ records instead of the tail of the first record when
 * offset is 1.
 *
 * Only the indices of elements 0-1 are used for table loads; the third and
 * fourth records are all zeros. The sum of both potentials is accumulated
 * into *vvdwtot.
 *
 * Return force should be multiplied by -rinv to get fscal.
 */
static inline __m128
gmx_mm_int_2_table_lj_ps(__m128 r, __m128 tabscale, float * VFtab, int offset, __m128 c6, __m128 c12, __m128 *vvdwtot)
{
    __m128       rt,eps,eps2,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
    __m128i      n0;
    int          n_a,n_b;
    int          stride,first;
    const float *tab_a,*tab_b;

    /* Split the scaled distance into integer index and remainder */
    rt     = _mm_mul_ps(r,tabscale);
    n0     = _mm_cvttps_epi32(rt);
    eps    = _mm_sub_ps(rt,_mm_cvtepi32_ps(n0));
    eps2   = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a    = gmx_mm_extract_epi32(n0,0);
    n_b    = gmx_mm_extract_epi32(n0,1);

    stride = 4*(offset+2);   /* floats per table point           */
    first  = 4*offset;       /* floats in front of the c6 record */
    tab_a  = VFtab + stride*n_a + first;
    tab_b  = VFtab + stride*n_b + first;

    Yd     = _mm_load_ps(tab_a);
    Fd     = _mm_load_ps(tab_b);
    Gd     = _mm_setzero_ps();
    Hd     = _mm_setzero_ps();
    Yr     = _mm_load_ps(tab_a + 4);
    Fr     = _mm_load_ps(tab_b + 4);
    Gr     = _mm_setzero_ps();
    Hr     = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
    _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);

    Hd     = _mm_mul_ps(Hd,eps2);                  /* Heps2 */
    Gd     = _mm_mul_ps(Gd,eps);                   /* Geps  */
    Fd     = _mm_add_ps(Fd,_mm_add_ps(Gd,Hd));     /* Fp    */
    Hr     = _mm_mul_ps(Hr,eps2);                  /* Heps2 */
    Gr     = _mm_mul_ps(Gr,eps);                   /* Geps  */
    Fr     = _mm_add_ps(Fr,_mm_add_ps(Gr,Hr));     /* Fp    */

    vvdw6    = _mm_mul_ps(c6, _mm_add_ps(Yd,_mm_mul_ps(eps,Fd)));
    vvdw12   = _mm_mul_ps(c12,_mm_add_ps(Yr,_mm_mul_ps(eps,Fr)));
    *vvdwtot = _mm_add_ps(*vvdwtot,_mm_add_ps(vvdw6,vvdw12));

    Fd     = _mm_mul_ps(c6, _mm_add_ps(Fd,_mm_add_ps(Gd,_mm_add_ps(Hd,Hd))));
    Fr     = _mm_mul_ps(c12,_mm_add_ps(Fr,_mm_add_ps(Gr,_mm_add_ps(Hr,Hr))));

    return _mm_mul_ps(_mm_add_ps(Fd,Fr),tabscale);
}
/* Tabulated Coulomb + Lennard-Jones interaction, first two elements active.
 *
 * Each table point occupies 12 floats: the record scaled by qq at +0, the
 * record scaled by c6 at +4 and the record scaled by c12 at +8. Only the
 * indices of elements 0-1 are used for table loads; the third and fourth
 * records of every group are all zeros. The Coulomb potential is accumulated
 * into *vctot and the sum of the two LJ potentials into *vvdwtot.
 *
 * Return force should be multiplied by -rinv to get fscal.
 */
static inline __m128
gmx_mm_int_2_table_coulomb_and_lj_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 c6, __m128 c12,
                                     __m128 *vctot, __m128 *vvdwtot)
{
    __m128       rt,eps,eps2,vcoul,Yc,Fc,Gc,Hc,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
    __m128i      n0;
    int          n_a,n_b;
    const float *tab_a,*tab_b;

    /* Split the scaled distance into integer index and remainder */
    rt     = _mm_mul_ps(r,tabscale);
    n0     = _mm_cvttps_epi32(rt);
    eps    = _mm_sub_ps(rt,_mm_cvtepi32_ps(n0));
    eps2   = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a    = gmx_mm_extract_epi32(n0,0);
    n_b    = gmx_mm_extract_epi32(n0,1);

    tab_a  = VFtab + 12*n_a;
    tab_b  = VFtab + 12*n_b;

    Yc     = _mm_load_ps(tab_a);
    Fc     = _mm_load_ps(tab_b);
    Gc     = _mm_setzero_ps();
    Hc     = _mm_setzero_ps();
    Yd     = _mm_load_ps(tab_a + 4);
    Fd     = _mm_load_ps(tab_b + 4);
    Gd     = _mm_setzero_ps();
    Hd     = _mm_setzero_ps();
    Yr     = _mm_load_ps(tab_a + 8);
    Fr     = _mm_load_ps(tab_b + 8);
    Gr     = _mm_setzero_ps();
    Hr     = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Yc,Fc,Gc,Hc);
    _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
    _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);

    Hc     = _mm_mul_ps(Hc,eps2);                  /* Heps2 */
    Gc     = _mm_mul_ps(Gc,eps);                   /* Geps  */
    Fc     = _mm_add_ps(Fc,_mm_add_ps(Gc,Hc));     /* Fp    */
    Hd     = _mm_mul_ps(Hd,eps2);                  /* Heps2 */
    Gd     = _mm_mul_ps(Gd,eps);                   /* Geps  */
    Fd     = _mm_add_ps(Fd,_mm_add_ps(Gd,Hd));     /* Fp    */
    Hr     = _mm_mul_ps(Hr,eps2);                  /* Heps2 */
    Gr     = _mm_mul_ps(Gr,eps);                   /* Geps  */
    Fr     = _mm_add_ps(Fr,_mm_add_ps(Gr,Hr));     /* Fp    */

    vcoul    = _mm_mul_ps(qq,_mm_add_ps(Yc,_mm_mul_ps(eps,Fc)));
    *vctot   = _mm_add_ps(*vctot,vcoul);

    vvdw6    = _mm_mul_ps(c6, _mm_add_ps(Yd,_mm_mul_ps(eps,Fd)));
    vvdw12   = _mm_mul_ps(c12,_mm_add_ps(Yr,_mm_mul_ps(eps,Fr)));
    *vvdwtot = _mm_add_ps(*vvdwtot,_mm_add_ps(vvdw6,vvdw12));

    Fc     = _mm_mul_ps(qq, _mm_add_ps(Fc,_mm_add_ps(Gc,_mm_add_ps(Hc,Hc))));
    Fd     = _mm_mul_ps(c6, _mm_add_ps(Fd,_mm_add_ps(Gd,_mm_add_ps(Hd,Hd))));
    Fr     = _mm_mul_ps(c12,_mm_add_ps(Fr,_mm_add_ps(Gr,_mm_add_ps(Hr,Hr))));

    return _mm_mul_ps(_mm_add_ps(Fc,_mm_add_ps(Fd,Fr)),tabscale);
}
/* Tabulated Coulomb interaction, first element only.
 *
 * Works like the four-element version, but only the index of element 0 is
 * used for a table load; the other three records are all zeros.
 *   vcoul = qq*(Y + eps*(F + eps*G + eps^2*H))  is accumulated into *vctot.
 *
 * Returns qq*(F + 2*eps*G + 3*eps^2*H)*tabscale.
 * Return force should be multiplied by -rinv to get fscal.
 */
static inline __m128
gmx_mm_int_1_table_coulomb_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 *vctot)
{
    __m128  rt,eps,eps2,Y,F,G,H,Geps,Heps2,Fp,FF,vcoul;
    __m128i n0;
    int     n_a;

    /* Split the scaled distance into integer index and remainder */
    rt    = _mm_mul_ps(r,tabscale);
    n0    = _mm_cvttps_epi32(rt);
    eps   = _mm_sub_ps(rt,_mm_cvtepi32_ps(n0));
    eps2  = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a   = gmx_mm_extract_epi32(n0,0);

    Y     = _mm_load_ps(VFtab + 4*n_a);
    F     = _mm_setzero_ps();
    G     = _mm_setzero_ps();
    H     = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Y,F,G,H);

    Heps2 = _mm_mul_ps(H,eps2);
    Geps  = _mm_mul_ps(G,eps);
    Fp    = _mm_add_ps(F,_mm_add_ps(Geps,Heps2));

    vcoul  = _mm_mul_ps(qq,_mm_add_ps(Y,_mm_mul_ps(eps,Fp)));
    *vctot = _mm_add_ps(*vctot,vcoul);

    FF    = _mm_add_ps(Fp,_mm_add_ps(Geps,_mm_add_ps(Heps2,Heps2)));
    FF    = _mm_mul_ps(qq,FF);

    return _mm_mul_ps(FF,tabscale);
}
/* Tabulated Lennard-Jones interaction, first element only.
 *
 * For a few cases, like TIP4p waters, there are particles with LJ-only
 * interactions in a loop where the table data might contain both coulomb and
 * LJ. To handle this case, we use an offset value of 0 if the data is an
 * LJ-only table, and 1 if it is actually a mixed coul+lj table.
 *
 * Each table point occupies 4*(offset+2) floats. The record scaled by c6
 * starts 4*offset floats into the point (the same addressing as
 * gmx_mm_int_4_table_lj_ps), and the record scaled by c12 follows four floats
 * later. Skipping 4*offset rather than offset floats keeps every address a
 * multiple of four floats from VFtab, as the aligned loads require, and
 * selects the LJ records instead of the tail of the first record when
 * offset is 1.
 *
 * Only the index of element 0 is used for table loads; the other three
 * records are all zeros. The sum of both potentials is accumulated into
 * *vvdwtot.
 *
 * Return force should be multiplied by -rinv to get fscal.
 */
static inline __m128
gmx_mm_int_1_table_lj_ps(__m128 r, __m128 tabscale, float * VFtab, int offset, __m128 c6, __m128 c12, __m128 *vvdwtot)
{
    __m128       rt,eps,eps2,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
    __m128i      n0;
    int          n_a;
    int          stride,first;
    const float *tab_a;

    /* Split the scaled distance into integer index and remainder */
    rt     = _mm_mul_ps(r,tabscale);
    n0     = _mm_cvttps_epi32(rt);
    eps    = _mm_sub_ps(rt,_mm_cvtepi32_ps(n0));
    eps2   = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a    = gmx_mm_extract_epi32(n0,0);

    stride = 4*(offset+2);   /* floats per table point           */
    first  = 4*offset;       /* floats in front of the c6 record */
    tab_a  = VFtab + stride*n_a + first;

    Yd     = _mm_load_ps(tab_a);
    Fd     = _mm_setzero_ps();
    Gd     = _mm_setzero_ps();
    Hd     = _mm_setzero_ps();
    Yr     = _mm_load_ps(tab_a + 4);
    Fr     = _mm_setzero_ps();
    Gr     = _mm_setzero_ps();
    Hr     = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
    _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);

    Hd     = _mm_mul_ps(Hd,eps2);                  /* Heps2 */
    Gd     = _mm_mul_ps(Gd,eps);                   /* Geps  */
    Fd     = _mm_add_ps(Fd,_mm_add_ps(Gd,Hd));     /* Fp    */
    Hr     = _mm_mul_ps(Hr,eps2);                  /* Heps2 */
    Gr     = _mm_mul_ps(Gr,eps);                   /* Geps  */
    Fr     = _mm_add_ps(Fr,_mm_add_ps(Gr,Hr));     /* Fp    */

    vvdw6    = _mm_mul_ps(c6, _mm_add_ps(Yd,_mm_mul_ps(eps,Fd)));
    vvdw12   = _mm_mul_ps(c12,_mm_add_ps(Yr,_mm_mul_ps(eps,Fr)));
    *vvdwtot = _mm_add_ps(*vvdwtot,_mm_add_ps(vvdw6,vvdw12));

    Fd     = _mm_mul_ps(c6, _mm_add_ps(Fd,_mm_add_ps(Gd,_mm_add_ps(Hd,Hd))));
    Fr     = _mm_mul_ps(c12,_mm_add_ps(Fr,_mm_add_ps(Gr,_mm_add_ps(Hr,Hr))));

    return _mm_mul_ps(_mm_add_ps(Fd,Fr),tabscale);
}
/* Tabulated Coulomb + Lennard-Jones interaction, first element only.
 *
 * Each table point occupies 12 floats: the record scaled by qq at +0, the
 * record scaled by c6 at +4 and the record scaled by c12 at +8. Only the
 * index of element 0 is used for table loads; the other three records of
 * every group are all zeros. The Coulomb potential is accumulated into
 * *vctot and the sum of the two LJ potentials into *vvdwtot.
 *
 * Return force should be multiplied by -rinv to get fscal.
 */
static inline __m128
gmx_mm_int_1_table_coulomb_and_lj_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 c6, __m128 c12,
                                     __m128 *vctot, __m128 *vvdwtot)
{
    __m128       rt,eps,eps2,vcoul,Yc,Fc,Gc,Hc,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
    __m128i      n0;
    int          n_a;
    const float *tab_a;

    /* Split the scaled distance into integer index and remainder */
    rt     = _mm_mul_ps(r,tabscale);
    n0     = _mm_cvttps_epi32(rt);
    eps    = _mm_sub_ps(rt,_mm_cvtepi32_ps(n0));
    eps2   = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a    = gmx_mm_extract_epi32(n0,0);

    tab_a  = VFtab + 12*n_a;

    Yc     = _mm_load_ps(tab_a);
    Fc     = _mm_setzero_ps();
    Gc     = _mm_setzero_ps();
    Hc     = _mm_setzero_ps();
    Yd     = _mm_load_ps(tab_a + 4);
    Fd     = _mm_setzero_ps();
    Gd     = _mm_setzero_ps();
    Hd     = _mm_setzero_ps();
    Yr     = _mm_load_ps(tab_a + 8);
    Fr     = _mm_setzero_ps();
    Gr     = _mm_setzero_ps();
    Hr     = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Yc,Fc,Gc,Hc);
    _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
    _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);

    Hc     = _mm_mul_ps(Hc,eps2);                  /* Heps2 */
    Gc     = _mm_mul_ps(Gc,eps);                   /* Geps  */
    Fc     = _mm_add_ps(Fc,_mm_add_ps(Gc,Hc));     /* Fp    */
    Hd     = _mm_mul_ps(Hd,eps2);                  /* Heps2 */
    Gd     = _mm_mul_ps(Gd,eps);                   /* Geps  */
    Fd     = _mm_add_ps(Fd,_mm_add_ps(Gd,Hd));     /* Fp    */
    Hr     = _mm_mul_ps(Hr,eps2);                  /* Heps2 */
    Gr     = _mm_mul_ps(Gr,eps);                   /* Geps  */
    Fr     = _mm_add_ps(Fr,_mm_add_ps(Gr,Hr));     /* Fp    */

    vcoul    = _mm_mul_ps(qq,_mm_add_ps(Yc,_mm_mul_ps(eps,Fc)));
    *vctot   = _mm_add_ps(*vctot,vcoul);

    vvdw6    = _mm_mul_ps(c6, _mm_add_ps(Yd,_mm_mul_ps(eps,Fd)));
    vvdw12   = _mm_mul_ps(c12,_mm_add_ps(Yr,_mm_mul_ps(eps,Fr)));
    *vvdwtot = _mm_add_ps(*vvdwtot,_mm_add_ps(vvdw6,vvdw12));

    Fc     = _mm_mul_ps(qq, _mm_add_ps(Fc,_mm_add_ps(Gc,_mm_add_ps(Hc,Hc))));
    Fd     = _mm_mul_ps(c6, _mm_add_ps(Fd,_mm_add_ps(Gd,_mm_add_ps(Hd,Hd))));
    Fr     = _mm_mul_ps(c12,_mm_add_ps(Fr,_mm_add_ps(Gr,_mm_add_ps(Hr,Hr))));

    return _mm_mul_ps(_mm_add_ps(Fc,_mm_add_ps(Fd,Fr)),tabscale);
}
/*
 * Table-based kernel for four j entries, one per SSE lane, using GBtab.
 * (The "genborn" name suggests Generalized Born; this listing alone cannot
 * confirm the physical meaning of the arguments.)
 *
 * r           per-lane value; r*gbtabscale*isai*isaj gives the table position
 * isai        multiplied lane-wise with the gathered isaj values
 * isaj1..4    one float is read from each pointer (lanes 0..3)
 * gbtabscale  table scale; multiplied by isai*isaj before use
 * GBtab       table, read with aligned 4-float loads at GBtab + 4*n
 * qq          prefactor; multiplied by isai*isaj before use
 * dvdasum     accumulator; 0.5*(vgb + ftmp*r) is added
 * dvdaj1..4   lane k of 0.5*(vgb + ftmp*r)*isaj*isaj is added to *dvdaj(k+1)
 *
 * NOTE(review): *vgbtot is updated below, yet no vgbtot parameter, no
 * declaration of n0 and no return statement are visible in this listing.
 * Presumably those lines are missing and ftmp is the returned force -- confirm.
 */
2657 /* Return force should be multiplied by +rinv to get fscal */
2658 static inline __m128
2659 gmx_mm_int_4_genborn_ps(__m128 r
, __m128 isai
,
2660 float * isaj1
, float *isaj2
, float *isaj3
, float *isaj4
,
2661 __m128 gbtabscale
, float * GBtab
, __m128 qq
, __m128
*dvdasum
,
2662 float *dvdaj1
, float *dvdaj2
, float *dvdaj3
, float *dvdaj4
,
2665 const __m128 half
= {0.5,0.5,0.5,0.5};
2667 __m128 rt
,eps
,eps2
,Y
,F
,G
,H
,VV
,FF
,ftmp
,isaprod
,t2
,t3
,t4
,isaj
,vgb
,dvdatmp
;
2669 int n_a
,n_b
,n_c
,n_d
;
/* Gather the four scalar isaj values into one register (lane k <- *isaj(k+1)) */
2672 isaj
= _mm_load_ss(isaj1
);
2673 t2
= _mm_load_ss(isaj2
);
2674 t3
= _mm_load_ss(isaj3
);
2675 t4
= _mm_load_ss(isaj4
);
2676 isaj
= _mm_unpacklo_ps(isaj
,t2
); /* - - t2 t1 */
2677 t3
= _mm_unpacklo_ps(t3
,t4
); /* - - t4 t3 */
2678 isaj
= _mm_movelh_ps(isaj
,t3
); /* t4 t3 t2 t1 */
/* Both the prefactor and the table scale are multiplied by isai*isaj */
2680 isaprod
= _mm_mul_ps(isai
,isaj
);
2681 qq
= _mm_mul_ps(qq
,isaprod
);
2682 gbtabscale
= _mm_mul_ps( isaprod
, gbtabscale
);
/* Table position rt, truncated integer index n0 and fraction eps */
2684 rt
= _mm_mul_ps(r
,gbtabscale
);
2685 n0
= _mm_cvttps_epi32(rt
);
2686 eps
= _mm_sub_ps(rt
, _mm_cvtepi32_ps(n0
));
2687 eps2
= _mm_mul_ps(eps
,eps
);
2689 /* Extract indices from n0 */
2690 n_a
= gmx_mm_extract_epi32(n0
,0);
2691 n_b
= gmx_mm_extract_epi32(n0
,1);
2692 n_c
= gmx_mm_extract_epi32(n0
,2);
2693 n_d
= gmx_mm_extract_epi32(n0
,3);
/* Load one 4-float table point per lane, then transpose so that Y, F, G, H
 * each hold the same coefficient for all four lanes. */
2694 Y
= _mm_load_ps(GBtab
+ 4* n_a
);
2695 F
= _mm_load_ps(GBtab
+ 4* n_b
);
2696 G
= _mm_load_ps(GBtab
+ 4* n_c
);
2697 H
= _mm_load_ps(GBtab
+ 4* n_d
);
2698 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
2699 G
= _mm_mul_ps(G
,eps
); /* Geps */
2700 H
= _mm_mul_ps(H
,eps2
); /* Heps2 */
2701 F
= _mm_add_ps(_mm_add_ps(F
,G
),H
); /* Fp */
/* VV = Y + eps*Fp ; FF = Fp + G*eps + 2*H*eps^2 */
2703 VV
= _mm_add_ps(Y
, _mm_mul_ps(eps
,F
));
2704 FF
= _mm_add_ps(_mm_add_ps(F
,G
), _mm_add_ps(H
,H
));
2706 vgb
= _mm_mul_ps(qq
, VV
);
2707 *vgbtot
= _mm_sub_ps(*vgbtot
,vgb
); /* Yes, the sign is correct */
2709 ftmp
= _mm_mul_ps(_mm_mul_ps(qq
, FF
), gbtabscale
);
2711 dvdatmp
= _mm_mul_ps(half
, _mm_add_ps(vgb
,_mm_mul_ps(ftmp
,r
)));
2713 *dvdasum
= _mm_add_ps(*dvdasum
,dvdatmp
);
/* The per-j contribution is additionally scaled by isaj^2 */
2715 dvdatmp
= _mm_mul_ps(_mm_mul_ps(dvdatmp
,isaj
), isaj
);
2717 /* Update 4 dvda[j] values */
2718 Y
= _mm_load_ss(dvdaj1
);
2719 F
= _mm_load_ss(dvdaj2
);
2720 G
= _mm_load_ss(dvdaj3
);
2721 H
= _mm_load_ss(dvdaj4
);
/* Move lanes 2, 1 and 3 of dvdatmp into lane 0 of t3, t2 and t4 */
2722 t3
= _mm_movehl_ps(_mm_setzero_ps(),dvdatmp
);
2723 t2
= _mm_shuffle_ps(dvdatmp
,dvdatmp
,_MM_SHUFFLE(0,0,0,1));
2724 t4
= _mm_shuffle_ps(t3
,t3
,_MM_SHUFFLE(0,0,0,1));
2726 _mm_store_ss( dvdaj1
, _mm_add_ss( Y
, dvdatmp
) );
2727 _mm_store_ss( dvdaj2
, _mm_add_ss( F
, t2
) );
2728 _mm_store_ss( dvdaj3
, _mm_add_ss( G
, t3
) );
2729 _mm_store_ss( dvdaj4
, _mm_add_ss( H
, t4
) );
/*
 * Three-entry variant of the GBtab table kernel: lanes 0..2 carry real j
 * entries. Lane 3 of isaj duplicates *isaj3, but its table row is zero, so
 * the table-derived values VV and FF are zero in lane 3.
 *
 * r           per-lane value; r*gbtabscale*isai*isaj gives the table position
 * isai        multiplied lane-wise with the gathered isaj values
 * isaj1..3    one float is read from each pointer (lanes 0..2)
 * gbtabscale  table scale; multiplied by isai*isaj before use
 * GBtab       table, read with aligned 4-float loads at GBtab + 4*n
 * qq          prefactor; multiplied by isai*isaj before use
 * dvdasum     accumulator; 0.5*(vgb + ftmp*r) is added
 * dvdaj1..3   lane k of 0.5*(vgb + ftmp*r)*isaj*isaj is added to *dvdaj(k+1)
 *
 * NOTE(review): *vgbtot is updated below, yet no vgbtot parameter, no
 * declaration of n0 and no return statement are visible in this listing;
 * presumably ftmp is the returned force -- confirm. t4 and n_d are declared
 * but not used in the visible code.
 */
2736 /* Return force should be multiplied by +rinv to get fscal */
2737 static inline __m128
2738 gmx_mm_int_3_genborn_ps(__m128 r
, __m128 isai
,
2739 float * isaj1
, float *isaj2
, float *isaj3
,
2740 __m128 gbtabscale
, float * GBtab
, __m128 qq
, __m128
*dvdasum
,
2741 float *dvdaj1
, float *dvdaj2
, float *dvdaj3
,
2744 const __m128 half
= {0.5,0.5,0.5,0.5};
2746 __m128 rt
,eps
,eps2
,Y
,F
,G
,H
,VV
,FF
,ftmp
,isaprod
,t2
,t3
,t4
,isaj
,vgb
,dvdatmp
;
2748 int n_a
,n_b
,n_c
,n_d
;
/* Gather the three scalar isaj values; *isaj3 is duplicated into lane 3 */
2751 isaj
= _mm_load_ss(isaj1
);
2752 t2
= _mm_load_ss(isaj2
);
2753 t3
= _mm_load_ss(isaj3
);
2754 isaj
= _mm_unpacklo_ps(isaj
,t2
); /* - - t2 t1 */
2755 t3
= _mm_unpacklo_ps(t3
,t3
); /* - - t3 t3 */
2756 isaj
= _mm_movelh_ps(isaj
,t3
); /* t3 t3 t2 t1 */
/* Both the prefactor and the table scale are multiplied by isai*isaj */
2758 isaprod
= _mm_mul_ps(isai
,isaj
);
2759 qq
= _mm_mul_ps(qq
,isaprod
);
2760 gbtabscale
= _mm_mul_ps( isaprod
, gbtabscale
);
/* Table position rt, truncated integer index n0 and fraction eps */
2762 rt
= _mm_mul_ps(r
,gbtabscale
);
2763 n0
= _mm_cvttps_epi32(rt
);
2764 eps
= _mm_sub_ps(rt
, _mm_cvtepi32_ps(n0
));
2765 eps2
= _mm_mul_ps(eps
,eps
);
2767 /* Extract indices from n0 */
2768 n_a
= gmx_mm_extract_epi32(n0
,0);
2769 n_b
= gmx_mm_extract_epi32(n0
,1);
2770 n_c
= gmx_mm_extract_epi32(n0
,2);
/* Three table points are loaded; the fourth row is zero so that lane 3 of
 * Y, F, G and H is zero after the transpose. */
2771 Y
= _mm_load_ps(GBtab
+ 4* n_a
);
2772 F
= _mm_load_ps(GBtab
+ 4* n_b
);
2773 G
= _mm_load_ps(GBtab
+ 4* n_c
);
2774 H
= _mm_setzero_ps();
2775 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
2776 G
= _mm_mul_ps(G
,eps
); /* Geps */
2777 H
= _mm_mul_ps(H
,eps2
); /* Heps2 */
2778 F
= _mm_add_ps(_mm_add_ps(F
,G
),H
); /* Fp */
/* VV = Y + eps*Fp ; FF = Fp + G*eps + 2*H*eps^2 */
2780 VV
= _mm_add_ps(Y
, _mm_mul_ps(eps
,F
));
2781 FF
= _mm_add_ps(_mm_add_ps(F
,G
), _mm_add_ps(H
,H
));
2783 vgb
= _mm_mul_ps(qq
, VV
);
2784 *vgbtot
= _mm_sub_ps(*vgbtot
,vgb
); /* Yes, the sign is correct */
2786 ftmp
= _mm_mul_ps(_mm_mul_ps(qq
, FF
), gbtabscale
);
2788 dvdatmp
= _mm_mul_ps(half
, _mm_add_ps(vgb
,_mm_mul_ps(ftmp
,r
)));
2790 *dvdasum
= _mm_add_ps(*dvdasum
,dvdatmp
);
/* The per-j contribution is additionally scaled by isaj^2 */
2792 dvdatmp
= _mm_mul_ps(_mm_mul_ps(dvdatmp
,isaj
), isaj
);
2794 /* Update 3 dvda[j] values */
2795 Y
= _mm_load_ss(dvdaj1
);
2796 F
= _mm_load_ss(dvdaj2
);
2797 G
= _mm_load_ss(dvdaj3
);
/* Move lanes 2 and 1 of dvdatmp into lane 0 of t3 and t2 */
2798 t3
= _mm_movehl_ps(_mm_setzero_ps(),dvdatmp
);
2799 t2
= _mm_shuffle_ps(dvdatmp
,dvdatmp
,_MM_SHUFFLE(0,0,0,1));
2801 _mm_store_ss( dvdaj1
, _mm_add_ss( Y
, dvdatmp
) );
2802 _mm_store_ss( dvdaj2
, _mm_add_ss( F
, t2
) );
2803 _mm_store_ss( dvdaj3
, _mm_add_ss( G
, t3
) );
/*
 * Two-entry variant of the GBtab table kernel: lanes 0..1 carry real j
 * entries. Lanes 2..3 of isaj are zero (they come from the zeroed upper lanes
 * of the scalar loads), which makes isaprod, qq and gbtabscale zero there.
 *
 * r           per-lane value; r*gbtabscale*isai*isaj gives the table position
 * isai        multiplied lane-wise with the gathered isaj values
 * isaj1..2    one float is read from each pointer (lanes 0..1)
 * gbtabscale  table scale; multiplied by isai*isaj before use
 * GBtab       table, read with aligned 4-float loads at GBtab + 4*n
 * qq          prefactor; multiplied by isai*isaj before use
 * dvdasum     accumulator; 0.5*(vgb + ftmp*r) is added
 * dvdaj1..2   lane k of 0.5*(vgb + ftmp*r)*isaj*isaj is added to *dvdaj(k+1)
 *
 * NOTE(review): *vgbtot is updated below, yet no vgbtot parameter, no
 * declaration of n0 and no return statement are visible in this listing;
 * presumably ftmp is the returned force -- confirm. t3, t4, n_c and n_d are
 * declared but not used in the visible code.
 */
2811 /* Return force should be multiplied by +rinv to get fscal */
2812 static inline __m128
2813 gmx_mm_int_2_genborn_ps(__m128 r
, __m128 isai
,
2814 float * isaj1
, float *isaj2
,
2815 __m128 gbtabscale
, float * GBtab
, __m128 qq
, __m128
*dvdasum
,
2816 float *dvdaj1
, float *dvdaj2
,
2819 const __m128 half
= {0.5,0.5,0.5,0.5};
2821 __m128 rt
,eps
,eps2
,Y
,F
,G
,H
,VV
,FF
,ftmp
,isaprod
,t2
,t3
,t4
,isaj
,vgb
,dvdatmp
;
2823 int n_a
,n_b
,n_c
,n_d
;
/* Gather the two scalar isaj values into lanes 0 and 1 */
2826 isaj
= _mm_load_ss(isaj1
);
2827 t2
= _mm_load_ss(isaj2
);
2828 isaj
= _mm_unpacklo_ps(isaj
,t2
); /* - - t2 t1 */
/* Both the prefactor and the table scale are multiplied by isai*isaj */
2830 isaprod
= _mm_mul_ps(isai
,isaj
);
2831 qq
= _mm_mul_ps(qq
,isaprod
);
2832 gbtabscale
= _mm_mul_ps( isaprod
, gbtabscale
);
/* Table position rt, truncated integer index n0 and fraction eps */
2834 rt
= _mm_mul_ps(r
,gbtabscale
);
2835 n0
= _mm_cvttps_epi32(rt
);
2836 eps
= _mm_sub_ps(rt
, _mm_cvtepi32_ps(n0
));
2837 eps2
= _mm_mul_ps(eps
,eps
);
2839 /* Extract indices from n0 */
2840 n_a
= gmx_mm_extract_epi32(n0
,0);
2841 n_b
= gmx_mm_extract_epi32(n0
,1);
/* Two table points are loaded; the other two rows are zero so that lanes
 * 2..3 of Y, F, G and H are zero after the transpose. */
2842 Y
= _mm_load_ps(GBtab
+ 4* n_a
);
2843 F
= _mm_load_ps(GBtab
+ 4* n_b
);
2844 G
= _mm_setzero_ps();
2845 H
= _mm_setzero_ps();
2846 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
2847 G
= _mm_mul_ps(G
,eps
); /* Geps */
2848 H
= _mm_mul_ps(H
,eps2
); /* Heps2 */
2849 F
= _mm_add_ps(_mm_add_ps(F
,G
),H
); /* Fp */
/* VV = Y + eps*Fp ; FF = Fp + G*eps + 2*H*eps^2 */
2851 VV
= _mm_add_ps(Y
, _mm_mul_ps(eps
,F
));
2852 FF
= _mm_add_ps(_mm_add_ps(F
,G
), _mm_add_ps(H
,H
));
2854 vgb
= _mm_mul_ps(qq
, VV
);
2855 *vgbtot
= _mm_sub_ps(*vgbtot
,vgb
); /* Yes, the sign is correct */
2857 ftmp
= _mm_mul_ps(_mm_mul_ps(qq
, FF
), gbtabscale
);
2859 dvdatmp
= _mm_mul_ps(half
, _mm_add_ps(vgb
,_mm_mul_ps(ftmp
,r
)));
2861 *dvdasum
= _mm_add_ps(*dvdasum
,dvdatmp
);
/* The per-j contribution is additionally scaled by isaj^2 */
2863 dvdatmp
= _mm_mul_ps(_mm_mul_ps(dvdatmp
,isaj
), isaj
);
2865 /* Update 2 dvda[j] values */
2866 Y
= _mm_load_ss(dvdaj1
);
2867 F
= _mm_load_ss(dvdaj2
);
/* Move lane 1 of dvdatmp into lane 0 of t2 */
2868 t2
= _mm_shuffle_ps(dvdatmp
,dvdatmp
,_MM_SHUFFLE(0,0,0,1));
2870 _mm_store_ss( dvdaj1
, _mm_add_ss( Y
, dvdatmp
) );
2871 _mm_store_ss( dvdaj2
, _mm_add_ss( F
, t2
) );
/*
 * Single-entry variant of the GBtab table kernel: only lane 0 carries a real
 * j entry. The scalar load leaves lanes 1..3 of isaj zero, which makes
 * isaprod, qq and gbtabscale zero there.
 *
 * r           per-lane value; r*gbtabscale*isai*isaj gives the table position
 * isai        multiplied lane-wise with isaj
 * gbtabscale  table scale; multiplied by isai*isaj before use
 * GBtab       table, read with one aligned 4-float load at GBtab + 4*n_a
 * qq          prefactor; multiplied by isai*isaj before use
 * dvdasum     accumulator; 0.5*(vgb + ftmp*r) is added
 *
 * NOTE(review): isaj1, dvdaj1 and vgbtot are used below, yet they are not in
 * the visible parameter list; the declaration of n0 and the return statement
 * are also not visible. Presumably those lines are missing and ftmp is the
 * returned force -- confirm. t2, t3, t4, n_b, n_c and n_d are declared but
 * not used in the visible code.
 */
2876 /* Return force should be multiplied by +rinv to get fscal */
2877 static inline __m128
2878 gmx_mm_int_1_genborn_ps(__m128 r
, __m128 isai
,
2880 __m128 gbtabscale
, float * GBtab
, __m128 qq
, __m128
*dvdasum
,
2884 const __m128 half
= {0.5,0.5,0.5,0.5};
2886 __m128 rt
,eps
,eps2
,Y
,F
,G
,H
,VV
,FF
,ftmp
,isaprod
,t2
,t3
,t4
,isaj
,vgb
,dvdatmp
;
2888 int n_a
,n_b
,n_c
,n_d
;
/* Scalar load: lane 0 <- *isaj1, lanes 1..3 <- 0 */
2891 isaj
= _mm_load_ss(isaj1
);
/* Both the prefactor and the table scale are multiplied by isai*isaj */
2893 isaprod
= _mm_mul_ps(isai
,isaj
);
2894 qq
= _mm_mul_ps(qq
,isaprod
);
2895 gbtabscale
= _mm_mul_ps( isaprod
, gbtabscale
);
/* Table position rt, truncated integer index n0 and fraction eps */
2897 rt
= _mm_mul_ps(r
,gbtabscale
);
2898 n0
= _mm_cvttps_epi32(rt
);
2899 eps
= _mm_sub_ps(rt
, _mm_cvtepi32_ps(n0
));
2900 eps2
= _mm_mul_ps(eps
,eps
);
2902 /* Extract indices from n0 */
2903 n_a
= gmx_mm_extract_epi32(n0
,0);
/* One table point is loaded; the other rows are zero so that only lane 0 of
 * Y, F, G and H is non-zero after the transpose. */
2904 Y
= _mm_load_ps(GBtab
+ 4* n_a
);
2905 F
= _mm_setzero_ps();
2906 G
= _mm_setzero_ps();
2907 H
= _mm_setzero_ps();
2908 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
2909 G
= _mm_mul_ps(G
,eps
); /* Geps */
2910 H
= _mm_mul_ps(H
,eps2
); /* Heps2 */
2911 F
= _mm_add_ps(_mm_add_ps(F
,G
),H
); /* Fp */
/* VV = Y + eps*Fp ; FF = Fp + G*eps + 2*H*eps^2 */
2913 VV
= _mm_add_ps(Y
, _mm_mul_ps(eps
,F
));
2914 FF
= _mm_add_ps(_mm_add_ps(F
,G
), _mm_add_ps(H
,H
));
2916 vgb
= _mm_mul_ps(qq
, VV
);
2917 *vgbtot
= _mm_sub_ps(*vgbtot
,vgb
); /* Yes, the sign is correct */
2919 ftmp
= _mm_mul_ps(_mm_mul_ps(qq
, FF
), gbtabscale
);
2921 dvdatmp
= _mm_mul_ps(half
, _mm_add_ps(vgb
,_mm_mul_ps(ftmp
,r
)));
2923 *dvdasum
= _mm_add_ps(*dvdasum
,dvdatmp
);
/* The per-j contribution is additionally scaled by isaj^2 */
2925 dvdatmp
= _mm_mul_ps(_mm_mul_ps(dvdatmp
,isaj
), isaj
);
2927 /* Update 1 dvda[j] value */
2928 Y
= _mm_load_ss(dvdaj1
);
2930 _mm_store_ss( dvdaj1
, _mm_add_ss( Y
, dvdatmp
) );
/*
 * Horizontally reduces the x/y/z force registers of one atom and adds the
 * three sums to fptr[0..2] and to fshiftptr[0..2].
 *
 * fix1,fiy1,fiz1  per-lane partial forces; each register is summed over its
 *                 four lanes
 *
 * NOTE(review): two reduction sequences appear back to back below (one using
 * _mm_hadd_ps, one using _MM_TRANSPOSE4_PS). Executing both would be wrong,
 * so they are presumably alternative branches of a preprocessor conditional
 * (SSE3 vs. plain SSE/SSE2) that is not visible in this listing -- confirm.
 * The fptr/fshiftptr parameters, the declarations of t1..t3 and any
 * initialisation of t1 before the transpose are likewise not visible.
 */
2940 gmx_mm_update_iforce_1atom_ps(__m128 fix1
, __m128 fiy1
, __m128 fiz1
,
/* Reduction variant 1: pairwise horizontal adds */
2947 fix1
= _mm_hadd_ps(fix1
,fix1
);
2948 fiy1
= _mm_hadd_ps(fiy1
,fiz1
);
2950 fix1
= _mm_hadd_ps(fix1
,fiy1
); /* fiz1 fiy1 fix1 fix1 */
/* Reduction variant 2: transpose, then add the four rows */
2953 /* transpose data */
2955 _MM_TRANSPOSE4_PS(fix1
,t1
,fiy1
,fiz1
);
2956 fix1
= _mm_add_ps(_mm_add_ps(fix1
,t1
), _mm_add_ps(fiy1
,fiz1
));
/* Load x into lane 0 and y,z into lanes 2..3, matching the layout of fix1 */
2958 t2
= _mm_load_ss(fptr
);
2959 t2
= _mm_loadh_pi(t2
,(__m64
*)(fptr
+1));
2960 t3
= _mm_load_ss(fshiftptr
);
2961 t3
= _mm_loadh_pi(t3
,(__m64
*)(fshiftptr
+1));
2963 t2
= _mm_add_ps(t2
,fix1
);
2964 t3
= _mm_add_ps(t3
,fix1
);
/* Write back lane 0 and lanes 2..3; lane 1 is not stored */
2966 _mm_store_ss(fptr
,t2
);
2967 _mm_storeh_pi((__m64
*)(fptr
+1),t2
);
2968 _mm_store_ss(fshiftptr
,t3
);
2969 _mm_storeh_pi((__m64
*)(fshiftptr
+1),t3
);
/*
 * Horizontally reduces the x/y/z force registers of two atoms, adds the six
 * sums to fptr[0..5] (unaligned access) and adds the per-component total of
 * both atoms to fshiftptr[0..2].
 *
 * fix1..fiz2  per-lane partial forces; each register is summed over its four
 *             lanes
 *
 * NOTE(review): two reduction sequences appear back to back below (one using
 * _mm_hadd_ps, one using _MM_TRANSPOSE4_PS and unpack/movehl). They are
 * presumably alternative branches of a preprocessor conditional (SSE3 vs.
 * plain SSE/SSE2) that is not visible in this listing -- confirm. The
 * fptr/fshiftptr parameters and the declarations of t1, t2 and t4 are
 * likewise not visible.
 */
2973 gmx_mm_update_iforce_2atoms_ps(__m128 fix1
, __m128 fiy1
, __m128 fiz1
,
2974 __m128 fix2
, __m128 fiy2
, __m128 fiz2
,
/* Reduction variant 1: pairwise horizontal adds */
2981 fix1
= _mm_hadd_ps(fix1
,fiy1
);
2982 fiz1
= _mm_hadd_ps(fiz1
,fix2
);
2983 fiy2
= _mm_hadd_ps(fiy2
,fiz2
);
2985 fix1
= _mm_hadd_ps(fix1
,fiz1
); /* fix2 fiz1 fiy1 fix1 */
2986 fiy2
= _mm_hadd_ps(fiy2
,fiy2
); /* - - fiz2 fiy2 */
/* Reduction variant 2: transpose and add rows for the first four registers,
 * unpack/add for the remaining two */
2989 /* transpose data */
2990 _MM_TRANSPOSE4_PS(fix1
,fiy1
,fiz1
,fix2
);
2991 t1
= _mm_unpacklo_ps(fiy2
,fiz2
);
2992 t2
= _mm_unpackhi_ps(fiy2
,fiz2
);
2994 fix1
= _mm_add_ps(_mm_add_ps(fix1
,fiy1
), _mm_add_ps(fiz1
,fix2
));
2995 t1
= _mm_add_ps(t1
,t2
);
2996 t2
= _mm_movehl_ps(t2
,t1
);
2997 fiy2
= _mm_add_ps(t1
,t2
);
/* fptr[0..3] += (fix1 fiy1 fiz1 fix2); fptr[4..5] += (fiy2 fiz2) */
2999 _mm_storeu_ps(fptr
, _mm_add_ps(fix1
,_mm_loadu_ps(fptr
) ));
3000 t1
= _mm_loadl_pi(t1
,(__m64
*)(fptr
+4));
3001 _mm_storel_pi((__m64
*)(fptr
+4), _mm_add_ps(fiy2
,t1
));
/* Shift force is held as: lane 0 = z, lanes 2..3 = x,y */
3003 t4
= _mm_load_ss(fshiftptr
+2);
3004 t4
= _mm_loadh_pi(t4
,(__m64
*)(fshiftptr
));
3006 t1
= _mm_shuffle_ps(fix1
,fiy2
,_MM_SHUFFLE(0,0,3,2)); /* fiy2 - fix2 fiz1 */
3007 t1
= _mm_shuffle_ps(t1
,t1
,_MM_SHUFFLE(3,1,0,0)); /* fiy2 fix2 - fiz1 */
3008 t2
= _mm_shuffle_ps(fiy2
,fix1
,_MM_SHUFFLE(1,0,0,1)); /* fiy1 fix1 - fiz2 */
3010 t1
= _mm_add_ps(t1
,t2
);
3011 t1
= _mm_add_ps(t1
,t4
); /* y x - z */
3013 _mm_store_ss(fshiftptr
+2,t1
);
3014 _mm_storeh_pi((__m64
*)(fshiftptr
),t1
);
/*
 * Horizontally reduces the x/y/z force registers of three atoms, adds the
 * nine sums to fptr[0..8] (unaligned access) and adds the per-component total
 * of all three atoms to fshiftptr[0..2].
 *
 * fix1..fiz3  per-lane partial forces; each register is summed over its four
 *             lanes
 *
 * NOTE(review): two reduction sequences appear back to back below (one using
 * _mm_hadd_ps, one using _MM_TRANSPOSE4_PS and shuffles). They are presumably
 * alternative branches of a preprocessor conditional (SSE3 vs. plain
 * SSE/SSE2) that is not visible in this listing -- confirm. The
 * fptr/fshiftptr parameters and the declarations of t1..t4 are likewise not
 * visible.
 */
3020 gmx_mm_update_iforce_3atoms_ps(__m128 fix1
, __m128 fiy1
, __m128 fiz1
,
3021 __m128 fix2
, __m128 fiy2
, __m128 fiz2
,
3022 __m128 fix3
, __m128 fiy3
, __m128 fiz3
,
/* Reduction variant 1: pairwise horizontal adds */
3029 fix1
= _mm_hadd_ps(fix1
,fiy1
);
3030 fiz1
= _mm_hadd_ps(fiz1
,fix2
);
3031 fiy2
= _mm_hadd_ps(fiy2
,fiz2
);
3032 fix3
= _mm_hadd_ps(fix3
,fiy3
);
3033 fiz3
= _mm_hadd_ps(fiz3
,fiz3
);
3035 fix1
= _mm_hadd_ps(fix1
,fiz1
); /* fix2 fiz1 fiy1 fix1 */
3036 fiy2
= _mm_hadd_ps(fiy2
,fix3
); /* fiy3 fix3 fiz2 fiy2 */
3037 fiz3
= _mm_hadd_ps(fiz3
,fiz3
); /* - - - fiz3 */
/* Reduction variant 2: transpose and add rows for the first eight registers;
 * the four lanes of fiz3 are summed into lane 0 with shuffles */
3040 /* transpose data */
3041 _MM_TRANSPOSE4_PS(fix1
,fiy1
,fiz1
,fix2
);
3042 _MM_TRANSPOSE4_PS(fiy2
,fiz2
,fix3
,fiy3
);
3043 t2
= _mm_movehl_ps(_mm_setzero_ps(),fiz3
);
3044 t1
= _mm_shuffle_ps(fiz3
,fiz3
,_MM_SHUFFLE(0,0,0,1));
3045 t3
= _mm_shuffle_ps(t2
,t2
,_MM_SHUFFLE(0,0,0,1));
3047 fix1
= _mm_add_ps(_mm_add_ps(fix1
,fiy1
), _mm_add_ps(fiz1
,fix2
));
3048 fiy2
= _mm_add_ps(_mm_add_ps(fiy2
,fiz2
), _mm_add_ps(fix3
,fiy3
));
3049 fiz3
= _mm_add_ss(_mm_add_ps(fiz3
,t1
) , _mm_add_ps(t2
,t3
));
/* fptr[0..3], fptr[4..7] and fptr[8] are updated in place */
3051 _mm_storeu_ps(fptr
, _mm_add_ps(fix1
,_mm_loadu_ps(fptr
) ));
3052 _mm_storeu_ps(fptr
+4,_mm_add_ps(fiy2
,_mm_loadu_ps(fptr
+4)));
3053 _mm_store_ss (fptr
+8,_mm_add_ss(fiz3
,_mm_load_ss(fptr
+8) ));
/* Shift force is held as: lane 0 = z, lanes 2..3 = x,y */
3055 t4
= _mm_load_ss(fshiftptr
+2);
3056 t4
= _mm_loadh_pi(t4
,(__m64
*)(fshiftptr
));
3058 t1
= _mm_shuffle_ps(fiz3
,fix1
,_MM_SHUFFLE(1,0,0,0)); /* fiy1 fix1 - fiz3 */
3059 t2
= _mm_shuffle_ps(fix1
,fiy2
,_MM_SHUFFLE(3,2,2,2)); /* fiy3 fix3 - fiz1 */
3060 t3
= _mm_shuffle_ps(fiy2
,fix1
,_MM_SHUFFLE(3,3,0,1)); /* fix2 fix2 fiy2 fiz2 */
3061 t3
= _mm_shuffle_ps(t3
,t3
,_MM_SHUFFLE(1,2,0,0)); /* fiy2 fix2 - fiz2 */
3063 t1
= _mm_add_ps(t1
,t2
);
3064 t3
= _mm_add_ps(t3
,t4
);
3065 t1
= _mm_add_ps(t1
,t3
); /* y x - z */
3067 _mm_store_ss(fshiftptr
+2,t1
);
3068 _mm_storeh_pi((__m64
*)(fshiftptr
),t1
);
/*
 * Horizontally reduces the x/y/z force registers of four atoms, adds the
 * twelve sums to fptr[0..11] (unaligned access) and adds the per-component
 * total of all four atoms to fshiftptr[0..2].
 *
 * fix1..fiz4  per-lane partial forces; each register is summed over its four
 *             lanes
 *
 * NOTE(review): two reduction sequences appear back to back below (one using
 * _mm_hadd_ps, one using _MM_TRANSPOSE4_PS). They are presumably alternative
 * branches of a preprocessor conditional (SSE3 vs. plain SSE/SSE2) that is
 * not visible in this listing -- confirm. The fptr/fshiftptr parameters are
 * likewise not visible.
 */
3073 gmx_mm_update_iforce_4atoms_ps(__m128 fix1
, __m128 fiy1
, __m128 fiz1
,
3074 __m128 fix2
, __m128 fiy2
, __m128 fiz2
,
3075 __m128 fix3
, __m128 fiy3
, __m128 fiz3
,
3076 __m128 fix4
, __m128 fiy4
, __m128 fiz4
,
3080 __m128 t1
,t2
,t3
,t4
,t5
;
/* Reduction variant 1: pairwise horizontal adds */
3083 fix1
= _mm_hadd_ps(fix1
,fiy1
);
3084 fiz1
= _mm_hadd_ps(fiz1
,fix2
);
3085 fiy2
= _mm_hadd_ps(fiy2
,fiz2
);
3086 fix3
= _mm_hadd_ps(fix3
,fiy3
);
3087 fiz3
= _mm_hadd_ps(fiz3
,fix4
);
3088 fiy4
= _mm_hadd_ps(fiy4
,fiz4
);
3090 fix1
= _mm_hadd_ps(fix1
,fiz1
); /* fix2 fiz1 fiy1 fix1 */
3091 fiy2
= _mm_hadd_ps(fiy2
,fix3
); /* fiy3 fix3 fiz2 fiy2 */
3092 fiz3
= _mm_hadd_ps(fiz3
,fiy4
); /* fiz4 fiy4 fix4 fiz3 */
/* Reduction variant 2: three 4x4 transposes followed by row additions */
3095 /* transpose data */
3096 _MM_TRANSPOSE4_PS(fix1
,fiy1
,fiz1
,fix2
);
3097 _MM_TRANSPOSE4_PS(fiy2
,fiz2
,fix3
,fiy3
);
3098 _MM_TRANSPOSE4_PS(fiz3
,fix4
,fiy4
,fiz4
);
3100 fix1
= _mm_add_ps(_mm_add_ps(fix1
,fiy1
), _mm_add_ps(fiz1
,fix2
));
3101 fiy2
= _mm_add_ps(_mm_add_ps(fiy2
,fiz2
), _mm_add_ps(fix3
,fiy3
));
3102 fiz3
= _mm_add_ps(_mm_add_ps(fiz3
,fix4
), _mm_add_ps(fiy4
,fiz4
));
/* fptr[0..3], fptr[4..7] and fptr[8..11] are updated in place */
3104 _mm_storeu_ps(fptr
, _mm_add_ps(fix1
,_mm_loadu_ps(fptr
) ));
3105 _mm_storeu_ps(fptr
+4,_mm_add_ps(fiy2
,_mm_loadu_ps(fptr
+4)));
3106 _mm_storeu_ps(fptr
+8,_mm_add_ps(fiz3
,_mm_loadu_ps(fptr
+8)));
/* Shift force is held as: lane 0 = z, lanes 2..3 = x,y */
3108 t5
= _mm_load_ss(fshiftptr
+2);
3109 t5
= _mm_loadh_pi(t5
,(__m64
*)(fshiftptr
));
3111 t1
= _mm_shuffle_ps(fix1
,fix1
,_MM_SHUFFLE(1,0,2,2)); /* fiy1 fix1 - fiz1 */
3112 t2
= _mm_shuffle_ps(fiy2
,fiy2
,_MM_SHUFFLE(3,2,1,1)); /* fiy3 fix3 - fiz2 */
3113 t3
= _mm_shuffle_ps(fiz3
,fiz3
,_MM_SHUFFLE(2,1,0,0)); /* fiy4 fix4 - fiz3 */
3114 t4
= _mm_shuffle_ps(fix1
,fiy2
,_MM_SHUFFLE(0,0,3,3)); /* fiy2 fiy2 fix2 fix2 */
3115 t4
= _mm_shuffle_ps(fiz3
,t4
,_MM_SHUFFLE(2,0,3,3)); /* fiy2 fix2 - fiz4 */
3117 t1
= _mm_add_ps(t1
,t2
);
3118 t3
= _mm_add_ps(t3
,t4
);
3119 t1
= _mm_add_ps(t1
,t3
); /* y x - z */
3120 t5
= _mm_add_ps(t5
,t1
);
3122 _mm_store_ss(fshiftptr
+2,t5
);
3123 _mm_storeh_pi((__m64
*)(fshiftptr
),t5
);
/*
 * Adds the sum of the four lanes of pot1 to the float at ptr1.
 *
 * pot1  register whose four lanes are summed
 * ptr1  scalar accumulator, read and written once
 *
 * NOTE(review): two reductions appear back to back below (two _mm_hadd_ps
 * calls, then movehl/shuffle adds). Running both would add four times the
 * sum, so they are presumably alternative branches of a preprocessor
 * conditional (SSE3 vs. plain SSE/SSE2) not visible in this listing -- confirm.
 */
3128 gmx_mm_update_1pot_ps(__m128 pot1
, float *ptr1
)
/* Reduction variant 1: two horizontal adds leave the total in every lane */
3131 pot1
= _mm_hadd_ps(pot1
,pot1
);
3132 pot1
= _mm_hadd_ps(pot1
,pot1
);
/* Reduction variant 2: fold the upper half onto the lower, then lane 1 onto lane 0 */
3135 pot1
= _mm_add_ps(pot1
,_mm_movehl_ps(pot1
,pot1
));
3136 pot1
= _mm_add_ps(pot1
,_mm_shuffle_ps(pot1
,pot1
,_MM_SHUFFLE(0,0,0,1)));
/* Only lane 0 is added to *ptr1 */
3138 _mm_store_ss(ptr1
,_mm_add_ss(pot1
,_mm_load_ss(ptr1
)));
/*
 * Adds the four-lane sum of pot1 to *ptr1 and the four-lane sum of pot2 to
 * *ptr2.
 *
 * pot1,pot2  registers whose four lanes are summed independently
 * ptr1,ptr2  scalar accumulators, each read and written once
 *
 * NOTE(review): two reductions appear back to back below (an _mm_hadd_ps
 * sequence, then a movehl/movelh/shuffle sequence). They are presumably
 * alternative branches of a preprocessor conditional (SSE3 vs. plain
 * SSE/SSE2) not visible in this listing -- confirm. The declarations of t1
 * and t2 are likewise not visible.
 */
3143 gmx_mm_update_2pot_ps(__m128 pot1
, float *ptr1
, __m128 pot2
, float *ptr2
)
/* Reduction variant 1: after two horizontal adds lane 0 holds sum(pot1) and
 * lane 1 holds sum(pot2); the shuffle moves lane 1 into lane 0 of pot2 */
3146 pot1
= _mm_hadd_ps(pot1
,pot2
);
3147 pot1
= _mm_hadd_ps(pot1
,pot1
);
3148 pot2
= _mm_shuffle_ps(pot1
,pot1
,_MM_SHUFFLE(0,0,0,1));
/* Reduction variant 2: combine halves of both inputs, then add neighbouring
 * lanes so that lane 0 = sum(pot1) and lane 2 = sum(pot2) */
3152 t1
= _mm_movehl_ps(pot2
,pot1
); /* 2d 2c 1d 1c */
3153 t2
= _mm_movelh_ps(pot1
,pot2
); /* 2b 2a 1b 1a */
3154 t1
= _mm_add_ps(t1
,t2
); /* 2 2 1 1 */
3155 t2
= _mm_shuffle_ps(t1
,t1
,_MM_SHUFFLE(3,3,1,1));
3156 pot1
= _mm_add_ps(t1
,t2
); /* - 2 - 1 */
3157 pot2
= _mm_movehl_ps(t2
,pot1
); /* - - - 2 */
/* Only lane 0 of each result is added to its accumulator */
3160 _mm_store_ss(ptr1
,_mm_add_ss(pot1
,_mm_load_ss(ptr1
)));
3161 _mm_store_ss(ptr2
,_mm_add_ss(pot2
,_mm_load_ss(ptr2
)));
/*
 * Adds the four-lane sum of each of pot1..pot4 to the float at the matching
 * ptr1..ptr4.
 *
 * pot1..pot4  registers whose four lanes are summed independently
 * ptr1..ptr4  scalar accumulators, each read and written once
 *
 * NOTE(review): the return type and enclosing braces are not present in this
 * listing.
 */
3166 gmx_mm_update_4pot_ps(__m128 pot1
, float *ptr1
, __m128 pot2
, float *ptr2
, __m128 pot3
, float *ptr3
, __m128 pot4
, float *ptr4
)
/* After the transpose, adding the four rows leaves sum(pot(k+1)) in lane k */
3168 _MM_TRANSPOSE4_PS(pot1
,pot2
,pot3
,pot4
);
3170 pot1
= _mm_add_ps(_mm_add_ps(pot1
,pot2
),_mm_add_ps(pot3
,pot4
));
/* Broadcast lanes 1..3 so that each sum is available in lane 0 */
3171 pot2
= _mm_shuffle_ps(pot1
,pot1
,_MM_SHUFFLE(1,1,1,1));
3172 pot3
= _mm_shuffle_ps(pot1
,pot1
,_MM_SHUFFLE(2,2,2,2));
3173 pot4
= _mm_shuffle_ps(pot1
,pot1
,_MM_SHUFFLE(3,3,3,3));
/* Only lane 0 of each register is added to its accumulator */
3175 _mm_store_ss(ptr1
,_mm_add_ss(pot1
,_mm_load_ss(ptr1
)));
3176 _mm_store_ss(ptr2
,_mm_add_ss(pot2
,_mm_load_ss(ptr2
)));
3177 _mm_store_ss(ptr3
,_mm_add_ss(pot3
,_mm_load_ss(ptr3
)));
3178 _mm_store_ss(ptr4
,_mm_add_ss(pot4
,_mm_load_ss(ptr4
)));