/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 *
 * This source code is part of GROMACS.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2009, The GROMACS Development Team
 *
 * Gromacs is a library for molecular simulation and trajectory analysis,
 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 * a full list of developers and information, check out http://www.gromacs.org
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option) any
 * later version.
 *
 * As a special exception, you may use this file as part of a free software
 * library without restriction. Specifically, if other files instantiate
 * templates or use macros or inline functions from this file, or you compile
 * this file and link it with other files to produce an executable, this
 * file does not by itself cause the resulting executable to be covered by
 * the GNU Lesser General Public License.
 *
 * In plain-speak: do not worry about classes/macros/templates either - only
 * changes to the library have to be LGPL, not an application linking with it.
 *
 * To help fund GROMACS development, we humbly ask that you cite
 * the papers people have written on it - you can find them on the website!
 */
/* GMX_MM128_HERE or GMX_MM256_HERE should be set before including this file */
#include "gmx_x86_simd_macros.h"

#define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])

#define UNROLLI    NBNXN_CPU_CLUSTER_I_SIZE
#define UNROLLJ    GMX_X86_SIMD_WIDTH_HERE
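/* The kernel processes blocks of UNROLLI i-atoms (the i-cluster size) against
 * UNROLLJ j-atoms (the SIMD register width), i.e. UNROLLI*UNROLLJ atom pairs
 * per inner-loop iteration.
 */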
#if defined GMX_MM128_HERE || defined GMX_DOUBLE
#if defined GMX_MM256_HERE && !defined GMX_DOUBLE

/* SSE single precision 4x4 kernel */
#define SUM_SIMD(x) SUM_SIMD4(x)

/* SSE double precision 4x2 kernel */
#define SUM_SIMD(x) (x[0]+x[1])

/* AVX single precision 4x8 kernel */
#define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])

/* AVX double precision 4x4 kernel */
#define SUM_SIMD(x) SUM_SIMD4(x)
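/* SUM_SIMD reduces an array holding one partial energy sum per SIMD lane to a
 * scalar; its width matches the register width of the kernel variant above.
 */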
#define SIMD_MASK_ALL 0xffffffff

#include "nbnxn_kernel_x86_simd_utils.h"

/* All functionality defines are set here, except for:
 * CALC_ENERGIES, ENERGY_GROUPS, which are defined before.
 * CHECK_EXCLS, which is set just before including the inner loop contents.
 * The combination rule defines, LJ_COMB_GEOM or LJ_COMB_LB, are currently
 * set before calling the kernel function. We might want to move that
 * to inside the n-loop and have a different combination rule for different
 * ci's, as using no combination rule gives a 50% performance hit for LJ.
 */
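/* A minimal sketch of how a kernel source file might instantiate this header;
 * the exact set of functionality defines (for instance the Coulomb-type define
 * that selects the rf versus tab name variants below) is chosen by the
 * including file and is an assumption here, not fixed by this header:
 *
 *   #define GMX_MM128_HERE
 *   #define CALC_ENERGIES
 *   #include "nbnxn_kernel_x86_simd_outer.h"
 */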
/* We always calculate shift forces, because it's cheap anyhow */
#define CALC_SHIFTFORCES

/* Assumes all LJ parameters are identical */
/* #define FIX_LJ_C */

#define NBK_FUNC_NAME_C_LJC(b,s,c,ljc,e) b##_##s##_##c##_comb_##ljc##_##e

#if defined LJ_COMB_GEOM
#define NBK_FUNC_NAME_C(b,s,c,e) NBK_FUNC_NAME_C_LJC(b,s,c,geom,e)
#if defined LJ_COMB_LB
#define NBK_FUNC_NAME_C(b,s,c,e) NBK_FUNC_NAME_C_LJC(b,s,c,lb,e)
#define NBK_FUNC_NAME_C(b,s,c,e) NBK_FUNC_NAME_C_LJC(b,s,c,none,e)

#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,rf,e)
#ifndef VDW_CUTOFF_CHECK
#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,tab,e)
#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,tab_twin,e)

#ifdef GMX_MM128_HERE
#define NBK_FUNC_NAME_S128_OR_S256(b,e) NBK_FUNC_NAME(b,x86_simd128,e)
#ifdef GMX_MM256_HERE
#define NBK_FUNC_NAME_S128_OR_S256(b,e) NBK_FUNC_NAME(b,x86_simd256,e)

#ifndef CALC_ENERGIES
NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,noener)
#ifndef ENERGY_GROUPS
NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,ener)
NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp)
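/* The token-pasting macros above build the kernel function name from the SIMD
 * flavor, the electrostatics variant (rf, tab or tab_twin), the LJ combination
 * rule (geom, lb or none) and the energy variant (noener, ener or energrp),
 * e.g. nbnxn_kernel_x86_simd128_rf_comb_geom_ener.
 */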
#undef NBK_FUNC_NAME_C
#undef NBK_FUNC_NAME_C_LJC
(const nbnxn_pairlist_t     *nbl,
 const nbnxn_atomdata_t     *nbat,
 const interaction_const_t  *ic,
#ifdef CALC_SHIFTFORCES

    const nbnxn_ci_t   *nbln;
    const nbnxn_cj_t   *l_cj;
    const real         *shiftvec;
    const real         *nbfp0,*nbfp1,*nbfp2=NULL,*nbfp3=NULL;
    gmx_bool            half_LJ,do_coul;
    int                 sci,scix,sciy,sciz,sci2;
    int                 cjind0,cjind1,cjind;
    int                 egps_ishift,egps_imask;
    int                 egps_jshift,egps_jmask,egps_jstride;
    real               *vvdwtp[UNROLLI];
    gmx_mm_pr  ix_SSE0,iy_SSE0,iz_SSE0;
    gmx_mm_pr  ix_SSE1,iy_SSE1,iz_SSE1;
    gmx_mm_pr  ix_SSE2,iy_SSE2,iz_SSE2;
    gmx_mm_pr  ix_SSE3,iy_SSE3,iz_SSE3;
    gmx_mm_pr  fix_SSE0,fiy_SSE0,fiz_SSE0;
    gmx_mm_pr  fix_SSE1,fiy_SSE1,fiz_SSE1;
    gmx_mm_pr  fix_SSE2,fiy_SSE2,fiz_SSE2;
    gmx_mm_pr  fix_SSE3,fiy_SSE3,fiz_SSE3;

    __m128     fix_SSE,fiy_SSE,fiz_SSE;
    __m256d    fix_SSE,fiy_SSE,fiz_SSE;

    __m128d    fix0_SSE,fiy0_SSE,fiz0_SSE;
    __m128d    fix2_SSE,fiy2_SSE,fiz2_SSE;
#ifndef GMX_MM256_HERE

    __m128i    mask0 = _mm_set_epi32( 0x0008, 0x0004, 0x0002, 0x0001 );
    __m128i    mask1 = _mm_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010 );
    __m128i    mask2 = _mm_set_epi32( 0x0800, 0x0400, 0x0200, 0x0100 );
    __m128i    mask3 = _mm_set_epi32( 0x8000, 0x4000, 0x2000, 0x1000 );

    /* For double precision we need to set two 32-bit ints for one double */
    __m128i    mask0 = _mm_set_epi32( 0x0002, 0x0002, 0x0001, 0x0001 );
    __m128i    mask1 = _mm_set_epi32( 0x0008, 0x0008, 0x0004, 0x0004 );
    __m128i    mask2 = _mm_set_epi32( 0x0020, 0x0020, 0x0010, 0x0010 );
    __m128i    mask3 = _mm_set_epi32( 0x0080, 0x0080, 0x0040, 0x0040 );

    /* AVX: use floating-point masks, as there are no 256-bit integer instructions */
    gmx_mm_pr  mask0 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001 ));
    gmx_mm_pr  mask1 = _mm256_castsi256_ps(_mm256_set_epi32( 0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100 ));

    /* There is no 256-bit int to double conversion, so we use float here */
    __m256     mask0 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0008, 0x0008, 0x0004, 0x0004, 0x0002, 0x0002, 0x0001, 0x0001 ));
    __m256     mask1 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0080, 0x0080, 0x0040, 0x0040, 0x0020, 0x0020, 0x0010, 0x0010 ));
    __m256     mask2 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0800, 0x0800, 0x0400, 0x0400, 0x0200, 0x0200, 0x0100, 0x0100 ));
    __m256     mask3 = _mm256_castsi256_ps(_mm256_set_epi32( 0x8000, 0x8000, 0x4000, 0x4000, 0x2000, 0x2000, 0x1000, 0x1000 ));
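    /* mask0..mask3 select, for each SIMD lane, the exclusion/interaction bit of
     * i-atom 0..3 with the corresponding j-atom; the inner loop ANDs them with
     * the nbl->cj[].excl bitfield (SIMD_MASK_ALL means all pairs interact) to
     * build the per-pair interaction masks.
     */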
#ifndef GMX_MM256_HERE

    __m128     diag_SSE0 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 ));
    __m128     diag_SSE1 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
    __m128     diag_SSE2 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0x00000000, 0x00000000, 0x00000000 ));
    __m128     diag_SSE3 = gmx_mm_castsi128_pr( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));

    __m128d    diag0_SSE0 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
    __m128d    diag0_SSE1 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
    __m128d    diag0_SSE2 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
    __m128d    diag0_SSE3 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
    __m128d    diag1_SSE0 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff ));
    __m128d    diag1_SSE1 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff ));
    __m128d    diag1_SSE2 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
    __m128d    diag1_SSE3 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));

#else /* GMX_MM256_HERE */

    gmx_mm_pr  diag0_SSE0 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 ));
    gmx_mm_pr  diag0_SSE1 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
    gmx_mm_pr  diag0_SSE2 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000 ));
    gmx_mm_pr  diag0_SSE3 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
    gmx_mm_pr  diag1_SSE0 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
    gmx_mm_pr  diag1_SSE1 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
    gmx_mm_pr  diag1_SSE2 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
    gmx_mm_pr  diag1_SSE3 = _mm256_castsi256_ps( _mm256_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));

    gmx_mm_pr  diag_SSE0 = _mm256_castsi256_pd( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
    gmx_mm_pr  diag_SSE1 = _mm256_castsi256_pd( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
    gmx_mm_pr  diag_SSE2 = _mm256_castsi256_pd( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
    gmx_mm_pr  diag_SSE3 = _mm256_castsi256_pd( _mm256_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
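    /* The diag masks zero out pairs with j <= i when a j-cluster coincides with
     * the i-cluster, so that each pair in the diagonal cluster pair is counted
     * only once and self-interactions are skipped; the diag0/diag1 pairs cover
     * the two j-halves when UNROLLJ differs from UNROLLI.
     */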
#ifndef GMX_MM256_HERE
    __m128i    zeroi_SSE = _mm_setzero_si128();
#ifdef GMX_X86_SSE4_1
    gmx_mm_pr  zero_SSE = gmx_set1_pr(0);

    gmx_mm_pr  one_SSE  = gmx_set1_pr(1.0);
    gmx_mm_pr  iq_SSE0  = gmx_setzero_pr();
    gmx_mm_pr  iq_SSE1  = gmx_setzero_pr();
    gmx_mm_pr  iq_SSE2  = gmx_setzero_pr();
    gmx_mm_pr  iq_SSE3  = gmx_setzero_pr();
    gmx_mm_pr  hrc_3_SSE,moh_rc_SSE;
    /* Coulomb table variables */
    gmx_mm_pr   invtsp_SSE;
    const real *tab_coul_F;
    const real *tab_coul_V;
#ifdef GMX_MM256_HERE
    int         ti0_array[2*UNROLLJ-1],*ti0;
    int         ti1_array[2*UNROLLJ-1],*ti1;
    int         ti2_array[2*UNROLLJ-1],*ti2;
    int         ti3_array[2*UNROLLJ-1],*ti3;

    gmx_mm_pr   mhalfsp_SSE;
    gmx_mm_pr   sh_ewald_SSE;
    gmx_mm_pr  hsig_i_SSE0,seps_i_SSE0;
    gmx_mm_pr  hsig_i_SSE1,seps_i_SSE1;
    gmx_mm_pr  hsig_i_SSE2,seps_i_SSE2;
    gmx_mm_pr  hsig_i_SSE3,seps_i_SSE3;

    real       pvdw_array[2*UNROLLI*UNROLLJ+3];
    real      *pvdw_c6,*pvdw_c12;
    gmx_mm_pr  c6_SSE0,c12_SSE0;
    gmx_mm_pr  c6_SSE1,c12_SSE1;
    gmx_mm_pr  c6_SSE2,c12_SSE2;
    gmx_mm_pr  c6_SSE3,c12_SSE3;

    gmx_mm_pr  c6s_SSE0,c12s_SSE0;
    gmx_mm_pr  c6s_SSE1,c12s_SSE1;
    gmx_mm_pr  c6s_SSE2 = gmx_setzero_pr(),c12s_SSE2 = gmx_setzero_pr();
    gmx_mm_pr  c6s_SSE3 = gmx_setzero_pr(),c12s_SSE3 = gmx_setzero_pr();
#endif /* LJ_COMB_LB */
    gmx_mm_pr  vctotSSE,VvdwtotSSE;
    gmx_mm_pr  sixthSSE,twelvethSSE;

    gmx_mm_pr  avoid_sing_SSE;
#ifdef VDW_CUTOFF_CHECK
    gmx_mm_pr  rcvdw2_SSE;

    gmx_mm_pr  sh_invrc6_SSE,sh_invrc12_SSE;

    /* cppcheck-suppress unassignedVariable */
    real       tmpsum_array[15],*tmpsum;
#ifdef CALC_SHIFTFORCES
    /* cppcheck-suppress unassignedVariable */
    real       shf_array[15],*shf;
#if defined LJ_COMB_GEOM || defined LJ_COMB_LB
    /* No combination rule used */
    nbfp_ptr    = nbat->nbfp_s4;
#define NBFP_STRIDE  4
    nbfp_ptr    = nbat->nbfp;
#define NBFP_STRIDE  2
    nbfp_stride = NBFP_STRIDE;
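    /* Without a combination rule the per-pair C6/C12 parameters are looked up
     * in the non-bonded force-parameter table: nbfp_s4 stores each type pair
     * padded to a stride of 4 reals, the plain nbfp table uses a stride of 2
     * (c6,c12); nbfp_stride is used below to index these tables per i-atom type.
     */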
#ifdef GMX_MM256_HERE
    /* Generate aligned table pointers */
    ti0 = (int *)(((size_t)(ti0_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
    ti1 = (int *)(((size_t)(ti1_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
    ti2 = (int *)(((size_t)(ti2_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
    ti3 = (int *)(((size_t)(ti3_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
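    /* ti0..ti3 hold the per-lane Coulomb table indices used by the table lookup
     * in the inner loop; the pointers are rounded up inside the oversized
     * ti*_array buffers so that each starts on an UNROLLJ*sizeof(real)-byte
     * boundary, as required for aligned SIMD access.
     */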
    invtsp_SSE   = gmx_set1_pr(ic->tabq_scale);
    mhalfsp_SSE  = gmx_set1_pr(-0.5/ic->tabq_scale);
    sh_ewald_SSE = gmx_set1_pr(ic->sh_ewald);

    tab_coul_F = ic->tabq_coul_FDV0;
    tab_coul_F = ic->tabq_coul_F;
    tab_coul_V = ic->tabq_coul_V;

    shiftvec = shift_vec[0];

    avoid_sing_SSE = gmx_set1_pr(NBNXN_AVOID_SING_R2_INC);
    /* The kernel either supports rcoulomb = rvdw or rcoulomb >= rvdw */
    rc2_SSE    = gmx_set1_pr(ic->rcoulomb*ic->rcoulomb);
#ifdef VDW_CUTOFF_CHECK
    rcvdw2_SSE = gmx_set1_pr(ic->rvdw*ic->rvdw);

    sixthSSE    = gmx_set1_pr(1.0/6.0);
    twelvethSSE = gmx_set1_pr(1.0/12.0);

    sh_invrc6_SSE  = gmx_set1_pr(ic->sh_invrc6);
    sh_invrc12_SSE = gmx_set1_pr(ic->sh_invrc6*ic->sh_invrc6);

    mrc_3_SSE  = gmx_set1_pr(-2*ic->k_rf);
    hrc_3_SSE  = gmx_set1_pr(ic->k_rf);
    moh_rc_SSE = gmx_set1_pr(-ic->c_rf);
    tmpsum = (real *)(((size_t)(tmpsum_array+7)) & (~((size_t)31)));
#ifdef CALC_SHIFTFORCES
    shf = (real *)(((size_t)(shf_array+7)) & (~((size_t)31)));
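    /* tmpsum and shf point at 32-byte aligned storage inside the oversized
     * stack arrays, so that complete SIMD registers can be stored there when
     * the energy and shift-force accumulators are reduced to scalars below.
     */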
    pvdw_c6  = (real *)(((size_t)(pvdw_array+3)) & (~((size_t)15)));
    pvdw_c12 = pvdw_c6 + UNROLLI*UNROLLJ;

    for(jp=0; jp<UNROLLJ; jp++)
    {
        pvdw_c6 [0*UNROLLJ+jp] = nbat->nbfp[0*2];
        pvdw_c6 [1*UNROLLJ+jp] = nbat->nbfp[0*2];
        pvdw_c6 [2*UNROLLJ+jp] = nbat->nbfp[0*2];
        pvdw_c6 [3*UNROLLJ+jp] = nbat->nbfp[0*2];

        pvdw_c12[0*UNROLLJ+jp] = nbat->nbfp[0*2+1];
        pvdw_c12[1*UNROLLJ+jp] = nbat->nbfp[0*2+1];
        pvdw_c12[2*UNROLLJ+jp] = nbat->nbfp[0*2+1];
        pvdw_c12[3*UNROLLJ+jp] = nbat->nbfp[0*2+1];
    }
    c6_SSE0  = gmx_load_pr(pvdw_c6 +0*UNROLLJ);
    c6_SSE1  = gmx_load_pr(pvdw_c6 +1*UNROLLJ);
    c6_SSE2  = gmx_load_pr(pvdw_c6 +2*UNROLLJ);
    c6_SSE3  = gmx_load_pr(pvdw_c6 +3*UNROLLJ);

    c12_SSE0 = gmx_load_pr(pvdw_c12+0*UNROLLJ);
    c12_SSE1 = gmx_load_pr(pvdw_c12+1*UNROLLJ);
    c12_SSE2 = gmx_load_pr(pvdw_c12+2*UNROLLJ);
    c12_SSE3 = gmx_load_pr(pvdw_c12+3*UNROLLJ);
#endif /* FIX_LJ_C */
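    /* With FIX_LJ_C every i-j pair uses the C6/C12 of atom type 0, replicated
     * across all UNROLLI*UNROLLJ entries and loaded into the c6/c12 registers
     * once, outside the pair-list loop.
     */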
    egps_ishift  = nbat->neg_2log;
    egps_imask   = (1<<egps_ishift) - 1;
    egps_jshift  = 2*nbat->neg_2log;
    egps_jmask   = (1<<egps_jshift) - 1;
    egps_jstride = (UNROLLJ>>1)*UNROLLJ;
    /* The major division of the energy buffers is over i-particle energy groups;
     * determine the stride of one i-group block.
     */
    Vstride_i    = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride;
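    /* The energy-group indices of the UNROLLI i-atoms are packed into a single
     * integer in nbat->energrp, neg_2log bits per atom; egps_ishift and
     * egps_imask extract them, and Vstride_i is the distance between
     * consecutive i-group blocks in the Vvdw/Vc buffers addressed through
     * vvdwtp[] and vctp[].
     */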
    for(n=0; n<nbl->nci; n++)

        ish    = (nbln->shift & NBNXN_CI_SHIFT);

        cjind0 = nbln->cj_ind_start;
        cjind1 = nbln->cj_ind_end;
        /* Currently this only works for super-cells equal to sub-cells */
        ci_sh  = (ish == CENTRAL ? ci : -1);

        shX_SSE = gmx_load1_pr(shiftvec+ish3);
        shY_SSE = gmx_load1_pr(shiftvec+ish3+1);
        shZ_SSE = gmx_load1_pr(shiftvec+ish3+2);

        sci   = (ci>>1)*STRIDE;
        scix  = sci*DIM + (ci & 1)*(STRIDE>>1);
        sci2  = sci*2   + (ci & 1)*(STRIDE>>1);
        sci  += (ci & 1)*(STRIDE>>1);

        half_LJ = (nbln->shift & NBNXN_CI_HALF_LJ(0));
        do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));

        egps_i = nbat->energrp[ci];

        for(ia=0; ia<4; ia++)
        {
            egp_ia     = (egps_i >> (ia*egps_ishift)) & egps_imask;
            vvdwtp[ia] = Vvdw + egp_ia*Vstride_i;
            vctp[ia]   = Vc   + egp_ia*Vstride_i;
        }
#if defined CALC_ENERGIES
        if (do_coul && l_cj[nbln->cj_ind_start].cj == ci_sh)
        if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh<<1))
        if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh>>1))

            Vc_sub_self = 0.5*ic->c_rf;
            Vc_sub_self = 0.5*tab_coul_F[2];
            Vc_sub_self = 0.5*tab_coul_V[0];

            for(ia=0; ia<UNROLLI; ia++)
            {
                vctp[ia][((egps_i>>(ia*egps_ishift)) & egps_imask)*egps_jstride]
                    -= facel*qi*qi*Vc_sub_self;
            }
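        /* When the first j-cluster in the list is the i-cluster itself, the
         * inner loop evaluates the i==i "self" pairs; their constant Coulomb
         * contribution (0.5*c_rf for reaction-field, half the tabulated value
         * at r=0 for Ewald tables) is subtracted again here, once per i-atom,
         * scaled by facel*qi*qi.
         */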
        /* Load i atom data */
        sciy = scix + STRIDE;
        sciz = sciy + STRIDE;
        ix_SSE0 = gmx_add_pr(gmx_load1_pr(x+scix)  ,shX_SSE);
        ix_SSE1 = gmx_add_pr(gmx_load1_pr(x+scix+1),shX_SSE);
        ix_SSE2 = gmx_add_pr(gmx_load1_pr(x+scix+2),shX_SSE);
        ix_SSE3 = gmx_add_pr(gmx_load1_pr(x+scix+3),shX_SSE);
        iy_SSE0 = gmx_add_pr(gmx_load1_pr(x+sciy)  ,shY_SSE);
        iy_SSE1 = gmx_add_pr(gmx_load1_pr(x+sciy+1),shY_SSE);
        iy_SSE2 = gmx_add_pr(gmx_load1_pr(x+sciy+2),shY_SSE);
        iy_SSE3 = gmx_add_pr(gmx_load1_pr(x+sciy+3),shY_SSE);
        iz_SSE0 = gmx_add_pr(gmx_load1_pr(x+sciz)  ,shZ_SSE);
        iz_SSE1 = gmx_add_pr(gmx_load1_pr(x+sciz+1),shZ_SSE);
        iz_SSE2 = gmx_add_pr(gmx_load1_pr(x+sciz+2),shZ_SSE);
        iz_SSE3 = gmx_add_pr(gmx_load1_pr(x+sciz+3),shZ_SSE);

        /* With half_LJ we currently always calculate Coulomb interactions */
        if (do_coul || half_LJ)
        {
            iq_SSE0 = gmx_set1_pr(facel*q[sci]);
            iq_SSE1 = gmx_set1_pr(facel*q[sci+1]);
            iq_SSE2 = gmx_set1_pr(facel*q[sci+2]);
            iq_SSE3 = gmx_set1_pr(facel*q[sci+3]);
        }
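        /* The i-atom charges are pre-multiplied by the Coulomb prefactor facel,
         * so the inner loop can combine them with the j-atom charges directly,
         * without an extra multiplication per pair.
         */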
            hsig_i_SSE0 = gmx_load1_pr(ljc+sci2+0);
            hsig_i_SSE1 = gmx_load1_pr(ljc+sci2+1);
            hsig_i_SSE2 = gmx_load1_pr(ljc+sci2+2);
            hsig_i_SSE3 = gmx_load1_pr(ljc+sci2+3);
            seps_i_SSE0 = gmx_load1_pr(ljc+sci2+STRIDE+0);
            seps_i_SSE1 = gmx_load1_pr(ljc+sci2+STRIDE+1);
            seps_i_SSE2 = gmx_load1_pr(ljc+sci2+STRIDE+2);
            seps_i_SSE3 = gmx_load1_pr(ljc+sci2+STRIDE+3);

            c6s_SSE0 = gmx_load1_pr(ljc+sci2+0);
            c6s_SSE1 = gmx_load1_pr(ljc+sci2+1);
            c6s_SSE2 = gmx_load1_pr(ljc+sci2+2);
            c6s_SSE3 = gmx_load1_pr(ljc+sci2+3);
            c12s_SSE0 = gmx_load1_pr(ljc+sci2+STRIDE+0);
            c12s_SSE1 = gmx_load1_pr(ljc+sci2+STRIDE+1);
            c12s_SSE2 = gmx_load1_pr(ljc+sci2+STRIDE+2);
            c12s_SSE3 = gmx_load1_pr(ljc+sci2+STRIDE+3);

            nbfp0 = nbfp_ptr + type[sci  ]*nbat->ntype*nbfp_stride;
            nbfp1 = nbfp_ptr + type[sci+1]*nbat->ntype*nbfp_stride;
            nbfp2 = nbfp_ptr + type[sci+2]*nbat->ntype*nbfp_stride;
            nbfp3 = nbfp_ptr + type[sci+3]*nbat->ntype*nbfp_stride;
        /* Zero the potential energy for this list */
        VvdwtotSSE = gmx_setzero_pr();
        vctotSSE   = gmx_setzero_pr();

        /* Clear i atom forces */
        fix_SSE0 = gmx_setzero_pr();
        fix_SSE1 = gmx_setzero_pr();
        fix_SSE2 = gmx_setzero_pr();
        fix_SSE3 = gmx_setzero_pr();
        fiy_SSE0 = gmx_setzero_pr();
        fiy_SSE1 = gmx_setzero_pr();
        fiy_SSE2 = gmx_setzero_pr();
        fiy_SSE3 = gmx_setzero_pr();
        fiz_SSE0 = gmx_setzero_pr();
        fiz_SSE1 = gmx_setzero_pr();
        fiz_SSE2 = gmx_setzero_pr();
        fiz_SSE3 = gmx_setzero_pr();

        /* Currently all kernels use (at least half) LJ */
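        /* The j-clusters of each list entry are ordered so that those with
         * exclusions come first: the while loops below process them with
         * exclusion masking enabled (cj[].excl != SIMD_MASK_ALL), and the
         * plain for loops then handle the remaining, fully interacting
         * j-clusters without that overhead.  The pattern is repeated for the
         * different LJ/Coulomb variants of the inner loop.
         */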
            while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
            {
#include "nbnxn_kernel_x86_simd_inner.h"
                cjind++;
            }
            for(; (cjind<cjind1); cjind++)
            {
#include "nbnxn_kernel_x86_simd_inner.h"
            }

            while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
            {
#include "nbnxn_kernel_x86_simd_inner.h"
                cjind++;
            }
            for(; (cjind<cjind1); cjind++)
            {
#include "nbnxn_kernel_x86_simd_inner.h"
            }

            while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
            {
#include "nbnxn_kernel_x86_simd_inner.h"
                cjind++;
            }
            for(; (cjind<cjind1); cjind++)
            {
#include "nbnxn_kernel_x86_simd_inner.h"
            }
        ninner += cjind1 - cjind0;

        /* Add accumulated i-forces to the force array */
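        /* GMX_MM_TRANSPOSE_SUM4_PR transposes the four per-i-atom accumulators
         * and sums over the SIMD lanes, leaving the total x (or y, z) force on
         * i-atoms 0..3 in a single 4-wide register, which is then added to f[].
         * The 128-bit double-precision case instead reduces pairs of
         * accumulators with GMX_MM_TRANSPOSE_SUM2_PD and stores two halves.
         */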
#define gmx_load_ps4  _mm_load_ps
#define gmx_store_ps4 _mm_store_ps
#define gmx_add_ps4   _mm_add_ps
#define gmx_load_ps4  _mm256_load_pd
#define gmx_store_ps4 _mm256_store_pd
#define gmx_add_ps4   _mm256_add_pd

        GMX_MM_TRANSPOSE_SUM4_PR(fix_SSE0,fix_SSE1,fix_SSE2,fix_SSE3,fix_SSE);
        gmx_store_ps4(f+scix, gmx_add_ps4(fix_SSE, gmx_load_ps4(f+scix)));

        GMX_MM_TRANSPOSE_SUM4_PR(fiy_SSE0,fiy_SSE1,fiy_SSE2,fiy_SSE3,fiy_SSE);
        gmx_store_ps4(f+sciy, gmx_add_ps4(fiy_SSE, gmx_load_ps4(f+sciy)));

        GMX_MM_TRANSPOSE_SUM4_PR(fiz_SSE0,fiz_SSE1,fiz_SSE2,fiz_SSE3,fiz_SSE);
        gmx_store_ps4(f+sciz, gmx_add_ps4(fiz_SSE, gmx_load_ps4(f+sciz)));

#ifdef CALC_SHIFTFORCES
        gmx_store_ps4(shf,fix_SSE);
        fshift[ish3+0] += SUM_SIMD4(shf);
        gmx_store_ps4(shf,fiy_SSE);
        fshift[ish3+1] += SUM_SIMD4(shf);
        gmx_store_ps4(shf,fiz_SSE);
        fshift[ish3+2] += SUM_SIMD4(shf);
        GMX_MM_TRANSPOSE_SUM2_PD(fix_SSE0,fix_SSE1,fix0_SSE);
        _mm_store_pd(f+scix, _mm_add_pd(fix0_SSE, _mm_load_pd(f+scix)));
        GMX_MM_TRANSPOSE_SUM2_PD(fix_SSE2,fix_SSE3,fix2_SSE);
        _mm_store_pd(f+scix+2, _mm_add_pd(fix2_SSE, _mm_load_pd(f+scix+2)));

        GMX_MM_TRANSPOSE_SUM2_PD(fiy_SSE0,fiy_SSE1,fiy0_SSE);
        _mm_store_pd(f+sciy, _mm_add_pd(fiy0_SSE, _mm_load_pd(f+sciy)));
        GMX_MM_TRANSPOSE_SUM2_PD(fiy_SSE2,fiy_SSE3,fiy2_SSE);
        _mm_store_pd(f+sciy+2, _mm_add_pd(fiy2_SSE, _mm_load_pd(f+sciy+2)));

        GMX_MM_TRANSPOSE_SUM2_PD(fiz_SSE0,fiz_SSE1,fiz0_SSE);
        _mm_store_pd(f+sciz, _mm_add_pd(fiz0_SSE, _mm_load_pd(f+sciz)));
        GMX_MM_TRANSPOSE_SUM2_PD(fiz_SSE2,fiz_SSE3,fiz2_SSE);
        _mm_store_pd(f+sciz+2, _mm_add_pd(fiz2_SSE, _mm_load_pd(f+sciz+2)));

#ifdef CALC_SHIFTFORCES
        _mm_store_pd(shf,_mm_add_pd(fix0_SSE,fix2_SSE));
        fshift[ish3+0] += shf[0] + shf[1];
        _mm_store_pd(shf,_mm_add_pd(fiy0_SSE,fiy2_SSE));
        fshift[ish3+1] += shf[0] + shf[1];
        _mm_store_pd(shf,_mm_add_pd(fiz0_SSE,fiz2_SSE));
        fshift[ish3+2] += shf[0] + shf[1];
        gmx_store_pr(tmpsum,vctotSSE);
        *Vc += SUM_SIMD(tmpsum);

        gmx_store_pr(tmpsum,VvdwtotSSE);
        *Vvdw += SUM_SIMD(tmpsum);
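        /* The per-list energy accumulators are stored to the aligned tmpsum
         * buffer and reduced across SIMD lanes with SUM_SIMD before being
         * added to the output Coulomb and Van der Waals energies.
         */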
        /* Outer loop uses 6 flops/iteration */

    printf("atom pairs %d\n",npair);

#undef CALC_SHIFTFORCES