/* src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_outer.h */
/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 *
 *                This source code is part of
 *
 *                 G   R   O   M   A   C   S
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2009, The GROMACS Development Team
 *
 * Gromacs is a library for molecular simulation and trajectory analysis,
 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 * a full list of developers and information, check out http://www.gromacs.org
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option) any
 * later version.
 * As a special exception, you may use this file as part of a free software
 * library without restriction.  Specifically, if other files instantiate
 * templates or use macros or inline functions from this file, or you compile
 * this file and link it with other files to produce an executable, this
 * file does not by itself cause the resulting executable to be covered by
 * the GNU Lesser General Public License.
 *
 * In plain-speak: do not worry about classes/macros/templates either - only
 * changes to the library have to be LGPL, not an application linking with it.
 *
 * To help fund GROMACS development, we humbly ask that you cite
 * the papers people have written on it - you can find them on the website!
 */
/* GMX_MM128_HERE or GMX_MM256_HERE should be set before including this file */
#include "gmx_x86_simd_macros.h"

#define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])

#define UNROLLI    NBNXN_CPU_CLUSTER_I_SIZE
#define UNROLLJ    GMX_X86_SIMD_WIDTH_HERE

#if defined GMX_MM128_HERE || defined GMX_DOUBLE
#define STRIDE     4
#endif
#if defined GMX_MM256_HERE && !defined GMX_DOUBLE
#define STRIDE     8
#endif
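/* UNROLLI is the i-cluster size (4 atoms) and UNROLLJ the j-cluster size,
 * which equals the SIMD width of the kernel flavor selected above.
 * STRIDE is the packing width of the x, y and z components in the nbnxn
 * atom-data arrays; see the scix/sciy/sciz indexing further down.
 */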
#ifdef GMX_MM128_HERE
#ifndef GMX_DOUBLE
/* SSE single precision 4x4 kernel */
#define SUM_SIMD(x) SUM_SIMD4(x)
#define TAB_FDV0
#else
/* SSE double precision 4x2 kernel */
#define SUM_SIMD(x) (x[0]+x[1])
#endif
#endif

#ifdef GMX_MM256_HERE
#ifndef GMX_DOUBLE
/* AVX single precision 4x8 kernel */
#define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])
#define TAB_FDV0
#else
/* AVX double precision 4x4 kernel */
#define SUM_SIMD(x) SUM_SIMD4(x)
#endif
#endif
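/* SUM_SIMD(x) sums the UNROLLJ elements of a SIMD-width real array after a
 * store; it is used for the energy and shift-force reductions below.
 * TAB_FDV0 is set for the single precision flavors, which use the combined
 * FDV0 layout of the Ewald Coulomb table (ic->tabq_coul_FDV0) instead of
 * separate F and V tables; see the CALC_COUL_TAB setup below.
 */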
#define SIMD_MASK_ALL   0xffffffff

#include "nbnxn_kernel_x86_simd_utils.h"

/* All functionality defines are set here, except for:
 * CALC_ENERGIES, ENERGY_GROUPS which are defined before.
 * CHECK_EXCLS, which is set just before including the inner loop contents.
 * The combination rule defines, LJ_COMB_GEOM or LJ_COMB_LB, are currently
 * set before calling the kernel function. We might want to move that
 * to inside the n-loop and have a different combination rule for different
 * ci's, as using no combination rule gives a 50% performance hit for LJ.
 */

/* We always calculate shift forces, because it's cheap anyhow */
#define CALC_SHIFTFORCES

/* Assumes all LJ parameters are identical */
/* #define FIX_LJ_C */
#define NBK_FUNC_NAME_C_LJC(b,s,c,ljc,e) b##_##s##_##c##_comb_##ljc##_##e

#if defined LJ_COMB_GEOM
#define NBK_FUNC_NAME_C(b,s,c,e) NBK_FUNC_NAME_C_LJC(b,s,c,geom,e)
#else
#if defined LJ_COMB_LB
#define NBK_FUNC_NAME_C(b,s,c,e) NBK_FUNC_NAME_C_LJC(b,s,c,lb,e)
#else
#define NBK_FUNC_NAME_C(b,s,c,e) NBK_FUNC_NAME_C_LJC(b,s,c,none,e)
#endif
#endif

#ifdef CALC_COUL_RF
#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,rf,e)
#endif
#ifdef CALC_COUL_TAB
#ifndef VDW_CUTOFF_CHECK
#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,tab,e)
#else
#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,tab_twin,e)
#endif
#endif

#ifdef GMX_MM128_HERE
#define NBK_FUNC_NAME_S128_OR_S256(b,e) NBK_FUNC_NAME(b,x86_simd128,e)
#endif
#ifdef GMX_MM256_HERE
#define NBK_FUNC_NAME_S128_OR_S256(b,e) NBK_FUNC_NAME(b,x86_simd256,e)
#endif
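/* The macros above assemble the kernel name from the SIMD flavor, the
 * electrostatics type, the LJ combination rule and the energy mode.
 * For example, with GMX_MM256_HERE, CALC_COUL_RF, LJ_COMB_GEOM and
 * CALC_ENERGIES the function below expands to
 * nbnxn_kernel_x86_simd256_rf_comb_geom_ener.
 */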
static void
#ifndef CALC_ENERGIES
NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,noener)
#else
#ifndef ENERGY_GROUPS
NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,ener)
#else
NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp)
#endif
#endif
#undef NBK_FUNC_NAME
#undef NBK_FUNC_NAME_C
#undef NBK_FUNC_NAME_C_LJC
(const nbnxn_pairlist_t     *nbl,
 const nbnxn_atomdata_t     *nbat,
 const interaction_const_t  *ic,
 rvec                       *shift_vec,
 real                       *f
#ifdef CALC_SHIFTFORCES
 ,
 real                       *fshift
#endif
#ifdef CALC_ENERGIES
 ,
 real                       *Vvdw,
 real                       *Vc
#endif
)
{
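    /* nbl:       the cluster pair list to process
     * nbat:      coordinates, charges and LJ parameters in nbnxn layout
     * ic:        interaction constants (cut-offs, RF/Ewald parameters)
     * shift_vec: periodic shift vectors
     * f:         force output array, nbnxn atom order
     * fshift:    shift force output (only with CALC_SHIFTFORCES)
     * Vvdw, Vc:  LJ and Coulomb energy outputs (only with CALC_ENERGIES)
     */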
    const nbnxn_ci_t   *nbln;
    const nbnxn_cj_t   *l_cj;
    const int          *type;
    const real         *q;
    const real         *shiftvec;
    const real         *x;
    const real         *nbfp0,*nbfp1,*nbfp2=NULL,*nbfp3=NULL;
    real       facel;
    real       *nbfp_ptr;
    int        nbfp_stride;
    int        n,ci,ci_sh;
    int        ish,ish3;
    gmx_bool   half_LJ,do_coul;
    int        sci,scix,sciy,sciz,sci2;
    int        cjind0,cjind1,cjind;
    int        ip,jp;

#ifdef ENERGY_GROUPS
    int        Vstride_i;
    int        egps_ishift,egps_imask;
    int        egps_jshift,egps_jmask,egps_jstride;
    int        egps_i;
    real       *vvdwtp[UNROLLI];
    real       *vctp[UNROLLI];
#endif
    gmx_mm_pr  shX_SSE;
    gmx_mm_pr  shY_SSE;
    gmx_mm_pr  shZ_SSE;
    gmx_mm_pr  ix_SSE0,iy_SSE0,iz_SSE0;
    gmx_mm_pr  ix_SSE1,iy_SSE1,iz_SSE1;
    gmx_mm_pr  ix_SSE2,iy_SSE2,iz_SSE2;
    gmx_mm_pr  ix_SSE3,iy_SSE3,iz_SSE3;
    gmx_mm_pr  fix_SSE0,fiy_SSE0,fiz_SSE0;
    gmx_mm_pr  fix_SSE1,fiy_SSE1,fiz_SSE1;
    gmx_mm_pr  fix_SSE2,fiy_SSE2,fiz_SSE2;
    gmx_mm_pr  fix_SSE3,fiy_SSE3,fiz_SSE3;
#if UNROLLJ >= 4
#ifndef GMX_DOUBLE
    __m128     fix_SSE,fiy_SSE,fiz_SSE;
#else
    __m256d    fix_SSE,fiy_SSE,fiz_SSE;
#endif
#else
    __m128d    fix0_SSE,fiy0_SSE,fiz0_SSE;
    __m128d    fix2_SSE,fiy2_SSE,fiz2_SSE;
#endif
#ifndef GMX_MM256_HERE
#ifndef GMX_DOUBLE
    __m128i    mask0 = _mm_set_epi32( 0x0008, 0x0004, 0x0002, 0x0001 );
    __m128i    mask1 = _mm_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010 );
    __m128i    mask2 = _mm_set_epi32( 0x0800, 0x0400, 0x0200, 0x0100 );
    __m128i    mask3 = _mm_set_epi32( 0x8000, 0x4000, 0x2000, 0x1000 );
#else
    /* For double precision we need to set two 32bit ints for one double */
    __m128i    mask0 = _mm_set_epi32( 0x0002, 0x0002, 0x0001, 0x0001 );
    __m128i    mask1 = _mm_set_epi32( 0x0008, 0x0008, 0x0004, 0x0004 );
    __m128i    mask2 = _mm_set_epi32( 0x0020, 0x0020, 0x0010, 0x0010 );
    __m128i    mask3 = _mm_set_epi32( 0x0080, 0x0080, 0x0040, 0x0040 );
#endif
#else
    /* AVX: use floating point masks, as there are no integer instructions */
#ifndef GMX_DOUBLE
    gmx_mm_pr  mask0 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001 ));
    gmx_mm_pr  mask1 = _mm256_castsi256_ps(_mm256_set_epi32( 0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100 ));
#else
    /* There is no 256-bit int to double conversion, so we use float here */
    __m256     mask0 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0008, 0x0008, 0x0004, 0x0004, 0x0002, 0x0002, 0x0001, 0x0001 ));
    __m256     mask1 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0080, 0x0080, 0x0040, 0x0040, 0x0020, 0x0020, 0x0010, 0x0010 ));
    __m256     mask2 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0800, 0x0800, 0x0400, 0x0400, 0x0200, 0x0200, 0x0100, 0x0100 ));
    __m256     mask3 = _mm256_castsi256_ps(_mm256_set_epi32( 0x8000, 0x8000, 0x4000, 0x4000, 0x2000, 0x2000, 0x1000, 0x1000 ));
#endif
#endif
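    /* The mask constants above are used to expand the interaction/exclusion
     * bit field of a pair-list entry (cj.excl) into per-lane SIMD masks;
     * the expansion itself is done by the CHECK_EXCLS code in the
     * inner-loop include.
     */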
#ifndef GMX_MM256_HERE
#ifndef GMX_DOUBLE
    __m128     diag_SSE0 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 ));
    __m128     diag_SSE1 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
    __m128     diag_SSE2 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0x00000000, 0x00000000, 0x00000000 ));
    __m128     diag_SSE3 = gmx_mm_castsi128_pr( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
#else
    __m128d    diag0_SSE0 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
    __m128d    diag0_SSE1 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
    __m128d    diag0_SSE2 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
    __m128d    diag0_SSE3 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
    __m128d    diag1_SSE0 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff ));
    __m128d    diag1_SSE1 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff ));
    __m128d    diag1_SSE2 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
    __m128d    diag1_SSE3 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
#endif
#else /* GMX_MM256_HERE */
#ifndef GMX_DOUBLE
    gmx_mm_pr  diag0_SSE0 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 ));
    gmx_mm_pr  diag0_SSE1 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
    gmx_mm_pr  diag0_SSE2 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000 ));
    gmx_mm_pr  diag0_SSE3 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
    gmx_mm_pr  diag1_SSE0 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
    gmx_mm_pr  diag1_SSE1 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
    gmx_mm_pr  diag1_SSE2 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
    gmx_mm_pr  diag1_SSE3 = _mm256_castsi256_ps( _mm256_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
#else
    gmx_mm_pr  diag_SSE0 = _mm256_castsi256_pd( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
    gmx_mm_pr  diag_SSE1 = _mm256_castsi256_pd( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
    gmx_mm_pr  diag_SSE2 = _mm256_castsi256_pd( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
    gmx_mm_pr  diag_SSE3 = _mm256_castsi256_pd( _mm256_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
#endif
#endif
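    /* The diag constants above mask out the lower-triangle pairs (j <= i)
     * when an i-cluster is paired with itself, so intra-cluster interactions
     * are not double counted.
     */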
#ifndef GMX_MM256_HERE
    __m128i    zeroi_SSE = _mm_setzero_si128();
#endif
#ifdef GMX_X86_SSE4_1
    gmx_mm_pr  zero_SSE = gmx_set1_pr(0);
#endif

    gmx_mm_pr  one_SSE = gmx_set1_pr(1.0);
    gmx_mm_pr  iq_SSE0 = gmx_setzero_pr();
    gmx_mm_pr  iq_SSE1 = gmx_setzero_pr();
    gmx_mm_pr  iq_SSE2 = gmx_setzero_pr();
    gmx_mm_pr  iq_SSE3 = gmx_setzero_pr();
    gmx_mm_pr  mrc_3_SSE;
#ifdef CALC_ENERGIES
    gmx_mm_pr  hrc_3_SSE,moh_rc_SSE;
#endif
#ifdef CALC_COUL_TAB
    /* Coulomb table variables */
    gmx_mm_pr  invtsp_SSE;
    const real *tab_coul_F;
#ifndef TAB_FDV0
    const real *tab_coul_V;
#endif
#ifdef GMX_MM256_HERE
    int        ti0_array[2*UNROLLJ-1],*ti0;
    int        ti1_array[2*UNROLLJ-1],*ti1;
    int        ti2_array[2*UNROLLJ-1],*ti2;
    int        ti3_array[2*UNROLLJ-1],*ti3;
#endif
#ifdef CALC_ENERGIES
    gmx_mm_pr  mhalfsp_SSE;
    gmx_mm_pr  sh_ewald_SSE;
#endif
#endif
#ifdef LJ_COMB_LB
    const real *ljc;

    gmx_mm_pr  hsig_i_SSE0,seps_i_SSE0;
    gmx_mm_pr  hsig_i_SSE1,seps_i_SSE1;
    gmx_mm_pr  hsig_i_SSE2,seps_i_SSE2;
    gmx_mm_pr  hsig_i_SSE3,seps_i_SSE3;
#else
#ifdef FIX_LJ_C
    real       pvdw_array[2*UNROLLI*UNROLLJ+3];
    real       *pvdw_c6,*pvdw_c12;
    gmx_mm_pr  c6_SSE0,c12_SSE0;
    gmx_mm_pr  c6_SSE1,c12_SSE1;
    gmx_mm_pr  c6_SSE2,c12_SSE2;
    gmx_mm_pr  c6_SSE3,c12_SSE3;
#endif

#ifdef LJ_COMB_GEOM
    const real *ljc;

    gmx_mm_pr  c6s_SSE0,c12s_SSE0;
    gmx_mm_pr  c6s_SSE1,c12s_SSE1;
    gmx_mm_pr  c6s_SSE2 = gmx_setzero_pr(),c12s_SSE2 = gmx_setzero_pr();
    gmx_mm_pr  c6s_SSE3 = gmx_setzero_pr(),c12s_SSE3 = gmx_setzero_pr();
#endif
#endif /* LJ_COMB_LB */

    gmx_mm_pr  vctotSSE,VvdwtotSSE;
    gmx_mm_pr  sixthSSE,twelvethSSE;

    gmx_mm_pr  avoid_sing_SSE;
    gmx_mm_pr  rc2_SSE;
#ifdef VDW_CUTOFF_CHECK
    gmx_mm_pr  rcvdw2_SSE;
#endif

#ifdef CALC_ENERGIES
    gmx_mm_pr  sh_invrc6_SSE,sh_invrc12_SSE;

    /* cppcheck-suppress unassignedVariable */
    real       tmpsum_array[15],*tmpsum;
#endif
#ifdef CALC_SHIFTFORCES
    /* cppcheck-suppress unassignedVariable */
    real       shf_array[15],*shf;
#endif

    int ninner;

#ifdef COUNT_PAIRS
    int npair = 0;
#endif
#if defined LJ_COMB_GEOM || defined LJ_COMB_LB
    ljc = nbat->lj_comb;
#else
    /* No combination rule used */
#ifndef GMX_DOUBLE
    nbfp_ptr    = nbat->nbfp_s4;
#define NBFP_STRIDE  4
#else
    nbfp_ptr    = nbat->nbfp;
#define NBFP_STRIDE  2
#endif
    nbfp_stride = NBFP_STRIDE;
#endif
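    /* Without a combination rule the c6/c12 pair is looked up per i,j
     * atom-type pair. In single precision the table padded to a stride of
     * 4 reals per type pair (nbat->nbfp_s4) is used, which is convenient
     * for the SIMD loads in the inner loop; in double precision the plain
     * stride-2 table is used.
     */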
#ifdef CALC_COUL_TAB
#ifdef GMX_MM256_HERE
    /* Generate aligned table pointers */
    ti0 = (int *)(((size_t)(ti0_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
    ti1 = (int *)(((size_t)(ti1_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
    ti2 = (int *)(((size_t)(ti2_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
    ti3 = (int *)(((size_t)(ti3_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
#endif

    invtsp_SSE   = gmx_set1_pr(ic->tabq_scale);
#ifdef CALC_ENERGIES
    mhalfsp_SSE  = gmx_set1_pr(-0.5/ic->tabq_scale);

    sh_ewald_SSE = gmx_set1_pr(ic->sh_ewald);
#endif

#ifdef TAB_FDV0
    tab_coul_F = ic->tabq_coul_FDV0;
#else
    tab_coul_F = ic->tabq_coul_F;
    tab_coul_V = ic->tabq_coul_V;
#endif
#endif
    q        = nbat->q;
    type     = nbat->type;
    facel    = ic->epsfac;
    shiftvec = shift_vec[0];
    x        = nbat->x;

    avoid_sing_SSE = gmx_set1_pr(NBNXN_AVOID_SING_R2_INC);

    /* The kernel requires either rcoulomb = rvdw, or rcoulomb >= rvdw with VDW_CUTOFF_CHECK set */
    rc2_SSE    = gmx_set1_pr(ic->rcoulomb*ic->rcoulomb);
#ifdef VDW_CUTOFF_CHECK
    rcvdw2_SSE = gmx_set1_pr(ic->rvdw*ic->rvdw);
#endif

#ifdef CALC_ENERGIES
    sixthSSE    = gmx_set1_pr(1.0/6.0);
    twelvethSSE = gmx_set1_pr(1.0/12.0);

    sh_invrc6_SSE  = gmx_set1_pr(ic->sh_invrc6);
    sh_invrc12_SSE = gmx_set1_pr(ic->sh_invrc6*ic->sh_invrc6);
#endif

    mrc_3_SSE = gmx_set1_pr(-2*ic->k_rf);

#ifdef CALC_ENERGIES
    hrc_3_SSE  = gmx_set1_pr(ic->k_rf);

    moh_rc_SSE = gmx_set1_pr(-ic->c_rf);
#endif

#ifdef CALC_ENERGIES
    tmpsum = (real *)(((size_t)(tmpsum_array+7)) & (~((size_t)31)));
#endif
#ifdef CALC_SHIFTFORCES
    shf = (real *)(((size_t)(shf_array+7)) & (~((size_t)31)));
#endif
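    /* tmpsum and shf point to 32-byte aligned scratch inside the over-sized
     * arrays declared above; they are used to store SIMD registers for the
     * energy and shift-force reductions.
     */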
#ifdef FIX_LJ_C
    pvdw_c6  = (real *)(((size_t)(pvdw_array+3)) & (~((size_t)15)));
    pvdw_c12 = pvdw_c6 + UNROLLI*UNROLLJ;

    for(jp=0; jp<UNROLLJ; jp++)
    {
        pvdw_c6 [0*UNROLLJ+jp] = nbat->nbfp[0*2];
        pvdw_c6 [1*UNROLLJ+jp] = nbat->nbfp[0*2];
        pvdw_c6 [2*UNROLLJ+jp] = nbat->nbfp[0*2];
        pvdw_c6 [3*UNROLLJ+jp] = nbat->nbfp[0*2];

        pvdw_c12[0*UNROLLJ+jp] = nbat->nbfp[0*2+1];
        pvdw_c12[1*UNROLLJ+jp] = nbat->nbfp[0*2+1];
        pvdw_c12[2*UNROLLJ+jp] = nbat->nbfp[0*2+1];
        pvdw_c12[3*UNROLLJ+jp] = nbat->nbfp[0*2+1];
    }

    c6_SSE0  = gmx_load_pr(pvdw_c6 +0*UNROLLJ);
    c6_SSE1  = gmx_load_pr(pvdw_c6 +1*UNROLLJ);
    c6_SSE2  = gmx_load_pr(pvdw_c6 +2*UNROLLJ);
    c6_SSE3  = gmx_load_pr(pvdw_c6 +3*UNROLLJ);

    c12_SSE0 = gmx_load_pr(pvdw_c12+0*UNROLLJ);
    c12_SSE1 = gmx_load_pr(pvdw_c12+1*UNROLLJ);
    c12_SSE2 = gmx_load_pr(pvdw_c12+2*UNROLLJ);
    c12_SSE3 = gmx_load_pr(pvdw_c12+3*UNROLLJ);
#endif /* FIX_LJ_C */
#ifdef ENERGY_GROUPS
    egps_ishift  = nbat->neg_2log;
    egps_imask   = (1<<egps_ishift) - 1;
    egps_jshift  = 2*nbat->neg_2log;
    egps_jmask   = (1<<egps_jshift) - 1;
    egps_jstride = (UNROLLJ>>1)*UNROLLJ;
    /* Major division is over i-particles: divide nVS by 4 for i-stride */
    Vstride_i    = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride;
#endif

    l_cj = nbl->cj;

    ninner = 0;
    for(n=0; n<nbl->nci; n++)
    {
        nbln = &nbl->ci[n];

        ish    = (nbln->shift & NBNXN_CI_SHIFT);
        ish3   = ish*3;
        cjind0 = nbln->cj_ind_start;
        cjind1 = nbln->cj_ind_end;
        /* Currently this only works for super-cells equal to sub-cells */
        ci     = nbln->ci;
        ci_sh  = (ish == CENTRAL ? ci : -1);

        shX_SSE = gmx_load1_pr(shiftvec+ish3);
        shY_SSE = gmx_load1_pr(shiftvec+ish3+1);
        shZ_SSE = gmx_load1_pr(shiftvec+ish3+2);

#if UNROLLJ <= 4
        sci   = ci*STRIDE;
        scix  = sci*DIM;
        sci2  = sci*2;
#else
        sci   = (ci>>1)*STRIDE;
        scix  = sci*DIM + (ci & 1)*(STRIDE>>1);
        sci2  = sci*2 + (ci & 1)*(STRIDE>>1);
        sci  += (ci & 1)*(STRIDE>>1);
#endif
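        /* sci is the index of the first atom of i-cluster ci in the packed
         * nbnxn arrays. With UNROLLJ > 4 (the 4x8 AVX single precision
         * kernel) two i-clusters of 4 atoms share one stride-8 storage
         * block, so even/odd ci select the lower/upper half of block ci>>1.
         */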
        half_LJ = (nbln->shift & NBNXN_CI_HALF_LJ(0));
        do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));

#ifdef ENERGY_GROUPS
        egps_i = nbat->energrp[ci];
        {
            int ia,egp_ia;

            for(ia=0; ia<4; ia++)
            {
                egp_ia     = (egps_i >> (ia*egps_ishift)) & egps_imask;
                vvdwtp[ia] = Vvdw + egp_ia*Vstride_i;
                vctp[ia]   = Vc   + egp_ia*Vstride_i;
            }
        }
#endif
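        /* If the first j-cluster in the list is the i-cluster itself, the
         * block below subtracts, for each i atom, a self-interaction term
         * facel*qi*qi*Vc_sub_self from the Coulomb energy, where Vc_sub_self
         * is half the reaction-field constant c_rf or half the r=0 value of
         * the tabulated Ewald correction potential.
         */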
#if defined CALC_ENERGIES
#if UNROLLJ == 4
        if (do_coul && l_cj[nbln->cj_ind_start].cj == ci_sh)
#endif
#if UNROLLJ == 2
        if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh<<1))
#endif
#if UNROLLJ == 8
        if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh>>1))
#endif
        {
            int  ia;
            real Vc_sub_self;

#ifdef CALC_COUL_RF
            Vc_sub_self = 0.5*ic->c_rf;
#endif
#ifdef CALC_COUL_TAB
#ifdef TAB_FDV0
            Vc_sub_self = 0.5*tab_coul_F[2];
#else
            Vc_sub_self = 0.5*tab_coul_V[0];
#endif
#endif

            for(ia=0; ia<UNROLLI; ia++)
            {
                real qi;

                qi = q[sci+ia];
#ifdef ENERGY_GROUPS
                vctp[ia][((egps_i>>(ia*egps_ishift)) & egps_imask)*egps_jstride]
#else
                Vc[0]
#endif
                    -= facel*qi*qi*Vc_sub_self;
            }
        }
#endif
        /* Load i atom data */
        sciy    = scix + STRIDE;
        sciz    = sciy + STRIDE;
        ix_SSE0 = gmx_add_pr(gmx_load1_pr(x+scix)  ,shX_SSE);
        ix_SSE1 = gmx_add_pr(gmx_load1_pr(x+scix+1),shX_SSE);
        ix_SSE2 = gmx_add_pr(gmx_load1_pr(x+scix+2),shX_SSE);
        ix_SSE3 = gmx_add_pr(gmx_load1_pr(x+scix+3),shX_SSE);
        iy_SSE0 = gmx_add_pr(gmx_load1_pr(x+sciy)  ,shY_SSE);
        iy_SSE1 = gmx_add_pr(gmx_load1_pr(x+sciy+1),shY_SSE);
        iy_SSE2 = gmx_add_pr(gmx_load1_pr(x+sciy+2),shY_SSE);
        iy_SSE3 = gmx_add_pr(gmx_load1_pr(x+sciy+3),shY_SSE);
        iz_SSE0 = gmx_add_pr(gmx_load1_pr(x+sciz)  ,shZ_SSE);
        iz_SSE1 = gmx_add_pr(gmx_load1_pr(x+sciz+1),shZ_SSE);
        iz_SSE2 = gmx_add_pr(gmx_load1_pr(x+sciz+2),shZ_SSE);
        iz_SSE3 = gmx_add_pr(gmx_load1_pr(x+sciz+3),shZ_SSE);

        /* With half_LJ we currently always calculate Coulomb interactions */
        if (do_coul || half_LJ)
        {
            iq_SSE0 = gmx_set1_pr(facel*q[sci]);
            iq_SSE1 = gmx_set1_pr(facel*q[sci+1]);
            iq_SSE2 = gmx_set1_pr(facel*q[sci+2]);
            iq_SSE3 = gmx_set1_pr(facel*q[sci+3]);
        }
#ifdef LJ_COMB_LB
        hsig_i_SSE0 = gmx_load1_pr(ljc+sci2+0);
        hsig_i_SSE1 = gmx_load1_pr(ljc+sci2+1);
        hsig_i_SSE2 = gmx_load1_pr(ljc+sci2+2);
        hsig_i_SSE3 = gmx_load1_pr(ljc+sci2+3);
        seps_i_SSE0 = gmx_load1_pr(ljc+sci2+STRIDE+0);
        seps_i_SSE1 = gmx_load1_pr(ljc+sci2+STRIDE+1);
        seps_i_SSE2 = gmx_load1_pr(ljc+sci2+STRIDE+2);
        seps_i_SSE3 = gmx_load1_pr(ljc+sci2+STRIDE+3);
#else
#ifdef LJ_COMB_GEOM
        c6s_SSE0  = gmx_load1_pr(ljc+sci2+0);
        c6s_SSE1  = gmx_load1_pr(ljc+sci2+1);
        if (!half_LJ)
        {
            c6s_SSE2  = gmx_load1_pr(ljc+sci2+2);
            c6s_SSE3  = gmx_load1_pr(ljc+sci2+3);
        }
        c12s_SSE0 = gmx_load1_pr(ljc+sci2+STRIDE+0);
        c12s_SSE1 = gmx_load1_pr(ljc+sci2+STRIDE+1);
        if (!half_LJ)
        {
            c12s_SSE2 = gmx_load1_pr(ljc+sci2+STRIDE+2);
            c12s_SSE3 = gmx_load1_pr(ljc+sci2+STRIDE+3);
        }
#else
        nbfp0 = nbfp_ptr + type[sci  ]*nbat->ntype*nbfp_stride;
        nbfp1 = nbfp_ptr + type[sci+1]*nbat->ntype*nbfp_stride;
        if (!half_LJ)
        {
            nbfp2 = nbfp_ptr + type[sci+2]*nbat->ntype*nbfp_stride;
            nbfp3 = nbfp_ptr + type[sci+3]*nbat->ntype*nbfp_stride;
        }
#endif
#endif
        /* Zero the potential energy for this list */
        VvdwtotSSE = gmx_setzero_pr();
        vctotSSE   = gmx_setzero_pr();

        /* Clear i atom forces */
        fix_SSE0 = gmx_setzero_pr();
        fix_SSE1 = gmx_setzero_pr();
        fix_SSE2 = gmx_setzero_pr();
        fix_SSE3 = gmx_setzero_pr();
        fiy_SSE0 = gmx_setzero_pr();
        fiy_SSE1 = gmx_setzero_pr();
        fiy_SSE2 = gmx_setzero_pr();
        fiy_SSE3 = gmx_setzero_pr();
        fiz_SSE0 = gmx_setzero_pr();
        fiz_SSE1 = gmx_setzero_pr();
        fiz_SSE2 = gmx_setzero_pr();
        fiz_SSE3 = gmx_setzero_pr();

        cjind = cjind0;
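        /* The j-cluster loop below comes in three variants: half_LJ
         * (LJ only for the first two i atoms, plus Coulomb), do_coul
         * (full LJ plus Coulomb) and LJ only. In each variant the cj entries
         * that need exclusion handling come first in the list and are
         * processed with CHECK_EXCLS set; the remaining entries then run
         * through the faster loop without the exclusion check.
         */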
        /* Currently all kernels use (at least half) LJ */
#define CALC_LJ
        if (half_LJ)
        {
#define CALC_COULOMB
#define HALF_LJ
#define CHECK_EXCLS
            while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
            {
#include "nbnxn_kernel_x86_simd_inner.h"
                cjind++;
            }
#undef CHECK_EXCLS
            for(; (cjind<cjind1); cjind++)
            {
#include "nbnxn_kernel_x86_simd_inner.h"
            }
#undef HALF_LJ
#undef CALC_COULOMB
        }
        else if (do_coul)
        {
#define CALC_COULOMB
#define CHECK_EXCLS
            while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
            {
#include "nbnxn_kernel_x86_simd_inner.h"
                cjind++;
            }
#undef CHECK_EXCLS
            for(; (cjind<cjind1); cjind++)
            {
#include "nbnxn_kernel_x86_simd_inner.h"
            }
#undef CALC_COULOMB
        }
        else
        {
#define CHECK_EXCLS
            while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
            {
#include "nbnxn_kernel_x86_simd_inner.h"
                cjind++;
            }
#undef CHECK_EXCLS
            for(; (cjind<cjind1); cjind++)
            {
#include "nbnxn_kernel_x86_simd_inner.h"
            }
        }
#undef CALC_LJ
        ninner += cjind1 - cjind0;
        /* Add accumulated i-forces to the force array */
#if UNROLLJ >= 4
#ifndef GMX_DOUBLE
#define gmx_load_ps4  _mm_load_ps
#define gmx_store_ps4 _mm_store_ps
#define gmx_add_ps4   _mm_add_ps
#else
#define gmx_load_ps4  _mm256_load_pd
#define gmx_store_ps4 _mm256_store_pd
#define gmx_add_ps4   _mm256_add_pd
#endif
        GMX_MM_TRANSPOSE_SUM4_PR(fix_SSE0,fix_SSE1,fix_SSE2,fix_SSE3,fix_SSE);
        gmx_store_ps4(f+scix, gmx_add_ps4(fix_SSE, gmx_load_ps4(f+scix)));

        GMX_MM_TRANSPOSE_SUM4_PR(fiy_SSE0,fiy_SSE1,fiy_SSE2,fiy_SSE3,fiy_SSE);
        gmx_store_ps4(f+sciy, gmx_add_ps4(fiy_SSE, gmx_load_ps4(f+sciy)));

        GMX_MM_TRANSPOSE_SUM4_PR(fiz_SSE0,fiz_SSE1,fiz_SSE2,fiz_SSE3,fiz_SSE);
        gmx_store_ps4(f+sciz, gmx_add_ps4(fiz_SSE, gmx_load_ps4(f+sciz)));

#ifdef CALC_SHIFTFORCES
        gmx_store_ps4(shf,fix_SSE);
        fshift[ish3+0] += SUM_SIMD4(shf);
        gmx_store_ps4(shf,fiy_SSE);
        fshift[ish3+1] += SUM_SIMD4(shf);
        gmx_store_ps4(shf,fiz_SSE);
        fshift[ish3+2] += SUM_SIMD4(shf);
#endif
#else
        GMX_MM_TRANSPOSE_SUM2_PD(fix_SSE0,fix_SSE1,fix0_SSE);
        _mm_store_pd(f+scix, _mm_add_pd(fix0_SSE, _mm_load_pd(f+scix)));
        GMX_MM_TRANSPOSE_SUM2_PD(fix_SSE2,fix_SSE3,fix2_SSE);
        _mm_store_pd(f+scix+2, _mm_add_pd(fix2_SSE, _mm_load_pd(f+scix+2)));

        GMX_MM_TRANSPOSE_SUM2_PD(fiy_SSE0,fiy_SSE1,fiy0_SSE);
        _mm_store_pd(f+sciy, _mm_add_pd(fiy0_SSE, _mm_load_pd(f+sciy)));
        GMX_MM_TRANSPOSE_SUM2_PD(fiy_SSE2,fiy_SSE3,fiy2_SSE);
        _mm_store_pd(f+sciy+2, _mm_add_pd(fiy2_SSE, _mm_load_pd(f+sciy+2)));

        GMX_MM_TRANSPOSE_SUM2_PD(fiz_SSE0,fiz_SSE1,fiz0_SSE);
        _mm_store_pd(f+sciz, _mm_add_pd(fiz0_SSE, _mm_load_pd(f+sciz)));
        GMX_MM_TRANSPOSE_SUM2_PD(fiz_SSE2,fiz_SSE3,fiz2_SSE);
        _mm_store_pd(f+sciz+2, _mm_add_pd(fiz2_SSE, _mm_load_pd(f+sciz+2)));

#ifdef CALC_SHIFTFORCES
        _mm_store_pd(shf,_mm_add_pd(fix0_SSE,fix2_SSE));
        fshift[ish3+0] += shf[0] + shf[1];
        _mm_store_pd(shf,_mm_add_pd(fiy0_SSE,fiy2_SSE));
        fshift[ish3+1] += shf[0] + shf[1];
        _mm_store_pd(shf,_mm_add_pd(fiz0_SSE,fiz2_SSE));
        fshift[ish3+2] += shf[0] + shf[1];
#endif
#endif
#ifdef CALC_ENERGIES
        if (do_coul)
        {
            gmx_store_pr(tmpsum,vctotSSE);
            *Vc += SUM_SIMD(tmpsum);
        }

        gmx_store_pr(tmpsum,VvdwtotSSE);
        *Vvdw += SUM_SIMD(tmpsum);
#endif

        /* Outer loop uses 6 flops/iteration */
    }

#ifdef COUNT_PAIRS
    printf("atom pairs %d\n",npair);
#endif
}
#undef gmx_load_ps4
#undef gmx_store_ps4
#undef gmx_add_ps4
#undef CALC_SHIFTFORCES

#undef UNROLLI
#undef UNROLLJ
#undef STRIDE
#undef TAB_FDV0
#undef NBFP_STRIDE