Remove all unnecessary HAVE_CONFIG_H
[gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_double / nb_kernel_ElecEw_VdwLJEw_GeomW4P1_avx_256_double.c
blobf9245833df2930c9f4ebc630294d9d42d5be4d10
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_double kernel generator.
38 #include "config.h"
40 #include <math.h>
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "nrnb.h"
47 #include "gromacs/simd/math_x86_avx_256_double.h"
48 #include "kernelutil_x86_avx_256_double.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_avx_256_double
52 * Electrostatics interaction: Ewald
53 * VdW interaction: LJEwald
54 * Geometry: Water4-Particle
55 * Calculate force/pot: PotentialAndForce
57 void
58 nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_avx_256_double
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
77 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
78 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
79 real rcutoff_scalar;
80 real *shiftvec,*fshift,*x,*f;
81 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82 real scratch[4*DIM];
83 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
84 real * vdwioffsetptr0;
85 real * vdwgridioffsetptr0;
86 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87 real * vdwioffsetptr1;
88 real * vdwgridioffsetptr1;
89 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
90 real * vdwioffsetptr2;
91 real * vdwgridioffsetptr2;
92 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
93 real * vdwioffsetptr3;
94 real * vdwgridioffsetptr3;
95 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
96 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
97 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
98 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
99 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
100 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
101 __m256d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
102 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
103 real *charge;
104 int nvdwtype;
105 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
106 int *vdwtype;
107 real *vdwparam;
108 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
109 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
110 __m256d c6grid_00;
111 __m256d c6grid_10;
112 __m256d c6grid_20;
113 __m256d c6grid_30;
114 real *vdwgridparam;
115 __m256d ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
116 __m256d one_half = _mm256_set1_pd(0.5);
117 __m256d minus_one = _mm256_set1_pd(-1.0);
118 __m128i ewitab;
119 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
120 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
121 real *ewtab;
122 __m256d dummy_mask,cutoff_mask;
123 __m128 tmpmask0,tmpmask1;
124 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
125 __m256d one = _mm256_set1_pd(1.0);
126 __m256d two = _mm256_set1_pd(2.0);
127 x = xx[0];
128 f = ff[0];
130 nri = nlist->nri;
131 iinr = nlist->iinr;
132 jindex = nlist->jindex;
133 jjnr = nlist->jjnr;
134 shiftidx = nlist->shift;
135 gid = nlist->gid;
136 shiftvec = fr->shift_vec[0];
137 fshift = fr->fshift[0];
138 facel = _mm256_set1_pd(fr->epsfac);
139 charge = mdatoms->chargeA;
140 nvdwtype = fr->ntype;
141 vdwparam = fr->nbfp;
142 vdwtype = mdatoms->typeA;
143 vdwgridparam = fr->ljpme_c6grid;
144 sh_lj_ewald = _mm256_set1_pd(fr->ic->sh_lj_ewald);
145 ewclj = _mm256_set1_pd(fr->ewaldcoeff_lj);
146 ewclj2 = _mm256_mul_pd(minus_one,_mm256_mul_pd(ewclj,ewclj));
148 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
149 beta = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
150 beta2 = _mm256_mul_pd(beta,beta);
151 beta3 = _mm256_mul_pd(beta,beta2);
153 ewtab = fr->ic->tabq_coul_FDV0;
154 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
155 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
157 /* Setup water-specific parameters */
158 inr = nlist->iinr[0];
159 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
160 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
161 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
162 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
163 vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
165 /* Avoid stupid compiler warnings */
166 jnrA = jnrB = jnrC = jnrD = 0;
167 j_coord_offsetA = 0;
168 j_coord_offsetB = 0;
169 j_coord_offsetC = 0;
170 j_coord_offsetD = 0;
172 outeriter = 0;
173 inneriter = 0;
175 for(iidx=0;iidx<4*DIM;iidx++)
177 scratch[iidx] = 0.0;
180 /* Start outer loop over neighborlists */
181 for(iidx=0; iidx<nri; iidx++)
183 /* Load shift vector for this list */
184 i_shift_offset = DIM*shiftidx[iidx];
186 /* Load limits for loop over neighbors */
187 j_index_start = jindex[iidx];
188 j_index_end = jindex[iidx+1];
190 /* Get outer coordinate index */
191 inr = iinr[iidx];
192 i_coord_offset = DIM*inr;
194 /* Load i particle coords and add shift vector */
195 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
196 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
198 fix0 = _mm256_setzero_pd();
199 fiy0 = _mm256_setzero_pd();
200 fiz0 = _mm256_setzero_pd();
201 fix1 = _mm256_setzero_pd();
202 fiy1 = _mm256_setzero_pd();
203 fiz1 = _mm256_setzero_pd();
204 fix2 = _mm256_setzero_pd();
205 fiy2 = _mm256_setzero_pd();
206 fiz2 = _mm256_setzero_pd();
207 fix3 = _mm256_setzero_pd();
208 fiy3 = _mm256_setzero_pd();
209 fiz3 = _mm256_setzero_pd();
211 /* Reset potential sums */
212 velecsum = _mm256_setzero_pd();
213 vvdwsum = _mm256_setzero_pd();
215 /* Start inner kernel loop */
216 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
219 /* Get j neighbor index, and coordinate index */
220 jnrA = jjnr[jidx];
221 jnrB = jjnr[jidx+1];
222 jnrC = jjnr[jidx+2];
223 jnrD = jjnr[jidx+3];
224 j_coord_offsetA = DIM*jnrA;
225 j_coord_offsetB = DIM*jnrB;
226 j_coord_offsetC = DIM*jnrC;
227 j_coord_offsetD = DIM*jnrD;
229 /* load j atom coordinates */
230 gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
231 x+j_coord_offsetC,x+j_coord_offsetD,
232 &jx0,&jy0,&jz0);
234 /* Calculate displacement vector */
235 dx00 = _mm256_sub_pd(ix0,jx0);
236 dy00 = _mm256_sub_pd(iy0,jy0);
237 dz00 = _mm256_sub_pd(iz0,jz0);
238 dx10 = _mm256_sub_pd(ix1,jx0);
239 dy10 = _mm256_sub_pd(iy1,jy0);
240 dz10 = _mm256_sub_pd(iz1,jz0);
241 dx20 = _mm256_sub_pd(ix2,jx0);
242 dy20 = _mm256_sub_pd(iy2,jy0);
243 dz20 = _mm256_sub_pd(iz2,jz0);
244 dx30 = _mm256_sub_pd(ix3,jx0);
245 dy30 = _mm256_sub_pd(iy3,jy0);
246 dz30 = _mm256_sub_pd(iz3,jz0);
248 /* Calculate squared distance and things based on it */
249 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
250 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
251 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
252 rsq30 = gmx_mm256_calc_rsq_pd(dx30,dy30,dz30);
254 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
255 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
256 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
257 rinv30 = gmx_mm256_invsqrt_pd(rsq30);
259 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
260 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
261 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
262 rinvsq30 = _mm256_mul_pd(rinv30,rinv30);
264 /* Load parameters for j particles */
265 jq0 = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
266 charge+jnrC+0,charge+jnrD+0);
267 vdwjidx0A = 2*vdwtype[jnrA+0];
268 vdwjidx0B = 2*vdwtype[jnrB+0];
269 vdwjidx0C = 2*vdwtype[jnrC+0];
270 vdwjidx0D = 2*vdwtype[jnrD+0];
272 fjx0 = _mm256_setzero_pd();
273 fjy0 = _mm256_setzero_pd();
274 fjz0 = _mm256_setzero_pd();
276 /**************************
277 * CALCULATE INTERACTIONS *
278 **************************/
280 r00 = _mm256_mul_pd(rsq00,rinv00);
282 /* Compute parameters for interactions between i and j atoms */
283 gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
284 vdwioffsetptr0+vdwjidx0B,
285 vdwioffsetptr0+vdwjidx0C,
286 vdwioffsetptr0+vdwjidx0D,
287 &c6_00,&c12_00);
289 c6grid_00 = gmx_mm256_load_4real_swizzle_pd(vdwgridioffsetptr0+vdwjidx0A,
290 vdwgridioffsetptr0+vdwjidx0B,
291 vdwgridioffsetptr0+vdwjidx0C,
292 vdwgridioffsetptr0+vdwjidx0D);
294 /* Analytical LJ-PME */
295 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
296 ewcljrsq = _mm256_mul_pd(ewclj2,rsq00);
297 ewclj6 = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
298 exponent = gmx_simd_exp_d(ewcljrsq);
299 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
300 poly = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
301 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
302 vvdw6 = _mm256_mul_pd(_mm256_sub_pd(c6_00,_mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly))),rinvsix);
303 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
304 vvdw = _mm256_sub_pd(_mm256_mul_pd(vvdw12,one_twelfth),_mm256_mul_pd(vvdw6,one_sixth));
305 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
306 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,_mm256_sub_pd(vvdw6,_mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6)))),rinvsq00);
308 /* Update potential sum for this i atom from the interaction with this j atom. */
309 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
311 fscal = fvdw;
313 /* Calculate temporary vectorial force */
314 tx = _mm256_mul_pd(fscal,dx00);
315 ty = _mm256_mul_pd(fscal,dy00);
316 tz = _mm256_mul_pd(fscal,dz00);
318 /* Update vectorial force */
319 fix0 = _mm256_add_pd(fix0,tx);
320 fiy0 = _mm256_add_pd(fiy0,ty);
321 fiz0 = _mm256_add_pd(fiz0,tz);
323 fjx0 = _mm256_add_pd(fjx0,tx);
324 fjy0 = _mm256_add_pd(fjy0,ty);
325 fjz0 = _mm256_add_pd(fjz0,tz);
327 /**************************
328 * CALCULATE INTERACTIONS *
329 **************************/
331 r10 = _mm256_mul_pd(rsq10,rinv10);
333 /* Compute parameters for interactions between i and j atoms */
334 qq10 = _mm256_mul_pd(iq1,jq0);
336 /* EWALD ELECTROSTATICS */
338 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
339 ewrt = _mm256_mul_pd(r10,ewtabscale);
340 ewitab = _mm256_cvttpd_epi32(ewrt);
341 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
342 ewitab = _mm_slli_epi32(ewitab,2);
343 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
344 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
345 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
346 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
347 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
348 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
349 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
350 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
351 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
353 /* Update potential sum for this i atom from the interaction with this j atom. */
354 velecsum = _mm256_add_pd(velecsum,velec);
356 fscal = felec;
358 /* Calculate temporary vectorial force */
359 tx = _mm256_mul_pd(fscal,dx10);
360 ty = _mm256_mul_pd(fscal,dy10);
361 tz = _mm256_mul_pd(fscal,dz10);
363 /* Update vectorial force */
364 fix1 = _mm256_add_pd(fix1,tx);
365 fiy1 = _mm256_add_pd(fiy1,ty);
366 fiz1 = _mm256_add_pd(fiz1,tz);
368 fjx0 = _mm256_add_pd(fjx0,tx);
369 fjy0 = _mm256_add_pd(fjy0,ty);
370 fjz0 = _mm256_add_pd(fjz0,tz);
372 /**************************
373 * CALCULATE INTERACTIONS *
374 **************************/
376 r20 = _mm256_mul_pd(rsq20,rinv20);
378 /* Compute parameters for interactions between i and j atoms */
379 qq20 = _mm256_mul_pd(iq2,jq0);
381 /* EWALD ELECTROSTATICS */
383 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
384 ewrt = _mm256_mul_pd(r20,ewtabscale);
385 ewitab = _mm256_cvttpd_epi32(ewrt);
386 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
387 ewitab = _mm_slli_epi32(ewitab,2);
388 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
389 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
390 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
391 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
392 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
393 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
394 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
395 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
396 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
398 /* Update potential sum for this i atom from the interaction with this j atom. */
399 velecsum = _mm256_add_pd(velecsum,velec);
401 fscal = felec;
403 /* Calculate temporary vectorial force */
404 tx = _mm256_mul_pd(fscal,dx20);
405 ty = _mm256_mul_pd(fscal,dy20);
406 tz = _mm256_mul_pd(fscal,dz20);
408 /* Update vectorial force */
409 fix2 = _mm256_add_pd(fix2,tx);
410 fiy2 = _mm256_add_pd(fiy2,ty);
411 fiz2 = _mm256_add_pd(fiz2,tz);
413 fjx0 = _mm256_add_pd(fjx0,tx);
414 fjy0 = _mm256_add_pd(fjy0,ty);
415 fjz0 = _mm256_add_pd(fjz0,tz);
417 /**************************
418 * CALCULATE INTERACTIONS *
419 **************************/
421 r30 = _mm256_mul_pd(rsq30,rinv30);
423 /* Compute parameters for interactions between i and j atoms */
424 qq30 = _mm256_mul_pd(iq3,jq0);
426 /* EWALD ELECTROSTATICS */
428 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
429 ewrt = _mm256_mul_pd(r30,ewtabscale);
430 ewitab = _mm256_cvttpd_epi32(ewrt);
431 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
432 ewitab = _mm_slli_epi32(ewitab,2);
433 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
434 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
435 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
436 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
437 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
438 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
439 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
440 velec = _mm256_mul_pd(qq30,_mm256_sub_pd(rinv30,velec));
441 felec = _mm256_mul_pd(_mm256_mul_pd(qq30,rinv30),_mm256_sub_pd(rinvsq30,felec));
443 /* Update potential sum for this i atom from the interaction with this j atom. */
444 velecsum = _mm256_add_pd(velecsum,velec);
446 fscal = felec;
448 /* Calculate temporary vectorial force */
449 tx = _mm256_mul_pd(fscal,dx30);
450 ty = _mm256_mul_pd(fscal,dy30);
451 tz = _mm256_mul_pd(fscal,dz30);
453 /* Update vectorial force */
454 fix3 = _mm256_add_pd(fix3,tx);
455 fiy3 = _mm256_add_pd(fiy3,ty);
456 fiz3 = _mm256_add_pd(fiz3,tz);
458 fjx0 = _mm256_add_pd(fjx0,tx);
459 fjy0 = _mm256_add_pd(fjy0,ty);
460 fjz0 = _mm256_add_pd(fjz0,tz);
462 fjptrA = f+j_coord_offsetA;
463 fjptrB = f+j_coord_offsetB;
464 fjptrC = f+j_coord_offsetC;
465 fjptrD = f+j_coord_offsetD;
467 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
469 /* Inner loop uses 180 flops */
472 if(jidx<j_index_end)
475 /* Get j neighbor index, and coordinate index */
476 jnrlistA = jjnr[jidx];
477 jnrlistB = jjnr[jidx+1];
478 jnrlistC = jjnr[jidx+2];
479 jnrlistD = jjnr[jidx+3];
480 /* Sign of each element will be negative for non-real atoms.
481 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
482 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
484 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
486 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
487 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
488 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
490 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
491 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
492 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
493 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
494 j_coord_offsetA = DIM*jnrA;
495 j_coord_offsetB = DIM*jnrB;
496 j_coord_offsetC = DIM*jnrC;
497 j_coord_offsetD = DIM*jnrD;
499 /* load j atom coordinates */
500 gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
501 x+j_coord_offsetC,x+j_coord_offsetD,
502 &jx0,&jy0,&jz0);
504 /* Calculate displacement vector */
505 dx00 = _mm256_sub_pd(ix0,jx0);
506 dy00 = _mm256_sub_pd(iy0,jy0);
507 dz00 = _mm256_sub_pd(iz0,jz0);
508 dx10 = _mm256_sub_pd(ix1,jx0);
509 dy10 = _mm256_sub_pd(iy1,jy0);
510 dz10 = _mm256_sub_pd(iz1,jz0);
511 dx20 = _mm256_sub_pd(ix2,jx0);
512 dy20 = _mm256_sub_pd(iy2,jy0);
513 dz20 = _mm256_sub_pd(iz2,jz0);
514 dx30 = _mm256_sub_pd(ix3,jx0);
515 dy30 = _mm256_sub_pd(iy3,jy0);
516 dz30 = _mm256_sub_pd(iz3,jz0);
518 /* Calculate squared distance and things based on it */
519 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
520 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
521 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
522 rsq30 = gmx_mm256_calc_rsq_pd(dx30,dy30,dz30);
524 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
525 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
526 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
527 rinv30 = gmx_mm256_invsqrt_pd(rsq30);
529 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
530 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
531 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
532 rinvsq30 = _mm256_mul_pd(rinv30,rinv30);
534 /* Load parameters for j particles */
535 jq0 = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
536 charge+jnrC+0,charge+jnrD+0);
537 vdwjidx0A = 2*vdwtype[jnrA+0];
538 vdwjidx0B = 2*vdwtype[jnrB+0];
539 vdwjidx0C = 2*vdwtype[jnrC+0];
540 vdwjidx0D = 2*vdwtype[jnrD+0];
542 fjx0 = _mm256_setzero_pd();
543 fjy0 = _mm256_setzero_pd();
544 fjz0 = _mm256_setzero_pd();
546 /**************************
547 * CALCULATE INTERACTIONS *
548 **************************/
550 r00 = _mm256_mul_pd(rsq00,rinv00);
551 r00 = _mm256_andnot_pd(dummy_mask,r00);
553 /* Compute parameters for interactions between i and j atoms */
554 gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
555 vdwioffsetptr0+vdwjidx0B,
556 vdwioffsetptr0+vdwjidx0C,
557 vdwioffsetptr0+vdwjidx0D,
558 &c6_00,&c12_00);
560 c6grid_00 = gmx_mm256_load_4real_swizzle_pd(vdwgridioffsetptr0+vdwjidx0A,
561 vdwgridioffsetptr0+vdwjidx0B,
562 vdwgridioffsetptr0+vdwjidx0C,
563 vdwgridioffsetptr0+vdwjidx0D);
565 /* Analytical LJ-PME */
566 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
567 ewcljrsq = _mm256_mul_pd(ewclj2,rsq00);
568 ewclj6 = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
569 exponent = gmx_simd_exp_d(ewcljrsq);
570 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
571 poly = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
572 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
573 vvdw6 = _mm256_mul_pd(_mm256_sub_pd(c6_00,_mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly))),rinvsix);
574 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
575 vvdw = _mm256_sub_pd(_mm256_mul_pd(vvdw12,one_twelfth),_mm256_mul_pd(vvdw6,one_sixth));
576 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
577 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,_mm256_sub_pd(vvdw6,_mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6)))),rinvsq00);
579 /* Update potential sum for this i atom from the interaction with this j atom. */
580 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
581 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
583 fscal = fvdw;
585 fscal = _mm256_andnot_pd(dummy_mask,fscal);
587 /* Calculate temporary vectorial force */
588 tx = _mm256_mul_pd(fscal,dx00);
589 ty = _mm256_mul_pd(fscal,dy00);
590 tz = _mm256_mul_pd(fscal,dz00);
592 /* Update vectorial force */
593 fix0 = _mm256_add_pd(fix0,tx);
594 fiy0 = _mm256_add_pd(fiy0,ty);
595 fiz0 = _mm256_add_pd(fiz0,tz);
597 fjx0 = _mm256_add_pd(fjx0,tx);
598 fjy0 = _mm256_add_pd(fjy0,ty);
599 fjz0 = _mm256_add_pd(fjz0,tz);
601 /**************************
602 * CALCULATE INTERACTIONS *
603 **************************/
605 r10 = _mm256_mul_pd(rsq10,rinv10);
606 r10 = _mm256_andnot_pd(dummy_mask,r10);
608 /* Compute parameters for interactions between i and j atoms */
609 qq10 = _mm256_mul_pd(iq1,jq0);
611 /* EWALD ELECTROSTATICS */
613 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
614 ewrt = _mm256_mul_pd(r10,ewtabscale);
615 ewitab = _mm256_cvttpd_epi32(ewrt);
616 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
617 ewitab = _mm_slli_epi32(ewitab,2);
618 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
619 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
620 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
621 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
622 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
623 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
624 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
625 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
626 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
628 /* Update potential sum for this i atom from the interaction with this j atom. */
629 velec = _mm256_andnot_pd(dummy_mask,velec);
630 velecsum = _mm256_add_pd(velecsum,velec);
632 fscal = felec;
634 fscal = _mm256_andnot_pd(dummy_mask,fscal);
636 /* Calculate temporary vectorial force */
637 tx = _mm256_mul_pd(fscal,dx10);
638 ty = _mm256_mul_pd(fscal,dy10);
639 tz = _mm256_mul_pd(fscal,dz10);
641 /* Update vectorial force */
642 fix1 = _mm256_add_pd(fix1,tx);
643 fiy1 = _mm256_add_pd(fiy1,ty);
644 fiz1 = _mm256_add_pd(fiz1,tz);
646 fjx0 = _mm256_add_pd(fjx0,tx);
647 fjy0 = _mm256_add_pd(fjy0,ty);
648 fjz0 = _mm256_add_pd(fjz0,tz);
650 /**************************
651 * CALCULATE INTERACTIONS *
652 **************************/
654 r20 = _mm256_mul_pd(rsq20,rinv20);
655 r20 = _mm256_andnot_pd(dummy_mask,r20);
657 /* Compute parameters for interactions between i and j atoms */
658 qq20 = _mm256_mul_pd(iq2,jq0);
660 /* EWALD ELECTROSTATICS */
662 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
663 ewrt = _mm256_mul_pd(r20,ewtabscale);
664 ewitab = _mm256_cvttpd_epi32(ewrt);
665 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
666 ewitab = _mm_slli_epi32(ewitab,2);
667 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
668 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
669 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
670 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
671 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
672 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
673 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
674 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
675 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
677 /* Update potential sum for this i atom from the interaction with this j atom. */
678 velec = _mm256_andnot_pd(dummy_mask,velec);
679 velecsum = _mm256_add_pd(velecsum,velec);
681 fscal = felec;
683 fscal = _mm256_andnot_pd(dummy_mask,fscal);
685 /* Calculate temporary vectorial force */
686 tx = _mm256_mul_pd(fscal,dx20);
687 ty = _mm256_mul_pd(fscal,dy20);
688 tz = _mm256_mul_pd(fscal,dz20);
690 /* Update vectorial force */
691 fix2 = _mm256_add_pd(fix2,tx);
692 fiy2 = _mm256_add_pd(fiy2,ty);
693 fiz2 = _mm256_add_pd(fiz2,tz);
695 fjx0 = _mm256_add_pd(fjx0,tx);
696 fjy0 = _mm256_add_pd(fjy0,ty);
697 fjz0 = _mm256_add_pd(fjz0,tz);
699 /**************************
700 * CALCULATE INTERACTIONS *
701 **************************/
703 r30 = _mm256_mul_pd(rsq30,rinv30);
704 r30 = _mm256_andnot_pd(dummy_mask,r30);
706 /* Compute parameters for interactions between i and j atoms */
707 qq30 = _mm256_mul_pd(iq3,jq0);
709 /* EWALD ELECTROSTATICS */
711 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
712 ewrt = _mm256_mul_pd(r30,ewtabscale);
713 ewitab = _mm256_cvttpd_epi32(ewrt);
714 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
715 ewitab = _mm_slli_epi32(ewitab,2);
716 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
717 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
718 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
719 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
720 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
721 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
722 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
723 velec = _mm256_mul_pd(qq30,_mm256_sub_pd(rinv30,velec));
724 felec = _mm256_mul_pd(_mm256_mul_pd(qq30,rinv30),_mm256_sub_pd(rinvsq30,felec));
726 /* Update potential sum for this i atom from the interaction with this j atom. */
727 velec = _mm256_andnot_pd(dummy_mask,velec);
728 velecsum = _mm256_add_pd(velecsum,velec);
730 fscal = felec;
732 fscal = _mm256_andnot_pd(dummy_mask,fscal);
734 /* Calculate temporary vectorial force */
735 tx = _mm256_mul_pd(fscal,dx30);
736 ty = _mm256_mul_pd(fscal,dy30);
737 tz = _mm256_mul_pd(fscal,dz30);
739 /* Update vectorial force */
740 fix3 = _mm256_add_pd(fix3,tx);
741 fiy3 = _mm256_add_pd(fiy3,ty);
742 fiz3 = _mm256_add_pd(fiz3,tz);
744 fjx0 = _mm256_add_pd(fjx0,tx);
745 fjy0 = _mm256_add_pd(fjy0,ty);
746 fjz0 = _mm256_add_pd(fjz0,tz);
748 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
749 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
750 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
751 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
753 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
755 /* Inner loop uses 184 flops */
758 /* End of innermost loop */
760 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
761 f+i_coord_offset,fshift+i_shift_offset);
763 ggid = gid[iidx];
764 /* Update potential energies */
765 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
766 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
768 /* Increment number of inner iterations */
769 inneriter += j_index_end - j_index_start;
771 /* Outer loop uses 26 flops */
774 /* Increment number of outer iterations */
775 outeriter += nri;
777 /* Update outer/inner flops */
779 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*184);
782 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_avx_256_double
783 * Electrostatics interaction: Ewald
784 * VdW interaction: LJEwald
785 * Geometry: Water4-Particle
786 * Calculate force/pot: Force
788 void
789 nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_avx_256_double
790 (t_nblist * gmx_restrict nlist,
791 rvec * gmx_restrict xx,
792 rvec * gmx_restrict ff,
793 t_forcerec * gmx_restrict fr,
794 t_mdatoms * gmx_restrict mdatoms,
795 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
796 t_nrnb * gmx_restrict nrnb)
798 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
799 * just 0 for non-waters.
800 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
801 * jnr indices corresponding to data put in the four positions in the SIMD register.
803 int i_shift_offset,i_coord_offset,outeriter,inneriter;
804 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
805 int jnrA,jnrB,jnrC,jnrD;
806 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
807 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
808 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
809 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
810 real rcutoff_scalar;
811 real *shiftvec,*fshift,*x,*f;
812 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
813 real scratch[4*DIM];
814 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
815 real * vdwioffsetptr0;
816 real * vdwgridioffsetptr0;
817 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
818 real * vdwioffsetptr1;
819 real * vdwgridioffsetptr1;
820 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
821 real * vdwioffsetptr2;
822 real * vdwgridioffsetptr2;
823 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
824 real * vdwioffsetptr3;
825 real * vdwgridioffsetptr3;
826 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
827 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
828 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
829 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
830 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
831 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
832 __m256d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
833 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
834 real *charge;
835 int nvdwtype;
836 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
837 int *vdwtype;
838 real *vdwparam;
839 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
840 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
841 __m256d c6grid_00;
842 __m256d c6grid_10;
843 __m256d c6grid_20;
844 __m256d c6grid_30;
845 real *vdwgridparam;
846 __m256d ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
847 __m256d one_half = _mm256_set1_pd(0.5);
848 __m256d minus_one = _mm256_set1_pd(-1.0);
849 __m128i ewitab;
850 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
851 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
852 real *ewtab;
853 __m256d dummy_mask,cutoff_mask;
854 __m128 tmpmask0,tmpmask1;
855 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
856 __m256d one = _mm256_set1_pd(1.0);
857 __m256d two = _mm256_set1_pd(2.0);
858 x = xx[0];
859 f = ff[0];
861 nri = nlist->nri;
862 iinr = nlist->iinr;
863 jindex = nlist->jindex;
864 jjnr = nlist->jjnr;
865 shiftidx = nlist->shift;
866 gid = nlist->gid;
867 shiftvec = fr->shift_vec[0];
868 fshift = fr->fshift[0];
869 facel = _mm256_set1_pd(fr->epsfac);
870 charge = mdatoms->chargeA;
871 nvdwtype = fr->ntype;
872 vdwparam = fr->nbfp;
873 vdwtype = mdatoms->typeA;
874 vdwgridparam = fr->ljpme_c6grid;
875 sh_lj_ewald = _mm256_set1_pd(fr->ic->sh_lj_ewald);
876 ewclj = _mm256_set1_pd(fr->ewaldcoeff_lj);
877 ewclj2 = _mm256_mul_pd(minus_one,_mm256_mul_pd(ewclj,ewclj));
879 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
880 beta = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
881 beta2 = _mm256_mul_pd(beta,beta);
882 beta3 = _mm256_mul_pd(beta,beta2);
884 ewtab = fr->ic->tabq_coul_F;
885 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
886 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
888 /* Setup water-specific parameters */
889 inr = nlist->iinr[0];
890 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
891 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
892 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
893 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
894 vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
896 /* Avoid stupid compiler warnings */
897 jnrA = jnrB = jnrC = jnrD = 0;
898 j_coord_offsetA = 0;
899 j_coord_offsetB = 0;
900 j_coord_offsetC = 0;
901 j_coord_offsetD = 0;
903 outeriter = 0;
904 inneriter = 0;
906 for(iidx=0;iidx<4*DIM;iidx++)
908 scratch[iidx] = 0.0;
911 /* Start outer loop over neighborlists */
912 for(iidx=0; iidx<nri; iidx++)
914 /* Load shift vector for this list */
915 i_shift_offset = DIM*shiftidx[iidx];
917 /* Load limits for loop over neighbors */
918 j_index_start = jindex[iidx];
919 j_index_end = jindex[iidx+1];
921 /* Get outer coordinate index */
922 inr = iinr[iidx];
923 i_coord_offset = DIM*inr;
925 /* Load i particle coords and add shift vector */
926 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
927 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
929 fix0 = _mm256_setzero_pd();
930 fiy0 = _mm256_setzero_pd();
931 fiz0 = _mm256_setzero_pd();
932 fix1 = _mm256_setzero_pd();
933 fiy1 = _mm256_setzero_pd();
934 fiz1 = _mm256_setzero_pd();
935 fix2 = _mm256_setzero_pd();
936 fiy2 = _mm256_setzero_pd();
937 fiz2 = _mm256_setzero_pd();
938 fix3 = _mm256_setzero_pd();
939 fiy3 = _mm256_setzero_pd();
940 fiz3 = _mm256_setzero_pd();
942 /* Start inner kernel loop */
943 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
946 /* Get j neighbor index, and coordinate index */
947 jnrA = jjnr[jidx];
948 jnrB = jjnr[jidx+1];
949 jnrC = jjnr[jidx+2];
950 jnrD = jjnr[jidx+3];
951 j_coord_offsetA = DIM*jnrA;
952 j_coord_offsetB = DIM*jnrB;
953 j_coord_offsetC = DIM*jnrC;
954 j_coord_offsetD = DIM*jnrD;
956 /* load j atom coordinates */
957 gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
958 x+j_coord_offsetC,x+j_coord_offsetD,
959 &jx0,&jy0,&jz0);
961 /* Calculate displacement vector */
962 dx00 = _mm256_sub_pd(ix0,jx0);
963 dy00 = _mm256_sub_pd(iy0,jy0);
964 dz00 = _mm256_sub_pd(iz0,jz0);
965 dx10 = _mm256_sub_pd(ix1,jx0);
966 dy10 = _mm256_sub_pd(iy1,jy0);
967 dz10 = _mm256_sub_pd(iz1,jz0);
968 dx20 = _mm256_sub_pd(ix2,jx0);
969 dy20 = _mm256_sub_pd(iy2,jy0);
970 dz20 = _mm256_sub_pd(iz2,jz0);
971 dx30 = _mm256_sub_pd(ix3,jx0);
972 dy30 = _mm256_sub_pd(iy3,jy0);
973 dz30 = _mm256_sub_pd(iz3,jz0);
975 /* Calculate squared distance and things based on it */
976 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
977 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
978 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
979 rsq30 = gmx_mm256_calc_rsq_pd(dx30,dy30,dz30);
981 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
982 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
983 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
984 rinv30 = gmx_mm256_invsqrt_pd(rsq30);
986 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
987 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
988 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
989 rinvsq30 = _mm256_mul_pd(rinv30,rinv30);
991 /* Load parameters for j particles */
992 jq0 = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
993 charge+jnrC+0,charge+jnrD+0);
994 vdwjidx0A = 2*vdwtype[jnrA+0];
995 vdwjidx0B = 2*vdwtype[jnrB+0];
996 vdwjidx0C = 2*vdwtype[jnrC+0];
997 vdwjidx0D = 2*vdwtype[jnrD+0];
999 fjx0 = _mm256_setzero_pd();
1000 fjy0 = _mm256_setzero_pd();
1001 fjz0 = _mm256_setzero_pd();
1003 /**************************
1004 * CALCULATE INTERACTIONS *
1005 **************************/
1007 r00 = _mm256_mul_pd(rsq00,rinv00);
1009 /* Compute parameters for interactions between i and j atoms */
1010 gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
1011 vdwioffsetptr0+vdwjidx0B,
1012 vdwioffsetptr0+vdwjidx0C,
1013 vdwioffsetptr0+vdwjidx0D,
1014 &c6_00,&c12_00);
1016 c6grid_00 = gmx_mm256_load_4real_swizzle_pd(vdwgridioffsetptr0+vdwjidx0A,
1017 vdwgridioffsetptr0+vdwjidx0B,
1018 vdwgridioffsetptr0+vdwjidx0C,
1019 vdwgridioffsetptr0+vdwjidx0D);
1021 /* Analytical LJ-PME */
1022 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1023 ewcljrsq = _mm256_mul_pd(ewclj2,rsq00);
1024 ewclj6 = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
1025 exponent = gmx_simd_exp_d(ewcljrsq);
1026 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1027 poly = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
1028 /* f6A = 6 * C6grid * (1 - poly) */
1029 f6A = _mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly));
1030 /* f6B = C6grid * exponent * beta^6 */
1031 f6B = _mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6));
1032 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1033 fvdw = _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),_mm256_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
1035 fscal = fvdw;
1037 /* Calculate temporary vectorial force */
1038 tx = _mm256_mul_pd(fscal,dx00);
1039 ty = _mm256_mul_pd(fscal,dy00);
1040 tz = _mm256_mul_pd(fscal,dz00);
1042 /* Update vectorial force */
1043 fix0 = _mm256_add_pd(fix0,tx);
1044 fiy0 = _mm256_add_pd(fiy0,ty);
1045 fiz0 = _mm256_add_pd(fiz0,tz);
1047 fjx0 = _mm256_add_pd(fjx0,tx);
1048 fjy0 = _mm256_add_pd(fjy0,ty);
1049 fjz0 = _mm256_add_pd(fjz0,tz);
1051 /**************************
1052 * CALCULATE INTERACTIONS *
1053 **************************/
1055 r10 = _mm256_mul_pd(rsq10,rinv10);
1057 /* Compute parameters for interactions between i and j atoms */
1058 qq10 = _mm256_mul_pd(iq1,jq0);
1060 /* EWALD ELECTROSTATICS */
1062 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1063 ewrt = _mm256_mul_pd(r10,ewtabscale);
1064 ewitab = _mm256_cvttpd_epi32(ewrt);
1065 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1066 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1067 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1068 &ewtabF,&ewtabFn);
1069 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1070 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1072 fscal = felec;
1074 /* Calculate temporary vectorial force */
1075 tx = _mm256_mul_pd(fscal,dx10);
1076 ty = _mm256_mul_pd(fscal,dy10);
1077 tz = _mm256_mul_pd(fscal,dz10);
1079 /* Update vectorial force */
1080 fix1 = _mm256_add_pd(fix1,tx);
1081 fiy1 = _mm256_add_pd(fiy1,ty);
1082 fiz1 = _mm256_add_pd(fiz1,tz);
1084 fjx0 = _mm256_add_pd(fjx0,tx);
1085 fjy0 = _mm256_add_pd(fjy0,ty);
1086 fjz0 = _mm256_add_pd(fjz0,tz);
1088 /**************************
1089 * CALCULATE INTERACTIONS *
1090 **************************/
1092 r20 = _mm256_mul_pd(rsq20,rinv20);
1094 /* Compute parameters for interactions between i and j atoms */
1095 qq20 = _mm256_mul_pd(iq2,jq0);
1097 /* EWALD ELECTROSTATICS */
1099 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1100 ewrt = _mm256_mul_pd(r20,ewtabscale);
1101 ewitab = _mm256_cvttpd_epi32(ewrt);
1102 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1103 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1104 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1105 &ewtabF,&ewtabFn);
1106 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1107 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1109 fscal = felec;
1111 /* Calculate temporary vectorial force */
1112 tx = _mm256_mul_pd(fscal,dx20);
1113 ty = _mm256_mul_pd(fscal,dy20);
1114 tz = _mm256_mul_pd(fscal,dz20);
1116 /* Update vectorial force */
1117 fix2 = _mm256_add_pd(fix2,tx);
1118 fiy2 = _mm256_add_pd(fiy2,ty);
1119 fiz2 = _mm256_add_pd(fiz2,tz);
1121 fjx0 = _mm256_add_pd(fjx0,tx);
1122 fjy0 = _mm256_add_pd(fjy0,ty);
1123 fjz0 = _mm256_add_pd(fjz0,tz);
1125 /**************************
1126 * CALCULATE INTERACTIONS *
1127 **************************/
1129 r30 = _mm256_mul_pd(rsq30,rinv30);
1131 /* Compute parameters for interactions between i and j atoms */
1132 qq30 = _mm256_mul_pd(iq3,jq0);
1134 /* EWALD ELECTROSTATICS */
1136 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1137 ewrt = _mm256_mul_pd(r30,ewtabscale);
1138 ewitab = _mm256_cvttpd_epi32(ewrt);
1139 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1140 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1141 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1142 &ewtabF,&ewtabFn);
1143 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1144 felec = _mm256_mul_pd(_mm256_mul_pd(qq30,rinv30),_mm256_sub_pd(rinvsq30,felec));
1146 fscal = felec;
1148 /* Calculate temporary vectorial force */
1149 tx = _mm256_mul_pd(fscal,dx30);
1150 ty = _mm256_mul_pd(fscal,dy30);
1151 tz = _mm256_mul_pd(fscal,dz30);
1153 /* Update vectorial force */
1154 fix3 = _mm256_add_pd(fix3,tx);
1155 fiy3 = _mm256_add_pd(fiy3,ty);
1156 fiz3 = _mm256_add_pd(fiz3,tz);
1158 fjx0 = _mm256_add_pd(fjx0,tx);
1159 fjy0 = _mm256_add_pd(fjy0,ty);
1160 fjz0 = _mm256_add_pd(fjz0,tz);
1162 fjptrA = f+j_coord_offsetA;
1163 fjptrB = f+j_coord_offsetB;
1164 fjptrC = f+j_coord_offsetC;
1165 fjptrD = f+j_coord_offsetD;
1167 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1169 /* Inner loop uses 157 flops */
1172 if(jidx<j_index_end)
1175 /* Get j neighbor index, and coordinate index */
1176 jnrlistA = jjnr[jidx];
1177 jnrlistB = jjnr[jidx+1];
1178 jnrlistC = jjnr[jidx+2];
1179 jnrlistD = jjnr[jidx+3];
1180 /* Sign of each element will be negative for non-real atoms.
1181 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1182 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
1184 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1186 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1187 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1188 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1190 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1191 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1192 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1193 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1194 j_coord_offsetA = DIM*jnrA;
1195 j_coord_offsetB = DIM*jnrB;
1196 j_coord_offsetC = DIM*jnrC;
1197 j_coord_offsetD = DIM*jnrD;
1199 /* load j atom coordinates */
1200 gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1201 x+j_coord_offsetC,x+j_coord_offsetD,
1202 &jx0,&jy0,&jz0);
1204 /* Calculate displacement vector */
1205 dx00 = _mm256_sub_pd(ix0,jx0);
1206 dy00 = _mm256_sub_pd(iy0,jy0);
1207 dz00 = _mm256_sub_pd(iz0,jz0);
1208 dx10 = _mm256_sub_pd(ix1,jx0);
1209 dy10 = _mm256_sub_pd(iy1,jy0);
1210 dz10 = _mm256_sub_pd(iz1,jz0);
1211 dx20 = _mm256_sub_pd(ix2,jx0);
1212 dy20 = _mm256_sub_pd(iy2,jy0);
1213 dz20 = _mm256_sub_pd(iz2,jz0);
1214 dx30 = _mm256_sub_pd(ix3,jx0);
1215 dy30 = _mm256_sub_pd(iy3,jy0);
1216 dz30 = _mm256_sub_pd(iz3,jz0);
1218 /* Calculate squared distance and things based on it */
1219 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1220 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1221 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1222 rsq30 = gmx_mm256_calc_rsq_pd(dx30,dy30,dz30);
1224 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1225 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
1226 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
1227 rinv30 = gmx_mm256_invsqrt_pd(rsq30);
1229 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1230 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1231 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1232 rinvsq30 = _mm256_mul_pd(rinv30,rinv30);
1234 /* Load parameters for j particles */
1235 jq0 = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
1236 charge+jnrC+0,charge+jnrD+0);
1237 vdwjidx0A = 2*vdwtype[jnrA+0];
1238 vdwjidx0B = 2*vdwtype[jnrB+0];
1239 vdwjidx0C = 2*vdwtype[jnrC+0];
1240 vdwjidx0D = 2*vdwtype[jnrD+0];
1242 fjx0 = _mm256_setzero_pd();
1243 fjy0 = _mm256_setzero_pd();
1244 fjz0 = _mm256_setzero_pd();
1246 /**************************
1247 * CALCULATE INTERACTIONS *
1248 **************************/
1250 r00 = _mm256_mul_pd(rsq00,rinv00);
1251 r00 = _mm256_andnot_pd(dummy_mask,r00);
1253 /* Compute parameters for interactions between i and j atoms */
1254 gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
1255 vdwioffsetptr0+vdwjidx0B,
1256 vdwioffsetptr0+vdwjidx0C,
1257 vdwioffsetptr0+vdwjidx0D,
1258 &c6_00,&c12_00);
1260 c6grid_00 = gmx_mm256_load_4real_swizzle_pd(vdwgridioffsetptr0+vdwjidx0A,
1261 vdwgridioffsetptr0+vdwjidx0B,
1262 vdwgridioffsetptr0+vdwjidx0C,
1263 vdwgridioffsetptr0+vdwjidx0D);
1265 /* Analytical LJ-PME */
1266 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1267 ewcljrsq = _mm256_mul_pd(ewclj2,rsq00);
1268 ewclj6 = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
1269 exponent = gmx_simd_exp_d(ewcljrsq);
1270 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1271 poly = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
1272 /* f6A = 6 * C6grid * (1 - poly) */
1273 f6A = _mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly));
1274 /* f6B = C6grid * exponent * beta^6 */
1275 f6B = _mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6));
1276 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1277 fvdw = _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),_mm256_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
1279 fscal = fvdw;
1281 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1283 /* Calculate temporary vectorial force */
1284 tx = _mm256_mul_pd(fscal,dx00);
1285 ty = _mm256_mul_pd(fscal,dy00);
1286 tz = _mm256_mul_pd(fscal,dz00);
1288 /* Update vectorial force */
1289 fix0 = _mm256_add_pd(fix0,tx);
1290 fiy0 = _mm256_add_pd(fiy0,ty);
1291 fiz0 = _mm256_add_pd(fiz0,tz);
1293 fjx0 = _mm256_add_pd(fjx0,tx);
1294 fjy0 = _mm256_add_pd(fjy0,ty);
1295 fjz0 = _mm256_add_pd(fjz0,tz);
1297 /**************************
1298 * CALCULATE INTERACTIONS *
1299 **************************/
1301 r10 = _mm256_mul_pd(rsq10,rinv10);
1302 r10 = _mm256_andnot_pd(dummy_mask,r10);
1304 /* Compute parameters for interactions between i and j atoms */
1305 qq10 = _mm256_mul_pd(iq1,jq0);
1307 /* EWALD ELECTROSTATICS */
1309 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1310 ewrt = _mm256_mul_pd(r10,ewtabscale);
1311 ewitab = _mm256_cvttpd_epi32(ewrt);
1312 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1313 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1314 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1315 &ewtabF,&ewtabFn);
1316 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1317 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1319 fscal = felec;
1321 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1323 /* Calculate temporary vectorial force */
1324 tx = _mm256_mul_pd(fscal,dx10);
1325 ty = _mm256_mul_pd(fscal,dy10);
1326 tz = _mm256_mul_pd(fscal,dz10);
1328 /* Update vectorial force */
1329 fix1 = _mm256_add_pd(fix1,tx);
1330 fiy1 = _mm256_add_pd(fiy1,ty);
1331 fiz1 = _mm256_add_pd(fiz1,tz);
1333 fjx0 = _mm256_add_pd(fjx0,tx);
1334 fjy0 = _mm256_add_pd(fjy0,ty);
1335 fjz0 = _mm256_add_pd(fjz0,tz);
1337 /**************************
1338 * CALCULATE INTERACTIONS *
1339 **************************/
1341 r20 = _mm256_mul_pd(rsq20,rinv20);
1342 r20 = _mm256_andnot_pd(dummy_mask,r20);
1344 /* Compute parameters for interactions between i and j atoms */
1345 qq20 = _mm256_mul_pd(iq2,jq0);
1347 /* EWALD ELECTROSTATICS */
1349 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1350 ewrt = _mm256_mul_pd(r20,ewtabscale);
1351 ewitab = _mm256_cvttpd_epi32(ewrt);
1352 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1353 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1354 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1355 &ewtabF,&ewtabFn);
1356 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1357 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1359 fscal = felec;
1361 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1363 /* Calculate temporary vectorial force */
1364 tx = _mm256_mul_pd(fscal,dx20);
1365 ty = _mm256_mul_pd(fscal,dy20);
1366 tz = _mm256_mul_pd(fscal,dz20);
1368 /* Update vectorial force */
1369 fix2 = _mm256_add_pd(fix2,tx);
1370 fiy2 = _mm256_add_pd(fiy2,ty);
1371 fiz2 = _mm256_add_pd(fiz2,tz);
1373 fjx0 = _mm256_add_pd(fjx0,tx);
1374 fjy0 = _mm256_add_pd(fjy0,ty);
1375 fjz0 = _mm256_add_pd(fjz0,tz);
1377 /**************************
1378 * CALCULATE INTERACTIONS *
1379 **************************/
1381 r30 = _mm256_mul_pd(rsq30,rinv30);
1382 r30 = _mm256_andnot_pd(dummy_mask,r30);
1384 /* Compute parameters for interactions between i and j atoms */
1385 qq30 = _mm256_mul_pd(iq3,jq0);
1387 /* EWALD ELECTROSTATICS */
1389 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1390 ewrt = _mm256_mul_pd(r30,ewtabscale);
1391 ewitab = _mm256_cvttpd_epi32(ewrt);
1392 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1393 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1394 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1395 &ewtabF,&ewtabFn);
1396 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1397 felec = _mm256_mul_pd(_mm256_mul_pd(qq30,rinv30),_mm256_sub_pd(rinvsq30,felec));
1399 fscal = felec;
1401 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1403 /* Calculate temporary vectorial force */
1404 tx = _mm256_mul_pd(fscal,dx30);
1405 ty = _mm256_mul_pd(fscal,dy30);
1406 tz = _mm256_mul_pd(fscal,dz30);
1408 /* Update vectorial force */
1409 fix3 = _mm256_add_pd(fix3,tx);
1410 fiy3 = _mm256_add_pd(fiy3,ty);
1411 fiz3 = _mm256_add_pd(fiz3,tz);
1413 fjx0 = _mm256_add_pd(fjx0,tx);
1414 fjy0 = _mm256_add_pd(fjy0,ty);
1415 fjz0 = _mm256_add_pd(fjz0,tz);
1417 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1418 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1419 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1420 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1422 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1424 /* Inner loop uses 161 flops */
1427 /* End of innermost loop */
1429 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1430 f+i_coord_offset,fshift+i_shift_offset);
1432 /* Increment number of inner iterations */
1433 inneriter += j_index_end - j_index_start;
1435 /* Outer loop uses 24 flops */
1438 /* Increment number of outer iterations */
1439 outeriter += nri;
1441 /* Update outer/inner flops */
1443 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*161);