Remove nb-parameters from t_forcerec
[gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_double / nb_kernel_ElecEw_VdwLJEw_GeomW4W4_avx_256_double.c
blob212654305a2c010244989e0009186e140eb3073a
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_double kernel generator.
38 #include "gmxpre.h"
40 #include "config.h"
42 #include <math.h>
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_avx_256_double.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_avx_256_double
51 * Electrostatics interaction: Ewald
52 * VdW interaction: LJEwald
53 * Geometry: Water4-Water4
54 * Calculate force/pot: PotentialAndForce
56 void
57 nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_avx_256_double
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73 int jnrA,jnrB,jnrC,jnrD;
74 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
76 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
78 real rcutoff_scalar;
79 real *shiftvec,*fshift,*x,*f;
80 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81 real scratch[4*DIM];
82 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83 real * vdwioffsetptr0;
84 real * vdwgridioffsetptr0;
85 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86 real * vdwioffsetptr1;
87 real * vdwgridioffsetptr1;
88 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
89 real * vdwioffsetptr2;
90 real * vdwgridioffsetptr2;
91 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
92 real * vdwioffsetptr3;
93 real * vdwgridioffsetptr3;
94 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
95 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
96 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
97 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
98 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
99 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
100 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
101 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
102 __m256d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
103 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
104 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
105 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
106 __m256d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
107 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
108 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
109 __m256d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
110 __m256d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
111 __m256d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
112 __m256d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
113 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
114 real *charge;
115 int nvdwtype;
116 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
117 int *vdwtype;
118 real *vdwparam;
119 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
120 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
121 __m256d c6grid_00;
122 __m256d c6grid_11;
123 __m256d c6grid_12;
124 __m256d c6grid_13;
125 __m256d c6grid_21;
126 __m256d c6grid_22;
127 __m256d c6grid_23;
128 __m256d c6grid_31;
129 __m256d c6grid_32;
130 __m256d c6grid_33;
131 real *vdwgridparam;
132 __m256d ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
133 __m256d one_half = _mm256_set1_pd(0.5);
134 __m256d minus_one = _mm256_set1_pd(-1.0);
135 __m128i ewitab;
136 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
137 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
138 real *ewtab;
139 __m256d dummy_mask,cutoff_mask;
140 __m128 tmpmask0,tmpmask1;
141 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
142 __m256d one = _mm256_set1_pd(1.0);
143 __m256d two = _mm256_set1_pd(2.0);
144 x = xx[0];
145 f = ff[0];
147 nri = nlist->nri;
148 iinr = nlist->iinr;
149 jindex = nlist->jindex;
150 jjnr = nlist->jjnr;
151 shiftidx = nlist->shift;
152 gid = nlist->gid;
153 shiftvec = fr->shift_vec[0];
154 fshift = fr->fshift[0];
155 facel = _mm256_set1_pd(fr->ic->epsfac);
156 charge = mdatoms->chargeA;
157 nvdwtype = fr->ntype;
158 vdwparam = fr->nbfp;
159 vdwtype = mdatoms->typeA;
160 vdwgridparam = fr->ljpme_c6grid;
161 sh_lj_ewald = _mm256_set1_pd(fr->ic->sh_lj_ewald);
162 ewclj = _mm256_set1_pd(fr->ic->ewaldcoeff_lj);
163 ewclj2 = _mm256_mul_pd(minus_one,_mm256_mul_pd(ewclj,ewclj));
165 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
166 beta = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
167 beta2 = _mm256_mul_pd(beta,beta);
168 beta3 = _mm256_mul_pd(beta,beta2);
170 ewtab = fr->ic->tabq_coul_FDV0;
171 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
172 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
174 /* Setup water-specific parameters */
175 inr = nlist->iinr[0];
176 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
177 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
178 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
179 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
180 vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
182 jq1 = _mm256_set1_pd(charge[inr+1]);
183 jq2 = _mm256_set1_pd(charge[inr+2]);
184 jq3 = _mm256_set1_pd(charge[inr+3]);
185 vdwjidx0A = 2*vdwtype[inr+0];
186 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
187 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
188 c6grid_00 = _mm256_set1_pd(vdwgridioffsetptr0[vdwjidx0A]);
189 qq11 = _mm256_mul_pd(iq1,jq1);
190 qq12 = _mm256_mul_pd(iq1,jq2);
191 qq13 = _mm256_mul_pd(iq1,jq3);
192 qq21 = _mm256_mul_pd(iq2,jq1);
193 qq22 = _mm256_mul_pd(iq2,jq2);
194 qq23 = _mm256_mul_pd(iq2,jq3);
195 qq31 = _mm256_mul_pd(iq3,jq1);
196 qq32 = _mm256_mul_pd(iq3,jq2);
197 qq33 = _mm256_mul_pd(iq3,jq3);
199 /* Avoid stupid compiler warnings */
200 jnrA = jnrB = jnrC = jnrD = 0;
201 j_coord_offsetA = 0;
202 j_coord_offsetB = 0;
203 j_coord_offsetC = 0;
204 j_coord_offsetD = 0;
206 outeriter = 0;
207 inneriter = 0;
209 for(iidx=0;iidx<4*DIM;iidx++)
211 scratch[iidx] = 0.0;
214 /* Start outer loop over neighborlists */
215 for(iidx=0; iidx<nri; iidx++)
217 /* Load shift vector for this list */
218 i_shift_offset = DIM*shiftidx[iidx];
220 /* Load limits for loop over neighbors */
221 j_index_start = jindex[iidx];
222 j_index_end = jindex[iidx+1];
224 /* Get outer coordinate index */
225 inr = iinr[iidx];
226 i_coord_offset = DIM*inr;
228 /* Load i particle coords and add shift vector */
229 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
230 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
232 fix0 = _mm256_setzero_pd();
233 fiy0 = _mm256_setzero_pd();
234 fiz0 = _mm256_setzero_pd();
235 fix1 = _mm256_setzero_pd();
236 fiy1 = _mm256_setzero_pd();
237 fiz1 = _mm256_setzero_pd();
238 fix2 = _mm256_setzero_pd();
239 fiy2 = _mm256_setzero_pd();
240 fiz2 = _mm256_setzero_pd();
241 fix3 = _mm256_setzero_pd();
242 fiy3 = _mm256_setzero_pd();
243 fiz3 = _mm256_setzero_pd();
245 /* Reset potential sums */
246 velecsum = _mm256_setzero_pd();
247 vvdwsum = _mm256_setzero_pd();
249 /* Start inner kernel loop */
250 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
253 /* Get j neighbor index, and coordinate index */
254 jnrA = jjnr[jidx];
255 jnrB = jjnr[jidx+1];
256 jnrC = jjnr[jidx+2];
257 jnrD = jjnr[jidx+3];
258 j_coord_offsetA = DIM*jnrA;
259 j_coord_offsetB = DIM*jnrB;
260 j_coord_offsetC = DIM*jnrC;
261 j_coord_offsetD = DIM*jnrD;
263 /* load j atom coordinates */
264 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
265 x+j_coord_offsetC,x+j_coord_offsetD,
266 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
267 &jy2,&jz2,&jx3,&jy3,&jz3);
269 /* Calculate displacement vector */
270 dx00 = _mm256_sub_pd(ix0,jx0);
271 dy00 = _mm256_sub_pd(iy0,jy0);
272 dz00 = _mm256_sub_pd(iz0,jz0);
273 dx11 = _mm256_sub_pd(ix1,jx1);
274 dy11 = _mm256_sub_pd(iy1,jy1);
275 dz11 = _mm256_sub_pd(iz1,jz1);
276 dx12 = _mm256_sub_pd(ix1,jx2);
277 dy12 = _mm256_sub_pd(iy1,jy2);
278 dz12 = _mm256_sub_pd(iz1,jz2);
279 dx13 = _mm256_sub_pd(ix1,jx3);
280 dy13 = _mm256_sub_pd(iy1,jy3);
281 dz13 = _mm256_sub_pd(iz1,jz3);
282 dx21 = _mm256_sub_pd(ix2,jx1);
283 dy21 = _mm256_sub_pd(iy2,jy1);
284 dz21 = _mm256_sub_pd(iz2,jz1);
285 dx22 = _mm256_sub_pd(ix2,jx2);
286 dy22 = _mm256_sub_pd(iy2,jy2);
287 dz22 = _mm256_sub_pd(iz2,jz2);
288 dx23 = _mm256_sub_pd(ix2,jx3);
289 dy23 = _mm256_sub_pd(iy2,jy3);
290 dz23 = _mm256_sub_pd(iz2,jz3);
291 dx31 = _mm256_sub_pd(ix3,jx1);
292 dy31 = _mm256_sub_pd(iy3,jy1);
293 dz31 = _mm256_sub_pd(iz3,jz1);
294 dx32 = _mm256_sub_pd(ix3,jx2);
295 dy32 = _mm256_sub_pd(iy3,jy2);
296 dz32 = _mm256_sub_pd(iz3,jz2);
297 dx33 = _mm256_sub_pd(ix3,jx3);
298 dy33 = _mm256_sub_pd(iy3,jy3);
299 dz33 = _mm256_sub_pd(iz3,jz3);
301 /* Calculate squared distance and things based on it */
302 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
303 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
304 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
305 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
306 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
307 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
308 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
309 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
310 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
311 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
313 rinv00 = avx256_invsqrt_d(rsq00);
314 rinv11 = avx256_invsqrt_d(rsq11);
315 rinv12 = avx256_invsqrt_d(rsq12);
316 rinv13 = avx256_invsqrt_d(rsq13);
317 rinv21 = avx256_invsqrt_d(rsq21);
318 rinv22 = avx256_invsqrt_d(rsq22);
319 rinv23 = avx256_invsqrt_d(rsq23);
320 rinv31 = avx256_invsqrt_d(rsq31);
321 rinv32 = avx256_invsqrt_d(rsq32);
322 rinv33 = avx256_invsqrt_d(rsq33);
324 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
325 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
326 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
327 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
328 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
329 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
330 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
331 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
332 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
333 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
335 fjx0 = _mm256_setzero_pd();
336 fjy0 = _mm256_setzero_pd();
337 fjz0 = _mm256_setzero_pd();
338 fjx1 = _mm256_setzero_pd();
339 fjy1 = _mm256_setzero_pd();
340 fjz1 = _mm256_setzero_pd();
341 fjx2 = _mm256_setzero_pd();
342 fjy2 = _mm256_setzero_pd();
343 fjz2 = _mm256_setzero_pd();
344 fjx3 = _mm256_setzero_pd();
345 fjy3 = _mm256_setzero_pd();
346 fjz3 = _mm256_setzero_pd();
348 /**************************
349 * CALCULATE INTERACTIONS *
350 **************************/
352 r00 = _mm256_mul_pd(rsq00,rinv00);
354 /* Analytical LJ-PME */
355 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
356 ewcljrsq = _mm256_mul_pd(ewclj2,rsq00);
357 ewclj6 = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
358 exponent = avx256_exp_d(ewcljrsq);
359 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
360 poly = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
361 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
362 vvdw6 = _mm256_mul_pd(_mm256_sub_pd(c6_00,_mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly))),rinvsix);
363 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
364 vvdw = _mm256_sub_pd(_mm256_mul_pd(vvdw12,one_twelfth),_mm256_mul_pd(vvdw6,one_sixth));
365 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
366 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,_mm256_sub_pd(vvdw6,_mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6)))),rinvsq00);
368 /* Update potential sum for this i atom from the interaction with this j atom. */
369 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
371 fscal = fvdw;
373 /* Calculate temporary vectorial force */
374 tx = _mm256_mul_pd(fscal,dx00);
375 ty = _mm256_mul_pd(fscal,dy00);
376 tz = _mm256_mul_pd(fscal,dz00);
378 /* Update vectorial force */
379 fix0 = _mm256_add_pd(fix0,tx);
380 fiy0 = _mm256_add_pd(fiy0,ty);
381 fiz0 = _mm256_add_pd(fiz0,tz);
383 fjx0 = _mm256_add_pd(fjx0,tx);
384 fjy0 = _mm256_add_pd(fjy0,ty);
385 fjz0 = _mm256_add_pd(fjz0,tz);
387 /**************************
388 * CALCULATE INTERACTIONS *
389 **************************/
391 r11 = _mm256_mul_pd(rsq11,rinv11);
393 /* EWALD ELECTROSTATICS */
395 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
396 ewrt = _mm256_mul_pd(r11,ewtabscale);
397 ewitab = _mm256_cvttpd_epi32(ewrt);
398 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
399 ewitab = _mm_slli_epi32(ewitab,2);
400 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
401 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
402 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
403 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
404 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
405 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
406 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
407 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
408 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
410 /* Update potential sum for this i atom from the interaction with this j atom. */
411 velecsum = _mm256_add_pd(velecsum,velec);
413 fscal = felec;
415 /* Calculate temporary vectorial force */
416 tx = _mm256_mul_pd(fscal,dx11);
417 ty = _mm256_mul_pd(fscal,dy11);
418 tz = _mm256_mul_pd(fscal,dz11);
420 /* Update vectorial force */
421 fix1 = _mm256_add_pd(fix1,tx);
422 fiy1 = _mm256_add_pd(fiy1,ty);
423 fiz1 = _mm256_add_pd(fiz1,tz);
425 fjx1 = _mm256_add_pd(fjx1,tx);
426 fjy1 = _mm256_add_pd(fjy1,ty);
427 fjz1 = _mm256_add_pd(fjz1,tz);
429 /**************************
430 * CALCULATE INTERACTIONS *
431 **************************/
433 r12 = _mm256_mul_pd(rsq12,rinv12);
435 /* EWALD ELECTROSTATICS */
437 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
438 ewrt = _mm256_mul_pd(r12,ewtabscale);
439 ewitab = _mm256_cvttpd_epi32(ewrt);
440 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
441 ewitab = _mm_slli_epi32(ewitab,2);
442 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
443 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
444 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
445 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
446 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
447 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
448 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
449 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
450 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
452 /* Update potential sum for this i atom from the interaction with this j atom. */
453 velecsum = _mm256_add_pd(velecsum,velec);
455 fscal = felec;
457 /* Calculate temporary vectorial force */
458 tx = _mm256_mul_pd(fscal,dx12);
459 ty = _mm256_mul_pd(fscal,dy12);
460 tz = _mm256_mul_pd(fscal,dz12);
462 /* Update vectorial force */
463 fix1 = _mm256_add_pd(fix1,tx);
464 fiy1 = _mm256_add_pd(fiy1,ty);
465 fiz1 = _mm256_add_pd(fiz1,tz);
467 fjx2 = _mm256_add_pd(fjx2,tx);
468 fjy2 = _mm256_add_pd(fjy2,ty);
469 fjz2 = _mm256_add_pd(fjz2,tz);
471 /**************************
472 * CALCULATE INTERACTIONS *
473 **************************/
475 r13 = _mm256_mul_pd(rsq13,rinv13);
477 /* EWALD ELECTROSTATICS */
479 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
480 ewrt = _mm256_mul_pd(r13,ewtabscale);
481 ewitab = _mm256_cvttpd_epi32(ewrt);
482 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
483 ewitab = _mm_slli_epi32(ewitab,2);
484 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
485 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
486 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
487 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
488 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
489 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
490 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
491 velec = _mm256_mul_pd(qq13,_mm256_sub_pd(rinv13,velec));
492 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
494 /* Update potential sum for this i atom from the interaction with this j atom. */
495 velecsum = _mm256_add_pd(velecsum,velec);
497 fscal = felec;
499 /* Calculate temporary vectorial force */
500 tx = _mm256_mul_pd(fscal,dx13);
501 ty = _mm256_mul_pd(fscal,dy13);
502 tz = _mm256_mul_pd(fscal,dz13);
504 /* Update vectorial force */
505 fix1 = _mm256_add_pd(fix1,tx);
506 fiy1 = _mm256_add_pd(fiy1,ty);
507 fiz1 = _mm256_add_pd(fiz1,tz);
509 fjx3 = _mm256_add_pd(fjx3,tx);
510 fjy3 = _mm256_add_pd(fjy3,ty);
511 fjz3 = _mm256_add_pd(fjz3,tz);
513 /**************************
514 * CALCULATE INTERACTIONS *
515 **************************/
517 r21 = _mm256_mul_pd(rsq21,rinv21);
519 /* EWALD ELECTROSTATICS */
521 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
522 ewrt = _mm256_mul_pd(r21,ewtabscale);
523 ewitab = _mm256_cvttpd_epi32(ewrt);
524 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
525 ewitab = _mm_slli_epi32(ewitab,2);
526 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
527 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
528 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
529 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
530 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
531 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
532 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
533 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
534 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
536 /* Update potential sum for this i atom from the interaction with this j atom. */
537 velecsum = _mm256_add_pd(velecsum,velec);
539 fscal = felec;
541 /* Calculate temporary vectorial force */
542 tx = _mm256_mul_pd(fscal,dx21);
543 ty = _mm256_mul_pd(fscal,dy21);
544 tz = _mm256_mul_pd(fscal,dz21);
546 /* Update vectorial force */
547 fix2 = _mm256_add_pd(fix2,tx);
548 fiy2 = _mm256_add_pd(fiy2,ty);
549 fiz2 = _mm256_add_pd(fiz2,tz);
551 fjx1 = _mm256_add_pd(fjx1,tx);
552 fjy1 = _mm256_add_pd(fjy1,ty);
553 fjz1 = _mm256_add_pd(fjz1,tz);
555 /**************************
556 * CALCULATE INTERACTIONS *
557 **************************/
559 r22 = _mm256_mul_pd(rsq22,rinv22);
561 /* EWALD ELECTROSTATICS */
563 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
564 ewrt = _mm256_mul_pd(r22,ewtabscale);
565 ewitab = _mm256_cvttpd_epi32(ewrt);
566 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
567 ewitab = _mm_slli_epi32(ewitab,2);
568 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
569 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
570 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
571 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
572 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
573 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
574 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
575 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
576 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
578 /* Update potential sum for this i atom from the interaction with this j atom. */
579 velecsum = _mm256_add_pd(velecsum,velec);
581 fscal = felec;
583 /* Calculate temporary vectorial force */
584 tx = _mm256_mul_pd(fscal,dx22);
585 ty = _mm256_mul_pd(fscal,dy22);
586 tz = _mm256_mul_pd(fscal,dz22);
588 /* Update vectorial force */
589 fix2 = _mm256_add_pd(fix2,tx);
590 fiy2 = _mm256_add_pd(fiy2,ty);
591 fiz2 = _mm256_add_pd(fiz2,tz);
593 fjx2 = _mm256_add_pd(fjx2,tx);
594 fjy2 = _mm256_add_pd(fjy2,ty);
595 fjz2 = _mm256_add_pd(fjz2,tz);
597 /**************************
598 * CALCULATE INTERACTIONS *
599 **************************/
601 r23 = _mm256_mul_pd(rsq23,rinv23);
603 /* EWALD ELECTROSTATICS */
605 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
606 ewrt = _mm256_mul_pd(r23,ewtabscale);
607 ewitab = _mm256_cvttpd_epi32(ewrt);
608 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
609 ewitab = _mm_slli_epi32(ewitab,2);
610 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
611 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
612 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
613 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
614 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
615 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
616 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
617 velec = _mm256_mul_pd(qq23,_mm256_sub_pd(rinv23,velec));
618 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
620 /* Update potential sum for this i atom from the interaction with this j atom. */
621 velecsum = _mm256_add_pd(velecsum,velec);
623 fscal = felec;
625 /* Calculate temporary vectorial force */
626 tx = _mm256_mul_pd(fscal,dx23);
627 ty = _mm256_mul_pd(fscal,dy23);
628 tz = _mm256_mul_pd(fscal,dz23);
630 /* Update vectorial force */
631 fix2 = _mm256_add_pd(fix2,tx);
632 fiy2 = _mm256_add_pd(fiy2,ty);
633 fiz2 = _mm256_add_pd(fiz2,tz);
635 fjx3 = _mm256_add_pd(fjx3,tx);
636 fjy3 = _mm256_add_pd(fjy3,ty);
637 fjz3 = _mm256_add_pd(fjz3,tz);
639 /**************************
640 * CALCULATE INTERACTIONS *
641 **************************/
643 r31 = _mm256_mul_pd(rsq31,rinv31);
645 /* EWALD ELECTROSTATICS */
647 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
648 ewrt = _mm256_mul_pd(r31,ewtabscale);
649 ewitab = _mm256_cvttpd_epi32(ewrt);
650 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
651 ewitab = _mm_slli_epi32(ewitab,2);
652 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
653 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
654 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
655 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
656 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
657 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
658 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
659 velec = _mm256_mul_pd(qq31,_mm256_sub_pd(rinv31,velec));
660 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
662 /* Update potential sum for this i atom from the interaction with this j atom. */
663 velecsum = _mm256_add_pd(velecsum,velec);
665 fscal = felec;
667 /* Calculate temporary vectorial force */
668 tx = _mm256_mul_pd(fscal,dx31);
669 ty = _mm256_mul_pd(fscal,dy31);
670 tz = _mm256_mul_pd(fscal,dz31);
672 /* Update vectorial force */
673 fix3 = _mm256_add_pd(fix3,tx);
674 fiy3 = _mm256_add_pd(fiy3,ty);
675 fiz3 = _mm256_add_pd(fiz3,tz);
677 fjx1 = _mm256_add_pd(fjx1,tx);
678 fjy1 = _mm256_add_pd(fjy1,ty);
679 fjz1 = _mm256_add_pd(fjz1,tz);
681 /**************************
682 * CALCULATE INTERACTIONS *
683 **************************/
685 r32 = _mm256_mul_pd(rsq32,rinv32);
687 /* EWALD ELECTROSTATICS */
689 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
690 ewrt = _mm256_mul_pd(r32,ewtabscale);
691 ewitab = _mm256_cvttpd_epi32(ewrt);
692 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
693 ewitab = _mm_slli_epi32(ewitab,2);
694 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
695 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
696 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
697 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
698 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
699 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
700 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
701 velec = _mm256_mul_pd(qq32,_mm256_sub_pd(rinv32,velec));
702 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
704 /* Update potential sum for this i atom from the interaction with this j atom. */
705 velecsum = _mm256_add_pd(velecsum,velec);
707 fscal = felec;
709 /* Calculate temporary vectorial force */
710 tx = _mm256_mul_pd(fscal,dx32);
711 ty = _mm256_mul_pd(fscal,dy32);
712 tz = _mm256_mul_pd(fscal,dz32);
714 /* Update vectorial force */
715 fix3 = _mm256_add_pd(fix3,tx);
716 fiy3 = _mm256_add_pd(fiy3,ty);
717 fiz3 = _mm256_add_pd(fiz3,tz);
719 fjx2 = _mm256_add_pd(fjx2,tx);
720 fjy2 = _mm256_add_pd(fjy2,ty);
721 fjz2 = _mm256_add_pd(fjz2,tz);
723 /**************************
724 * CALCULATE INTERACTIONS *
725 **************************/
727 r33 = _mm256_mul_pd(rsq33,rinv33);
729 /* EWALD ELECTROSTATICS */
731 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
732 ewrt = _mm256_mul_pd(r33,ewtabscale);
733 ewitab = _mm256_cvttpd_epi32(ewrt);
734 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
735 ewitab = _mm_slli_epi32(ewitab,2);
736 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
737 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
738 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
739 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
740 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
741 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
742 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
743 velec = _mm256_mul_pd(qq33,_mm256_sub_pd(rinv33,velec));
744 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
746 /* Update potential sum for this i atom from the interaction with this j atom. */
747 velecsum = _mm256_add_pd(velecsum,velec);
749 fscal = felec;
751 /* Calculate temporary vectorial force */
752 tx = _mm256_mul_pd(fscal,dx33);
753 ty = _mm256_mul_pd(fscal,dy33);
754 tz = _mm256_mul_pd(fscal,dz33);
756 /* Update vectorial force */
757 fix3 = _mm256_add_pd(fix3,tx);
758 fiy3 = _mm256_add_pd(fiy3,ty);
759 fiz3 = _mm256_add_pd(fiz3,tz);
761 fjx3 = _mm256_add_pd(fjx3,tx);
762 fjy3 = _mm256_add_pd(fjy3,ty);
763 fjz3 = _mm256_add_pd(fjz3,tz);
765 fjptrA = f+j_coord_offsetA;
766 fjptrB = f+j_coord_offsetB;
767 fjptrC = f+j_coord_offsetC;
768 fjptrD = f+j_coord_offsetD;
770 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
771 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
772 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
774 /* Inner loop uses 426 flops */
777 if(jidx<j_index_end)
780 /* Get j neighbor index, and coordinate index */
781 jnrlistA = jjnr[jidx];
782 jnrlistB = jjnr[jidx+1];
783 jnrlistC = jjnr[jidx+2];
784 jnrlistD = jjnr[jidx+3];
785 /* Sign of each element will be negative for non-real atoms.
786 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
787 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
789 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
791 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
792 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
793 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
795 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
796 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
797 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
798 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
799 j_coord_offsetA = DIM*jnrA;
800 j_coord_offsetB = DIM*jnrB;
801 j_coord_offsetC = DIM*jnrC;
802 j_coord_offsetD = DIM*jnrD;
804 /* load j atom coordinates */
805 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
806 x+j_coord_offsetC,x+j_coord_offsetD,
807 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
808 &jy2,&jz2,&jx3,&jy3,&jz3);
810 /* Calculate displacement vector */
811 dx00 = _mm256_sub_pd(ix0,jx0);
812 dy00 = _mm256_sub_pd(iy0,jy0);
813 dz00 = _mm256_sub_pd(iz0,jz0);
814 dx11 = _mm256_sub_pd(ix1,jx1);
815 dy11 = _mm256_sub_pd(iy1,jy1);
816 dz11 = _mm256_sub_pd(iz1,jz1);
817 dx12 = _mm256_sub_pd(ix1,jx2);
818 dy12 = _mm256_sub_pd(iy1,jy2);
819 dz12 = _mm256_sub_pd(iz1,jz2);
820 dx13 = _mm256_sub_pd(ix1,jx3);
821 dy13 = _mm256_sub_pd(iy1,jy3);
822 dz13 = _mm256_sub_pd(iz1,jz3);
823 dx21 = _mm256_sub_pd(ix2,jx1);
824 dy21 = _mm256_sub_pd(iy2,jy1);
825 dz21 = _mm256_sub_pd(iz2,jz1);
826 dx22 = _mm256_sub_pd(ix2,jx2);
827 dy22 = _mm256_sub_pd(iy2,jy2);
828 dz22 = _mm256_sub_pd(iz2,jz2);
829 dx23 = _mm256_sub_pd(ix2,jx3);
830 dy23 = _mm256_sub_pd(iy2,jy3);
831 dz23 = _mm256_sub_pd(iz2,jz3);
832 dx31 = _mm256_sub_pd(ix3,jx1);
833 dy31 = _mm256_sub_pd(iy3,jy1);
834 dz31 = _mm256_sub_pd(iz3,jz1);
835 dx32 = _mm256_sub_pd(ix3,jx2);
836 dy32 = _mm256_sub_pd(iy3,jy2);
837 dz32 = _mm256_sub_pd(iz3,jz2);
838 dx33 = _mm256_sub_pd(ix3,jx3);
839 dy33 = _mm256_sub_pd(iy3,jy3);
840 dz33 = _mm256_sub_pd(iz3,jz3);
842 /* Calculate squared distance and things based on it */
843 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
844 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
845 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
846 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
847 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
848 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
849 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
850 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
851 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
852 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
854 rinv00 = avx256_invsqrt_d(rsq00);
855 rinv11 = avx256_invsqrt_d(rsq11);
856 rinv12 = avx256_invsqrt_d(rsq12);
857 rinv13 = avx256_invsqrt_d(rsq13);
858 rinv21 = avx256_invsqrt_d(rsq21);
859 rinv22 = avx256_invsqrt_d(rsq22);
860 rinv23 = avx256_invsqrt_d(rsq23);
861 rinv31 = avx256_invsqrt_d(rsq31);
862 rinv32 = avx256_invsqrt_d(rsq32);
863 rinv33 = avx256_invsqrt_d(rsq33);
865 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
866 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
867 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
868 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
869 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
870 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
871 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
872 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
873 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
874 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
876 fjx0 = _mm256_setzero_pd();
877 fjy0 = _mm256_setzero_pd();
878 fjz0 = _mm256_setzero_pd();
879 fjx1 = _mm256_setzero_pd();
880 fjy1 = _mm256_setzero_pd();
881 fjz1 = _mm256_setzero_pd();
882 fjx2 = _mm256_setzero_pd();
883 fjy2 = _mm256_setzero_pd();
884 fjz2 = _mm256_setzero_pd();
885 fjx3 = _mm256_setzero_pd();
886 fjy3 = _mm256_setzero_pd();
887 fjz3 = _mm256_setzero_pd();
889 /**************************
890 * CALCULATE INTERACTIONS *
891 **************************/
893 r00 = _mm256_mul_pd(rsq00,rinv00);
894 r00 = _mm256_andnot_pd(dummy_mask,r00);
896 /* Analytical LJ-PME */
897 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
898 ewcljrsq = _mm256_mul_pd(ewclj2,rsq00);
899 ewclj6 = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
900 exponent = avx256_exp_d(ewcljrsq);
901 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
902 poly = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
903 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
904 vvdw6 = _mm256_mul_pd(_mm256_sub_pd(c6_00,_mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly))),rinvsix);
905 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
906 vvdw = _mm256_sub_pd(_mm256_mul_pd(vvdw12,one_twelfth),_mm256_mul_pd(vvdw6,one_sixth));
907 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
908 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,_mm256_sub_pd(vvdw6,_mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6)))),rinvsq00);
910 /* Update potential sum for this i atom from the interaction with this j atom. */
911 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
912 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
914 fscal = fvdw;
916 fscal = _mm256_andnot_pd(dummy_mask,fscal);
918 /* Calculate temporary vectorial force */
919 tx = _mm256_mul_pd(fscal,dx00);
920 ty = _mm256_mul_pd(fscal,dy00);
921 tz = _mm256_mul_pd(fscal,dz00);
923 /* Update vectorial force */
924 fix0 = _mm256_add_pd(fix0,tx);
925 fiy0 = _mm256_add_pd(fiy0,ty);
926 fiz0 = _mm256_add_pd(fiz0,tz);
928 fjx0 = _mm256_add_pd(fjx0,tx);
929 fjy0 = _mm256_add_pd(fjy0,ty);
930 fjz0 = _mm256_add_pd(fjz0,tz);
932 /**************************
933 * CALCULATE INTERACTIONS *
934 **************************/
936 r11 = _mm256_mul_pd(rsq11,rinv11);
937 r11 = _mm256_andnot_pd(dummy_mask,r11);
939 /* EWALD ELECTROSTATICS */
941 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
942 ewrt = _mm256_mul_pd(r11,ewtabscale);
943 ewitab = _mm256_cvttpd_epi32(ewrt);
944 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
945 ewitab = _mm_slli_epi32(ewitab,2);
946 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
947 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
948 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
949 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
950 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
951 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
952 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
953 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
954 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
956 /* Update potential sum for this i atom from the interaction with this j atom. */
957 velec = _mm256_andnot_pd(dummy_mask,velec);
958 velecsum = _mm256_add_pd(velecsum,velec);
960 fscal = felec;
962 fscal = _mm256_andnot_pd(dummy_mask,fscal);
964 /* Calculate temporary vectorial force */
965 tx = _mm256_mul_pd(fscal,dx11);
966 ty = _mm256_mul_pd(fscal,dy11);
967 tz = _mm256_mul_pd(fscal,dz11);
969 /* Update vectorial force */
970 fix1 = _mm256_add_pd(fix1,tx);
971 fiy1 = _mm256_add_pd(fiy1,ty);
972 fiz1 = _mm256_add_pd(fiz1,tz);
974 fjx1 = _mm256_add_pd(fjx1,tx);
975 fjy1 = _mm256_add_pd(fjy1,ty);
976 fjz1 = _mm256_add_pd(fjz1,tz);
978 /**************************
979 * CALCULATE INTERACTIONS *
980 **************************/
982 r12 = _mm256_mul_pd(rsq12,rinv12);
983 r12 = _mm256_andnot_pd(dummy_mask,r12);
985 /* EWALD ELECTROSTATICS */
987 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
988 ewrt = _mm256_mul_pd(r12,ewtabscale);
989 ewitab = _mm256_cvttpd_epi32(ewrt);
990 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
991 ewitab = _mm_slli_epi32(ewitab,2);
992 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
993 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
994 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
995 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
996 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
997 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
998 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
999 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
1000 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1002 /* Update potential sum for this i atom from the interaction with this j atom. */
1003 velec = _mm256_andnot_pd(dummy_mask,velec);
1004 velecsum = _mm256_add_pd(velecsum,velec);
1006 fscal = felec;
1008 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1010 /* Calculate temporary vectorial force */
1011 tx = _mm256_mul_pd(fscal,dx12);
1012 ty = _mm256_mul_pd(fscal,dy12);
1013 tz = _mm256_mul_pd(fscal,dz12);
1015 /* Update vectorial force */
1016 fix1 = _mm256_add_pd(fix1,tx);
1017 fiy1 = _mm256_add_pd(fiy1,ty);
1018 fiz1 = _mm256_add_pd(fiz1,tz);
1020 fjx2 = _mm256_add_pd(fjx2,tx);
1021 fjy2 = _mm256_add_pd(fjy2,ty);
1022 fjz2 = _mm256_add_pd(fjz2,tz);
1024 /**************************
1025 * CALCULATE INTERACTIONS *
1026 **************************/
1028 r13 = _mm256_mul_pd(rsq13,rinv13);
1029 r13 = _mm256_andnot_pd(dummy_mask,r13);
1031 /* EWALD ELECTROSTATICS */
1033 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1034 ewrt = _mm256_mul_pd(r13,ewtabscale);
1035 ewitab = _mm256_cvttpd_epi32(ewrt);
1036 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1037 ewitab = _mm_slli_epi32(ewitab,2);
1038 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1039 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1040 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1041 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1042 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1043 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1044 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1045 velec = _mm256_mul_pd(qq13,_mm256_sub_pd(rinv13,velec));
1046 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
1048 /* Update potential sum for this i atom from the interaction with this j atom. */
1049 velec = _mm256_andnot_pd(dummy_mask,velec);
1050 velecsum = _mm256_add_pd(velecsum,velec);
1052 fscal = felec;
1054 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1056 /* Calculate temporary vectorial force */
1057 tx = _mm256_mul_pd(fscal,dx13);
1058 ty = _mm256_mul_pd(fscal,dy13);
1059 tz = _mm256_mul_pd(fscal,dz13);
1061 /* Update vectorial force */
1062 fix1 = _mm256_add_pd(fix1,tx);
1063 fiy1 = _mm256_add_pd(fiy1,ty);
1064 fiz1 = _mm256_add_pd(fiz1,tz);
1066 fjx3 = _mm256_add_pd(fjx3,tx);
1067 fjy3 = _mm256_add_pd(fjy3,ty);
1068 fjz3 = _mm256_add_pd(fjz3,tz);
1070 /**************************
1071 * CALCULATE INTERACTIONS *
1072 **************************/
1074 r21 = _mm256_mul_pd(rsq21,rinv21);
1075 r21 = _mm256_andnot_pd(dummy_mask,r21);
1077 /* EWALD ELECTROSTATICS */
1079 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1080 ewrt = _mm256_mul_pd(r21,ewtabscale);
1081 ewitab = _mm256_cvttpd_epi32(ewrt);
1082 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1083 ewitab = _mm_slli_epi32(ewitab,2);
1084 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1085 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1086 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1087 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1088 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1089 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1090 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1091 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
1092 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1094 /* Update potential sum for this i atom from the interaction with this j atom. */
1095 velec = _mm256_andnot_pd(dummy_mask,velec);
1096 velecsum = _mm256_add_pd(velecsum,velec);
1098 fscal = felec;
1100 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1102 /* Calculate temporary vectorial force */
1103 tx = _mm256_mul_pd(fscal,dx21);
1104 ty = _mm256_mul_pd(fscal,dy21);
1105 tz = _mm256_mul_pd(fscal,dz21);
1107 /* Update vectorial force */
1108 fix2 = _mm256_add_pd(fix2,tx);
1109 fiy2 = _mm256_add_pd(fiy2,ty);
1110 fiz2 = _mm256_add_pd(fiz2,tz);
1112 fjx1 = _mm256_add_pd(fjx1,tx);
1113 fjy1 = _mm256_add_pd(fjy1,ty);
1114 fjz1 = _mm256_add_pd(fjz1,tz);
1116 /**************************
1117 * CALCULATE INTERACTIONS *
1118 **************************/
1120 r22 = _mm256_mul_pd(rsq22,rinv22);
1121 r22 = _mm256_andnot_pd(dummy_mask,r22);
1123 /* EWALD ELECTROSTATICS */
1125 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1126 ewrt = _mm256_mul_pd(r22,ewtabscale);
1127 ewitab = _mm256_cvttpd_epi32(ewrt);
1128 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1129 ewitab = _mm_slli_epi32(ewitab,2);
1130 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1131 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1132 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1133 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1134 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1135 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1136 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1137 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
1138 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1140 /* Update potential sum for this i atom from the interaction with this j atom. */
1141 velec = _mm256_andnot_pd(dummy_mask,velec);
1142 velecsum = _mm256_add_pd(velecsum,velec);
1144 fscal = felec;
1146 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1148 /* Calculate temporary vectorial force */
1149 tx = _mm256_mul_pd(fscal,dx22);
1150 ty = _mm256_mul_pd(fscal,dy22);
1151 tz = _mm256_mul_pd(fscal,dz22);
1153 /* Update vectorial force */
1154 fix2 = _mm256_add_pd(fix2,tx);
1155 fiy2 = _mm256_add_pd(fiy2,ty);
1156 fiz2 = _mm256_add_pd(fiz2,tz);
1158 fjx2 = _mm256_add_pd(fjx2,tx);
1159 fjy2 = _mm256_add_pd(fjy2,ty);
1160 fjz2 = _mm256_add_pd(fjz2,tz);
1162 /**************************
1163 * CALCULATE INTERACTIONS *
1164 **************************/
1166 r23 = _mm256_mul_pd(rsq23,rinv23);
1167 r23 = _mm256_andnot_pd(dummy_mask,r23);
1169 /* EWALD ELECTROSTATICS */
1171 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1172 ewrt = _mm256_mul_pd(r23,ewtabscale);
1173 ewitab = _mm256_cvttpd_epi32(ewrt);
1174 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1175 ewitab = _mm_slli_epi32(ewitab,2);
1176 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1177 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1178 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1179 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1180 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1181 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1182 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1183 velec = _mm256_mul_pd(qq23,_mm256_sub_pd(rinv23,velec));
1184 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
1186 /* Update potential sum for this i atom from the interaction with this j atom. */
1187 velec = _mm256_andnot_pd(dummy_mask,velec);
1188 velecsum = _mm256_add_pd(velecsum,velec);
1190 fscal = felec;
1192 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1194 /* Calculate temporary vectorial force */
1195 tx = _mm256_mul_pd(fscal,dx23);
1196 ty = _mm256_mul_pd(fscal,dy23);
1197 tz = _mm256_mul_pd(fscal,dz23);
1199 /* Update vectorial force */
1200 fix2 = _mm256_add_pd(fix2,tx);
1201 fiy2 = _mm256_add_pd(fiy2,ty);
1202 fiz2 = _mm256_add_pd(fiz2,tz);
1204 fjx3 = _mm256_add_pd(fjx3,tx);
1205 fjy3 = _mm256_add_pd(fjy3,ty);
1206 fjz3 = _mm256_add_pd(fjz3,tz);
1208 /**************************
1209 * CALCULATE INTERACTIONS *
1210 **************************/
1212 r31 = _mm256_mul_pd(rsq31,rinv31);
1213 r31 = _mm256_andnot_pd(dummy_mask,r31);
1215 /* EWALD ELECTROSTATICS */
1217 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1218 ewrt = _mm256_mul_pd(r31,ewtabscale);
1219 ewitab = _mm256_cvttpd_epi32(ewrt);
1220 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1221 ewitab = _mm_slli_epi32(ewitab,2);
1222 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1223 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1224 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1225 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1226 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1227 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1228 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1229 velec = _mm256_mul_pd(qq31,_mm256_sub_pd(rinv31,velec));
1230 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
1232 /* Update potential sum for this i atom from the interaction with this j atom. */
1233 velec = _mm256_andnot_pd(dummy_mask,velec);
1234 velecsum = _mm256_add_pd(velecsum,velec);
1236 fscal = felec;
1238 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1240 /* Calculate temporary vectorial force */
1241 tx = _mm256_mul_pd(fscal,dx31);
1242 ty = _mm256_mul_pd(fscal,dy31);
1243 tz = _mm256_mul_pd(fscal,dz31);
1245 /* Update vectorial force */
1246 fix3 = _mm256_add_pd(fix3,tx);
1247 fiy3 = _mm256_add_pd(fiy3,ty);
1248 fiz3 = _mm256_add_pd(fiz3,tz);
1250 fjx1 = _mm256_add_pd(fjx1,tx);
1251 fjy1 = _mm256_add_pd(fjy1,ty);
1252 fjz1 = _mm256_add_pd(fjz1,tz);
1254 /**************************
1255 * CALCULATE INTERACTIONS *
1256 **************************/
1258 r32 = _mm256_mul_pd(rsq32,rinv32);
1259 r32 = _mm256_andnot_pd(dummy_mask,r32);
1261 /* EWALD ELECTROSTATICS */
1263 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1264 ewrt = _mm256_mul_pd(r32,ewtabscale);
1265 ewitab = _mm256_cvttpd_epi32(ewrt);
1266 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1267 ewitab = _mm_slli_epi32(ewitab,2);
1268 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1269 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1270 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1271 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1272 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1273 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1274 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1275 velec = _mm256_mul_pd(qq32,_mm256_sub_pd(rinv32,velec));
1276 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
1278 /* Update potential sum for this i atom from the interaction with this j atom. */
1279 velec = _mm256_andnot_pd(dummy_mask,velec);
1280 velecsum = _mm256_add_pd(velecsum,velec);
1282 fscal = felec;
1284 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1286 /* Calculate temporary vectorial force */
1287 tx = _mm256_mul_pd(fscal,dx32);
1288 ty = _mm256_mul_pd(fscal,dy32);
1289 tz = _mm256_mul_pd(fscal,dz32);
1291 /* Update vectorial force */
1292 fix3 = _mm256_add_pd(fix3,tx);
1293 fiy3 = _mm256_add_pd(fiy3,ty);
1294 fiz3 = _mm256_add_pd(fiz3,tz);
1296 fjx2 = _mm256_add_pd(fjx2,tx);
1297 fjy2 = _mm256_add_pd(fjy2,ty);
1298 fjz2 = _mm256_add_pd(fjz2,tz);
1300 /**************************
1301 * CALCULATE INTERACTIONS *
1302 **************************/
1304 r33 = _mm256_mul_pd(rsq33,rinv33);
1305 r33 = _mm256_andnot_pd(dummy_mask,r33);
1307 /* EWALD ELECTROSTATICS */
1309 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1310 ewrt = _mm256_mul_pd(r33,ewtabscale);
1311 ewitab = _mm256_cvttpd_epi32(ewrt);
1312 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1313 ewitab = _mm_slli_epi32(ewitab,2);
1314 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1315 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1316 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1317 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1318 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1319 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1320 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1321 velec = _mm256_mul_pd(qq33,_mm256_sub_pd(rinv33,velec));
1322 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
1324 /* Update potential sum for this i atom from the interaction with this j atom. */
1325 velec = _mm256_andnot_pd(dummy_mask,velec);
1326 velecsum = _mm256_add_pd(velecsum,velec);
1328 fscal = felec;
1330 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1332 /* Calculate temporary vectorial force */
1333 tx = _mm256_mul_pd(fscal,dx33);
1334 ty = _mm256_mul_pd(fscal,dy33);
1335 tz = _mm256_mul_pd(fscal,dz33);
1337 /* Update vectorial force */
1338 fix3 = _mm256_add_pd(fix3,tx);
1339 fiy3 = _mm256_add_pd(fiy3,ty);
1340 fiz3 = _mm256_add_pd(fiz3,tz);
1342 fjx3 = _mm256_add_pd(fjx3,tx);
1343 fjy3 = _mm256_add_pd(fjy3,ty);
1344 fjz3 = _mm256_add_pd(fjz3,tz);
1346 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1347 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1348 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1349 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1351 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1352 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1353 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1355 /* Inner loop uses 436 flops */
1358 /* End of innermost loop */
1360 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1361 f+i_coord_offset,fshift+i_shift_offset);
1363 ggid = gid[iidx];
1364 /* Update potential energies */
1365 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1366 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1368 /* Increment number of inner iterations */
1369 inneriter += j_index_end - j_index_start;
1371 /* Outer loop uses 26 flops */
1374 /* Increment number of outer iterations */
1375 outeriter += nri;
1377 /* Update outer/inner flops */
1379 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*436);
1382 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_avx_256_double
1383 * Electrostatics interaction: Ewald
1384 * VdW interaction: LJEwald
1385 * Geometry: Water4-Water4
1386 * Calculate force/pot: Force
1388 void
1389 nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_avx_256_double
1390 (t_nblist * gmx_restrict nlist,
1391 rvec * gmx_restrict xx,
1392 rvec * gmx_restrict ff,
1393 struct t_forcerec * gmx_restrict fr,
1394 t_mdatoms * gmx_restrict mdatoms,
1395 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1396 t_nrnb * gmx_restrict nrnb)
1398 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1399 * just 0 for non-waters.
1400 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1401 * jnr indices corresponding to data put in the four positions in the SIMD register.
1403 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1404 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1405 int jnrA,jnrB,jnrC,jnrD;
1406 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1407 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1408 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1409 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1410 real rcutoff_scalar;
1411 real *shiftvec,*fshift,*x,*f;
1412 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1413 real scratch[4*DIM];
1414 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1415 real * vdwioffsetptr0;
1416 real * vdwgridioffsetptr0;
1417 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1418 real * vdwioffsetptr1;
1419 real * vdwgridioffsetptr1;
1420 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1421 real * vdwioffsetptr2;
1422 real * vdwgridioffsetptr2;
1423 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1424 real * vdwioffsetptr3;
1425 real * vdwgridioffsetptr3;
1426 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1427 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1428 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1429 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1430 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1431 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1432 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1433 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1434 __m256d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1435 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1436 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1437 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1438 __m256d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1439 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1440 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1441 __m256d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1442 __m256d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1443 __m256d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1444 __m256d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1445 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1446 real *charge;
1447 int nvdwtype;
1448 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1449 int *vdwtype;
1450 real *vdwparam;
1451 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
1452 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
1453 __m256d c6grid_00;
1454 __m256d c6grid_11;
1455 __m256d c6grid_12;
1456 __m256d c6grid_13;
1457 __m256d c6grid_21;
1458 __m256d c6grid_22;
1459 __m256d c6grid_23;
1460 __m256d c6grid_31;
1461 __m256d c6grid_32;
1462 __m256d c6grid_33;
1463 real *vdwgridparam;
1464 __m256d ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
1465 __m256d one_half = _mm256_set1_pd(0.5);
1466 __m256d minus_one = _mm256_set1_pd(-1.0);
1467 __m128i ewitab;
1468 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1469 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1470 real *ewtab;
1471 __m256d dummy_mask,cutoff_mask;
1472 __m128 tmpmask0,tmpmask1;
1473 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1474 __m256d one = _mm256_set1_pd(1.0);
1475 __m256d two = _mm256_set1_pd(2.0);
1476 x = xx[0];
1477 f = ff[0];
1479 nri = nlist->nri;
1480 iinr = nlist->iinr;
1481 jindex = nlist->jindex;
1482 jjnr = nlist->jjnr;
1483 shiftidx = nlist->shift;
1484 gid = nlist->gid;
1485 shiftvec = fr->shift_vec[0];
1486 fshift = fr->fshift[0];
1487 facel = _mm256_set1_pd(fr->ic->epsfac);
1488 charge = mdatoms->chargeA;
1489 nvdwtype = fr->ntype;
1490 vdwparam = fr->nbfp;
1491 vdwtype = mdatoms->typeA;
1492 vdwgridparam = fr->ljpme_c6grid;
1493 sh_lj_ewald = _mm256_set1_pd(fr->ic->sh_lj_ewald);
1494 ewclj = _mm256_set1_pd(fr->ic->ewaldcoeff_lj);
1495 ewclj2 = _mm256_mul_pd(minus_one,_mm256_mul_pd(ewclj,ewclj));
1497 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
1498 beta = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
1499 beta2 = _mm256_mul_pd(beta,beta);
1500 beta3 = _mm256_mul_pd(beta,beta2);
1502 ewtab = fr->ic->tabq_coul_F;
1503 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
1504 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
1506 /* Setup water-specific parameters */
1507 inr = nlist->iinr[0];
1508 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1509 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1510 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
1511 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1512 vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
1514 jq1 = _mm256_set1_pd(charge[inr+1]);
1515 jq2 = _mm256_set1_pd(charge[inr+2]);
1516 jq3 = _mm256_set1_pd(charge[inr+3]);
1517 vdwjidx0A = 2*vdwtype[inr+0];
1518 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1519 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1520 c6grid_00 = _mm256_set1_pd(vdwgridioffsetptr0[vdwjidx0A]);
1521 qq11 = _mm256_mul_pd(iq1,jq1);
1522 qq12 = _mm256_mul_pd(iq1,jq2);
1523 qq13 = _mm256_mul_pd(iq1,jq3);
1524 qq21 = _mm256_mul_pd(iq2,jq1);
1525 qq22 = _mm256_mul_pd(iq2,jq2);
1526 qq23 = _mm256_mul_pd(iq2,jq3);
1527 qq31 = _mm256_mul_pd(iq3,jq1);
1528 qq32 = _mm256_mul_pd(iq3,jq2);
1529 qq33 = _mm256_mul_pd(iq3,jq3);
1531 /* Avoid stupid compiler warnings */
1532 jnrA = jnrB = jnrC = jnrD = 0;
1533 j_coord_offsetA = 0;
1534 j_coord_offsetB = 0;
1535 j_coord_offsetC = 0;
1536 j_coord_offsetD = 0;
1538 outeriter = 0;
1539 inneriter = 0;
1541 for(iidx=0;iidx<4*DIM;iidx++)
1543 scratch[iidx] = 0.0;
1546 /* Start outer loop over neighborlists */
1547 for(iidx=0; iidx<nri; iidx++)
1549 /* Load shift vector for this list */
1550 i_shift_offset = DIM*shiftidx[iidx];
1552 /* Load limits for loop over neighbors */
1553 j_index_start = jindex[iidx];
1554 j_index_end = jindex[iidx+1];
1556 /* Get outer coordinate index */
1557 inr = iinr[iidx];
1558 i_coord_offset = DIM*inr;
1560 /* Load i particle coords and add shift vector */
1561 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1562 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1564 fix0 = _mm256_setzero_pd();
1565 fiy0 = _mm256_setzero_pd();
1566 fiz0 = _mm256_setzero_pd();
1567 fix1 = _mm256_setzero_pd();
1568 fiy1 = _mm256_setzero_pd();
1569 fiz1 = _mm256_setzero_pd();
1570 fix2 = _mm256_setzero_pd();
1571 fiy2 = _mm256_setzero_pd();
1572 fiz2 = _mm256_setzero_pd();
1573 fix3 = _mm256_setzero_pd();
1574 fiy3 = _mm256_setzero_pd();
1575 fiz3 = _mm256_setzero_pd();
1577 /* Start inner kernel loop */
1578 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1581 /* Get j neighbor index, and coordinate index */
1582 jnrA = jjnr[jidx];
1583 jnrB = jjnr[jidx+1];
1584 jnrC = jjnr[jidx+2];
1585 jnrD = jjnr[jidx+3];
1586 j_coord_offsetA = DIM*jnrA;
1587 j_coord_offsetB = DIM*jnrB;
1588 j_coord_offsetC = DIM*jnrC;
1589 j_coord_offsetD = DIM*jnrD;
1591 /* load j atom coordinates */
1592 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1593 x+j_coord_offsetC,x+j_coord_offsetD,
1594 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1595 &jy2,&jz2,&jx3,&jy3,&jz3);
1597 /* Calculate displacement vector */
1598 dx00 = _mm256_sub_pd(ix0,jx0);
1599 dy00 = _mm256_sub_pd(iy0,jy0);
1600 dz00 = _mm256_sub_pd(iz0,jz0);
1601 dx11 = _mm256_sub_pd(ix1,jx1);
1602 dy11 = _mm256_sub_pd(iy1,jy1);
1603 dz11 = _mm256_sub_pd(iz1,jz1);
1604 dx12 = _mm256_sub_pd(ix1,jx2);
1605 dy12 = _mm256_sub_pd(iy1,jy2);
1606 dz12 = _mm256_sub_pd(iz1,jz2);
1607 dx13 = _mm256_sub_pd(ix1,jx3);
1608 dy13 = _mm256_sub_pd(iy1,jy3);
1609 dz13 = _mm256_sub_pd(iz1,jz3);
1610 dx21 = _mm256_sub_pd(ix2,jx1);
1611 dy21 = _mm256_sub_pd(iy2,jy1);
1612 dz21 = _mm256_sub_pd(iz2,jz1);
1613 dx22 = _mm256_sub_pd(ix2,jx2);
1614 dy22 = _mm256_sub_pd(iy2,jy2);
1615 dz22 = _mm256_sub_pd(iz2,jz2);
1616 dx23 = _mm256_sub_pd(ix2,jx3);
1617 dy23 = _mm256_sub_pd(iy2,jy3);
1618 dz23 = _mm256_sub_pd(iz2,jz3);
1619 dx31 = _mm256_sub_pd(ix3,jx1);
1620 dy31 = _mm256_sub_pd(iy3,jy1);
1621 dz31 = _mm256_sub_pd(iz3,jz1);
1622 dx32 = _mm256_sub_pd(ix3,jx2);
1623 dy32 = _mm256_sub_pd(iy3,jy2);
1624 dz32 = _mm256_sub_pd(iz3,jz2);
1625 dx33 = _mm256_sub_pd(ix3,jx3);
1626 dy33 = _mm256_sub_pd(iy3,jy3);
1627 dz33 = _mm256_sub_pd(iz3,jz3);
1629 /* Calculate squared distance and things based on it */
1630 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1631 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1632 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1633 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1634 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1635 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1636 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1637 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1638 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1639 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
1641 rinv00 = avx256_invsqrt_d(rsq00);
1642 rinv11 = avx256_invsqrt_d(rsq11);
1643 rinv12 = avx256_invsqrt_d(rsq12);
1644 rinv13 = avx256_invsqrt_d(rsq13);
1645 rinv21 = avx256_invsqrt_d(rsq21);
1646 rinv22 = avx256_invsqrt_d(rsq22);
1647 rinv23 = avx256_invsqrt_d(rsq23);
1648 rinv31 = avx256_invsqrt_d(rsq31);
1649 rinv32 = avx256_invsqrt_d(rsq32);
1650 rinv33 = avx256_invsqrt_d(rsq33);
1652 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1653 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1654 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1655 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
1656 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1657 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1658 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
1659 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
1660 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
1661 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
1663 fjx0 = _mm256_setzero_pd();
1664 fjy0 = _mm256_setzero_pd();
1665 fjz0 = _mm256_setzero_pd();
1666 fjx1 = _mm256_setzero_pd();
1667 fjy1 = _mm256_setzero_pd();
1668 fjz1 = _mm256_setzero_pd();
1669 fjx2 = _mm256_setzero_pd();
1670 fjy2 = _mm256_setzero_pd();
1671 fjz2 = _mm256_setzero_pd();
1672 fjx3 = _mm256_setzero_pd();
1673 fjy3 = _mm256_setzero_pd();
1674 fjz3 = _mm256_setzero_pd();
1676 /**************************
1677 * CALCULATE INTERACTIONS *
1678 **************************/
1680 r00 = _mm256_mul_pd(rsq00,rinv00);
1682 /* Analytical LJ-PME */
1683 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1684 ewcljrsq = _mm256_mul_pd(ewclj2,rsq00);
1685 ewclj6 = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
1686 exponent = avx256_exp_d(ewcljrsq);
1687 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1688 poly = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
1689 /* f6A = 6 * C6grid * (1 - poly) */
1690 f6A = _mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly));
1691 /* f6B = C6grid * exponent * beta^6 */
1692 f6B = _mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6));
1693 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1694 fvdw = _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),_mm256_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
1696 fscal = fvdw;
1698 /* Calculate temporary vectorial force */
1699 tx = _mm256_mul_pd(fscal,dx00);
1700 ty = _mm256_mul_pd(fscal,dy00);
1701 tz = _mm256_mul_pd(fscal,dz00);
1703 /* Update vectorial force */
1704 fix0 = _mm256_add_pd(fix0,tx);
1705 fiy0 = _mm256_add_pd(fiy0,ty);
1706 fiz0 = _mm256_add_pd(fiz0,tz);
1708 fjx0 = _mm256_add_pd(fjx0,tx);
1709 fjy0 = _mm256_add_pd(fjy0,ty);
1710 fjz0 = _mm256_add_pd(fjz0,tz);
1712 /**************************
1713 * CALCULATE INTERACTIONS *
1714 **************************/
1716 r11 = _mm256_mul_pd(rsq11,rinv11);
1718 /* EWALD ELECTROSTATICS */
1720 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1721 ewrt = _mm256_mul_pd(r11,ewtabscale);
1722 ewitab = _mm256_cvttpd_epi32(ewrt);
1723 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1724 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1725 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1726 &ewtabF,&ewtabFn);
1727 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1728 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1730 fscal = felec;
1732 /* Calculate temporary vectorial force */
1733 tx = _mm256_mul_pd(fscal,dx11);
1734 ty = _mm256_mul_pd(fscal,dy11);
1735 tz = _mm256_mul_pd(fscal,dz11);
1737 /* Update vectorial force */
1738 fix1 = _mm256_add_pd(fix1,tx);
1739 fiy1 = _mm256_add_pd(fiy1,ty);
1740 fiz1 = _mm256_add_pd(fiz1,tz);
1742 fjx1 = _mm256_add_pd(fjx1,tx);
1743 fjy1 = _mm256_add_pd(fjy1,ty);
1744 fjz1 = _mm256_add_pd(fjz1,tz);
1746 /**************************
1747 * CALCULATE INTERACTIONS *
1748 **************************/
1750 r12 = _mm256_mul_pd(rsq12,rinv12);
1752 /* EWALD ELECTROSTATICS */
1754 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1755 ewrt = _mm256_mul_pd(r12,ewtabscale);
1756 ewitab = _mm256_cvttpd_epi32(ewrt);
1757 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1758 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1759 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1760 &ewtabF,&ewtabFn);
1761 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1762 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1764 fscal = felec;
1766 /* Calculate temporary vectorial force */
1767 tx = _mm256_mul_pd(fscal,dx12);
1768 ty = _mm256_mul_pd(fscal,dy12);
1769 tz = _mm256_mul_pd(fscal,dz12);
1771 /* Update vectorial force */
1772 fix1 = _mm256_add_pd(fix1,tx);
1773 fiy1 = _mm256_add_pd(fiy1,ty);
1774 fiz1 = _mm256_add_pd(fiz1,tz);
1776 fjx2 = _mm256_add_pd(fjx2,tx);
1777 fjy2 = _mm256_add_pd(fjy2,ty);
1778 fjz2 = _mm256_add_pd(fjz2,tz);
1780 /**************************
1781 * CALCULATE INTERACTIONS *
1782 **************************/
1784 r13 = _mm256_mul_pd(rsq13,rinv13);
1786 /* EWALD ELECTROSTATICS */
1788 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1789 ewrt = _mm256_mul_pd(r13,ewtabscale);
1790 ewitab = _mm256_cvttpd_epi32(ewrt);
1791 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1792 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1793 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1794 &ewtabF,&ewtabFn);
1795 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1796 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
1798 fscal = felec;
1800 /* Calculate temporary vectorial force */
1801 tx = _mm256_mul_pd(fscal,dx13);
1802 ty = _mm256_mul_pd(fscal,dy13);
1803 tz = _mm256_mul_pd(fscal,dz13);
1805 /* Update vectorial force */
1806 fix1 = _mm256_add_pd(fix1,tx);
1807 fiy1 = _mm256_add_pd(fiy1,ty);
1808 fiz1 = _mm256_add_pd(fiz1,tz);
1810 fjx3 = _mm256_add_pd(fjx3,tx);
1811 fjy3 = _mm256_add_pd(fjy3,ty);
1812 fjz3 = _mm256_add_pd(fjz3,tz);
1814 /**************************
1815 * CALCULATE INTERACTIONS *
1816 **************************/
1818 r21 = _mm256_mul_pd(rsq21,rinv21);
1820 /* EWALD ELECTROSTATICS */
1822 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1823 ewrt = _mm256_mul_pd(r21,ewtabscale);
1824 ewitab = _mm256_cvttpd_epi32(ewrt);
1825 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1826 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1827 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1828 &ewtabF,&ewtabFn);
1829 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1830 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1832 fscal = felec;
1834 /* Calculate temporary vectorial force */
1835 tx = _mm256_mul_pd(fscal,dx21);
1836 ty = _mm256_mul_pd(fscal,dy21);
1837 tz = _mm256_mul_pd(fscal,dz21);
1839 /* Update vectorial force */
1840 fix2 = _mm256_add_pd(fix2,tx);
1841 fiy2 = _mm256_add_pd(fiy2,ty);
1842 fiz2 = _mm256_add_pd(fiz2,tz);
1844 fjx1 = _mm256_add_pd(fjx1,tx);
1845 fjy1 = _mm256_add_pd(fjy1,ty);
1846 fjz1 = _mm256_add_pd(fjz1,tz);
1848 /**************************
1849 * CALCULATE INTERACTIONS *
1850 **************************/
1852 r22 = _mm256_mul_pd(rsq22,rinv22);
1854 /* EWALD ELECTROSTATICS */
1856 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1857 ewrt = _mm256_mul_pd(r22,ewtabscale);
1858 ewitab = _mm256_cvttpd_epi32(ewrt);
1859 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1860 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1861 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1862 &ewtabF,&ewtabFn);
1863 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1864 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1866 fscal = felec;
1868 /* Calculate temporary vectorial force */
1869 tx = _mm256_mul_pd(fscal,dx22);
1870 ty = _mm256_mul_pd(fscal,dy22);
1871 tz = _mm256_mul_pd(fscal,dz22);
1873 /* Update vectorial force */
1874 fix2 = _mm256_add_pd(fix2,tx);
1875 fiy2 = _mm256_add_pd(fiy2,ty);
1876 fiz2 = _mm256_add_pd(fiz2,tz);
1878 fjx2 = _mm256_add_pd(fjx2,tx);
1879 fjy2 = _mm256_add_pd(fjy2,ty);
1880 fjz2 = _mm256_add_pd(fjz2,tz);
1882 /**************************
1883 * CALCULATE INTERACTIONS *
1884 **************************/
1886 r23 = _mm256_mul_pd(rsq23,rinv23);
1888 /* EWALD ELECTROSTATICS */
1890 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1891 ewrt = _mm256_mul_pd(r23,ewtabscale);
1892 ewitab = _mm256_cvttpd_epi32(ewrt);
1893 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1894 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1895 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1896 &ewtabF,&ewtabFn);
1897 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1898 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
1900 fscal = felec;
1902 /* Calculate temporary vectorial force */
1903 tx = _mm256_mul_pd(fscal,dx23);
1904 ty = _mm256_mul_pd(fscal,dy23);
1905 tz = _mm256_mul_pd(fscal,dz23);
1907 /* Update vectorial force */
1908 fix2 = _mm256_add_pd(fix2,tx);
1909 fiy2 = _mm256_add_pd(fiy2,ty);
1910 fiz2 = _mm256_add_pd(fiz2,tz);
1912 fjx3 = _mm256_add_pd(fjx3,tx);
1913 fjy3 = _mm256_add_pd(fjy3,ty);
1914 fjz3 = _mm256_add_pd(fjz3,tz);
1916 /**************************
1917 * CALCULATE INTERACTIONS *
1918 **************************/
1920 r31 = _mm256_mul_pd(rsq31,rinv31);
1922 /* EWALD ELECTROSTATICS */
1924 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1925 ewrt = _mm256_mul_pd(r31,ewtabscale);
1926 ewitab = _mm256_cvttpd_epi32(ewrt);
1927 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1928 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1929 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1930 &ewtabF,&ewtabFn);
1931 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1932 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
1934 fscal = felec;
1936 /* Calculate temporary vectorial force */
1937 tx = _mm256_mul_pd(fscal,dx31);
1938 ty = _mm256_mul_pd(fscal,dy31);
1939 tz = _mm256_mul_pd(fscal,dz31);
1941 /* Update vectorial force */
1942 fix3 = _mm256_add_pd(fix3,tx);
1943 fiy3 = _mm256_add_pd(fiy3,ty);
1944 fiz3 = _mm256_add_pd(fiz3,tz);
1946 fjx1 = _mm256_add_pd(fjx1,tx);
1947 fjy1 = _mm256_add_pd(fjy1,ty);
1948 fjz1 = _mm256_add_pd(fjz1,tz);
1950 /**************************
1951 * CALCULATE INTERACTIONS *
1952 **************************/
1954 r32 = _mm256_mul_pd(rsq32,rinv32);
1956 /* EWALD ELECTROSTATICS */
1958 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1959 ewrt = _mm256_mul_pd(r32,ewtabscale);
1960 ewitab = _mm256_cvttpd_epi32(ewrt);
1961 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1962 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1963 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1964 &ewtabF,&ewtabFn);
1965 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1966 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
1968 fscal = felec;
1970 /* Calculate temporary vectorial force */
1971 tx = _mm256_mul_pd(fscal,dx32);
1972 ty = _mm256_mul_pd(fscal,dy32);
1973 tz = _mm256_mul_pd(fscal,dz32);
1975 /* Update vectorial force */
1976 fix3 = _mm256_add_pd(fix3,tx);
1977 fiy3 = _mm256_add_pd(fiy3,ty);
1978 fiz3 = _mm256_add_pd(fiz3,tz);
1980 fjx2 = _mm256_add_pd(fjx2,tx);
1981 fjy2 = _mm256_add_pd(fjy2,ty);
1982 fjz2 = _mm256_add_pd(fjz2,tz);
1984 /**************************
1985 * CALCULATE INTERACTIONS *
1986 **************************/
1988 r33 = _mm256_mul_pd(rsq33,rinv33);
1990 /* EWALD ELECTROSTATICS */
1992 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1993 ewrt = _mm256_mul_pd(r33,ewtabscale);
1994 ewitab = _mm256_cvttpd_epi32(ewrt);
1995 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1996 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1997 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1998 &ewtabF,&ewtabFn);
1999 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2000 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
2002 fscal = felec;
2004 /* Calculate temporary vectorial force */
2005 tx = _mm256_mul_pd(fscal,dx33);
2006 ty = _mm256_mul_pd(fscal,dy33);
2007 tz = _mm256_mul_pd(fscal,dz33);
2009 /* Update vectorial force */
2010 fix3 = _mm256_add_pd(fix3,tx);
2011 fiy3 = _mm256_add_pd(fiy3,ty);
2012 fiz3 = _mm256_add_pd(fiz3,tz);
2014 fjx3 = _mm256_add_pd(fjx3,tx);
2015 fjy3 = _mm256_add_pd(fjy3,ty);
2016 fjz3 = _mm256_add_pd(fjz3,tz);
2018 fjptrA = f+j_coord_offsetA;
2019 fjptrB = f+j_coord_offsetB;
2020 fjptrC = f+j_coord_offsetC;
2021 fjptrD = f+j_coord_offsetD;
2023 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2024 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2025 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2027 /* Inner loop uses 373 flops */
2030 if(jidx<j_index_end)
2033 /* Get j neighbor index, and coordinate index */
2034 jnrlistA = jjnr[jidx];
2035 jnrlistB = jjnr[jidx+1];
2036 jnrlistC = jjnr[jidx+2];
2037 jnrlistD = jjnr[jidx+3];
2038 /* Sign of each element will be negative for non-real atoms.
2039 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2040 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
2042 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2044 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
2045 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
2046 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
2048 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2049 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2050 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2051 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2052 j_coord_offsetA = DIM*jnrA;
2053 j_coord_offsetB = DIM*jnrB;
2054 j_coord_offsetC = DIM*jnrC;
2055 j_coord_offsetD = DIM*jnrD;
2057 /* load j atom coordinates */
2058 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
2059 x+j_coord_offsetC,x+j_coord_offsetD,
2060 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2061 &jy2,&jz2,&jx3,&jy3,&jz3);
2063 /* Calculate displacement vector */
2064 dx00 = _mm256_sub_pd(ix0,jx0);
2065 dy00 = _mm256_sub_pd(iy0,jy0);
2066 dz00 = _mm256_sub_pd(iz0,jz0);
2067 dx11 = _mm256_sub_pd(ix1,jx1);
2068 dy11 = _mm256_sub_pd(iy1,jy1);
2069 dz11 = _mm256_sub_pd(iz1,jz1);
2070 dx12 = _mm256_sub_pd(ix1,jx2);
2071 dy12 = _mm256_sub_pd(iy1,jy2);
2072 dz12 = _mm256_sub_pd(iz1,jz2);
2073 dx13 = _mm256_sub_pd(ix1,jx3);
2074 dy13 = _mm256_sub_pd(iy1,jy3);
2075 dz13 = _mm256_sub_pd(iz1,jz3);
2076 dx21 = _mm256_sub_pd(ix2,jx1);
2077 dy21 = _mm256_sub_pd(iy2,jy1);
2078 dz21 = _mm256_sub_pd(iz2,jz1);
2079 dx22 = _mm256_sub_pd(ix2,jx2);
2080 dy22 = _mm256_sub_pd(iy2,jy2);
2081 dz22 = _mm256_sub_pd(iz2,jz2);
2082 dx23 = _mm256_sub_pd(ix2,jx3);
2083 dy23 = _mm256_sub_pd(iy2,jy3);
2084 dz23 = _mm256_sub_pd(iz2,jz3);
2085 dx31 = _mm256_sub_pd(ix3,jx1);
2086 dy31 = _mm256_sub_pd(iy3,jy1);
2087 dz31 = _mm256_sub_pd(iz3,jz1);
2088 dx32 = _mm256_sub_pd(ix3,jx2);
2089 dy32 = _mm256_sub_pd(iy3,jy2);
2090 dz32 = _mm256_sub_pd(iz3,jz2);
2091 dx33 = _mm256_sub_pd(ix3,jx3);
2092 dy33 = _mm256_sub_pd(iy3,jy3);
2093 dz33 = _mm256_sub_pd(iz3,jz3);
2095 /* Calculate squared distance and things based on it */
2096 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
2097 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
2098 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
2099 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
2100 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
2101 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
2102 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
2103 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
2104 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
2105 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
2107 rinv00 = avx256_invsqrt_d(rsq00);
2108 rinv11 = avx256_invsqrt_d(rsq11);
2109 rinv12 = avx256_invsqrt_d(rsq12);
2110 rinv13 = avx256_invsqrt_d(rsq13);
2111 rinv21 = avx256_invsqrt_d(rsq21);
2112 rinv22 = avx256_invsqrt_d(rsq22);
2113 rinv23 = avx256_invsqrt_d(rsq23);
2114 rinv31 = avx256_invsqrt_d(rsq31);
2115 rinv32 = avx256_invsqrt_d(rsq32);
2116 rinv33 = avx256_invsqrt_d(rsq33);
2118 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
2119 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
2120 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
2121 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
2122 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
2123 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
2124 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
2125 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
2126 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
2127 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
2129 fjx0 = _mm256_setzero_pd();
2130 fjy0 = _mm256_setzero_pd();
2131 fjz0 = _mm256_setzero_pd();
2132 fjx1 = _mm256_setzero_pd();
2133 fjy1 = _mm256_setzero_pd();
2134 fjz1 = _mm256_setzero_pd();
2135 fjx2 = _mm256_setzero_pd();
2136 fjy2 = _mm256_setzero_pd();
2137 fjz2 = _mm256_setzero_pd();
2138 fjx3 = _mm256_setzero_pd();
2139 fjy3 = _mm256_setzero_pd();
2140 fjz3 = _mm256_setzero_pd();
2142 /**************************
2143 * CALCULATE INTERACTIONS *
2144 **************************/
2146 r00 = _mm256_mul_pd(rsq00,rinv00);
2147 r00 = _mm256_andnot_pd(dummy_mask,r00);
2149 /* Analytical LJ-PME */
2150 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2151 ewcljrsq = _mm256_mul_pd(ewclj2,rsq00);
2152 ewclj6 = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
2153 exponent = avx256_exp_d(ewcljrsq);
2154 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
2155 poly = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
2156 /* f6A = 6 * C6grid * (1 - poly) */
2157 f6A = _mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly));
2158 /* f6B = C6grid * exponent * beta^6 */
2159 f6B = _mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6));
2160 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
2161 fvdw = _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),_mm256_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
2163 fscal = fvdw;
2165 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2167 /* Calculate temporary vectorial force */
2168 tx = _mm256_mul_pd(fscal,dx00);
2169 ty = _mm256_mul_pd(fscal,dy00);
2170 tz = _mm256_mul_pd(fscal,dz00);
2172 /* Update vectorial force */
2173 fix0 = _mm256_add_pd(fix0,tx);
2174 fiy0 = _mm256_add_pd(fiy0,ty);
2175 fiz0 = _mm256_add_pd(fiz0,tz);
2177 fjx0 = _mm256_add_pd(fjx0,tx);
2178 fjy0 = _mm256_add_pd(fjy0,ty);
2179 fjz0 = _mm256_add_pd(fjz0,tz);
2181 /**************************
2182 * CALCULATE INTERACTIONS *
2183 **************************/
2185 r11 = _mm256_mul_pd(rsq11,rinv11);
2186 r11 = _mm256_andnot_pd(dummy_mask,r11);
2188 /* EWALD ELECTROSTATICS */
2190 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2191 ewrt = _mm256_mul_pd(r11,ewtabscale);
2192 ewitab = _mm256_cvttpd_epi32(ewrt);
2193 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2194 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2195 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2196 &ewtabF,&ewtabFn);
2197 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2198 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
2200 fscal = felec;
2202 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2204 /* Calculate temporary vectorial force */
2205 tx = _mm256_mul_pd(fscal,dx11);
2206 ty = _mm256_mul_pd(fscal,dy11);
2207 tz = _mm256_mul_pd(fscal,dz11);
2209 /* Update vectorial force */
2210 fix1 = _mm256_add_pd(fix1,tx);
2211 fiy1 = _mm256_add_pd(fiy1,ty);
2212 fiz1 = _mm256_add_pd(fiz1,tz);
2214 fjx1 = _mm256_add_pd(fjx1,tx);
2215 fjy1 = _mm256_add_pd(fjy1,ty);
2216 fjz1 = _mm256_add_pd(fjz1,tz);
2218 /**************************
2219 * CALCULATE INTERACTIONS *
2220 **************************/
2222 r12 = _mm256_mul_pd(rsq12,rinv12);
2223 r12 = _mm256_andnot_pd(dummy_mask,r12);
2225 /* EWALD ELECTROSTATICS */
2227 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2228 ewrt = _mm256_mul_pd(r12,ewtabscale);
2229 ewitab = _mm256_cvttpd_epi32(ewrt);
2230 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2231 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2232 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2233 &ewtabF,&ewtabFn);
2234 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2235 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2237 fscal = felec;
2239 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2241 /* Calculate temporary vectorial force */
2242 tx = _mm256_mul_pd(fscal,dx12);
2243 ty = _mm256_mul_pd(fscal,dy12);
2244 tz = _mm256_mul_pd(fscal,dz12);
2246 /* Update vectorial force */
2247 fix1 = _mm256_add_pd(fix1,tx);
2248 fiy1 = _mm256_add_pd(fiy1,ty);
2249 fiz1 = _mm256_add_pd(fiz1,tz);
2251 fjx2 = _mm256_add_pd(fjx2,tx);
2252 fjy2 = _mm256_add_pd(fjy2,ty);
2253 fjz2 = _mm256_add_pd(fjz2,tz);
2255 /**************************
2256 * CALCULATE INTERACTIONS *
2257 **************************/
2259 r13 = _mm256_mul_pd(rsq13,rinv13);
2260 r13 = _mm256_andnot_pd(dummy_mask,r13);
2262 /* EWALD ELECTROSTATICS */
2264 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2265 ewrt = _mm256_mul_pd(r13,ewtabscale);
2266 ewitab = _mm256_cvttpd_epi32(ewrt);
2267 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2268 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2269 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2270 &ewtabF,&ewtabFn);
2271 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2272 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
2274 fscal = felec;
2276 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2278 /* Calculate temporary vectorial force */
2279 tx = _mm256_mul_pd(fscal,dx13);
2280 ty = _mm256_mul_pd(fscal,dy13);
2281 tz = _mm256_mul_pd(fscal,dz13);
2283 /* Update vectorial force */
2284 fix1 = _mm256_add_pd(fix1,tx);
2285 fiy1 = _mm256_add_pd(fiy1,ty);
2286 fiz1 = _mm256_add_pd(fiz1,tz);
2288 fjx3 = _mm256_add_pd(fjx3,tx);
2289 fjy3 = _mm256_add_pd(fjy3,ty);
2290 fjz3 = _mm256_add_pd(fjz3,tz);
2292 /**************************
2293 * CALCULATE INTERACTIONS *
2294 **************************/
2296 r21 = _mm256_mul_pd(rsq21,rinv21);
2297 r21 = _mm256_andnot_pd(dummy_mask,r21);
2299 /* EWALD ELECTROSTATICS */
2301 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2302 ewrt = _mm256_mul_pd(r21,ewtabscale);
2303 ewitab = _mm256_cvttpd_epi32(ewrt);
2304 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2305 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2306 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2307 &ewtabF,&ewtabFn);
2308 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2309 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2311 fscal = felec;
2313 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2315 /* Calculate temporary vectorial force */
2316 tx = _mm256_mul_pd(fscal,dx21);
2317 ty = _mm256_mul_pd(fscal,dy21);
2318 tz = _mm256_mul_pd(fscal,dz21);
2320 /* Update vectorial force */
2321 fix2 = _mm256_add_pd(fix2,tx);
2322 fiy2 = _mm256_add_pd(fiy2,ty);
2323 fiz2 = _mm256_add_pd(fiz2,tz);
2325 fjx1 = _mm256_add_pd(fjx1,tx);
2326 fjy1 = _mm256_add_pd(fjy1,ty);
2327 fjz1 = _mm256_add_pd(fjz1,tz);
2329 /**************************
2330 * CALCULATE INTERACTIONS *
2331 **************************/
2333 r22 = _mm256_mul_pd(rsq22,rinv22);
2334 r22 = _mm256_andnot_pd(dummy_mask,r22);
2336 /* EWALD ELECTROSTATICS */
2338 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2339 ewrt = _mm256_mul_pd(r22,ewtabscale);
2340 ewitab = _mm256_cvttpd_epi32(ewrt);
2341 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2342 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2343 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2344 &ewtabF,&ewtabFn);
2345 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2346 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2348 fscal = felec;
2350 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2352 /* Calculate temporary vectorial force */
2353 tx = _mm256_mul_pd(fscal,dx22);
2354 ty = _mm256_mul_pd(fscal,dy22);
2355 tz = _mm256_mul_pd(fscal,dz22);
2357 /* Update vectorial force */
2358 fix2 = _mm256_add_pd(fix2,tx);
2359 fiy2 = _mm256_add_pd(fiy2,ty);
2360 fiz2 = _mm256_add_pd(fiz2,tz);
2362 fjx2 = _mm256_add_pd(fjx2,tx);
2363 fjy2 = _mm256_add_pd(fjy2,ty);
2364 fjz2 = _mm256_add_pd(fjz2,tz);
2366 /**************************
2367 * CALCULATE INTERACTIONS *
2368 **************************/
2370 r23 = _mm256_mul_pd(rsq23,rinv23);
2371 r23 = _mm256_andnot_pd(dummy_mask,r23);
2373 /* EWALD ELECTROSTATICS */
2375 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2376 ewrt = _mm256_mul_pd(r23,ewtabscale);
2377 ewitab = _mm256_cvttpd_epi32(ewrt);
2378 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2379 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2380 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2381 &ewtabF,&ewtabFn);
2382 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2383 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
2385 fscal = felec;
2387 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2389 /* Calculate temporary vectorial force */
2390 tx = _mm256_mul_pd(fscal,dx23);
2391 ty = _mm256_mul_pd(fscal,dy23);
2392 tz = _mm256_mul_pd(fscal,dz23);
2394 /* Update vectorial force */
2395 fix2 = _mm256_add_pd(fix2,tx);
2396 fiy2 = _mm256_add_pd(fiy2,ty);
2397 fiz2 = _mm256_add_pd(fiz2,tz);
2399 fjx3 = _mm256_add_pd(fjx3,tx);
2400 fjy3 = _mm256_add_pd(fjy3,ty);
2401 fjz3 = _mm256_add_pd(fjz3,tz);
2403 /**************************
2404 * CALCULATE INTERACTIONS *
2405 **************************/
2407 r31 = _mm256_mul_pd(rsq31,rinv31);
2408 r31 = _mm256_andnot_pd(dummy_mask,r31);
2410 /* EWALD ELECTROSTATICS */
2412 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2413 ewrt = _mm256_mul_pd(r31,ewtabscale);
2414 ewitab = _mm256_cvttpd_epi32(ewrt);
2415 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2416 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2417 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2418 &ewtabF,&ewtabFn);
2419 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2420 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
2422 fscal = felec;
2424 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2426 /* Calculate temporary vectorial force */
2427 tx = _mm256_mul_pd(fscal,dx31);
2428 ty = _mm256_mul_pd(fscal,dy31);
2429 tz = _mm256_mul_pd(fscal,dz31);
2431 /* Update vectorial force */
2432 fix3 = _mm256_add_pd(fix3,tx);
2433 fiy3 = _mm256_add_pd(fiy3,ty);
2434 fiz3 = _mm256_add_pd(fiz3,tz);
2436 fjx1 = _mm256_add_pd(fjx1,tx);
2437 fjy1 = _mm256_add_pd(fjy1,ty);
2438 fjz1 = _mm256_add_pd(fjz1,tz);
2440 /**************************
2441 * CALCULATE INTERACTIONS *
2442 **************************/
2444 r32 = _mm256_mul_pd(rsq32,rinv32);
2445 r32 = _mm256_andnot_pd(dummy_mask,r32);
2447 /* EWALD ELECTROSTATICS */
2449 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2450 ewrt = _mm256_mul_pd(r32,ewtabscale);
2451 ewitab = _mm256_cvttpd_epi32(ewrt);
2452 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2453 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2454 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2455 &ewtabF,&ewtabFn);
2456 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2457 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
2459 fscal = felec;
2461 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2463 /* Calculate temporary vectorial force */
2464 tx = _mm256_mul_pd(fscal,dx32);
2465 ty = _mm256_mul_pd(fscal,dy32);
2466 tz = _mm256_mul_pd(fscal,dz32);
2468 /* Update vectorial force */
2469 fix3 = _mm256_add_pd(fix3,tx);
2470 fiy3 = _mm256_add_pd(fiy3,ty);
2471 fiz3 = _mm256_add_pd(fiz3,tz);
2473 fjx2 = _mm256_add_pd(fjx2,tx);
2474 fjy2 = _mm256_add_pd(fjy2,ty);
2475 fjz2 = _mm256_add_pd(fjz2,tz);
2477 /**************************
2478 * CALCULATE INTERACTIONS *
2479 **************************/
2481 r33 = _mm256_mul_pd(rsq33,rinv33);
2482 r33 = _mm256_andnot_pd(dummy_mask,r33);
2484 /* EWALD ELECTROSTATICS */
2486 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2487 ewrt = _mm256_mul_pd(r33,ewtabscale);
2488 ewitab = _mm256_cvttpd_epi32(ewrt);
2489 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2490 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2491 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2492 &ewtabF,&ewtabFn);
2493 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2494 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
2496 fscal = felec;
2498 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2500 /* Calculate temporary vectorial force */
2501 tx = _mm256_mul_pd(fscal,dx33);
2502 ty = _mm256_mul_pd(fscal,dy33);
2503 tz = _mm256_mul_pd(fscal,dz33);
2505 /* Update vectorial force */
2506 fix3 = _mm256_add_pd(fix3,tx);
2507 fiy3 = _mm256_add_pd(fiy3,ty);
2508 fiz3 = _mm256_add_pd(fiz3,tz);
2510 fjx3 = _mm256_add_pd(fjx3,tx);
2511 fjy3 = _mm256_add_pd(fjy3,ty);
2512 fjz3 = _mm256_add_pd(fjz3,tz);
2514 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2515 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2516 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2517 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2519 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2520 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2521 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2523 /* Inner loop uses 383 flops */
2526 /* End of innermost loop */
2528 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2529 f+i_coord_offset,fshift+i_shift_offset);
2531 /* Increment number of inner iterations */
2532 inneriter += j_index_end - j_index_start;
2534 /* Outer loop uses 24 flops */
2537 /* Increment number of outer iterations */
2538 outeriter += nri;
2540 /* Update outer/inner flops */
2542 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*383);