Fix segmentation fault in minimize
[gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_double / nb_kernel_ElecEw_VdwLJ_GeomW4W4_avx_256_double.cpp
blobc3587ae7ad3c98ac92adb2ffaafd3acd73be5541
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_double kernel generator.
38 #include "gmxpre.h"
40 #include "config.h"
42 #include <math.h>
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_avx_256_double.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_avx_256_double
51 * Electrostatics interaction: Ewald
52 * VdW interaction: LennardJones
53 * Geometry: Water4-Water4
54 * Calculate force/pot: PotentialAndForce
56 void
57 nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_avx_256_double
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73 int jnrA,jnrB,jnrC,jnrD;
74 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
76 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
78 real rcutoff_scalar;
79 real *shiftvec,*fshift,*x,*f;
80 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81 real scratch[4*DIM];
82 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83 real * vdwioffsetptr0;
84 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85 real * vdwioffsetptr1;
86 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87 real * vdwioffsetptr2;
88 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89 real * vdwioffsetptr3;
90 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
91 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
94 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
96 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
98 __m256d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
99 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
100 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
101 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
102 __m256d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
103 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
104 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
105 __m256d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
106 __m256d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
107 __m256d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
108 __m256d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
109 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
110 real *charge;
111 int nvdwtype;
112 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
113 int *vdwtype;
114 real *vdwparam;
115 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
116 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
117 __m128i ewitab;
118 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
119 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
120 real *ewtab;
121 __m256d dummy_mask,cutoff_mask;
122 __m128 tmpmask0,tmpmask1;
123 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
124 __m256d one = _mm256_set1_pd(1.0);
125 __m256d two = _mm256_set1_pd(2.0);
126 x = xx[0];
127 f = ff[0];
129 nri = nlist->nri;
130 iinr = nlist->iinr;
131 jindex = nlist->jindex;
132 jjnr = nlist->jjnr;
133 shiftidx = nlist->shift;
134 gid = nlist->gid;
135 shiftvec = fr->shift_vec[0];
136 fshift = fr->fshift[0];
137 facel = _mm256_set1_pd(fr->ic->epsfac);
138 charge = mdatoms->chargeA;
139 nvdwtype = fr->ntype;
140 vdwparam = fr->nbfp;
141 vdwtype = mdatoms->typeA;
143 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
144 beta = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
145 beta2 = _mm256_mul_pd(beta,beta);
146 beta3 = _mm256_mul_pd(beta,beta2);
148 ewtab = fr->ic->tabq_coul_FDV0;
149 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
150 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
152 /* Setup water-specific parameters */
153 inr = nlist->iinr[0];
154 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
155 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
156 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
157 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
159 jq1 = _mm256_set1_pd(charge[inr+1]);
160 jq2 = _mm256_set1_pd(charge[inr+2]);
161 jq3 = _mm256_set1_pd(charge[inr+3]);
162 vdwjidx0A = 2*vdwtype[inr+0];
163 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
164 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
165 qq11 = _mm256_mul_pd(iq1,jq1);
166 qq12 = _mm256_mul_pd(iq1,jq2);
167 qq13 = _mm256_mul_pd(iq1,jq3);
168 qq21 = _mm256_mul_pd(iq2,jq1);
169 qq22 = _mm256_mul_pd(iq2,jq2);
170 qq23 = _mm256_mul_pd(iq2,jq3);
171 qq31 = _mm256_mul_pd(iq3,jq1);
172 qq32 = _mm256_mul_pd(iq3,jq2);
173 qq33 = _mm256_mul_pd(iq3,jq3);
175 /* Avoid stupid compiler warnings */
176 jnrA = jnrB = jnrC = jnrD = 0;
177 j_coord_offsetA = 0;
178 j_coord_offsetB = 0;
179 j_coord_offsetC = 0;
180 j_coord_offsetD = 0;
182 outeriter = 0;
183 inneriter = 0;
185 for(iidx=0;iidx<4*DIM;iidx++)
187 scratch[iidx] = 0.0;
190 /* Start outer loop over neighborlists */
191 for(iidx=0; iidx<nri; iidx++)
193 /* Load shift vector for this list */
194 i_shift_offset = DIM*shiftidx[iidx];
196 /* Load limits for loop over neighbors */
197 j_index_start = jindex[iidx];
198 j_index_end = jindex[iidx+1];
200 /* Get outer coordinate index */
201 inr = iinr[iidx];
202 i_coord_offset = DIM*inr;
204 /* Load i particle coords and add shift vector */
205 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
206 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
208 fix0 = _mm256_setzero_pd();
209 fiy0 = _mm256_setzero_pd();
210 fiz0 = _mm256_setzero_pd();
211 fix1 = _mm256_setzero_pd();
212 fiy1 = _mm256_setzero_pd();
213 fiz1 = _mm256_setzero_pd();
214 fix2 = _mm256_setzero_pd();
215 fiy2 = _mm256_setzero_pd();
216 fiz2 = _mm256_setzero_pd();
217 fix3 = _mm256_setzero_pd();
218 fiy3 = _mm256_setzero_pd();
219 fiz3 = _mm256_setzero_pd();
221 /* Reset potential sums */
222 velecsum = _mm256_setzero_pd();
223 vvdwsum = _mm256_setzero_pd();
225 /* Start inner kernel loop */
226 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
229 /* Get j neighbor index, and coordinate index */
230 jnrA = jjnr[jidx];
231 jnrB = jjnr[jidx+1];
232 jnrC = jjnr[jidx+2];
233 jnrD = jjnr[jidx+3];
234 j_coord_offsetA = DIM*jnrA;
235 j_coord_offsetB = DIM*jnrB;
236 j_coord_offsetC = DIM*jnrC;
237 j_coord_offsetD = DIM*jnrD;
239 /* load j atom coordinates */
240 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
241 x+j_coord_offsetC,x+j_coord_offsetD,
242 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
243 &jy2,&jz2,&jx3,&jy3,&jz3);
245 /* Calculate displacement vector */
246 dx00 = _mm256_sub_pd(ix0,jx0);
247 dy00 = _mm256_sub_pd(iy0,jy0);
248 dz00 = _mm256_sub_pd(iz0,jz0);
249 dx11 = _mm256_sub_pd(ix1,jx1);
250 dy11 = _mm256_sub_pd(iy1,jy1);
251 dz11 = _mm256_sub_pd(iz1,jz1);
252 dx12 = _mm256_sub_pd(ix1,jx2);
253 dy12 = _mm256_sub_pd(iy1,jy2);
254 dz12 = _mm256_sub_pd(iz1,jz2);
255 dx13 = _mm256_sub_pd(ix1,jx3);
256 dy13 = _mm256_sub_pd(iy1,jy3);
257 dz13 = _mm256_sub_pd(iz1,jz3);
258 dx21 = _mm256_sub_pd(ix2,jx1);
259 dy21 = _mm256_sub_pd(iy2,jy1);
260 dz21 = _mm256_sub_pd(iz2,jz1);
261 dx22 = _mm256_sub_pd(ix2,jx2);
262 dy22 = _mm256_sub_pd(iy2,jy2);
263 dz22 = _mm256_sub_pd(iz2,jz2);
264 dx23 = _mm256_sub_pd(ix2,jx3);
265 dy23 = _mm256_sub_pd(iy2,jy3);
266 dz23 = _mm256_sub_pd(iz2,jz3);
267 dx31 = _mm256_sub_pd(ix3,jx1);
268 dy31 = _mm256_sub_pd(iy3,jy1);
269 dz31 = _mm256_sub_pd(iz3,jz1);
270 dx32 = _mm256_sub_pd(ix3,jx2);
271 dy32 = _mm256_sub_pd(iy3,jy2);
272 dz32 = _mm256_sub_pd(iz3,jz2);
273 dx33 = _mm256_sub_pd(ix3,jx3);
274 dy33 = _mm256_sub_pd(iy3,jy3);
275 dz33 = _mm256_sub_pd(iz3,jz3);
277 /* Calculate squared distance and things based on it */
278 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
279 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
280 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
281 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
282 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
283 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
284 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
285 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
286 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
287 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
289 rinv11 = avx256_invsqrt_d(rsq11);
290 rinv12 = avx256_invsqrt_d(rsq12);
291 rinv13 = avx256_invsqrt_d(rsq13);
292 rinv21 = avx256_invsqrt_d(rsq21);
293 rinv22 = avx256_invsqrt_d(rsq22);
294 rinv23 = avx256_invsqrt_d(rsq23);
295 rinv31 = avx256_invsqrt_d(rsq31);
296 rinv32 = avx256_invsqrt_d(rsq32);
297 rinv33 = avx256_invsqrt_d(rsq33);
299 rinvsq00 = avx256_inv_d(rsq00);
300 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
301 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
302 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
303 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
304 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
305 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
306 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
307 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
308 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
310 fjx0 = _mm256_setzero_pd();
311 fjy0 = _mm256_setzero_pd();
312 fjz0 = _mm256_setzero_pd();
313 fjx1 = _mm256_setzero_pd();
314 fjy1 = _mm256_setzero_pd();
315 fjz1 = _mm256_setzero_pd();
316 fjx2 = _mm256_setzero_pd();
317 fjy2 = _mm256_setzero_pd();
318 fjz2 = _mm256_setzero_pd();
319 fjx3 = _mm256_setzero_pd();
320 fjy3 = _mm256_setzero_pd();
321 fjz3 = _mm256_setzero_pd();
323 /**************************
324 * CALCULATE INTERACTIONS *
325 **************************/
327 /* LENNARD-JONES DISPERSION/REPULSION */
329 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
330 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
331 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
332 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
333 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
335 /* Update potential sum for this i atom from the interaction with this j atom. */
336 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
338 fscal = fvdw;
340 /* Calculate temporary vectorial force */
341 tx = _mm256_mul_pd(fscal,dx00);
342 ty = _mm256_mul_pd(fscal,dy00);
343 tz = _mm256_mul_pd(fscal,dz00);
345 /* Update vectorial force */
346 fix0 = _mm256_add_pd(fix0,tx);
347 fiy0 = _mm256_add_pd(fiy0,ty);
348 fiz0 = _mm256_add_pd(fiz0,tz);
350 fjx0 = _mm256_add_pd(fjx0,tx);
351 fjy0 = _mm256_add_pd(fjy0,ty);
352 fjz0 = _mm256_add_pd(fjz0,tz);
354 /**************************
355 * CALCULATE INTERACTIONS *
356 **************************/
358 r11 = _mm256_mul_pd(rsq11,rinv11);
360 /* EWALD ELECTROSTATICS */
362 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
363 ewrt = _mm256_mul_pd(r11,ewtabscale);
364 ewitab = _mm256_cvttpd_epi32(ewrt);
365 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
366 ewitab = _mm_slli_epi32(ewitab,2);
367 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
368 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
369 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
370 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
371 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
372 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
373 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
374 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
375 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
377 /* Update potential sum for this i atom from the interaction with this j atom. */
378 velecsum = _mm256_add_pd(velecsum,velec);
380 fscal = felec;
382 /* Calculate temporary vectorial force */
383 tx = _mm256_mul_pd(fscal,dx11);
384 ty = _mm256_mul_pd(fscal,dy11);
385 tz = _mm256_mul_pd(fscal,dz11);
387 /* Update vectorial force */
388 fix1 = _mm256_add_pd(fix1,tx);
389 fiy1 = _mm256_add_pd(fiy1,ty);
390 fiz1 = _mm256_add_pd(fiz1,tz);
392 fjx1 = _mm256_add_pd(fjx1,tx);
393 fjy1 = _mm256_add_pd(fjy1,ty);
394 fjz1 = _mm256_add_pd(fjz1,tz);
396 /**************************
397 * CALCULATE INTERACTIONS *
398 **************************/
400 r12 = _mm256_mul_pd(rsq12,rinv12);
402 /* EWALD ELECTROSTATICS */
404 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
405 ewrt = _mm256_mul_pd(r12,ewtabscale);
406 ewitab = _mm256_cvttpd_epi32(ewrt);
407 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
408 ewitab = _mm_slli_epi32(ewitab,2);
409 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
410 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
411 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
412 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
413 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
414 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
415 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
416 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
417 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
419 /* Update potential sum for this i atom from the interaction with this j atom. */
420 velecsum = _mm256_add_pd(velecsum,velec);
422 fscal = felec;
424 /* Calculate temporary vectorial force */
425 tx = _mm256_mul_pd(fscal,dx12);
426 ty = _mm256_mul_pd(fscal,dy12);
427 tz = _mm256_mul_pd(fscal,dz12);
429 /* Update vectorial force */
430 fix1 = _mm256_add_pd(fix1,tx);
431 fiy1 = _mm256_add_pd(fiy1,ty);
432 fiz1 = _mm256_add_pd(fiz1,tz);
434 fjx2 = _mm256_add_pd(fjx2,tx);
435 fjy2 = _mm256_add_pd(fjy2,ty);
436 fjz2 = _mm256_add_pd(fjz2,tz);
438 /**************************
439 * CALCULATE INTERACTIONS *
440 **************************/
442 r13 = _mm256_mul_pd(rsq13,rinv13);
444 /* EWALD ELECTROSTATICS */
446 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
447 ewrt = _mm256_mul_pd(r13,ewtabscale);
448 ewitab = _mm256_cvttpd_epi32(ewrt);
449 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
450 ewitab = _mm_slli_epi32(ewitab,2);
451 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
452 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
453 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
454 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
455 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
456 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
457 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
458 velec = _mm256_mul_pd(qq13,_mm256_sub_pd(rinv13,velec));
459 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
461 /* Update potential sum for this i atom from the interaction with this j atom. */
462 velecsum = _mm256_add_pd(velecsum,velec);
464 fscal = felec;
466 /* Calculate temporary vectorial force */
467 tx = _mm256_mul_pd(fscal,dx13);
468 ty = _mm256_mul_pd(fscal,dy13);
469 tz = _mm256_mul_pd(fscal,dz13);
471 /* Update vectorial force */
472 fix1 = _mm256_add_pd(fix1,tx);
473 fiy1 = _mm256_add_pd(fiy1,ty);
474 fiz1 = _mm256_add_pd(fiz1,tz);
476 fjx3 = _mm256_add_pd(fjx3,tx);
477 fjy3 = _mm256_add_pd(fjy3,ty);
478 fjz3 = _mm256_add_pd(fjz3,tz);
480 /**************************
481 * CALCULATE INTERACTIONS *
482 **************************/
484 r21 = _mm256_mul_pd(rsq21,rinv21);
486 /* EWALD ELECTROSTATICS */
488 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
489 ewrt = _mm256_mul_pd(r21,ewtabscale);
490 ewitab = _mm256_cvttpd_epi32(ewrt);
491 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
492 ewitab = _mm_slli_epi32(ewitab,2);
493 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
494 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
495 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
496 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
497 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
498 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
499 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
500 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
501 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
503 /* Update potential sum for this i atom from the interaction with this j atom. */
504 velecsum = _mm256_add_pd(velecsum,velec);
506 fscal = felec;
508 /* Calculate temporary vectorial force */
509 tx = _mm256_mul_pd(fscal,dx21);
510 ty = _mm256_mul_pd(fscal,dy21);
511 tz = _mm256_mul_pd(fscal,dz21);
513 /* Update vectorial force */
514 fix2 = _mm256_add_pd(fix2,tx);
515 fiy2 = _mm256_add_pd(fiy2,ty);
516 fiz2 = _mm256_add_pd(fiz2,tz);
518 fjx1 = _mm256_add_pd(fjx1,tx);
519 fjy1 = _mm256_add_pd(fjy1,ty);
520 fjz1 = _mm256_add_pd(fjz1,tz);
522 /**************************
523 * CALCULATE INTERACTIONS *
524 **************************/
526 r22 = _mm256_mul_pd(rsq22,rinv22);
528 /* EWALD ELECTROSTATICS */
530 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
531 ewrt = _mm256_mul_pd(r22,ewtabscale);
532 ewitab = _mm256_cvttpd_epi32(ewrt);
533 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
534 ewitab = _mm_slli_epi32(ewitab,2);
535 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
536 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
537 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
538 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
539 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
540 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
541 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
542 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
543 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
545 /* Update potential sum for this i atom from the interaction with this j atom. */
546 velecsum = _mm256_add_pd(velecsum,velec);
548 fscal = felec;
550 /* Calculate temporary vectorial force */
551 tx = _mm256_mul_pd(fscal,dx22);
552 ty = _mm256_mul_pd(fscal,dy22);
553 tz = _mm256_mul_pd(fscal,dz22);
555 /* Update vectorial force */
556 fix2 = _mm256_add_pd(fix2,tx);
557 fiy2 = _mm256_add_pd(fiy2,ty);
558 fiz2 = _mm256_add_pd(fiz2,tz);
560 fjx2 = _mm256_add_pd(fjx2,tx);
561 fjy2 = _mm256_add_pd(fjy2,ty);
562 fjz2 = _mm256_add_pd(fjz2,tz);
564 /**************************
565 * CALCULATE INTERACTIONS *
566 **************************/
568 r23 = _mm256_mul_pd(rsq23,rinv23);
570 /* EWALD ELECTROSTATICS */
572 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
573 ewrt = _mm256_mul_pd(r23,ewtabscale);
574 ewitab = _mm256_cvttpd_epi32(ewrt);
575 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
576 ewitab = _mm_slli_epi32(ewitab,2);
577 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
578 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
579 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
580 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
581 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
582 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
583 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
584 velec = _mm256_mul_pd(qq23,_mm256_sub_pd(rinv23,velec));
585 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
587 /* Update potential sum for this i atom from the interaction with this j atom. */
588 velecsum = _mm256_add_pd(velecsum,velec);
590 fscal = felec;
592 /* Calculate temporary vectorial force */
593 tx = _mm256_mul_pd(fscal,dx23);
594 ty = _mm256_mul_pd(fscal,dy23);
595 tz = _mm256_mul_pd(fscal,dz23);
597 /* Update vectorial force */
598 fix2 = _mm256_add_pd(fix2,tx);
599 fiy2 = _mm256_add_pd(fiy2,ty);
600 fiz2 = _mm256_add_pd(fiz2,tz);
602 fjx3 = _mm256_add_pd(fjx3,tx);
603 fjy3 = _mm256_add_pd(fjy3,ty);
604 fjz3 = _mm256_add_pd(fjz3,tz);
606 /**************************
607 * CALCULATE INTERACTIONS *
608 **************************/
610 r31 = _mm256_mul_pd(rsq31,rinv31);
612 /* EWALD ELECTROSTATICS */
614 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
615 ewrt = _mm256_mul_pd(r31,ewtabscale);
616 ewitab = _mm256_cvttpd_epi32(ewrt);
617 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
618 ewitab = _mm_slli_epi32(ewitab,2);
619 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
620 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
621 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
622 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
623 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
624 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
625 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
626 velec = _mm256_mul_pd(qq31,_mm256_sub_pd(rinv31,velec));
627 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
629 /* Update potential sum for this i atom from the interaction with this j atom. */
630 velecsum = _mm256_add_pd(velecsum,velec);
632 fscal = felec;
634 /* Calculate temporary vectorial force */
635 tx = _mm256_mul_pd(fscal,dx31);
636 ty = _mm256_mul_pd(fscal,dy31);
637 tz = _mm256_mul_pd(fscal,dz31);
639 /* Update vectorial force */
640 fix3 = _mm256_add_pd(fix3,tx);
641 fiy3 = _mm256_add_pd(fiy3,ty);
642 fiz3 = _mm256_add_pd(fiz3,tz);
644 fjx1 = _mm256_add_pd(fjx1,tx);
645 fjy1 = _mm256_add_pd(fjy1,ty);
646 fjz1 = _mm256_add_pd(fjz1,tz);
648 /**************************
649 * CALCULATE INTERACTIONS *
650 **************************/
652 r32 = _mm256_mul_pd(rsq32,rinv32);
654 /* EWALD ELECTROSTATICS */
656 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
657 ewrt = _mm256_mul_pd(r32,ewtabscale);
658 ewitab = _mm256_cvttpd_epi32(ewrt);
659 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
660 ewitab = _mm_slli_epi32(ewitab,2);
661 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
662 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
663 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
664 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
665 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
666 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
667 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
668 velec = _mm256_mul_pd(qq32,_mm256_sub_pd(rinv32,velec));
669 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
671 /* Update potential sum for this i atom from the interaction with this j atom. */
672 velecsum = _mm256_add_pd(velecsum,velec);
674 fscal = felec;
676 /* Calculate temporary vectorial force */
677 tx = _mm256_mul_pd(fscal,dx32);
678 ty = _mm256_mul_pd(fscal,dy32);
679 tz = _mm256_mul_pd(fscal,dz32);
681 /* Update vectorial force */
682 fix3 = _mm256_add_pd(fix3,tx);
683 fiy3 = _mm256_add_pd(fiy3,ty);
684 fiz3 = _mm256_add_pd(fiz3,tz);
686 fjx2 = _mm256_add_pd(fjx2,tx);
687 fjy2 = _mm256_add_pd(fjy2,ty);
688 fjz2 = _mm256_add_pd(fjz2,tz);
690 /**************************
691 * CALCULATE INTERACTIONS *
692 **************************/
694 r33 = _mm256_mul_pd(rsq33,rinv33);
696 /* EWALD ELECTROSTATICS */
698 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
699 ewrt = _mm256_mul_pd(r33,ewtabscale);
700 ewitab = _mm256_cvttpd_epi32(ewrt);
701 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
702 ewitab = _mm_slli_epi32(ewitab,2);
703 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
704 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
705 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
706 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
707 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
708 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
709 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
710 velec = _mm256_mul_pd(qq33,_mm256_sub_pd(rinv33,velec));
711 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
713 /* Update potential sum for this i atom from the interaction with this j atom. */
714 velecsum = _mm256_add_pd(velecsum,velec);
716 fscal = felec;
718 /* Calculate temporary vectorial force */
719 tx = _mm256_mul_pd(fscal,dx33);
720 ty = _mm256_mul_pd(fscal,dy33);
721 tz = _mm256_mul_pd(fscal,dz33);
723 /* Update vectorial force */
724 fix3 = _mm256_add_pd(fix3,tx);
725 fiy3 = _mm256_add_pd(fiy3,ty);
726 fiz3 = _mm256_add_pd(fiz3,tz);
728 fjx3 = _mm256_add_pd(fjx3,tx);
729 fjy3 = _mm256_add_pd(fjy3,ty);
730 fjz3 = _mm256_add_pd(fjz3,tz);
732 fjptrA = f+j_coord_offsetA;
733 fjptrB = f+j_coord_offsetB;
734 fjptrC = f+j_coord_offsetC;
735 fjptrD = f+j_coord_offsetD;
737 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
738 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
739 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
741 /* Inner loop uses 404 flops */
744 if(jidx<j_index_end)
747 /* Get j neighbor index, and coordinate index */
748 jnrlistA = jjnr[jidx];
749 jnrlistB = jjnr[jidx+1];
750 jnrlistC = jjnr[jidx+2];
751 jnrlistD = jjnr[jidx+3];
752 /* Sign of each element will be negative for non-real atoms.
753 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
754 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
756 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
758 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
759 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
760 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
762 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
763 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
764 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
765 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
766 j_coord_offsetA = DIM*jnrA;
767 j_coord_offsetB = DIM*jnrB;
768 j_coord_offsetC = DIM*jnrC;
769 j_coord_offsetD = DIM*jnrD;
771 /* load j atom coordinates */
772 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
773 x+j_coord_offsetC,x+j_coord_offsetD,
774 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
775 &jy2,&jz2,&jx3,&jy3,&jz3);
777 /* Calculate displacement vector */
778 dx00 = _mm256_sub_pd(ix0,jx0);
779 dy00 = _mm256_sub_pd(iy0,jy0);
780 dz00 = _mm256_sub_pd(iz0,jz0);
781 dx11 = _mm256_sub_pd(ix1,jx1);
782 dy11 = _mm256_sub_pd(iy1,jy1);
783 dz11 = _mm256_sub_pd(iz1,jz1);
784 dx12 = _mm256_sub_pd(ix1,jx2);
785 dy12 = _mm256_sub_pd(iy1,jy2);
786 dz12 = _mm256_sub_pd(iz1,jz2);
787 dx13 = _mm256_sub_pd(ix1,jx3);
788 dy13 = _mm256_sub_pd(iy1,jy3);
789 dz13 = _mm256_sub_pd(iz1,jz3);
790 dx21 = _mm256_sub_pd(ix2,jx1);
791 dy21 = _mm256_sub_pd(iy2,jy1);
792 dz21 = _mm256_sub_pd(iz2,jz1);
793 dx22 = _mm256_sub_pd(ix2,jx2);
794 dy22 = _mm256_sub_pd(iy2,jy2);
795 dz22 = _mm256_sub_pd(iz2,jz2);
796 dx23 = _mm256_sub_pd(ix2,jx3);
797 dy23 = _mm256_sub_pd(iy2,jy3);
798 dz23 = _mm256_sub_pd(iz2,jz3);
799 dx31 = _mm256_sub_pd(ix3,jx1);
800 dy31 = _mm256_sub_pd(iy3,jy1);
801 dz31 = _mm256_sub_pd(iz3,jz1);
802 dx32 = _mm256_sub_pd(ix3,jx2);
803 dy32 = _mm256_sub_pd(iy3,jy2);
804 dz32 = _mm256_sub_pd(iz3,jz2);
805 dx33 = _mm256_sub_pd(ix3,jx3);
806 dy33 = _mm256_sub_pd(iy3,jy3);
807 dz33 = _mm256_sub_pd(iz3,jz3);
809 /* Calculate squared distance and things based on it */
810 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
811 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
812 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
813 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
814 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
815 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
816 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
817 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
818 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
819 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
821 rinv11 = avx256_invsqrt_d(rsq11);
822 rinv12 = avx256_invsqrt_d(rsq12);
823 rinv13 = avx256_invsqrt_d(rsq13);
824 rinv21 = avx256_invsqrt_d(rsq21);
825 rinv22 = avx256_invsqrt_d(rsq22);
826 rinv23 = avx256_invsqrt_d(rsq23);
827 rinv31 = avx256_invsqrt_d(rsq31);
828 rinv32 = avx256_invsqrt_d(rsq32);
829 rinv33 = avx256_invsqrt_d(rsq33);
831 rinvsq00 = avx256_inv_d(rsq00);
832 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
833 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
834 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
835 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
836 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
837 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
838 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
839 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
840 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
842 fjx0 = _mm256_setzero_pd();
843 fjy0 = _mm256_setzero_pd();
844 fjz0 = _mm256_setzero_pd();
845 fjx1 = _mm256_setzero_pd();
846 fjy1 = _mm256_setzero_pd();
847 fjz1 = _mm256_setzero_pd();
848 fjx2 = _mm256_setzero_pd();
849 fjy2 = _mm256_setzero_pd();
850 fjz2 = _mm256_setzero_pd();
851 fjx3 = _mm256_setzero_pd();
852 fjy3 = _mm256_setzero_pd();
853 fjz3 = _mm256_setzero_pd();
855 /**************************
856 * CALCULATE INTERACTIONS *
857 **************************/
859 /* LENNARD-JONES DISPERSION/REPULSION */
861 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
862 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
863 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
864 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
865 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
867 /* Update potential sum for this i atom from the interaction with this j atom. */
868 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
869 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
871 fscal = fvdw;
873 fscal = _mm256_andnot_pd(dummy_mask,fscal);
875 /* Calculate temporary vectorial force */
876 tx = _mm256_mul_pd(fscal,dx00);
877 ty = _mm256_mul_pd(fscal,dy00);
878 tz = _mm256_mul_pd(fscal,dz00);
880 /* Update vectorial force */
881 fix0 = _mm256_add_pd(fix0,tx);
882 fiy0 = _mm256_add_pd(fiy0,ty);
883 fiz0 = _mm256_add_pd(fiz0,tz);
885 fjx0 = _mm256_add_pd(fjx0,tx);
886 fjy0 = _mm256_add_pd(fjy0,ty);
887 fjz0 = _mm256_add_pd(fjz0,tz);
889 /**************************
890 * CALCULATE INTERACTIONS *
891 **************************/
893 r11 = _mm256_mul_pd(rsq11,rinv11);
894 r11 = _mm256_andnot_pd(dummy_mask,r11);
896 /* EWALD ELECTROSTATICS */
898 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
899 ewrt = _mm256_mul_pd(r11,ewtabscale);
900 ewitab = _mm256_cvttpd_epi32(ewrt);
901 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
902 ewitab = _mm_slli_epi32(ewitab,2);
903 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
904 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
905 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
906 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
907 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
908 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
909 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
910 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
911 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
913 /* Update potential sum for this i atom from the interaction with this j atom. */
914 velec = _mm256_andnot_pd(dummy_mask,velec);
915 velecsum = _mm256_add_pd(velecsum,velec);
917 fscal = felec;
919 fscal = _mm256_andnot_pd(dummy_mask,fscal);
921 /* Calculate temporary vectorial force */
922 tx = _mm256_mul_pd(fscal,dx11);
923 ty = _mm256_mul_pd(fscal,dy11);
924 tz = _mm256_mul_pd(fscal,dz11);
926 /* Update vectorial force */
927 fix1 = _mm256_add_pd(fix1,tx);
928 fiy1 = _mm256_add_pd(fiy1,ty);
929 fiz1 = _mm256_add_pd(fiz1,tz);
931 fjx1 = _mm256_add_pd(fjx1,tx);
932 fjy1 = _mm256_add_pd(fjy1,ty);
933 fjz1 = _mm256_add_pd(fjz1,tz);
935 /**************************
936 * CALCULATE INTERACTIONS *
937 **************************/
939 r12 = _mm256_mul_pd(rsq12,rinv12);
940 r12 = _mm256_andnot_pd(dummy_mask,r12);
942 /* EWALD ELECTROSTATICS */
944 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
945 ewrt = _mm256_mul_pd(r12,ewtabscale);
946 ewitab = _mm256_cvttpd_epi32(ewrt);
947 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
948 ewitab = _mm_slli_epi32(ewitab,2);
949 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
950 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
951 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
952 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
953 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
954 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
955 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
956 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
957 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
959 /* Update potential sum for this i atom from the interaction with this j atom. */
960 velec = _mm256_andnot_pd(dummy_mask,velec);
961 velecsum = _mm256_add_pd(velecsum,velec);
963 fscal = felec;
965 fscal = _mm256_andnot_pd(dummy_mask,fscal);
967 /* Calculate temporary vectorial force */
968 tx = _mm256_mul_pd(fscal,dx12);
969 ty = _mm256_mul_pd(fscal,dy12);
970 tz = _mm256_mul_pd(fscal,dz12);
972 /* Update vectorial force */
973 fix1 = _mm256_add_pd(fix1,tx);
974 fiy1 = _mm256_add_pd(fiy1,ty);
975 fiz1 = _mm256_add_pd(fiz1,tz);
977 fjx2 = _mm256_add_pd(fjx2,tx);
978 fjy2 = _mm256_add_pd(fjy2,ty);
979 fjz2 = _mm256_add_pd(fjz2,tz);
981 /**************************
982 * CALCULATE INTERACTIONS *
983 **************************/
985 r13 = _mm256_mul_pd(rsq13,rinv13);
986 r13 = _mm256_andnot_pd(dummy_mask,r13);
988 /* EWALD ELECTROSTATICS */
990 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
991 ewrt = _mm256_mul_pd(r13,ewtabscale);
992 ewitab = _mm256_cvttpd_epi32(ewrt);
993 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
994 ewitab = _mm_slli_epi32(ewitab,2);
995 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
996 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
997 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
998 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
999 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1000 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1001 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1002 velec = _mm256_mul_pd(qq13,_mm256_sub_pd(rinv13,velec));
1003 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
1005 /* Update potential sum for this i atom from the interaction with this j atom. */
1006 velec = _mm256_andnot_pd(dummy_mask,velec);
1007 velecsum = _mm256_add_pd(velecsum,velec);
1009 fscal = felec;
1011 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1013 /* Calculate temporary vectorial force */
1014 tx = _mm256_mul_pd(fscal,dx13);
1015 ty = _mm256_mul_pd(fscal,dy13);
1016 tz = _mm256_mul_pd(fscal,dz13);
1018 /* Update vectorial force */
1019 fix1 = _mm256_add_pd(fix1,tx);
1020 fiy1 = _mm256_add_pd(fiy1,ty);
1021 fiz1 = _mm256_add_pd(fiz1,tz);
1023 fjx3 = _mm256_add_pd(fjx3,tx);
1024 fjy3 = _mm256_add_pd(fjy3,ty);
1025 fjz3 = _mm256_add_pd(fjz3,tz);
1027 /**************************
1028 * CALCULATE INTERACTIONS *
1029 **************************/
1031 r21 = _mm256_mul_pd(rsq21,rinv21);
1032 r21 = _mm256_andnot_pd(dummy_mask,r21);
1034 /* EWALD ELECTROSTATICS */
1036 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1037 ewrt = _mm256_mul_pd(r21,ewtabscale);
1038 ewitab = _mm256_cvttpd_epi32(ewrt);
1039 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1040 ewitab = _mm_slli_epi32(ewitab,2);
1041 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1042 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1043 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1044 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1045 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1046 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1047 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1048 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
1049 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1051 /* Update potential sum for this i atom from the interaction with this j atom. */
1052 velec = _mm256_andnot_pd(dummy_mask,velec);
1053 velecsum = _mm256_add_pd(velecsum,velec);
1055 fscal = felec;
1057 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1059 /* Calculate temporary vectorial force */
1060 tx = _mm256_mul_pd(fscal,dx21);
1061 ty = _mm256_mul_pd(fscal,dy21);
1062 tz = _mm256_mul_pd(fscal,dz21);
1064 /* Update vectorial force */
1065 fix2 = _mm256_add_pd(fix2,tx);
1066 fiy2 = _mm256_add_pd(fiy2,ty);
1067 fiz2 = _mm256_add_pd(fiz2,tz);
1069 fjx1 = _mm256_add_pd(fjx1,tx);
1070 fjy1 = _mm256_add_pd(fjy1,ty);
1071 fjz1 = _mm256_add_pd(fjz1,tz);
1073 /**************************
1074 * CALCULATE INTERACTIONS *
1075 **************************/
1077 r22 = _mm256_mul_pd(rsq22,rinv22);
1078 r22 = _mm256_andnot_pd(dummy_mask,r22);
1080 /* EWALD ELECTROSTATICS */
1082 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1083 ewrt = _mm256_mul_pd(r22,ewtabscale);
1084 ewitab = _mm256_cvttpd_epi32(ewrt);
1085 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1086 ewitab = _mm_slli_epi32(ewitab,2);
1087 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1088 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1089 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1090 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1091 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1092 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1093 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1094 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
1095 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1097 /* Update potential sum for this i atom from the interaction with this j atom. */
1098 velec = _mm256_andnot_pd(dummy_mask,velec);
1099 velecsum = _mm256_add_pd(velecsum,velec);
1101 fscal = felec;
1103 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1105 /* Calculate temporary vectorial force */
1106 tx = _mm256_mul_pd(fscal,dx22);
1107 ty = _mm256_mul_pd(fscal,dy22);
1108 tz = _mm256_mul_pd(fscal,dz22);
1110 /* Update vectorial force */
1111 fix2 = _mm256_add_pd(fix2,tx);
1112 fiy2 = _mm256_add_pd(fiy2,ty);
1113 fiz2 = _mm256_add_pd(fiz2,tz);
1115 fjx2 = _mm256_add_pd(fjx2,tx);
1116 fjy2 = _mm256_add_pd(fjy2,ty);
1117 fjz2 = _mm256_add_pd(fjz2,tz);
1119 /**************************
1120 * CALCULATE INTERACTIONS *
1121 **************************/
1123 r23 = _mm256_mul_pd(rsq23,rinv23);
1124 r23 = _mm256_andnot_pd(dummy_mask,r23);
1126 /* EWALD ELECTROSTATICS */
1128 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1129 ewrt = _mm256_mul_pd(r23,ewtabscale);
1130 ewitab = _mm256_cvttpd_epi32(ewrt);
1131 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1132 ewitab = _mm_slli_epi32(ewitab,2);
1133 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1134 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1135 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1136 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1137 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1138 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1139 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1140 velec = _mm256_mul_pd(qq23,_mm256_sub_pd(rinv23,velec));
1141 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
1143 /* Update potential sum for this i atom from the interaction with this j atom. */
1144 velec = _mm256_andnot_pd(dummy_mask,velec);
1145 velecsum = _mm256_add_pd(velecsum,velec);
1147 fscal = felec;
1149 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1151 /* Calculate temporary vectorial force */
1152 tx = _mm256_mul_pd(fscal,dx23);
1153 ty = _mm256_mul_pd(fscal,dy23);
1154 tz = _mm256_mul_pd(fscal,dz23);
1156 /* Update vectorial force */
1157 fix2 = _mm256_add_pd(fix2,tx);
1158 fiy2 = _mm256_add_pd(fiy2,ty);
1159 fiz2 = _mm256_add_pd(fiz2,tz);
1161 fjx3 = _mm256_add_pd(fjx3,tx);
1162 fjy3 = _mm256_add_pd(fjy3,ty);
1163 fjz3 = _mm256_add_pd(fjz3,tz);
1165 /**************************
1166 * CALCULATE INTERACTIONS *
1167 **************************/
1169 r31 = _mm256_mul_pd(rsq31,rinv31);
1170 r31 = _mm256_andnot_pd(dummy_mask,r31);
1172 /* EWALD ELECTROSTATICS */
1174 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1175 ewrt = _mm256_mul_pd(r31,ewtabscale);
1176 ewitab = _mm256_cvttpd_epi32(ewrt);
1177 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1178 ewitab = _mm_slli_epi32(ewitab,2);
1179 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1180 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1181 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1182 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1183 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1184 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1185 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1186 velec = _mm256_mul_pd(qq31,_mm256_sub_pd(rinv31,velec));
1187 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
1189 /* Update potential sum for this i atom from the interaction with this j atom. */
1190 velec = _mm256_andnot_pd(dummy_mask,velec);
1191 velecsum = _mm256_add_pd(velecsum,velec);
1193 fscal = felec;
1195 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1197 /* Calculate temporary vectorial force */
1198 tx = _mm256_mul_pd(fscal,dx31);
1199 ty = _mm256_mul_pd(fscal,dy31);
1200 tz = _mm256_mul_pd(fscal,dz31);
1202 /* Update vectorial force */
1203 fix3 = _mm256_add_pd(fix3,tx);
1204 fiy3 = _mm256_add_pd(fiy3,ty);
1205 fiz3 = _mm256_add_pd(fiz3,tz);
1207 fjx1 = _mm256_add_pd(fjx1,tx);
1208 fjy1 = _mm256_add_pd(fjy1,ty);
1209 fjz1 = _mm256_add_pd(fjz1,tz);
1211 /**************************
1212 * CALCULATE INTERACTIONS *
1213 **************************/
1215 r32 = _mm256_mul_pd(rsq32,rinv32);
1216 r32 = _mm256_andnot_pd(dummy_mask,r32);
1218 /* EWALD ELECTROSTATICS */
1220 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1221 ewrt = _mm256_mul_pd(r32,ewtabscale);
1222 ewitab = _mm256_cvttpd_epi32(ewrt);
1223 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1224 ewitab = _mm_slli_epi32(ewitab,2);
1225 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1226 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1227 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1228 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1229 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1230 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1231 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1232 velec = _mm256_mul_pd(qq32,_mm256_sub_pd(rinv32,velec));
1233 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
1235 /* Update potential sum for this i atom from the interaction with this j atom. */
1236 velec = _mm256_andnot_pd(dummy_mask,velec);
1237 velecsum = _mm256_add_pd(velecsum,velec);
1239 fscal = felec;
1241 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1243 /* Calculate temporary vectorial force */
1244 tx = _mm256_mul_pd(fscal,dx32);
1245 ty = _mm256_mul_pd(fscal,dy32);
1246 tz = _mm256_mul_pd(fscal,dz32);
1248 /* Update vectorial force */
1249 fix3 = _mm256_add_pd(fix3,tx);
1250 fiy3 = _mm256_add_pd(fiy3,ty);
1251 fiz3 = _mm256_add_pd(fiz3,tz);
1253 fjx2 = _mm256_add_pd(fjx2,tx);
1254 fjy2 = _mm256_add_pd(fjy2,ty);
1255 fjz2 = _mm256_add_pd(fjz2,tz);
1257 /**************************
1258 * CALCULATE INTERACTIONS *
1259 **************************/
1261 r33 = _mm256_mul_pd(rsq33,rinv33);
1262 r33 = _mm256_andnot_pd(dummy_mask,r33);
1264 /* EWALD ELECTROSTATICS */
1266 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1267 ewrt = _mm256_mul_pd(r33,ewtabscale);
1268 ewitab = _mm256_cvttpd_epi32(ewrt);
1269 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1270 ewitab = _mm_slli_epi32(ewitab,2);
1271 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1272 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1273 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1274 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1275 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1276 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1277 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1278 velec = _mm256_mul_pd(qq33,_mm256_sub_pd(rinv33,velec));
1279 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
1281 /* Update potential sum for this i atom from the interaction with this j atom. */
1282 velec = _mm256_andnot_pd(dummy_mask,velec);
1283 velecsum = _mm256_add_pd(velecsum,velec);
1285 fscal = felec;
1287 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1289 /* Calculate temporary vectorial force */
1290 tx = _mm256_mul_pd(fscal,dx33);
1291 ty = _mm256_mul_pd(fscal,dy33);
1292 tz = _mm256_mul_pd(fscal,dz33);
1294 /* Update vectorial force */
1295 fix3 = _mm256_add_pd(fix3,tx);
1296 fiy3 = _mm256_add_pd(fiy3,ty);
1297 fiz3 = _mm256_add_pd(fiz3,tz);
1299 fjx3 = _mm256_add_pd(fjx3,tx);
1300 fjy3 = _mm256_add_pd(fjy3,ty);
1301 fjz3 = _mm256_add_pd(fjz3,tz);
1303 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1304 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1305 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1306 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1308 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1309 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1310 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1312 /* Inner loop uses 413 flops */
1315 /* End of innermost loop */
1317 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1318 f+i_coord_offset,fshift+i_shift_offset);
1320 ggid = gid[iidx];
1321 /* Update potential energies */
1322 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1323 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1325 /* Increment number of inner iterations */
1326 inneriter += j_index_end - j_index_start;
1328 /* Outer loop uses 26 flops */
1331 /* Increment number of outer iterations */
1332 outeriter += nri;
1334 /* Update outer/inner flops */
1336 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*413);
1339 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_avx_256_double
1340 * Electrostatics interaction: Ewald
1341 * VdW interaction: LennardJones
1342 * Geometry: Water4-Water4
1343 * Calculate force/pot: Force
1345 void
1346 nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_avx_256_double
1347 (t_nblist * gmx_restrict nlist,
1348 rvec * gmx_restrict xx,
1349 rvec * gmx_restrict ff,
1350 struct t_forcerec * gmx_restrict fr,
1351 t_mdatoms * gmx_restrict mdatoms,
1352 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1353 t_nrnb * gmx_restrict nrnb)
1355 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1356 * just 0 for non-waters.
1357 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1358 * jnr indices corresponding to data put in the four positions in the SIMD register.
1360 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1361 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1362 int jnrA,jnrB,jnrC,jnrD;
1363 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1364 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1365 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1366 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1367 real rcutoff_scalar;
1368 real *shiftvec,*fshift,*x,*f;
1369 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1370 real scratch[4*DIM];
1371 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1372 real * vdwioffsetptr0;
1373 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1374 real * vdwioffsetptr1;
1375 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1376 real * vdwioffsetptr2;
1377 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1378 real * vdwioffsetptr3;
1379 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1380 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1381 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1382 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1383 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1384 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1385 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1386 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1387 __m256d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1388 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1389 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1390 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1391 __m256d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1392 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1393 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1394 __m256d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1395 __m256d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1396 __m256d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1397 __m256d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1398 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1399 real *charge;
1400 int nvdwtype;
1401 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1402 int *vdwtype;
1403 real *vdwparam;
1404 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
1405 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
1406 __m128i ewitab;
1407 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1408 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1409 real *ewtab;
1410 __m256d dummy_mask,cutoff_mask;
1411 __m128 tmpmask0,tmpmask1;
1412 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1413 __m256d one = _mm256_set1_pd(1.0);
1414 __m256d two = _mm256_set1_pd(2.0);
1415 x = xx[0];
1416 f = ff[0];
1418 nri = nlist->nri;
1419 iinr = nlist->iinr;
1420 jindex = nlist->jindex;
1421 jjnr = nlist->jjnr;
1422 shiftidx = nlist->shift;
1423 gid = nlist->gid;
1424 shiftvec = fr->shift_vec[0];
1425 fshift = fr->fshift[0];
1426 facel = _mm256_set1_pd(fr->ic->epsfac);
1427 charge = mdatoms->chargeA;
1428 nvdwtype = fr->ntype;
1429 vdwparam = fr->nbfp;
1430 vdwtype = mdatoms->typeA;
1432 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
1433 beta = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
1434 beta2 = _mm256_mul_pd(beta,beta);
1435 beta3 = _mm256_mul_pd(beta,beta2);
1437 ewtab = fr->ic->tabq_coul_F;
1438 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
1439 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
1441 /* Setup water-specific parameters */
1442 inr = nlist->iinr[0];
1443 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1444 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1445 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
1446 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1448 jq1 = _mm256_set1_pd(charge[inr+1]);
1449 jq2 = _mm256_set1_pd(charge[inr+2]);
1450 jq3 = _mm256_set1_pd(charge[inr+3]);
1451 vdwjidx0A = 2*vdwtype[inr+0];
1452 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1453 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1454 qq11 = _mm256_mul_pd(iq1,jq1);
1455 qq12 = _mm256_mul_pd(iq1,jq2);
1456 qq13 = _mm256_mul_pd(iq1,jq3);
1457 qq21 = _mm256_mul_pd(iq2,jq1);
1458 qq22 = _mm256_mul_pd(iq2,jq2);
1459 qq23 = _mm256_mul_pd(iq2,jq3);
1460 qq31 = _mm256_mul_pd(iq3,jq1);
1461 qq32 = _mm256_mul_pd(iq3,jq2);
1462 qq33 = _mm256_mul_pd(iq3,jq3);
1464 /* Avoid stupid compiler warnings */
1465 jnrA = jnrB = jnrC = jnrD = 0;
1466 j_coord_offsetA = 0;
1467 j_coord_offsetB = 0;
1468 j_coord_offsetC = 0;
1469 j_coord_offsetD = 0;
1471 outeriter = 0;
1472 inneriter = 0;
1474 for(iidx=0;iidx<4*DIM;iidx++)
1476 scratch[iidx] = 0.0;
1479 /* Start outer loop over neighborlists */
1480 for(iidx=0; iidx<nri; iidx++)
1482 /* Load shift vector for this list */
1483 i_shift_offset = DIM*shiftidx[iidx];
1485 /* Load limits for loop over neighbors */
1486 j_index_start = jindex[iidx];
1487 j_index_end = jindex[iidx+1];
1489 /* Get outer coordinate index */
1490 inr = iinr[iidx];
1491 i_coord_offset = DIM*inr;
1493 /* Load i particle coords and add shift vector */
1494 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1495 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1497 fix0 = _mm256_setzero_pd();
1498 fiy0 = _mm256_setzero_pd();
1499 fiz0 = _mm256_setzero_pd();
1500 fix1 = _mm256_setzero_pd();
1501 fiy1 = _mm256_setzero_pd();
1502 fiz1 = _mm256_setzero_pd();
1503 fix2 = _mm256_setzero_pd();
1504 fiy2 = _mm256_setzero_pd();
1505 fiz2 = _mm256_setzero_pd();
1506 fix3 = _mm256_setzero_pd();
1507 fiy3 = _mm256_setzero_pd();
1508 fiz3 = _mm256_setzero_pd();
1510 /* Start inner kernel loop */
1511 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1514 /* Get j neighbor index, and coordinate index */
1515 jnrA = jjnr[jidx];
1516 jnrB = jjnr[jidx+1];
1517 jnrC = jjnr[jidx+2];
1518 jnrD = jjnr[jidx+3];
1519 j_coord_offsetA = DIM*jnrA;
1520 j_coord_offsetB = DIM*jnrB;
1521 j_coord_offsetC = DIM*jnrC;
1522 j_coord_offsetD = DIM*jnrD;
1524 /* load j atom coordinates */
1525 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1526 x+j_coord_offsetC,x+j_coord_offsetD,
1527 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1528 &jy2,&jz2,&jx3,&jy3,&jz3);
1530 /* Calculate displacement vector */
1531 dx00 = _mm256_sub_pd(ix0,jx0);
1532 dy00 = _mm256_sub_pd(iy0,jy0);
1533 dz00 = _mm256_sub_pd(iz0,jz0);
1534 dx11 = _mm256_sub_pd(ix1,jx1);
1535 dy11 = _mm256_sub_pd(iy1,jy1);
1536 dz11 = _mm256_sub_pd(iz1,jz1);
1537 dx12 = _mm256_sub_pd(ix1,jx2);
1538 dy12 = _mm256_sub_pd(iy1,jy2);
1539 dz12 = _mm256_sub_pd(iz1,jz2);
1540 dx13 = _mm256_sub_pd(ix1,jx3);
1541 dy13 = _mm256_sub_pd(iy1,jy3);
1542 dz13 = _mm256_sub_pd(iz1,jz3);
1543 dx21 = _mm256_sub_pd(ix2,jx1);
1544 dy21 = _mm256_sub_pd(iy2,jy1);
1545 dz21 = _mm256_sub_pd(iz2,jz1);
1546 dx22 = _mm256_sub_pd(ix2,jx2);
1547 dy22 = _mm256_sub_pd(iy2,jy2);
1548 dz22 = _mm256_sub_pd(iz2,jz2);
1549 dx23 = _mm256_sub_pd(ix2,jx3);
1550 dy23 = _mm256_sub_pd(iy2,jy3);
1551 dz23 = _mm256_sub_pd(iz2,jz3);
1552 dx31 = _mm256_sub_pd(ix3,jx1);
1553 dy31 = _mm256_sub_pd(iy3,jy1);
1554 dz31 = _mm256_sub_pd(iz3,jz1);
1555 dx32 = _mm256_sub_pd(ix3,jx2);
1556 dy32 = _mm256_sub_pd(iy3,jy2);
1557 dz32 = _mm256_sub_pd(iz3,jz2);
1558 dx33 = _mm256_sub_pd(ix3,jx3);
1559 dy33 = _mm256_sub_pd(iy3,jy3);
1560 dz33 = _mm256_sub_pd(iz3,jz3);
1562 /* Calculate squared distance and things based on it */
1563 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1564 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1565 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1566 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1567 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1568 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1569 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1570 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1571 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1572 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
1574 rinv11 = avx256_invsqrt_d(rsq11);
1575 rinv12 = avx256_invsqrt_d(rsq12);
1576 rinv13 = avx256_invsqrt_d(rsq13);
1577 rinv21 = avx256_invsqrt_d(rsq21);
1578 rinv22 = avx256_invsqrt_d(rsq22);
1579 rinv23 = avx256_invsqrt_d(rsq23);
1580 rinv31 = avx256_invsqrt_d(rsq31);
1581 rinv32 = avx256_invsqrt_d(rsq32);
1582 rinv33 = avx256_invsqrt_d(rsq33);
1584 rinvsq00 = avx256_inv_d(rsq00);
1585 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1586 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1587 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
1588 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1589 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1590 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
1591 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
1592 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
1593 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
1595 fjx0 = _mm256_setzero_pd();
1596 fjy0 = _mm256_setzero_pd();
1597 fjz0 = _mm256_setzero_pd();
1598 fjx1 = _mm256_setzero_pd();
1599 fjy1 = _mm256_setzero_pd();
1600 fjz1 = _mm256_setzero_pd();
1601 fjx2 = _mm256_setzero_pd();
1602 fjy2 = _mm256_setzero_pd();
1603 fjz2 = _mm256_setzero_pd();
1604 fjx3 = _mm256_setzero_pd();
1605 fjy3 = _mm256_setzero_pd();
1606 fjz3 = _mm256_setzero_pd();
1608 /**************************
1609 * CALCULATE INTERACTIONS *
1610 **************************/
1612 /* LENNARD-JONES DISPERSION/REPULSION */
1614 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1615 fvdw = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
1617 fscal = fvdw;
1619 /* Calculate temporary vectorial force */
1620 tx = _mm256_mul_pd(fscal,dx00);
1621 ty = _mm256_mul_pd(fscal,dy00);
1622 tz = _mm256_mul_pd(fscal,dz00);
1624 /* Update vectorial force */
1625 fix0 = _mm256_add_pd(fix0,tx);
1626 fiy0 = _mm256_add_pd(fiy0,ty);
1627 fiz0 = _mm256_add_pd(fiz0,tz);
1629 fjx0 = _mm256_add_pd(fjx0,tx);
1630 fjy0 = _mm256_add_pd(fjy0,ty);
1631 fjz0 = _mm256_add_pd(fjz0,tz);
1633 /**************************
1634 * CALCULATE INTERACTIONS *
1635 **************************/
1637 r11 = _mm256_mul_pd(rsq11,rinv11);
1639 /* EWALD ELECTROSTATICS */
1641 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1642 ewrt = _mm256_mul_pd(r11,ewtabscale);
1643 ewitab = _mm256_cvttpd_epi32(ewrt);
1644 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1645 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1646 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1647 &ewtabF,&ewtabFn);
1648 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1649 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1651 fscal = felec;
1653 /* Calculate temporary vectorial force */
1654 tx = _mm256_mul_pd(fscal,dx11);
1655 ty = _mm256_mul_pd(fscal,dy11);
1656 tz = _mm256_mul_pd(fscal,dz11);
1658 /* Update vectorial force */
1659 fix1 = _mm256_add_pd(fix1,tx);
1660 fiy1 = _mm256_add_pd(fiy1,ty);
1661 fiz1 = _mm256_add_pd(fiz1,tz);
1663 fjx1 = _mm256_add_pd(fjx1,tx);
1664 fjy1 = _mm256_add_pd(fjy1,ty);
1665 fjz1 = _mm256_add_pd(fjz1,tz);
1667 /**************************
1668 * CALCULATE INTERACTIONS *
1669 **************************/
1671 r12 = _mm256_mul_pd(rsq12,rinv12);
1673 /* EWALD ELECTROSTATICS */
1675 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1676 ewrt = _mm256_mul_pd(r12,ewtabscale);
1677 ewitab = _mm256_cvttpd_epi32(ewrt);
1678 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1679 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1680 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1681 &ewtabF,&ewtabFn);
1682 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1683 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1685 fscal = felec;
1687 /* Calculate temporary vectorial force */
1688 tx = _mm256_mul_pd(fscal,dx12);
1689 ty = _mm256_mul_pd(fscal,dy12);
1690 tz = _mm256_mul_pd(fscal,dz12);
1692 /* Update vectorial force */
1693 fix1 = _mm256_add_pd(fix1,tx);
1694 fiy1 = _mm256_add_pd(fiy1,ty);
1695 fiz1 = _mm256_add_pd(fiz1,tz);
1697 fjx2 = _mm256_add_pd(fjx2,tx);
1698 fjy2 = _mm256_add_pd(fjy2,ty);
1699 fjz2 = _mm256_add_pd(fjz2,tz);
1701 /**************************
1702 * CALCULATE INTERACTIONS *
1703 **************************/
1705 r13 = _mm256_mul_pd(rsq13,rinv13);
1707 /* EWALD ELECTROSTATICS */
1709 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1710 ewrt = _mm256_mul_pd(r13,ewtabscale);
1711 ewitab = _mm256_cvttpd_epi32(ewrt);
1712 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1713 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1714 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1715 &ewtabF,&ewtabFn);
1716 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1717 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
1719 fscal = felec;
1721 /* Calculate temporary vectorial force */
1722 tx = _mm256_mul_pd(fscal,dx13);
1723 ty = _mm256_mul_pd(fscal,dy13);
1724 tz = _mm256_mul_pd(fscal,dz13);
1726 /* Update vectorial force */
1727 fix1 = _mm256_add_pd(fix1,tx);
1728 fiy1 = _mm256_add_pd(fiy1,ty);
1729 fiz1 = _mm256_add_pd(fiz1,tz);
1731 fjx3 = _mm256_add_pd(fjx3,tx);
1732 fjy3 = _mm256_add_pd(fjy3,ty);
1733 fjz3 = _mm256_add_pd(fjz3,tz);
1735 /**************************
1736 * CALCULATE INTERACTIONS *
1737 **************************/
1739 r21 = _mm256_mul_pd(rsq21,rinv21);
1741 /* EWALD ELECTROSTATICS */
1743 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1744 ewrt = _mm256_mul_pd(r21,ewtabscale);
1745 ewitab = _mm256_cvttpd_epi32(ewrt);
1746 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1747 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1748 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1749 &ewtabF,&ewtabFn);
1750 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1751 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1753 fscal = felec;
1755 /* Calculate temporary vectorial force */
1756 tx = _mm256_mul_pd(fscal,dx21);
1757 ty = _mm256_mul_pd(fscal,dy21);
1758 tz = _mm256_mul_pd(fscal,dz21);
1760 /* Update vectorial force */
1761 fix2 = _mm256_add_pd(fix2,tx);
1762 fiy2 = _mm256_add_pd(fiy2,ty);
1763 fiz2 = _mm256_add_pd(fiz2,tz);
1765 fjx1 = _mm256_add_pd(fjx1,tx);
1766 fjy1 = _mm256_add_pd(fjy1,ty);
1767 fjz1 = _mm256_add_pd(fjz1,tz);
1769 /**************************
1770 * CALCULATE INTERACTIONS *
1771 **************************/
1773 r22 = _mm256_mul_pd(rsq22,rinv22);
1775 /* EWALD ELECTROSTATICS */
1777 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1778 ewrt = _mm256_mul_pd(r22,ewtabscale);
1779 ewitab = _mm256_cvttpd_epi32(ewrt);
1780 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1781 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1782 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1783 &ewtabF,&ewtabFn);
1784 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1785 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1787 fscal = felec;
1789 /* Calculate temporary vectorial force */
1790 tx = _mm256_mul_pd(fscal,dx22);
1791 ty = _mm256_mul_pd(fscal,dy22);
1792 tz = _mm256_mul_pd(fscal,dz22);
1794 /* Update vectorial force */
1795 fix2 = _mm256_add_pd(fix2,tx);
1796 fiy2 = _mm256_add_pd(fiy2,ty);
1797 fiz2 = _mm256_add_pd(fiz2,tz);
1799 fjx2 = _mm256_add_pd(fjx2,tx);
1800 fjy2 = _mm256_add_pd(fjy2,ty);
1801 fjz2 = _mm256_add_pd(fjz2,tz);
1803 /**************************
1804 * CALCULATE INTERACTIONS *
1805 **************************/
1807 r23 = _mm256_mul_pd(rsq23,rinv23);
1809 /* EWALD ELECTROSTATICS */
1811 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1812 ewrt = _mm256_mul_pd(r23,ewtabscale);
1813 ewitab = _mm256_cvttpd_epi32(ewrt);
1814 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1815 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1816 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1817 &ewtabF,&ewtabFn);
1818 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1819 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
1821 fscal = felec;
1823 /* Calculate temporary vectorial force */
1824 tx = _mm256_mul_pd(fscal,dx23);
1825 ty = _mm256_mul_pd(fscal,dy23);
1826 tz = _mm256_mul_pd(fscal,dz23);
1828 /* Update vectorial force */
1829 fix2 = _mm256_add_pd(fix2,tx);
1830 fiy2 = _mm256_add_pd(fiy2,ty);
1831 fiz2 = _mm256_add_pd(fiz2,tz);
1833 fjx3 = _mm256_add_pd(fjx3,tx);
1834 fjy3 = _mm256_add_pd(fjy3,ty);
1835 fjz3 = _mm256_add_pd(fjz3,tz);
1837 /**************************
1838 * CALCULATE INTERACTIONS *
1839 **************************/
1841 r31 = _mm256_mul_pd(rsq31,rinv31);
1843 /* EWALD ELECTROSTATICS */
1845 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1846 ewrt = _mm256_mul_pd(r31,ewtabscale);
1847 ewitab = _mm256_cvttpd_epi32(ewrt);
1848 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1849 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1850 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1851 &ewtabF,&ewtabFn);
1852 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1853 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
1855 fscal = felec;
1857 /* Calculate temporary vectorial force */
1858 tx = _mm256_mul_pd(fscal,dx31);
1859 ty = _mm256_mul_pd(fscal,dy31);
1860 tz = _mm256_mul_pd(fscal,dz31);
1862 /* Update vectorial force */
1863 fix3 = _mm256_add_pd(fix3,tx);
1864 fiy3 = _mm256_add_pd(fiy3,ty);
1865 fiz3 = _mm256_add_pd(fiz3,tz);
1867 fjx1 = _mm256_add_pd(fjx1,tx);
1868 fjy1 = _mm256_add_pd(fjy1,ty);
1869 fjz1 = _mm256_add_pd(fjz1,tz);
1871 /**************************
1872 * CALCULATE INTERACTIONS *
1873 **************************/
1875 r32 = _mm256_mul_pd(rsq32,rinv32);
1877 /* EWALD ELECTROSTATICS */
1879 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1880 ewrt = _mm256_mul_pd(r32,ewtabscale);
1881 ewitab = _mm256_cvttpd_epi32(ewrt);
1882 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1883 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1884 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1885 &ewtabF,&ewtabFn);
1886 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1887 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
1889 fscal = felec;
1891 /* Calculate temporary vectorial force */
1892 tx = _mm256_mul_pd(fscal,dx32);
1893 ty = _mm256_mul_pd(fscal,dy32);
1894 tz = _mm256_mul_pd(fscal,dz32);
1896 /* Update vectorial force */
1897 fix3 = _mm256_add_pd(fix3,tx);
1898 fiy3 = _mm256_add_pd(fiy3,ty);
1899 fiz3 = _mm256_add_pd(fiz3,tz);
1901 fjx2 = _mm256_add_pd(fjx2,tx);
1902 fjy2 = _mm256_add_pd(fjy2,ty);
1903 fjz2 = _mm256_add_pd(fjz2,tz);
1905 /**************************
1906 * CALCULATE INTERACTIONS *
1907 **************************/
1909 r33 = _mm256_mul_pd(rsq33,rinv33);
1911 /* EWALD ELECTROSTATICS */
1913 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1914 ewrt = _mm256_mul_pd(r33,ewtabscale);
1915 ewitab = _mm256_cvttpd_epi32(ewrt);
1916 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1917 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1918 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1919 &ewtabF,&ewtabFn);
1920 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1921 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
1923 fscal = felec;
1925 /* Calculate temporary vectorial force */
1926 tx = _mm256_mul_pd(fscal,dx33);
1927 ty = _mm256_mul_pd(fscal,dy33);
1928 tz = _mm256_mul_pd(fscal,dz33);
1930 /* Update vectorial force */
1931 fix3 = _mm256_add_pd(fix3,tx);
1932 fiy3 = _mm256_add_pd(fiy3,ty);
1933 fiz3 = _mm256_add_pd(fiz3,tz);
1935 fjx3 = _mm256_add_pd(fjx3,tx);
1936 fjy3 = _mm256_add_pd(fjy3,ty);
1937 fjz3 = _mm256_add_pd(fjz3,tz);
1939 fjptrA = f+j_coord_offsetA;
1940 fjptrB = f+j_coord_offsetB;
1941 fjptrC = f+j_coord_offsetC;
1942 fjptrD = f+j_coord_offsetD;
1944 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1945 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1946 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1948 /* Inner loop uses 354 flops */
1951 if(jidx<j_index_end)
1954 /* Get j neighbor index, and coordinate index */
1955 jnrlistA = jjnr[jidx];
1956 jnrlistB = jjnr[jidx+1];
1957 jnrlistC = jjnr[jidx+2];
1958 jnrlistD = jjnr[jidx+3];
1959 /* Sign of each element will be negative for non-real atoms.
1960 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1961 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
1963 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1965 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1966 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1967 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1969 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1970 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1971 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1972 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1973 j_coord_offsetA = DIM*jnrA;
1974 j_coord_offsetB = DIM*jnrB;
1975 j_coord_offsetC = DIM*jnrC;
1976 j_coord_offsetD = DIM*jnrD;
1978 /* load j atom coordinates */
1979 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1980 x+j_coord_offsetC,x+j_coord_offsetD,
1981 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1982 &jy2,&jz2,&jx3,&jy3,&jz3);
1984 /* Calculate displacement vector */
1985 dx00 = _mm256_sub_pd(ix0,jx0);
1986 dy00 = _mm256_sub_pd(iy0,jy0);
1987 dz00 = _mm256_sub_pd(iz0,jz0);
1988 dx11 = _mm256_sub_pd(ix1,jx1);
1989 dy11 = _mm256_sub_pd(iy1,jy1);
1990 dz11 = _mm256_sub_pd(iz1,jz1);
1991 dx12 = _mm256_sub_pd(ix1,jx2);
1992 dy12 = _mm256_sub_pd(iy1,jy2);
1993 dz12 = _mm256_sub_pd(iz1,jz2);
1994 dx13 = _mm256_sub_pd(ix1,jx3);
1995 dy13 = _mm256_sub_pd(iy1,jy3);
1996 dz13 = _mm256_sub_pd(iz1,jz3);
1997 dx21 = _mm256_sub_pd(ix2,jx1);
1998 dy21 = _mm256_sub_pd(iy2,jy1);
1999 dz21 = _mm256_sub_pd(iz2,jz1);
2000 dx22 = _mm256_sub_pd(ix2,jx2);
2001 dy22 = _mm256_sub_pd(iy2,jy2);
2002 dz22 = _mm256_sub_pd(iz2,jz2);
2003 dx23 = _mm256_sub_pd(ix2,jx3);
2004 dy23 = _mm256_sub_pd(iy2,jy3);
2005 dz23 = _mm256_sub_pd(iz2,jz3);
2006 dx31 = _mm256_sub_pd(ix3,jx1);
2007 dy31 = _mm256_sub_pd(iy3,jy1);
2008 dz31 = _mm256_sub_pd(iz3,jz1);
2009 dx32 = _mm256_sub_pd(ix3,jx2);
2010 dy32 = _mm256_sub_pd(iy3,jy2);
2011 dz32 = _mm256_sub_pd(iz3,jz2);
2012 dx33 = _mm256_sub_pd(ix3,jx3);
2013 dy33 = _mm256_sub_pd(iy3,jy3);
2014 dz33 = _mm256_sub_pd(iz3,jz3);
2016 /* Calculate squared distance and things based on it */
2017 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
2018 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
2019 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
2020 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
2021 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
2022 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
2023 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
2024 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
2025 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
2026 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
2028 rinv11 = avx256_invsqrt_d(rsq11);
2029 rinv12 = avx256_invsqrt_d(rsq12);
2030 rinv13 = avx256_invsqrt_d(rsq13);
2031 rinv21 = avx256_invsqrt_d(rsq21);
2032 rinv22 = avx256_invsqrt_d(rsq22);
2033 rinv23 = avx256_invsqrt_d(rsq23);
2034 rinv31 = avx256_invsqrt_d(rsq31);
2035 rinv32 = avx256_invsqrt_d(rsq32);
2036 rinv33 = avx256_invsqrt_d(rsq33);
2038 rinvsq00 = avx256_inv_d(rsq00);
2039 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
2040 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
2041 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
2042 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
2043 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
2044 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
2045 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
2046 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
2047 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
2049 fjx0 = _mm256_setzero_pd();
2050 fjy0 = _mm256_setzero_pd();
2051 fjz0 = _mm256_setzero_pd();
2052 fjx1 = _mm256_setzero_pd();
2053 fjy1 = _mm256_setzero_pd();
2054 fjz1 = _mm256_setzero_pd();
2055 fjx2 = _mm256_setzero_pd();
2056 fjy2 = _mm256_setzero_pd();
2057 fjz2 = _mm256_setzero_pd();
2058 fjx3 = _mm256_setzero_pd();
2059 fjy3 = _mm256_setzero_pd();
2060 fjz3 = _mm256_setzero_pd();
2062 /**************************
2063 * CALCULATE INTERACTIONS *
2064 **************************/
2066 /* LENNARD-JONES DISPERSION/REPULSION */
2068 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2069 fvdw = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
2071 fscal = fvdw;
2073 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2075 /* Calculate temporary vectorial force */
2076 tx = _mm256_mul_pd(fscal,dx00);
2077 ty = _mm256_mul_pd(fscal,dy00);
2078 tz = _mm256_mul_pd(fscal,dz00);
2080 /* Update vectorial force */
2081 fix0 = _mm256_add_pd(fix0,tx);
2082 fiy0 = _mm256_add_pd(fiy0,ty);
2083 fiz0 = _mm256_add_pd(fiz0,tz);
2085 fjx0 = _mm256_add_pd(fjx0,tx);
2086 fjy0 = _mm256_add_pd(fjy0,ty);
2087 fjz0 = _mm256_add_pd(fjz0,tz);
2089 /**************************
2090 * CALCULATE INTERACTIONS *
2091 **************************/
2093 r11 = _mm256_mul_pd(rsq11,rinv11);
2094 r11 = _mm256_andnot_pd(dummy_mask,r11);
2096 /* EWALD ELECTROSTATICS */
2098 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2099 ewrt = _mm256_mul_pd(r11,ewtabscale);
2100 ewitab = _mm256_cvttpd_epi32(ewrt);
2101 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2102 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2103 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2104 &ewtabF,&ewtabFn);
2105 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2106 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
2108 fscal = felec;
2110 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2112 /* Calculate temporary vectorial force */
2113 tx = _mm256_mul_pd(fscal,dx11);
2114 ty = _mm256_mul_pd(fscal,dy11);
2115 tz = _mm256_mul_pd(fscal,dz11);
2117 /* Update vectorial force */
2118 fix1 = _mm256_add_pd(fix1,tx);
2119 fiy1 = _mm256_add_pd(fiy1,ty);
2120 fiz1 = _mm256_add_pd(fiz1,tz);
2122 fjx1 = _mm256_add_pd(fjx1,tx);
2123 fjy1 = _mm256_add_pd(fjy1,ty);
2124 fjz1 = _mm256_add_pd(fjz1,tz);
2126 /**************************
2127 * CALCULATE INTERACTIONS *
2128 **************************/
2130 r12 = _mm256_mul_pd(rsq12,rinv12);
2131 r12 = _mm256_andnot_pd(dummy_mask,r12);
2133 /* EWALD ELECTROSTATICS */
2135 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2136 ewrt = _mm256_mul_pd(r12,ewtabscale);
2137 ewitab = _mm256_cvttpd_epi32(ewrt);
2138 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2139 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2140 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2141 &ewtabF,&ewtabFn);
2142 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2143 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2145 fscal = felec;
2147 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2149 /* Calculate temporary vectorial force */
2150 tx = _mm256_mul_pd(fscal,dx12);
2151 ty = _mm256_mul_pd(fscal,dy12);
2152 tz = _mm256_mul_pd(fscal,dz12);
2154 /* Update vectorial force */
2155 fix1 = _mm256_add_pd(fix1,tx);
2156 fiy1 = _mm256_add_pd(fiy1,ty);
2157 fiz1 = _mm256_add_pd(fiz1,tz);
2159 fjx2 = _mm256_add_pd(fjx2,tx);
2160 fjy2 = _mm256_add_pd(fjy2,ty);
2161 fjz2 = _mm256_add_pd(fjz2,tz);
2163 /**************************
2164 * CALCULATE INTERACTIONS *
2165 **************************/
2167 r13 = _mm256_mul_pd(rsq13,rinv13);
2168 r13 = _mm256_andnot_pd(dummy_mask,r13);
2170 /* EWALD ELECTROSTATICS */
2172 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2173 ewrt = _mm256_mul_pd(r13,ewtabscale);
2174 ewitab = _mm256_cvttpd_epi32(ewrt);
2175 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2176 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2177 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2178 &ewtabF,&ewtabFn);
2179 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2180 felec = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
2182 fscal = felec;
2184 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2186 /* Calculate temporary vectorial force */
2187 tx = _mm256_mul_pd(fscal,dx13);
2188 ty = _mm256_mul_pd(fscal,dy13);
2189 tz = _mm256_mul_pd(fscal,dz13);
2191 /* Update vectorial force */
2192 fix1 = _mm256_add_pd(fix1,tx);
2193 fiy1 = _mm256_add_pd(fiy1,ty);
2194 fiz1 = _mm256_add_pd(fiz1,tz);
2196 fjx3 = _mm256_add_pd(fjx3,tx);
2197 fjy3 = _mm256_add_pd(fjy3,ty);
2198 fjz3 = _mm256_add_pd(fjz3,tz);
2200 /**************************
2201 * CALCULATE INTERACTIONS *
2202 **************************/
2204 r21 = _mm256_mul_pd(rsq21,rinv21);
2205 r21 = _mm256_andnot_pd(dummy_mask,r21);
2207 /* EWALD ELECTROSTATICS */
2209 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2210 ewrt = _mm256_mul_pd(r21,ewtabscale);
2211 ewitab = _mm256_cvttpd_epi32(ewrt);
2212 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2213 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2214 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2215 &ewtabF,&ewtabFn);
2216 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2217 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2219 fscal = felec;
2221 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2223 /* Calculate temporary vectorial force */
2224 tx = _mm256_mul_pd(fscal,dx21);
2225 ty = _mm256_mul_pd(fscal,dy21);
2226 tz = _mm256_mul_pd(fscal,dz21);
2228 /* Update vectorial force */
2229 fix2 = _mm256_add_pd(fix2,tx);
2230 fiy2 = _mm256_add_pd(fiy2,ty);
2231 fiz2 = _mm256_add_pd(fiz2,tz);
2233 fjx1 = _mm256_add_pd(fjx1,tx);
2234 fjy1 = _mm256_add_pd(fjy1,ty);
2235 fjz1 = _mm256_add_pd(fjz1,tz);
2237 /**************************
2238 * CALCULATE INTERACTIONS *
2239 **************************/
2241 r22 = _mm256_mul_pd(rsq22,rinv22);
2242 r22 = _mm256_andnot_pd(dummy_mask,r22);
2244 /* EWALD ELECTROSTATICS */
2246 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2247 ewrt = _mm256_mul_pd(r22,ewtabscale);
2248 ewitab = _mm256_cvttpd_epi32(ewrt);
2249 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2250 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2251 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2252 &ewtabF,&ewtabFn);
2253 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2254 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2256 fscal = felec;
2258 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2260 /* Calculate temporary vectorial force */
2261 tx = _mm256_mul_pd(fscal,dx22);
2262 ty = _mm256_mul_pd(fscal,dy22);
2263 tz = _mm256_mul_pd(fscal,dz22);
2265 /* Update vectorial force */
2266 fix2 = _mm256_add_pd(fix2,tx);
2267 fiy2 = _mm256_add_pd(fiy2,ty);
2268 fiz2 = _mm256_add_pd(fiz2,tz);
2270 fjx2 = _mm256_add_pd(fjx2,tx);
2271 fjy2 = _mm256_add_pd(fjy2,ty);
2272 fjz2 = _mm256_add_pd(fjz2,tz);
2274 /**************************
2275 * CALCULATE INTERACTIONS *
2276 **************************/
2278 r23 = _mm256_mul_pd(rsq23,rinv23);
2279 r23 = _mm256_andnot_pd(dummy_mask,r23);
2281 /* EWALD ELECTROSTATICS */
2283 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2284 ewrt = _mm256_mul_pd(r23,ewtabscale);
2285 ewitab = _mm256_cvttpd_epi32(ewrt);
2286 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2287 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2288 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2289 &ewtabF,&ewtabFn);
2290 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2291 felec = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
2293 fscal = felec;
2295 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2297 /* Calculate temporary vectorial force */
2298 tx = _mm256_mul_pd(fscal,dx23);
2299 ty = _mm256_mul_pd(fscal,dy23);
2300 tz = _mm256_mul_pd(fscal,dz23);
2302 /* Update vectorial force */
2303 fix2 = _mm256_add_pd(fix2,tx);
2304 fiy2 = _mm256_add_pd(fiy2,ty);
2305 fiz2 = _mm256_add_pd(fiz2,tz);
2307 fjx3 = _mm256_add_pd(fjx3,tx);
2308 fjy3 = _mm256_add_pd(fjy3,ty);
2309 fjz3 = _mm256_add_pd(fjz3,tz);
2311 /**************************
2312 * CALCULATE INTERACTIONS *
2313 **************************/
2315 r31 = _mm256_mul_pd(rsq31,rinv31);
2316 r31 = _mm256_andnot_pd(dummy_mask,r31);
2318 /* EWALD ELECTROSTATICS */
2320 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2321 ewrt = _mm256_mul_pd(r31,ewtabscale);
2322 ewitab = _mm256_cvttpd_epi32(ewrt);
2323 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2324 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2325 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2326 &ewtabF,&ewtabFn);
2327 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2328 felec = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
2330 fscal = felec;
2332 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2334 /* Calculate temporary vectorial force */
2335 tx = _mm256_mul_pd(fscal,dx31);
2336 ty = _mm256_mul_pd(fscal,dy31);
2337 tz = _mm256_mul_pd(fscal,dz31);
2339 /* Update vectorial force */
2340 fix3 = _mm256_add_pd(fix3,tx);
2341 fiy3 = _mm256_add_pd(fiy3,ty);
2342 fiz3 = _mm256_add_pd(fiz3,tz);
2344 fjx1 = _mm256_add_pd(fjx1,tx);
2345 fjy1 = _mm256_add_pd(fjy1,ty);
2346 fjz1 = _mm256_add_pd(fjz1,tz);
2348 /**************************
2349 * CALCULATE INTERACTIONS *
2350 **************************/
2352 r32 = _mm256_mul_pd(rsq32,rinv32);
2353 r32 = _mm256_andnot_pd(dummy_mask,r32);
2355 /* EWALD ELECTROSTATICS */
2357 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2358 ewrt = _mm256_mul_pd(r32,ewtabscale);
2359 ewitab = _mm256_cvttpd_epi32(ewrt);
2360 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2361 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2362 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2363 &ewtabF,&ewtabFn);
2364 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2365 felec = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
2367 fscal = felec;
2369 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2371 /* Calculate temporary vectorial force */
2372 tx = _mm256_mul_pd(fscal,dx32);
2373 ty = _mm256_mul_pd(fscal,dy32);
2374 tz = _mm256_mul_pd(fscal,dz32);
2376 /* Update vectorial force */
2377 fix3 = _mm256_add_pd(fix3,tx);
2378 fiy3 = _mm256_add_pd(fiy3,ty);
2379 fiz3 = _mm256_add_pd(fiz3,tz);
2381 fjx2 = _mm256_add_pd(fjx2,tx);
2382 fjy2 = _mm256_add_pd(fjy2,ty);
2383 fjz2 = _mm256_add_pd(fjz2,tz);
2385 /**************************
2386 * CALCULATE INTERACTIONS *
2387 **************************/
2389 r33 = _mm256_mul_pd(rsq33,rinv33);
2390 r33 = _mm256_andnot_pd(dummy_mask,r33);
2392 /* EWALD ELECTROSTATICS */
2394 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2395 ewrt = _mm256_mul_pd(r33,ewtabscale);
2396 ewitab = _mm256_cvttpd_epi32(ewrt);
2397 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2398 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2399 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2400 &ewtabF,&ewtabFn);
2401 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2402 felec = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
2404 fscal = felec;
2406 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2408 /* Calculate temporary vectorial force */
2409 tx = _mm256_mul_pd(fscal,dx33);
2410 ty = _mm256_mul_pd(fscal,dy33);
2411 tz = _mm256_mul_pd(fscal,dz33);
2413 /* Update vectorial force */
2414 fix3 = _mm256_add_pd(fix3,tx);
2415 fiy3 = _mm256_add_pd(fiy3,ty);
2416 fiz3 = _mm256_add_pd(fiz3,tz);
2418 fjx3 = _mm256_add_pd(fjx3,tx);
2419 fjy3 = _mm256_add_pd(fjy3,ty);
2420 fjz3 = _mm256_add_pd(fjz3,tz);
2422 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2423 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2424 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2425 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2427 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2428 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2429 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2431 /* Inner loop uses 363 flops */
2434 /* End of innermost loop */
2436 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2437 f+i_coord_offset,fshift+i_shift_offset);
2439 /* Increment number of inner iterations */
2440 inneriter += j_index_end - j_index_start;
2442 /* Outer loop uses 24 flops */
2445 /* Increment number of outer iterations */
2446 outeriter += nri;
2448 /* Update outer/inner flops */
2450 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*363);