Remove nb-parameters from t_forcerec
[gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_double / nb_kernel_ElecEw_VdwCSTab_GeomW3W3_avx_256_double.c
blob8d8bbd32c129f2c21fe2bab451a3f1cf16f259be
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_double kernel generator.
38 #include "gmxpre.h"
40 #include "config.h"
42 #include <math.h>
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_avx_256_double.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_avx_256_double
51 * Electrostatics interaction: Ewald
52 * VdW interaction: CubicSplineTable
53 * Geometry: Water3-Water3
54 * Calculate force/pot: PotentialAndForce
56 void
57 nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_avx_256_double
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73 int jnrA,jnrB,jnrC,jnrD;
74 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
76 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
78 real rcutoff_scalar;
79 real *shiftvec,*fshift,*x,*f;
80 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81 real scratch[4*DIM];
82 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83 real * vdwioffsetptr0;
84 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85 real * vdwioffsetptr1;
86 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87 real * vdwioffsetptr2;
88 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
90 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
91 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
92 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
93 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
94 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
95 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
97 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
98 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
99 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
100 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
101 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
102 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
103 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
104 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
105 real *charge;
106 int nvdwtype;
107 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
108 int *vdwtype;
109 real *vdwparam;
110 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
111 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
112 __m128i vfitab;
113 __m128i ifour = _mm_set1_epi32(4);
114 __m256d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
115 real *vftab;
116 __m128i ewitab;
117 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
118 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
119 real *ewtab;
120 __m256d dummy_mask,cutoff_mask;
121 __m128 tmpmask0,tmpmask1;
122 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
123 __m256d one = _mm256_set1_pd(1.0);
124 __m256d two = _mm256_set1_pd(2.0);
125 x = xx[0];
126 f = ff[0];
128 nri = nlist->nri;
129 iinr = nlist->iinr;
130 jindex = nlist->jindex;
131 jjnr = nlist->jjnr;
132 shiftidx = nlist->shift;
133 gid = nlist->gid;
134 shiftvec = fr->shift_vec[0];
135 fshift = fr->fshift[0];
136 facel = _mm256_set1_pd(fr->ic->epsfac);
137 charge = mdatoms->chargeA;
138 nvdwtype = fr->ntype;
139 vdwparam = fr->nbfp;
140 vdwtype = mdatoms->typeA;
142 vftab = kernel_data->table_vdw->data;
143 vftabscale = _mm256_set1_pd(kernel_data->table_vdw->scale);
145 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
146 beta = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
147 beta2 = _mm256_mul_pd(beta,beta);
148 beta3 = _mm256_mul_pd(beta,beta2);
150 ewtab = fr->ic->tabq_coul_FDV0;
151 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
152 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
154 /* Setup water-specific parameters */
155 inr = nlist->iinr[0];
156 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
157 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
158 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
159 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
161 jq0 = _mm256_set1_pd(charge[inr+0]);
162 jq1 = _mm256_set1_pd(charge[inr+1]);
163 jq2 = _mm256_set1_pd(charge[inr+2]);
164 vdwjidx0A = 2*vdwtype[inr+0];
165 qq00 = _mm256_mul_pd(iq0,jq0);
166 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
167 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
168 qq01 = _mm256_mul_pd(iq0,jq1);
169 qq02 = _mm256_mul_pd(iq0,jq2);
170 qq10 = _mm256_mul_pd(iq1,jq0);
171 qq11 = _mm256_mul_pd(iq1,jq1);
172 qq12 = _mm256_mul_pd(iq1,jq2);
173 qq20 = _mm256_mul_pd(iq2,jq0);
174 qq21 = _mm256_mul_pd(iq2,jq1);
175 qq22 = _mm256_mul_pd(iq2,jq2);
177 /* Avoid stupid compiler warnings */
178 jnrA = jnrB = jnrC = jnrD = 0;
179 j_coord_offsetA = 0;
180 j_coord_offsetB = 0;
181 j_coord_offsetC = 0;
182 j_coord_offsetD = 0;
184 outeriter = 0;
185 inneriter = 0;
187 for(iidx=0;iidx<4*DIM;iidx++)
189 scratch[iidx] = 0.0;
192 /* Start outer loop over neighborlists */
193 for(iidx=0; iidx<nri; iidx++)
195 /* Load shift vector for this list */
196 i_shift_offset = DIM*shiftidx[iidx];
198 /* Load limits for loop over neighbors */
199 j_index_start = jindex[iidx];
200 j_index_end = jindex[iidx+1];
202 /* Get outer coordinate index */
203 inr = iinr[iidx];
204 i_coord_offset = DIM*inr;
206 /* Load i particle coords and add shift vector */
207 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
208 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
210 fix0 = _mm256_setzero_pd();
211 fiy0 = _mm256_setzero_pd();
212 fiz0 = _mm256_setzero_pd();
213 fix1 = _mm256_setzero_pd();
214 fiy1 = _mm256_setzero_pd();
215 fiz1 = _mm256_setzero_pd();
216 fix2 = _mm256_setzero_pd();
217 fiy2 = _mm256_setzero_pd();
218 fiz2 = _mm256_setzero_pd();
220 /* Reset potential sums */
221 velecsum = _mm256_setzero_pd();
222 vvdwsum = _mm256_setzero_pd();
224 /* Start inner kernel loop */
225 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
228 /* Get j neighbor index, and coordinate index */
229 jnrA = jjnr[jidx];
230 jnrB = jjnr[jidx+1];
231 jnrC = jjnr[jidx+2];
232 jnrD = jjnr[jidx+3];
233 j_coord_offsetA = DIM*jnrA;
234 j_coord_offsetB = DIM*jnrB;
235 j_coord_offsetC = DIM*jnrC;
236 j_coord_offsetD = DIM*jnrD;
238 /* load j atom coordinates */
239 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
240 x+j_coord_offsetC,x+j_coord_offsetD,
241 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
243 /* Calculate displacement vector */
244 dx00 = _mm256_sub_pd(ix0,jx0);
245 dy00 = _mm256_sub_pd(iy0,jy0);
246 dz00 = _mm256_sub_pd(iz0,jz0);
247 dx01 = _mm256_sub_pd(ix0,jx1);
248 dy01 = _mm256_sub_pd(iy0,jy1);
249 dz01 = _mm256_sub_pd(iz0,jz1);
250 dx02 = _mm256_sub_pd(ix0,jx2);
251 dy02 = _mm256_sub_pd(iy0,jy2);
252 dz02 = _mm256_sub_pd(iz0,jz2);
253 dx10 = _mm256_sub_pd(ix1,jx0);
254 dy10 = _mm256_sub_pd(iy1,jy0);
255 dz10 = _mm256_sub_pd(iz1,jz0);
256 dx11 = _mm256_sub_pd(ix1,jx1);
257 dy11 = _mm256_sub_pd(iy1,jy1);
258 dz11 = _mm256_sub_pd(iz1,jz1);
259 dx12 = _mm256_sub_pd(ix1,jx2);
260 dy12 = _mm256_sub_pd(iy1,jy2);
261 dz12 = _mm256_sub_pd(iz1,jz2);
262 dx20 = _mm256_sub_pd(ix2,jx0);
263 dy20 = _mm256_sub_pd(iy2,jy0);
264 dz20 = _mm256_sub_pd(iz2,jz0);
265 dx21 = _mm256_sub_pd(ix2,jx1);
266 dy21 = _mm256_sub_pd(iy2,jy1);
267 dz21 = _mm256_sub_pd(iz2,jz1);
268 dx22 = _mm256_sub_pd(ix2,jx2);
269 dy22 = _mm256_sub_pd(iy2,jy2);
270 dz22 = _mm256_sub_pd(iz2,jz2);
272 /* Calculate squared distance and things based on it */
273 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
274 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
275 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
276 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
277 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
278 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
279 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
280 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
281 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
283 rinv00 = avx256_invsqrt_d(rsq00);
284 rinv01 = avx256_invsqrt_d(rsq01);
285 rinv02 = avx256_invsqrt_d(rsq02);
286 rinv10 = avx256_invsqrt_d(rsq10);
287 rinv11 = avx256_invsqrt_d(rsq11);
288 rinv12 = avx256_invsqrt_d(rsq12);
289 rinv20 = avx256_invsqrt_d(rsq20);
290 rinv21 = avx256_invsqrt_d(rsq21);
291 rinv22 = avx256_invsqrt_d(rsq22);
293 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
294 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
295 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
296 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
297 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
298 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
299 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
300 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
301 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
303 fjx0 = _mm256_setzero_pd();
304 fjy0 = _mm256_setzero_pd();
305 fjz0 = _mm256_setzero_pd();
306 fjx1 = _mm256_setzero_pd();
307 fjy1 = _mm256_setzero_pd();
308 fjz1 = _mm256_setzero_pd();
309 fjx2 = _mm256_setzero_pd();
310 fjy2 = _mm256_setzero_pd();
311 fjz2 = _mm256_setzero_pd();
313 /**************************
314 * CALCULATE INTERACTIONS *
315 **************************/
317 r00 = _mm256_mul_pd(rsq00,rinv00);
319 /* Calculate table index by multiplying r with table scale and truncate to integer */
320 rt = _mm256_mul_pd(r00,vftabscale);
321 vfitab = _mm256_cvttpd_epi32(rt);
322 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
323 vfitab = _mm_slli_epi32(vfitab,3);
325 /* EWALD ELECTROSTATICS */
327 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
328 ewrt = _mm256_mul_pd(r00,ewtabscale);
329 ewitab = _mm256_cvttpd_epi32(ewrt);
330 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
331 ewitab = _mm_slli_epi32(ewitab,2);
332 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
333 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
334 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
335 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
336 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
337 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
338 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
339 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
340 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
342 /* CUBIC SPLINE TABLE DISPERSION */
343 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
344 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
345 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
346 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
347 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
348 Heps = _mm256_mul_pd(vfeps,H);
349 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
350 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
351 vvdw6 = _mm256_mul_pd(c6_00,VV);
352 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
353 fvdw6 = _mm256_mul_pd(c6_00,FF);
355 /* CUBIC SPLINE TABLE REPULSION */
356 vfitab = _mm_add_epi32(vfitab,ifour);
357 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
358 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
359 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
360 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
361 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
362 Heps = _mm256_mul_pd(vfeps,H);
363 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
364 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
365 vvdw12 = _mm256_mul_pd(c12_00,VV);
366 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
367 fvdw12 = _mm256_mul_pd(c12_00,FF);
368 vvdw = _mm256_add_pd(vvdw12,vvdw6);
369 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
371 /* Update potential sum for this i atom from the interaction with this j atom. */
372 velecsum = _mm256_add_pd(velecsum,velec);
373 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
375 fscal = _mm256_add_pd(felec,fvdw);
377 /* Calculate temporary vectorial force */
378 tx = _mm256_mul_pd(fscal,dx00);
379 ty = _mm256_mul_pd(fscal,dy00);
380 tz = _mm256_mul_pd(fscal,dz00);
382 /* Update vectorial force */
383 fix0 = _mm256_add_pd(fix0,tx);
384 fiy0 = _mm256_add_pd(fiy0,ty);
385 fiz0 = _mm256_add_pd(fiz0,tz);
387 fjx0 = _mm256_add_pd(fjx0,tx);
388 fjy0 = _mm256_add_pd(fjy0,ty);
389 fjz0 = _mm256_add_pd(fjz0,tz);
391 /**************************
392 * CALCULATE INTERACTIONS *
393 **************************/
395 r01 = _mm256_mul_pd(rsq01,rinv01);
397 /* EWALD ELECTROSTATICS */
399 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
400 ewrt = _mm256_mul_pd(r01,ewtabscale);
401 ewitab = _mm256_cvttpd_epi32(ewrt);
402 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
403 ewitab = _mm_slli_epi32(ewitab,2);
404 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
405 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
406 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
407 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
408 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
409 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
410 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
411 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(rinv01,velec));
412 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
414 /* Update potential sum for this i atom from the interaction with this j atom. */
415 velecsum = _mm256_add_pd(velecsum,velec);
417 fscal = felec;
419 /* Calculate temporary vectorial force */
420 tx = _mm256_mul_pd(fscal,dx01);
421 ty = _mm256_mul_pd(fscal,dy01);
422 tz = _mm256_mul_pd(fscal,dz01);
424 /* Update vectorial force */
425 fix0 = _mm256_add_pd(fix0,tx);
426 fiy0 = _mm256_add_pd(fiy0,ty);
427 fiz0 = _mm256_add_pd(fiz0,tz);
429 fjx1 = _mm256_add_pd(fjx1,tx);
430 fjy1 = _mm256_add_pd(fjy1,ty);
431 fjz1 = _mm256_add_pd(fjz1,tz);
433 /**************************
434 * CALCULATE INTERACTIONS *
435 **************************/
437 r02 = _mm256_mul_pd(rsq02,rinv02);
439 /* EWALD ELECTROSTATICS */
441 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
442 ewrt = _mm256_mul_pd(r02,ewtabscale);
443 ewitab = _mm256_cvttpd_epi32(ewrt);
444 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
445 ewitab = _mm_slli_epi32(ewitab,2);
446 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
447 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
448 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
449 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
450 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
451 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
452 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
453 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(rinv02,velec));
454 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
456 /* Update potential sum for this i atom from the interaction with this j atom. */
457 velecsum = _mm256_add_pd(velecsum,velec);
459 fscal = felec;
461 /* Calculate temporary vectorial force */
462 tx = _mm256_mul_pd(fscal,dx02);
463 ty = _mm256_mul_pd(fscal,dy02);
464 tz = _mm256_mul_pd(fscal,dz02);
466 /* Update vectorial force */
467 fix0 = _mm256_add_pd(fix0,tx);
468 fiy0 = _mm256_add_pd(fiy0,ty);
469 fiz0 = _mm256_add_pd(fiz0,tz);
471 fjx2 = _mm256_add_pd(fjx2,tx);
472 fjy2 = _mm256_add_pd(fjy2,ty);
473 fjz2 = _mm256_add_pd(fjz2,tz);
475 /**************************
476 * CALCULATE INTERACTIONS *
477 **************************/
479 r10 = _mm256_mul_pd(rsq10,rinv10);
481 /* EWALD ELECTROSTATICS */
483 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
484 ewrt = _mm256_mul_pd(r10,ewtabscale);
485 ewitab = _mm256_cvttpd_epi32(ewrt);
486 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
487 ewitab = _mm_slli_epi32(ewitab,2);
488 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
489 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
490 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
491 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
492 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
493 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
494 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
495 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
496 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
498 /* Update potential sum for this i atom from the interaction with this j atom. */
499 velecsum = _mm256_add_pd(velecsum,velec);
501 fscal = felec;
503 /* Calculate temporary vectorial force */
504 tx = _mm256_mul_pd(fscal,dx10);
505 ty = _mm256_mul_pd(fscal,dy10);
506 tz = _mm256_mul_pd(fscal,dz10);
508 /* Update vectorial force */
509 fix1 = _mm256_add_pd(fix1,tx);
510 fiy1 = _mm256_add_pd(fiy1,ty);
511 fiz1 = _mm256_add_pd(fiz1,tz);
513 fjx0 = _mm256_add_pd(fjx0,tx);
514 fjy0 = _mm256_add_pd(fjy0,ty);
515 fjz0 = _mm256_add_pd(fjz0,tz);
517 /**************************
518 * CALCULATE INTERACTIONS *
519 **************************/
521 r11 = _mm256_mul_pd(rsq11,rinv11);
523 /* EWALD ELECTROSTATICS */
525 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
526 ewrt = _mm256_mul_pd(r11,ewtabscale);
527 ewitab = _mm256_cvttpd_epi32(ewrt);
528 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
529 ewitab = _mm_slli_epi32(ewitab,2);
530 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
531 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
532 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
533 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
534 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
535 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
536 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
537 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
538 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
540 /* Update potential sum for this i atom from the interaction with this j atom. */
541 velecsum = _mm256_add_pd(velecsum,velec);
543 fscal = felec;
545 /* Calculate temporary vectorial force */
546 tx = _mm256_mul_pd(fscal,dx11);
547 ty = _mm256_mul_pd(fscal,dy11);
548 tz = _mm256_mul_pd(fscal,dz11);
550 /* Update vectorial force */
551 fix1 = _mm256_add_pd(fix1,tx);
552 fiy1 = _mm256_add_pd(fiy1,ty);
553 fiz1 = _mm256_add_pd(fiz1,tz);
555 fjx1 = _mm256_add_pd(fjx1,tx);
556 fjy1 = _mm256_add_pd(fjy1,ty);
557 fjz1 = _mm256_add_pd(fjz1,tz);
559 /**************************
560 * CALCULATE INTERACTIONS *
561 **************************/
563 r12 = _mm256_mul_pd(rsq12,rinv12);
565 /* EWALD ELECTROSTATICS */
567 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
568 ewrt = _mm256_mul_pd(r12,ewtabscale);
569 ewitab = _mm256_cvttpd_epi32(ewrt);
570 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
571 ewitab = _mm_slli_epi32(ewitab,2);
572 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
573 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
574 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
575 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
576 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
577 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
578 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
579 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
580 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
582 /* Update potential sum for this i atom from the interaction with this j atom. */
583 velecsum = _mm256_add_pd(velecsum,velec);
585 fscal = felec;
587 /* Calculate temporary vectorial force */
588 tx = _mm256_mul_pd(fscal,dx12);
589 ty = _mm256_mul_pd(fscal,dy12);
590 tz = _mm256_mul_pd(fscal,dz12);
592 /* Update vectorial force */
593 fix1 = _mm256_add_pd(fix1,tx);
594 fiy1 = _mm256_add_pd(fiy1,ty);
595 fiz1 = _mm256_add_pd(fiz1,tz);
597 fjx2 = _mm256_add_pd(fjx2,tx);
598 fjy2 = _mm256_add_pd(fjy2,ty);
599 fjz2 = _mm256_add_pd(fjz2,tz);
601 /**************************
602 * CALCULATE INTERACTIONS *
603 **************************/
605 r20 = _mm256_mul_pd(rsq20,rinv20);
607 /* EWALD ELECTROSTATICS */
609 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
610 ewrt = _mm256_mul_pd(r20,ewtabscale);
611 ewitab = _mm256_cvttpd_epi32(ewrt);
612 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
613 ewitab = _mm_slli_epi32(ewitab,2);
614 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
615 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
616 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
617 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
618 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
619 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
620 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
621 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
622 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
624 /* Update potential sum for this i atom from the interaction with this j atom. */
625 velecsum = _mm256_add_pd(velecsum,velec);
627 fscal = felec;
629 /* Calculate temporary vectorial force */
630 tx = _mm256_mul_pd(fscal,dx20);
631 ty = _mm256_mul_pd(fscal,dy20);
632 tz = _mm256_mul_pd(fscal,dz20);
634 /* Update vectorial force */
635 fix2 = _mm256_add_pd(fix2,tx);
636 fiy2 = _mm256_add_pd(fiy2,ty);
637 fiz2 = _mm256_add_pd(fiz2,tz);
639 fjx0 = _mm256_add_pd(fjx0,tx);
640 fjy0 = _mm256_add_pd(fjy0,ty);
641 fjz0 = _mm256_add_pd(fjz0,tz);
643 /**************************
644 * CALCULATE INTERACTIONS *
645 **************************/
647 r21 = _mm256_mul_pd(rsq21,rinv21);
649 /* EWALD ELECTROSTATICS */
651 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
652 ewrt = _mm256_mul_pd(r21,ewtabscale);
653 ewitab = _mm256_cvttpd_epi32(ewrt);
654 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
655 ewitab = _mm_slli_epi32(ewitab,2);
656 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
657 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
658 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
659 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
660 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
661 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
662 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
663 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
664 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
666 /* Update potential sum for this i atom from the interaction with this j atom. */
667 velecsum = _mm256_add_pd(velecsum,velec);
669 fscal = felec;
671 /* Calculate temporary vectorial force */
672 tx = _mm256_mul_pd(fscal,dx21);
673 ty = _mm256_mul_pd(fscal,dy21);
674 tz = _mm256_mul_pd(fscal,dz21);
676 /* Update vectorial force */
677 fix2 = _mm256_add_pd(fix2,tx);
678 fiy2 = _mm256_add_pd(fiy2,ty);
679 fiz2 = _mm256_add_pd(fiz2,tz);
681 fjx1 = _mm256_add_pd(fjx1,tx);
682 fjy1 = _mm256_add_pd(fjy1,ty);
683 fjz1 = _mm256_add_pd(fjz1,tz);
685 /**************************
686 * CALCULATE INTERACTIONS *
687 **************************/
689 r22 = _mm256_mul_pd(rsq22,rinv22);
691 /* EWALD ELECTROSTATICS */
693 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
694 ewrt = _mm256_mul_pd(r22,ewtabscale);
695 ewitab = _mm256_cvttpd_epi32(ewrt);
696 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
697 ewitab = _mm_slli_epi32(ewitab,2);
698 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
699 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
700 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
701 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
702 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
703 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
704 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
705 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
706 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
708 /* Update potential sum for this i atom from the interaction with this j atom. */
709 velecsum = _mm256_add_pd(velecsum,velec);
711 fscal = felec;
713 /* Calculate temporary vectorial force */
714 tx = _mm256_mul_pd(fscal,dx22);
715 ty = _mm256_mul_pd(fscal,dy22);
716 tz = _mm256_mul_pd(fscal,dz22);
718 /* Update vectorial force */
719 fix2 = _mm256_add_pd(fix2,tx);
720 fiy2 = _mm256_add_pd(fiy2,ty);
721 fiz2 = _mm256_add_pd(fiz2,tz);
723 fjx2 = _mm256_add_pd(fjx2,tx);
724 fjy2 = _mm256_add_pd(fjy2,ty);
725 fjz2 = _mm256_add_pd(fjz2,tz);
727 fjptrA = f+j_coord_offsetA;
728 fjptrB = f+j_coord_offsetB;
729 fjptrC = f+j_coord_offsetC;
730 fjptrD = f+j_coord_offsetD;
732 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
733 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
735 /* Inner loop uses 403 flops */
738 if(jidx<j_index_end)
741 /* Get j neighbor index, and coordinate index */
742 jnrlistA = jjnr[jidx];
743 jnrlistB = jjnr[jidx+1];
744 jnrlistC = jjnr[jidx+2];
745 jnrlistD = jjnr[jidx+3];
746 /* Sign of each element will be negative for non-real atoms.
747 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
748 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
750 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
752 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
753 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
754 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
756 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
757 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
758 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
759 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
760 j_coord_offsetA = DIM*jnrA;
761 j_coord_offsetB = DIM*jnrB;
762 j_coord_offsetC = DIM*jnrC;
763 j_coord_offsetD = DIM*jnrD;
765 /* load j atom coordinates */
766 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
767 x+j_coord_offsetC,x+j_coord_offsetD,
768 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
770 /* Calculate displacement vector */
771 dx00 = _mm256_sub_pd(ix0,jx0);
772 dy00 = _mm256_sub_pd(iy0,jy0);
773 dz00 = _mm256_sub_pd(iz0,jz0);
774 dx01 = _mm256_sub_pd(ix0,jx1);
775 dy01 = _mm256_sub_pd(iy0,jy1);
776 dz01 = _mm256_sub_pd(iz0,jz1);
777 dx02 = _mm256_sub_pd(ix0,jx2);
778 dy02 = _mm256_sub_pd(iy0,jy2);
779 dz02 = _mm256_sub_pd(iz0,jz2);
780 dx10 = _mm256_sub_pd(ix1,jx0);
781 dy10 = _mm256_sub_pd(iy1,jy0);
782 dz10 = _mm256_sub_pd(iz1,jz0);
783 dx11 = _mm256_sub_pd(ix1,jx1);
784 dy11 = _mm256_sub_pd(iy1,jy1);
785 dz11 = _mm256_sub_pd(iz1,jz1);
786 dx12 = _mm256_sub_pd(ix1,jx2);
787 dy12 = _mm256_sub_pd(iy1,jy2);
788 dz12 = _mm256_sub_pd(iz1,jz2);
789 dx20 = _mm256_sub_pd(ix2,jx0);
790 dy20 = _mm256_sub_pd(iy2,jy0);
791 dz20 = _mm256_sub_pd(iz2,jz0);
792 dx21 = _mm256_sub_pd(ix2,jx1);
793 dy21 = _mm256_sub_pd(iy2,jy1);
794 dz21 = _mm256_sub_pd(iz2,jz1);
795 dx22 = _mm256_sub_pd(ix2,jx2);
796 dy22 = _mm256_sub_pd(iy2,jy2);
797 dz22 = _mm256_sub_pd(iz2,jz2);
799 /* Calculate squared distance and things based on it */
800 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
801 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
802 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
803 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
804 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
805 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
806 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
807 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
808 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
810 rinv00 = avx256_invsqrt_d(rsq00);
811 rinv01 = avx256_invsqrt_d(rsq01);
812 rinv02 = avx256_invsqrt_d(rsq02);
813 rinv10 = avx256_invsqrt_d(rsq10);
814 rinv11 = avx256_invsqrt_d(rsq11);
815 rinv12 = avx256_invsqrt_d(rsq12);
816 rinv20 = avx256_invsqrt_d(rsq20);
817 rinv21 = avx256_invsqrt_d(rsq21);
818 rinv22 = avx256_invsqrt_d(rsq22);
820 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
821 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
822 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
823 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
824 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
825 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
826 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
827 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
828 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
830 fjx0 = _mm256_setzero_pd();
831 fjy0 = _mm256_setzero_pd();
832 fjz0 = _mm256_setzero_pd();
833 fjx1 = _mm256_setzero_pd();
834 fjy1 = _mm256_setzero_pd();
835 fjz1 = _mm256_setzero_pd();
836 fjx2 = _mm256_setzero_pd();
837 fjy2 = _mm256_setzero_pd();
838 fjz2 = _mm256_setzero_pd();
840 /**************************
841 * CALCULATE INTERACTIONS *
842 **************************/
844 r00 = _mm256_mul_pd(rsq00,rinv00);
845 r00 = _mm256_andnot_pd(dummy_mask,r00);
847 /* Calculate table index by multiplying r with table scale and truncate to integer */
848 rt = _mm256_mul_pd(r00,vftabscale);
849 vfitab = _mm256_cvttpd_epi32(rt);
850 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
851 vfitab = _mm_slli_epi32(vfitab,3);
853 /* EWALD ELECTROSTATICS */
855 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
856 ewrt = _mm256_mul_pd(r00,ewtabscale);
857 ewitab = _mm256_cvttpd_epi32(ewrt);
858 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
859 ewitab = _mm_slli_epi32(ewitab,2);
860 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
861 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
862 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
863 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
864 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
865 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
866 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
867 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
868 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
870 /* CUBIC SPLINE TABLE DISPERSION */
871 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
872 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
873 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
874 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
875 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
876 Heps = _mm256_mul_pd(vfeps,H);
877 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
878 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
879 vvdw6 = _mm256_mul_pd(c6_00,VV);
880 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
881 fvdw6 = _mm256_mul_pd(c6_00,FF);
883 /* CUBIC SPLINE TABLE REPULSION */
884 vfitab = _mm_add_epi32(vfitab,ifour);
885 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
886 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
887 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
888 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
889 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
890 Heps = _mm256_mul_pd(vfeps,H);
891 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
892 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
893 vvdw12 = _mm256_mul_pd(c12_00,VV);
894 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
895 fvdw12 = _mm256_mul_pd(c12_00,FF);
896 vvdw = _mm256_add_pd(vvdw12,vvdw6);
897 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
899 /* Update potential sum for this i atom from the interaction with this j atom. */
900 velec = _mm256_andnot_pd(dummy_mask,velec);
901 velecsum = _mm256_add_pd(velecsum,velec);
902 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
903 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
905 fscal = _mm256_add_pd(felec,fvdw);
907 fscal = _mm256_andnot_pd(dummy_mask,fscal);
909 /* Calculate temporary vectorial force */
910 tx = _mm256_mul_pd(fscal,dx00);
911 ty = _mm256_mul_pd(fscal,dy00);
912 tz = _mm256_mul_pd(fscal,dz00);
914 /* Update vectorial force */
915 fix0 = _mm256_add_pd(fix0,tx);
916 fiy0 = _mm256_add_pd(fiy0,ty);
917 fiz0 = _mm256_add_pd(fiz0,tz);
919 fjx0 = _mm256_add_pd(fjx0,tx);
920 fjy0 = _mm256_add_pd(fjy0,ty);
921 fjz0 = _mm256_add_pd(fjz0,tz);
923 /**************************
924 * CALCULATE INTERACTIONS *
925 **************************/
927 r01 = _mm256_mul_pd(rsq01,rinv01);
928 r01 = _mm256_andnot_pd(dummy_mask,r01);
930 /* EWALD ELECTROSTATICS */
932 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
933 ewrt = _mm256_mul_pd(r01,ewtabscale);
934 ewitab = _mm256_cvttpd_epi32(ewrt);
935 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
936 ewitab = _mm_slli_epi32(ewitab,2);
937 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
938 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
939 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
940 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
941 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
942 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
943 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
944 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(rinv01,velec));
945 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
947 /* Update potential sum for this i atom from the interaction with this j atom. */
948 velec = _mm256_andnot_pd(dummy_mask,velec);
949 velecsum = _mm256_add_pd(velecsum,velec);
951 fscal = felec;
953 fscal = _mm256_andnot_pd(dummy_mask,fscal);
955 /* Calculate temporary vectorial force */
956 tx = _mm256_mul_pd(fscal,dx01);
957 ty = _mm256_mul_pd(fscal,dy01);
958 tz = _mm256_mul_pd(fscal,dz01);
960 /* Update vectorial force */
961 fix0 = _mm256_add_pd(fix0,tx);
962 fiy0 = _mm256_add_pd(fiy0,ty);
963 fiz0 = _mm256_add_pd(fiz0,tz);
965 fjx1 = _mm256_add_pd(fjx1,tx);
966 fjy1 = _mm256_add_pd(fjy1,ty);
967 fjz1 = _mm256_add_pd(fjz1,tz);
969 /**************************
970 * CALCULATE INTERACTIONS *
971 **************************/
973 r02 = _mm256_mul_pd(rsq02,rinv02);
974 r02 = _mm256_andnot_pd(dummy_mask,r02);
976 /* EWALD ELECTROSTATICS */
978 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
979 ewrt = _mm256_mul_pd(r02,ewtabscale);
980 ewitab = _mm256_cvttpd_epi32(ewrt);
981 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
982 ewitab = _mm_slli_epi32(ewitab,2);
983 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
984 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
985 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
986 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
987 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
988 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
989 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
990 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(rinv02,velec));
991 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
993 /* Update potential sum for this i atom from the interaction with this j atom. */
994 velec = _mm256_andnot_pd(dummy_mask,velec);
995 velecsum = _mm256_add_pd(velecsum,velec);
997 fscal = felec;
999 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1001 /* Calculate temporary vectorial force */
1002 tx = _mm256_mul_pd(fscal,dx02);
1003 ty = _mm256_mul_pd(fscal,dy02);
1004 tz = _mm256_mul_pd(fscal,dz02);
1006 /* Update vectorial force */
1007 fix0 = _mm256_add_pd(fix0,tx);
1008 fiy0 = _mm256_add_pd(fiy0,ty);
1009 fiz0 = _mm256_add_pd(fiz0,tz);
1011 fjx2 = _mm256_add_pd(fjx2,tx);
1012 fjy2 = _mm256_add_pd(fjy2,ty);
1013 fjz2 = _mm256_add_pd(fjz2,tz);
1015 /**************************
1016 * CALCULATE INTERACTIONS *
1017 **************************/
1019 r10 = _mm256_mul_pd(rsq10,rinv10);
1020 r10 = _mm256_andnot_pd(dummy_mask,r10);
1022 /* EWALD ELECTROSTATICS */
1024 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1025 ewrt = _mm256_mul_pd(r10,ewtabscale);
1026 ewitab = _mm256_cvttpd_epi32(ewrt);
1027 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1028 ewitab = _mm_slli_epi32(ewitab,2);
1029 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1030 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1031 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1032 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1033 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1034 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1035 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1036 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
1037 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1039 /* Update potential sum for this i atom from the interaction with this j atom. */
1040 velec = _mm256_andnot_pd(dummy_mask,velec);
1041 velecsum = _mm256_add_pd(velecsum,velec);
1043 fscal = felec;
1045 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1047 /* Calculate temporary vectorial force */
1048 tx = _mm256_mul_pd(fscal,dx10);
1049 ty = _mm256_mul_pd(fscal,dy10);
1050 tz = _mm256_mul_pd(fscal,dz10);
1052 /* Update vectorial force */
1053 fix1 = _mm256_add_pd(fix1,tx);
1054 fiy1 = _mm256_add_pd(fiy1,ty);
1055 fiz1 = _mm256_add_pd(fiz1,tz);
1057 fjx0 = _mm256_add_pd(fjx0,tx);
1058 fjy0 = _mm256_add_pd(fjy0,ty);
1059 fjz0 = _mm256_add_pd(fjz0,tz);
1061 /**************************
1062 * CALCULATE INTERACTIONS *
1063 **************************/
1065 r11 = _mm256_mul_pd(rsq11,rinv11);
1066 r11 = _mm256_andnot_pd(dummy_mask,r11);
1068 /* EWALD ELECTROSTATICS */
1070 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1071 ewrt = _mm256_mul_pd(r11,ewtabscale);
1072 ewitab = _mm256_cvttpd_epi32(ewrt);
1073 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1074 ewitab = _mm_slli_epi32(ewitab,2);
1075 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1076 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1077 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1078 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1079 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1080 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1081 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1082 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
1083 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1085 /* Update potential sum for this i atom from the interaction with this j atom. */
1086 velec = _mm256_andnot_pd(dummy_mask,velec);
1087 velecsum = _mm256_add_pd(velecsum,velec);
1089 fscal = felec;
1091 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1093 /* Calculate temporary vectorial force */
1094 tx = _mm256_mul_pd(fscal,dx11);
1095 ty = _mm256_mul_pd(fscal,dy11);
1096 tz = _mm256_mul_pd(fscal,dz11);
1098 /* Update vectorial force */
1099 fix1 = _mm256_add_pd(fix1,tx);
1100 fiy1 = _mm256_add_pd(fiy1,ty);
1101 fiz1 = _mm256_add_pd(fiz1,tz);
1103 fjx1 = _mm256_add_pd(fjx1,tx);
1104 fjy1 = _mm256_add_pd(fjy1,ty);
1105 fjz1 = _mm256_add_pd(fjz1,tz);
1107 /**************************
1108 * CALCULATE INTERACTIONS *
1109 **************************/
1111 r12 = _mm256_mul_pd(rsq12,rinv12);
1112 r12 = _mm256_andnot_pd(dummy_mask,r12);
1114 /* EWALD ELECTROSTATICS */
1116 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1117 ewrt = _mm256_mul_pd(r12,ewtabscale);
1118 ewitab = _mm256_cvttpd_epi32(ewrt);
1119 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1120 ewitab = _mm_slli_epi32(ewitab,2);
1121 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1122 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1123 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1124 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1125 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1126 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1127 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1128 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
1129 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1131 /* Update potential sum for this i atom from the interaction with this j atom. */
1132 velec = _mm256_andnot_pd(dummy_mask,velec);
1133 velecsum = _mm256_add_pd(velecsum,velec);
1135 fscal = felec;
1137 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1139 /* Calculate temporary vectorial force */
1140 tx = _mm256_mul_pd(fscal,dx12);
1141 ty = _mm256_mul_pd(fscal,dy12);
1142 tz = _mm256_mul_pd(fscal,dz12);
1144 /* Update vectorial force */
1145 fix1 = _mm256_add_pd(fix1,tx);
1146 fiy1 = _mm256_add_pd(fiy1,ty);
1147 fiz1 = _mm256_add_pd(fiz1,tz);
1149 fjx2 = _mm256_add_pd(fjx2,tx);
1150 fjy2 = _mm256_add_pd(fjy2,ty);
1151 fjz2 = _mm256_add_pd(fjz2,tz);
1153 /**************************
1154 * CALCULATE INTERACTIONS *
1155 **************************/
1157 r20 = _mm256_mul_pd(rsq20,rinv20);
1158 r20 = _mm256_andnot_pd(dummy_mask,r20);
1160 /* EWALD ELECTROSTATICS */
1162 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1163 ewrt = _mm256_mul_pd(r20,ewtabscale);
1164 ewitab = _mm256_cvttpd_epi32(ewrt);
1165 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1166 ewitab = _mm_slli_epi32(ewitab,2);
1167 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1168 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1169 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1170 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1171 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1172 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1173 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1174 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
1175 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1177 /* Update potential sum for this i atom from the interaction with this j atom. */
1178 velec = _mm256_andnot_pd(dummy_mask,velec);
1179 velecsum = _mm256_add_pd(velecsum,velec);
1181 fscal = felec;
1183 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1185 /* Calculate temporary vectorial force */
1186 tx = _mm256_mul_pd(fscal,dx20);
1187 ty = _mm256_mul_pd(fscal,dy20);
1188 tz = _mm256_mul_pd(fscal,dz20);
1190 /* Update vectorial force */
1191 fix2 = _mm256_add_pd(fix2,tx);
1192 fiy2 = _mm256_add_pd(fiy2,ty);
1193 fiz2 = _mm256_add_pd(fiz2,tz);
1195 fjx0 = _mm256_add_pd(fjx0,tx);
1196 fjy0 = _mm256_add_pd(fjy0,ty);
1197 fjz0 = _mm256_add_pd(fjz0,tz);
1199 /**************************
1200 * CALCULATE INTERACTIONS *
1201 **************************/
1203 r21 = _mm256_mul_pd(rsq21,rinv21);
1204 r21 = _mm256_andnot_pd(dummy_mask,r21);
1206 /* EWALD ELECTROSTATICS */
1208 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1209 ewrt = _mm256_mul_pd(r21,ewtabscale);
1210 ewitab = _mm256_cvttpd_epi32(ewrt);
1211 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1212 ewitab = _mm_slli_epi32(ewitab,2);
1213 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1214 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1215 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1216 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1217 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1218 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1219 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1220 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
1221 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1223 /* Update potential sum for this i atom from the interaction with this j atom. */
1224 velec = _mm256_andnot_pd(dummy_mask,velec);
1225 velecsum = _mm256_add_pd(velecsum,velec);
1227 fscal = felec;
1229 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1231 /* Calculate temporary vectorial force */
1232 tx = _mm256_mul_pd(fscal,dx21);
1233 ty = _mm256_mul_pd(fscal,dy21);
1234 tz = _mm256_mul_pd(fscal,dz21);
1236 /* Update vectorial force */
1237 fix2 = _mm256_add_pd(fix2,tx);
1238 fiy2 = _mm256_add_pd(fiy2,ty);
1239 fiz2 = _mm256_add_pd(fiz2,tz);
1241 fjx1 = _mm256_add_pd(fjx1,tx);
1242 fjy1 = _mm256_add_pd(fjy1,ty);
1243 fjz1 = _mm256_add_pd(fjz1,tz);
1245 /**************************
1246 * CALCULATE INTERACTIONS *
1247 **************************/
1249 r22 = _mm256_mul_pd(rsq22,rinv22);
1250 r22 = _mm256_andnot_pd(dummy_mask,r22);
1252 /* EWALD ELECTROSTATICS */
1254 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1255 ewrt = _mm256_mul_pd(r22,ewtabscale);
1256 ewitab = _mm256_cvttpd_epi32(ewrt);
1257 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1258 ewitab = _mm_slli_epi32(ewitab,2);
1259 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1260 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1261 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1262 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1263 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1264 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1265 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1266 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
1267 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1269 /* Update potential sum for this i atom from the interaction with this j atom. */
1270 velec = _mm256_andnot_pd(dummy_mask,velec);
1271 velecsum = _mm256_add_pd(velecsum,velec);
1273 fscal = felec;
1275 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1277 /* Calculate temporary vectorial force */
1278 tx = _mm256_mul_pd(fscal,dx22);
1279 ty = _mm256_mul_pd(fscal,dy22);
1280 tz = _mm256_mul_pd(fscal,dz22);
1282 /* Update vectorial force */
1283 fix2 = _mm256_add_pd(fix2,tx);
1284 fiy2 = _mm256_add_pd(fiy2,ty);
1285 fiz2 = _mm256_add_pd(fiz2,tz);
1287 fjx2 = _mm256_add_pd(fjx2,tx);
1288 fjy2 = _mm256_add_pd(fjy2,ty);
1289 fjz2 = _mm256_add_pd(fjz2,tz);
1291 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1292 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1293 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1294 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1296 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1297 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1299 /* Inner loop uses 412 flops */
1302 /* End of innermost loop */
1304 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1305 f+i_coord_offset,fshift+i_shift_offset);
1307 ggid = gid[iidx];
1308 /* Update potential energies */
1309 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1310 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1312 /* Increment number of inner iterations */
1313 inneriter += j_index_end - j_index_start;
1315 /* Outer loop uses 20 flops */
1318 /* Increment number of outer iterations */
1319 outeriter += nri;
1321 /* Update outer/inner flops */
1323 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*412);
1326 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_avx_256_double
1327 * Electrostatics interaction: Ewald
1328 * VdW interaction: CubicSplineTable
1329 * Geometry: Water3-Water3
1330 * Calculate force/pot: Force
1332 void
1333 nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_avx_256_double
1334 (t_nblist * gmx_restrict nlist,
1335 rvec * gmx_restrict xx,
1336 rvec * gmx_restrict ff,
1337 struct t_forcerec * gmx_restrict fr,
1338 t_mdatoms * gmx_restrict mdatoms,
1339 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1340 t_nrnb * gmx_restrict nrnb)
1342 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1343 * just 0 for non-waters.
1344 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1345 * jnr indices corresponding to data put in the four positions in the SIMD register.
1347 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1348 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1349 int jnrA,jnrB,jnrC,jnrD;
1350 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1351 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1352 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1353 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1354 real rcutoff_scalar;
1355 real *shiftvec,*fshift,*x,*f;
1356 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1357 real scratch[4*DIM];
1358 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1359 real * vdwioffsetptr0;
1360 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1361 real * vdwioffsetptr1;
1362 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1363 real * vdwioffsetptr2;
1364 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1365 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1366 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1367 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1368 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1369 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1370 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1371 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1372 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1373 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1374 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1375 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1376 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1377 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1378 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1379 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1380 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1381 real *charge;
1382 int nvdwtype;
1383 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1384 int *vdwtype;
1385 real *vdwparam;
1386 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
1387 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
1388 __m128i vfitab;
1389 __m128i ifour = _mm_set1_epi32(4);
1390 __m256d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1391 real *vftab;
1392 __m128i ewitab;
1393 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1394 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1395 real *ewtab;
1396 __m256d dummy_mask,cutoff_mask;
1397 __m128 tmpmask0,tmpmask1;
1398 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1399 __m256d one = _mm256_set1_pd(1.0);
1400 __m256d two = _mm256_set1_pd(2.0);
1401 x = xx[0];
1402 f = ff[0];
1404 nri = nlist->nri;
1405 iinr = nlist->iinr;
1406 jindex = nlist->jindex;
1407 jjnr = nlist->jjnr;
1408 shiftidx = nlist->shift;
1409 gid = nlist->gid;
1410 shiftvec = fr->shift_vec[0];
1411 fshift = fr->fshift[0];
1412 facel = _mm256_set1_pd(fr->ic->epsfac);
1413 charge = mdatoms->chargeA;
1414 nvdwtype = fr->ntype;
1415 vdwparam = fr->nbfp;
1416 vdwtype = mdatoms->typeA;
1418 vftab = kernel_data->table_vdw->data;
1419 vftabscale = _mm256_set1_pd(kernel_data->table_vdw->scale);
1421 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
1422 beta = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
1423 beta2 = _mm256_mul_pd(beta,beta);
1424 beta3 = _mm256_mul_pd(beta,beta2);
1426 ewtab = fr->ic->tabq_coul_F;
1427 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
1428 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
1430 /* Setup water-specific parameters */
1431 inr = nlist->iinr[0];
1432 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
1433 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1434 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1435 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1437 jq0 = _mm256_set1_pd(charge[inr+0]);
1438 jq1 = _mm256_set1_pd(charge[inr+1]);
1439 jq2 = _mm256_set1_pd(charge[inr+2]);
1440 vdwjidx0A = 2*vdwtype[inr+0];
1441 qq00 = _mm256_mul_pd(iq0,jq0);
1442 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1443 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1444 qq01 = _mm256_mul_pd(iq0,jq1);
1445 qq02 = _mm256_mul_pd(iq0,jq2);
1446 qq10 = _mm256_mul_pd(iq1,jq0);
1447 qq11 = _mm256_mul_pd(iq1,jq1);
1448 qq12 = _mm256_mul_pd(iq1,jq2);
1449 qq20 = _mm256_mul_pd(iq2,jq0);
1450 qq21 = _mm256_mul_pd(iq2,jq1);
1451 qq22 = _mm256_mul_pd(iq2,jq2);
1453 /* Avoid stupid compiler warnings */
1454 jnrA = jnrB = jnrC = jnrD = 0;
1455 j_coord_offsetA = 0;
1456 j_coord_offsetB = 0;
1457 j_coord_offsetC = 0;
1458 j_coord_offsetD = 0;
1460 outeriter = 0;
1461 inneriter = 0;
1463 for(iidx=0;iidx<4*DIM;iidx++)
1465 scratch[iidx] = 0.0;
1468 /* Start outer loop over neighborlists */
1469 for(iidx=0; iidx<nri; iidx++)
1471 /* Load shift vector for this list */
1472 i_shift_offset = DIM*shiftidx[iidx];
1474 /* Load limits for loop over neighbors */
1475 j_index_start = jindex[iidx];
1476 j_index_end = jindex[iidx+1];
1478 /* Get outer coordinate index */
1479 inr = iinr[iidx];
1480 i_coord_offset = DIM*inr;
1482 /* Load i particle coords and add shift vector */
1483 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1484 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1486 fix0 = _mm256_setzero_pd();
1487 fiy0 = _mm256_setzero_pd();
1488 fiz0 = _mm256_setzero_pd();
1489 fix1 = _mm256_setzero_pd();
1490 fiy1 = _mm256_setzero_pd();
1491 fiz1 = _mm256_setzero_pd();
1492 fix2 = _mm256_setzero_pd();
1493 fiy2 = _mm256_setzero_pd();
1494 fiz2 = _mm256_setzero_pd();
1496 /* Start inner kernel loop */
1497 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1500 /* Get j neighbor index, and coordinate index */
1501 jnrA = jjnr[jidx];
1502 jnrB = jjnr[jidx+1];
1503 jnrC = jjnr[jidx+2];
1504 jnrD = jjnr[jidx+3];
1505 j_coord_offsetA = DIM*jnrA;
1506 j_coord_offsetB = DIM*jnrB;
1507 j_coord_offsetC = DIM*jnrC;
1508 j_coord_offsetD = DIM*jnrD;
1510 /* load j atom coordinates */
1511 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1512 x+j_coord_offsetC,x+j_coord_offsetD,
1513 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1515 /* Calculate displacement vector */
1516 dx00 = _mm256_sub_pd(ix0,jx0);
1517 dy00 = _mm256_sub_pd(iy0,jy0);
1518 dz00 = _mm256_sub_pd(iz0,jz0);
1519 dx01 = _mm256_sub_pd(ix0,jx1);
1520 dy01 = _mm256_sub_pd(iy0,jy1);
1521 dz01 = _mm256_sub_pd(iz0,jz1);
1522 dx02 = _mm256_sub_pd(ix0,jx2);
1523 dy02 = _mm256_sub_pd(iy0,jy2);
1524 dz02 = _mm256_sub_pd(iz0,jz2);
1525 dx10 = _mm256_sub_pd(ix1,jx0);
1526 dy10 = _mm256_sub_pd(iy1,jy0);
1527 dz10 = _mm256_sub_pd(iz1,jz0);
1528 dx11 = _mm256_sub_pd(ix1,jx1);
1529 dy11 = _mm256_sub_pd(iy1,jy1);
1530 dz11 = _mm256_sub_pd(iz1,jz1);
1531 dx12 = _mm256_sub_pd(ix1,jx2);
1532 dy12 = _mm256_sub_pd(iy1,jy2);
1533 dz12 = _mm256_sub_pd(iz1,jz2);
1534 dx20 = _mm256_sub_pd(ix2,jx0);
1535 dy20 = _mm256_sub_pd(iy2,jy0);
1536 dz20 = _mm256_sub_pd(iz2,jz0);
1537 dx21 = _mm256_sub_pd(ix2,jx1);
1538 dy21 = _mm256_sub_pd(iy2,jy1);
1539 dz21 = _mm256_sub_pd(iz2,jz1);
1540 dx22 = _mm256_sub_pd(ix2,jx2);
1541 dy22 = _mm256_sub_pd(iy2,jy2);
1542 dz22 = _mm256_sub_pd(iz2,jz2);
1544 /* Calculate squared distance and things based on it */
1545 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1546 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1547 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1548 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1549 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1550 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1551 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1552 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1553 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1555 rinv00 = avx256_invsqrt_d(rsq00);
1556 rinv01 = avx256_invsqrt_d(rsq01);
1557 rinv02 = avx256_invsqrt_d(rsq02);
1558 rinv10 = avx256_invsqrt_d(rsq10);
1559 rinv11 = avx256_invsqrt_d(rsq11);
1560 rinv12 = avx256_invsqrt_d(rsq12);
1561 rinv20 = avx256_invsqrt_d(rsq20);
1562 rinv21 = avx256_invsqrt_d(rsq21);
1563 rinv22 = avx256_invsqrt_d(rsq22);
1565 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1566 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
1567 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
1568 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1569 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1570 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1571 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1572 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1573 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1575 fjx0 = _mm256_setzero_pd();
1576 fjy0 = _mm256_setzero_pd();
1577 fjz0 = _mm256_setzero_pd();
1578 fjx1 = _mm256_setzero_pd();
1579 fjy1 = _mm256_setzero_pd();
1580 fjz1 = _mm256_setzero_pd();
1581 fjx2 = _mm256_setzero_pd();
1582 fjy2 = _mm256_setzero_pd();
1583 fjz2 = _mm256_setzero_pd();
1585 /**************************
1586 * CALCULATE INTERACTIONS *
1587 **************************/
1589 r00 = _mm256_mul_pd(rsq00,rinv00);
1591 /* Calculate table index by multiplying r with table scale and truncate to integer */
1592 rt = _mm256_mul_pd(r00,vftabscale);
1593 vfitab = _mm256_cvttpd_epi32(rt);
1594 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1595 vfitab = _mm_slli_epi32(vfitab,3);
1597 /* EWALD ELECTROSTATICS */
1599 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1600 ewrt = _mm256_mul_pd(r00,ewtabscale);
1601 ewitab = _mm256_cvttpd_epi32(ewrt);
1602 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1603 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1604 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1605 &ewtabF,&ewtabFn);
1606 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1607 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
1609 /* CUBIC SPLINE TABLE DISPERSION */
1610 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1611 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1612 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1613 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1614 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1615 Heps = _mm256_mul_pd(vfeps,H);
1616 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1617 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1618 fvdw6 = _mm256_mul_pd(c6_00,FF);
1620 /* CUBIC SPLINE TABLE REPULSION */
1621 vfitab = _mm_add_epi32(vfitab,ifour);
1622 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1623 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1624 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1625 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1626 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1627 Heps = _mm256_mul_pd(vfeps,H);
1628 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1629 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1630 fvdw12 = _mm256_mul_pd(c12_00,FF);
1631 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
1633 fscal = _mm256_add_pd(felec,fvdw);
1635 /* Calculate temporary vectorial force */
1636 tx = _mm256_mul_pd(fscal,dx00);
1637 ty = _mm256_mul_pd(fscal,dy00);
1638 tz = _mm256_mul_pd(fscal,dz00);
1640 /* Update vectorial force */
1641 fix0 = _mm256_add_pd(fix0,tx);
1642 fiy0 = _mm256_add_pd(fiy0,ty);
1643 fiz0 = _mm256_add_pd(fiz0,tz);
1645 fjx0 = _mm256_add_pd(fjx0,tx);
1646 fjy0 = _mm256_add_pd(fjy0,ty);
1647 fjz0 = _mm256_add_pd(fjz0,tz);
1649 /**************************
1650 * CALCULATE INTERACTIONS *
1651 **************************/
1653 r01 = _mm256_mul_pd(rsq01,rinv01);
1655 /* EWALD ELECTROSTATICS */
1657 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1658 ewrt = _mm256_mul_pd(r01,ewtabscale);
1659 ewitab = _mm256_cvttpd_epi32(ewrt);
1660 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1661 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1662 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1663 &ewtabF,&ewtabFn);
1664 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1665 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
1667 fscal = felec;
1669 /* Calculate temporary vectorial force */
1670 tx = _mm256_mul_pd(fscal,dx01);
1671 ty = _mm256_mul_pd(fscal,dy01);
1672 tz = _mm256_mul_pd(fscal,dz01);
1674 /* Update vectorial force */
1675 fix0 = _mm256_add_pd(fix0,tx);
1676 fiy0 = _mm256_add_pd(fiy0,ty);
1677 fiz0 = _mm256_add_pd(fiz0,tz);
1679 fjx1 = _mm256_add_pd(fjx1,tx);
1680 fjy1 = _mm256_add_pd(fjy1,ty);
1681 fjz1 = _mm256_add_pd(fjz1,tz);
1683 /**************************
1684 * CALCULATE INTERACTIONS *
1685 **************************/
1687 r02 = _mm256_mul_pd(rsq02,rinv02);
1689 /* EWALD ELECTROSTATICS */
1691 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1692 ewrt = _mm256_mul_pd(r02,ewtabscale);
1693 ewitab = _mm256_cvttpd_epi32(ewrt);
1694 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1695 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1696 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1697 &ewtabF,&ewtabFn);
1698 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1699 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
1701 fscal = felec;
1703 /* Calculate temporary vectorial force */
1704 tx = _mm256_mul_pd(fscal,dx02);
1705 ty = _mm256_mul_pd(fscal,dy02);
1706 tz = _mm256_mul_pd(fscal,dz02);
1708 /* Update vectorial force */
1709 fix0 = _mm256_add_pd(fix0,tx);
1710 fiy0 = _mm256_add_pd(fiy0,ty);
1711 fiz0 = _mm256_add_pd(fiz0,tz);
1713 fjx2 = _mm256_add_pd(fjx2,tx);
1714 fjy2 = _mm256_add_pd(fjy2,ty);
1715 fjz2 = _mm256_add_pd(fjz2,tz);
1717 /**************************
1718 * CALCULATE INTERACTIONS *
1719 **************************/
1721 r10 = _mm256_mul_pd(rsq10,rinv10);
1723 /* EWALD ELECTROSTATICS */
1725 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1726 ewrt = _mm256_mul_pd(r10,ewtabscale);
1727 ewitab = _mm256_cvttpd_epi32(ewrt);
1728 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1729 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1730 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1731 &ewtabF,&ewtabFn);
1732 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1733 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1735 fscal = felec;
1737 /* Calculate temporary vectorial force */
1738 tx = _mm256_mul_pd(fscal,dx10);
1739 ty = _mm256_mul_pd(fscal,dy10);
1740 tz = _mm256_mul_pd(fscal,dz10);
1742 /* Update vectorial force */
1743 fix1 = _mm256_add_pd(fix1,tx);
1744 fiy1 = _mm256_add_pd(fiy1,ty);
1745 fiz1 = _mm256_add_pd(fiz1,tz);
1747 fjx0 = _mm256_add_pd(fjx0,tx);
1748 fjy0 = _mm256_add_pd(fjy0,ty);
1749 fjz0 = _mm256_add_pd(fjz0,tz);
1751 /**************************
1752 * CALCULATE INTERACTIONS *
1753 **************************/
1755 r11 = _mm256_mul_pd(rsq11,rinv11);
1757 /* EWALD ELECTROSTATICS */
1759 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1760 ewrt = _mm256_mul_pd(r11,ewtabscale);
1761 ewitab = _mm256_cvttpd_epi32(ewrt);
1762 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1763 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1764 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1765 &ewtabF,&ewtabFn);
1766 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1767 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1769 fscal = felec;
1771 /* Calculate temporary vectorial force */
1772 tx = _mm256_mul_pd(fscal,dx11);
1773 ty = _mm256_mul_pd(fscal,dy11);
1774 tz = _mm256_mul_pd(fscal,dz11);
1776 /* Update vectorial force */
1777 fix1 = _mm256_add_pd(fix1,tx);
1778 fiy1 = _mm256_add_pd(fiy1,ty);
1779 fiz1 = _mm256_add_pd(fiz1,tz);
1781 fjx1 = _mm256_add_pd(fjx1,tx);
1782 fjy1 = _mm256_add_pd(fjy1,ty);
1783 fjz1 = _mm256_add_pd(fjz1,tz);
1785 /**************************
1786 * CALCULATE INTERACTIONS *
1787 **************************/
1789 r12 = _mm256_mul_pd(rsq12,rinv12);
1791 /* EWALD ELECTROSTATICS */
1793 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1794 ewrt = _mm256_mul_pd(r12,ewtabscale);
1795 ewitab = _mm256_cvttpd_epi32(ewrt);
1796 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1797 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1798 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1799 &ewtabF,&ewtabFn);
1800 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1801 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1803 fscal = felec;
1805 /* Calculate temporary vectorial force */
1806 tx = _mm256_mul_pd(fscal,dx12);
1807 ty = _mm256_mul_pd(fscal,dy12);
1808 tz = _mm256_mul_pd(fscal,dz12);
1810 /* Update vectorial force */
1811 fix1 = _mm256_add_pd(fix1,tx);
1812 fiy1 = _mm256_add_pd(fiy1,ty);
1813 fiz1 = _mm256_add_pd(fiz1,tz);
1815 fjx2 = _mm256_add_pd(fjx2,tx);
1816 fjy2 = _mm256_add_pd(fjy2,ty);
1817 fjz2 = _mm256_add_pd(fjz2,tz);
1819 /**************************
1820 * CALCULATE INTERACTIONS *
1821 **************************/
1823 r20 = _mm256_mul_pd(rsq20,rinv20);
1825 /* EWALD ELECTROSTATICS */
1827 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1828 ewrt = _mm256_mul_pd(r20,ewtabscale);
1829 ewitab = _mm256_cvttpd_epi32(ewrt);
1830 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1831 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1832 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1833 &ewtabF,&ewtabFn);
1834 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1835 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1837 fscal = felec;
1839 /* Calculate temporary vectorial force */
1840 tx = _mm256_mul_pd(fscal,dx20);
1841 ty = _mm256_mul_pd(fscal,dy20);
1842 tz = _mm256_mul_pd(fscal,dz20);
1844 /* Update vectorial force */
1845 fix2 = _mm256_add_pd(fix2,tx);
1846 fiy2 = _mm256_add_pd(fiy2,ty);
1847 fiz2 = _mm256_add_pd(fiz2,tz);
1849 fjx0 = _mm256_add_pd(fjx0,tx);
1850 fjy0 = _mm256_add_pd(fjy0,ty);
1851 fjz0 = _mm256_add_pd(fjz0,tz);
1853 /**************************
1854 * CALCULATE INTERACTIONS *
1855 **************************/
1857 r21 = _mm256_mul_pd(rsq21,rinv21);
1859 /* EWALD ELECTROSTATICS */
1861 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1862 ewrt = _mm256_mul_pd(r21,ewtabscale);
1863 ewitab = _mm256_cvttpd_epi32(ewrt);
1864 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1865 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1866 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1867 &ewtabF,&ewtabFn);
1868 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1869 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1871 fscal = felec;
1873 /* Calculate temporary vectorial force */
1874 tx = _mm256_mul_pd(fscal,dx21);
1875 ty = _mm256_mul_pd(fscal,dy21);
1876 tz = _mm256_mul_pd(fscal,dz21);
1878 /* Update vectorial force */
1879 fix2 = _mm256_add_pd(fix2,tx);
1880 fiy2 = _mm256_add_pd(fiy2,ty);
1881 fiz2 = _mm256_add_pd(fiz2,tz);
1883 fjx1 = _mm256_add_pd(fjx1,tx);
1884 fjy1 = _mm256_add_pd(fjy1,ty);
1885 fjz1 = _mm256_add_pd(fjz1,tz);
1887 /**************************
1888 * CALCULATE INTERACTIONS *
1889 **************************/
1891 r22 = _mm256_mul_pd(rsq22,rinv22);
1893 /* EWALD ELECTROSTATICS */
1895 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1896 ewrt = _mm256_mul_pd(r22,ewtabscale);
1897 ewitab = _mm256_cvttpd_epi32(ewrt);
1898 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1899 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1900 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1901 &ewtabF,&ewtabFn);
1902 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1903 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1905 fscal = felec;
1907 /* Calculate temporary vectorial force */
1908 tx = _mm256_mul_pd(fscal,dx22);
1909 ty = _mm256_mul_pd(fscal,dy22);
1910 tz = _mm256_mul_pd(fscal,dz22);
1912 /* Update vectorial force */
1913 fix2 = _mm256_add_pd(fix2,tx);
1914 fiy2 = _mm256_add_pd(fiy2,ty);
1915 fiz2 = _mm256_add_pd(fiz2,tz);
1917 fjx2 = _mm256_add_pd(fjx2,tx);
1918 fjy2 = _mm256_add_pd(fjy2,ty);
1919 fjz2 = _mm256_add_pd(fjz2,tz);
1921 fjptrA = f+j_coord_offsetA;
1922 fjptrB = f+j_coord_offsetB;
1923 fjptrC = f+j_coord_offsetC;
1924 fjptrD = f+j_coord_offsetD;
1926 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1927 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1929 /* Inner loop uses 350 flops */
1932 if(jidx<j_index_end)
1935 /* Get j neighbor index, and coordinate index */
1936 jnrlistA = jjnr[jidx];
1937 jnrlistB = jjnr[jidx+1];
1938 jnrlistC = jjnr[jidx+2];
1939 jnrlistD = jjnr[jidx+3];
1940 /* Sign of each element will be negative for non-real atoms.
1941 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1942 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
1944 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1946 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1947 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1948 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1950 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1951 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1952 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1953 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1954 j_coord_offsetA = DIM*jnrA;
1955 j_coord_offsetB = DIM*jnrB;
1956 j_coord_offsetC = DIM*jnrC;
1957 j_coord_offsetD = DIM*jnrD;
1959 /* load j atom coordinates */
1960 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1961 x+j_coord_offsetC,x+j_coord_offsetD,
1962 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1964 /* Calculate displacement vector */
1965 dx00 = _mm256_sub_pd(ix0,jx0);
1966 dy00 = _mm256_sub_pd(iy0,jy0);
1967 dz00 = _mm256_sub_pd(iz0,jz0);
1968 dx01 = _mm256_sub_pd(ix0,jx1);
1969 dy01 = _mm256_sub_pd(iy0,jy1);
1970 dz01 = _mm256_sub_pd(iz0,jz1);
1971 dx02 = _mm256_sub_pd(ix0,jx2);
1972 dy02 = _mm256_sub_pd(iy0,jy2);
1973 dz02 = _mm256_sub_pd(iz0,jz2);
1974 dx10 = _mm256_sub_pd(ix1,jx0);
1975 dy10 = _mm256_sub_pd(iy1,jy0);
1976 dz10 = _mm256_sub_pd(iz1,jz0);
1977 dx11 = _mm256_sub_pd(ix1,jx1);
1978 dy11 = _mm256_sub_pd(iy1,jy1);
1979 dz11 = _mm256_sub_pd(iz1,jz1);
1980 dx12 = _mm256_sub_pd(ix1,jx2);
1981 dy12 = _mm256_sub_pd(iy1,jy2);
1982 dz12 = _mm256_sub_pd(iz1,jz2);
1983 dx20 = _mm256_sub_pd(ix2,jx0);
1984 dy20 = _mm256_sub_pd(iy2,jy0);
1985 dz20 = _mm256_sub_pd(iz2,jz0);
1986 dx21 = _mm256_sub_pd(ix2,jx1);
1987 dy21 = _mm256_sub_pd(iy2,jy1);
1988 dz21 = _mm256_sub_pd(iz2,jz1);
1989 dx22 = _mm256_sub_pd(ix2,jx2);
1990 dy22 = _mm256_sub_pd(iy2,jy2);
1991 dz22 = _mm256_sub_pd(iz2,jz2);
1993 /* Calculate squared distance and things based on it */
1994 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1995 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1996 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1997 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1998 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1999 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
2000 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
2001 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
2002 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
2004 rinv00 = avx256_invsqrt_d(rsq00);
2005 rinv01 = avx256_invsqrt_d(rsq01);
2006 rinv02 = avx256_invsqrt_d(rsq02);
2007 rinv10 = avx256_invsqrt_d(rsq10);
2008 rinv11 = avx256_invsqrt_d(rsq11);
2009 rinv12 = avx256_invsqrt_d(rsq12);
2010 rinv20 = avx256_invsqrt_d(rsq20);
2011 rinv21 = avx256_invsqrt_d(rsq21);
2012 rinv22 = avx256_invsqrt_d(rsq22);
2014 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
2015 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
2016 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
2017 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
2018 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
2019 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
2020 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
2021 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
2022 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
2024 fjx0 = _mm256_setzero_pd();
2025 fjy0 = _mm256_setzero_pd();
2026 fjz0 = _mm256_setzero_pd();
2027 fjx1 = _mm256_setzero_pd();
2028 fjy1 = _mm256_setzero_pd();
2029 fjz1 = _mm256_setzero_pd();
2030 fjx2 = _mm256_setzero_pd();
2031 fjy2 = _mm256_setzero_pd();
2032 fjz2 = _mm256_setzero_pd();
2034 /**************************
2035 * CALCULATE INTERACTIONS *
2036 **************************/
2038 r00 = _mm256_mul_pd(rsq00,rinv00);
2039 r00 = _mm256_andnot_pd(dummy_mask,r00);
2041 /* Calculate table index by multiplying r with table scale and truncate to integer */
2042 rt = _mm256_mul_pd(r00,vftabscale);
2043 vfitab = _mm256_cvttpd_epi32(rt);
2044 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2045 vfitab = _mm_slli_epi32(vfitab,3);
2047 /* EWALD ELECTROSTATICS */
2049 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2050 ewrt = _mm256_mul_pd(r00,ewtabscale);
2051 ewitab = _mm256_cvttpd_epi32(ewrt);
2052 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2053 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2054 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2055 &ewtabF,&ewtabFn);
2056 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2057 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
2059 /* CUBIC SPLINE TABLE DISPERSION */
2060 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2061 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2062 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2063 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2064 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2065 Heps = _mm256_mul_pd(vfeps,H);
2066 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2067 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2068 fvdw6 = _mm256_mul_pd(c6_00,FF);
2070 /* CUBIC SPLINE TABLE REPULSION */
2071 vfitab = _mm_add_epi32(vfitab,ifour);
2072 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2073 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2074 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2075 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2076 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2077 Heps = _mm256_mul_pd(vfeps,H);
2078 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2079 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2080 fvdw12 = _mm256_mul_pd(c12_00,FF);
2081 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
2083 fscal = _mm256_add_pd(felec,fvdw);
2085 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2087 /* Calculate temporary vectorial force */
2088 tx = _mm256_mul_pd(fscal,dx00);
2089 ty = _mm256_mul_pd(fscal,dy00);
2090 tz = _mm256_mul_pd(fscal,dz00);
2092 /* Update vectorial force */
2093 fix0 = _mm256_add_pd(fix0,tx);
2094 fiy0 = _mm256_add_pd(fiy0,ty);
2095 fiz0 = _mm256_add_pd(fiz0,tz);
2097 fjx0 = _mm256_add_pd(fjx0,tx);
2098 fjy0 = _mm256_add_pd(fjy0,ty);
2099 fjz0 = _mm256_add_pd(fjz0,tz);
2101 /**************************
2102 * CALCULATE INTERACTIONS *
2103 **************************/
2105 r01 = _mm256_mul_pd(rsq01,rinv01);
2106 r01 = _mm256_andnot_pd(dummy_mask,r01);
2108 /* EWALD ELECTROSTATICS */
2110 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2111 ewrt = _mm256_mul_pd(r01,ewtabscale);
2112 ewitab = _mm256_cvttpd_epi32(ewrt);
2113 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2114 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2115 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2116 &ewtabF,&ewtabFn);
2117 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2118 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
2120 fscal = felec;
2122 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2124 /* Calculate temporary vectorial force */
2125 tx = _mm256_mul_pd(fscal,dx01);
2126 ty = _mm256_mul_pd(fscal,dy01);
2127 tz = _mm256_mul_pd(fscal,dz01);
2129 /* Update vectorial force */
2130 fix0 = _mm256_add_pd(fix0,tx);
2131 fiy0 = _mm256_add_pd(fiy0,ty);
2132 fiz0 = _mm256_add_pd(fiz0,tz);
2134 fjx1 = _mm256_add_pd(fjx1,tx);
2135 fjy1 = _mm256_add_pd(fjy1,ty);
2136 fjz1 = _mm256_add_pd(fjz1,tz);
2138 /**************************
2139 * CALCULATE INTERACTIONS *
2140 **************************/
2142 r02 = _mm256_mul_pd(rsq02,rinv02);
2143 r02 = _mm256_andnot_pd(dummy_mask,r02);
2145 /* EWALD ELECTROSTATICS */
2147 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2148 ewrt = _mm256_mul_pd(r02,ewtabscale);
2149 ewitab = _mm256_cvttpd_epi32(ewrt);
2150 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2151 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2152 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2153 &ewtabF,&ewtabFn);
2154 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2155 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
2157 fscal = felec;
2159 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2161 /* Calculate temporary vectorial force */
2162 tx = _mm256_mul_pd(fscal,dx02);
2163 ty = _mm256_mul_pd(fscal,dy02);
2164 tz = _mm256_mul_pd(fscal,dz02);
2166 /* Update vectorial force */
2167 fix0 = _mm256_add_pd(fix0,tx);
2168 fiy0 = _mm256_add_pd(fiy0,ty);
2169 fiz0 = _mm256_add_pd(fiz0,tz);
2171 fjx2 = _mm256_add_pd(fjx2,tx);
2172 fjy2 = _mm256_add_pd(fjy2,ty);
2173 fjz2 = _mm256_add_pd(fjz2,tz);
2175 /**************************
2176 * CALCULATE INTERACTIONS *
2177 **************************/
2179 r10 = _mm256_mul_pd(rsq10,rinv10);
2180 r10 = _mm256_andnot_pd(dummy_mask,r10);
2182 /* EWALD ELECTROSTATICS */
2184 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2185 ewrt = _mm256_mul_pd(r10,ewtabscale);
2186 ewitab = _mm256_cvttpd_epi32(ewrt);
2187 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2188 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2189 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2190 &ewtabF,&ewtabFn);
2191 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2192 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
2194 fscal = felec;
2196 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2198 /* Calculate temporary vectorial force */
2199 tx = _mm256_mul_pd(fscal,dx10);
2200 ty = _mm256_mul_pd(fscal,dy10);
2201 tz = _mm256_mul_pd(fscal,dz10);
2203 /* Update vectorial force */
2204 fix1 = _mm256_add_pd(fix1,tx);
2205 fiy1 = _mm256_add_pd(fiy1,ty);
2206 fiz1 = _mm256_add_pd(fiz1,tz);
2208 fjx0 = _mm256_add_pd(fjx0,tx);
2209 fjy0 = _mm256_add_pd(fjy0,ty);
2210 fjz0 = _mm256_add_pd(fjz0,tz);
2212 /**************************
2213 * CALCULATE INTERACTIONS *
2214 **************************/
2216 r11 = _mm256_mul_pd(rsq11,rinv11);
2217 r11 = _mm256_andnot_pd(dummy_mask,r11);
2219 /* EWALD ELECTROSTATICS */
2221 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2222 ewrt = _mm256_mul_pd(r11,ewtabscale);
2223 ewitab = _mm256_cvttpd_epi32(ewrt);
2224 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2225 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2226 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2227 &ewtabF,&ewtabFn);
2228 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2229 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
2231 fscal = felec;
2233 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2235 /* Calculate temporary vectorial force */
2236 tx = _mm256_mul_pd(fscal,dx11);
2237 ty = _mm256_mul_pd(fscal,dy11);
2238 tz = _mm256_mul_pd(fscal,dz11);
2240 /* Update vectorial force */
2241 fix1 = _mm256_add_pd(fix1,tx);
2242 fiy1 = _mm256_add_pd(fiy1,ty);
2243 fiz1 = _mm256_add_pd(fiz1,tz);
2245 fjx1 = _mm256_add_pd(fjx1,tx);
2246 fjy1 = _mm256_add_pd(fjy1,ty);
2247 fjz1 = _mm256_add_pd(fjz1,tz);
2249 /**************************
2250 * CALCULATE INTERACTIONS *
2251 **************************/
2253 r12 = _mm256_mul_pd(rsq12,rinv12);
2254 r12 = _mm256_andnot_pd(dummy_mask,r12);
2256 /* EWALD ELECTROSTATICS */
2258 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2259 ewrt = _mm256_mul_pd(r12,ewtabscale);
2260 ewitab = _mm256_cvttpd_epi32(ewrt);
2261 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2262 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2263 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2264 &ewtabF,&ewtabFn);
2265 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2266 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2268 fscal = felec;
2270 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2272 /* Calculate temporary vectorial force */
2273 tx = _mm256_mul_pd(fscal,dx12);
2274 ty = _mm256_mul_pd(fscal,dy12);
2275 tz = _mm256_mul_pd(fscal,dz12);
2277 /* Update vectorial force */
2278 fix1 = _mm256_add_pd(fix1,tx);
2279 fiy1 = _mm256_add_pd(fiy1,ty);
2280 fiz1 = _mm256_add_pd(fiz1,tz);
2282 fjx2 = _mm256_add_pd(fjx2,tx);
2283 fjy2 = _mm256_add_pd(fjy2,ty);
2284 fjz2 = _mm256_add_pd(fjz2,tz);
2286 /**************************
2287 * CALCULATE INTERACTIONS *
2288 **************************/
2290 r20 = _mm256_mul_pd(rsq20,rinv20);
2291 r20 = _mm256_andnot_pd(dummy_mask,r20);
2293 /* EWALD ELECTROSTATICS */
2295 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2296 ewrt = _mm256_mul_pd(r20,ewtabscale);
2297 ewitab = _mm256_cvttpd_epi32(ewrt);
2298 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2299 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2300 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2301 &ewtabF,&ewtabFn);
2302 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2303 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
2305 fscal = felec;
2307 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2309 /* Calculate temporary vectorial force */
2310 tx = _mm256_mul_pd(fscal,dx20);
2311 ty = _mm256_mul_pd(fscal,dy20);
2312 tz = _mm256_mul_pd(fscal,dz20);
2314 /* Update vectorial force */
2315 fix2 = _mm256_add_pd(fix2,tx);
2316 fiy2 = _mm256_add_pd(fiy2,ty);
2317 fiz2 = _mm256_add_pd(fiz2,tz);
2319 fjx0 = _mm256_add_pd(fjx0,tx);
2320 fjy0 = _mm256_add_pd(fjy0,ty);
2321 fjz0 = _mm256_add_pd(fjz0,tz);
2323 /**************************
2324 * CALCULATE INTERACTIONS *
2325 **************************/
2327 r21 = _mm256_mul_pd(rsq21,rinv21);
2328 r21 = _mm256_andnot_pd(dummy_mask,r21);
2330 /* EWALD ELECTROSTATICS */
2332 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2333 ewrt = _mm256_mul_pd(r21,ewtabscale);
2334 ewitab = _mm256_cvttpd_epi32(ewrt);
2335 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2336 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2337 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2338 &ewtabF,&ewtabFn);
2339 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2340 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2342 fscal = felec;
2344 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2346 /* Calculate temporary vectorial force */
2347 tx = _mm256_mul_pd(fscal,dx21);
2348 ty = _mm256_mul_pd(fscal,dy21);
2349 tz = _mm256_mul_pd(fscal,dz21);
2351 /* Update vectorial force */
2352 fix2 = _mm256_add_pd(fix2,tx);
2353 fiy2 = _mm256_add_pd(fiy2,ty);
2354 fiz2 = _mm256_add_pd(fiz2,tz);
2356 fjx1 = _mm256_add_pd(fjx1,tx);
2357 fjy1 = _mm256_add_pd(fjy1,ty);
2358 fjz1 = _mm256_add_pd(fjz1,tz);
2360 /**************************
2361 * CALCULATE INTERACTIONS *
2362 **************************/
2364 r22 = _mm256_mul_pd(rsq22,rinv22);
2365 r22 = _mm256_andnot_pd(dummy_mask,r22);
2367 /* EWALD ELECTROSTATICS */
2369 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2370 ewrt = _mm256_mul_pd(r22,ewtabscale);
2371 ewitab = _mm256_cvttpd_epi32(ewrt);
2372 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2373 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2374 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2375 &ewtabF,&ewtabFn);
2376 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2377 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2379 fscal = felec;
2381 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2383 /* Calculate temporary vectorial force */
2384 tx = _mm256_mul_pd(fscal,dx22);
2385 ty = _mm256_mul_pd(fscal,dy22);
2386 tz = _mm256_mul_pd(fscal,dz22);
2388 /* Update vectorial force */
2389 fix2 = _mm256_add_pd(fix2,tx);
2390 fiy2 = _mm256_add_pd(fiy2,ty);
2391 fiz2 = _mm256_add_pd(fiz2,tz);
2393 fjx2 = _mm256_add_pd(fjx2,tx);
2394 fjy2 = _mm256_add_pd(fjy2,ty);
2395 fjz2 = _mm256_add_pd(fjz2,tz);
2397 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2398 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2399 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2400 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2402 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2403 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2405 /* Inner loop uses 359 flops */
2408 /* End of innermost loop */
2410 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2411 f+i_coord_offset,fshift+i_shift_offset);
2413 /* Increment number of inner iterations */
2414 inneriter += j_index_end - j_index_start;
2416 /* Outer loop uses 18 flops */
2419 /* Increment number of outer iterations */
2420 outeriter += nri;
2422 /* Update outer/inner flops */
2424 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*359);