Remove all unnecessary HAVE_CONFIG_H
[gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_double / nb_kernel_ElecEwSh_VdwNone_GeomW3W3_avx_256_double.c
blob e8d74fe5e374685e6687c836249a2b2b9779f828
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_double kernel generator.
38 #include "config.h"
40 #include <math.h>
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "nrnb.h"
47 #include "gromacs/simd/math_x86_avx_256_double.h"
48 #include "kernelutil_x86_avx_256_double.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_avx_256_double
52 * Electrostatics interaction: Ewald
53 * VdW interaction: None
54 * Geometry: Water3-Water3
55 * Calculate force/pot: PotentialAndForce
57 void
58 nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_avx_256_double
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
77 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
78 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
79 real rcutoff_scalar;
80 real *shiftvec,*fshift,*x,*f;
81 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82 real scratch[4*DIM];
83 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
84 real * vdwioffsetptr0;
85 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86 real * vdwioffsetptr1;
87 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88 real * vdwioffsetptr2;
89 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
90 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
91 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
92 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
93 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
94 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
95 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
96 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
97 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
98 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
99 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
100 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
101 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
102 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
103 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
104 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
105 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
106 real *charge;
107 __m128i ewitab;
108 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
109 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
110 real *ewtab;
111 __m256d dummy_mask,cutoff_mask;
112 __m128 tmpmask0,tmpmask1;
113 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
114 __m256d one = _mm256_set1_pd(1.0);
115 __m256d two = _mm256_set1_pd(2.0);
116 x = xx[0];
117 f = ff[0];
119 nri = nlist->nri;
120 iinr = nlist->iinr;
121 jindex = nlist->jindex;
122 jjnr = nlist->jjnr;
123 shiftidx = nlist->shift;
124 gid = nlist->gid;
125 shiftvec = fr->shift_vec[0];
126 fshift = fr->fshift[0];
127 facel = _mm256_set1_pd(fr->epsfac);
128 charge = mdatoms->chargeA;
130 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
131 beta = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
132 beta2 = _mm256_mul_pd(beta,beta);
133 beta3 = _mm256_mul_pd(beta,beta2);
135 ewtab = fr->ic->tabq_coul_FDV0;
136 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
137 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
139 /* Setup water-specific parameters */
140 inr = nlist->iinr[0];
141 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
142 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
143 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
145 jq0 = _mm256_set1_pd(charge[inr+0]);
146 jq1 = _mm256_set1_pd(charge[inr+1]);
147 jq2 = _mm256_set1_pd(charge[inr+2]);
148 qq00 = _mm256_mul_pd(iq0,jq0);
149 qq01 = _mm256_mul_pd(iq0,jq1);
150 qq02 = _mm256_mul_pd(iq0,jq2);
151 qq10 = _mm256_mul_pd(iq1,jq0);
152 qq11 = _mm256_mul_pd(iq1,jq1);
153 qq12 = _mm256_mul_pd(iq1,jq2);
154 qq20 = _mm256_mul_pd(iq2,jq0);
155 qq21 = _mm256_mul_pd(iq2,jq1);
156 qq22 = _mm256_mul_pd(iq2,jq2);
158 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
159 rcutoff_scalar = fr->rcoulomb;
160 rcutoff = _mm256_set1_pd(rcutoff_scalar);
161 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
163 /* Avoid stupid compiler warnings */
164 jnrA = jnrB = jnrC = jnrD = 0;
165 j_coord_offsetA = 0;
166 j_coord_offsetB = 0;
167 j_coord_offsetC = 0;
168 j_coord_offsetD = 0;
170 outeriter = 0;
171 inneriter = 0;
173 for(iidx=0;iidx<4*DIM;iidx++)
175 scratch[iidx] = 0.0;
178 /* Start outer loop over neighborlists */
179 for(iidx=0; iidx<nri; iidx++)
181 /* Load shift vector for this list */
182 i_shift_offset = DIM*shiftidx[iidx];
184 /* Load limits for loop over neighbors */
185 j_index_start = jindex[iidx];
186 j_index_end = jindex[iidx+1];
188 /* Get outer coordinate index */
189 inr = iinr[iidx];
190 i_coord_offset = DIM*inr;
192 /* Load i particle coords and add shift vector */
193 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
194 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
196 fix0 = _mm256_setzero_pd();
197 fiy0 = _mm256_setzero_pd();
198 fiz0 = _mm256_setzero_pd();
199 fix1 = _mm256_setzero_pd();
200 fiy1 = _mm256_setzero_pd();
201 fiz1 = _mm256_setzero_pd();
202 fix2 = _mm256_setzero_pd();
203 fiy2 = _mm256_setzero_pd();
204 fiz2 = _mm256_setzero_pd();
206 /* Reset potential sums */
207 velecsum = _mm256_setzero_pd();
209 /* Start inner kernel loop */
210 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
213 /* Get j neighbor index, and coordinate index */
214 jnrA = jjnr[jidx];
215 jnrB = jjnr[jidx+1];
216 jnrC = jjnr[jidx+2];
217 jnrD = jjnr[jidx+3];
218 j_coord_offsetA = DIM*jnrA;
219 j_coord_offsetB = DIM*jnrB;
220 j_coord_offsetC = DIM*jnrC;
221 j_coord_offsetD = DIM*jnrD;
223 /* load j atom coordinates */
224 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
225 x+j_coord_offsetC,x+j_coord_offsetD,
226 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
228 /* Calculate displacement vector */
229 dx00 = _mm256_sub_pd(ix0,jx0);
230 dy00 = _mm256_sub_pd(iy0,jy0);
231 dz00 = _mm256_sub_pd(iz0,jz0);
232 dx01 = _mm256_sub_pd(ix0,jx1);
233 dy01 = _mm256_sub_pd(iy0,jy1);
234 dz01 = _mm256_sub_pd(iz0,jz1);
235 dx02 = _mm256_sub_pd(ix0,jx2);
236 dy02 = _mm256_sub_pd(iy0,jy2);
237 dz02 = _mm256_sub_pd(iz0,jz2);
238 dx10 = _mm256_sub_pd(ix1,jx0);
239 dy10 = _mm256_sub_pd(iy1,jy0);
240 dz10 = _mm256_sub_pd(iz1,jz0);
241 dx11 = _mm256_sub_pd(ix1,jx1);
242 dy11 = _mm256_sub_pd(iy1,jy1);
243 dz11 = _mm256_sub_pd(iz1,jz1);
244 dx12 = _mm256_sub_pd(ix1,jx2);
245 dy12 = _mm256_sub_pd(iy1,jy2);
246 dz12 = _mm256_sub_pd(iz1,jz2);
247 dx20 = _mm256_sub_pd(ix2,jx0);
248 dy20 = _mm256_sub_pd(iy2,jy0);
249 dz20 = _mm256_sub_pd(iz2,jz0);
250 dx21 = _mm256_sub_pd(ix2,jx1);
251 dy21 = _mm256_sub_pd(iy2,jy1);
252 dz21 = _mm256_sub_pd(iz2,jz1);
253 dx22 = _mm256_sub_pd(ix2,jx2);
254 dy22 = _mm256_sub_pd(iy2,jy2);
255 dz22 = _mm256_sub_pd(iz2,jz2);
257 /* Calculate squared distance and things based on it */
258 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
259 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
260 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
261 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
262 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
263 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
264 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
265 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
266 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
268 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
269 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
270 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
271 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
272 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
273 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
274 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
275 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
276 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
278 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
279 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
280 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
281 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
282 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
283 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
284 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
285 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
286 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
288 fjx0 = _mm256_setzero_pd();
289 fjy0 = _mm256_setzero_pd();
290 fjz0 = _mm256_setzero_pd();
291 fjx1 = _mm256_setzero_pd();
292 fjy1 = _mm256_setzero_pd();
293 fjz1 = _mm256_setzero_pd();
294 fjx2 = _mm256_setzero_pd();
295 fjy2 = _mm256_setzero_pd();
296 fjz2 = _mm256_setzero_pd();
298 /**************************
299 * CALCULATE INTERACTIONS *
300 **************************/
302 if (gmx_mm256_any_lt(rsq00,rcutoff2))
305 r00 = _mm256_mul_pd(rsq00,rinv00);
307 /* EWALD ELECTROSTATICS */
309 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
310 ewrt = _mm256_mul_pd(r00,ewtabscale);
311 ewitab = _mm256_cvttpd_epi32(ewrt);
312 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
313 ewitab = _mm_slli_epi32(ewitab,2);
314 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
315 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
316 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
317 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
318 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
319 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
320 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
321 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_sub_pd(rinv00,sh_ewald),velec));
322 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
324 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
326 /* Update potential sum for this i atom from the interaction with this j atom. */
327 velec = _mm256_and_pd(velec,cutoff_mask);
328 velecsum = _mm256_add_pd(velecsum,velec);
330 fscal = felec;
332 fscal = _mm256_and_pd(fscal,cutoff_mask);
334 /* Calculate temporary vectorial force */
335 tx = _mm256_mul_pd(fscal,dx00);
336 ty = _mm256_mul_pd(fscal,dy00);
337 tz = _mm256_mul_pd(fscal,dz00);
339 /* Update vectorial force */
340 fix0 = _mm256_add_pd(fix0,tx);
341 fiy0 = _mm256_add_pd(fiy0,ty);
342 fiz0 = _mm256_add_pd(fiz0,tz);
344 fjx0 = _mm256_add_pd(fjx0,tx);
345 fjy0 = _mm256_add_pd(fjy0,ty);
346 fjz0 = _mm256_add_pd(fjz0,tz);
350 /**************************
351 * CALCULATE INTERACTIONS *
352 **************************/
354 if (gmx_mm256_any_lt(rsq01,rcutoff2))
357 r01 = _mm256_mul_pd(rsq01,rinv01);
359 /* EWALD ELECTROSTATICS */
361 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
362 ewrt = _mm256_mul_pd(r01,ewtabscale);
363 ewitab = _mm256_cvttpd_epi32(ewrt);
364 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
365 ewitab = _mm_slli_epi32(ewitab,2);
366 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
367 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
368 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
369 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
370 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
371 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
372 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
373 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_sub_pd(rinv01,sh_ewald),velec));
374 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
376 cutoff_mask = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
378 /* Update potential sum for this i atom from the interaction with this j atom. */
379 velec = _mm256_and_pd(velec,cutoff_mask);
380 velecsum = _mm256_add_pd(velecsum,velec);
382 fscal = felec;
384 fscal = _mm256_and_pd(fscal,cutoff_mask);
386 /* Calculate temporary vectorial force */
387 tx = _mm256_mul_pd(fscal,dx01);
388 ty = _mm256_mul_pd(fscal,dy01);
389 tz = _mm256_mul_pd(fscal,dz01);
391 /* Update vectorial force */
392 fix0 = _mm256_add_pd(fix0,tx);
393 fiy0 = _mm256_add_pd(fiy0,ty);
394 fiz0 = _mm256_add_pd(fiz0,tz);
396 fjx1 = _mm256_add_pd(fjx1,tx);
397 fjy1 = _mm256_add_pd(fjy1,ty);
398 fjz1 = _mm256_add_pd(fjz1,tz);
402 /**************************
403 * CALCULATE INTERACTIONS *
404 **************************/
406 if (gmx_mm256_any_lt(rsq02,rcutoff2))
409 r02 = _mm256_mul_pd(rsq02,rinv02);
411 /* EWALD ELECTROSTATICS */
413 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
414 ewrt = _mm256_mul_pd(r02,ewtabscale);
415 ewitab = _mm256_cvttpd_epi32(ewrt);
416 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
417 ewitab = _mm_slli_epi32(ewitab,2);
418 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
419 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
420 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
421 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
422 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
423 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
424 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
425 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_sub_pd(rinv02,sh_ewald),velec));
426 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
428 cutoff_mask = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
430 /* Update potential sum for this i atom from the interaction with this j atom. */
431 velec = _mm256_and_pd(velec,cutoff_mask);
432 velecsum = _mm256_add_pd(velecsum,velec);
434 fscal = felec;
436 fscal = _mm256_and_pd(fscal,cutoff_mask);
438 /* Calculate temporary vectorial force */
439 tx = _mm256_mul_pd(fscal,dx02);
440 ty = _mm256_mul_pd(fscal,dy02);
441 tz = _mm256_mul_pd(fscal,dz02);
443 /* Update vectorial force */
444 fix0 = _mm256_add_pd(fix0,tx);
445 fiy0 = _mm256_add_pd(fiy0,ty);
446 fiz0 = _mm256_add_pd(fiz0,tz);
448 fjx2 = _mm256_add_pd(fjx2,tx);
449 fjy2 = _mm256_add_pd(fjy2,ty);
450 fjz2 = _mm256_add_pd(fjz2,tz);
454 /**************************
455 * CALCULATE INTERACTIONS *
456 **************************/
458 if (gmx_mm256_any_lt(rsq10,rcutoff2))
461 r10 = _mm256_mul_pd(rsq10,rinv10);
463 /* EWALD ELECTROSTATICS */
465 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
466 ewrt = _mm256_mul_pd(r10,ewtabscale);
467 ewitab = _mm256_cvttpd_epi32(ewrt);
468 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
469 ewitab = _mm_slli_epi32(ewitab,2);
470 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
471 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
472 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
473 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
474 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
475 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
476 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
477 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_sub_pd(rinv10,sh_ewald),velec));
478 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
480 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
482 /* Update potential sum for this i atom from the interaction with this j atom. */
483 velec = _mm256_and_pd(velec,cutoff_mask);
484 velecsum = _mm256_add_pd(velecsum,velec);
486 fscal = felec;
488 fscal = _mm256_and_pd(fscal,cutoff_mask);
490 /* Calculate temporary vectorial force */
491 tx = _mm256_mul_pd(fscal,dx10);
492 ty = _mm256_mul_pd(fscal,dy10);
493 tz = _mm256_mul_pd(fscal,dz10);
495 /* Update vectorial force */
496 fix1 = _mm256_add_pd(fix1,tx);
497 fiy1 = _mm256_add_pd(fiy1,ty);
498 fiz1 = _mm256_add_pd(fiz1,tz);
500 fjx0 = _mm256_add_pd(fjx0,tx);
501 fjy0 = _mm256_add_pd(fjy0,ty);
502 fjz0 = _mm256_add_pd(fjz0,tz);
506 /**************************
507 * CALCULATE INTERACTIONS *
508 **************************/
510 if (gmx_mm256_any_lt(rsq11,rcutoff2))
513 r11 = _mm256_mul_pd(rsq11,rinv11);
515 /* EWALD ELECTROSTATICS */
517 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
518 ewrt = _mm256_mul_pd(r11,ewtabscale);
519 ewitab = _mm256_cvttpd_epi32(ewrt);
520 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
521 ewitab = _mm_slli_epi32(ewitab,2);
522 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
523 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
524 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
525 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
526 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
527 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
528 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
529 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_sub_pd(rinv11,sh_ewald),velec));
530 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
532 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
534 /* Update potential sum for this i atom from the interaction with this j atom. */
535 velec = _mm256_and_pd(velec,cutoff_mask);
536 velecsum = _mm256_add_pd(velecsum,velec);
538 fscal = felec;
540 fscal = _mm256_and_pd(fscal,cutoff_mask);
542 /* Calculate temporary vectorial force */
543 tx = _mm256_mul_pd(fscal,dx11);
544 ty = _mm256_mul_pd(fscal,dy11);
545 tz = _mm256_mul_pd(fscal,dz11);
547 /* Update vectorial force */
548 fix1 = _mm256_add_pd(fix1,tx);
549 fiy1 = _mm256_add_pd(fiy1,ty);
550 fiz1 = _mm256_add_pd(fiz1,tz);
552 fjx1 = _mm256_add_pd(fjx1,tx);
553 fjy1 = _mm256_add_pd(fjy1,ty);
554 fjz1 = _mm256_add_pd(fjz1,tz);
558 /**************************
559 * CALCULATE INTERACTIONS *
560 **************************/
562 if (gmx_mm256_any_lt(rsq12,rcutoff2))
565 r12 = _mm256_mul_pd(rsq12,rinv12);
567 /* EWALD ELECTROSTATICS */
569 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
570 ewrt = _mm256_mul_pd(r12,ewtabscale);
571 ewitab = _mm256_cvttpd_epi32(ewrt);
572 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
573 ewitab = _mm_slli_epi32(ewitab,2);
574 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
575 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
576 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
577 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
578 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
579 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
580 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
581 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_sub_pd(rinv12,sh_ewald),velec));
582 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
584 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
586 /* Update potential sum for this i atom from the interaction with this j atom. */
587 velec = _mm256_and_pd(velec,cutoff_mask);
588 velecsum = _mm256_add_pd(velecsum,velec);
590 fscal = felec;
592 fscal = _mm256_and_pd(fscal,cutoff_mask);
594 /* Calculate temporary vectorial force */
595 tx = _mm256_mul_pd(fscal,dx12);
596 ty = _mm256_mul_pd(fscal,dy12);
597 tz = _mm256_mul_pd(fscal,dz12);
599 /* Update vectorial force */
600 fix1 = _mm256_add_pd(fix1,tx);
601 fiy1 = _mm256_add_pd(fiy1,ty);
602 fiz1 = _mm256_add_pd(fiz1,tz);
604 fjx2 = _mm256_add_pd(fjx2,tx);
605 fjy2 = _mm256_add_pd(fjy2,ty);
606 fjz2 = _mm256_add_pd(fjz2,tz);
610 /**************************
611 * CALCULATE INTERACTIONS *
612 **************************/
614 if (gmx_mm256_any_lt(rsq20,rcutoff2))
617 r20 = _mm256_mul_pd(rsq20,rinv20);
619 /* EWALD ELECTROSTATICS */
621 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
622 ewrt = _mm256_mul_pd(r20,ewtabscale);
623 ewitab = _mm256_cvttpd_epi32(ewrt);
624 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
625 ewitab = _mm_slli_epi32(ewitab,2);
626 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
627 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
628 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
629 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
630 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
631 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
632 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
633 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_sub_pd(rinv20,sh_ewald),velec));
634 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
636 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
638 /* Update potential sum for this i atom from the interaction with this j atom. */
639 velec = _mm256_and_pd(velec,cutoff_mask);
640 velecsum = _mm256_add_pd(velecsum,velec);
642 fscal = felec;
644 fscal = _mm256_and_pd(fscal,cutoff_mask);
646 /* Calculate temporary vectorial force */
647 tx = _mm256_mul_pd(fscal,dx20);
648 ty = _mm256_mul_pd(fscal,dy20);
649 tz = _mm256_mul_pd(fscal,dz20);
651 /* Update vectorial force */
652 fix2 = _mm256_add_pd(fix2,tx);
653 fiy2 = _mm256_add_pd(fiy2,ty);
654 fiz2 = _mm256_add_pd(fiz2,tz);
656 fjx0 = _mm256_add_pd(fjx0,tx);
657 fjy0 = _mm256_add_pd(fjy0,ty);
658 fjz0 = _mm256_add_pd(fjz0,tz);
662 /**************************
663 * CALCULATE INTERACTIONS *
664 **************************/
666 if (gmx_mm256_any_lt(rsq21,rcutoff2))
669 r21 = _mm256_mul_pd(rsq21,rinv21);
671 /* EWALD ELECTROSTATICS */
673 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
674 ewrt = _mm256_mul_pd(r21,ewtabscale);
675 ewitab = _mm256_cvttpd_epi32(ewrt);
676 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
677 ewitab = _mm_slli_epi32(ewitab,2);
678 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
679 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
680 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
681 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
682 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
683 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
684 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
685 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_sub_pd(rinv21,sh_ewald),velec));
686 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
688 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
690 /* Update potential sum for this i atom from the interaction with this j atom. */
691 velec = _mm256_and_pd(velec,cutoff_mask);
692 velecsum = _mm256_add_pd(velecsum,velec);
694 fscal = felec;
696 fscal = _mm256_and_pd(fscal,cutoff_mask);
698 /* Calculate temporary vectorial force */
699 tx = _mm256_mul_pd(fscal,dx21);
700 ty = _mm256_mul_pd(fscal,dy21);
701 tz = _mm256_mul_pd(fscal,dz21);
703 /* Update vectorial force */
704 fix2 = _mm256_add_pd(fix2,tx);
705 fiy2 = _mm256_add_pd(fiy2,ty);
706 fiz2 = _mm256_add_pd(fiz2,tz);
708 fjx1 = _mm256_add_pd(fjx1,tx);
709 fjy1 = _mm256_add_pd(fjy1,ty);
710 fjz1 = _mm256_add_pd(fjz1,tz);
714 /**************************
715 * CALCULATE INTERACTIONS *
716 **************************/
718 if (gmx_mm256_any_lt(rsq22,rcutoff2))
721 r22 = _mm256_mul_pd(rsq22,rinv22);
723 /* EWALD ELECTROSTATICS */
725 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
726 ewrt = _mm256_mul_pd(r22,ewtabscale);
727 ewitab = _mm256_cvttpd_epi32(ewrt);
728 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
729 ewitab = _mm_slli_epi32(ewitab,2);
730 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
731 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
732 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
733 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
734 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
735 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
736 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
737 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_sub_pd(rinv22,sh_ewald),velec));
738 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
740 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
742 /* Update potential sum for this i atom from the interaction with this j atom. */
743 velec = _mm256_and_pd(velec,cutoff_mask);
744 velecsum = _mm256_add_pd(velecsum,velec);
746 fscal = felec;
748 fscal = _mm256_and_pd(fscal,cutoff_mask);
750 /* Calculate temporary vectorial force */
751 tx = _mm256_mul_pd(fscal,dx22);
752 ty = _mm256_mul_pd(fscal,dy22);
753 tz = _mm256_mul_pd(fscal,dz22);
755 /* Update vectorial force */
756 fix2 = _mm256_add_pd(fix2,tx);
757 fiy2 = _mm256_add_pd(fiy2,ty);
758 fiz2 = _mm256_add_pd(fiz2,tz);
760 fjx2 = _mm256_add_pd(fjx2,tx);
761 fjy2 = _mm256_add_pd(fjy2,ty);
762 fjz2 = _mm256_add_pd(fjz2,tz);
766 fjptrA = f+j_coord_offsetA;
767 fjptrB = f+j_coord_offsetB;
768 fjptrC = f+j_coord_offsetC;
769 fjptrD = f+j_coord_offsetD;
771 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
772 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
774 /* Inner loop uses 414 flops */
777 if(jidx<j_index_end)
780 /* Get j neighbor index, and coordinate index */
781 jnrlistA = jjnr[jidx];
782 jnrlistB = jjnr[jidx+1];
783 jnrlistC = jjnr[jidx+2];
784 jnrlistD = jjnr[jidx+3];
785 /* Sign of each element will be negative for non-real atoms.
786 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
787 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
789 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
791 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
792 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
793 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
795 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
796 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
797 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
798 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
799 j_coord_offsetA = DIM*jnrA;
800 j_coord_offsetB = DIM*jnrB;
801 j_coord_offsetC = DIM*jnrC;
802 j_coord_offsetD = DIM*jnrD;
804 /* load j atom coordinates */
805 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
806 x+j_coord_offsetC,x+j_coord_offsetD,
807 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
809 /* Calculate displacement vector */
810 dx00 = _mm256_sub_pd(ix0,jx0);
811 dy00 = _mm256_sub_pd(iy0,jy0);
812 dz00 = _mm256_sub_pd(iz0,jz0);
813 dx01 = _mm256_sub_pd(ix0,jx1);
814 dy01 = _mm256_sub_pd(iy0,jy1);
815 dz01 = _mm256_sub_pd(iz0,jz1);
816 dx02 = _mm256_sub_pd(ix0,jx2);
817 dy02 = _mm256_sub_pd(iy0,jy2);
818 dz02 = _mm256_sub_pd(iz0,jz2);
819 dx10 = _mm256_sub_pd(ix1,jx0);
820 dy10 = _mm256_sub_pd(iy1,jy0);
821 dz10 = _mm256_sub_pd(iz1,jz0);
822 dx11 = _mm256_sub_pd(ix1,jx1);
823 dy11 = _mm256_sub_pd(iy1,jy1);
824 dz11 = _mm256_sub_pd(iz1,jz1);
825 dx12 = _mm256_sub_pd(ix1,jx2);
826 dy12 = _mm256_sub_pd(iy1,jy2);
827 dz12 = _mm256_sub_pd(iz1,jz2);
828 dx20 = _mm256_sub_pd(ix2,jx0);
829 dy20 = _mm256_sub_pd(iy2,jy0);
830 dz20 = _mm256_sub_pd(iz2,jz0);
831 dx21 = _mm256_sub_pd(ix2,jx1);
832 dy21 = _mm256_sub_pd(iy2,jy1);
833 dz21 = _mm256_sub_pd(iz2,jz1);
834 dx22 = _mm256_sub_pd(ix2,jx2);
835 dy22 = _mm256_sub_pd(iy2,jy2);
836 dz22 = _mm256_sub_pd(iz2,jz2);
838 /* Calculate squared distance and things based on it */
839 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
840 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
841 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
842 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
843 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
844 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
845 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
846 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
847 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
849 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
850 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
851 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
852 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
853 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
854 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
855 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
856 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
857 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
859 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
860 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
861 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
862 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
863 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
864 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
865 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
866 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
867 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
869 fjx0 = _mm256_setzero_pd();
870 fjy0 = _mm256_setzero_pd();
871 fjz0 = _mm256_setzero_pd();
872 fjx1 = _mm256_setzero_pd();
873 fjy1 = _mm256_setzero_pd();
874 fjz1 = _mm256_setzero_pd();
875 fjx2 = _mm256_setzero_pd();
876 fjy2 = _mm256_setzero_pd();
877 fjz2 = _mm256_setzero_pd();
879 /**************************
880 * CALCULATE INTERACTIONS *
881 **************************/
883 if (gmx_mm256_any_lt(rsq00,rcutoff2))
886 r00 = _mm256_mul_pd(rsq00,rinv00);
887 r00 = _mm256_andnot_pd(dummy_mask,r00);
889 /* EWALD ELECTROSTATICS */
891 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
892 ewrt = _mm256_mul_pd(r00,ewtabscale);
893 ewitab = _mm256_cvttpd_epi32(ewrt);
894 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
895 ewitab = _mm_slli_epi32(ewitab,2);
896 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
897 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
898 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
899 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
900 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
901 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
902 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
903 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_sub_pd(rinv00,sh_ewald),velec));
904 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
906 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
908 /* Update potential sum for this i atom from the interaction with this j atom. */
909 velec = _mm256_and_pd(velec,cutoff_mask);
910 velec = _mm256_andnot_pd(dummy_mask,velec);
911 velecsum = _mm256_add_pd(velecsum,velec);
913 fscal = felec;
915 fscal = _mm256_and_pd(fscal,cutoff_mask);
917 fscal = _mm256_andnot_pd(dummy_mask,fscal);
919 /* Calculate temporary vectorial force */
920 tx = _mm256_mul_pd(fscal,dx00);
921 ty = _mm256_mul_pd(fscal,dy00);
922 tz = _mm256_mul_pd(fscal,dz00);
924 /* Update vectorial force */
925 fix0 = _mm256_add_pd(fix0,tx);
926 fiy0 = _mm256_add_pd(fiy0,ty);
927 fiz0 = _mm256_add_pd(fiz0,tz);
929 fjx0 = _mm256_add_pd(fjx0,tx);
930 fjy0 = _mm256_add_pd(fjy0,ty);
931 fjz0 = _mm256_add_pd(fjz0,tz);
935 /**************************
936 * CALCULATE INTERACTIONS *
937 **************************/
939 if (gmx_mm256_any_lt(rsq01,rcutoff2))
942 r01 = _mm256_mul_pd(rsq01,rinv01);
943 r01 = _mm256_andnot_pd(dummy_mask,r01);
945 /* EWALD ELECTROSTATICS */
947 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
948 ewrt = _mm256_mul_pd(r01,ewtabscale);
949 ewitab = _mm256_cvttpd_epi32(ewrt);
950 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
951 ewitab = _mm_slli_epi32(ewitab,2);
952 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
953 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
954 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
955 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
956 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
957 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
958 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
959 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_sub_pd(rinv01,sh_ewald),velec));
960 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
962 cutoff_mask = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
964 /* Update potential sum for this i atom from the interaction with this j atom. */
965 velec = _mm256_and_pd(velec,cutoff_mask);
966 velec = _mm256_andnot_pd(dummy_mask,velec);
967 velecsum = _mm256_add_pd(velecsum,velec);
969 fscal = felec;
971 fscal = _mm256_and_pd(fscal,cutoff_mask);
973 fscal = _mm256_andnot_pd(dummy_mask,fscal);
975 /* Calculate temporary vectorial force */
976 tx = _mm256_mul_pd(fscal,dx01);
977 ty = _mm256_mul_pd(fscal,dy01);
978 tz = _mm256_mul_pd(fscal,dz01);
980 /* Update vectorial force */
981 fix0 = _mm256_add_pd(fix0,tx);
982 fiy0 = _mm256_add_pd(fiy0,ty);
983 fiz0 = _mm256_add_pd(fiz0,tz);
985 fjx1 = _mm256_add_pd(fjx1,tx);
986 fjy1 = _mm256_add_pd(fjy1,ty);
987 fjz1 = _mm256_add_pd(fjz1,tz);
991 /**************************
992 * CALCULATE INTERACTIONS *
993 **************************/
995 if (gmx_mm256_any_lt(rsq02,rcutoff2))
998 r02 = _mm256_mul_pd(rsq02,rinv02);
999 r02 = _mm256_andnot_pd(dummy_mask,r02);
1001 /* EWALD ELECTROSTATICS */
1003 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1004 ewrt = _mm256_mul_pd(r02,ewtabscale);
1005 ewitab = _mm256_cvttpd_epi32(ewrt);
1006 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1007 ewitab = _mm_slli_epi32(ewitab,2);
1008 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1009 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1010 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1011 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1012 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1013 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1014 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1015 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_sub_pd(rinv02,sh_ewald),velec));
1016 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
1018 cutoff_mask = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
1020 /* Update potential sum for this i atom from the interaction with this j atom. */
1021 velec = _mm256_and_pd(velec,cutoff_mask);
1022 velec = _mm256_andnot_pd(dummy_mask,velec);
1023 velecsum = _mm256_add_pd(velecsum,velec);
1025 fscal = felec;
1027 fscal = _mm256_and_pd(fscal,cutoff_mask);
1029 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1031 /* Calculate temporary vectorial force */
1032 tx = _mm256_mul_pd(fscal,dx02);
1033 ty = _mm256_mul_pd(fscal,dy02);
1034 tz = _mm256_mul_pd(fscal,dz02);
1036 /* Update vectorial force */
1037 fix0 = _mm256_add_pd(fix0,tx);
1038 fiy0 = _mm256_add_pd(fiy0,ty);
1039 fiz0 = _mm256_add_pd(fiz0,tz);
1041 fjx2 = _mm256_add_pd(fjx2,tx);
1042 fjy2 = _mm256_add_pd(fjy2,ty);
1043 fjz2 = _mm256_add_pd(fjz2,tz);
1047 /**************************
1048 * CALCULATE INTERACTIONS *
1049 **************************/
1051 if (gmx_mm256_any_lt(rsq10,rcutoff2))
1054 r10 = _mm256_mul_pd(rsq10,rinv10);
1055 r10 = _mm256_andnot_pd(dummy_mask,r10);
1057 /* EWALD ELECTROSTATICS */
1059 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1060 ewrt = _mm256_mul_pd(r10,ewtabscale);
1061 ewitab = _mm256_cvttpd_epi32(ewrt);
1062 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1063 ewitab = _mm_slli_epi32(ewitab,2);
1064 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1065 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1066 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1067 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1068 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1069 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1070 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1071 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_sub_pd(rinv10,sh_ewald),velec));
1072 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1074 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
1076 /* Update potential sum for this i atom from the interaction with this j atom. */
1077 velec = _mm256_and_pd(velec,cutoff_mask);
1078 velec = _mm256_andnot_pd(dummy_mask,velec);
1079 velecsum = _mm256_add_pd(velecsum,velec);
1081 fscal = felec;
1083 fscal = _mm256_and_pd(fscal,cutoff_mask);
1085 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1087 /* Calculate temporary vectorial force */
1088 tx = _mm256_mul_pd(fscal,dx10);
1089 ty = _mm256_mul_pd(fscal,dy10);
1090 tz = _mm256_mul_pd(fscal,dz10);
1092 /* Update vectorial force */
1093 fix1 = _mm256_add_pd(fix1,tx);
1094 fiy1 = _mm256_add_pd(fiy1,ty);
1095 fiz1 = _mm256_add_pd(fiz1,tz);
1097 fjx0 = _mm256_add_pd(fjx0,tx);
1098 fjy0 = _mm256_add_pd(fjy0,ty);
1099 fjz0 = _mm256_add_pd(fjz0,tz);
1103 /**************************
1104 * CALCULATE INTERACTIONS *
1105 **************************/
1107 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1110 r11 = _mm256_mul_pd(rsq11,rinv11);
1111 r11 = _mm256_andnot_pd(dummy_mask,r11);
1113 /* EWALD ELECTROSTATICS */
1115 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1116 ewrt = _mm256_mul_pd(r11,ewtabscale);
1117 ewitab = _mm256_cvttpd_epi32(ewrt);
1118 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1119 ewitab = _mm_slli_epi32(ewitab,2);
1120 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1121 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1122 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1123 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1124 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1125 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1126 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1127 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_sub_pd(rinv11,sh_ewald),velec));
1128 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1130 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1132 /* Update potential sum for this i atom from the interaction with this j atom. */
1133 velec = _mm256_and_pd(velec,cutoff_mask);
1134 velec = _mm256_andnot_pd(dummy_mask,velec);
1135 velecsum = _mm256_add_pd(velecsum,velec);
1137 fscal = felec;
1139 fscal = _mm256_and_pd(fscal,cutoff_mask);
1141 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1143 /* Calculate temporary vectorial force */
1144 tx = _mm256_mul_pd(fscal,dx11);
1145 ty = _mm256_mul_pd(fscal,dy11);
1146 tz = _mm256_mul_pd(fscal,dz11);
1148 /* Update vectorial force */
1149 fix1 = _mm256_add_pd(fix1,tx);
1150 fiy1 = _mm256_add_pd(fiy1,ty);
1151 fiz1 = _mm256_add_pd(fiz1,tz);
1153 fjx1 = _mm256_add_pd(fjx1,tx);
1154 fjy1 = _mm256_add_pd(fjy1,ty);
1155 fjz1 = _mm256_add_pd(fjz1,tz);
1159 /**************************
1160 * CALCULATE INTERACTIONS *
1161 **************************/
1163 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1166 r12 = _mm256_mul_pd(rsq12,rinv12);
1167 r12 = _mm256_andnot_pd(dummy_mask,r12);
1169 /* EWALD ELECTROSTATICS */
1171 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1172 ewrt = _mm256_mul_pd(r12,ewtabscale);
1173 ewitab = _mm256_cvttpd_epi32(ewrt);
1174 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1175 ewitab = _mm_slli_epi32(ewitab,2);
1176 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1177 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1178 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1179 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1180 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1181 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1182 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1183 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_sub_pd(rinv12,sh_ewald),velec));
1184 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1186 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1188 /* Update potential sum for this i atom from the interaction with this j atom. */
1189 velec = _mm256_and_pd(velec,cutoff_mask);
1190 velec = _mm256_andnot_pd(dummy_mask,velec);
1191 velecsum = _mm256_add_pd(velecsum,velec);
1193 fscal = felec;
1195 fscal = _mm256_and_pd(fscal,cutoff_mask);
1197 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1199 /* Calculate temporary vectorial force */
1200 tx = _mm256_mul_pd(fscal,dx12);
1201 ty = _mm256_mul_pd(fscal,dy12);
1202 tz = _mm256_mul_pd(fscal,dz12);
1204 /* Update vectorial force */
1205 fix1 = _mm256_add_pd(fix1,tx);
1206 fiy1 = _mm256_add_pd(fiy1,ty);
1207 fiz1 = _mm256_add_pd(fiz1,tz);
1209 fjx2 = _mm256_add_pd(fjx2,tx);
1210 fjy2 = _mm256_add_pd(fjy2,ty);
1211 fjz2 = _mm256_add_pd(fjz2,tz);
1215 /**************************
1216 * CALCULATE INTERACTIONS *
1217 **************************/
1219 if (gmx_mm256_any_lt(rsq20,rcutoff2))
1222 r20 = _mm256_mul_pd(rsq20,rinv20);
1223 r20 = _mm256_andnot_pd(dummy_mask,r20);
1225 /* EWALD ELECTROSTATICS */
1227 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1228 ewrt = _mm256_mul_pd(r20,ewtabscale);
1229 ewitab = _mm256_cvttpd_epi32(ewrt);
1230 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1231 ewitab = _mm_slli_epi32(ewitab,2);
1232 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1233 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1234 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1235 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1236 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1237 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1238 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1239 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_sub_pd(rinv20,sh_ewald),velec));
1240 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1242 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
1244 /* Update potential sum for this i atom from the interaction with this j atom. */
1245 velec = _mm256_and_pd(velec,cutoff_mask);
1246 velec = _mm256_andnot_pd(dummy_mask,velec);
1247 velecsum = _mm256_add_pd(velecsum,velec);
1249 fscal = felec;
1251 fscal = _mm256_and_pd(fscal,cutoff_mask);
1253 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1255 /* Calculate temporary vectorial force */
1256 tx = _mm256_mul_pd(fscal,dx20);
1257 ty = _mm256_mul_pd(fscal,dy20);
1258 tz = _mm256_mul_pd(fscal,dz20);
1260 /* Update vectorial force */
1261 fix2 = _mm256_add_pd(fix2,tx);
1262 fiy2 = _mm256_add_pd(fiy2,ty);
1263 fiz2 = _mm256_add_pd(fiz2,tz);
1265 fjx0 = _mm256_add_pd(fjx0,tx);
1266 fjy0 = _mm256_add_pd(fjy0,ty);
1267 fjz0 = _mm256_add_pd(fjz0,tz);
1271 /**************************
1272 * CALCULATE INTERACTIONS *
1273 **************************/
1275 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1278 r21 = _mm256_mul_pd(rsq21,rinv21);
1279 r21 = _mm256_andnot_pd(dummy_mask,r21);
1281 /* EWALD ELECTROSTATICS */
1283 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1284 ewrt = _mm256_mul_pd(r21,ewtabscale);
1285 ewitab = _mm256_cvttpd_epi32(ewrt);
1286 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1287 ewitab = _mm_slli_epi32(ewitab,2);
1288 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1289 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1290 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1291 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1292 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1293 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1294 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1295 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_sub_pd(rinv21,sh_ewald),velec));
1296 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1298 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1300 /* Update potential sum for this i atom from the interaction with this j atom. */
1301 velec = _mm256_and_pd(velec,cutoff_mask);
1302 velec = _mm256_andnot_pd(dummy_mask,velec);
1303 velecsum = _mm256_add_pd(velecsum,velec);
1305 fscal = felec;
1307 fscal = _mm256_and_pd(fscal,cutoff_mask);
1309 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1311 /* Calculate temporary vectorial force */
1312 tx = _mm256_mul_pd(fscal,dx21);
1313 ty = _mm256_mul_pd(fscal,dy21);
1314 tz = _mm256_mul_pd(fscal,dz21);
1316 /* Update vectorial force */
1317 fix2 = _mm256_add_pd(fix2,tx);
1318 fiy2 = _mm256_add_pd(fiy2,ty);
1319 fiz2 = _mm256_add_pd(fiz2,tz);
1321 fjx1 = _mm256_add_pd(fjx1,tx);
1322 fjy1 = _mm256_add_pd(fjy1,ty);
1323 fjz1 = _mm256_add_pd(fjz1,tz);
1327 /**************************
1328 * CALCULATE INTERACTIONS *
1329 **************************/
1331 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1334 r22 = _mm256_mul_pd(rsq22,rinv22);
1335 r22 = _mm256_andnot_pd(dummy_mask,r22);
1337 /* EWALD ELECTROSTATICS */
1339 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1340 ewrt = _mm256_mul_pd(r22,ewtabscale);
1341 ewitab = _mm256_cvttpd_epi32(ewrt);
1342 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1343 ewitab = _mm_slli_epi32(ewitab,2);
1344 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1345 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1346 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1347 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1348 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1349 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1350 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1351 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_sub_pd(rinv22,sh_ewald),velec));
1352 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1354 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1356 /* Update potential sum for this i atom from the interaction with this j atom. */
1357 velec = _mm256_and_pd(velec,cutoff_mask);
1358 velec = _mm256_andnot_pd(dummy_mask,velec);
1359 velecsum = _mm256_add_pd(velecsum,velec);
1361 fscal = felec;
1363 fscal = _mm256_and_pd(fscal,cutoff_mask);
1365 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1367 /* Calculate temporary vectorial force */
1368 tx = _mm256_mul_pd(fscal,dx22);
1369 ty = _mm256_mul_pd(fscal,dy22);
1370 tz = _mm256_mul_pd(fscal,dz22);
1372 /* Update vectorial force */
1373 fix2 = _mm256_add_pd(fix2,tx);
1374 fiy2 = _mm256_add_pd(fiy2,ty);
1375 fiz2 = _mm256_add_pd(fiz2,tz);
1377 fjx2 = _mm256_add_pd(fjx2,tx);
1378 fjy2 = _mm256_add_pd(fjy2,ty);
1379 fjz2 = _mm256_add_pd(fjz2,tz);
1383 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1384 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1385 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1386 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1388 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1389 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1391 /* Inner loop uses 423 flops */
1394 /* End of innermost loop */
1396 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1397 f+i_coord_offset,fshift+i_shift_offset);
1399 ggid = gid[iidx];
1400 /* Update potential energies */
1401 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1403 /* Increment number of inner iterations */
1404 inneriter += j_index_end - j_index_start;
1406 /* Outer loop uses 19 flops */
1409 /* Increment number of outer iterations */
1410 outeriter += nri;
1412 /* Update outer/inner flops */
1414 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*423);
1417 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_avx_256_double
1418 * Electrostatics interaction: Ewald
1419 * VdW interaction: None
1420 * Geometry: Water3-Water3
1421 * Calculate force/pot: Force
1423 void
1424 nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_avx_256_double
1425 (t_nblist * gmx_restrict nlist,
1426 rvec * gmx_restrict xx,
1427 rvec * gmx_restrict ff,
1428 t_forcerec * gmx_restrict fr,
1429 t_mdatoms * gmx_restrict mdatoms,
1430 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1431 t_nrnb * gmx_restrict nrnb)
1433 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1434 * just 0 for non-waters.
1435 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1436 * jnr indices corresponding to data put in the four positions in the SIMD register.
1438 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1439 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1440 int jnrA,jnrB,jnrC,jnrD;
1441 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1442 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1443 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1444 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1445 real rcutoff_scalar;
1446 real *shiftvec,*fshift,*x,*f;
1447 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1448 real scratch[4*DIM];
1449 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1450 real * vdwioffsetptr0;
1451 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1452 real * vdwioffsetptr1;
1453 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1454 real * vdwioffsetptr2;
1455 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1456 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1457 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1458 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1459 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1460 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1461 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1462 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1463 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1464 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1465 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1466 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1467 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1468 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1469 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1470 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1471 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1472 real *charge;
1473 __m128i ewitab;
1474 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1475 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1476 real *ewtab;
1477 __m256d dummy_mask,cutoff_mask;
1478 __m128 tmpmask0,tmpmask1;
1479 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1480 __m256d one = _mm256_set1_pd(1.0);
1481 __m256d two = _mm256_set1_pd(2.0);
1482 x = xx[0];
1483 f = ff[0];
1485 nri = nlist->nri;
1486 iinr = nlist->iinr;
1487 jindex = nlist->jindex;
1488 jjnr = nlist->jjnr;
1489 shiftidx = nlist->shift;
1490 gid = nlist->gid;
1491 shiftvec = fr->shift_vec[0];
1492 fshift = fr->fshift[0];
1493 facel = _mm256_set1_pd(fr->epsfac);
1494 charge = mdatoms->chargeA;
1496 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
1497 beta = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
1498 beta2 = _mm256_mul_pd(beta,beta);
1499 beta3 = _mm256_mul_pd(beta,beta2);
1501 ewtab = fr->ic->tabq_coul_F;
1502 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
1503 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
1505 /* Setup water-specific parameters */
1506 inr = nlist->iinr[0];
1507 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
1508 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1509 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1511 jq0 = _mm256_set1_pd(charge[inr+0]);
1512 jq1 = _mm256_set1_pd(charge[inr+1]);
1513 jq2 = _mm256_set1_pd(charge[inr+2]);
1514 qq00 = _mm256_mul_pd(iq0,jq0);
1515 qq01 = _mm256_mul_pd(iq0,jq1);
1516 qq02 = _mm256_mul_pd(iq0,jq2);
1517 qq10 = _mm256_mul_pd(iq1,jq0);
1518 qq11 = _mm256_mul_pd(iq1,jq1);
1519 qq12 = _mm256_mul_pd(iq1,jq2);
1520 qq20 = _mm256_mul_pd(iq2,jq0);
1521 qq21 = _mm256_mul_pd(iq2,jq1);
1522 qq22 = _mm256_mul_pd(iq2,jq2);
1524 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1525 rcutoff_scalar = fr->rcoulomb;
1526 rcutoff = _mm256_set1_pd(rcutoff_scalar);
1527 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
1529 /* Avoid stupid compiler warnings */
1530 jnrA = jnrB = jnrC = jnrD = 0;
1531 j_coord_offsetA = 0;
1532 j_coord_offsetB = 0;
1533 j_coord_offsetC = 0;
1534 j_coord_offsetD = 0;
1536 outeriter = 0;
1537 inneriter = 0;
1539 for(iidx=0;iidx<4*DIM;iidx++)
1541 scratch[iidx] = 0.0;
1544 /* Start outer loop over neighborlists */
1545 for(iidx=0; iidx<nri; iidx++)
1547 /* Load shift vector for this list */
1548 i_shift_offset = DIM*shiftidx[iidx];
1550 /* Load limits for loop over neighbors */
1551 j_index_start = jindex[iidx];
1552 j_index_end = jindex[iidx+1];
1554 /* Get outer coordinate index */
1555 inr = iinr[iidx];
1556 i_coord_offset = DIM*inr;
1558 /* Load i particle coords and add shift vector */
1559 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1560 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1562 fix0 = _mm256_setzero_pd();
1563 fiy0 = _mm256_setzero_pd();
1564 fiz0 = _mm256_setzero_pd();
1565 fix1 = _mm256_setzero_pd();
1566 fiy1 = _mm256_setzero_pd();
1567 fiz1 = _mm256_setzero_pd();
1568 fix2 = _mm256_setzero_pd();
1569 fiy2 = _mm256_setzero_pd();
1570 fiz2 = _mm256_setzero_pd();
1572 /* Start inner kernel loop */
1573 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1576 /* Get j neighbor index, and coordinate index */
1577 jnrA = jjnr[jidx];
1578 jnrB = jjnr[jidx+1];
1579 jnrC = jjnr[jidx+2];
1580 jnrD = jjnr[jidx+3];
1581 j_coord_offsetA = DIM*jnrA;
1582 j_coord_offsetB = DIM*jnrB;
1583 j_coord_offsetC = DIM*jnrC;
1584 j_coord_offsetD = DIM*jnrD;
1586 /* load j atom coordinates */
1587 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1588 x+j_coord_offsetC,x+j_coord_offsetD,
1589 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1591 /* Calculate displacement vector */
1592 dx00 = _mm256_sub_pd(ix0,jx0);
1593 dy00 = _mm256_sub_pd(iy0,jy0);
1594 dz00 = _mm256_sub_pd(iz0,jz0);
1595 dx01 = _mm256_sub_pd(ix0,jx1);
1596 dy01 = _mm256_sub_pd(iy0,jy1);
1597 dz01 = _mm256_sub_pd(iz0,jz1);
1598 dx02 = _mm256_sub_pd(ix0,jx2);
1599 dy02 = _mm256_sub_pd(iy0,jy2);
1600 dz02 = _mm256_sub_pd(iz0,jz2);
1601 dx10 = _mm256_sub_pd(ix1,jx0);
1602 dy10 = _mm256_sub_pd(iy1,jy0);
1603 dz10 = _mm256_sub_pd(iz1,jz0);
1604 dx11 = _mm256_sub_pd(ix1,jx1);
1605 dy11 = _mm256_sub_pd(iy1,jy1);
1606 dz11 = _mm256_sub_pd(iz1,jz1);
1607 dx12 = _mm256_sub_pd(ix1,jx2);
1608 dy12 = _mm256_sub_pd(iy1,jy2);
1609 dz12 = _mm256_sub_pd(iz1,jz2);
1610 dx20 = _mm256_sub_pd(ix2,jx0);
1611 dy20 = _mm256_sub_pd(iy2,jy0);
1612 dz20 = _mm256_sub_pd(iz2,jz0);
1613 dx21 = _mm256_sub_pd(ix2,jx1);
1614 dy21 = _mm256_sub_pd(iy2,jy1);
1615 dz21 = _mm256_sub_pd(iz2,jz1);
1616 dx22 = _mm256_sub_pd(ix2,jx2);
1617 dy22 = _mm256_sub_pd(iy2,jy2);
1618 dz22 = _mm256_sub_pd(iz2,jz2);
1620 /* Calculate squared distance and things based on it */
1621 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1622 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1623 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1624 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1625 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1626 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1627 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1628 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1629 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1631 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1632 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
1633 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
1634 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
1635 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1636 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1637 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
1638 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1639 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1641 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1642 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
1643 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
1644 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1645 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1646 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1647 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1648 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1649 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1651 fjx0 = _mm256_setzero_pd();
1652 fjy0 = _mm256_setzero_pd();
1653 fjz0 = _mm256_setzero_pd();
1654 fjx1 = _mm256_setzero_pd();
1655 fjy1 = _mm256_setzero_pd();
1656 fjz1 = _mm256_setzero_pd();
1657 fjx2 = _mm256_setzero_pd();
1658 fjy2 = _mm256_setzero_pd();
1659 fjz2 = _mm256_setzero_pd();
1661 /**************************
1662 * CALCULATE INTERACTIONS *
1663 **************************/
1665 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1668 r00 = _mm256_mul_pd(rsq00,rinv00);
1670 /* EWALD ELECTROSTATICS */
1672 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1673 ewrt = _mm256_mul_pd(r00,ewtabscale);
1674 ewitab = _mm256_cvttpd_epi32(ewrt);
1675 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1676 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1677 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1678 &ewtabF,&ewtabFn);
1679 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1680 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
1682 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1684 fscal = felec;
1686 fscal = _mm256_and_pd(fscal,cutoff_mask);
1688 /* Calculate temporary vectorial force */
1689 tx = _mm256_mul_pd(fscal,dx00);
1690 ty = _mm256_mul_pd(fscal,dy00);
1691 tz = _mm256_mul_pd(fscal,dz00);
1693 /* Update vectorial force */
1694 fix0 = _mm256_add_pd(fix0,tx);
1695 fiy0 = _mm256_add_pd(fiy0,ty);
1696 fiz0 = _mm256_add_pd(fiz0,tz);
1698 fjx0 = _mm256_add_pd(fjx0,tx);
1699 fjy0 = _mm256_add_pd(fjy0,ty);
1700 fjz0 = _mm256_add_pd(fjz0,tz);
1704 /**************************
1705 * CALCULATE INTERACTIONS *
1706 **************************/
1708 if (gmx_mm256_any_lt(rsq01,rcutoff2))
1711 r01 = _mm256_mul_pd(rsq01,rinv01);
1713 /* EWALD ELECTROSTATICS */
1715 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1716 ewrt = _mm256_mul_pd(r01,ewtabscale);
1717 ewitab = _mm256_cvttpd_epi32(ewrt);
1718 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1719 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1720 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1721 &ewtabF,&ewtabFn);
1722 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1723 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
1725 cutoff_mask = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
1727 fscal = felec;
1729 fscal = _mm256_and_pd(fscal,cutoff_mask);
1731 /* Calculate temporary vectorial force */
1732 tx = _mm256_mul_pd(fscal,dx01);
1733 ty = _mm256_mul_pd(fscal,dy01);
1734 tz = _mm256_mul_pd(fscal,dz01);
1736 /* Update vectorial force */
1737 fix0 = _mm256_add_pd(fix0,tx);
1738 fiy0 = _mm256_add_pd(fiy0,ty);
1739 fiz0 = _mm256_add_pd(fiz0,tz);
1741 fjx1 = _mm256_add_pd(fjx1,tx);
1742 fjy1 = _mm256_add_pd(fjy1,ty);
1743 fjz1 = _mm256_add_pd(fjz1,tz);
1747 /**************************
1748 * CALCULATE INTERACTIONS *
1749 **************************/
1751 if (gmx_mm256_any_lt(rsq02,rcutoff2))
1754 r02 = _mm256_mul_pd(rsq02,rinv02);
1756 /* EWALD ELECTROSTATICS */
1758 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1759 ewrt = _mm256_mul_pd(r02,ewtabscale);
1760 ewitab = _mm256_cvttpd_epi32(ewrt);
1761 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1762 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1763 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1764 &ewtabF,&ewtabFn);
1765 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1766 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
1768 cutoff_mask = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
1770 fscal = felec;
1772 fscal = _mm256_and_pd(fscal,cutoff_mask);
1774 /* Calculate temporary vectorial force */
1775 tx = _mm256_mul_pd(fscal,dx02);
1776 ty = _mm256_mul_pd(fscal,dy02);
1777 tz = _mm256_mul_pd(fscal,dz02);
1779 /* Update vectorial force */
1780 fix0 = _mm256_add_pd(fix0,tx);
1781 fiy0 = _mm256_add_pd(fiy0,ty);
1782 fiz0 = _mm256_add_pd(fiz0,tz);
1784 fjx2 = _mm256_add_pd(fjx2,tx);
1785 fjy2 = _mm256_add_pd(fjy2,ty);
1786 fjz2 = _mm256_add_pd(fjz2,tz);
1790 /**************************
1791 * CALCULATE INTERACTIONS *
1792 **************************/
1794 if (gmx_mm256_any_lt(rsq10,rcutoff2))
1797 r10 = _mm256_mul_pd(rsq10,rinv10);
1799 /* EWALD ELECTROSTATICS */
1801 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1802 ewrt = _mm256_mul_pd(r10,ewtabscale);
1803 ewitab = _mm256_cvttpd_epi32(ewrt);
1804 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1805 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1806 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1807 &ewtabF,&ewtabFn);
1808 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1809 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1811 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
1813 fscal = felec;
1815 fscal = _mm256_and_pd(fscal,cutoff_mask);
1817 /* Calculate temporary vectorial force */
1818 tx = _mm256_mul_pd(fscal,dx10);
1819 ty = _mm256_mul_pd(fscal,dy10);
1820 tz = _mm256_mul_pd(fscal,dz10);
1822 /* Update vectorial force */
1823 fix1 = _mm256_add_pd(fix1,tx);
1824 fiy1 = _mm256_add_pd(fiy1,ty);
1825 fiz1 = _mm256_add_pd(fiz1,tz);
1827 fjx0 = _mm256_add_pd(fjx0,tx);
1828 fjy0 = _mm256_add_pd(fjy0,ty);
1829 fjz0 = _mm256_add_pd(fjz0,tz);
1833 /**************************
1834 * CALCULATE INTERACTIONS *
1835 **************************/
1837 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1840 r11 = _mm256_mul_pd(rsq11,rinv11);
1842 /* EWALD ELECTROSTATICS */
1844 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1845 ewrt = _mm256_mul_pd(r11,ewtabscale);
1846 ewitab = _mm256_cvttpd_epi32(ewrt);
1847 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1848 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1849 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1850 &ewtabF,&ewtabFn);
1851 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1852 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1854 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1856 fscal = felec;
1858 fscal = _mm256_and_pd(fscal,cutoff_mask);
1860 /* Calculate temporary vectorial force */
1861 tx = _mm256_mul_pd(fscal,dx11);
1862 ty = _mm256_mul_pd(fscal,dy11);
1863 tz = _mm256_mul_pd(fscal,dz11);
1865 /* Update vectorial force */
1866 fix1 = _mm256_add_pd(fix1,tx);
1867 fiy1 = _mm256_add_pd(fiy1,ty);
1868 fiz1 = _mm256_add_pd(fiz1,tz);
1870 fjx1 = _mm256_add_pd(fjx1,tx);
1871 fjy1 = _mm256_add_pd(fjy1,ty);
1872 fjz1 = _mm256_add_pd(fjz1,tz);
1876 /**************************
1877 * CALCULATE INTERACTIONS *
1878 **************************/
1880 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1883 r12 = _mm256_mul_pd(rsq12,rinv12);
1885 /* EWALD ELECTROSTATICS */
1887 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1888 ewrt = _mm256_mul_pd(r12,ewtabscale);
1889 ewitab = _mm256_cvttpd_epi32(ewrt);
1890 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1891 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1892 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1893 &ewtabF,&ewtabFn);
1894 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1895 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1897 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1899 fscal = felec;
1901 fscal = _mm256_and_pd(fscal,cutoff_mask);
1903 /* Calculate temporary vectorial force */
1904 tx = _mm256_mul_pd(fscal,dx12);
1905 ty = _mm256_mul_pd(fscal,dy12);
1906 tz = _mm256_mul_pd(fscal,dz12);
1908 /* Update vectorial force */
1909 fix1 = _mm256_add_pd(fix1,tx);
1910 fiy1 = _mm256_add_pd(fiy1,ty);
1911 fiz1 = _mm256_add_pd(fiz1,tz);
1913 fjx2 = _mm256_add_pd(fjx2,tx);
1914 fjy2 = _mm256_add_pd(fjy2,ty);
1915 fjz2 = _mm256_add_pd(fjz2,tz);
1919 /**************************
1920 * CALCULATE INTERACTIONS *
1921 **************************/
1923 if (gmx_mm256_any_lt(rsq20,rcutoff2))
1926 r20 = _mm256_mul_pd(rsq20,rinv20);
1928 /* EWALD ELECTROSTATICS */
1930 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1931 ewrt = _mm256_mul_pd(r20,ewtabscale);
1932 ewitab = _mm256_cvttpd_epi32(ewrt);
1933 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1934 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1935 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1936 &ewtabF,&ewtabFn);
1937 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1938 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1940 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
1942 fscal = felec;
1944 fscal = _mm256_and_pd(fscal,cutoff_mask);
1946 /* Calculate temporary vectorial force */
1947 tx = _mm256_mul_pd(fscal,dx20);
1948 ty = _mm256_mul_pd(fscal,dy20);
1949 tz = _mm256_mul_pd(fscal,dz20);
1951 /* Update vectorial force */
1952 fix2 = _mm256_add_pd(fix2,tx);
1953 fiy2 = _mm256_add_pd(fiy2,ty);
1954 fiz2 = _mm256_add_pd(fiz2,tz);
1956 fjx0 = _mm256_add_pd(fjx0,tx);
1957 fjy0 = _mm256_add_pd(fjy0,ty);
1958 fjz0 = _mm256_add_pd(fjz0,tz);
1962 /**************************
1963 * CALCULATE INTERACTIONS *
1964 **************************/
1966 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1969 r21 = _mm256_mul_pd(rsq21,rinv21);
1971 /* EWALD ELECTROSTATICS */
1973 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1974 ewrt = _mm256_mul_pd(r21,ewtabscale);
1975 ewitab = _mm256_cvttpd_epi32(ewrt);
1976 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1977 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1978 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1979 &ewtabF,&ewtabFn);
1980 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1981 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1983 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1985 fscal = felec;
1987 fscal = _mm256_and_pd(fscal,cutoff_mask);
1989 /* Calculate temporary vectorial force */
1990 tx = _mm256_mul_pd(fscal,dx21);
1991 ty = _mm256_mul_pd(fscal,dy21);
1992 tz = _mm256_mul_pd(fscal,dz21);
1994 /* Update vectorial force */
1995 fix2 = _mm256_add_pd(fix2,tx);
1996 fiy2 = _mm256_add_pd(fiy2,ty);
1997 fiz2 = _mm256_add_pd(fiz2,tz);
1999 fjx1 = _mm256_add_pd(fjx1,tx);
2000 fjy1 = _mm256_add_pd(fjy1,ty);
2001 fjz1 = _mm256_add_pd(fjz1,tz);
2005 /**************************
2006 * CALCULATE INTERACTIONS *
2007 **************************/
2009 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2012 r22 = _mm256_mul_pd(rsq22,rinv22);
2014 /* EWALD ELECTROSTATICS */
2016 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2017 ewrt = _mm256_mul_pd(r22,ewtabscale);
2018 ewitab = _mm256_cvttpd_epi32(ewrt);
2019 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2020 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2021 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2022 &ewtabF,&ewtabFn);
2023 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2024 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2026 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2028 fscal = felec;
2030 fscal = _mm256_and_pd(fscal,cutoff_mask);
2032 /* Calculate temporary vectorial force */
2033 tx = _mm256_mul_pd(fscal,dx22);
2034 ty = _mm256_mul_pd(fscal,dy22);
2035 tz = _mm256_mul_pd(fscal,dz22);
2037 /* Update vectorial force */
2038 fix2 = _mm256_add_pd(fix2,tx);
2039 fiy2 = _mm256_add_pd(fiy2,ty);
2040 fiz2 = _mm256_add_pd(fiz2,tz);
2042 fjx2 = _mm256_add_pd(fjx2,tx);
2043 fjy2 = _mm256_add_pd(fjy2,ty);
2044 fjz2 = _mm256_add_pd(fjz2,tz);
2048 fjptrA = f+j_coord_offsetA;
2049 fjptrB = f+j_coord_offsetB;
2050 fjptrC = f+j_coord_offsetC;
2051 fjptrD = f+j_coord_offsetD;
2053 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2054 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2056 /* Inner loop uses 351 flops */
2059 if(jidx<j_index_end)
2062 /* Get j neighbor index, and coordinate index */
2063 jnrlistA = jjnr[jidx];
2064 jnrlistB = jjnr[jidx+1];
2065 jnrlistC = jjnr[jidx+2];
2066 jnrlistD = jjnr[jidx+3];
2067 /* Sign of each element will be negative for non-real atoms.
2068 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2069 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
2071 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2073 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
2074 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
2075 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
2077 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2078 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2079 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2080 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2081 j_coord_offsetA = DIM*jnrA;
2082 j_coord_offsetB = DIM*jnrB;
2083 j_coord_offsetC = DIM*jnrC;
2084 j_coord_offsetD = DIM*jnrD;
2086 /* load j atom coordinates */
2087 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
2088 x+j_coord_offsetC,x+j_coord_offsetD,
2089 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2091 /* Calculate displacement vector */
2092 dx00 = _mm256_sub_pd(ix0,jx0);
2093 dy00 = _mm256_sub_pd(iy0,jy0);
2094 dz00 = _mm256_sub_pd(iz0,jz0);
2095 dx01 = _mm256_sub_pd(ix0,jx1);
2096 dy01 = _mm256_sub_pd(iy0,jy1);
2097 dz01 = _mm256_sub_pd(iz0,jz1);
2098 dx02 = _mm256_sub_pd(ix0,jx2);
2099 dy02 = _mm256_sub_pd(iy0,jy2);
2100 dz02 = _mm256_sub_pd(iz0,jz2);
2101 dx10 = _mm256_sub_pd(ix1,jx0);
2102 dy10 = _mm256_sub_pd(iy1,jy0);
2103 dz10 = _mm256_sub_pd(iz1,jz0);
2104 dx11 = _mm256_sub_pd(ix1,jx1);
2105 dy11 = _mm256_sub_pd(iy1,jy1);
2106 dz11 = _mm256_sub_pd(iz1,jz1);
2107 dx12 = _mm256_sub_pd(ix1,jx2);
2108 dy12 = _mm256_sub_pd(iy1,jy2);
2109 dz12 = _mm256_sub_pd(iz1,jz2);
2110 dx20 = _mm256_sub_pd(ix2,jx0);
2111 dy20 = _mm256_sub_pd(iy2,jy0);
2112 dz20 = _mm256_sub_pd(iz2,jz0);
2113 dx21 = _mm256_sub_pd(ix2,jx1);
2114 dy21 = _mm256_sub_pd(iy2,jy1);
2115 dz21 = _mm256_sub_pd(iz2,jz1);
2116 dx22 = _mm256_sub_pd(ix2,jx2);
2117 dy22 = _mm256_sub_pd(iy2,jy2);
2118 dz22 = _mm256_sub_pd(iz2,jz2);
2120 /* Calculate squared distance and things based on it */
2121 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
2122 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
2123 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
2124 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
2125 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
2126 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
2127 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
2128 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
2129 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
2131 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
2132 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
2133 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
2134 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
2135 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
2136 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
2137 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
2138 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
2139 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
2141 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
2142 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
2143 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
2144 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
2145 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
2146 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
2147 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
2148 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
2149 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
2151 fjx0 = _mm256_setzero_pd();
2152 fjy0 = _mm256_setzero_pd();
2153 fjz0 = _mm256_setzero_pd();
2154 fjx1 = _mm256_setzero_pd();
2155 fjy1 = _mm256_setzero_pd();
2156 fjz1 = _mm256_setzero_pd();
2157 fjx2 = _mm256_setzero_pd();
2158 fjy2 = _mm256_setzero_pd();
2159 fjz2 = _mm256_setzero_pd();
2161 /**************************
2162 * CALCULATE INTERACTIONS *
2163 **************************/
2165 if (gmx_mm256_any_lt(rsq00,rcutoff2))
2168 r00 = _mm256_mul_pd(rsq00,rinv00);
2169 r00 = _mm256_andnot_pd(dummy_mask,r00);
2171 /* EWALD ELECTROSTATICS */
2173 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2174 ewrt = _mm256_mul_pd(r00,ewtabscale);
2175 ewitab = _mm256_cvttpd_epi32(ewrt);
2176 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2177 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2178 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2179 &ewtabF,&ewtabFn);
2180 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2181 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
2183 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
2185 fscal = felec;
2187 fscal = _mm256_and_pd(fscal,cutoff_mask);
2189 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2191 /* Calculate temporary vectorial force */
2192 tx = _mm256_mul_pd(fscal,dx00);
2193 ty = _mm256_mul_pd(fscal,dy00);
2194 tz = _mm256_mul_pd(fscal,dz00);
2196 /* Update vectorial force */
2197 fix0 = _mm256_add_pd(fix0,tx);
2198 fiy0 = _mm256_add_pd(fiy0,ty);
2199 fiz0 = _mm256_add_pd(fiz0,tz);
2201 fjx0 = _mm256_add_pd(fjx0,tx);
2202 fjy0 = _mm256_add_pd(fjy0,ty);
2203 fjz0 = _mm256_add_pd(fjz0,tz);
2207 /**************************
2208 * CALCULATE INTERACTIONS *
2209 **************************/
2211 if (gmx_mm256_any_lt(rsq01,rcutoff2))
2214 r01 = _mm256_mul_pd(rsq01,rinv01);
2215 r01 = _mm256_andnot_pd(dummy_mask,r01);
2217 /* EWALD ELECTROSTATICS */
2219 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2220 ewrt = _mm256_mul_pd(r01,ewtabscale);
2221 ewitab = _mm256_cvttpd_epi32(ewrt);
2222 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2223 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2224 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2225 &ewtabF,&ewtabFn);
2226 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2227 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
2229 cutoff_mask = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
2231 fscal = felec;
2233 fscal = _mm256_and_pd(fscal,cutoff_mask);
2235 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2237 /* Calculate temporary vectorial force */
2238 tx = _mm256_mul_pd(fscal,dx01);
2239 ty = _mm256_mul_pd(fscal,dy01);
2240 tz = _mm256_mul_pd(fscal,dz01);
2242 /* Update vectorial force */
2243 fix0 = _mm256_add_pd(fix0,tx);
2244 fiy0 = _mm256_add_pd(fiy0,ty);
2245 fiz0 = _mm256_add_pd(fiz0,tz);
2247 fjx1 = _mm256_add_pd(fjx1,tx);
2248 fjy1 = _mm256_add_pd(fjy1,ty);
2249 fjz1 = _mm256_add_pd(fjz1,tz);
2253 /**************************
2254 * CALCULATE INTERACTIONS *
2255 **************************/
2257 if (gmx_mm256_any_lt(rsq02,rcutoff2))
2260 r02 = _mm256_mul_pd(rsq02,rinv02);
2261 r02 = _mm256_andnot_pd(dummy_mask,r02);
2263 /* EWALD ELECTROSTATICS */
2265 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2266 ewrt = _mm256_mul_pd(r02,ewtabscale);
2267 ewitab = _mm256_cvttpd_epi32(ewrt);
2268 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2269 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2270 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2271 &ewtabF,&ewtabFn);
2272 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2273 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
2275 cutoff_mask = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
2277 fscal = felec;
2279 fscal = _mm256_and_pd(fscal,cutoff_mask);
2281 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2283 /* Calculate temporary vectorial force */
2284 tx = _mm256_mul_pd(fscal,dx02);
2285 ty = _mm256_mul_pd(fscal,dy02);
2286 tz = _mm256_mul_pd(fscal,dz02);
2288 /* Update vectorial force */
2289 fix0 = _mm256_add_pd(fix0,tx);
2290 fiy0 = _mm256_add_pd(fiy0,ty);
2291 fiz0 = _mm256_add_pd(fiz0,tz);
2293 fjx2 = _mm256_add_pd(fjx2,tx);
2294 fjy2 = _mm256_add_pd(fjy2,ty);
2295 fjz2 = _mm256_add_pd(fjz2,tz);
2299 /**************************
2300 * CALCULATE INTERACTIONS *
2301 **************************/
2303 if (gmx_mm256_any_lt(rsq10,rcutoff2))
2306 r10 = _mm256_mul_pd(rsq10,rinv10);
2307 r10 = _mm256_andnot_pd(dummy_mask,r10);
2309 /* EWALD ELECTROSTATICS */
2311 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2312 ewrt = _mm256_mul_pd(r10,ewtabscale);
2313 ewitab = _mm256_cvttpd_epi32(ewrt);
2314 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2315 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2316 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2317 &ewtabF,&ewtabFn);
2318 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2319 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
2321 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
2323 fscal = felec;
2325 fscal = _mm256_and_pd(fscal,cutoff_mask);
2327 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2329 /* Calculate temporary vectorial force */
2330 tx = _mm256_mul_pd(fscal,dx10);
2331 ty = _mm256_mul_pd(fscal,dy10);
2332 tz = _mm256_mul_pd(fscal,dz10);
2334 /* Update vectorial force */
2335 fix1 = _mm256_add_pd(fix1,tx);
2336 fiy1 = _mm256_add_pd(fiy1,ty);
2337 fiz1 = _mm256_add_pd(fiz1,tz);
2339 fjx0 = _mm256_add_pd(fjx0,tx);
2340 fjy0 = _mm256_add_pd(fjy0,ty);
2341 fjz0 = _mm256_add_pd(fjz0,tz);
2345 /**************************
2346 * CALCULATE INTERACTIONS *
2347 **************************/
2349 if (gmx_mm256_any_lt(rsq11,rcutoff2))
2352 r11 = _mm256_mul_pd(rsq11,rinv11);
2353 r11 = _mm256_andnot_pd(dummy_mask,r11);
2355 /* EWALD ELECTROSTATICS */
2357 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2358 ewrt = _mm256_mul_pd(r11,ewtabscale);
2359 ewitab = _mm256_cvttpd_epi32(ewrt);
2360 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2361 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2362 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2363 &ewtabF,&ewtabFn);
2364 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2365 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
2367 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
2369 fscal = felec;
2371 fscal = _mm256_and_pd(fscal,cutoff_mask);
2373 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2375 /* Calculate temporary vectorial force */
2376 tx = _mm256_mul_pd(fscal,dx11);
2377 ty = _mm256_mul_pd(fscal,dy11);
2378 tz = _mm256_mul_pd(fscal,dz11);
2380 /* Update vectorial force */
2381 fix1 = _mm256_add_pd(fix1,tx);
2382 fiy1 = _mm256_add_pd(fiy1,ty);
2383 fiz1 = _mm256_add_pd(fiz1,tz);
2385 fjx1 = _mm256_add_pd(fjx1,tx);
2386 fjy1 = _mm256_add_pd(fjy1,ty);
2387 fjz1 = _mm256_add_pd(fjz1,tz);
2391 /**************************
2392 * CALCULATE INTERACTIONS *
2393 **************************/
2395 if (gmx_mm256_any_lt(rsq12,rcutoff2))
2398 r12 = _mm256_mul_pd(rsq12,rinv12);
2399 r12 = _mm256_andnot_pd(dummy_mask,r12);
2401 /* EWALD ELECTROSTATICS */
2403 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2404 ewrt = _mm256_mul_pd(r12,ewtabscale);
2405 ewitab = _mm256_cvttpd_epi32(ewrt);
2406 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2407 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2408 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2409 &ewtabF,&ewtabFn);
2410 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2411 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2413 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
2415 fscal = felec;
2417 fscal = _mm256_and_pd(fscal,cutoff_mask);
2419 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2421 /* Calculate temporary vectorial force */
2422 tx = _mm256_mul_pd(fscal,dx12);
2423 ty = _mm256_mul_pd(fscal,dy12);
2424 tz = _mm256_mul_pd(fscal,dz12);
2426 /* Update vectorial force */
2427 fix1 = _mm256_add_pd(fix1,tx);
2428 fiy1 = _mm256_add_pd(fiy1,ty);
2429 fiz1 = _mm256_add_pd(fiz1,tz);
2431 fjx2 = _mm256_add_pd(fjx2,tx);
2432 fjy2 = _mm256_add_pd(fjy2,ty);
2433 fjz2 = _mm256_add_pd(fjz2,tz);
2437 /**************************
2438 * CALCULATE INTERACTIONS *
2439 **************************/
2441 if (gmx_mm256_any_lt(rsq20,rcutoff2))
2444 r20 = _mm256_mul_pd(rsq20,rinv20);
2445 r20 = _mm256_andnot_pd(dummy_mask,r20);
2447 /* EWALD ELECTROSTATICS */
2449 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2450 ewrt = _mm256_mul_pd(r20,ewtabscale);
2451 ewitab = _mm256_cvttpd_epi32(ewrt);
2452 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2453 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2454 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2455 &ewtabF,&ewtabFn);
2456 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2457 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
2459 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
2461 fscal = felec;
2463 fscal = _mm256_and_pd(fscal,cutoff_mask);
2465 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2467 /* Calculate temporary vectorial force */
2468 tx = _mm256_mul_pd(fscal,dx20);
2469 ty = _mm256_mul_pd(fscal,dy20);
2470 tz = _mm256_mul_pd(fscal,dz20);
2472 /* Update vectorial force */
2473 fix2 = _mm256_add_pd(fix2,tx);
2474 fiy2 = _mm256_add_pd(fiy2,ty);
2475 fiz2 = _mm256_add_pd(fiz2,tz);
2477 fjx0 = _mm256_add_pd(fjx0,tx);
2478 fjy0 = _mm256_add_pd(fjy0,ty);
2479 fjz0 = _mm256_add_pd(fjz0,tz);
2483 /**************************
2484 * CALCULATE INTERACTIONS *
2485 **************************/
2487 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2490 r21 = _mm256_mul_pd(rsq21,rinv21);
2491 r21 = _mm256_andnot_pd(dummy_mask,r21);
2493 /* EWALD ELECTROSTATICS */
2495 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2496 ewrt = _mm256_mul_pd(r21,ewtabscale);
2497 ewitab = _mm256_cvttpd_epi32(ewrt);
2498 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2499 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2500 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2501 &ewtabF,&ewtabFn);
2502 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2503 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2505 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2507 fscal = felec;
2509 fscal = _mm256_and_pd(fscal,cutoff_mask);
2511 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2513 /* Calculate temporary vectorial force */
2514 tx = _mm256_mul_pd(fscal,dx21);
2515 ty = _mm256_mul_pd(fscal,dy21);
2516 tz = _mm256_mul_pd(fscal,dz21);
2518 /* Update vectorial force */
2519 fix2 = _mm256_add_pd(fix2,tx);
2520 fiy2 = _mm256_add_pd(fiy2,ty);
2521 fiz2 = _mm256_add_pd(fiz2,tz);
2523 fjx1 = _mm256_add_pd(fjx1,tx);
2524 fjy1 = _mm256_add_pd(fjy1,ty);
2525 fjz1 = _mm256_add_pd(fjz1,tz);
2529 /**************************
2530 * CALCULATE INTERACTIONS *
2531 **************************/
2533 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2536 r22 = _mm256_mul_pd(rsq22,rinv22);
2537 r22 = _mm256_andnot_pd(dummy_mask,r22);
2539 /* EWALD ELECTROSTATICS */
2541 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2542 ewrt = _mm256_mul_pd(r22,ewtabscale);
2543 ewitab = _mm256_cvttpd_epi32(ewrt);
2544 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2545 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2546 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2547 &ewtabF,&ewtabFn);
2548 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2549 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2551 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2553 fscal = felec;
2555 fscal = _mm256_and_pd(fscal,cutoff_mask);
2557 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2559 /* Calculate temporary vectorial force */
2560 tx = _mm256_mul_pd(fscal,dx22);
2561 ty = _mm256_mul_pd(fscal,dy22);
2562 tz = _mm256_mul_pd(fscal,dz22);
2564 /* Update vectorial force */
2565 fix2 = _mm256_add_pd(fix2,tx);
2566 fiy2 = _mm256_add_pd(fiy2,ty);
2567 fiz2 = _mm256_add_pd(fiz2,tz);
2569 fjx2 = _mm256_add_pd(fjx2,tx);
2570 fjy2 = _mm256_add_pd(fjy2,ty);
2571 fjz2 = _mm256_add_pd(fjz2,tz);
2575 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2576 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2577 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2578 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2580 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2581 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2583 /* Inner loop uses 360 flops */
2586 /* End of innermost loop */
2588 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2589 f+i_coord_offset,fshift+i_shift_offset);
2591 /* Increment number of inner iterations */
2592 inneriter += j_index_end - j_index_start;
2594 /* Outer loop uses 18 flops */
2597 /* Increment number of outer iterations */
2598 outeriter += nri;
2600 /* Update outer/inner flops */
2602 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*360);