Remove nb-parameters from t_forcerec
[gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_128_fma_single / nb_kernel_ElecEw_VdwLJ_GeomW3W3_avx_128_fma_single.c
blobcb8b0f2cd3d83fe6df01292b45791b3f5e983886
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
38 #include "gmxpre.h"
40 #include "config.h"
42 #include <math.h>
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_avx_128_fma_single.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_avx_128_fma_single
51 * Electrostatics interaction: Ewald
52 * VdW interaction: LennardJones
53 * Geometry: Water3-Water3
54 * Calculate force/pot: PotentialAndForce
56 void
57 nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_avx_128_fma_single
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, i.e. the four different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73 int jnrA,jnrB,jnrC,jnrD;
74 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
76 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
77 real rcutoff_scalar;
78 real *shiftvec,*fshift,*x,*f;
79 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
80 real scratch[4*DIM];
81 __m128 fscal,rcutoff,rcutoff2,jidxall;
82 int vdwioffset0;
83 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
84 int vdwioffset1;
85 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
86 int vdwioffset2;
87 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
88 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
89 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
90 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
91 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
92 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
93 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
94 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
95 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
96 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
97 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
98 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
99 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
100 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
101 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
102 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
103 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
104 real *charge;
105 int nvdwtype;
106 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
107 int *vdwtype;
108 real *vdwparam;
109 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
110 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
111 __m128i ewitab;
112 __m128 ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
113 __m128 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
114 real *ewtab;
115 __m128 dummy_mask,cutoff_mask;
116 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
117 __m128 one = _mm_set1_ps(1.0);
118 __m128 two = _mm_set1_ps(2.0);
119 x = xx[0];
120 f = ff[0];
122 nri = nlist->nri;
123 iinr = nlist->iinr;
124 jindex = nlist->jindex;
125 jjnr = nlist->jjnr;
126 shiftidx = nlist->shift;
127 gid = nlist->gid;
128 shiftvec = fr->shift_vec[0];
129 fshift = fr->fshift[0];
130 facel = _mm_set1_ps(fr->ic->epsfac);
131 charge = mdatoms->chargeA;
132 nvdwtype = fr->ntype;
133 vdwparam = fr->nbfp;
134 vdwtype = mdatoms->typeA;
136 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
137 beta = _mm_set1_ps(fr->ic->ewaldcoeff_q);
138 beta2 = _mm_mul_ps(beta,beta);
139 beta3 = _mm_mul_ps(beta,beta2);
140 ewtab = fr->ic->tabq_coul_FDV0;
141 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
142 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
144 /* Setup water-specific parameters */
145 inr = nlist->iinr[0];
146 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
147 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
148 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
149 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
151 jq0 = _mm_set1_ps(charge[inr+0]);
152 jq1 = _mm_set1_ps(charge[inr+1]);
153 jq2 = _mm_set1_ps(charge[inr+2]);
154 vdwjidx0A = 2*vdwtype[inr+0];
155 qq00 = _mm_mul_ps(iq0,jq0);
156 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
157 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
158 qq01 = _mm_mul_ps(iq0,jq1);
159 qq02 = _mm_mul_ps(iq0,jq2);
160 qq10 = _mm_mul_ps(iq1,jq0);
161 qq11 = _mm_mul_ps(iq1,jq1);
162 qq12 = _mm_mul_ps(iq1,jq2);
163 qq20 = _mm_mul_ps(iq2,jq0);
164 qq21 = _mm_mul_ps(iq2,jq1);
165 qq22 = _mm_mul_ps(iq2,jq2);
167 /* Zero these up front so the compiler does not warn about possibly uninitialized use */
168 jnrA = jnrB = jnrC = jnrD = 0;
169 j_coord_offsetA = 0;
170 j_coord_offsetB = 0;
171 j_coord_offsetC = 0;
172 j_coord_offsetD = 0;
174 outeriter = 0;
175 inneriter = 0;
177 for(iidx=0;iidx<4*DIM;iidx++)
179 scratch[iidx] = 0.0;
182 /* Start outer loop over neighborlists */
183 for(iidx=0; iidx<nri; iidx++)
185 /* Load shift vector for this list */
186 i_shift_offset = DIM*shiftidx[iidx];
188 /* Load limits for loop over neighbors */
189 j_index_start = jindex[iidx];
190 j_index_end = jindex[iidx+1];
192 /* Get outer coordinate index */
193 inr = iinr[iidx];
194 i_coord_offset = DIM*inr;
196 /* Load i particle coords and add shift vector */
197 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
198 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
200 fix0 = _mm_setzero_ps();
201 fiy0 = _mm_setzero_ps();
202 fiz0 = _mm_setzero_ps();
203 fix1 = _mm_setzero_ps();
204 fiy1 = _mm_setzero_ps();
205 fiz1 = _mm_setzero_ps();
206 fix2 = _mm_setzero_ps();
207 fiy2 = _mm_setzero_ps();
208 fiz2 = _mm_setzero_ps();
210 /* Reset potential sums */
211 velecsum = _mm_setzero_ps();
212 vvdwsum = _mm_setzero_ps();
214 /* Start inner kernel loop */
215 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
218 /* Get j neighbor index, and coordinate index */
219 jnrA = jjnr[jidx];
220 jnrB = jjnr[jidx+1];
221 jnrC = jjnr[jidx+2];
222 jnrD = jjnr[jidx+3];
223 j_coord_offsetA = DIM*jnrA;
224 j_coord_offsetB = DIM*jnrB;
225 j_coord_offsetC = DIM*jnrC;
226 j_coord_offsetD = DIM*jnrD;
228 /* load j atom coordinates */
229 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
230 x+j_coord_offsetC,x+j_coord_offsetD,
231 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
233 /* Calculate displacement vector */
234 dx00 = _mm_sub_ps(ix0,jx0);
235 dy00 = _mm_sub_ps(iy0,jy0);
236 dz00 = _mm_sub_ps(iz0,jz0);
237 dx01 = _mm_sub_ps(ix0,jx1);
238 dy01 = _mm_sub_ps(iy0,jy1);
239 dz01 = _mm_sub_ps(iz0,jz1);
240 dx02 = _mm_sub_ps(ix0,jx2);
241 dy02 = _mm_sub_ps(iy0,jy2);
242 dz02 = _mm_sub_ps(iz0,jz2);
243 dx10 = _mm_sub_ps(ix1,jx0);
244 dy10 = _mm_sub_ps(iy1,jy0);
245 dz10 = _mm_sub_ps(iz1,jz0);
246 dx11 = _mm_sub_ps(ix1,jx1);
247 dy11 = _mm_sub_ps(iy1,jy1);
248 dz11 = _mm_sub_ps(iz1,jz1);
249 dx12 = _mm_sub_ps(ix1,jx2);
250 dy12 = _mm_sub_ps(iy1,jy2);
251 dz12 = _mm_sub_ps(iz1,jz2);
252 dx20 = _mm_sub_ps(ix2,jx0);
253 dy20 = _mm_sub_ps(iy2,jy0);
254 dz20 = _mm_sub_ps(iz2,jz0);
255 dx21 = _mm_sub_ps(ix2,jx1);
256 dy21 = _mm_sub_ps(iy2,jy1);
257 dz21 = _mm_sub_ps(iz2,jz1);
258 dx22 = _mm_sub_ps(ix2,jx2);
259 dy22 = _mm_sub_ps(iy2,jy2);
260 dz22 = _mm_sub_ps(iz2,jz2);
262 /* Calculate squared distance and things based on it */
263 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
264 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
265 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
266 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
267 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
268 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
269 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
270 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
271 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
273 rinv00 = avx128fma_invsqrt_f(rsq00);
274 rinv01 = avx128fma_invsqrt_f(rsq01);
275 rinv02 = avx128fma_invsqrt_f(rsq02);
276 rinv10 = avx128fma_invsqrt_f(rsq10);
277 rinv11 = avx128fma_invsqrt_f(rsq11);
278 rinv12 = avx128fma_invsqrt_f(rsq12);
279 rinv20 = avx128fma_invsqrt_f(rsq20);
280 rinv21 = avx128fma_invsqrt_f(rsq21);
281 rinv22 = avx128fma_invsqrt_f(rsq22);
283 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
284 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
285 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
286 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
287 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
288 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
289 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
290 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
291 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
293 fjx0 = _mm_setzero_ps();
294 fjy0 = _mm_setzero_ps();
295 fjz0 = _mm_setzero_ps();
296 fjx1 = _mm_setzero_ps();
297 fjy1 = _mm_setzero_ps();
298 fjz1 = _mm_setzero_ps();
299 fjx2 = _mm_setzero_ps();
300 fjy2 = _mm_setzero_ps();
301 fjz2 = _mm_setzero_ps();
303 /**************************
304 * CALCULATE INTERACTIONS *
305 **************************/
307 r00 = _mm_mul_ps(rsq00,rinv00);
309 /* EWALD ELECTROSTATICS */
311 /* Analytical PME correction */
312 zeta2 = _mm_mul_ps(beta2,rsq00);
313 rinv3 = _mm_mul_ps(rinvsq00,rinv00);
314 pmecorrF = avx128fma_pmecorrF_f(zeta2);
315 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
316 felec = _mm_mul_ps(qq00,felec);
317 pmecorrV = avx128fma_pmecorrV_f(zeta2);
318 velec = _mm_nmacc_ps(pmecorrV,beta,rinv00);
319 velec = _mm_mul_ps(qq00,velec);
321 /* LENNARD-JONES DISPERSION/REPULSION */
323 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
324 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
325 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
326 vvdw = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
327 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
329 /* Update potential sum for this i atom from the interaction with this j atom. */
330 velecsum = _mm_add_ps(velecsum,velec);
331 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
333 fscal = _mm_add_ps(felec,fvdw);
335 /* Update vectorial force */
336 fix0 = _mm_macc_ps(dx00,fscal,fix0);
337 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
338 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
340 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
341 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
342 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
344 /**************************
345 * CALCULATE INTERACTIONS *
346 **************************/
348 r01 = _mm_mul_ps(rsq01,rinv01);
350 /* EWALD ELECTROSTATICS */
352 /* Analytical PME correction */
353 zeta2 = _mm_mul_ps(beta2,rsq01);
354 rinv3 = _mm_mul_ps(rinvsq01,rinv01);
355 pmecorrF = avx128fma_pmecorrF_f(zeta2);
356 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
357 felec = _mm_mul_ps(qq01,felec);
358 pmecorrV = avx128fma_pmecorrV_f(zeta2);
359 velec = _mm_nmacc_ps(pmecorrV,beta,rinv01);
360 velec = _mm_mul_ps(qq01,velec);
362 /* Update potential sum for this i atom from the interaction with this j atom. */
363 velecsum = _mm_add_ps(velecsum,velec);
365 fscal = felec;
367 /* Update vectorial force */
368 fix0 = _mm_macc_ps(dx01,fscal,fix0);
369 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
370 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
372 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
373 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
374 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
376 /**************************
377 * CALCULATE INTERACTIONS *
378 **************************/
380 r02 = _mm_mul_ps(rsq02,rinv02);
382 /* EWALD ELECTROSTATICS */
384 /* Analytical PME correction */
385 zeta2 = _mm_mul_ps(beta2,rsq02);
386 rinv3 = _mm_mul_ps(rinvsq02,rinv02);
387 pmecorrF = avx128fma_pmecorrF_f(zeta2);
388 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
389 felec = _mm_mul_ps(qq02,felec);
390 pmecorrV = avx128fma_pmecorrV_f(zeta2);
391 velec = _mm_nmacc_ps(pmecorrV,beta,rinv02);
392 velec = _mm_mul_ps(qq02,velec);
394 /* Update potential sum for this i atom from the interaction with this j atom. */
395 velecsum = _mm_add_ps(velecsum,velec);
397 fscal = felec;
399 /* Update vectorial force */
400 fix0 = _mm_macc_ps(dx02,fscal,fix0);
401 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
402 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
404 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
405 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
406 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
408 /**************************
409 * CALCULATE INTERACTIONS *
410 **************************/
412 r10 = _mm_mul_ps(rsq10,rinv10);
414 /* EWALD ELECTROSTATICS */
416 /* Analytical PME correction */
417 zeta2 = _mm_mul_ps(beta2,rsq10);
418 rinv3 = _mm_mul_ps(rinvsq10,rinv10);
419 pmecorrF = avx128fma_pmecorrF_f(zeta2);
420 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
421 felec = _mm_mul_ps(qq10,felec);
422 pmecorrV = avx128fma_pmecorrV_f(zeta2);
423 velec = _mm_nmacc_ps(pmecorrV,beta,rinv10);
424 velec = _mm_mul_ps(qq10,velec);
426 /* Update potential sum for this i atom from the interaction with this j atom. */
427 velecsum = _mm_add_ps(velecsum,velec);
429 fscal = felec;
431 /* Update vectorial force */
432 fix1 = _mm_macc_ps(dx10,fscal,fix1);
433 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
434 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
436 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
437 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
438 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
440 /**************************
441 * CALCULATE INTERACTIONS *
442 **************************/
444 r11 = _mm_mul_ps(rsq11,rinv11);
446 /* EWALD ELECTROSTATICS */
448 /* Analytical PME correction */
449 zeta2 = _mm_mul_ps(beta2,rsq11);
450 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
451 pmecorrF = avx128fma_pmecorrF_f(zeta2);
452 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
453 felec = _mm_mul_ps(qq11,felec);
454 pmecorrV = avx128fma_pmecorrV_f(zeta2);
455 velec = _mm_nmacc_ps(pmecorrV,beta,rinv11);
456 velec = _mm_mul_ps(qq11,velec);
458 /* Update potential sum for this i atom from the interaction with this j atom. */
459 velecsum = _mm_add_ps(velecsum,velec);
461 fscal = felec;
463 /* Update vectorial force */
464 fix1 = _mm_macc_ps(dx11,fscal,fix1);
465 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
466 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
468 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
469 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
470 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
472 /**************************
473 * CALCULATE INTERACTIONS *
474 **************************/
476 r12 = _mm_mul_ps(rsq12,rinv12);
478 /* EWALD ELECTROSTATICS */
480 /* Analytical PME correction */
481 zeta2 = _mm_mul_ps(beta2,rsq12);
482 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
483 pmecorrF = avx128fma_pmecorrF_f(zeta2);
484 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
485 felec = _mm_mul_ps(qq12,felec);
486 pmecorrV = avx128fma_pmecorrV_f(zeta2);
487 velec = _mm_nmacc_ps(pmecorrV,beta,rinv12);
488 velec = _mm_mul_ps(qq12,velec);
490 /* Update potential sum for this i atom from the interaction with this j atom. */
491 velecsum = _mm_add_ps(velecsum,velec);
493 fscal = felec;
495 /* Update vectorial force */
496 fix1 = _mm_macc_ps(dx12,fscal,fix1);
497 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
498 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
500 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
501 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
502 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
504 /**************************
505 * CALCULATE INTERACTIONS *
506 **************************/
508 r20 = _mm_mul_ps(rsq20,rinv20);
510 /* EWALD ELECTROSTATICS */
512 /* Analytical PME correction */
513 zeta2 = _mm_mul_ps(beta2,rsq20);
514 rinv3 = _mm_mul_ps(rinvsq20,rinv20);
515 pmecorrF = avx128fma_pmecorrF_f(zeta2);
516 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
517 felec = _mm_mul_ps(qq20,felec);
518 pmecorrV = avx128fma_pmecorrV_f(zeta2);
519 velec = _mm_nmacc_ps(pmecorrV,beta,rinv20);
520 velec = _mm_mul_ps(qq20,velec);
522 /* Update potential sum for this i atom from the interaction with this j atom. */
523 velecsum = _mm_add_ps(velecsum,velec);
525 fscal = felec;
527 /* Update vectorial force */
528 fix2 = _mm_macc_ps(dx20,fscal,fix2);
529 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
530 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
532 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
533 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
534 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
536 /**************************
537 * CALCULATE INTERACTIONS *
538 **************************/
540 r21 = _mm_mul_ps(rsq21,rinv21);
542 /* EWALD ELECTROSTATICS */
544 /* Analytical PME correction */
545 zeta2 = _mm_mul_ps(beta2,rsq21);
546 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
547 pmecorrF = avx128fma_pmecorrF_f(zeta2);
548 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
549 felec = _mm_mul_ps(qq21,felec);
550 pmecorrV = avx128fma_pmecorrV_f(zeta2);
551 velec = _mm_nmacc_ps(pmecorrV,beta,rinv21);
552 velec = _mm_mul_ps(qq21,velec);
554 /* Update potential sum for this i atom from the interaction with this j atom. */
555 velecsum = _mm_add_ps(velecsum,velec);
557 fscal = felec;
559 /* Update vectorial force */
560 fix2 = _mm_macc_ps(dx21,fscal,fix2);
561 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
562 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
564 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
565 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
566 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
568 /**************************
569 * CALCULATE INTERACTIONS *
570 **************************/
572 r22 = _mm_mul_ps(rsq22,rinv22);
574 /* EWALD ELECTROSTATICS */
576 /* Analytical PME correction */
577 zeta2 = _mm_mul_ps(beta2,rsq22);
578 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
579 pmecorrF = avx128fma_pmecorrF_f(zeta2);
580 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
581 felec = _mm_mul_ps(qq22,felec);
582 pmecorrV = avx128fma_pmecorrV_f(zeta2);
583 velec = _mm_nmacc_ps(pmecorrV,beta,rinv22);
584 velec = _mm_mul_ps(qq22,velec);
586 /* Update potential sum for this i atom from the interaction with this j atom. */
587 velecsum = _mm_add_ps(velecsum,velec);
589 fscal = felec;
591 /* Update vectorial force */
592 fix2 = _mm_macc_ps(dx22,fscal,fix2);
593 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
594 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
596 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
597 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
598 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
600 fjptrA = f+j_coord_offsetA;
601 fjptrB = f+j_coord_offsetB;
602 fjptrC = f+j_coord_offsetC;
603 fjptrD = f+j_coord_offsetD;
605 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
606 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
608 /* Inner loop uses 273 flops */
611 if(jidx<j_index_end)
614 /* Get j neighbor index, and coordinate index */
615 jnrlistA = jjnr[jidx];
616 jnrlistB = jjnr[jidx+1];
617 jnrlistC = jjnr[jidx+2];
618 jnrlistD = jjnr[jidx+3];
619 /* Sign of each element will be negative for non-real atoms.
620 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
621 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
623 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
624 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
625 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
626 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
627 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
628 j_coord_offsetA = DIM*jnrA;
629 j_coord_offsetB = DIM*jnrB;
630 j_coord_offsetC = DIM*jnrC;
631 j_coord_offsetD = DIM*jnrD;
633 /* load j atom coordinates */
634 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
635 x+j_coord_offsetC,x+j_coord_offsetD,
636 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
638 /* Calculate displacement vector */
639 dx00 = _mm_sub_ps(ix0,jx0);
640 dy00 = _mm_sub_ps(iy0,jy0);
641 dz00 = _mm_sub_ps(iz0,jz0);
642 dx01 = _mm_sub_ps(ix0,jx1);
643 dy01 = _mm_sub_ps(iy0,jy1);
644 dz01 = _mm_sub_ps(iz0,jz1);
645 dx02 = _mm_sub_ps(ix0,jx2);
646 dy02 = _mm_sub_ps(iy0,jy2);
647 dz02 = _mm_sub_ps(iz0,jz2);
648 dx10 = _mm_sub_ps(ix1,jx0);
649 dy10 = _mm_sub_ps(iy1,jy0);
650 dz10 = _mm_sub_ps(iz1,jz0);
651 dx11 = _mm_sub_ps(ix1,jx1);
652 dy11 = _mm_sub_ps(iy1,jy1);
653 dz11 = _mm_sub_ps(iz1,jz1);
654 dx12 = _mm_sub_ps(ix1,jx2);
655 dy12 = _mm_sub_ps(iy1,jy2);
656 dz12 = _mm_sub_ps(iz1,jz2);
657 dx20 = _mm_sub_ps(ix2,jx0);
658 dy20 = _mm_sub_ps(iy2,jy0);
659 dz20 = _mm_sub_ps(iz2,jz0);
660 dx21 = _mm_sub_ps(ix2,jx1);
661 dy21 = _mm_sub_ps(iy2,jy1);
662 dz21 = _mm_sub_ps(iz2,jz1);
663 dx22 = _mm_sub_ps(ix2,jx2);
664 dy22 = _mm_sub_ps(iy2,jy2);
665 dz22 = _mm_sub_ps(iz2,jz2);
667 /* Calculate squared distance and things based on it */
668 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
669 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
670 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
671 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
672 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
673 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
674 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
675 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
676 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
678 rinv00 = avx128fma_invsqrt_f(rsq00);
679 rinv01 = avx128fma_invsqrt_f(rsq01);
680 rinv02 = avx128fma_invsqrt_f(rsq02);
681 rinv10 = avx128fma_invsqrt_f(rsq10);
682 rinv11 = avx128fma_invsqrt_f(rsq11);
683 rinv12 = avx128fma_invsqrt_f(rsq12);
684 rinv20 = avx128fma_invsqrt_f(rsq20);
685 rinv21 = avx128fma_invsqrt_f(rsq21);
686 rinv22 = avx128fma_invsqrt_f(rsq22);
688 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
689 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
690 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
691 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
692 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
693 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
694 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
695 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
696 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
698 fjx0 = _mm_setzero_ps();
699 fjy0 = _mm_setzero_ps();
700 fjz0 = _mm_setzero_ps();
701 fjx1 = _mm_setzero_ps();
702 fjy1 = _mm_setzero_ps();
703 fjz1 = _mm_setzero_ps();
704 fjx2 = _mm_setzero_ps();
705 fjy2 = _mm_setzero_ps();
706 fjz2 = _mm_setzero_ps();
708 /**************************
709 * CALCULATE INTERACTIONS *
710 **************************/
712 r00 = _mm_mul_ps(rsq00,rinv00);
713 r00 = _mm_andnot_ps(dummy_mask,r00);
715 /* EWALD ELECTROSTATICS */
717 /* Analytical PME correction */
718 zeta2 = _mm_mul_ps(beta2,rsq00);
719 rinv3 = _mm_mul_ps(rinvsq00,rinv00);
720 pmecorrF = avx128fma_pmecorrF_f(zeta2);
721 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
722 felec = _mm_mul_ps(qq00,felec);
723 pmecorrV = avx128fma_pmecorrV_f(zeta2);
724 velec = _mm_nmacc_ps(pmecorrV,beta,rinv00);
725 velec = _mm_mul_ps(qq00,velec);
727 /* LENNARD-JONES DISPERSION/REPULSION */
729 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
730 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
731 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
732 vvdw = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
733 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
735 /* Update potential sum for this i atom from the interaction with this j atom. */
736 velec = _mm_andnot_ps(dummy_mask,velec);
737 velecsum = _mm_add_ps(velecsum,velec);
738 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
739 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
741 fscal = _mm_add_ps(felec,fvdw);
743 fscal = _mm_andnot_ps(dummy_mask,fscal);
745 /* Update vectorial force */
746 fix0 = _mm_macc_ps(dx00,fscal,fix0);
747 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
748 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
750 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
751 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
752 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
754 /**************************
755 * CALCULATE INTERACTIONS *
756 **************************/
758 r01 = _mm_mul_ps(rsq01,rinv01);
759 r01 = _mm_andnot_ps(dummy_mask,r01);
761 /* EWALD ELECTROSTATICS */
763 /* Analytical PME correction */
764 zeta2 = _mm_mul_ps(beta2,rsq01);
765 rinv3 = _mm_mul_ps(rinvsq01,rinv01);
766 pmecorrF = avx128fma_pmecorrF_f(zeta2);
767 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
768 felec = _mm_mul_ps(qq01,felec);
769 pmecorrV = avx128fma_pmecorrV_f(zeta2);
770 velec = _mm_nmacc_ps(pmecorrV,beta,rinv01);
771 velec = _mm_mul_ps(qq01,velec);
773 /* Update potential sum for this i atom from the interaction with this j atom. */
774 velec = _mm_andnot_ps(dummy_mask,velec);
775 velecsum = _mm_add_ps(velecsum,velec);
777 fscal = felec;
779 fscal = _mm_andnot_ps(dummy_mask,fscal);
781 /* Update vectorial force */
782 fix0 = _mm_macc_ps(dx01,fscal,fix0);
783 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
784 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
786 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
787 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
788 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
790 /**************************
791 * CALCULATE INTERACTIONS *
792 **************************/
794 r02 = _mm_mul_ps(rsq02,rinv02);
795 r02 = _mm_andnot_ps(dummy_mask,r02);
797 /* EWALD ELECTROSTATICS */
799 /* Analytical PME correction */
800 zeta2 = _mm_mul_ps(beta2,rsq02);
801 rinv3 = _mm_mul_ps(rinvsq02,rinv02);
802 pmecorrF = avx128fma_pmecorrF_f(zeta2);
803 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
804 felec = _mm_mul_ps(qq02,felec);
805 pmecorrV = avx128fma_pmecorrV_f(zeta2);
806 velec = _mm_nmacc_ps(pmecorrV,beta,rinv02);
807 velec = _mm_mul_ps(qq02,velec);
809 /* Update potential sum for this i atom from the interaction with this j atom. */
810 velec = _mm_andnot_ps(dummy_mask,velec);
811 velecsum = _mm_add_ps(velecsum,velec);
813 fscal = felec;
815 fscal = _mm_andnot_ps(dummy_mask,fscal);
817 /* Update vectorial force */
818 fix0 = _mm_macc_ps(dx02,fscal,fix0);
819 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
820 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
822 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
823 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
824 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
826 /**************************
827 * CALCULATE INTERACTIONS *
828 **************************/
830 r10 = _mm_mul_ps(rsq10,rinv10);
831 r10 = _mm_andnot_ps(dummy_mask,r10);
833 /* EWALD ELECTROSTATICS */
835 /* Analytical PME correction */
836 zeta2 = _mm_mul_ps(beta2,rsq10);
837 rinv3 = _mm_mul_ps(rinvsq10,rinv10);
838 pmecorrF = avx128fma_pmecorrF_f(zeta2);
839 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
840 felec = _mm_mul_ps(qq10,felec);
841 pmecorrV = avx128fma_pmecorrV_f(zeta2);
842 velec = _mm_nmacc_ps(pmecorrV,beta,rinv10);
843 velec = _mm_mul_ps(qq10,velec);
845 /* Update potential sum for this i atom from the interaction with this j atom. */
846 velec = _mm_andnot_ps(dummy_mask,velec);
847 velecsum = _mm_add_ps(velecsum,velec);
849 fscal = felec;
851 fscal = _mm_andnot_ps(dummy_mask,fscal);
853 /* Update vectorial force */
854 fix1 = _mm_macc_ps(dx10,fscal,fix1);
855 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
856 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
858 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
859 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
860 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
862 /**************************
863 * CALCULATE INTERACTIONS *
864 **************************/
866 r11 = _mm_mul_ps(rsq11,rinv11);
867 r11 = _mm_andnot_ps(dummy_mask,r11);
869 /* EWALD ELECTROSTATICS */
871 /* Analytical PME correction */
872 zeta2 = _mm_mul_ps(beta2,rsq11);
873 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
874 pmecorrF = avx128fma_pmecorrF_f(zeta2);
875 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
876 felec = _mm_mul_ps(qq11,felec);
877 pmecorrV = avx128fma_pmecorrV_f(zeta2);
878 velec = _mm_nmacc_ps(pmecorrV,beta,rinv11);
879 velec = _mm_mul_ps(qq11,velec);
881 /* Update potential sum for this i atom from the interaction with this j atom. */
882 velec = _mm_andnot_ps(dummy_mask,velec);
883 velecsum = _mm_add_ps(velecsum,velec);
885 fscal = felec;
887 fscal = _mm_andnot_ps(dummy_mask,fscal);
889 /* Update vectorial force */
890 fix1 = _mm_macc_ps(dx11,fscal,fix1);
891 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
892 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
894 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
895 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
896 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
898 /**************************
899 * CALCULATE INTERACTIONS *
900 **************************/
902 r12 = _mm_mul_ps(rsq12,rinv12);
903 r12 = _mm_andnot_ps(dummy_mask,r12);
905 /* EWALD ELECTROSTATICS */
907 /* Analytical PME correction */
908 zeta2 = _mm_mul_ps(beta2,rsq12);
909 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
910 pmecorrF = avx128fma_pmecorrF_f(zeta2);
911 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
912 felec = _mm_mul_ps(qq12,felec);
913 pmecorrV = avx128fma_pmecorrV_f(zeta2);
914 velec = _mm_nmacc_ps(pmecorrV,beta,rinv12);
915 velec = _mm_mul_ps(qq12,velec);
917 /* Update potential sum for this i atom from the interaction with this j atom. */
918 velec = _mm_andnot_ps(dummy_mask,velec);
919 velecsum = _mm_add_ps(velecsum,velec);
921 fscal = felec;
923 fscal = _mm_andnot_ps(dummy_mask,fscal);
925 /* Update vectorial force */
926 fix1 = _mm_macc_ps(dx12,fscal,fix1);
927 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
928 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
930 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
931 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
932 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
934 /**************************
935 * CALCULATE INTERACTIONS *
936 **************************/
938 r20 = _mm_mul_ps(rsq20,rinv20);
939 r20 = _mm_andnot_ps(dummy_mask,r20);
941 /* EWALD ELECTROSTATICS */
943 /* Analytical PME correction */
944 zeta2 = _mm_mul_ps(beta2,rsq20);
945 rinv3 = _mm_mul_ps(rinvsq20,rinv20);
946 pmecorrF = avx128fma_pmecorrF_f(zeta2);
947 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
948 felec = _mm_mul_ps(qq20,felec);
949 pmecorrV = avx128fma_pmecorrV_f(zeta2);
950 velec = _mm_nmacc_ps(pmecorrV,beta,rinv20);
951 velec = _mm_mul_ps(qq20,velec);
953 /* Update potential sum for this i atom from the interaction with this j atom. */
954 velec = _mm_andnot_ps(dummy_mask,velec);
955 velecsum = _mm_add_ps(velecsum,velec);
957 fscal = felec;
959 fscal = _mm_andnot_ps(dummy_mask,fscal);
961 /* Update vectorial force */
962 fix2 = _mm_macc_ps(dx20,fscal,fix2);
963 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
964 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
966 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
967 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
968 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
970 /**************************
971 * CALCULATE INTERACTIONS *
972 **************************/
974 r21 = _mm_mul_ps(rsq21,rinv21);
975 r21 = _mm_andnot_ps(dummy_mask,r21);
977 /* EWALD ELECTROSTATICS */
979 /* Analytical PME correction */
980 zeta2 = _mm_mul_ps(beta2,rsq21);
981 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
982 pmecorrF = avx128fma_pmecorrF_f(zeta2);
983 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
984 felec = _mm_mul_ps(qq21,felec);
985 pmecorrV = avx128fma_pmecorrV_f(zeta2);
986 velec = _mm_nmacc_ps(pmecorrV,beta,rinv21);
987 velec = _mm_mul_ps(qq21,velec);
989 /* Update potential sum for this i atom from the interaction with this j atom. */
990 velec = _mm_andnot_ps(dummy_mask,velec);
991 velecsum = _mm_add_ps(velecsum,velec);
993 fscal = felec;
995 fscal = _mm_andnot_ps(dummy_mask,fscal);
997 /* Update vectorial force */
998 fix2 = _mm_macc_ps(dx21,fscal,fix2);
999 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1000 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1002 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1003 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1004 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1006 /**************************
1007 * CALCULATE INTERACTIONS *
1008 **************************/
1010 r22 = _mm_mul_ps(rsq22,rinv22);
1011 r22 = _mm_andnot_ps(dummy_mask,r22);
1013 /* EWALD ELECTROSTATICS */
1015 /* Analytical PME correction */
1016 zeta2 = _mm_mul_ps(beta2,rsq22);
1017 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
1018 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1019 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1020 felec = _mm_mul_ps(qq22,felec);
1021 pmecorrV = avx128fma_pmecorrV_f(zeta2);
1022 velec = _mm_nmacc_ps(pmecorrV,beta,rinv22);
1023 velec = _mm_mul_ps(qq22,velec);
1025 /* Update potential sum for this i atom from the interaction with this j atom. */
1026 velec = _mm_andnot_ps(dummy_mask,velec);
1027 velecsum = _mm_add_ps(velecsum,velec);
1029 fscal = felec;
1031 fscal = _mm_andnot_ps(dummy_mask,fscal);
1033 /* Update vectorial force */
1034 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1035 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1036 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1038 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1039 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1040 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
1042 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1043 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1044 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1045 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1047 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1048 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1050 /* Inner loop uses 282 flops */
1053 /* End of innermost loop */
1055 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1056 f+i_coord_offset,fshift+i_shift_offset);
1058 ggid = gid[iidx];
1059 /* Update potential energies */
1060 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1061 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1063 /* Increment number of inner iterations */
1064 inneriter += j_index_end - j_index_start;
1066 /* Outer loop uses 20 flops */
1069 /* Increment number of outer iterations */
1070 outeriter += nri;
1072 /* Update outer/inner flops */
1074 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*282);
1077 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_avx_128_fma_single
1078 * Electrostatics interaction: Ewald
1079 * VdW interaction: LennardJones
1080 * Geometry: Water3-Water3
1081 * Calculate force/pot: Force
/*
 * Force-only kernel: Ewald electrostatics plus Lennard-Jones for pairs of
 * three-site waters.
 *
 * For every i entry of the neighbor list the three i-site positions are
 * loaded together with the shift vector, and all nine i-site/j-site pairs
 * are evaluated for four j entries at a time (one j entry per SIMD lane).
 * Every pair gets the electrostatic force with the analytical PME
 * correction; only the 0-0 pair additionally gets the Lennard-Jones force.
 * No potential energy is accumulated or stored in this function.
 *
 * nlist        neighbor list; nri, iinr, jindex, jjnr, shift and gid are read
 * xx           coordinates, accessed as a flat real array through xx[0]
 * ff           forces, accessed as a flat real array through ff[0] and handed
 *              to the decrement/update helpers (presumably modified in place;
 *              the helpers are defined elsewhere)
 * fr           source of shift vectors, shift forces, epsfac, the LJ
 *              parameter table and the Ewald settings
 * mdatoms      source of chargeA and typeA
 * kernel_data  not referenced in this function (declared gmx_unused)
 * nrnb         flop accounting, updated once at the end through inc_nrnb()
 *
 * Charges and LJ parameters for both the i and the j waters are read once
 * from the atoms starting at nlist->iinr[0].
 * NOTE(review): this presumes every water in the list shares those
 * parameters - confirm against the code that builds the list.
 *
 * Several locals (for example the rXX distances and the tabulated-Ewald
 * values) are assigned but never read by the force expressions below.
 */
1083 void
1084 nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_avx_128_fma_single
1085 (t_nblist * gmx_restrict nlist,
1086 rvec * gmx_restrict xx,
1087 rvec * gmx_restrict ff,
1088 struct t_forcerec * gmx_restrict fr,
1089 t_mdatoms * gmx_restrict mdatoms,
1090 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1091 t_nrnb * gmx_restrict nrnb)
1093 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1094 * just 0 for non-waters.
1095 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1096 * jnr indices corresponding to data put in the four positions in the SIMD register.
1098 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1099 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1100 int jnrA,jnrB,jnrC,jnrD;
1101 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1102 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1103 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1104 real rcutoff_scalar;
1105 real *shiftvec,*fshift,*x,*f;
1106 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1107 real scratch[4*DIM];
1108 __m128 fscal,rcutoff,rcutoff2,jidxall;
1109 int vdwioffset0;
1110 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1111 int vdwioffset1;
1112 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1113 int vdwioffset2;
1114 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1115 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1116 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1117 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1118 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1119 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1120 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1121 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1122 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1123 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1124 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1125 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1126 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1127 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1128 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1129 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1130 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1131 real *charge;
1132 int nvdwtype;
1133 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1134 int *vdwtype;
1135 real *vdwparam;
1136 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1137 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1138 __m128i ewitab;
1139 __m128 ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1140 __m128 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1141 real *ewtab;
1142 __m128 dummy_mask,cutoff_mask;
1143 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1144 __m128 one = _mm_set1_ps(1.0);
1145 __m128 two = _mm_set1_ps(2.0);
1146 x = xx[0];
1147 f = ff[0];
1149 nri = nlist->nri;
1150 iinr = nlist->iinr;
1151 jindex = nlist->jindex;
1152 jjnr = nlist->jjnr;
1153 shiftidx = nlist->shift;
1154 gid = nlist->gid;
1155 shiftvec = fr->shift_vec[0];
1156 fshift = fr->fshift[0];
1157 facel = _mm_set1_ps(fr->ic->epsfac);
1158 charge = mdatoms->chargeA;
1159 nvdwtype = fr->ntype;
1160 vdwparam = fr->nbfp;
1161 vdwtype = mdatoms->typeA;
1163 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1164 beta = _mm_set1_ps(fr->ic->ewaldcoeff_q);
1165 beta2 = _mm_mul_ps(beta,beta);
1166 beta3 = _mm_mul_ps(beta,beta2);
1167 ewtab = fr->ic->tabq_coul_F;
1168 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1169 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1171 /* Setup water-specific parameters */
1172 inr = nlist->iinr[0];
1173 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1174 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1175 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1176 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
/* The j-water charges and LJ type index are read from the same atoms
 * (inr = nlist->iinr[0]) as the i-water values above; the nine qq products
 * and the single c6/c12 pair are therefore constant for the whole call. */
1178 jq0 = _mm_set1_ps(charge[inr+0]);
1179 jq1 = _mm_set1_ps(charge[inr+1]);
1180 jq2 = _mm_set1_ps(charge[inr+2]);
1181 vdwjidx0A = 2*vdwtype[inr+0];
1182 qq00 = _mm_mul_ps(iq0,jq0);
1183 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1184 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1185 qq01 = _mm_mul_ps(iq0,jq1);
1186 qq02 = _mm_mul_ps(iq0,jq2);
1187 qq10 = _mm_mul_ps(iq1,jq0);
1188 qq11 = _mm_mul_ps(iq1,jq1);
1189 qq12 = _mm_mul_ps(iq1,jq2);
1190 qq20 = _mm_mul_ps(iq2,jq0);
1191 qq21 = _mm_mul_ps(iq2,jq1);
1192 qq22 = _mm_mul_ps(iq2,jq2);
1194 /* Avoid stupid compiler warnings */
1195 jnrA = jnrB = jnrC = jnrD = 0;
1196 j_coord_offsetA = 0;
1197 j_coord_offsetB = 0;
1198 j_coord_offsetC = 0;
1199 j_coord_offsetD = 0;
1201 outeriter = 0;
1202 inneriter = 0;
/* Clear the scratch buffer; further down it is used as the force target
 * for lanes whose j list entry is negative. */
1204 for(iidx=0;iidx<4*DIM;iidx++)
1206 scratch[iidx] = 0.0;
1209 /* Start outer loop over neighborlists */
1210 for(iidx=0; iidx<nri; iidx++)
1212 /* Load shift vector for this list */
1213 i_shift_offset = DIM*shiftidx[iidx];
1215 /* Load limits for loop over neighbors */
1216 j_index_start = jindex[iidx];
1217 j_index_end = jindex[iidx+1];
1219 /* Get outer coordinate index */
1220 inr = iinr[iidx];
1221 i_coord_offset = DIM*inr;
1223 /* Load i particle coords and add shift vector */
1224 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1225 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1227 fix0 = _mm_setzero_ps();
1228 fiy0 = _mm_setzero_ps();
1229 fiz0 = _mm_setzero_ps();
1230 fix1 = _mm_setzero_ps();
1231 fiy1 = _mm_setzero_ps();
1232 fiz1 = _mm_setzero_ps();
1233 fix2 = _mm_setzero_ps();
1234 fiy2 = _mm_setzero_ps();
1235 fiz2 = _mm_setzero_ps();
1237 /* Start inner kernel loop */
/* Continues only while the fourth entry of the current block of four is
 * non-negative, so no masking is applied inside this loop. */
1238 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1241 /* Get j neighbor index, and coordinate index */
1242 jnrA = jjnr[jidx];
1243 jnrB = jjnr[jidx+1];
1244 jnrC = jjnr[jidx+2];
1245 jnrD = jjnr[jidx+3];
1246 j_coord_offsetA = DIM*jnrA;
1247 j_coord_offsetB = DIM*jnrB;
1248 j_coord_offsetC = DIM*jnrC;
1249 j_coord_offsetD = DIM*jnrD;
1251 /* load j atom coordinates */
1252 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1253 x+j_coord_offsetC,x+j_coord_offsetD,
1254 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1256 /* Calculate displacement vector */
1257 dx00 = _mm_sub_ps(ix0,jx0);
1258 dy00 = _mm_sub_ps(iy0,jy0);
1259 dz00 = _mm_sub_ps(iz0,jz0);
1260 dx01 = _mm_sub_ps(ix0,jx1);
1261 dy01 = _mm_sub_ps(iy0,jy1);
1262 dz01 = _mm_sub_ps(iz0,jz1);
1263 dx02 = _mm_sub_ps(ix0,jx2);
1264 dy02 = _mm_sub_ps(iy0,jy2);
1265 dz02 = _mm_sub_ps(iz0,jz2);
1266 dx10 = _mm_sub_ps(ix1,jx0);
1267 dy10 = _mm_sub_ps(iy1,jy0);
1268 dz10 = _mm_sub_ps(iz1,jz0);
1269 dx11 = _mm_sub_ps(ix1,jx1);
1270 dy11 = _mm_sub_ps(iy1,jy1);
1271 dz11 = _mm_sub_ps(iz1,jz1);
1272 dx12 = _mm_sub_ps(ix1,jx2);
1273 dy12 = _mm_sub_ps(iy1,jy2);
1274 dz12 = _mm_sub_ps(iz1,jz2);
1275 dx20 = _mm_sub_ps(ix2,jx0);
1276 dy20 = _mm_sub_ps(iy2,jy0);
1277 dz20 = _mm_sub_ps(iz2,jz0);
1278 dx21 = _mm_sub_ps(ix2,jx1);
1279 dy21 = _mm_sub_ps(iy2,jy1);
1280 dz21 = _mm_sub_ps(iz2,jz1);
1281 dx22 = _mm_sub_ps(ix2,jx2);
1282 dy22 = _mm_sub_ps(iy2,jy2);
1283 dz22 = _mm_sub_ps(iz2,jz2);
1285 /* Calculate squared distance and things based on it */
1286 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1287 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1288 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1289 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1290 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1291 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1292 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1293 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1294 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1296 rinv00 = avx128fma_invsqrt_f(rsq00);
1297 rinv01 = avx128fma_invsqrt_f(rsq01);
1298 rinv02 = avx128fma_invsqrt_f(rsq02);
1299 rinv10 = avx128fma_invsqrt_f(rsq10);
1300 rinv11 = avx128fma_invsqrt_f(rsq11);
1301 rinv12 = avx128fma_invsqrt_f(rsq12);
1302 rinv20 = avx128fma_invsqrt_f(rsq20);
1303 rinv21 = avx128fma_invsqrt_f(rsq21);
1304 rinv22 = avx128fma_invsqrt_f(rsq22);
1306 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1307 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1308 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1309 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1310 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1311 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1312 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1313 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1314 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1316 fjx0 = _mm_setzero_ps();
1317 fjy0 = _mm_setzero_ps();
1318 fjz0 = _mm_setzero_ps();
1319 fjx1 = _mm_setzero_ps();
1320 fjy1 = _mm_setzero_ps();
1321 fjz1 = _mm_setzero_ps();
1322 fjx2 = _mm_setzero_ps();
1323 fjy2 = _mm_setzero_ps();
1324 fjz2 = _mm_setzero_ps();
1326 /**************************
1327 * CALCULATE INTERACTIONS *
1328 **************************/
1330 r00 = _mm_mul_ps(rsq00,rinv00);
1332 /* EWALD ELECTROSTATICS */
1334 /* Analytical PME correction */
1335 zeta2 = _mm_mul_ps(beta2,rsq00);
1336 rinv3 = _mm_mul_ps(rinvsq00,rinv00);
1337 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1338 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1339 felec = _mm_mul_ps(qq00,felec);
1341 /* LENNARD-JONES DISPERSION/REPULSION */
1343 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
/* fvdw = (c12_00*rinvsix - c6_00) * (rinvsix*rinvsq00) */
1344 fvdw = _mm_mul_ps(_mm_msub_ps(c12_00,rinvsix,c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1346 fscal = _mm_add_ps(felec,fvdw);
1348 /* Update vectorial force */
1349 fix0 = _mm_macc_ps(dx00,fscal,fix0);
1350 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
1351 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
1353 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
1354 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
1355 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
1357 /**************************
1358 * CALCULATE INTERACTIONS *
1359 **************************/
1361 r01 = _mm_mul_ps(rsq01,rinv01);
1363 /* EWALD ELECTROSTATICS */
1365 /* Analytical PME correction */
1366 zeta2 = _mm_mul_ps(beta2,rsq01);
1367 rinv3 = _mm_mul_ps(rinvsq01,rinv01);
1368 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1369 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1370 felec = _mm_mul_ps(qq01,felec);
1372 fscal = felec;
1374 /* Update vectorial force */
1375 fix0 = _mm_macc_ps(dx01,fscal,fix0);
1376 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
1377 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
1379 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
1380 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
1381 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
1383 /**************************
1384 * CALCULATE INTERACTIONS *
1385 **************************/
1387 r02 = _mm_mul_ps(rsq02,rinv02);
1389 /* EWALD ELECTROSTATICS */
1391 /* Analytical PME correction */
1392 zeta2 = _mm_mul_ps(beta2,rsq02);
1393 rinv3 = _mm_mul_ps(rinvsq02,rinv02);
1394 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1395 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1396 felec = _mm_mul_ps(qq02,felec);
1398 fscal = felec;
1400 /* Update vectorial force */
1401 fix0 = _mm_macc_ps(dx02,fscal,fix0);
1402 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
1403 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
1405 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
1406 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
1407 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
1409 /**************************
1410 * CALCULATE INTERACTIONS *
1411 **************************/
1413 r10 = _mm_mul_ps(rsq10,rinv10);
1415 /* EWALD ELECTROSTATICS */
1417 /* Analytical PME correction */
1418 zeta2 = _mm_mul_ps(beta2,rsq10);
1419 rinv3 = _mm_mul_ps(rinvsq10,rinv10);
1420 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1421 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1422 felec = _mm_mul_ps(qq10,felec);
1424 fscal = felec;
1426 /* Update vectorial force */
1427 fix1 = _mm_macc_ps(dx10,fscal,fix1);
1428 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
1429 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
1431 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
1432 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
1433 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
1435 /**************************
1436 * CALCULATE INTERACTIONS *
1437 **************************/
1439 r11 = _mm_mul_ps(rsq11,rinv11);
1441 /* EWALD ELECTROSTATICS */
1443 /* Analytical PME correction */
1444 zeta2 = _mm_mul_ps(beta2,rsq11);
1445 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
1446 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1447 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1448 felec = _mm_mul_ps(qq11,felec);
1450 fscal = felec;
1452 /* Update vectorial force */
1453 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1454 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1455 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1457 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1458 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1459 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1461 /**************************
1462 * CALCULATE INTERACTIONS *
1463 **************************/
1465 r12 = _mm_mul_ps(rsq12,rinv12);
1467 /* EWALD ELECTROSTATICS */
1469 /* Analytical PME correction */
1470 zeta2 = _mm_mul_ps(beta2,rsq12);
1471 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
1472 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1473 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1474 felec = _mm_mul_ps(qq12,felec);
1476 fscal = felec;
1478 /* Update vectorial force */
1479 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1480 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1481 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1483 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1484 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1485 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1487 /**************************
1488 * CALCULATE INTERACTIONS *
1489 **************************/
1491 r20 = _mm_mul_ps(rsq20,rinv20);
1493 /* EWALD ELECTROSTATICS */
1495 /* Analytical PME correction */
1496 zeta2 = _mm_mul_ps(beta2,rsq20);
1497 rinv3 = _mm_mul_ps(rinvsq20,rinv20);
1498 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1499 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1500 felec = _mm_mul_ps(qq20,felec);
1502 fscal = felec;
1504 /* Update vectorial force */
1505 fix2 = _mm_macc_ps(dx20,fscal,fix2);
1506 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
1507 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
1509 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
1510 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
1511 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
1513 /**************************
1514 * CALCULATE INTERACTIONS *
1515 **************************/
1517 r21 = _mm_mul_ps(rsq21,rinv21);
1519 /* EWALD ELECTROSTATICS */
1521 /* Analytical PME correction */
1522 zeta2 = _mm_mul_ps(beta2,rsq21);
1523 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
1524 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1525 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1526 felec = _mm_mul_ps(qq21,felec);
1528 fscal = felec;
1530 /* Update vectorial force */
1531 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1532 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1533 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1535 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1536 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1537 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1539 /**************************
1540 * CALCULATE INTERACTIONS *
1541 **************************/
1543 r22 = _mm_mul_ps(rsq22,rinv22);
1545 /* EWALD ELECTROSTATICS */
1547 /* Analytical PME correction */
1548 zeta2 = _mm_mul_ps(beta2,rsq22);
1549 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
1550 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1551 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1552 felec = _mm_mul_ps(qq22,felec);
1554 fscal = felec;
1556 /* Update vectorial force */
1557 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1558 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1559 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1561 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1562 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1563 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
1565 fjptrA = f+j_coord_offsetA;
1566 fjptrB = f+j_coord_offsetB;
1567 fjptrC = f+j_coord_offsetC;
1568 fjptrD = f+j_coord_offsetD;
1570 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1571 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1573 /* Inner loop uses 259 flops */
/* Trailing block: entered when the loop above stopped before j_index_end,
 * i.e. the fourth entry of the current block is negative. The same nine
 * pair interactions are evaluated, with fscal cleared through dummy_mask
 * for the negative lanes. */
1576 if(jidx<j_index_end)
1579 /* Get j neighbor index, and coordinate index */
1580 jnrlistA = jjnr[jidx];
1581 jnrlistB = jjnr[jidx+1];
1582 jnrlistC = jjnr[jidx+2];
1583 jnrlistD = jjnr[jidx+3];
1584 /* Sign of each element will be negative for non-real atoms.
1585 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1586 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1588 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1589 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1590 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1591 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1592 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1593 j_coord_offsetA = DIM*jnrA;
1594 j_coord_offsetB = DIM*jnrB;
1595 j_coord_offsetC = DIM*jnrC;
1596 j_coord_offsetD = DIM*jnrD;
1598 /* load j atom coordinates */
1599 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1600 x+j_coord_offsetC,x+j_coord_offsetD,
1601 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1603 /* Calculate displacement vector */
1604 dx00 = _mm_sub_ps(ix0,jx0);
1605 dy00 = _mm_sub_ps(iy0,jy0);
1606 dz00 = _mm_sub_ps(iz0,jz0);
1607 dx01 = _mm_sub_ps(ix0,jx1);
1608 dy01 = _mm_sub_ps(iy0,jy1);
1609 dz01 = _mm_sub_ps(iz0,jz1);
1610 dx02 = _mm_sub_ps(ix0,jx2);
1611 dy02 = _mm_sub_ps(iy0,jy2);
1612 dz02 = _mm_sub_ps(iz0,jz2);
1613 dx10 = _mm_sub_ps(ix1,jx0);
1614 dy10 = _mm_sub_ps(iy1,jy0);
1615 dz10 = _mm_sub_ps(iz1,jz0);
1616 dx11 = _mm_sub_ps(ix1,jx1);
1617 dy11 = _mm_sub_ps(iy1,jy1);
1618 dz11 = _mm_sub_ps(iz1,jz1);
1619 dx12 = _mm_sub_ps(ix1,jx2);
1620 dy12 = _mm_sub_ps(iy1,jy2);
1621 dz12 = _mm_sub_ps(iz1,jz2);
1622 dx20 = _mm_sub_ps(ix2,jx0);
1623 dy20 = _mm_sub_ps(iy2,jy0);
1624 dz20 = _mm_sub_ps(iz2,jz0);
1625 dx21 = _mm_sub_ps(ix2,jx1);
1626 dy21 = _mm_sub_ps(iy2,jy1);
1627 dz21 = _mm_sub_ps(iz2,jz1);
1628 dx22 = _mm_sub_ps(ix2,jx2);
1629 dy22 = _mm_sub_ps(iy2,jy2);
1630 dz22 = _mm_sub_ps(iz2,jz2);
1632 /* Calculate squared distance and things based on it */
1633 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1634 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1635 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1636 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1637 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1638 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1639 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1640 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1641 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1643 rinv00 = avx128fma_invsqrt_f(rsq00);
1644 rinv01 = avx128fma_invsqrt_f(rsq01);
1645 rinv02 = avx128fma_invsqrt_f(rsq02);
1646 rinv10 = avx128fma_invsqrt_f(rsq10);
1647 rinv11 = avx128fma_invsqrt_f(rsq11);
1648 rinv12 = avx128fma_invsqrt_f(rsq12);
1649 rinv20 = avx128fma_invsqrt_f(rsq20);
1650 rinv21 = avx128fma_invsqrt_f(rsq21);
1651 rinv22 = avx128fma_invsqrt_f(rsq22);
1653 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1654 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1655 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1656 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1657 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1658 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1659 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1660 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1661 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1663 fjx0 = _mm_setzero_ps();
1664 fjy0 = _mm_setzero_ps();
1665 fjz0 = _mm_setzero_ps();
1666 fjx1 = _mm_setzero_ps();
1667 fjy1 = _mm_setzero_ps();
1668 fjz1 = _mm_setzero_ps();
1669 fjx2 = _mm_setzero_ps();
1670 fjy2 = _mm_setzero_ps();
1671 fjz2 = _mm_setzero_ps();
1673 /**************************
1674 * CALCULATE INTERACTIONS *
1675 **************************/
1677 r00 = _mm_mul_ps(rsq00,rinv00);
1678 r00 = _mm_andnot_ps(dummy_mask,r00);
1680 /* EWALD ELECTROSTATICS */
1682 /* Analytical PME correction */
1683 zeta2 = _mm_mul_ps(beta2,rsq00);
1684 rinv3 = _mm_mul_ps(rinvsq00,rinv00);
1685 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1686 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1687 felec = _mm_mul_ps(qq00,felec);
1689 /* LENNARD-JONES DISPERSION/REPULSION */
1691 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1692 fvdw = _mm_mul_ps(_mm_msub_ps(c12_00,rinvsix,c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1694 fscal = _mm_add_ps(felec,fvdw);
1696 fscal = _mm_andnot_ps(dummy_mask,fscal);
1698 /* Update vectorial force */
1699 fix0 = _mm_macc_ps(dx00,fscal,fix0);
1700 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
1701 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
1703 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
1704 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
1705 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
1707 /**************************
1708 * CALCULATE INTERACTIONS *
1709 **************************/
1711 r01 = _mm_mul_ps(rsq01,rinv01);
1712 r01 = _mm_andnot_ps(dummy_mask,r01);
1714 /* EWALD ELECTROSTATICS */
1716 /* Analytical PME correction */
1717 zeta2 = _mm_mul_ps(beta2,rsq01);
1718 rinv3 = _mm_mul_ps(rinvsq01,rinv01);
1719 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1720 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1721 felec = _mm_mul_ps(qq01,felec);
1723 fscal = felec;
1725 fscal = _mm_andnot_ps(dummy_mask,fscal);
1727 /* Update vectorial force */
1728 fix0 = _mm_macc_ps(dx01,fscal,fix0);
1729 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
1730 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
1732 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
1733 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
1734 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
1736 /**************************
1737 * CALCULATE INTERACTIONS *
1738 **************************/
1740 r02 = _mm_mul_ps(rsq02,rinv02);
1741 r02 = _mm_andnot_ps(dummy_mask,r02);
1743 /* EWALD ELECTROSTATICS */
1745 /* Analytical PME correction */
1746 zeta2 = _mm_mul_ps(beta2,rsq02);
1747 rinv3 = _mm_mul_ps(rinvsq02,rinv02);
1748 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1749 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1750 felec = _mm_mul_ps(qq02,felec);
1752 fscal = felec;
1754 fscal = _mm_andnot_ps(dummy_mask,fscal);
1756 /* Update vectorial force */
1757 fix0 = _mm_macc_ps(dx02,fscal,fix0);
1758 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
1759 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
1761 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
1762 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
1763 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
1765 /**************************
1766 * CALCULATE INTERACTIONS *
1767 **************************/
1769 r10 = _mm_mul_ps(rsq10,rinv10);
1770 r10 = _mm_andnot_ps(dummy_mask,r10);
1772 /* EWALD ELECTROSTATICS */
1774 /* Analytical PME correction */
1775 zeta2 = _mm_mul_ps(beta2,rsq10);
1776 rinv3 = _mm_mul_ps(rinvsq10,rinv10);
1777 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1778 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1779 felec = _mm_mul_ps(qq10,felec);
1781 fscal = felec;
1783 fscal = _mm_andnot_ps(dummy_mask,fscal);
1785 /* Update vectorial force */
1786 fix1 = _mm_macc_ps(dx10,fscal,fix1);
1787 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
1788 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
1790 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
1791 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
1792 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
1794 /**************************
1795 * CALCULATE INTERACTIONS *
1796 **************************/
1798 r11 = _mm_mul_ps(rsq11,rinv11);
1799 r11 = _mm_andnot_ps(dummy_mask,r11);
1801 /* EWALD ELECTROSTATICS */
1803 /* Analytical PME correction */
1804 zeta2 = _mm_mul_ps(beta2,rsq11);
1805 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
1806 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1807 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1808 felec = _mm_mul_ps(qq11,felec);
1810 fscal = felec;
1812 fscal = _mm_andnot_ps(dummy_mask,fscal);
1814 /* Update vectorial force */
1815 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1816 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1817 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1819 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1820 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1821 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1823 /**************************
1824 * CALCULATE INTERACTIONS *
1825 **************************/
1827 r12 = _mm_mul_ps(rsq12,rinv12);
1828 r12 = _mm_andnot_ps(dummy_mask,r12);
1830 /* EWALD ELECTROSTATICS */
1832 /* Analytical PME correction */
1833 zeta2 = _mm_mul_ps(beta2,rsq12);
1834 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
1835 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1836 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1837 felec = _mm_mul_ps(qq12,felec);
1839 fscal = felec;
1841 fscal = _mm_andnot_ps(dummy_mask,fscal);
1843 /* Update vectorial force */
1844 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1845 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1846 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1848 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1849 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1850 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1852 /**************************
1853 * CALCULATE INTERACTIONS *
1854 **************************/
1856 r20 = _mm_mul_ps(rsq20,rinv20);
1857 r20 = _mm_andnot_ps(dummy_mask,r20);
1859 /* EWALD ELECTROSTATICS */
1861 /* Analytical PME correction */
1862 zeta2 = _mm_mul_ps(beta2,rsq20);
1863 rinv3 = _mm_mul_ps(rinvsq20,rinv20);
1864 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1865 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1866 felec = _mm_mul_ps(qq20,felec);
1868 fscal = felec;
1870 fscal = _mm_andnot_ps(dummy_mask,fscal);
1872 /* Update vectorial force */
1873 fix2 = _mm_macc_ps(dx20,fscal,fix2);
1874 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
1875 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
1877 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
1878 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
1879 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
1881 /**************************
1882 * CALCULATE INTERACTIONS *
1883 **************************/
1885 r21 = _mm_mul_ps(rsq21,rinv21);
1886 r21 = _mm_andnot_ps(dummy_mask,r21);
1888 /* EWALD ELECTROSTATICS */
1890 /* Analytical PME correction */
1891 zeta2 = _mm_mul_ps(beta2,rsq21);
1892 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
1893 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1894 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1895 felec = _mm_mul_ps(qq21,felec);
1897 fscal = felec;
1899 fscal = _mm_andnot_ps(dummy_mask,fscal);
1901 /* Update vectorial force */
1902 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1903 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1904 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1906 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1907 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1908 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1910 /**************************
1911 * CALCULATE INTERACTIONS *
1912 **************************/
1914 r22 = _mm_mul_ps(rsq22,rinv22);
1915 r22 = _mm_andnot_ps(dummy_mask,r22);
1917 /* EWALD ELECTROSTATICS */
1919 /* Analytical PME correction */
1920 zeta2 = _mm_mul_ps(beta2,rsq22);
1921 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
1922 pmecorrF = avx128fma_pmecorrF_f(zeta2);
1923 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1924 felec = _mm_mul_ps(qq22,felec);
1926 fscal = felec;
1928 fscal = _mm_andnot_ps(dummy_mask,fscal);
1930 /* Update vectorial force */
1931 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1932 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1933 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1935 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1936 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1937 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
/* Lanes whose list entry is negative are pointed at scratch instead of f,
 * so the helper below never touches a real atom's force for them. */
1939 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1940 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1941 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1942 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1944 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1945 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1947 /* Inner loop uses 268 flops */
1950 /* End of innermost loop */
1952 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1953 f+i_coord_offset,fshift+i_shift_offset);
1955 /* Increment number of inner iterations */
1956 inneriter += j_index_end - j_index_start;
1958 /* Outer loop uses 18 flops */
1961 /* Increment number of outer iterations */
1962 outeriter += nri;
1964 /* Update outer/inner flops */
/* inneriter counts j list entries (j_index_end - j_index_start per i entry).
 * The per-entry weight used here is 268, the figure annotated on the masked
 * trailing block; the unmasked loop is annotated with 259. */
1966 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*268);