Removed simple.h from nb_kernel_sse4_1_XX
[gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse4_1_double / nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_sse4_1_double.c
blobbe4773b59767c97b64ba8b60f8706b4fa0ee3ec9
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse4_1_double kernel generator.
38 #include "gmxpre.h"
40 #include "config.h"
42 #include <math.h>
44 #include "../nb_kernel.h"
45 #include "gromacs/math/vec.h"
46 #include "gromacs/legacyheaders/nrnb.h"
48 #include "gromacs/simd/math_x86_sse4_1_double.h"
49 #include "kernelutil_x86_sse4_1_double.h"
52 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_sse4_1_double
53 * Electrostatics interaction: Ewald
54 * VdW interaction: LJEwald
55 * Geometry: Water4-Water4
56 * Calculate force/pot: PotentialAndForce
58 void
59 nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_sse4_1_double
60 (t_nblist * gmx_restrict nlist,
61 rvec * gmx_restrict xx,
62 rvec * gmx_restrict ff,
63 t_forcerec * gmx_restrict fr,
64 t_mdatoms * gmx_restrict mdatoms,
65 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
66 t_nrnb * gmx_restrict nrnb)
68 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
69 * just 0 for non-waters.
70 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
71 * jnr indices corresponding to data put in the four positions in the SIMD register.
73 int i_shift_offset,i_coord_offset,outeriter,inneriter;
74 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
75 int jnrA,jnrB;
76 int j_coord_offsetA,j_coord_offsetB;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
78 real rcutoff_scalar;
79 real *shiftvec,*fshift,*x,*f;
80 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
81 int vdwioffset0;
82 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
83 int vdwioffset1;
84 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
85 int vdwioffset2;
86 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
87 int vdwioffset3;
88 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
89 int vdwjidx0A,vdwjidx0B;
90 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
91 int vdwjidx1A,vdwjidx1B;
92 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
93 int vdwjidx2A,vdwjidx2B;
94 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
95 int vdwjidx3A,vdwjidx3B;
96 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
97 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
98 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
99 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
100 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
101 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
102 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
103 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
104 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
105 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
106 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
107 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
108 real *charge;
109 int nvdwtype;
110 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
111 int *vdwtype;
112 real *vdwparam;
113 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
114 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
115 __m128d c6grid_00;
116 __m128d c6grid_11;
117 __m128d c6grid_12;
118 __m128d c6grid_13;
119 __m128d c6grid_21;
120 __m128d c6grid_22;
121 __m128d c6grid_23;
122 __m128d c6grid_31;
123 __m128d c6grid_32;
124 __m128d c6grid_33;
125 __m128d ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
126 real *vdwgridparam;
127 __m128d one_half = _mm_set1_pd(0.5);
128 __m128d minus_one = _mm_set1_pd(-1.0);
129 __m128i ewitab;
130 __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
131 real *ewtab;
132 __m128d dummy_mask,cutoff_mask;
133 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
134 __m128d one = _mm_set1_pd(1.0);
135 __m128d two = _mm_set1_pd(2.0);
136 x = xx[0];
137 f = ff[0];
139 nri = nlist->nri;
140 iinr = nlist->iinr;
141 jindex = nlist->jindex;
142 jjnr = nlist->jjnr;
143 shiftidx = nlist->shift;
144 gid = nlist->gid;
145 shiftvec = fr->shift_vec[0];
146 fshift = fr->fshift[0];
147 facel = _mm_set1_pd(fr->epsfac);
148 charge = mdatoms->chargeA;
149 nvdwtype = fr->ntype;
150 vdwparam = fr->nbfp;
151 vdwtype = mdatoms->typeA;
152 vdwgridparam = fr->ljpme_c6grid;
153 sh_lj_ewald = _mm_set1_pd(fr->ic->sh_lj_ewald);
154 ewclj = _mm_set1_pd(fr->ewaldcoeff_lj);
155 ewclj2 = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
157 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
158 ewtab = fr->ic->tabq_coul_FDV0;
159 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
160 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
162 /* Setup water-specific parameters */
163 inr = nlist->iinr[0];
164 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
165 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
166 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
167 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
169 jq1 = _mm_set1_pd(charge[inr+1]);
170 jq2 = _mm_set1_pd(charge[inr+2]);
171 jq3 = _mm_set1_pd(charge[inr+3]);
172 vdwjidx0A = 2*vdwtype[inr+0];
173 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
174 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
175 c6grid_00 = _mm_set1_pd(vdwgridparam[vdwioffset0+vdwjidx0A]);
176 qq11 = _mm_mul_pd(iq1,jq1);
177 qq12 = _mm_mul_pd(iq1,jq2);
178 qq13 = _mm_mul_pd(iq1,jq3);
179 qq21 = _mm_mul_pd(iq2,jq1);
180 qq22 = _mm_mul_pd(iq2,jq2);
181 qq23 = _mm_mul_pd(iq2,jq3);
182 qq31 = _mm_mul_pd(iq3,jq1);
183 qq32 = _mm_mul_pd(iq3,jq2);
184 qq33 = _mm_mul_pd(iq3,jq3);
186 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
187 rcutoff_scalar = fr->rcoulomb;
188 rcutoff = _mm_set1_pd(rcutoff_scalar);
189 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
191 sh_vdw_invrcut6 = _mm_set1_pd(fr->ic->sh_invrc6);
192 rvdw = _mm_set1_pd(fr->rvdw);
194 /* Avoid stupid compiler warnings */
195 jnrA = jnrB = 0;
196 j_coord_offsetA = 0;
197 j_coord_offsetB = 0;
199 outeriter = 0;
200 inneriter = 0;
202 /* Start outer loop over neighborlists */
203 for(iidx=0; iidx<nri; iidx++)
205 /* Load shift vector for this list */
206 i_shift_offset = DIM*shiftidx[iidx];
208 /* Load limits for loop over neighbors */
209 j_index_start = jindex[iidx];
210 j_index_end = jindex[iidx+1];
212 /* Get outer coordinate index */
213 inr = iinr[iidx];
214 i_coord_offset = DIM*inr;
216 /* Load i particle coords and add shift vector */
217 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
218 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
220 fix0 = _mm_setzero_pd();
221 fiy0 = _mm_setzero_pd();
222 fiz0 = _mm_setzero_pd();
223 fix1 = _mm_setzero_pd();
224 fiy1 = _mm_setzero_pd();
225 fiz1 = _mm_setzero_pd();
226 fix2 = _mm_setzero_pd();
227 fiy2 = _mm_setzero_pd();
228 fiz2 = _mm_setzero_pd();
229 fix3 = _mm_setzero_pd();
230 fiy3 = _mm_setzero_pd();
231 fiz3 = _mm_setzero_pd();
233 /* Reset potential sums */
234 velecsum = _mm_setzero_pd();
235 vvdwsum = _mm_setzero_pd();
237 /* Start inner kernel loop */
238 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
241 /* Get j neighbor index, and coordinate index */
242 jnrA = jjnr[jidx];
243 jnrB = jjnr[jidx+1];
244 j_coord_offsetA = DIM*jnrA;
245 j_coord_offsetB = DIM*jnrB;
247 /* load j atom coordinates */
248 gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
249 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
250 &jy2,&jz2,&jx3,&jy3,&jz3);
252 /* Calculate displacement vector */
253 dx00 = _mm_sub_pd(ix0,jx0);
254 dy00 = _mm_sub_pd(iy0,jy0);
255 dz00 = _mm_sub_pd(iz0,jz0);
256 dx11 = _mm_sub_pd(ix1,jx1);
257 dy11 = _mm_sub_pd(iy1,jy1);
258 dz11 = _mm_sub_pd(iz1,jz1);
259 dx12 = _mm_sub_pd(ix1,jx2);
260 dy12 = _mm_sub_pd(iy1,jy2);
261 dz12 = _mm_sub_pd(iz1,jz2);
262 dx13 = _mm_sub_pd(ix1,jx3);
263 dy13 = _mm_sub_pd(iy1,jy3);
264 dz13 = _mm_sub_pd(iz1,jz3);
265 dx21 = _mm_sub_pd(ix2,jx1);
266 dy21 = _mm_sub_pd(iy2,jy1);
267 dz21 = _mm_sub_pd(iz2,jz1);
268 dx22 = _mm_sub_pd(ix2,jx2);
269 dy22 = _mm_sub_pd(iy2,jy2);
270 dz22 = _mm_sub_pd(iz2,jz2);
271 dx23 = _mm_sub_pd(ix2,jx3);
272 dy23 = _mm_sub_pd(iy2,jy3);
273 dz23 = _mm_sub_pd(iz2,jz3);
274 dx31 = _mm_sub_pd(ix3,jx1);
275 dy31 = _mm_sub_pd(iy3,jy1);
276 dz31 = _mm_sub_pd(iz3,jz1);
277 dx32 = _mm_sub_pd(ix3,jx2);
278 dy32 = _mm_sub_pd(iy3,jy2);
279 dz32 = _mm_sub_pd(iz3,jz2);
280 dx33 = _mm_sub_pd(ix3,jx3);
281 dy33 = _mm_sub_pd(iy3,jy3);
282 dz33 = _mm_sub_pd(iz3,jz3);
284 /* Calculate squared distance and things based on it */
285 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
286 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
287 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
288 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
289 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
290 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
291 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
292 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
293 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
294 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
296 rinv00 = gmx_mm_invsqrt_pd(rsq00);
297 rinv11 = gmx_mm_invsqrt_pd(rsq11);
298 rinv12 = gmx_mm_invsqrt_pd(rsq12);
299 rinv13 = gmx_mm_invsqrt_pd(rsq13);
300 rinv21 = gmx_mm_invsqrt_pd(rsq21);
301 rinv22 = gmx_mm_invsqrt_pd(rsq22);
302 rinv23 = gmx_mm_invsqrt_pd(rsq23);
303 rinv31 = gmx_mm_invsqrt_pd(rsq31);
304 rinv32 = gmx_mm_invsqrt_pd(rsq32);
305 rinv33 = gmx_mm_invsqrt_pd(rsq33);
307 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
308 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
309 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
310 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
311 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
312 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
313 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
314 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
315 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
316 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
318 fjx0 = _mm_setzero_pd();
319 fjy0 = _mm_setzero_pd();
320 fjz0 = _mm_setzero_pd();
321 fjx1 = _mm_setzero_pd();
322 fjy1 = _mm_setzero_pd();
323 fjz1 = _mm_setzero_pd();
324 fjx2 = _mm_setzero_pd();
325 fjy2 = _mm_setzero_pd();
326 fjz2 = _mm_setzero_pd();
327 fjx3 = _mm_setzero_pd();
328 fjy3 = _mm_setzero_pd();
329 fjz3 = _mm_setzero_pd();
331 /**************************
332 * CALCULATE INTERACTIONS *
333 **************************/
335 if (gmx_mm_any_lt(rsq00,rcutoff2))
338 r00 = _mm_mul_pd(rsq00,rinv00);
340 /* Analytical LJ-PME */
341 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
342 ewcljrsq = _mm_mul_pd(ewclj2,rsq00);
343 ewclj6 = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
344 exponent = gmx_simd_exp_d(ewcljrsq);
345 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
346 poly = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
347 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
348 vvdw6 = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
349 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
350 vvdw = _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12 , _mm_mul_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
351 _mm_mul_pd( _mm_sub_pd(vvdw6,_mm_add_pd(_mm_mul_pd(c6_00,sh_vdw_invrcut6),_mm_mul_pd(c6grid_00,sh_lj_ewald))),one_sixth));
352 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
353 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
355 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
357 /* Update potential sum for this i atom from the interaction with this j atom. */
358 vvdw = _mm_and_pd(vvdw,cutoff_mask);
359 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
361 fscal = fvdw;
363 fscal = _mm_and_pd(fscal,cutoff_mask);
365 /* Calculate temporary vectorial force */
366 tx = _mm_mul_pd(fscal,dx00);
367 ty = _mm_mul_pd(fscal,dy00);
368 tz = _mm_mul_pd(fscal,dz00);
370 /* Update vectorial force */
371 fix0 = _mm_add_pd(fix0,tx);
372 fiy0 = _mm_add_pd(fiy0,ty);
373 fiz0 = _mm_add_pd(fiz0,tz);
375 fjx0 = _mm_add_pd(fjx0,tx);
376 fjy0 = _mm_add_pd(fjy0,ty);
377 fjz0 = _mm_add_pd(fjz0,tz);
381 /**************************
382 * CALCULATE INTERACTIONS *
383 **************************/
385 if (gmx_mm_any_lt(rsq11,rcutoff2))
388 r11 = _mm_mul_pd(rsq11,rinv11);
390 /* EWALD ELECTROSTATICS */
392 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
393 ewrt = _mm_mul_pd(r11,ewtabscale);
394 ewitab = _mm_cvttpd_epi32(ewrt);
395 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
396 ewitab = _mm_slli_epi32(ewitab,2);
397 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
398 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
399 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
400 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
401 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
402 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
403 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
404 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
405 velec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_sub_pd(rinv11,sh_ewald),velec));
406 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
408 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
410 /* Update potential sum for this i atom from the interaction with this j atom. */
411 velec = _mm_and_pd(velec,cutoff_mask);
412 velecsum = _mm_add_pd(velecsum,velec);
414 fscal = felec;
416 fscal = _mm_and_pd(fscal,cutoff_mask);
418 /* Calculate temporary vectorial force */
419 tx = _mm_mul_pd(fscal,dx11);
420 ty = _mm_mul_pd(fscal,dy11);
421 tz = _mm_mul_pd(fscal,dz11);
423 /* Update vectorial force */
424 fix1 = _mm_add_pd(fix1,tx);
425 fiy1 = _mm_add_pd(fiy1,ty);
426 fiz1 = _mm_add_pd(fiz1,tz);
428 fjx1 = _mm_add_pd(fjx1,tx);
429 fjy1 = _mm_add_pd(fjy1,ty);
430 fjz1 = _mm_add_pd(fjz1,tz);
434 /**************************
435 * CALCULATE INTERACTIONS *
436 **************************/
438 if (gmx_mm_any_lt(rsq12,rcutoff2))
441 r12 = _mm_mul_pd(rsq12,rinv12);
443 /* EWALD ELECTROSTATICS */
445 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
446 ewrt = _mm_mul_pd(r12,ewtabscale);
447 ewitab = _mm_cvttpd_epi32(ewrt);
448 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
449 ewitab = _mm_slli_epi32(ewitab,2);
450 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
451 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
452 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
453 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
454 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
455 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
456 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
457 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
458 velec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_sub_pd(rinv12,sh_ewald),velec));
459 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
461 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
463 /* Update potential sum for this i atom from the interaction with this j atom. */
464 velec = _mm_and_pd(velec,cutoff_mask);
465 velecsum = _mm_add_pd(velecsum,velec);
467 fscal = felec;
469 fscal = _mm_and_pd(fscal,cutoff_mask);
471 /* Calculate temporary vectorial force */
472 tx = _mm_mul_pd(fscal,dx12);
473 ty = _mm_mul_pd(fscal,dy12);
474 tz = _mm_mul_pd(fscal,dz12);
476 /* Update vectorial force */
477 fix1 = _mm_add_pd(fix1,tx);
478 fiy1 = _mm_add_pd(fiy1,ty);
479 fiz1 = _mm_add_pd(fiz1,tz);
481 fjx2 = _mm_add_pd(fjx2,tx);
482 fjy2 = _mm_add_pd(fjy2,ty);
483 fjz2 = _mm_add_pd(fjz2,tz);
487 /**************************
488 * CALCULATE INTERACTIONS *
489 **************************/
491 if (gmx_mm_any_lt(rsq13,rcutoff2))
494 r13 = _mm_mul_pd(rsq13,rinv13);
496 /* EWALD ELECTROSTATICS */
498 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
499 ewrt = _mm_mul_pd(r13,ewtabscale);
500 ewitab = _mm_cvttpd_epi32(ewrt);
501 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
502 ewitab = _mm_slli_epi32(ewitab,2);
503 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
504 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
505 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
506 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
507 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
508 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
509 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
510 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
511 velec = _mm_mul_pd(qq13,_mm_sub_pd(_mm_sub_pd(rinv13,sh_ewald),velec));
512 felec = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
514 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
516 /* Update potential sum for this i atom from the interaction with this j atom. */
517 velec = _mm_and_pd(velec,cutoff_mask);
518 velecsum = _mm_add_pd(velecsum,velec);
520 fscal = felec;
522 fscal = _mm_and_pd(fscal,cutoff_mask);
524 /* Calculate temporary vectorial force */
525 tx = _mm_mul_pd(fscal,dx13);
526 ty = _mm_mul_pd(fscal,dy13);
527 tz = _mm_mul_pd(fscal,dz13);
529 /* Update vectorial force */
530 fix1 = _mm_add_pd(fix1,tx);
531 fiy1 = _mm_add_pd(fiy1,ty);
532 fiz1 = _mm_add_pd(fiz1,tz);
534 fjx3 = _mm_add_pd(fjx3,tx);
535 fjy3 = _mm_add_pd(fjy3,ty);
536 fjz3 = _mm_add_pd(fjz3,tz);
540 /**************************
541 * CALCULATE INTERACTIONS *
542 **************************/
544 if (gmx_mm_any_lt(rsq21,rcutoff2))
547 r21 = _mm_mul_pd(rsq21,rinv21);
549 /* EWALD ELECTROSTATICS */
551 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
552 ewrt = _mm_mul_pd(r21,ewtabscale);
553 ewitab = _mm_cvttpd_epi32(ewrt);
554 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
555 ewitab = _mm_slli_epi32(ewitab,2);
556 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
557 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
558 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
559 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
560 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
561 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
562 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
563 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
564 velec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_sub_pd(rinv21,sh_ewald),velec));
565 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
567 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
569 /* Update potential sum for this i atom from the interaction with this j atom. */
570 velec = _mm_and_pd(velec,cutoff_mask);
571 velecsum = _mm_add_pd(velecsum,velec);
573 fscal = felec;
575 fscal = _mm_and_pd(fscal,cutoff_mask);
577 /* Calculate temporary vectorial force */
578 tx = _mm_mul_pd(fscal,dx21);
579 ty = _mm_mul_pd(fscal,dy21);
580 tz = _mm_mul_pd(fscal,dz21);
582 /* Update vectorial force */
583 fix2 = _mm_add_pd(fix2,tx);
584 fiy2 = _mm_add_pd(fiy2,ty);
585 fiz2 = _mm_add_pd(fiz2,tz);
587 fjx1 = _mm_add_pd(fjx1,tx);
588 fjy1 = _mm_add_pd(fjy1,ty);
589 fjz1 = _mm_add_pd(fjz1,tz);
593 /**************************
594 * CALCULATE INTERACTIONS *
595 **************************/
597 if (gmx_mm_any_lt(rsq22,rcutoff2))
600 r22 = _mm_mul_pd(rsq22,rinv22);
602 /* EWALD ELECTROSTATICS */
604 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
605 ewrt = _mm_mul_pd(r22,ewtabscale);
606 ewitab = _mm_cvttpd_epi32(ewrt);
607 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
608 ewitab = _mm_slli_epi32(ewitab,2);
609 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
610 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
611 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
612 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
613 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
614 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
615 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
616 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
617 velec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_sub_pd(rinv22,sh_ewald),velec));
618 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
620 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
622 /* Update potential sum for this i atom from the interaction with this j atom. */
623 velec = _mm_and_pd(velec,cutoff_mask);
624 velecsum = _mm_add_pd(velecsum,velec);
626 fscal = felec;
628 fscal = _mm_and_pd(fscal,cutoff_mask);
630 /* Calculate temporary vectorial force */
631 tx = _mm_mul_pd(fscal,dx22);
632 ty = _mm_mul_pd(fscal,dy22);
633 tz = _mm_mul_pd(fscal,dz22);
635 /* Update vectorial force */
636 fix2 = _mm_add_pd(fix2,tx);
637 fiy2 = _mm_add_pd(fiy2,ty);
638 fiz2 = _mm_add_pd(fiz2,tz);
640 fjx2 = _mm_add_pd(fjx2,tx);
641 fjy2 = _mm_add_pd(fjy2,ty);
642 fjz2 = _mm_add_pd(fjz2,tz);
646 /**************************
647 * CALCULATE INTERACTIONS *
648 **************************/
650 if (gmx_mm_any_lt(rsq23,rcutoff2))
653 r23 = _mm_mul_pd(rsq23,rinv23);
655 /* EWALD ELECTROSTATICS */
657 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
658 ewrt = _mm_mul_pd(r23,ewtabscale);
659 ewitab = _mm_cvttpd_epi32(ewrt);
660 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
661 ewitab = _mm_slli_epi32(ewitab,2);
662 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
663 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
664 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
665 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
666 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
667 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
668 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
669 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
670 velec = _mm_mul_pd(qq23,_mm_sub_pd(_mm_sub_pd(rinv23,sh_ewald),velec));
671 felec = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
673 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
675 /* Update potential sum for this i atom from the interaction with this j atom. */
676 velec = _mm_and_pd(velec,cutoff_mask);
677 velecsum = _mm_add_pd(velecsum,velec);
679 fscal = felec;
681 fscal = _mm_and_pd(fscal,cutoff_mask);
683 /* Calculate temporary vectorial force */
684 tx = _mm_mul_pd(fscal,dx23);
685 ty = _mm_mul_pd(fscal,dy23);
686 tz = _mm_mul_pd(fscal,dz23);
688 /* Update vectorial force */
689 fix2 = _mm_add_pd(fix2,tx);
690 fiy2 = _mm_add_pd(fiy2,ty);
691 fiz2 = _mm_add_pd(fiz2,tz);
693 fjx3 = _mm_add_pd(fjx3,tx);
694 fjy3 = _mm_add_pd(fjy3,ty);
695 fjz3 = _mm_add_pd(fjz3,tz);
699 /**************************
700 * CALCULATE INTERACTIONS *
701 **************************/
703 if (gmx_mm_any_lt(rsq31,rcutoff2))
706 r31 = _mm_mul_pd(rsq31,rinv31);
708 /* EWALD ELECTROSTATICS */
710 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
711 ewrt = _mm_mul_pd(r31,ewtabscale);
712 ewitab = _mm_cvttpd_epi32(ewrt);
713 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
714 ewitab = _mm_slli_epi32(ewitab,2);
715 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
716 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
717 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
718 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
719 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
720 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
721 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
722 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
723 velec = _mm_mul_pd(qq31,_mm_sub_pd(_mm_sub_pd(rinv31,sh_ewald),velec));
724 felec = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
726 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
728 /* Update potential sum for this i atom from the interaction with this j atom. */
729 velec = _mm_and_pd(velec,cutoff_mask);
730 velecsum = _mm_add_pd(velecsum,velec);
732 fscal = felec;
734 fscal = _mm_and_pd(fscal,cutoff_mask);
736 /* Calculate temporary vectorial force */
737 tx = _mm_mul_pd(fscal,dx31);
738 ty = _mm_mul_pd(fscal,dy31);
739 tz = _mm_mul_pd(fscal,dz31);
741 /* Update vectorial force */
742 fix3 = _mm_add_pd(fix3,tx);
743 fiy3 = _mm_add_pd(fiy3,ty);
744 fiz3 = _mm_add_pd(fiz3,tz);
746 fjx1 = _mm_add_pd(fjx1,tx);
747 fjy1 = _mm_add_pd(fjy1,ty);
748 fjz1 = _mm_add_pd(fjz1,tz);
752 /**************************
753 * CALCULATE INTERACTIONS *
754 **************************/
756 if (gmx_mm_any_lt(rsq32,rcutoff2))
759 r32 = _mm_mul_pd(rsq32,rinv32);
761 /* EWALD ELECTROSTATICS */
763 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
764 ewrt = _mm_mul_pd(r32,ewtabscale);
765 ewitab = _mm_cvttpd_epi32(ewrt);
766 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
767 ewitab = _mm_slli_epi32(ewitab,2);
768 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
769 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
770 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
771 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
772 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
773 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
774 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
775 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
776 velec = _mm_mul_pd(qq32,_mm_sub_pd(_mm_sub_pd(rinv32,sh_ewald),velec));
777 felec = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
779 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
781 /* Update potential sum for this i atom from the interaction with this j atom. */
782 velec = _mm_and_pd(velec,cutoff_mask);
783 velecsum = _mm_add_pd(velecsum,velec);
785 fscal = felec;
787 fscal = _mm_and_pd(fscal,cutoff_mask);
789 /* Calculate temporary vectorial force */
790 tx = _mm_mul_pd(fscal,dx32);
791 ty = _mm_mul_pd(fscal,dy32);
792 tz = _mm_mul_pd(fscal,dz32);
794 /* Update vectorial force */
795 fix3 = _mm_add_pd(fix3,tx);
796 fiy3 = _mm_add_pd(fiy3,ty);
797 fiz3 = _mm_add_pd(fiz3,tz);
799 fjx2 = _mm_add_pd(fjx2,tx);
800 fjy2 = _mm_add_pd(fjy2,ty);
801 fjz2 = _mm_add_pd(fjz2,tz);
805 /**************************
806 * CALCULATE INTERACTIONS *
807 **************************/
809 if (gmx_mm_any_lt(rsq33,rcutoff2))
812 r33 = _mm_mul_pd(rsq33,rinv33);
814 /* EWALD ELECTROSTATICS */
816 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
817 ewrt = _mm_mul_pd(r33,ewtabscale);
818 ewitab = _mm_cvttpd_epi32(ewrt);
819 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
820 ewitab = _mm_slli_epi32(ewitab,2);
821 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
822 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
823 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
824 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
825 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
826 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
827 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
828 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
829 velec = _mm_mul_pd(qq33,_mm_sub_pd(_mm_sub_pd(rinv33,sh_ewald),velec));
830 felec = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
832 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
834 /* Update potential sum for this i atom from the interaction with this j atom. */
835 velec = _mm_and_pd(velec,cutoff_mask);
836 velecsum = _mm_add_pd(velecsum,velec);
838 fscal = felec;
840 fscal = _mm_and_pd(fscal,cutoff_mask);
842 /* Calculate temporary vectorial force */
843 tx = _mm_mul_pd(fscal,dx33);
844 ty = _mm_mul_pd(fscal,dy33);
845 tz = _mm_mul_pd(fscal,dz33);
847 /* Update vectorial force */
848 fix3 = _mm_add_pd(fix3,tx);
849 fiy3 = _mm_add_pd(fiy3,ty);
850 fiz3 = _mm_add_pd(fiz3,tz);
852 fjx3 = _mm_add_pd(fjx3,tx);
853 fjy3 = _mm_add_pd(fjy3,ty);
854 fjz3 = _mm_add_pd(fjz3,tz);
858 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
860 /* Inner loop uses 478 flops */
863 if(jidx<j_index_end)
866 jnrA = jjnr[jidx];
867 j_coord_offsetA = DIM*jnrA;
869 /* load j atom coordinates */
870 gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
871 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
872 &jy2,&jz2,&jx3,&jy3,&jz3);
874 /* Calculate displacement vector */
875 dx00 = _mm_sub_pd(ix0,jx0);
876 dy00 = _mm_sub_pd(iy0,jy0);
877 dz00 = _mm_sub_pd(iz0,jz0);
878 dx11 = _mm_sub_pd(ix1,jx1);
879 dy11 = _mm_sub_pd(iy1,jy1);
880 dz11 = _mm_sub_pd(iz1,jz1);
881 dx12 = _mm_sub_pd(ix1,jx2);
882 dy12 = _mm_sub_pd(iy1,jy2);
883 dz12 = _mm_sub_pd(iz1,jz2);
884 dx13 = _mm_sub_pd(ix1,jx3);
885 dy13 = _mm_sub_pd(iy1,jy3);
886 dz13 = _mm_sub_pd(iz1,jz3);
887 dx21 = _mm_sub_pd(ix2,jx1);
888 dy21 = _mm_sub_pd(iy2,jy1);
889 dz21 = _mm_sub_pd(iz2,jz1);
890 dx22 = _mm_sub_pd(ix2,jx2);
891 dy22 = _mm_sub_pd(iy2,jy2);
892 dz22 = _mm_sub_pd(iz2,jz2);
893 dx23 = _mm_sub_pd(ix2,jx3);
894 dy23 = _mm_sub_pd(iy2,jy3);
895 dz23 = _mm_sub_pd(iz2,jz3);
896 dx31 = _mm_sub_pd(ix3,jx1);
897 dy31 = _mm_sub_pd(iy3,jy1);
898 dz31 = _mm_sub_pd(iz3,jz1);
899 dx32 = _mm_sub_pd(ix3,jx2);
900 dy32 = _mm_sub_pd(iy3,jy2);
901 dz32 = _mm_sub_pd(iz3,jz2);
902 dx33 = _mm_sub_pd(ix3,jx3);
903 dy33 = _mm_sub_pd(iy3,jy3);
904 dz33 = _mm_sub_pd(iz3,jz3);
906 /* Calculate squared distance and things based on it */
907 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
908 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
909 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
910 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
911 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
912 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
913 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
914 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
915 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
916 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
918 rinv00 = gmx_mm_invsqrt_pd(rsq00);
919 rinv11 = gmx_mm_invsqrt_pd(rsq11);
920 rinv12 = gmx_mm_invsqrt_pd(rsq12);
921 rinv13 = gmx_mm_invsqrt_pd(rsq13);
922 rinv21 = gmx_mm_invsqrt_pd(rsq21);
923 rinv22 = gmx_mm_invsqrt_pd(rsq22);
924 rinv23 = gmx_mm_invsqrt_pd(rsq23);
925 rinv31 = gmx_mm_invsqrt_pd(rsq31);
926 rinv32 = gmx_mm_invsqrt_pd(rsq32);
927 rinv33 = gmx_mm_invsqrt_pd(rsq33);
929 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
930 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
931 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
932 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
933 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
934 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
935 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
936 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
937 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
938 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
940 fjx0 = _mm_setzero_pd();
941 fjy0 = _mm_setzero_pd();
942 fjz0 = _mm_setzero_pd();
943 fjx1 = _mm_setzero_pd();
944 fjy1 = _mm_setzero_pd();
945 fjz1 = _mm_setzero_pd();
946 fjx2 = _mm_setzero_pd();
947 fjy2 = _mm_setzero_pd();
948 fjz2 = _mm_setzero_pd();
949 fjx3 = _mm_setzero_pd();
950 fjy3 = _mm_setzero_pd();
951 fjz3 = _mm_setzero_pd();
953 /**************************
954 * CALCULATE INTERACTIONS *
955 **************************/
957 if (gmx_mm_any_lt(rsq00,rcutoff2))
960 r00 = _mm_mul_pd(rsq00,rinv00);
962 /* Analytical LJ-PME */
963 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
964 ewcljrsq = _mm_mul_pd(ewclj2,rsq00);
965 ewclj6 = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
966 exponent = gmx_simd_exp_d(ewcljrsq);
967 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
968 poly = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
969 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
970 vvdw6 = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
971 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
972 vvdw = _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12 , _mm_mul_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
973 _mm_mul_pd( _mm_sub_pd(vvdw6,_mm_add_pd(_mm_mul_pd(c6_00,sh_vdw_invrcut6),_mm_mul_pd(c6grid_00,sh_lj_ewald))),one_sixth));
974 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
975 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
977 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
979 /* Update potential sum for this i atom from the interaction with this j atom. */
980 vvdw = _mm_and_pd(vvdw,cutoff_mask);
981 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
982 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
984 fscal = fvdw;
986 fscal = _mm_and_pd(fscal,cutoff_mask);
988 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
990 /* Calculate temporary vectorial force */
991 tx = _mm_mul_pd(fscal,dx00);
992 ty = _mm_mul_pd(fscal,dy00);
993 tz = _mm_mul_pd(fscal,dz00);
995 /* Update vectorial force */
996 fix0 = _mm_add_pd(fix0,tx);
997 fiy0 = _mm_add_pd(fiy0,ty);
998 fiz0 = _mm_add_pd(fiz0,tz);
1000 fjx0 = _mm_add_pd(fjx0,tx);
1001 fjy0 = _mm_add_pd(fjy0,ty);
1002 fjz0 = _mm_add_pd(fjz0,tz);
1006 /**************************
1007 * CALCULATE INTERACTIONS *
1008 **************************/
1010 if (gmx_mm_any_lt(rsq11,rcutoff2))
1013 r11 = _mm_mul_pd(rsq11,rinv11);
1015 /* EWALD ELECTROSTATICS */
1017 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1018 ewrt = _mm_mul_pd(r11,ewtabscale);
1019 ewitab = _mm_cvttpd_epi32(ewrt);
1020 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1021 ewitab = _mm_slli_epi32(ewitab,2);
1022 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1023 ewtabD = _mm_setzero_pd();
1024 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1025 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1026 ewtabFn = _mm_setzero_pd();
1027 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1028 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1029 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1030 velec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_sub_pd(rinv11,sh_ewald),velec));
1031 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1033 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
1035 /* Update potential sum for this i atom from the interaction with this j atom. */
1036 velec = _mm_and_pd(velec,cutoff_mask);
1037 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1038 velecsum = _mm_add_pd(velecsum,velec);
1040 fscal = felec;
1042 fscal = _mm_and_pd(fscal,cutoff_mask);
1044 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1046 /* Calculate temporary vectorial force */
1047 tx = _mm_mul_pd(fscal,dx11);
1048 ty = _mm_mul_pd(fscal,dy11);
1049 tz = _mm_mul_pd(fscal,dz11);
1051 /* Update vectorial force */
1052 fix1 = _mm_add_pd(fix1,tx);
1053 fiy1 = _mm_add_pd(fiy1,ty);
1054 fiz1 = _mm_add_pd(fiz1,tz);
1056 fjx1 = _mm_add_pd(fjx1,tx);
1057 fjy1 = _mm_add_pd(fjy1,ty);
1058 fjz1 = _mm_add_pd(fjz1,tz);
1062 /**************************
1063 * CALCULATE INTERACTIONS *
1064 **************************/
1066 if (gmx_mm_any_lt(rsq12,rcutoff2))
1069 r12 = _mm_mul_pd(rsq12,rinv12);
1071 /* EWALD ELECTROSTATICS */
1073 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1074 ewrt = _mm_mul_pd(r12,ewtabscale);
1075 ewitab = _mm_cvttpd_epi32(ewrt);
1076 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1077 ewitab = _mm_slli_epi32(ewitab,2);
1078 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1079 ewtabD = _mm_setzero_pd();
1080 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1081 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1082 ewtabFn = _mm_setzero_pd();
1083 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1084 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1085 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1086 velec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_sub_pd(rinv12,sh_ewald),velec));
1087 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1089 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
1091 /* Update potential sum for this i atom from the interaction with this j atom. */
1092 velec = _mm_and_pd(velec,cutoff_mask);
1093 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1094 velecsum = _mm_add_pd(velecsum,velec);
1096 fscal = felec;
1098 fscal = _mm_and_pd(fscal,cutoff_mask);
1100 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1102 /* Calculate temporary vectorial force */
1103 tx = _mm_mul_pd(fscal,dx12);
1104 ty = _mm_mul_pd(fscal,dy12);
1105 tz = _mm_mul_pd(fscal,dz12);
1107 /* Update vectorial force */
1108 fix1 = _mm_add_pd(fix1,tx);
1109 fiy1 = _mm_add_pd(fiy1,ty);
1110 fiz1 = _mm_add_pd(fiz1,tz);
1112 fjx2 = _mm_add_pd(fjx2,tx);
1113 fjy2 = _mm_add_pd(fjy2,ty);
1114 fjz2 = _mm_add_pd(fjz2,tz);
1118 /**************************
1119 * CALCULATE INTERACTIONS *
1120 **************************/
1122 if (gmx_mm_any_lt(rsq13,rcutoff2))
1125 r13 = _mm_mul_pd(rsq13,rinv13);
1127 /* EWALD ELECTROSTATICS */
1129 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1130 ewrt = _mm_mul_pd(r13,ewtabscale);
1131 ewitab = _mm_cvttpd_epi32(ewrt);
1132 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1133 ewitab = _mm_slli_epi32(ewitab,2);
1134 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1135 ewtabD = _mm_setzero_pd();
1136 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1137 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1138 ewtabFn = _mm_setzero_pd();
1139 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1140 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1141 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1142 velec = _mm_mul_pd(qq13,_mm_sub_pd(_mm_sub_pd(rinv13,sh_ewald),velec));
1143 felec = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
1145 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
1147 /* Update potential sum for this i atom from the interaction with this j atom. */
1148 velec = _mm_and_pd(velec,cutoff_mask);
1149 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1150 velecsum = _mm_add_pd(velecsum,velec);
1152 fscal = felec;
1154 fscal = _mm_and_pd(fscal,cutoff_mask);
1156 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1158 /* Calculate temporary vectorial force */
1159 tx = _mm_mul_pd(fscal,dx13);
1160 ty = _mm_mul_pd(fscal,dy13);
1161 tz = _mm_mul_pd(fscal,dz13);
1163 /* Update vectorial force */
1164 fix1 = _mm_add_pd(fix1,tx);
1165 fiy1 = _mm_add_pd(fiy1,ty);
1166 fiz1 = _mm_add_pd(fiz1,tz);
1168 fjx3 = _mm_add_pd(fjx3,tx);
1169 fjy3 = _mm_add_pd(fjy3,ty);
1170 fjz3 = _mm_add_pd(fjz3,tz);
1174 /**************************
1175 * CALCULATE INTERACTIONS *
1176 **************************/
1178 if (gmx_mm_any_lt(rsq21,rcutoff2))
1181 r21 = _mm_mul_pd(rsq21,rinv21);
1183 /* EWALD ELECTROSTATICS */
1185 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1186 ewrt = _mm_mul_pd(r21,ewtabscale);
1187 ewitab = _mm_cvttpd_epi32(ewrt);
1188 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1189 ewitab = _mm_slli_epi32(ewitab,2);
1190 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1191 ewtabD = _mm_setzero_pd();
1192 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1193 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1194 ewtabFn = _mm_setzero_pd();
1195 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1196 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1197 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1198 velec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_sub_pd(rinv21,sh_ewald),velec));
1199 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1201 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
1203 /* Update potential sum for this i atom from the interaction with this j atom. */
1204 velec = _mm_and_pd(velec,cutoff_mask);
1205 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1206 velecsum = _mm_add_pd(velecsum,velec);
1208 fscal = felec;
1210 fscal = _mm_and_pd(fscal,cutoff_mask);
1212 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1214 /* Calculate temporary vectorial force */
1215 tx = _mm_mul_pd(fscal,dx21);
1216 ty = _mm_mul_pd(fscal,dy21);
1217 tz = _mm_mul_pd(fscal,dz21);
1219 /* Update vectorial force */
1220 fix2 = _mm_add_pd(fix2,tx);
1221 fiy2 = _mm_add_pd(fiy2,ty);
1222 fiz2 = _mm_add_pd(fiz2,tz);
1224 fjx1 = _mm_add_pd(fjx1,tx);
1225 fjy1 = _mm_add_pd(fjy1,ty);
1226 fjz1 = _mm_add_pd(fjz1,tz);
1230 /**************************
1231 * CALCULATE INTERACTIONS *
1232 **************************/
1234 if (gmx_mm_any_lt(rsq22,rcutoff2))
1237 r22 = _mm_mul_pd(rsq22,rinv22);
1239 /* EWALD ELECTROSTATICS */
1241 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1242 ewrt = _mm_mul_pd(r22,ewtabscale);
1243 ewitab = _mm_cvttpd_epi32(ewrt);
1244 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1245 ewitab = _mm_slli_epi32(ewitab,2);
1246 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1247 ewtabD = _mm_setzero_pd();
1248 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1249 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1250 ewtabFn = _mm_setzero_pd();
1251 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1252 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1253 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1254 velec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_sub_pd(rinv22,sh_ewald),velec));
1255 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1257 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
1259 /* Update potential sum for this i atom from the interaction with this j atom. */
1260 velec = _mm_and_pd(velec,cutoff_mask);
1261 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1262 velecsum = _mm_add_pd(velecsum,velec);
1264 fscal = felec;
1266 fscal = _mm_and_pd(fscal,cutoff_mask);
1268 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1270 /* Calculate temporary vectorial force */
1271 tx = _mm_mul_pd(fscal,dx22);
1272 ty = _mm_mul_pd(fscal,dy22);
1273 tz = _mm_mul_pd(fscal,dz22);
1275 /* Update vectorial force */
1276 fix2 = _mm_add_pd(fix2,tx);
1277 fiy2 = _mm_add_pd(fiy2,ty);
1278 fiz2 = _mm_add_pd(fiz2,tz);
1280 fjx2 = _mm_add_pd(fjx2,tx);
1281 fjy2 = _mm_add_pd(fjy2,ty);
1282 fjz2 = _mm_add_pd(fjz2,tz);
1286 /**************************
1287 * CALCULATE INTERACTIONS *
1288 **************************/
1290 if (gmx_mm_any_lt(rsq23,rcutoff2))
1293 r23 = _mm_mul_pd(rsq23,rinv23);
1295 /* EWALD ELECTROSTATICS */
1297 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1298 ewrt = _mm_mul_pd(r23,ewtabscale);
1299 ewitab = _mm_cvttpd_epi32(ewrt);
1300 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1301 ewitab = _mm_slli_epi32(ewitab,2);
1302 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1303 ewtabD = _mm_setzero_pd();
1304 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1305 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1306 ewtabFn = _mm_setzero_pd();
1307 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1308 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1309 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1310 velec = _mm_mul_pd(qq23,_mm_sub_pd(_mm_sub_pd(rinv23,sh_ewald),velec));
1311 felec = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
1313 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
1315 /* Update potential sum for this i atom from the interaction with this j atom. */
1316 velec = _mm_and_pd(velec,cutoff_mask);
1317 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1318 velecsum = _mm_add_pd(velecsum,velec);
1320 fscal = felec;
1322 fscal = _mm_and_pd(fscal,cutoff_mask);
1324 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1326 /* Calculate temporary vectorial force */
1327 tx = _mm_mul_pd(fscal,dx23);
1328 ty = _mm_mul_pd(fscal,dy23);
1329 tz = _mm_mul_pd(fscal,dz23);
1331 /* Update vectorial force */
1332 fix2 = _mm_add_pd(fix2,tx);
1333 fiy2 = _mm_add_pd(fiy2,ty);
1334 fiz2 = _mm_add_pd(fiz2,tz);
1336 fjx3 = _mm_add_pd(fjx3,tx);
1337 fjy3 = _mm_add_pd(fjy3,ty);
1338 fjz3 = _mm_add_pd(fjz3,tz);
1342 /**************************
1343 * CALCULATE INTERACTIONS *
1344 **************************/
1346 if (gmx_mm_any_lt(rsq31,rcutoff2))
1349 r31 = _mm_mul_pd(rsq31,rinv31);
1351 /* EWALD ELECTROSTATICS */
1353 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1354 ewrt = _mm_mul_pd(r31,ewtabscale);
1355 ewitab = _mm_cvttpd_epi32(ewrt);
1356 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1357 ewitab = _mm_slli_epi32(ewitab,2);
1358 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1359 ewtabD = _mm_setzero_pd();
1360 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1361 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1362 ewtabFn = _mm_setzero_pd();
1363 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1364 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1365 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1366 velec = _mm_mul_pd(qq31,_mm_sub_pd(_mm_sub_pd(rinv31,sh_ewald),velec));
1367 felec = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
1369 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
1371 /* Update potential sum for this i atom from the interaction with this j atom. */
1372 velec = _mm_and_pd(velec,cutoff_mask);
1373 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1374 velecsum = _mm_add_pd(velecsum,velec);
1376 fscal = felec;
1378 fscal = _mm_and_pd(fscal,cutoff_mask);
1380 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1382 /* Calculate temporary vectorial force */
1383 tx = _mm_mul_pd(fscal,dx31);
1384 ty = _mm_mul_pd(fscal,dy31);
1385 tz = _mm_mul_pd(fscal,dz31);
1387 /* Update vectorial force */
1388 fix3 = _mm_add_pd(fix3,tx);
1389 fiy3 = _mm_add_pd(fiy3,ty);
1390 fiz3 = _mm_add_pd(fiz3,tz);
1392 fjx1 = _mm_add_pd(fjx1,tx);
1393 fjy1 = _mm_add_pd(fjy1,ty);
1394 fjz1 = _mm_add_pd(fjz1,tz);
1398 /**************************
1399 * CALCULATE INTERACTIONS *
1400 **************************/
1402 if (gmx_mm_any_lt(rsq32,rcutoff2))
1405 r32 = _mm_mul_pd(rsq32,rinv32);
1407 /* EWALD ELECTROSTATICS */
1409 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1410 ewrt = _mm_mul_pd(r32,ewtabscale);
1411 ewitab = _mm_cvttpd_epi32(ewrt);
1412 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1413 ewitab = _mm_slli_epi32(ewitab,2);
1414 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1415 ewtabD = _mm_setzero_pd();
1416 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1417 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1418 ewtabFn = _mm_setzero_pd();
1419 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1420 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1421 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1422 velec = _mm_mul_pd(qq32,_mm_sub_pd(_mm_sub_pd(rinv32,sh_ewald),velec));
1423 felec = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
1425 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
1427 /* Update potential sum for this i atom from the interaction with this j atom. */
1428 velec = _mm_and_pd(velec,cutoff_mask);
1429 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1430 velecsum = _mm_add_pd(velecsum,velec);
1432 fscal = felec;
1434 fscal = _mm_and_pd(fscal,cutoff_mask);
1436 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1438 /* Calculate temporary vectorial force */
1439 tx = _mm_mul_pd(fscal,dx32);
1440 ty = _mm_mul_pd(fscal,dy32);
1441 tz = _mm_mul_pd(fscal,dz32);
1443 /* Update vectorial force */
1444 fix3 = _mm_add_pd(fix3,tx);
1445 fiy3 = _mm_add_pd(fiy3,ty);
1446 fiz3 = _mm_add_pd(fiz3,tz);
1448 fjx2 = _mm_add_pd(fjx2,tx);
1449 fjy2 = _mm_add_pd(fjy2,ty);
1450 fjz2 = _mm_add_pd(fjz2,tz);
1454 /**************************
1455 * CALCULATE INTERACTIONS *
1456 **************************/
1458 if (gmx_mm_any_lt(rsq33,rcutoff2))
1461 r33 = _mm_mul_pd(rsq33,rinv33);
1463 /* EWALD ELECTROSTATICS */
1465 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1466 ewrt = _mm_mul_pd(r33,ewtabscale);
1467 ewitab = _mm_cvttpd_epi32(ewrt);
1468 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1469 ewitab = _mm_slli_epi32(ewitab,2);
1470 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1471 ewtabD = _mm_setzero_pd();
1472 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1473 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1474 ewtabFn = _mm_setzero_pd();
1475 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1476 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1477 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1478 velec = _mm_mul_pd(qq33,_mm_sub_pd(_mm_sub_pd(rinv33,sh_ewald),velec));
1479 felec = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
1481 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
1483 /* Update potential sum for this i atom from the interaction with this j atom. */
1484 velec = _mm_and_pd(velec,cutoff_mask);
1485 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1486 velecsum = _mm_add_pd(velecsum,velec);
1488 fscal = felec;
1490 fscal = _mm_and_pd(fscal,cutoff_mask);
1492 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1494 /* Calculate temporary vectorial force */
1495 tx = _mm_mul_pd(fscal,dx33);
1496 ty = _mm_mul_pd(fscal,dy33);
1497 tz = _mm_mul_pd(fscal,dz33);
1499 /* Update vectorial force */
1500 fix3 = _mm_add_pd(fix3,tx);
1501 fiy3 = _mm_add_pd(fiy3,ty);
1502 fiz3 = _mm_add_pd(fiz3,tz);
1504 fjx3 = _mm_add_pd(fjx3,tx);
1505 fjy3 = _mm_add_pd(fjy3,ty);
1506 fjz3 = _mm_add_pd(fjz3,tz);
1510 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1512 /* Inner loop uses 478 flops */
1515 /* End of innermost loop */
1517 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1518 f+i_coord_offset,fshift+i_shift_offset);
1520 ggid = gid[iidx];
1521 /* Update potential energies */
1522 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1523 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1525 /* Increment number of inner iterations */
1526 inneriter += j_index_end - j_index_start;
1528 /* Outer loop uses 26 flops */
1531 /* Increment number of outer iterations */
1532 outeriter += nri;
1534 /* Update outer/inner flops */
1536 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*478);
1539 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_sse4_1_double
1540 * Electrostatics interaction: Ewald
1541 * VdW interaction: LJEwald
1542 * Geometry: Water4-Water4
1543 * Calculate force/pot: Force
1545 void
1546 nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_sse4_1_double
1547 (t_nblist * gmx_restrict nlist,
1548 rvec * gmx_restrict xx,
1549 rvec * gmx_restrict ff,
1550 t_forcerec * gmx_restrict fr,
1551 t_mdatoms * gmx_restrict mdatoms,
1552 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1553 t_nrnb * gmx_restrict nrnb)
1555 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1556 * just 0 for non-waters.
1557 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1558 * jnr indices corresponding to data put in the four positions in the SIMD register.
1560 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1561 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1562 int jnrA,jnrB;
1563 int j_coord_offsetA,j_coord_offsetB;
1564 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1565 real rcutoff_scalar;
1566 real *shiftvec,*fshift,*x,*f;
1567 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1568 int vdwioffset0;
1569 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1570 int vdwioffset1;
1571 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1572 int vdwioffset2;
1573 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1574 int vdwioffset3;
1575 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1576 int vdwjidx0A,vdwjidx0B;
1577 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1578 int vdwjidx1A,vdwjidx1B;
1579 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1580 int vdwjidx2A,vdwjidx2B;
1581 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1582 int vdwjidx3A,vdwjidx3B;
1583 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1584 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1585 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1586 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1587 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1588 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1589 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1590 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1591 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1592 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1593 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1594 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1595 real *charge;
1596 int nvdwtype;
1597 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1598 int *vdwtype;
1599 real *vdwparam;
1600 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1601 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1602 __m128d c6grid_00;
1603 __m128d c6grid_11;
1604 __m128d c6grid_12;
1605 __m128d c6grid_13;
1606 __m128d c6grid_21;
1607 __m128d c6grid_22;
1608 __m128d c6grid_23;
1609 __m128d c6grid_31;
1610 __m128d c6grid_32;
1611 __m128d c6grid_33;
1612 __m128d ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
1613 real *vdwgridparam;
1614 __m128d one_half = _mm_set1_pd(0.5);
1615 __m128d minus_one = _mm_set1_pd(-1.0);
1616 __m128i ewitab;
1617 __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1618 real *ewtab;
1619 __m128d dummy_mask,cutoff_mask;
1620 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1621 __m128d one = _mm_set1_pd(1.0);
1622 __m128d two = _mm_set1_pd(2.0);
1623 x = xx[0];
1624 f = ff[0];
1626 nri = nlist->nri;
1627 iinr = nlist->iinr;
1628 jindex = nlist->jindex;
1629 jjnr = nlist->jjnr;
1630 shiftidx = nlist->shift;
1631 gid = nlist->gid;
1632 shiftvec = fr->shift_vec[0];
1633 fshift = fr->fshift[0];
1634 facel = _mm_set1_pd(fr->epsfac);
1635 charge = mdatoms->chargeA;
1636 nvdwtype = fr->ntype;
1637 vdwparam = fr->nbfp;
1638 vdwtype = mdatoms->typeA;
1639 vdwgridparam = fr->ljpme_c6grid;
1640 sh_lj_ewald = _mm_set1_pd(fr->ic->sh_lj_ewald);
1641 ewclj = _mm_set1_pd(fr->ewaldcoeff_lj);
1642 ewclj2 = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
1644 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
1645 ewtab = fr->ic->tabq_coul_F;
1646 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
1647 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
1649 /* Setup water-specific parameters */
1650 inr = nlist->iinr[0];
1651 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1652 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1653 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
1654 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1656 jq1 = _mm_set1_pd(charge[inr+1]);
1657 jq2 = _mm_set1_pd(charge[inr+2]);
1658 jq3 = _mm_set1_pd(charge[inr+3]);
1659 vdwjidx0A = 2*vdwtype[inr+0];
1660 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1661 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1662 c6grid_00 = _mm_set1_pd(vdwgridparam[vdwioffset0+vdwjidx0A]);
1663 qq11 = _mm_mul_pd(iq1,jq1);
1664 qq12 = _mm_mul_pd(iq1,jq2);
1665 qq13 = _mm_mul_pd(iq1,jq3);
1666 qq21 = _mm_mul_pd(iq2,jq1);
1667 qq22 = _mm_mul_pd(iq2,jq2);
1668 qq23 = _mm_mul_pd(iq2,jq3);
1669 qq31 = _mm_mul_pd(iq3,jq1);
1670 qq32 = _mm_mul_pd(iq3,jq2);
1671 qq33 = _mm_mul_pd(iq3,jq3);
1673 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1674 rcutoff_scalar = fr->rcoulomb;
1675 rcutoff = _mm_set1_pd(rcutoff_scalar);
1676 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
1678 sh_vdw_invrcut6 = _mm_set1_pd(fr->ic->sh_invrc6);
1679 rvdw = _mm_set1_pd(fr->rvdw);
1681 /* Avoid stupid compiler warnings */
1682 jnrA = jnrB = 0;
1683 j_coord_offsetA = 0;
1684 j_coord_offsetB = 0;
1686 outeriter = 0;
1687 inneriter = 0;
1689 /* Start outer loop over neighborlists */
1690 for(iidx=0; iidx<nri; iidx++)
1692 /* Load shift vector for this list */
1693 i_shift_offset = DIM*shiftidx[iidx];
1695 /* Load limits for loop over neighbors */
1696 j_index_start = jindex[iidx];
1697 j_index_end = jindex[iidx+1];
1699 /* Get outer coordinate index */
1700 inr = iinr[iidx];
1701 i_coord_offset = DIM*inr;
1703 /* Load i particle coords and add shift vector */
1704 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1705 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1707 fix0 = _mm_setzero_pd();
1708 fiy0 = _mm_setzero_pd();
1709 fiz0 = _mm_setzero_pd();
1710 fix1 = _mm_setzero_pd();
1711 fiy1 = _mm_setzero_pd();
1712 fiz1 = _mm_setzero_pd();
1713 fix2 = _mm_setzero_pd();
1714 fiy2 = _mm_setzero_pd();
1715 fiz2 = _mm_setzero_pd();
1716 fix3 = _mm_setzero_pd();
1717 fiy3 = _mm_setzero_pd();
1718 fiz3 = _mm_setzero_pd();
1720 /* Start inner kernel loop */
1721 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1724 /* Get j neighbor index, and coordinate index */
1725 jnrA = jjnr[jidx];
1726 jnrB = jjnr[jidx+1];
1727 j_coord_offsetA = DIM*jnrA;
1728 j_coord_offsetB = DIM*jnrB;
1730 /* load j atom coordinates */
1731 gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1732 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1733 &jy2,&jz2,&jx3,&jy3,&jz3);
1735 /* Calculate displacement vector */
1736 dx00 = _mm_sub_pd(ix0,jx0);
1737 dy00 = _mm_sub_pd(iy0,jy0);
1738 dz00 = _mm_sub_pd(iz0,jz0);
1739 dx11 = _mm_sub_pd(ix1,jx1);
1740 dy11 = _mm_sub_pd(iy1,jy1);
1741 dz11 = _mm_sub_pd(iz1,jz1);
1742 dx12 = _mm_sub_pd(ix1,jx2);
1743 dy12 = _mm_sub_pd(iy1,jy2);
1744 dz12 = _mm_sub_pd(iz1,jz2);
1745 dx13 = _mm_sub_pd(ix1,jx3);
1746 dy13 = _mm_sub_pd(iy1,jy3);
1747 dz13 = _mm_sub_pd(iz1,jz3);
1748 dx21 = _mm_sub_pd(ix2,jx1);
1749 dy21 = _mm_sub_pd(iy2,jy1);
1750 dz21 = _mm_sub_pd(iz2,jz1);
1751 dx22 = _mm_sub_pd(ix2,jx2);
1752 dy22 = _mm_sub_pd(iy2,jy2);
1753 dz22 = _mm_sub_pd(iz2,jz2);
1754 dx23 = _mm_sub_pd(ix2,jx3);
1755 dy23 = _mm_sub_pd(iy2,jy3);
1756 dz23 = _mm_sub_pd(iz2,jz3);
1757 dx31 = _mm_sub_pd(ix3,jx1);
1758 dy31 = _mm_sub_pd(iy3,jy1);
1759 dz31 = _mm_sub_pd(iz3,jz1);
1760 dx32 = _mm_sub_pd(ix3,jx2);
1761 dy32 = _mm_sub_pd(iy3,jy2);
1762 dz32 = _mm_sub_pd(iz3,jz2);
1763 dx33 = _mm_sub_pd(ix3,jx3);
1764 dy33 = _mm_sub_pd(iy3,jy3);
1765 dz33 = _mm_sub_pd(iz3,jz3);
1767 /* Calculate squared distance and things based on it */
1768 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1769 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1770 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1771 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1772 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1773 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1774 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1775 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1776 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1777 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1779 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1780 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1781 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1782 rinv13 = gmx_mm_invsqrt_pd(rsq13);
1783 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1784 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1785 rinv23 = gmx_mm_invsqrt_pd(rsq23);
1786 rinv31 = gmx_mm_invsqrt_pd(rsq31);
1787 rinv32 = gmx_mm_invsqrt_pd(rsq32);
1788 rinv33 = gmx_mm_invsqrt_pd(rsq33);
1790 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1791 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1792 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1793 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
1794 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1795 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1796 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
1797 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
1798 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
1799 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
1801 fjx0 = _mm_setzero_pd();
1802 fjy0 = _mm_setzero_pd();
1803 fjz0 = _mm_setzero_pd();
1804 fjx1 = _mm_setzero_pd();
1805 fjy1 = _mm_setzero_pd();
1806 fjz1 = _mm_setzero_pd();
1807 fjx2 = _mm_setzero_pd();
1808 fjy2 = _mm_setzero_pd();
1809 fjz2 = _mm_setzero_pd();
1810 fjx3 = _mm_setzero_pd();
1811 fjy3 = _mm_setzero_pd();
1812 fjz3 = _mm_setzero_pd();
1814 /**************************
1815 * CALCULATE INTERACTIONS *
1816 **************************/
1818 if (gmx_mm_any_lt(rsq00,rcutoff2))
1821 r00 = _mm_mul_pd(rsq00,rinv00);
1823 /* Analytical LJ-PME */
1824 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1825 ewcljrsq = _mm_mul_pd(ewclj2,rsq00);
1826 ewclj6 = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
1827 exponent = gmx_simd_exp_d(ewcljrsq);
1828 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1829 poly = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
1830 /* f6A = 6 * C6grid * (1 - poly) */
1831 f6A = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
1832 /* f6B = C6grid * exponent * beta^6 */
1833 f6B = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
1834 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1835 fvdw = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
1837 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
1839 fscal = fvdw;
1841 fscal = _mm_and_pd(fscal,cutoff_mask);
1843 /* Calculate temporary vectorial force */
1844 tx = _mm_mul_pd(fscal,dx00);
1845 ty = _mm_mul_pd(fscal,dy00);
1846 tz = _mm_mul_pd(fscal,dz00);
1848 /* Update vectorial force */
1849 fix0 = _mm_add_pd(fix0,tx);
1850 fiy0 = _mm_add_pd(fiy0,ty);
1851 fiz0 = _mm_add_pd(fiz0,tz);
1853 fjx0 = _mm_add_pd(fjx0,tx);
1854 fjy0 = _mm_add_pd(fjy0,ty);
1855 fjz0 = _mm_add_pd(fjz0,tz);
1859 /**************************
1860 * CALCULATE INTERACTIONS *
1861 **************************/
1863 if (gmx_mm_any_lt(rsq11,rcutoff2))
1866 r11 = _mm_mul_pd(rsq11,rinv11);
1868 /* EWALD ELECTROSTATICS */
1870 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1871 ewrt = _mm_mul_pd(r11,ewtabscale);
1872 ewitab = _mm_cvttpd_epi32(ewrt);
1873 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1874 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1875 &ewtabF,&ewtabFn);
1876 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1877 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1879 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
1881 fscal = felec;
1883 fscal = _mm_and_pd(fscal,cutoff_mask);
1885 /* Calculate temporary vectorial force */
1886 tx = _mm_mul_pd(fscal,dx11);
1887 ty = _mm_mul_pd(fscal,dy11);
1888 tz = _mm_mul_pd(fscal,dz11);
1890 /* Update vectorial force */
1891 fix1 = _mm_add_pd(fix1,tx);
1892 fiy1 = _mm_add_pd(fiy1,ty);
1893 fiz1 = _mm_add_pd(fiz1,tz);
1895 fjx1 = _mm_add_pd(fjx1,tx);
1896 fjy1 = _mm_add_pd(fjy1,ty);
1897 fjz1 = _mm_add_pd(fjz1,tz);
1901 /**************************
1902 * CALCULATE INTERACTIONS *
1903 **************************/
1905 if (gmx_mm_any_lt(rsq12,rcutoff2))
1908 r12 = _mm_mul_pd(rsq12,rinv12);
1910 /* EWALD ELECTROSTATICS */
1912 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1913 ewrt = _mm_mul_pd(r12,ewtabscale);
1914 ewitab = _mm_cvttpd_epi32(ewrt);
1915 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1916 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1917 &ewtabF,&ewtabFn);
1918 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1919 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1921 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
1923 fscal = felec;
1925 fscal = _mm_and_pd(fscal,cutoff_mask);
1927 /* Calculate temporary vectorial force */
1928 tx = _mm_mul_pd(fscal,dx12);
1929 ty = _mm_mul_pd(fscal,dy12);
1930 tz = _mm_mul_pd(fscal,dz12);
1932 /* Update vectorial force */
1933 fix1 = _mm_add_pd(fix1,tx);
1934 fiy1 = _mm_add_pd(fiy1,ty);
1935 fiz1 = _mm_add_pd(fiz1,tz);
1937 fjx2 = _mm_add_pd(fjx2,tx);
1938 fjy2 = _mm_add_pd(fjy2,ty);
1939 fjz2 = _mm_add_pd(fjz2,tz);
1943 /**************************
1944 * CALCULATE INTERACTIONS *
1945 **************************/
1947 if (gmx_mm_any_lt(rsq13,rcutoff2))
1950 r13 = _mm_mul_pd(rsq13,rinv13);
1952 /* EWALD ELECTROSTATICS */
1954 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1955 ewrt = _mm_mul_pd(r13,ewtabscale);
1956 ewitab = _mm_cvttpd_epi32(ewrt);
1957 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1958 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1959 &ewtabF,&ewtabFn);
1960 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1961 felec = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
1963 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
1965 fscal = felec;
1967 fscal = _mm_and_pd(fscal,cutoff_mask);
1969 /* Calculate temporary vectorial force */
1970 tx = _mm_mul_pd(fscal,dx13);
1971 ty = _mm_mul_pd(fscal,dy13);
1972 tz = _mm_mul_pd(fscal,dz13);
1974 /* Update vectorial force */
1975 fix1 = _mm_add_pd(fix1,tx);
1976 fiy1 = _mm_add_pd(fiy1,ty);
1977 fiz1 = _mm_add_pd(fiz1,tz);
1979 fjx3 = _mm_add_pd(fjx3,tx);
1980 fjy3 = _mm_add_pd(fjy3,ty);
1981 fjz3 = _mm_add_pd(fjz3,tz);
1985 /**************************
1986 * CALCULATE INTERACTIONS *
1987 **************************/
1989 if (gmx_mm_any_lt(rsq21,rcutoff2))
1992 r21 = _mm_mul_pd(rsq21,rinv21);
1994 /* EWALD ELECTROSTATICS */
1996 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1997 ewrt = _mm_mul_pd(r21,ewtabscale);
1998 ewitab = _mm_cvttpd_epi32(ewrt);
1999 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2000 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2001 &ewtabF,&ewtabFn);
2002 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2003 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2005 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
2007 fscal = felec;
2009 fscal = _mm_and_pd(fscal,cutoff_mask);
2011 /* Calculate temporary vectorial force */
2012 tx = _mm_mul_pd(fscal,dx21);
2013 ty = _mm_mul_pd(fscal,dy21);
2014 tz = _mm_mul_pd(fscal,dz21);
2016 /* Update vectorial force */
2017 fix2 = _mm_add_pd(fix2,tx);
2018 fiy2 = _mm_add_pd(fiy2,ty);
2019 fiz2 = _mm_add_pd(fiz2,tz);
2021 fjx1 = _mm_add_pd(fjx1,tx);
2022 fjy1 = _mm_add_pd(fjy1,ty);
2023 fjz1 = _mm_add_pd(fjz1,tz);
2027 /**************************
2028 * CALCULATE INTERACTIONS *
2029 **************************/
2031 if (gmx_mm_any_lt(rsq22,rcutoff2))
2034 r22 = _mm_mul_pd(rsq22,rinv22);
2036 /* EWALD ELECTROSTATICS */
2038 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2039 ewrt = _mm_mul_pd(r22,ewtabscale);
2040 ewitab = _mm_cvttpd_epi32(ewrt);
2041 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2042 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2043 &ewtabF,&ewtabFn);
2044 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2045 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2047 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
2049 fscal = felec;
2051 fscal = _mm_and_pd(fscal,cutoff_mask);
2053 /* Calculate temporary vectorial force */
2054 tx = _mm_mul_pd(fscal,dx22);
2055 ty = _mm_mul_pd(fscal,dy22);
2056 tz = _mm_mul_pd(fscal,dz22);
2058 /* Update vectorial force */
2059 fix2 = _mm_add_pd(fix2,tx);
2060 fiy2 = _mm_add_pd(fiy2,ty);
2061 fiz2 = _mm_add_pd(fiz2,tz);
2063 fjx2 = _mm_add_pd(fjx2,tx);
2064 fjy2 = _mm_add_pd(fjy2,ty);
2065 fjz2 = _mm_add_pd(fjz2,tz);
2069 /**************************
2070 * CALCULATE INTERACTIONS *
2071 **************************/
2073 if (gmx_mm_any_lt(rsq23,rcutoff2))
2076 r23 = _mm_mul_pd(rsq23,rinv23);
2078 /* EWALD ELECTROSTATICS */
2080 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2081 ewrt = _mm_mul_pd(r23,ewtabscale);
2082 ewitab = _mm_cvttpd_epi32(ewrt);
2083 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2084 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2085 &ewtabF,&ewtabFn);
2086 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2087 felec = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
2089 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
2091 fscal = felec;
2093 fscal = _mm_and_pd(fscal,cutoff_mask);
2095 /* Calculate temporary vectorial force */
2096 tx = _mm_mul_pd(fscal,dx23);
2097 ty = _mm_mul_pd(fscal,dy23);
2098 tz = _mm_mul_pd(fscal,dz23);
2100 /* Update vectorial force */
2101 fix2 = _mm_add_pd(fix2,tx);
2102 fiy2 = _mm_add_pd(fiy2,ty);
2103 fiz2 = _mm_add_pd(fiz2,tz);
2105 fjx3 = _mm_add_pd(fjx3,tx);
2106 fjy3 = _mm_add_pd(fjy3,ty);
2107 fjz3 = _mm_add_pd(fjz3,tz);
2111 /**************************
2112 * CALCULATE INTERACTIONS *
2113 **************************/
2115 if (gmx_mm_any_lt(rsq31,rcutoff2))
2118 r31 = _mm_mul_pd(rsq31,rinv31);
2120 /* EWALD ELECTROSTATICS */
2122 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2123 ewrt = _mm_mul_pd(r31,ewtabscale);
2124 ewitab = _mm_cvttpd_epi32(ewrt);
2125 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2126 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2127 &ewtabF,&ewtabFn);
2128 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2129 felec = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
2131 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
2133 fscal = felec;
2135 fscal = _mm_and_pd(fscal,cutoff_mask);
2137 /* Calculate temporary vectorial force */
2138 tx = _mm_mul_pd(fscal,dx31);
2139 ty = _mm_mul_pd(fscal,dy31);
2140 tz = _mm_mul_pd(fscal,dz31);
2142 /* Update vectorial force */
2143 fix3 = _mm_add_pd(fix3,tx);
2144 fiy3 = _mm_add_pd(fiy3,ty);
2145 fiz3 = _mm_add_pd(fiz3,tz);
2147 fjx1 = _mm_add_pd(fjx1,tx);
2148 fjy1 = _mm_add_pd(fjy1,ty);
2149 fjz1 = _mm_add_pd(fjz1,tz);
2153 /**************************
2154 * CALCULATE INTERACTIONS *
2155 **************************/
2157 if (gmx_mm_any_lt(rsq32,rcutoff2))
2160 r32 = _mm_mul_pd(rsq32,rinv32);
2162 /* EWALD ELECTROSTATICS */
2164 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2165 ewrt = _mm_mul_pd(r32,ewtabscale);
2166 ewitab = _mm_cvttpd_epi32(ewrt);
2167 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2168 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2169 &ewtabF,&ewtabFn);
2170 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2171 felec = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
2173 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
2175 fscal = felec;
2177 fscal = _mm_and_pd(fscal,cutoff_mask);
2179 /* Calculate temporary vectorial force */
2180 tx = _mm_mul_pd(fscal,dx32);
2181 ty = _mm_mul_pd(fscal,dy32);
2182 tz = _mm_mul_pd(fscal,dz32);
2184 /* Update vectorial force */
2185 fix3 = _mm_add_pd(fix3,tx);
2186 fiy3 = _mm_add_pd(fiy3,ty);
2187 fiz3 = _mm_add_pd(fiz3,tz);
2189 fjx2 = _mm_add_pd(fjx2,tx);
2190 fjy2 = _mm_add_pd(fjy2,ty);
2191 fjz2 = _mm_add_pd(fjz2,tz);
2195 /**************************
2196 * CALCULATE INTERACTIONS *
2197 **************************/
2199 if (gmx_mm_any_lt(rsq33,rcutoff2))
2202 r33 = _mm_mul_pd(rsq33,rinv33);
2204 /* EWALD ELECTROSTATICS */
2206 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2207 ewrt = _mm_mul_pd(r33,ewtabscale);
2208 ewitab = _mm_cvttpd_epi32(ewrt);
2209 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2210 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2211 &ewtabF,&ewtabFn);
2212 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2213 felec = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
2215 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
2217 fscal = felec;
2219 fscal = _mm_and_pd(fscal,cutoff_mask);
2221 /* Calculate temporary vectorial force */
2222 tx = _mm_mul_pd(fscal,dx33);
2223 ty = _mm_mul_pd(fscal,dy33);
2224 tz = _mm_mul_pd(fscal,dz33);
2226 /* Update vectorial force */
2227 fix3 = _mm_add_pd(fix3,tx);
2228 fiy3 = _mm_add_pd(fiy3,ty);
2229 fiz3 = _mm_add_pd(fiz3,tz);
2231 fjx3 = _mm_add_pd(fjx3,tx);
2232 fjy3 = _mm_add_pd(fjy3,ty);
2233 fjz3 = _mm_add_pd(fjz3,tz);
2237 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2239 /* Inner loop uses 403 flops */
2242 if(jidx<j_index_end)
2245 jnrA = jjnr[jidx];
2246 j_coord_offsetA = DIM*jnrA;
2248 /* load j atom coordinates */
2249 gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
2250 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2251 &jy2,&jz2,&jx3,&jy3,&jz3);
2253 /* Calculate displacement vector */
2254 dx00 = _mm_sub_pd(ix0,jx0);
2255 dy00 = _mm_sub_pd(iy0,jy0);
2256 dz00 = _mm_sub_pd(iz0,jz0);
2257 dx11 = _mm_sub_pd(ix1,jx1);
2258 dy11 = _mm_sub_pd(iy1,jy1);
2259 dz11 = _mm_sub_pd(iz1,jz1);
2260 dx12 = _mm_sub_pd(ix1,jx2);
2261 dy12 = _mm_sub_pd(iy1,jy2);
2262 dz12 = _mm_sub_pd(iz1,jz2);
2263 dx13 = _mm_sub_pd(ix1,jx3);
2264 dy13 = _mm_sub_pd(iy1,jy3);
2265 dz13 = _mm_sub_pd(iz1,jz3);
2266 dx21 = _mm_sub_pd(ix2,jx1);
2267 dy21 = _mm_sub_pd(iy2,jy1);
2268 dz21 = _mm_sub_pd(iz2,jz1);
2269 dx22 = _mm_sub_pd(ix2,jx2);
2270 dy22 = _mm_sub_pd(iy2,jy2);
2271 dz22 = _mm_sub_pd(iz2,jz2);
2272 dx23 = _mm_sub_pd(ix2,jx3);
2273 dy23 = _mm_sub_pd(iy2,jy3);
2274 dz23 = _mm_sub_pd(iz2,jz3);
2275 dx31 = _mm_sub_pd(ix3,jx1);
2276 dy31 = _mm_sub_pd(iy3,jy1);
2277 dz31 = _mm_sub_pd(iz3,jz1);
2278 dx32 = _mm_sub_pd(ix3,jx2);
2279 dy32 = _mm_sub_pd(iy3,jy2);
2280 dz32 = _mm_sub_pd(iz3,jz2);
2281 dx33 = _mm_sub_pd(ix3,jx3);
2282 dy33 = _mm_sub_pd(iy3,jy3);
2283 dz33 = _mm_sub_pd(iz3,jz3);
2285 /* Calculate squared distance and things based on it */
2286 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
2287 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
2288 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
2289 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
2290 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
2291 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
2292 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
2293 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
2294 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
2295 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
2297 rinv00 = gmx_mm_invsqrt_pd(rsq00);
2298 rinv11 = gmx_mm_invsqrt_pd(rsq11);
2299 rinv12 = gmx_mm_invsqrt_pd(rsq12);
2300 rinv13 = gmx_mm_invsqrt_pd(rsq13);
2301 rinv21 = gmx_mm_invsqrt_pd(rsq21);
2302 rinv22 = gmx_mm_invsqrt_pd(rsq22);
2303 rinv23 = gmx_mm_invsqrt_pd(rsq23);
2304 rinv31 = gmx_mm_invsqrt_pd(rsq31);
2305 rinv32 = gmx_mm_invsqrt_pd(rsq32);
2306 rinv33 = gmx_mm_invsqrt_pd(rsq33);
2308 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
2309 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
2310 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
2311 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
2312 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
2313 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
2314 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
2315 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
2316 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
2317 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
2319 fjx0 = _mm_setzero_pd();
2320 fjy0 = _mm_setzero_pd();
2321 fjz0 = _mm_setzero_pd();
2322 fjx1 = _mm_setzero_pd();
2323 fjy1 = _mm_setzero_pd();
2324 fjz1 = _mm_setzero_pd();
2325 fjx2 = _mm_setzero_pd();
2326 fjy2 = _mm_setzero_pd();
2327 fjz2 = _mm_setzero_pd();
2328 fjx3 = _mm_setzero_pd();
2329 fjy3 = _mm_setzero_pd();
2330 fjz3 = _mm_setzero_pd();
2332 /**************************
2333 * CALCULATE INTERACTIONS *
2334 **************************/
2336 if (gmx_mm_any_lt(rsq00,rcutoff2))
2339 r00 = _mm_mul_pd(rsq00,rinv00);
2341 /* Analytical LJ-PME */
2342 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2343 ewcljrsq = _mm_mul_pd(ewclj2,rsq00);
2344 ewclj6 = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
2345 exponent = gmx_simd_exp_d(ewcljrsq);
2346 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
2347 poly = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
2348 /* f6A = 6 * C6grid * (1 - poly) */
2349 f6A = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
2350 /* f6B = C6grid * exponent * beta^6 */
2351 f6B = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
2352 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
2353 fvdw = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
2355 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
2357 fscal = fvdw;
2359 fscal = _mm_and_pd(fscal,cutoff_mask);
2361 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2363 /* Calculate temporary vectorial force */
2364 tx = _mm_mul_pd(fscal,dx00);
2365 ty = _mm_mul_pd(fscal,dy00);
2366 tz = _mm_mul_pd(fscal,dz00);
2368 /* Update vectorial force */
2369 fix0 = _mm_add_pd(fix0,tx);
2370 fiy0 = _mm_add_pd(fiy0,ty);
2371 fiz0 = _mm_add_pd(fiz0,tz);
2373 fjx0 = _mm_add_pd(fjx0,tx);
2374 fjy0 = _mm_add_pd(fjy0,ty);
2375 fjz0 = _mm_add_pd(fjz0,tz);
2379 /**************************
2380 * CALCULATE INTERACTIONS *
2381 **************************/
2383 if (gmx_mm_any_lt(rsq11,rcutoff2))
2386 r11 = _mm_mul_pd(rsq11,rinv11);
2388 /* EWALD ELECTROSTATICS */
2390 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2391 ewrt = _mm_mul_pd(r11,ewtabscale);
2392 ewitab = _mm_cvttpd_epi32(ewrt);
2393 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2394 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2395 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2396 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2398 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
2400 fscal = felec;
2402 fscal = _mm_and_pd(fscal,cutoff_mask);
2404 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2406 /* Calculate temporary vectorial force */
2407 tx = _mm_mul_pd(fscal,dx11);
2408 ty = _mm_mul_pd(fscal,dy11);
2409 tz = _mm_mul_pd(fscal,dz11);
2411 /* Update vectorial force */
2412 fix1 = _mm_add_pd(fix1,tx);
2413 fiy1 = _mm_add_pd(fiy1,ty);
2414 fiz1 = _mm_add_pd(fiz1,tz);
2416 fjx1 = _mm_add_pd(fjx1,tx);
2417 fjy1 = _mm_add_pd(fjy1,ty);
2418 fjz1 = _mm_add_pd(fjz1,tz);
2422 /**************************
2423 * CALCULATE INTERACTIONS *
2424 **************************/
2426 if (gmx_mm_any_lt(rsq12,rcutoff2))
2429 r12 = _mm_mul_pd(rsq12,rinv12);
2431 /* EWALD ELECTROSTATICS */
2433 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2434 ewrt = _mm_mul_pd(r12,ewtabscale);
2435 ewitab = _mm_cvttpd_epi32(ewrt);
2436 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2437 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2438 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2439 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2441 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
2443 fscal = felec;
2445 fscal = _mm_and_pd(fscal,cutoff_mask);
2447 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2449 /* Calculate temporary vectorial force */
2450 tx = _mm_mul_pd(fscal,dx12);
2451 ty = _mm_mul_pd(fscal,dy12);
2452 tz = _mm_mul_pd(fscal,dz12);
2454 /* Update vectorial force */
2455 fix1 = _mm_add_pd(fix1,tx);
2456 fiy1 = _mm_add_pd(fiy1,ty);
2457 fiz1 = _mm_add_pd(fiz1,tz);
2459 fjx2 = _mm_add_pd(fjx2,tx);
2460 fjy2 = _mm_add_pd(fjy2,ty);
2461 fjz2 = _mm_add_pd(fjz2,tz);
2465 /**************************
2466 * CALCULATE INTERACTIONS *
2467 **************************/
2469 if (gmx_mm_any_lt(rsq13,rcutoff2))
2472 r13 = _mm_mul_pd(rsq13,rinv13);
2474 /* EWALD ELECTROSTATICS */
2476 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2477 ewrt = _mm_mul_pd(r13,ewtabscale);
2478 ewitab = _mm_cvttpd_epi32(ewrt);
2479 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2480 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2481 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2482 felec = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
2484 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
2486 fscal = felec;
2488 fscal = _mm_and_pd(fscal,cutoff_mask);
2490 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2492 /* Calculate temporary vectorial force */
2493 tx = _mm_mul_pd(fscal,dx13);
2494 ty = _mm_mul_pd(fscal,dy13);
2495 tz = _mm_mul_pd(fscal,dz13);
2497 /* Update vectorial force */
2498 fix1 = _mm_add_pd(fix1,tx);
2499 fiy1 = _mm_add_pd(fiy1,ty);
2500 fiz1 = _mm_add_pd(fiz1,tz);
2502 fjx3 = _mm_add_pd(fjx3,tx);
2503 fjy3 = _mm_add_pd(fjy3,ty);
2504 fjz3 = _mm_add_pd(fjz3,tz);
2508 /**************************
2509 * CALCULATE INTERACTIONS *
2510 **************************/
2512 if (gmx_mm_any_lt(rsq21,rcutoff2))
2515 r21 = _mm_mul_pd(rsq21,rinv21);
2517 /* EWALD ELECTROSTATICS */
2519 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2520 ewrt = _mm_mul_pd(r21,ewtabscale);
2521 ewitab = _mm_cvttpd_epi32(ewrt);
2522 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2523 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2524 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2525 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2527 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
2529 fscal = felec;
2531 fscal = _mm_and_pd(fscal,cutoff_mask);
2533 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2535 /* Calculate temporary vectorial force */
2536 tx = _mm_mul_pd(fscal,dx21);
2537 ty = _mm_mul_pd(fscal,dy21);
2538 tz = _mm_mul_pd(fscal,dz21);
2540 /* Update vectorial force */
2541 fix2 = _mm_add_pd(fix2,tx);
2542 fiy2 = _mm_add_pd(fiy2,ty);
2543 fiz2 = _mm_add_pd(fiz2,tz);
2545 fjx1 = _mm_add_pd(fjx1,tx);
2546 fjy1 = _mm_add_pd(fjy1,ty);
2547 fjz1 = _mm_add_pd(fjz1,tz);
2551 /**************************
2552 * CALCULATE INTERACTIONS *
2553 **************************/
2555 if (gmx_mm_any_lt(rsq22,rcutoff2))
2558 r22 = _mm_mul_pd(rsq22,rinv22);
2560 /* EWALD ELECTROSTATICS */
2562 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2563 ewrt = _mm_mul_pd(r22,ewtabscale);
2564 ewitab = _mm_cvttpd_epi32(ewrt);
2565 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2566 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2567 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2568 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2570 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
2572 fscal = felec;
2574 fscal = _mm_and_pd(fscal,cutoff_mask);
2576 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2578 /* Calculate temporary vectorial force */
2579 tx = _mm_mul_pd(fscal,dx22);
2580 ty = _mm_mul_pd(fscal,dy22);
2581 tz = _mm_mul_pd(fscal,dz22);
2583 /* Update vectorial force */
2584 fix2 = _mm_add_pd(fix2,tx);
2585 fiy2 = _mm_add_pd(fiy2,ty);
2586 fiz2 = _mm_add_pd(fiz2,tz);
2588 fjx2 = _mm_add_pd(fjx2,tx);
2589 fjy2 = _mm_add_pd(fjy2,ty);
2590 fjz2 = _mm_add_pd(fjz2,tz);
2594 /**************************
2595 * CALCULATE INTERACTIONS *
2596 **************************/
2598 if (gmx_mm_any_lt(rsq23,rcutoff2))
2601 r23 = _mm_mul_pd(rsq23,rinv23);
2603 /* EWALD ELECTROSTATICS */
2605 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2606 ewrt = _mm_mul_pd(r23,ewtabscale);
2607 ewitab = _mm_cvttpd_epi32(ewrt);
2608 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2609 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2610 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2611 felec = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
2613 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
2615 fscal = felec;
2617 fscal = _mm_and_pd(fscal,cutoff_mask);
2619 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2621 /* Calculate temporary vectorial force */
2622 tx = _mm_mul_pd(fscal,dx23);
2623 ty = _mm_mul_pd(fscal,dy23);
2624 tz = _mm_mul_pd(fscal,dz23);
2626 /* Update vectorial force */
2627 fix2 = _mm_add_pd(fix2,tx);
2628 fiy2 = _mm_add_pd(fiy2,ty);
2629 fiz2 = _mm_add_pd(fiz2,tz);
2631 fjx3 = _mm_add_pd(fjx3,tx);
2632 fjy3 = _mm_add_pd(fjy3,ty);
2633 fjz3 = _mm_add_pd(fjz3,tz);
2637 /**************************
2638 * CALCULATE INTERACTIONS *
2639 **************************/
2641 if (gmx_mm_any_lt(rsq31,rcutoff2))
2644 r31 = _mm_mul_pd(rsq31,rinv31);
2646 /* EWALD ELECTROSTATICS */
2648 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2649 ewrt = _mm_mul_pd(r31,ewtabscale);
2650 ewitab = _mm_cvttpd_epi32(ewrt);
2651 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2652 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2653 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2654 felec = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
2656 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
2658 fscal = felec;
2660 fscal = _mm_and_pd(fscal,cutoff_mask);
2662 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2664 /* Calculate temporary vectorial force */
2665 tx = _mm_mul_pd(fscal,dx31);
2666 ty = _mm_mul_pd(fscal,dy31);
2667 tz = _mm_mul_pd(fscal,dz31);
2669 /* Update vectorial force */
2670 fix3 = _mm_add_pd(fix3,tx);
2671 fiy3 = _mm_add_pd(fiy3,ty);
2672 fiz3 = _mm_add_pd(fiz3,tz);
2674 fjx1 = _mm_add_pd(fjx1,tx);
2675 fjy1 = _mm_add_pd(fjy1,ty);
2676 fjz1 = _mm_add_pd(fjz1,tz);
2680 /**************************
2681 * CALCULATE INTERACTIONS *
2682 **************************/
2684 if (gmx_mm_any_lt(rsq32,rcutoff2))
2687 r32 = _mm_mul_pd(rsq32,rinv32);
2689 /* EWALD ELECTROSTATICS */
2691 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2692 ewrt = _mm_mul_pd(r32,ewtabscale);
2693 ewitab = _mm_cvttpd_epi32(ewrt);
2694 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2695 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2696 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2697 felec = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
2699 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
2701 fscal = felec;
2703 fscal = _mm_and_pd(fscal,cutoff_mask);
2705 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2707 /* Calculate temporary vectorial force */
2708 tx = _mm_mul_pd(fscal,dx32);
2709 ty = _mm_mul_pd(fscal,dy32);
2710 tz = _mm_mul_pd(fscal,dz32);
2712 /* Update vectorial force */
2713 fix3 = _mm_add_pd(fix3,tx);
2714 fiy3 = _mm_add_pd(fiy3,ty);
2715 fiz3 = _mm_add_pd(fiz3,tz);
2717 fjx2 = _mm_add_pd(fjx2,tx);
2718 fjy2 = _mm_add_pd(fjy2,ty);
2719 fjz2 = _mm_add_pd(fjz2,tz);
2723 /**************************
2724 * CALCULATE INTERACTIONS *
2725 **************************/
2727 if (gmx_mm_any_lt(rsq33,rcutoff2))
2730 r33 = _mm_mul_pd(rsq33,rinv33);
2732 /* EWALD ELECTROSTATICS */
2734 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2735 ewrt = _mm_mul_pd(r33,ewtabscale);
2736 ewitab = _mm_cvttpd_epi32(ewrt);
2737 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2738 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2739 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2740 felec = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
2742 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
2744 fscal = felec;
2746 fscal = _mm_and_pd(fscal,cutoff_mask);
2748 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2750 /* Calculate temporary vectorial force */
2751 tx = _mm_mul_pd(fscal,dx33);
2752 ty = _mm_mul_pd(fscal,dy33);
2753 tz = _mm_mul_pd(fscal,dz33);
2755 /* Update vectorial force */
2756 fix3 = _mm_add_pd(fix3,tx);
2757 fiy3 = _mm_add_pd(fiy3,ty);
2758 fiz3 = _mm_add_pd(fiz3,tz);
2760 fjx3 = _mm_add_pd(fjx3,tx);
2761 fjy3 = _mm_add_pd(fjy3,ty);
2762 fjz3 = _mm_add_pd(fjz3,tz);
2766 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2768 /* Inner loop uses 403 flops */
2771 /* End of innermost loop */
2773 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2774 f+i_coord_offset,fshift+i_shift_offset);
2776 /* Increment number of inner iterations */
2777 inneriter += j_index_end - j_index_start;
2779 /* Outer loop uses 24 flops */
2782 /* Increment number of outer iterations */
2783 outeriter += nri;
2785 /* Update outer/inner flops */
2787 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*403);