Removed simple.h from nb_kernel_sse2_XX
[gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_double / nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_sse2_double.c
blob85898814322ada87363d1b1df3ec4c120062cc89
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_double kernel generator.
38 #include "gmxpre.h"
40 #include "config.h"
42 #include <math.h>
44 #include "../nb_kernel.h"
45 #include "gromacs/math/vec.h"
46 #include "gromacs/legacyheaders/nrnb.h"
48 #include "gromacs/simd/math_x86_sse2_double.h"
49 #include "kernelutil_x86_sse2_double.h"
52 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse2_double
53 * Electrostatics interaction: Ewald
54 * VdW interaction: LennardJones
55 * Geometry: Water4-Water4
56 * Calculate force/pot: PotentialAndForce
58 void
59 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse2_double
60 (t_nblist * gmx_restrict nlist,
61 rvec * gmx_restrict xx,
62 rvec * gmx_restrict ff,
63 t_forcerec * gmx_restrict fr,
64 t_mdatoms * gmx_restrict mdatoms,
65 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
66 t_nrnb * gmx_restrict nrnb)
68 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
69 * just 0 for non-waters.
70 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
71 * jnr indices corresponding to data put in the four positions in the SIMD register.
73 int i_shift_offset,i_coord_offset,outeriter,inneriter;
74 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
75 int jnrA,jnrB;
76 int j_coord_offsetA,j_coord_offsetB;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
78 real rcutoff_scalar;
79 real *shiftvec,*fshift,*x,*f;
80 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
81 int vdwioffset0;
82 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
83 int vdwioffset1;
84 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
85 int vdwioffset2;
86 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
87 int vdwioffset3;
88 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
89 int vdwjidx0A,vdwjidx0B;
90 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
91 int vdwjidx1A,vdwjidx1B;
92 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
93 int vdwjidx2A,vdwjidx2B;
94 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
95 int vdwjidx3A,vdwjidx3B;
96 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
97 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
98 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
99 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
100 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
101 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
102 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
103 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
104 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
105 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
106 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
107 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
108 real *charge;
109 int nvdwtype;
110 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
111 int *vdwtype;
112 real *vdwparam;
113 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
114 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
115 __m128i ewitab;
116 __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
117 real *ewtab;
118 __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
119 real rswitch_scalar,d_scalar;
120 __m128d dummy_mask,cutoff_mask;
121 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
122 __m128d one = _mm_set1_pd(1.0);
123 __m128d two = _mm_set1_pd(2.0);
124 x = xx[0];
125 f = ff[0];
127 nri = nlist->nri;
128 iinr = nlist->iinr;
129 jindex = nlist->jindex;
130 jjnr = nlist->jjnr;
131 shiftidx = nlist->shift;
132 gid = nlist->gid;
133 shiftvec = fr->shift_vec[0];
134 fshift = fr->fshift[0];
135 facel = _mm_set1_pd(fr->epsfac);
136 charge = mdatoms->chargeA;
137 nvdwtype = fr->ntype;
138 vdwparam = fr->nbfp;
139 vdwtype = mdatoms->typeA;
141 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
142 ewtab = fr->ic->tabq_coul_FDV0;
143 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
144 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
146 /* Setup water-specific parameters */
147 inr = nlist->iinr[0];
148 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
149 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
150 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
151 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
153 jq1 = _mm_set1_pd(charge[inr+1]);
154 jq2 = _mm_set1_pd(charge[inr+2]);
155 jq3 = _mm_set1_pd(charge[inr+3]);
156 vdwjidx0A = 2*vdwtype[inr+0];
157 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
158 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
159 qq11 = _mm_mul_pd(iq1,jq1);
160 qq12 = _mm_mul_pd(iq1,jq2);
161 qq13 = _mm_mul_pd(iq1,jq3);
162 qq21 = _mm_mul_pd(iq2,jq1);
163 qq22 = _mm_mul_pd(iq2,jq2);
164 qq23 = _mm_mul_pd(iq2,jq3);
165 qq31 = _mm_mul_pd(iq3,jq1);
166 qq32 = _mm_mul_pd(iq3,jq2);
167 qq33 = _mm_mul_pd(iq3,jq3);
169 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
170 rcutoff_scalar = fr->rcoulomb;
171 rcutoff = _mm_set1_pd(rcutoff_scalar);
172 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
174 rswitch_scalar = fr->rcoulomb_switch;
175 rswitch = _mm_set1_pd(rswitch_scalar);
176 /* Setup switch parameters */
177 d_scalar = rcutoff_scalar-rswitch_scalar;
178 d = _mm_set1_pd(d_scalar);
179 swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
180 swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
181 swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
182 swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
183 swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
184 swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
186 /* Avoid stupid compiler warnings */
187 jnrA = jnrB = 0;
188 j_coord_offsetA = 0;
189 j_coord_offsetB = 0;
191 outeriter = 0;
192 inneriter = 0;
194 /* Start outer loop over neighborlists */
195 for(iidx=0; iidx<nri; iidx++)
197 /* Load shift vector for this list */
198 i_shift_offset = DIM*shiftidx[iidx];
200 /* Load limits for loop over neighbors */
201 j_index_start = jindex[iidx];
202 j_index_end = jindex[iidx+1];
204 /* Get outer coordinate index */
205 inr = iinr[iidx];
206 i_coord_offset = DIM*inr;
208 /* Load i particle coords and add shift vector */
209 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
210 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
212 fix0 = _mm_setzero_pd();
213 fiy0 = _mm_setzero_pd();
214 fiz0 = _mm_setzero_pd();
215 fix1 = _mm_setzero_pd();
216 fiy1 = _mm_setzero_pd();
217 fiz1 = _mm_setzero_pd();
218 fix2 = _mm_setzero_pd();
219 fiy2 = _mm_setzero_pd();
220 fiz2 = _mm_setzero_pd();
221 fix3 = _mm_setzero_pd();
222 fiy3 = _mm_setzero_pd();
223 fiz3 = _mm_setzero_pd();
225 /* Reset potential sums */
226 velecsum = _mm_setzero_pd();
227 vvdwsum = _mm_setzero_pd();
229 /* Start inner kernel loop */
230 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
233 /* Get j neighbor index, and coordinate index */
234 jnrA = jjnr[jidx];
235 jnrB = jjnr[jidx+1];
236 j_coord_offsetA = DIM*jnrA;
237 j_coord_offsetB = DIM*jnrB;
239 /* load j atom coordinates */
240 gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
241 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
242 &jy2,&jz2,&jx3,&jy3,&jz3);
244 /* Calculate displacement vector */
245 dx00 = _mm_sub_pd(ix0,jx0);
246 dy00 = _mm_sub_pd(iy0,jy0);
247 dz00 = _mm_sub_pd(iz0,jz0);
248 dx11 = _mm_sub_pd(ix1,jx1);
249 dy11 = _mm_sub_pd(iy1,jy1);
250 dz11 = _mm_sub_pd(iz1,jz1);
251 dx12 = _mm_sub_pd(ix1,jx2);
252 dy12 = _mm_sub_pd(iy1,jy2);
253 dz12 = _mm_sub_pd(iz1,jz2);
254 dx13 = _mm_sub_pd(ix1,jx3);
255 dy13 = _mm_sub_pd(iy1,jy3);
256 dz13 = _mm_sub_pd(iz1,jz3);
257 dx21 = _mm_sub_pd(ix2,jx1);
258 dy21 = _mm_sub_pd(iy2,jy1);
259 dz21 = _mm_sub_pd(iz2,jz1);
260 dx22 = _mm_sub_pd(ix2,jx2);
261 dy22 = _mm_sub_pd(iy2,jy2);
262 dz22 = _mm_sub_pd(iz2,jz2);
263 dx23 = _mm_sub_pd(ix2,jx3);
264 dy23 = _mm_sub_pd(iy2,jy3);
265 dz23 = _mm_sub_pd(iz2,jz3);
266 dx31 = _mm_sub_pd(ix3,jx1);
267 dy31 = _mm_sub_pd(iy3,jy1);
268 dz31 = _mm_sub_pd(iz3,jz1);
269 dx32 = _mm_sub_pd(ix3,jx2);
270 dy32 = _mm_sub_pd(iy3,jy2);
271 dz32 = _mm_sub_pd(iz3,jz2);
272 dx33 = _mm_sub_pd(ix3,jx3);
273 dy33 = _mm_sub_pd(iy3,jy3);
274 dz33 = _mm_sub_pd(iz3,jz3);
276 /* Calculate squared distance and things based on it */
277 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
278 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
279 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
280 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
281 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
282 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
283 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
284 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
285 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
286 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
288 rinv00 = gmx_mm_invsqrt_pd(rsq00);
289 rinv11 = gmx_mm_invsqrt_pd(rsq11);
290 rinv12 = gmx_mm_invsqrt_pd(rsq12);
291 rinv13 = gmx_mm_invsqrt_pd(rsq13);
292 rinv21 = gmx_mm_invsqrt_pd(rsq21);
293 rinv22 = gmx_mm_invsqrt_pd(rsq22);
294 rinv23 = gmx_mm_invsqrt_pd(rsq23);
295 rinv31 = gmx_mm_invsqrt_pd(rsq31);
296 rinv32 = gmx_mm_invsqrt_pd(rsq32);
297 rinv33 = gmx_mm_invsqrt_pd(rsq33);
299 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
300 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
301 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
302 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
303 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
304 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
305 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
306 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
307 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
308 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
310 fjx0 = _mm_setzero_pd();
311 fjy0 = _mm_setzero_pd();
312 fjz0 = _mm_setzero_pd();
313 fjx1 = _mm_setzero_pd();
314 fjy1 = _mm_setzero_pd();
315 fjz1 = _mm_setzero_pd();
316 fjx2 = _mm_setzero_pd();
317 fjy2 = _mm_setzero_pd();
318 fjz2 = _mm_setzero_pd();
319 fjx3 = _mm_setzero_pd();
320 fjy3 = _mm_setzero_pd();
321 fjz3 = _mm_setzero_pd();
323 /**************************
324 * CALCULATE INTERACTIONS *
325 **************************/
327 if (gmx_mm_any_lt(rsq00,rcutoff2))
330 r00 = _mm_mul_pd(rsq00,rinv00);
332 /* LENNARD-JONES DISPERSION/REPULSION */
334 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
335 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
336 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
337 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
338 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
340 d = _mm_sub_pd(r00,rswitch);
341 d = _mm_max_pd(d,_mm_setzero_pd());
342 d2 = _mm_mul_pd(d,d);
343 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
345 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
347 /* Evaluate switch function */
348 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
349 fvdw = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
350 vvdw = _mm_mul_pd(vvdw,sw);
351 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
353 /* Update potential sum for this i atom from the interaction with this j atom. */
354 vvdw = _mm_and_pd(vvdw,cutoff_mask);
355 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
357 fscal = fvdw;
359 fscal = _mm_and_pd(fscal,cutoff_mask);
361 /* Calculate temporary vectorial force */
362 tx = _mm_mul_pd(fscal,dx00);
363 ty = _mm_mul_pd(fscal,dy00);
364 tz = _mm_mul_pd(fscal,dz00);
366 /* Update vectorial force */
367 fix0 = _mm_add_pd(fix0,tx);
368 fiy0 = _mm_add_pd(fiy0,ty);
369 fiz0 = _mm_add_pd(fiz0,tz);
371 fjx0 = _mm_add_pd(fjx0,tx);
372 fjy0 = _mm_add_pd(fjy0,ty);
373 fjz0 = _mm_add_pd(fjz0,tz);
377 /**************************
378 * CALCULATE INTERACTIONS *
379 **************************/
381 if (gmx_mm_any_lt(rsq11,rcutoff2))
384 r11 = _mm_mul_pd(rsq11,rinv11);
386 /* EWALD ELECTROSTATICS */
388 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
389 ewrt = _mm_mul_pd(r11,ewtabscale);
390 ewitab = _mm_cvttpd_epi32(ewrt);
391 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
392 ewitab = _mm_slli_epi32(ewitab,2);
393 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
394 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
395 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
396 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
397 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
398 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
399 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
400 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
401 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
402 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
404 d = _mm_sub_pd(r11,rswitch);
405 d = _mm_max_pd(d,_mm_setzero_pd());
406 d2 = _mm_mul_pd(d,d);
407 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
409 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
411 /* Evaluate switch function */
412 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
413 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
414 velec = _mm_mul_pd(velec,sw);
415 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
417 /* Update potential sum for this i atom from the interaction with this j atom. */
418 velec = _mm_and_pd(velec,cutoff_mask);
419 velecsum = _mm_add_pd(velecsum,velec);
421 fscal = felec;
423 fscal = _mm_and_pd(fscal,cutoff_mask);
425 /* Calculate temporary vectorial force */
426 tx = _mm_mul_pd(fscal,dx11);
427 ty = _mm_mul_pd(fscal,dy11);
428 tz = _mm_mul_pd(fscal,dz11);
430 /* Update vectorial force */
431 fix1 = _mm_add_pd(fix1,tx);
432 fiy1 = _mm_add_pd(fiy1,ty);
433 fiz1 = _mm_add_pd(fiz1,tz);
435 fjx1 = _mm_add_pd(fjx1,tx);
436 fjy1 = _mm_add_pd(fjy1,ty);
437 fjz1 = _mm_add_pd(fjz1,tz);
441 /**************************
442 * CALCULATE INTERACTIONS *
443 **************************/
445 if (gmx_mm_any_lt(rsq12,rcutoff2))
448 r12 = _mm_mul_pd(rsq12,rinv12);
450 /* EWALD ELECTROSTATICS */
452 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
453 ewrt = _mm_mul_pd(r12,ewtabscale);
454 ewitab = _mm_cvttpd_epi32(ewrt);
455 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
456 ewitab = _mm_slli_epi32(ewitab,2);
457 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
458 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
459 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
460 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
461 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
462 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
463 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
464 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
465 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
466 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
468 d = _mm_sub_pd(r12,rswitch);
469 d = _mm_max_pd(d,_mm_setzero_pd());
470 d2 = _mm_mul_pd(d,d);
471 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
473 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
475 /* Evaluate switch function */
476 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
477 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
478 velec = _mm_mul_pd(velec,sw);
479 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
481 /* Update potential sum for this i atom from the interaction with this j atom. */
482 velec = _mm_and_pd(velec,cutoff_mask);
483 velecsum = _mm_add_pd(velecsum,velec);
485 fscal = felec;
487 fscal = _mm_and_pd(fscal,cutoff_mask);
489 /* Calculate temporary vectorial force */
490 tx = _mm_mul_pd(fscal,dx12);
491 ty = _mm_mul_pd(fscal,dy12);
492 tz = _mm_mul_pd(fscal,dz12);
494 /* Update vectorial force */
495 fix1 = _mm_add_pd(fix1,tx);
496 fiy1 = _mm_add_pd(fiy1,ty);
497 fiz1 = _mm_add_pd(fiz1,tz);
499 fjx2 = _mm_add_pd(fjx2,tx);
500 fjy2 = _mm_add_pd(fjy2,ty);
501 fjz2 = _mm_add_pd(fjz2,tz);
505 /**************************
506 * CALCULATE INTERACTIONS *
507 **************************/
509 if (gmx_mm_any_lt(rsq13,rcutoff2))
512 r13 = _mm_mul_pd(rsq13,rinv13);
514 /* EWALD ELECTROSTATICS */
516 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
517 ewrt = _mm_mul_pd(r13,ewtabscale);
518 ewitab = _mm_cvttpd_epi32(ewrt);
519 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
520 ewitab = _mm_slli_epi32(ewitab,2);
521 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
522 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
523 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
524 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
525 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
526 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
527 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
528 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
529 velec = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
530 felec = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
532 d = _mm_sub_pd(r13,rswitch);
533 d = _mm_max_pd(d,_mm_setzero_pd());
534 d2 = _mm_mul_pd(d,d);
535 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
537 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
539 /* Evaluate switch function */
540 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
541 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv13,_mm_mul_pd(velec,dsw)) );
542 velec = _mm_mul_pd(velec,sw);
543 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
545 /* Update potential sum for this i atom from the interaction with this j atom. */
546 velec = _mm_and_pd(velec,cutoff_mask);
547 velecsum = _mm_add_pd(velecsum,velec);
549 fscal = felec;
551 fscal = _mm_and_pd(fscal,cutoff_mask);
553 /* Calculate temporary vectorial force */
554 tx = _mm_mul_pd(fscal,dx13);
555 ty = _mm_mul_pd(fscal,dy13);
556 tz = _mm_mul_pd(fscal,dz13);
558 /* Update vectorial force */
559 fix1 = _mm_add_pd(fix1,tx);
560 fiy1 = _mm_add_pd(fiy1,ty);
561 fiz1 = _mm_add_pd(fiz1,tz);
563 fjx3 = _mm_add_pd(fjx3,tx);
564 fjy3 = _mm_add_pd(fjy3,ty);
565 fjz3 = _mm_add_pd(fjz3,tz);
569 /**************************
570 * CALCULATE INTERACTIONS *
571 **************************/
573 if (gmx_mm_any_lt(rsq21,rcutoff2))
576 r21 = _mm_mul_pd(rsq21,rinv21);
578 /* EWALD ELECTROSTATICS */
580 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
581 ewrt = _mm_mul_pd(r21,ewtabscale);
582 ewitab = _mm_cvttpd_epi32(ewrt);
583 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
584 ewitab = _mm_slli_epi32(ewitab,2);
585 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
586 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
587 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
588 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
589 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
590 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
591 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
592 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
593 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
594 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
596 d = _mm_sub_pd(r21,rswitch);
597 d = _mm_max_pd(d,_mm_setzero_pd());
598 d2 = _mm_mul_pd(d,d);
599 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
601 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
603 /* Evaluate switch function */
604 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
605 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
606 velec = _mm_mul_pd(velec,sw);
607 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
609 /* Update potential sum for this i atom from the interaction with this j atom. */
610 velec = _mm_and_pd(velec,cutoff_mask);
611 velecsum = _mm_add_pd(velecsum,velec);
613 fscal = felec;
615 fscal = _mm_and_pd(fscal,cutoff_mask);
617 /* Calculate temporary vectorial force */
618 tx = _mm_mul_pd(fscal,dx21);
619 ty = _mm_mul_pd(fscal,dy21);
620 tz = _mm_mul_pd(fscal,dz21);
622 /* Update vectorial force */
623 fix2 = _mm_add_pd(fix2,tx);
624 fiy2 = _mm_add_pd(fiy2,ty);
625 fiz2 = _mm_add_pd(fiz2,tz);
627 fjx1 = _mm_add_pd(fjx1,tx);
628 fjy1 = _mm_add_pd(fjy1,ty);
629 fjz1 = _mm_add_pd(fjz1,tz);
633 /**************************
634 * CALCULATE INTERACTIONS *
635 **************************/
637 if (gmx_mm_any_lt(rsq22,rcutoff2))
640 r22 = _mm_mul_pd(rsq22,rinv22);
642 /* EWALD ELECTROSTATICS */
644 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
645 ewrt = _mm_mul_pd(r22,ewtabscale);
646 ewitab = _mm_cvttpd_epi32(ewrt);
647 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
648 ewitab = _mm_slli_epi32(ewitab,2);
649 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
650 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
651 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
652 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
653 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
654 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
655 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
656 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
657 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
658 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
660 d = _mm_sub_pd(r22,rswitch);
661 d = _mm_max_pd(d,_mm_setzero_pd());
662 d2 = _mm_mul_pd(d,d);
663 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
665 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
667 /* Evaluate switch function */
668 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
669 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
670 velec = _mm_mul_pd(velec,sw);
671 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
673 /* Update potential sum for this i atom from the interaction with this j atom. */
674 velec = _mm_and_pd(velec,cutoff_mask);
675 velecsum = _mm_add_pd(velecsum,velec);
677 fscal = felec;
679 fscal = _mm_and_pd(fscal,cutoff_mask);
681 /* Calculate temporary vectorial force */
682 tx = _mm_mul_pd(fscal,dx22);
683 ty = _mm_mul_pd(fscal,dy22);
684 tz = _mm_mul_pd(fscal,dz22);
686 /* Update vectorial force */
687 fix2 = _mm_add_pd(fix2,tx);
688 fiy2 = _mm_add_pd(fiy2,ty);
689 fiz2 = _mm_add_pd(fiz2,tz);
691 fjx2 = _mm_add_pd(fjx2,tx);
692 fjy2 = _mm_add_pd(fjy2,ty);
693 fjz2 = _mm_add_pd(fjz2,tz);
697 /**************************
698 * CALCULATE INTERACTIONS *
699 **************************/
701 if (gmx_mm_any_lt(rsq23,rcutoff2))
704 r23 = _mm_mul_pd(rsq23,rinv23);
706 /* EWALD ELECTROSTATICS */
708 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
709 ewrt = _mm_mul_pd(r23,ewtabscale);
710 ewitab = _mm_cvttpd_epi32(ewrt);
711 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
712 ewitab = _mm_slli_epi32(ewitab,2);
713 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
714 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
715 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
716 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
717 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
718 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
719 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
720 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
721 velec = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
722 felec = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
724 d = _mm_sub_pd(r23,rswitch);
725 d = _mm_max_pd(d,_mm_setzero_pd());
726 d2 = _mm_mul_pd(d,d);
727 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
729 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
731 /* Evaluate switch function */
732 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
733 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv23,_mm_mul_pd(velec,dsw)) );
734 velec = _mm_mul_pd(velec,sw);
735 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
737 /* Update potential sum for this i atom from the interaction with this j atom. */
738 velec = _mm_and_pd(velec,cutoff_mask);
739 velecsum = _mm_add_pd(velecsum,velec);
741 fscal = felec;
743 fscal = _mm_and_pd(fscal,cutoff_mask);
745 /* Calculate temporary vectorial force */
746 tx = _mm_mul_pd(fscal,dx23);
747 ty = _mm_mul_pd(fscal,dy23);
748 tz = _mm_mul_pd(fscal,dz23);
750 /* Update vectorial force */
751 fix2 = _mm_add_pd(fix2,tx);
752 fiy2 = _mm_add_pd(fiy2,ty);
753 fiz2 = _mm_add_pd(fiz2,tz);
755 fjx3 = _mm_add_pd(fjx3,tx);
756 fjy3 = _mm_add_pd(fjy3,ty);
757 fjz3 = _mm_add_pd(fjz3,tz);
761 /**************************
762 * CALCULATE INTERACTIONS *
763 **************************/
765 if (gmx_mm_any_lt(rsq31,rcutoff2))
768 r31 = _mm_mul_pd(rsq31,rinv31);
770 /* EWALD ELECTROSTATICS */
772 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
773 ewrt = _mm_mul_pd(r31,ewtabscale);
774 ewitab = _mm_cvttpd_epi32(ewrt);
775 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
776 ewitab = _mm_slli_epi32(ewitab,2);
777 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
778 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
779 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
780 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
781 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
782 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
783 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
784 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
785 velec = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
786 felec = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
788 d = _mm_sub_pd(r31,rswitch);
789 d = _mm_max_pd(d,_mm_setzero_pd());
790 d2 = _mm_mul_pd(d,d);
791 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
793 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
795 /* Evaluate switch function */
796 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
797 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv31,_mm_mul_pd(velec,dsw)) );
798 velec = _mm_mul_pd(velec,sw);
799 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
801 /* Update potential sum for this i atom from the interaction with this j atom. */
802 velec = _mm_and_pd(velec,cutoff_mask);
803 velecsum = _mm_add_pd(velecsum,velec);
805 fscal = felec;
807 fscal = _mm_and_pd(fscal,cutoff_mask);
809 /* Calculate temporary vectorial force */
810 tx = _mm_mul_pd(fscal,dx31);
811 ty = _mm_mul_pd(fscal,dy31);
812 tz = _mm_mul_pd(fscal,dz31);
814 /* Update vectorial force */
815 fix3 = _mm_add_pd(fix3,tx);
816 fiy3 = _mm_add_pd(fiy3,ty);
817 fiz3 = _mm_add_pd(fiz3,tz);
819 fjx1 = _mm_add_pd(fjx1,tx);
820 fjy1 = _mm_add_pd(fjy1,ty);
821 fjz1 = _mm_add_pd(fjz1,tz);
825 /**************************
826 * CALCULATE INTERACTIONS *
827 **************************/
829 if (gmx_mm_any_lt(rsq32,rcutoff2))
832 r32 = _mm_mul_pd(rsq32,rinv32);
834 /* EWALD ELECTROSTATICS */
836 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
837 ewrt = _mm_mul_pd(r32,ewtabscale);
838 ewitab = _mm_cvttpd_epi32(ewrt);
839 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
840 ewitab = _mm_slli_epi32(ewitab,2);
841 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
842 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
843 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
844 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
845 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
846 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
847 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
848 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
849 velec = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
850 felec = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
852 d = _mm_sub_pd(r32,rswitch);
853 d = _mm_max_pd(d,_mm_setzero_pd());
854 d2 = _mm_mul_pd(d,d);
855 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
857 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
859 /* Evaluate switch function */
860 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
861 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv32,_mm_mul_pd(velec,dsw)) );
862 velec = _mm_mul_pd(velec,sw);
863 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
865 /* Update potential sum for this i atom from the interaction with this j atom. */
866 velec = _mm_and_pd(velec,cutoff_mask);
867 velecsum = _mm_add_pd(velecsum,velec);
869 fscal = felec;
871 fscal = _mm_and_pd(fscal,cutoff_mask);
873 /* Calculate temporary vectorial force */
874 tx = _mm_mul_pd(fscal,dx32);
875 ty = _mm_mul_pd(fscal,dy32);
876 tz = _mm_mul_pd(fscal,dz32);
878 /* Update vectorial force */
879 fix3 = _mm_add_pd(fix3,tx);
880 fiy3 = _mm_add_pd(fiy3,ty);
881 fiz3 = _mm_add_pd(fiz3,tz);
883 fjx2 = _mm_add_pd(fjx2,tx);
884 fjy2 = _mm_add_pd(fjy2,ty);
885 fjz2 = _mm_add_pd(fjz2,tz);
889 /**************************
890 * CALCULATE INTERACTIONS *
891 **************************/
893 if (gmx_mm_any_lt(rsq33,rcutoff2))
896 r33 = _mm_mul_pd(rsq33,rinv33);
898 /* EWALD ELECTROSTATICS */
900 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
901 ewrt = _mm_mul_pd(r33,ewtabscale);
902 ewitab = _mm_cvttpd_epi32(ewrt);
903 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
904 ewitab = _mm_slli_epi32(ewitab,2);
905 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
906 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
907 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
908 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
909 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
910 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
911 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
912 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
913 velec = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
914 felec = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
916 d = _mm_sub_pd(r33,rswitch);
917 d = _mm_max_pd(d,_mm_setzero_pd());
918 d2 = _mm_mul_pd(d,d);
919 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
921 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
923 /* Evaluate switch function */
924 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
925 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv33,_mm_mul_pd(velec,dsw)) );
926 velec = _mm_mul_pd(velec,sw);
927 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
929 /* Update potential sum for this i atom from the interaction with this j atom. */
930 velec = _mm_and_pd(velec,cutoff_mask);
931 velecsum = _mm_add_pd(velecsum,velec);
933 fscal = felec;
935 fscal = _mm_and_pd(fscal,cutoff_mask);
937 /* Calculate temporary vectorial force */
938 tx = _mm_mul_pd(fscal,dx33);
939 ty = _mm_mul_pd(fscal,dy33);
940 tz = _mm_mul_pd(fscal,dz33);
942 /* Update vectorial force */
943 fix3 = _mm_add_pd(fix3,tx);
944 fiy3 = _mm_add_pd(fiy3,ty);
945 fiz3 = _mm_add_pd(fiz3,tz);
947 fjx3 = _mm_add_pd(fjx3,tx);
948 fjy3 = _mm_add_pd(fjy3,ty);
949 fjz3 = _mm_add_pd(fjz3,tz);
953 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
955 /* Inner loop uses 647 flops */
958 if(jidx<j_index_end)
961 jnrA = jjnr[jidx];
962 j_coord_offsetA = DIM*jnrA;
964 /* load j atom coordinates */
965 gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
966 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
967 &jy2,&jz2,&jx3,&jy3,&jz3);
969 /* Calculate displacement vector */
970 dx00 = _mm_sub_pd(ix0,jx0);
971 dy00 = _mm_sub_pd(iy0,jy0);
972 dz00 = _mm_sub_pd(iz0,jz0);
973 dx11 = _mm_sub_pd(ix1,jx1);
974 dy11 = _mm_sub_pd(iy1,jy1);
975 dz11 = _mm_sub_pd(iz1,jz1);
976 dx12 = _mm_sub_pd(ix1,jx2);
977 dy12 = _mm_sub_pd(iy1,jy2);
978 dz12 = _mm_sub_pd(iz1,jz2);
979 dx13 = _mm_sub_pd(ix1,jx3);
980 dy13 = _mm_sub_pd(iy1,jy3);
981 dz13 = _mm_sub_pd(iz1,jz3);
982 dx21 = _mm_sub_pd(ix2,jx1);
983 dy21 = _mm_sub_pd(iy2,jy1);
984 dz21 = _mm_sub_pd(iz2,jz1);
985 dx22 = _mm_sub_pd(ix2,jx2);
986 dy22 = _mm_sub_pd(iy2,jy2);
987 dz22 = _mm_sub_pd(iz2,jz2);
988 dx23 = _mm_sub_pd(ix2,jx3);
989 dy23 = _mm_sub_pd(iy2,jy3);
990 dz23 = _mm_sub_pd(iz2,jz3);
991 dx31 = _mm_sub_pd(ix3,jx1);
992 dy31 = _mm_sub_pd(iy3,jy1);
993 dz31 = _mm_sub_pd(iz3,jz1);
994 dx32 = _mm_sub_pd(ix3,jx2);
995 dy32 = _mm_sub_pd(iy3,jy2);
996 dz32 = _mm_sub_pd(iz3,jz2);
997 dx33 = _mm_sub_pd(ix3,jx3);
998 dy33 = _mm_sub_pd(iy3,jy3);
999 dz33 = _mm_sub_pd(iz3,jz3);
1001 /* Calculate squared distance and things based on it */
1002 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1003 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1004 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1005 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1006 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1007 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1008 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1009 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1010 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1011 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1013 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1014 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1015 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1016 rinv13 = gmx_mm_invsqrt_pd(rsq13);
1017 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1018 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1019 rinv23 = gmx_mm_invsqrt_pd(rsq23);
1020 rinv31 = gmx_mm_invsqrt_pd(rsq31);
1021 rinv32 = gmx_mm_invsqrt_pd(rsq32);
1022 rinv33 = gmx_mm_invsqrt_pd(rsq33);
1024 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1025 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1026 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1027 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
1028 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1029 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1030 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
1031 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
1032 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
1033 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
1035 fjx0 = _mm_setzero_pd();
1036 fjy0 = _mm_setzero_pd();
1037 fjz0 = _mm_setzero_pd();
1038 fjx1 = _mm_setzero_pd();
1039 fjy1 = _mm_setzero_pd();
1040 fjz1 = _mm_setzero_pd();
1041 fjx2 = _mm_setzero_pd();
1042 fjy2 = _mm_setzero_pd();
1043 fjz2 = _mm_setzero_pd();
1044 fjx3 = _mm_setzero_pd();
1045 fjy3 = _mm_setzero_pd();
1046 fjz3 = _mm_setzero_pd();
1048 /**************************
1049 * CALCULATE INTERACTIONS *
1050 **************************/
1052 if (gmx_mm_any_lt(rsq00,rcutoff2))
1055 r00 = _mm_mul_pd(rsq00,rinv00);
1057 /* LENNARD-JONES DISPERSION/REPULSION */
1059 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1060 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
1061 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
1062 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
1063 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
1065 d = _mm_sub_pd(r00,rswitch);
1066 d = _mm_max_pd(d,_mm_setzero_pd());
1067 d2 = _mm_mul_pd(d,d);
1068 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1070 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1072 /* Evaluate switch function */
1073 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1074 fvdw = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
1075 vvdw = _mm_mul_pd(vvdw,sw);
1076 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
1078 /* Update potential sum for this i atom from the interaction with this j atom. */
1079 vvdw = _mm_and_pd(vvdw,cutoff_mask);
1080 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
1081 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
1083 fscal = fvdw;
1085 fscal = _mm_and_pd(fscal,cutoff_mask);
1087 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1089 /* Calculate temporary vectorial force */
1090 tx = _mm_mul_pd(fscal,dx00);
1091 ty = _mm_mul_pd(fscal,dy00);
1092 tz = _mm_mul_pd(fscal,dz00);
1094 /* Update vectorial force */
1095 fix0 = _mm_add_pd(fix0,tx);
1096 fiy0 = _mm_add_pd(fiy0,ty);
1097 fiz0 = _mm_add_pd(fiz0,tz);
1099 fjx0 = _mm_add_pd(fjx0,tx);
1100 fjy0 = _mm_add_pd(fjy0,ty);
1101 fjz0 = _mm_add_pd(fjz0,tz);
1105 /**************************
1106 * CALCULATE INTERACTIONS *
1107 **************************/
1109 if (gmx_mm_any_lt(rsq11,rcutoff2))
1112 r11 = _mm_mul_pd(rsq11,rinv11);
1114 /* EWALD ELECTROSTATICS */
1116 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1117 ewrt = _mm_mul_pd(r11,ewtabscale);
1118 ewitab = _mm_cvttpd_epi32(ewrt);
1119 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1120 ewitab = _mm_slli_epi32(ewitab,2);
1121 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1122 ewtabD = _mm_setzero_pd();
1123 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1124 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1125 ewtabFn = _mm_setzero_pd();
1126 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1127 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1128 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1129 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
1130 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1132 d = _mm_sub_pd(r11,rswitch);
1133 d = _mm_max_pd(d,_mm_setzero_pd());
1134 d2 = _mm_mul_pd(d,d);
1135 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1137 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1139 /* Evaluate switch function */
1140 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1141 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
1142 velec = _mm_mul_pd(velec,sw);
1143 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
1145 /* Update potential sum for this i atom from the interaction with this j atom. */
1146 velec = _mm_and_pd(velec,cutoff_mask);
1147 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1148 velecsum = _mm_add_pd(velecsum,velec);
1150 fscal = felec;
1152 fscal = _mm_and_pd(fscal,cutoff_mask);
1154 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1156 /* Calculate temporary vectorial force */
1157 tx = _mm_mul_pd(fscal,dx11);
1158 ty = _mm_mul_pd(fscal,dy11);
1159 tz = _mm_mul_pd(fscal,dz11);
1161 /* Update vectorial force */
1162 fix1 = _mm_add_pd(fix1,tx);
1163 fiy1 = _mm_add_pd(fiy1,ty);
1164 fiz1 = _mm_add_pd(fiz1,tz);
1166 fjx1 = _mm_add_pd(fjx1,tx);
1167 fjy1 = _mm_add_pd(fjy1,ty);
1168 fjz1 = _mm_add_pd(fjz1,tz);
1172 /**************************
1173 * CALCULATE INTERACTIONS *
1174 **************************/
1176 if (gmx_mm_any_lt(rsq12,rcutoff2))
1179 r12 = _mm_mul_pd(rsq12,rinv12);
1181 /* EWALD ELECTROSTATICS */
1183 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1184 ewrt = _mm_mul_pd(r12,ewtabscale);
1185 ewitab = _mm_cvttpd_epi32(ewrt);
1186 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1187 ewitab = _mm_slli_epi32(ewitab,2);
1188 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1189 ewtabD = _mm_setzero_pd();
1190 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1191 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1192 ewtabFn = _mm_setzero_pd();
1193 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1194 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1195 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1196 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
1197 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1199 d = _mm_sub_pd(r12,rswitch);
1200 d = _mm_max_pd(d,_mm_setzero_pd());
1201 d2 = _mm_mul_pd(d,d);
1202 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1204 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1206 /* Evaluate switch function */
1207 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1208 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
1209 velec = _mm_mul_pd(velec,sw);
1210 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
1212 /* Update potential sum for this i atom from the interaction with this j atom. */
1213 velec = _mm_and_pd(velec,cutoff_mask);
1214 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1215 velecsum = _mm_add_pd(velecsum,velec);
1217 fscal = felec;
1219 fscal = _mm_and_pd(fscal,cutoff_mask);
1221 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1223 /* Calculate temporary vectorial force */
1224 tx = _mm_mul_pd(fscal,dx12);
1225 ty = _mm_mul_pd(fscal,dy12);
1226 tz = _mm_mul_pd(fscal,dz12);
1228 /* Update vectorial force */
1229 fix1 = _mm_add_pd(fix1,tx);
1230 fiy1 = _mm_add_pd(fiy1,ty);
1231 fiz1 = _mm_add_pd(fiz1,tz);
1233 fjx2 = _mm_add_pd(fjx2,tx);
1234 fjy2 = _mm_add_pd(fjy2,ty);
1235 fjz2 = _mm_add_pd(fjz2,tz);
1239 /**************************
1240 * CALCULATE INTERACTIONS *
1241 **************************/
1243 if (gmx_mm_any_lt(rsq13,rcutoff2))
1246 r13 = _mm_mul_pd(rsq13,rinv13);
1248 /* EWALD ELECTROSTATICS */
1250 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1251 ewrt = _mm_mul_pd(r13,ewtabscale);
1252 ewitab = _mm_cvttpd_epi32(ewrt);
1253 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1254 ewitab = _mm_slli_epi32(ewitab,2);
1255 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1256 ewtabD = _mm_setzero_pd();
1257 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1258 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1259 ewtabFn = _mm_setzero_pd();
1260 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1261 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1262 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1263 velec = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
1264 felec = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
1266 d = _mm_sub_pd(r13,rswitch);
1267 d = _mm_max_pd(d,_mm_setzero_pd());
1268 d2 = _mm_mul_pd(d,d);
1269 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1271 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1273 /* Evaluate switch function */
1274 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1275 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv13,_mm_mul_pd(velec,dsw)) );
1276 velec = _mm_mul_pd(velec,sw);
1277 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
1279 /* Update potential sum for this i atom from the interaction with this j atom. */
1280 velec = _mm_and_pd(velec,cutoff_mask);
1281 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1282 velecsum = _mm_add_pd(velecsum,velec);
1284 fscal = felec;
1286 fscal = _mm_and_pd(fscal,cutoff_mask);
1288 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1290 /* Calculate temporary vectorial force */
1291 tx = _mm_mul_pd(fscal,dx13);
1292 ty = _mm_mul_pd(fscal,dy13);
1293 tz = _mm_mul_pd(fscal,dz13);
1295 /* Update vectorial force */
1296 fix1 = _mm_add_pd(fix1,tx);
1297 fiy1 = _mm_add_pd(fiy1,ty);
1298 fiz1 = _mm_add_pd(fiz1,tz);
1300 fjx3 = _mm_add_pd(fjx3,tx);
1301 fjy3 = _mm_add_pd(fjy3,ty);
1302 fjz3 = _mm_add_pd(fjz3,tz);
1306 /**************************
1307 * CALCULATE INTERACTIONS *
1308 **************************/
1310 if (gmx_mm_any_lt(rsq21,rcutoff2))
1313 r21 = _mm_mul_pd(rsq21,rinv21);
1315 /* EWALD ELECTROSTATICS */
1317 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1318 ewrt = _mm_mul_pd(r21,ewtabscale);
1319 ewitab = _mm_cvttpd_epi32(ewrt);
1320 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1321 ewitab = _mm_slli_epi32(ewitab,2);
1322 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1323 ewtabD = _mm_setzero_pd();
1324 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1325 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1326 ewtabFn = _mm_setzero_pd();
1327 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1328 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1329 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1330 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
1331 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1333 d = _mm_sub_pd(r21,rswitch);
1334 d = _mm_max_pd(d,_mm_setzero_pd());
1335 d2 = _mm_mul_pd(d,d);
1336 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1338 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1340 /* Evaluate switch function */
1341 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1342 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
1343 velec = _mm_mul_pd(velec,sw);
1344 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
1346 /* Update potential sum for this i atom from the interaction with this j atom. */
1347 velec = _mm_and_pd(velec,cutoff_mask);
1348 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1349 velecsum = _mm_add_pd(velecsum,velec);
1351 fscal = felec;
1353 fscal = _mm_and_pd(fscal,cutoff_mask);
1355 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1357 /* Calculate temporary vectorial force */
1358 tx = _mm_mul_pd(fscal,dx21);
1359 ty = _mm_mul_pd(fscal,dy21);
1360 tz = _mm_mul_pd(fscal,dz21);
1362 /* Update vectorial force */
1363 fix2 = _mm_add_pd(fix2,tx);
1364 fiy2 = _mm_add_pd(fiy2,ty);
1365 fiz2 = _mm_add_pd(fiz2,tz);
1367 fjx1 = _mm_add_pd(fjx1,tx);
1368 fjy1 = _mm_add_pd(fjy1,ty);
1369 fjz1 = _mm_add_pd(fjz1,tz);
1373 /**************************
1374 * CALCULATE INTERACTIONS *
1375 **************************/
1377 if (gmx_mm_any_lt(rsq22,rcutoff2))
1380 r22 = _mm_mul_pd(rsq22,rinv22);
1382 /* EWALD ELECTROSTATICS */
1384 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1385 ewrt = _mm_mul_pd(r22,ewtabscale);
1386 ewitab = _mm_cvttpd_epi32(ewrt);
1387 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1388 ewitab = _mm_slli_epi32(ewitab,2);
1389 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1390 ewtabD = _mm_setzero_pd();
1391 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1392 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1393 ewtabFn = _mm_setzero_pd();
1394 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1395 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1396 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1397 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
1398 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1400 d = _mm_sub_pd(r22,rswitch);
1401 d = _mm_max_pd(d,_mm_setzero_pd());
1402 d2 = _mm_mul_pd(d,d);
1403 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1405 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1407 /* Evaluate switch function */
1408 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1409 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
1410 velec = _mm_mul_pd(velec,sw);
1411 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
1413 /* Update potential sum for this i atom from the interaction with this j atom. */
1414 velec = _mm_and_pd(velec,cutoff_mask);
1415 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1416 velecsum = _mm_add_pd(velecsum,velec);
1418 fscal = felec;
1420 fscal = _mm_and_pd(fscal,cutoff_mask);
1422 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1424 /* Calculate temporary vectorial force */
1425 tx = _mm_mul_pd(fscal,dx22);
1426 ty = _mm_mul_pd(fscal,dy22);
1427 tz = _mm_mul_pd(fscal,dz22);
1429 /* Update vectorial force */
1430 fix2 = _mm_add_pd(fix2,tx);
1431 fiy2 = _mm_add_pd(fiy2,ty);
1432 fiz2 = _mm_add_pd(fiz2,tz);
1434 fjx2 = _mm_add_pd(fjx2,tx);
1435 fjy2 = _mm_add_pd(fjy2,ty);
1436 fjz2 = _mm_add_pd(fjz2,tz);
1440 /**************************
1441 * CALCULATE INTERACTIONS *
1442 **************************/
1444 if (gmx_mm_any_lt(rsq23,rcutoff2))
1447 r23 = _mm_mul_pd(rsq23,rinv23);
1449 /* EWALD ELECTROSTATICS */
1451 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1452 ewrt = _mm_mul_pd(r23,ewtabscale);
1453 ewitab = _mm_cvttpd_epi32(ewrt);
1454 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1455 ewitab = _mm_slli_epi32(ewitab,2);
1456 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1457 ewtabD = _mm_setzero_pd();
1458 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1459 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1460 ewtabFn = _mm_setzero_pd();
1461 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1462 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1463 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1464 velec = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
1465 felec = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
1467 d = _mm_sub_pd(r23,rswitch);
1468 d = _mm_max_pd(d,_mm_setzero_pd());
1469 d2 = _mm_mul_pd(d,d);
1470 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1472 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1474 /* Evaluate switch function */
1475 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1476 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv23,_mm_mul_pd(velec,dsw)) );
1477 velec = _mm_mul_pd(velec,sw);
1478 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
1480 /* Update potential sum for this i atom from the interaction with this j atom. */
1481 velec = _mm_and_pd(velec,cutoff_mask);
1482 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1483 velecsum = _mm_add_pd(velecsum,velec);
1485 fscal = felec;
1487 fscal = _mm_and_pd(fscal,cutoff_mask);
1489 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1491 /* Calculate temporary vectorial force */
1492 tx = _mm_mul_pd(fscal,dx23);
1493 ty = _mm_mul_pd(fscal,dy23);
1494 tz = _mm_mul_pd(fscal,dz23);
1496 /* Update vectorial force */
1497 fix2 = _mm_add_pd(fix2,tx);
1498 fiy2 = _mm_add_pd(fiy2,ty);
1499 fiz2 = _mm_add_pd(fiz2,tz);
1501 fjx3 = _mm_add_pd(fjx3,tx);
1502 fjy3 = _mm_add_pd(fjy3,ty);
1503 fjz3 = _mm_add_pd(fjz3,tz);
1507 /**************************
1508 * CALCULATE INTERACTIONS *
1509 **************************/
1511 if (gmx_mm_any_lt(rsq31,rcutoff2))
1514 r31 = _mm_mul_pd(rsq31,rinv31);
1516 /* EWALD ELECTROSTATICS */
1518 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1519 ewrt = _mm_mul_pd(r31,ewtabscale);
1520 ewitab = _mm_cvttpd_epi32(ewrt);
1521 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1522 ewitab = _mm_slli_epi32(ewitab,2);
1523 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1524 ewtabD = _mm_setzero_pd();
1525 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1526 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1527 ewtabFn = _mm_setzero_pd();
1528 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1529 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1530 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1531 velec = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
1532 felec = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
1534 d = _mm_sub_pd(r31,rswitch);
1535 d = _mm_max_pd(d,_mm_setzero_pd());
1536 d2 = _mm_mul_pd(d,d);
1537 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1539 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1541 /* Evaluate switch function */
1542 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1543 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv31,_mm_mul_pd(velec,dsw)) );
1544 velec = _mm_mul_pd(velec,sw);
1545 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
1547 /* Update potential sum for this i atom from the interaction with this j atom. */
1548 velec = _mm_and_pd(velec,cutoff_mask);
1549 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1550 velecsum = _mm_add_pd(velecsum,velec);
1552 fscal = felec;
1554 fscal = _mm_and_pd(fscal,cutoff_mask);
1556 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1558 /* Calculate temporary vectorial force */
1559 tx = _mm_mul_pd(fscal,dx31);
1560 ty = _mm_mul_pd(fscal,dy31);
1561 tz = _mm_mul_pd(fscal,dz31);
1563 /* Update vectorial force */
1564 fix3 = _mm_add_pd(fix3,tx);
1565 fiy3 = _mm_add_pd(fiy3,ty);
1566 fiz3 = _mm_add_pd(fiz3,tz);
1568 fjx1 = _mm_add_pd(fjx1,tx);
1569 fjy1 = _mm_add_pd(fjy1,ty);
1570 fjz1 = _mm_add_pd(fjz1,tz);
1574 /**************************
1575 * CALCULATE INTERACTIONS *
1576 **************************/
1578 if (gmx_mm_any_lt(rsq32,rcutoff2))
1581 r32 = _mm_mul_pd(rsq32,rinv32);
1583 /* EWALD ELECTROSTATICS */
1585 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1586 ewrt = _mm_mul_pd(r32,ewtabscale);
1587 ewitab = _mm_cvttpd_epi32(ewrt);
1588 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1589 ewitab = _mm_slli_epi32(ewitab,2);
1590 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1591 ewtabD = _mm_setzero_pd();
1592 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1593 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1594 ewtabFn = _mm_setzero_pd();
1595 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1596 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1597 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1598 velec = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
1599 felec = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
1601 d = _mm_sub_pd(r32,rswitch);
1602 d = _mm_max_pd(d,_mm_setzero_pd());
1603 d2 = _mm_mul_pd(d,d);
1604 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1606 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1608 /* Evaluate switch function */
1609 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1610 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv32,_mm_mul_pd(velec,dsw)) );
1611 velec = _mm_mul_pd(velec,sw);
1612 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
1614 /* Update potential sum for this i atom from the interaction with this j atom. */
1615 velec = _mm_and_pd(velec,cutoff_mask);
1616 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1617 velecsum = _mm_add_pd(velecsum,velec);
1619 fscal = felec;
1621 fscal = _mm_and_pd(fscal,cutoff_mask);
1623 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1625 /* Calculate temporary vectorial force */
1626 tx = _mm_mul_pd(fscal,dx32);
1627 ty = _mm_mul_pd(fscal,dy32);
1628 tz = _mm_mul_pd(fscal,dz32);
1630 /* Update vectorial force */
1631 fix3 = _mm_add_pd(fix3,tx);
1632 fiy3 = _mm_add_pd(fiy3,ty);
1633 fiz3 = _mm_add_pd(fiz3,tz);
1635 fjx2 = _mm_add_pd(fjx2,tx);
1636 fjy2 = _mm_add_pd(fjy2,ty);
1637 fjz2 = _mm_add_pd(fjz2,tz);
1641 /**************************
1642 * CALCULATE INTERACTIONS *
1643 **************************/
1645 if (gmx_mm_any_lt(rsq33,rcutoff2))
1648 r33 = _mm_mul_pd(rsq33,rinv33);
1650 /* EWALD ELECTROSTATICS */
1652 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1653 ewrt = _mm_mul_pd(r33,ewtabscale);
1654 ewitab = _mm_cvttpd_epi32(ewrt);
1655 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1656 ewitab = _mm_slli_epi32(ewitab,2);
1657 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1658 ewtabD = _mm_setzero_pd();
1659 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1660 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1661 ewtabFn = _mm_setzero_pd();
1662 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1663 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1664 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1665 velec = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
1666 felec = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
1668 d = _mm_sub_pd(r33,rswitch);
1669 d = _mm_max_pd(d,_mm_setzero_pd());
1670 d2 = _mm_mul_pd(d,d);
1671 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1673 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1675 /* Evaluate switch function */
1676 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1677 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv33,_mm_mul_pd(velec,dsw)) );
1678 velec = _mm_mul_pd(velec,sw);
1679 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
1681 /* Update potential sum for this i atom from the interaction with this j atom. */
1682 velec = _mm_and_pd(velec,cutoff_mask);
1683 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1684 velecsum = _mm_add_pd(velecsum,velec);
1686 fscal = felec;
1688 fscal = _mm_and_pd(fscal,cutoff_mask);
1690 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1692 /* Calculate temporary vectorial force */
1693 tx = _mm_mul_pd(fscal,dx33);
1694 ty = _mm_mul_pd(fscal,dy33);
1695 tz = _mm_mul_pd(fscal,dz33);
1697 /* Update vectorial force */
1698 fix3 = _mm_add_pd(fix3,tx);
1699 fiy3 = _mm_add_pd(fiy3,ty);
1700 fiz3 = _mm_add_pd(fiz3,tz);
1702 fjx3 = _mm_add_pd(fjx3,tx);
1703 fjy3 = _mm_add_pd(fjy3,ty);
1704 fjz3 = _mm_add_pd(fjz3,tz);
1708 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1710 /* Inner loop uses 647 flops */
1713 /* End of innermost loop */
1715 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1716 f+i_coord_offset,fshift+i_shift_offset);
1718 ggid = gid[iidx];
1719 /* Update potential energies */
1720 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1721 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1723 /* Increment number of inner iterations */
1724 inneriter += j_index_end - j_index_start;
1726 /* Outer loop uses 26 flops */
1729 /* Increment number of outer iterations */
1730 outeriter += nri;
1732 /* Update outer/inner flops */
1734 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*647);
1737 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse2_double
1738 * Electrostatics interaction: Ewald
1739 * VdW interaction: LennardJones
1740 * Geometry: Water4-Water4
1741 * Calculate force/pot: Force
1743 void
1744 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse2_double
1745 (t_nblist * gmx_restrict nlist,
1746 rvec * gmx_restrict xx,
1747 rvec * gmx_restrict ff,
1748 t_forcerec * gmx_restrict fr,
1749 t_mdatoms * gmx_restrict mdatoms,
1750 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1751 t_nrnb * gmx_restrict nrnb)
1753 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1754 * just 0 for non-waters.
1755 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1756 * jnr indices corresponding to data put in the four positions in the SIMD register.
1758 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1759 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1760 int jnrA,jnrB;
1761 int j_coord_offsetA,j_coord_offsetB;
1762 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1763 real rcutoff_scalar;
1764 real *shiftvec,*fshift,*x,*f;
1765 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1766 int vdwioffset0;
1767 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1768 int vdwioffset1;
1769 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1770 int vdwioffset2;
1771 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1772 int vdwioffset3;
1773 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1774 int vdwjidx0A,vdwjidx0B;
1775 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1776 int vdwjidx1A,vdwjidx1B;
1777 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1778 int vdwjidx2A,vdwjidx2B;
1779 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1780 int vdwjidx3A,vdwjidx3B;
1781 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1782 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1783 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1784 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1785 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1786 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1787 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1788 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1789 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1790 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1791 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1792 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1793 real *charge;
1794 int nvdwtype;
1795 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1796 int *vdwtype;
1797 real *vdwparam;
1798 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1799 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1800 __m128i ewitab;
1801 __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1802 real *ewtab;
1803 __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1804 real rswitch_scalar,d_scalar;
1805 __m128d dummy_mask,cutoff_mask;
1806 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1807 __m128d one = _mm_set1_pd(1.0);
1808 __m128d two = _mm_set1_pd(2.0);
1809 x = xx[0];
1810 f = ff[0];
1812 nri = nlist->nri;
1813 iinr = nlist->iinr;
1814 jindex = nlist->jindex;
1815 jjnr = nlist->jjnr;
1816 shiftidx = nlist->shift;
1817 gid = nlist->gid;
1818 shiftvec = fr->shift_vec[0];
1819 fshift = fr->fshift[0];
1820 facel = _mm_set1_pd(fr->epsfac);
1821 charge = mdatoms->chargeA;
1822 nvdwtype = fr->ntype;
1823 vdwparam = fr->nbfp;
1824 vdwtype = mdatoms->typeA;
1826 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
1827 ewtab = fr->ic->tabq_coul_FDV0;
1828 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
1829 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
1831 /* Setup water-specific parameters */
1832 inr = nlist->iinr[0];
1833 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1834 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1835 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
1836 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1838 jq1 = _mm_set1_pd(charge[inr+1]);
1839 jq2 = _mm_set1_pd(charge[inr+2]);
1840 jq3 = _mm_set1_pd(charge[inr+3]);
1841 vdwjidx0A = 2*vdwtype[inr+0];
1842 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1843 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1844 qq11 = _mm_mul_pd(iq1,jq1);
1845 qq12 = _mm_mul_pd(iq1,jq2);
1846 qq13 = _mm_mul_pd(iq1,jq3);
1847 qq21 = _mm_mul_pd(iq2,jq1);
1848 qq22 = _mm_mul_pd(iq2,jq2);
1849 qq23 = _mm_mul_pd(iq2,jq3);
1850 qq31 = _mm_mul_pd(iq3,jq1);
1851 qq32 = _mm_mul_pd(iq3,jq2);
1852 qq33 = _mm_mul_pd(iq3,jq3);
1854 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1855 rcutoff_scalar = fr->rcoulomb;
1856 rcutoff = _mm_set1_pd(rcutoff_scalar);
1857 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
1859 rswitch_scalar = fr->rcoulomb_switch;
1860 rswitch = _mm_set1_pd(rswitch_scalar);
1861 /* Setup switch parameters */
1862 d_scalar = rcutoff_scalar-rswitch_scalar;
1863 d = _mm_set1_pd(d_scalar);
1864 swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
1865 swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1866 swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1867 swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
1868 swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1869 swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1871 /* Avoid stupid compiler warnings */
1872 jnrA = jnrB = 0;
1873 j_coord_offsetA = 0;
1874 j_coord_offsetB = 0;
1876 outeriter = 0;
1877 inneriter = 0;
1879 /* Start outer loop over neighborlists */
1880 for(iidx=0; iidx<nri; iidx++)
1882 /* Load shift vector for this list */
1883 i_shift_offset = DIM*shiftidx[iidx];
1885 /* Load limits for loop over neighbors */
1886 j_index_start = jindex[iidx];
1887 j_index_end = jindex[iidx+1];
1889 /* Get outer coordinate index */
1890 inr = iinr[iidx];
1891 i_coord_offset = DIM*inr;
1893 /* Load i particle coords and add shift vector */
1894 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1895 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1897 fix0 = _mm_setzero_pd();
1898 fiy0 = _mm_setzero_pd();
1899 fiz0 = _mm_setzero_pd();
1900 fix1 = _mm_setzero_pd();
1901 fiy1 = _mm_setzero_pd();
1902 fiz1 = _mm_setzero_pd();
1903 fix2 = _mm_setzero_pd();
1904 fiy2 = _mm_setzero_pd();
1905 fiz2 = _mm_setzero_pd();
1906 fix3 = _mm_setzero_pd();
1907 fiy3 = _mm_setzero_pd();
1908 fiz3 = _mm_setzero_pd();
1910 /* Start inner kernel loop */
1911 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1914 /* Get j neighbor index, and coordinate index */
1915 jnrA = jjnr[jidx];
1916 jnrB = jjnr[jidx+1];
1917 j_coord_offsetA = DIM*jnrA;
1918 j_coord_offsetB = DIM*jnrB;
1920 /* load j atom coordinates */
1921 gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1922 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1923 &jy2,&jz2,&jx3,&jy3,&jz3);
1925 /* Calculate displacement vector */
1926 dx00 = _mm_sub_pd(ix0,jx0);
1927 dy00 = _mm_sub_pd(iy0,jy0);
1928 dz00 = _mm_sub_pd(iz0,jz0);
1929 dx11 = _mm_sub_pd(ix1,jx1);
1930 dy11 = _mm_sub_pd(iy1,jy1);
1931 dz11 = _mm_sub_pd(iz1,jz1);
1932 dx12 = _mm_sub_pd(ix1,jx2);
1933 dy12 = _mm_sub_pd(iy1,jy2);
1934 dz12 = _mm_sub_pd(iz1,jz2);
1935 dx13 = _mm_sub_pd(ix1,jx3);
1936 dy13 = _mm_sub_pd(iy1,jy3);
1937 dz13 = _mm_sub_pd(iz1,jz3);
1938 dx21 = _mm_sub_pd(ix2,jx1);
1939 dy21 = _mm_sub_pd(iy2,jy1);
1940 dz21 = _mm_sub_pd(iz2,jz1);
1941 dx22 = _mm_sub_pd(ix2,jx2);
1942 dy22 = _mm_sub_pd(iy2,jy2);
1943 dz22 = _mm_sub_pd(iz2,jz2);
1944 dx23 = _mm_sub_pd(ix2,jx3);
1945 dy23 = _mm_sub_pd(iy2,jy3);
1946 dz23 = _mm_sub_pd(iz2,jz3);
1947 dx31 = _mm_sub_pd(ix3,jx1);
1948 dy31 = _mm_sub_pd(iy3,jy1);
1949 dz31 = _mm_sub_pd(iz3,jz1);
1950 dx32 = _mm_sub_pd(ix3,jx2);
1951 dy32 = _mm_sub_pd(iy3,jy2);
1952 dz32 = _mm_sub_pd(iz3,jz2);
1953 dx33 = _mm_sub_pd(ix3,jx3);
1954 dy33 = _mm_sub_pd(iy3,jy3);
1955 dz33 = _mm_sub_pd(iz3,jz3);
1957 /* Calculate squared distance and things based on it */
1958 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1959 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1960 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1961 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1962 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1963 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1964 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1965 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1966 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1967 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1969 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1970 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1971 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1972 rinv13 = gmx_mm_invsqrt_pd(rsq13);
1973 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1974 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1975 rinv23 = gmx_mm_invsqrt_pd(rsq23);
1976 rinv31 = gmx_mm_invsqrt_pd(rsq31);
1977 rinv32 = gmx_mm_invsqrt_pd(rsq32);
1978 rinv33 = gmx_mm_invsqrt_pd(rsq33);
1980 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1981 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1982 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1983 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
1984 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1985 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1986 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
1987 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
1988 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
1989 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
1991 fjx0 = _mm_setzero_pd();
1992 fjy0 = _mm_setzero_pd();
1993 fjz0 = _mm_setzero_pd();
1994 fjx1 = _mm_setzero_pd();
1995 fjy1 = _mm_setzero_pd();
1996 fjz1 = _mm_setzero_pd();
1997 fjx2 = _mm_setzero_pd();
1998 fjy2 = _mm_setzero_pd();
1999 fjz2 = _mm_setzero_pd();
2000 fjx3 = _mm_setzero_pd();
2001 fjy3 = _mm_setzero_pd();
2002 fjz3 = _mm_setzero_pd();
2004 /**************************
2005 * CALCULATE INTERACTIONS *
2006 **************************/
2008 if (gmx_mm_any_lt(rsq00,rcutoff2))
2011 r00 = _mm_mul_pd(rsq00,rinv00);
2013 /* LENNARD-JONES DISPERSION/REPULSION */
2015 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2016 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
2017 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
2018 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
2019 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
2021 d = _mm_sub_pd(r00,rswitch);
2022 d = _mm_max_pd(d,_mm_setzero_pd());
2023 d2 = _mm_mul_pd(d,d);
2024 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2026 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2028 /* Evaluate switch function */
2029 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2030 fvdw = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
2031 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
2033 fscal = fvdw;
2035 fscal = _mm_and_pd(fscal,cutoff_mask);
2037 /* Calculate temporary vectorial force */
2038 tx = _mm_mul_pd(fscal,dx00);
2039 ty = _mm_mul_pd(fscal,dy00);
2040 tz = _mm_mul_pd(fscal,dz00);
2042 /* Update vectorial force */
2043 fix0 = _mm_add_pd(fix0,tx);
2044 fiy0 = _mm_add_pd(fiy0,ty);
2045 fiz0 = _mm_add_pd(fiz0,tz);
2047 fjx0 = _mm_add_pd(fjx0,tx);
2048 fjy0 = _mm_add_pd(fjy0,ty);
2049 fjz0 = _mm_add_pd(fjz0,tz);
2053 /**************************
2054 * CALCULATE INTERACTIONS *
2055 **************************/
2057 if (gmx_mm_any_lt(rsq11,rcutoff2))
2060 r11 = _mm_mul_pd(rsq11,rinv11);
2062 /* EWALD ELECTROSTATICS */
2064 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2065 ewrt = _mm_mul_pd(r11,ewtabscale);
2066 ewitab = _mm_cvttpd_epi32(ewrt);
2067 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2068 ewitab = _mm_slli_epi32(ewitab,2);
2069 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2070 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2071 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2072 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2073 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2074 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2075 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2076 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2077 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
2078 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2080 d = _mm_sub_pd(r11,rswitch);
2081 d = _mm_max_pd(d,_mm_setzero_pd());
2082 d2 = _mm_mul_pd(d,d);
2083 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2085 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2087 /* Evaluate switch function */
2088 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2089 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
2090 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
2092 fscal = felec;
2094 fscal = _mm_and_pd(fscal,cutoff_mask);
2096 /* Calculate temporary vectorial force */
2097 tx = _mm_mul_pd(fscal,dx11);
2098 ty = _mm_mul_pd(fscal,dy11);
2099 tz = _mm_mul_pd(fscal,dz11);
2101 /* Update vectorial force */
2102 fix1 = _mm_add_pd(fix1,tx);
2103 fiy1 = _mm_add_pd(fiy1,ty);
2104 fiz1 = _mm_add_pd(fiz1,tz);
2106 fjx1 = _mm_add_pd(fjx1,tx);
2107 fjy1 = _mm_add_pd(fjy1,ty);
2108 fjz1 = _mm_add_pd(fjz1,tz);
2112 /**************************
2113 * CALCULATE INTERACTIONS *
2114 **************************/
2116 if (gmx_mm_any_lt(rsq12,rcutoff2))
2119 r12 = _mm_mul_pd(rsq12,rinv12);
2121 /* EWALD ELECTROSTATICS */
2123 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2124 ewrt = _mm_mul_pd(r12,ewtabscale);
2125 ewitab = _mm_cvttpd_epi32(ewrt);
2126 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2127 ewitab = _mm_slli_epi32(ewitab,2);
2128 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2129 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2130 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2131 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2132 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2133 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2134 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2135 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2136 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
2137 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2139 d = _mm_sub_pd(r12,rswitch);
2140 d = _mm_max_pd(d,_mm_setzero_pd());
2141 d2 = _mm_mul_pd(d,d);
2142 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2144 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2146 /* Evaluate switch function */
2147 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2148 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
2149 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
2151 fscal = felec;
2153 fscal = _mm_and_pd(fscal,cutoff_mask);
2155 /* Calculate temporary vectorial force */
2156 tx = _mm_mul_pd(fscal,dx12);
2157 ty = _mm_mul_pd(fscal,dy12);
2158 tz = _mm_mul_pd(fscal,dz12);
2160 /* Update vectorial force */
2161 fix1 = _mm_add_pd(fix1,tx);
2162 fiy1 = _mm_add_pd(fiy1,ty);
2163 fiz1 = _mm_add_pd(fiz1,tz);
2165 fjx2 = _mm_add_pd(fjx2,tx);
2166 fjy2 = _mm_add_pd(fjy2,ty);
2167 fjz2 = _mm_add_pd(fjz2,tz);
2171 /**************************
2172 * CALCULATE INTERACTIONS *
2173 **************************/
2175 if (gmx_mm_any_lt(rsq13,rcutoff2))
2178 r13 = _mm_mul_pd(rsq13,rinv13);
2180 /* EWALD ELECTROSTATICS */
2182 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2183 ewrt = _mm_mul_pd(r13,ewtabscale);
2184 ewitab = _mm_cvttpd_epi32(ewrt);
2185 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2186 ewitab = _mm_slli_epi32(ewitab,2);
2187 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2188 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2189 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2190 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2191 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2192 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2193 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2194 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2195 velec = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
2196 felec = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
2198 d = _mm_sub_pd(r13,rswitch);
2199 d = _mm_max_pd(d,_mm_setzero_pd());
2200 d2 = _mm_mul_pd(d,d);
2201 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2203 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2205 /* Evaluate switch function */
2206 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2207 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv13,_mm_mul_pd(velec,dsw)) );
2208 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
2210 fscal = felec;
2212 fscal = _mm_and_pd(fscal,cutoff_mask);
2214 /* Calculate temporary vectorial force */
2215 tx = _mm_mul_pd(fscal,dx13);
2216 ty = _mm_mul_pd(fscal,dy13);
2217 tz = _mm_mul_pd(fscal,dz13);
2219 /* Update vectorial force */
2220 fix1 = _mm_add_pd(fix1,tx);
2221 fiy1 = _mm_add_pd(fiy1,ty);
2222 fiz1 = _mm_add_pd(fiz1,tz);
2224 fjx3 = _mm_add_pd(fjx3,tx);
2225 fjy3 = _mm_add_pd(fjy3,ty);
2226 fjz3 = _mm_add_pd(fjz3,tz);
2230 /**************************
2231 * CALCULATE INTERACTIONS *
2232 **************************/
2234 if (gmx_mm_any_lt(rsq21,rcutoff2))
2237 r21 = _mm_mul_pd(rsq21,rinv21);
2239 /* EWALD ELECTROSTATICS */
2241 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2242 ewrt = _mm_mul_pd(r21,ewtabscale);
2243 ewitab = _mm_cvttpd_epi32(ewrt);
2244 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2245 ewitab = _mm_slli_epi32(ewitab,2);
2246 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2247 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2248 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2249 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2250 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2251 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2252 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2253 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2254 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
2255 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2257 d = _mm_sub_pd(r21,rswitch);
2258 d = _mm_max_pd(d,_mm_setzero_pd());
2259 d2 = _mm_mul_pd(d,d);
2260 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2262 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2264 /* Evaluate switch function */
2265 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2266 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
2267 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
2269 fscal = felec;
2271 fscal = _mm_and_pd(fscal,cutoff_mask);
2273 /* Calculate temporary vectorial force */
2274 tx = _mm_mul_pd(fscal,dx21);
2275 ty = _mm_mul_pd(fscal,dy21);
2276 tz = _mm_mul_pd(fscal,dz21);
2278 /* Update vectorial force */
2279 fix2 = _mm_add_pd(fix2,tx);
2280 fiy2 = _mm_add_pd(fiy2,ty);
2281 fiz2 = _mm_add_pd(fiz2,tz);
2283 fjx1 = _mm_add_pd(fjx1,tx);
2284 fjy1 = _mm_add_pd(fjy1,ty);
2285 fjz1 = _mm_add_pd(fjz1,tz);
2289 /**************************
2290 * CALCULATE INTERACTIONS *
2291 **************************/
2293 if (gmx_mm_any_lt(rsq22,rcutoff2))
2296 r22 = _mm_mul_pd(rsq22,rinv22);
2298 /* EWALD ELECTROSTATICS */
2300 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2301 ewrt = _mm_mul_pd(r22,ewtabscale);
2302 ewitab = _mm_cvttpd_epi32(ewrt);
2303 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2304 ewitab = _mm_slli_epi32(ewitab,2);
2305 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2306 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2307 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2308 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2309 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2310 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2311 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2312 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2313 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
2314 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2316 d = _mm_sub_pd(r22,rswitch);
2317 d = _mm_max_pd(d,_mm_setzero_pd());
2318 d2 = _mm_mul_pd(d,d);
2319 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2321 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2323 /* Evaluate switch function */
2324 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2325 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
2326 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
2328 fscal = felec;
2330 fscal = _mm_and_pd(fscal,cutoff_mask);
2332 /* Calculate temporary vectorial force */
2333 tx = _mm_mul_pd(fscal,dx22);
2334 ty = _mm_mul_pd(fscal,dy22);
2335 tz = _mm_mul_pd(fscal,dz22);
2337 /* Update vectorial force */
2338 fix2 = _mm_add_pd(fix2,tx);
2339 fiy2 = _mm_add_pd(fiy2,ty);
2340 fiz2 = _mm_add_pd(fiz2,tz);
2342 fjx2 = _mm_add_pd(fjx2,tx);
2343 fjy2 = _mm_add_pd(fjy2,ty);
2344 fjz2 = _mm_add_pd(fjz2,tz);
2348 /**************************
2349 * CALCULATE INTERACTIONS *
2350 **************************/
2352 if (gmx_mm_any_lt(rsq23,rcutoff2))
2355 r23 = _mm_mul_pd(rsq23,rinv23);
2357 /* EWALD ELECTROSTATICS */
2359 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2360 ewrt = _mm_mul_pd(r23,ewtabscale);
2361 ewitab = _mm_cvttpd_epi32(ewrt);
2362 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2363 ewitab = _mm_slli_epi32(ewitab,2);
2364 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2365 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2366 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2367 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2368 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2369 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2370 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2371 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2372 velec = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
2373 felec = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
2375 d = _mm_sub_pd(r23,rswitch);
2376 d = _mm_max_pd(d,_mm_setzero_pd());
2377 d2 = _mm_mul_pd(d,d);
2378 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2380 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2382 /* Evaluate switch function */
2383 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2384 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv23,_mm_mul_pd(velec,dsw)) );
2385 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
2387 fscal = felec;
2389 fscal = _mm_and_pd(fscal,cutoff_mask);
2391 /* Calculate temporary vectorial force */
2392 tx = _mm_mul_pd(fscal,dx23);
2393 ty = _mm_mul_pd(fscal,dy23);
2394 tz = _mm_mul_pd(fscal,dz23);
2396 /* Update vectorial force */
2397 fix2 = _mm_add_pd(fix2,tx);
2398 fiy2 = _mm_add_pd(fiy2,ty);
2399 fiz2 = _mm_add_pd(fiz2,tz);
2401 fjx3 = _mm_add_pd(fjx3,tx);
2402 fjy3 = _mm_add_pd(fjy3,ty);
2403 fjz3 = _mm_add_pd(fjz3,tz);
2407 /**************************
2408 * CALCULATE INTERACTIONS *
2409 **************************/
2411 if (gmx_mm_any_lt(rsq31,rcutoff2))
2414 r31 = _mm_mul_pd(rsq31,rinv31);
2416 /* EWALD ELECTROSTATICS */
2418 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2419 ewrt = _mm_mul_pd(r31,ewtabscale);
2420 ewitab = _mm_cvttpd_epi32(ewrt);
2421 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2422 ewitab = _mm_slli_epi32(ewitab,2);
2423 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2424 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2425 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2426 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2427 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2428 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2429 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2430 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2431 velec = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
2432 felec = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
2434 d = _mm_sub_pd(r31,rswitch);
2435 d = _mm_max_pd(d,_mm_setzero_pd());
2436 d2 = _mm_mul_pd(d,d);
2437 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2439 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2441 /* Evaluate switch function */
2442 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2443 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv31,_mm_mul_pd(velec,dsw)) );
2444 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
2446 fscal = felec;
2448 fscal = _mm_and_pd(fscal,cutoff_mask);
2450 /* Calculate temporary vectorial force */
2451 tx = _mm_mul_pd(fscal,dx31);
2452 ty = _mm_mul_pd(fscal,dy31);
2453 tz = _mm_mul_pd(fscal,dz31);
2455 /* Update vectorial force */
2456 fix3 = _mm_add_pd(fix3,tx);
2457 fiy3 = _mm_add_pd(fiy3,ty);
2458 fiz3 = _mm_add_pd(fiz3,tz);
2460 fjx1 = _mm_add_pd(fjx1,tx);
2461 fjy1 = _mm_add_pd(fjy1,ty);
2462 fjz1 = _mm_add_pd(fjz1,tz);
2466 /**************************
2467 * CALCULATE INTERACTIONS *
2468 **************************/
2470 if (gmx_mm_any_lt(rsq32,rcutoff2))
2473 r32 = _mm_mul_pd(rsq32,rinv32);
2475 /* EWALD ELECTROSTATICS */
2477 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2478 ewrt = _mm_mul_pd(r32,ewtabscale);
2479 ewitab = _mm_cvttpd_epi32(ewrt);
2480 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2481 ewitab = _mm_slli_epi32(ewitab,2);
2482 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2483 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2484 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2485 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2486 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2487 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2488 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2489 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2490 velec = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
2491 felec = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
2493 d = _mm_sub_pd(r32,rswitch);
2494 d = _mm_max_pd(d,_mm_setzero_pd());
2495 d2 = _mm_mul_pd(d,d);
2496 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2498 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2500 /* Evaluate switch function */
2501 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2502 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv32,_mm_mul_pd(velec,dsw)) );
2503 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
2505 fscal = felec;
2507 fscal = _mm_and_pd(fscal,cutoff_mask);
2509 /* Calculate temporary vectorial force */
2510 tx = _mm_mul_pd(fscal,dx32);
2511 ty = _mm_mul_pd(fscal,dy32);
2512 tz = _mm_mul_pd(fscal,dz32);
2514 /* Update vectorial force */
2515 fix3 = _mm_add_pd(fix3,tx);
2516 fiy3 = _mm_add_pd(fiy3,ty);
2517 fiz3 = _mm_add_pd(fiz3,tz);
2519 fjx2 = _mm_add_pd(fjx2,tx);
2520 fjy2 = _mm_add_pd(fjy2,ty);
2521 fjz2 = _mm_add_pd(fjz2,tz);
2525 /**************************
2526 * CALCULATE INTERACTIONS *
2527 **************************/
2529 if (gmx_mm_any_lt(rsq33,rcutoff2))
2532 r33 = _mm_mul_pd(rsq33,rinv33);
2534 /* EWALD ELECTROSTATICS */
2536 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2537 ewrt = _mm_mul_pd(r33,ewtabscale);
2538 ewitab = _mm_cvttpd_epi32(ewrt);
2539 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2540 ewitab = _mm_slli_epi32(ewitab,2);
2541 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2542 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2543 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2544 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2545 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2546 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2547 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2548 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2549 velec = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
2550 felec = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
2552 d = _mm_sub_pd(r33,rswitch);
2553 d = _mm_max_pd(d,_mm_setzero_pd());
2554 d2 = _mm_mul_pd(d,d);
2555 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2557 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2559 /* Evaluate switch function */
2560 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2561 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv33,_mm_mul_pd(velec,dsw)) );
2562 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
2564 fscal = felec;
2566 fscal = _mm_and_pd(fscal,cutoff_mask);
2568 /* Calculate temporary vectorial force */
2569 tx = _mm_mul_pd(fscal,dx33);
2570 ty = _mm_mul_pd(fscal,dy33);
2571 tz = _mm_mul_pd(fscal,dz33);
2573 /* Update vectorial force */
2574 fix3 = _mm_add_pd(fix3,tx);
2575 fiy3 = _mm_add_pd(fiy3,ty);
2576 fiz3 = _mm_add_pd(fiz3,tz);
2578 fjx3 = _mm_add_pd(fjx3,tx);
2579 fjy3 = _mm_add_pd(fjy3,ty);
2580 fjz3 = _mm_add_pd(fjz3,tz);
2584 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2586 /* Inner loop uses 617 flops */
2589 if(jidx<j_index_end)
2592 jnrA = jjnr[jidx];
2593 j_coord_offsetA = DIM*jnrA;
2595 /* load j atom coordinates */
2596 gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
2597 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2598 &jy2,&jz2,&jx3,&jy3,&jz3);
2600 /* Calculate displacement vector */
2601 dx00 = _mm_sub_pd(ix0,jx0);
2602 dy00 = _mm_sub_pd(iy0,jy0);
2603 dz00 = _mm_sub_pd(iz0,jz0);
2604 dx11 = _mm_sub_pd(ix1,jx1);
2605 dy11 = _mm_sub_pd(iy1,jy1);
2606 dz11 = _mm_sub_pd(iz1,jz1);
2607 dx12 = _mm_sub_pd(ix1,jx2);
2608 dy12 = _mm_sub_pd(iy1,jy2);
2609 dz12 = _mm_sub_pd(iz1,jz2);
2610 dx13 = _mm_sub_pd(ix1,jx3);
2611 dy13 = _mm_sub_pd(iy1,jy3);
2612 dz13 = _mm_sub_pd(iz1,jz3);
2613 dx21 = _mm_sub_pd(ix2,jx1);
2614 dy21 = _mm_sub_pd(iy2,jy1);
2615 dz21 = _mm_sub_pd(iz2,jz1);
2616 dx22 = _mm_sub_pd(ix2,jx2);
2617 dy22 = _mm_sub_pd(iy2,jy2);
2618 dz22 = _mm_sub_pd(iz2,jz2);
2619 dx23 = _mm_sub_pd(ix2,jx3);
2620 dy23 = _mm_sub_pd(iy2,jy3);
2621 dz23 = _mm_sub_pd(iz2,jz3);
2622 dx31 = _mm_sub_pd(ix3,jx1);
2623 dy31 = _mm_sub_pd(iy3,jy1);
2624 dz31 = _mm_sub_pd(iz3,jz1);
2625 dx32 = _mm_sub_pd(ix3,jx2);
2626 dy32 = _mm_sub_pd(iy3,jy2);
2627 dz32 = _mm_sub_pd(iz3,jz2);
2628 dx33 = _mm_sub_pd(ix3,jx3);
2629 dy33 = _mm_sub_pd(iy3,jy3);
2630 dz33 = _mm_sub_pd(iz3,jz3);
2632 /* Calculate squared distance and things based on it */
2633 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
2634 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
2635 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
2636 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
2637 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
2638 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
2639 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
2640 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
2641 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
2642 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
2644 rinv00 = gmx_mm_invsqrt_pd(rsq00);
2645 rinv11 = gmx_mm_invsqrt_pd(rsq11);
2646 rinv12 = gmx_mm_invsqrt_pd(rsq12);
2647 rinv13 = gmx_mm_invsqrt_pd(rsq13);
2648 rinv21 = gmx_mm_invsqrt_pd(rsq21);
2649 rinv22 = gmx_mm_invsqrt_pd(rsq22);
2650 rinv23 = gmx_mm_invsqrt_pd(rsq23);
2651 rinv31 = gmx_mm_invsqrt_pd(rsq31);
2652 rinv32 = gmx_mm_invsqrt_pd(rsq32);
2653 rinv33 = gmx_mm_invsqrt_pd(rsq33);
2655 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
2656 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
2657 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
2658 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
2659 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
2660 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
2661 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
2662 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
2663 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
2664 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
2666 fjx0 = _mm_setzero_pd();
2667 fjy0 = _mm_setzero_pd();
2668 fjz0 = _mm_setzero_pd();
2669 fjx1 = _mm_setzero_pd();
2670 fjy1 = _mm_setzero_pd();
2671 fjz1 = _mm_setzero_pd();
2672 fjx2 = _mm_setzero_pd();
2673 fjy2 = _mm_setzero_pd();
2674 fjz2 = _mm_setzero_pd();
2675 fjx3 = _mm_setzero_pd();
2676 fjy3 = _mm_setzero_pd();
2677 fjz3 = _mm_setzero_pd();
2679 /**************************
2680 * CALCULATE INTERACTIONS *
2681 **************************/
2683 if (gmx_mm_any_lt(rsq00,rcutoff2))
2686 r00 = _mm_mul_pd(rsq00,rinv00);
2688 /* LENNARD-JONES DISPERSION/REPULSION */
2690 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2691 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
2692 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
2693 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
2694 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
2696 d = _mm_sub_pd(r00,rswitch);
2697 d = _mm_max_pd(d,_mm_setzero_pd());
2698 d2 = _mm_mul_pd(d,d);
2699 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2701 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2703 /* Evaluate switch function */
2704 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2705 fvdw = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
2706 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
2708 fscal = fvdw;
2710 fscal = _mm_and_pd(fscal,cutoff_mask);
2712 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2714 /* Calculate temporary vectorial force */
2715 tx = _mm_mul_pd(fscal,dx00);
2716 ty = _mm_mul_pd(fscal,dy00);
2717 tz = _mm_mul_pd(fscal,dz00);
2719 /* Update vectorial force */
2720 fix0 = _mm_add_pd(fix0,tx);
2721 fiy0 = _mm_add_pd(fiy0,ty);
2722 fiz0 = _mm_add_pd(fiz0,tz);
2724 fjx0 = _mm_add_pd(fjx0,tx);
2725 fjy0 = _mm_add_pd(fjy0,ty);
2726 fjz0 = _mm_add_pd(fjz0,tz);
2730 /**************************
2731 * CALCULATE INTERACTIONS *
2732 **************************/
2734 if (gmx_mm_any_lt(rsq11,rcutoff2))
2737 r11 = _mm_mul_pd(rsq11,rinv11);
2739 /* EWALD ELECTROSTATICS */
2741 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2742 ewrt = _mm_mul_pd(r11,ewtabscale);
2743 ewitab = _mm_cvttpd_epi32(ewrt);
2744 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2745 ewitab = _mm_slli_epi32(ewitab,2);
2746 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2747 ewtabD = _mm_setzero_pd();
2748 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2749 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2750 ewtabFn = _mm_setzero_pd();
2751 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2752 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2753 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2754 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
2755 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2757 d = _mm_sub_pd(r11,rswitch);
2758 d = _mm_max_pd(d,_mm_setzero_pd());
2759 d2 = _mm_mul_pd(d,d);
2760 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2762 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2764 /* Evaluate switch function */
2765 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2766 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
2767 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
2769 fscal = felec;
2771 fscal = _mm_and_pd(fscal,cutoff_mask);
2773 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2775 /* Calculate temporary vectorial force */
2776 tx = _mm_mul_pd(fscal,dx11);
2777 ty = _mm_mul_pd(fscal,dy11);
2778 tz = _mm_mul_pd(fscal,dz11);
2780 /* Update vectorial force */
2781 fix1 = _mm_add_pd(fix1,tx);
2782 fiy1 = _mm_add_pd(fiy1,ty);
2783 fiz1 = _mm_add_pd(fiz1,tz);
2785 fjx1 = _mm_add_pd(fjx1,tx);
2786 fjy1 = _mm_add_pd(fjy1,ty);
2787 fjz1 = _mm_add_pd(fjz1,tz);
2791 /**************************
2792 * CALCULATE INTERACTIONS *
2793 **************************/
2795 if (gmx_mm_any_lt(rsq12,rcutoff2))
2798 r12 = _mm_mul_pd(rsq12,rinv12);
2800 /* EWALD ELECTROSTATICS */
2802 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2803 ewrt = _mm_mul_pd(r12,ewtabscale);
2804 ewitab = _mm_cvttpd_epi32(ewrt);
2805 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2806 ewitab = _mm_slli_epi32(ewitab,2);
2807 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2808 ewtabD = _mm_setzero_pd();
2809 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2810 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2811 ewtabFn = _mm_setzero_pd();
2812 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2813 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2814 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2815 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
2816 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2818 d = _mm_sub_pd(r12,rswitch);
2819 d = _mm_max_pd(d,_mm_setzero_pd());
2820 d2 = _mm_mul_pd(d,d);
2821 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2823 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2825 /* Evaluate switch function */
2826 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2827 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
2828 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
2830 fscal = felec;
2832 fscal = _mm_and_pd(fscal,cutoff_mask);
2834 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2836 /* Calculate temporary vectorial force */
2837 tx = _mm_mul_pd(fscal,dx12);
2838 ty = _mm_mul_pd(fscal,dy12);
2839 tz = _mm_mul_pd(fscal,dz12);
2841 /* Update vectorial force */
2842 fix1 = _mm_add_pd(fix1,tx);
2843 fiy1 = _mm_add_pd(fiy1,ty);
2844 fiz1 = _mm_add_pd(fiz1,tz);
2846 fjx2 = _mm_add_pd(fjx2,tx);
2847 fjy2 = _mm_add_pd(fjy2,ty);
2848 fjz2 = _mm_add_pd(fjz2,tz);
2852 /**************************
2853 * CALCULATE INTERACTIONS *
2854 **************************/
2856 if (gmx_mm_any_lt(rsq13,rcutoff2))
2859 r13 = _mm_mul_pd(rsq13,rinv13);
2861 /* EWALD ELECTROSTATICS */
2863 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2864 ewrt = _mm_mul_pd(r13,ewtabscale);
2865 ewitab = _mm_cvttpd_epi32(ewrt);
2866 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2867 ewitab = _mm_slli_epi32(ewitab,2);
2868 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2869 ewtabD = _mm_setzero_pd();
2870 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2871 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2872 ewtabFn = _mm_setzero_pd();
2873 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2874 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2875 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2876 velec = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
2877 felec = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
2879 d = _mm_sub_pd(r13,rswitch);
2880 d = _mm_max_pd(d,_mm_setzero_pd());
2881 d2 = _mm_mul_pd(d,d);
2882 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2884 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2886 /* Evaluate switch function */
2887 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2888 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv13,_mm_mul_pd(velec,dsw)) );
2889 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
2891 fscal = felec;
2893 fscal = _mm_and_pd(fscal,cutoff_mask);
2895 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2897 /* Calculate temporary vectorial force */
2898 tx = _mm_mul_pd(fscal,dx13);
2899 ty = _mm_mul_pd(fscal,dy13);
2900 tz = _mm_mul_pd(fscal,dz13);
2902 /* Update vectorial force */
2903 fix1 = _mm_add_pd(fix1,tx);
2904 fiy1 = _mm_add_pd(fiy1,ty);
2905 fiz1 = _mm_add_pd(fiz1,tz);
2907 fjx3 = _mm_add_pd(fjx3,tx);
2908 fjy3 = _mm_add_pd(fjy3,ty);
2909 fjz3 = _mm_add_pd(fjz3,tz);
2913 /**************************
2914 * CALCULATE INTERACTIONS *
2915 **************************/
2917 if (gmx_mm_any_lt(rsq21,rcutoff2))
2920 r21 = _mm_mul_pd(rsq21,rinv21);
2922 /* EWALD ELECTROSTATICS */
2924 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2925 ewrt = _mm_mul_pd(r21,ewtabscale);
2926 ewitab = _mm_cvttpd_epi32(ewrt);
2927 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2928 ewitab = _mm_slli_epi32(ewitab,2);
2929 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2930 ewtabD = _mm_setzero_pd();
2931 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2932 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2933 ewtabFn = _mm_setzero_pd();
2934 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2935 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2936 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2937 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
2938 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2940 d = _mm_sub_pd(r21,rswitch);
2941 d = _mm_max_pd(d,_mm_setzero_pd());
2942 d2 = _mm_mul_pd(d,d);
2943 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2945 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2947 /* Evaluate switch function */
2948 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2949 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
2950 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
2952 fscal = felec;
2954 fscal = _mm_and_pd(fscal,cutoff_mask);
2956 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2958 /* Calculate temporary vectorial force */
2959 tx = _mm_mul_pd(fscal,dx21);
2960 ty = _mm_mul_pd(fscal,dy21);
2961 tz = _mm_mul_pd(fscal,dz21);
2963 /* Update vectorial force */
2964 fix2 = _mm_add_pd(fix2,tx);
2965 fiy2 = _mm_add_pd(fiy2,ty);
2966 fiz2 = _mm_add_pd(fiz2,tz);
2968 fjx1 = _mm_add_pd(fjx1,tx);
2969 fjy1 = _mm_add_pd(fjy1,ty);
2970 fjz1 = _mm_add_pd(fjz1,tz);
2974 /**************************
2975 * CALCULATE INTERACTIONS *
2976 **************************/
2978 if (gmx_mm_any_lt(rsq22,rcutoff2))
2981 r22 = _mm_mul_pd(rsq22,rinv22);
2983 /* EWALD ELECTROSTATICS */
2985 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2986 ewrt = _mm_mul_pd(r22,ewtabscale);
2987 ewitab = _mm_cvttpd_epi32(ewrt);
2988 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2989 ewitab = _mm_slli_epi32(ewitab,2);
2990 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2991 ewtabD = _mm_setzero_pd();
2992 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2993 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2994 ewtabFn = _mm_setzero_pd();
2995 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2996 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2997 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2998 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
2999 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
3001 d = _mm_sub_pd(r22,rswitch);
3002 d = _mm_max_pd(d,_mm_setzero_pd());
3003 d2 = _mm_mul_pd(d,d);
3004 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
3006 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
3008 /* Evaluate switch function */
3009 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3010 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
3011 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
3013 fscal = felec;
3015 fscal = _mm_and_pd(fscal,cutoff_mask);
3017 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3019 /* Calculate temporary vectorial force */
3020 tx = _mm_mul_pd(fscal,dx22);
3021 ty = _mm_mul_pd(fscal,dy22);
3022 tz = _mm_mul_pd(fscal,dz22);
3024 /* Update vectorial force */
3025 fix2 = _mm_add_pd(fix2,tx);
3026 fiy2 = _mm_add_pd(fiy2,ty);
3027 fiz2 = _mm_add_pd(fiz2,tz);
3029 fjx2 = _mm_add_pd(fjx2,tx);
3030 fjy2 = _mm_add_pd(fjy2,ty);
3031 fjz2 = _mm_add_pd(fjz2,tz);
3035 /**************************
3036 * CALCULATE INTERACTIONS *
3037 **************************/
3039 if (gmx_mm_any_lt(rsq23,rcutoff2))
3042 r23 = _mm_mul_pd(rsq23,rinv23);
3044 /* EWALD ELECTROSTATICS */
3046 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3047 ewrt = _mm_mul_pd(r23,ewtabscale);
3048 ewitab = _mm_cvttpd_epi32(ewrt);
3049 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
3050 ewitab = _mm_slli_epi32(ewitab,2);
3051 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3052 ewtabD = _mm_setzero_pd();
3053 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
3054 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
3055 ewtabFn = _mm_setzero_pd();
3056 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
3057 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
3058 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
3059 velec = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
3060 felec = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
3062 d = _mm_sub_pd(r23,rswitch);
3063 d = _mm_max_pd(d,_mm_setzero_pd());
3064 d2 = _mm_mul_pd(d,d);
3065 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
3067 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
3069 /* Evaluate switch function */
3070 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3071 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv23,_mm_mul_pd(velec,dsw)) );
3072 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
3074 fscal = felec;
3076 fscal = _mm_and_pd(fscal,cutoff_mask);
3078 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3080 /* Calculate temporary vectorial force */
3081 tx = _mm_mul_pd(fscal,dx23);
3082 ty = _mm_mul_pd(fscal,dy23);
3083 tz = _mm_mul_pd(fscal,dz23);
3085 /* Update vectorial force */
3086 fix2 = _mm_add_pd(fix2,tx);
3087 fiy2 = _mm_add_pd(fiy2,ty);
3088 fiz2 = _mm_add_pd(fiz2,tz);
3090 fjx3 = _mm_add_pd(fjx3,tx);
3091 fjy3 = _mm_add_pd(fjy3,ty);
3092 fjz3 = _mm_add_pd(fjz3,tz);
3096 /**************************
3097 * CALCULATE INTERACTIONS *
3098 **************************/
3100 if (gmx_mm_any_lt(rsq31,rcutoff2))
3103 r31 = _mm_mul_pd(rsq31,rinv31);
3105 /* EWALD ELECTROSTATICS */
3107 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3108 ewrt = _mm_mul_pd(r31,ewtabscale);
3109 ewitab = _mm_cvttpd_epi32(ewrt);
3110 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
3111 ewitab = _mm_slli_epi32(ewitab,2);
3112 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3113 ewtabD = _mm_setzero_pd();
3114 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
3115 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
3116 ewtabFn = _mm_setzero_pd();
3117 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
3118 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
3119 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
3120 velec = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
3121 felec = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
3123 d = _mm_sub_pd(r31,rswitch);
3124 d = _mm_max_pd(d,_mm_setzero_pd());
3125 d2 = _mm_mul_pd(d,d);
3126 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
3128 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
3130 /* Evaluate switch function */
3131 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3132 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv31,_mm_mul_pd(velec,dsw)) );
3133 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
3135 fscal = felec;
3137 fscal = _mm_and_pd(fscal,cutoff_mask);
3139 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3141 /* Calculate temporary vectorial force */
3142 tx = _mm_mul_pd(fscal,dx31);
3143 ty = _mm_mul_pd(fscal,dy31);
3144 tz = _mm_mul_pd(fscal,dz31);
3146 /* Update vectorial force */
3147 fix3 = _mm_add_pd(fix3,tx);
3148 fiy3 = _mm_add_pd(fiy3,ty);
3149 fiz3 = _mm_add_pd(fiz3,tz);
3151 fjx1 = _mm_add_pd(fjx1,tx);
3152 fjy1 = _mm_add_pd(fjy1,ty);
3153 fjz1 = _mm_add_pd(fjz1,tz);
3157 /**************************
3158 * CALCULATE INTERACTIONS *
3159 **************************/
3161 if (gmx_mm_any_lt(rsq32,rcutoff2))
3164 r32 = _mm_mul_pd(rsq32,rinv32);
3166 /* EWALD ELECTROSTATICS */
3168 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3169 ewrt = _mm_mul_pd(r32,ewtabscale);
3170 ewitab = _mm_cvttpd_epi32(ewrt);
3171 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
3172 ewitab = _mm_slli_epi32(ewitab,2);
3173 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3174 ewtabD = _mm_setzero_pd();
3175 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
3176 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
3177 ewtabFn = _mm_setzero_pd();
3178 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
3179 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
3180 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
3181 velec = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
3182 felec = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
3184 d = _mm_sub_pd(r32,rswitch);
3185 d = _mm_max_pd(d,_mm_setzero_pd());
3186 d2 = _mm_mul_pd(d,d);
3187 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
3189 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
3191 /* Evaluate switch function */
3192 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3193 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv32,_mm_mul_pd(velec,dsw)) );
3194 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
3196 fscal = felec;
3198 fscal = _mm_and_pd(fscal,cutoff_mask);
3200 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3202 /* Calculate temporary vectorial force */
3203 tx = _mm_mul_pd(fscal,dx32);
3204 ty = _mm_mul_pd(fscal,dy32);
3205 tz = _mm_mul_pd(fscal,dz32);
3207 /* Update vectorial force */
3208 fix3 = _mm_add_pd(fix3,tx);
3209 fiy3 = _mm_add_pd(fiy3,ty);
3210 fiz3 = _mm_add_pd(fiz3,tz);
3212 fjx2 = _mm_add_pd(fjx2,tx);
3213 fjy2 = _mm_add_pd(fjy2,ty);
3214 fjz2 = _mm_add_pd(fjz2,tz);
3218 /**************************
3219 * CALCULATE INTERACTIONS *
3220 **************************/
3222 if (gmx_mm_any_lt(rsq33,rcutoff2))
3225 r33 = _mm_mul_pd(rsq33,rinv33);
3227 /* EWALD ELECTROSTATICS */
3229 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3230 ewrt = _mm_mul_pd(r33,ewtabscale);
3231 ewitab = _mm_cvttpd_epi32(ewrt);
3232 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
3233 ewitab = _mm_slli_epi32(ewitab,2);
3234 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3235 ewtabD = _mm_setzero_pd();
3236 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
3237 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
3238 ewtabFn = _mm_setzero_pd();
3239 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
3240 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
3241 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
3242 velec = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
3243 felec = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
3245 d = _mm_sub_pd(r33,rswitch);
3246 d = _mm_max_pd(d,_mm_setzero_pd());
3247 d2 = _mm_mul_pd(d,d);
3248 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
3250 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
3252 /* Evaluate switch function */
3253 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3254 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv33,_mm_mul_pd(velec,dsw)) );
3255 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
3257 fscal = felec;
3259 fscal = _mm_and_pd(fscal,cutoff_mask);
3261 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3263 /* Calculate temporary vectorial force */
3264 tx = _mm_mul_pd(fscal,dx33);
3265 ty = _mm_mul_pd(fscal,dy33);
3266 tz = _mm_mul_pd(fscal,dz33);
3268 /* Update vectorial force */
3269 fix3 = _mm_add_pd(fix3,tx);
3270 fiy3 = _mm_add_pd(fiy3,ty);
3271 fiz3 = _mm_add_pd(fiz3,tz);
3273 fjx3 = _mm_add_pd(fjx3,tx);
3274 fjy3 = _mm_add_pd(fjy3,ty);
3275 fjz3 = _mm_add_pd(fjz3,tz);
3279 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
3281 /* Inner loop uses 617 flops */
3284 /* End of innermost loop */
3286 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
3287 f+i_coord_offset,fshift+i_shift_offset);
3289 /* Increment number of inner iterations */
3290 inneriter += j_index_end - j_index_start;
3292 /* Outer loop uses 24 flops */
3295 /* Increment number of outer iterations */
3296 outeriter += nri;
3298 /* Update outer/inner flops */
3300 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*617);