Removed simple.h from nb_kernel_sse2_XX
[gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_double / nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_sse2_double.c
blobfb8cf2764380db1c553eb50ba8924ea63af016cb
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_double kernel generator.
38 #include "gmxpre.h"
40 #include "config.h"
42 #include <math.h>
44 #include "../nb_kernel.h"
45 #include "gromacs/math/vec.h"
46 #include "gromacs/legacyheaders/nrnb.h"
48 #include "gromacs/simd/math_x86_sse2_double.h"
49 #include "kernelutil_x86_sse2_double.h"
52 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sse2_double
53 * Electrostatics interaction: ReactionField
54 * VdW interaction: LennardJones
55 * Geometry: Water4-Water4
56 * Calculate force/pot: PotentialAndForce
58 void
59 nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sse2_double
60 (t_nblist * gmx_restrict nlist,
61 rvec * gmx_restrict xx,
62 rvec * gmx_restrict ff,
63 t_forcerec * gmx_restrict fr,
64 t_mdatoms * gmx_restrict mdatoms,
65 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
66 t_nrnb * gmx_restrict nrnb)
68 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
69 * just 0 for non-waters.
70 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
71 * jnr indices corresponding to data put in the four positions in the SIMD register.
73 int i_shift_offset,i_coord_offset,outeriter,inneriter;
74 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
75 int jnrA,jnrB;
76 int j_coord_offsetA,j_coord_offsetB;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
78 real rcutoff_scalar;
79 real *shiftvec,*fshift,*x,*f;
80 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
81 int vdwioffset0;
82 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
83 int vdwioffset1;
84 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
85 int vdwioffset2;
86 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
87 int vdwioffset3;
88 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
89 int vdwjidx0A,vdwjidx0B;
90 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
91 int vdwjidx1A,vdwjidx1B;
92 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
93 int vdwjidx2A,vdwjidx2B;
94 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
95 int vdwjidx3A,vdwjidx3B;
96 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
97 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
98 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
99 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
100 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
101 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
102 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
103 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
104 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
105 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
106 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
107 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
108 real *charge;
109 int nvdwtype;
110 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
111 int *vdwtype;
112 real *vdwparam;
113 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
114 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
115 __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
116 real rswitch_scalar,d_scalar;
117 __m128d dummy_mask,cutoff_mask;
118 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
119 __m128d one = _mm_set1_pd(1.0);
120 __m128d two = _mm_set1_pd(2.0);
121 x = xx[0];
122 f = ff[0];
124 nri = nlist->nri;
125 iinr = nlist->iinr;
126 jindex = nlist->jindex;
127 jjnr = nlist->jjnr;
128 shiftidx = nlist->shift;
129 gid = nlist->gid;
130 shiftvec = fr->shift_vec[0];
131 fshift = fr->fshift[0];
132 facel = _mm_set1_pd(fr->epsfac);
133 charge = mdatoms->chargeA;
134 krf = _mm_set1_pd(fr->ic->k_rf);
135 krf2 = _mm_set1_pd(fr->ic->k_rf*2.0);
136 crf = _mm_set1_pd(fr->ic->c_rf);
137 nvdwtype = fr->ntype;
138 vdwparam = fr->nbfp;
139 vdwtype = mdatoms->typeA;
141 /* Setup water-specific parameters */
142 inr = nlist->iinr[0];
143 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
144 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
145 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
146 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
148 jq1 = _mm_set1_pd(charge[inr+1]);
149 jq2 = _mm_set1_pd(charge[inr+2]);
150 jq3 = _mm_set1_pd(charge[inr+3]);
151 vdwjidx0A = 2*vdwtype[inr+0];
152 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
153 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
154 qq11 = _mm_mul_pd(iq1,jq1);
155 qq12 = _mm_mul_pd(iq1,jq2);
156 qq13 = _mm_mul_pd(iq1,jq3);
157 qq21 = _mm_mul_pd(iq2,jq1);
158 qq22 = _mm_mul_pd(iq2,jq2);
159 qq23 = _mm_mul_pd(iq2,jq3);
160 qq31 = _mm_mul_pd(iq3,jq1);
161 qq32 = _mm_mul_pd(iq3,jq2);
162 qq33 = _mm_mul_pd(iq3,jq3);
164 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
165 rcutoff_scalar = fr->rcoulomb;
166 rcutoff = _mm_set1_pd(rcutoff_scalar);
167 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
169 rswitch_scalar = fr->rvdw_switch;
170 rswitch = _mm_set1_pd(rswitch_scalar);
171 /* Setup switch parameters */
172 d_scalar = rcutoff_scalar-rswitch_scalar;
173 d = _mm_set1_pd(d_scalar);
174 swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
175 swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
176 swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
177 swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
178 swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
179 swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
181 /* Avoid stupid compiler warnings */
182 jnrA = jnrB = 0;
183 j_coord_offsetA = 0;
184 j_coord_offsetB = 0;
186 outeriter = 0;
187 inneriter = 0;
189 /* Start outer loop over neighborlists */
190 for(iidx=0; iidx<nri; iidx++)
192 /* Load shift vector for this list */
193 i_shift_offset = DIM*shiftidx[iidx];
195 /* Load limits for loop over neighbors */
196 j_index_start = jindex[iidx];
197 j_index_end = jindex[iidx+1];
199 /* Get outer coordinate index */
200 inr = iinr[iidx];
201 i_coord_offset = DIM*inr;
203 /* Load i particle coords and add shift vector */
204 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
205 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
207 fix0 = _mm_setzero_pd();
208 fiy0 = _mm_setzero_pd();
209 fiz0 = _mm_setzero_pd();
210 fix1 = _mm_setzero_pd();
211 fiy1 = _mm_setzero_pd();
212 fiz1 = _mm_setzero_pd();
213 fix2 = _mm_setzero_pd();
214 fiy2 = _mm_setzero_pd();
215 fiz2 = _mm_setzero_pd();
216 fix3 = _mm_setzero_pd();
217 fiy3 = _mm_setzero_pd();
218 fiz3 = _mm_setzero_pd();
220 /* Reset potential sums */
221 velecsum = _mm_setzero_pd();
222 vvdwsum = _mm_setzero_pd();
224 /* Start inner kernel loop */
225 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
228 /* Get j neighbor index, and coordinate index */
229 jnrA = jjnr[jidx];
230 jnrB = jjnr[jidx+1];
231 j_coord_offsetA = DIM*jnrA;
232 j_coord_offsetB = DIM*jnrB;
234 /* load j atom coordinates */
235 gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
236 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
237 &jy2,&jz2,&jx3,&jy3,&jz3);
239 /* Calculate displacement vector */
240 dx00 = _mm_sub_pd(ix0,jx0);
241 dy00 = _mm_sub_pd(iy0,jy0);
242 dz00 = _mm_sub_pd(iz0,jz0);
243 dx11 = _mm_sub_pd(ix1,jx1);
244 dy11 = _mm_sub_pd(iy1,jy1);
245 dz11 = _mm_sub_pd(iz1,jz1);
246 dx12 = _mm_sub_pd(ix1,jx2);
247 dy12 = _mm_sub_pd(iy1,jy2);
248 dz12 = _mm_sub_pd(iz1,jz2);
249 dx13 = _mm_sub_pd(ix1,jx3);
250 dy13 = _mm_sub_pd(iy1,jy3);
251 dz13 = _mm_sub_pd(iz1,jz3);
252 dx21 = _mm_sub_pd(ix2,jx1);
253 dy21 = _mm_sub_pd(iy2,jy1);
254 dz21 = _mm_sub_pd(iz2,jz1);
255 dx22 = _mm_sub_pd(ix2,jx2);
256 dy22 = _mm_sub_pd(iy2,jy2);
257 dz22 = _mm_sub_pd(iz2,jz2);
258 dx23 = _mm_sub_pd(ix2,jx3);
259 dy23 = _mm_sub_pd(iy2,jy3);
260 dz23 = _mm_sub_pd(iz2,jz3);
261 dx31 = _mm_sub_pd(ix3,jx1);
262 dy31 = _mm_sub_pd(iy3,jy1);
263 dz31 = _mm_sub_pd(iz3,jz1);
264 dx32 = _mm_sub_pd(ix3,jx2);
265 dy32 = _mm_sub_pd(iy3,jy2);
266 dz32 = _mm_sub_pd(iz3,jz2);
267 dx33 = _mm_sub_pd(ix3,jx3);
268 dy33 = _mm_sub_pd(iy3,jy3);
269 dz33 = _mm_sub_pd(iz3,jz3);
271 /* Calculate squared distance and things based on it */
272 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
273 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
274 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
275 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
276 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
277 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
278 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
279 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
280 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
281 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
283 rinv00 = gmx_mm_invsqrt_pd(rsq00);
284 rinv11 = gmx_mm_invsqrt_pd(rsq11);
285 rinv12 = gmx_mm_invsqrt_pd(rsq12);
286 rinv13 = gmx_mm_invsqrt_pd(rsq13);
287 rinv21 = gmx_mm_invsqrt_pd(rsq21);
288 rinv22 = gmx_mm_invsqrt_pd(rsq22);
289 rinv23 = gmx_mm_invsqrt_pd(rsq23);
290 rinv31 = gmx_mm_invsqrt_pd(rsq31);
291 rinv32 = gmx_mm_invsqrt_pd(rsq32);
292 rinv33 = gmx_mm_invsqrt_pd(rsq33);
294 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
295 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
296 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
297 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
298 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
299 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
300 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
301 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
302 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
303 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
305 fjx0 = _mm_setzero_pd();
306 fjy0 = _mm_setzero_pd();
307 fjz0 = _mm_setzero_pd();
308 fjx1 = _mm_setzero_pd();
309 fjy1 = _mm_setzero_pd();
310 fjz1 = _mm_setzero_pd();
311 fjx2 = _mm_setzero_pd();
312 fjy2 = _mm_setzero_pd();
313 fjz2 = _mm_setzero_pd();
314 fjx3 = _mm_setzero_pd();
315 fjy3 = _mm_setzero_pd();
316 fjz3 = _mm_setzero_pd();
318 /**************************
319 * CALCULATE INTERACTIONS *
320 **************************/
322 if (gmx_mm_any_lt(rsq00,rcutoff2))
325 r00 = _mm_mul_pd(rsq00,rinv00);
327 /* LENNARD-JONES DISPERSION/REPULSION */
329 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
330 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
331 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
332 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
333 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
335 d = _mm_sub_pd(r00,rswitch);
336 d = _mm_max_pd(d,_mm_setzero_pd());
337 d2 = _mm_mul_pd(d,d);
338 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
340 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
342 /* Evaluate switch function */
343 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
344 fvdw = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
345 vvdw = _mm_mul_pd(vvdw,sw);
346 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
348 /* Update potential sum for this i atom from the interaction with this j atom. */
349 vvdw = _mm_and_pd(vvdw,cutoff_mask);
350 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
352 fscal = fvdw;
354 fscal = _mm_and_pd(fscal,cutoff_mask);
356 /* Calculate temporary vectorial force */
357 tx = _mm_mul_pd(fscal,dx00);
358 ty = _mm_mul_pd(fscal,dy00);
359 tz = _mm_mul_pd(fscal,dz00);
361 /* Update vectorial force */
362 fix0 = _mm_add_pd(fix0,tx);
363 fiy0 = _mm_add_pd(fiy0,ty);
364 fiz0 = _mm_add_pd(fiz0,tz);
366 fjx0 = _mm_add_pd(fjx0,tx);
367 fjy0 = _mm_add_pd(fjy0,ty);
368 fjz0 = _mm_add_pd(fjz0,tz);
372 /**************************
373 * CALCULATE INTERACTIONS *
374 **************************/
376 if (gmx_mm_any_lt(rsq11,rcutoff2))
379 /* REACTION-FIELD ELECTROSTATICS */
380 velec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_add_pd(rinv11,_mm_mul_pd(krf,rsq11)),crf));
381 felec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
383 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
385 /* Update potential sum for this i atom from the interaction with this j atom. */
386 velec = _mm_and_pd(velec,cutoff_mask);
387 velecsum = _mm_add_pd(velecsum,velec);
389 fscal = felec;
391 fscal = _mm_and_pd(fscal,cutoff_mask);
393 /* Calculate temporary vectorial force */
394 tx = _mm_mul_pd(fscal,dx11);
395 ty = _mm_mul_pd(fscal,dy11);
396 tz = _mm_mul_pd(fscal,dz11);
398 /* Update vectorial force */
399 fix1 = _mm_add_pd(fix1,tx);
400 fiy1 = _mm_add_pd(fiy1,ty);
401 fiz1 = _mm_add_pd(fiz1,tz);
403 fjx1 = _mm_add_pd(fjx1,tx);
404 fjy1 = _mm_add_pd(fjy1,ty);
405 fjz1 = _mm_add_pd(fjz1,tz);
409 /**************************
410 * CALCULATE INTERACTIONS *
411 **************************/
413 if (gmx_mm_any_lt(rsq12,rcutoff2))
416 /* REACTION-FIELD ELECTROSTATICS */
417 velec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_add_pd(rinv12,_mm_mul_pd(krf,rsq12)),crf));
418 felec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
420 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
422 /* Update potential sum for this i atom from the interaction with this j atom. */
423 velec = _mm_and_pd(velec,cutoff_mask);
424 velecsum = _mm_add_pd(velecsum,velec);
426 fscal = felec;
428 fscal = _mm_and_pd(fscal,cutoff_mask);
430 /* Calculate temporary vectorial force */
431 tx = _mm_mul_pd(fscal,dx12);
432 ty = _mm_mul_pd(fscal,dy12);
433 tz = _mm_mul_pd(fscal,dz12);
435 /* Update vectorial force */
436 fix1 = _mm_add_pd(fix1,tx);
437 fiy1 = _mm_add_pd(fiy1,ty);
438 fiz1 = _mm_add_pd(fiz1,tz);
440 fjx2 = _mm_add_pd(fjx2,tx);
441 fjy2 = _mm_add_pd(fjy2,ty);
442 fjz2 = _mm_add_pd(fjz2,tz);
446 /**************************
447 * CALCULATE INTERACTIONS *
448 **************************/
450 if (gmx_mm_any_lt(rsq13,rcutoff2))
453 /* REACTION-FIELD ELECTROSTATICS */
454 velec = _mm_mul_pd(qq13,_mm_sub_pd(_mm_add_pd(rinv13,_mm_mul_pd(krf,rsq13)),crf));
455 felec = _mm_mul_pd(qq13,_mm_sub_pd(_mm_mul_pd(rinv13,rinvsq13),krf2));
457 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
459 /* Update potential sum for this i atom from the interaction with this j atom. */
460 velec = _mm_and_pd(velec,cutoff_mask);
461 velecsum = _mm_add_pd(velecsum,velec);
463 fscal = felec;
465 fscal = _mm_and_pd(fscal,cutoff_mask);
467 /* Calculate temporary vectorial force */
468 tx = _mm_mul_pd(fscal,dx13);
469 ty = _mm_mul_pd(fscal,dy13);
470 tz = _mm_mul_pd(fscal,dz13);
472 /* Update vectorial force */
473 fix1 = _mm_add_pd(fix1,tx);
474 fiy1 = _mm_add_pd(fiy1,ty);
475 fiz1 = _mm_add_pd(fiz1,tz);
477 fjx3 = _mm_add_pd(fjx3,tx);
478 fjy3 = _mm_add_pd(fjy3,ty);
479 fjz3 = _mm_add_pd(fjz3,tz);
483 /**************************
484 * CALCULATE INTERACTIONS *
485 **************************/
487 if (gmx_mm_any_lt(rsq21,rcutoff2))
490 /* REACTION-FIELD ELECTROSTATICS */
491 velec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_add_pd(rinv21,_mm_mul_pd(krf,rsq21)),crf));
492 felec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
494 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
496 /* Update potential sum for this i atom from the interaction with this j atom. */
497 velec = _mm_and_pd(velec,cutoff_mask);
498 velecsum = _mm_add_pd(velecsum,velec);
500 fscal = felec;
502 fscal = _mm_and_pd(fscal,cutoff_mask);
504 /* Calculate temporary vectorial force */
505 tx = _mm_mul_pd(fscal,dx21);
506 ty = _mm_mul_pd(fscal,dy21);
507 tz = _mm_mul_pd(fscal,dz21);
509 /* Update vectorial force */
510 fix2 = _mm_add_pd(fix2,tx);
511 fiy2 = _mm_add_pd(fiy2,ty);
512 fiz2 = _mm_add_pd(fiz2,tz);
514 fjx1 = _mm_add_pd(fjx1,tx);
515 fjy1 = _mm_add_pd(fjy1,ty);
516 fjz1 = _mm_add_pd(fjz1,tz);
520 /**************************
521 * CALCULATE INTERACTIONS *
522 **************************/
524 if (gmx_mm_any_lt(rsq22,rcutoff2))
527 /* REACTION-FIELD ELECTROSTATICS */
528 velec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_add_pd(rinv22,_mm_mul_pd(krf,rsq22)),crf));
529 felec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
531 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
533 /* Update potential sum for this i atom from the interaction with this j atom. */
534 velec = _mm_and_pd(velec,cutoff_mask);
535 velecsum = _mm_add_pd(velecsum,velec);
537 fscal = felec;
539 fscal = _mm_and_pd(fscal,cutoff_mask);
541 /* Calculate temporary vectorial force */
542 tx = _mm_mul_pd(fscal,dx22);
543 ty = _mm_mul_pd(fscal,dy22);
544 tz = _mm_mul_pd(fscal,dz22);
546 /* Update vectorial force */
547 fix2 = _mm_add_pd(fix2,tx);
548 fiy2 = _mm_add_pd(fiy2,ty);
549 fiz2 = _mm_add_pd(fiz2,tz);
551 fjx2 = _mm_add_pd(fjx2,tx);
552 fjy2 = _mm_add_pd(fjy2,ty);
553 fjz2 = _mm_add_pd(fjz2,tz);
557 /**************************
558 * CALCULATE INTERACTIONS *
559 **************************/
561 if (gmx_mm_any_lt(rsq23,rcutoff2))
564 /* REACTION-FIELD ELECTROSTATICS */
565 velec = _mm_mul_pd(qq23,_mm_sub_pd(_mm_add_pd(rinv23,_mm_mul_pd(krf,rsq23)),crf));
566 felec = _mm_mul_pd(qq23,_mm_sub_pd(_mm_mul_pd(rinv23,rinvsq23),krf2));
568 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
570 /* Update potential sum for this i atom from the interaction with this j atom. */
571 velec = _mm_and_pd(velec,cutoff_mask);
572 velecsum = _mm_add_pd(velecsum,velec);
574 fscal = felec;
576 fscal = _mm_and_pd(fscal,cutoff_mask);
578 /* Calculate temporary vectorial force */
579 tx = _mm_mul_pd(fscal,dx23);
580 ty = _mm_mul_pd(fscal,dy23);
581 tz = _mm_mul_pd(fscal,dz23);
583 /* Update vectorial force */
584 fix2 = _mm_add_pd(fix2,tx);
585 fiy2 = _mm_add_pd(fiy2,ty);
586 fiz2 = _mm_add_pd(fiz2,tz);
588 fjx3 = _mm_add_pd(fjx3,tx);
589 fjy3 = _mm_add_pd(fjy3,ty);
590 fjz3 = _mm_add_pd(fjz3,tz);
594 /**************************
595 * CALCULATE INTERACTIONS *
596 **************************/
598 if (gmx_mm_any_lt(rsq31,rcutoff2))
601 /* REACTION-FIELD ELECTROSTATICS */
602 velec = _mm_mul_pd(qq31,_mm_sub_pd(_mm_add_pd(rinv31,_mm_mul_pd(krf,rsq31)),crf));
603 felec = _mm_mul_pd(qq31,_mm_sub_pd(_mm_mul_pd(rinv31,rinvsq31),krf2));
605 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
607 /* Update potential sum for this i atom from the interaction with this j atom. */
608 velec = _mm_and_pd(velec,cutoff_mask);
609 velecsum = _mm_add_pd(velecsum,velec);
611 fscal = felec;
613 fscal = _mm_and_pd(fscal,cutoff_mask);
615 /* Calculate temporary vectorial force */
616 tx = _mm_mul_pd(fscal,dx31);
617 ty = _mm_mul_pd(fscal,dy31);
618 tz = _mm_mul_pd(fscal,dz31);
620 /* Update vectorial force */
621 fix3 = _mm_add_pd(fix3,tx);
622 fiy3 = _mm_add_pd(fiy3,ty);
623 fiz3 = _mm_add_pd(fiz3,tz);
625 fjx1 = _mm_add_pd(fjx1,tx);
626 fjy1 = _mm_add_pd(fjy1,ty);
627 fjz1 = _mm_add_pd(fjz1,tz);
631 /**************************
632 * CALCULATE INTERACTIONS *
633 **************************/
635 if (gmx_mm_any_lt(rsq32,rcutoff2))
638 /* REACTION-FIELD ELECTROSTATICS */
639 velec = _mm_mul_pd(qq32,_mm_sub_pd(_mm_add_pd(rinv32,_mm_mul_pd(krf,rsq32)),crf));
640 felec = _mm_mul_pd(qq32,_mm_sub_pd(_mm_mul_pd(rinv32,rinvsq32),krf2));
642 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
644 /* Update potential sum for this i atom from the interaction with this j atom. */
645 velec = _mm_and_pd(velec,cutoff_mask);
646 velecsum = _mm_add_pd(velecsum,velec);
648 fscal = felec;
650 fscal = _mm_and_pd(fscal,cutoff_mask);
652 /* Calculate temporary vectorial force */
653 tx = _mm_mul_pd(fscal,dx32);
654 ty = _mm_mul_pd(fscal,dy32);
655 tz = _mm_mul_pd(fscal,dz32);
657 /* Update vectorial force */
658 fix3 = _mm_add_pd(fix3,tx);
659 fiy3 = _mm_add_pd(fiy3,ty);
660 fiz3 = _mm_add_pd(fiz3,tz);
662 fjx2 = _mm_add_pd(fjx2,tx);
663 fjy2 = _mm_add_pd(fjy2,ty);
664 fjz2 = _mm_add_pd(fjz2,tz);
668 /**************************
669 * CALCULATE INTERACTIONS *
670 **************************/
672 if (gmx_mm_any_lt(rsq33,rcutoff2))
675 /* REACTION-FIELD ELECTROSTATICS */
676 velec = _mm_mul_pd(qq33,_mm_sub_pd(_mm_add_pd(rinv33,_mm_mul_pd(krf,rsq33)),crf));
677 felec = _mm_mul_pd(qq33,_mm_sub_pd(_mm_mul_pd(rinv33,rinvsq33),krf2));
679 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
681 /* Update potential sum for this i atom from the interaction with this j atom. */
682 velec = _mm_and_pd(velec,cutoff_mask);
683 velecsum = _mm_add_pd(velecsum,velec);
685 fscal = felec;
687 fscal = _mm_and_pd(fscal,cutoff_mask);
689 /* Calculate temporary vectorial force */
690 tx = _mm_mul_pd(fscal,dx33);
691 ty = _mm_mul_pd(fscal,dy33);
692 tz = _mm_mul_pd(fscal,dz33);
694 /* Update vectorial force */
695 fix3 = _mm_add_pd(fix3,tx);
696 fiy3 = _mm_add_pd(fiy3,ty);
697 fiz3 = _mm_add_pd(fiz3,tz);
699 fjx3 = _mm_add_pd(fjx3,tx);
700 fjy3 = _mm_add_pd(fjy3,ty);
701 fjz3 = _mm_add_pd(fjz3,tz);
705 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
707 /* Inner loop uses 386 flops */
710 if(jidx<j_index_end)
713 jnrA = jjnr[jidx];
714 j_coord_offsetA = DIM*jnrA;
716 /* load j atom coordinates */
717 gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
718 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
719 &jy2,&jz2,&jx3,&jy3,&jz3);
721 /* Calculate displacement vector */
722 dx00 = _mm_sub_pd(ix0,jx0);
723 dy00 = _mm_sub_pd(iy0,jy0);
724 dz00 = _mm_sub_pd(iz0,jz0);
725 dx11 = _mm_sub_pd(ix1,jx1);
726 dy11 = _mm_sub_pd(iy1,jy1);
727 dz11 = _mm_sub_pd(iz1,jz1);
728 dx12 = _mm_sub_pd(ix1,jx2);
729 dy12 = _mm_sub_pd(iy1,jy2);
730 dz12 = _mm_sub_pd(iz1,jz2);
731 dx13 = _mm_sub_pd(ix1,jx3);
732 dy13 = _mm_sub_pd(iy1,jy3);
733 dz13 = _mm_sub_pd(iz1,jz3);
734 dx21 = _mm_sub_pd(ix2,jx1);
735 dy21 = _mm_sub_pd(iy2,jy1);
736 dz21 = _mm_sub_pd(iz2,jz1);
737 dx22 = _mm_sub_pd(ix2,jx2);
738 dy22 = _mm_sub_pd(iy2,jy2);
739 dz22 = _mm_sub_pd(iz2,jz2);
740 dx23 = _mm_sub_pd(ix2,jx3);
741 dy23 = _mm_sub_pd(iy2,jy3);
742 dz23 = _mm_sub_pd(iz2,jz3);
743 dx31 = _mm_sub_pd(ix3,jx1);
744 dy31 = _mm_sub_pd(iy3,jy1);
745 dz31 = _mm_sub_pd(iz3,jz1);
746 dx32 = _mm_sub_pd(ix3,jx2);
747 dy32 = _mm_sub_pd(iy3,jy2);
748 dz32 = _mm_sub_pd(iz3,jz2);
749 dx33 = _mm_sub_pd(ix3,jx3);
750 dy33 = _mm_sub_pd(iy3,jy3);
751 dz33 = _mm_sub_pd(iz3,jz3);
753 /* Calculate squared distance and things based on it */
754 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
755 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
756 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
757 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
758 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
759 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
760 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
761 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
762 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
763 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
765 rinv00 = gmx_mm_invsqrt_pd(rsq00);
766 rinv11 = gmx_mm_invsqrt_pd(rsq11);
767 rinv12 = gmx_mm_invsqrt_pd(rsq12);
768 rinv13 = gmx_mm_invsqrt_pd(rsq13);
769 rinv21 = gmx_mm_invsqrt_pd(rsq21);
770 rinv22 = gmx_mm_invsqrt_pd(rsq22);
771 rinv23 = gmx_mm_invsqrt_pd(rsq23);
772 rinv31 = gmx_mm_invsqrt_pd(rsq31);
773 rinv32 = gmx_mm_invsqrt_pd(rsq32);
774 rinv33 = gmx_mm_invsqrt_pd(rsq33);
776 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
777 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
778 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
779 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
780 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
781 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
782 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
783 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
784 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
785 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
787 fjx0 = _mm_setzero_pd();
788 fjy0 = _mm_setzero_pd();
789 fjz0 = _mm_setzero_pd();
790 fjx1 = _mm_setzero_pd();
791 fjy1 = _mm_setzero_pd();
792 fjz1 = _mm_setzero_pd();
793 fjx2 = _mm_setzero_pd();
794 fjy2 = _mm_setzero_pd();
795 fjz2 = _mm_setzero_pd();
796 fjx3 = _mm_setzero_pd();
797 fjy3 = _mm_setzero_pd();
798 fjz3 = _mm_setzero_pd();
800 /**************************
801 * CALCULATE INTERACTIONS *
802 **************************/
804 if (gmx_mm_any_lt(rsq00,rcutoff2))
807 r00 = _mm_mul_pd(rsq00,rinv00);
809 /* LENNARD-JONES DISPERSION/REPULSION */
811 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
812 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
813 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
814 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
815 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
817 d = _mm_sub_pd(r00,rswitch);
818 d = _mm_max_pd(d,_mm_setzero_pd());
819 d2 = _mm_mul_pd(d,d);
820 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
822 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
824 /* Evaluate switch function */
825 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
826 fvdw = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
827 vvdw = _mm_mul_pd(vvdw,sw);
828 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
830 /* Update potential sum for this i atom from the interaction with this j atom. */
831 vvdw = _mm_and_pd(vvdw,cutoff_mask);
832 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
833 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
835 fscal = fvdw;
837 fscal = _mm_and_pd(fscal,cutoff_mask);
839 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
841 /* Calculate temporary vectorial force */
842 tx = _mm_mul_pd(fscal,dx00);
843 ty = _mm_mul_pd(fscal,dy00);
844 tz = _mm_mul_pd(fscal,dz00);
846 /* Update vectorial force */
847 fix0 = _mm_add_pd(fix0,tx);
848 fiy0 = _mm_add_pd(fiy0,ty);
849 fiz0 = _mm_add_pd(fiz0,tz);
851 fjx0 = _mm_add_pd(fjx0,tx);
852 fjy0 = _mm_add_pd(fjy0,ty);
853 fjz0 = _mm_add_pd(fjz0,tz);
857 /**************************
858 * CALCULATE INTERACTIONS *
859 **************************/
861 if (gmx_mm_any_lt(rsq11,rcutoff2))
864 /* REACTION-FIELD ELECTROSTATICS */
865 velec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_add_pd(rinv11,_mm_mul_pd(krf,rsq11)),crf));
866 felec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
868 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
870 /* Update potential sum for this i atom from the interaction with this j atom. */
871 velec = _mm_and_pd(velec,cutoff_mask);
872 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
873 velecsum = _mm_add_pd(velecsum,velec);
875 fscal = felec;
877 fscal = _mm_and_pd(fscal,cutoff_mask);
879 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
881 /* Calculate temporary vectorial force */
882 tx = _mm_mul_pd(fscal,dx11);
883 ty = _mm_mul_pd(fscal,dy11);
884 tz = _mm_mul_pd(fscal,dz11);
886 /* Update vectorial force */
887 fix1 = _mm_add_pd(fix1,tx);
888 fiy1 = _mm_add_pd(fiy1,ty);
889 fiz1 = _mm_add_pd(fiz1,tz);
891 fjx1 = _mm_add_pd(fjx1,tx);
892 fjy1 = _mm_add_pd(fjy1,ty);
893 fjz1 = _mm_add_pd(fjz1,tz);
897 /**************************
898 * CALCULATE INTERACTIONS *
899 **************************/
901 if (gmx_mm_any_lt(rsq12,rcutoff2))
904 /* REACTION-FIELD ELECTROSTATICS */
905 velec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_add_pd(rinv12,_mm_mul_pd(krf,rsq12)),crf));
906 felec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
908 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
910 /* Update potential sum for this i atom from the interaction with this j atom. */
911 velec = _mm_and_pd(velec,cutoff_mask);
912 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
913 velecsum = _mm_add_pd(velecsum,velec);
915 fscal = felec;
917 fscal = _mm_and_pd(fscal,cutoff_mask);
919 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
921 /* Calculate temporary vectorial force */
922 tx = _mm_mul_pd(fscal,dx12);
923 ty = _mm_mul_pd(fscal,dy12);
924 tz = _mm_mul_pd(fscal,dz12);
926 /* Update vectorial force */
927 fix1 = _mm_add_pd(fix1,tx);
928 fiy1 = _mm_add_pd(fiy1,ty);
929 fiz1 = _mm_add_pd(fiz1,tz);
931 fjx2 = _mm_add_pd(fjx2,tx);
932 fjy2 = _mm_add_pd(fjy2,ty);
933 fjz2 = _mm_add_pd(fjz2,tz);
937 /**************************
938 * CALCULATE INTERACTIONS *
939 **************************/
941 if (gmx_mm_any_lt(rsq13,rcutoff2))
944 /* REACTION-FIELD ELECTROSTATICS */
945 velec = _mm_mul_pd(qq13,_mm_sub_pd(_mm_add_pd(rinv13,_mm_mul_pd(krf,rsq13)),crf));
946 felec = _mm_mul_pd(qq13,_mm_sub_pd(_mm_mul_pd(rinv13,rinvsq13),krf2));
948 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
950 /* Update potential sum for this i atom from the interaction with this j atom. */
951 velec = _mm_and_pd(velec,cutoff_mask);
952 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
953 velecsum = _mm_add_pd(velecsum,velec);
955 fscal = felec;
957 fscal = _mm_and_pd(fscal,cutoff_mask);
959 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
961 /* Calculate temporary vectorial force */
962 tx = _mm_mul_pd(fscal,dx13);
963 ty = _mm_mul_pd(fscal,dy13);
964 tz = _mm_mul_pd(fscal,dz13);
966 /* Update vectorial force */
967 fix1 = _mm_add_pd(fix1,tx);
968 fiy1 = _mm_add_pd(fiy1,ty);
969 fiz1 = _mm_add_pd(fiz1,tz);
971 fjx3 = _mm_add_pd(fjx3,tx);
972 fjy3 = _mm_add_pd(fjy3,ty);
973 fjz3 = _mm_add_pd(fjz3,tz);
977 /**************************
978 * CALCULATE INTERACTIONS *
979 **************************/
981 if (gmx_mm_any_lt(rsq21,rcutoff2))
984 /* REACTION-FIELD ELECTROSTATICS */
985 velec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_add_pd(rinv21,_mm_mul_pd(krf,rsq21)),crf));
986 felec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
988 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
990 /* Update potential sum for this i atom from the interaction with this j atom. */
991 velec = _mm_and_pd(velec,cutoff_mask);
992 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
993 velecsum = _mm_add_pd(velecsum,velec);
995 fscal = felec;
997 fscal = _mm_and_pd(fscal,cutoff_mask);
999 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1001 /* Calculate temporary vectorial force */
1002 tx = _mm_mul_pd(fscal,dx21);
1003 ty = _mm_mul_pd(fscal,dy21);
1004 tz = _mm_mul_pd(fscal,dz21);
1006 /* Update vectorial force */
1007 fix2 = _mm_add_pd(fix2,tx);
1008 fiy2 = _mm_add_pd(fiy2,ty);
1009 fiz2 = _mm_add_pd(fiz2,tz);
1011 fjx1 = _mm_add_pd(fjx1,tx);
1012 fjy1 = _mm_add_pd(fjy1,ty);
1013 fjz1 = _mm_add_pd(fjz1,tz);
1017 /**************************
1018 * CALCULATE INTERACTIONS *
1019 **************************/
1021 if (gmx_mm_any_lt(rsq22,rcutoff2))
1024 /* REACTION-FIELD ELECTROSTATICS */
1025 velec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_add_pd(rinv22,_mm_mul_pd(krf,rsq22)),crf));
1026 felec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
1028 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
1030 /* Update potential sum for this i atom from the interaction with this j atom. */
1031 velec = _mm_and_pd(velec,cutoff_mask);
1032 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1033 velecsum = _mm_add_pd(velecsum,velec);
1035 fscal = felec;
1037 fscal = _mm_and_pd(fscal,cutoff_mask);
1039 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1041 /* Calculate temporary vectorial force */
1042 tx = _mm_mul_pd(fscal,dx22);
1043 ty = _mm_mul_pd(fscal,dy22);
1044 tz = _mm_mul_pd(fscal,dz22);
1046 /* Update vectorial force */
1047 fix2 = _mm_add_pd(fix2,tx);
1048 fiy2 = _mm_add_pd(fiy2,ty);
1049 fiz2 = _mm_add_pd(fiz2,tz);
1051 fjx2 = _mm_add_pd(fjx2,tx);
1052 fjy2 = _mm_add_pd(fjy2,ty);
1053 fjz2 = _mm_add_pd(fjz2,tz);
1057 /**************************
1058 * CALCULATE INTERACTIONS *
1059 **************************/
1061 if (gmx_mm_any_lt(rsq23,rcutoff2))
1064 /* REACTION-FIELD ELECTROSTATICS */
1065 velec = _mm_mul_pd(qq23,_mm_sub_pd(_mm_add_pd(rinv23,_mm_mul_pd(krf,rsq23)),crf));
1066 felec = _mm_mul_pd(qq23,_mm_sub_pd(_mm_mul_pd(rinv23,rinvsq23),krf2));
1068 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
1070 /* Update potential sum for this i atom from the interaction with this j atom. */
1071 velec = _mm_and_pd(velec,cutoff_mask);
1072 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1073 velecsum = _mm_add_pd(velecsum,velec);
1075 fscal = felec;
1077 fscal = _mm_and_pd(fscal,cutoff_mask);
1079 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1081 /* Calculate temporary vectorial force */
1082 tx = _mm_mul_pd(fscal,dx23);
1083 ty = _mm_mul_pd(fscal,dy23);
1084 tz = _mm_mul_pd(fscal,dz23);
1086 /* Update vectorial force */
1087 fix2 = _mm_add_pd(fix2,tx);
1088 fiy2 = _mm_add_pd(fiy2,ty);
1089 fiz2 = _mm_add_pd(fiz2,tz);
1091 fjx3 = _mm_add_pd(fjx3,tx);
1092 fjy3 = _mm_add_pd(fjy3,ty);
1093 fjz3 = _mm_add_pd(fjz3,tz);
1097 /**************************
1098 * CALCULATE INTERACTIONS *
1099 **************************/
1101 if (gmx_mm_any_lt(rsq31,rcutoff2))
1104 /* REACTION-FIELD ELECTROSTATICS */
1105 velec = _mm_mul_pd(qq31,_mm_sub_pd(_mm_add_pd(rinv31,_mm_mul_pd(krf,rsq31)),crf));
1106 felec = _mm_mul_pd(qq31,_mm_sub_pd(_mm_mul_pd(rinv31,rinvsq31),krf2));
1108 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
1110 /* Update potential sum for this i atom from the interaction with this j atom. */
1111 velec = _mm_and_pd(velec,cutoff_mask);
1112 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1113 velecsum = _mm_add_pd(velecsum,velec);
1115 fscal = felec;
1117 fscal = _mm_and_pd(fscal,cutoff_mask);
1119 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1121 /* Calculate temporary vectorial force */
1122 tx = _mm_mul_pd(fscal,dx31);
1123 ty = _mm_mul_pd(fscal,dy31);
1124 tz = _mm_mul_pd(fscal,dz31);
1126 /* Update vectorial force */
1127 fix3 = _mm_add_pd(fix3,tx);
1128 fiy3 = _mm_add_pd(fiy3,ty);
1129 fiz3 = _mm_add_pd(fiz3,tz);
1131 fjx1 = _mm_add_pd(fjx1,tx);
1132 fjy1 = _mm_add_pd(fjy1,ty);
1133 fjz1 = _mm_add_pd(fjz1,tz);
1137 /**************************
1138 * CALCULATE INTERACTIONS *
1139 **************************/
1141 if (gmx_mm_any_lt(rsq32,rcutoff2))
1144 /* REACTION-FIELD ELECTROSTATICS */
1145 velec = _mm_mul_pd(qq32,_mm_sub_pd(_mm_add_pd(rinv32,_mm_mul_pd(krf,rsq32)),crf));
1146 felec = _mm_mul_pd(qq32,_mm_sub_pd(_mm_mul_pd(rinv32,rinvsq32),krf2));
1148 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
1150 /* Update potential sum for this i atom from the interaction with this j atom. */
1151 velec = _mm_and_pd(velec,cutoff_mask);
1152 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1153 velecsum = _mm_add_pd(velecsum,velec);
1155 fscal = felec;
1157 fscal = _mm_and_pd(fscal,cutoff_mask);
1159 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1161 /* Calculate temporary vectorial force */
1162 tx = _mm_mul_pd(fscal,dx32);
1163 ty = _mm_mul_pd(fscal,dy32);
1164 tz = _mm_mul_pd(fscal,dz32);
1166 /* Update vectorial force */
1167 fix3 = _mm_add_pd(fix3,tx);
1168 fiy3 = _mm_add_pd(fiy3,ty);
1169 fiz3 = _mm_add_pd(fiz3,tz);
1171 fjx2 = _mm_add_pd(fjx2,tx);
1172 fjy2 = _mm_add_pd(fjy2,ty);
1173 fjz2 = _mm_add_pd(fjz2,tz);
1177 /**************************
1178 * CALCULATE INTERACTIONS *
1179 **************************/
1181 if (gmx_mm_any_lt(rsq33,rcutoff2))
1184 /* REACTION-FIELD ELECTROSTATICS */
1185 velec = _mm_mul_pd(qq33,_mm_sub_pd(_mm_add_pd(rinv33,_mm_mul_pd(krf,rsq33)),crf));
1186 felec = _mm_mul_pd(qq33,_mm_sub_pd(_mm_mul_pd(rinv33,rinvsq33),krf2));
1188 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
1190 /* Update potential sum for this i atom from the interaction with this j atom. */
1191 velec = _mm_and_pd(velec,cutoff_mask);
1192 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1193 velecsum = _mm_add_pd(velecsum,velec);
1195 fscal = felec;
1197 fscal = _mm_and_pd(fscal,cutoff_mask);
1199 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1201 /* Calculate temporary vectorial force */
1202 tx = _mm_mul_pd(fscal,dx33);
1203 ty = _mm_mul_pd(fscal,dy33);
1204 tz = _mm_mul_pd(fscal,dz33);
1206 /* Update vectorial force */
1207 fix3 = _mm_add_pd(fix3,tx);
1208 fiy3 = _mm_add_pd(fiy3,ty);
1209 fiz3 = _mm_add_pd(fiz3,tz);
1211 fjx3 = _mm_add_pd(fjx3,tx);
1212 fjy3 = _mm_add_pd(fjy3,ty);
1213 fjz3 = _mm_add_pd(fjz3,tz);
1217 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1219 /* Inner loop uses 386 flops */
1222 /* End of innermost loop */
1224 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1225 f+i_coord_offset,fshift+i_shift_offset);
1227 ggid = gid[iidx];
1228 /* Update potential energies */
1229 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1230 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1232 /* Increment number of inner iterations */
1233 inneriter += j_index_end - j_index_start;
1235 /* Outer loop uses 26 flops */
1238 /* Increment number of outer iterations */
1239 outeriter += nri;
1241 /* Update outer/inner flops */
1243 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*386);
1246 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sse2_double
1247 * Electrostatics interaction: ReactionField
1248 * VdW interaction: LennardJones
1249 * Geometry: Water4-Water4
1250 * Calculate force/pot: Force
1252 void
1253 nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sse2_double
1254 (t_nblist * gmx_restrict nlist,
1255 rvec * gmx_restrict xx,
1256 rvec * gmx_restrict ff,
1257 t_forcerec * gmx_restrict fr,
1258 t_mdatoms * gmx_restrict mdatoms,
1259 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1260 t_nrnb * gmx_restrict nrnb)
1262 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1263 * just 0 for non-waters.
1264 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1265 * jnr indices corresponding to data put in the four positions in the SIMD register.
1267 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1268 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1269 int jnrA,jnrB;
1270 int j_coord_offsetA,j_coord_offsetB;
1271 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1272 real rcutoff_scalar;
1273 real *shiftvec,*fshift,*x,*f;
1274 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1275 int vdwioffset0;
1276 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1277 int vdwioffset1;
1278 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1279 int vdwioffset2;
1280 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1281 int vdwioffset3;
1282 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1283 int vdwjidx0A,vdwjidx0B;
1284 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1285 int vdwjidx1A,vdwjidx1B;
1286 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1287 int vdwjidx2A,vdwjidx2B;
1288 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1289 int vdwjidx3A,vdwjidx3B;
1290 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1291 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1292 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1293 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1294 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1295 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1296 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1297 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1298 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1299 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1300 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1301 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1302 real *charge;
1303 int nvdwtype;
1304 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1305 int *vdwtype;
1306 real *vdwparam;
1307 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1308 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1309 __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1310 real rswitch_scalar,d_scalar;
1311 __m128d dummy_mask,cutoff_mask;
1312 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1313 __m128d one = _mm_set1_pd(1.0);
1314 __m128d two = _mm_set1_pd(2.0);
1315 x = xx[0];
1316 f = ff[0];
1318 nri = nlist->nri;
1319 iinr = nlist->iinr;
1320 jindex = nlist->jindex;
1321 jjnr = nlist->jjnr;
1322 shiftidx = nlist->shift;
1323 gid = nlist->gid;
1324 shiftvec = fr->shift_vec[0];
1325 fshift = fr->fshift[0];
1326 facel = _mm_set1_pd(fr->epsfac);
1327 charge = mdatoms->chargeA;
1328 krf = _mm_set1_pd(fr->ic->k_rf);
1329 krf2 = _mm_set1_pd(fr->ic->k_rf*2.0);
1330 crf = _mm_set1_pd(fr->ic->c_rf);
1331 nvdwtype = fr->ntype;
1332 vdwparam = fr->nbfp;
1333 vdwtype = mdatoms->typeA;
1335 /* Setup water-specific parameters */
1336 inr = nlist->iinr[0];
1337 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1338 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1339 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
1340 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1342 jq1 = _mm_set1_pd(charge[inr+1]);
1343 jq2 = _mm_set1_pd(charge[inr+2]);
1344 jq3 = _mm_set1_pd(charge[inr+3]);
1345 vdwjidx0A = 2*vdwtype[inr+0];
1346 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1347 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1348 qq11 = _mm_mul_pd(iq1,jq1);
1349 qq12 = _mm_mul_pd(iq1,jq2);
1350 qq13 = _mm_mul_pd(iq1,jq3);
1351 qq21 = _mm_mul_pd(iq2,jq1);
1352 qq22 = _mm_mul_pd(iq2,jq2);
1353 qq23 = _mm_mul_pd(iq2,jq3);
1354 qq31 = _mm_mul_pd(iq3,jq1);
1355 qq32 = _mm_mul_pd(iq3,jq2);
1356 qq33 = _mm_mul_pd(iq3,jq3);
1358 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1359 rcutoff_scalar = fr->rcoulomb;
1360 rcutoff = _mm_set1_pd(rcutoff_scalar);
1361 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
1363 rswitch_scalar = fr->rvdw_switch;
1364 rswitch = _mm_set1_pd(rswitch_scalar);
1365 /* Setup switch parameters */
1366 d_scalar = rcutoff_scalar-rswitch_scalar;
1367 d = _mm_set1_pd(d_scalar);
1368 swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
1369 swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1370 swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1371 swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
1372 swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1373 swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1375 /* Avoid stupid compiler warnings */
1376 jnrA = jnrB = 0;
1377 j_coord_offsetA = 0;
1378 j_coord_offsetB = 0;
1380 outeriter = 0;
1381 inneriter = 0;
1383 /* Start outer loop over neighborlists */
1384 for(iidx=0; iidx<nri; iidx++)
1386 /* Load shift vector for this list */
1387 i_shift_offset = DIM*shiftidx[iidx];
1389 /* Load limits for loop over neighbors */
1390 j_index_start = jindex[iidx];
1391 j_index_end = jindex[iidx+1];
1393 /* Get outer coordinate index */
1394 inr = iinr[iidx];
1395 i_coord_offset = DIM*inr;
1397 /* Load i particle coords and add shift vector */
1398 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1399 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1401 fix0 = _mm_setzero_pd();
1402 fiy0 = _mm_setzero_pd();
1403 fiz0 = _mm_setzero_pd();
1404 fix1 = _mm_setzero_pd();
1405 fiy1 = _mm_setzero_pd();
1406 fiz1 = _mm_setzero_pd();
1407 fix2 = _mm_setzero_pd();
1408 fiy2 = _mm_setzero_pd();
1409 fiz2 = _mm_setzero_pd();
1410 fix3 = _mm_setzero_pd();
1411 fiy3 = _mm_setzero_pd();
1412 fiz3 = _mm_setzero_pd();
1414 /* Start inner kernel loop */
1415 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1418 /* Get j neighbor index, and coordinate index */
1419 jnrA = jjnr[jidx];
1420 jnrB = jjnr[jidx+1];
1421 j_coord_offsetA = DIM*jnrA;
1422 j_coord_offsetB = DIM*jnrB;
1424 /* load j atom coordinates */
1425 gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1426 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1427 &jy2,&jz2,&jx3,&jy3,&jz3);
1429 /* Calculate displacement vector */
1430 dx00 = _mm_sub_pd(ix0,jx0);
1431 dy00 = _mm_sub_pd(iy0,jy0);
1432 dz00 = _mm_sub_pd(iz0,jz0);
1433 dx11 = _mm_sub_pd(ix1,jx1);
1434 dy11 = _mm_sub_pd(iy1,jy1);
1435 dz11 = _mm_sub_pd(iz1,jz1);
1436 dx12 = _mm_sub_pd(ix1,jx2);
1437 dy12 = _mm_sub_pd(iy1,jy2);
1438 dz12 = _mm_sub_pd(iz1,jz2);
1439 dx13 = _mm_sub_pd(ix1,jx3);
1440 dy13 = _mm_sub_pd(iy1,jy3);
1441 dz13 = _mm_sub_pd(iz1,jz3);
1442 dx21 = _mm_sub_pd(ix2,jx1);
1443 dy21 = _mm_sub_pd(iy2,jy1);
1444 dz21 = _mm_sub_pd(iz2,jz1);
1445 dx22 = _mm_sub_pd(ix2,jx2);
1446 dy22 = _mm_sub_pd(iy2,jy2);
1447 dz22 = _mm_sub_pd(iz2,jz2);
1448 dx23 = _mm_sub_pd(ix2,jx3);
1449 dy23 = _mm_sub_pd(iy2,jy3);
1450 dz23 = _mm_sub_pd(iz2,jz3);
1451 dx31 = _mm_sub_pd(ix3,jx1);
1452 dy31 = _mm_sub_pd(iy3,jy1);
1453 dz31 = _mm_sub_pd(iz3,jz1);
1454 dx32 = _mm_sub_pd(ix3,jx2);
1455 dy32 = _mm_sub_pd(iy3,jy2);
1456 dz32 = _mm_sub_pd(iz3,jz2);
1457 dx33 = _mm_sub_pd(ix3,jx3);
1458 dy33 = _mm_sub_pd(iy3,jy3);
1459 dz33 = _mm_sub_pd(iz3,jz3);
1461 /* Calculate squared distance and things based on it */
1462 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1463 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1464 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1465 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1466 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1467 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1468 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1469 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1470 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1471 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1473 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1474 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1475 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1476 rinv13 = gmx_mm_invsqrt_pd(rsq13);
1477 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1478 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1479 rinv23 = gmx_mm_invsqrt_pd(rsq23);
1480 rinv31 = gmx_mm_invsqrt_pd(rsq31);
1481 rinv32 = gmx_mm_invsqrt_pd(rsq32);
1482 rinv33 = gmx_mm_invsqrt_pd(rsq33);
1484 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1485 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1486 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1487 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
1488 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1489 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1490 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
1491 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
1492 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
1493 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
1495 fjx0 = _mm_setzero_pd();
1496 fjy0 = _mm_setzero_pd();
1497 fjz0 = _mm_setzero_pd();
1498 fjx1 = _mm_setzero_pd();
1499 fjy1 = _mm_setzero_pd();
1500 fjz1 = _mm_setzero_pd();
1501 fjx2 = _mm_setzero_pd();
1502 fjy2 = _mm_setzero_pd();
1503 fjz2 = _mm_setzero_pd();
1504 fjx3 = _mm_setzero_pd();
1505 fjy3 = _mm_setzero_pd();
1506 fjz3 = _mm_setzero_pd();
1508 /**************************
1509 * CALCULATE INTERACTIONS *
1510 **************************/
1512 if (gmx_mm_any_lt(rsq00,rcutoff2))
1515 r00 = _mm_mul_pd(rsq00,rinv00);
1517 /* LENNARD-JONES DISPERSION/REPULSION */
1519 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1520 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
1521 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
1522 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
1523 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
1525 d = _mm_sub_pd(r00,rswitch);
1526 d = _mm_max_pd(d,_mm_setzero_pd());
1527 d2 = _mm_mul_pd(d,d);
1528 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1530 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1532 /* Evaluate switch function */
1533 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1534 fvdw = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
1535 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
1537 fscal = fvdw;
1539 fscal = _mm_and_pd(fscal,cutoff_mask);
1541 /* Calculate temporary vectorial force */
1542 tx = _mm_mul_pd(fscal,dx00);
1543 ty = _mm_mul_pd(fscal,dy00);
1544 tz = _mm_mul_pd(fscal,dz00);
1546 /* Update vectorial force */
1547 fix0 = _mm_add_pd(fix0,tx);
1548 fiy0 = _mm_add_pd(fiy0,ty);
1549 fiz0 = _mm_add_pd(fiz0,tz);
1551 fjx0 = _mm_add_pd(fjx0,tx);
1552 fjy0 = _mm_add_pd(fjy0,ty);
1553 fjz0 = _mm_add_pd(fjz0,tz);
1557 /**************************
1558 * CALCULATE INTERACTIONS *
1559 **************************/
1561 if (gmx_mm_any_lt(rsq11,rcutoff2))
1564 /* REACTION-FIELD ELECTROSTATICS */
1565 felec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
1567 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
1569 fscal = felec;
1571 fscal = _mm_and_pd(fscal,cutoff_mask);
1573 /* Calculate temporary vectorial force */
1574 tx = _mm_mul_pd(fscal,dx11);
1575 ty = _mm_mul_pd(fscal,dy11);
1576 tz = _mm_mul_pd(fscal,dz11);
1578 /* Update vectorial force */
1579 fix1 = _mm_add_pd(fix1,tx);
1580 fiy1 = _mm_add_pd(fiy1,ty);
1581 fiz1 = _mm_add_pd(fiz1,tz);
1583 fjx1 = _mm_add_pd(fjx1,tx);
1584 fjy1 = _mm_add_pd(fjy1,ty);
1585 fjz1 = _mm_add_pd(fjz1,tz);
1589 /**************************
1590 * CALCULATE INTERACTIONS *
1591 **************************/
1593 if (gmx_mm_any_lt(rsq12,rcutoff2))
1596 /* REACTION-FIELD ELECTROSTATICS */
1597 felec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
1599 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
1601 fscal = felec;
1603 fscal = _mm_and_pd(fscal,cutoff_mask);
1605 /* Calculate temporary vectorial force */
1606 tx = _mm_mul_pd(fscal,dx12);
1607 ty = _mm_mul_pd(fscal,dy12);
1608 tz = _mm_mul_pd(fscal,dz12);
1610 /* Update vectorial force */
1611 fix1 = _mm_add_pd(fix1,tx);
1612 fiy1 = _mm_add_pd(fiy1,ty);
1613 fiz1 = _mm_add_pd(fiz1,tz);
1615 fjx2 = _mm_add_pd(fjx2,tx);
1616 fjy2 = _mm_add_pd(fjy2,ty);
1617 fjz2 = _mm_add_pd(fjz2,tz);
1621 /**************************
1622 * CALCULATE INTERACTIONS *
1623 **************************/
1625 if (gmx_mm_any_lt(rsq13,rcutoff2))
1628 /* REACTION-FIELD ELECTROSTATICS */
1629 felec = _mm_mul_pd(qq13,_mm_sub_pd(_mm_mul_pd(rinv13,rinvsq13),krf2));
1631 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
1633 fscal = felec;
1635 fscal = _mm_and_pd(fscal,cutoff_mask);
1637 /* Calculate temporary vectorial force */
1638 tx = _mm_mul_pd(fscal,dx13);
1639 ty = _mm_mul_pd(fscal,dy13);
1640 tz = _mm_mul_pd(fscal,dz13);
1642 /* Update vectorial force */
1643 fix1 = _mm_add_pd(fix1,tx);
1644 fiy1 = _mm_add_pd(fiy1,ty);
1645 fiz1 = _mm_add_pd(fiz1,tz);
1647 fjx3 = _mm_add_pd(fjx3,tx);
1648 fjy3 = _mm_add_pd(fjy3,ty);
1649 fjz3 = _mm_add_pd(fjz3,tz);
1653 /**************************
1654 * CALCULATE INTERACTIONS *
1655 **************************/
1657 if (gmx_mm_any_lt(rsq21,rcutoff2))
1660 /* REACTION-FIELD ELECTROSTATICS */
1661 felec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
1663 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
1665 fscal = felec;
1667 fscal = _mm_and_pd(fscal,cutoff_mask);
1669 /* Calculate temporary vectorial force */
1670 tx = _mm_mul_pd(fscal,dx21);
1671 ty = _mm_mul_pd(fscal,dy21);
1672 tz = _mm_mul_pd(fscal,dz21);
1674 /* Update vectorial force */
1675 fix2 = _mm_add_pd(fix2,tx);
1676 fiy2 = _mm_add_pd(fiy2,ty);
1677 fiz2 = _mm_add_pd(fiz2,tz);
1679 fjx1 = _mm_add_pd(fjx1,tx);
1680 fjy1 = _mm_add_pd(fjy1,ty);
1681 fjz1 = _mm_add_pd(fjz1,tz);
1685 /**************************
1686 * CALCULATE INTERACTIONS *
1687 **************************/
1689 if (gmx_mm_any_lt(rsq22,rcutoff2))
1692 /* REACTION-FIELD ELECTROSTATICS */
1693 felec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
1695 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
1697 fscal = felec;
1699 fscal = _mm_and_pd(fscal,cutoff_mask);
1701 /* Calculate temporary vectorial force */
1702 tx = _mm_mul_pd(fscal,dx22);
1703 ty = _mm_mul_pd(fscal,dy22);
1704 tz = _mm_mul_pd(fscal,dz22);
1706 /* Update vectorial force */
1707 fix2 = _mm_add_pd(fix2,tx);
1708 fiy2 = _mm_add_pd(fiy2,ty);
1709 fiz2 = _mm_add_pd(fiz2,tz);
1711 fjx2 = _mm_add_pd(fjx2,tx);
1712 fjy2 = _mm_add_pd(fjy2,ty);
1713 fjz2 = _mm_add_pd(fjz2,tz);
1717 /**************************
1718 * CALCULATE INTERACTIONS *
1719 **************************/
1721 if (gmx_mm_any_lt(rsq23,rcutoff2))
1724 /* REACTION-FIELD ELECTROSTATICS */
1725 felec = _mm_mul_pd(qq23,_mm_sub_pd(_mm_mul_pd(rinv23,rinvsq23),krf2));
1727 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
1729 fscal = felec;
1731 fscal = _mm_and_pd(fscal,cutoff_mask);
1733 /* Calculate temporary vectorial force */
1734 tx = _mm_mul_pd(fscal,dx23);
1735 ty = _mm_mul_pd(fscal,dy23);
1736 tz = _mm_mul_pd(fscal,dz23);
1738 /* Update vectorial force */
1739 fix2 = _mm_add_pd(fix2,tx);
1740 fiy2 = _mm_add_pd(fiy2,ty);
1741 fiz2 = _mm_add_pd(fiz2,tz);
1743 fjx3 = _mm_add_pd(fjx3,tx);
1744 fjy3 = _mm_add_pd(fjy3,ty);
1745 fjz3 = _mm_add_pd(fjz3,tz);
1749 /**************************
1750 * CALCULATE INTERACTIONS *
1751 **************************/
1753 if (gmx_mm_any_lt(rsq31,rcutoff2))
1756 /* REACTION-FIELD ELECTROSTATICS */
1757 felec = _mm_mul_pd(qq31,_mm_sub_pd(_mm_mul_pd(rinv31,rinvsq31),krf2));
1759 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
1761 fscal = felec;
1763 fscal = _mm_and_pd(fscal,cutoff_mask);
1765 /* Calculate temporary vectorial force */
1766 tx = _mm_mul_pd(fscal,dx31);
1767 ty = _mm_mul_pd(fscal,dy31);
1768 tz = _mm_mul_pd(fscal,dz31);
1770 /* Update vectorial force */
1771 fix3 = _mm_add_pd(fix3,tx);
1772 fiy3 = _mm_add_pd(fiy3,ty);
1773 fiz3 = _mm_add_pd(fiz3,tz);
1775 fjx1 = _mm_add_pd(fjx1,tx);
1776 fjy1 = _mm_add_pd(fjy1,ty);
1777 fjz1 = _mm_add_pd(fjz1,tz);
1781 /**************************
1782 * CALCULATE INTERACTIONS *
1783 **************************/
1785 if (gmx_mm_any_lt(rsq32,rcutoff2))
1788 /* REACTION-FIELD ELECTROSTATICS */
1789 felec = _mm_mul_pd(qq32,_mm_sub_pd(_mm_mul_pd(rinv32,rinvsq32),krf2));
1791 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
1793 fscal = felec;
1795 fscal = _mm_and_pd(fscal,cutoff_mask);
1797 /* Calculate temporary vectorial force */
1798 tx = _mm_mul_pd(fscal,dx32);
1799 ty = _mm_mul_pd(fscal,dy32);
1800 tz = _mm_mul_pd(fscal,dz32);
1802 /* Update vectorial force */
1803 fix3 = _mm_add_pd(fix3,tx);
1804 fiy3 = _mm_add_pd(fiy3,ty);
1805 fiz3 = _mm_add_pd(fiz3,tz);
1807 fjx2 = _mm_add_pd(fjx2,tx);
1808 fjy2 = _mm_add_pd(fjy2,ty);
1809 fjz2 = _mm_add_pd(fjz2,tz);
1813 /**************************
1814 * CALCULATE INTERACTIONS *
1815 **************************/
1817 if (gmx_mm_any_lt(rsq33,rcutoff2))
1820 /* REACTION-FIELD ELECTROSTATICS */
1821 felec = _mm_mul_pd(qq33,_mm_sub_pd(_mm_mul_pd(rinv33,rinvsq33),krf2));
1823 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
1825 fscal = felec;
1827 fscal = _mm_and_pd(fscal,cutoff_mask);
1829 /* Calculate temporary vectorial force */
1830 tx = _mm_mul_pd(fscal,dx33);
1831 ty = _mm_mul_pd(fscal,dy33);
1832 tz = _mm_mul_pd(fscal,dz33);
1834 /* Update vectorial force */
1835 fix3 = _mm_add_pd(fix3,tx);
1836 fiy3 = _mm_add_pd(fiy3,ty);
1837 fiz3 = _mm_add_pd(fiz3,tz);
1839 fjx3 = _mm_add_pd(fjx3,tx);
1840 fjy3 = _mm_add_pd(fjy3,ty);
1841 fjz3 = _mm_add_pd(fjz3,tz);
1845 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1847 /* Inner loop uses 329 flops */
1850 if(jidx<j_index_end)
1853 jnrA = jjnr[jidx];
1854 j_coord_offsetA = DIM*jnrA;
1856 /* load j atom coordinates */
1857 gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1858 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1859 &jy2,&jz2,&jx3,&jy3,&jz3);
1861 /* Calculate displacement vector */
1862 dx00 = _mm_sub_pd(ix0,jx0);
1863 dy00 = _mm_sub_pd(iy0,jy0);
1864 dz00 = _mm_sub_pd(iz0,jz0);
1865 dx11 = _mm_sub_pd(ix1,jx1);
1866 dy11 = _mm_sub_pd(iy1,jy1);
1867 dz11 = _mm_sub_pd(iz1,jz1);
1868 dx12 = _mm_sub_pd(ix1,jx2);
1869 dy12 = _mm_sub_pd(iy1,jy2);
1870 dz12 = _mm_sub_pd(iz1,jz2);
1871 dx13 = _mm_sub_pd(ix1,jx3);
1872 dy13 = _mm_sub_pd(iy1,jy3);
1873 dz13 = _mm_sub_pd(iz1,jz3);
1874 dx21 = _mm_sub_pd(ix2,jx1);
1875 dy21 = _mm_sub_pd(iy2,jy1);
1876 dz21 = _mm_sub_pd(iz2,jz1);
1877 dx22 = _mm_sub_pd(ix2,jx2);
1878 dy22 = _mm_sub_pd(iy2,jy2);
1879 dz22 = _mm_sub_pd(iz2,jz2);
1880 dx23 = _mm_sub_pd(ix2,jx3);
1881 dy23 = _mm_sub_pd(iy2,jy3);
1882 dz23 = _mm_sub_pd(iz2,jz3);
1883 dx31 = _mm_sub_pd(ix3,jx1);
1884 dy31 = _mm_sub_pd(iy3,jy1);
1885 dz31 = _mm_sub_pd(iz3,jz1);
1886 dx32 = _mm_sub_pd(ix3,jx2);
1887 dy32 = _mm_sub_pd(iy3,jy2);
1888 dz32 = _mm_sub_pd(iz3,jz2);
1889 dx33 = _mm_sub_pd(ix3,jx3);
1890 dy33 = _mm_sub_pd(iy3,jy3);
1891 dz33 = _mm_sub_pd(iz3,jz3);
1893 /* Calculate squared distance and things based on it */
1894 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1895 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1896 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1897 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1898 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1899 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1900 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1901 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1902 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1903 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1905 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1906 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1907 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1908 rinv13 = gmx_mm_invsqrt_pd(rsq13);
1909 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1910 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1911 rinv23 = gmx_mm_invsqrt_pd(rsq23);
1912 rinv31 = gmx_mm_invsqrt_pd(rsq31);
1913 rinv32 = gmx_mm_invsqrt_pd(rsq32);
1914 rinv33 = gmx_mm_invsqrt_pd(rsq33);
1916 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1917 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1918 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1919 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
1920 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1921 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1922 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
1923 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
1924 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
1925 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
1927 fjx0 = _mm_setzero_pd();
1928 fjy0 = _mm_setzero_pd();
1929 fjz0 = _mm_setzero_pd();
1930 fjx1 = _mm_setzero_pd();
1931 fjy1 = _mm_setzero_pd();
1932 fjz1 = _mm_setzero_pd();
1933 fjx2 = _mm_setzero_pd();
1934 fjy2 = _mm_setzero_pd();
1935 fjz2 = _mm_setzero_pd();
1936 fjx3 = _mm_setzero_pd();
1937 fjy3 = _mm_setzero_pd();
1938 fjz3 = _mm_setzero_pd();
1940 /**************************
1941 * CALCULATE INTERACTIONS *
1942 **************************/
1944 if (gmx_mm_any_lt(rsq00,rcutoff2))
1947 r00 = _mm_mul_pd(rsq00,rinv00);
1949 /* LENNARD-JONES DISPERSION/REPULSION */
1951 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1952 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
1953 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
1954 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
1955 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
1957 d = _mm_sub_pd(r00,rswitch);
1958 d = _mm_max_pd(d,_mm_setzero_pd());
1959 d2 = _mm_mul_pd(d,d);
1960 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1962 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1964 /* Evaluate switch function */
1965 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1966 fvdw = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
1967 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
1969 fscal = fvdw;
1971 fscal = _mm_and_pd(fscal,cutoff_mask);
1973 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1975 /* Calculate temporary vectorial force */
1976 tx = _mm_mul_pd(fscal,dx00);
1977 ty = _mm_mul_pd(fscal,dy00);
1978 tz = _mm_mul_pd(fscal,dz00);
1980 /* Update vectorial force */
1981 fix0 = _mm_add_pd(fix0,tx);
1982 fiy0 = _mm_add_pd(fiy0,ty);
1983 fiz0 = _mm_add_pd(fiz0,tz);
1985 fjx0 = _mm_add_pd(fjx0,tx);
1986 fjy0 = _mm_add_pd(fjy0,ty);
1987 fjz0 = _mm_add_pd(fjz0,tz);
1991 /**************************
1992 * CALCULATE INTERACTIONS *
1993 **************************/
1995 if (gmx_mm_any_lt(rsq11,rcutoff2))
1998 /* REACTION-FIELD ELECTROSTATICS */
1999 felec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
2001 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
2003 fscal = felec;
2005 fscal = _mm_and_pd(fscal,cutoff_mask);
2007 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2009 /* Calculate temporary vectorial force */
2010 tx = _mm_mul_pd(fscal,dx11);
2011 ty = _mm_mul_pd(fscal,dy11);
2012 tz = _mm_mul_pd(fscal,dz11);
2014 /* Update vectorial force */
2015 fix1 = _mm_add_pd(fix1,tx);
2016 fiy1 = _mm_add_pd(fiy1,ty);
2017 fiz1 = _mm_add_pd(fiz1,tz);
2019 fjx1 = _mm_add_pd(fjx1,tx);
2020 fjy1 = _mm_add_pd(fjy1,ty);
2021 fjz1 = _mm_add_pd(fjz1,tz);
2025 /**************************
2026 * CALCULATE INTERACTIONS *
2027 **************************/
2029 if (gmx_mm_any_lt(rsq12,rcutoff2))
2032 /* REACTION-FIELD ELECTROSTATICS */
2033 felec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
2035 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
2037 fscal = felec;
2039 fscal = _mm_and_pd(fscal,cutoff_mask);
2041 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2043 /* Calculate temporary vectorial force */
2044 tx = _mm_mul_pd(fscal,dx12);
2045 ty = _mm_mul_pd(fscal,dy12);
2046 tz = _mm_mul_pd(fscal,dz12);
2048 /* Update vectorial force */
2049 fix1 = _mm_add_pd(fix1,tx);
2050 fiy1 = _mm_add_pd(fiy1,ty);
2051 fiz1 = _mm_add_pd(fiz1,tz);
2053 fjx2 = _mm_add_pd(fjx2,tx);
2054 fjy2 = _mm_add_pd(fjy2,ty);
2055 fjz2 = _mm_add_pd(fjz2,tz);
2059 /**************************
2060 * CALCULATE INTERACTIONS *
2061 **************************/
2063 if (gmx_mm_any_lt(rsq13,rcutoff2))
2066 /* REACTION-FIELD ELECTROSTATICS */
2067 felec = _mm_mul_pd(qq13,_mm_sub_pd(_mm_mul_pd(rinv13,rinvsq13),krf2));
2069 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
2071 fscal = felec;
2073 fscal = _mm_and_pd(fscal,cutoff_mask);
2075 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2077 /* Calculate temporary vectorial force */
2078 tx = _mm_mul_pd(fscal,dx13);
2079 ty = _mm_mul_pd(fscal,dy13);
2080 tz = _mm_mul_pd(fscal,dz13);
2082 /* Update vectorial force */
2083 fix1 = _mm_add_pd(fix1,tx);
2084 fiy1 = _mm_add_pd(fiy1,ty);
2085 fiz1 = _mm_add_pd(fiz1,tz);
2087 fjx3 = _mm_add_pd(fjx3,tx);
2088 fjy3 = _mm_add_pd(fjy3,ty);
2089 fjz3 = _mm_add_pd(fjz3,tz);
2093 /**************************
2094 * CALCULATE INTERACTIONS *
2095 **************************/
2097 if (gmx_mm_any_lt(rsq21,rcutoff2))
2100 /* REACTION-FIELD ELECTROSTATICS */
2101 felec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
2103 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
2105 fscal = felec;
2107 fscal = _mm_and_pd(fscal,cutoff_mask);
2109 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2111 /* Calculate temporary vectorial force */
2112 tx = _mm_mul_pd(fscal,dx21);
2113 ty = _mm_mul_pd(fscal,dy21);
2114 tz = _mm_mul_pd(fscal,dz21);
2116 /* Update vectorial force */
2117 fix2 = _mm_add_pd(fix2,tx);
2118 fiy2 = _mm_add_pd(fiy2,ty);
2119 fiz2 = _mm_add_pd(fiz2,tz);
2121 fjx1 = _mm_add_pd(fjx1,tx);
2122 fjy1 = _mm_add_pd(fjy1,ty);
2123 fjz1 = _mm_add_pd(fjz1,tz);
2127 /**************************
2128 * CALCULATE INTERACTIONS *
2129 **************************/
2131 if (gmx_mm_any_lt(rsq22,rcutoff2))
2134 /* REACTION-FIELD ELECTROSTATICS */
2135 felec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
2137 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
2139 fscal = felec;
2141 fscal = _mm_and_pd(fscal,cutoff_mask);
2143 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2145 /* Calculate temporary vectorial force */
2146 tx = _mm_mul_pd(fscal,dx22);
2147 ty = _mm_mul_pd(fscal,dy22);
2148 tz = _mm_mul_pd(fscal,dz22);
2150 /* Update vectorial force */
2151 fix2 = _mm_add_pd(fix2,tx);
2152 fiy2 = _mm_add_pd(fiy2,ty);
2153 fiz2 = _mm_add_pd(fiz2,tz);
2155 fjx2 = _mm_add_pd(fjx2,tx);
2156 fjy2 = _mm_add_pd(fjy2,ty);
2157 fjz2 = _mm_add_pd(fjz2,tz);
2161 /**************************
2162 * CALCULATE INTERACTIONS *
2163 **************************/
2165 if (gmx_mm_any_lt(rsq23,rcutoff2))
2168 /* REACTION-FIELD ELECTROSTATICS */
2169 felec = _mm_mul_pd(qq23,_mm_sub_pd(_mm_mul_pd(rinv23,rinvsq23),krf2));
2171 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
2173 fscal = felec;
2175 fscal = _mm_and_pd(fscal,cutoff_mask);
2177 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2179 /* Calculate temporary vectorial force */
2180 tx = _mm_mul_pd(fscal,dx23);
2181 ty = _mm_mul_pd(fscal,dy23);
2182 tz = _mm_mul_pd(fscal,dz23);
2184 /* Update vectorial force */
2185 fix2 = _mm_add_pd(fix2,tx);
2186 fiy2 = _mm_add_pd(fiy2,ty);
2187 fiz2 = _mm_add_pd(fiz2,tz);
2189 fjx3 = _mm_add_pd(fjx3,tx);
2190 fjy3 = _mm_add_pd(fjy3,ty);
2191 fjz3 = _mm_add_pd(fjz3,tz);
2195 /**************************
2196 * CALCULATE INTERACTIONS *
2197 **************************/
2199 if (gmx_mm_any_lt(rsq31,rcutoff2))
2202 /* REACTION-FIELD ELECTROSTATICS */
2203 felec = _mm_mul_pd(qq31,_mm_sub_pd(_mm_mul_pd(rinv31,rinvsq31),krf2));
2205 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
2207 fscal = felec;
2209 fscal = _mm_and_pd(fscal,cutoff_mask);
2211 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2213 /* Calculate temporary vectorial force */
2214 tx = _mm_mul_pd(fscal,dx31);
2215 ty = _mm_mul_pd(fscal,dy31);
2216 tz = _mm_mul_pd(fscal,dz31);
2218 /* Update vectorial force */
2219 fix3 = _mm_add_pd(fix3,tx);
2220 fiy3 = _mm_add_pd(fiy3,ty);
2221 fiz3 = _mm_add_pd(fiz3,tz);
2223 fjx1 = _mm_add_pd(fjx1,tx);
2224 fjy1 = _mm_add_pd(fjy1,ty);
2225 fjz1 = _mm_add_pd(fjz1,tz);
2229 /**************************
2230 * CALCULATE INTERACTIONS *
2231 **************************/
2233 if (gmx_mm_any_lt(rsq32,rcutoff2))
2236 /* REACTION-FIELD ELECTROSTATICS */
2237 felec = _mm_mul_pd(qq32,_mm_sub_pd(_mm_mul_pd(rinv32,rinvsq32),krf2));
2239 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
2241 fscal = felec;
2243 fscal = _mm_and_pd(fscal,cutoff_mask);
2245 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2247 /* Calculate temporary vectorial force */
2248 tx = _mm_mul_pd(fscal,dx32);
2249 ty = _mm_mul_pd(fscal,dy32);
2250 tz = _mm_mul_pd(fscal,dz32);
2252 /* Update vectorial force */
2253 fix3 = _mm_add_pd(fix3,tx);
2254 fiy3 = _mm_add_pd(fiy3,ty);
2255 fiz3 = _mm_add_pd(fiz3,tz);
2257 fjx2 = _mm_add_pd(fjx2,tx);
2258 fjy2 = _mm_add_pd(fjy2,ty);
2259 fjz2 = _mm_add_pd(fjz2,tz);
2263 /**************************
2264 * CALCULATE INTERACTIONS *
2265 **************************/
2267 if (gmx_mm_any_lt(rsq33,rcutoff2))
2270 /* REACTION-FIELD ELECTROSTATICS */
2271 felec = _mm_mul_pd(qq33,_mm_sub_pd(_mm_mul_pd(rinv33,rinvsq33),krf2));
2273 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
2275 fscal = felec;
2277 fscal = _mm_and_pd(fscal,cutoff_mask);
2279 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2281 /* Calculate temporary vectorial force */
2282 tx = _mm_mul_pd(fscal,dx33);
2283 ty = _mm_mul_pd(fscal,dy33);
2284 tz = _mm_mul_pd(fscal,dz33);
2286 /* Update vectorial force */
2287 fix3 = _mm_add_pd(fix3,tx);
2288 fiy3 = _mm_add_pd(fiy3,ty);
2289 fiz3 = _mm_add_pd(fiz3,tz);
2291 fjx3 = _mm_add_pd(fjx3,tx);
2292 fjy3 = _mm_add_pd(fjy3,ty);
2293 fjz3 = _mm_add_pd(fjz3,tz);
2297 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2299 /* Inner loop uses 329 flops */
2302 /* End of innermost loop */
2304 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2305 f+i_coord_offset,fshift+i_shift_offset);
2307 /* Increment number of inner iterations */
2308 inneriter += j_index_end - j_index_start;
2310 /* Outer loop uses 24 flops */
2313 /* Increment number of outer iterations */
2314 outeriter += nri;
2316 /* Update outer/inner flops */
2318 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*329);