Removed simple.h from nb_kernel_sse2_XX
[gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_single / nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_sse2_single.c
blob 508a3ea7b7a3c17cc2d581d66f0a890d481d8e5e
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_single kernel generator.
38 #include "gmxpre.h"
40 #include "config.h"
42 #include <math.h>
44 #include "../nb_kernel.h"
45 #include "gromacs/math/vec.h"
46 #include "gromacs/legacyheaders/nrnb.h"
48 #include "gromacs/simd/math_x86_sse2_single.h"
49 #include "kernelutil_x86_sse2_single.h"
52 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sse2_single
53 * Electrostatics interaction: ReactionField (with cutoff)
54 * VdW interaction: LennardJones (with potential switch)
55 * Geometry: Water3-Water3
56 * Calculate force/pot: PotentialAndForce
58 void
59 nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sse2_single
60 (t_nblist * gmx_restrict nlist,
61 rvec * gmx_restrict xx,
62 rvec * gmx_restrict ff,
63 t_forcerec * gmx_restrict fr,
64 t_mdatoms * gmx_restrict mdatoms,
65 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
66 t_nrnb * gmx_restrict nrnb)
68 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
69 * just 0 for non-waters.
70 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
71 * jnr indices corresponding to data put in the four positions in the SIMD register.
73 int i_shift_offset,i_coord_offset,outeriter,inneriter;
74 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
75 int jnrA,jnrB,jnrC,jnrD;
76 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
77 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
78 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
79 real rcutoff_scalar;
80 real *shiftvec,*fshift,*x,*f;
81 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82 real scratch[4*DIM];
83 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
84 int vdwioffset0;
85 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86 int vdwioffset1;
87 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88 int vdwioffset2;
89 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
90 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
91 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
92 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
93 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
94 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
95 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
96 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
97 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
98 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
99 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
100 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
101 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
102 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
103 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
104 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
105 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
106 real *charge;
107 int nvdwtype;
108 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
109 int *vdwtype;
110 real *vdwparam;
111 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
112 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
113 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
114 real rswitch_scalar,d_scalar;
115 __m128 dummy_mask,cutoff_mask;
116 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
117 __m128 one = _mm_set1_ps(1.0);
118 __m128 two = _mm_set1_ps(2.0);
119 x = xx[0];
120 f = ff[0];
122 nri = nlist->nri;
123 iinr = nlist->iinr;
124 jindex = nlist->jindex;
125 jjnr = nlist->jjnr;
126 shiftidx = nlist->shift;
127 gid = nlist->gid;
128 shiftvec = fr->shift_vec[0];
129 fshift = fr->fshift[0];
130 facel = _mm_set1_ps(fr->epsfac);
131 charge = mdatoms->chargeA;
132 krf = _mm_set1_ps(fr->ic->k_rf);
133 krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
134 crf = _mm_set1_ps(fr->ic->c_rf);
135 nvdwtype = fr->ntype;
136 vdwparam = fr->nbfp;
137 vdwtype = mdatoms->typeA;
139 /* Setup water-specific parameters */
140 inr = nlist->iinr[0];
141 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
142 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
143 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
144 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
146 jq0 = _mm_set1_ps(charge[inr+0]);
147 jq1 = _mm_set1_ps(charge[inr+1]);
148 jq2 = _mm_set1_ps(charge[inr+2]);
149 vdwjidx0A = 2*vdwtype[inr+0];
150 qq00 = _mm_mul_ps(iq0,jq0);
151 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
152 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
153 qq01 = _mm_mul_ps(iq0,jq1);
154 qq02 = _mm_mul_ps(iq0,jq2);
155 qq10 = _mm_mul_ps(iq1,jq0);
156 qq11 = _mm_mul_ps(iq1,jq1);
157 qq12 = _mm_mul_ps(iq1,jq2);
158 qq20 = _mm_mul_ps(iq2,jq0);
159 qq21 = _mm_mul_ps(iq2,jq1);
160 qq22 = _mm_mul_ps(iq2,jq2);
162 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
163 rcutoff_scalar = fr->rcoulomb;
164 rcutoff = _mm_set1_ps(rcutoff_scalar);
165 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
167 rswitch_scalar = fr->rvdw_switch;
168 rswitch = _mm_set1_ps(rswitch_scalar);
169 /* Setup switch parameters */
170 d_scalar = rcutoff_scalar-rswitch_scalar;
171 d = _mm_set1_ps(d_scalar);
172 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
173 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
174 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
175 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
176 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
177 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
179 /* Avoid stupid compiler warnings */
180 jnrA = jnrB = jnrC = jnrD = 0;
181 j_coord_offsetA = 0;
182 j_coord_offsetB = 0;
183 j_coord_offsetC = 0;
184 j_coord_offsetD = 0;
186 outeriter = 0;
187 inneriter = 0;
189 for(iidx=0;iidx<4*DIM;iidx++)
191 scratch[iidx] = 0.0;
194 /* Start outer loop over neighborlists */
195 for(iidx=0; iidx<nri; iidx++)
197 /* Load shift vector for this list */
198 i_shift_offset = DIM*shiftidx[iidx];
200 /* Load limits for loop over neighbors */
201 j_index_start = jindex[iidx];
202 j_index_end = jindex[iidx+1];
204 /* Get outer coordinate index */
205 inr = iinr[iidx];
206 i_coord_offset = DIM*inr;
208 /* Load i particle coords and add shift vector */
209 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
210 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
212 fix0 = _mm_setzero_ps();
213 fiy0 = _mm_setzero_ps();
214 fiz0 = _mm_setzero_ps();
215 fix1 = _mm_setzero_ps();
216 fiy1 = _mm_setzero_ps();
217 fiz1 = _mm_setzero_ps();
218 fix2 = _mm_setzero_ps();
219 fiy2 = _mm_setzero_ps();
220 fiz2 = _mm_setzero_ps();
222 /* Reset potential sums */
223 velecsum = _mm_setzero_ps();
224 vvdwsum = _mm_setzero_ps();
226 /* Start inner kernel loop */
227 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
230 /* Get j neighbor index, and coordinate index */
231 jnrA = jjnr[jidx];
232 jnrB = jjnr[jidx+1];
233 jnrC = jjnr[jidx+2];
234 jnrD = jjnr[jidx+3];
235 j_coord_offsetA = DIM*jnrA;
236 j_coord_offsetB = DIM*jnrB;
237 j_coord_offsetC = DIM*jnrC;
238 j_coord_offsetD = DIM*jnrD;
240 /* load j atom coordinates */
241 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
242 x+j_coord_offsetC,x+j_coord_offsetD,
243 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
245 /* Calculate displacement vector */
246 dx00 = _mm_sub_ps(ix0,jx0);
247 dy00 = _mm_sub_ps(iy0,jy0);
248 dz00 = _mm_sub_ps(iz0,jz0);
249 dx01 = _mm_sub_ps(ix0,jx1);
250 dy01 = _mm_sub_ps(iy0,jy1);
251 dz01 = _mm_sub_ps(iz0,jz1);
252 dx02 = _mm_sub_ps(ix0,jx2);
253 dy02 = _mm_sub_ps(iy0,jy2);
254 dz02 = _mm_sub_ps(iz0,jz2);
255 dx10 = _mm_sub_ps(ix1,jx0);
256 dy10 = _mm_sub_ps(iy1,jy0);
257 dz10 = _mm_sub_ps(iz1,jz0);
258 dx11 = _mm_sub_ps(ix1,jx1);
259 dy11 = _mm_sub_ps(iy1,jy1);
260 dz11 = _mm_sub_ps(iz1,jz1);
261 dx12 = _mm_sub_ps(ix1,jx2);
262 dy12 = _mm_sub_ps(iy1,jy2);
263 dz12 = _mm_sub_ps(iz1,jz2);
264 dx20 = _mm_sub_ps(ix2,jx0);
265 dy20 = _mm_sub_ps(iy2,jy0);
266 dz20 = _mm_sub_ps(iz2,jz0);
267 dx21 = _mm_sub_ps(ix2,jx1);
268 dy21 = _mm_sub_ps(iy2,jy1);
269 dz21 = _mm_sub_ps(iz2,jz1);
270 dx22 = _mm_sub_ps(ix2,jx2);
271 dy22 = _mm_sub_ps(iy2,jy2);
272 dz22 = _mm_sub_ps(iz2,jz2);
274 /* Calculate squared distance and things based on it */
275 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
276 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
277 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
278 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
279 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
280 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
281 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
282 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
283 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
285 rinv00 = gmx_mm_invsqrt_ps(rsq00);
286 rinv01 = gmx_mm_invsqrt_ps(rsq01);
287 rinv02 = gmx_mm_invsqrt_ps(rsq02);
288 rinv10 = gmx_mm_invsqrt_ps(rsq10);
289 rinv11 = gmx_mm_invsqrt_ps(rsq11);
290 rinv12 = gmx_mm_invsqrt_ps(rsq12);
291 rinv20 = gmx_mm_invsqrt_ps(rsq20);
292 rinv21 = gmx_mm_invsqrt_ps(rsq21);
293 rinv22 = gmx_mm_invsqrt_ps(rsq22);
295 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
296 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
297 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
298 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
299 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
300 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
301 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
302 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
303 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
305 fjx0 = _mm_setzero_ps();
306 fjy0 = _mm_setzero_ps();
307 fjz0 = _mm_setzero_ps();
308 fjx1 = _mm_setzero_ps();
309 fjy1 = _mm_setzero_ps();
310 fjz1 = _mm_setzero_ps();
311 fjx2 = _mm_setzero_ps();
312 fjy2 = _mm_setzero_ps();
313 fjz2 = _mm_setzero_ps();
315 /**************************
316 * CALCULATE INTERACTIONS *
317 **************************/
319 if (gmx_mm_any_lt(rsq00,rcutoff2))
322 r00 = _mm_mul_ps(rsq00,rinv00);
324 /* REACTION-FIELD ELECTROSTATICS */
325 velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
326 felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
328 /* LENNARD-JONES DISPERSION/REPULSION */
330 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
331 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
332 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
333 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
334 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
336 d = _mm_sub_ps(r00,rswitch);
337 d = _mm_max_ps(d,_mm_setzero_ps());
338 d2 = _mm_mul_ps(d,d);
339 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
341 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
343 /* Evaluate switch function */
344 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
345 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
346 vvdw = _mm_mul_ps(vvdw,sw);
347 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
349 /* Update potential sum for this i atom from the interaction with this j atom. */
350 velec = _mm_and_ps(velec,cutoff_mask);
351 velecsum = _mm_add_ps(velecsum,velec);
352 vvdw = _mm_and_ps(vvdw,cutoff_mask);
353 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
355 fscal = _mm_add_ps(felec,fvdw);
357 fscal = _mm_and_ps(fscal,cutoff_mask);
359 /* Calculate temporary vectorial force */
360 tx = _mm_mul_ps(fscal,dx00);
361 ty = _mm_mul_ps(fscal,dy00);
362 tz = _mm_mul_ps(fscal,dz00);
364 /* Update vectorial force */
365 fix0 = _mm_add_ps(fix0,tx);
366 fiy0 = _mm_add_ps(fiy0,ty);
367 fiz0 = _mm_add_ps(fiz0,tz);
369 fjx0 = _mm_add_ps(fjx0,tx);
370 fjy0 = _mm_add_ps(fjy0,ty);
371 fjz0 = _mm_add_ps(fjz0,tz);
375 /**************************
376 * CALCULATE INTERACTIONS *
377 **************************/
379 if (gmx_mm_any_lt(rsq01,rcutoff2))
382 /* REACTION-FIELD ELECTROSTATICS */
383 velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
384 felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
386 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
388 /* Update potential sum for this i atom from the interaction with this j atom. */
389 velec = _mm_and_ps(velec,cutoff_mask);
390 velecsum = _mm_add_ps(velecsum,velec);
392 fscal = felec;
394 fscal = _mm_and_ps(fscal,cutoff_mask);
396 /* Calculate temporary vectorial force */
397 tx = _mm_mul_ps(fscal,dx01);
398 ty = _mm_mul_ps(fscal,dy01);
399 tz = _mm_mul_ps(fscal,dz01);
401 /* Update vectorial force */
402 fix0 = _mm_add_ps(fix0,tx);
403 fiy0 = _mm_add_ps(fiy0,ty);
404 fiz0 = _mm_add_ps(fiz0,tz);
406 fjx1 = _mm_add_ps(fjx1,tx);
407 fjy1 = _mm_add_ps(fjy1,ty);
408 fjz1 = _mm_add_ps(fjz1,tz);
412 /**************************
413 * CALCULATE INTERACTIONS *
414 **************************/
416 if (gmx_mm_any_lt(rsq02,rcutoff2))
419 /* REACTION-FIELD ELECTROSTATICS */
420 velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
421 felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
423 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
425 /* Update potential sum for this i atom from the interaction with this j atom. */
426 velec = _mm_and_ps(velec,cutoff_mask);
427 velecsum = _mm_add_ps(velecsum,velec);
429 fscal = felec;
431 fscal = _mm_and_ps(fscal,cutoff_mask);
433 /* Calculate temporary vectorial force */
434 tx = _mm_mul_ps(fscal,dx02);
435 ty = _mm_mul_ps(fscal,dy02);
436 tz = _mm_mul_ps(fscal,dz02);
438 /* Update vectorial force */
439 fix0 = _mm_add_ps(fix0,tx);
440 fiy0 = _mm_add_ps(fiy0,ty);
441 fiz0 = _mm_add_ps(fiz0,tz);
443 fjx2 = _mm_add_ps(fjx2,tx);
444 fjy2 = _mm_add_ps(fjy2,ty);
445 fjz2 = _mm_add_ps(fjz2,tz);
449 /**************************
450 * CALCULATE INTERACTIONS *
451 **************************/
453 if (gmx_mm_any_lt(rsq10,rcutoff2))
456 /* REACTION-FIELD ELECTROSTATICS */
457 velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
458 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
460 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
462 /* Update potential sum for this i atom from the interaction with this j atom. */
463 velec = _mm_and_ps(velec,cutoff_mask);
464 velecsum = _mm_add_ps(velecsum,velec);
466 fscal = felec;
468 fscal = _mm_and_ps(fscal,cutoff_mask);
470 /* Calculate temporary vectorial force */
471 tx = _mm_mul_ps(fscal,dx10);
472 ty = _mm_mul_ps(fscal,dy10);
473 tz = _mm_mul_ps(fscal,dz10);
475 /* Update vectorial force */
476 fix1 = _mm_add_ps(fix1,tx);
477 fiy1 = _mm_add_ps(fiy1,ty);
478 fiz1 = _mm_add_ps(fiz1,tz);
480 fjx0 = _mm_add_ps(fjx0,tx);
481 fjy0 = _mm_add_ps(fjy0,ty);
482 fjz0 = _mm_add_ps(fjz0,tz);
486 /**************************
487 * CALCULATE INTERACTIONS *
488 **************************/
490 if (gmx_mm_any_lt(rsq11,rcutoff2))
493 /* REACTION-FIELD ELECTROSTATICS */
494 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
495 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
497 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
499 /* Update potential sum for this i atom from the interaction with this j atom. */
500 velec = _mm_and_ps(velec,cutoff_mask);
501 velecsum = _mm_add_ps(velecsum,velec);
503 fscal = felec;
505 fscal = _mm_and_ps(fscal,cutoff_mask);
507 /* Calculate temporary vectorial force */
508 tx = _mm_mul_ps(fscal,dx11);
509 ty = _mm_mul_ps(fscal,dy11);
510 tz = _mm_mul_ps(fscal,dz11);
512 /* Update vectorial force */
513 fix1 = _mm_add_ps(fix1,tx);
514 fiy1 = _mm_add_ps(fiy1,ty);
515 fiz1 = _mm_add_ps(fiz1,tz);
517 fjx1 = _mm_add_ps(fjx1,tx);
518 fjy1 = _mm_add_ps(fjy1,ty);
519 fjz1 = _mm_add_ps(fjz1,tz);
523 /**************************
524 * CALCULATE INTERACTIONS *
525 **************************/
527 if (gmx_mm_any_lt(rsq12,rcutoff2))
530 /* REACTION-FIELD ELECTROSTATICS */
531 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
532 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
534 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
536 /* Update potential sum for this i atom from the interaction with this j atom. */
537 velec = _mm_and_ps(velec,cutoff_mask);
538 velecsum = _mm_add_ps(velecsum,velec);
540 fscal = felec;
542 fscal = _mm_and_ps(fscal,cutoff_mask);
544 /* Calculate temporary vectorial force */
545 tx = _mm_mul_ps(fscal,dx12);
546 ty = _mm_mul_ps(fscal,dy12);
547 tz = _mm_mul_ps(fscal,dz12);
549 /* Update vectorial force */
550 fix1 = _mm_add_ps(fix1,tx);
551 fiy1 = _mm_add_ps(fiy1,ty);
552 fiz1 = _mm_add_ps(fiz1,tz);
554 fjx2 = _mm_add_ps(fjx2,tx);
555 fjy2 = _mm_add_ps(fjy2,ty);
556 fjz2 = _mm_add_ps(fjz2,tz);
560 /**************************
561 * CALCULATE INTERACTIONS *
562 **************************/
564 if (gmx_mm_any_lt(rsq20,rcutoff2))
567 /* REACTION-FIELD ELECTROSTATICS */
568 velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
569 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
571 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
573 /* Update potential sum for this i atom from the interaction with this j atom. */
574 velec = _mm_and_ps(velec,cutoff_mask);
575 velecsum = _mm_add_ps(velecsum,velec);
577 fscal = felec;
579 fscal = _mm_and_ps(fscal,cutoff_mask);
581 /* Calculate temporary vectorial force */
582 tx = _mm_mul_ps(fscal,dx20);
583 ty = _mm_mul_ps(fscal,dy20);
584 tz = _mm_mul_ps(fscal,dz20);
586 /* Update vectorial force */
587 fix2 = _mm_add_ps(fix2,tx);
588 fiy2 = _mm_add_ps(fiy2,ty);
589 fiz2 = _mm_add_ps(fiz2,tz);
591 fjx0 = _mm_add_ps(fjx0,tx);
592 fjy0 = _mm_add_ps(fjy0,ty);
593 fjz0 = _mm_add_ps(fjz0,tz);
597 /**************************
598 * CALCULATE INTERACTIONS *
599 **************************/
601 if (gmx_mm_any_lt(rsq21,rcutoff2))
604 /* REACTION-FIELD ELECTROSTATICS */
605 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
606 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
608 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
610 /* Update potential sum for this i atom from the interaction with this j atom. */
611 velec = _mm_and_ps(velec,cutoff_mask);
612 velecsum = _mm_add_ps(velecsum,velec);
614 fscal = felec;
616 fscal = _mm_and_ps(fscal,cutoff_mask);
618 /* Calculate temporary vectorial force */
619 tx = _mm_mul_ps(fscal,dx21);
620 ty = _mm_mul_ps(fscal,dy21);
621 tz = _mm_mul_ps(fscal,dz21);
623 /* Update vectorial force */
624 fix2 = _mm_add_ps(fix2,tx);
625 fiy2 = _mm_add_ps(fiy2,ty);
626 fiz2 = _mm_add_ps(fiz2,tz);
628 fjx1 = _mm_add_ps(fjx1,tx);
629 fjy1 = _mm_add_ps(fjy1,ty);
630 fjz1 = _mm_add_ps(fjz1,tz);
634 /**************************
635 * CALCULATE INTERACTIONS *
636 **************************/
638 if (gmx_mm_any_lt(rsq22,rcutoff2))
641 /* REACTION-FIELD ELECTROSTATICS */
642 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
643 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
645 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
647 /* Update potential sum for this i atom from the interaction with this j atom. */
648 velec = _mm_and_ps(velec,cutoff_mask);
649 velecsum = _mm_add_ps(velecsum,velec);
651 fscal = felec;
653 fscal = _mm_and_ps(fscal,cutoff_mask);
655 /* Calculate temporary vectorial force */
656 tx = _mm_mul_ps(fscal,dx22);
657 ty = _mm_mul_ps(fscal,dy22);
658 tz = _mm_mul_ps(fscal,dz22);
660 /* Update vectorial force */
661 fix2 = _mm_add_ps(fix2,tx);
662 fiy2 = _mm_add_ps(fiy2,ty);
663 fiz2 = _mm_add_ps(fiz2,tz);
665 fjx2 = _mm_add_ps(fjx2,tx);
666 fjy2 = _mm_add_ps(fjy2,ty);
667 fjz2 = _mm_add_ps(fjz2,tz);
671 fjptrA = f+j_coord_offsetA;
672 fjptrB = f+j_coord_offsetB;
673 fjptrC = f+j_coord_offsetC;
674 fjptrD = f+j_coord_offsetD;
676 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
677 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
679 /* Inner loop uses 358 flops */
682 if(jidx<j_index_end)
685 /* Get j neighbor index, and coordinate index */
686 jnrlistA = jjnr[jidx];
687 jnrlistB = jjnr[jidx+1];
688 jnrlistC = jjnr[jidx+2];
689 jnrlistD = jjnr[jidx+3];
690 /* Sign of each element will be negative for non-real atoms.
691 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
692 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
694 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
695 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
696 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
697 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
698 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
699 j_coord_offsetA = DIM*jnrA;
700 j_coord_offsetB = DIM*jnrB;
701 j_coord_offsetC = DIM*jnrC;
702 j_coord_offsetD = DIM*jnrD;
704 /* load j atom coordinates */
705 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
706 x+j_coord_offsetC,x+j_coord_offsetD,
707 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
709 /* Calculate displacement vector */
710 dx00 = _mm_sub_ps(ix0,jx0);
711 dy00 = _mm_sub_ps(iy0,jy0);
712 dz00 = _mm_sub_ps(iz0,jz0);
713 dx01 = _mm_sub_ps(ix0,jx1);
714 dy01 = _mm_sub_ps(iy0,jy1);
715 dz01 = _mm_sub_ps(iz0,jz1);
716 dx02 = _mm_sub_ps(ix0,jx2);
717 dy02 = _mm_sub_ps(iy0,jy2);
718 dz02 = _mm_sub_ps(iz0,jz2);
719 dx10 = _mm_sub_ps(ix1,jx0);
720 dy10 = _mm_sub_ps(iy1,jy0);
721 dz10 = _mm_sub_ps(iz1,jz0);
722 dx11 = _mm_sub_ps(ix1,jx1);
723 dy11 = _mm_sub_ps(iy1,jy1);
724 dz11 = _mm_sub_ps(iz1,jz1);
725 dx12 = _mm_sub_ps(ix1,jx2);
726 dy12 = _mm_sub_ps(iy1,jy2);
727 dz12 = _mm_sub_ps(iz1,jz2);
728 dx20 = _mm_sub_ps(ix2,jx0);
729 dy20 = _mm_sub_ps(iy2,jy0);
730 dz20 = _mm_sub_ps(iz2,jz0);
731 dx21 = _mm_sub_ps(ix2,jx1);
732 dy21 = _mm_sub_ps(iy2,jy1);
733 dz21 = _mm_sub_ps(iz2,jz1);
734 dx22 = _mm_sub_ps(ix2,jx2);
735 dy22 = _mm_sub_ps(iy2,jy2);
736 dz22 = _mm_sub_ps(iz2,jz2);
738 /* Calculate squared distance and things based on it */
739 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
740 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
741 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
742 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
743 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
744 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
745 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
746 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
747 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
749 rinv00 = gmx_mm_invsqrt_ps(rsq00);
750 rinv01 = gmx_mm_invsqrt_ps(rsq01);
751 rinv02 = gmx_mm_invsqrt_ps(rsq02);
752 rinv10 = gmx_mm_invsqrt_ps(rsq10);
753 rinv11 = gmx_mm_invsqrt_ps(rsq11);
754 rinv12 = gmx_mm_invsqrt_ps(rsq12);
755 rinv20 = gmx_mm_invsqrt_ps(rsq20);
756 rinv21 = gmx_mm_invsqrt_ps(rsq21);
757 rinv22 = gmx_mm_invsqrt_ps(rsq22);
759 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
760 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
761 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
762 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
763 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
764 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
765 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
766 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
767 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
769 fjx0 = _mm_setzero_ps();
770 fjy0 = _mm_setzero_ps();
771 fjz0 = _mm_setzero_ps();
772 fjx1 = _mm_setzero_ps();
773 fjy1 = _mm_setzero_ps();
774 fjz1 = _mm_setzero_ps();
775 fjx2 = _mm_setzero_ps();
776 fjy2 = _mm_setzero_ps();
777 fjz2 = _mm_setzero_ps();
779 /**************************
780 * CALCULATE INTERACTIONS *
781 **************************/
783 if (gmx_mm_any_lt(rsq00,rcutoff2))
786 r00 = _mm_mul_ps(rsq00,rinv00);
787 r00 = _mm_andnot_ps(dummy_mask,r00);
789 /* REACTION-FIELD ELECTROSTATICS */
790 velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_add_ps(rinv00,_mm_mul_ps(krf,rsq00)),crf));
791 felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
793 /* LENNARD-JONES DISPERSION/REPULSION */
795 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
796 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
797 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
798 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
799 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
801 d = _mm_sub_ps(r00,rswitch);
802 d = _mm_max_ps(d,_mm_setzero_ps());
803 d2 = _mm_mul_ps(d,d);
804 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
806 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
808 /* Evaluate switch function */
809 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
810 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
811 vvdw = _mm_mul_ps(vvdw,sw);
812 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
814 /* Update potential sum for this i atom from the interaction with this j atom. */
815 velec = _mm_and_ps(velec,cutoff_mask);
816 velec = _mm_andnot_ps(dummy_mask,velec);
817 velecsum = _mm_add_ps(velecsum,velec);
818 vvdw = _mm_and_ps(vvdw,cutoff_mask);
819 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
820 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
822 fscal = _mm_add_ps(felec,fvdw);
824 fscal = _mm_and_ps(fscal,cutoff_mask);
826 fscal = _mm_andnot_ps(dummy_mask,fscal);
828 /* Calculate temporary vectorial force */
829 tx = _mm_mul_ps(fscal,dx00);
830 ty = _mm_mul_ps(fscal,dy00);
831 tz = _mm_mul_ps(fscal,dz00);
833 /* Update vectorial force */
834 fix0 = _mm_add_ps(fix0,tx);
835 fiy0 = _mm_add_ps(fiy0,ty);
836 fiz0 = _mm_add_ps(fiz0,tz);
838 fjx0 = _mm_add_ps(fjx0,tx);
839 fjy0 = _mm_add_ps(fjy0,ty);
840 fjz0 = _mm_add_ps(fjz0,tz);
844 /**************************
845 * CALCULATE INTERACTIONS *
846 **************************/
848 if (gmx_mm_any_lt(rsq01,rcutoff2))
851 /* REACTION-FIELD ELECTROSTATICS */
852 velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_add_ps(rinv01,_mm_mul_ps(krf,rsq01)),crf));
853 felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
855 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
857 /* Update potential sum for this i atom from the interaction with this j atom. */
858 velec = _mm_and_ps(velec,cutoff_mask);
859 velec = _mm_andnot_ps(dummy_mask,velec);
860 velecsum = _mm_add_ps(velecsum,velec);
862 fscal = felec;
864 fscal = _mm_and_ps(fscal,cutoff_mask);
866 fscal = _mm_andnot_ps(dummy_mask,fscal);
868 /* Calculate temporary vectorial force */
869 tx = _mm_mul_ps(fscal,dx01);
870 ty = _mm_mul_ps(fscal,dy01);
871 tz = _mm_mul_ps(fscal,dz01);
873 /* Update vectorial force */
874 fix0 = _mm_add_ps(fix0,tx);
875 fiy0 = _mm_add_ps(fiy0,ty);
876 fiz0 = _mm_add_ps(fiz0,tz);
878 fjx1 = _mm_add_ps(fjx1,tx);
879 fjy1 = _mm_add_ps(fjy1,ty);
880 fjz1 = _mm_add_ps(fjz1,tz);
884 /**************************
885 * CALCULATE INTERACTIONS *
886 **************************/
888 if (gmx_mm_any_lt(rsq02,rcutoff2))
891 /* REACTION-FIELD ELECTROSTATICS */
892 velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_add_ps(rinv02,_mm_mul_ps(krf,rsq02)),crf));
893 felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
895 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
897 /* Update potential sum for this i atom from the interaction with this j atom. */
898 velec = _mm_and_ps(velec,cutoff_mask);
899 velec = _mm_andnot_ps(dummy_mask,velec);
900 velecsum = _mm_add_ps(velecsum,velec);
902 fscal = felec;
904 fscal = _mm_and_ps(fscal,cutoff_mask);
906 fscal = _mm_andnot_ps(dummy_mask,fscal);
908 /* Calculate temporary vectorial force */
909 tx = _mm_mul_ps(fscal,dx02);
910 ty = _mm_mul_ps(fscal,dy02);
911 tz = _mm_mul_ps(fscal,dz02);
913 /* Update vectorial force */
914 fix0 = _mm_add_ps(fix0,tx);
915 fiy0 = _mm_add_ps(fiy0,ty);
916 fiz0 = _mm_add_ps(fiz0,tz);
918 fjx2 = _mm_add_ps(fjx2,tx);
919 fjy2 = _mm_add_ps(fjy2,ty);
920 fjz2 = _mm_add_ps(fjz2,tz);
924 /**************************
925 * CALCULATE INTERACTIONS *
926 **************************/
928 if (gmx_mm_any_lt(rsq10,rcutoff2))
931 /* REACTION-FIELD ELECTROSTATICS */
932 velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_add_ps(rinv10,_mm_mul_ps(krf,rsq10)),crf));
933 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
935 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
937 /* Update potential sum for this i atom from the interaction with this j atom. */
938 velec = _mm_and_ps(velec,cutoff_mask);
939 velec = _mm_andnot_ps(dummy_mask,velec);
940 velecsum = _mm_add_ps(velecsum,velec);
942 fscal = felec;
944 fscal = _mm_and_ps(fscal,cutoff_mask);
946 fscal = _mm_andnot_ps(dummy_mask,fscal);
948 /* Calculate temporary vectorial force */
949 tx = _mm_mul_ps(fscal,dx10);
950 ty = _mm_mul_ps(fscal,dy10);
951 tz = _mm_mul_ps(fscal,dz10);
953 /* Update vectorial force */
954 fix1 = _mm_add_ps(fix1,tx);
955 fiy1 = _mm_add_ps(fiy1,ty);
956 fiz1 = _mm_add_ps(fiz1,tz);
958 fjx0 = _mm_add_ps(fjx0,tx);
959 fjy0 = _mm_add_ps(fjy0,ty);
960 fjz0 = _mm_add_ps(fjz0,tz);
964 /**************************
965 * CALCULATE INTERACTIONS *
966 **************************/
968 if (gmx_mm_any_lt(rsq11,rcutoff2))
971 /* REACTION-FIELD ELECTROSTATICS */
972 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
973 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
975 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
977 /* Update potential sum for this i atom from the interaction with this j atom. */
978 velec = _mm_and_ps(velec,cutoff_mask);
979 velec = _mm_andnot_ps(dummy_mask,velec);
980 velecsum = _mm_add_ps(velecsum,velec);
982 fscal = felec;
984 fscal = _mm_and_ps(fscal,cutoff_mask);
986 fscal = _mm_andnot_ps(dummy_mask,fscal);
988 /* Calculate temporary vectorial force */
989 tx = _mm_mul_ps(fscal,dx11);
990 ty = _mm_mul_ps(fscal,dy11);
991 tz = _mm_mul_ps(fscal,dz11);
993 /* Update vectorial force */
994 fix1 = _mm_add_ps(fix1,tx);
995 fiy1 = _mm_add_ps(fiy1,ty);
996 fiz1 = _mm_add_ps(fiz1,tz);
998 fjx1 = _mm_add_ps(fjx1,tx);
999 fjy1 = _mm_add_ps(fjy1,ty);
1000 fjz1 = _mm_add_ps(fjz1,tz);
1004 /**************************
1005 * CALCULATE INTERACTIONS *
1006 **************************/
1008 if (gmx_mm_any_lt(rsq12,rcutoff2))
1011 /* REACTION-FIELD ELECTROSTATICS */
1012 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
1013 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1015 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1017 /* Update potential sum for this i atom from the interaction with this j atom. */
1018 velec = _mm_and_ps(velec,cutoff_mask);
1019 velec = _mm_andnot_ps(dummy_mask,velec);
1020 velecsum = _mm_add_ps(velecsum,velec);
1022 fscal = felec;
1024 fscal = _mm_and_ps(fscal,cutoff_mask);
1026 fscal = _mm_andnot_ps(dummy_mask,fscal);
1028 /* Calculate temporary vectorial force */
1029 tx = _mm_mul_ps(fscal,dx12);
1030 ty = _mm_mul_ps(fscal,dy12);
1031 tz = _mm_mul_ps(fscal,dz12);
1033 /* Update vectorial force */
1034 fix1 = _mm_add_ps(fix1,tx);
1035 fiy1 = _mm_add_ps(fiy1,ty);
1036 fiz1 = _mm_add_ps(fiz1,tz);
1038 fjx2 = _mm_add_ps(fjx2,tx);
1039 fjy2 = _mm_add_ps(fjy2,ty);
1040 fjz2 = _mm_add_ps(fjz2,tz);
1044 /**************************
1045 * CALCULATE INTERACTIONS *
1046 **************************/
1048 if (gmx_mm_any_lt(rsq20,rcutoff2))
1051 /* REACTION-FIELD ELECTROSTATICS */
1052 velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_add_ps(rinv20,_mm_mul_ps(krf,rsq20)),crf));
1053 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
1055 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1057 /* Update potential sum for this i atom from the interaction with this j atom. */
1058 velec = _mm_and_ps(velec,cutoff_mask);
1059 velec = _mm_andnot_ps(dummy_mask,velec);
1060 velecsum = _mm_add_ps(velecsum,velec);
1062 fscal = felec;
1064 fscal = _mm_and_ps(fscal,cutoff_mask);
1066 fscal = _mm_andnot_ps(dummy_mask,fscal);
1068 /* Calculate temporary vectorial force */
1069 tx = _mm_mul_ps(fscal,dx20);
1070 ty = _mm_mul_ps(fscal,dy20);
1071 tz = _mm_mul_ps(fscal,dz20);
1073 /* Update vectorial force */
1074 fix2 = _mm_add_ps(fix2,tx);
1075 fiy2 = _mm_add_ps(fiy2,ty);
1076 fiz2 = _mm_add_ps(fiz2,tz);
1078 fjx0 = _mm_add_ps(fjx0,tx);
1079 fjy0 = _mm_add_ps(fjy0,ty);
1080 fjz0 = _mm_add_ps(fjz0,tz);
1084 /**************************
1085 * CALCULATE INTERACTIONS *
1086 **************************/
1088 if (gmx_mm_any_lt(rsq21,rcutoff2))
1091 /* REACTION-FIELD ELECTROSTATICS */
1092 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
1093 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1095 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1097 /* Update potential sum for this i atom from the interaction with this j atom. */
1098 velec = _mm_and_ps(velec,cutoff_mask);
1099 velec = _mm_andnot_ps(dummy_mask,velec);
1100 velecsum = _mm_add_ps(velecsum,velec);
1102 fscal = felec;
1104 fscal = _mm_and_ps(fscal,cutoff_mask);
1106 fscal = _mm_andnot_ps(dummy_mask,fscal);
1108 /* Calculate temporary vectorial force */
1109 tx = _mm_mul_ps(fscal,dx21);
1110 ty = _mm_mul_ps(fscal,dy21);
1111 tz = _mm_mul_ps(fscal,dz21);
1113 /* Update vectorial force */
1114 fix2 = _mm_add_ps(fix2,tx);
1115 fiy2 = _mm_add_ps(fiy2,ty);
1116 fiz2 = _mm_add_ps(fiz2,tz);
1118 fjx1 = _mm_add_ps(fjx1,tx);
1119 fjy1 = _mm_add_ps(fjy1,ty);
1120 fjz1 = _mm_add_ps(fjz1,tz);
1124 /**************************
1125 * CALCULATE INTERACTIONS *
1126 **************************/
1128 if (gmx_mm_any_lt(rsq22,rcutoff2))
1131 /* REACTION-FIELD ELECTROSTATICS */
1132 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
1133 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1135 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1137 /* Update potential sum for this i atom from the interaction with this j atom. */
1138 velec = _mm_and_ps(velec,cutoff_mask);
1139 velec = _mm_andnot_ps(dummy_mask,velec);
1140 velecsum = _mm_add_ps(velecsum,velec);
1142 fscal = felec;
1144 fscal = _mm_and_ps(fscal,cutoff_mask);
1146 fscal = _mm_andnot_ps(dummy_mask,fscal);
1148 /* Calculate temporary vectorial force */
1149 tx = _mm_mul_ps(fscal,dx22);
1150 ty = _mm_mul_ps(fscal,dy22);
1151 tz = _mm_mul_ps(fscal,dz22);
1153 /* Update vectorial force */
1154 fix2 = _mm_add_ps(fix2,tx);
1155 fiy2 = _mm_add_ps(fiy2,ty);
1156 fiz2 = _mm_add_ps(fiz2,tz);
1158 fjx2 = _mm_add_ps(fjx2,tx);
1159 fjy2 = _mm_add_ps(fjy2,ty);
1160 fjz2 = _mm_add_ps(fjz2,tz);
1164 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1165 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1166 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1167 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1169 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1170 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1172 /* Inner loop uses 359 flops */
1175 /* End of innermost loop */
1177 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1178 f+i_coord_offset,fshift+i_shift_offset);
1180 ggid = gid[iidx];
1181 /* Update potential energies */
1182 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1183 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1185 /* Increment number of inner iterations */
1186 inneriter += j_index_end - j_index_start;
1188 /* Outer loop uses 20 flops */
1191 /* Increment number of outer iterations */
1192 outeriter += nri;
1194 /* Update outer/inner flops */
1196 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*359);
1199 * GROMACS nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sse2_single
1200 * Electrostatics interaction: ReactionField
1201 * VdW interaction: LennardJones
1202 * Geometry: Water3-Water3
1203 * Calculate force/pot: Force
1205 void
1206 nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sse2_single
1207 (t_nblist * gmx_restrict nlist,
1208 rvec * gmx_restrict xx,
1209 rvec * gmx_restrict ff,
1210 t_forcerec * gmx_restrict fr,
1211 t_mdatoms * gmx_restrict mdatoms,
1212 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1213 t_nrnb * gmx_restrict nrnb)
1215 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1216 * just 0 for non-waters.
1217 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1218 * jnr indices corresponding to data put in the four positions in the SIMD register.
1220 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1221 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1222 int jnrA,jnrB,jnrC,jnrD;
1223 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1224 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1225 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1226 real rcutoff_scalar;
1227 real *shiftvec,*fshift,*x,*f;
1228 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1229 real scratch[4*DIM];
1230 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1231 int vdwioffset0;
1232 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1233 int vdwioffset1;
1234 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1235 int vdwioffset2;
1236 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1237 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1238 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1239 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1240 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1241 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1242 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1243 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1244 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1245 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1246 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1247 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1248 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1249 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1250 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1251 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1252 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1253 real *charge;
1254 int nvdwtype;
1255 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1256 int *vdwtype;
1257 real *vdwparam;
1258 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1259 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1260 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1261 real rswitch_scalar,d_scalar;
1262 __m128 dummy_mask,cutoff_mask;
1263 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1264 __m128 one = _mm_set1_ps(1.0);
1265 __m128 two = _mm_set1_ps(2.0);
1266 x = xx[0];
1267 f = ff[0];
1269 nri = nlist->nri;
1270 iinr = nlist->iinr;
1271 jindex = nlist->jindex;
1272 jjnr = nlist->jjnr;
1273 shiftidx = nlist->shift;
1274 gid = nlist->gid;
1275 shiftvec = fr->shift_vec[0];
1276 fshift = fr->fshift[0];
1277 facel = _mm_set1_ps(fr->epsfac);
1278 charge = mdatoms->chargeA;
1279 krf = _mm_set1_ps(fr->ic->k_rf);
1280 krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
1281 crf = _mm_set1_ps(fr->ic->c_rf);
1282 nvdwtype = fr->ntype;
1283 vdwparam = fr->nbfp;
1284 vdwtype = mdatoms->typeA;
1286 /* Setup water-specific parameters */
1287 inr = nlist->iinr[0];
1288 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1289 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1290 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1291 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1293 jq0 = _mm_set1_ps(charge[inr+0]);
1294 jq1 = _mm_set1_ps(charge[inr+1]);
1295 jq2 = _mm_set1_ps(charge[inr+2]);
1296 vdwjidx0A = 2*vdwtype[inr+0];
1297 qq00 = _mm_mul_ps(iq0,jq0);
1298 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1299 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1300 qq01 = _mm_mul_ps(iq0,jq1);
1301 qq02 = _mm_mul_ps(iq0,jq2);
1302 qq10 = _mm_mul_ps(iq1,jq0);
1303 qq11 = _mm_mul_ps(iq1,jq1);
1304 qq12 = _mm_mul_ps(iq1,jq2);
1305 qq20 = _mm_mul_ps(iq2,jq0);
1306 qq21 = _mm_mul_ps(iq2,jq1);
1307 qq22 = _mm_mul_ps(iq2,jq2);
1309 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1310 rcutoff_scalar = fr->rcoulomb;
1311 rcutoff = _mm_set1_ps(rcutoff_scalar);
1312 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1314 rswitch_scalar = fr->rvdw_switch;
1315 rswitch = _mm_set1_ps(rswitch_scalar);
1316 /* Setup switch parameters */
1317 d_scalar = rcutoff_scalar-rswitch_scalar;
1318 d = _mm_set1_ps(d_scalar);
1319 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1320 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1321 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1322 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1323 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1324 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1326 /* Avoid stupid compiler warnings */
1327 jnrA = jnrB = jnrC = jnrD = 0;
1328 j_coord_offsetA = 0;
1329 j_coord_offsetB = 0;
1330 j_coord_offsetC = 0;
1331 j_coord_offsetD = 0;
1333 outeriter = 0;
1334 inneriter = 0;
1336 for(iidx=0;iidx<4*DIM;iidx++)
1338 scratch[iidx] = 0.0;
1341 /* Start outer loop over neighborlists */
1342 for(iidx=0; iidx<nri; iidx++)
1344 /* Load shift vector for this list */
1345 i_shift_offset = DIM*shiftidx[iidx];
1347 /* Load limits for loop over neighbors */
1348 j_index_start = jindex[iidx];
1349 j_index_end = jindex[iidx+1];
1351 /* Get outer coordinate index */
1352 inr = iinr[iidx];
1353 i_coord_offset = DIM*inr;
1355 /* Load i particle coords and add shift vector */
1356 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1357 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1359 fix0 = _mm_setzero_ps();
1360 fiy0 = _mm_setzero_ps();
1361 fiz0 = _mm_setzero_ps();
1362 fix1 = _mm_setzero_ps();
1363 fiy1 = _mm_setzero_ps();
1364 fiz1 = _mm_setzero_ps();
1365 fix2 = _mm_setzero_ps();
1366 fiy2 = _mm_setzero_ps();
1367 fiz2 = _mm_setzero_ps();
1369 /* Start inner kernel loop */
1370 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1373 /* Get j neighbor index, and coordinate index */
1374 jnrA = jjnr[jidx];
1375 jnrB = jjnr[jidx+1];
1376 jnrC = jjnr[jidx+2];
1377 jnrD = jjnr[jidx+3];
1378 j_coord_offsetA = DIM*jnrA;
1379 j_coord_offsetB = DIM*jnrB;
1380 j_coord_offsetC = DIM*jnrC;
1381 j_coord_offsetD = DIM*jnrD;
1383 /* load j atom coordinates */
1384 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1385 x+j_coord_offsetC,x+j_coord_offsetD,
1386 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1388 /* Calculate displacement vector */
1389 dx00 = _mm_sub_ps(ix0,jx0);
1390 dy00 = _mm_sub_ps(iy0,jy0);
1391 dz00 = _mm_sub_ps(iz0,jz0);
1392 dx01 = _mm_sub_ps(ix0,jx1);
1393 dy01 = _mm_sub_ps(iy0,jy1);
1394 dz01 = _mm_sub_ps(iz0,jz1);
1395 dx02 = _mm_sub_ps(ix0,jx2);
1396 dy02 = _mm_sub_ps(iy0,jy2);
1397 dz02 = _mm_sub_ps(iz0,jz2);
1398 dx10 = _mm_sub_ps(ix1,jx0);
1399 dy10 = _mm_sub_ps(iy1,jy0);
1400 dz10 = _mm_sub_ps(iz1,jz0);
1401 dx11 = _mm_sub_ps(ix1,jx1);
1402 dy11 = _mm_sub_ps(iy1,jy1);
1403 dz11 = _mm_sub_ps(iz1,jz1);
1404 dx12 = _mm_sub_ps(ix1,jx2);
1405 dy12 = _mm_sub_ps(iy1,jy2);
1406 dz12 = _mm_sub_ps(iz1,jz2);
1407 dx20 = _mm_sub_ps(ix2,jx0);
1408 dy20 = _mm_sub_ps(iy2,jy0);
1409 dz20 = _mm_sub_ps(iz2,jz0);
1410 dx21 = _mm_sub_ps(ix2,jx1);
1411 dy21 = _mm_sub_ps(iy2,jy1);
1412 dz21 = _mm_sub_ps(iz2,jz1);
1413 dx22 = _mm_sub_ps(ix2,jx2);
1414 dy22 = _mm_sub_ps(iy2,jy2);
1415 dz22 = _mm_sub_ps(iz2,jz2);
1417 /* Calculate squared distance and things based on it */
1418 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1419 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1420 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1421 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1422 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1423 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1424 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1425 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1426 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1428 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1429 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1430 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1431 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1432 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1433 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1434 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1435 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1436 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1438 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1439 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1440 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1441 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1442 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1443 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1444 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1445 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1446 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1448 fjx0 = _mm_setzero_ps();
1449 fjy0 = _mm_setzero_ps();
1450 fjz0 = _mm_setzero_ps();
1451 fjx1 = _mm_setzero_ps();
1452 fjy1 = _mm_setzero_ps();
1453 fjz1 = _mm_setzero_ps();
1454 fjx2 = _mm_setzero_ps();
1455 fjy2 = _mm_setzero_ps();
1456 fjz2 = _mm_setzero_ps();
1458 /**************************
1459 * CALCULATE INTERACTIONS *
1460 **************************/
1462 if (gmx_mm_any_lt(rsq00,rcutoff2))
1465 r00 = _mm_mul_ps(rsq00,rinv00);
1467 /* REACTION-FIELD ELECTROSTATICS */
1468 felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
1470 /* LENNARD-JONES DISPERSION/REPULSION */
1472 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1473 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
1474 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1475 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1476 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1478 d = _mm_sub_ps(r00,rswitch);
1479 d = _mm_max_ps(d,_mm_setzero_ps());
1480 d2 = _mm_mul_ps(d,d);
1481 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1483 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1485 /* Evaluate switch function */
1486 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1487 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1488 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1490 fscal = _mm_add_ps(felec,fvdw);
1492 fscal = _mm_and_ps(fscal,cutoff_mask);
1494 /* Calculate temporary vectorial force */
1495 tx = _mm_mul_ps(fscal,dx00);
1496 ty = _mm_mul_ps(fscal,dy00);
1497 tz = _mm_mul_ps(fscal,dz00);
1499 /* Update vectorial force */
1500 fix0 = _mm_add_ps(fix0,tx);
1501 fiy0 = _mm_add_ps(fiy0,ty);
1502 fiz0 = _mm_add_ps(fiz0,tz);
1504 fjx0 = _mm_add_ps(fjx0,tx);
1505 fjy0 = _mm_add_ps(fjy0,ty);
1506 fjz0 = _mm_add_ps(fjz0,tz);
1510 /**************************
1511 * CALCULATE INTERACTIONS *
1512 **************************/
1514 if (gmx_mm_any_lt(rsq01,rcutoff2))
1517 /* REACTION-FIELD ELECTROSTATICS */
1518 felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
1520 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
1522 fscal = felec;
1524 fscal = _mm_and_ps(fscal,cutoff_mask);
1526 /* Calculate temporary vectorial force */
1527 tx = _mm_mul_ps(fscal,dx01);
1528 ty = _mm_mul_ps(fscal,dy01);
1529 tz = _mm_mul_ps(fscal,dz01);
1531 /* Update vectorial force */
1532 fix0 = _mm_add_ps(fix0,tx);
1533 fiy0 = _mm_add_ps(fiy0,ty);
1534 fiz0 = _mm_add_ps(fiz0,tz);
1536 fjx1 = _mm_add_ps(fjx1,tx);
1537 fjy1 = _mm_add_ps(fjy1,ty);
1538 fjz1 = _mm_add_ps(fjz1,tz);
1542 /**************************
1543 * CALCULATE INTERACTIONS *
1544 **************************/
1546 if (gmx_mm_any_lt(rsq02,rcutoff2))
1549 /* REACTION-FIELD ELECTROSTATICS */
1550 felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
1552 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
1554 fscal = felec;
1556 fscal = _mm_and_ps(fscal,cutoff_mask);
1558 /* Calculate temporary vectorial force */
1559 tx = _mm_mul_ps(fscal,dx02);
1560 ty = _mm_mul_ps(fscal,dy02);
1561 tz = _mm_mul_ps(fscal,dz02);
1563 /* Update vectorial force */
1564 fix0 = _mm_add_ps(fix0,tx);
1565 fiy0 = _mm_add_ps(fiy0,ty);
1566 fiz0 = _mm_add_ps(fiz0,tz);
1568 fjx2 = _mm_add_ps(fjx2,tx);
1569 fjy2 = _mm_add_ps(fjy2,ty);
1570 fjz2 = _mm_add_ps(fjz2,tz);
1574 /**************************
1575 * CALCULATE INTERACTIONS *
1576 **************************/
1578 if (gmx_mm_any_lt(rsq10,rcutoff2))
1581 /* REACTION-FIELD ELECTROSTATICS */
1582 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
1584 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1586 fscal = felec;
1588 fscal = _mm_and_ps(fscal,cutoff_mask);
1590 /* Calculate temporary vectorial force */
1591 tx = _mm_mul_ps(fscal,dx10);
1592 ty = _mm_mul_ps(fscal,dy10);
1593 tz = _mm_mul_ps(fscal,dz10);
1595 /* Update vectorial force */
1596 fix1 = _mm_add_ps(fix1,tx);
1597 fiy1 = _mm_add_ps(fiy1,ty);
1598 fiz1 = _mm_add_ps(fiz1,tz);
1600 fjx0 = _mm_add_ps(fjx0,tx);
1601 fjy0 = _mm_add_ps(fjy0,ty);
1602 fjz0 = _mm_add_ps(fjz0,tz);
1606 /**************************
1607 * CALCULATE INTERACTIONS *
1608 **************************/
1610 if (gmx_mm_any_lt(rsq11,rcutoff2))
1613 /* REACTION-FIELD ELECTROSTATICS */
1614 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
1616 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1618 fscal = felec;
1620 fscal = _mm_and_ps(fscal,cutoff_mask);
1622 /* Calculate temporary vectorial force */
1623 tx = _mm_mul_ps(fscal,dx11);
1624 ty = _mm_mul_ps(fscal,dy11);
1625 tz = _mm_mul_ps(fscal,dz11);
1627 /* Update vectorial force */
1628 fix1 = _mm_add_ps(fix1,tx);
1629 fiy1 = _mm_add_ps(fiy1,ty);
1630 fiz1 = _mm_add_ps(fiz1,tz);
1632 fjx1 = _mm_add_ps(fjx1,tx);
1633 fjy1 = _mm_add_ps(fjy1,ty);
1634 fjz1 = _mm_add_ps(fjz1,tz);
1638 /**************************
1639 * CALCULATE INTERACTIONS *
1640 **************************/
1642 if (gmx_mm_any_lt(rsq12,rcutoff2))
1645 /* REACTION-FIELD ELECTROSTATICS */
1646 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1648 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1650 fscal = felec;
1652 fscal = _mm_and_ps(fscal,cutoff_mask);
1654 /* Calculate temporary vectorial force */
1655 tx = _mm_mul_ps(fscal,dx12);
1656 ty = _mm_mul_ps(fscal,dy12);
1657 tz = _mm_mul_ps(fscal,dz12);
1659 /* Update vectorial force */
1660 fix1 = _mm_add_ps(fix1,tx);
1661 fiy1 = _mm_add_ps(fiy1,ty);
1662 fiz1 = _mm_add_ps(fiz1,tz);
1664 fjx2 = _mm_add_ps(fjx2,tx);
1665 fjy2 = _mm_add_ps(fjy2,ty);
1666 fjz2 = _mm_add_ps(fjz2,tz);
1670 /**************************
1671 * CALCULATE INTERACTIONS *
1672 **************************/
1674 if (gmx_mm_any_lt(rsq20,rcutoff2))
1677 /* REACTION-FIELD ELECTROSTATICS */
1678 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
1680 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1682 fscal = felec;
1684 fscal = _mm_and_ps(fscal,cutoff_mask);
1686 /* Calculate temporary vectorial force */
1687 tx = _mm_mul_ps(fscal,dx20);
1688 ty = _mm_mul_ps(fscal,dy20);
1689 tz = _mm_mul_ps(fscal,dz20);
1691 /* Update vectorial force */
1692 fix2 = _mm_add_ps(fix2,tx);
1693 fiy2 = _mm_add_ps(fiy2,ty);
1694 fiz2 = _mm_add_ps(fiz2,tz);
1696 fjx0 = _mm_add_ps(fjx0,tx);
1697 fjy0 = _mm_add_ps(fjy0,ty);
1698 fjz0 = _mm_add_ps(fjz0,tz);
1702 /**************************
1703 * CALCULATE INTERACTIONS *
1704 **************************/
1706 if (gmx_mm_any_lt(rsq21,rcutoff2))
1709 /* REACTION-FIELD ELECTROSTATICS */
1710 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1712 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1714 fscal = felec;
1716 fscal = _mm_and_ps(fscal,cutoff_mask);
1718 /* Calculate temporary vectorial force */
1719 tx = _mm_mul_ps(fscal,dx21);
1720 ty = _mm_mul_ps(fscal,dy21);
1721 tz = _mm_mul_ps(fscal,dz21);
1723 /* Update vectorial force */
1724 fix2 = _mm_add_ps(fix2,tx);
1725 fiy2 = _mm_add_ps(fiy2,ty);
1726 fiz2 = _mm_add_ps(fiz2,tz);
1728 fjx1 = _mm_add_ps(fjx1,tx);
1729 fjy1 = _mm_add_ps(fjy1,ty);
1730 fjz1 = _mm_add_ps(fjz1,tz);
1734 /**************************
1735 * CALCULATE INTERACTIONS *
1736 **************************/
1738 if (gmx_mm_any_lt(rsq22,rcutoff2))
1741 /* REACTION-FIELD ELECTROSTATICS */
1742 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1744 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1746 fscal = felec;
1748 fscal = _mm_and_ps(fscal,cutoff_mask);
1750 /* Calculate temporary vectorial force */
1751 tx = _mm_mul_ps(fscal,dx22);
1752 ty = _mm_mul_ps(fscal,dy22);
1753 tz = _mm_mul_ps(fscal,dz22);
1755 /* Update vectorial force */
1756 fix2 = _mm_add_ps(fix2,tx);
1757 fiy2 = _mm_add_ps(fiy2,ty);
1758 fiz2 = _mm_add_ps(fiz2,tz);
1760 fjx2 = _mm_add_ps(fjx2,tx);
1761 fjy2 = _mm_add_ps(fjy2,ty);
1762 fjz2 = _mm_add_ps(fjz2,tz);
1766 fjptrA = f+j_coord_offsetA;
1767 fjptrB = f+j_coord_offsetB;
1768 fjptrC = f+j_coord_offsetC;
1769 fjptrD = f+j_coord_offsetD;
1771 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1772 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1774 /* Inner loop uses 301 flops */
1777 if(jidx<j_index_end)
1780 /* Get j neighbor index, and coordinate index */
1781 jnrlistA = jjnr[jidx];
1782 jnrlistB = jjnr[jidx+1];
1783 jnrlistC = jjnr[jidx+2];
1784 jnrlistD = jjnr[jidx+3];
1785 /* Sign of each element will be negative for non-real atoms.
1786 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1787 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1789 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1790 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1791 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1792 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1793 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1794 j_coord_offsetA = DIM*jnrA;
1795 j_coord_offsetB = DIM*jnrB;
1796 j_coord_offsetC = DIM*jnrC;
1797 j_coord_offsetD = DIM*jnrD;
1799 /* load j atom coordinates */
1800 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1801 x+j_coord_offsetC,x+j_coord_offsetD,
1802 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1804 /* Calculate displacement vector */
1805 dx00 = _mm_sub_ps(ix0,jx0);
1806 dy00 = _mm_sub_ps(iy0,jy0);
1807 dz00 = _mm_sub_ps(iz0,jz0);
1808 dx01 = _mm_sub_ps(ix0,jx1);
1809 dy01 = _mm_sub_ps(iy0,jy1);
1810 dz01 = _mm_sub_ps(iz0,jz1);
1811 dx02 = _mm_sub_ps(ix0,jx2);
1812 dy02 = _mm_sub_ps(iy0,jy2);
1813 dz02 = _mm_sub_ps(iz0,jz2);
1814 dx10 = _mm_sub_ps(ix1,jx0);
1815 dy10 = _mm_sub_ps(iy1,jy0);
1816 dz10 = _mm_sub_ps(iz1,jz0);
1817 dx11 = _mm_sub_ps(ix1,jx1);
1818 dy11 = _mm_sub_ps(iy1,jy1);
1819 dz11 = _mm_sub_ps(iz1,jz1);
1820 dx12 = _mm_sub_ps(ix1,jx2);
1821 dy12 = _mm_sub_ps(iy1,jy2);
1822 dz12 = _mm_sub_ps(iz1,jz2);
1823 dx20 = _mm_sub_ps(ix2,jx0);
1824 dy20 = _mm_sub_ps(iy2,jy0);
1825 dz20 = _mm_sub_ps(iz2,jz0);
1826 dx21 = _mm_sub_ps(ix2,jx1);
1827 dy21 = _mm_sub_ps(iy2,jy1);
1828 dz21 = _mm_sub_ps(iz2,jz1);
1829 dx22 = _mm_sub_ps(ix2,jx2);
1830 dy22 = _mm_sub_ps(iy2,jy2);
1831 dz22 = _mm_sub_ps(iz2,jz2);
1833 /* Calculate squared distance and things based on it */
1834 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1835 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1836 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1837 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1838 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1839 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1840 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1841 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1842 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1844 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1845 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1846 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1847 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1848 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1849 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1850 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1851 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1852 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1854 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1855 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1856 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1857 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1858 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1859 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1860 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1861 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1862 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1864 fjx0 = _mm_setzero_ps();
1865 fjy0 = _mm_setzero_ps();
1866 fjz0 = _mm_setzero_ps();
1867 fjx1 = _mm_setzero_ps();
1868 fjy1 = _mm_setzero_ps();
1869 fjz1 = _mm_setzero_ps();
1870 fjx2 = _mm_setzero_ps();
1871 fjy2 = _mm_setzero_ps();
1872 fjz2 = _mm_setzero_ps();
1874 /**************************
1875 * CALCULATE INTERACTIONS *
1876 **************************/
1878 if (gmx_mm_any_lt(rsq00,rcutoff2))
1881 r00 = _mm_mul_ps(rsq00,rinv00);
1882 r00 = _mm_andnot_ps(dummy_mask,r00);
1884 /* REACTION-FIELD ELECTROSTATICS */
1885 felec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_mul_ps(rinv00,rinvsq00),krf2));
1887 /* LENNARD-JONES DISPERSION/REPULSION */
1889 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1890 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
1891 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1892 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1893 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1895 d = _mm_sub_ps(r00,rswitch);
1896 d = _mm_max_ps(d,_mm_setzero_ps());
1897 d2 = _mm_mul_ps(d,d);
1898 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1900 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1902 /* Evaluate switch function */
1903 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1904 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1905 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1907 fscal = _mm_add_ps(felec,fvdw);
1909 fscal = _mm_and_ps(fscal,cutoff_mask);
1911 fscal = _mm_andnot_ps(dummy_mask,fscal);
1913 /* Calculate temporary vectorial force */
1914 tx = _mm_mul_ps(fscal,dx00);
1915 ty = _mm_mul_ps(fscal,dy00);
1916 tz = _mm_mul_ps(fscal,dz00);
1918 /* Update vectorial force */
1919 fix0 = _mm_add_ps(fix0,tx);
1920 fiy0 = _mm_add_ps(fiy0,ty);
1921 fiz0 = _mm_add_ps(fiz0,tz);
1923 fjx0 = _mm_add_ps(fjx0,tx);
1924 fjy0 = _mm_add_ps(fjy0,ty);
1925 fjz0 = _mm_add_ps(fjz0,tz);
1929 /**************************
1930 * CALCULATE INTERACTIONS *
1931 **************************/
1933 if (gmx_mm_any_lt(rsq01,rcutoff2))
1936 /* REACTION-FIELD ELECTROSTATICS */
1937 felec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_mul_ps(rinv01,rinvsq01),krf2));
1939 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
1941 fscal = felec;
1943 fscal = _mm_and_ps(fscal,cutoff_mask);
1945 fscal = _mm_andnot_ps(dummy_mask,fscal);
1947 /* Calculate temporary vectorial force */
1948 tx = _mm_mul_ps(fscal,dx01);
1949 ty = _mm_mul_ps(fscal,dy01);
1950 tz = _mm_mul_ps(fscal,dz01);
1952 /* Update vectorial force */
1953 fix0 = _mm_add_ps(fix0,tx);
1954 fiy0 = _mm_add_ps(fiy0,ty);
1955 fiz0 = _mm_add_ps(fiz0,tz);
1957 fjx1 = _mm_add_ps(fjx1,tx);
1958 fjy1 = _mm_add_ps(fjy1,ty);
1959 fjz1 = _mm_add_ps(fjz1,tz);
1963 /**************************
1964 * CALCULATE INTERACTIONS *
1965 **************************/
1967 if (gmx_mm_any_lt(rsq02,rcutoff2))
1970 /* REACTION-FIELD ELECTROSTATICS */
1971 felec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_mul_ps(rinv02,rinvsq02),krf2));
1973 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
1975 fscal = felec;
1977 fscal = _mm_and_ps(fscal,cutoff_mask);
1979 fscal = _mm_andnot_ps(dummy_mask,fscal);
1981 /* Calculate temporary vectorial force */
1982 tx = _mm_mul_ps(fscal,dx02);
1983 ty = _mm_mul_ps(fscal,dy02);
1984 tz = _mm_mul_ps(fscal,dz02);
1986 /* Update vectorial force */
1987 fix0 = _mm_add_ps(fix0,tx);
1988 fiy0 = _mm_add_ps(fiy0,ty);
1989 fiz0 = _mm_add_ps(fiz0,tz);
1991 fjx2 = _mm_add_ps(fjx2,tx);
1992 fjy2 = _mm_add_ps(fjy2,ty);
1993 fjz2 = _mm_add_ps(fjz2,tz);
1997 /**************************
1998 * CALCULATE INTERACTIONS *
1999 **************************/
2001 if (gmx_mm_any_lt(rsq10,rcutoff2))
2004 /* REACTION-FIELD ELECTROSTATICS */
2005 felec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_mul_ps(rinv10,rinvsq10),krf2));
2007 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
2009 fscal = felec;
2011 fscal = _mm_and_ps(fscal,cutoff_mask);
2013 fscal = _mm_andnot_ps(dummy_mask,fscal);
2015 /* Calculate temporary vectorial force */
2016 tx = _mm_mul_ps(fscal,dx10);
2017 ty = _mm_mul_ps(fscal,dy10);
2018 tz = _mm_mul_ps(fscal,dz10);
2020 /* Update vectorial force */
2021 fix1 = _mm_add_ps(fix1,tx);
2022 fiy1 = _mm_add_ps(fiy1,ty);
2023 fiz1 = _mm_add_ps(fiz1,tz);
2025 fjx0 = _mm_add_ps(fjx0,tx);
2026 fjy0 = _mm_add_ps(fjy0,ty);
2027 fjz0 = _mm_add_ps(fjz0,tz);
2031 /**************************
2032 * CALCULATE INTERACTIONS *
2033 **************************/
2035 if (gmx_mm_any_lt(rsq11,rcutoff2))
2038 /* REACTION-FIELD ELECTROSTATICS */
2039 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
2041 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2043 fscal = felec;
2045 fscal = _mm_and_ps(fscal,cutoff_mask);
2047 fscal = _mm_andnot_ps(dummy_mask,fscal);
2049 /* Calculate temporary vectorial force */
2050 tx = _mm_mul_ps(fscal,dx11);
2051 ty = _mm_mul_ps(fscal,dy11);
2052 tz = _mm_mul_ps(fscal,dz11);
2054 /* Update vectorial force */
2055 fix1 = _mm_add_ps(fix1,tx);
2056 fiy1 = _mm_add_ps(fiy1,ty);
2057 fiz1 = _mm_add_ps(fiz1,tz);
2059 fjx1 = _mm_add_ps(fjx1,tx);
2060 fjy1 = _mm_add_ps(fjy1,ty);
2061 fjz1 = _mm_add_ps(fjz1,tz);
2065 /**************************
2066 * CALCULATE INTERACTIONS *
2067 **************************/
2069 if (gmx_mm_any_lt(rsq12,rcutoff2))
2072 /* REACTION-FIELD ELECTROSTATICS */
2073 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
2075 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2077 fscal = felec;
2079 fscal = _mm_and_ps(fscal,cutoff_mask);
2081 fscal = _mm_andnot_ps(dummy_mask,fscal);
2083 /* Calculate temporary vectorial force */
2084 tx = _mm_mul_ps(fscal,dx12);
2085 ty = _mm_mul_ps(fscal,dy12);
2086 tz = _mm_mul_ps(fscal,dz12);
2088 /* Update vectorial force */
2089 fix1 = _mm_add_ps(fix1,tx);
2090 fiy1 = _mm_add_ps(fiy1,ty);
2091 fiz1 = _mm_add_ps(fiz1,tz);
2093 fjx2 = _mm_add_ps(fjx2,tx);
2094 fjy2 = _mm_add_ps(fjy2,ty);
2095 fjz2 = _mm_add_ps(fjz2,tz);
2099 /**************************
2100 * CALCULATE INTERACTIONS *
2101 **************************/
2103 if (gmx_mm_any_lt(rsq20,rcutoff2))
2106 /* REACTION-FIELD ELECTROSTATICS */
2107 felec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_mul_ps(rinv20,rinvsq20),krf2));
2109 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
2111 fscal = felec;
2113 fscal = _mm_and_ps(fscal,cutoff_mask);
2115 fscal = _mm_andnot_ps(dummy_mask,fscal);
2117 /* Calculate temporary vectorial force */
2118 tx = _mm_mul_ps(fscal,dx20);
2119 ty = _mm_mul_ps(fscal,dy20);
2120 tz = _mm_mul_ps(fscal,dz20);
2122 /* Update vectorial force */
2123 fix2 = _mm_add_ps(fix2,tx);
2124 fiy2 = _mm_add_ps(fiy2,ty);
2125 fiz2 = _mm_add_ps(fiz2,tz);
2127 fjx0 = _mm_add_ps(fjx0,tx);
2128 fjy0 = _mm_add_ps(fjy0,ty);
2129 fjz0 = _mm_add_ps(fjz0,tz);
2133 /**************************
2134 * CALCULATE INTERACTIONS *
2135 **************************/
2137 if (gmx_mm_any_lt(rsq21,rcutoff2))
2140 /* REACTION-FIELD ELECTROSTATICS */
2141 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
2143 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2145 fscal = felec;
2147 fscal = _mm_and_ps(fscal,cutoff_mask);
2149 fscal = _mm_andnot_ps(dummy_mask,fscal);
2151 /* Calculate temporary vectorial force */
2152 tx = _mm_mul_ps(fscal,dx21);
2153 ty = _mm_mul_ps(fscal,dy21);
2154 tz = _mm_mul_ps(fscal,dz21);
2156 /* Update vectorial force */
2157 fix2 = _mm_add_ps(fix2,tx);
2158 fiy2 = _mm_add_ps(fiy2,ty);
2159 fiz2 = _mm_add_ps(fiz2,tz);
2161 fjx1 = _mm_add_ps(fjx1,tx);
2162 fjy1 = _mm_add_ps(fjy1,ty);
2163 fjz1 = _mm_add_ps(fjz1,tz);
2167 /**************************
2168 * CALCULATE INTERACTIONS *
2169 **************************/
2171 if (gmx_mm_any_lt(rsq22,rcutoff2))
2174 /* REACTION-FIELD ELECTROSTATICS */
2175 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
2177 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2179 fscal = felec;
2181 fscal = _mm_and_ps(fscal,cutoff_mask);
2183 fscal = _mm_andnot_ps(dummy_mask,fscal);
2185 /* Calculate temporary vectorial force */
2186 tx = _mm_mul_ps(fscal,dx22);
2187 ty = _mm_mul_ps(fscal,dy22);
2188 tz = _mm_mul_ps(fscal,dz22);
2190 /* Update vectorial force */
2191 fix2 = _mm_add_ps(fix2,tx);
2192 fiy2 = _mm_add_ps(fiy2,ty);
2193 fiz2 = _mm_add_ps(fiz2,tz);
2195 fjx2 = _mm_add_ps(fjx2,tx);
2196 fjy2 = _mm_add_ps(fjy2,ty);
2197 fjz2 = _mm_add_ps(fjz2,tz);
2201 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2202 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2203 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2204 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2206 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2207 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2209 /* Inner loop uses 302 flops */
2212 /* End of innermost loop */
2214 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2215 f+i_coord_offset,fshift+i_shift_offset);
2217 /* Increment number of inner iterations */
2218 inneriter += j_index_end - j_index_start;
2220 /* Outer loop uses 18 flops */
2223 /* Increment number of outer iterations */
2224 outeriter += nri;
2226 /* Update outer/inner flops */
2228 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*302);