Remove all unnecessary HAVE_CONFIG_H
[gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_single / nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_sse2_single.c
blob8a0de8764c35d7a4b690355d5efd476b24a25d60
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_single kernel generator.
38 #include "config.h"
40 #include <math.h>
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "nrnb.h"
47 #include "gromacs/simd/math_x86_sse2_single.h"
48 #include "kernelutil_x86_sse2_single.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sse2_single
52 * Electrostatics interaction: Ewald
53 * VdW interaction: LennardJones
54 * Geometry: Water4-Water4
55 * Calculate force/pot: PotentialAndForce
57 void
58 nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sse2_single
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
78 real rcutoff_scalar;
79 real *shiftvec,*fshift,*x,*f;
80 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81 real scratch[4*DIM];
82 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83 int vdwioffset0;
84 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85 int vdwioffset1;
86 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87 int vdwioffset2;
88 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89 int vdwioffset3;
90 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
91 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
94 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
96 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
98 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
99 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
100 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
101 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
102 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
103 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
104 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
105 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
106 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
107 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
108 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
109 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
110 real *charge;
111 int nvdwtype;
112 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
113 int *vdwtype;
114 real *vdwparam;
115 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
116 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
117 __m128i ewitab;
118 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
119 real *ewtab;
120 __m128 dummy_mask,cutoff_mask;
121 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
122 __m128 one = _mm_set1_ps(1.0);
123 __m128 two = _mm_set1_ps(2.0);
124 x = xx[0];
125 f = ff[0];
127 nri = nlist->nri;
128 iinr = nlist->iinr;
129 jindex = nlist->jindex;
130 jjnr = nlist->jjnr;
131 shiftidx = nlist->shift;
132 gid = nlist->gid;
133 shiftvec = fr->shift_vec[0];
134 fshift = fr->fshift[0];
135 facel = _mm_set1_ps(fr->epsfac);
136 charge = mdatoms->chargeA;
137 nvdwtype = fr->ntype;
138 vdwparam = fr->nbfp;
139 vdwtype = mdatoms->typeA;
141 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
142 ewtab = fr->ic->tabq_coul_FDV0;
143 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
144 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
146 /* Setup water-specific parameters */
147 inr = nlist->iinr[0];
148 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
149 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
150 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
151 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
153 jq1 = _mm_set1_ps(charge[inr+1]);
154 jq2 = _mm_set1_ps(charge[inr+2]);
155 jq3 = _mm_set1_ps(charge[inr+3]);
156 vdwjidx0A = 2*vdwtype[inr+0];
157 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
158 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
159 qq11 = _mm_mul_ps(iq1,jq1);
160 qq12 = _mm_mul_ps(iq1,jq2);
161 qq13 = _mm_mul_ps(iq1,jq3);
162 qq21 = _mm_mul_ps(iq2,jq1);
163 qq22 = _mm_mul_ps(iq2,jq2);
164 qq23 = _mm_mul_ps(iq2,jq3);
165 qq31 = _mm_mul_ps(iq3,jq1);
166 qq32 = _mm_mul_ps(iq3,jq2);
167 qq33 = _mm_mul_ps(iq3,jq3);
169 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
170 rcutoff_scalar = fr->rcoulomb;
171 rcutoff = _mm_set1_ps(rcutoff_scalar);
172 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
174 sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
175 rvdw = _mm_set1_ps(fr->rvdw);
177 /* Avoid stupid compiler warnings */
178 jnrA = jnrB = jnrC = jnrD = 0;
179 j_coord_offsetA = 0;
180 j_coord_offsetB = 0;
181 j_coord_offsetC = 0;
182 j_coord_offsetD = 0;
184 outeriter = 0;
185 inneriter = 0;
187 for(iidx=0;iidx<4*DIM;iidx++)
189 scratch[iidx] = 0.0;
192 /* Start outer loop over neighborlists */
193 for(iidx=0; iidx<nri; iidx++)
195 /* Load shift vector for this list */
196 i_shift_offset = DIM*shiftidx[iidx];
198 /* Load limits for loop over neighbors */
199 j_index_start = jindex[iidx];
200 j_index_end = jindex[iidx+1];
202 /* Get outer coordinate index */
203 inr = iinr[iidx];
204 i_coord_offset = DIM*inr;
206 /* Load i particle coords and add shift vector */
207 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
208 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
210 fix0 = _mm_setzero_ps();
211 fiy0 = _mm_setzero_ps();
212 fiz0 = _mm_setzero_ps();
213 fix1 = _mm_setzero_ps();
214 fiy1 = _mm_setzero_ps();
215 fiz1 = _mm_setzero_ps();
216 fix2 = _mm_setzero_ps();
217 fiy2 = _mm_setzero_ps();
218 fiz2 = _mm_setzero_ps();
219 fix3 = _mm_setzero_ps();
220 fiy3 = _mm_setzero_ps();
221 fiz3 = _mm_setzero_ps();
223 /* Reset potential sums */
224 velecsum = _mm_setzero_ps();
225 vvdwsum = _mm_setzero_ps();
227 /* Start inner kernel loop */
228 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
231 /* Get j neighbor index, and coordinate index */
232 jnrA = jjnr[jidx];
233 jnrB = jjnr[jidx+1];
234 jnrC = jjnr[jidx+2];
235 jnrD = jjnr[jidx+3];
236 j_coord_offsetA = DIM*jnrA;
237 j_coord_offsetB = DIM*jnrB;
238 j_coord_offsetC = DIM*jnrC;
239 j_coord_offsetD = DIM*jnrD;
241 /* load j atom coordinates */
242 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
243 x+j_coord_offsetC,x+j_coord_offsetD,
244 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
245 &jy2,&jz2,&jx3,&jy3,&jz3);
247 /* Calculate displacement vector */
248 dx00 = _mm_sub_ps(ix0,jx0);
249 dy00 = _mm_sub_ps(iy0,jy0);
250 dz00 = _mm_sub_ps(iz0,jz0);
251 dx11 = _mm_sub_ps(ix1,jx1);
252 dy11 = _mm_sub_ps(iy1,jy1);
253 dz11 = _mm_sub_ps(iz1,jz1);
254 dx12 = _mm_sub_ps(ix1,jx2);
255 dy12 = _mm_sub_ps(iy1,jy2);
256 dz12 = _mm_sub_ps(iz1,jz2);
257 dx13 = _mm_sub_ps(ix1,jx3);
258 dy13 = _mm_sub_ps(iy1,jy3);
259 dz13 = _mm_sub_ps(iz1,jz3);
260 dx21 = _mm_sub_ps(ix2,jx1);
261 dy21 = _mm_sub_ps(iy2,jy1);
262 dz21 = _mm_sub_ps(iz2,jz1);
263 dx22 = _mm_sub_ps(ix2,jx2);
264 dy22 = _mm_sub_ps(iy2,jy2);
265 dz22 = _mm_sub_ps(iz2,jz2);
266 dx23 = _mm_sub_ps(ix2,jx3);
267 dy23 = _mm_sub_ps(iy2,jy3);
268 dz23 = _mm_sub_ps(iz2,jz3);
269 dx31 = _mm_sub_ps(ix3,jx1);
270 dy31 = _mm_sub_ps(iy3,jy1);
271 dz31 = _mm_sub_ps(iz3,jz1);
272 dx32 = _mm_sub_ps(ix3,jx2);
273 dy32 = _mm_sub_ps(iy3,jy2);
274 dz32 = _mm_sub_ps(iz3,jz2);
275 dx33 = _mm_sub_ps(ix3,jx3);
276 dy33 = _mm_sub_ps(iy3,jy3);
277 dz33 = _mm_sub_ps(iz3,jz3);
279 /* Calculate squared distance and things based on it */
280 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
281 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
282 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
283 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
284 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
285 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
286 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
287 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
288 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
289 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
291 rinv11 = gmx_mm_invsqrt_ps(rsq11);
292 rinv12 = gmx_mm_invsqrt_ps(rsq12);
293 rinv13 = gmx_mm_invsqrt_ps(rsq13);
294 rinv21 = gmx_mm_invsqrt_ps(rsq21);
295 rinv22 = gmx_mm_invsqrt_ps(rsq22);
296 rinv23 = gmx_mm_invsqrt_ps(rsq23);
297 rinv31 = gmx_mm_invsqrt_ps(rsq31);
298 rinv32 = gmx_mm_invsqrt_ps(rsq32);
299 rinv33 = gmx_mm_invsqrt_ps(rsq33);
301 rinvsq00 = gmx_mm_inv_ps(rsq00);
302 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
303 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
304 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
305 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
306 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
307 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
308 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
309 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
310 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
312 fjx0 = _mm_setzero_ps();
313 fjy0 = _mm_setzero_ps();
314 fjz0 = _mm_setzero_ps();
315 fjx1 = _mm_setzero_ps();
316 fjy1 = _mm_setzero_ps();
317 fjz1 = _mm_setzero_ps();
318 fjx2 = _mm_setzero_ps();
319 fjy2 = _mm_setzero_ps();
320 fjz2 = _mm_setzero_ps();
321 fjx3 = _mm_setzero_ps();
322 fjy3 = _mm_setzero_ps();
323 fjz3 = _mm_setzero_ps();
325 /**************************
326 * CALCULATE INTERACTIONS *
327 **************************/
329 if (gmx_mm_any_lt(rsq00,rcutoff2))
332 /* LENNARD-JONES DISPERSION/REPULSION */
334 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
335 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
336 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
337 vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
338 _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
339 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
341 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
343 /* Update potential sum for this i atom from the interaction with this j atom. */
344 vvdw = _mm_and_ps(vvdw,cutoff_mask);
345 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
347 fscal = fvdw;
349 fscal = _mm_and_ps(fscal,cutoff_mask);
351 /* Calculate temporary vectorial force */
352 tx = _mm_mul_ps(fscal,dx00);
353 ty = _mm_mul_ps(fscal,dy00);
354 tz = _mm_mul_ps(fscal,dz00);
356 /* Update vectorial force */
357 fix0 = _mm_add_ps(fix0,tx);
358 fiy0 = _mm_add_ps(fiy0,ty);
359 fiz0 = _mm_add_ps(fiz0,tz);
361 fjx0 = _mm_add_ps(fjx0,tx);
362 fjy0 = _mm_add_ps(fjy0,ty);
363 fjz0 = _mm_add_ps(fjz0,tz);
367 /**************************
368 * CALCULATE INTERACTIONS *
369 **************************/
371 if (gmx_mm_any_lt(rsq11,rcutoff2))
374 r11 = _mm_mul_ps(rsq11,rinv11);
376 /* EWALD ELECTROSTATICS */
378 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
379 ewrt = _mm_mul_ps(r11,ewtabscale);
380 ewitab = _mm_cvttps_epi32(ewrt);
381 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
382 ewitab = _mm_slli_epi32(ewitab,2);
383 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
384 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
385 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
386 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
387 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
388 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
389 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
390 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
391 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
393 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
395 /* Update potential sum for this i atom from the interaction with this j atom. */
396 velec = _mm_and_ps(velec,cutoff_mask);
397 velecsum = _mm_add_ps(velecsum,velec);
399 fscal = felec;
401 fscal = _mm_and_ps(fscal,cutoff_mask);
403 /* Calculate temporary vectorial force */
404 tx = _mm_mul_ps(fscal,dx11);
405 ty = _mm_mul_ps(fscal,dy11);
406 tz = _mm_mul_ps(fscal,dz11);
408 /* Update vectorial force */
409 fix1 = _mm_add_ps(fix1,tx);
410 fiy1 = _mm_add_ps(fiy1,ty);
411 fiz1 = _mm_add_ps(fiz1,tz);
413 fjx1 = _mm_add_ps(fjx1,tx);
414 fjy1 = _mm_add_ps(fjy1,ty);
415 fjz1 = _mm_add_ps(fjz1,tz);
419 /**************************
420 * CALCULATE INTERACTIONS *
421 **************************/
423 if (gmx_mm_any_lt(rsq12,rcutoff2))
426 r12 = _mm_mul_ps(rsq12,rinv12);
428 /* EWALD ELECTROSTATICS */
430 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
431 ewrt = _mm_mul_ps(r12,ewtabscale);
432 ewitab = _mm_cvttps_epi32(ewrt);
433 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
434 ewitab = _mm_slli_epi32(ewitab,2);
435 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
436 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
437 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
438 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
439 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
440 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
441 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
442 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
443 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
445 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
447 /* Update potential sum for this i atom from the interaction with this j atom. */
448 velec = _mm_and_ps(velec,cutoff_mask);
449 velecsum = _mm_add_ps(velecsum,velec);
451 fscal = felec;
453 fscal = _mm_and_ps(fscal,cutoff_mask);
455 /* Calculate temporary vectorial force */
456 tx = _mm_mul_ps(fscal,dx12);
457 ty = _mm_mul_ps(fscal,dy12);
458 tz = _mm_mul_ps(fscal,dz12);
460 /* Update vectorial force */
461 fix1 = _mm_add_ps(fix1,tx);
462 fiy1 = _mm_add_ps(fiy1,ty);
463 fiz1 = _mm_add_ps(fiz1,tz);
465 fjx2 = _mm_add_ps(fjx2,tx);
466 fjy2 = _mm_add_ps(fjy2,ty);
467 fjz2 = _mm_add_ps(fjz2,tz);
471 /**************************
472 * CALCULATE INTERACTIONS *
473 **************************/
475 if (gmx_mm_any_lt(rsq13,rcutoff2))
478 r13 = _mm_mul_ps(rsq13,rinv13);
480 /* EWALD ELECTROSTATICS */
482 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
483 ewrt = _mm_mul_ps(r13,ewtabscale);
484 ewitab = _mm_cvttps_epi32(ewrt);
485 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
486 ewitab = _mm_slli_epi32(ewitab,2);
487 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
488 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
489 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
490 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
491 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
492 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
493 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
494 velec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_sub_ps(rinv13,sh_ewald),velec));
495 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
497 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
499 /* Update potential sum for this i atom from the interaction with this j atom. */
500 velec = _mm_and_ps(velec,cutoff_mask);
501 velecsum = _mm_add_ps(velecsum,velec);
503 fscal = felec;
505 fscal = _mm_and_ps(fscal,cutoff_mask);
507 /* Calculate temporary vectorial force */
508 tx = _mm_mul_ps(fscal,dx13);
509 ty = _mm_mul_ps(fscal,dy13);
510 tz = _mm_mul_ps(fscal,dz13);
512 /* Update vectorial force */
513 fix1 = _mm_add_ps(fix1,tx);
514 fiy1 = _mm_add_ps(fiy1,ty);
515 fiz1 = _mm_add_ps(fiz1,tz);
517 fjx3 = _mm_add_ps(fjx3,tx);
518 fjy3 = _mm_add_ps(fjy3,ty);
519 fjz3 = _mm_add_ps(fjz3,tz);
523 /**************************
524 * CALCULATE INTERACTIONS *
525 **************************/
527 if (gmx_mm_any_lt(rsq21,rcutoff2))
530 r21 = _mm_mul_ps(rsq21,rinv21);
532 /* EWALD ELECTROSTATICS */
534 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
535 ewrt = _mm_mul_ps(r21,ewtabscale);
536 ewitab = _mm_cvttps_epi32(ewrt);
537 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
538 ewitab = _mm_slli_epi32(ewitab,2);
539 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
540 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
541 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
542 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
543 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
544 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
545 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
546 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
547 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
549 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
551 /* Update potential sum for this i atom from the interaction with this j atom. */
552 velec = _mm_and_ps(velec,cutoff_mask);
553 velecsum = _mm_add_ps(velecsum,velec);
555 fscal = felec;
557 fscal = _mm_and_ps(fscal,cutoff_mask);
559 /* Calculate temporary vectorial force */
560 tx = _mm_mul_ps(fscal,dx21);
561 ty = _mm_mul_ps(fscal,dy21);
562 tz = _mm_mul_ps(fscal,dz21);
564 /* Update vectorial force */
565 fix2 = _mm_add_ps(fix2,tx);
566 fiy2 = _mm_add_ps(fiy2,ty);
567 fiz2 = _mm_add_ps(fiz2,tz);
569 fjx1 = _mm_add_ps(fjx1,tx);
570 fjy1 = _mm_add_ps(fjy1,ty);
571 fjz1 = _mm_add_ps(fjz1,tz);
575 /**************************
576 * CALCULATE INTERACTIONS *
577 **************************/
579 if (gmx_mm_any_lt(rsq22,rcutoff2))
582 r22 = _mm_mul_ps(rsq22,rinv22);
584 /* EWALD ELECTROSTATICS */
586 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
587 ewrt = _mm_mul_ps(r22,ewtabscale);
588 ewitab = _mm_cvttps_epi32(ewrt);
589 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
590 ewitab = _mm_slli_epi32(ewitab,2);
591 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
592 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
593 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
594 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
595 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
596 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
597 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
598 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
599 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
601 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
603 /* Update potential sum for this i atom from the interaction with this j atom. */
604 velec = _mm_and_ps(velec,cutoff_mask);
605 velecsum = _mm_add_ps(velecsum,velec);
607 fscal = felec;
609 fscal = _mm_and_ps(fscal,cutoff_mask);
611 /* Calculate temporary vectorial force */
612 tx = _mm_mul_ps(fscal,dx22);
613 ty = _mm_mul_ps(fscal,dy22);
614 tz = _mm_mul_ps(fscal,dz22);
616 /* Update vectorial force */
617 fix2 = _mm_add_ps(fix2,tx);
618 fiy2 = _mm_add_ps(fiy2,ty);
619 fiz2 = _mm_add_ps(fiz2,tz);
621 fjx2 = _mm_add_ps(fjx2,tx);
622 fjy2 = _mm_add_ps(fjy2,ty);
623 fjz2 = _mm_add_ps(fjz2,tz);
627 /**************************
628 * CALCULATE INTERACTIONS *
629 **************************/
631 if (gmx_mm_any_lt(rsq23,rcutoff2))
634 r23 = _mm_mul_ps(rsq23,rinv23);
636 /* EWALD ELECTROSTATICS */
638 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
639 ewrt = _mm_mul_ps(r23,ewtabscale);
640 ewitab = _mm_cvttps_epi32(ewrt);
641 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
642 ewitab = _mm_slli_epi32(ewitab,2);
643 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
644 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
645 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
646 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
647 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
648 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
649 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
650 velec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_sub_ps(rinv23,sh_ewald),velec));
651 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
653 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
655 /* Update potential sum for this i atom from the interaction with this j atom. */
656 velec = _mm_and_ps(velec,cutoff_mask);
657 velecsum = _mm_add_ps(velecsum,velec);
659 fscal = felec;
661 fscal = _mm_and_ps(fscal,cutoff_mask);
663 /* Calculate temporary vectorial force */
664 tx = _mm_mul_ps(fscal,dx23);
665 ty = _mm_mul_ps(fscal,dy23);
666 tz = _mm_mul_ps(fscal,dz23);
668 /* Update vectorial force */
669 fix2 = _mm_add_ps(fix2,tx);
670 fiy2 = _mm_add_ps(fiy2,ty);
671 fiz2 = _mm_add_ps(fiz2,tz);
673 fjx3 = _mm_add_ps(fjx3,tx);
674 fjy3 = _mm_add_ps(fjy3,ty);
675 fjz3 = _mm_add_ps(fjz3,tz);
679 /**************************
680 * CALCULATE INTERACTIONS *
681 **************************/
683 if (gmx_mm_any_lt(rsq31,rcutoff2))
686 r31 = _mm_mul_ps(rsq31,rinv31);
688 /* EWALD ELECTROSTATICS */
690 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
691 ewrt = _mm_mul_ps(r31,ewtabscale);
692 ewitab = _mm_cvttps_epi32(ewrt);
693 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
694 ewitab = _mm_slli_epi32(ewitab,2);
695 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
696 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
697 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
698 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
699 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
700 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
701 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
702 velec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_sub_ps(rinv31,sh_ewald),velec));
703 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
705 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
707 /* Update potential sum for this i atom from the interaction with this j atom. */
708 velec = _mm_and_ps(velec,cutoff_mask);
709 velecsum = _mm_add_ps(velecsum,velec);
711 fscal = felec;
713 fscal = _mm_and_ps(fscal,cutoff_mask);
715 /* Calculate temporary vectorial force */
716 tx = _mm_mul_ps(fscal,dx31);
717 ty = _mm_mul_ps(fscal,dy31);
718 tz = _mm_mul_ps(fscal,dz31);
720 /* Update vectorial force */
721 fix3 = _mm_add_ps(fix3,tx);
722 fiy3 = _mm_add_ps(fiy3,ty);
723 fiz3 = _mm_add_ps(fiz3,tz);
725 fjx1 = _mm_add_ps(fjx1,tx);
726 fjy1 = _mm_add_ps(fjy1,ty);
727 fjz1 = _mm_add_ps(fjz1,tz);
731 /**************************
732 * CALCULATE INTERACTIONS *
733 **************************/
735 if (gmx_mm_any_lt(rsq32,rcutoff2))
738 r32 = _mm_mul_ps(rsq32,rinv32);
740 /* EWALD ELECTROSTATICS */
742 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
743 ewrt = _mm_mul_ps(r32,ewtabscale);
744 ewitab = _mm_cvttps_epi32(ewrt);
745 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
746 ewitab = _mm_slli_epi32(ewitab,2);
747 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
748 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
749 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
750 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
751 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
752 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
753 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
754 velec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_sub_ps(rinv32,sh_ewald),velec));
755 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
757 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
759 /* Update potential sum for this i atom from the interaction with this j atom. */
760 velec = _mm_and_ps(velec,cutoff_mask);
761 velecsum = _mm_add_ps(velecsum,velec);
763 fscal = felec;
765 fscal = _mm_and_ps(fscal,cutoff_mask);
767 /* Calculate temporary vectorial force */
768 tx = _mm_mul_ps(fscal,dx32);
769 ty = _mm_mul_ps(fscal,dy32);
770 tz = _mm_mul_ps(fscal,dz32);
772 /* Update vectorial force */
773 fix3 = _mm_add_ps(fix3,tx);
774 fiy3 = _mm_add_ps(fiy3,ty);
775 fiz3 = _mm_add_ps(fiz3,tz);
777 fjx2 = _mm_add_ps(fjx2,tx);
778 fjy2 = _mm_add_ps(fjy2,ty);
779 fjz2 = _mm_add_ps(fjz2,tz);
783 /**************************
784 * CALCULATE INTERACTIONS *
785 **************************/
787 if (gmx_mm_any_lt(rsq33,rcutoff2))
790 r33 = _mm_mul_ps(rsq33,rinv33);
792 /* EWALD ELECTROSTATICS */
794 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
795 ewrt = _mm_mul_ps(r33,ewtabscale);
796 ewitab = _mm_cvttps_epi32(ewrt);
797 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
798 ewitab = _mm_slli_epi32(ewitab,2);
799 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
800 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
801 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
802 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
803 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
804 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
805 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
806 velec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_sub_ps(rinv33,sh_ewald),velec));
807 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
809 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
811 /* Update potential sum for this i atom from the interaction with this j atom. */
812 velec = _mm_and_ps(velec,cutoff_mask);
813 velecsum = _mm_add_ps(velecsum,velec);
815 fscal = felec;
817 fscal = _mm_and_ps(fscal,cutoff_mask);
819 /* Calculate temporary vectorial force */
820 tx = _mm_mul_ps(fscal,dx33);
821 ty = _mm_mul_ps(fscal,dy33);
822 tz = _mm_mul_ps(fscal,dz33);
824 /* Update vectorial force */
825 fix3 = _mm_add_ps(fix3,tx);
826 fiy3 = _mm_add_ps(fiy3,ty);
827 fiz3 = _mm_add_ps(fiz3,tz);
829 fjx3 = _mm_add_ps(fjx3,tx);
830 fjy3 = _mm_add_ps(fjy3,ty);
831 fjz3 = _mm_add_ps(fjz3,tz);
835 fjptrA = f+j_coord_offsetA;
836 fjptrB = f+j_coord_offsetB;
837 fjptrC = f+j_coord_offsetC;
838 fjptrD = f+j_coord_offsetD;
840 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
841 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
842 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
844 /* Inner loop uses 458 flops */
847 if(jidx<j_index_end)
850 /* Get j neighbor index, and coordinate index */
851 jnrlistA = jjnr[jidx];
852 jnrlistB = jjnr[jidx+1];
853 jnrlistC = jjnr[jidx+2];
854 jnrlistD = jjnr[jidx+3];
855 /* Sign of each element will be negative for non-real atoms.
856 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
857 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
859 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
860 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
861 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
862 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
863 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
864 j_coord_offsetA = DIM*jnrA;
865 j_coord_offsetB = DIM*jnrB;
866 j_coord_offsetC = DIM*jnrC;
867 j_coord_offsetD = DIM*jnrD;
869 /* load j atom coordinates */
870 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
871 x+j_coord_offsetC,x+j_coord_offsetD,
872 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
873 &jy2,&jz2,&jx3,&jy3,&jz3);
875 /* Calculate displacement vector */
876 dx00 = _mm_sub_ps(ix0,jx0);
877 dy00 = _mm_sub_ps(iy0,jy0);
878 dz00 = _mm_sub_ps(iz0,jz0);
879 dx11 = _mm_sub_ps(ix1,jx1);
880 dy11 = _mm_sub_ps(iy1,jy1);
881 dz11 = _mm_sub_ps(iz1,jz1);
882 dx12 = _mm_sub_ps(ix1,jx2);
883 dy12 = _mm_sub_ps(iy1,jy2);
884 dz12 = _mm_sub_ps(iz1,jz2);
885 dx13 = _mm_sub_ps(ix1,jx3);
886 dy13 = _mm_sub_ps(iy1,jy3);
887 dz13 = _mm_sub_ps(iz1,jz3);
888 dx21 = _mm_sub_ps(ix2,jx1);
889 dy21 = _mm_sub_ps(iy2,jy1);
890 dz21 = _mm_sub_ps(iz2,jz1);
891 dx22 = _mm_sub_ps(ix2,jx2);
892 dy22 = _mm_sub_ps(iy2,jy2);
893 dz22 = _mm_sub_ps(iz2,jz2);
894 dx23 = _mm_sub_ps(ix2,jx3);
895 dy23 = _mm_sub_ps(iy2,jy3);
896 dz23 = _mm_sub_ps(iz2,jz3);
897 dx31 = _mm_sub_ps(ix3,jx1);
898 dy31 = _mm_sub_ps(iy3,jy1);
899 dz31 = _mm_sub_ps(iz3,jz1);
900 dx32 = _mm_sub_ps(ix3,jx2);
901 dy32 = _mm_sub_ps(iy3,jy2);
902 dz32 = _mm_sub_ps(iz3,jz2);
903 dx33 = _mm_sub_ps(ix3,jx3);
904 dy33 = _mm_sub_ps(iy3,jy3);
905 dz33 = _mm_sub_ps(iz3,jz3);
907 /* Calculate squared distance and things based on it */
908 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
909 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
910 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
911 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
912 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
913 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
914 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
915 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
916 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
917 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
919 rinv11 = gmx_mm_invsqrt_ps(rsq11);
920 rinv12 = gmx_mm_invsqrt_ps(rsq12);
921 rinv13 = gmx_mm_invsqrt_ps(rsq13);
922 rinv21 = gmx_mm_invsqrt_ps(rsq21);
923 rinv22 = gmx_mm_invsqrt_ps(rsq22);
924 rinv23 = gmx_mm_invsqrt_ps(rsq23);
925 rinv31 = gmx_mm_invsqrt_ps(rsq31);
926 rinv32 = gmx_mm_invsqrt_ps(rsq32);
927 rinv33 = gmx_mm_invsqrt_ps(rsq33);
929 rinvsq00 = gmx_mm_inv_ps(rsq00);
930 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
931 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
932 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
933 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
934 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
935 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
936 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
937 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
938 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
940 fjx0 = _mm_setzero_ps();
941 fjy0 = _mm_setzero_ps();
942 fjz0 = _mm_setzero_ps();
943 fjx1 = _mm_setzero_ps();
944 fjy1 = _mm_setzero_ps();
945 fjz1 = _mm_setzero_ps();
946 fjx2 = _mm_setzero_ps();
947 fjy2 = _mm_setzero_ps();
948 fjz2 = _mm_setzero_ps();
949 fjx3 = _mm_setzero_ps();
950 fjy3 = _mm_setzero_ps();
951 fjz3 = _mm_setzero_ps();
953 /**************************
954 * CALCULATE INTERACTIONS *
955 **************************/
957 if (gmx_mm_any_lt(rsq00,rcutoff2))
960 /* LENNARD-JONES DISPERSION/REPULSION */
962 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
963 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
964 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
965 vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
966 _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
967 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
969 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
971 /* Update potential sum for this i atom from the interaction with this j atom. */
972 vvdw = _mm_and_ps(vvdw,cutoff_mask);
973 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
974 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
976 fscal = fvdw;
978 fscal = _mm_and_ps(fscal,cutoff_mask);
980 fscal = _mm_andnot_ps(dummy_mask,fscal);
982 /* Calculate temporary vectorial force */
983 tx = _mm_mul_ps(fscal,dx00);
984 ty = _mm_mul_ps(fscal,dy00);
985 tz = _mm_mul_ps(fscal,dz00);
987 /* Update vectorial force */
988 fix0 = _mm_add_ps(fix0,tx);
989 fiy0 = _mm_add_ps(fiy0,ty);
990 fiz0 = _mm_add_ps(fiz0,tz);
992 fjx0 = _mm_add_ps(fjx0,tx);
993 fjy0 = _mm_add_ps(fjy0,ty);
994 fjz0 = _mm_add_ps(fjz0,tz);
998 /**************************
999 * CALCULATE INTERACTIONS *
1000 **************************/
1002 if (gmx_mm_any_lt(rsq11,rcutoff2))
1005 r11 = _mm_mul_ps(rsq11,rinv11);
1006 r11 = _mm_andnot_ps(dummy_mask,r11);
1008 /* EWALD ELECTROSTATICS */
1010 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1011 ewrt = _mm_mul_ps(r11,ewtabscale);
1012 ewitab = _mm_cvttps_epi32(ewrt);
1013 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1014 ewitab = _mm_slli_epi32(ewitab,2);
1015 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1016 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1017 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1018 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1019 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1020 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1021 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1022 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
1023 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1025 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1027 /* Update potential sum for this i atom from the interaction with this j atom. */
1028 velec = _mm_and_ps(velec,cutoff_mask);
1029 velec = _mm_andnot_ps(dummy_mask,velec);
1030 velecsum = _mm_add_ps(velecsum,velec);
1032 fscal = felec;
1034 fscal = _mm_and_ps(fscal,cutoff_mask);
1036 fscal = _mm_andnot_ps(dummy_mask,fscal);
1038 /* Calculate temporary vectorial force */
1039 tx = _mm_mul_ps(fscal,dx11);
1040 ty = _mm_mul_ps(fscal,dy11);
1041 tz = _mm_mul_ps(fscal,dz11);
1043 /* Update vectorial force */
1044 fix1 = _mm_add_ps(fix1,tx);
1045 fiy1 = _mm_add_ps(fiy1,ty);
1046 fiz1 = _mm_add_ps(fiz1,tz);
1048 fjx1 = _mm_add_ps(fjx1,tx);
1049 fjy1 = _mm_add_ps(fjy1,ty);
1050 fjz1 = _mm_add_ps(fjz1,tz);
1054 /**************************
1055 * CALCULATE INTERACTIONS *
1056 **************************/
1058 if (gmx_mm_any_lt(rsq12,rcutoff2))
1061 r12 = _mm_mul_ps(rsq12,rinv12);
1062 r12 = _mm_andnot_ps(dummy_mask,r12);
1064 /* EWALD ELECTROSTATICS */
1066 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1067 ewrt = _mm_mul_ps(r12,ewtabscale);
1068 ewitab = _mm_cvttps_epi32(ewrt);
1069 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1070 ewitab = _mm_slli_epi32(ewitab,2);
1071 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1072 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1073 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1074 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1075 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1076 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1077 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1078 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
1079 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1081 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1083 /* Update potential sum for this i atom from the interaction with this j atom. */
1084 velec = _mm_and_ps(velec,cutoff_mask);
1085 velec = _mm_andnot_ps(dummy_mask,velec);
1086 velecsum = _mm_add_ps(velecsum,velec);
1088 fscal = felec;
1090 fscal = _mm_and_ps(fscal,cutoff_mask);
1092 fscal = _mm_andnot_ps(dummy_mask,fscal);
1094 /* Calculate temporary vectorial force */
1095 tx = _mm_mul_ps(fscal,dx12);
1096 ty = _mm_mul_ps(fscal,dy12);
1097 tz = _mm_mul_ps(fscal,dz12);
1099 /* Update vectorial force */
1100 fix1 = _mm_add_ps(fix1,tx);
1101 fiy1 = _mm_add_ps(fiy1,ty);
1102 fiz1 = _mm_add_ps(fiz1,tz);
1104 fjx2 = _mm_add_ps(fjx2,tx);
1105 fjy2 = _mm_add_ps(fjy2,ty);
1106 fjz2 = _mm_add_ps(fjz2,tz);
1110 /**************************
1111 * CALCULATE INTERACTIONS *
1112 **************************/
1114 if (gmx_mm_any_lt(rsq13,rcutoff2))
1117 r13 = _mm_mul_ps(rsq13,rinv13);
1118 r13 = _mm_andnot_ps(dummy_mask,r13);
1120 /* EWALD ELECTROSTATICS */
1122 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1123 ewrt = _mm_mul_ps(r13,ewtabscale);
1124 ewitab = _mm_cvttps_epi32(ewrt);
1125 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1126 ewitab = _mm_slli_epi32(ewitab,2);
1127 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1128 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1129 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1130 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1131 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1132 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1133 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1134 velec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_sub_ps(rinv13,sh_ewald),velec));
1135 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
1137 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
1139 /* Update potential sum for this i atom from the interaction with this j atom. */
1140 velec = _mm_and_ps(velec,cutoff_mask);
1141 velec = _mm_andnot_ps(dummy_mask,velec);
1142 velecsum = _mm_add_ps(velecsum,velec);
1144 fscal = felec;
1146 fscal = _mm_and_ps(fscal,cutoff_mask);
1148 fscal = _mm_andnot_ps(dummy_mask,fscal);
1150 /* Calculate temporary vectorial force */
1151 tx = _mm_mul_ps(fscal,dx13);
1152 ty = _mm_mul_ps(fscal,dy13);
1153 tz = _mm_mul_ps(fscal,dz13);
1155 /* Update vectorial force */
1156 fix1 = _mm_add_ps(fix1,tx);
1157 fiy1 = _mm_add_ps(fiy1,ty);
1158 fiz1 = _mm_add_ps(fiz1,tz);
1160 fjx3 = _mm_add_ps(fjx3,tx);
1161 fjy3 = _mm_add_ps(fjy3,ty);
1162 fjz3 = _mm_add_ps(fjz3,tz);
1166 /**************************
1167 * CALCULATE INTERACTIONS *
1168 **************************/
1170 if (gmx_mm_any_lt(rsq21,rcutoff2))
1173 r21 = _mm_mul_ps(rsq21,rinv21);
1174 r21 = _mm_andnot_ps(dummy_mask,r21);
1176 /* EWALD ELECTROSTATICS */
1178 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1179 ewrt = _mm_mul_ps(r21,ewtabscale);
1180 ewitab = _mm_cvttps_epi32(ewrt);
1181 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1182 ewitab = _mm_slli_epi32(ewitab,2);
1183 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1184 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1185 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1186 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1187 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1188 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1189 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1190 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
1191 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1193 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1195 /* Update potential sum for this i atom from the interaction with this j atom. */
1196 velec = _mm_and_ps(velec,cutoff_mask);
1197 velec = _mm_andnot_ps(dummy_mask,velec);
1198 velecsum = _mm_add_ps(velecsum,velec);
1200 fscal = felec;
1202 fscal = _mm_and_ps(fscal,cutoff_mask);
1204 fscal = _mm_andnot_ps(dummy_mask,fscal);
1206 /* Calculate temporary vectorial force */
1207 tx = _mm_mul_ps(fscal,dx21);
1208 ty = _mm_mul_ps(fscal,dy21);
1209 tz = _mm_mul_ps(fscal,dz21);
1211 /* Update vectorial force */
1212 fix2 = _mm_add_ps(fix2,tx);
1213 fiy2 = _mm_add_ps(fiy2,ty);
1214 fiz2 = _mm_add_ps(fiz2,tz);
1216 fjx1 = _mm_add_ps(fjx1,tx);
1217 fjy1 = _mm_add_ps(fjy1,ty);
1218 fjz1 = _mm_add_ps(fjz1,tz);
1222 /**************************
1223 * CALCULATE INTERACTIONS *
1224 **************************/
1226 if (gmx_mm_any_lt(rsq22,rcutoff2))
1229 r22 = _mm_mul_ps(rsq22,rinv22);
1230 r22 = _mm_andnot_ps(dummy_mask,r22);
1232 /* EWALD ELECTROSTATICS */
1234 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1235 ewrt = _mm_mul_ps(r22,ewtabscale);
1236 ewitab = _mm_cvttps_epi32(ewrt);
1237 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1238 ewitab = _mm_slli_epi32(ewitab,2);
1239 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1240 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1241 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1242 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1243 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1244 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1245 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1246 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
1247 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1249 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1251 /* Update potential sum for this i atom from the interaction with this j atom. */
1252 velec = _mm_and_ps(velec,cutoff_mask);
1253 velec = _mm_andnot_ps(dummy_mask,velec);
1254 velecsum = _mm_add_ps(velecsum,velec);
1256 fscal = felec;
1258 fscal = _mm_and_ps(fscal,cutoff_mask);
1260 fscal = _mm_andnot_ps(dummy_mask,fscal);
1262 /* Calculate temporary vectorial force */
1263 tx = _mm_mul_ps(fscal,dx22);
1264 ty = _mm_mul_ps(fscal,dy22);
1265 tz = _mm_mul_ps(fscal,dz22);
1267 /* Update vectorial force */
1268 fix2 = _mm_add_ps(fix2,tx);
1269 fiy2 = _mm_add_ps(fiy2,ty);
1270 fiz2 = _mm_add_ps(fiz2,tz);
1272 fjx2 = _mm_add_ps(fjx2,tx);
1273 fjy2 = _mm_add_ps(fjy2,ty);
1274 fjz2 = _mm_add_ps(fjz2,tz);
1278 /**************************
1279 * CALCULATE INTERACTIONS *
1280 **************************/
1282 if (gmx_mm_any_lt(rsq23,rcutoff2))
1285 r23 = _mm_mul_ps(rsq23,rinv23);
1286 r23 = _mm_andnot_ps(dummy_mask,r23);
1288 /* EWALD ELECTROSTATICS */
1290 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1291 ewrt = _mm_mul_ps(r23,ewtabscale);
1292 ewitab = _mm_cvttps_epi32(ewrt);
1293 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1294 ewitab = _mm_slli_epi32(ewitab,2);
1295 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1296 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1297 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1298 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1299 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1300 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1301 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1302 velec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_sub_ps(rinv23,sh_ewald),velec));
1303 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
1305 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
1307 /* Update potential sum for this i atom from the interaction with this j atom. */
1308 velec = _mm_and_ps(velec,cutoff_mask);
1309 velec = _mm_andnot_ps(dummy_mask,velec);
1310 velecsum = _mm_add_ps(velecsum,velec);
1312 fscal = felec;
1314 fscal = _mm_and_ps(fscal,cutoff_mask);
1316 fscal = _mm_andnot_ps(dummy_mask,fscal);
1318 /* Calculate temporary vectorial force */
1319 tx = _mm_mul_ps(fscal,dx23);
1320 ty = _mm_mul_ps(fscal,dy23);
1321 tz = _mm_mul_ps(fscal,dz23);
1323 /* Update vectorial force */
1324 fix2 = _mm_add_ps(fix2,tx);
1325 fiy2 = _mm_add_ps(fiy2,ty);
1326 fiz2 = _mm_add_ps(fiz2,tz);
1328 fjx3 = _mm_add_ps(fjx3,tx);
1329 fjy3 = _mm_add_ps(fjy3,ty);
1330 fjz3 = _mm_add_ps(fjz3,tz);
1334 /**************************
1335 * CALCULATE INTERACTIONS *
1336 **************************/
1338 if (gmx_mm_any_lt(rsq31,rcutoff2))
1341 r31 = _mm_mul_ps(rsq31,rinv31);
1342 r31 = _mm_andnot_ps(dummy_mask,r31);
1344 /* EWALD ELECTROSTATICS */
1346 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1347 ewrt = _mm_mul_ps(r31,ewtabscale);
1348 ewitab = _mm_cvttps_epi32(ewrt);
1349 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1350 ewitab = _mm_slli_epi32(ewitab,2);
1351 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1352 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1353 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1354 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1355 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1356 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1357 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1358 velec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_sub_ps(rinv31,sh_ewald),velec));
1359 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
1361 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
1363 /* Update potential sum for this i atom from the interaction with this j atom. */
1364 velec = _mm_and_ps(velec,cutoff_mask);
1365 velec = _mm_andnot_ps(dummy_mask,velec);
1366 velecsum = _mm_add_ps(velecsum,velec);
1368 fscal = felec;
1370 fscal = _mm_and_ps(fscal,cutoff_mask);
1372 fscal = _mm_andnot_ps(dummy_mask,fscal);
1374 /* Calculate temporary vectorial force */
1375 tx = _mm_mul_ps(fscal,dx31);
1376 ty = _mm_mul_ps(fscal,dy31);
1377 tz = _mm_mul_ps(fscal,dz31);
1379 /* Update vectorial force */
1380 fix3 = _mm_add_ps(fix3,tx);
1381 fiy3 = _mm_add_ps(fiy3,ty);
1382 fiz3 = _mm_add_ps(fiz3,tz);
1384 fjx1 = _mm_add_ps(fjx1,tx);
1385 fjy1 = _mm_add_ps(fjy1,ty);
1386 fjz1 = _mm_add_ps(fjz1,tz);
1390 /**************************
1391 * CALCULATE INTERACTIONS *
1392 **************************/
1394 if (gmx_mm_any_lt(rsq32,rcutoff2))
1397 r32 = _mm_mul_ps(rsq32,rinv32);
1398 r32 = _mm_andnot_ps(dummy_mask,r32);
1400 /* EWALD ELECTROSTATICS */
1402 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1403 ewrt = _mm_mul_ps(r32,ewtabscale);
1404 ewitab = _mm_cvttps_epi32(ewrt);
1405 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1406 ewitab = _mm_slli_epi32(ewitab,2);
1407 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1408 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1409 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1410 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1411 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1412 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1413 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1414 velec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_sub_ps(rinv32,sh_ewald),velec));
1415 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
1417 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
1419 /* Update potential sum for this i atom from the interaction with this j atom. */
1420 velec = _mm_and_ps(velec,cutoff_mask);
1421 velec = _mm_andnot_ps(dummy_mask,velec);
1422 velecsum = _mm_add_ps(velecsum,velec);
1424 fscal = felec;
1426 fscal = _mm_and_ps(fscal,cutoff_mask);
1428 fscal = _mm_andnot_ps(dummy_mask,fscal);
1430 /* Calculate temporary vectorial force */
1431 tx = _mm_mul_ps(fscal,dx32);
1432 ty = _mm_mul_ps(fscal,dy32);
1433 tz = _mm_mul_ps(fscal,dz32);
1435 /* Update vectorial force */
1436 fix3 = _mm_add_ps(fix3,tx);
1437 fiy3 = _mm_add_ps(fiy3,ty);
1438 fiz3 = _mm_add_ps(fiz3,tz);
1440 fjx2 = _mm_add_ps(fjx2,tx);
1441 fjy2 = _mm_add_ps(fjy2,ty);
1442 fjz2 = _mm_add_ps(fjz2,tz);
1446 /**************************
1447 * CALCULATE INTERACTIONS *
1448 **************************/
1450 if (gmx_mm_any_lt(rsq33,rcutoff2))
1453 r33 = _mm_mul_ps(rsq33,rinv33);
1454 r33 = _mm_andnot_ps(dummy_mask,r33);
1456 /* EWALD ELECTROSTATICS */
1458 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1459 ewrt = _mm_mul_ps(r33,ewtabscale);
1460 ewitab = _mm_cvttps_epi32(ewrt);
1461 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1462 ewitab = _mm_slli_epi32(ewitab,2);
1463 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1464 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1465 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1466 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1467 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1468 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1469 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1470 velec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_sub_ps(rinv33,sh_ewald),velec));
1471 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
1473 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
1475 /* Update potential sum for this i atom from the interaction with this j atom. */
1476 velec = _mm_and_ps(velec,cutoff_mask);
1477 velec = _mm_andnot_ps(dummy_mask,velec);
1478 velecsum = _mm_add_ps(velecsum,velec);
1480 fscal = felec;
1482 fscal = _mm_and_ps(fscal,cutoff_mask);
1484 fscal = _mm_andnot_ps(dummy_mask,fscal);
1486 /* Calculate temporary vectorial force */
1487 tx = _mm_mul_ps(fscal,dx33);
1488 ty = _mm_mul_ps(fscal,dy33);
1489 tz = _mm_mul_ps(fscal,dz33);
1491 /* Update vectorial force */
1492 fix3 = _mm_add_ps(fix3,tx);
1493 fiy3 = _mm_add_ps(fiy3,ty);
1494 fiz3 = _mm_add_ps(fiz3,tz);
1496 fjx3 = _mm_add_ps(fjx3,tx);
1497 fjy3 = _mm_add_ps(fjy3,ty);
1498 fjz3 = _mm_add_ps(fjz3,tz);
1502 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1503 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1504 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1505 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1507 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1508 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1509 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1511 /* Inner loop uses 467 flops */
1514 /* End of innermost loop */
1516 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1517 f+i_coord_offset,fshift+i_shift_offset);
1519 ggid = gid[iidx];
1520 /* Update potential energies */
1521 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1522 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1524 /* Increment number of inner iterations */
1525 inneriter += j_index_end - j_index_start;
1527 /* Outer loop uses 26 flops */
1530 /* Increment number of outer iterations */
1531 outeriter += nri;
1533 /* Update outer/inner flops */
1535 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*467);
1538 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sse2_single
1539 * Electrostatics interaction: Ewald
1540 * VdW interaction: LennardJones
1541 * Geometry: Water4-Water4
1542 * Calculate force/pot: Force
1544 void
1545 nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sse2_single
1546 (t_nblist * gmx_restrict nlist,
1547 rvec * gmx_restrict xx,
1548 rvec * gmx_restrict ff,
1549 t_forcerec * gmx_restrict fr,
1550 t_mdatoms * gmx_restrict mdatoms,
1551 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1552 t_nrnb * gmx_restrict nrnb)
1554 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1555 * just 0 for non-waters.
1556 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1557 * jnr indices corresponding to data put in the four positions in the SIMD register.
1559 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1560 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1561 int jnrA,jnrB,jnrC,jnrD;
1562 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1563 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1564 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1565 real rcutoff_scalar;
1566 real *shiftvec,*fshift,*x,*f;
1567 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1568 real scratch[4*DIM];
1569 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1570 int vdwioffset0;
1571 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1572 int vdwioffset1;
1573 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1574 int vdwioffset2;
1575 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1576 int vdwioffset3;
1577 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1578 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1579 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1580 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1581 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1582 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1583 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1584 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1585 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1586 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1587 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1588 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1589 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1590 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1591 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1592 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1593 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1594 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1595 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1596 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1597 real *charge;
1598 int nvdwtype;
1599 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1600 int *vdwtype;
1601 real *vdwparam;
1602 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1603 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1604 __m128i ewitab;
1605 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1606 real *ewtab;
1607 __m128 dummy_mask,cutoff_mask;
1608 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1609 __m128 one = _mm_set1_ps(1.0);
1610 __m128 two = _mm_set1_ps(2.0);
1611 x = xx[0];
1612 f = ff[0];
1614 nri = nlist->nri;
1615 iinr = nlist->iinr;
1616 jindex = nlist->jindex;
1617 jjnr = nlist->jjnr;
1618 shiftidx = nlist->shift;
1619 gid = nlist->gid;
1620 shiftvec = fr->shift_vec[0];
1621 fshift = fr->fshift[0];
1622 facel = _mm_set1_ps(fr->epsfac);
1623 charge = mdatoms->chargeA;
1624 nvdwtype = fr->ntype;
1625 vdwparam = fr->nbfp;
1626 vdwtype = mdatoms->typeA;
1628 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1629 ewtab = fr->ic->tabq_coul_F;
1630 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1631 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1633 /* Setup water-specific parameters */
1634 inr = nlist->iinr[0];
1635 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1636 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1637 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1638 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1640 jq1 = _mm_set1_ps(charge[inr+1]);
1641 jq2 = _mm_set1_ps(charge[inr+2]);
1642 jq3 = _mm_set1_ps(charge[inr+3]);
1643 vdwjidx0A = 2*vdwtype[inr+0];
1644 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1645 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1646 qq11 = _mm_mul_ps(iq1,jq1);
1647 qq12 = _mm_mul_ps(iq1,jq2);
1648 qq13 = _mm_mul_ps(iq1,jq3);
1649 qq21 = _mm_mul_ps(iq2,jq1);
1650 qq22 = _mm_mul_ps(iq2,jq2);
1651 qq23 = _mm_mul_ps(iq2,jq3);
1652 qq31 = _mm_mul_ps(iq3,jq1);
1653 qq32 = _mm_mul_ps(iq3,jq2);
1654 qq33 = _mm_mul_ps(iq3,jq3);
1656 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1657 rcutoff_scalar = fr->rcoulomb;
1658 rcutoff = _mm_set1_ps(rcutoff_scalar);
1659 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1661 sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
1662 rvdw = _mm_set1_ps(fr->rvdw);
1664 /* Avoid stupid compiler warnings */
1665 jnrA = jnrB = jnrC = jnrD = 0;
1666 j_coord_offsetA = 0;
1667 j_coord_offsetB = 0;
1668 j_coord_offsetC = 0;
1669 j_coord_offsetD = 0;
1671 outeriter = 0;
1672 inneriter = 0;
1674 for(iidx=0;iidx<4*DIM;iidx++)
1676 scratch[iidx] = 0.0;
1679 /* Start outer loop over neighborlists */
1680 for(iidx=0; iidx<nri; iidx++)
1682 /* Load shift vector for this list */
1683 i_shift_offset = DIM*shiftidx[iidx];
1685 /* Load limits for loop over neighbors */
1686 j_index_start = jindex[iidx];
1687 j_index_end = jindex[iidx+1];
1689 /* Get outer coordinate index */
1690 inr = iinr[iidx];
1691 i_coord_offset = DIM*inr;
1693 /* Load i particle coords and add shift vector */
1694 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1695 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1697 fix0 = _mm_setzero_ps();
1698 fiy0 = _mm_setzero_ps();
1699 fiz0 = _mm_setzero_ps();
1700 fix1 = _mm_setzero_ps();
1701 fiy1 = _mm_setzero_ps();
1702 fiz1 = _mm_setzero_ps();
1703 fix2 = _mm_setzero_ps();
1704 fiy2 = _mm_setzero_ps();
1705 fiz2 = _mm_setzero_ps();
1706 fix3 = _mm_setzero_ps();
1707 fiy3 = _mm_setzero_ps();
1708 fiz3 = _mm_setzero_ps();
1710 /* Start inner kernel loop */
1711 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1714 /* Get j neighbor index, and coordinate index */
1715 jnrA = jjnr[jidx];
1716 jnrB = jjnr[jidx+1];
1717 jnrC = jjnr[jidx+2];
1718 jnrD = jjnr[jidx+3];
1719 j_coord_offsetA = DIM*jnrA;
1720 j_coord_offsetB = DIM*jnrB;
1721 j_coord_offsetC = DIM*jnrC;
1722 j_coord_offsetD = DIM*jnrD;
1724 /* load j atom coordinates */
1725 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1726 x+j_coord_offsetC,x+j_coord_offsetD,
1727 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1728 &jy2,&jz2,&jx3,&jy3,&jz3);
1730 /* Calculate displacement vector */
1731 dx00 = _mm_sub_ps(ix0,jx0);
1732 dy00 = _mm_sub_ps(iy0,jy0);
1733 dz00 = _mm_sub_ps(iz0,jz0);
1734 dx11 = _mm_sub_ps(ix1,jx1);
1735 dy11 = _mm_sub_ps(iy1,jy1);
1736 dz11 = _mm_sub_ps(iz1,jz1);
1737 dx12 = _mm_sub_ps(ix1,jx2);
1738 dy12 = _mm_sub_ps(iy1,jy2);
1739 dz12 = _mm_sub_ps(iz1,jz2);
1740 dx13 = _mm_sub_ps(ix1,jx3);
1741 dy13 = _mm_sub_ps(iy1,jy3);
1742 dz13 = _mm_sub_ps(iz1,jz3);
1743 dx21 = _mm_sub_ps(ix2,jx1);
1744 dy21 = _mm_sub_ps(iy2,jy1);
1745 dz21 = _mm_sub_ps(iz2,jz1);
1746 dx22 = _mm_sub_ps(ix2,jx2);
1747 dy22 = _mm_sub_ps(iy2,jy2);
1748 dz22 = _mm_sub_ps(iz2,jz2);
1749 dx23 = _mm_sub_ps(ix2,jx3);
1750 dy23 = _mm_sub_ps(iy2,jy3);
1751 dz23 = _mm_sub_ps(iz2,jz3);
1752 dx31 = _mm_sub_ps(ix3,jx1);
1753 dy31 = _mm_sub_ps(iy3,jy1);
1754 dz31 = _mm_sub_ps(iz3,jz1);
1755 dx32 = _mm_sub_ps(ix3,jx2);
1756 dy32 = _mm_sub_ps(iy3,jy2);
1757 dz32 = _mm_sub_ps(iz3,jz2);
1758 dx33 = _mm_sub_ps(ix3,jx3);
1759 dy33 = _mm_sub_ps(iy3,jy3);
1760 dz33 = _mm_sub_ps(iz3,jz3);
1762 /* Calculate squared distance and things based on it */
1763 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1764 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1765 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1766 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1767 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1768 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1769 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1770 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1771 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1772 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1774 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1775 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1776 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1777 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1778 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1779 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1780 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1781 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1782 rinv33 = gmx_mm_invsqrt_ps(rsq33);
1784 rinvsq00 = gmx_mm_inv_ps(rsq00);
1785 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1786 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1787 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1788 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1789 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1790 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1791 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1792 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1793 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1795 fjx0 = _mm_setzero_ps();
1796 fjy0 = _mm_setzero_ps();
1797 fjz0 = _mm_setzero_ps();
1798 fjx1 = _mm_setzero_ps();
1799 fjy1 = _mm_setzero_ps();
1800 fjz1 = _mm_setzero_ps();
1801 fjx2 = _mm_setzero_ps();
1802 fjy2 = _mm_setzero_ps();
1803 fjz2 = _mm_setzero_ps();
1804 fjx3 = _mm_setzero_ps();
1805 fjy3 = _mm_setzero_ps();
1806 fjz3 = _mm_setzero_ps();
1808 /**************************
1809 * CALCULATE INTERACTIONS *
1810 **************************/
1812 if (gmx_mm_any_lt(rsq00,rcutoff2))
1815 /* LENNARD-JONES DISPERSION/REPULSION */
1817 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1818 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1820 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1822 fscal = fvdw;
1824 fscal = _mm_and_ps(fscal,cutoff_mask);
1826 /* Calculate temporary vectorial force */
1827 tx = _mm_mul_ps(fscal,dx00);
1828 ty = _mm_mul_ps(fscal,dy00);
1829 tz = _mm_mul_ps(fscal,dz00);
1831 /* Update vectorial force */
1832 fix0 = _mm_add_ps(fix0,tx);
1833 fiy0 = _mm_add_ps(fiy0,ty);
1834 fiz0 = _mm_add_ps(fiz0,tz);
1836 fjx0 = _mm_add_ps(fjx0,tx);
1837 fjy0 = _mm_add_ps(fjy0,ty);
1838 fjz0 = _mm_add_ps(fjz0,tz);
1842 /**************************
1843 * CALCULATE INTERACTIONS *
1844 **************************/
1846 if (gmx_mm_any_lt(rsq11,rcutoff2))
1849 r11 = _mm_mul_ps(rsq11,rinv11);
1851 /* EWALD ELECTROSTATICS */
1853 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1854 ewrt = _mm_mul_ps(r11,ewtabscale);
1855 ewitab = _mm_cvttps_epi32(ewrt);
1856 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1857 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1858 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1859 &ewtabF,&ewtabFn);
1860 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1861 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1863 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1865 fscal = felec;
1867 fscal = _mm_and_ps(fscal,cutoff_mask);
1869 /* Calculate temporary vectorial force */
1870 tx = _mm_mul_ps(fscal,dx11);
1871 ty = _mm_mul_ps(fscal,dy11);
1872 tz = _mm_mul_ps(fscal,dz11);
1874 /* Update vectorial force */
1875 fix1 = _mm_add_ps(fix1,tx);
1876 fiy1 = _mm_add_ps(fiy1,ty);
1877 fiz1 = _mm_add_ps(fiz1,tz);
1879 fjx1 = _mm_add_ps(fjx1,tx);
1880 fjy1 = _mm_add_ps(fjy1,ty);
1881 fjz1 = _mm_add_ps(fjz1,tz);
1885 /**************************
1886 * CALCULATE INTERACTIONS *
1887 **************************/
1889 if (gmx_mm_any_lt(rsq12,rcutoff2))
1892 r12 = _mm_mul_ps(rsq12,rinv12);
1894 /* EWALD ELECTROSTATICS */
1896 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1897 ewrt = _mm_mul_ps(r12,ewtabscale);
1898 ewitab = _mm_cvttps_epi32(ewrt);
1899 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1900 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1901 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1902 &ewtabF,&ewtabFn);
1903 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1904 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1906 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1908 fscal = felec;
1910 fscal = _mm_and_ps(fscal,cutoff_mask);
1912 /* Calculate temporary vectorial force */
1913 tx = _mm_mul_ps(fscal,dx12);
1914 ty = _mm_mul_ps(fscal,dy12);
1915 tz = _mm_mul_ps(fscal,dz12);
1917 /* Update vectorial force */
1918 fix1 = _mm_add_ps(fix1,tx);
1919 fiy1 = _mm_add_ps(fiy1,ty);
1920 fiz1 = _mm_add_ps(fiz1,tz);
1922 fjx2 = _mm_add_ps(fjx2,tx);
1923 fjy2 = _mm_add_ps(fjy2,ty);
1924 fjz2 = _mm_add_ps(fjz2,tz);
1928 /**************************
1929 * CALCULATE INTERACTIONS *
1930 **************************/
1932 if (gmx_mm_any_lt(rsq13,rcutoff2))
1935 r13 = _mm_mul_ps(rsq13,rinv13);
1937 /* EWALD ELECTROSTATICS */
1939 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1940 ewrt = _mm_mul_ps(r13,ewtabscale);
1941 ewitab = _mm_cvttps_epi32(ewrt);
1942 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1943 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1944 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1945 &ewtabF,&ewtabFn);
1946 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1947 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
1949 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
1951 fscal = felec;
1953 fscal = _mm_and_ps(fscal,cutoff_mask);
1955 /* Calculate temporary vectorial force */
1956 tx = _mm_mul_ps(fscal,dx13);
1957 ty = _mm_mul_ps(fscal,dy13);
1958 tz = _mm_mul_ps(fscal,dz13);
1960 /* Update vectorial force */
1961 fix1 = _mm_add_ps(fix1,tx);
1962 fiy1 = _mm_add_ps(fiy1,ty);
1963 fiz1 = _mm_add_ps(fiz1,tz);
1965 fjx3 = _mm_add_ps(fjx3,tx);
1966 fjy3 = _mm_add_ps(fjy3,ty);
1967 fjz3 = _mm_add_ps(fjz3,tz);
1971 /**************************
1972 * CALCULATE INTERACTIONS *
1973 **************************/
1975 if (gmx_mm_any_lt(rsq21,rcutoff2))
1978 r21 = _mm_mul_ps(rsq21,rinv21);
1980 /* EWALD ELECTROSTATICS */
1982 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1983 ewrt = _mm_mul_ps(r21,ewtabscale);
1984 ewitab = _mm_cvttps_epi32(ewrt);
1985 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1986 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1987 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1988 &ewtabF,&ewtabFn);
1989 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1990 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1992 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1994 fscal = felec;
1996 fscal = _mm_and_ps(fscal,cutoff_mask);
1998 /* Calculate temporary vectorial force */
1999 tx = _mm_mul_ps(fscal,dx21);
2000 ty = _mm_mul_ps(fscal,dy21);
2001 tz = _mm_mul_ps(fscal,dz21);
2003 /* Update vectorial force */
2004 fix2 = _mm_add_ps(fix2,tx);
2005 fiy2 = _mm_add_ps(fiy2,ty);
2006 fiz2 = _mm_add_ps(fiz2,tz);
2008 fjx1 = _mm_add_ps(fjx1,tx);
2009 fjy1 = _mm_add_ps(fjy1,ty);
2010 fjz1 = _mm_add_ps(fjz1,tz);
2014 /**************************
2015 * CALCULATE INTERACTIONS *
2016 **************************/
2018 if (gmx_mm_any_lt(rsq22,rcutoff2))
2021 r22 = _mm_mul_ps(rsq22,rinv22);
2023 /* EWALD ELECTROSTATICS */
2025 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2026 ewrt = _mm_mul_ps(r22,ewtabscale);
2027 ewitab = _mm_cvttps_epi32(ewrt);
2028 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2029 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2030 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2031 &ewtabF,&ewtabFn);
2032 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2033 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2035 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2037 fscal = felec;
2039 fscal = _mm_and_ps(fscal,cutoff_mask);
2041 /* Calculate temporary vectorial force */
2042 tx = _mm_mul_ps(fscal,dx22);
2043 ty = _mm_mul_ps(fscal,dy22);
2044 tz = _mm_mul_ps(fscal,dz22);
2046 /* Update vectorial force */
2047 fix2 = _mm_add_ps(fix2,tx);
2048 fiy2 = _mm_add_ps(fiy2,ty);
2049 fiz2 = _mm_add_ps(fiz2,tz);
2051 fjx2 = _mm_add_ps(fjx2,tx);
2052 fjy2 = _mm_add_ps(fjy2,ty);
2053 fjz2 = _mm_add_ps(fjz2,tz);
2057 /**************************
2058 * CALCULATE INTERACTIONS *
2059 **************************/
2061 if (gmx_mm_any_lt(rsq23,rcutoff2))
2064 r23 = _mm_mul_ps(rsq23,rinv23);
2066 /* EWALD ELECTROSTATICS */
2068 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2069 ewrt = _mm_mul_ps(r23,ewtabscale);
2070 ewitab = _mm_cvttps_epi32(ewrt);
2071 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2072 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2073 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2074 &ewtabF,&ewtabFn);
2075 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2076 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
2078 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
2080 fscal = felec;
2082 fscal = _mm_and_ps(fscal,cutoff_mask);
2084 /* Calculate temporary vectorial force */
2085 tx = _mm_mul_ps(fscal,dx23);
2086 ty = _mm_mul_ps(fscal,dy23);
2087 tz = _mm_mul_ps(fscal,dz23);
2089 /* Update vectorial force */
2090 fix2 = _mm_add_ps(fix2,tx);
2091 fiy2 = _mm_add_ps(fiy2,ty);
2092 fiz2 = _mm_add_ps(fiz2,tz);
2094 fjx3 = _mm_add_ps(fjx3,tx);
2095 fjy3 = _mm_add_ps(fjy3,ty);
2096 fjz3 = _mm_add_ps(fjz3,tz);
2100 /**************************
2101 * CALCULATE INTERACTIONS *
2102 **************************/
2104 if (gmx_mm_any_lt(rsq31,rcutoff2))
2107 r31 = _mm_mul_ps(rsq31,rinv31);
2109 /* EWALD ELECTROSTATICS */
2111 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2112 ewrt = _mm_mul_ps(r31,ewtabscale);
2113 ewitab = _mm_cvttps_epi32(ewrt);
2114 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2115 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2116 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2117 &ewtabF,&ewtabFn);
2118 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2119 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
2121 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
2123 fscal = felec;
2125 fscal = _mm_and_ps(fscal,cutoff_mask);
2127 /* Calculate temporary vectorial force */
2128 tx = _mm_mul_ps(fscal,dx31);
2129 ty = _mm_mul_ps(fscal,dy31);
2130 tz = _mm_mul_ps(fscal,dz31);
2132 /* Update vectorial force */
2133 fix3 = _mm_add_ps(fix3,tx);
2134 fiy3 = _mm_add_ps(fiy3,ty);
2135 fiz3 = _mm_add_ps(fiz3,tz);
2137 fjx1 = _mm_add_ps(fjx1,tx);
2138 fjy1 = _mm_add_ps(fjy1,ty);
2139 fjz1 = _mm_add_ps(fjz1,tz);
2143 /**************************
2144 * CALCULATE INTERACTIONS *
2145 **************************/
2147 if (gmx_mm_any_lt(rsq32,rcutoff2))
2150 r32 = _mm_mul_ps(rsq32,rinv32);
2152 /* EWALD ELECTROSTATICS */
2154 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2155 ewrt = _mm_mul_ps(r32,ewtabscale);
2156 ewitab = _mm_cvttps_epi32(ewrt);
2157 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2158 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2159 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2160 &ewtabF,&ewtabFn);
2161 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2162 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
2164 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
2166 fscal = felec;
2168 fscal = _mm_and_ps(fscal,cutoff_mask);
2170 /* Calculate temporary vectorial force */
2171 tx = _mm_mul_ps(fscal,dx32);
2172 ty = _mm_mul_ps(fscal,dy32);
2173 tz = _mm_mul_ps(fscal,dz32);
2175 /* Update vectorial force */
2176 fix3 = _mm_add_ps(fix3,tx);
2177 fiy3 = _mm_add_ps(fiy3,ty);
2178 fiz3 = _mm_add_ps(fiz3,tz);
2180 fjx2 = _mm_add_ps(fjx2,tx);
2181 fjy2 = _mm_add_ps(fjy2,ty);
2182 fjz2 = _mm_add_ps(fjz2,tz);
2186 /**************************
2187 * CALCULATE INTERACTIONS *
2188 **************************/
2190 if (gmx_mm_any_lt(rsq33,rcutoff2))
2193 r33 = _mm_mul_ps(rsq33,rinv33);
2195 /* EWALD ELECTROSTATICS */
2197 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2198 ewrt = _mm_mul_ps(r33,ewtabscale);
2199 ewitab = _mm_cvttps_epi32(ewrt);
2200 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2201 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2202 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2203 &ewtabF,&ewtabFn);
2204 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2205 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
2207 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
2209 fscal = felec;
2211 fscal = _mm_and_ps(fscal,cutoff_mask);
2213 /* Calculate temporary vectorial force */
2214 tx = _mm_mul_ps(fscal,dx33);
2215 ty = _mm_mul_ps(fscal,dy33);
2216 tz = _mm_mul_ps(fscal,dz33);
2218 /* Update vectorial force */
2219 fix3 = _mm_add_ps(fix3,tx);
2220 fiy3 = _mm_add_ps(fiy3,ty);
2221 fiz3 = _mm_add_ps(fiz3,tz);
2223 fjx3 = _mm_add_ps(fjx3,tx);
2224 fjy3 = _mm_add_ps(fjy3,ty);
2225 fjz3 = _mm_add_ps(fjz3,tz);
2229 fjptrA = f+j_coord_offsetA;
2230 fjptrB = f+j_coord_offsetB;
2231 fjptrC = f+j_coord_offsetC;
2232 fjptrD = f+j_coord_offsetD;
2234 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2235 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2236 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2238 /* Inner loop uses 384 flops */
2241 if(jidx<j_index_end)
2244 /* Get j neighbor index, and coordinate index */
2245 jnrlistA = jjnr[jidx];
2246 jnrlistB = jjnr[jidx+1];
2247 jnrlistC = jjnr[jidx+2];
2248 jnrlistD = jjnr[jidx+3];
2249 /* Sign of each element will be negative for non-real atoms.
2250 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2251 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2253 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2254 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2255 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2256 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2257 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2258 j_coord_offsetA = DIM*jnrA;
2259 j_coord_offsetB = DIM*jnrB;
2260 j_coord_offsetC = DIM*jnrC;
2261 j_coord_offsetD = DIM*jnrD;
2263 /* load j atom coordinates */
2264 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2265 x+j_coord_offsetC,x+j_coord_offsetD,
2266 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2267 &jy2,&jz2,&jx3,&jy3,&jz3);
2269 /* Calculate displacement vector */
2270 dx00 = _mm_sub_ps(ix0,jx0);
2271 dy00 = _mm_sub_ps(iy0,jy0);
2272 dz00 = _mm_sub_ps(iz0,jz0);
2273 dx11 = _mm_sub_ps(ix1,jx1);
2274 dy11 = _mm_sub_ps(iy1,jy1);
2275 dz11 = _mm_sub_ps(iz1,jz1);
2276 dx12 = _mm_sub_ps(ix1,jx2);
2277 dy12 = _mm_sub_ps(iy1,jy2);
2278 dz12 = _mm_sub_ps(iz1,jz2);
2279 dx13 = _mm_sub_ps(ix1,jx3);
2280 dy13 = _mm_sub_ps(iy1,jy3);
2281 dz13 = _mm_sub_ps(iz1,jz3);
2282 dx21 = _mm_sub_ps(ix2,jx1);
2283 dy21 = _mm_sub_ps(iy2,jy1);
2284 dz21 = _mm_sub_ps(iz2,jz1);
2285 dx22 = _mm_sub_ps(ix2,jx2);
2286 dy22 = _mm_sub_ps(iy2,jy2);
2287 dz22 = _mm_sub_ps(iz2,jz2);
2288 dx23 = _mm_sub_ps(ix2,jx3);
2289 dy23 = _mm_sub_ps(iy2,jy3);
2290 dz23 = _mm_sub_ps(iz2,jz3);
2291 dx31 = _mm_sub_ps(ix3,jx1);
2292 dy31 = _mm_sub_ps(iy3,jy1);
2293 dz31 = _mm_sub_ps(iz3,jz1);
2294 dx32 = _mm_sub_ps(ix3,jx2);
2295 dy32 = _mm_sub_ps(iy3,jy2);
2296 dz32 = _mm_sub_ps(iz3,jz2);
2297 dx33 = _mm_sub_ps(ix3,jx3);
2298 dy33 = _mm_sub_ps(iy3,jy3);
2299 dz33 = _mm_sub_ps(iz3,jz3);
2301 /* Calculate squared distance and things based on it */
2302 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2303 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2304 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2305 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2306 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2307 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2308 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2309 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2310 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2311 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2313 rinv11 = gmx_mm_invsqrt_ps(rsq11);
2314 rinv12 = gmx_mm_invsqrt_ps(rsq12);
2315 rinv13 = gmx_mm_invsqrt_ps(rsq13);
2316 rinv21 = gmx_mm_invsqrt_ps(rsq21);
2317 rinv22 = gmx_mm_invsqrt_ps(rsq22);
2318 rinv23 = gmx_mm_invsqrt_ps(rsq23);
2319 rinv31 = gmx_mm_invsqrt_ps(rsq31);
2320 rinv32 = gmx_mm_invsqrt_ps(rsq32);
2321 rinv33 = gmx_mm_invsqrt_ps(rsq33);
2323 rinvsq00 = gmx_mm_inv_ps(rsq00);
2324 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2325 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2326 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
2327 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2328 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2329 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
2330 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
2331 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
2332 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
2334 fjx0 = _mm_setzero_ps();
2335 fjy0 = _mm_setzero_ps();
2336 fjz0 = _mm_setzero_ps();
2337 fjx1 = _mm_setzero_ps();
2338 fjy1 = _mm_setzero_ps();
2339 fjz1 = _mm_setzero_ps();
2340 fjx2 = _mm_setzero_ps();
2341 fjy2 = _mm_setzero_ps();
2342 fjz2 = _mm_setzero_ps();
2343 fjx3 = _mm_setzero_ps();
2344 fjy3 = _mm_setzero_ps();
2345 fjz3 = _mm_setzero_ps();
2347 /**************************
2348 * CALCULATE INTERACTIONS *
2349 **************************/
2351 if (gmx_mm_any_lt(rsq00,rcutoff2))
2354 /* LENNARD-JONES DISPERSION/REPULSION */
2356 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2357 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
2359 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
2361 fscal = fvdw;
2363 fscal = _mm_and_ps(fscal,cutoff_mask);
2365 fscal = _mm_andnot_ps(dummy_mask,fscal);
2367 /* Calculate temporary vectorial force */
2368 tx = _mm_mul_ps(fscal,dx00);
2369 ty = _mm_mul_ps(fscal,dy00);
2370 tz = _mm_mul_ps(fscal,dz00);
2372 /* Update vectorial force */
2373 fix0 = _mm_add_ps(fix0,tx);
2374 fiy0 = _mm_add_ps(fiy0,ty);
2375 fiz0 = _mm_add_ps(fiz0,tz);
2377 fjx0 = _mm_add_ps(fjx0,tx);
2378 fjy0 = _mm_add_ps(fjy0,ty);
2379 fjz0 = _mm_add_ps(fjz0,tz);
2383 /**************************
2384 * CALCULATE INTERACTIONS *
2385 **************************/
2387 if (gmx_mm_any_lt(rsq11,rcutoff2))
2390 r11 = _mm_mul_ps(rsq11,rinv11);
2391 r11 = _mm_andnot_ps(dummy_mask,r11);
2393 /* EWALD ELECTROSTATICS */
2395 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2396 ewrt = _mm_mul_ps(r11,ewtabscale);
2397 ewitab = _mm_cvttps_epi32(ewrt);
2398 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2399 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2400 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2401 &ewtabF,&ewtabFn);
2402 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2403 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2405 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2407 fscal = felec;
2409 fscal = _mm_and_ps(fscal,cutoff_mask);
2411 fscal = _mm_andnot_ps(dummy_mask,fscal);
2413 /* Calculate temporary vectorial force */
2414 tx = _mm_mul_ps(fscal,dx11);
2415 ty = _mm_mul_ps(fscal,dy11);
2416 tz = _mm_mul_ps(fscal,dz11);
2418 /* Update vectorial force */
2419 fix1 = _mm_add_ps(fix1,tx);
2420 fiy1 = _mm_add_ps(fiy1,ty);
2421 fiz1 = _mm_add_ps(fiz1,tz);
2423 fjx1 = _mm_add_ps(fjx1,tx);
2424 fjy1 = _mm_add_ps(fjy1,ty);
2425 fjz1 = _mm_add_ps(fjz1,tz);
2429 /**************************
2430 * CALCULATE INTERACTIONS *
2431 **************************/
2433 if (gmx_mm_any_lt(rsq12,rcutoff2))
2436 r12 = _mm_mul_ps(rsq12,rinv12);
2437 r12 = _mm_andnot_ps(dummy_mask,r12);
2439 /* EWALD ELECTROSTATICS */
2441 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2442 ewrt = _mm_mul_ps(r12,ewtabscale);
2443 ewitab = _mm_cvttps_epi32(ewrt);
2444 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2445 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2446 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2447 &ewtabF,&ewtabFn);
2448 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2449 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2451 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2453 fscal = felec;
2455 fscal = _mm_and_ps(fscal,cutoff_mask);
2457 fscal = _mm_andnot_ps(dummy_mask,fscal);
2459 /* Calculate temporary vectorial force */
2460 tx = _mm_mul_ps(fscal,dx12);
2461 ty = _mm_mul_ps(fscal,dy12);
2462 tz = _mm_mul_ps(fscal,dz12);
2464 /* Update vectorial force */
2465 fix1 = _mm_add_ps(fix1,tx);
2466 fiy1 = _mm_add_ps(fiy1,ty);
2467 fiz1 = _mm_add_ps(fiz1,tz);
2469 fjx2 = _mm_add_ps(fjx2,tx);
2470 fjy2 = _mm_add_ps(fjy2,ty);
2471 fjz2 = _mm_add_ps(fjz2,tz);
2475 /**************************
2476 * CALCULATE INTERACTIONS *
2477 **************************/
2479 if (gmx_mm_any_lt(rsq13,rcutoff2))
2482 r13 = _mm_mul_ps(rsq13,rinv13);
2483 r13 = _mm_andnot_ps(dummy_mask,r13);
2485 /* EWALD ELECTROSTATICS */
2487 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2488 ewrt = _mm_mul_ps(r13,ewtabscale);
2489 ewitab = _mm_cvttps_epi32(ewrt);
2490 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2491 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2492 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2493 &ewtabF,&ewtabFn);
2494 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2495 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
2497 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
2499 fscal = felec;
2501 fscal = _mm_and_ps(fscal,cutoff_mask);
2503 fscal = _mm_andnot_ps(dummy_mask,fscal);
2505 /* Calculate temporary vectorial force */
2506 tx = _mm_mul_ps(fscal,dx13);
2507 ty = _mm_mul_ps(fscal,dy13);
2508 tz = _mm_mul_ps(fscal,dz13);
2510 /* Update vectorial force */
2511 fix1 = _mm_add_ps(fix1,tx);
2512 fiy1 = _mm_add_ps(fiy1,ty);
2513 fiz1 = _mm_add_ps(fiz1,tz);
2515 fjx3 = _mm_add_ps(fjx3,tx);
2516 fjy3 = _mm_add_ps(fjy3,ty);
2517 fjz3 = _mm_add_ps(fjz3,tz);
2521 /**************************
2522 * CALCULATE INTERACTIONS *
2523 **************************/
2525 if (gmx_mm_any_lt(rsq21,rcutoff2))
2528 r21 = _mm_mul_ps(rsq21,rinv21);
2529 r21 = _mm_andnot_ps(dummy_mask,r21);
2531 /* EWALD ELECTROSTATICS */
2533 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2534 ewrt = _mm_mul_ps(r21,ewtabscale);
2535 ewitab = _mm_cvttps_epi32(ewrt);
2536 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2537 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2538 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2539 &ewtabF,&ewtabFn);
2540 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2541 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2543 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2545 fscal = felec;
2547 fscal = _mm_and_ps(fscal,cutoff_mask);
2549 fscal = _mm_andnot_ps(dummy_mask,fscal);
2551 /* Calculate temporary vectorial force */
2552 tx = _mm_mul_ps(fscal,dx21);
2553 ty = _mm_mul_ps(fscal,dy21);
2554 tz = _mm_mul_ps(fscal,dz21);
2556 /* Update vectorial force */
2557 fix2 = _mm_add_ps(fix2,tx);
2558 fiy2 = _mm_add_ps(fiy2,ty);
2559 fiz2 = _mm_add_ps(fiz2,tz);
2561 fjx1 = _mm_add_ps(fjx1,tx);
2562 fjy1 = _mm_add_ps(fjy1,ty);
2563 fjz1 = _mm_add_ps(fjz1,tz);
2567 /**************************
2568 * CALCULATE INTERACTIONS *
2569 **************************/
2571 if (gmx_mm_any_lt(rsq22,rcutoff2))
2574 r22 = _mm_mul_ps(rsq22,rinv22);
2575 r22 = _mm_andnot_ps(dummy_mask,r22);
2577 /* EWALD ELECTROSTATICS */
2579 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2580 ewrt = _mm_mul_ps(r22,ewtabscale);
2581 ewitab = _mm_cvttps_epi32(ewrt);
2582 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2583 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2584 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2585 &ewtabF,&ewtabFn);
2586 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2587 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2589 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2591 fscal = felec;
2593 fscal = _mm_and_ps(fscal,cutoff_mask);
2595 fscal = _mm_andnot_ps(dummy_mask,fscal);
2597 /* Calculate temporary vectorial force */
2598 tx = _mm_mul_ps(fscal,dx22);
2599 ty = _mm_mul_ps(fscal,dy22);
2600 tz = _mm_mul_ps(fscal,dz22);
2602 /* Update vectorial force */
2603 fix2 = _mm_add_ps(fix2,tx);
2604 fiy2 = _mm_add_ps(fiy2,ty);
2605 fiz2 = _mm_add_ps(fiz2,tz);
2607 fjx2 = _mm_add_ps(fjx2,tx);
2608 fjy2 = _mm_add_ps(fjy2,ty);
2609 fjz2 = _mm_add_ps(fjz2,tz);
2613 /**************************
2614 * CALCULATE INTERACTIONS *
2615 **************************/
2617 if (gmx_mm_any_lt(rsq23,rcutoff2))
2620 r23 = _mm_mul_ps(rsq23,rinv23);
2621 r23 = _mm_andnot_ps(dummy_mask,r23);
2623 /* EWALD ELECTROSTATICS */
2625 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2626 ewrt = _mm_mul_ps(r23,ewtabscale);
2627 ewitab = _mm_cvttps_epi32(ewrt);
2628 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2629 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2630 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2631 &ewtabF,&ewtabFn);
2632 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2633 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
2635 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
2637 fscal = felec;
2639 fscal = _mm_and_ps(fscal,cutoff_mask);
2641 fscal = _mm_andnot_ps(dummy_mask,fscal);
2643 /* Calculate temporary vectorial force */
2644 tx = _mm_mul_ps(fscal,dx23);
2645 ty = _mm_mul_ps(fscal,dy23);
2646 tz = _mm_mul_ps(fscal,dz23);
2648 /* Update vectorial force */
2649 fix2 = _mm_add_ps(fix2,tx);
2650 fiy2 = _mm_add_ps(fiy2,ty);
2651 fiz2 = _mm_add_ps(fiz2,tz);
2653 fjx3 = _mm_add_ps(fjx3,tx);
2654 fjy3 = _mm_add_ps(fjy3,ty);
2655 fjz3 = _mm_add_ps(fjz3,tz);
2659 /**************************
2660 * CALCULATE INTERACTIONS *
2661 **************************/
2663 if (gmx_mm_any_lt(rsq31,rcutoff2))
2666 r31 = _mm_mul_ps(rsq31,rinv31);
2667 r31 = _mm_andnot_ps(dummy_mask,r31);
2669 /* EWALD ELECTROSTATICS */
2671 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2672 ewrt = _mm_mul_ps(r31,ewtabscale);
2673 ewitab = _mm_cvttps_epi32(ewrt);
2674 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2675 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2676 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2677 &ewtabF,&ewtabFn);
2678 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2679 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
2681 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
2683 fscal = felec;
2685 fscal = _mm_and_ps(fscal,cutoff_mask);
2687 fscal = _mm_andnot_ps(dummy_mask,fscal);
2689 /* Calculate temporary vectorial force */
2690 tx = _mm_mul_ps(fscal,dx31);
2691 ty = _mm_mul_ps(fscal,dy31);
2692 tz = _mm_mul_ps(fscal,dz31);
2694 /* Update vectorial force */
2695 fix3 = _mm_add_ps(fix3,tx);
2696 fiy3 = _mm_add_ps(fiy3,ty);
2697 fiz3 = _mm_add_ps(fiz3,tz);
2699 fjx1 = _mm_add_ps(fjx1,tx);
2700 fjy1 = _mm_add_ps(fjy1,ty);
2701 fjz1 = _mm_add_ps(fjz1,tz);
2705 /**************************
2706 * CALCULATE INTERACTIONS *
2707 **************************/
2709 if (gmx_mm_any_lt(rsq32,rcutoff2))
2712 r32 = _mm_mul_ps(rsq32,rinv32);
2713 r32 = _mm_andnot_ps(dummy_mask,r32);
2715 /* EWALD ELECTROSTATICS */
2717 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2718 ewrt = _mm_mul_ps(r32,ewtabscale);
2719 ewitab = _mm_cvttps_epi32(ewrt);
2720 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2721 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2722 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2723 &ewtabF,&ewtabFn);
2724 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2725 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
2727 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
2729 fscal = felec;
2731 fscal = _mm_and_ps(fscal,cutoff_mask);
2733 fscal = _mm_andnot_ps(dummy_mask,fscal);
2735 /* Calculate temporary vectorial force */
2736 tx = _mm_mul_ps(fscal,dx32);
2737 ty = _mm_mul_ps(fscal,dy32);
2738 tz = _mm_mul_ps(fscal,dz32);
2740 /* Update vectorial force */
2741 fix3 = _mm_add_ps(fix3,tx);
2742 fiy3 = _mm_add_ps(fiy3,ty);
2743 fiz3 = _mm_add_ps(fiz3,tz);
2745 fjx2 = _mm_add_ps(fjx2,tx);
2746 fjy2 = _mm_add_ps(fjy2,ty);
2747 fjz2 = _mm_add_ps(fjz2,tz);
2751 /**************************
2752 * CALCULATE INTERACTIONS *
2753 **************************/
2755 if (gmx_mm_any_lt(rsq33,rcutoff2))
2758 r33 = _mm_mul_ps(rsq33,rinv33);
2759 r33 = _mm_andnot_ps(dummy_mask,r33);
2761 /* EWALD ELECTROSTATICS */
2763 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2764 ewrt = _mm_mul_ps(r33,ewtabscale);
2765 ewitab = _mm_cvttps_epi32(ewrt);
2766 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2767 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2768 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2769 &ewtabF,&ewtabFn);
2770 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2771 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
2773 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
2775 fscal = felec;
2777 fscal = _mm_and_ps(fscal,cutoff_mask);
2779 fscal = _mm_andnot_ps(dummy_mask,fscal);
2781 /* Calculate temporary vectorial force */
2782 tx = _mm_mul_ps(fscal,dx33);
2783 ty = _mm_mul_ps(fscal,dy33);
2784 tz = _mm_mul_ps(fscal,dz33);
2786 /* Update vectorial force */
2787 fix3 = _mm_add_ps(fix3,tx);
2788 fiy3 = _mm_add_ps(fiy3,ty);
2789 fiz3 = _mm_add_ps(fiz3,tz);
2791 fjx3 = _mm_add_ps(fjx3,tx);
2792 fjy3 = _mm_add_ps(fjy3,ty);
2793 fjz3 = _mm_add_ps(fjz3,tz);
2797 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2798 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2799 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2800 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2802 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2803 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2804 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2806 /* Inner loop uses 393 flops */
2809 /* End of innermost loop */
2811 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2812 f+i_coord_offset,fshift+i_shift_offset);
2814 /* Increment number of inner iterations */
2815 inneriter += j_index_end - j_index_start;
2817 /* Outer loop uses 24 flops */
2820 /* Increment number of outer iterations */
2821 outeriter += nri;
2823 /* Update outer/inner flops */
2825 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*393);