Remove all unnecessary HAVE_CONFIG_H
[gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_single / nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_sse2_single.c
blob4f0f9142dba961b34ca9c2235ddfd2c8fbdbb55f
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_single kernel generator.
38 #include "config.h"
40 #include <math.h>
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "nrnb.h"
47 #include "gromacs/simd/math_x86_sse2_single.h"
48 #include "kernelutil_x86_sse2_single.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse2_single
52 * Electrostatics interaction: Ewald
53 * VdW interaction: LennardJones
54 * Geometry: Water3-Water3
55 * Calculate force/pot: PotentialAndForce
57 void
58 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse2_single
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
78 real rcutoff_scalar;
79 real *shiftvec,*fshift,*x,*f;
80 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81 real scratch[4*DIM];
82 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83 int vdwioffset0;
84 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85 int vdwioffset1;
86 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87 int vdwioffset2;
88 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
90 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
91 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
92 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
93 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
94 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
95 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
97 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
98 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
99 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
100 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
101 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
102 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
103 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
104 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
105 real *charge;
106 int nvdwtype;
107 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
108 int *vdwtype;
109 real *vdwparam;
110 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
111 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
112 __m128i ewitab;
113 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
114 real *ewtab;
115 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
116 real rswitch_scalar,d_scalar;
117 __m128 dummy_mask,cutoff_mask;
118 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
119 __m128 one = _mm_set1_ps(1.0);
120 __m128 two = _mm_set1_ps(2.0);
121 x = xx[0];
122 f = ff[0];
124 nri = nlist->nri;
125 iinr = nlist->iinr;
126 jindex = nlist->jindex;
127 jjnr = nlist->jjnr;
128 shiftidx = nlist->shift;
129 gid = nlist->gid;
130 shiftvec = fr->shift_vec[0];
131 fshift = fr->fshift[0];
132 facel = _mm_set1_ps(fr->epsfac);
133 charge = mdatoms->chargeA;
134 nvdwtype = fr->ntype;
135 vdwparam = fr->nbfp;
136 vdwtype = mdatoms->typeA;
138 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
139 ewtab = fr->ic->tabq_coul_FDV0;
140 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
141 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
143 /* Setup water-specific parameters */
144 inr = nlist->iinr[0];
145 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
146 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
147 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
148 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
150 jq0 = _mm_set1_ps(charge[inr+0]);
151 jq1 = _mm_set1_ps(charge[inr+1]);
152 jq2 = _mm_set1_ps(charge[inr+2]);
153 vdwjidx0A = 2*vdwtype[inr+0];
154 qq00 = _mm_mul_ps(iq0,jq0);
155 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
156 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
157 qq01 = _mm_mul_ps(iq0,jq1);
158 qq02 = _mm_mul_ps(iq0,jq2);
159 qq10 = _mm_mul_ps(iq1,jq0);
160 qq11 = _mm_mul_ps(iq1,jq1);
161 qq12 = _mm_mul_ps(iq1,jq2);
162 qq20 = _mm_mul_ps(iq2,jq0);
163 qq21 = _mm_mul_ps(iq2,jq1);
164 qq22 = _mm_mul_ps(iq2,jq2);
166 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
167 rcutoff_scalar = fr->rcoulomb;
168 rcutoff = _mm_set1_ps(rcutoff_scalar);
169 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
171 rswitch_scalar = fr->rcoulomb_switch;
172 rswitch = _mm_set1_ps(rswitch_scalar);
173 /* Setup switch parameters */
174 d_scalar = rcutoff_scalar-rswitch_scalar;
175 d = _mm_set1_ps(d_scalar);
176 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
177 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
178 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
179 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
180 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
181 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
183 /* Avoid stupid compiler warnings */
184 jnrA = jnrB = jnrC = jnrD = 0;
185 j_coord_offsetA = 0;
186 j_coord_offsetB = 0;
187 j_coord_offsetC = 0;
188 j_coord_offsetD = 0;
190 outeriter = 0;
191 inneriter = 0;
193 for(iidx=0;iidx<4*DIM;iidx++)
195 scratch[iidx] = 0.0;
198 /* Start outer loop over neighborlists */
199 for(iidx=0; iidx<nri; iidx++)
201 /* Load shift vector for this list */
202 i_shift_offset = DIM*shiftidx[iidx];
204 /* Load limits for loop over neighbors */
205 j_index_start = jindex[iidx];
206 j_index_end = jindex[iidx+1];
208 /* Get outer coordinate index */
209 inr = iinr[iidx];
210 i_coord_offset = DIM*inr;
212 /* Load i particle coords and add shift vector */
213 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
214 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
216 fix0 = _mm_setzero_ps();
217 fiy0 = _mm_setzero_ps();
218 fiz0 = _mm_setzero_ps();
219 fix1 = _mm_setzero_ps();
220 fiy1 = _mm_setzero_ps();
221 fiz1 = _mm_setzero_ps();
222 fix2 = _mm_setzero_ps();
223 fiy2 = _mm_setzero_ps();
224 fiz2 = _mm_setzero_ps();
226 /* Reset potential sums */
227 velecsum = _mm_setzero_ps();
228 vvdwsum = _mm_setzero_ps();
230 /* Start inner kernel loop */
231 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
234 /* Get j neighbor index, and coordinate index */
235 jnrA = jjnr[jidx];
236 jnrB = jjnr[jidx+1];
237 jnrC = jjnr[jidx+2];
238 jnrD = jjnr[jidx+3];
239 j_coord_offsetA = DIM*jnrA;
240 j_coord_offsetB = DIM*jnrB;
241 j_coord_offsetC = DIM*jnrC;
242 j_coord_offsetD = DIM*jnrD;
244 /* load j atom coordinates */
245 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
246 x+j_coord_offsetC,x+j_coord_offsetD,
247 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
249 /* Calculate displacement vector */
250 dx00 = _mm_sub_ps(ix0,jx0);
251 dy00 = _mm_sub_ps(iy0,jy0);
252 dz00 = _mm_sub_ps(iz0,jz0);
253 dx01 = _mm_sub_ps(ix0,jx1);
254 dy01 = _mm_sub_ps(iy0,jy1);
255 dz01 = _mm_sub_ps(iz0,jz1);
256 dx02 = _mm_sub_ps(ix0,jx2);
257 dy02 = _mm_sub_ps(iy0,jy2);
258 dz02 = _mm_sub_ps(iz0,jz2);
259 dx10 = _mm_sub_ps(ix1,jx0);
260 dy10 = _mm_sub_ps(iy1,jy0);
261 dz10 = _mm_sub_ps(iz1,jz0);
262 dx11 = _mm_sub_ps(ix1,jx1);
263 dy11 = _mm_sub_ps(iy1,jy1);
264 dz11 = _mm_sub_ps(iz1,jz1);
265 dx12 = _mm_sub_ps(ix1,jx2);
266 dy12 = _mm_sub_ps(iy1,jy2);
267 dz12 = _mm_sub_ps(iz1,jz2);
268 dx20 = _mm_sub_ps(ix2,jx0);
269 dy20 = _mm_sub_ps(iy2,jy0);
270 dz20 = _mm_sub_ps(iz2,jz0);
271 dx21 = _mm_sub_ps(ix2,jx1);
272 dy21 = _mm_sub_ps(iy2,jy1);
273 dz21 = _mm_sub_ps(iz2,jz1);
274 dx22 = _mm_sub_ps(ix2,jx2);
275 dy22 = _mm_sub_ps(iy2,jy2);
276 dz22 = _mm_sub_ps(iz2,jz2);
278 /* Calculate squared distance and things based on it */
279 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
280 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
281 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
282 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
283 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
284 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
285 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
286 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
287 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
289 rinv00 = gmx_mm_invsqrt_ps(rsq00);
290 rinv01 = gmx_mm_invsqrt_ps(rsq01);
291 rinv02 = gmx_mm_invsqrt_ps(rsq02);
292 rinv10 = gmx_mm_invsqrt_ps(rsq10);
293 rinv11 = gmx_mm_invsqrt_ps(rsq11);
294 rinv12 = gmx_mm_invsqrt_ps(rsq12);
295 rinv20 = gmx_mm_invsqrt_ps(rsq20);
296 rinv21 = gmx_mm_invsqrt_ps(rsq21);
297 rinv22 = gmx_mm_invsqrt_ps(rsq22);
299 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
300 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
301 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
302 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
303 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
304 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
305 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
306 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
307 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
309 fjx0 = _mm_setzero_ps();
310 fjy0 = _mm_setzero_ps();
311 fjz0 = _mm_setzero_ps();
312 fjx1 = _mm_setzero_ps();
313 fjy1 = _mm_setzero_ps();
314 fjz1 = _mm_setzero_ps();
315 fjx2 = _mm_setzero_ps();
316 fjy2 = _mm_setzero_ps();
317 fjz2 = _mm_setzero_ps();
319 /**************************
320 * CALCULATE INTERACTIONS *
321 **************************/
323 if (gmx_mm_any_lt(rsq00,rcutoff2))
326 r00 = _mm_mul_ps(rsq00,rinv00);
328 /* EWALD ELECTROSTATICS */
330 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
331 ewrt = _mm_mul_ps(r00,ewtabscale);
332 ewitab = _mm_cvttps_epi32(ewrt);
333 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
334 ewitab = _mm_slli_epi32(ewitab,2);
335 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
336 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
337 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
338 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
339 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
340 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
341 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
342 velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
343 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
345 /* LENNARD-JONES DISPERSION/REPULSION */
347 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
348 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
349 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
350 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
351 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
353 d = _mm_sub_ps(r00,rswitch);
354 d = _mm_max_ps(d,_mm_setzero_ps());
355 d2 = _mm_mul_ps(d,d);
356 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
358 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
360 /* Evaluate switch function */
361 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
362 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
363 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
364 velec = _mm_mul_ps(velec,sw);
365 vvdw = _mm_mul_ps(vvdw,sw);
366 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
368 /* Update potential sum for this i atom from the interaction with this j atom. */
369 velec = _mm_and_ps(velec,cutoff_mask);
370 velecsum = _mm_add_ps(velecsum,velec);
371 vvdw = _mm_and_ps(vvdw,cutoff_mask);
372 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
374 fscal = _mm_add_ps(felec,fvdw);
376 fscal = _mm_and_ps(fscal,cutoff_mask);
378 /* Calculate temporary vectorial force */
379 tx = _mm_mul_ps(fscal,dx00);
380 ty = _mm_mul_ps(fscal,dy00);
381 tz = _mm_mul_ps(fscal,dz00);
383 /* Update vectorial force */
384 fix0 = _mm_add_ps(fix0,tx);
385 fiy0 = _mm_add_ps(fiy0,ty);
386 fiz0 = _mm_add_ps(fiz0,tz);
388 fjx0 = _mm_add_ps(fjx0,tx);
389 fjy0 = _mm_add_ps(fjy0,ty);
390 fjz0 = _mm_add_ps(fjz0,tz);
394 /**************************
395 * CALCULATE INTERACTIONS *
396 **************************/
398 if (gmx_mm_any_lt(rsq01,rcutoff2))
401 r01 = _mm_mul_ps(rsq01,rinv01);
403 /* EWALD ELECTROSTATICS */
405 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
406 ewrt = _mm_mul_ps(r01,ewtabscale);
407 ewitab = _mm_cvttps_epi32(ewrt);
408 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
409 ewitab = _mm_slli_epi32(ewitab,2);
410 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
411 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
412 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
413 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
414 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
415 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
416 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
417 velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
418 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
420 d = _mm_sub_ps(r01,rswitch);
421 d = _mm_max_ps(d,_mm_setzero_ps());
422 d2 = _mm_mul_ps(d,d);
423 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
425 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
427 /* Evaluate switch function */
428 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
429 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
430 velec = _mm_mul_ps(velec,sw);
431 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
433 /* Update potential sum for this i atom from the interaction with this j atom. */
434 velec = _mm_and_ps(velec,cutoff_mask);
435 velecsum = _mm_add_ps(velecsum,velec);
437 fscal = felec;
439 fscal = _mm_and_ps(fscal,cutoff_mask);
441 /* Calculate temporary vectorial force */
442 tx = _mm_mul_ps(fscal,dx01);
443 ty = _mm_mul_ps(fscal,dy01);
444 tz = _mm_mul_ps(fscal,dz01);
446 /* Update vectorial force */
447 fix0 = _mm_add_ps(fix0,tx);
448 fiy0 = _mm_add_ps(fiy0,ty);
449 fiz0 = _mm_add_ps(fiz0,tz);
451 fjx1 = _mm_add_ps(fjx1,tx);
452 fjy1 = _mm_add_ps(fjy1,ty);
453 fjz1 = _mm_add_ps(fjz1,tz);
457 /**************************
458 * CALCULATE INTERACTIONS *
459 **************************/
461 if (gmx_mm_any_lt(rsq02,rcutoff2))
464 r02 = _mm_mul_ps(rsq02,rinv02);
466 /* EWALD ELECTROSTATICS */
468 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
469 ewrt = _mm_mul_ps(r02,ewtabscale);
470 ewitab = _mm_cvttps_epi32(ewrt);
471 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
472 ewitab = _mm_slli_epi32(ewitab,2);
473 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
474 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
475 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
476 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
477 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
478 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
479 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
480 velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
481 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
483 d = _mm_sub_ps(r02,rswitch);
484 d = _mm_max_ps(d,_mm_setzero_ps());
485 d2 = _mm_mul_ps(d,d);
486 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
488 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
490 /* Evaluate switch function */
491 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
492 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
493 velec = _mm_mul_ps(velec,sw);
494 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
496 /* Update potential sum for this i atom from the interaction with this j atom. */
497 velec = _mm_and_ps(velec,cutoff_mask);
498 velecsum = _mm_add_ps(velecsum,velec);
500 fscal = felec;
502 fscal = _mm_and_ps(fscal,cutoff_mask);
504 /* Calculate temporary vectorial force */
505 tx = _mm_mul_ps(fscal,dx02);
506 ty = _mm_mul_ps(fscal,dy02);
507 tz = _mm_mul_ps(fscal,dz02);
509 /* Update vectorial force */
510 fix0 = _mm_add_ps(fix0,tx);
511 fiy0 = _mm_add_ps(fiy0,ty);
512 fiz0 = _mm_add_ps(fiz0,tz);
514 fjx2 = _mm_add_ps(fjx2,tx);
515 fjy2 = _mm_add_ps(fjy2,ty);
516 fjz2 = _mm_add_ps(fjz2,tz);
520 /**************************
521 * CALCULATE INTERACTIONS *
522 **************************/
524 if (gmx_mm_any_lt(rsq10,rcutoff2))
527 r10 = _mm_mul_ps(rsq10,rinv10);
529 /* EWALD ELECTROSTATICS */
531 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
532 ewrt = _mm_mul_ps(r10,ewtabscale);
533 ewitab = _mm_cvttps_epi32(ewrt);
534 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
535 ewitab = _mm_slli_epi32(ewitab,2);
536 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
537 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
538 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
539 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
540 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
541 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
542 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
543 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
544 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
546 d = _mm_sub_ps(r10,rswitch);
547 d = _mm_max_ps(d,_mm_setzero_ps());
548 d2 = _mm_mul_ps(d,d);
549 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
551 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
553 /* Evaluate switch function */
554 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
555 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
556 velec = _mm_mul_ps(velec,sw);
557 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
559 /* Update potential sum for this i atom from the interaction with this j atom. */
560 velec = _mm_and_ps(velec,cutoff_mask);
561 velecsum = _mm_add_ps(velecsum,velec);
563 fscal = felec;
565 fscal = _mm_and_ps(fscal,cutoff_mask);
567 /* Calculate temporary vectorial force */
568 tx = _mm_mul_ps(fscal,dx10);
569 ty = _mm_mul_ps(fscal,dy10);
570 tz = _mm_mul_ps(fscal,dz10);
572 /* Update vectorial force */
573 fix1 = _mm_add_ps(fix1,tx);
574 fiy1 = _mm_add_ps(fiy1,ty);
575 fiz1 = _mm_add_ps(fiz1,tz);
577 fjx0 = _mm_add_ps(fjx0,tx);
578 fjy0 = _mm_add_ps(fjy0,ty);
579 fjz0 = _mm_add_ps(fjz0,tz);
583 /**************************
584 * CALCULATE INTERACTIONS *
585 **************************/
587 if (gmx_mm_any_lt(rsq11,rcutoff2))
590 r11 = _mm_mul_ps(rsq11,rinv11);
592 /* EWALD ELECTROSTATICS */
594 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
595 ewrt = _mm_mul_ps(r11,ewtabscale);
596 ewitab = _mm_cvttps_epi32(ewrt);
597 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
598 ewitab = _mm_slli_epi32(ewitab,2);
599 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
600 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
601 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
602 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
603 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
604 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
605 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
606 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
607 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
609 d = _mm_sub_ps(r11,rswitch);
610 d = _mm_max_ps(d,_mm_setzero_ps());
611 d2 = _mm_mul_ps(d,d);
612 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
614 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
616 /* Evaluate switch function */
617 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
618 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
619 velec = _mm_mul_ps(velec,sw);
620 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
622 /* Update potential sum for this i atom from the interaction with this j atom. */
623 velec = _mm_and_ps(velec,cutoff_mask);
624 velecsum = _mm_add_ps(velecsum,velec);
626 fscal = felec;
628 fscal = _mm_and_ps(fscal,cutoff_mask);
630 /* Calculate temporary vectorial force */
631 tx = _mm_mul_ps(fscal,dx11);
632 ty = _mm_mul_ps(fscal,dy11);
633 tz = _mm_mul_ps(fscal,dz11);
635 /* Update vectorial force */
636 fix1 = _mm_add_ps(fix1,tx);
637 fiy1 = _mm_add_ps(fiy1,ty);
638 fiz1 = _mm_add_ps(fiz1,tz);
640 fjx1 = _mm_add_ps(fjx1,tx);
641 fjy1 = _mm_add_ps(fjy1,ty);
642 fjz1 = _mm_add_ps(fjz1,tz);
646 /**************************
647 * CALCULATE INTERACTIONS *
648 **************************/
650 if (gmx_mm_any_lt(rsq12,rcutoff2))
653 r12 = _mm_mul_ps(rsq12,rinv12);
655 /* EWALD ELECTROSTATICS */
657 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
658 ewrt = _mm_mul_ps(r12,ewtabscale);
659 ewitab = _mm_cvttps_epi32(ewrt);
660 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
661 ewitab = _mm_slli_epi32(ewitab,2);
662 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
663 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
664 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
665 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
666 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
667 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
668 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
669 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
670 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
672 d = _mm_sub_ps(r12,rswitch);
673 d = _mm_max_ps(d,_mm_setzero_ps());
674 d2 = _mm_mul_ps(d,d);
675 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
677 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
679 /* Evaluate switch function */
680 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
681 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
682 velec = _mm_mul_ps(velec,sw);
683 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
685 /* Update potential sum for this i atom from the interaction with this j atom. */
686 velec = _mm_and_ps(velec,cutoff_mask);
687 velecsum = _mm_add_ps(velecsum,velec);
689 fscal = felec;
691 fscal = _mm_and_ps(fscal,cutoff_mask);
693 /* Calculate temporary vectorial force */
694 tx = _mm_mul_ps(fscal,dx12);
695 ty = _mm_mul_ps(fscal,dy12);
696 tz = _mm_mul_ps(fscal,dz12);
698 /* Update vectorial force */
699 fix1 = _mm_add_ps(fix1,tx);
700 fiy1 = _mm_add_ps(fiy1,ty);
701 fiz1 = _mm_add_ps(fiz1,tz);
703 fjx2 = _mm_add_ps(fjx2,tx);
704 fjy2 = _mm_add_ps(fjy2,ty);
705 fjz2 = _mm_add_ps(fjz2,tz);
709 /**************************
710 * CALCULATE INTERACTIONS *
711 **************************/
713 if (gmx_mm_any_lt(rsq20,rcutoff2))
716 r20 = _mm_mul_ps(rsq20,rinv20);
718 /* EWALD ELECTROSTATICS */
720 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
721 ewrt = _mm_mul_ps(r20,ewtabscale);
722 ewitab = _mm_cvttps_epi32(ewrt);
723 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
724 ewitab = _mm_slli_epi32(ewitab,2);
725 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
726 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
727 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
728 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
729 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
730 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
731 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
732 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
733 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
735 d = _mm_sub_ps(r20,rswitch);
736 d = _mm_max_ps(d,_mm_setzero_ps());
737 d2 = _mm_mul_ps(d,d);
738 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
740 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
742 /* Evaluate switch function */
743 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
744 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
745 velec = _mm_mul_ps(velec,sw);
746 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
748 /* Update potential sum for this i atom from the interaction with this j atom. */
749 velec = _mm_and_ps(velec,cutoff_mask);
750 velecsum = _mm_add_ps(velecsum,velec);
752 fscal = felec;
754 fscal = _mm_and_ps(fscal,cutoff_mask);
756 /* Calculate temporary vectorial force */
757 tx = _mm_mul_ps(fscal,dx20);
758 ty = _mm_mul_ps(fscal,dy20);
759 tz = _mm_mul_ps(fscal,dz20);
761 /* Update vectorial force */
762 fix2 = _mm_add_ps(fix2,tx);
763 fiy2 = _mm_add_ps(fiy2,ty);
764 fiz2 = _mm_add_ps(fiz2,tz);
766 fjx0 = _mm_add_ps(fjx0,tx);
767 fjy0 = _mm_add_ps(fjy0,ty);
768 fjz0 = _mm_add_ps(fjz0,tz);
772 /**************************
773 * CALCULATE INTERACTIONS *
774 **************************/
776 if (gmx_mm_any_lt(rsq21,rcutoff2))
779 r21 = _mm_mul_ps(rsq21,rinv21);
781 /* EWALD ELECTROSTATICS */
783 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
784 ewrt = _mm_mul_ps(r21,ewtabscale);
785 ewitab = _mm_cvttps_epi32(ewrt);
786 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
787 ewitab = _mm_slli_epi32(ewitab,2);
788 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
789 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
790 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
791 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
792 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
793 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
794 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
795 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
796 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
798 d = _mm_sub_ps(r21,rswitch);
799 d = _mm_max_ps(d,_mm_setzero_ps());
800 d2 = _mm_mul_ps(d,d);
801 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
803 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
805 /* Evaluate switch function */
806 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
807 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
808 velec = _mm_mul_ps(velec,sw);
809 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
811 /* Update potential sum for this i atom from the interaction with this j atom. */
812 velec = _mm_and_ps(velec,cutoff_mask);
813 velecsum = _mm_add_ps(velecsum,velec);
815 fscal = felec;
817 fscal = _mm_and_ps(fscal,cutoff_mask);
819 /* Calculate temporary vectorial force */
820 tx = _mm_mul_ps(fscal,dx21);
821 ty = _mm_mul_ps(fscal,dy21);
822 tz = _mm_mul_ps(fscal,dz21);
824 /* Update vectorial force */
825 fix2 = _mm_add_ps(fix2,tx);
826 fiy2 = _mm_add_ps(fiy2,ty);
827 fiz2 = _mm_add_ps(fiz2,tz);
829 fjx1 = _mm_add_ps(fjx1,tx);
830 fjy1 = _mm_add_ps(fjy1,ty);
831 fjz1 = _mm_add_ps(fjz1,tz);
835 /**************************
836 * CALCULATE INTERACTIONS *
837 **************************/
839 if (gmx_mm_any_lt(rsq22,rcutoff2))
842 r22 = _mm_mul_ps(rsq22,rinv22);
844 /* EWALD ELECTROSTATICS */
846 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
847 ewrt = _mm_mul_ps(r22,ewtabscale);
848 ewitab = _mm_cvttps_epi32(ewrt);
849 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
850 ewitab = _mm_slli_epi32(ewitab,2);
851 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
852 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
853 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
854 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
855 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
856 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
857 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
858 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
859 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
861 d = _mm_sub_ps(r22,rswitch);
862 d = _mm_max_ps(d,_mm_setzero_ps());
863 d2 = _mm_mul_ps(d,d);
864 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
866 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
868 /* Evaluate switch function */
869 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
870 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
871 velec = _mm_mul_ps(velec,sw);
872 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
874 /* Update potential sum for this i atom from the interaction with this j atom. */
875 velec = _mm_and_ps(velec,cutoff_mask);
876 velecsum = _mm_add_ps(velecsum,velec);
878 fscal = felec;
880 fscal = _mm_and_ps(fscal,cutoff_mask);
882 /* Calculate temporary vectorial force */
883 tx = _mm_mul_ps(fscal,dx22);
884 ty = _mm_mul_ps(fscal,dy22);
885 tz = _mm_mul_ps(fscal,dz22);
887 /* Update vectorial force */
888 fix2 = _mm_add_ps(fix2,tx);
889 fiy2 = _mm_add_ps(fiy2,ty);
890 fiz2 = _mm_add_ps(fiz2,tz);
892 fjx2 = _mm_add_ps(fjx2,tx);
893 fjy2 = _mm_add_ps(fjy2,ty);
894 fjz2 = _mm_add_ps(fjz2,tz);
898 fjptrA = f+j_coord_offsetA;
899 fjptrB = f+j_coord_offsetB;
900 fjptrC = f+j_coord_offsetC;
901 fjptrD = f+j_coord_offsetD;
903 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
904 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
906 /* Inner loop uses 603 flops */
909 if(jidx<j_index_end)
912 /* Get j neighbor index, and coordinate index */
913 jnrlistA = jjnr[jidx];
914 jnrlistB = jjnr[jidx+1];
915 jnrlistC = jjnr[jidx+2];
916 jnrlistD = jjnr[jidx+3];
917 /* Sign of each element will be negative for non-real atoms.
918 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
919 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
921 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
922 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
923 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
924 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
925 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
926 j_coord_offsetA = DIM*jnrA;
927 j_coord_offsetB = DIM*jnrB;
928 j_coord_offsetC = DIM*jnrC;
929 j_coord_offsetD = DIM*jnrD;
931 /* load j atom coordinates */
932 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
933 x+j_coord_offsetC,x+j_coord_offsetD,
934 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
936 /* Calculate displacement vector */
937 dx00 = _mm_sub_ps(ix0,jx0);
938 dy00 = _mm_sub_ps(iy0,jy0);
939 dz00 = _mm_sub_ps(iz0,jz0);
940 dx01 = _mm_sub_ps(ix0,jx1);
941 dy01 = _mm_sub_ps(iy0,jy1);
942 dz01 = _mm_sub_ps(iz0,jz1);
943 dx02 = _mm_sub_ps(ix0,jx2);
944 dy02 = _mm_sub_ps(iy0,jy2);
945 dz02 = _mm_sub_ps(iz0,jz2);
946 dx10 = _mm_sub_ps(ix1,jx0);
947 dy10 = _mm_sub_ps(iy1,jy0);
948 dz10 = _mm_sub_ps(iz1,jz0);
949 dx11 = _mm_sub_ps(ix1,jx1);
950 dy11 = _mm_sub_ps(iy1,jy1);
951 dz11 = _mm_sub_ps(iz1,jz1);
952 dx12 = _mm_sub_ps(ix1,jx2);
953 dy12 = _mm_sub_ps(iy1,jy2);
954 dz12 = _mm_sub_ps(iz1,jz2);
955 dx20 = _mm_sub_ps(ix2,jx0);
956 dy20 = _mm_sub_ps(iy2,jy0);
957 dz20 = _mm_sub_ps(iz2,jz0);
958 dx21 = _mm_sub_ps(ix2,jx1);
959 dy21 = _mm_sub_ps(iy2,jy1);
960 dz21 = _mm_sub_ps(iz2,jz1);
961 dx22 = _mm_sub_ps(ix2,jx2);
962 dy22 = _mm_sub_ps(iy2,jy2);
963 dz22 = _mm_sub_ps(iz2,jz2);
965 /* Calculate squared distance and things based on it */
966 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
967 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
968 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
969 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
970 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
971 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
972 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
973 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
974 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
976 rinv00 = gmx_mm_invsqrt_ps(rsq00);
977 rinv01 = gmx_mm_invsqrt_ps(rsq01);
978 rinv02 = gmx_mm_invsqrt_ps(rsq02);
979 rinv10 = gmx_mm_invsqrt_ps(rsq10);
980 rinv11 = gmx_mm_invsqrt_ps(rsq11);
981 rinv12 = gmx_mm_invsqrt_ps(rsq12);
982 rinv20 = gmx_mm_invsqrt_ps(rsq20);
983 rinv21 = gmx_mm_invsqrt_ps(rsq21);
984 rinv22 = gmx_mm_invsqrt_ps(rsq22);
986 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
987 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
988 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
989 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
990 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
991 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
992 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
993 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
994 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
996 fjx0 = _mm_setzero_ps();
997 fjy0 = _mm_setzero_ps();
998 fjz0 = _mm_setzero_ps();
999 fjx1 = _mm_setzero_ps();
1000 fjy1 = _mm_setzero_ps();
1001 fjz1 = _mm_setzero_ps();
1002 fjx2 = _mm_setzero_ps();
1003 fjy2 = _mm_setzero_ps();
1004 fjz2 = _mm_setzero_ps();
1006 /**************************
1007 * CALCULATE INTERACTIONS *
1008 **************************/
1010 if (gmx_mm_any_lt(rsq00,rcutoff2))
1013 r00 = _mm_mul_ps(rsq00,rinv00);
1014 r00 = _mm_andnot_ps(dummy_mask,r00);
1016 /* EWALD ELECTROSTATICS */
1018 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1019 ewrt = _mm_mul_ps(r00,ewtabscale);
1020 ewitab = _mm_cvttps_epi32(ewrt);
1021 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1022 ewitab = _mm_slli_epi32(ewitab,2);
1023 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1024 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1025 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1026 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1027 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1028 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1029 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1030 velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
1031 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1033 /* LENNARD-JONES DISPERSION/REPULSION */
1035 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1036 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
1037 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1038 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1039 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1041 d = _mm_sub_ps(r00,rswitch);
1042 d = _mm_max_ps(d,_mm_setzero_ps());
1043 d2 = _mm_mul_ps(d,d);
1044 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1046 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1048 /* Evaluate switch function */
1049 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1050 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
1051 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1052 velec = _mm_mul_ps(velec,sw);
1053 vvdw = _mm_mul_ps(vvdw,sw);
1054 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1056 /* Update potential sum for this i atom from the interaction with this j atom. */
1057 velec = _mm_and_ps(velec,cutoff_mask);
1058 velec = _mm_andnot_ps(dummy_mask,velec);
1059 velecsum = _mm_add_ps(velecsum,velec);
1060 vvdw = _mm_and_ps(vvdw,cutoff_mask);
1061 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
1062 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
1064 fscal = _mm_add_ps(felec,fvdw);
1066 fscal = _mm_and_ps(fscal,cutoff_mask);
1068 fscal = _mm_andnot_ps(dummy_mask,fscal);
1070 /* Calculate temporary vectorial force */
1071 tx = _mm_mul_ps(fscal,dx00);
1072 ty = _mm_mul_ps(fscal,dy00);
1073 tz = _mm_mul_ps(fscal,dz00);
1075 /* Update vectorial force */
1076 fix0 = _mm_add_ps(fix0,tx);
1077 fiy0 = _mm_add_ps(fiy0,ty);
1078 fiz0 = _mm_add_ps(fiz0,tz);
1080 fjx0 = _mm_add_ps(fjx0,tx);
1081 fjy0 = _mm_add_ps(fjy0,ty);
1082 fjz0 = _mm_add_ps(fjz0,tz);
1086 /**************************
1087 * CALCULATE INTERACTIONS *
1088 **************************/
1090 if (gmx_mm_any_lt(rsq01,rcutoff2))
1093 r01 = _mm_mul_ps(rsq01,rinv01);
1094 r01 = _mm_andnot_ps(dummy_mask,r01);
1096 /* EWALD ELECTROSTATICS */
1098 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1099 ewrt = _mm_mul_ps(r01,ewtabscale);
1100 ewitab = _mm_cvttps_epi32(ewrt);
1101 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1102 ewitab = _mm_slli_epi32(ewitab,2);
1103 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1104 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1105 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1106 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1107 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1108 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1109 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1110 velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
1111 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1113 d = _mm_sub_ps(r01,rswitch);
1114 d = _mm_max_ps(d,_mm_setzero_ps());
1115 d2 = _mm_mul_ps(d,d);
1116 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1118 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1120 /* Evaluate switch function */
1121 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1122 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
1123 velec = _mm_mul_ps(velec,sw);
1124 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
1126 /* Update potential sum for this i atom from the interaction with this j atom. */
1127 velec = _mm_and_ps(velec,cutoff_mask);
1128 velec = _mm_andnot_ps(dummy_mask,velec);
1129 velecsum = _mm_add_ps(velecsum,velec);
1131 fscal = felec;
1133 fscal = _mm_and_ps(fscal,cutoff_mask);
1135 fscal = _mm_andnot_ps(dummy_mask,fscal);
1137 /* Calculate temporary vectorial force */
1138 tx = _mm_mul_ps(fscal,dx01);
1139 ty = _mm_mul_ps(fscal,dy01);
1140 tz = _mm_mul_ps(fscal,dz01);
1142 /* Update vectorial force */
1143 fix0 = _mm_add_ps(fix0,tx);
1144 fiy0 = _mm_add_ps(fiy0,ty);
1145 fiz0 = _mm_add_ps(fiz0,tz);
1147 fjx1 = _mm_add_ps(fjx1,tx);
1148 fjy1 = _mm_add_ps(fjy1,ty);
1149 fjz1 = _mm_add_ps(fjz1,tz);
1153 /**************************
1154 * CALCULATE INTERACTIONS *
1155 **************************/
1157 if (gmx_mm_any_lt(rsq02,rcutoff2))
1160 r02 = _mm_mul_ps(rsq02,rinv02);
1161 r02 = _mm_andnot_ps(dummy_mask,r02);
1163 /* EWALD ELECTROSTATICS */
1165 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1166 ewrt = _mm_mul_ps(r02,ewtabscale);
1167 ewitab = _mm_cvttps_epi32(ewrt);
1168 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1169 ewitab = _mm_slli_epi32(ewitab,2);
1170 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1171 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1172 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1173 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1174 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1175 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1176 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1177 velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
1178 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1180 d = _mm_sub_ps(r02,rswitch);
1181 d = _mm_max_ps(d,_mm_setzero_ps());
1182 d2 = _mm_mul_ps(d,d);
1183 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1185 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1187 /* Evaluate switch function */
1188 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1189 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
1190 velec = _mm_mul_ps(velec,sw);
1191 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
1193 /* Update potential sum for this i atom from the interaction with this j atom. */
1194 velec = _mm_and_ps(velec,cutoff_mask);
1195 velec = _mm_andnot_ps(dummy_mask,velec);
1196 velecsum = _mm_add_ps(velecsum,velec);
1198 fscal = felec;
1200 fscal = _mm_and_ps(fscal,cutoff_mask);
1202 fscal = _mm_andnot_ps(dummy_mask,fscal);
1204 /* Calculate temporary vectorial force */
1205 tx = _mm_mul_ps(fscal,dx02);
1206 ty = _mm_mul_ps(fscal,dy02);
1207 tz = _mm_mul_ps(fscal,dz02);
1209 /* Update vectorial force */
1210 fix0 = _mm_add_ps(fix0,tx);
1211 fiy0 = _mm_add_ps(fiy0,ty);
1212 fiz0 = _mm_add_ps(fiz0,tz);
1214 fjx2 = _mm_add_ps(fjx2,tx);
1215 fjy2 = _mm_add_ps(fjy2,ty);
1216 fjz2 = _mm_add_ps(fjz2,tz);
1220 /**************************
1221 * CALCULATE INTERACTIONS *
1222 **************************/
1224 if (gmx_mm_any_lt(rsq10,rcutoff2))
1227 r10 = _mm_mul_ps(rsq10,rinv10);
1228 r10 = _mm_andnot_ps(dummy_mask,r10);
1230 /* EWALD ELECTROSTATICS */
1232 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1233 ewrt = _mm_mul_ps(r10,ewtabscale);
1234 ewitab = _mm_cvttps_epi32(ewrt);
1235 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1236 ewitab = _mm_slli_epi32(ewitab,2);
1237 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1238 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1239 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1240 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1241 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1242 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1243 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1244 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
1245 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1247 d = _mm_sub_ps(r10,rswitch);
1248 d = _mm_max_ps(d,_mm_setzero_ps());
1249 d2 = _mm_mul_ps(d,d);
1250 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1252 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1254 /* Evaluate switch function */
1255 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1256 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
1257 velec = _mm_mul_ps(velec,sw);
1258 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1260 /* Update potential sum for this i atom from the interaction with this j atom. */
1261 velec = _mm_and_ps(velec,cutoff_mask);
1262 velec = _mm_andnot_ps(dummy_mask,velec);
1263 velecsum = _mm_add_ps(velecsum,velec);
1265 fscal = felec;
1267 fscal = _mm_and_ps(fscal,cutoff_mask);
1269 fscal = _mm_andnot_ps(dummy_mask,fscal);
1271 /* Calculate temporary vectorial force */
1272 tx = _mm_mul_ps(fscal,dx10);
1273 ty = _mm_mul_ps(fscal,dy10);
1274 tz = _mm_mul_ps(fscal,dz10);
1276 /* Update vectorial force */
1277 fix1 = _mm_add_ps(fix1,tx);
1278 fiy1 = _mm_add_ps(fiy1,ty);
1279 fiz1 = _mm_add_ps(fiz1,tz);
1281 fjx0 = _mm_add_ps(fjx0,tx);
1282 fjy0 = _mm_add_ps(fjy0,ty);
1283 fjz0 = _mm_add_ps(fjz0,tz);
1287 /**************************
1288 * CALCULATE INTERACTIONS *
1289 **************************/
1291 if (gmx_mm_any_lt(rsq11,rcutoff2))
1294 r11 = _mm_mul_ps(rsq11,rinv11);
1295 r11 = _mm_andnot_ps(dummy_mask,r11);
1297 /* EWALD ELECTROSTATICS */
1299 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1300 ewrt = _mm_mul_ps(r11,ewtabscale);
1301 ewitab = _mm_cvttps_epi32(ewrt);
1302 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1303 ewitab = _mm_slli_epi32(ewitab,2);
1304 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1305 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1306 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1307 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1308 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1309 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1310 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1311 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
1312 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1314 d = _mm_sub_ps(r11,rswitch);
1315 d = _mm_max_ps(d,_mm_setzero_ps());
1316 d2 = _mm_mul_ps(d,d);
1317 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1319 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1321 /* Evaluate switch function */
1322 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1323 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1324 velec = _mm_mul_ps(velec,sw);
1325 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1327 /* Update potential sum for this i atom from the interaction with this j atom. */
1328 velec = _mm_and_ps(velec,cutoff_mask);
1329 velec = _mm_andnot_ps(dummy_mask,velec);
1330 velecsum = _mm_add_ps(velecsum,velec);
1332 fscal = felec;
1334 fscal = _mm_and_ps(fscal,cutoff_mask);
1336 fscal = _mm_andnot_ps(dummy_mask,fscal);
1338 /* Calculate temporary vectorial force */
1339 tx = _mm_mul_ps(fscal,dx11);
1340 ty = _mm_mul_ps(fscal,dy11);
1341 tz = _mm_mul_ps(fscal,dz11);
1343 /* Update vectorial force */
1344 fix1 = _mm_add_ps(fix1,tx);
1345 fiy1 = _mm_add_ps(fiy1,ty);
1346 fiz1 = _mm_add_ps(fiz1,tz);
1348 fjx1 = _mm_add_ps(fjx1,tx);
1349 fjy1 = _mm_add_ps(fjy1,ty);
1350 fjz1 = _mm_add_ps(fjz1,tz);
1354 /**************************
1355 * CALCULATE INTERACTIONS *
1356 **************************/
1358 if (gmx_mm_any_lt(rsq12,rcutoff2))
1361 r12 = _mm_mul_ps(rsq12,rinv12);
1362 r12 = _mm_andnot_ps(dummy_mask,r12);
1364 /* EWALD ELECTROSTATICS */
1366 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1367 ewrt = _mm_mul_ps(r12,ewtabscale);
1368 ewitab = _mm_cvttps_epi32(ewrt);
1369 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1370 ewitab = _mm_slli_epi32(ewitab,2);
1371 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1372 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1373 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1374 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1375 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1376 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1377 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1378 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
1379 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1381 d = _mm_sub_ps(r12,rswitch);
1382 d = _mm_max_ps(d,_mm_setzero_ps());
1383 d2 = _mm_mul_ps(d,d);
1384 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1386 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1388 /* Evaluate switch function */
1389 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1390 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
1391 velec = _mm_mul_ps(velec,sw);
1392 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1394 /* Update potential sum for this i atom from the interaction with this j atom. */
1395 velec = _mm_and_ps(velec,cutoff_mask);
1396 velec = _mm_andnot_ps(dummy_mask,velec);
1397 velecsum = _mm_add_ps(velecsum,velec);
1399 fscal = felec;
1401 fscal = _mm_and_ps(fscal,cutoff_mask);
1403 fscal = _mm_andnot_ps(dummy_mask,fscal);
1405 /* Calculate temporary vectorial force */
1406 tx = _mm_mul_ps(fscal,dx12);
1407 ty = _mm_mul_ps(fscal,dy12);
1408 tz = _mm_mul_ps(fscal,dz12);
1410 /* Update vectorial force */
1411 fix1 = _mm_add_ps(fix1,tx);
1412 fiy1 = _mm_add_ps(fiy1,ty);
1413 fiz1 = _mm_add_ps(fiz1,tz);
1415 fjx2 = _mm_add_ps(fjx2,tx);
1416 fjy2 = _mm_add_ps(fjy2,ty);
1417 fjz2 = _mm_add_ps(fjz2,tz);
1421 /**************************
1422 * CALCULATE INTERACTIONS *
1423 **************************/
1425 if (gmx_mm_any_lt(rsq20,rcutoff2))
1428 r20 = _mm_mul_ps(rsq20,rinv20);
1429 r20 = _mm_andnot_ps(dummy_mask,r20);
1431 /* EWALD ELECTROSTATICS */
1433 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1434 ewrt = _mm_mul_ps(r20,ewtabscale);
1435 ewitab = _mm_cvttps_epi32(ewrt);
1436 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1437 ewitab = _mm_slli_epi32(ewitab,2);
1438 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1439 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1440 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1441 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1442 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1443 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1444 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1445 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
1446 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1448 d = _mm_sub_ps(r20,rswitch);
1449 d = _mm_max_ps(d,_mm_setzero_ps());
1450 d2 = _mm_mul_ps(d,d);
1451 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1453 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1455 /* Evaluate switch function */
1456 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1457 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
1458 velec = _mm_mul_ps(velec,sw);
1459 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1461 /* Update potential sum for this i atom from the interaction with this j atom. */
1462 velec = _mm_and_ps(velec,cutoff_mask);
1463 velec = _mm_andnot_ps(dummy_mask,velec);
1464 velecsum = _mm_add_ps(velecsum,velec);
1466 fscal = felec;
1468 fscal = _mm_and_ps(fscal,cutoff_mask);
1470 fscal = _mm_andnot_ps(dummy_mask,fscal);
1472 /* Calculate temporary vectorial force */
1473 tx = _mm_mul_ps(fscal,dx20);
1474 ty = _mm_mul_ps(fscal,dy20);
1475 tz = _mm_mul_ps(fscal,dz20);
1477 /* Update vectorial force */
1478 fix2 = _mm_add_ps(fix2,tx);
1479 fiy2 = _mm_add_ps(fiy2,ty);
1480 fiz2 = _mm_add_ps(fiz2,tz);
1482 fjx0 = _mm_add_ps(fjx0,tx);
1483 fjy0 = _mm_add_ps(fjy0,ty);
1484 fjz0 = _mm_add_ps(fjz0,tz);
1488 /**************************
1489 * CALCULATE INTERACTIONS *
1490 **************************/
1492 if (gmx_mm_any_lt(rsq21,rcutoff2))
1495 r21 = _mm_mul_ps(rsq21,rinv21);
1496 r21 = _mm_andnot_ps(dummy_mask,r21);
1498 /* EWALD ELECTROSTATICS */
1500 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1501 ewrt = _mm_mul_ps(r21,ewtabscale);
1502 ewitab = _mm_cvttps_epi32(ewrt);
1503 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1504 ewitab = _mm_slli_epi32(ewitab,2);
1505 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1506 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1507 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1508 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1509 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1510 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1511 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1512 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
1513 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1515 d = _mm_sub_ps(r21,rswitch);
1516 d = _mm_max_ps(d,_mm_setzero_ps());
1517 d2 = _mm_mul_ps(d,d);
1518 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1520 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1522 /* Evaluate switch function */
1523 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1524 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
1525 velec = _mm_mul_ps(velec,sw);
1526 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1528 /* Update potential sum for this i atom from the interaction with this j atom. */
1529 velec = _mm_and_ps(velec,cutoff_mask);
1530 velec = _mm_andnot_ps(dummy_mask,velec);
1531 velecsum = _mm_add_ps(velecsum,velec);
1533 fscal = felec;
1535 fscal = _mm_and_ps(fscal,cutoff_mask);
1537 fscal = _mm_andnot_ps(dummy_mask,fscal);
1539 /* Calculate temporary vectorial force */
1540 tx = _mm_mul_ps(fscal,dx21);
1541 ty = _mm_mul_ps(fscal,dy21);
1542 tz = _mm_mul_ps(fscal,dz21);
1544 /* Update vectorial force */
1545 fix2 = _mm_add_ps(fix2,tx);
1546 fiy2 = _mm_add_ps(fiy2,ty);
1547 fiz2 = _mm_add_ps(fiz2,tz);
1549 fjx1 = _mm_add_ps(fjx1,tx);
1550 fjy1 = _mm_add_ps(fjy1,ty);
1551 fjz1 = _mm_add_ps(fjz1,tz);
1555 /**************************
1556 * CALCULATE INTERACTIONS *
1557 **************************/
1559 if (gmx_mm_any_lt(rsq22,rcutoff2))
1562 r22 = _mm_mul_ps(rsq22,rinv22);
1563 r22 = _mm_andnot_ps(dummy_mask,r22);
1565 /* EWALD ELECTROSTATICS */
1567 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1568 ewrt = _mm_mul_ps(r22,ewtabscale);
1569 ewitab = _mm_cvttps_epi32(ewrt);
1570 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1571 ewitab = _mm_slli_epi32(ewitab,2);
1572 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1573 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1574 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1575 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1576 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1577 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1578 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1579 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
1580 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1582 d = _mm_sub_ps(r22,rswitch);
1583 d = _mm_max_ps(d,_mm_setzero_ps());
1584 d2 = _mm_mul_ps(d,d);
1585 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1587 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1589 /* Evaluate switch function */
1590 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1591 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
1592 velec = _mm_mul_ps(velec,sw);
1593 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1595 /* Update potential sum for this i atom from the interaction with this j atom. */
1596 velec = _mm_and_ps(velec,cutoff_mask);
1597 velec = _mm_andnot_ps(dummy_mask,velec);
1598 velecsum = _mm_add_ps(velecsum,velec);
1600 fscal = felec;
1602 fscal = _mm_and_ps(fscal,cutoff_mask);
1604 fscal = _mm_andnot_ps(dummy_mask,fscal);
1606 /* Calculate temporary vectorial force */
1607 tx = _mm_mul_ps(fscal,dx22);
1608 ty = _mm_mul_ps(fscal,dy22);
1609 tz = _mm_mul_ps(fscal,dz22);
1611 /* Update vectorial force */
1612 fix2 = _mm_add_ps(fix2,tx);
1613 fiy2 = _mm_add_ps(fiy2,ty);
1614 fiz2 = _mm_add_ps(fiz2,tz);
1616 fjx2 = _mm_add_ps(fjx2,tx);
1617 fjy2 = _mm_add_ps(fjy2,ty);
1618 fjz2 = _mm_add_ps(fjz2,tz);
1622 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1623 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1624 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1625 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1627 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1628 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1630 /* Inner loop uses 612 flops */
1633 /* End of innermost loop */
1635 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1636 f+i_coord_offset,fshift+i_shift_offset);
1638 ggid = gid[iidx];
1639 /* Update potential energies */
1640 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1641 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1643 /* Increment number of inner iterations */
1644 inneriter += j_index_end - j_index_start;
1646 /* Outer loop uses 20 flops */
1649 /* Increment number of outer iterations */
1650 outeriter += nri;
1652 /* Update outer/inner flops */
1654 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*612);
1657 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse2_single
1658 * Electrostatics interaction: Ewald
1659 * VdW interaction: LennardJones
1660 * Geometry: Water3-Water3
1661 * Calculate force/pot: Force
1663 void
1664 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse2_single
1665 (t_nblist * gmx_restrict nlist,
1666 rvec * gmx_restrict xx,
1667 rvec * gmx_restrict ff,
1668 t_forcerec * gmx_restrict fr,
1669 t_mdatoms * gmx_restrict mdatoms,
1670 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1671 t_nrnb * gmx_restrict nrnb)
1673 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1674 * just 0 for non-waters.
1675 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1676 * jnr indices corresponding to data put in the four positions in the SIMD register.
1678 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1679 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1680 int jnrA,jnrB,jnrC,jnrD;
1681 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1682 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1683 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1684 real rcutoff_scalar;
1685 real *shiftvec,*fshift,*x,*f;
1686 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1687 real scratch[4*DIM];
1688 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1689 int vdwioffset0;
1690 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1691 int vdwioffset1;
1692 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1693 int vdwioffset2;
1694 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1695 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1696 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1697 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1698 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1699 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1700 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1701 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1702 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1703 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1704 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1705 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1706 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1707 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1708 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1709 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1710 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1711 real *charge;
1712 int nvdwtype;
1713 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1714 int *vdwtype;
1715 real *vdwparam;
1716 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1717 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1718 __m128i ewitab;
1719 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1720 real *ewtab;
1721 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1722 real rswitch_scalar,d_scalar;
1723 __m128 dummy_mask,cutoff_mask;
1724 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1725 __m128 one = _mm_set1_ps(1.0);
1726 __m128 two = _mm_set1_ps(2.0);
1727 x = xx[0];
1728 f = ff[0];
1730 nri = nlist->nri;
1731 iinr = nlist->iinr;
1732 jindex = nlist->jindex;
1733 jjnr = nlist->jjnr;
1734 shiftidx = nlist->shift;
1735 gid = nlist->gid;
1736 shiftvec = fr->shift_vec[0];
1737 fshift = fr->fshift[0];
1738 facel = _mm_set1_ps(fr->epsfac);
1739 charge = mdatoms->chargeA;
1740 nvdwtype = fr->ntype;
1741 vdwparam = fr->nbfp;
1742 vdwtype = mdatoms->typeA;
1744 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1745 ewtab = fr->ic->tabq_coul_FDV0;
1746 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1747 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1749 /* Setup water-specific parameters */
1750 inr = nlist->iinr[0];
1751 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1752 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1753 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1754 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1756 jq0 = _mm_set1_ps(charge[inr+0]);
1757 jq1 = _mm_set1_ps(charge[inr+1]);
1758 jq2 = _mm_set1_ps(charge[inr+2]);
1759 vdwjidx0A = 2*vdwtype[inr+0];
1760 qq00 = _mm_mul_ps(iq0,jq0);
1761 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1762 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1763 qq01 = _mm_mul_ps(iq0,jq1);
1764 qq02 = _mm_mul_ps(iq0,jq2);
1765 qq10 = _mm_mul_ps(iq1,jq0);
1766 qq11 = _mm_mul_ps(iq1,jq1);
1767 qq12 = _mm_mul_ps(iq1,jq2);
1768 qq20 = _mm_mul_ps(iq2,jq0);
1769 qq21 = _mm_mul_ps(iq2,jq1);
1770 qq22 = _mm_mul_ps(iq2,jq2);
1772 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1773 rcutoff_scalar = fr->rcoulomb;
1774 rcutoff = _mm_set1_ps(rcutoff_scalar);
1775 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1777 rswitch_scalar = fr->rcoulomb_switch;
1778 rswitch = _mm_set1_ps(rswitch_scalar);
1779 /* Setup switch parameters */
1780 d_scalar = rcutoff_scalar-rswitch_scalar;
1781 d = _mm_set1_ps(d_scalar);
1782 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1783 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1784 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1785 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1786 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1787 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1789 /* Avoid stupid compiler warnings */
1790 jnrA = jnrB = jnrC = jnrD = 0;
1791 j_coord_offsetA = 0;
1792 j_coord_offsetB = 0;
1793 j_coord_offsetC = 0;
1794 j_coord_offsetD = 0;
1796 outeriter = 0;
1797 inneriter = 0;
1799 for(iidx=0;iidx<4*DIM;iidx++)
1801 scratch[iidx] = 0.0;
1804 /* Start outer loop over neighborlists */
1805 for(iidx=0; iidx<nri; iidx++)
1807 /* Load shift vector for this list */
1808 i_shift_offset = DIM*shiftidx[iidx];
1810 /* Load limits for loop over neighbors */
1811 j_index_start = jindex[iidx];
1812 j_index_end = jindex[iidx+1];
1814 /* Get outer coordinate index */
1815 inr = iinr[iidx];
1816 i_coord_offset = DIM*inr;
1818 /* Load i particle coords and add shift vector */
1819 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1820 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1822 fix0 = _mm_setzero_ps();
1823 fiy0 = _mm_setzero_ps();
1824 fiz0 = _mm_setzero_ps();
1825 fix1 = _mm_setzero_ps();
1826 fiy1 = _mm_setzero_ps();
1827 fiz1 = _mm_setzero_ps();
1828 fix2 = _mm_setzero_ps();
1829 fiy2 = _mm_setzero_ps();
1830 fiz2 = _mm_setzero_ps();
1832 /* Start inner kernel loop */
1833 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1836 /* Get j neighbor index, and coordinate index */
1837 jnrA = jjnr[jidx];
1838 jnrB = jjnr[jidx+1];
1839 jnrC = jjnr[jidx+2];
1840 jnrD = jjnr[jidx+3];
1841 j_coord_offsetA = DIM*jnrA;
1842 j_coord_offsetB = DIM*jnrB;
1843 j_coord_offsetC = DIM*jnrC;
1844 j_coord_offsetD = DIM*jnrD;
1846 /* load j atom coordinates */
1847 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1848 x+j_coord_offsetC,x+j_coord_offsetD,
1849 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1851 /* Calculate displacement vector */
1852 dx00 = _mm_sub_ps(ix0,jx0);
1853 dy00 = _mm_sub_ps(iy0,jy0);
1854 dz00 = _mm_sub_ps(iz0,jz0);
1855 dx01 = _mm_sub_ps(ix0,jx1);
1856 dy01 = _mm_sub_ps(iy0,jy1);
1857 dz01 = _mm_sub_ps(iz0,jz1);
1858 dx02 = _mm_sub_ps(ix0,jx2);
1859 dy02 = _mm_sub_ps(iy0,jy2);
1860 dz02 = _mm_sub_ps(iz0,jz2);
1861 dx10 = _mm_sub_ps(ix1,jx0);
1862 dy10 = _mm_sub_ps(iy1,jy0);
1863 dz10 = _mm_sub_ps(iz1,jz0);
1864 dx11 = _mm_sub_ps(ix1,jx1);
1865 dy11 = _mm_sub_ps(iy1,jy1);
1866 dz11 = _mm_sub_ps(iz1,jz1);
1867 dx12 = _mm_sub_ps(ix1,jx2);
1868 dy12 = _mm_sub_ps(iy1,jy2);
1869 dz12 = _mm_sub_ps(iz1,jz2);
1870 dx20 = _mm_sub_ps(ix2,jx0);
1871 dy20 = _mm_sub_ps(iy2,jy0);
1872 dz20 = _mm_sub_ps(iz2,jz0);
1873 dx21 = _mm_sub_ps(ix2,jx1);
1874 dy21 = _mm_sub_ps(iy2,jy1);
1875 dz21 = _mm_sub_ps(iz2,jz1);
1876 dx22 = _mm_sub_ps(ix2,jx2);
1877 dy22 = _mm_sub_ps(iy2,jy2);
1878 dz22 = _mm_sub_ps(iz2,jz2);
1880 /* Calculate squared distance and things based on it */
1881 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1882 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1883 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1884 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1885 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1886 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1887 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1888 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1889 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1891 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1892 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1893 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1894 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1895 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1896 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1897 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1898 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1899 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1901 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1902 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1903 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1904 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1905 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1906 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1907 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1908 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1909 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1911 fjx0 = _mm_setzero_ps();
1912 fjy0 = _mm_setzero_ps();
1913 fjz0 = _mm_setzero_ps();
1914 fjx1 = _mm_setzero_ps();
1915 fjy1 = _mm_setzero_ps();
1916 fjz1 = _mm_setzero_ps();
1917 fjx2 = _mm_setzero_ps();
1918 fjy2 = _mm_setzero_ps();
1919 fjz2 = _mm_setzero_ps();
1921 /**************************
1922 * CALCULATE INTERACTIONS *
1923 **************************/
1925 if (gmx_mm_any_lt(rsq00,rcutoff2))
1928 r00 = _mm_mul_ps(rsq00,rinv00);
1930 /* EWALD ELECTROSTATICS */
1932 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1933 ewrt = _mm_mul_ps(r00,ewtabscale);
1934 ewitab = _mm_cvttps_epi32(ewrt);
1935 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1936 ewitab = _mm_slli_epi32(ewitab,2);
1937 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1938 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1939 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1940 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1941 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1942 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1943 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1944 velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
1945 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1947 /* LENNARD-JONES DISPERSION/REPULSION */
1949 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1950 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
1951 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1952 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1953 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1955 d = _mm_sub_ps(r00,rswitch);
1956 d = _mm_max_ps(d,_mm_setzero_ps());
1957 d2 = _mm_mul_ps(d,d);
1958 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1960 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1962 /* Evaluate switch function */
1963 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1964 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
1965 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1966 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1968 fscal = _mm_add_ps(felec,fvdw);
1970 fscal = _mm_and_ps(fscal,cutoff_mask);
1972 /* Calculate temporary vectorial force */
1973 tx = _mm_mul_ps(fscal,dx00);
1974 ty = _mm_mul_ps(fscal,dy00);
1975 tz = _mm_mul_ps(fscal,dz00);
1977 /* Update vectorial force */
1978 fix0 = _mm_add_ps(fix0,tx);
1979 fiy0 = _mm_add_ps(fiy0,ty);
1980 fiz0 = _mm_add_ps(fiz0,tz);
1982 fjx0 = _mm_add_ps(fjx0,tx);
1983 fjy0 = _mm_add_ps(fjy0,ty);
1984 fjz0 = _mm_add_ps(fjz0,tz);
1988 /**************************
1989 * CALCULATE INTERACTIONS *
1990 **************************/
1992 if (gmx_mm_any_lt(rsq01,rcutoff2))
1995 r01 = _mm_mul_ps(rsq01,rinv01);
1997 /* EWALD ELECTROSTATICS */
1999 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2000 ewrt = _mm_mul_ps(r01,ewtabscale);
2001 ewitab = _mm_cvttps_epi32(ewrt);
2002 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2003 ewitab = _mm_slli_epi32(ewitab,2);
2004 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2005 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2006 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2007 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2008 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2009 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2010 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2011 velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
2012 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
2014 d = _mm_sub_ps(r01,rswitch);
2015 d = _mm_max_ps(d,_mm_setzero_ps());
2016 d2 = _mm_mul_ps(d,d);
2017 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2019 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2021 /* Evaluate switch function */
2022 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2023 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
2024 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
2026 fscal = felec;
2028 fscal = _mm_and_ps(fscal,cutoff_mask);
2030 /* Calculate temporary vectorial force */
2031 tx = _mm_mul_ps(fscal,dx01);
2032 ty = _mm_mul_ps(fscal,dy01);
2033 tz = _mm_mul_ps(fscal,dz01);
2035 /* Update vectorial force */
2036 fix0 = _mm_add_ps(fix0,tx);
2037 fiy0 = _mm_add_ps(fiy0,ty);
2038 fiz0 = _mm_add_ps(fiz0,tz);
2040 fjx1 = _mm_add_ps(fjx1,tx);
2041 fjy1 = _mm_add_ps(fjy1,ty);
2042 fjz1 = _mm_add_ps(fjz1,tz);
2046 /**************************
2047 * CALCULATE INTERACTIONS *
2048 **************************/
2050 if (gmx_mm_any_lt(rsq02,rcutoff2))
2053 r02 = _mm_mul_ps(rsq02,rinv02);
2055 /* EWALD ELECTROSTATICS */
2057 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2058 ewrt = _mm_mul_ps(r02,ewtabscale);
2059 ewitab = _mm_cvttps_epi32(ewrt);
2060 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2061 ewitab = _mm_slli_epi32(ewitab,2);
2062 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2063 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2064 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2065 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2066 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2067 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2068 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2069 velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
2070 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
2072 d = _mm_sub_ps(r02,rswitch);
2073 d = _mm_max_ps(d,_mm_setzero_ps());
2074 d2 = _mm_mul_ps(d,d);
2075 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2077 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2079 /* Evaluate switch function */
2080 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2081 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
2082 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
2084 fscal = felec;
2086 fscal = _mm_and_ps(fscal,cutoff_mask);
2088 /* Calculate temporary vectorial force */
2089 tx = _mm_mul_ps(fscal,dx02);
2090 ty = _mm_mul_ps(fscal,dy02);
2091 tz = _mm_mul_ps(fscal,dz02);
2093 /* Update vectorial force */
2094 fix0 = _mm_add_ps(fix0,tx);
2095 fiy0 = _mm_add_ps(fiy0,ty);
2096 fiz0 = _mm_add_ps(fiz0,tz);
2098 fjx2 = _mm_add_ps(fjx2,tx);
2099 fjy2 = _mm_add_ps(fjy2,ty);
2100 fjz2 = _mm_add_ps(fjz2,tz);
2104 /**************************
2105 * CALCULATE INTERACTIONS *
2106 **************************/
2108 if (gmx_mm_any_lt(rsq10,rcutoff2))
2111 r10 = _mm_mul_ps(rsq10,rinv10);
2113 /* EWALD ELECTROSTATICS */
2115 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2116 ewrt = _mm_mul_ps(r10,ewtabscale);
2117 ewitab = _mm_cvttps_epi32(ewrt);
2118 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2119 ewitab = _mm_slli_epi32(ewitab,2);
2120 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2121 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2122 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2123 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2124 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2125 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2126 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2127 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
2128 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2130 d = _mm_sub_ps(r10,rswitch);
2131 d = _mm_max_ps(d,_mm_setzero_ps());
2132 d2 = _mm_mul_ps(d,d);
2133 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2135 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2137 /* Evaluate switch function */
2138 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2139 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
2140 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
2142 fscal = felec;
2144 fscal = _mm_and_ps(fscal,cutoff_mask);
2146 /* Calculate temporary vectorial force */
2147 tx = _mm_mul_ps(fscal,dx10);
2148 ty = _mm_mul_ps(fscal,dy10);
2149 tz = _mm_mul_ps(fscal,dz10);
2151 /* Update vectorial force */
2152 fix1 = _mm_add_ps(fix1,tx);
2153 fiy1 = _mm_add_ps(fiy1,ty);
2154 fiz1 = _mm_add_ps(fiz1,tz);
2156 fjx0 = _mm_add_ps(fjx0,tx);
2157 fjy0 = _mm_add_ps(fjy0,ty);
2158 fjz0 = _mm_add_ps(fjz0,tz);
2162 /**************************
2163 * CALCULATE INTERACTIONS *
2164 **************************/
2166 if (gmx_mm_any_lt(rsq11,rcutoff2))
2169 r11 = _mm_mul_ps(rsq11,rinv11);
2171 /* EWALD ELECTROSTATICS */
2173 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2174 ewrt = _mm_mul_ps(r11,ewtabscale);
2175 ewitab = _mm_cvttps_epi32(ewrt);
2176 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2177 ewitab = _mm_slli_epi32(ewitab,2);
2178 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2179 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2180 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2181 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2182 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2183 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2184 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2185 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2186 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2188 d = _mm_sub_ps(r11,rswitch);
2189 d = _mm_max_ps(d,_mm_setzero_ps());
2190 d2 = _mm_mul_ps(d,d);
2191 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2193 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2195 /* Evaluate switch function */
2196 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2197 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2198 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2200 fscal = felec;
2202 fscal = _mm_and_ps(fscal,cutoff_mask);
2204 /* Calculate temporary vectorial force */
2205 tx = _mm_mul_ps(fscal,dx11);
2206 ty = _mm_mul_ps(fscal,dy11);
2207 tz = _mm_mul_ps(fscal,dz11);
2209 /* Update vectorial force */
2210 fix1 = _mm_add_ps(fix1,tx);
2211 fiy1 = _mm_add_ps(fiy1,ty);
2212 fiz1 = _mm_add_ps(fiz1,tz);
2214 fjx1 = _mm_add_ps(fjx1,tx);
2215 fjy1 = _mm_add_ps(fjy1,ty);
2216 fjz1 = _mm_add_ps(fjz1,tz);
2220 /**************************
2221 * CALCULATE INTERACTIONS *
2222 **************************/
2224 if (gmx_mm_any_lt(rsq12,rcutoff2))
2227 r12 = _mm_mul_ps(rsq12,rinv12);
2229 /* EWALD ELECTROSTATICS */
2231 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2232 ewrt = _mm_mul_ps(r12,ewtabscale);
2233 ewitab = _mm_cvttps_epi32(ewrt);
2234 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2235 ewitab = _mm_slli_epi32(ewitab,2);
2236 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2237 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2238 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2239 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2240 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2241 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2242 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2243 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2244 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2246 d = _mm_sub_ps(r12,rswitch);
2247 d = _mm_max_ps(d,_mm_setzero_ps());
2248 d2 = _mm_mul_ps(d,d);
2249 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2251 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2253 /* Evaluate switch function */
2254 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2255 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2256 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2258 fscal = felec;
2260 fscal = _mm_and_ps(fscal,cutoff_mask);
2262 /* Calculate temporary vectorial force */
2263 tx = _mm_mul_ps(fscal,dx12);
2264 ty = _mm_mul_ps(fscal,dy12);
2265 tz = _mm_mul_ps(fscal,dz12);
2267 /* Update vectorial force */
2268 fix1 = _mm_add_ps(fix1,tx);
2269 fiy1 = _mm_add_ps(fiy1,ty);
2270 fiz1 = _mm_add_ps(fiz1,tz);
2272 fjx2 = _mm_add_ps(fjx2,tx);
2273 fjy2 = _mm_add_ps(fjy2,ty);
2274 fjz2 = _mm_add_ps(fjz2,tz);
2278 /**************************
2279 * CALCULATE INTERACTIONS *
2280 **************************/
2282 if (gmx_mm_any_lt(rsq20,rcutoff2))
2285 r20 = _mm_mul_ps(rsq20,rinv20);
2287 /* EWALD ELECTROSTATICS */
2289 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2290 ewrt = _mm_mul_ps(r20,ewtabscale);
2291 ewitab = _mm_cvttps_epi32(ewrt);
2292 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2293 ewitab = _mm_slli_epi32(ewitab,2);
2294 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2295 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2296 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2297 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2298 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2299 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2300 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2301 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
2302 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2304 d = _mm_sub_ps(r20,rswitch);
2305 d = _mm_max_ps(d,_mm_setzero_ps());
2306 d2 = _mm_mul_ps(d,d);
2307 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2309 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2311 /* Evaluate switch function */
2312 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2313 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
2314 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
2316 fscal = felec;
2318 fscal = _mm_and_ps(fscal,cutoff_mask);
2320 /* Calculate temporary vectorial force */
2321 tx = _mm_mul_ps(fscal,dx20);
2322 ty = _mm_mul_ps(fscal,dy20);
2323 tz = _mm_mul_ps(fscal,dz20);
2325 /* Update vectorial force */
2326 fix2 = _mm_add_ps(fix2,tx);
2327 fiy2 = _mm_add_ps(fiy2,ty);
2328 fiz2 = _mm_add_ps(fiz2,tz);
2330 fjx0 = _mm_add_ps(fjx0,tx);
2331 fjy0 = _mm_add_ps(fjy0,ty);
2332 fjz0 = _mm_add_ps(fjz0,tz);
2336 /**************************
2337 * CALCULATE INTERACTIONS *
2338 **************************/
2340 if (gmx_mm_any_lt(rsq21,rcutoff2))
2343 r21 = _mm_mul_ps(rsq21,rinv21);
2345 /* EWALD ELECTROSTATICS */
2347 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2348 ewrt = _mm_mul_ps(r21,ewtabscale);
2349 ewitab = _mm_cvttps_epi32(ewrt);
2350 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2351 ewitab = _mm_slli_epi32(ewitab,2);
2352 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2353 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2354 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2355 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2356 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2357 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2358 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2359 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
2360 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2362 d = _mm_sub_ps(r21,rswitch);
2363 d = _mm_max_ps(d,_mm_setzero_ps());
2364 d2 = _mm_mul_ps(d,d);
2365 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2367 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2369 /* Evaluate switch function */
2370 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2371 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2372 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2374 fscal = felec;
2376 fscal = _mm_and_ps(fscal,cutoff_mask);
2378 /* Calculate temporary vectorial force */
2379 tx = _mm_mul_ps(fscal,dx21);
2380 ty = _mm_mul_ps(fscal,dy21);
2381 tz = _mm_mul_ps(fscal,dz21);
2383 /* Update vectorial force */
2384 fix2 = _mm_add_ps(fix2,tx);
2385 fiy2 = _mm_add_ps(fiy2,ty);
2386 fiz2 = _mm_add_ps(fiz2,tz);
2388 fjx1 = _mm_add_ps(fjx1,tx);
2389 fjy1 = _mm_add_ps(fjy1,ty);
2390 fjz1 = _mm_add_ps(fjz1,tz);
2394 /**************************
2395 * CALCULATE INTERACTIONS *
2396 **************************/
2398 if (gmx_mm_any_lt(rsq22,rcutoff2))
2401 r22 = _mm_mul_ps(rsq22,rinv22);
2403 /* EWALD ELECTROSTATICS */
2405 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2406 ewrt = _mm_mul_ps(r22,ewtabscale);
2407 ewitab = _mm_cvttps_epi32(ewrt);
2408 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2409 ewitab = _mm_slli_epi32(ewitab,2);
2410 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2411 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2412 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2413 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2414 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2415 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2416 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2417 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
2418 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2420 d = _mm_sub_ps(r22,rswitch);
2421 d = _mm_max_ps(d,_mm_setzero_ps());
2422 d2 = _mm_mul_ps(d,d);
2423 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2425 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2427 /* Evaluate switch function */
2428 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2429 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2430 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2432 fscal = felec;
2434 fscal = _mm_and_ps(fscal,cutoff_mask);
2436 /* Calculate temporary vectorial force */
2437 tx = _mm_mul_ps(fscal,dx22);
2438 ty = _mm_mul_ps(fscal,dy22);
2439 tz = _mm_mul_ps(fscal,dz22);
2441 /* Update vectorial force */
2442 fix2 = _mm_add_ps(fix2,tx);
2443 fiy2 = _mm_add_ps(fiy2,ty);
2444 fiz2 = _mm_add_ps(fiz2,tz);
2446 fjx2 = _mm_add_ps(fjx2,tx);
2447 fjy2 = _mm_add_ps(fjy2,ty);
2448 fjz2 = _mm_add_ps(fjz2,tz);
2452 fjptrA = f+j_coord_offsetA;
2453 fjptrB = f+j_coord_offsetB;
2454 fjptrC = f+j_coord_offsetC;
2455 fjptrD = f+j_coord_offsetD;
2457 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2458 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2460 /* Inner loop uses 573 flops */
2463 if(jidx<j_index_end)
2466 /* Get j neighbor index, and coordinate index */
2467 jnrlistA = jjnr[jidx];
2468 jnrlistB = jjnr[jidx+1];
2469 jnrlistC = jjnr[jidx+2];
2470 jnrlistD = jjnr[jidx+3];
2471 /* Sign of each element will be negative for non-real atoms.
2472 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2473 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2475 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2476 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2477 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2478 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2479 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2480 j_coord_offsetA = DIM*jnrA;
2481 j_coord_offsetB = DIM*jnrB;
2482 j_coord_offsetC = DIM*jnrC;
2483 j_coord_offsetD = DIM*jnrD;
2485 /* load j atom coordinates */
2486 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2487 x+j_coord_offsetC,x+j_coord_offsetD,
2488 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2490 /* Calculate displacement vector */
2491 dx00 = _mm_sub_ps(ix0,jx0);
2492 dy00 = _mm_sub_ps(iy0,jy0);
2493 dz00 = _mm_sub_ps(iz0,jz0);
2494 dx01 = _mm_sub_ps(ix0,jx1);
2495 dy01 = _mm_sub_ps(iy0,jy1);
2496 dz01 = _mm_sub_ps(iz0,jz1);
2497 dx02 = _mm_sub_ps(ix0,jx2);
2498 dy02 = _mm_sub_ps(iy0,jy2);
2499 dz02 = _mm_sub_ps(iz0,jz2);
2500 dx10 = _mm_sub_ps(ix1,jx0);
2501 dy10 = _mm_sub_ps(iy1,jy0);
2502 dz10 = _mm_sub_ps(iz1,jz0);
2503 dx11 = _mm_sub_ps(ix1,jx1);
2504 dy11 = _mm_sub_ps(iy1,jy1);
2505 dz11 = _mm_sub_ps(iz1,jz1);
2506 dx12 = _mm_sub_ps(ix1,jx2);
2507 dy12 = _mm_sub_ps(iy1,jy2);
2508 dz12 = _mm_sub_ps(iz1,jz2);
2509 dx20 = _mm_sub_ps(ix2,jx0);
2510 dy20 = _mm_sub_ps(iy2,jy0);
2511 dz20 = _mm_sub_ps(iz2,jz0);
2512 dx21 = _mm_sub_ps(ix2,jx1);
2513 dy21 = _mm_sub_ps(iy2,jy1);
2514 dz21 = _mm_sub_ps(iz2,jz1);
2515 dx22 = _mm_sub_ps(ix2,jx2);
2516 dy22 = _mm_sub_ps(iy2,jy2);
2517 dz22 = _mm_sub_ps(iz2,jz2);
2519 /* Calculate squared distance and things based on it */
2520 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2521 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
2522 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
2523 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
2524 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2525 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2526 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
2527 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2528 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2530 rinv00 = gmx_mm_invsqrt_ps(rsq00);
2531 rinv01 = gmx_mm_invsqrt_ps(rsq01);
2532 rinv02 = gmx_mm_invsqrt_ps(rsq02);
2533 rinv10 = gmx_mm_invsqrt_ps(rsq10);
2534 rinv11 = gmx_mm_invsqrt_ps(rsq11);
2535 rinv12 = gmx_mm_invsqrt_ps(rsq12);
2536 rinv20 = gmx_mm_invsqrt_ps(rsq20);
2537 rinv21 = gmx_mm_invsqrt_ps(rsq21);
2538 rinv22 = gmx_mm_invsqrt_ps(rsq22);
2540 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
2541 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
2542 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
2543 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
2544 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2545 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2546 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
2547 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2548 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2550 fjx0 = _mm_setzero_ps();
2551 fjy0 = _mm_setzero_ps();
2552 fjz0 = _mm_setzero_ps();
2553 fjx1 = _mm_setzero_ps();
2554 fjy1 = _mm_setzero_ps();
2555 fjz1 = _mm_setzero_ps();
2556 fjx2 = _mm_setzero_ps();
2557 fjy2 = _mm_setzero_ps();
2558 fjz2 = _mm_setzero_ps();
2560 /**************************
2561 * CALCULATE INTERACTIONS *
2562 **************************/
2564 if (gmx_mm_any_lt(rsq00,rcutoff2))
2567 r00 = _mm_mul_ps(rsq00,rinv00);
2568 r00 = _mm_andnot_ps(dummy_mask,r00);
2570 /* EWALD ELECTROSTATICS */
2572 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2573 ewrt = _mm_mul_ps(r00,ewtabscale);
2574 ewitab = _mm_cvttps_epi32(ewrt);
2575 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2576 ewitab = _mm_slli_epi32(ewitab,2);
2577 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2578 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2579 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2580 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2581 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2582 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2583 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2584 velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
2585 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
2587 /* LENNARD-JONES DISPERSION/REPULSION */
2589 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2590 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
2591 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
2592 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
2593 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
2595 d = _mm_sub_ps(r00,rswitch);
2596 d = _mm_max_ps(d,_mm_setzero_ps());
2597 d2 = _mm_mul_ps(d,d);
2598 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2600 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2602 /* Evaluate switch function */
2603 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2604 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
2605 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
2606 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
2608 fscal = _mm_add_ps(felec,fvdw);
2610 fscal = _mm_and_ps(fscal,cutoff_mask);
2612 fscal = _mm_andnot_ps(dummy_mask,fscal);
2614 /* Calculate temporary vectorial force */
2615 tx = _mm_mul_ps(fscal,dx00);
2616 ty = _mm_mul_ps(fscal,dy00);
2617 tz = _mm_mul_ps(fscal,dz00);
2619 /* Update vectorial force */
2620 fix0 = _mm_add_ps(fix0,tx);
2621 fiy0 = _mm_add_ps(fiy0,ty);
2622 fiz0 = _mm_add_ps(fiz0,tz);
2624 fjx0 = _mm_add_ps(fjx0,tx);
2625 fjy0 = _mm_add_ps(fjy0,ty);
2626 fjz0 = _mm_add_ps(fjz0,tz);
2630 /**************************
2631 * CALCULATE INTERACTIONS *
2632 **************************/
2634 if (gmx_mm_any_lt(rsq01,rcutoff2))
2637 r01 = _mm_mul_ps(rsq01,rinv01);
2638 r01 = _mm_andnot_ps(dummy_mask,r01);
2640 /* EWALD ELECTROSTATICS */
2642 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2643 ewrt = _mm_mul_ps(r01,ewtabscale);
2644 ewitab = _mm_cvttps_epi32(ewrt);
2645 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2646 ewitab = _mm_slli_epi32(ewitab,2);
2647 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2648 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2649 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2650 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2651 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2652 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2653 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2654 velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
2655 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
2657 d = _mm_sub_ps(r01,rswitch);
2658 d = _mm_max_ps(d,_mm_setzero_ps());
2659 d2 = _mm_mul_ps(d,d);
2660 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2662 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2664 /* Evaluate switch function */
2665 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2666 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
2667 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
2669 fscal = felec;
2671 fscal = _mm_and_ps(fscal,cutoff_mask);
2673 fscal = _mm_andnot_ps(dummy_mask,fscal);
2675 /* Calculate temporary vectorial force */
2676 tx = _mm_mul_ps(fscal,dx01);
2677 ty = _mm_mul_ps(fscal,dy01);
2678 tz = _mm_mul_ps(fscal,dz01);
2680 /* Update vectorial force */
2681 fix0 = _mm_add_ps(fix0,tx);
2682 fiy0 = _mm_add_ps(fiy0,ty);
2683 fiz0 = _mm_add_ps(fiz0,tz);
2685 fjx1 = _mm_add_ps(fjx1,tx);
2686 fjy1 = _mm_add_ps(fjy1,ty);
2687 fjz1 = _mm_add_ps(fjz1,tz);
2691 /**************************
2692 * CALCULATE INTERACTIONS *
2693 **************************/
2695 if (gmx_mm_any_lt(rsq02,rcutoff2))
2698 r02 = _mm_mul_ps(rsq02,rinv02);
2699 r02 = _mm_andnot_ps(dummy_mask,r02);
2701 /* EWALD ELECTROSTATICS */
2703 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2704 ewrt = _mm_mul_ps(r02,ewtabscale);
2705 ewitab = _mm_cvttps_epi32(ewrt);
2706 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2707 ewitab = _mm_slli_epi32(ewitab,2);
2708 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2709 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2710 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2711 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2712 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2713 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2714 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2715 velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
2716 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
2718 d = _mm_sub_ps(r02,rswitch);
2719 d = _mm_max_ps(d,_mm_setzero_ps());
2720 d2 = _mm_mul_ps(d,d);
2721 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2723 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2725 /* Evaluate switch function */
2726 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2727 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
2728 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
2730 fscal = felec;
2732 fscal = _mm_and_ps(fscal,cutoff_mask);
2734 fscal = _mm_andnot_ps(dummy_mask,fscal);
2736 /* Calculate temporary vectorial force */
2737 tx = _mm_mul_ps(fscal,dx02);
2738 ty = _mm_mul_ps(fscal,dy02);
2739 tz = _mm_mul_ps(fscal,dz02);
2741 /* Update vectorial force */
2742 fix0 = _mm_add_ps(fix0,tx);
2743 fiy0 = _mm_add_ps(fiy0,ty);
2744 fiz0 = _mm_add_ps(fiz0,tz);
2746 fjx2 = _mm_add_ps(fjx2,tx);
2747 fjy2 = _mm_add_ps(fjy2,ty);
2748 fjz2 = _mm_add_ps(fjz2,tz);
2752 /**************************
2753 * CALCULATE INTERACTIONS *
2754 **************************/
2756 if (gmx_mm_any_lt(rsq10,rcutoff2))
2759 r10 = _mm_mul_ps(rsq10,rinv10);
2760 r10 = _mm_andnot_ps(dummy_mask,r10);
2762 /* EWALD ELECTROSTATICS */
2764 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2765 ewrt = _mm_mul_ps(r10,ewtabscale);
2766 ewitab = _mm_cvttps_epi32(ewrt);
2767 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2768 ewitab = _mm_slli_epi32(ewitab,2);
2769 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2770 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2771 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2772 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2773 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2774 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2775 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2776 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
2777 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2779 d = _mm_sub_ps(r10,rswitch);
2780 d = _mm_max_ps(d,_mm_setzero_ps());
2781 d2 = _mm_mul_ps(d,d);
2782 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2784 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2786 /* Evaluate switch function */
2787 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2788 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
2789 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
2791 fscal = felec;
2793 fscal = _mm_and_ps(fscal,cutoff_mask);
2795 fscal = _mm_andnot_ps(dummy_mask,fscal);
2797 /* Calculate temporary vectorial force */
2798 tx = _mm_mul_ps(fscal,dx10);
2799 ty = _mm_mul_ps(fscal,dy10);
2800 tz = _mm_mul_ps(fscal,dz10);
2802 /* Update vectorial force */
2803 fix1 = _mm_add_ps(fix1,tx);
2804 fiy1 = _mm_add_ps(fiy1,ty);
2805 fiz1 = _mm_add_ps(fiz1,tz);
2807 fjx0 = _mm_add_ps(fjx0,tx);
2808 fjy0 = _mm_add_ps(fjy0,ty);
2809 fjz0 = _mm_add_ps(fjz0,tz);
2813 /**************************
2814 * CALCULATE INTERACTIONS *
2815 **************************/
2817 if (gmx_mm_any_lt(rsq11,rcutoff2))
2820 r11 = _mm_mul_ps(rsq11,rinv11);
2821 r11 = _mm_andnot_ps(dummy_mask,r11);
2823 /* EWALD ELECTROSTATICS */
2825 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2826 ewrt = _mm_mul_ps(r11,ewtabscale);
2827 ewitab = _mm_cvttps_epi32(ewrt);
2828 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2829 ewitab = _mm_slli_epi32(ewitab,2);
2830 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2831 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2832 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2833 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2834 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2835 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2836 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2837 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2838 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2840 d = _mm_sub_ps(r11,rswitch);
2841 d = _mm_max_ps(d,_mm_setzero_ps());
2842 d2 = _mm_mul_ps(d,d);
2843 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2845 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2847 /* Evaluate switch function */
2848 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2849 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2850 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2852 fscal = felec;
2854 fscal = _mm_and_ps(fscal,cutoff_mask);
2856 fscal = _mm_andnot_ps(dummy_mask,fscal);
2858 /* Calculate temporary vectorial force */
2859 tx = _mm_mul_ps(fscal,dx11);
2860 ty = _mm_mul_ps(fscal,dy11);
2861 tz = _mm_mul_ps(fscal,dz11);
2863 /* Update vectorial force */
2864 fix1 = _mm_add_ps(fix1,tx);
2865 fiy1 = _mm_add_ps(fiy1,ty);
2866 fiz1 = _mm_add_ps(fiz1,tz);
2868 fjx1 = _mm_add_ps(fjx1,tx);
2869 fjy1 = _mm_add_ps(fjy1,ty);
2870 fjz1 = _mm_add_ps(fjz1,tz);
2874 /**************************
2875 * CALCULATE INTERACTIONS *
2876 **************************/
2878 if (gmx_mm_any_lt(rsq12,rcutoff2))
2881 r12 = _mm_mul_ps(rsq12,rinv12);
2882 r12 = _mm_andnot_ps(dummy_mask,r12);
2884 /* EWALD ELECTROSTATICS */
2886 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2887 ewrt = _mm_mul_ps(r12,ewtabscale);
2888 ewitab = _mm_cvttps_epi32(ewrt);
2889 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2890 ewitab = _mm_slli_epi32(ewitab,2);
2891 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2892 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2893 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2894 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2895 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2896 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2897 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2898 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2899 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2901 d = _mm_sub_ps(r12,rswitch);
2902 d = _mm_max_ps(d,_mm_setzero_ps());
2903 d2 = _mm_mul_ps(d,d);
2904 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2906 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2908 /* Evaluate switch function */
2909 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2910 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2911 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2913 fscal = felec;
2915 fscal = _mm_and_ps(fscal,cutoff_mask);
2917 fscal = _mm_andnot_ps(dummy_mask,fscal);
2919 /* Calculate temporary vectorial force */
2920 tx = _mm_mul_ps(fscal,dx12);
2921 ty = _mm_mul_ps(fscal,dy12);
2922 tz = _mm_mul_ps(fscal,dz12);
2924 /* Update vectorial force */
2925 fix1 = _mm_add_ps(fix1,tx);
2926 fiy1 = _mm_add_ps(fiy1,ty);
2927 fiz1 = _mm_add_ps(fiz1,tz);
2929 fjx2 = _mm_add_ps(fjx2,tx);
2930 fjy2 = _mm_add_ps(fjy2,ty);
2931 fjz2 = _mm_add_ps(fjz2,tz);
2935 /**************************
2936 * CALCULATE INTERACTIONS *
2937 **************************/
2939 if (gmx_mm_any_lt(rsq20,rcutoff2))
2942 r20 = _mm_mul_ps(rsq20,rinv20);
2943 r20 = _mm_andnot_ps(dummy_mask,r20);
2945 /* EWALD ELECTROSTATICS */
2947 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2948 ewrt = _mm_mul_ps(r20,ewtabscale);
2949 ewitab = _mm_cvttps_epi32(ewrt);
2950 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2951 ewitab = _mm_slli_epi32(ewitab,2);
2952 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2953 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2954 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2955 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2956 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2957 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2958 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2959 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
2960 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2962 d = _mm_sub_ps(r20,rswitch);
2963 d = _mm_max_ps(d,_mm_setzero_ps());
2964 d2 = _mm_mul_ps(d,d);
2965 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2967 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2969 /* Evaluate switch function */
2970 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2971 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
2972 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
2974 fscal = felec;
2976 fscal = _mm_and_ps(fscal,cutoff_mask);
2978 fscal = _mm_andnot_ps(dummy_mask,fscal);
2980 /* Calculate temporary vectorial force */
2981 tx = _mm_mul_ps(fscal,dx20);
2982 ty = _mm_mul_ps(fscal,dy20);
2983 tz = _mm_mul_ps(fscal,dz20);
2985 /* Update vectorial force */
2986 fix2 = _mm_add_ps(fix2,tx);
2987 fiy2 = _mm_add_ps(fiy2,ty);
2988 fiz2 = _mm_add_ps(fiz2,tz);
2990 fjx0 = _mm_add_ps(fjx0,tx);
2991 fjy0 = _mm_add_ps(fjy0,ty);
2992 fjz0 = _mm_add_ps(fjz0,tz);
2996 /**************************
2997 * CALCULATE INTERACTIONS *
2998 **************************/
3000 if (gmx_mm_any_lt(rsq21,rcutoff2))
3003 r21 = _mm_mul_ps(rsq21,rinv21);
3004 r21 = _mm_andnot_ps(dummy_mask,r21);
3006 /* EWALD ELECTROSTATICS */
3008 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3009 ewrt = _mm_mul_ps(r21,ewtabscale);
3010 ewitab = _mm_cvttps_epi32(ewrt);
3011 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
3012 ewitab = _mm_slli_epi32(ewitab,2);
3013 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3014 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3015 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3016 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3017 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3018 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3019 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3020 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
3021 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
3023 d = _mm_sub_ps(r21,rswitch);
3024 d = _mm_max_ps(d,_mm_setzero_ps());
3025 d2 = _mm_mul_ps(d,d);
3026 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3028 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3030 /* Evaluate switch function */
3031 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3032 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
3033 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
3035 fscal = felec;
3037 fscal = _mm_and_ps(fscal,cutoff_mask);
3039 fscal = _mm_andnot_ps(dummy_mask,fscal);
3041 /* Calculate temporary vectorial force */
3042 tx = _mm_mul_ps(fscal,dx21);
3043 ty = _mm_mul_ps(fscal,dy21);
3044 tz = _mm_mul_ps(fscal,dz21);
3046 /* Update vectorial force */
3047 fix2 = _mm_add_ps(fix2,tx);
3048 fiy2 = _mm_add_ps(fiy2,ty);
3049 fiz2 = _mm_add_ps(fiz2,tz);
3051 fjx1 = _mm_add_ps(fjx1,tx);
3052 fjy1 = _mm_add_ps(fjy1,ty);
3053 fjz1 = _mm_add_ps(fjz1,tz);
3057 /**************************
3058 * CALCULATE INTERACTIONS *
3059 **************************/
3061 if (gmx_mm_any_lt(rsq22,rcutoff2))
3064 r22 = _mm_mul_ps(rsq22,rinv22);
3065 r22 = _mm_andnot_ps(dummy_mask,r22);
3067 /* EWALD ELECTROSTATICS */
3069 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3070 ewrt = _mm_mul_ps(r22,ewtabscale);
3071 ewitab = _mm_cvttps_epi32(ewrt);
3072 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
3073 ewitab = _mm_slli_epi32(ewitab,2);
3074 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3075 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3076 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3077 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3078 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3079 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3080 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3081 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
3082 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
3084 d = _mm_sub_ps(r22,rswitch);
3085 d = _mm_max_ps(d,_mm_setzero_ps());
3086 d2 = _mm_mul_ps(d,d);
3087 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3089 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3091 /* Evaluate switch function */
3092 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3093 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
3094 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
3096 fscal = felec;
3098 fscal = _mm_and_ps(fscal,cutoff_mask);
3100 fscal = _mm_andnot_ps(dummy_mask,fscal);
3102 /* Calculate temporary vectorial force */
3103 tx = _mm_mul_ps(fscal,dx22);
3104 ty = _mm_mul_ps(fscal,dy22);
3105 tz = _mm_mul_ps(fscal,dz22);
3107 /* Update vectorial force */
3108 fix2 = _mm_add_ps(fix2,tx);
3109 fiy2 = _mm_add_ps(fiy2,ty);
3110 fiz2 = _mm_add_ps(fiz2,tz);
3112 fjx2 = _mm_add_ps(fjx2,tx);
3113 fjy2 = _mm_add_ps(fjy2,ty);
3114 fjz2 = _mm_add_ps(fjz2,tz);
3118 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
3119 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
3120 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
3121 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
3123 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
3124 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
3126 /* Inner loop uses 582 flops */
3129 /* End of innermost loop */
3131 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
3132 f+i_coord_offset,fshift+i_shift_offset);
3134 /* Increment number of inner iterations */
3135 inneriter += j_index_end - j_index_start;
3137 /* Outer loop uses 18 flops */
3140 /* Increment number of outer iterations */
3141 outeriter += nri;
3143 /* Update outer/inner flops */
3145 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*582);