Remove nb-parameters from t_forcerec
[gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_double / nb_kernel_ElecEwSw_VdwNone_GeomW4W4_sse2_double.c
blob5fe0fc30af2515a35093dca524ba33e72925a221
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_double kernel generator.
38 #include "gmxpre.h"
40 #include "config.h"
42 #include <math.h>
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_sse2_double.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sse2_double
51 * Electrostatics interaction: Ewald
52 * VdW interaction: None
53 * Geometry: Water4-Water4
54 * Calculate force/pot: PotentialAndForce
56 void
57 nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sse2_double
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73 int jnrA,jnrB;
74 int j_coord_offsetA,j_coord_offsetB;
75 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
76 real rcutoff_scalar;
77 real *shiftvec,*fshift,*x,*f;
78 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
79 int vdwioffset1;
80 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
81 int vdwioffset2;
82 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
83 int vdwioffset3;
84 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
85 int vdwjidx1A,vdwjidx1B;
86 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
87 int vdwjidx2A,vdwjidx2B;
88 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
89 int vdwjidx3A,vdwjidx3B;
90 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
91 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
92 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
93 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
94 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
95 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
96 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
97 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
98 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
99 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
100 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
101 real *charge;
102 __m128i ewitab;
103 __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
104 real *ewtab;
105 __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
106 real rswitch_scalar,d_scalar;
107 __m128d dummy_mask,cutoff_mask;
108 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
109 __m128d one = _mm_set1_pd(1.0);
110 __m128d two = _mm_set1_pd(2.0);
111 x = xx[0];
112 f = ff[0];
114 nri = nlist->nri;
115 iinr = nlist->iinr;
116 jindex = nlist->jindex;
117 jjnr = nlist->jjnr;
118 shiftidx = nlist->shift;
119 gid = nlist->gid;
120 shiftvec = fr->shift_vec[0];
121 fshift = fr->fshift[0];
122 facel = _mm_set1_pd(fr->ic->epsfac);
123 charge = mdatoms->chargeA;
125 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
126 ewtab = fr->ic->tabq_coul_FDV0;
127 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
128 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
130 /* Setup water-specific parameters */
131 inr = nlist->iinr[0];
132 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
133 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
134 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
136 jq1 = _mm_set1_pd(charge[inr+1]);
137 jq2 = _mm_set1_pd(charge[inr+2]);
138 jq3 = _mm_set1_pd(charge[inr+3]);
139 qq11 = _mm_mul_pd(iq1,jq1);
140 qq12 = _mm_mul_pd(iq1,jq2);
141 qq13 = _mm_mul_pd(iq1,jq3);
142 qq21 = _mm_mul_pd(iq2,jq1);
143 qq22 = _mm_mul_pd(iq2,jq2);
144 qq23 = _mm_mul_pd(iq2,jq3);
145 qq31 = _mm_mul_pd(iq3,jq1);
146 qq32 = _mm_mul_pd(iq3,jq2);
147 qq33 = _mm_mul_pd(iq3,jq3);
149 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
150 rcutoff_scalar = fr->ic->rcoulomb;
151 rcutoff = _mm_set1_pd(rcutoff_scalar);
152 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
154 rswitch_scalar = fr->ic->rcoulomb_switch;
155 rswitch = _mm_set1_pd(rswitch_scalar);
156 /* Setup switch parameters */
157 d_scalar = rcutoff_scalar-rswitch_scalar;
158 d = _mm_set1_pd(d_scalar);
159 swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
160 swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
161 swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
162 swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
163 swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
164 swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
166 /* Avoid stupid compiler warnings */
167 jnrA = jnrB = 0;
168 j_coord_offsetA = 0;
169 j_coord_offsetB = 0;
171 outeriter = 0;
172 inneriter = 0;
174 /* Start outer loop over neighborlists */
175 for(iidx=0; iidx<nri; iidx++)
177 /* Load shift vector for this list */
178 i_shift_offset = DIM*shiftidx[iidx];
180 /* Load limits for loop over neighbors */
181 j_index_start = jindex[iidx];
182 j_index_end = jindex[iidx+1];
184 /* Get outer coordinate index */
185 inr = iinr[iidx];
186 i_coord_offset = DIM*inr;
188 /* Load i particle coords and add shift vector */
189 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
190 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
192 fix1 = _mm_setzero_pd();
193 fiy1 = _mm_setzero_pd();
194 fiz1 = _mm_setzero_pd();
195 fix2 = _mm_setzero_pd();
196 fiy2 = _mm_setzero_pd();
197 fiz2 = _mm_setzero_pd();
198 fix3 = _mm_setzero_pd();
199 fiy3 = _mm_setzero_pd();
200 fiz3 = _mm_setzero_pd();
202 /* Reset potential sums */
203 velecsum = _mm_setzero_pd();
205 /* Start inner kernel loop */
206 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
209 /* Get j neighbor index, and coordinate index */
210 jnrA = jjnr[jidx];
211 jnrB = jjnr[jidx+1];
212 j_coord_offsetA = DIM*jnrA;
213 j_coord_offsetB = DIM*jnrB;
215 /* load j atom coordinates */
216 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
217 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
219 /* Calculate displacement vector */
220 dx11 = _mm_sub_pd(ix1,jx1);
221 dy11 = _mm_sub_pd(iy1,jy1);
222 dz11 = _mm_sub_pd(iz1,jz1);
223 dx12 = _mm_sub_pd(ix1,jx2);
224 dy12 = _mm_sub_pd(iy1,jy2);
225 dz12 = _mm_sub_pd(iz1,jz2);
226 dx13 = _mm_sub_pd(ix1,jx3);
227 dy13 = _mm_sub_pd(iy1,jy3);
228 dz13 = _mm_sub_pd(iz1,jz3);
229 dx21 = _mm_sub_pd(ix2,jx1);
230 dy21 = _mm_sub_pd(iy2,jy1);
231 dz21 = _mm_sub_pd(iz2,jz1);
232 dx22 = _mm_sub_pd(ix2,jx2);
233 dy22 = _mm_sub_pd(iy2,jy2);
234 dz22 = _mm_sub_pd(iz2,jz2);
235 dx23 = _mm_sub_pd(ix2,jx3);
236 dy23 = _mm_sub_pd(iy2,jy3);
237 dz23 = _mm_sub_pd(iz2,jz3);
238 dx31 = _mm_sub_pd(ix3,jx1);
239 dy31 = _mm_sub_pd(iy3,jy1);
240 dz31 = _mm_sub_pd(iz3,jz1);
241 dx32 = _mm_sub_pd(ix3,jx2);
242 dy32 = _mm_sub_pd(iy3,jy2);
243 dz32 = _mm_sub_pd(iz3,jz2);
244 dx33 = _mm_sub_pd(ix3,jx3);
245 dy33 = _mm_sub_pd(iy3,jy3);
246 dz33 = _mm_sub_pd(iz3,jz3);
248 /* Calculate squared distance and things based on it */
249 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
250 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
251 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
252 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
253 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
254 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
255 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
256 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
257 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
259 rinv11 = sse2_invsqrt_d(rsq11);
260 rinv12 = sse2_invsqrt_d(rsq12);
261 rinv13 = sse2_invsqrt_d(rsq13);
262 rinv21 = sse2_invsqrt_d(rsq21);
263 rinv22 = sse2_invsqrt_d(rsq22);
264 rinv23 = sse2_invsqrt_d(rsq23);
265 rinv31 = sse2_invsqrt_d(rsq31);
266 rinv32 = sse2_invsqrt_d(rsq32);
267 rinv33 = sse2_invsqrt_d(rsq33);
269 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
270 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
271 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
272 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
273 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
274 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
275 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
276 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
277 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
279 fjx1 = _mm_setzero_pd();
280 fjy1 = _mm_setzero_pd();
281 fjz1 = _mm_setzero_pd();
282 fjx2 = _mm_setzero_pd();
283 fjy2 = _mm_setzero_pd();
284 fjz2 = _mm_setzero_pd();
285 fjx3 = _mm_setzero_pd();
286 fjy3 = _mm_setzero_pd();
287 fjz3 = _mm_setzero_pd();
289 /**************************
290 * CALCULATE INTERACTIONS *
291 **************************/
293 if (gmx_mm_any_lt(rsq11,rcutoff2))
296 r11 = _mm_mul_pd(rsq11,rinv11);
298 /* EWALD ELECTROSTATICS */
300 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
301 ewrt = _mm_mul_pd(r11,ewtabscale);
302 ewitab = _mm_cvttpd_epi32(ewrt);
303 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
304 ewitab = _mm_slli_epi32(ewitab,2);
305 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
306 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
307 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
308 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
309 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
310 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
311 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
312 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
313 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
314 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
316 d = _mm_sub_pd(r11,rswitch);
317 d = _mm_max_pd(d,_mm_setzero_pd());
318 d2 = _mm_mul_pd(d,d);
319 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
321 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
323 /* Evaluate switch function */
324 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
325 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
326 velec = _mm_mul_pd(velec,sw);
327 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
329 /* Update potential sum for this i atom from the interaction with this j atom. */
330 velec = _mm_and_pd(velec,cutoff_mask);
331 velecsum = _mm_add_pd(velecsum,velec);
333 fscal = felec;
335 fscal = _mm_and_pd(fscal,cutoff_mask);
337 /* Calculate temporary vectorial force */
338 tx = _mm_mul_pd(fscal,dx11);
339 ty = _mm_mul_pd(fscal,dy11);
340 tz = _mm_mul_pd(fscal,dz11);
342 /* Update vectorial force */
343 fix1 = _mm_add_pd(fix1,tx);
344 fiy1 = _mm_add_pd(fiy1,ty);
345 fiz1 = _mm_add_pd(fiz1,tz);
347 fjx1 = _mm_add_pd(fjx1,tx);
348 fjy1 = _mm_add_pd(fjy1,ty);
349 fjz1 = _mm_add_pd(fjz1,tz);
353 /**************************
354 * CALCULATE INTERACTIONS *
355 **************************/
357 if (gmx_mm_any_lt(rsq12,rcutoff2))
360 r12 = _mm_mul_pd(rsq12,rinv12);
362 /* EWALD ELECTROSTATICS */
364 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
365 ewrt = _mm_mul_pd(r12,ewtabscale);
366 ewitab = _mm_cvttpd_epi32(ewrt);
367 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
368 ewitab = _mm_slli_epi32(ewitab,2);
369 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
370 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
371 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
372 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
373 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
374 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
375 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
376 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
377 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
378 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
380 d = _mm_sub_pd(r12,rswitch);
381 d = _mm_max_pd(d,_mm_setzero_pd());
382 d2 = _mm_mul_pd(d,d);
383 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
385 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
387 /* Evaluate switch function */
388 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
389 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
390 velec = _mm_mul_pd(velec,sw);
391 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
393 /* Update potential sum for this i atom from the interaction with this j atom. */
394 velec = _mm_and_pd(velec,cutoff_mask);
395 velecsum = _mm_add_pd(velecsum,velec);
397 fscal = felec;
399 fscal = _mm_and_pd(fscal,cutoff_mask);
401 /* Calculate temporary vectorial force */
402 tx = _mm_mul_pd(fscal,dx12);
403 ty = _mm_mul_pd(fscal,dy12);
404 tz = _mm_mul_pd(fscal,dz12);
406 /* Update vectorial force */
407 fix1 = _mm_add_pd(fix1,tx);
408 fiy1 = _mm_add_pd(fiy1,ty);
409 fiz1 = _mm_add_pd(fiz1,tz);
411 fjx2 = _mm_add_pd(fjx2,tx);
412 fjy2 = _mm_add_pd(fjy2,ty);
413 fjz2 = _mm_add_pd(fjz2,tz);
417 /**************************
418 * CALCULATE INTERACTIONS *
419 **************************/
421 if (gmx_mm_any_lt(rsq13,rcutoff2))
424 r13 = _mm_mul_pd(rsq13,rinv13);
426 /* EWALD ELECTROSTATICS */
428 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
429 ewrt = _mm_mul_pd(r13,ewtabscale);
430 ewitab = _mm_cvttpd_epi32(ewrt);
431 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
432 ewitab = _mm_slli_epi32(ewitab,2);
433 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
434 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
435 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
436 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
437 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
438 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
439 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
440 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
441 velec = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
442 felec = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
444 d = _mm_sub_pd(r13,rswitch);
445 d = _mm_max_pd(d,_mm_setzero_pd());
446 d2 = _mm_mul_pd(d,d);
447 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
449 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
451 /* Evaluate switch function */
452 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
453 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv13,_mm_mul_pd(velec,dsw)) );
454 velec = _mm_mul_pd(velec,sw);
455 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
457 /* Update potential sum for this i atom from the interaction with this j atom. */
458 velec = _mm_and_pd(velec,cutoff_mask);
459 velecsum = _mm_add_pd(velecsum,velec);
461 fscal = felec;
463 fscal = _mm_and_pd(fscal,cutoff_mask);
465 /* Calculate temporary vectorial force */
466 tx = _mm_mul_pd(fscal,dx13);
467 ty = _mm_mul_pd(fscal,dy13);
468 tz = _mm_mul_pd(fscal,dz13);
470 /* Update vectorial force */
471 fix1 = _mm_add_pd(fix1,tx);
472 fiy1 = _mm_add_pd(fiy1,ty);
473 fiz1 = _mm_add_pd(fiz1,tz);
475 fjx3 = _mm_add_pd(fjx3,tx);
476 fjy3 = _mm_add_pd(fjy3,ty);
477 fjz3 = _mm_add_pd(fjz3,tz);
481 /**************************
482 * CALCULATE INTERACTIONS *
483 **************************/
485 if (gmx_mm_any_lt(rsq21,rcutoff2))
488 r21 = _mm_mul_pd(rsq21,rinv21);
490 /* EWALD ELECTROSTATICS */
492 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
493 ewrt = _mm_mul_pd(r21,ewtabscale);
494 ewitab = _mm_cvttpd_epi32(ewrt);
495 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
496 ewitab = _mm_slli_epi32(ewitab,2);
497 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
498 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
499 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
500 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
501 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
502 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
503 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
504 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
505 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
506 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
508 d = _mm_sub_pd(r21,rswitch);
509 d = _mm_max_pd(d,_mm_setzero_pd());
510 d2 = _mm_mul_pd(d,d);
511 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
513 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
515 /* Evaluate switch function */
516 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
517 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
518 velec = _mm_mul_pd(velec,sw);
519 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
521 /* Update potential sum for this i atom from the interaction with this j atom. */
522 velec = _mm_and_pd(velec,cutoff_mask);
523 velecsum = _mm_add_pd(velecsum,velec);
525 fscal = felec;
527 fscal = _mm_and_pd(fscal,cutoff_mask);
529 /* Calculate temporary vectorial force */
530 tx = _mm_mul_pd(fscal,dx21);
531 ty = _mm_mul_pd(fscal,dy21);
532 tz = _mm_mul_pd(fscal,dz21);
534 /* Update vectorial force */
535 fix2 = _mm_add_pd(fix2,tx);
536 fiy2 = _mm_add_pd(fiy2,ty);
537 fiz2 = _mm_add_pd(fiz2,tz);
539 fjx1 = _mm_add_pd(fjx1,tx);
540 fjy1 = _mm_add_pd(fjy1,ty);
541 fjz1 = _mm_add_pd(fjz1,tz);
545 /**************************
546 * CALCULATE INTERACTIONS *
547 **************************/
549 if (gmx_mm_any_lt(rsq22,rcutoff2))
552 r22 = _mm_mul_pd(rsq22,rinv22);
554 /* EWALD ELECTROSTATICS */
556 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
557 ewrt = _mm_mul_pd(r22,ewtabscale);
558 ewitab = _mm_cvttpd_epi32(ewrt);
559 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
560 ewitab = _mm_slli_epi32(ewitab,2);
561 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
562 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
563 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
564 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
565 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
566 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
567 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
568 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
569 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
570 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
572 d = _mm_sub_pd(r22,rswitch);
573 d = _mm_max_pd(d,_mm_setzero_pd());
574 d2 = _mm_mul_pd(d,d);
575 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
577 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
579 /* Evaluate switch function */
580 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
581 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
582 velec = _mm_mul_pd(velec,sw);
583 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
585 /* Update potential sum for this i atom from the interaction with this j atom. */
586 velec = _mm_and_pd(velec,cutoff_mask);
587 velecsum = _mm_add_pd(velecsum,velec);
589 fscal = felec;
591 fscal = _mm_and_pd(fscal,cutoff_mask);
593 /* Calculate temporary vectorial force */
594 tx = _mm_mul_pd(fscal,dx22);
595 ty = _mm_mul_pd(fscal,dy22);
596 tz = _mm_mul_pd(fscal,dz22);
598 /* Update vectorial force */
599 fix2 = _mm_add_pd(fix2,tx);
600 fiy2 = _mm_add_pd(fiy2,ty);
601 fiz2 = _mm_add_pd(fiz2,tz);
603 fjx2 = _mm_add_pd(fjx2,tx);
604 fjy2 = _mm_add_pd(fjy2,ty);
605 fjz2 = _mm_add_pd(fjz2,tz);
609 /**************************
610 * CALCULATE INTERACTIONS *
611 **************************/
613 if (gmx_mm_any_lt(rsq23,rcutoff2))
616 r23 = _mm_mul_pd(rsq23,rinv23);
618 /* EWALD ELECTROSTATICS */
620 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
621 ewrt = _mm_mul_pd(r23,ewtabscale);
622 ewitab = _mm_cvttpd_epi32(ewrt);
623 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
624 ewitab = _mm_slli_epi32(ewitab,2);
625 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
626 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
627 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
628 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
629 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
630 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
631 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
632 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
633 velec = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
634 felec = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
636 d = _mm_sub_pd(r23,rswitch);
637 d = _mm_max_pd(d,_mm_setzero_pd());
638 d2 = _mm_mul_pd(d,d);
639 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
641 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
643 /* Evaluate switch function */
644 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
645 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv23,_mm_mul_pd(velec,dsw)) );
646 velec = _mm_mul_pd(velec,sw);
647 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
649 /* Update potential sum for this i atom from the interaction with this j atom. */
650 velec = _mm_and_pd(velec,cutoff_mask);
651 velecsum = _mm_add_pd(velecsum,velec);
653 fscal = felec;
655 fscal = _mm_and_pd(fscal,cutoff_mask);
657 /* Calculate temporary vectorial force */
658 tx = _mm_mul_pd(fscal,dx23);
659 ty = _mm_mul_pd(fscal,dy23);
660 tz = _mm_mul_pd(fscal,dz23);
662 /* Update vectorial force */
663 fix2 = _mm_add_pd(fix2,tx);
664 fiy2 = _mm_add_pd(fiy2,ty);
665 fiz2 = _mm_add_pd(fiz2,tz);
667 fjx3 = _mm_add_pd(fjx3,tx);
668 fjy3 = _mm_add_pd(fjy3,ty);
669 fjz3 = _mm_add_pd(fjz3,tz);
673 /**************************
674 * CALCULATE INTERACTIONS *
675 **************************/
677 if (gmx_mm_any_lt(rsq31,rcutoff2))
680 r31 = _mm_mul_pd(rsq31,rinv31);
682 /* EWALD ELECTROSTATICS */
684 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
685 ewrt = _mm_mul_pd(r31,ewtabscale);
686 ewitab = _mm_cvttpd_epi32(ewrt);
687 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
688 ewitab = _mm_slli_epi32(ewitab,2);
689 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
690 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
691 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
692 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
693 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
694 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
695 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
696 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
697 velec = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
698 felec = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
700 d = _mm_sub_pd(r31,rswitch);
701 d = _mm_max_pd(d,_mm_setzero_pd());
702 d2 = _mm_mul_pd(d,d);
703 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
705 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
707 /* Evaluate switch function */
708 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
709 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv31,_mm_mul_pd(velec,dsw)) );
710 velec = _mm_mul_pd(velec,sw);
711 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
713 /* Update potential sum for this i atom from the interaction with this j atom. */
714 velec = _mm_and_pd(velec,cutoff_mask);
715 velecsum = _mm_add_pd(velecsum,velec);
717 fscal = felec;
719 fscal = _mm_and_pd(fscal,cutoff_mask);
721 /* Calculate temporary vectorial force */
722 tx = _mm_mul_pd(fscal,dx31);
723 ty = _mm_mul_pd(fscal,dy31);
724 tz = _mm_mul_pd(fscal,dz31);
726 /* Update vectorial force */
727 fix3 = _mm_add_pd(fix3,tx);
728 fiy3 = _mm_add_pd(fiy3,ty);
729 fiz3 = _mm_add_pd(fiz3,tz);
731 fjx1 = _mm_add_pd(fjx1,tx);
732 fjy1 = _mm_add_pd(fjy1,ty);
733 fjz1 = _mm_add_pd(fjz1,tz);
737 /**************************
738 * CALCULATE INTERACTIONS *
739 **************************/
741 if (gmx_mm_any_lt(rsq32,rcutoff2))
744 r32 = _mm_mul_pd(rsq32,rinv32);
746 /* EWALD ELECTROSTATICS */
748 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
749 ewrt = _mm_mul_pd(r32,ewtabscale);
750 ewitab = _mm_cvttpd_epi32(ewrt);
751 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
752 ewitab = _mm_slli_epi32(ewitab,2);
753 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
754 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
755 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
756 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
757 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
758 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
759 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
760 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
761 velec = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
762 felec = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
764 d = _mm_sub_pd(r32,rswitch);
765 d = _mm_max_pd(d,_mm_setzero_pd());
766 d2 = _mm_mul_pd(d,d);
767 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
769 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
771 /* Evaluate switch function */
772 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
773 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv32,_mm_mul_pd(velec,dsw)) );
774 velec = _mm_mul_pd(velec,sw);
775 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
777 /* Update potential sum for this i atom from the interaction with this j atom. */
778 velec = _mm_and_pd(velec,cutoff_mask);
779 velecsum = _mm_add_pd(velecsum,velec);
781 fscal = felec;
783 fscal = _mm_and_pd(fscal,cutoff_mask);
785 /* Calculate temporary vectorial force */
786 tx = _mm_mul_pd(fscal,dx32);
787 ty = _mm_mul_pd(fscal,dy32);
788 tz = _mm_mul_pd(fscal,dz32);
790 /* Update vectorial force */
791 fix3 = _mm_add_pd(fix3,tx);
792 fiy3 = _mm_add_pd(fiy3,ty);
793 fiz3 = _mm_add_pd(fiz3,tz);
795 fjx2 = _mm_add_pd(fjx2,tx);
796 fjy2 = _mm_add_pd(fjy2,ty);
797 fjz2 = _mm_add_pd(fjz2,tz);
801 /**************************
802 * CALCULATE INTERACTIONS *
803 **************************/
805 if (gmx_mm_any_lt(rsq33,rcutoff2))
808 r33 = _mm_mul_pd(rsq33,rinv33);
810 /* EWALD ELECTROSTATICS */
812 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
813 ewrt = _mm_mul_pd(r33,ewtabscale);
814 ewitab = _mm_cvttpd_epi32(ewrt);
815 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
816 ewitab = _mm_slli_epi32(ewitab,2);
817 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
818 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
819 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
820 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
821 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
822 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
823 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
824 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
825 velec = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
826 felec = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
828 d = _mm_sub_pd(r33,rswitch);
829 d = _mm_max_pd(d,_mm_setzero_pd());
830 d2 = _mm_mul_pd(d,d);
831 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
833 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
835 /* Evaluate switch function */
836 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
837 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv33,_mm_mul_pd(velec,dsw)) );
838 velec = _mm_mul_pd(velec,sw);
839 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
841 /* Update potential sum for this i atom from the interaction with this j atom. */
842 velec = _mm_and_pd(velec,cutoff_mask);
843 velecsum = _mm_add_pd(velecsum,velec);
845 fscal = felec;
847 fscal = _mm_and_pd(fscal,cutoff_mask);
849 /* Calculate temporary vectorial force */
850 tx = _mm_mul_pd(fscal,dx33);
851 ty = _mm_mul_pd(fscal,dy33);
852 tz = _mm_mul_pd(fscal,dz33);
854 /* Update vectorial force */
855 fix3 = _mm_add_pd(fix3,tx);
856 fiy3 = _mm_add_pd(fiy3,ty);
857 fiz3 = _mm_add_pd(fiz3,tz);
859 fjx3 = _mm_add_pd(fjx3,tx);
860 fjy3 = _mm_add_pd(fjy3,ty);
861 fjz3 = _mm_add_pd(fjz3,tz);
865 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
867 /* Inner loop uses 585 flops */
870 if(jidx<j_index_end)
873 jnrA = jjnr[jidx];
874 j_coord_offsetA = DIM*jnrA;
876 /* load j atom coordinates */
877 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA+DIM,
878 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
880 /* Calculate displacement vector */
881 dx11 = _mm_sub_pd(ix1,jx1);
882 dy11 = _mm_sub_pd(iy1,jy1);
883 dz11 = _mm_sub_pd(iz1,jz1);
884 dx12 = _mm_sub_pd(ix1,jx2);
885 dy12 = _mm_sub_pd(iy1,jy2);
886 dz12 = _mm_sub_pd(iz1,jz2);
887 dx13 = _mm_sub_pd(ix1,jx3);
888 dy13 = _mm_sub_pd(iy1,jy3);
889 dz13 = _mm_sub_pd(iz1,jz3);
890 dx21 = _mm_sub_pd(ix2,jx1);
891 dy21 = _mm_sub_pd(iy2,jy1);
892 dz21 = _mm_sub_pd(iz2,jz1);
893 dx22 = _mm_sub_pd(ix2,jx2);
894 dy22 = _mm_sub_pd(iy2,jy2);
895 dz22 = _mm_sub_pd(iz2,jz2);
896 dx23 = _mm_sub_pd(ix2,jx3);
897 dy23 = _mm_sub_pd(iy2,jy3);
898 dz23 = _mm_sub_pd(iz2,jz3);
899 dx31 = _mm_sub_pd(ix3,jx1);
900 dy31 = _mm_sub_pd(iy3,jy1);
901 dz31 = _mm_sub_pd(iz3,jz1);
902 dx32 = _mm_sub_pd(ix3,jx2);
903 dy32 = _mm_sub_pd(iy3,jy2);
904 dz32 = _mm_sub_pd(iz3,jz2);
905 dx33 = _mm_sub_pd(ix3,jx3);
906 dy33 = _mm_sub_pd(iy3,jy3);
907 dz33 = _mm_sub_pd(iz3,jz3);
909 /* Calculate squared distance and things based on it */
910 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
911 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
912 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
913 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
914 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
915 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
916 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
917 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
918 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
920 rinv11 = sse2_invsqrt_d(rsq11);
921 rinv12 = sse2_invsqrt_d(rsq12);
922 rinv13 = sse2_invsqrt_d(rsq13);
923 rinv21 = sse2_invsqrt_d(rsq21);
924 rinv22 = sse2_invsqrt_d(rsq22);
925 rinv23 = sse2_invsqrt_d(rsq23);
926 rinv31 = sse2_invsqrt_d(rsq31);
927 rinv32 = sse2_invsqrt_d(rsq32);
928 rinv33 = sse2_invsqrt_d(rsq33);
930 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
931 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
932 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
933 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
934 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
935 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
936 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
937 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
938 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
940 fjx1 = _mm_setzero_pd();
941 fjy1 = _mm_setzero_pd();
942 fjz1 = _mm_setzero_pd();
943 fjx2 = _mm_setzero_pd();
944 fjy2 = _mm_setzero_pd();
945 fjz2 = _mm_setzero_pd();
946 fjx3 = _mm_setzero_pd();
947 fjy3 = _mm_setzero_pd();
948 fjz3 = _mm_setzero_pd();
950 /**************************
951 * CALCULATE INTERACTIONS *
952 **************************/
954 if (gmx_mm_any_lt(rsq11,rcutoff2))
957 r11 = _mm_mul_pd(rsq11,rinv11);
959 /* EWALD ELECTROSTATICS */
961 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
962 ewrt = _mm_mul_pd(r11,ewtabscale);
963 ewitab = _mm_cvttpd_epi32(ewrt);
964 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
965 ewitab = _mm_slli_epi32(ewitab,2);
966 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
967 ewtabD = _mm_setzero_pd();
968 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
969 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
970 ewtabFn = _mm_setzero_pd();
971 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
972 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
973 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
974 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
975 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
977 d = _mm_sub_pd(r11,rswitch);
978 d = _mm_max_pd(d,_mm_setzero_pd());
979 d2 = _mm_mul_pd(d,d);
980 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
982 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
984 /* Evaluate switch function */
985 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
986 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
987 velec = _mm_mul_pd(velec,sw);
988 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
990 /* Update potential sum for this i atom from the interaction with this j atom. */
991 velec = _mm_and_pd(velec,cutoff_mask);
992 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
993 velecsum = _mm_add_pd(velecsum,velec);
995 fscal = felec;
997 fscal = _mm_and_pd(fscal,cutoff_mask);
999 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1001 /* Calculate temporary vectorial force */
1002 tx = _mm_mul_pd(fscal,dx11);
1003 ty = _mm_mul_pd(fscal,dy11);
1004 tz = _mm_mul_pd(fscal,dz11);
1006 /* Update vectorial force */
1007 fix1 = _mm_add_pd(fix1,tx);
1008 fiy1 = _mm_add_pd(fiy1,ty);
1009 fiz1 = _mm_add_pd(fiz1,tz);
1011 fjx1 = _mm_add_pd(fjx1,tx);
1012 fjy1 = _mm_add_pd(fjy1,ty);
1013 fjz1 = _mm_add_pd(fjz1,tz);
1017 /**************************
1018 * CALCULATE INTERACTIONS *
1019 **************************/
1021 if (gmx_mm_any_lt(rsq12,rcutoff2))
1024 r12 = _mm_mul_pd(rsq12,rinv12);
1026 /* EWALD ELECTROSTATICS */
1028 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1029 ewrt = _mm_mul_pd(r12,ewtabscale);
1030 ewitab = _mm_cvttpd_epi32(ewrt);
1031 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1032 ewitab = _mm_slli_epi32(ewitab,2);
1033 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1034 ewtabD = _mm_setzero_pd();
1035 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1036 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1037 ewtabFn = _mm_setzero_pd();
1038 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1039 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1040 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1041 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
1042 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1044 d = _mm_sub_pd(r12,rswitch);
1045 d = _mm_max_pd(d,_mm_setzero_pd());
1046 d2 = _mm_mul_pd(d,d);
1047 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1049 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1051 /* Evaluate switch function */
1052 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1053 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
1054 velec = _mm_mul_pd(velec,sw);
1055 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
1057 /* Update potential sum for this i atom from the interaction with this j atom. */
1058 velec = _mm_and_pd(velec,cutoff_mask);
1059 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1060 velecsum = _mm_add_pd(velecsum,velec);
1062 fscal = felec;
1064 fscal = _mm_and_pd(fscal,cutoff_mask);
1066 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1068 /* Calculate temporary vectorial force */
1069 tx = _mm_mul_pd(fscal,dx12);
1070 ty = _mm_mul_pd(fscal,dy12);
1071 tz = _mm_mul_pd(fscal,dz12);
1073 /* Update vectorial force */
1074 fix1 = _mm_add_pd(fix1,tx);
1075 fiy1 = _mm_add_pd(fiy1,ty);
1076 fiz1 = _mm_add_pd(fiz1,tz);
1078 fjx2 = _mm_add_pd(fjx2,tx);
1079 fjy2 = _mm_add_pd(fjy2,ty);
1080 fjz2 = _mm_add_pd(fjz2,tz);
1084 /**************************
1085 * CALCULATE INTERACTIONS *
1086 **************************/
1088 if (gmx_mm_any_lt(rsq13,rcutoff2))
1091 r13 = _mm_mul_pd(rsq13,rinv13);
1093 /* EWALD ELECTROSTATICS */
1095 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1096 ewrt = _mm_mul_pd(r13,ewtabscale);
1097 ewitab = _mm_cvttpd_epi32(ewrt);
1098 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1099 ewitab = _mm_slli_epi32(ewitab,2);
1100 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1101 ewtabD = _mm_setzero_pd();
1102 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1103 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1104 ewtabFn = _mm_setzero_pd();
1105 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1106 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1107 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1108 velec = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
1109 felec = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
1111 d = _mm_sub_pd(r13,rswitch);
1112 d = _mm_max_pd(d,_mm_setzero_pd());
1113 d2 = _mm_mul_pd(d,d);
1114 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1116 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1118 /* Evaluate switch function */
1119 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1120 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv13,_mm_mul_pd(velec,dsw)) );
1121 velec = _mm_mul_pd(velec,sw);
1122 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
1124 /* Update potential sum for this i atom from the interaction with this j atom. */
1125 velec = _mm_and_pd(velec,cutoff_mask);
1126 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1127 velecsum = _mm_add_pd(velecsum,velec);
1129 fscal = felec;
1131 fscal = _mm_and_pd(fscal,cutoff_mask);
1133 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1135 /* Calculate temporary vectorial force */
1136 tx = _mm_mul_pd(fscal,dx13);
1137 ty = _mm_mul_pd(fscal,dy13);
1138 tz = _mm_mul_pd(fscal,dz13);
1140 /* Update vectorial force */
1141 fix1 = _mm_add_pd(fix1,tx);
1142 fiy1 = _mm_add_pd(fiy1,ty);
1143 fiz1 = _mm_add_pd(fiz1,tz);
1145 fjx3 = _mm_add_pd(fjx3,tx);
1146 fjy3 = _mm_add_pd(fjy3,ty);
1147 fjz3 = _mm_add_pd(fjz3,tz);
1151 /**************************
1152 * CALCULATE INTERACTIONS *
1153 **************************/
1155 if (gmx_mm_any_lt(rsq21,rcutoff2))
1158 r21 = _mm_mul_pd(rsq21,rinv21);
1160 /* EWALD ELECTROSTATICS */
1162 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1163 ewrt = _mm_mul_pd(r21,ewtabscale);
1164 ewitab = _mm_cvttpd_epi32(ewrt);
1165 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1166 ewitab = _mm_slli_epi32(ewitab,2);
1167 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1168 ewtabD = _mm_setzero_pd();
1169 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1170 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1171 ewtabFn = _mm_setzero_pd();
1172 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1173 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1174 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1175 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
1176 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1178 d = _mm_sub_pd(r21,rswitch);
1179 d = _mm_max_pd(d,_mm_setzero_pd());
1180 d2 = _mm_mul_pd(d,d);
1181 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1183 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1185 /* Evaluate switch function */
1186 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1187 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
1188 velec = _mm_mul_pd(velec,sw);
1189 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
1191 /* Update potential sum for this i atom from the interaction with this j atom. */
1192 velec = _mm_and_pd(velec,cutoff_mask);
1193 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1194 velecsum = _mm_add_pd(velecsum,velec);
1196 fscal = felec;
1198 fscal = _mm_and_pd(fscal,cutoff_mask);
1200 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1202 /* Calculate temporary vectorial force */
1203 tx = _mm_mul_pd(fscal,dx21);
1204 ty = _mm_mul_pd(fscal,dy21);
1205 tz = _mm_mul_pd(fscal,dz21);
1207 /* Update vectorial force */
1208 fix2 = _mm_add_pd(fix2,tx);
1209 fiy2 = _mm_add_pd(fiy2,ty);
1210 fiz2 = _mm_add_pd(fiz2,tz);
1212 fjx1 = _mm_add_pd(fjx1,tx);
1213 fjy1 = _mm_add_pd(fjy1,ty);
1214 fjz1 = _mm_add_pd(fjz1,tz);
1218 /**************************
1219 * CALCULATE INTERACTIONS *
1220 **************************/
1222 if (gmx_mm_any_lt(rsq22,rcutoff2))
1225 r22 = _mm_mul_pd(rsq22,rinv22);
1227 /* EWALD ELECTROSTATICS */
1229 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1230 ewrt = _mm_mul_pd(r22,ewtabscale);
1231 ewitab = _mm_cvttpd_epi32(ewrt);
1232 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1233 ewitab = _mm_slli_epi32(ewitab,2);
1234 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1235 ewtabD = _mm_setzero_pd();
1236 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1237 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1238 ewtabFn = _mm_setzero_pd();
1239 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1240 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1241 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1242 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
1243 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1245 d = _mm_sub_pd(r22,rswitch);
1246 d = _mm_max_pd(d,_mm_setzero_pd());
1247 d2 = _mm_mul_pd(d,d);
1248 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1250 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1252 /* Evaluate switch function */
1253 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1254 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
1255 velec = _mm_mul_pd(velec,sw);
1256 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
1258 /* Update potential sum for this i atom from the interaction with this j atom. */
1259 velec = _mm_and_pd(velec,cutoff_mask);
1260 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1261 velecsum = _mm_add_pd(velecsum,velec);
1263 fscal = felec;
1265 fscal = _mm_and_pd(fscal,cutoff_mask);
1267 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1269 /* Calculate temporary vectorial force */
1270 tx = _mm_mul_pd(fscal,dx22);
1271 ty = _mm_mul_pd(fscal,dy22);
1272 tz = _mm_mul_pd(fscal,dz22);
1274 /* Update vectorial force */
1275 fix2 = _mm_add_pd(fix2,tx);
1276 fiy2 = _mm_add_pd(fiy2,ty);
1277 fiz2 = _mm_add_pd(fiz2,tz);
1279 fjx2 = _mm_add_pd(fjx2,tx);
1280 fjy2 = _mm_add_pd(fjy2,ty);
1281 fjz2 = _mm_add_pd(fjz2,tz);
1285 /**************************
1286 * CALCULATE INTERACTIONS *
1287 **************************/
1289 if (gmx_mm_any_lt(rsq23,rcutoff2))
1292 r23 = _mm_mul_pd(rsq23,rinv23);
1294 /* EWALD ELECTROSTATICS */
1296 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1297 ewrt = _mm_mul_pd(r23,ewtabscale);
1298 ewitab = _mm_cvttpd_epi32(ewrt);
1299 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1300 ewitab = _mm_slli_epi32(ewitab,2);
1301 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1302 ewtabD = _mm_setzero_pd();
1303 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1304 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1305 ewtabFn = _mm_setzero_pd();
1306 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1307 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1308 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1309 velec = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
1310 felec = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
1312 d = _mm_sub_pd(r23,rswitch);
1313 d = _mm_max_pd(d,_mm_setzero_pd());
1314 d2 = _mm_mul_pd(d,d);
1315 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1317 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1319 /* Evaluate switch function */
1320 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1321 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv23,_mm_mul_pd(velec,dsw)) );
1322 velec = _mm_mul_pd(velec,sw);
1323 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
1325 /* Update potential sum for this i atom from the interaction with this j atom. */
1326 velec = _mm_and_pd(velec,cutoff_mask);
1327 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1328 velecsum = _mm_add_pd(velecsum,velec);
1330 fscal = felec;
1332 fscal = _mm_and_pd(fscal,cutoff_mask);
1334 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1336 /* Calculate temporary vectorial force */
1337 tx = _mm_mul_pd(fscal,dx23);
1338 ty = _mm_mul_pd(fscal,dy23);
1339 tz = _mm_mul_pd(fscal,dz23);
1341 /* Update vectorial force */
1342 fix2 = _mm_add_pd(fix2,tx);
1343 fiy2 = _mm_add_pd(fiy2,ty);
1344 fiz2 = _mm_add_pd(fiz2,tz);
1346 fjx3 = _mm_add_pd(fjx3,tx);
1347 fjy3 = _mm_add_pd(fjy3,ty);
1348 fjz3 = _mm_add_pd(fjz3,tz);
1352 /**************************
1353 * CALCULATE INTERACTIONS *
1354 **************************/
1356 if (gmx_mm_any_lt(rsq31,rcutoff2))
1359 r31 = _mm_mul_pd(rsq31,rinv31);
1361 /* EWALD ELECTROSTATICS */
1363 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1364 ewrt = _mm_mul_pd(r31,ewtabscale);
1365 ewitab = _mm_cvttpd_epi32(ewrt);
1366 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1367 ewitab = _mm_slli_epi32(ewitab,2);
1368 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1369 ewtabD = _mm_setzero_pd();
1370 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1371 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1372 ewtabFn = _mm_setzero_pd();
1373 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1374 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1375 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1376 velec = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
1377 felec = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
1379 d = _mm_sub_pd(r31,rswitch);
1380 d = _mm_max_pd(d,_mm_setzero_pd());
1381 d2 = _mm_mul_pd(d,d);
1382 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1384 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1386 /* Evaluate switch function */
1387 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1388 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv31,_mm_mul_pd(velec,dsw)) );
1389 velec = _mm_mul_pd(velec,sw);
1390 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
1392 /* Update potential sum for this i atom from the interaction with this j atom. */
1393 velec = _mm_and_pd(velec,cutoff_mask);
1394 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1395 velecsum = _mm_add_pd(velecsum,velec);
1397 fscal = felec;
1399 fscal = _mm_and_pd(fscal,cutoff_mask);
1401 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1403 /* Calculate temporary vectorial force */
1404 tx = _mm_mul_pd(fscal,dx31);
1405 ty = _mm_mul_pd(fscal,dy31);
1406 tz = _mm_mul_pd(fscal,dz31);
1408 /* Update vectorial force */
1409 fix3 = _mm_add_pd(fix3,tx);
1410 fiy3 = _mm_add_pd(fiy3,ty);
1411 fiz3 = _mm_add_pd(fiz3,tz);
1413 fjx1 = _mm_add_pd(fjx1,tx);
1414 fjy1 = _mm_add_pd(fjy1,ty);
1415 fjz1 = _mm_add_pd(fjz1,tz);
1419 /**************************
1420 * CALCULATE INTERACTIONS *
1421 **************************/
1423 if (gmx_mm_any_lt(rsq32,rcutoff2))
1426 r32 = _mm_mul_pd(rsq32,rinv32);
1428 /* EWALD ELECTROSTATICS */
1430 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1431 ewrt = _mm_mul_pd(r32,ewtabscale);
1432 ewitab = _mm_cvttpd_epi32(ewrt);
1433 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1434 ewitab = _mm_slli_epi32(ewitab,2);
1435 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1436 ewtabD = _mm_setzero_pd();
1437 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1438 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1439 ewtabFn = _mm_setzero_pd();
1440 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1441 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1442 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1443 velec = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
1444 felec = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
1446 d = _mm_sub_pd(r32,rswitch);
1447 d = _mm_max_pd(d,_mm_setzero_pd());
1448 d2 = _mm_mul_pd(d,d);
1449 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1451 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1453 /* Evaluate switch function */
1454 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1455 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv32,_mm_mul_pd(velec,dsw)) );
1456 velec = _mm_mul_pd(velec,sw);
1457 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
1459 /* Update potential sum for this i atom from the interaction with this j atom. */
1460 velec = _mm_and_pd(velec,cutoff_mask);
1461 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1462 velecsum = _mm_add_pd(velecsum,velec);
1464 fscal = felec;
1466 fscal = _mm_and_pd(fscal,cutoff_mask);
1468 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1470 /* Calculate temporary vectorial force */
1471 tx = _mm_mul_pd(fscal,dx32);
1472 ty = _mm_mul_pd(fscal,dy32);
1473 tz = _mm_mul_pd(fscal,dz32);
1475 /* Update vectorial force */
1476 fix3 = _mm_add_pd(fix3,tx);
1477 fiy3 = _mm_add_pd(fiy3,ty);
1478 fiz3 = _mm_add_pd(fiz3,tz);
1480 fjx2 = _mm_add_pd(fjx2,tx);
1481 fjy2 = _mm_add_pd(fjy2,ty);
1482 fjz2 = _mm_add_pd(fjz2,tz);
1486 /**************************
1487 * CALCULATE INTERACTIONS *
1488 **************************/
1490 if (gmx_mm_any_lt(rsq33,rcutoff2))
1493 r33 = _mm_mul_pd(rsq33,rinv33);
1495 /* EWALD ELECTROSTATICS */
1497 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1498 ewrt = _mm_mul_pd(r33,ewtabscale);
1499 ewitab = _mm_cvttpd_epi32(ewrt);
1500 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1501 ewitab = _mm_slli_epi32(ewitab,2);
1502 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1503 ewtabD = _mm_setzero_pd();
1504 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1505 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1506 ewtabFn = _mm_setzero_pd();
1507 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1508 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1509 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1510 velec = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
1511 felec = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
1513 d = _mm_sub_pd(r33,rswitch);
1514 d = _mm_max_pd(d,_mm_setzero_pd());
1515 d2 = _mm_mul_pd(d,d);
1516 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1518 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1520 /* Evaluate switch function */
1521 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1522 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv33,_mm_mul_pd(velec,dsw)) );
1523 velec = _mm_mul_pd(velec,sw);
1524 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
1526 /* Update potential sum for this i atom from the interaction with this j atom. */
1527 velec = _mm_and_pd(velec,cutoff_mask);
1528 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1529 velecsum = _mm_add_pd(velecsum,velec);
1531 fscal = felec;
1533 fscal = _mm_and_pd(fscal,cutoff_mask);
1535 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1537 /* Calculate temporary vectorial force */
1538 tx = _mm_mul_pd(fscal,dx33);
1539 ty = _mm_mul_pd(fscal,dy33);
1540 tz = _mm_mul_pd(fscal,dz33);
1542 /* Update vectorial force */
1543 fix3 = _mm_add_pd(fix3,tx);
1544 fiy3 = _mm_add_pd(fiy3,ty);
1545 fiz3 = _mm_add_pd(fiz3,tz);
1547 fjx3 = _mm_add_pd(fjx3,tx);
1548 fjy3 = _mm_add_pd(fjy3,ty);
1549 fjz3 = _mm_add_pd(fjz3,tz);
1553 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1555 /* Inner loop uses 585 flops */
1558 /* End of innermost loop */
1560 gmx_mm_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1561 f+i_coord_offset+DIM,fshift+i_shift_offset);
1563 ggid = gid[iidx];
1564 /* Update potential energies */
1565 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1567 /* Increment number of inner iterations */
1568 inneriter += j_index_end - j_index_start;
1570 /* Outer loop uses 19 flops */
1573 /* Increment number of outer iterations */
1574 outeriter += nri;
1576 /* Update outer/inner flops */
1578 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*585);
1581 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sse2_double
1582 * Electrostatics interaction: Ewald
1583 * VdW interaction: None
1584 * Geometry: Water4-Water4
1585 * Calculate force/pot: Force
1587 void
1588 nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sse2_double
1589 (t_nblist * gmx_restrict nlist,
1590 rvec * gmx_restrict xx,
1591 rvec * gmx_restrict ff,
1592 struct t_forcerec * gmx_restrict fr,
1593 t_mdatoms * gmx_restrict mdatoms,
1594 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1595 t_nrnb * gmx_restrict nrnb)
1597 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1598 * just 0 for non-waters.
1599 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1600 * jnr indices corresponding to data put in the four positions in the SIMD register.
1602 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1603 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1604 int jnrA,jnrB;
1605 int j_coord_offsetA,j_coord_offsetB;
1606 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1607 real rcutoff_scalar;
1608 real *shiftvec,*fshift,*x,*f;
1609 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1610 int vdwioffset1;
1611 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1612 int vdwioffset2;
1613 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1614 int vdwioffset3;
1615 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1616 int vdwjidx1A,vdwjidx1B;
1617 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1618 int vdwjidx2A,vdwjidx2B;
1619 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1620 int vdwjidx3A,vdwjidx3B;
1621 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1622 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1623 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1624 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1625 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1626 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1627 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1628 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1629 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1630 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1631 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1632 real *charge;
1633 __m128i ewitab;
1634 __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1635 real *ewtab;
1636 __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1637 real rswitch_scalar,d_scalar;
1638 __m128d dummy_mask,cutoff_mask;
1639 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1640 __m128d one = _mm_set1_pd(1.0);
1641 __m128d two = _mm_set1_pd(2.0);
1642 x = xx[0];
1643 f = ff[0];
1645 nri = nlist->nri;
1646 iinr = nlist->iinr;
1647 jindex = nlist->jindex;
1648 jjnr = nlist->jjnr;
1649 shiftidx = nlist->shift;
1650 gid = nlist->gid;
1651 shiftvec = fr->shift_vec[0];
1652 fshift = fr->fshift[0];
1653 facel = _mm_set1_pd(fr->ic->epsfac);
1654 charge = mdatoms->chargeA;
1656 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
1657 ewtab = fr->ic->tabq_coul_FDV0;
1658 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
1659 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
1661 /* Setup water-specific parameters */
1662 inr = nlist->iinr[0];
1663 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1664 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1665 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
1667 jq1 = _mm_set1_pd(charge[inr+1]);
1668 jq2 = _mm_set1_pd(charge[inr+2]);
1669 jq3 = _mm_set1_pd(charge[inr+3]);
1670 qq11 = _mm_mul_pd(iq1,jq1);
1671 qq12 = _mm_mul_pd(iq1,jq2);
1672 qq13 = _mm_mul_pd(iq1,jq3);
1673 qq21 = _mm_mul_pd(iq2,jq1);
1674 qq22 = _mm_mul_pd(iq2,jq2);
1675 qq23 = _mm_mul_pd(iq2,jq3);
1676 qq31 = _mm_mul_pd(iq3,jq1);
1677 qq32 = _mm_mul_pd(iq3,jq2);
1678 qq33 = _mm_mul_pd(iq3,jq3);
1680 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1681 rcutoff_scalar = fr->ic->rcoulomb;
1682 rcutoff = _mm_set1_pd(rcutoff_scalar);
1683 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
1685 rswitch_scalar = fr->ic->rcoulomb_switch;
1686 rswitch = _mm_set1_pd(rswitch_scalar);
1687 /* Setup switch parameters */
1688 d_scalar = rcutoff_scalar-rswitch_scalar;
1689 d = _mm_set1_pd(d_scalar);
1690 swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
1691 swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1692 swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1693 swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
1694 swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1695 swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1697 /* Avoid stupid compiler warnings */
1698 jnrA = jnrB = 0;
1699 j_coord_offsetA = 0;
1700 j_coord_offsetB = 0;
1702 outeriter = 0;
1703 inneriter = 0;
1705 /* Start outer loop over neighborlists */
1706 for(iidx=0; iidx<nri; iidx++)
1708 /* Load shift vector for this list */
1709 i_shift_offset = DIM*shiftidx[iidx];
1711 /* Load limits for loop over neighbors */
1712 j_index_start = jindex[iidx];
1713 j_index_end = jindex[iidx+1];
1715 /* Get outer coordinate index */
1716 inr = iinr[iidx];
1717 i_coord_offset = DIM*inr;
1719 /* Load i particle coords and add shift vector */
1720 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
1721 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1723 fix1 = _mm_setzero_pd();
1724 fiy1 = _mm_setzero_pd();
1725 fiz1 = _mm_setzero_pd();
1726 fix2 = _mm_setzero_pd();
1727 fiy2 = _mm_setzero_pd();
1728 fiz2 = _mm_setzero_pd();
1729 fix3 = _mm_setzero_pd();
1730 fiy3 = _mm_setzero_pd();
1731 fiz3 = _mm_setzero_pd();
1733 /* Start inner kernel loop */
1734 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1737 /* Get j neighbor index, and coordinate index */
1738 jnrA = jjnr[jidx];
1739 jnrB = jjnr[jidx+1];
1740 j_coord_offsetA = DIM*jnrA;
1741 j_coord_offsetB = DIM*jnrB;
1743 /* load j atom coordinates */
1744 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1745 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1747 /* Calculate displacement vector */
1748 dx11 = _mm_sub_pd(ix1,jx1);
1749 dy11 = _mm_sub_pd(iy1,jy1);
1750 dz11 = _mm_sub_pd(iz1,jz1);
1751 dx12 = _mm_sub_pd(ix1,jx2);
1752 dy12 = _mm_sub_pd(iy1,jy2);
1753 dz12 = _mm_sub_pd(iz1,jz2);
1754 dx13 = _mm_sub_pd(ix1,jx3);
1755 dy13 = _mm_sub_pd(iy1,jy3);
1756 dz13 = _mm_sub_pd(iz1,jz3);
1757 dx21 = _mm_sub_pd(ix2,jx1);
1758 dy21 = _mm_sub_pd(iy2,jy1);
1759 dz21 = _mm_sub_pd(iz2,jz1);
1760 dx22 = _mm_sub_pd(ix2,jx2);
1761 dy22 = _mm_sub_pd(iy2,jy2);
1762 dz22 = _mm_sub_pd(iz2,jz2);
1763 dx23 = _mm_sub_pd(ix2,jx3);
1764 dy23 = _mm_sub_pd(iy2,jy3);
1765 dz23 = _mm_sub_pd(iz2,jz3);
1766 dx31 = _mm_sub_pd(ix3,jx1);
1767 dy31 = _mm_sub_pd(iy3,jy1);
1768 dz31 = _mm_sub_pd(iz3,jz1);
1769 dx32 = _mm_sub_pd(ix3,jx2);
1770 dy32 = _mm_sub_pd(iy3,jy2);
1771 dz32 = _mm_sub_pd(iz3,jz2);
1772 dx33 = _mm_sub_pd(ix3,jx3);
1773 dy33 = _mm_sub_pd(iy3,jy3);
1774 dz33 = _mm_sub_pd(iz3,jz3);
1776 /* Calculate squared distance and things based on it */
1777 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1778 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1779 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1780 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1781 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1782 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1783 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1784 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1785 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1787 rinv11 = sse2_invsqrt_d(rsq11);
1788 rinv12 = sse2_invsqrt_d(rsq12);
1789 rinv13 = sse2_invsqrt_d(rsq13);
1790 rinv21 = sse2_invsqrt_d(rsq21);
1791 rinv22 = sse2_invsqrt_d(rsq22);
1792 rinv23 = sse2_invsqrt_d(rsq23);
1793 rinv31 = sse2_invsqrt_d(rsq31);
1794 rinv32 = sse2_invsqrt_d(rsq32);
1795 rinv33 = sse2_invsqrt_d(rsq33);
1797 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1798 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1799 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
1800 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1801 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1802 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
1803 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
1804 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
1805 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
1807 fjx1 = _mm_setzero_pd();
1808 fjy1 = _mm_setzero_pd();
1809 fjz1 = _mm_setzero_pd();
1810 fjx2 = _mm_setzero_pd();
1811 fjy2 = _mm_setzero_pd();
1812 fjz2 = _mm_setzero_pd();
1813 fjx3 = _mm_setzero_pd();
1814 fjy3 = _mm_setzero_pd();
1815 fjz3 = _mm_setzero_pd();
1817 /**************************
1818 * CALCULATE INTERACTIONS *
1819 **************************/
1821 if (gmx_mm_any_lt(rsq11,rcutoff2))
1824 r11 = _mm_mul_pd(rsq11,rinv11);
1826 /* EWALD ELECTROSTATICS */
1828 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1829 ewrt = _mm_mul_pd(r11,ewtabscale);
1830 ewitab = _mm_cvttpd_epi32(ewrt);
1831 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1832 ewitab = _mm_slli_epi32(ewitab,2);
1833 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1834 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1835 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1836 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1837 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
1838 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1839 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1840 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1841 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
1842 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1844 d = _mm_sub_pd(r11,rswitch);
1845 d = _mm_max_pd(d,_mm_setzero_pd());
1846 d2 = _mm_mul_pd(d,d);
1847 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1849 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1851 /* Evaluate switch function */
1852 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1853 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
1854 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
1856 fscal = felec;
1858 fscal = _mm_and_pd(fscal,cutoff_mask);
1860 /* Calculate temporary vectorial force */
1861 tx = _mm_mul_pd(fscal,dx11);
1862 ty = _mm_mul_pd(fscal,dy11);
1863 tz = _mm_mul_pd(fscal,dz11);
1865 /* Update vectorial force */
1866 fix1 = _mm_add_pd(fix1,tx);
1867 fiy1 = _mm_add_pd(fiy1,ty);
1868 fiz1 = _mm_add_pd(fiz1,tz);
1870 fjx1 = _mm_add_pd(fjx1,tx);
1871 fjy1 = _mm_add_pd(fjy1,ty);
1872 fjz1 = _mm_add_pd(fjz1,tz);
1876 /**************************
1877 * CALCULATE INTERACTIONS *
1878 **************************/
1880 if (gmx_mm_any_lt(rsq12,rcutoff2))
1883 r12 = _mm_mul_pd(rsq12,rinv12);
1885 /* EWALD ELECTROSTATICS */
1887 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1888 ewrt = _mm_mul_pd(r12,ewtabscale);
1889 ewitab = _mm_cvttpd_epi32(ewrt);
1890 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1891 ewitab = _mm_slli_epi32(ewitab,2);
1892 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1893 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1894 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1895 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1896 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
1897 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1898 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1899 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1900 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
1901 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1903 d = _mm_sub_pd(r12,rswitch);
1904 d = _mm_max_pd(d,_mm_setzero_pd());
1905 d2 = _mm_mul_pd(d,d);
1906 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1908 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1910 /* Evaluate switch function */
1911 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1912 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
1913 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
1915 fscal = felec;
1917 fscal = _mm_and_pd(fscal,cutoff_mask);
1919 /* Calculate temporary vectorial force */
1920 tx = _mm_mul_pd(fscal,dx12);
1921 ty = _mm_mul_pd(fscal,dy12);
1922 tz = _mm_mul_pd(fscal,dz12);
1924 /* Update vectorial force */
1925 fix1 = _mm_add_pd(fix1,tx);
1926 fiy1 = _mm_add_pd(fiy1,ty);
1927 fiz1 = _mm_add_pd(fiz1,tz);
1929 fjx2 = _mm_add_pd(fjx2,tx);
1930 fjy2 = _mm_add_pd(fjy2,ty);
1931 fjz2 = _mm_add_pd(fjz2,tz);
1935 /**************************
1936 * CALCULATE INTERACTIONS *
1937 **************************/
1939 if (gmx_mm_any_lt(rsq13,rcutoff2))
1942 r13 = _mm_mul_pd(rsq13,rinv13);
1944 /* EWALD ELECTROSTATICS */
1946 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1947 ewrt = _mm_mul_pd(r13,ewtabscale);
1948 ewitab = _mm_cvttpd_epi32(ewrt);
1949 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1950 ewitab = _mm_slli_epi32(ewitab,2);
1951 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1952 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1953 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1954 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1955 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
1956 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1957 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1958 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1959 velec = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
1960 felec = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
1962 d = _mm_sub_pd(r13,rswitch);
1963 d = _mm_max_pd(d,_mm_setzero_pd());
1964 d2 = _mm_mul_pd(d,d);
1965 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1967 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1969 /* Evaluate switch function */
1970 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1971 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv13,_mm_mul_pd(velec,dsw)) );
1972 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
1974 fscal = felec;
1976 fscal = _mm_and_pd(fscal,cutoff_mask);
1978 /* Calculate temporary vectorial force */
1979 tx = _mm_mul_pd(fscal,dx13);
1980 ty = _mm_mul_pd(fscal,dy13);
1981 tz = _mm_mul_pd(fscal,dz13);
1983 /* Update vectorial force */
1984 fix1 = _mm_add_pd(fix1,tx);
1985 fiy1 = _mm_add_pd(fiy1,ty);
1986 fiz1 = _mm_add_pd(fiz1,tz);
1988 fjx3 = _mm_add_pd(fjx3,tx);
1989 fjy3 = _mm_add_pd(fjy3,ty);
1990 fjz3 = _mm_add_pd(fjz3,tz);
1994 /**************************
1995 * CALCULATE INTERACTIONS *
1996 **************************/
1998 if (gmx_mm_any_lt(rsq21,rcutoff2))
2001 r21 = _mm_mul_pd(rsq21,rinv21);
2003 /* EWALD ELECTROSTATICS */
2005 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2006 ewrt = _mm_mul_pd(r21,ewtabscale);
2007 ewitab = _mm_cvttpd_epi32(ewrt);
2008 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2009 ewitab = _mm_slli_epi32(ewitab,2);
2010 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2011 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2012 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2013 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2014 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2015 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2016 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2017 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2018 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
2019 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2021 d = _mm_sub_pd(r21,rswitch);
2022 d = _mm_max_pd(d,_mm_setzero_pd());
2023 d2 = _mm_mul_pd(d,d);
2024 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2026 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2028 /* Evaluate switch function */
2029 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2030 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
2031 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
2033 fscal = felec;
2035 fscal = _mm_and_pd(fscal,cutoff_mask);
2037 /* Calculate temporary vectorial force */
2038 tx = _mm_mul_pd(fscal,dx21);
2039 ty = _mm_mul_pd(fscal,dy21);
2040 tz = _mm_mul_pd(fscal,dz21);
2042 /* Update vectorial force */
2043 fix2 = _mm_add_pd(fix2,tx);
2044 fiy2 = _mm_add_pd(fiy2,ty);
2045 fiz2 = _mm_add_pd(fiz2,tz);
2047 fjx1 = _mm_add_pd(fjx1,tx);
2048 fjy1 = _mm_add_pd(fjy1,ty);
2049 fjz1 = _mm_add_pd(fjz1,tz);
2053 /**************************
2054 * CALCULATE INTERACTIONS *
2055 **************************/
2057 if (gmx_mm_any_lt(rsq22,rcutoff2))
2060 r22 = _mm_mul_pd(rsq22,rinv22);
2062 /* EWALD ELECTROSTATICS */
2064 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2065 ewrt = _mm_mul_pd(r22,ewtabscale);
2066 ewitab = _mm_cvttpd_epi32(ewrt);
2067 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2068 ewitab = _mm_slli_epi32(ewitab,2);
2069 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2070 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2071 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2072 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2073 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2074 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2075 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2076 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2077 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
2078 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2080 d = _mm_sub_pd(r22,rswitch);
2081 d = _mm_max_pd(d,_mm_setzero_pd());
2082 d2 = _mm_mul_pd(d,d);
2083 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2085 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2087 /* Evaluate switch function */
2088 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2089 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
2090 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
2092 fscal = felec;
2094 fscal = _mm_and_pd(fscal,cutoff_mask);
2096 /* Calculate temporary vectorial force */
2097 tx = _mm_mul_pd(fscal,dx22);
2098 ty = _mm_mul_pd(fscal,dy22);
2099 tz = _mm_mul_pd(fscal,dz22);
2101 /* Update vectorial force */
2102 fix2 = _mm_add_pd(fix2,tx);
2103 fiy2 = _mm_add_pd(fiy2,ty);
2104 fiz2 = _mm_add_pd(fiz2,tz);
2106 fjx2 = _mm_add_pd(fjx2,tx);
2107 fjy2 = _mm_add_pd(fjy2,ty);
2108 fjz2 = _mm_add_pd(fjz2,tz);
2112 /**************************
2113 * CALCULATE INTERACTIONS *
2114 **************************/
2116 if (gmx_mm_any_lt(rsq23,rcutoff2))
2119 r23 = _mm_mul_pd(rsq23,rinv23);
2121 /* EWALD ELECTROSTATICS */
2123 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2124 ewrt = _mm_mul_pd(r23,ewtabscale);
2125 ewitab = _mm_cvttpd_epi32(ewrt);
2126 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2127 ewitab = _mm_slli_epi32(ewitab,2);
2128 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2129 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2130 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2131 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2132 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2133 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2134 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2135 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2136 velec = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
2137 felec = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
2139 d = _mm_sub_pd(r23,rswitch);
2140 d = _mm_max_pd(d,_mm_setzero_pd());
2141 d2 = _mm_mul_pd(d,d);
2142 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2144 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2146 /* Evaluate switch function */
2147 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2148 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv23,_mm_mul_pd(velec,dsw)) );
2149 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
2151 fscal = felec;
2153 fscal = _mm_and_pd(fscal,cutoff_mask);
2155 /* Calculate temporary vectorial force */
2156 tx = _mm_mul_pd(fscal,dx23);
2157 ty = _mm_mul_pd(fscal,dy23);
2158 tz = _mm_mul_pd(fscal,dz23);
2160 /* Update vectorial force */
2161 fix2 = _mm_add_pd(fix2,tx);
2162 fiy2 = _mm_add_pd(fiy2,ty);
2163 fiz2 = _mm_add_pd(fiz2,tz);
2165 fjx3 = _mm_add_pd(fjx3,tx);
2166 fjy3 = _mm_add_pd(fjy3,ty);
2167 fjz3 = _mm_add_pd(fjz3,tz);
2171 /**************************
2172 * CALCULATE INTERACTIONS *
2173 **************************/
2175 if (gmx_mm_any_lt(rsq31,rcutoff2))
2178 r31 = _mm_mul_pd(rsq31,rinv31);
2180 /* EWALD ELECTROSTATICS */
2182 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2183 ewrt = _mm_mul_pd(r31,ewtabscale);
2184 ewitab = _mm_cvttpd_epi32(ewrt);
2185 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2186 ewitab = _mm_slli_epi32(ewitab,2);
2187 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2188 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2189 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2190 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2191 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2192 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2193 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2194 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2195 velec = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
2196 felec = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
2198 d = _mm_sub_pd(r31,rswitch);
2199 d = _mm_max_pd(d,_mm_setzero_pd());
2200 d2 = _mm_mul_pd(d,d);
2201 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2203 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2205 /* Evaluate switch function */
2206 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2207 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv31,_mm_mul_pd(velec,dsw)) );
2208 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
2210 fscal = felec;
2212 fscal = _mm_and_pd(fscal,cutoff_mask);
2214 /* Calculate temporary vectorial force */
2215 tx = _mm_mul_pd(fscal,dx31);
2216 ty = _mm_mul_pd(fscal,dy31);
2217 tz = _mm_mul_pd(fscal,dz31);
2219 /* Update vectorial force */
2220 fix3 = _mm_add_pd(fix3,tx);
2221 fiy3 = _mm_add_pd(fiy3,ty);
2222 fiz3 = _mm_add_pd(fiz3,tz);
2224 fjx1 = _mm_add_pd(fjx1,tx);
2225 fjy1 = _mm_add_pd(fjy1,ty);
2226 fjz1 = _mm_add_pd(fjz1,tz);
2230 /**************************
2231 * CALCULATE INTERACTIONS *
2232 **************************/
2234 if (gmx_mm_any_lt(rsq32,rcutoff2))
2237 r32 = _mm_mul_pd(rsq32,rinv32);
2239 /* EWALD ELECTROSTATICS */
2241 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2242 ewrt = _mm_mul_pd(r32,ewtabscale);
2243 ewitab = _mm_cvttpd_epi32(ewrt);
2244 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2245 ewitab = _mm_slli_epi32(ewitab,2);
2246 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2247 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2248 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2249 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2250 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2251 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2252 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2253 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2254 velec = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
2255 felec = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
2257 d = _mm_sub_pd(r32,rswitch);
2258 d = _mm_max_pd(d,_mm_setzero_pd());
2259 d2 = _mm_mul_pd(d,d);
2260 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2262 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2264 /* Evaluate switch function */
2265 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2266 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv32,_mm_mul_pd(velec,dsw)) );
2267 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
2269 fscal = felec;
2271 fscal = _mm_and_pd(fscal,cutoff_mask);
2273 /* Calculate temporary vectorial force */
2274 tx = _mm_mul_pd(fscal,dx32);
2275 ty = _mm_mul_pd(fscal,dy32);
2276 tz = _mm_mul_pd(fscal,dz32);
2278 /* Update vectorial force */
2279 fix3 = _mm_add_pd(fix3,tx);
2280 fiy3 = _mm_add_pd(fiy3,ty);
2281 fiz3 = _mm_add_pd(fiz3,tz);
2283 fjx2 = _mm_add_pd(fjx2,tx);
2284 fjy2 = _mm_add_pd(fjy2,ty);
2285 fjz2 = _mm_add_pd(fjz2,tz);
2289 /**************************
2290 * CALCULATE INTERACTIONS *
2291 **************************/
2293 if (gmx_mm_any_lt(rsq33,rcutoff2))
2296 r33 = _mm_mul_pd(rsq33,rinv33);
2298 /* EWALD ELECTROSTATICS */
2300 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2301 ewrt = _mm_mul_pd(r33,ewtabscale);
2302 ewitab = _mm_cvttpd_epi32(ewrt);
2303 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2304 ewitab = _mm_slli_epi32(ewitab,2);
2305 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2306 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2307 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2308 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2309 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2310 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2311 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2312 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2313 velec = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
2314 felec = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
2316 d = _mm_sub_pd(r33,rswitch);
2317 d = _mm_max_pd(d,_mm_setzero_pd());
2318 d2 = _mm_mul_pd(d,d);
2319 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2321 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2323 /* Evaluate switch function */
2324 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2325 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv33,_mm_mul_pd(velec,dsw)) );
2326 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
2328 fscal = felec;
2330 fscal = _mm_and_pd(fscal,cutoff_mask);
2332 /* Calculate temporary vectorial force */
2333 tx = _mm_mul_pd(fscal,dx33);
2334 ty = _mm_mul_pd(fscal,dy33);
2335 tz = _mm_mul_pd(fscal,dz33);
2337 /* Update vectorial force */
2338 fix3 = _mm_add_pd(fix3,tx);
2339 fiy3 = _mm_add_pd(fiy3,ty);
2340 fiz3 = _mm_add_pd(fiz3,tz);
2342 fjx3 = _mm_add_pd(fjx3,tx);
2343 fjy3 = _mm_add_pd(fjy3,ty);
2344 fjz3 = _mm_add_pd(fjz3,tz);
2348 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2350 /* Inner loop uses 558 flops */
2353 if(jidx<j_index_end)
2356 jnrA = jjnr[jidx];
2357 j_coord_offsetA = DIM*jnrA;
2359 /* load j atom coordinates */
2360 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA+DIM,
2361 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
2363 /* Calculate displacement vector */
2364 dx11 = _mm_sub_pd(ix1,jx1);
2365 dy11 = _mm_sub_pd(iy1,jy1);
2366 dz11 = _mm_sub_pd(iz1,jz1);
2367 dx12 = _mm_sub_pd(ix1,jx2);
2368 dy12 = _mm_sub_pd(iy1,jy2);
2369 dz12 = _mm_sub_pd(iz1,jz2);
2370 dx13 = _mm_sub_pd(ix1,jx3);
2371 dy13 = _mm_sub_pd(iy1,jy3);
2372 dz13 = _mm_sub_pd(iz1,jz3);
2373 dx21 = _mm_sub_pd(ix2,jx1);
2374 dy21 = _mm_sub_pd(iy2,jy1);
2375 dz21 = _mm_sub_pd(iz2,jz1);
2376 dx22 = _mm_sub_pd(ix2,jx2);
2377 dy22 = _mm_sub_pd(iy2,jy2);
2378 dz22 = _mm_sub_pd(iz2,jz2);
2379 dx23 = _mm_sub_pd(ix2,jx3);
2380 dy23 = _mm_sub_pd(iy2,jy3);
2381 dz23 = _mm_sub_pd(iz2,jz3);
2382 dx31 = _mm_sub_pd(ix3,jx1);
2383 dy31 = _mm_sub_pd(iy3,jy1);
2384 dz31 = _mm_sub_pd(iz3,jz1);
2385 dx32 = _mm_sub_pd(ix3,jx2);
2386 dy32 = _mm_sub_pd(iy3,jy2);
2387 dz32 = _mm_sub_pd(iz3,jz2);
2388 dx33 = _mm_sub_pd(ix3,jx3);
2389 dy33 = _mm_sub_pd(iy3,jy3);
2390 dz33 = _mm_sub_pd(iz3,jz3);
2392 /* Calculate squared distance and things based on it */
2393 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
2394 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
2395 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
2396 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
2397 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
2398 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
2399 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
2400 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
2401 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
2403 rinv11 = sse2_invsqrt_d(rsq11);
2404 rinv12 = sse2_invsqrt_d(rsq12);
2405 rinv13 = sse2_invsqrt_d(rsq13);
2406 rinv21 = sse2_invsqrt_d(rsq21);
2407 rinv22 = sse2_invsqrt_d(rsq22);
2408 rinv23 = sse2_invsqrt_d(rsq23);
2409 rinv31 = sse2_invsqrt_d(rsq31);
2410 rinv32 = sse2_invsqrt_d(rsq32);
2411 rinv33 = sse2_invsqrt_d(rsq33);
2413 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
2414 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
2415 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
2416 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
2417 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
2418 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
2419 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
2420 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
2421 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
2423 fjx1 = _mm_setzero_pd();
2424 fjy1 = _mm_setzero_pd();
2425 fjz1 = _mm_setzero_pd();
2426 fjx2 = _mm_setzero_pd();
2427 fjy2 = _mm_setzero_pd();
2428 fjz2 = _mm_setzero_pd();
2429 fjx3 = _mm_setzero_pd();
2430 fjy3 = _mm_setzero_pd();
2431 fjz3 = _mm_setzero_pd();
2433 /**************************
2434 * CALCULATE INTERACTIONS *
2435 **************************/
2437 if (gmx_mm_any_lt(rsq11,rcutoff2))
2440 r11 = _mm_mul_pd(rsq11,rinv11);
2442 /* EWALD ELECTROSTATICS */
2444 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2445 ewrt = _mm_mul_pd(r11,ewtabscale);
2446 ewitab = _mm_cvttpd_epi32(ewrt);
2447 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2448 ewitab = _mm_slli_epi32(ewitab,2);
2449 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2450 ewtabD = _mm_setzero_pd();
2451 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2452 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2453 ewtabFn = _mm_setzero_pd();
2454 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2455 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2456 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2457 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
2458 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2460 d = _mm_sub_pd(r11,rswitch);
2461 d = _mm_max_pd(d,_mm_setzero_pd());
2462 d2 = _mm_mul_pd(d,d);
2463 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2465 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2467 /* Evaluate switch function */
2468 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2469 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
2470 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
2472 fscal = felec;
2474 fscal = _mm_and_pd(fscal,cutoff_mask);
2476 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2478 /* Calculate temporary vectorial force */
2479 tx = _mm_mul_pd(fscal,dx11);
2480 ty = _mm_mul_pd(fscal,dy11);
2481 tz = _mm_mul_pd(fscal,dz11);
2483 /* Update vectorial force */
2484 fix1 = _mm_add_pd(fix1,tx);
2485 fiy1 = _mm_add_pd(fiy1,ty);
2486 fiz1 = _mm_add_pd(fiz1,tz);
2488 fjx1 = _mm_add_pd(fjx1,tx);
2489 fjy1 = _mm_add_pd(fjy1,ty);
2490 fjz1 = _mm_add_pd(fjz1,tz);
2494 /**************************
2495 * CALCULATE INTERACTIONS *
2496 **************************/
2498 if (gmx_mm_any_lt(rsq12,rcutoff2))
2501 r12 = _mm_mul_pd(rsq12,rinv12);
2503 /* EWALD ELECTROSTATICS */
2505 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2506 ewrt = _mm_mul_pd(r12,ewtabscale);
2507 ewitab = _mm_cvttpd_epi32(ewrt);
2508 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2509 ewitab = _mm_slli_epi32(ewitab,2);
2510 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2511 ewtabD = _mm_setzero_pd();
2512 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2513 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2514 ewtabFn = _mm_setzero_pd();
2515 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2516 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2517 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2518 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
2519 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2521 d = _mm_sub_pd(r12,rswitch);
2522 d = _mm_max_pd(d,_mm_setzero_pd());
2523 d2 = _mm_mul_pd(d,d);
2524 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2526 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2528 /* Evaluate switch function */
2529 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2530 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
2531 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
2533 fscal = felec;
2535 fscal = _mm_and_pd(fscal,cutoff_mask);
2537 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2539 /* Calculate temporary vectorial force */
2540 tx = _mm_mul_pd(fscal,dx12);
2541 ty = _mm_mul_pd(fscal,dy12);
2542 tz = _mm_mul_pd(fscal,dz12);
2544 /* Update vectorial force */
2545 fix1 = _mm_add_pd(fix1,tx);
2546 fiy1 = _mm_add_pd(fiy1,ty);
2547 fiz1 = _mm_add_pd(fiz1,tz);
2549 fjx2 = _mm_add_pd(fjx2,tx);
2550 fjy2 = _mm_add_pd(fjy2,ty);
2551 fjz2 = _mm_add_pd(fjz2,tz);
2555 /**************************
2556 * CALCULATE INTERACTIONS *
2557 **************************/
2559 if (gmx_mm_any_lt(rsq13,rcutoff2))
2562 r13 = _mm_mul_pd(rsq13,rinv13);
2564 /* EWALD ELECTROSTATICS */
2566 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2567 ewrt = _mm_mul_pd(r13,ewtabscale);
2568 ewitab = _mm_cvttpd_epi32(ewrt);
2569 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2570 ewitab = _mm_slli_epi32(ewitab,2);
2571 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2572 ewtabD = _mm_setzero_pd();
2573 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2574 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2575 ewtabFn = _mm_setzero_pd();
2576 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2577 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2578 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2579 velec = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
2580 felec = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
2582 d = _mm_sub_pd(r13,rswitch);
2583 d = _mm_max_pd(d,_mm_setzero_pd());
2584 d2 = _mm_mul_pd(d,d);
2585 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2587 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2589 /* Evaluate switch function */
2590 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2591 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv13,_mm_mul_pd(velec,dsw)) );
2592 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
2594 fscal = felec;
2596 fscal = _mm_and_pd(fscal,cutoff_mask);
2598 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2600 /* Calculate temporary vectorial force */
2601 tx = _mm_mul_pd(fscal,dx13);
2602 ty = _mm_mul_pd(fscal,dy13);
2603 tz = _mm_mul_pd(fscal,dz13);
2605 /* Update vectorial force */
2606 fix1 = _mm_add_pd(fix1,tx);
2607 fiy1 = _mm_add_pd(fiy1,ty);
2608 fiz1 = _mm_add_pd(fiz1,tz);
2610 fjx3 = _mm_add_pd(fjx3,tx);
2611 fjy3 = _mm_add_pd(fjy3,ty);
2612 fjz3 = _mm_add_pd(fjz3,tz);
2616 /**************************
2617 * CALCULATE INTERACTIONS *
2618 **************************/
2620 if (gmx_mm_any_lt(rsq21,rcutoff2))
2623 r21 = _mm_mul_pd(rsq21,rinv21);
2625 /* EWALD ELECTROSTATICS */
2627 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2628 ewrt = _mm_mul_pd(r21,ewtabscale);
2629 ewitab = _mm_cvttpd_epi32(ewrt);
2630 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2631 ewitab = _mm_slli_epi32(ewitab,2);
2632 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2633 ewtabD = _mm_setzero_pd();
2634 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2635 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2636 ewtabFn = _mm_setzero_pd();
2637 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2638 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2639 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2640 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
2641 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2643 d = _mm_sub_pd(r21,rswitch);
2644 d = _mm_max_pd(d,_mm_setzero_pd());
2645 d2 = _mm_mul_pd(d,d);
2646 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2648 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2650 /* Evaluate switch function */
2651 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2652 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
2653 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
2655 fscal = felec;
2657 fscal = _mm_and_pd(fscal,cutoff_mask);
2659 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2661 /* Calculate temporary vectorial force */
2662 tx = _mm_mul_pd(fscal,dx21);
2663 ty = _mm_mul_pd(fscal,dy21);
2664 tz = _mm_mul_pd(fscal,dz21);
2666 /* Update vectorial force */
2667 fix2 = _mm_add_pd(fix2,tx);
2668 fiy2 = _mm_add_pd(fiy2,ty);
2669 fiz2 = _mm_add_pd(fiz2,tz);
2671 fjx1 = _mm_add_pd(fjx1,tx);
2672 fjy1 = _mm_add_pd(fjy1,ty);
2673 fjz1 = _mm_add_pd(fjz1,tz);
2677 /**************************
2678 * CALCULATE INTERACTIONS *
2679 **************************/
2681 if (gmx_mm_any_lt(rsq22,rcutoff2))
2684 r22 = _mm_mul_pd(rsq22,rinv22);
2686 /* EWALD ELECTROSTATICS */
2688 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2689 ewrt = _mm_mul_pd(r22,ewtabscale);
2690 ewitab = _mm_cvttpd_epi32(ewrt);
2691 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2692 ewitab = _mm_slli_epi32(ewitab,2);
2693 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2694 ewtabD = _mm_setzero_pd();
2695 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2696 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2697 ewtabFn = _mm_setzero_pd();
2698 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2699 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2700 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2701 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
2702 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2704 d = _mm_sub_pd(r22,rswitch);
2705 d = _mm_max_pd(d,_mm_setzero_pd());
2706 d2 = _mm_mul_pd(d,d);
2707 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2709 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2711 /* Evaluate switch function */
2712 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2713 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
2714 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
2716 fscal = felec;
2718 fscal = _mm_and_pd(fscal,cutoff_mask);
2720 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2722 /* Calculate temporary vectorial force */
2723 tx = _mm_mul_pd(fscal,dx22);
2724 ty = _mm_mul_pd(fscal,dy22);
2725 tz = _mm_mul_pd(fscal,dz22);
2727 /* Update vectorial force */
2728 fix2 = _mm_add_pd(fix2,tx);
2729 fiy2 = _mm_add_pd(fiy2,ty);
2730 fiz2 = _mm_add_pd(fiz2,tz);
2732 fjx2 = _mm_add_pd(fjx2,tx);
2733 fjy2 = _mm_add_pd(fjy2,ty);
2734 fjz2 = _mm_add_pd(fjz2,tz);
2738 /**************************
2739 * CALCULATE INTERACTIONS *
2740 **************************/
2742 if (gmx_mm_any_lt(rsq23,rcutoff2))
2745 r23 = _mm_mul_pd(rsq23,rinv23);
2747 /* EWALD ELECTROSTATICS */
2749 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2750 ewrt = _mm_mul_pd(r23,ewtabscale);
2751 ewitab = _mm_cvttpd_epi32(ewrt);
2752 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2753 ewitab = _mm_slli_epi32(ewitab,2);
2754 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2755 ewtabD = _mm_setzero_pd();
2756 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2757 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2758 ewtabFn = _mm_setzero_pd();
2759 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2760 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2761 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2762 velec = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
2763 felec = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
2765 d = _mm_sub_pd(r23,rswitch);
2766 d = _mm_max_pd(d,_mm_setzero_pd());
2767 d2 = _mm_mul_pd(d,d);
2768 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2770 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2772 /* Evaluate switch function */
2773 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2774 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv23,_mm_mul_pd(velec,dsw)) );
2775 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
2777 fscal = felec;
2779 fscal = _mm_and_pd(fscal,cutoff_mask);
2781 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2783 /* Calculate temporary vectorial force */
2784 tx = _mm_mul_pd(fscal,dx23);
2785 ty = _mm_mul_pd(fscal,dy23);
2786 tz = _mm_mul_pd(fscal,dz23);
2788 /* Update vectorial force */
2789 fix2 = _mm_add_pd(fix2,tx);
2790 fiy2 = _mm_add_pd(fiy2,ty);
2791 fiz2 = _mm_add_pd(fiz2,tz);
2793 fjx3 = _mm_add_pd(fjx3,tx);
2794 fjy3 = _mm_add_pd(fjy3,ty);
2795 fjz3 = _mm_add_pd(fjz3,tz);
2799 /**************************
2800 * CALCULATE INTERACTIONS *
2801 **************************/
2803 if (gmx_mm_any_lt(rsq31,rcutoff2))
2806 r31 = _mm_mul_pd(rsq31,rinv31);
2808 /* EWALD ELECTROSTATICS */
2810 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2811 ewrt = _mm_mul_pd(r31,ewtabscale);
2812 ewitab = _mm_cvttpd_epi32(ewrt);
2813 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2814 ewitab = _mm_slli_epi32(ewitab,2);
2815 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2816 ewtabD = _mm_setzero_pd();
2817 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2818 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2819 ewtabFn = _mm_setzero_pd();
2820 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2821 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2822 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2823 velec = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
2824 felec = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
2826 d = _mm_sub_pd(r31,rswitch);
2827 d = _mm_max_pd(d,_mm_setzero_pd());
2828 d2 = _mm_mul_pd(d,d);
2829 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2831 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2833 /* Evaluate switch function */
2834 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2835 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv31,_mm_mul_pd(velec,dsw)) );
2836 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
2838 fscal = felec;
2840 fscal = _mm_and_pd(fscal,cutoff_mask);
2842 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2844 /* Calculate temporary vectorial force */
2845 tx = _mm_mul_pd(fscal,dx31);
2846 ty = _mm_mul_pd(fscal,dy31);
2847 tz = _mm_mul_pd(fscal,dz31);
2849 /* Update vectorial force */
2850 fix3 = _mm_add_pd(fix3,tx);
2851 fiy3 = _mm_add_pd(fiy3,ty);
2852 fiz3 = _mm_add_pd(fiz3,tz);
2854 fjx1 = _mm_add_pd(fjx1,tx);
2855 fjy1 = _mm_add_pd(fjy1,ty);
2856 fjz1 = _mm_add_pd(fjz1,tz);
2860 /**************************
2861 * CALCULATE INTERACTIONS *
2862 **************************/
2864 if (gmx_mm_any_lt(rsq32,rcutoff2))
2867 r32 = _mm_mul_pd(rsq32,rinv32);
2869 /* EWALD ELECTROSTATICS */
2871 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2872 ewrt = _mm_mul_pd(r32,ewtabscale);
2873 ewitab = _mm_cvttpd_epi32(ewrt);
2874 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2875 ewitab = _mm_slli_epi32(ewitab,2);
2876 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2877 ewtabD = _mm_setzero_pd();
2878 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2879 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2880 ewtabFn = _mm_setzero_pd();
2881 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2882 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2883 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2884 velec = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
2885 felec = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
2887 d = _mm_sub_pd(r32,rswitch);
2888 d = _mm_max_pd(d,_mm_setzero_pd());
2889 d2 = _mm_mul_pd(d,d);
2890 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2892 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2894 /* Evaluate switch function */
2895 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2896 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv32,_mm_mul_pd(velec,dsw)) );
2897 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
2899 fscal = felec;
2901 fscal = _mm_and_pd(fscal,cutoff_mask);
2903 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2905 /* Calculate temporary vectorial force */
2906 tx = _mm_mul_pd(fscal,dx32);
2907 ty = _mm_mul_pd(fscal,dy32);
2908 tz = _mm_mul_pd(fscal,dz32);
2910 /* Update vectorial force */
2911 fix3 = _mm_add_pd(fix3,tx);
2912 fiy3 = _mm_add_pd(fiy3,ty);
2913 fiz3 = _mm_add_pd(fiz3,tz);
2915 fjx2 = _mm_add_pd(fjx2,tx);
2916 fjy2 = _mm_add_pd(fjy2,ty);
2917 fjz2 = _mm_add_pd(fjz2,tz);
2921 /**************************
2922 * CALCULATE INTERACTIONS *
2923 **************************/
2925 if (gmx_mm_any_lt(rsq33,rcutoff2))
2928 r33 = _mm_mul_pd(rsq33,rinv33);
2930 /* EWALD ELECTROSTATICS */
2932 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2933 ewrt = _mm_mul_pd(r33,ewtabscale);
2934 ewitab = _mm_cvttpd_epi32(ewrt);
2935 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2936 ewitab = _mm_slli_epi32(ewitab,2);
2937 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2938 ewtabD = _mm_setzero_pd();
2939 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2940 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2941 ewtabFn = _mm_setzero_pd();
2942 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2943 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2944 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2945 velec = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
2946 felec = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
2948 d = _mm_sub_pd(r33,rswitch);
2949 d = _mm_max_pd(d,_mm_setzero_pd());
2950 d2 = _mm_mul_pd(d,d);
2951 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2953 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2955 /* Evaluate switch function */
2956 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2957 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv33,_mm_mul_pd(velec,dsw)) );
2958 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
2960 fscal = felec;
2962 fscal = _mm_and_pd(fscal,cutoff_mask);
2964 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2966 /* Calculate temporary vectorial force */
2967 tx = _mm_mul_pd(fscal,dx33);
2968 ty = _mm_mul_pd(fscal,dy33);
2969 tz = _mm_mul_pd(fscal,dz33);
2971 /* Update vectorial force */
2972 fix3 = _mm_add_pd(fix3,tx);
2973 fiy3 = _mm_add_pd(fiy3,ty);
2974 fiz3 = _mm_add_pd(fiz3,tz);
2976 fjx3 = _mm_add_pd(fjx3,tx);
2977 fjy3 = _mm_add_pd(fjy3,ty);
2978 fjz3 = _mm_add_pd(fjz3,tz);
2982 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2984 /* Inner loop uses 558 flops */
2987 /* End of innermost loop */
2989 gmx_mm_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2990 f+i_coord_offset+DIM,fshift+i_shift_offset);
2992 /* Increment number of inner iterations */
2993 inneriter += j_index_end - j_index_start;
2995 /* Outer loop uses 18 flops */
2998 /* Increment number of outer iterations */
2999 outeriter += nri;
3001 /* Update outer/inner flops */
3003 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*558);