Fix segmentation fault in minimize
[gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse4_1_double / nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_sse4_1_double.cpp
blob340e5333af5f20a5e6d98dcb39e3c3ae2ac17f95
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse4_1_double kernel generator.
38 #include "gmxpre.h"
40 #include "config.h"
42 #include <math.h>
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_sse4_1_double.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse4_1_double
51 * Electrostatics interaction: Ewald
52 * VdW interaction: LennardJones
53 * Geometry: Water3-Water3
54 * Calculate force/pot: PotentialAndForce
56 void
57 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse4_1_double
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73 int jnrA,jnrB;
74 int j_coord_offsetA,j_coord_offsetB;
75 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
76 real rcutoff_scalar;
77 real *shiftvec,*fshift,*x,*f;
78 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
79 int vdwioffset0;
80 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
81 int vdwioffset1;
82 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
83 int vdwioffset2;
84 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
85 int vdwjidx0A,vdwjidx0B;
86 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
87 int vdwjidx1A,vdwjidx1B;
88 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
89 int vdwjidx2A,vdwjidx2B;
90 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
91 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
92 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
93 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
94 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
95 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
96 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
97 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
98 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
99 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
100 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
101 real *charge;
102 int nvdwtype;
103 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
104 int *vdwtype;
105 real *vdwparam;
106 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
107 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
108 __m128i ewitab;
109 __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
110 real *ewtab;
111 __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
112 real rswitch_scalar,d_scalar;
113 __m128d dummy_mask,cutoff_mask;
114 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
115 __m128d one = _mm_set1_pd(1.0);
116 __m128d two = _mm_set1_pd(2.0);
117 x = xx[0];
118 f = ff[0];
120 nri = nlist->nri;
121 iinr = nlist->iinr;
122 jindex = nlist->jindex;
123 jjnr = nlist->jjnr;
124 shiftidx = nlist->shift;
125 gid = nlist->gid;
126 shiftvec = fr->shift_vec[0];
127 fshift = fr->fshift[0];
128 facel = _mm_set1_pd(fr->ic->epsfac);
129 charge = mdatoms->chargeA;
130 nvdwtype = fr->ntype;
131 vdwparam = fr->nbfp;
132 vdwtype = mdatoms->typeA;
134 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
135 ewtab = fr->ic->tabq_coul_FDV0;
136 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
137 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
139 /* Setup water-specific parameters */
140 inr = nlist->iinr[0];
141 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
142 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
143 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
144 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
146 jq0 = _mm_set1_pd(charge[inr+0]);
147 jq1 = _mm_set1_pd(charge[inr+1]);
148 jq2 = _mm_set1_pd(charge[inr+2]);
149 vdwjidx0A = 2*vdwtype[inr+0];
150 qq00 = _mm_mul_pd(iq0,jq0);
151 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
152 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
153 qq01 = _mm_mul_pd(iq0,jq1);
154 qq02 = _mm_mul_pd(iq0,jq2);
155 qq10 = _mm_mul_pd(iq1,jq0);
156 qq11 = _mm_mul_pd(iq1,jq1);
157 qq12 = _mm_mul_pd(iq1,jq2);
158 qq20 = _mm_mul_pd(iq2,jq0);
159 qq21 = _mm_mul_pd(iq2,jq1);
160 qq22 = _mm_mul_pd(iq2,jq2);
162 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
163 rcutoff_scalar = fr->ic->rcoulomb;
164 rcutoff = _mm_set1_pd(rcutoff_scalar);
165 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
167 rswitch_scalar = fr->ic->rcoulomb_switch;
168 rswitch = _mm_set1_pd(rswitch_scalar);
169 /* Setup switch parameters */
170 d_scalar = rcutoff_scalar-rswitch_scalar;
171 d = _mm_set1_pd(d_scalar);
172 swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
173 swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
174 swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
175 swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
176 swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
177 swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
179 /* Avoid stupid compiler warnings */
180 jnrA = jnrB = 0;
181 j_coord_offsetA = 0;
182 j_coord_offsetB = 0;
184 outeriter = 0;
185 inneriter = 0;
187 /* Start outer loop over neighborlists */
188 for(iidx=0; iidx<nri; iidx++)
190 /* Load shift vector for this list */
191 i_shift_offset = DIM*shiftidx[iidx];
193 /* Load limits for loop over neighbors */
194 j_index_start = jindex[iidx];
195 j_index_end = jindex[iidx+1];
197 /* Get outer coordinate index */
198 inr = iinr[iidx];
199 i_coord_offset = DIM*inr;
201 /* Load i particle coords and add shift vector */
202 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
203 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
205 fix0 = _mm_setzero_pd();
206 fiy0 = _mm_setzero_pd();
207 fiz0 = _mm_setzero_pd();
208 fix1 = _mm_setzero_pd();
209 fiy1 = _mm_setzero_pd();
210 fiz1 = _mm_setzero_pd();
211 fix2 = _mm_setzero_pd();
212 fiy2 = _mm_setzero_pd();
213 fiz2 = _mm_setzero_pd();
215 /* Reset potential sums */
216 velecsum = _mm_setzero_pd();
217 vvdwsum = _mm_setzero_pd();
219 /* Start inner kernel loop */
220 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
223 /* Get j neighbor index, and coordinate index */
224 jnrA = jjnr[jidx];
225 jnrB = jjnr[jidx+1];
226 j_coord_offsetA = DIM*jnrA;
227 j_coord_offsetB = DIM*jnrB;
229 /* load j atom coordinates */
230 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
231 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
233 /* Calculate displacement vector */
234 dx00 = _mm_sub_pd(ix0,jx0);
235 dy00 = _mm_sub_pd(iy0,jy0);
236 dz00 = _mm_sub_pd(iz0,jz0);
237 dx01 = _mm_sub_pd(ix0,jx1);
238 dy01 = _mm_sub_pd(iy0,jy1);
239 dz01 = _mm_sub_pd(iz0,jz1);
240 dx02 = _mm_sub_pd(ix0,jx2);
241 dy02 = _mm_sub_pd(iy0,jy2);
242 dz02 = _mm_sub_pd(iz0,jz2);
243 dx10 = _mm_sub_pd(ix1,jx0);
244 dy10 = _mm_sub_pd(iy1,jy0);
245 dz10 = _mm_sub_pd(iz1,jz0);
246 dx11 = _mm_sub_pd(ix1,jx1);
247 dy11 = _mm_sub_pd(iy1,jy1);
248 dz11 = _mm_sub_pd(iz1,jz1);
249 dx12 = _mm_sub_pd(ix1,jx2);
250 dy12 = _mm_sub_pd(iy1,jy2);
251 dz12 = _mm_sub_pd(iz1,jz2);
252 dx20 = _mm_sub_pd(ix2,jx0);
253 dy20 = _mm_sub_pd(iy2,jy0);
254 dz20 = _mm_sub_pd(iz2,jz0);
255 dx21 = _mm_sub_pd(ix2,jx1);
256 dy21 = _mm_sub_pd(iy2,jy1);
257 dz21 = _mm_sub_pd(iz2,jz1);
258 dx22 = _mm_sub_pd(ix2,jx2);
259 dy22 = _mm_sub_pd(iy2,jy2);
260 dz22 = _mm_sub_pd(iz2,jz2);
262 /* Calculate squared distance and things based on it */
263 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
264 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
265 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
266 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
267 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
268 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
269 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
270 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
271 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
273 rinv00 = sse41_invsqrt_d(rsq00);
274 rinv01 = sse41_invsqrt_d(rsq01);
275 rinv02 = sse41_invsqrt_d(rsq02);
276 rinv10 = sse41_invsqrt_d(rsq10);
277 rinv11 = sse41_invsqrt_d(rsq11);
278 rinv12 = sse41_invsqrt_d(rsq12);
279 rinv20 = sse41_invsqrt_d(rsq20);
280 rinv21 = sse41_invsqrt_d(rsq21);
281 rinv22 = sse41_invsqrt_d(rsq22);
283 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
284 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
285 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
286 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
287 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
288 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
289 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
290 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
291 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
293 fjx0 = _mm_setzero_pd();
294 fjy0 = _mm_setzero_pd();
295 fjz0 = _mm_setzero_pd();
296 fjx1 = _mm_setzero_pd();
297 fjy1 = _mm_setzero_pd();
298 fjz1 = _mm_setzero_pd();
299 fjx2 = _mm_setzero_pd();
300 fjy2 = _mm_setzero_pd();
301 fjz2 = _mm_setzero_pd();
303 /**************************
304 * CALCULATE INTERACTIONS *
305 **************************/
307 if (gmx_mm_any_lt(rsq00,rcutoff2))
310 r00 = _mm_mul_pd(rsq00,rinv00);
312 /* EWALD ELECTROSTATICS */
314 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
315 ewrt = _mm_mul_pd(r00,ewtabscale);
316 ewitab = _mm_cvttpd_epi32(ewrt);
317 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
318 ewitab = _mm_slli_epi32(ewitab,2);
319 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
320 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
321 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
322 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
323 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
324 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
325 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
326 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
327 velec = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
328 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
330 /* LENNARD-JONES DISPERSION/REPULSION */
332 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
333 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
334 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
335 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
336 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
338 d = _mm_sub_pd(r00,rswitch);
339 d = _mm_max_pd(d,_mm_setzero_pd());
340 d2 = _mm_mul_pd(d,d);
341 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
343 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
345 /* Evaluate switch function */
346 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
347 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(velec,dsw)) );
348 fvdw = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
349 velec = _mm_mul_pd(velec,sw);
350 vvdw = _mm_mul_pd(vvdw,sw);
351 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
353 /* Update potential sum for this i atom from the interaction with this j atom. */
354 velec = _mm_and_pd(velec,cutoff_mask);
355 velecsum = _mm_add_pd(velecsum,velec);
356 vvdw = _mm_and_pd(vvdw,cutoff_mask);
357 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
359 fscal = _mm_add_pd(felec,fvdw);
361 fscal = _mm_and_pd(fscal,cutoff_mask);
363 /* Calculate temporary vectorial force */
364 tx = _mm_mul_pd(fscal,dx00);
365 ty = _mm_mul_pd(fscal,dy00);
366 tz = _mm_mul_pd(fscal,dz00);
368 /* Update vectorial force */
369 fix0 = _mm_add_pd(fix0,tx);
370 fiy0 = _mm_add_pd(fiy0,ty);
371 fiz0 = _mm_add_pd(fiz0,tz);
373 fjx0 = _mm_add_pd(fjx0,tx);
374 fjy0 = _mm_add_pd(fjy0,ty);
375 fjz0 = _mm_add_pd(fjz0,tz);
379 /**************************
380 * CALCULATE INTERACTIONS *
381 **************************/
383 if (gmx_mm_any_lt(rsq01,rcutoff2))
386 r01 = _mm_mul_pd(rsq01,rinv01);
388 /* EWALD ELECTROSTATICS */
390 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
391 ewrt = _mm_mul_pd(r01,ewtabscale);
392 ewitab = _mm_cvttpd_epi32(ewrt);
393 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
394 ewitab = _mm_slli_epi32(ewitab,2);
395 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
396 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
397 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
398 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
399 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
400 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
401 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
402 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
403 velec = _mm_mul_pd(qq01,_mm_sub_pd(rinv01,velec));
404 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
406 d = _mm_sub_pd(r01,rswitch);
407 d = _mm_max_pd(d,_mm_setzero_pd());
408 d2 = _mm_mul_pd(d,d);
409 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
411 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
413 /* Evaluate switch function */
414 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
415 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv01,_mm_mul_pd(velec,dsw)) );
416 velec = _mm_mul_pd(velec,sw);
417 cutoff_mask = _mm_cmplt_pd(rsq01,rcutoff2);
419 /* Update potential sum for this i atom from the interaction with this j atom. */
420 velec = _mm_and_pd(velec,cutoff_mask);
421 velecsum = _mm_add_pd(velecsum,velec);
423 fscal = felec;
425 fscal = _mm_and_pd(fscal,cutoff_mask);
427 /* Calculate temporary vectorial force */
428 tx = _mm_mul_pd(fscal,dx01);
429 ty = _mm_mul_pd(fscal,dy01);
430 tz = _mm_mul_pd(fscal,dz01);
432 /* Update vectorial force */
433 fix0 = _mm_add_pd(fix0,tx);
434 fiy0 = _mm_add_pd(fiy0,ty);
435 fiz0 = _mm_add_pd(fiz0,tz);
437 fjx1 = _mm_add_pd(fjx1,tx);
438 fjy1 = _mm_add_pd(fjy1,ty);
439 fjz1 = _mm_add_pd(fjz1,tz);
443 /**************************
444 * CALCULATE INTERACTIONS *
445 **************************/
447 if (gmx_mm_any_lt(rsq02,rcutoff2))
450 r02 = _mm_mul_pd(rsq02,rinv02);
452 /* EWALD ELECTROSTATICS */
454 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
455 ewrt = _mm_mul_pd(r02,ewtabscale);
456 ewitab = _mm_cvttpd_epi32(ewrt);
457 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
458 ewitab = _mm_slli_epi32(ewitab,2);
459 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
460 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
461 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
462 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
463 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
464 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
465 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
466 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
467 velec = _mm_mul_pd(qq02,_mm_sub_pd(rinv02,velec));
468 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
470 d = _mm_sub_pd(r02,rswitch);
471 d = _mm_max_pd(d,_mm_setzero_pd());
472 d2 = _mm_mul_pd(d,d);
473 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
475 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
477 /* Evaluate switch function */
478 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
479 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv02,_mm_mul_pd(velec,dsw)) );
480 velec = _mm_mul_pd(velec,sw);
481 cutoff_mask = _mm_cmplt_pd(rsq02,rcutoff2);
483 /* Update potential sum for this i atom from the interaction with this j atom. */
484 velec = _mm_and_pd(velec,cutoff_mask);
485 velecsum = _mm_add_pd(velecsum,velec);
487 fscal = felec;
489 fscal = _mm_and_pd(fscal,cutoff_mask);
491 /* Calculate temporary vectorial force */
492 tx = _mm_mul_pd(fscal,dx02);
493 ty = _mm_mul_pd(fscal,dy02);
494 tz = _mm_mul_pd(fscal,dz02);
496 /* Update vectorial force */
497 fix0 = _mm_add_pd(fix0,tx);
498 fiy0 = _mm_add_pd(fiy0,ty);
499 fiz0 = _mm_add_pd(fiz0,tz);
501 fjx2 = _mm_add_pd(fjx2,tx);
502 fjy2 = _mm_add_pd(fjy2,ty);
503 fjz2 = _mm_add_pd(fjz2,tz);
507 /**************************
508 * CALCULATE INTERACTIONS *
509 **************************/
511 if (gmx_mm_any_lt(rsq10,rcutoff2))
514 r10 = _mm_mul_pd(rsq10,rinv10);
516 /* EWALD ELECTROSTATICS */
518 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
519 ewrt = _mm_mul_pd(r10,ewtabscale);
520 ewitab = _mm_cvttpd_epi32(ewrt);
521 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
522 ewitab = _mm_slli_epi32(ewitab,2);
523 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
524 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
525 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
526 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
527 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
528 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
529 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
530 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
531 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
532 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
534 d = _mm_sub_pd(r10,rswitch);
535 d = _mm_max_pd(d,_mm_setzero_pd());
536 d2 = _mm_mul_pd(d,d);
537 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
539 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
541 /* Evaluate switch function */
542 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
543 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
544 velec = _mm_mul_pd(velec,sw);
545 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
547 /* Update potential sum for this i atom from the interaction with this j atom. */
548 velec = _mm_and_pd(velec,cutoff_mask);
549 velecsum = _mm_add_pd(velecsum,velec);
551 fscal = felec;
553 fscal = _mm_and_pd(fscal,cutoff_mask);
555 /* Calculate temporary vectorial force */
556 tx = _mm_mul_pd(fscal,dx10);
557 ty = _mm_mul_pd(fscal,dy10);
558 tz = _mm_mul_pd(fscal,dz10);
560 /* Update vectorial force */
561 fix1 = _mm_add_pd(fix1,tx);
562 fiy1 = _mm_add_pd(fiy1,ty);
563 fiz1 = _mm_add_pd(fiz1,tz);
565 fjx0 = _mm_add_pd(fjx0,tx);
566 fjy0 = _mm_add_pd(fjy0,ty);
567 fjz0 = _mm_add_pd(fjz0,tz);
571 /**************************
572 * CALCULATE INTERACTIONS *
573 **************************/
575 if (gmx_mm_any_lt(rsq11,rcutoff2))
578 r11 = _mm_mul_pd(rsq11,rinv11);
580 /* EWALD ELECTROSTATICS */
582 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
583 ewrt = _mm_mul_pd(r11,ewtabscale);
584 ewitab = _mm_cvttpd_epi32(ewrt);
585 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
586 ewitab = _mm_slli_epi32(ewitab,2);
587 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
588 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
589 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
590 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
591 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
592 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
593 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
594 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
595 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
596 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
598 d = _mm_sub_pd(r11,rswitch);
599 d = _mm_max_pd(d,_mm_setzero_pd());
600 d2 = _mm_mul_pd(d,d);
601 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
603 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
605 /* Evaluate switch function */
606 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
607 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
608 velec = _mm_mul_pd(velec,sw);
609 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
611 /* Update potential sum for this i atom from the interaction with this j atom. */
612 velec = _mm_and_pd(velec,cutoff_mask);
613 velecsum = _mm_add_pd(velecsum,velec);
615 fscal = felec;
617 fscal = _mm_and_pd(fscal,cutoff_mask);
619 /* Calculate temporary vectorial force */
620 tx = _mm_mul_pd(fscal,dx11);
621 ty = _mm_mul_pd(fscal,dy11);
622 tz = _mm_mul_pd(fscal,dz11);
624 /* Update vectorial force */
625 fix1 = _mm_add_pd(fix1,tx);
626 fiy1 = _mm_add_pd(fiy1,ty);
627 fiz1 = _mm_add_pd(fiz1,tz);
629 fjx1 = _mm_add_pd(fjx1,tx);
630 fjy1 = _mm_add_pd(fjy1,ty);
631 fjz1 = _mm_add_pd(fjz1,tz);
635 /**************************
636 * CALCULATE INTERACTIONS *
637 **************************/
639 if (gmx_mm_any_lt(rsq12,rcutoff2))
642 r12 = _mm_mul_pd(rsq12,rinv12);
644 /* EWALD ELECTROSTATICS */
646 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
647 ewrt = _mm_mul_pd(r12,ewtabscale);
648 ewitab = _mm_cvttpd_epi32(ewrt);
649 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
650 ewitab = _mm_slli_epi32(ewitab,2);
651 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
652 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
653 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
654 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
655 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
656 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
657 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
658 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
659 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
660 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
662 d = _mm_sub_pd(r12,rswitch);
663 d = _mm_max_pd(d,_mm_setzero_pd());
664 d2 = _mm_mul_pd(d,d);
665 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
667 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
669 /* Evaluate switch function */
670 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
671 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
672 velec = _mm_mul_pd(velec,sw);
673 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
675 /* Update potential sum for this i atom from the interaction with this j atom. */
676 velec = _mm_and_pd(velec,cutoff_mask);
677 velecsum = _mm_add_pd(velecsum,velec);
679 fscal = felec;
681 fscal = _mm_and_pd(fscal,cutoff_mask);
683 /* Calculate temporary vectorial force */
684 tx = _mm_mul_pd(fscal,dx12);
685 ty = _mm_mul_pd(fscal,dy12);
686 tz = _mm_mul_pd(fscal,dz12);
688 /* Update vectorial force */
689 fix1 = _mm_add_pd(fix1,tx);
690 fiy1 = _mm_add_pd(fiy1,ty);
691 fiz1 = _mm_add_pd(fiz1,tz);
693 fjx2 = _mm_add_pd(fjx2,tx);
694 fjy2 = _mm_add_pd(fjy2,ty);
695 fjz2 = _mm_add_pd(fjz2,tz);
699 /**************************
700 * CALCULATE INTERACTIONS *
701 **************************/
703 if (gmx_mm_any_lt(rsq20,rcutoff2))
706 r20 = _mm_mul_pd(rsq20,rinv20);
708 /* EWALD ELECTROSTATICS */
710 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
711 ewrt = _mm_mul_pd(r20,ewtabscale);
712 ewitab = _mm_cvttpd_epi32(ewrt);
713 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
714 ewitab = _mm_slli_epi32(ewitab,2);
715 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
716 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
717 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
718 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
719 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
720 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
721 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
722 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
723 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
724 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
726 d = _mm_sub_pd(r20,rswitch);
727 d = _mm_max_pd(d,_mm_setzero_pd());
728 d2 = _mm_mul_pd(d,d);
729 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
731 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
733 /* Evaluate switch function */
734 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
735 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
736 velec = _mm_mul_pd(velec,sw);
737 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
739 /* Update potential sum for this i atom from the interaction with this j atom. */
740 velec = _mm_and_pd(velec,cutoff_mask);
741 velecsum = _mm_add_pd(velecsum,velec);
743 fscal = felec;
745 fscal = _mm_and_pd(fscal,cutoff_mask);
747 /* Calculate temporary vectorial force */
748 tx = _mm_mul_pd(fscal,dx20);
749 ty = _mm_mul_pd(fscal,dy20);
750 tz = _mm_mul_pd(fscal,dz20);
752 /* Update vectorial force */
753 fix2 = _mm_add_pd(fix2,tx);
754 fiy2 = _mm_add_pd(fiy2,ty);
755 fiz2 = _mm_add_pd(fiz2,tz);
757 fjx0 = _mm_add_pd(fjx0,tx);
758 fjy0 = _mm_add_pd(fjy0,ty);
759 fjz0 = _mm_add_pd(fjz0,tz);
763 /**************************
764 * CALCULATE INTERACTIONS *
765 **************************/
767 if (gmx_mm_any_lt(rsq21,rcutoff2))
770 r21 = _mm_mul_pd(rsq21,rinv21);
772 /* EWALD ELECTROSTATICS */
774 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
775 ewrt = _mm_mul_pd(r21,ewtabscale);
776 ewitab = _mm_cvttpd_epi32(ewrt);
777 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
778 ewitab = _mm_slli_epi32(ewitab,2);
779 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
780 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
781 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
782 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
783 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
784 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
785 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
786 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
787 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
788 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
790 d = _mm_sub_pd(r21,rswitch);
791 d = _mm_max_pd(d,_mm_setzero_pd());
792 d2 = _mm_mul_pd(d,d);
793 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
795 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
797 /* Evaluate switch function */
798 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
799 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
800 velec = _mm_mul_pd(velec,sw);
801 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
803 /* Update potential sum for this i atom from the interaction with this j atom. */
804 velec = _mm_and_pd(velec,cutoff_mask);
805 velecsum = _mm_add_pd(velecsum,velec);
807 fscal = felec;
809 fscal = _mm_and_pd(fscal,cutoff_mask);
811 /* Calculate temporary vectorial force */
812 tx = _mm_mul_pd(fscal,dx21);
813 ty = _mm_mul_pd(fscal,dy21);
814 tz = _mm_mul_pd(fscal,dz21);
816 /* Update vectorial force */
817 fix2 = _mm_add_pd(fix2,tx);
818 fiy2 = _mm_add_pd(fiy2,ty);
819 fiz2 = _mm_add_pd(fiz2,tz);
821 fjx1 = _mm_add_pd(fjx1,tx);
822 fjy1 = _mm_add_pd(fjy1,ty);
823 fjz1 = _mm_add_pd(fjz1,tz);
827 /**************************
828 * CALCULATE INTERACTIONS *
829 **************************/
831 if (gmx_mm_any_lt(rsq22,rcutoff2))
834 r22 = _mm_mul_pd(rsq22,rinv22);
836 /* EWALD ELECTROSTATICS */
838 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
839 ewrt = _mm_mul_pd(r22,ewtabscale);
840 ewitab = _mm_cvttpd_epi32(ewrt);
841 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
842 ewitab = _mm_slli_epi32(ewitab,2);
843 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
844 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
845 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
846 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
847 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
848 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
849 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
850 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
851 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
852 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
854 d = _mm_sub_pd(r22,rswitch);
855 d = _mm_max_pd(d,_mm_setzero_pd());
856 d2 = _mm_mul_pd(d,d);
857 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
859 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
861 /* Evaluate switch function */
862 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
863 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
864 velec = _mm_mul_pd(velec,sw);
865 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
867 /* Update potential sum for this i atom from the interaction with this j atom. */
868 velec = _mm_and_pd(velec,cutoff_mask);
869 velecsum = _mm_add_pd(velecsum,velec);
871 fscal = felec;
873 fscal = _mm_and_pd(fscal,cutoff_mask);
875 /* Calculate temporary vectorial force */
876 tx = _mm_mul_pd(fscal,dx22);
877 ty = _mm_mul_pd(fscal,dy22);
878 tz = _mm_mul_pd(fscal,dz22);
880 /* Update vectorial force */
881 fix2 = _mm_add_pd(fix2,tx);
882 fiy2 = _mm_add_pd(fiy2,ty);
883 fiz2 = _mm_add_pd(fiz2,tz);
885 fjx2 = _mm_add_pd(fjx2,tx);
886 fjy2 = _mm_add_pd(fjy2,ty);
887 fjz2 = _mm_add_pd(fjz2,tz);
891 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
893 /* Inner loop uses 603 flops */
896 if(jidx<j_index_end)
899 jnrA = jjnr[jidx];
900 j_coord_offsetA = DIM*jnrA;
902 /* load j atom coordinates */
903 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
904 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
906 /* Calculate displacement vector */
907 dx00 = _mm_sub_pd(ix0,jx0);
908 dy00 = _mm_sub_pd(iy0,jy0);
909 dz00 = _mm_sub_pd(iz0,jz0);
910 dx01 = _mm_sub_pd(ix0,jx1);
911 dy01 = _mm_sub_pd(iy0,jy1);
912 dz01 = _mm_sub_pd(iz0,jz1);
913 dx02 = _mm_sub_pd(ix0,jx2);
914 dy02 = _mm_sub_pd(iy0,jy2);
915 dz02 = _mm_sub_pd(iz0,jz2);
916 dx10 = _mm_sub_pd(ix1,jx0);
917 dy10 = _mm_sub_pd(iy1,jy0);
918 dz10 = _mm_sub_pd(iz1,jz0);
919 dx11 = _mm_sub_pd(ix1,jx1);
920 dy11 = _mm_sub_pd(iy1,jy1);
921 dz11 = _mm_sub_pd(iz1,jz1);
922 dx12 = _mm_sub_pd(ix1,jx2);
923 dy12 = _mm_sub_pd(iy1,jy2);
924 dz12 = _mm_sub_pd(iz1,jz2);
925 dx20 = _mm_sub_pd(ix2,jx0);
926 dy20 = _mm_sub_pd(iy2,jy0);
927 dz20 = _mm_sub_pd(iz2,jz0);
928 dx21 = _mm_sub_pd(ix2,jx1);
929 dy21 = _mm_sub_pd(iy2,jy1);
930 dz21 = _mm_sub_pd(iz2,jz1);
931 dx22 = _mm_sub_pd(ix2,jx2);
932 dy22 = _mm_sub_pd(iy2,jy2);
933 dz22 = _mm_sub_pd(iz2,jz2);
935 /* Calculate squared distance and things based on it */
936 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
937 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
938 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
939 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
940 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
941 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
942 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
943 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
944 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
946 rinv00 = sse41_invsqrt_d(rsq00);
947 rinv01 = sse41_invsqrt_d(rsq01);
948 rinv02 = sse41_invsqrt_d(rsq02);
949 rinv10 = sse41_invsqrt_d(rsq10);
950 rinv11 = sse41_invsqrt_d(rsq11);
951 rinv12 = sse41_invsqrt_d(rsq12);
952 rinv20 = sse41_invsqrt_d(rsq20);
953 rinv21 = sse41_invsqrt_d(rsq21);
954 rinv22 = sse41_invsqrt_d(rsq22);
956 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
957 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
958 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
959 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
960 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
961 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
962 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
963 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
964 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
966 fjx0 = _mm_setzero_pd();
967 fjy0 = _mm_setzero_pd();
968 fjz0 = _mm_setzero_pd();
969 fjx1 = _mm_setzero_pd();
970 fjy1 = _mm_setzero_pd();
971 fjz1 = _mm_setzero_pd();
972 fjx2 = _mm_setzero_pd();
973 fjy2 = _mm_setzero_pd();
974 fjz2 = _mm_setzero_pd();
976 /**************************
977 * CALCULATE INTERACTIONS *
978 **************************/
980 if (gmx_mm_any_lt(rsq00,rcutoff2))
983 r00 = _mm_mul_pd(rsq00,rinv00);
985 /* EWALD ELECTROSTATICS */
987 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
988 ewrt = _mm_mul_pd(r00,ewtabscale);
989 ewitab = _mm_cvttpd_epi32(ewrt);
990 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
991 ewitab = _mm_slli_epi32(ewitab,2);
992 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
993 ewtabD = _mm_setzero_pd();
994 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
995 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
996 ewtabFn = _mm_setzero_pd();
997 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
998 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
999 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1000 velec = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
1001 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
1003 /* LENNARD-JONES DISPERSION/REPULSION */
1005 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1006 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
1007 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
1008 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
1009 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
1011 d = _mm_sub_pd(r00,rswitch);
1012 d = _mm_max_pd(d,_mm_setzero_pd());
1013 d2 = _mm_mul_pd(d,d);
1014 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1016 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1018 /* Evaluate switch function */
1019 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1020 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(velec,dsw)) );
1021 fvdw = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
1022 velec = _mm_mul_pd(velec,sw);
1023 vvdw = _mm_mul_pd(vvdw,sw);
1024 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
1026 /* Update potential sum for this i atom from the interaction with this j atom. */
1027 velec = _mm_and_pd(velec,cutoff_mask);
1028 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1029 velecsum = _mm_add_pd(velecsum,velec);
1030 vvdw = _mm_and_pd(vvdw,cutoff_mask);
1031 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
1032 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
1034 fscal = _mm_add_pd(felec,fvdw);
1036 fscal = _mm_and_pd(fscal,cutoff_mask);
1038 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1040 /* Calculate temporary vectorial force */
1041 tx = _mm_mul_pd(fscal,dx00);
1042 ty = _mm_mul_pd(fscal,dy00);
1043 tz = _mm_mul_pd(fscal,dz00);
1045 /* Update vectorial force */
1046 fix0 = _mm_add_pd(fix0,tx);
1047 fiy0 = _mm_add_pd(fiy0,ty);
1048 fiz0 = _mm_add_pd(fiz0,tz);
1050 fjx0 = _mm_add_pd(fjx0,tx);
1051 fjy0 = _mm_add_pd(fjy0,ty);
1052 fjz0 = _mm_add_pd(fjz0,tz);
1056 /**************************
1057 * CALCULATE INTERACTIONS *
1058 **************************/
1060 if (gmx_mm_any_lt(rsq01,rcutoff2))
1063 r01 = _mm_mul_pd(rsq01,rinv01);
1065 /* EWALD ELECTROSTATICS */
1067 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1068 ewrt = _mm_mul_pd(r01,ewtabscale);
1069 ewitab = _mm_cvttpd_epi32(ewrt);
1070 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1071 ewitab = _mm_slli_epi32(ewitab,2);
1072 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1073 ewtabD = _mm_setzero_pd();
1074 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1075 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1076 ewtabFn = _mm_setzero_pd();
1077 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1078 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1079 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1080 velec = _mm_mul_pd(qq01,_mm_sub_pd(rinv01,velec));
1081 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
1083 d = _mm_sub_pd(r01,rswitch);
1084 d = _mm_max_pd(d,_mm_setzero_pd());
1085 d2 = _mm_mul_pd(d,d);
1086 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1088 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1090 /* Evaluate switch function */
1091 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1092 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv01,_mm_mul_pd(velec,dsw)) );
1093 velec = _mm_mul_pd(velec,sw);
1094 cutoff_mask = _mm_cmplt_pd(rsq01,rcutoff2);
1096 /* Update potential sum for this i atom from the interaction with this j atom. */
1097 velec = _mm_and_pd(velec,cutoff_mask);
1098 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1099 velecsum = _mm_add_pd(velecsum,velec);
1101 fscal = felec;
1103 fscal = _mm_and_pd(fscal,cutoff_mask);
1105 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1107 /* Calculate temporary vectorial force */
1108 tx = _mm_mul_pd(fscal,dx01);
1109 ty = _mm_mul_pd(fscal,dy01);
1110 tz = _mm_mul_pd(fscal,dz01);
1112 /* Update vectorial force */
1113 fix0 = _mm_add_pd(fix0,tx);
1114 fiy0 = _mm_add_pd(fiy0,ty);
1115 fiz0 = _mm_add_pd(fiz0,tz);
1117 fjx1 = _mm_add_pd(fjx1,tx);
1118 fjy1 = _mm_add_pd(fjy1,ty);
1119 fjz1 = _mm_add_pd(fjz1,tz);
1123 /**************************
1124 * CALCULATE INTERACTIONS *
1125 **************************/
1127 if (gmx_mm_any_lt(rsq02,rcutoff2))
1130 r02 = _mm_mul_pd(rsq02,rinv02);
1132 /* EWALD ELECTROSTATICS */
1134 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1135 ewrt = _mm_mul_pd(r02,ewtabscale);
1136 ewitab = _mm_cvttpd_epi32(ewrt);
1137 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1138 ewitab = _mm_slli_epi32(ewitab,2);
1139 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1140 ewtabD = _mm_setzero_pd();
1141 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1142 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1143 ewtabFn = _mm_setzero_pd();
1144 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1145 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1146 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1147 velec = _mm_mul_pd(qq02,_mm_sub_pd(rinv02,velec));
1148 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
1150 d = _mm_sub_pd(r02,rswitch);
1151 d = _mm_max_pd(d,_mm_setzero_pd());
1152 d2 = _mm_mul_pd(d,d);
1153 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1155 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1157 /* Evaluate switch function */
1158 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1159 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv02,_mm_mul_pd(velec,dsw)) );
1160 velec = _mm_mul_pd(velec,sw);
1161 cutoff_mask = _mm_cmplt_pd(rsq02,rcutoff2);
1163 /* Update potential sum for this i atom from the interaction with this j atom. */
1164 velec = _mm_and_pd(velec,cutoff_mask);
1165 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1166 velecsum = _mm_add_pd(velecsum,velec);
1168 fscal = felec;
1170 fscal = _mm_and_pd(fscal,cutoff_mask);
1172 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1174 /* Calculate temporary vectorial force */
1175 tx = _mm_mul_pd(fscal,dx02);
1176 ty = _mm_mul_pd(fscal,dy02);
1177 tz = _mm_mul_pd(fscal,dz02);
1179 /* Update vectorial force */
1180 fix0 = _mm_add_pd(fix0,tx);
1181 fiy0 = _mm_add_pd(fiy0,ty);
1182 fiz0 = _mm_add_pd(fiz0,tz);
1184 fjx2 = _mm_add_pd(fjx2,tx);
1185 fjy2 = _mm_add_pd(fjy2,ty);
1186 fjz2 = _mm_add_pd(fjz2,tz);
1190 /**************************
1191 * CALCULATE INTERACTIONS *
1192 **************************/
1194 if (gmx_mm_any_lt(rsq10,rcutoff2))
1197 r10 = _mm_mul_pd(rsq10,rinv10);
1199 /* EWALD ELECTROSTATICS */
1201 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1202 ewrt = _mm_mul_pd(r10,ewtabscale);
1203 ewitab = _mm_cvttpd_epi32(ewrt);
1204 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1205 ewitab = _mm_slli_epi32(ewitab,2);
1206 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1207 ewtabD = _mm_setzero_pd();
1208 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1209 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1210 ewtabFn = _mm_setzero_pd();
1211 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1212 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1213 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1214 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
1215 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
1217 d = _mm_sub_pd(r10,rswitch);
1218 d = _mm_max_pd(d,_mm_setzero_pd());
1219 d2 = _mm_mul_pd(d,d);
1220 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1222 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1224 /* Evaluate switch function */
1225 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1226 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
1227 velec = _mm_mul_pd(velec,sw);
1228 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
1230 /* Update potential sum for this i atom from the interaction with this j atom. */
1231 velec = _mm_and_pd(velec,cutoff_mask);
1232 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1233 velecsum = _mm_add_pd(velecsum,velec);
1235 fscal = felec;
1237 fscal = _mm_and_pd(fscal,cutoff_mask);
1239 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1241 /* Calculate temporary vectorial force */
1242 tx = _mm_mul_pd(fscal,dx10);
1243 ty = _mm_mul_pd(fscal,dy10);
1244 tz = _mm_mul_pd(fscal,dz10);
1246 /* Update vectorial force */
1247 fix1 = _mm_add_pd(fix1,tx);
1248 fiy1 = _mm_add_pd(fiy1,ty);
1249 fiz1 = _mm_add_pd(fiz1,tz);
1251 fjx0 = _mm_add_pd(fjx0,tx);
1252 fjy0 = _mm_add_pd(fjy0,ty);
1253 fjz0 = _mm_add_pd(fjz0,tz);
1257 /**************************
1258 * CALCULATE INTERACTIONS *
1259 **************************/
1261 if (gmx_mm_any_lt(rsq11,rcutoff2))
1264 r11 = _mm_mul_pd(rsq11,rinv11);
1266 /* EWALD ELECTROSTATICS */
1268 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1269 ewrt = _mm_mul_pd(r11,ewtabscale);
1270 ewitab = _mm_cvttpd_epi32(ewrt);
1271 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1272 ewitab = _mm_slli_epi32(ewitab,2);
1273 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1274 ewtabD = _mm_setzero_pd();
1275 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1276 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1277 ewtabFn = _mm_setzero_pd();
1278 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1279 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1280 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1281 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
1282 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1284 d = _mm_sub_pd(r11,rswitch);
1285 d = _mm_max_pd(d,_mm_setzero_pd());
1286 d2 = _mm_mul_pd(d,d);
1287 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1289 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1291 /* Evaluate switch function */
1292 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1293 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
1294 velec = _mm_mul_pd(velec,sw);
1295 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
1297 /* Update potential sum for this i atom from the interaction with this j atom. */
1298 velec = _mm_and_pd(velec,cutoff_mask);
1299 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1300 velecsum = _mm_add_pd(velecsum,velec);
1302 fscal = felec;
1304 fscal = _mm_and_pd(fscal,cutoff_mask);
1306 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1308 /* Calculate temporary vectorial force */
1309 tx = _mm_mul_pd(fscal,dx11);
1310 ty = _mm_mul_pd(fscal,dy11);
1311 tz = _mm_mul_pd(fscal,dz11);
1313 /* Update vectorial force */
1314 fix1 = _mm_add_pd(fix1,tx);
1315 fiy1 = _mm_add_pd(fiy1,ty);
1316 fiz1 = _mm_add_pd(fiz1,tz);
1318 fjx1 = _mm_add_pd(fjx1,tx);
1319 fjy1 = _mm_add_pd(fjy1,ty);
1320 fjz1 = _mm_add_pd(fjz1,tz);
1324 /**************************
1325 * CALCULATE INTERACTIONS *
1326 **************************/
1328 if (gmx_mm_any_lt(rsq12,rcutoff2))
1331 r12 = _mm_mul_pd(rsq12,rinv12);
1333 /* EWALD ELECTROSTATICS */
1335 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1336 ewrt = _mm_mul_pd(r12,ewtabscale);
1337 ewitab = _mm_cvttpd_epi32(ewrt);
1338 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1339 ewitab = _mm_slli_epi32(ewitab,2);
1340 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1341 ewtabD = _mm_setzero_pd();
1342 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1343 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1344 ewtabFn = _mm_setzero_pd();
1345 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1346 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1347 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1348 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
1349 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1351 d = _mm_sub_pd(r12,rswitch);
1352 d = _mm_max_pd(d,_mm_setzero_pd());
1353 d2 = _mm_mul_pd(d,d);
1354 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1356 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1358 /* Evaluate switch function */
1359 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1360 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
1361 velec = _mm_mul_pd(velec,sw);
1362 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
1364 /* Update potential sum for this i atom from the interaction with this j atom. */
1365 velec = _mm_and_pd(velec,cutoff_mask);
1366 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1367 velecsum = _mm_add_pd(velecsum,velec);
1369 fscal = felec;
1371 fscal = _mm_and_pd(fscal,cutoff_mask);
1373 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1375 /* Calculate temporary vectorial force */
1376 tx = _mm_mul_pd(fscal,dx12);
1377 ty = _mm_mul_pd(fscal,dy12);
1378 tz = _mm_mul_pd(fscal,dz12);
1380 /* Update vectorial force */
1381 fix1 = _mm_add_pd(fix1,tx);
1382 fiy1 = _mm_add_pd(fiy1,ty);
1383 fiz1 = _mm_add_pd(fiz1,tz);
1385 fjx2 = _mm_add_pd(fjx2,tx);
1386 fjy2 = _mm_add_pd(fjy2,ty);
1387 fjz2 = _mm_add_pd(fjz2,tz);
1391 /**************************
1392 * CALCULATE INTERACTIONS *
1393 **************************/
1395 if (gmx_mm_any_lt(rsq20,rcutoff2))
1398 r20 = _mm_mul_pd(rsq20,rinv20);
1400 /* EWALD ELECTROSTATICS */
1402 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1403 ewrt = _mm_mul_pd(r20,ewtabscale);
1404 ewitab = _mm_cvttpd_epi32(ewrt);
1405 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1406 ewitab = _mm_slli_epi32(ewitab,2);
1407 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1408 ewtabD = _mm_setzero_pd();
1409 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1410 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1411 ewtabFn = _mm_setzero_pd();
1412 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1413 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1414 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1415 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
1416 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
1418 d = _mm_sub_pd(r20,rswitch);
1419 d = _mm_max_pd(d,_mm_setzero_pd());
1420 d2 = _mm_mul_pd(d,d);
1421 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1423 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1425 /* Evaluate switch function */
1426 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1427 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
1428 velec = _mm_mul_pd(velec,sw);
1429 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
1431 /* Update potential sum for this i atom from the interaction with this j atom. */
1432 velec = _mm_and_pd(velec,cutoff_mask);
1433 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1434 velecsum = _mm_add_pd(velecsum,velec);
1436 fscal = felec;
1438 fscal = _mm_and_pd(fscal,cutoff_mask);
1440 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1442 /* Calculate temporary vectorial force */
1443 tx = _mm_mul_pd(fscal,dx20);
1444 ty = _mm_mul_pd(fscal,dy20);
1445 tz = _mm_mul_pd(fscal,dz20);
1447 /* Update vectorial force */
1448 fix2 = _mm_add_pd(fix2,tx);
1449 fiy2 = _mm_add_pd(fiy2,ty);
1450 fiz2 = _mm_add_pd(fiz2,tz);
1452 fjx0 = _mm_add_pd(fjx0,tx);
1453 fjy0 = _mm_add_pd(fjy0,ty);
1454 fjz0 = _mm_add_pd(fjz0,tz);
1458 /**************************
1459 * CALCULATE INTERACTIONS *
1460 **************************/
1462 if (gmx_mm_any_lt(rsq21,rcutoff2))
1465 r21 = _mm_mul_pd(rsq21,rinv21);
1467 /* EWALD ELECTROSTATICS */
1469 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1470 ewrt = _mm_mul_pd(r21,ewtabscale);
1471 ewitab = _mm_cvttpd_epi32(ewrt);
1472 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1473 ewitab = _mm_slli_epi32(ewitab,2);
1474 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1475 ewtabD = _mm_setzero_pd();
1476 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1477 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1478 ewtabFn = _mm_setzero_pd();
1479 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1480 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1481 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1482 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
1483 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1485 d = _mm_sub_pd(r21,rswitch);
1486 d = _mm_max_pd(d,_mm_setzero_pd());
1487 d2 = _mm_mul_pd(d,d);
1488 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1490 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1492 /* Evaluate switch function */
1493 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1494 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
1495 velec = _mm_mul_pd(velec,sw);
1496 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
1498 /* Update potential sum for this i atom from the interaction with this j atom. */
1499 velec = _mm_and_pd(velec,cutoff_mask);
1500 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1501 velecsum = _mm_add_pd(velecsum,velec);
1503 fscal = felec;
1505 fscal = _mm_and_pd(fscal,cutoff_mask);
1507 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1509 /* Calculate temporary vectorial force */
1510 tx = _mm_mul_pd(fscal,dx21);
1511 ty = _mm_mul_pd(fscal,dy21);
1512 tz = _mm_mul_pd(fscal,dz21);
1514 /* Update vectorial force */
1515 fix2 = _mm_add_pd(fix2,tx);
1516 fiy2 = _mm_add_pd(fiy2,ty);
1517 fiz2 = _mm_add_pd(fiz2,tz);
1519 fjx1 = _mm_add_pd(fjx1,tx);
1520 fjy1 = _mm_add_pd(fjy1,ty);
1521 fjz1 = _mm_add_pd(fjz1,tz);
1525 /**************************
1526 * CALCULATE INTERACTIONS *
1527 **************************/
1529 if (gmx_mm_any_lt(rsq22,rcutoff2))
1532 r22 = _mm_mul_pd(rsq22,rinv22);
1534 /* EWALD ELECTROSTATICS */
1536 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1537 ewrt = _mm_mul_pd(r22,ewtabscale);
1538 ewitab = _mm_cvttpd_epi32(ewrt);
1539 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1540 ewitab = _mm_slli_epi32(ewitab,2);
1541 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1542 ewtabD = _mm_setzero_pd();
1543 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1544 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1545 ewtabFn = _mm_setzero_pd();
1546 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1547 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1548 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1549 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
1550 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1552 d = _mm_sub_pd(r22,rswitch);
1553 d = _mm_max_pd(d,_mm_setzero_pd());
1554 d2 = _mm_mul_pd(d,d);
1555 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1557 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1559 /* Evaluate switch function */
1560 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1561 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
1562 velec = _mm_mul_pd(velec,sw);
1563 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
1565 /* Update potential sum for this i atom from the interaction with this j atom. */
1566 velec = _mm_and_pd(velec,cutoff_mask);
1567 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1568 velecsum = _mm_add_pd(velecsum,velec);
1570 fscal = felec;
1572 fscal = _mm_and_pd(fscal,cutoff_mask);
1574 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1576 /* Calculate temporary vectorial force */
1577 tx = _mm_mul_pd(fscal,dx22);
1578 ty = _mm_mul_pd(fscal,dy22);
1579 tz = _mm_mul_pd(fscal,dz22);
1581 /* Update vectorial force */
1582 fix2 = _mm_add_pd(fix2,tx);
1583 fiy2 = _mm_add_pd(fiy2,ty);
1584 fiz2 = _mm_add_pd(fiz2,tz);
1586 fjx2 = _mm_add_pd(fjx2,tx);
1587 fjy2 = _mm_add_pd(fjy2,ty);
1588 fjz2 = _mm_add_pd(fjz2,tz);
1592 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1594 /* Inner loop uses 603 flops */
1597 /* End of innermost loop */
1599 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1600 f+i_coord_offset,fshift+i_shift_offset);
1602 ggid = gid[iidx];
1603 /* Update potential energies */
1604 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1605 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1607 /* Increment number of inner iterations */
1608 inneriter += j_index_end - j_index_start;
1610 /* Outer loop uses 20 flops */
1613 /* Increment number of outer iterations */
1614 outeriter += nri;
1616 /* Update outer/inner flops */
1618 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*603);
1621 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse4_1_double
1622 * Electrostatics interaction: Ewald
1623 * VdW interaction: LennardJones
1624 * Geometry: Water3-Water3
1625 * Calculate force/pot: Force
1627 void
1628 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse4_1_double
1629 (t_nblist * gmx_restrict nlist,
1630 rvec * gmx_restrict xx,
1631 rvec * gmx_restrict ff,
1632 struct t_forcerec * gmx_restrict fr,
1633 t_mdatoms * gmx_restrict mdatoms,
1634 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1635 t_nrnb * gmx_restrict nrnb)
1637 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1638 * just 0 for non-waters.
1639 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1640 * jnr indices corresponding to data put in the four positions in the SIMD register.
1642 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1643 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1644 int jnrA,jnrB;
1645 int j_coord_offsetA,j_coord_offsetB;
1646 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1647 real rcutoff_scalar;
1648 real *shiftvec,*fshift,*x,*f;
1649 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1650 int vdwioffset0;
1651 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1652 int vdwioffset1;
1653 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1654 int vdwioffset2;
1655 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1656 int vdwjidx0A,vdwjidx0B;
1657 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1658 int vdwjidx1A,vdwjidx1B;
1659 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1660 int vdwjidx2A,vdwjidx2B;
1661 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1662 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1663 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1664 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1665 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1666 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1667 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1668 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1669 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1670 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1671 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1672 real *charge;
1673 int nvdwtype;
1674 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1675 int *vdwtype;
1676 real *vdwparam;
1677 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1678 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1679 __m128i ewitab;
1680 __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1681 real *ewtab;
1682 __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1683 real rswitch_scalar,d_scalar;
1684 __m128d dummy_mask,cutoff_mask;
1685 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1686 __m128d one = _mm_set1_pd(1.0);
1687 __m128d two = _mm_set1_pd(2.0);
1688 x = xx[0];
1689 f = ff[0];
1691 nri = nlist->nri;
1692 iinr = nlist->iinr;
1693 jindex = nlist->jindex;
1694 jjnr = nlist->jjnr;
1695 shiftidx = nlist->shift;
1696 gid = nlist->gid;
1697 shiftvec = fr->shift_vec[0];
1698 fshift = fr->fshift[0];
1699 facel = _mm_set1_pd(fr->ic->epsfac);
1700 charge = mdatoms->chargeA;
1701 nvdwtype = fr->ntype;
1702 vdwparam = fr->nbfp;
1703 vdwtype = mdatoms->typeA;
1705 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
1706 ewtab = fr->ic->tabq_coul_FDV0;
1707 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
1708 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
1710 /* Setup water-specific parameters */
1711 inr = nlist->iinr[0];
1712 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1713 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1714 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1715 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1717 jq0 = _mm_set1_pd(charge[inr+0]);
1718 jq1 = _mm_set1_pd(charge[inr+1]);
1719 jq2 = _mm_set1_pd(charge[inr+2]);
1720 vdwjidx0A = 2*vdwtype[inr+0];
1721 qq00 = _mm_mul_pd(iq0,jq0);
1722 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1723 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1724 qq01 = _mm_mul_pd(iq0,jq1);
1725 qq02 = _mm_mul_pd(iq0,jq2);
1726 qq10 = _mm_mul_pd(iq1,jq0);
1727 qq11 = _mm_mul_pd(iq1,jq1);
1728 qq12 = _mm_mul_pd(iq1,jq2);
1729 qq20 = _mm_mul_pd(iq2,jq0);
1730 qq21 = _mm_mul_pd(iq2,jq1);
1731 qq22 = _mm_mul_pd(iq2,jq2);
1733 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1734 rcutoff_scalar = fr->ic->rcoulomb;
1735 rcutoff = _mm_set1_pd(rcutoff_scalar);
1736 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
1738 rswitch_scalar = fr->ic->rcoulomb_switch;
1739 rswitch = _mm_set1_pd(rswitch_scalar);
1740 /* Setup switch parameters */
1741 d_scalar = rcutoff_scalar-rswitch_scalar;
1742 d = _mm_set1_pd(d_scalar);
1743 swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
1744 swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1745 swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1746 swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
1747 swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1748 swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1750 /* Avoid stupid compiler warnings */
1751 jnrA = jnrB = 0;
1752 j_coord_offsetA = 0;
1753 j_coord_offsetB = 0;
1755 outeriter = 0;
1756 inneriter = 0;
1758 /* Start outer loop over neighborlists */
1759 for(iidx=0; iidx<nri; iidx++)
1761 /* Load shift vector for this list */
1762 i_shift_offset = DIM*shiftidx[iidx];
1764 /* Load limits for loop over neighbors */
1765 j_index_start = jindex[iidx];
1766 j_index_end = jindex[iidx+1];
1768 /* Get outer coordinate index */
1769 inr = iinr[iidx];
1770 i_coord_offset = DIM*inr;
1772 /* Load i particle coords and add shift vector */
1773 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1774 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1776 fix0 = _mm_setzero_pd();
1777 fiy0 = _mm_setzero_pd();
1778 fiz0 = _mm_setzero_pd();
1779 fix1 = _mm_setzero_pd();
1780 fiy1 = _mm_setzero_pd();
1781 fiz1 = _mm_setzero_pd();
1782 fix2 = _mm_setzero_pd();
1783 fiy2 = _mm_setzero_pd();
1784 fiz2 = _mm_setzero_pd();
1786 /* Start inner kernel loop */
1787 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1790 /* Get j neighbor index, and coordinate index */
1791 jnrA = jjnr[jidx];
1792 jnrB = jjnr[jidx+1];
1793 j_coord_offsetA = DIM*jnrA;
1794 j_coord_offsetB = DIM*jnrB;
1796 /* load j atom coordinates */
1797 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1798 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1800 /* Calculate displacement vector */
1801 dx00 = _mm_sub_pd(ix0,jx0);
1802 dy00 = _mm_sub_pd(iy0,jy0);
1803 dz00 = _mm_sub_pd(iz0,jz0);
1804 dx01 = _mm_sub_pd(ix0,jx1);
1805 dy01 = _mm_sub_pd(iy0,jy1);
1806 dz01 = _mm_sub_pd(iz0,jz1);
1807 dx02 = _mm_sub_pd(ix0,jx2);
1808 dy02 = _mm_sub_pd(iy0,jy2);
1809 dz02 = _mm_sub_pd(iz0,jz2);
1810 dx10 = _mm_sub_pd(ix1,jx0);
1811 dy10 = _mm_sub_pd(iy1,jy0);
1812 dz10 = _mm_sub_pd(iz1,jz0);
1813 dx11 = _mm_sub_pd(ix1,jx1);
1814 dy11 = _mm_sub_pd(iy1,jy1);
1815 dz11 = _mm_sub_pd(iz1,jz1);
1816 dx12 = _mm_sub_pd(ix1,jx2);
1817 dy12 = _mm_sub_pd(iy1,jy2);
1818 dz12 = _mm_sub_pd(iz1,jz2);
1819 dx20 = _mm_sub_pd(ix2,jx0);
1820 dy20 = _mm_sub_pd(iy2,jy0);
1821 dz20 = _mm_sub_pd(iz2,jz0);
1822 dx21 = _mm_sub_pd(ix2,jx1);
1823 dy21 = _mm_sub_pd(iy2,jy1);
1824 dz21 = _mm_sub_pd(iz2,jz1);
1825 dx22 = _mm_sub_pd(ix2,jx2);
1826 dy22 = _mm_sub_pd(iy2,jy2);
1827 dz22 = _mm_sub_pd(iz2,jz2);
1829 /* Calculate squared distance and things based on it */
1830 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1831 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1832 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1833 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1834 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1835 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1836 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1837 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1838 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1840 rinv00 = sse41_invsqrt_d(rsq00);
1841 rinv01 = sse41_invsqrt_d(rsq01);
1842 rinv02 = sse41_invsqrt_d(rsq02);
1843 rinv10 = sse41_invsqrt_d(rsq10);
1844 rinv11 = sse41_invsqrt_d(rsq11);
1845 rinv12 = sse41_invsqrt_d(rsq12);
1846 rinv20 = sse41_invsqrt_d(rsq20);
1847 rinv21 = sse41_invsqrt_d(rsq21);
1848 rinv22 = sse41_invsqrt_d(rsq22);
1850 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1851 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
1852 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
1853 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
1854 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1855 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1856 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
1857 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1858 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1860 fjx0 = _mm_setzero_pd();
1861 fjy0 = _mm_setzero_pd();
1862 fjz0 = _mm_setzero_pd();
1863 fjx1 = _mm_setzero_pd();
1864 fjy1 = _mm_setzero_pd();
1865 fjz1 = _mm_setzero_pd();
1866 fjx2 = _mm_setzero_pd();
1867 fjy2 = _mm_setzero_pd();
1868 fjz2 = _mm_setzero_pd();
1870 /**************************
1871 * CALCULATE INTERACTIONS *
1872 **************************/
1874 if (gmx_mm_any_lt(rsq00,rcutoff2))
1877 r00 = _mm_mul_pd(rsq00,rinv00);
1879 /* EWALD ELECTROSTATICS */
1881 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1882 ewrt = _mm_mul_pd(r00,ewtabscale);
1883 ewitab = _mm_cvttpd_epi32(ewrt);
1884 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1885 ewitab = _mm_slli_epi32(ewitab,2);
1886 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1887 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1888 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1889 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1890 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
1891 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1892 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1893 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1894 velec = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
1895 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
1897 /* LENNARD-JONES DISPERSION/REPULSION */
1899 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1900 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
1901 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
1902 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
1903 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
1905 d = _mm_sub_pd(r00,rswitch);
1906 d = _mm_max_pd(d,_mm_setzero_pd());
1907 d2 = _mm_mul_pd(d,d);
1908 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1910 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1912 /* Evaluate switch function */
1913 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1914 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(velec,dsw)) );
1915 fvdw = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
1916 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
1918 fscal = _mm_add_pd(felec,fvdw);
1920 fscal = _mm_and_pd(fscal,cutoff_mask);
1922 /* Calculate temporary vectorial force */
1923 tx = _mm_mul_pd(fscal,dx00);
1924 ty = _mm_mul_pd(fscal,dy00);
1925 tz = _mm_mul_pd(fscal,dz00);
1927 /* Update vectorial force */
1928 fix0 = _mm_add_pd(fix0,tx);
1929 fiy0 = _mm_add_pd(fiy0,ty);
1930 fiz0 = _mm_add_pd(fiz0,tz);
1932 fjx0 = _mm_add_pd(fjx0,tx);
1933 fjy0 = _mm_add_pd(fjy0,ty);
1934 fjz0 = _mm_add_pd(fjz0,tz);
1938 /**************************
1939 * CALCULATE INTERACTIONS *
1940 **************************/
1942 if (gmx_mm_any_lt(rsq01,rcutoff2))
1945 r01 = _mm_mul_pd(rsq01,rinv01);
1947 /* EWALD ELECTROSTATICS */
1949 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1950 ewrt = _mm_mul_pd(r01,ewtabscale);
1951 ewitab = _mm_cvttpd_epi32(ewrt);
1952 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1953 ewitab = _mm_slli_epi32(ewitab,2);
1954 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1955 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1956 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1957 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1958 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
1959 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1960 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1961 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1962 velec = _mm_mul_pd(qq01,_mm_sub_pd(rinv01,velec));
1963 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
1965 d = _mm_sub_pd(r01,rswitch);
1966 d = _mm_max_pd(d,_mm_setzero_pd());
1967 d2 = _mm_mul_pd(d,d);
1968 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1970 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1972 /* Evaluate switch function */
1973 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1974 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv01,_mm_mul_pd(velec,dsw)) );
1975 cutoff_mask = _mm_cmplt_pd(rsq01,rcutoff2);
1977 fscal = felec;
1979 fscal = _mm_and_pd(fscal,cutoff_mask);
1981 /* Calculate temporary vectorial force */
1982 tx = _mm_mul_pd(fscal,dx01);
1983 ty = _mm_mul_pd(fscal,dy01);
1984 tz = _mm_mul_pd(fscal,dz01);
1986 /* Update vectorial force */
1987 fix0 = _mm_add_pd(fix0,tx);
1988 fiy0 = _mm_add_pd(fiy0,ty);
1989 fiz0 = _mm_add_pd(fiz0,tz);
1991 fjx1 = _mm_add_pd(fjx1,tx);
1992 fjy1 = _mm_add_pd(fjy1,ty);
1993 fjz1 = _mm_add_pd(fjz1,tz);
1997 /**************************
1998 * CALCULATE INTERACTIONS *
1999 **************************/
2001 if (gmx_mm_any_lt(rsq02,rcutoff2))
2004 r02 = _mm_mul_pd(rsq02,rinv02);
2006 /* EWALD ELECTROSTATICS */
2008 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2009 ewrt = _mm_mul_pd(r02,ewtabscale);
2010 ewitab = _mm_cvttpd_epi32(ewrt);
2011 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2012 ewitab = _mm_slli_epi32(ewitab,2);
2013 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2014 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2015 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2016 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2017 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2018 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2019 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2020 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2021 velec = _mm_mul_pd(qq02,_mm_sub_pd(rinv02,velec));
2022 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
2024 d = _mm_sub_pd(r02,rswitch);
2025 d = _mm_max_pd(d,_mm_setzero_pd());
2026 d2 = _mm_mul_pd(d,d);
2027 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2029 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2031 /* Evaluate switch function */
2032 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2033 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv02,_mm_mul_pd(velec,dsw)) );
2034 cutoff_mask = _mm_cmplt_pd(rsq02,rcutoff2);
2036 fscal = felec;
2038 fscal = _mm_and_pd(fscal,cutoff_mask);
2040 /* Calculate temporary vectorial force */
2041 tx = _mm_mul_pd(fscal,dx02);
2042 ty = _mm_mul_pd(fscal,dy02);
2043 tz = _mm_mul_pd(fscal,dz02);
2045 /* Update vectorial force */
2046 fix0 = _mm_add_pd(fix0,tx);
2047 fiy0 = _mm_add_pd(fiy0,ty);
2048 fiz0 = _mm_add_pd(fiz0,tz);
2050 fjx2 = _mm_add_pd(fjx2,tx);
2051 fjy2 = _mm_add_pd(fjy2,ty);
2052 fjz2 = _mm_add_pd(fjz2,tz);
2056 /**************************
2057 * CALCULATE INTERACTIONS *
2058 **************************/
2060 if (gmx_mm_any_lt(rsq10,rcutoff2))
2063 r10 = _mm_mul_pd(rsq10,rinv10);
2065 /* EWALD ELECTROSTATICS */
2067 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2068 ewrt = _mm_mul_pd(r10,ewtabscale);
2069 ewitab = _mm_cvttpd_epi32(ewrt);
2070 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2071 ewitab = _mm_slli_epi32(ewitab,2);
2072 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2073 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2074 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2075 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2076 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2077 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2078 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2079 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2080 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
2081 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
2083 d = _mm_sub_pd(r10,rswitch);
2084 d = _mm_max_pd(d,_mm_setzero_pd());
2085 d2 = _mm_mul_pd(d,d);
2086 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2088 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2090 /* Evaluate switch function */
2091 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2092 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
2093 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
2095 fscal = felec;
2097 fscal = _mm_and_pd(fscal,cutoff_mask);
2099 /* Calculate temporary vectorial force */
2100 tx = _mm_mul_pd(fscal,dx10);
2101 ty = _mm_mul_pd(fscal,dy10);
2102 tz = _mm_mul_pd(fscal,dz10);
2104 /* Update vectorial force */
2105 fix1 = _mm_add_pd(fix1,tx);
2106 fiy1 = _mm_add_pd(fiy1,ty);
2107 fiz1 = _mm_add_pd(fiz1,tz);
2109 fjx0 = _mm_add_pd(fjx0,tx);
2110 fjy0 = _mm_add_pd(fjy0,ty);
2111 fjz0 = _mm_add_pd(fjz0,tz);
2115 /**************************
2116 * CALCULATE INTERACTIONS *
2117 **************************/
2119 if (gmx_mm_any_lt(rsq11,rcutoff2))
2122 r11 = _mm_mul_pd(rsq11,rinv11);
2124 /* EWALD ELECTROSTATICS */
2126 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2127 ewrt = _mm_mul_pd(r11,ewtabscale);
2128 ewitab = _mm_cvttpd_epi32(ewrt);
2129 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2130 ewitab = _mm_slli_epi32(ewitab,2);
2131 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2132 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2133 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2134 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2135 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2136 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2137 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2138 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2139 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
2140 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2142 d = _mm_sub_pd(r11,rswitch);
2143 d = _mm_max_pd(d,_mm_setzero_pd());
2144 d2 = _mm_mul_pd(d,d);
2145 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2147 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2149 /* Evaluate switch function */
2150 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2151 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
2152 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
2154 fscal = felec;
2156 fscal = _mm_and_pd(fscal,cutoff_mask);
2158 /* Calculate temporary vectorial force */
2159 tx = _mm_mul_pd(fscal,dx11);
2160 ty = _mm_mul_pd(fscal,dy11);
2161 tz = _mm_mul_pd(fscal,dz11);
2163 /* Update vectorial force */
2164 fix1 = _mm_add_pd(fix1,tx);
2165 fiy1 = _mm_add_pd(fiy1,ty);
2166 fiz1 = _mm_add_pd(fiz1,tz);
2168 fjx1 = _mm_add_pd(fjx1,tx);
2169 fjy1 = _mm_add_pd(fjy1,ty);
2170 fjz1 = _mm_add_pd(fjz1,tz);
2174 /**************************
2175 * CALCULATE INTERACTIONS *
2176 **************************/
2178 if (gmx_mm_any_lt(rsq12,rcutoff2))
2181 r12 = _mm_mul_pd(rsq12,rinv12);
2183 /* EWALD ELECTROSTATICS */
2185 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2186 ewrt = _mm_mul_pd(r12,ewtabscale);
2187 ewitab = _mm_cvttpd_epi32(ewrt);
2188 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2189 ewitab = _mm_slli_epi32(ewitab,2);
2190 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2191 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2192 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2193 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2194 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2195 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2196 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2197 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2198 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
2199 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2201 d = _mm_sub_pd(r12,rswitch);
2202 d = _mm_max_pd(d,_mm_setzero_pd());
2203 d2 = _mm_mul_pd(d,d);
2204 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2206 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2208 /* Evaluate switch function */
2209 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2210 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
2211 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
2213 fscal = felec;
2215 fscal = _mm_and_pd(fscal,cutoff_mask);
2217 /* Calculate temporary vectorial force */
2218 tx = _mm_mul_pd(fscal,dx12);
2219 ty = _mm_mul_pd(fscal,dy12);
2220 tz = _mm_mul_pd(fscal,dz12);
2222 /* Update vectorial force */
2223 fix1 = _mm_add_pd(fix1,tx);
2224 fiy1 = _mm_add_pd(fiy1,ty);
2225 fiz1 = _mm_add_pd(fiz1,tz);
2227 fjx2 = _mm_add_pd(fjx2,tx);
2228 fjy2 = _mm_add_pd(fjy2,ty);
2229 fjz2 = _mm_add_pd(fjz2,tz);
2233 /**************************
2234 * CALCULATE INTERACTIONS *
2235 **************************/
2237 if (gmx_mm_any_lt(rsq20,rcutoff2))
2240 r20 = _mm_mul_pd(rsq20,rinv20);
2242 /* EWALD ELECTROSTATICS */
2244 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2245 ewrt = _mm_mul_pd(r20,ewtabscale);
2246 ewitab = _mm_cvttpd_epi32(ewrt);
2247 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2248 ewitab = _mm_slli_epi32(ewitab,2);
2249 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2250 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2251 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2252 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2253 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2254 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2255 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2256 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2257 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
2258 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
2260 d = _mm_sub_pd(r20,rswitch);
2261 d = _mm_max_pd(d,_mm_setzero_pd());
2262 d2 = _mm_mul_pd(d,d);
2263 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2265 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2267 /* Evaluate switch function */
2268 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2269 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
2270 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
2272 fscal = felec;
2274 fscal = _mm_and_pd(fscal,cutoff_mask);
2276 /* Calculate temporary vectorial force */
2277 tx = _mm_mul_pd(fscal,dx20);
2278 ty = _mm_mul_pd(fscal,dy20);
2279 tz = _mm_mul_pd(fscal,dz20);
2281 /* Update vectorial force */
2282 fix2 = _mm_add_pd(fix2,tx);
2283 fiy2 = _mm_add_pd(fiy2,ty);
2284 fiz2 = _mm_add_pd(fiz2,tz);
2286 fjx0 = _mm_add_pd(fjx0,tx);
2287 fjy0 = _mm_add_pd(fjy0,ty);
2288 fjz0 = _mm_add_pd(fjz0,tz);
2292 /**************************
2293 * CALCULATE INTERACTIONS *
2294 **************************/
2296 if (gmx_mm_any_lt(rsq21,rcutoff2))
2299 r21 = _mm_mul_pd(rsq21,rinv21);
2301 /* EWALD ELECTROSTATICS */
2303 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2304 ewrt = _mm_mul_pd(r21,ewtabscale);
2305 ewitab = _mm_cvttpd_epi32(ewrt);
2306 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2307 ewitab = _mm_slli_epi32(ewitab,2);
2308 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2309 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2310 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2311 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2312 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2313 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2314 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2315 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2316 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
2317 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2319 d = _mm_sub_pd(r21,rswitch);
2320 d = _mm_max_pd(d,_mm_setzero_pd());
2321 d2 = _mm_mul_pd(d,d);
2322 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2324 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2326 /* Evaluate switch function */
2327 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2328 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
2329 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
2331 fscal = felec;
2333 fscal = _mm_and_pd(fscal,cutoff_mask);
2335 /* Calculate temporary vectorial force */
2336 tx = _mm_mul_pd(fscal,dx21);
2337 ty = _mm_mul_pd(fscal,dy21);
2338 tz = _mm_mul_pd(fscal,dz21);
2340 /* Update vectorial force */
2341 fix2 = _mm_add_pd(fix2,tx);
2342 fiy2 = _mm_add_pd(fiy2,ty);
2343 fiz2 = _mm_add_pd(fiz2,tz);
2345 fjx1 = _mm_add_pd(fjx1,tx);
2346 fjy1 = _mm_add_pd(fjy1,ty);
2347 fjz1 = _mm_add_pd(fjz1,tz);
2351 /**************************
2352 * CALCULATE INTERACTIONS *
2353 **************************/
2355 if (gmx_mm_any_lt(rsq22,rcutoff2))
2358 r22 = _mm_mul_pd(rsq22,rinv22);
2360 /* EWALD ELECTROSTATICS */
2362 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2363 ewrt = _mm_mul_pd(r22,ewtabscale);
2364 ewitab = _mm_cvttpd_epi32(ewrt);
2365 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2366 ewitab = _mm_slli_epi32(ewitab,2);
2367 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2368 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2369 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2370 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2371 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2372 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2373 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2374 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2375 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
2376 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2378 d = _mm_sub_pd(r22,rswitch);
2379 d = _mm_max_pd(d,_mm_setzero_pd());
2380 d2 = _mm_mul_pd(d,d);
2381 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2383 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2385 /* Evaluate switch function */
2386 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2387 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
2388 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
2390 fscal = felec;
2392 fscal = _mm_and_pd(fscal,cutoff_mask);
2394 /* Calculate temporary vectorial force */
2395 tx = _mm_mul_pd(fscal,dx22);
2396 ty = _mm_mul_pd(fscal,dy22);
2397 tz = _mm_mul_pd(fscal,dz22);
2399 /* Update vectorial force */
2400 fix2 = _mm_add_pd(fix2,tx);
2401 fiy2 = _mm_add_pd(fiy2,ty);
2402 fiz2 = _mm_add_pd(fiz2,tz);
2404 fjx2 = _mm_add_pd(fjx2,tx);
2405 fjy2 = _mm_add_pd(fjy2,ty);
2406 fjz2 = _mm_add_pd(fjz2,tz);
2410 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2412 /* Inner loop uses 573 flops */
2415 if(jidx<j_index_end)
2418 jnrA = jjnr[jidx];
2419 j_coord_offsetA = DIM*jnrA;
2421 /* load j atom coordinates */
2422 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
2423 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2425 /* Calculate displacement vector */
2426 dx00 = _mm_sub_pd(ix0,jx0);
2427 dy00 = _mm_sub_pd(iy0,jy0);
2428 dz00 = _mm_sub_pd(iz0,jz0);
2429 dx01 = _mm_sub_pd(ix0,jx1);
2430 dy01 = _mm_sub_pd(iy0,jy1);
2431 dz01 = _mm_sub_pd(iz0,jz1);
2432 dx02 = _mm_sub_pd(ix0,jx2);
2433 dy02 = _mm_sub_pd(iy0,jy2);
2434 dz02 = _mm_sub_pd(iz0,jz2);
2435 dx10 = _mm_sub_pd(ix1,jx0);
2436 dy10 = _mm_sub_pd(iy1,jy0);
2437 dz10 = _mm_sub_pd(iz1,jz0);
2438 dx11 = _mm_sub_pd(ix1,jx1);
2439 dy11 = _mm_sub_pd(iy1,jy1);
2440 dz11 = _mm_sub_pd(iz1,jz1);
2441 dx12 = _mm_sub_pd(ix1,jx2);
2442 dy12 = _mm_sub_pd(iy1,jy2);
2443 dz12 = _mm_sub_pd(iz1,jz2);
2444 dx20 = _mm_sub_pd(ix2,jx0);
2445 dy20 = _mm_sub_pd(iy2,jy0);
2446 dz20 = _mm_sub_pd(iz2,jz0);
2447 dx21 = _mm_sub_pd(ix2,jx1);
2448 dy21 = _mm_sub_pd(iy2,jy1);
2449 dz21 = _mm_sub_pd(iz2,jz1);
2450 dx22 = _mm_sub_pd(ix2,jx2);
2451 dy22 = _mm_sub_pd(iy2,jy2);
2452 dz22 = _mm_sub_pd(iz2,jz2);
2454 /* Calculate squared distance and things based on it */
2455 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
2456 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
2457 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
2458 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
2459 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
2460 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
2461 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
2462 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
2463 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
2465 rinv00 = sse41_invsqrt_d(rsq00);
2466 rinv01 = sse41_invsqrt_d(rsq01);
2467 rinv02 = sse41_invsqrt_d(rsq02);
2468 rinv10 = sse41_invsqrt_d(rsq10);
2469 rinv11 = sse41_invsqrt_d(rsq11);
2470 rinv12 = sse41_invsqrt_d(rsq12);
2471 rinv20 = sse41_invsqrt_d(rsq20);
2472 rinv21 = sse41_invsqrt_d(rsq21);
2473 rinv22 = sse41_invsqrt_d(rsq22);
2475 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
2476 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
2477 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
2478 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
2479 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
2480 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
2481 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
2482 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
2483 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
2485 fjx0 = _mm_setzero_pd();
2486 fjy0 = _mm_setzero_pd();
2487 fjz0 = _mm_setzero_pd();
2488 fjx1 = _mm_setzero_pd();
2489 fjy1 = _mm_setzero_pd();
2490 fjz1 = _mm_setzero_pd();
2491 fjx2 = _mm_setzero_pd();
2492 fjy2 = _mm_setzero_pd();
2493 fjz2 = _mm_setzero_pd();
2495 /**************************
2496 * CALCULATE INTERACTIONS *
2497 **************************/
2499 if (gmx_mm_any_lt(rsq00,rcutoff2))
2502 r00 = _mm_mul_pd(rsq00,rinv00);
2504 /* EWALD ELECTROSTATICS */
2506 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2507 ewrt = _mm_mul_pd(r00,ewtabscale);
2508 ewitab = _mm_cvttpd_epi32(ewrt);
2509 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2510 ewitab = _mm_slli_epi32(ewitab,2);
2511 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2512 ewtabD = _mm_setzero_pd();
2513 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2514 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2515 ewtabFn = _mm_setzero_pd();
2516 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2517 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2518 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2519 velec = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
2520 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
2522 /* LENNARD-JONES DISPERSION/REPULSION */
2524 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2525 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
2526 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
2527 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
2528 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
2530 d = _mm_sub_pd(r00,rswitch);
2531 d = _mm_max_pd(d,_mm_setzero_pd());
2532 d2 = _mm_mul_pd(d,d);
2533 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2535 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2537 /* Evaluate switch function */
2538 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2539 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(velec,dsw)) );
2540 fvdw = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
2541 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
2543 fscal = _mm_add_pd(felec,fvdw);
2545 fscal = _mm_and_pd(fscal,cutoff_mask);
2547 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2549 /* Calculate temporary vectorial force */
2550 tx = _mm_mul_pd(fscal,dx00);
2551 ty = _mm_mul_pd(fscal,dy00);
2552 tz = _mm_mul_pd(fscal,dz00);
2554 /* Update vectorial force */
2555 fix0 = _mm_add_pd(fix0,tx);
2556 fiy0 = _mm_add_pd(fiy0,ty);
2557 fiz0 = _mm_add_pd(fiz0,tz);
2559 fjx0 = _mm_add_pd(fjx0,tx);
2560 fjy0 = _mm_add_pd(fjy0,ty);
2561 fjz0 = _mm_add_pd(fjz0,tz);
2565 /**************************
2566 * CALCULATE INTERACTIONS *
2567 **************************/
2569 if (gmx_mm_any_lt(rsq01,rcutoff2))
2572 r01 = _mm_mul_pd(rsq01,rinv01);
2574 /* EWALD ELECTROSTATICS */
2576 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2577 ewrt = _mm_mul_pd(r01,ewtabscale);
2578 ewitab = _mm_cvttpd_epi32(ewrt);
2579 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2580 ewitab = _mm_slli_epi32(ewitab,2);
2581 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2582 ewtabD = _mm_setzero_pd();
2583 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2584 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2585 ewtabFn = _mm_setzero_pd();
2586 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2587 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2588 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2589 velec = _mm_mul_pd(qq01,_mm_sub_pd(rinv01,velec));
2590 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
2592 d = _mm_sub_pd(r01,rswitch);
2593 d = _mm_max_pd(d,_mm_setzero_pd());
2594 d2 = _mm_mul_pd(d,d);
2595 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2597 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2599 /* Evaluate switch function */
2600 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2601 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv01,_mm_mul_pd(velec,dsw)) );
2602 cutoff_mask = _mm_cmplt_pd(rsq01,rcutoff2);
2604 fscal = felec;
2606 fscal = _mm_and_pd(fscal,cutoff_mask);
2608 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2610 /* Calculate temporary vectorial force */
2611 tx = _mm_mul_pd(fscal,dx01);
2612 ty = _mm_mul_pd(fscal,dy01);
2613 tz = _mm_mul_pd(fscal,dz01);
2615 /* Update vectorial force */
2616 fix0 = _mm_add_pd(fix0,tx);
2617 fiy0 = _mm_add_pd(fiy0,ty);
2618 fiz0 = _mm_add_pd(fiz0,tz);
2620 fjx1 = _mm_add_pd(fjx1,tx);
2621 fjy1 = _mm_add_pd(fjy1,ty);
2622 fjz1 = _mm_add_pd(fjz1,tz);
2626 /**************************
2627 * CALCULATE INTERACTIONS *
2628 **************************/
2630 if (gmx_mm_any_lt(rsq02,rcutoff2))
2633 r02 = _mm_mul_pd(rsq02,rinv02);
2635 /* EWALD ELECTROSTATICS */
2637 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2638 ewrt = _mm_mul_pd(r02,ewtabscale);
2639 ewitab = _mm_cvttpd_epi32(ewrt);
2640 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2641 ewitab = _mm_slli_epi32(ewitab,2);
2642 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2643 ewtabD = _mm_setzero_pd();
2644 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2645 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2646 ewtabFn = _mm_setzero_pd();
2647 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2648 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2649 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2650 velec = _mm_mul_pd(qq02,_mm_sub_pd(rinv02,velec));
2651 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
2653 d = _mm_sub_pd(r02,rswitch);
2654 d = _mm_max_pd(d,_mm_setzero_pd());
2655 d2 = _mm_mul_pd(d,d);
2656 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2658 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2660 /* Evaluate switch function */
2661 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2662 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv02,_mm_mul_pd(velec,dsw)) );
2663 cutoff_mask = _mm_cmplt_pd(rsq02,rcutoff2);
2665 fscal = felec;
2667 fscal = _mm_and_pd(fscal,cutoff_mask);
2669 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2671 /* Calculate temporary vectorial force */
2672 tx = _mm_mul_pd(fscal,dx02);
2673 ty = _mm_mul_pd(fscal,dy02);
2674 tz = _mm_mul_pd(fscal,dz02);
2676 /* Update vectorial force */
2677 fix0 = _mm_add_pd(fix0,tx);
2678 fiy0 = _mm_add_pd(fiy0,ty);
2679 fiz0 = _mm_add_pd(fiz0,tz);
2681 fjx2 = _mm_add_pd(fjx2,tx);
2682 fjy2 = _mm_add_pd(fjy2,ty);
2683 fjz2 = _mm_add_pd(fjz2,tz);
2687 /**************************
2688 * CALCULATE INTERACTIONS *
2689 **************************/
2691 if (gmx_mm_any_lt(rsq10,rcutoff2))
2694 r10 = _mm_mul_pd(rsq10,rinv10);
2696 /* EWALD ELECTROSTATICS */
2698 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2699 ewrt = _mm_mul_pd(r10,ewtabscale);
2700 ewitab = _mm_cvttpd_epi32(ewrt);
2701 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2702 ewitab = _mm_slli_epi32(ewitab,2);
2703 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2704 ewtabD = _mm_setzero_pd();
2705 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2706 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2707 ewtabFn = _mm_setzero_pd();
2708 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2709 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2710 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2711 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
2712 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
2714 d = _mm_sub_pd(r10,rswitch);
2715 d = _mm_max_pd(d,_mm_setzero_pd());
2716 d2 = _mm_mul_pd(d,d);
2717 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2719 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2721 /* Evaluate switch function */
2722 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2723 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
2724 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
2726 fscal = felec;
2728 fscal = _mm_and_pd(fscal,cutoff_mask);
2730 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2732 /* Calculate temporary vectorial force */
2733 tx = _mm_mul_pd(fscal,dx10);
2734 ty = _mm_mul_pd(fscal,dy10);
2735 tz = _mm_mul_pd(fscal,dz10);
2737 /* Update vectorial force */
2738 fix1 = _mm_add_pd(fix1,tx);
2739 fiy1 = _mm_add_pd(fiy1,ty);
2740 fiz1 = _mm_add_pd(fiz1,tz);
2742 fjx0 = _mm_add_pd(fjx0,tx);
2743 fjy0 = _mm_add_pd(fjy0,ty);
2744 fjz0 = _mm_add_pd(fjz0,tz);
2748 /**************************
2749 * CALCULATE INTERACTIONS *
2750 **************************/
2752 if (gmx_mm_any_lt(rsq11,rcutoff2))
2755 r11 = _mm_mul_pd(rsq11,rinv11);
2757 /* EWALD ELECTROSTATICS */
2759 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2760 ewrt = _mm_mul_pd(r11,ewtabscale);
2761 ewitab = _mm_cvttpd_epi32(ewrt);
2762 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2763 ewitab = _mm_slli_epi32(ewitab,2);
2764 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2765 ewtabD = _mm_setzero_pd();
2766 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2767 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2768 ewtabFn = _mm_setzero_pd();
2769 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2770 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2771 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2772 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
2773 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2775 d = _mm_sub_pd(r11,rswitch);
2776 d = _mm_max_pd(d,_mm_setzero_pd());
2777 d2 = _mm_mul_pd(d,d);
2778 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2780 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2782 /* Evaluate switch function */
2783 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2784 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
2785 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
2787 fscal = felec;
2789 fscal = _mm_and_pd(fscal,cutoff_mask);
2791 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2793 /* Calculate temporary vectorial force */
2794 tx = _mm_mul_pd(fscal,dx11);
2795 ty = _mm_mul_pd(fscal,dy11);
2796 tz = _mm_mul_pd(fscal,dz11);
2798 /* Update vectorial force */
2799 fix1 = _mm_add_pd(fix1,tx);
2800 fiy1 = _mm_add_pd(fiy1,ty);
2801 fiz1 = _mm_add_pd(fiz1,tz);
2803 fjx1 = _mm_add_pd(fjx1,tx);
2804 fjy1 = _mm_add_pd(fjy1,ty);
2805 fjz1 = _mm_add_pd(fjz1,tz);
2809 /**************************
2810 * CALCULATE INTERACTIONS *
2811 **************************/
2813 if (gmx_mm_any_lt(rsq12,rcutoff2))
2816 r12 = _mm_mul_pd(rsq12,rinv12);
2818 /* EWALD ELECTROSTATICS */
2820 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2821 ewrt = _mm_mul_pd(r12,ewtabscale);
2822 ewitab = _mm_cvttpd_epi32(ewrt);
2823 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2824 ewitab = _mm_slli_epi32(ewitab,2);
2825 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2826 ewtabD = _mm_setzero_pd();
2827 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2828 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2829 ewtabFn = _mm_setzero_pd();
2830 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2831 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2832 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2833 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
2834 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2836 d = _mm_sub_pd(r12,rswitch);
2837 d = _mm_max_pd(d,_mm_setzero_pd());
2838 d2 = _mm_mul_pd(d,d);
2839 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2841 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2843 /* Evaluate switch function */
2844 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2845 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
2846 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
2848 fscal = felec;
2850 fscal = _mm_and_pd(fscal,cutoff_mask);
2852 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2854 /* Calculate temporary vectorial force */
2855 tx = _mm_mul_pd(fscal,dx12);
2856 ty = _mm_mul_pd(fscal,dy12);
2857 tz = _mm_mul_pd(fscal,dz12);
2859 /* Update vectorial force */
2860 fix1 = _mm_add_pd(fix1,tx);
2861 fiy1 = _mm_add_pd(fiy1,ty);
2862 fiz1 = _mm_add_pd(fiz1,tz);
2864 fjx2 = _mm_add_pd(fjx2,tx);
2865 fjy2 = _mm_add_pd(fjy2,ty);
2866 fjz2 = _mm_add_pd(fjz2,tz);
2870 /**************************
2871 * CALCULATE INTERACTIONS *
2872 **************************/
2874 if (gmx_mm_any_lt(rsq20,rcutoff2))
2877 r20 = _mm_mul_pd(rsq20,rinv20);
2879 /* EWALD ELECTROSTATICS */
2881 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2882 ewrt = _mm_mul_pd(r20,ewtabscale);
2883 ewitab = _mm_cvttpd_epi32(ewrt);
2884 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2885 ewitab = _mm_slli_epi32(ewitab,2);
2886 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2887 ewtabD = _mm_setzero_pd();
2888 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2889 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2890 ewtabFn = _mm_setzero_pd();
2891 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2892 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2893 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2894 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
2895 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
2897 d = _mm_sub_pd(r20,rswitch);
2898 d = _mm_max_pd(d,_mm_setzero_pd());
2899 d2 = _mm_mul_pd(d,d);
2900 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2902 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2904 /* Evaluate switch function */
2905 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2906 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
2907 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
2909 fscal = felec;
2911 fscal = _mm_and_pd(fscal,cutoff_mask);
2913 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2915 /* Calculate temporary vectorial force */
2916 tx = _mm_mul_pd(fscal,dx20);
2917 ty = _mm_mul_pd(fscal,dy20);
2918 tz = _mm_mul_pd(fscal,dz20);
2920 /* Update vectorial force */
2921 fix2 = _mm_add_pd(fix2,tx);
2922 fiy2 = _mm_add_pd(fiy2,ty);
2923 fiz2 = _mm_add_pd(fiz2,tz);
2925 fjx0 = _mm_add_pd(fjx0,tx);
2926 fjy0 = _mm_add_pd(fjy0,ty);
2927 fjz0 = _mm_add_pd(fjz0,tz);
2931 /**************************
2932 * CALCULATE INTERACTIONS *
2933 **************************/
2935 if (gmx_mm_any_lt(rsq21,rcutoff2))
2938 r21 = _mm_mul_pd(rsq21,rinv21);
2940 /* EWALD ELECTROSTATICS */
2942 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2943 ewrt = _mm_mul_pd(r21,ewtabscale);
2944 ewitab = _mm_cvttpd_epi32(ewrt);
2945 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2946 ewitab = _mm_slli_epi32(ewitab,2);
2947 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2948 ewtabD = _mm_setzero_pd();
2949 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2950 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2951 ewtabFn = _mm_setzero_pd();
2952 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2953 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2954 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2955 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
2956 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2958 d = _mm_sub_pd(r21,rswitch);
2959 d = _mm_max_pd(d,_mm_setzero_pd());
2960 d2 = _mm_mul_pd(d,d);
2961 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2963 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2965 /* Evaluate switch function */
2966 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2967 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
2968 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
2970 fscal = felec;
2972 fscal = _mm_and_pd(fscal,cutoff_mask);
2974 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2976 /* Calculate temporary vectorial force */
2977 tx = _mm_mul_pd(fscal,dx21);
2978 ty = _mm_mul_pd(fscal,dy21);
2979 tz = _mm_mul_pd(fscal,dz21);
2981 /* Update vectorial force */
2982 fix2 = _mm_add_pd(fix2,tx);
2983 fiy2 = _mm_add_pd(fiy2,ty);
2984 fiz2 = _mm_add_pd(fiz2,tz);
2986 fjx1 = _mm_add_pd(fjx1,tx);
2987 fjy1 = _mm_add_pd(fjy1,ty);
2988 fjz1 = _mm_add_pd(fjz1,tz);
2992 /**************************
2993 * CALCULATE INTERACTIONS *
2994 **************************/
2996 if (gmx_mm_any_lt(rsq22,rcutoff2))
2999 r22 = _mm_mul_pd(rsq22,rinv22);
3001 /* EWALD ELECTROSTATICS */
3003 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3004 ewrt = _mm_mul_pd(r22,ewtabscale);
3005 ewitab = _mm_cvttpd_epi32(ewrt);
3006 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
3007 ewitab = _mm_slli_epi32(ewitab,2);
3008 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3009 ewtabD = _mm_setzero_pd();
3010 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
3011 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
3012 ewtabFn = _mm_setzero_pd();
3013 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
3014 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
3015 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
3016 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
3017 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
3019 d = _mm_sub_pd(r22,rswitch);
3020 d = _mm_max_pd(d,_mm_setzero_pd());
3021 d2 = _mm_mul_pd(d,d);
3022 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
3024 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
3026 /* Evaluate switch function */
3027 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3028 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
3029 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
3031 fscal = felec;
3033 fscal = _mm_and_pd(fscal,cutoff_mask);
3035 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3037 /* Calculate temporary vectorial force */
3038 tx = _mm_mul_pd(fscal,dx22);
3039 ty = _mm_mul_pd(fscal,dy22);
3040 tz = _mm_mul_pd(fscal,dz22);
3042 /* Update vectorial force */
3043 fix2 = _mm_add_pd(fix2,tx);
3044 fiy2 = _mm_add_pd(fiy2,ty);
3045 fiz2 = _mm_add_pd(fiz2,tz);
3047 fjx2 = _mm_add_pd(fjx2,tx);
3048 fjy2 = _mm_add_pd(fjy2,ty);
3049 fjz2 = _mm_add_pd(fjz2,tz);
3053 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
3055 /* Inner loop uses 573 flops */
3058 /* End of innermost loop */
3060 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
3061 f+i_coord_offset,fshift+i_shift_offset);
3063 /* Increment number of inner iterations */
3064 inneriter += j_index_end - j_index_start;
3066 /* Outer loop uses 18 flops */
3069 /* Increment number of outer iterations */
3070 outeriter += nri;
3072 /* Update outer/inner flops */
3074 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*573);