Removed simple.h from nb_kernel_sse4_1_XX
[gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse4_1_double / nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_sse4_1_double.c
blobd5f0b6059e5db916d28e1baaa820d2d2ced0fc21
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse4_1_double kernel generator.
38 #include "gmxpre.h"
40 #include "config.h"
42 #include <math.h>
44 #include "../nb_kernel.h"
45 #include "gromacs/math/vec.h"
46 #include "gromacs/legacyheaders/nrnb.h"
48 #include "gromacs/simd/math_x86_sse4_1_double.h"
49 #include "kernelutil_x86_sse4_1_double.h"
52 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse4_1_double
53 * Electrostatics interaction: Ewald
54 * VdW interaction: LennardJones
55 * Geometry: Water3-Water3
56 * Calculate force/pot: PotentialAndForce
58 void
59 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse4_1_double
60 (t_nblist * gmx_restrict nlist,
61 rvec * gmx_restrict xx,
62 rvec * gmx_restrict ff,
63 t_forcerec * gmx_restrict fr,
64 t_mdatoms * gmx_restrict mdatoms,
65 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
66 t_nrnb * gmx_restrict nrnb)
68 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
69 * just 0 for non-waters.
70 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
71 * jnr indices corresponding to data put in the four positions in the SIMD register.
73 int i_shift_offset,i_coord_offset,outeriter,inneriter;
74 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
75 int jnrA,jnrB;
76 int j_coord_offsetA,j_coord_offsetB;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
78 real rcutoff_scalar;
79 real *shiftvec,*fshift,*x,*f;
80 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
81 int vdwioffset0;
82 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
83 int vdwioffset1;
84 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
85 int vdwioffset2;
86 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
87 int vdwjidx0A,vdwjidx0B;
88 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
89 int vdwjidx1A,vdwjidx1B;
90 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
91 int vdwjidx2A,vdwjidx2B;
92 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
93 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
94 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
95 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
96 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
97 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
98 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
99 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
100 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
101 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
102 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
103 real *charge;
104 int nvdwtype;
105 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
106 int *vdwtype;
107 real *vdwparam;
108 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
109 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
110 __m128i ewitab;
111 __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
112 real *ewtab;
113 __m128d dummy_mask,cutoff_mask;
114 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
115 __m128d one = _mm_set1_pd(1.0);
116 __m128d two = _mm_set1_pd(2.0);
117 x = xx[0];
118 f = ff[0];
120 nri = nlist->nri;
121 iinr = nlist->iinr;
122 jindex = nlist->jindex;
123 jjnr = nlist->jjnr;
124 shiftidx = nlist->shift;
125 gid = nlist->gid;
126 shiftvec = fr->shift_vec[0];
127 fshift = fr->fshift[0];
128 facel = _mm_set1_pd(fr->epsfac);
129 charge = mdatoms->chargeA;
130 nvdwtype = fr->ntype;
131 vdwparam = fr->nbfp;
132 vdwtype = mdatoms->typeA;
134 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
135 ewtab = fr->ic->tabq_coul_FDV0;
136 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
137 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
139 /* Setup water-specific parameters */
140 inr = nlist->iinr[0];
141 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
142 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
143 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
144 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
146 jq0 = _mm_set1_pd(charge[inr+0]);
147 jq1 = _mm_set1_pd(charge[inr+1]);
148 jq2 = _mm_set1_pd(charge[inr+2]);
149 vdwjidx0A = 2*vdwtype[inr+0];
150 qq00 = _mm_mul_pd(iq0,jq0);
151 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
152 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
153 qq01 = _mm_mul_pd(iq0,jq1);
154 qq02 = _mm_mul_pd(iq0,jq2);
155 qq10 = _mm_mul_pd(iq1,jq0);
156 qq11 = _mm_mul_pd(iq1,jq1);
157 qq12 = _mm_mul_pd(iq1,jq2);
158 qq20 = _mm_mul_pd(iq2,jq0);
159 qq21 = _mm_mul_pd(iq2,jq1);
160 qq22 = _mm_mul_pd(iq2,jq2);
162 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
163 rcutoff_scalar = fr->rcoulomb;
164 rcutoff = _mm_set1_pd(rcutoff_scalar);
165 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
167 sh_vdw_invrcut6 = _mm_set1_pd(fr->ic->sh_invrc6);
168 rvdw = _mm_set1_pd(fr->rvdw);
170 /* Avoid stupid compiler warnings */
171 jnrA = jnrB = 0;
172 j_coord_offsetA = 0;
173 j_coord_offsetB = 0;
175 outeriter = 0;
176 inneriter = 0;
178 /* Start outer loop over neighborlists */
179 for(iidx=0; iidx<nri; iidx++)
181 /* Load shift vector for this list */
182 i_shift_offset = DIM*shiftidx[iidx];
184 /* Load limits for loop over neighbors */
185 j_index_start = jindex[iidx];
186 j_index_end = jindex[iidx+1];
188 /* Get outer coordinate index */
189 inr = iinr[iidx];
190 i_coord_offset = DIM*inr;
192 /* Load i particle coords and add shift vector */
193 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
194 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
196 fix0 = _mm_setzero_pd();
197 fiy0 = _mm_setzero_pd();
198 fiz0 = _mm_setzero_pd();
199 fix1 = _mm_setzero_pd();
200 fiy1 = _mm_setzero_pd();
201 fiz1 = _mm_setzero_pd();
202 fix2 = _mm_setzero_pd();
203 fiy2 = _mm_setzero_pd();
204 fiz2 = _mm_setzero_pd();
206 /* Reset potential sums */
207 velecsum = _mm_setzero_pd();
208 vvdwsum = _mm_setzero_pd();
210 /* Start inner kernel loop */
211 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
214 /* Get j neighbor index, and coordinate index */
215 jnrA = jjnr[jidx];
216 jnrB = jjnr[jidx+1];
217 j_coord_offsetA = DIM*jnrA;
218 j_coord_offsetB = DIM*jnrB;
220 /* load j atom coordinates */
221 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
222 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
224 /* Calculate displacement vector */
225 dx00 = _mm_sub_pd(ix0,jx0);
226 dy00 = _mm_sub_pd(iy0,jy0);
227 dz00 = _mm_sub_pd(iz0,jz0);
228 dx01 = _mm_sub_pd(ix0,jx1);
229 dy01 = _mm_sub_pd(iy0,jy1);
230 dz01 = _mm_sub_pd(iz0,jz1);
231 dx02 = _mm_sub_pd(ix0,jx2);
232 dy02 = _mm_sub_pd(iy0,jy2);
233 dz02 = _mm_sub_pd(iz0,jz2);
234 dx10 = _mm_sub_pd(ix1,jx0);
235 dy10 = _mm_sub_pd(iy1,jy0);
236 dz10 = _mm_sub_pd(iz1,jz0);
237 dx11 = _mm_sub_pd(ix1,jx1);
238 dy11 = _mm_sub_pd(iy1,jy1);
239 dz11 = _mm_sub_pd(iz1,jz1);
240 dx12 = _mm_sub_pd(ix1,jx2);
241 dy12 = _mm_sub_pd(iy1,jy2);
242 dz12 = _mm_sub_pd(iz1,jz2);
243 dx20 = _mm_sub_pd(ix2,jx0);
244 dy20 = _mm_sub_pd(iy2,jy0);
245 dz20 = _mm_sub_pd(iz2,jz0);
246 dx21 = _mm_sub_pd(ix2,jx1);
247 dy21 = _mm_sub_pd(iy2,jy1);
248 dz21 = _mm_sub_pd(iz2,jz1);
249 dx22 = _mm_sub_pd(ix2,jx2);
250 dy22 = _mm_sub_pd(iy2,jy2);
251 dz22 = _mm_sub_pd(iz2,jz2);
253 /* Calculate squared distance and things based on it */
254 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
255 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
256 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
257 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
258 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
259 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
260 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
261 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
262 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
264 rinv00 = gmx_mm_invsqrt_pd(rsq00);
265 rinv01 = gmx_mm_invsqrt_pd(rsq01);
266 rinv02 = gmx_mm_invsqrt_pd(rsq02);
267 rinv10 = gmx_mm_invsqrt_pd(rsq10);
268 rinv11 = gmx_mm_invsqrt_pd(rsq11);
269 rinv12 = gmx_mm_invsqrt_pd(rsq12);
270 rinv20 = gmx_mm_invsqrt_pd(rsq20);
271 rinv21 = gmx_mm_invsqrt_pd(rsq21);
272 rinv22 = gmx_mm_invsqrt_pd(rsq22);
274 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
275 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
276 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
277 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
278 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
279 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
280 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
281 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
282 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
284 fjx0 = _mm_setzero_pd();
285 fjy0 = _mm_setzero_pd();
286 fjz0 = _mm_setzero_pd();
287 fjx1 = _mm_setzero_pd();
288 fjy1 = _mm_setzero_pd();
289 fjz1 = _mm_setzero_pd();
290 fjx2 = _mm_setzero_pd();
291 fjy2 = _mm_setzero_pd();
292 fjz2 = _mm_setzero_pd();
294 /**************************
295 * CALCULATE INTERACTIONS *
296 **************************/
298 if (gmx_mm_any_lt(rsq00,rcutoff2))
301 r00 = _mm_mul_pd(rsq00,rinv00);
303 /* EWALD ELECTROSTATICS */
305 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
306 ewrt = _mm_mul_pd(r00,ewtabscale);
307 ewitab = _mm_cvttpd_epi32(ewrt);
308 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
309 ewitab = _mm_slli_epi32(ewitab,2);
310 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
311 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
312 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
313 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
314 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
315 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
316 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
317 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
318 velec = _mm_mul_pd(qq00,_mm_sub_pd(_mm_sub_pd(rinv00,sh_ewald),velec));
319 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
321 /* LENNARD-JONES DISPERSION/REPULSION */
323 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
324 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
325 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
326 vvdw = _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12 , _mm_mul_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
327 _mm_mul_pd( _mm_sub_pd(vvdw6,_mm_mul_pd(c6_00,sh_vdw_invrcut6)),one_sixth));
328 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
330 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
332 /* Update potential sum for this i atom from the interaction with this j atom. */
333 velec = _mm_and_pd(velec,cutoff_mask);
334 velecsum = _mm_add_pd(velecsum,velec);
335 vvdw = _mm_and_pd(vvdw,cutoff_mask);
336 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
338 fscal = _mm_add_pd(felec,fvdw);
340 fscal = _mm_and_pd(fscal,cutoff_mask);
342 /* Calculate temporary vectorial force */
343 tx = _mm_mul_pd(fscal,dx00);
344 ty = _mm_mul_pd(fscal,dy00);
345 tz = _mm_mul_pd(fscal,dz00);
347 /* Update vectorial force */
348 fix0 = _mm_add_pd(fix0,tx);
349 fiy0 = _mm_add_pd(fiy0,ty);
350 fiz0 = _mm_add_pd(fiz0,tz);
352 fjx0 = _mm_add_pd(fjx0,tx);
353 fjy0 = _mm_add_pd(fjy0,ty);
354 fjz0 = _mm_add_pd(fjz0,tz);
358 /**************************
359 * CALCULATE INTERACTIONS *
360 **************************/
362 if (gmx_mm_any_lt(rsq01,rcutoff2))
365 r01 = _mm_mul_pd(rsq01,rinv01);
367 /* EWALD ELECTROSTATICS */
369 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
370 ewrt = _mm_mul_pd(r01,ewtabscale);
371 ewitab = _mm_cvttpd_epi32(ewrt);
372 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
373 ewitab = _mm_slli_epi32(ewitab,2);
374 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
375 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
376 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
377 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
378 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
379 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
380 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
381 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
382 velec = _mm_mul_pd(qq01,_mm_sub_pd(_mm_sub_pd(rinv01,sh_ewald),velec));
383 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
385 cutoff_mask = _mm_cmplt_pd(rsq01,rcutoff2);
387 /* Update potential sum for this i atom from the interaction with this j atom. */
388 velec = _mm_and_pd(velec,cutoff_mask);
389 velecsum = _mm_add_pd(velecsum,velec);
391 fscal = felec;
393 fscal = _mm_and_pd(fscal,cutoff_mask);
395 /* Calculate temporary vectorial force */
396 tx = _mm_mul_pd(fscal,dx01);
397 ty = _mm_mul_pd(fscal,dy01);
398 tz = _mm_mul_pd(fscal,dz01);
400 /* Update vectorial force */
401 fix0 = _mm_add_pd(fix0,tx);
402 fiy0 = _mm_add_pd(fiy0,ty);
403 fiz0 = _mm_add_pd(fiz0,tz);
405 fjx1 = _mm_add_pd(fjx1,tx);
406 fjy1 = _mm_add_pd(fjy1,ty);
407 fjz1 = _mm_add_pd(fjz1,tz);
411 /**************************
412 * CALCULATE INTERACTIONS *
413 **************************/
415 if (gmx_mm_any_lt(rsq02,rcutoff2))
418 r02 = _mm_mul_pd(rsq02,rinv02);
420 /* EWALD ELECTROSTATICS */
422 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
423 ewrt = _mm_mul_pd(r02,ewtabscale);
424 ewitab = _mm_cvttpd_epi32(ewrt);
425 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
426 ewitab = _mm_slli_epi32(ewitab,2);
427 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
428 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
429 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
430 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
431 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
432 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
433 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
434 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
435 velec = _mm_mul_pd(qq02,_mm_sub_pd(_mm_sub_pd(rinv02,sh_ewald),velec));
436 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
438 cutoff_mask = _mm_cmplt_pd(rsq02,rcutoff2);
440 /* Update potential sum for this i atom from the interaction with this j atom. */
441 velec = _mm_and_pd(velec,cutoff_mask);
442 velecsum = _mm_add_pd(velecsum,velec);
444 fscal = felec;
446 fscal = _mm_and_pd(fscal,cutoff_mask);
448 /* Calculate temporary vectorial force */
449 tx = _mm_mul_pd(fscal,dx02);
450 ty = _mm_mul_pd(fscal,dy02);
451 tz = _mm_mul_pd(fscal,dz02);
453 /* Update vectorial force */
454 fix0 = _mm_add_pd(fix0,tx);
455 fiy0 = _mm_add_pd(fiy0,ty);
456 fiz0 = _mm_add_pd(fiz0,tz);
458 fjx2 = _mm_add_pd(fjx2,tx);
459 fjy2 = _mm_add_pd(fjy2,ty);
460 fjz2 = _mm_add_pd(fjz2,tz);
464 /**************************
465 * CALCULATE INTERACTIONS *
466 **************************/
468 if (gmx_mm_any_lt(rsq10,rcutoff2))
471 r10 = _mm_mul_pd(rsq10,rinv10);
473 /* EWALD ELECTROSTATICS */
475 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
476 ewrt = _mm_mul_pd(r10,ewtabscale);
477 ewitab = _mm_cvttpd_epi32(ewrt);
478 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
479 ewitab = _mm_slli_epi32(ewitab,2);
480 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
481 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
482 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
483 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
484 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
485 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
486 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
487 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
488 velec = _mm_mul_pd(qq10,_mm_sub_pd(_mm_sub_pd(rinv10,sh_ewald),velec));
489 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
491 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
493 /* Update potential sum for this i atom from the interaction with this j atom. */
494 velec = _mm_and_pd(velec,cutoff_mask);
495 velecsum = _mm_add_pd(velecsum,velec);
497 fscal = felec;
499 fscal = _mm_and_pd(fscal,cutoff_mask);
501 /* Calculate temporary vectorial force */
502 tx = _mm_mul_pd(fscal,dx10);
503 ty = _mm_mul_pd(fscal,dy10);
504 tz = _mm_mul_pd(fscal,dz10);
506 /* Update vectorial force */
507 fix1 = _mm_add_pd(fix1,tx);
508 fiy1 = _mm_add_pd(fiy1,ty);
509 fiz1 = _mm_add_pd(fiz1,tz);
511 fjx0 = _mm_add_pd(fjx0,tx);
512 fjy0 = _mm_add_pd(fjy0,ty);
513 fjz0 = _mm_add_pd(fjz0,tz);
517 /**************************
518 * CALCULATE INTERACTIONS *
519 **************************/
521 if (gmx_mm_any_lt(rsq11,rcutoff2))
524 r11 = _mm_mul_pd(rsq11,rinv11);
526 /* EWALD ELECTROSTATICS */
528 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
529 ewrt = _mm_mul_pd(r11,ewtabscale);
530 ewitab = _mm_cvttpd_epi32(ewrt);
531 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
532 ewitab = _mm_slli_epi32(ewitab,2);
533 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
534 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
535 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
536 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
537 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
538 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
539 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
540 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
541 velec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_sub_pd(rinv11,sh_ewald),velec));
542 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
544 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
546 /* Update potential sum for this i atom from the interaction with this j atom. */
547 velec = _mm_and_pd(velec,cutoff_mask);
548 velecsum = _mm_add_pd(velecsum,velec);
550 fscal = felec;
552 fscal = _mm_and_pd(fscal,cutoff_mask);
554 /* Calculate temporary vectorial force */
555 tx = _mm_mul_pd(fscal,dx11);
556 ty = _mm_mul_pd(fscal,dy11);
557 tz = _mm_mul_pd(fscal,dz11);
559 /* Update vectorial force */
560 fix1 = _mm_add_pd(fix1,tx);
561 fiy1 = _mm_add_pd(fiy1,ty);
562 fiz1 = _mm_add_pd(fiz1,tz);
564 fjx1 = _mm_add_pd(fjx1,tx);
565 fjy1 = _mm_add_pd(fjy1,ty);
566 fjz1 = _mm_add_pd(fjz1,tz);
570 /**************************
571 * CALCULATE INTERACTIONS *
572 **************************/
574 if (gmx_mm_any_lt(rsq12,rcutoff2))
577 r12 = _mm_mul_pd(rsq12,rinv12);
579 /* EWALD ELECTROSTATICS */
581 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
582 ewrt = _mm_mul_pd(r12,ewtabscale);
583 ewitab = _mm_cvttpd_epi32(ewrt);
584 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
585 ewitab = _mm_slli_epi32(ewitab,2);
586 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
587 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
588 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
589 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
590 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
591 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
592 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
593 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
594 velec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_sub_pd(rinv12,sh_ewald),velec));
595 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
597 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
599 /* Update potential sum for this i atom from the interaction with this j atom. */
600 velec = _mm_and_pd(velec,cutoff_mask);
601 velecsum = _mm_add_pd(velecsum,velec);
603 fscal = felec;
605 fscal = _mm_and_pd(fscal,cutoff_mask);
607 /* Calculate temporary vectorial force */
608 tx = _mm_mul_pd(fscal,dx12);
609 ty = _mm_mul_pd(fscal,dy12);
610 tz = _mm_mul_pd(fscal,dz12);
612 /* Update vectorial force */
613 fix1 = _mm_add_pd(fix1,tx);
614 fiy1 = _mm_add_pd(fiy1,ty);
615 fiz1 = _mm_add_pd(fiz1,tz);
617 fjx2 = _mm_add_pd(fjx2,tx);
618 fjy2 = _mm_add_pd(fjy2,ty);
619 fjz2 = _mm_add_pd(fjz2,tz);
623 /**************************
624 * CALCULATE INTERACTIONS *
625 **************************/
627 if (gmx_mm_any_lt(rsq20,rcutoff2))
630 r20 = _mm_mul_pd(rsq20,rinv20);
632 /* EWALD ELECTROSTATICS */
634 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
635 ewrt = _mm_mul_pd(r20,ewtabscale);
636 ewitab = _mm_cvttpd_epi32(ewrt);
637 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
638 ewitab = _mm_slli_epi32(ewitab,2);
639 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
640 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
641 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
642 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
643 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
644 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
645 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
646 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
647 velec = _mm_mul_pd(qq20,_mm_sub_pd(_mm_sub_pd(rinv20,sh_ewald),velec));
648 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
650 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
652 /* Update potential sum for this i atom from the interaction with this j atom. */
653 velec = _mm_and_pd(velec,cutoff_mask);
654 velecsum = _mm_add_pd(velecsum,velec);
656 fscal = felec;
658 fscal = _mm_and_pd(fscal,cutoff_mask);
660 /* Calculate temporary vectorial force */
661 tx = _mm_mul_pd(fscal,dx20);
662 ty = _mm_mul_pd(fscal,dy20);
663 tz = _mm_mul_pd(fscal,dz20);
665 /* Update vectorial force */
666 fix2 = _mm_add_pd(fix2,tx);
667 fiy2 = _mm_add_pd(fiy2,ty);
668 fiz2 = _mm_add_pd(fiz2,tz);
670 fjx0 = _mm_add_pd(fjx0,tx);
671 fjy0 = _mm_add_pd(fjy0,ty);
672 fjz0 = _mm_add_pd(fjz0,tz);
676 /**************************
677 * CALCULATE INTERACTIONS *
678 **************************/
680 if (gmx_mm_any_lt(rsq21,rcutoff2))
683 r21 = _mm_mul_pd(rsq21,rinv21);
685 /* EWALD ELECTROSTATICS */
687 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
688 ewrt = _mm_mul_pd(r21,ewtabscale);
689 ewitab = _mm_cvttpd_epi32(ewrt);
690 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
691 ewitab = _mm_slli_epi32(ewitab,2);
692 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
693 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
694 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
695 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
696 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
697 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
698 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
699 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
700 velec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_sub_pd(rinv21,sh_ewald),velec));
701 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
703 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
705 /* Update potential sum for this i atom from the interaction with this j atom. */
706 velec = _mm_and_pd(velec,cutoff_mask);
707 velecsum = _mm_add_pd(velecsum,velec);
709 fscal = felec;
711 fscal = _mm_and_pd(fscal,cutoff_mask);
713 /* Calculate temporary vectorial force */
714 tx = _mm_mul_pd(fscal,dx21);
715 ty = _mm_mul_pd(fscal,dy21);
716 tz = _mm_mul_pd(fscal,dz21);
718 /* Update vectorial force */
719 fix2 = _mm_add_pd(fix2,tx);
720 fiy2 = _mm_add_pd(fiy2,ty);
721 fiz2 = _mm_add_pd(fiz2,tz);
723 fjx1 = _mm_add_pd(fjx1,tx);
724 fjy1 = _mm_add_pd(fjy1,ty);
725 fjz1 = _mm_add_pd(fjz1,tz);
729 /**************************
730 * CALCULATE INTERACTIONS *
731 **************************/
733 if (gmx_mm_any_lt(rsq22,rcutoff2))
736 r22 = _mm_mul_pd(rsq22,rinv22);
738 /* EWALD ELECTROSTATICS */
740 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
741 ewrt = _mm_mul_pd(r22,ewtabscale);
742 ewitab = _mm_cvttpd_epi32(ewrt);
743 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
744 ewitab = _mm_slli_epi32(ewitab,2);
745 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
746 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
747 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
748 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
749 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
750 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
751 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
752 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
753 velec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_sub_pd(rinv22,sh_ewald),velec));
754 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
756 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
758 /* Update potential sum for this i atom from the interaction with this j atom. */
759 velec = _mm_and_pd(velec,cutoff_mask);
760 velecsum = _mm_add_pd(velecsum,velec);
762 fscal = felec;
764 fscal = _mm_and_pd(fscal,cutoff_mask);
766 /* Calculate temporary vectorial force */
767 tx = _mm_mul_pd(fscal,dx22);
768 ty = _mm_mul_pd(fscal,dy22);
769 tz = _mm_mul_pd(fscal,dz22);
771 /* Update vectorial force */
772 fix2 = _mm_add_pd(fix2,tx);
773 fiy2 = _mm_add_pd(fiy2,ty);
774 fiz2 = _mm_add_pd(fiz2,tz);
776 fjx2 = _mm_add_pd(fjx2,tx);
777 fjy2 = _mm_add_pd(fjy2,ty);
778 fjz2 = _mm_add_pd(fjz2,tz);
782 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
784 /* Inner loop uses 432 flops */
787 if(jidx<j_index_end)
790 jnrA = jjnr[jidx];
791 j_coord_offsetA = DIM*jnrA;
793 /* load j atom coordinates */
794 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
795 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
797 /* Calculate displacement vector */
798 dx00 = _mm_sub_pd(ix0,jx0);
799 dy00 = _mm_sub_pd(iy0,jy0);
800 dz00 = _mm_sub_pd(iz0,jz0);
801 dx01 = _mm_sub_pd(ix0,jx1);
802 dy01 = _mm_sub_pd(iy0,jy1);
803 dz01 = _mm_sub_pd(iz0,jz1);
804 dx02 = _mm_sub_pd(ix0,jx2);
805 dy02 = _mm_sub_pd(iy0,jy2);
806 dz02 = _mm_sub_pd(iz0,jz2);
807 dx10 = _mm_sub_pd(ix1,jx0);
808 dy10 = _mm_sub_pd(iy1,jy0);
809 dz10 = _mm_sub_pd(iz1,jz0);
810 dx11 = _mm_sub_pd(ix1,jx1);
811 dy11 = _mm_sub_pd(iy1,jy1);
812 dz11 = _mm_sub_pd(iz1,jz1);
813 dx12 = _mm_sub_pd(ix1,jx2);
814 dy12 = _mm_sub_pd(iy1,jy2);
815 dz12 = _mm_sub_pd(iz1,jz2);
816 dx20 = _mm_sub_pd(ix2,jx0);
817 dy20 = _mm_sub_pd(iy2,jy0);
818 dz20 = _mm_sub_pd(iz2,jz0);
819 dx21 = _mm_sub_pd(ix2,jx1);
820 dy21 = _mm_sub_pd(iy2,jy1);
821 dz21 = _mm_sub_pd(iz2,jz1);
822 dx22 = _mm_sub_pd(ix2,jx2);
823 dy22 = _mm_sub_pd(iy2,jy2);
824 dz22 = _mm_sub_pd(iz2,jz2);
826 /* Calculate squared distance and things based on it */
827 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
828 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
829 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
830 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
831 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
832 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
833 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
834 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
835 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
837 rinv00 = gmx_mm_invsqrt_pd(rsq00);
838 rinv01 = gmx_mm_invsqrt_pd(rsq01);
839 rinv02 = gmx_mm_invsqrt_pd(rsq02);
840 rinv10 = gmx_mm_invsqrt_pd(rsq10);
841 rinv11 = gmx_mm_invsqrt_pd(rsq11);
842 rinv12 = gmx_mm_invsqrt_pd(rsq12);
843 rinv20 = gmx_mm_invsqrt_pd(rsq20);
844 rinv21 = gmx_mm_invsqrt_pd(rsq21);
845 rinv22 = gmx_mm_invsqrt_pd(rsq22);
847 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
848 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
849 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
850 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
851 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
852 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
853 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
854 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
855 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
857 fjx0 = _mm_setzero_pd();
858 fjy0 = _mm_setzero_pd();
859 fjz0 = _mm_setzero_pd();
860 fjx1 = _mm_setzero_pd();
861 fjy1 = _mm_setzero_pd();
862 fjz1 = _mm_setzero_pd();
863 fjx2 = _mm_setzero_pd();
864 fjy2 = _mm_setzero_pd();
865 fjz2 = _mm_setzero_pd();
867 /**************************
868 * CALCULATE INTERACTIONS *
869 **************************/
871 if (gmx_mm_any_lt(rsq00,rcutoff2))
874 r00 = _mm_mul_pd(rsq00,rinv00);
876 /* EWALD ELECTROSTATICS */
878 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
879 ewrt = _mm_mul_pd(r00,ewtabscale);
880 ewitab = _mm_cvttpd_epi32(ewrt);
881 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
882 ewitab = _mm_slli_epi32(ewitab,2);
883 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
884 ewtabD = _mm_setzero_pd();
885 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
886 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
887 ewtabFn = _mm_setzero_pd();
888 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
889 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
890 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
891 velec = _mm_mul_pd(qq00,_mm_sub_pd(_mm_sub_pd(rinv00,sh_ewald),velec));
892 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
894 /* LENNARD-JONES DISPERSION/REPULSION */
896 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
897 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
898 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
899 vvdw = _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12 , _mm_mul_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
900 _mm_mul_pd( _mm_sub_pd(vvdw6,_mm_mul_pd(c6_00,sh_vdw_invrcut6)),one_sixth));
901 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
903 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
905 /* Update potential sum for this i atom from the interaction with this j atom. */
906 velec = _mm_and_pd(velec,cutoff_mask);
907 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
908 velecsum = _mm_add_pd(velecsum,velec);
909 vvdw = _mm_and_pd(vvdw,cutoff_mask);
910 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
911 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
913 fscal = _mm_add_pd(felec,fvdw);
915 fscal = _mm_and_pd(fscal,cutoff_mask);
917 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
919 /* Calculate temporary vectorial force */
920 tx = _mm_mul_pd(fscal,dx00);
921 ty = _mm_mul_pd(fscal,dy00);
922 tz = _mm_mul_pd(fscal,dz00);
924 /* Update vectorial force */
925 fix0 = _mm_add_pd(fix0,tx);
926 fiy0 = _mm_add_pd(fiy0,ty);
927 fiz0 = _mm_add_pd(fiz0,tz);
929 fjx0 = _mm_add_pd(fjx0,tx);
930 fjy0 = _mm_add_pd(fjy0,ty);
931 fjz0 = _mm_add_pd(fjz0,tz);
935 /**************************
936 * CALCULATE INTERACTIONS *
937 **************************/
939 if (gmx_mm_any_lt(rsq01,rcutoff2))
942 r01 = _mm_mul_pd(rsq01,rinv01);
944 /* EWALD ELECTROSTATICS */
946 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
947 ewrt = _mm_mul_pd(r01,ewtabscale);
948 ewitab = _mm_cvttpd_epi32(ewrt);
949 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
950 ewitab = _mm_slli_epi32(ewitab,2);
951 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
952 ewtabD = _mm_setzero_pd();
953 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
954 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
955 ewtabFn = _mm_setzero_pd();
956 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
957 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
958 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
959 velec = _mm_mul_pd(qq01,_mm_sub_pd(_mm_sub_pd(rinv01,sh_ewald),velec));
960 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
962 cutoff_mask = _mm_cmplt_pd(rsq01,rcutoff2);
964 /* Update potential sum for this i atom from the interaction with this j atom. */
965 velec = _mm_and_pd(velec,cutoff_mask);
966 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
967 velecsum = _mm_add_pd(velecsum,velec);
969 fscal = felec;
971 fscal = _mm_and_pd(fscal,cutoff_mask);
973 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
975 /* Calculate temporary vectorial force */
976 tx = _mm_mul_pd(fscal,dx01);
977 ty = _mm_mul_pd(fscal,dy01);
978 tz = _mm_mul_pd(fscal,dz01);
980 /* Update vectorial force */
981 fix0 = _mm_add_pd(fix0,tx);
982 fiy0 = _mm_add_pd(fiy0,ty);
983 fiz0 = _mm_add_pd(fiz0,tz);
985 fjx1 = _mm_add_pd(fjx1,tx);
986 fjy1 = _mm_add_pd(fjy1,ty);
987 fjz1 = _mm_add_pd(fjz1,tz);
991 /**************************
992 * CALCULATE INTERACTIONS *
993 **************************/
995 if (gmx_mm_any_lt(rsq02,rcutoff2))
998 r02 = _mm_mul_pd(rsq02,rinv02);
1000 /* EWALD ELECTROSTATICS */
1002 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1003 ewrt = _mm_mul_pd(r02,ewtabscale);
1004 ewitab = _mm_cvttpd_epi32(ewrt);
1005 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1006 ewitab = _mm_slli_epi32(ewitab,2);
1007 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1008 ewtabD = _mm_setzero_pd();
1009 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1010 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1011 ewtabFn = _mm_setzero_pd();
1012 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1013 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1014 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1015 velec = _mm_mul_pd(qq02,_mm_sub_pd(_mm_sub_pd(rinv02,sh_ewald),velec));
1016 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
1018 cutoff_mask = _mm_cmplt_pd(rsq02,rcutoff2);
1020 /* Update potential sum for this i atom from the interaction with this j atom. */
1021 velec = _mm_and_pd(velec,cutoff_mask);
1022 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1023 velecsum = _mm_add_pd(velecsum,velec);
1025 fscal = felec;
1027 fscal = _mm_and_pd(fscal,cutoff_mask);
1029 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1031 /* Calculate temporary vectorial force */
1032 tx = _mm_mul_pd(fscal,dx02);
1033 ty = _mm_mul_pd(fscal,dy02);
1034 tz = _mm_mul_pd(fscal,dz02);
1036 /* Update vectorial force */
1037 fix0 = _mm_add_pd(fix0,tx);
1038 fiy0 = _mm_add_pd(fiy0,ty);
1039 fiz0 = _mm_add_pd(fiz0,tz);
1041 fjx2 = _mm_add_pd(fjx2,tx);
1042 fjy2 = _mm_add_pd(fjy2,ty);
1043 fjz2 = _mm_add_pd(fjz2,tz);
1047 /**************************
1048 * CALCULATE INTERACTIONS *
1049 **************************/
1051 if (gmx_mm_any_lt(rsq10,rcutoff2))
1054 r10 = _mm_mul_pd(rsq10,rinv10);
1056 /* EWALD ELECTROSTATICS */
1058 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1059 ewrt = _mm_mul_pd(r10,ewtabscale);
1060 ewitab = _mm_cvttpd_epi32(ewrt);
1061 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1062 ewitab = _mm_slli_epi32(ewitab,2);
1063 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1064 ewtabD = _mm_setzero_pd();
1065 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1066 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1067 ewtabFn = _mm_setzero_pd();
1068 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1069 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1070 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1071 velec = _mm_mul_pd(qq10,_mm_sub_pd(_mm_sub_pd(rinv10,sh_ewald),velec));
1072 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
1074 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
1076 /* Update potential sum for this i atom from the interaction with this j atom. */
1077 velec = _mm_and_pd(velec,cutoff_mask);
1078 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1079 velecsum = _mm_add_pd(velecsum,velec);
1081 fscal = felec;
1083 fscal = _mm_and_pd(fscal,cutoff_mask);
1085 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1087 /* Calculate temporary vectorial force */
1088 tx = _mm_mul_pd(fscal,dx10);
1089 ty = _mm_mul_pd(fscal,dy10);
1090 tz = _mm_mul_pd(fscal,dz10);
1092 /* Update vectorial force */
1093 fix1 = _mm_add_pd(fix1,tx);
1094 fiy1 = _mm_add_pd(fiy1,ty);
1095 fiz1 = _mm_add_pd(fiz1,tz);
1097 fjx0 = _mm_add_pd(fjx0,tx);
1098 fjy0 = _mm_add_pd(fjy0,ty);
1099 fjz0 = _mm_add_pd(fjz0,tz);
1103 /**************************
1104 * CALCULATE INTERACTIONS *
1105 **************************/
1107 if (gmx_mm_any_lt(rsq11,rcutoff2))
1110 r11 = _mm_mul_pd(rsq11,rinv11);
1112 /* EWALD ELECTROSTATICS */
1114 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1115 ewrt = _mm_mul_pd(r11,ewtabscale);
1116 ewitab = _mm_cvttpd_epi32(ewrt);
1117 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1118 ewitab = _mm_slli_epi32(ewitab,2);
1119 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1120 ewtabD = _mm_setzero_pd();
1121 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1122 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1123 ewtabFn = _mm_setzero_pd();
1124 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1125 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1126 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1127 velec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_sub_pd(rinv11,sh_ewald),velec));
1128 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1130 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
1132 /* Update potential sum for this i atom from the interaction with this j atom. */
1133 velec = _mm_and_pd(velec,cutoff_mask);
1134 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1135 velecsum = _mm_add_pd(velecsum,velec);
1137 fscal = felec;
1139 fscal = _mm_and_pd(fscal,cutoff_mask);
1141 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1143 /* Calculate temporary vectorial force */
1144 tx = _mm_mul_pd(fscal,dx11);
1145 ty = _mm_mul_pd(fscal,dy11);
1146 tz = _mm_mul_pd(fscal,dz11);
1148 /* Update vectorial force */
1149 fix1 = _mm_add_pd(fix1,tx);
1150 fiy1 = _mm_add_pd(fiy1,ty);
1151 fiz1 = _mm_add_pd(fiz1,tz);
1153 fjx1 = _mm_add_pd(fjx1,tx);
1154 fjy1 = _mm_add_pd(fjy1,ty);
1155 fjz1 = _mm_add_pd(fjz1,tz);
1159 /**************************
1160 * CALCULATE INTERACTIONS *
1161 **************************/
1163 if (gmx_mm_any_lt(rsq12,rcutoff2))
1166 r12 = _mm_mul_pd(rsq12,rinv12);
1168 /* EWALD ELECTROSTATICS */
1170 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1171 ewrt = _mm_mul_pd(r12,ewtabscale);
1172 ewitab = _mm_cvttpd_epi32(ewrt);
1173 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1174 ewitab = _mm_slli_epi32(ewitab,2);
1175 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1176 ewtabD = _mm_setzero_pd();
1177 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1178 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1179 ewtabFn = _mm_setzero_pd();
1180 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1181 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1182 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1183 velec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_sub_pd(rinv12,sh_ewald),velec));
1184 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1186 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
1188 /* Update potential sum for this i atom from the interaction with this j atom. */
1189 velec = _mm_and_pd(velec,cutoff_mask);
1190 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1191 velecsum = _mm_add_pd(velecsum,velec);
1193 fscal = felec;
1195 fscal = _mm_and_pd(fscal,cutoff_mask);
1197 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1199 /* Calculate temporary vectorial force */
1200 tx = _mm_mul_pd(fscal,dx12);
1201 ty = _mm_mul_pd(fscal,dy12);
1202 tz = _mm_mul_pd(fscal,dz12);
1204 /* Update vectorial force */
1205 fix1 = _mm_add_pd(fix1,tx);
1206 fiy1 = _mm_add_pd(fiy1,ty);
1207 fiz1 = _mm_add_pd(fiz1,tz);
1209 fjx2 = _mm_add_pd(fjx2,tx);
1210 fjy2 = _mm_add_pd(fjy2,ty);
1211 fjz2 = _mm_add_pd(fjz2,tz);
1215 /**************************
1216 * CALCULATE INTERACTIONS *
1217 **************************/
1219 if (gmx_mm_any_lt(rsq20,rcutoff2))
1222 r20 = _mm_mul_pd(rsq20,rinv20);
1224 /* EWALD ELECTROSTATICS */
1226 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1227 ewrt = _mm_mul_pd(r20,ewtabscale);
1228 ewitab = _mm_cvttpd_epi32(ewrt);
1229 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1230 ewitab = _mm_slli_epi32(ewitab,2);
1231 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1232 ewtabD = _mm_setzero_pd();
1233 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1234 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1235 ewtabFn = _mm_setzero_pd();
1236 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1237 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1238 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1239 velec = _mm_mul_pd(qq20,_mm_sub_pd(_mm_sub_pd(rinv20,sh_ewald),velec));
1240 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
1242 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
1244 /* Update potential sum for this i atom from the interaction with this j atom. */
1245 velec = _mm_and_pd(velec,cutoff_mask);
1246 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1247 velecsum = _mm_add_pd(velecsum,velec);
1249 fscal = felec;
1251 fscal = _mm_and_pd(fscal,cutoff_mask);
1253 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1255 /* Calculate temporary vectorial force */
1256 tx = _mm_mul_pd(fscal,dx20);
1257 ty = _mm_mul_pd(fscal,dy20);
1258 tz = _mm_mul_pd(fscal,dz20);
1260 /* Update vectorial force */
1261 fix2 = _mm_add_pd(fix2,tx);
1262 fiy2 = _mm_add_pd(fiy2,ty);
1263 fiz2 = _mm_add_pd(fiz2,tz);
1265 fjx0 = _mm_add_pd(fjx0,tx);
1266 fjy0 = _mm_add_pd(fjy0,ty);
1267 fjz0 = _mm_add_pd(fjz0,tz);
1271 /**************************
1272 * CALCULATE INTERACTIONS *
1273 **************************/
1275 if (gmx_mm_any_lt(rsq21,rcutoff2))
1278 r21 = _mm_mul_pd(rsq21,rinv21);
1280 /* EWALD ELECTROSTATICS */
1282 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1283 ewrt = _mm_mul_pd(r21,ewtabscale);
1284 ewitab = _mm_cvttpd_epi32(ewrt);
1285 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1286 ewitab = _mm_slli_epi32(ewitab,2);
1287 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1288 ewtabD = _mm_setzero_pd();
1289 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1290 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1291 ewtabFn = _mm_setzero_pd();
1292 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1293 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1294 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1295 velec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_sub_pd(rinv21,sh_ewald),velec));
1296 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1298 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
1300 /* Update potential sum for this i atom from the interaction with this j atom. */
1301 velec = _mm_and_pd(velec,cutoff_mask);
1302 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1303 velecsum = _mm_add_pd(velecsum,velec);
1305 fscal = felec;
1307 fscal = _mm_and_pd(fscal,cutoff_mask);
1309 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1311 /* Calculate temporary vectorial force */
1312 tx = _mm_mul_pd(fscal,dx21);
1313 ty = _mm_mul_pd(fscal,dy21);
1314 tz = _mm_mul_pd(fscal,dz21);
1316 /* Update vectorial force */
1317 fix2 = _mm_add_pd(fix2,tx);
1318 fiy2 = _mm_add_pd(fiy2,ty);
1319 fiz2 = _mm_add_pd(fiz2,tz);
1321 fjx1 = _mm_add_pd(fjx1,tx);
1322 fjy1 = _mm_add_pd(fjy1,ty);
1323 fjz1 = _mm_add_pd(fjz1,tz);
1327 /**************************
1328 * CALCULATE INTERACTIONS *
1329 **************************/
1331 if (gmx_mm_any_lt(rsq22,rcutoff2))
1334 r22 = _mm_mul_pd(rsq22,rinv22);
1336 /* EWALD ELECTROSTATICS */
1338 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1339 ewrt = _mm_mul_pd(r22,ewtabscale);
1340 ewitab = _mm_cvttpd_epi32(ewrt);
1341 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1342 ewitab = _mm_slli_epi32(ewitab,2);
1343 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1344 ewtabD = _mm_setzero_pd();
1345 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1346 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1347 ewtabFn = _mm_setzero_pd();
1348 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1349 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1350 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1351 velec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_sub_pd(rinv22,sh_ewald),velec));
1352 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1354 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
1356 /* Update potential sum for this i atom from the interaction with this j atom. */
1357 velec = _mm_and_pd(velec,cutoff_mask);
1358 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1359 velecsum = _mm_add_pd(velecsum,velec);
1361 fscal = felec;
1363 fscal = _mm_and_pd(fscal,cutoff_mask);
1365 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1367 /* Calculate temporary vectorial force */
1368 tx = _mm_mul_pd(fscal,dx22);
1369 ty = _mm_mul_pd(fscal,dy22);
1370 tz = _mm_mul_pd(fscal,dz22);
1372 /* Update vectorial force */
1373 fix2 = _mm_add_pd(fix2,tx);
1374 fiy2 = _mm_add_pd(fiy2,ty);
1375 fiz2 = _mm_add_pd(fiz2,tz);
1377 fjx2 = _mm_add_pd(fjx2,tx);
1378 fjy2 = _mm_add_pd(fjy2,ty);
1379 fjz2 = _mm_add_pd(fjz2,tz);
1383 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1385 /* Inner loop uses 432 flops */
1388 /* End of innermost loop */
1390 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1391 f+i_coord_offset,fshift+i_shift_offset);
1393 ggid = gid[iidx];
1394 /* Update potential energies */
1395 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1396 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1398 /* Increment number of inner iterations */
1399 inneriter += j_index_end - j_index_start;
1401 /* Outer loop uses 20 flops */
1404 /* Increment number of outer iterations */
1405 outeriter += nri;
1407 /* Update outer/inner flops */
1409 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*432);
1412 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse4_1_double
1413 * Electrostatics interaction: Ewald
1414 * VdW interaction: LennardJones
1415 * Geometry: Water3-Water3
1416 * Calculate force/pot: Force
1418 void
1419 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse4_1_double
1420 (t_nblist * gmx_restrict nlist,
1421 rvec * gmx_restrict xx,
1422 rvec * gmx_restrict ff,
1423 t_forcerec * gmx_restrict fr,
1424 t_mdatoms * gmx_restrict mdatoms,
1425 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1426 t_nrnb * gmx_restrict nrnb)
1428 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1429 * just 0 for non-waters.
1430 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1431 * jnr indices corresponding to data put in the four positions in the SIMD register.
1433 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1434 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1435 int jnrA,jnrB;
1436 int j_coord_offsetA,j_coord_offsetB;
1437 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1438 real rcutoff_scalar;
1439 real *shiftvec,*fshift,*x,*f;
1440 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1441 int vdwioffset0;
1442 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1443 int vdwioffset1;
1444 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1445 int vdwioffset2;
1446 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1447 int vdwjidx0A,vdwjidx0B;
1448 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1449 int vdwjidx1A,vdwjidx1B;
1450 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1451 int vdwjidx2A,vdwjidx2B;
1452 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1453 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1454 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1455 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1456 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1457 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1458 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1459 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1460 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1461 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1462 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1463 real *charge;
1464 int nvdwtype;
1465 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1466 int *vdwtype;
1467 real *vdwparam;
1468 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1469 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1470 __m128i ewitab;
1471 __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1472 real *ewtab;
1473 __m128d dummy_mask,cutoff_mask;
1474 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1475 __m128d one = _mm_set1_pd(1.0);
1476 __m128d two = _mm_set1_pd(2.0);
1477 x = xx[0];
1478 f = ff[0];
1480 nri = nlist->nri;
1481 iinr = nlist->iinr;
1482 jindex = nlist->jindex;
1483 jjnr = nlist->jjnr;
1484 shiftidx = nlist->shift;
1485 gid = nlist->gid;
1486 shiftvec = fr->shift_vec[0];
1487 fshift = fr->fshift[0];
1488 facel = _mm_set1_pd(fr->epsfac);
1489 charge = mdatoms->chargeA;
1490 nvdwtype = fr->ntype;
1491 vdwparam = fr->nbfp;
1492 vdwtype = mdatoms->typeA;
1494 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
1495 ewtab = fr->ic->tabq_coul_F;
1496 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
1497 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
1499 /* Setup water-specific parameters */
1500 inr = nlist->iinr[0];
1501 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1502 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1503 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1504 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1506 jq0 = _mm_set1_pd(charge[inr+0]);
1507 jq1 = _mm_set1_pd(charge[inr+1]);
1508 jq2 = _mm_set1_pd(charge[inr+2]);
1509 vdwjidx0A = 2*vdwtype[inr+0];
1510 qq00 = _mm_mul_pd(iq0,jq0);
1511 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1512 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1513 qq01 = _mm_mul_pd(iq0,jq1);
1514 qq02 = _mm_mul_pd(iq0,jq2);
1515 qq10 = _mm_mul_pd(iq1,jq0);
1516 qq11 = _mm_mul_pd(iq1,jq1);
1517 qq12 = _mm_mul_pd(iq1,jq2);
1518 qq20 = _mm_mul_pd(iq2,jq0);
1519 qq21 = _mm_mul_pd(iq2,jq1);
1520 qq22 = _mm_mul_pd(iq2,jq2);
1522 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1523 rcutoff_scalar = fr->rcoulomb;
1524 rcutoff = _mm_set1_pd(rcutoff_scalar);
1525 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
1527 sh_vdw_invrcut6 = _mm_set1_pd(fr->ic->sh_invrc6);
1528 rvdw = _mm_set1_pd(fr->rvdw);
1530 /* Avoid stupid compiler warnings */
1531 jnrA = jnrB = 0;
1532 j_coord_offsetA = 0;
1533 j_coord_offsetB = 0;
1535 outeriter = 0;
1536 inneriter = 0;
1538 /* Start outer loop over neighborlists */
1539 for(iidx=0; iidx<nri; iidx++)
1541 /* Load shift vector for this list */
1542 i_shift_offset = DIM*shiftidx[iidx];
1544 /* Load limits for loop over neighbors */
1545 j_index_start = jindex[iidx];
1546 j_index_end = jindex[iidx+1];
1548 /* Get outer coordinate index */
1549 inr = iinr[iidx];
1550 i_coord_offset = DIM*inr;
1552 /* Load i particle coords and add shift vector */
1553 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1554 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1556 fix0 = _mm_setzero_pd();
1557 fiy0 = _mm_setzero_pd();
1558 fiz0 = _mm_setzero_pd();
1559 fix1 = _mm_setzero_pd();
1560 fiy1 = _mm_setzero_pd();
1561 fiz1 = _mm_setzero_pd();
1562 fix2 = _mm_setzero_pd();
1563 fiy2 = _mm_setzero_pd();
1564 fiz2 = _mm_setzero_pd();
1566 /* Start inner kernel loop */
1567 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1570 /* Get j neighbor index, and coordinate index */
1571 jnrA = jjnr[jidx];
1572 jnrB = jjnr[jidx+1];
1573 j_coord_offsetA = DIM*jnrA;
1574 j_coord_offsetB = DIM*jnrB;
1576 /* load j atom coordinates */
1577 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1578 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1580 /* Calculate displacement vector */
1581 dx00 = _mm_sub_pd(ix0,jx0);
1582 dy00 = _mm_sub_pd(iy0,jy0);
1583 dz00 = _mm_sub_pd(iz0,jz0);
1584 dx01 = _mm_sub_pd(ix0,jx1);
1585 dy01 = _mm_sub_pd(iy0,jy1);
1586 dz01 = _mm_sub_pd(iz0,jz1);
1587 dx02 = _mm_sub_pd(ix0,jx2);
1588 dy02 = _mm_sub_pd(iy0,jy2);
1589 dz02 = _mm_sub_pd(iz0,jz2);
1590 dx10 = _mm_sub_pd(ix1,jx0);
1591 dy10 = _mm_sub_pd(iy1,jy0);
1592 dz10 = _mm_sub_pd(iz1,jz0);
1593 dx11 = _mm_sub_pd(ix1,jx1);
1594 dy11 = _mm_sub_pd(iy1,jy1);
1595 dz11 = _mm_sub_pd(iz1,jz1);
1596 dx12 = _mm_sub_pd(ix1,jx2);
1597 dy12 = _mm_sub_pd(iy1,jy2);
1598 dz12 = _mm_sub_pd(iz1,jz2);
1599 dx20 = _mm_sub_pd(ix2,jx0);
1600 dy20 = _mm_sub_pd(iy2,jy0);
1601 dz20 = _mm_sub_pd(iz2,jz0);
1602 dx21 = _mm_sub_pd(ix2,jx1);
1603 dy21 = _mm_sub_pd(iy2,jy1);
1604 dz21 = _mm_sub_pd(iz2,jz1);
1605 dx22 = _mm_sub_pd(ix2,jx2);
1606 dy22 = _mm_sub_pd(iy2,jy2);
1607 dz22 = _mm_sub_pd(iz2,jz2);
1609 /* Calculate squared distance and things based on it */
1610 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1611 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1612 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1613 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1614 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1615 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1616 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1617 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1618 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1620 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1621 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1622 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1623 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1624 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1625 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1626 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1627 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1628 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1630 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1631 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
1632 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
1633 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
1634 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1635 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1636 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
1637 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1638 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1640 fjx0 = _mm_setzero_pd();
1641 fjy0 = _mm_setzero_pd();
1642 fjz0 = _mm_setzero_pd();
1643 fjx1 = _mm_setzero_pd();
1644 fjy1 = _mm_setzero_pd();
1645 fjz1 = _mm_setzero_pd();
1646 fjx2 = _mm_setzero_pd();
1647 fjy2 = _mm_setzero_pd();
1648 fjz2 = _mm_setzero_pd();
1650 /**************************
1651 * CALCULATE INTERACTIONS *
1652 **************************/
1654 if (gmx_mm_any_lt(rsq00,rcutoff2))
1657 r00 = _mm_mul_pd(rsq00,rinv00);
1659 /* EWALD ELECTROSTATICS */
1661 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1662 ewrt = _mm_mul_pd(r00,ewtabscale);
1663 ewitab = _mm_cvttpd_epi32(ewrt);
1664 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1665 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1666 &ewtabF,&ewtabFn);
1667 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1668 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
1670 /* LENNARD-JONES DISPERSION/REPULSION */
1672 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1673 fvdw = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),c6_00),_mm_mul_pd(rinvsix,rinvsq00));
1675 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
1677 fscal = _mm_add_pd(felec,fvdw);
1679 fscal = _mm_and_pd(fscal,cutoff_mask);
1681 /* Calculate temporary vectorial force */
1682 tx = _mm_mul_pd(fscal,dx00);
1683 ty = _mm_mul_pd(fscal,dy00);
1684 tz = _mm_mul_pd(fscal,dz00);
1686 /* Update vectorial force */
1687 fix0 = _mm_add_pd(fix0,tx);
1688 fiy0 = _mm_add_pd(fiy0,ty);
1689 fiz0 = _mm_add_pd(fiz0,tz);
1691 fjx0 = _mm_add_pd(fjx0,tx);
1692 fjy0 = _mm_add_pd(fjy0,ty);
1693 fjz0 = _mm_add_pd(fjz0,tz);
1697 /**************************
1698 * CALCULATE INTERACTIONS *
1699 **************************/
1701 if (gmx_mm_any_lt(rsq01,rcutoff2))
1704 r01 = _mm_mul_pd(rsq01,rinv01);
1706 /* EWALD ELECTROSTATICS */
1708 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1709 ewrt = _mm_mul_pd(r01,ewtabscale);
1710 ewitab = _mm_cvttpd_epi32(ewrt);
1711 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1712 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1713 &ewtabF,&ewtabFn);
1714 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1715 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
1717 cutoff_mask = _mm_cmplt_pd(rsq01,rcutoff2);
1719 fscal = felec;
1721 fscal = _mm_and_pd(fscal,cutoff_mask);
1723 /* Calculate temporary vectorial force */
1724 tx = _mm_mul_pd(fscal,dx01);
1725 ty = _mm_mul_pd(fscal,dy01);
1726 tz = _mm_mul_pd(fscal,dz01);
1728 /* Update vectorial force */
1729 fix0 = _mm_add_pd(fix0,tx);
1730 fiy0 = _mm_add_pd(fiy0,ty);
1731 fiz0 = _mm_add_pd(fiz0,tz);
1733 fjx1 = _mm_add_pd(fjx1,tx);
1734 fjy1 = _mm_add_pd(fjy1,ty);
1735 fjz1 = _mm_add_pd(fjz1,tz);
1739 /**************************
1740 * CALCULATE INTERACTIONS *
1741 **************************/
1743 if (gmx_mm_any_lt(rsq02,rcutoff2))
1746 r02 = _mm_mul_pd(rsq02,rinv02);
1748 /* EWALD ELECTROSTATICS */
1750 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1751 ewrt = _mm_mul_pd(r02,ewtabscale);
1752 ewitab = _mm_cvttpd_epi32(ewrt);
1753 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1754 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1755 &ewtabF,&ewtabFn);
1756 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1757 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
1759 cutoff_mask = _mm_cmplt_pd(rsq02,rcutoff2);
1761 fscal = felec;
1763 fscal = _mm_and_pd(fscal,cutoff_mask);
1765 /* Calculate temporary vectorial force */
1766 tx = _mm_mul_pd(fscal,dx02);
1767 ty = _mm_mul_pd(fscal,dy02);
1768 tz = _mm_mul_pd(fscal,dz02);
1770 /* Update vectorial force */
1771 fix0 = _mm_add_pd(fix0,tx);
1772 fiy0 = _mm_add_pd(fiy0,ty);
1773 fiz0 = _mm_add_pd(fiz0,tz);
1775 fjx2 = _mm_add_pd(fjx2,tx);
1776 fjy2 = _mm_add_pd(fjy2,ty);
1777 fjz2 = _mm_add_pd(fjz2,tz);
1781 /**************************
1782 * CALCULATE INTERACTIONS *
1783 **************************/
1785 if (gmx_mm_any_lt(rsq10,rcutoff2))
1788 r10 = _mm_mul_pd(rsq10,rinv10);
1790 /* EWALD ELECTROSTATICS */
1792 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1793 ewrt = _mm_mul_pd(r10,ewtabscale);
1794 ewitab = _mm_cvttpd_epi32(ewrt);
1795 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1796 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1797 &ewtabF,&ewtabFn);
1798 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1799 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
1801 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
1803 fscal = felec;
1805 fscal = _mm_and_pd(fscal,cutoff_mask);
1807 /* Calculate temporary vectorial force */
1808 tx = _mm_mul_pd(fscal,dx10);
1809 ty = _mm_mul_pd(fscal,dy10);
1810 tz = _mm_mul_pd(fscal,dz10);
1812 /* Update vectorial force */
1813 fix1 = _mm_add_pd(fix1,tx);
1814 fiy1 = _mm_add_pd(fiy1,ty);
1815 fiz1 = _mm_add_pd(fiz1,tz);
1817 fjx0 = _mm_add_pd(fjx0,tx);
1818 fjy0 = _mm_add_pd(fjy0,ty);
1819 fjz0 = _mm_add_pd(fjz0,tz);
1823 /**************************
1824 * CALCULATE INTERACTIONS *
1825 **************************/
1827 if (gmx_mm_any_lt(rsq11,rcutoff2))
1830 r11 = _mm_mul_pd(rsq11,rinv11);
1832 /* EWALD ELECTROSTATICS */
1834 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1835 ewrt = _mm_mul_pd(r11,ewtabscale);
1836 ewitab = _mm_cvttpd_epi32(ewrt);
1837 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1838 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1839 &ewtabF,&ewtabFn);
1840 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1841 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1843 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
1845 fscal = felec;
1847 fscal = _mm_and_pd(fscal,cutoff_mask);
1849 /* Calculate temporary vectorial force */
1850 tx = _mm_mul_pd(fscal,dx11);
1851 ty = _mm_mul_pd(fscal,dy11);
1852 tz = _mm_mul_pd(fscal,dz11);
1854 /* Update vectorial force */
1855 fix1 = _mm_add_pd(fix1,tx);
1856 fiy1 = _mm_add_pd(fiy1,ty);
1857 fiz1 = _mm_add_pd(fiz1,tz);
1859 fjx1 = _mm_add_pd(fjx1,tx);
1860 fjy1 = _mm_add_pd(fjy1,ty);
1861 fjz1 = _mm_add_pd(fjz1,tz);
1865 /**************************
1866 * CALCULATE INTERACTIONS *
1867 **************************/
1869 if (gmx_mm_any_lt(rsq12,rcutoff2))
1872 r12 = _mm_mul_pd(rsq12,rinv12);
1874 /* EWALD ELECTROSTATICS */
1876 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1877 ewrt = _mm_mul_pd(r12,ewtabscale);
1878 ewitab = _mm_cvttpd_epi32(ewrt);
1879 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1880 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1881 &ewtabF,&ewtabFn);
1882 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1883 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1885 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
1887 fscal = felec;
1889 fscal = _mm_and_pd(fscal,cutoff_mask);
1891 /* Calculate temporary vectorial force */
1892 tx = _mm_mul_pd(fscal,dx12);
1893 ty = _mm_mul_pd(fscal,dy12);
1894 tz = _mm_mul_pd(fscal,dz12);
1896 /* Update vectorial force */
1897 fix1 = _mm_add_pd(fix1,tx);
1898 fiy1 = _mm_add_pd(fiy1,ty);
1899 fiz1 = _mm_add_pd(fiz1,tz);
1901 fjx2 = _mm_add_pd(fjx2,tx);
1902 fjy2 = _mm_add_pd(fjy2,ty);
1903 fjz2 = _mm_add_pd(fjz2,tz);
1907 /**************************
1908 * CALCULATE INTERACTIONS *
1909 **************************/
1911 if (gmx_mm_any_lt(rsq20,rcutoff2))
1914 r20 = _mm_mul_pd(rsq20,rinv20);
1916 /* EWALD ELECTROSTATICS */
1918 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1919 ewrt = _mm_mul_pd(r20,ewtabscale);
1920 ewitab = _mm_cvttpd_epi32(ewrt);
1921 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1922 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1923 &ewtabF,&ewtabFn);
1924 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1925 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
1927 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
1929 fscal = felec;
1931 fscal = _mm_and_pd(fscal,cutoff_mask);
1933 /* Calculate temporary vectorial force */
1934 tx = _mm_mul_pd(fscal,dx20);
1935 ty = _mm_mul_pd(fscal,dy20);
1936 tz = _mm_mul_pd(fscal,dz20);
1938 /* Update vectorial force */
1939 fix2 = _mm_add_pd(fix2,tx);
1940 fiy2 = _mm_add_pd(fiy2,ty);
1941 fiz2 = _mm_add_pd(fiz2,tz);
1943 fjx0 = _mm_add_pd(fjx0,tx);
1944 fjy0 = _mm_add_pd(fjy0,ty);
1945 fjz0 = _mm_add_pd(fjz0,tz);
1949 /**************************
1950 * CALCULATE INTERACTIONS *
1951 **************************/
1953 if (gmx_mm_any_lt(rsq21,rcutoff2))
1956 r21 = _mm_mul_pd(rsq21,rinv21);
1958 /* EWALD ELECTROSTATICS */
1960 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1961 ewrt = _mm_mul_pd(r21,ewtabscale);
1962 ewitab = _mm_cvttpd_epi32(ewrt);
1963 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1964 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1965 &ewtabF,&ewtabFn);
1966 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1967 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1969 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
1971 fscal = felec;
1973 fscal = _mm_and_pd(fscal,cutoff_mask);
1975 /* Calculate temporary vectorial force */
1976 tx = _mm_mul_pd(fscal,dx21);
1977 ty = _mm_mul_pd(fscal,dy21);
1978 tz = _mm_mul_pd(fscal,dz21);
1980 /* Update vectorial force */
1981 fix2 = _mm_add_pd(fix2,tx);
1982 fiy2 = _mm_add_pd(fiy2,ty);
1983 fiz2 = _mm_add_pd(fiz2,tz);
1985 fjx1 = _mm_add_pd(fjx1,tx);
1986 fjy1 = _mm_add_pd(fjy1,ty);
1987 fjz1 = _mm_add_pd(fjz1,tz);
1991 /**************************
1992 * CALCULATE INTERACTIONS *
1993 **************************/
1995 if (gmx_mm_any_lt(rsq22,rcutoff2))
1998 r22 = _mm_mul_pd(rsq22,rinv22);
2000 /* EWALD ELECTROSTATICS */
2002 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2003 ewrt = _mm_mul_pd(r22,ewtabscale);
2004 ewitab = _mm_cvttpd_epi32(ewrt);
2005 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2006 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2007 &ewtabF,&ewtabFn);
2008 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2009 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2011 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
2013 fscal = felec;
2015 fscal = _mm_and_pd(fscal,cutoff_mask);
2017 /* Calculate temporary vectorial force */
2018 tx = _mm_mul_pd(fscal,dx22);
2019 ty = _mm_mul_pd(fscal,dy22);
2020 tz = _mm_mul_pd(fscal,dz22);
2022 /* Update vectorial force */
2023 fix2 = _mm_add_pd(fix2,tx);
2024 fiy2 = _mm_add_pd(fiy2,ty);
2025 fiz2 = _mm_add_pd(fiz2,tz);
2027 fjx2 = _mm_add_pd(fjx2,tx);
2028 fjy2 = _mm_add_pd(fjy2,ty);
2029 fjz2 = _mm_add_pd(fjz2,tz);
2033 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2035 /* Inner loop uses 358 flops */
2038 if(jidx<j_index_end)
2041 jnrA = jjnr[jidx];
2042 j_coord_offsetA = DIM*jnrA;
2044 /* load j atom coordinates */
2045 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
2046 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2048 /* Calculate displacement vector */
2049 dx00 = _mm_sub_pd(ix0,jx0);
2050 dy00 = _mm_sub_pd(iy0,jy0);
2051 dz00 = _mm_sub_pd(iz0,jz0);
2052 dx01 = _mm_sub_pd(ix0,jx1);
2053 dy01 = _mm_sub_pd(iy0,jy1);
2054 dz01 = _mm_sub_pd(iz0,jz1);
2055 dx02 = _mm_sub_pd(ix0,jx2);
2056 dy02 = _mm_sub_pd(iy0,jy2);
2057 dz02 = _mm_sub_pd(iz0,jz2);
2058 dx10 = _mm_sub_pd(ix1,jx0);
2059 dy10 = _mm_sub_pd(iy1,jy0);
2060 dz10 = _mm_sub_pd(iz1,jz0);
2061 dx11 = _mm_sub_pd(ix1,jx1);
2062 dy11 = _mm_sub_pd(iy1,jy1);
2063 dz11 = _mm_sub_pd(iz1,jz1);
2064 dx12 = _mm_sub_pd(ix1,jx2);
2065 dy12 = _mm_sub_pd(iy1,jy2);
2066 dz12 = _mm_sub_pd(iz1,jz2);
2067 dx20 = _mm_sub_pd(ix2,jx0);
2068 dy20 = _mm_sub_pd(iy2,jy0);
2069 dz20 = _mm_sub_pd(iz2,jz0);
2070 dx21 = _mm_sub_pd(ix2,jx1);
2071 dy21 = _mm_sub_pd(iy2,jy1);
2072 dz21 = _mm_sub_pd(iz2,jz1);
2073 dx22 = _mm_sub_pd(ix2,jx2);
2074 dy22 = _mm_sub_pd(iy2,jy2);
2075 dz22 = _mm_sub_pd(iz2,jz2);
2077 /* Calculate squared distance and things based on it */
2078 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
2079 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
2080 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
2081 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
2082 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
2083 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
2084 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
2085 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
2086 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
2088 rinv00 = gmx_mm_invsqrt_pd(rsq00);
2089 rinv01 = gmx_mm_invsqrt_pd(rsq01);
2090 rinv02 = gmx_mm_invsqrt_pd(rsq02);
2091 rinv10 = gmx_mm_invsqrt_pd(rsq10);
2092 rinv11 = gmx_mm_invsqrt_pd(rsq11);
2093 rinv12 = gmx_mm_invsqrt_pd(rsq12);
2094 rinv20 = gmx_mm_invsqrt_pd(rsq20);
2095 rinv21 = gmx_mm_invsqrt_pd(rsq21);
2096 rinv22 = gmx_mm_invsqrt_pd(rsq22);
2098 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
2099 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
2100 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
2101 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
2102 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
2103 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
2104 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
2105 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
2106 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
2108 fjx0 = _mm_setzero_pd();
2109 fjy0 = _mm_setzero_pd();
2110 fjz0 = _mm_setzero_pd();
2111 fjx1 = _mm_setzero_pd();
2112 fjy1 = _mm_setzero_pd();
2113 fjz1 = _mm_setzero_pd();
2114 fjx2 = _mm_setzero_pd();
2115 fjy2 = _mm_setzero_pd();
2116 fjz2 = _mm_setzero_pd();
2118 /**************************
2119 * CALCULATE INTERACTIONS *
2120 **************************/
2122 if (gmx_mm_any_lt(rsq00,rcutoff2))
2125 r00 = _mm_mul_pd(rsq00,rinv00);
2127 /* EWALD ELECTROSTATICS */
2129 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2130 ewrt = _mm_mul_pd(r00,ewtabscale);
2131 ewitab = _mm_cvttpd_epi32(ewrt);
2132 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2133 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2134 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2135 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
2137 /* LENNARD-JONES DISPERSION/REPULSION */
2139 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2140 fvdw = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),c6_00),_mm_mul_pd(rinvsix,rinvsq00));
2142 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
2144 fscal = _mm_add_pd(felec,fvdw);
2146 fscal = _mm_and_pd(fscal,cutoff_mask);
2148 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2150 /* Calculate temporary vectorial force */
2151 tx = _mm_mul_pd(fscal,dx00);
2152 ty = _mm_mul_pd(fscal,dy00);
2153 tz = _mm_mul_pd(fscal,dz00);
2155 /* Update vectorial force */
2156 fix0 = _mm_add_pd(fix0,tx);
2157 fiy0 = _mm_add_pd(fiy0,ty);
2158 fiz0 = _mm_add_pd(fiz0,tz);
2160 fjx0 = _mm_add_pd(fjx0,tx);
2161 fjy0 = _mm_add_pd(fjy0,ty);
2162 fjz0 = _mm_add_pd(fjz0,tz);
2166 /**************************
2167 * CALCULATE INTERACTIONS *
2168 **************************/
2170 if (gmx_mm_any_lt(rsq01,rcutoff2))
2173 r01 = _mm_mul_pd(rsq01,rinv01);
2175 /* EWALD ELECTROSTATICS */
2177 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2178 ewrt = _mm_mul_pd(r01,ewtabscale);
2179 ewitab = _mm_cvttpd_epi32(ewrt);
2180 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2181 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2182 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2183 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
2185 cutoff_mask = _mm_cmplt_pd(rsq01,rcutoff2);
2187 fscal = felec;
2189 fscal = _mm_and_pd(fscal,cutoff_mask);
2191 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2193 /* Calculate temporary vectorial force */
2194 tx = _mm_mul_pd(fscal,dx01);
2195 ty = _mm_mul_pd(fscal,dy01);
2196 tz = _mm_mul_pd(fscal,dz01);
2198 /* Update vectorial force */
2199 fix0 = _mm_add_pd(fix0,tx);
2200 fiy0 = _mm_add_pd(fiy0,ty);
2201 fiz0 = _mm_add_pd(fiz0,tz);
2203 fjx1 = _mm_add_pd(fjx1,tx);
2204 fjy1 = _mm_add_pd(fjy1,ty);
2205 fjz1 = _mm_add_pd(fjz1,tz);
2209 /**************************
2210 * CALCULATE INTERACTIONS *
2211 **************************/
2213 if (gmx_mm_any_lt(rsq02,rcutoff2))
2216 r02 = _mm_mul_pd(rsq02,rinv02);
2218 /* EWALD ELECTROSTATICS */
2220 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2221 ewrt = _mm_mul_pd(r02,ewtabscale);
2222 ewitab = _mm_cvttpd_epi32(ewrt);
2223 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2224 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2225 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2226 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
2228 cutoff_mask = _mm_cmplt_pd(rsq02,rcutoff2);
2230 fscal = felec;
2232 fscal = _mm_and_pd(fscal,cutoff_mask);
2234 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2236 /* Calculate temporary vectorial force */
2237 tx = _mm_mul_pd(fscal,dx02);
2238 ty = _mm_mul_pd(fscal,dy02);
2239 tz = _mm_mul_pd(fscal,dz02);
2241 /* Update vectorial force */
2242 fix0 = _mm_add_pd(fix0,tx);
2243 fiy0 = _mm_add_pd(fiy0,ty);
2244 fiz0 = _mm_add_pd(fiz0,tz);
2246 fjx2 = _mm_add_pd(fjx2,tx);
2247 fjy2 = _mm_add_pd(fjy2,ty);
2248 fjz2 = _mm_add_pd(fjz2,tz);
2252 /**************************
2253 * CALCULATE INTERACTIONS *
2254 **************************/
2256 if (gmx_mm_any_lt(rsq10,rcutoff2))
2259 r10 = _mm_mul_pd(rsq10,rinv10);
2261 /* EWALD ELECTROSTATICS */
2263 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2264 ewrt = _mm_mul_pd(r10,ewtabscale);
2265 ewitab = _mm_cvttpd_epi32(ewrt);
2266 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2267 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2268 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2269 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
2271 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
2273 fscal = felec;
2275 fscal = _mm_and_pd(fscal,cutoff_mask);
2277 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2279 /* Calculate temporary vectorial force */
2280 tx = _mm_mul_pd(fscal,dx10);
2281 ty = _mm_mul_pd(fscal,dy10);
2282 tz = _mm_mul_pd(fscal,dz10);
2284 /* Update vectorial force */
2285 fix1 = _mm_add_pd(fix1,tx);
2286 fiy1 = _mm_add_pd(fiy1,ty);
2287 fiz1 = _mm_add_pd(fiz1,tz);
2289 fjx0 = _mm_add_pd(fjx0,tx);
2290 fjy0 = _mm_add_pd(fjy0,ty);
2291 fjz0 = _mm_add_pd(fjz0,tz);
2295 /**************************
2296 * CALCULATE INTERACTIONS *
2297 **************************/
2299 if (gmx_mm_any_lt(rsq11,rcutoff2))
2302 r11 = _mm_mul_pd(rsq11,rinv11);
2304 /* EWALD ELECTROSTATICS */
2306 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2307 ewrt = _mm_mul_pd(r11,ewtabscale);
2308 ewitab = _mm_cvttpd_epi32(ewrt);
2309 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2310 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2311 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2312 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2314 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
2316 fscal = felec;
2318 fscal = _mm_and_pd(fscal,cutoff_mask);
2320 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2322 /* Calculate temporary vectorial force */
2323 tx = _mm_mul_pd(fscal,dx11);
2324 ty = _mm_mul_pd(fscal,dy11);
2325 tz = _mm_mul_pd(fscal,dz11);
2327 /* Update vectorial force */
2328 fix1 = _mm_add_pd(fix1,tx);
2329 fiy1 = _mm_add_pd(fiy1,ty);
2330 fiz1 = _mm_add_pd(fiz1,tz);
2332 fjx1 = _mm_add_pd(fjx1,tx);
2333 fjy1 = _mm_add_pd(fjy1,ty);
2334 fjz1 = _mm_add_pd(fjz1,tz);
2338 /**************************
2339 * CALCULATE INTERACTIONS *
2340 **************************/
2342 if (gmx_mm_any_lt(rsq12,rcutoff2))
2345 r12 = _mm_mul_pd(rsq12,rinv12);
2347 /* EWALD ELECTROSTATICS */
2349 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2350 ewrt = _mm_mul_pd(r12,ewtabscale);
2351 ewitab = _mm_cvttpd_epi32(ewrt);
2352 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2353 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2354 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2355 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2357 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
2359 fscal = felec;
2361 fscal = _mm_and_pd(fscal,cutoff_mask);
2363 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2365 /* Calculate temporary vectorial force */
2366 tx = _mm_mul_pd(fscal,dx12);
2367 ty = _mm_mul_pd(fscal,dy12);
2368 tz = _mm_mul_pd(fscal,dz12);
2370 /* Update vectorial force */
2371 fix1 = _mm_add_pd(fix1,tx);
2372 fiy1 = _mm_add_pd(fiy1,ty);
2373 fiz1 = _mm_add_pd(fiz1,tz);
2375 fjx2 = _mm_add_pd(fjx2,tx);
2376 fjy2 = _mm_add_pd(fjy2,ty);
2377 fjz2 = _mm_add_pd(fjz2,tz);
2381 /**************************
2382 * CALCULATE INTERACTIONS *
2383 **************************/
2385 if (gmx_mm_any_lt(rsq20,rcutoff2))
2388 r20 = _mm_mul_pd(rsq20,rinv20);
2390 /* EWALD ELECTROSTATICS */
2392 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2393 ewrt = _mm_mul_pd(r20,ewtabscale);
2394 ewitab = _mm_cvttpd_epi32(ewrt);
2395 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2396 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2397 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2398 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
2400 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
2402 fscal = felec;
2404 fscal = _mm_and_pd(fscal,cutoff_mask);
2406 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2408 /* Calculate temporary vectorial force */
2409 tx = _mm_mul_pd(fscal,dx20);
2410 ty = _mm_mul_pd(fscal,dy20);
2411 tz = _mm_mul_pd(fscal,dz20);
2413 /* Update vectorial force */
2414 fix2 = _mm_add_pd(fix2,tx);
2415 fiy2 = _mm_add_pd(fiy2,ty);
2416 fiz2 = _mm_add_pd(fiz2,tz);
2418 fjx0 = _mm_add_pd(fjx0,tx);
2419 fjy0 = _mm_add_pd(fjy0,ty);
2420 fjz0 = _mm_add_pd(fjz0,tz);
2424 /**************************
2425 * CALCULATE INTERACTIONS *
2426 **************************/
2428 if (gmx_mm_any_lt(rsq21,rcutoff2))
2431 r21 = _mm_mul_pd(rsq21,rinv21);
2433 /* EWALD ELECTROSTATICS */
2435 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2436 ewrt = _mm_mul_pd(r21,ewtabscale);
2437 ewitab = _mm_cvttpd_epi32(ewrt);
2438 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2439 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2440 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2441 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2443 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
2445 fscal = felec;
2447 fscal = _mm_and_pd(fscal,cutoff_mask);
2449 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2451 /* Calculate temporary vectorial force */
2452 tx = _mm_mul_pd(fscal,dx21);
2453 ty = _mm_mul_pd(fscal,dy21);
2454 tz = _mm_mul_pd(fscal,dz21);
2456 /* Update vectorial force */
2457 fix2 = _mm_add_pd(fix2,tx);
2458 fiy2 = _mm_add_pd(fiy2,ty);
2459 fiz2 = _mm_add_pd(fiz2,tz);
2461 fjx1 = _mm_add_pd(fjx1,tx);
2462 fjy1 = _mm_add_pd(fjy1,ty);
2463 fjz1 = _mm_add_pd(fjz1,tz);
2467 /**************************
2468 * CALCULATE INTERACTIONS *
2469 **************************/
2471 if (gmx_mm_any_lt(rsq22,rcutoff2))
2474 r22 = _mm_mul_pd(rsq22,rinv22);
2476 /* EWALD ELECTROSTATICS */
2478 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2479 ewrt = _mm_mul_pd(r22,ewtabscale);
2480 ewitab = _mm_cvttpd_epi32(ewrt);
2481 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2482 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2483 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2484 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2486 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
2488 fscal = felec;
2490 fscal = _mm_and_pd(fscal,cutoff_mask);
2492 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2494 /* Calculate temporary vectorial force */
2495 tx = _mm_mul_pd(fscal,dx22);
2496 ty = _mm_mul_pd(fscal,dy22);
2497 tz = _mm_mul_pd(fscal,dz22);
2499 /* Update vectorial force */
2500 fix2 = _mm_add_pd(fix2,tx);
2501 fiy2 = _mm_add_pd(fiy2,ty);
2502 fiz2 = _mm_add_pd(fiz2,tz);
2504 fjx2 = _mm_add_pd(fjx2,tx);
2505 fjy2 = _mm_add_pd(fjy2,ty);
2506 fjz2 = _mm_add_pd(fjz2,tz);
2510 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2512 /* Inner loop uses 358 flops */
2515 /* End of innermost loop */
2517 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2518 f+i_coord_offset,fshift+i_shift_offset);
2520 /* Increment number of inner iterations */
2521 inneriter += j_index_end - j_index_start;
2523 /* Outer loop uses 18 flops */
2526 /* Increment number of outer iterations */
2527 outeriter += nri;
2529 /* Update outer/inner flops */
2531 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*358);