Removed simple.h from nb_kernel_sse2_XX
[gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_double / nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_sse2_double.c
blobd9a0b671976bc71cfbd845b0246c171630afcbe9
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_double kernel generator.
38 #include "gmxpre.h"
40 #include "config.h"
42 #include <math.h>
44 #include "../nb_kernel.h"
45 #include "gromacs/math/vec.h"
46 #include "gromacs/legacyheaders/nrnb.h"
48 #include "gromacs/simd/math_x86_sse2_double.h"
49 #include "kernelutil_x86_sse2_double.h"
52 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sse2_double
53 * Electrostatics interaction: CubicSplineTable
54 * VdW interaction: LennardJones
55 * Geometry: Water3-Water3
56 * Calculate force/pot: PotentialAndForce
58 void
59 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sse2_double
60 (t_nblist * gmx_restrict nlist,
61 rvec * gmx_restrict xx,
62 rvec * gmx_restrict ff,
63 t_forcerec * gmx_restrict fr,
64 t_mdatoms * gmx_restrict mdatoms,
65 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
66 t_nrnb * gmx_restrict nrnb)
68 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
69 * just 0 for non-waters.
70 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
71 * jnr indices corresponding to data put in the two positions in the SIMD register.
73 int i_shift_offset,i_coord_offset,outeriter,inneriter;
74 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
75 int jnrA,jnrB;
76 int j_coord_offsetA,j_coord_offsetB;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
78 real rcutoff_scalar;
79 real *shiftvec,*fshift,*x,*f;
80 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
81 int vdwioffset0;
82 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
83 int vdwioffset1;
84 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
85 int vdwioffset2;
86 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
87 int vdwjidx0A,vdwjidx0B;
88 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
89 int vdwjidx1A,vdwjidx1B;
90 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
91 int vdwjidx2A,vdwjidx2B;
92 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
93 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
94 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
95 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
96 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
97 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
98 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
99 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
100 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
101 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
102 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
103 real *charge;
104 int nvdwtype;
105 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
106 int *vdwtype;
107 real *vdwparam;
108 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
109 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
110 __m128i vfitab;
111 __m128i ifour = _mm_set1_epi32(4);
112 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
113 real *vftab;
114 __m128d dummy_mask,cutoff_mask;
115 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
116 __m128d one = _mm_set1_pd(1.0);
117 __m128d two = _mm_set1_pd(2.0);
118 x = xx[0];
119 f = ff[0];
121 nri = nlist->nri;
122 iinr = nlist->iinr;
123 jindex = nlist->jindex;
124 jjnr = nlist->jjnr;
125 shiftidx = nlist->shift;
126 gid = nlist->gid;
127 shiftvec = fr->shift_vec[0];
128 fshift = fr->fshift[0];
129 facel = _mm_set1_pd(fr->epsfac);
130 charge = mdatoms->chargeA;
131 nvdwtype = fr->ntype;
132 vdwparam = fr->nbfp;
133 vdwtype = mdatoms->typeA;
135 vftab = kernel_data->table_elec->data;
136 vftabscale = _mm_set1_pd(kernel_data->table_elec->scale);
138 /* Setup water-specific parameters */
139 inr = nlist->iinr[0];
140 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
141 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
142 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
143 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
145 jq0 = _mm_set1_pd(charge[inr+0]);
146 jq1 = _mm_set1_pd(charge[inr+1]);
147 jq2 = _mm_set1_pd(charge[inr+2]);
148 vdwjidx0A = 2*vdwtype[inr+0];
149 qq00 = _mm_mul_pd(iq0,jq0);
150 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
151 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
152 qq01 = _mm_mul_pd(iq0,jq1);
153 qq02 = _mm_mul_pd(iq0,jq2);
154 qq10 = _mm_mul_pd(iq1,jq0);
155 qq11 = _mm_mul_pd(iq1,jq1);
156 qq12 = _mm_mul_pd(iq1,jq2);
157 qq20 = _mm_mul_pd(iq2,jq0);
158 qq21 = _mm_mul_pd(iq2,jq1);
159 qq22 = _mm_mul_pd(iq2,jq2);
161 /* Zero-initialize the j indices and offsets to silence uninitialized-variable compiler warnings */
162 jnrA = jnrB = 0;
163 j_coord_offsetA = 0;
164 j_coord_offsetB = 0;
166 outeriter = 0;
167 inneriter = 0;
169 /* Start outer loop over neighborlists */
170 for(iidx=0; iidx<nri; iidx++)
172 /* Load shift vector for this list */
173 i_shift_offset = DIM*shiftidx[iidx];
175 /* Load limits for loop over neighbors */
176 j_index_start = jindex[iidx];
177 j_index_end = jindex[iidx+1];
179 /* Get outer coordinate index */
180 inr = iinr[iidx];
181 i_coord_offset = DIM*inr;
183 /* Load i particle coords and add shift vector */
184 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
185 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
187 fix0 = _mm_setzero_pd();
188 fiy0 = _mm_setzero_pd();
189 fiz0 = _mm_setzero_pd();
190 fix1 = _mm_setzero_pd();
191 fiy1 = _mm_setzero_pd();
192 fiz1 = _mm_setzero_pd();
193 fix2 = _mm_setzero_pd();
194 fiy2 = _mm_setzero_pd();
195 fiz2 = _mm_setzero_pd();
197 /* Reset potential sums */
198 velecsum = _mm_setzero_pd();
199 vvdwsum = _mm_setzero_pd();
201 /* Start inner kernel loop */
202 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
205 /* Get j neighbor index, and coordinate index */
206 jnrA = jjnr[jidx];
207 jnrB = jjnr[jidx+1];
208 j_coord_offsetA = DIM*jnrA;
209 j_coord_offsetB = DIM*jnrB;
211 /* load j atom coordinates */
212 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
213 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
215 /* Calculate displacement vector */
216 dx00 = _mm_sub_pd(ix0,jx0);
217 dy00 = _mm_sub_pd(iy0,jy0);
218 dz00 = _mm_sub_pd(iz0,jz0);
219 dx01 = _mm_sub_pd(ix0,jx1);
220 dy01 = _mm_sub_pd(iy0,jy1);
221 dz01 = _mm_sub_pd(iz0,jz1);
222 dx02 = _mm_sub_pd(ix0,jx2);
223 dy02 = _mm_sub_pd(iy0,jy2);
224 dz02 = _mm_sub_pd(iz0,jz2);
225 dx10 = _mm_sub_pd(ix1,jx0);
226 dy10 = _mm_sub_pd(iy1,jy0);
227 dz10 = _mm_sub_pd(iz1,jz0);
228 dx11 = _mm_sub_pd(ix1,jx1);
229 dy11 = _mm_sub_pd(iy1,jy1);
230 dz11 = _mm_sub_pd(iz1,jz1);
231 dx12 = _mm_sub_pd(ix1,jx2);
232 dy12 = _mm_sub_pd(iy1,jy2);
233 dz12 = _mm_sub_pd(iz1,jz2);
234 dx20 = _mm_sub_pd(ix2,jx0);
235 dy20 = _mm_sub_pd(iy2,jy0);
236 dz20 = _mm_sub_pd(iz2,jz0);
237 dx21 = _mm_sub_pd(ix2,jx1);
238 dy21 = _mm_sub_pd(iy2,jy1);
239 dz21 = _mm_sub_pd(iz2,jz1);
240 dx22 = _mm_sub_pd(ix2,jx2);
241 dy22 = _mm_sub_pd(iy2,jy2);
242 dz22 = _mm_sub_pd(iz2,jz2);
244 /* Calculate squared distance and things based on it */
245 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
246 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
247 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
248 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
249 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
250 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
251 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
252 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
253 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
255 rinv00 = gmx_mm_invsqrt_pd(rsq00);
256 rinv01 = gmx_mm_invsqrt_pd(rsq01);
257 rinv02 = gmx_mm_invsqrt_pd(rsq02);
258 rinv10 = gmx_mm_invsqrt_pd(rsq10);
259 rinv11 = gmx_mm_invsqrt_pd(rsq11);
260 rinv12 = gmx_mm_invsqrt_pd(rsq12);
261 rinv20 = gmx_mm_invsqrt_pd(rsq20);
262 rinv21 = gmx_mm_invsqrt_pd(rsq21);
263 rinv22 = gmx_mm_invsqrt_pd(rsq22);
265 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
267 fjx0 = _mm_setzero_pd();
268 fjy0 = _mm_setzero_pd();
269 fjz0 = _mm_setzero_pd();
270 fjx1 = _mm_setzero_pd();
271 fjy1 = _mm_setzero_pd();
272 fjz1 = _mm_setzero_pd();
273 fjx2 = _mm_setzero_pd();
274 fjy2 = _mm_setzero_pd();
275 fjz2 = _mm_setzero_pd();
277 /**************************
278 * CALCULATE INTERACTIONS *
279 **************************/
281 r00 = _mm_mul_pd(rsq00,rinv00);
283 /* Calculate table index by multiplying r with table scale and truncate to integer */
284 rt = _mm_mul_pd(r00,vftabscale);
285 vfitab = _mm_cvttpd_epi32(rt);
286 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
287 vfitab = _mm_slli_epi32(vfitab,2);
289 /* CUBIC SPLINE TABLE ELECTROSTATICS */
290 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
291 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
292 GMX_MM_TRANSPOSE2_PD(Y,F);
293 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
294 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
295 GMX_MM_TRANSPOSE2_PD(G,H);
296 Heps = _mm_mul_pd(vfeps,H);
297 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
298 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
299 velec = _mm_mul_pd(qq00,VV);
300 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
301 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
303 /* LENNARD-JONES DISPERSION/REPULSION */
305 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
306 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
307 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
308 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
309 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
311 /* Update potential sum for this i atom from the interaction with this j atom. */
312 velecsum = _mm_add_pd(velecsum,velec);
313 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
315 fscal = _mm_add_pd(felec,fvdw);
317 /* Calculate temporary vectorial force */
318 tx = _mm_mul_pd(fscal,dx00);
319 ty = _mm_mul_pd(fscal,dy00);
320 tz = _mm_mul_pd(fscal,dz00);
322 /* Update vectorial force */
323 fix0 = _mm_add_pd(fix0,tx);
324 fiy0 = _mm_add_pd(fiy0,ty);
325 fiz0 = _mm_add_pd(fiz0,tz);
327 fjx0 = _mm_add_pd(fjx0,tx);
328 fjy0 = _mm_add_pd(fjy0,ty);
329 fjz0 = _mm_add_pd(fjz0,tz);
331 /**************************
332 * CALCULATE INTERACTIONS *
333 **************************/
335 r01 = _mm_mul_pd(rsq01,rinv01);
337 /* Calculate table index by multiplying r with table scale and truncate to integer */
338 rt = _mm_mul_pd(r01,vftabscale);
339 vfitab = _mm_cvttpd_epi32(rt);
340 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
341 vfitab = _mm_slli_epi32(vfitab,2);
343 /* CUBIC SPLINE TABLE ELECTROSTATICS */
344 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
345 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
346 GMX_MM_TRANSPOSE2_PD(Y,F);
347 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
348 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
349 GMX_MM_TRANSPOSE2_PD(G,H);
350 Heps = _mm_mul_pd(vfeps,H);
351 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
352 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
353 velec = _mm_mul_pd(qq01,VV);
354 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
355 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
357 /* Update potential sum for this i atom from the interaction with this j atom. */
358 velecsum = _mm_add_pd(velecsum,velec);
360 fscal = felec;
362 /* Calculate temporary vectorial force */
363 tx = _mm_mul_pd(fscal,dx01);
364 ty = _mm_mul_pd(fscal,dy01);
365 tz = _mm_mul_pd(fscal,dz01);
367 /* Update vectorial force */
368 fix0 = _mm_add_pd(fix0,tx);
369 fiy0 = _mm_add_pd(fiy0,ty);
370 fiz0 = _mm_add_pd(fiz0,tz);
372 fjx1 = _mm_add_pd(fjx1,tx);
373 fjy1 = _mm_add_pd(fjy1,ty);
374 fjz1 = _mm_add_pd(fjz1,tz);
376 /**************************
377 * CALCULATE INTERACTIONS *
378 **************************/
380 r02 = _mm_mul_pd(rsq02,rinv02);
382 /* Calculate table index by multiplying r with table scale and truncate to integer */
383 rt = _mm_mul_pd(r02,vftabscale);
384 vfitab = _mm_cvttpd_epi32(rt);
385 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
386 vfitab = _mm_slli_epi32(vfitab,2);
388 /* CUBIC SPLINE TABLE ELECTROSTATICS */
389 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
390 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
391 GMX_MM_TRANSPOSE2_PD(Y,F);
392 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
393 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
394 GMX_MM_TRANSPOSE2_PD(G,H);
395 Heps = _mm_mul_pd(vfeps,H);
396 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
397 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
398 velec = _mm_mul_pd(qq02,VV);
399 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
400 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
402 /* Update potential sum for this i atom from the interaction with this j atom. */
403 velecsum = _mm_add_pd(velecsum,velec);
405 fscal = felec;
407 /* Calculate temporary vectorial force */
408 tx = _mm_mul_pd(fscal,dx02);
409 ty = _mm_mul_pd(fscal,dy02);
410 tz = _mm_mul_pd(fscal,dz02);
412 /* Update vectorial force */
413 fix0 = _mm_add_pd(fix0,tx);
414 fiy0 = _mm_add_pd(fiy0,ty);
415 fiz0 = _mm_add_pd(fiz0,tz);
417 fjx2 = _mm_add_pd(fjx2,tx);
418 fjy2 = _mm_add_pd(fjy2,ty);
419 fjz2 = _mm_add_pd(fjz2,tz);
421 /**************************
422 * CALCULATE INTERACTIONS *
423 **************************/
425 r10 = _mm_mul_pd(rsq10,rinv10);
427 /* Calculate table index by multiplying r with table scale and truncate to integer */
428 rt = _mm_mul_pd(r10,vftabscale);
429 vfitab = _mm_cvttpd_epi32(rt);
430 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
431 vfitab = _mm_slli_epi32(vfitab,2);
433 /* CUBIC SPLINE TABLE ELECTROSTATICS */
434 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
435 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
436 GMX_MM_TRANSPOSE2_PD(Y,F);
437 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
438 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
439 GMX_MM_TRANSPOSE2_PD(G,H);
440 Heps = _mm_mul_pd(vfeps,H);
441 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
442 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
443 velec = _mm_mul_pd(qq10,VV);
444 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
445 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
447 /* Update potential sum for this i atom from the interaction with this j atom. */
448 velecsum = _mm_add_pd(velecsum,velec);
450 fscal = felec;
452 /* Calculate temporary vectorial force */
453 tx = _mm_mul_pd(fscal,dx10);
454 ty = _mm_mul_pd(fscal,dy10);
455 tz = _mm_mul_pd(fscal,dz10);
457 /* Update vectorial force */
458 fix1 = _mm_add_pd(fix1,tx);
459 fiy1 = _mm_add_pd(fiy1,ty);
460 fiz1 = _mm_add_pd(fiz1,tz);
462 fjx0 = _mm_add_pd(fjx0,tx);
463 fjy0 = _mm_add_pd(fjy0,ty);
464 fjz0 = _mm_add_pd(fjz0,tz);
466 /**************************
467 * CALCULATE INTERACTIONS *
468 **************************/
470 r11 = _mm_mul_pd(rsq11,rinv11);
472 /* Calculate table index by multiplying r with table scale and truncate to integer */
473 rt = _mm_mul_pd(r11,vftabscale);
474 vfitab = _mm_cvttpd_epi32(rt);
475 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
476 vfitab = _mm_slli_epi32(vfitab,2);
478 /* CUBIC SPLINE TABLE ELECTROSTATICS */
479 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
480 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
481 GMX_MM_TRANSPOSE2_PD(Y,F);
482 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
483 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
484 GMX_MM_TRANSPOSE2_PD(G,H);
485 Heps = _mm_mul_pd(vfeps,H);
486 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
487 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
488 velec = _mm_mul_pd(qq11,VV);
489 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
490 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
492 /* Update potential sum for this i atom from the interaction with this j atom. */
493 velecsum = _mm_add_pd(velecsum,velec);
495 fscal = felec;
497 /* Calculate temporary vectorial force */
498 tx = _mm_mul_pd(fscal,dx11);
499 ty = _mm_mul_pd(fscal,dy11);
500 tz = _mm_mul_pd(fscal,dz11);
502 /* Update vectorial force */
503 fix1 = _mm_add_pd(fix1,tx);
504 fiy1 = _mm_add_pd(fiy1,ty);
505 fiz1 = _mm_add_pd(fiz1,tz);
507 fjx1 = _mm_add_pd(fjx1,tx);
508 fjy1 = _mm_add_pd(fjy1,ty);
509 fjz1 = _mm_add_pd(fjz1,tz);
511 /**************************
512 * CALCULATE INTERACTIONS *
513 **************************/
515 r12 = _mm_mul_pd(rsq12,rinv12);
517 /* Calculate table index by multiplying r with table scale and truncate to integer */
518 rt = _mm_mul_pd(r12,vftabscale);
519 vfitab = _mm_cvttpd_epi32(rt);
520 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
521 vfitab = _mm_slli_epi32(vfitab,2);
523 /* CUBIC SPLINE TABLE ELECTROSTATICS */
524 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
525 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
526 GMX_MM_TRANSPOSE2_PD(Y,F);
527 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
528 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
529 GMX_MM_TRANSPOSE2_PD(G,H);
530 Heps = _mm_mul_pd(vfeps,H);
531 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
532 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
533 velec = _mm_mul_pd(qq12,VV);
534 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
535 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
537 /* Update potential sum for this i atom from the interaction with this j atom. */
538 velecsum = _mm_add_pd(velecsum,velec);
540 fscal = felec;
542 /* Calculate temporary vectorial force */
543 tx = _mm_mul_pd(fscal,dx12);
544 ty = _mm_mul_pd(fscal,dy12);
545 tz = _mm_mul_pd(fscal,dz12);
547 /* Update vectorial force */
548 fix1 = _mm_add_pd(fix1,tx);
549 fiy1 = _mm_add_pd(fiy1,ty);
550 fiz1 = _mm_add_pd(fiz1,tz);
552 fjx2 = _mm_add_pd(fjx2,tx);
553 fjy2 = _mm_add_pd(fjy2,ty);
554 fjz2 = _mm_add_pd(fjz2,tz);
556 /**************************
557 * CALCULATE INTERACTIONS *
558 **************************/
560 r20 = _mm_mul_pd(rsq20,rinv20);
562 /* Calculate table index by multiplying r with table scale and truncate to integer */
563 rt = _mm_mul_pd(r20,vftabscale);
564 vfitab = _mm_cvttpd_epi32(rt);
565 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
566 vfitab = _mm_slli_epi32(vfitab,2);
568 /* CUBIC SPLINE TABLE ELECTROSTATICS */
569 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
570 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
571 GMX_MM_TRANSPOSE2_PD(Y,F);
572 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
573 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
574 GMX_MM_TRANSPOSE2_PD(G,H);
575 Heps = _mm_mul_pd(vfeps,H);
576 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
577 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
578 velec = _mm_mul_pd(qq20,VV);
579 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
580 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
582 /* Update potential sum for this i atom from the interaction with this j atom. */
583 velecsum = _mm_add_pd(velecsum,velec);
585 fscal = felec;
587 /* Calculate temporary vectorial force */
588 tx = _mm_mul_pd(fscal,dx20);
589 ty = _mm_mul_pd(fscal,dy20);
590 tz = _mm_mul_pd(fscal,dz20);
592 /* Update vectorial force */
593 fix2 = _mm_add_pd(fix2,tx);
594 fiy2 = _mm_add_pd(fiy2,ty);
595 fiz2 = _mm_add_pd(fiz2,tz);
597 fjx0 = _mm_add_pd(fjx0,tx);
598 fjy0 = _mm_add_pd(fjy0,ty);
599 fjz0 = _mm_add_pd(fjz0,tz);
601 /**************************
602 * CALCULATE INTERACTIONS *
603 **************************/
605 r21 = _mm_mul_pd(rsq21,rinv21);
607 /* Calculate table index by multiplying r with table scale and truncate to integer */
608 rt = _mm_mul_pd(r21,vftabscale);
609 vfitab = _mm_cvttpd_epi32(rt);
610 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
611 vfitab = _mm_slli_epi32(vfitab,2);
613 /* CUBIC SPLINE TABLE ELECTROSTATICS */
614 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
615 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
616 GMX_MM_TRANSPOSE2_PD(Y,F);
617 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
618 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
619 GMX_MM_TRANSPOSE2_PD(G,H);
620 Heps = _mm_mul_pd(vfeps,H);
621 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
622 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
623 velec = _mm_mul_pd(qq21,VV);
624 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
625 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
627 /* Update potential sum for this i atom from the interaction with this j atom. */
628 velecsum = _mm_add_pd(velecsum,velec);
630 fscal = felec;
632 /* Calculate temporary vectorial force */
633 tx = _mm_mul_pd(fscal,dx21);
634 ty = _mm_mul_pd(fscal,dy21);
635 tz = _mm_mul_pd(fscal,dz21);
637 /* Update vectorial force */
638 fix2 = _mm_add_pd(fix2,tx);
639 fiy2 = _mm_add_pd(fiy2,ty);
640 fiz2 = _mm_add_pd(fiz2,tz);
642 fjx1 = _mm_add_pd(fjx1,tx);
643 fjy1 = _mm_add_pd(fjy1,ty);
644 fjz1 = _mm_add_pd(fjz1,tz);
646 /**************************
647 * CALCULATE INTERACTIONS *
648 **************************/
650 r22 = _mm_mul_pd(rsq22,rinv22);
652 /* Calculate table index by multiplying r with table scale and truncate to integer */
653 rt = _mm_mul_pd(r22,vftabscale);
654 vfitab = _mm_cvttpd_epi32(rt);
655 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
656 vfitab = _mm_slli_epi32(vfitab,2);
658 /* CUBIC SPLINE TABLE ELECTROSTATICS */
659 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
660 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
661 GMX_MM_TRANSPOSE2_PD(Y,F);
662 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
663 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
664 GMX_MM_TRANSPOSE2_PD(G,H);
665 Heps = _mm_mul_pd(vfeps,H);
666 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
667 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
668 velec = _mm_mul_pd(qq22,VV);
669 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
670 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
672 /* Update potential sum for this i atom from the interaction with this j atom. */
673 velecsum = _mm_add_pd(velecsum,velec);
675 fscal = felec;
677 /* Calculate temporary vectorial force */
678 tx = _mm_mul_pd(fscal,dx22);
679 ty = _mm_mul_pd(fscal,dy22);
680 tz = _mm_mul_pd(fscal,dz22);
682 /* Update vectorial force */
683 fix2 = _mm_add_pd(fix2,tx);
684 fiy2 = _mm_add_pd(fiy2,ty);
685 fiz2 = _mm_add_pd(fiz2,tz);
687 fjx2 = _mm_add_pd(fjx2,tx);
688 fjy2 = _mm_add_pd(fjy2,ty);
689 fjz2 = _mm_add_pd(fjz2,tz);
691 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
693 /* Inner loop uses 400 flops */
696 if(jidx<j_index_end)
699 jnrA = jjnr[jidx];
700 j_coord_offsetA = DIM*jnrA;
702 /* load j atom coordinates */
703 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
704 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
706 /* Calculate displacement vector */
707 dx00 = _mm_sub_pd(ix0,jx0);
708 dy00 = _mm_sub_pd(iy0,jy0);
709 dz00 = _mm_sub_pd(iz0,jz0);
710 dx01 = _mm_sub_pd(ix0,jx1);
711 dy01 = _mm_sub_pd(iy0,jy1);
712 dz01 = _mm_sub_pd(iz0,jz1);
713 dx02 = _mm_sub_pd(ix0,jx2);
714 dy02 = _mm_sub_pd(iy0,jy2);
715 dz02 = _mm_sub_pd(iz0,jz2);
716 dx10 = _mm_sub_pd(ix1,jx0);
717 dy10 = _mm_sub_pd(iy1,jy0);
718 dz10 = _mm_sub_pd(iz1,jz0);
719 dx11 = _mm_sub_pd(ix1,jx1);
720 dy11 = _mm_sub_pd(iy1,jy1);
721 dz11 = _mm_sub_pd(iz1,jz1);
722 dx12 = _mm_sub_pd(ix1,jx2);
723 dy12 = _mm_sub_pd(iy1,jy2);
724 dz12 = _mm_sub_pd(iz1,jz2);
725 dx20 = _mm_sub_pd(ix2,jx0);
726 dy20 = _mm_sub_pd(iy2,jy0);
727 dz20 = _mm_sub_pd(iz2,jz0);
728 dx21 = _mm_sub_pd(ix2,jx1);
729 dy21 = _mm_sub_pd(iy2,jy1);
730 dz21 = _mm_sub_pd(iz2,jz1);
731 dx22 = _mm_sub_pd(ix2,jx2);
732 dy22 = _mm_sub_pd(iy2,jy2);
733 dz22 = _mm_sub_pd(iz2,jz2);
735 /* Calculate squared distance and things based on it */
736 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
737 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
738 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
739 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
740 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
741 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
742 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
743 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
744 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
746 rinv00 = gmx_mm_invsqrt_pd(rsq00);
747 rinv01 = gmx_mm_invsqrt_pd(rsq01);
748 rinv02 = gmx_mm_invsqrt_pd(rsq02);
749 rinv10 = gmx_mm_invsqrt_pd(rsq10);
750 rinv11 = gmx_mm_invsqrt_pd(rsq11);
751 rinv12 = gmx_mm_invsqrt_pd(rsq12);
752 rinv20 = gmx_mm_invsqrt_pd(rsq20);
753 rinv21 = gmx_mm_invsqrt_pd(rsq21);
754 rinv22 = gmx_mm_invsqrt_pd(rsq22);
756 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
758 fjx0 = _mm_setzero_pd();
759 fjy0 = _mm_setzero_pd();
760 fjz0 = _mm_setzero_pd();
761 fjx1 = _mm_setzero_pd();
762 fjy1 = _mm_setzero_pd();
763 fjz1 = _mm_setzero_pd();
764 fjx2 = _mm_setzero_pd();
765 fjy2 = _mm_setzero_pd();
766 fjz2 = _mm_setzero_pd();
768 /**************************
769 * CALCULATE INTERACTIONS *
770 **************************/
772 r00 = _mm_mul_pd(rsq00,rinv00);
774 /* Calculate table index by multiplying r with table scale and truncate to integer */
775 rt = _mm_mul_pd(r00,vftabscale);
776 vfitab = _mm_cvttpd_epi32(rt);
777 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
778 vfitab = _mm_slli_epi32(vfitab,2);
780 /* CUBIC SPLINE TABLE ELECTROSTATICS */
781 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
782 F = _mm_setzero_pd();
783 GMX_MM_TRANSPOSE2_PD(Y,F);
784 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
785 H = _mm_setzero_pd();
786 GMX_MM_TRANSPOSE2_PD(G,H);
787 Heps = _mm_mul_pd(vfeps,H);
788 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
789 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
790 velec = _mm_mul_pd(qq00,VV);
791 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
792 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
794 /* LENNARD-JONES DISPERSION/REPULSION */
796 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
797 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
798 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
799 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
800 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
802 /* Update potential sum for this i atom from the interaction with this j atom. */
803 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
804 velecsum = _mm_add_pd(velecsum,velec);
805 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
806 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
808 fscal = _mm_add_pd(felec,fvdw);
810 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
812 /* Calculate temporary vectorial force */
813 tx = _mm_mul_pd(fscal,dx00);
814 ty = _mm_mul_pd(fscal,dy00);
815 tz = _mm_mul_pd(fscal,dz00);
817 /* Update vectorial force */
818 fix0 = _mm_add_pd(fix0,tx);
819 fiy0 = _mm_add_pd(fiy0,ty);
820 fiz0 = _mm_add_pd(fiz0,tz);
822 fjx0 = _mm_add_pd(fjx0,tx);
823 fjy0 = _mm_add_pd(fjy0,ty);
824 fjz0 = _mm_add_pd(fjz0,tz);
826 /**************************
827 * CALCULATE INTERACTIONS *
828 **************************/
830 r01 = _mm_mul_pd(rsq01,rinv01);
832 /* Calculate table index by multiplying r with table scale and truncate to integer */
833 rt = _mm_mul_pd(r01,vftabscale);
834 vfitab = _mm_cvttpd_epi32(rt);
835 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
836 vfitab = _mm_slli_epi32(vfitab,2);
838 /* CUBIC SPLINE TABLE ELECTROSTATICS */
839 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
840 F = _mm_setzero_pd();
841 GMX_MM_TRANSPOSE2_PD(Y,F);
842 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
843 H = _mm_setzero_pd();
844 GMX_MM_TRANSPOSE2_PD(G,H);
845 Heps = _mm_mul_pd(vfeps,H);
846 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
847 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
848 velec = _mm_mul_pd(qq01,VV);
849 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
850 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
852 /* Update potential sum for this i atom from the interaction with this j atom. */
853 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
854 velecsum = _mm_add_pd(velecsum,velec);
856 fscal = felec;
858 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
860 /* Calculate temporary vectorial force */
861 tx = _mm_mul_pd(fscal,dx01);
862 ty = _mm_mul_pd(fscal,dy01);
863 tz = _mm_mul_pd(fscal,dz01);
865 /* Update vectorial force */
866 fix0 = _mm_add_pd(fix0,tx);
867 fiy0 = _mm_add_pd(fiy0,ty);
868 fiz0 = _mm_add_pd(fiz0,tz);
870 fjx1 = _mm_add_pd(fjx1,tx);
871 fjy1 = _mm_add_pd(fjy1,ty);
872 fjz1 = _mm_add_pd(fjz1,tz);
874 /**************************
875 * CALCULATE INTERACTIONS *
876 **************************/
878 r02 = _mm_mul_pd(rsq02,rinv02);
880 /* Calculate table index by multiplying r with table scale and truncate to integer */
881 rt = _mm_mul_pd(r02,vftabscale);
882 vfitab = _mm_cvttpd_epi32(rt);
883 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
884 vfitab = _mm_slli_epi32(vfitab,2);
886 /* CUBIC SPLINE TABLE ELECTROSTATICS */
887 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
888 F = _mm_setzero_pd();
889 GMX_MM_TRANSPOSE2_PD(Y,F);
890 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
891 H = _mm_setzero_pd();
892 GMX_MM_TRANSPOSE2_PD(G,H);
893 Heps = _mm_mul_pd(vfeps,H);
894 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
895 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
896 velec = _mm_mul_pd(qq02,VV);
897 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
898 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
900 /* Update potential sum for this i atom from the interaction with this j atom. */
901 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
902 velecsum = _mm_add_pd(velecsum,velec);
904 fscal = felec;
906 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
908 /* Calculate temporary vectorial force */
909 tx = _mm_mul_pd(fscal,dx02);
910 ty = _mm_mul_pd(fscal,dy02);
911 tz = _mm_mul_pd(fscal,dz02);
913 /* Update vectorial force */
914 fix0 = _mm_add_pd(fix0,tx);
915 fiy0 = _mm_add_pd(fiy0,ty);
916 fiz0 = _mm_add_pd(fiz0,tz);
918 fjx2 = _mm_add_pd(fjx2,tx);
919 fjy2 = _mm_add_pd(fjy2,ty);
920 fjz2 = _mm_add_pd(fjz2,tz);
922 /**************************
923 * CALCULATE INTERACTIONS *
924 **************************/
926 r10 = _mm_mul_pd(rsq10,rinv10);
928 /* Calculate table index by multiplying r with table scale and truncate to integer */
929 rt = _mm_mul_pd(r10,vftabscale);
930 vfitab = _mm_cvttpd_epi32(rt);
931 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
932 vfitab = _mm_slli_epi32(vfitab,2);
934 /* CUBIC SPLINE TABLE ELECTROSTATICS */
935 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
936 F = _mm_setzero_pd();
937 GMX_MM_TRANSPOSE2_PD(Y,F);
938 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
939 H = _mm_setzero_pd();
940 GMX_MM_TRANSPOSE2_PD(G,H);
941 Heps = _mm_mul_pd(vfeps,H);
942 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
943 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
944 velec = _mm_mul_pd(qq10,VV);
945 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
946 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
948 /* Update potential sum for this i atom from the interaction with this j atom. */
949 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
950 velecsum = _mm_add_pd(velecsum,velec);
952 fscal = felec;
954 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
956 /* Calculate temporary vectorial force */
957 tx = _mm_mul_pd(fscal,dx10);
958 ty = _mm_mul_pd(fscal,dy10);
959 tz = _mm_mul_pd(fscal,dz10);
961 /* Update vectorial force */
962 fix1 = _mm_add_pd(fix1,tx);
963 fiy1 = _mm_add_pd(fiy1,ty);
964 fiz1 = _mm_add_pd(fiz1,tz);
966 fjx0 = _mm_add_pd(fjx0,tx);
967 fjy0 = _mm_add_pd(fjy0,ty);
968 fjz0 = _mm_add_pd(fjz0,tz);
970 /**************************
971 * CALCULATE INTERACTIONS *
972 **************************/
974 r11 = _mm_mul_pd(rsq11,rinv11);
976 /* Calculate table index by multiplying r with table scale and truncate to integer */
977 rt = _mm_mul_pd(r11,vftabscale);
978 vfitab = _mm_cvttpd_epi32(rt);
979 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
980 vfitab = _mm_slli_epi32(vfitab,2);
982 /* CUBIC SPLINE TABLE ELECTROSTATICS */
983 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
984 F = _mm_setzero_pd();
985 GMX_MM_TRANSPOSE2_PD(Y,F);
986 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
987 H = _mm_setzero_pd();
988 GMX_MM_TRANSPOSE2_PD(G,H);
989 Heps = _mm_mul_pd(vfeps,H);
990 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
991 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
992 velec = _mm_mul_pd(qq11,VV);
993 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
994 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
996 /* Update potential sum for this i atom from the interaction with this j atom. */
997 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
998 velecsum = _mm_add_pd(velecsum,velec);
1000 fscal = felec;
1002 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1004 /* Calculate temporary vectorial force */
1005 tx = _mm_mul_pd(fscal,dx11);
1006 ty = _mm_mul_pd(fscal,dy11);
1007 tz = _mm_mul_pd(fscal,dz11);
1009 /* Update vectorial force */
1010 fix1 = _mm_add_pd(fix1,tx);
1011 fiy1 = _mm_add_pd(fiy1,ty);
1012 fiz1 = _mm_add_pd(fiz1,tz);
1014 fjx1 = _mm_add_pd(fjx1,tx);
1015 fjy1 = _mm_add_pd(fjy1,ty);
1016 fjz1 = _mm_add_pd(fjz1,tz);
1018 /**************************
1019 * CALCULATE INTERACTIONS *
1020 **************************/
1022 r12 = _mm_mul_pd(rsq12,rinv12);
1024 /* Calculate table index by multiplying r with table scale and truncate to integer */
1025 rt = _mm_mul_pd(r12,vftabscale);
1026 vfitab = _mm_cvttpd_epi32(rt);
1027 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1028 vfitab = _mm_slli_epi32(vfitab,2);
1030 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1031 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1032 F = _mm_setzero_pd();
1033 GMX_MM_TRANSPOSE2_PD(Y,F);
1034 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1035 H = _mm_setzero_pd();
1036 GMX_MM_TRANSPOSE2_PD(G,H);
1037 Heps = _mm_mul_pd(vfeps,H);
1038 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1039 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1040 velec = _mm_mul_pd(qq12,VV);
1041 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1042 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1044 /* Update potential sum for this i atom from the interaction with this j atom. */
1045 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1046 velecsum = _mm_add_pd(velecsum,velec);
1048 fscal = felec;
1050 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1052 /* Calculate temporary vectorial force */
1053 tx = _mm_mul_pd(fscal,dx12);
1054 ty = _mm_mul_pd(fscal,dy12);
1055 tz = _mm_mul_pd(fscal,dz12);
1057 /* Update vectorial force */
1058 fix1 = _mm_add_pd(fix1,tx);
1059 fiy1 = _mm_add_pd(fiy1,ty);
1060 fiz1 = _mm_add_pd(fiz1,tz);
1062 fjx2 = _mm_add_pd(fjx2,tx);
1063 fjy2 = _mm_add_pd(fjy2,ty);
1064 fjz2 = _mm_add_pd(fjz2,tz);
1066 /**************************
1067 * CALCULATE INTERACTIONS *
1068 **************************/
1070 r20 = _mm_mul_pd(rsq20,rinv20);
1072 /* Calculate table index by multiplying r with table scale and truncate to integer */
1073 rt = _mm_mul_pd(r20,vftabscale);
1074 vfitab = _mm_cvttpd_epi32(rt);
1075 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1076 vfitab = _mm_slli_epi32(vfitab,2);
1078 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1079 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1080 F = _mm_setzero_pd();
1081 GMX_MM_TRANSPOSE2_PD(Y,F);
1082 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1083 H = _mm_setzero_pd();
1084 GMX_MM_TRANSPOSE2_PD(G,H);
1085 Heps = _mm_mul_pd(vfeps,H);
1086 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1087 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1088 velec = _mm_mul_pd(qq20,VV);
1089 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1090 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
1092 /* Update potential sum for this i atom from the interaction with this j atom. */
1093 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1094 velecsum = _mm_add_pd(velecsum,velec);
1096 fscal = felec;
1098 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1100 /* Calculate temporary vectorial force */
1101 tx = _mm_mul_pd(fscal,dx20);
1102 ty = _mm_mul_pd(fscal,dy20);
1103 tz = _mm_mul_pd(fscal,dz20);
1105 /* Update vectorial force */
1106 fix2 = _mm_add_pd(fix2,tx);
1107 fiy2 = _mm_add_pd(fiy2,ty);
1108 fiz2 = _mm_add_pd(fiz2,tz);
1110 fjx0 = _mm_add_pd(fjx0,tx);
1111 fjy0 = _mm_add_pd(fjy0,ty);
1112 fjz0 = _mm_add_pd(fjz0,tz);
1114 /**************************
1115 * CALCULATE INTERACTIONS *
1116 **************************/
1118 r21 = _mm_mul_pd(rsq21,rinv21);
1120 /* Calculate table index by multiplying r with table scale and truncate to integer */
1121 rt = _mm_mul_pd(r21,vftabscale);
1122 vfitab = _mm_cvttpd_epi32(rt);
1123 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1124 vfitab = _mm_slli_epi32(vfitab,2);
1126 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1127 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1128 F = _mm_setzero_pd();
1129 GMX_MM_TRANSPOSE2_PD(Y,F);
1130 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1131 H = _mm_setzero_pd();
1132 GMX_MM_TRANSPOSE2_PD(G,H);
1133 Heps = _mm_mul_pd(vfeps,H);
1134 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1135 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1136 velec = _mm_mul_pd(qq21,VV);
1137 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1138 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1140 /* Update potential sum for this i atom from the interaction with this j atom. */
1141 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1142 velecsum = _mm_add_pd(velecsum,velec);
1144 fscal = felec;
1146 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1148 /* Calculate temporary vectorial force */
1149 tx = _mm_mul_pd(fscal,dx21);
1150 ty = _mm_mul_pd(fscal,dy21);
1151 tz = _mm_mul_pd(fscal,dz21);
1153 /* Update vectorial force */
1154 fix2 = _mm_add_pd(fix2,tx);
1155 fiy2 = _mm_add_pd(fiy2,ty);
1156 fiz2 = _mm_add_pd(fiz2,tz);
1158 fjx1 = _mm_add_pd(fjx1,tx);
1159 fjy1 = _mm_add_pd(fjy1,ty);
1160 fjz1 = _mm_add_pd(fjz1,tz);
1162 /**************************
1163 * CALCULATE INTERACTIONS *
1164 **************************/
1166 r22 = _mm_mul_pd(rsq22,rinv22);
1168 /* Calculate table index by multiplying r with table scale and truncate to integer */
1169 rt = _mm_mul_pd(r22,vftabscale);
1170 vfitab = _mm_cvttpd_epi32(rt);
1171 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1172 vfitab = _mm_slli_epi32(vfitab,2);
1174 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1175 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1176 F = _mm_setzero_pd();
1177 GMX_MM_TRANSPOSE2_PD(Y,F);
1178 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1179 H = _mm_setzero_pd();
1180 GMX_MM_TRANSPOSE2_PD(G,H);
1181 Heps = _mm_mul_pd(vfeps,H);
1182 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1183 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1184 velec = _mm_mul_pd(qq22,VV);
1185 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1186 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1188 /* Update potential sum for this i atom from the interaction with this j atom. */
1189 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1190 velecsum = _mm_add_pd(velecsum,velec);
1192 fscal = felec;
1194 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1196 /* Calculate temporary vectorial force */
1197 tx = _mm_mul_pd(fscal,dx22);
1198 ty = _mm_mul_pd(fscal,dy22);
1199 tz = _mm_mul_pd(fscal,dz22);
1201 /* Update vectorial force */
1202 fix2 = _mm_add_pd(fix2,tx);
1203 fiy2 = _mm_add_pd(fiy2,ty);
1204 fiz2 = _mm_add_pd(fiz2,tz);
1206 fjx2 = _mm_add_pd(fjx2,tx);
1207 fjy2 = _mm_add_pd(fjy2,ty);
1208 fjz2 = _mm_add_pd(fjz2,tz);
1210 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1212 /* Inner loop uses 400 flops */
1215 /* End of innermost loop */
1217 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1218 f+i_coord_offset,fshift+i_shift_offset);
1220 ggid = gid[iidx];
1221 /* Update potential energies */
1222 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1223 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1225 /* Increment number of inner iterations */
1226 inneriter += j_index_end - j_index_start;
1228 /* Outer loop uses 20 flops */
1231 /* Increment number of outer iterations */
1232 outeriter += nri;
1234 /* Update outer/inner flops */
1236 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*400);
1239 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse2_double
1240 * Electrostatics interaction: CubicSplineTable
1241 * VdW interaction: LennardJones
1242 * Geometry: Water3-Water3
1243 * Calculate force/pot: Force
1245 void
1246 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse2_double
1247 (t_nblist * gmx_restrict nlist,
1248 rvec * gmx_restrict xx,
1249 rvec * gmx_restrict ff,
1250 t_forcerec * gmx_restrict fr,
1251 t_mdatoms * gmx_restrict mdatoms,
1252 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1253 t_nrnb * gmx_restrict nrnb)
1255 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1256 * just 0 for non-waters.
1257 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1258 * jnr indices corresponding to data put in the two positions in the SIMD register.
1260 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1261 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1262 int jnrA,jnrB;
1263 int j_coord_offsetA,j_coord_offsetB;
1264 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1265 real rcutoff_scalar;
1266 real *shiftvec,*fshift,*x,*f;
1267 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1268 int vdwioffset0;
1269 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1270 int vdwioffset1;
1271 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1272 int vdwioffset2;
1273 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1274 int vdwjidx0A,vdwjidx0B;
1275 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1276 int vdwjidx1A,vdwjidx1B;
1277 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1278 int vdwjidx2A,vdwjidx2B;
1279 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1280 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1281 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1282 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1283 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1284 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1285 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1286 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1287 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1288 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1289 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1290 real *charge;
1291 int nvdwtype;
1292 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1293 int *vdwtype;
1294 real *vdwparam;
1295 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1296 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1297 __m128i vfitab;
1298 __m128i ifour = _mm_set1_epi32(4);
1299 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1300 real *vftab;
1301 __m128d dummy_mask,cutoff_mask;
1302 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1303 __m128d one = _mm_set1_pd(1.0);
1304 __m128d two = _mm_set1_pd(2.0);
1305 x = xx[0];
1306 f = ff[0];
1308 nri = nlist->nri;
1309 iinr = nlist->iinr;
1310 jindex = nlist->jindex;
1311 jjnr = nlist->jjnr;
1312 shiftidx = nlist->shift;
1313 gid = nlist->gid;
1314 shiftvec = fr->shift_vec[0];
1315 fshift = fr->fshift[0];
1316 facel = _mm_set1_pd(fr->epsfac);
1317 charge = mdatoms->chargeA;
1318 nvdwtype = fr->ntype;
1319 vdwparam = fr->nbfp;
1320 vdwtype = mdatoms->typeA;
1322 vftab = kernel_data->table_elec->data;
1323 vftabscale = _mm_set1_pd(kernel_data->table_elec->scale);
1325 /* Setup water-specific parameters */
1326 inr = nlist->iinr[0];
1327 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1328 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1329 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1330 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1332 jq0 = _mm_set1_pd(charge[inr+0]);
1333 jq1 = _mm_set1_pd(charge[inr+1]);
1334 jq2 = _mm_set1_pd(charge[inr+2]);
1335 vdwjidx0A = 2*vdwtype[inr+0];
1336 qq00 = _mm_mul_pd(iq0,jq0);
1337 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1338 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1339 qq01 = _mm_mul_pd(iq0,jq1);
1340 qq02 = _mm_mul_pd(iq0,jq2);
1341 qq10 = _mm_mul_pd(iq1,jq0);
1342 qq11 = _mm_mul_pd(iq1,jq1);
1343 qq12 = _mm_mul_pd(iq1,jq2);
1344 qq20 = _mm_mul_pd(iq2,jq0);
1345 qq21 = _mm_mul_pd(iq2,jq1);
1346 qq22 = _mm_mul_pd(iq2,jq2);
1348 /* Avoid stupid compiler warnings */
1349 jnrA = jnrB = 0;
1350 j_coord_offsetA = 0;
1351 j_coord_offsetB = 0;
1353 outeriter = 0;
1354 inneriter = 0;
1356 /* Start outer loop over neighborlists */
1357 for(iidx=0; iidx<nri; iidx++)
1359 /* Load shift vector for this list */
1360 i_shift_offset = DIM*shiftidx[iidx];
1362 /* Load limits for loop over neighbors */
1363 j_index_start = jindex[iidx];
1364 j_index_end = jindex[iidx+1];
1366 /* Get outer coordinate index */
1367 inr = iinr[iidx];
1368 i_coord_offset = DIM*inr;
1370 /* Load i particle coords and add shift vector */
1371 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1372 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1374 fix0 = _mm_setzero_pd();
1375 fiy0 = _mm_setzero_pd();
1376 fiz0 = _mm_setzero_pd();
1377 fix1 = _mm_setzero_pd();
1378 fiy1 = _mm_setzero_pd();
1379 fiz1 = _mm_setzero_pd();
1380 fix2 = _mm_setzero_pd();
1381 fiy2 = _mm_setzero_pd();
1382 fiz2 = _mm_setzero_pd();
1384 /* Start inner kernel loop */
1385 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1388 /* Get j neighbor index, and coordinate index */
1389 jnrA = jjnr[jidx];
1390 jnrB = jjnr[jidx+1];
1391 j_coord_offsetA = DIM*jnrA;
1392 j_coord_offsetB = DIM*jnrB;
1394 /* load j atom coordinates */
1395 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1396 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1398 /* Calculate displacement vector */
1399 dx00 = _mm_sub_pd(ix0,jx0);
1400 dy00 = _mm_sub_pd(iy0,jy0);
1401 dz00 = _mm_sub_pd(iz0,jz0);
1402 dx01 = _mm_sub_pd(ix0,jx1);
1403 dy01 = _mm_sub_pd(iy0,jy1);
1404 dz01 = _mm_sub_pd(iz0,jz1);
1405 dx02 = _mm_sub_pd(ix0,jx2);
1406 dy02 = _mm_sub_pd(iy0,jy2);
1407 dz02 = _mm_sub_pd(iz0,jz2);
1408 dx10 = _mm_sub_pd(ix1,jx0);
1409 dy10 = _mm_sub_pd(iy1,jy0);
1410 dz10 = _mm_sub_pd(iz1,jz0);
1411 dx11 = _mm_sub_pd(ix1,jx1);
1412 dy11 = _mm_sub_pd(iy1,jy1);
1413 dz11 = _mm_sub_pd(iz1,jz1);
1414 dx12 = _mm_sub_pd(ix1,jx2);
1415 dy12 = _mm_sub_pd(iy1,jy2);
1416 dz12 = _mm_sub_pd(iz1,jz2);
1417 dx20 = _mm_sub_pd(ix2,jx0);
1418 dy20 = _mm_sub_pd(iy2,jy0);
1419 dz20 = _mm_sub_pd(iz2,jz0);
1420 dx21 = _mm_sub_pd(ix2,jx1);
1421 dy21 = _mm_sub_pd(iy2,jy1);
1422 dz21 = _mm_sub_pd(iz2,jz1);
1423 dx22 = _mm_sub_pd(ix2,jx2);
1424 dy22 = _mm_sub_pd(iy2,jy2);
1425 dz22 = _mm_sub_pd(iz2,jz2);
1427 /* Calculate squared distance and things based on it */
1428 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1429 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1430 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1431 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1432 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1433 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1434 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1435 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1436 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1438 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1439 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1440 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1441 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1442 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1443 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1444 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1445 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1446 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1448 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1450 fjx0 = _mm_setzero_pd();
1451 fjy0 = _mm_setzero_pd();
1452 fjz0 = _mm_setzero_pd();
1453 fjx1 = _mm_setzero_pd();
1454 fjy1 = _mm_setzero_pd();
1455 fjz1 = _mm_setzero_pd();
1456 fjx2 = _mm_setzero_pd();
1457 fjy2 = _mm_setzero_pd();
1458 fjz2 = _mm_setzero_pd();
1460 /**************************
1461 * CALCULATE INTERACTIONS *
1462 **************************/
1464 r00 = _mm_mul_pd(rsq00,rinv00);
1466 /* Calculate table index by multiplying r with table scale and truncate to integer */
1467 rt = _mm_mul_pd(r00,vftabscale);
1468 vfitab = _mm_cvttpd_epi32(rt);
1469 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1470 vfitab = _mm_slli_epi32(vfitab,2);
1472 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1473 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1474 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1475 GMX_MM_TRANSPOSE2_PD(Y,F);
1476 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1477 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1478 GMX_MM_TRANSPOSE2_PD(G,H);
1479 Heps = _mm_mul_pd(vfeps,H);
1480 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1481 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1482 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
1484 /* LENNARD-JONES DISPERSION/REPULSION */
1486 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1487 fvdw = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),c6_00),_mm_mul_pd(rinvsix,rinvsq00));
1489 fscal = _mm_add_pd(felec,fvdw);
1491 /* Calculate temporary vectorial force */
1492 tx = _mm_mul_pd(fscal,dx00);
1493 ty = _mm_mul_pd(fscal,dy00);
1494 tz = _mm_mul_pd(fscal,dz00);
1496 /* Update vectorial force */
1497 fix0 = _mm_add_pd(fix0,tx);
1498 fiy0 = _mm_add_pd(fiy0,ty);
1499 fiz0 = _mm_add_pd(fiz0,tz);
1501 fjx0 = _mm_add_pd(fjx0,tx);
1502 fjy0 = _mm_add_pd(fjy0,ty);
1503 fjz0 = _mm_add_pd(fjz0,tz);
1505 /**************************
1506 * CALCULATE INTERACTIONS *
1507 **************************/
1509 r01 = _mm_mul_pd(rsq01,rinv01);
1511 /* Calculate table index by multiplying r with table scale and truncate to integer */
1512 rt = _mm_mul_pd(r01,vftabscale);
1513 vfitab = _mm_cvttpd_epi32(rt);
1514 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1515 vfitab = _mm_slli_epi32(vfitab,2);
1517 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1518 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1519 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1520 GMX_MM_TRANSPOSE2_PD(Y,F);
1521 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1522 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1523 GMX_MM_TRANSPOSE2_PD(G,H);
1524 Heps = _mm_mul_pd(vfeps,H);
1525 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1526 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1527 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
1529 fscal = felec;
1531 /* Calculate temporary vectorial force */
1532 tx = _mm_mul_pd(fscal,dx01);
1533 ty = _mm_mul_pd(fscal,dy01);
1534 tz = _mm_mul_pd(fscal,dz01);
1536 /* Update vectorial force */
1537 fix0 = _mm_add_pd(fix0,tx);
1538 fiy0 = _mm_add_pd(fiy0,ty);
1539 fiz0 = _mm_add_pd(fiz0,tz);
1541 fjx1 = _mm_add_pd(fjx1,tx);
1542 fjy1 = _mm_add_pd(fjy1,ty);
1543 fjz1 = _mm_add_pd(fjz1,tz);
1545 /**************************
1546 * CALCULATE INTERACTIONS *
1547 **************************/
1549 r02 = _mm_mul_pd(rsq02,rinv02);
1551 /* Calculate table index by multiplying r with table scale and truncate to integer */
1552 rt = _mm_mul_pd(r02,vftabscale);
1553 vfitab = _mm_cvttpd_epi32(rt);
1554 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1555 vfitab = _mm_slli_epi32(vfitab,2);
1557 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1558 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1559 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1560 GMX_MM_TRANSPOSE2_PD(Y,F);
1561 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1562 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1563 GMX_MM_TRANSPOSE2_PD(G,H);
1564 Heps = _mm_mul_pd(vfeps,H);
1565 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1566 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1567 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
1569 fscal = felec;
1571 /* Calculate temporary vectorial force */
1572 tx = _mm_mul_pd(fscal,dx02);
1573 ty = _mm_mul_pd(fscal,dy02);
1574 tz = _mm_mul_pd(fscal,dz02);
1576 /* Update vectorial force */
1577 fix0 = _mm_add_pd(fix0,tx);
1578 fiy0 = _mm_add_pd(fiy0,ty);
1579 fiz0 = _mm_add_pd(fiz0,tz);
1581 fjx2 = _mm_add_pd(fjx2,tx);
1582 fjy2 = _mm_add_pd(fjy2,ty);
1583 fjz2 = _mm_add_pd(fjz2,tz);
1585 /**************************
1586 * CALCULATE INTERACTIONS *
1587 **************************/
1589 r10 = _mm_mul_pd(rsq10,rinv10);
1591 /* Calculate table index by multiplying r with table scale and truncate to integer */
1592 rt = _mm_mul_pd(r10,vftabscale);
1593 vfitab = _mm_cvttpd_epi32(rt);
1594 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1595 vfitab = _mm_slli_epi32(vfitab,2);
1597 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1598 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1599 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1600 GMX_MM_TRANSPOSE2_PD(Y,F);
1601 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1602 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1603 GMX_MM_TRANSPOSE2_PD(G,H);
1604 Heps = _mm_mul_pd(vfeps,H);
1605 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1606 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1607 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
1609 fscal = felec;
1611 /* Calculate temporary vectorial force */
1612 tx = _mm_mul_pd(fscal,dx10);
1613 ty = _mm_mul_pd(fscal,dy10);
1614 tz = _mm_mul_pd(fscal,dz10);
1616 /* Update vectorial force */
1617 fix1 = _mm_add_pd(fix1,tx);
1618 fiy1 = _mm_add_pd(fiy1,ty);
1619 fiz1 = _mm_add_pd(fiz1,tz);
1621 fjx0 = _mm_add_pd(fjx0,tx);
1622 fjy0 = _mm_add_pd(fjy0,ty);
1623 fjz0 = _mm_add_pd(fjz0,tz);
1625 /**************************
1626 * CALCULATE INTERACTIONS *
1627 **************************/
1629 r11 = _mm_mul_pd(rsq11,rinv11);
1631 /* Calculate table index by multiplying r with table scale and truncate to integer */
1632 rt = _mm_mul_pd(r11,vftabscale);
1633 vfitab = _mm_cvttpd_epi32(rt);
1634 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1635 vfitab = _mm_slli_epi32(vfitab,2);
1637 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1638 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1639 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1640 GMX_MM_TRANSPOSE2_PD(Y,F);
1641 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1642 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1643 GMX_MM_TRANSPOSE2_PD(G,H);
1644 Heps = _mm_mul_pd(vfeps,H);
1645 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1646 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1647 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
1649 fscal = felec;
1651 /* Calculate temporary vectorial force */
1652 tx = _mm_mul_pd(fscal,dx11);
1653 ty = _mm_mul_pd(fscal,dy11);
1654 tz = _mm_mul_pd(fscal,dz11);
1656 /* Update vectorial force */
1657 fix1 = _mm_add_pd(fix1,tx);
1658 fiy1 = _mm_add_pd(fiy1,ty);
1659 fiz1 = _mm_add_pd(fiz1,tz);
1661 fjx1 = _mm_add_pd(fjx1,tx);
1662 fjy1 = _mm_add_pd(fjy1,ty);
1663 fjz1 = _mm_add_pd(fjz1,tz);
1665 /**************************
1666 * CALCULATE INTERACTIONS *
1667 **************************/
1669 r12 = _mm_mul_pd(rsq12,rinv12);
1671 /* Calculate table index by multiplying r with table scale and truncate to integer */
1672 rt = _mm_mul_pd(r12,vftabscale);
1673 vfitab = _mm_cvttpd_epi32(rt);
1674 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1675 vfitab = _mm_slli_epi32(vfitab,2);
1677 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1678 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1679 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1680 GMX_MM_TRANSPOSE2_PD(Y,F);
1681 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1682 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1683 GMX_MM_TRANSPOSE2_PD(G,H);
1684 Heps = _mm_mul_pd(vfeps,H);
1685 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1686 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1687 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1689 fscal = felec;
1691 /* Calculate temporary vectorial force */
1692 tx = _mm_mul_pd(fscal,dx12);
1693 ty = _mm_mul_pd(fscal,dy12);
1694 tz = _mm_mul_pd(fscal,dz12);
1696 /* Update vectorial force */
1697 fix1 = _mm_add_pd(fix1,tx);
1698 fiy1 = _mm_add_pd(fiy1,ty);
1699 fiz1 = _mm_add_pd(fiz1,tz);
1701 fjx2 = _mm_add_pd(fjx2,tx);
1702 fjy2 = _mm_add_pd(fjy2,ty);
1703 fjz2 = _mm_add_pd(fjz2,tz);
1705 /**************************
1706 * CALCULATE INTERACTIONS *
1707 **************************/
1709 r20 = _mm_mul_pd(rsq20,rinv20);
1711 /* Calculate table index by multiplying r with table scale and truncate to integer */
1712 rt = _mm_mul_pd(r20,vftabscale);
1713 vfitab = _mm_cvttpd_epi32(rt);
1714 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1715 vfitab = _mm_slli_epi32(vfitab,2);
1717 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1718 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1719 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1720 GMX_MM_TRANSPOSE2_PD(Y,F);
1721 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1722 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1723 GMX_MM_TRANSPOSE2_PD(G,H);
1724 Heps = _mm_mul_pd(vfeps,H);
1725 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1726 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1727 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
1729 fscal = felec;
1731 /* Calculate temporary vectorial force */
1732 tx = _mm_mul_pd(fscal,dx20);
1733 ty = _mm_mul_pd(fscal,dy20);
1734 tz = _mm_mul_pd(fscal,dz20);
1736 /* Update vectorial force */
1737 fix2 = _mm_add_pd(fix2,tx);
1738 fiy2 = _mm_add_pd(fiy2,ty);
1739 fiz2 = _mm_add_pd(fiz2,tz);
1741 fjx0 = _mm_add_pd(fjx0,tx);
1742 fjy0 = _mm_add_pd(fjy0,ty);
1743 fjz0 = _mm_add_pd(fjz0,tz);
1745 /**************************
1746 * CALCULATE INTERACTIONS *
1747 **************************/
1749 r21 = _mm_mul_pd(rsq21,rinv21);
1751 /* Calculate table index by multiplying r with table scale and truncate to integer */
1752 rt = _mm_mul_pd(r21,vftabscale);
1753 vfitab = _mm_cvttpd_epi32(rt);
1754 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1755 vfitab = _mm_slli_epi32(vfitab,2);
1757 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1758 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1759 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1760 GMX_MM_TRANSPOSE2_PD(Y,F);
1761 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1762 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1763 GMX_MM_TRANSPOSE2_PD(G,H);
1764 Heps = _mm_mul_pd(vfeps,H);
1765 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1766 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1767 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1769 fscal = felec;
1771 /* Calculate temporary vectorial force */
1772 tx = _mm_mul_pd(fscal,dx21);
1773 ty = _mm_mul_pd(fscal,dy21);
1774 tz = _mm_mul_pd(fscal,dz21);
1776 /* Update vectorial force */
1777 fix2 = _mm_add_pd(fix2,tx);
1778 fiy2 = _mm_add_pd(fiy2,ty);
1779 fiz2 = _mm_add_pd(fiz2,tz);
1781 fjx1 = _mm_add_pd(fjx1,tx);
1782 fjy1 = _mm_add_pd(fjy1,ty);
1783 fjz1 = _mm_add_pd(fjz1,tz);
1785 /**************************
1786 * CALCULATE INTERACTIONS *
1787 **************************/
1789 r22 = _mm_mul_pd(rsq22,rinv22);
1791 /* Calculate table index by multiplying r with table scale and truncate to integer */
1792 rt = _mm_mul_pd(r22,vftabscale);
1793 vfitab = _mm_cvttpd_epi32(rt);
1794 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1795 vfitab = _mm_slli_epi32(vfitab,2);
1797 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1798 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1799 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1800 GMX_MM_TRANSPOSE2_PD(Y,F);
1801 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1802 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1803 GMX_MM_TRANSPOSE2_PD(G,H);
1804 Heps = _mm_mul_pd(vfeps,H);
1805 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1806 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1807 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1809 fscal = felec;
1811 /* Calculate temporary vectorial force */
1812 tx = _mm_mul_pd(fscal,dx22);
1813 ty = _mm_mul_pd(fscal,dy22);
1814 tz = _mm_mul_pd(fscal,dz22);
1816 /* Update vectorial force */
1817 fix2 = _mm_add_pd(fix2,tx);
1818 fiy2 = _mm_add_pd(fiy2,ty);
1819 fiz2 = _mm_add_pd(fiz2,tz);
1821 fjx2 = _mm_add_pd(fjx2,tx);
1822 fjy2 = _mm_add_pd(fjy2,ty);
1823 fjz2 = _mm_add_pd(fjz2,tz);
1825 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1827 /* Inner loop uses 359 flops */
1830 if(jidx<j_index_end)
1833 jnrA = jjnr[jidx];
1834 j_coord_offsetA = DIM*jnrA;
1836 /* load j atom coordinates */
1837 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1838 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1840 /* Calculate displacement vector */
1841 dx00 = _mm_sub_pd(ix0,jx0);
1842 dy00 = _mm_sub_pd(iy0,jy0);
1843 dz00 = _mm_sub_pd(iz0,jz0);
1844 dx01 = _mm_sub_pd(ix0,jx1);
1845 dy01 = _mm_sub_pd(iy0,jy1);
1846 dz01 = _mm_sub_pd(iz0,jz1);
1847 dx02 = _mm_sub_pd(ix0,jx2);
1848 dy02 = _mm_sub_pd(iy0,jy2);
1849 dz02 = _mm_sub_pd(iz0,jz2);
1850 dx10 = _mm_sub_pd(ix1,jx0);
1851 dy10 = _mm_sub_pd(iy1,jy0);
1852 dz10 = _mm_sub_pd(iz1,jz0);
1853 dx11 = _mm_sub_pd(ix1,jx1);
1854 dy11 = _mm_sub_pd(iy1,jy1);
1855 dz11 = _mm_sub_pd(iz1,jz1);
1856 dx12 = _mm_sub_pd(ix1,jx2);
1857 dy12 = _mm_sub_pd(iy1,jy2);
1858 dz12 = _mm_sub_pd(iz1,jz2);
1859 dx20 = _mm_sub_pd(ix2,jx0);
1860 dy20 = _mm_sub_pd(iy2,jy0);
1861 dz20 = _mm_sub_pd(iz2,jz0);
1862 dx21 = _mm_sub_pd(ix2,jx1);
1863 dy21 = _mm_sub_pd(iy2,jy1);
1864 dz21 = _mm_sub_pd(iz2,jz1);
1865 dx22 = _mm_sub_pd(ix2,jx2);
1866 dy22 = _mm_sub_pd(iy2,jy2);
1867 dz22 = _mm_sub_pd(iz2,jz2);
1869 /* Calculate squared distance and things based on it */
1870 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1871 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1872 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1873 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1874 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1875 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1876 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1877 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1878 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1880 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1881 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1882 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1883 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1884 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1885 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1886 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1887 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1888 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1890 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1892 fjx0 = _mm_setzero_pd();
1893 fjy0 = _mm_setzero_pd();
1894 fjz0 = _mm_setzero_pd();
1895 fjx1 = _mm_setzero_pd();
1896 fjy1 = _mm_setzero_pd();
1897 fjz1 = _mm_setzero_pd();
1898 fjx2 = _mm_setzero_pd();
1899 fjy2 = _mm_setzero_pd();
1900 fjz2 = _mm_setzero_pd();
1902 /**************************
1903 * CALCULATE INTERACTIONS *
1904 **************************/
1906 r00 = _mm_mul_pd(rsq00,rinv00);
1908 /* Calculate table index by multiplying r with table scale and truncate to integer */
1909 rt = _mm_mul_pd(r00,vftabscale);
1910 vfitab = _mm_cvttpd_epi32(rt);
1911 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1912 vfitab = _mm_slli_epi32(vfitab,2);
1914 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1915 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1916 F = _mm_setzero_pd();
1917 GMX_MM_TRANSPOSE2_PD(Y,F);
1918 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1919 H = _mm_setzero_pd();
1920 GMX_MM_TRANSPOSE2_PD(G,H);
1921 Heps = _mm_mul_pd(vfeps,H);
1922 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1923 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1924 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
1926 /* LENNARD-JONES DISPERSION/REPULSION */
1928 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1929 fvdw = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),c6_00),_mm_mul_pd(rinvsix,rinvsq00));
1931 fscal = _mm_add_pd(felec,fvdw);
1933 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1935 /* Calculate temporary vectorial force */
1936 tx = _mm_mul_pd(fscal,dx00);
1937 ty = _mm_mul_pd(fscal,dy00);
1938 tz = _mm_mul_pd(fscal,dz00);
1940 /* Update vectorial force */
1941 fix0 = _mm_add_pd(fix0,tx);
1942 fiy0 = _mm_add_pd(fiy0,ty);
1943 fiz0 = _mm_add_pd(fiz0,tz);
1945 fjx0 = _mm_add_pd(fjx0,tx);
1946 fjy0 = _mm_add_pd(fjy0,ty);
1947 fjz0 = _mm_add_pd(fjz0,tz);
1949 /**************************
1950 * CALCULATE INTERACTIONS *
1951 **************************/
1953 r01 = _mm_mul_pd(rsq01,rinv01);
1955 /* Calculate table index by multiplying r with table scale and truncate to integer */
1956 rt = _mm_mul_pd(r01,vftabscale);
1957 vfitab = _mm_cvttpd_epi32(rt);
1958 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1959 vfitab = _mm_slli_epi32(vfitab,2);
1961 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1962 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1963 F = _mm_setzero_pd();
1964 GMX_MM_TRANSPOSE2_PD(Y,F);
1965 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1966 H = _mm_setzero_pd();
1967 GMX_MM_TRANSPOSE2_PD(G,H);
1968 Heps = _mm_mul_pd(vfeps,H);
1969 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1970 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1971 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
1973 fscal = felec;
1975 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1977 /* Calculate temporary vectorial force */
1978 tx = _mm_mul_pd(fscal,dx01);
1979 ty = _mm_mul_pd(fscal,dy01);
1980 tz = _mm_mul_pd(fscal,dz01);
1982 /* Update vectorial force */
1983 fix0 = _mm_add_pd(fix0,tx);
1984 fiy0 = _mm_add_pd(fiy0,ty);
1985 fiz0 = _mm_add_pd(fiz0,tz);
1987 fjx1 = _mm_add_pd(fjx1,tx);
1988 fjy1 = _mm_add_pd(fjy1,ty);
1989 fjz1 = _mm_add_pd(fjz1,tz);
1991 /**************************
1992 * CALCULATE INTERACTIONS *
1993 **************************/
1995 r02 = _mm_mul_pd(rsq02,rinv02);
1997 /* Calculate table index by multiplying r with table scale and truncate to integer */
1998 rt = _mm_mul_pd(r02,vftabscale);
1999 vfitab = _mm_cvttpd_epi32(rt);
2000 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2001 vfitab = _mm_slli_epi32(vfitab,2);
2003 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2004 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2005 F = _mm_setzero_pd();
2006 GMX_MM_TRANSPOSE2_PD(Y,F);
2007 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2008 H = _mm_setzero_pd();
2009 GMX_MM_TRANSPOSE2_PD(G,H);
2010 Heps = _mm_mul_pd(vfeps,H);
2011 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2012 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2013 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
2015 fscal = felec;
2017 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2019 /* Calculate temporary vectorial force */
2020 tx = _mm_mul_pd(fscal,dx02);
2021 ty = _mm_mul_pd(fscal,dy02);
2022 tz = _mm_mul_pd(fscal,dz02);
2024 /* Update vectorial force */
2025 fix0 = _mm_add_pd(fix0,tx);
2026 fiy0 = _mm_add_pd(fiy0,ty);
2027 fiz0 = _mm_add_pd(fiz0,tz);
2029 fjx2 = _mm_add_pd(fjx2,tx);
2030 fjy2 = _mm_add_pd(fjy2,ty);
2031 fjz2 = _mm_add_pd(fjz2,tz);
2033 /**************************
2034 * CALCULATE INTERACTIONS *
2035 **************************/
2037 r10 = _mm_mul_pd(rsq10,rinv10);
2039 /* Calculate table index by multiplying r with table scale and truncate to integer */
2040 rt = _mm_mul_pd(r10,vftabscale);
2041 vfitab = _mm_cvttpd_epi32(rt);
2042 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2043 vfitab = _mm_slli_epi32(vfitab,2);
2045 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2046 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2047 F = _mm_setzero_pd();
2048 GMX_MM_TRANSPOSE2_PD(Y,F);
2049 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2050 H = _mm_setzero_pd();
2051 GMX_MM_TRANSPOSE2_PD(G,H);
2052 Heps = _mm_mul_pd(vfeps,H);
2053 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2054 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2055 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
2057 fscal = felec;
2059 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2061 /* Calculate temporary vectorial force */
2062 tx = _mm_mul_pd(fscal,dx10);
2063 ty = _mm_mul_pd(fscal,dy10);
2064 tz = _mm_mul_pd(fscal,dz10);
2066 /* Update vectorial force */
2067 fix1 = _mm_add_pd(fix1,tx);
2068 fiy1 = _mm_add_pd(fiy1,ty);
2069 fiz1 = _mm_add_pd(fiz1,tz);
2071 fjx0 = _mm_add_pd(fjx0,tx);
2072 fjy0 = _mm_add_pd(fjy0,ty);
2073 fjz0 = _mm_add_pd(fjz0,tz);
2075 /**************************
2076 * CALCULATE INTERACTIONS *
2077 **************************/
2079 r11 = _mm_mul_pd(rsq11,rinv11);
2081 /* Calculate table index by multiplying r with table scale and truncate to integer */
2082 rt = _mm_mul_pd(r11,vftabscale);
2083 vfitab = _mm_cvttpd_epi32(rt);
2084 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2085 vfitab = _mm_slli_epi32(vfitab,2);
2087 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2088 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2089 F = _mm_setzero_pd();
2090 GMX_MM_TRANSPOSE2_PD(Y,F);
2091 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2092 H = _mm_setzero_pd();
2093 GMX_MM_TRANSPOSE2_PD(G,H);
2094 Heps = _mm_mul_pd(vfeps,H);
2095 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2096 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2097 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
2099 fscal = felec;
2101 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2103 /* Calculate temporary vectorial force */
2104 tx = _mm_mul_pd(fscal,dx11);
2105 ty = _mm_mul_pd(fscal,dy11);
2106 tz = _mm_mul_pd(fscal,dz11);
2108 /* Update vectorial force */
2109 fix1 = _mm_add_pd(fix1,tx);
2110 fiy1 = _mm_add_pd(fiy1,ty);
2111 fiz1 = _mm_add_pd(fiz1,tz);
2113 fjx1 = _mm_add_pd(fjx1,tx);
2114 fjy1 = _mm_add_pd(fjy1,ty);
2115 fjz1 = _mm_add_pd(fjz1,tz);
2117 /**************************
2118 * CALCULATE INTERACTIONS *
2119 **************************/
2121 r12 = _mm_mul_pd(rsq12,rinv12);
2123 /* Calculate table index by multiplying r with table scale and truncate to integer */
2124 rt = _mm_mul_pd(r12,vftabscale);
2125 vfitab = _mm_cvttpd_epi32(rt);
2126 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2127 vfitab = _mm_slli_epi32(vfitab,2);
2129 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2130 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2131 F = _mm_setzero_pd();
2132 GMX_MM_TRANSPOSE2_PD(Y,F);
2133 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2134 H = _mm_setzero_pd();
2135 GMX_MM_TRANSPOSE2_PD(G,H);
2136 Heps = _mm_mul_pd(vfeps,H);
2137 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2138 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2139 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
2141 fscal = felec;
2143 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2145 /* Calculate temporary vectorial force */
2146 tx = _mm_mul_pd(fscal,dx12);
2147 ty = _mm_mul_pd(fscal,dy12);
2148 tz = _mm_mul_pd(fscal,dz12);
2150 /* Update vectorial force */
2151 fix1 = _mm_add_pd(fix1,tx);
2152 fiy1 = _mm_add_pd(fiy1,ty);
2153 fiz1 = _mm_add_pd(fiz1,tz);
2155 fjx2 = _mm_add_pd(fjx2,tx);
2156 fjy2 = _mm_add_pd(fjy2,ty);
2157 fjz2 = _mm_add_pd(fjz2,tz);
2159 /**************************
2160 * CALCULATE INTERACTIONS *
2161 **************************/
2163 r20 = _mm_mul_pd(rsq20,rinv20);
2165 /* Calculate table index by multiplying r with table scale and truncate to integer */
2166 rt = _mm_mul_pd(r20,vftabscale);
2167 vfitab = _mm_cvttpd_epi32(rt);
2168 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2169 vfitab = _mm_slli_epi32(vfitab,2);
2171 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2172 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2173 F = _mm_setzero_pd();
2174 GMX_MM_TRANSPOSE2_PD(Y,F);
2175 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2176 H = _mm_setzero_pd();
2177 GMX_MM_TRANSPOSE2_PD(G,H);
2178 Heps = _mm_mul_pd(vfeps,H);
2179 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2180 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2181 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
2183 fscal = felec;
2185 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2187 /* Calculate temporary vectorial force */
2188 tx = _mm_mul_pd(fscal,dx20);
2189 ty = _mm_mul_pd(fscal,dy20);
2190 tz = _mm_mul_pd(fscal,dz20);
2192 /* Update vectorial force */
2193 fix2 = _mm_add_pd(fix2,tx);
2194 fiy2 = _mm_add_pd(fiy2,ty);
2195 fiz2 = _mm_add_pd(fiz2,tz);
2197 fjx0 = _mm_add_pd(fjx0,tx);
2198 fjy0 = _mm_add_pd(fjy0,ty);
2199 fjz0 = _mm_add_pd(fjz0,tz);
2201 /**************************
2202 * CALCULATE INTERACTIONS *
2203 **************************/
2205 r21 = _mm_mul_pd(rsq21,rinv21);
2207 /* Calculate table index by multiplying r with table scale and truncate to integer */
2208 rt = _mm_mul_pd(r21,vftabscale);
2209 vfitab = _mm_cvttpd_epi32(rt);
2210 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2211 vfitab = _mm_slli_epi32(vfitab,2);
2213 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2214 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2215 F = _mm_setzero_pd();
2216 GMX_MM_TRANSPOSE2_PD(Y,F);
2217 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2218 H = _mm_setzero_pd();
2219 GMX_MM_TRANSPOSE2_PD(G,H);
2220 Heps = _mm_mul_pd(vfeps,H);
2221 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2222 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2223 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
2225 fscal = felec;
2227 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2229 /* Calculate temporary vectorial force */
2230 tx = _mm_mul_pd(fscal,dx21);
2231 ty = _mm_mul_pd(fscal,dy21);
2232 tz = _mm_mul_pd(fscal,dz21);
2234 /* Update vectorial force */
2235 fix2 = _mm_add_pd(fix2,tx);
2236 fiy2 = _mm_add_pd(fiy2,ty);
2237 fiz2 = _mm_add_pd(fiz2,tz);
2239 fjx1 = _mm_add_pd(fjx1,tx);
2240 fjy1 = _mm_add_pd(fjy1,ty);
2241 fjz1 = _mm_add_pd(fjz1,tz);
2243 /**************************
2244 * CALCULATE INTERACTIONS *
2245 **************************/
2247 r22 = _mm_mul_pd(rsq22,rinv22);
2249 /* Calculate table index by multiplying r with table scale and truncate to integer */
2250 rt = _mm_mul_pd(r22,vftabscale);
2251 vfitab = _mm_cvttpd_epi32(rt);
2252 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2253 vfitab = _mm_slli_epi32(vfitab,2);
2255 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2256 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2257 F = _mm_setzero_pd();
2258 GMX_MM_TRANSPOSE2_PD(Y,F);
2259 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2260 H = _mm_setzero_pd();
2261 GMX_MM_TRANSPOSE2_PD(G,H);
2262 Heps = _mm_mul_pd(vfeps,H);
2263 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2264 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2265 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
2267 fscal = felec;
2269 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2271 /* Calculate temporary vectorial force */
2272 tx = _mm_mul_pd(fscal,dx22);
2273 ty = _mm_mul_pd(fscal,dy22);
2274 tz = _mm_mul_pd(fscal,dz22);
2276 /* Update vectorial force */
2277 fix2 = _mm_add_pd(fix2,tx);
2278 fiy2 = _mm_add_pd(fiy2,ty);
2279 fiz2 = _mm_add_pd(fiz2,tz);
2281 fjx2 = _mm_add_pd(fjx2,tx);
2282 fjy2 = _mm_add_pd(fjy2,ty);
2283 fjz2 = _mm_add_pd(fjz2,tz);
2285 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2287 /* Inner loop uses 359 flops */
2290 /* End of innermost loop */
2292 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2293 f+i_coord_offset,fshift+i_shift_offset);
2295 /* Increment number of inner iterations */
2296 inneriter += j_index_end - j_index_start;
2298 /* Outer loop uses 18 flops */
2301 /* Increment number of outer iterations */
2302 outeriter += nri;
2304 /* Update outer/inner flops */
2306 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*359);