Double precision SSE2 kernels
[gromacs.git] / src / gmxlib / nonbonded / nb_kernel_sse2_double / nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_sse2_double.c
blob4d2c8920fa74836d7d0b05467ae90a45132a0b05
1 /*
2 * Note: this file was generated by the Gromacs sse2_double kernel generator.
4 * This source code is part of
6 * G R O M A C S
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
17 * later version.
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
26 #include <math.h>
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
33 #include "gmx_math_x86_sse2_double.h"
34 #include "kernelutil_x86_sse2_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse2_double
38 * Electrostatics interaction: CubicSplineTable
39 * VdW interaction: CubicSplineTable
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
43 void
44 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse2_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB;
61 int j_coord_offsetA,j_coord_offsetB;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
63 real rcutoff_scalar;
64 real *shiftvec,*fshift,*x,*f;
65 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
66 int vdwioffset0;
67 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
68 int vdwioffset1;
69 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
70 int vdwioffset2;
71 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
72 int vdwjidx0A,vdwjidx0B;
73 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
74 int vdwjidx1A,vdwjidx1B;
75 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
76 int vdwjidx2A,vdwjidx2B;
77 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
78 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
79 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
80 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
81 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
82 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
83 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
84 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
85 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
86 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
87 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
88 real *charge;
89 int nvdwtype;
90 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
91 int *vdwtype;
92 real *vdwparam;
93 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
94 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
95 __m128i vfitab;
96 __m128i ifour = _mm_set1_epi32(4);
97 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
98 real *vftab;
99 __m128d dummy_mask,cutoff_mask;
100 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
101 __m128d one = _mm_set1_pd(1.0);
102 __m128d two = _mm_set1_pd(2.0);
103 x = xx[0];
104 f = ff[0];
106 nri = nlist->nri;
107 iinr = nlist->iinr;
108 jindex = nlist->jindex;
109 jjnr = nlist->jjnr;
110 shiftidx = nlist->shift;
111 gid = nlist->gid;
112 shiftvec = fr->shift_vec[0];
113 fshift = fr->fshift[0];
114 facel = _mm_set1_pd(fr->epsfac);
115 charge = mdatoms->chargeA;
116 nvdwtype = fr->ntype;
117 vdwparam = fr->nbfp;
118 vdwtype = mdatoms->typeA;
120 vftab = kernel_data->table_elec_vdw->data;
121 vftabscale = _mm_set1_pd(kernel_data->table_elec_vdw->scale);
123 /* Setup water-specific parameters */
124 inr = nlist->iinr[0];
125 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
126 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
127 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
128 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
130 jq0 = _mm_set1_pd(charge[inr+0]);
131 jq1 = _mm_set1_pd(charge[inr+1]);
132 jq2 = _mm_set1_pd(charge[inr+2]);
133 vdwjidx0A = 2*vdwtype[inr+0];
134 qq00 = _mm_mul_pd(iq0,jq0);
135 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
136 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
137 qq01 = _mm_mul_pd(iq0,jq1);
138 qq02 = _mm_mul_pd(iq0,jq2);
139 qq10 = _mm_mul_pd(iq1,jq0);
140 qq11 = _mm_mul_pd(iq1,jq1);
141 qq12 = _mm_mul_pd(iq1,jq2);
142 qq20 = _mm_mul_pd(iq2,jq0);
143 qq21 = _mm_mul_pd(iq2,jq1);
144 qq22 = _mm_mul_pd(iq2,jq2);
146 /* Avoid stupid compiler warnings */
147 jnrA = jnrB = 0;
148 j_coord_offsetA = 0;
149 j_coord_offsetB = 0;
151 outeriter = 0;
152 inneriter = 0;
154 /* Start outer loop over neighborlists */
155 for(iidx=0; iidx<nri; iidx++)
157 /* Load shift vector for this list */
158 i_shift_offset = DIM*shiftidx[iidx];
160 /* Load limits for loop over neighbors */
161 j_index_start = jindex[iidx];
162 j_index_end = jindex[iidx+1];
164 /* Get outer coordinate index */
165 inr = iinr[iidx];
166 i_coord_offset = DIM*inr;
168 /* Load i particle coords and add shift vector */
169 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
170 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
172 fix0 = _mm_setzero_pd();
173 fiy0 = _mm_setzero_pd();
174 fiz0 = _mm_setzero_pd();
175 fix1 = _mm_setzero_pd();
176 fiy1 = _mm_setzero_pd();
177 fiz1 = _mm_setzero_pd();
178 fix2 = _mm_setzero_pd();
179 fiy2 = _mm_setzero_pd();
180 fiz2 = _mm_setzero_pd();
182 /* Reset potential sums */
183 velecsum = _mm_setzero_pd();
184 vvdwsum = _mm_setzero_pd();
186 /* Start inner kernel loop */
187 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
190 /* Get j neighbor index, and coordinate index */
191 jnrA = jjnr[jidx];
192 jnrB = jjnr[jidx+1];
193 j_coord_offsetA = DIM*jnrA;
194 j_coord_offsetB = DIM*jnrB;
196 /* load j atom coordinates */
197 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
198 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
200 /* Calculate displacement vector */
201 dx00 = _mm_sub_pd(ix0,jx0);
202 dy00 = _mm_sub_pd(iy0,jy0);
203 dz00 = _mm_sub_pd(iz0,jz0);
204 dx01 = _mm_sub_pd(ix0,jx1);
205 dy01 = _mm_sub_pd(iy0,jy1);
206 dz01 = _mm_sub_pd(iz0,jz1);
207 dx02 = _mm_sub_pd(ix0,jx2);
208 dy02 = _mm_sub_pd(iy0,jy2);
209 dz02 = _mm_sub_pd(iz0,jz2);
210 dx10 = _mm_sub_pd(ix1,jx0);
211 dy10 = _mm_sub_pd(iy1,jy0);
212 dz10 = _mm_sub_pd(iz1,jz0);
213 dx11 = _mm_sub_pd(ix1,jx1);
214 dy11 = _mm_sub_pd(iy1,jy1);
215 dz11 = _mm_sub_pd(iz1,jz1);
216 dx12 = _mm_sub_pd(ix1,jx2);
217 dy12 = _mm_sub_pd(iy1,jy2);
218 dz12 = _mm_sub_pd(iz1,jz2);
219 dx20 = _mm_sub_pd(ix2,jx0);
220 dy20 = _mm_sub_pd(iy2,jy0);
221 dz20 = _mm_sub_pd(iz2,jz0);
222 dx21 = _mm_sub_pd(ix2,jx1);
223 dy21 = _mm_sub_pd(iy2,jy1);
224 dz21 = _mm_sub_pd(iz2,jz1);
225 dx22 = _mm_sub_pd(ix2,jx2);
226 dy22 = _mm_sub_pd(iy2,jy2);
227 dz22 = _mm_sub_pd(iz2,jz2);
229 /* Calculate squared distance and things based on it */
230 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
231 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
232 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
233 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
234 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
235 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
236 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
237 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
238 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
240 rinv00 = gmx_mm_invsqrt_pd(rsq00);
241 rinv01 = gmx_mm_invsqrt_pd(rsq01);
242 rinv02 = gmx_mm_invsqrt_pd(rsq02);
243 rinv10 = gmx_mm_invsqrt_pd(rsq10);
244 rinv11 = gmx_mm_invsqrt_pd(rsq11);
245 rinv12 = gmx_mm_invsqrt_pd(rsq12);
246 rinv20 = gmx_mm_invsqrt_pd(rsq20);
247 rinv21 = gmx_mm_invsqrt_pd(rsq21);
248 rinv22 = gmx_mm_invsqrt_pd(rsq22);
250 fjx0 = _mm_setzero_pd();
251 fjy0 = _mm_setzero_pd();
252 fjz0 = _mm_setzero_pd();
253 fjx1 = _mm_setzero_pd();
254 fjy1 = _mm_setzero_pd();
255 fjz1 = _mm_setzero_pd();
256 fjx2 = _mm_setzero_pd();
257 fjy2 = _mm_setzero_pd();
258 fjz2 = _mm_setzero_pd();
260 /**************************
261 * CALCULATE INTERACTIONS *
262 **************************/
264 r00 = _mm_mul_pd(rsq00,rinv00);
266 /* Calculate table index by multiplying r with table scale and truncate to integer */
267 rt = _mm_mul_pd(r00,vftabscale);
268 vfitab = _mm_cvttpd_epi32(rt);
269 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
270 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
272 /* CUBIC SPLINE TABLE ELECTROSTATICS */
273 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
274 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
275 GMX_MM_TRANSPOSE2_PD(Y,F);
276 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
277 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
278 GMX_MM_TRANSPOSE2_PD(G,H);
279 Heps = _mm_mul_pd(vfeps,H);
280 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
281 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
282 velec = _mm_mul_pd(qq00,VV);
283 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
284 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
286 /* CUBIC SPLINE TABLE DISPERSION */
287 vfitab = _mm_add_epi32(vfitab,ifour);
288 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
289 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
290 GMX_MM_TRANSPOSE2_PD(Y,F);
291 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
292 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
293 GMX_MM_TRANSPOSE2_PD(G,H);
294 Heps = _mm_mul_pd(vfeps,H);
295 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
296 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
297 vvdw6 = _mm_mul_pd(c6_00,VV);
298 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
299 fvdw6 = _mm_mul_pd(c6_00,FF);
301 /* CUBIC SPLINE TABLE REPULSION */
302 vfitab = _mm_add_epi32(vfitab,ifour);
303 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
304 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
305 GMX_MM_TRANSPOSE2_PD(Y,F);
306 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
307 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
308 GMX_MM_TRANSPOSE2_PD(G,H);
309 Heps = _mm_mul_pd(vfeps,H);
310 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
311 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
312 vvdw12 = _mm_mul_pd(c12_00,VV);
313 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
314 fvdw12 = _mm_mul_pd(c12_00,FF);
315 vvdw = _mm_add_pd(vvdw12,vvdw6);
316 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
318 /* Update potential sum for this i atom from the interaction with this j atom. */
319 velecsum = _mm_add_pd(velecsum,velec);
320 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
322 fscal = _mm_add_pd(felec,fvdw);
324 /* Calculate temporary vectorial force */
325 tx = _mm_mul_pd(fscal,dx00);
326 ty = _mm_mul_pd(fscal,dy00);
327 tz = _mm_mul_pd(fscal,dz00);
329 /* Update vectorial force */
330 fix0 = _mm_add_pd(fix0,tx);
331 fiy0 = _mm_add_pd(fiy0,ty);
332 fiz0 = _mm_add_pd(fiz0,tz);
334 fjx0 = _mm_add_pd(fjx0,tx);
335 fjy0 = _mm_add_pd(fjy0,ty);
336 fjz0 = _mm_add_pd(fjz0,tz);
338 /**************************
339 * CALCULATE INTERACTIONS *
340 **************************/
342 r01 = _mm_mul_pd(rsq01,rinv01);
344 /* Calculate table index by multiplying r with table scale and truncate to integer */
345 rt = _mm_mul_pd(r01,vftabscale);
346 vfitab = _mm_cvttpd_epi32(rt);
347 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
348 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
350 /* CUBIC SPLINE TABLE ELECTROSTATICS */
351 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
352 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
353 GMX_MM_TRANSPOSE2_PD(Y,F);
354 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
355 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
356 GMX_MM_TRANSPOSE2_PD(G,H);
357 Heps = _mm_mul_pd(vfeps,H);
358 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
359 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
360 velec = _mm_mul_pd(qq01,VV);
361 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
362 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
364 /* Update potential sum for this i atom from the interaction with this j atom. */
365 velecsum = _mm_add_pd(velecsum,velec);
367 fscal = felec;
369 /* Calculate temporary vectorial force */
370 tx = _mm_mul_pd(fscal,dx01);
371 ty = _mm_mul_pd(fscal,dy01);
372 tz = _mm_mul_pd(fscal,dz01);
374 /* Update vectorial force */
375 fix0 = _mm_add_pd(fix0,tx);
376 fiy0 = _mm_add_pd(fiy0,ty);
377 fiz0 = _mm_add_pd(fiz0,tz);
379 fjx1 = _mm_add_pd(fjx1,tx);
380 fjy1 = _mm_add_pd(fjy1,ty);
381 fjz1 = _mm_add_pd(fjz1,tz);
383 /**************************
384 * CALCULATE INTERACTIONS *
385 **************************/
387 r02 = _mm_mul_pd(rsq02,rinv02);
389 /* Calculate table index by multiplying r with table scale and truncate to integer */
390 rt = _mm_mul_pd(r02,vftabscale);
391 vfitab = _mm_cvttpd_epi32(rt);
392 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
393 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
395 /* CUBIC SPLINE TABLE ELECTROSTATICS */
396 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
397 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
398 GMX_MM_TRANSPOSE2_PD(Y,F);
399 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
400 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
401 GMX_MM_TRANSPOSE2_PD(G,H);
402 Heps = _mm_mul_pd(vfeps,H);
403 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
404 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
405 velec = _mm_mul_pd(qq02,VV);
406 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
407 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
409 /* Update potential sum for this i atom from the interaction with this j atom. */
410 velecsum = _mm_add_pd(velecsum,velec);
412 fscal = felec;
414 /* Calculate temporary vectorial force */
415 tx = _mm_mul_pd(fscal,dx02);
416 ty = _mm_mul_pd(fscal,dy02);
417 tz = _mm_mul_pd(fscal,dz02);
419 /* Update vectorial force */
420 fix0 = _mm_add_pd(fix0,tx);
421 fiy0 = _mm_add_pd(fiy0,ty);
422 fiz0 = _mm_add_pd(fiz0,tz);
424 fjx2 = _mm_add_pd(fjx2,tx);
425 fjy2 = _mm_add_pd(fjy2,ty);
426 fjz2 = _mm_add_pd(fjz2,tz);
428 /**************************
429 * CALCULATE INTERACTIONS *
430 **************************/
432 r10 = _mm_mul_pd(rsq10,rinv10);
434 /* Calculate table index by multiplying r with table scale and truncate to integer */
435 rt = _mm_mul_pd(r10,vftabscale);
436 vfitab = _mm_cvttpd_epi32(rt);
437 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
438 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
440 /* CUBIC SPLINE TABLE ELECTROSTATICS */
441 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
442 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
443 GMX_MM_TRANSPOSE2_PD(Y,F);
444 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
445 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
446 GMX_MM_TRANSPOSE2_PD(G,H);
447 Heps = _mm_mul_pd(vfeps,H);
448 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
449 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
450 velec = _mm_mul_pd(qq10,VV);
451 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
452 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
454 /* Update potential sum for this i atom from the interaction with this j atom. */
455 velecsum = _mm_add_pd(velecsum,velec);
457 fscal = felec;
459 /* Calculate temporary vectorial force */
460 tx = _mm_mul_pd(fscal,dx10);
461 ty = _mm_mul_pd(fscal,dy10);
462 tz = _mm_mul_pd(fscal,dz10);
464 /* Update vectorial force */
465 fix1 = _mm_add_pd(fix1,tx);
466 fiy1 = _mm_add_pd(fiy1,ty);
467 fiz1 = _mm_add_pd(fiz1,tz);
469 fjx0 = _mm_add_pd(fjx0,tx);
470 fjy0 = _mm_add_pd(fjy0,ty);
471 fjz0 = _mm_add_pd(fjz0,tz);
473 /**************************
474 * CALCULATE INTERACTIONS *
475 **************************/
477 r11 = _mm_mul_pd(rsq11,rinv11);
479 /* Calculate table index by multiplying r with table scale and truncate to integer */
480 rt = _mm_mul_pd(r11,vftabscale);
481 vfitab = _mm_cvttpd_epi32(rt);
482 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
483 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
485 /* CUBIC SPLINE TABLE ELECTROSTATICS */
486 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
487 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
488 GMX_MM_TRANSPOSE2_PD(Y,F);
489 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
490 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
491 GMX_MM_TRANSPOSE2_PD(G,H);
492 Heps = _mm_mul_pd(vfeps,H);
493 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
494 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
495 velec = _mm_mul_pd(qq11,VV);
496 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
497 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
499 /* Update potential sum for this i atom from the interaction with this j atom. */
500 velecsum = _mm_add_pd(velecsum,velec);
502 fscal = felec;
504 /* Calculate temporary vectorial force */
505 tx = _mm_mul_pd(fscal,dx11);
506 ty = _mm_mul_pd(fscal,dy11);
507 tz = _mm_mul_pd(fscal,dz11);
509 /* Update vectorial force */
510 fix1 = _mm_add_pd(fix1,tx);
511 fiy1 = _mm_add_pd(fiy1,ty);
512 fiz1 = _mm_add_pd(fiz1,tz);
514 fjx1 = _mm_add_pd(fjx1,tx);
515 fjy1 = _mm_add_pd(fjy1,ty);
516 fjz1 = _mm_add_pd(fjz1,tz);
518 /**************************
519 * CALCULATE INTERACTIONS *
520 **************************/
522 r12 = _mm_mul_pd(rsq12,rinv12);
524 /* Calculate table index by multiplying r with table scale and truncate to integer */
525 rt = _mm_mul_pd(r12,vftabscale);
526 vfitab = _mm_cvttpd_epi32(rt);
527 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
528 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
530 /* CUBIC SPLINE TABLE ELECTROSTATICS */
531 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
532 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
533 GMX_MM_TRANSPOSE2_PD(Y,F);
534 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
535 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
536 GMX_MM_TRANSPOSE2_PD(G,H);
537 Heps = _mm_mul_pd(vfeps,H);
538 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
539 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
540 velec = _mm_mul_pd(qq12,VV);
541 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
542 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
544 /* Update potential sum for this i atom from the interaction with this j atom. */
545 velecsum = _mm_add_pd(velecsum,velec);
547 fscal = felec;
549 /* Calculate temporary vectorial force */
550 tx = _mm_mul_pd(fscal,dx12);
551 ty = _mm_mul_pd(fscal,dy12);
552 tz = _mm_mul_pd(fscal,dz12);
554 /* Update vectorial force */
555 fix1 = _mm_add_pd(fix1,tx);
556 fiy1 = _mm_add_pd(fiy1,ty);
557 fiz1 = _mm_add_pd(fiz1,tz);
559 fjx2 = _mm_add_pd(fjx2,tx);
560 fjy2 = _mm_add_pd(fjy2,ty);
561 fjz2 = _mm_add_pd(fjz2,tz);
563 /**************************
564 * CALCULATE INTERACTIONS *
565 **************************/
567 r20 = _mm_mul_pd(rsq20,rinv20);
569 /* Calculate table index by multiplying r with table scale and truncate to integer */
570 rt = _mm_mul_pd(r20,vftabscale);
571 vfitab = _mm_cvttpd_epi32(rt);
572 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
573 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
575 /* CUBIC SPLINE TABLE ELECTROSTATICS */
576 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
577 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
578 GMX_MM_TRANSPOSE2_PD(Y,F);
579 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
580 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
581 GMX_MM_TRANSPOSE2_PD(G,H);
582 Heps = _mm_mul_pd(vfeps,H);
583 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
584 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
585 velec = _mm_mul_pd(qq20,VV);
586 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
587 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
589 /* Update potential sum for this i atom from the interaction with this j atom. */
590 velecsum = _mm_add_pd(velecsum,velec);
592 fscal = felec;
594 /* Calculate temporary vectorial force */
595 tx = _mm_mul_pd(fscal,dx20);
596 ty = _mm_mul_pd(fscal,dy20);
597 tz = _mm_mul_pd(fscal,dz20);
599 /* Update vectorial force */
600 fix2 = _mm_add_pd(fix2,tx);
601 fiy2 = _mm_add_pd(fiy2,ty);
602 fiz2 = _mm_add_pd(fiz2,tz);
604 fjx0 = _mm_add_pd(fjx0,tx);
605 fjy0 = _mm_add_pd(fjy0,ty);
606 fjz0 = _mm_add_pd(fjz0,tz);
608 /**************************
609 * CALCULATE INTERACTIONS *
610 **************************/
612 r21 = _mm_mul_pd(rsq21,rinv21);
614 /* Calculate table index by multiplying r with table scale and truncate to integer */
615 rt = _mm_mul_pd(r21,vftabscale);
616 vfitab = _mm_cvttpd_epi32(rt);
617 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
618 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
620 /* CUBIC SPLINE TABLE ELECTROSTATICS */
621 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
622 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
623 GMX_MM_TRANSPOSE2_PD(Y,F);
624 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
625 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
626 GMX_MM_TRANSPOSE2_PD(G,H);
627 Heps = _mm_mul_pd(vfeps,H);
628 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
629 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
630 velec = _mm_mul_pd(qq21,VV);
631 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
632 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
634 /* Update potential sum for this i atom from the interaction with this j atom. */
635 velecsum = _mm_add_pd(velecsum,velec);
637 fscal = felec;
639 /* Calculate temporary vectorial force */
640 tx = _mm_mul_pd(fscal,dx21);
641 ty = _mm_mul_pd(fscal,dy21);
642 tz = _mm_mul_pd(fscal,dz21);
644 /* Update vectorial force */
645 fix2 = _mm_add_pd(fix2,tx);
646 fiy2 = _mm_add_pd(fiy2,ty);
647 fiz2 = _mm_add_pd(fiz2,tz);
649 fjx1 = _mm_add_pd(fjx1,tx);
650 fjy1 = _mm_add_pd(fjy1,ty);
651 fjz1 = _mm_add_pd(fjz1,tz);
653 /**************************
654 * CALCULATE INTERACTIONS *
655 **************************/
657 r22 = _mm_mul_pd(rsq22,rinv22);
659 /* Calculate table index by multiplying r with table scale and truncate to integer */
660 rt = _mm_mul_pd(r22,vftabscale);
661 vfitab = _mm_cvttpd_epi32(rt);
662 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
663 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
665 /* CUBIC SPLINE TABLE ELECTROSTATICS */
666 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
667 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
668 GMX_MM_TRANSPOSE2_PD(Y,F);
669 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
670 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
671 GMX_MM_TRANSPOSE2_PD(G,H);
672 Heps = _mm_mul_pd(vfeps,H);
673 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
674 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
675 velec = _mm_mul_pd(qq22,VV);
676 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
677 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
679 /* Update potential sum for this i atom from the interaction with this j atom. */
680 velecsum = _mm_add_pd(velecsum,velec);
682 fscal = felec;
684 /* Calculate temporary vectorial force */
685 tx = _mm_mul_pd(fscal,dx22);
686 ty = _mm_mul_pd(fscal,dy22);
687 tz = _mm_mul_pd(fscal,dz22);
689 /* Update vectorial force */
690 fix2 = _mm_add_pd(fix2,tx);
691 fiy2 = _mm_add_pd(fiy2,ty);
692 fiz2 = _mm_add_pd(fiz2,tz);
694 fjx2 = _mm_add_pd(fjx2,tx);
695 fjy2 = _mm_add_pd(fjy2,ty);
696 fjz2 = _mm_add_pd(fjz2,tz);
698 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
700 /* Inner loop uses 417 flops */
703 if(jidx<j_index_end)
706 jnrA = jjnr[jidx];
707 j_coord_offsetA = DIM*jnrA;
709 /* load j atom coordinates */
710 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
711 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
713 /* Calculate displacement vector */
714 dx00 = _mm_sub_pd(ix0,jx0);
715 dy00 = _mm_sub_pd(iy0,jy0);
716 dz00 = _mm_sub_pd(iz0,jz0);
717 dx01 = _mm_sub_pd(ix0,jx1);
718 dy01 = _mm_sub_pd(iy0,jy1);
719 dz01 = _mm_sub_pd(iz0,jz1);
720 dx02 = _mm_sub_pd(ix0,jx2);
721 dy02 = _mm_sub_pd(iy0,jy2);
722 dz02 = _mm_sub_pd(iz0,jz2);
723 dx10 = _mm_sub_pd(ix1,jx0);
724 dy10 = _mm_sub_pd(iy1,jy0);
725 dz10 = _mm_sub_pd(iz1,jz0);
726 dx11 = _mm_sub_pd(ix1,jx1);
727 dy11 = _mm_sub_pd(iy1,jy1);
728 dz11 = _mm_sub_pd(iz1,jz1);
729 dx12 = _mm_sub_pd(ix1,jx2);
730 dy12 = _mm_sub_pd(iy1,jy2);
731 dz12 = _mm_sub_pd(iz1,jz2);
732 dx20 = _mm_sub_pd(ix2,jx0);
733 dy20 = _mm_sub_pd(iy2,jy0);
734 dz20 = _mm_sub_pd(iz2,jz0);
735 dx21 = _mm_sub_pd(ix2,jx1);
736 dy21 = _mm_sub_pd(iy2,jy1);
737 dz21 = _mm_sub_pd(iz2,jz1);
738 dx22 = _mm_sub_pd(ix2,jx2);
739 dy22 = _mm_sub_pd(iy2,jy2);
740 dz22 = _mm_sub_pd(iz2,jz2);
742 /* Calculate squared distance and things based on it */
743 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
744 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
745 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
746 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
747 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
748 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
749 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
750 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
751 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
753 rinv00 = gmx_mm_invsqrt_pd(rsq00);
754 rinv01 = gmx_mm_invsqrt_pd(rsq01);
755 rinv02 = gmx_mm_invsqrt_pd(rsq02);
756 rinv10 = gmx_mm_invsqrt_pd(rsq10);
757 rinv11 = gmx_mm_invsqrt_pd(rsq11);
758 rinv12 = gmx_mm_invsqrt_pd(rsq12);
759 rinv20 = gmx_mm_invsqrt_pd(rsq20);
760 rinv21 = gmx_mm_invsqrt_pd(rsq21);
761 rinv22 = gmx_mm_invsqrt_pd(rsq22);
763 fjx0 = _mm_setzero_pd();
764 fjy0 = _mm_setzero_pd();
765 fjz0 = _mm_setzero_pd();
766 fjx1 = _mm_setzero_pd();
767 fjy1 = _mm_setzero_pd();
768 fjz1 = _mm_setzero_pd();
769 fjx2 = _mm_setzero_pd();
770 fjy2 = _mm_setzero_pd();
771 fjz2 = _mm_setzero_pd();
773 /**************************
774 * CALCULATE INTERACTIONS *
775 **************************/
777 r00 = _mm_mul_pd(rsq00,rinv00);
779 /* Calculate table index by multiplying r with table scale and truncate to integer */
780 rt = _mm_mul_pd(r00,vftabscale);
781 vfitab = _mm_cvttpd_epi32(rt);
782 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
783 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
785 /* CUBIC SPLINE TABLE ELECTROSTATICS */
786 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
787 F = _mm_setzero_pd();
788 GMX_MM_TRANSPOSE2_PD(Y,F);
789 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
790 H = _mm_setzero_pd();
791 GMX_MM_TRANSPOSE2_PD(G,H);
792 Heps = _mm_mul_pd(vfeps,H);
793 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
794 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
795 velec = _mm_mul_pd(qq00,VV);
796 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
797 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
799 /* CUBIC SPLINE TABLE DISPERSION */
800 vfitab = _mm_add_epi32(vfitab,ifour);
801 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
802 F = _mm_setzero_pd();
803 GMX_MM_TRANSPOSE2_PD(Y,F);
804 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
805 H = _mm_setzero_pd();
806 GMX_MM_TRANSPOSE2_PD(G,H);
807 Heps = _mm_mul_pd(vfeps,H);
808 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
809 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
810 vvdw6 = _mm_mul_pd(c6_00,VV);
811 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
812 fvdw6 = _mm_mul_pd(c6_00,FF);
814 /* CUBIC SPLINE TABLE REPULSION */
815 vfitab = _mm_add_epi32(vfitab,ifour);
816 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
817 F = _mm_setzero_pd();
818 GMX_MM_TRANSPOSE2_PD(Y,F);
819 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
820 H = _mm_setzero_pd();
821 GMX_MM_TRANSPOSE2_PD(G,H);
822 Heps = _mm_mul_pd(vfeps,H);
823 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
824 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
825 vvdw12 = _mm_mul_pd(c12_00,VV);
826 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
827 fvdw12 = _mm_mul_pd(c12_00,FF);
828 vvdw = _mm_add_pd(vvdw12,vvdw6);
829 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
831 /* Update potential sum for this i atom from the interaction with this j atom. */
832 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
833 velecsum = _mm_add_pd(velecsum,velec);
834 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
835 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
837 fscal = _mm_add_pd(felec,fvdw);
839 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
841 /* Calculate temporary vectorial force */
842 tx = _mm_mul_pd(fscal,dx00);
843 ty = _mm_mul_pd(fscal,dy00);
844 tz = _mm_mul_pd(fscal,dz00);
846 /* Update vectorial force */
847 fix0 = _mm_add_pd(fix0,tx);
848 fiy0 = _mm_add_pd(fiy0,ty);
849 fiz0 = _mm_add_pd(fiz0,tz);
851 fjx0 = _mm_add_pd(fjx0,tx);
852 fjy0 = _mm_add_pd(fjy0,ty);
853 fjz0 = _mm_add_pd(fjz0,tz);
855 /**************************
856 * CALCULATE INTERACTIONS *
857 **************************/
859 r01 = _mm_mul_pd(rsq01,rinv01);
861 /* Calculate table index by multiplying r with table scale and truncate to integer */
862 rt = _mm_mul_pd(r01,vftabscale);
863 vfitab = _mm_cvttpd_epi32(rt);
864 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
865 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
867 /* CUBIC SPLINE TABLE ELECTROSTATICS */
868 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
869 F = _mm_setzero_pd();
870 GMX_MM_TRANSPOSE2_PD(Y,F);
871 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
872 H = _mm_setzero_pd();
873 GMX_MM_TRANSPOSE2_PD(G,H);
874 Heps = _mm_mul_pd(vfeps,H);
875 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
876 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
877 velec = _mm_mul_pd(qq01,VV);
878 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
879 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
881 /* Update potential sum for this i atom from the interaction with this j atom. */
882 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
883 velecsum = _mm_add_pd(velecsum,velec);
885 fscal = felec;
887 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
889 /* Calculate temporary vectorial force */
890 tx = _mm_mul_pd(fscal,dx01);
891 ty = _mm_mul_pd(fscal,dy01);
892 tz = _mm_mul_pd(fscal,dz01);
894 /* Update vectorial force */
895 fix0 = _mm_add_pd(fix0,tx);
896 fiy0 = _mm_add_pd(fiy0,ty);
897 fiz0 = _mm_add_pd(fiz0,tz);
899 fjx1 = _mm_add_pd(fjx1,tx);
900 fjy1 = _mm_add_pd(fjy1,ty);
901 fjz1 = _mm_add_pd(fjz1,tz);
903 /**************************
904 * CALCULATE INTERACTIONS *
905 **************************/
907 r02 = _mm_mul_pd(rsq02,rinv02);
909 /* Calculate table index by multiplying r with table scale and truncate to integer */
910 rt = _mm_mul_pd(r02,vftabscale);
911 vfitab = _mm_cvttpd_epi32(rt);
912 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
913 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
915 /* CUBIC SPLINE TABLE ELECTROSTATICS */
916 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
917 F = _mm_setzero_pd();
918 GMX_MM_TRANSPOSE2_PD(Y,F);
919 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
920 H = _mm_setzero_pd();
921 GMX_MM_TRANSPOSE2_PD(G,H);
922 Heps = _mm_mul_pd(vfeps,H);
923 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
924 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
925 velec = _mm_mul_pd(qq02,VV);
926 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
927 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
929 /* Update potential sum for this i atom from the interaction with this j atom. */
930 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
931 velecsum = _mm_add_pd(velecsum,velec);
933 fscal = felec;
935 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
937 /* Calculate temporary vectorial force */
938 tx = _mm_mul_pd(fscal,dx02);
939 ty = _mm_mul_pd(fscal,dy02);
940 tz = _mm_mul_pd(fscal,dz02);
942 /* Update vectorial force */
943 fix0 = _mm_add_pd(fix0,tx);
944 fiy0 = _mm_add_pd(fiy0,ty);
945 fiz0 = _mm_add_pd(fiz0,tz);
947 fjx2 = _mm_add_pd(fjx2,tx);
948 fjy2 = _mm_add_pd(fjy2,ty);
949 fjz2 = _mm_add_pd(fjz2,tz);
951 /**************************
952 * CALCULATE INTERACTIONS *
953 **************************/
955 r10 = _mm_mul_pd(rsq10,rinv10);
957 /* Calculate table index by multiplying r with table scale and truncate to integer */
958 rt = _mm_mul_pd(r10,vftabscale);
959 vfitab = _mm_cvttpd_epi32(rt);
960 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
961 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
963 /* CUBIC SPLINE TABLE ELECTROSTATICS */
964 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
965 F = _mm_setzero_pd();
966 GMX_MM_TRANSPOSE2_PD(Y,F);
967 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
968 H = _mm_setzero_pd();
969 GMX_MM_TRANSPOSE2_PD(G,H);
970 Heps = _mm_mul_pd(vfeps,H);
971 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
972 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
973 velec = _mm_mul_pd(qq10,VV);
974 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
975 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
977 /* Update potential sum for this i atom from the interaction with this j atom. */
978 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
979 velecsum = _mm_add_pd(velecsum,velec);
981 fscal = felec;
983 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
985 /* Calculate temporary vectorial force */
986 tx = _mm_mul_pd(fscal,dx10);
987 ty = _mm_mul_pd(fscal,dy10);
988 tz = _mm_mul_pd(fscal,dz10);
990 /* Update vectorial force */
991 fix1 = _mm_add_pd(fix1,tx);
992 fiy1 = _mm_add_pd(fiy1,ty);
993 fiz1 = _mm_add_pd(fiz1,tz);
995 fjx0 = _mm_add_pd(fjx0,tx);
996 fjy0 = _mm_add_pd(fjy0,ty);
997 fjz0 = _mm_add_pd(fjz0,tz);
999 /**************************
1000 * CALCULATE INTERACTIONS *
1001 **************************/
1003 r11 = _mm_mul_pd(rsq11,rinv11);
1005 /* Calculate table index by multiplying r with table scale and truncate to integer */
1006 rt = _mm_mul_pd(r11,vftabscale);
1007 vfitab = _mm_cvttpd_epi32(rt);
1008 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1009 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1011 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1012 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1013 F = _mm_setzero_pd();
1014 GMX_MM_TRANSPOSE2_PD(Y,F);
1015 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1016 H = _mm_setzero_pd();
1017 GMX_MM_TRANSPOSE2_PD(G,H);
1018 Heps = _mm_mul_pd(vfeps,H);
1019 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1020 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1021 velec = _mm_mul_pd(qq11,VV);
1022 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1023 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
1025 /* Update potential sum for this i atom from the interaction with this j atom. */
1026 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1027 velecsum = _mm_add_pd(velecsum,velec);
1029 fscal = felec;
1031 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1033 /* Calculate temporary vectorial force */
1034 tx = _mm_mul_pd(fscal,dx11);
1035 ty = _mm_mul_pd(fscal,dy11);
1036 tz = _mm_mul_pd(fscal,dz11);
1038 /* Update vectorial force */
1039 fix1 = _mm_add_pd(fix1,tx);
1040 fiy1 = _mm_add_pd(fiy1,ty);
1041 fiz1 = _mm_add_pd(fiz1,tz);
1043 fjx1 = _mm_add_pd(fjx1,tx);
1044 fjy1 = _mm_add_pd(fjy1,ty);
1045 fjz1 = _mm_add_pd(fjz1,tz);
1047 /**************************
1048 * CALCULATE INTERACTIONS *
1049 **************************/
1051 r12 = _mm_mul_pd(rsq12,rinv12);
1053 /* Calculate table index by multiplying r with table scale and truncate to integer */
1054 rt = _mm_mul_pd(r12,vftabscale);
1055 vfitab = _mm_cvttpd_epi32(rt);
1056 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1057 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1059 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1060 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1061 F = _mm_setzero_pd();
1062 GMX_MM_TRANSPOSE2_PD(Y,F);
1063 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1064 H = _mm_setzero_pd();
1065 GMX_MM_TRANSPOSE2_PD(G,H);
1066 Heps = _mm_mul_pd(vfeps,H);
1067 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1068 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1069 velec = _mm_mul_pd(qq12,VV);
1070 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1071 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1073 /* Update potential sum for this i atom from the interaction with this j atom. */
1074 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1075 velecsum = _mm_add_pd(velecsum,velec);
1077 fscal = felec;
1079 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1081 /* Calculate temporary vectorial force */
1082 tx = _mm_mul_pd(fscal,dx12);
1083 ty = _mm_mul_pd(fscal,dy12);
1084 tz = _mm_mul_pd(fscal,dz12);
1086 /* Update vectorial force */
1087 fix1 = _mm_add_pd(fix1,tx);
1088 fiy1 = _mm_add_pd(fiy1,ty);
1089 fiz1 = _mm_add_pd(fiz1,tz);
1091 fjx2 = _mm_add_pd(fjx2,tx);
1092 fjy2 = _mm_add_pd(fjy2,ty);
1093 fjz2 = _mm_add_pd(fjz2,tz);
1095 /**************************
1096 * CALCULATE INTERACTIONS *
1097 **************************/
1099 r20 = _mm_mul_pd(rsq20,rinv20);
1101 /* Calculate table index by multiplying r with table scale and truncate to integer */
1102 rt = _mm_mul_pd(r20,vftabscale);
1103 vfitab = _mm_cvttpd_epi32(rt);
1104 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1105 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1107 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1108 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1109 F = _mm_setzero_pd();
1110 GMX_MM_TRANSPOSE2_PD(Y,F);
1111 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1112 H = _mm_setzero_pd();
1113 GMX_MM_TRANSPOSE2_PD(G,H);
1114 Heps = _mm_mul_pd(vfeps,H);
1115 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1116 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1117 velec = _mm_mul_pd(qq20,VV);
1118 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1119 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
1121 /* Update potential sum for this i atom from the interaction with this j atom. */
1122 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1123 velecsum = _mm_add_pd(velecsum,velec);
1125 fscal = felec;
1127 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1129 /* Calculate temporary vectorial force */
1130 tx = _mm_mul_pd(fscal,dx20);
1131 ty = _mm_mul_pd(fscal,dy20);
1132 tz = _mm_mul_pd(fscal,dz20);
1134 /* Update vectorial force */
1135 fix2 = _mm_add_pd(fix2,tx);
1136 fiy2 = _mm_add_pd(fiy2,ty);
1137 fiz2 = _mm_add_pd(fiz2,tz);
1139 fjx0 = _mm_add_pd(fjx0,tx);
1140 fjy0 = _mm_add_pd(fjy0,ty);
1141 fjz0 = _mm_add_pd(fjz0,tz);
1143 /**************************
1144 * CALCULATE INTERACTIONS *
1145 **************************/
1147 r21 = _mm_mul_pd(rsq21,rinv21);
1149 /* Calculate table index by multiplying r with table scale and truncate to integer */
1150 rt = _mm_mul_pd(r21,vftabscale);
1151 vfitab = _mm_cvttpd_epi32(rt);
1152 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1153 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1155 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1156 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1157 F = _mm_setzero_pd();
1158 GMX_MM_TRANSPOSE2_PD(Y,F);
1159 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1160 H = _mm_setzero_pd();
1161 GMX_MM_TRANSPOSE2_PD(G,H);
1162 Heps = _mm_mul_pd(vfeps,H);
1163 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1164 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1165 velec = _mm_mul_pd(qq21,VV);
1166 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1167 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1169 /* Update potential sum for this i atom from the interaction with this j atom. */
1170 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1171 velecsum = _mm_add_pd(velecsum,velec);
1173 fscal = felec;
1175 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1177 /* Calculate temporary vectorial force */
1178 tx = _mm_mul_pd(fscal,dx21);
1179 ty = _mm_mul_pd(fscal,dy21);
1180 tz = _mm_mul_pd(fscal,dz21);
1182 /* Update vectorial force */
1183 fix2 = _mm_add_pd(fix2,tx);
1184 fiy2 = _mm_add_pd(fiy2,ty);
1185 fiz2 = _mm_add_pd(fiz2,tz);
1187 fjx1 = _mm_add_pd(fjx1,tx);
1188 fjy1 = _mm_add_pd(fjy1,ty);
1189 fjz1 = _mm_add_pd(fjz1,tz);
1191 /**************************
1192 * CALCULATE INTERACTIONS *
1193 **************************/
1195 r22 = _mm_mul_pd(rsq22,rinv22);
1197 /* Calculate table index by multiplying r with table scale and truncate to integer */
1198 rt = _mm_mul_pd(r22,vftabscale);
1199 vfitab = _mm_cvttpd_epi32(rt);
1200 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1201 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1203 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1204 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1205 F = _mm_setzero_pd();
1206 GMX_MM_TRANSPOSE2_PD(Y,F);
1207 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1208 H = _mm_setzero_pd();
1209 GMX_MM_TRANSPOSE2_PD(G,H);
1210 Heps = _mm_mul_pd(vfeps,H);
1211 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1212 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1213 velec = _mm_mul_pd(qq22,VV);
1214 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1215 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1217 /* Update potential sum for this i atom from the interaction with this j atom. */
1218 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1219 velecsum = _mm_add_pd(velecsum,velec);
1221 fscal = felec;
1223 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1225 /* Calculate temporary vectorial force */
1226 tx = _mm_mul_pd(fscal,dx22);
1227 ty = _mm_mul_pd(fscal,dy22);
1228 tz = _mm_mul_pd(fscal,dz22);
1230 /* Update vectorial force */
1231 fix2 = _mm_add_pd(fix2,tx);
1232 fiy2 = _mm_add_pd(fiy2,ty);
1233 fiz2 = _mm_add_pd(fiz2,tz);
1235 fjx2 = _mm_add_pd(fjx2,tx);
1236 fjy2 = _mm_add_pd(fjy2,ty);
1237 fjz2 = _mm_add_pd(fjz2,tz);
1239 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1241 /* Inner loop uses 417 flops */
1244 /* End of innermost loop */
1246 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1247 f+i_coord_offset,fshift+i_shift_offset);
1249 ggid = gid[iidx];
1250 /* Update potential energies */
1251 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1252 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1254 /* Increment number of inner iterations */
1255 inneriter += j_index_end - j_index_start;
1257 /* Outer loop uses 20 flops */
1260 /* Increment number of outer iterations */
1261 outeriter += nri;
1263 /* Update outer/inner flops */
1265 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*417);
1268 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse2_double
1269 * Electrostatics interaction: CubicSplineTable
1270 * VdW interaction: CubicSplineTable
1271 * Geometry: Water3-Water3
1272 * Calculate force/pot: Force
1274 void
1275 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse2_double
1276 (t_nblist * gmx_restrict nlist,
1277 rvec * gmx_restrict xx,
1278 rvec * gmx_restrict ff,
1279 t_forcerec * gmx_restrict fr,
1280 t_mdatoms * gmx_restrict mdatoms,
1281 nb_kernel_data_t * gmx_restrict kernel_data,
1282 t_nrnb * gmx_restrict nrnb)
1284 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1285 * just 0 for non-waters.
1286 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1287 * jnr indices corresponding to data put in the four positions in the SIMD register.
1289 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1290 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1291 int jnrA,jnrB;
1292 int j_coord_offsetA,j_coord_offsetB;
1293 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1294 real rcutoff_scalar;
1295 real *shiftvec,*fshift,*x,*f;
1296 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1297 int vdwioffset0;
1298 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1299 int vdwioffset1;
1300 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1301 int vdwioffset2;
1302 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1303 int vdwjidx0A,vdwjidx0B;
1304 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1305 int vdwjidx1A,vdwjidx1B;
1306 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1307 int vdwjidx2A,vdwjidx2B;
1308 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1309 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1310 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1311 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1312 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1313 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1314 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1315 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1316 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1317 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1318 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1319 real *charge;
1320 int nvdwtype;
1321 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1322 int *vdwtype;
1323 real *vdwparam;
1324 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1325 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1326 __m128i vfitab;
1327 __m128i ifour = _mm_set1_epi32(4);
1328 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1329 real *vftab;
1330 __m128d dummy_mask,cutoff_mask;
1331 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1332 __m128d one = _mm_set1_pd(1.0);
1333 __m128d two = _mm_set1_pd(2.0);
1334 x = xx[0];
1335 f = ff[0];
1337 nri = nlist->nri;
1338 iinr = nlist->iinr;
1339 jindex = nlist->jindex;
1340 jjnr = nlist->jjnr;
1341 shiftidx = nlist->shift;
1342 gid = nlist->gid;
1343 shiftvec = fr->shift_vec[0];
1344 fshift = fr->fshift[0];
1345 facel = _mm_set1_pd(fr->epsfac);
1346 charge = mdatoms->chargeA;
1347 nvdwtype = fr->ntype;
1348 vdwparam = fr->nbfp;
1349 vdwtype = mdatoms->typeA;
1351 vftab = kernel_data->table_elec_vdw->data;
1352 vftabscale = _mm_set1_pd(kernel_data->table_elec_vdw->scale);
1354 /* Setup water-specific parameters */
1355 inr = nlist->iinr[0];
1356 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1357 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1358 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1359 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1361 jq0 = _mm_set1_pd(charge[inr+0]);
1362 jq1 = _mm_set1_pd(charge[inr+1]);
1363 jq2 = _mm_set1_pd(charge[inr+2]);
1364 vdwjidx0A = 2*vdwtype[inr+0];
1365 qq00 = _mm_mul_pd(iq0,jq0);
1366 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1367 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1368 qq01 = _mm_mul_pd(iq0,jq1);
1369 qq02 = _mm_mul_pd(iq0,jq2);
1370 qq10 = _mm_mul_pd(iq1,jq0);
1371 qq11 = _mm_mul_pd(iq1,jq1);
1372 qq12 = _mm_mul_pd(iq1,jq2);
1373 qq20 = _mm_mul_pd(iq2,jq0);
1374 qq21 = _mm_mul_pd(iq2,jq1);
1375 qq22 = _mm_mul_pd(iq2,jq2);
1377 /* Avoid stupid compiler warnings */
1378 jnrA = jnrB = 0;
1379 j_coord_offsetA = 0;
1380 j_coord_offsetB = 0;
1382 outeriter = 0;
1383 inneriter = 0;
1385 /* Start outer loop over neighborlists */
1386 for(iidx=0; iidx<nri; iidx++)
1388 /* Load shift vector for this list */
1389 i_shift_offset = DIM*shiftidx[iidx];
1391 /* Load limits for loop over neighbors */
1392 j_index_start = jindex[iidx];
1393 j_index_end = jindex[iidx+1];
1395 /* Get outer coordinate index */
1396 inr = iinr[iidx];
1397 i_coord_offset = DIM*inr;
1399 /* Load i particle coords and add shift vector */
1400 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1401 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1403 fix0 = _mm_setzero_pd();
1404 fiy0 = _mm_setzero_pd();
1405 fiz0 = _mm_setzero_pd();
1406 fix1 = _mm_setzero_pd();
1407 fiy1 = _mm_setzero_pd();
1408 fiz1 = _mm_setzero_pd();
1409 fix2 = _mm_setzero_pd();
1410 fiy2 = _mm_setzero_pd();
1411 fiz2 = _mm_setzero_pd();
1413 /* Start inner kernel loop */
1414 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1417 /* Get j neighbor index, and coordinate index */
1418 jnrA = jjnr[jidx];
1419 jnrB = jjnr[jidx+1];
1420 j_coord_offsetA = DIM*jnrA;
1421 j_coord_offsetB = DIM*jnrB;
1423 /* load j atom coordinates */
1424 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1425 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1427 /* Calculate displacement vector */
1428 dx00 = _mm_sub_pd(ix0,jx0);
1429 dy00 = _mm_sub_pd(iy0,jy0);
1430 dz00 = _mm_sub_pd(iz0,jz0);
1431 dx01 = _mm_sub_pd(ix0,jx1);
1432 dy01 = _mm_sub_pd(iy0,jy1);
1433 dz01 = _mm_sub_pd(iz0,jz1);
1434 dx02 = _mm_sub_pd(ix0,jx2);
1435 dy02 = _mm_sub_pd(iy0,jy2);
1436 dz02 = _mm_sub_pd(iz0,jz2);
1437 dx10 = _mm_sub_pd(ix1,jx0);
1438 dy10 = _mm_sub_pd(iy1,jy0);
1439 dz10 = _mm_sub_pd(iz1,jz0);
1440 dx11 = _mm_sub_pd(ix1,jx1);
1441 dy11 = _mm_sub_pd(iy1,jy1);
1442 dz11 = _mm_sub_pd(iz1,jz1);
1443 dx12 = _mm_sub_pd(ix1,jx2);
1444 dy12 = _mm_sub_pd(iy1,jy2);
1445 dz12 = _mm_sub_pd(iz1,jz2);
1446 dx20 = _mm_sub_pd(ix2,jx0);
1447 dy20 = _mm_sub_pd(iy2,jy0);
1448 dz20 = _mm_sub_pd(iz2,jz0);
1449 dx21 = _mm_sub_pd(ix2,jx1);
1450 dy21 = _mm_sub_pd(iy2,jy1);
1451 dz21 = _mm_sub_pd(iz2,jz1);
1452 dx22 = _mm_sub_pd(ix2,jx2);
1453 dy22 = _mm_sub_pd(iy2,jy2);
1454 dz22 = _mm_sub_pd(iz2,jz2);
1456 /* Calculate squared distance and things based on it */
1457 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1458 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1459 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1460 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1461 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1462 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1463 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1464 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1465 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1467 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1468 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1469 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1470 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1471 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1472 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1473 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1474 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1475 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1477 fjx0 = _mm_setzero_pd();
1478 fjy0 = _mm_setzero_pd();
1479 fjz0 = _mm_setzero_pd();
1480 fjx1 = _mm_setzero_pd();
1481 fjy1 = _mm_setzero_pd();
1482 fjz1 = _mm_setzero_pd();
1483 fjx2 = _mm_setzero_pd();
1484 fjy2 = _mm_setzero_pd();
1485 fjz2 = _mm_setzero_pd();
1487 /**************************
1488 * CALCULATE INTERACTIONS *
1489 **************************/
1491 r00 = _mm_mul_pd(rsq00,rinv00);
1493 /* Calculate table index by multiplying r with table scale and truncate to integer */
1494 rt = _mm_mul_pd(r00,vftabscale);
1495 vfitab = _mm_cvttpd_epi32(rt);
1496 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1497 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1499 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1500 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1501 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1502 GMX_MM_TRANSPOSE2_PD(Y,F);
1503 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1504 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1505 GMX_MM_TRANSPOSE2_PD(G,H);
1506 Heps = _mm_mul_pd(vfeps,H);
1507 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1508 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1509 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
1511 /* CUBIC SPLINE TABLE DISPERSION */
1512 vfitab = _mm_add_epi32(vfitab,ifour);
1513 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1514 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1515 GMX_MM_TRANSPOSE2_PD(Y,F);
1516 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1517 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1518 GMX_MM_TRANSPOSE2_PD(G,H);
1519 Heps = _mm_mul_pd(vfeps,H);
1520 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1521 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1522 fvdw6 = _mm_mul_pd(c6_00,FF);
1524 /* CUBIC SPLINE TABLE REPULSION */
1525 vfitab = _mm_add_epi32(vfitab,ifour);
1526 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1527 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1528 GMX_MM_TRANSPOSE2_PD(Y,F);
1529 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1530 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1531 GMX_MM_TRANSPOSE2_PD(G,H);
1532 Heps = _mm_mul_pd(vfeps,H);
1533 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1534 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1535 fvdw12 = _mm_mul_pd(c12_00,FF);
1536 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1538 fscal = _mm_add_pd(felec,fvdw);
1540 /* Calculate temporary vectorial force */
1541 tx = _mm_mul_pd(fscal,dx00);
1542 ty = _mm_mul_pd(fscal,dy00);
1543 tz = _mm_mul_pd(fscal,dz00);
1545 /* Update vectorial force */
1546 fix0 = _mm_add_pd(fix0,tx);
1547 fiy0 = _mm_add_pd(fiy0,ty);
1548 fiz0 = _mm_add_pd(fiz0,tz);
1550 fjx0 = _mm_add_pd(fjx0,tx);
1551 fjy0 = _mm_add_pd(fjy0,ty);
1552 fjz0 = _mm_add_pd(fjz0,tz);
1554 /**************************
1555 * CALCULATE INTERACTIONS *
1556 **************************/
1558 r01 = _mm_mul_pd(rsq01,rinv01);
1560 /* Calculate table index by multiplying r with table scale and truncate to integer */
1561 rt = _mm_mul_pd(r01,vftabscale);
1562 vfitab = _mm_cvttpd_epi32(rt);
1563 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1564 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1566 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1567 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1568 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1569 GMX_MM_TRANSPOSE2_PD(Y,F);
1570 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1571 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1572 GMX_MM_TRANSPOSE2_PD(G,H);
1573 Heps = _mm_mul_pd(vfeps,H);
1574 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1575 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1576 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
1578 fscal = felec;
1580 /* Calculate temporary vectorial force */
1581 tx = _mm_mul_pd(fscal,dx01);
1582 ty = _mm_mul_pd(fscal,dy01);
1583 tz = _mm_mul_pd(fscal,dz01);
1585 /* Update vectorial force */
1586 fix0 = _mm_add_pd(fix0,tx);
1587 fiy0 = _mm_add_pd(fiy0,ty);
1588 fiz0 = _mm_add_pd(fiz0,tz);
1590 fjx1 = _mm_add_pd(fjx1,tx);
1591 fjy1 = _mm_add_pd(fjy1,ty);
1592 fjz1 = _mm_add_pd(fjz1,tz);
1594 /**************************
1595 * CALCULATE INTERACTIONS *
1596 **************************/
1598 r02 = _mm_mul_pd(rsq02,rinv02);
1600 /* Calculate table index by multiplying r with table scale and truncate to integer */
1601 rt = _mm_mul_pd(r02,vftabscale);
1602 vfitab = _mm_cvttpd_epi32(rt);
1603 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1604 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1606 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1607 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1608 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1609 GMX_MM_TRANSPOSE2_PD(Y,F);
1610 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1611 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1612 GMX_MM_TRANSPOSE2_PD(G,H);
1613 Heps = _mm_mul_pd(vfeps,H);
1614 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1615 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1616 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
1618 fscal = felec;
1620 /* Calculate temporary vectorial force */
1621 tx = _mm_mul_pd(fscal,dx02);
1622 ty = _mm_mul_pd(fscal,dy02);
1623 tz = _mm_mul_pd(fscal,dz02);
1625 /* Update vectorial force */
1626 fix0 = _mm_add_pd(fix0,tx);
1627 fiy0 = _mm_add_pd(fiy0,ty);
1628 fiz0 = _mm_add_pd(fiz0,tz);
1630 fjx2 = _mm_add_pd(fjx2,tx);
1631 fjy2 = _mm_add_pd(fjy2,ty);
1632 fjz2 = _mm_add_pd(fjz2,tz);
1634 /**************************
1635 * CALCULATE INTERACTIONS *
1636 **************************/
1638 r10 = _mm_mul_pd(rsq10,rinv10);
1640 /* Calculate table index by multiplying r with table scale and truncate to integer */
1641 rt = _mm_mul_pd(r10,vftabscale);
1642 vfitab = _mm_cvttpd_epi32(rt);
1643 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1644 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1646 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1647 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1648 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1649 GMX_MM_TRANSPOSE2_PD(Y,F);
1650 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1651 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1652 GMX_MM_TRANSPOSE2_PD(G,H);
1653 Heps = _mm_mul_pd(vfeps,H);
1654 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1655 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1656 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
1658 fscal = felec;
1660 /* Calculate temporary vectorial force */
1661 tx = _mm_mul_pd(fscal,dx10);
1662 ty = _mm_mul_pd(fscal,dy10);
1663 tz = _mm_mul_pd(fscal,dz10);
1665 /* Update vectorial force */
1666 fix1 = _mm_add_pd(fix1,tx);
1667 fiy1 = _mm_add_pd(fiy1,ty);
1668 fiz1 = _mm_add_pd(fiz1,tz);
1670 fjx0 = _mm_add_pd(fjx0,tx);
1671 fjy0 = _mm_add_pd(fjy0,ty);
1672 fjz0 = _mm_add_pd(fjz0,tz);
1674 /**************************
1675 * CALCULATE INTERACTIONS *
1676 **************************/
1678 r11 = _mm_mul_pd(rsq11,rinv11);
1680 /* Calculate table index by multiplying r with table scale and truncate to integer */
1681 rt = _mm_mul_pd(r11,vftabscale);
1682 vfitab = _mm_cvttpd_epi32(rt);
1683 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1684 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1686 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1687 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1688 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1689 GMX_MM_TRANSPOSE2_PD(Y,F);
1690 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1691 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1692 GMX_MM_TRANSPOSE2_PD(G,H);
1693 Heps = _mm_mul_pd(vfeps,H);
1694 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1695 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1696 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
1698 fscal = felec;
1700 /* Calculate temporary vectorial force */
1701 tx = _mm_mul_pd(fscal,dx11);
1702 ty = _mm_mul_pd(fscal,dy11);
1703 tz = _mm_mul_pd(fscal,dz11);
1705 /* Update vectorial force */
1706 fix1 = _mm_add_pd(fix1,tx);
1707 fiy1 = _mm_add_pd(fiy1,ty);
1708 fiz1 = _mm_add_pd(fiz1,tz);
1710 fjx1 = _mm_add_pd(fjx1,tx);
1711 fjy1 = _mm_add_pd(fjy1,ty);
1712 fjz1 = _mm_add_pd(fjz1,tz);
1714 /**************************
1715 * CALCULATE INTERACTIONS *
1716 **************************/
1718 r12 = _mm_mul_pd(rsq12,rinv12);
1720 /* Calculate table index by multiplying r with table scale and truncate to integer */
1721 rt = _mm_mul_pd(r12,vftabscale);
1722 vfitab = _mm_cvttpd_epi32(rt);
1723 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1724 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1726 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1727 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1728 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1729 GMX_MM_TRANSPOSE2_PD(Y,F);
1730 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1731 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1732 GMX_MM_TRANSPOSE2_PD(G,H);
1733 Heps = _mm_mul_pd(vfeps,H);
1734 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1735 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1736 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1738 fscal = felec;
1740 /* Calculate temporary vectorial force */
1741 tx = _mm_mul_pd(fscal,dx12);
1742 ty = _mm_mul_pd(fscal,dy12);
1743 tz = _mm_mul_pd(fscal,dz12);
1745 /* Update vectorial force */
1746 fix1 = _mm_add_pd(fix1,tx);
1747 fiy1 = _mm_add_pd(fiy1,ty);
1748 fiz1 = _mm_add_pd(fiz1,tz);
1750 fjx2 = _mm_add_pd(fjx2,tx);
1751 fjy2 = _mm_add_pd(fjy2,ty);
1752 fjz2 = _mm_add_pd(fjz2,tz);
1754 /**************************
1755 * CALCULATE INTERACTIONS *
1756 **************************/
1758 r20 = _mm_mul_pd(rsq20,rinv20);
1760 /* Calculate table index by multiplying r with table scale and truncate to integer */
1761 rt = _mm_mul_pd(r20,vftabscale);
1762 vfitab = _mm_cvttpd_epi32(rt);
1763 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1764 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1766 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1767 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1768 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1769 GMX_MM_TRANSPOSE2_PD(Y,F);
1770 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1771 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1772 GMX_MM_TRANSPOSE2_PD(G,H);
1773 Heps = _mm_mul_pd(vfeps,H);
1774 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1775 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1776 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
1778 fscal = felec;
1780 /* Calculate temporary vectorial force */
1781 tx = _mm_mul_pd(fscal,dx20);
1782 ty = _mm_mul_pd(fscal,dy20);
1783 tz = _mm_mul_pd(fscal,dz20);
1785 /* Update vectorial force */
1786 fix2 = _mm_add_pd(fix2,tx);
1787 fiy2 = _mm_add_pd(fiy2,ty);
1788 fiz2 = _mm_add_pd(fiz2,tz);
1790 fjx0 = _mm_add_pd(fjx0,tx);
1791 fjy0 = _mm_add_pd(fjy0,ty);
1792 fjz0 = _mm_add_pd(fjz0,tz);
1794 /**************************
1795 * CALCULATE INTERACTIONS *
1796 **************************/
1798 r21 = _mm_mul_pd(rsq21,rinv21);
1800 /* Calculate table index by multiplying r with table scale and truncate to integer */
1801 rt = _mm_mul_pd(r21,vftabscale);
1802 vfitab = _mm_cvttpd_epi32(rt);
1803 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1804 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1806 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1807 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1808 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1809 GMX_MM_TRANSPOSE2_PD(Y,F);
1810 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1811 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1812 GMX_MM_TRANSPOSE2_PD(G,H);
1813 Heps = _mm_mul_pd(vfeps,H);
1814 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1815 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1816 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1818 fscal = felec;
1820 /* Calculate temporary vectorial force */
1821 tx = _mm_mul_pd(fscal,dx21);
1822 ty = _mm_mul_pd(fscal,dy21);
1823 tz = _mm_mul_pd(fscal,dz21);
1825 /* Update vectorial force */
1826 fix2 = _mm_add_pd(fix2,tx);
1827 fiy2 = _mm_add_pd(fiy2,ty);
1828 fiz2 = _mm_add_pd(fiz2,tz);
1830 fjx1 = _mm_add_pd(fjx1,tx);
1831 fjy1 = _mm_add_pd(fjy1,ty);
1832 fjz1 = _mm_add_pd(fjz1,tz);
1834 /**************************
1835 * CALCULATE INTERACTIONS *
1836 **************************/
1838 r22 = _mm_mul_pd(rsq22,rinv22);
1840 /* Calculate table index by multiplying r with table scale and truncate to integer */
1841 rt = _mm_mul_pd(r22,vftabscale);
1842 vfitab = _mm_cvttpd_epi32(rt);
1843 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1844 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1846 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1847 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1848 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1849 GMX_MM_TRANSPOSE2_PD(Y,F);
1850 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1851 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1852 GMX_MM_TRANSPOSE2_PD(G,H);
1853 Heps = _mm_mul_pd(vfeps,H);
1854 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1855 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1856 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1858 fscal = felec;
1860 /* Calculate temporary vectorial force */
1861 tx = _mm_mul_pd(fscal,dx22);
1862 ty = _mm_mul_pd(fscal,dy22);
1863 tz = _mm_mul_pd(fscal,dz22);
1865 /* Update vectorial force */
1866 fix2 = _mm_add_pd(fix2,tx);
1867 fiy2 = _mm_add_pd(fiy2,ty);
1868 fiz2 = _mm_add_pd(fiz2,tz);
1870 fjx2 = _mm_add_pd(fjx2,tx);
1871 fjy2 = _mm_add_pd(fjy2,ty);
1872 fjz2 = _mm_add_pd(fjz2,tz);
1874 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1876 /* Inner loop uses 373 flops */
1879 if(jidx<j_index_end)
1882 jnrA = jjnr[jidx];
1883 j_coord_offsetA = DIM*jnrA;
1885 /* load j atom coordinates */
1886 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1887 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1889 /* Calculate displacement vector */
1890 dx00 = _mm_sub_pd(ix0,jx0);
1891 dy00 = _mm_sub_pd(iy0,jy0);
1892 dz00 = _mm_sub_pd(iz0,jz0);
1893 dx01 = _mm_sub_pd(ix0,jx1);
1894 dy01 = _mm_sub_pd(iy0,jy1);
1895 dz01 = _mm_sub_pd(iz0,jz1);
1896 dx02 = _mm_sub_pd(ix0,jx2);
1897 dy02 = _mm_sub_pd(iy0,jy2);
1898 dz02 = _mm_sub_pd(iz0,jz2);
1899 dx10 = _mm_sub_pd(ix1,jx0);
1900 dy10 = _mm_sub_pd(iy1,jy0);
1901 dz10 = _mm_sub_pd(iz1,jz0);
1902 dx11 = _mm_sub_pd(ix1,jx1);
1903 dy11 = _mm_sub_pd(iy1,jy1);
1904 dz11 = _mm_sub_pd(iz1,jz1);
1905 dx12 = _mm_sub_pd(ix1,jx2);
1906 dy12 = _mm_sub_pd(iy1,jy2);
1907 dz12 = _mm_sub_pd(iz1,jz2);
1908 dx20 = _mm_sub_pd(ix2,jx0);
1909 dy20 = _mm_sub_pd(iy2,jy0);
1910 dz20 = _mm_sub_pd(iz2,jz0);
1911 dx21 = _mm_sub_pd(ix2,jx1);
1912 dy21 = _mm_sub_pd(iy2,jy1);
1913 dz21 = _mm_sub_pd(iz2,jz1);
1914 dx22 = _mm_sub_pd(ix2,jx2);
1915 dy22 = _mm_sub_pd(iy2,jy2);
1916 dz22 = _mm_sub_pd(iz2,jz2);
1918 /* Calculate squared distance and things based on it */
1919 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1920 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1921 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1922 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1923 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1924 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1925 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1926 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1927 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1929 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1930 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1931 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1932 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1933 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1934 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1935 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1936 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1937 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1939 fjx0 = _mm_setzero_pd();
1940 fjy0 = _mm_setzero_pd();
1941 fjz0 = _mm_setzero_pd();
1942 fjx1 = _mm_setzero_pd();
1943 fjy1 = _mm_setzero_pd();
1944 fjz1 = _mm_setzero_pd();
1945 fjx2 = _mm_setzero_pd();
1946 fjy2 = _mm_setzero_pd();
1947 fjz2 = _mm_setzero_pd();
1949 /**************************
1950 * CALCULATE INTERACTIONS *
1951 **************************/
1953 r00 = _mm_mul_pd(rsq00,rinv00);
1955 /* Calculate table index by multiplying r with table scale and truncate to integer */
1956 rt = _mm_mul_pd(r00,vftabscale);
1957 vfitab = _mm_cvttpd_epi32(rt);
1958 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1959 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1961 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1962 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1963 F = _mm_setzero_pd();
1964 GMX_MM_TRANSPOSE2_PD(Y,F);
1965 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1966 H = _mm_setzero_pd();
1967 GMX_MM_TRANSPOSE2_PD(G,H);
1968 Heps = _mm_mul_pd(vfeps,H);
1969 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1970 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1971 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
1973 /* CUBIC SPLINE TABLE DISPERSION */
1974 vfitab = _mm_add_epi32(vfitab,ifour);
1975 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1976 F = _mm_setzero_pd();
1977 GMX_MM_TRANSPOSE2_PD(Y,F);
1978 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1979 H = _mm_setzero_pd();
1980 GMX_MM_TRANSPOSE2_PD(G,H);
1981 Heps = _mm_mul_pd(vfeps,H);
1982 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1983 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1984 fvdw6 = _mm_mul_pd(c6_00,FF);
1986 /* CUBIC SPLINE TABLE REPULSION */
1987 vfitab = _mm_add_epi32(vfitab,ifour);
1988 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1989 F = _mm_setzero_pd();
1990 GMX_MM_TRANSPOSE2_PD(Y,F);
1991 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1992 H = _mm_setzero_pd();
1993 GMX_MM_TRANSPOSE2_PD(G,H);
1994 Heps = _mm_mul_pd(vfeps,H);
1995 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1996 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1997 fvdw12 = _mm_mul_pd(c12_00,FF);
1998 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
2000 fscal = _mm_add_pd(felec,fvdw);
2002 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2004 /* Calculate temporary vectorial force */
2005 tx = _mm_mul_pd(fscal,dx00);
2006 ty = _mm_mul_pd(fscal,dy00);
2007 tz = _mm_mul_pd(fscal,dz00);
2009 /* Update vectorial force */
2010 fix0 = _mm_add_pd(fix0,tx);
2011 fiy0 = _mm_add_pd(fiy0,ty);
2012 fiz0 = _mm_add_pd(fiz0,tz);
2014 fjx0 = _mm_add_pd(fjx0,tx);
2015 fjy0 = _mm_add_pd(fjy0,ty);
2016 fjz0 = _mm_add_pd(fjz0,tz);
2018 /**************************
2019 * CALCULATE INTERACTIONS *
2020 **************************/
2022 r01 = _mm_mul_pd(rsq01,rinv01);
2024 /* Calculate table index by multiplying r with table scale and truncate to integer */
2025 rt = _mm_mul_pd(r01,vftabscale);
2026 vfitab = _mm_cvttpd_epi32(rt);
2027 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2028 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2030 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2031 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2032 F = _mm_setzero_pd();
2033 GMX_MM_TRANSPOSE2_PD(Y,F);
2034 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2035 H = _mm_setzero_pd();
2036 GMX_MM_TRANSPOSE2_PD(G,H);
2037 Heps = _mm_mul_pd(vfeps,H);
2038 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2039 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2040 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
2042 fscal = felec;
2044 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2046 /* Calculate temporary vectorial force */
2047 tx = _mm_mul_pd(fscal,dx01);
2048 ty = _mm_mul_pd(fscal,dy01);
2049 tz = _mm_mul_pd(fscal,dz01);
2051 /* Update vectorial force */
2052 fix0 = _mm_add_pd(fix0,tx);
2053 fiy0 = _mm_add_pd(fiy0,ty);
2054 fiz0 = _mm_add_pd(fiz0,tz);
2056 fjx1 = _mm_add_pd(fjx1,tx);
2057 fjy1 = _mm_add_pd(fjy1,ty);
2058 fjz1 = _mm_add_pd(fjz1,tz);
2060 /**************************
2061 * CALCULATE INTERACTIONS *
2062 **************************/
2064 r02 = _mm_mul_pd(rsq02,rinv02);
2066 /* Calculate table index by multiplying r with table scale and truncate to integer */
2067 rt = _mm_mul_pd(r02,vftabscale);
2068 vfitab = _mm_cvttpd_epi32(rt);
2069 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2070 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2072 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2073 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2074 F = _mm_setzero_pd();
2075 GMX_MM_TRANSPOSE2_PD(Y,F);
2076 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2077 H = _mm_setzero_pd();
2078 GMX_MM_TRANSPOSE2_PD(G,H);
2079 Heps = _mm_mul_pd(vfeps,H);
2080 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2081 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2082 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
2084 fscal = felec;
2086 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2088 /* Calculate temporary vectorial force */
2089 tx = _mm_mul_pd(fscal,dx02);
2090 ty = _mm_mul_pd(fscal,dy02);
2091 tz = _mm_mul_pd(fscal,dz02);
2093 /* Update vectorial force */
2094 fix0 = _mm_add_pd(fix0,tx);
2095 fiy0 = _mm_add_pd(fiy0,ty);
2096 fiz0 = _mm_add_pd(fiz0,tz);
2098 fjx2 = _mm_add_pd(fjx2,tx);
2099 fjy2 = _mm_add_pd(fjy2,ty);
2100 fjz2 = _mm_add_pd(fjz2,tz);
2102 /**************************
2103 * CALCULATE INTERACTIONS *
2104 **************************/
2106 r10 = _mm_mul_pd(rsq10,rinv10);
2108 /* Calculate table index by multiplying r with table scale and truncate to integer */
2109 rt = _mm_mul_pd(r10,vftabscale);
2110 vfitab = _mm_cvttpd_epi32(rt);
2111 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2112 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2114 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2115 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2116 F = _mm_setzero_pd();
2117 GMX_MM_TRANSPOSE2_PD(Y,F);
2118 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2119 H = _mm_setzero_pd();
2120 GMX_MM_TRANSPOSE2_PD(G,H);
2121 Heps = _mm_mul_pd(vfeps,H);
2122 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2123 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2124 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
2126 fscal = felec;
2128 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2130 /* Calculate temporary vectorial force */
2131 tx = _mm_mul_pd(fscal,dx10);
2132 ty = _mm_mul_pd(fscal,dy10);
2133 tz = _mm_mul_pd(fscal,dz10);
2135 /* Update vectorial force */
2136 fix1 = _mm_add_pd(fix1,tx);
2137 fiy1 = _mm_add_pd(fiy1,ty);
2138 fiz1 = _mm_add_pd(fiz1,tz);
2140 fjx0 = _mm_add_pd(fjx0,tx);
2141 fjy0 = _mm_add_pd(fjy0,ty);
2142 fjz0 = _mm_add_pd(fjz0,tz);
2144 /**************************
2145 * CALCULATE INTERACTIONS *
2146 **************************/
2148 r11 = _mm_mul_pd(rsq11,rinv11);
2150 /* Calculate table index by multiplying r with table scale and truncate to integer */
2151 rt = _mm_mul_pd(r11,vftabscale);
2152 vfitab = _mm_cvttpd_epi32(rt);
2153 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2154 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2156 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2157 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2158 F = _mm_setzero_pd();
2159 GMX_MM_TRANSPOSE2_PD(Y,F);
2160 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2161 H = _mm_setzero_pd();
2162 GMX_MM_TRANSPOSE2_PD(G,H);
2163 Heps = _mm_mul_pd(vfeps,H);
2164 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2165 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2166 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
2168 fscal = felec;
2170 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2172 /* Calculate temporary vectorial force */
2173 tx = _mm_mul_pd(fscal,dx11);
2174 ty = _mm_mul_pd(fscal,dy11);
2175 tz = _mm_mul_pd(fscal,dz11);
2177 /* Update vectorial force */
2178 fix1 = _mm_add_pd(fix1,tx);
2179 fiy1 = _mm_add_pd(fiy1,ty);
2180 fiz1 = _mm_add_pd(fiz1,tz);
2182 fjx1 = _mm_add_pd(fjx1,tx);
2183 fjy1 = _mm_add_pd(fjy1,ty);
2184 fjz1 = _mm_add_pd(fjz1,tz);
2186 /**************************
2187 * CALCULATE INTERACTIONS *
2188 **************************/
2190 r12 = _mm_mul_pd(rsq12,rinv12);
2192 /* Calculate table index by multiplying r with table scale and truncate to integer */
2193 rt = _mm_mul_pd(r12,vftabscale);
2194 vfitab = _mm_cvttpd_epi32(rt);
2195 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2196 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2198 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2199 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2200 F = _mm_setzero_pd();
2201 GMX_MM_TRANSPOSE2_PD(Y,F);
2202 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2203 H = _mm_setzero_pd();
2204 GMX_MM_TRANSPOSE2_PD(G,H);
2205 Heps = _mm_mul_pd(vfeps,H);
2206 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2207 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2208 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
2210 fscal = felec;
2212 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2214 /* Calculate temporary vectorial force */
2215 tx = _mm_mul_pd(fscal,dx12);
2216 ty = _mm_mul_pd(fscal,dy12);
2217 tz = _mm_mul_pd(fscal,dz12);
2219 /* Update vectorial force */
2220 fix1 = _mm_add_pd(fix1,tx);
2221 fiy1 = _mm_add_pd(fiy1,ty);
2222 fiz1 = _mm_add_pd(fiz1,tz);
2224 fjx2 = _mm_add_pd(fjx2,tx);
2225 fjy2 = _mm_add_pd(fjy2,ty);
2226 fjz2 = _mm_add_pd(fjz2,tz);
2228 /**************************
2229 * CALCULATE INTERACTIONS *
2230 **************************/
2232 r20 = _mm_mul_pd(rsq20,rinv20);
2234 /* Calculate table index by multiplying r with table scale and truncate to integer */
2235 rt = _mm_mul_pd(r20,vftabscale);
2236 vfitab = _mm_cvttpd_epi32(rt);
2237 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2238 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2240 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2241 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2242 F = _mm_setzero_pd();
2243 GMX_MM_TRANSPOSE2_PD(Y,F);
2244 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2245 H = _mm_setzero_pd();
2246 GMX_MM_TRANSPOSE2_PD(G,H);
2247 Heps = _mm_mul_pd(vfeps,H);
2248 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2249 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2250 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
2252 fscal = felec;
2254 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2256 /* Calculate temporary vectorial force */
2257 tx = _mm_mul_pd(fscal,dx20);
2258 ty = _mm_mul_pd(fscal,dy20);
2259 tz = _mm_mul_pd(fscal,dz20);
2261 /* Update vectorial force */
2262 fix2 = _mm_add_pd(fix2,tx);
2263 fiy2 = _mm_add_pd(fiy2,ty);
2264 fiz2 = _mm_add_pd(fiz2,tz);
2266 fjx0 = _mm_add_pd(fjx0,tx);
2267 fjy0 = _mm_add_pd(fjy0,ty);
2268 fjz0 = _mm_add_pd(fjz0,tz);
2270 /**************************
2271 * CALCULATE INTERACTIONS *
2272 **************************/
2274 r21 = _mm_mul_pd(rsq21,rinv21);
2276 /* Calculate table index by multiplying r with table scale and truncate to integer */
2277 rt = _mm_mul_pd(r21,vftabscale);
2278 vfitab = _mm_cvttpd_epi32(rt);
2279 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2280 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2282 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2283 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2284 F = _mm_setzero_pd();
2285 GMX_MM_TRANSPOSE2_PD(Y,F);
2286 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2287 H = _mm_setzero_pd();
2288 GMX_MM_TRANSPOSE2_PD(G,H);
2289 Heps = _mm_mul_pd(vfeps,H);
2290 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2291 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2292 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
2294 fscal = felec;
2296 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2298 /* Calculate temporary vectorial force */
2299 tx = _mm_mul_pd(fscal,dx21);
2300 ty = _mm_mul_pd(fscal,dy21);
2301 tz = _mm_mul_pd(fscal,dz21);
2303 /* Update vectorial force */
2304 fix2 = _mm_add_pd(fix2,tx);
2305 fiy2 = _mm_add_pd(fiy2,ty);
2306 fiz2 = _mm_add_pd(fiz2,tz);
2308 fjx1 = _mm_add_pd(fjx1,tx);
2309 fjy1 = _mm_add_pd(fjy1,ty);
2310 fjz1 = _mm_add_pd(fjz1,tz);
2312 /**************************
2313 * CALCULATE INTERACTIONS *
2314 **************************/
2316 r22 = _mm_mul_pd(rsq22,rinv22);
2318 /* Calculate table index by multiplying r with table scale and truncate to integer */
2319 rt = _mm_mul_pd(r22,vftabscale);
2320 vfitab = _mm_cvttpd_epi32(rt);
2321 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2322 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2324 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2325 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2326 F = _mm_setzero_pd();
2327 GMX_MM_TRANSPOSE2_PD(Y,F);
2328 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2329 H = _mm_setzero_pd();
2330 GMX_MM_TRANSPOSE2_PD(G,H);
2331 Heps = _mm_mul_pd(vfeps,H);
2332 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2333 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2334 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
2336 fscal = felec;
2338 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2340 /* Calculate temporary vectorial force */
2341 tx = _mm_mul_pd(fscal,dx22);
2342 ty = _mm_mul_pd(fscal,dy22);
2343 tz = _mm_mul_pd(fscal,dz22);
2345 /* Update vectorial force */
2346 fix2 = _mm_add_pd(fix2,tx);
2347 fiy2 = _mm_add_pd(fiy2,ty);
2348 fiz2 = _mm_add_pd(fiz2,tz);
2350 fjx2 = _mm_add_pd(fjx2,tx);
2351 fjy2 = _mm_add_pd(fjy2,ty);
2352 fjz2 = _mm_add_pd(fjz2,tz);
2354 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2356 /* Inner loop uses 373 flops */
2359 /* End of innermost loop */
2361 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2362 f+i_coord_offset,fshift+i_shift_offset);
2364 /* Increment number of inner iterations */
2365 inneriter += j_index_end - j_index_start;
2367 /* Outer loop uses 18 flops */
2370 /* Increment number of outer iterations */
2371 outeriter += nri;
2373 /* Update outer/inner flops */
2375 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*373);