Double precision SSE2 kernels
[gromacs.git] / src / gmxlib / nonbonded / nb_kernel_sse2_double / nb_kernel_ElecRF_VdwCSTab_GeomW3W3_sse2_double.c
blob9951dfcbd73be5b1bdd5214f82e943f9c7d56a10
1 /*
2 * Note: this file was generated by the Gromacs sse2_double kernel generator.
4 * This source code is part of
6 * G R O M A C S
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
17 * later version.
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
26 #include <math.h>
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
33 #include "gmx_math_x86_sse2_double.h"
34 #include "kernelutil_x86_sse2_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_sse2_double
38 * Electrostatics interaction: ReactionField
39 * VdW interaction: CubicSplineTable
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
43 void
44 nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_sse2_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB;
61 int j_coord_offsetA,j_coord_offsetB;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
63 real rcutoff_scalar;
64 real *shiftvec,*fshift,*x,*f;
65 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
66 int vdwioffset0;
67 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
68 int vdwioffset1;
69 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
70 int vdwioffset2;
71 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
72 int vdwjidx0A,vdwjidx0B;
73 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
74 int vdwjidx1A,vdwjidx1B;
75 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
76 int vdwjidx2A,vdwjidx2B;
77 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
78 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
79 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
80 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
81 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
82 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
83 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
84 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
85 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
86 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
87 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
88 real *charge;
89 int nvdwtype;
90 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
91 int *vdwtype;
92 real *vdwparam;
93 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
94 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
95 __m128i vfitab;
96 __m128i ifour = _mm_set1_epi32(4);
97 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
98 real *vftab;
99 __m128d dummy_mask,cutoff_mask;
100 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
101 __m128d one = _mm_set1_pd(1.0);
102 __m128d two = _mm_set1_pd(2.0);
103 x = xx[0];
104 f = ff[0];
106 nri = nlist->nri;
107 iinr = nlist->iinr;
108 jindex = nlist->jindex;
109 jjnr = nlist->jjnr;
110 shiftidx = nlist->shift;
111 gid = nlist->gid;
112 shiftvec = fr->shift_vec[0];
113 fshift = fr->fshift[0];
114 facel = _mm_set1_pd(fr->epsfac);
115 charge = mdatoms->chargeA;
116 krf = _mm_set1_pd(fr->ic->k_rf);
117 krf2 = _mm_set1_pd(fr->ic->k_rf*2.0);
118 crf = _mm_set1_pd(fr->ic->c_rf);
119 nvdwtype = fr->ntype;
120 vdwparam = fr->nbfp;
121 vdwtype = mdatoms->typeA;
123 vftab = kernel_data->table_vdw->data;
124 vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale);
126 /* Setup water-specific parameters */
127 inr = nlist->iinr[0];
128 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
129 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
130 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
131 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
133 jq0 = _mm_set1_pd(charge[inr+0]);
134 jq1 = _mm_set1_pd(charge[inr+1]);
135 jq2 = _mm_set1_pd(charge[inr+2]);
136 vdwjidx0A = 2*vdwtype[inr+0];
137 qq00 = _mm_mul_pd(iq0,jq0);
138 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
139 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
140 qq01 = _mm_mul_pd(iq0,jq1);
141 qq02 = _mm_mul_pd(iq0,jq2);
142 qq10 = _mm_mul_pd(iq1,jq0);
143 qq11 = _mm_mul_pd(iq1,jq1);
144 qq12 = _mm_mul_pd(iq1,jq2);
145 qq20 = _mm_mul_pd(iq2,jq0);
146 qq21 = _mm_mul_pd(iq2,jq1);
147 qq22 = _mm_mul_pd(iq2,jq2);
149 /* Avoid stupid compiler warnings */
150 jnrA = jnrB = 0;
151 j_coord_offsetA = 0;
152 j_coord_offsetB = 0;
154 outeriter = 0;
155 inneriter = 0;
157 /* Start outer loop over neighborlists */
158 for(iidx=0; iidx<nri; iidx++)
160 /* Load shift vector for this list */
161 i_shift_offset = DIM*shiftidx[iidx];
163 /* Load limits for loop over neighbors */
164 j_index_start = jindex[iidx];
165 j_index_end = jindex[iidx+1];
167 /* Get outer coordinate index */
168 inr = iinr[iidx];
169 i_coord_offset = DIM*inr;
171 /* Load i particle coords and add shift vector */
172 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
173 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
175 fix0 = _mm_setzero_pd();
176 fiy0 = _mm_setzero_pd();
177 fiz0 = _mm_setzero_pd();
178 fix1 = _mm_setzero_pd();
179 fiy1 = _mm_setzero_pd();
180 fiz1 = _mm_setzero_pd();
181 fix2 = _mm_setzero_pd();
182 fiy2 = _mm_setzero_pd();
183 fiz2 = _mm_setzero_pd();
185 /* Reset potential sums */
186 velecsum = _mm_setzero_pd();
187 vvdwsum = _mm_setzero_pd();
189 /* Start inner kernel loop */
190 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
193 /* Get j neighbor index, and coordinate index */
194 jnrA = jjnr[jidx];
195 jnrB = jjnr[jidx+1];
196 j_coord_offsetA = DIM*jnrA;
197 j_coord_offsetB = DIM*jnrB;
199 /* load j atom coordinates */
200 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
201 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
203 /* Calculate displacement vector */
204 dx00 = _mm_sub_pd(ix0,jx0);
205 dy00 = _mm_sub_pd(iy0,jy0);
206 dz00 = _mm_sub_pd(iz0,jz0);
207 dx01 = _mm_sub_pd(ix0,jx1);
208 dy01 = _mm_sub_pd(iy0,jy1);
209 dz01 = _mm_sub_pd(iz0,jz1);
210 dx02 = _mm_sub_pd(ix0,jx2);
211 dy02 = _mm_sub_pd(iy0,jy2);
212 dz02 = _mm_sub_pd(iz0,jz2);
213 dx10 = _mm_sub_pd(ix1,jx0);
214 dy10 = _mm_sub_pd(iy1,jy0);
215 dz10 = _mm_sub_pd(iz1,jz0);
216 dx11 = _mm_sub_pd(ix1,jx1);
217 dy11 = _mm_sub_pd(iy1,jy1);
218 dz11 = _mm_sub_pd(iz1,jz1);
219 dx12 = _mm_sub_pd(ix1,jx2);
220 dy12 = _mm_sub_pd(iy1,jy2);
221 dz12 = _mm_sub_pd(iz1,jz2);
222 dx20 = _mm_sub_pd(ix2,jx0);
223 dy20 = _mm_sub_pd(iy2,jy0);
224 dz20 = _mm_sub_pd(iz2,jz0);
225 dx21 = _mm_sub_pd(ix2,jx1);
226 dy21 = _mm_sub_pd(iy2,jy1);
227 dz21 = _mm_sub_pd(iz2,jz1);
228 dx22 = _mm_sub_pd(ix2,jx2);
229 dy22 = _mm_sub_pd(iy2,jy2);
230 dz22 = _mm_sub_pd(iz2,jz2);
232 /* Calculate squared distance and things based on it */
233 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
234 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
235 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
236 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
237 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
238 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
239 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
240 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
241 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
243 rinv00 = gmx_mm_invsqrt_pd(rsq00);
244 rinv01 = gmx_mm_invsqrt_pd(rsq01);
245 rinv02 = gmx_mm_invsqrt_pd(rsq02);
246 rinv10 = gmx_mm_invsqrt_pd(rsq10);
247 rinv11 = gmx_mm_invsqrt_pd(rsq11);
248 rinv12 = gmx_mm_invsqrt_pd(rsq12);
249 rinv20 = gmx_mm_invsqrt_pd(rsq20);
250 rinv21 = gmx_mm_invsqrt_pd(rsq21);
251 rinv22 = gmx_mm_invsqrt_pd(rsq22);
253 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
254 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
255 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
256 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
257 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
258 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
259 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
260 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
261 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
263 fjx0 = _mm_setzero_pd();
264 fjy0 = _mm_setzero_pd();
265 fjz0 = _mm_setzero_pd();
266 fjx1 = _mm_setzero_pd();
267 fjy1 = _mm_setzero_pd();
268 fjz1 = _mm_setzero_pd();
269 fjx2 = _mm_setzero_pd();
270 fjy2 = _mm_setzero_pd();
271 fjz2 = _mm_setzero_pd();
273 /**************************
274 * CALCULATE INTERACTIONS *
275 **************************/
277 r00 = _mm_mul_pd(rsq00,rinv00);
279 /* Calculate table index by multiplying r with table scale and truncate to integer */
280 rt = _mm_mul_pd(r00,vftabscale);
281 vfitab = _mm_cvttpd_epi32(rt);
282 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
283 vfitab = _mm_slli_epi32(vfitab,3);
285 /* REACTION-FIELD ELECTROSTATICS */
286 velec = _mm_mul_pd(qq00,_mm_sub_pd(_mm_add_pd(rinv00,_mm_mul_pd(krf,rsq00)),crf));
287 felec = _mm_mul_pd(qq00,_mm_sub_pd(_mm_mul_pd(rinv00,rinvsq00),krf2));
289 /* CUBIC SPLINE TABLE DISPERSION */
290 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
291 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
292 GMX_MM_TRANSPOSE2_PD(Y,F);
293 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
294 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
295 GMX_MM_TRANSPOSE2_PD(G,H);
296 Heps = _mm_mul_pd(vfeps,H);
297 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
298 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
299 vvdw6 = _mm_mul_pd(c6_00,VV);
300 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
301 fvdw6 = _mm_mul_pd(c6_00,FF);
303 /* CUBIC SPLINE TABLE REPULSION */
304 vfitab = _mm_add_epi32(vfitab,ifour);
305 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
306 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
307 GMX_MM_TRANSPOSE2_PD(Y,F);
308 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
309 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
310 GMX_MM_TRANSPOSE2_PD(G,H);
311 Heps = _mm_mul_pd(vfeps,H);
312 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
313 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
314 vvdw12 = _mm_mul_pd(c12_00,VV);
315 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
316 fvdw12 = _mm_mul_pd(c12_00,FF);
317 vvdw = _mm_add_pd(vvdw12,vvdw6);
318 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
320 /* Update potential sum for this i atom from the interaction with this j atom. */
321 velecsum = _mm_add_pd(velecsum,velec);
322 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
324 fscal = _mm_add_pd(felec,fvdw);
326 /* Calculate temporary vectorial force */
327 tx = _mm_mul_pd(fscal,dx00);
328 ty = _mm_mul_pd(fscal,dy00);
329 tz = _mm_mul_pd(fscal,dz00);
331 /* Update vectorial force */
332 fix0 = _mm_add_pd(fix0,tx);
333 fiy0 = _mm_add_pd(fiy0,ty);
334 fiz0 = _mm_add_pd(fiz0,tz);
336 fjx0 = _mm_add_pd(fjx0,tx);
337 fjy0 = _mm_add_pd(fjy0,ty);
338 fjz0 = _mm_add_pd(fjz0,tz);
340 /**************************
341 * CALCULATE INTERACTIONS *
342 **************************/
344 /* REACTION-FIELD ELECTROSTATICS */
345 velec = _mm_mul_pd(qq01,_mm_sub_pd(_mm_add_pd(rinv01,_mm_mul_pd(krf,rsq01)),crf));
346 felec = _mm_mul_pd(qq01,_mm_sub_pd(_mm_mul_pd(rinv01,rinvsq01),krf2));
348 /* Update potential sum for this i atom from the interaction with this j atom. */
349 velecsum = _mm_add_pd(velecsum,velec);
351 fscal = felec;
353 /* Calculate temporary vectorial force */
354 tx = _mm_mul_pd(fscal,dx01);
355 ty = _mm_mul_pd(fscal,dy01);
356 tz = _mm_mul_pd(fscal,dz01);
358 /* Update vectorial force */
359 fix0 = _mm_add_pd(fix0,tx);
360 fiy0 = _mm_add_pd(fiy0,ty);
361 fiz0 = _mm_add_pd(fiz0,tz);
363 fjx1 = _mm_add_pd(fjx1,tx);
364 fjy1 = _mm_add_pd(fjy1,ty);
365 fjz1 = _mm_add_pd(fjz1,tz);
367 /**************************
368 * CALCULATE INTERACTIONS *
369 **************************/
371 /* REACTION-FIELD ELECTROSTATICS */
372 velec = _mm_mul_pd(qq02,_mm_sub_pd(_mm_add_pd(rinv02,_mm_mul_pd(krf,rsq02)),crf));
373 felec = _mm_mul_pd(qq02,_mm_sub_pd(_mm_mul_pd(rinv02,rinvsq02),krf2));
375 /* Update potential sum for this i atom from the interaction with this j atom. */
376 velecsum = _mm_add_pd(velecsum,velec);
378 fscal = felec;
380 /* Calculate temporary vectorial force */
381 tx = _mm_mul_pd(fscal,dx02);
382 ty = _mm_mul_pd(fscal,dy02);
383 tz = _mm_mul_pd(fscal,dz02);
385 /* Update vectorial force */
386 fix0 = _mm_add_pd(fix0,tx);
387 fiy0 = _mm_add_pd(fiy0,ty);
388 fiz0 = _mm_add_pd(fiz0,tz);
390 fjx2 = _mm_add_pd(fjx2,tx);
391 fjy2 = _mm_add_pd(fjy2,ty);
392 fjz2 = _mm_add_pd(fjz2,tz);
394 /**************************
395 * CALCULATE INTERACTIONS *
396 **************************/
398 /* REACTION-FIELD ELECTROSTATICS */
399 velec = _mm_mul_pd(qq10,_mm_sub_pd(_mm_add_pd(rinv10,_mm_mul_pd(krf,rsq10)),crf));
400 felec = _mm_mul_pd(qq10,_mm_sub_pd(_mm_mul_pd(rinv10,rinvsq10),krf2));
402 /* Update potential sum for this i atom from the interaction with this j atom. */
403 velecsum = _mm_add_pd(velecsum,velec);
405 fscal = felec;
407 /* Calculate temporary vectorial force */
408 tx = _mm_mul_pd(fscal,dx10);
409 ty = _mm_mul_pd(fscal,dy10);
410 tz = _mm_mul_pd(fscal,dz10);
412 /* Update vectorial force */
413 fix1 = _mm_add_pd(fix1,tx);
414 fiy1 = _mm_add_pd(fiy1,ty);
415 fiz1 = _mm_add_pd(fiz1,tz);
417 fjx0 = _mm_add_pd(fjx0,tx);
418 fjy0 = _mm_add_pd(fjy0,ty);
419 fjz0 = _mm_add_pd(fjz0,tz);
421 /**************************
422 * CALCULATE INTERACTIONS *
423 **************************/
425 /* REACTION-FIELD ELECTROSTATICS */
426 velec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_add_pd(rinv11,_mm_mul_pd(krf,rsq11)),crf));
427 felec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
429 /* Update potential sum for this i atom from the interaction with this j atom. */
430 velecsum = _mm_add_pd(velecsum,velec);
432 fscal = felec;
434 /* Calculate temporary vectorial force */
435 tx = _mm_mul_pd(fscal,dx11);
436 ty = _mm_mul_pd(fscal,dy11);
437 tz = _mm_mul_pd(fscal,dz11);
439 /* Update vectorial force */
440 fix1 = _mm_add_pd(fix1,tx);
441 fiy1 = _mm_add_pd(fiy1,ty);
442 fiz1 = _mm_add_pd(fiz1,tz);
444 fjx1 = _mm_add_pd(fjx1,tx);
445 fjy1 = _mm_add_pd(fjy1,ty);
446 fjz1 = _mm_add_pd(fjz1,tz);
448 /**************************
449 * CALCULATE INTERACTIONS *
450 **************************/
452 /* REACTION-FIELD ELECTROSTATICS */
453 velec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_add_pd(rinv12,_mm_mul_pd(krf,rsq12)),crf));
454 felec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
456 /* Update potential sum for this i atom from the interaction with this j atom. */
457 velecsum = _mm_add_pd(velecsum,velec);
459 fscal = felec;
461 /* Calculate temporary vectorial force */
462 tx = _mm_mul_pd(fscal,dx12);
463 ty = _mm_mul_pd(fscal,dy12);
464 tz = _mm_mul_pd(fscal,dz12);
466 /* Update vectorial force */
467 fix1 = _mm_add_pd(fix1,tx);
468 fiy1 = _mm_add_pd(fiy1,ty);
469 fiz1 = _mm_add_pd(fiz1,tz);
471 fjx2 = _mm_add_pd(fjx2,tx);
472 fjy2 = _mm_add_pd(fjy2,ty);
473 fjz2 = _mm_add_pd(fjz2,tz);
475 /**************************
476 * CALCULATE INTERACTIONS *
477 **************************/
479 /* REACTION-FIELD ELECTROSTATICS */
480 velec = _mm_mul_pd(qq20,_mm_sub_pd(_mm_add_pd(rinv20,_mm_mul_pd(krf,rsq20)),crf));
481 felec = _mm_mul_pd(qq20,_mm_sub_pd(_mm_mul_pd(rinv20,rinvsq20),krf2));
483 /* Update potential sum for this i atom from the interaction with this j atom. */
484 velecsum = _mm_add_pd(velecsum,velec);
486 fscal = felec;
488 /* Calculate temporary vectorial force */
489 tx = _mm_mul_pd(fscal,dx20);
490 ty = _mm_mul_pd(fscal,dy20);
491 tz = _mm_mul_pd(fscal,dz20);
493 /* Update vectorial force */
494 fix2 = _mm_add_pd(fix2,tx);
495 fiy2 = _mm_add_pd(fiy2,ty);
496 fiz2 = _mm_add_pd(fiz2,tz);
498 fjx0 = _mm_add_pd(fjx0,tx);
499 fjy0 = _mm_add_pd(fjy0,ty);
500 fjz0 = _mm_add_pd(fjz0,tz);
502 /**************************
503 * CALCULATE INTERACTIONS *
504 **************************/
506 /* REACTION-FIELD ELECTROSTATICS */
507 velec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_add_pd(rinv21,_mm_mul_pd(krf,rsq21)),crf));
508 felec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
510 /* Update potential sum for this i atom from the interaction with this j atom. */
511 velecsum = _mm_add_pd(velecsum,velec);
513 fscal = felec;
515 /* Calculate temporary vectorial force */
516 tx = _mm_mul_pd(fscal,dx21);
517 ty = _mm_mul_pd(fscal,dy21);
518 tz = _mm_mul_pd(fscal,dz21);
520 /* Update vectorial force */
521 fix2 = _mm_add_pd(fix2,tx);
522 fiy2 = _mm_add_pd(fiy2,ty);
523 fiz2 = _mm_add_pd(fiz2,tz);
525 fjx1 = _mm_add_pd(fjx1,tx);
526 fjy1 = _mm_add_pd(fjy1,ty);
527 fjz1 = _mm_add_pd(fjz1,tz);
529 /**************************
530 * CALCULATE INTERACTIONS *
531 **************************/
533 /* REACTION-FIELD ELECTROSTATICS */
534 velec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_add_pd(rinv22,_mm_mul_pd(krf,rsq22)),crf));
535 felec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
537 /* Update potential sum for this i atom from the interaction with this j atom. */
538 velecsum = _mm_add_pd(velecsum,velec);
540 fscal = felec;
542 /* Calculate temporary vectorial force */
543 tx = _mm_mul_pd(fscal,dx22);
544 ty = _mm_mul_pd(fscal,dy22);
545 tz = _mm_mul_pd(fscal,dz22);
547 /* Update vectorial force */
548 fix2 = _mm_add_pd(fix2,tx);
549 fiy2 = _mm_add_pd(fiy2,ty);
550 fiz2 = _mm_add_pd(fiz2,tz);
552 fjx2 = _mm_add_pd(fjx2,tx);
553 fjy2 = _mm_add_pd(fjy2,ty);
554 fjz2 = _mm_add_pd(fjz2,tz);
556 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
558 /* Inner loop uses 323 flops */
561 if(jidx<j_index_end)
564 jnrA = jjnr[jidx];
565 j_coord_offsetA = DIM*jnrA;
567 /* load j atom coordinates */
568 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
569 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
571 /* Calculate displacement vector */
572 dx00 = _mm_sub_pd(ix0,jx0);
573 dy00 = _mm_sub_pd(iy0,jy0);
574 dz00 = _mm_sub_pd(iz0,jz0);
575 dx01 = _mm_sub_pd(ix0,jx1);
576 dy01 = _mm_sub_pd(iy0,jy1);
577 dz01 = _mm_sub_pd(iz0,jz1);
578 dx02 = _mm_sub_pd(ix0,jx2);
579 dy02 = _mm_sub_pd(iy0,jy2);
580 dz02 = _mm_sub_pd(iz0,jz2);
581 dx10 = _mm_sub_pd(ix1,jx0);
582 dy10 = _mm_sub_pd(iy1,jy0);
583 dz10 = _mm_sub_pd(iz1,jz0);
584 dx11 = _mm_sub_pd(ix1,jx1);
585 dy11 = _mm_sub_pd(iy1,jy1);
586 dz11 = _mm_sub_pd(iz1,jz1);
587 dx12 = _mm_sub_pd(ix1,jx2);
588 dy12 = _mm_sub_pd(iy1,jy2);
589 dz12 = _mm_sub_pd(iz1,jz2);
590 dx20 = _mm_sub_pd(ix2,jx0);
591 dy20 = _mm_sub_pd(iy2,jy0);
592 dz20 = _mm_sub_pd(iz2,jz0);
593 dx21 = _mm_sub_pd(ix2,jx1);
594 dy21 = _mm_sub_pd(iy2,jy1);
595 dz21 = _mm_sub_pd(iz2,jz1);
596 dx22 = _mm_sub_pd(ix2,jx2);
597 dy22 = _mm_sub_pd(iy2,jy2);
598 dz22 = _mm_sub_pd(iz2,jz2);
600 /* Calculate squared distance and things based on it */
601 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
602 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
603 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
604 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
605 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
606 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
607 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
608 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
609 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
611 rinv00 = gmx_mm_invsqrt_pd(rsq00);
612 rinv01 = gmx_mm_invsqrt_pd(rsq01);
613 rinv02 = gmx_mm_invsqrt_pd(rsq02);
614 rinv10 = gmx_mm_invsqrt_pd(rsq10);
615 rinv11 = gmx_mm_invsqrt_pd(rsq11);
616 rinv12 = gmx_mm_invsqrt_pd(rsq12);
617 rinv20 = gmx_mm_invsqrt_pd(rsq20);
618 rinv21 = gmx_mm_invsqrt_pd(rsq21);
619 rinv22 = gmx_mm_invsqrt_pd(rsq22);
621 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
622 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
623 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
624 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
625 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
626 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
627 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
628 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
629 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
631 fjx0 = _mm_setzero_pd();
632 fjy0 = _mm_setzero_pd();
633 fjz0 = _mm_setzero_pd();
634 fjx1 = _mm_setzero_pd();
635 fjy1 = _mm_setzero_pd();
636 fjz1 = _mm_setzero_pd();
637 fjx2 = _mm_setzero_pd();
638 fjy2 = _mm_setzero_pd();
639 fjz2 = _mm_setzero_pd();
641 /**************************
642 * CALCULATE INTERACTIONS *
643 **************************/
645 r00 = _mm_mul_pd(rsq00,rinv00);
647 /* Calculate table index by multiplying r with table scale and truncate to integer */
648 rt = _mm_mul_pd(r00,vftabscale);
649 vfitab = _mm_cvttpd_epi32(rt);
650 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
651 vfitab = _mm_slli_epi32(vfitab,3);
653 /* REACTION-FIELD ELECTROSTATICS */
654 velec = _mm_mul_pd(qq00,_mm_sub_pd(_mm_add_pd(rinv00,_mm_mul_pd(krf,rsq00)),crf));
655 felec = _mm_mul_pd(qq00,_mm_sub_pd(_mm_mul_pd(rinv00,rinvsq00),krf2));
657 /* CUBIC SPLINE TABLE DISPERSION */
658 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
659 F = _mm_setzero_pd();
660 GMX_MM_TRANSPOSE2_PD(Y,F);
661 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
662 H = _mm_setzero_pd();
663 GMX_MM_TRANSPOSE2_PD(G,H);
664 Heps = _mm_mul_pd(vfeps,H);
665 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
666 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
667 vvdw6 = _mm_mul_pd(c6_00,VV);
668 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
669 fvdw6 = _mm_mul_pd(c6_00,FF);
671 /* CUBIC SPLINE TABLE REPULSION */
672 vfitab = _mm_add_epi32(vfitab,ifour);
673 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
674 F = _mm_setzero_pd();
675 GMX_MM_TRANSPOSE2_PD(Y,F);
676 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
677 H = _mm_setzero_pd();
678 GMX_MM_TRANSPOSE2_PD(G,H);
679 Heps = _mm_mul_pd(vfeps,H);
680 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
681 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
682 vvdw12 = _mm_mul_pd(c12_00,VV);
683 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
684 fvdw12 = _mm_mul_pd(c12_00,FF);
685 vvdw = _mm_add_pd(vvdw12,vvdw6);
686 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
688 /* Update potential sum for this i atom from the interaction with this j atom. */
689 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
690 velecsum = _mm_add_pd(velecsum,velec);
691 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
692 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
694 fscal = _mm_add_pd(felec,fvdw);
696 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
698 /* Calculate temporary vectorial force */
699 tx = _mm_mul_pd(fscal,dx00);
700 ty = _mm_mul_pd(fscal,dy00);
701 tz = _mm_mul_pd(fscal,dz00);
703 /* Update vectorial force */
704 fix0 = _mm_add_pd(fix0,tx);
705 fiy0 = _mm_add_pd(fiy0,ty);
706 fiz0 = _mm_add_pd(fiz0,tz);
708 fjx0 = _mm_add_pd(fjx0,tx);
709 fjy0 = _mm_add_pd(fjy0,ty);
710 fjz0 = _mm_add_pd(fjz0,tz);
712 /**************************
713 * CALCULATE INTERACTIONS *
714 **************************/
716 /* REACTION-FIELD ELECTROSTATICS */
717 velec = _mm_mul_pd(qq01,_mm_sub_pd(_mm_add_pd(rinv01,_mm_mul_pd(krf,rsq01)),crf));
718 felec = _mm_mul_pd(qq01,_mm_sub_pd(_mm_mul_pd(rinv01,rinvsq01),krf2));
720 /* Update potential sum for this i atom from the interaction with this j atom. */
721 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
722 velecsum = _mm_add_pd(velecsum,velec);
724 fscal = felec;
726 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
728 /* Calculate temporary vectorial force */
729 tx = _mm_mul_pd(fscal,dx01);
730 ty = _mm_mul_pd(fscal,dy01);
731 tz = _mm_mul_pd(fscal,dz01);
733 /* Update vectorial force */
734 fix0 = _mm_add_pd(fix0,tx);
735 fiy0 = _mm_add_pd(fiy0,ty);
736 fiz0 = _mm_add_pd(fiz0,tz);
738 fjx1 = _mm_add_pd(fjx1,tx);
739 fjy1 = _mm_add_pd(fjy1,ty);
740 fjz1 = _mm_add_pd(fjz1,tz);
742 /**************************
743 * CALCULATE INTERACTIONS *
744 **************************/
746 /* REACTION-FIELD ELECTROSTATICS */
747 velec = _mm_mul_pd(qq02,_mm_sub_pd(_mm_add_pd(rinv02,_mm_mul_pd(krf,rsq02)),crf));
748 felec = _mm_mul_pd(qq02,_mm_sub_pd(_mm_mul_pd(rinv02,rinvsq02),krf2));
750 /* Update potential sum for this i atom from the interaction with this j atom. */
751 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
752 velecsum = _mm_add_pd(velecsum,velec);
754 fscal = felec;
756 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
758 /* Calculate temporary vectorial force */
759 tx = _mm_mul_pd(fscal,dx02);
760 ty = _mm_mul_pd(fscal,dy02);
761 tz = _mm_mul_pd(fscal,dz02);
763 /* Update vectorial force */
764 fix0 = _mm_add_pd(fix0,tx);
765 fiy0 = _mm_add_pd(fiy0,ty);
766 fiz0 = _mm_add_pd(fiz0,tz);
768 fjx2 = _mm_add_pd(fjx2,tx);
769 fjy2 = _mm_add_pd(fjy2,ty);
770 fjz2 = _mm_add_pd(fjz2,tz);
772 /**************************
773 * CALCULATE INTERACTIONS *
774 **************************/
776 /* REACTION-FIELD ELECTROSTATICS */
777 velec = _mm_mul_pd(qq10,_mm_sub_pd(_mm_add_pd(rinv10,_mm_mul_pd(krf,rsq10)),crf));
778 felec = _mm_mul_pd(qq10,_mm_sub_pd(_mm_mul_pd(rinv10,rinvsq10),krf2));
780 /* Update potential sum for this i atom from the interaction with this j atom. */
781 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
782 velecsum = _mm_add_pd(velecsum,velec);
784 fscal = felec;
786 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
788 /* Calculate temporary vectorial force */
789 tx = _mm_mul_pd(fscal,dx10);
790 ty = _mm_mul_pd(fscal,dy10);
791 tz = _mm_mul_pd(fscal,dz10);
793 /* Update vectorial force */
794 fix1 = _mm_add_pd(fix1,tx);
795 fiy1 = _mm_add_pd(fiy1,ty);
796 fiz1 = _mm_add_pd(fiz1,tz);
798 fjx0 = _mm_add_pd(fjx0,tx);
799 fjy0 = _mm_add_pd(fjy0,ty);
800 fjz0 = _mm_add_pd(fjz0,tz);
802 /**************************
803 * CALCULATE INTERACTIONS *
804 **************************/
806 /* REACTION-FIELD ELECTROSTATICS */
807 velec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_add_pd(rinv11,_mm_mul_pd(krf,rsq11)),crf));
808 felec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
810 /* Update potential sum for this i atom from the interaction with this j atom. */
811 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
812 velecsum = _mm_add_pd(velecsum,velec);
814 fscal = felec;
816 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
818 /* Calculate temporary vectorial force */
819 tx = _mm_mul_pd(fscal,dx11);
820 ty = _mm_mul_pd(fscal,dy11);
821 tz = _mm_mul_pd(fscal,dz11);
823 /* Update vectorial force */
824 fix1 = _mm_add_pd(fix1,tx);
825 fiy1 = _mm_add_pd(fiy1,ty);
826 fiz1 = _mm_add_pd(fiz1,tz);
828 fjx1 = _mm_add_pd(fjx1,tx);
829 fjy1 = _mm_add_pd(fjy1,ty);
830 fjz1 = _mm_add_pd(fjz1,tz);
832 /**************************
833 * CALCULATE INTERACTIONS *
834 **************************/
836 /* REACTION-FIELD ELECTROSTATICS */
837 velec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_add_pd(rinv12,_mm_mul_pd(krf,rsq12)),crf));
838 felec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
840 /* Update potential sum for this i atom from the interaction with this j atom. */
841 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
842 velecsum = _mm_add_pd(velecsum,velec);
844 fscal = felec;
846 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
848 /* Calculate temporary vectorial force */
849 tx = _mm_mul_pd(fscal,dx12);
850 ty = _mm_mul_pd(fscal,dy12);
851 tz = _mm_mul_pd(fscal,dz12);
853 /* Update vectorial force */
854 fix1 = _mm_add_pd(fix1,tx);
855 fiy1 = _mm_add_pd(fiy1,ty);
856 fiz1 = _mm_add_pd(fiz1,tz);
858 fjx2 = _mm_add_pd(fjx2,tx);
859 fjy2 = _mm_add_pd(fjy2,ty);
860 fjz2 = _mm_add_pd(fjz2,tz);
862 /**************************
863 * CALCULATE INTERACTIONS *
864 **************************/
866 /* REACTION-FIELD ELECTROSTATICS */
867 velec = _mm_mul_pd(qq20,_mm_sub_pd(_mm_add_pd(rinv20,_mm_mul_pd(krf,rsq20)),crf));
868 felec = _mm_mul_pd(qq20,_mm_sub_pd(_mm_mul_pd(rinv20,rinvsq20),krf2));
870 /* Update potential sum for this i atom from the interaction with this j atom. */
871 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
872 velecsum = _mm_add_pd(velecsum,velec);
874 fscal = felec;
876 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
878 /* Calculate temporary vectorial force */
879 tx = _mm_mul_pd(fscal,dx20);
880 ty = _mm_mul_pd(fscal,dy20);
881 tz = _mm_mul_pd(fscal,dz20);
883 /* Update vectorial force */
884 fix2 = _mm_add_pd(fix2,tx);
885 fiy2 = _mm_add_pd(fiy2,ty);
886 fiz2 = _mm_add_pd(fiz2,tz);
888 fjx0 = _mm_add_pd(fjx0,tx);
889 fjy0 = _mm_add_pd(fjy0,ty);
890 fjz0 = _mm_add_pd(fjz0,tz);
892 /**************************
893 * CALCULATE INTERACTIONS *
894 **************************/
896 /* REACTION-FIELD ELECTROSTATICS */
897 velec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_add_pd(rinv21,_mm_mul_pd(krf,rsq21)),crf));
898 felec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
900 /* Update potential sum for this i atom from the interaction with this j atom. */
901 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
902 velecsum = _mm_add_pd(velecsum,velec);
904 fscal = felec;
906 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
908 /* Calculate temporary vectorial force */
909 tx = _mm_mul_pd(fscal,dx21);
910 ty = _mm_mul_pd(fscal,dy21);
911 tz = _mm_mul_pd(fscal,dz21);
913 /* Update vectorial force */
914 fix2 = _mm_add_pd(fix2,tx);
915 fiy2 = _mm_add_pd(fiy2,ty);
916 fiz2 = _mm_add_pd(fiz2,tz);
918 fjx1 = _mm_add_pd(fjx1,tx);
919 fjy1 = _mm_add_pd(fjy1,ty);
920 fjz1 = _mm_add_pd(fjz1,tz);
922 /**************************
923 * CALCULATE INTERACTIONS *
924 **************************/
926 /* REACTION-FIELD ELECTROSTATICS */
927 velec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_add_pd(rinv22,_mm_mul_pd(krf,rsq22)),crf));
928 felec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
930 /* Update potential sum for this i atom from the interaction with this j atom. */
931 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
932 velecsum = _mm_add_pd(velecsum,velec);
934 fscal = felec;
936 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
938 /* Calculate temporary vectorial force */
939 tx = _mm_mul_pd(fscal,dx22);
940 ty = _mm_mul_pd(fscal,dy22);
941 tz = _mm_mul_pd(fscal,dz22);
943 /* Update vectorial force */
944 fix2 = _mm_add_pd(fix2,tx);
945 fiy2 = _mm_add_pd(fiy2,ty);
946 fiz2 = _mm_add_pd(fiz2,tz);
948 fjx2 = _mm_add_pd(fjx2,tx);
949 fjy2 = _mm_add_pd(fjy2,ty);
950 fjz2 = _mm_add_pd(fjz2,tz);
952 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
954 /* Inner loop uses 323 flops */
957 /* End of innermost loop */
959 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
960 f+i_coord_offset,fshift+i_shift_offset);
962 ggid = gid[iidx];
963 /* Update potential energies */
964 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
965 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
967 /* Increment number of inner iterations */
968 inneriter += j_index_end - j_index_start;
970 /* Outer loop uses 20 flops */
973 /* Increment number of outer iterations */
974 outeriter += nri;
976 /* Update outer/inner flops */
978 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*323);
981 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sse2_double
982 * Electrostatics interaction: ReactionField
983 * VdW interaction: CubicSplineTable
984 * Geometry: Water3-Water3
985 * Calculate force/pot: Force
987 void
988 nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sse2_double
989 (t_nblist * gmx_restrict nlist,
990 rvec * gmx_restrict xx,
991 rvec * gmx_restrict ff,
992 t_forcerec * gmx_restrict fr,
993 t_mdatoms * gmx_restrict mdatoms,
994 nb_kernel_data_t * gmx_restrict kernel_data,
995 t_nrnb * gmx_restrict nrnb)
997 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
998 * just 0 for non-waters.
999 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1000 * jnr indices corresponding to data put in the four positions in the SIMD register.
1002 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1003 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1004 int jnrA,jnrB;
1005 int j_coord_offsetA,j_coord_offsetB;
1006 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1007 real rcutoff_scalar;
1008 real *shiftvec,*fshift,*x,*f;
1009 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1010 int vdwioffset0;
1011 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1012 int vdwioffset1;
1013 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1014 int vdwioffset2;
1015 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1016 int vdwjidx0A,vdwjidx0B;
1017 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1018 int vdwjidx1A,vdwjidx1B;
1019 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1020 int vdwjidx2A,vdwjidx2B;
1021 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1022 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1023 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1024 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1025 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1026 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1027 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1028 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1029 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1030 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1031 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1032 real *charge;
1033 int nvdwtype;
1034 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1035 int *vdwtype;
1036 real *vdwparam;
1037 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1038 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1039 __m128i vfitab;
1040 __m128i ifour = _mm_set1_epi32(4);
1041 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1042 real *vftab;
1043 __m128d dummy_mask,cutoff_mask;
1044 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1045 __m128d one = _mm_set1_pd(1.0);
1046 __m128d two = _mm_set1_pd(2.0);
1047 x = xx[0];
1048 f = ff[0];
1050 nri = nlist->nri;
1051 iinr = nlist->iinr;
1052 jindex = nlist->jindex;
1053 jjnr = nlist->jjnr;
1054 shiftidx = nlist->shift;
1055 gid = nlist->gid;
1056 shiftvec = fr->shift_vec[0];
1057 fshift = fr->fshift[0];
1058 facel = _mm_set1_pd(fr->epsfac);
1059 charge = mdatoms->chargeA;
1060 krf = _mm_set1_pd(fr->ic->k_rf);
1061 krf2 = _mm_set1_pd(fr->ic->k_rf*2.0);
1062 crf = _mm_set1_pd(fr->ic->c_rf);
1063 nvdwtype = fr->ntype;
1064 vdwparam = fr->nbfp;
1065 vdwtype = mdatoms->typeA;
1067 vftab = kernel_data->table_vdw->data;
1068 vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale);
1070 /* Setup water-specific parameters */
1071 inr = nlist->iinr[0];
1072 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1073 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1074 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1075 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1077 jq0 = _mm_set1_pd(charge[inr+0]);
1078 jq1 = _mm_set1_pd(charge[inr+1]);
1079 jq2 = _mm_set1_pd(charge[inr+2]);
1080 vdwjidx0A = 2*vdwtype[inr+0];
1081 qq00 = _mm_mul_pd(iq0,jq0);
1082 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1083 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1084 qq01 = _mm_mul_pd(iq0,jq1);
1085 qq02 = _mm_mul_pd(iq0,jq2);
1086 qq10 = _mm_mul_pd(iq1,jq0);
1087 qq11 = _mm_mul_pd(iq1,jq1);
1088 qq12 = _mm_mul_pd(iq1,jq2);
1089 qq20 = _mm_mul_pd(iq2,jq0);
1090 qq21 = _mm_mul_pd(iq2,jq1);
1091 qq22 = _mm_mul_pd(iq2,jq2);
1093 /* Avoid stupid compiler warnings */
1094 jnrA = jnrB = 0;
1095 j_coord_offsetA = 0;
1096 j_coord_offsetB = 0;
1098 outeriter = 0;
1099 inneriter = 0;
1101 /* Start outer loop over neighborlists */
1102 for(iidx=0; iidx<nri; iidx++)
1104 /* Load shift vector for this list */
1105 i_shift_offset = DIM*shiftidx[iidx];
1107 /* Load limits for loop over neighbors */
1108 j_index_start = jindex[iidx];
1109 j_index_end = jindex[iidx+1];
1111 /* Get outer coordinate index */
1112 inr = iinr[iidx];
1113 i_coord_offset = DIM*inr;
1115 /* Load i particle coords and add shift vector */
1116 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1117 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1119 fix0 = _mm_setzero_pd();
1120 fiy0 = _mm_setzero_pd();
1121 fiz0 = _mm_setzero_pd();
1122 fix1 = _mm_setzero_pd();
1123 fiy1 = _mm_setzero_pd();
1124 fiz1 = _mm_setzero_pd();
1125 fix2 = _mm_setzero_pd();
1126 fiy2 = _mm_setzero_pd();
1127 fiz2 = _mm_setzero_pd();
1129 /* Start inner kernel loop */
1130 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1133 /* Get j neighbor index, and coordinate index */
1134 jnrA = jjnr[jidx];
1135 jnrB = jjnr[jidx+1];
1136 j_coord_offsetA = DIM*jnrA;
1137 j_coord_offsetB = DIM*jnrB;
1139 /* load j atom coordinates */
1140 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1141 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1143 /* Calculate displacement vector */
1144 dx00 = _mm_sub_pd(ix0,jx0);
1145 dy00 = _mm_sub_pd(iy0,jy0);
1146 dz00 = _mm_sub_pd(iz0,jz0);
1147 dx01 = _mm_sub_pd(ix0,jx1);
1148 dy01 = _mm_sub_pd(iy0,jy1);
1149 dz01 = _mm_sub_pd(iz0,jz1);
1150 dx02 = _mm_sub_pd(ix0,jx2);
1151 dy02 = _mm_sub_pd(iy0,jy2);
1152 dz02 = _mm_sub_pd(iz0,jz2);
1153 dx10 = _mm_sub_pd(ix1,jx0);
1154 dy10 = _mm_sub_pd(iy1,jy0);
1155 dz10 = _mm_sub_pd(iz1,jz0);
1156 dx11 = _mm_sub_pd(ix1,jx1);
1157 dy11 = _mm_sub_pd(iy1,jy1);
1158 dz11 = _mm_sub_pd(iz1,jz1);
1159 dx12 = _mm_sub_pd(ix1,jx2);
1160 dy12 = _mm_sub_pd(iy1,jy2);
1161 dz12 = _mm_sub_pd(iz1,jz2);
1162 dx20 = _mm_sub_pd(ix2,jx0);
1163 dy20 = _mm_sub_pd(iy2,jy0);
1164 dz20 = _mm_sub_pd(iz2,jz0);
1165 dx21 = _mm_sub_pd(ix2,jx1);
1166 dy21 = _mm_sub_pd(iy2,jy1);
1167 dz21 = _mm_sub_pd(iz2,jz1);
1168 dx22 = _mm_sub_pd(ix2,jx2);
1169 dy22 = _mm_sub_pd(iy2,jy2);
1170 dz22 = _mm_sub_pd(iz2,jz2);
1172 /* Calculate squared distance and things based on it */
1173 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1174 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1175 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1176 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1177 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1178 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1179 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1180 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1181 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1183 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1184 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1185 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1186 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1187 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1188 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1189 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1190 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1191 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1193 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1194 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
1195 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
1196 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
1197 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1198 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1199 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
1200 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1201 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1203 fjx0 = _mm_setzero_pd();
1204 fjy0 = _mm_setzero_pd();
1205 fjz0 = _mm_setzero_pd();
1206 fjx1 = _mm_setzero_pd();
1207 fjy1 = _mm_setzero_pd();
1208 fjz1 = _mm_setzero_pd();
1209 fjx2 = _mm_setzero_pd();
1210 fjy2 = _mm_setzero_pd();
1211 fjz2 = _mm_setzero_pd();
1213 /**************************
1214 * CALCULATE INTERACTIONS *
1215 **************************/
1217 r00 = _mm_mul_pd(rsq00,rinv00);
1219 /* Calculate table index by multiplying r with table scale and truncate to integer */
1220 rt = _mm_mul_pd(r00,vftabscale);
1221 vfitab = _mm_cvttpd_epi32(rt);
1222 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1223 vfitab = _mm_slli_epi32(vfitab,3);
1225 /* REACTION-FIELD ELECTROSTATICS */
1226 felec = _mm_mul_pd(qq00,_mm_sub_pd(_mm_mul_pd(rinv00,rinvsq00),krf2));
1228 /* CUBIC SPLINE TABLE DISPERSION */
1229 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1230 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1231 GMX_MM_TRANSPOSE2_PD(Y,F);
1232 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1233 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1234 GMX_MM_TRANSPOSE2_PD(G,H);
1235 Heps = _mm_mul_pd(vfeps,H);
1236 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1237 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1238 fvdw6 = _mm_mul_pd(c6_00,FF);
1240 /* CUBIC SPLINE TABLE REPULSION */
1241 vfitab = _mm_add_epi32(vfitab,ifour);
1242 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1243 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1244 GMX_MM_TRANSPOSE2_PD(Y,F);
1245 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1246 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1247 GMX_MM_TRANSPOSE2_PD(G,H);
1248 Heps = _mm_mul_pd(vfeps,H);
1249 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1250 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1251 fvdw12 = _mm_mul_pd(c12_00,FF);
1252 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1254 fscal = _mm_add_pd(felec,fvdw);
1256 /* Calculate temporary vectorial force */
1257 tx = _mm_mul_pd(fscal,dx00);
1258 ty = _mm_mul_pd(fscal,dy00);
1259 tz = _mm_mul_pd(fscal,dz00);
1261 /* Update vectorial force */
1262 fix0 = _mm_add_pd(fix0,tx);
1263 fiy0 = _mm_add_pd(fiy0,ty);
1264 fiz0 = _mm_add_pd(fiz0,tz);
1266 fjx0 = _mm_add_pd(fjx0,tx);
1267 fjy0 = _mm_add_pd(fjy0,ty);
1268 fjz0 = _mm_add_pd(fjz0,tz);
1270 /**************************
1271 * CALCULATE INTERACTIONS *
1272 **************************/
1274 /* REACTION-FIELD ELECTROSTATICS */
1275 felec = _mm_mul_pd(qq01,_mm_sub_pd(_mm_mul_pd(rinv01,rinvsq01),krf2));
1277 fscal = felec;
1279 /* Calculate temporary vectorial force */
1280 tx = _mm_mul_pd(fscal,dx01);
1281 ty = _mm_mul_pd(fscal,dy01);
1282 tz = _mm_mul_pd(fscal,dz01);
1284 /* Update vectorial force */
1285 fix0 = _mm_add_pd(fix0,tx);
1286 fiy0 = _mm_add_pd(fiy0,ty);
1287 fiz0 = _mm_add_pd(fiz0,tz);
1289 fjx1 = _mm_add_pd(fjx1,tx);
1290 fjy1 = _mm_add_pd(fjy1,ty);
1291 fjz1 = _mm_add_pd(fjz1,tz);
1293 /**************************
1294 * CALCULATE INTERACTIONS *
1295 **************************/
1297 /* REACTION-FIELD ELECTROSTATICS */
1298 felec = _mm_mul_pd(qq02,_mm_sub_pd(_mm_mul_pd(rinv02,rinvsq02),krf2));
1300 fscal = felec;
1302 /* Calculate temporary vectorial force */
1303 tx = _mm_mul_pd(fscal,dx02);
1304 ty = _mm_mul_pd(fscal,dy02);
1305 tz = _mm_mul_pd(fscal,dz02);
1307 /* Update vectorial force */
1308 fix0 = _mm_add_pd(fix0,tx);
1309 fiy0 = _mm_add_pd(fiy0,ty);
1310 fiz0 = _mm_add_pd(fiz0,tz);
1312 fjx2 = _mm_add_pd(fjx2,tx);
1313 fjy2 = _mm_add_pd(fjy2,ty);
1314 fjz2 = _mm_add_pd(fjz2,tz);
1316 /**************************
1317 * CALCULATE INTERACTIONS *
1318 **************************/
1320 /* REACTION-FIELD ELECTROSTATICS */
1321 felec = _mm_mul_pd(qq10,_mm_sub_pd(_mm_mul_pd(rinv10,rinvsq10),krf2));
1323 fscal = felec;
1325 /* Calculate temporary vectorial force */
1326 tx = _mm_mul_pd(fscal,dx10);
1327 ty = _mm_mul_pd(fscal,dy10);
1328 tz = _mm_mul_pd(fscal,dz10);
1330 /* Update vectorial force */
1331 fix1 = _mm_add_pd(fix1,tx);
1332 fiy1 = _mm_add_pd(fiy1,ty);
1333 fiz1 = _mm_add_pd(fiz1,tz);
1335 fjx0 = _mm_add_pd(fjx0,tx);
1336 fjy0 = _mm_add_pd(fjy0,ty);
1337 fjz0 = _mm_add_pd(fjz0,tz);
1339 /**************************
1340 * CALCULATE INTERACTIONS *
1341 **************************/
1343 /* REACTION-FIELD ELECTROSTATICS */
1344 felec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
1346 fscal = felec;
1348 /* Calculate temporary vectorial force */
1349 tx = _mm_mul_pd(fscal,dx11);
1350 ty = _mm_mul_pd(fscal,dy11);
1351 tz = _mm_mul_pd(fscal,dz11);
1353 /* Update vectorial force */
1354 fix1 = _mm_add_pd(fix1,tx);
1355 fiy1 = _mm_add_pd(fiy1,ty);
1356 fiz1 = _mm_add_pd(fiz1,tz);
1358 fjx1 = _mm_add_pd(fjx1,tx);
1359 fjy1 = _mm_add_pd(fjy1,ty);
1360 fjz1 = _mm_add_pd(fjz1,tz);
1362 /**************************
1363 * CALCULATE INTERACTIONS *
1364 **************************/
1366 /* REACTION-FIELD ELECTROSTATICS */
1367 felec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
1369 fscal = felec;
1371 /* Calculate temporary vectorial force */
1372 tx = _mm_mul_pd(fscal,dx12);
1373 ty = _mm_mul_pd(fscal,dy12);
1374 tz = _mm_mul_pd(fscal,dz12);
1376 /* Update vectorial force */
1377 fix1 = _mm_add_pd(fix1,tx);
1378 fiy1 = _mm_add_pd(fiy1,ty);
1379 fiz1 = _mm_add_pd(fiz1,tz);
1381 fjx2 = _mm_add_pd(fjx2,tx);
1382 fjy2 = _mm_add_pd(fjy2,ty);
1383 fjz2 = _mm_add_pd(fjz2,tz);
1385 /**************************
1386 * CALCULATE INTERACTIONS *
1387 **************************/
1389 /* REACTION-FIELD ELECTROSTATICS */
1390 felec = _mm_mul_pd(qq20,_mm_sub_pd(_mm_mul_pd(rinv20,rinvsq20),krf2));
1392 fscal = felec;
1394 /* Calculate temporary vectorial force */
1395 tx = _mm_mul_pd(fscal,dx20);
1396 ty = _mm_mul_pd(fscal,dy20);
1397 tz = _mm_mul_pd(fscal,dz20);
1399 /* Update vectorial force */
1400 fix2 = _mm_add_pd(fix2,tx);
1401 fiy2 = _mm_add_pd(fiy2,ty);
1402 fiz2 = _mm_add_pd(fiz2,tz);
1404 fjx0 = _mm_add_pd(fjx0,tx);
1405 fjy0 = _mm_add_pd(fjy0,ty);
1406 fjz0 = _mm_add_pd(fjz0,tz);
1408 /**************************
1409 * CALCULATE INTERACTIONS *
1410 **************************/
1412 /* REACTION-FIELD ELECTROSTATICS */
1413 felec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
1415 fscal = felec;
1417 /* Calculate temporary vectorial force */
1418 tx = _mm_mul_pd(fscal,dx21);
1419 ty = _mm_mul_pd(fscal,dy21);
1420 tz = _mm_mul_pd(fscal,dz21);
1422 /* Update vectorial force */
1423 fix2 = _mm_add_pd(fix2,tx);
1424 fiy2 = _mm_add_pd(fiy2,ty);
1425 fiz2 = _mm_add_pd(fiz2,tz);
1427 fjx1 = _mm_add_pd(fjx1,tx);
1428 fjy1 = _mm_add_pd(fjy1,ty);
1429 fjz1 = _mm_add_pd(fjz1,tz);
1431 /**************************
1432 * CALCULATE INTERACTIONS *
1433 **************************/
1435 /* REACTION-FIELD ELECTROSTATICS */
1436 felec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
1438 fscal = felec;
1440 /* Calculate temporary vectorial force */
1441 tx = _mm_mul_pd(fscal,dx22);
1442 ty = _mm_mul_pd(fscal,dy22);
1443 tz = _mm_mul_pd(fscal,dz22);
1445 /* Update vectorial force */
1446 fix2 = _mm_add_pd(fix2,tx);
1447 fiy2 = _mm_add_pd(fiy2,ty);
1448 fiz2 = _mm_add_pd(fiz2,tz);
1450 fjx2 = _mm_add_pd(fjx2,tx);
1451 fjy2 = _mm_add_pd(fjy2,ty);
1452 fjz2 = _mm_add_pd(fjz2,tz);
1454 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1456 /* Inner loop uses 270 flops */
1459 if(jidx<j_index_end)
1462 jnrA = jjnr[jidx];
1463 j_coord_offsetA = DIM*jnrA;
1465 /* load j atom coordinates */
1466 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1467 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1469 /* Calculate displacement vector */
1470 dx00 = _mm_sub_pd(ix0,jx0);
1471 dy00 = _mm_sub_pd(iy0,jy0);
1472 dz00 = _mm_sub_pd(iz0,jz0);
1473 dx01 = _mm_sub_pd(ix0,jx1);
1474 dy01 = _mm_sub_pd(iy0,jy1);
1475 dz01 = _mm_sub_pd(iz0,jz1);
1476 dx02 = _mm_sub_pd(ix0,jx2);
1477 dy02 = _mm_sub_pd(iy0,jy2);
1478 dz02 = _mm_sub_pd(iz0,jz2);
1479 dx10 = _mm_sub_pd(ix1,jx0);
1480 dy10 = _mm_sub_pd(iy1,jy0);
1481 dz10 = _mm_sub_pd(iz1,jz0);
1482 dx11 = _mm_sub_pd(ix1,jx1);
1483 dy11 = _mm_sub_pd(iy1,jy1);
1484 dz11 = _mm_sub_pd(iz1,jz1);
1485 dx12 = _mm_sub_pd(ix1,jx2);
1486 dy12 = _mm_sub_pd(iy1,jy2);
1487 dz12 = _mm_sub_pd(iz1,jz2);
1488 dx20 = _mm_sub_pd(ix2,jx0);
1489 dy20 = _mm_sub_pd(iy2,jy0);
1490 dz20 = _mm_sub_pd(iz2,jz0);
1491 dx21 = _mm_sub_pd(ix2,jx1);
1492 dy21 = _mm_sub_pd(iy2,jy1);
1493 dz21 = _mm_sub_pd(iz2,jz1);
1494 dx22 = _mm_sub_pd(ix2,jx2);
1495 dy22 = _mm_sub_pd(iy2,jy2);
1496 dz22 = _mm_sub_pd(iz2,jz2);
1498 /* Calculate squared distance and things based on it */
1499 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1500 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1501 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1502 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1503 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1504 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1505 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1506 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1507 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1509 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1510 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1511 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1512 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1513 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1514 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1515 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1516 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1517 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1519 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1520 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
1521 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
1522 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
1523 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1524 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1525 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
1526 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1527 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1529 fjx0 = _mm_setzero_pd();
1530 fjy0 = _mm_setzero_pd();
1531 fjz0 = _mm_setzero_pd();
1532 fjx1 = _mm_setzero_pd();
1533 fjy1 = _mm_setzero_pd();
1534 fjz1 = _mm_setzero_pd();
1535 fjx2 = _mm_setzero_pd();
1536 fjy2 = _mm_setzero_pd();
1537 fjz2 = _mm_setzero_pd();
1539 /**************************
1540 * CALCULATE INTERACTIONS *
1541 **************************/
1543 r00 = _mm_mul_pd(rsq00,rinv00);
1545 /* Calculate table index by multiplying r with table scale and truncate to integer */
1546 rt = _mm_mul_pd(r00,vftabscale);
1547 vfitab = _mm_cvttpd_epi32(rt);
1548 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1549 vfitab = _mm_slli_epi32(vfitab,3);
1551 /* REACTION-FIELD ELECTROSTATICS */
1552 felec = _mm_mul_pd(qq00,_mm_sub_pd(_mm_mul_pd(rinv00,rinvsq00),krf2));
1554 /* CUBIC SPLINE TABLE DISPERSION */
1555 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1556 F = _mm_setzero_pd();
1557 GMX_MM_TRANSPOSE2_PD(Y,F);
1558 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1559 H = _mm_setzero_pd();
1560 GMX_MM_TRANSPOSE2_PD(G,H);
1561 Heps = _mm_mul_pd(vfeps,H);
1562 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1563 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1564 fvdw6 = _mm_mul_pd(c6_00,FF);
1566 /* CUBIC SPLINE TABLE REPULSION */
1567 vfitab = _mm_add_epi32(vfitab,ifour);
1568 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1569 F = _mm_setzero_pd();
1570 GMX_MM_TRANSPOSE2_PD(Y,F);
1571 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1572 H = _mm_setzero_pd();
1573 GMX_MM_TRANSPOSE2_PD(G,H);
1574 Heps = _mm_mul_pd(vfeps,H);
1575 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1576 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1577 fvdw12 = _mm_mul_pd(c12_00,FF);
1578 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1580 fscal = _mm_add_pd(felec,fvdw);
1582 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1584 /* Calculate temporary vectorial force */
1585 tx = _mm_mul_pd(fscal,dx00);
1586 ty = _mm_mul_pd(fscal,dy00);
1587 tz = _mm_mul_pd(fscal,dz00);
1589 /* Update vectorial force */
1590 fix0 = _mm_add_pd(fix0,tx);
1591 fiy0 = _mm_add_pd(fiy0,ty);
1592 fiz0 = _mm_add_pd(fiz0,tz);
1594 fjx0 = _mm_add_pd(fjx0,tx);
1595 fjy0 = _mm_add_pd(fjy0,ty);
1596 fjz0 = _mm_add_pd(fjz0,tz);
1598 /**************************
1599 * CALCULATE INTERACTIONS *
1600 **************************/
1602 /* REACTION-FIELD ELECTROSTATICS */
1603 felec = _mm_mul_pd(qq01,_mm_sub_pd(_mm_mul_pd(rinv01,rinvsq01),krf2));
1605 fscal = felec;
1607 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1609 /* Calculate temporary vectorial force */
1610 tx = _mm_mul_pd(fscal,dx01);
1611 ty = _mm_mul_pd(fscal,dy01);
1612 tz = _mm_mul_pd(fscal,dz01);
1614 /* Update vectorial force */
1615 fix0 = _mm_add_pd(fix0,tx);
1616 fiy0 = _mm_add_pd(fiy0,ty);
1617 fiz0 = _mm_add_pd(fiz0,tz);
1619 fjx1 = _mm_add_pd(fjx1,tx);
1620 fjy1 = _mm_add_pd(fjy1,ty);
1621 fjz1 = _mm_add_pd(fjz1,tz);
1623 /**************************
1624 * CALCULATE INTERACTIONS *
1625 **************************/
1627 /* REACTION-FIELD ELECTROSTATICS */
1628 felec = _mm_mul_pd(qq02,_mm_sub_pd(_mm_mul_pd(rinv02,rinvsq02),krf2));
1630 fscal = felec;
1632 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1634 /* Calculate temporary vectorial force */
1635 tx = _mm_mul_pd(fscal,dx02);
1636 ty = _mm_mul_pd(fscal,dy02);
1637 tz = _mm_mul_pd(fscal,dz02);
1639 /* Update vectorial force */
1640 fix0 = _mm_add_pd(fix0,tx);
1641 fiy0 = _mm_add_pd(fiy0,ty);
1642 fiz0 = _mm_add_pd(fiz0,tz);
1644 fjx2 = _mm_add_pd(fjx2,tx);
1645 fjy2 = _mm_add_pd(fjy2,ty);
1646 fjz2 = _mm_add_pd(fjz2,tz);
1648 /**************************
1649 * CALCULATE INTERACTIONS *
1650 **************************/
1652 /* REACTION-FIELD ELECTROSTATICS */
1653 felec = _mm_mul_pd(qq10,_mm_sub_pd(_mm_mul_pd(rinv10,rinvsq10),krf2));
1655 fscal = felec;
1657 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1659 /* Calculate temporary vectorial force */
1660 tx = _mm_mul_pd(fscal,dx10);
1661 ty = _mm_mul_pd(fscal,dy10);
1662 tz = _mm_mul_pd(fscal,dz10);
1664 /* Update vectorial force */
1665 fix1 = _mm_add_pd(fix1,tx);
1666 fiy1 = _mm_add_pd(fiy1,ty);
1667 fiz1 = _mm_add_pd(fiz1,tz);
1669 fjx0 = _mm_add_pd(fjx0,tx);
1670 fjy0 = _mm_add_pd(fjy0,ty);
1671 fjz0 = _mm_add_pd(fjz0,tz);
1673 /**************************
1674 * CALCULATE INTERACTIONS *
1675 **************************/
1677 /* REACTION-FIELD ELECTROSTATICS */
1678 felec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
1680 fscal = felec;
1682 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1684 /* Calculate temporary vectorial force */
1685 tx = _mm_mul_pd(fscal,dx11);
1686 ty = _mm_mul_pd(fscal,dy11);
1687 tz = _mm_mul_pd(fscal,dz11);
1689 /* Update vectorial force */
1690 fix1 = _mm_add_pd(fix1,tx);
1691 fiy1 = _mm_add_pd(fiy1,ty);
1692 fiz1 = _mm_add_pd(fiz1,tz);
1694 fjx1 = _mm_add_pd(fjx1,tx);
1695 fjy1 = _mm_add_pd(fjy1,ty);
1696 fjz1 = _mm_add_pd(fjz1,tz);
1698 /**************************
1699 * CALCULATE INTERACTIONS *
1700 **************************/
1702 /* REACTION-FIELD ELECTROSTATICS */
1703 felec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
1705 fscal = felec;
1707 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1709 /* Calculate temporary vectorial force */
1710 tx = _mm_mul_pd(fscal,dx12);
1711 ty = _mm_mul_pd(fscal,dy12);
1712 tz = _mm_mul_pd(fscal,dz12);
1714 /* Update vectorial force */
1715 fix1 = _mm_add_pd(fix1,tx);
1716 fiy1 = _mm_add_pd(fiy1,ty);
1717 fiz1 = _mm_add_pd(fiz1,tz);
1719 fjx2 = _mm_add_pd(fjx2,tx);
1720 fjy2 = _mm_add_pd(fjy2,ty);
1721 fjz2 = _mm_add_pd(fjz2,tz);
1723 /**************************
1724 * CALCULATE INTERACTIONS *
1725 **************************/
1727 /* REACTION-FIELD ELECTROSTATICS */
1728 felec = _mm_mul_pd(qq20,_mm_sub_pd(_mm_mul_pd(rinv20,rinvsq20),krf2));
1730 fscal = felec;
1732 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1734 /* Calculate temporary vectorial force */
1735 tx = _mm_mul_pd(fscal,dx20);
1736 ty = _mm_mul_pd(fscal,dy20);
1737 tz = _mm_mul_pd(fscal,dz20);
1739 /* Update vectorial force */
1740 fix2 = _mm_add_pd(fix2,tx);
1741 fiy2 = _mm_add_pd(fiy2,ty);
1742 fiz2 = _mm_add_pd(fiz2,tz);
1744 fjx0 = _mm_add_pd(fjx0,tx);
1745 fjy0 = _mm_add_pd(fjy0,ty);
1746 fjz0 = _mm_add_pd(fjz0,tz);
1748 /**************************
1749 * CALCULATE INTERACTIONS *
1750 **************************/
1752 /* REACTION-FIELD ELECTROSTATICS */
1753 felec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
1755 fscal = felec;
1757 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1759 /* Calculate temporary vectorial force */
1760 tx = _mm_mul_pd(fscal,dx21);
1761 ty = _mm_mul_pd(fscal,dy21);
1762 tz = _mm_mul_pd(fscal,dz21);
1764 /* Update vectorial force */
1765 fix2 = _mm_add_pd(fix2,tx);
1766 fiy2 = _mm_add_pd(fiy2,ty);
1767 fiz2 = _mm_add_pd(fiz2,tz);
1769 fjx1 = _mm_add_pd(fjx1,tx);
1770 fjy1 = _mm_add_pd(fjy1,ty);
1771 fjz1 = _mm_add_pd(fjz1,tz);
1773 /**************************
1774 * CALCULATE INTERACTIONS *
1775 **************************/
1777 /* REACTION-FIELD ELECTROSTATICS */
1778 felec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
1780 fscal = felec;
1782 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1784 /* Calculate temporary vectorial force */
1785 tx = _mm_mul_pd(fscal,dx22);
1786 ty = _mm_mul_pd(fscal,dy22);
1787 tz = _mm_mul_pd(fscal,dz22);
1789 /* Update vectorial force */
1790 fix2 = _mm_add_pd(fix2,tx);
1791 fiy2 = _mm_add_pd(fiy2,ty);
1792 fiz2 = _mm_add_pd(fiz2,tz);
1794 fjx2 = _mm_add_pd(fjx2,tx);
1795 fjy2 = _mm_add_pd(fjy2,ty);
1796 fjz2 = _mm_add_pd(fjz2,tz);
1798 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1800 /* Inner loop uses 270 flops */
1803 /* End of innermost loop */
1805 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1806 f+i_coord_offset,fshift+i_shift_offset);
1808 /* Increment number of inner iterations */
1809 inneriter += j_index_end - j_index_start;
1811 /* Outer loop uses 18 flops */
1814 /* Increment number of outer iterations */
1815 outeriter += nri;
1817 /* Update outer/inner flops */
1819 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*270);