Double precision SSE2 kernels
[gromacs.git] / src / gmxlib / nonbonded / nb_kernel_sse2_double / nb_kernel_ElecCoul_VdwLJ_GeomW4W4_sse2_double.c
blob314ceccb779d2fdbc0d6ce3a89d66d6f7c0777d9
/*
 * Note: this file was generated by the Gromacs sse2_double kernel generator.
 *
 *                This source code is part of
 *
 *                 G   R   O   M   A   C   S
 *
 * Copyright (c) 2001-2012, The GROMACS Development Team
 *
 * Gromacs is a library for molecular simulation and trajectory analysis,
 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 * a full list of developers and information, check out http://www.gromacs.org
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option) any
 * later version.
 *
 * To help fund GROMACS development, we humbly ask that you cite
 * the papers people have written on it - you can find them on the website.
 */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
26 #include <math.h>
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
33 #include "gmx_math_x86_sse2_double.h"
34 #include "kernelutil_x86_sse2_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sse2_double
38 * Electrostatics interaction: Coulomb
39 * VdW interaction: LennardJones
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
43 void
44 nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sse2_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB;
61 int j_coord_offsetA,j_coord_offsetB;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
63 real rcutoff_scalar;
64 real *shiftvec,*fshift,*x,*f;
65 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
66 int vdwioffset0;
67 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
68 int vdwioffset1;
69 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
70 int vdwioffset2;
71 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
72 int vdwioffset3;
73 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
74 int vdwjidx0A,vdwjidx0B;
75 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
76 int vdwjidx1A,vdwjidx1B;
77 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
78 int vdwjidx2A,vdwjidx2B;
79 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
80 int vdwjidx3A,vdwjidx3B;
81 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
82 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
83 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
84 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
85 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
86 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
87 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
88 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
89 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
90 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
91 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
92 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
93 real *charge;
94 int nvdwtype;
95 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
96 int *vdwtype;
97 real *vdwparam;
98 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
99 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
100 __m128d dummy_mask,cutoff_mask;
101 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
102 __m128d one = _mm_set1_pd(1.0);
103 __m128d two = _mm_set1_pd(2.0);
104 x = xx[0];
105 f = ff[0];
107 nri = nlist->nri;
108 iinr = nlist->iinr;
109 jindex = nlist->jindex;
110 jjnr = nlist->jjnr;
111 shiftidx = nlist->shift;
112 gid = nlist->gid;
113 shiftvec = fr->shift_vec[0];
114 fshift = fr->fshift[0];
115 facel = _mm_set1_pd(fr->epsfac);
116 charge = mdatoms->chargeA;
117 nvdwtype = fr->ntype;
118 vdwparam = fr->nbfp;
119 vdwtype = mdatoms->typeA;
121 /* Setup water-specific parameters */
122 inr = nlist->iinr[0];
123 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
124 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
125 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
126 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
128 jq1 = _mm_set1_pd(charge[inr+1]);
129 jq2 = _mm_set1_pd(charge[inr+2]);
130 jq3 = _mm_set1_pd(charge[inr+3]);
131 vdwjidx0A = 2*vdwtype[inr+0];
132 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
133 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
134 qq11 = _mm_mul_pd(iq1,jq1);
135 qq12 = _mm_mul_pd(iq1,jq2);
136 qq13 = _mm_mul_pd(iq1,jq3);
137 qq21 = _mm_mul_pd(iq2,jq1);
138 qq22 = _mm_mul_pd(iq2,jq2);
139 qq23 = _mm_mul_pd(iq2,jq3);
140 qq31 = _mm_mul_pd(iq3,jq1);
141 qq32 = _mm_mul_pd(iq3,jq2);
142 qq33 = _mm_mul_pd(iq3,jq3);
144 /* Avoid stupid compiler warnings */
145 jnrA = jnrB = 0;
146 j_coord_offsetA = 0;
147 j_coord_offsetB = 0;
149 outeriter = 0;
150 inneriter = 0;
152 /* Start outer loop over neighborlists */
153 for(iidx=0; iidx<nri; iidx++)
155 /* Load shift vector for this list */
156 i_shift_offset = DIM*shiftidx[iidx];
158 /* Load limits for loop over neighbors */
159 j_index_start = jindex[iidx];
160 j_index_end = jindex[iidx+1];
162 /* Get outer coordinate index */
163 inr = iinr[iidx];
164 i_coord_offset = DIM*inr;
166 /* Load i particle coords and add shift vector */
167 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
168 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
170 fix0 = _mm_setzero_pd();
171 fiy0 = _mm_setzero_pd();
172 fiz0 = _mm_setzero_pd();
173 fix1 = _mm_setzero_pd();
174 fiy1 = _mm_setzero_pd();
175 fiz1 = _mm_setzero_pd();
176 fix2 = _mm_setzero_pd();
177 fiy2 = _mm_setzero_pd();
178 fiz2 = _mm_setzero_pd();
179 fix3 = _mm_setzero_pd();
180 fiy3 = _mm_setzero_pd();
181 fiz3 = _mm_setzero_pd();
183 /* Reset potential sums */
184 velecsum = _mm_setzero_pd();
185 vvdwsum = _mm_setzero_pd();
187 /* Start inner kernel loop */
188 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
191 /* Get j neighbor index, and coordinate index */
192 jnrA = jjnr[jidx];
193 jnrB = jjnr[jidx+1];
194 j_coord_offsetA = DIM*jnrA;
195 j_coord_offsetB = DIM*jnrB;
197 /* load j atom coordinates */
198 gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
199 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
200 &jy2,&jz2,&jx3,&jy3,&jz3);
202 /* Calculate displacement vector */
203 dx00 = _mm_sub_pd(ix0,jx0);
204 dy00 = _mm_sub_pd(iy0,jy0);
205 dz00 = _mm_sub_pd(iz0,jz0);
206 dx11 = _mm_sub_pd(ix1,jx1);
207 dy11 = _mm_sub_pd(iy1,jy1);
208 dz11 = _mm_sub_pd(iz1,jz1);
209 dx12 = _mm_sub_pd(ix1,jx2);
210 dy12 = _mm_sub_pd(iy1,jy2);
211 dz12 = _mm_sub_pd(iz1,jz2);
212 dx13 = _mm_sub_pd(ix1,jx3);
213 dy13 = _mm_sub_pd(iy1,jy3);
214 dz13 = _mm_sub_pd(iz1,jz3);
215 dx21 = _mm_sub_pd(ix2,jx1);
216 dy21 = _mm_sub_pd(iy2,jy1);
217 dz21 = _mm_sub_pd(iz2,jz1);
218 dx22 = _mm_sub_pd(ix2,jx2);
219 dy22 = _mm_sub_pd(iy2,jy2);
220 dz22 = _mm_sub_pd(iz2,jz2);
221 dx23 = _mm_sub_pd(ix2,jx3);
222 dy23 = _mm_sub_pd(iy2,jy3);
223 dz23 = _mm_sub_pd(iz2,jz3);
224 dx31 = _mm_sub_pd(ix3,jx1);
225 dy31 = _mm_sub_pd(iy3,jy1);
226 dz31 = _mm_sub_pd(iz3,jz1);
227 dx32 = _mm_sub_pd(ix3,jx2);
228 dy32 = _mm_sub_pd(iy3,jy2);
229 dz32 = _mm_sub_pd(iz3,jz2);
230 dx33 = _mm_sub_pd(ix3,jx3);
231 dy33 = _mm_sub_pd(iy3,jy3);
232 dz33 = _mm_sub_pd(iz3,jz3);
234 /* Calculate squared distance and things based on it */
235 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
236 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
237 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
238 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
239 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
240 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
241 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
242 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
243 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
244 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
246 rinv11 = gmx_mm_invsqrt_pd(rsq11);
247 rinv12 = gmx_mm_invsqrt_pd(rsq12);
248 rinv13 = gmx_mm_invsqrt_pd(rsq13);
249 rinv21 = gmx_mm_invsqrt_pd(rsq21);
250 rinv22 = gmx_mm_invsqrt_pd(rsq22);
251 rinv23 = gmx_mm_invsqrt_pd(rsq23);
252 rinv31 = gmx_mm_invsqrt_pd(rsq31);
253 rinv32 = gmx_mm_invsqrt_pd(rsq32);
254 rinv33 = gmx_mm_invsqrt_pd(rsq33);
256 rinvsq00 = gmx_mm_inv_pd(rsq00);
257 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
258 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
259 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
260 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
261 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
262 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
263 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
264 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
265 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
267 fjx0 = _mm_setzero_pd();
268 fjy0 = _mm_setzero_pd();
269 fjz0 = _mm_setzero_pd();
270 fjx1 = _mm_setzero_pd();
271 fjy1 = _mm_setzero_pd();
272 fjz1 = _mm_setzero_pd();
273 fjx2 = _mm_setzero_pd();
274 fjy2 = _mm_setzero_pd();
275 fjz2 = _mm_setzero_pd();
276 fjx3 = _mm_setzero_pd();
277 fjy3 = _mm_setzero_pd();
278 fjz3 = _mm_setzero_pd();
280 /**************************
281 * CALCULATE INTERACTIONS *
282 **************************/
284 /* LENNARD-JONES DISPERSION/REPULSION */
286 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
287 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
288 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
289 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
290 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
292 /* Update potential sum for this i atom from the interaction with this j atom. */
293 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
295 fscal = fvdw;
297 /* Calculate temporary vectorial force */
298 tx = _mm_mul_pd(fscal,dx00);
299 ty = _mm_mul_pd(fscal,dy00);
300 tz = _mm_mul_pd(fscal,dz00);
302 /* Update vectorial force */
303 fix0 = _mm_add_pd(fix0,tx);
304 fiy0 = _mm_add_pd(fiy0,ty);
305 fiz0 = _mm_add_pd(fiz0,tz);
307 fjx0 = _mm_add_pd(fjx0,tx);
308 fjy0 = _mm_add_pd(fjy0,ty);
309 fjz0 = _mm_add_pd(fjz0,tz);
311 /**************************
312 * CALCULATE INTERACTIONS *
313 **************************/
315 /* COULOMB ELECTROSTATICS */
316 velec = _mm_mul_pd(qq11,rinv11);
317 felec = _mm_mul_pd(velec,rinvsq11);
319 /* Update potential sum for this i atom from the interaction with this j atom. */
320 velecsum = _mm_add_pd(velecsum,velec);
322 fscal = felec;
324 /* Calculate temporary vectorial force */
325 tx = _mm_mul_pd(fscal,dx11);
326 ty = _mm_mul_pd(fscal,dy11);
327 tz = _mm_mul_pd(fscal,dz11);
329 /* Update vectorial force */
330 fix1 = _mm_add_pd(fix1,tx);
331 fiy1 = _mm_add_pd(fiy1,ty);
332 fiz1 = _mm_add_pd(fiz1,tz);
334 fjx1 = _mm_add_pd(fjx1,tx);
335 fjy1 = _mm_add_pd(fjy1,ty);
336 fjz1 = _mm_add_pd(fjz1,tz);
338 /**************************
339 * CALCULATE INTERACTIONS *
340 **************************/
342 /* COULOMB ELECTROSTATICS */
343 velec = _mm_mul_pd(qq12,rinv12);
344 felec = _mm_mul_pd(velec,rinvsq12);
346 /* Update potential sum for this i atom from the interaction with this j atom. */
347 velecsum = _mm_add_pd(velecsum,velec);
349 fscal = felec;
351 /* Calculate temporary vectorial force */
352 tx = _mm_mul_pd(fscal,dx12);
353 ty = _mm_mul_pd(fscal,dy12);
354 tz = _mm_mul_pd(fscal,dz12);
356 /* Update vectorial force */
357 fix1 = _mm_add_pd(fix1,tx);
358 fiy1 = _mm_add_pd(fiy1,ty);
359 fiz1 = _mm_add_pd(fiz1,tz);
361 fjx2 = _mm_add_pd(fjx2,tx);
362 fjy2 = _mm_add_pd(fjy2,ty);
363 fjz2 = _mm_add_pd(fjz2,tz);
365 /**************************
366 * CALCULATE INTERACTIONS *
367 **************************/
369 /* COULOMB ELECTROSTATICS */
370 velec = _mm_mul_pd(qq13,rinv13);
371 felec = _mm_mul_pd(velec,rinvsq13);
373 /* Update potential sum for this i atom from the interaction with this j atom. */
374 velecsum = _mm_add_pd(velecsum,velec);
376 fscal = felec;
378 /* Calculate temporary vectorial force */
379 tx = _mm_mul_pd(fscal,dx13);
380 ty = _mm_mul_pd(fscal,dy13);
381 tz = _mm_mul_pd(fscal,dz13);
383 /* Update vectorial force */
384 fix1 = _mm_add_pd(fix1,tx);
385 fiy1 = _mm_add_pd(fiy1,ty);
386 fiz1 = _mm_add_pd(fiz1,tz);
388 fjx3 = _mm_add_pd(fjx3,tx);
389 fjy3 = _mm_add_pd(fjy3,ty);
390 fjz3 = _mm_add_pd(fjz3,tz);
392 /**************************
393 * CALCULATE INTERACTIONS *
394 **************************/
396 /* COULOMB ELECTROSTATICS */
397 velec = _mm_mul_pd(qq21,rinv21);
398 felec = _mm_mul_pd(velec,rinvsq21);
400 /* Update potential sum for this i atom from the interaction with this j atom. */
401 velecsum = _mm_add_pd(velecsum,velec);
403 fscal = felec;
405 /* Calculate temporary vectorial force */
406 tx = _mm_mul_pd(fscal,dx21);
407 ty = _mm_mul_pd(fscal,dy21);
408 tz = _mm_mul_pd(fscal,dz21);
410 /* Update vectorial force */
411 fix2 = _mm_add_pd(fix2,tx);
412 fiy2 = _mm_add_pd(fiy2,ty);
413 fiz2 = _mm_add_pd(fiz2,tz);
415 fjx1 = _mm_add_pd(fjx1,tx);
416 fjy1 = _mm_add_pd(fjy1,ty);
417 fjz1 = _mm_add_pd(fjz1,tz);
419 /**************************
420 * CALCULATE INTERACTIONS *
421 **************************/
423 /* COULOMB ELECTROSTATICS */
424 velec = _mm_mul_pd(qq22,rinv22);
425 felec = _mm_mul_pd(velec,rinvsq22);
427 /* Update potential sum for this i atom from the interaction with this j atom. */
428 velecsum = _mm_add_pd(velecsum,velec);
430 fscal = felec;
432 /* Calculate temporary vectorial force */
433 tx = _mm_mul_pd(fscal,dx22);
434 ty = _mm_mul_pd(fscal,dy22);
435 tz = _mm_mul_pd(fscal,dz22);
437 /* Update vectorial force */
438 fix2 = _mm_add_pd(fix2,tx);
439 fiy2 = _mm_add_pd(fiy2,ty);
440 fiz2 = _mm_add_pd(fiz2,tz);
442 fjx2 = _mm_add_pd(fjx2,tx);
443 fjy2 = _mm_add_pd(fjy2,ty);
444 fjz2 = _mm_add_pd(fjz2,tz);
446 /**************************
447 * CALCULATE INTERACTIONS *
448 **************************/
450 /* COULOMB ELECTROSTATICS */
451 velec = _mm_mul_pd(qq23,rinv23);
452 felec = _mm_mul_pd(velec,rinvsq23);
454 /* Update potential sum for this i atom from the interaction with this j atom. */
455 velecsum = _mm_add_pd(velecsum,velec);
457 fscal = felec;
459 /* Calculate temporary vectorial force */
460 tx = _mm_mul_pd(fscal,dx23);
461 ty = _mm_mul_pd(fscal,dy23);
462 tz = _mm_mul_pd(fscal,dz23);
464 /* Update vectorial force */
465 fix2 = _mm_add_pd(fix2,tx);
466 fiy2 = _mm_add_pd(fiy2,ty);
467 fiz2 = _mm_add_pd(fiz2,tz);
469 fjx3 = _mm_add_pd(fjx3,tx);
470 fjy3 = _mm_add_pd(fjy3,ty);
471 fjz3 = _mm_add_pd(fjz3,tz);
473 /**************************
474 * CALCULATE INTERACTIONS *
475 **************************/
477 /* COULOMB ELECTROSTATICS */
478 velec = _mm_mul_pd(qq31,rinv31);
479 felec = _mm_mul_pd(velec,rinvsq31);
481 /* Update potential sum for this i atom from the interaction with this j atom. */
482 velecsum = _mm_add_pd(velecsum,velec);
484 fscal = felec;
486 /* Calculate temporary vectorial force */
487 tx = _mm_mul_pd(fscal,dx31);
488 ty = _mm_mul_pd(fscal,dy31);
489 tz = _mm_mul_pd(fscal,dz31);
491 /* Update vectorial force */
492 fix3 = _mm_add_pd(fix3,tx);
493 fiy3 = _mm_add_pd(fiy3,ty);
494 fiz3 = _mm_add_pd(fiz3,tz);
496 fjx1 = _mm_add_pd(fjx1,tx);
497 fjy1 = _mm_add_pd(fjy1,ty);
498 fjz1 = _mm_add_pd(fjz1,tz);
500 /**************************
501 * CALCULATE INTERACTIONS *
502 **************************/
504 /* COULOMB ELECTROSTATICS */
505 velec = _mm_mul_pd(qq32,rinv32);
506 felec = _mm_mul_pd(velec,rinvsq32);
508 /* Update potential sum for this i atom from the interaction with this j atom. */
509 velecsum = _mm_add_pd(velecsum,velec);
511 fscal = felec;
513 /* Calculate temporary vectorial force */
514 tx = _mm_mul_pd(fscal,dx32);
515 ty = _mm_mul_pd(fscal,dy32);
516 tz = _mm_mul_pd(fscal,dz32);
518 /* Update vectorial force */
519 fix3 = _mm_add_pd(fix3,tx);
520 fiy3 = _mm_add_pd(fiy3,ty);
521 fiz3 = _mm_add_pd(fiz3,tz);
523 fjx2 = _mm_add_pd(fjx2,tx);
524 fjy2 = _mm_add_pd(fjy2,ty);
525 fjz2 = _mm_add_pd(fjz2,tz);
527 /**************************
528 * CALCULATE INTERACTIONS *
529 **************************/
531 /* COULOMB ELECTROSTATICS */
532 velec = _mm_mul_pd(qq33,rinv33);
533 felec = _mm_mul_pd(velec,rinvsq33);
535 /* Update potential sum for this i atom from the interaction with this j atom. */
536 velecsum = _mm_add_pd(velecsum,velec);
538 fscal = felec;
540 /* Calculate temporary vectorial force */
541 tx = _mm_mul_pd(fscal,dx33);
542 ty = _mm_mul_pd(fscal,dy33);
543 tz = _mm_mul_pd(fscal,dz33);
545 /* Update vectorial force */
546 fix3 = _mm_add_pd(fix3,tx);
547 fiy3 = _mm_add_pd(fiy3,ty);
548 fiz3 = _mm_add_pd(fiz3,tz);
550 fjx3 = _mm_add_pd(fjx3,tx);
551 fjy3 = _mm_add_pd(fjy3,ty);
552 fjz3 = _mm_add_pd(fjz3,tz);
554 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
556 /* Inner loop uses 287 flops */
559 if(jidx<j_index_end)
562 jnrA = jjnr[jidx];
563 j_coord_offsetA = DIM*jnrA;
565 /* load j atom coordinates */
566 gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
567 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
568 &jy2,&jz2,&jx3,&jy3,&jz3);
570 /* Calculate displacement vector */
571 dx00 = _mm_sub_pd(ix0,jx0);
572 dy00 = _mm_sub_pd(iy0,jy0);
573 dz00 = _mm_sub_pd(iz0,jz0);
574 dx11 = _mm_sub_pd(ix1,jx1);
575 dy11 = _mm_sub_pd(iy1,jy1);
576 dz11 = _mm_sub_pd(iz1,jz1);
577 dx12 = _mm_sub_pd(ix1,jx2);
578 dy12 = _mm_sub_pd(iy1,jy2);
579 dz12 = _mm_sub_pd(iz1,jz2);
580 dx13 = _mm_sub_pd(ix1,jx3);
581 dy13 = _mm_sub_pd(iy1,jy3);
582 dz13 = _mm_sub_pd(iz1,jz3);
583 dx21 = _mm_sub_pd(ix2,jx1);
584 dy21 = _mm_sub_pd(iy2,jy1);
585 dz21 = _mm_sub_pd(iz2,jz1);
586 dx22 = _mm_sub_pd(ix2,jx2);
587 dy22 = _mm_sub_pd(iy2,jy2);
588 dz22 = _mm_sub_pd(iz2,jz2);
589 dx23 = _mm_sub_pd(ix2,jx3);
590 dy23 = _mm_sub_pd(iy2,jy3);
591 dz23 = _mm_sub_pd(iz2,jz3);
592 dx31 = _mm_sub_pd(ix3,jx1);
593 dy31 = _mm_sub_pd(iy3,jy1);
594 dz31 = _mm_sub_pd(iz3,jz1);
595 dx32 = _mm_sub_pd(ix3,jx2);
596 dy32 = _mm_sub_pd(iy3,jy2);
597 dz32 = _mm_sub_pd(iz3,jz2);
598 dx33 = _mm_sub_pd(ix3,jx3);
599 dy33 = _mm_sub_pd(iy3,jy3);
600 dz33 = _mm_sub_pd(iz3,jz3);
602 /* Calculate squared distance and things based on it */
603 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
604 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
605 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
606 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
607 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
608 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
609 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
610 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
611 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
612 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
614 rinv11 = gmx_mm_invsqrt_pd(rsq11);
615 rinv12 = gmx_mm_invsqrt_pd(rsq12);
616 rinv13 = gmx_mm_invsqrt_pd(rsq13);
617 rinv21 = gmx_mm_invsqrt_pd(rsq21);
618 rinv22 = gmx_mm_invsqrt_pd(rsq22);
619 rinv23 = gmx_mm_invsqrt_pd(rsq23);
620 rinv31 = gmx_mm_invsqrt_pd(rsq31);
621 rinv32 = gmx_mm_invsqrt_pd(rsq32);
622 rinv33 = gmx_mm_invsqrt_pd(rsq33);
624 rinvsq00 = gmx_mm_inv_pd(rsq00);
625 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
626 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
627 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
628 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
629 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
630 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
631 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
632 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
633 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
635 fjx0 = _mm_setzero_pd();
636 fjy0 = _mm_setzero_pd();
637 fjz0 = _mm_setzero_pd();
638 fjx1 = _mm_setzero_pd();
639 fjy1 = _mm_setzero_pd();
640 fjz1 = _mm_setzero_pd();
641 fjx2 = _mm_setzero_pd();
642 fjy2 = _mm_setzero_pd();
643 fjz2 = _mm_setzero_pd();
644 fjx3 = _mm_setzero_pd();
645 fjy3 = _mm_setzero_pd();
646 fjz3 = _mm_setzero_pd();
648 /**************************
649 * CALCULATE INTERACTIONS *
650 **************************/
652 /* LENNARD-JONES DISPERSION/REPULSION */
654 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
655 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
656 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
657 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
658 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
660 /* Update potential sum for this i atom from the interaction with this j atom. */
661 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
662 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
664 fscal = fvdw;
666 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
668 /* Calculate temporary vectorial force */
669 tx = _mm_mul_pd(fscal,dx00);
670 ty = _mm_mul_pd(fscal,dy00);
671 tz = _mm_mul_pd(fscal,dz00);
673 /* Update vectorial force */
674 fix0 = _mm_add_pd(fix0,tx);
675 fiy0 = _mm_add_pd(fiy0,ty);
676 fiz0 = _mm_add_pd(fiz0,tz);
678 fjx0 = _mm_add_pd(fjx0,tx);
679 fjy0 = _mm_add_pd(fjy0,ty);
680 fjz0 = _mm_add_pd(fjz0,tz);
682 /**************************
683 * CALCULATE INTERACTIONS *
684 **************************/
686 /* COULOMB ELECTROSTATICS */
687 velec = _mm_mul_pd(qq11,rinv11);
688 felec = _mm_mul_pd(velec,rinvsq11);
690 /* Update potential sum for this i atom from the interaction with this j atom. */
691 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
692 velecsum = _mm_add_pd(velecsum,velec);
694 fscal = felec;
696 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
698 /* Calculate temporary vectorial force */
699 tx = _mm_mul_pd(fscal,dx11);
700 ty = _mm_mul_pd(fscal,dy11);
701 tz = _mm_mul_pd(fscal,dz11);
703 /* Update vectorial force */
704 fix1 = _mm_add_pd(fix1,tx);
705 fiy1 = _mm_add_pd(fiy1,ty);
706 fiz1 = _mm_add_pd(fiz1,tz);
708 fjx1 = _mm_add_pd(fjx1,tx);
709 fjy1 = _mm_add_pd(fjy1,ty);
710 fjz1 = _mm_add_pd(fjz1,tz);
712 /**************************
713 * CALCULATE INTERACTIONS *
714 **************************/
716 /* COULOMB ELECTROSTATICS */
717 velec = _mm_mul_pd(qq12,rinv12);
718 felec = _mm_mul_pd(velec,rinvsq12);
720 /* Update potential sum for this i atom from the interaction with this j atom. */
721 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
722 velecsum = _mm_add_pd(velecsum,velec);
724 fscal = felec;
726 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
728 /* Calculate temporary vectorial force */
729 tx = _mm_mul_pd(fscal,dx12);
730 ty = _mm_mul_pd(fscal,dy12);
731 tz = _mm_mul_pd(fscal,dz12);
733 /* Update vectorial force */
734 fix1 = _mm_add_pd(fix1,tx);
735 fiy1 = _mm_add_pd(fiy1,ty);
736 fiz1 = _mm_add_pd(fiz1,tz);
738 fjx2 = _mm_add_pd(fjx2,tx);
739 fjy2 = _mm_add_pd(fjy2,ty);
740 fjz2 = _mm_add_pd(fjz2,tz);
742 /**************************
743 * CALCULATE INTERACTIONS *
744 **************************/
746 /* COULOMB ELECTROSTATICS */
747 velec = _mm_mul_pd(qq13,rinv13);
748 felec = _mm_mul_pd(velec,rinvsq13);
750 /* Update potential sum for this i atom from the interaction with this j atom. */
751 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
752 velecsum = _mm_add_pd(velecsum,velec);
754 fscal = felec;
756 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
758 /* Calculate temporary vectorial force */
759 tx = _mm_mul_pd(fscal,dx13);
760 ty = _mm_mul_pd(fscal,dy13);
761 tz = _mm_mul_pd(fscal,dz13);
763 /* Update vectorial force */
764 fix1 = _mm_add_pd(fix1,tx);
765 fiy1 = _mm_add_pd(fiy1,ty);
766 fiz1 = _mm_add_pd(fiz1,tz);
768 fjx3 = _mm_add_pd(fjx3,tx);
769 fjy3 = _mm_add_pd(fjy3,ty);
770 fjz3 = _mm_add_pd(fjz3,tz);
772 /**************************
773 * CALCULATE INTERACTIONS *
774 **************************/
776 /* COULOMB ELECTROSTATICS */
777 velec = _mm_mul_pd(qq21,rinv21);
778 felec = _mm_mul_pd(velec,rinvsq21);
780 /* Update potential sum for this i atom from the interaction with this j atom. */
781 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
782 velecsum = _mm_add_pd(velecsum,velec);
784 fscal = felec;
786 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
788 /* Calculate temporary vectorial force */
789 tx = _mm_mul_pd(fscal,dx21);
790 ty = _mm_mul_pd(fscal,dy21);
791 tz = _mm_mul_pd(fscal,dz21);
793 /* Update vectorial force */
794 fix2 = _mm_add_pd(fix2,tx);
795 fiy2 = _mm_add_pd(fiy2,ty);
796 fiz2 = _mm_add_pd(fiz2,tz);
798 fjx1 = _mm_add_pd(fjx1,tx);
799 fjy1 = _mm_add_pd(fjy1,ty);
800 fjz1 = _mm_add_pd(fjz1,tz);
802 /**************************
803 * CALCULATE INTERACTIONS *
804 **************************/
806 /* COULOMB ELECTROSTATICS */
807 velec = _mm_mul_pd(qq22,rinv22);
808 felec = _mm_mul_pd(velec,rinvsq22);
810 /* Update potential sum for this i atom from the interaction with this j atom. */
811 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
812 velecsum = _mm_add_pd(velecsum,velec);
814 fscal = felec;
816 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
818 /* Calculate temporary vectorial force */
819 tx = _mm_mul_pd(fscal,dx22);
820 ty = _mm_mul_pd(fscal,dy22);
821 tz = _mm_mul_pd(fscal,dz22);
823 /* Update vectorial force */
824 fix2 = _mm_add_pd(fix2,tx);
825 fiy2 = _mm_add_pd(fiy2,ty);
826 fiz2 = _mm_add_pd(fiz2,tz);
828 fjx2 = _mm_add_pd(fjx2,tx);
829 fjy2 = _mm_add_pd(fjy2,ty);
830 fjz2 = _mm_add_pd(fjz2,tz);
832 /**************************
833 * CALCULATE INTERACTIONS *
834 **************************/
836 /* COULOMB ELECTROSTATICS */
837 velec = _mm_mul_pd(qq23,rinv23);
838 felec = _mm_mul_pd(velec,rinvsq23);
840 /* Update potential sum for this i atom from the interaction with this j atom. */
841 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
842 velecsum = _mm_add_pd(velecsum,velec);
844 fscal = felec;
846 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
848 /* Calculate temporary vectorial force */
849 tx = _mm_mul_pd(fscal,dx23);
850 ty = _mm_mul_pd(fscal,dy23);
851 tz = _mm_mul_pd(fscal,dz23);
853 /* Update vectorial force */
854 fix2 = _mm_add_pd(fix2,tx);
855 fiy2 = _mm_add_pd(fiy2,ty);
856 fiz2 = _mm_add_pd(fiz2,tz);
858 fjx3 = _mm_add_pd(fjx3,tx);
859 fjy3 = _mm_add_pd(fjy3,ty);
860 fjz3 = _mm_add_pd(fjz3,tz);
862 /**************************
863 * CALCULATE INTERACTIONS *
864 **************************/
866 /* COULOMB ELECTROSTATICS */
867 velec = _mm_mul_pd(qq31,rinv31);
868 felec = _mm_mul_pd(velec,rinvsq31);
870 /* Update potential sum for this i atom from the interaction with this j atom. */
871 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
872 velecsum = _mm_add_pd(velecsum,velec);
874 fscal = felec;
876 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
878 /* Calculate temporary vectorial force */
879 tx = _mm_mul_pd(fscal,dx31);
880 ty = _mm_mul_pd(fscal,dy31);
881 tz = _mm_mul_pd(fscal,dz31);
883 /* Update vectorial force */
884 fix3 = _mm_add_pd(fix3,tx);
885 fiy3 = _mm_add_pd(fiy3,ty);
886 fiz3 = _mm_add_pd(fiz3,tz);
888 fjx1 = _mm_add_pd(fjx1,tx);
889 fjy1 = _mm_add_pd(fjy1,ty);
890 fjz1 = _mm_add_pd(fjz1,tz);
892 /**************************
893 * CALCULATE INTERACTIONS *
894 **************************/
896 /* COULOMB ELECTROSTATICS */
897 velec = _mm_mul_pd(qq32,rinv32);
898 felec = _mm_mul_pd(velec,rinvsq32);
900 /* Update potential sum for this i atom from the interaction with this j atom. */
901 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
902 velecsum = _mm_add_pd(velecsum,velec);
904 fscal = felec;
906 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
908 /* Calculate temporary vectorial force */
909 tx = _mm_mul_pd(fscal,dx32);
910 ty = _mm_mul_pd(fscal,dy32);
911 tz = _mm_mul_pd(fscal,dz32);
913 /* Update vectorial force */
914 fix3 = _mm_add_pd(fix3,tx);
915 fiy3 = _mm_add_pd(fiy3,ty);
916 fiz3 = _mm_add_pd(fiz3,tz);
918 fjx2 = _mm_add_pd(fjx2,tx);
919 fjy2 = _mm_add_pd(fjy2,ty);
920 fjz2 = _mm_add_pd(fjz2,tz);
922 /**************************
923 * CALCULATE INTERACTIONS *
924 **************************/
926 /* COULOMB ELECTROSTATICS */
927 velec = _mm_mul_pd(qq33,rinv33);
928 felec = _mm_mul_pd(velec,rinvsq33);
930 /* Update potential sum for this i atom from the interaction with this j atom. */
931 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
932 velecsum = _mm_add_pd(velecsum,velec);
934 fscal = felec;
936 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
938 /* Calculate temporary vectorial force */
939 tx = _mm_mul_pd(fscal,dx33);
940 ty = _mm_mul_pd(fscal,dy33);
941 tz = _mm_mul_pd(fscal,dz33);
943 /* Update vectorial force */
944 fix3 = _mm_add_pd(fix3,tx);
945 fiy3 = _mm_add_pd(fiy3,ty);
946 fiz3 = _mm_add_pd(fiz3,tz);
948 fjx3 = _mm_add_pd(fjx3,tx);
949 fjy3 = _mm_add_pd(fjy3,ty);
950 fjz3 = _mm_add_pd(fjz3,tz);
952 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
954 /* Inner loop uses 287 flops */
957 /* End of innermost loop */
959 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
960 f+i_coord_offset,fshift+i_shift_offset);
962 ggid = gid[iidx];
963 /* Update potential energies */
964 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
965 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
967 /* Increment number of inner iterations */
968 inneriter += j_index_end - j_index_start;
970 /* Outer loop uses 26 flops */
973 /* Increment number of outer iterations */
974 outeriter += nri;
976 /* Update outer/inner flops */
978 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*287);
/*
 * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sse2_double
 * Electrostatics interaction: Coulomb
 * VdW interaction:            LennardJones
 * Geometry:                   Water4-Water4
 * Calculate force/pot:        Force
 */
987 void
988 nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sse2_double
989 (t_nblist * gmx_restrict nlist,
990 rvec * gmx_restrict xx,
991 rvec * gmx_restrict ff,
992 t_forcerec * gmx_restrict fr,
993 t_mdatoms * gmx_restrict mdatoms,
994 nb_kernel_data_t * gmx_restrict kernel_data,
995 t_nrnb * gmx_restrict nrnb)
997 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
998 * just 0 for non-waters.
999 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1000 * jnr indices corresponding to data put in the four positions in the SIMD register.
1002 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1003 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1004 int jnrA,jnrB;
1005 int j_coord_offsetA,j_coord_offsetB;
1006 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1007 real rcutoff_scalar;
1008 real *shiftvec,*fshift,*x,*f;
1009 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1010 int vdwioffset0;
1011 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1012 int vdwioffset1;
1013 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1014 int vdwioffset2;
1015 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1016 int vdwioffset3;
1017 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1018 int vdwjidx0A,vdwjidx0B;
1019 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1020 int vdwjidx1A,vdwjidx1B;
1021 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1022 int vdwjidx2A,vdwjidx2B;
1023 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1024 int vdwjidx3A,vdwjidx3B;
1025 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1026 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1027 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1028 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1029 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1030 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1031 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1032 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1033 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1034 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1035 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1036 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1037 real *charge;
1038 int nvdwtype;
1039 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1040 int *vdwtype;
1041 real *vdwparam;
1042 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1043 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1044 __m128d dummy_mask,cutoff_mask;
1045 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1046 __m128d one = _mm_set1_pd(1.0);
1047 __m128d two = _mm_set1_pd(2.0);
1048 x = xx[0];
1049 f = ff[0];
1051 nri = nlist->nri;
1052 iinr = nlist->iinr;
1053 jindex = nlist->jindex;
1054 jjnr = nlist->jjnr;
1055 shiftidx = nlist->shift;
1056 gid = nlist->gid;
1057 shiftvec = fr->shift_vec[0];
1058 fshift = fr->fshift[0];
1059 facel = _mm_set1_pd(fr->epsfac);
1060 charge = mdatoms->chargeA;
1061 nvdwtype = fr->ntype;
1062 vdwparam = fr->nbfp;
1063 vdwtype = mdatoms->typeA;
1065 /* Setup water-specific parameters */
1066 inr = nlist->iinr[0];
1067 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1068 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1069 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
1070 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1072 jq1 = _mm_set1_pd(charge[inr+1]);
1073 jq2 = _mm_set1_pd(charge[inr+2]);
1074 jq3 = _mm_set1_pd(charge[inr+3]);
1075 vdwjidx0A = 2*vdwtype[inr+0];
1076 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1077 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1078 qq11 = _mm_mul_pd(iq1,jq1);
1079 qq12 = _mm_mul_pd(iq1,jq2);
1080 qq13 = _mm_mul_pd(iq1,jq3);
1081 qq21 = _mm_mul_pd(iq2,jq1);
1082 qq22 = _mm_mul_pd(iq2,jq2);
1083 qq23 = _mm_mul_pd(iq2,jq3);
1084 qq31 = _mm_mul_pd(iq3,jq1);
1085 qq32 = _mm_mul_pd(iq3,jq2);
1086 qq33 = _mm_mul_pd(iq3,jq3);
1088 /* Avoid stupid compiler warnings */
1089 jnrA = jnrB = 0;
1090 j_coord_offsetA = 0;
1091 j_coord_offsetB = 0;
1093 outeriter = 0;
1094 inneriter = 0;
1096 /* Start outer loop over neighborlists */
1097 for(iidx=0; iidx<nri; iidx++)
1099 /* Load shift vector for this list */
1100 i_shift_offset = DIM*shiftidx[iidx];
1102 /* Load limits for loop over neighbors */
1103 j_index_start = jindex[iidx];
1104 j_index_end = jindex[iidx+1];
1106 /* Get outer coordinate index */
1107 inr = iinr[iidx];
1108 i_coord_offset = DIM*inr;
1110 /* Load i particle coords and add shift vector */
1111 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1112 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1114 fix0 = _mm_setzero_pd();
1115 fiy0 = _mm_setzero_pd();
1116 fiz0 = _mm_setzero_pd();
1117 fix1 = _mm_setzero_pd();
1118 fiy1 = _mm_setzero_pd();
1119 fiz1 = _mm_setzero_pd();
1120 fix2 = _mm_setzero_pd();
1121 fiy2 = _mm_setzero_pd();
1122 fiz2 = _mm_setzero_pd();
1123 fix3 = _mm_setzero_pd();
1124 fiy3 = _mm_setzero_pd();
1125 fiz3 = _mm_setzero_pd();
1127 /* Start inner kernel loop */
1128 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1131 /* Get j neighbor index, and coordinate index */
1132 jnrA = jjnr[jidx];
1133 jnrB = jjnr[jidx+1];
1134 j_coord_offsetA = DIM*jnrA;
1135 j_coord_offsetB = DIM*jnrB;
1137 /* load j atom coordinates */
1138 gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1139 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1140 &jy2,&jz2,&jx3,&jy3,&jz3);
1142 /* Calculate displacement vector */
1143 dx00 = _mm_sub_pd(ix0,jx0);
1144 dy00 = _mm_sub_pd(iy0,jy0);
1145 dz00 = _mm_sub_pd(iz0,jz0);
1146 dx11 = _mm_sub_pd(ix1,jx1);
1147 dy11 = _mm_sub_pd(iy1,jy1);
1148 dz11 = _mm_sub_pd(iz1,jz1);
1149 dx12 = _mm_sub_pd(ix1,jx2);
1150 dy12 = _mm_sub_pd(iy1,jy2);
1151 dz12 = _mm_sub_pd(iz1,jz2);
1152 dx13 = _mm_sub_pd(ix1,jx3);
1153 dy13 = _mm_sub_pd(iy1,jy3);
1154 dz13 = _mm_sub_pd(iz1,jz3);
1155 dx21 = _mm_sub_pd(ix2,jx1);
1156 dy21 = _mm_sub_pd(iy2,jy1);
1157 dz21 = _mm_sub_pd(iz2,jz1);
1158 dx22 = _mm_sub_pd(ix2,jx2);
1159 dy22 = _mm_sub_pd(iy2,jy2);
1160 dz22 = _mm_sub_pd(iz2,jz2);
1161 dx23 = _mm_sub_pd(ix2,jx3);
1162 dy23 = _mm_sub_pd(iy2,jy3);
1163 dz23 = _mm_sub_pd(iz2,jz3);
1164 dx31 = _mm_sub_pd(ix3,jx1);
1165 dy31 = _mm_sub_pd(iy3,jy1);
1166 dz31 = _mm_sub_pd(iz3,jz1);
1167 dx32 = _mm_sub_pd(ix3,jx2);
1168 dy32 = _mm_sub_pd(iy3,jy2);
1169 dz32 = _mm_sub_pd(iz3,jz2);
1170 dx33 = _mm_sub_pd(ix3,jx3);
1171 dy33 = _mm_sub_pd(iy3,jy3);
1172 dz33 = _mm_sub_pd(iz3,jz3);
1174 /* Calculate squared distance and things based on it */
1175 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1176 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1177 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1178 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1179 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1180 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1181 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1182 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1183 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1184 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1186 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1187 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1188 rinv13 = gmx_mm_invsqrt_pd(rsq13);
1189 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1190 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1191 rinv23 = gmx_mm_invsqrt_pd(rsq23);
1192 rinv31 = gmx_mm_invsqrt_pd(rsq31);
1193 rinv32 = gmx_mm_invsqrt_pd(rsq32);
1194 rinv33 = gmx_mm_invsqrt_pd(rsq33);
1196 rinvsq00 = gmx_mm_inv_pd(rsq00);
1197 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1198 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1199 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
1200 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1201 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1202 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
1203 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
1204 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
1205 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
1207 fjx0 = _mm_setzero_pd();
1208 fjy0 = _mm_setzero_pd();
1209 fjz0 = _mm_setzero_pd();
1210 fjx1 = _mm_setzero_pd();
1211 fjy1 = _mm_setzero_pd();
1212 fjz1 = _mm_setzero_pd();
1213 fjx2 = _mm_setzero_pd();
1214 fjy2 = _mm_setzero_pd();
1215 fjz2 = _mm_setzero_pd();
1216 fjx3 = _mm_setzero_pd();
1217 fjy3 = _mm_setzero_pd();
1218 fjz3 = _mm_setzero_pd();
1220 /**************************
1221 * CALCULATE INTERACTIONS *
1222 **************************/
1224 /* LENNARD-JONES DISPERSION/REPULSION */
1226 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1227 fvdw = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),c6_00),_mm_mul_pd(rinvsix,rinvsq00));
1229 fscal = fvdw;
1231 /* Calculate temporary vectorial force */
1232 tx = _mm_mul_pd(fscal,dx00);
1233 ty = _mm_mul_pd(fscal,dy00);
1234 tz = _mm_mul_pd(fscal,dz00);
1236 /* Update vectorial force */
1237 fix0 = _mm_add_pd(fix0,tx);
1238 fiy0 = _mm_add_pd(fiy0,ty);
1239 fiz0 = _mm_add_pd(fiz0,tz);
1241 fjx0 = _mm_add_pd(fjx0,tx);
1242 fjy0 = _mm_add_pd(fjy0,ty);
1243 fjz0 = _mm_add_pd(fjz0,tz);
1245 /**************************
1246 * CALCULATE INTERACTIONS *
1247 **************************/
1249 /* COULOMB ELECTROSTATICS */
1250 velec = _mm_mul_pd(qq11,rinv11);
1251 felec = _mm_mul_pd(velec,rinvsq11);
1253 fscal = felec;
1255 /* Calculate temporary vectorial force */
1256 tx = _mm_mul_pd(fscal,dx11);
1257 ty = _mm_mul_pd(fscal,dy11);
1258 tz = _mm_mul_pd(fscal,dz11);
1260 /* Update vectorial force */
1261 fix1 = _mm_add_pd(fix1,tx);
1262 fiy1 = _mm_add_pd(fiy1,ty);
1263 fiz1 = _mm_add_pd(fiz1,tz);
1265 fjx1 = _mm_add_pd(fjx1,tx);
1266 fjy1 = _mm_add_pd(fjy1,ty);
1267 fjz1 = _mm_add_pd(fjz1,tz);
1269 /**************************
1270 * CALCULATE INTERACTIONS *
1271 **************************/
1273 /* COULOMB ELECTROSTATICS */
1274 velec = _mm_mul_pd(qq12,rinv12);
1275 felec = _mm_mul_pd(velec,rinvsq12);
1277 fscal = felec;
1279 /* Calculate temporary vectorial force */
1280 tx = _mm_mul_pd(fscal,dx12);
1281 ty = _mm_mul_pd(fscal,dy12);
1282 tz = _mm_mul_pd(fscal,dz12);
1284 /* Update vectorial force */
1285 fix1 = _mm_add_pd(fix1,tx);
1286 fiy1 = _mm_add_pd(fiy1,ty);
1287 fiz1 = _mm_add_pd(fiz1,tz);
1289 fjx2 = _mm_add_pd(fjx2,tx);
1290 fjy2 = _mm_add_pd(fjy2,ty);
1291 fjz2 = _mm_add_pd(fjz2,tz);
1293 /**************************
1294 * CALCULATE INTERACTIONS *
1295 **************************/
1297 /* COULOMB ELECTROSTATICS */
1298 velec = _mm_mul_pd(qq13,rinv13);
1299 felec = _mm_mul_pd(velec,rinvsq13);
1301 fscal = felec;
1303 /* Calculate temporary vectorial force */
1304 tx = _mm_mul_pd(fscal,dx13);
1305 ty = _mm_mul_pd(fscal,dy13);
1306 tz = _mm_mul_pd(fscal,dz13);
1308 /* Update vectorial force */
1309 fix1 = _mm_add_pd(fix1,tx);
1310 fiy1 = _mm_add_pd(fiy1,ty);
1311 fiz1 = _mm_add_pd(fiz1,tz);
1313 fjx3 = _mm_add_pd(fjx3,tx);
1314 fjy3 = _mm_add_pd(fjy3,ty);
1315 fjz3 = _mm_add_pd(fjz3,tz);
1317 /**************************
1318 * CALCULATE INTERACTIONS *
1319 **************************/
1321 /* COULOMB ELECTROSTATICS */
1322 velec = _mm_mul_pd(qq21,rinv21);
1323 felec = _mm_mul_pd(velec,rinvsq21);
1325 fscal = felec;
1327 /* Calculate temporary vectorial force */
1328 tx = _mm_mul_pd(fscal,dx21);
1329 ty = _mm_mul_pd(fscal,dy21);
1330 tz = _mm_mul_pd(fscal,dz21);
1332 /* Update vectorial force */
1333 fix2 = _mm_add_pd(fix2,tx);
1334 fiy2 = _mm_add_pd(fiy2,ty);
1335 fiz2 = _mm_add_pd(fiz2,tz);
1337 fjx1 = _mm_add_pd(fjx1,tx);
1338 fjy1 = _mm_add_pd(fjy1,ty);
1339 fjz1 = _mm_add_pd(fjz1,tz);
1341 /**************************
1342 * CALCULATE INTERACTIONS *
1343 **************************/
1345 /* COULOMB ELECTROSTATICS */
1346 velec = _mm_mul_pd(qq22,rinv22);
1347 felec = _mm_mul_pd(velec,rinvsq22);
1349 fscal = felec;
1351 /* Calculate temporary vectorial force */
1352 tx = _mm_mul_pd(fscal,dx22);
1353 ty = _mm_mul_pd(fscal,dy22);
1354 tz = _mm_mul_pd(fscal,dz22);
1356 /* Update vectorial force */
1357 fix2 = _mm_add_pd(fix2,tx);
1358 fiy2 = _mm_add_pd(fiy2,ty);
1359 fiz2 = _mm_add_pd(fiz2,tz);
1361 fjx2 = _mm_add_pd(fjx2,tx);
1362 fjy2 = _mm_add_pd(fjy2,ty);
1363 fjz2 = _mm_add_pd(fjz2,tz);
1365 /**************************
1366 * CALCULATE INTERACTIONS *
1367 **************************/
1369 /* COULOMB ELECTROSTATICS */
1370 velec = _mm_mul_pd(qq23,rinv23);
1371 felec = _mm_mul_pd(velec,rinvsq23);
1373 fscal = felec;
1375 /* Calculate temporary vectorial force */
1376 tx = _mm_mul_pd(fscal,dx23);
1377 ty = _mm_mul_pd(fscal,dy23);
1378 tz = _mm_mul_pd(fscal,dz23);
1380 /* Update vectorial force */
1381 fix2 = _mm_add_pd(fix2,tx);
1382 fiy2 = _mm_add_pd(fiy2,ty);
1383 fiz2 = _mm_add_pd(fiz2,tz);
1385 fjx3 = _mm_add_pd(fjx3,tx);
1386 fjy3 = _mm_add_pd(fjy3,ty);
1387 fjz3 = _mm_add_pd(fjz3,tz);
1389 /**************************
1390 * CALCULATE INTERACTIONS *
1391 **************************/
1393 /* COULOMB ELECTROSTATICS */
1394 velec = _mm_mul_pd(qq31,rinv31);
1395 felec = _mm_mul_pd(velec,rinvsq31);
1397 fscal = felec;
1399 /* Calculate temporary vectorial force */
1400 tx = _mm_mul_pd(fscal,dx31);
1401 ty = _mm_mul_pd(fscal,dy31);
1402 tz = _mm_mul_pd(fscal,dz31);
1404 /* Update vectorial force */
1405 fix3 = _mm_add_pd(fix3,tx);
1406 fiy3 = _mm_add_pd(fiy3,ty);
1407 fiz3 = _mm_add_pd(fiz3,tz);
1409 fjx1 = _mm_add_pd(fjx1,tx);
1410 fjy1 = _mm_add_pd(fjy1,ty);
1411 fjz1 = _mm_add_pd(fjz1,tz);
1413 /**************************
1414 * CALCULATE INTERACTIONS *
1415 **************************/
1417 /* COULOMB ELECTROSTATICS */
1418 velec = _mm_mul_pd(qq32,rinv32);
1419 felec = _mm_mul_pd(velec,rinvsq32);
1421 fscal = felec;
1423 /* Calculate temporary vectorial force */
1424 tx = _mm_mul_pd(fscal,dx32);
1425 ty = _mm_mul_pd(fscal,dy32);
1426 tz = _mm_mul_pd(fscal,dz32);
1428 /* Update vectorial force */
1429 fix3 = _mm_add_pd(fix3,tx);
1430 fiy3 = _mm_add_pd(fiy3,ty);
1431 fiz3 = _mm_add_pd(fiz3,tz);
1433 fjx2 = _mm_add_pd(fjx2,tx);
1434 fjy2 = _mm_add_pd(fjy2,ty);
1435 fjz2 = _mm_add_pd(fjz2,tz);
1437 /**************************
1438 * CALCULATE INTERACTIONS *
1439 **************************/
1441 /* COULOMB ELECTROSTATICS */
1442 velec = _mm_mul_pd(qq33,rinv33);
1443 felec = _mm_mul_pd(velec,rinvsq33);
1445 fscal = felec;
1447 /* Calculate temporary vectorial force */
1448 tx = _mm_mul_pd(fscal,dx33);
1449 ty = _mm_mul_pd(fscal,dy33);
1450 tz = _mm_mul_pd(fscal,dz33);
1452 /* Update vectorial force */
1453 fix3 = _mm_add_pd(fix3,tx);
1454 fiy3 = _mm_add_pd(fiy3,ty);
1455 fiz3 = _mm_add_pd(fiz3,tz);
1457 fjx3 = _mm_add_pd(fjx3,tx);
1458 fjy3 = _mm_add_pd(fjy3,ty);
1459 fjz3 = _mm_add_pd(fjz3,tz);
1461 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1463 /* Inner loop uses 273 flops */
1466 if(jidx<j_index_end)
1469 jnrA = jjnr[jidx];
1470 j_coord_offsetA = DIM*jnrA;
1472 /* load j atom coordinates */
1473 gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1474 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1475 &jy2,&jz2,&jx3,&jy3,&jz3);
1477 /* Calculate displacement vector */
1478 dx00 = _mm_sub_pd(ix0,jx0);
1479 dy00 = _mm_sub_pd(iy0,jy0);
1480 dz00 = _mm_sub_pd(iz0,jz0);
1481 dx11 = _mm_sub_pd(ix1,jx1);
1482 dy11 = _mm_sub_pd(iy1,jy1);
1483 dz11 = _mm_sub_pd(iz1,jz1);
1484 dx12 = _mm_sub_pd(ix1,jx2);
1485 dy12 = _mm_sub_pd(iy1,jy2);
1486 dz12 = _mm_sub_pd(iz1,jz2);
1487 dx13 = _mm_sub_pd(ix1,jx3);
1488 dy13 = _mm_sub_pd(iy1,jy3);
1489 dz13 = _mm_sub_pd(iz1,jz3);
1490 dx21 = _mm_sub_pd(ix2,jx1);
1491 dy21 = _mm_sub_pd(iy2,jy1);
1492 dz21 = _mm_sub_pd(iz2,jz1);
1493 dx22 = _mm_sub_pd(ix2,jx2);
1494 dy22 = _mm_sub_pd(iy2,jy2);
1495 dz22 = _mm_sub_pd(iz2,jz2);
1496 dx23 = _mm_sub_pd(ix2,jx3);
1497 dy23 = _mm_sub_pd(iy2,jy3);
1498 dz23 = _mm_sub_pd(iz2,jz3);
1499 dx31 = _mm_sub_pd(ix3,jx1);
1500 dy31 = _mm_sub_pd(iy3,jy1);
1501 dz31 = _mm_sub_pd(iz3,jz1);
1502 dx32 = _mm_sub_pd(ix3,jx2);
1503 dy32 = _mm_sub_pd(iy3,jy2);
1504 dz32 = _mm_sub_pd(iz3,jz2);
1505 dx33 = _mm_sub_pd(ix3,jx3);
1506 dy33 = _mm_sub_pd(iy3,jy3);
1507 dz33 = _mm_sub_pd(iz3,jz3);
1509 /* Calculate squared distance and things based on it */
1510 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1511 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1512 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1513 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1514 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1515 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1516 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1517 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1518 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1519 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1521 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1522 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1523 rinv13 = gmx_mm_invsqrt_pd(rsq13);
1524 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1525 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1526 rinv23 = gmx_mm_invsqrt_pd(rsq23);
1527 rinv31 = gmx_mm_invsqrt_pd(rsq31);
1528 rinv32 = gmx_mm_invsqrt_pd(rsq32);
1529 rinv33 = gmx_mm_invsqrt_pd(rsq33);
1531 rinvsq00 = gmx_mm_inv_pd(rsq00);
1532 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1533 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1534 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
1535 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1536 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1537 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
1538 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
1539 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
1540 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
1542 fjx0 = _mm_setzero_pd();
1543 fjy0 = _mm_setzero_pd();
1544 fjz0 = _mm_setzero_pd();
1545 fjx1 = _mm_setzero_pd();
1546 fjy1 = _mm_setzero_pd();
1547 fjz1 = _mm_setzero_pd();
1548 fjx2 = _mm_setzero_pd();
1549 fjy2 = _mm_setzero_pd();
1550 fjz2 = _mm_setzero_pd();
1551 fjx3 = _mm_setzero_pd();
1552 fjy3 = _mm_setzero_pd();
1553 fjz3 = _mm_setzero_pd();
1555 /**************************
1556 * CALCULATE INTERACTIONS *
1557 **************************/
1559 /* LENNARD-JONES DISPERSION/REPULSION */
1561 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1562 fvdw = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),c6_00),_mm_mul_pd(rinvsix,rinvsq00));
1564 fscal = fvdw;
1566 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1568 /* Calculate temporary vectorial force */
1569 tx = _mm_mul_pd(fscal,dx00);
1570 ty = _mm_mul_pd(fscal,dy00);
1571 tz = _mm_mul_pd(fscal,dz00);
1573 /* Update vectorial force */
1574 fix0 = _mm_add_pd(fix0,tx);
1575 fiy0 = _mm_add_pd(fiy0,ty);
1576 fiz0 = _mm_add_pd(fiz0,tz);
1578 fjx0 = _mm_add_pd(fjx0,tx);
1579 fjy0 = _mm_add_pd(fjy0,ty);
1580 fjz0 = _mm_add_pd(fjz0,tz);
1582 /**************************
1583 * CALCULATE INTERACTIONS *
1584 **************************/
1586 /* COULOMB ELECTROSTATICS */
1587 velec = _mm_mul_pd(qq11,rinv11);
1588 felec = _mm_mul_pd(velec,rinvsq11);
1590 fscal = felec;
1592 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1594 /* Calculate temporary vectorial force */
1595 tx = _mm_mul_pd(fscal,dx11);
1596 ty = _mm_mul_pd(fscal,dy11);
1597 tz = _mm_mul_pd(fscal,dz11);
1599 /* Update vectorial force */
1600 fix1 = _mm_add_pd(fix1,tx);
1601 fiy1 = _mm_add_pd(fiy1,ty);
1602 fiz1 = _mm_add_pd(fiz1,tz);
1604 fjx1 = _mm_add_pd(fjx1,tx);
1605 fjy1 = _mm_add_pd(fjy1,ty);
1606 fjz1 = _mm_add_pd(fjz1,tz);
1608 /**************************
1609 * CALCULATE INTERACTIONS *
1610 **************************/
1612 /* COULOMB ELECTROSTATICS */
1613 velec = _mm_mul_pd(qq12,rinv12);
1614 felec = _mm_mul_pd(velec,rinvsq12);
1616 fscal = felec;
1618 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1620 /* Calculate temporary vectorial force */
1621 tx = _mm_mul_pd(fscal,dx12);
1622 ty = _mm_mul_pd(fscal,dy12);
1623 tz = _mm_mul_pd(fscal,dz12);
1625 /* Update vectorial force */
1626 fix1 = _mm_add_pd(fix1,tx);
1627 fiy1 = _mm_add_pd(fiy1,ty);
1628 fiz1 = _mm_add_pd(fiz1,tz);
1630 fjx2 = _mm_add_pd(fjx2,tx);
1631 fjy2 = _mm_add_pd(fjy2,ty);
1632 fjz2 = _mm_add_pd(fjz2,tz);
1634 /**************************
1635 * CALCULATE INTERACTIONS *
1636 **************************/
1638 /* COULOMB ELECTROSTATICS */
1639 velec = _mm_mul_pd(qq13,rinv13);
1640 felec = _mm_mul_pd(velec,rinvsq13);
1642 fscal = felec;
1644 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1646 /* Calculate temporary vectorial force */
1647 tx = _mm_mul_pd(fscal,dx13);
1648 ty = _mm_mul_pd(fscal,dy13);
1649 tz = _mm_mul_pd(fscal,dz13);
1651 /* Update vectorial force */
1652 fix1 = _mm_add_pd(fix1,tx);
1653 fiy1 = _mm_add_pd(fiy1,ty);
1654 fiz1 = _mm_add_pd(fiz1,tz);
1656 fjx3 = _mm_add_pd(fjx3,tx);
1657 fjy3 = _mm_add_pd(fjy3,ty);
1658 fjz3 = _mm_add_pd(fjz3,tz);
1660 /**************************
1661 * CALCULATE INTERACTIONS *
1662 **************************/
1664 /* COULOMB ELECTROSTATICS */
1665 velec = _mm_mul_pd(qq21,rinv21);
1666 felec = _mm_mul_pd(velec,rinvsq21);
1668 fscal = felec;
1670 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1672 /* Calculate temporary vectorial force */
1673 tx = _mm_mul_pd(fscal,dx21);
1674 ty = _mm_mul_pd(fscal,dy21);
1675 tz = _mm_mul_pd(fscal,dz21);
1677 /* Update vectorial force */
1678 fix2 = _mm_add_pd(fix2,tx);
1679 fiy2 = _mm_add_pd(fiy2,ty);
1680 fiz2 = _mm_add_pd(fiz2,tz);
1682 fjx1 = _mm_add_pd(fjx1,tx);
1683 fjy1 = _mm_add_pd(fjy1,ty);
1684 fjz1 = _mm_add_pd(fjz1,tz);
1686 /**************************
1687 * CALCULATE INTERACTIONS *
1688 **************************/
1690 /* COULOMB ELECTROSTATICS */
1691 velec = _mm_mul_pd(qq22,rinv22);
1692 felec = _mm_mul_pd(velec,rinvsq22);
1694 fscal = felec;
1696 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1698 /* Calculate temporary vectorial force */
1699 tx = _mm_mul_pd(fscal,dx22);
1700 ty = _mm_mul_pd(fscal,dy22);
1701 tz = _mm_mul_pd(fscal,dz22);
1703 /* Update vectorial force */
1704 fix2 = _mm_add_pd(fix2,tx);
1705 fiy2 = _mm_add_pd(fiy2,ty);
1706 fiz2 = _mm_add_pd(fiz2,tz);
1708 fjx2 = _mm_add_pd(fjx2,tx);
1709 fjy2 = _mm_add_pd(fjy2,ty);
1710 fjz2 = _mm_add_pd(fjz2,tz);
1712 /**************************
1713 * CALCULATE INTERACTIONS *
1714 **************************/
1716 /* COULOMB ELECTROSTATICS */
1717 velec = _mm_mul_pd(qq23,rinv23);
1718 felec = _mm_mul_pd(velec,rinvsq23);
1720 fscal = felec;
1722 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1724 /* Calculate temporary vectorial force */
1725 tx = _mm_mul_pd(fscal,dx23);
1726 ty = _mm_mul_pd(fscal,dy23);
1727 tz = _mm_mul_pd(fscal,dz23);
1729 /* Update vectorial force */
1730 fix2 = _mm_add_pd(fix2,tx);
1731 fiy2 = _mm_add_pd(fiy2,ty);
1732 fiz2 = _mm_add_pd(fiz2,tz);
1734 fjx3 = _mm_add_pd(fjx3,tx);
1735 fjy3 = _mm_add_pd(fjy3,ty);
1736 fjz3 = _mm_add_pd(fjz3,tz);
1738 /**************************
1739 * CALCULATE INTERACTIONS *
1740 **************************/
1742 /* COULOMB ELECTROSTATICS */
1743 velec = _mm_mul_pd(qq31,rinv31);
1744 felec = _mm_mul_pd(velec,rinvsq31);
1746 fscal = felec;
1748 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1750 /* Calculate temporary vectorial force */
1751 tx = _mm_mul_pd(fscal,dx31);
1752 ty = _mm_mul_pd(fscal,dy31);
1753 tz = _mm_mul_pd(fscal,dz31);
1755 /* Update vectorial force */
1756 fix3 = _mm_add_pd(fix3,tx);
1757 fiy3 = _mm_add_pd(fiy3,ty);
1758 fiz3 = _mm_add_pd(fiz3,tz);
1760 fjx1 = _mm_add_pd(fjx1,tx);
1761 fjy1 = _mm_add_pd(fjy1,ty);
1762 fjz1 = _mm_add_pd(fjz1,tz);
1764 /**************************
1765 * CALCULATE INTERACTIONS *
1766 **************************/
1768 /* COULOMB ELECTROSTATICS */
1769 velec = _mm_mul_pd(qq32,rinv32);
1770 felec = _mm_mul_pd(velec,rinvsq32);
1772 fscal = felec;
1774 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1776 /* Calculate temporary vectorial force */
1777 tx = _mm_mul_pd(fscal,dx32);
1778 ty = _mm_mul_pd(fscal,dy32);
1779 tz = _mm_mul_pd(fscal,dz32);
1781 /* Update vectorial force */
1782 fix3 = _mm_add_pd(fix3,tx);
1783 fiy3 = _mm_add_pd(fiy3,ty);
1784 fiz3 = _mm_add_pd(fiz3,tz);
1786 fjx2 = _mm_add_pd(fjx2,tx);
1787 fjy2 = _mm_add_pd(fjy2,ty);
1788 fjz2 = _mm_add_pd(fjz2,tz);
1790 /**************************
1791 * CALCULATE INTERACTIONS *
1792 **************************/
1794 /* COULOMB ELECTROSTATICS */
1795 velec = _mm_mul_pd(qq33,rinv33);
1796 felec = _mm_mul_pd(velec,rinvsq33);
1798 fscal = felec;
1800 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1802 /* Calculate temporary vectorial force */
1803 tx = _mm_mul_pd(fscal,dx33);
1804 ty = _mm_mul_pd(fscal,dy33);
1805 tz = _mm_mul_pd(fscal,dz33);
1807 /* Update vectorial force */
1808 fix3 = _mm_add_pd(fix3,tx);
1809 fiy3 = _mm_add_pd(fiy3,ty);
1810 fiz3 = _mm_add_pd(fiz3,tz);
1812 fjx3 = _mm_add_pd(fjx3,tx);
1813 fjy3 = _mm_add_pd(fjy3,ty);
1814 fjz3 = _mm_add_pd(fjz3,tz);
1816 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1818 /* Inner loop uses 273 flops */
1821 /* End of innermost loop */
1823 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1824 f+i_coord_offset,fshift+i_shift_offset);
1826 /* Increment number of inner iterations */
1827 inneriter += j_index_end - j_index_start;
1829 /* Outer loop uses 24 flops */
1832 /* Increment number of outer iterations */
1833 outeriter += nri;
1835 /* Update outer/inner flops */
1837 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*273);