Double precision SSE2 kernels
[gromacs.git] / src / gmxlib / nonbonded / nb_kernel_sse2_double / nb_kernel_ElecEwSw_VdwNone_GeomW4P1_sse2_double.c
blobb4567e55a268b84ff011e573be36324041eba05f
1 /*
2 * Note: this file was generated by the Gromacs sse2_double kernel generator.
4 * This source code is part of
6 * G R O M A C S
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
17 * later version.
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
26 #include <math.h>
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
33 #include "gmx_math_x86_sse2_double.h"
34 #include "kernelutil_x86_sse2_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sse2_double
38 * Electrostatics interaction: Ewald
39 * VdW interaction: None
40 * Geometry: Water4-Particle
41 * Calculate force/pot: PotentialAndForce
43 void
44 nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sse2_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB;
61 int j_coord_offsetA,j_coord_offsetB;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
63 real rcutoff_scalar;
64 real *shiftvec,*fshift,*x,*f;
65 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
66 int vdwioffset1;
67 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
68 int vdwioffset2;
69 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
70 int vdwioffset3;
71 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
72 int vdwjidx0A,vdwjidx0B;
73 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
74 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
75 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
76 __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
77 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
78 real *charge;
79 __m128i ewitab;
80 __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
81 real *ewtab;
82 __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
83 real rswitch_scalar,d_scalar;
84 __m128d dummy_mask,cutoff_mask;
85 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
86 __m128d one = _mm_set1_pd(1.0);
87 __m128d two = _mm_set1_pd(2.0);
88 x = xx[0];
89 f = ff[0];
91 nri = nlist->nri;
92 iinr = nlist->iinr;
93 jindex = nlist->jindex;
94 jjnr = nlist->jjnr;
95 shiftidx = nlist->shift;
96 gid = nlist->gid;
97 shiftvec = fr->shift_vec[0];
98 fshift = fr->fshift[0];
99 facel = _mm_set1_pd(fr->epsfac);
100 charge = mdatoms->chargeA;
102 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
103 ewtab = fr->ic->tabq_coul_FDV0;
104 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
105 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
107 /* Setup water-specific parameters */
108 inr = nlist->iinr[0];
109 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
110 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
111 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
113 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
114 rcutoff_scalar = fr->rcoulomb;
115 rcutoff = _mm_set1_pd(rcutoff_scalar);
116 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
118 rswitch_scalar = fr->rcoulomb_switch;
119 rswitch = _mm_set1_pd(rswitch_scalar);
120 /* Setup switch parameters */
121 d_scalar = rcutoff_scalar-rswitch_scalar;
122 d = _mm_set1_pd(d_scalar);
123 swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
124 swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
125 swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
126 swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
127 swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
128 swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
130 /* Avoid stupid compiler warnings */
131 jnrA = jnrB = 0;
132 j_coord_offsetA = 0;
133 j_coord_offsetB = 0;
135 outeriter = 0;
136 inneriter = 0;
138 /* Start outer loop over neighborlists */
139 for(iidx=0; iidx<nri; iidx++)
141 /* Load shift vector for this list */
142 i_shift_offset = DIM*shiftidx[iidx];
144 /* Load limits for loop over neighbors */
145 j_index_start = jindex[iidx];
146 j_index_end = jindex[iidx+1];
148 /* Get outer coordinate index */
149 inr = iinr[iidx];
150 i_coord_offset = DIM*inr;
152 /* Load i particle coords and add shift vector */
153 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
154 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
156 fix1 = _mm_setzero_pd();
157 fiy1 = _mm_setzero_pd();
158 fiz1 = _mm_setzero_pd();
159 fix2 = _mm_setzero_pd();
160 fiy2 = _mm_setzero_pd();
161 fiz2 = _mm_setzero_pd();
162 fix3 = _mm_setzero_pd();
163 fiy3 = _mm_setzero_pd();
164 fiz3 = _mm_setzero_pd();
166 /* Reset potential sums */
167 velecsum = _mm_setzero_pd();
169 /* Start inner kernel loop */
170 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
173 /* Get j neighbor index, and coordinate index */
174 jnrA = jjnr[jidx];
175 jnrB = jjnr[jidx+1];
176 j_coord_offsetA = DIM*jnrA;
177 j_coord_offsetB = DIM*jnrB;
179 /* load j atom coordinates */
180 gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
181 &jx0,&jy0,&jz0);
183 /* Calculate displacement vector */
184 dx10 = _mm_sub_pd(ix1,jx0);
185 dy10 = _mm_sub_pd(iy1,jy0);
186 dz10 = _mm_sub_pd(iz1,jz0);
187 dx20 = _mm_sub_pd(ix2,jx0);
188 dy20 = _mm_sub_pd(iy2,jy0);
189 dz20 = _mm_sub_pd(iz2,jz0);
190 dx30 = _mm_sub_pd(ix3,jx0);
191 dy30 = _mm_sub_pd(iy3,jy0);
192 dz30 = _mm_sub_pd(iz3,jz0);
194 /* Calculate squared distance and things based on it */
195 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
196 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
197 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
199 rinv10 = gmx_mm_invsqrt_pd(rsq10);
200 rinv20 = gmx_mm_invsqrt_pd(rsq20);
201 rinv30 = gmx_mm_invsqrt_pd(rsq30);
203 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
204 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
205 rinvsq30 = _mm_mul_pd(rinv30,rinv30);
207 /* Load parameters for j particles */
208 jq0 = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
210 fjx0 = _mm_setzero_pd();
211 fjy0 = _mm_setzero_pd();
212 fjz0 = _mm_setzero_pd();
214 /**************************
215 * CALCULATE INTERACTIONS *
216 **************************/
218 if (gmx_mm_any_lt(rsq10,rcutoff2))
221 r10 = _mm_mul_pd(rsq10,rinv10);
223 /* Compute parameters for interactions between i and j atoms */
224 qq10 = _mm_mul_pd(iq1,jq0);
226 /* EWALD ELECTROSTATICS */
228 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
229 ewrt = _mm_mul_pd(r10,ewtabscale);
230 ewitab = _mm_cvttpd_epi32(ewrt);
231 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
232 ewitab = _mm_slli_epi32(ewitab,2);
233 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
234 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
235 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
236 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
237 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
238 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
239 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
240 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
241 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
242 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
244 d = _mm_sub_pd(r10,rswitch);
245 d = _mm_max_pd(d,_mm_setzero_pd());
246 d2 = _mm_mul_pd(d,d);
247 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
249 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
251 /* Evaluate switch function */
252 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
253 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
254 velec = _mm_mul_pd(velec,sw);
255 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
257 /* Update potential sum for this i atom from the interaction with this j atom. */
258 velec = _mm_and_pd(velec,cutoff_mask);
259 velecsum = _mm_add_pd(velecsum,velec);
261 fscal = felec;
263 fscal = _mm_and_pd(fscal,cutoff_mask);
265 /* Calculate temporary vectorial force */
266 tx = _mm_mul_pd(fscal,dx10);
267 ty = _mm_mul_pd(fscal,dy10);
268 tz = _mm_mul_pd(fscal,dz10);
270 /* Update vectorial force */
271 fix1 = _mm_add_pd(fix1,tx);
272 fiy1 = _mm_add_pd(fiy1,ty);
273 fiz1 = _mm_add_pd(fiz1,tz);
275 fjx0 = _mm_add_pd(fjx0,tx);
276 fjy0 = _mm_add_pd(fjy0,ty);
277 fjz0 = _mm_add_pd(fjz0,tz);
281 /**************************
282 * CALCULATE INTERACTIONS *
283 **************************/
285 if (gmx_mm_any_lt(rsq20,rcutoff2))
288 r20 = _mm_mul_pd(rsq20,rinv20);
290 /* Compute parameters for interactions between i and j atoms */
291 qq20 = _mm_mul_pd(iq2,jq0);
293 /* EWALD ELECTROSTATICS */
295 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
296 ewrt = _mm_mul_pd(r20,ewtabscale);
297 ewitab = _mm_cvttpd_epi32(ewrt);
298 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
299 ewitab = _mm_slli_epi32(ewitab,2);
300 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
301 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
302 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
303 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
304 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
305 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
306 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
307 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
308 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
309 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
311 d = _mm_sub_pd(r20,rswitch);
312 d = _mm_max_pd(d,_mm_setzero_pd());
313 d2 = _mm_mul_pd(d,d);
314 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
316 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
318 /* Evaluate switch function */
319 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
320 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
321 velec = _mm_mul_pd(velec,sw);
322 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
324 /* Update potential sum for this i atom from the interaction with this j atom. */
325 velec = _mm_and_pd(velec,cutoff_mask);
326 velecsum = _mm_add_pd(velecsum,velec);
328 fscal = felec;
330 fscal = _mm_and_pd(fscal,cutoff_mask);
332 /* Calculate temporary vectorial force */
333 tx = _mm_mul_pd(fscal,dx20);
334 ty = _mm_mul_pd(fscal,dy20);
335 tz = _mm_mul_pd(fscal,dz20);
337 /* Update vectorial force */
338 fix2 = _mm_add_pd(fix2,tx);
339 fiy2 = _mm_add_pd(fiy2,ty);
340 fiz2 = _mm_add_pd(fiz2,tz);
342 fjx0 = _mm_add_pd(fjx0,tx);
343 fjy0 = _mm_add_pd(fjy0,ty);
344 fjz0 = _mm_add_pd(fjz0,tz);
348 /**************************
349 * CALCULATE INTERACTIONS *
350 **************************/
352 if (gmx_mm_any_lt(rsq30,rcutoff2))
355 r30 = _mm_mul_pd(rsq30,rinv30);
357 /* Compute parameters for interactions between i and j atoms */
358 qq30 = _mm_mul_pd(iq3,jq0);
360 /* EWALD ELECTROSTATICS */
362 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
363 ewrt = _mm_mul_pd(r30,ewtabscale);
364 ewitab = _mm_cvttpd_epi32(ewrt);
365 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
366 ewitab = _mm_slli_epi32(ewitab,2);
367 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
368 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
369 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
370 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
371 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
372 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
373 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
374 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
375 velec = _mm_mul_pd(qq30,_mm_sub_pd(rinv30,velec));
376 felec = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
378 d = _mm_sub_pd(r30,rswitch);
379 d = _mm_max_pd(d,_mm_setzero_pd());
380 d2 = _mm_mul_pd(d,d);
381 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
383 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
385 /* Evaluate switch function */
386 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
387 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv30,_mm_mul_pd(velec,dsw)) );
388 velec = _mm_mul_pd(velec,sw);
389 cutoff_mask = _mm_cmplt_pd(rsq30,rcutoff2);
391 /* Update potential sum for this i atom from the interaction with this j atom. */
392 velec = _mm_and_pd(velec,cutoff_mask);
393 velecsum = _mm_add_pd(velecsum,velec);
395 fscal = felec;
397 fscal = _mm_and_pd(fscal,cutoff_mask);
399 /* Calculate temporary vectorial force */
400 tx = _mm_mul_pd(fscal,dx30);
401 ty = _mm_mul_pd(fscal,dy30);
402 tz = _mm_mul_pd(fscal,dz30);
404 /* Update vectorial force */
405 fix3 = _mm_add_pd(fix3,tx);
406 fiy3 = _mm_add_pd(fiy3,ty);
407 fiz3 = _mm_add_pd(fiz3,tz);
409 fjx0 = _mm_add_pd(fjx0,tx);
410 fjy0 = _mm_add_pd(fjy0,ty);
411 fjz0 = _mm_add_pd(fjz0,tz);
415 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
417 /* Inner loop uses 198 flops */
420 if(jidx<j_index_end)
423 jnrA = jjnr[jidx];
424 j_coord_offsetA = DIM*jnrA;
426 /* load j atom coordinates */
427 gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
428 &jx0,&jy0,&jz0);
430 /* Calculate displacement vector */
431 dx10 = _mm_sub_pd(ix1,jx0);
432 dy10 = _mm_sub_pd(iy1,jy0);
433 dz10 = _mm_sub_pd(iz1,jz0);
434 dx20 = _mm_sub_pd(ix2,jx0);
435 dy20 = _mm_sub_pd(iy2,jy0);
436 dz20 = _mm_sub_pd(iz2,jz0);
437 dx30 = _mm_sub_pd(ix3,jx0);
438 dy30 = _mm_sub_pd(iy3,jy0);
439 dz30 = _mm_sub_pd(iz3,jz0);
441 /* Calculate squared distance and things based on it */
442 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
443 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
444 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
446 rinv10 = gmx_mm_invsqrt_pd(rsq10);
447 rinv20 = gmx_mm_invsqrt_pd(rsq20);
448 rinv30 = gmx_mm_invsqrt_pd(rsq30);
450 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
451 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
452 rinvsq30 = _mm_mul_pd(rinv30,rinv30);
454 /* Load parameters for j particles */
455 jq0 = _mm_load_sd(charge+jnrA+0);
457 fjx0 = _mm_setzero_pd();
458 fjy0 = _mm_setzero_pd();
459 fjz0 = _mm_setzero_pd();
461 /**************************
462 * CALCULATE INTERACTIONS *
463 **************************/
465 if (gmx_mm_any_lt(rsq10,rcutoff2))
468 r10 = _mm_mul_pd(rsq10,rinv10);
470 /* Compute parameters for interactions between i and j atoms */
471 qq10 = _mm_mul_pd(iq1,jq0);
473 /* EWALD ELECTROSTATICS */
475 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
476 ewrt = _mm_mul_pd(r10,ewtabscale);
477 ewitab = _mm_cvttpd_epi32(ewrt);
478 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
479 ewitab = _mm_slli_epi32(ewitab,2);
480 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
481 ewtabD = _mm_setzero_pd();
482 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
483 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
484 ewtabFn = _mm_setzero_pd();
485 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
486 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
487 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
488 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
489 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
491 d = _mm_sub_pd(r10,rswitch);
492 d = _mm_max_pd(d,_mm_setzero_pd());
493 d2 = _mm_mul_pd(d,d);
494 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
496 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
498 /* Evaluate switch function */
499 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
500 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
501 velec = _mm_mul_pd(velec,sw);
502 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
504 /* Update potential sum for this i atom from the interaction with this j atom. */
505 velec = _mm_and_pd(velec,cutoff_mask);
506 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
507 velecsum = _mm_add_pd(velecsum,velec);
509 fscal = felec;
511 fscal = _mm_and_pd(fscal,cutoff_mask);
513 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
515 /* Calculate temporary vectorial force */
516 tx = _mm_mul_pd(fscal,dx10);
517 ty = _mm_mul_pd(fscal,dy10);
518 tz = _mm_mul_pd(fscal,dz10);
520 /* Update vectorial force */
521 fix1 = _mm_add_pd(fix1,tx);
522 fiy1 = _mm_add_pd(fiy1,ty);
523 fiz1 = _mm_add_pd(fiz1,tz);
525 fjx0 = _mm_add_pd(fjx0,tx);
526 fjy0 = _mm_add_pd(fjy0,ty);
527 fjz0 = _mm_add_pd(fjz0,tz);
531 /**************************
532 * CALCULATE INTERACTIONS *
533 **************************/
535 if (gmx_mm_any_lt(rsq20,rcutoff2))
538 r20 = _mm_mul_pd(rsq20,rinv20);
540 /* Compute parameters for interactions between i and j atoms */
541 qq20 = _mm_mul_pd(iq2,jq0);
543 /* EWALD ELECTROSTATICS */
545 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
546 ewrt = _mm_mul_pd(r20,ewtabscale);
547 ewitab = _mm_cvttpd_epi32(ewrt);
548 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
549 ewitab = _mm_slli_epi32(ewitab,2);
550 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
551 ewtabD = _mm_setzero_pd();
552 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
553 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
554 ewtabFn = _mm_setzero_pd();
555 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
556 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
557 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
558 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
559 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
561 d = _mm_sub_pd(r20,rswitch);
562 d = _mm_max_pd(d,_mm_setzero_pd());
563 d2 = _mm_mul_pd(d,d);
564 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
566 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
568 /* Evaluate switch function */
569 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
570 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
571 velec = _mm_mul_pd(velec,sw);
572 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
574 /* Update potential sum for this i atom from the interaction with this j atom. */
575 velec = _mm_and_pd(velec,cutoff_mask);
576 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
577 velecsum = _mm_add_pd(velecsum,velec);
579 fscal = felec;
581 fscal = _mm_and_pd(fscal,cutoff_mask);
583 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
585 /* Calculate temporary vectorial force */
586 tx = _mm_mul_pd(fscal,dx20);
587 ty = _mm_mul_pd(fscal,dy20);
588 tz = _mm_mul_pd(fscal,dz20);
590 /* Update vectorial force */
591 fix2 = _mm_add_pd(fix2,tx);
592 fiy2 = _mm_add_pd(fiy2,ty);
593 fiz2 = _mm_add_pd(fiz2,tz);
595 fjx0 = _mm_add_pd(fjx0,tx);
596 fjy0 = _mm_add_pd(fjy0,ty);
597 fjz0 = _mm_add_pd(fjz0,tz);
601 /**************************
602 * CALCULATE INTERACTIONS *
603 **************************/
605 if (gmx_mm_any_lt(rsq30,rcutoff2))
608 r30 = _mm_mul_pd(rsq30,rinv30);
610 /* Compute parameters for interactions between i and j atoms */
611 qq30 = _mm_mul_pd(iq3,jq0);
613 /* EWALD ELECTROSTATICS */
615 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
616 ewrt = _mm_mul_pd(r30,ewtabscale);
617 ewitab = _mm_cvttpd_epi32(ewrt);
618 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
619 ewitab = _mm_slli_epi32(ewitab,2);
620 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
621 ewtabD = _mm_setzero_pd();
622 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
623 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
624 ewtabFn = _mm_setzero_pd();
625 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
626 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
627 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
628 velec = _mm_mul_pd(qq30,_mm_sub_pd(rinv30,velec));
629 felec = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
631 d = _mm_sub_pd(r30,rswitch);
632 d = _mm_max_pd(d,_mm_setzero_pd());
633 d2 = _mm_mul_pd(d,d);
634 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
636 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
638 /* Evaluate switch function */
639 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
640 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv30,_mm_mul_pd(velec,dsw)) );
641 velec = _mm_mul_pd(velec,sw);
642 cutoff_mask = _mm_cmplt_pd(rsq30,rcutoff2);
644 /* Update potential sum for this i atom from the interaction with this j atom. */
645 velec = _mm_and_pd(velec,cutoff_mask);
646 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
647 velecsum = _mm_add_pd(velecsum,velec);
649 fscal = felec;
651 fscal = _mm_and_pd(fscal,cutoff_mask);
653 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
655 /* Calculate temporary vectorial force */
656 tx = _mm_mul_pd(fscal,dx30);
657 ty = _mm_mul_pd(fscal,dy30);
658 tz = _mm_mul_pd(fscal,dz30);
660 /* Update vectorial force */
661 fix3 = _mm_add_pd(fix3,tx);
662 fiy3 = _mm_add_pd(fiy3,ty);
663 fiz3 = _mm_add_pd(fiz3,tz);
665 fjx0 = _mm_add_pd(fjx0,tx);
666 fjy0 = _mm_add_pd(fjy0,ty);
667 fjz0 = _mm_add_pd(fjz0,tz);
671 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
673 /* Inner loop uses 198 flops */
676 /* End of innermost loop */
678 gmx_mm_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
679 f+i_coord_offset+DIM,fshift+i_shift_offset);
681 ggid = gid[iidx];
682 /* Update potential energies */
683 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
685 /* Increment number of inner iterations */
686 inneriter += j_index_end - j_index_start;
688 /* Outer loop uses 19 flops */
691 /* Increment number of outer iterations */
692 outeriter += nri;
694 /* Update outer/inner flops */
696 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*198);
699 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse2_double
700 * Electrostatics interaction: Ewald
701 * VdW interaction: None
702 * Geometry: Water4-Particle
703 * Calculate force/pot: Force
705 void
706 nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse2_double
707 (t_nblist * gmx_restrict nlist,
708 rvec * gmx_restrict xx,
709 rvec * gmx_restrict ff,
710 t_forcerec * gmx_restrict fr,
711 t_mdatoms * gmx_restrict mdatoms,
712 nb_kernel_data_t * gmx_restrict kernel_data,
713 t_nrnb * gmx_restrict nrnb)
715 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
716 * just 0 for non-waters.
717 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
718 * jnr indices corresponding to data put in the four positions in the SIMD register.
720 int i_shift_offset,i_coord_offset,outeriter,inneriter;
721 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
722 int jnrA,jnrB;
723 int j_coord_offsetA,j_coord_offsetB;
724 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
725 real rcutoff_scalar;
726 real *shiftvec,*fshift,*x,*f;
727 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
728 int vdwioffset1;
729 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
730 int vdwioffset2;
731 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
732 int vdwioffset3;
733 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
734 int vdwjidx0A,vdwjidx0B;
735 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
736 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
737 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
738 __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
739 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
740 real *charge;
741 __m128i ewitab;
742 __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
743 real *ewtab;
744 __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
745 real rswitch_scalar,d_scalar;
746 __m128d dummy_mask,cutoff_mask;
747 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
748 __m128d one = _mm_set1_pd(1.0);
749 __m128d two = _mm_set1_pd(2.0);
750 x = xx[0];
751 f = ff[0];
753 nri = nlist->nri;
754 iinr = nlist->iinr;
755 jindex = nlist->jindex;
756 jjnr = nlist->jjnr;
757 shiftidx = nlist->shift;
758 gid = nlist->gid;
759 shiftvec = fr->shift_vec[0];
760 fshift = fr->fshift[0];
761 facel = _mm_set1_pd(fr->epsfac);
762 charge = mdatoms->chargeA;
764 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
765 ewtab = fr->ic->tabq_coul_FDV0;
766 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
767 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
769 /* Setup water-specific parameters */
770 inr = nlist->iinr[0];
771 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
772 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
773 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
775 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
776 rcutoff_scalar = fr->rcoulomb;
777 rcutoff = _mm_set1_pd(rcutoff_scalar);
778 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
780 rswitch_scalar = fr->rcoulomb_switch;
781 rswitch = _mm_set1_pd(rswitch_scalar);
782 /* Setup switch parameters */
783 d_scalar = rcutoff_scalar-rswitch_scalar;
784 d = _mm_set1_pd(d_scalar);
785 swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
786 swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
787 swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
788 swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
789 swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
790 swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
792 /* Avoid stupid compiler warnings */
793 jnrA = jnrB = 0;
794 j_coord_offsetA = 0;
795 j_coord_offsetB = 0;
797 outeriter = 0;
798 inneriter = 0;
800 /* Start outer loop over neighborlists */
801 for(iidx=0; iidx<nri; iidx++)
803 /* Load shift vector for this list */
804 i_shift_offset = DIM*shiftidx[iidx];
806 /* Load limits for loop over neighbors */
807 j_index_start = jindex[iidx];
808 j_index_end = jindex[iidx+1];
810 /* Get outer coordinate index */
811 inr = iinr[iidx];
812 i_coord_offset = DIM*inr;
814 /* Load i particle coords and add shift vector */
815 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
816 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
818 fix1 = _mm_setzero_pd();
819 fiy1 = _mm_setzero_pd();
820 fiz1 = _mm_setzero_pd();
821 fix2 = _mm_setzero_pd();
822 fiy2 = _mm_setzero_pd();
823 fiz2 = _mm_setzero_pd();
824 fix3 = _mm_setzero_pd();
825 fiy3 = _mm_setzero_pd();
826 fiz3 = _mm_setzero_pd();
828 /* Start inner kernel loop */
829 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
832 /* Get j neighbor index, and coordinate index */
833 jnrA = jjnr[jidx];
834 jnrB = jjnr[jidx+1];
835 j_coord_offsetA = DIM*jnrA;
836 j_coord_offsetB = DIM*jnrB;
838 /* load j atom coordinates */
839 gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
840 &jx0,&jy0,&jz0);
842 /* Calculate displacement vector */
843 dx10 = _mm_sub_pd(ix1,jx0);
844 dy10 = _mm_sub_pd(iy1,jy0);
845 dz10 = _mm_sub_pd(iz1,jz0);
846 dx20 = _mm_sub_pd(ix2,jx0);
847 dy20 = _mm_sub_pd(iy2,jy0);
848 dz20 = _mm_sub_pd(iz2,jz0);
849 dx30 = _mm_sub_pd(ix3,jx0);
850 dy30 = _mm_sub_pd(iy3,jy0);
851 dz30 = _mm_sub_pd(iz3,jz0);
853 /* Calculate squared distance and things based on it */
854 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
855 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
856 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
858 rinv10 = gmx_mm_invsqrt_pd(rsq10);
859 rinv20 = gmx_mm_invsqrt_pd(rsq20);
860 rinv30 = gmx_mm_invsqrt_pd(rsq30);
862 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
863 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
864 rinvsq30 = _mm_mul_pd(rinv30,rinv30);
866 /* Load parameters for j particles */
867 jq0 = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
869 fjx0 = _mm_setzero_pd();
870 fjy0 = _mm_setzero_pd();
871 fjz0 = _mm_setzero_pd();
873 /**************************
874 * CALCULATE INTERACTIONS *
875 **************************/
877 if (gmx_mm_any_lt(rsq10,rcutoff2))
880 r10 = _mm_mul_pd(rsq10,rinv10);
882 /* Compute parameters for interactions between i and j atoms */
883 qq10 = _mm_mul_pd(iq1,jq0);
885 /* EWALD ELECTROSTATICS */
887 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
888 ewrt = _mm_mul_pd(r10,ewtabscale);
889 ewitab = _mm_cvttpd_epi32(ewrt);
890 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
891 ewitab = _mm_slli_epi32(ewitab,2);
892 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
893 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
894 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
895 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
896 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
897 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
898 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
899 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
900 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
901 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
903 d = _mm_sub_pd(r10,rswitch);
904 d = _mm_max_pd(d,_mm_setzero_pd());
905 d2 = _mm_mul_pd(d,d);
906 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
908 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
910 /* Evaluate switch function */
911 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
912 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
913 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
915 fscal = felec;
917 fscal = _mm_and_pd(fscal,cutoff_mask);
919 /* Calculate temporary vectorial force */
920 tx = _mm_mul_pd(fscal,dx10);
921 ty = _mm_mul_pd(fscal,dy10);
922 tz = _mm_mul_pd(fscal,dz10);
924 /* Update vectorial force */
925 fix1 = _mm_add_pd(fix1,tx);
926 fiy1 = _mm_add_pd(fiy1,ty);
927 fiz1 = _mm_add_pd(fiz1,tz);
929 fjx0 = _mm_add_pd(fjx0,tx);
930 fjy0 = _mm_add_pd(fjy0,ty);
931 fjz0 = _mm_add_pd(fjz0,tz);
935 /**************************
936 * CALCULATE INTERACTIONS *
937 **************************/
939 if (gmx_mm_any_lt(rsq20,rcutoff2))
942 r20 = _mm_mul_pd(rsq20,rinv20);
944 /* Compute parameters for interactions between i and j atoms */
945 qq20 = _mm_mul_pd(iq2,jq0);
947 /* EWALD ELECTROSTATICS */
949 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
950 ewrt = _mm_mul_pd(r20,ewtabscale);
951 ewitab = _mm_cvttpd_epi32(ewrt);
952 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
953 ewitab = _mm_slli_epi32(ewitab,2);
954 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
955 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
956 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
957 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
958 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
959 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
960 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
961 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
962 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
963 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
965 d = _mm_sub_pd(r20,rswitch);
966 d = _mm_max_pd(d,_mm_setzero_pd());
967 d2 = _mm_mul_pd(d,d);
968 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
970 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
972 /* Evaluate switch function */
973 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
974 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
975 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
977 fscal = felec;
979 fscal = _mm_and_pd(fscal,cutoff_mask);
981 /* Calculate temporary vectorial force */
982 tx = _mm_mul_pd(fscal,dx20);
983 ty = _mm_mul_pd(fscal,dy20);
984 tz = _mm_mul_pd(fscal,dz20);
986 /* Update vectorial force */
987 fix2 = _mm_add_pd(fix2,tx);
988 fiy2 = _mm_add_pd(fiy2,ty);
989 fiz2 = _mm_add_pd(fiz2,tz);
991 fjx0 = _mm_add_pd(fjx0,tx);
992 fjy0 = _mm_add_pd(fjy0,ty);
993 fjz0 = _mm_add_pd(fjz0,tz);
997 /**************************
998 * CALCULATE INTERACTIONS *
999 **************************/
1001 if (gmx_mm_any_lt(rsq30,rcutoff2))
1004 r30 = _mm_mul_pd(rsq30,rinv30);
1006 /* Compute parameters for interactions between i and j atoms */
1007 qq30 = _mm_mul_pd(iq3,jq0);
1009 /* EWALD ELECTROSTATICS */
1011 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1012 ewrt = _mm_mul_pd(r30,ewtabscale);
1013 ewitab = _mm_cvttpd_epi32(ewrt);
1014 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1015 ewitab = _mm_slli_epi32(ewitab,2);
1016 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1017 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1018 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1019 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1020 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
1021 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1022 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1023 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1024 velec = _mm_mul_pd(qq30,_mm_sub_pd(rinv30,velec));
1025 felec = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
1027 d = _mm_sub_pd(r30,rswitch);
1028 d = _mm_max_pd(d,_mm_setzero_pd());
1029 d2 = _mm_mul_pd(d,d);
1030 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1032 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1034 /* Evaluate switch function */
1035 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1036 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv30,_mm_mul_pd(velec,dsw)) );
1037 cutoff_mask = _mm_cmplt_pd(rsq30,rcutoff2);
1039 fscal = felec;
1041 fscal = _mm_and_pd(fscal,cutoff_mask);
1043 /* Calculate temporary vectorial force */
1044 tx = _mm_mul_pd(fscal,dx30);
1045 ty = _mm_mul_pd(fscal,dy30);
1046 tz = _mm_mul_pd(fscal,dz30);
1048 /* Update vectorial force */
1049 fix3 = _mm_add_pd(fix3,tx);
1050 fiy3 = _mm_add_pd(fiy3,ty);
1051 fiz3 = _mm_add_pd(fiz3,tz);
1053 fjx0 = _mm_add_pd(fjx0,tx);
1054 fjy0 = _mm_add_pd(fjy0,ty);
1055 fjz0 = _mm_add_pd(fjz0,tz);
1059 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
1061 /* Inner loop uses 189 flops */
1064 if(jidx<j_index_end)
1067 jnrA = jjnr[jidx];
1068 j_coord_offsetA = DIM*jnrA;
1070 /* load j atom coordinates */
1071 gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1072 &jx0,&jy0,&jz0);
1074 /* Calculate displacement vector */
1075 dx10 = _mm_sub_pd(ix1,jx0);
1076 dy10 = _mm_sub_pd(iy1,jy0);
1077 dz10 = _mm_sub_pd(iz1,jz0);
1078 dx20 = _mm_sub_pd(ix2,jx0);
1079 dy20 = _mm_sub_pd(iy2,jy0);
1080 dz20 = _mm_sub_pd(iz2,jz0);
1081 dx30 = _mm_sub_pd(ix3,jx0);
1082 dy30 = _mm_sub_pd(iy3,jy0);
1083 dz30 = _mm_sub_pd(iz3,jz0);
1085 /* Calculate squared distance and things based on it */
1086 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1087 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1088 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
1090 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1091 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1092 rinv30 = gmx_mm_invsqrt_pd(rsq30);
1094 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
1095 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
1096 rinvsq30 = _mm_mul_pd(rinv30,rinv30);
1098 /* Load parameters for j particles */
1099 jq0 = _mm_load_sd(charge+jnrA+0);
1101 fjx0 = _mm_setzero_pd();
1102 fjy0 = _mm_setzero_pd();
1103 fjz0 = _mm_setzero_pd();
1105 /**************************
1106 * CALCULATE INTERACTIONS *
1107 **************************/
1109 if (gmx_mm_any_lt(rsq10,rcutoff2))
1112 r10 = _mm_mul_pd(rsq10,rinv10);
1114 /* Compute parameters for interactions between i and j atoms */
1115 qq10 = _mm_mul_pd(iq1,jq0);
1117 /* EWALD ELECTROSTATICS */
1119 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1120 ewrt = _mm_mul_pd(r10,ewtabscale);
1121 ewitab = _mm_cvttpd_epi32(ewrt);
1122 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1123 ewitab = _mm_slli_epi32(ewitab,2);
1124 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1125 ewtabD = _mm_setzero_pd();
1126 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1127 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1128 ewtabFn = _mm_setzero_pd();
1129 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1130 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1131 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1132 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
1133 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
1135 d = _mm_sub_pd(r10,rswitch);
1136 d = _mm_max_pd(d,_mm_setzero_pd());
1137 d2 = _mm_mul_pd(d,d);
1138 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1140 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1142 /* Evaluate switch function */
1143 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1144 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
1145 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
1147 fscal = felec;
1149 fscal = _mm_and_pd(fscal,cutoff_mask);
1151 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1153 /* Calculate temporary vectorial force */
1154 tx = _mm_mul_pd(fscal,dx10);
1155 ty = _mm_mul_pd(fscal,dy10);
1156 tz = _mm_mul_pd(fscal,dz10);
1158 /* Update vectorial force */
1159 fix1 = _mm_add_pd(fix1,tx);
1160 fiy1 = _mm_add_pd(fiy1,ty);
1161 fiz1 = _mm_add_pd(fiz1,tz);
1163 fjx0 = _mm_add_pd(fjx0,tx);
1164 fjy0 = _mm_add_pd(fjy0,ty);
1165 fjz0 = _mm_add_pd(fjz0,tz);
1169 /**************************
1170 * CALCULATE INTERACTIONS *
1171 **************************/
1173 if (gmx_mm_any_lt(rsq20,rcutoff2))
1176 r20 = _mm_mul_pd(rsq20,rinv20);
1178 /* Compute parameters for interactions between i and j atoms */
1179 qq20 = _mm_mul_pd(iq2,jq0);
1181 /* EWALD ELECTROSTATICS */
1183 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1184 ewrt = _mm_mul_pd(r20,ewtabscale);
1185 ewitab = _mm_cvttpd_epi32(ewrt);
1186 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1187 ewitab = _mm_slli_epi32(ewitab,2);
1188 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1189 ewtabD = _mm_setzero_pd();
1190 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1191 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1192 ewtabFn = _mm_setzero_pd();
1193 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1194 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1195 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1196 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
1197 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
1199 d = _mm_sub_pd(r20,rswitch);
1200 d = _mm_max_pd(d,_mm_setzero_pd());
1201 d2 = _mm_mul_pd(d,d);
1202 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1204 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1206 /* Evaluate switch function */
1207 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1208 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
1209 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
1211 fscal = felec;
1213 fscal = _mm_and_pd(fscal,cutoff_mask);
1215 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1217 /* Calculate temporary vectorial force */
1218 tx = _mm_mul_pd(fscal,dx20);
1219 ty = _mm_mul_pd(fscal,dy20);
1220 tz = _mm_mul_pd(fscal,dz20);
1222 /* Update vectorial force */
1223 fix2 = _mm_add_pd(fix2,tx);
1224 fiy2 = _mm_add_pd(fiy2,ty);
1225 fiz2 = _mm_add_pd(fiz2,tz);
1227 fjx0 = _mm_add_pd(fjx0,tx);
1228 fjy0 = _mm_add_pd(fjy0,ty);
1229 fjz0 = _mm_add_pd(fjz0,tz);
1233 /**************************
1234 * CALCULATE INTERACTIONS *
1235 **************************/
1237 if (gmx_mm_any_lt(rsq30,rcutoff2))
1240 r30 = _mm_mul_pd(rsq30,rinv30);
1242 /* Compute parameters for interactions between i and j atoms */
1243 qq30 = _mm_mul_pd(iq3,jq0);
1245 /* EWALD ELECTROSTATICS */
1247 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1248 ewrt = _mm_mul_pd(r30,ewtabscale);
1249 ewitab = _mm_cvttpd_epi32(ewrt);
1250 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1251 ewitab = _mm_slli_epi32(ewitab,2);
1252 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1253 ewtabD = _mm_setzero_pd();
1254 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1255 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1256 ewtabFn = _mm_setzero_pd();
1257 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1258 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1259 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1260 velec = _mm_mul_pd(qq30,_mm_sub_pd(rinv30,velec));
1261 felec = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
1263 d = _mm_sub_pd(r30,rswitch);
1264 d = _mm_max_pd(d,_mm_setzero_pd());
1265 d2 = _mm_mul_pd(d,d);
1266 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1268 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1270 /* Evaluate switch function */
1271 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1272 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv30,_mm_mul_pd(velec,dsw)) );
1273 cutoff_mask = _mm_cmplt_pd(rsq30,rcutoff2);
1275 fscal = felec;
1277 fscal = _mm_and_pd(fscal,cutoff_mask);
1279 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1281 /* Calculate temporary vectorial force */
1282 tx = _mm_mul_pd(fscal,dx30);
1283 ty = _mm_mul_pd(fscal,dy30);
1284 tz = _mm_mul_pd(fscal,dz30);
1286 /* Update vectorial force */
1287 fix3 = _mm_add_pd(fix3,tx);
1288 fiy3 = _mm_add_pd(fiy3,ty);
1289 fiz3 = _mm_add_pd(fiz3,tz);
1291 fjx0 = _mm_add_pd(fjx0,tx);
1292 fjy0 = _mm_add_pd(fjy0,ty);
1293 fjz0 = _mm_add_pd(fjz0,tz);
1297 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
1299 /* Inner loop uses 189 flops */
1302 /* End of innermost loop */
1304 gmx_mm_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1305 f+i_coord_offset+DIM,fshift+i_shift_offset);
1307 /* Increment number of inner iterations */
1308 inneriter += j_index_end - j_index_start;
1310 /* Outer loop uses 18 flops */
1313 /* Increment number of outer iterations */
1314 outeriter += nri;
1316 /* Update outer/inner flops */
1318 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*18 + inneriter*189);