2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/math/vec.h"
46 #include "gromacs/legacyheaders/nrnb.h"
48 #include "gromacs/simd/math_x86_sse2_double.h"
49 #include "kernelutil_x86_sse2_double.h"
52 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_sse2_double
53 * Electrostatics interaction: Ewald
54 * VdW interaction: LJEwald
55 * Geometry: Water3-Particle
56 * Calculate force/pot: PotentialAndForce
59 nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_sse2_double
60 (t_nblist
* gmx_restrict nlist
,
61 rvec
* gmx_restrict xx
,
62 rvec
* gmx_restrict ff
,
63 t_forcerec
* gmx_restrict fr
,
64 t_mdatoms
* gmx_restrict mdatoms
,
65 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
66 t_nrnb
* gmx_restrict nrnb
)
68 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
69 * just 0 for non-waters.
70 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
71 * jnr indices corresponding to data put in the four positions in the SIMD register.
73 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
74 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
76 int j_coord_offsetA
,j_coord_offsetB
;
77 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
79 real
*shiftvec
,*fshift
,*x
,*f
;
80 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
82 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
84 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
86 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
87 int vdwjidx0A
,vdwjidx0B
;
88 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
89 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
90 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
91 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
92 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
95 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
98 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
99 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
103 __m128d ewclj
,ewclj2
,ewclj6
,ewcljrsq
,poly
,exponent
,f6A
,f6B
,sh_lj_ewald
;
105 __m128d one_half
= _mm_set1_pd(0.5);
106 __m128d minus_one
= _mm_set1_pd(-1.0);
108 __m128d ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
110 __m128d dummy_mask
,cutoff_mask
;
111 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
112 __m128d one
= _mm_set1_pd(1.0);
113 __m128d two
= _mm_set1_pd(2.0);
119 jindex
= nlist
->jindex
;
121 shiftidx
= nlist
->shift
;
123 shiftvec
= fr
->shift_vec
[0];
124 fshift
= fr
->fshift
[0];
125 facel
= _mm_set1_pd(fr
->epsfac
);
126 charge
= mdatoms
->chargeA
;
127 nvdwtype
= fr
->ntype
;
129 vdwtype
= mdatoms
->typeA
;
130 vdwgridparam
= fr
->ljpme_c6grid
;
131 sh_lj_ewald
= _mm_set1_pd(fr
->ic
->sh_lj_ewald
);
132 ewclj
= _mm_set1_pd(fr
->ewaldcoeff_lj
);
133 ewclj2
= _mm_mul_pd(minus_one
,_mm_mul_pd(ewclj
,ewclj
));
135 sh_ewald
= _mm_set1_pd(fr
->ic
->sh_ewald
);
136 ewtab
= fr
->ic
->tabq_coul_FDV0
;
137 ewtabscale
= _mm_set1_pd(fr
->ic
->tabq_scale
);
138 ewtabhalfspace
= _mm_set1_pd(0.5/fr
->ic
->tabq_scale
);
140 /* Setup water-specific parameters */
141 inr
= nlist
->iinr
[0];
142 iq0
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+0]));
143 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
144 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
145 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
147 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
148 rcutoff_scalar
= fr
->rcoulomb
;
149 rcutoff
= _mm_set1_pd(rcutoff_scalar
);
150 rcutoff2
= _mm_mul_pd(rcutoff
,rcutoff
);
152 sh_vdw_invrcut6
= _mm_set1_pd(fr
->ic
->sh_invrc6
);
153 rvdw
= _mm_set1_pd(fr
->rvdw
);
155 /* Avoid stupid compiler warnings */
163 /* Start outer loop over neighborlists */
164 for(iidx
=0; iidx
<nri
; iidx
++)
166 /* Load shift vector for this list */
167 i_shift_offset
= DIM
*shiftidx
[iidx
];
169 /* Load limits for loop over neighbors */
170 j_index_start
= jindex
[iidx
];
171 j_index_end
= jindex
[iidx
+1];
173 /* Get outer coordinate index */
175 i_coord_offset
= DIM
*inr
;
177 /* Load i particle coords and add shift vector */
178 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
179 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
181 fix0
= _mm_setzero_pd();
182 fiy0
= _mm_setzero_pd();
183 fiz0
= _mm_setzero_pd();
184 fix1
= _mm_setzero_pd();
185 fiy1
= _mm_setzero_pd();
186 fiz1
= _mm_setzero_pd();
187 fix2
= _mm_setzero_pd();
188 fiy2
= _mm_setzero_pd();
189 fiz2
= _mm_setzero_pd();
191 /* Reset potential sums */
192 velecsum
= _mm_setzero_pd();
193 vvdwsum
= _mm_setzero_pd();
195 /* Start inner kernel loop */
196 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
199 /* Get j neighbor index, and coordinate index */
202 j_coord_offsetA
= DIM
*jnrA
;
203 j_coord_offsetB
= DIM
*jnrB
;
205 /* load j atom coordinates */
206 gmx_mm_load_1rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
209 /* Calculate displacement vector */
210 dx00
= _mm_sub_pd(ix0
,jx0
);
211 dy00
= _mm_sub_pd(iy0
,jy0
);
212 dz00
= _mm_sub_pd(iz0
,jz0
);
213 dx10
= _mm_sub_pd(ix1
,jx0
);
214 dy10
= _mm_sub_pd(iy1
,jy0
);
215 dz10
= _mm_sub_pd(iz1
,jz0
);
216 dx20
= _mm_sub_pd(ix2
,jx0
);
217 dy20
= _mm_sub_pd(iy2
,jy0
);
218 dz20
= _mm_sub_pd(iz2
,jz0
);
220 /* Calculate squared distance and things based on it */
221 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
222 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
223 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
225 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
226 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
227 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
229 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
230 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
231 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
233 /* Load parameters for j particles */
234 jq0
= gmx_mm_load_2real_swizzle_pd(charge
+jnrA
+0,charge
+jnrB
+0);
235 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
236 vdwjidx0B
= 2*vdwtype
[jnrB
+0];
238 fjx0
= _mm_setzero_pd();
239 fjy0
= _mm_setzero_pd();
240 fjz0
= _mm_setzero_pd();
242 /**************************
243 * CALCULATE INTERACTIONS *
244 **************************/
246 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
249 r00
= _mm_mul_pd(rsq00
,rinv00
);
251 /* Compute parameters for interactions between i and j atoms */
252 qq00
= _mm_mul_pd(iq0
,jq0
);
253 gmx_mm_load_2pair_swizzle_pd(vdwparam
+vdwioffset0
+vdwjidx0A
,
254 vdwparam
+vdwioffset0
+vdwjidx0B
,&c6_00
,&c12_00
);
256 c6grid_00
= gmx_mm_load_2real_swizzle_pd(vdwgridparam
+vdwioffset0
+vdwjidx0A
,
257 vdwgridparam
+vdwioffset0
+vdwjidx0B
);
259 /* EWALD ELECTROSTATICS */
261 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
262 ewrt
= _mm_mul_pd(r00
,ewtabscale
);
263 ewitab
= _mm_cvttpd_epi32(ewrt
);
264 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
265 ewitab
= _mm_slli_epi32(ewitab
,2);
266 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
267 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
268 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
269 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
270 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
271 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
272 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
273 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
274 velec
= _mm_mul_pd(qq00
,_mm_sub_pd(_mm_sub_pd(rinv00
,sh_ewald
),velec
));
275 felec
= _mm_mul_pd(_mm_mul_pd(qq00
,rinv00
),_mm_sub_pd(rinvsq00
,felec
));
277 /* Analytical LJ-PME */
278 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
279 ewcljrsq
= _mm_mul_pd(ewclj2
,rsq00
);
280 ewclj6
= _mm_mul_pd(ewclj2
,_mm_mul_pd(ewclj2
,ewclj2
));
281 exponent
= gmx_simd_exp_d(ewcljrsq
);
282 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
283 poly
= _mm_mul_pd(exponent
,_mm_add_pd(_mm_sub_pd(one
,ewcljrsq
),_mm_mul_pd(_mm_mul_pd(ewcljrsq
,ewcljrsq
),one_half
)));
284 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
285 vvdw6
= _mm_mul_pd(_mm_sub_pd(c6_00
,_mm_mul_pd(c6grid_00
,_mm_sub_pd(one
,poly
))),rinvsix
);
286 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
287 vvdw
= _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12
, _mm_mul_pd(c12_00
,_mm_mul_pd(sh_vdw_invrcut6
,sh_vdw_invrcut6
))),one_twelfth
),
288 _mm_mul_pd( _mm_sub_pd(vvdw6
,_mm_add_pd(_mm_mul_pd(c6_00
,sh_vdw_invrcut6
),_mm_mul_pd(c6grid_00
,sh_lj_ewald
))),one_sixth
));
289 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
290 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,_mm_sub_pd(vvdw6
,_mm_mul_pd(_mm_mul_pd(c6grid_00
,one_sixth
),_mm_mul_pd(exponent
,ewclj6
)))),rinvsq00
);
292 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
294 /* Update potential sum for this i atom from the interaction with this j atom. */
295 velec
= _mm_and_pd(velec
,cutoff_mask
);
296 velecsum
= _mm_add_pd(velecsum
,velec
);
297 vvdw
= _mm_and_pd(vvdw
,cutoff_mask
);
298 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
300 fscal
= _mm_add_pd(felec
,fvdw
);
302 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
304 /* Calculate temporary vectorial force */
305 tx
= _mm_mul_pd(fscal
,dx00
);
306 ty
= _mm_mul_pd(fscal
,dy00
);
307 tz
= _mm_mul_pd(fscal
,dz00
);
309 /* Update vectorial force */
310 fix0
= _mm_add_pd(fix0
,tx
);
311 fiy0
= _mm_add_pd(fiy0
,ty
);
312 fiz0
= _mm_add_pd(fiz0
,tz
);
314 fjx0
= _mm_add_pd(fjx0
,tx
);
315 fjy0
= _mm_add_pd(fjy0
,ty
);
316 fjz0
= _mm_add_pd(fjz0
,tz
);
320 /**************************
321 * CALCULATE INTERACTIONS *
322 **************************/
324 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
327 r10
= _mm_mul_pd(rsq10
,rinv10
);
329 /* Compute parameters for interactions between i and j atoms */
330 qq10
= _mm_mul_pd(iq1
,jq0
);
332 /* EWALD ELECTROSTATICS */
334 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
335 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
336 ewitab
= _mm_cvttpd_epi32(ewrt
);
337 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
338 ewitab
= _mm_slli_epi32(ewitab
,2);
339 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
340 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
341 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
342 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
343 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
344 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
345 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
346 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
347 velec
= _mm_mul_pd(qq10
,_mm_sub_pd(_mm_sub_pd(rinv10
,sh_ewald
),velec
));
348 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
350 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
352 /* Update potential sum for this i atom from the interaction with this j atom. */
353 velec
= _mm_and_pd(velec
,cutoff_mask
);
354 velecsum
= _mm_add_pd(velecsum
,velec
);
358 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
360 /* Calculate temporary vectorial force */
361 tx
= _mm_mul_pd(fscal
,dx10
);
362 ty
= _mm_mul_pd(fscal
,dy10
);
363 tz
= _mm_mul_pd(fscal
,dz10
);
365 /* Update vectorial force */
366 fix1
= _mm_add_pd(fix1
,tx
);
367 fiy1
= _mm_add_pd(fiy1
,ty
);
368 fiz1
= _mm_add_pd(fiz1
,tz
);
370 fjx0
= _mm_add_pd(fjx0
,tx
);
371 fjy0
= _mm_add_pd(fjy0
,ty
);
372 fjz0
= _mm_add_pd(fjz0
,tz
);
376 /**************************
377 * CALCULATE INTERACTIONS *
378 **************************/
380 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
383 r20
= _mm_mul_pd(rsq20
,rinv20
);
385 /* Compute parameters for interactions between i and j atoms */
386 qq20
= _mm_mul_pd(iq2
,jq0
);
388 /* EWALD ELECTROSTATICS */
390 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
391 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
392 ewitab
= _mm_cvttpd_epi32(ewrt
);
393 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
394 ewitab
= _mm_slli_epi32(ewitab
,2);
395 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
396 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
397 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
398 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
399 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
400 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
401 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
402 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
403 velec
= _mm_mul_pd(qq20
,_mm_sub_pd(_mm_sub_pd(rinv20
,sh_ewald
),velec
));
404 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
406 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
408 /* Update potential sum for this i atom from the interaction with this j atom. */
409 velec
= _mm_and_pd(velec
,cutoff_mask
);
410 velecsum
= _mm_add_pd(velecsum
,velec
);
414 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
416 /* Calculate temporary vectorial force */
417 tx
= _mm_mul_pd(fscal
,dx20
);
418 ty
= _mm_mul_pd(fscal
,dy20
);
419 tz
= _mm_mul_pd(fscal
,dz20
);
421 /* Update vectorial force */
422 fix2
= _mm_add_pd(fix2
,tx
);
423 fiy2
= _mm_add_pd(fiy2
,ty
);
424 fiz2
= _mm_add_pd(fiz2
,tz
);
426 fjx0
= _mm_add_pd(fjx0
,tx
);
427 fjy0
= _mm_add_pd(fjy0
,ty
);
428 fjz0
= _mm_add_pd(fjz0
,tz
);
432 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
);
434 /* Inner loop uses 177 flops */
441 j_coord_offsetA
= DIM
*jnrA
;
443 /* load j atom coordinates */
444 gmx_mm_load_1rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
447 /* Calculate displacement vector */
448 dx00
= _mm_sub_pd(ix0
,jx0
);
449 dy00
= _mm_sub_pd(iy0
,jy0
);
450 dz00
= _mm_sub_pd(iz0
,jz0
);
451 dx10
= _mm_sub_pd(ix1
,jx0
);
452 dy10
= _mm_sub_pd(iy1
,jy0
);
453 dz10
= _mm_sub_pd(iz1
,jz0
);
454 dx20
= _mm_sub_pd(ix2
,jx0
);
455 dy20
= _mm_sub_pd(iy2
,jy0
);
456 dz20
= _mm_sub_pd(iz2
,jz0
);
458 /* Calculate squared distance and things based on it */
459 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
460 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
461 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
463 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
464 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
465 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
467 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
468 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
469 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
471 /* Load parameters for j particles */
472 jq0
= _mm_load_sd(charge
+jnrA
+0);
473 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
475 fjx0
= _mm_setzero_pd();
476 fjy0
= _mm_setzero_pd();
477 fjz0
= _mm_setzero_pd();
479 /**************************
480 * CALCULATE INTERACTIONS *
481 **************************/
483 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
486 r00
= _mm_mul_pd(rsq00
,rinv00
);
488 /* Compute parameters for interactions between i and j atoms */
489 qq00
= _mm_mul_pd(iq0
,jq0
);
490 gmx_mm_load_1pair_swizzle_pd(vdwparam
+vdwioffset0
+vdwjidx0A
,&c6_00
,&c12_00
);
492 c6grid_00
= gmx_mm_load_1real_pd(vdwgridparam
+vdwioffset0
+vdwjidx0A
);
494 /* EWALD ELECTROSTATICS */
496 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
497 ewrt
= _mm_mul_pd(r00
,ewtabscale
);
498 ewitab
= _mm_cvttpd_epi32(ewrt
);
499 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
500 ewitab
= _mm_slli_epi32(ewitab
,2);
501 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
502 ewtabD
= _mm_setzero_pd();
503 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
504 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
505 ewtabFn
= _mm_setzero_pd();
506 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
507 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
508 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
509 velec
= _mm_mul_pd(qq00
,_mm_sub_pd(_mm_sub_pd(rinv00
,sh_ewald
),velec
));
510 felec
= _mm_mul_pd(_mm_mul_pd(qq00
,rinv00
),_mm_sub_pd(rinvsq00
,felec
));
512 /* Analytical LJ-PME */
513 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
514 ewcljrsq
= _mm_mul_pd(ewclj2
,rsq00
);
515 ewclj6
= _mm_mul_pd(ewclj2
,_mm_mul_pd(ewclj2
,ewclj2
));
516 exponent
= gmx_simd_exp_d(ewcljrsq
);
517 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
518 poly
= _mm_mul_pd(exponent
,_mm_add_pd(_mm_sub_pd(one
,ewcljrsq
),_mm_mul_pd(_mm_mul_pd(ewcljrsq
,ewcljrsq
),one_half
)));
519 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
520 vvdw6
= _mm_mul_pd(_mm_sub_pd(c6_00
,_mm_mul_pd(c6grid_00
,_mm_sub_pd(one
,poly
))),rinvsix
);
521 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
522 vvdw
= _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12
, _mm_mul_pd(c12_00
,_mm_mul_pd(sh_vdw_invrcut6
,sh_vdw_invrcut6
))),one_twelfth
),
523 _mm_mul_pd( _mm_sub_pd(vvdw6
,_mm_add_pd(_mm_mul_pd(c6_00
,sh_vdw_invrcut6
),_mm_mul_pd(c6grid_00
,sh_lj_ewald
))),one_sixth
));
524 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
525 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,_mm_sub_pd(vvdw6
,_mm_mul_pd(_mm_mul_pd(c6grid_00
,one_sixth
),_mm_mul_pd(exponent
,ewclj6
)))),rinvsq00
);
527 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
529 /* Update potential sum for this i atom from the interaction with this j atom. */
530 velec
= _mm_and_pd(velec
,cutoff_mask
);
531 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
532 velecsum
= _mm_add_pd(velecsum
,velec
);
533 vvdw
= _mm_and_pd(vvdw
,cutoff_mask
);
534 vvdw
= _mm_unpacklo_pd(vvdw
,_mm_setzero_pd());
535 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
537 fscal
= _mm_add_pd(felec
,fvdw
);
539 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
541 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
543 /* Calculate temporary vectorial force */
544 tx
= _mm_mul_pd(fscal
,dx00
);
545 ty
= _mm_mul_pd(fscal
,dy00
);
546 tz
= _mm_mul_pd(fscal
,dz00
);
548 /* Update vectorial force */
549 fix0
= _mm_add_pd(fix0
,tx
);
550 fiy0
= _mm_add_pd(fiy0
,ty
);
551 fiz0
= _mm_add_pd(fiz0
,tz
);
553 fjx0
= _mm_add_pd(fjx0
,tx
);
554 fjy0
= _mm_add_pd(fjy0
,ty
);
555 fjz0
= _mm_add_pd(fjz0
,tz
);
559 /**************************
560 * CALCULATE INTERACTIONS *
561 **************************/
563 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
566 r10
= _mm_mul_pd(rsq10
,rinv10
);
568 /* Compute parameters for interactions between i and j atoms */
569 qq10
= _mm_mul_pd(iq1
,jq0
);
571 /* EWALD ELECTROSTATICS */
573 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
574 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
575 ewitab
= _mm_cvttpd_epi32(ewrt
);
576 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
577 ewitab
= _mm_slli_epi32(ewitab
,2);
578 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
579 ewtabD
= _mm_setzero_pd();
580 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
581 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
582 ewtabFn
= _mm_setzero_pd();
583 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
584 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
585 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
586 velec
= _mm_mul_pd(qq10
,_mm_sub_pd(_mm_sub_pd(rinv10
,sh_ewald
),velec
));
587 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
589 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
591 /* Update potential sum for this i atom from the interaction with this j atom. */
592 velec
= _mm_and_pd(velec
,cutoff_mask
);
593 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
594 velecsum
= _mm_add_pd(velecsum
,velec
);
598 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
600 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
602 /* Calculate temporary vectorial force */
603 tx
= _mm_mul_pd(fscal
,dx10
);
604 ty
= _mm_mul_pd(fscal
,dy10
);
605 tz
= _mm_mul_pd(fscal
,dz10
);
607 /* Update vectorial force */
608 fix1
= _mm_add_pd(fix1
,tx
);
609 fiy1
= _mm_add_pd(fiy1
,ty
);
610 fiz1
= _mm_add_pd(fiz1
,tz
);
612 fjx0
= _mm_add_pd(fjx0
,tx
);
613 fjy0
= _mm_add_pd(fjy0
,ty
);
614 fjz0
= _mm_add_pd(fjz0
,tz
);
618 /**************************
619 * CALCULATE INTERACTIONS *
620 **************************/
622 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
625 r20
= _mm_mul_pd(rsq20
,rinv20
);
627 /* Compute parameters for interactions between i and j atoms */
628 qq20
= _mm_mul_pd(iq2
,jq0
);
630 /* EWALD ELECTROSTATICS */
632 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
633 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
634 ewitab
= _mm_cvttpd_epi32(ewrt
);
635 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
636 ewitab
= _mm_slli_epi32(ewitab
,2);
637 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
638 ewtabD
= _mm_setzero_pd();
639 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
640 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
641 ewtabFn
= _mm_setzero_pd();
642 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
643 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
644 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
645 velec
= _mm_mul_pd(qq20
,_mm_sub_pd(_mm_sub_pd(rinv20
,sh_ewald
),velec
));
646 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
648 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
650 /* Update potential sum for this i atom from the interaction with this j atom. */
651 velec
= _mm_and_pd(velec
,cutoff_mask
);
652 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
653 velecsum
= _mm_add_pd(velecsum
,velec
);
657 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
659 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
661 /* Calculate temporary vectorial force */
662 tx
= _mm_mul_pd(fscal
,dx20
);
663 ty
= _mm_mul_pd(fscal
,dy20
);
664 tz
= _mm_mul_pd(fscal
,dz20
);
666 /* Update vectorial force */
667 fix2
= _mm_add_pd(fix2
,tx
);
668 fiy2
= _mm_add_pd(fiy2
,ty
);
669 fiz2
= _mm_add_pd(fiz2
,tz
);
671 fjx0
= _mm_add_pd(fjx0
,tx
);
672 fjy0
= _mm_add_pd(fjy0
,ty
);
673 fjz0
= _mm_add_pd(fjz0
,tz
);
677 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
);
679 /* Inner loop uses 177 flops */
682 /* End of innermost loop */
684 gmx_mm_update_iforce_3atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
685 f
+i_coord_offset
,fshift
+i_shift_offset
);
688 /* Update potential energies */
689 gmx_mm_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
690 gmx_mm_update_1pot_pd(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
692 /* Increment number of inner iterations */
693 inneriter
+= j_index_end
- j_index_start
;
695 /* Outer loop uses 20 flops */
698 /* Increment number of outer iterations */
701 /* Update outer/inner flops */
703 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3_VF
,outeriter
*20 + inneriter
*177);
706 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_sse2_double
707 * Electrostatics interaction: Ewald
708 * VdW interaction: LJEwald
709 * Geometry: Water3-Particle
710 * Calculate force/pot: Force
713 nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_sse2_double
714 (t_nblist
* gmx_restrict nlist
,
715 rvec
* gmx_restrict xx
,
716 rvec
* gmx_restrict ff
,
717 t_forcerec
* gmx_restrict fr
,
718 t_mdatoms
* gmx_restrict mdatoms
,
719 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
720 t_nrnb
* gmx_restrict nrnb
)
722 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
723 * just 0 for non-waters.
724 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
725 * jnr indices corresponding to data put in the four positions in the SIMD register.
727 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
728 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
730 int j_coord_offsetA
,j_coord_offsetB
;
731 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
733 real
*shiftvec
,*fshift
,*x
,*f
;
734 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
736 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
738 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
740 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
741 int vdwjidx0A
,vdwjidx0B
;
742 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
743 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
744 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
745 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
746 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
749 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
752 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
753 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
757 __m128d ewclj
,ewclj2
,ewclj6
,ewcljrsq
,poly
,exponent
,f6A
,f6B
,sh_lj_ewald
;
759 __m128d one_half
= _mm_set1_pd(0.5);
760 __m128d minus_one
= _mm_set1_pd(-1.0);
762 __m128d ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
764 __m128d dummy_mask
,cutoff_mask
;
765 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
766 __m128d one
= _mm_set1_pd(1.0);
767 __m128d two
= _mm_set1_pd(2.0);
773 jindex
= nlist
->jindex
;
775 shiftidx
= nlist
->shift
;
777 shiftvec
= fr
->shift_vec
[0];
778 fshift
= fr
->fshift
[0];
779 facel
= _mm_set1_pd(fr
->epsfac
);
780 charge
= mdatoms
->chargeA
;
781 nvdwtype
= fr
->ntype
;
783 vdwtype
= mdatoms
->typeA
;
784 vdwgridparam
= fr
->ljpme_c6grid
;
785 sh_lj_ewald
= _mm_set1_pd(fr
->ic
->sh_lj_ewald
);
786 ewclj
= _mm_set1_pd(fr
->ewaldcoeff_lj
);
787 ewclj2
= _mm_mul_pd(minus_one
,_mm_mul_pd(ewclj
,ewclj
));
789 sh_ewald
= _mm_set1_pd(fr
->ic
->sh_ewald
);
790 ewtab
= fr
->ic
->tabq_coul_F
;
791 ewtabscale
= _mm_set1_pd(fr
->ic
->tabq_scale
);
792 ewtabhalfspace
= _mm_set1_pd(0.5/fr
->ic
->tabq_scale
);
794 /* Setup water-specific parameters */
795 inr
= nlist
->iinr
[0];
796 iq0
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+0]));
797 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
798 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
799 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
801 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
802 rcutoff_scalar
= fr
->rcoulomb
;
803 rcutoff
= _mm_set1_pd(rcutoff_scalar
);
804 rcutoff2
= _mm_mul_pd(rcutoff
,rcutoff
);
806 sh_vdw_invrcut6
= _mm_set1_pd(fr
->ic
->sh_invrc6
);
807 rvdw
= _mm_set1_pd(fr
->rvdw
);
809 /* Avoid stupid compiler warnings */
817 /* Start outer loop over neighborlists */
818 for(iidx
=0; iidx
<nri
; iidx
++)
820 /* Load shift vector for this list */
821 i_shift_offset
= DIM
*shiftidx
[iidx
];
823 /* Load limits for loop over neighbors */
824 j_index_start
= jindex
[iidx
];
825 j_index_end
= jindex
[iidx
+1];
827 /* Get outer coordinate index */
829 i_coord_offset
= DIM
*inr
;
831 /* Load i particle coords and add shift vector */
832 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
833 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
835 fix0
= _mm_setzero_pd();
836 fiy0
= _mm_setzero_pd();
837 fiz0
= _mm_setzero_pd();
838 fix1
= _mm_setzero_pd();
839 fiy1
= _mm_setzero_pd();
840 fiz1
= _mm_setzero_pd();
841 fix2
= _mm_setzero_pd();
842 fiy2
= _mm_setzero_pd();
843 fiz2
= _mm_setzero_pd();
845 /* Start inner kernel loop */
846 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
849 /* Get j neighbor index, and coordinate index */
852 j_coord_offsetA
= DIM
*jnrA
;
853 j_coord_offsetB
= DIM
*jnrB
;
855 /* load j atom coordinates */
856 gmx_mm_load_1rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
859 /* Calculate displacement vector */
860 dx00
= _mm_sub_pd(ix0
,jx0
);
861 dy00
= _mm_sub_pd(iy0
,jy0
);
862 dz00
= _mm_sub_pd(iz0
,jz0
);
863 dx10
= _mm_sub_pd(ix1
,jx0
);
864 dy10
= _mm_sub_pd(iy1
,jy0
);
865 dz10
= _mm_sub_pd(iz1
,jz0
);
866 dx20
= _mm_sub_pd(ix2
,jx0
);
867 dy20
= _mm_sub_pd(iy2
,jy0
);
868 dz20
= _mm_sub_pd(iz2
,jz0
);
870 /* Calculate squared distance and things based on it */
871 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
872 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
873 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
875 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
876 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
877 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
879 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
880 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
881 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
883 /* Load parameters for j particles */
884 jq0
= gmx_mm_load_2real_swizzle_pd(charge
+jnrA
+0,charge
+jnrB
+0);
885 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
886 vdwjidx0B
= 2*vdwtype
[jnrB
+0];
888 fjx0
= _mm_setzero_pd();
889 fjy0
= _mm_setzero_pd();
890 fjz0
= _mm_setzero_pd();
892 /**************************
893 * CALCULATE INTERACTIONS *
894 **************************/
896 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
899 r00
= _mm_mul_pd(rsq00
,rinv00
);
901 /* Compute parameters for interactions between i and j atoms */
902 qq00
= _mm_mul_pd(iq0
,jq0
);
903 gmx_mm_load_2pair_swizzle_pd(vdwparam
+vdwioffset0
+vdwjidx0A
,
904 vdwparam
+vdwioffset0
+vdwjidx0B
,&c6_00
,&c12_00
);
906 c6grid_00
= gmx_mm_load_2real_swizzle_pd(vdwgridparam
+vdwioffset0
+vdwjidx0A
,
907 vdwgridparam
+vdwioffset0
+vdwjidx0B
);
909 /* EWALD ELECTROSTATICS */
911 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
912 ewrt
= _mm_mul_pd(r00
,ewtabscale
);
913 ewitab
= _mm_cvttpd_epi32(ewrt
);
914 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
915 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
917 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
918 felec
= _mm_mul_pd(_mm_mul_pd(qq00
,rinv00
),_mm_sub_pd(rinvsq00
,felec
));
920 /* Analytical LJ-PME */
921 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
922 ewcljrsq
= _mm_mul_pd(ewclj2
,rsq00
);
923 ewclj6
= _mm_mul_pd(ewclj2
,_mm_mul_pd(ewclj2
,ewclj2
));
924 exponent
= gmx_simd_exp_d(ewcljrsq
);
925 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
926 poly
= _mm_mul_pd(exponent
,_mm_add_pd(_mm_sub_pd(one
,ewcljrsq
),_mm_mul_pd(_mm_mul_pd(ewcljrsq
,ewcljrsq
),one_half
)));
927 /* f6A = 6 * C6grid * (1 - poly) */
928 f6A
= _mm_mul_pd(c6grid_00
,_mm_sub_pd(one
,poly
));
929 /* f6B = C6grid * exponent * beta^6 */
930 f6B
= _mm_mul_pd(_mm_mul_pd(c6grid_00
,one_sixth
),_mm_mul_pd(exponent
,ewclj6
));
931 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
932 fvdw
= _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00
,rinvsix
),_mm_sub_pd(c6_00
,f6A
)),rinvsix
),f6B
),rinvsq00
);
934 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
936 fscal
= _mm_add_pd(felec
,fvdw
);
938 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
940 /* Calculate temporary vectorial force */
941 tx
= _mm_mul_pd(fscal
,dx00
);
942 ty
= _mm_mul_pd(fscal
,dy00
);
943 tz
= _mm_mul_pd(fscal
,dz00
);
945 /* Update vectorial force */
946 fix0
= _mm_add_pd(fix0
,tx
);
947 fiy0
= _mm_add_pd(fiy0
,ty
);
948 fiz0
= _mm_add_pd(fiz0
,tz
);
950 fjx0
= _mm_add_pd(fjx0
,tx
);
951 fjy0
= _mm_add_pd(fjy0
,ty
);
952 fjz0
= _mm_add_pd(fjz0
,tz
);
956 /**************************
957 * CALCULATE INTERACTIONS *
958 **************************/
960 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
963 r10
= _mm_mul_pd(rsq10
,rinv10
);
965 /* Compute parameters for interactions between i and j atoms */
966 qq10
= _mm_mul_pd(iq1
,jq0
);
968 /* EWALD ELECTROSTATICS */
970 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
971 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
972 ewitab
= _mm_cvttpd_epi32(ewrt
);
973 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
974 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
976 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
977 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
979 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
983 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
985 /* Calculate temporary vectorial force */
986 tx
= _mm_mul_pd(fscal
,dx10
);
987 ty
= _mm_mul_pd(fscal
,dy10
);
988 tz
= _mm_mul_pd(fscal
,dz10
);
990 /* Update vectorial force */
991 fix1
= _mm_add_pd(fix1
,tx
);
992 fiy1
= _mm_add_pd(fiy1
,ty
);
993 fiz1
= _mm_add_pd(fiz1
,tz
);
995 fjx0
= _mm_add_pd(fjx0
,tx
);
996 fjy0
= _mm_add_pd(fjy0
,ty
);
997 fjz0
= _mm_add_pd(fjz0
,tz
);
1001 /**************************
1002 * CALCULATE INTERACTIONS *
1003 **************************/
1005 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
1008 r20
= _mm_mul_pd(rsq20
,rinv20
);
1010 /* Compute parameters for interactions between i and j atoms */
1011 qq20
= _mm_mul_pd(iq2
,jq0
);
1013 /* EWALD ELECTROSTATICS */
1015 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1016 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
1017 ewitab
= _mm_cvttpd_epi32(ewrt
);
1018 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1019 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1021 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1022 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
1024 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
1028 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1030 /* Calculate temporary vectorial force */
1031 tx
= _mm_mul_pd(fscal
,dx20
);
1032 ty
= _mm_mul_pd(fscal
,dy20
);
1033 tz
= _mm_mul_pd(fscal
,dz20
);
1035 /* Update vectorial force */
1036 fix2
= _mm_add_pd(fix2
,tx
);
1037 fiy2
= _mm_add_pd(fiy2
,ty
);
1038 fiz2
= _mm_add_pd(fiz2
,tz
);
1040 fjx0
= _mm_add_pd(fjx0
,tx
);
1041 fjy0
= _mm_add_pd(fjy0
,ty
);
1042 fjz0
= _mm_add_pd(fjz0
,tz
);
1046 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
);
1048 /* Inner loop uses 143 flops */
1051 if(jidx
<j_index_end
)
1055 j_coord_offsetA
= DIM
*jnrA
;
1057 /* load j atom coordinates */
1058 gmx_mm_load_1rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
1061 /* Calculate displacement vector */
1062 dx00
= _mm_sub_pd(ix0
,jx0
);
1063 dy00
= _mm_sub_pd(iy0
,jy0
);
1064 dz00
= _mm_sub_pd(iz0
,jz0
);
1065 dx10
= _mm_sub_pd(ix1
,jx0
);
1066 dy10
= _mm_sub_pd(iy1
,jy0
);
1067 dz10
= _mm_sub_pd(iz1
,jz0
);
1068 dx20
= _mm_sub_pd(ix2
,jx0
);
1069 dy20
= _mm_sub_pd(iy2
,jy0
);
1070 dz20
= _mm_sub_pd(iz2
,jz0
);
1072 /* Calculate squared distance and things based on it */
1073 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
1074 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
1075 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
1077 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
1078 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
1079 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
1081 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
1082 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
1083 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
1085 /* Load parameters for j particles */
1086 jq0
= _mm_load_sd(charge
+jnrA
+0);
1087 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
1089 fjx0
= _mm_setzero_pd();
1090 fjy0
= _mm_setzero_pd();
1091 fjz0
= _mm_setzero_pd();
1093 /**************************
1094 * CALCULATE INTERACTIONS *
1095 **************************/
1097 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
1100 r00
= _mm_mul_pd(rsq00
,rinv00
);
1102 /* Compute parameters for interactions between i and j atoms */
1103 qq00
= _mm_mul_pd(iq0
,jq0
);
1104 gmx_mm_load_1pair_swizzle_pd(vdwparam
+vdwioffset0
+vdwjidx0A
,&c6_00
,&c12_00
);
1106 c6grid_00
= gmx_mm_load_1real_pd(vdwgridparam
+vdwioffset0
+vdwjidx0A
);
1108 /* EWALD ELECTROSTATICS */
1110 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1111 ewrt
= _mm_mul_pd(r00
,ewtabscale
);
1112 ewitab
= _mm_cvttpd_epi32(ewrt
);
1113 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1114 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
1115 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1116 felec
= _mm_mul_pd(_mm_mul_pd(qq00
,rinv00
),_mm_sub_pd(rinvsq00
,felec
));
1118 /* Analytical LJ-PME */
1119 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
1120 ewcljrsq
= _mm_mul_pd(ewclj2
,rsq00
);
1121 ewclj6
= _mm_mul_pd(ewclj2
,_mm_mul_pd(ewclj2
,ewclj2
));
1122 exponent
= gmx_simd_exp_d(ewcljrsq
);
1123 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1124 poly
= _mm_mul_pd(exponent
,_mm_add_pd(_mm_sub_pd(one
,ewcljrsq
),_mm_mul_pd(_mm_mul_pd(ewcljrsq
,ewcljrsq
),one_half
)));
1125 /* f6A = 6 * C6grid * (1 - poly) */
1126 f6A
= _mm_mul_pd(c6grid_00
,_mm_sub_pd(one
,poly
));
1127 /* f6B = C6grid * exponent * beta^6 */
1128 f6B
= _mm_mul_pd(_mm_mul_pd(c6grid_00
,one_sixth
),_mm_mul_pd(exponent
,ewclj6
));
1129 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1130 fvdw
= _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00
,rinvsix
),_mm_sub_pd(c6_00
,f6A
)),rinvsix
),f6B
),rinvsq00
);
1132 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
1134 fscal
= _mm_add_pd(felec
,fvdw
);
1136 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1138 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1140 /* Calculate temporary vectorial force */
1141 tx
= _mm_mul_pd(fscal
,dx00
);
1142 ty
= _mm_mul_pd(fscal
,dy00
);
1143 tz
= _mm_mul_pd(fscal
,dz00
);
1145 /* Update vectorial force */
1146 fix0
= _mm_add_pd(fix0
,tx
);
1147 fiy0
= _mm_add_pd(fiy0
,ty
);
1148 fiz0
= _mm_add_pd(fiz0
,tz
);
1150 fjx0
= _mm_add_pd(fjx0
,tx
);
1151 fjy0
= _mm_add_pd(fjy0
,ty
);
1152 fjz0
= _mm_add_pd(fjz0
,tz
);
1156 /**************************
1157 * CALCULATE INTERACTIONS *
1158 **************************/
1160 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
1163 r10
= _mm_mul_pd(rsq10
,rinv10
);
1165 /* Compute parameters for interactions between i and j atoms */
1166 qq10
= _mm_mul_pd(iq1
,jq0
);
1168 /* EWALD ELECTROSTATICS */
1170 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1171 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
1172 ewitab
= _mm_cvttpd_epi32(ewrt
);
1173 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1174 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
1175 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1176 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
1178 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
1182 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1184 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1186 /* Calculate temporary vectorial force */
1187 tx
= _mm_mul_pd(fscal
,dx10
);
1188 ty
= _mm_mul_pd(fscal
,dy10
);
1189 tz
= _mm_mul_pd(fscal
,dz10
);
1191 /* Update vectorial force */
1192 fix1
= _mm_add_pd(fix1
,tx
);
1193 fiy1
= _mm_add_pd(fiy1
,ty
);
1194 fiz1
= _mm_add_pd(fiz1
,tz
);
1196 fjx0
= _mm_add_pd(fjx0
,tx
);
1197 fjy0
= _mm_add_pd(fjy0
,ty
);
1198 fjz0
= _mm_add_pd(fjz0
,tz
);
1202 /**************************
1203 * CALCULATE INTERACTIONS *
1204 **************************/
1206 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
1209 r20
= _mm_mul_pd(rsq20
,rinv20
);
1211 /* Compute parameters for interactions between i and j atoms */
1212 qq20
= _mm_mul_pd(iq2
,jq0
);
1214 /* EWALD ELECTROSTATICS */
1216 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1217 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
1218 ewitab
= _mm_cvttpd_epi32(ewrt
);
1219 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1220 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
1221 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1222 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
1224 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
1228 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1230 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1232 /* Calculate temporary vectorial force */
1233 tx
= _mm_mul_pd(fscal
,dx20
);
1234 ty
= _mm_mul_pd(fscal
,dy20
);
1235 tz
= _mm_mul_pd(fscal
,dz20
);
1237 /* Update vectorial force */
1238 fix2
= _mm_add_pd(fix2
,tx
);
1239 fiy2
= _mm_add_pd(fiy2
,ty
);
1240 fiz2
= _mm_add_pd(fiz2
,tz
);
1242 fjx0
= _mm_add_pd(fjx0
,tx
);
1243 fjy0
= _mm_add_pd(fjy0
,ty
);
1244 fjz0
= _mm_add_pd(fjz0
,tz
);
1248 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
);
1250 /* Inner loop uses 143 flops */
1253 /* End of innermost loop */
1255 gmx_mm_update_iforce_3atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
1256 f
+i_coord_offset
,fshift
+i_shift_offset
);
1258 /* Increment number of inner iterations */
1259 inneriter
+= j_index_end
- j_index_start
;
1261 /* Outer loop uses 18 flops */
1264 /* Increment number of outer iterations */
1267 /* Update outer/inner flops */
1269 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3_F
,outeriter
*18 + inneriter
*143);