2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/math/vec.h"
46 #include "gromacs/legacyheaders/nrnb.h"
48 #include "gromacs/simd/math_x86_sse2_double.h"
49 #include "kernelutil_x86_sse2_double.h"
52 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse2_double
53 * Electrostatics interaction: Ewald
54 * VdW interaction: LennardJones
55 * Geometry: Water4-Particle
56 * Calculate force/pot: PotentialAndForce
59 nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse2_double
60 (t_nblist
* gmx_restrict nlist
,
61 rvec
* gmx_restrict xx
,
62 rvec
* gmx_restrict ff
,
63 t_forcerec
* gmx_restrict fr
,
64 t_mdatoms
* gmx_restrict mdatoms
,
65 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
66 t_nrnb
* gmx_restrict nrnb
)
68 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
69 * just 0 for non-waters.
70 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
71 * jnr indices corresponding to data put in the four positions in the SIMD register.
73 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
74 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
76 int j_coord_offsetA
,j_coord_offsetB
;
77 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
79 real
*shiftvec
,*fshift
,*x
,*f
;
80 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
82 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
84 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
86 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
88 __m128d ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
89 int vdwjidx0A
,vdwjidx0B
;
90 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
91 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
92 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
93 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
94 __m128d dx30
,dy30
,dz30
,rsq30
,rinv30
,rinvsq30
,r30
,qq30
,c6_30
,c12_30
;
95 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
98 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
101 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
102 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
104 __m128d ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
106 __m128d rswitch
,swV3
,swV4
,swV5
,swF2
,swF3
,swF4
,d
,d2
,sw
,dsw
;
107 real rswitch_scalar
,d_scalar
;
108 __m128d dummy_mask
,cutoff_mask
;
109 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
110 __m128d one
= _mm_set1_pd(1.0);
111 __m128d two
= _mm_set1_pd(2.0);
117 jindex
= nlist
->jindex
;
119 shiftidx
= nlist
->shift
;
121 shiftvec
= fr
->shift_vec
[0];
122 fshift
= fr
->fshift
[0];
123 facel
= _mm_set1_pd(fr
->epsfac
);
124 charge
= mdatoms
->chargeA
;
125 nvdwtype
= fr
->ntype
;
127 vdwtype
= mdatoms
->typeA
;
129 sh_ewald
= _mm_set1_pd(fr
->ic
->sh_ewald
);
130 ewtab
= fr
->ic
->tabq_coul_FDV0
;
131 ewtabscale
= _mm_set1_pd(fr
->ic
->tabq_scale
);
132 ewtabhalfspace
= _mm_set1_pd(0.5/fr
->ic
->tabq_scale
);
134 /* Setup water-specific parameters */
135 inr
= nlist
->iinr
[0];
136 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
137 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
138 iq3
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+3]));
139 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
141 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
142 rcutoff_scalar
= fr
->rcoulomb
;
143 rcutoff
= _mm_set1_pd(rcutoff_scalar
);
144 rcutoff2
= _mm_mul_pd(rcutoff
,rcutoff
);
146 rswitch_scalar
= fr
->rcoulomb_switch
;
147 rswitch
= _mm_set1_pd(rswitch_scalar
);
148 /* Setup switch parameters */
149 d_scalar
= rcutoff_scalar
-rswitch_scalar
;
150 d
= _mm_set1_pd(d_scalar
);
151 swV3
= _mm_set1_pd(-10.0/(d_scalar
*d_scalar
*d_scalar
));
152 swV4
= _mm_set1_pd( 15.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
));
153 swV5
= _mm_set1_pd( -6.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
*d_scalar
));
154 swF2
= _mm_set1_pd(-30.0/(d_scalar
*d_scalar
*d_scalar
));
155 swF3
= _mm_set1_pd( 60.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
));
156 swF4
= _mm_set1_pd(-30.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
*d_scalar
));
158 /* Avoid stupid compiler warnings */
166 /* Start outer loop over neighborlists */
167 for(iidx
=0; iidx
<nri
; iidx
++)
169 /* Load shift vector for this list */
170 i_shift_offset
= DIM
*shiftidx
[iidx
];
172 /* Load limits for loop over neighbors */
173 j_index_start
= jindex
[iidx
];
174 j_index_end
= jindex
[iidx
+1];
176 /* Get outer coordinate index */
178 i_coord_offset
= DIM
*inr
;
180 /* Load i particle coords and add shift vector */
181 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
182 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
184 fix0
= _mm_setzero_pd();
185 fiy0
= _mm_setzero_pd();
186 fiz0
= _mm_setzero_pd();
187 fix1
= _mm_setzero_pd();
188 fiy1
= _mm_setzero_pd();
189 fiz1
= _mm_setzero_pd();
190 fix2
= _mm_setzero_pd();
191 fiy2
= _mm_setzero_pd();
192 fiz2
= _mm_setzero_pd();
193 fix3
= _mm_setzero_pd();
194 fiy3
= _mm_setzero_pd();
195 fiz3
= _mm_setzero_pd();
197 /* Reset potential sums */
198 velecsum
= _mm_setzero_pd();
199 vvdwsum
= _mm_setzero_pd();
201 /* Start inner kernel loop */
202 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
205 /* Get j neighbor index, and coordinate index */
208 j_coord_offsetA
= DIM
*jnrA
;
209 j_coord_offsetB
= DIM
*jnrB
;
211 /* load j atom coordinates */
212 gmx_mm_load_1rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
215 /* Calculate displacement vector */
216 dx00
= _mm_sub_pd(ix0
,jx0
);
217 dy00
= _mm_sub_pd(iy0
,jy0
);
218 dz00
= _mm_sub_pd(iz0
,jz0
);
219 dx10
= _mm_sub_pd(ix1
,jx0
);
220 dy10
= _mm_sub_pd(iy1
,jy0
);
221 dz10
= _mm_sub_pd(iz1
,jz0
);
222 dx20
= _mm_sub_pd(ix2
,jx0
);
223 dy20
= _mm_sub_pd(iy2
,jy0
);
224 dz20
= _mm_sub_pd(iz2
,jz0
);
225 dx30
= _mm_sub_pd(ix3
,jx0
);
226 dy30
= _mm_sub_pd(iy3
,jy0
);
227 dz30
= _mm_sub_pd(iz3
,jz0
);
229 /* Calculate squared distance and things based on it */
230 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
231 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
232 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
233 rsq30
= gmx_mm_calc_rsq_pd(dx30
,dy30
,dz30
);
235 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
236 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
237 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
238 rinv30
= gmx_mm_invsqrt_pd(rsq30
);
240 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
241 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
242 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
243 rinvsq30
= _mm_mul_pd(rinv30
,rinv30
);
245 /* Load parameters for j particles */
246 jq0
= gmx_mm_load_2real_swizzle_pd(charge
+jnrA
+0,charge
+jnrB
+0);
247 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
248 vdwjidx0B
= 2*vdwtype
[jnrB
+0];
250 fjx0
= _mm_setzero_pd();
251 fjy0
= _mm_setzero_pd();
252 fjz0
= _mm_setzero_pd();
254 /**************************
255 * CALCULATE INTERACTIONS *
256 **************************/
258 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
261 r00
= _mm_mul_pd(rsq00
,rinv00
);
263 /* Compute parameters for interactions between i and j atoms */
264 gmx_mm_load_2pair_swizzle_pd(vdwparam
+vdwioffset0
+vdwjidx0A
,
265 vdwparam
+vdwioffset0
+vdwjidx0B
,&c6_00
,&c12_00
);
267 /* LENNARD-JONES DISPERSION/REPULSION */
269 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
270 vvdw6
= _mm_mul_pd(c6_00
,rinvsix
);
271 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
272 vvdw
= _mm_sub_pd( _mm_mul_pd(vvdw12
,one_twelfth
) , _mm_mul_pd(vvdw6
,one_sixth
) );
273 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
275 d
= _mm_sub_pd(r00
,rswitch
);
276 d
= _mm_max_pd(d
,_mm_setzero_pd());
277 d2
= _mm_mul_pd(d
,d
);
278 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
280 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
282 /* Evaluate switch function */
283 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
284 fvdw
= _mm_sub_pd( _mm_mul_pd(fvdw
,sw
) , _mm_mul_pd(rinv00
,_mm_mul_pd(vvdw
,dsw
)) );
285 vvdw
= _mm_mul_pd(vvdw
,sw
);
286 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
288 /* Update potential sum for this i atom from the interaction with this j atom. */
289 vvdw
= _mm_and_pd(vvdw
,cutoff_mask
);
290 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
294 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
296 /* Calculate temporary vectorial force */
297 tx
= _mm_mul_pd(fscal
,dx00
);
298 ty
= _mm_mul_pd(fscal
,dy00
);
299 tz
= _mm_mul_pd(fscal
,dz00
);
301 /* Update vectorial force */
302 fix0
= _mm_add_pd(fix0
,tx
);
303 fiy0
= _mm_add_pd(fiy0
,ty
);
304 fiz0
= _mm_add_pd(fiz0
,tz
);
306 fjx0
= _mm_add_pd(fjx0
,tx
);
307 fjy0
= _mm_add_pd(fjy0
,ty
);
308 fjz0
= _mm_add_pd(fjz0
,tz
);
312 /**************************
313 * CALCULATE INTERACTIONS *
314 **************************/
316 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
319 r10
= _mm_mul_pd(rsq10
,rinv10
);
321 /* Compute parameters for interactions between i and j atoms */
322 qq10
= _mm_mul_pd(iq1
,jq0
);
324 /* EWALD ELECTROSTATICS */
326 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
327 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
328 ewitab
= _mm_cvttpd_epi32(ewrt
);
329 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
330 ewitab
= _mm_slli_epi32(ewitab
,2);
331 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
332 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
333 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
334 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
335 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
336 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
337 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
338 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
339 velec
= _mm_mul_pd(qq10
,_mm_sub_pd(rinv10
,velec
));
340 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
342 d
= _mm_sub_pd(r10
,rswitch
);
343 d
= _mm_max_pd(d
,_mm_setzero_pd());
344 d2
= _mm_mul_pd(d
,d
);
345 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
347 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
349 /* Evaluate switch function */
350 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
351 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv10
,_mm_mul_pd(velec
,dsw
)) );
352 velec
= _mm_mul_pd(velec
,sw
);
353 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
355 /* Update potential sum for this i atom from the interaction with this j atom. */
356 velec
= _mm_and_pd(velec
,cutoff_mask
);
357 velecsum
= _mm_add_pd(velecsum
,velec
);
361 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
363 /* Calculate temporary vectorial force */
364 tx
= _mm_mul_pd(fscal
,dx10
);
365 ty
= _mm_mul_pd(fscal
,dy10
);
366 tz
= _mm_mul_pd(fscal
,dz10
);
368 /* Update vectorial force */
369 fix1
= _mm_add_pd(fix1
,tx
);
370 fiy1
= _mm_add_pd(fiy1
,ty
);
371 fiz1
= _mm_add_pd(fiz1
,tz
);
373 fjx0
= _mm_add_pd(fjx0
,tx
);
374 fjy0
= _mm_add_pd(fjy0
,ty
);
375 fjz0
= _mm_add_pd(fjz0
,tz
);
379 /**************************
380 * CALCULATE INTERACTIONS *
381 **************************/
383 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
386 r20
= _mm_mul_pd(rsq20
,rinv20
);
388 /* Compute parameters for interactions between i and j atoms */
389 qq20
= _mm_mul_pd(iq2
,jq0
);
391 /* EWALD ELECTROSTATICS */
393 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
394 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
395 ewitab
= _mm_cvttpd_epi32(ewrt
);
396 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
397 ewitab
= _mm_slli_epi32(ewitab
,2);
398 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
399 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
400 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
401 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
402 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
403 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
404 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
405 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
406 velec
= _mm_mul_pd(qq20
,_mm_sub_pd(rinv20
,velec
));
407 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
409 d
= _mm_sub_pd(r20
,rswitch
);
410 d
= _mm_max_pd(d
,_mm_setzero_pd());
411 d2
= _mm_mul_pd(d
,d
);
412 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
414 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
416 /* Evaluate switch function */
417 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
418 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv20
,_mm_mul_pd(velec
,dsw
)) );
419 velec
= _mm_mul_pd(velec
,sw
);
420 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
422 /* Update potential sum for this i atom from the interaction with this j atom. */
423 velec
= _mm_and_pd(velec
,cutoff_mask
);
424 velecsum
= _mm_add_pd(velecsum
,velec
);
428 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
430 /* Calculate temporary vectorial force */
431 tx
= _mm_mul_pd(fscal
,dx20
);
432 ty
= _mm_mul_pd(fscal
,dy20
);
433 tz
= _mm_mul_pd(fscal
,dz20
);
435 /* Update vectorial force */
436 fix2
= _mm_add_pd(fix2
,tx
);
437 fiy2
= _mm_add_pd(fiy2
,ty
);
438 fiz2
= _mm_add_pd(fiz2
,tz
);
440 fjx0
= _mm_add_pd(fjx0
,tx
);
441 fjy0
= _mm_add_pd(fjy0
,ty
);
442 fjz0
= _mm_add_pd(fjz0
,tz
);
446 /**************************
447 * CALCULATE INTERACTIONS *
448 **************************/
450 if (gmx_mm_any_lt(rsq30
,rcutoff2
))
453 r30
= _mm_mul_pd(rsq30
,rinv30
);
455 /* Compute parameters for interactions between i and j atoms */
456 qq30
= _mm_mul_pd(iq3
,jq0
);
458 /* EWALD ELECTROSTATICS */
460 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
461 ewrt
= _mm_mul_pd(r30
,ewtabscale
);
462 ewitab
= _mm_cvttpd_epi32(ewrt
);
463 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
464 ewitab
= _mm_slli_epi32(ewitab
,2);
465 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
466 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
467 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
468 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
469 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
470 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
471 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
472 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
473 velec
= _mm_mul_pd(qq30
,_mm_sub_pd(rinv30
,velec
));
474 felec
= _mm_mul_pd(_mm_mul_pd(qq30
,rinv30
),_mm_sub_pd(rinvsq30
,felec
));
476 d
= _mm_sub_pd(r30
,rswitch
);
477 d
= _mm_max_pd(d
,_mm_setzero_pd());
478 d2
= _mm_mul_pd(d
,d
);
479 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
481 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
483 /* Evaluate switch function */
484 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
485 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv30
,_mm_mul_pd(velec
,dsw
)) );
486 velec
= _mm_mul_pd(velec
,sw
);
487 cutoff_mask
= _mm_cmplt_pd(rsq30
,rcutoff2
);
489 /* Update potential sum for this i atom from the interaction with this j atom. */
490 velec
= _mm_and_pd(velec
,cutoff_mask
);
491 velecsum
= _mm_add_pd(velecsum
,velec
);
495 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
497 /* Calculate temporary vectorial force */
498 tx
= _mm_mul_pd(fscal
,dx30
);
499 ty
= _mm_mul_pd(fscal
,dy30
);
500 tz
= _mm_mul_pd(fscal
,dz30
);
502 /* Update vectorial force */
503 fix3
= _mm_add_pd(fix3
,tx
);
504 fiy3
= _mm_add_pd(fiy3
,ty
);
505 fiz3
= _mm_add_pd(fiz3
,tz
);
507 fjx0
= _mm_add_pd(fjx0
,tx
);
508 fjy0
= _mm_add_pd(fjy0
,ty
);
509 fjz0
= _mm_add_pd(fjz0
,tz
);
513 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
);
515 /* Inner loop uses 257 flops */
522 j_coord_offsetA
= DIM
*jnrA
;
524 /* load j atom coordinates */
525 gmx_mm_load_1rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
528 /* Calculate displacement vector */
529 dx00
= _mm_sub_pd(ix0
,jx0
);
530 dy00
= _mm_sub_pd(iy0
,jy0
);
531 dz00
= _mm_sub_pd(iz0
,jz0
);
532 dx10
= _mm_sub_pd(ix1
,jx0
);
533 dy10
= _mm_sub_pd(iy1
,jy0
);
534 dz10
= _mm_sub_pd(iz1
,jz0
);
535 dx20
= _mm_sub_pd(ix2
,jx0
);
536 dy20
= _mm_sub_pd(iy2
,jy0
);
537 dz20
= _mm_sub_pd(iz2
,jz0
);
538 dx30
= _mm_sub_pd(ix3
,jx0
);
539 dy30
= _mm_sub_pd(iy3
,jy0
);
540 dz30
= _mm_sub_pd(iz3
,jz0
);
542 /* Calculate squared distance and things based on it */
543 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
544 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
545 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
546 rsq30
= gmx_mm_calc_rsq_pd(dx30
,dy30
,dz30
);
548 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
549 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
550 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
551 rinv30
= gmx_mm_invsqrt_pd(rsq30
);
553 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
554 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
555 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
556 rinvsq30
= _mm_mul_pd(rinv30
,rinv30
);
558 /* Load parameters for j particles */
559 jq0
= _mm_load_sd(charge
+jnrA
+0);
560 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
562 fjx0
= _mm_setzero_pd();
563 fjy0
= _mm_setzero_pd();
564 fjz0
= _mm_setzero_pd();
566 /**************************
567 * CALCULATE INTERACTIONS *
568 **************************/
570 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
573 r00
= _mm_mul_pd(rsq00
,rinv00
);
575 /* Compute parameters for interactions between i and j atoms */
576 gmx_mm_load_1pair_swizzle_pd(vdwparam
+vdwioffset0
+vdwjidx0A
,&c6_00
,&c12_00
);
578 /* LENNARD-JONES DISPERSION/REPULSION */
580 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
581 vvdw6
= _mm_mul_pd(c6_00
,rinvsix
);
582 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
583 vvdw
= _mm_sub_pd( _mm_mul_pd(vvdw12
,one_twelfth
) , _mm_mul_pd(vvdw6
,one_sixth
) );
584 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
586 d
= _mm_sub_pd(r00
,rswitch
);
587 d
= _mm_max_pd(d
,_mm_setzero_pd());
588 d2
= _mm_mul_pd(d
,d
);
589 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
591 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
593 /* Evaluate switch function */
594 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
595 fvdw
= _mm_sub_pd( _mm_mul_pd(fvdw
,sw
) , _mm_mul_pd(rinv00
,_mm_mul_pd(vvdw
,dsw
)) );
596 vvdw
= _mm_mul_pd(vvdw
,sw
);
597 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
599 /* Update potential sum for this i atom from the interaction with this j atom. */
600 vvdw
= _mm_and_pd(vvdw
,cutoff_mask
);
601 vvdw
= _mm_unpacklo_pd(vvdw
,_mm_setzero_pd());
602 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
606 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
608 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
610 /* Calculate temporary vectorial force */
611 tx
= _mm_mul_pd(fscal
,dx00
);
612 ty
= _mm_mul_pd(fscal
,dy00
);
613 tz
= _mm_mul_pd(fscal
,dz00
);
615 /* Update vectorial force */
616 fix0
= _mm_add_pd(fix0
,tx
);
617 fiy0
= _mm_add_pd(fiy0
,ty
);
618 fiz0
= _mm_add_pd(fiz0
,tz
);
620 fjx0
= _mm_add_pd(fjx0
,tx
);
621 fjy0
= _mm_add_pd(fjy0
,ty
);
622 fjz0
= _mm_add_pd(fjz0
,tz
);
626 /**************************
627 * CALCULATE INTERACTIONS *
628 **************************/
630 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
633 r10
= _mm_mul_pd(rsq10
,rinv10
);
635 /* Compute parameters for interactions between i and j atoms */
636 qq10
= _mm_mul_pd(iq1
,jq0
);
638 /* EWALD ELECTROSTATICS */
640 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
641 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
642 ewitab
= _mm_cvttpd_epi32(ewrt
);
643 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
644 ewitab
= _mm_slli_epi32(ewitab
,2);
645 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
646 ewtabD
= _mm_setzero_pd();
647 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
648 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
649 ewtabFn
= _mm_setzero_pd();
650 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
651 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
652 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
653 velec
= _mm_mul_pd(qq10
,_mm_sub_pd(rinv10
,velec
));
654 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
656 d
= _mm_sub_pd(r10
,rswitch
);
657 d
= _mm_max_pd(d
,_mm_setzero_pd());
658 d2
= _mm_mul_pd(d
,d
);
659 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
661 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
663 /* Evaluate switch function */
664 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
665 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv10
,_mm_mul_pd(velec
,dsw
)) );
666 velec
= _mm_mul_pd(velec
,sw
);
667 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
669 /* Update potential sum for this i atom from the interaction with this j atom. */
670 velec
= _mm_and_pd(velec
,cutoff_mask
);
671 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
672 velecsum
= _mm_add_pd(velecsum
,velec
);
676 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
678 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
680 /* Calculate temporary vectorial force */
681 tx
= _mm_mul_pd(fscal
,dx10
);
682 ty
= _mm_mul_pd(fscal
,dy10
);
683 tz
= _mm_mul_pd(fscal
,dz10
);
685 /* Update vectorial force */
686 fix1
= _mm_add_pd(fix1
,tx
);
687 fiy1
= _mm_add_pd(fiy1
,ty
);
688 fiz1
= _mm_add_pd(fiz1
,tz
);
690 fjx0
= _mm_add_pd(fjx0
,tx
);
691 fjy0
= _mm_add_pd(fjy0
,ty
);
692 fjz0
= _mm_add_pd(fjz0
,tz
);
696 /**************************
697 * CALCULATE INTERACTIONS *
698 **************************/
700 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
703 r20
= _mm_mul_pd(rsq20
,rinv20
);
705 /* Compute parameters for interactions between i and j atoms */
706 qq20
= _mm_mul_pd(iq2
,jq0
);
708 /* EWALD ELECTROSTATICS */
710 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
711 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
712 ewitab
= _mm_cvttpd_epi32(ewrt
);
713 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
714 ewitab
= _mm_slli_epi32(ewitab
,2);
715 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
716 ewtabD
= _mm_setzero_pd();
717 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
718 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
719 ewtabFn
= _mm_setzero_pd();
720 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
721 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
722 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
723 velec
= _mm_mul_pd(qq20
,_mm_sub_pd(rinv20
,velec
));
724 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
726 d
= _mm_sub_pd(r20
,rswitch
);
727 d
= _mm_max_pd(d
,_mm_setzero_pd());
728 d2
= _mm_mul_pd(d
,d
);
729 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
731 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
733 /* Evaluate switch function */
734 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
735 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv20
,_mm_mul_pd(velec
,dsw
)) );
736 velec
= _mm_mul_pd(velec
,sw
);
737 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
739 /* Update potential sum for this i atom from the interaction with this j atom. */
740 velec
= _mm_and_pd(velec
,cutoff_mask
);
741 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
742 velecsum
= _mm_add_pd(velecsum
,velec
);
746 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
748 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
750 /* Calculate temporary vectorial force */
751 tx
= _mm_mul_pd(fscal
,dx20
);
752 ty
= _mm_mul_pd(fscal
,dy20
);
753 tz
= _mm_mul_pd(fscal
,dz20
);
755 /* Update vectorial force */
756 fix2
= _mm_add_pd(fix2
,tx
);
757 fiy2
= _mm_add_pd(fiy2
,ty
);
758 fiz2
= _mm_add_pd(fiz2
,tz
);
760 fjx0
= _mm_add_pd(fjx0
,tx
);
761 fjy0
= _mm_add_pd(fjy0
,ty
);
762 fjz0
= _mm_add_pd(fjz0
,tz
);
766 /**************************
767 * CALCULATE INTERACTIONS *
768 **************************/
770 if (gmx_mm_any_lt(rsq30
,rcutoff2
))
773 r30
= _mm_mul_pd(rsq30
,rinv30
);
775 /* Compute parameters for interactions between i and j atoms */
776 qq30
= _mm_mul_pd(iq3
,jq0
);
778 /* EWALD ELECTROSTATICS */
780 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
781 ewrt
= _mm_mul_pd(r30
,ewtabscale
);
782 ewitab
= _mm_cvttpd_epi32(ewrt
);
783 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
784 ewitab
= _mm_slli_epi32(ewitab
,2);
785 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
786 ewtabD
= _mm_setzero_pd();
787 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
788 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
789 ewtabFn
= _mm_setzero_pd();
790 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
791 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
792 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
793 velec
= _mm_mul_pd(qq30
,_mm_sub_pd(rinv30
,velec
));
794 felec
= _mm_mul_pd(_mm_mul_pd(qq30
,rinv30
),_mm_sub_pd(rinvsq30
,felec
));
796 d
= _mm_sub_pd(r30
,rswitch
);
797 d
= _mm_max_pd(d
,_mm_setzero_pd());
798 d2
= _mm_mul_pd(d
,d
);
799 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
801 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
803 /* Evaluate switch function */
804 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
805 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv30
,_mm_mul_pd(velec
,dsw
)) );
806 velec
= _mm_mul_pd(velec
,sw
);
807 cutoff_mask
= _mm_cmplt_pd(rsq30
,rcutoff2
);
809 /* Update potential sum for this i atom from the interaction with this j atom. */
810 velec
= _mm_and_pd(velec
,cutoff_mask
);
811 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
812 velecsum
= _mm_add_pd(velecsum
,velec
);
816 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
818 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
820 /* Calculate temporary vectorial force */
821 tx
= _mm_mul_pd(fscal
,dx30
);
822 ty
= _mm_mul_pd(fscal
,dy30
);
823 tz
= _mm_mul_pd(fscal
,dz30
);
825 /* Update vectorial force */
826 fix3
= _mm_add_pd(fix3
,tx
);
827 fiy3
= _mm_add_pd(fiy3
,ty
);
828 fiz3
= _mm_add_pd(fiz3
,tz
);
830 fjx0
= _mm_add_pd(fjx0
,tx
);
831 fjy0
= _mm_add_pd(fjy0
,ty
);
832 fjz0
= _mm_add_pd(fjz0
,tz
);
836 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
);
838 /* Inner loop uses 257 flops */
841 /* End of innermost loop */
843 gmx_mm_update_iforce_4atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
844 f
+i_coord_offset
,fshift
+i_shift_offset
);
847 /* Update potential energies */
848 gmx_mm_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
849 gmx_mm_update_1pot_pd(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
851 /* Increment number of inner iterations */
852 inneriter
+= j_index_end
- j_index_start
;
854 /* Outer loop uses 26 flops */
857 /* Increment number of outer iterations */
860 /* Update outer/inner flops */
862 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4_VF
,outeriter
*26 + inneriter
*257);
865 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse2_double
866 * Electrostatics interaction: Ewald
867 * VdW interaction: LennardJones
868 * Geometry: Water4-Particle
869 * Calculate force/pot: Force
872 nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse2_double
873 (t_nblist
* gmx_restrict nlist
,
874 rvec
* gmx_restrict xx
,
875 rvec
* gmx_restrict ff
,
876 t_forcerec
* gmx_restrict fr
,
877 t_mdatoms
* gmx_restrict mdatoms
,
878 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
879 t_nrnb
* gmx_restrict nrnb
)
881 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
882 * just 0 for non-waters.
883 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
884 * jnr indices corresponding to data put in the four positions in the SIMD register.
886 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
887 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
889 int j_coord_offsetA
,j_coord_offsetB
;
890 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
892 real
*shiftvec
,*fshift
,*x
,*f
;
893 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
895 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
897 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
899 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
901 __m128d ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
902 int vdwjidx0A
,vdwjidx0B
;
903 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
904 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
905 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
906 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
907 __m128d dx30
,dy30
,dz30
,rsq30
,rinv30
,rinvsq30
,r30
,qq30
,c6_30
,c12_30
;
908 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
911 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
914 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
915 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
917 __m128d ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
919 __m128d rswitch
,swV3
,swV4
,swV5
,swF2
,swF3
,swF4
,d
,d2
,sw
,dsw
;
920 real rswitch_scalar
,d_scalar
;
921 __m128d dummy_mask
,cutoff_mask
;
922 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
923 __m128d one
= _mm_set1_pd(1.0);
924 __m128d two
= _mm_set1_pd(2.0);
930 jindex
= nlist
->jindex
;
932 shiftidx
= nlist
->shift
;
934 shiftvec
= fr
->shift_vec
[0];
935 fshift
= fr
->fshift
[0];
936 facel
= _mm_set1_pd(fr
->epsfac
);
937 charge
= mdatoms
->chargeA
;
938 nvdwtype
= fr
->ntype
;
940 vdwtype
= mdatoms
->typeA
;
942 sh_ewald
= _mm_set1_pd(fr
->ic
->sh_ewald
);
943 ewtab
= fr
->ic
->tabq_coul_FDV0
;
944 ewtabscale
= _mm_set1_pd(fr
->ic
->tabq_scale
);
945 ewtabhalfspace
= _mm_set1_pd(0.5/fr
->ic
->tabq_scale
);
947 /* Setup water-specific parameters */
948 inr
= nlist
->iinr
[0];
949 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
950 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
951 iq3
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+3]));
952 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
954 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
955 rcutoff_scalar
= fr
->rcoulomb
;
956 rcutoff
= _mm_set1_pd(rcutoff_scalar
);
957 rcutoff2
= _mm_mul_pd(rcutoff
,rcutoff
);
959 rswitch_scalar
= fr
->rcoulomb_switch
;
960 rswitch
= _mm_set1_pd(rswitch_scalar
);
961 /* Setup switch parameters */
962 d_scalar
= rcutoff_scalar
-rswitch_scalar
;
963 d
= _mm_set1_pd(d_scalar
);
964 swV3
= _mm_set1_pd(-10.0/(d_scalar
*d_scalar
*d_scalar
));
965 swV4
= _mm_set1_pd( 15.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
));
966 swV5
= _mm_set1_pd( -6.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
*d_scalar
));
967 swF2
= _mm_set1_pd(-30.0/(d_scalar
*d_scalar
*d_scalar
));
968 swF3
= _mm_set1_pd( 60.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
));
969 swF4
= _mm_set1_pd(-30.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
*d_scalar
));
971 /* Avoid stupid compiler warnings */
979 /* Start outer loop over neighborlists */
980 for(iidx
=0; iidx
<nri
; iidx
++)
982 /* Load shift vector for this list */
983 i_shift_offset
= DIM
*shiftidx
[iidx
];
985 /* Load limits for loop over neighbors */
986 j_index_start
= jindex
[iidx
];
987 j_index_end
= jindex
[iidx
+1];
989 /* Get outer coordinate index */
991 i_coord_offset
= DIM
*inr
;
993 /* Load i particle coords and add shift vector */
994 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
995 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
997 fix0
= _mm_setzero_pd();
998 fiy0
= _mm_setzero_pd();
999 fiz0
= _mm_setzero_pd();
1000 fix1
= _mm_setzero_pd();
1001 fiy1
= _mm_setzero_pd();
1002 fiz1
= _mm_setzero_pd();
1003 fix2
= _mm_setzero_pd();
1004 fiy2
= _mm_setzero_pd();
1005 fiz2
= _mm_setzero_pd();
1006 fix3
= _mm_setzero_pd();
1007 fiy3
= _mm_setzero_pd();
1008 fiz3
= _mm_setzero_pd();
1010 /* Start inner kernel loop */
1011 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
1014 /* Get j neighbor index, and coordinate index */
1016 jnrB
= jjnr
[jidx
+1];
1017 j_coord_offsetA
= DIM
*jnrA
;
1018 j_coord_offsetB
= DIM
*jnrB
;
1020 /* load j atom coordinates */
1021 gmx_mm_load_1rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1024 /* Calculate displacement vector */
1025 dx00
= _mm_sub_pd(ix0
,jx0
);
1026 dy00
= _mm_sub_pd(iy0
,jy0
);
1027 dz00
= _mm_sub_pd(iz0
,jz0
);
1028 dx10
= _mm_sub_pd(ix1
,jx0
);
1029 dy10
= _mm_sub_pd(iy1
,jy0
);
1030 dz10
= _mm_sub_pd(iz1
,jz0
);
1031 dx20
= _mm_sub_pd(ix2
,jx0
);
1032 dy20
= _mm_sub_pd(iy2
,jy0
);
1033 dz20
= _mm_sub_pd(iz2
,jz0
);
1034 dx30
= _mm_sub_pd(ix3
,jx0
);
1035 dy30
= _mm_sub_pd(iy3
,jy0
);
1036 dz30
= _mm_sub_pd(iz3
,jz0
);
1038 /* Calculate squared distance and things based on it */
1039 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
1040 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
1041 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
1042 rsq30
= gmx_mm_calc_rsq_pd(dx30
,dy30
,dz30
);
1044 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
1045 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
1046 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
1047 rinv30
= gmx_mm_invsqrt_pd(rsq30
);
1049 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
1050 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
1051 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
1052 rinvsq30
= _mm_mul_pd(rinv30
,rinv30
);
1054 /* Load parameters for j particles */
1055 jq0
= gmx_mm_load_2real_swizzle_pd(charge
+jnrA
+0,charge
+jnrB
+0);
1056 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
1057 vdwjidx0B
= 2*vdwtype
[jnrB
+0];
1059 fjx0
= _mm_setzero_pd();
1060 fjy0
= _mm_setzero_pd();
1061 fjz0
= _mm_setzero_pd();
1063 /**************************
1064 * CALCULATE INTERACTIONS *
1065 **************************/
1067 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
1070 r00
= _mm_mul_pd(rsq00
,rinv00
);
1072 /* Compute parameters for interactions between i and j atoms */
1073 gmx_mm_load_2pair_swizzle_pd(vdwparam
+vdwioffset0
+vdwjidx0A
,
1074 vdwparam
+vdwioffset0
+vdwjidx0B
,&c6_00
,&c12_00
);
1076 /* LENNARD-JONES DISPERSION/REPULSION */
1078 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
1079 vvdw6
= _mm_mul_pd(c6_00
,rinvsix
);
1080 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
1081 vvdw
= _mm_sub_pd( _mm_mul_pd(vvdw12
,one_twelfth
) , _mm_mul_pd(vvdw6
,one_sixth
) );
1082 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
1084 d
= _mm_sub_pd(r00
,rswitch
);
1085 d
= _mm_max_pd(d
,_mm_setzero_pd());
1086 d2
= _mm_mul_pd(d
,d
);
1087 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1089 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1091 /* Evaluate switch function */
1092 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1093 fvdw
= _mm_sub_pd( _mm_mul_pd(fvdw
,sw
) , _mm_mul_pd(rinv00
,_mm_mul_pd(vvdw
,dsw
)) );
1094 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
1098 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1100 /* Calculate temporary vectorial force */
1101 tx
= _mm_mul_pd(fscal
,dx00
);
1102 ty
= _mm_mul_pd(fscal
,dy00
);
1103 tz
= _mm_mul_pd(fscal
,dz00
);
1105 /* Update vectorial force */
1106 fix0
= _mm_add_pd(fix0
,tx
);
1107 fiy0
= _mm_add_pd(fiy0
,ty
);
1108 fiz0
= _mm_add_pd(fiz0
,tz
);
1110 fjx0
= _mm_add_pd(fjx0
,tx
);
1111 fjy0
= _mm_add_pd(fjy0
,ty
);
1112 fjz0
= _mm_add_pd(fjz0
,tz
);
1116 /**************************
1117 * CALCULATE INTERACTIONS *
1118 **************************/
1120 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
1123 r10
= _mm_mul_pd(rsq10
,rinv10
);
1125 /* Compute parameters for interactions between i and j atoms */
1126 qq10
= _mm_mul_pd(iq1
,jq0
);
1128 /* EWALD ELECTROSTATICS */
1130 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1131 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
1132 ewitab
= _mm_cvttpd_epi32(ewrt
);
1133 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1134 ewitab
= _mm_slli_epi32(ewitab
,2);
1135 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1136 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
1137 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1138 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1139 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
1140 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1141 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1142 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1143 velec
= _mm_mul_pd(qq10
,_mm_sub_pd(rinv10
,velec
));
1144 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
1146 d
= _mm_sub_pd(r10
,rswitch
);
1147 d
= _mm_max_pd(d
,_mm_setzero_pd());
1148 d2
= _mm_mul_pd(d
,d
);
1149 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1151 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1153 /* Evaluate switch function */
1154 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1155 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv10
,_mm_mul_pd(velec
,dsw
)) );
1156 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
1160 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1162 /* Calculate temporary vectorial force */
1163 tx
= _mm_mul_pd(fscal
,dx10
);
1164 ty
= _mm_mul_pd(fscal
,dy10
);
1165 tz
= _mm_mul_pd(fscal
,dz10
);
1167 /* Update vectorial force */
1168 fix1
= _mm_add_pd(fix1
,tx
);
1169 fiy1
= _mm_add_pd(fiy1
,ty
);
1170 fiz1
= _mm_add_pd(fiz1
,tz
);
1172 fjx0
= _mm_add_pd(fjx0
,tx
);
1173 fjy0
= _mm_add_pd(fjy0
,ty
);
1174 fjz0
= _mm_add_pd(fjz0
,tz
);
1178 /**************************
1179 * CALCULATE INTERACTIONS *
1180 **************************/
1182 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
1185 r20
= _mm_mul_pd(rsq20
,rinv20
);
1187 /* Compute parameters for interactions between i and j atoms */
1188 qq20
= _mm_mul_pd(iq2
,jq0
);
1190 /* EWALD ELECTROSTATICS */
1192 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1193 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
1194 ewitab
= _mm_cvttpd_epi32(ewrt
);
1195 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1196 ewitab
= _mm_slli_epi32(ewitab
,2);
1197 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1198 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
1199 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1200 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1201 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
1202 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1203 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1204 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1205 velec
= _mm_mul_pd(qq20
,_mm_sub_pd(rinv20
,velec
));
1206 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
1208 d
= _mm_sub_pd(r20
,rswitch
);
1209 d
= _mm_max_pd(d
,_mm_setzero_pd());
1210 d2
= _mm_mul_pd(d
,d
);
1211 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1213 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1215 /* Evaluate switch function */
1216 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1217 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv20
,_mm_mul_pd(velec
,dsw
)) );
1218 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
1222 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1224 /* Calculate temporary vectorial force */
1225 tx
= _mm_mul_pd(fscal
,dx20
);
1226 ty
= _mm_mul_pd(fscal
,dy20
);
1227 tz
= _mm_mul_pd(fscal
,dz20
);
1229 /* Update vectorial force */
1230 fix2
= _mm_add_pd(fix2
,tx
);
1231 fiy2
= _mm_add_pd(fiy2
,ty
);
1232 fiz2
= _mm_add_pd(fiz2
,tz
);
1234 fjx0
= _mm_add_pd(fjx0
,tx
);
1235 fjy0
= _mm_add_pd(fjy0
,ty
);
1236 fjz0
= _mm_add_pd(fjz0
,tz
);
1240 /**************************
1241 * CALCULATE INTERACTIONS *
1242 **************************/
1244 if (gmx_mm_any_lt(rsq30
,rcutoff2
))
1247 r30
= _mm_mul_pd(rsq30
,rinv30
);
1249 /* Compute parameters for interactions between i and j atoms */
1250 qq30
= _mm_mul_pd(iq3
,jq0
);
1252 /* EWALD ELECTROSTATICS */
1254 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1255 ewrt
= _mm_mul_pd(r30
,ewtabscale
);
1256 ewitab
= _mm_cvttpd_epi32(ewrt
);
1257 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1258 ewitab
= _mm_slli_epi32(ewitab
,2);
1259 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1260 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
1261 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1262 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1263 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
1264 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1265 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1266 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1267 velec
= _mm_mul_pd(qq30
,_mm_sub_pd(rinv30
,velec
));
1268 felec
= _mm_mul_pd(_mm_mul_pd(qq30
,rinv30
),_mm_sub_pd(rinvsq30
,felec
));
1270 d
= _mm_sub_pd(r30
,rswitch
);
1271 d
= _mm_max_pd(d
,_mm_setzero_pd());
1272 d2
= _mm_mul_pd(d
,d
);
1273 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1275 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1277 /* Evaluate switch function */
1278 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1279 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv30
,_mm_mul_pd(velec
,dsw
)) );
1280 cutoff_mask
= _mm_cmplt_pd(rsq30
,rcutoff2
);
1284 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1286 /* Calculate temporary vectorial force */
1287 tx
= _mm_mul_pd(fscal
,dx30
);
1288 ty
= _mm_mul_pd(fscal
,dy30
);
1289 tz
= _mm_mul_pd(fscal
,dz30
);
1291 /* Update vectorial force */
1292 fix3
= _mm_add_pd(fix3
,tx
);
1293 fiy3
= _mm_add_pd(fiy3
,ty
);
1294 fiz3
= _mm_add_pd(fiz3
,tz
);
1296 fjx0
= _mm_add_pd(fjx0
,tx
);
1297 fjy0
= _mm_add_pd(fjy0
,ty
);
1298 fjz0
= _mm_add_pd(fjz0
,tz
);
1302 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
);
1304 /* Inner loop uses 245 flops */
1307 if(jidx
<j_index_end
)
1311 j_coord_offsetA
= DIM
*jnrA
;
1313 /* load j atom coordinates */
1314 gmx_mm_load_1rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
1317 /* Calculate displacement vector */
1318 dx00
= _mm_sub_pd(ix0
,jx0
);
1319 dy00
= _mm_sub_pd(iy0
,jy0
);
1320 dz00
= _mm_sub_pd(iz0
,jz0
);
1321 dx10
= _mm_sub_pd(ix1
,jx0
);
1322 dy10
= _mm_sub_pd(iy1
,jy0
);
1323 dz10
= _mm_sub_pd(iz1
,jz0
);
1324 dx20
= _mm_sub_pd(ix2
,jx0
);
1325 dy20
= _mm_sub_pd(iy2
,jy0
);
1326 dz20
= _mm_sub_pd(iz2
,jz0
);
1327 dx30
= _mm_sub_pd(ix3
,jx0
);
1328 dy30
= _mm_sub_pd(iy3
,jy0
);
1329 dz30
= _mm_sub_pd(iz3
,jz0
);
1331 /* Calculate squared distance and things based on it */
1332 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
1333 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
1334 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
1335 rsq30
= gmx_mm_calc_rsq_pd(dx30
,dy30
,dz30
);
1337 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
1338 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
1339 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
1340 rinv30
= gmx_mm_invsqrt_pd(rsq30
);
1342 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
1343 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
1344 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
1345 rinvsq30
= _mm_mul_pd(rinv30
,rinv30
);
1347 /* Load parameters for j particles */
1348 jq0
= _mm_load_sd(charge
+jnrA
+0);
1349 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
1351 fjx0
= _mm_setzero_pd();
1352 fjy0
= _mm_setzero_pd();
1353 fjz0
= _mm_setzero_pd();
1355 /**************************
1356 * CALCULATE INTERACTIONS *
1357 **************************/
1359 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
1362 r00
= _mm_mul_pd(rsq00
,rinv00
);
1364 /* Compute parameters for interactions between i and j atoms */
1365 gmx_mm_load_1pair_swizzle_pd(vdwparam
+vdwioffset0
+vdwjidx0A
,&c6_00
,&c12_00
);
1367 /* LENNARD-JONES DISPERSION/REPULSION */
1369 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
1370 vvdw6
= _mm_mul_pd(c6_00
,rinvsix
);
1371 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
1372 vvdw
= _mm_sub_pd( _mm_mul_pd(vvdw12
,one_twelfth
) , _mm_mul_pd(vvdw6
,one_sixth
) );
1373 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
1375 d
= _mm_sub_pd(r00
,rswitch
);
1376 d
= _mm_max_pd(d
,_mm_setzero_pd());
1377 d2
= _mm_mul_pd(d
,d
);
1378 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1380 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1382 /* Evaluate switch function */
1383 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1384 fvdw
= _mm_sub_pd( _mm_mul_pd(fvdw
,sw
) , _mm_mul_pd(rinv00
,_mm_mul_pd(vvdw
,dsw
)) );
1385 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
1389 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1391 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1393 /* Calculate temporary vectorial force */
1394 tx
= _mm_mul_pd(fscal
,dx00
);
1395 ty
= _mm_mul_pd(fscal
,dy00
);
1396 tz
= _mm_mul_pd(fscal
,dz00
);
1398 /* Update vectorial force */
1399 fix0
= _mm_add_pd(fix0
,tx
);
1400 fiy0
= _mm_add_pd(fiy0
,ty
);
1401 fiz0
= _mm_add_pd(fiz0
,tz
);
1403 fjx0
= _mm_add_pd(fjx0
,tx
);
1404 fjy0
= _mm_add_pd(fjy0
,ty
);
1405 fjz0
= _mm_add_pd(fjz0
,tz
);
1409 /**************************
1410 * CALCULATE INTERACTIONS *
1411 **************************/
1413 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
1416 r10
= _mm_mul_pd(rsq10
,rinv10
);
1418 /* Compute parameters for interactions between i and j atoms */
1419 qq10
= _mm_mul_pd(iq1
,jq0
);
1421 /* EWALD ELECTROSTATICS */
1423 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1424 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
1425 ewitab
= _mm_cvttpd_epi32(ewrt
);
1426 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1427 ewitab
= _mm_slli_epi32(ewitab
,2);
1428 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1429 ewtabD
= _mm_setzero_pd();
1430 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1431 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1432 ewtabFn
= _mm_setzero_pd();
1433 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1434 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1435 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1436 velec
= _mm_mul_pd(qq10
,_mm_sub_pd(rinv10
,velec
));
1437 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
1439 d
= _mm_sub_pd(r10
,rswitch
);
1440 d
= _mm_max_pd(d
,_mm_setzero_pd());
1441 d2
= _mm_mul_pd(d
,d
);
1442 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1444 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1446 /* Evaluate switch function */
1447 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1448 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv10
,_mm_mul_pd(velec
,dsw
)) );
1449 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
1453 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1455 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1457 /* Calculate temporary vectorial force */
1458 tx
= _mm_mul_pd(fscal
,dx10
);
1459 ty
= _mm_mul_pd(fscal
,dy10
);
1460 tz
= _mm_mul_pd(fscal
,dz10
);
1462 /* Update vectorial force */
1463 fix1
= _mm_add_pd(fix1
,tx
);
1464 fiy1
= _mm_add_pd(fiy1
,ty
);
1465 fiz1
= _mm_add_pd(fiz1
,tz
);
1467 fjx0
= _mm_add_pd(fjx0
,tx
);
1468 fjy0
= _mm_add_pd(fjy0
,ty
);
1469 fjz0
= _mm_add_pd(fjz0
,tz
);
1473 /**************************
1474 * CALCULATE INTERACTIONS *
1475 **************************/
1477 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
1480 r20
= _mm_mul_pd(rsq20
,rinv20
);
1482 /* Compute parameters for interactions between i and j atoms */
1483 qq20
= _mm_mul_pd(iq2
,jq0
);
1485 /* EWALD ELECTROSTATICS */
1487 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1488 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
1489 ewitab
= _mm_cvttpd_epi32(ewrt
);
1490 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1491 ewitab
= _mm_slli_epi32(ewitab
,2);
1492 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1493 ewtabD
= _mm_setzero_pd();
1494 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1495 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1496 ewtabFn
= _mm_setzero_pd();
1497 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1498 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1499 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1500 velec
= _mm_mul_pd(qq20
,_mm_sub_pd(rinv20
,velec
));
1501 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
1503 d
= _mm_sub_pd(r20
,rswitch
);
1504 d
= _mm_max_pd(d
,_mm_setzero_pd());
1505 d2
= _mm_mul_pd(d
,d
);
1506 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1508 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1510 /* Evaluate switch function */
1511 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1512 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv20
,_mm_mul_pd(velec
,dsw
)) );
1513 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
1517 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1519 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1521 /* Calculate temporary vectorial force */
1522 tx
= _mm_mul_pd(fscal
,dx20
);
1523 ty
= _mm_mul_pd(fscal
,dy20
);
1524 tz
= _mm_mul_pd(fscal
,dz20
);
1526 /* Update vectorial force */
1527 fix2
= _mm_add_pd(fix2
,tx
);
1528 fiy2
= _mm_add_pd(fiy2
,ty
);
1529 fiz2
= _mm_add_pd(fiz2
,tz
);
1531 fjx0
= _mm_add_pd(fjx0
,tx
);
1532 fjy0
= _mm_add_pd(fjy0
,ty
);
1533 fjz0
= _mm_add_pd(fjz0
,tz
);
1537 /**************************
1538 * CALCULATE INTERACTIONS *
1539 **************************/
1541 if (gmx_mm_any_lt(rsq30
,rcutoff2
))
1544 r30
= _mm_mul_pd(rsq30
,rinv30
);
1546 /* Compute parameters for interactions between i and j atoms */
1547 qq30
= _mm_mul_pd(iq3
,jq0
);
1549 /* EWALD ELECTROSTATICS */
1551 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1552 ewrt
= _mm_mul_pd(r30
,ewtabscale
);
1553 ewitab
= _mm_cvttpd_epi32(ewrt
);
1554 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1555 ewitab
= _mm_slli_epi32(ewitab
,2);
1556 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1557 ewtabD
= _mm_setzero_pd();
1558 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1559 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1560 ewtabFn
= _mm_setzero_pd();
1561 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1562 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1563 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1564 velec
= _mm_mul_pd(qq30
,_mm_sub_pd(rinv30
,velec
));
1565 felec
= _mm_mul_pd(_mm_mul_pd(qq30
,rinv30
),_mm_sub_pd(rinvsq30
,felec
));
1567 d
= _mm_sub_pd(r30
,rswitch
);
1568 d
= _mm_max_pd(d
,_mm_setzero_pd());
1569 d2
= _mm_mul_pd(d
,d
);
1570 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1572 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1574 /* Evaluate switch function */
1575 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1576 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv30
,_mm_mul_pd(velec
,dsw
)) );
1577 cutoff_mask
= _mm_cmplt_pd(rsq30
,rcutoff2
);
1581 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1583 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1585 /* Calculate temporary vectorial force */
1586 tx
= _mm_mul_pd(fscal
,dx30
);
1587 ty
= _mm_mul_pd(fscal
,dy30
);
1588 tz
= _mm_mul_pd(fscal
,dz30
);
1590 /* Update vectorial force */
1591 fix3
= _mm_add_pd(fix3
,tx
);
1592 fiy3
= _mm_add_pd(fiy3
,ty
);
1593 fiz3
= _mm_add_pd(fiz3
,tz
);
1595 fjx0
= _mm_add_pd(fjx0
,tx
);
1596 fjy0
= _mm_add_pd(fjy0
,ty
);
1597 fjz0
= _mm_add_pd(fjz0
,tz
);
1601 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
);
1603 /* Inner loop uses 245 flops */
1606 /* End of innermost loop */
1608 gmx_mm_update_iforce_4atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
1609 f
+i_coord_offset
,fshift
+i_shift_offset
);
1611 /* Increment number of inner iterations */
1612 inneriter
+= j_index_end
- j_index_start
;
1614 /* Outer loop uses 24 flops */
1617 /* Increment number of outer iterations */
1620 /* Update outer/inner flops */
1622 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4_F
,outeriter
*24 + inneriter
*245);