2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_sse2_double.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_sse2_double
51 * Electrostatics interaction: Ewald
52 * VdW interaction: LJEwald
53 * Geometry: Water4-Particle
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_sse2_double
58 (t_nblist
* gmx_restrict nlist
,
59 rvec
* gmx_restrict xx
,
60 rvec
* gmx_restrict ff
,
61 struct t_forcerec
* gmx_restrict fr
,
62 t_mdatoms
* gmx_restrict mdatoms
,
63 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
64 t_nrnb
* gmx_restrict nrnb
)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
72 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
74 int j_coord_offsetA
,j_coord_offsetB
;
75 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
77 real
*shiftvec
,*fshift
,*x
,*f
;
78 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
80 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
82 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
84 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
86 __m128d ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
87 int vdwjidx0A
,vdwjidx0B
;
88 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
89 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
90 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
91 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
92 __m128d dx30
,dy30
,dz30
,rsq30
,rinv30
,rinvsq30
,r30
,qq30
,c6_30
,c12_30
;
93 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
96 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
99 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
100 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
105 __m128d ewclj
,ewclj2
,ewclj6
,ewcljrsq
,poly
,exponent
,f6A
,f6B
,sh_lj_ewald
;
107 __m128d one_half
= _mm_set1_pd(0.5);
108 __m128d minus_one
= _mm_set1_pd(-1.0);
110 __m128d ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
112 __m128d dummy_mask
,cutoff_mask
;
113 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
114 __m128d one
= _mm_set1_pd(1.0);
115 __m128d two
= _mm_set1_pd(2.0);
121 jindex
= nlist
->jindex
;
123 shiftidx
= nlist
->shift
;
125 shiftvec
= fr
->shift_vec
[0];
126 fshift
= fr
->fshift
[0];
127 facel
= _mm_set1_pd(fr
->ic
->epsfac
);
128 charge
= mdatoms
->chargeA
;
129 nvdwtype
= fr
->ntype
;
131 vdwtype
= mdatoms
->typeA
;
132 vdwgridparam
= fr
->ljpme_c6grid
;
133 sh_lj_ewald
= _mm_set1_pd(fr
->ic
->sh_lj_ewald
);
134 ewclj
= _mm_set1_pd(fr
->ic
->ewaldcoeff_lj
);
135 ewclj2
= _mm_mul_pd(minus_one
,_mm_mul_pd(ewclj
,ewclj
));
137 sh_ewald
= _mm_set1_pd(fr
->ic
->sh_ewald
);
138 ewtab
= fr
->ic
->tabq_coul_FDV0
;
139 ewtabscale
= _mm_set1_pd(fr
->ic
->tabq_scale
);
140 ewtabhalfspace
= _mm_set1_pd(0.5/fr
->ic
->tabq_scale
);
142 /* Setup water-specific parameters */
143 inr
= nlist
->iinr
[0];
144 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
145 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
146 iq3
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+3]));
147 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
149 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
150 rcutoff_scalar
= fr
->ic
->rcoulomb
;
151 rcutoff
= _mm_set1_pd(rcutoff_scalar
);
152 rcutoff2
= _mm_mul_pd(rcutoff
,rcutoff
);
154 sh_vdw_invrcut6
= _mm_set1_pd(fr
->ic
->sh_invrc6
);
155 rvdw
= _mm_set1_pd(fr
->ic
->rvdw
);
157 /* Avoid stupid compiler warnings */
165 /* Start outer loop over neighborlists */
166 for(iidx
=0; iidx
<nri
; iidx
++)
168 /* Load shift vector for this list */
169 i_shift_offset
= DIM
*shiftidx
[iidx
];
171 /* Load limits for loop over neighbors */
172 j_index_start
= jindex
[iidx
];
173 j_index_end
= jindex
[iidx
+1];
175 /* Get outer coordinate index */
177 i_coord_offset
= DIM
*inr
;
179 /* Load i particle coords and add shift vector */
180 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
181 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
183 fix0
= _mm_setzero_pd();
184 fiy0
= _mm_setzero_pd();
185 fiz0
= _mm_setzero_pd();
186 fix1
= _mm_setzero_pd();
187 fiy1
= _mm_setzero_pd();
188 fiz1
= _mm_setzero_pd();
189 fix2
= _mm_setzero_pd();
190 fiy2
= _mm_setzero_pd();
191 fiz2
= _mm_setzero_pd();
192 fix3
= _mm_setzero_pd();
193 fiy3
= _mm_setzero_pd();
194 fiz3
= _mm_setzero_pd();
196 /* Reset potential sums */
197 velecsum
= _mm_setzero_pd();
198 vvdwsum
= _mm_setzero_pd();
200 /* Start inner kernel loop */
201 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
204 /* Get j neighbor index, and coordinate index */
207 j_coord_offsetA
= DIM
*jnrA
;
208 j_coord_offsetB
= DIM
*jnrB
;
210 /* load j atom coordinates */
211 gmx_mm_load_1rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
214 /* Calculate displacement vector */
215 dx00
= _mm_sub_pd(ix0
,jx0
);
216 dy00
= _mm_sub_pd(iy0
,jy0
);
217 dz00
= _mm_sub_pd(iz0
,jz0
);
218 dx10
= _mm_sub_pd(ix1
,jx0
);
219 dy10
= _mm_sub_pd(iy1
,jy0
);
220 dz10
= _mm_sub_pd(iz1
,jz0
);
221 dx20
= _mm_sub_pd(ix2
,jx0
);
222 dy20
= _mm_sub_pd(iy2
,jy0
);
223 dz20
= _mm_sub_pd(iz2
,jz0
);
224 dx30
= _mm_sub_pd(ix3
,jx0
);
225 dy30
= _mm_sub_pd(iy3
,jy0
);
226 dz30
= _mm_sub_pd(iz3
,jz0
);
228 /* Calculate squared distance and things based on it */
229 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
230 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
231 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
232 rsq30
= gmx_mm_calc_rsq_pd(dx30
,dy30
,dz30
);
234 rinv00
= sse2_invsqrt_d(rsq00
);
235 rinv10
= sse2_invsqrt_d(rsq10
);
236 rinv20
= sse2_invsqrt_d(rsq20
);
237 rinv30
= sse2_invsqrt_d(rsq30
);
239 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
240 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
241 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
242 rinvsq30
= _mm_mul_pd(rinv30
,rinv30
);
244 /* Load parameters for j particles */
245 jq0
= gmx_mm_load_2real_swizzle_pd(charge
+jnrA
+0,charge
+jnrB
+0);
246 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
247 vdwjidx0B
= 2*vdwtype
[jnrB
+0];
249 fjx0
= _mm_setzero_pd();
250 fjy0
= _mm_setzero_pd();
251 fjz0
= _mm_setzero_pd();
253 /**************************
254 * CALCULATE INTERACTIONS *
255 **************************/
257 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
260 r00
= _mm_mul_pd(rsq00
,rinv00
);
262 /* Compute parameters for interactions between i and j atoms */
263 gmx_mm_load_2pair_swizzle_pd(vdwparam
+vdwioffset0
+vdwjidx0A
,
264 vdwparam
+vdwioffset0
+vdwjidx0B
,&c6_00
,&c12_00
);
266 c6grid_00
= gmx_mm_load_2real_swizzle_pd(vdwgridparam
+vdwioffset0
+vdwjidx0A
,
267 vdwgridparam
+vdwioffset0
+vdwjidx0B
);
269 /* Analytical LJ-PME */
270 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
271 ewcljrsq
= _mm_mul_pd(ewclj2
,rsq00
);
272 ewclj6
= _mm_mul_pd(ewclj2
,_mm_mul_pd(ewclj2
,ewclj2
));
273 exponent
= sse2_exp_d(ewcljrsq
);
274 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
275 poly
= _mm_mul_pd(exponent
,_mm_add_pd(_mm_sub_pd(one
,ewcljrsq
),_mm_mul_pd(_mm_mul_pd(ewcljrsq
,ewcljrsq
),one_half
)));
276 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
277 vvdw6
= _mm_mul_pd(_mm_sub_pd(c6_00
,_mm_mul_pd(c6grid_00
,_mm_sub_pd(one
,poly
))),rinvsix
);
278 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
279 vvdw
= _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12
, _mm_mul_pd(c12_00
,_mm_mul_pd(sh_vdw_invrcut6
,sh_vdw_invrcut6
))),one_twelfth
),
280 _mm_mul_pd( _mm_sub_pd(vvdw6
,_mm_add_pd(_mm_mul_pd(c6_00
,sh_vdw_invrcut6
),_mm_mul_pd(c6grid_00
,sh_lj_ewald
))),one_sixth
));
281 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
282 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,_mm_sub_pd(vvdw6
,_mm_mul_pd(_mm_mul_pd(c6grid_00
,one_sixth
),_mm_mul_pd(exponent
,ewclj6
)))),rinvsq00
);
284 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
286 /* Update potential sum for this i atom from the interaction with this j atom. */
287 vvdw
= _mm_and_pd(vvdw
,cutoff_mask
);
288 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
292 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
294 /* Calculate temporary vectorial force */
295 tx
= _mm_mul_pd(fscal
,dx00
);
296 ty
= _mm_mul_pd(fscal
,dy00
);
297 tz
= _mm_mul_pd(fscal
,dz00
);
299 /* Update vectorial force */
300 fix0
= _mm_add_pd(fix0
,tx
);
301 fiy0
= _mm_add_pd(fiy0
,ty
);
302 fiz0
= _mm_add_pd(fiz0
,tz
);
304 fjx0
= _mm_add_pd(fjx0
,tx
);
305 fjy0
= _mm_add_pd(fjy0
,ty
);
306 fjz0
= _mm_add_pd(fjz0
,tz
);
310 /**************************
311 * CALCULATE INTERACTIONS *
312 **************************/
314 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
317 r10
= _mm_mul_pd(rsq10
,rinv10
);
319 /* Compute parameters for interactions between i and j atoms */
320 qq10
= _mm_mul_pd(iq1
,jq0
);
322 /* EWALD ELECTROSTATICS */
324 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
325 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
326 ewitab
= _mm_cvttpd_epi32(ewrt
);
327 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
328 ewitab
= _mm_slli_epi32(ewitab
,2);
329 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
330 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
331 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
332 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
333 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
334 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
335 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
336 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
337 velec
= _mm_mul_pd(qq10
,_mm_sub_pd(_mm_sub_pd(rinv10
,sh_ewald
),velec
));
338 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
340 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
342 /* Update potential sum for this i atom from the interaction with this j atom. */
343 velec
= _mm_and_pd(velec
,cutoff_mask
);
344 velecsum
= _mm_add_pd(velecsum
,velec
);
348 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
350 /* Calculate temporary vectorial force */
351 tx
= _mm_mul_pd(fscal
,dx10
);
352 ty
= _mm_mul_pd(fscal
,dy10
);
353 tz
= _mm_mul_pd(fscal
,dz10
);
355 /* Update vectorial force */
356 fix1
= _mm_add_pd(fix1
,tx
);
357 fiy1
= _mm_add_pd(fiy1
,ty
);
358 fiz1
= _mm_add_pd(fiz1
,tz
);
360 fjx0
= _mm_add_pd(fjx0
,tx
);
361 fjy0
= _mm_add_pd(fjy0
,ty
);
362 fjz0
= _mm_add_pd(fjz0
,tz
);
366 /**************************
367 * CALCULATE INTERACTIONS *
368 **************************/
370 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
373 r20
= _mm_mul_pd(rsq20
,rinv20
);
375 /* Compute parameters for interactions between i and j atoms */
376 qq20
= _mm_mul_pd(iq2
,jq0
);
378 /* EWALD ELECTROSTATICS */
380 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
381 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
382 ewitab
= _mm_cvttpd_epi32(ewrt
);
383 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
384 ewitab
= _mm_slli_epi32(ewitab
,2);
385 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
386 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
387 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
388 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
389 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
390 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
391 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
392 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
393 velec
= _mm_mul_pd(qq20
,_mm_sub_pd(_mm_sub_pd(rinv20
,sh_ewald
),velec
));
394 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
396 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
398 /* Update potential sum for this i atom from the interaction with this j atom. */
399 velec
= _mm_and_pd(velec
,cutoff_mask
);
400 velecsum
= _mm_add_pd(velecsum
,velec
);
404 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
406 /* Calculate temporary vectorial force */
407 tx
= _mm_mul_pd(fscal
,dx20
);
408 ty
= _mm_mul_pd(fscal
,dy20
);
409 tz
= _mm_mul_pd(fscal
,dz20
);
411 /* Update vectorial force */
412 fix2
= _mm_add_pd(fix2
,tx
);
413 fiy2
= _mm_add_pd(fiy2
,ty
);
414 fiz2
= _mm_add_pd(fiz2
,tz
);
416 fjx0
= _mm_add_pd(fjx0
,tx
);
417 fjy0
= _mm_add_pd(fjy0
,ty
);
418 fjz0
= _mm_add_pd(fjz0
,tz
);
422 /**************************
423 * CALCULATE INTERACTIONS *
424 **************************/
426 if (gmx_mm_any_lt(rsq30
,rcutoff2
))
429 r30
= _mm_mul_pd(rsq30
,rinv30
);
431 /* Compute parameters for interactions between i and j atoms */
432 qq30
= _mm_mul_pd(iq3
,jq0
);
434 /* EWALD ELECTROSTATICS */
436 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
437 ewrt
= _mm_mul_pd(r30
,ewtabscale
);
438 ewitab
= _mm_cvttpd_epi32(ewrt
);
439 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
440 ewitab
= _mm_slli_epi32(ewitab
,2);
441 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
442 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
443 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
444 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
445 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
446 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
447 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
448 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
449 velec
= _mm_mul_pd(qq30
,_mm_sub_pd(_mm_sub_pd(rinv30
,sh_ewald
),velec
));
450 felec
= _mm_mul_pd(_mm_mul_pd(qq30
,rinv30
),_mm_sub_pd(rinvsq30
,felec
));
452 cutoff_mask
= _mm_cmplt_pd(rsq30
,rcutoff2
);
454 /* Update potential sum for this i atom from the interaction with this j atom. */
455 velec
= _mm_and_pd(velec
,cutoff_mask
);
456 velecsum
= _mm_add_pd(velecsum
,velec
);
460 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
462 /* Calculate temporary vectorial force */
463 tx
= _mm_mul_pd(fscal
,dx30
);
464 ty
= _mm_mul_pd(fscal
,dy30
);
465 tz
= _mm_mul_pd(fscal
,dz30
);
467 /* Update vectorial force */
468 fix3
= _mm_add_pd(fix3
,tx
);
469 fiy3
= _mm_add_pd(fiy3
,ty
);
470 fiz3
= _mm_add_pd(fiz3
,tz
);
472 fjx0
= _mm_add_pd(fjx0
,tx
);
473 fjy0
= _mm_add_pd(fjy0
,ty
);
474 fjz0
= _mm_add_pd(fjz0
,tz
);
478 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
);
480 /* Inner loop uses 203 flops */
487 j_coord_offsetA
= DIM
*jnrA
;
489 /* load j atom coordinates */
490 gmx_mm_load_1rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
493 /* Calculate displacement vector */
494 dx00
= _mm_sub_pd(ix0
,jx0
);
495 dy00
= _mm_sub_pd(iy0
,jy0
);
496 dz00
= _mm_sub_pd(iz0
,jz0
);
497 dx10
= _mm_sub_pd(ix1
,jx0
);
498 dy10
= _mm_sub_pd(iy1
,jy0
);
499 dz10
= _mm_sub_pd(iz1
,jz0
);
500 dx20
= _mm_sub_pd(ix2
,jx0
);
501 dy20
= _mm_sub_pd(iy2
,jy0
);
502 dz20
= _mm_sub_pd(iz2
,jz0
);
503 dx30
= _mm_sub_pd(ix3
,jx0
);
504 dy30
= _mm_sub_pd(iy3
,jy0
);
505 dz30
= _mm_sub_pd(iz3
,jz0
);
507 /* Calculate squared distance and things based on it */
508 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
509 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
510 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
511 rsq30
= gmx_mm_calc_rsq_pd(dx30
,dy30
,dz30
);
513 rinv00
= sse2_invsqrt_d(rsq00
);
514 rinv10
= sse2_invsqrt_d(rsq10
);
515 rinv20
= sse2_invsqrt_d(rsq20
);
516 rinv30
= sse2_invsqrt_d(rsq30
);
518 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
519 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
520 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
521 rinvsq30
= _mm_mul_pd(rinv30
,rinv30
);
523 /* Load parameters for j particles */
524 jq0
= _mm_load_sd(charge
+jnrA
+0);
525 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
527 fjx0
= _mm_setzero_pd();
528 fjy0
= _mm_setzero_pd();
529 fjz0
= _mm_setzero_pd();
531 /**************************
532 * CALCULATE INTERACTIONS *
533 **************************/
535 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
538 r00
= _mm_mul_pd(rsq00
,rinv00
);
540 /* Compute parameters for interactions between i and j atoms */
541 gmx_mm_load_1pair_swizzle_pd(vdwparam
+vdwioffset0
+vdwjidx0A
,&c6_00
,&c12_00
);
543 c6grid_00
= gmx_mm_load_1real_pd(vdwgridparam
+vdwioffset0
+vdwjidx0A
);
545 /* Analytical LJ-PME */
546 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
547 ewcljrsq
= _mm_mul_pd(ewclj2
,rsq00
);
548 ewclj6
= _mm_mul_pd(ewclj2
,_mm_mul_pd(ewclj2
,ewclj2
));
549 exponent
= sse2_exp_d(ewcljrsq
);
550 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
551 poly
= _mm_mul_pd(exponent
,_mm_add_pd(_mm_sub_pd(one
,ewcljrsq
),_mm_mul_pd(_mm_mul_pd(ewcljrsq
,ewcljrsq
),one_half
)));
552 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
553 vvdw6
= _mm_mul_pd(_mm_sub_pd(c6_00
,_mm_mul_pd(c6grid_00
,_mm_sub_pd(one
,poly
))),rinvsix
);
554 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
555 vvdw
= _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12
, _mm_mul_pd(c12_00
,_mm_mul_pd(sh_vdw_invrcut6
,sh_vdw_invrcut6
))),one_twelfth
),
556 _mm_mul_pd( _mm_sub_pd(vvdw6
,_mm_add_pd(_mm_mul_pd(c6_00
,sh_vdw_invrcut6
),_mm_mul_pd(c6grid_00
,sh_lj_ewald
))),one_sixth
));
557 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
558 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,_mm_sub_pd(vvdw6
,_mm_mul_pd(_mm_mul_pd(c6grid_00
,one_sixth
),_mm_mul_pd(exponent
,ewclj6
)))),rinvsq00
);
560 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
562 /* Update potential sum for this i atom from the interaction with this j atom. */
563 vvdw
= _mm_and_pd(vvdw
,cutoff_mask
);
564 vvdw
= _mm_unpacklo_pd(vvdw
,_mm_setzero_pd());
565 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
569 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
571 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
573 /* Calculate temporary vectorial force */
574 tx
= _mm_mul_pd(fscal
,dx00
);
575 ty
= _mm_mul_pd(fscal
,dy00
);
576 tz
= _mm_mul_pd(fscal
,dz00
);
578 /* Update vectorial force */
579 fix0
= _mm_add_pd(fix0
,tx
);
580 fiy0
= _mm_add_pd(fiy0
,ty
);
581 fiz0
= _mm_add_pd(fiz0
,tz
);
583 fjx0
= _mm_add_pd(fjx0
,tx
);
584 fjy0
= _mm_add_pd(fjy0
,ty
);
585 fjz0
= _mm_add_pd(fjz0
,tz
);
589 /**************************
590 * CALCULATE INTERACTIONS *
591 **************************/
593 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
596 r10
= _mm_mul_pd(rsq10
,rinv10
);
598 /* Compute parameters for interactions between i and j atoms */
599 qq10
= _mm_mul_pd(iq1
,jq0
);
601 /* EWALD ELECTROSTATICS */
603 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
604 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
605 ewitab
= _mm_cvttpd_epi32(ewrt
);
606 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
607 ewitab
= _mm_slli_epi32(ewitab
,2);
608 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
609 ewtabD
= _mm_setzero_pd();
610 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
611 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
612 ewtabFn
= _mm_setzero_pd();
613 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
614 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
615 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
616 velec
= _mm_mul_pd(qq10
,_mm_sub_pd(_mm_sub_pd(rinv10
,sh_ewald
),velec
));
617 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
619 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
621 /* Update potential sum for this i atom from the interaction with this j atom. */
622 velec
= _mm_and_pd(velec
,cutoff_mask
);
623 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
624 velecsum
= _mm_add_pd(velecsum
,velec
);
628 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
630 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
632 /* Calculate temporary vectorial force */
633 tx
= _mm_mul_pd(fscal
,dx10
);
634 ty
= _mm_mul_pd(fscal
,dy10
);
635 tz
= _mm_mul_pd(fscal
,dz10
);
637 /* Update vectorial force */
638 fix1
= _mm_add_pd(fix1
,tx
);
639 fiy1
= _mm_add_pd(fiy1
,ty
);
640 fiz1
= _mm_add_pd(fiz1
,tz
);
642 fjx0
= _mm_add_pd(fjx0
,tx
);
643 fjy0
= _mm_add_pd(fjy0
,ty
);
644 fjz0
= _mm_add_pd(fjz0
,tz
);
648 /**************************
649 * CALCULATE INTERACTIONS *
650 **************************/
652 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
655 r20
= _mm_mul_pd(rsq20
,rinv20
);
657 /* Compute parameters for interactions between i and j atoms */
658 qq20
= _mm_mul_pd(iq2
,jq0
);
660 /* EWALD ELECTROSTATICS */
662 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
663 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
664 ewitab
= _mm_cvttpd_epi32(ewrt
);
665 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
666 ewitab
= _mm_slli_epi32(ewitab
,2);
667 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
668 ewtabD
= _mm_setzero_pd();
669 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
670 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
671 ewtabFn
= _mm_setzero_pd();
672 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
673 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
674 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
675 velec
= _mm_mul_pd(qq20
,_mm_sub_pd(_mm_sub_pd(rinv20
,sh_ewald
),velec
));
676 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
678 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
680 /* Update potential sum for this i atom from the interaction with this j atom. */
681 velec
= _mm_and_pd(velec
,cutoff_mask
);
682 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
683 velecsum
= _mm_add_pd(velecsum
,velec
);
687 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
689 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
691 /* Calculate temporary vectorial force */
692 tx
= _mm_mul_pd(fscal
,dx20
);
693 ty
= _mm_mul_pd(fscal
,dy20
);
694 tz
= _mm_mul_pd(fscal
,dz20
);
696 /* Update vectorial force */
697 fix2
= _mm_add_pd(fix2
,tx
);
698 fiy2
= _mm_add_pd(fiy2
,ty
);
699 fiz2
= _mm_add_pd(fiz2
,tz
);
701 fjx0
= _mm_add_pd(fjx0
,tx
);
702 fjy0
= _mm_add_pd(fjy0
,ty
);
703 fjz0
= _mm_add_pd(fjz0
,tz
);
707 /**************************
708 * CALCULATE INTERACTIONS *
709 **************************/
711 if (gmx_mm_any_lt(rsq30
,rcutoff2
))
714 r30
= _mm_mul_pd(rsq30
,rinv30
);
716 /* Compute parameters for interactions between i and j atoms */
717 qq30
= _mm_mul_pd(iq3
,jq0
);
719 /* EWALD ELECTROSTATICS */
721 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
722 ewrt
= _mm_mul_pd(r30
,ewtabscale
);
723 ewitab
= _mm_cvttpd_epi32(ewrt
);
724 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
725 ewitab
= _mm_slli_epi32(ewitab
,2);
726 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
727 ewtabD
= _mm_setzero_pd();
728 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
729 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
730 ewtabFn
= _mm_setzero_pd();
731 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
732 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
733 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
734 velec
= _mm_mul_pd(qq30
,_mm_sub_pd(_mm_sub_pd(rinv30
,sh_ewald
),velec
));
735 felec
= _mm_mul_pd(_mm_mul_pd(qq30
,rinv30
),_mm_sub_pd(rinvsq30
,felec
));
737 cutoff_mask
= _mm_cmplt_pd(rsq30
,rcutoff2
);
739 /* Update potential sum for this i atom from the interaction with this j atom. */
740 velec
= _mm_and_pd(velec
,cutoff_mask
);
741 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
742 velecsum
= _mm_add_pd(velecsum
,velec
);
746 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
748 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
750 /* Calculate temporary vectorial force */
751 tx
= _mm_mul_pd(fscal
,dx30
);
752 ty
= _mm_mul_pd(fscal
,dy30
);
753 tz
= _mm_mul_pd(fscal
,dz30
);
755 /* Update vectorial force */
756 fix3
= _mm_add_pd(fix3
,tx
);
757 fiy3
= _mm_add_pd(fiy3
,ty
);
758 fiz3
= _mm_add_pd(fiz3
,tz
);
760 fjx0
= _mm_add_pd(fjx0
,tx
);
761 fjy0
= _mm_add_pd(fjy0
,ty
);
762 fjz0
= _mm_add_pd(fjz0
,tz
);
766 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
);
768 /* Inner loop uses 203 flops */
771 /* End of innermost loop */
773 gmx_mm_update_iforce_4atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
774 f
+i_coord_offset
,fshift
+i_shift_offset
);
777 /* Update potential energies */
778 gmx_mm_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
779 gmx_mm_update_1pot_pd(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
781 /* Increment number of inner iterations */
782 inneriter
+= j_index_end
- j_index_start
;
784 /* Outer loop uses 26 flops */
787 /* Increment number of outer iterations */
790 /* Update outer/inner flops */
792 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4_VF
,outeriter
*26 + inneriter
*203);
795 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_sse2_double
796 * Electrostatics interaction: Ewald
797 * VdW interaction: LJEwald
798 * Geometry: Water4-Particle
799 * Calculate force/pot: Force
802 nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_sse2_double
803 (t_nblist
* gmx_restrict nlist
,
804 rvec
* gmx_restrict xx
,
805 rvec
* gmx_restrict ff
,
806 struct t_forcerec
* gmx_restrict fr
,
807 t_mdatoms
* gmx_restrict mdatoms
,
808 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
809 t_nrnb
* gmx_restrict nrnb
)
811 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
812 * just 0 for non-waters.
813 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
814 * jnr indices corresponding to data put in the four positions in the SIMD register.
816 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
817 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
819 int j_coord_offsetA
,j_coord_offsetB
;
820 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
822 real
*shiftvec
,*fshift
,*x
,*f
;
823 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
825 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
827 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
829 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
831 __m128d ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
832 int vdwjidx0A
,vdwjidx0B
;
833 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
834 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
835 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
836 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
837 __m128d dx30
,dy30
,dz30
,rsq30
,rinv30
,rinvsq30
,r30
,qq30
,c6_30
,c12_30
;
838 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
841 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
844 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
845 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
850 __m128d ewclj
,ewclj2
,ewclj6
,ewcljrsq
,poly
,exponent
,f6A
,f6B
,sh_lj_ewald
;
852 __m128d one_half
= _mm_set1_pd(0.5);
853 __m128d minus_one
= _mm_set1_pd(-1.0);
855 __m128d ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
857 __m128d dummy_mask
,cutoff_mask
;
858 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
859 __m128d one
= _mm_set1_pd(1.0);
860 __m128d two
= _mm_set1_pd(2.0);
866 jindex
= nlist
->jindex
;
868 shiftidx
= nlist
->shift
;
870 shiftvec
= fr
->shift_vec
[0];
871 fshift
= fr
->fshift
[0];
872 facel
= _mm_set1_pd(fr
->ic
->epsfac
);
873 charge
= mdatoms
->chargeA
;
874 nvdwtype
= fr
->ntype
;
876 vdwtype
= mdatoms
->typeA
;
877 vdwgridparam
= fr
->ljpme_c6grid
;
878 sh_lj_ewald
= _mm_set1_pd(fr
->ic
->sh_lj_ewald
);
879 ewclj
= _mm_set1_pd(fr
->ic
->ewaldcoeff_lj
);
880 ewclj2
= _mm_mul_pd(minus_one
,_mm_mul_pd(ewclj
,ewclj
));
882 sh_ewald
= _mm_set1_pd(fr
->ic
->sh_ewald
);
883 ewtab
= fr
->ic
->tabq_coul_F
;
884 ewtabscale
= _mm_set1_pd(fr
->ic
->tabq_scale
);
885 ewtabhalfspace
= _mm_set1_pd(0.5/fr
->ic
->tabq_scale
);
887 /* Setup water-specific parameters */
888 inr
= nlist
->iinr
[0];
889 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
890 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
891 iq3
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+3]));
892 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
894 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
895 rcutoff_scalar
= fr
->ic
->rcoulomb
;
896 rcutoff
= _mm_set1_pd(rcutoff_scalar
);
897 rcutoff2
= _mm_mul_pd(rcutoff
,rcutoff
);
899 sh_vdw_invrcut6
= _mm_set1_pd(fr
->ic
->sh_invrc6
);
900 rvdw
= _mm_set1_pd(fr
->ic
->rvdw
);
902 /* Avoid stupid compiler warnings */
910 /* Start outer loop over neighborlists */
911 for(iidx
=0; iidx
<nri
; iidx
++)
913 /* Load shift vector for this list */
914 i_shift_offset
= DIM
*shiftidx
[iidx
];
916 /* Load limits for loop over neighbors */
917 j_index_start
= jindex
[iidx
];
918 j_index_end
= jindex
[iidx
+1];
920 /* Get outer coordinate index */
922 i_coord_offset
= DIM
*inr
;
924 /* Load i particle coords and add shift vector */
925 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
926 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
928 fix0
= _mm_setzero_pd();
929 fiy0
= _mm_setzero_pd();
930 fiz0
= _mm_setzero_pd();
931 fix1
= _mm_setzero_pd();
932 fiy1
= _mm_setzero_pd();
933 fiz1
= _mm_setzero_pd();
934 fix2
= _mm_setzero_pd();
935 fiy2
= _mm_setzero_pd();
936 fiz2
= _mm_setzero_pd();
937 fix3
= _mm_setzero_pd();
938 fiy3
= _mm_setzero_pd();
939 fiz3
= _mm_setzero_pd();
941 /* Start inner kernel loop */
942 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
945 /* Get j neighbor index, and coordinate index */
948 j_coord_offsetA
= DIM
*jnrA
;
949 j_coord_offsetB
= DIM
*jnrB
;
951 /* load j atom coordinates */
952 gmx_mm_load_1rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
955 /* Calculate displacement vector */
956 dx00
= _mm_sub_pd(ix0
,jx0
);
957 dy00
= _mm_sub_pd(iy0
,jy0
);
958 dz00
= _mm_sub_pd(iz0
,jz0
);
959 dx10
= _mm_sub_pd(ix1
,jx0
);
960 dy10
= _mm_sub_pd(iy1
,jy0
);
961 dz10
= _mm_sub_pd(iz1
,jz0
);
962 dx20
= _mm_sub_pd(ix2
,jx0
);
963 dy20
= _mm_sub_pd(iy2
,jy0
);
964 dz20
= _mm_sub_pd(iz2
,jz0
);
965 dx30
= _mm_sub_pd(ix3
,jx0
);
966 dy30
= _mm_sub_pd(iy3
,jy0
);
967 dz30
= _mm_sub_pd(iz3
,jz0
);
969 /* Calculate squared distance and things based on it */
970 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
971 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
972 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
973 rsq30
= gmx_mm_calc_rsq_pd(dx30
,dy30
,dz30
);
975 rinv00
= sse2_invsqrt_d(rsq00
);
976 rinv10
= sse2_invsqrt_d(rsq10
);
977 rinv20
= sse2_invsqrt_d(rsq20
);
978 rinv30
= sse2_invsqrt_d(rsq30
);
980 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
981 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
982 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
983 rinvsq30
= _mm_mul_pd(rinv30
,rinv30
);
985 /* Load parameters for j particles */
986 jq0
= gmx_mm_load_2real_swizzle_pd(charge
+jnrA
+0,charge
+jnrB
+0);
987 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
988 vdwjidx0B
= 2*vdwtype
[jnrB
+0];
990 fjx0
= _mm_setzero_pd();
991 fjy0
= _mm_setzero_pd();
992 fjz0
= _mm_setzero_pd();
994 /**************************
995 * CALCULATE INTERACTIONS *
996 **************************/
998 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
1001 r00
= _mm_mul_pd(rsq00
,rinv00
);
1003 /* Compute parameters for interactions between i and j atoms */
1004 gmx_mm_load_2pair_swizzle_pd(vdwparam
+vdwioffset0
+vdwjidx0A
,
1005 vdwparam
+vdwioffset0
+vdwjidx0B
,&c6_00
,&c12_00
);
1007 c6grid_00
= gmx_mm_load_2real_swizzle_pd(vdwgridparam
+vdwioffset0
+vdwjidx0A
,
1008 vdwgridparam
+vdwioffset0
+vdwjidx0B
);
1010 /* Analytical LJ-PME */
1011 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
1012 ewcljrsq
= _mm_mul_pd(ewclj2
,rsq00
);
1013 ewclj6
= _mm_mul_pd(ewclj2
,_mm_mul_pd(ewclj2
,ewclj2
));
1014 exponent
= sse2_exp_d(ewcljrsq
);
1015 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1016 poly
= _mm_mul_pd(exponent
,_mm_add_pd(_mm_sub_pd(one
,ewcljrsq
),_mm_mul_pd(_mm_mul_pd(ewcljrsq
,ewcljrsq
),one_half
)));
1017 /* f6A = 6 * C6grid * (1 - poly) */
1018 f6A
= _mm_mul_pd(c6grid_00
,_mm_sub_pd(one
,poly
));
1019 /* f6B = C6grid * exponent * beta^6 */
1020 f6B
= _mm_mul_pd(_mm_mul_pd(c6grid_00
,one_sixth
),_mm_mul_pd(exponent
,ewclj6
));
1021 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1022 fvdw
= _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00
,rinvsix
),_mm_sub_pd(c6_00
,f6A
)),rinvsix
),f6B
),rinvsq00
);
1024 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
1028 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1030 /* Calculate temporary vectorial force */
1031 tx
= _mm_mul_pd(fscal
,dx00
);
1032 ty
= _mm_mul_pd(fscal
,dy00
);
1033 tz
= _mm_mul_pd(fscal
,dz00
);
1035 /* Update vectorial force */
1036 fix0
= _mm_add_pd(fix0
,tx
);
1037 fiy0
= _mm_add_pd(fiy0
,ty
);
1038 fiz0
= _mm_add_pd(fiz0
,tz
);
1040 fjx0
= _mm_add_pd(fjx0
,tx
);
1041 fjy0
= _mm_add_pd(fjy0
,ty
);
1042 fjz0
= _mm_add_pd(fjz0
,tz
);
1046 /**************************
1047 * CALCULATE INTERACTIONS *
1048 **************************/
1050 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
1053 r10
= _mm_mul_pd(rsq10
,rinv10
);
1055 /* Compute parameters for interactions between i and j atoms */
1056 qq10
= _mm_mul_pd(iq1
,jq0
);
1058 /* EWALD ELECTROSTATICS */
1060 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1061 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
1062 ewitab
= _mm_cvttpd_epi32(ewrt
);
1063 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1064 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1066 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1067 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
1069 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
1073 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1075 /* Calculate temporary vectorial force */
1076 tx
= _mm_mul_pd(fscal
,dx10
);
1077 ty
= _mm_mul_pd(fscal
,dy10
);
1078 tz
= _mm_mul_pd(fscal
,dz10
);
1080 /* Update vectorial force */
1081 fix1
= _mm_add_pd(fix1
,tx
);
1082 fiy1
= _mm_add_pd(fiy1
,ty
);
1083 fiz1
= _mm_add_pd(fiz1
,tz
);
1085 fjx0
= _mm_add_pd(fjx0
,tx
);
1086 fjy0
= _mm_add_pd(fjy0
,ty
);
1087 fjz0
= _mm_add_pd(fjz0
,tz
);
1091 /**************************
1092 * CALCULATE INTERACTIONS *
1093 **************************/
1095 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
1098 r20
= _mm_mul_pd(rsq20
,rinv20
);
1100 /* Compute parameters for interactions between i and j atoms */
1101 qq20
= _mm_mul_pd(iq2
,jq0
);
1103 /* EWALD ELECTROSTATICS */
1105 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1106 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
1107 ewitab
= _mm_cvttpd_epi32(ewrt
);
1108 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1109 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1111 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1112 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
1114 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
1118 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1120 /* Calculate temporary vectorial force */
1121 tx
= _mm_mul_pd(fscal
,dx20
);
1122 ty
= _mm_mul_pd(fscal
,dy20
);
1123 tz
= _mm_mul_pd(fscal
,dz20
);
1125 /* Update vectorial force */
1126 fix2
= _mm_add_pd(fix2
,tx
);
1127 fiy2
= _mm_add_pd(fiy2
,ty
);
1128 fiz2
= _mm_add_pd(fiz2
,tz
);
1130 fjx0
= _mm_add_pd(fjx0
,tx
);
1131 fjy0
= _mm_add_pd(fjy0
,ty
);
1132 fjz0
= _mm_add_pd(fjz0
,tz
);
1136 /**************************
1137 * CALCULATE INTERACTIONS *
1138 **************************/
1140 if (gmx_mm_any_lt(rsq30
,rcutoff2
))
1143 r30
= _mm_mul_pd(rsq30
,rinv30
);
1145 /* Compute parameters for interactions between i and j atoms */
1146 qq30
= _mm_mul_pd(iq3
,jq0
);
1148 /* EWALD ELECTROSTATICS */
1150 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1151 ewrt
= _mm_mul_pd(r30
,ewtabscale
);
1152 ewitab
= _mm_cvttpd_epi32(ewrt
);
1153 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1154 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1156 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1157 felec
= _mm_mul_pd(_mm_mul_pd(qq30
,rinv30
),_mm_sub_pd(rinvsq30
,felec
));
1159 cutoff_mask
= _mm_cmplt_pd(rsq30
,rcutoff2
);
1163 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1165 /* Calculate temporary vectorial force */
1166 tx
= _mm_mul_pd(fscal
,dx30
);
1167 ty
= _mm_mul_pd(fscal
,dy30
);
1168 tz
= _mm_mul_pd(fscal
,dz30
);
1170 /* Update vectorial force */
1171 fix3
= _mm_add_pd(fix3
,tx
);
1172 fiy3
= _mm_add_pd(fiy3
,ty
);
1173 fiz3
= _mm_add_pd(fiz3
,tz
);
1175 fjx0
= _mm_add_pd(fjx0
,tx
);
1176 fjy0
= _mm_add_pd(fjy0
,ty
);
1177 fjz0
= _mm_add_pd(fjz0
,tz
);
1181 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
);
1183 /* Inner loop uses 169 flops */
1186 if(jidx
<j_index_end
)
1190 j_coord_offsetA
= DIM
*jnrA
;
1192 /* load j atom coordinates */
1193 gmx_mm_load_1rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
1196 /* Calculate displacement vector */
1197 dx00
= _mm_sub_pd(ix0
,jx0
);
1198 dy00
= _mm_sub_pd(iy0
,jy0
);
1199 dz00
= _mm_sub_pd(iz0
,jz0
);
1200 dx10
= _mm_sub_pd(ix1
,jx0
);
1201 dy10
= _mm_sub_pd(iy1
,jy0
);
1202 dz10
= _mm_sub_pd(iz1
,jz0
);
1203 dx20
= _mm_sub_pd(ix2
,jx0
);
1204 dy20
= _mm_sub_pd(iy2
,jy0
);
1205 dz20
= _mm_sub_pd(iz2
,jz0
);
1206 dx30
= _mm_sub_pd(ix3
,jx0
);
1207 dy30
= _mm_sub_pd(iy3
,jy0
);
1208 dz30
= _mm_sub_pd(iz3
,jz0
);
1210 /* Calculate squared distance and things based on it */
1211 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
1212 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
1213 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
1214 rsq30
= gmx_mm_calc_rsq_pd(dx30
,dy30
,dz30
);
1216 rinv00
= sse2_invsqrt_d(rsq00
);
1217 rinv10
= sse2_invsqrt_d(rsq10
);
1218 rinv20
= sse2_invsqrt_d(rsq20
);
1219 rinv30
= sse2_invsqrt_d(rsq30
);
1221 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
1222 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
1223 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
1224 rinvsq30
= _mm_mul_pd(rinv30
,rinv30
);
1226 /* Load parameters for j particles */
1227 jq0
= _mm_load_sd(charge
+jnrA
+0);
1228 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
1230 fjx0
= _mm_setzero_pd();
1231 fjy0
= _mm_setzero_pd();
1232 fjz0
= _mm_setzero_pd();
1234 /**************************
1235 * CALCULATE INTERACTIONS *
1236 **************************/
1238 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
1241 r00
= _mm_mul_pd(rsq00
,rinv00
);
1243 /* Compute parameters for interactions between i and j atoms */
1244 gmx_mm_load_1pair_swizzle_pd(vdwparam
+vdwioffset0
+vdwjidx0A
,&c6_00
,&c12_00
);
1246 c6grid_00
= gmx_mm_load_1real_pd(vdwgridparam
+vdwioffset0
+vdwjidx0A
);
1248 /* Analytical LJ-PME */
1249 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
1250 ewcljrsq
= _mm_mul_pd(ewclj2
,rsq00
);
1251 ewclj6
= _mm_mul_pd(ewclj2
,_mm_mul_pd(ewclj2
,ewclj2
));
1252 exponent
= sse2_exp_d(ewcljrsq
);
1253 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1254 poly
= _mm_mul_pd(exponent
,_mm_add_pd(_mm_sub_pd(one
,ewcljrsq
),_mm_mul_pd(_mm_mul_pd(ewcljrsq
,ewcljrsq
),one_half
)));
1255 /* f6A = 6 * C6grid * (1 - poly) */
1256 f6A
= _mm_mul_pd(c6grid_00
,_mm_sub_pd(one
,poly
));
1257 /* f6B = C6grid * exponent * beta^6 */
1258 f6B
= _mm_mul_pd(_mm_mul_pd(c6grid_00
,one_sixth
),_mm_mul_pd(exponent
,ewclj6
));
1259 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1260 fvdw
= _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00
,rinvsix
),_mm_sub_pd(c6_00
,f6A
)),rinvsix
),f6B
),rinvsq00
);
1262 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
1266 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1268 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1270 /* Calculate temporary vectorial force */
1271 tx
= _mm_mul_pd(fscal
,dx00
);
1272 ty
= _mm_mul_pd(fscal
,dy00
);
1273 tz
= _mm_mul_pd(fscal
,dz00
);
1275 /* Update vectorial force */
1276 fix0
= _mm_add_pd(fix0
,tx
);
1277 fiy0
= _mm_add_pd(fiy0
,ty
);
1278 fiz0
= _mm_add_pd(fiz0
,tz
);
1280 fjx0
= _mm_add_pd(fjx0
,tx
);
1281 fjy0
= _mm_add_pd(fjy0
,ty
);
1282 fjz0
= _mm_add_pd(fjz0
,tz
);
1286 /**************************
1287 * CALCULATE INTERACTIONS *
1288 **************************/
1290 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
1293 r10
= _mm_mul_pd(rsq10
,rinv10
);
1295 /* Compute parameters for interactions between i and j atoms */
1296 qq10
= _mm_mul_pd(iq1
,jq0
);
1298 /* EWALD ELECTROSTATICS */
1300 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1301 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
1302 ewitab
= _mm_cvttpd_epi32(ewrt
);
1303 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1304 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
1305 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1306 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
1308 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
1312 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1314 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1316 /* Calculate temporary vectorial force */
1317 tx
= _mm_mul_pd(fscal
,dx10
);
1318 ty
= _mm_mul_pd(fscal
,dy10
);
1319 tz
= _mm_mul_pd(fscal
,dz10
);
1321 /* Update vectorial force */
1322 fix1
= _mm_add_pd(fix1
,tx
);
1323 fiy1
= _mm_add_pd(fiy1
,ty
);
1324 fiz1
= _mm_add_pd(fiz1
,tz
);
1326 fjx0
= _mm_add_pd(fjx0
,tx
);
1327 fjy0
= _mm_add_pd(fjy0
,ty
);
1328 fjz0
= _mm_add_pd(fjz0
,tz
);
1332 /**************************
1333 * CALCULATE INTERACTIONS *
1334 **************************/
1336 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
1339 r20
= _mm_mul_pd(rsq20
,rinv20
);
1341 /* Compute parameters for interactions between i and j atoms */
1342 qq20
= _mm_mul_pd(iq2
,jq0
);
1344 /* EWALD ELECTROSTATICS */
1346 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1347 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
1348 ewitab
= _mm_cvttpd_epi32(ewrt
);
1349 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1350 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
1351 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1352 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
1354 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
1358 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1360 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1362 /* Calculate temporary vectorial force */
1363 tx
= _mm_mul_pd(fscal
,dx20
);
1364 ty
= _mm_mul_pd(fscal
,dy20
);
1365 tz
= _mm_mul_pd(fscal
,dz20
);
1367 /* Update vectorial force */
1368 fix2
= _mm_add_pd(fix2
,tx
);
1369 fiy2
= _mm_add_pd(fiy2
,ty
);
1370 fiz2
= _mm_add_pd(fiz2
,tz
);
1372 fjx0
= _mm_add_pd(fjx0
,tx
);
1373 fjy0
= _mm_add_pd(fjy0
,ty
);
1374 fjz0
= _mm_add_pd(fjz0
,tz
);
1378 /**************************
1379 * CALCULATE INTERACTIONS *
1380 **************************/
1382 if (gmx_mm_any_lt(rsq30
,rcutoff2
))
1385 r30
= _mm_mul_pd(rsq30
,rinv30
);
1387 /* Compute parameters for interactions between i and j atoms */
1388 qq30
= _mm_mul_pd(iq3
,jq0
);
1390 /* EWALD ELECTROSTATICS */
1392 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1393 ewrt
= _mm_mul_pd(r30
,ewtabscale
);
1394 ewitab
= _mm_cvttpd_epi32(ewrt
);
1395 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1396 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
1397 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1398 felec
= _mm_mul_pd(_mm_mul_pd(qq30
,rinv30
),_mm_sub_pd(rinvsq30
,felec
));
1400 cutoff_mask
= _mm_cmplt_pd(rsq30
,rcutoff2
);
1404 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1406 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1408 /* Calculate temporary vectorial force */
1409 tx
= _mm_mul_pd(fscal
,dx30
);
1410 ty
= _mm_mul_pd(fscal
,dy30
);
1411 tz
= _mm_mul_pd(fscal
,dz30
);
1413 /* Update vectorial force */
1414 fix3
= _mm_add_pd(fix3
,tx
);
1415 fiy3
= _mm_add_pd(fiy3
,ty
);
1416 fiz3
= _mm_add_pd(fiz3
,tz
);
1418 fjx0
= _mm_add_pd(fjx0
,tx
);
1419 fjy0
= _mm_add_pd(fjy0
,ty
);
1420 fjz0
= _mm_add_pd(fjz0
,tz
);
1424 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
);
1426 /* Inner loop uses 169 flops */
1429 /* End of innermost loop */
1431 gmx_mm_update_iforce_4atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
1432 f
+i_coord_offset
,fshift
+i_shift_offset
);
1434 /* Increment number of inner iterations */
1435 inneriter
+= j_index_end
- j_index_start
;
1437 /* Outer loop uses 24 flops */
1440 /* Increment number of outer iterations */
1443 /* Update outer/inner flops */
1445 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4_F
,outeriter
*24 + inneriter
*169);