2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/math/vec.h"
46 #include "gromacs/legacyheaders/nrnb.h"
48 #include "gromacs/simd/math_x86_sse2_double.h"
49 #include "kernelutil_x86_sse2_double.h"
52 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_sse2_double
53 * Electrostatics interaction: Ewald
54 * VdW interaction: LJEwald
55 * Geometry: Water4-Particle
56 * Calculate force/pot: PotentialAndForce
59 nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_sse2_double
60 (t_nblist
* gmx_restrict nlist
,
61 rvec
* gmx_restrict xx
,
62 rvec
* gmx_restrict ff
,
63 t_forcerec
* gmx_restrict fr
,
64 t_mdatoms
* gmx_restrict mdatoms
,
65 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
66 t_nrnb
* gmx_restrict nrnb
)
68 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
69 * just 0 for non-waters.
70 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
71 * jnr indices corresponding to data put in the four positions in the SIMD register.
73 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
74 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
76 int j_coord_offsetA
,j_coord_offsetB
;
77 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
79 real
*shiftvec
,*fshift
,*x
,*f
;
80 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
82 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
84 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
86 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
88 __m128d ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
89 int vdwjidx0A
,vdwjidx0B
;
90 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
91 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
92 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
93 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
94 __m128d dx30
,dy30
,dz30
,rsq30
,rinv30
,rinvsq30
,r30
,qq30
,c6_30
,c12_30
;
95 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
98 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
101 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
102 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
107 __m128d ewclj
,ewclj2
,ewclj6
,ewcljrsq
,poly
,exponent
,f6A
,f6B
,sh_lj_ewald
;
109 __m128d one_half
= _mm_set1_pd(0.5);
110 __m128d minus_one
= _mm_set1_pd(-1.0);
112 __m128d ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
114 __m128d dummy_mask
,cutoff_mask
;
115 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
116 __m128d one
= _mm_set1_pd(1.0);
117 __m128d two
= _mm_set1_pd(2.0);
123 jindex
= nlist
->jindex
;
125 shiftidx
= nlist
->shift
;
127 shiftvec
= fr
->shift_vec
[0];
128 fshift
= fr
->fshift
[0];
129 facel
= _mm_set1_pd(fr
->epsfac
);
130 charge
= mdatoms
->chargeA
;
131 nvdwtype
= fr
->ntype
;
133 vdwtype
= mdatoms
->typeA
;
134 vdwgridparam
= fr
->ljpme_c6grid
;
135 sh_lj_ewald
= _mm_set1_pd(fr
->ic
->sh_lj_ewald
);
136 ewclj
= _mm_set1_pd(fr
->ewaldcoeff_lj
);
137 ewclj2
= _mm_mul_pd(minus_one
,_mm_mul_pd(ewclj
,ewclj
));
139 sh_ewald
= _mm_set1_pd(fr
->ic
->sh_ewald
);
140 ewtab
= fr
->ic
->tabq_coul_FDV0
;
141 ewtabscale
= _mm_set1_pd(fr
->ic
->tabq_scale
);
142 ewtabhalfspace
= _mm_set1_pd(0.5/fr
->ic
->tabq_scale
);
144 /* Setup water-specific parameters */
145 inr
= nlist
->iinr
[0];
146 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
147 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
148 iq3
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+3]));
149 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
151 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
152 rcutoff_scalar
= fr
->rcoulomb
;
153 rcutoff
= _mm_set1_pd(rcutoff_scalar
);
154 rcutoff2
= _mm_mul_pd(rcutoff
,rcutoff
);
156 sh_vdw_invrcut6
= _mm_set1_pd(fr
->ic
->sh_invrc6
);
157 rvdw
= _mm_set1_pd(fr
->rvdw
);
159 /* Avoid stupid compiler warnings */
167 /* Start outer loop over neighborlists */
168 for(iidx
=0; iidx
<nri
; iidx
++)
170 /* Load shift vector for this list */
171 i_shift_offset
= DIM
*shiftidx
[iidx
];
173 /* Load limits for loop over neighbors */
174 j_index_start
= jindex
[iidx
];
175 j_index_end
= jindex
[iidx
+1];
177 /* Get outer coordinate index */
179 i_coord_offset
= DIM
*inr
;
181 /* Load i particle coords and add shift vector */
182 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
183 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
185 fix0
= _mm_setzero_pd();
186 fiy0
= _mm_setzero_pd();
187 fiz0
= _mm_setzero_pd();
188 fix1
= _mm_setzero_pd();
189 fiy1
= _mm_setzero_pd();
190 fiz1
= _mm_setzero_pd();
191 fix2
= _mm_setzero_pd();
192 fiy2
= _mm_setzero_pd();
193 fiz2
= _mm_setzero_pd();
194 fix3
= _mm_setzero_pd();
195 fiy3
= _mm_setzero_pd();
196 fiz3
= _mm_setzero_pd();
198 /* Reset potential sums */
199 velecsum
= _mm_setzero_pd();
200 vvdwsum
= _mm_setzero_pd();
202 /* Start inner kernel loop */
203 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
206 /* Get j neighbor index, and coordinate index */
209 j_coord_offsetA
= DIM
*jnrA
;
210 j_coord_offsetB
= DIM
*jnrB
;
212 /* load j atom coordinates */
213 gmx_mm_load_1rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
216 /* Calculate displacement vector */
217 dx00
= _mm_sub_pd(ix0
,jx0
);
218 dy00
= _mm_sub_pd(iy0
,jy0
);
219 dz00
= _mm_sub_pd(iz0
,jz0
);
220 dx10
= _mm_sub_pd(ix1
,jx0
);
221 dy10
= _mm_sub_pd(iy1
,jy0
);
222 dz10
= _mm_sub_pd(iz1
,jz0
);
223 dx20
= _mm_sub_pd(ix2
,jx0
);
224 dy20
= _mm_sub_pd(iy2
,jy0
);
225 dz20
= _mm_sub_pd(iz2
,jz0
);
226 dx30
= _mm_sub_pd(ix3
,jx0
);
227 dy30
= _mm_sub_pd(iy3
,jy0
);
228 dz30
= _mm_sub_pd(iz3
,jz0
);
230 /* Calculate squared distance and things based on it */
231 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
232 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
233 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
234 rsq30
= gmx_mm_calc_rsq_pd(dx30
,dy30
,dz30
);
236 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
237 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
238 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
239 rinv30
= gmx_mm_invsqrt_pd(rsq30
);
241 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
242 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
243 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
244 rinvsq30
= _mm_mul_pd(rinv30
,rinv30
);
246 /* Load parameters for j particles */
247 jq0
= gmx_mm_load_2real_swizzle_pd(charge
+jnrA
+0,charge
+jnrB
+0);
248 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
249 vdwjidx0B
= 2*vdwtype
[jnrB
+0];
251 fjx0
= _mm_setzero_pd();
252 fjy0
= _mm_setzero_pd();
253 fjz0
= _mm_setzero_pd();
255 /**************************
256 * CALCULATE INTERACTIONS *
257 **************************/
259 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
262 r00
= _mm_mul_pd(rsq00
,rinv00
);
264 /* Compute parameters for interactions between i and j atoms */
265 gmx_mm_load_2pair_swizzle_pd(vdwparam
+vdwioffset0
+vdwjidx0A
,
266 vdwparam
+vdwioffset0
+vdwjidx0B
,&c6_00
,&c12_00
);
268 c6grid_00
= gmx_mm_load_2real_swizzle_pd(vdwgridparam
+vdwioffset0
+vdwjidx0A
,
269 vdwgridparam
+vdwioffset0
+vdwjidx0B
);
271 /* Analytical LJ-PME */
272 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
273 ewcljrsq
= _mm_mul_pd(ewclj2
,rsq00
);
274 ewclj6
= _mm_mul_pd(ewclj2
,_mm_mul_pd(ewclj2
,ewclj2
));
275 exponent
= gmx_simd_exp_d(ewcljrsq
);
276 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
277 poly
= _mm_mul_pd(exponent
,_mm_add_pd(_mm_sub_pd(one
,ewcljrsq
),_mm_mul_pd(_mm_mul_pd(ewcljrsq
,ewcljrsq
),one_half
)));
278 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
279 vvdw6
= _mm_mul_pd(_mm_sub_pd(c6_00
,_mm_mul_pd(c6grid_00
,_mm_sub_pd(one
,poly
))),rinvsix
);
280 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
281 vvdw
= _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12
, _mm_mul_pd(c12_00
,_mm_mul_pd(sh_vdw_invrcut6
,sh_vdw_invrcut6
))),one_twelfth
),
282 _mm_mul_pd( _mm_sub_pd(vvdw6
,_mm_add_pd(_mm_mul_pd(c6_00
,sh_vdw_invrcut6
),_mm_mul_pd(c6grid_00
,sh_lj_ewald
))),one_sixth
));
283 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
284 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,_mm_sub_pd(vvdw6
,_mm_mul_pd(_mm_mul_pd(c6grid_00
,one_sixth
),_mm_mul_pd(exponent
,ewclj6
)))),rinvsq00
);
286 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
288 /* Update potential sum for this i atom from the interaction with this j atom. */
289 vvdw
= _mm_and_pd(vvdw
,cutoff_mask
);
290 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
294 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
296 /* Calculate temporary vectorial force */
297 tx
= _mm_mul_pd(fscal
,dx00
);
298 ty
= _mm_mul_pd(fscal
,dy00
);
299 tz
= _mm_mul_pd(fscal
,dz00
);
301 /* Update vectorial force */
302 fix0
= _mm_add_pd(fix0
,tx
);
303 fiy0
= _mm_add_pd(fiy0
,ty
);
304 fiz0
= _mm_add_pd(fiz0
,tz
);
306 fjx0
= _mm_add_pd(fjx0
,tx
);
307 fjy0
= _mm_add_pd(fjy0
,ty
);
308 fjz0
= _mm_add_pd(fjz0
,tz
);
312 /**************************
313 * CALCULATE INTERACTIONS *
314 **************************/
316 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
319 r10
= _mm_mul_pd(rsq10
,rinv10
);
321 /* Compute parameters for interactions between i and j atoms */
322 qq10
= _mm_mul_pd(iq1
,jq0
);
324 /* EWALD ELECTROSTATICS */
326 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
327 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
328 ewitab
= _mm_cvttpd_epi32(ewrt
);
329 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
330 ewitab
= _mm_slli_epi32(ewitab
,2);
331 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
332 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
333 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
334 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
335 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
336 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
337 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
338 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
339 velec
= _mm_mul_pd(qq10
,_mm_sub_pd(_mm_sub_pd(rinv10
,sh_ewald
),velec
));
340 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
342 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
344 /* Update potential sum for this i atom from the interaction with this j atom. */
345 velec
= _mm_and_pd(velec
,cutoff_mask
);
346 velecsum
= _mm_add_pd(velecsum
,velec
);
350 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
352 /* Calculate temporary vectorial force */
353 tx
= _mm_mul_pd(fscal
,dx10
);
354 ty
= _mm_mul_pd(fscal
,dy10
);
355 tz
= _mm_mul_pd(fscal
,dz10
);
357 /* Update vectorial force */
358 fix1
= _mm_add_pd(fix1
,tx
);
359 fiy1
= _mm_add_pd(fiy1
,ty
);
360 fiz1
= _mm_add_pd(fiz1
,tz
);
362 fjx0
= _mm_add_pd(fjx0
,tx
);
363 fjy0
= _mm_add_pd(fjy0
,ty
);
364 fjz0
= _mm_add_pd(fjz0
,tz
);
368 /**************************
369 * CALCULATE INTERACTIONS *
370 **************************/
372 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
375 r20
= _mm_mul_pd(rsq20
,rinv20
);
377 /* Compute parameters for interactions between i and j atoms */
378 qq20
= _mm_mul_pd(iq2
,jq0
);
380 /* EWALD ELECTROSTATICS */
382 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
383 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
384 ewitab
= _mm_cvttpd_epi32(ewrt
);
385 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
386 ewitab
= _mm_slli_epi32(ewitab
,2);
387 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
388 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
389 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
390 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
391 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
392 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
393 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
394 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
395 velec
= _mm_mul_pd(qq20
,_mm_sub_pd(_mm_sub_pd(rinv20
,sh_ewald
),velec
));
396 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
398 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
400 /* Update potential sum for this i atom from the interaction with this j atom. */
401 velec
= _mm_and_pd(velec
,cutoff_mask
);
402 velecsum
= _mm_add_pd(velecsum
,velec
);
406 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
408 /* Calculate temporary vectorial force */
409 tx
= _mm_mul_pd(fscal
,dx20
);
410 ty
= _mm_mul_pd(fscal
,dy20
);
411 tz
= _mm_mul_pd(fscal
,dz20
);
413 /* Update vectorial force */
414 fix2
= _mm_add_pd(fix2
,tx
);
415 fiy2
= _mm_add_pd(fiy2
,ty
);
416 fiz2
= _mm_add_pd(fiz2
,tz
);
418 fjx0
= _mm_add_pd(fjx0
,tx
);
419 fjy0
= _mm_add_pd(fjy0
,ty
);
420 fjz0
= _mm_add_pd(fjz0
,tz
);
424 /**************************
425 * CALCULATE INTERACTIONS *
426 **************************/
428 if (gmx_mm_any_lt(rsq30
,rcutoff2
))
431 r30
= _mm_mul_pd(rsq30
,rinv30
);
433 /* Compute parameters for interactions between i and j atoms */
434 qq30
= _mm_mul_pd(iq3
,jq0
);
436 /* EWALD ELECTROSTATICS */
438 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
439 ewrt
= _mm_mul_pd(r30
,ewtabscale
);
440 ewitab
= _mm_cvttpd_epi32(ewrt
);
441 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
442 ewitab
= _mm_slli_epi32(ewitab
,2);
443 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
444 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
445 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
446 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
447 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
448 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
449 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
450 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
451 velec
= _mm_mul_pd(qq30
,_mm_sub_pd(_mm_sub_pd(rinv30
,sh_ewald
),velec
));
452 felec
= _mm_mul_pd(_mm_mul_pd(qq30
,rinv30
),_mm_sub_pd(rinvsq30
,felec
));
454 cutoff_mask
= _mm_cmplt_pd(rsq30
,rcutoff2
);
456 /* Update potential sum for this i atom from the interaction with this j atom. */
457 velec
= _mm_and_pd(velec
,cutoff_mask
);
458 velecsum
= _mm_add_pd(velecsum
,velec
);
462 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
464 /* Calculate temporary vectorial force */
465 tx
= _mm_mul_pd(fscal
,dx30
);
466 ty
= _mm_mul_pd(fscal
,dy30
);
467 tz
= _mm_mul_pd(fscal
,dz30
);
469 /* Update vectorial force */
470 fix3
= _mm_add_pd(fix3
,tx
);
471 fiy3
= _mm_add_pd(fiy3
,ty
);
472 fiz3
= _mm_add_pd(fiz3
,tz
);
474 fjx0
= _mm_add_pd(fjx0
,tx
);
475 fjy0
= _mm_add_pd(fjy0
,ty
);
476 fjz0
= _mm_add_pd(fjz0
,tz
);
480 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
);
482 /* Inner loop uses 203 flops */
489 j_coord_offsetA
= DIM
*jnrA
;
491 /* load j atom coordinates */
492 gmx_mm_load_1rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
495 /* Calculate displacement vector */
496 dx00
= _mm_sub_pd(ix0
,jx0
);
497 dy00
= _mm_sub_pd(iy0
,jy0
);
498 dz00
= _mm_sub_pd(iz0
,jz0
);
499 dx10
= _mm_sub_pd(ix1
,jx0
);
500 dy10
= _mm_sub_pd(iy1
,jy0
);
501 dz10
= _mm_sub_pd(iz1
,jz0
);
502 dx20
= _mm_sub_pd(ix2
,jx0
);
503 dy20
= _mm_sub_pd(iy2
,jy0
);
504 dz20
= _mm_sub_pd(iz2
,jz0
);
505 dx30
= _mm_sub_pd(ix3
,jx0
);
506 dy30
= _mm_sub_pd(iy3
,jy0
);
507 dz30
= _mm_sub_pd(iz3
,jz0
);
509 /* Calculate squared distance and things based on it */
510 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
511 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
512 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
513 rsq30
= gmx_mm_calc_rsq_pd(dx30
,dy30
,dz30
);
515 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
516 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
517 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
518 rinv30
= gmx_mm_invsqrt_pd(rsq30
);
520 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
521 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
522 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
523 rinvsq30
= _mm_mul_pd(rinv30
,rinv30
);
525 /* Load parameters for j particles */
526 jq0
= _mm_load_sd(charge
+jnrA
+0);
527 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
529 fjx0
= _mm_setzero_pd();
530 fjy0
= _mm_setzero_pd();
531 fjz0
= _mm_setzero_pd();
533 /**************************
534 * CALCULATE INTERACTIONS *
535 **************************/
537 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
540 r00
= _mm_mul_pd(rsq00
,rinv00
);
542 /* Compute parameters for interactions between i and j atoms */
543 gmx_mm_load_1pair_swizzle_pd(vdwparam
+vdwioffset0
+vdwjidx0A
,&c6_00
,&c12_00
);
545 c6grid_00
= gmx_mm_load_1real_pd(vdwgridparam
+vdwioffset0
+vdwjidx0A
);
547 /* Analytical LJ-PME */
548 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
549 ewcljrsq
= _mm_mul_pd(ewclj2
,rsq00
);
550 ewclj6
= _mm_mul_pd(ewclj2
,_mm_mul_pd(ewclj2
,ewclj2
));
551 exponent
= gmx_simd_exp_d(ewcljrsq
);
552 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
553 poly
= _mm_mul_pd(exponent
,_mm_add_pd(_mm_sub_pd(one
,ewcljrsq
),_mm_mul_pd(_mm_mul_pd(ewcljrsq
,ewcljrsq
),one_half
)));
554 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
555 vvdw6
= _mm_mul_pd(_mm_sub_pd(c6_00
,_mm_mul_pd(c6grid_00
,_mm_sub_pd(one
,poly
))),rinvsix
);
556 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
557 vvdw
= _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12
, _mm_mul_pd(c12_00
,_mm_mul_pd(sh_vdw_invrcut6
,sh_vdw_invrcut6
))),one_twelfth
),
558 _mm_mul_pd( _mm_sub_pd(vvdw6
,_mm_add_pd(_mm_mul_pd(c6_00
,sh_vdw_invrcut6
),_mm_mul_pd(c6grid_00
,sh_lj_ewald
))),one_sixth
));
559 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
560 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,_mm_sub_pd(vvdw6
,_mm_mul_pd(_mm_mul_pd(c6grid_00
,one_sixth
),_mm_mul_pd(exponent
,ewclj6
)))),rinvsq00
);
562 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
564 /* Update potential sum for this i atom from the interaction with this j atom. */
565 vvdw
= _mm_and_pd(vvdw
,cutoff_mask
);
566 vvdw
= _mm_unpacklo_pd(vvdw
,_mm_setzero_pd());
567 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
571 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
573 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
575 /* Calculate temporary vectorial force */
576 tx
= _mm_mul_pd(fscal
,dx00
);
577 ty
= _mm_mul_pd(fscal
,dy00
);
578 tz
= _mm_mul_pd(fscal
,dz00
);
580 /* Update vectorial force */
581 fix0
= _mm_add_pd(fix0
,tx
);
582 fiy0
= _mm_add_pd(fiy0
,ty
);
583 fiz0
= _mm_add_pd(fiz0
,tz
);
585 fjx0
= _mm_add_pd(fjx0
,tx
);
586 fjy0
= _mm_add_pd(fjy0
,ty
);
587 fjz0
= _mm_add_pd(fjz0
,tz
);
591 /**************************
592 * CALCULATE INTERACTIONS *
593 **************************/
595 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
598 r10
= _mm_mul_pd(rsq10
,rinv10
);
600 /* Compute parameters for interactions between i and j atoms */
601 qq10
= _mm_mul_pd(iq1
,jq0
);
603 /* EWALD ELECTROSTATICS */
605 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
606 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
607 ewitab
= _mm_cvttpd_epi32(ewrt
);
608 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
609 ewitab
= _mm_slli_epi32(ewitab
,2);
610 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
611 ewtabD
= _mm_setzero_pd();
612 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
613 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
614 ewtabFn
= _mm_setzero_pd();
615 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
616 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
617 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
618 velec
= _mm_mul_pd(qq10
,_mm_sub_pd(_mm_sub_pd(rinv10
,sh_ewald
),velec
));
619 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
621 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
623 /* Update potential sum for this i atom from the interaction with this j atom. */
624 velec
= _mm_and_pd(velec
,cutoff_mask
);
625 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
626 velecsum
= _mm_add_pd(velecsum
,velec
);
630 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
632 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
634 /* Calculate temporary vectorial force */
635 tx
= _mm_mul_pd(fscal
,dx10
);
636 ty
= _mm_mul_pd(fscal
,dy10
);
637 tz
= _mm_mul_pd(fscal
,dz10
);
639 /* Update vectorial force */
640 fix1
= _mm_add_pd(fix1
,tx
);
641 fiy1
= _mm_add_pd(fiy1
,ty
);
642 fiz1
= _mm_add_pd(fiz1
,tz
);
644 fjx0
= _mm_add_pd(fjx0
,tx
);
645 fjy0
= _mm_add_pd(fjy0
,ty
);
646 fjz0
= _mm_add_pd(fjz0
,tz
);
650 /**************************
651 * CALCULATE INTERACTIONS *
652 **************************/
654 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
657 r20
= _mm_mul_pd(rsq20
,rinv20
);
659 /* Compute parameters for interactions between i and j atoms */
660 qq20
= _mm_mul_pd(iq2
,jq0
);
662 /* EWALD ELECTROSTATICS */
664 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
665 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
666 ewitab
= _mm_cvttpd_epi32(ewrt
);
667 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
668 ewitab
= _mm_slli_epi32(ewitab
,2);
669 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
670 ewtabD
= _mm_setzero_pd();
671 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
672 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
673 ewtabFn
= _mm_setzero_pd();
674 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
675 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
676 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
677 velec
= _mm_mul_pd(qq20
,_mm_sub_pd(_mm_sub_pd(rinv20
,sh_ewald
),velec
));
678 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
680 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
682 /* Update potential sum for this i atom from the interaction with this j atom. */
683 velec
= _mm_and_pd(velec
,cutoff_mask
);
684 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
685 velecsum
= _mm_add_pd(velecsum
,velec
);
689 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
691 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
693 /* Calculate temporary vectorial force */
694 tx
= _mm_mul_pd(fscal
,dx20
);
695 ty
= _mm_mul_pd(fscal
,dy20
);
696 tz
= _mm_mul_pd(fscal
,dz20
);
698 /* Update vectorial force */
699 fix2
= _mm_add_pd(fix2
,tx
);
700 fiy2
= _mm_add_pd(fiy2
,ty
);
701 fiz2
= _mm_add_pd(fiz2
,tz
);
703 fjx0
= _mm_add_pd(fjx0
,tx
);
704 fjy0
= _mm_add_pd(fjy0
,ty
);
705 fjz0
= _mm_add_pd(fjz0
,tz
);
709 /**************************
710 * CALCULATE INTERACTIONS *
711 **************************/
713 if (gmx_mm_any_lt(rsq30
,rcutoff2
))
716 r30
= _mm_mul_pd(rsq30
,rinv30
);
718 /* Compute parameters for interactions between i and j atoms */
719 qq30
= _mm_mul_pd(iq3
,jq0
);
721 /* EWALD ELECTROSTATICS */
723 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
724 ewrt
= _mm_mul_pd(r30
,ewtabscale
);
725 ewitab
= _mm_cvttpd_epi32(ewrt
);
726 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
727 ewitab
= _mm_slli_epi32(ewitab
,2);
728 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
729 ewtabD
= _mm_setzero_pd();
730 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
731 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
732 ewtabFn
= _mm_setzero_pd();
733 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
734 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
735 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
736 velec
= _mm_mul_pd(qq30
,_mm_sub_pd(_mm_sub_pd(rinv30
,sh_ewald
),velec
));
737 felec
= _mm_mul_pd(_mm_mul_pd(qq30
,rinv30
),_mm_sub_pd(rinvsq30
,felec
));
739 cutoff_mask
= _mm_cmplt_pd(rsq30
,rcutoff2
);
741 /* Update potential sum for this i atom from the interaction with this j atom. */
742 velec
= _mm_and_pd(velec
,cutoff_mask
);
743 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
744 velecsum
= _mm_add_pd(velecsum
,velec
);
748 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
750 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
752 /* Calculate temporary vectorial force */
753 tx
= _mm_mul_pd(fscal
,dx30
);
754 ty
= _mm_mul_pd(fscal
,dy30
);
755 tz
= _mm_mul_pd(fscal
,dz30
);
757 /* Update vectorial force */
758 fix3
= _mm_add_pd(fix3
,tx
);
759 fiy3
= _mm_add_pd(fiy3
,ty
);
760 fiz3
= _mm_add_pd(fiz3
,tz
);
762 fjx0
= _mm_add_pd(fjx0
,tx
);
763 fjy0
= _mm_add_pd(fjy0
,ty
);
764 fjz0
= _mm_add_pd(fjz0
,tz
);
768 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
);
770 /* Inner loop uses 203 flops */
773 /* End of innermost loop */
775 gmx_mm_update_iforce_4atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
776 f
+i_coord_offset
,fshift
+i_shift_offset
);
779 /* Update potential energies */
780 gmx_mm_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
781 gmx_mm_update_1pot_pd(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
783 /* Increment number of inner iterations */
784 inneriter
+= j_index_end
- j_index_start
;
786 /* Outer loop uses 26 flops */
789 /* Increment number of outer iterations */
792 /* Update outer/inner flops */
794 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4_VF
,outeriter
*26 + inneriter
*203);
797 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_sse2_double
798 * Electrostatics interaction: Ewald
799 * VdW interaction: LJEwald
800 * Geometry: Water4-Particle
801 * Calculate force/pot: Force
804 nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_sse2_double
805 (t_nblist
* gmx_restrict nlist
,
806 rvec
* gmx_restrict xx
,
807 rvec
* gmx_restrict ff
,
808 t_forcerec
* gmx_restrict fr
,
809 t_mdatoms
* gmx_restrict mdatoms
,
810 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
811 t_nrnb
* gmx_restrict nrnb
)
813 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
814 * just 0 for non-waters.
815 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
816 * jnr indices corresponding to data put in the four positions in the SIMD register.
818 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
819 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
821 int j_coord_offsetA
,j_coord_offsetB
;
822 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
824 real
*shiftvec
,*fshift
,*x
,*f
;
825 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
827 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
829 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
831 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
833 __m128d ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
834 int vdwjidx0A
,vdwjidx0B
;
835 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
836 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
837 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
838 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
839 __m128d dx30
,dy30
,dz30
,rsq30
,rinv30
,rinvsq30
,r30
,qq30
,c6_30
,c12_30
;
840 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
843 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
846 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
847 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
852 __m128d ewclj
,ewclj2
,ewclj6
,ewcljrsq
,poly
,exponent
,f6A
,f6B
,sh_lj_ewald
;
854 __m128d one_half
= _mm_set1_pd(0.5);
855 __m128d minus_one
= _mm_set1_pd(-1.0);
857 __m128d ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
859 __m128d dummy_mask
,cutoff_mask
;
860 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
861 __m128d one
= _mm_set1_pd(1.0);
862 __m128d two
= _mm_set1_pd(2.0);
868 jindex
= nlist
->jindex
;
870 shiftidx
= nlist
->shift
;
872 shiftvec
= fr
->shift_vec
[0];
873 fshift
= fr
->fshift
[0];
874 facel
= _mm_set1_pd(fr
->epsfac
);
875 charge
= mdatoms
->chargeA
;
876 nvdwtype
= fr
->ntype
;
878 vdwtype
= mdatoms
->typeA
;
879 vdwgridparam
= fr
->ljpme_c6grid
;
880 sh_lj_ewald
= _mm_set1_pd(fr
->ic
->sh_lj_ewald
);
881 ewclj
= _mm_set1_pd(fr
->ewaldcoeff_lj
);
882 ewclj2
= _mm_mul_pd(minus_one
,_mm_mul_pd(ewclj
,ewclj
));
884 sh_ewald
= _mm_set1_pd(fr
->ic
->sh_ewald
);
885 ewtab
= fr
->ic
->tabq_coul_F
;
886 ewtabscale
= _mm_set1_pd(fr
->ic
->tabq_scale
);
887 ewtabhalfspace
= _mm_set1_pd(0.5/fr
->ic
->tabq_scale
);
889 /* Setup water-specific parameters */
890 inr
= nlist
->iinr
[0];
891 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
892 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
893 iq3
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+3]));
894 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
896 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
897 rcutoff_scalar
= fr
->rcoulomb
;
898 rcutoff
= _mm_set1_pd(rcutoff_scalar
);
899 rcutoff2
= _mm_mul_pd(rcutoff
,rcutoff
);
901 sh_vdw_invrcut6
= _mm_set1_pd(fr
->ic
->sh_invrc6
);
902 rvdw
= _mm_set1_pd(fr
->rvdw
);
904 /* Avoid stupid compiler warnings */
912 /* Start outer loop over neighborlists */
913 for(iidx
=0; iidx
<nri
; iidx
++)
915 /* Load shift vector for this list */
916 i_shift_offset
= DIM
*shiftidx
[iidx
];
918 /* Load limits for loop over neighbors */
919 j_index_start
= jindex
[iidx
];
920 j_index_end
= jindex
[iidx
+1];
922 /* Get outer coordinate index */
924 i_coord_offset
= DIM
*inr
;
926 /* Load i particle coords and add shift vector */
927 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
928 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
930 fix0
= _mm_setzero_pd();
931 fiy0
= _mm_setzero_pd();
932 fiz0
= _mm_setzero_pd();
933 fix1
= _mm_setzero_pd();
934 fiy1
= _mm_setzero_pd();
935 fiz1
= _mm_setzero_pd();
936 fix2
= _mm_setzero_pd();
937 fiy2
= _mm_setzero_pd();
938 fiz2
= _mm_setzero_pd();
939 fix3
= _mm_setzero_pd();
940 fiy3
= _mm_setzero_pd();
941 fiz3
= _mm_setzero_pd();
943 /* Start inner kernel loop */
944 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
947 /* Get j neighbor index, and coordinate index */
950 j_coord_offsetA
= DIM
*jnrA
;
951 j_coord_offsetB
= DIM
*jnrB
;
953 /* load j atom coordinates */
954 gmx_mm_load_1rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
957 /* Calculate displacement vector */
958 dx00
= _mm_sub_pd(ix0
,jx0
);
959 dy00
= _mm_sub_pd(iy0
,jy0
);
960 dz00
= _mm_sub_pd(iz0
,jz0
);
961 dx10
= _mm_sub_pd(ix1
,jx0
);
962 dy10
= _mm_sub_pd(iy1
,jy0
);
963 dz10
= _mm_sub_pd(iz1
,jz0
);
964 dx20
= _mm_sub_pd(ix2
,jx0
);
965 dy20
= _mm_sub_pd(iy2
,jy0
);
966 dz20
= _mm_sub_pd(iz2
,jz0
);
967 dx30
= _mm_sub_pd(ix3
,jx0
);
968 dy30
= _mm_sub_pd(iy3
,jy0
);
969 dz30
= _mm_sub_pd(iz3
,jz0
);
971 /* Calculate squared distance and things based on it */
972 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
973 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
974 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
975 rsq30
= gmx_mm_calc_rsq_pd(dx30
,dy30
,dz30
);
977 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
978 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
979 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
980 rinv30
= gmx_mm_invsqrt_pd(rsq30
);
982 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
983 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
984 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
985 rinvsq30
= _mm_mul_pd(rinv30
,rinv30
);
987 /* Load parameters for j particles */
988 jq0
= gmx_mm_load_2real_swizzle_pd(charge
+jnrA
+0,charge
+jnrB
+0);
989 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
990 vdwjidx0B
= 2*vdwtype
[jnrB
+0];
992 fjx0
= _mm_setzero_pd();
993 fjy0
= _mm_setzero_pd();
994 fjz0
= _mm_setzero_pd();
996 /**************************
997 * CALCULATE INTERACTIONS *
998 **************************/
1000 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
1003 r00
= _mm_mul_pd(rsq00
,rinv00
);
1005 /* Compute parameters for interactions between i and j atoms */
1006 gmx_mm_load_2pair_swizzle_pd(vdwparam
+vdwioffset0
+vdwjidx0A
,
1007 vdwparam
+vdwioffset0
+vdwjidx0B
,&c6_00
,&c12_00
);
1009 c6grid_00
= gmx_mm_load_2real_swizzle_pd(vdwgridparam
+vdwioffset0
+vdwjidx0A
,
1010 vdwgridparam
+vdwioffset0
+vdwjidx0B
);
1012 /* Analytical LJ-PME */
1013 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
1014 ewcljrsq
= _mm_mul_pd(ewclj2
,rsq00
);
1015 ewclj6
= _mm_mul_pd(ewclj2
,_mm_mul_pd(ewclj2
,ewclj2
));
1016 exponent
= gmx_simd_exp_d(ewcljrsq
);
1017 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1018 poly
= _mm_mul_pd(exponent
,_mm_add_pd(_mm_sub_pd(one
,ewcljrsq
),_mm_mul_pd(_mm_mul_pd(ewcljrsq
,ewcljrsq
),one_half
)));
1019 /* f6A = 6 * C6grid * (1 - poly) */
1020 f6A
= _mm_mul_pd(c6grid_00
,_mm_sub_pd(one
,poly
));
1021 /* f6B = C6grid * exponent * beta^6 */
1022 f6B
= _mm_mul_pd(_mm_mul_pd(c6grid_00
,one_sixth
),_mm_mul_pd(exponent
,ewclj6
));
1023 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1024 fvdw
= _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00
,rinvsix
),_mm_sub_pd(c6_00
,f6A
)),rinvsix
),f6B
),rinvsq00
);
1026 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
1030 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1032 /* Calculate temporary vectorial force */
1033 tx
= _mm_mul_pd(fscal
,dx00
);
1034 ty
= _mm_mul_pd(fscal
,dy00
);
1035 tz
= _mm_mul_pd(fscal
,dz00
);
1037 /* Update vectorial force */
1038 fix0
= _mm_add_pd(fix0
,tx
);
1039 fiy0
= _mm_add_pd(fiy0
,ty
);
1040 fiz0
= _mm_add_pd(fiz0
,tz
);
1042 fjx0
= _mm_add_pd(fjx0
,tx
);
1043 fjy0
= _mm_add_pd(fjy0
,ty
);
1044 fjz0
= _mm_add_pd(fjz0
,tz
);
1048 /**************************
1049 * CALCULATE INTERACTIONS *
1050 **************************/
1052 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
1055 r10
= _mm_mul_pd(rsq10
,rinv10
);
1057 /* Compute parameters for interactions between i and j atoms */
1058 qq10
= _mm_mul_pd(iq1
,jq0
);
1060 /* EWALD ELECTROSTATICS */
1062 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1063 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
1064 ewitab
= _mm_cvttpd_epi32(ewrt
);
1065 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1066 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1068 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1069 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
1071 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
1075 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1077 /* Calculate temporary vectorial force */
1078 tx
= _mm_mul_pd(fscal
,dx10
);
1079 ty
= _mm_mul_pd(fscal
,dy10
);
1080 tz
= _mm_mul_pd(fscal
,dz10
);
1082 /* Update vectorial force */
1083 fix1
= _mm_add_pd(fix1
,tx
);
1084 fiy1
= _mm_add_pd(fiy1
,ty
);
1085 fiz1
= _mm_add_pd(fiz1
,tz
);
1087 fjx0
= _mm_add_pd(fjx0
,tx
);
1088 fjy0
= _mm_add_pd(fjy0
,ty
);
1089 fjz0
= _mm_add_pd(fjz0
,tz
);
1093 /**************************
1094 * CALCULATE INTERACTIONS *
1095 **************************/
1097 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
1100 r20
= _mm_mul_pd(rsq20
,rinv20
);
1102 /* Compute parameters for interactions between i and j atoms */
1103 qq20
= _mm_mul_pd(iq2
,jq0
);
1105 /* EWALD ELECTROSTATICS */
1107 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1108 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
1109 ewitab
= _mm_cvttpd_epi32(ewrt
);
1110 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1111 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1113 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1114 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
1116 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
1120 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1122 /* Calculate temporary vectorial force */
1123 tx
= _mm_mul_pd(fscal
,dx20
);
1124 ty
= _mm_mul_pd(fscal
,dy20
);
1125 tz
= _mm_mul_pd(fscal
,dz20
);
1127 /* Update vectorial force */
1128 fix2
= _mm_add_pd(fix2
,tx
);
1129 fiy2
= _mm_add_pd(fiy2
,ty
);
1130 fiz2
= _mm_add_pd(fiz2
,tz
);
1132 fjx0
= _mm_add_pd(fjx0
,tx
);
1133 fjy0
= _mm_add_pd(fjy0
,ty
);
1134 fjz0
= _mm_add_pd(fjz0
,tz
);
1138 /**************************
1139 * CALCULATE INTERACTIONS *
1140 **************************/
1142 if (gmx_mm_any_lt(rsq30
,rcutoff2
))
1145 r30
= _mm_mul_pd(rsq30
,rinv30
);
1147 /* Compute parameters for interactions between i and j atoms */
1148 qq30
= _mm_mul_pd(iq3
,jq0
);
1150 /* EWALD ELECTROSTATICS */
1152 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1153 ewrt
= _mm_mul_pd(r30
,ewtabscale
);
1154 ewitab
= _mm_cvttpd_epi32(ewrt
);
1155 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1156 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1158 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1159 felec
= _mm_mul_pd(_mm_mul_pd(qq30
,rinv30
),_mm_sub_pd(rinvsq30
,felec
));
1161 cutoff_mask
= _mm_cmplt_pd(rsq30
,rcutoff2
);
1165 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1167 /* Calculate temporary vectorial force */
1168 tx
= _mm_mul_pd(fscal
,dx30
);
1169 ty
= _mm_mul_pd(fscal
,dy30
);
1170 tz
= _mm_mul_pd(fscal
,dz30
);
1172 /* Update vectorial force */
1173 fix3
= _mm_add_pd(fix3
,tx
);
1174 fiy3
= _mm_add_pd(fiy3
,ty
);
1175 fiz3
= _mm_add_pd(fiz3
,tz
);
1177 fjx0
= _mm_add_pd(fjx0
,tx
);
1178 fjy0
= _mm_add_pd(fjy0
,ty
);
1179 fjz0
= _mm_add_pd(fjz0
,tz
);
1183 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
);
1185 /* Inner loop uses 169 flops */
1188 if(jidx
<j_index_end
)
1192 j_coord_offsetA
= DIM
*jnrA
;
1194 /* load j atom coordinates */
1195 gmx_mm_load_1rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
1198 /* Calculate displacement vector */
1199 dx00
= _mm_sub_pd(ix0
,jx0
);
1200 dy00
= _mm_sub_pd(iy0
,jy0
);
1201 dz00
= _mm_sub_pd(iz0
,jz0
);
1202 dx10
= _mm_sub_pd(ix1
,jx0
);
1203 dy10
= _mm_sub_pd(iy1
,jy0
);
1204 dz10
= _mm_sub_pd(iz1
,jz0
);
1205 dx20
= _mm_sub_pd(ix2
,jx0
);
1206 dy20
= _mm_sub_pd(iy2
,jy0
);
1207 dz20
= _mm_sub_pd(iz2
,jz0
);
1208 dx30
= _mm_sub_pd(ix3
,jx0
);
1209 dy30
= _mm_sub_pd(iy3
,jy0
);
1210 dz30
= _mm_sub_pd(iz3
,jz0
);
1212 /* Calculate squared distance and things based on it */
1213 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
1214 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
1215 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
1216 rsq30
= gmx_mm_calc_rsq_pd(dx30
,dy30
,dz30
);
1218 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
1219 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
1220 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
1221 rinv30
= gmx_mm_invsqrt_pd(rsq30
);
1223 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
1224 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
1225 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
1226 rinvsq30
= _mm_mul_pd(rinv30
,rinv30
);
1228 /* Load parameters for j particles */
1229 jq0
= _mm_load_sd(charge
+jnrA
+0);
1230 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
1232 fjx0
= _mm_setzero_pd();
1233 fjy0
= _mm_setzero_pd();
1234 fjz0
= _mm_setzero_pd();
1236 /**************************
1237 * CALCULATE INTERACTIONS *
1238 **************************/
1240 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
1243 r00
= _mm_mul_pd(rsq00
,rinv00
);
1245 /* Compute parameters for interactions between i and j atoms */
1246 gmx_mm_load_1pair_swizzle_pd(vdwparam
+vdwioffset0
+vdwjidx0A
,&c6_00
,&c12_00
);
1248 c6grid_00
= gmx_mm_load_1real_pd(vdwgridparam
+vdwioffset0
+vdwjidx0A
);
1250 /* Analytical LJ-PME */
1251 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
1252 ewcljrsq
= _mm_mul_pd(ewclj2
,rsq00
);
1253 ewclj6
= _mm_mul_pd(ewclj2
,_mm_mul_pd(ewclj2
,ewclj2
));
1254 exponent
= gmx_simd_exp_d(ewcljrsq
);
1255 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1256 poly
= _mm_mul_pd(exponent
,_mm_add_pd(_mm_sub_pd(one
,ewcljrsq
),_mm_mul_pd(_mm_mul_pd(ewcljrsq
,ewcljrsq
),one_half
)));
1257 /* f6A = 6 * C6grid * (1 - poly) */
1258 f6A
= _mm_mul_pd(c6grid_00
,_mm_sub_pd(one
,poly
));
1259 /* f6B = C6grid * exponent * beta^6 */
1260 f6B
= _mm_mul_pd(_mm_mul_pd(c6grid_00
,one_sixth
),_mm_mul_pd(exponent
,ewclj6
));
1261 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1262 fvdw
= _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00
,rinvsix
),_mm_sub_pd(c6_00
,f6A
)),rinvsix
),f6B
),rinvsq00
);
1264 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
1268 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1270 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1272 /* Calculate temporary vectorial force */
1273 tx
= _mm_mul_pd(fscal
,dx00
);
1274 ty
= _mm_mul_pd(fscal
,dy00
);
1275 tz
= _mm_mul_pd(fscal
,dz00
);
1277 /* Update vectorial force */
1278 fix0
= _mm_add_pd(fix0
,tx
);
1279 fiy0
= _mm_add_pd(fiy0
,ty
);
1280 fiz0
= _mm_add_pd(fiz0
,tz
);
1282 fjx0
= _mm_add_pd(fjx0
,tx
);
1283 fjy0
= _mm_add_pd(fjy0
,ty
);
1284 fjz0
= _mm_add_pd(fjz0
,tz
);
1288 /**************************
1289 * CALCULATE INTERACTIONS *
1290 **************************/
1292 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
1295 r10
= _mm_mul_pd(rsq10
,rinv10
);
1297 /* Compute parameters for interactions between i and j atoms */
1298 qq10
= _mm_mul_pd(iq1
,jq0
);
1300 /* EWALD ELECTROSTATICS */
1302 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1303 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
1304 ewitab
= _mm_cvttpd_epi32(ewrt
);
1305 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1306 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
1307 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1308 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
1310 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
1314 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1316 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1318 /* Calculate temporary vectorial force */
1319 tx
= _mm_mul_pd(fscal
,dx10
);
1320 ty
= _mm_mul_pd(fscal
,dy10
);
1321 tz
= _mm_mul_pd(fscal
,dz10
);
1323 /* Update vectorial force */
1324 fix1
= _mm_add_pd(fix1
,tx
);
1325 fiy1
= _mm_add_pd(fiy1
,ty
);
1326 fiz1
= _mm_add_pd(fiz1
,tz
);
1328 fjx0
= _mm_add_pd(fjx0
,tx
);
1329 fjy0
= _mm_add_pd(fjy0
,ty
);
1330 fjz0
= _mm_add_pd(fjz0
,tz
);
1334 /**************************
1335 * CALCULATE INTERACTIONS *
1336 **************************/
1338 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
1341 r20
= _mm_mul_pd(rsq20
,rinv20
);
1343 /* Compute parameters for interactions between i and j atoms */
1344 qq20
= _mm_mul_pd(iq2
,jq0
);
1346 /* EWALD ELECTROSTATICS */
1348 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1349 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
1350 ewitab
= _mm_cvttpd_epi32(ewrt
);
1351 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1352 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
1353 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1354 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
1356 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
1360 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1362 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1364 /* Calculate temporary vectorial force */
1365 tx
= _mm_mul_pd(fscal
,dx20
);
1366 ty
= _mm_mul_pd(fscal
,dy20
);
1367 tz
= _mm_mul_pd(fscal
,dz20
);
1369 /* Update vectorial force */
1370 fix2
= _mm_add_pd(fix2
,tx
);
1371 fiy2
= _mm_add_pd(fiy2
,ty
);
1372 fiz2
= _mm_add_pd(fiz2
,tz
);
1374 fjx0
= _mm_add_pd(fjx0
,tx
);
1375 fjy0
= _mm_add_pd(fjy0
,ty
);
1376 fjz0
= _mm_add_pd(fjz0
,tz
);
1380 /**************************
1381 * CALCULATE INTERACTIONS *
1382 **************************/
1384 if (gmx_mm_any_lt(rsq30
,rcutoff2
))
1387 r30
= _mm_mul_pd(rsq30
,rinv30
);
1389 /* Compute parameters for interactions between i and j atoms */
1390 qq30
= _mm_mul_pd(iq3
,jq0
);
1392 /* EWALD ELECTROSTATICS */
1394 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1395 ewrt
= _mm_mul_pd(r30
,ewtabscale
);
1396 ewitab
= _mm_cvttpd_epi32(ewrt
);
1397 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1398 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
1399 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1400 felec
= _mm_mul_pd(_mm_mul_pd(qq30
,rinv30
),_mm_sub_pd(rinvsq30
,felec
));
1402 cutoff_mask
= _mm_cmplt_pd(rsq30
,rcutoff2
);
1406 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1408 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1410 /* Calculate temporary vectorial force */
1411 tx
= _mm_mul_pd(fscal
,dx30
);
1412 ty
= _mm_mul_pd(fscal
,dy30
);
1413 tz
= _mm_mul_pd(fscal
,dz30
);
1415 /* Update vectorial force */
1416 fix3
= _mm_add_pd(fix3
,tx
);
1417 fiy3
= _mm_add_pd(fiy3
,ty
);
1418 fiz3
= _mm_add_pd(fiz3
,tz
);
1420 fjx0
= _mm_add_pd(fjx0
,tx
);
1421 fjy0
= _mm_add_pd(fjy0
,ty
);
1422 fjz0
= _mm_add_pd(fjz0
,tz
);
1426 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
);
1428 /* Inner loop uses 169 flops */
1431 /* End of innermost loop */
1433 gmx_mm_update_iforce_4atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
1434 f
+i_coord_offset
,fshift
+i_shift_offset
);
1436 /* Increment number of inner iterations */
1437 inneriter
+= j_index_end
- j_index_start
;
1439 /* Outer loop uses 24 flops */
1442 /* Increment number of outer iterations */
1445 /* Update outer/inner flops */
1447 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4_F
,outeriter
*24 + inneriter
*169);