2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/math/vec.h"
46 #include "gromacs/legacyheaders/nrnb.h"
48 #include "gromacs/simd/math_x86_sse2_double.h"
49 #include "kernelutil_x86_sse2_double.h"
52 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sse2_double
53 * Electrostatics interaction: CubicSplineTable
54 * VdW interaction: LennardJones
55 * Geometry: Water3-Particle
56 * Calculate force/pot: PotentialAndForce
59 nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sse2_double
60 (t_nblist
* gmx_restrict nlist
,
61 rvec
* gmx_restrict xx
,
62 rvec
* gmx_restrict ff
,
63 t_forcerec
* gmx_restrict fr
,
64 t_mdatoms
* gmx_restrict mdatoms
,
65 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
66 t_nrnb
* gmx_restrict nrnb
)
68 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
69 * just 0 for non-waters.
70 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
71 * jnr indices corresponding to data put in the four positions in the SIMD register.
73 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
74 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
76 int j_coord_offsetA
,j_coord_offsetB
;
77 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
79 real
*shiftvec
,*fshift
,*x
,*f
;
80 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
82 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
84 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
86 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
87 int vdwjidx0A
,vdwjidx0B
;
88 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
89 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
90 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
91 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
92 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
95 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
98 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
99 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
101 __m128i ifour
= _mm_set1_epi32(4);
102 __m128d rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
;
104 __m128d dummy_mask
,cutoff_mask
;
105 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
106 __m128d one
= _mm_set1_pd(1.0);
107 __m128d two
= _mm_set1_pd(2.0);
113 jindex
= nlist
->jindex
;
115 shiftidx
= nlist
->shift
;
117 shiftvec
= fr
->shift_vec
[0];
118 fshift
= fr
->fshift
[0];
119 facel
= _mm_set1_pd(fr
->epsfac
);
120 charge
= mdatoms
->chargeA
;
121 nvdwtype
= fr
->ntype
;
123 vdwtype
= mdatoms
->typeA
;
125 vftab
= kernel_data
->table_elec
->data
;
126 vftabscale
= _mm_set1_pd(kernel_data
->table_elec
->scale
);
128 /* Setup water-specific parameters */
129 inr
= nlist
->iinr
[0];
130 iq0
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+0]));
131 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
132 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
133 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
135 /* Avoid stupid compiler warnings */
143 /* Start outer loop over neighborlists */
144 for(iidx
=0; iidx
<nri
; iidx
++)
146 /* Load shift vector for this list */
147 i_shift_offset
= DIM
*shiftidx
[iidx
];
149 /* Load limits for loop over neighbors */
150 j_index_start
= jindex
[iidx
];
151 j_index_end
= jindex
[iidx
+1];
153 /* Get outer coordinate index */
155 i_coord_offset
= DIM
*inr
;
157 /* Load i particle coords and add shift vector */
158 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
159 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
161 fix0
= _mm_setzero_pd();
162 fiy0
= _mm_setzero_pd();
163 fiz0
= _mm_setzero_pd();
164 fix1
= _mm_setzero_pd();
165 fiy1
= _mm_setzero_pd();
166 fiz1
= _mm_setzero_pd();
167 fix2
= _mm_setzero_pd();
168 fiy2
= _mm_setzero_pd();
169 fiz2
= _mm_setzero_pd();
171 /* Reset potential sums */
172 velecsum
= _mm_setzero_pd();
173 vvdwsum
= _mm_setzero_pd();
175 /* Start inner kernel loop */
176 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
179 /* Get j neighbor index, and coordinate index */
182 j_coord_offsetA
= DIM
*jnrA
;
183 j_coord_offsetB
= DIM
*jnrB
;
185 /* load j atom coordinates */
186 gmx_mm_load_1rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
189 /* Calculate displacement vector */
190 dx00
= _mm_sub_pd(ix0
,jx0
);
191 dy00
= _mm_sub_pd(iy0
,jy0
);
192 dz00
= _mm_sub_pd(iz0
,jz0
);
193 dx10
= _mm_sub_pd(ix1
,jx0
);
194 dy10
= _mm_sub_pd(iy1
,jy0
);
195 dz10
= _mm_sub_pd(iz1
,jz0
);
196 dx20
= _mm_sub_pd(ix2
,jx0
);
197 dy20
= _mm_sub_pd(iy2
,jy0
);
198 dz20
= _mm_sub_pd(iz2
,jz0
);
200 /* Calculate squared distance and things based on it */
201 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
202 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
203 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
205 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
206 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
207 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
209 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
211 /* Load parameters for j particles */
212 jq0
= gmx_mm_load_2real_swizzle_pd(charge
+jnrA
+0,charge
+jnrB
+0);
213 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
214 vdwjidx0B
= 2*vdwtype
[jnrB
+0];
216 fjx0
= _mm_setzero_pd();
217 fjy0
= _mm_setzero_pd();
218 fjz0
= _mm_setzero_pd();
220 /**************************
221 * CALCULATE INTERACTIONS *
222 **************************/
224 r00
= _mm_mul_pd(rsq00
,rinv00
);
226 /* Compute parameters for interactions between i and j atoms */
227 qq00
= _mm_mul_pd(iq0
,jq0
);
228 gmx_mm_load_2pair_swizzle_pd(vdwparam
+vdwioffset0
+vdwjidx0A
,
229 vdwparam
+vdwioffset0
+vdwjidx0B
,&c6_00
,&c12_00
);
231 /* Calculate table index by multiplying r with table scale and truncate to integer */
232 rt
= _mm_mul_pd(r00
,vftabscale
);
233 vfitab
= _mm_cvttpd_epi32(rt
);
234 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
235 vfitab
= _mm_slli_epi32(vfitab
,2);
237 /* CUBIC SPLINE TABLE ELECTROSTATICS */
238 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
239 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
240 GMX_MM_TRANSPOSE2_PD(Y
,F
);
241 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
242 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
243 GMX_MM_TRANSPOSE2_PD(G
,H
);
244 Heps
= _mm_mul_pd(vfeps
,H
);
245 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
246 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
247 velec
= _mm_mul_pd(qq00
,VV
);
248 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
249 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq00
,FF
),_mm_mul_pd(vftabscale
,rinv00
)));
251 /* LENNARD-JONES DISPERSION/REPULSION */
253 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
254 vvdw6
= _mm_mul_pd(c6_00
,rinvsix
);
255 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
256 vvdw
= _mm_sub_pd( _mm_mul_pd(vvdw12
,one_twelfth
) , _mm_mul_pd(vvdw6
,one_sixth
) );
257 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
259 /* Update potential sum for this i atom from the interaction with this j atom. */
260 velecsum
= _mm_add_pd(velecsum
,velec
);
261 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
263 fscal
= _mm_add_pd(felec
,fvdw
);
265 /* Calculate temporary vectorial force */
266 tx
= _mm_mul_pd(fscal
,dx00
);
267 ty
= _mm_mul_pd(fscal
,dy00
);
268 tz
= _mm_mul_pd(fscal
,dz00
);
270 /* Update vectorial force */
271 fix0
= _mm_add_pd(fix0
,tx
);
272 fiy0
= _mm_add_pd(fiy0
,ty
);
273 fiz0
= _mm_add_pd(fiz0
,tz
);
275 fjx0
= _mm_add_pd(fjx0
,tx
);
276 fjy0
= _mm_add_pd(fjy0
,ty
);
277 fjz0
= _mm_add_pd(fjz0
,tz
);
279 /**************************
280 * CALCULATE INTERACTIONS *
281 **************************/
283 r10
= _mm_mul_pd(rsq10
,rinv10
);
285 /* Compute parameters for interactions between i and j atoms */
286 qq10
= _mm_mul_pd(iq1
,jq0
);
288 /* Calculate table index by multiplying r with table scale and truncate to integer */
289 rt
= _mm_mul_pd(r10
,vftabscale
);
290 vfitab
= _mm_cvttpd_epi32(rt
);
291 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
292 vfitab
= _mm_slli_epi32(vfitab
,2);
294 /* CUBIC SPLINE TABLE ELECTROSTATICS */
295 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
296 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
297 GMX_MM_TRANSPOSE2_PD(Y
,F
);
298 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
299 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
300 GMX_MM_TRANSPOSE2_PD(G
,H
);
301 Heps
= _mm_mul_pd(vfeps
,H
);
302 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
303 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
304 velec
= _mm_mul_pd(qq10
,VV
);
305 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
306 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq10
,FF
),_mm_mul_pd(vftabscale
,rinv10
)));
308 /* Update potential sum for this i atom from the interaction with this j atom. */
309 velecsum
= _mm_add_pd(velecsum
,velec
);
313 /* Calculate temporary vectorial force */
314 tx
= _mm_mul_pd(fscal
,dx10
);
315 ty
= _mm_mul_pd(fscal
,dy10
);
316 tz
= _mm_mul_pd(fscal
,dz10
);
318 /* Update vectorial force */
319 fix1
= _mm_add_pd(fix1
,tx
);
320 fiy1
= _mm_add_pd(fiy1
,ty
);
321 fiz1
= _mm_add_pd(fiz1
,tz
);
323 fjx0
= _mm_add_pd(fjx0
,tx
);
324 fjy0
= _mm_add_pd(fjy0
,ty
);
325 fjz0
= _mm_add_pd(fjz0
,tz
);
327 /**************************
328 * CALCULATE INTERACTIONS *
329 **************************/
331 r20
= _mm_mul_pd(rsq20
,rinv20
);
333 /* Compute parameters for interactions between i and j atoms */
334 qq20
= _mm_mul_pd(iq2
,jq0
);
336 /* Calculate table index by multiplying r with table scale and truncate to integer */
337 rt
= _mm_mul_pd(r20
,vftabscale
);
338 vfitab
= _mm_cvttpd_epi32(rt
);
339 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
340 vfitab
= _mm_slli_epi32(vfitab
,2);
342 /* CUBIC SPLINE TABLE ELECTROSTATICS */
343 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
344 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
345 GMX_MM_TRANSPOSE2_PD(Y
,F
);
346 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
347 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
348 GMX_MM_TRANSPOSE2_PD(G
,H
);
349 Heps
= _mm_mul_pd(vfeps
,H
);
350 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
351 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
352 velec
= _mm_mul_pd(qq20
,VV
);
353 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
354 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq20
,FF
),_mm_mul_pd(vftabscale
,rinv20
)));
356 /* Update potential sum for this i atom from the interaction with this j atom. */
357 velecsum
= _mm_add_pd(velecsum
,velec
);
361 /* Calculate temporary vectorial force */
362 tx
= _mm_mul_pd(fscal
,dx20
);
363 ty
= _mm_mul_pd(fscal
,dy20
);
364 tz
= _mm_mul_pd(fscal
,dz20
);
366 /* Update vectorial force */
367 fix2
= _mm_add_pd(fix2
,tx
);
368 fiy2
= _mm_add_pd(fiy2
,ty
);
369 fiz2
= _mm_add_pd(fiz2
,tz
);
371 fjx0
= _mm_add_pd(fjx0
,tx
);
372 fjy0
= _mm_add_pd(fjy0
,ty
);
373 fjz0
= _mm_add_pd(fjz0
,tz
);
375 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
);
377 /* Inner loop uses 145 flops */
384 j_coord_offsetA
= DIM
*jnrA
;
386 /* load j atom coordinates */
387 gmx_mm_load_1rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
390 /* Calculate displacement vector */
391 dx00
= _mm_sub_pd(ix0
,jx0
);
392 dy00
= _mm_sub_pd(iy0
,jy0
);
393 dz00
= _mm_sub_pd(iz0
,jz0
);
394 dx10
= _mm_sub_pd(ix1
,jx0
);
395 dy10
= _mm_sub_pd(iy1
,jy0
);
396 dz10
= _mm_sub_pd(iz1
,jz0
);
397 dx20
= _mm_sub_pd(ix2
,jx0
);
398 dy20
= _mm_sub_pd(iy2
,jy0
);
399 dz20
= _mm_sub_pd(iz2
,jz0
);
401 /* Calculate squared distance and things based on it */
402 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
403 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
404 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
406 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
407 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
408 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
410 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
412 /* Load parameters for j particles */
413 jq0
= _mm_load_sd(charge
+jnrA
+0);
414 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
416 fjx0
= _mm_setzero_pd();
417 fjy0
= _mm_setzero_pd();
418 fjz0
= _mm_setzero_pd();
420 /**************************
421 * CALCULATE INTERACTIONS *
422 **************************/
424 r00
= _mm_mul_pd(rsq00
,rinv00
);
426 /* Compute parameters for interactions between i and j atoms */
427 qq00
= _mm_mul_pd(iq0
,jq0
);
428 gmx_mm_load_1pair_swizzle_pd(vdwparam
+vdwioffset0
+vdwjidx0A
,&c6_00
,&c12_00
);
430 /* Calculate table index by multiplying r with table scale and truncate to integer */
431 rt
= _mm_mul_pd(r00
,vftabscale
);
432 vfitab
= _mm_cvttpd_epi32(rt
);
433 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
434 vfitab
= _mm_slli_epi32(vfitab
,2);
436 /* CUBIC SPLINE TABLE ELECTROSTATICS */
437 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
438 F
= _mm_setzero_pd();
439 GMX_MM_TRANSPOSE2_PD(Y
,F
);
440 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
441 H
= _mm_setzero_pd();
442 GMX_MM_TRANSPOSE2_PD(G
,H
);
443 Heps
= _mm_mul_pd(vfeps
,H
);
444 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
445 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
446 velec
= _mm_mul_pd(qq00
,VV
);
447 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
448 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq00
,FF
),_mm_mul_pd(vftabscale
,rinv00
)));
450 /* LENNARD-JONES DISPERSION/REPULSION */
452 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
453 vvdw6
= _mm_mul_pd(c6_00
,rinvsix
);
454 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
455 vvdw
= _mm_sub_pd( _mm_mul_pd(vvdw12
,one_twelfth
) , _mm_mul_pd(vvdw6
,one_sixth
) );
456 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
458 /* Update potential sum for this i atom from the interaction with this j atom. */
459 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
460 velecsum
= _mm_add_pd(velecsum
,velec
);
461 vvdw
= _mm_unpacklo_pd(vvdw
,_mm_setzero_pd());
462 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
464 fscal
= _mm_add_pd(felec
,fvdw
);
466 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
468 /* Calculate temporary vectorial force */
469 tx
= _mm_mul_pd(fscal
,dx00
);
470 ty
= _mm_mul_pd(fscal
,dy00
);
471 tz
= _mm_mul_pd(fscal
,dz00
);
473 /* Update vectorial force */
474 fix0
= _mm_add_pd(fix0
,tx
);
475 fiy0
= _mm_add_pd(fiy0
,ty
);
476 fiz0
= _mm_add_pd(fiz0
,tz
);
478 fjx0
= _mm_add_pd(fjx0
,tx
);
479 fjy0
= _mm_add_pd(fjy0
,ty
);
480 fjz0
= _mm_add_pd(fjz0
,tz
);
482 /**************************
483 * CALCULATE INTERACTIONS *
484 **************************/
486 r10
= _mm_mul_pd(rsq10
,rinv10
);
488 /* Compute parameters for interactions between i and j atoms */
489 qq10
= _mm_mul_pd(iq1
,jq0
);
491 /* Calculate table index by multiplying r with table scale and truncate to integer */
492 rt
= _mm_mul_pd(r10
,vftabscale
);
493 vfitab
= _mm_cvttpd_epi32(rt
);
494 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
495 vfitab
= _mm_slli_epi32(vfitab
,2);
497 /* CUBIC SPLINE TABLE ELECTROSTATICS */
498 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
499 F
= _mm_setzero_pd();
500 GMX_MM_TRANSPOSE2_PD(Y
,F
);
501 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
502 H
= _mm_setzero_pd();
503 GMX_MM_TRANSPOSE2_PD(G
,H
);
504 Heps
= _mm_mul_pd(vfeps
,H
);
505 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
506 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
507 velec
= _mm_mul_pd(qq10
,VV
);
508 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
509 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq10
,FF
),_mm_mul_pd(vftabscale
,rinv10
)));
511 /* Update potential sum for this i atom from the interaction with this j atom. */
512 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
513 velecsum
= _mm_add_pd(velecsum
,velec
);
517 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
519 /* Calculate temporary vectorial force */
520 tx
= _mm_mul_pd(fscal
,dx10
);
521 ty
= _mm_mul_pd(fscal
,dy10
);
522 tz
= _mm_mul_pd(fscal
,dz10
);
524 /* Update vectorial force */
525 fix1
= _mm_add_pd(fix1
,tx
);
526 fiy1
= _mm_add_pd(fiy1
,ty
);
527 fiz1
= _mm_add_pd(fiz1
,tz
);
529 fjx0
= _mm_add_pd(fjx0
,tx
);
530 fjy0
= _mm_add_pd(fjy0
,ty
);
531 fjz0
= _mm_add_pd(fjz0
,tz
);
533 /**************************
534 * CALCULATE INTERACTIONS *
535 **************************/
537 r20
= _mm_mul_pd(rsq20
,rinv20
);
539 /* Compute parameters for interactions between i and j atoms */
540 qq20
= _mm_mul_pd(iq2
,jq0
);
542 /* Calculate table index by multiplying r with table scale and truncate to integer */
543 rt
= _mm_mul_pd(r20
,vftabscale
);
544 vfitab
= _mm_cvttpd_epi32(rt
);
545 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
546 vfitab
= _mm_slli_epi32(vfitab
,2);
548 /* CUBIC SPLINE TABLE ELECTROSTATICS */
549 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
550 F
= _mm_setzero_pd();
551 GMX_MM_TRANSPOSE2_PD(Y
,F
);
552 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
553 H
= _mm_setzero_pd();
554 GMX_MM_TRANSPOSE2_PD(G
,H
);
555 Heps
= _mm_mul_pd(vfeps
,H
);
556 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
557 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
558 velec
= _mm_mul_pd(qq20
,VV
);
559 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
560 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq20
,FF
),_mm_mul_pd(vftabscale
,rinv20
)));
562 /* Update potential sum for this i atom from the interaction with this j atom. */
563 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
564 velecsum
= _mm_add_pd(velecsum
,velec
);
568 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
570 /* Calculate temporary vectorial force */
571 tx
= _mm_mul_pd(fscal
,dx20
);
572 ty
= _mm_mul_pd(fscal
,dy20
);
573 tz
= _mm_mul_pd(fscal
,dz20
);
575 /* Update vectorial force */
576 fix2
= _mm_add_pd(fix2
,tx
);
577 fiy2
= _mm_add_pd(fiy2
,ty
);
578 fiz2
= _mm_add_pd(fiz2
,tz
);
580 fjx0
= _mm_add_pd(fjx0
,tx
);
581 fjy0
= _mm_add_pd(fjy0
,ty
);
582 fjz0
= _mm_add_pd(fjz0
,tz
);
584 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
);
586 /* Inner loop uses 145 flops */
589 /* End of innermost loop */
591 gmx_mm_update_iforce_3atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
592 f
+i_coord_offset
,fshift
+i_shift_offset
);
595 /* Update potential energies */
596 gmx_mm_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
597 gmx_mm_update_1pot_pd(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
599 /* Increment number of inner iterations */
600 inneriter
+= j_index_end
- j_index_start
;
602 /* Outer loop uses 20 flops */
605 /* Increment number of outer iterations */
608 /* Update outer/inner flops */
610 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3_VF
,outeriter
*20 + inneriter
*145);
613 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sse2_double
614 * Electrostatics interaction: CubicSplineTable
615 * VdW interaction: LennardJones
616 * Geometry: Water3-Particle
617 * Calculate force/pot: Force
620 nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sse2_double
621 (t_nblist
* gmx_restrict nlist
,
622 rvec
* gmx_restrict xx
,
623 rvec
* gmx_restrict ff
,
624 t_forcerec
* gmx_restrict fr
,
625 t_mdatoms
* gmx_restrict mdatoms
,
626 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
627 t_nrnb
* gmx_restrict nrnb
)
629 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
630 * just 0 for non-waters.
631 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
632 * jnr indices corresponding to data put in the four positions in the SIMD register.
634 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
635 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
637 int j_coord_offsetA
,j_coord_offsetB
;
638 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
640 real
*shiftvec
,*fshift
,*x
,*f
;
641 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
643 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
645 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
647 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
648 int vdwjidx0A
,vdwjidx0B
;
649 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
650 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
651 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
652 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
653 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
656 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
659 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
660 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
662 __m128i ifour
= _mm_set1_epi32(4);
663 __m128d rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
;
665 __m128d dummy_mask
,cutoff_mask
;
666 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
667 __m128d one
= _mm_set1_pd(1.0);
668 __m128d two
= _mm_set1_pd(2.0);
674 jindex
= nlist
->jindex
;
676 shiftidx
= nlist
->shift
;
678 shiftvec
= fr
->shift_vec
[0];
679 fshift
= fr
->fshift
[0];
680 facel
= _mm_set1_pd(fr
->epsfac
);
681 charge
= mdatoms
->chargeA
;
682 nvdwtype
= fr
->ntype
;
684 vdwtype
= mdatoms
->typeA
;
686 vftab
= kernel_data
->table_elec
->data
;
687 vftabscale
= _mm_set1_pd(kernel_data
->table_elec
->scale
);
689 /* Setup water-specific parameters */
690 inr
= nlist
->iinr
[0];
691 iq0
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+0]));
692 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
693 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
694 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
696 /* Avoid stupid compiler warnings */
704 /* Start outer loop over neighborlists */
705 for(iidx
=0; iidx
<nri
; iidx
++)
707 /* Load shift vector for this list */
708 i_shift_offset
= DIM
*shiftidx
[iidx
];
710 /* Load limits for loop over neighbors */
711 j_index_start
= jindex
[iidx
];
712 j_index_end
= jindex
[iidx
+1];
714 /* Get outer coordinate index */
716 i_coord_offset
= DIM
*inr
;
718 /* Load i particle coords and add shift vector */
719 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
720 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
722 fix0
= _mm_setzero_pd();
723 fiy0
= _mm_setzero_pd();
724 fiz0
= _mm_setzero_pd();
725 fix1
= _mm_setzero_pd();
726 fiy1
= _mm_setzero_pd();
727 fiz1
= _mm_setzero_pd();
728 fix2
= _mm_setzero_pd();
729 fiy2
= _mm_setzero_pd();
730 fiz2
= _mm_setzero_pd();
732 /* Start inner kernel loop */
733 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
736 /* Get j neighbor index, and coordinate index */
739 j_coord_offsetA
= DIM
*jnrA
;
740 j_coord_offsetB
= DIM
*jnrB
;
742 /* load j atom coordinates */
743 gmx_mm_load_1rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
746 /* Calculate displacement vector */
747 dx00
= _mm_sub_pd(ix0
,jx0
);
748 dy00
= _mm_sub_pd(iy0
,jy0
);
749 dz00
= _mm_sub_pd(iz0
,jz0
);
750 dx10
= _mm_sub_pd(ix1
,jx0
);
751 dy10
= _mm_sub_pd(iy1
,jy0
);
752 dz10
= _mm_sub_pd(iz1
,jz0
);
753 dx20
= _mm_sub_pd(ix2
,jx0
);
754 dy20
= _mm_sub_pd(iy2
,jy0
);
755 dz20
= _mm_sub_pd(iz2
,jz0
);
757 /* Calculate squared distance and things based on it */
758 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
759 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
760 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
762 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
763 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
764 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
766 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
768 /* Load parameters for j particles */
769 jq0
= gmx_mm_load_2real_swizzle_pd(charge
+jnrA
+0,charge
+jnrB
+0);
770 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
771 vdwjidx0B
= 2*vdwtype
[jnrB
+0];
773 fjx0
= _mm_setzero_pd();
774 fjy0
= _mm_setzero_pd();
775 fjz0
= _mm_setzero_pd();
777 /**************************
778 * CALCULATE INTERACTIONS *
779 **************************/
781 r00
= _mm_mul_pd(rsq00
,rinv00
);
783 /* Compute parameters for interactions between i and j atoms */
784 qq00
= _mm_mul_pd(iq0
,jq0
);
785 gmx_mm_load_2pair_swizzle_pd(vdwparam
+vdwioffset0
+vdwjidx0A
,
786 vdwparam
+vdwioffset0
+vdwjidx0B
,&c6_00
,&c12_00
);
788 /* Calculate table index by multiplying r with table scale and truncate to integer */
789 rt
= _mm_mul_pd(r00
,vftabscale
);
790 vfitab
= _mm_cvttpd_epi32(rt
);
791 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
792 vfitab
= _mm_slli_epi32(vfitab
,2);
794 /* CUBIC SPLINE TABLE ELECTROSTATICS */
795 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
796 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
797 GMX_MM_TRANSPOSE2_PD(Y
,F
);
798 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
799 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
800 GMX_MM_TRANSPOSE2_PD(G
,H
);
801 Heps
= _mm_mul_pd(vfeps
,H
);
802 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
803 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
804 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq00
,FF
),_mm_mul_pd(vftabscale
,rinv00
)));
806 /* LENNARD-JONES DISPERSION/REPULSION */
808 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
809 fvdw
= _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00
,rinvsix
),c6_00
),_mm_mul_pd(rinvsix
,rinvsq00
));
811 fscal
= _mm_add_pd(felec
,fvdw
);
813 /* Calculate temporary vectorial force */
814 tx
= _mm_mul_pd(fscal
,dx00
);
815 ty
= _mm_mul_pd(fscal
,dy00
);
816 tz
= _mm_mul_pd(fscal
,dz00
);
818 /* Update vectorial force */
819 fix0
= _mm_add_pd(fix0
,tx
);
820 fiy0
= _mm_add_pd(fiy0
,ty
);
821 fiz0
= _mm_add_pd(fiz0
,tz
);
823 fjx0
= _mm_add_pd(fjx0
,tx
);
824 fjy0
= _mm_add_pd(fjy0
,ty
);
825 fjz0
= _mm_add_pd(fjz0
,tz
);
827 /**************************
828 * CALCULATE INTERACTIONS *
829 **************************/
831 r10
= _mm_mul_pd(rsq10
,rinv10
);
833 /* Compute parameters for interactions between i and j atoms */
834 qq10
= _mm_mul_pd(iq1
,jq0
);
836 /* Calculate table index by multiplying r with table scale and truncate to integer */
837 rt
= _mm_mul_pd(r10
,vftabscale
);
838 vfitab
= _mm_cvttpd_epi32(rt
);
839 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
840 vfitab
= _mm_slli_epi32(vfitab
,2);
842 /* CUBIC SPLINE TABLE ELECTROSTATICS */
843 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
844 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
845 GMX_MM_TRANSPOSE2_PD(Y
,F
);
846 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
847 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
848 GMX_MM_TRANSPOSE2_PD(G
,H
);
849 Heps
= _mm_mul_pd(vfeps
,H
);
850 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
851 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
852 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq10
,FF
),_mm_mul_pd(vftabscale
,rinv10
)));
856 /* Calculate temporary vectorial force */
857 tx
= _mm_mul_pd(fscal
,dx10
);
858 ty
= _mm_mul_pd(fscal
,dy10
);
859 tz
= _mm_mul_pd(fscal
,dz10
);
861 /* Update vectorial force */
862 fix1
= _mm_add_pd(fix1
,tx
);
863 fiy1
= _mm_add_pd(fiy1
,ty
);
864 fiz1
= _mm_add_pd(fiz1
,tz
);
866 fjx0
= _mm_add_pd(fjx0
,tx
);
867 fjy0
= _mm_add_pd(fjy0
,ty
);
868 fjz0
= _mm_add_pd(fjz0
,tz
);
870 /**************************
871 * CALCULATE INTERACTIONS *
872 **************************/
874 r20
= _mm_mul_pd(rsq20
,rinv20
);
876 /* Compute parameters for interactions between i and j atoms */
877 qq20
= _mm_mul_pd(iq2
,jq0
);
879 /* Calculate table index by multiplying r with table scale and truncate to integer */
880 rt
= _mm_mul_pd(r20
,vftabscale
);
881 vfitab
= _mm_cvttpd_epi32(rt
);
882 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
883 vfitab
= _mm_slli_epi32(vfitab
,2);
885 /* CUBIC SPLINE TABLE ELECTROSTATICS */
886 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
887 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
888 GMX_MM_TRANSPOSE2_PD(Y
,F
);
889 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
890 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
891 GMX_MM_TRANSPOSE2_PD(G
,H
);
892 Heps
= _mm_mul_pd(vfeps
,H
);
893 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
894 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
895 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq20
,FF
),_mm_mul_pd(vftabscale
,rinv20
)));
899 /* Calculate temporary vectorial force */
900 tx
= _mm_mul_pd(fscal
,dx20
);
901 ty
= _mm_mul_pd(fscal
,dy20
);
902 tz
= _mm_mul_pd(fscal
,dz20
);
904 /* Update vectorial force */
905 fix2
= _mm_add_pd(fix2
,tx
);
906 fiy2
= _mm_add_pd(fiy2
,ty
);
907 fiz2
= _mm_add_pd(fiz2
,tz
);
909 fjx0
= _mm_add_pd(fjx0
,tx
);
910 fjy0
= _mm_add_pd(fjy0
,ty
);
911 fjz0
= _mm_add_pd(fjz0
,tz
);
913 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
);
915 /* Inner loop uses 128 flops */
922 j_coord_offsetA
= DIM
*jnrA
;
924 /* load j atom coordinates */
925 gmx_mm_load_1rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
928 /* Calculate displacement vector */
929 dx00
= _mm_sub_pd(ix0
,jx0
);
930 dy00
= _mm_sub_pd(iy0
,jy0
);
931 dz00
= _mm_sub_pd(iz0
,jz0
);
932 dx10
= _mm_sub_pd(ix1
,jx0
);
933 dy10
= _mm_sub_pd(iy1
,jy0
);
934 dz10
= _mm_sub_pd(iz1
,jz0
);
935 dx20
= _mm_sub_pd(ix2
,jx0
);
936 dy20
= _mm_sub_pd(iy2
,jy0
);
937 dz20
= _mm_sub_pd(iz2
,jz0
);
939 /* Calculate squared distance and things based on it */
940 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
941 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
942 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
944 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
945 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
946 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
948 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
950 /* Load parameters for j particles */
951 jq0
= _mm_load_sd(charge
+jnrA
+0);
952 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
954 fjx0
= _mm_setzero_pd();
955 fjy0
= _mm_setzero_pd();
956 fjz0
= _mm_setzero_pd();
958 /**************************
959 * CALCULATE INTERACTIONS *
960 **************************/
962 r00
= _mm_mul_pd(rsq00
,rinv00
);
964 /* Compute parameters for interactions between i and j atoms */
965 qq00
= _mm_mul_pd(iq0
,jq0
);
966 gmx_mm_load_1pair_swizzle_pd(vdwparam
+vdwioffset0
+vdwjidx0A
,&c6_00
,&c12_00
);
968 /* Calculate table index by multiplying r with table scale and truncate to integer */
969 rt
= _mm_mul_pd(r00
,vftabscale
);
970 vfitab
= _mm_cvttpd_epi32(rt
);
971 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
972 vfitab
= _mm_slli_epi32(vfitab
,2);
974 /* CUBIC SPLINE TABLE ELECTROSTATICS */
975 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
976 F
= _mm_setzero_pd();
977 GMX_MM_TRANSPOSE2_PD(Y
,F
);
978 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
979 H
= _mm_setzero_pd();
980 GMX_MM_TRANSPOSE2_PD(G
,H
);
981 Heps
= _mm_mul_pd(vfeps
,H
);
982 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
983 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
984 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq00
,FF
),_mm_mul_pd(vftabscale
,rinv00
)));
986 /* LENNARD-JONES DISPERSION/REPULSION */
988 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
989 fvdw
= _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00
,rinvsix
),c6_00
),_mm_mul_pd(rinvsix
,rinvsq00
));
991 fscal
= _mm_add_pd(felec
,fvdw
);
993 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
995 /* Calculate temporary vectorial force */
996 tx
= _mm_mul_pd(fscal
,dx00
);
997 ty
= _mm_mul_pd(fscal
,dy00
);
998 tz
= _mm_mul_pd(fscal
,dz00
);
1000 /* Update vectorial force */
1001 fix0
= _mm_add_pd(fix0
,tx
);
1002 fiy0
= _mm_add_pd(fiy0
,ty
);
1003 fiz0
= _mm_add_pd(fiz0
,tz
);
1005 fjx0
= _mm_add_pd(fjx0
,tx
);
1006 fjy0
= _mm_add_pd(fjy0
,ty
);
1007 fjz0
= _mm_add_pd(fjz0
,tz
);
1009 /**************************
1010 * CALCULATE INTERACTIONS *
1011 **************************/
1013 r10
= _mm_mul_pd(rsq10
,rinv10
);
1015 /* Compute parameters for interactions between i and j atoms */
1016 qq10
= _mm_mul_pd(iq1
,jq0
);
1018 /* Calculate table index by multiplying r with table scale and truncate to integer */
1019 rt
= _mm_mul_pd(r10
,vftabscale
);
1020 vfitab
= _mm_cvttpd_epi32(rt
);
1021 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1022 vfitab
= _mm_slli_epi32(vfitab
,2);
1024 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1025 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1026 F
= _mm_setzero_pd();
1027 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1028 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1029 H
= _mm_setzero_pd();
1030 GMX_MM_TRANSPOSE2_PD(G
,H
);
1031 Heps
= _mm_mul_pd(vfeps
,H
);
1032 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1033 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1034 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq10
,FF
),_mm_mul_pd(vftabscale
,rinv10
)));
1038 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1040 /* Calculate temporary vectorial force */
1041 tx
= _mm_mul_pd(fscal
,dx10
);
1042 ty
= _mm_mul_pd(fscal
,dy10
);
1043 tz
= _mm_mul_pd(fscal
,dz10
);
1045 /* Update vectorial force */
1046 fix1
= _mm_add_pd(fix1
,tx
);
1047 fiy1
= _mm_add_pd(fiy1
,ty
);
1048 fiz1
= _mm_add_pd(fiz1
,tz
);
1050 fjx0
= _mm_add_pd(fjx0
,tx
);
1051 fjy0
= _mm_add_pd(fjy0
,ty
);
1052 fjz0
= _mm_add_pd(fjz0
,tz
);
1054 /**************************
1055 * CALCULATE INTERACTIONS *
1056 **************************/
1058 r20
= _mm_mul_pd(rsq20
,rinv20
);
1060 /* Compute parameters for interactions between i and j atoms */
1061 qq20
= _mm_mul_pd(iq2
,jq0
);
1063 /* Calculate table index by multiplying r with table scale and truncate to integer */
1064 rt
= _mm_mul_pd(r20
,vftabscale
);
1065 vfitab
= _mm_cvttpd_epi32(rt
);
1066 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1067 vfitab
= _mm_slli_epi32(vfitab
,2);
1069 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1070 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1071 F
= _mm_setzero_pd();
1072 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1073 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1074 H
= _mm_setzero_pd();
1075 GMX_MM_TRANSPOSE2_PD(G
,H
);
1076 Heps
= _mm_mul_pd(vfeps
,H
);
1077 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1078 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1079 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq20
,FF
),_mm_mul_pd(vftabscale
,rinv20
)));
1083 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1085 /* Calculate temporary vectorial force */
1086 tx
= _mm_mul_pd(fscal
,dx20
);
1087 ty
= _mm_mul_pd(fscal
,dy20
);
1088 tz
= _mm_mul_pd(fscal
,dz20
);
1090 /* Update vectorial force */
1091 fix2
= _mm_add_pd(fix2
,tx
);
1092 fiy2
= _mm_add_pd(fiy2
,ty
);
1093 fiz2
= _mm_add_pd(fiz2
,tz
);
1095 fjx0
= _mm_add_pd(fjx0
,tx
);
1096 fjy0
= _mm_add_pd(fjy0
,ty
);
1097 fjz0
= _mm_add_pd(fjz0
,tz
);
1099 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
);
1101 /* Inner loop uses 128 flops */
1104 /* End of innermost loop */
1106 gmx_mm_update_iforce_3atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
1107 f
+i_coord_offset
,fshift
+i_shift_offset
);
1109 /* Increment number of inner iterations */
1110 inneriter
+= j_index_end
- j_index_start
;
1112 /* Outer loop uses 18 flops */
1115 /* Increment number of outer iterations */
1118 /* Update outer/inner flops */
1120 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3_F
,outeriter
*18 + inneriter
*128);