2 * Note: this file was generated by the Gromacs sse2_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
17 * later version.
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_double.h"
34 #include "kernelutil_x86_sse2_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sse2_double
38 * Electrostatics interaction: CubicSplineTable
39 * VdW interaction: LennardJones
40 * Geometry: Water4-Particle
41 * Calculate force/pot: PotentialAndForce
/*
 * Nonbonded kernel computing both potential and force (VF variant) with
 * SSE2 double-precision intrinsics, for an i group of four sites (suffixes
 * 0..3) interacting with single j particles (suffix 0).
 *
 * What the visible code does:
 *  - i site 0 / j: Lennard-Jones only. c6/c12 are fetched from vdwparam and
 *    the energy is accumulated in vvdwsum.
 *  - i sites 1, 2, 3 / j: cubic-spline tabulated electrostatics. The table
 *    is kernel_data->table_elec->data, scaled by table_elec->scale; the i
 *    charges are pre-multiplied by fr->epsfac. Energy is accumulated in
 *    velecsum.
 *  - The inner loop consumes two j particles per iteration (jidx += 2); a
 *    second section then repeats the same arithmetic for one j particle,
 *    zeroing the upper SIMD lane of the energies and of fscal.
 *  - After the j work, the accumulated i forces are handed to
 *    gmx_mm_update_iforce_4atom_swizzle_pd (with f+i_coord_offset and
 *    fshift+i_shift_offset), the two energy sums are handed to
 *    gmx_mm_update_1pot_pd (energygrp_elec / energygrp_vdw at index ggid),
 *    and the flop estimate is reported through inc_nrnb.
 *
 * Parameters (as used below):
 *  nlist       - neighbour list; jindex, shift and iinr are read from it.
 *  xx, ff      - not referenced in the visible part of this listing.
 *  fr          - supplies shift_vec, fshift, epsfac and ntype.
 *  mdatoms     - supplies chargeA and typeA.
 *  kernel_data - supplies table_elec and the energygrp_elec/energygrp_vdw
 *                accumulators.
 *  nrnb        - flop counter passed to inc_nrnb.
 *
 * NOTE(review): this listing is incomplete. Braces, the return type, the
 * declarations of charge/vdwtype/vdwparam/vftab/vfitab/jnrA/jnrB, and the
 * assignments to nri, x, f, jnrA, jnrB, inr (per i entry), ggid, outeriter,
 * inneriter (initial value) and fscal are not visible here. Confirm every
 * statement about them against the generated original before relying on it.
 */
44 nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sse2_double
45 (t_nblist
* gmx_restrict nlist
,
46 rvec
* gmx_restrict xx
,
47 rvec
* gmx_restrict ff
,
48 t_forcerec
* gmx_restrict fr
,
49 t_mdatoms
* gmx_restrict mdatoms
,
50 nb_kernel_data_t
* gmx_restrict kernel_data
,
51 t_nrnb
* gmx_restrict nrnb
)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
59 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
61 int j_coord_offsetA
,j_coord_offsetB
;
62 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
64 real
*shiftvec
,*fshift
,*x
,*f
;
65 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
67 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
69 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
71 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
73 __m128d ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
74 int vdwjidx0A
,vdwjidx0B
;
75 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
76 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
77 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
78 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
79 __m128d dx30
,dy30
,dz30
,rsq30
,rinv30
,rinvsq30
,r30
,qq30
,c6_30
,c12_30
;
80 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
83 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
86 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
87 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
89 __m128i ifour
= _mm_set1_epi32(4);
90 __m128d rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
;
92 __m128d dummy_mask
,cutoff_mask
;
93 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
94 __m128d one
= _mm_set1_pd(1.0);
95 __m128d two
= _mm_set1_pd(2.0);
101 jindex
= nlist
->jindex
;
103 shiftidx
= nlist
->shift
;
105 shiftvec
= fr
->shift_vec
[0];
106 fshift
= fr
->fshift
[0];
107 facel
= _mm_set1_pd(fr
->epsfac
);
108 charge
= mdatoms
->chargeA
;
109 nvdwtype
= fr
->ntype
;
111 vdwtype
= mdatoms
->typeA
;
113 vftab
= kernel_data
->table_elec
->data
;
114 vftabscale
= _mm_set1_pd(kernel_data
->table_elec
->scale
);
116 /* Setup water-specific parameters */
/* The parameters are taken from the first i entry (iinr[0]) only, outside
 * the outer loop. Charges of sites 1-3 are pre-multiplied by facel; no
 * charge is read for site 0. vdwioffset0 is the site-0 row offset into
 * vdwparam; the j offsets added to it later are 2*vdwtype[j]. */
117 inr
= nlist
->iinr
[0];
118 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
119 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
120 iq3
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+3]));
121 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
123 /* Avoid stupid compiler warnings */
131 /* Start outer loop over neighborlists */
132 for(iidx
=0; iidx
<nri
; iidx
++)
134 /* Load shift vector for this list */
135 i_shift_offset
= DIM
*shiftidx
[iidx
];
137 /* Load limits for loop over neighbors */
138 j_index_start
= jindex
[iidx
];
139 j_index_end
= jindex
[iidx
+1];
141 /* Get outer coordinate index */
143 i_coord_offset
= DIM
*inr
;
145 /* Load i particle coords and add shift vector */
146 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
147 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
/* Per-i-entry force accumulators for the four sites start at zero. */
149 fix0
= _mm_setzero_pd();
150 fiy0
= _mm_setzero_pd();
151 fiz0
= _mm_setzero_pd();
152 fix1
= _mm_setzero_pd();
153 fiy1
= _mm_setzero_pd();
154 fiz1
= _mm_setzero_pd();
155 fix2
= _mm_setzero_pd();
156 fiy2
= _mm_setzero_pd();
157 fiz2
= _mm_setzero_pd();
158 fix3
= _mm_setzero_pd();
159 fiy3
= _mm_setzero_pd();
160 fiz3
= _mm_setzero_pd();
162 /* Reset potential sums */
163 velecsum
= _mm_setzero_pd();
164 vvdwsum
= _mm_setzero_pd();
166 /* Start inner kernel loop */
/* Two j particles (A and B) are handled per iteration: jidx advances by 2
 * and the loop stops while at least two entries remain. */
167 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
170 /* Get j neighbor index, and coordinate index */
173 j_coord_offsetA
= DIM
*jnrA
;
174 j_coord_offsetB
= DIM
*jnrB
;
176 /* load j atom coordinates */
177 gmx_mm_load_1rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
180 /* Calculate displacement vector */
181 dx00
= _mm_sub_pd(ix0
,jx0
);
182 dy00
= _mm_sub_pd(iy0
,jy0
);
183 dz00
= _mm_sub_pd(iz0
,jz0
);
184 dx10
= _mm_sub_pd(ix1
,jx0
);
185 dy10
= _mm_sub_pd(iy1
,jy0
);
186 dz10
= _mm_sub_pd(iz1
,jz0
);
187 dx20
= _mm_sub_pd(ix2
,jx0
);
188 dy20
= _mm_sub_pd(iy2
,jy0
);
189 dz20
= _mm_sub_pd(iz2
,jz0
);
190 dx30
= _mm_sub_pd(ix3
,jx0
);
191 dy30
= _mm_sub_pd(iy3
,jy0
);
192 dz30
= _mm_sub_pd(iz3
,jz0
);
194 /* Calculate squared distance and things based on it */
195 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
196 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
197 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
198 rsq30
= gmx_mm_calc_rsq_pd(dx30
,dy30
,dz30
);
/* rinv10/20/30 feed the table lookups for sites 1-3 (r = rsq*rinv below);
 * site 0 only uses rinvsq00, in the Lennard-Jones terms. */
200 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
201 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
202 rinv30
= gmx_mm_invsqrt_pd(rsq30
);
204 rinvsq00
= gmx_mm_inv_pd(rsq00
);
206 /* Load parameters for j particles */
207 jq0
= gmx_mm_load_2real_swizzle_pd(charge
+jnrA
+0,charge
+jnrB
+0);
208 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
209 vdwjidx0B
= 2*vdwtype
[jnrB
+0];
211 fjx0
= _mm_setzero_pd();
212 fjy0
= _mm_setzero_pd();
213 fjz0
= _mm_setzero_pd();
215 /**************************
216 * CALCULATE INTERACTIONS *
217 **************************/
219 /* Compute parameters for interactions between i and j atoms */
220 gmx_mm_load_2pair_swizzle_pd(vdwparam
+vdwioffset0
+vdwjidx0A
,
221 vdwparam
+vdwioffset0
+vdwjidx0B
,&c6_00
,&c12_00
);
223 /* LENNARD-JONES DISPERSION/REPULSION */
225 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
226 vvdw6
= _mm_mul_pd(c6_00
,rinvsix
);
227 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
228 vvdw
= _mm_sub_pd( _mm_mul_pd(vvdw12
,one_twelfth
) , _mm_mul_pd(vvdw6
,one_sixth
) );
229 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
231 /* Update potential sum for this i atom from the interaction with this j atom. */
232 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
/* NOTE(review): fscal is read below, but the statement assigning it
 * (original lines 233-235) is not visible in this listing; the same gap
 * precedes every later use of fscal. Confirm against the generated source. */
236 /* Calculate temporary vectorial force */
237 tx
= _mm_mul_pd(fscal
,dx00
);
238 ty
= _mm_mul_pd(fscal
,dy00
);
239 tz
= _mm_mul_pd(fscal
,dz00
);
241 /* Update vectorial force */
242 fix0
= _mm_add_pd(fix0
,tx
);
243 fiy0
= _mm_add_pd(fiy0
,ty
);
244 fiz0
= _mm_add_pd(fiz0
,tz
);
246 fjx0
= _mm_add_pd(fjx0
,tx
);
247 fjy0
= _mm_add_pd(fjy0
,ty
);
248 fjz0
= _mm_add_pd(fjz0
,tz
);
250 /**************************
251 * CALCULATE INTERACTIONS *
252 **************************/
254 r10
= _mm_mul_pd(rsq10
,rinv10
);
256 /* Compute parameters for interactions between i and j atoms */
257 qq10
= _mm_mul_pd(iq1
,jq0
);
259 /* Calculate table index by multiplying r with table scale and truncate to integer */
260 rt
= _mm_mul_pd(r10
,vftabscale
);
261 vfitab
= _mm_cvttpd_epi32(rt
);
262 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
/* Shift left by 2 = index*4: each table point occupies four doubles, read
 * below as two 2-double loads at offsets +0 (Y,F) and +2 (G,H). */
263 vfitab
= _mm_slli_epi32(vfitab
,2);
265 /* CUBIC SPLINE TABLE ELECTROSTATICS */
266 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
267 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
268 GMX_MM_TRANSPOSE2_PD(Y
,F
);
269 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
270 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
271 GMX_MM_TRANSPOSE2_PD(G
,H
);
272 Heps
= _mm_mul_pd(vfeps
,H
);
273 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
274 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
275 velec
= _mm_mul_pd(qq10
,VV
);
276 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
/* XOR with signbit (top bit of each 64-bit lane) negates both lanes:
 * felec = -(qq*FF*tabscale*rinv). */
277 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq10
,FF
),_mm_mul_pd(vftabscale
,rinv10
)));
279 /* Update potential sum for this i atom from the interaction with this j atom. */
280 velecsum
= _mm_add_pd(velecsum
,velec
);
284 /* Calculate temporary vectorial force */
285 tx
= _mm_mul_pd(fscal
,dx10
);
286 ty
= _mm_mul_pd(fscal
,dy10
);
287 tz
= _mm_mul_pd(fscal
,dz10
);
289 /* Update vectorial force */
290 fix1
= _mm_add_pd(fix1
,tx
);
291 fiy1
= _mm_add_pd(fiy1
,ty
);
292 fiz1
= _mm_add_pd(fiz1
,tz
);
294 fjx0
= _mm_add_pd(fjx0
,tx
);
295 fjy0
= _mm_add_pd(fjy0
,ty
);
296 fjz0
= _mm_add_pd(fjz0
,tz
);
298 /**************************
299 * CALCULATE INTERACTIONS *
300 **************************/
302 r20
= _mm_mul_pd(rsq20
,rinv20
);
304 /* Compute parameters for interactions between i and j atoms */
305 qq20
= _mm_mul_pd(iq2
,jq0
);
307 /* Calculate table index by multiplying r with table scale and truncate to integer */
308 rt
= _mm_mul_pd(r20
,vftabscale
);
309 vfitab
= _mm_cvttpd_epi32(rt
);
310 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
311 vfitab
= _mm_slli_epi32(vfitab
,2);
313 /* CUBIC SPLINE TABLE ELECTROSTATICS */
314 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
315 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
316 GMX_MM_TRANSPOSE2_PD(Y
,F
);
317 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
318 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
319 GMX_MM_TRANSPOSE2_PD(G
,H
);
320 Heps
= _mm_mul_pd(vfeps
,H
);
321 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
322 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
323 velec
= _mm_mul_pd(qq20
,VV
);
324 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
325 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq20
,FF
),_mm_mul_pd(vftabscale
,rinv20
)));
327 /* Update potential sum for this i atom from the interaction with this j atom. */
328 velecsum
= _mm_add_pd(velecsum
,velec
);
332 /* Calculate temporary vectorial force */
333 tx
= _mm_mul_pd(fscal
,dx20
);
334 ty
= _mm_mul_pd(fscal
,dy20
);
335 tz
= _mm_mul_pd(fscal
,dz20
);
337 /* Update vectorial force */
338 fix2
= _mm_add_pd(fix2
,tx
);
339 fiy2
= _mm_add_pd(fiy2
,ty
);
340 fiz2
= _mm_add_pd(fiz2
,tz
);
342 fjx0
= _mm_add_pd(fjx0
,tx
);
343 fjy0
= _mm_add_pd(fjy0
,ty
);
344 fjz0
= _mm_add_pd(fjz0
,tz
);
346 /**************************
347 * CALCULATE INTERACTIONS *
348 **************************/
350 r30
= _mm_mul_pd(rsq30
,rinv30
);
352 /* Compute parameters for interactions between i and j atoms */
353 qq30
= _mm_mul_pd(iq3
,jq0
);
355 /* Calculate table index by multiplying r with table scale and truncate to integer */
356 rt
= _mm_mul_pd(r30
,vftabscale
);
357 vfitab
= _mm_cvttpd_epi32(rt
);
358 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
359 vfitab
= _mm_slli_epi32(vfitab
,2);
361 /* CUBIC SPLINE TABLE ELECTROSTATICS */
362 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
363 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
364 GMX_MM_TRANSPOSE2_PD(Y
,F
);
365 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
366 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
367 GMX_MM_TRANSPOSE2_PD(G
,H
);
368 Heps
= _mm_mul_pd(vfeps
,H
);
369 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
370 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
371 velec
= _mm_mul_pd(qq30
,VV
);
372 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
373 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq30
,FF
),_mm_mul_pd(vftabscale
,rinv30
)));
375 /* Update potential sum for this i atom from the interaction with this j atom. */
376 velecsum
= _mm_add_pd(velecsum
,velec
);
380 /* Calculate temporary vectorial force */
381 tx
= _mm_mul_pd(fscal
,dx30
);
382 ty
= _mm_mul_pd(fscal
,dy30
);
383 tz
= _mm_mul_pd(fscal
,dz30
);
385 /* Update vectorial force */
386 fix3
= _mm_add_pd(fix3
,tx
);
387 fiy3
= _mm_add_pd(fiy3
,ty
);
388 fiz3
= _mm_add_pd(fiz3
,tz
);
390 fjx0
= _mm_add_pd(fjx0
,tx
);
391 fjy0
= _mm_add_pd(fjy0
,ty
);
392 fjz0
= _mm_add_pd(fjz0
,tz
);
/* fjx0/fjy0/fjz0 hold the sum of the four site contributions for both j
 * particles; they are passed on with the two j force addresses. */
394 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
);
396 /* Inner loop uses 164 flops */
/* Single-j-particle section: same arithmetic as the loop body, but using
 * the 1ptr/_mm_load_sd loads for particle A only.
 * NOTE(review): the statement that guards this section and the assignment
 * of jnrA (original lines 397-402) are not visible in this listing. */
403 j_coord_offsetA
= DIM
*jnrA
;
405 /* load j atom coordinates */
406 gmx_mm_load_1rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
409 /* Calculate displacement vector */
410 dx00
= _mm_sub_pd(ix0
,jx0
);
411 dy00
= _mm_sub_pd(iy0
,jy0
);
412 dz00
= _mm_sub_pd(iz0
,jz0
);
413 dx10
= _mm_sub_pd(ix1
,jx0
);
414 dy10
= _mm_sub_pd(iy1
,jy0
);
415 dz10
= _mm_sub_pd(iz1
,jz0
);
416 dx20
= _mm_sub_pd(ix2
,jx0
);
417 dy20
= _mm_sub_pd(iy2
,jy0
);
418 dz20
= _mm_sub_pd(iz2
,jz0
);
419 dx30
= _mm_sub_pd(ix3
,jx0
);
420 dy30
= _mm_sub_pd(iy3
,jy0
);
421 dz30
= _mm_sub_pd(iz3
,jz0
);
423 /* Calculate squared distance and things based on it */
424 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
425 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
426 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
427 rsq30
= gmx_mm_calc_rsq_pd(dx30
,dy30
,dz30
);
429 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
430 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
431 rinv30
= gmx_mm_invsqrt_pd(rsq30
);
433 rinvsq00
= gmx_mm_inv_pd(rsq00
);
435 /* Load parameters for j particles */
436 jq0
= _mm_load_sd(charge
+jnrA
+0);
437 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
439 fjx0
= _mm_setzero_pd();
440 fjy0
= _mm_setzero_pd();
441 fjz0
= _mm_setzero_pd();
443 /**************************
444 * CALCULATE INTERACTIONS *
445 **************************/
447 /* Compute parameters for interactions between i and j atoms */
448 gmx_mm_load_1pair_swizzle_pd(vdwparam
+vdwioffset0
+vdwjidx0A
,&c6_00
,&c12_00
);
450 /* LENNARD-JONES DISPERSION/REPULSION */
452 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
453 vvdw6
= _mm_mul_pd(c6_00
,rinvsix
);
454 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
455 vvdw
= _mm_sub_pd( _mm_mul_pd(vvdw12
,one_twelfth
) , _mm_mul_pd(vvdw6
,one_sixth
) );
456 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
458 /* Update potential sum for this i atom from the interaction with this j atom. */
/* Keep the low lane and zero the high lane so that only the single j
 * particle contributes to the sum. */
459 vvdw
= _mm_unpacklo_pd(vvdw
,_mm_setzero_pd());
460 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
464 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
466 /* Calculate temporary vectorial force */
467 tx
= _mm_mul_pd(fscal
,dx00
);
468 ty
= _mm_mul_pd(fscal
,dy00
);
469 tz
= _mm_mul_pd(fscal
,dz00
);
471 /* Update vectorial force */
472 fix0
= _mm_add_pd(fix0
,tx
);
473 fiy0
= _mm_add_pd(fiy0
,ty
);
474 fiz0
= _mm_add_pd(fiz0
,tz
);
476 fjx0
= _mm_add_pd(fjx0
,tx
);
477 fjy0
= _mm_add_pd(fjy0
,ty
);
478 fjz0
= _mm_add_pd(fjz0
,tz
);
480 /**************************
481 * CALCULATE INTERACTIONS *
482 **************************/
484 r10
= _mm_mul_pd(rsq10
,rinv10
);
486 /* Compute parameters for interactions between i and j atoms */
487 qq10
= _mm_mul_pd(iq1
,jq0
);
489 /* Calculate table index by multiplying r with table scale and truncate to integer */
490 rt
= _mm_mul_pd(r10
,vftabscale
);
491 vfitab
= _mm_cvttpd_epi32(rt
);
492 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
493 vfitab
= _mm_slli_epi32(vfitab
,2);
495 /* CUBIC SPLINE TABLE ELECTROSTATICS */
496 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
/* Only one table row is loaded in this section; the second transpose
 * operand is zero instead of a row for particle B. */
497 F
= _mm_setzero_pd();
498 GMX_MM_TRANSPOSE2_PD(Y
,F
);
499 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
500 H
= _mm_setzero_pd();
501 GMX_MM_TRANSPOSE2_PD(G
,H
);
502 Heps
= _mm_mul_pd(vfeps
,H
);
503 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
504 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
505 velec
= _mm_mul_pd(qq10
,VV
);
506 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
507 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq10
,FF
),_mm_mul_pd(vftabscale
,rinv10
)));
509 /* Update potential sum for this i atom from the interaction with this j atom. */
510 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
511 velecsum
= _mm_add_pd(velecsum
,velec
);
515 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
517 /* Calculate temporary vectorial force */
518 tx
= _mm_mul_pd(fscal
,dx10
);
519 ty
= _mm_mul_pd(fscal
,dy10
);
520 tz
= _mm_mul_pd(fscal
,dz10
);
522 /* Update vectorial force */
523 fix1
= _mm_add_pd(fix1
,tx
);
524 fiy1
= _mm_add_pd(fiy1
,ty
);
525 fiz1
= _mm_add_pd(fiz1
,tz
);
527 fjx0
= _mm_add_pd(fjx0
,tx
);
528 fjy0
= _mm_add_pd(fjy0
,ty
);
529 fjz0
= _mm_add_pd(fjz0
,tz
);
531 /**************************
532 * CALCULATE INTERACTIONS *
533 **************************/
535 r20
= _mm_mul_pd(rsq20
,rinv20
);
537 /* Compute parameters for interactions between i and j atoms */
538 qq20
= _mm_mul_pd(iq2
,jq0
);
540 /* Calculate table index by multiplying r with table scale and truncate to integer */
541 rt
= _mm_mul_pd(r20
,vftabscale
);
542 vfitab
= _mm_cvttpd_epi32(rt
);
543 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
544 vfitab
= _mm_slli_epi32(vfitab
,2);
546 /* CUBIC SPLINE TABLE ELECTROSTATICS */
547 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
548 F
= _mm_setzero_pd();
549 GMX_MM_TRANSPOSE2_PD(Y
,F
);
550 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
551 H
= _mm_setzero_pd();
552 GMX_MM_TRANSPOSE2_PD(G
,H
);
553 Heps
= _mm_mul_pd(vfeps
,H
);
554 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
555 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
556 velec
= _mm_mul_pd(qq20
,VV
);
557 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
558 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq20
,FF
),_mm_mul_pd(vftabscale
,rinv20
)));
560 /* Update potential sum for this i atom from the interaction with this j atom. */
561 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
562 velecsum
= _mm_add_pd(velecsum
,velec
);
566 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
568 /* Calculate temporary vectorial force */
569 tx
= _mm_mul_pd(fscal
,dx20
);
570 ty
= _mm_mul_pd(fscal
,dy20
);
571 tz
= _mm_mul_pd(fscal
,dz20
);
573 /* Update vectorial force */
574 fix2
= _mm_add_pd(fix2
,tx
);
575 fiy2
= _mm_add_pd(fiy2
,ty
);
576 fiz2
= _mm_add_pd(fiz2
,tz
);
578 fjx0
= _mm_add_pd(fjx0
,tx
);
579 fjy0
= _mm_add_pd(fjy0
,ty
);
580 fjz0
= _mm_add_pd(fjz0
,tz
);
582 /**************************
583 * CALCULATE INTERACTIONS *
584 **************************/
586 r30
= _mm_mul_pd(rsq30
,rinv30
);
588 /* Compute parameters for interactions between i and j atoms */
589 qq30
= _mm_mul_pd(iq3
,jq0
);
591 /* Calculate table index by multiplying r with table scale and truncate to integer */
592 rt
= _mm_mul_pd(r30
,vftabscale
);
593 vfitab
= _mm_cvttpd_epi32(rt
);
594 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
595 vfitab
= _mm_slli_epi32(vfitab
,2);
597 /* CUBIC SPLINE TABLE ELECTROSTATICS */
598 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
599 F
= _mm_setzero_pd();
600 GMX_MM_TRANSPOSE2_PD(Y
,F
);
601 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
602 H
= _mm_setzero_pd();
603 GMX_MM_TRANSPOSE2_PD(G
,H
);
604 Heps
= _mm_mul_pd(vfeps
,H
);
605 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
606 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
607 velec
= _mm_mul_pd(qq30
,VV
);
608 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
609 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq30
,FF
),_mm_mul_pd(vftabscale
,rinv30
)));
611 /* Update potential sum for this i atom from the interaction with this j atom. */
612 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
613 velecsum
= _mm_add_pd(velecsum
,velec
);
617 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
619 /* Calculate temporary vectorial force */
620 tx
= _mm_mul_pd(fscal
,dx30
);
621 ty
= _mm_mul_pd(fscal
,dy30
);
622 tz
= _mm_mul_pd(fscal
,dz30
);
624 /* Update vectorial force */
625 fix3
= _mm_add_pd(fix3
,tx
);
626 fiy3
= _mm_add_pd(fiy3
,ty
);
627 fiz3
= _mm_add_pd(fiz3
,tz
);
629 fjx0
= _mm_add_pd(fjx0
,tx
);
630 fjy0
= _mm_add_pd(fjy0
,ty
);
631 fjz0
= _mm_add_pd(fjz0
,tz
);
633 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
);
635 /* Inner loop uses 164 flops */
638 /* End of innermost loop */
/* Hand the accumulated forces of the four i sites to the helper, together
 * with the i force address and the shift-force address for this entry. */
640 gmx_mm_update_iforce_4atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
641 f
+i_coord_offset
,fshift
+i_shift_offset
);
644 /* Update potential energies */
645 gmx_mm_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
646 gmx_mm_update_1pot_pd(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
648 /* Increment number of inner iterations */
649 inneriter
+= j_index_end
- j_index_start
;
651 /* Outer loop uses 26 flops */
654 /* Increment number of outer iterations */
657 /* Update outer/inner flops */
/* Flop estimate: 26 per outer iteration plus 164 per j particle, matching
 * the per-loop figures quoted in the comments above. */
659 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4_VF
,outeriter
*26 + inneriter
*164);
662 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sse2_double
663 * Electrostatics interaction: CubicSplineTable
664 * VdW interaction: LennardJones
665 * Geometry: Water4-Particle
666 * Calculate force/pot: Force
669 nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sse2_double
670 (t_nblist
* gmx_restrict nlist
,
671 rvec
* gmx_restrict xx
,
672 rvec
* gmx_restrict ff
,
673 t_forcerec
* gmx_restrict fr
,
674 t_mdatoms
* gmx_restrict mdatoms
,
675 nb_kernel_data_t
* gmx_restrict kernel_data
,
676 t_nrnb
* gmx_restrict nrnb
)
678 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
679 * just 0 for non-waters.
680 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
681 * jnr indices corresponding to data put in the four positions in the SIMD register.
683 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
684 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
686 int j_coord_offsetA
,j_coord_offsetB
;
687 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
689 real
*shiftvec
,*fshift
,*x
,*f
;
690 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
692 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
694 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
696 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
698 __m128d ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
699 int vdwjidx0A
,vdwjidx0B
;
700 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
701 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
702 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
703 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
704 __m128d dx30
,dy30
,dz30
,rsq30
,rinv30
,rinvsq30
,r30
,qq30
,c6_30
,c12_30
;
705 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
708 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
711 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
712 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
714 __m128i ifour
= _mm_set1_epi32(4);
715 __m128d rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
;
717 __m128d dummy_mask
,cutoff_mask
;
718 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
719 __m128d one
= _mm_set1_pd(1.0);
720 __m128d two
= _mm_set1_pd(2.0);
726 jindex
= nlist
->jindex
;
728 shiftidx
= nlist
->shift
;
730 shiftvec
= fr
->shift_vec
[0];
731 fshift
= fr
->fshift
[0];
732 facel
= _mm_set1_pd(fr
->epsfac
);
733 charge
= mdatoms
->chargeA
;
734 nvdwtype
= fr
->ntype
;
736 vdwtype
= mdatoms
->typeA
;
738 vftab
= kernel_data
->table_elec
->data
;
739 vftabscale
= _mm_set1_pd(kernel_data
->table_elec
->scale
);
741 /* Setup water-specific parameters */
742 inr
= nlist
->iinr
[0];
743 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
744 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
745 iq3
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+3]));
746 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
748 /* Avoid stupid compiler warnings */
756 /* Start outer loop over neighborlists */
757 for(iidx
=0; iidx
<nri
; iidx
++)
759 /* Load shift vector for this list */
760 i_shift_offset
= DIM
*shiftidx
[iidx
];
762 /* Load limits for loop over neighbors */
763 j_index_start
= jindex
[iidx
];
764 j_index_end
= jindex
[iidx
+1];
766 /* Get outer coordinate index */
768 i_coord_offset
= DIM
*inr
;
770 /* Load i particle coords and add shift vector */
771 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
772 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
774 fix0
= _mm_setzero_pd();
775 fiy0
= _mm_setzero_pd();
776 fiz0
= _mm_setzero_pd();
777 fix1
= _mm_setzero_pd();
778 fiy1
= _mm_setzero_pd();
779 fiz1
= _mm_setzero_pd();
780 fix2
= _mm_setzero_pd();
781 fiy2
= _mm_setzero_pd();
782 fiz2
= _mm_setzero_pd();
783 fix3
= _mm_setzero_pd();
784 fiy3
= _mm_setzero_pd();
785 fiz3
= _mm_setzero_pd();
787 /* Start inner kernel loop */
788 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
791 /* Get j neighbor index, and coordinate index */
794 j_coord_offsetA
= DIM
*jnrA
;
795 j_coord_offsetB
= DIM
*jnrB
;
797 /* load j atom coordinates */
798 gmx_mm_load_1rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
801 /* Calculate displacement vector */
802 dx00
= _mm_sub_pd(ix0
,jx0
);
803 dy00
= _mm_sub_pd(iy0
,jy0
);
804 dz00
= _mm_sub_pd(iz0
,jz0
);
805 dx10
= _mm_sub_pd(ix1
,jx0
);
806 dy10
= _mm_sub_pd(iy1
,jy0
);
807 dz10
= _mm_sub_pd(iz1
,jz0
);
808 dx20
= _mm_sub_pd(ix2
,jx0
);
809 dy20
= _mm_sub_pd(iy2
,jy0
);
810 dz20
= _mm_sub_pd(iz2
,jz0
);
811 dx30
= _mm_sub_pd(ix3
,jx0
);
812 dy30
= _mm_sub_pd(iy3
,jy0
);
813 dz30
= _mm_sub_pd(iz3
,jz0
);
815 /* Calculate squared distance and things based on it */
816 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
817 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
818 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
819 rsq30
= gmx_mm_calc_rsq_pd(dx30
,dy30
,dz30
);
821 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
822 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
823 rinv30
= gmx_mm_invsqrt_pd(rsq30
);
825 rinvsq00
= gmx_mm_inv_pd(rsq00
);
827 /* Load parameters for j particles */
828 jq0
= gmx_mm_load_2real_swizzle_pd(charge
+jnrA
+0,charge
+jnrB
+0);
829 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
830 vdwjidx0B
= 2*vdwtype
[jnrB
+0];
832 fjx0
= _mm_setzero_pd();
833 fjy0
= _mm_setzero_pd();
834 fjz0
= _mm_setzero_pd();
836 /**************************
837 * CALCULATE INTERACTIONS *
838 **************************/
840 /* Compute parameters for interactions between i and j atoms */
841 gmx_mm_load_2pair_swizzle_pd(vdwparam
+vdwioffset0
+vdwjidx0A
,
842 vdwparam
+vdwioffset0
+vdwjidx0B
,&c6_00
,&c12_00
);
844 /* LENNARD-JONES DISPERSION/REPULSION */
846 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
847 fvdw
= _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00
,rinvsix
),c6_00
),_mm_mul_pd(rinvsix
,rinvsq00
));
851 /* Calculate temporary vectorial force */
852 tx
= _mm_mul_pd(fscal
,dx00
);
853 ty
= _mm_mul_pd(fscal
,dy00
);
854 tz
= _mm_mul_pd(fscal
,dz00
);
856 /* Update vectorial force */
857 fix0
= _mm_add_pd(fix0
,tx
);
858 fiy0
= _mm_add_pd(fiy0
,ty
);
859 fiz0
= _mm_add_pd(fiz0
,tz
);
861 fjx0
= _mm_add_pd(fjx0
,tx
);
862 fjy0
= _mm_add_pd(fjy0
,ty
);
863 fjz0
= _mm_add_pd(fjz0
,tz
);
865 /**************************
866 * CALCULATE INTERACTIONS *
867 **************************/
869 r10
= _mm_mul_pd(rsq10
,rinv10
);
871 /* Compute parameters for interactions between i and j atoms */
872 qq10
= _mm_mul_pd(iq1
,jq0
);
874 /* Calculate table index by multiplying r with table scale and truncate to integer */
875 rt
= _mm_mul_pd(r10
,vftabscale
);
876 vfitab
= _mm_cvttpd_epi32(rt
);
877 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
878 vfitab
= _mm_slli_epi32(vfitab
,2);
880 /* CUBIC SPLINE TABLE ELECTROSTATICS */
881 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
882 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
883 GMX_MM_TRANSPOSE2_PD(Y
,F
);
884 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
885 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
886 GMX_MM_TRANSPOSE2_PD(G
,H
);
887 Heps
= _mm_mul_pd(vfeps
,H
);
888 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
889 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
890 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq10
,FF
),_mm_mul_pd(vftabscale
,rinv10
)));
894 /* Calculate temporary vectorial force */
895 tx
= _mm_mul_pd(fscal
,dx10
);
896 ty
= _mm_mul_pd(fscal
,dy10
);
897 tz
= _mm_mul_pd(fscal
,dz10
);
899 /* Update vectorial force */
900 fix1
= _mm_add_pd(fix1
,tx
);
901 fiy1
= _mm_add_pd(fiy1
,ty
);
902 fiz1
= _mm_add_pd(fiz1
,tz
);
904 fjx0
= _mm_add_pd(fjx0
,tx
);
905 fjy0
= _mm_add_pd(fjy0
,ty
);
906 fjz0
= _mm_add_pd(fjz0
,tz
);
908 /**************************
909 * CALCULATE INTERACTIONS *
910 **************************/
912 r20
= _mm_mul_pd(rsq20
,rinv20
);
914 /* Compute parameters for interactions between i and j atoms */
915 qq20
= _mm_mul_pd(iq2
,jq0
);
917 /* Calculate table index by multiplying r with table scale and truncate to integer */
918 rt
= _mm_mul_pd(r20
,vftabscale
);
919 vfitab
= _mm_cvttpd_epi32(rt
);
920 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
921 vfitab
= _mm_slli_epi32(vfitab
,2);
923 /* CUBIC SPLINE TABLE ELECTROSTATICS */
924 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
925 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
926 GMX_MM_TRANSPOSE2_PD(Y
,F
);
927 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
928 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
929 GMX_MM_TRANSPOSE2_PD(G
,H
);
930 Heps
= _mm_mul_pd(vfeps
,H
);
931 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
932 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
933 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq20
,FF
),_mm_mul_pd(vftabscale
,rinv20
)));
937 /* Calculate temporary vectorial force */
938 tx
= _mm_mul_pd(fscal
,dx20
);
939 ty
= _mm_mul_pd(fscal
,dy20
);
940 tz
= _mm_mul_pd(fscal
,dz20
);
942 /* Update vectorial force */
943 fix2
= _mm_add_pd(fix2
,tx
);
944 fiy2
= _mm_add_pd(fiy2
,ty
);
945 fiz2
= _mm_add_pd(fiz2
,tz
);
947 fjx0
= _mm_add_pd(fjx0
,tx
);
948 fjy0
= _mm_add_pd(fjy0
,ty
);
949 fjz0
= _mm_add_pd(fjz0
,tz
);
951 /**************************
952 * CALCULATE INTERACTIONS *
953 **************************/
955 r30
= _mm_mul_pd(rsq30
,rinv30
);
957 /* Compute parameters for interactions between i and j atoms */
958 qq30
= _mm_mul_pd(iq3
,jq0
);
960 /* Calculate table index by multiplying r with table scale and truncate to integer */
961 rt
= _mm_mul_pd(r30
,vftabscale
);
962 vfitab
= _mm_cvttpd_epi32(rt
);
963 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
964 vfitab
= _mm_slli_epi32(vfitab
,2);
966 /* CUBIC SPLINE TABLE ELECTROSTATICS */
967 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
968 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
969 GMX_MM_TRANSPOSE2_PD(Y
,F
);
970 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
971 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
972 GMX_MM_TRANSPOSE2_PD(G
,H
);
973 Heps
= _mm_mul_pd(vfeps
,H
);
974 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
975 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
976 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq30
,FF
),_mm_mul_pd(vftabscale
,rinv30
)));
980 /* Calculate temporary vectorial force */
981 tx
= _mm_mul_pd(fscal
,dx30
);
982 ty
= _mm_mul_pd(fscal
,dy30
);
983 tz
= _mm_mul_pd(fscal
,dz30
);
985 /* Update vectorial force */
986 fix3
= _mm_add_pd(fix3
,tx
);
987 fiy3
= _mm_add_pd(fiy3
,ty
);
988 fiz3
= _mm_add_pd(fiz3
,tz
);
990 fjx0
= _mm_add_pd(fjx0
,tx
);
991 fjy0
= _mm_add_pd(fjy0
,ty
);
992 fjz0
= _mm_add_pd(fjz0
,tz
);
994 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
);
996 /* Inner loop uses 147 flops */
1003 j_coord_offsetA
= DIM
*jnrA
;
1005 /* load j atom coordinates */
1006 gmx_mm_load_1rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
1009 /* Calculate displacement vector */
1010 dx00
= _mm_sub_pd(ix0
,jx0
);
1011 dy00
= _mm_sub_pd(iy0
,jy0
);
1012 dz00
= _mm_sub_pd(iz0
,jz0
);
1013 dx10
= _mm_sub_pd(ix1
,jx0
);
1014 dy10
= _mm_sub_pd(iy1
,jy0
);
1015 dz10
= _mm_sub_pd(iz1
,jz0
);
1016 dx20
= _mm_sub_pd(ix2
,jx0
);
1017 dy20
= _mm_sub_pd(iy2
,jy0
);
1018 dz20
= _mm_sub_pd(iz2
,jz0
);
1019 dx30
= _mm_sub_pd(ix3
,jx0
);
1020 dy30
= _mm_sub_pd(iy3
,jy0
);
1021 dz30
= _mm_sub_pd(iz3
,jz0
);
1023 /* Calculate squared distance and things based on it */
1024 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
1025 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
1026 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
1027 rsq30
= gmx_mm_calc_rsq_pd(dx30
,dy30
,dz30
);
1029 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
1030 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
1031 rinv30
= gmx_mm_invsqrt_pd(rsq30
);
1033 rinvsq00
= gmx_mm_inv_pd(rsq00
);
1035 /* Load parameters for j particles */
1036 jq0
= _mm_load_sd(charge
+jnrA
+0);
1037 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
1039 fjx0
= _mm_setzero_pd();
1040 fjy0
= _mm_setzero_pd();
1041 fjz0
= _mm_setzero_pd();
1043 /**************************
1044 * CALCULATE INTERACTIONS *
1045 **************************/
1047 /* Compute parameters for interactions between i and j atoms */
1048 gmx_mm_load_1pair_swizzle_pd(vdwparam
+vdwioffset0
+vdwjidx0A
,&c6_00
,&c12_00
);
1050 /* LENNARD-JONES DISPERSION/REPULSION */
1052 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
1053 fvdw
= _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00
,rinvsix
),c6_00
),_mm_mul_pd(rinvsix
,rinvsq00
));
1057 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1059 /* Calculate temporary vectorial force */
1060 tx
= _mm_mul_pd(fscal
,dx00
);
1061 ty
= _mm_mul_pd(fscal
,dy00
);
1062 tz
= _mm_mul_pd(fscal
,dz00
);
1064 /* Update vectorial force */
1065 fix0
= _mm_add_pd(fix0
,tx
);
1066 fiy0
= _mm_add_pd(fiy0
,ty
);
1067 fiz0
= _mm_add_pd(fiz0
,tz
);
1069 fjx0
= _mm_add_pd(fjx0
,tx
);
1070 fjy0
= _mm_add_pd(fjy0
,ty
);
1071 fjz0
= _mm_add_pd(fjz0
,tz
);
1073 /**************************
1074 * CALCULATE INTERACTIONS *
1075 **************************/
1077 r10
= _mm_mul_pd(rsq10
,rinv10
);
1079 /* Compute parameters for interactions between i and j atoms */
1080 qq10
= _mm_mul_pd(iq1
,jq0
);
1082 /* Calculate table index by multiplying r with table scale and truncate to integer */
1083 rt
= _mm_mul_pd(r10
,vftabscale
);
1084 vfitab
= _mm_cvttpd_epi32(rt
);
1085 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1086 vfitab
= _mm_slli_epi32(vfitab
,2);
1088 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1089 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1090 F
= _mm_setzero_pd();
1091 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1092 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1093 H
= _mm_setzero_pd();
1094 GMX_MM_TRANSPOSE2_PD(G
,H
);
1095 Heps
= _mm_mul_pd(vfeps
,H
);
1096 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1097 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1098 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq10
,FF
),_mm_mul_pd(vftabscale
,rinv10
)));
1102 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1104 /* Calculate temporary vectorial force */
1105 tx
= _mm_mul_pd(fscal
,dx10
);
1106 ty
= _mm_mul_pd(fscal
,dy10
);
1107 tz
= _mm_mul_pd(fscal
,dz10
);
1109 /* Update vectorial force */
1110 fix1
= _mm_add_pd(fix1
,tx
);
1111 fiy1
= _mm_add_pd(fiy1
,ty
);
1112 fiz1
= _mm_add_pd(fiz1
,tz
);
1114 fjx0
= _mm_add_pd(fjx0
,tx
);
1115 fjy0
= _mm_add_pd(fjy0
,ty
);
1116 fjz0
= _mm_add_pd(fjz0
,tz
);
1118 /**************************
1119 * CALCULATE INTERACTIONS *
1120 **************************/
1122 r20
= _mm_mul_pd(rsq20
,rinv20
);
1124 /* Compute parameters for interactions between i and j atoms */
1125 qq20
= _mm_mul_pd(iq2
,jq0
);
1127 /* Calculate table index by multiplying r with table scale and truncate to integer */
1128 rt
= _mm_mul_pd(r20
,vftabscale
);
1129 vfitab
= _mm_cvttpd_epi32(rt
);
1130 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1131 vfitab
= _mm_slli_epi32(vfitab
,2);
1133 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1134 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1135 F
= _mm_setzero_pd();
1136 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1137 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1138 H
= _mm_setzero_pd();
1139 GMX_MM_TRANSPOSE2_PD(G
,H
);
1140 Heps
= _mm_mul_pd(vfeps
,H
);
1141 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1142 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1143 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq20
,FF
),_mm_mul_pd(vftabscale
,rinv20
)));
1147 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1149 /* Calculate temporary vectorial force */
1150 tx
= _mm_mul_pd(fscal
,dx20
);
1151 ty
= _mm_mul_pd(fscal
,dy20
);
1152 tz
= _mm_mul_pd(fscal
,dz20
);
1154 /* Update vectorial force */
1155 fix2
= _mm_add_pd(fix2
,tx
);
1156 fiy2
= _mm_add_pd(fiy2
,ty
);
1157 fiz2
= _mm_add_pd(fiz2
,tz
);
1159 fjx0
= _mm_add_pd(fjx0
,tx
);
1160 fjy0
= _mm_add_pd(fjy0
,ty
);
1161 fjz0
= _mm_add_pd(fjz0
,tz
);
1163 /**************************
1164 * CALCULATE INTERACTIONS *
1165 **************************/
1167 r30
= _mm_mul_pd(rsq30
,rinv30
);
1169 /* Compute parameters for interactions between i and j atoms */
1170 qq30
= _mm_mul_pd(iq3
,jq0
);
1172 /* Calculate table index by multiplying r with table scale and truncate to integer */
1173 rt
= _mm_mul_pd(r30
,vftabscale
);
1174 vfitab
= _mm_cvttpd_epi32(rt
);
1175 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1176 vfitab
= _mm_slli_epi32(vfitab
,2);
1178 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1179 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1180 F
= _mm_setzero_pd();
1181 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1182 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1183 H
= _mm_setzero_pd();
1184 GMX_MM_TRANSPOSE2_PD(G
,H
);
1185 Heps
= _mm_mul_pd(vfeps
,H
);
1186 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1187 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1188 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq30
,FF
),_mm_mul_pd(vftabscale
,rinv30
)));
1192 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1194 /* Calculate temporary vectorial force */
1195 tx
= _mm_mul_pd(fscal
,dx30
);
1196 ty
= _mm_mul_pd(fscal
,dy30
);
1197 tz
= _mm_mul_pd(fscal
,dz30
);
1199 /* Update vectorial force */
1200 fix3
= _mm_add_pd(fix3
,tx
);
1201 fiy3
= _mm_add_pd(fiy3
,ty
);
1202 fiz3
= _mm_add_pd(fiz3
,tz
);
1204 fjx0
= _mm_add_pd(fjx0
,tx
);
1205 fjy0
= _mm_add_pd(fjy0
,ty
);
1206 fjz0
= _mm_add_pd(fjz0
,tz
);
1208 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
);
1210 /* Inner loop uses 147 flops */
1213 /* End of innermost loop */
1215 gmx_mm_update_iforce_4atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
1216 f
+i_coord_offset
,fshift
+i_shift_offset
);
1218 /* Increment number of inner iterations */
1219 inneriter
+= j_index_end
- j_index_start
;
1221 /* Outer loop uses 24 flops */
1224 /* Increment number of outer iterations */
1227 /* Update outer/inner flops */
1229 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4_F
,outeriter
*24 + inneriter
*147);