2 * Note: this file was generated by the Gromacs sse2_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_double.h"
34 #include "kernelutil_x86_sse2_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sse2_double
38 * Electrostatics interaction: Ewald
39 * VdW interaction: LennardJones
40 * Geometry: Water4-Particle
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sse2_double
45 (t_nblist
* gmx_restrict nlist
,
46 rvec
* gmx_restrict xx
,
47 rvec
* gmx_restrict ff
,
48 t_forcerec
* gmx_restrict fr
,
49 t_mdatoms
* gmx_restrict mdatoms
,
50 nb_kernel_data_t
* gmx_restrict kernel_data
,
51 t_nrnb
* gmx_restrict nrnb
)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
59 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
61 int j_coord_offsetA
,j_coord_offsetB
;
62 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
64 real
*shiftvec
,*fshift
,*x
,*f
;
65 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
67 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
69 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
71 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
73 __m128d ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
74 int vdwjidx0A
,vdwjidx0B
;
75 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
76 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
77 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
78 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
79 __m128d dx30
,dy30
,dz30
,rsq30
,rinv30
,rinvsq30
,r30
,qq30
,c6_30
,c12_30
;
80 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
83 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
86 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
87 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
89 __m128d ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
91 __m128d dummy_mask
,cutoff_mask
;
92 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
93 __m128d one
= _mm_set1_pd(1.0);
94 __m128d two
= _mm_set1_pd(2.0);
100 jindex
= nlist
->jindex
;
102 shiftidx
= nlist
->shift
;
104 shiftvec
= fr
->shift_vec
[0];
105 fshift
= fr
->fshift
[0];
106 facel
= _mm_set1_pd(fr
->epsfac
);
107 charge
= mdatoms
->chargeA
;
108 nvdwtype
= fr
->ntype
;
110 vdwtype
= mdatoms
->typeA
;
112 sh_ewald
= _mm_set1_pd(fr
->ic
->sh_ewald
);
113 ewtab
= fr
->ic
->tabq_coul_FDV0
;
114 ewtabscale
= _mm_set1_pd(fr
->ic
->tabq_scale
);
115 ewtabhalfspace
= _mm_set1_pd(0.5/fr
->ic
->tabq_scale
);
117 /* Setup water-specific parameters */
118 inr
= nlist
->iinr
[0];
119 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
120 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
121 iq3
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+3]));
122 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
124 /* Avoid stupid compiler warnings */
132 /* Start outer loop over neighborlists */
133 for(iidx
=0; iidx
<nri
; iidx
++)
135 /* Load shift vector for this list */
136 i_shift_offset
= DIM
*shiftidx
[iidx
];
138 /* Load limits for loop over neighbors */
139 j_index_start
= jindex
[iidx
];
140 j_index_end
= jindex
[iidx
+1];
142 /* Get outer coordinate index */
144 i_coord_offset
= DIM
*inr
;
146 /* Load i particle coords and add shift vector */
147 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
148 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
150 fix0
= _mm_setzero_pd();
151 fiy0
= _mm_setzero_pd();
152 fiz0
= _mm_setzero_pd();
153 fix1
= _mm_setzero_pd();
154 fiy1
= _mm_setzero_pd();
155 fiz1
= _mm_setzero_pd();
156 fix2
= _mm_setzero_pd();
157 fiy2
= _mm_setzero_pd();
158 fiz2
= _mm_setzero_pd();
159 fix3
= _mm_setzero_pd();
160 fiy3
= _mm_setzero_pd();
161 fiz3
= _mm_setzero_pd();
163 /* Reset potential sums */
164 velecsum
= _mm_setzero_pd();
165 vvdwsum
= _mm_setzero_pd();
167 /* Start inner kernel loop */
168 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
171 /* Get j neighbor index, and coordinate index */
174 j_coord_offsetA
= DIM
*jnrA
;
175 j_coord_offsetB
= DIM
*jnrB
;
177 /* load j atom coordinates */
178 gmx_mm_load_1rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
181 /* Calculate displacement vector */
182 dx00
= _mm_sub_pd(ix0
,jx0
);
183 dy00
= _mm_sub_pd(iy0
,jy0
);
184 dz00
= _mm_sub_pd(iz0
,jz0
);
185 dx10
= _mm_sub_pd(ix1
,jx0
);
186 dy10
= _mm_sub_pd(iy1
,jy0
);
187 dz10
= _mm_sub_pd(iz1
,jz0
);
188 dx20
= _mm_sub_pd(ix2
,jx0
);
189 dy20
= _mm_sub_pd(iy2
,jy0
);
190 dz20
= _mm_sub_pd(iz2
,jz0
);
191 dx30
= _mm_sub_pd(ix3
,jx0
);
192 dy30
= _mm_sub_pd(iy3
,jy0
);
193 dz30
= _mm_sub_pd(iz3
,jz0
);
195 /* Calculate squared distance and things based on it */
196 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
197 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
198 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
199 rsq30
= gmx_mm_calc_rsq_pd(dx30
,dy30
,dz30
);
201 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
202 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
203 rinv30
= gmx_mm_invsqrt_pd(rsq30
);
205 rinvsq00
= gmx_mm_inv_pd(rsq00
);
206 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
207 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
208 rinvsq30
= _mm_mul_pd(rinv30
,rinv30
);
210 /* Load parameters for j particles */
211 jq0
= gmx_mm_load_2real_swizzle_pd(charge
+jnrA
+0,charge
+jnrB
+0);
212 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
213 vdwjidx0B
= 2*vdwtype
[jnrB
+0];
215 fjx0
= _mm_setzero_pd();
216 fjy0
= _mm_setzero_pd();
217 fjz0
= _mm_setzero_pd();
219 /**************************
220 * CALCULATE INTERACTIONS *
221 **************************/
223 /* Compute parameters for interactions between i and j atoms */
224 gmx_mm_load_2pair_swizzle_pd(vdwparam
+vdwioffset0
+vdwjidx0A
,
225 vdwparam
+vdwioffset0
+vdwjidx0B
,&c6_00
,&c12_00
);
227 /* LENNARD-JONES DISPERSION/REPULSION */
229 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
230 vvdw6
= _mm_mul_pd(c6_00
,rinvsix
);
231 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
232 vvdw
= _mm_sub_pd( _mm_mul_pd(vvdw12
,one_twelfth
) , _mm_mul_pd(vvdw6
,one_sixth
) );
233 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
235 /* Update potential sum for this i atom from the interaction with this j atom. */
236 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
240 /* Calculate temporary vectorial force */
241 tx
= _mm_mul_pd(fscal
,dx00
);
242 ty
= _mm_mul_pd(fscal
,dy00
);
243 tz
= _mm_mul_pd(fscal
,dz00
);
245 /* Update vectorial force */
246 fix0
= _mm_add_pd(fix0
,tx
);
247 fiy0
= _mm_add_pd(fiy0
,ty
);
248 fiz0
= _mm_add_pd(fiz0
,tz
);
250 fjx0
= _mm_add_pd(fjx0
,tx
);
251 fjy0
= _mm_add_pd(fjy0
,ty
);
252 fjz0
= _mm_add_pd(fjz0
,tz
);
254 /**************************
255 * CALCULATE INTERACTIONS *
256 **************************/
258 r10
= _mm_mul_pd(rsq10
,rinv10
);
260 /* Compute parameters for interactions between i and j atoms */
261 qq10
= _mm_mul_pd(iq1
,jq0
);
263 /* EWALD ELECTROSTATICS */
265 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
266 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
267 ewitab
= _mm_cvttpd_epi32(ewrt
);
268 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
269 ewitab
= _mm_slli_epi32(ewitab
,2);
270 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
271 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
272 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
273 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
274 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
275 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
276 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
277 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
278 velec
= _mm_mul_pd(qq10
,_mm_sub_pd(rinv10
,velec
));
279 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
281 /* Update potential sum for this i atom from the interaction with this j atom. */
282 velecsum
= _mm_add_pd(velecsum
,velec
);
286 /* Calculate temporary vectorial force */
287 tx
= _mm_mul_pd(fscal
,dx10
);
288 ty
= _mm_mul_pd(fscal
,dy10
);
289 tz
= _mm_mul_pd(fscal
,dz10
);
291 /* Update vectorial force */
292 fix1
= _mm_add_pd(fix1
,tx
);
293 fiy1
= _mm_add_pd(fiy1
,ty
);
294 fiz1
= _mm_add_pd(fiz1
,tz
);
296 fjx0
= _mm_add_pd(fjx0
,tx
);
297 fjy0
= _mm_add_pd(fjy0
,ty
);
298 fjz0
= _mm_add_pd(fjz0
,tz
);
300 /**************************
301 * CALCULATE INTERACTIONS *
302 **************************/
304 r20
= _mm_mul_pd(rsq20
,rinv20
);
306 /* Compute parameters for interactions between i and j atoms */
307 qq20
= _mm_mul_pd(iq2
,jq0
);
309 /* EWALD ELECTROSTATICS */
311 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
312 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
313 ewitab
= _mm_cvttpd_epi32(ewrt
);
314 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
315 ewitab
= _mm_slli_epi32(ewitab
,2);
316 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
317 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
318 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
319 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
320 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
321 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
322 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
323 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
324 velec
= _mm_mul_pd(qq20
,_mm_sub_pd(rinv20
,velec
));
325 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
327 /* Update potential sum for this i atom from the interaction with this j atom. */
328 velecsum
= _mm_add_pd(velecsum
,velec
);
332 /* Calculate temporary vectorial force */
333 tx
= _mm_mul_pd(fscal
,dx20
);
334 ty
= _mm_mul_pd(fscal
,dy20
);
335 tz
= _mm_mul_pd(fscal
,dz20
);
337 /* Update vectorial force */
338 fix2
= _mm_add_pd(fix2
,tx
);
339 fiy2
= _mm_add_pd(fiy2
,ty
);
340 fiz2
= _mm_add_pd(fiz2
,tz
);
342 fjx0
= _mm_add_pd(fjx0
,tx
);
343 fjy0
= _mm_add_pd(fjy0
,ty
);
344 fjz0
= _mm_add_pd(fjz0
,tz
);
346 /**************************
347 * CALCULATE INTERACTIONS *
348 **************************/
350 r30
= _mm_mul_pd(rsq30
,rinv30
);
352 /* Compute parameters for interactions between i and j atoms */
353 qq30
= _mm_mul_pd(iq3
,jq0
);
355 /* EWALD ELECTROSTATICS */
357 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
358 ewrt
= _mm_mul_pd(r30
,ewtabscale
);
359 ewitab
= _mm_cvttpd_epi32(ewrt
);
360 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
361 ewitab
= _mm_slli_epi32(ewitab
,2);
362 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
363 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
364 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
365 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
366 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
367 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
368 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
369 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
370 velec
= _mm_mul_pd(qq30
,_mm_sub_pd(rinv30
,velec
));
371 felec
= _mm_mul_pd(_mm_mul_pd(qq30
,rinv30
),_mm_sub_pd(rinvsq30
,felec
));
373 /* Update potential sum for this i atom from the interaction with this j atom. */
374 velecsum
= _mm_add_pd(velecsum
,velec
);
378 /* Calculate temporary vectorial force */
379 tx
= _mm_mul_pd(fscal
,dx30
);
380 ty
= _mm_mul_pd(fscal
,dy30
);
381 tz
= _mm_mul_pd(fscal
,dz30
);
383 /* Update vectorial force */
384 fix3
= _mm_add_pd(fix3
,tx
);
385 fiy3
= _mm_add_pd(fiy3
,ty
);
386 fiz3
= _mm_add_pd(fiz3
,tz
);
388 fjx0
= _mm_add_pd(fjx0
,tx
);
389 fjy0
= _mm_add_pd(fjy0
,ty
);
390 fjz0
= _mm_add_pd(fjz0
,tz
);
392 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
);
394 /* Inner loop uses 158 flops */
401 j_coord_offsetA
= DIM
*jnrA
;
403 /* load j atom coordinates */
404 gmx_mm_load_1rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
407 /* Calculate displacement vector */
408 dx00
= _mm_sub_pd(ix0
,jx0
);
409 dy00
= _mm_sub_pd(iy0
,jy0
);
410 dz00
= _mm_sub_pd(iz0
,jz0
);
411 dx10
= _mm_sub_pd(ix1
,jx0
);
412 dy10
= _mm_sub_pd(iy1
,jy0
);
413 dz10
= _mm_sub_pd(iz1
,jz0
);
414 dx20
= _mm_sub_pd(ix2
,jx0
);
415 dy20
= _mm_sub_pd(iy2
,jy0
);
416 dz20
= _mm_sub_pd(iz2
,jz0
);
417 dx30
= _mm_sub_pd(ix3
,jx0
);
418 dy30
= _mm_sub_pd(iy3
,jy0
);
419 dz30
= _mm_sub_pd(iz3
,jz0
);
421 /* Calculate squared distance and things based on it */
422 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
423 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
424 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
425 rsq30
= gmx_mm_calc_rsq_pd(dx30
,dy30
,dz30
);
427 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
428 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
429 rinv30
= gmx_mm_invsqrt_pd(rsq30
);
431 rinvsq00
= gmx_mm_inv_pd(rsq00
);
432 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
433 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
434 rinvsq30
= _mm_mul_pd(rinv30
,rinv30
);
436 /* Load parameters for j particles */
437 jq0
= _mm_load_sd(charge
+jnrA
+0);
438 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
440 fjx0
= _mm_setzero_pd();
441 fjy0
= _mm_setzero_pd();
442 fjz0
= _mm_setzero_pd();
444 /**************************
445 * CALCULATE INTERACTIONS *
446 **************************/
448 /* Compute parameters for interactions between i and j atoms */
449 gmx_mm_load_1pair_swizzle_pd(vdwparam
+vdwioffset0
+vdwjidx0A
,&c6_00
,&c12_00
);
451 /* LENNARD-JONES DISPERSION/REPULSION */
453 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
454 vvdw6
= _mm_mul_pd(c6_00
,rinvsix
);
455 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
456 vvdw
= _mm_sub_pd( _mm_mul_pd(vvdw12
,one_twelfth
) , _mm_mul_pd(vvdw6
,one_sixth
) );
457 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
459 /* Update potential sum for this i atom from the interaction with this j atom. */
460 vvdw
= _mm_unpacklo_pd(vvdw
,_mm_setzero_pd());
461 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
465 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
467 /* Calculate temporary vectorial force */
468 tx
= _mm_mul_pd(fscal
,dx00
);
469 ty
= _mm_mul_pd(fscal
,dy00
);
470 tz
= _mm_mul_pd(fscal
,dz00
);
472 /* Update vectorial force */
473 fix0
= _mm_add_pd(fix0
,tx
);
474 fiy0
= _mm_add_pd(fiy0
,ty
);
475 fiz0
= _mm_add_pd(fiz0
,tz
);
477 fjx0
= _mm_add_pd(fjx0
,tx
);
478 fjy0
= _mm_add_pd(fjy0
,ty
);
479 fjz0
= _mm_add_pd(fjz0
,tz
);
481 /**************************
482 * CALCULATE INTERACTIONS *
483 **************************/
485 r10
= _mm_mul_pd(rsq10
,rinv10
);
487 /* Compute parameters for interactions between i and j atoms */
488 qq10
= _mm_mul_pd(iq1
,jq0
);
490 /* EWALD ELECTROSTATICS */
492 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
493 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
494 ewitab
= _mm_cvttpd_epi32(ewrt
);
495 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
496 ewitab
= _mm_slli_epi32(ewitab
,2);
497 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
498 ewtabD
= _mm_setzero_pd();
499 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
500 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
501 ewtabFn
= _mm_setzero_pd();
502 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
503 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
504 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
505 velec
= _mm_mul_pd(qq10
,_mm_sub_pd(rinv10
,velec
));
506 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
508 /* Update potential sum for this i atom from the interaction with this j atom. */
509 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
510 velecsum
= _mm_add_pd(velecsum
,velec
);
514 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
516 /* Calculate temporary vectorial force */
517 tx
= _mm_mul_pd(fscal
,dx10
);
518 ty
= _mm_mul_pd(fscal
,dy10
);
519 tz
= _mm_mul_pd(fscal
,dz10
);
521 /* Update vectorial force */
522 fix1
= _mm_add_pd(fix1
,tx
);
523 fiy1
= _mm_add_pd(fiy1
,ty
);
524 fiz1
= _mm_add_pd(fiz1
,tz
);
526 fjx0
= _mm_add_pd(fjx0
,tx
);
527 fjy0
= _mm_add_pd(fjy0
,ty
);
528 fjz0
= _mm_add_pd(fjz0
,tz
);
530 /**************************
531 * CALCULATE INTERACTIONS *
532 **************************/
534 r20
= _mm_mul_pd(rsq20
,rinv20
);
536 /* Compute parameters for interactions between i and j atoms */
537 qq20
= _mm_mul_pd(iq2
,jq0
);
539 /* EWALD ELECTROSTATICS */
541 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
542 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
543 ewitab
= _mm_cvttpd_epi32(ewrt
);
544 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
545 ewitab
= _mm_slli_epi32(ewitab
,2);
546 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
547 ewtabD
= _mm_setzero_pd();
548 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
549 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
550 ewtabFn
= _mm_setzero_pd();
551 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
552 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
553 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
554 velec
= _mm_mul_pd(qq20
,_mm_sub_pd(rinv20
,velec
));
555 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
557 /* Update potential sum for this i atom from the interaction with this j atom. */
558 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
559 velecsum
= _mm_add_pd(velecsum
,velec
);
563 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
565 /* Calculate temporary vectorial force */
566 tx
= _mm_mul_pd(fscal
,dx20
);
567 ty
= _mm_mul_pd(fscal
,dy20
);
568 tz
= _mm_mul_pd(fscal
,dz20
);
570 /* Update vectorial force */
571 fix2
= _mm_add_pd(fix2
,tx
);
572 fiy2
= _mm_add_pd(fiy2
,ty
);
573 fiz2
= _mm_add_pd(fiz2
,tz
);
575 fjx0
= _mm_add_pd(fjx0
,tx
);
576 fjy0
= _mm_add_pd(fjy0
,ty
);
577 fjz0
= _mm_add_pd(fjz0
,tz
);
579 /**************************
580 * CALCULATE INTERACTIONS *
581 **************************/
583 r30
= _mm_mul_pd(rsq30
,rinv30
);
585 /* Compute parameters for interactions between i and j atoms */
586 qq30
= _mm_mul_pd(iq3
,jq0
);
588 /* EWALD ELECTROSTATICS */
590 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
591 ewrt
= _mm_mul_pd(r30
,ewtabscale
);
592 ewitab
= _mm_cvttpd_epi32(ewrt
);
593 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
594 ewitab
= _mm_slli_epi32(ewitab
,2);
595 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
596 ewtabD
= _mm_setzero_pd();
597 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
598 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
599 ewtabFn
= _mm_setzero_pd();
600 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
601 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
602 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
603 velec
= _mm_mul_pd(qq30
,_mm_sub_pd(rinv30
,velec
));
604 felec
= _mm_mul_pd(_mm_mul_pd(qq30
,rinv30
),_mm_sub_pd(rinvsq30
,felec
));
606 /* Update potential sum for this i atom from the interaction with this j atom. */
607 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
608 velecsum
= _mm_add_pd(velecsum
,velec
);
612 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
614 /* Calculate temporary vectorial force */
615 tx
= _mm_mul_pd(fscal
,dx30
);
616 ty
= _mm_mul_pd(fscal
,dy30
);
617 tz
= _mm_mul_pd(fscal
,dz30
);
619 /* Update vectorial force */
620 fix3
= _mm_add_pd(fix3
,tx
);
621 fiy3
= _mm_add_pd(fiy3
,ty
);
622 fiz3
= _mm_add_pd(fiz3
,tz
);
624 fjx0
= _mm_add_pd(fjx0
,tx
);
625 fjy0
= _mm_add_pd(fjy0
,ty
);
626 fjz0
= _mm_add_pd(fjz0
,tz
);
628 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
);
630 /* Inner loop uses 158 flops */
633 /* End of innermost loop */
635 gmx_mm_update_iforce_4atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
636 f
+i_coord_offset
,fshift
+i_shift_offset
);
639 /* Update potential energies */
640 gmx_mm_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
641 gmx_mm_update_1pot_pd(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
643 /* Increment number of inner iterations */
644 inneriter
+= j_index_end
- j_index_start
;
646 /* Outer loop uses 26 flops */
649 /* Increment number of outer iterations */
652 /* Update outer/inner flops */
654 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4_VF
,outeriter
*26 + inneriter
*158);
657 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse2_double
658 * Electrostatics interaction: Ewald
659 * VdW interaction: LennardJones
660 * Geometry: Water4-Particle
661 * Calculate force/pot: Force
664 nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse2_double
665 (t_nblist
* gmx_restrict nlist
,
666 rvec
* gmx_restrict xx
,
667 rvec
* gmx_restrict ff
,
668 t_forcerec
* gmx_restrict fr
,
669 t_mdatoms
* gmx_restrict mdatoms
,
670 nb_kernel_data_t
* gmx_restrict kernel_data
,
671 t_nrnb
* gmx_restrict nrnb
)
673 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
674 * just 0 for non-waters.
675 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
676 * jnr indices corresponding to data put in the four positions in the SIMD register.
678 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
679 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
681 int j_coord_offsetA
,j_coord_offsetB
;
682 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
684 real
*shiftvec
,*fshift
,*x
,*f
;
685 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
687 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
689 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
691 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
693 __m128d ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
694 int vdwjidx0A
,vdwjidx0B
;
695 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
696 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
697 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
698 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
699 __m128d dx30
,dy30
,dz30
,rsq30
,rinv30
,rinvsq30
,r30
,qq30
,c6_30
,c12_30
;
700 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
703 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
706 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
707 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
709 __m128d ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
711 __m128d dummy_mask
,cutoff_mask
;
712 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
713 __m128d one
= _mm_set1_pd(1.0);
714 __m128d two
= _mm_set1_pd(2.0);
720 jindex
= nlist
->jindex
;
722 shiftidx
= nlist
->shift
;
724 shiftvec
= fr
->shift_vec
[0];
725 fshift
= fr
->fshift
[0];
726 facel
= _mm_set1_pd(fr
->epsfac
);
727 charge
= mdatoms
->chargeA
;
728 nvdwtype
= fr
->ntype
;
730 vdwtype
= mdatoms
->typeA
;
732 sh_ewald
= _mm_set1_pd(fr
->ic
->sh_ewald
);
733 ewtab
= fr
->ic
->tabq_coul_F
;
734 ewtabscale
= _mm_set1_pd(fr
->ic
->tabq_scale
);
735 ewtabhalfspace
= _mm_set1_pd(0.5/fr
->ic
->tabq_scale
);
737 /* Setup water-specific parameters */
738 inr
= nlist
->iinr
[0];
739 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
740 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
741 iq3
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+3]));
742 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
744 /* Avoid stupid compiler warnings */
752 /* Start outer loop over neighborlists */
753 for(iidx
=0; iidx
<nri
; iidx
++)
755 /* Load shift vector for this list */
756 i_shift_offset
= DIM
*shiftidx
[iidx
];
758 /* Load limits for loop over neighbors */
759 j_index_start
= jindex
[iidx
];
760 j_index_end
= jindex
[iidx
+1];
762 /* Get outer coordinate index */
764 i_coord_offset
= DIM
*inr
;
766 /* Load i particle coords and add shift vector */
767 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
768 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
770 fix0
= _mm_setzero_pd();
771 fiy0
= _mm_setzero_pd();
772 fiz0
= _mm_setzero_pd();
773 fix1
= _mm_setzero_pd();
774 fiy1
= _mm_setzero_pd();
775 fiz1
= _mm_setzero_pd();
776 fix2
= _mm_setzero_pd();
777 fiy2
= _mm_setzero_pd();
778 fiz2
= _mm_setzero_pd();
779 fix3
= _mm_setzero_pd();
780 fiy3
= _mm_setzero_pd();
781 fiz3
= _mm_setzero_pd();
783 /* Start inner kernel loop */
784 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
787 /* Get j neighbor index, and coordinate index */
790 j_coord_offsetA
= DIM
*jnrA
;
791 j_coord_offsetB
= DIM
*jnrB
;
793 /* load j atom coordinates */
794 gmx_mm_load_1rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
797 /* Calculate displacement vector */
798 dx00
= _mm_sub_pd(ix0
,jx0
);
799 dy00
= _mm_sub_pd(iy0
,jy0
);
800 dz00
= _mm_sub_pd(iz0
,jz0
);
801 dx10
= _mm_sub_pd(ix1
,jx0
);
802 dy10
= _mm_sub_pd(iy1
,jy0
);
803 dz10
= _mm_sub_pd(iz1
,jz0
);
804 dx20
= _mm_sub_pd(ix2
,jx0
);
805 dy20
= _mm_sub_pd(iy2
,jy0
);
806 dz20
= _mm_sub_pd(iz2
,jz0
);
807 dx30
= _mm_sub_pd(ix3
,jx0
);
808 dy30
= _mm_sub_pd(iy3
,jy0
);
809 dz30
= _mm_sub_pd(iz3
,jz0
);
811 /* Calculate squared distance and things based on it */
812 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
813 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
814 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
815 rsq30
= gmx_mm_calc_rsq_pd(dx30
,dy30
,dz30
);
817 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
818 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
819 rinv30
= gmx_mm_invsqrt_pd(rsq30
);
821 rinvsq00
= gmx_mm_inv_pd(rsq00
);
822 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
823 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
824 rinvsq30
= _mm_mul_pd(rinv30
,rinv30
);
826 /* Load parameters for j particles */
827 jq0
= gmx_mm_load_2real_swizzle_pd(charge
+jnrA
+0,charge
+jnrB
+0);
828 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
829 vdwjidx0B
= 2*vdwtype
[jnrB
+0];
831 fjx0
= _mm_setzero_pd();
832 fjy0
= _mm_setzero_pd();
833 fjz0
= _mm_setzero_pd();
835 /**************************
836 * CALCULATE INTERACTIONS *
837 **************************/
839 /* Compute parameters for interactions between i and j atoms */
840 gmx_mm_load_2pair_swizzle_pd(vdwparam
+vdwioffset0
+vdwjidx0A
,
841 vdwparam
+vdwioffset0
+vdwjidx0B
,&c6_00
,&c12_00
);
843 /* LENNARD-JONES DISPERSION/REPULSION */
845 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
846 fvdw
= _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00
,rinvsix
),c6_00
),_mm_mul_pd(rinvsix
,rinvsq00
));
850 /* Calculate temporary vectorial force */
851 tx
= _mm_mul_pd(fscal
,dx00
);
852 ty
= _mm_mul_pd(fscal
,dy00
);
853 tz
= _mm_mul_pd(fscal
,dz00
);
855 /* Update vectorial force */
856 fix0
= _mm_add_pd(fix0
,tx
);
857 fiy0
= _mm_add_pd(fiy0
,ty
);
858 fiz0
= _mm_add_pd(fiz0
,tz
);
860 fjx0
= _mm_add_pd(fjx0
,tx
);
861 fjy0
= _mm_add_pd(fjy0
,ty
);
862 fjz0
= _mm_add_pd(fjz0
,tz
);
864 /**************************
865 * CALCULATE INTERACTIONS *
866 **************************/
868 r10
= _mm_mul_pd(rsq10
,rinv10
);
870 /* Compute parameters for interactions between i and j atoms */
871 qq10
= _mm_mul_pd(iq1
,jq0
);
873 /* EWALD ELECTROSTATICS */
875 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
876 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
877 ewitab
= _mm_cvttpd_epi32(ewrt
);
878 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
879 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
881 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
882 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
886 /* Calculate temporary vectorial force */
887 tx
= _mm_mul_pd(fscal
,dx10
);
888 ty
= _mm_mul_pd(fscal
,dy10
);
889 tz
= _mm_mul_pd(fscal
,dz10
);
891 /* Update vectorial force */
892 fix1
= _mm_add_pd(fix1
,tx
);
893 fiy1
= _mm_add_pd(fiy1
,ty
);
894 fiz1
= _mm_add_pd(fiz1
,tz
);
896 fjx0
= _mm_add_pd(fjx0
,tx
);
897 fjy0
= _mm_add_pd(fjy0
,ty
);
898 fjz0
= _mm_add_pd(fjz0
,tz
);
900 /**************************
901 * CALCULATE INTERACTIONS *
902 **************************/
904 r20
= _mm_mul_pd(rsq20
,rinv20
);
906 /* Compute parameters for interactions between i and j atoms */
907 qq20
= _mm_mul_pd(iq2
,jq0
);
909 /* EWALD ELECTROSTATICS */
911 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
912 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
913 ewitab
= _mm_cvttpd_epi32(ewrt
);
914 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
915 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
917 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
918 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
922 /* Calculate temporary vectorial force */
923 tx
= _mm_mul_pd(fscal
,dx20
);
924 ty
= _mm_mul_pd(fscal
,dy20
);
925 tz
= _mm_mul_pd(fscal
,dz20
);
927 /* Update vectorial force */
928 fix2
= _mm_add_pd(fix2
,tx
);
929 fiy2
= _mm_add_pd(fiy2
,ty
);
930 fiz2
= _mm_add_pd(fiz2
,tz
);
932 fjx0
= _mm_add_pd(fjx0
,tx
);
933 fjy0
= _mm_add_pd(fjy0
,ty
);
934 fjz0
= _mm_add_pd(fjz0
,tz
);
936 /**************************
937 * CALCULATE INTERACTIONS *
938 **************************/
940 r30
= _mm_mul_pd(rsq30
,rinv30
);
942 /* Compute parameters for interactions between i and j atoms */
943 qq30
= _mm_mul_pd(iq3
,jq0
);
945 /* EWALD ELECTROSTATICS */
947 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
948 ewrt
= _mm_mul_pd(r30
,ewtabscale
);
949 ewitab
= _mm_cvttpd_epi32(ewrt
);
950 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
951 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
953 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
954 felec
= _mm_mul_pd(_mm_mul_pd(qq30
,rinv30
),_mm_sub_pd(rinvsq30
,felec
));
958 /* Calculate temporary vectorial force */
959 tx
= _mm_mul_pd(fscal
,dx30
);
960 ty
= _mm_mul_pd(fscal
,dy30
);
961 tz
= _mm_mul_pd(fscal
,dz30
);
963 /* Update vectorial force */
964 fix3
= _mm_add_pd(fix3
,tx
);
965 fiy3
= _mm_add_pd(fiy3
,ty
);
966 fiz3
= _mm_add_pd(fiz3
,tz
);
968 fjx0
= _mm_add_pd(fjx0
,tx
);
969 fjy0
= _mm_add_pd(fjy0
,ty
);
970 fjz0
= _mm_add_pd(fjz0
,tz
);
972 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
);
974 /* Inner loop uses 138 flops */
981 j_coord_offsetA
= DIM
*jnrA
;
983 /* load j atom coordinates */
984 gmx_mm_load_1rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
987 /* Calculate displacement vector */
988 dx00
= _mm_sub_pd(ix0
,jx0
);
989 dy00
= _mm_sub_pd(iy0
,jy0
);
990 dz00
= _mm_sub_pd(iz0
,jz0
);
991 dx10
= _mm_sub_pd(ix1
,jx0
);
992 dy10
= _mm_sub_pd(iy1
,jy0
);
993 dz10
= _mm_sub_pd(iz1
,jz0
);
994 dx20
= _mm_sub_pd(ix2
,jx0
);
995 dy20
= _mm_sub_pd(iy2
,jy0
);
996 dz20
= _mm_sub_pd(iz2
,jz0
);
997 dx30
= _mm_sub_pd(ix3
,jx0
);
998 dy30
= _mm_sub_pd(iy3
,jy0
);
999 dz30
= _mm_sub_pd(iz3
,jz0
);
1001 /* Calculate squared distance and things based on it */
1002 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
1003 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
1004 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
1005 rsq30
= gmx_mm_calc_rsq_pd(dx30
,dy30
,dz30
);
1007 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
1008 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
1009 rinv30
= gmx_mm_invsqrt_pd(rsq30
);
1011 rinvsq00
= gmx_mm_inv_pd(rsq00
);
1012 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
1013 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
1014 rinvsq30
= _mm_mul_pd(rinv30
,rinv30
);
1016 /* Load parameters for j particles */
1017 jq0
= _mm_load_sd(charge
+jnrA
+0);
1018 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
1020 fjx0
= _mm_setzero_pd();
1021 fjy0
= _mm_setzero_pd();
1022 fjz0
= _mm_setzero_pd();
1024 /**************************
1025 * CALCULATE INTERACTIONS *
1026 **************************/
1028 /* Compute parameters for interactions between i and j atoms */
1029 gmx_mm_load_1pair_swizzle_pd(vdwparam
+vdwioffset0
+vdwjidx0A
,&c6_00
,&c12_00
);
1031 /* LENNARD-JONES DISPERSION/REPULSION */
1033 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
1034 fvdw
= _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00
,rinvsix
),c6_00
),_mm_mul_pd(rinvsix
,rinvsq00
));
1038 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1040 /* Calculate temporary vectorial force */
1041 tx
= _mm_mul_pd(fscal
,dx00
);
1042 ty
= _mm_mul_pd(fscal
,dy00
);
1043 tz
= _mm_mul_pd(fscal
,dz00
);
1045 /* Update vectorial force */
1046 fix0
= _mm_add_pd(fix0
,tx
);
1047 fiy0
= _mm_add_pd(fiy0
,ty
);
1048 fiz0
= _mm_add_pd(fiz0
,tz
);
1050 fjx0
= _mm_add_pd(fjx0
,tx
);
1051 fjy0
= _mm_add_pd(fjy0
,ty
);
1052 fjz0
= _mm_add_pd(fjz0
,tz
);
1054 /**************************
1055 * CALCULATE INTERACTIONS *
1056 **************************/
1058 r10
= _mm_mul_pd(rsq10
,rinv10
);
1060 /* Compute parameters for interactions between i and j atoms */
1061 qq10
= _mm_mul_pd(iq1
,jq0
);
1063 /* EWALD ELECTROSTATICS */
1065 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1066 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
1067 ewitab
= _mm_cvttpd_epi32(ewrt
);
1068 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1069 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
1070 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1071 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
1075 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1077 /* Calculate temporary vectorial force */
1078 tx
= _mm_mul_pd(fscal
,dx10
);
1079 ty
= _mm_mul_pd(fscal
,dy10
);
1080 tz
= _mm_mul_pd(fscal
,dz10
);
1082 /* Update vectorial force */
1083 fix1
= _mm_add_pd(fix1
,tx
);
1084 fiy1
= _mm_add_pd(fiy1
,ty
);
1085 fiz1
= _mm_add_pd(fiz1
,tz
);
1087 fjx0
= _mm_add_pd(fjx0
,tx
);
1088 fjy0
= _mm_add_pd(fjy0
,ty
);
1089 fjz0
= _mm_add_pd(fjz0
,tz
);
1091 /**************************
1092 * CALCULATE INTERACTIONS *
1093 **************************/
1095 r20
= _mm_mul_pd(rsq20
,rinv20
);
1097 /* Compute parameters for interactions between i and j atoms */
1098 qq20
= _mm_mul_pd(iq2
,jq0
);
1100 /* EWALD ELECTROSTATICS */
1102 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1103 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
1104 ewitab
= _mm_cvttpd_epi32(ewrt
);
1105 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1106 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
1107 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1108 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
1112 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1114 /* Calculate temporary vectorial force */
1115 tx
= _mm_mul_pd(fscal
,dx20
);
1116 ty
= _mm_mul_pd(fscal
,dy20
);
1117 tz
= _mm_mul_pd(fscal
,dz20
);
1119 /* Update vectorial force */
1120 fix2
= _mm_add_pd(fix2
,tx
);
1121 fiy2
= _mm_add_pd(fiy2
,ty
);
1122 fiz2
= _mm_add_pd(fiz2
,tz
);
1124 fjx0
= _mm_add_pd(fjx0
,tx
);
1125 fjy0
= _mm_add_pd(fjy0
,ty
);
1126 fjz0
= _mm_add_pd(fjz0
,tz
);
1128 /**************************
1129 * CALCULATE INTERACTIONS *
1130 **************************/
1132 r30
= _mm_mul_pd(rsq30
,rinv30
);
1134 /* Compute parameters for interactions between i and j atoms */
1135 qq30
= _mm_mul_pd(iq3
,jq0
);
1137 /* EWALD ELECTROSTATICS */
1139 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1140 ewrt
= _mm_mul_pd(r30
,ewtabscale
);
1141 ewitab
= _mm_cvttpd_epi32(ewrt
);
1142 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1143 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
1144 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1145 felec
= _mm_mul_pd(_mm_mul_pd(qq30
,rinv30
),_mm_sub_pd(rinvsq30
,felec
));
1149 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1151 /* Calculate temporary vectorial force */
1152 tx
= _mm_mul_pd(fscal
,dx30
);
1153 ty
= _mm_mul_pd(fscal
,dy30
);
1154 tz
= _mm_mul_pd(fscal
,dz30
);
1156 /* Update vectorial force */
1157 fix3
= _mm_add_pd(fix3
,tx
);
1158 fiy3
= _mm_add_pd(fiy3
,ty
);
1159 fiz3
= _mm_add_pd(fiz3
,tz
);
1161 fjx0
= _mm_add_pd(fjx0
,tx
);
1162 fjy0
= _mm_add_pd(fjy0
,ty
);
1163 fjz0
= _mm_add_pd(fjz0
,tz
);
1165 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
);
1167 /* Inner loop uses 138 flops */
1170 /* End of innermost loop */
1172 gmx_mm_update_iforce_4atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
1173 f
+i_coord_offset
,fshift
+i_shift_offset
);
1175 /* Increment number of inner iterations */
1176 inneriter
+= j_index_end
- j_index_start
;
1178 /* Outer loop uses 24 flops */
1181 /* Increment number of outer iterations */
1184 /* Update outer/inner flops */
1186 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4_F
,outeriter
*24 + inneriter
*138);