2 * Note: this file was generated by the Gromacs sse2_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_double.h"
34 #include "kernelutil_x86_sse2_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sse2_double
38 * Electrostatics interaction: Ewald
39 * VdW interaction: None
40 * Geometry: Water3-Particle
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sse2_double
45 (t_nblist
* gmx_restrict nlist
,
46 rvec
* gmx_restrict xx
,
47 rvec
* gmx_restrict ff
,
48 t_forcerec
* gmx_restrict fr
,
49 t_mdatoms
* gmx_restrict mdatoms
,
50 nb_kernel_data_t
* gmx_restrict kernel_data
,
51 t_nrnb
* gmx_restrict nrnb
)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
59 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
61 int j_coord_offsetA
,j_coord_offsetB
;
62 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
64 real
*shiftvec
,*fshift
,*x
,*f
;
65 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
67 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
69 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
71 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
72 int vdwjidx0A
,vdwjidx0B
;
73 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
74 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
75 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
76 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
77 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
80 __m128d ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
82 __m128d dummy_mask
,cutoff_mask
;
83 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
84 __m128d one
= _mm_set1_pd(1.0);
85 __m128d two
= _mm_set1_pd(2.0);
91 jindex
= nlist
->jindex
;
93 shiftidx
= nlist
->shift
;
95 shiftvec
= fr
->shift_vec
[0];
96 fshift
= fr
->fshift
[0];
97 facel
= _mm_set1_pd(fr
->epsfac
);
98 charge
= mdatoms
->chargeA
;
100 sh_ewald
= _mm_set1_pd(fr
->ic
->sh_ewald
);
101 ewtab
= fr
->ic
->tabq_coul_FDV0
;
102 ewtabscale
= _mm_set1_pd(fr
->ic
->tabq_scale
);
103 ewtabhalfspace
= _mm_set1_pd(0.5/fr
->ic
->tabq_scale
);
105 /* Setup water-specific parameters */
106 inr
= nlist
->iinr
[0];
107 iq0
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+0]));
108 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
109 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
111 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
112 rcutoff_scalar
= fr
->rcoulomb
;
113 rcutoff
= _mm_set1_pd(rcutoff_scalar
);
114 rcutoff2
= _mm_mul_pd(rcutoff
,rcutoff
);
116 /* Avoid stupid compiler warnings */
124 /* Start outer loop over neighborlists */
125 for(iidx
=0; iidx
<nri
; iidx
++)
127 /* Load shift vector for this list */
128 i_shift_offset
= DIM
*shiftidx
[iidx
];
130 /* Load limits for loop over neighbors */
131 j_index_start
= jindex
[iidx
];
132 j_index_end
= jindex
[iidx
+1];
134 /* Get outer coordinate index */
136 i_coord_offset
= DIM
*inr
;
138 /* Load i particle coords and add shift vector */
139 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
140 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
142 fix0
= _mm_setzero_pd();
143 fiy0
= _mm_setzero_pd();
144 fiz0
= _mm_setzero_pd();
145 fix1
= _mm_setzero_pd();
146 fiy1
= _mm_setzero_pd();
147 fiz1
= _mm_setzero_pd();
148 fix2
= _mm_setzero_pd();
149 fiy2
= _mm_setzero_pd();
150 fiz2
= _mm_setzero_pd();
152 /* Reset potential sums */
153 velecsum
= _mm_setzero_pd();
155 /* Start inner kernel loop */
156 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
159 /* Get j neighbor index, and coordinate index */
162 j_coord_offsetA
= DIM
*jnrA
;
163 j_coord_offsetB
= DIM
*jnrB
;
165 /* load j atom coordinates */
166 gmx_mm_load_1rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
169 /* Calculate displacement vector */
170 dx00
= _mm_sub_pd(ix0
,jx0
);
171 dy00
= _mm_sub_pd(iy0
,jy0
);
172 dz00
= _mm_sub_pd(iz0
,jz0
);
173 dx10
= _mm_sub_pd(ix1
,jx0
);
174 dy10
= _mm_sub_pd(iy1
,jy0
);
175 dz10
= _mm_sub_pd(iz1
,jz0
);
176 dx20
= _mm_sub_pd(ix2
,jx0
);
177 dy20
= _mm_sub_pd(iy2
,jy0
);
178 dz20
= _mm_sub_pd(iz2
,jz0
);
180 /* Calculate squared distance and things based on it */
181 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
182 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
183 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
185 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
186 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
187 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
189 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
190 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
191 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
193 /* Load parameters for j particles */
194 jq0
= gmx_mm_load_2real_swizzle_pd(charge
+jnrA
+0,charge
+jnrB
+0);
196 fjx0
= _mm_setzero_pd();
197 fjy0
= _mm_setzero_pd();
198 fjz0
= _mm_setzero_pd();
200 /**************************
201 * CALCULATE INTERACTIONS *
202 **************************/
204 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
207 r00
= _mm_mul_pd(rsq00
,rinv00
);
209 /* Compute parameters for interactions between i and j atoms */
210 qq00
= _mm_mul_pd(iq0
,jq0
);
212 /* EWALD ELECTROSTATICS */
214 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
215 ewrt
= _mm_mul_pd(r00
,ewtabscale
);
216 ewitab
= _mm_cvttpd_epi32(ewrt
);
217 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
218 ewitab
= _mm_slli_epi32(ewitab
,2);
219 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
220 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
221 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
222 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
223 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
224 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
225 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
226 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
227 velec
= _mm_mul_pd(qq00
,_mm_sub_pd(_mm_sub_pd(rinv00
,sh_ewald
),velec
));
228 felec
= _mm_mul_pd(_mm_mul_pd(qq00
,rinv00
),_mm_sub_pd(rinvsq00
,felec
));
230 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
232 /* Update potential sum for this i atom from the interaction with this j atom. */
233 velec
= _mm_and_pd(velec
,cutoff_mask
);
234 velecsum
= _mm_add_pd(velecsum
,velec
);
238 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
240 /* Calculate temporary vectorial force */
241 tx
= _mm_mul_pd(fscal
,dx00
);
242 ty
= _mm_mul_pd(fscal
,dy00
);
243 tz
= _mm_mul_pd(fscal
,dz00
);
245 /* Update vectorial force */
246 fix0
= _mm_add_pd(fix0
,tx
);
247 fiy0
= _mm_add_pd(fiy0
,ty
);
248 fiz0
= _mm_add_pd(fiz0
,tz
);
250 fjx0
= _mm_add_pd(fjx0
,tx
);
251 fjy0
= _mm_add_pd(fjy0
,ty
);
252 fjz0
= _mm_add_pd(fjz0
,tz
);
256 /**************************
257 * CALCULATE INTERACTIONS *
258 **************************/
260 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
263 r10
= _mm_mul_pd(rsq10
,rinv10
);
265 /* Compute parameters for interactions between i and j atoms */
266 qq10
= _mm_mul_pd(iq1
,jq0
);
268 /* EWALD ELECTROSTATICS */
270 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
271 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
272 ewitab
= _mm_cvttpd_epi32(ewrt
);
273 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
274 ewitab
= _mm_slli_epi32(ewitab
,2);
275 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
276 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
277 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
278 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
279 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
280 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
281 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
282 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
283 velec
= _mm_mul_pd(qq10
,_mm_sub_pd(_mm_sub_pd(rinv10
,sh_ewald
),velec
));
284 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
286 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
288 /* Update potential sum for this i atom from the interaction with this j atom. */
289 velec
= _mm_and_pd(velec
,cutoff_mask
);
290 velecsum
= _mm_add_pd(velecsum
,velec
);
294 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
296 /* Calculate temporary vectorial force */
297 tx
= _mm_mul_pd(fscal
,dx10
);
298 ty
= _mm_mul_pd(fscal
,dy10
);
299 tz
= _mm_mul_pd(fscal
,dz10
);
301 /* Update vectorial force */
302 fix1
= _mm_add_pd(fix1
,tx
);
303 fiy1
= _mm_add_pd(fiy1
,ty
);
304 fiz1
= _mm_add_pd(fiz1
,tz
);
306 fjx0
= _mm_add_pd(fjx0
,tx
);
307 fjy0
= _mm_add_pd(fjy0
,ty
);
308 fjz0
= _mm_add_pd(fjz0
,tz
);
312 /**************************
313 * CALCULATE INTERACTIONS *
314 **************************/
316 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
319 r20
= _mm_mul_pd(rsq20
,rinv20
);
321 /* Compute parameters for interactions between i and j atoms */
322 qq20
= _mm_mul_pd(iq2
,jq0
);
324 /* EWALD ELECTROSTATICS */
326 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
327 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
328 ewitab
= _mm_cvttpd_epi32(ewrt
);
329 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
330 ewitab
= _mm_slli_epi32(ewitab
,2);
331 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
332 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
333 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
334 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
335 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
336 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
337 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
338 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
339 velec
= _mm_mul_pd(qq20
,_mm_sub_pd(_mm_sub_pd(rinv20
,sh_ewald
),velec
));
340 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
342 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
344 /* Update potential sum for this i atom from the interaction with this j atom. */
345 velec
= _mm_and_pd(velec
,cutoff_mask
);
346 velecsum
= _mm_add_pd(velecsum
,velec
);
350 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
352 /* Calculate temporary vectorial force */
353 tx
= _mm_mul_pd(fscal
,dx20
);
354 ty
= _mm_mul_pd(fscal
,dy20
);
355 tz
= _mm_mul_pd(fscal
,dz20
);
357 /* Update vectorial force */
358 fix2
= _mm_add_pd(fix2
,tx
);
359 fiy2
= _mm_add_pd(fiy2
,ty
);
360 fiz2
= _mm_add_pd(fiz2
,tz
);
362 fjx0
= _mm_add_pd(fjx0
,tx
);
363 fjy0
= _mm_add_pd(fjy0
,ty
);
364 fjz0
= _mm_add_pd(fjz0
,tz
);
368 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
);
370 /* Inner loop uses 141 flops */
377 j_coord_offsetA
= DIM
*jnrA
;
379 /* load j atom coordinates */
380 gmx_mm_load_1rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
383 /* Calculate displacement vector */
384 dx00
= _mm_sub_pd(ix0
,jx0
);
385 dy00
= _mm_sub_pd(iy0
,jy0
);
386 dz00
= _mm_sub_pd(iz0
,jz0
);
387 dx10
= _mm_sub_pd(ix1
,jx0
);
388 dy10
= _mm_sub_pd(iy1
,jy0
);
389 dz10
= _mm_sub_pd(iz1
,jz0
);
390 dx20
= _mm_sub_pd(ix2
,jx0
);
391 dy20
= _mm_sub_pd(iy2
,jy0
);
392 dz20
= _mm_sub_pd(iz2
,jz0
);
394 /* Calculate squared distance and things based on it */
395 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
396 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
397 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
399 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
400 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
401 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
403 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
404 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
405 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
407 /* Load parameters for j particles */
408 jq0
= _mm_load_sd(charge
+jnrA
+0);
410 fjx0
= _mm_setzero_pd();
411 fjy0
= _mm_setzero_pd();
412 fjz0
= _mm_setzero_pd();
414 /**************************
415 * CALCULATE INTERACTIONS *
416 **************************/
418 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
421 r00
= _mm_mul_pd(rsq00
,rinv00
);
423 /* Compute parameters for interactions between i and j atoms */
424 qq00
= _mm_mul_pd(iq0
,jq0
);
426 /* EWALD ELECTROSTATICS */
428 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
429 ewrt
= _mm_mul_pd(r00
,ewtabscale
);
430 ewitab
= _mm_cvttpd_epi32(ewrt
);
431 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
432 ewitab
= _mm_slli_epi32(ewitab
,2);
433 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
434 ewtabD
= _mm_setzero_pd();
435 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
436 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
437 ewtabFn
= _mm_setzero_pd();
438 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
439 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
440 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
441 velec
= _mm_mul_pd(qq00
,_mm_sub_pd(_mm_sub_pd(rinv00
,sh_ewald
),velec
));
442 felec
= _mm_mul_pd(_mm_mul_pd(qq00
,rinv00
),_mm_sub_pd(rinvsq00
,felec
));
444 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
446 /* Update potential sum for this i atom from the interaction with this j atom. */
447 velec
= _mm_and_pd(velec
,cutoff_mask
);
448 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
449 velecsum
= _mm_add_pd(velecsum
,velec
);
453 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
455 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
457 /* Calculate temporary vectorial force */
458 tx
= _mm_mul_pd(fscal
,dx00
);
459 ty
= _mm_mul_pd(fscal
,dy00
);
460 tz
= _mm_mul_pd(fscal
,dz00
);
462 /* Update vectorial force */
463 fix0
= _mm_add_pd(fix0
,tx
);
464 fiy0
= _mm_add_pd(fiy0
,ty
);
465 fiz0
= _mm_add_pd(fiz0
,tz
);
467 fjx0
= _mm_add_pd(fjx0
,tx
);
468 fjy0
= _mm_add_pd(fjy0
,ty
);
469 fjz0
= _mm_add_pd(fjz0
,tz
);
473 /**************************
474 * CALCULATE INTERACTIONS *
475 **************************/
477 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
480 r10
= _mm_mul_pd(rsq10
,rinv10
);
482 /* Compute parameters for interactions between i and j atoms */
483 qq10
= _mm_mul_pd(iq1
,jq0
);
485 /* EWALD ELECTROSTATICS */
487 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
488 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
489 ewitab
= _mm_cvttpd_epi32(ewrt
);
490 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
491 ewitab
= _mm_slli_epi32(ewitab
,2);
492 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
493 ewtabD
= _mm_setzero_pd();
494 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
495 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
496 ewtabFn
= _mm_setzero_pd();
497 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
498 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
499 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
500 velec
= _mm_mul_pd(qq10
,_mm_sub_pd(_mm_sub_pd(rinv10
,sh_ewald
),velec
));
501 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
503 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
505 /* Update potential sum for this i atom from the interaction with this j atom. */
506 velec
= _mm_and_pd(velec
,cutoff_mask
);
507 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
508 velecsum
= _mm_add_pd(velecsum
,velec
);
512 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
514 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
516 /* Calculate temporary vectorial force */
517 tx
= _mm_mul_pd(fscal
,dx10
);
518 ty
= _mm_mul_pd(fscal
,dy10
);
519 tz
= _mm_mul_pd(fscal
,dz10
);
521 /* Update vectorial force */
522 fix1
= _mm_add_pd(fix1
,tx
);
523 fiy1
= _mm_add_pd(fiy1
,ty
);
524 fiz1
= _mm_add_pd(fiz1
,tz
);
526 fjx0
= _mm_add_pd(fjx0
,tx
);
527 fjy0
= _mm_add_pd(fjy0
,ty
);
528 fjz0
= _mm_add_pd(fjz0
,tz
);
532 /**************************
533 * CALCULATE INTERACTIONS *
534 **************************/
536 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
539 r20
= _mm_mul_pd(rsq20
,rinv20
);
541 /* Compute parameters for interactions between i and j atoms */
542 qq20
= _mm_mul_pd(iq2
,jq0
);
544 /* EWALD ELECTROSTATICS */
546 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
547 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
548 ewitab
= _mm_cvttpd_epi32(ewrt
);
549 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
550 ewitab
= _mm_slli_epi32(ewitab
,2);
551 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
552 ewtabD
= _mm_setzero_pd();
553 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
554 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
555 ewtabFn
= _mm_setzero_pd();
556 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
557 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
558 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
559 velec
= _mm_mul_pd(qq20
,_mm_sub_pd(_mm_sub_pd(rinv20
,sh_ewald
),velec
));
560 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
562 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
564 /* Update potential sum for this i atom from the interaction with this j atom. */
565 velec
= _mm_and_pd(velec
,cutoff_mask
);
566 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
567 velecsum
= _mm_add_pd(velecsum
,velec
);
571 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
573 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
575 /* Calculate temporary vectorial force */
576 tx
= _mm_mul_pd(fscal
,dx20
);
577 ty
= _mm_mul_pd(fscal
,dy20
);
578 tz
= _mm_mul_pd(fscal
,dz20
);
580 /* Update vectorial force */
581 fix2
= _mm_add_pd(fix2
,tx
);
582 fiy2
= _mm_add_pd(fiy2
,ty
);
583 fiz2
= _mm_add_pd(fiz2
,tz
);
585 fjx0
= _mm_add_pd(fjx0
,tx
);
586 fjy0
= _mm_add_pd(fjy0
,ty
);
587 fjz0
= _mm_add_pd(fjz0
,tz
);
591 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
);
593 /* Inner loop uses 141 flops */
596 /* End of innermost loop */
598 gmx_mm_update_iforce_3atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
599 f
+i_coord_offset
,fshift
+i_shift_offset
);
602 /* Update potential energies */
603 gmx_mm_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
605 /* Increment number of inner iterations */
606 inneriter
+= j_index_end
- j_index_start
;
608 /* Outer loop uses 19 flops */
611 /* Increment number of outer iterations */
614 /* Update outer/inner flops */
616 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_W3_VF
,outeriter
*19 + inneriter
*141);
619 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sse2_double
620 * Electrostatics interaction: Ewald
621 * VdW interaction: None
622 * Geometry: Water3-Particle
623 * Calculate force/pot: Force
626 nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sse2_double
627 (t_nblist
* gmx_restrict nlist
,
628 rvec
* gmx_restrict xx
,
629 rvec
* gmx_restrict ff
,
630 t_forcerec
* gmx_restrict fr
,
631 t_mdatoms
* gmx_restrict mdatoms
,
632 nb_kernel_data_t
* gmx_restrict kernel_data
,
633 t_nrnb
* gmx_restrict nrnb
)
635 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
636 * just 0 for non-waters.
637 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
638 * jnr indices corresponding to data put in the four positions in the SIMD register.
640 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
641 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
643 int j_coord_offsetA
,j_coord_offsetB
;
644 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
646 real
*shiftvec
,*fshift
,*x
,*f
;
647 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
649 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
651 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
653 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
654 int vdwjidx0A
,vdwjidx0B
;
655 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
656 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
657 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
658 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
659 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
662 __m128d ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
664 __m128d dummy_mask
,cutoff_mask
;
665 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
666 __m128d one
= _mm_set1_pd(1.0);
667 __m128d two
= _mm_set1_pd(2.0);
673 jindex
= nlist
->jindex
;
675 shiftidx
= nlist
->shift
;
677 shiftvec
= fr
->shift_vec
[0];
678 fshift
= fr
->fshift
[0];
679 facel
= _mm_set1_pd(fr
->epsfac
);
680 charge
= mdatoms
->chargeA
;
682 sh_ewald
= _mm_set1_pd(fr
->ic
->sh_ewald
);
683 ewtab
= fr
->ic
->tabq_coul_F
;
684 ewtabscale
= _mm_set1_pd(fr
->ic
->tabq_scale
);
685 ewtabhalfspace
= _mm_set1_pd(0.5/fr
->ic
->tabq_scale
);
687 /* Setup water-specific parameters */
688 inr
= nlist
->iinr
[0];
689 iq0
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+0]));
690 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
691 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
693 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
694 rcutoff_scalar
= fr
->rcoulomb
;
695 rcutoff
= _mm_set1_pd(rcutoff_scalar
);
696 rcutoff2
= _mm_mul_pd(rcutoff
,rcutoff
);
698 /* Avoid stupid compiler warnings */
706 /* Start outer loop over neighborlists */
707 for(iidx
=0; iidx
<nri
; iidx
++)
709 /* Load shift vector for this list */
710 i_shift_offset
= DIM
*shiftidx
[iidx
];
712 /* Load limits for loop over neighbors */
713 j_index_start
= jindex
[iidx
];
714 j_index_end
= jindex
[iidx
+1];
716 /* Get outer coordinate index */
718 i_coord_offset
= DIM
*inr
;
720 /* Load i particle coords and add shift vector */
721 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
722 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
724 fix0
= _mm_setzero_pd();
725 fiy0
= _mm_setzero_pd();
726 fiz0
= _mm_setzero_pd();
727 fix1
= _mm_setzero_pd();
728 fiy1
= _mm_setzero_pd();
729 fiz1
= _mm_setzero_pd();
730 fix2
= _mm_setzero_pd();
731 fiy2
= _mm_setzero_pd();
732 fiz2
= _mm_setzero_pd();
734 /* Start inner kernel loop */
735 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
738 /* Get j neighbor index, and coordinate index */
741 j_coord_offsetA
= DIM
*jnrA
;
742 j_coord_offsetB
= DIM
*jnrB
;
744 /* load j atom coordinates */
745 gmx_mm_load_1rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
748 /* Calculate displacement vector */
749 dx00
= _mm_sub_pd(ix0
,jx0
);
750 dy00
= _mm_sub_pd(iy0
,jy0
);
751 dz00
= _mm_sub_pd(iz0
,jz0
);
752 dx10
= _mm_sub_pd(ix1
,jx0
);
753 dy10
= _mm_sub_pd(iy1
,jy0
);
754 dz10
= _mm_sub_pd(iz1
,jz0
);
755 dx20
= _mm_sub_pd(ix2
,jx0
);
756 dy20
= _mm_sub_pd(iy2
,jy0
);
757 dz20
= _mm_sub_pd(iz2
,jz0
);
759 /* Calculate squared distance and things based on it */
760 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
761 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
762 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
764 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
765 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
766 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
768 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
769 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
770 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
772 /* Load parameters for j particles */
773 jq0
= gmx_mm_load_2real_swizzle_pd(charge
+jnrA
+0,charge
+jnrB
+0);
775 fjx0
= _mm_setzero_pd();
776 fjy0
= _mm_setzero_pd();
777 fjz0
= _mm_setzero_pd();
779 /**************************
780 * CALCULATE INTERACTIONS *
781 **************************/
783 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
786 r00
= _mm_mul_pd(rsq00
,rinv00
);
788 /* Compute parameters for interactions between i and j atoms */
789 qq00
= _mm_mul_pd(iq0
,jq0
);
791 /* EWALD ELECTROSTATICS */
793 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
794 ewrt
= _mm_mul_pd(r00
,ewtabscale
);
795 ewitab
= _mm_cvttpd_epi32(ewrt
);
796 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
797 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
799 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
800 felec
= _mm_mul_pd(_mm_mul_pd(qq00
,rinv00
),_mm_sub_pd(rinvsq00
,felec
));
802 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
806 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
808 /* Calculate temporary vectorial force */
809 tx
= _mm_mul_pd(fscal
,dx00
);
810 ty
= _mm_mul_pd(fscal
,dy00
);
811 tz
= _mm_mul_pd(fscal
,dz00
);
813 /* Update vectorial force */
814 fix0
= _mm_add_pd(fix0
,tx
);
815 fiy0
= _mm_add_pd(fiy0
,ty
);
816 fiz0
= _mm_add_pd(fiz0
,tz
);
818 fjx0
= _mm_add_pd(fjx0
,tx
);
819 fjy0
= _mm_add_pd(fjy0
,ty
);
820 fjz0
= _mm_add_pd(fjz0
,tz
);
824 /**************************
825 * CALCULATE INTERACTIONS *
826 **************************/
828 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
831 r10
= _mm_mul_pd(rsq10
,rinv10
);
833 /* Compute parameters for interactions between i and j atoms */
834 qq10
= _mm_mul_pd(iq1
,jq0
);
836 /* EWALD ELECTROSTATICS */
838 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
839 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
840 ewitab
= _mm_cvttpd_epi32(ewrt
);
841 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
842 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
844 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
845 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
847 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
851 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
853 /* Calculate temporary vectorial force */
854 tx
= _mm_mul_pd(fscal
,dx10
);
855 ty
= _mm_mul_pd(fscal
,dy10
);
856 tz
= _mm_mul_pd(fscal
,dz10
);
858 /* Update vectorial force */
859 fix1
= _mm_add_pd(fix1
,tx
);
860 fiy1
= _mm_add_pd(fiy1
,ty
);
861 fiz1
= _mm_add_pd(fiz1
,tz
);
863 fjx0
= _mm_add_pd(fjx0
,tx
);
864 fjy0
= _mm_add_pd(fjy0
,ty
);
865 fjz0
= _mm_add_pd(fjz0
,tz
);
869 /**************************
870 * CALCULATE INTERACTIONS *
871 **************************/
873 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
876 r20
= _mm_mul_pd(rsq20
,rinv20
);
878 /* Compute parameters for interactions between i and j atoms */
879 qq20
= _mm_mul_pd(iq2
,jq0
);
881 /* EWALD ELECTROSTATICS */
883 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
884 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
885 ewitab
= _mm_cvttpd_epi32(ewrt
);
886 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
887 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
889 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
890 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
892 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
896 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
898 /* Calculate temporary vectorial force */
899 tx
= _mm_mul_pd(fscal
,dx20
);
900 ty
= _mm_mul_pd(fscal
,dy20
);
901 tz
= _mm_mul_pd(fscal
,dz20
);
903 /* Update vectorial force */
904 fix2
= _mm_add_pd(fix2
,tx
);
905 fiy2
= _mm_add_pd(fiy2
,ty
);
906 fiz2
= _mm_add_pd(fiz2
,tz
);
908 fjx0
= _mm_add_pd(fjx0
,tx
);
909 fjy0
= _mm_add_pd(fjy0
,ty
);
910 fjz0
= _mm_add_pd(fjz0
,tz
);
914 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
);
916 /* Inner loop uses 120 flops */
923 j_coord_offsetA
= DIM
*jnrA
;
925 /* load j atom coordinates */
926 gmx_mm_load_1rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
929 /* Calculate displacement vector */
930 dx00
= _mm_sub_pd(ix0
,jx0
);
931 dy00
= _mm_sub_pd(iy0
,jy0
);
932 dz00
= _mm_sub_pd(iz0
,jz0
);
933 dx10
= _mm_sub_pd(ix1
,jx0
);
934 dy10
= _mm_sub_pd(iy1
,jy0
);
935 dz10
= _mm_sub_pd(iz1
,jz0
);
936 dx20
= _mm_sub_pd(ix2
,jx0
);
937 dy20
= _mm_sub_pd(iy2
,jy0
);
938 dz20
= _mm_sub_pd(iz2
,jz0
);
940 /* Calculate squared distance and things based on it */
941 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
942 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
943 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
945 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
946 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
947 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
949 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
950 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
951 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
953 /* Load parameters for j particles */
954 jq0
= _mm_load_sd(charge
+jnrA
+0);
956 fjx0
= _mm_setzero_pd();
957 fjy0
= _mm_setzero_pd();
958 fjz0
= _mm_setzero_pd();
960 /**************************
961 * CALCULATE INTERACTIONS *
962 **************************/
964 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
967 r00
= _mm_mul_pd(rsq00
,rinv00
);
969 /* Compute parameters for interactions between i and j atoms */
970 qq00
= _mm_mul_pd(iq0
,jq0
);
972 /* EWALD ELECTROSTATICS */
974 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
975 ewrt
= _mm_mul_pd(r00
,ewtabscale
);
976 ewitab
= _mm_cvttpd_epi32(ewrt
);
977 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
978 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
979 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
980 felec
= _mm_mul_pd(_mm_mul_pd(qq00
,rinv00
),_mm_sub_pd(rinvsq00
,felec
));
982 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
986 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
988 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
990 /* Calculate temporary vectorial force */
991 tx
= _mm_mul_pd(fscal
,dx00
);
992 ty
= _mm_mul_pd(fscal
,dy00
);
993 tz
= _mm_mul_pd(fscal
,dz00
);
995 /* Update vectorial force */
996 fix0
= _mm_add_pd(fix0
,tx
);
997 fiy0
= _mm_add_pd(fiy0
,ty
);
998 fiz0
= _mm_add_pd(fiz0
,tz
);
1000 fjx0
= _mm_add_pd(fjx0
,tx
);
1001 fjy0
= _mm_add_pd(fjy0
,ty
);
1002 fjz0
= _mm_add_pd(fjz0
,tz
);
1006 /**************************
1007 * CALCULATE INTERACTIONS *
1008 **************************/
1010 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
1013 r10
= _mm_mul_pd(rsq10
,rinv10
);
1015 /* Compute parameters for interactions between i and j atoms */
1016 qq10
= _mm_mul_pd(iq1
,jq0
);
1018 /* EWALD ELECTROSTATICS */
1020 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1021 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
1022 ewitab
= _mm_cvttpd_epi32(ewrt
);
1023 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1024 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
1025 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1026 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
1028 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
1032 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1034 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1036 /* Calculate temporary vectorial force */
1037 tx
= _mm_mul_pd(fscal
,dx10
);
1038 ty
= _mm_mul_pd(fscal
,dy10
);
1039 tz
= _mm_mul_pd(fscal
,dz10
);
1041 /* Update vectorial force */
1042 fix1
= _mm_add_pd(fix1
,tx
);
1043 fiy1
= _mm_add_pd(fiy1
,ty
);
1044 fiz1
= _mm_add_pd(fiz1
,tz
);
1046 fjx0
= _mm_add_pd(fjx0
,tx
);
1047 fjy0
= _mm_add_pd(fjy0
,ty
);
1048 fjz0
= _mm_add_pd(fjz0
,tz
);
1052 /**************************
1053 * CALCULATE INTERACTIONS *
1054 **************************/
1056 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
1059 r20
= _mm_mul_pd(rsq20
,rinv20
);
1061 /* Compute parameters for interactions between i and j atoms */
1062 qq20
= _mm_mul_pd(iq2
,jq0
);
1064 /* EWALD ELECTROSTATICS */
1066 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1067 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
1068 ewitab
= _mm_cvttpd_epi32(ewrt
);
1069 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1070 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
1071 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1072 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
1074 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
1078 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1080 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1082 /* Calculate temporary vectorial force */
1083 tx
= _mm_mul_pd(fscal
,dx20
);
1084 ty
= _mm_mul_pd(fscal
,dy20
);
1085 tz
= _mm_mul_pd(fscal
,dz20
);
1087 /* Update vectorial force */
1088 fix2
= _mm_add_pd(fix2
,tx
);
1089 fiy2
= _mm_add_pd(fiy2
,ty
);
1090 fiz2
= _mm_add_pd(fiz2
,tz
);
1092 fjx0
= _mm_add_pd(fjx0
,tx
);
1093 fjy0
= _mm_add_pd(fjy0
,ty
);
1094 fjz0
= _mm_add_pd(fjz0
,tz
);
1098 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
);
1100 /* Inner loop uses 120 flops */
1103 /* End of innermost loop */
1105 gmx_mm_update_iforce_3atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
1106 f
+i_coord_offset
,fshift
+i_shift_offset
);
1108 /* Increment number of inner iterations */
1109 inneriter
+= j_index_end
- j_index_start
;
1111 /* Outer loop uses 18 flops */
1114 /* Increment number of outer iterations */
1117 /* Update outer/inner flops */
1119 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_W3_F
,outeriter
*18 + inneriter
*120);