2 * Note: this file was generated by the Gromacs sse2_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_double.h"
34 #include "kernelutil_x86_sse2_double.h"
/*
 * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_sse2_double
 * Electrostatics interaction: Ewald
 * VdW interaction:            None
 * Geometry:                   Particle-Particle
 * Calculate force/pot:        PotentialAndForce
 */
/*
 * nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_sse2_double
 *
 * Nonbonded kernel written with SSE2 double-precision intrinsics, so two j
 * entries (lanes A and B) are handled per __m128d register.  For each i entry
 * it evaluates a tabulated Ewald electrostatic term multiplied by a
 * fifth-order switching polynomial, and accumulates both the force and the
 * electrostatic potential.  No van der Waals term is computed in the visible
 * code (c6_00/c12_00 are declared but never used).
 *
 * Parameters, as far as the visible code uses them:
 *   nlist       - ->jindex (j range per i entry) and ->shift are read.
 *   xx, ff      - appear only in the parameter list here; the body works on
 *                 the local pointers x and f.
 *   fr          - ->shift_vec, ->fshift, ->epsfac, ->rcoulomb,
 *                 ->rcoulomb_switch, and from ->ic: sh_ewald, tabq_coul_FDV0
 *                 and tabq_scale.
 *   mdatoms     - ->chargeA supplies the per-atom charges.
 *   kernel_data - ->energygrp_elec + ggid receives the summed potential.
 *   nrnb        - handed to inc_nrnb() together with the flop estimate.
 *
 * NOTE(review): this listing looks incomplete.  charge, ewtab, ewitab,
 * rcutoff_scalar, jnrA and jnrB are used without a visible declaration; nri,
 * inr, ggid, x, f, outeriter and inneriter are read without a visible
 * assignment; fscal is read before any visible assignment; the two j
 * coordinate load calls end on a dangling comma; no braces are present.
 * Confirm against the kernel generator output before relying on this text.
 */
44 nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_sse2_double
45 (t_nblist
* gmx_restrict nlist
,
46 rvec
* gmx_restrict xx
,
47 rvec
* gmx_restrict ff
,
48 t_forcerec
* gmx_restrict fr
,
49 t_mdatoms
* gmx_restrict mdatoms
,
50 nb_kernel_data_t
* gmx_restrict kernel_data
,
51 t_nrnb
* gmx_restrict nrnb
)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
59 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
61 int j_coord_offsetA
,j_coord_offsetB
;
62 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
64 real
*shiftvec
,*fshift
,*x
,*f
;
65 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
67 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
68 int vdwjidx0A
,vdwjidx0B
;
69 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
70 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
71 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
74 __m128d ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
76 __m128d rswitch
,swV3
,swV4
,swV5
,swF2
,swF3
,swF4
,d
,d2
,sw
,dsw
;
77 real rswitch_scalar
,d_scalar
;
78 __m128d dummy_mask
,cutoff_mask
;
79 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
80 __m128d one
= _mm_set1_pd(1.0);
81 __m128d two
= _mm_set1_pd(2.0);
87 jindex
= nlist
->jindex
;
89 shiftidx
= nlist
->shift
;
91 shiftvec
= fr
->shift_vec
[0];
92 fshift
= fr
->fshift
[0];
93 facel
= _mm_set1_pd(fr
->epsfac
);
94 charge
= mdatoms
->chargeA
;
96 sh_ewald
= _mm_set1_pd(fr
->ic
->sh_ewald
);
97 ewtab
= fr
->ic
->tabq_coul_FDV0
;
98 ewtabscale
= _mm_set1_pd(fr
->ic
->tabq_scale
);
99 ewtabhalfspace
= _mm_set1_pd(0.5/fr
->ic
->tabq_scale
);
101 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
102 rcutoff_scalar
= fr
->rcoulomb
;
103 rcutoff
= _mm_set1_pd(rcutoff_scalar
);
/* The cut-off test below compares squared distances, so keep rcutoff squared. */
104 rcutoff2
= _mm_mul_pd(rcutoff
,rcutoff
);
106 rswitch_scalar
= fr
->rcoulomb_switch
;
107 rswitch
= _mm_set1_pd(rswitch_scalar
);
108 /* Setup switch parameters */
/* d_scalar is the width of the switching interval (rcoulomb - rcoulomb_switch).
 * NOTE(review): every coefficient below divides by a power of d_scalar, so
 * rcoulomb == rcoulomb_switch would divide by zero; presumably excluded by the
 * caller - confirm. */
109 d_scalar
= rcutoff_scalar
-rswitch_scalar
;
/* In the visible code d is overwritten with r - rswitch before it is next
 * read, so this initial value is not consumed. */
110 d
= _mm_set1_pd(d_scalar
);
/* Potential switch: sw(d) = 1 + swV3*d^3 + swV4*d^4 + swV5*d^5 */
111 swV3
= _mm_set1_pd(-10.0/(d_scalar
*d_scalar
*d_scalar
));
112 swV4
= _mm_set1_pd( 15.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
));
113 swV5
= _mm_set1_pd( -6.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
*d_scalar
));
/* Derivative of sw: dsw(d) = swF2*d^2 + swF3*d^3 + swF4*d^4
 * (swF2 = 3*swV3, swF3 = 4*swV4, swF4 = 5*swV5). */
114 swF2
= _mm_set1_pd(-30.0/(d_scalar
*d_scalar
*d_scalar
));
115 swF3
= _mm_set1_pd( 60.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
));
116 swF4
= _mm_set1_pd(-30.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
*d_scalar
));
118 /* Avoid stupid compiler warnings */
126 /* Start outer loop over neighborlists */
127 for(iidx
=0; iidx
<nri
; iidx
++)
129 /* Load shift vector for this list */
130 i_shift_offset
= DIM
*shiftidx
[iidx
];
132 /* Load limits for loop over neighbors */
133 j_index_start
= jindex
[iidx
];
134 j_index_end
= jindex
[iidx
+1];
136 /* Get outer coordinate index */
138 i_coord_offset
= DIM
*inr
;
140 /* Load i particle coords and add shift vector */
141 gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,&ix0
,&iy0
,&iz0
);
143 fix0
= _mm_setzero_pd();
144 fiy0
= _mm_setzero_pd();
145 fiz0
= _mm_setzero_pd();
147 /* Load parameters for i particles */
/* The i charge is pre-multiplied by facel (fr->epsfac) and broadcast to both lanes. */
148 iq0
= _mm_mul_pd(facel
,_mm_load1_pd(charge
+inr
+0));
150 /* Reset potential sums */
151 velecsum
= _mm_setzero_pd();
153 /* Start inner kernel loop */
/* Two j entries per pass (lanes A and B).  The bound j_index_end-1 stops
 * before an odd last entry; the single-j code after this loop uses the
 * 1ptr/_sd variants for that case. */
154 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
157 /* Get j neighbor index, and coordinate index */
160 j_coord_offsetA
= DIM
*jnrA
;
161 j_coord_offsetB
= DIM
*jnrB
;
163 /* load j atom coordinates */
/* NOTE(review): the call below stops after a comma; the output arguments
 * (presumably the addresses of jx0, jy0, jz0) are not visible - confirm. */
164 gmx_mm_load_1rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
167 /* Calculate displacement vector */
168 dx00
= _mm_sub_pd(ix0
,jx0
);
169 dy00
= _mm_sub_pd(iy0
,jy0
);
170 dz00
= _mm_sub_pd(iz0
,jz0
);
172 /* Calculate squared distance and things based on it */
173 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
175 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
177 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
179 /* Load parameters for j particles */
180 jq0
= gmx_mm_load_2real_swizzle_pd(charge
+jnrA
+0,charge
+jnrB
+0);
182 /**************************
183 * CALCULATE INTERACTIONS *
184 **************************/
/* gmx_mm_any_lt is defined elsewhere; by its name it presumably reports
 * whether at least one lane has rsq00 < rcutoff2 - confirm. */
186 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
/* r = rsq * (1/r) */
189 r00
= _mm_mul_pd(rsq00
,rinv00
);
191 /* Compute parameters for interactions between i and j atoms */
192 qq00
= _mm_mul_pd(iq0
,jq0
);
194 /* EWALD ELECTROSTATICS */
196 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
197 ewrt
= _mm_mul_pd(r00
,ewtabscale
);
198 ewitab
= _mm_cvttpd_epi32(ewrt
);
/* eweps is the fractional part of the scaled distance. */
199 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
/* Index times 4: presumably four table values per point (F, D, V and a pad,
 * going by the tabq_coul_FDV0 name) - confirm against the table layout. */
200 ewitab
= _mm_slli_epi32(ewitab
,2);
/* Load the first two table values for each lane.  GMX_MM_TRANSPOSE2_PD is
 * defined elsewhere; presumably it regroups them so that ewtabF and ewtabD
 * each hold one quantity for both lanes - confirm. */
201 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
202 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
203 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
/* Third table value (offset +2) for each lane, combined into ewtabV. */
204 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
205 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
206 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
/* Table interpolation:
 *   felec = F + eps*D
 *   velec = V - (0.5/tabq_scale)*eps*(F + felec) */
207 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
208 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
/* Subtract the tabulated part from the plain Coulomb terms:
 *   velec = qq*(1/r - velec),  felec = qq*(1/r)*(1/r^2 - felec) */
209 velec
= _mm_mul_pd(qq00
,_mm_sub_pd(rinv00
,velec
));
210 felec
= _mm_mul_pd(_mm_mul_pd(qq00
,rinv00
),_mm_sub_pd(rinvsq00
,felec
));
/* d = max(r - rswitch, 0): for r below rswitch this gives sw = 1 and dsw = 0. */
212 d
= _mm_sub_pd(r00
,rswitch
);
213 d
= _mm_max_pd(d
,_mm_setzero_pd());
214 d2
= _mm_mul_pd(d
,d
);
215 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
217 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
219 /* Evaluate switch function */
220 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
/* The force update must use the unswitched velec, so it comes before velec
 * itself is multiplied by sw. */
221 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv00
,_mm_mul_pd(velec
,dsw
)) );
222 velec
= _mm_mul_pd(velec
,sw
);
/* Per-lane mask: all bits set where rsq00 < rcutoff2, zero otherwise. */
223 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
225 /* Update potential sum for this i atom from the interaction with this j atom. */
226 velec
= _mm_and_pd(velec
,cutoff_mask
);
227 velecsum
= _mm_add_pd(velecsum
,velec
);
/* NOTE(review): fscal is read here without a visible prior assignment
 * (presumably an "fscal = felec" line was lost) - confirm. */
231 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
233 /* Calculate temporary vectorial force */
234 tx
= _mm_mul_pd(fscal
,dx00
);
235 ty
= _mm_mul_pd(fscal
,dy00
);
236 tz
= _mm_mul_pd(fscal
,dz00
);
238 /* Update vectorial force */
239 fix0
= _mm_add_pd(fix0
,tx
);
240 fiy0
= _mm_add_pd(fiy0
,ty
);
241 fiz0
= _mm_add_pd(fiz0
,tz
);
/* The same tx/ty/tz that were added to the i force are handed to the
 * "decrement" helper for the two j atoms. */
243 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,tx
,ty
,tz
);
247 /* Inner loop uses 65 flops */
/* Single-j section: the same computation as above with only lane 0 carrying
 * a real j entry (jnrA).  NOTE(review): the condition guarding this section is
 * not visible in this listing. */
254 j_coord_offsetA
= DIM
*jnrA
;
256 /* load j atom coordinates */
/* NOTE(review): as above, the output arguments of this call are not visible. */
257 gmx_mm_load_1rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
260 /* Calculate displacement vector */
261 dx00
= _mm_sub_pd(ix0
,jx0
);
262 dy00
= _mm_sub_pd(iy0
,jy0
);
263 dz00
= _mm_sub_pd(iz0
,jz0
);
265 /* Calculate squared distance and things based on it */
266 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
268 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
270 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
272 /* Load parameters for j particles */
/* _mm_load_sd fills lane 0 with the charge and zeroes lane 1. */
273 jq0
= _mm_load_sd(charge
+jnrA
+0);
275 /**************************
276 * CALCULATE INTERACTIONS *
277 **************************/
279 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
282 r00
= _mm_mul_pd(rsq00
,rinv00
);
284 /* Compute parameters for interactions between i and j atoms */
285 qq00
= _mm_mul_pd(iq0
,jq0
);
287 /* EWALD ELECTROSTATICS */
289 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
290 ewrt
= _mm_mul_pd(r00
,ewtabscale
);
291 ewitab
= _mm_cvttpd_epi32(ewrt
);
292 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
293 ewitab
= _mm_slli_epi32(ewitab
,2);
/* Only index 0 of ewitab is used for table loads here; zeros stand in for
 * the second lane's table data. */
294 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
295 ewtabD
= _mm_setzero_pd();
296 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
297 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
298 ewtabFn
= _mm_setzero_pd();
299 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
300 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
301 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
302 velec
= _mm_mul_pd(qq00
,_mm_sub_pd(rinv00
,velec
));
303 felec
= _mm_mul_pd(_mm_mul_pd(qq00
,rinv00
),_mm_sub_pd(rinvsq00
,felec
));
305 d
= _mm_sub_pd(r00
,rswitch
);
306 d
= _mm_max_pd(d
,_mm_setzero_pd());
307 d2
= _mm_mul_pd(d
,d
);
308 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
310 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
312 /* Evaluate switch function */
313 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
314 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv00
,_mm_mul_pd(velec
,dsw
)) );
315 velec
= _mm_mul_pd(velec
,sw
);
316 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
318 /* Update potential sum for this i atom from the interaction with this j atom. */
319 velec
= _mm_and_pd(velec
,cutoff_mask
);
/* Keep lane 0 and force lane 1 to zero so the unused lane adds nothing to the sum. */
320 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
321 velecsum
= _mm_add_pd(velecsum
,velec
);
/* NOTE(review): fscal is again read without a visible prior assignment. */
325 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
/* Same lane-1 clearing for the force scalar. */
327 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
329 /* Calculate temporary vectorial force */
330 tx
= _mm_mul_pd(fscal
,dx00
);
331 ty
= _mm_mul_pd(fscal
,dy00
);
332 tz
= _mm_mul_pd(fscal
,dz00
);
334 /* Update vectorial force */
335 fix0
= _mm_add_pd(fix0
,tx
);
336 fiy0
= _mm_add_pd(fiy0
,ty
);
337 fiz0
= _mm_add_pd(fiz0
,tz
);
339 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,tx
,ty
,tz
);
343 /* Inner loop uses 65 flops */
346 /* End of innermost loop */
/* Hand the accumulated i force to the helper together with the force and
 * shift-force destinations for this i entry. */
348 gmx_mm_update_iforce_1atom_swizzle_pd(fix0
,fiy0
,fiz0
,
349 f
+i_coord_offset
,fshift
+i_shift_offset
);
352 /* Update potential energies */
353 gmx_mm_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
355 /* Increment number of inner iterations */
356 inneriter
+= j_index_end
- j_index_start
;
358 /* Outer loop uses 8 flops */
361 /* Increment number of outer iterations */
364 /* Update outer/inner flops */
/* Flop estimate: 8 per outer iteration plus 65 per j entry, matching the
 * per-loop figures quoted in the comments above. */
366 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VF
,outeriter
*8 + inneriter
*65);
/*
 * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_sse2_double
 * Electrostatics interaction: Ewald
 * VdW interaction:            None
 * Geometry:                   Particle-Particle
 * Calculate force/pot:        Force
 */
/*
 * nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_sse2_double
 *
 * Force-only nonbonded kernel written with SSE2 double-precision intrinsics
 * (two j entries, lanes A and B, per __m128d register).  For each i entry it
 * evaluates a tabulated Ewald electrostatic term multiplied by a fifth-order
 * switching polynomial and accumulates forces.  The potential velec is still
 * computed per pair because the switched force needs it, but nothing is
 * summed into an energy array in the visible code (velecsum is declared and
 * never used).  No van der Waals term is computed in the visible code.
 *
 * Parameters, as far as the visible code uses them:
 *   nlist       - ->jindex (j range per i entry) and ->shift are read.
 *   xx, ff      - appear only in the parameter list here; the body works on
 *                 the local pointers x and f.
 *   fr          - ->shift_vec, ->fshift, ->epsfac, ->rcoulomb,
 *                 ->rcoulomb_switch, and from ->ic: sh_ewald, tabq_coul_FDV0
 *                 and tabq_scale.
 *   mdatoms     - ->chargeA supplies the per-atom charges.
 *   kernel_data - not referenced in the visible body.
 *   nrnb        - handed to inc_nrnb() together with the flop estimate.
 *
 * NOTE(review): this listing looks incomplete.  charge, ewtab, ewitab,
 * rcutoff_scalar, jnrA and jnrB are used without a visible declaration; nri,
 * inr, x, f, outeriter and inneriter are read without a visible assignment;
 * fscal is read before any visible assignment; the two j coordinate load
 * calls end on a dangling comma; no braces are present.  Confirm against the
 * kernel generator output before relying on this text.
 */
376 nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_sse2_double
377 (t_nblist
* gmx_restrict nlist
,
378 rvec
* gmx_restrict xx
,
379 rvec
* gmx_restrict ff
,
380 t_forcerec
* gmx_restrict fr
,
381 t_mdatoms
* gmx_restrict mdatoms
,
382 nb_kernel_data_t
* gmx_restrict kernel_data
,
383 t_nrnb
* gmx_restrict nrnb
)
385 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
386 * just 0 for non-waters.
387 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
388 * jnr indices corresponding to data put in the four positions in the SIMD register.
390 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
391 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
393 int j_coord_offsetA
,j_coord_offsetB
;
394 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
396 real
*shiftvec
,*fshift
,*x
,*f
;
397 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
399 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
400 int vdwjidx0A
,vdwjidx0B
;
401 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
402 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
403 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
406 __m128d ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
408 __m128d rswitch
,swV3
,swV4
,swV5
,swF2
,swF3
,swF4
,d
,d2
,sw
,dsw
;
409 real rswitch_scalar
,d_scalar
;
410 __m128d dummy_mask
,cutoff_mask
;
411 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
412 __m128d one
= _mm_set1_pd(1.0);
413 __m128d two
= _mm_set1_pd(2.0);
419 jindex
= nlist
->jindex
;
421 shiftidx
= nlist
->shift
;
423 shiftvec
= fr
->shift_vec
[0];
424 fshift
= fr
->fshift
[0];
425 facel
= _mm_set1_pd(fr
->epsfac
);
426 charge
= mdatoms
->chargeA
;
428 sh_ewald
= _mm_set1_pd(fr
->ic
->sh_ewald
);
429 ewtab
= fr
->ic
->tabq_coul_FDV0
;
430 ewtabscale
= _mm_set1_pd(fr
->ic
->tabq_scale
);
431 ewtabhalfspace
= _mm_set1_pd(0.5/fr
->ic
->tabq_scale
);
433 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
434 rcutoff_scalar
= fr
->rcoulomb
;
435 rcutoff
= _mm_set1_pd(rcutoff_scalar
);
/* The cut-off test below compares squared distances, so keep rcutoff squared. */
436 rcutoff2
= _mm_mul_pd(rcutoff
,rcutoff
);
438 rswitch_scalar
= fr
->rcoulomb_switch
;
439 rswitch
= _mm_set1_pd(rswitch_scalar
);
440 /* Setup switch parameters */
/* d_scalar is the width of the switching interval (rcoulomb - rcoulomb_switch).
 * NOTE(review): every coefficient below divides by a power of d_scalar, so
 * rcoulomb == rcoulomb_switch would divide by zero; presumably excluded by the
 * caller - confirm. */
441 d_scalar
= rcutoff_scalar
-rswitch_scalar
;
/* In the visible code d is overwritten with r - rswitch before it is next
 * read, so this initial value is not consumed. */
442 d
= _mm_set1_pd(d_scalar
);
/* Potential switch: sw(d) = 1 + swV3*d^3 + swV4*d^4 + swV5*d^5 */
443 swV3
= _mm_set1_pd(-10.0/(d_scalar
*d_scalar
*d_scalar
));
444 swV4
= _mm_set1_pd( 15.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
));
445 swV5
= _mm_set1_pd( -6.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
*d_scalar
));
/* Derivative of sw: dsw(d) = swF2*d^2 + swF3*d^3 + swF4*d^4
 * (swF2 = 3*swV3, swF3 = 4*swV4, swF4 = 5*swV5). */
446 swF2
= _mm_set1_pd(-30.0/(d_scalar
*d_scalar
*d_scalar
));
447 swF3
= _mm_set1_pd( 60.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
));
448 swF4
= _mm_set1_pd(-30.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
*d_scalar
));
450 /* Avoid stupid compiler warnings */
458 /* Start outer loop over neighborlists */
459 for(iidx
=0; iidx
<nri
; iidx
++)
461 /* Load shift vector for this list */
462 i_shift_offset
= DIM
*shiftidx
[iidx
];
464 /* Load limits for loop over neighbors */
465 j_index_start
= jindex
[iidx
];
466 j_index_end
= jindex
[iidx
+1];
468 /* Get outer coordinate index */
470 i_coord_offset
= DIM
*inr
;
472 /* Load i particle coords and add shift vector */
473 gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,&ix0
,&iy0
,&iz0
);
475 fix0
= _mm_setzero_pd();
476 fiy0
= _mm_setzero_pd();
477 fiz0
= _mm_setzero_pd();
479 /* Load parameters for i particles */
/* The i charge is pre-multiplied by facel (fr->epsfac) and broadcast to both lanes. */
480 iq0
= _mm_mul_pd(facel
,_mm_load1_pd(charge
+inr
+0));
482 /* Start inner kernel loop */
/* Two j entries per pass (lanes A and B).  The bound j_index_end-1 stops
 * before an odd last entry; the single-j code after this loop uses the
 * 1ptr/_sd variants for that case. */
483 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
486 /* Get j neighbor index, and coordinate index */
489 j_coord_offsetA
= DIM
*jnrA
;
490 j_coord_offsetB
= DIM
*jnrB
;
492 /* load j atom coordinates */
/* NOTE(review): the call below stops after a comma; the output arguments
 * (presumably the addresses of jx0, jy0, jz0) are not visible - confirm. */
493 gmx_mm_load_1rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
496 /* Calculate displacement vector */
497 dx00
= _mm_sub_pd(ix0
,jx0
);
498 dy00
= _mm_sub_pd(iy0
,jy0
);
499 dz00
= _mm_sub_pd(iz0
,jz0
);
501 /* Calculate squared distance and things based on it */
502 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
504 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
506 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
508 /* Load parameters for j particles */
509 jq0
= gmx_mm_load_2real_swizzle_pd(charge
+jnrA
+0,charge
+jnrB
+0);
511 /**************************
512 * CALCULATE INTERACTIONS *
513 **************************/
/* gmx_mm_any_lt is defined elsewhere; by its name it presumably reports
 * whether at least one lane has rsq00 < rcutoff2 - confirm. */
515 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
/* r = rsq * (1/r) */
518 r00
= _mm_mul_pd(rsq00
,rinv00
);
520 /* Compute parameters for interactions between i and j atoms */
521 qq00
= _mm_mul_pd(iq0
,jq0
);
523 /* EWALD ELECTROSTATICS */
525 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
526 ewrt
= _mm_mul_pd(r00
,ewtabscale
);
527 ewitab
= _mm_cvttpd_epi32(ewrt
);
/* eweps is the fractional part of the scaled distance. */
528 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
/* Index times 4: presumably four table values per point (F, D, V and a pad,
 * going by the tabq_coul_FDV0 name) - confirm against the table layout. */
529 ewitab
= _mm_slli_epi32(ewitab
,2);
/* Load the first two table values for each lane.  GMX_MM_TRANSPOSE2_PD is
 * defined elsewhere; presumably it regroups them so that ewtabF and ewtabD
 * each hold one quantity for both lanes - confirm. */
530 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
531 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
532 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
/* Third table value (offset +2) for each lane, combined into ewtabV. */
533 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
534 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
535 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
/* Table interpolation:
 *   felec = F + eps*D
 *   velec = V - (0.5/tabq_scale)*eps*(F + felec) */
536 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
537 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
/* Subtract the tabulated part from the plain Coulomb terms:
 *   velec = qq*(1/r - velec),  felec = qq*(1/r)*(1/r^2 - felec) */
538 velec
= _mm_mul_pd(qq00
,_mm_sub_pd(rinv00
,velec
));
539 felec
= _mm_mul_pd(_mm_mul_pd(qq00
,rinv00
),_mm_sub_pd(rinvsq00
,felec
));
/* d = max(r - rswitch, 0): for r below rswitch this gives sw = 1 and dsw = 0. */
541 d
= _mm_sub_pd(r00
,rswitch
);
542 d
= _mm_max_pd(d
,_mm_setzero_pd());
543 d2
= _mm_mul_pd(d
,d
);
544 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
546 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
548 /* Evaluate switch function */
549 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
/* velec is needed here for the dsw term even though this kernel does not
 * accumulate the potential. */
550 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv00
,_mm_mul_pd(velec
,dsw
)) );
/* Per-lane mask: all bits set where rsq00 < rcutoff2, zero otherwise. */
551 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
/* NOTE(review): fscal is read here without a visible prior assignment
 * (presumably an "fscal = felec" line was lost) - confirm. */
555 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
557 /* Calculate temporary vectorial force */
558 tx
= _mm_mul_pd(fscal
,dx00
);
559 ty
= _mm_mul_pd(fscal
,dy00
);
560 tz
= _mm_mul_pd(fscal
,dz00
);
562 /* Update vectorial force */
563 fix0
= _mm_add_pd(fix0
,tx
);
564 fiy0
= _mm_add_pd(fiy0
,ty
);
565 fiz0
= _mm_add_pd(fiz0
,tz
);
/* The same tx/ty/tz that were added to the i force are handed to the
 * "decrement" helper for the two j atoms. */
567 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,tx
,ty
,tz
);
571 /* Inner loop uses 62 flops */
/* Single-j section: the same computation as above with only lane 0 carrying
 * a real j entry (jnrA).  NOTE(review): the condition guarding this section is
 * not visible in this listing. */
578 j_coord_offsetA
= DIM
*jnrA
;
580 /* load j atom coordinates */
/* NOTE(review): as above, the output arguments of this call are not visible. */
581 gmx_mm_load_1rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
584 /* Calculate displacement vector */
585 dx00
= _mm_sub_pd(ix0
,jx0
);
586 dy00
= _mm_sub_pd(iy0
,jy0
);
587 dz00
= _mm_sub_pd(iz0
,jz0
);
589 /* Calculate squared distance and things based on it */
590 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
592 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
594 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
596 /* Load parameters for j particles */
/* _mm_load_sd fills lane 0 with the charge and zeroes lane 1. */
597 jq0
= _mm_load_sd(charge
+jnrA
+0);
599 /**************************
600 * CALCULATE INTERACTIONS *
601 **************************/
603 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
606 r00
= _mm_mul_pd(rsq00
,rinv00
);
608 /* Compute parameters for interactions between i and j atoms */
609 qq00
= _mm_mul_pd(iq0
,jq0
);
611 /* EWALD ELECTROSTATICS */
613 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
614 ewrt
= _mm_mul_pd(r00
,ewtabscale
);
615 ewitab
= _mm_cvttpd_epi32(ewrt
);
616 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
617 ewitab
= _mm_slli_epi32(ewitab
,2);
/* Only index 0 of ewitab is used for table loads here; zeros stand in for
 * the second lane's table data. */
618 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
619 ewtabD
= _mm_setzero_pd();
620 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
621 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
622 ewtabFn
= _mm_setzero_pd();
623 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
624 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
625 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
626 velec
= _mm_mul_pd(qq00
,_mm_sub_pd(rinv00
,velec
));
627 felec
= _mm_mul_pd(_mm_mul_pd(qq00
,rinv00
),_mm_sub_pd(rinvsq00
,felec
));
629 d
= _mm_sub_pd(r00
,rswitch
);
630 d
= _mm_max_pd(d
,_mm_setzero_pd());
631 d2
= _mm_mul_pd(d
,d
);
632 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
634 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
636 /* Evaluate switch function */
637 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
638 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv00
,_mm_mul_pd(velec
,dsw
)) );
639 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
/* NOTE(review): fscal is again read without a visible prior assignment. */
643 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
/* Keep lane 0 and force lane 1 to zero so the unused lane adds no force. */
645 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
647 /* Calculate temporary vectorial force */
648 tx
= _mm_mul_pd(fscal
,dx00
);
649 ty
= _mm_mul_pd(fscal
,dy00
);
650 tz
= _mm_mul_pd(fscal
,dz00
);
652 /* Update vectorial force */
653 fix0
= _mm_add_pd(fix0
,tx
);
654 fiy0
= _mm_add_pd(fiy0
,ty
);
655 fiz0
= _mm_add_pd(fiz0
,tz
);
657 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,tx
,ty
,tz
);
661 /* Inner loop uses 62 flops */
664 /* End of innermost loop */
/* Hand the accumulated i force to the helper together with the force and
 * shift-force destinations for this i entry. */
666 gmx_mm_update_iforce_1atom_swizzle_pd(fix0
,fiy0
,fiz0
,
667 f
+i_coord_offset
,fshift
+i_shift_offset
);
669 /* Increment number of inner iterations */
670 inneriter
+= j_index_end
- j_index_start
;
672 /* Outer loop uses 7 flops */
675 /* Increment number of outer iterations */
678 /* Update outer/inner flops */
/* Flop estimate: 7 per outer iteration plus 62 per j entry, matching the
 * per-loop figures quoted in the comments above. */
680 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_F
,outeriter
*7 + inneriter
*62);