/*
 * Note: this file was generated by the Gromacs sse2_double kernel generator.
 *
 * This source code is part of GROMACS.
 *
 * Copyright (c) 2001-2012, The GROMACS Development Team
 *
 * Gromacs is a library for molecular simulation and trajectory analysis,
 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 * a full list of developers and information, check out http://www.gromacs.org
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option) any
 * later version.
 *
 * To help fund GROMACS development, we humbly ask that you cite
 * the papers people have written on it - you can find them on the website.
 */
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_double.h"
34 #include "kernelutil_x86_sse2_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_sse2_double
38 * Electrostatics interaction: CubicSplineTable
39 * VdW interaction: CubicSplineTable
40 * Geometry: Particle-Particle
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_sse2_double
45 (t_nblist
* gmx_restrict nlist
,
46 rvec
* gmx_restrict xx
,
47 rvec
* gmx_restrict ff
,
48 t_forcerec
* gmx_restrict fr
,
49 t_mdatoms
* gmx_restrict mdatoms
,
50 nb_kernel_data_t
* gmx_restrict kernel_data
,
51 t_nrnb
* gmx_restrict nrnb
)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
59 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
61 int j_coord_offsetA
,j_coord_offsetB
;
62 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
64 real
*shiftvec
,*fshift
,*x
,*f
;
65 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
67 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
68 int vdwjidx0A
,vdwjidx0B
;
69 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
70 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
71 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
74 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
77 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
78 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
80 __m128i ifour
= _mm_set1_epi32(4);
81 __m128d rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
;
83 __m128d dummy_mask
,cutoff_mask
;
84 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
85 __m128d one
= _mm_set1_pd(1.0);
86 __m128d two
= _mm_set1_pd(2.0);
92 jindex
= nlist
->jindex
;
94 shiftidx
= nlist
->shift
;
96 shiftvec
= fr
->shift_vec
[0];
97 fshift
= fr
->fshift
[0];
98 facel
= _mm_set1_pd(fr
->epsfac
);
99 charge
= mdatoms
->chargeA
;
100 nvdwtype
= fr
->ntype
;
102 vdwtype
= mdatoms
->typeA
;
104 vftab
= kernel_data
->table_elec_vdw
->data
;
105 vftabscale
= _mm_set1_pd(kernel_data
->table_elec_vdw
->scale
);
107 /* Avoid stupid compiler warnings */
115 /* Start outer loop over neighborlists */
116 for(iidx
=0; iidx
<nri
; iidx
++)
118 /* Load shift vector for this list */
119 i_shift_offset
= DIM
*shiftidx
[iidx
];
121 /* Load limits for loop over neighbors */
122 j_index_start
= jindex
[iidx
];
123 j_index_end
= jindex
[iidx
+1];
125 /* Get outer coordinate index */
127 i_coord_offset
= DIM
*inr
;
129 /* Load i particle coords and add shift vector */
130 gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,&ix0
,&iy0
,&iz0
);
132 fix0
= _mm_setzero_pd();
133 fiy0
= _mm_setzero_pd();
134 fiz0
= _mm_setzero_pd();
136 /* Load parameters for i particles */
137 iq0
= _mm_mul_pd(facel
,_mm_load1_pd(charge
+inr
+0));
138 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
140 /* Reset potential sums */
141 velecsum
= _mm_setzero_pd();
142 vvdwsum
= _mm_setzero_pd();
144 /* Start inner kernel loop */
145 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
148 /* Get j neighbor index, and coordinate index */
151 j_coord_offsetA
= DIM
*jnrA
;
152 j_coord_offsetB
= DIM
*jnrB
;
154 /* load j atom coordinates */
155 gmx_mm_load_1rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
158 /* Calculate displacement vector */
159 dx00
= _mm_sub_pd(ix0
,jx0
);
160 dy00
= _mm_sub_pd(iy0
,jy0
);
161 dz00
= _mm_sub_pd(iz0
,jz0
);
163 /* Calculate squared distance and things based on it */
164 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
166 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
168 /* Load parameters for j particles */
169 jq0
= gmx_mm_load_2real_swizzle_pd(charge
+jnrA
+0,charge
+jnrB
+0);
170 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
171 vdwjidx0B
= 2*vdwtype
[jnrB
+0];
173 /**************************
174 * CALCULATE INTERACTIONS *
175 **************************/
177 r00
= _mm_mul_pd(rsq00
,rinv00
);
179 /* Compute parameters for interactions between i and j atoms */
180 qq00
= _mm_mul_pd(iq0
,jq0
);
181 gmx_mm_load_2pair_swizzle_pd(vdwparam
+vdwioffset0
+vdwjidx0A
,
182 vdwparam
+vdwioffset0
+vdwjidx0B
,&c6_00
,&c12_00
);
184 /* Calculate table index by multiplying r with table scale and truncate to integer */
185 rt
= _mm_mul_pd(r00
,vftabscale
);
186 vfitab
= _mm_cvttpd_epi32(rt
);
187 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
188 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
190 /* CUBIC SPLINE TABLE ELECTROSTATICS */
191 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
192 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
193 GMX_MM_TRANSPOSE2_PD(Y
,F
);
194 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
195 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
196 GMX_MM_TRANSPOSE2_PD(G
,H
);
197 Heps
= _mm_mul_pd(vfeps
,H
);
198 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
199 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
200 velec
= _mm_mul_pd(qq00
,VV
);
201 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
202 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq00
,FF
),_mm_mul_pd(vftabscale
,rinv00
)));
204 /* CUBIC SPLINE TABLE DISPERSION */
205 vfitab
= _mm_add_epi32(vfitab
,ifour
);
206 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
207 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
208 GMX_MM_TRANSPOSE2_PD(Y
,F
);
209 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
210 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
211 GMX_MM_TRANSPOSE2_PD(G
,H
);
212 Heps
= _mm_mul_pd(vfeps
,H
);
213 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
214 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
215 vvdw6
= _mm_mul_pd(c6_00
,VV
);
216 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
217 fvdw6
= _mm_mul_pd(c6_00
,FF
);
219 /* CUBIC SPLINE TABLE REPULSION */
220 vfitab
= _mm_add_epi32(vfitab
,ifour
);
221 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
222 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
223 GMX_MM_TRANSPOSE2_PD(Y
,F
);
224 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
225 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
226 GMX_MM_TRANSPOSE2_PD(G
,H
);
227 Heps
= _mm_mul_pd(vfeps
,H
);
228 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
229 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
230 vvdw12
= _mm_mul_pd(c12_00
,VV
);
231 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
232 fvdw12
= _mm_mul_pd(c12_00
,FF
);
233 vvdw
= _mm_add_pd(vvdw12
,vvdw6
);
234 fvdw
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_add_pd(fvdw6
,fvdw12
),_mm_mul_pd(vftabscale
,rinv00
)));
236 /* Update potential sum for this i atom from the interaction with this j atom. */
237 velecsum
= _mm_add_pd(velecsum
,velec
);
238 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
240 fscal
= _mm_add_pd(felec
,fvdw
);
242 /* Calculate temporary vectorial force */
243 tx
= _mm_mul_pd(fscal
,dx00
);
244 ty
= _mm_mul_pd(fscal
,dy00
);
245 tz
= _mm_mul_pd(fscal
,dz00
);
247 /* Update vectorial force */
248 fix0
= _mm_add_pd(fix0
,tx
);
249 fiy0
= _mm_add_pd(fiy0
,ty
);
250 fiz0
= _mm_add_pd(fiz0
,tz
);
252 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,tx
,ty
,tz
);
254 /* Inner loop uses 73 flops */
261 j_coord_offsetA
= DIM
*jnrA
;
263 /* load j atom coordinates */
264 gmx_mm_load_1rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
267 /* Calculate displacement vector */
268 dx00
= _mm_sub_pd(ix0
,jx0
);
269 dy00
= _mm_sub_pd(iy0
,jy0
);
270 dz00
= _mm_sub_pd(iz0
,jz0
);
272 /* Calculate squared distance and things based on it */
273 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
275 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
277 /* Load parameters for j particles */
278 jq0
= _mm_load_sd(charge
+jnrA
+0);
279 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
281 /**************************
282 * CALCULATE INTERACTIONS *
283 **************************/
285 r00
= _mm_mul_pd(rsq00
,rinv00
);
287 /* Compute parameters for interactions between i and j atoms */
288 qq00
= _mm_mul_pd(iq0
,jq0
);
289 gmx_mm_load_1pair_swizzle_pd(vdwparam
+vdwioffset0
+vdwjidx0A
,&c6_00
,&c12_00
);
291 /* Calculate table index by multiplying r with table scale and truncate to integer */
292 rt
= _mm_mul_pd(r00
,vftabscale
);
293 vfitab
= _mm_cvttpd_epi32(rt
);
294 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
295 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
297 /* CUBIC SPLINE TABLE ELECTROSTATICS */
298 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
299 F
= _mm_setzero_pd();
300 GMX_MM_TRANSPOSE2_PD(Y
,F
);
301 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
302 H
= _mm_setzero_pd();
303 GMX_MM_TRANSPOSE2_PD(G
,H
);
304 Heps
= _mm_mul_pd(vfeps
,H
);
305 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
306 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
307 velec
= _mm_mul_pd(qq00
,VV
);
308 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
309 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq00
,FF
),_mm_mul_pd(vftabscale
,rinv00
)));
311 /* CUBIC SPLINE TABLE DISPERSION */
312 vfitab
= _mm_add_epi32(vfitab
,ifour
);
313 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
314 F
= _mm_setzero_pd();
315 GMX_MM_TRANSPOSE2_PD(Y
,F
);
316 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
317 H
= _mm_setzero_pd();
318 GMX_MM_TRANSPOSE2_PD(G
,H
);
319 Heps
= _mm_mul_pd(vfeps
,H
);
320 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
321 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
322 vvdw6
= _mm_mul_pd(c6_00
,VV
);
323 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
324 fvdw6
= _mm_mul_pd(c6_00
,FF
);
326 /* CUBIC SPLINE TABLE REPULSION */
327 vfitab
= _mm_add_epi32(vfitab
,ifour
);
328 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
329 F
= _mm_setzero_pd();
330 GMX_MM_TRANSPOSE2_PD(Y
,F
);
331 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
332 H
= _mm_setzero_pd();
333 GMX_MM_TRANSPOSE2_PD(G
,H
);
334 Heps
= _mm_mul_pd(vfeps
,H
);
335 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
336 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
337 vvdw12
= _mm_mul_pd(c12_00
,VV
);
338 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
339 fvdw12
= _mm_mul_pd(c12_00
,FF
);
340 vvdw
= _mm_add_pd(vvdw12
,vvdw6
);
341 fvdw
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_add_pd(fvdw6
,fvdw12
),_mm_mul_pd(vftabscale
,rinv00
)));
343 /* Update potential sum for this i atom from the interaction with this j atom. */
344 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
345 velecsum
= _mm_add_pd(velecsum
,velec
);
346 vvdw
= _mm_unpacklo_pd(vvdw
,_mm_setzero_pd());
347 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
349 fscal
= _mm_add_pd(felec
,fvdw
);
351 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
353 /* Calculate temporary vectorial force */
354 tx
= _mm_mul_pd(fscal
,dx00
);
355 ty
= _mm_mul_pd(fscal
,dy00
);
356 tz
= _mm_mul_pd(fscal
,dz00
);
358 /* Update vectorial force */
359 fix0
= _mm_add_pd(fix0
,tx
);
360 fiy0
= _mm_add_pd(fiy0
,ty
);
361 fiz0
= _mm_add_pd(fiz0
,tz
);
363 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,tx
,ty
,tz
);
365 /* Inner loop uses 73 flops */
368 /* End of innermost loop */
370 gmx_mm_update_iforce_1atom_swizzle_pd(fix0
,fiy0
,fiz0
,
371 f
+i_coord_offset
,fshift
+i_shift_offset
);
374 /* Update potential energies */
375 gmx_mm_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
376 gmx_mm_update_1pot_pd(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
378 /* Increment number of inner iterations */
379 inneriter
+= j_index_end
- j_index_start
;
381 /* Outer loop uses 9 flops */
384 /* Increment number of outer iterations */
387 /* Update outer/inner flops */
389 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_VF
,outeriter
*9 + inneriter
*73);
392 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_sse2_double
393 * Electrostatics interaction: CubicSplineTable
394 * VdW interaction: CubicSplineTable
395 * Geometry: Particle-Particle
396 * Calculate force/pot: Force
399 nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_sse2_double
400 (t_nblist
* gmx_restrict nlist
,
401 rvec
* gmx_restrict xx
,
402 rvec
* gmx_restrict ff
,
403 t_forcerec
* gmx_restrict fr
,
404 t_mdatoms
* gmx_restrict mdatoms
,
405 nb_kernel_data_t
* gmx_restrict kernel_data
,
406 t_nrnb
* gmx_restrict nrnb
)
408 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
409 * just 0 for non-waters.
410 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
411 * jnr indices corresponding to data put in the four positions in the SIMD register.
413 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
414 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
416 int j_coord_offsetA
,j_coord_offsetB
;
417 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
419 real
*shiftvec
,*fshift
,*x
,*f
;
420 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
422 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
423 int vdwjidx0A
,vdwjidx0B
;
424 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
425 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
426 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
429 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
432 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
433 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
435 __m128i ifour
= _mm_set1_epi32(4);
436 __m128d rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
;
438 __m128d dummy_mask
,cutoff_mask
;
439 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
440 __m128d one
= _mm_set1_pd(1.0);
441 __m128d two
= _mm_set1_pd(2.0);
447 jindex
= nlist
->jindex
;
449 shiftidx
= nlist
->shift
;
451 shiftvec
= fr
->shift_vec
[0];
452 fshift
= fr
->fshift
[0];
453 facel
= _mm_set1_pd(fr
->epsfac
);
454 charge
= mdatoms
->chargeA
;
455 nvdwtype
= fr
->ntype
;
457 vdwtype
= mdatoms
->typeA
;
459 vftab
= kernel_data
->table_elec_vdw
->data
;
460 vftabscale
= _mm_set1_pd(kernel_data
->table_elec_vdw
->scale
);
462 /* Avoid stupid compiler warnings */
470 /* Start outer loop over neighborlists */
471 for(iidx
=0; iidx
<nri
; iidx
++)
473 /* Load shift vector for this list */
474 i_shift_offset
= DIM
*shiftidx
[iidx
];
476 /* Load limits for loop over neighbors */
477 j_index_start
= jindex
[iidx
];
478 j_index_end
= jindex
[iidx
+1];
480 /* Get outer coordinate index */
482 i_coord_offset
= DIM
*inr
;
484 /* Load i particle coords and add shift vector */
485 gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,&ix0
,&iy0
,&iz0
);
487 fix0
= _mm_setzero_pd();
488 fiy0
= _mm_setzero_pd();
489 fiz0
= _mm_setzero_pd();
491 /* Load parameters for i particles */
492 iq0
= _mm_mul_pd(facel
,_mm_load1_pd(charge
+inr
+0));
493 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
495 /* Start inner kernel loop */
496 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
499 /* Get j neighbor index, and coordinate index */
502 j_coord_offsetA
= DIM
*jnrA
;
503 j_coord_offsetB
= DIM
*jnrB
;
505 /* load j atom coordinates */
506 gmx_mm_load_1rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
509 /* Calculate displacement vector */
510 dx00
= _mm_sub_pd(ix0
,jx0
);
511 dy00
= _mm_sub_pd(iy0
,jy0
);
512 dz00
= _mm_sub_pd(iz0
,jz0
);
514 /* Calculate squared distance and things based on it */
515 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
517 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
519 /* Load parameters for j particles */
520 jq0
= gmx_mm_load_2real_swizzle_pd(charge
+jnrA
+0,charge
+jnrB
+0);
521 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
522 vdwjidx0B
= 2*vdwtype
[jnrB
+0];
524 /**************************
525 * CALCULATE INTERACTIONS *
526 **************************/
528 r00
= _mm_mul_pd(rsq00
,rinv00
);
530 /* Compute parameters for interactions between i and j atoms */
531 qq00
= _mm_mul_pd(iq0
,jq0
);
532 gmx_mm_load_2pair_swizzle_pd(vdwparam
+vdwioffset0
+vdwjidx0A
,
533 vdwparam
+vdwioffset0
+vdwjidx0B
,&c6_00
,&c12_00
);
535 /* Calculate table index by multiplying r with table scale and truncate to integer */
536 rt
= _mm_mul_pd(r00
,vftabscale
);
537 vfitab
= _mm_cvttpd_epi32(rt
);
538 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
539 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
541 /* CUBIC SPLINE TABLE ELECTROSTATICS */
542 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
543 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
544 GMX_MM_TRANSPOSE2_PD(Y
,F
);
545 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
546 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
547 GMX_MM_TRANSPOSE2_PD(G
,H
);
548 Heps
= _mm_mul_pd(vfeps
,H
);
549 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
550 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
551 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq00
,FF
),_mm_mul_pd(vftabscale
,rinv00
)));
553 /* CUBIC SPLINE TABLE DISPERSION */
554 vfitab
= _mm_add_epi32(vfitab
,ifour
);
555 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
556 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
557 GMX_MM_TRANSPOSE2_PD(Y
,F
);
558 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
559 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
560 GMX_MM_TRANSPOSE2_PD(G
,H
);
561 Heps
= _mm_mul_pd(vfeps
,H
);
562 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
563 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
564 fvdw6
= _mm_mul_pd(c6_00
,FF
);
566 /* CUBIC SPLINE TABLE REPULSION */
567 vfitab
= _mm_add_epi32(vfitab
,ifour
);
568 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
569 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
570 GMX_MM_TRANSPOSE2_PD(Y
,F
);
571 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
572 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
573 GMX_MM_TRANSPOSE2_PD(G
,H
);
574 Heps
= _mm_mul_pd(vfeps
,H
);
575 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
576 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
577 fvdw12
= _mm_mul_pd(c12_00
,FF
);
578 fvdw
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_add_pd(fvdw6
,fvdw12
),_mm_mul_pd(vftabscale
,rinv00
)));
580 fscal
= _mm_add_pd(felec
,fvdw
);
582 /* Calculate temporary vectorial force */
583 tx
= _mm_mul_pd(fscal
,dx00
);
584 ty
= _mm_mul_pd(fscal
,dy00
);
585 tz
= _mm_mul_pd(fscal
,dz00
);
587 /* Update vectorial force */
588 fix0
= _mm_add_pd(fix0
,tx
);
589 fiy0
= _mm_add_pd(fiy0
,ty
);
590 fiz0
= _mm_add_pd(fiz0
,tz
);
592 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,tx
,ty
,tz
);
594 /* Inner loop uses 61 flops */
601 j_coord_offsetA
= DIM
*jnrA
;
603 /* load j atom coordinates */
604 gmx_mm_load_1rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
607 /* Calculate displacement vector */
608 dx00
= _mm_sub_pd(ix0
,jx0
);
609 dy00
= _mm_sub_pd(iy0
,jy0
);
610 dz00
= _mm_sub_pd(iz0
,jz0
);
612 /* Calculate squared distance and things based on it */
613 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
615 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
617 /* Load parameters for j particles */
618 jq0
= _mm_load_sd(charge
+jnrA
+0);
619 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
621 /**************************
622 * CALCULATE INTERACTIONS *
623 **************************/
625 r00
= _mm_mul_pd(rsq00
,rinv00
);
627 /* Compute parameters for interactions between i and j atoms */
628 qq00
= _mm_mul_pd(iq0
,jq0
);
629 gmx_mm_load_1pair_swizzle_pd(vdwparam
+vdwioffset0
+vdwjidx0A
,&c6_00
,&c12_00
);
631 /* Calculate table index by multiplying r with table scale and truncate to integer */
632 rt
= _mm_mul_pd(r00
,vftabscale
);
633 vfitab
= _mm_cvttpd_epi32(rt
);
634 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
635 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
637 /* CUBIC SPLINE TABLE ELECTROSTATICS */
638 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
639 F
= _mm_setzero_pd();
640 GMX_MM_TRANSPOSE2_PD(Y
,F
);
641 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
642 H
= _mm_setzero_pd();
643 GMX_MM_TRANSPOSE2_PD(G
,H
);
644 Heps
= _mm_mul_pd(vfeps
,H
);
645 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
646 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
647 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq00
,FF
),_mm_mul_pd(vftabscale
,rinv00
)));
649 /* CUBIC SPLINE TABLE DISPERSION */
650 vfitab
= _mm_add_epi32(vfitab
,ifour
);
651 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
652 F
= _mm_setzero_pd();
653 GMX_MM_TRANSPOSE2_PD(Y
,F
);
654 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
655 H
= _mm_setzero_pd();
656 GMX_MM_TRANSPOSE2_PD(G
,H
);
657 Heps
= _mm_mul_pd(vfeps
,H
);
658 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
659 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
660 fvdw6
= _mm_mul_pd(c6_00
,FF
);
662 /* CUBIC SPLINE TABLE REPULSION */
663 vfitab
= _mm_add_epi32(vfitab
,ifour
);
664 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
665 F
= _mm_setzero_pd();
666 GMX_MM_TRANSPOSE2_PD(Y
,F
);
667 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
668 H
= _mm_setzero_pd();
669 GMX_MM_TRANSPOSE2_PD(G
,H
);
670 Heps
= _mm_mul_pd(vfeps
,H
);
671 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
672 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
673 fvdw12
= _mm_mul_pd(c12_00
,FF
);
674 fvdw
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_add_pd(fvdw6
,fvdw12
),_mm_mul_pd(vftabscale
,rinv00
)));
676 fscal
= _mm_add_pd(felec
,fvdw
);
678 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
680 /* Calculate temporary vectorial force */
681 tx
= _mm_mul_pd(fscal
,dx00
);
682 ty
= _mm_mul_pd(fscal
,dy00
);
683 tz
= _mm_mul_pd(fscal
,dz00
);
685 /* Update vectorial force */
686 fix0
= _mm_add_pd(fix0
,tx
);
687 fiy0
= _mm_add_pd(fiy0
,ty
);
688 fiz0
= _mm_add_pd(fiz0
,tz
);
690 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,tx
,ty
,tz
);
692 /* Inner loop uses 61 flops */
695 /* End of innermost loop */
697 gmx_mm_update_iforce_1atom_swizzle_pd(fix0
,fiy0
,fiz0
,
698 f
+i_coord_offset
,fshift
+i_shift_offset
);
700 /* Increment number of inner iterations */
701 inneriter
+= j_index_end
- j_index_start
;
703 /* Outer loop uses 7 flops */
706 /* Increment number of outer iterations */
709 /* Update outer/inner flops */
711 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_F
,outeriter
*7 + inneriter
*61);