2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse4_1_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_sse4_1_double.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_sse4_1_double
51 * Electrostatics interaction: Ewald
52 * VdW interaction: CubicSplineTable
53 * Geometry: Particle-Particle
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_sse4_1_double
58 (t_nblist
* gmx_restrict nlist
,
59 rvec
* gmx_restrict xx
,
60 rvec
* gmx_restrict ff
,
61 struct t_forcerec
* gmx_restrict fr
,
62 t_mdatoms
* gmx_restrict mdatoms
,
63 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
64 t_nrnb
* gmx_restrict nrnb
)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
72 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
74 int j_coord_offsetA
,j_coord_offsetB
;
75 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
77 real
*shiftvec
,*fshift
,*x
,*f
;
78 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
80 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
81 int vdwjidx0A
,vdwjidx0B
;
82 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
83 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
84 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
87 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
90 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
91 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
93 __m128i ifour
= _mm_set1_epi32(4);
94 __m128d rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
;
97 __m128d ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
99 __m128d dummy_mask
,cutoff_mask
;
100 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
101 __m128d one
= _mm_set1_pd(1.0);
102 __m128d two
= _mm_set1_pd(2.0);
108 jindex
= nlist
->jindex
;
110 shiftidx
= nlist
->shift
;
112 shiftvec
= fr
->shift_vec
[0];
113 fshift
= fr
->fshift
[0];
114 facel
= _mm_set1_pd(fr
->ic
->epsfac
);
115 charge
= mdatoms
->chargeA
;
116 nvdwtype
= fr
->ntype
;
118 vdwtype
= mdatoms
->typeA
;
120 vftab
= kernel_data
->table_vdw
->data
;
121 vftabscale
= _mm_set1_pd(kernel_data
->table_vdw
->scale
);
123 sh_ewald
= _mm_set1_pd(fr
->ic
->sh_ewald
);
124 ewtab
= fr
->ic
->tabq_coul_FDV0
;
125 ewtabscale
= _mm_set1_pd(fr
->ic
->tabq_scale
);
126 ewtabhalfspace
= _mm_set1_pd(0.5/fr
->ic
->tabq_scale
);
128 /* Avoid stupid compiler warnings */
136 /* Start outer loop over neighborlists */
137 for(iidx
=0; iidx
<nri
; iidx
++)
139 /* Load shift vector for this list */
140 i_shift_offset
= DIM
*shiftidx
[iidx
];
142 /* Load limits for loop over neighbors */
143 j_index_start
= jindex
[iidx
];
144 j_index_end
= jindex
[iidx
+1];
146 /* Get outer coordinate index */
148 i_coord_offset
= DIM
*inr
;
150 /* Load i particle coords and add shift vector */
151 gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,&ix0
,&iy0
,&iz0
);
153 fix0
= _mm_setzero_pd();
154 fiy0
= _mm_setzero_pd();
155 fiz0
= _mm_setzero_pd();
157 /* Load parameters for i particles */
158 iq0
= _mm_mul_pd(facel
,_mm_load1_pd(charge
+inr
+0));
159 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
161 /* Reset potential sums */
162 velecsum
= _mm_setzero_pd();
163 vvdwsum
= _mm_setzero_pd();
165 /* Start inner kernel loop */
166 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
169 /* Get j neighbor index, and coordinate index */
172 j_coord_offsetA
= DIM
*jnrA
;
173 j_coord_offsetB
= DIM
*jnrB
;
175 /* load j atom coordinates */
176 gmx_mm_load_1rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
179 /* Calculate displacement vector */
180 dx00
= _mm_sub_pd(ix0
,jx0
);
181 dy00
= _mm_sub_pd(iy0
,jy0
);
182 dz00
= _mm_sub_pd(iz0
,jz0
);
184 /* Calculate squared distance and things based on it */
185 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
187 rinv00
= sse41_invsqrt_d(rsq00
);
189 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
191 /* Load parameters for j particles */
192 jq0
= gmx_mm_load_2real_swizzle_pd(charge
+jnrA
+0,charge
+jnrB
+0);
193 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
194 vdwjidx0B
= 2*vdwtype
[jnrB
+0];
196 /**************************
197 * CALCULATE INTERACTIONS *
198 **************************/
200 r00
= _mm_mul_pd(rsq00
,rinv00
);
202 /* Compute parameters for interactions between i and j atoms */
203 qq00
= _mm_mul_pd(iq0
,jq0
);
204 gmx_mm_load_2pair_swizzle_pd(vdwparam
+vdwioffset0
+vdwjidx0A
,
205 vdwparam
+vdwioffset0
+vdwjidx0B
,&c6_00
,&c12_00
);
207 /* Calculate table index by multiplying r with table scale and truncate to integer */
208 rt
= _mm_mul_pd(r00
,vftabscale
);
209 vfitab
= _mm_cvttpd_epi32(rt
);
210 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
211 vfitab
= _mm_slli_epi32(vfitab
,3);
213 /* EWALD ELECTROSTATICS */
215 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
216 ewrt
= _mm_mul_pd(r00
,ewtabscale
);
217 ewitab
= _mm_cvttpd_epi32(ewrt
);
218 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
219 ewitab
= _mm_slli_epi32(ewitab
,2);
220 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
221 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
222 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
223 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
224 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
225 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
226 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
227 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
228 velec
= _mm_mul_pd(qq00
,_mm_sub_pd(rinv00
,velec
));
229 felec
= _mm_mul_pd(_mm_mul_pd(qq00
,rinv00
),_mm_sub_pd(rinvsq00
,felec
));
231 /* CUBIC SPLINE TABLE DISPERSION */
232 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
233 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
234 GMX_MM_TRANSPOSE2_PD(Y
,F
);
235 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
236 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
237 GMX_MM_TRANSPOSE2_PD(G
,H
);
238 Heps
= _mm_mul_pd(vfeps
,H
);
239 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
240 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
241 vvdw6
= _mm_mul_pd(c6_00
,VV
);
242 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
243 fvdw6
= _mm_mul_pd(c6_00
,FF
);
245 /* CUBIC SPLINE TABLE REPULSION */
246 vfitab
= _mm_add_epi32(vfitab
,ifour
);
247 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
248 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
249 GMX_MM_TRANSPOSE2_PD(Y
,F
);
250 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
251 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
252 GMX_MM_TRANSPOSE2_PD(G
,H
);
253 Heps
= _mm_mul_pd(vfeps
,H
);
254 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
255 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
256 vvdw12
= _mm_mul_pd(c12_00
,VV
);
257 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
258 fvdw12
= _mm_mul_pd(c12_00
,FF
);
259 vvdw
= _mm_add_pd(vvdw12
,vvdw6
);
260 fvdw
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_add_pd(fvdw6
,fvdw12
),_mm_mul_pd(vftabscale
,rinv00
)));
262 /* Update potential sum for this i atom from the interaction with this j atom. */
263 velecsum
= _mm_add_pd(velecsum
,velec
);
264 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
266 fscal
= _mm_add_pd(felec
,fvdw
);
268 /* Calculate temporary vectorial force */
269 tx
= _mm_mul_pd(fscal
,dx00
);
270 ty
= _mm_mul_pd(fscal
,dy00
);
271 tz
= _mm_mul_pd(fscal
,dz00
);
273 /* Update vectorial force */
274 fix0
= _mm_add_pd(fix0
,tx
);
275 fiy0
= _mm_add_pd(fiy0
,ty
);
276 fiz0
= _mm_add_pd(fiz0
,tz
);
278 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,tx
,ty
,tz
);
280 /* Inner loop uses 75 flops */
287 j_coord_offsetA
= DIM
*jnrA
;
289 /* load j atom coordinates */
290 gmx_mm_load_1rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
293 /* Calculate displacement vector */
294 dx00
= _mm_sub_pd(ix0
,jx0
);
295 dy00
= _mm_sub_pd(iy0
,jy0
);
296 dz00
= _mm_sub_pd(iz0
,jz0
);
298 /* Calculate squared distance and things based on it */
299 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
301 rinv00
= sse41_invsqrt_d(rsq00
);
303 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
305 /* Load parameters for j particles */
306 jq0
= _mm_load_sd(charge
+jnrA
+0);
307 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
309 /**************************
310 * CALCULATE INTERACTIONS *
311 **************************/
313 r00
= _mm_mul_pd(rsq00
,rinv00
);
315 /* Compute parameters for interactions between i and j atoms */
316 qq00
= _mm_mul_pd(iq0
,jq0
);
317 gmx_mm_load_1pair_swizzle_pd(vdwparam
+vdwioffset0
+vdwjidx0A
,&c6_00
,&c12_00
);
319 /* Calculate table index by multiplying r with table scale and truncate to integer */
320 rt
= _mm_mul_pd(r00
,vftabscale
);
321 vfitab
= _mm_cvttpd_epi32(rt
);
322 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
323 vfitab
= _mm_slli_epi32(vfitab
,3);
325 /* EWALD ELECTROSTATICS */
327 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
328 ewrt
= _mm_mul_pd(r00
,ewtabscale
);
329 ewitab
= _mm_cvttpd_epi32(ewrt
);
330 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
331 ewitab
= _mm_slli_epi32(ewitab
,2);
332 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
333 ewtabD
= _mm_setzero_pd();
334 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
335 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
336 ewtabFn
= _mm_setzero_pd();
337 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
338 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
339 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
340 velec
= _mm_mul_pd(qq00
,_mm_sub_pd(rinv00
,velec
));
341 felec
= _mm_mul_pd(_mm_mul_pd(qq00
,rinv00
),_mm_sub_pd(rinvsq00
,felec
));
343 /* CUBIC SPLINE TABLE DISPERSION */
344 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
345 F
= _mm_setzero_pd();
346 GMX_MM_TRANSPOSE2_PD(Y
,F
);
347 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
348 H
= _mm_setzero_pd();
349 GMX_MM_TRANSPOSE2_PD(G
,H
);
350 Heps
= _mm_mul_pd(vfeps
,H
);
351 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
352 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
353 vvdw6
= _mm_mul_pd(c6_00
,VV
);
354 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
355 fvdw6
= _mm_mul_pd(c6_00
,FF
);
357 /* CUBIC SPLINE TABLE REPULSION */
358 vfitab
= _mm_add_epi32(vfitab
,ifour
);
359 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
360 F
= _mm_setzero_pd();
361 GMX_MM_TRANSPOSE2_PD(Y
,F
);
362 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
363 H
= _mm_setzero_pd();
364 GMX_MM_TRANSPOSE2_PD(G
,H
);
365 Heps
= _mm_mul_pd(vfeps
,H
);
366 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
367 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
368 vvdw12
= _mm_mul_pd(c12_00
,VV
);
369 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
370 fvdw12
= _mm_mul_pd(c12_00
,FF
);
371 vvdw
= _mm_add_pd(vvdw12
,vvdw6
);
372 fvdw
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_add_pd(fvdw6
,fvdw12
),_mm_mul_pd(vftabscale
,rinv00
)));
374 /* Update potential sum for this i atom from the interaction with this j atom. */
375 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
376 velecsum
= _mm_add_pd(velecsum
,velec
);
377 vvdw
= _mm_unpacklo_pd(vvdw
,_mm_setzero_pd());
378 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
380 fscal
= _mm_add_pd(felec
,fvdw
);
382 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
384 /* Calculate temporary vectorial force */
385 tx
= _mm_mul_pd(fscal
,dx00
);
386 ty
= _mm_mul_pd(fscal
,dy00
);
387 tz
= _mm_mul_pd(fscal
,dz00
);
389 /* Update vectorial force */
390 fix0
= _mm_add_pd(fix0
,tx
);
391 fiy0
= _mm_add_pd(fiy0
,ty
);
392 fiz0
= _mm_add_pd(fiz0
,tz
);
394 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,tx
,ty
,tz
);
396 /* Inner loop uses 75 flops */
399 /* End of innermost loop */
401 gmx_mm_update_iforce_1atom_swizzle_pd(fix0
,fiy0
,fiz0
,
402 f
+i_coord_offset
,fshift
+i_shift_offset
);
405 /* Update potential energies */
406 gmx_mm_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
407 gmx_mm_update_1pot_pd(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
409 /* Increment number of inner iterations */
410 inneriter
+= j_index_end
- j_index_start
;
412 /* Outer loop uses 9 flops */
415 /* Increment number of outer iterations */
418 /* Update outer/inner flops */
420 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_VF
,outeriter
*9 + inneriter
*75);
423 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_sse4_1_double
424 * Electrostatics interaction: Ewald
425 * VdW interaction: CubicSplineTable
426 * Geometry: Particle-Particle
427 * Calculate force/pot: Force
430 nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_sse4_1_double
431 (t_nblist
* gmx_restrict nlist
,
432 rvec
* gmx_restrict xx
,
433 rvec
* gmx_restrict ff
,
434 struct t_forcerec
* gmx_restrict fr
,
435 t_mdatoms
* gmx_restrict mdatoms
,
436 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
437 t_nrnb
* gmx_restrict nrnb
)
439 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
440 * just 0 for non-waters.
441 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
442 * jnr indices corresponding to data put in the four positions in the SIMD register.
444 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
445 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
447 int j_coord_offsetA
,j_coord_offsetB
;
448 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
450 real
*shiftvec
,*fshift
,*x
,*f
;
451 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
453 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
454 int vdwjidx0A
,vdwjidx0B
;
455 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
456 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
457 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
460 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
463 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
464 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
466 __m128i ifour
= _mm_set1_epi32(4);
467 __m128d rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
;
470 __m128d ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
472 __m128d dummy_mask
,cutoff_mask
;
473 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
474 __m128d one
= _mm_set1_pd(1.0);
475 __m128d two
= _mm_set1_pd(2.0);
481 jindex
= nlist
->jindex
;
483 shiftidx
= nlist
->shift
;
485 shiftvec
= fr
->shift_vec
[0];
486 fshift
= fr
->fshift
[0];
487 facel
= _mm_set1_pd(fr
->ic
->epsfac
);
488 charge
= mdatoms
->chargeA
;
489 nvdwtype
= fr
->ntype
;
491 vdwtype
= mdatoms
->typeA
;
493 vftab
= kernel_data
->table_vdw
->data
;
494 vftabscale
= _mm_set1_pd(kernel_data
->table_vdw
->scale
);
496 sh_ewald
= _mm_set1_pd(fr
->ic
->sh_ewald
);
497 ewtab
= fr
->ic
->tabq_coul_F
;
498 ewtabscale
= _mm_set1_pd(fr
->ic
->tabq_scale
);
499 ewtabhalfspace
= _mm_set1_pd(0.5/fr
->ic
->tabq_scale
);
501 /* Avoid stupid compiler warnings */
509 /* Start outer loop over neighborlists */
510 for(iidx
=0; iidx
<nri
; iidx
++)
512 /* Load shift vector for this list */
513 i_shift_offset
= DIM
*shiftidx
[iidx
];
515 /* Load limits for loop over neighbors */
516 j_index_start
= jindex
[iidx
];
517 j_index_end
= jindex
[iidx
+1];
519 /* Get outer coordinate index */
521 i_coord_offset
= DIM
*inr
;
523 /* Load i particle coords and add shift vector */
524 gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,&ix0
,&iy0
,&iz0
);
526 fix0
= _mm_setzero_pd();
527 fiy0
= _mm_setzero_pd();
528 fiz0
= _mm_setzero_pd();
530 /* Load parameters for i particles */
531 iq0
= _mm_mul_pd(facel
,_mm_load1_pd(charge
+inr
+0));
532 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
534 /* Start inner kernel loop */
535 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
538 /* Get j neighbor index, and coordinate index */
541 j_coord_offsetA
= DIM
*jnrA
;
542 j_coord_offsetB
= DIM
*jnrB
;
544 /* load j atom coordinates */
545 gmx_mm_load_1rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
548 /* Calculate displacement vector */
549 dx00
= _mm_sub_pd(ix0
,jx0
);
550 dy00
= _mm_sub_pd(iy0
,jy0
);
551 dz00
= _mm_sub_pd(iz0
,jz0
);
553 /* Calculate squared distance and things based on it */
554 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
556 rinv00
= sse41_invsqrt_d(rsq00
);
558 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
560 /* Load parameters for j particles */
561 jq0
= gmx_mm_load_2real_swizzle_pd(charge
+jnrA
+0,charge
+jnrB
+0);
562 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
563 vdwjidx0B
= 2*vdwtype
[jnrB
+0];
565 /**************************
566 * CALCULATE INTERACTIONS *
567 **************************/
569 r00
= _mm_mul_pd(rsq00
,rinv00
);
571 /* Compute parameters for interactions between i and j atoms */
572 qq00
= _mm_mul_pd(iq0
,jq0
);
573 gmx_mm_load_2pair_swizzle_pd(vdwparam
+vdwioffset0
+vdwjidx0A
,
574 vdwparam
+vdwioffset0
+vdwjidx0B
,&c6_00
,&c12_00
);
576 /* Calculate table index by multiplying r with table scale and truncate to integer */
577 rt
= _mm_mul_pd(r00
,vftabscale
);
578 vfitab
= _mm_cvttpd_epi32(rt
);
579 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
580 vfitab
= _mm_slli_epi32(vfitab
,3);
582 /* EWALD ELECTROSTATICS */
584 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
585 ewrt
= _mm_mul_pd(r00
,ewtabscale
);
586 ewitab
= _mm_cvttpd_epi32(ewrt
);
587 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
588 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
590 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
591 felec
= _mm_mul_pd(_mm_mul_pd(qq00
,rinv00
),_mm_sub_pd(rinvsq00
,felec
));
593 /* CUBIC SPLINE TABLE DISPERSION */
594 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
595 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
596 GMX_MM_TRANSPOSE2_PD(Y
,F
);
597 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
598 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
599 GMX_MM_TRANSPOSE2_PD(G
,H
);
600 Heps
= _mm_mul_pd(vfeps
,H
);
601 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
602 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
603 fvdw6
= _mm_mul_pd(c6_00
,FF
);
605 /* CUBIC SPLINE TABLE REPULSION */
606 vfitab
= _mm_add_epi32(vfitab
,ifour
);
607 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
608 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
609 GMX_MM_TRANSPOSE2_PD(Y
,F
);
610 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
611 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
612 GMX_MM_TRANSPOSE2_PD(G
,H
);
613 Heps
= _mm_mul_pd(vfeps
,H
);
614 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
615 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
616 fvdw12
= _mm_mul_pd(c12_00
,FF
);
617 fvdw
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_add_pd(fvdw6
,fvdw12
),_mm_mul_pd(vftabscale
,rinv00
)));
619 fscal
= _mm_add_pd(felec
,fvdw
);
621 /* Calculate temporary vectorial force */
622 tx
= _mm_mul_pd(fscal
,dx00
);
623 ty
= _mm_mul_pd(fscal
,dy00
);
624 tz
= _mm_mul_pd(fscal
,dz00
);
626 /* Update vectorial force */
627 fix0
= _mm_add_pd(fix0
,tx
);
628 fiy0
= _mm_add_pd(fiy0
,ty
);
629 fiz0
= _mm_add_pd(fiz0
,tz
);
631 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,tx
,ty
,tz
);
633 /* Inner loop uses 62 flops */
640 j_coord_offsetA
= DIM
*jnrA
;
642 /* load j atom coordinates */
643 gmx_mm_load_1rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
646 /* Calculate displacement vector */
647 dx00
= _mm_sub_pd(ix0
,jx0
);
648 dy00
= _mm_sub_pd(iy0
,jy0
);
649 dz00
= _mm_sub_pd(iz0
,jz0
);
651 /* Calculate squared distance and things based on it */
652 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
654 rinv00
= sse41_invsqrt_d(rsq00
);
656 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
658 /* Load parameters for j particles */
659 jq0
= _mm_load_sd(charge
+jnrA
+0);
660 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
662 /**************************
663 * CALCULATE INTERACTIONS *
664 **************************/
666 r00
= _mm_mul_pd(rsq00
,rinv00
);
668 /* Compute parameters for interactions between i and j atoms */
669 qq00
= _mm_mul_pd(iq0
,jq0
);
670 gmx_mm_load_1pair_swizzle_pd(vdwparam
+vdwioffset0
+vdwjidx0A
,&c6_00
,&c12_00
);
672 /* Calculate table index by multiplying r with table scale and truncate to integer */
673 rt
= _mm_mul_pd(r00
,vftabscale
);
674 vfitab
= _mm_cvttpd_epi32(rt
);
675 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
676 vfitab
= _mm_slli_epi32(vfitab
,3);
678 /* EWALD ELECTROSTATICS */
680 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
681 ewrt
= _mm_mul_pd(r00
,ewtabscale
);
682 ewitab
= _mm_cvttpd_epi32(ewrt
);
683 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
684 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
685 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
686 felec
= _mm_mul_pd(_mm_mul_pd(qq00
,rinv00
),_mm_sub_pd(rinvsq00
,felec
));
688 /* CUBIC SPLINE TABLE DISPERSION */
689 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
690 F
= _mm_setzero_pd();
691 GMX_MM_TRANSPOSE2_PD(Y
,F
);
692 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
693 H
= _mm_setzero_pd();
694 GMX_MM_TRANSPOSE2_PD(G
,H
);
695 Heps
= _mm_mul_pd(vfeps
,H
);
696 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
697 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
698 fvdw6
= _mm_mul_pd(c6_00
,FF
);
700 /* CUBIC SPLINE TABLE REPULSION */
701 vfitab
= _mm_add_epi32(vfitab
,ifour
);
702 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
703 F
= _mm_setzero_pd();
704 GMX_MM_TRANSPOSE2_PD(Y
,F
);
705 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
706 H
= _mm_setzero_pd();
707 GMX_MM_TRANSPOSE2_PD(G
,H
);
708 Heps
= _mm_mul_pd(vfeps
,H
);
709 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
710 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
711 fvdw12
= _mm_mul_pd(c12_00
,FF
);
712 fvdw
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_add_pd(fvdw6
,fvdw12
),_mm_mul_pd(vftabscale
,rinv00
)));
714 fscal
= _mm_add_pd(felec
,fvdw
);
716 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
718 /* Calculate temporary vectorial force */
719 tx
= _mm_mul_pd(fscal
,dx00
);
720 ty
= _mm_mul_pd(fscal
,dy00
);
721 tz
= _mm_mul_pd(fscal
,dz00
);
723 /* Update vectorial force */
724 fix0
= _mm_add_pd(fix0
,tx
);
725 fiy0
= _mm_add_pd(fiy0
,ty
);
726 fiz0
= _mm_add_pd(fiz0
,tz
);
728 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,tx
,ty
,tz
);
730 /* Inner loop uses 62 flops */
733 /* End of innermost loop */
735 gmx_mm_update_iforce_1atom_swizzle_pd(fix0
,fiy0
,fiz0
,
736 f
+i_coord_offset
,fshift
+i_shift_offset
);
738 /* Increment number of inner iterations */
739 inneriter
+= j_index_end
- j_index_start
;
741 /* Outer loop uses 7 flops */
744 /* Increment number of outer iterations */
747 /* Update outer/inner flops */
749 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_F
,outeriter
*7 + inneriter
*62);