2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2016, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
37 #include "nbnxn_kernel_gpu_ref.h"
45 #include "gromacs/math/functions.h"
46 #include "gromacs/math/utilities.h"
47 #include "gromacs/math/vec.h"
48 #include "gromacs/mdlib/force.h"
49 #include "gromacs/mdlib/nb_verlet.h"
50 #include "gromacs/mdlib/nbnxn_consts.h"
51 #include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_common.h"
52 #include "gromacs/mdtypes/md_enums.h"
53 #include "gromacs/pbcutil/ishift.h"
54 #include "gromacs/utility/fatalerror.h"
56 static const int c_numClPerSupercl
= c_nbnxnGpuNumClusterPerSupercluster
;
57 static const int c_clSize
= c_nbnxnGpuClusterSize
;
60 nbnxn_kernel_gpu_ref(const nbnxn_pairlist_t
*nbl
,
61 const nbnxn_atomdata_t
*nbat
,
62 const interaction_const_t
*iconst
,
71 const nbnxn_sci_t
*nbln
;
75 const real
*Ftab
= NULL
;
76 real rcut2
, rvdw2
, rlist2
;
82 int cj4_ind0
, cj4_ind1
, cj4_ind
;
84 int ic
, jc
, ia
, ja
, is
, ifs
, js
, jfs
, im
, jm
;
88 real fscal
, tx
, ty
, tz
;
91 real qq
, vcoul
= 0, krsq
, vctot
;
97 real Vvdw_rep
, Vvdw_disp
;
98 real ix
, iy
, iz
, fix
, fiy
, fiz
;
100 real dx
, dy
, dz
, rsq
, rinv
;
104 const real
* shiftvec
;
107 const nbnxn_excl_t
*excl
[2];
109 int npair_tot
, npair
;
110 int nhwu
, nhwu_pruned
;
112 if (nbl
->na_ci
!= c_clSize
)
114 gmx_fatal(FARGS
, "The neighborlist cluster size in the GPU reference kernel is %d, expected it to be %d", nbl
->na_ci
, c_clSize
);
117 if (clearF
== enbvClearFYes
)
122 bEner
= (force_flags
& GMX_FORCE_ENERGY
);
124 bEwald
= EEL_FULL(iconst
->eeltype
);
127 Ftab
= iconst
->tabq_coul_F
;
130 rcut2
= iconst
->rcoulomb
*iconst
->rcoulomb
;
131 rvdw2
= iconst
->rvdw
*iconst
->rvdw
;
133 rlist2
= nbl
->rlist
*nbl
->rlist
;
136 facel
= iconst
->epsfac
;
137 shiftvec
= shift_vec
[0];
138 vdwparam
= nbat
->nbfp
;
147 for (n
= 0; n
< nbl
->nsci
; n
++)
151 ish3
= 3*nbln
->shift
;
152 shX
= shiftvec
[ish3
];
153 shY
= shiftvec
[ish3
+1];
154 shZ
= shiftvec
[ish3
+2];
155 cj4_ind0
= nbln
->cj4_ind_start
;
156 cj4_ind1
= nbln
->cj4_ind_end
;
161 if (nbln
->shift
== CENTRAL
&&
162 nbl
->cj4
[cj4_ind0
].cj
[0] == sci
*c_numClPerSupercl
)
164 /* we have the diagonal:
165 * add the charge self interaction energy term
167 for (im
= 0; im
< c_numClPerSupercl
; im
++)
169 ci
= sci
*c_numClPerSupercl
+ im
;
170 for (ic
= 0; ic
< c_clSize
; ic
++)
172 ia
= ci
*c_clSize
+ ic
;
173 iq
= x
[ia
*nbat
->xstride
+3];
179 vctot
*= -facel
*0.5*iconst
->c_rf
;
183 /* last factor 1/sqrt(pi) */
184 vctot
*= -facel
*iconst
->ewaldcoeff_q
*M_1_SQRTPI
;
188 for (cj4_ind
= cj4_ind0
; (cj4_ind
< cj4_ind1
); cj4_ind
++)
190 excl
[0] = &nbl
->excl
[nbl
->cj4
[cj4_ind
].imei
[0].excl_ind
];
191 excl
[1] = &nbl
->excl
[nbl
->cj4
[cj4_ind
].imei
[1].excl_ind
];
193 for (jm
= 0; jm
< c_nbnxnGpuJgroupSize
; jm
++)
195 cj
= nbl
->cj4
[cj4_ind
].cj
[jm
];
197 for (im
= 0; im
< c_numClPerSupercl
; im
++)
199 /* We're only using the first imask,
200 * but here imei[1].imask is identical.
202 if ((nbl
->cj4
[cj4_ind
].imei
[0].imask
>> (jm
*c_numClPerSupercl
+ im
)) & 1)
204 gmx_bool within_rlist
;
206 ci
= sci
*c_numClPerSupercl
+ im
;
208 within_rlist
= FALSE
;
210 for (ic
= 0; ic
< c_clSize
; ic
++)
212 ia
= ci
*c_clSize
+ ic
;
214 is
= ia
*nbat
->xstride
;
215 ifs
= ia
*nbat
->fstride
;
220 nti
= ntype
*2*type
[ia
];
226 for (jc
= 0; jc
< c_clSize
; jc
++)
228 ja
= cj
*c_clSize
+ jc
;
230 if (nbln
->shift
== CENTRAL
&&
231 ci
== cj
&& ja
<= ia
)
236 int_bit
= ((excl
[jc
>> 2]->pair
[(jc
& 3)*c_clSize
+ ic
] >> (jm
*c_numClPerSupercl
+ im
)) & 1);
238 js
= ja
*nbat
->xstride
;
239 jfs
= ja
*nbat
->fstride
;
246 rsq
= dx
*dx
+ dy
*dy
+ dz
*dz
;
256 if (type
[ia
] != ntype
-1 && type
[ja
] != ntype
-1)
261 // Ensure distance do not become so small that r^-12 overflows
262 rsq
= std::max(rsq
, NBNXN_MIN_RSQ
);
264 rinv
= gmx::invsqrt(rsq
);
271 krsq
= iconst
->k_rf
*rsq
;
272 fscal
= qq
*(int_bit
*rinv
- 2*krsq
)*rinvsq
;
275 vcoul
= qq
*(int_bit
*rinv
+ krsq
- iconst
->c_rf
);
281 rt
= r
*iconst
->tabq_scale
;
285 fexcl
= (1 - eps
)*Ftab
[n0
] + eps
*Ftab
[n0
+1];
287 fscal
= qq
*(int_bit
*rinvsq
- fexcl
)*rinv
;
291 vcoul
= qq
*((int_bit
- std::erf(iconst
->ewaldcoeff_q
*r
))*rinv
- int_bit
*iconst
->sh_ewald
);
297 tj
= nti
+ 2*type
[ja
];
299 /* Vanilla Lennard-Jones cutoff */
301 c12
= vdwparam
[tj
+1];
303 rinvsix
= int_bit
*rinvsq
*rinvsq
*rinvsq
;
304 Vvdw_disp
= c6
*rinvsix
;
305 Vvdw_rep
= c12
*rinvsix
*rinvsix
;
306 fscal
+= (Vvdw_rep
- Vvdw_disp
)*rinvsq
;
313 (Vvdw_rep
- int_bit
*c12
*iconst
->sh_invrc6
*iconst
->sh_invrc6
)/12 -
314 (Vvdw_disp
- int_bit
*c6
*iconst
->sh_invrc6
)/6;
332 fshift
[ish3
] = fshift
[ish3
] + fix
;
333 fshift
[ish3
+1] = fshift
[ish3
+1] + fiy
;
334 fshift
[ish3
+2] = fshift
[ish3
+2] + fiz
;
336 /* Count in half work-units.
337 * In CUDA one work-unit is 2 warps.
339 if ((ic
+1) % (c_clSize
/c_nbnxnGpuClusterpairSplit
) == 0)
349 within_rlist
= FALSE
;
361 Vc
[ggid
] = Vc
[ggid
] + vctot
;
362 Vvdw
[ggid
] = Vvdw
[ggid
] + Vvdwtot
;
368 fprintf(debug
, "number of half %dx%d atom pairs: %d after pruning: %d fraction %4.2f\n",
369 nbl
->na_ci
, nbl
->na_ci
,
370 nhwu
, nhwu_pruned
, nhwu_pruned
/(double)nhwu
);
371 fprintf(debug
, "generic kernel pair interactions: %d\n",
372 nhwu
*nbl
->na_ci
/2*nbl
->na_ci
);
373 fprintf(debug
, "generic kernel post-prune pair interactions: %d\n",
374 nhwu_pruned
*nbl
->na_ci
/2*nbl
->na_ci
);
375 fprintf(debug
, "generic kernel non-zero pair interactions: %d\n",
377 fprintf(debug
, "ratio non-zero/post-prune pair interactions: %4.2f\n",
378 npair_tot
/(double)(nhwu_pruned
*nbl
->na_ci
/2*nbl
->na_ci
));