Pass gmx::ForceFlags to CPU nbnxm dispatch code
[gromacs.git] / src / gromacs / nbnxm / kernels_reference / kernel_gpu_ref.cpp
/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gmxpre.h"

#include "kernel_gpu_ref.h"

#include <cmath>

#include <algorithm>

#include "gromacs/math/functions.h"
#include "gromacs/math/utilities.h"
#include "gromacs/math/vec.h"
#include "gromacs/mdlib/ppforceworkload.h"
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/nbnxm/atomdata.h"
#include "gromacs/nbnxm/nbnxm.h"
#include "gromacs/nbnxm/pairlist.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/utility/fatalerror.h"
static const int c_numClPerSupercl = c_nbnxnGpuNumClusterPerSupercluster;
static const int c_clSize          = c_nbnxnGpuClusterSize;
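
/* This reference kernel follows the GPU pair-list layout: each i-entry is a
 * super-cluster of c_numClPerSupercl clusters of c_clSize atoms each, and the
 * j-clusters are stored in groups of c_nbnxnGpuJgroupSize (see pairlist.h).
 */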
void
nbnxn_kernel_gpu_ref(const NbnxnPairlistGpu    *nbl,
                     const nbnxn_atomdata_t    *nbat,
                     const interaction_const_t *iconst,
                     rvec                      *shift_vec,
                     const gmx::ForceFlags     &forceFlags,
                     int                        clearF,
                     gmx::ArrayRef<real>        f,
                     real                      *fshift,
                     real                      *Vc,
                     real                      *Vvdw)
{
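    /* Note: this kernel always computes forces and shift forces; the
     * forceFlags.computeEnergy flag only gates the Coulomb and VdW
     * energy accumulation below.
     */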
    gmx_bool            bEwald;
    const real         *Ftab = nullptr;
    real                rcut2, rvdw2, rlist2;
    int                 ntype;
    real                facel;
    int                 ish3;
    int                 sci;
    int                 cj4_ind0, cj4_ind1, cj4_ind;
    int                 ci, cj;
    int                 ic, jc, ia, ja, is, ifs, js, jfs, im, jm;
    int                 n0;
    int                 ggid;
    real                shX, shY, shZ;
    real                fscal, tx, ty, tz;
    real                rinvsq;
    real                iq;
    real                qq, vcoul = 0, krsq, vctot;
    int                 nti;
    int                 tj;
    real                rt, r, eps;
    real                rinvsix;
    real                Vvdwtot;
    real                Vvdw_rep, Vvdw_disp;
    real                ix, iy, iz, fix, fiy, fiz;
    real                jx, jy, jz;
    real                dx, dy, dz, rsq, rinv;
    real                int_bit;
    real                fexcl;
    real                c6, c12;
    const nbnxn_excl_t *excl[2];

    int                 npair_tot, npair;
    int                 nhwu, nhwu_pruned;
    if (nbl->na_ci != c_clSize)
    {
        gmx_fatal(FARGS, "The neighborlist cluster size in the GPU reference kernel is %d, expected it to be %d",
                  nbl->na_ci, c_clSize);
    }

    if (clearF == enbvClearFYes)
    {
        for (real &elem : f)
        {
            elem = 0;
        }
    }

    bEwald = EEL_FULL(iconst->eeltype);
    if (bEwald)
    {
        Ftab = iconst->coulombEwaldTables->tableF.data();
    }

    rcut2  = iconst->rcoulomb*iconst->rcoulomb;
    rvdw2  = iconst->rvdw*iconst->rvdw;

    rlist2 = nbl->rlist*nbl->rlist;

    const int  *type     = nbat->params().type.data();
    facel                = iconst->epsfac;
    const real *shiftvec = shift_vec[0];
    const real *vdwparam = nbat->params().nbfp.data();
    ntype                = nbat->params().numTypes;

    const real *x = nbat->x().data();

    npair_tot   = 0;
    nhwu        = 0;
    nhwu_pruned = 0;
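
    /* Loop over all i-super-cluster entries of the GPU-layout pair list */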
    for (const nbnxn_sci_t &nbln : nbl->sci)
    {
        ish3     = 3*nbln.shift;
        shX      = shiftvec[ish3];
        shY      = shiftvec[ish3+1];
        shZ      = shiftvec[ish3+2];
        cj4_ind0 = nbln.cj4_ind_start;
        cj4_ind1 = nbln.cj4_ind_end;
        sci      = nbln.sci;
        vctot    = 0;
        Vvdwtot  = 0;

        if (nbln.shift == CENTRAL &&
            nbl->cj4[cj4_ind0].cj[0] == sci*c_numClPerSupercl)
        {
            /* we have the diagonal:
             * add the charge self interaction energy term
             */
            for (im = 0; im < c_numClPerSupercl; im++)
            {
                ci = sci*c_numClPerSupercl + im;
                for (ic = 0; ic < c_clSize; ic++)
                {
                    ia     = ci*c_clSize + ic;
                    iq     = x[ia*nbat->xstride+3];
                    vctot += iq*iq;
                }
            }
            if (!bEwald)
            {
                vctot *= -facel*0.5*iconst->c_rf;
            }
            else
            {
                /* last factor 1/sqrt(pi) */
                vctot *= -facel*iconst->ewaldcoeff_q*M_1_SQRTPI;
            }
        }
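
        /* Loop over the groups of c_nbnxnGpuJgroupSize j-clusters listed for this i-super-cluster */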
        for (cj4_ind = cj4_ind0; (cj4_ind < cj4_ind1); cj4_ind++)
        {
            excl[0] = &nbl->excl[nbl->cj4[cj4_ind].imei[0].excl_ind];
            excl[1] = &nbl->excl[nbl->cj4[cj4_ind].imei[1].excl_ind];

            for (jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
            {
                cj = nbl->cj4[cj4_ind].cj[jm];

                for (im = 0; im < c_numClPerSupercl; im++)
                {
                    /* We're only using the first imask,
                     * but here imei[1].imask is identical.
                     */
                    if ((nbl->cj4[cj4_ind].imei[0].imask >> (jm*c_numClPerSupercl + im)) & 1)
                    {
                        gmx_bool within_rlist;

                        ci = sci*c_numClPerSupercl + im;

                        within_rlist = FALSE;
                        npair        = 0;
                        for (ic = 0; ic < c_clSize; ic++)
                        {
                            ia = ci*c_clSize + ic;

                            is  = ia*nbat->xstride;
                            ifs = ia*nbat->fstride;
                            ix  = shX + x[is+0];
                            iy  = shY + x[is+1];
                            iz  = shZ + x[is+2];
                            iq  = facel*x[is+3];
                            nti = ntype*2*type[ia];

                            fix = 0;
                            fiy = 0;
                            fiz = 0;

                            for (jc = 0; jc < c_clSize; jc++)
                            {
                                ja = cj*c_clSize + jc;

                                if (nbln.shift == CENTRAL &&
                                    ci == cj && ja <= ia)
                                {
                                    continue;
                                }

                                constexpr int clusterPerSplit = c_nbnxnGpuClusterSize/c_nbnxnGpuClusterpairSplit;
                                int_bit = static_cast<real>((excl[jc/clusterPerSplit]->pair[(jc & (clusterPerSplit - 1))*c_clSize + ic]
                                                             >> (jm*c_numClPerSupercl + im)) & 1);
                                js  = ja*nbat->xstride;
                                jfs = ja*nbat->fstride;
                                jx  = x[js+0];
                                jy  = x[js+1];
                                jz  = x[js+2];
                                dx  = ix - jx;
                                dy  = iy - jy;
                                dz  = iz - jz;
                                rsq = dx*dx + dy*dy + dz*dz;
                                if (rsq < rlist2)
                                {
                                    within_rlist = TRUE;
                                }
                                if (rsq >= rcut2)
                                {
                                    continue;
                                }

                                if (type[ia] != ntype-1 && type[ja] != ntype-1)
                                {
                                    npair++;
                                }

                                // Ensure the distance does not become so small that r^-12 overflows
                                rsq = std::max(rsq, NBNXN_MIN_RSQ);

                                rinv   = gmx::invsqrt(rsq);
                                rinvsq = rinv*rinv;

                                qq = iq*x[js+3];
                                if (!bEwald)
                                {
                                    /* Reaction-field */
                                    krsq  = iconst->k_rf*rsq;
                                    fscal = qq*(int_bit*rinv - 2*krsq)*rinvsq;
                                    if (forceFlags.computeEnergy)
                                    {
                                        vcoul = qq*(int_bit*rinv + krsq - iconst->c_rf);
                                    }
                                }
                                else
                                {
                                    r   = rsq*rinv;
                                    rt  = r*iconst->coulombEwaldTables->scale;
                                    n0  = static_cast<int>(rt);
                                    eps = rt - static_cast<real>(n0);

                                    fexcl = (1 - eps)*Ftab[n0] + eps*Ftab[n0+1];

                                    fscal = qq*(int_bit*rinvsq - fexcl)*rinv;

                                    if (forceFlags.computeEnergy)
                                    {
                                        vcoul = qq*((int_bit - std::erf(iconst->ewaldcoeff_q*r))*rinv - int_bit*iconst->sh_ewald);
                                    }
                                }

                                if (rsq < rvdw2)
                                {
                                    tj = nti + 2*type[ja];

                                    /* Vanilla Lennard-Jones cutoff */
                                    c6  = vdwparam[tj];
                                    c12 = vdwparam[tj+1];

                                    rinvsix   = int_bit*rinvsq*rinvsq*rinvsq;
                                    Vvdw_disp = c6*rinvsix;
                                    Vvdw_rep  = c12*rinvsix*rinvsix;
                                    fscal    += (Vvdw_rep - Vvdw_disp)*rinvsq;

                                    if (forceFlags.computeEnergy)
                                    {
                                        vctot   += vcoul;

                                        Vvdwtot +=
                                            (Vvdw_rep + int_bit*c12*iconst->repulsion_shift.cpot)/12 -
                                            (Vvdw_disp + int_bit*c6*iconst->dispersion_shift.cpot)/6;
                                    }
                                }

                                tx  = fscal*dx;
                                ty  = fscal*dy;
                                tz  = fscal*dz;
                                fix = fix + tx;
                                fiy = fiy + ty;
                                fiz = fiz + tz;
                                f[jfs+0] -= tx;
                                f[jfs+1] -= ty;
                                f[jfs+2] -= tz;
                            }

                            f[ifs+0]      += fix;
                            f[ifs+1]      += fiy;
                            f[ifs+2]      += fiz;
                            fshift[ish3]   = fshift[ish3]   + fix;
                            fshift[ish3+1] = fshift[ish3+1] + fiy;
                            fshift[ish3+2] = fshift[ish3+2] + fiz;
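
                            /* The counters updated below are statistics only (reported
                             * with debug output); they do not affect forces or energies.
                             */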
                            /* Count in half work-units.
                             * In CUDA one work-unit is 2 warps.
                             */
                            if ((ic+1) % (c_clSize/c_nbnxnGpuClusterpairSplit) == 0)
                            {
                                npair_tot += npair;

                                nhwu++;
                                if (within_rlist)
                                {
                                    nhwu_pruned++;
                                }

                                within_rlist = FALSE;
                                npair        = 0;
                            }
                        }
                    }
                }
            }
        }

        if (forceFlags.computeEnergy)
        {
            ggid       = 0;
            Vc[ggid]   = Vc[ggid]   + vctot;
            Vvdw[ggid] = Vvdw[ggid] + Vvdwtot;
        }
    }

    if (debug)
    {
        fprintf(debug, "number of half %dx%d atom pairs: %d after pruning: %d fraction %4.2f\n",
                nbl->na_ci, nbl->na_ci,
                nhwu, nhwu_pruned, nhwu_pruned/static_cast<double>(nhwu));
        fprintf(debug, "generic kernel pair interactions: %d\n",
                nhwu*nbl->na_ci/2*nbl->na_ci);
        fprintf(debug, "generic kernel post-prune pair interactions: %d\n",
                nhwu_pruned*nbl->na_ci/2*nbl->na_ci);
        fprintf(debug, "generic kernel non-zero pair interactions: %d\n",
                npair_tot);
        fprintf(debug, "ratio non-zero/post-prune pair interactions: %4.2f\n",
                npair_tot/static_cast<double>(nhwu_pruned*gmx::exactDiv(nbl->na_ci, 2)*nbl->na_ci));
    }
}