src/gromacs/mdlib/nbnxn_gpu_common.h
/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2017,2018, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 * \brief Common functions for the different NBNXN GPU implementations.
 *
 * \author Szilard Pall <pall.szilard@gmail.com>
 *
 * \ingroup module_mdlib
 */
#ifndef GMX_MDLIB_NBNXN_GPU_COMMON_H
#define GMX_MDLIB_NBNXN_GPU_COMMON_H

#include "config.h"

#include <string>

#if GMX_GPU == GMX_GPU_CUDA
#include "nbnxn_cuda/nbnxn_cuda_types.h"
#endif

#if GMX_GPU == GMX_GPU_OPENCL
#include "nbnxn_ocl/nbnxn_ocl_types.h"
#endif

#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/math/vec.h"
#include "gromacs/mdlib/force_flags.h"
#include "gromacs/mdlib/nb_verlet.h"
#include "gromacs/mdlib/nbnxn_gpu_types.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/stringutil.h"

#include "nbnxn_gpu_common_utils.h"
/*! \brief Check that atom locality values are valid for the GPU module.
 *
 * In the GPU module atom locality "all" is not supported, the local and
 * non-local ranges are treated separately.
 *
 * \param[in] atomLocality atom locality specifier
 */
static inline void validateGpuAtomLocality(int atomLocality)
{
    std::string str = gmx::formatString("Invalid atom locality passed (%d); valid here is only "
                                        "local (%d) or nonlocal (%d)", atomLocality, eatLocal, eatNonlocal);

    GMX_ASSERT(LOCAL_OR_NONLOCAL_A(atomLocality), str.c_str());
}
/*! \brief Convert atom locality to interaction locality.
 *
 * In the current implementation this is a straightforward conversion:
 * local to local, non-local to non-local.
 *
 * \param[in] atomLocality Atom locality specifier
 * \returns                Interaction locality corresponding to the atom locality passed.
 */
static inline int gpuAtomToInteractionLocality(int atomLocality)
{
    validateGpuAtomLocality(atomLocality);

    /* determine interaction locality from atom locality */
    if (LOCAL_A(atomLocality))
    {
        return eintLocal;
    }
    else if (NONLOCAL_A(atomLocality))
    {
        return eintNonlocal;
    }
    else
    {
        gmx_incons("Wrong locality");
    }
}
/*! \brief Calculate atom range and return start index and length.
 *
 * \param[in]  atomData       Atom descriptor data structure
 * \param[in]  atomLocality   Atom locality specifier
 * \param[out] atomRangeBegin Starting index of the atom range in the atom data array.
 * \param[out] atomRangeLen   Atom range length in the atom data array.
 */
template <typename AtomDataT>
static inline void getGpuAtomRange(const AtomDataT *atomData,
                                   int              atomLocality,
                                   int             *atomRangeBegin,
                                   int             *atomRangeLen)
{
    assert(atomData);
    validateGpuAtomLocality(atomLocality);

    /* calculate the atom data index range based on locality */
    if (LOCAL_A(atomLocality))
    {
        *atomRangeBegin = 0;
        *atomRangeLen   = atomData->natoms_local;
    }
    else
    {
        *atomRangeBegin = atomData->natoms_local;
        *atomRangeLen   = atomData->natoms - atomData->natoms_local;
    }
}
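/* Example (illustrative only, with hypothetical numbers): for an atom-data
 * struct with natoms = 1000 and natoms_local = 600, getGpuAtomRange() yields
 *   local     -> *atomRangeBegin = 0,   *atomRangeLen = 600
 *   non-local -> *atomRangeBegin = 600, *atomRangeLen = 400
 */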
/*! \brief Count pruning kernel time if either kernel has been triggered
 *
 * We do the accounting for either of the two pruning kernel flavors:
 *  - 1st pass prune: ran during the current step (prior to the force kernel);
 *  - rolling prune:  ran at the end of the previous step (prior to the current step H2D xq);
 *
 * Note that the resetting of cu_timers_t::didPrune and cu_timers_t::didRollingPrune should happen
 * after calling this function.
 *
 * \param[in]    timers  structs with GPU timer objects
 * \param[inout] timings GPU task timing data
 * \param[in]    iloc    interaction locality
 */
template <typename GpuTimers>
static void countPruneKernelTime(GpuTimers                 *timers,
                                 gmx_wallclock_gpu_nbnxn_t *timings,
                                 const int                  iloc)
{
    // We might have not done any pruning (e.g. if we skipped with empty domains).
    if (!timers->didPrune[iloc] && !timers->didRollingPrune[iloc])
    {
        return;
    }

    if (timers->didPrune[iloc])
    {
        timings->pruneTime.c++;
        timings->pruneTime.t += timers->prune_k[iloc].getLastRangeTime();
    }

    if (timers->didRollingPrune[iloc])
    {
        timings->dynamicPruneTime.c++;
        timings->dynamicPruneTime.t += timers->rollingPrune_k[iloc].getLastRangeTime();
    }
}
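/* Illustrative call pattern (a sketch; in this header the accounting is driven
 * from nbnxn_gpu_accumulate_timings() below and the flags are reset in
 * nbnxn_gpu_try_finish_task()):
 *
 *     countPruneKernelTime(timers, timings, iloc);
 *     // only after the accounting may the flags be cleared:
 *     timers->didPrune[iloc] = timers->didRollingPrune[iloc] = false;
 */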
/*! \brief Reduce data staged internally in the nbnxn module.
 *
 * Shift forces and electrostatic/LJ energies copied from the GPU into
 * a module-internal staging area are immediately reduced (into the CPU-side
 * buffers passed) after having waited for the transfers' completion.
 *
 * Note that this function should always be called after the transfers into the
 * staging buffers have completed.
 *
 * \tparam     StagingData    Type of staging data
 * \param[in]  nbst           Nonbonded staging data
 * \param[in]  iLocality      Interaction locality specifier
 * \param[in]  reduceEnergies True if energy reduction should be done
 * \param[in]  reduceFshift   True if shift force reduction should be done
 * \param[out] e_lj           Variable to accumulate LJ energy into
 * \param[out] e_el           Variable to accumulate electrostatic energy into
 * \param[out] fshift         Pointer to the array of shift forces to accumulate into
 */
template <typename StagingData>
static inline void nbnxn_gpu_reduce_staged_outputs(const StagingData &nbst,
                                                   int                iLocality,
                                                   bool               reduceEnergies,
                                                   bool               reduceFshift,
                                                   real              *e_lj,
                                                   real              *e_el,
                                                   rvec              *fshift)
{
    /* add up energies and shift forces (only once at local F wait) */
    if (LOCAL_I(iLocality))
    {
        if (reduceEnergies)
        {
            *e_lj += *nbst.e_lj;
            *e_el += *nbst.e_el;
        }

        if (reduceFshift)
        {
            for (int i = 0; i < SHIFTS; i++)
            {
                rvec_inc(fshift[i], nbst.fshift[i]);
            }
        }
    }
}
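/* Illustrative use (a sketch mirroring nbnxn_gpu_try_finish_task() below):
 * energies are reduced only when GMX_FORCE_ENERGY was requested for the step,
 * and shift forces only when GMX_FORCE_VIRIAL was, e.g.:
 *
 *     bool calcEner   = (flags & GMX_FORCE_ENERGY) != 0;
 *     bool calcFshift = (flags & GMX_FORCE_VIRIAL) != 0;
 *     nbnxn_gpu_reduce_staged_outputs(nb->nbst, iLocality, calcEner, calcFshift,
 *                                     e_lj, e_el, fshift);
 */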
/*! \brief Do the per-step timing accounting of the nonbonded tasks.
 *
 * Does timing accumulation and call-count increments for the nonbonded kernels.
 * Note that this function should be called after the current step's nonbonded
 * tasks have completed, with the exception of the rolling pruning kernels
 * that are accounted for during the following step.
 *
 * NOTE: if timing with multiple GPUs (streams) becomes possible, the
 *       counters could end up being inconsistent due to not being incremented
 *       on some of the nodes when this is skipped on empty local domains!
 *
 * \tparam     GpuTimers        GPU timers type
 * \tparam     GpuPairlist      Pair list type
 * \param[out] timings          Pointer to the NB GPU timings data
 * \param[in]  timers           Pointer to GPU timers data
 * \param[in]  plist            Pointer to the pair list data
 * \param[in]  atomLocality     Atom locality specifier
 * \param[in]  didEnergyKernels True if energy kernels have been called in the current step
 * \param[in]  doTiming         True if timing is enabled.
 */
template <typename GpuTimers, typename GpuPairlist>
static inline void nbnxn_gpu_accumulate_timings(gmx_wallclock_gpu_nbnxn_t *timings,
                                                GpuTimers                 *timers,
                                                const GpuPairlist         *plist,
                                                int                        atomLocality,
                                                bool                       didEnergyKernels,
                                                bool                       doTiming)
{
    /* timing data accumulation */
    if (!doTiming)
    {
        return;
    }

    /* determine interaction locality from atom locality */
    int iLocality = gpuAtomToInteractionLocality(atomLocality);

    /* only increase counter once (at local F wait) */
    if (LOCAL_I(iLocality))
    {
        timings->nb_c++;
        timings->ktime[plist->haveFreshList ? 1 : 0][didEnergyKernels ? 1 : 0].c += 1;
    }

    /* kernel timings */
    timings->ktime[plist->haveFreshList ? 1 : 0][didEnergyKernels ? 1 : 0].t +=
        timers->nb_k[iLocality].getLastRangeTime();

    /* X/q H2D and F D2H timings */
    timings->nb_h2d_t += timers->nb_h2d[iLocality].getLastRangeTime();
    timings->nb_d2h_t += timers->nb_d2h[iLocality].getLastRangeTime();

    /* Count the pruning kernel times for both cases: 1st pass (at search step)
       and rolling pruning (if called at the previous step).
       We do the accounting here as this is the only sync point where we
       know (without checking or additional sync-ing) that prune tasks in
       the current stream have completed (having just blocking-waited
       for the force D2H). */
    countPruneKernelTime(timers, timings, iLocality);

    /* only count atdat and pair-list H2D at pair-search step */
    if (timers->didPairlistH2D[iLocality])
    {
        /* atdat transfer timing (add only once, at local F wait) */
        if (LOCAL_A(atomLocality))
        {
            timings->pl_h2d_c++;
            timings->pl_h2d_t += timers->atdat.getLastRangeTime();
        }

        timings->pl_h2d_t += timers->pl_h2d[iLocality].getLastRangeTime();

        /* Clear the timing flag for the next step */
        timers->didPairlistH2D[iLocality] = false;
    }
}
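/*! \brief Attempt to complete the asynchronously launched nonbonded tasks.
 *
 * With \p completionKind == GpuTaskCompletion::Check the stream is only
 * queried and false is returned if its tasks have not yet completed; with
 * GpuTaskCompletion::Wait the call blocks until they have. Once the tasks are
 * done (and unless the non-local phase had no work to do), timing accounting
 * and the reduction of the staged outputs are performed; before returning
 * true, the pruning flags and the fresh-list flag are reset.
 *
 * \param[in]  nb             The nonbonded data GPU structure
 * \param[in]  flags          Force flags
 * \param[in]  aloc           Atom locality identifier
 * \param[in]  haveOtherWork  Tells whether there is other work than non-bonded work in the nbnxn stream(s)
 * \param[out] e_lj           Pointer to the LJ energy output to accumulate into
 * \param[out] e_el           Pointer to the electrostatics energy output to accumulate into
 * \param[out] fshift         Pointer to the shift force buffer to accumulate into
 * \param[in]  completionKind Whether to check for (non-blocking) or wait for (blocking) completion
 * \returns                   True if the nonbonded tasks have completed
 */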
//TODO: move into shared source file with gmx_compile_cpp_as_cuda
//NOLINTNEXTLINE(misc-definitions-in-headers)
bool nbnxn_gpu_try_finish_task(gmx_nbnxn_gpu_t  *nb,
                               int               flags,
                               int               aloc,
                               bool              haveOtherWork,
                               real             *e_lj,
                               real             *e_el,
                               rvec             *fshift,
                               GpuTaskCompletion completionKind)
{
    /* determine interaction locality from atom locality */
    int iLocality = gpuAtomToInteractionLocality(aloc);

    // We skip when there was actually no work to do during the non-local phase.
    // This is consistent with nbnxn_gpu_launch_kernel.
    if (haveOtherWork || !canSkipWork(nb, iLocality))
    {
        // Query the state of the GPU stream and return early if we're not done
        if (completionKind == GpuTaskCompletion::Check)
        {
            if (!haveStreamTasksCompleted(nb->stream[iLocality]))
            {
                // Early return to skip the steps below that we have to do only
                // after the NB task completed
                return false;
            }
        }
        else
        {
            gpuStreamSynchronize(nb->stream[iLocality]);
        }

        bool calcEner   = (flags & GMX_FORCE_ENERGY) != 0;
        bool calcFshift = (flags & GMX_FORCE_VIRIAL) != 0;

        nbnxn_gpu_accumulate_timings(nb->timings, nb->timers, nb->plist[iLocality], aloc, calcEner,
                                     nb->bDoTime != 0);

        nbnxn_gpu_reduce_staged_outputs(nb->nbst, iLocality, calcEner, calcFshift, e_lj, e_el, fshift);
    }

    /* Always reset both pruning flags (doesn't hurt doing it even when timing is off). */
    nb->timers->didPrune[iLocality] = nb->timers->didRollingPrune[iLocality] = false;

    /* Turn off initial list pruning (doesn't hurt if this is not pair-search step). */
    nb->plist[iLocality]->haveFreshList = false;

    return true;
}
/*! \brief
 * Wait for the asynchronously launched nonbonded tasks and data
 * transfers to finish.
 *
 * Also does timing accounting and reduction of the internal staging buffers.
 * As this is called at the end of the step, it also resets the pair list and
 * pruning flags.
 *
 * \param[in]  nb            The nonbonded data GPU structure
 * \param[in]  flags         Force flags
 * \param[in]  aloc          Atom locality identifier
 * \param[in]  haveOtherWork Tells whether there is other work than non-bonded work in the nbnxn stream(s)
 * \param[out] e_lj          Pointer to the LJ energy output to accumulate into
 * \param[out] e_el          Pointer to the electrostatics energy output to accumulate into
 * \param[out] fshift        Pointer to the shift force buffer to accumulate into
 */
//NOLINTNEXTLINE(misc-definitions-in-headers) TODO: move into source file
void nbnxn_gpu_wait_finish_task(gmx_nbnxn_gpu_t *nb,
                                int              flags,
                                int              aloc,
                                bool             haveOtherWork,
                                real            *e_lj,
                                real            *e_el,
                                rvec            *fshift)
{
    nbnxn_gpu_try_finish_task(nb, flags, aloc, haveOtherWork, e_lj, e_el, fshift,
                              GpuTaskCompletion::Wait);
}

#endif