/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2017, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 * \brief Common functions for the different NBNXN GPU implementations.
 *
 * \author Szilard Pall <pall.szilard@gmail.com>
 *
 * \ingroup module_mdlib
 */
#ifndef GMX_MDLIB_NBNXN_GPU_COMMON_H
#define GMX_MDLIB_NBNXN_GPU_COMMON_H

#include "config.h"

#include <cassert> /* needed for the plain assert() calls below */
#include <string>

#if GMX_GPU == GMX_GPU_CUDA
#include "nbnxn_cuda/nbnxn_cuda_types.h"
#endif

#if GMX_GPU == GMX_GPU_OPENCL
#include "nbnxn_ocl/nbnxn_ocl_types.h"
#endif

#include "gromacs/math/vec.h"
#include "gromacs/mdlib/nbnxn_gpu_types.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/utility/stringutil.h"

#include "nbnxn_gpu_common_utils.h"
/*! \brief Check that atom locality values are valid for the GPU module.
 *
 * In the GPU module atom locality "all" is not supported; the local and
 * non-local ranges are treated separately.
 *
 * \param[in] atomLocality atom locality specifier
 */
static inline void validateGpuAtomLocality(int atomLocality)
{
    std::string str = gmx::formatString("Invalid atom locality passed (%d); valid here is only "
                                        "local (%d) or nonlocal (%d)", atomLocality, eatLocal, eatNonlocal);

    GMX_ASSERT(LOCAL_OR_NONLOCAL_A(atomLocality), str.c_str());
}
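
/* Illustrative usage sketch (not part of the original header): only the two
 * per-range locality values pass the assertion above; any other value, such
 * as the unsupported "all" locality, fires GMX_ASSERT in a debug build.
 *
 *     validateGpuAtomLocality(eatLocal);    // OK
 *     validateGpuAtomLocality(eatNonlocal); // OK
 *     // any other locality value asserts
 */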
/*! \brief Convert atom locality to interaction locality.
 *
 * In the current implementation this is a straightforward conversion:
 * local to local, non-local to non-local.
 *
 * \param[in] atomLocality Atom locality specifier
 * \returns Interaction locality corresponding to the atom locality passed.
 */
static inline int gpuAtomToInteractionLocality(int atomLocality)
{
    validateGpuAtomLocality(atomLocality);

    /* determine interaction locality from atom locality */
    if (LOCAL_A(atomLocality))
    {
        return eintLocal;
    }
    else if (NONLOCAL_A(atomLocality))
    {
        return eintNonlocal;
    }
    else
    {
        // can't be reached
        assert(false);
        return -1;
    }
}
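
/* Illustrative note (not part of the original header): the conversion is the
 * identity on the two supported values, i.e.
 *
 *     gpuAtomToInteractionLocality(eatLocal)    == eintLocal
 *     gpuAtomToInteractionLocality(eatNonlocal) == eintNonlocal
 */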
/*! \brief Calculate atom range and return start index and length.
 *
 * \param[in]  atomData       Atom descriptor data structure
 * \param[in]  atomLocality   Atom locality specifier
 * \param[out] atomRangeBegin Starting index of the atom range in the atom data array.
 * \param[out] atomRangeLen   Atom range length in the atom data array.
 */
template <typename AtomDataT>
static inline void getGpuAtomRange(const AtomDataT *atomData,
                                   int              atomLocality,
                                   int             &atomRangeBegin,
                                   int             &atomRangeLen)
{
    assert(atomData);
    validateGpuAtomLocality(atomLocality);

    /* calculate the atom data index range based on locality */
    if (LOCAL_A(atomLocality))
    {
        atomRangeBegin = 0;
        atomRangeLen   = atomData->natoms_local;
    }
    else
    {
        atomRangeBegin = atomData->natoms_local;
        atomRangeLen   = atomData->natoms - atomData->natoms_local;
    }
}
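
/* Illustrative usage sketch (not part of the original header), assuming a
 * hypothetical AtomDataT instance atdat with natoms = 1000 and
 * natoms_local = 600:
 *
 *     int begin, len;
 *     getGpuAtomRange(atdat, eatLocal,    begin, len); // begin = 0,   len = 600
 *     getGpuAtomRange(atdat, eatNonlocal, begin, len); // begin = 600, len = 400
 *
 * i.e. the two localities partition the atom data array without overlap.
 */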
/*! \brief Count pruning kernel time if either kernel has been triggered.
 *
 * We do the accounting for either of the two pruning kernel flavors:
 *  - 1st pass prune: ran during the current step (prior to the force kernel);
 *  - rolling prune:  ran at the end of the previous step (prior to the current step H2D xq);
 *
 * Note that the resetting of cu_timers_t::didPrune and cu_timers_t::didRollingPrune should happen
 * after calling this function.
 *
 * \param[in]    timers  structs with GPU timer objects
 * \param[inout] timings GPU task timing data
 * \param[in]    iloc    interaction locality
 */
template <typename GpuTimers>
static void countPruneKernelTime(GpuTimers                 *timers,
                                 gmx_wallclock_gpu_nbnxn_t *timings,
                                 const int                  iloc)
{
    // We might not have done any pruning (e.g. if we skipped with empty domains).
    if (!timers->didPrune[iloc] && !timers->didRollingPrune[iloc])
    {
        return;
    }

    if (timers->didPrune[iloc])
    {
        timings->pruneTime.c++;
        timings->pruneTime.t += timers->prune_k[iloc].getLastRangeTime();
    }

    if (timers->didRollingPrune[iloc])
    {
        timings->dynamicPruneTime.c++;
        timings->dynamicPruneTime.t += timers->rollingPrune_k[iloc].getLastRangeTime();
    }
}
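
/* Illustrative call-order sketch (not part of the original header) for one
 * interaction locality iloc; the flag names are the ones consumed above:
 *
 *     timers->didPrune[iloc]        = true; // set where the 1st-pass prune kernel is enqueued
 *     timers->didRollingPrune[iloc] = true; // set where the rolling prune kernel is enqueued
 *     countPruneKernelTime(timers, timings, iloc); // accumulates either/both flavors
 *     timers->didPrune[iloc] = timers->didRollingPrune[iloc] = false; // reset afterwards, per the note above
 */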
/*! \brief Reduce data staged internally in the nbnxn module.
 *
 * Shift forces and electrostatic/LJ energies copied from the GPU into
 * a module-internal staging area are immediately reduced (into the CPU-side
 * buffers passed) after having waited for the transfers' completion.
 *
 * Note that this function should always be called after the transfers into the
 * staging buffers have completed.
 *
 * \tparam     StagingData    Type of staging data
 * \param[in]  nbst           Nonbonded staging data
 * \param[in]  iLocality      Interaction locality specifier
 * \param[in]  reduceEnergies True if energy reduction should be done
 * \param[in]  reduceFshift   True if shift force reduction should be done
 * \param[out] e_lj           Variable to accumulate LJ energy into
 * \param[out] e_el           Variable to accumulate electrostatic energy into
 * \param[out] fshift         Pointer to the array of shift forces to accumulate into
 */
template <typename StagingData>
static inline void nbnxn_gpu_reduce_staged_outputs(const StagingData &nbst,
                                                   int                iLocality,
                                                   bool               reduceEnergies,
                                                   bool               reduceFshift,
                                                   real              *e_lj,
                                                   real              *e_el,
                                                   rvec              *fshift)
{
    /* add up energies and shift forces (only once at local F wait) */
    if (LOCAL_I(iLocality))
    {
        if (reduceEnergies)
        {
            *e_lj += *nbst.e_lj;
            *e_el += *nbst.e_el;
        }

        if (reduceFshift)
        {
            for (int i = 0; i < SHIFTS; i++)
            {
                rvec_inc(fshift[i], nbst.fshift[i]);
            }
        }
    }
}
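
/* Illustrative usage sketch (not part of the original header): a typical
 * call at the local force wait, mirroring nbnxn_gpu_wait_for_gpu() below:
 *
 *     nbnxn_gpu_reduce_staged_outputs(nb->nbst, eintLocal, calcEner, calcFshift,
 *                                     e_lj, e_el, fshift);
 *
 * With a non-local iLocality the call is a no-op thanks to the LOCAL_I()
 * check above, so the reduction happens exactly once per step.
 */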
/*! \brief Do the per-step timing accounting of the nonbonded tasks.
 *
 * Does timing accumulation and call-count increments for the nonbonded kernels.
 * Note that this function should be called after the current step's nonbonded
 * tasks have completed, with the exception of the rolling pruning kernels
 * that are accounted for during the following step.
 *
 * \tparam GpuTimers   GPU timers type
 * \tparam GpuPairlist Pair list type
 * \param[out] timings          Pointer to the NB GPU timings data
 * \param[in]  timers           Pointer to GPU timers data
 * \param[in]  plist            Pointer to the pair list data
 * \param[in]  atomLocality     Atom locality specifier
 * \param[in]  didEnergyKernels True if energy kernels have been called in the current step
 * \param[in]  doTiming         True if timing is enabled.
 */
template <typename GpuTimers, typename GpuPairlist>
static inline void nbnxn_gpu_accumulate_timings(gmx_wallclock_gpu_nbnxn_t *timings,
                                                GpuTimers                 *timers,
                                                const GpuPairlist         *plist,
                                                int                        atomLocality,
                                                bool                       didEnergyKernels,
                                                bool                       doTiming)
{
    /* timing data accumulation */
    if (!doTiming)
    {
        return;
    }

    /* determine interaction locality from atom locality */
    int iLocality = gpuAtomToInteractionLocality(atomLocality);

    /* only increase counter once (at local F wait) */
    if (LOCAL_I(iLocality))
    {
        timings->nb_c++;
        timings->ktime[plist->haveFreshList ? 1 : 0][didEnergyKernels ? 1 : 0].c += 1;
    }

    /* kernel timings */
    timings->ktime[plist->haveFreshList ? 1 : 0][didEnergyKernels ? 1 : 0].t +=
        timers->nb_k[iLocality].getLastRangeTime();

    /* X/q H2D and F D2H timings */
    timings->nb_h2d_t += timers->nb_h2d[iLocality].getLastRangeTime();
    timings->nb_d2h_t += timers->nb_d2h[iLocality].getLastRangeTime();

    /* Count the pruning kernel times for both cases: 1st pass (at search step)
       and rolling pruning (if called at the previous step).
       We do the accounting here as this is the only sync point where we
       know (without checking or additional sync-ing) that prune tasks in
       the current stream have completed (having just blocking-waited
       for the force D2H). */
    countPruneKernelTime(timers, timings, iLocality);

    /* only count atdat and pair-list H2D at pair-search step */
    if (timers->didPairlistH2D[iLocality])
    {
        /* atdat transfer timing (add only once, at local F wait) */
        if (LOCAL_A(atomLocality))
        {
            timings->pl_h2d_c++;
            timings->pl_h2d_t += timers->atdat.getLastRangeTime();
        }

        timings->pl_h2d_t += timers->pl_h2d[iLocality].getLastRangeTime();

        /* Clear the timing flag for the next step */
        timers->didPairlistH2D[iLocality] = false;
    }
}
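
/* Illustrative note (not part of the original header): ktime above is a 2x2
 * table indexed as [haveFreshList ? 1 : 0][didEnergyKernels ? 1 : 0], so each
 * of the four force-kernel flavors (pruned vs. fresh list, without vs. with
 * energies) accumulates its own call count and time.
 */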
// Documented in nbnxn_gpu.h
void nbnxn_gpu_wait_for_gpu(gmx_nbnxn_gpu_t *nb,
                            int              flags,
                            int              aloc,
                            real            *e_lj,
                            real            *e_el,
                            rvec            *fshift)
{
    /* determine interaction locality from atom locality */
    int iLocality = gpuAtomToInteractionLocality(aloc);

    /* Launch wait/update timers & counters and do reduction into staging buffers,
       but skip it when there was actually no work to do during the non-local phase.
       This is consistent with nbnxn_gpu_launch_kernel.

       NOTE: if timing with multiple GPUs (streams) becomes possible, the
       counters could end up being inconsistent due to not being incremented
       on some of the nodes! */
    if (!canSkipWork(nb, iLocality))
    {
        gpuStreamSynchronize(nb->stream[iLocality]);

        bool calcEner   = flags & GMX_FORCE_ENERGY;
        bool calcFshift = flags & GMX_FORCE_VIRIAL;

        nbnxn_gpu_accumulate_timings(nb->timings, nb->timers, nb->plist[iLocality], aloc, calcEner, nb->bDoTime);

        nbnxn_gpu_reduce_staged_outputs(nb->nbst, iLocality, calcEner, calcFshift, e_lj, e_el, fshift);
    }

    /* Always reset both pruning flags (doesn't hurt doing it even when timing is off). */
    nb->timers->didPrune[iLocality] = nb->timers->didRollingPrune[iLocality] = false;

    /* Turn off initial list pruning (doesn't hurt if this is not pair-search step). */
    nb->plist[iLocality]->haveFreshList = false;
}
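
/* Illustrative usage sketch (not part of the original header): a hypothetical
 * caller blocking on the local stream after the force D2H, where flags is the
 * force-flag bitmask tested against GMX_FORCE_ENERGY/GMX_FORCE_VIRIAL above:
 *
 *     nbnxn_gpu_wait_for_gpu(nb, flags, eatLocal, &e_lj, &e_el, fshift);
 */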
#endif