/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2017, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 * \brief Common functions for the different NBNXN GPU implementations.
 *
 * \author Szilard Pall <pall.szilard@gmail.com>
 *
 * \ingroup module_mdlib
 */
#ifndef GMX_MDLIB_NBNXN_GPU_COMMON_H
#define GMX_MDLIB_NBNXN_GPU_COMMON_H

#include "config.h"

#include <cassert> /* needed for the plain assert() calls below */
#include <string>

#if GMX_GPU == GMX_GPU_CUDA
#include "nbnxn_cuda/nbnxn_cuda_types.h"
#endif

#if GMX_GPU == GMX_GPU_OPENCL
#include "nbnxn_ocl/nbnxn_ocl_types.h"
#endif

#include "gromacs/math/vec.h"
#include "gromacs/mdlib/nbnxn_gpu_types.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/utility/stringutil.h"

#include "nbnxn_gpu_common_utils.h"
/*! \brief Check that atom locality values are valid for the GPU module.
 *
 * In the GPU module atom locality "all" is not supported; the local and
 * non-local ranges are treated separately.
 *
 * \param[in] atomLocality atom locality specifier
 */
static inline void validateGpuAtomLocality(int atomLocality)
{
    std::string str = gmx::formatString("Invalid atom locality passed (%d); valid here is only "
                                        "local (%d) or nonlocal (%d)", atomLocality, eatLocal, eatNonlocal);

    GMX_ASSERT(LOCAL_OR_NONLOCAL_A(atomLocality), str.c_str());
}
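
/* Illustrative usage sketch (not part of the original header): only the two
 * per-range locality values pass the assertion above; any other value, such
 * as the unsupported "all" locality, fires GMX_ASSERT in a debug build.
 *
 *     validateGpuAtomLocality(eatLocal);    // OK
 *     validateGpuAtomLocality(eatNonlocal); // OK
 *     // any other locality value asserts
 */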
/*! \brief Convert atom locality to interaction locality.
 *
 * In the current implementation this is a straightforward conversion:
 * local to local, non-local to non-local.
 *
 * \param[in] atomLocality Atom locality specifier
 * \returns Interaction locality corresponding to the atom locality passed.
 */
static inline int gpuAtomToInteractionLocality(int atomLocality)
{
    validateGpuAtomLocality(atomLocality);

    /* determine interaction locality from atom locality */
    if (LOCAL_A(atomLocality))
    {
        return eintLocal;
    }
    else if (NONLOCAL_A(atomLocality))
    {
        return eintNonlocal;
    }
    else
    {
        // can't be reached
        assert(false);
        return -1;
    }
}
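
/* Illustrative note (not part of the original header): the conversion is the
 * identity on the two supported values, i.e.
 *
 *     gpuAtomToInteractionLocality(eatLocal)    == eintLocal
 *     gpuAtomToInteractionLocality(eatNonlocal) == eintNonlocal
 */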
/*! \brief Calculate atom range and return start index and length.
 *
 * \param[in]  atomData       Atom descriptor data structure
 * \param[in]  atomLocality   Atom locality specifier
 * \param[out] atomRangeBegin Starting index of the atom range in the atom data array.
 * \param[out] atomRangeLen   Atom range length in the atom data array.
 */
template <typename AtomDataT>
static inline void getGpuAtomRange(const AtomDataT *atomData,
                                   int              atomLocality,
                                   int             &atomRangeBegin,
                                   int             &atomRangeLen)
{
    assert(atomData);
    validateGpuAtomLocality(atomLocality);

    /* calculate the atom data index range based on locality */
    if (LOCAL_A(atomLocality))
    {
        atomRangeBegin = 0;
        atomRangeLen   = atomData->natoms_local;
    }
    else
    {
        atomRangeBegin = atomData->natoms_local;
        atomRangeLen   = atomData->natoms - atomData->natoms_local;
    }
}
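
/* Illustrative usage sketch (not part of the original header), assuming a
 * hypothetical AtomDataT instance atdat with natoms = 1000 and
 * natoms_local = 600:
 *
 *     int begin, len;
 *     getGpuAtomRange(atdat, eatLocal,    begin, len); // begin = 0,   len = 600
 *     getGpuAtomRange(atdat, eatNonlocal, begin, len); // begin = 600, len = 400
 *
 * i.e. the two localities partition the atom data array without overlap.
 */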
/*! \brief Count pruning kernel time if either kernel has been triggered.
 *
 * We do the accounting for either of the two pruning kernel flavors:
 *  - 1st pass prune: ran during the current step (prior to the force kernel);
 *  - rolling prune:  ran at the end of the previous step (prior to the current step H2D xq);
 *
 * Note that the resetting of cu_timers_t::didPrune and cu_timers_t::didRollingPrune should happen
 * after calling this function.
 *
 * \param[in]    timers  structs with GPU timer objects
 * \param[inout] timings GPU task timing data
 * \param[in]    iloc    interaction locality
 */
template <typename GpuTimers>
static void countPruneKernelTime(GpuTimers                 *timers,
                                 gmx_wallclock_gpu_nbnxn_t *timings,
                                 const int                  iloc)
{
    // We might not have done any pruning (e.g. if we skipped with empty domains).
    if (!timers->didPrune[iloc] && !timers->didRollingPrune[iloc])
    {
        return;
    }

    if (timers->didPrune[iloc])
    {
        timings->pruneTime.c++;
        timings->pruneTime.t += timers->prune_k[iloc].getLastRangeTime();
    }

    if (timers->didRollingPrune[iloc])
    {
        timings->dynamicPruneTime.c++;
        timings->dynamicPruneTime.t += timers->rollingPrune_k[iloc].getLastRangeTime();
    }
}
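
/* Illustrative call-order sketch (not part of the original header) for one
 * interaction locality iloc; the flag names are the ones consumed above:
 *
 *     timers->didPrune[iloc]        = true; // set where the 1st-pass prune kernel is enqueued
 *     timers->didRollingPrune[iloc] = true; // set where the rolling prune kernel is enqueued
 *     countPruneKernelTime(timers, timings, iloc); // accumulates either/both flavors
 *     timers->didPrune[iloc] = timers->didRollingPrune[iloc] = false; // reset afterwards, per the note above
 */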
/*! \brief Reduce data staged internally in the nbnxn module.
 *
 * Shift forces and electrostatic/LJ energies copied from the GPU into
 * a module-internal staging area are immediately reduced (into the CPU-side
 * buffers passed) after having waited for the transfers' completion.
 *
 * Note that this function should always be called after the transfers into the
 * staging buffers have completed.
 *
 * \tparam     StagingData    Type of staging data
 * \param[in]  nbst           Nonbonded staging data
 * \param[in]  iLocality      Interaction locality specifier
 * \param[in]  reduceEnergies True if energy reduction should be done
 * \param[in]  reduceFshift   True if shift force reduction should be done
 * \param[out] e_lj           Variable to accumulate LJ energy into
 * \param[out] e_el           Variable to accumulate electrostatic energy into
 * \param[out] fshift         Pointer to the array of shift forces to accumulate into
 */
template <typename StagingData>
static inline void nbnxn_gpu_reduce_staged_outputs(const StagingData &nbst,
                                                   int                iLocality,
                                                   bool               reduceEnergies,
                                                   bool               reduceFshift,
                                                   real              *e_lj,
                                                   real              *e_el,
                                                   rvec              *fshift)
{
    /* add up energies and shift forces (only once at local F wait) */
    if (LOCAL_I(iLocality))
    {
        if (reduceEnergies)
        {
            *e_lj += *nbst.e_lj;
            *e_el += *nbst.e_el;
        }

        if (reduceFshift)
        {
            for (int i = 0; i < SHIFTS; i++)
            {
                rvec_inc(fshift[i], nbst.fshift[i]);
            }
        }
    }
}
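
/* Illustrative usage sketch (not part of the original header): a typical
 * call at the local force wait, mirroring nbnxn_gpu_wait_for_gpu() below:
 *
 *     nbnxn_gpu_reduce_staged_outputs(nb->nbst, eintLocal, calcEner, calcFshift,
 *                                     e_lj, e_el, fshift);
 *
 * With a non-local iLocality the call is a no-op thanks to the LOCAL_I()
 * check above, so the reduction happens exactly once per step.
 */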
/*! \brief Do the per-step timing accounting of the nonbonded tasks.
 *
 * Does timing accumulation and call-count increments for the nonbonded kernels.
 * Note that this function should be called after the current step's nonbonded
 * tasks have completed, with the exception of the rolling pruning kernels
 * that are accounted for during the following step.
 *
 * \tparam GpuTimers   GPU timers type
 * \tparam GpuPairlist Pair list type
 * \param[out] timings          Pointer to the NB GPU timings data
 * \param[in]  timers           Pointer to GPU timers data
 * \param[in]  plist            Pointer to the pair list data
 * \param[in]  atomLocality     Atom locality specifier
 * \param[in]  didEnergyKernels True if energy kernels have been called in the current step
 * \param[in]  doTiming         True if timing is enabled.
 */
template <typename GpuTimers, typename GpuPairlist>
static inline void nbnxn_gpu_accumulate_timings(gmx_wallclock_gpu_nbnxn_t *timings,
                                                GpuTimers                 *timers,
                                                const GpuPairlist         *plist,
                                                int                        atomLocality,
                                                bool                       didEnergyKernels,
                                                bool                       doTiming)
{
    /* timing data accumulation */
    if (!doTiming)
    {
        return;
    }

    /* determine interaction locality from atom locality */
    int iLocality = gpuAtomToInteractionLocality(atomLocality);

    /* only increase counter once (at local F wait) */
    if (LOCAL_I(iLocality))
    {
        timings->nb_c++;
        timings->ktime[plist->haveFreshList ? 1 : 0][didEnergyKernels ? 1 : 0].c += 1;
    }

    /* kernel timings */
    timings->ktime[plist->haveFreshList ? 1 : 0][didEnergyKernels ? 1 : 0].t +=
        timers->nb_k[iLocality].getLastRangeTime();

    /* X/q H2D and F D2H timings */
    timings->nb_h2d_t += timers->nb_h2d[iLocality].getLastRangeTime();
    timings->nb_d2h_t += timers->nb_d2h[iLocality].getLastRangeTime();

    /* Count the pruning kernel times for both cases: 1st pass (at search step)
       and rolling pruning (if called at the previous step).
       We do the accounting here as this is the only sync point where we
       know (without checking or additional sync-ing) that prune tasks in
       the current stream have completed (having just blocking-waited
       for the force D2H). */
    countPruneKernelTime(timers, timings, iLocality);

    /* only count atdat and pair-list H2D at pair-search step */
    if (timers->didPairlistH2D[iLocality])
    {
        /* atdat transfer timing (add only once, at local F wait) */
        if (LOCAL_A(atomLocality))
        {
            timings->pl_h2d_c++;
            timings->pl_h2d_t += timers->atdat.getLastRangeTime();
        }

        timings->pl_h2d_t += timers->pl_h2d[iLocality].getLastRangeTime();

        /* Clear the timing flag for the next step */
        timers->didPairlistH2D[iLocality] = false;
    }
}
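
/* Illustrative note (not part of the original header): ktime above is a 2x2
 * table indexed as [haveFreshList ? 1 : 0][didEnergyKernels ? 1 : 0], so each
 * of the four force-kernel flavors (pruned vs. fresh list, without vs. with
 * energies) accumulates its own call count and time.
 */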
// Documented in nbnxn_gpu.h
void nbnxn_gpu_wait_for_gpu(gmx_nbnxn_gpu_t *nb,
                            int              flags,
                            int              aloc,
                            real            *e_lj,
                            real            *e_el,
                            rvec            *fshift)
{
    /* determine interaction locality from atom locality */
    int iLocality = gpuAtomToInteractionLocality(aloc);

    /* Launch wait/update timers & counters and do reduction into staging buffers,
       but skip it when there was actually no work to do during the non-local phase.
       This is consistent with nbnxn_gpu_launch_kernel.

       NOTE: if timing with multiple GPUs (streams) becomes possible, the
       counters could end up being inconsistent due to not being incremented
       on some of the nodes! */
    if (!canSkipWork(nb, iLocality))
    {
        gpuStreamSynchronize(nb->stream[iLocality]);

        bool calcEner   = flags & GMX_FORCE_ENERGY;
        bool calcFshift = flags & GMX_FORCE_VIRIAL;

        nbnxn_gpu_accumulate_timings(nb->timings, nb->timers, nb->plist[iLocality], aloc, calcEner, nb->bDoTime);

        nbnxn_gpu_reduce_staged_outputs(nb->nbst, iLocality, calcEner, calcFshift, e_lj, e_el, fshift);
    }

    /* Always reset both pruning flags (doesn't hurt doing it even when timing is off). */
    nb->timers->didPrune[iLocality] = nb->timers->didRollingPrune[iLocality] = false;

    /* Turn off initial list pruning (doesn't hurt if this is not pair-search step). */
    nb->plist[iLocality]->haveFreshList = false;
}
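
/* Illustrative usage sketch (not part of the original header): a hypothetical
 * caller blocking on the local stream after the force D2H, where flags is the
 * force-flag bitmask tested against GMX_FORCE_ENERGY/GMX_FORCE_VIRIAL above:
 *
 *     nbnxn_gpu_wait_for_gpu(nb, flags, eatLocal, &e_lj, &e_el, fshift);
 */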
#endif