From 6fb2a76e3e02dc01d3605a2ced7566090900c8e9 Mon Sep 17 00:00:00 2001
From: Mark Abraham <mark.j.abraham@gmail.com>
Date: Mon, 16 Sep 2019 08:38:06 +0200
Subject: [PATCH] Use GPU halo exchange only when compatible DD is available

Change-Id: I624861e61e6f1b6854172102b12a4f9b51b94101
---
 src/gromacs/domdec/domdec.cpp       | 85 ++++++++++++++++++++++++++++++++-----
 src/gromacs/domdec/domdec.h         |  8 ++++
 src/gromacs/domdec/domdec_setup.cpp | 11 ++++-
 src/gromacs/mdlib/sim_util.cpp      | 18 ++++----
 src/gromacs/mdrun/runner.cpp        | 12 ++++--
 5 files changed, 110 insertions(+), 24 deletions(-)
diff --git a/src/gromacs/domdec/domdec.cpp b/src/gromacs/domdec/domdec.cpp
index 1e3c931858..47a7d3531a 100644
--- a/src/gromacs/domdec/domdec.cpp
+++ b/src/gromacs/domdec/domdec.cpp
@@ -2971,13 +2971,7 @@ getDDSettings(const gmx::MDLogger     &mdlog,
 
     ddSettings.useSendRecv2        = (dd_getenv(mdlog, "GMX_DD_USE_SENDRECV2", 0) != 0);
     ddSettings.dlb_scale_lim       = dd_getenv(mdlog, "GMX_DLB_MAX_BOX_SCALING", 10);
-    // TODO GPU halo exchange requires a 1D single-pulse DD, and when
-    // it is properly integrated the hack with GMX_GPU_DD_COMMS should
-    // be removed.
-    ddSettings.request1DAnd1Pulse  = (bool(dd_getenv(mdlog, "GMX_DD_1D_1PULSE", 0)) ||
-                                      (bool(getenv("GMX_GPU_DD_COMMS") != nullptr &&
-                                            GMX_THREAD_MPI &&
-                                            (GMX_GPU == GMX_GPU_CUDA))));
+    ddSettings.request1DAnd1Pulse  = bool(dd_getenv(mdlog, "GMX_DD_1D_1PULSE", 0));
     ddSettings.useDDOrderZYX       = bool(dd_getenv(mdlog, "GMX_DD_ORDER_ZYX", 0));
     ddSettings.useCartesianReorder = bool(dd_getenv(mdlog, "GMX_NO_CART_REORDER", 1));
     ddSettings.eFlop               = dd_getenv(mdlog, "GMX_DLB_BASED_ON_FLOPS", 0);
@@ -3014,10 +3008,71 @@ gmx_domdec_t::gmx_domdec_t(const t_inputrec &ir) :
 {
 }
 
+/*! \brief Return whether the simulation described can run a 1D single-pulse DD.
+ *
+ * The GPU halo exchange code requires a 1D single-pulse DD. Such a DD
+ * generally requires a larger box than other possible decompositions
+ * with the same rank count, so the calling code might need to decide
+ * what is the most appropriate way to run the simulation based on
+ * whether such a DD is possible.
+ *
+ * This function works like init_domain_decomposition(), but will not
+ * give a fatal error, and only uses \c cr for communicating between
+ * ranks.
+ *
+ * It is safe to call before thread-MPI spawns ranks, so that
+ * thread-MPI can decide whether and how to trigger the GPU halo
+ * exchange code path. The number of PME ranks, if any, should be set
+ * in \c options.numPmeRanks.
+ */
+static bool
+canMake1DAnd1PulseDomainDecomposition(const DDSettings              &ddSettingsOriginal,
+                                      const t_commrec               *cr,
+                                      const int                      numRanksRequested,
+                                      const DomdecOptions           &options,
+                                      const gmx_mtop_t              &mtop,
+                                      const t_inputrec              &ir,
+                                      const matrix                   box,
+                                      gmx::ArrayRef<const gmx::RVec> xGlobal)
+{
+    // Ensure we don't write any output from this checking routine
+    gmx::MDLogger dummyLogger;
+
+    DDSystemInfo  systemInfo = getSystemInfo(dummyLogger, cr, options, &mtop, &ir, box, xGlobal);
+
+    // getDDGridSetup() takes the total rank count and subtracts the PME-only
+    // ranks itself, so numRanksRequested is passed on unmodified below.
+    DDSettings    ddSettings = ddSettingsOriginal;
+    ddSettings.request1DAnd1Pulse = true;
+    const real    gridSetupCellsizeLimit = getDDGridSetupCellSizeLimit(dummyLogger, ddSettings.request1DAnd1Pulse,
+                                                                       !isDlbDisabled(ddSettings.initialDlbState),
+                                                                       options.dlbScaling, ir,
+                                                                       systemInfo.cellsizeLimit);
+    gmx_ddbox_t ddbox       = {0};
+    DDGridSetup ddGridSetup = getDDGridSetup(dummyLogger, cr, numRanksRequested, options,
+                                             ddSettings, systemInfo, gridSetupCellsizeLimit,
+                                             mtop, ir, box, xGlobal, &ddbox);
+
+    const bool canMakeDDWith1DAnd1Pulse = (ddGridSetup.numDomains[XX] != 0);
+
+    return canMakeDDWith1DAnd1Pulse;
+}
+
+bool is1DAnd1PulseDD(const gmx_domdec_t &dd)
+{
+    // The decomposition counts as one-dimensional when the largest
+    // per-dimension cell count equals the product of all three counts.
+    const int  largestCellCount = std::max({dd.nc[XX], dd.nc[YY], dd.nc[ZZ]});
+    const int  totalCellCount   = dd.nc[XX]*dd.nc[YY]*dd.nc[ZZ];
+    const bool hasSinglePulse   = (dd.comm->maxpulse == 1);
+    return hasSinglePulse && (largestCellCount == totalCellCount);
+}
+
 gmx_domdec_t *init_domain_decomposition(const gmx::MDLogger           &mdlog,
                                         t_commrec                     *cr,
                                         const DomdecOptions           &options,
                                         const gmx::MdrunOptions       &mdrunOptions,
+                                        const bool                     prefer1DAnd1Pulse,
                                         const gmx_mtop_t              *mtop,
                                         const t_inputrec              *ir,
                                         const matrix                   box,
@@ -3028,6 +3083,14 @@ gmx_domdec_t *init_domain_decomposition(const gmx::MDLogger           &mdlog,
             "\nInitializing Domain Decomposition on %d ranks", cr->nnodes);
 
     DDSettings  ddSettings = getDDSettings(mdlog, options, mdrunOptions, *ir);
+
+    if (prefer1DAnd1Pulse &&
+        canMake1DAnd1PulseDomainDecomposition(ddSettings, cr, cr->nnodes, options,
+                                              *mtop, *ir, box, xGlobal))
+    {
+        ddSettings.request1DAnd1Pulse = true;
+    }
+
     if (ddSettings.eFlop > 1)
     {
         /* Ensure that we have different random flop counts on different ranks */
@@ -3039,16 +3102,16 @@ gmx_domdec_t *init_domain_decomposition(const gmx::MDLogger           &mdlog,
     int          numRanksRequested = cr->nnodes;
     checkForValidRankCountRequests(numRanksRequested, EEL_PME(ir->coulombtype), options.numPmeRanks);
 
-    // DD grid setup uses a more conservative cell size limit for
+    // DD grid setup uses a different cell size limit for
     // automated setup than the one in systemInfo. The latter is used
-    // later during DLB, for example.
+    // in set_dd_limits() to configure DLB, for example.
     const real gridSetupCellsizeLimit = getDDGridSetupCellSizeLimit(mdlog, ddSettings.request1DAnd1Pulse,
                                                                     !isDlbDisabled(ddSettings.initialDlbState),
                                                                     options.dlbScaling, *ir,
                                                                     systemInfo.cellsizeLimit);
     gmx_ddbox_t  ddbox       = {0};
-    DDGridSetup  ddGridSetup = getDDGridSetup(mdlog, cr, numRanksRequested, options, ddSettings, systemInfo,
-                                              gridSetupCellsizeLimit,
+    DDGridSetup  ddGridSetup = getDDGridSetup(mdlog, cr, numRanksRequested, options,
+                                              ddSettings, systemInfo, gridSetupCellsizeLimit,
                                               *mtop, *ir, box, xGlobal, &ddbox);
     checkDDGridSetup(ddGridSetup, cr, options, ddSettings, systemInfo, gridSetupCellsizeLimit, ddbox);
 
diff --git a/src/gromacs/domdec/domdec.h b/src/gromacs/domdec/domdec.h
index 84274b0fa8..4f62dc4255 100644
--- a/src/gromacs/domdec/domdec.h
+++ b/src/gromacs/domdec/domdec.h
@@ -162,12 +162,20 @@ init_domain_decomposition(const gmx::MDLogger            &mdlog,
                           t_commrec                      *cr,
                           const gmx::DomdecOptions       &options,
                           const gmx::MdrunOptions        &mdrunOptions,
+                          bool                            prefer1DAnd1Pulse,
                           const gmx_mtop_t               *mtop,
                           const t_inputrec               *ir,
                           const matrix                    box,
                           gmx::ArrayRef<const gmx::RVec>  xGlobal,
                           gmx::LocalAtomSetManager       *atomSets);
 
+/*! \brief Return whether the DD has a single dimension with a single pulse
+ *
+ * The GPU halo exchange code requires a 1D single-pulse DD, and its
+ * setup code can use the returned value to understand what it should
+ * do. */
+bool is1DAnd1PulseDD(const gmx_domdec_t &dd);
+
 /*! \brief Initialize data structures for bonded interactions */
 void dd_init_bondeds(FILE              *fplog,
                      gmx_domdec_t      *dd,
diff --git a/src/gromacs/domdec/domdec_setup.cpp b/src/gromacs/domdec/domdec_setup.cpp
index e685336bd7..afe132c429 100644
--- a/src/gromacs/domdec/domdec_setup.cpp
+++ b/src/gromacs/domdec/domdec_setup.cpp
@@ -913,7 +913,16 @@ getDDGridSetup(const gmx::MDLogger           &mdlog,
                gmx::ArrayRef<const gmx::RVec> xGlobal,
                gmx_ddbox_t                   *ddbox)
 {
-    int       numPmeOnlyRanks = getNumPmeOnlyRanksToUse(mdlog, options, mtop, ir, box, numRanksRequested);
+    int numPmeOnlyRanks = getNumPmeOnlyRanksToUse(mdlog, options, mtop, ir, box, numRanksRequested);
+
+    if (ddSettings.request1DAnd1Pulse &&
+        (numRanksRequested - numPmeOnlyRanks == 1))
+    {
+        // With only one PP rank, there will not be a need for
+        // GPU-based halo exchange that wants to request that any DD
+        // has only 1 dimension and 1 pulse.
+        return DDGridSetup {};
+    }
 
     gmx::IVec numDomains;
     if (options.numCells[XX] > 0)
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index 85c8c73ae6..af26e4e3db 100644
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -128,10 +128,6 @@ static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GP
 // TODO eventially tie this in with other existing GPU flags.
 static const bool c_enableGpuBufOps = (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr);
 
-/*! \brief environment variable to enable GPU P2P communication */
-static const bool c_enableGpuHaloExchange = (getenv("GMX_GPU_DD_COMMS") != nullptr)
-    && GMX_THREAD_MPI && (GMX_GPU == GMX_GPU_CUDA);
-
 static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
 {
     const int      end = forceToAdd.size();
@@ -1187,11 +1183,15 @@ void do_force(FILE                                     *fplog,
         launchPmeGpuFftAndGather(fr->pmedata, wcycle);
     }
 
-    const bool            ddUsesGpuDirectCommunication
-        = c_enableGpuHaloExchange && c_enableGpuBufOps && bUseGPU && havePPDomainDecomposition(cr);
-    gmx::GpuHaloExchange *gpuHaloExchange = ddUsesGpuDirectCommunication ? cr->dd->gpuHaloExchange.get() : nullptr;
-    GMX_ASSERT(!ddUsesGpuDirectCommunication || gpuHaloExchange != nullptr,
-               "Must have valid gpuHaloExchange when doing halo exchange on the GPU");
+    // TODO Update this comment when introducing SimulationWorkload
+    //
+    // The conditions for gpuHaloExchange e.g. using GPU buffer
+    // operations were checked before construction, so here we can
+    // just use it and assert upon any conditions.
+    gmx::GpuHaloExchange *gpuHaloExchange              = (havePPDomainDecomposition(cr) ? cr->dd->gpuHaloExchange.get() : nullptr);
+    const bool            ddUsesGpuDirectCommunication = (gpuHaloExchange != nullptr);
+    GMX_ASSERT(!ddUsesGpuDirectCommunication || (useGpuXBufOps == BufferOpsUseGpu::True),
+               "Must use coordinate buffer ops with GPU halo exchange");
 
     /* Communicate coordinates and sum dipole if necessary +
        do non-local pair search */
diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp
index 690bfd748d..f2cc132ecb 100644
--- a/src/gromacs/mdrun/runner.cpp
+++ b/src/gromacs/mdrun/runner.cpp
@@ -166,6 +166,9 @@ namespace gmx
 static const bool c_enableGpuHaloExchange = (getenv("GMX_GPU_DD_COMMS") != nullptr)
     && GMX_THREAD_MPI && (GMX_GPU == GMX_GPU_CUDA);
 
+/*! \brief environment variable to enable GPU buffer operations */
+static const bool c_enableGpuBufOps       = (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr);
+
 /*! \brief Manage any development feature flag variables encountered
  *
  * The use of dev features indicated by environment variables is
@@ -194,8 +197,6 @@ static void manageDevelopmentFeatures(const gmx::MDLogger &mdlog)
         {
             gmx_fatal(FARGS, "Cannot enable GPU halo exchange without GPU buffer operations, set GMX_USE_GPU_BUFFER_OPS=1\n");
         }
-        GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
-                "NOTE: This run uses the 'GPU halo exchange' feature, enabled by the GMX_GPU_DD_COMMS environment variable.");
     }
 
     if (useGpuUpdateConstrain)
@@ -1028,11 +1029,13 @@ int Mdrunner::mdrunner()
     prepare_verlet_scheme(fplog, cr, inputrec, nstlist_cmdline, &mtop, box,
                           useGpuForNonbonded || (emulateGpuNonbonded == EmulateGpuNonbonded::Yes), *hwinfo->cpuInfo);
 
+    const bool          prefer1DAnd1PulseDD = (c_enableGpuHaloExchange && useGpuForNonbonded);
     LocalAtomSetManager atomSets;
     if (PAR(cr) && !(EI_TPI(inputrec->eI) ||
                      inputrec->eI == eiNM))
     {
         cr->dd = init_domain_decomposition(mdlog, cr, domdecOptions, mdrunOptions,
+                                           prefer1DAnd1PulseDD,
                                            &mtop, inputrec,
                                            box, positionsFromStatePointer(globalState.get()),
                                            &atomSets);
@@ -1340,12 +1343,15 @@ int Mdrunner::mdrunner()
         // TODO Move this to happen during domain decomposition setup,
         // once stream and event handling works well with that.
         // TODO remove need to pass local stream into GPU halo exchange - Redmine #3093
-        if (havePPDomainDecomposition(cr) && c_enableGpuHaloExchange && useGpuForNonbonded)
+        if (havePPDomainDecomposition(cr) && prefer1DAnd1PulseDD && is1DAnd1PulseDD(*cr->dd))
         {
+            GMX_RELEASE_ASSERT(c_enableGpuBufOps, "Must use GMX_USE_GPU_BUFFER_OPS=1 to use GMX_GPU_DD_COMMS=1");
             void *streamLocal                   = Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, Nbnxm::InteractionLocality::NonLocal);
             void *streamNonLocal                =
                 Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, Nbnxm::InteractionLocality::NonLocal);
             void *coordinatesOnDeviceEvent = fr->nbv->get_x_on_device_event();
+            GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
+                    "NOTE: This run uses the 'GPU halo exchange' feature, enabled by the GMX_GPU_DD_COMMS environment variable.");
             cr->dd->gpuHaloExchange = std::make_unique<GpuHaloExchange>(cr->dd, cr->mpi_comm_mysim, streamLocal,
                                                                         streamNonLocal, coordinatesOnDeviceEvent);
         }
-- 
2.11.4.GIT