From 83fe36cb0c2e08065ef3b3152f54d76edb600283 Mon Sep 17 00:00:00 2001
From: Aleksei Iupinov <a.yupinov@gmail.com>
Date: Thu, 16 Nov 2017 18:07:05 +0100
Subject: [PATCH] Enable separate PME GPU rank

This relaxes a few task assignment restrictions to allow
a simulation with a single separate PME GPU rank to work. E.g.

gmx mdrun -pme gpu -ntpmi 4 -npme 1

works if all the single rank PME GPU requirements are satisfied.

Default behavior is not changed, new behavior is opt-in.

The existing single-rank PME command line sanity test is altered
to also be included in multi-rank tests, testing a separate PME rank.

Change-Id: I82f3950b0e04b0bc21843a8124a9bd5c68b15024
---
 src/gromacs/taskassignment/decidegpuusage.cpp   | 25 ++++----
 src/gromacs/taskassignment/decidegpuusage.h     |  8 ++-
 src/gromacs/taskassignment/resourcedivision.cpp |  7 +--
 src/programs/mdrun/runner.cpp                   | 28 +++++----
 src/programs/mdrun/tests/CMakeLists.txt         |  5 +-
 src/programs/mdrun/tests/pmetest.cpp            | 80 ++++++++++++++++++-------
 6 files changed, 102 insertions(+), 51 deletions(-)
diff --git a/src/gromacs/taskassignment/decidegpuusage.cpp b/src/gromacs/taskassignment/decidegpuusage.cpp
index d20ca674d6..6002bb913b 100644
--- a/src/gromacs/taskassignment/decidegpuusage.cpp
+++ b/src/gromacs/taskassignment/decidegpuusage.cpp
@@ -152,7 +152,8 @@ decideWhetherToUseGpusForPmeWithThreadMpi(const bool              useGpuForNonbo
                                           const std::vector<int> &gpuIdsToUse,
                                           const std::vector<int> &userGpuTaskAssignment,
                                           const bool              canUseGpuForPme,
-                                          const int               numRanksPerSimulation)
+                                          const int               numRanksPerSimulation,
+                                          const int               numPmeRanksPerSimulation)
 {
     // First, exclude all cases where we can't run PME on GPUs.
     if ((pmeTarget == TaskTarget::Cpu) ||
@@ -182,10 +183,11 @@ decideWhetherToUseGpusForPmeWithThreadMpi(const bool              useGpuForNonbo
         // PME on GPUs is only supported in a single case
         if (pmeTarget == TaskTarget::Gpu)
         {
-            if (numRanksPerSimulation > 1)
+            if (((numRanksPerSimulation > 1) && (numPmeRanksPerSimulation == 0)) ||
+                (numPmeRanksPerSimulation > 1))
             {
                 GMX_THROW(InconsistentInputError
-                              ("When you run mdrun -pme gpu -gputasks, you must supply a PME .tpr file and use a single rank."));
+                              ("When you run mdrun -pme gpu -gputasks, you must supply a PME-enabled .tpr file and use a single PME rank."));
             }
             return true;
         }
@@ -200,12 +202,13 @@ decideWhetherToUseGpusForPmeWithThreadMpi(const bool              useGpuForNonbo
 
     if (pmeTarget == TaskTarget::Gpu)
     {
-        if (numRanksPerSimulation > 1)
+        if (((numRanksPerSimulation > 1) && (numPmeRanksPerSimulation == 0)) ||
+            (numPmeRanksPerSimulation > 1))
         {
             GMX_THROW(NotImplementedError
                           ("PME tasks were required to run on GPUs, but that is not implemented with "
-                          "more than one rank. Use a single rank, or permit PME tasks to be assigned "
-                          "to the CPU."));
+                          "more than one PME rank. Use a single rank simulation, or a separate PME rank, "
+                          "or permit PME tasks to be assigned to the CPU."));
         }
         return true;
     }
@@ -316,7 +319,8 @@ bool decideWhetherToUseGpusForPme(const bool              useGpuForNonbonded,
                                   const TaskTarget        pmeTarget,
                                   const std::vector<int> &userGpuTaskAssignment,
                                   const bool              canUseGpuForPme,
-                                  const int               numRanksPerSimulation)
+                                  const int               numRanksPerSimulation,
+                                  const int               numPmeRanksPerSimulation)
 {
     if (pmeTarget == TaskTarget::Cpu)
     {
@@ -374,12 +378,13 @@ bool decideWhetherToUseGpusForPme(const bool              useGpuForNonbonded,
 
     if (pmeTarget == TaskTarget::Gpu)
     {
-        if (numRanksPerSimulation > 1)
+        if (((numRanksPerSimulation > 1) && (numPmeRanksPerSimulation == 0)) ||
+            (numPmeRanksPerSimulation > 1))
         {
             GMX_THROW(NotImplementedError
                           ("PME tasks were required to run on GPUs, but that is not implemented with "
-                          "more than one rank. Use a single rank, or permit PME tasks to be assigned "
-                          "to the CPU."));
+                          "more than one PME rank. Use a single rank simulation, or a separate PME rank, "
+                          "or permit PME tasks to be assigned to the CPU."));
         }
         return true;
     }
diff --git a/src/gromacs/taskassignment/decidegpuusage.h b/src/gromacs/taskassignment/decidegpuusage.h
index a1738429b4..438e00b8a6 100644
--- a/src/gromacs/taskassignment/decidegpuusage.h
+++ b/src/gromacs/taskassignment/decidegpuusage.h
@@ -102,6 +102,7 @@ bool decideWhetherToUseGpusForNonbondedWithThreadMpi(const TaskTarget          n
  * \param[in]  userGpuTaskAssignment     The user-specified assignment of GPU tasks to device IDs.
  * \param[in]  canUseGpuForPme           Whether the form of PME chosen can run on a GPU
  * \param[in]  numRanksPerSimulation     The number of ranks in each simulation.
+ * \param[in]  numPmeRanksPerSimulation  The number of PME ranks in each simulation.
  *
  * \returns    Whether the simulation will run PME tasks on GPUs.
  *
@@ -112,7 +113,8 @@ bool decideWhetherToUseGpusForPmeWithThreadMpi(const bool              useGpuFor
                                                const std::vector<int> &gpuIdsToUse,
                                                const std::vector<int> &userGpuTaskAssignment,
                                                const bool              canUseGpuForPme,
-                                               const int               numRanksPerSimulation);
+                                               const int               numRanksPerSimulation,
+                                               const int               numPmeRanksPerSimulation);
 
 /*! \brief Decide whether the simulation will try to run nonbonded
  * tasks on GPUs.
@@ -169,6 +171,7 @@ bool decideWhetherToUseGpusForNonbonded(const TaskTarget           nonbondedTarg
  * \param[in]  userGpuTaskAssignment     The user-specified assignment of GPU tasks to device IDs.
  * \param[in]  canUseGpuForPme           Whether the form of PME chosen can run on a GPU
  * \param[in]  numRanksPerSimulation     The number of ranks in each simulation.
+ * \param[in]  numPmeRanksPerSimulation  The number of PME ranks in each simulation.
  *
  * \returns    Whether the simulation will run nonbonded and PME tasks, respectively, on GPUs.
  *
@@ -178,7 +181,8 @@ bool decideWhetherToUseGpusForPme(const bool              useGpuForNonbonded,
                                   const TaskTarget        pmeTarget,
                                   const std::vector<int> &userGpuTaskAssignment,
                                   const bool              canUseGpuForPme,
-                                  const int               numRanksPerSimulation);
+                                  const int               numRanksPerSimulation,
+                                  const int               numPmeRanksPerSimulation);
 
 }
 
diff --git a/src/gromacs/taskassignment/resourcedivision.cpp b/src/gromacs/taskassignment/resourcedivision.cpp
index aae46bd895..253e2b54bf 100644
--- a/src/gromacs/taskassignment/resourcedivision.cpp
+++ b/src/gromacs/taskassignment/resourcedivision.cpp
@@ -354,15 +354,12 @@ int get_nthreads_mpi(const gmx_hw_info_t    *hwinfo,
                            pme_gpu_supports_input(inputrec, nullptr),
                            "PME can't be on GPUs unless we are using PME");
 
-        // A single rank is all that is supported with PME on GPUs
+        // PME on GPUs supports a single PME rank with PP running on the same or few other ranks.
+        // For now, let's treat separate PME GPU rank as opt-in.
         if (hw_opt->nthreads_tmpi < 1)
         {
             return 1;
         }
-        if (hw_opt->nthreads_tmpi > 1)
-        {
-            gmx_fatal(FARGS, "PME on GPUs is only supported with a single rank");
-        }
     }
 
     {
diff --git a/src/programs/mdrun/runner.cpp b/src/programs/mdrun/runner.cpp
index 58059f5c4c..15631660b0 100644
--- a/src/programs/mdrun/runner.cpp
+++ b/src/programs/mdrun/runner.cpp
@@ -600,7 +600,7 @@ int Mdrunner::mdrunner()
             auto canUseGpuForPme   = inputSystemHasPme && pme_gpu_supports_input(inputrec, nullptr);
             useGpuForPme = decideWhetherToUseGpusForPmeWithThreadMpi
                     (useGpuForNonbonded, pmeTarget, gpuIdsToUse, userGpuTaskAssignment,
-                    canUseGpuForPme, hw_opt.nthreads_tmpi);
+                    canUseGpuForPme, hw_opt.nthreads_tmpi, domdecOptions.numPmeRanks);
         }
         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
         /* Determine how many thread-MPI ranks to start.
@@ -652,7 +652,7 @@ int Mdrunner::mdrunner()
                                                                 gpuAccelerationOfNonbondedIsUseful(mdlog, inputrec, doRerun));
         auto inputSystemHasPme = EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype);
         auto canUseGpuForPme   = inputSystemHasPme && pme_gpu_supports_input(inputrec, nullptr);
-        useGpuForPme = decideWhetherToUseGpusForPme(useGpuForNonbonded, pmeTarget, userGpuTaskAssignment, canUseGpuForPme, cr->nnodes);
+        useGpuForPme = decideWhetherToUseGpusForPme(useGpuForNonbonded, pmeTarget, userGpuTaskAssignment, canUseGpuForPme, cr->nnodes, domdecOptions.numPmeRanks);
         pmeRunMode   = (useGpuForPme ? PmeRunMode::GPU : PmeRunMode::CPU);
         if ((pmeRunMode == PmeRunMode::GPU) && (pmeFftTarget == TaskTarget::Cpu))
         {
@@ -734,21 +734,30 @@ int Mdrunner::mdrunner()
 
     if (useGpuForNonbonded && domdecOptions.numPmeRanks < 0)
     {
-        /* With GPUs we don't automatically use PME-only ranks. PME ranks can
+        /* With NB GPUs we don't automatically use PME-only CPU ranks. PME ranks can
          * improve performance with many threads per GPU, since our OpenMP
          * scaling is bad, but it's difficult to automate the setup.
          */
-        domdecOptions.numPmeRanks = 0;
+        if (useGpuForPme && (hw_opt.nthreads_tmpi > 1))
+        {
+            domdecOptions.numPmeRanks = 1;
+            //TODO print appropriate notice on why asking for multiple threads and -pme gpu causes a separate PME rank to start
+        }
+        else
+        {
+            domdecOptions.numPmeRanks = 0;
+        }
     }
     if (useGpuForPme)
     {
         if (domdecOptions.numPmeRanks < 0)
         {
-            domdecOptions.numPmeRanks = 0; // separate GPU ranks not supported
+            domdecOptions.numPmeRanks = 0;
+            // TODO possibly print a note that one can opt-in for a separate PME GPU rank?
         }
         else
         {
-            GMX_RELEASE_ASSERT(domdecOptions.numPmeRanks == 0, "Separate PME GPU ranks are not yet supported");
+            GMX_RELEASE_ASSERT(domdecOptions.numPmeRanks <= 1, "PME GPU decomposition is not supported");
         }
     }
 
@@ -1045,6 +1054,7 @@ int Mdrunner::mdrunner()
     if (pmeGpuTaskMapping != gpuTaskAssignment.end())
     {
         pmeDeviceInfo = getDeviceInfo(hwinfo->gpu_info, pmeGpuTaskMapping->deviceId_);
+        init_gpu(mdlog, pmeDeviceInfo);
     }
 
     /* getting number of PP/PME threads
@@ -1212,12 +1222,6 @@ int Mdrunner::mdrunner()
         {
             try
             {
-                if (pmeDeviceInfo != nullptr && pmeDeviceInfo != nonbondedDeviceInfo)
-                {
-                    GMX_THROW(NotImplementedError
-                                  ("PME on a GPU can run only on the same GPU as nonbonded, because "
-                                  "context switching is not yet supported."));
-                }
                 pmedata = gmx_pme_init(cr, npme_major, npme_minor, inputrec,
                                        mtop ? mtop->natoms : 0, nChargePerturbed, nTypePerturbed,
                                        mdrunOptions.reproducible,
diff --git a/src/programs/mdrun/tests/CMakeLists.txt b/src/programs/mdrun/tests/CMakeLists.txt
index 675277e65d..8bf1a7d781 100644
--- a/src/programs/mdrun/tests/CMakeLists.txt
+++ b/src/programs/mdrun/tests/CMakeLists.txt
@@ -34,9 +34,12 @@
 
 # make an "object library" for code that we re-use for both kinds of tests
 add_library(mdrun_test_objlib OBJECT
+    energyreader.cpp
     mdruncomparisonfixture.cpp
     moduletest.cpp
     terminationhelper.cpp
+    # PME tests
+    pmetest.cpp
     )
 
 set(testname "MdrunTests")
@@ -46,10 +49,8 @@ gmx_add_gtest_executable(
     ${exename}
     # files with code for tests
     tabulated_bonded_interactions.cpp
-    energyreader.cpp
     grompp.cpp
     initialconstraints.cpp
-    pmetest.cpp
     rerun.cpp
     trajectory_writing.cpp
     trajectoryreader.cpp
diff --git a/src/programs/mdrun/tests/pmetest.cpp b/src/programs/mdrun/tests/pmetest.cpp
index b91d71d543..e1acdc8dee 100644
--- a/src/programs/mdrun/tests/pmetest.cpp
+++ b/src/programs/mdrun/tests/pmetest.cpp
@@ -34,10 +34,14 @@
  */
 /*! \internal \file
  * \brief
- * This implements basic PME sanity test (using single-rank mdrun).
+ * This implements basic PME sanity tests.
  * It runs the input system with PME for several steps (on CPU and GPU, if available),
  * and checks the reciprocal and conserved energies.
- * TODO: implement multi-rank tests as well.
+ * As part of mdrun-test, this will always run single rank PME simulation.
+ * As part of mdrun-mpi-test, this will run same as above when a single rank is requested,
+ * or a simulation with a single separate PME rank ("-npme 1") when multiple ranks are requested.
+ * \todo Extend and generalize this for more multi-rank tests (-npme 0, -npme 2, etc).
+ * \todo Implement death tests (e.g. for PME GPU decomposition).
  *
  * \author Aleksei Iupinov <a.yupinov@gmail.com>
  * \ingroup module_mdrun_integration_tests
@@ -50,11 +54,15 @@
 #include <string>
 #include <vector>
 
+#include <gtest/gtest-spi.h>
+
 #include "gromacs/gpu_utils/gpu_utils.h"
 #include "gromacs/hardware/gpu_hw_info.h"
 #include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/gmxmpi.h"
 #include "gromacs/utility/stringutil.h"
 
+#include "testutils/mpitest.h"
 #include "testutils/refdata.h"
 
 #include "energyreader.h"
@@ -111,6 +119,11 @@ TEST_F(PmeTest, ReproducesEnergies)
     const std::string inputFile = "spc-and-methanol";
     runner_.useTopGroAndNdxFromDatabase(inputFile.c_str());
 
+    // With single rank we can and will always test PP+PME as part of mdrun-test.
+    // With multiple ranks we can additionally test a single PME-only rank within mdrun-mpi-test.
+    const bool parallelRun    = (getNumberOfTestMpiRanks() > 1);
+    const bool useSeparatePme = parallelRun;
+
     EXPECT_EQ(0, runner_.callGrompp());
 
     //TODO test all proper/improper combinations in more thorough way?
@@ -122,7 +135,11 @@ TEST_F(PmeTest, ReproducesEnergies)
     runModes["PmeOnGpuFftAuto"]  = {"-pme", "gpu", "-pmefft", "auto"};
     TestReferenceData    refData;
     TestReferenceChecker rootChecker(refData.rootChecker());
-
+    const bool           thisRankChecks = (gmx_node_rank() == 0);
+    if (!thisRankChecks)
+    {
+        EXPECT_NONFATAL_FAILURE(rootChecker.checkUnusedEntries(), ""); // skip checks on other ranks
+    }
     for (const auto &mode : runModes)
     {
         auto modeTargetsGpus = (mode.first.find("Gpu") != std::string::npos);
@@ -138,30 +155,53 @@ TEST_F(PmeTest, ReproducesEnergies)
 
         CommandLine commandLine(mode.second);
         commandLine.append("-notunepme"); // for reciprocal energy reproducibility
+        if (useSeparatePme)
+        {
+            commandLine.addOption("-npme", 1);
+        }
         ASSERT_EQ(0, runner_.callMdrun(commandLine));
 
-        auto energyReader      = openEnergyFileToReadFields(runner_.edrFileName_, {"Coul. recip.", "Total Energy", "Kinetic En."});
-        auto conservedChecker  = rootChecker.checkCompound("Energy", "Conserved");
-        auto reciprocalChecker = rootChecker.checkCompound("Energy", "Reciprocal");
-        for (int i = 0; i <= nsteps; i++)
+        if (thisRankChecks)
         {
-            EnergyFrame frame            = energyReader->frame();
-            std::string stepNum          = gmx::formatString("%d", i);
-            const real  conservedEnergy  = frame.at("Total Energy");
-            const real  reciprocalEnergy = frame.at("Coul. recip.");
-            if (i == 0)
+            auto energyReader      = openEnergyFileToReadFields(runner_.edrFileName_, {"Coul. recip.", "Total Energy", "Kinetic En."});
+            auto conservedChecker  = rootChecker.checkCompound("Energy", "Conserved");
+            auto reciprocalChecker = rootChecker.checkCompound("Energy", "Reciprocal");
+            for (int i = 0; i <= nsteps; i++)
             {
-                // use first step values as references for tolerance
-                const real startingKineticEnergy = frame.at("Kinetic En.");
-                const auto conservedTolerance    = relativeToleranceAsFloatingPoint(startingKineticEnergy, 2e-5);
-                const auto reciprocalTolerance   = relativeToleranceAsFloatingPoint(reciprocalEnergy, 2e-5);
-                reciprocalChecker.setDefaultTolerance(reciprocalTolerance);
-                conservedChecker.setDefaultTolerance(conservedTolerance);
+                EnergyFrame frame            = energyReader->frame();
+                std::string stepNum          = gmx::formatString("%d", i);
+                const real  conservedEnergy  = frame.at("Total Energy");
+                const real  reciprocalEnergy = frame.at("Coul. recip.");
+                if (i == 0)
+                {
+                    // use first step values as references for tolerance
+                    const real startingKineticEnergy = frame.at("Kinetic En.");
+                    const auto conservedTolerance    = relativeToleranceAsFloatingPoint(startingKineticEnergy, 2e-5);
+                    const auto reciprocalTolerance   = relativeToleranceAsFloatingPoint(reciprocalEnergy, 3e-5);
+                    reciprocalChecker.setDefaultTolerance(reciprocalTolerance);
+                    conservedChecker.setDefaultTolerance(conservedTolerance);
+                }
+                conservedChecker.checkReal(conservedEnergy, stepNum.c_str());
+                reciprocalChecker.checkReal(reciprocalEnergy, stepNum.c_str());
             }
-            conservedChecker.checkReal(conservedEnergy, stepNum.c_str());
-            reciprocalChecker.checkReal(reciprocalEnergy, stepNum.c_str());
         }
+        // FIXME: without this barrier, one of the mdruns was somehow having a non-PME inputrec (!)
+#if GMX_LIB_MPI
+        if (parallelRun)
+        {
+            MPI_Barrier(MPI_COMM_WORLD);
+        }
+#endif
+    }
+
+    // This is a workaround for the output files to not be deleted in a parallel run.
+    // TODO: consider moving the barrier into the file manager destructor.
+#if GMX_LIB_MPI
+    if (parallelRun)
+    {
+        MPI_Barrier(MPI_COMM_WORLD);
     }
+#endif
 }
 
 }
-- 
2.11.4.GIT