From 83fe36cb0c2e08065ef3b3152f54d76edb600283 Mon Sep 17 00:00:00 2001 From: Aleksei Iupinov Date: Thu, 16 Nov 2017 18:07:05 +0100 Subject: [PATCH] Enable separate PME GPU rank This relaxes a few task assignment restrictions to allow a simulation with a single separate PME GPU rank to work. E.g. gmx mdrun -pme gpu -ntpmi 4 -npme 1 works if all the single rank PME GPU requirements are satisfied. Default behavior is not changed, new behavior is opt-in. The existing single-rank PME command line sanity test is altered to also be included in multi-rank tests, testing a separate PME rank. Change-Id: I82f3950b0e04b0bc21843a8124a9bd5c68b15024 --- src/gromacs/taskassignment/decidegpuusage.cpp | 25 ++++---- src/gromacs/taskassignment/decidegpuusage.h | 8 ++- src/gromacs/taskassignment/resourcedivision.cpp | 7 +-- src/programs/mdrun/runner.cpp | 28 +++++---- src/programs/mdrun/tests/CMakeLists.txt | 5 +- src/programs/mdrun/tests/pmetest.cpp | 80 ++++++++++++++++++------- 6 files changed, 102 insertions(+), 51 deletions(-) diff --git a/src/gromacs/taskassignment/decidegpuusage.cpp b/src/gromacs/taskassignment/decidegpuusage.cpp index d20ca674d6..6002bb913b 100644 --- a/src/gromacs/taskassignment/decidegpuusage.cpp +++ b/src/gromacs/taskassignment/decidegpuusage.cpp @@ -152,7 +152,8 @@ decideWhetherToUseGpusForPmeWithThreadMpi(const bool useGpuForNonbo const std::vector &gpuIdsToUse, const std::vector &userGpuTaskAssignment, const bool canUseGpuForPme, - const int numRanksPerSimulation) + const int numRanksPerSimulation, + const int numPmeRanksPerSimulation) { // First, exclude all cases where we can't run PME on GPUs. if ((pmeTarget == TaskTarget::Cpu) || @@ -182,10 +183,11 @@ decideWhetherToUseGpusForPmeWithThreadMpi(const bool useGpuForNonbo // PME on GPUs is only supported in a single case if (pmeTarget == TaskTarget::Gpu) { - if (numRanksPerSimulation > 1) + if (((numRanksPerSimulation > 1) && (numPmeRanksPerSimulation == 0)) || + (numPmeRanksPerSimulation > 1)) { GMX_THROW(InconsistentInputError - ("When you run mdrun -pme gpu -gputasks, you must supply a PME .tpr file and use a single rank.")); + ("When you run mdrun -pme gpu -gputasks, you must supply a PME-enabled .tpr file and use a single PME rank.")); } return true; } @@ -200,12 +202,13 @@ decideWhetherToUseGpusForPmeWithThreadMpi(const bool useGpuForNonbo if (pmeTarget == TaskTarget::Gpu) { - if (numRanksPerSimulation > 1) + if (((numRanksPerSimulation > 1) && (numPmeRanksPerSimulation == 0)) || + (numPmeRanksPerSimulation > 1)) { GMX_THROW(NotImplementedError ("PME tasks were required to run on GPUs, but that is not implemented with " - "more than one rank. Use a single rank, or permit PME tasks to be assigned " - "to the CPU.")); + "more than one PME rank. Use a single rank simulation, or a separate PME rank, " + "or permit PME tasks to be assigned to the CPU.")); } return true; } @@ -316,7 +319,8 @@ bool decideWhetherToUseGpusForPme(const bool useGpuForNonbonded, const TaskTarget pmeTarget, const std::vector &userGpuTaskAssignment, const bool canUseGpuForPme, - const int numRanksPerSimulation) + const int numRanksPerSimulation, + const int numPmeRanksPerSimulation) { if (pmeTarget == TaskTarget::Cpu) { @@ -374,12 +378,13 @@ bool decideWhetherToUseGpusForPme(const bool useGpuForNonbonded, if (pmeTarget == TaskTarget::Gpu) { - if (numRanksPerSimulation > 1) + if (((numRanksPerSimulation > 1) && (numPmeRanksPerSimulation == 0)) || + (numPmeRanksPerSimulation > 1)) { GMX_THROW(NotImplementedError ("PME tasks were required to run on GPUs, but that is not implemented with " - "more than one rank. Use a single rank, or permit PME tasks to be assigned " - "to the CPU.")); + "more than one PME rank. Use a single rank simulation, or a separate PME rank, " + "or permit PME tasks to be assigned to the CPU.")); } return true; } diff --git a/src/gromacs/taskassignment/decidegpuusage.h b/src/gromacs/taskassignment/decidegpuusage.h index a1738429b4..438e00b8a6 100644 --- a/src/gromacs/taskassignment/decidegpuusage.h +++ b/src/gromacs/taskassignment/decidegpuusage.h @@ -102,6 +102,7 @@ bool decideWhetherToUseGpusForNonbondedWithThreadMpi(const TaskTarget n * \param[in] userGpuTaskAssignment The user-specified assignment of GPU tasks to device IDs. * \param[in] canUseGpuForPme Whether the form of PME chosen can run on a GPU * \param[in] numRanksPerSimulation The number of ranks in each simulation. + * \param[in] numPmeRanksPerSimulation The number of PME ranks in each simulation. * * \returns Whether the simulation will run PME tasks on GPUs. * @@ -112,7 +113,8 @@ bool decideWhetherToUseGpusForPmeWithThreadMpi(const bool useGpuFor const std::vector &gpuIdsToUse, const std::vector &userGpuTaskAssignment, const bool canUseGpuForPme, - const int numRanksPerSimulation); + const int numRanksPerSimulation, + const int numPmeRanksPerSimulation); /*! \brief Decide whether the simulation will try to run nonbonded * tasks on GPUs. @@ -169,6 +171,7 @@ bool decideWhetherToUseGpusForNonbonded(const TaskTarget nonbondedTarg * \param[in] userGpuTaskAssignment The user-specified assignment of GPU tasks to device IDs. * \param[in] canUseGpuForPme Whether the form of PME chosen can run on a GPU * \param[in] numRanksPerSimulation The number of ranks in each simulation. + * \param[in] numPmeRanksPerSimulation The number of PME ranks in each simulation. * * \returns Whether the simulation will run nonbonded and PME tasks, respectively, on GPUs. * @@ -178,7 +181,8 @@ bool decideWhetherToUseGpusForPme(const bool useGpuForNonbonded, const TaskTarget pmeTarget, const std::vector &userGpuTaskAssignment, const bool canUseGpuForPme, - const int numRanksPerSimulation); + const int numRanksPerSimulation, + const int numPmeRanksPerSimulation); } diff --git a/src/gromacs/taskassignment/resourcedivision.cpp b/src/gromacs/taskassignment/resourcedivision.cpp index aae46bd895..253e2b54bf 100644 --- a/src/gromacs/taskassignment/resourcedivision.cpp +++ b/src/gromacs/taskassignment/resourcedivision.cpp @@ -354,15 +354,12 @@ int get_nthreads_mpi(const gmx_hw_info_t *hwinfo, pme_gpu_supports_input(inputrec, nullptr), "PME can't be on GPUs unless we are using PME"); - // A single rank is all that is supported with PME on GPUs + // PME on GPUs supports a single PME rank with PP running on the same or few other ranks. + // For now, let's treat separate PME GPU rank as opt-in. if (hw_opt->nthreads_tmpi < 1) { return 1; } - if (hw_opt->nthreads_tmpi > 1) - { - gmx_fatal(FARGS, "PME on GPUs is only supported with a single rank"); - } } { diff --git a/src/programs/mdrun/runner.cpp b/src/programs/mdrun/runner.cpp index 58059f5c4c..15631660b0 100644 --- a/src/programs/mdrun/runner.cpp +++ b/src/programs/mdrun/runner.cpp @@ -600,7 +600,7 @@ int Mdrunner::mdrunner() auto canUseGpuForPme = inputSystemHasPme && pme_gpu_supports_input(inputrec, nullptr); useGpuForPme = decideWhetherToUseGpusForPmeWithThreadMpi (useGpuForNonbonded, pmeTarget, gpuIdsToUse, userGpuTaskAssignment, - canUseGpuForPme, hw_opt.nthreads_tmpi); + canUseGpuForPme, hw_opt.nthreads_tmpi, domdecOptions.numPmeRanks); } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; /* Determine how many thread-MPI ranks to start. @@ -652,7 +652,7 @@ int Mdrunner::mdrunner() gpuAccelerationOfNonbondedIsUseful(mdlog, inputrec, doRerun)); auto inputSystemHasPme = EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype); auto canUseGpuForPme = inputSystemHasPme && pme_gpu_supports_input(inputrec, nullptr); - useGpuForPme = decideWhetherToUseGpusForPme(useGpuForNonbonded, pmeTarget, userGpuTaskAssignment, canUseGpuForPme, cr->nnodes); + useGpuForPme = decideWhetherToUseGpusForPme(useGpuForNonbonded, pmeTarget, userGpuTaskAssignment, canUseGpuForPme, cr->nnodes, domdecOptions.numPmeRanks); pmeRunMode = (useGpuForPme ? PmeRunMode::GPU : PmeRunMode::CPU); if ((pmeRunMode == PmeRunMode::GPU) && (pmeFftTarget == TaskTarget::Cpu)) { @@ -734,21 +734,30 @@ int Mdrunner::mdrunner() if (useGpuForNonbonded && domdecOptions.numPmeRanks < 0) { - /* With GPUs we don't automatically use PME-only ranks. PME ranks can + /* With NB GPUs we don't automatically use PME-only CPU ranks. PME ranks can * improve performance with many threads per GPU, since our OpenMP * scaling is bad, but it's difficult to automate the setup. */ - domdecOptions.numPmeRanks = 0; + if (useGpuForPme && (hw_opt.nthreads_tmpi > 1)) + { + domdecOptions.numPmeRanks = 1; + //TODO print appropriate notice on why asking for multiple threads and -pme gpu causes a separate PME rank to start + } + else + { + domdecOptions.numPmeRanks = 0; + } } if (useGpuForPme) { if (domdecOptions.numPmeRanks < 0) { - domdecOptions.numPmeRanks = 0; // separate GPU ranks not supported + domdecOptions.numPmeRanks = 0; + // TODO possibly print a note that one can opt-in for a separate PME GPU rank? } else { - GMX_RELEASE_ASSERT(domdecOptions.numPmeRanks == 0, "Separate PME GPU ranks are not yet supported"); + GMX_RELEASE_ASSERT(domdecOptions.numPmeRanks <= 1, "PME GPU decomposition is not supported"); } } @@ -1045,6 +1054,7 @@ int Mdrunner::mdrunner() if (pmeGpuTaskMapping != gpuTaskAssignment.end()) { pmeDeviceInfo = getDeviceInfo(hwinfo->gpu_info, pmeGpuTaskMapping->deviceId_); + init_gpu(mdlog, pmeDeviceInfo); } /* getting number of PP/PME threads @@ -1212,12 +1222,6 @@ int Mdrunner::mdrunner() { try { - if (pmeDeviceInfo != nullptr && pmeDeviceInfo != nonbondedDeviceInfo) - { - GMX_THROW(NotImplementedError - ("PME on a GPU can run only on the same GPU as nonbonded, because " - "context switching is not yet supported.")); - } pmedata = gmx_pme_init(cr, npme_major, npme_minor, inputrec, mtop ? mtop->natoms : 0, nChargePerturbed, nTypePerturbed, mdrunOptions.reproducible, diff --git a/src/programs/mdrun/tests/CMakeLists.txt b/src/programs/mdrun/tests/CMakeLists.txt index 675277e65d..8bf1a7d781 100644 --- a/src/programs/mdrun/tests/CMakeLists.txt +++ b/src/programs/mdrun/tests/CMakeLists.txt @@ -34,9 +34,12 @@ # make an "object library" for code that we re-use for both kinds of tests add_library(mdrun_test_objlib OBJECT + energyreader.cpp mdruncomparisonfixture.cpp moduletest.cpp terminationhelper.cpp + # PME tests + pmetest.cpp ) set(testname "MdrunTests") @@ -46,10 +49,8 @@ gmx_add_gtest_executable( ${exename} # files with code for tests tabulated_bonded_interactions.cpp - energyreader.cpp grompp.cpp initialconstraints.cpp - pmetest.cpp rerun.cpp trajectory_writing.cpp trajectoryreader.cpp diff --git a/src/programs/mdrun/tests/pmetest.cpp b/src/programs/mdrun/tests/pmetest.cpp index b91d71d543..e1acdc8dee 100644 --- a/src/programs/mdrun/tests/pmetest.cpp +++ b/src/programs/mdrun/tests/pmetest.cpp @@ -34,10 +34,14 @@ */ /*! \internal \file * \brief - * This implements basic PME sanity test (using single-rank mdrun). + * This implements basic PME sanity tests. * It runs the input system with PME for several steps (on CPU and GPU, if available), * and checks the reciprocal and conserved energies. - * TODO: implement multi-rank tests as well. + * As part of mdrun-test, this will always run single rank PME simulation. + * As part of mdrun-mpi-test, this will run same as above when a single rank is requested, + * or a simulation with a single separate PME rank ("-npme 1") when multiple ranks are requested. + * \todo Extend and generalize this for more multi-rank tests (-npme 0, -npme 2, etc). + * \todo Implement death tests (e.g. for PME GPU decomposition). * * \author Aleksei Iupinov * \ingroup module_mdrun_integration_tests @@ -50,11 +54,15 @@ #include #include +#include + #include "gromacs/gpu_utils/gpu_utils.h" #include "gromacs/hardware/gpu_hw_info.h" #include "gromacs/utility/cstringutil.h" +#include "gromacs/utility/gmxmpi.h" #include "gromacs/utility/stringutil.h" +#include "testutils/mpitest.h" #include "testutils/refdata.h" #include "energyreader.h" @@ -111,6 +119,11 @@ TEST_F(PmeTest, ReproducesEnergies) const std::string inputFile = "spc-and-methanol"; runner_.useTopGroAndNdxFromDatabase(inputFile.c_str()); + // With single rank we can and will always test PP+PME as part of mdrun-test. + // With multiple ranks we can additionally test a single PME-only rank within mdrun-mpi-test. + const bool parallelRun = (getNumberOfTestMpiRanks() > 1); + const bool useSeparatePme = parallelRun; + EXPECT_EQ(0, runner_.callGrompp()); //TODO test all proper/improper combinations in more thorough way? @@ -122,7 +135,11 @@ TEST_F(PmeTest, ReproducesEnergies) runModes["PmeOnGpuFftAuto"] = {"-pme", "gpu", "-pmefft", "auto"}; TestReferenceData refData; TestReferenceChecker rootChecker(refData.rootChecker()); - + const bool thisRankChecks = (gmx_node_rank() == 0); + if (!thisRankChecks) + { + EXPECT_NONFATAL_FAILURE(rootChecker.checkUnusedEntries(), ""); // skip checks on other ranks + } for (const auto &mode : runModes) { auto modeTargetsGpus = (mode.first.find("Gpu") != std::string::npos); @@ -138,30 +155,53 @@ TEST_F(PmeTest, ReproducesEnergies) CommandLine commandLine(mode.second); commandLine.append("-notunepme"); // for reciprocal energy reproducibility + if (useSeparatePme) + { + commandLine.addOption("-npme", 1); + } ASSERT_EQ(0, runner_.callMdrun(commandLine)); - auto energyReader = openEnergyFileToReadFields(runner_.edrFileName_, {"Coul. recip.", "Total Energy", "Kinetic En."}); - auto conservedChecker = rootChecker.checkCompound("Energy", "Conserved"); - auto reciprocalChecker = rootChecker.checkCompound("Energy", "Reciprocal"); - for (int i = 0; i <= nsteps; i++) + if (thisRankChecks) { - EnergyFrame frame = energyReader->frame(); - std::string stepNum = gmx::formatString("%d", i); - const real conservedEnergy = frame.at("Total Energy"); - const real reciprocalEnergy = frame.at("Coul. recip."); - if (i == 0) + auto energyReader = openEnergyFileToReadFields(runner_.edrFileName_, {"Coul. recip.", "Total Energy", "Kinetic En."}); + auto conservedChecker = rootChecker.checkCompound("Energy", "Conserved"); + auto reciprocalChecker = rootChecker.checkCompound("Energy", "Reciprocal"); + for (int i = 0; i <= nsteps; i++) { - // use first step values as references for tolerance - const real startingKineticEnergy = frame.at("Kinetic En."); - const auto conservedTolerance = relativeToleranceAsFloatingPoint(startingKineticEnergy, 2e-5); - const auto reciprocalTolerance = relativeToleranceAsFloatingPoint(reciprocalEnergy, 2e-5); - reciprocalChecker.setDefaultTolerance(reciprocalTolerance); - conservedChecker.setDefaultTolerance(conservedTolerance); + EnergyFrame frame = energyReader->frame(); + std::string stepNum = gmx::formatString("%d", i); + const real conservedEnergy = frame.at("Total Energy"); + const real reciprocalEnergy = frame.at("Coul. recip."); + if (i == 0) + { + // use first step values as references for tolerance + const real startingKineticEnergy = frame.at("Kinetic En."); + const auto conservedTolerance = relativeToleranceAsFloatingPoint(startingKineticEnergy, 2e-5); + const auto reciprocalTolerance = relativeToleranceAsFloatingPoint(reciprocalEnergy, 3e-5); + reciprocalChecker.setDefaultTolerance(reciprocalTolerance); + conservedChecker.setDefaultTolerance(conservedTolerance); + } + conservedChecker.checkReal(conservedEnergy, stepNum.c_str()); + reciprocalChecker.checkReal(reciprocalEnergy, stepNum.c_str()); } - conservedChecker.checkReal(conservedEnergy, stepNum.c_str()); - reciprocalChecker.checkReal(reciprocalEnergy, stepNum.c_str()); } + // FIXME: without this barrier, one of the mdruns was somehow having a non-PME inputrec (!) +#if GMX_LIB_MPI + if (parallelRun) + { + MPI_Barrier(MPI_COMM_WORLD); + } +#endif + } + + // This is a workaround for the output files to not be deleted in a parallel run. + // TODO: consider moving the barrier into the file manager destructor. +#if GMX_LIB_MPI + if (parallelRun) + { + MPI_Barrier(MPI_COMM_WORLD); } +#endif } } -- 2.11.4.GIT