From 1e84d284ef773006e24748ad1ba096a32653335f Mon Sep 17 00:00:00 2001 From: Mark Abraham Date: Fri, 3 Nov 2017 03:20:21 +0100 Subject: [PATCH] Activate PME on GPUs Adds support for a run with a single rank, running NB and PME on the same GPU. Fixed bug in thread-MPI gather and gatherv functions. Real MPI does not require that the send buffer is non-NULL if the send count is zero, and the thread-MPI docs have not documented that this is an intended extra requirement. This would be a problem only if e.g. memcpy(dest,src,0) itself dereferenced src. TODO user docs Change-Id: Iace9e720c0958eaacaaa81307f1fe7324fa4c9ac --- src/external/thread_mpi/src/gather.c | 4 +-- src/gromacs/ewald/pme.h | 3 +- src/gromacs/gmxlib/network.cpp | 7 ++++ .../mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp | 7 ++-- src/gromacs/taskassignment/taskassignment.cpp | 30 +++++++++++++++- src/gromacs/taskassignment/usergpuids.cpp | 4 +-- src/gromacs/taskassignment/usergpuids.h | 6 ++-- src/programs/mdrun/mdrun.cpp | 16 +++++++-- src/programs/mdrun/runner.cpp | 42 ++++++++++++++++------ src/programs/mdrun/runner.h | 4 +++ src/programs/mdrun/tests/pmetest.cpp | 12 +++---- 11 files changed, 105 insertions(+), 30 deletions(-) diff --git a/src/external/thread_mpi/src/gather.c b/src/external/thread_mpi/src/gather.c index ca928a197a..780f54a966 100644 --- a/src/external/thread_mpi/src/gather.c +++ b/src/external/thread_mpi/src/gather.c @@ -139,7 +139,7 @@ int tMPI_Gather(const void* sendbuf, int sendcount, tMPI_Datatype sendtype, } else { - if (!sendbuf) /* don't do pointer arithmetic on a NULL ptr */ + if (!sendbuf && sendcount > 0) /* don't do pointer arithmetic on a NULL ptr */ { return tMPI_Error(comm, TMPI_ERR_BUF); } @@ -245,7 +245,7 @@ int tMPI_Gatherv(const void* sendbuf, int sendcount, tMPI_Datatype sendtype, } else { - if (!sendbuf) /* don't do pointer arithmetic on a NULL ptr */ + if (!sendbuf && sendcount > 0) /* don't do pointer arithmetic on a NULL ptr */ { return tMPI_Error(comm, TMPI_ERR_BUF); } 
diff --git a/src/gromacs/ewald/pme.h b/src/gromacs/ewald/pme.h index 571f21d4e7..6452b9d091 100644 --- a/src/gromacs/ewald/pme.h +++ b/src/gromacs/ewald/pme.h @@ -76,11 +76,12 @@ enum { GMX_SUM_GRID_FORWARD, GMX_SUM_GRID_BACKWARD }; -/*! \brief Possible PME codepaths +/*! \brief Possible PME codepaths on a rank. * \todo: make this enum class with gmx_pme_t C++ refactoring */ enum PmeRunMode { + None, //!< No PME task is done CPU, //!< Whole PME computation is done on CPU GPU, //!< Whole PME computation is done on GPU Hybrid, //!< Mixed mode: only spread and gather run on GPU; FFT and solving are done on CPU. diff --git a/src/gromacs/gmxlib/network.cpp b/src/gromacs/gmxlib/network.cpp index 082d980d42..d35662d81e 100644 --- a/src/gromacs/gmxlib/network.cpp +++ b/src/gromacs/gmxlib/network.cpp @@ -69,6 +69,13 @@ void gmx_fill_commrec_from_mpi(t_commrec gmx_unused *cr) cr->nnodes = gmx_node_num(); cr->nodeid = gmx_node_rank(); + // TODO This communicator should always be available. Currently we + // make it multiple times, and keep it only when relevant. But the + // cost of an extra communicator is negligible in single-node + // cases (both thread-MPI and real MPI), and we need it in + // all multi-node MPI cases with more than one PP rank per node, + // with and without GPUs. By always having it available, we also + // don't need to protect calls to mpi_comm_physicalnode, etc. if (PAR(cr) || MULTISIM(cr)) { MPI_Comm_split(MPI_COMM_WORLD, gmx_physicalnode_id_hash(), cr->nodeid, &cr->mpi_comm_physicalnode); diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp index e6d2cb15b9..db5a610625 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp @@ -1141,10 +1141,13 @@ static void free_gpu_device_runtime_data(gmx_device_runtime_data_t *runData) //! 
This function is documented in the header file void nbnxn_gpu_free(gmx_nbnxn_ocl_t *nb) { - int kernel_count; + if (nb == NULL) + { + return; + } /* Free kernels */ - kernel_count = sizeof(nb->kernel_ener_noprune_ptr) / sizeof(nb->kernel_ener_noprune_ptr[0][0]); + int kernel_count = sizeof(nb->kernel_ener_noprune_ptr) / sizeof(nb->kernel_ener_noprune_ptr[0][0]); free_kernels((cl_kernel*)nb->kernel_ener_noprune_ptr, kernel_count); kernel_count = sizeof(nb->kernel_ener_prune_ptr) / sizeof(nb->kernel_ener_prune_ptr[0][0]); diff --git a/src/gromacs/taskassignment/taskassignment.cpp b/src/gromacs/taskassignment/taskassignment.cpp index 2b5655fae7..76136c7edd 100644 --- a/src/gromacs/taskassignment/taskassignment.cpp +++ b/src/gromacs/taskassignment/taskassignment.cpp @@ -179,6 +179,23 @@ size_t countGpuTasksOnThisNode(const GpuTasksOnRanks &gpuTasksOnRanksOfThisNode) return numGpuTasksOnThisNode; } +//! Finds whether there is any task of \c queryTask in the tasks on the ranks of this node. +bool hasAnyTaskOfTypeOnThisNode(const GpuTasksOnRanks &gpuTasksOnRanksOfThisNode, + const GpuTask queryTask) +{ + for (const auto &gpuTasksOnRank : gpuTasksOnRanksOfThisNode) + { + for (const auto &gpuTask : gpuTasksOnRank) + { + if (queryTask == gpuTask) + { + return true; + } + } + } + return false; +} + } // namespace GpuTaskAssignments::value_type @@ -219,7 +236,18 @@ runTaskAssignment(const std::vector &gpuIdsToUse, std::vector generatedGpuIds; if (userGpuTaskAssignment.empty()) { - generatedGpuIds = makeGpuIds(gpuIdsToUse, numGpuTasksOnThisNode); + ArrayRef compatibleGpusToUse = gpuIdsToUse; + if (hasAnyTaskOfTypeOnThisNode(gpuTasksOnRanksOfThisNode, GpuTask::Pme)) + { + // PP and PME tasks must run on the same device, so + // restrict the assignment to the first device. If + // there aren't any, then that error is handled later. 
+ if (!compatibleGpusToUse.empty()) + { + compatibleGpusToUse = compatibleGpusToUse.subArray(0, 1); + } + } + generatedGpuIds = makeGpuIds(compatibleGpusToUse, numGpuTasksOnThisNode); gpuIdsForTaskAssignment = generatedGpuIds; } else diff --git a/src/gromacs/taskassignment/usergpuids.cpp b/src/gromacs/taskassignment/usergpuids.cpp index 041e39f1cb..e3e06e0677 100644 --- a/src/gromacs/taskassignment/usergpuids.cpp +++ b/src/gromacs/taskassignment/usergpuids.cpp @@ -102,8 +102,8 @@ parseUserGpuIds(const std::string &gpuIdString) } std::vector -makeGpuIds(const std::vector &compatibleGpus, - size_t numGpuTasks) +makeGpuIds(ArrayRef compatibleGpus, + size_t numGpuTasks) { std::vector gpuIdsToUse; diff --git a/src/gromacs/taskassignment/usergpuids.h b/src/gromacs/taskassignment/usergpuids.h index 2209fe258a..006e6b283e 100644 --- a/src/gromacs/taskassignment/usergpuids.h +++ b/src/gromacs/taskassignment/usergpuids.h @@ -52,6 +52,8 @@ #include #include +#include "gromacs/utility/arrayref.h" + struct gmx_gpu_info_t; namespace gmx @@ -82,8 +84,8 @@ parseUserGpuIds(const std::string &gpuIdString); * length matches that of the number of GPU tasks required. */ std::vector -makeGpuIds(const std::vector &compatibleGpus, - size_t numGpuTasks); +makeGpuIds(ArrayRef compatibleGpus, + size_t numGpuTasks); /*! \brief Convert a container of GPU deviced IDs to a string that * can be used by gmx tune_pme as input to mdrun -gputasks. 
diff --git a/src/programs/mdrun/mdrun.cpp b/src/programs/mdrun/mdrun.cpp index 9af3bb7ca3..0c131c9a16 100644 --- a/src/programs/mdrun/mdrun.cpp +++ b/src/programs/mdrun/mdrun.cpp @@ -250,6 +250,10 @@ int Mdrunner::mainFunction(int argc, char *argv[]) { nullptr, "auto", "on", "off", nullptr }; const char *nbpu_opt_choices[] = { nullptr, "auto", "cpu", "gpu", nullptr }; + const char *pme_opt_choices[] = + { nullptr, "auto", "cpu", "gpu", nullptr }; + const char *pme_fft_opt_choices[] = + { nullptr, "auto", "cpu", "gpu", nullptr }; gmx_bool bTryToAppendFiles = TRUE; const char *gpuIdsAvailable = ""; const char *userGpuTaskAssignment = ""; @@ -315,6 +319,10 @@ int Mdrunner::mainFunction(int argc, char *argv[]) "Set nstlist when using a Verlet buffer tolerance (0 is guess)" }, { "-tunepme", FALSE, etBOOL, {&mdrunOptions.tunePme}, "Optimize PME load between PP/PME ranks or GPU/CPU (only with the Verlet cut-off scheme)" }, + { "-pme", FALSE, etENUM, {pme_opt_choices}, + "Perform PME calculations on" }, + { "-pmefft", FALSE, etENUM, {pme_fft_opt_choices}, + "Perform PME FFT calculations on" }, { "-v", FALSE, etBOOL, {&mdrunOptions.verbose}, "Be loud and noisy" }, { "-pforce", FALSE, etREAL, {&pforce}, @@ -508,8 +516,12 @@ int Mdrunner::mainFunction(int argc, char *argv[]) domdecOptions.numCells[YY] = (int)(realddxyz[YY] + 0.5); domdecOptions.numCells[ZZ] = (int)(realddxyz[ZZ] + 0.5); - nbpu_opt = nbpu_opt_choices[0]; - rc = mdrunner(); + nbpu_opt = nbpu_opt_choices[0]; + pme_opt = pme_opt_choices[0]; + pme_fft_opt = pme_fft_opt_choices[0]; + + + rc = mdrunner(); /* Log file has to be closed in mdrunner if we are appending to it (fplog not set here) */ diff --git a/src/programs/mdrun/runner.cpp b/src/programs/mdrun/runner.cpp index b266ecb415..1c0e019ee5 100644 --- a/src/programs/mdrun/runner.cpp +++ b/src/programs/mdrun/runner.cpp @@ -485,17 +485,11 @@ int Mdrunner::mdrunner() } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; auto nonbondedTarget = findTaskTarget(nbpu_opt); - // 
TODO Connect these to actual mdrun arguments and some functionality - const char *pme_opt = "cpu"; - auto pmeTarget = findTaskTarget(pme_opt); - + auto pmeTarget = findTaskTarget(pme_opt); // TODO find a sensible home and behaviour for this - //const char *pme_fft_opt = "auto"; + GMX_UNUSED_VALUE(pme_fft_opt); //auto pmeFftTarget = findTaskTarget(pme_fft_opt); - - const PmeRunMode pmeRunMode = PmeRunMode::CPU; - //TODO this is a placeholder as PME on GPU is not permitted yet - //TODO should there exist a PmeRunMode::None value for consistency? + PmeRunMode pmeRunMode = PmeRunMode::None; // Here we assume that SIMMASTER(cr) does not change even after the // threads are started. @@ -661,6 +655,8 @@ int Mdrunner::mdrunner() auto inputSystemHasPme = EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype); auto canUseGpuForPme = inputSystemHasPme && pme_gpu_supports_input(inputrec, nullptr); useGpuForPme = decideWhetherToUseGpusForPme(useGpuForNonbonded, pmeTarget, userGpuTaskAssignment, canUseGpuForPme, cr->nnodes); + // FIXME decide how to implement -pmefft support + pmeRunMode = (useGpuForPme ? PmeRunMode::GPU : PmeRunMode::CPU); } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; @@ -743,6 +739,17 @@ int Mdrunner::mdrunner() */ domdecOptions.numPmeRanks = 0; } + if (useGpuForPme) + { + if (domdecOptions.numPmeRanks < 0) + { + domdecOptions.numPmeRanks = 0; // separate GPU ranks not supported + } + else + { + GMX_RELEASE_ASSERT(domdecOptions.numPmeRanks == 0, "Separate PME GPU ranks are not yet supported"); + } + } #ifdef GMX_FAHCORE if (MASTER(cr)) @@ -1031,6 +1038,14 @@ int Mdrunner::mdrunner() } } + gmx_device_info_t *pmeDeviceInfo = nullptr; + // This works because only one task of each type is currently permitted. 
+ auto pmeGpuTaskMapping = std::find_if(gpuTaskAssignment.begin(), gpuTaskAssignment.end(), hasTaskType); + if (pmeGpuTaskMapping != gpuTaskAssignment.end()) + { + pmeDeviceInfo = getDeviceInfo(hwinfo->gpu_info, pmeGpuTaskMapping->deviceId_); + } + /* getting number of PP/PME threads PME: env variable should be read only on one node to make sure it is identical everywhere; @@ -1196,13 +1211,18 @@ int Mdrunner::mdrunner() { try { - gmx_device_info_t *pmeGpuInfo = nullptr; + if (pmeDeviceInfo != nullptr && pmeDeviceInfo != nonbondedDeviceInfo) + { + GMX_THROW(NotImplementedError + ("PME on a GPU can run only on the same GPU as nonbonded, because " + "context switching is not yet supported.")); + } pmedata = gmx_pme_init(cr, npme_major, npme_minor, inputrec, mtop ? mtop->natoms : 0, nChargePerturbed, nTypePerturbed, mdrunOptions.reproducible, ewaldcoeff_q, ewaldcoeff_lj, nthreads_pme, - pmeRunMode, nullptr, pmeGpuInfo, mdlog); + pmeRunMode, nullptr, pmeDeviceInfo, mdlog); } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; } diff --git a/src/programs/mdrun/runner.h b/src/programs/mdrun/runner.h index d0db02bcf6..51c84e4471 100644 --- a/src/programs/mdrun/runner.h +++ b/src/programs/mdrun/runner.h @@ -140,6 +140,10 @@ class Mdrunner DomdecOptions domdecOptions; //! Target short-range interations for "cpu", "gpu", or "auto". Default is "auto". const char *nbpu_opt = nullptr; + //! Target long-range interactions for "cpu", "gpu", or "auto". Default is "auto". + const char *pme_opt = nullptr; + //! Target long-range interactions FFT/solve stages for "cpu", "gpu", or "auto". Default is "auto". + const char *pme_fft_opt = nullptr; //! Command-line override for the duration of a neighbor list with the Verlet scheme. int nstlist_cmdline = 0; //! Number of simulations in multi-simulation set. 
diff --git a/src/programs/mdrun/tests/pmetest.cpp b/src/programs/mdrun/tests/pmetest.cpp index 7262f406f0..7f73607b93 100644 --- a/src/programs/mdrun/tests/pmetest.cpp +++ b/src/programs/mdrun/tests/pmetest.cpp @@ -115,14 +115,12 @@ TEST_F(PmeTest, ReproducesEnergies) //TODO test all proper/improper combinations in more thorough way? std::map < std::string, std::vector < const char *>> runModes; - runModes["PmeOnCpu"] = {}; - // TODO uncomment these and replace the above as functionality - // gets implemented. - //runModes["PmeOnCpu"] = {"-pme", "cpu"}; - //runModes["PmeAuto"] = {"-pme", "auto"}; + runModes["PmeOnCpu"] = {"-pme", "cpu"}; + runModes["PmeAuto"] = {"-pme", "auto"}; + // TODO uncomment this when functionality gets activated. //runModes["PmeOnGpuFftOnCpu"] = {"-pme", "gpu", "-pmefft", "cpu"}; - //runModes["PmeOnGpuFftOnGpu"] = {"-pme", "gpu", "-pmefft", "gpu"}; - //runModes["PmeOnGpuFftAuto"] = {"-pme", "gpu", "-pmefft", "auto"}; + runModes["PmeOnGpuFftOnGpu"] = {"-pme", "gpu", "-pmefft", "gpu"}; + runModes["PmeOnGpuFftAuto"] = {"-pme", "gpu", "-pmefft", "auto"}; TestReferenceData refData; TestReferenceChecker rootChecker(refData.rootChecker()); -- 2.11.4.GIT