From 6fb2a76e3e02dc01d3605a2ced7566090900c8e9 Mon Sep 17 00:00:00 2001 From: Mark Abraham Date: Mon, 16 Sep 2019 08:38:06 +0200 Subject: [PATCH] Use GPU halo exchange only when compatible DD is available Change-Id: I624861e61e6f1b6854172102b12a4f9b51b94101 --- src/gromacs/domdec/domdec.cpp | 85 ++++++++++++++++++++++++++++++++----- src/gromacs/domdec/domdec.h | 8 ++++ src/gromacs/domdec/domdec_setup.cpp | 11 ++++- src/gromacs/mdlib/sim_util.cpp | 18 ++++---- src/gromacs/mdrun/runner.cpp | 12 ++++-- 5 files changed, 110 insertions(+), 24 deletions(-) diff --git a/src/gromacs/domdec/domdec.cpp b/src/gromacs/domdec/domdec.cpp index 1e3c931858..47a7d3531a 100644 --- a/src/gromacs/domdec/domdec.cpp +++ b/src/gromacs/domdec/domdec.cpp @@ -2971,13 +2971,7 @@ getDDSettings(const gmx::MDLogger &mdlog, ddSettings.useSendRecv2 = (dd_getenv(mdlog, "GMX_DD_USE_SENDRECV2", 0) != 0); ddSettings.dlb_scale_lim = dd_getenv(mdlog, "GMX_DLB_MAX_BOX_SCALING", 10); - // TODO GPU halo exchange requires a 1D single-pulse DD, and when - // it is properly integrated the hack with GMX_GPU_DD_COMMS should - // be removed. - ddSettings.request1DAnd1Pulse = (bool(dd_getenv(mdlog, "GMX_DD_1D_1PULSE", 0)) || - (bool(getenv("GMX_GPU_DD_COMMS") != nullptr && - GMX_THREAD_MPI && - (GMX_GPU == GMX_GPU_CUDA)))); + ddSettings.request1DAnd1Pulse = bool(dd_getenv(mdlog, "GMX_DD_1D_1PULSE", 0)); ddSettings.useDDOrderZYX = bool(dd_getenv(mdlog, "GMX_DD_ORDER_ZYX", 0)); ddSettings.useCartesianReorder = bool(dd_getenv(mdlog, "GMX_NO_CART_REORDER", 1)); ddSettings.eFlop = dd_getenv(mdlog, "GMX_DLB_BASED_ON_FLOPS", 0); @@ -3014,10 +3008,71 @@ gmx_domdec_t::gmx_domdec_t(const t_inputrec &ir) : { } +/*! \brief Return whether the simulation described can run a 1D single-pulse DD. + * + * The GPU halo exchange code requires a 1D single-pulse DD. 
Such a DD + * generally requires a larger box than other possible decompositions + * with the same rank count, so the calling code might need to decide + * what is the most appropriate way to run the simulation based on + * whether such a DD is possible. + * + * This function works like init_domain_decomposition(), but will not + * give a fatal error, and only uses \c cr for communicating between + * ranks. + * + * It is safe to call before thread-MPI spawns ranks, so that + * thread-MPI can decide whether and how to trigger the GPU halo + * exchange code path. The number of PME ranks, if any, should be set + * in \c options.numPmeRanks. + */ +static bool +canMake1DAnd1PulseDomainDecomposition(const DDSettings &ddSettingsOriginal, + const t_commrec *cr, + const int numRanksRequested, + const DomdecOptions &options, + const gmx_mtop_t &mtop, + const t_inputrec &ir, + const matrix box, + gmx::ArrayRef xGlobal) +{ + // Ensure we don't write any output from this checking routine + gmx::MDLogger dummyLogger; + + DDSystemInfo systemInfo = getSystemInfo(dummyLogger, cr, options, &mtop, &ir, box, xGlobal); + + int numPPRanksRequested = numRanksRequested - (EEL_PME(ir.coulombtype) ? 
options.numPmeRanks : 0); + + DDSettings ddSettings = ddSettingsOriginal; + ddSettings.request1DAnd1Pulse = true; + const real gridSetupCellsizeLimit = getDDGridSetupCellSizeLimit(dummyLogger, ddSettings.request1DAnd1Pulse, + !isDlbDisabled(ddSettings.initialDlbState), + options.dlbScaling, ir, + systemInfo.cellsizeLimit); + gmx_ddbox_t ddbox = {0}; + DDGridSetup ddGridSetup = getDDGridSetup(dummyLogger, cr, numPPRanksRequested, options, + ddSettings, systemInfo, gridSetupCellsizeLimit, + mtop, ir, box, xGlobal, &ddbox); + + const bool canMakeDDWith1DAnd1Pulse = (ddGridSetup.numDomains[XX] != 0); + + return canMakeDDWith1DAnd1Pulse; +} + +bool is1DAnd1PulseDD(const gmx_domdec_t &dd) +{ + const int maxDimensionSize = std::max(dd.nc[XX], std::max(dd.nc[YY], dd.nc[ZZ])); + const int productOfDimensionSizes = dd.nc[XX]*dd.nc[YY]*dd.nc[ZZ]; + const bool decompositionHasOneDimension = (maxDimensionSize == productOfDimensionSizes); + + return (dd.comm->maxpulse == 1) && decompositionHasOneDimension; + +} + gmx_domdec_t *init_domain_decomposition(const gmx::MDLogger &mdlog, t_commrec *cr, const DomdecOptions &options, const gmx::MdrunOptions &mdrunOptions, + const bool prefer1DAnd1Pulse, const gmx_mtop_t *mtop, const t_inputrec *ir, const matrix box, @@ -3028,6 +3083,14 @@ gmx_domdec_t *init_domain_decomposition(const gmx::MDLogger &mdlog, "\nInitializing Domain Decomposition on %d ranks", cr->nnodes); DDSettings ddSettings = getDDSettings(mdlog, options, mdrunOptions, *ir); + + if (prefer1DAnd1Pulse && + canMake1DAnd1PulseDomainDecomposition(ddSettings, cr, cr->nnodes, options, + *mtop, *ir, box, xGlobal)) + { + ddSettings.request1DAnd1Pulse = true; + } + if (ddSettings.eFlop > 1) { /* Ensure that we have different random flop counts on different ranks */ @@ -3039,16 +3102,16 @@ gmx_domdec_t *init_domain_decomposition(const gmx::MDLogger &mdlog, int numRanksRequested = cr->nnodes; checkForValidRankCountRequests(numRanksRequested, EEL_PME(ir->coulombtype), 
options.numPmeRanks); - // DD grid setup uses a more conservative cell size limit for + // DD grid setup uses a different cell size limit for // automated setup than the one in systemInfo. The latter is used - // later during DLB, for example. + // in set_dd_limits() to configure DLB, for example. const real gridSetupCellsizeLimit = getDDGridSetupCellSizeLimit(mdlog, ddSettings.request1DAnd1Pulse, !isDlbDisabled(ddSettings.initialDlbState), options.dlbScaling, *ir, systemInfo.cellsizeLimit); gmx_ddbox_t ddbox = {0}; - DDGridSetup ddGridSetup = getDDGridSetup(mdlog, cr, numRanksRequested, options, ddSettings, systemInfo, - gridSetupCellsizeLimit, + DDGridSetup ddGridSetup = getDDGridSetup(mdlog, cr, numRanksRequested, options, + ddSettings, systemInfo, gridSetupCellsizeLimit, *mtop, *ir, box, xGlobal, &ddbox); checkDDGridSetup(ddGridSetup, cr, options, ddSettings, systemInfo, gridSetupCellsizeLimit, ddbox); diff --git a/src/gromacs/domdec/domdec.h b/src/gromacs/domdec/domdec.h index 84274b0fa8..4f62dc4255 100644 --- a/src/gromacs/domdec/domdec.h +++ b/src/gromacs/domdec/domdec.h @@ -162,12 +162,20 @@ init_domain_decomposition(const gmx::MDLogger &mdlog, t_commrec *cr, const gmx::DomdecOptions &options, const gmx::MdrunOptions &mdrunOptions, + bool prefer1DAnd1Pulse, const gmx_mtop_t *mtop, const t_inputrec *ir, const matrix box, gmx::ArrayRef xGlobal, gmx::LocalAtomSetManager *atomSets); +/*! \brief Return whether the DD has a single dimension with a single pulse + * + * The GPU halo exchange code requires a 1D single-pulse DD, and its + * setup code can use the returned value to understand what it should + * do. */ +bool is1DAnd1PulseDD(const gmx_domdec_t &dd); + /*! 
\brief Initialize data structures for bonded interactions */ void dd_init_bondeds(FILE *fplog, gmx_domdec_t *dd, diff --git a/src/gromacs/domdec/domdec_setup.cpp b/src/gromacs/domdec/domdec_setup.cpp index e685336bd7..afe132c429 100644 --- a/src/gromacs/domdec/domdec_setup.cpp +++ b/src/gromacs/domdec/domdec_setup.cpp @@ -913,7 +913,16 @@ getDDGridSetup(const gmx::MDLogger &mdlog, gmx::ArrayRef xGlobal, gmx_ddbox_t *ddbox) { - int numPmeOnlyRanks = getNumPmeOnlyRanksToUse(mdlog, options, mtop, ir, box, numRanksRequested); + int numPmeOnlyRanks = getNumPmeOnlyRanksToUse(mdlog, options, mtop, ir, box, numRanksRequested); + + if (ddSettings.request1DAnd1Pulse && + (numRanksRequested - numPmeOnlyRanks == 1)) + { + // With only one PP rank, there will not be a need for + // GPU-based halo exchange that wants to request that any DD + // has only 1 dimension and 1 pulse. + return DDGridSetup {}; + } gmx::IVec numDomains; if (options.numCells[XX] > 0) diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp index 85c8c73ae6..af26e4e3db 100644 --- a/src/gromacs/mdlib/sim_util.cpp +++ b/src/gromacs/mdlib/sim_util.cpp @@ -128,10 +128,6 @@ static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GP // TODO eventially tie this in with other existing GPU flags. static const bool c_enableGpuBufOps = (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr); -/*! 
\brief environment variable to enable GPU P2P communication */ -static const bool c_enableGpuHaloExchange = (getenv("GMX_GPU_DD_COMMS") != nullptr) - && GMX_THREAD_MPI && (GMX_GPU == GMX_GPU_CUDA); - static void sum_forces(rvec f[], gmx::ArrayRef forceToAdd) { const int end = forceToAdd.size(); @@ -1187,11 +1183,15 @@ void do_force(FILE *fplog, launchPmeGpuFftAndGather(fr->pmedata, wcycle); } - const bool ddUsesGpuDirectCommunication - = c_enableGpuHaloExchange && c_enableGpuBufOps && bUseGPU && havePPDomainDecomposition(cr); - gmx::GpuHaloExchange *gpuHaloExchange = ddUsesGpuDirectCommunication ? cr->dd->gpuHaloExchange.get() : nullptr; - GMX_ASSERT(!ddUsesGpuDirectCommunication || gpuHaloExchange != nullptr, - "Must have valid gpuHaloExchange when doing halo exchange on the GPU"); + // TODO Update this comment when introducing SimulationWorkload + // + // The conditions for gpuHaloExchange e.g. using GPU buffer + // operations were checked before construction, so here we can + // just use it and assert upon any conditions. + gmx::GpuHaloExchange *gpuHaloExchange = (havePPDomainDecomposition(cr) ? cr->dd->gpuHaloExchange.get() : nullptr); + const bool ddUsesGpuDirectCommunication = (gpuHaloExchange != nullptr); + GMX_ASSERT(!ddUsesGpuDirectCommunication || (useGpuXBufOps == BufferOpsUseGpu::True), + "Must use coordinate buffer ops with GPU halo exchange"); /* Communicate coordinates and sum dipole if necessary + do non-local pair search */ diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp index 690bfd748d..f2cc132ecb 100644 --- a/src/gromacs/mdrun/runner.cpp +++ b/src/gromacs/mdrun/runner.cpp @@ -166,6 +166,9 @@ namespace gmx static const bool c_enableGpuHaloExchange = (getenv("GMX_GPU_DD_COMMS") != nullptr) && GMX_THREAD_MPI && (GMX_GPU == GMX_GPU_CUDA); +/*! \brief environment variable to enable GPU buffer operations */ +static const bool c_enableGpuBufOps = (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr); + /*! 
\brief Manage any development feature flag variables encountered * * The use of dev features indicated by environment variables is @@ -194,8 +197,6 @@ static void manageDevelopmentFeatures(const gmx::MDLogger &mdlog) { gmx_fatal(FARGS, "Cannot enable GPU halo exchange without GPU buffer operations, set GMX_USE_GPU_BUFFER_OPS=1\n"); } - GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted( - "NOTE: This run uses the 'GPU halo exchange' feature, enabled by the GMX_GPU_DD_COMMS environment variable."); } if (useGpuUpdateConstrain) @@ -1028,11 +1029,13 @@ int Mdrunner::mdrunner() prepare_verlet_scheme(fplog, cr, inputrec, nstlist_cmdline, &mtop, box, useGpuForNonbonded || (emulateGpuNonbonded == EmulateGpuNonbonded::Yes), *hwinfo->cpuInfo); + const bool prefer1DAnd1PulseDD = (c_enableGpuHaloExchange && useGpuForNonbonded); LocalAtomSetManager atomSets; if (PAR(cr) && !(EI_TPI(inputrec->eI) || inputrec->eI == eiNM)) { cr->dd = init_domain_decomposition(mdlog, cr, domdecOptions, mdrunOptions, + prefer1DAnd1PulseDD, &mtop, inputrec, box, positionsFromStatePointer(globalState.get()), &atomSets); @@ -1340,12 +1343,15 @@ int Mdrunner::mdrunner() // TODO Move this to happen during domain decomposition setup, // once stream and event handling works well with that. 
// TODO remove need to pass local stream into GPU halo exchange - Redmine #3093 - if (havePPDomainDecomposition(cr) && c_enableGpuHaloExchange && useGpuForNonbonded) + if (havePPDomainDecomposition(cr) && prefer1DAnd1PulseDD && is1DAnd1PulseDD(*cr->dd)) { + GMX_RELEASE_ASSERT(c_enableGpuBufOps, "Must use GMX_GPU_BUFFER_OPS=1 to use GMX_GPU_DD_COMMS=1"); void *streamLocal = Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, Nbnxm::InteractionLocality::NonLocal); void *streamNonLocal = Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, Nbnxm::InteractionLocality::NonLocal); void *coordinatesOnDeviceEvent = fr->nbv->get_x_on_device_event(); + GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted( + "NOTE: This run uses the 'GPU halo exchange' feature, enabled by the GMX_GPU_DD_COMMS environment variable."); cr->dd->gpuHaloExchange = std::make_unique(cr->dd, cr->mpi_comm_mysim, streamLocal, streamNonLocal, coordinatesOnDeviceEvent); } -- 2.11.4.GIT