From 762c6f0cf02d9fbfcd901a7a97fb83fd9ac547ec Mon Sep 17 00:00:00 2001
From: Mark Abraham
Date: Tue, 22 Aug 2017 17:09:50 +0200
Subject: [PATCH] Fixed thread-MPI with non-default -npme

Thread-MPI currently defaults to zero PME-only ranks, but should
support non-default specifications.

Enforced that e.g. mdrun -ntmpi 0 -npme 1 -gpu_id 0 is not supported,
because we don't have the ability to decide how to distribute threads
to the different kinds of ranks.

Change-Id: I5f175fc087c10d4268e6d8226ba1628e99d376fc
---
 src/programs/mdrun/resource-division.cpp | 45 ++++++++++++++++++++++++--------
 src/programs/mdrun/resource-division.h   |  1 +
 src/programs/mdrun/runner.cpp            |  1 +
 3 files changed, 36 insertions(+), 11 deletions(-)

diff --git a/src/programs/mdrun/resource-division.cpp b/src/programs/mdrun/resource-division.cpp
index dd1de23664..8a3a293b64 100644
--- a/src/programs/mdrun/resource-division.cpp
+++ b/src/programs/mdrun/resource-division.cpp
@@ -338,6 +338,7 @@ class SingleRankChecker
  */
 int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
                      gmx_hw_opt_t *hw_opt,
+                     int numPmeRanks,
                      const t_inputrec *inputrec,
                      const gmx_mtop_t *mtop,
                      const gmx::MDLogger &mdlog,
@@ -361,13 +362,21 @@ int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
     if (hw_opt->nthreads_tmpi < 1 && numGpuIdsSupplied > 0)
     {
         /* If the user chose both mdrun -nt -gpu_id, is that consistent? */
-        if (hw_opt->nthreads_tot > 0 &&
-            (hw_opt->nthreads_tot % numGpuIdsSupplied) != 0)
+        if (numPmeRanks <= 0)
         {
-            gmx_fatal(FARGS, "Cannot run %d total threads with %d GPU ranks. Choose the total number of threads to be a multiple of the number of GPU ranks.", hw_opt->nthreads_tot, numGpuIdsSupplied);
+            if (hw_opt->nthreads_tot > 0 &&
+                (hw_opt->nthreads_tot % numGpuIdsSupplied) != 0)
+            {
+                gmx_fatal(FARGS, "Cannot run %d total threads with %d GPU ranks. Choose the total number of threads to be a multiple of the number of GPU ranks.", hw_opt->nthreads_tot, numGpuIdsSupplied);
+            }
+            return numGpuIdsSupplied;
+        }
+        else
+        {
+            gmx_fatal(FARGS, "The combination of choosing a number of PME ranks, and specific GPU IDs "
+                      "is not supported. Use also -ntmpi and/or -ntomp and -ntomp_pme to specify what "
+                      "distribution of threads to ranks you require.");
         }
-
-        return numGpuIdsSupplied;
     }
 
     {
@@ -399,14 +408,28 @@ int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
 
     if (hw_opt->nthreads_tmpi > 0)
     {
-        if ((numGpuIdsSupplied > 0) &&
-            (numGpuIdsSupplied != hw_opt->nthreads_tmpi))
+        if (numPmeRanks <= 0)
+        {
+            int numPpRanks = hw_opt->nthreads_tmpi;
+            if ((numGpuIdsSupplied > 0) &&
+                (numGpuIdsSupplied != numPpRanks))
+            {
+                gmx_fatal(FARGS, "Cannot run %d thread-MPI total ranks with %d "
+                          "GPU IDs supplied. The number of particle-particle (PP) ranks and the "
+                          "number of GPU IDs must match.", hw_opt->nthreads_tmpi, numGpuIdsSupplied);
+            }
+        }
+        else
         {
-            gmx_fatal(FARGS, "Cannot run %d thread-MPI ranks with %d GPU IDs supplied. "
-                      "The number of ranks and the number of GPU IDs must match.",
-                      hw_opt->nthreads_tmpi, numGpuIdsSupplied);
+            int numPpRanks = hw_opt->nthreads_tmpi - numPmeRanks;
+            if ((numGpuIdsSupplied > 0) &&
+                (numGpuIdsSupplied != numPpRanks))
+            {
+                gmx_fatal(FARGS, "Cannot run %d thread-MPI total ranks with %d PME ranks and %d "
+                          "GPU IDs supplied. The number of particle-particle ranks and the "
+                          "number of GPU IDs must match.", hw_opt->nthreads_tmpi, numPmeRanks, numGpuIdsSupplied);
+            }
         }
-
         /* Trivial, return the user's choice right away */
         return hw_opt->nthreads_tmpi;
     }
diff --git a/src/programs/mdrun/resource-division.h b/src/programs/mdrun/resource-division.h
index 14e07f23de..dc0c774f6d 100644
--- a/src/programs/mdrun/resource-division.h
+++ b/src/programs/mdrun/resource-division.h
@@ -63,6 +63,7 @@ class MDLogger;
  */
 int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
                      gmx_hw_opt_t *hw_opt,
+                     int numPmeRanks,
                      const t_inputrec *inputrec,
                      const gmx_mtop_t *mtop,
                      const gmx::MDLogger &mdlog,
diff --git a/src/programs/mdrun/runner.cpp b/src/programs/mdrun/runner.cpp
index c766c12cc0..ed1bd51792 100644
--- a/src/programs/mdrun/runner.cpp
+++ b/src/programs/mdrun/runner.cpp
@@ -600,6 +600,7 @@ int Mdrunner::mdrunner()
          * correctly. */
         hw_opt.nthreads_tmpi = get_nthreads_mpi(hwinfo,
                                                 &hw_opt,
+                                                npme,
                                                 inputrec, mtop,
                                                 mdlog,
                                                 doMembed);
-- 
2.11.4.GIT