Fixed thread-MPI with non-default -npme
src/programs/mdrun/resource-division.cpp
/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2015,2016,2017, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gmxpre.h"

#include "resource-division.h"

#include "config.h"

#include <stdlib.h>
#include <string.h>

#include <algorithm>

#include "gromacs/hardware/cpuinfo.h"
#include "gromacs/hardware/detecthardware.h"
#include "gromacs/hardware/hardwareassign.h"
#include "gromacs/hardware/hardwaretopology.h"
#include "gromacs/hardware/hw_info.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/mdtypes/inputrec.h"
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/topology/topology.h"
#include "gromacs/utility/baseversion.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/logger.h"
#include "gromacs/utility/stringutil.h"

/* DISCLAIMER: All the atom count and thread numbers below are heuristic.
 * The real switching points will depend on the simulated system,
 * the algorithms used and the hardware it's running on, as well as on
 * whether other jobs are running on the same machine. We try to take into
 * account factors that have a large influence, such as recent Intel CPUs
 * being much better at wide multi-threading. The remaining factors should
 * (hopefully) have a small influence, such that the performance just before
 * and after a switch point doesn't change too much.
 */

//! Constant used to help minimize preprocessed code
static const bool bHasOmpSupport = GMX_OPENMP;

#if GMX_THREAD_MPI
/* The minimum number of atoms per tMPI thread. With fewer atoms than this,
 * the number of threads will get lowered.
 */
static const int min_atoms_per_mpi_thread = 90;
static const int min_atoms_per_gpu        = 900;
#endif /* GMX_THREAD_MPI */

/* TODO choose nthreads_omp based on hardware topology
   when we have a hardware topology detection library */
/* First we consider the case of no MPI (1 MPI rank).
 * In general, when running up to 8 threads, OpenMP should be faster.
 * Note: on AMD Bulldozer we should avoid running OpenMP over two dies.
 * On Intel>=Nehalem running OpenMP on a single CPU is always faster,
 * even on two CPUs it's usually faster (but with many OpenMP threads
 * it could be faster not to use HT, currently we always use HT).
 * On Nehalem/Westmere we want to avoid running 16 threads over
 * two CPUs with HT, so we need a limit<16; thus we use 12.
 * A reasonable limit for Intel Sandy and Ivy bridge,
 * not knowing the topology, is 16 threads.
 * Below we check for Intel and AVX, which for now includes
 * Sandy/Ivy Bridge, Haswell/Broadwell. By checking for AVX instead of
 * model numbers we ensure also future Intel CPUs are covered.
 */
const int nthreads_omp_faster_default   =  8;
const int nthreads_omp_faster_Nehalem   = 12;
const int nthreads_omp_faster_Intel_AVX = 16;
const int nthreads_omp_faster_AMD_Ryzen = 16;
/* For CPU-only runs the fastest options are usually MPI or OpenMP only.
 * With one GPU, using MPI only is almost never optimal, so we need to
 * compare running pure OpenMP with combined MPI+OpenMP. This means higher
 * OpenMP thread counts can still be ok. Multiplying the numbers above
 * by a factor of 2 seems to be a good estimate.
 */
const int nthreads_omp_faster_gpu_fac = 2;

/* This is the case with MPI (2 or more MPI PP ranks).
 * By default we will terminate with a fatal error when more than 8
 * OpenMP threads are (indirectly) requested, since using fewer threads
 * nearly always results in better performance.
 * With thread-MPI and multiple GPUs or one GPU and too many threads
 * we first try 6 OpenMP threads and then fewer until the number of MPI ranks
 * is divisible by the number of GPUs.
 */
#if GMX_OPENMP && GMX_MPI
const int nthreads_omp_mpi_ok_max     = 8;
const int nthreads_omp_mpi_ok_min_cpu = 1;
#endif
const int nthreads_omp_mpi_ok_min_gpu = 2;
const int nthreads_omp_mpi_target_max = 6;
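
/* Illustrative summary of the limits above (heuristic, not exact):
 * - a single rank on CPU only: up to 8-16 OpenMP threads can beat MPI,
 *   depending on the CPU generation;
 * - a single rank with a GPU: those limits double;
 * - multiple ranks: 1 (CPU) to 8 OpenMP threads per rank pass the checks,
 *   at least 2 are required with GPUs, and 6 per rank is the target when
 *   ranks share GPUs.
 */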

/* Returns the maximum OpenMP thread count for which using a single MPI rank
 * should be faster than using multiple ranks with the same total thread count.
 */
static int nthreads_omp_faster(const gmx::CpuInfo &cpuInfo, gmx_bool bUseGPU)
{
    int nth;

    if (cpuInfo.vendor() == gmx::CpuInfo::Vendor::Intel &&
        cpuInfo.feature(gmx::CpuInfo::Feature::X86_Avx))
    {
        nth = nthreads_omp_faster_Intel_AVX;
    }
    else if (gmx::cpuIsX86Nehalem(cpuInfo))
    {
        // Intel Nehalem
        nth = nthreads_omp_faster_Nehalem;
    }
    else if (cpuInfo.vendor() == gmx::CpuInfo::Vendor::Amd && cpuInfo.family() >= 23)
    {
        // AMD Ryzen
        nth = nthreads_omp_faster_AMD_Ryzen;
    }
    else
    {
        nth = nthreads_omp_faster_default;
    }

    if (bUseGPU)
    {
        nth *= nthreads_omp_faster_gpu_fac;
    }

    nth = std::min(nth, GMX_OPENMP_MAX_THREADS);

    return nth;
}
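
/* Example (illustrative): on an Intel CPU with AVX support and a GPU in
 * use, nthreads_omp_faster() returns min(16*2, GMX_OPENMP_MAX_THREADS),
 * i.e. 32 unless the compile-time thread limit is lower. */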

/* Returns the maximum OpenMP thread count that passes the efficiency check */
static int nthreads_omp_efficient_max(int gmx_unused       nrank,
                                      const gmx::CpuInfo  &cpuInfo,
                                      gmx_bool             bUseGPU)
{
#if GMX_OPENMP && GMX_MPI
    if (nrank > 1)
    {
        return nthreads_omp_mpi_ok_max;
    }
    else
#endif
    {
        return nthreads_omp_faster(cpuInfo, bUseGPU);
    }
}

/* Return the number of thread-MPI ranks to use.
 * This is chosen such that we can always obey our own efficiency checks.
 */
static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
                                        const gmx_hw_opt_t  &hw_opt,
                                        int                  nthreads_tot,
                                        int                  ngpu)
{
    int                 nrank;
    const gmx::CpuInfo &cpuInfo = *hwinfo->cpuInfo;

    GMX_RELEASE_ASSERT(nthreads_tot > 0, "There must be at least one thread per rank");

    /* There are no separate PME nodes here, as we ensured in
     * check_and_update_hw_opt that nthreads_tmpi>0 with PME nodes
     * and a conditional ensures we would not have ended up here.
     * Note that separate PME nodes might be switched on later.
     */
    if (ngpu > 0)
    {
        nrank = ngpu;

        /* When the user sets nthreads_omp, we can end up oversubscribing CPU cores
         * if we simply start as many ranks as GPUs. To avoid this, we start as few
         * tMPI ranks as necessary to avoid oversubscription and instead leave GPUs idle.
         * If the user does not set the number of OpenMP threads, nthreads_omp==0 and
         * this code has no effect.
         */
        GMX_RELEASE_ASSERT(hw_opt.nthreads_omp >= 0, "nthreads_omp is negative, but previous checks should have prevented this");
        while (nrank*hw_opt.nthreads_omp > hwinfo->nthreads_hw_avail && nrank > 1)
        {
            nrank--;
        }

        if (nthreads_tot < nrank)
        {
            /* #thread < #gpu is very unlikely, but if so: waste gpu(s) */
            nrank = nthreads_tot;
        }
        else if (nthreads_tot > nthreads_omp_faster(cpuInfo, ngpu > 0) ||
                 (ngpu > 1 && nthreads_tot/ngpu > nthreads_omp_mpi_target_max))
        {
            /* The high OpenMP thread count will likely result in sub-optimal
             * performance. Increase the rank count to reduce the thread count
             * per rank. This will lead to GPU sharing by MPI ranks/threads.
             */
            int nshare;

            /* Increase the rank count as long as we have more than 6 OpenMP
             * threads per rank or the number of hardware threads is not
             * divisible by the rank count. Don't go below 2 OpenMP threads.
             */
            nshare = 1;
            do
            {
                nshare++;
                nrank  = ngpu*nshare;
            }
            while (nthreads_tot/nrank > nthreads_omp_mpi_target_max ||
                   (nthreads_tot/(ngpu*(nshare + 1)) >= nthreads_omp_mpi_ok_min_gpu && nthreads_tot % nrank != 0));
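            /* Worked example (illustrative): nthreads_tot = 32, ngpu = 2.
             * nshare=2 -> nrank=4:  32/4 = 8 > 6, continue;
             * nshare=3 -> nrank=6:  32/6 = 5 <= 6, but 32 % 6 != 0 and
             *                       32/(2*4) = 4 >= 2, continue;
             * nshare=4 -> nrank=8:  32/8 = 4 <= 6 and 32 % 8 == 0, stop.
             * Result: 8 ranks with 4 threads each, 4 ranks per GPU. */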
        }
    }
    else if (hw_opt.nthreads_omp > 0)
    {
        /* Here we could oversubscribe, when we do, we issue a warning later */
        nrank = std::max(1, nthreads_tot/hw_opt.nthreads_omp);
    }
    else
    {
        if (nthreads_tot <= nthreads_omp_faster(cpuInfo, ngpu > 0))
        {
            /* Use pure OpenMP parallelization */
            nrank = 1;
        }
        else
        {
            /* Don't use OpenMP parallelization */
            nrank = nthreads_tot;
        }
    }

    return nrank;
}

static int getMaxGpuUsable(const gmx_hw_info_t *hwinfo,
                           int                  cutoff_scheme)
{
    /* This code relies on the fact that GPUs are not detected when GPU
     * acceleration was disabled at run time by the user, either with
     * -nb cpu or setting GMX_EMULATE_GPU. */
    if (cutoff_scheme == ecutsVERLET &&
        hwinfo->gpu_info.n_dev_compatible > 0)
    {
        return hwinfo->gpu_info.n_dev_compatible;
    }
    else
    {
        return 0;
    }
}

#if GMX_THREAD_MPI

static bool
gmxSmtIsEnabled(const gmx::HardwareTopology &hwTop)
{
    return (hwTop.supportLevel() >= gmx::HardwareTopology::SupportLevel::Basic && hwTop.machine().sockets[0].cores[0].hwThreads.size() > 1);
}

namespace
{

class SingleRankChecker
{
    public:
        //! Constructor
        SingleRankChecker() : value_(false), reasons_() {}
        /*! \brief Call this function for each possible condition
            under which a single rank is required, along with a string
            describing the constraint when it is applied. */
        void applyConstraint(bool condition, const char *description)
        {
            if (condition)
            {
                value_ = true;
                reasons_.push_back(gmx::formatString("%s only supports a single rank.", description));
            }
        }
        //! After applying any conditions, is a single rank required?
        bool mustUseOneRank() const
        {
            return value_;
        }
        /*! \brief Return a formatted string to use when writing a
            message when a single rank is required, (or empty if no
            constraint exists.) */
        std::string getMessage() const
        {
            return formatAndJoin(reasons_, "\n", gmx::IdentityFormatter());
        }
    private:
        bool                     value_;
        std::vector<std::string> reasons_;
};

} // namespace

/* Get the number of MPI ranks to use for thread-MPI based on how many
 * were requested, which algorithms we're using,
 * and how many particles there are.
 * At this point we have already called check_and_update_hw_opt.
 * Thus all options should be internally consistent and consistent
 * with the hardware, except that ntmpi could be larger than #GPU.
 */
int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
                     gmx_hw_opt_t        *hw_opt,
                     int                  numPmeRanks,
                     const t_inputrec    *inputrec,
                     const gmx_mtop_t    *mtop,
                     const gmx::MDLogger &mdlog,
                     bool                 doMembed)
{
    int                          nthreads_hw, nthreads_tot_max, nrank, ngpu;
    int                          min_atoms_per_mpi_rank;

    const gmx::CpuInfo          &cpuInfo = *hwinfo->cpuInfo;
    const gmx::HardwareTopology &hwTop   = *hwinfo->hardwareTopology;

    /* If the user made a GPU task assignment, that sets the number of thread-MPI ranks. */
    auto userGpuTaskAssignment = gmx::parseGpuTaskAssignment(hw_opt->gpuIdTaskAssignment);
    int  numGpuIdsSupplied     = static_cast<int>(userGpuTaskAssignment.size());

    /* TODO Here we handle the case where the user set GPU IDs, and
       further below we handle the case where the algorithm does not
       support multiple ranks. We need also to handle the case where
       the user set multiple GPU IDs for an algorithm that cannot
       handle multiple ranks. */
    if (hw_opt->nthreads_tmpi < 1 && numGpuIdsSupplied > 0)
    {
        /* If the user chose both mdrun -nt and -gpu_id, is that consistent? */
        if (numPmeRanks <= 0)
        {
            if (hw_opt->nthreads_tot > 0 &&
                (hw_opt->nthreads_tot % numGpuIdsSupplied) != 0)
            {
                gmx_fatal(FARGS, "Cannot run %d total threads with %d GPU ranks. Choose the total number of threads to be a multiple of the number of GPU ranks.", hw_opt->nthreads_tot, numGpuIdsSupplied);
            }
            return numGpuIdsSupplied;
        }
        else
        {
            gmx_fatal(FARGS, "The combination of choosing a number of PME ranks, and specific GPU IDs "
                      "is not supported. Use also -ntmpi and/or -ntomp and -ntomp_pme to specify what "
                      "distribution of threads to ranks you require.");
        }
    }

    {
        /* Check if an algorithm does not support parallel simulation. */
        // TODO This might work better if e.g. implemented algorithms
        // had to define a function that returns such requirements,
        // and a description string.
        SingleRankChecker checker;
        checker.applyConstraint(inputrec->eI == eiLBFGS, "L-BFGS minimization");
        checker.applyConstraint(inputrec->coulombtype == eelEWALD, "Plain Ewald electrostatics");
        checker.applyConstraint(doMembed, "Membrane embedding");
        if (checker.mustUseOneRank())
        {
            std::string message = checker.getMessage();
            if (hw_opt->nthreads_tmpi > 1)
            {
                gmx_fatal(FARGS, "%s However, you asked for more than 1 thread-MPI rank, so mdrun cannot continue. Choose a single rank, or a different algorithm.", message.c_str());
            }
            GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted("%s Choosing to use only a single thread-MPI rank.", message.c_str());

            if (numGpuIdsSupplied > 1)
            {
                gmx_fatal(FARGS, "You supplied %d GPU IDs but only 1 rank can be used "
                          "by this simulation. Supply only one GPU ID.", numGpuIdsSupplied);
            }
            return 1;
        }
    }

    if (hw_opt->nthreads_tmpi > 0)
    {
        if (numPmeRanks <= 0)
        {
            int numPpRanks = hw_opt->nthreads_tmpi;
            if ((numGpuIdsSupplied > 0) &&
                (numGpuIdsSupplied != numPpRanks))
            {
                gmx_fatal(FARGS, "Cannot run %d thread-MPI total ranks with %d "
                          "GPU IDs supplied. The number of particle-particle (PP) ranks and the "
                          "number of GPU IDs must match.", hw_opt->nthreads_tmpi, numGpuIdsSupplied);
            }
        }
        else
        {
            int numPpRanks = hw_opt->nthreads_tmpi - numPmeRanks;
            if ((numGpuIdsSupplied > 0) &&
                (numGpuIdsSupplied != numPpRanks))
            {
                gmx_fatal(FARGS, "Cannot run %d thread-MPI total ranks with %d PME ranks and %d "
                          "GPU IDs supplied. The number of particle-particle ranks and the "
                          "number of GPU IDs must match.", hw_opt->nthreads_tmpi, numPmeRanks, numGpuIdsSupplied);
            }
        }
        /* Trivial, return the user's choice right away */
        return hw_opt->nthreads_tmpi;
    }
    GMX_RELEASE_ASSERT(numGpuIdsSupplied == 0,
                       "If mdrun -gpu_id had information, the number of ranks should have already been chosen");

    // Now implement automatic selection of number of thread-MPI ranks
    nthreads_hw = hwinfo->nthreads_hw_avail;

    if (nthreads_hw <= 0)
    {
        /* This should normally not happen, but if it does, we handle it */
        gmx_fatal(FARGS, "The number of available hardware threads can not be detected, please specify the number of MPI ranks and the number of OpenMP threads (if supported) manually with options -ntmpi and -ntomp, respectively");
    }

    /* How many total (#tMPI*#OpenMP) threads can we start? */
    if (hw_opt->nthreads_tot > 0)
    {
        nthreads_tot_max = hw_opt->nthreads_tot;
    }
    else
    {
        nthreads_tot_max = nthreads_hw;
    }

    ngpu = getMaxGpuUsable(hwinfo, inputrec->cutoff_scheme);

    if (inputrec->cutoff_scheme == ecutsGROUP)
    {
        /* We checked this before, but it doesn't hurt to do it once more */
        GMX_RELEASE_ASSERT(hw_opt->nthreads_omp == 1, "The group scheme only supports one OpenMP thread per rank");
    }

    nrank =
        get_tmpi_omp_thread_division(hwinfo, *hw_opt, nthreads_tot_max, ngpu);

    if (inputrec->eI == eiNM || EI_TPI(inputrec->eI))
    {
        /* Dims/steps are divided over the nodes instead of splitting the atoms.
         * With NM we can't have more ranks than #atoms*#dim. With TPI it's
         * unlikely we have fewer atoms than ranks, and if so, communication
         * would become a bottleneck, so we set the limit to 1 atom/rank.
         */
        min_atoms_per_mpi_rank = 1;
    }
    else
    {
        if (ngpu >= 1)
        {
            min_atoms_per_mpi_rank = min_atoms_per_gpu;
        }
        else
        {
            min_atoms_per_mpi_rank = min_atoms_per_mpi_thread;
        }
    }

    if (mtop->natoms/nrank < min_atoms_per_mpi_rank)
    {
        int nrank_new;

        /* the rank number was chosen automatically, but there are too few
           atoms per rank, so we need to reduce the rank count */
        nrank_new = std::max(1, mtop->natoms/min_atoms_per_mpi_rank);

        /* Avoid partial use of Hyper-Threading */
        if (gmxSmtIsEnabled(hwTop) &&
            nrank_new > nthreads_hw/2 && nrank_new < nthreads_hw)
        {
            nrank_new = nthreads_hw/2;
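            /* Example (illustrative): with 16 hardware threads and SMT
             * enabled, nrank_new = 11 lies strictly between 8 and 16 and is
             * reduced to 8, so no core ends up running a mix of one and two
             * ranks. */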
        }

        /* If the user specified the total thread count, ensure this is
         * divisible by the number of ranks.
         * It is quite likely that we have too many total threads compared
         * to the size of the system, but if the user asked for this many
         * threads we should respect that.
         */
        while (hw_opt->nthreads_tot > 0 &&
               hw_opt->nthreads_tot % nrank_new != 0)
        {
            nrank_new--;
        }

        /* Avoid large prime numbers in the rank count */
        if (nrank_new >= 6)
        {
            /* Use only 6,8,10 with additional factors of 2 */
            int fac;

            fac = 2;
            while (3*fac*2 <= nrank_new)
            {
                fac *= 2;
            }

            nrank_new = (nrank_new/fac)*fac;
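            /* Example (illustrative): nrank_new = 13 yields fac = 4
             * (since 3*2*2 <= 13), so nrank_new becomes (13/4)*4 = 12;
             * nrank_new = 11 keeps fac = 2 and becomes 10. The reachable
             * rank counts are thus 6, 8, 10, 12, 16, 20, 24, 32, ... */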
        }
        else
        {
            /* Avoid 5, since a small system won't fit 5 domains along
             * a dimension. This might lead to wasting some cores, but this
             * will have a small impact in this regime of very small systems.
             */
            if (nrank_new == 5)
            {
                nrank_new = 4;
            }
        }

        nrank = nrank_new;

        /* We reduced the number of tMPI ranks, which means we might violate
         * our own efficiency checks if we simply use all hardware threads.
         */
        if (bHasOmpSupport && hw_opt->nthreads_omp <= 0 && hw_opt->nthreads_tot <= 0)
        {
            /* The user set neither the total nor the OpenMP thread count,
             * we should use all hardware threads, unless we will violate
             * our own efficiency limitation on the thread count.
             */
            int nt_omp_max;

            nt_omp_max = nthreads_omp_efficient_max(nrank, cpuInfo, ngpu >= 1);

            if (nrank*nt_omp_max < hwinfo->nthreads_hw_avail)
            {
                /* Limit the number of OpenMP threads to start */
                hw_opt->nthreads_omp = nt_omp_max;
            }
        }

        fprintf(stderr, "\n");
        fprintf(stderr, "NOTE: Parallelization is limited by the small number of atoms,\n");
        fprintf(stderr, "      only starting %d thread-MPI ranks.\n", nrank);
        fprintf(stderr, "      You can use the -nt and/or -ntmpi option to optimize the number of threads.\n\n");
    }

    return nrank;
}
#endif /* GMX_THREAD_MPI */


void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
                                        int                  numTotalThreads,
                                        bool                 willUsePhysicalGpu,
                                        gmx_bool             bNtOmpOptionSet,
                                        t_commrec           *cr,
                                        const gmx::MDLogger &mdlog)
{
#if GMX_OPENMP && GMX_MPI
    int         nth_omp_min, nth_omp_max;
    char        buf[1000];
#if GMX_THREAD_MPI
    const char *mpi_option = " (option -ntmpi)";
#else
    const char *mpi_option = "";
#endif

    /* This function should be called after thread-MPI (when configured) and
     * OpenMP have been initialized. Check that here.
     */
#if GMX_THREAD_MPI
    GMX_RELEASE_ASSERT(nthreads_omp_faster_default >= nthreads_omp_mpi_ok_max, "Inconsistent OpenMP thread count default values");
#endif
    GMX_RELEASE_ASSERT(gmx_omp_nthreads_get(emntDefault) >= 1, "Must have at least one OpenMP thread");

    nth_omp_min = gmx_omp_nthreads_get(emntDefault);
    nth_omp_max = gmx_omp_nthreads_get(emntDefault);

    bool anyRankIsUsingGpus = willUsePhysicalGpu;
    /* Thread-MPI seems to have a bug with reduce on 1 node, so use a cond. */
    if (cr->nnodes + cr->npmenodes > 1)
    {
        int count[3], count_max[3];

        count[0] = -nth_omp_min;
        count[1] =  nth_omp_max;
        count[2] =  willUsePhysicalGpu;
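
        /* Negating the minimum before the MPI_MAX reduction turns the
         * maximum of the negated values into the minimum of the originals,
         * so a single Allreduce yields both bounds. */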
        MPI_Allreduce(count, count_max, 3, MPI_INT, MPI_MAX, cr->mpi_comm_mysim);

        /* In case of an inhomogeneous run setup we use the maximum counts */
        nth_omp_min        = -count_max[0];
        nth_omp_max        =  count_max[1];
        anyRankIsUsingGpus = count_max[2] > 0;
    }

    int nthreads_omp_mpi_ok_min;

    if (!anyRankIsUsingGpus)
    {
        nthreads_omp_mpi_ok_min = nthreads_omp_mpi_ok_min_cpu;
    }
    else
    {
        /* With GPUs we set the minimum number of OpenMP threads to 2 to catch
         * cases where the user specifies #ranks == #cores.
         */
        nthreads_omp_mpi_ok_min = nthreads_omp_mpi_ok_min_gpu;
    }

    if (DOMAINDECOMP(cr) && cr->nnodes > 1)
    {
        if (nth_omp_max < nthreads_omp_mpi_ok_min ||
            nth_omp_max > nthreads_omp_mpi_ok_max)
        {
            /* Note that we print target_max here, not ok_max */
            sprintf(buf, "Your choice of number of MPI ranks and amount of resources results in using %d OpenMP threads per rank, which is most likely inefficient. The optimum is usually between %d and %d threads per rank.",
                    nth_omp_max,
                    nthreads_omp_mpi_ok_min,
                    nthreads_omp_mpi_target_max);

            if (bNtOmpOptionSet)
            {
                GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted("NOTE: %s", buf);
            }
            else
            {
                /* This fatal error, and the one below, is nasty, but it's
                 * probably the only way to ensure that all users don't waste
                 * a lot of resources, since many users don't read logs/stderr.
                 */
                gmx_fatal(FARGS, "%s If you want to run with this setup, specify the -ntomp option. But we suggest to change the number of MPI ranks%s.", buf, mpi_option);
            }
        }
    }
    else
    {
        const gmx::CpuInfo &cpuInfo = *hwinfo->cpuInfo;

        /* No domain decomposition (or only one domain) */
        if (nth_omp_max > nthreads_omp_faster(cpuInfo, anyRankIsUsingGpus))
        {
            /* To arrive here, the user/system set #ranks and/or #OMPthreads */
            gmx_bool bEnvSet;
            char     buf2[256];

            bEnvSet = (getenv("OMP_NUM_THREADS") != nullptr);

            if (bNtOmpOptionSet || bEnvSet)
            {
                sprintf(buf2, "You requested %d OpenMP threads", nth_omp_max);
            }
            else
            {
                sprintf(buf2, "Your choice of %d MPI rank%s and the use of %d total threads %sleads to the use of %d OpenMP threads",
                        cr->nnodes + cr->npmenodes,
                        cr->nnodes + cr->npmenodes == 1 ? "" : "s",
                        numTotalThreads > 0 ? numTotalThreads : hwinfo->nthreads_hw_avail,
                        hwinfo->nphysicalnode > 1 ? "on a node " : "",
                        nth_omp_max);
            }
            sprintf(buf, "%s, whereas we expect the optimum to be with more MPI ranks with %d to %d OpenMP threads.",
                    buf2, nthreads_omp_mpi_ok_min, nthreads_omp_mpi_target_max);

            /* We cannot quit with a fatal error when OMP_NUM_THREADS is set
             * with different values per rank or node, since in that case
             * the user cannot set -ntomp to override the error.
             */
            if (bNtOmpOptionSet || (bEnvSet && nth_omp_min != nth_omp_max))
            {
                GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted("NOTE: %s", buf);
            }
            else
            {
                gmx_fatal(FARGS, "%s If you want to run with this many OpenMP threads, specify the -ntomp option. But we suggest to increase the number of MPI ranks%s.", buf, mpi_option);
            }
        }
    }
#else /* GMX_OPENMP && GMX_MPI */
    /* No OpenMP and/or MPI: it doesn't make much sense to check */
    GMX_UNUSED_VALUE(bNtOmpOptionSet);
    GMX_UNUSED_VALUE(numTotalThreads);
    GMX_UNUSED_VALUE(willUsePhysicalGpu);
    GMX_UNUSED_VALUE(cr);
    /* Check if we have more than 1 physical core, if detected,
     * or more than 1 hardware thread if physical cores were not detected.
     */
    if (!GMX_OPENMP && !GMX_MPI && hwinfo->hardwareTopology->numberOfCores() > 1)
    {
        GMX_LOG(mdlog.warning).asParagraph().appendText("NOTE: GROMACS was compiled without OpenMP and (thread-)MPI support, can only use a single CPU core");
    }
#endif /* GMX_OPENMP && GMX_MPI */
}
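
/* Illustrative outcomes of the check above (not exhaustive): 16 ranks on a
 * 16-core GPU node give 1 OpenMP thread per rank, below
 * nthreads_omp_mpi_ok_min_gpu, and 2 ranks with 12 threads each exceed
 * nthreads_omp_mpi_ok_max; both trigger a note, or a fatal error when the
 * thread count was not set explicitly. */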

static void print_hw_opt(FILE *fp, const gmx_hw_opt_t *hw_opt)
{
    fprintf(fp, "hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n",
            hw_opt->nthreads_tot,
            hw_opt->nthreads_tmpi,
            hw_opt->nthreads_omp,
            hw_opt->nthreads_omp_pme,
            hw_opt->gpuIdTaskAssignment.c_str());
}

/* Checks we can do when we don't (yet) know the cut-off scheme */
void check_and_update_hw_opt_1(gmx_hw_opt_t    *hw_opt,
                               const t_commrec *cr,
                               int              nPmeRanks)
{
    /* Currently hw_opt only contains default settings or settings supplied
     * by the user on the command line.
     */
    if (hw_opt->nthreads_omp < 0)
    {
        gmx_fatal(FARGS, "The number of OpenMP threads supplied on the command line is %d, which is negative and not allowed", hw_opt->nthreads_omp);
    }

    /* Check for OpenMP settings stored in environment variables, which can
     * potentially be different on different MPI ranks.
     */
    gmx_omp_nthreads_read_env(&hw_opt->nthreads_omp, SIMMASTER(cr));

    /* Check restrictions on the user supplied options before modifying them.
     * TODO: Put the user values in a const struct and preserve them.
     */
#if !GMX_THREAD_MPI
    if (hw_opt->nthreads_tot > 0)
    {
        gmx_fatal(FARGS, "Setting the total number of threads is only supported with thread-MPI and GROMACS was compiled without thread-MPI");
    }
    if (hw_opt->nthreads_tmpi > 0)
    {
        gmx_fatal(FARGS, "Setting the number of thread-MPI ranks is only supported with thread-MPI and GROMACS was compiled without thread-MPI");
    }
#endif

    if (bHasOmpSupport)
    {
        /* Check restrictions on PME thread related options set by the user */

        if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0)
        {
            gmx_fatal(FARGS, "You need to specify -ntomp in addition to -ntomp_pme");
        }

        if (hw_opt->nthreads_omp_pme >= 1 &&
            hw_opt->nthreads_omp_pme != hw_opt->nthreads_omp &&
            nPmeRanks <= 0)
        {
            /* This can result in a fatal error on many MPI ranks,
             * but since the thread count can differ per rank,
             * we can't easily avoid this.
             */
            gmx_fatal(FARGS, "You need to explicitly specify the number of PME ranks (-npme) when using different numbers of OpenMP threads for PP and PME ranks");
        }
    }
    else
    {
        /* GROMACS was configured without OpenMP support */

        if (hw_opt->nthreads_omp > 1 || hw_opt->nthreads_omp_pme > 1)
        {
            gmx_fatal(FARGS, "More than 1 OpenMP thread requested, but GROMACS was compiled without OpenMP support");
        }
        hw_opt->nthreads_omp     = 1;
        hw_opt->nthreads_omp_pme = 1;
    }

    if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp_pme <= 0)
    {
        /* We have the same number of OpenMP threads for PP and PME ranks,
         * thus we can perform several consistency checks.
         */
        if (hw_opt->nthreads_tmpi > 0 &&
            hw_opt->nthreads_omp > 0 &&
            hw_opt->nthreads_tot != hw_opt->nthreads_tmpi*hw_opt->nthreads_omp)
        {
            gmx_fatal(FARGS, "The total number of threads requested (%d) does not match the thread-MPI ranks (%d) times the OpenMP threads (%d) requested",
                      hw_opt->nthreads_tot, hw_opt->nthreads_tmpi, hw_opt->nthreads_omp);
        }

        if (hw_opt->nthreads_tmpi > 0 &&
            hw_opt->nthreads_tot % hw_opt->nthreads_tmpi != 0)
        {
            gmx_fatal(FARGS, "The total number of threads requested (%d) is not divisible by the number of thread-MPI ranks requested (%d)",
                      hw_opt->nthreads_tot, hw_opt->nthreads_tmpi);
        }

        if (hw_opt->nthreads_omp > 0 &&
            hw_opt->nthreads_tot % hw_opt->nthreads_omp != 0)
        {
            gmx_fatal(FARGS, "The total number of threads requested (%d) is not divisible by the number of OpenMP threads requested (%d)",
                      hw_opt->nthreads_tot, hw_opt->nthreads_omp);
        }
    }

    if (hw_opt->nthreads_tot > 0)
    {
        if (hw_opt->nthreads_omp > hw_opt->nthreads_tot)
        {
            gmx_fatal(FARGS, "You requested %d OpenMP threads with %d total threads. Choose a total number of threads that is a multiple of the number of OpenMP threads.",
                      hw_opt->nthreads_omp, hw_opt->nthreads_tot);
        }

        if (hw_opt->nthreads_tmpi > hw_opt->nthreads_tot)
        {
            gmx_fatal(FARGS, "You requested %d thread-MPI ranks with %d total threads. Choose a total number of threads that is a multiple of the number of thread-MPI ranks.",
                      hw_opt->nthreads_tmpi, hw_opt->nthreads_tot);
        }
    }
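
    /* Example (illustrative): "mdrun -nt 16 -ntmpi 4 -ntomp 4" passes the
     * checks above (16 == 4*4), while "-nt 16 -ntmpi 3" fails the
     * divisibility check, and "-nt 8 -ntomp 16" is rejected because the
     * OpenMP thread count neither divides nor fits within the total. */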

    if (debug)
    {
        print_hw_opt(debug, hw_opt);
    }

    /* Asserting this simplifies the hardware resource division later
     * on. */
    GMX_RELEASE_ASSERT(!(hw_opt->nthreads_omp_pme >= 1 && hw_opt->nthreads_omp <= 0),
                       "PME thread count should only be set when the normal thread count is also set");
}

/* Checks we can do when we know the cut-off scheme */
void check_and_update_hw_opt_2(gmx_hw_opt_t *hw_opt,
                               int           cutoff_scheme)
{
    if (cutoff_scheme == ecutsGROUP)
    {
        /* We only have OpenMP support for PME-only nodes */
        if (hw_opt->nthreads_omp > 1)
        {
            gmx_fatal(FARGS, "OpenMP threads have been requested with cut-off scheme %s, but these are only supported with cut-off scheme %s",
                      ecutscheme_names[cutoff_scheme],
                      ecutscheme_names[ecutsVERLET]);
        }
        hw_opt->nthreads_omp = 1;
    }
}

/* Checks we can do when we know the thread-MPI rank count */
void check_and_update_hw_opt_3(gmx_hw_opt_t *hw_opt)
{
#if GMX_THREAD_MPI
    GMX_RELEASE_ASSERT(hw_opt->nthreads_tmpi >= 1, "Must have at least one thread-MPI rank");

    /* If the user set the total number of threads on the command line
     * and did not specify the number of OpenMP threads, set the latter here.
     */
    if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp <= 0)
    {
        hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;

        if (!bHasOmpSupport && hw_opt->nthreads_omp > 1)
        {
            gmx_fatal(FARGS, "You (indirectly) asked for OpenMP threads by setting -nt > -ntmpi, but GROMACS was compiled without OpenMP support");
        }
    }
#endif

    GMX_RELEASE_ASSERT(bHasOmpSupport || hw_opt->nthreads_omp == 1, "Without OpenMP support, only one thread per rank can be used");

    /* We are done with updating nthreads_omp, we can set nthreads_omp_pme */
    if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0)
    {
        hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp;
    }

    if (debug)
    {
        print_hw_opt(debug, hw_opt);
    }
}