Merge branch release-2016
[gromacs.git] / src / gromacs / taskassignment / resourcedivision.cpp
blobfecba7db1f3a23f016287c401e1c8113f9a1367e
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2015,2016,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
35 /*! \internal \file
36 * \brief Defines utility functionality for dividing resources and
37 * checking for consistency and usefulness.
39 * \author Mark Abraham <mark.j.abraham@gmail.com>
40 * \ingroup module_taskassignment
43 #include "gmxpre.h"
45 #include "resourcedivision.h"
47 #include "config.h"
49 #include <stdlib.h>
50 #include <string.h>
52 #include <algorithm>
54 #include "gromacs/hardware/cpuinfo.h"
55 #include "gromacs/hardware/detecthardware.h"
56 #include "gromacs/hardware/hardwaretopology.h"
57 #include "gromacs/hardware/hw_info.h"
58 #include "gromacs/mdlib/gmx_omp_nthreads.h"
59 #include "gromacs/mdtypes/commrec.h"
60 #include "gromacs/mdtypes/inputrec.h"
61 #include "gromacs/mdtypes/md_enums.h"
62 #include "gromacs/taskassignment/hardwareassign.h"
63 #include "gromacs/topology/mtop_util.h"
64 #include "gromacs/topology/topology.h"
65 #include "gromacs/utility/baseversion.h"
66 #include "gromacs/utility/fatalerror.h"
67 #include "gromacs/utility/gmxassert.h"
68 #include "gromacs/utility/logger.h"
69 #include "gromacs/utility/stringutil.h"
72 /* DISCLAIMER: All the atom count and thread numbers below are heuristic.
73 * The real switching points will depend on the system simulation,
74 * the algorithms used and the hardware it's running on, as well as if there
75 * are other jobs running on the same machine. We try to take into account
76 * factors that have a large influence, such as recent Intel CPUs being
77 * much better at wide multi-threading. The remaining factors should
78 * (hopefully) have a small influence, such that the performance just before
79 * and after a switch point doesn't change too much.
82 //! Constant used to help minimize preprocessed code
83 static const bool bHasOmpSupport = GMX_OPENMP;
85 #if GMX_THREAD_MPI
86 /* The minimum number of atoms per tMPI thread. With fewer atoms than this,
87 * the number of threads will get lowered.
89 static const int min_atoms_per_mpi_thread = 90;
90 static const int min_atoms_per_gpu = 900;
91 #endif /* GMX_THREAD_MPI */
93 /**@{*/
94 /*! \brief Constants for implementing default divisions of threads */
96 /* TODO choose nthreads_omp based on hardware topology
97 when we have a hardware topology detection library */
98 /* First we consider the case of no MPI (1 MPI rank).
99 * In general, when running up to 8 threads, OpenMP should be faster.
100 * Note: on AMD Bulldozer we should avoid running OpenMP over two dies.
101 * On Intel>=Nehalem running OpenMP on a single CPU is always faster,
102 * even on two CPUs it's usually faster (but with many OpenMP threads
103 * it could be faster not to use HT, currently we always use HT).
104 * On Nehalem/Westmere we want to avoid running 16 threads over
105 * two CPUs with HT, so we need a limit<16; thus we use 12.
106 * A reasonable limit for Intel Sandy and Ivy bridge,
107 * not knowing the topology, is 16 threads.
108 * Below we check for Intel and AVX, which for now includes
109 * Sandy/Ivy Bridge, Has/Broadwell. By checking for AVX instead of
110 * model numbers we ensure also future Intel CPUs are covered.
112 const int nthreads_omp_faster_default = 8;
113 const int nthreads_omp_faster_Nehalem = 12;
114 const int nthreads_omp_faster_Intel_AVX = 16;
115 const int nthreads_omp_faster_AMD_Ryzen = 16;
116 /* For CPU only runs the fastest options are usually MPI or OpenMP only.
117 * With one GPU, using MPI only is almost never optimal, so we need to
118 * compare running pure OpenMP with combined MPI+OpenMP. This means higher
119 * OpenMP threads counts can still be ok. Multiplying the numbers above
120 * by a factor of 2 seems to be a good estimate.
122 const int nthreads_omp_faster_gpu_fac = 2;
124 /* This is the case with MPI (2 or more MPI PP ranks).
125 * By default we will terminate with a fatal error when more than 8
126 * OpenMP thread are (indirectly) requested, since using less threads
127 * nearly always results in better performance.
128 * With thread-mpi and multiple GPUs or one GPU and too many threads
129 * we first try 6 OpenMP threads and then less until the number of MPI ranks
130 * is divisible by the number of GPUs.
132 #if GMX_OPENMP && GMX_MPI
133 const int nthreads_omp_mpi_ok_max = 8;
134 const int nthreads_omp_mpi_ok_min_cpu = 1;
135 #endif
136 const int nthreads_omp_mpi_ok_min_gpu = 2;
137 const int nthreads_omp_mpi_target_max = 6;
139 /**@}*/
141 /*! \brief Returns the maximum OpenMP thread count for which using a single MPI rank
142 * should be faster than using multiple ranks with the same total thread count.
144 static int nthreads_omp_faster(const gmx::CpuInfo &cpuInfo, gmx_bool bUseGPU)
146 int nth;
148 if (cpuInfo.vendor() == gmx::CpuInfo::Vendor::Intel &&
149 cpuInfo.feature(gmx::CpuInfo::Feature::X86_Avx))
151 nth = nthreads_omp_faster_Intel_AVX;
153 else if (gmx::cpuIsX86Nehalem(cpuInfo))
155 // Intel Nehalem
156 nth = nthreads_omp_faster_Nehalem;
158 else if (cpuInfo.vendor() == gmx::CpuInfo::Vendor::Amd && cpuInfo.family() >= 23)
160 // AMD Ryzen
161 nth = nthreads_omp_faster_AMD_Ryzen;
163 else
165 nth = nthreads_omp_faster_default;
168 if (bUseGPU)
170 nth *= nthreads_omp_faster_gpu_fac;
173 nth = std::min(nth, GMX_OPENMP_MAX_THREADS);
175 return nth;
178 /*! \brief Returns that maximum OpenMP thread count that passes the efficiency check */
179 gmx_unused static int nthreads_omp_efficient_max(int gmx_unused nrank,
180 const gmx::CpuInfo &cpuInfo,
181 gmx_bool bUseGPU)
183 #if GMX_OPENMP && GMX_MPI
184 if (nrank > 1)
186 return nthreads_omp_mpi_ok_max;
188 else
189 #endif
191 return nthreads_omp_faster(cpuInfo, bUseGPU);
195 /*! \brief Return the number of thread-MPI ranks to use.
196 * This is chosen such that we can always obey our own efficiency checks.
198 gmx_unused static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
199 const gmx_hw_opt_t &hw_opt,
200 int nthreads_tot,
201 int ngpu)
203 int nrank;
204 const gmx::CpuInfo &cpuInfo = *hwinfo->cpuInfo;
206 GMX_RELEASE_ASSERT(nthreads_tot > 0, "There must be at least one thread per rank");
208 /* There are no separate PME nodes here, as we ensured in
209 * check_and_update_hw_opt that nthreads_tmpi>0 with PME nodes
210 * and a conditional ensures we would not have ended up here.
211 * Note that separate PME nodes might be switched on later.
213 if (ngpu > 0)
215 nrank = ngpu;
217 /* When the user sets nthreads_omp, we can end up oversubscribing CPU cores
218 * if we simply start as many ranks as GPUs. To avoid this, we start as few
219 * tMPI ranks as necessary to avoid oversubscription and instead leave GPUs idle.
220 * If the user does not set the number of OpenMP threads, nthreads_omp==0 and
221 * this code has no effect.
223 GMX_RELEASE_ASSERT(hw_opt.nthreads_omp >= 0, "nthreads_omp is negative, but previous checks should have prevented this");
224 while (nrank*hw_opt.nthreads_omp > hwinfo->nthreads_hw_avail && nrank > 1)
226 nrank--;
229 if (nthreads_tot < nrank)
231 /* #thread < #gpu is very unlikely, but if so: waste gpu(s) */
232 nrank = nthreads_tot;
234 else if (nthreads_tot > nthreads_omp_faster(cpuInfo, ngpu > 0) ||
235 (ngpu > 1 && nthreads_tot/ngpu > nthreads_omp_mpi_target_max))
237 /* The high OpenMP thread count will likely result in sub-optimal
238 * performance. Increase the rank count to reduce the thread count
239 * per rank. This will lead to GPU sharing by MPI ranks/threads.
241 int nshare;
243 /* Increase the rank count as long as have we more than 6 OpenMP
244 * threads per rank or the number of hardware threads is not
245 * divisible by the rank count. Don't go below 2 OpenMP threads.
247 nshare = 1;
250 nshare++;
251 nrank = ngpu*nshare;
253 while (nthreads_tot/nrank > nthreads_omp_mpi_target_max ||
254 (nthreads_tot/(ngpu*(nshare + 1)) >= nthreads_omp_mpi_ok_min_gpu && nthreads_tot % nrank != 0));
257 else if (hw_opt.nthreads_omp > 0)
259 /* Here we could oversubscribe, when we do, we issue a warning later */
260 nrank = std::max(1, nthreads_tot/hw_opt.nthreads_omp);
262 else
264 if (nthreads_tot <= nthreads_omp_faster(cpuInfo, ngpu > 0))
266 /* Use pure OpenMP parallelization */
267 nrank = 1;
269 else
271 /* Don't use OpenMP parallelization */
272 nrank = nthreads_tot;
276 return nrank;
280 #if GMX_THREAD_MPI
282 static bool
283 gmxSmtIsEnabled(const gmx::HardwareTopology &hwTop)
285 return (hwTop.supportLevel() >= gmx::HardwareTopology::SupportLevel::Basic && hwTop.machine().sockets[0].cores[0].hwThreads.size() > 1);
288 namespace
291 class SingleRankChecker
293 public:
294 //! Constructor
295 SingleRankChecker() : value_(false), reasons_() {}
296 /*! \brief Call this function for each possible condition
297 under which a single rank is required, along with a string
298 describing the constraint when it is applied. */
299 void applyConstraint(bool condition, const char *description)
301 if (condition)
303 value_ = true;
304 reasons_.push_back(gmx::formatString("%s only supports a single rank.", description));
307 //! After applying any conditions, is a single rank required?
308 bool mustUseOneRank() const
310 return value_;
312 /*! \brief Return a formatted string to use when writing a
313 message when a single rank is required, (or empty if no
314 constraint exists.) */
315 std::string getMessage() const
317 return formatAndJoin(reasons_, "\n", gmx::IdentityFormatter());
319 private:
320 bool value_;
321 std::vector<std::string> reasons_;
324 } // namespace
326 /* Get the number of MPI ranks to use for thread-MPI based on how many
327 * were requested, which algorithms we're using,
328 * and how many particles there are.
329 * At the point we have already called check_and_update_hw_opt.
330 * Thus all options should be internally consistent and consistent
331 * with the hardware, except that ntmpi could be larger than #GPU.
333 int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
334 gmx_hw_opt_t *hw_opt,
335 const std::vector<int> &userGpuIds,
336 int numPmeRanks,
337 bool nonbondedOnGpu,
338 const t_inputrec *inputrec,
339 const gmx_mtop_t *mtop,
340 const gmx::MDLogger &mdlog,
341 bool doMembed)
343 int nthreads_hw, nthreads_tot_max, nrank, ngpu;
344 int min_atoms_per_mpi_rank;
346 const gmx::CpuInfo &cpuInfo = *hwinfo->cpuInfo;
347 const gmx::HardwareTopology &hwTop = *hwinfo->hardwareTopology;
349 /* If the user made a GPU task assignment, that sets the number of thread-MPI ranks. */
350 int numGpuIdsSupplied = static_cast<int>(userGpuIds.size());
352 /* TODO Here we handle the case where the user set GPU IDs, and
353 further below we handle the case where the algorithm does not
354 support multiple ranks. We need also to handle the case where
355 the user set multiple GPU IDs for an algorithm that cannot
356 handle multiple ranks. */
357 if (hw_opt->nthreads_tmpi < 1 && numGpuIdsSupplied > 0)
359 /* If the user chose both mdrun -nt -gpu_id, is that consistent? */
360 if (numPmeRanks <= 0)
362 if (hw_opt->nthreads_tot > 0 &&
363 (hw_opt->nthreads_tot % numGpuIdsSupplied) != 0)
365 gmx_fatal(FARGS, "Cannot run %d total threads with %d GPU ranks. Choose the total number of threads to be a multiple of the number of GPU ranks.", hw_opt->nthreads_tot, numGpuIdsSupplied);
367 return numGpuIdsSupplied;
369 else
371 gmx_fatal(FARGS, "The combination of choosing a number of PME ranks, and specific GPU IDs "
372 "is not supported. Use also -ntmpi and/or -ntomp and -ntomp_pme to specify what "
373 "distribution of threads to ranks you require.");
378 /* Check if an algorithm does not support parallel simulation. */
379 // TODO This might work better if e.g. implemented algorithms
380 // had to define a function that returns such requirements,
381 // and a description string.
382 SingleRankChecker checker;
383 checker.applyConstraint(inputrec->eI == eiLBFGS, "L-BFGS minimization");
384 checker.applyConstraint(inputrec->coulombtype == eelEWALD, "Plain Ewald electrostatics");
385 checker.applyConstraint(doMembed, "Membrane embedding");
386 bool useOrientationRestraints = (gmx_mtop_ftype_count(mtop, F_ORIRES) > 0);
387 checker.applyConstraint(useOrientationRestraints, "Orientation restraints");
388 if (checker.mustUseOneRank())
390 std::string message = checker.getMessage();
391 if (hw_opt->nthreads_tmpi > 1)
393 gmx_fatal(FARGS, "%s However, you asked for more than 1 thread-MPI rank, so mdrun cannot continue. Choose a single rank, or a different algorithm.", message.c_str());
395 GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted("%s Choosing to use only a single thread-MPI rank.", message.c_str());
397 if (numGpuIdsSupplied > 1)
399 gmx_fatal(FARGS, "You supplied %d GPU IDs but only 1 rank can be used "
400 "by this simulation. Supply only one GPU ID.", numGpuIdsSupplied);
402 return 1;
406 if (hw_opt->nthreads_tmpi > 0)
408 if (numPmeRanks <= 0)
410 int numPpRanks = hw_opt->nthreads_tmpi;
411 if ((numGpuIdsSupplied > 0) &&
412 (numGpuIdsSupplied != numPpRanks))
414 gmx_fatal(FARGS, "Cannot run %d thread-MPI total ranks with %d "
415 "GPU IDs supplied. The number of particle-particle (PP) ranks and the "
416 "number of GPU IDs must match.", hw_opt->nthreads_tmpi, numGpuIdsSupplied);
419 else
421 int numPpRanks = hw_opt->nthreads_tmpi - numPmeRanks;
422 if ((numGpuIdsSupplied > 0) &&
423 (numGpuIdsSupplied != numPpRanks))
425 gmx_fatal(FARGS, "Cannot run %d thread-MPI total ranks with %d PME ranks and %d "
426 "GPU IDs supplied. The number of particle-particle ranks and the "
427 "number of GPU IDs must match.", hw_opt->nthreads_tmpi, numPmeRanks, numGpuIdsSupplied);
430 /* Trivial, return the user's choice right away */
431 return hw_opt->nthreads_tmpi;
433 GMX_RELEASE_ASSERT(numGpuIdsSupplied == 0,
434 "If mdrun -gpu_id had information, the number of ranks should have already been chosen");
436 // Now implement automatic selection of number of thread-MPI ranks
437 nthreads_hw = hwinfo->nthreads_hw_avail;
439 if (nthreads_hw <= 0)
441 /* This should normally not happen, but if it does, we handle it */
442 gmx_fatal(FARGS, "The number of available hardware threads can not be detected, please specify the number of MPI ranks and the number of OpenMP threads (if supported) manually with options -ntmpi and -ntomp, respectively");
445 /* How many total (#tMPI*#OpenMP) threads can we start? */
446 if (hw_opt->nthreads_tot > 0)
448 nthreads_tot_max = hw_opt->nthreads_tot;
450 else
452 nthreads_tot_max = nthreads_hw;
455 /* nonbondedOnGpu might be false e.g. because this simulation uses
456 * the group scheme, or is a rerun with energy groups. */
457 ngpu = (nonbondedOnGpu ? hwinfo->gpu_info.n_dev_compatible : 0);
459 if (inputrec->cutoff_scheme == ecutsGROUP)
461 /* We checked this before, but it doesn't hurt to do it once more */
462 GMX_RELEASE_ASSERT(hw_opt->nthreads_omp == 1, "The group scheme only supports one OpenMP thread per rank");
465 nrank =
466 get_tmpi_omp_thread_division(hwinfo, *hw_opt, nthreads_tot_max, ngpu);
468 if (inputrec->eI == eiNM || EI_TPI(inputrec->eI))
470 /* Dims/steps are divided over the nodes iso splitting the atoms.
471 * With NM we can't have more ranks than #atoms*#dim. With TPI it's
472 * unlikely we have fewer atoms than ranks, and if so, communication
473 * would become a bottleneck, so we set the limit to 1 atom/rank.
475 min_atoms_per_mpi_rank = 1;
477 else
479 if (ngpu >= 1)
481 min_atoms_per_mpi_rank = min_atoms_per_gpu;
483 else
485 min_atoms_per_mpi_rank = min_atoms_per_mpi_thread;
489 if (mtop->natoms/nrank < min_atoms_per_mpi_rank)
491 int nrank_new;
493 /* the rank number was chosen automatically, but there are too few
494 atoms per rank, so we need to reduce the rank count */
495 nrank_new = std::max(1, mtop->natoms/min_atoms_per_mpi_rank);
497 /* Avoid partial use of Hyper-Threading */
498 if (gmxSmtIsEnabled(hwTop) &&
499 nrank_new > nthreads_hw/2 && nrank_new < nthreads_hw)
501 nrank_new = nthreads_hw/2;
504 /* If the user specified the total thread count, ensure this is
505 * divisible by the number of ranks.
506 * It is quite likely that we have too many total threads compared
507 * to the size of the system, but if the user asked for this many
508 * threads we should respect that.
510 while (hw_opt->nthreads_tot > 0 &&
511 hw_opt->nthreads_tot % nrank_new != 0)
513 nrank_new--;
516 /* Avoid large prime numbers in the rank count */
517 if (nrank_new >= 6)
519 /* Use only 6,8,10 with additional factors of 2 */
520 int fac;
522 fac = 2;
523 while (3*fac*2 <= nrank_new)
525 fac *= 2;
528 nrank_new = (nrank_new/fac)*fac;
530 else
532 /* Avoid 5, since small system won't fit 5 domains along
533 * a dimension. This might lead to waisting some cores, but this
534 * will have a small impact in this regime of very small systems.
536 if (nrank_new == 5)
538 nrank_new = 4;
542 nrank = nrank_new;
544 /* We reduced the number of tMPI ranks, which means we might violate
545 * our own efficiency checks if we simply use all hardware threads.
547 if (bHasOmpSupport && hw_opt->nthreads_omp <= 0 && hw_opt->nthreads_tot <= 0)
549 /* The user set neither the total nor the OpenMP thread count,
550 * we should use all hardware threads, unless we will violate
551 * our own efficiency limitation on the thread count.
553 int nt_omp_max;
555 nt_omp_max = nthreads_omp_efficient_max(nrank, cpuInfo, ngpu >= 1);
557 if (nrank*nt_omp_max < hwinfo->nthreads_hw_avail)
559 /* Limit the number of OpenMP threads to start */
560 hw_opt->nthreads_omp = nt_omp_max;
564 fprintf(stderr, "\n");
565 fprintf(stderr, "NOTE: Parallelization is limited by the small number of atoms,\n");
566 fprintf(stderr, " only starting %d thread-MPI ranks.\n", nrank);
567 fprintf(stderr, " You can use the -nt and/or -ntmpi option to optimize the number of threads.\n\n");
570 return nrank;
572 #endif /* GMX_THREAD_MPI */
575 void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
576 int numTotalThreads,
577 bool willUsePhysicalGpu,
578 gmx_bool bNtOmpOptionSet,
579 t_commrec *cr,
580 const gmx::MDLogger &mdlog)
582 #if GMX_OPENMP && GMX_MPI
583 int nth_omp_min, nth_omp_max;
584 char buf[1000];
585 #if GMX_THREAD_MPI
586 const char *mpi_option = " (option -ntmpi)";
587 #else
588 const char *mpi_option = "";
589 #endif
591 /* This function should be called after thread-MPI (when configured) and
592 * OpenMP have been initialized. Check that here.
594 #if GMX_THREAD_MPI
595 GMX_RELEASE_ASSERT(nthreads_omp_faster_default >= nthreads_omp_mpi_ok_max, "Inconsistent OpenMP thread count default values");
596 #endif
597 GMX_RELEASE_ASSERT(gmx_omp_nthreads_get(emntDefault) >= 1, "Must have at least one OpenMP thread");
599 nth_omp_min = gmx_omp_nthreads_get(emntDefault);
600 nth_omp_max = gmx_omp_nthreads_get(emntDefault);
602 bool anyRankIsUsingGpus = willUsePhysicalGpu;
603 /* Thread-MPI seems to have a bug with reduce on 1 node, so use a cond. */
604 if (cr->nnodes + cr->npmenodes > 1)
606 int count[3], count_max[3];
608 count[0] = -nth_omp_min;
609 count[1] = nth_omp_max;
610 count[2] = willUsePhysicalGpu;
612 MPI_Allreduce(count, count_max, 3, MPI_INT, MPI_MAX, cr->mpi_comm_mysim);
614 /* In case of an inhomogeneous run setup we use the maximum counts */
615 nth_omp_min = -count_max[0];
616 nth_omp_max = count_max[1];
617 anyRankIsUsingGpus = count_max[2] > 0;
620 int nthreads_omp_mpi_ok_min;
622 if (!anyRankIsUsingGpus)
624 nthreads_omp_mpi_ok_min = nthreads_omp_mpi_ok_min_cpu;
626 else
628 /* With GPUs we set the minimum number of OpenMP threads to 2 to catch
629 * cases where the user specifies #ranks == #cores.
631 nthreads_omp_mpi_ok_min = nthreads_omp_mpi_ok_min_gpu;
634 if (DOMAINDECOMP(cr) && cr->nnodes > 1)
636 if (nth_omp_max < nthreads_omp_mpi_ok_min ||
637 nth_omp_max > nthreads_omp_mpi_ok_max)
639 /* Note that we print target_max here, not ok_max */
640 sprintf(buf, "Your choice of number of MPI ranks and amount of resources results in using %d OpenMP threads per rank, which is most likely inefficient. The optimum is usually between %d and %d threads per rank.",
641 nth_omp_max,
642 nthreads_omp_mpi_ok_min,
643 nthreads_omp_mpi_target_max);
645 if (bNtOmpOptionSet)
647 GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted("NOTE: %s", buf);
649 else
651 /* This fatal error, and the one below, is nasty, but it's
652 * probably the only way to ensure that all users don't waste
653 * a lot of resources, since many users don't read logs/stderr.
655 gmx_fatal(FARGS, "%s If you want to run with this setup, specify the -ntomp option. But we suggest to change the number of MPI ranks%s.", buf, mpi_option);
659 else
661 const gmx::CpuInfo &cpuInfo = *hwinfo->cpuInfo;
663 /* No domain decomposition (or only one domain) */
664 if (nth_omp_max > nthreads_omp_faster(cpuInfo, anyRankIsUsingGpus))
666 /* To arrive here, the user/system set #ranks and/or #OMPthreads */
667 gmx_bool bEnvSet;
668 char buf2[256];
670 bEnvSet = (getenv("OMP_NUM_THREADS") != nullptr);
672 if (bNtOmpOptionSet || bEnvSet)
674 sprintf(buf2, "You requested %d OpenMP threads", nth_omp_max);
676 else
678 sprintf(buf2, "Your choice of %d MPI rank%s and the use of %d total threads %sleads to the use of %d OpenMP threads",
679 cr->nnodes + cr->npmenodes,
680 cr->nnodes + cr->npmenodes == 1 ? "" : "s",
681 numTotalThreads > 0 ? numTotalThreads : hwinfo->nthreads_hw_avail,
682 hwinfo->nphysicalnode > 1 ? "on a node " : "",
683 nth_omp_max);
685 sprintf(buf, "%s, whereas we expect the optimum to be with more MPI ranks with %d to %d OpenMP threads.",
686 buf2, nthreads_omp_mpi_ok_min, nthreads_omp_mpi_target_max);
688 /* We can not quit with a fatal error when OMP_NUM_THREADS is set
689 * with different values per rank or node, since in that case
690 * the user can not set -ntomp to override the error.
692 if (bNtOmpOptionSet || (bEnvSet && nth_omp_min != nth_omp_max))
694 GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted("NOTE: %s", buf);
696 else
698 gmx_fatal(FARGS, "%s If you want to run with this many OpenMP threads, specify the -ntomp option. But we suggest to increase the number of MPI ranks%s.", buf, mpi_option);
702 #else /* GMX_OPENMP && GMX_MPI */
703 /* No OpenMP and/or MPI: it doesn't make much sense to check */
704 GMX_UNUSED_VALUE(bNtOmpOptionSet);
705 GMX_UNUSED_VALUE(numTotalThreads);
706 GMX_UNUSED_VALUE(willUsePhysicalGpu);
707 GMX_UNUSED_VALUE(cr);
708 /* Check if we have more than 1 physical core, if detected,
709 * or more than 1 hardware thread if physical cores were not detected.
711 if (!GMX_OPENMP && !GMX_MPI && hwinfo->hardwareTopology->numberOfCores() > 1)
713 GMX_LOG(mdlog.warning).asParagraph().appendText("NOTE: GROMACS was compiled without OpenMP and (thread-)MPI support, can only use a single CPU core");
715 #endif /* GMX_OPENMP && GMX_MPI */
719 //! Dump a \c hw_opt to \c fp.
720 static void print_hw_opt(FILE *fp, const gmx_hw_opt_t *hw_opt)
722 fprintf(fp, "hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n",
723 hw_opt->nthreads_tot,
724 hw_opt->nthreads_tmpi,
725 hw_opt->nthreads_omp,
726 hw_opt->nthreads_omp_pme,
727 hw_opt->gpuIdTaskAssignment.c_str());
730 void check_and_update_hw_opt_1(gmx_hw_opt_t *hw_opt,
731 const t_commrec *cr,
732 int nPmeRanks)
734 /* Currently hw_opt only contains default settings or settings supplied
735 * by the user on the command line.
737 if (hw_opt->nthreads_omp < 0)
739 gmx_fatal(FARGS, "The number of OpenMP threads supplied on the command line is %d, which is negative and not allowed", hw_opt->nthreads_omp);
742 /* Check for OpenMP settings stored in environment variables, which can
743 * potentially be different on different MPI ranks.
745 gmx_omp_nthreads_read_env(&hw_opt->nthreads_omp, SIMMASTER(cr));
747 /* Check restrictions on the user supplied options before modifying them.
748 * TODO: Put the user values in a const struct and preserve them.
750 #if !GMX_THREAD_MPI
751 if (hw_opt->nthreads_tot > 0)
753 gmx_fatal(FARGS, "Setting the total number of threads is only supported with thread-MPI and GROMACS was compiled without thread-MPI");
755 if (hw_opt->nthreads_tmpi > 0)
757 gmx_fatal(FARGS, "Setting the number of thread-MPI ranks is only supported with thread-MPI and GROMACS was compiled without thread-MPI");
759 #endif
761 if (bHasOmpSupport)
763 /* Check restrictions on PME thread related options set by the user */
765 if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0)
767 gmx_fatal(FARGS, "You need to specify -ntomp in addition to -ntomp_pme");
770 if (hw_opt->nthreads_omp_pme >= 1 &&
771 hw_opt->nthreads_omp_pme != hw_opt->nthreads_omp &&
772 nPmeRanks <= 0)
774 /* This can result in a fatal error on many MPI ranks,
775 * but since the thread count can differ per rank,
776 * we can't easily avoid this.
778 gmx_fatal(FARGS, "You need to explicitly specify the number of PME ranks (-npme) when using different number of OpenMP threads for PP and PME ranks");
781 else
783 /* GROMACS was configured without OpenMP support */
785 if (hw_opt->nthreads_omp > 1 || hw_opt->nthreads_omp_pme > 1)
787 gmx_fatal(FARGS, "More than 1 OpenMP thread requested, but GROMACS was compiled without OpenMP support");
789 hw_opt->nthreads_omp = 1;
790 hw_opt->nthreads_omp_pme = 1;
793 if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp_pme <= 0)
795 /* We have the same number of OpenMP threads for PP and PME ranks,
796 * thus we can perform several consistency checks.
798 if (hw_opt->nthreads_tmpi > 0 &&
799 hw_opt->nthreads_omp > 0 &&
800 hw_opt->nthreads_tot != hw_opt->nthreads_tmpi*hw_opt->nthreads_omp)
802 gmx_fatal(FARGS, "The total number of threads requested (%d) does not match the thread-MPI ranks (%d) times the OpenMP threads (%d) requested",
803 hw_opt->nthreads_tot, hw_opt->nthreads_tmpi, hw_opt->nthreads_omp);
806 if (hw_opt->nthreads_tmpi > 0 &&
807 hw_opt->nthreads_tot % hw_opt->nthreads_tmpi != 0)
809 gmx_fatal(FARGS, "The total number of threads requested (%d) is not divisible by the number of thread-MPI ranks requested (%d)",
810 hw_opt->nthreads_tot, hw_opt->nthreads_tmpi);
813 if (hw_opt->nthreads_omp > 0 &&
814 hw_opt->nthreads_tot % hw_opt->nthreads_omp != 0)
816 gmx_fatal(FARGS, "The total number of threads requested (%d) is not divisible by the number of OpenMP threads requested (%d)",
817 hw_opt->nthreads_tot, hw_opt->nthreads_omp);
821 if (hw_opt->nthreads_tot > 0)
823 if (hw_opt->nthreads_omp > hw_opt->nthreads_tot)
825 gmx_fatal(FARGS, "You requested %d OpenMP threads with %d total threads. Choose a total number of threads that is a multiple of the number of OpenMP threads.",
826 hw_opt->nthreads_omp, hw_opt->nthreads_tot);
829 if (hw_opt->nthreads_tmpi > hw_opt->nthreads_tot)
831 gmx_fatal(FARGS, "You requested %d thread-MPI ranks with %d total threads. Choose a total number of threads that is a multiple of the number of thread-MPI ranks.",
832 hw_opt->nthreads_tmpi, hw_opt->nthreads_tot);
836 if (debug)
838 print_hw_opt(debug, hw_opt);
841 /* Asserting this simplifies the hardware resource division later
842 * on. */
843 GMX_RELEASE_ASSERT(!(hw_opt->nthreads_omp_pme >= 1 && hw_opt->nthreads_omp <= 0),
844 "PME thread count should only be set when the normal thread count is also set");
847 void check_and_update_hw_opt_2(gmx_hw_opt_t *hw_opt,
848 int cutoff_scheme)
850 if (cutoff_scheme == ecutsGROUP)
852 /* We only have OpenMP support for PME only nodes */
853 if (hw_opt->nthreads_omp > 1)
855 gmx_fatal(FARGS, "OpenMP threads have been requested with cut-off scheme %s, but these are only supported with cut-off scheme %s",
856 ecutscheme_names[cutoff_scheme],
857 ecutscheme_names[ecutsVERLET]);
859 hw_opt->nthreads_omp = 1;
863 void check_and_update_hw_opt_3(gmx_hw_opt_t *hw_opt)
865 #if GMX_THREAD_MPI
866 GMX_RELEASE_ASSERT(hw_opt->nthreads_tmpi >= 1, "Must have at least one thread-MPI rank");
868 /* If the user set the total number of threads on the command line
869 * and did not specify the number of OpenMP threads, set the latter here.
871 if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp <= 0)
873 hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
875 if (!bHasOmpSupport && hw_opt->nthreads_omp > 1)
877 gmx_fatal(FARGS, "You (indirectly) asked for OpenMP threads by setting -nt > -ntmpi, but GROMACS was compiled without OpenMP support");
880 #endif
882 GMX_RELEASE_ASSERT(bHasOmpSupport || hw_opt->nthreads_omp == 1, "Without OpenMP support, only one thread per rank can be used");
884 /* We are done with updating nthreads_omp, we can set nthreads_omp_pme */
885 if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0)
887 hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp;
890 if (debug)
892 print_hw_opt(debug, hw_opt);