src/gromacs/taskassignment/resourcedivision.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2015,2016,2017, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \internal \file
  36  * \brief Defines utility functionality for dividing resources and
  37  * checking for consistency and usefulness.
  38  *
  39  * \author Mark Abraham <mark.j.abraham@gmail.com>
  40  * \ingroup module_taskassignment
  41  */
  42
  43 #include "gmxpre.h"
  44
  45 #include "resourcedivision.h"
  46
  47 #include "config.h"
  48
  49 #include <stdlib.h>
  50 #include <string.h>
  51
  52 #include <algorithm>
  53
  54 #include "gromacs/hardware/cpuinfo.h"
  55 #include "gromacs/hardware/detecthardware.h"
  56 #include "gromacs/hardware/hardwaretopology.h"
  57 #include "gromacs/hardware/hw_info.h"
  58 #include "gromacs/mdlib/gmx_omp_nthreads.h"
  59 #include "gromacs/mdtypes/commrec.h"
  60 #include "gromacs/mdtypes/inputrec.h"
  61 #include "gromacs/mdtypes/md_enums.h"
  62 #include "gromacs/taskassignment/hardwareassign.h"
  63 #include "gromacs/topology/mtop_util.h"
  64 #include "gromacs/topology/topology.h"
  65 #include "gromacs/utility/baseversion.h"
  66 #include "gromacs/utility/fatalerror.h"
  67 #include "gromacs/utility/gmxassert.h"
  68 #include "gromacs/utility/logger.h"
  69 #include "gromacs/utility/stringutil.h"
  70
  71
  72 /* DISCLAIMER: All the atom count and thread numbers below are heuristic.
  73  * The real switching points will depend on the system simulation,
  74  * the algorithms used and the hardware it's running on, as well as if there
  75  * are other jobs running on the same machine. We try to take into account
  76  * factors that have a large influence, such as recent Intel CPUs being
  77  * much better at wide multi-threading. The remaining factors should
  78  * (hopefully) have a small influence, such that the performance just before
  79  * and after a switch point doesn't change too much.
  80  */
  81
  82 //! Constant used to help minimize preprocessed code
  83 static const bool bHasOmpSupport = GMX_OPENMP;
  84
  85 #if GMX_THREAD_MPI
  86 /* The minimum number of atoms per tMPI thread. With fewer atoms than this,
  87  * the number of threads will get lowered.
  88  */
  89 static const int min_atoms_per_mpi_thread =  90;
  90 static const int min_atoms_per_gpu        = 900;
  91 #endif /* GMX_THREAD_MPI */
  92
  93 /**@{*/
  94 /*! \brief Constants for implementing default divisions of threads */
  95
  96 /* TODO choose nthreads_omp based on hardware topology
  97    when we have a hardware topology detection library */
  98 /* First we consider the case of no MPI (1 MPI rank).
  99  * In general, when running up to 8 threads, OpenMP should be faster.
 100  * Note: on AMD Bulldozer we should avoid running OpenMP over two dies.
 101  * On Intel>=Nehalem running OpenMP on a single CPU is always faster,
 102  * even on two CPUs it's usually faster (but with many OpenMP threads
 103  * it could be faster not to use HT, currently we always use HT).
 104  * On Nehalem/Westmere we want to avoid running 16 threads over
 105  * two CPUs with HT, so we need a limit<16; thus we use 12.
 106  * A reasonable limit for Intel Sandy and Ivy bridge,
 107  * not knowing the topology, is 16 threads.
 108  * Below we check for Intel and AVX, which for now includes
 109  * Sandy/Ivy Bridge, Has/Broadwell. By checking for AVX instead of
 110  * model numbers we ensure also future Intel CPUs are covered.
 111  */
 112 const int nthreads_omp_faster_default   =  8;
 113 const int nthreads_omp_faster_Nehalem   = 12;
 114 const int nthreads_omp_faster_Intel_AVX = 16;
 115 const int nthreads_omp_faster_AMD_Ryzen = 16;
 116 /* For CPU only runs the fastest options are usually MPI or OpenMP only.
 117  * With one GPU, using MPI only is almost never optimal, so we need to
 118  * compare running pure OpenMP with combined MPI+OpenMP. This means higher
 119  * OpenMP threads counts can still be ok. Multiplying the numbers above
 120  * by a factor of 2 seems to be a good estimate.
 121  */
 122 const int nthreads_omp_faster_gpu_fac   =  2;
 123
 124 /* This is the case with MPI (2 or more MPI PP ranks).
 125  * By default we will terminate with a fatal error when more than 8
 126  * OpenMP thread are (indirectly) requested, since using less threads
 127  * nearly always results in better performance.
 128  * With thread-mpi and multiple GPUs or one GPU and too many threads
 129  * we first try 6 OpenMP threads and then less until the number of MPI ranks
 130  * is divisible by the number of GPUs.
 131  */
 132 #if GMX_OPENMP && GMX_MPI
 133 const int nthreads_omp_mpi_ok_max              =  8;
 134 const int nthreads_omp_mpi_ok_min_cpu          =  1;
 135 #endif
 136 const int nthreads_omp_mpi_ok_min_gpu          =  2;
 137 const int nthreads_omp_mpi_target_max          =  6;
 138
 139 /**@}*/
 140
 141 /*! \brief Returns the maximum OpenMP thread count for which using a single MPI rank
 142  * should be faster than using multiple ranks with the same total thread count.
 143  */
 144 static int nthreads_omp_faster(const gmx::CpuInfo &cpuInfo, gmx_bool bUseGPU)
 145 {
 146     int nth;
 147
 148     if (cpuInfo.vendor() == gmx::CpuInfo::Vendor::Intel &&
 149         cpuInfo.feature(gmx::CpuInfo::Feature::X86_Avx))
 150     {
 151         nth = nthreads_omp_faster_Intel_AVX;
 152     }
 153     else if (gmx::cpuIsX86Nehalem(cpuInfo))
 154     {
 155         // Intel Nehalem
 156         nth = nthreads_omp_faster_Nehalem;
 157     }
 158     else if (cpuInfo.vendor() == gmx::CpuInfo::Vendor::Amd && cpuInfo.family() >= 23)
 159     {
 160         // AMD Ryzen
 161         nth = nthreads_omp_faster_AMD_Ryzen;
 162     }
 163     else
 164     {
 165         nth = nthreads_omp_faster_default;
 166     }
 167
 168     if (bUseGPU)
 169     {
 170         nth *= nthreads_omp_faster_gpu_fac;
 171     }
 172
 173     nth = std::min(nth, GMX_OPENMP_MAX_THREADS);
 174
 175     return nth;
 176 }
 177
 178 /*! \brief Returns that maximum OpenMP thread count that passes the efficiency check */
 179 gmx_unused static int nthreads_omp_efficient_max(int gmx_unused       nrank,
 180                                                  const gmx::CpuInfo  &cpuInfo,
 181                                                  gmx_bool             bUseGPU)
 182 {
 183 #if GMX_OPENMP && GMX_MPI
 184     if (nrank > 1)
 185     {
 186         return nthreads_omp_mpi_ok_max;
 187     }
 188     else
 189 #endif
 190     {
 191         return nthreads_omp_faster(cpuInfo, bUseGPU);
 192     }
 193 }
 194
 195 /*! \brief Return the number of thread-MPI ranks to use.
 196  * This is chosen such that we can always obey our own efficiency checks.
 197  */
 198 gmx_unused static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
 199                                                    const gmx_hw_opt_t  &hw_opt,
 200                                                    int                  nthreads_tot,
 201                                                    int                  ngpu)
 202 {
 203     int                 nrank;
 204     const gmx::CpuInfo &cpuInfo = *hwinfo->cpuInfo;
 205
 206     GMX_RELEASE_ASSERT(nthreads_tot > 0, "There must be at least one thread per rank");
 207
 208     /* There are no separate PME nodes here, as we ensured in
 209      * check_and_update_hw_opt that nthreads_tmpi>0 with PME nodes
 210      * and a conditional ensures we would not have ended up here.
 211      * Note that separate PME nodes might be switched on later.
 212      */
 213     if (ngpu > 0)
 214     {
 215         nrank = ngpu;
 216
 217         /* When the user sets nthreads_omp, we can end up oversubscribing CPU cores
 218          * if we simply start as many ranks as GPUs. To avoid this, we start as few
 219          * tMPI ranks as necessary to avoid oversubscription and instead leave GPUs idle.
 220          * If the user does not set the number of OpenMP threads, nthreads_omp==0 and
 221          * this code has no effect.
 222          */
 223         GMX_RELEASE_ASSERT(hw_opt.nthreads_omp >= 0, "nthreads_omp is negative, but previous checks should have prevented this");
 224         while (nrank*hw_opt.nthreads_omp > hwinfo->nthreads_hw_avail && nrank > 1)
 225         {
 226             nrank--;
 227         }
 228
 229         if (nthreads_tot < nrank)
 230         {
 231             /* #thread < #gpu is very unlikely, but if so: waste gpu(s) */
 232             nrank = nthreads_tot;
 233         }
 234         else if (nthreads_tot > nthreads_omp_faster(cpuInfo, ngpu > 0) ||
 235                  (ngpu > 1 && nthreads_tot/ngpu > nthreads_omp_mpi_target_max))
 236         {
 237             /* The high OpenMP thread count will likely result in sub-optimal
 238              * performance. Increase the rank count to reduce the thread count
 239              * per rank. This will lead to GPU sharing by MPI ranks/threads.
 240              */
 241             int nshare;
 242
 243             /* Increase the rank count as long as have we more than 6 OpenMP
 244              * threads per rank or the number of hardware threads is not
 245              * divisible by the rank count. Don't go below 2 OpenMP threads.
 246              */
 247             nshare = 1;
 248             do
 249             {
 250                 nshare++;
 251                 nrank = ngpu*nshare;
 252             }
 253             while (nthreads_tot/nrank > nthreads_omp_mpi_target_max ||
 254                    (nthreads_tot/(ngpu*(nshare + 1)) >= nthreads_omp_mpi_ok_min_gpu && nthreads_tot % nrank != 0));
 255         }
 256     }
 257     else if (hw_opt.nthreads_omp > 0)
 258     {
 259         /* Here we could oversubscribe, when we do, we issue a warning later */
 260         nrank = std::max(1, nthreads_tot/hw_opt.nthreads_omp);
 261     }
 262     else
 263     {
 264         if (nthreads_tot <= nthreads_omp_faster(cpuInfo, ngpu > 0))
 265         {
 266             /* Use pure OpenMP parallelization */
 267             nrank = 1;
 268         }
 269         else
 270         {
 271             /* Don't use OpenMP parallelization */
 272             nrank = nthreads_tot;
 273         }
 274     }
 275
 276     return nrank;
 277 }
 278
 279
 280 #if GMX_THREAD_MPI
 281
 282 static bool
 283 gmxSmtIsEnabled(const gmx::HardwareTopology &hwTop)
 284 {
 285     return (hwTop.supportLevel() >= gmx::HardwareTopology::SupportLevel::Basic && hwTop.machine().sockets[0].cores[0].hwThreads.size() > 1);
 286 }
 287
 288 namespace
 289 {
 290
 291 class SingleRankChecker
 292 {
 293     public:
 294         //! Constructor
 295         SingleRankChecker() : value_(false), reasons_() {}
 296         /*! \brief Call this function for each possible condition
 297             under which a single rank is required, along with a string
 298             describing the constraint when it is applied. */
 299         void applyConstraint(bool condition, const char *description)
 300         {
 301             if (condition)
 302             {
 303                 value_ = true;
 304                 reasons_.push_back(gmx::formatString("%s only supports a single rank.", description));
 305             }
 306         }
 307         //! After applying any conditions, is a single rank required?
 308         bool mustUseOneRank() const
 309         {
 310             return value_;
 311         }
 312         /*! \brief Return a formatted string to use when writing a
 313             message when a single rank is required, (or empty if no
 314             constraint exists.) */
 315         std::string getMessage() const
 316         {
 317             return formatAndJoin(reasons_, "\n", gmx::IdentityFormatter());
 318         }
 319     private:
 320         bool                     value_;
 321         std::vector<std::string> reasons_;
 322 };
 323
 324 } // namespace
 325
 326 /* Get the number of MPI ranks to use for thread-MPI based on how many
 327  * were requested, which algorithms we're using,
 328  * and how many particles there are.
 329  * At the point we have already called check_and_update_hw_opt.
 330  * Thus all options should be internally consistent and consistent
 331  * with the hardware, except that ntmpi could be larger than #GPU.
 332  */
 333 int get_nthreads_mpi(const gmx_hw_info_t    *hwinfo,
 334                      gmx_hw_opt_t           *hw_opt,
 335                      const std::vector<int> &userGpuIds,
 336                      int                     numPmeRanks,
 337                      bool                    nonbondedOnGpu,
 338                      const t_inputrec       *inputrec,
 339                      const gmx_mtop_t       *mtop,
 340                      const gmx::MDLogger    &mdlog,
 341                      bool                    doMembed)
 342 {
 343     int                          nthreads_hw, nthreads_tot_max, nrank, ngpu;
 344     int                          min_atoms_per_mpi_rank;
 345
 346     const gmx::CpuInfo          &cpuInfo = *hwinfo->cpuInfo;
 347     const gmx::HardwareTopology &hwTop   = *hwinfo->hardwareTopology;
 348
 349     /* If the user made a GPU task assignment, that sets the number of thread-MPI ranks. */
 350     int  numGpuIdsSupplied = static_cast<int>(userGpuIds.size());
 351
 352     /* TODO Here we handle the case where the user set GPU IDs, and
 353        further below we handle the case where the algorithm does not
 354        support multiple ranks. We need also to handle the case where
 355        the user set multiple GPU IDs for an algorithm that cannot
 356        handle multiple ranks. */
 357     if (hw_opt->nthreads_tmpi < 1 && numGpuIdsSupplied > 0)
 358     {
 359         /* If the user chose both mdrun -nt -gpu_id, is that consistent? */
 360         if (numPmeRanks <= 0)
 361         {
 362             if (hw_opt->nthreads_tot > 0 &&
 363                 (hw_opt->nthreads_tot % numGpuIdsSupplied) != 0)
 364             {
 365                 gmx_fatal(FARGS, "Cannot run %d total threads with %d GPU ranks. Choose the total number of threads to be a multiple of the number of GPU ranks.", hw_opt->nthreads_tot, numGpuIdsSupplied);
 366             }
 367             return numGpuIdsSupplied;
 368         }
 369         else
 370         {
 371             gmx_fatal(FARGS, "The combination of choosing a number of PME ranks, and specific GPU IDs "
 372                       "is not supported. Use also -ntmpi and/or -ntomp and -ntomp_pme to specify what "
 373                       "distribution of threads to ranks you require.");
 374         }
 375     }
 376
 377     {
 378         /* Check if an algorithm does not support parallel simulation.  */
 379         // TODO This might work better if e.g. implemented algorithms
 380         // had to define a function that returns such requirements,
 381         // and a description string.
 382         SingleRankChecker checker;
 383         checker.applyConstraint(inputrec->eI == eiLBFGS, "L-BFGS minimization");
 384         checker.applyConstraint(inputrec->coulombtype == eelEWALD, "Plain Ewald electrostatics");
 385         checker.applyConstraint(doMembed, "Membrane embedding");
 386         bool useOrientationRestraints = (gmx_mtop_ftype_count(mtop, F_ORIRES) > 0);
 387         checker.applyConstraint(useOrientationRestraints, "Orientation restraints");
 388         if (checker.mustUseOneRank())
 389         {
 390             std::string message = checker.getMessage();
 391             if (hw_opt->nthreads_tmpi > 1)
 392             {
 393                 gmx_fatal(FARGS, "%s However, you asked for more than 1 thread-MPI rank, so mdrun cannot continue. Choose a single rank, or a different algorithm.", message.c_str());
 394             }
 395             GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted("%s Choosing to use only a single thread-MPI rank.", message.c_str());
 396
 397             if (numGpuIdsSupplied > 1)
 398             {
 399                 gmx_fatal(FARGS, "You supplied %d GPU IDs but only 1 rank can be used "
 400                           "by this simulation. Supply only one GPU ID.", numGpuIdsSupplied);
 401             }
 402             return 1;
 403         }
 404     }
 405
 406     if (hw_opt->nthreads_tmpi > 0)
 407     {
 408         if (numPmeRanks <= 0)
 409         {
 410             int numPpRanks = hw_opt->nthreads_tmpi;
 411             if ((numGpuIdsSupplied > 0) &&
 412                 (numGpuIdsSupplied != numPpRanks))
 413             {
 414                 gmx_fatal(FARGS, "Cannot run %d thread-MPI total ranks with %d "
 415                           "GPU IDs supplied. The number of particle-particle (PP) ranks and the "
 416                           "number of GPU IDs must match.", hw_opt->nthreads_tmpi, numGpuIdsSupplied);
 417             }
 418         }
 419         else
 420         {
 421             int numPpRanks = hw_opt->nthreads_tmpi - numPmeRanks;
 422             if ((numGpuIdsSupplied > 0) &&
 423                 (numGpuIdsSupplied != numPpRanks))
 424             {
 425                 gmx_fatal(FARGS, "Cannot run %d thread-MPI total ranks with %d PME ranks and %d "
 426                           "GPU IDs supplied. The number of particle-particle ranks and the "
 427                           "number of GPU IDs must match.", hw_opt->nthreads_tmpi, numPmeRanks, numGpuIdsSupplied);
 428             }
 429         }
 430         /* Trivial, return the user's choice right away */
 431         return hw_opt->nthreads_tmpi;
 432     }
 433     GMX_RELEASE_ASSERT(numGpuIdsSupplied == 0,
 434                        "If mdrun -gpu_id had information, the number of ranks should have already been chosen");
 435
 436     // Now implement automatic selection of number of thread-MPI ranks
 437     nthreads_hw = hwinfo->nthreads_hw_avail;
 438
 439     if (nthreads_hw <= 0)
 440     {
 441         /* This should normally not happen, but if it does, we handle it */
 442         gmx_fatal(FARGS, "The number of available hardware threads can not be detected, please specify the number of MPI ranks and the number of OpenMP threads (if supported) manually with options -ntmpi and -ntomp, respectively");
 443     }
 444
 445     /* How many total (#tMPI*#OpenMP) threads can we start? */
 446     if (hw_opt->nthreads_tot > 0)
 447     {
 448         nthreads_tot_max = hw_opt->nthreads_tot;
 449     }
 450     else
 451     {
 452         nthreads_tot_max = nthreads_hw;
 453     }
 454
 455     /* nonbondedOnGpu might be false e.g. because this simulation uses
 456      * the group scheme, or is a rerun with energy groups. */
 457     ngpu = (nonbondedOnGpu ? hwinfo->gpu_info.n_dev_compatible : 0);
 458
 459     if (inputrec->cutoff_scheme == ecutsGROUP)
 460     {
 461         /* We checked this before, but it doesn't hurt to do it once more */
 462         GMX_RELEASE_ASSERT(hw_opt->nthreads_omp == 1, "The group scheme only supports one OpenMP thread per rank");
 463     }
 464
 465     nrank =
 466         get_tmpi_omp_thread_division(hwinfo, *hw_opt, nthreads_tot_max, ngpu);
 467
 468     if (inputrec->eI == eiNM || EI_TPI(inputrec->eI))
 469     {
 470         /* Dims/steps are divided over the nodes iso splitting the atoms.
 471          * With NM we can't have more ranks than #atoms*#dim. With TPI it's
 472          * unlikely we have fewer atoms than ranks, and if so, communication
 473          * would become a bottleneck, so we set the limit to 1 atom/rank.
 474          */
 475         min_atoms_per_mpi_rank = 1;
 476     }
 477     else
 478     {
 479         if (ngpu >= 1)
 480         {
 481             min_atoms_per_mpi_rank = min_atoms_per_gpu;
 482         }
 483         else
 484         {
 485             min_atoms_per_mpi_rank = min_atoms_per_mpi_thread;
 486         }
 487     }
 488
 489     if (mtop->natoms/nrank < min_atoms_per_mpi_rank)
 490     {
 491         int nrank_new;
 492
 493         /* the rank number was chosen automatically, but there are too few
 494            atoms per rank, so we need to reduce the rank count */
 495         nrank_new = std::max(1, mtop->natoms/min_atoms_per_mpi_rank);
 496
 497         /* Avoid partial use of Hyper-Threading */
 498         if (gmxSmtIsEnabled(hwTop) &&
 499             nrank_new > nthreads_hw/2 && nrank_new < nthreads_hw)
 500         {
 501             nrank_new = nthreads_hw/2;
 502         }
 503
 504         /* If the user specified the total thread count, ensure this is
 505          * divisible by the number of ranks.
 506          * It is quite likely that we have too many total threads compared
 507          * to the size of the system, but if the user asked for this many
 508          * threads we should respect that.
 509          */
 510         while (hw_opt->nthreads_tot > 0 &&
 511                hw_opt->nthreads_tot % nrank_new != 0)
 512         {
 513             nrank_new--;
 514         }
 515
 516         /* Avoid large prime numbers in the rank count */
 517         if (nrank_new >= 6)
 518         {
 519             /* Use only 6,8,10 with additional factors of 2 */
 520             int fac;
 521
 522             fac = 2;
 523             while (3*fac*2 <= nrank_new)
 524             {
 525                 fac *= 2;
 526             }
 527
 528             nrank_new = (nrank_new/fac)*fac;
 529         }
 530         else
 531         {
 532             /* Avoid 5, since small system won't fit 5 domains along
 533              * a dimension. This might lead to waisting some cores, but this
 534              * will have a small impact in this regime of very small systems.
 535              */
 536             if (nrank_new == 5)
 537             {
 538                 nrank_new = 4;
 539             }
 540         }
 541
 542         nrank = nrank_new;
 543
 544         /* We reduced the number of tMPI ranks, which means we might violate
 545          * our own efficiency checks if we simply use all hardware threads.
 546          */
 547         if (bHasOmpSupport && hw_opt->nthreads_omp <= 0 && hw_opt->nthreads_tot <= 0)
 548         {
 549             /* The user set neither the total nor the OpenMP thread count,
 550              * we should use all hardware threads, unless we will violate
 551              * our own efficiency limitation on the thread count.
 552              */
 553             int  nt_omp_max;
 554
 555             nt_omp_max = nthreads_omp_efficient_max(nrank, cpuInfo, ngpu >= 1);
 556
 557             if (nrank*nt_omp_max < hwinfo->nthreads_hw_avail)
 558             {
 559                 /* Limit the number of OpenMP threads to start */
 560                 hw_opt->nthreads_omp = nt_omp_max;
 561             }
 562         }
 563
 564         fprintf(stderr, "\n");
 565         fprintf(stderr, "NOTE: Parallelization is limited by the small number of atoms,\n");
 566         fprintf(stderr, "      only starting %d thread-MPI ranks.\n", nrank);
 567         fprintf(stderr, "      You can use the -nt and/or -ntmpi option to optimize the number of threads.\n\n");
 568     }
 569
 570     return nrank;
 571 }
 572 #endif /* GMX_THREAD_MPI */
 573
 574
 575 void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
 576                                         int                  numTotalThreads,
 577                                         bool                 willUsePhysicalGpu,
 578                                         gmx_bool             bNtOmpOptionSet,
 579                                         t_commrec           *cr,
 580                                         const gmx::MDLogger &mdlog)
 581 {
 582 #if GMX_OPENMP && GMX_MPI
 583     int         nth_omp_min, nth_omp_max;
 584     char        buf[1000];
 585 #if GMX_THREAD_MPI
 586     const char *mpi_option = " (option -ntmpi)";
 587 #else
 588     const char *mpi_option = "";
 589 #endif
 590
 591     /* This function should be called after thread-MPI (when configured) and
 592      * OpenMP have been initialized. Check that here.
 593      */
 594 #if GMX_THREAD_MPI
 595     GMX_RELEASE_ASSERT(nthreads_omp_faster_default >= nthreads_omp_mpi_ok_max, "Inconsistent OpenMP thread count default values");
 596 #endif
 597     GMX_RELEASE_ASSERT(gmx_omp_nthreads_get(emntDefault) >= 1, "Must have at least one OpenMP thread");
 598
 599     nth_omp_min = gmx_omp_nthreads_get(emntDefault);
 600     nth_omp_max = gmx_omp_nthreads_get(emntDefault);
 601
 602     bool anyRankIsUsingGpus = willUsePhysicalGpu;
 603     /* Thread-MPI seems to have a bug with reduce on 1 node, so use a cond. */
 604     if (cr->nnodes + cr->npmenodes > 1)
 605     {
 606         int count[3], count_max[3];
 607
 608         count[0] = -nth_omp_min;
 609         count[1] =  nth_omp_max;
 610         count[2] =  willUsePhysicalGpu;
 611
 612         MPI_Allreduce(count, count_max, 3, MPI_INT, MPI_MAX, cr->mpi_comm_mysim);
 613
 614         /* In case of an inhomogeneous run setup we use the maximum counts */
 615         nth_omp_min        = -count_max[0];
 616         nth_omp_max        =  count_max[1];
 617         anyRankIsUsingGpus = count_max[2] > 0;
 618     }
 619
 620     int nthreads_omp_mpi_ok_min;
 621
 622     if (!anyRankIsUsingGpus)
 623     {
 624         nthreads_omp_mpi_ok_min = nthreads_omp_mpi_ok_min_cpu;
 625     }
 626     else
 627     {
 628         /* With GPUs we set the minimum number of OpenMP threads to 2 to catch
 629          * cases where the user specifies #ranks == #cores.
 630          */
 631         nthreads_omp_mpi_ok_min = nthreads_omp_mpi_ok_min_gpu;
 632     }
 633
 634     if (DOMAINDECOMP(cr) && cr->nnodes > 1)
 635     {
 636         if (nth_omp_max < nthreads_omp_mpi_ok_min ||
 637             nth_omp_max > nthreads_omp_mpi_ok_max)
 638         {
 639             /* Note that we print target_max here, not ok_max */
 640             sprintf(buf, "Your choice of number of MPI ranks and amount of resources results in using %d OpenMP threads per rank, which is most likely inefficient. The optimum is usually between %d and %d threads per rank.",
 641                     nth_omp_max,
 642                     nthreads_omp_mpi_ok_min,
 643                     nthreads_omp_mpi_target_max);
 644
 645             if (bNtOmpOptionSet)
 646             {
 647                 GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted("NOTE: %s", buf);
 648             }
 649             else
 650             {
 651                 /* This fatal error, and the one below, is nasty, but it's
 652                  * probably the only way to ensure that all users don't waste
 653                  * a lot of resources, since many users don't read logs/stderr.
 654                  */
 655                 gmx_fatal(FARGS, "%s If you want to run with this setup, specify the -ntomp option. But we suggest to change the number of MPI ranks%s.", buf, mpi_option);
 656             }
 657         }
 658     }
 659     else
 660     {
 661         const gmx::CpuInfo &cpuInfo = *hwinfo->cpuInfo;
 662
 663         /* No domain decomposition (or only one domain) */
 664         if (nth_omp_max > nthreads_omp_faster(cpuInfo, anyRankIsUsingGpus))
 665         {
 666             /* To arrive here, the user/system set #ranks and/or #OMPthreads */
 667             gmx_bool bEnvSet;
 668             char     buf2[256];
 669
 670             bEnvSet = (getenv("OMP_NUM_THREADS") != nullptr);
 671
 672             if (bNtOmpOptionSet || bEnvSet)
 673             {
 674                 sprintf(buf2, "You requested %d OpenMP threads", nth_omp_max);
 675             }
 676             else
 677             {
 678                 sprintf(buf2, "Your choice of %d MPI rank%s and the use of %d total threads %sleads to the use of %d OpenMP threads",
 679                         cr->nnodes + cr->npmenodes,
 680                         cr->nnodes + cr->npmenodes == 1 ? "" : "s",
 681                         numTotalThreads > 0 ? numTotalThreads : hwinfo->nthreads_hw_avail,
 682                         hwinfo->nphysicalnode > 1 ? "on a node " : "",
 683                         nth_omp_max);
 684             }
 685             sprintf(buf, "%s, whereas we expect the optimum to be with more MPI ranks with %d to %d OpenMP threads.",
 686                     buf2, nthreads_omp_mpi_ok_min, nthreads_omp_mpi_target_max);
 687
 688             /* We can not quit with a fatal error when OMP_NUM_THREADS is set
 689              * with different values per rank or node, since in that case
 690              * the user can not set -ntomp to override the error.
 691              */
 692             if (bNtOmpOptionSet || (bEnvSet && nth_omp_min != nth_omp_max))
 693             {
 694                 GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted("NOTE: %s", buf);
 695             }
 696             else
 697             {
 698                 gmx_fatal(FARGS, "%s If you want to run with this many OpenMP threads, specify the -ntomp option. But we suggest to increase the number of MPI ranks%s.", buf, mpi_option);
 699             }
 700         }
 701     }
 702 #else /* GMX_OPENMP && GMX_MPI */
 703       /* No OpenMP and/or MPI: it doesn't make much sense to check */
 704     GMX_UNUSED_VALUE(bNtOmpOptionSet);
 705     GMX_UNUSED_VALUE(numTotalThreads);
 706     GMX_UNUSED_VALUE(willUsePhysicalGpu);
 707     GMX_UNUSED_VALUE(cr);
 708     /* Check if we have more than 1 physical core, if detected,
 709      * or more than 1 hardware thread if physical cores were not detected.
 710      */
 711     if (!GMX_OPENMP && !GMX_MPI && hwinfo->hardwareTopology->numberOfCores() > 1)
 712     {
 713         GMX_LOG(mdlog.warning).asParagraph().appendText("NOTE: GROMACS was compiled without OpenMP and (thread-)MPI support, can only use a single CPU core");
 714     }
 715 #endif /* GMX_OPENMP && GMX_MPI */
 716 }
 717
 718
 719 //! Dump a \c hw_opt to \c fp.
 720 static void print_hw_opt(FILE *fp, const gmx_hw_opt_t *hw_opt)
 721 {
 722     fprintf(fp, "hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n",
 723             hw_opt->nthreads_tot,
 724             hw_opt->nthreads_tmpi,
 725             hw_opt->nthreads_omp,
 726             hw_opt->nthreads_omp_pme,
 727             hw_opt->gpuIdTaskAssignment.c_str());
 728 }
 729
 730 void check_and_update_hw_opt_1(gmx_hw_opt_t    *hw_opt,
 731                                const t_commrec *cr,
 732                                int              nPmeRanks)
 733 {
 734     /* Currently hw_opt only contains default settings or settings supplied
 735      * by the user on the command line.
 736      */
 737     if (hw_opt->nthreads_omp < 0)
 738     {
 739         gmx_fatal(FARGS, "The number of OpenMP threads supplied on the command line is %d, which is negative and not allowed", hw_opt->nthreads_omp);
 740     }
 741
 742     /* Check for OpenMP settings stored in environment variables, which can
 743      * potentially be different on different MPI ranks.
 744      */
 745     gmx_omp_nthreads_read_env(&hw_opt->nthreads_omp, SIMMASTER(cr));
 746
 747     /* Check restrictions on the user supplied options before modifying them.
 748      * TODO: Put the user values in a const struct and preserve them.
 749      */
 750 #if !GMX_THREAD_MPI
 751     if (hw_opt->nthreads_tot > 0)
 752     {
 753         gmx_fatal(FARGS, "Setting the total number of threads is only supported with thread-MPI and GROMACS was compiled without thread-MPI");
 754     }
 755     if (hw_opt->nthreads_tmpi > 0)
 756     {
 757         gmx_fatal(FARGS, "Setting the number of thread-MPI ranks is only supported with thread-MPI and GROMACS was compiled without thread-MPI");
 758     }
 759 #endif
 760
 761     if (bHasOmpSupport)
 762     {
 763         /* Check restrictions on PME thread related options set by the user */
 764
 765         if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0)
 766         {
 767             gmx_fatal(FARGS, "You need to specify -ntomp in addition to -ntomp_pme");
 768         }
 769
 770         if (hw_opt->nthreads_omp_pme >= 1 &&
 771             hw_opt->nthreads_omp_pme != hw_opt->nthreads_omp &&
 772             nPmeRanks <= 0)
 773         {
 774             /* This can result in a fatal error on many MPI ranks,
 775              * but since the thread count can differ per rank,
 776              * we can't easily avoid this.
 777              */
 778             gmx_fatal(FARGS, "You need to explicitly specify the number of PME ranks (-npme) when using different number of OpenMP threads for PP and PME ranks");
 779         }
 780     }
 781     else
 782     {
 783         /* GROMACS was configured without OpenMP support */
 784
 785         if (hw_opt->nthreads_omp > 1 || hw_opt->nthreads_omp_pme > 1)
 786         {
 787             gmx_fatal(FARGS, "More than 1 OpenMP thread requested, but GROMACS was compiled without OpenMP support");
 788         }
 789         hw_opt->nthreads_omp     = 1;
 790         hw_opt->nthreads_omp_pme = 1;
 791     }
 792
 793     if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp_pme <= 0)
 794     {
 795         /* We have the same number of OpenMP threads for PP and PME ranks,
 796          * thus we can perform several consistency checks.
 797          */
 798         if (hw_opt->nthreads_tmpi > 0 &&
 799             hw_opt->nthreads_omp > 0 &&
 800             hw_opt->nthreads_tot != hw_opt->nthreads_tmpi*hw_opt->nthreads_omp)
 801         {
 802             gmx_fatal(FARGS, "The total number of threads requested (%d) does not match the thread-MPI ranks (%d) times the OpenMP threads (%d) requested",
 803                       hw_opt->nthreads_tot, hw_opt->nthreads_tmpi, hw_opt->nthreads_omp);
 804         }
 805
 806         if (hw_opt->nthreads_tmpi > 0 &&
 807             hw_opt->nthreads_tot % hw_opt->nthreads_tmpi != 0)
 808         {
 809             gmx_fatal(FARGS, "The total number of threads requested (%d) is not divisible by the number of thread-MPI ranks requested (%d)",
 810                       hw_opt->nthreads_tot, hw_opt->nthreads_tmpi);
 811         }
 812
 813         if (hw_opt->nthreads_omp > 0 &&
 814             hw_opt->nthreads_tot % hw_opt->nthreads_omp != 0)
 815         {
 816             gmx_fatal(FARGS, "The total number of threads requested (%d) is not divisible by the number of OpenMP threads requested (%d)",
 817                       hw_opt->nthreads_tot, hw_opt->nthreads_omp);
 818         }
 819     }
 820
 821     if (hw_opt->nthreads_tot > 0)
 822     {
 823         if (hw_opt->nthreads_omp > hw_opt->nthreads_tot)
 824         {
 825             gmx_fatal(FARGS, "You requested %d OpenMP threads with %d total threads. Choose a total number of threads that is a multiple of the number of OpenMP threads.",
 826                       hw_opt->nthreads_omp, hw_opt->nthreads_tot);
 827         }
 828
 829         if (hw_opt->nthreads_tmpi > hw_opt->nthreads_tot)
 830         {
 831             gmx_fatal(FARGS, "You requested %d thread-MPI ranks with %d total threads. Choose a total number of threads that is a multiple of the number of thread-MPI ranks.",
 832                       hw_opt->nthreads_tmpi, hw_opt->nthreads_tot);
 833         }
 834     }
 835
 836     if (debug)
 837     {
 838         print_hw_opt(debug, hw_opt);
 839     }
 840
 841     /* Asserting this simplifies the hardware resource division later
 842      * on. */
 843     GMX_RELEASE_ASSERT(!(hw_opt->nthreads_omp_pme >= 1 && hw_opt->nthreads_omp <= 0),
 844                        "PME thread count should only be set when the normal thread count is also set");
 845 }
 846
 847 void check_and_update_hw_opt_2(gmx_hw_opt_t *hw_opt,
 848                                int           cutoff_scheme)
 849 {
 850     if (cutoff_scheme == ecutsGROUP)
 851     {
 852         /* We only have OpenMP support for PME only nodes */
 853         if (hw_opt->nthreads_omp > 1)
 854         {
 855             gmx_fatal(FARGS, "OpenMP threads have been requested with cut-off scheme %s, but these are only supported with cut-off scheme %s",
 856                       ecutscheme_names[cutoff_scheme],
 857                       ecutscheme_names[ecutsVERLET]);
 858         }
 859         hw_opt->nthreads_omp = 1;
 860     }
 861 }
 862
 863 void check_and_update_hw_opt_3(gmx_hw_opt_t *hw_opt)
 864 {
 865 #if GMX_THREAD_MPI
 866     GMX_RELEASE_ASSERT(hw_opt->nthreads_tmpi >= 1, "Must have at least one thread-MPI rank");
 867
 868     /* If the user set the total number of threads on the command line
 869      * and did not specify the number of OpenMP threads, set the latter here.
 870      */
 871     if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp <= 0)
 872     {
 873         hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
 874
 875         if (!bHasOmpSupport && hw_opt->nthreads_omp > 1)
 876         {
 877             gmx_fatal(FARGS, "You (indirectly) asked for OpenMP threads by setting -nt > -ntmpi, but GROMACS was compiled without OpenMP support");
 878         }
 879     }
 880 #endif
 881
 882     GMX_RELEASE_ASSERT(bHasOmpSupport || hw_opt->nthreads_omp == 1, "Without OpenMP support, only one thread per rank can be used");
 883
 884     /* We are done with updating nthreads_omp, we can set nthreads_omp_pme */
 885     if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0)
 886     {
 887         hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp;
 888     }
 889
 890     if (debug)
 891     {
 892         print_hw_opt(debug, hw_opt);
 893     }
 894 }