Fixed thread-MPI with non-default -npme
src/programs/mdrun/resource-division.cpp
/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2015,2016,2017, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gmxpre.h"

#include "resource-division.h"

#include "config.h"

#include <stdlib.h>
#include <string.h>

#include <algorithm>

#include "gromacs/hardware/cpuinfo.h"
#include "gromacs/hardware/detecthardware.h"
#include "gromacs/hardware/hardwareassign.h"
#include "gromacs/hardware/hardwaretopology.h"
#include "gromacs/hardware/hw_info.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/mdtypes/inputrec.h"
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/topology/topology.h"
#include "gromacs/utility/baseversion.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/logger.h"
#include "gromacs/utility/stringutil.h"

/* DISCLAIMER: All the atom count and thread numbers below are heuristic.
 * The real switching points will depend on the simulated system,
 * the algorithms used and the hardware it's running on, as well as on
 * whether other jobs are running on the same machine. We try to take into
 * account factors that have a large influence, such as recent Intel CPUs
 * being much better at wide multi-threading. The remaining factors should
 * (hopefully) have a small influence, such that the performance just before
 * and after a switch point doesn't change too much.
 */

//! Constant used to help minimize preprocessed code
static const bool bHasOmpSupport = GMX_OPENMP;

#if GMX_THREAD_MPI
/* The minimum number of atoms per tMPI thread. With fewer atoms than this,
 * the number of threads will get lowered.
 */
static const int min_atoms_per_mpi_thread = 90;
static const int min_atoms_per_gpu        = 900;
#endif /* GMX_THREAD_MPI */

/* TODO choose nthreads_omp based on hardware topology
   when we have a hardware topology detection library */
/* First we consider the case of no MPI (1 MPI rank).
 * In general, when running up to 8 threads, OpenMP should be faster.
 * Note: on AMD Bulldozer we should avoid running OpenMP over two dies.
 * On Intel>=Nehalem running OpenMP on a single CPU is always faster,
 * even on two CPUs it's usually faster (but with many OpenMP threads
 * it could be faster not to use HT, currently we always use HT).
 * On Nehalem/Westmere we want to avoid running 16 threads over
 * two CPUs with HT, so we need a limit<16; thus we use 12.
 * A reasonable limit for Intel Sandy and Ivy bridge,
 * not knowing the topology, is 16 threads.
 * Below we check for Intel and AVX, which for now includes
 * Sandy/Ivy Bridge, Haswell/Broadwell. By checking for AVX instead of
 * model numbers we ensure also future Intel CPUs are covered.
 */
const int nthreads_omp_faster_default   =  8;
const int nthreads_omp_faster_Nehalem   = 12;
const int nthreads_omp_faster_Intel_AVX = 16;
const int nthreads_omp_faster_AMD_Ryzen = 16;
/* For CPU-only runs the fastest options are usually MPI or OpenMP only.
 * With one GPU, using MPI only is almost never optimal, so we need to
 * compare running pure OpenMP with combined MPI+OpenMP. This means higher
 * OpenMP thread counts can still be ok. Multiplying the numbers above
 * by a factor of 2 seems to be a good estimate.
 */
const int nthreads_omp_faster_gpu_fac = 2;

/* This is the case with MPI (2 or more MPI PP ranks).
 * By default we will terminate with a fatal error when more than 8
 * OpenMP threads are (indirectly) requested, since using fewer threads
 * nearly always results in better performance.
 * With thread-MPI and multiple GPUs or one GPU and too many threads
 * we first try 6 OpenMP threads and then fewer until the number of MPI ranks
 * is divisible by the number of GPUs.
 */
#if GMX_OPENMP && GMX_MPI
const int nthreads_omp_mpi_ok_max     = 8;
const int nthreads_omp_mpi_ok_min_cpu = 1;
#endif
const int nthreads_omp_mpi_ok_min_gpu = 2;
const int nthreads_omp_mpi_target_max = 6;
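
/* Illustrative summary of the limits above (heuristic, not exact):
 * - a single rank on CPU only: up to 8-16 OpenMP threads can beat MPI,
 *   depending on the CPU generation;
 * - a single rank with a GPU: those limits double;
 * - multiple ranks: 1 (CPU) to 8 OpenMP threads per rank pass the checks,
 *   at least 2 are required with GPUs, and 6 per rank is the target when
 *   ranks share GPUs.
 */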

/* Returns the maximum OpenMP thread count for which using a single MPI rank
 * should be faster than using multiple ranks with the same total thread count.
 */
static int nthreads_omp_faster(const gmx::CpuInfo &cpuInfo, gmx_bool bUseGPU)
{
    int nth;

    if (cpuInfo.vendor() == gmx::CpuInfo::Vendor::Intel &&
        cpuInfo.feature(gmx::CpuInfo::Feature::X86_Avx))
    {
        nth = nthreads_omp_faster_Intel_AVX;
    }
    else if (gmx::cpuIsX86Nehalem(cpuInfo))
    {
        // Intel Nehalem
        nth = nthreads_omp_faster_Nehalem;
    }
    else if (cpuInfo.vendor() == gmx::CpuInfo::Vendor::Amd && cpuInfo.family() >= 23)
    {
        // AMD Ryzen
        nth = nthreads_omp_faster_AMD_Ryzen;
    }
    else
    {
        nth = nthreads_omp_faster_default;
    }

    if (bUseGPU)
    {
        nth *= nthreads_omp_faster_gpu_fac;
    }

    nth = std::min(nth, GMX_OPENMP_MAX_THREADS);

    return nth;
}
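
/* Example (illustrative): on an Intel CPU with AVX support and a GPU in
 * use, nthreads_omp_faster() returns min(16*2, GMX_OPENMP_MAX_THREADS),
 * i.e. 32 unless the compile-time thread limit is lower. */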

/* Returns the maximum OpenMP thread count that passes the efficiency check */
static int nthreads_omp_efficient_max(int gmx_unused       nrank,
                                      const gmx::CpuInfo  &cpuInfo,
                                      gmx_bool             bUseGPU)
{
#if GMX_OPENMP && GMX_MPI
    if (nrank > 1)
    {
        return nthreads_omp_mpi_ok_max;
    }
    else
#endif
    {
        return nthreads_omp_faster(cpuInfo, bUseGPU);
    }
}

/* Return the number of thread-MPI ranks to use.
 * This is chosen such that we can always obey our own efficiency checks.
 */
static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
                                        const gmx_hw_opt_t  &hw_opt,
                                        int                  nthreads_tot,
                                        int                  ngpu)
{
    int                 nrank;
    const gmx::CpuInfo &cpuInfo = *hwinfo->cpuInfo;

    GMX_RELEASE_ASSERT(nthreads_tot > 0, "There must be at least one thread per rank");

    /* There are no separate PME nodes here, as we ensured in
     * check_and_update_hw_opt that nthreads_tmpi>0 with PME nodes
     * and a conditional ensures we would not have ended up here.
     * Note that separate PME nodes might be switched on later.
     */
    if (ngpu > 0)
    {
        nrank = ngpu;

        /* When the user sets nthreads_omp, we can end up oversubscribing CPU cores
         * if we simply start as many ranks as GPUs. To avoid this, we start as few
         * tMPI ranks as necessary to avoid oversubscription and instead leave GPUs idle.
         * If the user does not set the number of OpenMP threads, nthreads_omp==0 and
         * this code has no effect.
         */
        GMX_RELEASE_ASSERT(hw_opt.nthreads_omp >= 0, "nthreads_omp is negative, but previous checks should have prevented this");
        while (nrank*hw_opt.nthreads_omp > hwinfo->nthreads_hw_avail && nrank > 1)
        {
            nrank--;
        }

        if (nthreads_tot < nrank)
        {
            /* #thread < #gpu is very unlikely, but if so: waste gpu(s) */
            nrank = nthreads_tot;
        }
        else if (nthreads_tot > nthreads_omp_faster(cpuInfo, ngpu > 0) ||
                 (ngpu > 1 && nthreads_tot/ngpu > nthreads_omp_mpi_target_max))
        {
            /* The high OpenMP thread count will likely result in sub-optimal
             * performance. Increase the rank count to reduce the thread count
             * per rank. This will lead to GPU sharing by MPI ranks/threads.
             */
            int nshare;

            /* Increase the rank count as long as we have more than 6 OpenMP
             * threads per rank or the number of hardware threads is not
             * divisible by the rank count. Don't go below 2 OpenMP threads.
             */
            nshare = 1;
            do
            {
                nshare++;
                nrank  = ngpu*nshare;
            }
            while (nthreads_tot/nrank > nthreads_omp_mpi_target_max ||
                   (nthreads_tot/(ngpu*(nshare + 1)) >= nthreads_omp_mpi_ok_min_gpu && nthreads_tot % nrank != 0));
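            /* Worked example (illustrative): nthreads_tot = 32, ngpu = 2.
             * nshare=2 -> nrank=4:  32/4 = 8 > 6, continue;
             * nshare=3 -> nrank=6:  32/6 = 5 <= 6, but 32 % 6 != 0 and
             *                       32/(2*4) = 4 >= 2, continue;
             * nshare=4 -> nrank=8:  32/8 = 4 <= 6 and 32 % 8 == 0, stop.
             * Result: 8 ranks with 4 threads each, 4 ranks per GPU. */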
        }
    }
    else if (hw_opt.nthreads_omp > 0)
    {
        /* Here we could oversubscribe, when we do, we issue a warning later */
        nrank = std::max(1, nthreads_tot/hw_opt.nthreads_omp);
    }
    else
    {
        if (nthreads_tot <= nthreads_omp_faster(cpuInfo, ngpu > 0))
        {
            /* Use pure OpenMP parallelization */
            nrank = 1;
        }
        else
        {
            /* Don't use OpenMP parallelization */
            nrank = nthreads_tot;
        }
    }

    return nrank;
}

static int getMaxGpuUsable(const gmx_hw_info_t *hwinfo,
                           int                  cutoff_scheme)
{
    /* This code relies on the fact that GPUs are not detected when GPU
     * acceleration was disabled at run time by the user, either with
     * -nb cpu or setting GMX_EMULATE_GPU. */
    if (cutoff_scheme == ecutsVERLET &&
        hwinfo->gpu_info.n_dev_compatible > 0)
    {
        return hwinfo->gpu_info.n_dev_compatible;
    }
    else
    {
        return 0;
    }
}

#if GMX_THREAD_MPI

static bool
gmxSmtIsEnabled(const gmx::HardwareTopology &hwTop)
{
    return (hwTop.supportLevel() >= gmx::HardwareTopology::SupportLevel::Basic && hwTop.machine().sockets[0].cores[0].hwThreads.size() > 1);
}

namespace
{

class SingleRankChecker
{
    public:
        //! Constructor
        SingleRankChecker() : value_(false), reasons_() {}
        /*! \brief Call this function for each possible condition
            under which a single rank is required, along with a string
            describing the constraint when it is applied. */
        void applyConstraint(bool condition, const char *description)
        {
            if (condition)
            {
                value_ = true;
                reasons_.push_back(gmx::formatString("%s only supports a single rank.", description));
            }
        }
        //! After applying any conditions, is a single rank required?
        bool mustUseOneRank() const
        {
            return value_;
        }
        /*! \brief Return a formatted string to use when writing a
            message when a single rank is required, (or empty if no
            constraint exists.) */
        std::string getMessage() const
        {
            return formatAndJoin(reasons_, "\n", gmx::IdentityFormatter());
        }
    private:
        bool                     value_;
        std::vector<std::string> reasons_;
};

} // namespace

/* Get the number of MPI ranks to use for thread-MPI based on how many
 * were requested, which algorithms we're using,
 * and how many particles there are.
 * At this point we have already called check_and_update_hw_opt.
 * Thus all options should be internally consistent and consistent
 * with the hardware, except that ntmpi could be larger than #GPU.
 */
int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
                     gmx_hw_opt_t        *hw_opt,
                     int                  numPmeRanks,
                     const t_inputrec    *inputrec,
                     const gmx_mtop_t    *mtop,
                     const gmx::MDLogger &mdlog,
                     bool                 doMembed)
{
    int                          nthreads_hw, nthreads_tot_max, nrank, ngpu;
    int                          min_atoms_per_mpi_rank;

    const gmx::CpuInfo          &cpuInfo = *hwinfo->cpuInfo;
    const gmx::HardwareTopology &hwTop   = *hwinfo->hardwareTopology;

    /* If the user made a GPU task assignment, that sets the number of thread-MPI ranks. */
    auto userGpuTaskAssignment = gmx::parseGpuTaskAssignment(hw_opt->gpuIdTaskAssignment);
    int  numGpuIdsSupplied     = static_cast<int>(userGpuTaskAssignment.size());

    /* TODO Here we handle the case where the user set GPU IDs, and
       further below we handle the case where the algorithm does not
       support multiple ranks. We need also to handle the case where
       the user set multiple GPU IDs for an algorithm that cannot
       handle multiple ranks. */
    if (hw_opt->nthreads_tmpi < 1 && numGpuIdsSupplied > 0)
    {
        /* If the user chose both mdrun -nt and -gpu_id, is that consistent? */
        if (numPmeRanks <= 0)
        {
            if (hw_opt->nthreads_tot > 0 &&
                (hw_opt->nthreads_tot % numGpuIdsSupplied) != 0)
            {
                gmx_fatal(FARGS, "Cannot run %d total threads with %d GPU ranks. Choose the total number of threads to be a multiple of the number of GPU ranks.", hw_opt->nthreads_tot, numGpuIdsSupplied);
            }
            return numGpuIdsSupplied;
        }
        else
        {
            gmx_fatal(FARGS, "The combination of choosing a number of PME ranks, and specific GPU IDs "
                      "is not supported. Use also -ntmpi and/or -ntomp and -ntomp_pme to specify what "
                      "distribution of threads to ranks you require.");
        }
    }

    {
        /* Check if an algorithm does not support parallel simulation. */
        // TODO This might work better if e.g. implemented algorithms
        // had to define a function that returns such requirements,
        // and a description string.
        SingleRankChecker checker;
        checker.applyConstraint(inputrec->eI == eiLBFGS, "L-BFGS minimization");
        checker.applyConstraint(inputrec->coulombtype == eelEWALD, "Plain Ewald electrostatics");
        checker.applyConstraint(doMembed, "Membrane embedding");
        if (checker.mustUseOneRank())
        {
            std::string message = checker.getMessage();
            if (hw_opt->nthreads_tmpi > 1)
            {
                gmx_fatal(FARGS, "%s However, you asked for more than 1 thread-MPI rank, so mdrun cannot continue. Choose a single rank, or a different algorithm.", message.c_str());
            }
            GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted("%s Choosing to use only a single thread-MPI rank.", message.c_str());

            if (numGpuIdsSupplied > 1)
            {
                gmx_fatal(FARGS, "You supplied %d GPU IDs but only 1 rank can be used "
                          "by this simulation. Supply only one GPU ID.", numGpuIdsSupplied);
            }
            return 1;
        }
    }

    if (hw_opt->nthreads_tmpi > 0)
    {
        if (numPmeRanks <= 0)
        {
            int numPpRanks = hw_opt->nthreads_tmpi;
            if ((numGpuIdsSupplied > 0) &&
                (numGpuIdsSupplied != numPpRanks))
            {
                gmx_fatal(FARGS, "Cannot run %d thread-MPI total ranks with %d "
                          "GPU IDs supplied. The number of particle-particle (PP) ranks and the "
                          "number of GPU IDs must match.", hw_opt->nthreads_tmpi, numGpuIdsSupplied);
            }
        }
        else
        {
            int numPpRanks = hw_opt->nthreads_tmpi - numPmeRanks;
            if ((numGpuIdsSupplied > 0) &&
                (numGpuIdsSupplied != numPpRanks))
            {
                gmx_fatal(FARGS, "Cannot run %d thread-MPI total ranks with %d PME ranks and %d "
                          "GPU IDs supplied. The number of particle-particle ranks and the "
                          "number of GPU IDs must match.", hw_opt->nthreads_tmpi, numPmeRanks, numGpuIdsSupplied);
            }
        }
        /* Trivial, return the user's choice right away */
        return hw_opt->nthreads_tmpi;
    }
    GMX_RELEASE_ASSERT(numGpuIdsSupplied == 0,
                       "If mdrun -gpu_id had information, the number of ranks should have already been chosen");

    // Now implement automatic selection of number of thread-MPI ranks
    nthreads_hw = hwinfo->nthreads_hw_avail;

    if (nthreads_hw <= 0)
    {
        /* This should normally not happen, but if it does, we handle it */
        gmx_fatal(FARGS, "The number of available hardware threads can not be detected, please specify the number of MPI ranks and the number of OpenMP threads (if supported) manually with options -ntmpi and -ntomp, respectively");
    }

    /* How many total (#tMPI*#OpenMP) threads can we start? */
    if (hw_opt->nthreads_tot > 0)
    {
        nthreads_tot_max = hw_opt->nthreads_tot;
    }
    else
    {
        nthreads_tot_max = nthreads_hw;
    }

    ngpu = getMaxGpuUsable(hwinfo, inputrec->cutoff_scheme);

    if (inputrec->cutoff_scheme == ecutsGROUP)
    {
        /* We checked this before, but it doesn't hurt to do it once more */
        GMX_RELEASE_ASSERT(hw_opt->nthreads_omp == 1, "The group scheme only supports one OpenMP thread per rank");
    }

    nrank =
        get_tmpi_omp_thread_division(hwinfo, *hw_opt, nthreads_tot_max, ngpu);

    if (inputrec->eI == eiNM || EI_TPI(inputrec->eI))
    {
        /* Dims/steps are divided over the nodes instead of splitting the atoms.
         * With NM we can't have more ranks than #atoms*#dim. With TPI it's
         * unlikely we have fewer atoms than ranks, and if so, communication
         * would become a bottleneck, so we set the limit to 1 atom/rank.
         */
        min_atoms_per_mpi_rank = 1;
    }
    else
    {
        if (ngpu >= 1)
        {
            min_atoms_per_mpi_rank = min_atoms_per_gpu;
        }
        else
        {
            min_atoms_per_mpi_rank = min_atoms_per_mpi_thread;
        }
    }

    if (mtop->natoms/nrank < min_atoms_per_mpi_rank)
    {
        int nrank_new;

        /* the rank number was chosen automatically, but there are too few
           atoms per rank, so we need to reduce the rank count */
        nrank_new = std::max(1, mtop->natoms/min_atoms_per_mpi_rank);

        /* Avoid partial use of Hyper-Threading */
        if (gmxSmtIsEnabled(hwTop) &&
            nrank_new > nthreads_hw/2 && nrank_new < nthreads_hw)
        {
            nrank_new = nthreads_hw/2;
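            /* Example (illustrative): with 16 hardware threads and SMT
             * enabled, nrank_new = 11 lies strictly between 8 and 16 and is
             * reduced to 8, so no core ends up running a mix of one and two
             * ranks. */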
        }

        /* If the user specified the total thread count, ensure this is
         * divisible by the number of ranks.
         * It is quite likely that we have too many total threads compared
         * to the size of the system, but if the user asked for this many
         * threads we should respect that.
         */
        while (hw_opt->nthreads_tot > 0 &&
               hw_opt->nthreads_tot % nrank_new != 0)
        {
            nrank_new--;
        }

        /* Avoid large prime numbers in the rank count */
        if (nrank_new >= 6)
        {
            /* Use only 6,8,10 with additional factors of 2 */
            int fac;

            fac = 2;
            while (3*fac*2 <= nrank_new)
            {
                fac *= 2;
            }

            nrank_new = (nrank_new/fac)*fac;
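            /* Example (illustrative): nrank_new = 13 yields fac = 4
             * (since 3*2*2 <= 13), so nrank_new becomes (13/4)*4 = 12;
             * nrank_new = 11 keeps fac = 2 and becomes 10. The reachable
             * rank counts are thus 6, 8, 10, 12, 16, 20, 24, 32, ... */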
        }
        else
        {
            /* Avoid 5, since a small system won't fit 5 domains along
             * a dimension. This might lead to wasting some cores, but this
             * will have a small impact in this regime of very small systems.
             */
            if (nrank_new == 5)
            {
                nrank_new = 4;
            }
        }

        nrank = nrank_new;

        /* We reduced the number of tMPI ranks, which means we might violate
         * our own efficiency checks if we simply use all hardware threads.
         */
        if (bHasOmpSupport && hw_opt->nthreads_omp <= 0 && hw_opt->nthreads_tot <= 0)
        {
            /* The user set neither the total nor the OpenMP thread count,
             * we should use all hardware threads, unless we will violate
             * our own efficiency limitation on the thread count.
             */
            int nt_omp_max;

            nt_omp_max = nthreads_omp_efficient_max(nrank, cpuInfo, ngpu >= 1);

            if (nrank*nt_omp_max < hwinfo->nthreads_hw_avail)
            {
                /* Limit the number of OpenMP threads to start */
                hw_opt->nthreads_omp = nt_omp_max;
            }
        }

        fprintf(stderr, "\n");
        fprintf(stderr, "NOTE: Parallelization is limited by the small number of atoms,\n");
        fprintf(stderr, "      only starting %d thread-MPI ranks.\n", nrank);
        fprintf(stderr, "      You can use the -nt and/or -ntmpi option to optimize the number of threads.\n\n");
    }

    return nrank;
}
#endif /* GMX_THREAD_MPI */


void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
                                        int                  numTotalThreads,
                                        bool                 willUsePhysicalGpu,
                                        gmx_bool             bNtOmpOptionSet,
                                        t_commrec           *cr,
                                        const gmx::MDLogger &mdlog)
{
#if GMX_OPENMP && GMX_MPI
    int         nth_omp_min, nth_omp_max;
    char        buf[1000];
#if GMX_THREAD_MPI
    const char *mpi_option = " (option -ntmpi)";
#else
    const char *mpi_option = "";
#endif

    /* This function should be called after thread-MPI (when configured) and
     * OpenMP have been initialized. Check that here.
     */
#if GMX_THREAD_MPI
    GMX_RELEASE_ASSERT(nthreads_omp_faster_default >= nthreads_omp_mpi_ok_max, "Inconsistent OpenMP thread count default values");
#endif
    GMX_RELEASE_ASSERT(gmx_omp_nthreads_get(emntDefault) >= 1, "Must have at least one OpenMP thread");

    nth_omp_min = gmx_omp_nthreads_get(emntDefault);
    nth_omp_max = gmx_omp_nthreads_get(emntDefault);

    bool anyRankIsUsingGpus = willUsePhysicalGpu;
    /* Thread-MPI seems to have a bug with reduce on 1 node, so use a cond. */
    if (cr->nnodes + cr->npmenodes > 1)
    {
        int count[3], count_max[3];

        count[0] = -nth_omp_min;
        count[1] =  nth_omp_max;
        count[2] =  willUsePhysicalGpu;
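
        /* Negating the minimum before the MPI_MAX reduction turns the
         * maximum of the negated values into the minimum of the originals,
         * so a single Allreduce yields both bounds. */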
        MPI_Allreduce(count, count_max, 3, MPI_INT, MPI_MAX, cr->mpi_comm_mysim);

        /* In case of an inhomogeneous run setup we use the maximum counts */
        nth_omp_min        = -count_max[0];
        nth_omp_max        =  count_max[1];
        anyRankIsUsingGpus = count_max[2] > 0;
    }

    int nthreads_omp_mpi_ok_min;

    if (!anyRankIsUsingGpus)
    {
        nthreads_omp_mpi_ok_min = nthreads_omp_mpi_ok_min_cpu;
    }
    else
    {
        /* With GPUs we set the minimum number of OpenMP threads to 2 to catch
         * cases where the user specifies #ranks == #cores.
         */
        nthreads_omp_mpi_ok_min = nthreads_omp_mpi_ok_min_gpu;
    }

    if (DOMAINDECOMP(cr) && cr->nnodes > 1)
    {
        if (nth_omp_max < nthreads_omp_mpi_ok_min ||
            nth_omp_max > nthreads_omp_mpi_ok_max)
        {
            /* Note that we print target_max here, not ok_max */
            sprintf(buf, "Your choice of number of MPI ranks and amount of resources results in using %d OpenMP threads per rank, which is most likely inefficient. The optimum is usually between %d and %d threads per rank.",
                    nth_omp_max,
                    nthreads_omp_mpi_ok_min,
                    nthreads_omp_mpi_target_max);

            if (bNtOmpOptionSet)
            {
                GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted("NOTE: %s", buf);
            }
            else
            {
                /* This fatal error, and the one below, is nasty, but it's
                 * probably the only way to ensure that all users don't waste
                 * a lot of resources, since many users don't read logs/stderr.
                 */
                gmx_fatal(FARGS, "%s If you want to run with this setup, specify the -ntomp option. But we suggest to change the number of MPI ranks%s.", buf, mpi_option);
            }
        }
    }
    else
    {
        const gmx::CpuInfo &cpuInfo = *hwinfo->cpuInfo;

        /* No domain decomposition (or only one domain) */
        if (nth_omp_max > nthreads_omp_faster(cpuInfo, anyRankIsUsingGpus))
        {
            /* To arrive here, the user/system set #ranks and/or #OMPthreads */
            gmx_bool bEnvSet;
            char     buf2[256];

            bEnvSet = (getenv("OMP_NUM_THREADS") != nullptr);

            if (bNtOmpOptionSet || bEnvSet)
            {
                sprintf(buf2, "You requested %d OpenMP threads", nth_omp_max);
            }
            else
            {
                sprintf(buf2, "Your choice of %d MPI rank%s and the use of %d total threads %sleads to the use of %d OpenMP threads",
                        cr->nnodes + cr->npmenodes,
                        cr->nnodes + cr->npmenodes == 1 ? "" : "s",
                        numTotalThreads > 0 ? numTotalThreads : hwinfo->nthreads_hw_avail,
                        hwinfo->nphysicalnode > 1 ? "on a node " : "",
                        nth_omp_max);
            }
            sprintf(buf, "%s, whereas we expect the optimum to be with more MPI ranks with %d to %d OpenMP threads.",
                    buf2, nthreads_omp_mpi_ok_min, nthreads_omp_mpi_target_max);

            /* We cannot quit with a fatal error when OMP_NUM_THREADS is set
             * with different values per rank or node, since in that case
             * the user cannot set -ntomp to override the error.
             */
            if (bNtOmpOptionSet || (bEnvSet && nth_omp_min != nth_omp_max))
            {
                GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted("NOTE: %s", buf);
            }
            else
            {
                gmx_fatal(FARGS, "%s If you want to run with this many OpenMP threads, specify the -ntomp option. But we suggest to increase the number of MPI ranks%s.", buf, mpi_option);
            }
        }
    }
#else /* GMX_OPENMP && GMX_MPI */
    /* No OpenMP and/or MPI: it doesn't make much sense to check */
    GMX_UNUSED_VALUE(bNtOmpOptionSet);
    GMX_UNUSED_VALUE(numTotalThreads);
    GMX_UNUSED_VALUE(willUsePhysicalGpu);
    GMX_UNUSED_VALUE(cr);
    /* Check if we have more than 1 physical core, if detected,
     * or more than 1 hardware thread if physical cores were not detected.
     */
    if (!GMX_OPENMP && !GMX_MPI && hwinfo->hardwareTopology->numberOfCores() > 1)
    {
        GMX_LOG(mdlog.warning).asParagraph().appendText("NOTE: GROMACS was compiled without OpenMP and (thread-)MPI support, can only use a single CPU core");
    }
#endif /* GMX_OPENMP && GMX_MPI */
}
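
/* Illustrative outcomes of the check above (not exhaustive): 16 ranks on a
 * 16-core GPU node give 1 OpenMP thread per rank, below
 * nthreads_omp_mpi_ok_min_gpu, and 2 ranks with 12 threads each exceed
 * nthreads_omp_mpi_ok_max; both trigger a note, or a fatal error when the
 * thread count was not set explicitly. */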

static void print_hw_opt(FILE *fp, const gmx_hw_opt_t *hw_opt)
{
    fprintf(fp, "hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n",
            hw_opt->nthreads_tot,
            hw_opt->nthreads_tmpi,
            hw_opt->nthreads_omp,
            hw_opt->nthreads_omp_pme,
            hw_opt->gpuIdTaskAssignment.c_str());
}

/* Checks we can do when we don't (yet) know the cut-off scheme */
void check_and_update_hw_opt_1(gmx_hw_opt_t    *hw_opt,
                               const t_commrec *cr,
                               int              nPmeRanks)
{
    /* Currently hw_opt only contains default settings or settings supplied
     * by the user on the command line.
     */
    if (hw_opt->nthreads_omp < 0)
    {
        gmx_fatal(FARGS, "The number of OpenMP threads supplied on the command line is %d, which is negative and not allowed", hw_opt->nthreads_omp);
    }

    /* Check for OpenMP settings stored in environment variables, which can
     * potentially be different on different MPI ranks.
     */
    gmx_omp_nthreads_read_env(&hw_opt->nthreads_omp, SIMMASTER(cr));

    /* Check restrictions on the user supplied options before modifying them.
     * TODO: Put the user values in a const struct and preserve them.
     */
#if !GMX_THREAD_MPI
    if (hw_opt->nthreads_tot > 0)
    {
        gmx_fatal(FARGS, "Setting the total number of threads is only supported with thread-MPI and GROMACS was compiled without thread-MPI");
    }
    if (hw_opt->nthreads_tmpi > 0)
    {
        gmx_fatal(FARGS, "Setting the number of thread-MPI ranks is only supported with thread-MPI and GROMACS was compiled without thread-MPI");
    }
#endif

    if (bHasOmpSupport)
    {
        /* Check restrictions on PME thread related options set by the user */

        if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0)
        {
            gmx_fatal(FARGS, "You need to specify -ntomp in addition to -ntomp_pme");
        }

        if (hw_opt->nthreads_omp_pme >= 1 &&
            hw_opt->nthreads_omp_pme != hw_opt->nthreads_omp &&
            nPmeRanks <= 0)
        {
            /* This can result in a fatal error on many MPI ranks,
             * but since the thread count can differ per rank,
             * we can't easily avoid this.
             */
            gmx_fatal(FARGS, "You need to explicitly specify the number of PME ranks (-npme) when using different numbers of OpenMP threads for PP and PME ranks");
        }
    }
    else
    {
        /* GROMACS was configured without OpenMP support */

        if (hw_opt->nthreads_omp > 1 || hw_opt->nthreads_omp_pme > 1)
        {
            gmx_fatal(FARGS, "More than 1 OpenMP thread requested, but GROMACS was compiled without OpenMP support");
        }
        hw_opt->nthreads_omp     = 1;
        hw_opt->nthreads_omp_pme = 1;
    }

    if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp_pme <= 0)
    {
        /* We have the same number of OpenMP threads for PP and PME ranks,
         * thus we can perform several consistency checks.
         */
        if (hw_opt->nthreads_tmpi > 0 &&
            hw_opt->nthreads_omp > 0 &&
            hw_opt->nthreads_tot != hw_opt->nthreads_tmpi*hw_opt->nthreads_omp)
        {
            gmx_fatal(FARGS, "The total number of threads requested (%d) does not match the thread-MPI ranks (%d) times the OpenMP threads (%d) requested",
                      hw_opt->nthreads_tot, hw_opt->nthreads_tmpi, hw_opt->nthreads_omp);
        }

        if (hw_opt->nthreads_tmpi > 0 &&
            hw_opt->nthreads_tot % hw_opt->nthreads_tmpi != 0)
        {
            gmx_fatal(FARGS, "The total number of threads requested (%d) is not divisible by the number of thread-MPI ranks requested (%d)",
                      hw_opt->nthreads_tot, hw_opt->nthreads_tmpi);
        }

        if (hw_opt->nthreads_omp > 0 &&
            hw_opt->nthreads_tot % hw_opt->nthreads_omp != 0)
        {
            gmx_fatal(FARGS, "The total number of threads requested (%d) is not divisible by the number of OpenMP threads requested (%d)",
                      hw_opt->nthreads_tot, hw_opt->nthreads_omp);
        }
    }

    if (hw_opt->nthreads_tot > 0)
    {
        if (hw_opt->nthreads_omp > hw_opt->nthreads_tot)
        {
            gmx_fatal(FARGS, "You requested %d OpenMP threads with %d total threads. Choose a total number of threads that is a multiple of the number of OpenMP threads.",
                      hw_opt->nthreads_omp, hw_opt->nthreads_tot);
        }

        if (hw_opt->nthreads_tmpi > hw_opt->nthreads_tot)
        {
            gmx_fatal(FARGS, "You requested %d thread-MPI ranks with %d total threads. Choose a total number of threads that is a multiple of the number of thread-MPI ranks.",
                      hw_opt->nthreads_tmpi, hw_opt->nthreads_tot);
        }
    }
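
    /* Example (illustrative): "mdrun -nt 16 -ntmpi 4 -ntomp 4" passes the
     * checks above (16 == 4*4), while "-nt 16 -ntmpi 3" fails the
     * divisibility check, and "-nt 8 -ntomp 16" is rejected because the
     * OpenMP thread count neither divides nor fits within the total. */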

    if (debug)
    {
        print_hw_opt(debug, hw_opt);
    }

    /* Asserting this simplifies the hardware resource division later
     * on. */
    GMX_RELEASE_ASSERT(!(hw_opt->nthreads_omp_pme >= 1 && hw_opt->nthreads_omp <= 0),
                       "PME thread count should only be set when the normal thread count is also set");
}

/* Checks we can do when we know the cut-off scheme */
void check_and_update_hw_opt_2(gmx_hw_opt_t *hw_opt,
                               int           cutoff_scheme)
{
    if (cutoff_scheme == ecutsGROUP)
    {
        /* We only have OpenMP support for PME-only nodes */
        if (hw_opt->nthreads_omp > 1)
        {
            gmx_fatal(FARGS, "OpenMP threads have been requested with cut-off scheme %s, but these are only supported with cut-off scheme %s",
                      ecutscheme_names[cutoff_scheme],
                      ecutscheme_names[ecutsVERLET]);
        }
        hw_opt->nthreads_omp = 1;
    }
}

/* Checks we can do when we know the thread-MPI rank count */
void check_and_update_hw_opt_3(gmx_hw_opt_t *hw_opt)
{
#if GMX_THREAD_MPI
    GMX_RELEASE_ASSERT(hw_opt->nthreads_tmpi >= 1, "Must have at least one thread-MPI rank");

    /* If the user set the total number of threads on the command line
     * and did not specify the number of OpenMP threads, set the latter here.
     */
    if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp <= 0)
    {
        hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;

        if (!bHasOmpSupport && hw_opt->nthreads_omp > 1)
        {
            gmx_fatal(FARGS, "You (indirectly) asked for OpenMP threads by setting -nt > -ntmpi, but GROMACS was compiled without OpenMP support");
        }
    }
#endif

    GMX_RELEASE_ASSERT(bHasOmpSupport || hw_opt->nthreads_omp == 1, "Without OpenMP support, only one thread per rank can be used");

    /* We are done with updating nthreads_omp, we can set nthreads_omp_pme */
    if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0)
    {
        hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp;
    }

    if (debug)
    {
        print_hw_opt(debug, hw_opt);
    }
}