src/gromacs/hardware/detecthardware.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 #include "gmxpre.h"
  36
  37 #include "detecthardware.h"
  38
  39 #include "config.h"
  40
  41 #include <cerrno>
  42 #include <cstdlib>
  43 #include <cstring>
  44
  45 #include <algorithm>
  46 #include <chrono>
  47 #include <string>
  48 #include <thread>
  49 #include <vector>
  50
  51 #include "thread_mpi/threads.h"
  52
  53 #include "gromacs/gmxlib/network.h"
  54 #include "gromacs/gpu_utils/gpu_utils.h"
  55 #include "gromacs/hardware/cpuinfo.h"
  56 #include "gromacs/hardware/gpu_hw_info.h"
  57 #include "gromacs/hardware/hardwaretopology.h"
  58 #include "gromacs/hardware/hw_info.h"
  59 #include "gromacs/mdtypes/commrec.h"
  60 #include "gromacs/mdtypes/md_enums.h"
  61 #include "gromacs/simd/support.h"
  62 #include "gromacs/utility/arrayref.h"
  63 #include "gromacs/utility/basedefinitions.h"
  64 #include "gromacs/utility/basenetwork.h"
  65 #include "gromacs/utility/baseversion.h"
  66 #include "gromacs/utility/cstringutil.h"
  67 #include "gromacs/utility/exceptions.h"
  68 #include "gromacs/utility/fatalerror.h"
  69 #include "gromacs/utility/gmxassert.h"
  70 #include "gromacs/utility/logger.h"
  71 #include "gromacs/utility/programcontext.h"
  72 #include "gromacs/utility/smalloc.h"
  73 #include "gromacs/utility/stringutil.h"
  74 #include "gromacs/utility/sysinfo.h"
  75
  76 #ifdef HAVE_UNISTD_H
  77 #    include <unistd.h>       // sysconf()
  78 #endif
  79
  80 //! Convenience macro to help us avoid ifdefs each time we use sysconf
  81 #if !defined(_SC_NPROCESSORS_ONLN) && defined(_SC_NPROC_ONLN)
  82 #    define _SC_NPROCESSORS_ONLN _SC_NPROC_ONLN
  83 #endif
  84
  85 //! Convenience macro to help us avoid ifdefs each time we use sysconf
  86 #if !defined(_SC_NPROCESSORS_CONF) && defined(_SC_NPROC_CONF)
  87 #    define _SC_NPROCESSORS_CONF _SC_NPROC_CONF
  88 #endif
  89
  90 #if defined (__i386__) || defined (__x86_64__) || defined (_M_IX86) || defined (_M_X64)
  91 //! Constant used to help minimize preprocessed code
  92 static const bool isX86 = true;
  93 #else
  94 //! Constant used to help minimize preprocessed code
  95 static const bool isX86 = false;
  96 #endif
  97
  98 #if defined __powerpc__ || defined __ppc__ || defined __PPC__
  99 static const bool isPowerPC = true;
 100 #else
 101 static const bool isPowerPC = false;
 102 #endif
 103
 104 //! Constant used to help minimize preprocessed code
 105 static const bool bGPUBinary     = GMX_GPU != GMX_GPU_NONE;
 106
 107 /* Note that some of the following arrays must match the "GPU support
 108  * enumeration" in src/config.h.cmakein, so that GMX_GPU looks up an
 109  * array entry. */
 110
 111 /* Both CUDA and OpenCL (on the supported/tested platforms) supports
 112  * GPU device sharing.
 113  */
 114 static const bool gpuSharingSupport[] = { false, true, true };
 115 static const bool bGpuSharingSupported = gpuSharingSupport[GMX_GPU];
 116
 117 /* Both CUDA and OpenCL (on the tested/supported platforms) supports everything.
 118  */
 119 static const bool multiGpuSupport[] = {
 120     false, true, true
 121 };
 122 static const bool bMultiGpuPerNodeSupported = multiGpuSupport[GMX_GPU];
 123
 124 // TODO If/when we unify CUDA and OpenCL support code, this should
 125 // move to a single place in gpu_utils.
 126 /* Names of the GPU detection/check results (see e_gpu_detect_res_t in hw_info.h). */
 127 const char * const gpu_detect_res_str[egpuNR] =
 128 {
 129     "compatible", "inexistent", "incompatible", "insane"
 130 };
 131
 132 static const char * invalid_gpuid_hint =
 133     "A delimiter-free sequence of valid numeric IDs of available GPUs is expected.";
 134
 135 /* The globally shared hwinfo structure. */
 136 static gmx_hw_info_t      *hwinfo_g;
 137 /* A reference counter for the hwinfo structure */
 138 static int                 n_hwinfo = 0;
 139 /* A lock to protect the hwinfo structure */
 140 static tMPI_Thread_mutex_t hw_info_lock = TMPI_THREAD_MUTEX_INITIALIZER;
 141
 142 #define HOSTNAMELEN 80
 143
 144 /* FW decl. */
 145 static size_t gmx_count_gpu_dev_unique(const gmx_gpu_info_t *gpu_info,
 146                                        const gmx_gpu_opt_t  *gpu_opt);
 147
 148 gmx_bool gmx_multiple_gpu_per_node_supported()
 149 {
 150     return bMultiGpuPerNodeSupported;
 151 }
 152
 153 gmx_bool gmx_gpu_sharing_supported()
 154 {
 155     return bGpuSharingSupported;
 156 }
 157
 158 std::string sprint_gpus(const gmx_gpu_info_t *gpu_info)
 159 {
 160     char                     stmp[STRLEN];
 161     std::vector<std::string> gpuStrings;
 162     for (int i = 0; i < gpu_info->n_dev; i++)
 163     {
 164         get_gpu_device_info_string(stmp, gpu_info, i);
 165         gpuStrings.push_back(gmx::formatString("    %s", stmp));
 166     }
 167     return gmx::joinStrings(gpuStrings, "\n");
 168 }
 169
 170 /*! \brief Helper function for reporting GPU usage information
 171  * in the mdrun log file
 172  *
 173  * \param[in] gpu_info       Pointer to per-node GPU info struct
 174  * \param[in] gpu_opt        Pointer to per-node GPU options struct
 175  * \param[in] numPpRanks     Number of PP ranks per node
 176  * \param[in] bPrintHostName Print the hostname in the usage information
 177  * \return                   String to write to the log file
 178  * \throws                   std::bad_alloc if out of memory */
 179 static std::string
 180 makeGpuUsageReport(const gmx_gpu_info_t *gpu_info,
 181                    const gmx_gpu_opt_t  *gpu_opt,
 182                    size_t                numPpRanks,
 183                    bool                  bPrintHostName)
 184 {
 185     int  ngpu_use  = gpu_opt->n_dev_use;
 186     int  ngpu_comp = gpu_info->n_dev_compatible;
 187     char host[HOSTNAMELEN];
 188
 189     if (bPrintHostName)
 190     {
 191         gmx_gethostname(host, HOSTNAMELEN);
 192     }
 193
 194     /* Issue a note if GPUs are available but not used */
 195     if (ngpu_comp > 0 && ngpu_use < 1)
 196     {
 197         return gmx::formatString("%d compatible GPU%s detected in the system, but none will be used.\n"
 198                                  "Consider trying GPU acceleration with the Verlet scheme!\n",
 199                                  ngpu_comp, (ngpu_comp > 1) ? "s" : "");
 200     }
 201
 202     std::string output;
 203     if (!gpu_opt->bUserSet)
 204     {
 205         // gpu_opt->dev_compatible is only populated during auto-selection
 206         std::string gpuIdsString =
 207             formatAndJoin(gmx::constArrayRefFromArray(gpu_opt->dev_compatible,
 208                                                       gpu_opt->n_dev_compatible),
 209                           ",", gmx::StringFormatter("%d"));
 210         bool bPluralGpus = gpu_opt->n_dev_compatible > 1;
 211
 212         if (bPrintHostName)
 213         {
 214             output += gmx::formatString("On host %s ", host);
 215         }
 216         output += gmx::formatString("%d compatible GPU%s %s present, with ID%s %s\n",
 217                                     gpu_opt->n_dev_compatible,
 218                                     bPluralGpus ? "s" : "",
 219                                     bPluralGpus ? "are" : "is",
 220                                     bPluralGpus ? "s" : "",
 221                                     gpuIdsString.c_str());
 222     }
 223
 224     {
 225         std::vector<int> gpuIdsInUse;
 226         for (int i = 0; i < ngpu_use; i++)
 227         {
 228             gpuIdsInUse.push_back(get_gpu_device_id(gpu_info, gpu_opt, i));
 229         }
 230         std::string gpuIdsString =
 231             formatAndJoin(gpuIdsInUse, ",", gmx::StringFormatter("%d"));
 232         size_t      numGpusInUse = gmx_count_gpu_dev_unique(gpu_info, gpu_opt);
 233         bool        bPluralGpus  = numGpusInUse > 1;
 234
 235         if (bPrintHostName)
 236         {
 237             output += gmx::formatString("On host %s ", host);
 238         }
 239         output += gmx::formatString("%zu GPU%s %sselected for this run.\n"
 240                                     "Mapping of GPU ID%s to the %d PP rank%s in this node: %s\n",
 241                                     numGpusInUse, bPluralGpus ? "s" : "",
 242                                     gpu_opt->bUserSet ? "user-" : "auto-",
 243                                     bPluralGpus ? "s" : "",
 244                                     numPpRanks,
 245                                     (numPpRanks > 1) ? "s" : "",
 246                                     gpuIdsString.c_str());
 247     }
 248
 249     return output;
 250 }
 251
 252 /* Give a suitable fatal error or warning if the build configuration
 253    and runtime CPU do not match. */
 254 static void
 255 check_use_of_rdtscp_on_this_cpu(const gmx::MDLogger   &mdlog,
 256                                 const gmx::CpuInfo    &cpuInfo)
 257 {
 258 #ifdef HAVE_RDTSCP
 259     bool binaryUsesRdtscp = TRUE;
 260 #else
 261     bool binaryUsesRdtscp = FALSE;
 262 #endif
 263
 264     const char *programName = gmx::getProgramContext().displayName();
 265
 266     if (cpuInfo.supportLevel() < gmx::CpuInfo::SupportLevel::Features)
 267     {
 268         if (binaryUsesRdtscp)
 269         {
 270             GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
 271                     "The %s executable was compiled to use the rdtscp CPU instruction. "
 272                     "We cannot detect the features of your current CPU, but will proceed anyway. "
 273                     "If you get a crash, rebuild GROMACS with the GMX_USE_RDTSCP=OFF CMake option.",
 274                     programName);
 275         }
 276     }
 277     else
 278     {
 279         bool cpuHasRdtscp = cpuInfo.feature(gmx::CpuInfo::Feature::X86_Rdtscp);
 280
 281         if (!cpuHasRdtscp && binaryUsesRdtscp)
 282         {
 283             gmx_fatal(FARGS, "The %s executable was compiled to use the rdtscp CPU instruction. "
 284                       "However, this is not supported by the current hardware and continuing would lead to a crash. "
 285                       "Please rebuild GROMACS with the GMX_USE_RDTSCP=OFF CMake option.",
 286                       programName);
 287         }
 288
 289         if (cpuHasRdtscp && !binaryUsesRdtscp)
 290         {
 291             GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
 292                     "The current CPU can measure timings more accurately than the code in\n"
 293                     "%s was configured to use. This might affect your simulation\n"
 294                     "speed as accurate timings are needed for load-balancing.\n"
 295                     "Please consider rebuilding %s with the GMX_USE_RDTSCP=ON CMake option.",
 296                     programName, programName);
 297         }
 298     }
 299 }
 300
 301 void gmx_check_hw_runconf_consistency(const gmx::MDLogger &mdlog,
 302                                       const gmx_hw_info_t *hwinfo,
 303                                       const t_commrec     *cr,
 304                                       const gmx_hw_opt_t  *hw_opt,
 305                                       gmx_bool             bUseGPU)
 306 {
 307     int      npppn;
 308     char     th_or_proc[STRLEN], th_or_proc_plural[STRLEN], pernode[STRLEN];
 309     gmx_bool btMPI, bMPI, bNthreadsAuto, bEmulateGPU;
 310
 311     GMX_RELEASE_ASSERT(hwinfo, "hwinfo must be a non-NULL pointer");
 312     GMX_RELEASE_ASSERT(cr, "cr must be a non-NULL pointer");
 313
 314     /* Below we only do consistency checks for PP and GPUs,
 315      * this is irrelevant for PME only nodes, so in that case we return
 316      * here.
 317      */
 318     if (!(cr->duty & DUTY_PP))
 319     {
 320         return;
 321     }
 322
 323 #if GMX_THREAD_MPI
 324     bMPI          = FALSE;
 325     btMPI         = TRUE;
 326     bNthreadsAuto = (hw_opt->nthreads_tmpi < 1);
 327 #elif GMX_LIB_MPI
 328     bMPI          = TRUE;
 329     btMPI         = FALSE;
 330     bNthreadsAuto = FALSE;
 331 #else
 332     bMPI          = FALSE;
 333     btMPI         = FALSE;
 334     bNthreadsAuto = FALSE;
 335 #endif
 336
 337     /* GPU emulation detection is done later, but we need here as well
 338      * -- uncool, but there's no elegant workaround */
 339     bEmulateGPU       = (getenv("GMX_EMULATE_GPU") != nullptr);
 340
 341     if (hwinfo->gpu_info.n_dev_compatible > 0)
 342     {
 343         std::string gpuUsageReport;
 344         try
 345         {
 346             gpuUsageReport = makeGpuUsageReport(&hwinfo->gpu_info,
 347                                                 &hw_opt->gpu_opt,
 348                                                 cr->nrank_pp_intranode,
 349                                                 bMPI && cr->nnodes > 1);
 350         }
 351         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
 352
 353         /* NOTE: this print is only for and on one physical node */
 354         GMX_LOG(mdlog.warning).appendText(gpuUsageReport);
 355     }
 356
 357     /* Need to ensure that we have enough GPUs:
 358      * - need one GPU per PP node
 359      * - no GPU oversubscription with tMPI
 360      * */
 361     /* number of PP processes per node */
 362     npppn = cr->nrank_pp_intranode;
 363
 364     pernode[0]           = '\0';
 365     th_or_proc_plural[0] = '\0';
 366     if (btMPI)
 367     {
 368         sprintf(th_or_proc, "thread-MPI thread");
 369         if (npppn > 1)
 370         {
 371             sprintf(th_or_proc_plural, "s");
 372         }
 373     }
 374     else if (bMPI)
 375     {
 376         sprintf(th_or_proc, "MPI process");
 377         if (npppn > 1)
 378         {
 379             sprintf(th_or_proc_plural, "es");
 380         }
 381         sprintf(pernode, " per node");
 382     }
 383     else
 384     {
 385         /* neither MPI nor tMPI */
 386         sprintf(th_or_proc, "process");
 387     }
 388
 389     if (bUseGPU && hwinfo->gpu_info.n_dev_compatible > 0 &&
 390         !bEmulateGPU)
 391     {
 392         int  ngpu_comp, ngpu_use;
 393         char gpu_comp_plural[2], gpu_use_plural[2];
 394
 395         ngpu_comp = hwinfo->gpu_info.n_dev_compatible;
 396         ngpu_use  = hw_opt->gpu_opt.n_dev_use;
 397
 398         sprintf(gpu_comp_plural, "%s", (ngpu_comp > 1) ? "s" : "");
 399         sprintf(gpu_use_plural,  "%s", (ngpu_use > 1) ? "s" : "");
 400
 401         const char *programName = gmx::getProgramContext().displayName();
 402
 403         /* number of tMPI threads auto-adjusted */
 404         if (btMPI && bNthreadsAuto)
 405         {
 406             if (hw_opt->gpu_opt.bUserSet && npppn < ngpu_use)
 407             {
 408                 /* The user manually provided more GPUs than threads we
 409                    could automatically start. */
 410                 gmx_fatal(FARGS,
 411                           "%d GPU%s provided, but only %d PP thread-MPI thread%s coud be started.\n"
 412                           "%s requires one PP tread-MPI thread per GPU; use fewer GPUs.",
 413                           ngpu_use, gpu_use_plural,
 414                           npppn, th_or_proc_plural,
 415                           programName);
 416             }
 417
 418             if (!hw_opt->gpu_opt.bUserSet && npppn < ngpu_comp)
 419             {
 420                 /* There are more GPUs than tMPI threads; we have
 421                    limited the number GPUs used. */
 422                 GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
 423                         "NOTE: %d GPU%s were detected, but only %d PP thread-MPI thread%s can be started.\n"
 424                         "      %s can use one GPU per PP tread-MPI thread, so only %d GPU%s will be used.",
 425                         ngpu_comp, gpu_comp_plural,
 426                         npppn, th_or_proc_plural,
 427                         programName, npppn,
 428                         npppn > 1 ? "s" : "");
 429             }
 430         }
 431
 432         if (hw_opt->gpu_opt.bUserSet)
 433         {
 434             if (ngpu_use != npppn)
 435             {
 436                 gmx_fatal(FARGS,
 437                           "Incorrect launch configuration: mismatching number of PP %s%s and GPUs%s.\n"
 438                           "%s was started with %d PP %s%s%s, but you provided %d GPU%s.",
 439                           th_or_proc, btMPI ? "s" : "es", pernode,
 440                           programName, npppn, th_or_proc,
 441                           th_or_proc_plural, pernode,
 442                           ngpu_use, gpu_use_plural);
 443             }
 444         }
 445         else
 446         {
 447             /* TODO Should we have a gpu_opt->n_dev_supported field? */
 448             if (ngpu_comp > npppn && gmx_multiple_gpu_per_node_supported())
 449             {
 450                 GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
 451                         "NOTE: potentially sub-optimal launch configuration, %s started with less\n"
 452                         "      PP %s%s%s than GPU%s available.\n"
 453                         "      Each PP %s can use only one GPU, %d GPU%s%s will be used.",
 454                         programName, th_or_proc,
 455                         th_or_proc_plural, pernode, gpu_comp_plural,
 456                         th_or_proc, npppn, gpu_use_plural, pernode);
 457             }
 458
 459             if (ngpu_use != npppn)
 460             {
 461                 /* Avoid duplicate error messages.
 462                  * Unfortunately we can only do this at the physical node
 463                  * level, since the hardware setup and MPI process count
 464                  * might differ between physical nodes.
 465                  */
 466                 if (cr->rank_pp_intranode == 0)
 467                 {
 468                     std::string reasonForLimit;
 469                     if (ngpu_comp > 1 &&
 470                         ngpu_use == 1 &&
 471                         !gmx_multiple_gpu_per_node_supported())
 472                     {
 473                         reasonForLimit  = "can be used by ";
 474                         reasonForLimit += getGpuImplementationString();
 475                         reasonForLimit += " in GROMACS";
 476                     }
 477                     else
 478                     {
 479                         reasonForLimit = "was detected";
 480                     }
 481                     gmx_fatal(FARGS,
 482                               "Incorrect launch configuration: mismatching number of PP %s%s and GPUs%s.\n"
 483                               "%s was started with %d PP %s%s%s, but only %d GPU%s %s.",
 484                               th_or_proc, btMPI ? "s" : "es", pernode,
 485                               programName, npppn, th_or_proc,
 486                               th_or_proc_plural, pernode,
 487                               ngpu_use, gpu_use_plural, reasonForLimit.c_str());
 488                 }
 489             }
 490         }
 491
 492         {
 493             int      same_count;
 494
 495             same_count = gmx_count_gpu_dev_shared(&hw_opt->gpu_opt);
 496
 497             if (same_count > 0)
 498             {
 499                 GMX_LOG(mdlog.warning).appendTextFormatted(
 500                         "NOTE: You assigned %s to multiple %s%s.",
 501                         same_count > 1 ? "GPUs" : "a GPU", th_or_proc, btMPI ? "s" : "es");
 502             }
 503         }
 504     }
 505
 506 #if GMX_MPI
 507     if (PAR(cr))
 508     {
 509         /* Avoid other ranks to continue after
 510            inconsistency */
 511         MPI_Barrier(cr->mpi_comm_mygroup);
 512     }
 513 #endif
 514
 515 }
 516
 517 /* Return 0 if none of the GPU (per node) are shared among PP ranks.
 518  *
 519  * Sharing GPUs among multiple PP ranks is possible when the user passes
 520  * GPU IDs. Here we check for sharing and return a non-zero value when
 521  * this is detected. Note that the return value represents the number of
 522  * PP rank pairs that share a device.
 523  */
 524 int gmx_count_gpu_dev_shared(const gmx_gpu_opt_t *gpu_opt)
 525 {
 526     int      same_count    = 0;
 527     int      ngpu          = gpu_opt->n_dev_use;
 528
 529     if (gpu_opt->bUserSet)
 530     {
 531         int      i, j;
 532
 533         for (i = 0; i < ngpu - 1; i++)
 534         {
 535             for (j = i + 1; j < ngpu; j++)
 536             {
 537                 same_count      += (gpu_opt->dev_use[i] ==
 538                                     gpu_opt->dev_use[j]);
 539             }
 540         }
 541     }
 542
 543     return same_count;
 544 }
 545
 546 /* Count and return the number of unique GPUs (per node) selected.
 547  *
 548  * As sharing GPUs among multiple PP ranks is possible when the user passes
 549  * GPU IDs, the number of GPUs user (per node) can be different from the
 550  * number of GPU IDs selected.
 551  */
 552 static size_t gmx_count_gpu_dev_unique(const gmx_gpu_info_t *gpu_info,
 553                                        const gmx_gpu_opt_t  *gpu_opt)
 554 {
 555     GMX_RELEASE_ASSERT(gpu_info, "gpu_info must be a non-NULL pointer");
 556     GMX_RELEASE_ASSERT(gpu_opt, "gpu_opt must be a non-NULL pointer");
 557
 558     std::set<int> uniqIds;
 559     for (int i = 0; i < gpu_opt->n_dev_use; i++)
 560     {
 561         int deviceId = get_gpu_device_id(gpu_info, gpu_opt, i);
 562         uniqIds.insert(deviceId);
 563     }
 564     return uniqIds.size();
 565 }
 566
 567 static void gmx_detect_gpus(const gmx::MDLogger &mdlog, const t_commrec *cr)
 568 {
 569 #if GMX_LIB_MPI
 570     int              rank_world;
 571     MPI_Comm         physicalnode_comm;
 572 #endif
 573     int              rank_local;
 574
 575     /* Under certain circumstances MPI ranks on the same physical node
 576      * can not simultaneously access the same GPU(s). Therefore we run
 577      * the detection only on one MPI rank per node and broadcast the info.
 578      * Note that with thread-MPI only a single thread runs this code.
 579      *
 580      * NOTE: We can't broadcast gpu_info with OpenCL as the device and platform
 581      * ID stored in the structure are unique for each rank (even if a device
 582      * is shared by multiple ranks).
 583      *
 584      * TODO: We should also do CPU hardware detection only once on each
 585      * physical node and broadcast it, instead of do it on every MPI rank.
 586      */
 587 #if GMX_LIB_MPI
 588     /* A split of MPI_COMM_WORLD over physical nodes is only required here,
 589      * so we create and destroy it locally.
 590      */
 591     MPI_Comm_rank(MPI_COMM_WORLD, &rank_world);
 592     MPI_Comm_split(MPI_COMM_WORLD, gmx_physicalnode_id_hash(),
 593                    rank_world, &physicalnode_comm);
 594     MPI_Comm_rank(physicalnode_comm, &rank_local);
 595     GMX_UNUSED_VALUE(cr);
 596 #else
 597     /* Here there should be only one process, check this */
 598     GMX_RELEASE_ASSERT(cr->nnodes == 1 && cr->sim_nodeid == 0, "Only a single (master) process should execute here");
 599
 600     rank_local = 0;
 601 #endif
 602
 603     /*  With CUDA detect only on one rank per host, with OpenCL need do
 604      *  the detection on all PP ranks */
 605     bool isOpenclPpRank = ((GMX_GPU == GMX_GPU_OPENCL) && (cr->duty & DUTY_PP));
 606
 607     if (rank_local == 0 || isOpenclPpRank)
 608     {
 609         char detection_error[STRLEN] = "", sbuf[STRLEN];
 610
 611         if (detect_gpus(&hwinfo_g->gpu_info, detection_error) != 0)
 612         {
 613             if (detection_error[0] != '\0')
 614             {
 615                 sprintf(sbuf, ":\n      %s\n", detection_error);
 616             }
 617             else
 618             {
 619                 sprintf(sbuf, ".");
 620             }
 621             GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
 622                     "NOTE: Error occurred during GPU detection%s"
 623                     "      Can not use GPU acceleration, will fall back to CPU kernels.",
 624                     sbuf);
 625         }
 626     }
 627
 628 #if GMX_LIB_MPI
 629     if (!isOpenclPpRank)
 630     {
 631         /* Broadcast the GPU info to the other ranks within this node */
 632         MPI_Bcast(&hwinfo_g->gpu_info.n_dev, 1, MPI_INT, 0, physicalnode_comm);
 633
 634         if (hwinfo_g->gpu_info.n_dev > 0)
 635         {
 636             int dev_size;
 637
 638             dev_size = hwinfo_g->gpu_info.n_dev*sizeof_gpu_dev_info();
 639
 640             if (rank_local > 0)
 641             {
 642                 hwinfo_g->gpu_info.gpu_dev =
 643                     (struct gmx_device_info_t *)malloc(dev_size);
 644             }
 645             MPI_Bcast(hwinfo_g->gpu_info.gpu_dev, dev_size, MPI_BYTE,
 646                       0, physicalnode_comm);
 647             MPI_Bcast(&hwinfo_g->gpu_info.n_dev_compatible, 1, MPI_INT,
 648                       0, physicalnode_comm);
 649         }
 650     }
 651
 652     MPI_Comm_free(&physicalnode_comm);
 653 #endif
 654 }
 655
 656 static void gmx_collect_hardware_mpi(const gmx::CpuInfo &cpuInfo)
 657 {
 658     const int ncore = hwinfo_g->hardwareTopology->numberOfCores();
 659 #if GMX_LIB_MPI
 660     int       rank_id;
 661     int       nrank, rank, nhwthread, ngpu, i;
 662     int       gpu_hash;
 663     int      *buf, *all;
 664
 665     rank_id   = gmx_physicalnode_id_hash();
 666     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 667     MPI_Comm_size(MPI_COMM_WORLD, &nrank);
 668     nhwthread = hwinfo_g->nthreads_hw_avail;
 669     ngpu      = hwinfo_g->gpu_info.n_dev_compatible;
 670     /* Create a unique hash of the GPU type(s) in this node */
 671     gpu_hash  = 0;
 672     /* Here it might be better to only loop over the compatible GPU, but we
 673      * don't have that information available and it would also require
 674      * removing the device ID from the device info string.
 675      */
 676     for (i = 0; i < hwinfo_g->gpu_info.n_dev; i++)
 677     {
 678         char stmp[STRLEN];
 679
 680         /* Since the device ID is incorporated in the hash, the order of
 681          * the GPUs affects the hash. Also two identical GPUs won't give
 682          * a gpu_hash of zero after XORing.
 683          */
 684         get_gpu_device_info_string(stmp, &hwinfo_g->gpu_info, i);
 685         gpu_hash ^= gmx_string_fullhash_func(stmp, gmx_string_hash_init);
 686     }
 687
 688     snew(buf, nrank);
 689     snew(all, nrank);
 690     buf[rank] = rank_id;
 691
 692     MPI_Allreduce(buf, all, nrank, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
 693
 694     gmx_bool bFound;
 695     int      nnode0, ncore0, nhwthread0, ngpu0, r;
 696
 697     bFound     = FALSE;
 698     ncore0     = 0;
 699     nnode0     = 0;
 700     nhwthread0 = 0;
 701     ngpu0      = 0;
 702     for (r = 0; r < nrank; r++)
 703     {
 704         if (all[r] == rank_id)
 705         {
 706             if (!bFound && r == rank)
 707             {
 708                 /* We are the first rank in this physical node */
 709                 nnode0     = 1;
 710                 ncore0     = ncore;
 711                 nhwthread0 = nhwthread;
 712                 ngpu0      = ngpu;
 713             }
 714             bFound = TRUE;
 715         }
 716     }
 717
 718     sfree(buf);
 719     sfree(all);
 720
 721     int sum[4], maxmin[10];
 722
 723     {
 724         int buf[4];
 725
 726         /* Sum values from only intra-rank 0 so we get the sum over all nodes */
 727         buf[0] = nnode0;
 728         buf[1] = ncore0;
 729         buf[2] = nhwthread0;
 730         buf[3] = ngpu0;
 731
 732         MPI_Allreduce(buf, sum, 4, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
 733     }
 734
 735     {
 736         int buf[10];
 737
 738         /* Store + and - values for all ranks,
 739          * so we can get max+min with one MPI call.
 740          */
 741         buf[0] = ncore;
 742         buf[1] = nhwthread;
 743         buf[2] = ngpu;
 744         buf[3] = static_cast<int>(gmx::simdSuggested(cpuInfo));
 745         buf[4] = gpu_hash;
 746         buf[5] = -buf[0];
 747         buf[6] = -buf[1];
 748         buf[7] = -buf[2];
 749         buf[8] = -buf[3];
 750         buf[9] = -buf[4];
 751
 752         MPI_Allreduce(buf, maxmin, 10, MPI_INT, MPI_MAX, MPI_COMM_WORLD);
 753     }
 754
 755     hwinfo_g->nphysicalnode       = sum[0];
 756     hwinfo_g->ncore_tot           = sum[1];
 757     hwinfo_g->ncore_min           = -maxmin[5];
 758     hwinfo_g->ncore_max           = maxmin[0];
 759     hwinfo_g->nhwthread_tot       = sum[2];
 760     hwinfo_g->nhwthread_min       = -maxmin[6];
 761     hwinfo_g->nhwthread_max       = maxmin[1];
 762     hwinfo_g->ngpu_compatible_tot = sum[3];
 763     hwinfo_g->ngpu_compatible_min = -maxmin[7];
 764     hwinfo_g->ngpu_compatible_max = maxmin[2];
 765     hwinfo_g->simd_suggest_min    = -maxmin[8];
 766     hwinfo_g->simd_suggest_max    = maxmin[3];
 767     hwinfo_g->bIdenticalGPUs      = (maxmin[4] == -maxmin[9]);
 768 #else
 769     /* All ranks use the same pointer, protected by a mutex in the caller */
 770     hwinfo_g->nphysicalnode       = 1;
 771     hwinfo_g->ncore_tot           = ncore;
 772     hwinfo_g->ncore_min           = ncore;
 773     hwinfo_g->ncore_max           = ncore;
 774     hwinfo_g->nhwthread_tot       = hwinfo_g->nthreads_hw_avail;
 775     hwinfo_g->nhwthread_min       = hwinfo_g->nthreads_hw_avail;
 776     hwinfo_g->nhwthread_max       = hwinfo_g->nthreads_hw_avail;
 777     hwinfo_g->ngpu_compatible_tot = hwinfo_g->gpu_info.n_dev_compatible;
 778     hwinfo_g->ngpu_compatible_min = hwinfo_g->gpu_info.n_dev_compatible;
 779     hwinfo_g->ngpu_compatible_max = hwinfo_g->gpu_info.n_dev_compatible;
 780     hwinfo_g->simd_suggest_min    = static_cast<int>(simdSuggested(cpuInfo));
 781     hwinfo_g->simd_suggest_max    = static_cast<int>(simdSuggested(cpuInfo));
 782     hwinfo_g->bIdenticalGPUs      = TRUE;
 783 #endif
 784 }
 785
 786 /*! \brief Utility that does dummy computing for max 2 seconds to spin up cores
 787  *
 788  *  This routine will check the number of cores configured and online
 789  *  (using sysconf), and the spins doing dummy compute operations for up to
 790  *  2 seconds, or until all cores have come online. This can be used prior to
 791  *  hardware detection for platforms that take unused processors offline.
 792  *
 793  *  This routine will not throw exceptions.
 794  */
 795 static void
 796 spinUpCore() noexcept
 797 {
 798 #if defined(HAVE_SYSCONF) && defined(_SC_NPROCESSORS_CONF) && defined(_SC_NPROCESSORS_ONLN)
 799     float dummy           = 0.1;
 800     int   countConfigured = sysconf(_SC_NPROCESSORS_CONF);    // noexcept
 801     auto  start           = std::chrono::steady_clock::now(); // noexcept
 802
 803     while (sysconf(_SC_NPROCESSORS_ONLN) < countConfigured &&
 804            std::chrono::steady_clock::now() - start < std::chrono::seconds(2))
 805     {
 806         for (int i = 1; i < 10000; i++)
 807         {
 808             dummy /= i;
 809         }
 810     }
 811
 812     if (dummy < 0)
 813     {
 814         printf("This cannot happen, but prevents loop from being optimized away.");
 815     }
 816 #endif
 817 }
 818
 819 /*! \brief Prepare the system before hardware topology detection
 820  *
 821  * This routine should perform any actions we want to put the system in a state
 822  * where we want it to be before detecting the hardware topology. For most
 823  * processors there is nothing to do, but some architectures (in particular ARM)
 824  * have support for taking configured cores offline, which will make them disappear
 825  * from the online processor count.
 826  *
 827  * This routine checks if there is a mismatch between the number of cores
 828  * configured and online, and in that case we issue a small workload that
 829  * attempts to wake sleeping cores before doing the actual detection.
 830  *
 831  * This type of mismatch can also occur for x86 or PowerPC on Linux, if SMT has only
 832  * been disabled in the kernel (rather than bios). Since those cores will never
 833  * come online automatically, we currently skip this test for x86 & PowerPC to
 834  * avoid wasting 2 seconds. We also skip the test if there is no thread support.
 835  *
 836  * \note Cores will sleep relatively quickly again, so it's important to issue
 837  *       the real detection code directly after this routine.
 838  */
 839 static void
 840 hardwareTopologyPrepareDetection()
 841 {
 842 #if defined(HAVE_SYSCONF) && defined(_SC_NPROCESSORS_CONF) && \
 843     (defined(THREAD_PTHREADS) || defined(THREAD_WINDOWS))
 844
 845     // Modify this conditional when/if x86 or PowerPC starts to sleep some cores
 846     if (!isX86 && !isPowerPC)
 847     {
 848         int                      countConfigured  = sysconf(_SC_NPROCESSORS_CONF);
 849         std::vector<std::thread> workThreads(countConfigured);
 850
 851         for (auto &t : workThreads)
 852         {
 853             t = std::thread(spinUpCore);
 854         }
 855
 856         for (auto &t : workThreads)
 857         {
 858             t.join();
 859         }
 860     }
 861 #endif
 862 }
 863
 864 /*! \brief Sanity check hardware topology and print some notes to log
 865  *
 866  *  \param mdlog            Logger.
 867  *  \param hardwareTopology Reference to hardwareTopology object.
 868  */
 869 static void
 870 hardwareTopologyDoubleCheckDetection(const gmx::MDLogger gmx_unused         &mdlog,
 871                                      const gmx::HardwareTopology gmx_unused &hardwareTopology)
 872 {
 873 #if defined HAVE_SYSCONF && defined(_SC_NPROCESSORS_CONF)
 874     if (hardwareTopology.supportLevel() < gmx::HardwareTopology::SupportLevel::LogicalProcessorCount)
 875     {
 876         return;
 877     }
 878
 879     int countFromDetection = hardwareTopology.machine().logicalProcessorCount;
 880     int countConfigured    = sysconf(_SC_NPROCESSORS_CONF);
 881
 882     /* BIOS, kernel or user actions can take physical processors
 883      * offline. We already cater for the some of the cases inside the hardwareToplogy
 884      * by trying to spin up cores just before we detect, but there could be other
 885      * cases where it is worthwhile to hint that there might be more resources available.
 886      */
 887     if (countConfigured >= 0 && countConfigured != countFromDetection)
 888     {
 889         GMX_LOG(mdlog.info).
 890             appendTextFormatted("Note: %d CPUs configured, but only %d were detected to be online.\n", countConfigured, countFromDetection);
 891
 892         if (isX86 && countConfigured == 2*countFromDetection)
 893         {
 894             GMX_LOG(mdlog.info).
 895                 appendText("      X86 Hyperthreading is likely disabled; enable it for better performance.");
 896         }
 897         // For PowerPC (likely Power8) it is possible to set SMT to either 2,4, or 8-way hardware threads.
 898         // We only warn if it is completely disabled since default performance drops with SMT8.
 899         if (isPowerPC && countConfigured == 8*countFromDetection)
 900         {
 901             GMX_LOG(mdlog.info).
 902                 appendText("      PowerPC SMT is likely disabled; enable SMT2/SMT4 for better performance.");
 903         }
 904     }
 905 #endif
 906 }
 907
 908
 909 gmx_hw_info_t *gmx_detect_hardware(const gmx::MDLogger &mdlog, const t_commrec *cr,
 910                                    gmx_bool bDetectGPUs)
 911 {
 912     int ret;
 913
 914     /* make sure no one else is doing the same thing */
 915     ret = tMPI_Thread_mutex_lock(&hw_info_lock);
 916     if (ret != 0)
 917     {
 918         gmx_fatal(FARGS, "Error locking hwinfo mutex: %s", strerror(errno));
 919     }
 920
 921     /* only initialize the hwinfo structure if it is not already initalized */
 922     if (n_hwinfo == 0)
 923     {
 924         snew(hwinfo_g, 1);
 925
 926         hwinfo_g->cpuInfo             = new gmx::CpuInfo(gmx::CpuInfo::detect());
 927
 928         hardwareTopologyPrepareDetection();
 929         hwinfo_g->hardwareTopology    = new gmx::HardwareTopology(gmx::HardwareTopology::detect());
 930
 931         // If we detected the topology on this system, double-check that it makes sense
 932         if (hwinfo_g->hardwareTopology->isThisSystem())
 933         {
 934             hardwareTopologyDoubleCheckDetection(mdlog, *(hwinfo_g->hardwareTopology));
 935         }
 936
 937         // TODO: Get rid of this altogether.
 938         hwinfo_g->nthreads_hw_avail = hwinfo_g->hardwareTopology->machine().logicalProcessorCount;
 939
 940         /* detect GPUs */
 941         hwinfo_g->gpu_info.n_dev            = 0;
 942         hwinfo_g->gpu_info.n_dev_compatible = 0;
 943         hwinfo_g->gpu_info.gpu_dev          = nullptr;
 944
 945         /* Run the detection if the binary was compiled with GPU support
 946          * and we requested detection.
 947          */
 948         hwinfo_g->gpu_info.bDetectGPUs =
 949             (bGPUBinary && bDetectGPUs &&
 950              getenv("GMX_DISABLE_GPU_DETECTION") == nullptr);
 951         if (hwinfo_g->gpu_info.bDetectGPUs)
 952         {
 953             gmx_detect_gpus(mdlog, cr);
 954         }
 955
 956         gmx_collect_hardware_mpi(*hwinfo_g->cpuInfo);
 957     }
 958     /* increase the reference counter */
 959     n_hwinfo++;
 960
 961     ret = tMPI_Thread_mutex_unlock(&hw_info_lock);
 962     if (ret != 0)
 963     {
 964         gmx_fatal(FARGS, "Error unlocking hwinfo mutex: %s", strerror(errno));
 965     }
 966
 967     return hwinfo_g;
 968 }
 969
 970 static std::string detected_hardware_string(const gmx_hw_info_t *hwinfo,
 971                                             bool                 bFullCpuInfo)
 972 {
 973     std::string                  s;
 974
 975     const gmx::CpuInfo          &cpuInfo = *hwinfo_g->cpuInfo;
 976     const gmx::HardwareTopology &hwTop   = *hwinfo->hardwareTopology;
 977
 978     s  = gmx::formatString("\n");
 979     s += gmx::formatString("Running on %d node%s with total",
 980                            hwinfo->nphysicalnode,
 981                            hwinfo->nphysicalnode == 1 ? "" : "s");
 982     if (hwinfo->ncore_tot > 0)
 983     {
 984         s += gmx::formatString(" %d cores,", hwinfo->ncore_tot);
 985     }
 986     s += gmx::formatString(" %d logical cores", hwinfo->nhwthread_tot);
 987     if (hwinfo->gpu_info.bDetectGPUs)
 988     {
 989         s += gmx::formatString(", %d compatible GPU%s",
 990                                hwinfo->ngpu_compatible_tot,
 991                                hwinfo->ngpu_compatible_tot == 1 ? "" : "s");
 992     }
 993     else if (bGPUBinary)
 994     {
 995         s += gmx::formatString(" (GPU detection deactivated)");
 996     }
 997     s += gmx::formatString("\n");
 998
 999     if (hwinfo->nphysicalnode > 1)
1000     {
1001         /* Print per node hardware feature counts */
1002         if (hwinfo->ncore_max > 0)
1003         {
1004             s += gmx::formatString("  Cores per node:           %2d", hwinfo->ncore_min);
1005             if (hwinfo->ncore_max > hwinfo->ncore_min)
1006             {
1007                 s += gmx::formatString(" - %2d", hwinfo->ncore_max);
1008             }
1009             s += gmx::formatString("\n");
1010         }
1011         s += gmx::formatString("  Logical cores per node:   %2d", hwinfo->nhwthread_min);
1012         if (hwinfo->nhwthread_max > hwinfo->nhwthread_min)
1013         {
1014             s += gmx::formatString(" - %2d", hwinfo->nhwthread_max);
1015         }
1016         s += gmx::formatString("\n");
1017         if (bGPUBinary)
1018         {
1019             s += gmx::formatString("  Compatible GPUs per node: %2d",
1020                                    hwinfo->ngpu_compatible_min);
1021             if (hwinfo->ngpu_compatible_max > hwinfo->ngpu_compatible_min)
1022             {
1023                 s += gmx::formatString(" - %2d", hwinfo->ngpu_compatible_max);
1024             }
1025             s += gmx::formatString("\n");
1026             if (hwinfo->ngpu_compatible_tot > 0)
1027             {
1028                 if (hwinfo->bIdenticalGPUs)
1029                 {
1030                     s += gmx::formatString("  All nodes have identical type(s) of GPUs\n");
1031                 }
1032                 else
1033                 {
1034                     /* This message will also appear with identical GPU types
1035                      * when at least one node has no GPU.
1036                      */
1037                     s += gmx::formatString("  Different nodes have different type(s) and/or order of GPUs\n");
1038                 }
1039             }
1040         }
1041     }
1042
1043 #if GMX_LIB_MPI
1044     char host[HOSTNAMELEN];
1045     int  rank;
1046
1047     gmx_gethostname(host, HOSTNAMELEN);
1048     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
1049
1050     s += gmx::formatString("Hardware detected on host %s (the node of MPI rank %d):\n",
1051                            host, rank);
1052 #else
1053     s += gmx::formatString("Hardware detected:\n");
1054 #endif
1055     s += gmx::formatString("  CPU info:\n");
1056
1057     s += gmx::formatString("    Vendor: %s\n", cpuInfo.vendorString().c_str());
1058
1059     s += gmx::formatString("    Brand:  %s\n", cpuInfo.brandString().c_str());
1060
1061     if (bFullCpuInfo)
1062     {
1063         s += gmx::formatString("    Family: %d   Model: %d   Stepping: %d\n",
1064                                cpuInfo.family(), cpuInfo.model(), cpuInfo.stepping());
1065
1066         s += gmx::formatString("    Features:");
1067         for (auto &f : cpuInfo.featureSet())
1068         {
1069             s += gmx::formatString(" %s", cpuInfo.featureString(f).c_str());;
1070         }
1071         s += gmx::formatString("\n");
1072     }
1073
1074     s += gmx::formatString("    SIMD instructions most likely to fit this hardware: %s",
1075                            gmx::simdString(static_cast<gmx::SimdType>(hwinfo->simd_suggest_min)).c_str());
1076
1077     if (hwinfo->simd_suggest_max > hwinfo->simd_suggest_min)
1078     {
1079         s += gmx::formatString(" - %s", gmx::simdString(static_cast<gmx::SimdType>(hwinfo->simd_suggest_max)).c_str());
1080     }
1081     s += gmx::formatString("\n");
1082
1083     s += gmx::formatString("    SIMD instructions selected at GROMACS compile time: %s\n",
1084                            gmx::simdString(gmx::simdCompiled()).c_str());
1085
1086     s += gmx::formatString("\n");
1087
1088     s += gmx::formatString("  Hardware topology: ");
1089     switch (hwTop.supportLevel())
1090     {
1091         case gmx::HardwareTopology::SupportLevel::None:
1092             s += gmx::formatString("None\n");
1093             break;
1094         case gmx::HardwareTopology::SupportLevel::LogicalProcessorCount:
1095             s += gmx::formatString("Only logical processor count\n");
1096             break;
1097         case gmx::HardwareTopology::SupportLevel::Basic:
1098             s += gmx::formatString("Basic\n");
1099             break;
1100         case gmx::HardwareTopology::SupportLevel::Full:
1101             s += gmx::formatString("Full\n");
1102             break;
1103         case gmx::HardwareTopology::SupportLevel::FullWithDevices:
1104             s += gmx::formatString("Full, with devices\n");
1105             break;
1106     }
1107
1108     if (!hwTop.isThisSystem())
1109     {
1110         s += gmx::formatString("  NOTE: Hardware topology cached or synthetic, not detected.\n");
1111         if (char *p = getenv("HWLOC_XMLFILE"))
1112         {
1113             s += gmx::formatString("        HWLOC_XMLFILE=%s\n", p);
1114         }
1115     }
1116
1117     if (bFullCpuInfo)
1118     {
1119         if (hwTop.supportLevel() >= gmx::HardwareTopology::SupportLevel::Basic)
1120         {
1121             s += gmx::formatString("    Sockets, cores, and logical processors:\n");
1122
1123             for (auto &socket : hwTop.machine().sockets)
1124             {
1125                 s += gmx::formatString("      Socket %2d:", socket.id);
1126                 for (auto &c : socket.cores)
1127                 {
1128                     s += gmx::formatString(" [");
1129                     for (auto &t : c.hwThreads)
1130                     {
1131                         s += gmx::formatString(" %3d", t.logicalProcessorId);
1132                     }
1133                     s += gmx::formatString("]");
1134                 }
1135                 s += gmx::formatString("\n");
1136             }
1137         }
1138         if (hwTop.supportLevel() >= gmx::HardwareTopology::SupportLevel::Full)
1139         {
1140             s += gmx::formatString("    Numa nodes:\n");
1141             for (auto &n : hwTop.machine().numa.nodes)
1142             {
1143                 s += gmx::formatString("      Node %2d (%" GMX_PRIu64 " bytes mem):", n.id, n.memory);
1144                 for (auto &l : n.logicalProcessorId)
1145                 {
1146                     s += gmx::formatString(" %3d", l);
1147                 }
1148                 s += gmx::formatString("\n");
1149             }
1150             s += gmx::formatString("      Latency:\n          ");
1151             for (std::size_t j = 0; j < hwTop.machine().numa.nodes.size(); j++)
1152             {
1153                 s += gmx::formatString(" %5d", j);
1154             }
1155             s += gmx::formatString("\n");
1156             for (std::size_t i = 0; i < hwTop.machine().numa.nodes.size(); i++)
1157             {
1158                 s += gmx::formatString("     %5d", i);
1159                 for (std::size_t j = 0; j < hwTop.machine().numa.nodes.size(); j++)
1160                 {
1161                     s += gmx::formatString(" %5.2f", hwTop.machine().numa.relativeLatency[i][j]);
1162                 }
1163                 s += gmx::formatString("\n");
1164             }
1165
1166
1167             s += gmx::formatString("    Caches:\n");
1168             for (auto &c : hwTop.machine().caches)
1169             {
1170                 s += gmx::formatString("      L%d: %" GMX_PRIu64 " bytes, linesize %d bytes, assoc. %d, shared %d ways\n",
1171                                        c.level, c.size, c.linesize, c.associativity, c.shared);
1172             }
1173         }
1174         if (hwTop.supportLevel() >= gmx::HardwareTopology::SupportLevel::FullWithDevices)
1175         {
1176             s += gmx::formatString("    PCI devices:\n");
1177             for (auto &d : hwTop.machine().devices)
1178             {
1179                 s += gmx::formatString("      %04x:%02x:%02x.%1x  Id: %04x:%04x  Class: 0x%04x  Numa: %d\n",
1180                                        d.domain, d.bus, d.dev, d.func, d.vendorId, d.deviceId, d.classId, d.numaNodeId);
1181             }
1182         }
1183     }
1184
1185     if (bGPUBinary && (hwinfo->ngpu_compatible_tot > 0 ||
1186                        hwinfo->gpu_info.n_dev > 0))
1187     {
1188         s += gmx::formatString("  GPU info:\n");
1189         s += gmx::formatString("    Number of GPUs detected: %d\n",
1190                                hwinfo->gpu_info.n_dev);
1191         if (hwinfo->gpu_info.n_dev > 0)
1192         {
1193             s += sprint_gpus(&hwinfo->gpu_info) + "\n";
1194         }
1195     }
1196     return s;
1197 }
1198
1199 void gmx_print_detected_hardware(FILE *fplog, const t_commrec *cr,
1200                                  const gmx::MDLogger &mdlog,
1201                                  const gmx_hw_info_t *hwinfo)
1202 {
1203     const gmx::CpuInfo &cpuInfo = *hwinfo_g->cpuInfo;
1204
1205     if (fplog != nullptr)
1206     {
1207         std::string detected;
1208
1209         detected = detected_hardware_string(hwinfo, TRUE);
1210
1211         fprintf(fplog, "%s\n", detected.c_str());
1212     }
1213
1214     if (MULTIMASTER(cr))
1215     {
1216         std::string detected;
1217
1218         detected = detected_hardware_string(hwinfo, FALSE);
1219
1220         fprintf(stderr, "%s\n", detected.c_str());
1221     }
1222
1223     /* Check the compiled SIMD instruction set against that of the node
1224      * with the lowest SIMD level support (skip if SIMD detection did not work)
1225      */
1226     if (cpuInfo.supportLevel() >= gmx::CpuInfo::SupportLevel::Features)
1227     {
1228         gmx::simdCheck(static_cast<gmx::SimdType>(hwinfo->simd_suggest_min), fplog, MULTIMASTER(cr));
1229     }
1230
1231     /* For RDTSCP we only check on our local node and skip the MPI reduction */
1232     check_use_of_rdtscp_on_this_cpu(mdlog, cpuInfo);
1233 }
1234
1235 //! \brief Return if any GPU ID (e.g in a user-supplied string) is repeated
1236 static gmx_bool anyGpuIdIsRepeated(const gmx_gpu_opt_t *gpu_opt)
1237 {
1238     /* Loop over IDs in the string */
1239     for (int i = 0; i < gpu_opt->n_dev_use - 1; ++i)
1240     {
1241         /* Look for the ID in location i in the following part of the
1242            string */
1243         for (int j = i + 1; j < gpu_opt->n_dev_use; ++j)
1244         {
1245             if (gpu_opt->dev_use[i] == gpu_opt->dev_use[j])
1246             {
1247                 /* Same ID found in locations i and j */
1248                 return TRUE;
1249             }
1250         }
1251     }
1252
1253     return FALSE;
1254 }
1255
1256 void gmx_parse_gpu_ids(gmx_gpu_opt_t *gpu_opt)
1257 {
1258     char *env;
1259
1260     if (gpu_opt->gpu_id != nullptr && !bGPUBinary)
1261     {
1262         gmx_fatal(FARGS, "GPU ID string set, but %s was compiled without GPU support!",
1263                   gmx::getProgramContext().displayName());
1264     }
1265
1266     env = getenv("GMX_GPU_ID");
1267     if (env != nullptr && gpu_opt->gpu_id != nullptr)
1268     {
1269         gmx_fatal(FARGS, "GMX_GPU_ID and -gpu_id can not be used at the same time");
1270     }
1271     if (env == nullptr)
1272     {
1273         env = gpu_opt->gpu_id;
1274     }
1275
1276     /* parse GPU IDs if the user passed any */
1277     if (env != nullptr)
1278     {
1279         /* Parse a "plain" or comma-separated GPU ID string which contains a
1280          * sequence of digits corresponding to GPU IDs; the order will
1281          * indicate the process/tMPI thread - GPU assignment. */
1282         parse_digits_from_string(env, &gpu_opt->n_dev_use, &gpu_opt->dev_use);
1283
1284         if (!gmx_multiple_gpu_per_node_supported() && 1 < gpu_opt->n_dev_use)
1285         {
1286             gmx_fatal(FARGS, "The %s implementation only supports using exactly one PP rank per node", getGpuImplementationString());
1287         }
1288         if (!gmx_gpu_sharing_supported() && anyGpuIdIsRepeated(gpu_opt))
1289         {
1290             gmx_fatal(FARGS, "The %s implementation only supports using exactly one PP rank per GPU", getGpuImplementationString());
1291         }
1292         if (gpu_opt->n_dev_use == 0)
1293         {
1294             gmx_fatal(FARGS, "Empty GPU ID string encountered.\n%s\n",
1295                       invalid_gpuid_hint);
1296         }
1297
1298         gpu_opt->bUserSet = TRUE;
1299     }
1300 }
1301
1302 void gmx_hardware_info_free(gmx_hw_info_t *hwinfo)
1303 {
1304     int ret;
1305
1306     ret = tMPI_Thread_mutex_lock(&hw_info_lock);
1307     if (ret != 0)
1308     {
1309         gmx_fatal(FARGS, "Error locking hwinfo mutex: %s", strerror(errno));
1310     }
1311
1312     /* decrease the reference counter */
1313     n_hwinfo--;
1314
1315
1316     if (hwinfo != hwinfo_g)
1317     {
1318         gmx_incons("hwinfo < hwinfo_g");
1319     }
1320
1321     if (n_hwinfo < 0)
1322     {
1323         gmx_incons("n_hwinfo < 0");
1324     }
1325
1326     if (n_hwinfo == 0)
1327     {
1328         delete hwinfo_g->cpuInfo;
1329         delete hwinfo_g->hardwareTopology;
1330         free_gpu_info(&hwinfo_g->gpu_info);
1331         sfree(hwinfo_g);
1332     }
1333
1334     ret = tMPI_Thread_mutex_unlock(&hw_info_lock);
1335     if (ret != 0)
1336     {
1337         gmx_fatal(FARGS, "Error unlocking hwinfo mutex: %s", strerror(errno));
1338     }
1339 }