src/gromacs/hardware/detecthardware.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 #include "gmxpre.h"
  36
  37 #include "detecthardware.h"
  38
  39 #include "config.h"
  40
  41 #include <cerrno>
  42 #include <cstdlib>
  43 #include <cstring>
  44
  45 #include <algorithm>
  46 #include <chrono>
  47 #include <string>
  48 #include <thread>
  49 #include <vector>
  50
  51 #include "thread_mpi/threads.h"
  52
  53 #include "gromacs/gmxlib/network.h"
  54 #include "gromacs/gpu_utils/gpu_utils.h"
  55 #include "gromacs/hardware/cpuinfo.h"
  56 #include "gromacs/hardware/gpu_hw_info.h"
  57 #include "gromacs/hardware/hardwareassign.h"
  58 #include "gromacs/hardware/hardwaretopology.h"
  59 #include "gromacs/hardware/hw_info.h"
  60 #include "gromacs/mdtypes/commrec.h"
  61 #include "gromacs/mdtypes/md_enums.h"
  62 #include "gromacs/simd/support.h"
  63 #include "gromacs/utility/arrayref.h"
  64 #include "gromacs/utility/basedefinitions.h"
  65 #include "gromacs/utility/basenetwork.h"
  66 #include "gromacs/utility/baseversion.h"
  67 #include "gromacs/utility/cstringutil.h"
  68 #include "gromacs/utility/exceptions.h"
  69 #include "gromacs/utility/fatalerror.h"
  70 #include "gromacs/utility/gmxassert.h"
  71 #include "gromacs/utility/logger.h"
  72 #include "gromacs/utility/programcontext.h"
  73 #include "gromacs/utility/smalloc.h"
  74 #include "gromacs/utility/stringutil.h"
  75 #include "gromacs/utility/sysinfo.h"
  76
  77 #ifdef HAVE_UNISTD_H
  78 #    include <unistd.h>       // sysconf()
  79 #endif
  80
  81 //! Convenience macro to help us avoid ifdefs each time we use sysconf
  82 #if !defined(_SC_NPROCESSORS_ONLN) && defined(_SC_NPROC_ONLN)
  83 #    define _SC_NPROCESSORS_ONLN _SC_NPROC_ONLN
  84 #endif
  85
  86 //! Convenience macro to help us avoid ifdefs each time we use sysconf
  87 #if !defined(_SC_NPROCESSORS_CONF) && defined(_SC_NPROC_CONF)
  88 #    define _SC_NPROCESSORS_CONF _SC_NPROC_CONF
  89 #endif
  90
  91 #if defined (__i386__) || defined (__x86_64__) || defined (_M_IX86) || defined (_M_X64)
  92 //! Constant used to help minimize preprocessed code
  93 static const bool isX86 = true;
  94 #else
  95 //! Constant used to help minimize preprocessed code
  96 static const bool isX86 = false;
  97 #endif
  98
  99 #if defined __powerpc__ || defined __ppc__ || defined __PPC__
 100 static const bool isPowerPC = true;
 101 #else
 102 static const bool isPowerPC = false;
 103 #endif
 104
 105 //! Constant used to help minimize preprocessed code
 106 static const bool bGPUBinary     = GMX_GPU != GMX_GPU_NONE;
 107
 108 /* Note that some of the following arrays must match the "GPU support
 109  * enumeration" in src/config.h.cmakein, so that GMX_GPU looks up an
 110  * array entry. */
 111
 112 // TODO If/when we unify CUDA and OpenCL support code, this should
 113 // move to a single place in gpu_utils.
 114 /* Names of the GPU detection/check results (see e_gpu_detect_res_t in hw_info.h). */
 115 const char * const gpu_detect_res_str[egpuNR] =
 116 {
 117     "compatible", "inexistent", "incompatible", "insane"
 118 };
 119
 120 /* The globally shared hwinfo structure. */
 121 static gmx_hw_info_t      *hwinfo_g;
 122 /* A reference counter for the hwinfo structure */
 123 static int                 n_hwinfo = 0;
 124 /* A lock to protect the hwinfo structure */
 125 static tMPI_Thread_mutex_t hw_info_lock = TMPI_THREAD_MUTEX_INITIALIZER;
 126
 127 #define HOSTNAMELEN 80
 128
 129 /* FW decl. */
 130 static size_t gmx_count_gpu_dev_unique(const gmx_gpu_info_t &gpu_info,
 131                                        const gmx_gpu_opt_t  *gpu_opt);
 132
 133 /* FW decl. */
 134 static int gmx_count_gpu_dev_shared(const gmx_gpu_opt_t *gpu_opt,
 135                                     bool                 userSetGpuIds);
 136
 137 /*! \internal \brief
 138  * Returns the GPU information text, one GPU per line.
 139  */
 140 static std::string sprint_gpus(const gmx_gpu_info_t &gpu_info)
 141 {
 142     char                     stmp[STRLEN];
 143     std::vector<std::string> gpuStrings;
 144     for (int i = 0; i < gpu_info.n_dev; i++)
 145     {
 146         get_gpu_device_info_string(stmp, gpu_info, i);
 147         gpuStrings.push_back(gmx::formatString("    %s", stmp));
 148     }
 149     return gmx::joinStrings(gpuStrings, "\n");
 150 }
 151
 152 // TODO This function should not live in detecthardware.cpp
 153 std::string
 154 makeGpuUsageReport(const gmx_gpu_info_t &gpu_info,
 155                    const gmx_gpu_opt_t  *gpu_opt,
 156                    bool                  userSetGpuIds,
 157                    size_t                numPpRanks,
 158                    bool                  bPrintHostName)
 159 {
 160     int  ngpu_use  = gpu_opt->n_dev_use;
 161     int  ngpu_comp = gpu_info.n_dev_compatible;
 162     char host[HOSTNAMELEN];
 163
 164     if (bPrintHostName)
 165     {
 166         gmx_gethostname(host, HOSTNAMELEN);
 167     }
 168
 169     /* Issue a note if GPUs are available but not used */
 170     if (ngpu_comp > 0 && ngpu_use < 1)
 171     {
 172         return gmx::formatString("%d compatible GPU%s detected in the system, but none will be used.\n"
 173                                  "Consider trying GPU acceleration with the Verlet scheme!\n",
 174                                  ngpu_comp, (ngpu_comp > 1) ? "s" : "");
 175     }
 176
 177     std::string output;
 178     {
 179         std::vector<int> gpuIdsInUse;
 180         for (int i = 0; i < ngpu_use; i++)
 181         {
 182             gpuIdsInUse.push_back(get_gpu_device_id(gpu_info, gpu_opt, i));
 183         }
 184         std::string gpuIdsString =
 185             formatAndJoin(gpuIdsInUse, ",", gmx::StringFormatter("%d"));
 186         size_t      numGpusInUse = gmx_count_gpu_dev_unique(gpu_info, gpu_opt);
 187         bool        bPluralGpus  = numGpusInUse > 1;
 188
 189         if (bPrintHostName)
 190         {
 191             output += gmx::formatString("On host %s ", host);
 192         }
 193         output += gmx::formatString("%zu GPU%s %sselected for this run.\n"
 194                                     "Mapping of GPU ID%s to the %d PP rank%s in this node: %s\n",
 195                                     numGpusInUse, bPluralGpus ? "s" : "",
 196                                     userSetGpuIds ? "user-" : "auto-",
 197                                     bPluralGpus ? "s" : "",
 198                                     numPpRanks,
 199                                     (numPpRanks > 1) ? "s" : "",
 200                                     gpuIdsString.c_str());
 201     }
 202
 203     int same_count = gmx_count_gpu_dev_shared(gpu_opt, userSetGpuIds);
 204
 205     if (same_count > 0)
 206     {
 207         output += gmx::formatString("NOTE: You assigned %s to multiple ranks.\n",
 208                                     same_count > 1 ? "GPU IDs" : "a GPU ID");
 209     }
 210
 211     if (static_cast<size_t>(ngpu_comp) > numPpRanks)
 212     {
 213         /* TODO In principle, this warning could be warranted only on
 214          * ranks on some nodes, but we lack the infrastructure to do a
 215          * good job of reporting that. */
 216         output += gmx::formatString("NOTE: potentially sub-optimal launch configuration using fewer\n"
 217                                     "      PP ranks on a node than GPUs available on that node.\n");
 218     }
 219
 220     return output;
 221 }
 222
 223 /* Give a suitable fatal error or warning if the build configuration
 224    and runtime CPU do not match. */
 225 static void
 226 check_use_of_rdtscp_on_this_cpu(const gmx::MDLogger   &mdlog,
 227                                 const gmx::CpuInfo    &cpuInfo)
 228 {
 229 #ifdef HAVE_RDTSCP
 230     bool binaryUsesRdtscp = TRUE;
 231 #else
 232     bool binaryUsesRdtscp = FALSE;
 233 #endif
 234
 235     const char *programName = gmx::getProgramContext().displayName();
 236
 237     if (cpuInfo.supportLevel() < gmx::CpuInfo::SupportLevel::Features)
 238     {
 239         if (binaryUsesRdtscp)
 240         {
 241             GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
 242                     "The %s executable was compiled to use the rdtscp CPU instruction. "
 243                     "We cannot detect the features of your current CPU, but will proceed anyway. "
 244                     "If you get a crash, rebuild GROMACS with the GMX_USE_RDTSCP=OFF CMake option.",
 245                     programName);
 246         }
 247     }
 248     else
 249     {
 250         bool cpuHasRdtscp = cpuInfo.feature(gmx::CpuInfo::Feature::X86_Rdtscp);
 251
 252         if (!cpuHasRdtscp && binaryUsesRdtscp)
 253         {
 254             gmx_fatal(FARGS, "The %s executable was compiled to use the rdtscp CPU instruction. "
 255                       "However, this is not supported by the current hardware and continuing would lead to a crash. "
 256                       "Please rebuild GROMACS with the GMX_USE_RDTSCP=OFF CMake option.",
 257                       programName);
 258         }
 259
 260         if (cpuHasRdtscp && !binaryUsesRdtscp)
 261         {
 262             GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
 263                     "The current CPU can measure timings more accurately than the code in\n"
 264                     "%s was configured to use. This might affect your simulation\n"
 265                     "speed as accurate timings are needed for load-balancing.\n"
 266                     "Please consider rebuilding %s with the GMX_USE_RDTSCP=ON CMake option.",
 267                     programName, programName);
 268         }
 269     }
 270 }
 271
 272 void gmx_check_hw_runconf_consistency(const gmx::MDLogger &mdlog,
 273                                       const gmx_hw_info_t *hwinfo,
 274                                       const t_commrec     *cr,
 275                                       const gmx_hw_opt_t  *hw_opt,
 276                                       bool                 userSetGpuIds,
 277                                       bool                 willUsePhysicalGpu)
 278 {
 279     int      npppn;
 280     char     th_or_proc[STRLEN], th_or_proc_plural[STRLEN], pernode[STRLEN];
 281     gmx_bool btMPI, bMPI, bNthreadsAuto;
 282
 283     GMX_RELEASE_ASSERT(hwinfo, "hwinfo must be a non-NULL pointer");
 284     GMX_RELEASE_ASSERT(cr, "cr must be a non-NULL pointer");
 285
 286     /* Below we only do consistency checks for PP and GPUs,
 287      * this is irrelevant for PME only nodes, so in that case we return
 288      * here.
 289      */
 290     if (!(cr->duty & DUTY_PP))
 291     {
 292         return;
 293     }
 294
 295 #if GMX_THREAD_MPI
 296     bMPI          = FALSE;
 297     btMPI         = TRUE;
 298     bNthreadsAuto = (hw_opt->nthreads_tmpi < 1);
 299 #elif GMX_LIB_MPI
 300     bMPI          = TRUE;
 301     btMPI         = FALSE;
 302     bNthreadsAuto = FALSE;
 303 #else
 304     bMPI          = FALSE;
 305     btMPI         = FALSE;
 306     bNthreadsAuto = FALSE;
 307 #endif
 308
 309     /* Need to ensure that we have enough GPUs:
 310      * - need one GPU per PP node
 311      * - no GPU oversubscription with tMPI
 312      * */
 313     /* number of PP processes per node */
 314     npppn = cr->nrank_pp_intranode;
 315
 316     pernode[0]           = '\0';
 317     th_or_proc_plural[0] = '\0';
 318     if (btMPI)
 319     {
 320         sprintf(th_or_proc, "thread-MPI thread");
 321         if (npppn > 1)
 322         {
 323             sprintf(th_or_proc_plural, "s");
 324         }
 325     }
 326     else if (bMPI)
 327     {
 328         sprintf(th_or_proc, "MPI process");
 329         if (npppn > 1)
 330         {
 331             sprintf(th_or_proc_plural, "es");
 332         }
 333         sprintf(pernode, " per node");
 334     }
 335     else
 336     {
 337         /* neither MPI nor tMPI */
 338         sprintf(th_or_proc, "process");
 339     }
 340
 341     if (willUsePhysicalGpu)
 342     {
 343         int  ngpu_comp, ngpu_use;
 344         char gpu_comp_plural[2], gpu_use_plural[2];
 345
 346         ngpu_comp = hwinfo->gpu_info.n_dev_compatible;
 347         ngpu_use  = hw_opt->gpu_opt.n_dev_use;
 348
 349         sprintf(gpu_comp_plural, "%s", (ngpu_comp > 1) ? "s" : "");
 350         sprintf(gpu_use_plural,  "%s", (ngpu_use > 1) ? "s" : "");
 351
 352         const char *programName = gmx::getProgramContext().displayName();
 353
 354         /* number of tMPI threads auto-adjusted */
 355         if (btMPI && bNthreadsAuto)
 356         {
 357             if (userSetGpuIds && npppn < ngpu_use)
 358             {
 359                 /* The user manually provided more GPUs than threads we
 360                    could automatically start. */
 361                 gmx_fatal(FARGS,
 362                           "%d GPU%s provided, but only %d PP thread-MPI thread%s coud be started.\n"
 363                           "%s requires one PP tread-MPI thread per GPU; use fewer GPUs.",
 364                           ngpu_use, gpu_use_plural,
 365                           npppn, th_or_proc_plural,
 366                           programName);
 367             }
 368
 369             if (!userSetGpuIds && npppn < ngpu_comp)
 370             {
 371                 /* There are more GPUs than tMPI threads; we have
 372                    limited the number GPUs used. */
 373                 GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
 374                         "NOTE: %d GPU%s were detected, but only %d PP thread-MPI thread%s can be started.\n"
 375                         "      %s can use one GPU per PP tread-MPI thread, so only %d GPU%s will be used.",
 376                         ngpu_comp, gpu_comp_plural,
 377                         npppn, th_or_proc_plural,
 378                         programName, npppn,
 379                         npppn > 1 ? "s" : "");
 380             }
 381         }
 382
 383         if (userSetGpuIds)
 384         {
 385             if (ngpu_use != npppn)
 386             {
 387                 gmx_fatal(FARGS,
 388                           "Incorrect launch configuration: mismatching number of PP %s%s and GPUs%s.\n"
 389                           "%s was started with %d PP %s%s%s, but you provided %d GPU%s.",
 390                           th_or_proc, btMPI ? "s" : "es", pernode,
 391                           programName, npppn, th_or_proc,
 392                           th_or_proc_plural, pernode,
 393                           ngpu_use, gpu_use_plural);
 394             }
 395         }
 396         else
 397         {
 398             if (ngpu_use != npppn)
 399             {
 400                 /* Avoid duplicate error messages.
 401                  * Unfortunately we can only do this at the physical node
 402                  * level, since the hardware setup and MPI process count
 403                  * might differ between physical nodes.
 404                  */
 405                 if (cr->rank_pp_intranode == 0)
 406                 {
 407                     gmx_fatal(FARGS,
 408                               "Incorrect launch configuration: mismatching number of PP %s%s and GPUs%s.\n"
 409                               "%s was started with %d PP %s%s%s, but only %d GPU%s was detected.",
 410                               th_or_proc, btMPI ? "s" : "es", pernode,
 411                               programName, npppn, th_or_proc,
 412                               th_or_proc_plural, pernode,
 413                               ngpu_use, gpu_use_plural);
 414                 }
 415             }
 416         }
 417     }
 418 }
 419
 420 /*! \brief Return the number of PP rank pairs that share a GPU device between them.
 421  *
 422  * Sharing GPUs among multiple PP ranks is possible via either user or
 423  * automated selection. */
 424 static int gmx_count_gpu_dev_shared(const gmx_gpu_opt_t *gpu_opt, bool userSetGpuIds)
 425 {
 426     int      same_count    = 0;
 427     int      ngpu          = gpu_opt->n_dev_use;
 428
 429     if (userSetGpuIds)
 430     {
 431         int      i, j;
 432
 433         for (i = 0; i < ngpu - 1; i++)
 434         {
 435             for (j = i + 1; j < ngpu; j++)
 436             {
 437                 same_count      += (gpu_opt->dev_use[i] ==
 438                                     gpu_opt->dev_use[j]);
 439             }
 440         }
 441     }
 442
 443     return same_count;
 444 }
 445
 446 /* Count and return the number of unique GPUs (per node) selected.
 447  *
 448  * As sharing GPUs among multiple PP ranks is possible when the user passes
 449  * GPU IDs, the number of GPUs user (per node) can be different from the
 450  * number of GPU IDs selected.
 451  */
 452 static size_t gmx_count_gpu_dev_unique(const gmx_gpu_info_t &gpu_info,
 453                                        const gmx_gpu_opt_t  *gpu_opt)
 454 {
 455     GMX_RELEASE_ASSERT(gpu_opt, "gpu_opt must be a non-NULL pointer");
 456
 457     std::set<int> uniqIds;
 458     for (int i = 0; i < gpu_opt->n_dev_use; i++)
 459     {
 460         int deviceId = get_gpu_device_id(gpu_info, gpu_opt, i);
 461         uniqIds.insert(deviceId);
 462     }
 463     return uniqIds.size();
 464 }
 465
 466 static void gmx_detect_gpus(const gmx::MDLogger &mdlog, const t_commrec *cr)
 467 {
 468 #if GMX_LIB_MPI
 469     int              rank_world;
 470     MPI_Comm         physicalnode_comm;
 471 #endif
 472     int              rank_local;
 473
 474     /* Under certain circumstances MPI ranks on the same physical node
 475      * can not simultaneously access the same GPU(s). Therefore we run
 476      * the detection only on one MPI rank per node and broadcast the info.
 477      * Note that with thread-MPI only a single thread runs this code.
 478      *
 479      * NOTE: We can't broadcast gpu_info with OpenCL as the device and platform
 480      * ID stored in the structure are unique for each rank (even if a device
 481      * is shared by multiple ranks).
 482      *
 483      * TODO: We should also do CPU hardware detection only once on each
 484      * physical node and broadcast it, instead of do it on every MPI rank.
 485      */
 486 #if GMX_LIB_MPI
 487     /* A split of MPI_COMM_WORLD over physical nodes is only required here,
 488      * so we create and destroy it locally.
 489      */
 490     MPI_Comm_rank(MPI_COMM_WORLD, &rank_world);
 491     MPI_Comm_split(MPI_COMM_WORLD, gmx_physicalnode_id_hash(),
 492                    rank_world, &physicalnode_comm);
 493     MPI_Comm_rank(physicalnode_comm, &rank_local);
 494     GMX_UNUSED_VALUE(cr);
 495 #else
 496     /* Here there should be only one process, check this */
 497     GMX_RELEASE_ASSERT(cr->nnodes == 1 && cr->sim_nodeid == 0, "Only a single (master) process should execute here");
 498
 499     rank_local = 0;
 500 #endif
 501
 502     /*  With CUDA detect only on one rank per host, with OpenCL need do
 503      *  the detection on all PP ranks */
 504     bool isOpenclPpRank = ((GMX_GPU == GMX_GPU_OPENCL) && (cr->duty & DUTY_PP));
 505
 506     if (rank_local == 0 || isOpenclPpRank)
 507     {
 508         char detection_error[STRLEN] = "", sbuf[STRLEN];
 509
 510         if (detect_gpus(&hwinfo_g->gpu_info, detection_error) != 0)
 511         {
 512             if (detection_error[0] != '\0')
 513             {
 514                 sprintf(sbuf, ":\n      %s\n", detection_error);
 515             }
 516             else
 517             {
 518                 sprintf(sbuf, ".");
 519             }
 520             GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
 521                     "NOTE: Error occurred during GPU detection%s"
 522                     "      Can not use GPU acceleration, will fall back to CPU kernels.",
 523                     sbuf);
 524         }
 525     }
 526
 527 #if GMX_LIB_MPI
 528     if (!isOpenclPpRank)
 529     {
 530         /* Broadcast the GPU info to the other ranks within this node */
 531         MPI_Bcast(&hwinfo_g->gpu_info.n_dev, 1, MPI_INT, 0, physicalnode_comm);
 532
 533         if (hwinfo_g->gpu_info.n_dev > 0)
 534         {
 535             int dev_size;
 536
 537             dev_size = hwinfo_g->gpu_info.n_dev*sizeof_gpu_dev_info();
 538
 539             if (rank_local > 0)
 540             {
 541                 hwinfo_g->gpu_info.gpu_dev =
 542                     (struct gmx_device_info_t *)malloc(dev_size);
 543             }
 544             MPI_Bcast(hwinfo_g->gpu_info.gpu_dev, dev_size, MPI_BYTE,
 545                       0, physicalnode_comm);
 546             MPI_Bcast(&hwinfo_g->gpu_info.n_dev_compatible, 1, MPI_INT,
 547                       0, physicalnode_comm);
 548         }
 549     }
 550
 551     MPI_Comm_free(&physicalnode_comm);
 552 #endif
 553 }
 554
 555 static void gmx_collect_hardware_mpi(const gmx::CpuInfo &cpuInfo)
 556 {
 557     const int ncore = hwinfo_g->hardwareTopology->numberOfCores();
 558 #if GMX_LIB_MPI
 559     int       rank_id;
 560     int       nrank, rank, nhwthread, ngpu, i;
 561     int       gpu_hash;
 562     int      *buf, *all;
 563
 564     rank_id   = gmx_physicalnode_id_hash();
 565     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 566     MPI_Comm_size(MPI_COMM_WORLD, &nrank);
 567     nhwthread = hwinfo_g->nthreads_hw_avail;
 568     ngpu      = hwinfo_g->gpu_info.n_dev_compatible;
 569     /* Create a unique hash of the GPU type(s) in this node */
 570     gpu_hash  = 0;
 571     /* Here it might be better to only loop over the compatible GPU, but we
 572      * don't have that information available and it would also require
 573      * removing the device ID from the device info string.
 574      */
 575     for (i = 0; i < hwinfo_g->gpu_info.n_dev; i++)
 576     {
 577         char stmp[STRLEN];
 578
 579         /* Since the device ID is incorporated in the hash, the order of
 580          * the GPUs affects the hash. Also two identical GPUs won't give
 581          * a gpu_hash of zero after XORing.
 582          */
 583         get_gpu_device_info_string(stmp, hwinfo_g->gpu_info, i);
 584         gpu_hash ^= gmx_string_fullhash_func(stmp, gmx_string_hash_init);
 585     }
 586
 587     snew(buf, nrank);
 588     snew(all, nrank);
 589     buf[rank] = rank_id;
 590
 591     MPI_Allreduce(buf, all, nrank, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
 592
 593     gmx_bool bFound;
 594     int      nnode0, ncore0, nhwthread0, ngpu0, r;
 595
 596     bFound     = FALSE;
 597     ncore0     = 0;
 598     nnode0     = 0;
 599     nhwthread0 = 0;
 600     ngpu0      = 0;
 601     for (r = 0; r < nrank; r++)
 602     {
 603         if (all[r] == rank_id)
 604         {
 605             if (!bFound && r == rank)
 606             {
 607                 /* We are the first rank in this physical node */
 608                 nnode0     = 1;
 609                 ncore0     = ncore;
 610                 nhwthread0 = nhwthread;
 611                 ngpu0      = ngpu;
 612             }
 613             bFound = TRUE;
 614         }
 615     }
 616
 617     sfree(buf);
 618     sfree(all);
 619
 620     int sum[4], maxmin[10];
 621
 622     {
 623         int buf[4];
 624
 625         /* Sum values from only intra-rank 0 so we get the sum over all nodes */
 626         buf[0] = nnode0;
 627         buf[1] = ncore0;
 628         buf[2] = nhwthread0;
 629         buf[3] = ngpu0;
 630
 631         MPI_Allreduce(buf, sum, 4, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
 632     }
 633
 634     {
 635         int buf[10];
 636
 637         /* Store + and - values for all ranks,
 638          * so we can get max+min with one MPI call.
 639          */
 640         buf[0] = ncore;
 641         buf[1] = nhwthread;
 642         buf[2] = ngpu;
 643         buf[3] = static_cast<int>(gmx::simdSuggested(cpuInfo));
 644         buf[4] = gpu_hash;
 645         buf[5] = -buf[0];
 646         buf[6] = -buf[1];
 647         buf[7] = -buf[2];
 648         buf[8] = -buf[3];
 649         buf[9] = -buf[4];
 650
 651         MPI_Allreduce(buf, maxmin, 10, MPI_INT, MPI_MAX, MPI_COMM_WORLD);
 652     }
 653
 654     hwinfo_g->nphysicalnode       = sum[0];
 655     hwinfo_g->ncore_tot           = sum[1];
 656     hwinfo_g->ncore_min           = -maxmin[5];
 657     hwinfo_g->ncore_max           = maxmin[0];
 658     hwinfo_g->nhwthread_tot       = sum[2];
 659     hwinfo_g->nhwthread_min       = -maxmin[6];
 660     hwinfo_g->nhwthread_max       = maxmin[1];
 661     hwinfo_g->ngpu_compatible_tot = sum[3];
 662     hwinfo_g->ngpu_compatible_min = -maxmin[7];
 663     hwinfo_g->ngpu_compatible_max = maxmin[2];
 664     hwinfo_g->simd_suggest_min    = -maxmin[8];
 665     hwinfo_g->simd_suggest_max    = maxmin[3];
 666     hwinfo_g->bIdenticalGPUs      = (maxmin[4] == -maxmin[9]);
 667 #else
 668     /* All ranks use the same pointer, protected by a mutex in the caller */
 669     hwinfo_g->nphysicalnode       = 1;
 670     hwinfo_g->ncore_tot           = ncore;
 671     hwinfo_g->ncore_min           = ncore;
 672     hwinfo_g->ncore_max           = ncore;
 673     hwinfo_g->nhwthread_tot       = hwinfo_g->nthreads_hw_avail;
 674     hwinfo_g->nhwthread_min       = hwinfo_g->nthreads_hw_avail;
 675     hwinfo_g->nhwthread_max       = hwinfo_g->nthreads_hw_avail;
 676     hwinfo_g->ngpu_compatible_tot = hwinfo_g->gpu_info.n_dev_compatible;
 677     hwinfo_g->ngpu_compatible_min = hwinfo_g->gpu_info.n_dev_compatible;
 678     hwinfo_g->ngpu_compatible_max = hwinfo_g->gpu_info.n_dev_compatible;
 679     hwinfo_g->simd_suggest_min    = static_cast<int>(simdSuggested(cpuInfo));
 680     hwinfo_g->simd_suggest_max    = static_cast<int>(simdSuggested(cpuInfo));
 681     hwinfo_g->bIdenticalGPUs      = TRUE;
 682 #endif
 683 }
 684
 685 /*! \brief Utility that does dummy computing for max 2 seconds to spin up cores
 686  *
 687  *  This routine will check the number of cores configured and online
 688  *  (using sysconf), and the spins doing dummy compute operations for up to
 689  *  2 seconds, or until all cores have come online. This can be used prior to
 690  *  hardware detection for platforms that take unused processors offline.
 691  *
 692  *  This routine will not throw exceptions.
 693  */
 694 static void
 695 spinUpCore() noexcept
 696 {
 697 #if defined(HAVE_SYSCONF) && defined(_SC_NPROCESSORS_CONF) && defined(_SC_NPROCESSORS_ONLN)
 698     float dummy           = 0.1;
 699     int   countConfigured = sysconf(_SC_NPROCESSORS_CONF);    // noexcept
 700     auto  start           = std::chrono::steady_clock::now(); // noexcept
 701
 702     while (sysconf(_SC_NPROCESSORS_ONLN) < countConfigured &&
 703            std::chrono::steady_clock::now() - start < std::chrono::seconds(2))
 704     {
 705         for (int i = 1; i < 10000; i++)
 706         {
 707             dummy /= i;
 708         }
 709     }
 710
 711     if (dummy < 0)
 712     {
 713         printf("This cannot happen, but prevents loop from being optimized away.");
 714     }
 715 #endif
 716 }
 717
 718 /*! \brief Prepare the system before hardware topology detection
 719  *
 720  * This routine should perform any actions we want to put the system in a state
 721  * where we want it to be before detecting the hardware topology. For most
 722  * processors there is nothing to do, but some architectures (in particular ARM)
 723  * have support for taking configured cores offline, which will make them disappear
 724  * from the online processor count.
 725  *
 726  * This routine checks if there is a mismatch between the number of cores
 727  * configured and online, and in that case we issue a small workload that
 728  * attempts to wake sleeping cores before doing the actual detection.
 729  *
 730  * This type of mismatch can also occur for x86 or PowerPC on Linux, if SMT has only
 731  * been disabled in the kernel (rather than bios). Since those cores will never
 732  * come online automatically, we currently skip this test for x86 & PowerPC to
 733  * avoid wasting 2 seconds. We also skip the test if there is no thread support.
 734  *
 735  * \note Cores will sleep relatively quickly again, so it's important to issue
 736  *       the real detection code directly after this routine.
 737  */
 738 static void
 739 hardwareTopologyPrepareDetection()
 740 {
 741 #if defined(HAVE_SYSCONF) && defined(_SC_NPROCESSORS_CONF) && \
 742     (defined(THREAD_PTHREADS) || defined(THREAD_WINDOWS))
 743
 744     // Modify this conditional when/if x86 or PowerPC starts to sleep some cores
 745     if (!isX86 && !isPowerPC)
 746     {
 747         int                      countConfigured  = sysconf(_SC_NPROCESSORS_CONF);
 748         std::vector<std::thread> workThreads(countConfigured);
 749
 750         for (auto &t : workThreads)
 751         {
 752             t = std::thread(spinUpCore);
 753         }
 754
 755         for (auto &t : workThreads)
 756         {
 757             t.join();
 758         }
 759     }
 760 #endif
 761 }
 762
 763 /*! \brief Sanity check hardware topology and print some notes to log
 764  *
 765  *  \param mdlog            Logger.
 766  *  \param hardwareTopology Reference to hardwareTopology object.
 767  */
 768 static void
 769 hardwareTopologyDoubleCheckDetection(const gmx::MDLogger gmx_unused         &mdlog,
 770                                      const gmx::HardwareTopology gmx_unused &hardwareTopology)
 771 {
 772 #if defined HAVE_SYSCONF && defined(_SC_NPROCESSORS_CONF)
 773     if (hardwareTopology.supportLevel() < gmx::HardwareTopology::SupportLevel::LogicalProcessorCount)
 774     {
 775         return;
 776     }
 777
 778     int countFromDetection = hardwareTopology.machine().logicalProcessorCount;
 779     int countConfigured    = sysconf(_SC_NPROCESSORS_CONF);
 780
 781     /* BIOS, kernel or user actions can take physical processors
 782      * offline. We already cater for the some of the cases inside the hardwareToplogy
 783      * by trying to spin up cores just before we detect, but there could be other
 784      * cases where it is worthwhile to hint that there might be more resources available.
 785      */
 786     if (countConfigured >= 0 && countConfigured != countFromDetection)
 787     {
 788         GMX_LOG(mdlog.info).
 789             appendTextFormatted("Note: %d CPUs configured, but only %d were detected to be online.\n", countConfigured, countFromDetection);
 790
 791         if (isX86 && countConfigured == 2*countFromDetection)
 792         {
 793             GMX_LOG(mdlog.info).
 794                 appendText("      X86 Hyperthreading is likely disabled; enable it for better performance.");
 795         }
 796         // For PowerPC (likely Power8) it is possible to set SMT to either 2,4, or 8-way hardware threads.
 797         // We only warn if it is completely disabled since default performance drops with SMT8.
 798         if (isPowerPC && countConfigured == 8*countFromDetection)
 799         {
 800             GMX_LOG(mdlog.info).
 801                 appendText("      PowerPC SMT is likely disabled; enable SMT2/SMT4 for better performance.");
 802         }
 803     }
 804 #endif
 805 }
 806
 807
 808 gmx_hw_info_t *gmx_detect_hardware(const gmx::MDLogger &mdlog, const t_commrec *cr,
 809                                    gmx_bool bDetectGPUs)
 810 {
 811     int ret;
 812
 813     /* make sure no one else is doing the same thing */
 814     ret = tMPI_Thread_mutex_lock(&hw_info_lock);
 815     if (ret != 0)
 816     {
 817         gmx_fatal(FARGS, "Error locking hwinfo mutex: %s", strerror(errno));
 818     }
 819
 820     /* only initialize the hwinfo structure if it is not already initalized */
 821     if (n_hwinfo == 0)
 822     {
 823         snew(hwinfo_g, 1);
 824
 825         hwinfo_g->cpuInfo             = new gmx::CpuInfo(gmx::CpuInfo::detect());
 826
 827         hardwareTopologyPrepareDetection();
 828         hwinfo_g->hardwareTopology    = new gmx::HardwareTopology(gmx::HardwareTopology::detect());
 829
 830         // If we detected the topology on this system, double-check that it makes sense
 831         if (hwinfo_g->hardwareTopology->isThisSystem())
 832         {
 833             hardwareTopologyDoubleCheckDetection(mdlog, *(hwinfo_g->hardwareTopology));
 834         }
 835
 836         // TODO: Get rid of this altogether.
 837         hwinfo_g->nthreads_hw_avail = hwinfo_g->hardwareTopology->machine().logicalProcessorCount;
 838
 839         /* detect GPUs */
 840         hwinfo_g->gpu_info.n_dev            = 0;
 841         hwinfo_g->gpu_info.n_dev_compatible = 0;
 842         hwinfo_g->gpu_info.gpu_dev          = nullptr;
 843
 844         /* Run the detection if the binary was compiled with GPU support
 845          * and we requested detection.
 846          */
 847         hwinfo_g->gpu_info.bDetectGPUs =
 848             (bGPUBinary && bDetectGPUs &&
 849              getenv("GMX_DISABLE_GPU_DETECTION") == nullptr);
 850         if (hwinfo_g->gpu_info.bDetectGPUs)
 851         {
 852             gmx_detect_gpus(mdlog, cr);
 853         }
 854
 855         gmx_collect_hardware_mpi(*hwinfo_g->cpuInfo);
 856     }
 857     /* increase the reference counter */
 858     n_hwinfo++;
 859
 860     ret = tMPI_Thread_mutex_unlock(&hw_info_lock);
 861     if (ret != 0)
 862     {
 863         gmx_fatal(FARGS, "Error unlocking hwinfo mutex: %s", strerror(errno));
 864     }
 865
 866     return hwinfo_g;
 867 }
 868
 869 static std::string detected_hardware_string(const gmx_hw_info_t *hwinfo,
 870                                             bool                 bFullCpuInfo)
 871 {
 872     std::string                  s;
 873
 874     const gmx::CpuInfo          &cpuInfo = *hwinfo_g->cpuInfo;
 875     const gmx::HardwareTopology &hwTop   = *hwinfo->hardwareTopology;
 876
 877     s  = gmx::formatString("\n");
 878     s += gmx::formatString("Running on %d node%s with total",
 879                            hwinfo->nphysicalnode,
 880                            hwinfo->nphysicalnode == 1 ? "" : "s");
 881     if (hwinfo->ncore_tot > 0)
 882     {
 883         s += gmx::formatString(" %d cores,", hwinfo->ncore_tot);
 884     }
 885     s += gmx::formatString(" %d logical cores", hwinfo->nhwthread_tot);
 886     if (hwinfo->gpu_info.bDetectGPUs)
 887     {
 888         s += gmx::formatString(", %d compatible GPU%s",
 889                                hwinfo->ngpu_compatible_tot,
 890                                hwinfo->ngpu_compatible_tot == 1 ? "" : "s");
 891     }
 892     else if (bGPUBinary)
 893     {
 894         s += gmx::formatString(" (GPU detection deactivated)");
 895     }
 896     s += gmx::formatString("\n");
 897
 898     if (hwinfo->nphysicalnode > 1)
 899     {
 900         /* Print per node hardware feature counts */
 901         if (hwinfo->ncore_max > 0)
 902         {
 903             s += gmx::formatString("  Cores per node:           %2d", hwinfo->ncore_min);
 904             if (hwinfo->ncore_max > hwinfo->ncore_min)
 905             {
 906                 s += gmx::formatString(" - %2d", hwinfo->ncore_max);
 907             }
 908             s += gmx::formatString("\n");
 909         }
 910         s += gmx::formatString("  Logical cores per node:   %2d", hwinfo->nhwthread_min);
 911         if (hwinfo->nhwthread_max > hwinfo->nhwthread_min)
 912         {
 913             s += gmx::formatString(" - %2d", hwinfo->nhwthread_max);
 914         }
 915         s += gmx::formatString("\n");
 916         if (bGPUBinary)
 917         {
 918             s += gmx::formatString("  Compatible GPUs per node: %2d",
 919                                    hwinfo->ngpu_compatible_min);
 920             if (hwinfo->ngpu_compatible_max > hwinfo->ngpu_compatible_min)
 921             {
 922                 s += gmx::formatString(" - %2d", hwinfo->ngpu_compatible_max);
 923             }
 924             s += gmx::formatString("\n");
 925             if (hwinfo->ngpu_compatible_tot > 0)
 926             {
 927                 if (hwinfo->bIdenticalGPUs)
 928                 {
 929                     s += gmx::formatString("  All nodes have identical type(s) of GPUs\n");
 930                 }
 931                 else
 932                 {
 933                     /* This message will also appear with identical GPU types
 934                      * when at least one node has no GPU.
 935                      */
 936                     s += gmx::formatString("  Different nodes have different type(s) and/or order of GPUs\n");
 937                 }
 938             }
 939         }
 940     }
 941
 942 #if GMX_LIB_MPI
 943     char host[HOSTNAMELEN];
 944     int  rank;
 945
 946     gmx_gethostname(host, HOSTNAMELEN);
 947     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 948
 949     s += gmx::formatString("Hardware detected on host %s (the node of MPI rank %d):\n",
 950                            host, rank);
 951 #else
 952     s += gmx::formatString("Hardware detected:\n");
 953 #endif
 954     s += gmx::formatString("  CPU info:\n");
 955
 956     s += gmx::formatString("    Vendor: %s\n", cpuInfo.vendorString().c_str());
 957
 958     s += gmx::formatString("    Brand:  %s\n", cpuInfo.brandString().c_str());
 959
 960     if (bFullCpuInfo)
 961     {
 962         s += gmx::formatString("    Family: %d   Model: %d   Stepping: %d\n",
 963                                cpuInfo.family(), cpuInfo.model(), cpuInfo.stepping());
 964
 965         s += gmx::formatString("    Features:");
 966         for (auto &f : cpuInfo.featureSet())
 967         {
 968             s += gmx::formatString(" %s", cpuInfo.featureString(f).c_str());;
 969         }
 970         s += gmx::formatString("\n");
 971     }
 972
 973     s += gmx::formatString("    SIMD instructions most likely to fit this hardware: %s",
 974                            gmx::simdString(static_cast<gmx::SimdType>(hwinfo->simd_suggest_min)).c_str());
 975
 976     if (hwinfo->simd_suggest_max > hwinfo->simd_suggest_min)
 977     {
 978         s += gmx::formatString(" - %s", gmx::simdString(static_cast<gmx::SimdType>(hwinfo->simd_suggest_max)).c_str());
 979     }
 980     s += gmx::formatString("\n");
 981
 982     s += gmx::formatString("    SIMD instructions selected at GROMACS compile time: %s\n",
 983                            gmx::simdString(gmx::simdCompiled()).c_str());
 984
 985     s += gmx::formatString("\n");
 986
 987     s += gmx::formatString("  Hardware topology: ");
 988     switch (hwTop.supportLevel())
 989     {
 990         case gmx::HardwareTopology::SupportLevel::None:
 991             s += gmx::formatString("None\n");
 992             break;
 993         case gmx::HardwareTopology::SupportLevel::LogicalProcessorCount:
 994             s += gmx::formatString("Only logical processor count\n");
 995             break;
 996         case gmx::HardwareTopology::SupportLevel::Basic:
 997             s += gmx::formatString("Basic\n");
 998             break;
 999         case gmx::HardwareTopology::SupportLevel::Full:
1000             s += gmx::formatString("Full\n");
1001             break;
1002         case gmx::HardwareTopology::SupportLevel::FullWithDevices:
1003             s += gmx::formatString("Full, with devices\n");
1004             break;
1005     }
1006
1007     if (!hwTop.isThisSystem())
1008     {
1009         s += gmx::formatString("  NOTE: Hardware topology cached or synthetic, not detected.\n");
1010         if (char *p = getenv("HWLOC_XMLFILE"))
1011         {
1012             s += gmx::formatString("        HWLOC_XMLFILE=%s\n", p);
1013         }
1014     }
1015
1016     if (bFullCpuInfo)
1017     {
1018         if (hwTop.supportLevel() >= gmx::HardwareTopology::SupportLevel::Basic)
1019         {
1020             s += gmx::formatString("    Sockets, cores, and logical processors:\n");
1021
1022             for (auto &socket : hwTop.machine().sockets)
1023             {
1024                 s += gmx::formatString("      Socket %2d:", socket.id);
1025                 for (auto &c : socket.cores)
1026                 {
1027                     s += gmx::formatString(" [");
1028                     for (auto &t : c.hwThreads)
1029                     {
1030                         s += gmx::formatString(" %3d", t.logicalProcessorId);
1031                     }
1032                     s += gmx::formatString("]");
1033                 }
1034                 s += gmx::formatString("\n");
1035             }
1036         }
1037         if (hwTop.supportLevel() >= gmx::HardwareTopology::SupportLevel::Full)
1038         {
1039             s += gmx::formatString("    Numa nodes:\n");
1040             for (auto &n : hwTop.machine().numa.nodes)
1041             {
1042                 s += gmx::formatString("      Node %2d (%" GMX_PRIu64 " bytes mem):", n.id, n.memory);
1043                 for (auto &l : n.logicalProcessorId)
1044                 {
1045                     s += gmx::formatString(" %3d", l);
1046                 }
1047                 s += gmx::formatString("\n");
1048             }
1049             s += gmx::formatString("      Latency:\n          ");
1050             for (std::size_t j = 0; j < hwTop.machine().numa.nodes.size(); j++)
1051             {
1052                 s += gmx::formatString(" %5d", j);
1053             }
1054             s += gmx::formatString("\n");
1055             for (std::size_t i = 0; i < hwTop.machine().numa.nodes.size(); i++)
1056             {
1057                 s += gmx::formatString("     %5d", i);
1058                 for (std::size_t j = 0; j < hwTop.machine().numa.nodes.size(); j++)
1059                 {
1060                     s += gmx::formatString(" %5.2f", hwTop.machine().numa.relativeLatency[i][j]);
1061                 }
1062                 s += gmx::formatString("\n");
1063             }
1064
1065
1066             s += gmx::formatString("    Caches:\n");
1067             for (auto &c : hwTop.machine().caches)
1068             {
1069                 s += gmx::formatString("      L%d: %" GMX_PRIu64 " bytes, linesize %d bytes, assoc. %d, shared %d ways\n",
1070                                        c.level, c.size, c.linesize, c.associativity, c.shared);
1071             }
1072         }
1073         if (hwTop.supportLevel() >= gmx::HardwareTopology::SupportLevel::FullWithDevices)
1074         {
1075             s += gmx::formatString("    PCI devices:\n");
1076             for (auto &d : hwTop.machine().devices)
1077             {
1078                 s += gmx::formatString("      %04x:%02x:%02x.%1x  Id: %04x:%04x  Class: 0x%04x  Numa: %d\n",
1079                                        d.domain, d.bus, d.dev, d.func, d.vendorId, d.deviceId, d.classId, d.numaNodeId);
1080             }
1081         }
1082     }
1083
1084     if (bGPUBinary && (hwinfo->ngpu_compatible_tot > 0 ||
1085                        hwinfo->gpu_info.n_dev > 0))
1086     {
1087         s += gmx::formatString("  GPU info:\n");
1088         s += gmx::formatString("    Number of GPUs detected: %d\n",
1089                                hwinfo->gpu_info.n_dev);
1090         if (hwinfo->gpu_info.n_dev > 0)
1091         {
1092             s += sprint_gpus(hwinfo->gpu_info) + "\n";
1093         }
1094     }
1095     return s;
1096 }
1097
1098 void gmx_print_detected_hardware(FILE *fplog, const t_commrec *cr,
1099                                  const gmx::MDLogger &mdlog,
1100                                  const gmx_hw_info_t *hwinfo)
1101 {
1102     const gmx::CpuInfo &cpuInfo = *hwinfo_g->cpuInfo;
1103
1104     if (fplog != nullptr)
1105     {
1106         std::string detected;
1107
1108         detected = detected_hardware_string(hwinfo, TRUE);
1109
1110         fprintf(fplog, "%s\n", detected.c_str());
1111     }
1112
1113     if (MULTIMASTER(cr))
1114     {
1115         std::string detected;
1116
1117         detected = detected_hardware_string(hwinfo, FALSE);
1118
1119         fprintf(stderr, "%s\n", detected.c_str());
1120     }
1121
1122     /* Check the compiled SIMD instruction set against that of the node
1123      * with the lowest SIMD level support (skip if SIMD detection did not work)
1124      */
1125     if (cpuInfo.supportLevel() >= gmx::CpuInfo::SupportLevel::Features)
1126     {
1127         gmx::simdCheck(static_cast<gmx::SimdType>(hwinfo->simd_suggest_min), fplog, MULTIMASTER(cr));
1128     }
1129
1130     /* For RDTSCP we only check on our local node and skip the MPI reduction */
1131     check_use_of_rdtscp_on_this_cpu(mdlog, cpuInfo);
1132 }
1133
1134 bool hasUserSetGpuIds(const gmx_gpu_opt_t *gpu_opt)
1135 {
1136     return gpu_opt->gpu_id != nullptr;
1137 }
1138
1139 bool compatibleGpusFound(const gmx_gpu_info_t &gpu_info)
1140 {
1141     return gpu_info.n_dev_compatible > 0;
1142 }
1143
1144 void gmx_parse_gpu_ids(gmx_gpu_opt_t *gpu_opt)
1145 {
1146     if (!hasUserSetGpuIds(gpu_opt))
1147     {
1148         return;
1149     }
1150
1151     /* Parse a "plain" or comma-separated GPU ID string which contains a
1152      * sequence of digits corresponding to GPU IDs; the order will
1153      * indicate the process/tMPI thread - GPU assignment. */
1154     parse_digits_from_string(gpu_opt->gpu_id, &gpu_opt->n_dev_use, &gpu_opt->dev_use);
1155
1156     if (gpu_opt->n_dev_use == 0)
1157     {
1158         gmx_fatal(FARGS, "Empty GPU ID string encountered.\n"
1159                   "An empty, delimiter-free, or comma-separated sequence of valid numeric IDs of available GPUs is required.\n");
1160     }
1161 }
1162
1163 void gmx_hardware_info_free(gmx_hw_info_t *hwinfo)
1164 {
1165     int ret;
1166
1167     ret = tMPI_Thread_mutex_lock(&hw_info_lock);
1168     if (ret != 0)
1169     {
1170         gmx_fatal(FARGS, "Error locking hwinfo mutex: %s", strerror(errno));
1171     }
1172
1173     /* decrease the reference counter */
1174     n_hwinfo--;
1175
1176
1177     if (hwinfo != hwinfo_g)
1178     {
1179         gmx_incons("hwinfo < hwinfo_g");
1180     }
1181
1182     if (n_hwinfo < 0)
1183     {
1184         gmx_incons("n_hwinfo < 0");
1185     }
1186
1187     if (n_hwinfo == 0)
1188     {
1189         delete hwinfo_g->cpuInfo;
1190         delete hwinfo_g->hardwareTopology;
1191         free_gpu_info(&hwinfo_g->gpu_info);
1192         sfree(hwinfo_g);
1193     }
1194
1195     ret = tMPI_Thread_mutex_unlock(&hw_info_lock);
1196     if (ret != 0)
1197     {
1198         gmx_fatal(FARGS, "Error unlocking hwinfo mutex: %s", strerror(errno));
1199     }
1200 }