src/gromacs/hardware/detecthardware.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 #include "gmxpre.h"
  36
  37 #include "detecthardware.h"
  38
  39 #include "config.h"
  40
  41 #include <cerrno>
  42 #include <cstdlib>
  43 #include <cstring>
  44
  45 #include <algorithm>
  46 #include <chrono>
  47 #include <string>
  48 #include <thread>
  49 #include <vector>
  50
  51 #include "thread_mpi/threads.h"
  52
  53 #include "gromacs/gmxlib/network.h"
  54 #include "gromacs/gpu_utils/gpu_utils.h"
  55 #include "gromacs/hardware/cpuinfo.h"
  56 #include "gromacs/hardware/gpu_hw_info.h"
  57 #include "gromacs/hardware/hardwareassign.h"
  58 #include "gromacs/hardware/hardwaretopology.h"
  59 #include "gromacs/hardware/hw_info.h"
  60 #include "gromacs/mdtypes/commrec.h"
  61 #include "gromacs/mdtypes/md_enums.h"
  62 #include "gromacs/simd/support.h"
  63 #include "gromacs/utility/arrayref.h"
  64 #include "gromacs/utility/basedefinitions.h"
  65 #include "gromacs/utility/basenetwork.h"
  66 #include "gromacs/utility/baseversion.h"
  67 #include "gromacs/utility/cstringutil.h"
  68 #include "gromacs/utility/exceptions.h"
  69 #include "gromacs/utility/fatalerror.h"
  70 #include "gromacs/utility/gmxassert.h"
  71 #include "gromacs/utility/logger.h"
  72 #include "gromacs/utility/programcontext.h"
  73 #include "gromacs/utility/smalloc.h"
  74 #include "gromacs/utility/stringutil.h"
  75 #include "gromacs/utility/sysinfo.h"
  76
  77 #ifdef HAVE_UNISTD_H
  78 #    include <unistd.h>       // sysconf()
  79 #endif
  80
  81 //! Convenience macro to help us avoid ifdefs each time we use sysconf
  82 #if !defined(_SC_NPROCESSORS_ONLN) && defined(_SC_NPROC_ONLN)
  83 #    define _SC_NPROCESSORS_ONLN _SC_NPROC_ONLN
  84 #endif
  85
  86 //! Convenience macro to help us avoid ifdefs each time we use sysconf
  87 #if !defined(_SC_NPROCESSORS_CONF) && defined(_SC_NPROC_CONF)
  88 #    define _SC_NPROCESSORS_CONF _SC_NPROC_CONF
  89 #endif
  90
  91 #if defined (__i386__) || defined (__x86_64__) || defined (_M_IX86) || defined (_M_X64)
  92 //! Constant used to help minimize preprocessed code
  93 static const bool isX86 = true;
  94 #else
  95 //! Constant used to help minimize preprocessed code
  96 static const bool isX86 = false;
  97 #endif
  98
  99 #if defined __powerpc__ || defined __ppc__ || defined __PPC__
 100 static const bool isPowerPC = true;
 101 #else
 102 static const bool isPowerPC = false;
 103 #endif
 104
 105 //! Constant used to help minimize preprocessed code
 106 static const bool bGPUBinary     = GMX_GPU != GMX_GPU_NONE;
 107
 108 /* Note that some of the following arrays must match the "GPU support
 109  * enumeration" in src/config.h.cmakein, so that GMX_GPU looks up an
 110  * array entry. */
 111
 112 // TODO If/when we unify CUDA and OpenCL support code, this should
 113 // move to a single place in gpu_utils.
 114 /* Names of the GPU detection/check results (see e_gpu_detect_res_t in hw_info.h). */
 115 const char * const gpu_detect_res_str[egpuNR] =
 116 {
 117     "compatible", "inexistent", "incompatible", "insane"
 118 };
 119
 120 /* The globally shared hwinfo structure. */
 121 static gmx_hw_info_t      *hwinfo_g;
 122 /* A reference counter for the hwinfo structure */
 123 static int                 n_hwinfo = 0;
 124 /* A lock to protect the hwinfo structure */
 125 static tMPI_Thread_mutex_t hw_info_lock = TMPI_THREAD_MUTEX_INITIALIZER;
 126
 127 #define HOSTNAMELEN 80
 128
 129 /* FW decl. */
 130 static size_t gmx_count_gpu_dev_unique(const gmx_gpu_info_t &gpu_info,
 131                                        const gmx_gpu_opt_t  *gpu_opt);
 132
 133 /* FW decl. */
 134 static int gmx_count_gpu_dev_shared(const gmx_gpu_opt_t *gpu_opt,
 135                                     bool                 userSetGpuIds);
 136
 137 /*! \internal \brief
 138  * Returns the GPU information text, one GPU per line.
 139  */
 140 static std::string sprint_gpus(const gmx_gpu_info_t &gpu_info)
 141 {
 142     char                     stmp[STRLEN];
 143     std::vector<std::string> gpuStrings;
 144     for (int i = 0; i < gpu_info.n_dev; i++)
 145     {
 146         get_gpu_device_info_string(stmp, gpu_info, i);
 147         gpuStrings.push_back(gmx::formatString("    %s", stmp));
 148     }
 149     return gmx::joinStrings(gpuStrings, "\n");
 150 }
 151
 152 // TODO This function should not live in detecthardware.cpp
 153 std::string
 154 makeGpuUsageReport(const gmx_gpu_info_t &gpu_info,
 155                    const gmx_gpu_opt_t  *gpu_opt,
 156                    bool                  userSetGpuIds,
 157                    size_t                numPpRanks,
 158                    bool                  bPrintHostName)
 159 {
 160     int  ngpu_use  = gpu_opt->n_dev_use;
 161     int  ngpu_comp = gpu_info.n_dev_compatible;
 162     char host[HOSTNAMELEN];
 163
 164     if (bPrintHostName)
 165     {
 166         gmx_gethostname(host, HOSTNAMELEN);
 167     }
 168
 169     /* Issue a note if GPUs are available but not used */
 170     if (ngpu_comp > 0 && ngpu_use < 1)
 171     {
 172         return gmx::formatString("%d compatible GPU%s detected in the system, but none will be used.\n"
 173                                  "Consider trying GPU acceleration with the Verlet scheme!\n",
 174                                  ngpu_comp, (ngpu_comp > 1) ? "s" : "");
 175     }
 176
 177     std::string output;
 178     {
 179         std::vector<int> gpuIdsInUse;
 180         for (int i = 0; i < ngpu_use; i++)
 181         {
 182             gpuIdsInUse.push_back(get_gpu_device_id(gpu_info, gpu_opt, i));
 183         }
 184         std::string gpuIdsString =
 185             formatAndJoin(gpuIdsInUse, ",", gmx::StringFormatter("%d"));
 186         size_t      numGpusInUse = gmx_count_gpu_dev_unique(gpu_info, gpu_opt);
 187         bool        bPluralGpus  = numGpusInUse > 1;
 188
 189         if (bPrintHostName)
 190         {
 191             output += gmx::formatString("On host %s ", host);
 192         }
 193         output += gmx::formatString("%zu GPU%s %sselected for this run.\n"
 194                                     "Mapping of GPU ID%s to the %d PP rank%s in this node: %s\n",
 195                                     numGpusInUse, bPluralGpus ? "s" : "",
 196                                     userSetGpuIds ? "user-" : "auto-",
 197                                     bPluralGpus ? "s" : "",
 198                                     numPpRanks,
 199                                     (numPpRanks > 1) ? "s" : "",
 200                                     gpuIdsString.c_str());
 201     }
 202
 203     int same_count = gmx_count_gpu_dev_shared(gpu_opt, userSetGpuIds);
 204
 205     if (same_count > 0)
 206     {
 207         output += gmx::formatString("NOTE: You assigned %s to multiple ranks.\n",
 208                                     same_count > 1 ? "GPU IDs" : "a GPU ID");
 209     }
 210
 211     if (static_cast<size_t>(ngpu_comp) > numPpRanks)
 212     {
 213         /* TODO In principle, this warning could be warranted only on
 214          * ranks on some nodes, but we lack the infrastructure to do a
 215          * good job of reporting that. */
 216         output += gmx::formatString("NOTE: potentially sub-optimal launch configuration using fewer\n"
 217                                     "      PP ranks on a node than GPUs available on that node.\n");
 218     }
 219
 220     return output;
 221 }
 222
 223 /* Give a suitable fatal error or warning if the build configuration
 224    and runtime CPU do not match. */
 225 static void
 226 check_use_of_rdtscp_on_this_cpu(const gmx::MDLogger   &mdlog,
 227                                 const gmx::CpuInfo    &cpuInfo)
 228 {
 229     bool        binaryUsesRdtscp = HAVE_RDTSCP;
 230
 231     const char *programName = gmx::getProgramContext().displayName();
 232
 233     if (cpuInfo.supportLevel() < gmx::CpuInfo::SupportLevel::Features)
 234     {
 235         if (binaryUsesRdtscp)
 236         {
 237             GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
 238                     "The %s executable was compiled to use the rdtscp CPU instruction. "
 239                     "We cannot detect the features of your current CPU, but will proceed anyway. "
 240                     "If you get a crash, rebuild GROMACS with the GMX_USE_RDTSCP=OFF CMake option.",
 241                     programName);
 242         }
 243     }
 244     else
 245     {
 246         bool cpuHasRdtscp = cpuInfo.feature(gmx::CpuInfo::Feature::X86_Rdtscp);
 247
 248         if (!cpuHasRdtscp && binaryUsesRdtscp)
 249         {
 250             gmx_fatal(FARGS, "The %s executable was compiled to use the rdtscp CPU instruction. "
 251                       "However, this is not supported by the current hardware and continuing would lead to a crash. "
 252                       "Please rebuild GROMACS with the GMX_USE_RDTSCP=OFF CMake option.",
 253                       programName);
 254         }
 255
 256         if (cpuHasRdtscp && !binaryUsesRdtscp)
 257         {
 258             GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
 259                     "The current CPU can measure timings more accurately than the code in\n"
 260                     "%s was configured to use. This might affect your simulation\n"
 261                     "speed as accurate timings are needed for load-balancing.\n"
 262                     "Please consider rebuilding %s with the GMX_USE_RDTSCP=ON CMake option.",
 263                     programName, programName);
 264         }
 265     }
 266 }
 267
 268 void gmx_check_hw_runconf_consistency(const gmx::MDLogger &mdlog,
 269                                       const gmx_hw_info_t *hwinfo,
 270                                       const t_commrec     *cr,
 271                                       const gmx_hw_opt_t  &hw_opt,
 272                                       bool                 userSetGpuIds,
 273                                       bool                 willUsePhysicalGpu)
 274 {
 275     int      npppn;
 276     char     th_or_proc[STRLEN], th_or_proc_plural[STRLEN], pernode[STRLEN];
 277     gmx_bool btMPI, bMPI, bNthreadsAuto;
 278
 279     GMX_RELEASE_ASSERT(hwinfo, "hwinfo must be a non-NULL pointer");
 280     GMX_RELEASE_ASSERT(cr, "cr must be a non-NULL pointer");
 281
 282     /* Below we only do consistency checks for PP and GPUs,
 283      * this is irrelevant for PME only nodes, so in that case we return
 284      * here.
 285      */
 286     if (!(cr->duty & DUTY_PP))
 287     {
 288         return;
 289     }
 290
 291 #if GMX_THREAD_MPI
 292     bMPI          = FALSE;
 293     btMPI         = TRUE;
 294     bNthreadsAuto = (hw_opt.nthreads_tmpi < 1);
 295 #elif GMX_LIB_MPI
 296     bMPI          = TRUE;
 297     btMPI         = FALSE;
 298     bNthreadsAuto = FALSE;
 299 #else
 300     bMPI          = FALSE;
 301     btMPI         = FALSE;
 302     bNthreadsAuto = FALSE;
 303 #endif
 304
 305     /* Need to ensure that we have enough GPUs:
 306      * - need one GPU per PP node
 307      * - no GPU oversubscription with tMPI
 308      * */
 309     /* number of PP processes per node */
 310     npppn = cr->nrank_pp_intranode;
 311
 312     pernode[0]           = '\0';
 313     th_or_proc_plural[0] = '\0';
 314     if (btMPI)
 315     {
 316         sprintf(th_or_proc, "thread-MPI thread");
 317         if (npppn > 1)
 318         {
 319             sprintf(th_or_proc_plural, "s");
 320         }
 321     }
 322     else if (bMPI)
 323     {
 324         sprintf(th_or_proc, "MPI process");
 325         if (npppn > 1)
 326         {
 327             sprintf(th_or_proc_plural, "es");
 328         }
 329         sprintf(pernode, " per node");
 330     }
 331     else
 332     {
 333         /* neither MPI nor tMPI */
 334         sprintf(th_or_proc, "process");
 335     }
 336
 337     if (willUsePhysicalGpu)
 338     {
 339         int  ngpu_comp, ngpu_use;
 340         char gpu_comp_plural[2], gpu_use_plural[2];
 341
 342         ngpu_comp = hwinfo->gpu_info.n_dev_compatible;
 343         ngpu_use  = hw_opt.gpu_opt.n_dev_use;
 344
 345         sprintf(gpu_comp_plural, "%s", (ngpu_comp > 1) ? "s" : "");
 346         sprintf(gpu_use_plural,  "%s", (ngpu_use > 1) ? "s" : "");
 347
 348         const char *programName = gmx::getProgramContext().displayName();
 349
 350         /* number of tMPI threads auto-adjusted */
 351         if (btMPI && bNthreadsAuto)
 352         {
 353             if (userSetGpuIds && npppn < ngpu_use)
 354             {
 355                 /* The user manually provided more GPUs than threads we
 356                    could automatically start. */
 357                 gmx_fatal(FARGS,
 358                           "%d GPU%s provided, but only %d PP thread-MPI thread%s coud be started.\n"
 359                           "%s requires one PP tread-MPI thread per GPU; use fewer GPUs.",
 360                           ngpu_use, gpu_use_plural,
 361                           npppn, th_or_proc_plural,
 362                           programName);
 363             }
 364
 365             if (!userSetGpuIds && npppn < ngpu_comp)
 366             {
 367                 /* There are more GPUs than tMPI threads; we have
 368                    limited the number GPUs used. */
 369                 GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
 370                         "NOTE: %d GPU%s were detected, but only %d PP thread-MPI thread%s can be started.\n"
 371                         "      %s can use one GPU per PP tread-MPI thread, so only %d GPU%s will be used.",
 372                         ngpu_comp, gpu_comp_plural,
 373                         npppn, th_or_proc_plural,
 374                         programName, npppn,
 375                         npppn > 1 ? "s" : "");
 376             }
 377         }
 378
 379         if (userSetGpuIds)
 380         {
 381             if (ngpu_use != npppn)
 382             {
 383                 gmx_fatal(FARGS,
 384                           "Incorrect launch configuration: mismatching number of PP %s%s and GPUs%s.\n"
 385                           "%s was started with %d PP %s%s%s, but you provided %d GPU%s.",
 386                           th_or_proc, btMPI ? "s" : "es", pernode,
 387                           programName, npppn, th_or_proc,
 388                           th_or_proc_plural, pernode,
 389                           ngpu_use, gpu_use_plural);
 390             }
 391         }
 392         else
 393         {
 394             if (ngpu_use != npppn)
 395             {
 396                 /* Avoid duplicate error messages.
 397                  * Unfortunately we can only do this at the physical node
 398                  * level, since the hardware setup and MPI process count
 399                  * might differ between physical nodes.
 400                  */
 401                 if (cr->rank_pp_intranode == 0)
 402                 {
 403                     gmx_fatal(FARGS,
 404                               "Incorrect launch configuration: mismatching number of PP %s%s and GPUs%s.\n"
 405                               "%s was started with %d PP %s%s%s, but only %d GPU%s was detected.",
 406                               th_or_proc, btMPI ? "s" : "es", pernode,
 407                               programName, npppn, th_or_proc,
 408                               th_or_proc_plural, pernode,
 409                               ngpu_use, gpu_use_plural);
 410                 }
 411             }
 412         }
 413     }
 414 }
 415
 416 /*! \brief Return the number of PP rank pairs that share a GPU device between them.
 417  *
 418  * Sharing GPUs among multiple PP ranks is possible via either user or
 419  * automated selection. */
 420 static int gmx_count_gpu_dev_shared(const gmx_gpu_opt_t *gpu_opt, bool userSetGpuIds)
 421 {
 422     int      same_count    = 0;
 423     int      ngpu          = gpu_opt->n_dev_use;
 424
 425     if (userSetGpuIds)
 426     {
 427         int      i, j;
 428
 429         for (i = 0; i < ngpu - 1; i++)
 430         {
 431             for (j = i + 1; j < ngpu; j++)
 432             {
 433                 same_count      += (gpu_opt->dev_use[i] ==
 434                                     gpu_opt->dev_use[j]);
 435             }
 436         }
 437     }
 438
 439     return same_count;
 440 }
 441
 442 /* Count and return the number of unique GPUs (per node) selected.
 443  *
 444  * As sharing GPUs among multiple PP ranks is possible when the user passes
 445  * GPU IDs, the number of GPUs user (per node) can be different from the
 446  * number of GPU IDs selected.
 447  */
 448 static size_t gmx_count_gpu_dev_unique(const gmx_gpu_info_t &gpu_info,
 449                                        const gmx_gpu_opt_t  *gpu_opt)
 450 {
 451     GMX_RELEASE_ASSERT(gpu_opt, "gpu_opt must be a non-NULL pointer");
 452
 453     std::set<int> uniqIds;
 454     for (int i = 0; i < gpu_opt->n_dev_use; i++)
 455     {
 456         int deviceId = get_gpu_device_id(gpu_info, gpu_opt, i);
 457         uniqIds.insert(deviceId);
 458     }
 459     return uniqIds.size();
 460 }
 461
 462 static void gmx_detect_gpus(const gmx::MDLogger &mdlog, const t_commrec *cr)
 463 {
 464 #if GMX_LIB_MPI
 465     int              rank_world;
 466     MPI_Comm         physicalnode_comm;
 467 #endif
 468     int              rank_local;
 469
 470     /* Under certain circumstances MPI ranks on the same physical node
 471      * can not simultaneously access the same GPU(s). Therefore we run
 472      * the detection only on one MPI rank per node and broadcast the info.
 473      * Note that with thread-MPI only a single thread runs this code.
 474      *
 475      * NOTE: We can't broadcast gpu_info with OpenCL as the device and platform
 476      * ID stored in the structure are unique for each rank (even if a device
 477      * is shared by multiple ranks).
 478      *
 479      * TODO: We should also do CPU hardware detection only once on each
 480      * physical node and broadcast it, instead of do it on every MPI rank.
 481      */
 482 #if GMX_LIB_MPI
 483     /* A split of MPI_COMM_WORLD over physical nodes is only required here,
 484      * so we create and destroy it locally.
 485      */
 486     MPI_Comm_rank(MPI_COMM_WORLD, &rank_world);
 487     MPI_Comm_split(MPI_COMM_WORLD, gmx_physicalnode_id_hash(),
 488                    rank_world, &physicalnode_comm);
 489     MPI_Comm_rank(physicalnode_comm, &rank_local);
 490     GMX_UNUSED_VALUE(cr);
 491 #else
 492     /* Here there should be only one process, check this */
 493     GMX_RELEASE_ASSERT(cr->nnodes == 1 && cr->sim_nodeid == 0, "Only a single (master) process should execute here");
 494
 495     rank_local = 0;
 496 #endif
 497
 498     /*  With CUDA detect only on one rank per host, with OpenCL need do
 499      *  the detection on all PP ranks */
 500     bool isOpenclPpRank = ((GMX_GPU == GMX_GPU_OPENCL) && (cr->duty & DUTY_PP));
 501
 502     if (rank_local == 0 || isOpenclPpRank)
 503     {
 504         char detection_error[STRLEN] = "", sbuf[STRLEN];
 505
 506         if (detect_gpus(&hwinfo_g->gpu_info, detection_error) != 0)
 507         {
 508             if (detection_error[0] != '\0')
 509             {
 510                 sprintf(sbuf, ":\n      %s\n", detection_error);
 511             }
 512             else
 513             {
 514                 sprintf(sbuf, ".");
 515             }
 516             GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
 517                     "NOTE: Error occurred during GPU detection%s"
 518                     "      Can not use GPU acceleration, will fall back to CPU kernels.",
 519                     sbuf);
 520         }
 521     }
 522
 523 #if GMX_LIB_MPI
 524     if (!isOpenclPpRank)
 525     {
 526         /* Broadcast the GPU info to the other ranks within this node */
 527         MPI_Bcast(&hwinfo_g->gpu_info.n_dev, 1, MPI_INT, 0, physicalnode_comm);
 528
 529         if (hwinfo_g->gpu_info.n_dev > 0)
 530         {
 531             int dev_size;
 532
 533             dev_size = hwinfo_g->gpu_info.n_dev*sizeof_gpu_dev_info();
 534
 535             if (rank_local > 0)
 536             {
 537                 hwinfo_g->gpu_info.gpu_dev =
 538                     (struct gmx_device_info_t *)malloc(dev_size);
 539             }
 540             MPI_Bcast(hwinfo_g->gpu_info.gpu_dev, dev_size, MPI_BYTE,
 541                       0, physicalnode_comm);
 542             MPI_Bcast(&hwinfo_g->gpu_info.n_dev_compatible, 1, MPI_INT,
 543                       0, physicalnode_comm);
 544         }
 545     }
 546
 547     MPI_Comm_free(&physicalnode_comm);
 548 #endif
 549 }
 550
 551 static void gmx_collect_hardware_mpi(const gmx::CpuInfo &cpuInfo)
 552 {
 553     const int ncore = hwinfo_g->hardwareTopology->numberOfCores();
 554 #if GMX_LIB_MPI
 555     int       rank_id;
 556     int       nrank, rank, nhwthread, ngpu, i;
 557     int       gpu_hash;
 558     int      *buf, *all;
 559
 560     rank_id   = gmx_physicalnode_id_hash();
 561     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 562     MPI_Comm_size(MPI_COMM_WORLD, &nrank);
 563     nhwthread = hwinfo_g->nthreads_hw_avail;
 564     ngpu      = hwinfo_g->gpu_info.n_dev_compatible;
 565     /* Create a unique hash of the GPU type(s) in this node */
 566     gpu_hash  = 0;
 567     /* Here it might be better to only loop over the compatible GPU, but we
 568      * don't have that information available and it would also require
 569      * removing the device ID from the device info string.
 570      */
 571     for (i = 0; i < hwinfo_g->gpu_info.n_dev; i++)
 572     {
 573         char stmp[STRLEN];
 574
 575         /* Since the device ID is incorporated in the hash, the order of
 576          * the GPUs affects the hash. Also two identical GPUs won't give
 577          * a gpu_hash of zero after XORing.
 578          */
 579         get_gpu_device_info_string(stmp, hwinfo_g->gpu_info, i);
 580         gpu_hash ^= gmx_string_fullhash_func(stmp, gmx_string_hash_init);
 581     }
 582
 583     snew(buf, nrank);
 584     snew(all, nrank);
 585     buf[rank] = rank_id;
 586
 587     MPI_Allreduce(buf, all, nrank, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
 588
 589     gmx_bool bFound;
 590     int      nnode0, ncore0, nhwthread0, ngpu0, r;
 591
 592     bFound     = FALSE;
 593     ncore0     = 0;
 594     nnode0     = 0;
 595     nhwthread0 = 0;
 596     ngpu0      = 0;
 597     for (r = 0; r < nrank; r++)
 598     {
 599         if (all[r] == rank_id)
 600         {
 601             if (!bFound && r == rank)
 602             {
 603                 /* We are the first rank in this physical node */
 604                 nnode0     = 1;
 605                 ncore0     = ncore;
 606                 nhwthread0 = nhwthread;
 607                 ngpu0      = ngpu;
 608             }
 609             bFound = TRUE;
 610         }
 611     }
 612
 613     sfree(buf);
 614     sfree(all);
 615
 616     int sum[4], maxmin[10];
 617
 618     {
 619         int buf[4];
 620
 621         /* Sum values from only intra-rank 0 so we get the sum over all nodes */
 622         buf[0] = nnode0;
 623         buf[1] = ncore0;
 624         buf[2] = nhwthread0;
 625         buf[3] = ngpu0;
 626
 627         MPI_Allreduce(buf, sum, 4, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
 628     }
 629
 630     {
 631         int buf[10];
 632
 633         /* Store + and - values for all ranks,
 634          * so we can get max+min with one MPI call.
 635          */
 636         buf[0] = ncore;
 637         buf[1] = nhwthread;
 638         buf[2] = ngpu;
 639         buf[3] = static_cast<int>(gmx::simdSuggested(cpuInfo));
 640         buf[4] = gpu_hash;
 641         buf[5] = -buf[0];
 642         buf[6] = -buf[1];
 643         buf[7] = -buf[2];
 644         buf[8] = -buf[3];
 645         buf[9] = -buf[4];
 646
 647         MPI_Allreduce(buf, maxmin, 10, MPI_INT, MPI_MAX, MPI_COMM_WORLD);
 648     }
 649
 650     hwinfo_g->nphysicalnode       = sum[0];
 651     hwinfo_g->ncore_tot           = sum[1];
 652     hwinfo_g->ncore_min           = -maxmin[5];
 653     hwinfo_g->ncore_max           = maxmin[0];
 654     hwinfo_g->nhwthread_tot       = sum[2];
 655     hwinfo_g->nhwthread_min       = -maxmin[6];
 656     hwinfo_g->nhwthread_max       = maxmin[1];
 657     hwinfo_g->ngpu_compatible_tot = sum[3];
 658     hwinfo_g->ngpu_compatible_min = -maxmin[7];
 659     hwinfo_g->ngpu_compatible_max = maxmin[2];
 660     hwinfo_g->simd_suggest_min    = -maxmin[8];
 661     hwinfo_g->simd_suggest_max    = maxmin[3];
 662     hwinfo_g->bIdenticalGPUs      = (maxmin[4] == -maxmin[9]);
 663 #else
 664     /* All ranks use the same pointer, protected by a mutex in the caller */
 665     hwinfo_g->nphysicalnode       = 1;
 666     hwinfo_g->ncore_tot           = ncore;
 667     hwinfo_g->ncore_min           = ncore;
 668     hwinfo_g->ncore_max           = ncore;
 669     hwinfo_g->nhwthread_tot       = hwinfo_g->nthreads_hw_avail;
 670     hwinfo_g->nhwthread_min       = hwinfo_g->nthreads_hw_avail;
 671     hwinfo_g->nhwthread_max       = hwinfo_g->nthreads_hw_avail;
 672     hwinfo_g->ngpu_compatible_tot = hwinfo_g->gpu_info.n_dev_compatible;
 673     hwinfo_g->ngpu_compatible_min = hwinfo_g->gpu_info.n_dev_compatible;
 674     hwinfo_g->ngpu_compatible_max = hwinfo_g->gpu_info.n_dev_compatible;
 675     hwinfo_g->simd_suggest_min    = static_cast<int>(simdSuggested(cpuInfo));
 676     hwinfo_g->simd_suggest_max    = static_cast<int>(simdSuggested(cpuInfo));
 677     hwinfo_g->bIdenticalGPUs      = TRUE;
 678 #endif
 679 }
 680
 681 /*! \brief Utility that does dummy computing for max 2 seconds to spin up cores
 682  *
 683  *  This routine will check the number of cores configured and online
 684  *  (using sysconf), and the spins doing dummy compute operations for up to
 685  *  2 seconds, or until all cores have come online. This can be used prior to
 686  *  hardware detection for platforms that take unused processors offline.
 687  *
 688  *  This routine will not throw exceptions.
 689  */
 690 static void
 691 spinUpCore() noexcept
 692 {
 693 #if defined(HAVE_SYSCONF) && defined(_SC_NPROCESSORS_CONF) && defined(_SC_NPROCESSORS_ONLN)
 694     float dummy           = 0.1;
 695     int   countConfigured = sysconf(_SC_NPROCESSORS_CONF);    // noexcept
 696     auto  start           = std::chrono::steady_clock::now(); // noexcept
 697
 698     while (sysconf(_SC_NPROCESSORS_ONLN) < countConfigured &&
 699            std::chrono::steady_clock::now() - start < std::chrono::seconds(2))
 700     {
 701         for (int i = 1; i < 10000; i++)
 702         {
 703             dummy /= i;
 704         }
 705     }
 706
 707     if (dummy < 0)
 708     {
 709         printf("This cannot happen, but prevents loop from being optimized away.");
 710     }
 711 #endif
 712 }
 713
 714 /*! \brief Prepare the system before hardware topology detection
 715  *
 716  * This routine should perform any actions we want to put the system in a state
 717  * where we want it to be before detecting the hardware topology. For most
 718  * processors there is nothing to do, but some architectures (in particular ARM)
 719  * have support for taking configured cores offline, which will make them disappear
 720  * from the online processor count.
 721  *
 722  * This routine checks if there is a mismatch between the number of cores
 723  * configured and online, and in that case we issue a small workload that
 724  * attempts to wake sleeping cores before doing the actual detection.
 725  *
 726  * This type of mismatch can also occur for x86 or PowerPC on Linux, if SMT has only
 727  * been disabled in the kernel (rather than bios). Since those cores will never
 728  * come online automatically, we currently skip this test for x86 & PowerPC to
 729  * avoid wasting 2 seconds. We also skip the test if there is no thread support.
 730  *
 731  * \note Cores will sleep relatively quickly again, so it's important to issue
 732  *       the real detection code directly after this routine.
 733  */
 734 static void
 735 hardwareTopologyPrepareDetection()
 736 {
 737 #if defined(HAVE_SYSCONF) && defined(_SC_NPROCESSORS_CONF) && \
 738     (defined(THREAD_PTHREADS) || defined(THREAD_WINDOWS))
 739
 740     // Modify this conditional when/if x86 or PowerPC starts to sleep some cores
 741     if (!isX86 && !isPowerPC)
 742     {
 743         int                      countConfigured  = sysconf(_SC_NPROCESSORS_CONF);
 744         std::vector<std::thread> workThreads(countConfigured);
 745
 746         for (auto &t : workThreads)
 747         {
 748             t = std::thread(spinUpCore);
 749         }
 750
 751         for (auto &t : workThreads)
 752         {
 753             t.join();
 754         }
 755     }
 756 #endif
 757 }
 758
 759 /*! \brief Sanity check hardware topology and print some notes to log
 760  *
 761  *  \param mdlog            Logger.
 762  *  \param hardwareTopology Reference to hardwareTopology object.
 763  */
 764 static void
 765 hardwareTopologyDoubleCheckDetection(const gmx::MDLogger gmx_unused         &mdlog,
 766                                      const gmx::HardwareTopology gmx_unused &hardwareTopology)
 767 {
 768 #if defined HAVE_SYSCONF && defined(_SC_NPROCESSORS_CONF)
 769     if (hardwareTopology.supportLevel() < gmx::HardwareTopology::SupportLevel::LogicalProcessorCount)
 770     {
 771         return;
 772     }
 773
 774     int countFromDetection = hardwareTopology.machine().logicalProcessorCount;
 775     int countConfigured    = sysconf(_SC_NPROCESSORS_CONF);
 776
 777     /* BIOS, kernel or user actions can take physical processors
 778      * offline. We already cater for the some of the cases inside the hardwareToplogy
 779      * by trying to spin up cores just before we detect, but there could be other
 780      * cases where it is worthwhile to hint that there might be more resources available.
 781      */
 782     if (countConfigured >= 0 && countConfigured != countFromDetection)
 783     {
 784         GMX_LOG(mdlog.info).
 785             appendTextFormatted("Note: %d CPUs configured, but only %d were detected to be online.\n", countConfigured, countFromDetection);
 786
 787         if (isX86 && countConfigured == 2*countFromDetection)
 788         {
 789             GMX_LOG(mdlog.info).
 790                 appendText("      X86 Hyperthreading is likely disabled; enable it for better performance.");
 791         }
 792         // For PowerPC (likely Power8) it is possible to set SMT to either 2,4, or 8-way hardware threads.
 793         // We only warn if it is completely disabled since default performance drops with SMT8.
 794         if (isPowerPC && countConfigured == 8*countFromDetection)
 795         {
 796             GMX_LOG(mdlog.info).
 797                 appendText("      PowerPC SMT is likely disabled; enable SMT2/SMT4 for better performance.");
 798         }
 799     }
 800 #endif
 801 }
 802
 803
 804 gmx_hw_info_t *gmx_detect_hardware(const gmx::MDLogger &mdlog, const t_commrec *cr,
 805                                    gmx_bool bDetectGPUs)
 806 {
 807     int ret;
 808
 809     /* make sure no one else is doing the same thing */
 810     ret = tMPI_Thread_mutex_lock(&hw_info_lock);
 811     if (ret != 0)
 812     {
 813         gmx_fatal(FARGS, "Error locking hwinfo mutex: %s", strerror(errno));
 814     }
 815
 816     /* only initialize the hwinfo structure if it is not already initalized */
 817     if (n_hwinfo == 0)
 818     {
 819         snew(hwinfo_g, 1);
 820
 821         hwinfo_g->cpuInfo             = new gmx::CpuInfo(gmx::CpuInfo::detect());
 822
 823         hardwareTopologyPrepareDetection();
 824         hwinfo_g->hardwareTopology    = new gmx::HardwareTopology(gmx::HardwareTopology::detect());
 825
 826         // If we detected the topology on this system, double-check that it makes sense
 827         if (hwinfo_g->hardwareTopology->isThisSystem())
 828         {
 829             hardwareTopologyDoubleCheckDetection(mdlog, *(hwinfo_g->hardwareTopology));
 830         }
 831
 832         // TODO: Get rid of this altogether.
 833         hwinfo_g->nthreads_hw_avail = hwinfo_g->hardwareTopology->machine().logicalProcessorCount;
 834
 835         /* detect GPUs */
 836         hwinfo_g->gpu_info.n_dev            = 0;
 837         hwinfo_g->gpu_info.n_dev_compatible = 0;
 838         hwinfo_g->gpu_info.gpu_dev          = nullptr;
 839
 840         /* Run the detection if the binary was compiled with GPU support
 841          * and we requested detection.
 842          */
 843         hwinfo_g->gpu_info.bDetectGPUs =
 844             (bGPUBinary && bDetectGPUs &&
 845              getenv("GMX_DISABLE_GPU_DETECTION") == nullptr);
 846         if (hwinfo_g->gpu_info.bDetectGPUs)
 847         {
 848             gmx_detect_gpus(mdlog, cr);
 849         }
 850
 851         gmx_collect_hardware_mpi(*hwinfo_g->cpuInfo);
 852     }
 853     /* increase the reference counter */
 854     n_hwinfo++;
 855
 856     ret = tMPI_Thread_mutex_unlock(&hw_info_lock);
 857     if (ret != 0)
 858     {
 859         gmx_fatal(FARGS, "Error unlocking hwinfo mutex: %s", strerror(errno));
 860     }
 861
 862     return hwinfo_g;
 863 }
 864
 865 static std::string detected_hardware_string(const gmx_hw_info_t *hwinfo,
 866                                             bool                 bFullCpuInfo)
 867 {
 868     std::string                  s;
 869
 870     const gmx::CpuInfo          &cpuInfo = *hwinfo->cpuInfo;
 871     const gmx::HardwareTopology &hwTop   = *hwinfo->hardwareTopology;
 872
 873     s  = gmx::formatString("\n");
 874     s += gmx::formatString("Running on %d node%s with total",
 875                            hwinfo->nphysicalnode,
 876                            hwinfo->nphysicalnode == 1 ? "" : "s");
 877     if (hwinfo->ncore_tot > 0)
 878     {
 879         s += gmx::formatString(" %d cores,", hwinfo->ncore_tot);
 880     }
 881     s += gmx::formatString(" %d logical cores", hwinfo->nhwthread_tot);
 882     if (hwinfo->gpu_info.bDetectGPUs)
 883     {
 884         s += gmx::formatString(", %d compatible GPU%s",
 885                                hwinfo->ngpu_compatible_tot,
 886                                hwinfo->ngpu_compatible_tot == 1 ? "" : "s");
 887     }
 888     else if (bGPUBinary)
 889     {
 890         s += gmx::formatString(" (GPU detection deactivated)");
 891     }
 892     s += gmx::formatString("\n");
 893
 894     if (hwinfo->nphysicalnode > 1)
 895     {
 896         /* Print per node hardware feature counts */
 897         if (hwinfo->ncore_max > 0)
 898         {
 899             s += gmx::formatString("  Cores per node:           %2d", hwinfo->ncore_min);
 900             if (hwinfo->ncore_max > hwinfo->ncore_min)
 901             {
 902                 s += gmx::formatString(" - %2d", hwinfo->ncore_max);
 903             }
 904             s += gmx::formatString("\n");
 905         }
 906         s += gmx::formatString("  Logical cores per node:   %2d", hwinfo->nhwthread_min);
 907         if (hwinfo->nhwthread_max > hwinfo->nhwthread_min)
 908         {
 909             s += gmx::formatString(" - %2d", hwinfo->nhwthread_max);
 910         }
 911         s += gmx::formatString("\n");
 912         if (bGPUBinary)
 913         {
 914             s += gmx::formatString("  Compatible GPUs per node: %2d",
 915                                    hwinfo->ngpu_compatible_min);
 916             if (hwinfo->ngpu_compatible_max > hwinfo->ngpu_compatible_min)
 917             {
 918                 s += gmx::formatString(" - %2d", hwinfo->ngpu_compatible_max);
 919             }
 920             s += gmx::formatString("\n");
 921             if (hwinfo->ngpu_compatible_tot > 0)
 922             {
 923                 if (hwinfo->bIdenticalGPUs)
 924                 {
 925                     s += gmx::formatString("  All nodes have identical type(s) of GPUs\n");
 926                 }
 927                 else
 928                 {
 929                     /* This message will also appear with identical GPU types
 930                      * when at least one node has no GPU.
 931                      */
 932                     s += gmx::formatString("  Different nodes have different type(s) and/or order of GPUs\n");
 933                 }
 934             }
 935         }
 936     }
 937
 938 #if GMX_LIB_MPI
 939     char host[HOSTNAMELEN];
 940     int  rank;
 941
 942     gmx_gethostname(host, HOSTNAMELEN);
 943     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 944
 945     s += gmx::formatString("Hardware detected on host %s (the node of MPI rank %d):\n",
 946                            host, rank);
 947 #else
 948     s += gmx::formatString("Hardware detected:\n");
 949 #endif
 950     s += gmx::formatString("  CPU info:\n");
 951
 952     s += gmx::formatString("    Vendor: %s\n", cpuInfo.vendorString().c_str());
 953
 954     s += gmx::formatString("    Brand:  %s\n", cpuInfo.brandString().c_str());
 955
 956     if (bFullCpuInfo)
 957     {
 958         s += gmx::formatString("    Family: %d   Model: %d   Stepping: %d\n",
 959                                cpuInfo.family(), cpuInfo.model(), cpuInfo.stepping());
 960
 961         s += gmx::formatString("    Features:");
 962         for (auto &f : cpuInfo.featureSet())
 963         {
 964             s += gmx::formatString(" %s", cpuInfo.featureString(f).c_str());;
 965         }
 966         s += gmx::formatString("\n");
 967     }
 968
 969     s += gmx::formatString("    SIMD instructions most likely to fit this hardware: %s",
 970                            gmx::simdString(static_cast<gmx::SimdType>(hwinfo->simd_suggest_min)).c_str());
 971
 972     if (hwinfo->simd_suggest_max > hwinfo->simd_suggest_min)
 973     {
 974         s += gmx::formatString(" - %s", gmx::simdString(static_cast<gmx::SimdType>(hwinfo->simd_suggest_max)).c_str());
 975     }
 976     s += gmx::formatString("\n");
 977
 978     s += gmx::formatString("    SIMD instructions selected at GROMACS compile time: %s\n",
 979                            gmx::simdString(gmx::simdCompiled()).c_str());
 980
 981     s += gmx::formatString("\n");
 982
 983     s += gmx::formatString("  Hardware topology: ");
 984     switch (hwTop.supportLevel())
 985     {
 986         case gmx::HardwareTopology::SupportLevel::None:
 987             s += gmx::formatString("None\n");
 988             break;
 989         case gmx::HardwareTopology::SupportLevel::LogicalProcessorCount:
 990             s += gmx::formatString("Only logical processor count\n");
 991             break;
 992         case gmx::HardwareTopology::SupportLevel::Basic:
 993             s += gmx::formatString("Basic\n");
 994             break;
 995         case gmx::HardwareTopology::SupportLevel::Full:
 996             s += gmx::formatString("Full\n");
 997             break;
 998         case gmx::HardwareTopology::SupportLevel::FullWithDevices:
 999             s += gmx::formatString("Full, with devices\n");
1000             break;
1001     }
1002
1003     if (!hwTop.isThisSystem())
1004     {
1005         s += gmx::formatString("  NOTE: Hardware topology cached or synthetic, not detected.\n");
1006         if (char *p = getenv("HWLOC_XMLFILE"))
1007         {
1008             s += gmx::formatString("        HWLOC_XMLFILE=%s\n", p);
1009         }
1010     }
1011
1012     if (bFullCpuInfo)
1013     {
1014         if (hwTop.supportLevel() >= gmx::HardwareTopology::SupportLevel::Basic)
1015         {
1016             s += gmx::formatString("    Sockets, cores, and logical processors:\n");
1017
1018             for (auto &socket : hwTop.machine().sockets)
1019             {
1020                 s += gmx::formatString("      Socket %2d:", socket.id);
1021                 for (auto &c : socket.cores)
1022                 {
1023                     s += gmx::formatString(" [");
1024                     for (auto &t : c.hwThreads)
1025                     {
1026                         s += gmx::formatString(" %3d", t.logicalProcessorId);
1027                     }
1028                     s += gmx::formatString("]");
1029                 }
1030                 s += gmx::formatString("\n");
1031             }
1032         }
1033         if (hwTop.supportLevel() >= gmx::HardwareTopology::SupportLevel::Full)
1034         {
1035             s += gmx::formatString("    Numa nodes:\n");
1036             for (auto &n : hwTop.machine().numa.nodes)
1037             {
1038                 s += gmx::formatString("      Node %2d (%" GMX_PRIu64 " bytes mem):", n.id, n.memory);
1039                 for (auto &l : n.logicalProcessorId)
1040                 {
1041                     s += gmx::formatString(" %3d", l);
1042                 }
1043                 s += gmx::formatString("\n");
1044             }
1045             s += gmx::formatString("      Latency:\n          ");
1046             for (std::size_t j = 0; j < hwTop.machine().numa.nodes.size(); j++)
1047             {
1048                 s += gmx::formatString(" %5d", j);
1049             }
1050             s += gmx::formatString("\n");
1051             for (std::size_t i = 0; i < hwTop.machine().numa.nodes.size(); i++)
1052             {
1053                 s += gmx::formatString("     %5d", i);
1054                 for (std::size_t j = 0; j < hwTop.machine().numa.nodes.size(); j++)
1055                 {
1056                     s += gmx::formatString(" %5.2f", hwTop.machine().numa.relativeLatency[i][j]);
1057                 }
1058                 s += gmx::formatString("\n");
1059             }
1060
1061
1062             s += gmx::formatString("    Caches:\n");
1063             for (auto &c : hwTop.machine().caches)
1064             {
1065                 s += gmx::formatString("      L%d: %" GMX_PRIu64 " bytes, linesize %d bytes, assoc. %d, shared %d ways\n",
1066                                        c.level, c.size, c.linesize, c.associativity, c.shared);
1067             }
1068         }
1069         if (hwTop.supportLevel() >= gmx::HardwareTopology::SupportLevel::FullWithDevices)
1070         {
1071             s += gmx::formatString("    PCI devices:\n");
1072             for (auto &d : hwTop.machine().devices)
1073             {
1074                 s += gmx::formatString("      %04x:%02x:%02x.%1x  Id: %04x:%04x  Class: 0x%04x  Numa: %d\n",
1075                                        d.domain, d.bus, d.dev, d.func, d.vendorId, d.deviceId, d.classId, d.numaNodeId);
1076             }
1077         }
1078     }
1079
1080     if (bGPUBinary && (hwinfo->ngpu_compatible_tot > 0 ||
1081                        hwinfo->gpu_info.n_dev > 0))
1082     {
1083         s += gmx::formatString("  GPU info:\n");
1084         s += gmx::formatString("    Number of GPUs detected: %d\n",
1085                                hwinfo->gpu_info.n_dev);
1086         if (hwinfo->gpu_info.n_dev > 0)
1087         {
1088             s += sprint_gpus(hwinfo->gpu_info) + "\n";
1089         }
1090     }
1091     return s;
1092 }
1093
1094 void gmx_print_detected_hardware(FILE *fplog, const t_commrec *cr,
1095                                  const gmx::MDLogger &mdlog,
1096                                  const gmx_hw_info_t *hwinfo)
1097 {
1098     const gmx::CpuInfo &cpuInfo = *hwinfo->cpuInfo;
1099
1100     if (fplog != nullptr)
1101     {
1102         std::string detected;
1103
1104         detected = detected_hardware_string(hwinfo, TRUE);
1105
1106         fprintf(fplog, "%s\n", detected.c_str());
1107     }
1108
1109     if (MULTIMASTER(cr))
1110     {
1111         std::string detected;
1112
1113         detected = detected_hardware_string(hwinfo, FALSE);
1114
1115         fprintf(stderr, "%s\n", detected.c_str());
1116     }
1117
1118     /* Check the compiled SIMD instruction set against that of the node
1119      * with the lowest SIMD level support (skip if SIMD detection did not work)
1120      */
1121     if (cpuInfo.supportLevel() >= gmx::CpuInfo::SupportLevel::Features)
1122     {
1123         gmx::simdCheck(static_cast<gmx::SimdType>(hwinfo->simd_suggest_min), fplog, MULTIMASTER(cr));
1124     }
1125
1126     /* For RDTSCP we only check on our local node and skip the MPI reduction */
1127     check_use_of_rdtscp_on_this_cpu(mdlog, cpuInfo);
1128 }
1129
1130 bool hasUserSetGpuIds(const gmx_gpu_opt_t *gpu_opt)
1131 {
1132     return gpu_opt->gpu_id != nullptr;
1133 }
1134
1135 bool compatibleGpusFound(const gmx_gpu_info_t &gpu_info)
1136 {
1137     return gpu_info.n_dev_compatible > 0;
1138 }
1139
1140 void gmx_parse_gpu_ids(gmx_gpu_opt_t *gpu_opt)
1141 {
1142     if (!hasUserSetGpuIds(gpu_opt))
1143     {
1144         return;
1145     }
1146
1147     /* Parse a "plain" or comma-separated GPU ID string which contains a
1148      * sequence of digits corresponding to GPU IDs; the order will
1149      * indicate the process/tMPI thread - GPU assignment. */
1150     parse_digits_from_string(gpu_opt->gpu_id, &gpu_opt->n_dev_use, &gpu_opt->dev_use);
1151
1152     if (gpu_opt->n_dev_use == 0)
1153     {
1154         gmx_fatal(FARGS, "Empty GPU ID string encountered.\n"
1155                   "An empty, delimiter-free, or comma-separated sequence of valid numeric IDs of available GPUs is required.\n");
1156     }
1157 }
1158
1159 void gmx_hardware_info_free(gmx_hw_info_t *hwinfo)
1160 {
1161     int ret;
1162
1163     ret = tMPI_Thread_mutex_lock(&hw_info_lock);
1164     if (ret != 0)
1165     {
1166         gmx_fatal(FARGS, "Error locking hwinfo mutex: %s", strerror(errno));
1167     }
1168
1169     /* decrease the reference counter */
1170     n_hwinfo--;
1171
1172
1173     if (hwinfo != hwinfo_g)
1174     {
1175         gmx_incons("hwinfo < hwinfo_g");
1176     }
1177
1178     if (n_hwinfo < 0)
1179     {
1180         gmx_incons("n_hwinfo < 0");
1181     }
1182
1183     if (n_hwinfo == 0)
1184     {
1185         delete hwinfo_g->cpuInfo;
1186         delete hwinfo_g->hardwareTopology;
1187         free_gpu_info(&hwinfo_g->gpu_info);
1188         sfree(hwinfo_g);
1189     }
1190
1191     ret = tMPI_Thread_mutex_unlock(&hw_info_lock);
1192     if (ret != 0)
1193     {
1194         gmx_fatal(FARGS, "Error unlocking hwinfo mutex: %s", strerror(errno));
1195     }
1196 }