Clarify data flow in GPU task assignment
[gromacs.git] / src / gromacs / hardware / printhardware.cpp
blobd4593ea298fff338e4e7866c703323011f49b7ae
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
#include "gmxpre.h"

#include "printhardware.h"

#include "config.h"

#include <cstdlib>

#include <algorithm>
#include <set>
#include <string>
#include <vector>

#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/hardware/cpuinfo.h"
#include "gromacs/hardware/gpu_hw_info.h"
#include "gromacs/hardware/hardwaretopology.h"
#include "gromacs/hardware/hw_info.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/simd/support.h"
#include "gromacs/utility/basedefinitions.h"
#include "gromacs/utility/basenetwork.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/logger.h"
#include "gromacs/utility/programcontext.h"
#include "gromacs/utility/stringutil.h"
#include "gromacs/utility/sysinfo.h"
64 //! Constant used to help minimize preprocessed code
65 static const bool bGPUBinary = GMX_GPU != GMX_GPU_NONE;
67 /*! \brief Return the number of PP rank pairs that share a GPU device between them.
69 * Sharing GPUs among multiple PP ranks is possible via either user or
70 * automated selection. */
71 static int gmx_count_gpu_dev_shared(const std::vector<int> &gpuTaskAssignment,
72 bool userSetGpuIds)
74 int same_count = 0;
76 if (userSetGpuIds)
78 GMX_RELEASE_ASSERT(!gpuTaskAssignment.empty(),
79 "The user cannot choose an empty set of GPU IDs, code is wrong somewhere");
80 size_t ngpu = gpuTaskAssignment.size();
82 for (size_t i = 0; i < ngpu - 1; i++)
84 for (size_t j = i + 1; j < ngpu; j++)
86 same_count += (gpuTaskAssignment[i] ==
87 gpuTaskAssignment[j]);
92 return same_count;
/*! \brief Count and return the number of unique GPUs (per node) selected.
 *
 * As sharing GPUs among multiple PP ranks is possible when the user passes
 * GPU IDs, the number of GPUs used (per node) can be different from the
 * number of GPU IDs selected.
 */
static size_t gmx_count_gpu_dev_unique(const std::vector<int> &userGpuTaskAssignment)
{
    // A std::set keeps exactly one copy of each distinct device ID.
    const std::set<int> uniqueIds(userGpuTaskAssignment.begin(),
                                  userGpuTaskAssignment.end());
    return uniqueIds.size();
}
111 /*! \internal \brief
112 * Returns the GPU information text, one GPU per line.
114 static std::string sprint_gpus(const gmx_gpu_info_t &gpu_info)
116 char stmp[STRLEN];
117 std::vector<std::string> gpuStrings;
118 for (int i = 0; i < gpu_info.n_dev; i++)
120 get_gpu_device_info_string(stmp, gpu_info, i);
121 gpuStrings.push_back(gmx::formatString(" %s", stmp));
123 return gmx::joinStrings(gpuStrings, "\n");
126 std::string
127 makeGpuUsageReport(const gmx_gpu_info_t &gpu_info,
128 bool userSetGpuIds,
129 const std::vector<int> &gpuTaskAssignment,
130 size_t numPpRanks,
131 bool bPrintHostName)
133 int ngpu_comp = gpu_info.n_dev_compatible;
134 char host[STRLEN];
136 if (bPrintHostName)
138 gmx_gethostname(host, STRLEN);
141 /* Issue a note if GPUs are available but not used */
142 if (ngpu_comp > 0 && gpuTaskAssignment.empty())
144 return gmx::formatString("%d compatible GPU%s detected in the system, but none will be used.\n"
145 "Consider trying GPU acceleration with the Verlet scheme!\n",
146 ngpu_comp, (ngpu_comp > 1) ? "s" : "");
149 std::string output;
151 std::string gpuIdsString =
152 formatAndJoin(gpuTaskAssignment, ",", gmx::StringFormatter("%d"));
153 size_t numGpusInUse = gmx_count_gpu_dev_unique(gpuTaskAssignment);
154 bool bPluralGpus = numGpusInUse > 1;
156 if (bPrintHostName)
158 output += gmx::formatString("On host %s", host);
160 output += gmx::formatString("%zu GPU%s %sselected for this run.\n"
161 "Mapping of GPU ID%s to the %d PP rank%s in this node: %s\n",
162 numGpusInUse, bPluralGpus ? "s" : "",
163 userSetGpuIds ? "user-" : "auto-",
164 bPluralGpus ? "s" : "",
165 numPpRanks,
166 (numPpRanks > 1) ? "s" : "",
167 gpuIdsString.c_str());
170 int same_count = gmx_count_gpu_dev_shared(gpuTaskAssignment, userSetGpuIds);
172 if (same_count > 0)
174 output += gmx::formatString("NOTE: You assigned %s to multiple ranks.\n",
175 same_count > 1 ? "GPU IDs" : "a GPU ID");
178 if (static_cast<size_t>(ngpu_comp) > numPpRanks)
180 /* TODO In principle, this warning could be warranted only on
181 * ranks on some nodes, but we lack the infrastructure to do a
182 * good job of reporting that. */
183 output += gmx::formatString("NOTE: potentially sub-optimal launch configuration using fewer\n"
184 " PP ranks on a node than GPUs available on that node.\n");
187 return output;
190 /* Give a suitable fatal error or warning if the build configuration
191 and runtime CPU do not match. */
192 static void
193 check_use_of_rdtscp_on_this_cpu(const gmx::MDLogger &mdlog,
194 const gmx::CpuInfo &cpuInfo)
196 bool binaryUsesRdtscp = HAVE_RDTSCP;
198 const char *programName = gmx::getProgramContext().displayName();
200 if (cpuInfo.supportLevel() < gmx::CpuInfo::SupportLevel::Features)
202 if (binaryUsesRdtscp)
204 GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
205 "The %s executable was compiled to use the rdtscp CPU instruction. "
206 "We cannot detect the features of your current CPU, but will proceed anyway. "
207 "If you get a crash, rebuild GROMACS with the GMX_USE_RDTSCP=OFF CMake option.",
208 programName);
211 else
213 bool cpuHasRdtscp = cpuInfo.feature(gmx::CpuInfo::Feature::X86_Rdtscp);
215 if (!cpuHasRdtscp && binaryUsesRdtscp)
217 gmx_fatal(FARGS, "The %s executable was compiled to use the rdtscp CPU instruction. "
218 "However, this is not supported by the current hardware and continuing would lead to a crash. "
219 "Please rebuild GROMACS with the GMX_USE_RDTSCP=OFF CMake option.",
220 programName);
223 if (cpuHasRdtscp && !binaryUsesRdtscp)
225 GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
226 "The current CPU can measure timings more accurately than the code in\n"
227 "%s was configured to use. This might affect your simulation\n"
228 "speed as accurate timings are needed for load-balancing.\n"
229 "Please consider rebuilding %s with the GMX_USE_RDTSCP=ON CMake option.",
230 programName, programName);
235 static std::string detected_hardware_string(const gmx_hw_info_t *hwinfo,
236 bool bFullCpuInfo)
238 std::string s;
240 const gmx::CpuInfo &cpuInfo = *hwinfo->cpuInfo;
241 const gmx::HardwareTopology &hwTop = *hwinfo->hardwareTopology;
243 s = gmx::formatString("\n");
244 s += gmx::formatString("Running on %d node%s with total",
245 hwinfo->nphysicalnode,
246 hwinfo->nphysicalnode == 1 ? "" : "s");
247 if (hwinfo->ncore_tot > 0)
249 s += gmx::formatString(" %d cores,", hwinfo->ncore_tot);
251 s += gmx::formatString(" %d logical cores", hwinfo->nhwthread_tot);
252 if (hwinfo->gpu_info.bDetectGPUs)
254 s += gmx::formatString(", %d compatible GPU%s",
255 hwinfo->ngpu_compatible_tot,
256 hwinfo->ngpu_compatible_tot == 1 ? "" : "s");
258 else if (bGPUBinary)
260 s += gmx::formatString(" (GPU detection deactivated)");
262 s += gmx::formatString("\n");
264 if (hwinfo->nphysicalnode > 1)
266 /* Print per node hardware feature counts */
267 if (hwinfo->ncore_max > 0)
269 s += gmx::formatString(" Cores per node: %2d", hwinfo->ncore_min);
270 if (hwinfo->ncore_max > hwinfo->ncore_min)
272 s += gmx::formatString(" - %2d", hwinfo->ncore_max);
274 s += gmx::formatString("\n");
276 s += gmx::formatString(" Logical cores per node: %2d", hwinfo->nhwthread_min);
277 if (hwinfo->nhwthread_max > hwinfo->nhwthread_min)
279 s += gmx::formatString(" - %2d", hwinfo->nhwthread_max);
281 s += gmx::formatString("\n");
282 if (bGPUBinary)
284 s += gmx::formatString(" Compatible GPUs per node: %2d",
285 hwinfo->ngpu_compatible_min);
286 if (hwinfo->ngpu_compatible_max > hwinfo->ngpu_compatible_min)
288 s += gmx::formatString(" - %2d", hwinfo->ngpu_compatible_max);
290 s += gmx::formatString("\n");
291 if (hwinfo->ngpu_compatible_tot > 0)
293 if (hwinfo->bIdenticalGPUs)
295 s += gmx::formatString(" All nodes have identical type(s) of GPUs\n");
297 else
299 /* This message will also appear with identical GPU types
300 * when at least one node has no GPU.
302 s += gmx::formatString(" Different nodes have different type(s) and/or order of GPUs\n");
308 #if GMX_LIB_MPI
309 int rank;
310 char host[STRLEN];
312 gmx_gethostname(host, STRLEN);
314 MPI_Comm_rank(MPI_COMM_WORLD, &rank);
316 // TODO Use a wrapper around MPI_Get_processor_name instead.
317 s += gmx::formatString("Hardware detected on host %s (the node of MPI rank %d):\n",
318 host, rank);
319 #else
320 s += gmx::formatString("Hardware detected:\n");
321 #endif
322 s += gmx::formatString(" CPU info:\n");
324 s += gmx::formatString(" Vendor: %s\n", cpuInfo.vendorString().c_str());
326 s += gmx::formatString(" Brand: %s\n", cpuInfo.brandString().c_str());
328 if (bFullCpuInfo)
330 s += gmx::formatString(" Family: %d Model: %d Stepping: %d\n",
331 cpuInfo.family(), cpuInfo.model(), cpuInfo.stepping());
333 s += gmx::formatString(" Features:");
334 for (auto &f : cpuInfo.featureSet())
336 s += gmx::formatString(" %s", cpuInfo.featureString(f).c_str());;
338 s += gmx::formatString("\n");
341 s += gmx::formatString(" SIMD instructions most likely to fit this hardware: %s",
342 gmx::simdString(static_cast<gmx::SimdType>(hwinfo->simd_suggest_min)).c_str());
344 if (hwinfo->simd_suggest_max > hwinfo->simd_suggest_min)
346 s += gmx::formatString(" - %s", gmx::simdString(static_cast<gmx::SimdType>(hwinfo->simd_suggest_max)).c_str());
348 s += gmx::formatString("\n");
350 s += gmx::formatString(" SIMD instructions selected at GROMACS compile time: %s\n",
351 gmx::simdString(gmx::simdCompiled()).c_str());
353 s += gmx::formatString("\n");
355 s += gmx::formatString(" Hardware topology: ");
356 switch (hwTop.supportLevel())
358 case gmx::HardwareTopology::SupportLevel::None:
359 s += gmx::formatString("None\n");
360 break;
361 case gmx::HardwareTopology::SupportLevel::LogicalProcessorCount:
362 s += gmx::formatString("Only logical processor count\n");
363 break;
364 case gmx::HardwareTopology::SupportLevel::Basic:
365 s += gmx::formatString("Basic\n");
366 break;
367 case gmx::HardwareTopology::SupportLevel::Full:
368 s += gmx::formatString("Full\n");
369 break;
370 case gmx::HardwareTopology::SupportLevel::FullWithDevices:
371 s += gmx::formatString("Full, with devices\n");
372 break;
375 if (!hwTop.isThisSystem())
377 s += gmx::formatString(" NOTE: Hardware topology cached or synthetic, not detected.\n");
378 if (char *p = std::getenv("HWLOC_XMLFILE"))
380 s += gmx::formatString(" HWLOC_XMLFILE=%s\n", p);
384 if (bFullCpuInfo)
386 if (hwTop.supportLevel() >= gmx::HardwareTopology::SupportLevel::Basic)
388 s += gmx::formatString(" Sockets, cores, and logical processors:\n");
390 for (auto &socket : hwTop.machine().sockets)
392 s += gmx::formatString(" Socket %2d:", socket.id);
393 for (auto &c : socket.cores)
395 s += gmx::formatString(" [");
396 for (auto &t : c.hwThreads)
398 s += gmx::formatString(" %3d", t.logicalProcessorId);
400 s += gmx::formatString("]");
402 s += gmx::formatString("\n");
405 if (hwTop.supportLevel() >= gmx::HardwareTopology::SupportLevel::Full)
407 s += gmx::formatString(" Numa nodes:\n");
408 for (auto &n : hwTop.machine().numa.nodes)
410 s += gmx::formatString(" Node %2d (%" GMX_PRIu64 " bytes mem):", n.id, n.memory);
411 for (auto &l : n.logicalProcessorId)
413 s += gmx::formatString(" %3d", l);
415 s += gmx::formatString("\n");
417 s += gmx::formatString(" Latency:\n ");
418 for (std::size_t j = 0; j < hwTop.machine().numa.nodes.size(); j++)
420 s += gmx::formatString(" %5d", j);
422 s += gmx::formatString("\n");
423 for (std::size_t i = 0; i < hwTop.machine().numa.nodes.size(); i++)
425 s += gmx::formatString(" %5d", i);
426 for (std::size_t j = 0; j < hwTop.machine().numa.nodes.size(); j++)
428 s += gmx::formatString(" %5.2f", hwTop.machine().numa.relativeLatency[i][j]);
430 s += gmx::formatString("\n");
434 s += gmx::formatString(" Caches:\n");
435 for (auto &c : hwTop.machine().caches)
437 s += gmx::formatString(" L%d: %" GMX_PRIu64 " bytes, linesize %d bytes, assoc. %d, shared %d ways\n",
438 c.level, c.size, c.linesize, c.associativity, c.shared);
441 if (hwTop.supportLevel() >= gmx::HardwareTopology::SupportLevel::FullWithDevices)
443 s += gmx::formatString(" PCI devices:\n");
444 for (auto &d : hwTop.machine().devices)
446 s += gmx::formatString(" %04x:%02x:%02x.%1x Id: %04x:%04x Class: 0x%04x Numa: %d\n",
447 d.domain, d.bus, d.dev, d.func, d.vendorId, d.deviceId, d.classId, d.numaNodeId);
452 if (bGPUBinary && (hwinfo->ngpu_compatible_tot > 0 ||
453 hwinfo->gpu_info.n_dev > 0))
455 s += gmx::formatString(" GPU info:\n");
456 s += gmx::formatString(" Number of GPUs detected: %d\n",
457 hwinfo->gpu_info.n_dev);
458 if (hwinfo->gpu_info.n_dev > 0)
460 s += sprint_gpus(hwinfo->gpu_info) + "\n";
463 return s;
466 void gmx_print_detected_hardware(FILE *fplog, const t_commrec *cr,
467 const gmx::MDLogger &mdlog,
468 const gmx_hw_info_t *hwinfo)
470 const gmx::CpuInfo &cpuInfo = *hwinfo->cpuInfo;
472 if (fplog != nullptr)
474 std::string detected;
476 detected = detected_hardware_string(hwinfo, TRUE);
478 fprintf(fplog, "%s\n", detected.c_str());
481 if (MULTIMASTER(cr))
483 std::string detected;
485 detected = detected_hardware_string(hwinfo, FALSE);
487 fprintf(stderr, "%s\n", detected.c_str());
490 /* Check the compiled SIMD instruction set against that of the node
491 * with the lowest SIMD level support (skip if SIMD detection did not work)
493 if (cpuInfo.supportLevel() >= gmx::CpuInfo::SupportLevel::Features)
495 gmx::simdCheck(static_cast<gmx::SimdType>(hwinfo->simd_suggest_min), fplog, MULTIMASTER(cr));
498 /* For RDTSCP we only check on our local node and skip the MPI reduction */
499 check_use_of_rdtscp_on_this_cpu(mdlog, cpuInfo);