2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
37 #include "printhardware.h"
47 #include "gromacs/gpu_utils/gpu_utils.h"
48 #include "gromacs/hardware/cpuinfo.h"
49 #include "gromacs/hardware/gpu_hw_info.h"
50 #include "gromacs/hardware/hardwaretopology.h"
51 #include "gromacs/hardware/hw_info.h"
52 #include "gromacs/mdtypes/commrec.h"
53 #include "gromacs/simd/support.h"
54 #include "gromacs/utility/basedefinitions.h"
55 #include "gromacs/utility/basenetwork.h"
56 #include "gromacs/utility/cstringutil.h"
57 #include "gromacs/utility/fatalerror.h"
58 #include "gromacs/utility/gmxassert.h"
59 #include "gromacs/utility/logger.h"
60 #include "gromacs/utility/programcontext.h"
61 #include "gromacs/utility/stringutil.h"
62 #include "gromacs/utility/sysinfo.h"
64 //! Constant used to help minimize preprocessed code
65 static const bool bGPUBinary
= GMX_GPU
!= GMX_GPU_NONE
;
67 /*! \brief Return the number of PP rank pairs that share a GPU device between them.
69 * Sharing GPUs among multiple PP ranks is possible via either user or
70 * automated selection. */
71 static int gmx_count_gpu_dev_shared(const std::vector
<int> &gpuTaskAssignment
,
78 GMX_RELEASE_ASSERT(!gpuTaskAssignment
.empty(),
79 "The user cannot choose an empty set of GPU IDs, code is wrong somewhere");
80 size_t ngpu
= gpuTaskAssignment
.size();
82 for (size_t i
= 0; i
< ngpu
- 1; i
++)
84 for (size_t j
= i
+ 1; j
< ngpu
; j
++)
86 same_count
+= (gpuTaskAssignment
[i
] ==
87 gpuTaskAssignment
[j
]);
/*! \brief Count and return the number of unique GPUs (per node) selected.
 *
 * As sharing GPUs among multiple PP ranks is possible, the number of
 * GPUs used (per node) can be different from the number of GPU IDs
 * used.
 *
 * \param[in] gpuTaskAssignment  GPU device ID assigned to each PP rank on this node.
 * \returns the number of distinct device IDs in the assignment. */
static size_t gmx_count_gpu_dev_unique(const std::vector<int> &gpuTaskAssignment)
{
    // A set keeps only one copy of each device ID.
    std::set<int> uniqIds;
    for (const auto &deviceId : gpuTaskAssignment)
    {
        uniqIds.insert(deviceId);
    }
    return uniqIds.size();
}
112 * Returns the GPU information text, one GPU per line.
114 static std::string
sprint_gpus(const gmx_gpu_info_t
&gpu_info
)
117 std::vector
<std::string
> gpuStrings
;
118 for (int i
= 0; i
< gpu_info
.n_dev
; i
++)
120 get_gpu_device_info_string(stmp
, gpu_info
, i
);
121 gpuStrings
.push_back(gmx::formatString(" %s", stmp
));
123 return gmx::joinStrings(gpuStrings
, "\n");
126 void reportGpuUsage(const gmx::MDLogger
&mdlog
,
127 const gmx_gpu_info_t
&gpu_info
,
129 const std::vector
<int> &gpuTaskAssignment
,
133 if (gpuTaskAssignment
.empty())
140 std::string gpuIdsString
=
141 formatAndJoin(gpuTaskAssignment
, ",", gmx::StringFormatter("%d"));
142 size_t numGpusInUse
= gmx_count_gpu_dev_unique(gpuTaskAssignment
);
143 bool bPluralGpus
= numGpusInUse
> 1;
148 gmx_gethostname(host
, STRLEN
);
149 output
+= gmx::formatString("On host %s", host
);
151 output
+= gmx::formatString("%zu GPU%s %sselected for this run.\n"
152 "Mapping of GPU ID%s to the %d PP rank%s in this node: %s\n",
153 numGpusInUse
, bPluralGpus
? "s" : "",
154 userSetGpuIds
? "user-" : "auto-",
155 bPluralGpus
? "s" : "",
157 (numPpRanks
> 1) ? "s" : "",
158 gpuIdsString
.c_str());
161 int same_count
= gmx_count_gpu_dev_shared(gpuTaskAssignment
, userSetGpuIds
);
165 output
+= gmx::formatString("NOTE: You assigned %s to multiple ranks.\n",
166 same_count
> 1 ? "GPU IDs" : "a GPU ID");
169 if (static_cast<size_t>(gpu_info
.n_dev_compatible
) > numPpRanks
)
171 /* TODO In principle, this warning could be warranted only on
172 * ranks on some nodes, but we lack the infrastructure to do a
173 * good job of reporting that. */
174 output
+= gmx::formatString("NOTE: potentially sub-optimal launch configuration using fewer\n"
175 " PP ranks on a node than GPUs available on that node.\n");
178 /* NOTE: this print is only for and on one physical node */
179 GMX_LOG(mdlog
.warning
).appendText(output
);
182 /* Give a suitable fatal error or warning if the build configuration
183 and runtime CPU do not match. */
185 check_use_of_rdtscp_on_this_cpu(const gmx::MDLogger
&mdlog
,
186 const gmx::CpuInfo
&cpuInfo
)
188 bool binaryUsesRdtscp
= HAVE_RDTSCP
;
190 const char *programName
= gmx::getProgramContext().displayName();
192 if (cpuInfo
.supportLevel() < gmx::CpuInfo::SupportLevel::Features
)
194 if (binaryUsesRdtscp
)
196 GMX_LOG(mdlog
.warning
).asParagraph().appendTextFormatted(
197 "The %s executable was compiled to use the rdtscp CPU instruction. "
198 "We cannot detect the features of your current CPU, but will proceed anyway. "
199 "If you get a crash, rebuild GROMACS with the GMX_USE_RDTSCP=OFF CMake option.",
205 bool cpuHasRdtscp
= cpuInfo
.feature(gmx::CpuInfo::Feature::X86_Rdtscp
);
207 if (!cpuHasRdtscp
&& binaryUsesRdtscp
)
209 gmx_fatal(FARGS
, "The %s executable was compiled to use the rdtscp CPU instruction. "
210 "However, this is not supported by the current hardware and continuing would lead to a crash. "
211 "Please rebuild GROMACS with the GMX_USE_RDTSCP=OFF CMake option.",
215 if (cpuHasRdtscp
&& !binaryUsesRdtscp
)
217 GMX_LOG(mdlog
.warning
).asParagraph().appendTextFormatted(
218 "The current CPU can measure timings more accurately than the code in\n"
219 "%s was configured to use. This might affect your simulation\n"
220 "speed as accurate timings are needed for load-balancing.\n"
221 "Please consider rebuilding %s with the GMX_USE_RDTSCP=ON CMake option.",
222 programName
, programName
);
227 static std::string
detected_hardware_string(const gmx_hw_info_t
*hwinfo
,
232 const gmx::CpuInfo
&cpuInfo
= *hwinfo
->cpuInfo
;
233 const gmx::HardwareTopology
&hwTop
= *hwinfo
->hardwareTopology
;
235 s
= gmx::formatString("\n");
236 s
+= gmx::formatString("Running on %d node%s with total",
237 hwinfo
->nphysicalnode
,
238 hwinfo
->nphysicalnode
== 1 ? "" : "s");
239 if (hwinfo
->ncore_tot
> 0)
241 s
+= gmx::formatString(" %d cores,", hwinfo
->ncore_tot
);
243 s
+= gmx::formatString(" %d logical cores", hwinfo
->nhwthread_tot
);
244 if (hwinfo
->gpu_info
.bDetectGPUs
)
246 s
+= gmx::formatString(", %d compatible GPU%s",
247 hwinfo
->ngpu_compatible_tot
,
248 hwinfo
->ngpu_compatible_tot
== 1 ? "" : "s");
252 s
+= gmx::formatString(" (GPU detection deactivated)");
254 s
+= gmx::formatString("\n");
256 if (hwinfo
->nphysicalnode
> 1)
258 /* Print per node hardware feature counts */
259 if (hwinfo
->ncore_max
> 0)
261 s
+= gmx::formatString(" Cores per node: %2d", hwinfo
->ncore_min
);
262 if (hwinfo
->ncore_max
> hwinfo
->ncore_min
)
264 s
+= gmx::formatString(" - %2d", hwinfo
->ncore_max
);
266 s
+= gmx::formatString("\n");
268 s
+= gmx::formatString(" Logical cores per node: %2d", hwinfo
->nhwthread_min
);
269 if (hwinfo
->nhwthread_max
> hwinfo
->nhwthread_min
)
271 s
+= gmx::formatString(" - %2d", hwinfo
->nhwthread_max
);
273 s
+= gmx::formatString("\n");
276 s
+= gmx::formatString(" Compatible GPUs per node: %2d",
277 hwinfo
->ngpu_compatible_min
);
278 if (hwinfo
->ngpu_compatible_max
> hwinfo
->ngpu_compatible_min
)
280 s
+= gmx::formatString(" - %2d", hwinfo
->ngpu_compatible_max
);
282 s
+= gmx::formatString("\n");
283 if (hwinfo
->ngpu_compatible_tot
> 0)
285 if (hwinfo
->bIdenticalGPUs
)
287 s
+= gmx::formatString(" All nodes have identical type(s) of GPUs\n");
291 /* This message will also appear with identical GPU types
292 * when at least one node has no GPU.
294 s
+= gmx::formatString(" Different nodes have different type(s) and/or order of GPUs\n");
304 gmx_gethostname(host
, STRLEN
);
306 MPI_Comm_rank(MPI_COMM_WORLD
, &rank
);
308 // TODO Use a wrapper around MPI_Get_processor_name instead.
309 s
+= gmx::formatString("Hardware detected on host %s (the node of MPI rank %d):\n",
312 s
+= gmx::formatString("Hardware detected:\n");
314 s
+= gmx::formatString(" CPU info:\n");
316 s
+= gmx::formatString(" Vendor: %s\n", cpuInfo
.vendorString().c_str());
318 s
+= gmx::formatString(" Brand: %s\n", cpuInfo
.brandString().c_str());
322 s
+= gmx::formatString(" Family: %d Model: %d Stepping: %d\n",
323 cpuInfo
.family(), cpuInfo
.model(), cpuInfo
.stepping());
325 s
+= gmx::formatString(" Features:");
326 for (auto &f
: cpuInfo
.featureSet())
328 s
+= gmx::formatString(" %s", cpuInfo
.featureString(f
).c_str());;
330 s
+= gmx::formatString("\n");
333 s
+= gmx::formatString(" SIMD instructions most likely to fit this hardware: %s",
334 gmx::simdString(static_cast<gmx::SimdType
>(hwinfo
->simd_suggest_min
)).c_str());
336 if (hwinfo
->simd_suggest_max
> hwinfo
->simd_suggest_min
)
338 s
+= gmx::formatString(" - %s", gmx::simdString(static_cast<gmx::SimdType
>(hwinfo
->simd_suggest_max
)).c_str());
340 s
+= gmx::formatString("\n");
342 s
+= gmx::formatString(" SIMD instructions selected at GROMACS compile time: %s\n",
343 gmx::simdString(gmx::simdCompiled()).c_str());
345 s
+= gmx::formatString("\n");
347 s
+= gmx::formatString(" Hardware topology: ");
348 switch (hwTop
.supportLevel())
350 case gmx::HardwareTopology::SupportLevel::None
:
351 s
+= gmx::formatString("None\n");
353 case gmx::HardwareTopology::SupportLevel::LogicalProcessorCount
:
354 s
+= gmx::formatString("Only logical processor count\n");
356 case gmx::HardwareTopology::SupportLevel::Basic
:
357 s
+= gmx::formatString("Basic\n");
359 case gmx::HardwareTopology::SupportLevel::Full
:
360 s
+= gmx::formatString("Full\n");
362 case gmx::HardwareTopology::SupportLevel::FullWithDevices
:
363 s
+= gmx::formatString("Full, with devices\n");
367 if (!hwTop
.isThisSystem())
369 s
+= gmx::formatString(" NOTE: Hardware topology cached or synthetic, not detected.\n");
370 if (char *p
= std::getenv("HWLOC_XMLFILE"))
372 s
+= gmx::formatString(" HWLOC_XMLFILE=%s\n", p
);
378 if (hwTop
.supportLevel() >= gmx::HardwareTopology::SupportLevel::Basic
)
380 s
+= gmx::formatString(" Sockets, cores, and logical processors:\n");
382 for (auto &socket
: hwTop
.machine().sockets
)
384 s
+= gmx::formatString(" Socket %2d:", socket
.id
);
385 for (auto &c
: socket
.cores
)
387 s
+= gmx::formatString(" [");
388 for (auto &t
: c
.hwThreads
)
390 s
+= gmx::formatString(" %3d", t
.logicalProcessorId
);
392 s
+= gmx::formatString("]");
394 s
+= gmx::formatString("\n");
397 if (hwTop
.supportLevel() >= gmx::HardwareTopology::SupportLevel::Full
)
399 s
+= gmx::formatString(" Numa nodes:\n");
400 for (auto &n
: hwTop
.machine().numa
.nodes
)
402 s
+= gmx::formatString(" Node %2d (%" GMX_PRIu64
" bytes mem):", n
.id
, n
.memory
);
403 for (auto &l
: n
.logicalProcessorId
)
405 s
+= gmx::formatString(" %3d", l
);
407 s
+= gmx::formatString("\n");
409 s
+= gmx::formatString(" Latency:\n ");
410 for (std::size_t j
= 0; j
< hwTop
.machine().numa
.nodes
.size(); j
++)
412 s
+= gmx::formatString(" %5d", j
);
414 s
+= gmx::formatString("\n");
415 for (std::size_t i
= 0; i
< hwTop
.machine().numa
.nodes
.size(); i
++)
417 s
+= gmx::formatString(" %5d", i
);
418 for (std::size_t j
= 0; j
< hwTop
.machine().numa
.nodes
.size(); j
++)
420 s
+= gmx::formatString(" %5.2f", hwTop
.machine().numa
.relativeLatency
[i
][j
]);
422 s
+= gmx::formatString("\n");
426 s
+= gmx::formatString(" Caches:\n");
427 for (auto &c
: hwTop
.machine().caches
)
429 s
+= gmx::formatString(" L%d: %" GMX_PRIu64
" bytes, linesize %d bytes, assoc. %d, shared %d ways\n",
430 c
.level
, c
.size
, c
.linesize
, c
.associativity
, c
.shared
);
433 if (hwTop
.supportLevel() >= gmx::HardwareTopology::SupportLevel::FullWithDevices
)
435 s
+= gmx::formatString(" PCI devices:\n");
436 for (auto &d
: hwTop
.machine().devices
)
438 s
+= gmx::formatString(" %04x:%02x:%02x.%1x Id: %04x:%04x Class: 0x%04x Numa: %d\n",
439 d
.domain
, d
.bus
, d
.dev
, d
.func
, d
.vendorId
, d
.deviceId
, d
.classId
, d
.numaNodeId
);
444 if (bGPUBinary
&& (hwinfo
->ngpu_compatible_tot
> 0 ||
445 hwinfo
->gpu_info
.n_dev
> 0))
447 s
+= gmx::formatString(" GPU info:\n");
448 s
+= gmx::formatString(" Number of GPUs detected: %d\n",
449 hwinfo
->gpu_info
.n_dev
);
450 if (hwinfo
->gpu_info
.n_dev
> 0)
452 s
+= sprint_gpus(hwinfo
->gpu_info
) + "\n";
458 void gmx_print_detected_hardware(FILE *fplog
, const t_commrec
*cr
,
459 const gmx::MDLogger
&mdlog
,
460 const gmx_hw_info_t
*hwinfo
)
462 const gmx::CpuInfo
&cpuInfo
= *hwinfo
->cpuInfo
;
464 if (fplog
!= nullptr)
466 std::string detected
;
468 detected
= detected_hardware_string(hwinfo
, TRUE
);
470 fprintf(fplog
, "%s\n", detected
.c_str());
475 std::string detected
;
477 detected
= detected_hardware_string(hwinfo
, FALSE
);
479 fprintf(stderr
, "%s\n", detected
.c_str());
482 /* Check the compiled SIMD instruction set against that of the node
483 * with the lowest SIMD level support (skip if SIMD detection did not work)
485 if (cpuInfo
.supportLevel() >= gmx::CpuInfo::SupportLevel::Features
)
487 gmx::simdCheck(static_cast<gmx::SimdType
>(hwinfo
->simd_suggest_min
), fplog
, MULTIMASTER(cr
));
490 /* For RDTSCP we only check on our local node and skip the MPI reduction */
491 check_use_of_rdtscp_on_this_cpu(mdlog
, cpuInfo
);