Minor change to GPU compatibility reporting handling
[gromacs.git] / src / gromacs / hardware / hardwareassign.cpp
blob224864e16b479b6b0a40b1de806918b6d9db5fde
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2016,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
35 #include "gmxpre.h"
37 #include "hardwareassign.h"
39 #include "config.h"
41 #include <cstring>
43 #include <algorithm>
44 #include <string>
46 #include "gromacs/gmxlib/network.h"
47 #include "gromacs/gpu_utils/gpu_utils.h"
48 #include "gromacs/hardware/detecthardware.h"
49 #include "gromacs/hardware/gpu_hw_info.h"
50 #include "gromacs/mdtypes/commrec.h"
51 #include "gromacs/utility/basenetwork.h"
52 #include "gromacs/utility/exceptions.h"
53 #include "gromacs/utility/fatalerror.h"
54 #include "gromacs/utility/gmxassert.h"
55 #include "gromacs/utility/logger.h"
56 #include "gromacs/utility/programcontext.h"
57 #include "gromacs/utility/smalloc.h"
58 #include "gromacs/utility/stringutil.h"
59 #include "gromacs/utility/sysinfo.h"
61 #define HOSTNAMELEN 80
63 /*! \internal \brief
64 * Prints GPU information strings on this node into the stderr and log.
65 * Only used for logging errors in heterogenous MPI configurations.
67 static void print_gpu_detection_stats(const gmx::MDLogger &mdlog,
68 const gmx_gpu_info_t *gpu_info)
70 char onhost[HOSTNAMELEN+10];
71 int ngpu;
73 if (!gpu_info->bDetectGPUs)
75 /* We skipped the detection, so don't print detection stats */
76 return;
79 ngpu = gpu_info->n_dev;
81 /* We only print the detection on one, of possibly multiple, nodes */
82 std::strncpy(onhost, " on host ", 10);
83 gmx_gethostname(onhost + 9, HOSTNAMELEN);
85 if (ngpu > 0)
87 std::string gpuDesc = sprint_gpus(gpu_info);
88 GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
89 "%d GPU%s detected%s:\n%s",
90 ngpu, (ngpu > 1) ? "s" : "", onhost, gpuDesc.c_str());
92 else
94 GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted("No GPUs detected%s", onhost);
96 // FIXME: This currently only logs on the master rank, which defeats the purpose.
97 // A new MDLogger option is required for printing to stderr on all ranks.
98 // There is also a question of MPI reduction of the outputs, see Redmine issue #1505.
101 /*! \internal \brief
102 * This function is responsible for mapping the GPUs to the processes on a single node
103 * (filling the gpu_opt->dev_use array).
105 * \param[in,out] gpu_opt Input/output GPU assignment data.
106 * \param[in] nrank Number of PP GPU ranks on the node.
107 * \param[in] rank Index of PP GPU rank on the node.
109 * This selects the GPUs we will use. This is an operation local to each physical node.
110 * If we have less MPI ranks than GPUs, we will waste some GPUs.
112 static void assign_rank_gpu_ids(gmx_gpu_opt_t *gpu_opt, int nrank, int rank)
114 GMX_RELEASE_ASSERT(gpu_opt, "Invalid gpu_opt pointer passed");
115 GMX_RELEASE_ASSERT(nrank >= 1,
116 gmx::formatString("Invalid limit (%d) for the number of GPUs (detected %d compatible GPUs)",
117 rank, gpu_opt->n_dev_compatible).c_str());
119 if (gpu_opt->n_dev_compatible == 0)
121 char host[HOSTNAMELEN];
123 gmx_gethostname(host, HOSTNAMELEN);
124 gmx_fatal(FARGS, "A GPU was requested on host %s, but no compatible GPUs were detected. All nodes with PP ranks need to have GPUs. If you intended to use GPU acceleration in a parallel run, you can either avoid using the nodes that don't have GPUs or place PME ranks on these nodes.", host);
127 int nshare;
129 nshare = 1;
130 if (nrank > gpu_opt->n_dev_compatible)
132 if (nrank % gpu_opt->n_dev_compatible == 0)
134 nshare = gmx_gpu_sharing_supported() ? nrank/gpu_opt->n_dev_compatible : 1;
136 else
138 if (rank == 0)
140 gmx_fatal(FARGS, "The number of MPI ranks (%d) in a physical node is not a multiple of the number of GPUs (%d). Select a different number of MPI ranks or use the -gpu_id option to manually specify the GPU to be used.",
141 nrank, gpu_opt->n_dev_compatible);
144 #if GMX_MPI
145 /* We use a global barrier to prevent ranks from continuing with
146 * an invalid setup.
148 MPI_Barrier(MPI_COMM_WORLD);
149 #endif
153 /* Here we will waste GPUs when nrank < gpu_opt->n_dev_compatible */
154 gpu_opt->n_dev_use = std::min(gpu_opt->n_dev_compatible*nshare, nrank);
155 if (!gmx_multiple_gpu_per_node_supported())
157 gpu_opt->n_dev_use = std::min(gpu_opt->n_dev_use, 1);
159 snew(gpu_opt->dev_use, gpu_opt->n_dev_use);
160 for (int i = 0; i != gpu_opt->n_dev_use; ++i)
162 /* TODO: improve this implementation: either sort GPUs or remove the weakest here */
163 gpu_opt->dev_use[i] = gpu_opt->dev_compatible[i/nshare];
167 /*! \brief Return whether all selected GPUs are compatible.
169 * Given the list of selected GPU device IDs in \c gpu_opt and
170 * detected GPUs in \c gpu_info, return whether all selected GPUs are
171 * compatible. If not, place a suitable string in \c errorMessage.
173 * \param[in] gpu_info pointer to structure holding GPU information
174 * \param[in] gpu_opt pointer to structure holding GPU options
175 * \param[out] errorMessage pointer to string to hold a possible error message (is not updated when returning true)
176 * \returns true if every requested GPU is compatible
178 static bool checkGpuSelection(const gmx_gpu_info_t *gpu_info,
179 const gmx_gpu_opt_t *gpu_opt,
180 std::string *errorMessage)
182 GMX_ASSERT(gpu_info, "Invalid gpu_info");
184 bool allOK = true;
185 std::string message = "Some of the requested GPUs do not exist, behave strangely, or are not compatible:\n";
186 for (int i = 0; i < gpu_opt->n_dev_use; i++)
188 GMX_ASSERT(gpu_opt, "Invalid gpu_opt");
189 GMX_ASSERT(gpu_opt->dev_use, "Invalid gpu_opt->dev_use");
191 int id = gpu_opt->dev_use[i];
192 if (!isGpuCompatible(gpu_info, id))
194 allOK = false;
195 message += gmx::formatString(" GPU #%d: %s\n",
197 getGpuCompatibilityDescription(gpu_info, id));
200 if (!allOK && errorMessage)
202 *errorMessage = message;
204 return allOK;
207 /*! \brief Select the compatible GPUs
209 * This function filters gpu_info->gpu_dev for compatible gpus based
210 * on the previously run compatibility tests. Sets
211 * gpu_info->dev_compatible and gpu_info->n_dev_compatible.
213 * \param[in] gpu_info pointer to structure holding GPU information
214 * \param[out] gpu_opt pointer to structure holding GPU options
216 static void pickCompatibleGpus(const gmx_gpu_info_t *gpu_info,
217 gmx_gpu_opt_t *gpu_opt)
219 GMX_ASSERT(gpu_info, "Invalid gpu_info");
220 GMX_ASSERT(gpu_opt, "Invalid gpu_opt");
222 // Possible minor over-allocation here, but not important for anything
223 gpu_opt->n_dev_compatible = 0;
224 snew(gpu_opt->dev_compatible, gpu_info->n_dev);
225 for (int i = 0; i < gpu_info->n_dev; i++)
227 GMX_ASSERT(gpu_info->gpu_dev, "Invalid gpu_info->gpu_dev");
228 if (isGpuCompatible(gpu_info, i))
230 gpu_opt->dev_compatible[gpu_opt->n_dev_compatible] = i;
231 gpu_opt->n_dev_compatible++;
236 void gmx_select_rank_gpu_ids(const gmx::MDLogger &mdlog, const t_commrec *cr,
237 const gmx_gpu_info_t *gpu_info,
238 gmx_bool bForceUseGPU,
239 gmx_gpu_opt_t *gpu_opt)
241 /* Bail if binary is not compiled with GPU acceleration, but this is either
242 * explicitly (-nb gpu) or implicitly (gpu ID passed) requested. */
243 if (bForceUseGPU && (GMX_GPU == GMX_GPU_NONE))
245 gmx_fatal(FARGS, "GPU acceleration requested, but %s was compiled without GPU support!",
246 gmx::getProgramContext().displayName());
249 if (!(cr->duty & DUTY_PP))
251 /* Our rank is not doing PP, we don't use a GPU */
252 return;
255 if (gpu_opt->bUserSet)
257 /* Check the GPU IDs passed by the user.
258 * (GPU IDs have been parsed by gmx_parse_gpu_ids before)
260 std::string errorMessage;
261 if (!checkGpuSelection(gpu_info, gpu_opt, &errorMessage))
263 const bool canHaveHeterogeneousNodes = GMX_LIB_MPI && PAR(cr);
264 if (canHaveHeterogeneousNodes)
266 print_gpu_detection_stats(mdlog, gpu_info);
268 gmx_fatal(FARGS, errorMessage.c_str());
271 else if (getenv("GMX_EMULATE_GPU") == nullptr)
273 pickCompatibleGpus(gpu_info, gpu_opt);
274 assign_rank_gpu_ids(gpu_opt, cr->nrank_pp_intranode, cr->rank_pp_intranode);
277 /* If the user asked for a GPU, check whether we have a GPU */
278 if (bForceUseGPU && gpu_info->n_dev_compatible == 0)
280 gmx_fatal(FARGS, "GPU acceleration requested, but no compatible GPUs were detected.");