2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
37 * Defines helper and factory functionality for task assignment.
39 * Note that the GPU ID assignment could potentially match many
40 * different kinds of simulation setups, including ranks from multiple
41 * simulations, ranks from the same simulation, and/or ranks with duty
42 * only for particular tasks (e.g. PME-only ranks). Which GPU ID
43 * assignments are valid will naturally depend on the other run-time
44 * options given to mdrun, and the current capabilities of the
47 * \author Mark Abraham <mark.j.abraham@gmail.com>
48 * \ingroup module_taskassignment
52 #include "taskassignment.h"
59 #include "gromacs/hardware/hw_info.h"
60 #include "gromacs/mdtypes/commrec.h"
61 #include "gromacs/taskassignment/usergpuids.h"
62 #include "gromacs/utility/cstringutil.h"
63 #include "gromacs/utility/exceptions.h"
64 #include "gromacs/utility/fatalerror.h"
65 #include "gromacs/utility/gmxassert.h"
66 #include "gromacs/utility/gmxmpi.h"
67 #include "gromacs/utility/logger.h"
68 #include "gromacs/utility/stringutil.h"
69 #include "gromacs/utility/sysinfo.h"
71 #include "findallgputasks.h"
72 #include "reportgpuusage.h"
80 /*! \brief Build data structure of types of GPU tasks on a rank,
81 * together with the mapped GPU device IDs, for all GPU tasks on all
82 * the ranks of this node.
84 * \param[in] gpuTasksOnRanksOfThisNode For each rank on this node, the set of tasks
85 * that are eligible to run on GPUs.
86 * \param[in] gpuIds The user-supplied GPU IDs.
88 static GpuTaskAssignments
89 buildTaskAssignment(const GpuTasksOnRanks
&gpuTasksOnRanksOfThisNode
,
90 ArrayRef
<const int> gpuIds
)
92 GpuTaskAssignments
gpuTaskAssignmentOnRanksOfThisNode(gpuTasksOnRanksOfThisNode
.size());
94 // Loop over the ranks on this node, and the tasks on each
95 // rank. For each task, take the next device ID from those
96 // provided by the user, to build a vector of mappings of task to
97 // ID, for each rank on this node. Note that if there have not
98 // been any GPU tasks identified, then gpuIds can be empty.
99 auto currentGpuId
= gpuIds
.begin();
100 auto gpuTaskAssignmentOnRank
= gpuTaskAssignmentOnRanksOfThisNode
.begin();
101 for (const auto &gpuTasksOnRank
: gpuTasksOnRanksOfThisNode
)
103 gpuTaskAssignmentOnRank
->reserve(gpuTasksOnRank
.size());
104 for (const auto &gpuTaskType
: gpuTasksOnRank
)
106 GMX_RELEASE_ASSERT(currentGpuId
!= gpuIds
.end(), "Indexing out of range for GPU tasks");
107 gpuTaskAssignmentOnRank
->push_back({gpuTaskType
, *currentGpuId
});
110 GMX_RELEASE_ASSERT(gpuTaskAssignmentOnRank
->size() == gpuTasksOnRank
.size(),
111 "Mismatch in number of GPU tasks on a rank with the number of elements in the resulting task assignment");
112 ++gpuTaskAssignmentOnRank
;
115 return gpuTaskAssignmentOnRanksOfThisNode
;
118 /*! \brief Return whether a GPU device is shared between any ranks.
120 * Sharing GPUs among multiple ranks is possible via either user or
121 * automated selection. */
122 static bool isAnyGpuSharedBetweenRanks(const GpuTaskAssignments
&gpuTaskAssignments
)
124 // Loop over all ranks i, looking on all higher ranks j whether
125 // any tasks on them share GPU device IDs.
127 // TODO Should this functionality also consider whether tasks on
128 // the same rank are sharing a device?
129 for (size_t i
= 0; i
< gpuTaskAssignments
.size(); ++i
)
131 for (const auto &taskOnRankI
: gpuTaskAssignments
[i
])
133 for (size_t j
= i
+1; j
< gpuTaskAssignments
.size(); ++j
)
135 for (const auto &taskOnRankJ
: gpuTaskAssignments
[j
])
137 if (taskOnRankI
.deviceId_
== taskOnRankJ
.deviceId_
)
148 //! Logs to \c mdlog information that may help a user learn how to let mdrun make a task assignment that runs faster.
149 void logPerformanceHints(const MDLogger
&mdlog
,
150 size_t numCompatibleGpus
,
151 size_t numGpuTasksOnThisNode
,
152 const GpuTaskAssignments
&gpuTaskAssignments
)
154 if (numCompatibleGpus
> numGpuTasksOnThisNode
)
156 /* TODO In principle, this warning could be warranted only on
157 * some nodes, but we lack the infrastructure to do a good job
158 * of reporting that. */
159 GMX_LOG(mdlog
.warning
).asParagraph().
160 appendText("NOTE: You assigned the GPU tasks on a node such that some GPUs "
161 "available on that node are unused, which might not be optimal.");
164 if (isAnyGpuSharedBetweenRanks(gpuTaskAssignments
))
166 GMX_LOG(mdlog
.warning
).asParagraph().
167 appendText("NOTE: You assigned the same GPU ID(s) to multiple ranks, which is a good idea if you have measured the performance of alternatives.");
171 //! Counts all the GPU tasks on this node.
172 size_t countGpuTasksOnThisNode(const GpuTasksOnRanks
&gpuTasksOnRanksOfThisNode
)
174 size_t numGpuTasksOnThisNode
= 0;
175 for (const auto &gpuTasksOnRank
: gpuTasksOnRanksOfThisNode
)
177 numGpuTasksOnThisNode
+= gpuTasksOnRank
.size();
179 return numGpuTasksOnThisNode
;
182 //! Finds whether there is any task of \c queryTask in the tasks on the ranks of this node.
183 bool hasAnyTaskOfTypeOnThisNode(const GpuTasksOnRanks
&gpuTasksOnRanksOfThisNode
,
184 const GpuTask queryTask
)
186 for (const auto &gpuTasksOnRank
: gpuTasksOnRanksOfThisNode
)
188 for (const auto &gpuTask
: gpuTasksOnRank
)
190 if (queryTask
== gpuTask
)
201 GpuTaskAssignments::value_type
202 runTaskAssignment(const std::vector
<int> &gpuIdsToUse
,
203 const std::vector
<int> &userGpuTaskAssignment
,
204 const gmx_hw_info_t
&hardwareInfo
,
205 const MDLogger
&mdlog
,
207 const std::vector
<GpuTask
> &gpuTasksOnThisRank
)
209 /* Communicate among ranks on this node to find each task that can
210 * be executed on a GPU, on each rank. */
211 auto gpuTasksOnRanksOfThisNode
= findAllGpuTasksOnThisNode(gpuTasksOnThisRank
,
213 cr
->mpi_comm_physicalnode
);
214 auto numGpuTasksOnThisNode
= countGpuTasksOnThisNode(gpuTasksOnRanksOfThisNode
);
216 GpuTaskAssignments taskAssignmentOnRanksOfThisNode
;
219 // Use the GPU IDs from the user if they supplied
220 // them. Otherwise, choose from the compatible GPUs.
222 // GPU ID assignment strings, if provided, cover all the ranks
223 // on a node. If nodes or the process placement on them are
224 // heterogeneous, then the GMX_GPU_ID environment variable
225 // must be set by a user who also wishes to direct GPU ID
226 // assignment. Thus this implementation of task assignment
227 // can assume it has a GPU ID assignment appropriate for the
228 // node upon which its process is running.
230 // Valid GPU ID assignments are `an ordered set of digits that
231 // identify GPU device IDs (e.g. as understood by the GPU
232 // runtime, and subject to environment modification such as
233 // with CUDA_VISIBLE_DEVICES) that will be used for the
234 // GPU-suitable tasks on all of the ranks of that node.
235 ArrayRef
<const int> gpuIdsForTaskAssignment
;
236 std::vector
<int> generatedGpuIds
;
237 if (userGpuTaskAssignment
.empty())
239 ArrayRef
<const int> compatibleGpusToUse
= gpuIdsToUse
;
240 if (hasAnyTaskOfTypeOnThisNode(gpuTasksOnRanksOfThisNode
, GpuTask::Pme
))
242 // PP and PME tasks must run on the same device, so
243 // restrict the assignment to the first device. If
244 // there aren't any, then that error is handled later.
245 if (!compatibleGpusToUse
.empty())
247 compatibleGpusToUse
= compatibleGpusToUse
.subArray(0, 1);
250 generatedGpuIds
= makeGpuIds(compatibleGpusToUse
, numGpuTasksOnThisNode
);
251 gpuIdsForTaskAssignment
= generatedGpuIds
;
255 if (numGpuTasksOnThisNode
!= userGpuTaskAssignment
.size())
257 // TODO Decorating the message with hostname should be
258 // the job of an error-reporting module.
260 gmx_gethostname(host
, STRLEN
);
262 GMX_THROW(InconsistentInputError
263 (formatString("There were %zu GPU tasks assigned on node %s, but %zu GPU tasks were "
264 "identified, and these must match. Reconsider your GPU task assignment, "
265 "number of ranks, or your use of the -nb, -pme, and -npme options.", userGpuTaskAssignment
.size(),
266 host
, numGpuTasksOnThisNode
)));
268 // Did the user choose compatible GPUs?
269 checkUserGpuIds(hardwareInfo
.gpu_info
, gpuIdsToUse
, userGpuTaskAssignment
);
271 gpuIdsForTaskAssignment
= userGpuTaskAssignment
;
273 taskAssignmentOnRanksOfThisNode
=
274 buildTaskAssignment(gpuTasksOnRanksOfThisNode
, gpuIdsForTaskAssignment
);
277 catch (const std::exception
&ex
)
279 // TODO This implementation is quite similar to that of
280 // processExceptionAsFatalError (which implements
281 // GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR), but it is unclear
282 // how we should involve MPI in the implementation of error
284 if (cr
->rank_intranode
== 0)
286 printFatalErrorMessage(stderr
, ex
);
292 MPI_Barrier(cr
->mpi_comm_mysim
);
298 MPI_Barrier(cr
->ms
->mpi_comm_masters
);
302 gmx_exit_on_fatal_error(ExitType_Abort
, 1);
305 reportGpuUsage(mdlog
, !userGpuTaskAssignment
.empty(), taskAssignmentOnRanksOfThisNode
,
306 numGpuTasksOnThisNode
, cr
->nrank_intranode
, cr
->nnodes
> 1);
308 // If the user chose a task assignment, give them some hints where appropriate.
309 if (!userGpuTaskAssignment
.empty())
311 logPerformanceHints(mdlog
, gpuIdsToUse
.size(),
312 numGpuTasksOnThisNode
,
313 taskAssignmentOnRanksOfThisNode
);
316 return taskAssignmentOnRanksOfThisNode
[cr
->rank_intranode
];
318 // TODO There is no check that mdrun -nb gpu or -pme gpu or
319 // -gpu_id is actually being implemented such that nonbonded tasks
320 // are being run on compatible GPUs, on all applicable ranks. That
321 // would require communication.