2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017 The GROMACS development team.
5 * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
6 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
7 * and including many others, as listed in the AUTHORS file in the
8 * top-level source directory and at http://www.gromacs.org.
10 * GROMACS is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public License
12 * as published by the Free Software Foundation; either version 2.1
13 * of the License, or (at your option) any later version.
15 * GROMACS is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with GROMACS; if not, see
22 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 * If you want to redistribute modifications to GROMACS, please
26 * consider that scientific software is very special. Version
27 * control is crucial - bugs must be traceable. We will be happy to
28 * consider code for inclusion in the official distribution, but
29 * derived work must not be called official GROMACS. Details are found
30 * in the README & COPYING files - if they are missing, get the
31 * official version at http://www.gromacs.org.
33 * To help us fund GROMACS development, we humbly ask that you cite
34 * the research papers on the package. Check out http://www.gromacs.org.
37 * \brief Defines the CUDA implementations of the device management.
39 * \author Artem Zhmurov <zhmurov@gmail.com>
41 * \ingroup module_hardware
45 #include "device_management.h"
49 #include "gromacs/gpu_utils/cudautils.cuh"
50 #include "gromacs/gpu_utils/device_context.h"
51 #include "gromacs/gpu_utils/device_stream.h"
52 #include "gromacs/utility/programcontext.h"
53 #include "gromacs/utility/smalloc.h"
56 * Max number of devices supported by CUDA (for consistency checking).
58 * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side.
60 static int cuda_max_device_count = 32;
62 /** Dummy kernel used for sanity checking. */
63 static __global__ void k_dummy_test(void) {}
65 static cudaError_t checkCompiledTargetCompatibility(int deviceId, const cudaDeviceProp& deviceProp)
67 cudaFuncAttributes attributes;
68 cudaError_t stat = cudaFuncGetAttributes(&attributes, k_dummy_test);
70 if (cudaErrorInvalidDeviceFunction == stat)
73 "\nWARNING: The %s binary does not include support for the CUDA architecture of "
74 "the GPU ID #%d (compute capability %d.%d) detected during detection. "
75 "By default, GROMACS supports all architectures of compute "
76 "capability >= 3.0, so your GPU "
77 "might be rare, or some architectures were disabled in the build. \n"
78 "Consult the install guide for how to use the GMX_CUDA_TARGET_SM and "
79 "GMX_CUDA_TARGET_COMPUTE CMake variables to add this architecture. \n",
80 gmx::getProgramContext().displayName(), deviceId, deviceProp.major, deviceProp.minor);
87 * \brief Runs GPU sanity checks.
89 * Runs a series of checks to determine that the given GPU and underlying CUDA
90 * driver/runtime functions properly.
92 * \todo Currently we do not make a distinction between the type of errors
93 * that can appear during functionality checks. This needs to be improved,
94 * e.g if the dummy test kernel fails to execute with a "device busy message"
95 * we should appropriately report that the device is busy instead of NonFunctional.
97 * \todo Introduce errors codes and handle errors more smoothly.
100 * \param[in] dev_id the device ID of the GPU or -1 if the device has already been initialized
101 * \param[in] dev_prop The device properties structure
102 * \returns 0 if the device looks OK, -1 if it sanity checks failed, and -2 if the device is busy
104 static DeviceStatus isDeviceFunctional(int dev_id, const cudaDeviceProp& dev_prop)
109 cu_err = cudaGetDeviceCount(&dev_count);
110 if (cu_err != cudaSuccess)
112 fprintf(stderr, "Error %d while querying device count: %s\n", cu_err, cudaGetErrorString(cu_err));
113 return DeviceStatus::NonFunctional;
116 /* no CUDA compatible device at all */
119 return DeviceStatus::NonFunctional;
122 /* things might go horribly wrong if cudart is not compatible with the driver */
123 if (dev_count < 0 || dev_count > cuda_max_device_count)
125 return DeviceStatus::NonFunctional;
128 if (dev_id == -1) /* device already selected let's not destroy the context */
130 cu_err = cudaGetDevice(&id);
131 if (cu_err != cudaSuccess)
133 fprintf(stderr, "Error %d while querying device id: %s\n", cu_err, cudaGetErrorString(cu_err));
134 return DeviceStatus::NonFunctional;
140 if (id > dev_count - 1) /* pfff there's no such device */
143 "The requested device with id %d does not seem to exist (device count=%d)\n",
145 return DeviceStatus::NonFunctional;
149 /* both major & minor is 9999 if no CUDA capable devices are present */
150 if (dev_prop.major == 9999 && dev_prop.minor == 9999)
152 return DeviceStatus::NonFunctional;
154 /* we don't care about emulation mode */
155 if (dev_prop.major == 0)
157 return DeviceStatus::NonFunctional;
162 cu_err = cudaSetDevice(id);
163 if (cu_err != cudaSuccess)
165 fprintf(stderr, "Error %d while switching to device #%d: %s\n", cu_err, id,
166 cudaGetErrorString(cu_err));
167 return DeviceStatus::NonFunctional;
171 cu_err = checkCompiledTargetCompatibility(dev_id, dev_prop);
172 // Avoid triggering an error if GPU devices are in exclusive or prohibited mode;
173 // it is enough to check for cudaErrorDevicesUnavailable only here because
174 // if we encounter it that will happen in cudaFuncGetAttributes in the above function.
175 if (cu_err == cudaErrorDevicesUnavailable)
177 return DeviceStatus::Unavailable;
179 else if (cu_err != cudaSuccess)
181 return DeviceStatus::NonFunctional;
184 /* try to execute a dummy kernel */
187 KernelLaunchConfig config;
188 config.blockSize[0] = 512;
189 const auto dummyArguments = prepareGpuKernelArguments(k_dummy_test, config);
190 DeviceInformation deviceInfo;
191 const DeviceContext deviceContext(deviceInfo);
192 const DeviceStream deviceStream(deviceContext, DeviceStreamPriority::Normal, false);
193 launchGpuKernel(k_dummy_test, config, deviceStream, nullptr, "Dummy kernel", dummyArguments);
195 catch (gmx::GromacsException& ex)
197 // launchGpuKernel error is not fatal and should continue with marking the device bad
199 "Error occurred while running dummy kernel sanity check on device #%d:\n %s\n", id,
200 formatExceptionMessageToString(ex).c_str());
201 return DeviceStatus::NonFunctional;
204 if (cudaDeviceSynchronize() != cudaSuccess)
206 return DeviceStatus::NonFunctional;
209 /* destroy context if we created one */
212 cu_err = cudaDeviceReset();
213 CU_RET_ERR(cu_err, "cudaDeviceReset failed");
216 return DeviceStatus::Compatible;
219 /*! \brief Returns true if the gpu characterized by the device properties is
220 * supported by the native gpu acceleration.
222 * \param[in] dev_prop the CUDA device properties of the gpus to test.
223 * \returns true if the GPU properties passed indicate a compatible
224 * GPU, otherwise false.
226 static bool is_gmx_supported_gpu(const cudaDeviceProp& dev_prop)
228 return (dev_prop.major >= 3);
231 /*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration.
233 * Returns a status value which indicates compatibility or one of the following
234 * errors: incompatibility or insanity (=unexpected behavior).
236 * As the error handling only permits returning the state of the GPU, this function
237 * does not clear the CUDA runtime API status allowing the caller to inspect the error
238 * upon return. Note that this also means it is the caller's responsibility to
239 * reset the CUDA runtime state.
241 * \param[in] deviceId the ID of the GPU to check.
242 * \param[in] deviceProp the CUDA device properties of the device checked.
243 * \returns the status of the requested device
245 static DeviceStatus checkDeviceStatus(int deviceId, const cudaDeviceProp& deviceProp)
247 if (!is_gmx_supported_gpu(deviceProp))
249 return DeviceStatus::Incompatible;
251 return isDeviceFunctional(deviceId, deviceProp);
254 bool isGpuDetectionFunctional(std::string* errorMessage)
257 int driverVersion = -1;
258 stat = cudaDriverGetVersion(&driverVersion);
259 GMX_ASSERT(stat != cudaErrorInvalidValue,
260 "An impossible null pointer was passed to cudaDriverGetVersion");
263 gmx::formatString("An unexpected value was returned from cudaDriverGetVersion %s: %s",
264 cudaGetErrorName(stat), cudaGetErrorString(stat))
266 bool foundDriver = (driverVersion > 0);
269 // Can't detect GPUs if there is no driver
270 if (errorMessage != nullptr)
272 errorMessage->assign("No valid CUDA driver found");
278 stat = cudaGetDeviceCount(&numDevices);
279 if (stat != cudaSuccess)
281 if (errorMessage != nullptr)
283 /* cudaGetDeviceCount failed which means that there is
284 * something wrong with the machine: driver-runtime
285 * mismatch, all GPUs being busy in exclusive mode,
286 * invalid CUDA_VISIBLE_DEVICES, or some other condition
287 * which should result in GROMACS issuing at least a
289 errorMessage->assign(cudaGetErrorString(stat));
292 // Consume the error now that we have prepared to handle
293 // it. This stops it reappearing next time we check for
294 // errors. Note that if CUDA_VISIBLE_DEVICES does not contain
295 // valid devices, then cudaGetLastError returns the
296 // (undocumented) cudaErrorNoDevice, but this should not be a
297 // problem as there should be no future CUDA API calls.
298 // NVIDIA bug report #2038718 has been filed.
304 // We don't actually use numDevices here, that's not the job of
309 void findGpus(gmx_gpu_info_t* gpu_info)
313 gpu_info->n_dev_compatible = 0;
316 cudaError_t stat = cudaGetDeviceCount(&ndev);
317 if (stat != cudaSuccess)
319 GMX_THROW(gmx::InternalError(
320 "Invalid call of findGpus() when CUDA API returned an error, perhaps "
321 "canDetectGpus() was not called appropriately beforehand."));
324 // We expect to start device support/sanity checks with a clean runtime error state
325 gmx::ensureNoPendingCudaError("");
327 DeviceInformation* devs;
329 for (int i = 0; i < ndev; i++)
332 memset(&prop, 0, sizeof(cudaDeviceProp));
333 stat = cudaGetDeviceProperties(&prop, i);
334 const DeviceStatus checkResult =
335 (stat != cudaSuccess) ? DeviceStatus::NonFunctional : checkDeviceStatus(i, prop);
339 devs[i].stat = checkResult;
341 if (checkResult == DeviceStatus::Compatible)
343 gpu_info->n_dev_compatible++;
348 // - we inspect the CUDA API state to retrieve and record any
349 // errors that occurred during is_gmx_supported_gpu_id() here,
350 // but this would be more elegant done within is_gmx_supported_gpu_id()
351 // and only return a string with the error if one was encountered.
352 // - we'll be reporting without rank information which is not ideal.
353 // - we'll end up warning also in cases where users would already
354 // get an error before mdrun aborts.
356 // Here we also clear the CUDA API error state so potential
357 // errors during sanity checks don't propagate.
358 if ((stat = cudaGetLastError()) != cudaSuccess)
360 gmx_warning("An error occurred while sanity checking device #%d; %s: %s",
361 devs[i].id, cudaGetErrorName(stat), cudaGetErrorString(stat));
366 stat = cudaPeekAtLastError();
367 GMX_RELEASE_ASSERT(stat == cudaSuccess,
368 gmx::formatString("We promise to return with clean CUDA state, but "
369 "non-success state encountered: %s: %s",
370 cudaGetErrorName(stat), cudaGetErrorString(stat))
373 gpu_info->n_dev = ndev;
374 gpu_info->deviceInfo = devs;
377 void init_gpu(const DeviceInformation* deviceInfo)
383 stat = cudaSetDevice(deviceInfo->id);
384 if (stat != cudaSuccess)
386 auto message = gmx::formatString("Failed to initialize GPU #%d", deviceInfo->id);
387 CU_RET_ERR(stat, message.c_str());
392 fprintf(stderr, "Initialized GPU ID #%d: %s\n", deviceInfo->id, deviceInfo->prop.name);
396 void free_gpu(const DeviceInformation* deviceInfo)
398 // One should only attempt to clear the device context when
399 // it has been used, but currently the only way to know that a GPU
400 // device was used is that deviceInfo will be non-null.
401 if (deviceInfo == nullptr)
411 stat = cudaGetDevice(&gpuid);
412 CU_RET_ERR(stat, "cudaGetDevice failed");
413 fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
416 stat = cudaDeviceReset();
417 if (stat != cudaSuccess)
419 gmx_warning("Failed to free GPU #%d: %s", deviceInfo->id, cudaGetErrorString(stat));
423 DeviceInformation* getDeviceInfo(const gmx_gpu_info_t& gpu_info, int deviceId)
425 if (deviceId < 0 || deviceId >= gpu_info.n_dev)
427 gmx_incons("Invalid GPU deviceId requested");
429 return &gpu_info.deviceInfo[deviceId];
432 void get_gpu_device_info_string(char* s, const gmx_gpu_info_t& gpu_info, int index)
436 if (index < 0 && index >= gpu_info.n_dev)
441 DeviceInformation* dinfo = &gpu_info.deviceInfo[index];
444 (dinfo->stat != DeviceStatus::Nonexistent && dinfo->stat != DeviceStatus::NonFunctional);
448 sprintf(s, "#%d: %s, stat: %s", dinfo->id, "N/A", c_deviceStateString[dinfo->stat]);
452 sprintf(s, "#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s", dinfo->id,
453 dinfo->prop.name, dinfo->prop.major, dinfo->prop.minor,
454 dinfo->prop.ECCEnabled ? "yes" : " no", c_deviceStateString[dinfo->stat]);
458 size_t sizeof_gpu_dev_info(void)
460 return sizeof(DeviceInformation);
463 DeviceStatus gpu_info_get_stat(const gmx_gpu_info_t& info, int index)
465 return info.deviceInfo[index].stat;