2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2010,2011,2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * \brief Define functions for detection and initialization for CUDA devices.
38 * \author Szilard Pall <pall.szilard@gmail.com>
43 #include "gpu_utils.h"
51 #include <cuda_profiler_api.h>
53 #include "gromacs/gpu_utils/cudautils.cuh"
54 #include "gromacs/gpu_utils/pmalloc_cuda.h"
55 #include "gromacs/hardware/gpu_hw_info.h"
56 #include "gromacs/utility/basedefinitions.h"
57 #include "gromacs/utility/cstringutil.h"
58 #include "gromacs/utility/exceptions.h"
59 #include "gromacs/utility/fatalerror.h"
60 #include "gromacs/utility/gmxassert.h"
61 #include "gromacs/utility/logger.h"
62 #include "gromacs/utility/programcontext.h"
63 #include "gromacs/utility/smalloc.h"
64 #include "gromacs/utility/snprintf.h"
65 #include "gromacs/utility/stringutil.h"
/* Compile-time detection of NVML application-clock support.
 * NOTE(review): the `#if HAVE_NVML` / `#else` lines that should guard these
 * two alternative definitions appear to have been dropped by the extraction —
 * verify against the original file. */
69 #define HAVE_NVML_APPLICATION_CLOCKS (NVML_API_VERSION >= 6)
71 #define HAVE_NVML_APPLICATION_CLOCKS 0
72 #endif /* HAVE_NVML */
/* Warn (but do not abort) on an NVML API error; compiled to a no-op unless
 * both CHECK_CUDA_ERRORS and NVML application-clock support are enabled.
 * NOTE(review): the `do {` / brace continuation lines of this macro appear to
 * be missing from this view; the visible `\` continuations are kept verbatim. */
74 #if defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS
75 /*! Check for NVML error on the return status of a NVML API call. */
76 # define HANDLE_NVML_RET_ERR(status, msg) \
78 if (status != NVML_SUCCESS) \
80 gmx_warning("%s: %s\n", msg, nvmlErrorString(status)); \
83 #else /* defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS */
84 # define HANDLE_NVML_RET_ERR(status, msg) do { } while (0)
85 #endif /* defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS */
87 #if HAVE_NVML_APPLICATION_CLOCKS
/* True when this binary was built against an NVML version that can manage
 * application clocks. */
88 static const gmx_bool bCompiledWithApplicationClockSupport = true;
/* NOTE(review): the `#else` / `#endif` lines separating these two alternative
 * definitions appear to have been dropped from this view. */
90 static const gmx_bool gmx_unused bCompiledWithApplicationClockSupport = false;
94 * Max number of devices supported by CUDA (for consistency checking).
96 * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side.
/* Upper bound used to sanity-check cudaGetDeviceCount() results. */
98 static int cuda_max_device_count = 32;
/* True when mdrun runs under nvprof (which sets NVPROF_ID), in which case the
 * profiler-control API (cudaProfilerStart/Stop) should be used. */
100 static bool cudaProfilerRun = ((getenv("NVPROF_ID") != NULL));
/*! \brief Dummy kernel used for sanity checking.
 *
 * Never does any work; its attributes are queried (cudaFuncGetAttributes) and
 * it is launched with no arguments purely to verify that this binary contains
 * device code executable on the selected GPU.
 */
static __global__ void k_dummy_test(void)
{
}
/*! \brief Checks that the compiled binary contains device code usable on the
 * given device, aborting with a fatal error otherwise.
 *
 * Queries the attributes of the dummy kernel; cudaErrorInvalidDeviceFunction
 * means no matching cubin/PTX for this architecture was embedded. Also rejects
 * JIT-compiled 2.0 PTX on compute-capability >= 3.0 devices.
 *
 * \param[in] devInfo  Device info of the already-selected GPU.
 *
 * NOTE(review): the original listing was missing interior lines (braces and
 * the gmx_fatal call heads); this reconstruction also fixes a real defect —
 * stray `",` after three of the message lines turned what should be one
 * concatenated format string into extra varargs, misaligning the %s/%d
 * arguments passed to the error formatter.
 */
static void checkCompiledTargetCompatibility(const gmx_device_info_t *devInfo)
{
    cudaFuncAttributes attributes;
    cudaError_t        stat = cudaFuncGetAttributes(&attributes, k_dummy_test);

    if (cudaErrorInvalidDeviceFunction == stat)
    {
        gmx_fatal(FARGS,
                  "The %s binary does not include support for the CUDA architecture "
                  "of the selected GPU (device ID #%d, compute capability %d.%d). "
                  "By default, GROMACS supports all common architectures, so your GPU "
                  "might be rare, or some architectures were disabled in the build. "
                  "Consult the install guide for how to use the GMX_CUDA_TARGET_SM and "
                  "GMX_CUDA_TARGET_COMPUTE CMake variables to add this architecture.",
                  gmx::getProgramContext().displayName(), devInfo->id,
                  devInfo->prop.major, devInfo->prop.minor);
    }

    CU_RET_ERR(stat, "cudaFuncGetAttributes failed");

    if (devInfo->prop.major >= 3 && attributes.ptxVersion < 30)
    {
        gmx_fatal(FARGS,
                  "The GPU device code was compiled at runtime from 2.0 source which is "
                  "not compatible with the selected GPU (device ID #%d, compute capability %d.%d). "
                  "Pass the appropriate target in GMX_CUDA_TARGET_SM or a >=30 value to GMX_CUDA_TARGET_COMPUTE.",
                  devInfo->id,
                  devInfo->prop.major, devInfo->prop.minor);
    }
}
/*! \brief Returns whether \c h_ptr points to host memory that is page-locked
 * (pinned) through the CUDA runtime.
 *
 * NOTE(review): the `switch (stat)` head, the cudaSuccess case, the result
 * variable and the final return appear to be missing from this view; only the
 * cudaErrorInvalidValue branch and the unexpected-error fallback are visible.
 */
140 bool isHostMemoryPinned(const void *h_ptr)
142 cudaPointerAttributes memoryAttributes;
143 cudaError_t stat = cudaPointerGetAttributes(&memoryAttributes, h_ptr);
152 case cudaErrorInvalidValue:
153 // If the buffer was not pinned, then it will not be recognized by CUDA at all
155 // Reset the last error status
/* presumably cudaGetLastError() is invoked here to clear the sticky
 * cudaErrorInvalidValue — confirm against the original file */
160 CU_RET_ERR(stat, "Unexpected CUDA error");
166 * \brief Runs GPU sanity checks.
168 * Runs a series of checks to determine that the given GPU and underlying CUDA
169 * driver/runtime functions properly.
170 * Returns properties of a device with given ID or the one that has
171 * already been initialized earlier in the case of \dev_id == -1.
173 * \param[in] dev_id the device ID of the GPU or -1 if the device has already been initialized
174 * \param[out] dev_prop pointer to the structure in which the device properties will be returned
175 * \returns 0 if the device looks OK
177 * TODO: introduce errors codes and handle errors more smoothly.
/* Runs the sanity checks described in the doc comment above: queries the
 * device count and properties, selects the device, then launches and
 * synchronizes a dummy kernel to prove the device executes code.
 * NOTE(review): several lines (local declarations such as cu_err/dev_count/id,
 * the early `return` statements and most braces) appear to have been dropped
 * from this view; comments below describe only what is visible. */
179 static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop)
184 cu_err = cudaGetDeviceCount(&dev_count);
185 if (cu_err != cudaSuccess)
187 fprintf(stderr, "Error %d while querying device count: %s\n", cu_err,
188 cudaGetErrorString(cu_err));
192 /* no CUDA compatible device at all */
/* An out-of-range count indicates a runtime/driver mismatch. */
198 /* things might go horribly wrong if cudart is not compatible with the driver */
199 if (dev_count < 0 || dev_count > cuda_max_device_count)
/* dev_id == -1 means "use the already-selected device": query it rather than
 * switching, which would destroy the existing context. */
204 if (dev_id == -1) /* device already selected let's not destroy the context */
206 cu_err = cudaGetDevice(&id);
207 if (cu_err != cudaSuccess)
209 fprintf(stderr, "Error %d while querying device id: %s\n", cu_err,
210 cudaGetErrorString(cu_err));
217 if (id > dev_count - 1) /* pfff there's no such device */
219 fprintf(stderr, "The requested device with id %d does not seem to exist (device count=%d)\n",
225 memset(dev_prop, 0, sizeof(cudaDeviceProp));
226 cu_err = cudaGetDeviceProperties(dev_prop, id);
227 if (cu_err != cudaSuccess)
229 fprintf(stderr, "Error %d while querying device properties: %s\n", cu_err,
230 cudaGetErrorString(cu_err));
234 /* both major & minor is 9999 if no CUDA capable devices are present */
235 if (dev_prop->major == 9999 && dev_prop->minor == 9999)
239 /* we don't care about emulation mode */
240 if (dev_prop->major == 0)
247 cu_err = cudaSetDevice(id);
248 if (cu_err != cudaSuccess)
250 fprintf(stderr, "Error %d while switching to device #%d: %s\n",
251 cu_err, id, cudaGetErrorString(cu_err));
256 /* try to execute a dummy kernel */
257 KernelLaunchConfig config;
258 config.blockSize[0] = 512;
259 const auto dummyArguments = prepareGpuKernelArguments(k_dummy_test, config);
260 launchGpuKernel(k_dummy_test, config, nullptr, "Dummy kernel", dummyArguments);
/* NOTE(review): cudaThreadSynchronize() is deprecated in favor of
 * cudaDeviceSynchronize() — consider updating. */
261 if (cudaThreadSynchronize() != cudaSuccess)
266 /* destroy context if we created one */
269 cu_err = cudaDeviceReset();
270 CU_RET_ERR(cu_err, "cudaDeviceReset failed");
276 #if HAVE_NVML_APPLICATION_CLOCKS
277 /*! \brief Determines and adds the NVML device ID to the passed \cuda_dev.
279 * Determines and adds the NVML device ID to the passed \cuda_dev. This is done by
280 * matching PCI-E information from \cuda_dev with the available NVML devices.
282 * \param[in,out] cuda_dev CUDA device information to enrich with NVML device info
283 * \returns true if \cuda_dev could be enriched with matching NVML device information.
/* NOTE(review): braces and the loop's `break`/`continue` statements appear to
 * be missing from this view. */
285 static bool addNVMLDeviceId(gmx_device_info_t* cuda_dev)
287 nvmlDevice_t nvml_device_id;
288 unsigned int nvml_device_count = 0;
289 nvmlReturn_t nvml_stat = nvmlDeviceGetCount ( &nvml_device_count );
/* Set to true only when an NVML device whose PCI address matches this CUDA
 * device is found. */
290 bool nvmlWasInitialized = false;
291 HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetCount failed" );
292 for (unsigned int nvml_device_idx = 0; nvml_stat == NVML_SUCCESS && nvml_device_idx < nvml_device_count; ++nvml_device_idx)
294 nvml_stat = nvmlDeviceGetHandleByIndex ( nvml_device_idx, &nvml_device_id );
295 HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetHandleByIndex failed" );
296 if (nvml_stat != NVML_SUCCESS)
301 nvmlPciInfo_t nvml_pci_info;
302 nvml_stat = nvmlDeviceGetPciInfo ( nvml_device_id, &nvml_pci_info );
303 HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetPciInfo failed" );
304 if (nvml_stat != NVML_SUCCESS)
/* Match on the full PCI address: bus, device and domain must all agree. */
308 if (static_cast<unsigned int>(cuda_dev->prop.pciBusID) == nvml_pci_info.bus &&
309 static_cast<unsigned int>(cuda_dev->prop.pciDeviceID) == nvml_pci_info.device &&
310 static_cast<unsigned int>(cuda_dev->prop.pciDomainID) == nvml_pci_info.domain)
312 nvmlWasInitialized = true;
313 cuda_dev->nvml_device_id = nvml_device_id;
317 return nvmlWasInitialized;
320 /*! \brief Reads and returns the application clocks for device.
322 * \param[in] cuda_dev The GPU device
323 * \param[out] app_sm_clock The current application SM clock
324 * \param[out] app_mem_clock The current application memory clock
325 * \returns if application clocks are supported
/* NOTE(review): the `return false;` / `return true;` statements and braces
 * appear to be missing from this view. */
327 static bool getApplicationClocks(const gmx_device_info_t *cuda_dev,
328 unsigned int *app_sm_clock,
329 unsigned int *app_mem_clock)
331 nvmlReturn_t nvml_stat;
333 nvml_stat = nvmlDeviceGetApplicationsClock(cuda_dev->nvml_device_id, NVML_CLOCK_SM, app_sm_clock);
/* NVML_ERROR_NOT_SUPPORTED means this device has no application clocks. */
334 if (NVML_ERROR_NOT_SUPPORTED == nvml_stat)
338 HANDLE_NVML_RET_ERR(nvml_stat, "nvmlDeviceGetApplicationsClock failed for NVIDIA_CLOCK_SM");
339 nvml_stat = nvmlDeviceGetApplicationsClock(cuda_dev->nvml_device_id, NVML_CLOCK_MEM, app_mem_clock);
340 HANDLE_NVML_RET_ERR(nvml_stat, "nvmlDeviceGetApplicationsClock failed for NVIDIA_CLOCK_MEM");
344 #endif /* HAVE_NVML_APPLICATION_CLOCKS */
346 /*! \brief Tries to set application clocks for the GPU with the given index.
348 * Application clocks are set to the max supported value to increase
349 * performance if application clock permissions allow this. For future
350 * GPU architectures a more sophisticated scheme might be required.
352 * \todo Refactor this into a detection phase and a work phase. Also
353 * refactor to remove compile-time dependence on logging header.
355 * \param mdlog log file to write to
356 * \param[in] cuda_dev GPU device info for the GPU in use
357 * \returns true if no error occurs during application clocks handling.
/* NOTE(review): many lines of this function (braces, early `return`s, the
 * declaration of `env`, and the #if HAVE_NVML / #if HAVE_NVML_APPLICATION_CLOCKS
 * preprocessor guards around the NVML-only section) appear to have been
 * dropped from this view; comments below describe only what is visible. */
359 static gmx_bool init_gpu_application_clocks(
360 const gmx::MDLogger &mdlog,
361 gmx_device_info_t *cuda_dev)
363 const cudaDeviceProp *prop = &cuda_dev->prop;
364 int cuda_compute_capability = prop->major * 10 + prop->minor;
/* Only Tesla (CC >= 3.5) and Quadro (CC >= 5.2) boards are treated as having
 * settable application clocks. */
365 gmx_bool bGpuCanUseApplicationClocks =
366 ((0 == gmx_wcmatch("*Tesla*", prop->name) && cuda_compute_capability >= 35 ) ||
367 (0 == gmx_wcmatch("*Quadro*", prop->name) && cuda_compute_capability >= 52 ));
368 if (!bGpuCanUseApplicationClocks)
373 GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
374 "NOTE: GROMACS was configured without NVML support hence it can not exploit\n"
375 " application clocks of the detected %s GPU to improve performance.\n"
376 " Recompile with the NVML library (compatible with the driver used) or set application clocks manually.",
380 if (!bCompiledWithApplicationClockSupport)
382 GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
383 "NOTE: GROMACS was compiled with an old NVML library which does not support\n"
384 " managing application clocks of the detected %s GPU to improve performance.\n"
385 " If your GPU supports application clocks, upgrade NVML (and driver) and recompile or set the clocks manually.",
390 /* We've compiled with NVML application clocks support, and have a GPU that can use it */
391 nvmlReturn_t nvml_stat = NVML_SUCCESS;
393 //TODO: GMX_GPU_APPLICATION_CLOCKS is currently only used to enable/disable setting of application clocks
394 // this variable can be later used to give a user more fine grained control.
395 env = getenv("GMX_GPU_APPLICATION_CLOCKS");
396 if (env != NULL && ( strcmp( env, "0") == 0 ||
397 gmx_strcasecmp( env, "OFF") == 0 ||
398 gmx_strcasecmp( env, "DISABLE") == 0 ))
402 nvml_stat = nvmlInit();
403 HANDLE_NVML_RET_ERR( nvml_stat, "nvmlInit failed." );
404 if (nvml_stat != NVML_SUCCESS)
409 if (!addNVMLDeviceId(cuda_dev))
413 //get current application clocks setting
414 if (!getApplicationClocks(cuda_dev,
415 &cuda_dev->nvml_orig_app_sm_clock,
416 &cuda_dev->nvml_orig_app_mem_clock))
420 //get max application clocks
421 unsigned int max_sm_clock = 0;
422 unsigned int max_mem_clock = 0;
423 nvml_stat = nvmlDeviceGetMaxClockInfo(cuda_dev->nvml_device_id, NVML_CLOCK_SM, &max_sm_clock);
424 HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetMaxClockInfo failed" );
425 nvml_stat = nvmlDeviceGetMaxClockInfo(cuda_dev->nvml_device_id, NVML_CLOCK_MEM, &max_mem_clock);
426 HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetMaxClockInfo failed" );
428 cuda_dev->nvml_is_restricted = NVML_FEATURE_ENABLED;
429 cuda_dev->nvml_app_clocks_changed = false;
/* If the SM clock is already at (or above) the maximum, just report the
 * current values. */
431 if (cuda_dev->nvml_orig_app_sm_clock >= max_sm_clock)
433 //TODO: This should probably be integrated into the GPU Properties table.
434 GMX_LOG(mdlog.info).appendTextFormatted(
435 "Application clocks (GPU clocks) for %s are (%d,%d)",
436 cuda_dev->prop.name, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock);
440 if (cuda_compute_capability >= 60)
442 // Only warn about not being able to change clocks if they are not already at the max values
443 if (max_mem_clock > cuda_dev->nvml_orig_app_mem_clock || max_sm_clock > cuda_dev->nvml_orig_app_sm_clock)
445 GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
446 "Cannot change application clocks for %s to optimal values due to insufficient permissions. Current values are (%d,%d), max values are (%d,%d).\nPlease contact your admin to change application clocks.\n",
447 cuda_dev->prop.name, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock, max_mem_clock, max_sm_clock);
452 nvml_stat = nvmlDeviceGetAPIRestriction(cuda_dev->nvml_device_id, NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS, &(cuda_dev->nvml_is_restricted));
453 HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetAPIRestriction failed" );
455 if (nvml_stat != NVML_SUCCESS)
457 GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
458 "Cannot change GPU application clocks to optimal values due to NVML error (%d): %s.",
459 nvml_stat, nvmlErrorString(nvml_stat));
463 if (cuda_dev->nvml_is_restricted != NVML_FEATURE_DISABLED)
465 // Only warn about not being able to change clocks if they are not already at the max values
466 if (max_mem_clock > cuda_dev->nvml_orig_app_mem_clock || max_sm_clock > cuda_dev->nvml_orig_app_sm_clock)
468 GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
469 "Cannot change application clocks for %s to optimal values due to insufficient permissions. Current values are (%d,%d), max values are (%d,%d).\nUse sudo nvidia-smi -acp UNRESTRICTED or contact your admin to change application clocks.",
470 cuda_dev->prop.name, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock, max_mem_clock, max_sm_clock);
475 /* Note: Distinguishing between different types of GPUs here might be necessary in the future,
476 e.g. if max application clocks should not be used for certain GPUs. */
477 GMX_LOG(mdlog.warning).appendTextFormatted(
478 "Changing GPU application clocks for %s to (%d,%d)",
479 cuda_dev->prop.name, max_mem_clock, max_sm_clock);
480 nvml_stat = nvmlDeviceSetApplicationsClocks(cuda_dev->nvml_device_id, max_mem_clock, max_sm_clock);
481 HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetApplicationsClock failed" );
/* Record what was set so reset_gpu_application_clocks() can restore the
 * original values later. */
482 cuda_dev->nvml_app_clocks_changed = true;
483 cuda_dev->nvml_set_app_sm_clock = max_sm_clock;
484 cuda_dev->nvml_set_app_mem_clock = max_mem_clock;
487 #endif /* HAVE_NVML */
490 /*! \brief Resets application clocks if changed and cleans up NVML for the passed \gpu_dev.
492 * \param[in] gpu_dev CUDA device information
/* NOTE(review): braces, the head of the condition that lines 502-503 continue,
 * and the `return true;` of the non-NVML branch appear to be missing from
 * this view. */
494 static gmx_bool reset_gpu_application_clocks(const gmx_device_info_t gmx_unused * cuda_dev)
496 #if !HAVE_NVML_APPLICATION_CLOCKS
497 GMX_UNUSED_VALUE(cuda_dev);
499 #else /* HAVE_NVML_APPLICATION_CLOCKS */
500 nvmlReturn_t nvml_stat = NVML_SUCCESS;
/* Only restore clocks if we were allowed to change them and actually did. */
502 cuda_dev->nvml_is_restricted == NVML_FEATURE_DISABLED &&
503 cuda_dev->nvml_app_clocks_changed)
505 /* Check if the clocks are still what we set them to.
506 * If so, set them back to the state we originally found them in.
507 * If not, don't touch them, because something else set them later.
509 unsigned int app_sm_clock, app_mem_clock;
510 getApplicationClocks(cuda_dev, &app_sm_clock, &app_mem_clock);
511 if (app_sm_clock == cuda_dev->nvml_set_app_sm_clock &&
512 app_mem_clock == cuda_dev->nvml_set_app_mem_clock)
514 nvml_stat = nvmlDeviceSetApplicationsClocks(cuda_dev->nvml_device_id, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock);
515 HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceSetApplicationsClock failed" );
518 nvml_stat = nvmlShutdown();
519 HANDLE_NVML_RET_ERR( nvml_stat, "nvmlShutdown failed" );
520 return (nvml_stat == NVML_SUCCESS);
521 #endif /* HAVE_NVML_APPLICATION_CLOCKS */
/* Initializes (selects) the GPU described by \c deviceInfo, verifies that the
 * binary contains device code for it, and tries to set application clocks.
 * NOTE(review): the declaration of `stat`, braces, and a possible debug-only
 * guard around the fprintf appear to be missing from this view. */
524 void init_gpu(const gmx::MDLogger &mdlog,
525 gmx_device_info_t *deviceInfo)
531 stat = cudaSetDevice(deviceInfo->id);
532 if (stat != cudaSuccess)
534 auto message = gmx::formatString("Failed to initialize GPU #%d", deviceInfo->id);
535 CU_RET_ERR(stat, message.c_str());
540 fprintf(stderr, "Initialized GPU ID #%d: %s\n", deviceInfo->id, deviceInfo->prop.name);
/* Abort early (fatal) if the binary has no device code for this GPU. */
543 checkCompiledTargetCompatibility(deviceInfo);
545 //Ignoring return value as NVML errors should be treated not critical.
546 init_gpu_application_clocks(mdlog, deviceInfo);
/* Releases the GPU described by \c deviceInfo: resets application clocks and
 * destroys this thread's CUDA context via cudaDeviceReset(). Safe no-op when
 * \c deviceInfo is null (no GPU was used).
 * NOTE(review): the declarations of `stat`/`gpuid`, an early `return`, and
 * braces appear to be missing from this view. */
549 void free_gpu(const gmx_device_info_t *deviceInfo)
551 // One should only attempt to clear the device context when
552 // it has been used, but currently the only way to know that a GPU
553 // device was used is that deviceInfo will be non-null.
554 if (deviceInfo == nullptr)
564 stat = cudaGetDevice(&gpuid);
565 CU_RET_ERR(stat, "cudaGetDevice failed");
566 fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
/* Clock-reset failure is reported but not fatal during cleanup. */
569 if (!reset_gpu_application_clocks(deviceInfo))
571 gmx_warning("Failed to reset GPU application clocks on GPU #%d", deviceInfo->id);
574 stat = cudaDeviceReset();
575 if (stat != cudaSuccess)
577 gmx_warning("Failed to free GPU #%d: %s", deviceInfo->id, cudaGetErrorString(stat));
/*! \brief Returns a pointer to the device info for \c deviceId, with a
 * range check.
 *
 * \param[in] gpu_info  Information about all detected devices.
 * \param[in] deviceId  Index of the device to return info for.
 * \returns             Pointer to the device's info struct.
 *
 * Calls gmx_incons (inconsistency fatal error) for an out-of-range id.
 * NOTE(review): the parameter line was truncated in the provided listing; the
 * name `deviceId` is reconstructed from its uses in the body — confirm.
 */
gmx_device_info_t *getDeviceInfo(const gmx_gpu_info_t &gpu_info,
                                 int                   deviceId)
{
    if (deviceId < 0 || deviceId >= gpu_info.n_dev)
    {
        gmx_incons("Invalid GPU deviceId requested");
    }
    return &gpu_info.gpu_dev[deviceId];
}
/*! \brief Returns true if the gpu characterized by the device properties is
 * supported by the native gpu acceleration.
 *
 * \param[in] dev_prop  The CUDA device properties of the gpu to test.
 * \returns  true if the GPU properties passed indicate a compatible GPU,
 *           otherwise false.
 *
 * The only criterion applied here is compute capability >= 2.0 (Fermi).
 */
static bool is_gmx_supported_gpu(const cudaDeviceProp *dev_prop)
{
    return (dev_prop->major >= 2);
}
603 /*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration.
605 * Returns a status value which indicates compatibility or one of the following
606 * errors: incompatibility, nonexistence, or insanity (=unexpected behavior).
607 * It also returns the respective device's properties in \dev_prop (if applicable).
609 * As the error handling only permits returning the state of the GPU, this function
610 * does not clear the CUDA runtime API status allowing the caller to inspect the error
611 * upon return. Note that this also means it is the caller's responsibility to
612 * reset the CUDA runtime state.
614 * \param[in] dev_id the ID of the GPU to check.
615 * \param[out] dev_prop the CUDA device properties of the device checked.
616 * \returns the status of the requested device
/* NOTE(review): the declarations of `stat`/`ndev`, the insane/unavailable
 * return paths and braces appear to be missing from this view. */
618 static int is_gmx_supported_gpu_id(int dev_id, cudaDeviceProp *dev_prop)
623 stat = cudaGetDeviceCount(&ndev);
624 if (stat != cudaSuccess)
629 if (dev_id > ndev - 1)
631 return egpuNonexistent;
634 /* TODO: currently we do not make a distinction between the type of errors
635 * that can appear during sanity checks. This needs to be improved, e.g if
636 * the dummy test kernel fails to execute with a "device busy message" we
637 * should appropriately report that the device is busy instead of insane.
/* do_sanity_checks() returning 0 means the device is usable; the
 * compute-capability check then decides compatible vs incompatible. */
639 if (do_sanity_checks(dev_id, dev_prop) == 0)
641 if (is_gmx_supported_gpu(dev_prop))
643 return egpuCompatible;
647 return egpuIncompatible;
/* Returns whether GPU detection can be attempted: a CUDA driver must be
 * present and cudaGetDeviceCount() must succeed. On failure an explanation is
 * written to *errorMessage (when non-null).
 * NOTE(review): the declaration of `stat`, several `return` statements, and
 * the trailing cudaGetLastError() call that the final comment describes
 * appear to be missing from this view. */
656 bool canDetectGpus(std::string *errorMessage)
659 int driverVersion = -1;
660 stat = cudaDriverGetVersion(&driverVersion);
661 GMX_ASSERT(stat != cudaErrorInvalidValue, "An impossible null pointer was passed to cudaDriverGetVersion");
662 GMX_RELEASE_ASSERT(stat == cudaSuccess,
663 gmx::formatString("An unexpected value was returned from cudaDriverGetVersion %s: %s",
664 cudaGetErrorName(stat), cudaGetErrorString(stat)).c_str());
/* cudaDriverGetVersion() reports 0 when no driver is installed. */
665 bool foundDriver = (driverVersion > 0);
668 // Can't detect GPUs if there is no driver
669 if (errorMessage != nullptr)
671 errorMessage->assign("No valid CUDA driver found");
677 stat = cudaGetDeviceCount(&numDevices);
678 if (stat != cudaSuccess)
680 if (errorMessage != nullptr)
682 /* cudaGetDeviceCount failed which means that there is
683 * something wrong with the machine: driver-runtime
684 * mismatch, all GPUs being busy in exclusive mode,
685 * invalid CUDA_VISIBLE_DEVICES, or some other condition
686 * which should result in GROMACS issuing a warning and
687 * falling back to CPUs. */
688 errorMessage->assign(cudaGetErrorString(stat));
691 // Consume the error now that we have prepared to handle
692 // it. This stops it reappearing next time we check for
693 // errors. Note that if CUDA_VISIBLE_DEVICES does not contain
694 // valid devices, then cudaGetLastError returns the
695 // (undocumented) cudaErrorNoDevice, but this should not be a
696 // problem as there should be no future CUDA API calls.
697 // NVIDIA bug report #2038718 has been filed.
703 // We don't actually use numDevices here, that's not the job of
/* Detects all CUDA devices, runs support/sanity checks on each one, and fills
 * in \c gpu_info with the per-device status. Promises to return with a clean
 * CUDA error state.
 * NOTE(review): the allocation of `devs`, the per-device copy of `prop` into
 * devs[i], the declarations of `stat`/`prop`, and several braces appear to be
 * missing from this view. */
708 void findGpus(gmx_gpu_info_t *gpu_info)
710 int i, ndev, checkres;
713 gmx_device_info_t *devs;
717 gpu_info->n_dev_compatible = 0;
722 stat = cudaGetDeviceCount(&ndev);
723 if (stat != cudaSuccess)
725 GMX_THROW(gmx::InternalError("Invalid call of findGpus() when CUDA API returned an error, perhaps "
726 "canDetectGpus() was not called appropriately beforehand."));
729 // We expect to start device support/sanity checks with a clean runtime error state
730 gmx::ensureNoPendingCudaError("");
733 for (i = 0; i < ndev; i++)
735 checkres = is_gmx_supported_gpu_id(i, &prop);
739 devs[i].stat = checkres;
741 if (checkres == egpuCompatible)
743 gpu_info->n_dev_compatible++;
748 // - we inspect the CUDA API state to retrieve and record any
749 // errors that occurred during is_gmx_supported_gpu_id() here,
750 // but this would be more elegant done within is_gmx_supported_gpu_id()
751 // and only return a string with the error if one was encountered.
752 // - we'll be reporting without rank information which is not ideal.
753 // - we'll end up warning also in cases where users would already
754 // get an error before mdrun aborts.
756 // Here we also clear the CUDA API error state so potential
757 // errors during sanity checks don't propagate.
758 if ((stat = cudaGetLastError()) != cudaSuccess)
760 gmx_warning(gmx::formatString("An error occurred while sanity checking device #%d; %s: %s",
761 devs[i].id, cudaGetErrorName(stat), cudaGetErrorString(stat)).c_str());
765 GMX_RELEASE_ASSERT(cudaSuccess == cudaPeekAtLastError(), "We promise to return with clean CUDA state!");
767 gpu_info->n_dev = ndev;
768 gpu_info->gpu_dev = devs;
/*! \brief Returns the indices of all devices in \c gpu_info whose detection
 * status is egpuCompatible.
 *
 * \param[in] gpu_info  Information about all detected devices.
 * \returns             Vector of compatible device indices (possibly empty).
 */
std::vector<int> getCompatibleGpus(const gmx_gpu_info_t &gpu_info)
{
    // Possible minor over-allocation here, but not important for anything
    std::vector<int> compatibleGpus;
    compatibleGpus.reserve(gpu_info.n_dev);
    for (int i = 0; i < gpu_info.n_dev; i++)
    {
        assert(gpu_info.gpu_dev);
        if (gpu_info.gpu_dev[i].stat == egpuCompatible)
        {
            compatibleGpus.push_back(i);
        }
    }
    return compatibleGpus;
}
/*! \brief Returns a string describing the detection status of device
 * \c index.
 *
 * \param[in] gpu_info  Information about all detected devices.
 * \param[in] index     Index of the device to describe.
 * \returns             Static description string for the device's status.
 *
 * Out-of-range indices (negative or >= n_dev) map to the "nonexistent"
 * description instead of indexing out of bounds; the visible original only
 * checked the upper bound, so a negative index read before the array.
 * NOTE(review): the parameter line was truncated in the provided listing; the
 * name `index` is reconstructed from its uses in the body — confirm.
 */
const char *getGpuCompatibilityDescription(const gmx_gpu_info_t &gpu_info,
                                           int                   index)
{
    return (index < 0 || index >= gpu_info.n_dev ?
            gpu_detect_res_str[egpuNonexistent] :
            gpu_detect_res_str[gpu_info.gpu_dev[index].stat]);
}
/* Frees the device array owned by \c gpu_info; safe to call with NULL.
 * NOTE(review): braces and the early `return` of the NULL branch appear to be
 * missing from this view. */
795 void free_gpu_info(const gmx_gpu_info_t *gpu_info)
797 if (gpu_info == NULL)
802 sfree(gpu_info->gpu_dev);
/* Formats a one-line, human-readable description of device \c index into the
 * caller-provided buffer \c s.
 * NOTE(review): braces, an early `return`, the head of the boolean whose
 * operands appear at lines 817-818, and the branch selecting between the two
 * sprintf forms appear to be missing from this view. */
805 void get_gpu_device_info_string(char *s, const gmx_gpu_info_t &gpu_info, int index)
/* NOTE(review): this visible condition `index < 0 && index >= gpu_info.n_dev`
 * can never be true — it almost certainly should use `||`; verify against the
 * original file and fix there. */
809 if (index < 0 && index >= gpu_info.n_dev)
814 gmx_device_info_t *dinfo = &gpu_info.gpu_dev[index];
817 dinfo->stat == egpuCompatible ||
818 dinfo->stat == egpuIncompatible;
822 sprintf(s, "#%d: %s, stat: %s",
824 gpu_detect_res_str[dinfo->stat]);
828 sprintf(s, "#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s",
829 dinfo->id, dinfo->prop.name,
830 dinfo->prop.major, dinfo->prop.minor,
831 dinfo->prop.ECCEnabled ? "yes" : " no",
832 gpu_detect_res_str[dinfo->stat]);
/* Returns the id of the CUDA device currently selected in this thread's
 * runtime context.
 * NOTE(review): the declaration of `gpuid`, braces, and the `return gpuid;`
 * statement appear to be missing from this view. */
836 int get_current_cuda_gpu_device_id(void)
839 CU_RET_ERR(cudaGetDevice(&gpuid), "cudaGetDevice failed");
/*! \brief Returns the size in bytes of gmx_device_info_t.
 *
 * Allows callers that treat the device-info struct as opaque (e.g. for
 * allocation or communication) to obtain its size without the definition.
 */
size_t sizeof_gpu_dev_info(void)
{
    return sizeof(gmx_device_info_t);
}
/* Selects the host-memory alloc/free routines callers should use: the pinned
 * allocator (pmalloc) when GPU kernels are in use.
 * NOTE(review): braces, the assignment to *nb_free, and the non-GPU else
 * branch appear to be missing from this view. */
849 void gpu_set_host_malloc_and_free(bool bUseGpuKernels,
850 gmx_host_alloc_t **nb_alloc,
851 gmx_host_free_t **nb_free)
855 *nb_alloc = &pmalloc;
/* Starts CUDA profiler tracing (cudaProfilerStart) when mdrun runs under
 * nvprof.
 * NOTE(review): the guard on `cudaProfilerRun`, the declaration of `stat`,
 * and braces appear to be missing from this view. */
865 void startGpuProfiler(void)
867 /* The NVPROF_ID environment variable is set by nvprof and indicates that
868 mdrun is executed in the CUDA profiler.
869 If nvprof was run with "--profile-from-start off", the profiler will
870 be started here. This way we can avoid tracing the CUDA events from the
871 first part of the run. Starting the profiler again does nothing.
876 stat = cudaProfilerStart();
877 CU_RET_ERR(stat, "cudaProfilerStart failed");
/* Stops CUDA profiler tracing (cudaProfilerStop) when mdrun runs under
 * nvprof.
 * NOTE(review): the guard on `cudaProfilerRun`, the declaration of `stat`,
 * and braces appear to be missing from this view. */
881 void stopGpuProfiler(void)
883 /* Stopping the nvidia profiler here allows us to eliminate the subsequent
884 API calls from the trace, e.g. uninitialization and cleanup. */
888 stat = cudaProfilerStop();
889 CU_RET_ERR(stat, "cudaProfilerStop failed");
893 void resetGpuProfiler(void)
895 /* With CUDA <=7.5 the profiler can't be properly reset; we can only start
896 * the profiling here (can't stop it) which will achieve the desired effect if
897 * the run was started with the profiling disabled.
899 * TODO: add a stop (or replace it with reset) when this will work correctly in CUDA.