2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2010,2011,2012,2013,2014,2015,2016, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * \brief Define functions for detection and initialization for CUDA devices.
38 * \author Szilard Pall <pall.szilard@gmail.com>
43 #include "gpu_utils.h"
51 #include <cuda_profiler_api.h>
53 #include "gromacs/gpu_utils/cudautils.cuh"
54 #include "gromacs/gpu_utils/pmalloc_cuda.h"
55 #include "gromacs/hardware/gpu_hw_info.h"
56 #include "gromacs/utility/basedefinitions.h"
57 #include "gromacs/utility/cstringutil.h"
58 #include "gromacs/utility/smalloc.h"
/* Application-clock support requires NVML API version >= 6; when NVML is
 * absent (the #if HAVE_NVML line is elided in this excerpt) the feature is
 * compiled out entirely. */
62 #define HAVE_NVML_APPLICATION_CLOCKS (NVML_API_VERSION >= 6)
64 #define HAVE_NVML_APPLICATION_CLOCKS 0
65 #endif /* HAVE_NVML */
/* HANDLE_NVML_RET_ERR(status, msg): warn (but do not abort) on an NVML API
 * failure; compiles to a no-op statement unless both error checking and
 * NVML application-clock support are enabled. */
67 #if defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS
68 /*! Check for NVML error on the return status of a NVML API call. */
69 # define HANDLE_NVML_RET_ERR(status, msg) \
71 if (status != NVML_SUCCESS) \
73 gmx_warning("%s: %s\n", msg, nvmlErrorString(status)); \
76 #else /* defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS */
77 # define HANDLE_NVML_RET_ERR(status, msg) do { } while (0)
78 #endif /* defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS */
/* Compile-time flag mirrored into a runtime constant so user-facing
 * diagnostics can report whether application-clock support was built in. */
80 #if HAVE_NVML_APPLICATION_CLOCKS
81 static const gmx_bool bCompiledWithApplicationClockSupport = true;
83 static const gmx_bool gmx_unused bCompiledWithApplicationClockSupport = false;
87 * Max number of devices supported by CUDA (for consistency checking).
89 * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side.
91 static int cuda_max_device_count = 32;
/* True when mdrun is running under nvprof (which sets NVPROF_ID); used by the
 * profiler start/stop helpers below. */
93 static bool cudaProfilerRun = ((getenv("NVPROF_ID") != NULL));
95 /** Dummy kernel used for sanity checking. */
// Does no work: a successful launch + synchronize of this kernel is used by
// do_sanity_checks() to prove the driver/runtime/device stack can execute code.
// (The empty body is elided in this excerpt.)
96 __global__ void k_dummy_test()
102 * \brief Runs GPU sanity checks.
104 * Runs a series of checks to determine that the given GPU and underlying CUDA
105 * driver/runtime functions properly.
106 * Returns properties of a device with given ID or the one that has
107 * already been initialized earlier in the case of \dev_id == -1.
109 * \param[in] dev_id the device ID of the GPU or -1 if the device has already been initialized
110 * \param[out] dev_prop pointer to the structure in which the device properties will be returned
111 * \returns 0 if the device looks OK
113 * TODO: introduce error codes and handle errors more smoothly.
115 static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop)
/* Step 1: can we even count devices? A failure here usually means a
 * driver/runtime incompatibility. */
120 cu_err = cudaGetDeviceCount(&dev_count);
121 if (cu_err != cudaSuccess)
123 fprintf(stderr, "Error %d while querying device count: %s\n", cu_err,
124 cudaGetErrorString(cu_err));
128 /* no CUDA compatible device at all */
134 /* things might go horribly wrong if cudart is not compatible with the driver */
135 if (dev_count < 0 || dev_count > cuda_max_device_count)
/* Step 2: resolve which device to check: the already-selected one
 * (dev_id == -1) or the explicitly requested ID. */
140 if (dev_id == -1) /* device already selected let's not destroy the context */
142 cu_err = cudaGetDevice(&id);
143 if (cu_err != cudaSuccess)
145 fprintf(stderr, "Error %d while querying device id: %s\n", cu_err,
146 cudaGetErrorString(cu_err));
153 if (id > dev_count - 1) /* pfff there's no such device */
155 fprintf(stderr, "The requested device with id %d does not seem to exist (device count=%d)\n",
/* Step 3: query properties and reject placeholder/emulation devices. */
161 memset(dev_prop, 0, sizeof(cudaDeviceProp));
162 cu_err = cudaGetDeviceProperties(dev_prop, id);
163 if (cu_err != cudaSuccess)
165 fprintf(stderr, "Error %d while querying device properties: %s\n", cu_err,
166 cudaGetErrorString(cu_err));
170 /* both major & minor is 9999 if no CUDA capable devices are present */
171 if (dev_prop->major == 9999 && dev_prop->minor == 9999)
175 /* we don't care about emulation mode */
176 if (dev_prop->major == 0)
/* Step 4: actually run something on the device. */
183 cu_err = cudaSetDevice(id);
184 if (cu_err != cudaSuccess)
186 fprintf(stderr, "Error %d while switching to device #%d: %s\n",
187 cu_err, id, cudaGetErrorString(cu_err));
192 /* try to execute a dummy kernel */
193 k_dummy_test<<< 1, 512>>> ();
/* NOTE(review): cudaThreadSynchronize() is deprecated; the modern
 * equivalent is cudaDeviceSynchronize(). Kept as-is here. */
194 if (cudaThreadSynchronize() != cudaSuccess)
199 /* destroy context if we created one */
202 cu_err = cudaDeviceReset();
203 CU_RET_ERR(cu_err, "cudaDeviceReset failed");
210 /* TODO: We should actually be using md_print_warn in md_logging.c,
211 * but we can't include mpi.h in CUDA code.
/* Printf-style informational message, written to stderr and (when set) to
 * the log file. Varargs are consumed via va_list (setup elided in this
 * excerpt). */
213 static void md_print_info(FILE *fplog,
214 const char *fmt, ...)
220 /* We should only print to stderr on the master node,
221 * in most cases fplog is only set on the master node, so this works.
224 vfprintf(stderr, fmt, ap);
228 vfprintf(fplog, fmt, ap);
234 /* TODO: We should actually be using md_print_warn in md_logging.c,
235 * but we can't include mpi.h in CUDA code.
236 * This is replicated from nbnxn_cuda_data_mgmt.cu.
/* Printf-style warning: like md_print_info() but sets the message apart
 * with surrounding blank lines on both output streams. */
238 static void md_print_warn(FILE *fplog,
239 const char *fmt, ...)
245 /* We should only print to stderr on the master node,
246 * in most cases fplog is only set on the master node, so this works.
249 fprintf(stderr, "\n");
250 vfprintf(stderr, fmt, ap);
251 fprintf(stderr, "\n");
255 fprintf(fplog, "\n");
256 vfprintf(fplog, fmt, ap);
257 fprintf(fplog, "\n");
262 #if HAVE_NVML_APPLICATION_CLOCKS
263 /*! \brief Determines and adds the NVML device ID to the passed \cuda_dev.
265 * Determines and adds the NVML device ID to the passed \cuda_dev. This is done by
266 * matching PCI-E information from \cuda_dev with the available NVML devices.
268 * \param[in,out] cuda_dev CUDA device information to enrich with NVML device info
269 * \returns true if \cuda_dev could be enriched with matching NVML device information.
271 static bool addNVMLDeviceId(gmx_device_info_t* cuda_dev)
273 nvmlDevice_t nvml_device_id;
274 unsigned int nvml_device_count = 0;
275 nvmlReturn_t nvml_stat = nvmlDeviceGetCount ( &nvml_device_count );
/* Pessimistically mark as uninitialized; only flipped to true on a PCI match. */
276 cuda_dev->nvml_initialized = false;
277 HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetCount failed" );
/* NVML enumerates devices in its own order, which need not match CUDA's,
 * hence the PCI bus/device/domain comparison below. */
278 for (unsigned int nvml_device_idx = 0; nvml_stat == NVML_SUCCESS && nvml_device_idx < nvml_device_count; ++nvml_device_idx)
280 nvml_stat = nvmlDeviceGetHandleByIndex ( nvml_device_idx, &nvml_device_id );
281 HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetHandleByIndex failed" );
282 if (nvml_stat != NVML_SUCCESS)
287 nvmlPciInfo_t nvml_pci_info;
288 nvml_stat = nvmlDeviceGetPciInfo ( nvml_device_id, &nvml_pci_info );
289 HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetPciInfo failed" );
290 if (nvml_stat != NVML_SUCCESS)
/* CUDA reports PCI IDs as signed int, NVML as unsigned — cast to compare. */
294 if (static_cast<unsigned int>(cuda_dev->prop.pciBusID) == nvml_pci_info.bus &&
295 static_cast<unsigned int>(cuda_dev->prop.pciDeviceID) == nvml_pci_info.device &&
296 static_cast<unsigned int>(cuda_dev->prop.pciDomainID) == nvml_pci_info.domain)
298 cuda_dev->nvml_initialized = true;
299 cuda_dev->nvml_device_id = nvml_device_id;
303 return cuda_dev->nvml_initialized;
306 /*! \brief Reads and returns the application clocks for device.
308 * \param[in] cuda_dev The GPU device
309 * \param[out] app_sm_clock The current application SM clock
310 * \param[out] app_mem_clock The current application memory clock
311 * \returns if application clocks are supported
313 static bool getApplicationClocks(const gmx_device_info_t *cuda_dev,
314 unsigned int *app_sm_clock,
315 unsigned int *app_mem_clock)
317 nvmlReturn_t nvml_stat;
/* The SM-clock query doubles as the feature probe: NVML_ERROR_NOT_SUPPORTED
 * means this GPU has no application clocks at all. */
319 nvml_stat = nvmlDeviceGetApplicationsClock(cuda_dev->nvml_device_id, NVML_CLOCK_SM, app_sm_clock);
320 if (NVML_ERROR_NOT_SUPPORTED == nvml_stat)
324 HANDLE_NVML_RET_ERR(nvml_stat, "nvmlDeviceGetApplicationsClock failed");
325 nvml_stat = nvmlDeviceGetApplicationsClock(cuda_dev->nvml_device_id, NVML_CLOCK_MEM, app_mem_clock);
326 HANDLE_NVML_RET_ERR(nvml_stat, "nvmlDeviceGetApplicationsClock failed");
330 #endif /* HAVE_NVML_APPLICATION_CLOCKS */
332 /*! \brief Tries to set application clocks for the GPU with the given index.
334 * The variable \gpuid is the index of the GPU in the gpu_info.cuda_dev array
335 * to handle the application clocks for. Application clocks are set to the
336 * max supported value to increase performance if application clock permissions
337 * allow this. For future GPU architectures a more sophisticated scheme might be
340 * \param[out] fplog log file to write to
341 * \param[in] gpuid index of the GPU to set application clocks for
342 * \param[in] gpu_info GPU info of all detected devices in the system.
343 * \returns true if no error occurs during application clocks handling.
345 static gmx_bool init_gpu_application_clocks(FILE gmx_unused *fplog, int gmx_unused gpuid, const gmx_gpu_info_t gmx_unused *gpu_info)
347 const cudaDeviceProp *prop = &gpu_info->gpu_dev[gpuid].prop;
/* Encode compute capability as major*10+minor (e.g. 3.5 -> 35) for the
 * simple threshold checks below. */
348 int cuda_version_number = prop->major * 10 + prop->minor;
/* Only Tesla (>= CC 3.5) and Quadro (>= CC 5.2) products expose settable
 * application clocks; other boards are skipped silently. */
349 gmx_bool bGpuCanUseApplicationClocks =
350 ((0 == gmx_wcmatch("*Tesla*", prop->name) && cuda_version_number >= 35 ) ||
351 (0 == gmx_wcmatch("*Quadro*", prop->name) && cuda_version_number >= 52 ));
352 if (!bGpuCanUseApplicationClocks)
/* Branch: built without NVML at all — advise the user and bail. */
358 int cuda_runtime = 0;
359 cudaDriverGetVersion(&cuda_driver);
360 cudaRuntimeGetVersion(&cuda_runtime);
361 md_print_warn( fplog, "NOTE: GROMACS was configured without NVML support hence it can not exploit\n"
362 " application clocks of the detected %s GPU to improve performance.\n"
363 " Recompile with the NVML library (compatible with the driver used) or set application clocks manually.\n",
/* Branch: built with an NVML too old for application clocks. */
367 if (!bCompiledWithApplicationClockSupport)
370 int cuda_runtime = 0;
371 cudaDriverGetVersion(&cuda_driver);
372 cudaRuntimeGetVersion(&cuda_runtime);
373 md_print_warn( fplog, "NOTE: GROMACS was compiled with an old NVML library which does not support\n"
374 " managing application clocks of the detected %s GPU to improve performance.\n"
375 " If your GPU supports application clocks, upgrade NVML (and driver) and recompile or set the clocks manually.\n",
380 /* We've compiled with NVML application clocks support, and have a GPU that can use it */
381 nvmlReturn_t nvml_stat = NVML_SUCCESS;
383 //TODO: GMX_GPU_APPLICATION_CLOCKS is currently only used to enable/disable setting of application clocks
384 // this variable can be later used to give a user more fine grained control.
385 env = getenv("GMX_GPU_APPLICATION_CLOCKS");
386 if (env != NULL && ( strcmp( env, "0") == 0 ||
387 gmx_strcasecmp( env, "OFF") == 0 ||
388 gmx_strcasecmp( env, "DISABLE") == 0 ))
392 nvml_stat = nvmlInit();
393 HANDLE_NVML_RET_ERR( nvml_stat, "nvmlInit failed." );
394 if (nvml_stat != NVML_SUCCESS)
399 gmx_device_info_t *cuda_dev = &(gpu_info->gpu_dev[gpuid]);
401 if (!addNVMLDeviceId(cuda_dev))
/* Record the original clocks so reset_gpu_application_clocks() can
 * restore them at cleanup time. */
405 //get current application clocks setting
406 if (!getApplicationClocks(cuda_dev,
407 &cuda_dev->nvml_orig_app_sm_clock,
408 &cuda_dev->nvml_orig_app_mem_clock))
412 //get max application clocks
413 unsigned int max_sm_clock = 0;
414 unsigned int max_mem_clock = 0;
415 nvml_stat = nvmlDeviceGetMaxClockInfo(cuda_dev->nvml_device_id, NVML_CLOCK_SM, &max_sm_clock);
416 HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetMaxClockInfo failed" );
417 nvml_stat = nvmlDeviceGetMaxClockInfo(cuda_dev->nvml_device_id, NVML_CLOCK_MEM, &max_mem_clock);
418 HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetMaxClockInfo failed" );
420 cuda_dev->nvml_is_restricted = NVML_FEATURE_ENABLED;
421 cuda_dev->nvml_app_clocks_changed = false;
423 nvml_stat = nvmlDeviceGetAPIRestriction(cuda_dev->nvml_device_id, NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS, &(cuda_dev->nvml_is_restricted));
424 HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetAPIRestriction failed" );
426 /* Note: Distinguishing between different types of GPUs here might be necessary in the future,
427 e.g. if max application clocks should not be used for certain GPUs. */
/* Case 1: clocks below max and we are allowed to change them — raise them. */
428 if (nvml_stat == NVML_SUCCESS && cuda_dev->nvml_orig_app_sm_clock < max_sm_clock && cuda_dev->nvml_is_restricted == NVML_FEATURE_DISABLED)
430 md_print_info(fplog, "Changing GPU application clocks for %s to (%d,%d)\n", cuda_dev->prop.name, max_mem_clock, max_sm_clock);
/* NOTE(review): the message below says "Get" but this call is
 * nvmlDeviceSetApplicationsClocks — the literal is left untouched here. */
431 nvml_stat = nvmlDeviceSetApplicationsClocks(cuda_dev->nvml_device_id, max_mem_clock, max_sm_clock);
432 HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetApplicationsClock failed" );
433 cuda_dev->nvml_app_clocks_changed = true;
434 cuda_dev->nvml_set_app_sm_clock = max_sm_clock;
435 cuda_dev->nvml_set_app_mem_clock = max_mem_clock;
/* Case 2: clocks below max but the API is restricted — tell the user how to unlock. */
437 else if (nvml_stat == NVML_SUCCESS && cuda_dev->nvml_orig_app_sm_clock < max_sm_clock)
439 md_print_warn(fplog, "Can not change application clocks for %s to optimal values due to insufficient permissions. Current values are (%d,%d), max values are (%d,%d).\nUse sudo nvidia-smi -acp UNRESTRICTED or contact your admin to change application clocks.\n", cuda_dev->prop.name, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock, max_mem_clock, max_sm_clock);
/* Case 3: already at max — nothing to do, just report. */
441 else if (nvml_stat == NVML_SUCCESS && cuda_dev->nvml_orig_app_sm_clock == max_sm_clock)
443 md_print_info(fplog, "Application clocks (GPU clocks) for %s are (%d,%d)\n", cuda_dev->prop.name, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock);
447 md_print_warn( fplog, "Can not change GPU application clocks to optimal values due to NVML error (%d): %s.\n", nvml_stat, nvmlErrorString(nvml_stat));
449 return (nvml_stat == NVML_SUCCESS);
450 #endif /* HAVE_NVML */
453 /*! \brief Resets application clocks if changed and cleans up NVML for the passed \gpu_dev.
455 * \param[in] gpu_dev CUDA device information
457 static gmx_bool reset_gpu_application_clocks(const gmx_device_info_t gmx_unused * cuda_dev)
459 #if !HAVE_NVML_APPLICATION_CLOCKS
460 GMX_UNUSED_VALUE(cuda_dev);
462 #else /* HAVE_NVML_APPLICATION_CLOCKS */
463 nvmlReturn_t nvml_stat = NVML_SUCCESS;
/* Only undo our own change: clocks must have been settable (API not
 * restricted) and actually modified by init_gpu_application_clocks(). */
465 cuda_dev->nvml_is_restricted == NVML_FEATURE_DISABLED &&
466 cuda_dev->nvml_app_clocks_changed)
468 /* Check if the clocks are still what we set them to.
469 * If so, set them back to the state we originally found them in.
470 * If not, don't touch them, because something else set them later.
472 unsigned int app_sm_clock, app_mem_clock;
473 getApplicationClocks(cuda_dev, &app_sm_clock, &app_mem_clock);
474 if (app_sm_clock == cuda_dev->nvml_set_app_sm_clock &&
475 app_mem_clock == cuda_dev->nvml_set_app_mem_clock)
/* NOTE(review): the message below says "Get" but this call is
 * nvmlDeviceSetApplicationsClocks — the literal is left untouched here. */
477 nvml_stat = nvmlDeviceSetApplicationsClocks(cuda_dev->nvml_device_id, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock);
478 HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetApplicationsClock failed" );
481 nvml_stat = nvmlShutdown();
482 HANDLE_NVML_RET_ERR( nvml_stat, "nvmlShutdown failed" );
483 return (nvml_stat == NVML_SUCCESS);
484 #endif /* HAVE_NVML_APPLICATION_CLOCKS */
/* Selects and initializes the mygpu-th user/auto-selected GPU for use by the
 * calling thread, stores the CUDA error string in result_str, and (best
 * effort) raises application clocks. Returns TRUE on success. */
487 gmx_bool init_gpu(FILE gmx_unused *fplog, int mygpu, char *result_str,
488 const struct gmx_gpu_info_t *gpu_info,
489 const struct gmx_gpu_opt_t *gpu_opt)
/* mygpu indexes the selection list (dev_use), not raw device IDs. */
498 if (mygpu < 0 || mygpu >= gpu_opt->n_dev_use)
500 sprintf(sbuf, "Trying to initialize an inexistent GPU: "
501 "there are %d %s-selected GPU(s), but #%d was requested.",
502 gpu_opt->n_dev_use, gpu_opt->bUserSet ? "user" : "auto", mygpu);
506 gpuid = gpu_info->gpu_dev[gpu_opt->dev_use[mygpu]].id;
508 stat = cudaSetDevice(gpuid);
509 strncpy(result_str, cudaGetErrorString(stat), STRLEN);
513 fprintf(stderr, "Initialized GPU ID #%d: %s\n", gpuid, gpu_info->gpu_dev[gpuid].prop.name);
516 //Ignoring return value as NVML errors should be treated not critical.
517 if (stat == cudaSuccess)
519 init_gpu_application_clocks(fplog, gpuid, gpu_info);
521 return (stat == cudaSuccess);
/* Releases the CUDA context on the active device: restores application
 * clocks (if this run changed them), resets the device, and reports the
 * CUDA status string via result_str. Returns TRUE if both cleanup steps
 * succeeded. */
524 gmx_bool free_cuda_gpu(
525 int gmx_unused mygpu, char *result_str,
526 const gmx_gpu_info_t gmx_unused *gpu_info,
527 const gmx_gpu_opt_t gmx_unused *gpu_opt
531 gmx_bool reset_gpu_application_clocks_status = true;
539 stat = cudaGetDevice(&gpuid);
540 CU_RET_ERR(stat, "cudaGetDevice failed");
541 fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
/* Re-derive the selected device ID from the option list when available. */
544 gpuid = gpu_opt ? gpu_opt->dev_use[mygpu] : -1;
547 reset_gpu_application_clocks_status = reset_gpu_application_clocks( &(gpu_info->gpu_dev[gpuid]) );
/* cudaDeviceReset destroys the context; must come after the clock reset,
 * which still needs the NVML handle. */
550 stat = cudaDeviceReset();
551 strncpy(result_str, cudaGetErrorString(stat), STRLEN);
552 return (stat == cudaSuccess) && reset_gpu_application_clocks_status;
555 /*! \brief Returns true if the gpu characterized by the device properties is
556 * supported by the native gpu acceleration.
558 * \param[in] dev_prop the CUDA device properties of the gpus to test.
559 * \returns true if the GPU properties passed indicate a compatible
560 * GPU, otherwise false.
562 static bool is_gmx_supported_gpu(const cudaDeviceProp *dev_prop)
/* Compute capability >= 2.0 (Fermi or newer) is the sole requirement. */
564 return (dev_prop->major >= 2);
567 /*! \brief Helper function that checks whether a given GPU status indicates compatible GPU.
569 * \param[in] stat GPU status.
570 * \returns true if the provided status is egpuCompatible, otherwise false.
572 static bool is_compatible_gpu(int stat)
574 return (stat == egpuCompatible);
577 /*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration.
579 * Returns a status value which indicates compatibility or one of the following
580 * errors: incompatibility, inexistence, or insanity (=unexpected behavior).
581 * It also returns the respective device's properties in \dev_prop (if applicable).
583 * \param[in] dev_id the ID of the GPU to check.
584 * \param[out] dev_prop the CUDA device properties of the device checked.
585 * \returns the status of the requested device
587 static int is_gmx_supported_gpu_id(int dev_id, cudaDeviceProp *dev_prop)
592 stat = cudaGetDeviceCount(&ndev);
593 if (stat != cudaSuccess)
598 if (dev_id > ndev - 1)
600 return egpuNonexistent;
603 /* TODO: currently we do not make a distinction between the type of errors
604 * that can appear during sanity checks. This needs to be improved, e.g if
605 * the dummy test kernel fails to execute with a "device busy message" we
606 * should appropriately report that the device is busy instead of insane.
/* Sanity check passed (returns 0) -> classify by compute capability. */
608 if (do_sanity_checks(dev_id, dev_prop) == 0)
610 if (is_gmx_supported_gpu(dev_prop))
612 return egpuCompatible;
616 return egpuIncompatible;
/* Detects all CUDA devices, runs the compatibility check on each, and fills
 * gpu_info (n_dev, gpu_dev array, n_dev_compatible). On failure the CUDA
 * error string is copied into err_str. */
626 int detect_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
628 int i, ndev, checkres, retval;
631 gmx_device_info_t *devs;
636 gpu_info->n_dev_compatible = 0;
641 stat = cudaGetDeviceCount(&ndev);
642 if (stat != cudaSuccess)
646 /* cudaGetDeviceCount failed which means that there is something
647 * wrong with the machine: driver-runtime mismatch, all GPUs being
648 * busy in exclusive mode, or some other condition which should
649 * result in us issuing a warning and falling back to CPUs. */
651 s = cudaGetErrorString(stat);
652 strncpy(err_str, s, STRLEN*sizeof(err_str[0]));
/* Probe each device individually; incompatible/insane devices are still
 * recorded with their status so callers can report on them. */
657 for (i = 0; i < ndev; i++)
659 checkres = is_gmx_supported_gpu_id(i, &prop);
663 devs[i].stat = checkres;
665 if (checkres == egpuCompatible)
667 gpu_info->n_dev_compatible++;
673 gpu_info->n_dev = ndev;
674 gpu_info->gpu_dev = devs;
/* Collects the indices of all compatible devices from gpu_info into the
 * newly allocated gpu_opt->dev_compatible array and sets n_dev_compatible. */
679 void pick_compatible_gpus(const gmx_gpu_info_t *gpu_info,
680 gmx_gpu_opt_t *gpu_opt)
686 /* gpu_dev/n_dev have to be either NULL/0 or not (NULL/0) */
687 assert((gpu_info->n_dev != 0 ? 0 : 1) ^ (gpu_info->gpu_dev == NULL ? 0 : 1));
/* Scratch list sized for the worst case (all devices compatible). */
689 snew(compat, gpu_info->n_dev);
691 for (i = 0; i < gpu_info->n_dev; i++)
693 if (is_compatible_gpu(gpu_info->gpu_dev[i].stat))
696 compat[ncompat - 1] = i;
700 gpu_opt->n_dev_compatible = ncompat;
701 snew(gpu_opt->dev_compatible, ncompat);
702 memcpy(gpu_opt->dev_compatible, compat, ncompat*sizeof(*compat));
/* Validates the user-selected device IDs in gpu_opt->dev_use against the
 * detection results, writing a per-device status into checkres. Returns
 * TRUE only if every selected device is compatible. */
706 gmx_bool check_selected_gpus(int *checkres,
707 const gmx_gpu_info_t *gpu_info,
708 gmx_gpu_opt_t *gpu_opt)
715 assert(gpu_opt->n_dev_use >= 0);
717 if (gpu_opt->n_dev_use == 0)
722 assert(gpu_opt->dev_use);
724 /* we will assume that all GPUs requested are valid IDs,
725 otherwise we'll bail anyways */
728 for (i = 0; i < gpu_opt->n_dev_use; i++)
730 id = gpu_opt->dev_use[i];
732 /* devices are stored in increasing order of IDs in gpu_dev */
733 gpu_opt->dev_use[i] = id;
/* Out-of-range IDs map to egpuNonexistent; valid IDs report their
 * detection status. */
735 checkres[i] = (id >= gpu_info->n_dev) ?
736 egpuNonexistent : gpu_info->gpu_dev[id].stat;
738 bAllOk = bAllOk && is_compatible_gpu(checkres[i]);
/* Frees the device array inside gpu_info; safe to call with NULL. */
744 void free_gpu_info(const gmx_gpu_info_t *gpu_info)
746 if (gpu_info == NULL)
751 sfree(gpu_info->gpu_dev);
/*! \brief Formats a one-line description of the GPU with the given index.
 *
 * Writes "#id: name, compute capability, ECC state, detection status" into
 * \p s; for devices that did not pass detection the name is reported as
 * "N/A". If \p index is out of range, \p s is left untouched.
 *
 * \param[out] s        output buffer (must be large enough for the summary)
 * \param[in]  gpu_info information about all detected devices
 * \param[in]  index    index of the device to describe
 */
void get_gpu_device_info_string(char *s, const gmx_gpu_info_t *gpu_info, int index)
{
    assert(s);
    assert(gpu_info);

    /* Bug fix: the original guard used '&&' — an index can never be both
     * negative and >= n_dev, so the check never fired and an out-of-range
     * index caused an out-of-bounds array access below. */
    if (index < 0 || index >= gpu_info->n_dev)
    {
        return;
    }

    gmx_device_info_t *dinfo = &gpu_info->gpu_dev[index];

    /* A device "exists" from the user's point of view when the sanity check
     * classified it as compatible or incompatible (as opposed to
     * nonexistent/insane). */
    bool               bGpuExists =
        dinfo->stat == egpuCompatible ||
        dinfo->stat == egpuIncompatible;

    if (!bGpuExists)
    {
        sprintf(s, "#%d: %s, stat: %s",
                dinfo->id, "N/A",
                gpu_detect_res_str[dinfo->stat]);
    }
    else
    {
        sprintf(s, "#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s",
                dinfo->id, dinfo->prop.name,
                dinfo->prop.major, dinfo->prop.minor,
                dinfo->prop.ECCEnabled ? "yes" : " no",
                gpu_detect_res_str[dinfo->stat]);
    }
}
/* Maps an index into the selection list (dev_use) to the actual CUDA device
 * ID; idx must be within the selection range. */
786 int get_gpu_device_id(const gmx_gpu_info_t *gpu_info,
787 const gmx_gpu_opt_t *gpu_opt,
792 assert(idx >= 0 && idx < gpu_opt->n_dev_use);
794 return gpu_info->gpu_dev[gpu_opt->dev_use[idx]].id;
/* Returns the CUDA device ID currently active for the calling thread. */
797 int get_current_cuda_gpu_device_id(void)
800 CU_RET_ERR(cudaGetDevice(&gpuid), "cudaGetDevice failed");
/* Exposes sizeof(gmx_device_info_t) to callers that cannot see the CUDA
 * type definition (e.g. for MPI transfers of opaque device info). */
805 size_t sizeof_gpu_dev_info(void)
807 return sizeof(gmx_device_info_t);
/* Installs the host-memory allocator pair to use for nonbonded buffers:
 * pinned (pmalloc) allocation when GPU kernels are in use, otherwise the
 * plain allocator (else-branch elided in this excerpt). */
810 void gpu_set_host_malloc_and_free(bool bUseGpuKernels,
811 gmx_host_alloc_t **nb_alloc,
812 gmx_host_free_t **nb_free)
816 *nb_alloc = &pmalloc;
/* Starts CUDA profiling when mdrun runs under nvprof (see cudaProfilerRun). */
826 void startGpuProfiler(void)
828 /* The NVPROF_ID environment variable is set by nvprof and indicates that
829 mdrun is executed in the CUDA profiler.
830 If nvprof was run with "--profile-from-start off", the profiler will
831 be started here. This way we can avoid tracing the CUDA events from the
832 first part of the run. Starting the profiler again does nothing.
837 stat = cudaProfilerStart();
838 CU_RET_ERR(stat, "cudaProfilerStart failed");
/* Stops CUDA profiling (counterpart of startGpuProfiler). */
842 void stopGpuProfiler(void)
844 /* Stopping the nvidia profiler here allows us to eliminate the subsequent
845 API calls from the trace, e.g. uninitialization and cleanup. */
849 stat = cudaProfilerStop();
850 CU_RET_ERR(stat, "cudaProfilerStop failed");
854 void resetGpuProfiler(void)
856 /* With CUDA <=7.5 the profiler can't be properly reset; we can only start
857 * the profiling here (can't stop it) which will achieve the desired effect if
858 * the run was started with the profiling disabled.
860 * TODO: add a stop (or replace it with reset) when this will work correctly in CUDA.