src/gromacs/gpu_utils/gpu_utils.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2010,2011,2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \file
  36  *  \brief Define functions for detection and initialization for CUDA devices.
  37  *
  38  *  \author Szilard Pall <pall.szilard@gmail.com>
  39  */
  40
  41 #include "gmxpre.h"
  42
  43 #include "gpu_utils.h"
  44
  45 #include "config.h"
  46
  47 #include <assert.h>
  48 #include <stdio.h>
  49 #include <stdlib.h>
  50
  51 #include <cuda_profiler_api.h>
  52
  53 #include "gromacs/gpu_utils/cudautils.cuh"
  54 #include "gromacs/gpu_utils/pmalloc_cuda.h"
  55 #include "gromacs/hardware/gpu_hw_info.h"
  56 #include "gromacs/utility/basedefinitions.h"
  57 #include "gromacs/utility/cstringutil.h"
  58 #include "gromacs/utility/exceptions.h"
  59 #include "gromacs/utility/fatalerror.h"
  60 #include "gromacs/utility/gmxassert.h"
  61 #include "gromacs/utility/logger.h"
  62 #include "gromacs/utility/programcontext.h"
  63 #include "gromacs/utility/smalloc.h"
  64 #include "gromacs/utility/snprintf.h"
  65 #include "gromacs/utility/stringutil.h"
  66
  67 #if HAVE_NVML
  68 #include <nvml.h>
  69 #define HAVE_NVML_APPLICATION_CLOCKS (NVML_API_VERSION >= 6)
  70 #else  /* HAVE_NVML */
  71 #define HAVE_NVML_APPLICATION_CLOCKS 0
  72 #endif /* HAVE_NVML */
  73
  74 #if defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS
  75 /*! Check for NVML error on the return status of a NVML API call. */
  76 #  define HANDLE_NVML_RET_ERR(status, msg) \
  77     do { \
  78         if (status != NVML_SUCCESS) \
  79         { \
  80             gmx_warning("%s: %s\n", msg, nvmlErrorString(status)); \
  81         } \
  82     } while (0)
  83 #else  /* defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS */
  84 #  define HANDLE_NVML_RET_ERR(status, msg) do { } while (0)
  85 #endif /* defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS */
  86
  87 #if HAVE_NVML_APPLICATION_CLOCKS
  88 static const gmx_bool            bCompiledWithApplicationClockSupport = true;
  89 #else
  90 static const gmx_bool gmx_unused bCompiledWithApplicationClockSupport = false;
  91 #endif
  92
  93 /*! \internal \brief
  94  * Max number of devices supported by CUDA (for consistency checking).
  95  *
  96  * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side.
  97  */
  98 static int  cuda_max_device_count = 32;
  99
 100 static bool cudaProfilerRun      = ((getenv("NVPROF_ID") != NULL));
 101
 102 /** Dummy kernel used for sanity checking. */
 103 static __global__ void k_dummy_test(void)
 104 {
 105 }
 106
 107 static void checkCompiledTargetCompatibility(const gmx_device_info_t *devInfo)
 108 {
 109     assert(devInfo);
 110
 111     cudaFuncAttributes attributes;
 112     cudaError_t        stat = cudaFuncGetAttributes(&attributes, k_dummy_test);
 113
 114     if (cudaErrorInvalidDeviceFunction == stat)
 115     {
 116         gmx_fatal(FARGS,
 117                   "The %s binary does not include support for the CUDA architecture "
 118                   "of the selected GPU (device ID #%d, compute capability %d.%d). "
 119                   "By default, GROMACS supports all common architectures, so your GPU "
 120                   "might be rare, or some architectures were disabled in the build. ",
 121                   "Consult the install guide for how to use the GMX_CUDA_TARGET_SM and ",
 122                   "GMX_CUDA_TARGET_COMPUTE CMake variables to add this architecture.",
 123                   gmx::getProgramContext().displayName(), devInfo->id,
 124                   devInfo->prop.major, devInfo->prop.minor);
 125     }
 126
 127     CU_RET_ERR(stat, "cudaFuncGetAttributes failed");
 128
 129     if (devInfo->prop.major >= 3 && attributes.ptxVersion < 30)
 130     {
 131         gmx_fatal(FARGS,
 132                   "The GPU device code was compiled at runtime from 2.0 source which is "
 133                   "not compatible with the selected GPU (device ID #%d, compute capability %d.%d). "
 134                   "Pass the appropriate target in GMX_CUDA_TARGET_SM or a >=30 value to GMX_CUDA_TARGET_COMPUTE.",
 135                   devInfo->id,
 136                   devInfo->prop.major, devInfo->prop.minor);
 137     }
 138 }
 139
 140 bool isHostMemoryPinned(const void *h_ptr)
 141 {
 142     cudaPointerAttributes memoryAttributes;
 143     cudaError_t           stat = cudaPointerGetAttributes(&memoryAttributes, h_ptr);
 144
 145     bool                  result = false;
 146     switch (stat)
 147     {
 148         case cudaSuccess:
 149             result = true;
 150             break;
 151
 152         case cudaErrorInvalidValue:
 153             // If the buffer was not pinned, then it will not be recognized by CUDA at all
 154             result = false;
 155             // Reset the last error status
 156             cudaGetLastError();
 157             break;
 158
 159         default:
 160             CU_RET_ERR(stat, "Unexpected CUDA error");
 161     }
 162     return result;
 163 }
 164
 165 /*!
 166  * \brief Runs GPU sanity checks.
 167  *
 168  * Runs a series of checks to determine that the given GPU and underlying CUDA
 169  * driver/runtime functions properly.
 170  * Returns properties of a device with given ID or the one that has
 171  * already been initialized earlier in the case if of \dev_id == -1.
 172  *
 173  * \param[in]  dev_id      the device ID of the GPU or -1 if the device has already been initialized
 174  * \param[out] dev_prop    pointer to the structure in which the device properties will be returned
 175  * \returns                0 if the device looks OK
 176  *
 177  * TODO: introduce errors codes and handle errors more smoothly.
 178  */
 179 static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop)
 180 {
 181     cudaError_t cu_err;
 182     int         dev_count, id;
 183
 184     cu_err = cudaGetDeviceCount(&dev_count);
 185     if (cu_err != cudaSuccess)
 186     {
 187         fprintf(stderr, "Error %d while querying device count: %s\n", cu_err,
 188                 cudaGetErrorString(cu_err));
 189         return -1;
 190     }
 191
 192     /* no CUDA compatible device at all */
 193     if (dev_count == 0)
 194     {
 195         return -1;
 196     }
 197
 198     /* things might go horribly wrong if cudart is not compatible with the driver */
 199     if (dev_count < 0 || dev_count > cuda_max_device_count)
 200     {
 201         return -1;
 202     }
 203
 204     if (dev_id == -1) /* device already selected let's not destroy the context */
 205     {
 206         cu_err = cudaGetDevice(&id);
 207         if (cu_err != cudaSuccess)
 208         {
 209             fprintf(stderr, "Error %d while querying device id: %s\n", cu_err,
 210                     cudaGetErrorString(cu_err));
 211             return -1;
 212         }
 213     }
 214     else
 215     {
 216         id = dev_id;
 217         if (id > dev_count - 1) /* pfff there's no such device */
 218         {
 219             fprintf(stderr, "The requested device with id %d does not seem to exist (device count=%d)\n",
 220                     dev_id, dev_count);
 221             return -1;
 222         }
 223     }
 224
 225     memset(dev_prop, 0, sizeof(cudaDeviceProp));
 226     cu_err = cudaGetDeviceProperties(dev_prop, id);
 227     if (cu_err != cudaSuccess)
 228     {
 229         fprintf(stderr, "Error %d while querying device properties: %s\n", cu_err,
 230                 cudaGetErrorString(cu_err));
 231         return -1;
 232     }
 233
 234     /* both major & minor is 9999 if no CUDA capable devices are present */
 235     if (dev_prop->major == 9999 && dev_prop->minor == 9999)
 236     {
 237         return -1;
 238     }
 239     /* we don't care about emulation mode */
 240     if (dev_prop->major == 0)
 241     {
 242         return -1;
 243     }
 244
 245     if (id != -1)
 246     {
 247         cu_err = cudaSetDevice(id);
 248         if (cu_err != cudaSuccess)
 249         {
 250             fprintf(stderr, "Error %d while switching to device #%d: %s\n",
 251                     cu_err, id, cudaGetErrorString(cu_err));
 252             return -1;
 253         }
 254     }
 255
 256     /* try to execute a dummy kernel */
 257     KernelLaunchConfig config;
 258     config.blockSize[0] = 512;
 259     const auto         dummyArguments = prepareGpuKernelArguments(k_dummy_test, config);
 260     launchGpuKernel(k_dummy_test, config, nullptr, "Dummy kernel", dummyArguments);
 261     if (cudaThreadSynchronize() != cudaSuccess)
 262     {
 263         return -1;
 264     }
 265
 266     /* destroy context if we created one */
 267     if (id != -1)
 268     {
 269         cu_err = cudaDeviceReset();
 270         CU_RET_ERR(cu_err, "cudaDeviceReset failed");
 271     }
 272
 273     return 0;
 274 }
 275
 276 #if HAVE_NVML_APPLICATION_CLOCKS
 277 /*! \brief Determines and adds the NVML device ID to the passed \cuda_dev.
 278  *
 279  * Determines and adds the NVML device ID to the passed \cuda_dev. This is done by
 280  * matching PCI-E information from \cuda_dev with the available NVML devices.
 281  *
 282  * \param[in,out] cuda_dev  CUDA device information to enrich with NVML device info
 283  * \returns                 true if \cuda_dev could be enriched with matching NVML device information.
 284  */
 285 static bool addNVMLDeviceId(gmx_device_info_t* cuda_dev)
 286 {
 287     nvmlDevice_t nvml_device_id;
 288     unsigned int nvml_device_count  = 0;
 289     nvmlReturn_t nvml_stat          = nvmlDeviceGetCount ( &nvml_device_count );
 290     bool         nvmlWasInitialized = false;
 291     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetCount failed" );
 292     for (unsigned int nvml_device_idx = 0; nvml_stat == NVML_SUCCESS && nvml_device_idx < nvml_device_count; ++nvml_device_idx)
 293     {
 294         nvml_stat = nvmlDeviceGetHandleByIndex ( nvml_device_idx, &nvml_device_id );
 295         HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetHandleByIndex failed" );
 296         if (nvml_stat != NVML_SUCCESS)
 297         {
 298             break;
 299         }
 300
 301         nvmlPciInfo_t nvml_pci_info;
 302         nvml_stat = nvmlDeviceGetPciInfo ( nvml_device_id, &nvml_pci_info );
 303         HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetPciInfo failed" );
 304         if (nvml_stat != NVML_SUCCESS)
 305         {
 306             break;
 307         }
 308         if (static_cast<unsigned int>(cuda_dev->prop.pciBusID) == nvml_pci_info.bus &&
 309             static_cast<unsigned int>(cuda_dev->prop.pciDeviceID) == nvml_pci_info.device &&
 310             static_cast<unsigned int>(cuda_dev->prop.pciDomainID) == nvml_pci_info.domain)
 311         {
 312             nvmlWasInitialized         = true;
 313             cuda_dev->nvml_device_id   = nvml_device_id;
 314             break;
 315         }
 316     }
 317     return nvmlWasInitialized;
 318 }
 319
 320 /*! \brief Reads and returns the application clocks for device.
 321  *
 322  * \param[in]  device        The GPU device
 323  * \param[out] app_sm_clock  The current application SM clock
 324  * \param[out] app_mem_clock The current application memory clock
 325  * \returns if applacation clocks are supported
 326  */
 327 static bool getApplicationClocks(const gmx_device_info_t *cuda_dev,
 328                                  unsigned int            *app_sm_clock,
 329                                  unsigned int            *app_mem_clock)
 330 {
 331     nvmlReturn_t nvml_stat;
 332
 333     nvml_stat = nvmlDeviceGetApplicationsClock(cuda_dev->nvml_device_id, NVML_CLOCK_SM, app_sm_clock);
 334     if (NVML_ERROR_NOT_SUPPORTED == nvml_stat)
 335     {
 336         return false;
 337     }
 338     HANDLE_NVML_RET_ERR(nvml_stat, "nvmlDeviceGetApplicationsClock failed for NVIDIA_CLOCK_SM");
 339     nvml_stat = nvmlDeviceGetApplicationsClock(cuda_dev->nvml_device_id, NVML_CLOCK_MEM, app_mem_clock);
 340     HANDLE_NVML_RET_ERR(nvml_stat, "nvmlDeviceGetApplicationsClock failed for NVIDIA_CLOCK_MEM");
 341
 342     return true;
 343 }
 344 #endif /* HAVE_NVML_APPLICATION_CLOCKS */
 345
 346 /*! \brief Tries to set application clocks for the GPU with the given index.
 347  *
 348  * Application clocks are set to the max supported value to increase
 349  * performance if application clock permissions allow this. For future
 350  * GPU architectures a more sophisticated scheme might be required.
 351  *
 352  * \todo Refactor this into a detection phase and a work phase. Also
 353  * refactor to remove compile-time dependence on logging header.
 354  *
 355  * \param     mdlog         log file to write to
 356  * \param[in] cuda_dev      GPU device info for the GPU in use
 357  * \returns                 true if no error occurs during application clocks handling.
 358  */
 359 static gmx_bool init_gpu_application_clocks(
 360         const gmx::MDLogger &mdlog,
 361         gmx_device_info_t   *cuda_dev)
 362 {
 363     const cudaDeviceProp *prop                        = &cuda_dev->prop;
 364     int                   cuda_compute_capability     = prop->major * 10 + prop->minor;
 365     gmx_bool              bGpuCanUseApplicationClocks =
 366         ((0 == gmx_wcmatch("*Tesla*", prop->name) && cuda_compute_capability >= 35 ) ||
 367          (0 == gmx_wcmatch("*Quadro*", prop->name) && cuda_compute_capability >= 52 ));
 368     if (!bGpuCanUseApplicationClocks)
 369     {
 370         return true;
 371     }
 372 #if !HAVE_NVML
 373     GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
 374             "NOTE: GROMACS was configured without NVML support hence it can not exploit\n"
 375             "      application clocks of the detected %s GPU to improve performance.\n"
 376             "      Recompile with the NVML library (compatible with the driver used) or set application clocks manually.",
 377             prop->name);
 378     return true;
 379 #else
 380     if (!bCompiledWithApplicationClockSupport)
 381     {
 382         GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
 383                 "NOTE: GROMACS was compiled with an old NVML library which does not support\n"
 384                 "      managing application clocks of the detected %s GPU to improve performance.\n"
 385                 "      If your GPU supports application clocks, upgrade NVML (and driver) and recompile or set the clocks manually.",
 386                 prop->name );
 387         return true;
 388     }
 389
 390     /* We've compiled with NVML application clocks support, and have a GPU that can use it */
 391     nvmlReturn_t nvml_stat = NVML_SUCCESS;
 392     char        *env;
 393     //TODO: GMX_GPU_APPLICATION_CLOCKS is currently only used to enable/disable setting of application clocks
 394     //      this variable can be later used to give a user more fine grained control.
 395     env = getenv("GMX_GPU_APPLICATION_CLOCKS");
 396     if (env != NULL && ( strcmp( env, "0") == 0 ||
 397                          gmx_strcasecmp( env, "OFF") == 0 ||
 398                          gmx_strcasecmp( env, "DISABLE") == 0 ))
 399     {
 400         return true;
 401     }
 402     nvml_stat = nvmlInit();
 403     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlInit failed." );
 404     if (nvml_stat != NVML_SUCCESS)
 405     {
 406         return false;
 407     }
 408
 409     if (!addNVMLDeviceId(cuda_dev))
 410     {
 411         return false;
 412     }
 413     //get current application clocks setting
 414     if (!getApplicationClocks(cuda_dev,
 415                               &cuda_dev->nvml_orig_app_sm_clock,
 416                               &cuda_dev->nvml_orig_app_mem_clock))
 417     {
 418         return false;
 419     }
 420     //get max application clocks
 421     unsigned int max_sm_clock  = 0;
 422     unsigned int max_mem_clock = 0;
 423     nvml_stat = nvmlDeviceGetMaxClockInfo(cuda_dev->nvml_device_id, NVML_CLOCK_SM, &max_sm_clock);
 424     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetMaxClockInfo failed" );
 425     nvml_stat = nvmlDeviceGetMaxClockInfo(cuda_dev->nvml_device_id, NVML_CLOCK_MEM, &max_mem_clock);
 426     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetMaxClockInfo failed" );
 427
 428     cuda_dev->nvml_is_restricted      = NVML_FEATURE_ENABLED;
 429     cuda_dev->nvml_app_clocks_changed = false;
 430
 431     if (cuda_dev->nvml_orig_app_sm_clock >= max_sm_clock)
 432     {
 433         //TODO: This should probably be integrated into the GPU Properties table.
 434         GMX_LOG(mdlog.info).appendTextFormatted(
 435                 "Application clocks (GPU clocks) for %s are (%d,%d)",
 436                 cuda_dev->prop.name, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock);
 437         return true;
 438     }
 439
 440     if (cuda_compute_capability >= 60)
 441     {
 442         // Only warn about not being able to change clocks if they are not already at the max values
 443         if (max_mem_clock > cuda_dev->nvml_orig_app_mem_clock || max_sm_clock > cuda_dev->nvml_orig_app_sm_clock)
 444         {
 445             GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
 446                     "Cannot change application clocks for %s to optimal values due to insufficient permissions. Current values are (%d,%d), max values are (%d,%d).\nPlease contact your admin to change application clocks.\n",
 447                     cuda_dev->prop.name, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock, max_mem_clock, max_sm_clock);
 448         }
 449         return true;
 450     }
 451
 452     nvml_stat = nvmlDeviceGetAPIRestriction(cuda_dev->nvml_device_id, NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS, &(cuda_dev->nvml_is_restricted));
 453     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetAPIRestriction failed" );
 454
 455     if (nvml_stat != NVML_SUCCESS)
 456     {
 457         GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
 458                 "Cannot change GPU application clocks to optimal values due to NVML error (%d): %s.",
 459                 nvml_stat, nvmlErrorString(nvml_stat));
 460         return false;
 461     }
 462
 463     if (cuda_dev->nvml_is_restricted != NVML_FEATURE_DISABLED)
 464     {
 465         // Only warn about not being able to change clocks if they are not already at the max values
 466         if (max_mem_clock > cuda_dev->nvml_orig_app_mem_clock || max_sm_clock > cuda_dev->nvml_orig_app_sm_clock)
 467         {
 468             GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
 469                     "Cannot change application clocks for %s to optimal values due to insufficient permissions. Current values are (%d,%d), max values are (%d,%d).\nUse sudo nvidia-smi -acp UNRESTRICTED or contact your admin to change application clocks.",
 470                     cuda_dev->prop.name, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock, max_mem_clock, max_sm_clock);
 471         }
 472         return true;
 473     }
 474
 475     /* Note: Distinguishing between different types of GPUs here might be necessary in the future,
 476        e.g. if max application clocks should not be used for certain GPUs. */
 477     GMX_LOG(mdlog.warning).appendTextFormatted(
 478             "Changing GPU application clocks for %s to (%d,%d)",
 479             cuda_dev->prop.name, max_mem_clock, max_sm_clock);
 480     nvml_stat = nvmlDeviceSetApplicationsClocks(cuda_dev->nvml_device_id, max_mem_clock, max_sm_clock);
 481     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetApplicationsClock failed" );
 482     cuda_dev->nvml_app_clocks_changed = true;
 483     cuda_dev->nvml_set_app_sm_clock   = max_sm_clock;
 484     cuda_dev->nvml_set_app_mem_clock  = max_mem_clock;
 485
 486     return true;
 487 #endif /* HAVE_NVML */
 488 }
 489
 490 /*! \brief Resets application clocks if changed and cleans up NVML for the passed \gpu_dev.
 491  *
 492  * \param[in] gpu_dev  CUDA device information
 493  */
 494 static gmx_bool reset_gpu_application_clocks(const gmx_device_info_t gmx_unused * cuda_dev)
 495 {
 496 #if !HAVE_NVML_APPLICATION_CLOCKS
 497     GMX_UNUSED_VALUE(cuda_dev);
 498     return true;
 499 #else /* HAVE_NVML_APPLICATION_CLOCKS */
 500     nvmlReturn_t nvml_stat = NVML_SUCCESS;
 501     if (cuda_dev &&
 502         cuda_dev->nvml_is_restricted == NVML_FEATURE_DISABLED &&
 503         cuda_dev->nvml_app_clocks_changed)
 504     {
 505         /* Check if the clocks are still what we set them to.
 506          * If so, set them back to the state we originally found them in.
 507          * If not, don't touch them, because something else set them later.
 508          */
 509         unsigned int app_sm_clock, app_mem_clock;
 510         getApplicationClocks(cuda_dev, &app_sm_clock, &app_mem_clock);
 511         if (app_sm_clock  == cuda_dev->nvml_set_app_sm_clock &&
 512             app_mem_clock == cuda_dev->nvml_set_app_mem_clock)
 513         {
 514             nvml_stat = nvmlDeviceSetApplicationsClocks(cuda_dev->nvml_device_id, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock);
 515             HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceSetApplicationsClock failed" );
 516         }
 517     }
 518     nvml_stat = nvmlShutdown();
 519     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlShutdown failed" );
 520     return (nvml_stat == NVML_SUCCESS);
 521 #endif /* HAVE_NVML_APPLICATION_CLOCKS */
 522 }
 523
 524 void init_gpu(const gmx::MDLogger &mdlog,
 525               gmx_device_info_t   *deviceInfo)
 526 {
 527     cudaError_t stat;
 528
 529     assert(deviceInfo);
 530
 531     stat = cudaSetDevice(deviceInfo->id);
 532     if (stat != cudaSuccess)
 533     {
 534         auto message = gmx::formatString("Failed to initialize GPU #%d", deviceInfo->id);
 535         CU_RET_ERR(stat, message.c_str());
 536     }
 537
 538     if (debug)
 539     {
 540         fprintf(stderr, "Initialized GPU ID #%d: %s\n", deviceInfo->id, deviceInfo->prop.name);
 541     }
 542
 543     checkCompiledTargetCompatibility(deviceInfo);
 544
 545     //Ignoring return value as NVML errors should be treated not critical.
 546     init_gpu_application_clocks(mdlog, deviceInfo);
 547 }
 548
 549 void free_gpu(const gmx_device_info_t *deviceInfo)
 550 {
 551     // One should only attempt to clear the device context when
 552     // it has been used, but currently the only way to know that a GPU
 553     // device was used is that deviceInfo will be non-null.
 554     if (deviceInfo == nullptr)
 555     {
 556         return;
 557     }
 558
 559     cudaError_t  stat;
 560
 561     if (debug)
 562     {
 563         int gpuid;
 564         stat = cudaGetDevice(&gpuid);
 565         CU_RET_ERR(stat, "cudaGetDevice failed");
 566         fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
 567     }
 568
 569     if (!reset_gpu_application_clocks(deviceInfo))
 570     {
 571         gmx_warning("Failed to reset GPU application clocks on GPU #%d", deviceInfo->id);
 572     }
 573
 574     stat = cudaDeviceReset();
 575     if (stat != cudaSuccess)
 576     {
 577         gmx_warning("Failed to free GPU #%d: %s", deviceInfo->id, cudaGetErrorString(stat));
 578     }
 579 }
 580
 581 gmx_device_info_t *getDeviceInfo(const gmx_gpu_info_t &gpu_info,
 582                                  int                   deviceId)
 583 {
 584     if (deviceId < 0 || deviceId >= gpu_info.n_dev)
 585     {
 586         gmx_incons("Invalid GPU deviceId requested");
 587     }
 588     return &gpu_info.gpu_dev[deviceId];
 589 }
 590
 591 /*! \brief Returns true if the gpu characterized by the device properties is
 592  *  supported by the native gpu acceleration.
 593  *
 594  * \param[in] dev_prop  the CUDA device properties of the gpus to test.
 595  * \returns             true if the GPU properties passed indicate a compatible
 596  *                      GPU, otherwise false.
 597  */
 598 static bool is_gmx_supported_gpu(const cudaDeviceProp *dev_prop)
 599 {
 600     return (dev_prop->major >= 2);
 601 }
 602
 603 /*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration.
 604  *
 605  *  Returns a status value which indicates compatibility or one of the following
 606  *  errors: incompatibility, insistence, or insanity (=unexpected behavior).
 607  *  It also returns the respective device's properties in \dev_prop (if applicable).
 608  *
 609  *  As the error handling only permits returning the state of the GPU, this function
 610  *  does not clear the CUDA runtime API status allowing the caller to inspect the error
 611  *  upon return. Note that this also means it is the caller's responsibility to
 612  *  reset the CUDA runtime state.
 613  *
 614  *  \param[in]  dev_id   the ID of the GPU to check.
 615  *  \param[out] dev_prop the CUDA device properties of the device checked.
 616  *  \returns             the status of the requested device
 617  */
 618 static int is_gmx_supported_gpu_id(int dev_id, cudaDeviceProp *dev_prop)
 619 {
 620     cudaError_t stat;
 621     int         ndev;
 622
 623     stat = cudaGetDeviceCount(&ndev);
 624     if (stat != cudaSuccess)
 625     {
 626         return egpuInsane;
 627     }
 628
 629     if (dev_id > ndev - 1)
 630     {
 631         return egpuNonexistent;
 632     }
 633
 634     /* TODO: currently we do not make a distinction between the type of errors
 635      * that can appear during sanity checks. This needs to be improved, e.g if
 636      * the dummy test kernel fails to execute with a "device busy message" we
 637      * should appropriately report that the device is busy instead of insane.
 638      */
 639     if (do_sanity_checks(dev_id, dev_prop) == 0)
 640     {
 641         if (is_gmx_supported_gpu(dev_prop))
 642         {
 643             return egpuCompatible;
 644         }
 645         else
 646         {
 647             return egpuIncompatible;
 648         }
 649     }
 650     else
 651     {
 652         return egpuInsane;
 653     }
 654 }
 655
 656 bool canDetectGpus(std::string *errorMessage)
 657 {
 658     cudaError_t        stat;
 659     int                driverVersion = -1;
 660     stat = cudaDriverGetVersion(&driverVersion);
 661     GMX_ASSERT(stat != cudaErrorInvalidValue, "An impossible null pointer was passed to cudaDriverGetVersion");
 662     GMX_RELEASE_ASSERT(stat == cudaSuccess,
 663                        gmx::formatString("An unexpected value was returned from cudaDriverGetVersion %s: %s",
 664                                          cudaGetErrorName(stat), cudaGetErrorString(stat)).c_str());
 665     bool foundDriver = (driverVersion > 0);
 666     if (!foundDriver)
 667     {
 668         // Can't detect GPUs if there is no driver
 669         if (errorMessage != nullptr)
 670         {
 671             errorMessage->assign("No valid CUDA driver found");
 672         }
 673         return false;
 674     }
 675
 676     int numDevices;
 677     stat = cudaGetDeviceCount(&numDevices);
 678     if (stat != cudaSuccess)
 679     {
 680         if (errorMessage != nullptr)
 681         {
 682             /* cudaGetDeviceCount failed which means that there is
 683              * something wrong with the machine: driver-runtime
 684              * mismatch, all GPUs being busy in exclusive mode,
 685              * invalid CUDA_VISIBLE_DEVICES, or some other condition
 686              * which should result in GROMACS issuing a warning a
 687              * falling back to CPUs. */
 688             errorMessage->assign(cudaGetErrorString(stat));
 689         }
 690
 691         // Consume the error now that we have prepared to handle
 692         // it. This stops it reappearing next time we check for
 693         // errors. Note that if CUDA_VISIBLE_DEVICES does not contain
 694         // valid devices, then cudaGetLastError returns the
 695         // (undocumented) cudaErrorNoDevice, but this should not be a
 696         // problem as there should be no future CUDA API calls.
 697         // NVIDIA bug report #2038718 has been filed.
 698         cudaGetLastError();
 699         // Can't detect GPUs
 700         return false;
 701     }
 702
 703     // We don't actually use numDevices here, that's not the job of
 704     // this function.
 705     return true;
 706 }
 707
 708 void findGpus(gmx_gpu_info_t *gpu_info)
 709 {
 710     int                i, ndev, checkres;
 711     cudaError_t        stat;
 712     cudaDeviceProp     prop;
 713     gmx_device_info_t *devs;
 714
 715     assert(gpu_info);
 716
 717     gpu_info->n_dev_compatible = 0;
 718
 719     ndev    = 0;
 720     devs    = NULL;
 721
 722     stat = cudaGetDeviceCount(&ndev);
 723     if (stat != cudaSuccess)
 724     {
 725         GMX_THROW(gmx::InternalError("Invalid call of findGpus() when CUDA API returned an error, perhaps "
 726                                      "canDetectGpus() was not called appropriately beforehand."));
 727     }
 728
 729     // We expect to start device support/sanity checks with a clean runtime error state
 730     gmx::ensureNoPendingCudaError("");
 731
 732     snew(devs, ndev);
 733     for (i = 0; i < ndev; i++)
 734     {
 735         checkres = is_gmx_supported_gpu_id(i, &prop);
 736
 737         devs[i].id   = i;
 738         devs[i].prop = prop;
 739         devs[i].stat = checkres;
 740
 741         if (checkres == egpuCompatible)
 742         {
 743             gpu_info->n_dev_compatible++;
 744         }
 745         else
 746         {
 747             // TODO:
 748             //  - we inspect the CUDA API state to retrieve and record any
 749             //    errors that occurred during is_gmx_supported_gpu_id() here,
 750             //    but this would be more elegant done within is_gmx_supported_gpu_id()
 751             //    and only return a string with the error if one was encountered.
 752             //  - we'll be reporting without rank information which is not ideal.
 753             //  - we'll end up warning also in cases where users would already
 754             //    get an error before mdrun aborts.
 755             //
 756             // Here we also clear the CUDA API error state so potential
 757             // errors during sanity checks don't propagate.
 758             if ((stat = cudaGetLastError()) != cudaSuccess)
 759             {
 760                 gmx_warning(gmx::formatString("An error occurred while sanity checking device #%d; %s: %s",
 761                                               devs[i].id, cudaGetErrorName(stat), cudaGetErrorString(stat)).c_str());
 762             }
 763         }
 764     }
 765     GMX_RELEASE_ASSERT(cudaSuccess == cudaPeekAtLastError(), "We promise to return with clean CUDA state!");
 766
 767     gpu_info->n_dev   = ndev;
 768     gpu_info->gpu_dev = devs;
 769 }
 770
 771 std::vector<int> getCompatibleGpus(const gmx_gpu_info_t &gpu_info)
 772 {
 773     // Possible minor over-allocation here, but not important for anything
 774     std::vector<int> compatibleGpus;
 775     compatibleGpus.reserve(gpu_info.n_dev);
 776     for (int i = 0; i < gpu_info.n_dev; i++)
 777     {
 778         assert(gpu_info.gpu_dev);
 779         if (gpu_info.gpu_dev[i].stat == egpuCompatible)
 780         {
 781             compatibleGpus.push_back(i);
 782         }
 783     }
 784     return compatibleGpus;
 785 }
 786
 787 const char *getGpuCompatibilityDescription(const gmx_gpu_info_t &gpu_info,
 788                                            int                   index)
 789 {
 790     return (index >= gpu_info.n_dev ?
 791             gpu_detect_res_str[egpuNonexistent] :
 792             gpu_detect_res_str[gpu_info.gpu_dev[index].stat]);
 793 }
 794
 795 void free_gpu_info(const gmx_gpu_info_t *gpu_info)
 796 {
 797     if (gpu_info == NULL)
 798     {
 799         return;
 800     }
 801
 802     sfree(gpu_info->gpu_dev);
 803 }
 804
 805 void get_gpu_device_info_string(char *s, const gmx_gpu_info_t &gpu_info, int index)
 806 {
 807     assert(s);
 808
 809     if (index < 0 && index >= gpu_info.n_dev)
 810     {
 811         return;
 812     }
 813
 814     gmx_device_info_t *dinfo = &gpu_info.gpu_dev[index];
 815
 816     bool               bGpuExists =
 817         dinfo->stat == egpuCompatible ||
 818         dinfo->stat == egpuIncompatible;
 819
 820     if (!bGpuExists)
 821     {
 822         sprintf(s, "#%d: %s, stat: %s",
 823                 dinfo->id, "N/A",
 824                 gpu_detect_res_str[dinfo->stat]);
 825     }
 826     else
 827     {
 828         sprintf(s, "#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s",
 829                 dinfo->id, dinfo->prop.name,
 830                 dinfo->prop.major, dinfo->prop.minor,
 831                 dinfo->prop.ECCEnabled ? "yes" : " no",
 832                 gpu_detect_res_str[dinfo->stat]);
 833     }
 834 }
 835
 836 int get_current_cuda_gpu_device_id(void)
 837 {
 838     int gpuid;
 839     CU_RET_ERR(cudaGetDevice(&gpuid), "cudaGetDevice failed");
 840
 841     return gpuid;
 842 }
 843
 844 size_t sizeof_gpu_dev_info(void)
 845 {
 846     return sizeof(gmx_device_info_t);
 847 }
 848
 849 void gpu_set_host_malloc_and_free(bool               bUseGpuKernels,
 850                                   gmx_host_alloc_t **nb_alloc,
 851                                   gmx_host_free_t  **nb_free)
 852 {
 853     if (bUseGpuKernels)
 854     {
 855         *nb_alloc = &pmalloc;
 856         *nb_free  = &pfree;
 857     }
 858     else
 859     {
 860         *nb_alloc = NULL;
 861         *nb_free  = NULL;
 862     }
 863 }
 864
 865 void startGpuProfiler(void)
 866 {
 867     /* The NVPROF_ID environment variable is set by nvprof and indicates that
 868        mdrun is executed in the CUDA profiler.
 869        If nvprof was run is with "--profile-from-start off", the profiler will
 870        be started here. This way we can avoid tracing the CUDA events from the
 871        first part of the run. Starting the profiler again does nothing.
 872      */
 873     if (cudaProfilerRun)
 874     {
 875         cudaError_t stat;
 876         stat = cudaProfilerStart();
 877         CU_RET_ERR(stat, "cudaProfilerStart failed");
 878     }
 879 }
 880
 881 void stopGpuProfiler(void)
 882 {
 883     /* Stopping the nvidia here allows us to eliminate the subsequent
 884        API calls from the trace, e.g. uninitialization and cleanup. */
 885     if (cudaProfilerRun)
 886     {
 887         cudaError_t stat;
 888         stat = cudaProfilerStop();
 889         CU_RET_ERR(stat, "cudaProfilerStop failed");
 890     }
 891 }
 892
 893 void resetGpuProfiler(void)
 894 {
 895     /* With CUDA <=7.5 the profiler can't be properly reset; we can only start
 896      *  the profiling here (can't stop it) which will achieve the desired effect if
 897      *  the run was started with the profiling disabled.
 898      *
 899      * TODO: add a stop (or replace it with reset) when this will work correctly in CUDA.
 900      * stopGpuProfiler();
 901      */
 902     if (cudaProfilerRun)
 903     {
 904         startGpuProfiler();
 905     }
 906 }