src/gromacs/gpu_utils/gpu_utils.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2010,2011,2012,2013,2014,2015,2016, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \file
  36  *  \brief Define functions for detection and initialization for CUDA devices.
  37  *
  38  *  \author Szilard Pall <pall.szilard@gmail.com>
  39  */
  40
  41 #include "gmxpre.h"
  42
  43 #include "gpu_utils.h"
  44
  45 #include "config.h"
  46
  47 #include <assert.h>
  48 #include <stdio.h>
  49 #include <stdlib.h>
  50
  51 #include <cuda_profiler_api.h>
  52
  53 #include "gromacs/gpu_utils/cudautils.cuh"
  54 #include "gromacs/gpu_utils/pmalloc_cuda.h"
  55 #include "gromacs/hardware/gpu_hw_info.h"
  56 #include "gromacs/utility/basedefinitions.h"
  57 #include "gromacs/utility/cstringutil.h"
  58 #include "gromacs/utility/smalloc.h"
  59
  60 #if HAVE_NVML
  61 #include <nvml.h>
  62 #define HAVE_NVML_APPLICATION_CLOCKS (NVML_API_VERSION >= 6)
  63 #else  /* HAVE_NVML */
  64 #define HAVE_NVML_APPLICATION_CLOCKS 0
  65 #endif /* HAVE_NVML */
  66
  67 #if defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS
  68 /*! Check for NVML error on the return status of a NVML API call. */
  69 #  define HANDLE_NVML_RET_ERR(status, msg) \
  70     do { \
  71         if (status != NVML_SUCCESS) \
  72         { \
  73             gmx_warning("%s: %s\n", msg, nvmlErrorString(status)); \
  74         } \
  75     } while (0)
  76 #else  /* defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS */
  77 #  define HANDLE_NVML_RET_ERR(status, msg) do { } while (0)
  78 #endif /* defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS */
  79
  80 #if HAVE_NVML_APPLICATION_CLOCKS
  81 static const gmx_bool            bCompiledWithApplicationClockSupport = true;
  82 #else
  83 static const gmx_bool gmx_unused bCompiledWithApplicationClockSupport = false;
  84 #endif
  85
  86 /*! \internal \brief
  87  * Max number of devices supported by CUDA (for consistency checking).
  88  *
  89  * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side.
  90  */
  91 static int  cuda_max_device_count = 32;
  92
  93 static bool cudaProfilerRun      = ((getenv("NVPROF_ID") != NULL));
  94
  95 /** Dummy kernel used for sanity checking. */
  96 __global__ void k_dummy_test()
  97 {
  98 }
  99
 100
 101 /*!
 102  * \brief Runs GPU sanity checks.
 103  *
 104  * Runs a series of checks to determine that the given GPU and underlying CUDA
 105  * driver/runtime functions properly.
 106  * Returns properties of a device with given ID or the one that has
 107  * already been initialized earlier in the case if of \dev_id == -1.
 108  *
 109  * \param[in]  dev_id      the device ID of the GPU or -1 if the device has already been initialized
 110  * \param[out] dev_prop    pointer to the structure in which the device properties will be returned
 111  * \returns                0 if the device looks OK
 112  *
 113  * TODO: introduce errors codes and handle errors more smoothly.
 114  */
 115 static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop)
 116 {
 117     cudaError_t cu_err;
 118     int         dev_count, id;
 119
 120     cu_err = cudaGetDeviceCount(&dev_count);
 121     if (cu_err != cudaSuccess)
 122     {
 123         fprintf(stderr, "Error %d while querying device count: %s\n", cu_err,
 124                 cudaGetErrorString(cu_err));
 125         return -1;
 126     }
 127
 128     /* no CUDA compatible device at all */
 129     if (dev_count == 0)
 130     {
 131         return -1;
 132     }
 133
 134     /* things might go horribly wrong if cudart is not compatible with the driver */
 135     if (dev_count < 0 || dev_count > cuda_max_device_count)
 136     {
 137         return -1;
 138     }
 139
 140     if (dev_id == -1) /* device already selected let's not destroy the context */
 141     {
 142         cu_err = cudaGetDevice(&id);
 143         if (cu_err != cudaSuccess)
 144         {
 145             fprintf(stderr, "Error %d while querying device id: %s\n", cu_err,
 146                     cudaGetErrorString(cu_err));
 147             return -1;
 148         }
 149     }
 150     else
 151     {
 152         id = dev_id;
 153         if (id > dev_count - 1) /* pfff there's no such device */
 154         {
 155             fprintf(stderr, "The requested device with id %d does not seem to exist (device count=%d)\n",
 156                     dev_id, dev_count);
 157             return -1;
 158         }
 159     }
 160
 161     memset(dev_prop, 0, sizeof(cudaDeviceProp));
 162     cu_err = cudaGetDeviceProperties(dev_prop, id);
 163     if (cu_err != cudaSuccess)
 164     {
 165         fprintf(stderr, "Error %d while querying device properties: %s\n", cu_err,
 166                 cudaGetErrorString(cu_err));
 167         return -1;
 168     }
 169
 170     /* both major & minor is 9999 if no CUDA capable devices are present */
 171     if (dev_prop->major == 9999 && dev_prop->minor == 9999)
 172     {
 173         return -1;
 174     }
 175     /* we don't care about emulation mode */
 176     if (dev_prop->major == 0)
 177     {
 178         return -1;
 179     }
 180
 181     if (id != -1)
 182     {
 183         cu_err = cudaSetDevice(id);
 184         if (cu_err != cudaSuccess)
 185         {
 186             fprintf(stderr, "Error %d while switching to device #%d: %s\n",
 187                     cu_err, id, cudaGetErrorString(cu_err));
 188             return -1;
 189         }
 190     }
 191
 192     /* try to execute a dummy kernel */
 193     k_dummy_test<<< 1, 512>>> ();
 194     if (cudaThreadSynchronize() != cudaSuccess)
 195     {
 196         return -1;
 197     }
 198
 199     /* destroy context if we created one */
 200     if (id != -1)
 201     {
 202         cu_err = cudaDeviceReset();
 203         CU_RET_ERR(cu_err, "cudaDeviceReset failed");
 204     }
 205
 206     return 0;
 207 }
 208
 209 #if HAVE_NVML
 210 /* TODO: We should actually be using md_print_warn in md_logging.c,
 211  * but we can't include mpi.h in CUDA code.
 212  */
 213 static void md_print_info(FILE       *fplog,
 214                           const char *fmt, ...)
 215 {
 216     va_list ap;
 217
 218     if (fplog != NULL)
 219     {
 220         /* We should only print to stderr on the master node,
 221          * in most cases fplog is only set on the master node, so this works.
 222          */
 223         va_start(ap, fmt);
 224         vfprintf(stderr, fmt, ap);
 225         va_end(ap);
 226
 227         va_start(ap, fmt);
 228         vfprintf(fplog, fmt, ap);
 229         va_end(ap);
 230     }
 231 }
 232 #endif /*HAVE_NVML*/
 233
 234 /* TODO: We should actually be using md_print_warn in md_logging.c,
 235  * but we can't include mpi.h in CUDA code.
 236  * This is replicated from nbnxn_cuda_data_mgmt.cu.
 237  */
 238 static void md_print_warn(FILE       *fplog,
 239                           const char *fmt, ...)
 240 {
 241     va_list ap;
 242
 243     if (fplog != NULL)
 244     {
 245         /* We should only print to stderr on the master node,
 246          * in most cases fplog is only set on the master node, so this works.
 247          */
 248         va_start(ap, fmt);
 249         fprintf(stderr, "\n");
 250         vfprintf(stderr, fmt, ap);
 251         fprintf(stderr, "\n");
 252         va_end(ap);
 253
 254         va_start(ap, fmt);
 255         fprintf(fplog, "\n");
 256         vfprintf(fplog, fmt, ap);
 257         fprintf(fplog, "\n");
 258         va_end(ap);
 259     }
 260 }
 261
 262 #if HAVE_NVML_APPLICATION_CLOCKS
 263 /*! \brief Determines and adds the NVML device ID to the passed \cuda_dev.
 264  *
 265  * Determines and adds the NVML device ID to the passed \cuda_dev. This is done by
 266  * matching PCI-E information from \cuda_dev with the available NVML devices.
 267  *
 268  * \param[in,out] cuda_dev  CUDA device information to enrich with NVML device info
 269  * \returns                 true if \cuda_dev could be enriched with matching NVML device information.
 270  */
 271 static bool addNVMLDeviceId(gmx_device_info_t* cuda_dev)
 272 {
 273     nvmlDevice_t nvml_device_id;
 274     unsigned int nvml_device_count = 0;
 275     nvmlReturn_t nvml_stat         = nvmlDeviceGetCount ( &nvml_device_count );
 276     cuda_dev->nvml_initialized = false;
 277     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetCount failed" );
 278     for (unsigned int nvml_device_idx = 0; nvml_stat == NVML_SUCCESS && nvml_device_idx < nvml_device_count; ++nvml_device_idx)
 279     {
 280         nvml_stat = nvmlDeviceGetHandleByIndex ( nvml_device_idx, &nvml_device_id );
 281         HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetHandleByIndex failed" );
 282         if (nvml_stat != NVML_SUCCESS)
 283         {
 284             break;
 285         }
 286
 287         nvmlPciInfo_t nvml_pci_info;
 288         nvml_stat = nvmlDeviceGetPciInfo ( nvml_device_id, &nvml_pci_info );
 289         HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetPciInfo failed" );
 290         if (nvml_stat != NVML_SUCCESS)
 291         {
 292             break;
 293         }
 294         if (static_cast<unsigned int>(cuda_dev->prop.pciBusID) == nvml_pci_info.bus &&
 295             static_cast<unsigned int>(cuda_dev->prop.pciDeviceID) == nvml_pci_info.device &&
 296             static_cast<unsigned int>(cuda_dev->prop.pciDomainID) == nvml_pci_info.domain)
 297         {
 298             cuda_dev->nvml_initialized = true;
 299             cuda_dev->nvml_device_id   = nvml_device_id;
 300             break;
 301         }
 302     }
 303     return cuda_dev->nvml_initialized;
 304 }
 305
 306 /*! \brief Reads and returns the application clocks for device.
 307  *
 308  * \param[in]  device        The GPU device
 309  * \param[out] app_sm_clock  The current application SM clock
 310  * \param[out] app_mem_clock The current application memory clock
 311  * \returns if applacation clocks are supported
 312  */
 313 static bool getApplicationClocks(const gmx_device_info_t *cuda_dev,
 314                                  unsigned int            *app_sm_clock,
 315                                  unsigned int            *app_mem_clock)
 316 {
 317     nvmlReturn_t nvml_stat;
 318
 319     nvml_stat = nvmlDeviceGetApplicationsClock(cuda_dev->nvml_device_id, NVML_CLOCK_SM, app_sm_clock);
 320     if (NVML_ERROR_NOT_SUPPORTED == nvml_stat)
 321     {
 322         return false;
 323     }
 324     HANDLE_NVML_RET_ERR(nvml_stat, "nvmlDeviceGetApplicationsClock failed");
 325     nvml_stat = nvmlDeviceGetApplicationsClock(cuda_dev->nvml_device_id, NVML_CLOCK_MEM, app_mem_clock);
 326     HANDLE_NVML_RET_ERR(nvml_stat, "nvmlDeviceGetApplicationsClock failed");
 327
 328     return true;
 329 }
 330 #endif /* HAVE_NVML_APPLICATION_CLOCKS */
 331
 332 /*! \brief Tries to set application clocks for the GPU with the given index.
 333  *
 334  * The variable \gpuid is the index of the GPU in the gpu_info.cuda_dev array
 335  * to handle the application clocks for. Application clocks are set to the
 336  * max supported value to increase performance if application clock permissions
 337  * allow this. For future GPU architectures a more sophisticated scheme might be
 338  * required.
 339  *
 340  * \param[out] fplog        log file to write to
 341  * \param[in] gpuid         index of the GPU to set application clocks for
 342  * \param[in] gpu_info      GPU info of all detected devices in the system.
 343  * \returns                 true if no error occurs during application clocks handling.
 344  */
 345 static gmx_bool init_gpu_application_clocks(FILE gmx_unused *fplog, int gmx_unused gpuid, const gmx_gpu_info_t gmx_unused *gpu_info)
 346 {
 347     const cudaDeviceProp *prop                        = &gpu_info->gpu_dev[gpuid].prop;
 348     int                   cuda_version_number         = prop->major * 10 + prop->minor;
 349     gmx_bool              bGpuCanUseApplicationClocks =
 350         ((0 == gmx_wcmatch("*Tesla*", prop->name) && cuda_version_number >= 35 ) ||
 351          (0 == gmx_wcmatch("*Quadro*", prop->name) && cuda_version_number >= 52 ));
 352     if (!bGpuCanUseApplicationClocks)
 353     {
 354         return true;
 355     }
 356 #if !HAVE_NVML
 357     int cuda_driver  = 0;
 358     int cuda_runtime = 0;
 359     cudaDriverGetVersion(&cuda_driver);
 360     cudaRuntimeGetVersion(&cuda_runtime);
 361     md_print_warn( fplog, "NOTE: GROMACS was configured without NVML support hence it can not exploit\n"
 362                    "      application clocks of the detected %s GPU to improve performance.\n"
 363                    "      Recompile with the NVML library (compatible with the driver used) or set application clocks manually.\n",
 364                    prop->name);
 365     return true;
 366 #else
 367     if (!bCompiledWithApplicationClockSupport)
 368     {
 369         int cuda_driver  = 0;
 370         int cuda_runtime = 0;
 371         cudaDriverGetVersion(&cuda_driver);
 372         cudaRuntimeGetVersion(&cuda_runtime);
 373         md_print_warn( fplog, "NOTE: GROMACS was compiled with an old NVML library which does not support\n"
 374                        "      managing application clocks of the detected %s GPU to improve performance.\n"
 375                        "      If your GPU supports application clocks, upgrade NVML (and driver) and recompile or set the clocks manually.\n",
 376                        prop->name );
 377         return true;
 378     }
 379
 380     /* We've compiled with NVML application clocks support, and have a GPU that can use it */
 381     nvmlReturn_t nvml_stat = NVML_SUCCESS;
 382     char        *env;
 383     //TODO: GMX_GPU_APPLICATION_CLOCKS is currently only used to enable/disable setting of application clocks
 384     //      this variable can be later used to give a user more fine grained control.
 385     env = getenv("GMX_GPU_APPLICATION_CLOCKS");
 386     if (env != NULL && ( strcmp( env, "0") == 0 ||
 387                          gmx_strcasecmp( env, "OFF") == 0 ||
 388                          gmx_strcasecmp( env, "DISABLE") == 0 ))
 389     {
 390         return true;
 391     }
 392     nvml_stat = nvmlInit();
 393     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlInit failed." );
 394     if (nvml_stat != NVML_SUCCESS)
 395     {
 396         return false;
 397     }
 398
 399     gmx_device_info_t *cuda_dev = &(gpu_info->gpu_dev[gpuid]);
 400
 401     if (!addNVMLDeviceId(cuda_dev))
 402     {
 403         return false;
 404     }
 405     //get current application clocks setting
 406     if (!getApplicationClocks(cuda_dev,
 407                               &cuda_dev->nvml_orig_app_sm_clock,
 408                               &cuda_dev->nvml_orig_app_mem_clock))
 409     {
 410         return false;
 411     }
 412     //get max application clocks
 413     unsigned int max_sm_clock  = 0;
 414     unsigned int max_mem_clock = 0;
 415     nvml_stat = nvmlDeviceGetMaxClockInfo(cuda_dev->nvml_device_id, NVML_CLOCK_SM, &max_sm_clock);
 416     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetMaxClockInfo failed" );
 417     nvml_stat = nvmlDeviceGetMaxClockInfo(cuda_dev->nvml_device_id, NVML_CLOCK_MEM, &max_mem_clock);
 418     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetMaxClockInfo failed" );
 419
 420     cuda_dev->nvml_is_restricted      = NVML_FEATURE_ENABLED;
 421     cuda_dev->nvml_app_clocks_changed = false;
 422
 423     nvml_stat = nvmlDeviceGetAPIRestriction(cuda_dev->nvml_device_id, NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS, &(cuda_dev->nvml_is_restricted));
 424     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetAPIRestriction failed" );
 425
 426     /* Note: Distinguishing between different types of GPUs here might be necessary in the future,
 427        e.g. if max application clocks should not be used for certain GPUs. */
 428     if (nvml_stat == NVML_SUCCESS && cuda_dev->nvml_orig_app_sm_clock < max_sm_clock && cuda_dev->nvml_is_restricted == NVML_FEATURE_DISABLED)
 429     {
 430         md_print_info(fplog, "Changing GPU application clocks for %s to (%d,%d)\n", cuda_dev->prop.name, max_mem_clock, max_sm_clock);
 431         nvml_stat = nvmlDeviceSetApplicationsClocks(cuda_dev->nvml_device_id, max_mem_clock, max_sm_clock);
 432         HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetApplicationsClock failed" );
 433         cuda_dev->nvml_app_clocks_changed = true;
 434         cuda_dev->nvml_set_app_sm_clock   = max_sm_clock;
 435         cuda_dev->nvml_set_app_mem_clock  = max_mem_clock;
 436     }
 437     else if (nvml_stat == NVML_SUCCESS && cuda_dev->nvml_orig_app_sm_clock < max_sm_clock)
 438     {
 439         md_print_warn(fplog, "Can not change application clocks for %s to optimal values due to insufficient permissions. Current values are (%d,%d), max values are (%d,%d).\nUse sudo nvidia-smi -acp UNRESTRICTED or contact your admin to change application clocks.\n", cuda_dev->prop.name, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock, max_mem_clock, max_sm_clock);
 440     }
 441     else if (nvml_stat == NVML_SUCCESS && cuda_dev->nvml_orig_app_sm_clock == max_sm_clock)
 442     {
 443         md_print_info(fplog, "Application clocks (GPU clocks) for %s are (%d,%d)\n", cuda_dev->prop.name, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock);
 444     }
 445     else
 446     {
 447         md_print_warn( fplog,  "Can not change GPU application clocks to optimal values due to NVML error (%d): %s.\n", nvml_stat, nvmlErrorString(nvml_stat));
 448     }
 449     return (nvml_stat == NVML_SUCCESS);
 450 #endif /* HAVE_NVML */
 451 }
 452
 453 /*! \brief Resets application clocks if changed and cleans up NVML for the passed \gpu_dev.
 454  *
 455  * \param[in] gpu_dev  CUDA device information
 456  */
 457 static gmx_bool reset_gpu_application_clocks(const gmx_device_info_t gmx_unused * cuda_dev)
 458 {
 459 #if !HAVE_NVML_APPLICATION_CLOCKS
 460     GMX_UNUSED_VALUE(cuda_dev);
 461     return true;
 462 #else /* HAVE_NVML_APPLICATION_CLOCKS */
 463     nvmlReturn_t nvml_stat = NVML_SUCCESS;
 464     if (cuda_dev &&
 465         cuda_dev->nvml_is_restricted == NVML_FEATURE_DISABLED &&
 466         cuda_dev->nvml_app_clocks_changed)
 467     {
 468         /* Check if the clocks are still what we set them to.
 469          * If so, set them back to the state we originally found them in.
 470          * If not, don't touch them, because something else set them later.
 471          */
 472         unsigned int app_sm_clock, app_mem_clock;
 473         getApplicationClocks(cuda_dev, &app_sm_clock, &app_mem_clock);
 474         if (app_sm_clock  == cuda_dev->nvml_set_app_sm_clock &&
 475             app_mem_clock == cuda_dev->nvml_set_app_mem_clock)
 476         {
 477             nvml_stat = nvmlDeviceSetApplicationsClocks(cuda_dev->nvml_device_id, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock);
 478             HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetApplicationsClock failed" );
 479         }
 480     }
 481     nvml_stat = nvmlShutdown();
 482     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlShutdown failed" );
 483     return (nvml_stat == NVML_SUCCESS);
 484 #endif /* HAVE_NVML_APPLICATION_CLOCKS */
 485 }
 486
 487 gmx_bool init_gpu(FILE gmx_unused *fplog, int mygpu, char *result_str,
 488                   const struct gmx_gpu_info_t *gpu_info,
 489                   const struct gmx_gpu_opt_t *gpu_opt)
 490 {
 491     cudaError_t stat;
 492     char        sbuf[STRLEN];
 493     int         gpuid;
 494
 495     assert(gpu_info);
 496     assert(result_str);
 497
 498     if (mygpu < 0 || mygpu >= gpu_opt->n_dev_use)
 499     {
 500         sprintf(sbuf, "Trying to initialize an inexistent GPU: "
 501                 "there are %d %s-selected GPU(s), but #%d was requested.",
 502                 gpu_opt->n_dev_use, gpu_opt->bUserSet ? "user" : "auto", mygpu);
 503         gmx_incons(sbuf);
 504     }
 505
 506     gpuid = gpu_info->gpu_dev[gpu_opt->dev_use[mygpu]].id;
 507
 508     stat = cudaSetDevice(gpuid);
 509     strncpy(result_str, cudaGetErrorString(stat), STRLEN);
 510
 511     if (debug)
 512     {
 513         fprintf(stderr, "Initialized GPU ID #%d: %s\n", gpuid, gpu_info->gpu_dev[gpuid].prop.name);
 514     }
 515
 516     //Ignoring return value as NVML errors should be treated not critical.
 517     if (stat == cudaSuccess)
 518     {
 519         init_gpu_application_clocks(fplog, gpuid, gpu_info);
 520     }
 521     return (stat == cudaSuccess);
 522 }
 523
 524 gmx_bool free_cuda_gpu(
 525         int gmx_unused mygpu, char *result_str,
 526         const gmx_gpu_info_t gmx_unused *gpu_info,
 527         const gmx_gpu_opt_t gmx_unused *gpu_opt
 528         )
 529 {
 530     cudaError_t  stat;
 531     gmx_bool     reset_gpu_application_clocks_status = true;
 532     int          gpuid;
 533
 534     assert(result_str);
 535
 536     if (debug)
 537     {
 538         int gpuid;
 539         stat = cudaGetDevice(&gpuid);
 540         CU_RET_ERR(stat, "cudaGetDevice failed");
 541         fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
 542     }
 543
 544     gpuid = gpu_opt ? gpu_opt->dev_use[mygpu] : -1;
 545     if (gpuid != -1)
 546     {
 547         reset_gpu_application_clocks_status = reset_gpu_application_clocks( &(gpu_info->gpu_dev[gpuid]) );
 548     }
 549
 550     stat = cudaDeviceReset();
 551     strncpy(result_str, cudaGetErrorString(stat), STRLEN);
 552     return (stat == cudaSuccess) && reset_gpu_application_clocks_status;
 553 }
 554
 555 /*! \brief Returns true if the gpu characterized by the device properties is
 556  *  supported by the native gpu acceleration.
 557  *
 558  * \param[in] dev_prop  the CUDA device properties of the gpus to test.
 559  * \returns             true if the GPU properties passed indicate a compatible
 560  *                      GPU, otherwise false.
 561  */
 562 static bool is_gmx_supported_gpu(const cudaDeviceProp *dev_prop)
 563 {
 564     return (dev_prop->major >= 2);
 565 }
 566
 567 /*! \brief Helper function that checks whether a given GPU status indicates compatible GPU.
 568  *
 569  * \param[in] stat  GPU status.
 570  * \returns         true if the provided status is egpuCompatible, otherwise false.
 571  */
 572 static bool is_compatible_gpu(int stat)
 573 {
 574     return (stat == egpuCompatible);
 575 }
 576
 577 /*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration.
 578  *
 579  *  Returns a status value which indicates compatibility or one of the following
 580  *  errors: incompatibility, insistence, or insanity (=unexpected behavior).
 581  *  It also returns the respective device's properties in \dev_prop (if applicable).
 582  *
 583  *  \param[in]  dev_id   the ID of the GPU to check.
 584  *  \param[out] dev_prop the CUDA device properties of the device checked.
 585  *  \returns             the status of the requested device
 586  */
 587 static int is_gmx_supported_gpu_id(int dev_id, cudaDeviceProp *dev_prop)
 588 {
 589     cudaError_t stat;
 590     int         ndev;
 591
 592     stat = cudaGetDeviceCount(&ndev);
 593     if (stat != cudaSuccess)
 594     {
 595         return egpuInsane;
 596     }
 597
 598     if (dev_id > ndev - 1)
 599     {
 600         return egpuNonexistent;
 601     }
 602
 603     /* TODO: currently we do not make a distinction between the type of errors
 604      * that can appear during sanity checks. This needs to be improved, e.g if
 605      * the dummy test kernel fails to execute with a "device busy message" we
 606      * should appropriately report that the device is busy instead of insane.
 607      */
 608     if (do_sanity_checks(dev_id, dev_prop) == 0)
 609     {
 610         if (is_gmx_supported_gpu(dev_prop))
 611         {
 612             return egpuCompatible;
 613         }
 614         else
 615         {
 616             return egpuIncompatible;
 617         }
 618     }
 619     else
 620     {
 621         return egpuInsane;
 622     }
 623 }
 624
 625
 626 int detect_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
 627 {
 628     int                i, ndev, checkres, retval;
 629     cudaError_t        stat;
 630     cudaDeviceProp     prop;
 631     gmx_device_info_t *devs;
 632
 633     assert(gpu_info);
 634     assert(err_str);
 635
 636     gpu_info->n_dev_compatible = 0;
 637
 638     ndev    = 0;
 639     devs    = NULL;
 640
 641     stat = cudaGetDeviceCount(&ndev);
 642     if (stat != cudaSuccess)
 643     {
 644         const char *s;
 645
 646         /* cudaGetDeviceCount failed which means that there is something
 647          * wrong with the machine: driver-runtime mismatch, all GPUs being
 648          * busy in exclusive mode, or some other condition which should
 649          * result in us issuing a warning a falling back to CPUs. */
 650         retval = -1;
 651         s      = cudaGetErrorString(stat);
 652         strncpy(err_str, s, STRLEN*sizeof(err_str[0]));
 653     }
 654     else
 655     {
 656         snew(devs, ndev);
 657         for (i = 0; i < ndev; i++)
 658         {
 659             checkres = is_gmx_supported_gpu_id(i, &prop);
 660
 661             devs[i].id   = i;
 662             devs[i].prop = prop;
 663             devs[i].stat = checkres;
 664
 665             if (checkres == egpuCompatible)
 666             {
 667                 gpu_info->n_dev_compatible++;
 668             }
 669         }
 670         retval = 0;
 671     }
 672
 673     gpu_info->n_dev   = ndev;
 674     gpu_info->gpu_dev = devs;
 675
 676     return retval;
 677 }
 678
 679 void pick_compatible_gpus(const gmx_gpu_info_t *gpu_info,
 680                           gmx_gpu_opt_t        *gpu_opt)
 681 {
 682     int  i, ncompat;
 683     int *compat;
 684
 685     assert(gpu_info);
 686     /* gpu_dev/n_dev have to be either NULL/0 or not (NULL/0) */
 687     assert((gpu_info->n_dev != 0 ? 0 : 1) ^ (gpu_info->gpu_dev == NULL ? 0 : 1));
 688
 689     snew(compat, gpu_info->n_dev);
 690     ncompat = 0;
 691     for (i = 0; i < gpu_info->n_dev; i++)
 692     {
 693         if (is_compatible_gpu(gpu_info->gpu_dev[i].stat))
 694         {
 695             ncompat++;
 696             compat[ncompat - 1] = i;
 697         }
 698     }
 699
 700     gpu_opt->n_dev_compatible = ncompat;
 701     snew(gpu_opt->dev_compatible, ncompat);
 702     memcpy(gpu_opt->dev_compatible, compat, ncompat*sizeof(*compat));
 703     sfree(compat);
 704 }
 705
 706 gmx_bool check_selected_gpus(int                  *checkres,
 707                              const gmx_gpu_info_t *gpu_info,
 708                              gmx_gpu_opt_t        *gpu_opt)
 709 {
 710     int  i, id;
 711     bool bAllOk;
 712
 713     assert(checkres);
 714     assert(gpu_info);
 715     assert(gpu_opt->n_dev_use >= 0);
 716
 717     if (gpu_opt->n_dev_use == 0)
 718     {
 719         return TRUE;
 720     }
 721
 722     assert(gpu_opt->dev_use);
 723
 724     /* we will assume that all GPUs requested are valid IDs,
 725        otherwise we'll bail anyways */
 726
 727     bAllOk = true;
 728     for (i = 0; i < gpu_opt->n_dev_use; i++)
 729     {
 730         id = gpu_opt->dev_use[i];
 731
 732         /* devices are stored in increasing order of IDs in gpu_dev */
 733         gpu_opt->dev_use[i] = id;
 734
 735         checkres[i] = (id >= gpu_info->n_dev) ?
 736             egpuNonexistent : gpu_info->gpu_dev[id].stat;
 737
 738         bAllOk = bAllOk && is_compatible_gpu(checkres[i]);
 739     }
 740
 741     return bAllOk;
 742 }
 743
 744 void free_gpu_info(const gmx_gpu_info_t *gpu_info)
 745 {
 746     if (gpu_info == NULL)
 747     {
 748         return;
 749     }
 750
 751     sfree(gpu_info->gpu_dev);
 752 }
 753
 754 void get_gpu_device_info_string(char *s, const gmx_gpu_info_t *gpu_info, int index)
 755 {
 756     assert(s);
 757     assert(gpu_info);
 758
 759     if (index < 0 && index >= gpu_info->n_dev)
 760     {
 761         return;
 762     }
 763
 764     gmx_device_info_t *dinfo = &gpu_info->gpu_dev[index];
 765
 766     bool               bGpuExists =
 767         dinfo->stat == egpuCompatible ||
 768         dinfo->stat == egpuIncompatible;
 769
 770     if (!bGpuExists)
 771     {
 772         sprintf(s, "#%d: %s, stat: %s",
 773                 dinfo->id, "N/A",
 774                 gpu_detect_res_str[dinfo->stat]);
 775     }
 776     else
 777     {
 778         sprintf(s, "#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s",
 779                 dinfo->id, dinfo->prop.name,
 780                 dinfo->prop.major, dinfo->prop.minor,
 781                 dinfo->prop.ECCEnabled ? "yes" : " no",
 782                 gpu_detect_res_str[dinfo->stat]);
 783     }
 784 }
 785
 786 int get_gpu_device_id(const gmx_gpu_info_t *gpu_info,
 787                       const gmx_gpu_opt_t  *gpu_opt,
 788                       int                   idx)
 789 {
 790     assert(gpu_info);
 791     assert(gpu_opt);
 792     assert(idx >= 0 && idx < gpu_opt->n_dev_use);
 793
 794     return gpu_info->gpu_dev[gpu_opt->dev_use[idx]].id;
 795 }
 796
 797 int get_current_cuda_gpu_device_id(void)
 798 {
 799     int gpuid;
 800     CU_RET_ERR(cudaGetDevice(&gpuid), "cudaGetDevice failed");
 801
 802     return gpuid;
 803 }
 804
 805 size_t sizeof_gpu_dev_info(void)
 806 {
 807     return sizeof(gmx_device_info_t);
 808 }
 809
 810 void gpu_set_host_malloc_and_free(bool               bUseGpuKernels,
 811                                   gmx_host_alloc_t **nb_alloc,
 812                                   gmx_host_free_t  **nb_free)
 813 {
 814     if (bUseGpuKernels)
 815     {
 816         *nb_alloc = &pmalloc;
 817         *nb_free  = &pfree;
 818     }
 819     else
 820     {
 821         *nb_alloc = NULL;
 822         *nb_free  = NULL;
 823     }
 824 }
 825
 826 void startGpuProfiler(void)
 827 {
 828     /* The NVPROF_ID environment variable is set by nvprof and indicates that
 829        mdrun is executed in the CUDA profiler.
 830        If nvprof was run is with "--profile-from-start off", the profiler will
 831        be started here. This way we can avoid tracing the CUDA events from the
 832        first part of the run. Starting the profiler again does nothing.
 833      */
 834     if (cudaProfilerRun)
 835     {
 836         cudaError_t stat;
 837         stat = cudaProfilerStart();
 838         CU_RET_ERR(stat, "cudaProfilerStart failed");
 839     }
 840 }
 841
 842 void stopGpuProfiler(void)
 843 {
 844     /* Stopping the nvidia here allows us to eliminate the subsequent
 845        API calls from the trace, e.g. uninitialization and cleanup. */
 846     if (cudaProfilerRun)
 847     {
 848         cudaError_t stat;
 849         stat = cudaProfilerStop();
 850         CU_RET_ERR(stat, "cudaProfilerStop failed");
 851     }
 852 }
 853
 854 void resetGpuProfiler(void)
 855 {
 856     /* With CUDA <=7.5 the profiler can't be properly reset; we can only start
 857      *  the profiling here (can't stop it) which will achieve the desired effect if
 858      *  the run was started with the profiling disabled.
 859      *
 860      * TODO: add a stop (or replace it with reset) when this will work correctly in CUDA.
 861      * stopGpuProfiler();
 862      */
 863     if (cudaProfilerRun)
 864     {
 865         startGpuProfiler();
 866     }
 867 }