src/gromacs/gmxlib/gpu_utils/gpu_utils.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2010,2011,2012,2013,2014,2015, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \file
  36  *  \brief Define functions for detection and initialization for CUDA devices.
  37  *
  38  *  \author Szilard Pall <pall.szilard@gmail.com>
  39  */
  40
  41 #include "gmxpre.h"
  42
  43 #include "gpu_utils.h"
  44
  45 #include "config.h"
  46
  47 #include <assert.h>
  48 #include <stdio.h>
  49 #include <stdlib.h>
  50
  51 #include "gromacs/gmxlib/cuda_tools/cudautils.cuh"
  52 #include "gromacs/gmxlib/cuda_tools/pmalloc_cuda.h"
  53 #include "gromacs/hardware/gpu_hw_info.h"
  54 #include "gromacs/utility/basedefinitions.h"
  55 #include "gromacs/utility/cstringutil.h"
  56 #include "gromacs/utility/smalloc.h"
  57
  58 #ifdef HAVE_NVML
  59 #include <nvml.h>
  60 #define HAVE_NVML_APPLICATION_CLOCKS (NVML_API_VERSION >= 6)
  61 #else  /* HAVE_NVML */
  62 #define HAVE_NVML_APPLICATION_CLOCKS 0
  63 #endif /* HAVE_NVML */
  64
#if defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS
/*! Check for NVML error on the return status of a NVML API call.
 *
 * When \p status differs from NVML_SUCCESS a warning made of \p msg and the
 * NVML error string is issued through gmx_warning(). The macro never returns
 * or aborts, so callers still have to test \p status themselves.
 * Note that \p status is expanded more than once; pass a variable, not a call.
 */
#  define HANDLE_NVML_RET_ERR(status, msg) \
    do { \
        if (status != NVML_SUCCESS) \
        { \
            gmx_warning("%s: %s\n", msg, nvmlErrorString(status)); \
        } \
    } while (0)
#else  /* defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS */
/*! No-op variant: neither argument is evaluated in this configuration. */
#  define HANDLE_NVML_RET_ERR(status, msg) do { } while (0)
#endif /* defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS */
  77
/* Compile-time flag mirrored into a constant so that regular code can test
 * whether the NVML application clocks API was available at build time.
 * The "false" variant is marked gmx_unused as it may not be referenced
 * in builds without NVML.
 */
#if HAVE_NVML_APPLICATION_CLOCKS
static const gmx_bool            bCompiledWithApplicationClockSupport = true;
#else
static const gmx_bool gmx_unused bCompiledWithApplicationClockSupport = false;
#endif
  83
/*! \internal \brief
 * Max number of devices supported by CUDA (for consistency checking).
 *
 * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side.
 * A device count above this limit is treated by the sanity checks as a sign
 * of a broken driver/runtime combination.
 */
static int cuda_max_device_count = 32;
  90
/** Dummy kernel used for sanity checking.
 *
 * The kernel body is intentionally empty: it is only launched to verify that
 * a kernel can be started and completed on the selected device.
 */
__global__ void k_dummy_test()
{
}
  95
  96
  97 /*!
  98  * \brief Runs GPU sanity checks.
  99  *
 100  * Runs a series of checks to determine that the given GPU and underlying CUDA
 101  * driver/runtime functions properly.
 102  * Returns properties of a device with given ID or the one that has
 103  * already been initialized earlier in the case if of \dev_id == -1.
 104  *
 105  * \param[in]  dev_id      the device ID of the GPU or -1 if the device has already been initialized
 106  * \param[out] dev_prop    pointer to the structure in which the device properties will be returned
 107  * \returns                0 if the device looks OK
 108  *
 109  * TODO: introduce errors codes and handle errors more smoothly.
 110  */
 111 static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop)
 112 {
 113     cudaError_t cu_err;
 114     int         dev_count, id;
 115
 116     cu_err = cudaGetDeviceCount(&dev_count);
 117     if (cu_err != cudaSuccess)
 118     {
 119         fprintf(stderr, "Error %d while querying device count: %s\n", cu_err,
 120                 cudaGetErrorString(cu_err));
 121         return -1;
 122     }
 123
 124     /* no CUDA compatible device at all */
 125     if (dev_count == 0)
 126     {
 127         return -1;
 128     }
 129
 130     /* things might go horribly wrong if cudart is not compatible with the driver */
 131     if (dev_count < 0 || dev_count > cuda_max_device_count)
 132     {
 133         return -1;
 134     }
 135
 136     if (dev_id == -1) /* device already selected let's not destroy the context */
 137     {
 138         cu_err = cudaGetDevice(&id);
 139         if (cu_err != cudaSuccess)
 140         {
 141             fprintf(stderr, "Error %d while querying device id: %s\n", cu_err,
 142                     cudaGetErrorString(cu_err));
 143             return -1;
 144         }
 145     }
 146     else
 147     {
 148         id = dev_id;
 149         if (id > dev_count - 1) /* pfff there's no such device */
 150         {
 151             fprintf(stderr, "The requested device with id %d does not seem to exist (device count=%d)\n",
 152                     dev_id, dev_count);
 153             return -1;
 154         }
 155     }
 156
 157     memset(dev_prop, 0, sizeof(cudaDeviceProp));
 158     cu_err = cudaGetDeviceProperties(dev_prop, id);
 159     if (cu_err != cudaSuccess)
 160     {
 161         fprintf(stderr, "Error %d while querying device properties: %s\n", cu_err,
 162                 cudaGetErrorString(cu_err));
 163         return -1;
 164     }
 165
 166     /* both major & minor is 9999 if no CUDA capable devices are present */
 167     if (dev_prop->major == 9999 && dev_prop->minor == 9999)
 168     {
 169         return -1;
 170     }
 171     /* we don't care about emulation mode */
 172     if (dev_prop->major == 0)
 173     {
 174         return -1;
 175     }
 176
 177     if (id != -1)
 178     {
 179         cu_err = cudaSetDevice(id);
 180         if (cu_err != cudaSuccess)
 181         {
 182             fprintf(stderr, "Error %d while switching to device #%d: %s\n",
 183                     cu_err, id, cudaGetErrorString(cu_err));
 184             return -1;
 185         }
 186     }
 187
 188     /* try to execute a dummy kernel */
 189     k_dummy_test<<< 1, 512>>> ();
 190     if (cudaThreadSynchronize() != cudaSuccess)
 191     {
 192         return -1;
 193     }
 194
 195     /* destroy context if we created one */
 196     if (id != -1)
 197     {
 198         cu_err = cudaDeviceReset();
 199         CU_RET_ERR(cu_err, "cudaDeviceReset failed");
 200     }
 201
 202     return 0;
 203 }
 204
 205 #ifdef HAVE_NVML
 206 /* TODO: We should actually be using md_print_warn in md_logging.c,
 207  * but we can't include mpi.h in CUDA code.
 208  */
 209 static void md_print_info(FILE       *fplog,
 210                           const char *fmt, ...)
 211 {
 212     va_list ap;
 213
 214     if (fplog != NULL)
 215     {
 216         /* We should only print to stderr on the master node,
 217          * in most cases fplog is only set on the master node, so this works.
 218          */
 219         va_start(ap, fmt);
 220         vfprintf(stderr, fmt, ap);
 221         va_end(ap);
 222
 223         va_start(ap, fmt);
 224         vfprintf(fplog, fmt, ap);
 225         va_end(ap);
 226     }
 227 }
 228 #endif /*HAVE_NVML*/
 229
 230 /* TODO: We should actually be using md_print_warn in md_logging.c,
 231  * but we can't include mpi.h in CUDA code.
 232  * This is replicated from nbnxn_cuda_data_mgmt.cu.
 233  */
 234 static void md_print_warn(FILE       *fplog,
 235                           const char *fmt, ...)
 236 {
 237     va_list ap;
 238
 239     if (fplog != NULL)
 240     {
 241         /* We should only print to stderr on the master node,
 242          * in most cases fplog is only set on the master node, so this works.
 243          */
 244         va_start(ap, fmt);
 245         fprintf(stderr, "\n");
 246         vfprintf(stderr, fmt, ap);
 247         fprintf(stderr, "\n");
 248         va_end(ap);
 249
 250         va_start(ap, fmt);
 251         fprintf(fplog, "\n");
 252         vfprintf(fplog, fmt, ap);
 253         fprintf(fplog, "\n");
 254         va_end(ap);
 255     }
 256 }
 257
 258 #if HAVE_NVML_APPLICATION_CLOCKS
 259 /*! \brief Determines and adds the NVML device ID to the passed \cuda_dev.
 260  *
 261  * Determines and adds the NVML device ID to the passed \cuda_dev. This is done by
 262  * matching PCI-E information from \cuda_dev with the available NVML devices.
 263  *
 264  * \param[in,out] cuda_dev  CUDA device information to enrich with NVML device info
 265  * \returns                 true if \cuda_dev could be enriched with matching NVML device information.
 266  */
 267 static bool addNVMLDeviceId(gmx_device_info_t* cuda_dev)
 268 {
 269     nvmlDevice_t nvml_device_id;
 270     unsigned int nvml_device_count = 0;
 271     nvmlReturn_t nvml_stat         = nvmlDeviceGetCount ( &nvml_device_count );
 272     cuda_dev->nvml_initialized = false;
 273     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetCount failed" );
 274     for (unsigned int nvml_device_idx = 0; nvml_stat == NVML_SUCCESS && nvml_device_idx < nvml_device_count; ++nvml_device_idx)
 275     {
 276         nvml_stat = nvmlDeviceGetHandleByIndex ( nvml_device_idx, &nvml_device_id );
 277         HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetHandleByIndex failed" );
 278         if (nvml_stat != NVML_SUCCESS)
 279         {
 280             break;
 281         }
 282
 283         nvmlPciInfo_t nvml_pci_info;
 284         nvml_stat = nvmlDeviceGetPciInfo ( nvml_device_id, &nvml_pci_info );
 285         HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetPciInfo failed" );
 286         if (nvml_stat != NVML_SUCCESS)
 287         {
 288             break;
 289         }
 290         if (static_cast<unsigned int>(cuda_dev->prop.pciBusID) == nvml_pci_info.bus &&
 291             static_cast<unsigned int>(cuda_dev->prop.pciDeviceID) == nvml_pci_info.device &&
 292             static_cast<unsigned int>(cuda_dev->prop.pciDomainID) == nvml_pci_info.domain)
 293         {
 294             cuda_dev->nvml_initialized = true;
 295             cuda_dev->nvml_device_id   = nvml_device_id;
 296             break;
 297         }
 298     }
 299     return cuda_dev->nvml_initialized;
 300 }
 301 #endif /* HAVE_NVML_APPLICATION_CLOCKS */
 302
 303 /*! \brief Tries to set application clocks for the GPU with the given index.
 304  *
 305  * The variable \gpuid is the index of the GPU in the gpu_info.cuda_dev array
 306  * to handle the application clocks for. Application clocks are set to the
 307  * max supported value to increase performance if application clock permissions
 308  * allow this. For future GPU architectures a more sophisticated scheme might be
 309  * required.
 310  *
 311  * \param[out] fplog        log file to write to
 312  * \param[in] gpuid         index of the GPU to set application clocks for
 313  * \param[in] gpu_info      GPU info of all detected devices in the system.
 314  * \returns                 true if no error occurs during application clocks handling.
 315  */
 316 static gmx_bool init_gpu_application_clocks(FILE gmx_unused *fplog, int gmx_unused gpuid, const gmx_gpu_info_t gmx_unused *gpu_info)
 317 {
 318     const cudaDeviceProp *prop                        = &gpu_info->gpu_dev[gpuid].prop;
 319     int                   cuda_version_number         = prop->major * 10 + prop->minor;
 320     gmx_bool              bGpuCanUseApplicationClocks =
 321         ((0 == gmx_wcmatch("*Tesla*", prop->name) && cuda_version_number >= 35 ) ||
 322          (0 == gmx_wcmatch("*Quadro*", prop->name) && cuda_version_number >= 52 ));
 323     if (!bGpuCanUseApplicationClocks)
 324     {
 325         return true;
 326     }
 327 #ifndef HAVE_NVML
 328     int cuda_driver  = 0;
 329     int cuda_runtime = 0;
 330     cudaDriverGetVersion(&cuda_driver);
 331     cudaRuntimeGetVersion(&cuda_runtime);
 332     md_print_warn( fplog, "NOTE: GROMACS was configured without NVML support hence it can not exploit\n"
 333                    "      application clocks of the detected %s GPU to improve performance.\n"
 334                    "      Recompile with the NVML library (compatible with the driver used) or set application clocks manually.\n",
 335                    prop->name);
 336     return true;
 337 #else
 338     if (!bCompiledWithApplicationClockSupport)
 339     {
 340         int cuda_driver  = 0;
 341         int cuda_runtime = 0;
 342         cudaDriverGetVersion(&cuda_driver);
 343         cudaRuntimeGetVersion(&cuda_runtime);
 344         md_print_warn( fplog, "NOTE: GROMACS was compiled with an old NVML library which does not support\n"
 345                        "      managing application clocks of the detected %s GPU to improve performance.\n"
 346                        "      If your GPU supports application clocks, upgrade NVML (and driver) and recompile or set the clocks manually.\n",
 347                        prop->name );
 348         return true;
 349     }
 350
 351     /* We've compiled with NVML application clocks support, and have a GPU that can use it */
 352     nvmlReturn_t nvml_stat = NVML_SUCCESS;
 353     char        *env;
 354     //TODO: GMX_GPU_APPLICATION_CLOCKS is currently only used to enable/disable setting of application clocks
 355     //      this variable can be later used to give a user more fine grained control.
 356     env = getenv("GMX_GPU_APPLICATION_CLOCKS");
 357     if (env != NULL && ( strcmp( env, "0") == 0 ||
 358                          gmx_strcasecmp( env, "OFF") == 0 ||
 359                          gmx_strcasecmp( env, "DISABLE") == 0 ))
 360     {
 361         return true;
 362     }
 363     nvml_stat = nvmlInit();
 364     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlInit failed." );
 365     if (nvml_stat != NVML_SUCCESS)
 366     {
 367         return false;
 368     }
 369     if (!addNVMLDeviceId( &(gpu_info->gpu_dev[gpuid])))
 370     {
 371         return false;
 372     }
 373     //get current application clocks setting
 374     unsigned int app_sm_clock  = 0;
 375     unsigned int app_mem_clock = 0;
 376     nvml_stat = nvmlDeviceGetApplicationsClock ( gpu_info->gpu_dev[gpuid].nvml_device_id, NVML_CLOCK_SM, &app_sm_clock );
 377     if (NVML_ERROR_NOT_SUPPORTED == nvml_stat)
 378     {
 379         return false;
 380     }
 381     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetApplicationsClock failed" );
 382     nvml_stat = nvmlDeviceGetApplicationsClock ( gpu_info->gpu_dev[gpuid].nvml_device_id, NVML_CLOCK_MEM, &app_mem_clock );
 383     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetApplicationsClock failed" );
 384     //get max application clocks
 385     unsigned int max_sm_clock  = 0;
 386     unsigned int max_mem_clock = 0;
 387     nvml_stat = nvmlDeviceGetMaxClockInfo ( gpu_info->gpu_dev[gpuid].nvml_device_id, NVML_CLOCK_SM, &max_sm_clock );
 388     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetMaxClockInfo failed" );
 389     nvml_stat = nvmlDeviceGetMaxClockInfo ( gpu_info->gpu_dev[gpuid].nvml_device_id, NVML_CLOCK_MEM, &max_mem_clock );
 390     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetMaxClockInfo failed" );
 391
 392     gpu_info->gpu_dev[gpuid].nvml_is_restricted     = NVML_FEATURE_ENABLED;
 393     gpu_info->gpu_dev[gpuid].nvml_ap_clocks_changed = false;
 394
 395     nvml_stat = nvmlDeviceGetAPIRestriction ( gpu_info->gpu_dev[gpuid].nvml_device_id, NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS, &(gpu_info->gpu_dev[gpuid].nvml_is_restricted) );
 396     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetAPIRestriction failed" );
 397
 398     /* Note: Distinguishing between different types of GPUs here might be necessary in the future,
 399        e.g. if max application clocks should not be used for certain GPUs. */
 400     if (nvml_stat == NVML_SUCCESS && app_sm_clock < max_sm_clock && gpu_info->gpu_dev[gpuid].nvml_is_restricted == NVML_FEATURE_DISABLED)
 401     {
 402         md_print_info( fplog, "Changing GPU application clocks for %s to (%d,%d)\n", gpu_info->gpu_dev[gpuid].prop.name, max_mem_clock, max_sm_clock);
 403         nvml_stat = nvmlDeviceSetApplicationsClocks ( gpu_info->gpu_dev[gpuid].nvml_device_id, max_mem_clock, max_sm_clock );
 404         HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetApplicationsClock failed" );
 405         gpu_info->gpu_dev[gpuid].nvml_ap_clocks_changed = true;
 406     }
 407     else if (nvml_stat == NVML_SUCCESS && app_sm_clock < max_sm_clock)
 408     {
 409         md_print_warn( fplog, "Can not change application clocks for %s to optimal values due to insufficient permissions. Current values are (%d,%d), max values are (%d,%d).\nUse sudo nvidia-smi -acp UNRESTRICTED or contact your admin to change application clocks.\n", gpu_info->gpu_dev[gpuid].prop.name, app_mem_clock, app_sm_clock, max_mem_clock, max_sm_clock);
 410     }
 411     else if (nvml_stat == NVML_SUCCESS && app_sm_clock == max_sm_clock)
 412     {
 413         //TODO: This should probably be integrated into the GPU Properties table.
 414         md_print_info( fplog, "Application clocks (GPU clocks) for %s are (%d,%d)\n", gpu_info->gpu_dev[gpuid].prop.name, app_mem_clock, app_sm_clock);
 415     }
 416     else
 417     {
 418         md_print_warn( fplog,  "Can not change GPU application clocks to optimal values due to NVML error (%d): %s.\n", nvml_stat, nvmlErrorString(nvml_stat));
 419     }
 420     return (nvml_stat == NVML_SUCCESS);
 421 #endif /* HAVE_NVML */
 422 }
 423
 424 /*! \brief Resets application clocks if changed and cleans up NVML for the passed \gpu_dev.
 425  *
 426  * \param[in] gpu_dev  CUDA device information
 427  */
 428 static gmx_bool reset_gpu_application_clocks(const gmx_device_info_t gmx_unused * cuda_dev)
 429 {
 430 #if !HAVE_NVML_APPLICATION_CLOCKS
 431     GMX_UNUSED_VALUE(cuda_dev);
 432     return true;
 433 #else /* HAVE_NVML_APPLICATION_CLOCKS */
 434     nvmlReturn_t nvml_stat = NVML_SUCCESS;
 435     if (cuda_dev &&
 436         cuda_dev->nvml_is_restricted == NVML_FEATURE_DISABLED &&
 437         cuda_dev->nvml_ap_clocks_changed)
 438     {
 439         nvml_stat = nvmlDeviceResetApplicationsClocks( cuda_dev->nvml_device_id );
 440         HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceResetApplicationsClocks failed" );
 441     }
 442     nvml_stat = nvmlShutdown();
 443     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlShutdown failed" );
 444     return (nvml_stat == NVML_SUCCESS);
 445 #endif /* HAVE_NVML_APPLICATION_CLOCKS */
 446 }
 447
 448 gmx_bool init_gpu(FILE gmx_unused *fplog, int mygpu, char *result_str,
 449                   const struct gmx_gpu_info_t *gpu_info,
 450                   const struct gmx_gpu_opt_t *gpu_opt)
 451 {
 452     cudaError_t stat;
 453     char        sbuf[STRLEN];
 454     int         gpuid;
 455
 456     assert(gpu_info);
 457     assert(result_str);
 458
 459     if (mygpu < 0 || mygpu >= gpu_opt->n_dev_use)
 460     {
 461         sprintf(sbuf, "Trying to initialize an inexistent GPU: "
 462                 "there are %d %s-selected GPU(s), but #%d was requested.",
 463                 gpu_opt->n_dev_use, gpu_opt->bUserSet ? "user" : "auto", mygpu);
 464         gmx_incons(sbuf);
 465     }
 466
 467     gpuid = gpu_info->gpu_dev[gpu_opt->dev_use[mygpu]].id;
 468
 469     stat = cudaSetDevice(gpuid);
 470     strncpy(result_str, cudaGetErrorString(stat), STRLEN);
 471
 472     if (debug)
 473     {
 474         fprintf(stderr, "Initialized GPU ID #%d: %s\n", gpuid, gpu_info->gpu_dev[gpuid].prop.name);
 475     }
 476
 477     //Ignoring return value as NVML errors should be treated not critical.
 478     if (stat == cudaSuccess)
 479     {
 480         init_gpu_application_clocks(fplog, gpuid, gpu_info);
 481     }
 482     return (stat == cudaSuccess);
 483 }
 484
 485 gmx_bool free_cuda_gpu(
 486         int gmx_unused mygpu, char *result_str,
 487         const gmx_gpu_info_t gmx_unused *gpu_info,
 488         const gmx_gpu_opt_t gmx_unused *gpu_opt
 489         )
 490 {
 491     cudaError_t  stat;
 492     gmx_bool     reset_gpu_application_clocks_status = true;
 493     int          gpuid;
 494
 495     assert(result_str);
 496
 497     if (debug)
 498     {
 499         int gpuid;
 500         stat = cudaGetDevice(&gpuid);
 501         CU_RET_ERR(stat, "cudaGetDevice failed");
 502         fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
 503     }
 504
 505     gpuid = gpu_opt ? gpu_opt->dev_use[mygpu] : -1;
 506     if (gpuid != -1)
 507     {
 508         reset_gpu_application_clocks_status = reset_gpu_application_clocks( &(gpu_info->gpu_dev[gpuid]) );
 509     }
 510
 511     stat = cudaDeviceReset();
 512     strncpy(result_str, cudaGetErrorString(stat), STRLEN);
 513     return (stat == cudaSuccess) && reset_gpu_application_clocks_status;
 514 }
 515
 516 /*! \brief Returns true if the gpu characterized by the device properties is
 517  *  supported by the native gpu acceleration.
 518  *
 519  * \param[in] dev_prop  the CUDA device properties of the gpus to test.
 520  * \returns             true if the GPU properties passed indicate a compatible
 521  *                      GPU, otherwise false.
 522  */
 523 static bool is_gmx_supported_gpu(const cudaDeviceProp *dev_prop)
 524 {
 525     return (dev_prop->major >= 2);
 526 }
 527
 528 /*! \brief Helper function that checks whether a given GPU status indicates compatible GPU.
 529  *
 530  * \param[in] stat  GPU status.
 531  * \returns         true if the provided status is egpuCompatible, otherwise false.
 532  */
 533 static bool is_compatible_gpu(int stat)
 534 {
 535     return (stat == egpuCompatible);
 536 }
 537
 538 /*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration.
 539  *
 540  *  Returns a status value which indicates compatibility or one of the following
 541  *  errors: incompatibility, insistence, or insanity (=unexpected behavior).
 542  *  It also returns the respective device's properties in \dev_prop (if applicable).
 543  *
 544  *  \param[in]  dev_id   the ID of the GPU to check.
 545  *  \param[out] dev_prop the CUDA device properties of the device checked.
 546  *  \returns             the status of the requested device
 547  */
 548 static int is_gmx_supported_gpu_id(int dev_id, cudaDeviceProp *dev_prop)
 549 {
 550     cudaError_t stat;
 551     int         ndev;
 552
 553     stat = cudaGetDeviceCount(&ndev);
 554     if (stat != cudaSuccess)
 555     {
 556         return egpuInsane;
 557     }
 558
 559     if (dev_id > ndev - 1)
 560     {
 561         return egpuNonexistent;
 562     }
 563
 564     /* TODO: currently we do not make a distinction between the type of errors
 565      * that can appear during sanity checks. This needs to be improved, e.g if
 566      * the dummy test kernel fails to execute with a "device busy message" we
 567      * should appropriately report that the device is busy instead of insane.
 568      */
 569     if (do_sanity_checks(dev_id, dev_prop) == 0)
 570     {
 571         if (is_gmx_supported_gpu(dev_prop))
 572         {
 573             return egpuCompatible;
 574         }
 575         else
 576         {
 577             return egpuIncompatible;
 578         }
 579     }
 580     else
 581     {
 582         return egpuInsane;
 583     }
 584 }
 585
 586
 587 int detect_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
 588 {
 589     int                i, ndev, checkres, retval;
 590     cudaError_t        stat;
 591     cudaDeviceProp     prop;
 592     gmx_device_info_t *devs;
 593
 594     assert(gpu_info);
 595     assert(err_str);
 596
 597     gpu_info->n_dev_compatible = 0;
 598
 599     ndev    = 0;
 600     devs    = NULL;
 601
 602     stat = cudaGetDeviceCount(&ndev);
 603     if (stat != cudaSuccess)
 604     {
 605         const char *s;
 606
 607         /* cudaGetDeviceCount failed which means that there is something
 608          * wrong with the machine: driver-runtime mismatch, all GPUs being
 609          * busy in exclusive mode, or some other condition which should
 610          * result in us issuing a warning a falling back to CPUs. */
 611         retval = -1;
 612         s      = cudaGetErrorString(stat);
 613         strncpy(err_str, s, STRLEN*sizeof(err_str[0]));
 614     }
 615     else
 616     {
 617         snew(devs, ndev);
 618         for (i = 0; i < ndev; i++)
 619         {
 620             checkres = is_gmx_supported_gpu_id(i, &prop);
 621
 622             devs[i].id   = i;
 623             devs[i].prop = prop;
 624             devs[i].stat = checkres;
 625
 626             if (checkres == egpuCompatible)
 627             {
 628                 gpu_info->n_dev_compatible++;
 629             }
 630         }
 631         retval = 0;
 632     }
 633
 634     gpu_info->n_dev   = ndev;
 635     gpu_info->gpu_dev = devs;
 636
 637     return retval;
 638 }
 639
 640 void pick_compatible_gpus(const gmx_gpu_info_t *gpu_info,
 641                           gmx_gpu_opt_t        *gpu_opt)
 642 {
 643     int  i, ncompat;
 644     int *compat;
 645
 646     assert(gpu_info);
 647     /* gpu_dev/n_dev have to be either NULL/0 or not (NULL/0) */
 648     assert((gpu_info->n_dev != 0 ? 0 : 1) ^ (gpu_info->gpu_dev == NULL ? 0 : 1));
 649
 650     snew(compat, gpu_info->n_dev);
 651     ncompat = 0;
 652     for (i = 0; i < gpu_info->n_dev; i++)
 653     {
 654         if (is_compatible_gpu(gpu_info->gpu_dev[i].stat))
 655         {
 656             ncompat++;
 657             compat[ncompat - 1] = i;
 658         }
 659     }
 660
 661     gpu_opt->n_dev_compatible = ncompat;
 662     snew(gpu_opt->dev_compatible, ncompat);
 663     memcpy(gpu_opt->dev_compatible, compat, ncompat*sizeof(*compat));
 664     sfree(compat);
 665 }
 666
 667 gmx_bool check_selected_gpus(int                  *checkres,
 668                              const gmx_gpu_info_t *gpu_info,
 669                              gmx_gpu_opt_t        *gpu_opt)
 670 {
 671     int  i, id;
 672     bool bAllOk;
 673
 674     assert(checkres);
 675     assert(gpu_info);
 676     assert(gpu_opt->n_dev_use >= 0);
 677
 678     if (gpu_opt->n_dev_use == 0)
 679     {
 680         return TRUE;
 681     }
 682
 683     assert(gpu_opt->dev_use);
 684
 685     /* we will assume that all GPUs requested are valid IDs,
 686        otherwise we'll bail anyways */
 687
 688     bAllOk = true;
 689     for (i = 0; i < gpu_opt->n_dev_use; i++)
 690     {
 691         id = gpu_opt->dev_use[i];
 692
 693         /* devices are stored in increasing order of IDs in gpu_dev */
 694         gpu_opt->dev_use[i] = id;
 695
 696         checkres[i] = (id >= gpu_info->n_dev) ?
 697             egpuNonexistent : gpu_info->gpu_dev[id].stat;
 698
 699         bAllOk = bAllOk && is_compatible_gpu(checkres[i]);
 700     }
 701
 702     return bAllOk;
 703 }
 704
 705 void free_gpu_info(const gmx_gpu_info_t *gpu_info)
 706 {
 707     if (gpu_info == NULL)
 708     {
 709         return;
 710     }
 711
 712     sfree(gpu_info->gpu_dev);
 713 }
 714
 715 void get_gpu_device_info_string(char *s, const gmx_gpu_info_t *gpu_info, int index)
 716 {
 717     assert(s);
 718     assert(gpu_info);
 719
 720     if (index < 0 && index >= gpu_info->n_dev)
 721     {
 722         return;
 723     }
 724
 725     gmx_device_info_t *dinfo = &gpu_info->gpu_dev[index];
 726
 727     bool               bGpuExists =
 728         dinfo->stat == egpuCompatible ||
 729         dinfo->stat == egpuIncompatible;
 730
 731     if (!bGpuExists)
 732     {
 733         sprintf(s, "#%d: %s, stat: %s",
 734                 dinfo->id, "N/A",
 735                 gpu_detect_res_str[dinfo->stat]);
 736     }
 737     else
 738     {
 739         sprintf(s, "#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s",
 740                 dinfo->id, dinfo->prop.name,
 741                 dinfo->prop.major, dinfo->prop.minor,
 742                 dinfo->prop.ECCEnabled ? "yes" : " no",
 743                 gpu_detect_res_str[dinfo->stat]);
 744     }
 745 }
 746
 747 int get_gpu_device_id(const gmx_gpu_info_t *gpu_info,
 748                       const gmx_gpu_opt_t  *gpu_opt,
 749                       int                   idx)
 750 {
 751     assert(gpu_info);
 752     assert(gpu_opt);
 753     assert(idx >= 0 && idx < gpu_opt->n_dev_use);
 754
 755     return gpu_info->gpu_dev[gpu_opt->dev_use[idx]].id;
 756 }
 757
 758 int get_current_cuda_gpu_device_id(void)
 759 {
 760     int gpuid;
 761     CU_RET_ERR(cudaGetDevice(&gpuid), "cudaGetDevice failed");
 762
 763     return gpuid;
 764 }
 765
 766 size_t sizeof_gpu_dev_info(void)
 767 {
 768     return sizeof(gmx_device_info_t);
 769 }
 770
 771 void gpu_set_host_malloc_and_free(bool               bUseGpuKernels,
 772                                   gmx_host_alloc_t **nb_alloc,
 773                                   gmx_host_free_t  **nb_free)
 774 {
 775     if (bUseGpuKernels)
 776     {
 777         *nb_alloc = &pmalloc;
 778         *nb_free  = &pfree;
 779     }
 780     else
 781     {
 782         *nb_alloc = NULL;
 783         *nb_free  = NULL;
 784     }
 785 }