src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \file
  36  *  \brief Define CUDA implementation of nbnxn_gpu_data_mgmt.h
  37  *
  38  *  \author Szilard Pall <pall.szilard@gmail.com>
  39  */
  40 #include "gmxpre.h"
  41
  42 #include "config.h"
  43
  44 #include <assert.h>
  45 #include <stdarg.h>
  46 #include <stdio.h>
  47 #include <stdlib.h>
  48
  49 #include <cuda_profiler_api.h>
  50
  51 #include "gromacs/gmxlib/cuda_tools/cudautils.cuh"
  52 #include "gromacs/gmxlib/cuda_tools/pmalloc_cuda.h"
  53 #include "gromacs/gmxlib/gpu_utils/gpu_utils.h"
  54 #include "gromacs/legacyheaders/gmx_detect_hardware.h"
  55 #include "gromacs/legacyheaders/types/enums.h"
  56 #include "gromacs/legacyheaders/types/force_flags.h"
  57 #include "gromacs/legacyheaders/types/interaction_const.h"
  58 #include "gromacs/math/vectypes.h"
  59 #include "gromacs/mdlib/nb_verlet.h"
  60 #include "gromacs/mdlib/nbnxn_consts.h"
  61 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
  62 #include "gromacs/pbcutil/ishift.h"
  63 #include "gromacs/timing/gpu_timing.h"
  64 #include "gromacs/utility/basedefinitions.h"
  65 #include "gromacs/utility/cstringutil.h"
  66 #include "gromacs/utility/fatalerror.h"
  67 #include "gromacs/utility/real.h"
  68 #include "gromacs/utility/smalloc.h"
  69
  70 #include "nbnxn_cuda_types.h"
  71
  72 static bool bUseCudaEventBlockingSync = false; /* makes the CPU thread block */
  73
  74 /* This is a heuristically determined parameter for the Fermi, Kepler
  75  * and Maxwell architectures for the minimum size of ci lists by multiplying
  76  * this constant with the # of multiprocessors on the current device.
  77  * Since the maximum number of blocks per multiprocessor is 16, the ideal
  78  * count for small systems is 32 or 48 blocks per multiprocessor. Because
  79  * there is a bit of fluctuations in the generated block counts, we use
  80  * a target of 44 instead of the ideal value of 48.
  81  */
  82 static unsigned int gpu_min_ci_balanced_factor = 44;
  83
  84 /* Functions from nbnxn_cuda.cu */
  85 extern void nbnxn_cuda_set_cacheconfig(gmx_device_info_t *devinfo);
  86 extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_texref();
  87 extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_comb_texref();
  88 extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_coulomb_tab_texref();
  89
  90
  91 /* Fw. decl. */
  92 static void nbnxn_cuda_clear_e_fshift(gmx_nbnxn_cuda_t *nb);
  93
  94 /* Fw. decl, */
  95 static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t            *nbparam,
  96                                           const gmx_device_info_t *dev_info);
  97
  98
  99
 100 static bool use_texobj(const gmx_device_info_t *dev_info)
 101 {
 102     /* Only device CC >= 3.0 (Kepler and later) support texture objects */
 103     return (dev_info->prop.major >= 3);
 104 }
 105
 106 /*! Tabulates the Ewald Coulomb force and initializes the size/scale
 107     and the table GPU array. If called with an already allocated table,
 108     it just re-uploads the table.
 109  */
 110 static void init_ewald_coulomb_force_table(const interaction_const_t *ic,
 111                                            cu_nbparam_t              *nbp,
 112                                            const gmx_device_info_t   *dev_info)
 113 {
 114     float       *coul_tab;
 115     cudaError_t  stat;
 116
 117     if (nbp->coulomb_tab != NULL)
 118     {
 119         nbnxn_cuda_free_nbparam_table(nbp, dev_info);
 120     }
 121
 122     stat = cudaMalloc((void **)&coul_tab, ic->tabq_size*sizeof(*coul_tab));
 123     CU_RET_ERR(stat, "cudaMalloc failed on coul_tab");
 124
 125     nbp->coulomb_tab = coul_tab;
 126
 127     /* Only device CC >= 3.0 (Kepler and later) support texture objects */
 128     if (use_texobj(dev_info))
 129     {
 130         cudaResourceDesc rd;
 131         memset(&rd, 0, sizeof(rd));
 132         rd.resType                  = cudaResourceTypeLinear;
 133         rd.res.linear.devPtr        = nbp->coulomb_tab;
 134         rd.res.linear.desc.f        = cudaChannelFormatKindFloat;
 135         rd.res.linear.desc.x        = 32;
 136         rd.res.linear.sizeInBytes   = ic->tabq_size*sizeof(*coul_tab);
 137
 138         cudaTextureDesc td;
 139         memset(&td, 0, sizeof(td));
 140         td.readMode                 = cudaReadModeElementType;
 141         stat = cudaCreateTextureObject(&nbp->coulomb_tab_texobj, &rd, &td, NULL);
 142         CU_RET_ERR(stat, "cudaCreateTextureObject on coulomb_tab_texobj failed");
 143     }
 144     else
 145     {
 146         GMX_UNUSED_VALUE(dev_info);
 147         cudaChannelFormatDesc cd   = cudaCreateChannelDesc<float>();
 148         stat = cudaBindTexture(NULL, &nbnxn_cuda_get_coulomb_tab_texref(),
 149                                coul_tab, &cd,
 150                                ic->tabq_size*sizeof(*coul_tab));
 151         CU_RET_ERR(stat, "cudaBindTexture on coulomb_tab_texref failed");
 152     }
 153
 154     cu_copy_H2D(coul_tab, ic->tabq_coul_F, ic->tabq_size*sizeof(*coul_tab));
 155
 156     nbp->coulomb_tab_size     = ic->tabq_size;
 157     nbp->coulomb_tab_scale    = ic->tabq_scale;
 158 }
 159
 160
 161 /*! Initializes the atomdata structure first time, it only gets filled at
 162     pair-search. */
 163 static void init_atomdata_first(cu_atomdata_t *ad, int ntypes)
 164 {
 165     cudaError_t stat;
 166
 167     ad->ntypes  = ntypes;
 168     stat        = cudaMalloc((void**)&ad->shift_vec, SHIFTS*sizeof(*ad->shift_vec));
 169     CU_RET_ERR(stat, "cudaMalloc failed on ad->shift_vec");
 170     ad->bShiftVecUploaded = false;
 171
 172     stat = cudaMalloc((void**)&ad->fshift, SHIFTS*sizeof(*ad->fshift));
 173     CU_RET_ERR(stat, "cudaMalloc failed on ad->fshift");
 174
 175     stat = cudaMalloc((void**)&ad->e_lj, sizeof(*ad->e_lj));
 176     CU_RET_ERR(stat, "cudaMalloc failed on ad->e_lj");
 177     stat = cudaMalloc((void**)&ad->e_el, sizeof(*ad->e_el));
 178     CU_RET_ERR(stat, "cudaMalloc failed on ad->e_el");
 179
 180     /* initialize to NULL poiters to data that is not allocated here and will
 181        need reallocation in nbnxn_cuda_init_atomdata */
 182     ad->xq = NULL;
 183     ad->f  = NULL;
 184
 185     /* size -1 indicates that the respective array hasn't been initialized yet */
 186     ad->natoms = -1;
 187     ad->nalloc = -1;
 188 }
 189
 190 /*! Selects the Ewald kernel type, analytical on SM 3.0 and later, tabulated on
 191     earlier GPUs, single or twin cut-off. */
 192 static int pick_ewald_kernel_type(bool                     bTwinCut,
 193                                   const gmx_device_info_t *dev_info)
 194 {
 195     bool bUseAnalyticalEwald, bForceAnalyticalEwald, bForceTabulatedEwald;
 196     int  kernel_type;
 197
 198     /* Benchmarking/development environment variables to force the use of
 199        analytical or tabulated Ewald kernel. */
 200     bForceAnalyticalEwald = (getenv("GMX_CUDA_NB_ANA_EWALD") != NULL);
 201     bForceTabulatedEwald  = (getenv("GMX_CUDA_NB_TAB_EWALD") != NULL);
 202
 203     if (bForceAnalyticalEwald && bForceTabulatedEwald)
 204     {
 205         gmx_incons("Both analytical and tabulated Ewald CUDA non-bonded kernels "
 206                    "requested through environment variables.");
 207     }
 208
 209     /* By default, on SM 3.0 and later use analytical Ewald, on earlier tabulated. */
 210     if ((dev_info->prop.major >= 3 || bForceAnalyticalEwald) && !bForceTabulatedEwald)
 211     {
 212         bUseAnalyticalEwald = true;
 213
 214         if (debug)
 215         {
 216             fprintf(debug, "Using analytical Ewald CUDA kernels\n");
 217         }
 218     }
 219     else
 220     {
 221         bUseAnalyticalEwald = false;
 222
 223         if (debug)
 224         {
 225             fprintf(debug, "Using tabulated Ewald CUDA kernels\n");
 226         }
 227     }
 228
 229     /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
 230        forces it (use it for debugging/benchmarking only). */
 231     if (!bTwinCut && (getenv("GMX_CUDA_NB_EWALD_TWINCUT") == NULL))
 232     {
 233         kernel_type = bUseAnalyticalEwald ? eelCuEWALD_ANA : eelCuEWALD_TAB;
 234     }
 235     else
 236     {
 237         kernel_type = bUseAnalyticalEwald ? eelCuEWALD_ANA_TWIN : eelCuEWALD_TAB_TWIN;
 238     }
 239
 240     return kernel_type;
 241 }
 242
 243 /*! Copies all parameters related to the cut-off from ic to nbp */
 244 static void set_cutoff_parameters(cu_nbparam_t              *nbp,
 245                                   const interaction_const_t *ic)
 246 {
 247     nbp->ewald_beta       = ic->ewaldcoeff_q;
 248     nbp->sh_ewald         = ic->sh_ewald;
 249     nbp->epsfac           = ic->epsfac;
 250     nbp->two_k_rf         = 2.0 * ic->k_rf;
 251     nbp->c_rf             = ic->c_rf;
 252     nbp->rvdw_sq          = ic->rvdw * ic->rvdw;
 253     nbp->rcoulomb_sq      = ic->rcoulomb * ic->rcoulomb;
 254     nbp->rlist_sq         = ic->rlist * ic->rlist;
 255
 256     nbp->sh_lj_ewald      = ic->sh_lj_ewald;
 257     nbp->ewaldcoeff_lj    = ic->ewaldcoeff_lj;
 258
 259     nbp->rvdw_switch      = ic->rvdw_switch;
 260     nbp->dispersion_shift = ic->dispersion_shift;
 261     nbp->repulsion_shift  = ic->repulsion_shift;
 262     nbp->vdw_switch       = ic->vdw_switch;
 263 }
 264
 265 /*! Initializes the nonbonded parameter data structure. */
 266 static void init_nbparam(cu_nbparam_t              *nbp,
 267                          const interaction_const_t *ic,
 268                          const nbnxn_atomdata_t    *nbat,
 269                          const gmx_device_info_t   *dev_info)
 270 {
 271     cudaError_t stat;
 272     int         ntypes, nnbfp, nnbfp_comb;
 273
 274     ntypes  = nbat->ntype;
 275
 276     set_cutoff_parameters(nbp, ic);
 277
 278     if (ic->vdwtype == evdwCUT)
 279     {
 280         switch (ic->vdw_modifier)
 281         {
 282             case eintmodNONE:
 283             case eintmodPOTSHIFT:
 284                 nbp->vdwtype = evdwCuCUT;
 285                 break;
 286             case eintmodFORCESWITCH:
 287                 nbp->vdwtype = evdwCuFSWITCH;
 288                 break;
 289             case eintmodPOTSWITCH:
 290                 nbp->vdwtype = evdwCuPSWITCH;
 291                 break;
 292             default:
 293                 gmx_incons("The requested VdW interaction modifier is not implemented in the CUDA GPU accelerated kernels!");
 294                 break;
 295         }
 296     }
 297     else if (ic->vdwtype == evdwPME)
 298     {
 299         if (ic->ljpme_comb_rule == ljcrGEOM)
 300         {
 301             assert(nbat->comb_rule == ljcrGEOM);
 302             nbp->vdwtype = evdwCuEWALDGEOM;
 303         }
 304         else
 305         {
 306             assert(nbat->comb_rule == ljcrLB);
 307             nbp->vdwtype = evdwCuEWALDLB;
 308         }
 309     }
 310     else
 311     {
 312         gmx_incons("The requested VdW type is not implemented in the CUDA GPU accelerated kernels!");
 313     }
 314
 315     if (ic->eeltype == eelCUT)
 316     {
 317         nbp->eeltype = eelCuCUT;
 318     }
 319     else if (EEL_RF(ic->eeltype))
 320     {
 321         nbp->eeltype = eelCuRF;
 322     }
 323     else if ((EEL_PME(ic->eeltype) || ic->eeltype == eelEWALD))
 324     {
 325         /* Initially rcoulomb == rvdw, so it's surely not twin cut-off. */
 326         nbp->eeltype = pick_ewald_kernel_type(false, dev_info);
 327     }
 328     else
 329     {
 330         /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
 331         gmx_incons("The requested electrostatics type is not implemented in the CUDA GPU accelerated kernels!");
 332     }
 333
 334     /* generate table for PME */
 335     nbp->coulomb_tab = NULL;
 336     if (nbp->eeltype == eelCuEWALD_TAB || nbp->eeltype == eelCuEWALD_TAB_TWIN)
 337     {
 338         init_ewald_coulomb_force_table(ic, nbp, dev_info);
 339     }
 340
 341     nnbfp      = 2*ntypes*ntypes;
 342     nnbfp_comb = 2*ntypes;
 343
 344     stat  = cudaMalloc((void **)&nbp->nbfp, nnbfp*sizeof(*nbp->nbfp));
 345     CU_RET_ERR(stat, "cudaMalloc failed on nbp->nbfp");
 346     cu_copy_H2D(nbp->nbfp, nbat->nbfp, nnbfp*sizeof(*nbp->nbfp));
 347
 348
 349     if (ic->vdwtype == evdwPME)
 350     {
 351         stat  = cudaMalloc((void **)&nbp->nbfp_comb, nnbfp_comb*sizeof(*nbp->nbfp_comb));
 352         CU_RET_ERR(stat, "cudaMalloc failed on nbp->nbfp_comb");
 353         cu_copy_H2D(nbp->nbfp_comb, nbat->nbfp_comb, nnbfp_comb*sizeof(*nbp->nbfp_comb));
 354     }
 355
 356     /* Only device CC >= 3.0 (Kepler and later) support texture objects */
 357     if (use_texobj(dev_info))
 358     {
 359         cudaResourceDesc rd;
 360         cudaTextureDesc  td;
 361
 362         memset(&rd, 0, sizeof(rd));
 363         rd.resType                  = cudaResourceTypeLinear;
 364         rd.res.linear.devPtr        = nbp->nbfp;
 365         rd.res.linear.desc.f        = cudaChannelFormatKindFloat;
 366         rd.res.linear.desc.x        = 32;
 367         rd.res.linear.sizeInBytes   = nnbfp*sizeof(*nbp->nbfp);
 368
 369         memset(&td, 0, sizeof(td));
 370         td.readMode                 = cudaReadModeElementType;
 371         stat = cudaCreateTextureObject(&nbp->nbfp_texobj, &rd, &td, NULL);
 372         CU_RET_ERR(stat, "cudaCreateTextureObject on nbfp_texobj failed");
 373
 374         if (ic->vdwtype == evdwPME)
 375         {
 376             memset(&rd, 0, sizeof(rd));
 377             rd.resType                  = cudaResourceTypeLinear;
 378             rd.res.linear.devPtr        = nbp->nbfp_comb;
 379             rd.res.linear.desc.f        = cudaChannelFormatKindFloat;
 380             rd.res.linear.desc.x        = 32;
 381             rd.res.linear.sizeInBytes   = nnbfp_comb*sizeof(*nbp->nbfp_comb);
 382
 383             memset(&td, 0, sizeof(td));
 384             td.readMode = cudaReadModeElementType;
 385             stat        = cudaCreateTextureObject(&nbp->nbfp_comb_texobj, &rd, &td, NULL);
 386             CU_RET_ERR(stat, "cudaCreateTextureObject on nbfp_comb_texobj failed");
 387         }
 388     }
 389     else
 390     {
 391         cudaChannelFormatDesc cd = cudaCreateChannelDesc<float>();
 392         stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_texref(),
 393                                nbp->nbfp, &cd, nnbfp*sizeof(*nbp->nbfp));
 394         CU_RET_ERR(stat, "cudaBindTexture on nbfp_texref failed");
 395
 396         if (ic->vdwtype == evdwPME)
 397         {
 398             stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_comb_texref(),
 399                                    nbp->nbfp_comb, &cd, nnbfp_comb*sizeof(*nbp->nbfp_comb));
 400             CU_RET_ERR(stat, "cudaBindTexture on nbfp_comb_texref failed");
 401         }
 402     }
 403 }
 404
 405 /*! Re-generate the GPU Ewald force table, resets rlist, and update the
 406  *  electrostatic type switching to twin cut-off (or back) if needed. */
 407 void nbnxn_gpu_pme_loadbal_update_param(const nonbonded_verlet_t    *nbv,
 408                                         const interaction_const_t   *ic)
 409 {
 410     if (!nbv || nbv->grp[0].kernel_type != nbnxnk8x8x8_GPU)
 411     {
 412         return;
 413     }
 414     gmx_nbnxn_cuda_t *nb    = nbv->gpu_nbv;
 415     cu_nbparam_t     *nbp   = nb->nbparam;
 416
 417     set_cutoff_parameters(nbp, ic);
 418
 419     nbp->eeltype        = pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw,
 420                                                  nb->dev_info);
 421
 422     init_ewald_coulomb_force_table(ic, nb->nbparam, nb->dev_info);
 423 }
 424
 425 /*! Initializes the pair list data structure. */
 426 static void init_plist(cu_plist_t *pl)
 427 {
 428     /* initialize to NULL pointers to data that is not allocated here and will
 429        need reallocation in nbnxn_gpu_init_pairlist */
 430     pl->sci     = NULL;
 431     pl->cj4     = NULL;
 432     pl->excl    = NULL;
 433
 434     /* size -1 indicates that the respective array hasn't been initialized yet */
 435     pl->na_c        = -1;
 436     pl->nsci        = -1;
 437     pl->sci_nalloc  = -1;
 438     pl->ncj4        = -1;
 439     pl->cj4_nalloc  = -1;
 440     pl->nexcl       = -1;
 441     pl->excl_nalloc = -1;
 442     pl->bDoPrune    = false;
 443 }
 444
 445 /*! Initializes the timer data structure. */
 446 static void init_timers(cu_timers_t *t, bool bUseTwoStreams)
 447 {
 448     cudaError_t stat;
 449     int         eventflags = ( bUseCudaEventBlockingSync ? cudaEventBlockingSync : cudaEventDefault );
 450
 451     stat = cudaEventCreateWithFlags(&(t->start_atdat), eventflags);
 452     CU_RET_ERR(stat, "cudaEventCreate on start_atdat failed");
 453     stat = cudaEventCreateWithFlags(&(t->stop_atdat), eventflags);
 454     CU_RET_ERR(stat, "cudaEventCreate on stop_atdat failed");
 455
 456     /* The non-local counters/stream (second in the array) are needed only with DD. */
 457     for (int i = 0; i <= (bUseTwoStreams ? 1 : 0); i++)
 458     {
 459         stat = cudaEventCreateWithFlags(&(t->start_nb_k[i]), eventflags);
 460         CU_RET_ERR(stat, "cudaEventCreate on start_nb_k failed");
 461         stat = cudaEventCreateWithFlags(&(t->stop_nb_k[i]), eventflags);
 462         CU_RET_ERR(stat, "cudaEventCreate on stop_nb_k failed");
 463
 464
 465         stat = cudaEventCreateWithFlags(&(t->start_pl_h2d[i]), eventflags);
 466         CU_RET_ERR(stat, "cudaEventCreate on start_pl_h2d failed");
 467         stat = cudaEventCreateWithFlags(&(t->stop_pl_h2d[i]), eventflags);
 468         CU_RET_ERR(stat, "cudaEventCreate on stop_pl_h2d failed");
 469
 470         stat = cudaEventCreateWithFlags(&(t->start_nb_h2d[i]), eventflags);
 471         CU_RET_ERR(stat, "cudaEventCreate on start_nb_h2d failed");
 472         stat = cudaEventCreateWithFlags(&(t->stop_nb_h2d[i]), eventflags);
 473         CU_RET_ERR(stat, "cudaEventCreate on stop_nb_h2d failed");
 474
 475         stat = cudaEventCreateWithFlags(&(t->start_nb_d2h[i]), eventflags);
 476         CU_RET_ERR(stat, "cudaEventCreate on start_nb_d2h failed");
 477         stat = cudaEventCreateWithFlags(&(t->stop_nb_d2h[i]), eventflags);
 478         CU_RET_ERR(stat, "cudaEventCreate on stop_nb_d2h failed");
 479     }
 480 }
 481
 482 /*! Initializes the timings data structure. */
 483 static void init_timings(gmx_wallclock_gpu_t *t)
 484 {
 485     int i, j;
 486
 487     t->nb_h2d_t = 0.0;
 488     t->nb_d2h_t = 0.0;
 489     t->nb_c     = 0;
 490     t->pl_h2d_t = 0.0;
 491     t->pl_h2d_c = 0;
 492     for (i = 0; i < 2; i++)
 493     {
 494         for (j = 0; j < 2; j++)
 495         {
 496             t->ktime[i][j].t = 0.0;
 497             t->ktime[i][j].c = 0;
 498         }
 499     }
 500 }
 501
 502 /*! Initializes simulation constant data. */
 503 static void nbnxn_cuda_init_const(gmx_nbnxn_cuda_t               *nb,
 504                                   const interaction_const_t      *ic,
 505                                   const nonbonded_verlet_group_t *nbv_group)
 506 {
 507     init_atomdata_first(nb->atdat, nbv_group[0].nbat->ntype);
 508     init_nbparam(nb->nbparam, ic, nbv_group[0].nbat, nb->dev_info);
 509
 510     /* clear energy and shift force outputs */
 511     nbnxn_cuda_clear_e_fshift(nb);
 512 }
 513
 514 void nbnxn_gpu_init(gmx_nbnxn_cuda_t         **p_nb,
 515                     const gmx_gpu_info_t      *gpu_info,
 516                     const gmx_gpu_opt_t       *gpu_opt,
 517                     const interaction_const_t *ic,
 518                     nonbonded_verlet_group_t  *nbv_grp,
 519                     int                        my_gpu_index,
 520                     int                        /*rank*/,
 521                     gmx_bool                   bLocalAndNonlocal)
 522 {
 523     cudaError_t       stat;
 524     gmx_nbnxn_cuda_t *nb;
 525
 526     assert(gpu_info);
 527
 528     if (p_nb == NULL)
 529     {
 530         return;
 531     }
 532
 533     snew(nb, 1);
 534     snew(nb->atdat, 1);
 535     snew(nb->nbparam, 1);
 536     snew(nb->plist[eintLocal], 1);
 537     if (bLocalAndNonlocal)
 538     {
 539         snew(nb->plist[eintNonlocal], 1);
 540     }
 541
 542     nb->bUseTwoStreams = bLocalAndNonlocal;
 543
 544     snew(nb->timers, 1);
 545     snew(nb->timings, 1);
 546
 547     /* init nbst */
 548     pmalloc((void**)&nb->nbst.e_lj, sizeof(*nb->nbst.e_lj));
 549     pmalloc((void**)&nb->nbst.e_el, sizeof(*nb->nbst.e_el));
 550     pmalloc((void**)&nb->nbst.fshift, SHIFTS * sizeof(*nb->nbst.fshift));
 551
 552     init_plist(nb->plist[eintLocal]);
 553
 554     /* set device info, just point it to the right GPU among the detected ones */
 555     nb->dev_info = &gpu_info->gpu_dev[get_gpu_device_id(gpu_info, gpu_opt, my_gpu_index)];
 556
 557     /* local/non-local GPU streams */
 558     stat = cudaStreamCreate(&nb->stream[eintLocal]);
 559     CU_RET_ERR(stat, "cudaStreamCreate on stream[eintLocal] failed");
 560     if (nb->bUseTwoStreams)
 561     {
 562         init_plist(nb->plist[eintNonlocal]);
 563
 564         /* CUDA stream priority available in the CUDA RT 5.5 API.
 565          * Note that the device we're running on does not have to support
 566          * priorities, because we are querying the priority range which in this
 567          * case will be a single value.
 568          */
 569 #if GMX_CUDA_VERSION >= 5050
 570         {
 571             int highest_priority;
 572             stat = cudaDeviceGetStreamPriorityRange(NULL, &highest_priority);
 573             CU_RET_ERR(stat, "cudaDeviceGetStreamPriorityRange failed");
 574
 575             stat = cudaStreamCreateWithPriority(&nb->stream[eintNonlocal],
 576                                                 cudaStreamDefault,
 577                                                 highest_priority);
 578             CU_RET_ERR(stat, "cudaStreamCreateWithPriority on stream[eintNonlocal] failed");
 579         }
 580 #else
 581         stat = cudaStreamCreate(&nb->stream[eintNonlocal]);
 582         CU_RET_ERR(stat, "cudaStreamCreate on stream[eintNonlocal] failed");
 583 #endif
 584     }
 585
 586     /* init events for sychronization (timing disabled for performance reasons!) */
 587     stat = cudaEventCreateWithFlags(&nb->nonlocal_done, cudaEventDisableTiming);
 588     CU_RET_ERR(stat, "cudaEventCreate on nonlocal_done failed");
 589     stat = cudaEventCreateWithFlags(&nb->misc_ops_and_local_H2D_done, cudaEventDisableTiming);
 590     CU_RET_ERR(stat, "cudaEventCreate on misc_ops_and_local_H2D_done failed");
 591
 592     /* CUDA timing disabled as event timers don't work:
 593        - with multiple streams = domain-decomposition;
 594        - when turned off by GMX_DISABLE_CUDA_TIMING.
 595      */
 596     nb->bDoTime = (!nb->bUseTwoStreams &&
 597                    (getenv("GMX_DISABLE_CUDA_TIMING") == NULL));
 598
 599     if (nb->bDoTime)
 600     {
 601         init_timers(nb->timers, nb->bUseTwoStreams);
 602         init_timings(nb->timings);
 603     }
 604
 605     /* set the kernel type for the current GPU */
 606     /* pick L1 cache configuration */
 607     nbnxn_cuda_set_cacheconfig(nb->dev_info);
 608
 609     nbnxn_cuda_init_const(nb, ic, nbv_grp);
 610
 611     *p_nb = nb;
 612
 613     if (debug)
 614     {
 615         fprintf(debug, "Initialized CUDA data structures.\n");
 616     }
 617 }
 618
 619 void nbnxn_gpu_init_pairlist(gmx_nbnxn_cuda_t       *nb,
 620                              const nbnxn_pairlist_t *h_plist,
 621                              int                     iloc)
 622 {
 623     char          sbuf[STRLEN];
 624     cudaError_t   stat;
 625     bool          bDoTime    = nb->bDoTime;
 626     cudaStream_t  stream     = nb->stream[iloc];
 627     cu_plist_t   *d_plist    = nb->plist[iloc];
 628
 629     if (d_plist->na_c < 0)
 630     {
 631         d_plist->na_c = h_plist->na_ci;
 632     }
 633     else
 634     {
 635         if (d_plist->na_c != h_plist->na_ci)
 636         {
 637             sprintf(sbuf, "In cu_init_plist: the #atoms per cell has changed (from %d to %d)",
 638                     d_plist->na_c, h_plist->na_ci);
 639             gmx_incons(sbuf);
 640         }
 641     }
 642
 643     if (bDoTime)
 644     {
 645         stat = cudaEventRecord(nb->timers->start_pl_h2d[iloc], stream);
 646         CU_RET_ERR(stat, "cudaEventRecord failed");
 647     }
 648
 649     cu_realloc_buffered((void **)&d_plist->sci, h_plist->sci, sizeof(*d_plist->sci),
 650                         &d_plist->nsci, &d_plist->sci_nalloc,
 651                         h_plist->nsci,
 652                         stream, true);
 653
 654     cu_realloc_buffered((void **)&d_plist->cj4, h_plist->cj4, sizeof(*d_plist->cj4),
 655                         &d_plist->ncj4, &d_plist->cj4_nalloc,
 656                         h_plist->ncj4,
 657                         stream, true);
 658
 659     cu_realloc_buffered((void **)&d_plist->excl, h_plist->excl, sizeof(*d_plist->excl),
 660                         &d_plist->nexcl, &d_plist->excl_nalloc,
 661                         h_plist->nexcl,
 662                         stream, true);
 663
 664     if (bDoTime)
 665     {
 666         stat = cudaEventRecord(nb->timers->stop_pl_h2d[iloc], stream);
 667         CU_RET_ERR(stat, "cudaEventRecord failed");
 668     }
 669
 670     /* need to prune the pair list during the next step */
 671     d_plist->bDoPrune = true;
 672 }
 673
 674 void nbnxn_gpu_upload_shiftvec(gmx_nbnxn_cuda_t       *nb,
 675                                const nbnxn_atomdata_t *nbatom)
 676 {
 677     cu_atomdata_t *adat  = nb->atdat;
 678     cudaStream_t   ls    = nb->stream[eintLocal];
 679
 680     /* only if we have a dynamic box */
 681     if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
 682     {
 683         cu_copy_H2D_async(adat->shift_vec, nbatom->shift_vec,
 684                           SHIFTS * sizeof(*adat->shift_vec), ls);
 685         adat->bShiftVecUploaded = true;
 686     }
 687 }
 688
 689 /*! Clears the first natoms_clear elements of the GPU nonbonded force output array. */
 690 static void nbnxn_cuda_clear_f(gmx_nbnxn_cuda_t *nb, int natoms_clear)
 691 {
 692     cudaError_t    stat;
 693     cu_atomdata_t *adat  = nb->atdat;
 694     cudaStream_t   ls    = nb->stream[eintLocal];
 695
 696     stat = cudaMemsetAsync(adat->f, 0, natoms_clear * sizeof(*adat->f), ls);
 697     CU_RET_ERR(stat, "cudaMemsetAsync on f falied");
 698 }
 699
 700 /*! Clears nonbonded shift force output array and energy outputs on the GPU. */
 701 static void nbnxn_cuda_clear_e_fshift(gmx_nbnxn_cuda_t *nb)
 702 {
 703     cudaError_t    stat;
 704     cu_atomdata_t *adat  = nb->atdat;
 705     cudaStream_t   ls    = nb->stream[eintLocal];
 706
 707     stat = cudaMemsetAsync(adat->fshift, 0, SHIFTS * sizeof(*adat->fshift), ls);
 708     CU_RET_ERR(stat, "cudaMemsetAsync on fshift falied");
 709     stat = cudaMemsetAsync(adat->e_lj, 0, sizeof(*adat->e_lj), ls);
 710     CU_RET_ERR(stat, "cudaMemsetAsync on e_lj falied");
 711     stat = cudaMemsetAsync(adat->e_el, 0, sizeof(*adat->e_el), ls);
 712     CU_RET_ERR(stat, "cudaMemsetAsync on e_el falied");
 713 }
 714
 715 void nbnxn_gpu_clear_outputs(gmx_nbnxn_cuda_t *nb, int flags)
 716 {
 717     nbnxn_cuda_clear_f(nb, nb->atdat->natoms);
 718     /* clear shift force array and energies if the outputs were
 719        used in the current step */
 720     if (flags & GMX_FORCE_VIRIAL)
 721     {
 722         nbnxn_cuda_clear_e_fshift(nb);
 723     }
 724 }
 725
 726 void nbnxn_gpu_init_atomdata(gmx_nbnxn_cuda_t              *nb,
 727                              const struct nbnxn_atomdata_t *nbat)
 728 {
 729     cudaError_t    stat;
 730     int            nalloc, natoms;
 731     bool           realloced;
 732     bool           bDoTime   = nb->bDoTime;
 733     cu_timers_t   *timers    = nb->timers;
 734     cu_atomdata_t *d_atdat   = nb->atdat;
 735     cudaStream_t   ls        = nb->stream[eintLocal];
 736
 737     natoms    = nbat->natoms;
 738     realloced = false;
 739
 740     if (bDoTime)
 741     {
 742         /* time async copy */
 743         stat = cudaEventRecord(timers->start_atdat, ls);
 744         CU_RET_ERR(stat, "cudaEventRecord failed");
 745     }
 746
 747     /* need to reallocate if we have to copy more atoms than the amount of space
 748        available and only allocate if we haven't initialized yet, i.e d_atdat->natoms == -1 */
 749     if (natoms > d_atdat->nalloc)
 750     {
 751         nalloc = over_alloc_small(natoms);
 752
 753         /* free up first if the arrays have already been initialized */
 754         if (d_atdat->nalloc != -1)
 755         {
 756             cu_free_buffered(d_atdat->f, &d_atdat->natoms, &d_atdat->nalloc);
 757             cu_free_buffered(d_atdat->xq);
 758             cu_free_buffered(d_atdat->atom_types);
 759         }
 760
 761         stat = cudaMalloc((void **)&d_atdat->f, nalloc*sizeof(*d_atdat->f));
 762         CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->f");
 763         stat = cudaMalloc((void **)&d_atdat->xq, nalloc*sizeof(*d_atdat->xq));
 764         CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->xq");
 765
 766         stat = cudaMalloc((void **)&d_atdat->atom_types, nalloc*sizeof(*d_atdat->atom_types));
 767         CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->atom_types");
 768
 769         d_atdat->nalloc = nalloc;
 770         realloced       = true;
 771     }
 772
 773     d_atdat->natoms       = natoms;
 774     d_atdat->natoms_local = nbat->natoms_local;
 775
 776     /* need to clear GPU f output if realloc happened */
 777     if (realloced)
 778     {
 779         nbnxn_cuda_clear_f(nb, nalloc);
 780     }
 781
 782     cu_copy_H2D_async(d_atdat->atom_types, nbat->type,
 783                       natoms*sizeof(*d_atdat->atom_types), ls);
 784
 785     if (bDoTime)
 786     {
 787         stat = cudaEventRecord(timers->stop_atdat, ls);
 788         CU_RET_ERR(stat, "cudaEventRecord failed");
 789     }
 790 }
 791
 792 static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t            *nbparam,
 793                                           const gmx_device_info_t *dev_info)
 794 {
 795     cudaError_t stat;
 796
 797     if (nbparam->eeltype == eelCuEWALD_TAB || nbparam->eeltype == eelCuEWALD_TAB_TWIN)
 798     {
 799         /* Only device CC >= 3.0 (Kepler and later) support texture objects */
 800         if (use_texobj(dev_info))
 801         {
 802             stat = cudaDestroyTextureObject(nbparam->coulomb_tab_texobj);
 803             CU_RET_ERR(stat, "cudaDestroyTextureObject on coulomb_tab_texobj failed");
 804         }
 805         else
 806         {
 807             GMX_UNUSED_VALUE(dev_info);
 808             stat = cudaUnbindTexture(nbnxn_cuda_get_coulomb_tab_texref());
 809             CU_RET_ERR(stat, "cudaUnbindTexture on coulomb_tab_texref failed");
 810         }
 811         cu_free_buffered(nbparam->coulomb_tab, &nbparam->coulomb_tab_size);
 812     }
 813 }
 814
 815 void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb)
 816 {
 817     cudaError_t      stat;
 818     cu_atomdata_t   *atdat;
 819     cu_nbparam_t    *nbparam;
 820     cu_plist_t      *plist, *plist_nl;
 821     cu_timers_t     *timers;
 822
 823     /* Stopping the nvidia profiler here allows us to eliminate the subsequent
 824        uninitialization API calls from the trace. */
 825     if (getenv("NVPROF_ID") != NULL)
 826     {
 827         stat = cudaProfilerStop();
 828         CU_RET_ERR(stat, "cudaProfilerStop failed");
 829     }
 830
 831     if (nb == NULL)
 832     {
 833         return;
 834     }
 835
 836     atdat       = nb->atdat;
 837     nbparam     = nb->nbparam;
 838     plist       = nb->plist[eintLocal];
 839     plist_nl    = nb->plist[eintNonlocal];
 840     timers      = nb->timers;
 841
 842     nbnxn_cuda_free_nbparam_table(nbparam, nb->dev_info);
 843
 844     stat = cudaEventDestroy(nb->nonlocal_done);
 845     CU_RET_ERR(stat, "cudaEventDestroy failed on timers->nonlocal_done");
 846     stat = cudaEventDestroy(nb->misc_ops_and_local_H2D_done);
 847     CU_RET_ERR(stat, "cudaEventDestroy failed on timers->misc_ops_and_local_H2D_done");
 848
 849     if (nb->bDoTime)
 850     {
 851         stat = cudaEventDestroy(timers->start_atdat);
 852         CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_atdat");
 853         stat = cudaEventDestroy(timers->stop_atdat);
 854         CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_atdat");
 855
 856         /* The non-local counters/stream (second in the array) are needed only with DD. */
 857         for (int i = 0; i <= (nb->bUseTwoStreams ? 1 : 0); i++)
 858         {
 859             stat = cudaEventDestroy(timers->start_nb_k[i]);
 860             CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_k");
 861             stat = cudaEventDestroy(timers->stop_nb_k[i]);
 862             CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_k");
 863
 864             stat = cudaEventDestroy(timers->start_pl_h2d[i]);
 865             CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_pl_h2d");
 866             stat = cudaEventDestroy(timers->stop_pl_h2d[i]);
 867             CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_pl_h2d");
 868
 869             stat = cudaStreamDestroy(nb->stream[i]);
 870             CU_RET_ERR(stat, "cudaStreamDestroy failed on stream");
 871
 872             stat = cudaEventDestroy(timers->start_nb_h2d[i]);
 873             CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_h2d");
 874             stat = cudaEventDestroy(timers->stop_nb_h2d[i]);
 875             CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_h2d");
 876
 877             stat = cudaEventDestroy(timers->start_nb_d2h[i]);
 878             CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_d2h");
 879             stat = cudaEventDestroy(timers->stop_nb_d2h[i]);
 880             CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_d2h");
 881         }
 882     }
 883
 884     /* Only device CC >= 3.0 (Kepler and later) support texture objects */
 885     if (use_texobj(nb->dev_info))
 886     {
 887         stat = cudaDestroyTextureObject(nbparam->nbfp_texobj);
 888         CU_RET_ERR(stat, "cudaDestroyTextureObject on nbfp_texobj failed");
 889     }
 890     else
 891     {
 892         stat = cudaUnbindTexture(nbnxn_cuda_get_nbfp_texref());
 893         CU_RET_ERR(stat, "cudaUnbindTexture on nbfp_texref failed");
 894     }
 895     cu_free_buffered(nbparam->nbfp);
 896
 897     if (nbparam->vdwtype == evdwCuEWALDGEOM || nbparam->vdwtype == evdwCuEWALDLB)
 898     {
 899         /* Only device CC >= 3.0 (Kepler and later) support texture objects */
 900         if (use_texobj(nb->dev_info))
 901         {
 902             stat = cudaDestroyTextureObject(nbparam->nbfp_comb_texobj);
 903             CU_RET_ERR(stat, "cudaDestroyTextureObject on nbfp_comb_texobj failed");
 904         }
 905         else
 906         {
 907             stat = cudaUnbindTexture(nbnxn_cuda_get_nbfp_comb_texref());
 908             CU_RET_ERR(stat, "cudaUnbindTexture on nbfp_comb_texref failed");
 909         }
 910         cu_free_buffered(nbparam->nbfp_comb);
 911     }
 912
 913     stat = cudaFree(atdat->shift_vec);
 914     CU_RET_ERR(stat, "cudaFree failed on atdat->shift_vec");
 915     stat = cudaFree(atdat->fshift);
 916     CU_RET_ERR(stat, "cudaFree failed on atdat->fshift");
 917
 918     stat = cudaFree(atdat->e_lj);
 919     CU_RET_ERR(stat, "cudaFree failed on atdat->e_lj");
 920     stat = cudaFree(atdat->e_el);
 921     CU_RET_ERR(stat, "cudaFree failed on atdat->e_el");
 922
 923     cu_free_buffered(atdat->f, &atdat->natoms, &atdat->nalloc);
 924     cu_free_buffered(atdat->xq);
 925     cu_free_buffered(atdat->atom_types, &atdat->ntypes);
 926
 927     cu_free_buffered(plist->sci, &plist->nsci, &plist->sci_nalloc);
 928     cu_free_buffered(plist->cj4, &plist->ncj4, &plist->cj4_nalloc);
 929     cu_free_buffered(plist->excl, &plist->nexcl, &plist->excl_nalloc);
 930     if (nb->bUseTwoStreams)
 931     {
 932         cu_free_buffered(plist_nl->sci, &plist_nl->nsci, &plist_nl->sci_nalloc);
 933         cu_free_buffered(plist_nl->cj4, &plist_nl->ncj4, &plist_nl->cj4_nalloc);
 934         cu_free_buffered(plist_nl->excl, &plist_nl->nexcl, &plist->excl_nalloc);
 935     }
 936
 937     sfree(atdat);
 938     sfree(nbparam);
 939     sfree(plist);
 940     if (nb->bUseTwoStreams)
 941     {
 942         sfree(plist_nl);
 943     }
 944     sfree(timers);
 945     sfree(nb->timings);
 946     sfree(nb);
 947
 948     if (debug)
 949     {
 950         fprintf(debug, "Cleaned up CUDA data structures.\n");
 951     }
 952 }
 953
 954 void cu_synchstream_atdat(gmx_nbnxn_cuda_t *nb, int iloc)
 955 {
 956     cudaError_t  stat;
 957     cudaStream_t stream = nb->stream[iloc];
 958
 959     stat = cudaStreamWaitEvent(stream, nb->timers->stop_atdat, 0);
 960     CU_RET_ERR(stat, "cudaStreamWaitEvent failed");
 961 }
 962
 963 gmx_wallclock_gpu_t * nbnxn_gpu_get_timings(gmx_nbnxn_cuda_t *nb)
 964 {
 965     return (nb != NULL && nb->bDoTime) ? nb->timings : NULL;
 966 }
 967
 968 void nbnxn_gpu_reset_timings(nonbonded_verlet_t* nbv)
 969 {
 970     /* The NVPROF_ID environment variable is set by nvprof and indicates that
 971        mdrun is executed in the CUDA profiler.
 972        If nvprof was run is with "--profile-from-start off", the profiler will
 973        be started here. This way we can avoid tracing the CUDA events from the
 974        first part of the run. Starting the profiler again does nothing.
 975      */
 976     if (getenv("NVPROF_ID") != NULL)
 977     {
 978         cudaError_t stat;
 979         stat = cudaProfilerStart();
 980         CU_RET_ERR(stat, "cudaProfilerStart failed");
 981     }
 982
 983     if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
 984     {
 985         init_timings(nbv->gpu_nbv->timings);
 986     }
 987 }
 988
 989 int nbnxn_gpu_min_ci_balanced(gmx_nbnxn_cuda_t *nb)
 990 {
 991     return nb != NULL ?
 992            gpu_min_ci_balanced_factor*nb->dev_info->prop.multiProcessorCount : 0;
 993
 994 }
 995
 996 gmx_bool nbnxn_gpu_is_kernel_ewald_analytical(const gmx_nbnxn_cuda_t *nb)
 997 {
 998     return ((nb->nbparam->eeltype == eelCuEWALD_ANA) ||
 999             (nb->nbparam->eeltype == eelCuEWALD_ANA_TWIN));
1000 }