2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * \brief Define CUDA implementation of nbnxn_gpu_data_mgmt.h
38 * \author Szilard Pall <pall.szilard@gmail.com>
49 #include <cuda_profiler_api.h>
51 #include "gromacs/gmxlib/cuda_tools/cudautils.cuh"
52 #include "gromacs/gmxlib/cuda_tools/pmalloc_cuda.h"
53 #include "gromacs/gmxlib/gpu_utils/gpu_utils.h"
54 #include "gromacs/legacyheaders/gmx_detect_hardware.h"
55 #include "gromacs/legacyheaders/types/enums.h"
56 #include "gromacs/legacyheaders/types/force_flags.h"
57 #include "gromacs/legacyheaders/types/interaction_const.h"
58 #include "gromacs/math/vectypes.h"
59 #include "gromacs/mdlib/nb_verlet.h"
60 #include "gromacs/mdlib/nbnxn_consts.h"
61 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
62 #include "gromacs/pbcutil/ishift.h"
63 #include "gromacs/timing/gpu_timing.h"
64 #include "gromacs/utility/basedefinitions.h"
65 #include "gromacs/utility/cstringutil.h"
66 #include "gromacs/utility/fatalerror.h"
67 #include "gromacs/utility/real.h"
68 #include "gromacs/utility/smalloc.h"
70 #include "nbnxn_cuda_types.h"
72 static bool bUseCudaEventBlockingSync = false; /* makes the CPU thread block */
74 /* This is a heuristically determined parameter for the Fermi, Kepler
75 * and Maxwell architectures for the minimum size of ci lists by multiplying
76 * this constant with the # of multiprocessors on the current device.
77 * Since the maximum number of blocks per multiprocessor is 16, the ideal
78 * count for small systems is 32 or 48 blocks per multiprocessor. Because
79 * there is a bit of fluctuations in the generated block counts, we use
80 * a target of 44 instead of the ideal value of 48.
82 static unsigned int gpu_min_ci_balanced_factor = 44;
84 /* Functions from nbnxn_cuda.cu */
85 extern void nbnxn_cuda_set_cacheconfig(gmx_device_info_t *devinfo);
86 extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_texref();
87 extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_comb_texref();
88 extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_coulomb_tab_texref();
92 static void nbnxn_cuda_clear_e_fshift(gmx_nbnxn_cuda_t *nb);
95 static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t *nbparam,
96 const gmx_device_info_t *dev_info);
100 static bool use_texobj(const gmx_device_info_t *dev_info)
102 /* Only device CC >= 3.0 (Kepler and later) support texture objects */
103 return (dev_info->prop.major >= 3);
106 /*! Tabulates the Ewald Coulomb force and initializes the size/scale
107 and the table GPU array. If called with an already allocated table,
108 it just re-uploads the table.
110 static void init_ewald_coulomb_force_table(const interaction_const_t *ic,
112 const gmx_device_info_t *dev_info)
117 if (nbp->coulomb_tab != NULL)
119 nbnxn_cuda_free_nbparam_table(nbp, dev_info);
122 stat = cudaMalloc((void **)&coul_tab, ic->tabq_size*sizeof(*coul_tab));
123 CU_RET_ERR(stat, "cudaMalloc failed on coul_tab");
125 nbp->coulomb_tab = coul_tab;
127 /* Only device CC >= 3.0 (Kepler and later) support texture objects */
128 if (use_texobj(dev_info))
131 memset(&rd, 0, sizeof(rd));
132 rd.resType = cudaResourceTypeLinear;
133 rd.res.linear.devPtr = nbp->coulomb_tab;
134 rd.res.linear.desc.f = cudaChannelFormatKindFloat;
135 rd.res.linear.desc.x = 32;
136 rd.res.linear.sizeInBytes = ic->tabq_size*sizeof(*coul_tab);
139 memset(&td, 0, sizeof(td));
140 td.readMode = cudaReadModeElementType;
141 stat = cudaCreateTextureObject(&nbp->coulomb_tab_texobj, &rd, &td, NULL);
142 CU_RET_ERR(stat, "cudaCreateTextureObject on coulomb_tab_texobj failed");
146 GMX_UNUSED_VALUE(dev_info);
147 cudaChannelFormatDesc cd = cudaCreateChannelDesc<float>();
148 stat = cudaBindTexture(NULL, &nbnxn_cuda_get_coulomb_tab_texref(),
150 ic->tabq_size*sizeof(*coul_tab));
151 CU_RET_ERR(stat, "cudaBindTexture on coulomb_tab_texref failed");
154 cu_copy_H2D(coul_tab, ic->tabq_coul_F, ic->tabq_size*sizeof(*coul_tab));
156 nbp->coulomb_tab_size = ic->tabq_size;
157 nbp->coulomb_tab_scale = ic->tabq_scale;
161 /*! Initializes the atomdata structure first time, it only gets filled at
163 static void init_atomdata_first(cu_atomdata_t *ad, int ntypes)
168 stat = cudaMalloc((void**)&ad->shift_vec, SHIFTS*sizeof(*ad->shift_vec));
169 CU_RET_ERR(stat, "cudaMalloc failed on ad->shift_vec");
170 ad->bShiftVecUploaded = false;
172 stat = cudaMalloc((void**)&ad->fshift, SHIFTS*sizeof(*ad->fshift));
173 CU_RET_ERR(stat, "cudaMalloc failed on ad->fshift");
175 stat = cudaMalloc((void**)&ad->e_lj, sizeof(*ad->e_lj));
176 CU_RET_ERR(stat, "cudaMalloc failed on ad->e_lj");
177 stat = cudaMalloc((void**)&ad->e_el, sizeof(*ad->e_el));
178 CU_RET_ERR(stat, "cudaMalloc failed on ad->e_el");
180 /* initialize to NULL poiters to data that is not allocated here and will
181 need reallocation in nbnxn_cuda_init_atomdata */
185 /* size -1 indicates that the respective array hasn't been initialized yet */
190 /*! Selects the Ewald kernel type, analytical on SM 3.0 and later, tabulated on
191 earlier GPUs, single or twin cut-off. */
192 static int pick_ewald_kernel_type(bool bTwinCut,
193 const gmx_device_info_t *dev_info)
195 bool bUseAnalyticalEwald, bForceAnalyticalEwald, bForceTabulatedEwald;
198 /* Benchmarking/development environment variables to force the use of
199 analytical or tabulated Ewald kernel. */
200 bForceAnalyticalEwald = (getenv("GMX_CUDA_NB_ANA_EWALD") != NULL);
201 bForceTabulatedEwald = (getenv("GMX_CUDA_NB_TAB_EWALD") != NULL);
203 if (bForceAnalyticalEwald && bForceTabulatedEwald)
205 gmx_incons("Both analytical and tabulated Ewald CUDA non-bonded kernels "
206 "requested through environment variables.");
209 /* By default, on SM 3.0 and later use analytical Ewald, on earlier tabulated. */
210 if ((dev_info->prop.major >= 3 || bForceAnalyticalEwald) && !bForceTabulatedEwald)
212 bUseAnalyticalEwald = true;
216 fprintf(debug, "Using analytical Ewald CUDA kernels\n");
221 bUseAnalyticalEwald = false;
225 fprintf(debug, "Using tabulated Ewald CUDA kernels\n");
229 /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
230 forces it (use it for debugging/benchmarking only). */
231 if (!bTwinCut && (getenv("GMX_CUDA_NB_EWALD_TWINCUT") == NULL))
233 kernel_type = bUseAnalyticalEwald ? eelCuEWALD_ANA : eelCuEWALD_TAB;
237 kernel_type = bUseAnalyticalEwald ? eelCuEWALD_ANA_TWIN : eelCuEWALD_TAB_TWIN;
243 /*! Copies all parameters related to the cut-off from ic to nbp */
244 static void set_cutoff_parameters(cu_nbparam_t *nbp,
245 const interaction_const_t *ic)
247 nbp->ewald_beta = ic->ewaldcoeff_q;
248 nbp->sh_ewald = ic->sh_ewald;
249 nbp->epsfac = ic->epsfac;
250 nbp->two_k_rf = 2.0 * ic->k_rf;
251 nbp->c_rf = ic->c_rf;
252 nbp->rvdw_sq = ic->rvdw * ic->rvdw;
253 nbp->rcoulomb_sq = ic->rcoulomb * ic->rcoulomb;
254 nbp->rlist_sq = ic->rlist * ic->rlist;
256 nbp->sh_lj_ewald = ic->sh_lj_ewald;
257 nbp->ewaldcoeff_lj = ic->ewaldcoeff_lj;
259 nbp->rvdw_switch = ic->rvdw_switch;
260 nbp->dispersion_shift = ic->dispersion_shift;
261 nbp->repulsion_shift = ic->repulsion_shift;
262 nbp->vdw_switch = ic->vdw_switch;
265 /*! Initializes the nonbonded parameter data structure. */
266 static void init_nbparam(cu_nbparam_t *nbp,
267 const interaction_const_t *ic,
268 const nbnxn_atomdata_t *nbat,
269 const gmx_device_info_t *dev_info)
272 int ntypes, nnbfp, nnbfp_comb;
274 ntypes = nbat->ntype;
276 set_cutoff_parameters(nbp, ic);
278 if (ic->vdwtype == evdwCUT)
280 switch (ic->vdw_modifier)
283 case eintmodPOTSHIFT:
284 nbp->vdwtype = evdwCuCUT;
286 case eintmodFORCESWITCH:
287 nbp->vdwtype = evdwCuFSWITCH;
289 case eintmodPOTSWITCH:
290 nbp->vdwtype = evdwCuPSWITCH;
293 gmx_incons("The requested VdW interaction modifier is not implemented in the CUDA GPU accelerated kernels!");
297 else if (ic->vdwtype == evdwPME)
299 if (ic->ljpme_comb_rule == ljcrGEOM)
301 assert(nbat->comb_rule == ljcrGEOM);
302 nbp->vdwtype = evdwCuEWALDGEOM;
306 assert(nbat->comb_rule == ljcrLB);
307 nbp->vdwtype = evdwCuEWALDLB;
312 gmx_incons("The requested VdW type is not implemented in the CUDA GPU accelerated kernels!");
315 if (ic->eeltype == eelCUT)
317 nbp->eeltype = eelCuCUT;
319 else if (EEL_RF(ic->eeltype))
321 nbp->eeltype = eelCuRF;
323 else if ((EEL_PME(ic->eeltype) || ic->eeltype == eelEWALD))
325 /* Initially rcoulomb == rvdw, so it's surely not twin cut-off. */
326 nbp->eeltype = pick_ewald_kernel_type(false, dev_info);
330 /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
331 gmx_incons("The requested electrostatics type is not implemented in the CUDA GPU accelerated kernels!");
334 /* generate table for PME */
335 nbp->coulomb_tab = NULL;
336 if (nbp->eeltype == eelCuEWALD_TAB || nbp->eeltype == eelCuEWALD_TAB_TWIN)
338 init_ewald_coulomb_force_table(ic, nbp, dev_info);
341 nnbfp = 2*ntypes*ntypes;
342 nnbfp_comb = 2*ntypes;
344 stat = cudaMalloc((void **)&nbp->nbfp, nnbfp*sizeof(*nbp->nbfp));
345 CU_RET_ERR(stat, "cudaMalloc failed on nbp->nbfp");
346 cu_copy_H2D(nbp->nbfp, nbat->nbfp, nnbfp*sizeof(*nbp->nbfp));
349 if (ic->vdwtype == evdwPME)
351 stat = cudaMalloc((void **)&nbp->nbfp_comb, nnbfp_comb*sizeof(*nbp->nbfp_comb));
352 CU_RET_ERR(stat, "cudaMalloc failed on nbp->nbfp_comb");
353 cu_copy_H2D(nbp->nbfp_comb, nbat->nbfp_comb, nnbfp_comb*sizeof(*nbp->nbfp_comb));
356 /* Only device CC >= 3.0 (Kepler and later) support texture objects */
357 if (use_texobj(dev_info))
362 memset(&rd, 0, sizeof(rd));
363 rd.resType = cudaResourceTypeLinear;
364 rd.res.linear.devPtr = nbp->nbfp;
365 rd.res.linear.desc.f = cudaChannelFormatKindFloat;
366 rd.res.linear.desc.x = 32;
367 rd.res.linear.sizeInBytes = nnbfp*sizeof(*nbp->nbfp);
369 memset(&td, 0, sizeof(td));
370 td.readMode = cudaReadModeElementType;
371 stat = cudaCreateTextureObject(&nbp->nbfp_texobj, &rd, &td, NULL);
372 CU_RET_ERR(stat, "cudaCreateTextureObject on nbfp_texobj failed");
374 if (ic->vdwtype == evdwPME)
376 memset(&rd, 0, sizeof(rd));
377 rd.resType = cudaResourceTypeLinear;
378 rd.res.linear.devPtr = nbp->nbfp_comb;
379 rd.res.linear.desc.f = cudaChannelFormatKindFloat;
380 rd.res.linear.desc.x = 32;
381 rd.res.linear.sizeInBytes = nnbfp_comb*sizeof(*nbp->nbfp_comb);
383 memset(&td, 0, sizeof(td));
384 td.readMode = cudaReadModeElementType;
385 stat = cudaCreateTextureObject(&nbp->nbfp_comb_texobj, &rd, &td, NULL);
386 CU_RET_ERR(stat, "cudaCreateTextureObject on nbfp_comb_texobj failed");
391 cudaChannelFormatDesc cd = cudaCreateChannelDesc<float>();
392 stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_texref(),
393 nbp->nbfp, &cd, nnbfp*sizeof(*nbp->nbfp));
394 CU_RET_ERR(stat, "cudaBindTexture on nbfp_texref failed");
396 if (ic->vdwtype == evdwPME)
398 stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_comb_texref(),
399 nbp->nbfp_comb, &cd, nnbfp_comb*sizeof(*nbp->nbfp_comb));
400 CU_RET_ERR(stat, "cudaBindTexture on nbfp_comb_texref failed");
405 /*! Re-generate the GPU Ewald force table, resets rlist, and update the
406 * electrostatic type switching to twin cut-off (or back) if needed. */
407 void nbnxn_gpu_pme_loadbal_update_param(const nonbonded_verlet_t *nbv,
408 const interaction_const_t *ic)
410 if (!nbv || nbv->grp[0].kernel_type != nbnxnk8x8x8_GPU)
414 gmx_nbnxn_cuda_t *nb = nbv->gpu_nbv;
415 cu_nbparam_t *nbp = nb->nbparam;
417 set_cutoff_parameters(nbp, ic);
419 nbp->eeltype = pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw,
422 init_ewald_coulomb_force_table(ic, nb->nbparam, nb->dev_info);
425 /*! Initializes the pair list data structure. */
426 static void init_plist(cu_plist_t *pl)
428 /* initialize to NULL pointers to data that is not allocated here and will
429 need reallocation in nbnxn_gpu_init_pairlist */
434 /* size -1 indicates that the respective array hasn't been initialized yet */
441 pl->excl_nalloc = -1;
442 pl->bDoPrune = false;
445 /*! Initializes the timer data structure. */
446 static void init_timers(cu_timers_t *t, bool bUseTwoStreams)
449 int eventflags = ( bUseCudaEventBlockingSync ? cudaEventBlockingSync : cudaEventDefault );
451 stat = cudaEventCreateWithFlags(&(t->start_atdat), eventflags);
452 CU_RET_ERR(stat, "cudaEventCreate on start_atdat failed");
453 stat = cudaEventCreateWithFlags(&(t->stop_atdat), eventflags);
454 CU_RET_ERR(stat, "cudaEventCreate on stop_atdat failed");
456 /* The non-local counters/stream (second in the array) are needed only with DD. */
457 for (int i = 0; i <= (bUseTwoStreams ? 1 : 0); i++)
459 stat = cudaEventCreateWithFlags(&(t->start_nb_k[i]), eventflags);
460 CU_RET_ERR(stat, "cudaEventCreate on start_nb_k failed");
461 stat = cudaEventCreateWithFlags(&(t->stop_nb_k[i]), eventflags);
462 CU_RET_ERR(stat, "cudaEventCreate on stop_nb_k failed");
465 stat = cudaEventCreateWithFlags(&(t->start_pl_h2d[i]), eventflags);
466 CU_RET_ERR(stat, "cudaEventCreate on start_pl_h2d failed");
467 stat = cudaEventCreateWithFlags(&(t->stop_pl_h2d[i]), eventflags);
468 CU_RET_ERR(stat, "cudaEventCreate on stop_pl_h2d failed");
470 stat = cudaEventCreateWithFlags(&(t->start_nb_h2d[i]), eventflags);
471 CU_RET_ERR(stat, "cudaEventCreate on start_nb_h2d failed");
472 stat = cudaEventCreateWithFlags(&(t->stop_nb_h2d[i]), eventflags);
473 CU_RET_ERR(stat, "cudaEventCreate on stop_nb_h2d failed");
475 stat = cudaEventCreateWithFlags(&(t->start_nb_d2h[i]), eventflags);
476 CU_RET_ERR(stat, "cudaEventCreate on start_nb_d2h failed");
477 stat = cudaEventCreateWithFlags(&(t->stop_nb_d2h[i]), eventflags);
478 CU_RET_ERR(stat, "cudaEventCreate on stop_nb_d2h failed");
482 /*! Initializes the timings data structure. */
483 static void init_timings(gmx_wallclock_gpu_t *t)
492 for (i = 0; i < 2; i++)
494 for (j = 0; j < 2; j++)
496 t->ktime[i][j].t = 0.0;
497 t->ktime[i][j].c = 0;
502 /*! Initializes simulation constant data. */
503 static void nbnxn_cuda_init_const(gmx_nbnxn_cuda_t *nb,
504 const interaction_const_t *ic,
505 const nonbonded_verlet_group_t *nbv_group)
507 init_atomdata_first(nb->atdat, nbv_group[0].nbat->ntype);
508 init_nbparam(nb->nbparam, ic, nbv_group[0].nbat, nb->dev_info);
510 /* clear energy and shift force outputs */
511 nbnxn_cuda_clear_e_fshift(nb);
514 void nbnxn_gpu_init(gmx_nbnxn_cuda_t **p_nb,
515 const gmx_gpu_info_t *gpu_info,
516 const gmx_gpu_opt_t *gpu_opt,
517 const interaction_const_t *ic,
518 nonbonded_verlet_group_t *nbv_grp,
521 gmx_bool bLocalAndNonlocal)
524 gmx_nbnxn_cuda_t *nb;
535 snew(nb->nbparam, 1);
536 snew(nb->plist[eintLocal], 1);
537 if (bLocalAndNonlocal)
539 snew(nb->plist[eintNonlocal], 1);
542 nb->bUseTwoStreams = bLocalAndNonlocal;
545 snew(nb->timings, 1);
548 pmalloc((void**)&nb->nbst.e_lj, sizeof(*nb->nbst.e_lj));
549 pmalloc((void**)&nb->nbst.e_el, sizeof(*nb->nbst.e_el));
550 pmalloc((void**)&nb->nbst.fshift, SHIFTS * sizeof(*nb->nbst.fshift));
552 init_plist(nb->plist[eintLocal]);
554 /* set device info, just point it to the right GPU among the detected ones */
555 nb->dev_info = &gpu_info->gpu_dev[get_gpu_device_id(gpu_info, gpu_opt, my_gpu_index)];
557 /* local/non-local GPU streams */
558 stat = cudaStreamCreate(&nb->stream[eintLocal]);
559 CU_RET_ERR(stat, "cudaStreamCreate on stream[eintLocal] failed");
560 if (nb->bUseTwoStreams)
562 init_plist(nb->plist[eintNonlocal]);
564 /* CUDA stream priority available in the CUDA RT 5.5 API.
565 * Note that the device we're running on does not have to support
566 * priorities, because we are querying the priority range which in this
567 * case will be a single value.
569 #if GMX_CUDA_VERSION >= 5050
571 int highest_priority;
572 stat = cudaDeviceGetStreamPriorityRange(NULL, &highest_priority);
573 CU_RET_ERR(stat, "cudaDeviceGetStreamPriorityRange failed");
575 stat = cudaStreamCreateWithPriority(&nb->stream[eintNonlocal],
578 CU_RET_ERR(stat, "cudaStreamCreateWithPriority on stream[eintNonlocal] failed");
581 stat = cudaStreamCreate(&nb->stream[eintNonlocal]);
582 CU_RET_ERR(stat, "cudaStreamCreate on stream[eintNonlocal] failed");
586 /* init events for sychronization (timing disabled for performance reasons!) */
587 stat = cudaEventCreateWithFlags(&nb->nonlocal_done, cudaEventDisableTiming);
588 CU_RET_ERR(stat, "cudaEventCreate on nonlocal_done failed");
589 stat = cudaEventCreateWithFlags(&nb->misc_ops_and_local_H2D_done, cudaEventDisableTiming);
590 CU_RET_ERR(stat, "cudaEventCreate on misc_ops_and_local_H2D_done failed");
592 /* CUDA timing disabled as event timers don't work:
593 - with multiple streams = domain-decomposition;
594 - when turned off by GMX_DISABLE_CUDA_TIMING.
596 nb->bDoTime = (!nb->bUseTwoStreams &&
597 (getenv("GMX_DISABLE_CUDA_TIMING") == NULL));
601 init_timers(nb->timers, nb->bUseTwoStreams);
602 init_timings(nb->timings);
605 /* set the kernel type for the current GPU */
606 /* pick L1 cache configuration */
607 nbnxn_cuda_set_cacheconfig(nb->dev_info);
609 nbnxn_cuda_init_const(nb, ic, nbv_grp);
615 fprintf(debug, "Initialized CUDA data structures.\n");
619 void nbnxn_gpu_init_pairlist(gmx_nbnxn_cuda_t *nb,
620 const nbnxn_pairlist_t *h_plist,
625 bool bDoTime = nb->bDoTime;
626 cudaStream_t stream = nb->stream[iloc];
627 cu_plist_t *d_plist = nb->plist[iloc];
629 if (d_plist->na_c < 0)
631 d_plist->na_c = h_plist->na_ci;
635 if (d_plist->na_c != h_plist->na_ci)
637 sprintf(sbuf, "In cu_init_plist: the #atoms per cell has changed (from %d to %d)",
638 d_plist->na_c, h_plist->na_ci);
645 stat = cudaEventRecord(nb->timers->start_pl_h2d[iloc], stream);
646 CU_RET_ERR(stat, "cudaEventRecord failed");
649 cu_realloc_buffered((void **)&d_plist->sci, h_plist->sci, sizeof(*d_plist->sci),
650 &d_plist->nsci, &d_plist->sci_nalloc,
654 cu_realloc_buffered((void **)&d_plist->cj4, h_plist->cj4, sizeof(*d_plist->cj4),
655 &d_plist->ncj4, &d_plist->cj4_nalloc,
659 cu_realloc_buffered((void **)&d_plist->excl, h_plist->excl, sizeof(*d_plist->excl),
660 &d_plist->nexcl, &d_plist->excl_nalloc,
666 stat = cudaEventRecord(nb->timers->stop_pl_h2d[iloc], stream);
667 CU_RET_ERR(stat, "cudaEventRecord failed");
670 /* need to prune the pair list during the next step */
671 d_plist->bDoPrune = true;
674 void nbnxn_gpu_upload_shiftvec(gmx_nbnxn_cuda_t *nb,
675 const nbnxn_atomdata_t *nbatom)
677 cu_atomdata_t *adat = nb->atdat;
678 cudaStream_t ls = nb->stream[eintLocal];
680 /* only if we have a dynamic box */
681 if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
683 cu_copy_H2D_async(adat->shift_vec, nbatom->shift_vec,
684 SHIFTS * sizeof(*adat->shift_vec), ls);
685 adat->bShiftVecUploaded = true;
689 /*! Clears the first natoms_clear elements of the GPU nonbonded force output array. */
690 static void nbnxn_cuda_clear_f(gmx_nbnxn_cuda_t *nb, int natoms_clear)
693 cu_atomdata_t *adat = nb->atdat;
694 cudaStream_t ls = nb->stream[eintLocal];
696 stat = cudaMemsetAsync(adat->f, 0, natoms_clear * sizeof(*adat->f), ls);
697 CU_RET_ERR(stat, "cudaMemsetAsync on f falied");
700 /*! Clears nonbonded shift force output array and energy outputs on the GPU. */
701 static void nbnxn_cuda_clear_e_fshift(gmx_nbnxn_cuda_t *nb)
704 cu_atomdata_t *adat = nb->atdat;
705 cudaStream_t ls = nb->stream[eintLocal];
707 stat = cudaMemsetAsync(adat->fshift, 0, SHIFTS * sizeof(*adat->fshift), ls);
708 CU_RET_ERR(stat, "cudaMemsetAsync on fshift falied");
709 stat = cudaMemsetAsync(adat->e_lj, 0, sizeof(*adat->e_lj), ls);
710 CU_RET_ERR(stat, "cudaMemsetAsync on e_lj falied");
711 stat = cudaMemsetAsync(adat->e_el, 0, sizeof(*adat->e_el), ls);
712 CU_RET_ERR(stat, "cudaMemsetAsync on e_el falied");
715 void nbnxn_gpu_clear_outputs(gmx_nbnxn_cuda_t *nb, int flags)
717 nbnxn_cuda_clear_f(nb, nb->atdat->natoms);
718 /* clear shift force array and energies if the outputs were
719 used in the current step */
720 if (flags & GMX_FORCE_VIRIAL)
722 nbnxn_cuda_clear_e_fshift(nb);
726 void nbnxn_gpu_init_atomdata(gmx_nbnxn_cuda_t *nb,
727 const struct nbnxn_atomdata_t *nbat)
732 bool bDoTime = nb->bDoTime;
733 cu_timers_t *timers = nb->timers;
734 cu_atomdata_t *d_atdat = nb->atdat;
735 cudaStream_t ls = nb->stream[eintLocal];
737 natoms = nbat->natoms;
742 /* time async copy */
743 stat = cudaEventRecord(timers->start_atdat, ls);
744 CU_RET_ERR(stat, "cudaEventRecord failed");
747 /* need to reallocate if we have to copy more atoms than the amount of space
748 available and only allocate if we haven't initialized yet, i.e d_atdat->natoms == -1 */
749 if (natoms > d_atdat->nalloc)
751 nalloc = over_alloc_small(natoms);
753 /* free up first if the arrays have already been initialized */
754 if (d_atdat->nalloc != -1)
756 cu_free_buffered(d_atdat->f, &d_atdat->natoms, &d_atdat->nalloc);
757 cu_free_buffered(d_atdat->xq);
758 cu_free_buffered(d_atdat->atom_types);
761 stat = cudaMalloc((void **)&d_atdat->f, nalloc*sizeof(*d_atdat->f));
762 CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->f");
763 stat = cudaMalloc((void **)&d_atdat->xq, nalloc*sizeof(*d_atdat->xq));
764 CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->xq");
766 stat = cudaMalloc((void **)&d_atdat->atom_types, nalloc*sizeof(*d_atdat->atom_types));
767 CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->atom_types");
769 d_atdat->nalloc = nalloc;
773 d_atdat->natoms = natoms;
774 d_atdat->natoms_local = nbat->natoms_local;
776 /* need to clear GPU f output if realloc happened */
779 nbnxn_cuda_clear_f(nb, nalloc);
782 cu_copy_H2D_async(d_atdat->atom_types, nbat->type,
783 natoms*sizeof(*d_atdat->atom_types), ls);
787 stat = cudaEventRecord(timers->stop_atdat, ls);
788 CU_RET_ERR(stat, "cudaEventRecord failed");
792 static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t *nbparam,
793 const gmx_device_info_t *dev_info)
797 if (nbparam->eeltype == eelCuEWALD_TAB || nbparam->eeltype == eelCuEWALD_TAB_TWIN)
799 /* Only device CC >= 3.0 (Kepler and later) support texture objects */
800 if (use_texobj(dev_info))
802 stat = cudaDestroyTextureObject(nbparam->coulomb_tab_texobj);
803 CU_RET_ERR(stat, "cudaDestroyTextureObject on coulomb_tab_texobj failed");
807 GMX_UNUSED_VALUE(dev_info);
808 stat = cudaUnbindTexture(nbnxn_cuda_get_coulomb_tab_texref());
809 CU_RET_ERR(stat, "cudaUnbindTexture on coulomb_tab_texref failed");
811 cu_free_buffered(nbparam->coulomb_tab, &nbparam->coulomb_tab_size);
815 void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb)
818 cu_atomdata_t *atdat;
819 cu_nbparam_t *nbparam;
820 cu_plist_t *plist, *plist_nl;
823 /* Stopping the nvidia profiler here allows us to eliminate the subsequent
824 uninitialization API calls from the trace. */
825 if (getenv("NVPROF_ID") != NULL)
827 stat = cudaProfilerStop();
828 CU_RET_ERR(stat, "cudaProfilerStop failed");
837 nbparam = nb->nbparam;
838 plist = nb->plist[eintLocal];
839 plist_nl = nb->plist[eintNonlocal];
842 nbnxn_cuda_free_nbparam_table(nbparam, nb->dev_info);
844 stat = cudaEventDestroy(nb->nonlocal_done);
845 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->nonlocal_done");
846 stat = cudaEventDestroy(nb->misc_ops_and_local_H2D_done);
847 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->misc_ops_and_local_H2D_done");
851 stat = cudaEventDestroy(timers->start_atdat);
852 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_atdat");
853 stat = cudaEventDestroy(timers->stop_atdat);
854 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_atdat");
856 /* The non-local counters/stream (second in the array) are needed only with DD. */
857 for (int i = 0; i <= (nb->bUseTwoStreams ? 1 : 0); i++)
859 stat = cudaEventDestroy(timers->start_nb_k[i]);
860 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_k");
861 stat = cudaEventDestroy(timers->stop_nb_k[i]);
862 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_k");
864 stat = cudaEventDestroy(timers->start_pl_h2d[i]);
865 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_pl_h2d");
866 stat = cudaEventDestroy(timers->stop_pl_h2d[i]);
867 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_pl_h2d");
869 stat = cudaStreamDestroy(nb->stream[i]);
870 CU_RET_ERR(stat, "cudaStreamDestroy failed on stream");
872 stat = cudaEventDestroy(timers->start_nb_h2d[i]);
873 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_h2d");
874 stat = cudaEventDestroy(timers->stop_nb_h2d[i]);
875 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_h2d");
877 stat = cudaEventDestroy(timers->start_nb_d2h[i]);
878 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_d2h");
879 stat = cudaEventDestroy(timers->stop_nb_d2h[i]);
880 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_d2h");
884 /* Only device CC >= 3.0 (Kepler and later) support texture objects */
885 if (use_texobj(nb->dev_info))
887 stat = cudaDestroyTextureObject(nbparam->nbfp_texobj);
888 CU_RET_ERR(stat, "cudaDestroyTextureObject on nbfp_texobj failed");
892 stat = cudaUnbindTexture(nbnxn_cuda_get_nbfp_texref());
893 CU_RET_ERR(stat, "cudaUnbindTexture on nbfp_texref failed");
895 cu_free_buffered(nbparam->nbfp);
897 if (nbparam->vdwtype == evdwCuEWALDGEOM || nbparam->vdwtype == evdwCuEWALDLB)
899 /* Only device CC >= 3.0 (Kepler and later) support texture objects */
900 if (use_texobj(nb->dev_info))
902 stat = cudaDestroyTextureObject(nbparam->nbfp_comb_texobj);
903 CU_RET_ERR(stat, "cudaDestroyTextureObject on nbfp_comb_texobj failed");
907 stat = cudaUnbindTexture(nbnxn_cuda_get_nbfp_comb_texref());
908 CU_RET_ERR(stat, "cudaUnbindTexture on nbfp_comb_texref failed");
910 cu_free_buffered(nbparam->nbfp_comb);
913 stat = cudaFree(atdat->shift_vec);
914 CU_RET_ERR(stat, "cudaFree failed on atdat->shift_vec");
915 stat = cudaFree(atdat->fshift);
916 CU_RET_ERR(stat, "cudaFree failed on atdat->fshift");
918 stat = cudaFree(atdat->e_lj);
919 CU_RET_ERR(stat, "cudaFree failed on atdat->e_lj");
920 stat = cudaFree(atdat->e_el);
921 CU_RET_ERR(stat, "cudaFree failed on atdat->e_el");
923 cu_free_buffered(atdat->f, &atdat->natoms, &atdat->nalloc);
924 cu_free_buffered(atdat->xq);
925 cu_free_buffered(atdat->atom_types, &atdat->ntypes);
927 cu_free_buffered(plist->sci, &plist->nsci, &plist->sci_nalloc);
928 cu_free_buffered(plist->cj4, &plist->ncj4, &plist->cj4_nalloc);
929 cu_free_buffered(plist->excl, &plist->nexcl, &plist->excl_nalloc);
930 if (nb->bUseTwoStreams)
932 cu_free_buffered(plist_nl->sci, &plist_nl->nsci, &plist_nl->sci_nalloc);
933 cu_free_buffered(plist_nl->cj4, &plist_nl->ncj4, &plist_nl->cj4_nalloc);
934 cu_free_buffered(plist_nl->excl, &plist_nl->nexcl, &plist->excl_nalloc);
940 if (nb->bUseTwoStreams)
950 fprintf(debug, "Cleaned up CUDA data structures.\n");
954 void cu_synchstream_atdat(gmx_nbnxn_cuda_t *nb, int iloc)
957 cudaStream_t stream = nb->stream[iloc];
959 stat = cudaStreamWaitEvent(stream, nb->timers->stop_atdat, 0);
960 CU_RET_ERR(stat, "cudaStreamWaitEvent failed");
963 gmx_wallclock_gpu_t * nbnxn_gpu_get_timings(gmx_nbnxn_cuda_t *nb)
965 return (nb != NULL && nb->bDoTime) ? nb->timings : NULL;
968 void nbnxn_gpu_reset_timings(nonbonded_verlet_t* nbv)
970 /* The NVPROF_ID environment variable is set by nvprof and indicates that
971 mdrun is executed in the CUDA profiler.
972 If nvprof was run is with "--profile-from-start off", the profiler will
973 be started here. This way we can avoid tracing the CUDA events from the
974 first part of the run. Starting the profiler again does nothing.
976 if (getenv("NVPROF_ID") != NULL)
979 stat = cudaProfilerStart();
980 CU_RET_ERR(stat, "cudaProfilerStart failed");
983 if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
985 init_timings(nbv->gpu_nbv->timings);
989 int nbnxn_gpu_min_ci_balanced(gmx_nbnxn_cuda_t *nb)
992 gpu_min_ci_balanced_factor*nb->dev_info->prop.multiProcessorCount : 0;
996 gmx_bool nbnxn_gpu_is_kernel_ewald_analytical(const gmx_nbnxn_cuda_t *nb)
998 return ((nb->nbparam->eeltype == eelCuEWALD_ANA) ||
999 (nb->nbparam->eeltype == eelCuEWALD_ANA_TWIN));