From 9756541c1c0c0259caed70fea23ddadbd090e4b0 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Szil=C3=A1rd=20P=C3=A1ll?= Date: Tue, 12 Apr 2016 15:54:51 +0200 Subject: [PATCH] Move out CUDA profiler triggers from NBNXN The profiler triggering is a general functionality that should not be tied to the nonbonded module. Hence, it is now moved into the gpu_utils module and called directly at reset/cleanup. Change-Id: Ifa862dbcbc6386c514dfcc1f6a5169ea6ae8d09f --- src/gromacs/gpu_utils/gpu_utils.cu | 49 +++++++++++++++++++++- src/gromacs/gpu_utils/gpu_utils.h | 42 ++++++++++++++++++- src/gromacs/mdlib/forcerec.cpp | 2 + .../mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu | 23 ---------- src/programs/mdrun/md.cpp | 2 + 5 files changed, 93 insertions(+), 25 deletions(-) diff --git a/src/gromacs/gpu_utils/gpu_utils.cu b/src/gromacs/gpu_utils/gpu_utils.cu index 3c271547f7..f08d55443e 100644 --- a/src/gromacs/gpu_utils/gpu_utils.cu +++ b/src/gromacs/gpu_utils/gpu_utils.cu @@ -48,6 +48,8 @@ #include #include +#include + #include "gromacs/gpu_utils/cudautils.cuh" #include "gromacs/gpu_utils/pmalloc_cuda.h" #include "gromacs/hardware/gpu_hw_info.h" @@ -86,7 +88,9 @@ static const gmx_bool gmx_unused bCompiledWithApplicationClockSupport = false; * * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side. */ -static int cuda_max_device_count = 32; +static int cuda_max_device_count = 32; + +static bool cudaProfilerRun = ((getenv("NVPROF_ID") != NULL)); /** Dummy kernel used for sanity checking. */ __global__ void k_dummy_test() @@ -783,3 +787,46 @@ void gpu_set_host_malloc_and_free(bool bUseGpuKernels, *nb_free = NULL; } } + +void startGpuProfiler(void) +{ + /* The NVPROF_ID environment variable is set by nvprof and indicates that + mdrun is executed in the CUDA profiler. + If nvprof was run is with "--profile-from-start off", the profiler will + be started here. This way we can avoid tracing the CUDA events from the + first part of the run. Starting the profiler again does nothing. + */ + if (cudaProfilerRun) + { + cudaError_t stat; + stat = cudaProfilerStart(); + CU_RET_ERR(stat, "cudaProfilerStart failed"); + } +} + +void stopGpuProfiler(void) +{ + /* Stopping the nvidia here allows us to eliminate the subsequent + API calls from the trace, e.g. uninitialization and cleanup. */ + if (cudaProfilerRun) + { + cudaError_t stat; + stat = cudaProfilerStop(); + CU_RET_ERR(stat, "cudaProfilerStop failed"); + } +} + +void resetGpuProfiler(void) +{ + /* With CUDA <=7.5 the profiler can't be properly reset; we can only start + * the profiling here (can't stop it) which will achieve the desired effect if + * the run was started with the profiling disabled. + * + * TODO: add a stop (or replace it with reset) when this will work correctly in CUDA. + * stopGpuProfiler(); + */ + if (cudaProfilerRun) + { + startGpuProfiler(); + } +} diff --git a/src/gromacs/gpu_utils/gpu_utils.h b/src/gromacs/gpu_utils/gpu_utils.h index 2b20f38235..80fba6e6f1 100644 --- a/src/gromacs/gpu_utils/gpu_utils.h +++ b/src/gromacs/gpu_utils/gpu_utils.h @@ -3,7 +3,7 @@ * * Copyright (c) 1991-2000, University of Groningen, The Netherlands. * Copyright (c) 2001-2010, The GROMACS development team. - * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by + * Copyright (c) 2012,2013,2014,2015,2016, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -232,4 +232,44 @@ void gpu_set_host_malloc_and_free(bool bUseGpuKernels, gmx_host_alloc_t **nb_alloc, gmx_host_free_t **nb_free); + + +/*! \brief Starts the GPU profiler if mdrun is being profiled. + * + * When a profiler run is in progress (based on the presence of the NVPROF_ID + * env. var.), the profiler is started to begin collecting data during the + * rest of the run (or until stopGpuProfiler is called). + * + * Note that this is implemented only for the CUDA API. + */ +CUDA_FUNC_QUALIFIER +void startGpuProfiler(void) GPU_FUNC_TERM + + +/*! \brief Resets the GPU profiler if mdrun is being profiled. + * + * When a profiler run is in progress (based on the presence of the NVPROF_ID + * env. var.), the profiler data is restet in order to eliminate the data collected + * from the preceding part fo the run. + * + * This function should typically be called at the mdrun counter reset time. + * + * Note that this is implemented only for the CUDA API. + */ +CUDA_FUNC_QUALIFIER +void resetGpuProfiler(void) GPU_FUNC_TERM + + +/*! \brief Stops the CUDA profiler if mdrun is being profiled. + * + * This function can be called at cleanup when skipping recording + * recording subsequent API calls from being traces/profiled is desired, + * e.g. before uninitialization. + * + * Note that this is implemented only for the CUDA API. + */ +CUDA_FUNC_QUALIFIER +void stopGpuProfiler(void) GPU_FUNC_TERM + + #endif diff --git a/src/gromacs/mdlib/forcerec.cpp b/src/gromacs/mdlib/forcerec.cpp index e22d3ac8ed..6ed1608b20 100644 --- a/src/gromacs/mdlib/forcerec.cpp +++ b/src/gromacs/mdlib/forcerec.cpp @@ -3238,6 +3238,8 @@ void free_gpu_resources(const t_forcerec *fr, { /* free nbnxn data in GPU memory */ nbnxn_gpu_free(fr->nbv->gpu_nbv); + /* stop the GPU profiler (only CUDA) */ + stopGpuProfiler(); /* With tMPI we need to wait for all ranks to finish deallocation before * destroying the CUDA context in free_gpu() as some tMPI ranks may be sharing diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu index eafbdbac82..5239842a9e 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu @@ -46,8 +46,6 @@ #include #include -#include - #include "gromacs/gpu_utils/cudautils.cuh" #include "gromacs/gpu_utils/gpu_utils.h" #include "gromacs/gpu_utils/pmalloc_cuda.h" @@ -902,14 +900,6 @@ void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb) cu_plist_t *plist, *plist_nl; cu_timers_t *timers; - /* Stopping the nvidia profiler here allows us to eliminate the subsequent - uninitialization API calls from the trace. */ - if (getenv("NVPROF_ID") != NULL) - { - stat = cudaProfilerStop(); - CU_RET_ERR(stat, "cudaProfilerStop failed"); - } - if (nb == NULL) { return; @@ -1053,19 +1043,6 @@ gmx_wallclock_gpu_t * nbnxn_gpu_get_timings(gmx_nbnxn_cuda_t *nb) void nbnxn_gpu_reset_timings(nonbonded_verlet_t* nbv) { - /* The NVPROF_ID environment variable is set by nvprof and indicates that - mdrun is executed in the CUDA profiler. - If nvprof was run is with "--profile-from-start off", the profiler will - be started here. This way we can avoid tracing the CUDA events from the - first part of the run. Starting the profiler again does nothing. - */ - if (getenv("NVPROF_ID") != NULL) - { - cudaError_t stat; - stat = cudaProfilerStart(); - CU_RET_ERR(stat, "cudaProfilerStart failed"); - } - if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime) { init_timings(nbv->gpu_nbv->timings); diff --git a/src/programs/mdrun/md.cpp b/src/programs/mdrun/md.cpp index 20c0cbc602..fa65cabe81 100644 --- a/src/programs/mdrun/md.cpp +++ b/src/programs/mdrun/md.cpp @@ -56,6 +56,7 @@ #include "gromacs/gmxlib/md_logging.h" #include "gromacs/gmxlib/network.h" #include "gromacs/gmxlib/nrnb.h" +#include "gromacs/gpu_utils/gpu_utils.h" #include "gromacs/imd/imd.h" #include "gromacs/listed-forces/manage-threading.h" #include "gromacs/math/functions.h" @@ -164,6 +165,7 @@ static void reset_all_counters(FILE *fplog, t_commrec *cr, if (use_GPU(nbv)) { nbnxn_gpu_reset_timings(nbv); + resetGpuProfiler(); } wallcycle_stop(wcycle, ewcRUN); -- 2.11.4.GIT