From 9756541c1c0c0259caed70fea23ddadbd090e4b0 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Szil=C3=A1rd=20P=C3=A1ll?= <pall.szilard@gmail.com>
Date: Tue, 12 Apr 2016 15:54:51 +0200
Subject: [PATCH] Move out CUDA profiler triggers from NBNXN

The profiler triggering is a general functionality that should not be
tied to the nonbonded module. Hence, it is now moved into the gpu_utils
module and called directly at reset/cleanup.

Change-Id: Ifa862dbcbc6386c514dfcc1f6a5169ea6ae8d09f
---
 src/gromacs/gpu_utils/gpu_utils.cu                 | 49 +++++++++++++++++++++-
 src/gromacs/gpu_utils/gpu_utils.h                  | 42 ++++++++++++++++++-
 src/gromacs/mdlib/forcerec.cpp                     |  2 +
 .../mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu       | 23 ----------
 src/programs/mdrun/md.cpp                          |  2 +
 5 files changed, 93 insertions(+), 25 deletions(-)

diff --git a/src/gromacs/gpu_utils/gpu_utils.cu b/src/gromacs/gpu_utils/gpu_utils.cu
index 3c271547f7..f08d55443e 100644
--- a/src/gromacs/gpu_utils/gpu_utils.cu
+++ b/src/gromacs/gpu_utils/gpu_utils.cu
@@ -48,6 +48,8 @@
 #include <stdio.h>
 #include <stdlib.h>
 
+#include <cuda_profiler_api.h>
+
 #include "gromacs/gpu_utils/cudautils.cuh"
 #include "gromacs/gpu_utils/pmalloc_cuda.h"
 #include "gromacs/hardware/gpu_hw_info.h"
@@ -86,7 +88,9 @@ static const gmx_bool gmx_unused bCompiledWithApplicationClockSupport = false;
  *
  * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side.
  */
-static int cuda_max_device_count = 32;
+static int  cuda_max_device_count = 32;
+
+static bool cudaProfilerRun      = ((getenv("NVPROF_ID") != NULL));
 
 /** Dummy kernel used for sanity checking. */
 __global__ void k_dummy_test()
@@ -783,3 +787,46 @@ void gpu_set_host_malloc_and_free(bool               bUseGpuKernels,
         *nb_free  = NULL;
     }
 }
+
+void startGpuProfiler(void)
+{
+    /* The NVPROF_ID environment variable is set by nvprof and indicates that
+       mdrun is executed in the CUDA profiler.
+       If nvprof was run is with "--profile-from-start off", the profiler will
+       be started here. This way we can avoid tracing the CUDA events from the
+       first part of the run. Starting the profiler again does nothing.
+     */
+    if (cudaProfilerRun)
+    {
+        cudaError_t stat;
+        stat = cudaProfilerStart();
+        CU_RET_ERR(stat, "cudaProfilerStart failed");
+    }
+}
+
+void stopGpuProfiler(void)
+{
+    /* Stopping the nvidia here allows us to eliminate the subsequent
+       API calls from the trace, e.g. uninitialization and cleanup. */
+    if (cudaProfilerRun)
+    {
+        cudaError_t stat;
+        stat = cudaProfilerStop();
+        CU_RET_ERR(stat, "cudaProfilerStop failed");
+    }
+}
+
+void resetGpuProfiler(void)
+{
+    /* With CUDA <=7.5 the profiler can't be properly reset; we can only start
+     *  the profiling here (can't stop it) which will achieve the desired effect if
+     *  the run was started with the profiling disabled.
+     *
+     * TODO: add a stop (or replace it with reset) when this will work correctly in CUDA.
+     * stopGpuProfiler();
+     */
+    if (cudaProfilerRun)
+    {
+        startGpuProfiler();
+    }
+}
diff --git a/src/gromacs/gpu_utils/gpu_utils.h b/src/gromacs/gpu_utils/gpu_utils.h
index 2b20f38235..80fba6e6f1 100644
--- a/src/gromacs/gpu_utils/gpu_utils.h
+++ b/src/gromacs/gpu_utils/gpu_utils.h
@@ -3,7 +3,7 @@
  *
  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  * Copyright (c) 2001-2010, The GROMACS development team.
- * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015,2016, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -232,4 +232,44 @@ void gpu_set_host_malloc_and_free(bool               bUseGpuKernels,
                                   gmx_host_alloc_t **nb_alloc,
                                   gmx_host_free_t  **nb_free);
 
+
+
+/*! \brief Starts the GPU profiler if mdrun is being profiled.
+ *
+ *  When a profiler run is in progress (based on the presence of the NVPROF_ID
+ *  env. var.), the profiler is started to begin collecting data during the
+ *  rest of the run (or until stopGpuProfiler is called).
+ *
+ *  Note that this is implemented only for the CUDA API.
+ */
+CUDA_FUNC_QUALIFIER
+void startGpuProfiler(void) GPU_FUNC_TERM
+
+
+/*! \brief Resets the GPU profiler if mdrun is being profiled.
+ *
+ * When a profiler run is in progress (based on the presence of the NVPROF_ID
+ * env. var.), the profiler data is restet in order to eliminate the data collected
+ * from the preceding part fo the run.
+ *
+ * This function should typically be called at the mdrun counter reset time.
+ *
+ * Note that this is implemented only for the CUDA API.
+ */
+CUDA_FUNC_QUALIFIER
+void resetGpuProfiler(void) GPU_FUNC_TERM
+
+
+/*! \brief Stops the CUDA profiler if mdrun is being profiled.
+ *
+ *  This function can be called at cleanup when skipping recording
+ *  recording subsequent API calls from being traces/profiled is desired,
+ *  e.g. before uninitialization.
+ *
+ *  Note that this is implemented only for the CUDA API.
+ */
+CUDA_FUNC_QUALIFIER
+void stopGpuProfiler(void) GPU_FUNC_TERM
+
+
 #endif
diff --git a/src/gromacs/mdlib/forcerec.cpp b/src/gromacs/mdlib/forcerec.cpp
index e22d3ac8ed..6ed1608b20 100644
--- a/src/gromacs/mdlib/forcerec.cpp
+++ b/src/gromacs/mdlib/forcerec.cpp
@@ -3238,6 +3238,8 @@ void free_gpu_resources(const t_forcerec     *fr,
     {
         /* free nbnxn data in GPU memory */
         nbnxn_gpu_free(fr->nbv->gpu_nbv);
+        /* stop the GPU profiler (only CUDA) */
+        stopGpuProfiler();
 
         /* With tMPI we need to wait for all ranks to finish deallocation before
          * destroying the CUDA context in free_gpu() as some tMPI ranks may be sharing
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
index eafbdbac82..5239842a9e 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
@@ -46,8 +46,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 
-#include <cuda_profiler_api.h>
-
 #include "gromacs/gpu_utils/cudautils.cuh"
 #include "gromacs/gpu_utils/gpu_utils.h"
 #include "gromacs/gpu_utils/pmalloc_cuda.h"
@@ -902,14 +900,6 @@ void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb)
     cu_plist_t      *plist, *plist_nl;
     cu_timers_t     *timers;
 
-    /* Stopping the nvidia profiler here allows us to eliminate the subsequent
-       uninitialization API calls from the trace. */
-    if (getenv("NVPROF_ID") != NULL)
-    {
-        stat = cudaProfilerStop();
-        CU_RET_ERR(stat, "cudaProfilerStop failed");
-    }
-
     if (nb == NULL)
     {
         return;
@@ -1053,19 +1043,6 @@ gmx_wallclock_gpu_t * nbnxn_gpu_get_timings(gmx_nbnxn_cuda_t *nb)
 
 void nbnxn_gpu_reset_timings(nonbonded_verlet_t* nbv)
 {
-    /* The NVPROF_ID environment variable is set by nvprof and indicates that
-       mdrun is executed in the CUDA profiler.
-       If nvprof was run is with "--profile-from-start off", the profiler will
-       be started here. This way we can avoid tracing the CUDA events from the
-       first part of the run. Starting the profiler again does nothing.
-     */
-    if (getenv("NVPROF_ID") != NULL)
-    {
-        cudaError_t stat;
-        stat = cudaProfilerStart();
-        CU_RET_ERR(stat, "cudaProfilerStart failed");
-    }
-
     if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
     {
         init_timings(nbv->gpu_nbv->timings);
diff --git a/src/programs/mdrun/md.cpp b/src/programs/mdrun/md.cpp
index 20c0cbc602..fa65cabe81 100644
--- a/src/programs/mdrun/md.cpp
+++ b/src/programs/mdrun/md.cpp
@@ -56,6 +56,7 @@
 #include "gromacs/gmxlib/md_logging.h"
 #include "gromacs/gmxlib/network.h"
 #include "gromacs/gmxlib/nrnb.h"
+#include "gromacs/gpu_utils/gpu_utils.h"
 #include "gromacs/imd/imd.h"
 #include "gromacs/listed-forces/manage-threading.h"
 #include "gromacs/math/functions.h"
@@ -164,6 +165,7 @@ static void reset_all_counters(FILE *fplog, t_commrec *cr,
     if (use_GPU(nbv))
     {
         nbnxn_gpu_reset_timings(nbv);
+        resetGpuProfiler();
     }
 
     wallcycle_stop(wcycle, ewcRUN);
-- 
2.11.4.GIT