From bb934c6b438b3314cbacc2627c8e3af51b533697 Mon Sep 17 00:00:00 2001
From: Mark Abraham <mark.j.abraham@gmail.com>
Date: Sat, 1 Jul 2017 01:12:58 +0200
Subject: [PATCH] Refactor error handling in init_gpu

Change-Id: I7fe1d196ed3696d359443553407862b08c089c4c
---
 src/gromacs/gpu_utils/gpu_utils.cu      | 28 +++++++++++++++-------------
 src/gromacs/gpu_utils/gpu_utils.h       | 17 +++++++++--------
 src/gromacs/gpu_utils/gpu_utils_ocl.cpp | 22 +++++++++-------------
 src/gromacs/mdlib/forcerec.cpp          | 26 +++++++++-----------------
 4 files changed, 42 insertions(+), 51 deletions(-)

diff --git a/src/gromacs/gpu_utils/gpu_utils.cu b/src/gromacs/gpu_utils/gpu_utils.cu
index c2f5527efb..260001f006 100644
--- a/src/gromacs/gpu_utils/gpu_utils.cu
+++ b/src/gromacs/gpu_utils/gpu_utils.cu
@@ -57,6 +57,7 @@
 #include "gromacs/utility/cstringutil.h"
 #include "gromacs/utility/logger.h"
 #include "gromacs/utility/smalloc.h"
+#include "gromacs/utility/snprintf.h"
 
 #if HAVE_NVML
 #include <nvml.h>
@@ -452,29 +453,34 @@ static gmx_bool reset_gpu_application_clocks(const gmx_device_info_t gmx_unused
 #endif /* HAVE_NVML_APPLICATION_CLOCKS */
 }
 
-gmx_bool init_gpu(const gmx::MDLogger &mdlog, int mygpu, char *result_str,
-                  const struct gmx_gpu_info_t *gpu_info,
-                  const struct gmx_gpu_opt_t *gpu_opt)
+void init_gpu(const gmx::MDLogger &mdlog, int rank, int mygpu,
+              const struct gmx_gpu_info_t *gpu_info,
+              const struct gmx_gpu_opt_t *gpu_opt)
 {
     cudaError_t stat;
     char        sbuf[STRLEN];
     int         gpuid;
 
     assert(gpu_info);
-    assert(result_str);
+    assert(gpu_opt);
 
     if (mygpu < 0 || mygpu >= gpu_opt->n_dev_use)
     {
-        sprintf(sbuf, "Trying to initialize an non-existent GPU: "
-                "there are %d selected GPU(s), but #%d was requested.",
-                gpu_opt->n_dev_use, mygpu);
+        snprintf(sbuf, STRLEN, "On rank %d trying to initialize an non-existent GPU: "
+                 "there are %d selected GPU(s), but #%d was requested.",
+                 rank, gpu_opt->n_dev_use, mygpu);
         gmx_incons(sbuf);
     }
 
     gpuid = gpu_info->gpu_dev[gpu_opt->dev_use[mygpu]].id;
 
     stat = cudaSetDevice(gpuid);
-    strncpy(result_str, cudaGetErrorString(stat), STRLEN);
+    if (stat != cudaSuccess)
+    {
+        snprintf(sbuf, STRLEN, "On rank %d failed to initialize GPU #%d",
+                 rank, mygpu);
+        CU_RET_ERR(stat, sbuf);
+    }
 
     if (debug)
     {
@@ -482,11 +488,7 @@ gmx_bool init_gpu(const gmx::MDLogger &mdlog, int mygpu, char *result_str,
     }
 
     //Ignoring return value as NVML errors should be treated not critical.
-    if (stat == cudaSuccess)
-    {
-        init_gpu_application_clocks(mdlog, gpuid, gpu_info);
-    }
-    return (stat == cudaSuccess);
+    init_gpu_application_clocks(mdlog, gpuid, gpu_info);
 }
 
 gmx_bool free_cuda_gpu(
diff --git a/src/gromacs/gpu_utils/gpu_utils.h b/src/gromacs/gpu_utils/gpu_utils.h
index 86001cbdf1..bcafdf7f7f 100644
--- a/src/gromacs/gpu_utils/gpu_utils.h
+++ b/src/gromacs/gpu_utils/gpu_utils.h
@@ -107,19 +107,20 @@ void free_gpu_info(const struct gmx_gpu_info_t *GPU_FUNC_ARGUMENT(gpu_info)) GPU
  * gpu_info.gpu_dev array.
  *
  * \param      mdlog        log file to write to
+ * \param[in]  rank         MPI rank of this process (for error output)
  * \param[in]  mygpu        index of the GPU to initialize
- * \param[out] result_str   the message related to the error that occurred
- *                          during the initialization (if there was any).
  * \param[in] gpu_info      GPU info of all detected devices in the system.
  * \param[in] gpu_opt       options for using the GPUs in gpu_info
- * \returns                 true if no error occurs during initialization.
+ *
+ * Issues a fatal error for any critical errors that occur during
+ * initialization.
  */
 GPU_FUNC_QUALIFIER
-gmx_bool init_gpu(const gmx::MDLogger &GPU_FUNC_ARGUMENT(mdlog),
-                  int GPU_FUNC_ARGUMENT(mygpu),
-                  char *GPU_FUNC_ARGUMENT(result_str),
-                  const struct gmx_gpu_info_t *GPU_FUNC_ARGUMENT(gpu_info),
-                  const gmx_gpu_opt_t *GPU_FUNC_ARGUMENT(gpu_opt)) GPU_FUNC_TERM_WITH_RETURN(-1)
+void init_gpu(const gmx::MDLogger &GPU_FUNC_ARGUMENT(mdlog),
+              int GPU_FUNC_ARGUMENT(rank),
+              int GPU_FUNC_ARGUMENT(mygpu),
+              const struct gmx_gpu_info_t *GPU_FUNC_ARGUMENT(gpu_info),
+              const gmx_gpu_opt_t *GPU_FUNC_ARGUMENT(gpu_opt)) GPU_FUNC_TERM
 
 /*! \brief Frees up the CUDA GPU used by the active context at the time of calling.
  *
diff --git a/src/gromacs/gpu_utils/gpu_utils_ocl.cpp b/src/gromacs/gpu_utils/gpu_utils_ocl.cpp
index 2b245395d8..8d02efa51d 100644
--- a/src/gromacs/gpu_utils/gpu_utils_ocl.cpp
+++ b/src/gromacs/gpu_utils/gpu_utils_ocl.cpp
@@ -397,23 +397,21 @@ void get_gpu_device_info_string(char gmx_unused *s, const gmx_gpu_info_t gmx_unu
 }
 
 //! This function is documented in the header file
-gmx_bool init_gpu(const gmx::MDLogger              & /*mdlog*/,
-                  int                              mygpu,
-                  char                            *result_str,
-                  const gmx_gpu_info_t gmx_unused *gpu_info,
-                  const gmx_gpu_opt_t             *gpu_opt
-                  )
+void init_gpu(const gmx::MDLogger               & /*mdlog*/,
+              int                               rank,
+              int                               mygpu,
+              const gmx_gpu_info_t             *gpu_info,
+              const gmx_gpu_opt_t              *gpu_opt
+              )
 {
-    assert(result_str);
-
-    result_str[0] = 0;
+    assert(gpu_opt);
 
     if (mygpu < 0 || mygpu >= gpu_opt->n_dev_use)
     {
         char        sbuf[STRLEN];
-        sprintf(sbuf, "Trying to initialize an non-existent GPU: "
+        sprintf(sbuf, "On rank %d trying to initialize an non-existent GPU: "
                 "there are %d selected GPU(s), but #%d was requested.",
-                gpu_opt->n_dev_use, mygpu);
+                rank, gpu_opt->n_dev_use, mygpu);
         gmx_incons(sbuf);
     }
 
@@ -433,8 +431,6 @@ gmx_bool init_gpu(const gmx::MDLogger              & /*mdlog*/,
         setenv("CUDA_CACHE_DISABLE", "1", 0);
 #endif
     }
-
-    return TRUE;
 }
 
 //! This function is documented in the header file
diff --git a/src/gromacs/mdlib/forcerec.cpp b/src/gromacs/mdlib/forcerec.cpp
index 77591bda1a..451822d994 100644
--- a/src/gromacs/mdlib/forcerec.cpp
+++ b/src/gromacs/mdlib/forcerec.cpp
@@ -1768,8 +1768,6 @@ static void pick_nbnxn_resources(const gmx::MDLogger &mdlog,
                                  bool                 emulateGpu,
                                  const gmx_gpu_opt_t *gpu_opt)
 {
-    char     gpu_err_str[STRLEN];
-
     *bUseGPU = FALSE;
 
     /* Enable GPU mode when GPUs are available or no GPU emulation is requested.
@@ -1777,21 +1775,15 @@ static void pick_nbnxn_resources(const gmx::MDLogger &mdlog,
     if (gpu_opt->n_dev_use > 0 && !emulateGpu)
     {
         /* Each PP node will use the intra-node id-th device from the
-         * list of detected/selected GPUs. */
-        if (!init_gpu(mdlog, cr->rank_pp_intranode, gpu_err_str,
-                      &hwinfo->gpu_info, gpu_opt))
-        {
-            /* At this point the init should never fail as we made sure that
-             * we have all the GPUs we need. If it still does, we'll bail. */
-            /* TODO the decorating of gpu_err_str is nicer if it
-               happens inside init_gpu. Out here, the decorating with
-               the MPI rank makes sense. */
-            gmx_fatal(FARGS, "On rank %d failed to initialize GPU #%d: %s",
-                      cr->nodeid,
-                      get_gpu_device_id(&hwinfo->gpu_info, gpu_opt,
-                                        cr->rank_pp_intranode),
-                      gpu_err_str);
-        }
+         * list of detected/selected GPUs.
+         *
+         * At this point the init should never fail as we made sure that
+         * we have all the GPUs we need. If it still does, we'll exit.
+         *
+         * TODO The error reporting will be nicer when the logger is
+         * aware of MPI ranks. */
+        init_gpu(mdlog, cr->nodeid, cr->rank_pp_intranode,
+                 &hwinfo->gpu_info, gpu_opt);
 
         /* Here we actually turn on hardware GPU acceleration */
         *bUseGPU = TRUE;
-- 
2.11.4.GIT