From bb934c6b438b3314cbacc2627c8e3af51b533697 Mon Sep 17 00:00:00 2001 From: Mark Abraham Date: Sat, 1 Jul 2017 01:12:58 +0200 Subject: [PATCH] Refactor error handling in init_gpu Change-Id: I7fe1d196ed3696d359443553407862b08c089c4c --- src/gromacs/gpu_utils/gpu_utils.cu | 28 +++++++++++++++------------- src/gromacs/gpu_utils/gpu_utils.h | 17 +++++++++-------- src/gromacs/gpu_utils/gpu_utils_ocl.cpp | 22 +++++++++------------- src/gromacs/mdlib/forcerec.cpp | 26 +++++++++----------------- 4 files changed, 42 insertions(+), 51 deletions(-) diff --git a/src/gromacs/gpu_utils/gpu_utils.cu b/src/gromacs/gpu_utils/gpu_utils.cu index c2f5527efb..260001f006 100644 --- a/src/gromacs/gpu_utils/gpu_utils.cu +++ b/src/gromacs/gpu_utils/gpu_utils.cu @@ -57,6 +57,7 @@ #include "gromacs/utility/cstringutil.h" #include "gromacs/utility/logger.h" #include "gromacs/utility/smalloc.h" +#include "gromacs/utility/snprintf.h" #if HAVE_NVML #include @@ -452,29 +453,34 @@ static gmx_bool reset_gpu_application_clocks(const gmx_device_info_t gmx_unused #endif /* HAVE_NVML_APPLICATION_CLOCKS */ } -gmx_bool init_gpu(const gmx::MDLogger &mdlog, int mygpu, char *result_str, - const struct gmx_gpu_info_t *gpu_info, - const struct gmx_gpu_opt_t *gpu_opt) +void init_gpu(const gmx::MDLogger &mdlog, int rank, int mygpu, + const struct gmx_gpu_info_t *gpu_info, + const struct gmx_gpu_opt_t *gpu_opt) { cudaError_t stat; char sbuf[STRLEN]; int gpuid; assert(gpu_info); - assert(result_str); + assert(gpu_opt); if (mygpu < 0 || mygpu >= gpu_opt->n_dev_use) { - sprintf(sbuf, "Trying to initialize an non-existent GPU: " - "there are %d selected GPU(s), but #%d was requested.", - gpu_opt->n_dev_use, mygpu); + snprintf(sbuf, STRLEN, "On rank %d trying to initialize an non-existent GPU: " + "there are %d selected GPU(s), but #%d was requested.", + rank, gpu_opt->n_dev_use, mygpu); gmx_incons(sbuf); } gpuid = gpu_info->gpu_dev[gpu_opt->dev_use[mygpu]].id; stat = cudaSetDevice(gpuid); - strncpy(result_str, cudaGetErrorString(stat), STRLEN); + if (stat != cudaSuccess) + { + snprintf(sbuf, STRLEN, "On rank %d failed to initialize GPU #%d", + rank, mygpu); + CU_RET_ERR(stat, sbuf); + } if (debug) { @@ -482,11 +488,7 @@ gmx_bool init_gpu(const gmx::MDLogger &mdlog, int mygpu, char *result_str, } //Ignoring return value as NVML errors should be treated not critical. - if (stat == cudaSuccess) - { - init_gpu_application_clocks(mdlog, gpuid, gpu_info); - } - return (stat == cudaSuccess); + init_gpu_application_clocks(mdlog, gpuid, gpu_info); } gmx_bool free_cuda_gpu( diff --git a/src/gromacs/gpu_utils/gpu_utils.h b/src/gromacs/gpu_utils/gpu_utils.h index 86001cbdf1..bcafdf7f7f 100644 --- a/src/gromacs/gpu_utils/gpu_utils.h +++ b/src/gromacs/gpu_utils/gpu_utils.h @@ -107,19 +107,20 @@ void free_gpu_info(const struct gmx_gpu_info_t *GPU_FUNC_ARGUMENT(gpu_info)) GPU * gpu_info.gpu_dev array. * * \param mdlog log file to write to + * \param[in] rank MPI rank of this process (for error output) * \param[in] mygpu index of the GPU to initialize - * \param[out] result_str the message related to the error that occurred - * during the initialization (if there was any). * \param[in] gpu_info GPU info of all detected devices in the system. * \param[in] gpu_opt options for using the GPUs in gpu_info - * \returns true if no error occurs during initialization. + * + * Issues a fatal error for any critical errors that occur during + * initialization. */ GPU_FUNC_QUALIFIER -gmx_bool init_gpu(const gmx::MDLogger &GPU_FUNC_ARGUMENT(mdlog), - int GPU_FUNC_ARGUMENT(mygpu), - char *GPU_FUNC_ARGUMENT(result_str), - const struct gmx_gpu_info_t *GPU_FUNC_ARGUMENT(gpu_info), - const gmx_gpu_opt_t *GPU_FUNC_ARGUMENT(gpu_opt)) GPU_FUNC_TERM_WITH_RETURN(-1) +void init_gpu(const gmx::MDLogger &GPU_FUNC_ARGUMENT(mdlog), + int GPU_FUNC_ARGUMENT(rank), + int GPU_FUNC_ARGUMENT(mygpu), + const struct gmx_gpu_info_t *GPU_FUNC_ARGUMENT(gpu_info), + const gmx_gpu_opt_t *GPU_FUNC_ARGUMENT(gpu_opt)) GPU_FUNC_TERM /*! \brief Frees up the CUDA GPU used by the active context at the time of calling. * diff --git a/src/gromacs/gpu_utils/gpu_utils_ocl.cpp b/src/gromacs/gpu_utils/gpu_utils_ocl.cpp index 2b245395d8..8d02efa51d 100644 --- a/src/gromacs/gpu_utils/gpu_utils_ocl.cpp +++ b/src/gromacs/gpu_utils/gpu_utils_ocl.cpp @@ -397,23 +397,21 @@ void get_gpu_device_info_string(char gmx_unused *s, const gmx_gpu_info_t gmx_unu } //! This function is documented in the header file -gmx_bool init_gpu(const gmx::MDLogger & /*mdlog*/, - int mygpu, - char *result_str, - const gmx_gpu_info_t gmx_unused *gpu_info, - const gmx_gpu_opt_t *gpu_opt - ) +void init_gpu(const gmx::MDLogger & /*mdlog*/, + int rank, + int mygpu, + const gmx_gpu_info_t *gpu_info, + const gmx_gpu_opt_t *gpu_opt + ) { - assert(result_str); - - result_str[0] = 0; + assert(gpu_opt); if (mygpu < 0 || mygpu >= gpu_opt->n_dev_use) { char sbuf[STRLEN]; - sprintf(sbuf, "Trying to initialize an non-existent GPU: " + sprintf(sbuf, "On rank %d trying to initialize an non-existent GPU: " "there are %d selected GPU(s), but #%d was requested.", - gpu_opt->n_dev_use, mygpu); + rank, gpu_opt->n_dev_use, mygpu); gmx_incons(sbuf); } @@ -433,8 +431,6 @@ gmx_bool init_gpu(const gmx::MDLogger & /*mdlog*/, setenv("CUDA_CACHE_DISABLE", "1", 0); #endif } - - return TRUE; } //! This function is documented in the header file diff --git a/src/gromacs/mdlib/forcerec.cpp b/src/gromacs/mdlib/forcerec.cpp index 77591bda1a..451822d994 100644 --- a/src/gromacs/mdlib/forcerec.cpp +++ b/src/gromacs/mdlib/forcerec.cpp @@ -1768,8 +1768,6 @@ static void pick_nbnxn_resources(const gmx::MDLogger &mdlog, bool emulateGpu, const gmx_gpu_opt_t *gpu_opt) { - char gpu_err_str[STRLEN]; - *bUseGPU = FALSE; /* Enable GPU mode when GPUs are available or no GPU emulation is requested. @@ -1777,21 +1775,15 @@ static void pick_nbnxn_resources(const gmx::MDLogger &mdlog, if (gpu_opt->n_dev_use > 0 && !emulateGpu) { /* Each PP node will use the intra-node id-th device from the - * list of detected/selected GPUs. */ - if (!init_gpu(mdlog, cr->rank_pp_intranode, gpu_err_str, - &hwinfo->gpu_info, gpu_opt)) - { - /* At this point the init should never fail as we made sure that - * we have all the GPUs we need. If it still does, we'll bail. */ - /* TODO the decorating of gpu_err_str is nicer if it - happens inside init_gpu. Out here, the decorating with - the MPI rank makes sense. */ - gmx_fatal(FARGS, "On rank %d failed to initialize GPU #%d: %s", - cr->nodeid, - get_gpu_device_id(&hwinfo->gpu_info, gpu_opt, - cr->rank_pp_intranode), - gpu_err_str); - } + * list of detected/selected GPUs. + * + * At this point the init should never fail as we made sure that + * we have all the GPUs we need. If it still does, we'll exit. + * + * TODO The error reporting will be nicer when the logger is + * aware of MPI ranks. */ + init_gpu(mdlog, cr->nodeid, cr->rank_pp_intranode, + &hwinfo->gpu_info, gpu_opt); /* Here we actually turn on hardware GPU acceleration */ *bUseGPU = TRUE; -- 2.11.4.GIT