prepareGpuKernelArguments() and launchGpuKernel() are added

src/gromacs/gpu_utils/oclutils.h

/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2014,2015,2016,2017,2018, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \libinternal \file
 *  \brief Declare utility routines for OpenCL
 *
 *  \author Anca Hamuraru <anca@streamcomputing.eu>
 *  \inlibraryapi
 */
#ifndef GMX_GPU_UTILS_OCLUTILS_H
#define GMX_GPU_UTILS_OCLUTILS_H

#include <string>

#include "gromacs/gpu_utils/gmxopencl.h"
#include "gromacs/gpu_utils/gputraits_ocl.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/gmxassert.h"

enum class GpuApiCallBehavior;

/*! \brief OpenCL vendor IDs */
typedef enum {
    OCL_VENDOR_NVIDIA = 0,
    OCL_VENDOR_AMD,
    OCL_VENDOR_INTEL,
    OCL_VENDOR_UNKNOWN
} ocl_vendor_id_t;

/*! \internal
 * \brief OpenCL GPU device identifier
 *
 * An OpenCL device is identified by its ID.
 * The platform ID is also included for caching reasons.
 */
typedef struct
{
    cl_platform_id ocl_platform_id; /**< Platform ID */
    cl_device_id   ocl_device_id;   /**< Device ID */
} ocl_gpu_id_t;

/*! \internal
 * \brief OpenCL device information.
 *
 * The OpenCL device information is queried and set at detection and contains
 * both information about the device/hardware returned by the runtime as well
 * as additional data like support status.
 */
struct gmx_device_info_t
{
    ocl_gpu_id_t    ocl_gpu_id;          /**< device ID assigned at detection */
    char            device_name[256];    /**< device name */
    char            device_version[256]; /**< device version */
    char            device_vendor[256];  /**< device vendor */
    int             compute_units;       /**< number of compute units */
    int             adress_bits;         /**< number of address bits the device is capable of */
    int             stat;                /**< device status takes values of e_gpu_detect_res_t */
    ocl_vendor_id_t vendor_e;            /**< device vendor as defined by ocl_vendor_id_t */
};

/*! \internal
 * \brief OpenCL GPU runtime data
 *
 * The device runtime data is meant to hold objects associated with a GROMACS rank's
 * (thread or process) use of a single device (multiple devices per rank is not
 * implemented). These objects should be constructed at the point where a device
 * gets assigned to a rank and released when this assignment is no longer valid
 * (i.e. at cleanup in the current implementation).
 */
struct gmx_device_runtime_data_t
{
    cl_context context; /**< OpenCL context */
    cl_program program; /**< OpenCL program */
};

/*! \brief Launches synchronous or asynchronous device to host memory copy.
 *
 * If copy_event is not NULL, on return it will contain an event object
 * identifying this particular device to host operation. The event can further
 * be used to queue a wait for this operation or to query profiling information.
 */
int ocl_copy_D2H(void * h_dest, cl_mem d_src,
                 size_t offset, size_t bytes,
                 GpuApiCallBehavior transferKind,
                 cl_command_queue command_queue,
                 cl_event *copy_event);
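
/* Illustrative usage sketch (not part of the original header): copying forces
 * from the device back to the host with ocl_copy_D2H(). The buffer and queue
 * names are hypothetical, and GpuApiCallBehavior::Async is assumed to be the
 * asynchronous member of the GpuApiCallBehavior enum declared above.
 *
 *   cl_event copyDone;
 *   // d_forces is a cl_mem device buffer, h_forces a pinned host buffer
 *   // (see pmalloc() below), and queue a cl_command_queue owned by the caller.
 *   int stat = ocl_copy_D2H(h_forces, d_forces,
 *                           0, numAtoms * 3 * sizeof(float),
 *                           GpuApiCallBehavior::Async,
 *                           queue, &copyDone);
 *   // copyDone can be passed to clWaitForEvents() or queried for profiling.
 */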

/*! \brief Launches asynchronous device to host memory copy. */
int ocl_copy_D2H_async(void * h_dest, cl_mem d_src,
                       size_t offset, size_t bytes,
                       cl_command_queue command_queue,
                       cl_event *copy_event);

/*! \brief Launches synchronous or asynchronous host to device memory copy.
 *
 * If copy_event is not NULL, on return it will contain an event object
 * identifying this particular host to device operation. The event can further
 * be used to queue a wait for this operation or to query profiling information.
 */
int ocl_copy_H2D(cl_mem d_dest, void* h_src,
                 size_t offset, size_t bytes,
                 GpuApiCallBehavior transferKind,
                 cl_command_queue command_queue,
                 cl_event *copy_event);

/*! \brief Launches asynchronous host to device memory copy. */
int ocl_copy_H2D_async(cl_mem d_dest, void * h_src,
                       size_t offset, size_t bytes,
                       cl_command_queue command_queue,
                       cl_event *copy_event);

/*! \brief Launches synchronous host to device memory copy. */
int ocl_copy_H2D_sync(cl_mem d_dest, void * h_src,
                      size_t offset, size_t bytes,
                      cl_command_queue command_queue);
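
/* Illustrative usage sketch (not part of the original header): uploading host
 * data with the H2D helpers. The buffer and queue names are hypothetical; the
 * _sync variant blocks until the copy has completed, the _async variant only
 * enqueues it and optionally returns an event.
 *
 *   // d_coords is a cl_mem device buffer, h_coords a host buffer, and
 *   // queue a cl_command_queue owned by the caller.
 *   ocl_copy_H2D_sync(d_coords, h_coords, 0, numAtoms * 3 * sizeof(float), queue);
 *   // or, without blocking the host:
 *   cl_event uploadDone;
 *   ocl_copy_H2D_async(d_coords, h_coords, 0, numAtoms * 3 * sizeof(float), queue, &uploadDone);
 */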

/*! \brief Allocate host memory in malloc style */
void pmalloc(void **h_ptr, size_t nbytes);

/*! \brief Free host memory in malloc style */
void pfree(void *h_ptr);
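
/* Illustrative usage sketch (not part of the original header): pmalloc() and
 * pfree() are used in matched pairs for host buffers that take part in device
 * transfers; the buffer size below is hypothetical.
 *
 *   void *h_buffer = nullptr;
 *   pmalloc(&h_buffer, 1024 * sizeof(float));
 *   // ... use h_buffer as the host side of ocl_copy_H2D/ocl_copy_D2H ...
 *   pfree(h_buffer);
 */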

/*! \brief Convert error code to diagnostic string */
std::string ocl_get_error_string(cl_int error);

/*! \brief Calls clFinish() in the stream \p s.
 *
 * \param[in] s stream to synchronize with
 */
static inline void gpuStreamSynchronize(cl_command_queue s)
{
    cl_int cl_error = clFinish(s);
    GMX_RELEASE_ASSERT(CL_SUCCESS == cl_error,
                       ("Error caught during clFinish:" + ocl_get_error_string(cl_error)).c_str());
}

/*! \brief Pretend to synchronize an OpenCL stream (dummy implementation).
 *
 * \param[in] s queue to check
 *
 * \returns True if all tasks enqueued in the stream \p s (at the time of this call) have completed.
 */
static inline bool haveStreamTasksCompleted(cl_command_queue gmx_unused s)
{
    GMX_RELEASE_ASSERT(false, "haveStreamTasksCompleted is not implemented for OpenCL");
    return false;
}

/* Kernel launch helpers */

/*! \brief
 * A function for setting up a single OpenCL kernel argument.
 * This is the tail of the compile-time recursive function below.
 * It has to be seen by the compiler first.
 * As NB kernels might be using dynamic local memory as the last argument,
 * this function also manages that, using sharedMemorySize from \p config.
 *
 * \param[in] kernel   Kernel function handle
 * \param[in] config   Kernel configuration for launching
 * \param[in] argIndex Index of the current argument
 */
void inline prepareGpuKernelArgument(cl_kernel                 kernel,
                                     const KernelLaunchConfig &config,
                                     size_t                    argIndex)
{
    if (config.sharedMemorySize > 0)
    {
        cl_int gmx_used_in_debug clError = clSetKernelArg(kernel, argIndex, config.sharedMemorySize, nullptr);
        GMX_ASSERT(CL_SUCCESS == clError, ocl_get_error_string(clError).c_str());
    }
}

/*! \brief
 * Compile-time recursive function for setting up a single OpenCL kernel argument.
 * This function uses one kernel argument pointer \p argPtr to call clSetKernelArg(),
 * and calls itself on the next argument, eventually calling the tail function above.
 *
 * \tparam    CurrentArg    Type of the current argument
 * \tparam    RemainingArgs Types of remaining arguments after the current one
 * \param[in] kernel        Kernel function handle
 * \param[in] config        Kernel configuration for launching
 * \param[in] argIndex      Index of the current argument
 * \param[in] argPtr        Pointer to the current argument
 * \param[in] otherArgsPtrs Pack of pointers to arguments remaining to process after the current one
 */
template <typename CurrentArg, typename ... RemainingArgs>
void prepareGpuKernelArgument(cl_kernel                 kernel,
                              const KernelLaunchConfig &config,
                              size_t                    argIndex,
                              const CurrentArg         *argPtr,
                              const RemainingArgs *...  otherArgsPtrs)
{
    cl_int gmx_used_in_debug clError = clSetKernelArg(kernel, argIndex, sizeof(CurrentArg), argPtr);
    GMX_ASSERT(CL_SUCCESS == clError, ocl_get_error_string(clError).c_str());

    prepareGpuKernelArgument(kernel, config, argIndex + 1, otherArgsPtrs ...);
}

/*! \brief
 * A wrapper function for setting up all the OpenCL kernel arguments.
 * Calls the recursive functions above.
 *
 * \tparam    Args     Types of all the kernel arguments
 * \param[in] kernel   Kernel function handle
 * \param[in] config   Kernel configuration for launching
 * \param[in] argsPtrs Pointers to all the kernel arguments
 * \returns A handle for the prepared parameter pack to be used with launchGpuKernel() as the last argument
 *          - currently always nullptr for OpenCL, as it manages kernel/arguments association by itself.
 */
template <typename ... Args>
void *prepareGpuKernelArguments(cl_kernel                 kernel,
                                const KernelLaunchConfig &config,
                                const Args *...           argsPtrs)
{
    prepareGpuKernelArgument(kernel, config, 0, argsPtrs ...);
    return nullptr;
}
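
/* Illustrative usage sketch (not part of the original header): preparing the
 * arguments of a kernel taking two device buffers and an int. The kernel
 * handle, buffers and configuration values are hypothetical; the returned
 * handle is always nullptr for OpenCL and is only needed to match the
 * launchGpuKernel() signature.
 *
 *   KernelLaunchConfig config;
 *   // ... fill in config.gridSize, config.blockSize, config.stream, and
 *   // config.sharedMemorySize (if the kernel uses dynamic local memory) ...
 *   const int  numAtoms   = 1000;
 *   const auto kernelArgs = prepareGpuKernelArguments(kernel, config,
 *                                                     &d_coords, &d_forces, &numAtoms);
 */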

/*! \brief Launches the OpenCL kernel and handles the errors.
 *
 * \param[in] kernel      Kernel function handle
 * \param[in] config      Kernel configuration for launching
 * \param[in] timingEvent Timing event, fetched from GpuRegionTimer
 * \param[in] kernelName  Human readable kernel description, for error handling only
 * \throws gmx::InternalError on kernel launch failure
 */
inline void launchGpuKernel(cl_kernel                 kernel,
                            const KernelLaunchConfig &config,
                            CommandEvent             *timingEvent,
                            const char               *kernelName,
                            const void               * /*kernelArgs*/)
{
    const int       workDimensions   = 3;
    const size_t   *globalWorkOffset = nullptr;
    const size_t    waitListSize     = 0;
    const cl_event *waitList         = nullptr;
    size_t          globalWorkSize[3];
    for (int i = 0; i < workDimensions; i++)
    {
        globalWorkSize[i] = config.gridSize[i] * config.blockSize[i];
    }
    cl_int clError = clEnqueueNDRangeKernel(config.stream, kernel, workDimensions, globalWorkOffset,
                                            globalWorkSize, config.blockSize, waitListSize, waitList, timingEvent);
    if (CL_SUCCESS != clError)
    {
        const std::string errorMessage = "GPU kernel (" + std::string(kernelName) +
            ") failed to launch: " + ocl_get_error_string(clError);
        GMX_THROW(gmx::InternalError(errorMessage));
    }
}
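
/* Illustrative usage sketch (not part of the original header): launching the
 * kernel prepared in the sketch above. Passing nullptr as the timing event
 * skips profiling; the kernel name is only used in error messages.
 *
 *   launchGpuKernel(kernel, config, nullptr, "example kernel", kernelArgs);
 *   gpuStreamSynchronize(config.stream);
 */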

#endif