src/gmxlib/cuda_tools/cudautils.cu

   1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
   2  *
   3  *
   4  *                This source code is part of
   5  *
   6  *                 G   R   O   M   A   C   S
   7  *
   8  *          GROningen MAchine for Chemical Simulations
   9  *
  10  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
  11  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  12  * Copyright (c) 2001-2012, The GROMACS development team,
  13  * check out http://www.gromacs.org for more information.
  14  *
  15  * This program is free software; you can redistribute it and/or
  16  * modify it under the terms of the GNU General Public License
  17  * as published by the Free Software Foundation; either version 2
  18  * of the License, or (at your option) any later version.
  19  *
  20  * If you want to redistribute modifications, please consider that
  21  * scientific software is very special. Version control is crucial -
  22  * bugs must be traceable. We will be happy to consider code for
  23  * inclusion in the official distribution, but derived work must not
  24  * be called official GROMACS. Details are found in the README & COPYING
  25  * files - if they are missing, get the official version at www.gromacs.org.
  26  *
  27  * To help us fund GROMACS development, we humbly ask that you cite
  28  * the papers on the package - you can find them in the top README file.
  29  *
  30  * For more info, check our website at http://www.gromacs.org
  31  *
  32  * And Hey:
  33  * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
  34  */
  35
  36 #include <stdlib.h>
  37
  38 #include "gmx_fatal.h"
  39 #include "smalloc.h"
  40 #include "typedefs.h"
  41 #include "cudautils.cuh"
  42
  43 /*** Generic CUDA data operation wrappers ***/
  44
  45 /*! Launches synchronous or asynchronous host to device memory copy.
  46  *
  47  *  The copy is launched in stream s or if not specified, in stream 0.
  48  */
  49 static int cu_copy_D2H_generic(void * h_dest, void * d_src, size_t bytes,
  50                                bool bAsync = false, cudaStream_t s = 0)
  51 {
  52     cudaError_t stat;
  53
  54     if (h_dest == NULL || d_src == NULL || bytes == 0)
  55         return -1;
  56
  57     if (bAsync)
  58     {
  59         stat = cudaMemcpyAsync(h_dest, d_src, bytes, cudaMemcpyDeviceToHost, s);
  60         CU_RET_ERR(stat, "DtoH cudaMemcpyAsync failed");
  61
  62     }
  63     else
  64     {
  65         stat = cudaMemcpy(h_dest, d_src, bytes, cudaMemcpyDeviceToHost);
  66         CU_RET_ERR(stat, "DtoH cudaMemcpy failed");
  67     }
  68
  69     return 0;
  70 }
  71
  72 int cu_copy_D2H(void * h_dest, void * d_src, size_t bytes)
  73 {
  74     return cu_copy_D2H_generic(h_dest, d_src, bytes, false);
  75 }
  76
  77 /*!
  78  *  The copy is launched in stream s or if not specified, in stream 0.
  79  */
  80 int cu_copy_D2H_async(void * h_dest, void * d_src, size_t bytes, cudaStream_t s = 0)
  81 {
  82     return cu_copy_D2H_generic(h_dest, d_src, bytes, true, s);
  83 }
  84
  85 int cu_copy_D2H_alloc(void ** h_dest, void * d_src, size_t bytes)
  86 {
  87     if (h_dest == NULL || d_src == NULL || bytes == 0)
  88         return -1;
  89
  90     smalloc(*h_dest, bytes);
  91
  92     return cu_copy_D2H(*h_dest, d_src, bytes);
  93 }
  94
  95 /*! Launches synchronous or asynchronous device to host memory copy.
  96  *
  97  *  The copy is launched in stream s or if not specified, in stream 0.
  98  */
  99 static int cu_copy_H2D_generic(void * d_dest, void * h_src, size_t bytes,
 100                                bool bAsync = false, cudaStream_t s = 0)
 101 {
 102     cudaError_t stat;
 103
 104     if (d_dest == NULL || h_src == NULL || bytes == 0)
 105         return -1;
 106
 107     if (bAsync)
 108     {
 109         stat = cudaMemcpyAsync(d_dest, h_src, bytes, cudaMemcpyHostToDevice, s);
 110         CU_RET_ERR(stat, "HtoD cudaMemcpyAsync failed");
 111     }
 112     else
 113     {
 114         stat = cudaMemcpy(d_dest, h_src, bytes, cudaMemcpyHostToDevice);
 115         CU_RET_ERR(stat, "HtoD cudaMemcpy failed");
 116     }
 117
 118     return 0;
 119 }
 120
 121 int cu_copy_H2D(void * d_dest, void * h_src, size_t bytes)
 122 {
 123     return cu_copy_H2D_generic(d_dest, h_src, bytes, false);
 124 }
 125
 126 /*!
 127  *  The copy is launched in stream s or if not specified, in stream 0.
 128  */
 129 int cu_copy_H2D_async(void * d_dest, void * h_src, size_t bytes, cudaStream_t s = 0)
 130 {
 131     return cu_copy_H2D_generic(d_dest, h_src, bytes, true, s);
 132 }
 133
 134 int cu_copy_H2D_alloc(void ** d_dest, void * h_src, size_t bytes)
 135 {
 136     cudaError_t stat;
 137
 138     if (d_dest == NULL || h_src == NULL || bytes == 0)
 139         return -1;
 140
 141     stat = cudaMalloc(d_dest, bytes);
 142     CU_RET_ERR(stat, "cudaMalloc failed in cu_copy_H2D_alloc");
 143
 144     return cu_copy_H2D(*d_dest, h_src, bytes);
 145 }
 146
 147 float cu_event_elapsed(cudaEvent_t start, cudaEvent_t end)
 148 {
 149     float t = 0.0;
 150     cudaError_t stat;
 151
 152     stat = cudaEventElapsedTime(&t, start, end);
 153     CU_RET_ERR(stat, "cudaEventElapsedTime failed in cu_event_elapsed");
 154
 155     return t;
 156 }
 157
 158 int cu_wait_event(cudaEvent_t e)
 159 {
 160     cudaError_t s;
 161
 162     s = cudaEventSynchronize(e);
 163     CU_RET_ERR(s, "cudaEventSynchronize failed in cu_wait_event");
 164
 165     return 0;
 166 }
 167
 168 /*!
 169  *  If time != NULL it also calculates the time elapsed between start and end and
 170  *  return this is milliseconds.
 171  */
 172 int cu_wait_event_time(cudaEvent_t end, cudaEvent_t start, float *time)
 173 {
 174     cudaError_t s;
 175
 176     s = cudaEventSynchronize(end);
 177     CU_RET_ERR(s, "cudaEventSynchronize failed in cu_wait_event");
 178
 179     if (time)
 180     {
 181         *time = cu_event_elapsed(start, end);
 182     }
 183
 184     return 0;
 185 }
 186
 187 /**** Operation on buffered arrays (arrays with "over-allocation" in gmx wording) *****/
 188
 189 /*!
 190  * If the pointers to the size variables are NULL no resetting happens.
 191  */
 192 void cu_free_buffered(void *d_ptr, int *n, int *nalloc)
 193 {
 194     cudaError_t stat;
 195
 196     if (d_ptr)
 197     {
 198         stat = cudaFree(d_ptr);
 199         CU_RET_ERR(stat, "cudaFree failed");
 200     }
 201
 202     if (n)
 203     {
 204         *n = -1;
 205     }
 206
 207     if (nalloc)
 208     {
 209         *nalloc = -1;
 210     }
 211 }
 212
 213 /*!
 214  *  Reallocation of the memory pointed by d_ptr and copying of the data from
 215  *  the location pointed by h_src host-side pointer is done. Allocation is
 216  *  buffered and therefore freeing is only needed if the previously allocated
 217  *  space is not enough.
 218  *  The H2D copy is launched in stream s and can be done synchronously or
 219  *  asynchronously (the default is the latter).
 220  */
 221 void cu_realloc_buffered(void **d_dest, void *h_src,
 222                          size_t type_size,
 223                          int *curr_size, int *curr_alloc_size,
 224                          int req_size,
 225                          cudaStream_t s,
 226                          bool bAsync = true)
 227 {
 228     cudaError_t stat;
 229
 230     if (d_dest == NULL || req_size < 0)
 231     {
 232         return;
 233     }
 234
 235     /* reallocate only if the data does not fit = allocation size is smaller
 236        than the current requested size */
 237     if (req_size > *curr_alloc_size)
 238     {
 239         /* only free if the array has already been initialized */
 240         if (*curr_alloc_size >= 0)
 241         {
 242             cu_free_buffered(*d_dest, curr_size, curr_alloc_size);
 243         }
 244
 245         *curr_alloc_size = over_alloc_large(req_size);
 246
 247         stat = cudaMalloc(d_dest, *curr_alloc_size * type_size);
 248         CU_RET_ERR(stat, "cudaMalloc failed in cu_free_buffered");
 249     }
 250
 251     /* size could have changed without actual reallocation */
 252     *curr_size = req_size;
 253
 254     /* upload to device */
 255     if (h_src)
 256     {
 257         if (bAsync)
 258         {
 259             cu_copy_H2D_async(*d_dest, h_src, *curr_size * type_size, s);
 260         }
 261         else
 262         {
 263             cu_copy_H2D(*d_dest, h_src,  *curr_size * type_size);
 264         }
 265     }
 266 }