1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
4 * This source code is part of
8 * GROningen MAchine for Chemical Simulations
10 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
11 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
12 * Copyright (c) 2001-2012, The GROMACS development team,
13 * check out http://www.gromacs.org for more information.
15 * This program is free software; you can redistribute it and/or
16 * modify it under the terms of the GNU General Public License
17 * as published by the Free Software Foundation; either version 2
18 * of the License, or (at your option) any later version.
20 * If you want to redistribute modifications, please consider that
21 * scientific software is very special. Version control is crucial -
22 * bugs must be traceable. We will be happy to consider code for
23 * inclusion in the official distribution, but derived work must not
24 * be called official GROMACS. Details are found in the README & COPYING
25 * files - if they are missing, get the official version at www.gromacs.org.
27 * To help us fund GROMACS development, we humbly ask that you cite
28 * the papers on the package - you can find them in the top README file.
30 * For more info, check our website at http://www.gromacs.org
33 * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
38 #include "gmx_fatal.h"
41 #include "cudautils.cuh"
43 /*** Generic CUDA data operation wrappers ***/
45 /*! Launches synchronous or asynchronous host to device memory copy.
47 * The copy is launched in stream s or if not specified, in stream 0.
49 static int cu_copy_D2H_generic(void * h_dest, void * d_src, size_t bytes,
50 bool bAsync = false, cudaStream_t s = 0)
54 if (h_dest == NULL || d_src == NULL || bytes == 0)
59 stat = cudaMemcpyAsync(h_dest, d_src, bytes, cudaMemcpyDeviceToHost, s);
60 CU_RET_ERR(stat, "DtoH cudaMemcpyAsync failed");
65 stat = cudaMemcpy(h_dest, d_src, bytes, cudaMemcpyDeviceToHost);
66 CU_RET_ERR(stat, "DtoH cudaMemcpy failed");
72 int cu_copy_D2H(void * h_dest, void * d_src, size_t bytes)
74 return cu_copy_D2H_generic(h_dest, d_src, bytes, false);
78 * The copy is launched in stream s or if not specified, in stream 0.
80 int cu_copy_D2H_async(void * h_dest, void * d_src, size_t bytes, cudaStream_t s = 0)
82 return cu_copy_D2H_generic(h_dest, d_src, bytes, true, s);
85 int cu_copy_D2H_alloc(void ** h_dest, void * d_src, size_t bytes)
87 if (h_dest == NULL || d_src == NULL || bytes == 0)
90 smalloc(*h_dest, bytes);
92 return cu_copy_D2H(*h_dest, d_src, bytes);
95 /*! Launches synchronous or asynchronous device to host memory copy.
97 * The copy is launched in stream s or if not specified, in stream 0.
99 static int cu_copy_H2D_generic(void * d_dest, void * h_src, size_t bytes,
100 bool bAsync = false, cudaStream_t s = 0)
104 if (d_dest == NULL || h_src == NULL || bytes == 0)
109 stat = cudaMemcpyAsync(d_dest, h_src, bytes, cudaMemcpyHostToDevice, s);
110 CU_RET_ERR(stat, "HtoD cudaMemcpyAsync failed");
114 stat = cudaMemcpy(d_dest, h_src, bytes, cudaMemcpyHostToDevice);
115 CU_RET_ERR(stat, "HtoD cudaMemcpy failed");
121 int cu_copy_H2D(void * d_dest, void * h_src, size_t bytes)
123 return cu_copy_H2D_generic(d_dest, h_src, bytes, false);
127 * The copy is launched in stream s or if not specified, in stream 0.
129 int cu_copy_H2D_async(void * d_dest, void * h_src, size_t bytes, cudaStream_t s = 0)
131 return cu_copy_H2D_generic(d_dest, h_src, bytes, true, s);
134 int cu_copy_H2D_alloc(void ** d_dest, void * h_src, size_t bytes)
138 if (d_dest == NULL || h_src == NULL || bytes == 0)
141 stat = cudaMalloc(d_dest, bytes);
142 CU_RET_ERR(stat, "cudaMalloc failed in cu_copy_H2D_alloc");
144 return cu_copy_H2D(*d_dest, h_src, bytes);
147 float cu_event_elapsed(cudaEvent_t start, cudaEvent_t end)
152 stat = cudaEventElapsedTime(&t, start, end);
153 CU_RET_ERR(stat, "cudaEventElapsedTime failed in cu_event_elapsed");
158 int cu_wait_event(cudaEvent_t e)
162 s = cudaEventSynchronize(e);
163 CU_RET_ERR(s, "cudaEventSynchronize failed in cu_wait_event");
169 * If time != NULL it also calculates the time elapsed between start and end and
170 * return this is milliseconds.
172 int cu_wait_event_time(cudaEvent_t end, cudaEvent_t start, float *time)
176 s = cudaEventSynchronize(end);
177 CU_RET_ERR(s, "cudaEventSynchronize failed in cu_wait_event");
181 *time = cu_event_elapsed(start, end);
187 /**** Operation on buffered arrays (arrays with "over-allocation" in gmx wording) *****/
190 * If the pointers to the size variables are NULL no resetting happens.
192 void cu_free_buffered(void *d_ptr, int *n, int *nalloc)
198 stat = cudaFree(d_ptr);
199 CU_RET_ERR(stat, "cudaFree failed");
214 * Reallocation of the memory pointed by d_ptr and copying of the data from
215 * the location pointed by h_src host-side pointer is done. Allocation is
216 * buffered and therefore freeing is only needed if the previously allocated
217 * space is not enough.
218 * The H2D copy is launched in stream s and can be done synchronously or
219 * asynchronously (the default is the latter).
221 void cu_realloc_buffered(void **d_dest, void *h_src,
223 int *curr_size, int *curr_alloc_size,
230 if (d_dest == NULL || req_size < 0)
235 /* reallocate only if the data does not fit = allocation size is smaller
236 than the current requested size */
237 if (req_size > *curr_alloc_size)
239 /* only free if the array has already been initialized */
240 if (*curr_alloc_size >= 0)
242 cu_free_buffered(*d_dest, curr_size, curr_alloc_size);
245 *curr_alloc_size = over_alloc_large(req_size);
247 stat = cudaMalloc(d_dest, *curr_alloc_size * type_size);
248 CU_RET_ERR(stat, "cudaMalloc failed in cu_free_buffered");
251 /* size could have changed without actual reallocation */
252 *curr_size = req_size;
254 /* upload to device */
259 cu_copy_H2D_async(*d_dest, h_src, *curr_size * type_size, s);
263 cu_copy_H2D(*d_dest, h_src, *curr_size * type_size);