Move gpu_utils etc. out of gmxlib
[gromacs.git] / src / gromacs / gpu_utils / oclutils.cpp
blobbe95e9c6a9f26ea766feb484f21ed76a97bde047
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
35 /*! \internal \file
36 * \brief Define utility routines for OpenCL
38 * \author Anca Hamuraru <anca@streamcomputing.eu>
40 #include "gmxpre.h"
42 #include "oclutils.h"
44 #include <stdlib.h>
46 #include <cassert>
47 #include <cstdio>
49 #include "gromacs/utility/fatalerror.h"
50 #include "gromacs/utility/smalloc.h"
52 /*! \brief Launches synchronous or asynchronous host to device memory copy.
54 * If copy_event is not NULL, on return it will contain an event object
55 * identifying this particular host to device operation. The event can further
56 * be used to queue a wait for this operation or to query profiling information.
58 static int ocl_copy_H2D_generic(cl_mem d_dest, void* h_src,
59 size_t offset, size_t bytes,
60 bool bAsync /* = false*/,
61 cl_command_queue command_queue,
62 cl_event *copy_event)
64 cl_int gmx_unused cl_error;
66 if (d_dest == NULL || h_src == NULL || bytes == 0)
68 return -1;
71 if (bAsync)
73 cl_error = clEnqueueWriteBuffer(command_queue, d_dest, CL_FALSE, offset, bytes, h_src, 0, NULL, copy_event);
74 assert(cl_error == CL_SUCCESS);
75 // TODO: handle errors
77 else
79 cl_error = clEnqueueWriteBuffer(command_queue, d_dest, CL_TRUE, offset, bytes, h_src, 0, NULL, copy_event);
80 assert(cl_error == CL_SUCCESS);
81 // TODO: handle errors
84 return 0;
87 /*! \brief Launches asynchronous host to device memory copy.
89 * If copy_event is not NULL, on return it will contain an event object
90 * identifying this particular host to device operation. The event can further
91 * be used to queue a wait for this operation or to query profiling information.
93 int ocl_copy_H2D_async(cl_mem d_dest, void * h_src,
94 size_t offset, size_t bytes,
95 cl_command_queue command_queue,
96 cl_event *copy_event)
98 return ocl_copy_H2D_generic(d_dest, h_src, offset, bytes, true, command_queue, copy_event);
101 /*! \brief Launches synchronous host to device memory copy.
103 int ocl_copy_H2D(cl_mem d_dest, void * h_src,
104 size_t offset, size_t bytes,
105 cl_command_queue command_queue)
107 return ocl_copy_H2D_generic(d_dest, h_src, offset, bytes, false, command_queue, NULL);
110 /*! \brief Launches synchronous or asynchronous device to host memory copy.
112 * If copy_event is not NULL, on return it will contain an event object
113 * identifying this particular device to host operation. The event can further
114 * be used to queue a wait for this operation or to query profiling information.
116 int ocl_copy_D2H_generic(void * h_dest, cl_mem d_src,
117 size_t offset, size_t bytes,
118 bool bAsync,
119 cl_command_queue command_queue,
120 cl_event *copy_event)
122 cl_int gmx_unused cl_error;
124 if (h_dest == NULL || d_src == NULL || bytes == 0)
126 return -1;
129 if (bAsync)
131 cl_error = clEnqueueReadBuffer(command_queue, d_src, CL_FALSE, offset, bytes, h_dest, 0, NULL, copy_event);
132 assert(cl_error == CL_SUCCESS);
133 // TODO: handle errors
135 else
137 cl_error = clEnqueueReadBuffer(command_queue, d_src, CL_TRUE, offset, bytes, h_dest, 0, NULL, copy_event);
138 assert(cl_error == CL_SUCCESS);
139 // TODO: handle errors
142 return 0;
145 /*! \brief Launches asynchronous device to host memory copy.
147 * If copy_event is not NULL, on return it will contain an event object
148 * identifying this particular host to device operation. The event can further
149 * be used to queue a wait for this operation or to query profiling information.
151 int ocl_copy_D2H_async(void * h_dest, cl_mem d_src,
152 size_t offset, size_t bytes,
153 cl_command_queue command_queue,
154 cl_event *copy_event)
156 return ocl_copy_D2H_generic(h_dest, d_src, offset, bytes, true, command_queue, copy_event);
159 /*! \brief \brief Allocates nbytes of host memory. Use ocl_free to free memory allocated with this function.
161 * \todo
162 * This function should allocate page-locked memory to help reduce D2H and H2D
163 * transfer times, similar with pmalloc from pmalloc_cuda.cu.
165 * \param[in,out] h_ptr Pointer where to store the address of the newly allocated buffer.
166 * \param[in] nbytes Size in bytes of the buffer to be allocated.
168 void ocl_pmalloc(void **h_ptr, size_t nbytes)
170 /* Need a temporary type whose size is 1 byte, so that the
171 * implementation of snew_aligned can cope without issuing
172 * warnings. */
173 char **temporary = reinterpret_cast<char **>(h_ptr);
175 /* 16-byte alignment is required by the neighbour-searching code,
176 * because it uses four-wide SIMD for bounding-box calculation.
177 * However, when we organize using page-locked memory for
178 * device-host transfers, it will probably need to be aligned to a
179 * 4kb page, like CUDA does. */
180 snew_aligned(*temporary, nbytes, 16);
183 /*! \brief Frees memory allocated with ocl_pmalloc.
185 * \param[in] h_ptr Buffer allocated with ocl_pmalloc that needs to be freed.
187 void ocl_pfree(void *h_ptr)
190 if (h_ptr)
192 sfree_aligned(h_ptr);
194 return;