/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
 * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *  \brief Define OpenCL implementation of nbnxm_gpu.h
 *
 *  \author Anca Hamuraru <anca@streamcomputing.eu>
 *  \author Teemu Virolainen <teemu@streamcomputing.eu>
 *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
 *  \author Szilárd Páll <pall.szilard@gmail.com>
 *  \ingroup module_nbnxm
 *
 *  TODO (psz):
 *  - Add a static const cl_uint c_pruneKernelWorkDim / c_nbnxnKernelWorkDim = 3;
 *  - Rework the copying of OCL data structures done before every invocation of both
 *    nb and prune kernels (using fillin_ocl_structures); also consider at the same
 *    time calling clSetKernelArg only on the updated parameters (if tracking changed
 *    parameters is feasible);
 *  - Consider using the event_wait_list argument to clEnqueueNDRangeKernel to mark
 *    the launched kernel's dependencies: e.g. the non-local nb kernel's dependency
 *    on the misc_ops_and_local_H2D_done event could be better expressed this way.
 *
 *  - Consider extracting common sections of the OpenCL and CUDA nbnxn logic, e.g.:
 *    - in nbnxn_gpu_launch_kernel_pruneonly() the pre- and post-kernel launch logic
 *      is identical in the two implementations, so a 3-way split might allow sharing
 *      code;
 */
62 #include "gmxpre.h"
64 #include <assert.h>
65 #include <stdlib.h>
67 #if defined(_MSVC)
68 # include <limits>
69 #endif
71 #include "thread_mpi/atomic.h"
73 #include "gromacs/gpu_utils/device_context.h"
74 #include "gromacs/gpu_utils/gputraits_ocl.h"
75 #include "gromacs/gpu_utils/oclutils.h"
76 #include "gromacs/hardware/device_information.h"
77 #include "gromacs/hardware/hw_info.h"
78 #include "gromacs/mdtypes/simulation_workload.h"
79 #include "gromacs/nbnxm/atomdata.h"
80 #include "gromacs/nbnxm/gpu_common.h"
81 #include "gromacs/nbnxm/gpu_common_utils.h"
82 #include "gromacs/nbnxm/gpu_data_mgmt.h"
83 #include "gromacs/nbnxm/nbnxm.h"
84 #include "gromacs/nbnxm/nbnxm_gpu.h"
85 #include "gromacs/nbnxm/pairlist.h"
86 #include "gromacs/pbcutil/ishift.h"
87 #include "gromacs/timing/gpu_timing.h"
88 #include "gromacs/utility/cstringutil.h"
89 #include "gromacs/utility/fatalerror.h"
90 #include "gromacs/utility/gmxassert.h"
92 #include "nbnxm_ocl_types.h"
namespace Nbnxm
{

/*! \brief Convenience constants */
//@{
static constexpr int c_clSize = c_nbnxnGpuClusterSize;
//@}
/*! \brief Validates the input global work size parameter.
 */
static inline void validate_global_work_size(const KernelLaunchConfig& config,
                                             int                       work_dim,
                                             const DeviceInformation*  dinfo)
{
    cl_uint device_size_t_size_bits;
    cl_uint host_size_t_size_bits;

    GMX_ASSERT(dinfo, "Need a valid device info object");

    size_t global_work_size[3];
    GMX_ASSERT(work_dim <= 3, "Not supporting hyper-grids just yet");
    for (int i = 0; i < work_dim; i++)
    {
        global_work_size[i] = config.blockSize[i] * config.gridSize[i];
    }

    /* Each component of a global_work_size must not exceed the range given by the
       sizeof(device size_t) for the device on which the kernel execution will
       be enqueued. See:
       https://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/clEnqueueNDRangeKernel.html
     */
    device_size_t_size_bits = dinfo->adress_bits;
    host_size_t_size_bits   = static_cast<cl_uint>(sizeof(size_t) * 8);

    /* If sizeof(host size_t) <= sizeof(device size_t)
       => global_work_size components will always be valid
       else
       => get device limit for global work size and
       compare it against each component of global_work_size.
     */
    if (host_size_t_size_bits > device_size_t_size_bits)
    {
        size_t device_limit;

        device_limit = (1ULL << device_size_t_size_bits) - 1;

        for (int i = 0; i < work_dim; i++)
        {
            if (global_work_size[i] > device_limit)
            {
                gmx_fatal(FARGS,
                          "Watch out, the input system is too large to simulate!\n"
                          "The number of nonbonded work units (=number of super-clusters) exceeds "
                          "the device capabilities. Global work size limit exceeded (%zu > %zu)!",
                          global_work_size[i], device_limit);
            }
        }
    }
}
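/* Worked example (illustrative numbers only, not taken from any particular
 * device): a device reporting 32 address bits can index at most
 * (1ULL << 32) - 1 = 4294967295 work items per dimension. On a 64-bit host, a
 * hypothetical grid of 7e8 super-clusters with an 8-wide work-group would give
 * a global work size of 5.6e9 in x, which exceeds that limit and triggers the
 * fatal error above; on a 32-bit host the check is skipped, since the host
 * size_t could not have represented such a value in the first place.
 */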
/* Constant arrays listing non-bonded kernel function names. The arrays are
 * organized in 2-dim arrays by: electrostatics and VDW type.
 *
 * Note that the row- and column-order of function pointers has to match the
 * order of corresponding enumerated electrostatics and vdw types, resp.,
 * defined in nbnxm_ocl_types.h.
 */
/*! \brief Force-only kernel function names. */
static const char* nb_kfunc_noener_noprune_ptr[c_numElecTypes][c_numVdwTypes] = {
    { "nbnxn_kernel_ElecCut_VdwLJ_F_opencl", "nbnxn_kernel_ElecCut_VdwLJCombGeom_F_opencl",
      "nbnxn_kernel_ElecCut_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecCut_VdwLJFsw_F_opencl",
      "nbnxn_kernel_ElecCut_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_opencl",
      "nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_opencl" },
    { "nbnxn_kernel_ElecRF_VdwLJ_F_opencl", "nbnxn_kernel_ElecRF_VdwLJCombGeom_F_opencl",
      "nbnxn_kernel_ElecRF_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecRF_VdwLJFsw_F_opencl",
      "nbnxn_kernel_ElecRF_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_opencl",
      "nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_opencl" },
    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_F_opencl",
      "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_opencl",
      "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_opencl",
      "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_opencl",
      "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_opencl" },
    { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_opencl",
      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_F_opencl",
      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_F_opencl",
      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_opencl",
      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_opencl",
      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_opencl",
      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_opencl" },
    { "nbnxn_kernel_ElecEw_VdwLJ_F_opencl", "nbnxn_kernel_ElecEw_VdwLJCombGeom_F_opencl",
      "nbnxn_kernel_ElecEw_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecEw_VdwLJFsw_F_opencl",
      "nbnxn_kernel_ElecEw_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_opencl",
      "nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_opencl" },
    { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_opencl",
      "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_F_opencl",
      "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_F_opencl",
      "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_opencl",
      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_opencl",
      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_opencl" }
};
/*! \brief Force + energy kernel function pointers. */
static const char* nb_kfunc_ener_noprune_ptr[c_numElecTypes][c_numVdwTypes] = {
    { "nbnxn_kernel_ElecCut_VdwLJ_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJCombGeom_VF_opencl",
      "nbnxn_kernel_ElecCut_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJFsw_VF_opencl",
      "nbnxn_kernel_ElecCut_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_opencl",
      "nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_opencl" },
    { "nbnxn_kernel_ElecRF_VdwLJ_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJCombGeom_VF_opencl",
      "nbnxn_kernel_ElecRF_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJFsw_VF_opencl",
      "nbnxn_kernel_ElecRF_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_opencl",
      "nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_opencl" },
    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_VF_opencl",
      "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_VF_opencl",
      "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_opencl",
      "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_opencl",
      "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_opencl" },
    { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_opencl",
      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_VF_opencl",
      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_VF_opencl",
      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_opencl",
      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_opencl",
      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_opencl",
      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_opencl" },
    { "nbnxn_kernel_ElecEw_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_opencl",
      "nbnxn_kernel_ElecEw_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJFsw_VF_opencl",
      "nbnxn_kernel_ElecEw_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_opencl",
      "nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_opencl" },
    { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_opencl",
      "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_VF_opencl",
      "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_VF_opencl",
      "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_opencl",
      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_opencl",
      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_opencl",
      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_opencl" }
};
/*! \brief Force + pruning kernel function pointers. */
static const char* nb_kfunc_noener_prune_ptr[c_numElecTypes][c_numVdwTypes] = {
    { "nbnxn_kernel_ElecCut_VdwLJ_F_prune_opencl",
      "nbnxn_kernel_ElecCut_VdwLJCombGeom_F_prune_opencl",
      "nbnxn_kernel_ElecCut_VdwLJCombLB_F_prune_opencl",
      "nbnxn_kernel_ElecCut_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJPsw_F_prune_opencl",
      "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_prune_opencl",
      "nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_prune_opencl" },
    { "nbnxn_kernel_ElecRF_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJCombGeom_F_prune_opencl",
      "nbnxn_kernel_ElecRF_VdwLJCombLB_F_prune_opencl",
      "nbnxn_kernel_ElecRF_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJPsw_F_prune_opencl",
      "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_prune_opencl",
      "nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_prune_opencl" },
    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_opencl",
      "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_F_prune_opencl",
      "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_F_prune_opencl",
      "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_prune_opencl",
      "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_prune_opencl",
      "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_prune_opencl",
      "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_prune_opencl" },
    { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_prune_opencl",
      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_F_prune_opencl",
      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_F_prune_opencl",
      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_prune_opencl",
      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_prune_opencl",
      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_prune_opencl",
      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_prune_opencl" },
    { "nbnxn_kernel_ElecEw_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJCombGeom_F_prune_opencl",
      "nbnxn_kernel_ElecEw_VdwLJCombLB_F_prune_opencl",
      "nbnxn_kernel_ElecEw_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJPsw_F_prune_opencl",
      "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_prune_opencl",
      "nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_prune_opencl" },
    { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_opencl",
      "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_F_prune_opencl",
      "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_F_prune_opencl",
      "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_prune_opencl",
      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_prune_opencl",
      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_prune_opencl",
      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_prune_opencl" }
};
/*! \brief Force + energy + pruning kernel function pointers. */
static const char* nb_kfunc_ener_prune_ptr[c_numElecTypes][c_numVdwTypes] = {
    { "nbnxn_kernel_ElecCut_VdwLJ_VF_prune_opencl",
      "nbnxn_kernel_ElecCut_VdwLJCombGeom_VF_prune_opencl",
      "nbnxn_kernel_ElecCut_VdwLJCombLB_VF_prune_opencl",
      "nbnxn_kernel_ElecCut_VdwLJFsw_VF_prune_opencl",
      "nbnxn_kernel_ElecCut_VdwLJPsw_VF_prune_opencl",
      "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_prune_opencl",
      "nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_prune_opencl" },
    { "nbnxn_kernel_ElecRF_VdwLJ_VF_prune_opencl",
      "nbnxn_kernel_ElecRF_VdwLJCombGeom_VF_prune_opencl",
      "nbnxn_kernel_ElecRF_VdwLJCombLB_VF_prune_opencl",
      "nbnxn_kernel_ElecRF_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJPsw_VF_prune_opencl",
      "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_prune_opencl",
      "nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_prune_opencl" },
    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_opencl",
      "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_VF_prune_opencl",
      "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_VF_prune_opencl",
      "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_prune_opencl",
      "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_prune_opencl",
      "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_prune_opencl",
      "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_prune_opencl" },
    { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_prune_opencl",
      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_VF_prune_opencl",
      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_VF_prune_opencl",
      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_prune_opencl",
      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_prune_opencl",
      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_prune_opencl",
      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_prune_opencl" },
    { "nbnxn_kernel_ElecEw_VdwLJ_VF_prune_opencl",
      "nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_prune_opencl",
      "nbnxn_kernel_ElecEw_VdwLJCombLB_VF_prune_opencl",
      "nbnxn_kernel_ElecEw_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJPsw_VF_prune_opencl",
      "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_prune_opencl",
      "nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_prune_opencl" },
    { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_prune_opencl",
      "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_VF_prune_opencl",
      "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_VF_prune_opencl",
      "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_prune_opencl",
      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_prune_opencl",
      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_prune_opencl",
      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_prune_opencl" }
};
/*! \brief Return a pointer to the prune kernel version to be executed at the current invocation.
 *
 * \param[in] kernel_pruneonly  array of prune kernel objects
 * \param[in] firstPrunePass    true if the first pruning pass is being executed
 */
static inline cl_kernel selectPruneKernel(cl_kernel kernel_pruneonly[], bool firstPrunePass)
{
    cl_kernel* kernelPtr;

    if (firstPrunePass)
    {
        kernelPtr = &(kernel_pruneonly[epruneFirst]);
    }
    else
    {
        kernelPtr = &(kernel_pruneonly[epruneRolling]);
    }
    // TODO: consider creating the prune kernel object here to avoid a
    // clCreateKernel for the rolling prune kernel if this is not needed.
    return *kernelPtr;
}
/*! \brief Return a pointer to the kernel version to be executed at the current step.
 *  OpenCL kernel objects are cached in nb. If the requested kernel is not
 *  found in the cache, it will be created and the cache will be updated.
 */
static inline cl_kernel
select_nbnxn_kernel(NbnxmGpu* nb, enum ElecType elecType, enum VdwType vdwType, bool bDoEne, bool bDoPrune)
{
    const char* kernel_name_to_run;
    cl_kernel*  kernel_ptr;
    cl_int      cl_error;

    const int elecTypeIdx = static_cast<int>(elecType);
    const int vdwTypeIdx  = static_cast<int>(vdwType);

    GMX_ASSERT(elecTypeIdx < c_numElecTypes,
               "The electrostatics type requested is not implemented in the OpenCL kernels.");
    GMX_ASSERT(vdwTypeIdx < c_numVdwTypes,
               "The VdW type requested is not implemented in the OpenCL kernels.");

    if (bDoEne)
    {
        if (bDoPrune)
        {
            kernel_name_to_run = nb_kfunc_ener_prune_ptr[elecTypeIdx][vdwTypeIdx];
            kernel_ptr         = &(nb->kernel_ener_prune_ptr[elecTypeIdx][vdwTypeIdx]);
        }
        else
        {
            kernel_name_to_run = nb_kfunc_ener_noprune_ptr[elecTypeIdx][vdwTypeIdx];
            kernel_ptr         = &(nb->kernel_ener_noprune_ptr[elecTypeIdx][vdwTypeIdx]);
        }
    }
    else
    {
        if (bDoPrune)
        {
            kernel_name_to_run = nb_kfunc_noener_prune_ptr[elecTypeIdx][vdwTypeIdx];
            kernel_ptr         = &(nb->kernel_noener_prune_ptr[elecTypeIdx][vdwTypeIdx]);
        }
        else
        {
            kernel_name_to_run = nb_kfunc_noener_noprune_ptr[elecTypeIdx][vdwTypeIdx];
            kernel_ptr         = &(nb->kernel_noener_noprune_ptr[elecTypeIdx][vdwTypeIdx]);
        }
    }

    if (nullptr == kernel_ptr[0])
    {
        *kernel_ptr = clCreateKernel(nb->dev_rundata->program, kernel_name_to_run, &cl_error);
        GMX_ASSERT(cl_error == CL_SUCCESS, ("clCreateKernel failed: " + ocl_get_error_string(cl_error)
                                            + " for kernel named " + kernel_name_to_run)
                                                   .c_str());
    }

    return *kernel_ptr;
}
/*! \brief Calculates the amount of shared memory required by the nonbonded kernel in use.
 */
static inline int calc_shmem_required_nonbonded(enum VdwType vdwType, bool bPrefetchLjParam)
{
    int shmem;

    /* size of shmem (force-buffers/xq/atom type preloading) */
    /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
    /* i-atom x+q in shared memory */
    shmem = c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(float) * 4; /* xqib */
    /* cj in shared memory, for both warps separately
     * TODO: in the "nowarp" kernels we load cj only once, so the factor 2 is not needed.
     */
    shmem += 2 * c_nbnxnGpuJgroupSize * sizeof(int); /* cjs */
    if (bPrefetchLjParam)
    {
        if (useLjCombRule(vdwType))
        {
            /* i-atom LJ combination parameters in shared memory */
            shmem += c_nbnxnGpuNumClusterPerSupercluster * c_clSize * 2
                     * sizeof(float); /* atib abused for ljcp, float2 */
        }
        else
        {
            /* i-atom types in shared memory */
            shmem += c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(int); /* atib */
        }
    }
    /* force reduction buffers in shared memory */
    shmem += c_clSize * c_clSize * 3 * sizeof(float); /* f_buf */
    /* Warp vote; in fact it must be sized by the number of warps in the block. */
    shmem += sizeof(cl_uint) * 2; /* warp_any */
    return shmem;
}
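/* Worked example of the sum above; a sketch only, since the cluster-geometry
 * constants are defined elsewhere (here we assume the typical values
 * c_clSize == 8, c_nbnxnGpuNumClusterPerSupercluster == 8 and
 * c_nbnxnGpuJgroupSize == 4):
 * \code
 *   int shmem = 8 * 8 * sizeof(float) * 4; // xqib:    1024 bytes
 *   shmem += 2 * 4 * sizeof(int);          // cjs:       32 bytes
 *   shmem += 8 * 8 * 3 * sizeof(float);    // f_buf:    768 bytes
 *   shmem += sizeof(cl_uint) * 2;          // warp_any:   8 bytes
 *   // total without LJ prefetching: 1832 bytes; prefetching adds another
 *   // 512 bytes (ljcp as float2) or 256 bytes (atom types as int).
 * \endcode
 */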
/*! \brief Initializes data structures that are going to be sent to the OpenCL device.
 *
 *  The device can't use the same data structures as the host for two main reasons:
 *  - OpenCL restrictions (pointers are not accepted inside data structures)
 *  - some host side fields are not needed for the OpenCL kernels.
 *
 *  This function is called before the launch of both nbnxn and prune kernels.
 */
static void fillin_ocl_structures(NBParamGpu* nbp, cl_nbparam_params_t* nbparams_params)
{
    nbparams_params->coulomb_tab_scale = nbp->coulomb_tab_scale;
    nbparams_params->c_rf              = nbp->c_rf;
    nbparams_params->dispersion_shift  = nbp->dispersion_shift;
    nbparams_params->elecType          = nbp->elecType;
    nbparams_params->epsfac            = nbp->epsfac;
    nbparams_params->ewaldcoeff_lj     = nbp->ewaldcoeff_lj;
    nbparams_params->ewald_beta        = nbp->ewald_beta;
    nbparams_params->rcoulomb_sq       = nbp->rcoulomb_sq;
    nbparams_params->repulsion_shift   = nbp->repulsion_shift;
    nbparams_params->rlistOuter_sq     = nbp->rlistOuter_sq;
    nbparams_params->rvdw_sq           = nbp->rvdw_sq;
    nbparams_params->rlistInner_sq     = nbp->rlistInner_sq;
    nbparams_params->rvdw_switch       = nbp->rvdw_switch;
    nbparams_params->sh_ewald          = nbp->sh_ewald;
    nbparams_params->sh_lj_ewald       = nbp->sh_lj_ewald;
    nbparams_params->two_k_rf          = nbp->two_k_rf;
    nbparams_params->vdwType           = nbp->vdwType;
    nbparams_params->vdw_switch        = nbp->vdw_switch;
}
/*! \brief Enqueues a wait for event completion.
 *
 *  Then it releases the event and sets it to 0.
 *  Don't use this function when more than one wait will be issued for the event.
 *  Equivalent to a CUDA stream sync. */
static void sync_ocl_event(cl_command_queue stream, cl_event* ocl_event)
{
    cl_int gmx_unused cl_error;

    /* Enqueue wait */
    cl_error = clEnqueueBarrierWithWaitList(stream, 1, ocl_event, nullptr);
    GMX_RELEASE_ASSERT(CL_SUCCESS == cl_error, ocl_get_error_string(cl_error).c_str());

    /* Release event and reset it to 0. It is OK to release it, as the barrier
       enqueued above performs an implicit retain on the event. */
    cl_error = clReleaseEvent(*ocl_event);
    GMX_ASSERT(cl_error == CL_SUCCESS,
               ("clReleaseEvent failed: " + ocl_get_error_string(cl_error)).c_str());
    *ocl_event = nullptr;
}
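/* Usage note (illustrative sketch): sync_ocl_event() consumes its event, so if
 * two waits on the same event were ever needed, an explicit retain and a second
 * handle would be required first. streamA/streamB are placeholder queues:
 * \code
 *   cl_event evCopy = ev;              // second handle to the same event object
 *   clRetainEvent(ev);                 // refcount 2: one reference per pending wait
 *   sync_ocl_event(streamA, &ev);      // waits, releases one ref, nulls ev
 *   sync_ocl_event(streamB, &evCopy);  // waits, releases the last reference
 * \endcode
 */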
/*! \brief Launch asynchronously the xq buffer host to device copy. */
void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);

    /* local/nonlocal offset and length used for xq and f */
    int adat_begin, adat_len;

    cl_atomdata_t*      adat         = nb->atdat;
    gpu_plist*          plist        = nb->plist[iloc];
    cl_timers_t*        t            = nb->timers;
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];

    bool bDoTime = nb->bDoTime;

    /* Don't launch the non-local H2D copy if there is no dependent
       work to do: neither non-local nor other (e.g. bonded) work
       to do that has as input the nbnxn coordinates.
       Doing the same for the local kernel is more complicated, since the
       local part of the force array also depends on the non-local kernel.
       So to avoid complicating the code and to reduce the risk of bugs,
       we always call the local x+q copy (and the rest of the local
       work in nbnxn_gpu_launch_kernel()).
     */
    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
    {
        plist->haveFreshList = false;

        return;
    }

    /* calculate the atom data index range based on locality */
    if (atomLocality == AtomLocality::Local)
    {
        adat_begin = 0;
        adat_len   = adat->natoms_local;
    }
    else
    {
        adat_begin = adat->natoms_local;
        adat_len   = adat->natoms - adat->natoms_local;
    }

    /* beginning of timed HtoD section */
    if (bDoTime)
    {
        t->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
    }

    /* HtoD x, q */
    GMX_ASSERT(sizeof(float) == sizeof(*nbatom->x().data()),
               "The size of the xyzq buffer element should be equal to the size of float4.");
    copyToDeviceBuffer(&adat->xq, nbatom->x().data() + adat_begin * 4, adat_begin * 4, adat_len * 4,
                       deviceStream, GpuApiCallBehavior::Async,
                       bDoTime ? t->xf[atomLocality].nb_h2d.fetchNextEvent() : nullptr);

    if (bDoTime)
    {
        t->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
    }

    /* When we get here all misc operations issued in the local stream as well as
       the local xq H2D are done,
       so we record that in the local stream and wait for it in the nonlocal one. */
    if (nb->bUseTwoStreams)
    {
        if (iloc == InteractionLocality::Local)
        {
            cl_int gmx_used_in_debug cl_error = clEnqueueMarkerWithWaitList(
                    deviceStream.stream(), 0, nullptr, &(nb->misc_ops_and_local_H2D_done));
            GMX_ASSERT(cl_error == CL_SUCCESS,
                       ("clEnqueueMarkerWithWaitList failed: " + ocl_get_error_string(cl_error)).c_str());

            /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
             * in the local stream in order to be able to sync with the above event
             * from the non-local stream.
             */
            cl_error = clFlush(deviceStream.stream());
            GMX_ASSERT(cl_error == CL_SUCCESS,
                       ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
        }
        else
        {
            sync_ocl_event(deviceStream.stream(), &(nb->misc_ops_and_local_H2D_done));
        }
    }
}
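/* A sketch of the header TODO on expressing the dependency above through the
 * event_wait_list argument instead of a queue-wide barrier (illustrative only;
 * the surrounding launch code does not work this way today, and nonLocalQueue,
 * nbKernel, globalWorkSize and localWorkSize are hypothetical handles):
 * \code
 *   cl_event deps[] = { nb->misc_ops_and_local_H2D_done };
 *   cl_int   err    = clEnqueueNDRangeKernel(nonLocalQueue, nbKernel, 3, nullptr,
 *                                            globalWorkSize, localWorkSize,
 *                                            1, deps, nullptr);
 * \endcode
 * This would make only the non-local kernel wait on the local H2D/misc work,
 * rather than stalling everything subsequently enqueued in the queue.
 */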
/*! \brief Launch GPU kernel

   As we execute nonbonded workload in separate queues, before launching
   the kernel we need to make sure that the following operations have completed:
   - atomdata allocation and related H2D transfers (every nstlist step);
   - pair list H2D transfer (every nstlist step);
   - shift vector H2D transfer (every nstlist step);
   - force (+shift force and energy) output clearing (every step).

   These operations are issued in the local queue at the beginning of the step
   and therefore always complete before the local kernel launch. The non-local
   kernel is launched after the local on the same device/context, so this is
   inherently scheduled after the operations in the local stream (including the
   above "misc_ops").
   However, for the sake of having a future-proof implementation, we use the
   misc_ops_done event to record the point in time when the above operations
   are finished and synchronize with this event in the non-local stream.
 */
void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nbnxm::InteractionLocality iloc)
{
    cl_atomdata_t*      adat         = nb->atdat;
    NBParamGpu*         nbp          = nb->nbparam;
    gpu_plist*          plist        = nb->plist[iloc];
    cl_timers_t*        t            = nb->timers;
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];

    bool bDoTime = nb->bDoTime;

    cl_nbparam_params_t nbparams_params;

    /* Don't launch the non-local kernel if there is no work to do.
       Doing the same for the local kernel is more complicated, since the
       local part of the force array also depends on the non-local kernel.
       So to avoid complicating the code and to reduce the risk of bugs,
       we always call the local kernel and later (not in
       this function) the stream wait, local f copyback and the f buffer
       clearing. All these operations, except for the local interaction kernel,
       are needed for the non-local interactions. The skip of the local kernel
       call is taken care of later in this function. */
    if (canSkipNonbondedWork(*nb, iloc))
    {
        plist->haveFreshList = false;

        return;
    }

    if (nbp->useDynamicPruning && plist->haveFreshList)
    {
        /* Prunes for rlistOuter and rlistInner, sets plist->haveFreshList=false
           (that's the way the timing accounting can distinguish between
           separate prune kernel and combined force+prune).
         */
        Nbnxm::gpu_launch_kernel_pruneonly(nb, iloc, 1);
    }

    if (plist->nsci == 0)
    {
        /* Don't launch an empty local kernel (not allowed with OpenCL).
         */
        return;
    }

    /* beginning of timed nonbonded calculation section */
    if (bDoTime)
    {
        t->interaction[iloc].nb_k.openTimingRegion(deviceStream);
    }

    /* kernel launch config */

    KernelLaunchConfig config;
    config.sharedMemorySize = calc_shmem_required_nonbonded(nbp->vdwType, nb->bPrefetchLjParam);
    config.blockSize[0]     = c_clSize;
    config.blockSize[1]     = c_clSize;
    config.gridSize[0]      = plist->nsci;

    validate_global_work_size(config, 3, &nb->deviceContext_->deviceInfo());

    if (debug)
    {
        fprintf(debug,
                "Non-bonded GPU launch configuration:\n\tLocal work size: %zux%zux%zu\n\t"
                "Global work size : %zux%zu\n\t#Super-clusters/clusters: %d/%d (%d)\n",
                config.blockSize[0], config.blockSize[1], config.blockSize[2],
                config.blockSize[0] * config.gridSize[0], config.blockSize[1] * config.gridSize[1],
                plist->nsci * c_nbnxnGpuNumClusterPerSupercluster,
                c_nbnxnGpuNumClusterPerSupercluster, plist->na_c);
    }

    fillin_ocl_structures(nbp, &nbparams_params);

    auto*          timingEvent  = bDoTime ? t->interaction[iloc].nb_k.fetchNextEvent() : nullptr;
    constexpr char kernelName[] = "k_calc_nb";
    const auto     kernel =
            select_nbnxn_kernel(nb, nbp->elecType, nbp->vdwType, stepWork.computeEnergy,
                                (plist->haveFreshList && !nb->timers->interaction[iloc].didPrune));

    // The OpenCL kernel takes int as second to last argument because bool is
    // not supported as a kernel argument type (sizeof(bool) is implementation defined).
    const int computeFshift = static_cast<int>(stepWork.computeVirial);
    if (useLjCombRule(nb->nbparam->vdwType))
    {
        const auto kernelArgs = prepareGpuKernelArguments(
                kernel, config, &nbparams_params, &adat->xq, &adat->f, &adat->e_lj, &adat->e_el,
                &adat->fshift, &adat->lj_comb, &adat->shift_vec, &nbp->nbfp, &nbp->nbfp_comb,
                &nbp->coulomb_tab, &plist->sci, &plist->cj4, &plist->excl, &computeFshift);

        launchGpuKernel(kernel, config, deviceStream, timingEvent, kernelName, kernelArgs);
    }
    else
    {
        const auto kernelArgs = prepareGpuKernelArguments(
                kernel, config, &adat->ntypes, &nbparams_params, &adat->xq, &adat->f, &adat->e_lj,
                &adat->e_el, &adat->fshift, &adat->atom_types, &adat->shift_vec, &nbp->nbfp, &nbp->nbfp_comb,
                &nbp->coulomb_tab, &plist->sci, &plist->cj4, &plist->excl, &computeFshift);
        launchGpuKernel(kernel, config, deviceStream, timingEvent, kernelName, kernelArgs);
    }

    if (bDoTime)
    {
        t->interaction[iloc].nb_k.closeTimingRegion(deviceStream);
    }
}
/*! \brief Calculates the amount of shared memory required by the prune kernel.
 *
 *  Note that for the sake of simplicity we use the CUDA terminology "shared memory"
 *  for OpenCL local memory.
 *
 * \param[in] num_threads_z cj4 concurrency equal to the number of threads/work items in the
 *                          3rd dimension.
 * \returns   the amount of local memory in bytes required by the pruning kernel
 */
static inline int calc_shmem_required_prune(const int num_threads_z)
{
    int shmem;

    /* i-atom x in shared memory (for convenience we load all 4 components including q) */
    shmem = c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(float) * 4;
    /* cj in shared memory, for each warp separately
     * Note: only need to load once per wavefront, but to keep the code simple,
     * for now we load twice on AMD.
     */
    shmem += num_threads_z * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize * sizeof(int);
    /* Warp vote, requires one uint per warp/32 threads per block. */
    shmem += sizeof(cl_uint) * 2 * num_threads_z;

    return shmem;
}
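/* Worked example (same assumed constants as the nonbonded sketch above, plus
 * c_nbnxnGpuClusterpairSplit == 2): with num_threads_z = 1 this amounts to
 * 8*8*sizeof(float)*4 + 1*2*4*sizeof(int) + sizeof(cl_uint)*2*1
 * = 1024 + 32 + 8 = 1064 bytes, and every extra z-slice of work items adds
 * another 32 + 8 = 40 bytes of local memory.
 */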
/*! \brief
 * Launch the pairlist prune only kernel for the given locality.
 * \p numParts tells in how many parts, i.e. calls, the list will be pruned.
 */
void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, const int numParts)
{
    cl_atomdata_t*      adat         = nb->atdat;
    NBParamGpu*         nbp          = nb->nbparam;
    gpu_plist*          plist        = nb->plist[iloc];
    cl_timers_t*        t            = nb->timers;
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
    bool                bDoTime      = nb->bDoTime;

    if (plist->haveFreshList)
    {
        GMX_ASSERT(numParts == 1, "With first pruning we expect 1 part");

        /* Set rollingPruningNumParts to signal that it is not set */
        plist->rollingPruningNumParts = 0;
        plist->rollingPruningPart     = 0;
    }
    else
    {
        if (plist->rollingPruningNumParts == 0)
        {
            plist->rollingPruningNumParts = numParts;
        }
        else
        {
            GMX_ASSERT(numParts == plist->rollingPruningNumParts,
                       "It is not allowed to change numParts in between list generation steps");
        }
    }

    /* Use a local variable for part and update in plist, so we can return here
     * without duplicating the part increment code.
     */
    int part = plist->rollingPruningPart;

    plist->rollingPruningPart++;
    if (plist->rollingPruningPart >= plist->rollingPruningNumParts)
    {
        plist->rollingPruningPart = 0;
    }

    /* Compute the number of list entries to prune in this pass */
    int numSciInPart = (plist->nsci - part) / numParts;

    /* Don't launch the kernel if there is no work to do. */
    if (numSciInPart <= 0)
    {
        plist->haveFreshList = false;

        return;
    }

    GpuRegionTimer* timer = nullptr;
    if (bDoTime)
    {
        timer = &(plist->haveFreshList ? t->interaction[iloc].prune_k : t->interaction[iloc].rollingPrune_k);
    }

    /* beginning of timed prune calculation section */
    if (bDoTime)
    {
        timer->openTimingRegion(deviceStream);
    }

    /* Kernel launch config:
     * - The thread block dimensions match the size of i-clusters, j-clusters,
     *   and j-cluster concurrency, in x, y, and z, respectively.
     * - The 1D block-grid contains as many blocks as super-clusters.
     */
    int num_threads_z = c_oclPruneKernelJ4ConcurrencyDEFAULT;

    /* kernel launch config */
    KernelLaunchConfig config;
    config.sharedMemorySize = calc_shmem_required_prune(num_threads_z);
    config.blockSize[0]     = c_clSize;
    config.blockSize[1]     = c_clSize;
    config.blockSize[2]     = num_threads_z;
    config.gridSize[0]      = numSciInPart;

    validate_global_work_size(config, 3, &nb->deviceContext_->deviceInfo());

    if (debug)
    {
        fprintf(debug,
                "Pruning GPU kernel launch configuration:\n\tLocal work size: %zux%zux%zu\n\t"
                "\tGlobal work size: %zux%zu\n\t#Super-clusters/clusters: %d/%d (%d)\n"
                "\tShMem: %zu\n",
                config.blockSize[0], config.blockSize[1], config.blockSize[2],
                config.blockSize[0] * config.gridSize[0], config.blockSize[1] * config.gridSize[1],
                plist->nsci * c_nbnxnGpuNumClusterPerSupercluster,
                c_nbnxnGpuNumClusterPerSupercluster, plist->na_c, config.sharedMemorySize);
    }

    cl_nbparam_params_t nbparams_params;
    fillin_ocl_structures(nbp, &nbparams_params);

    auto*          timingEvent  = bDoTime ? timer->fetchNextEvent() : nullptr;
    constexpr char kernelName[] = "k_pruneonly";
    const auto     pruneKernel  = selectPruneKernel(nb->kernel_pruneonly, plist->haveFreshList);
    const auto     kernelArgs   = prepareGpuKernelArguments(pruneKernel, config, &nbparams_params,
                                                            &adat->xq, &adat->shift_vec, &plist->sci,
                                                            &plist->cj4, &plist->imask, &numParts, &part);
    launchGpuKernel(pruneKernel, config, deviceStream, timingEvent, kernelName, kernelArgs);

    if (plist->haveFreshList)
    {
        plist->haveFreshList = false;
        /* Mark that pruning has been done */
        nb->timers->interaction[iloc].didPrune = true;
    }
    else
    {
        /* Mark that rolling pruning has been done */
        nb->timers->interaction[iloc].didRollingPrune = true;
    }

    if (bDoTime)
    {
        timer->closeTimingRegion(deviceStream);
    }
}
/*! \brief
 * Launch asynchronously the download of nonbonded forces from the GPU
 * (and energies/shift forces if required).
 */
void gpu_launch_cpyback(NbnxmGpu*                nb,
                        struct nbnxn_atomdata_t* nbatom,
                        const gmx::StepWorkload& stepWork,
                        const AtomLocality       aloc)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    cl_int gmx_unused cl_error;
    int               adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */

    /* determine interaction locality from atom locality */
    const InteractionLocality iloc = gpuAtomToInteractionLocality(aloc);

    cl_atomdata_t*      adat         = nb->atdat;
    cl_timers_t*        t            = nb->timers;
    bool                bDoTime      = nb->bDoTime;
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];

    /* don't launch non-local copy-back if there was no non-local work to do */
    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
    {
        /* TODO An alternative way to signal that non-local work is
           complete is to use a clEnqueueMarker+clEnqueueBarrier
           pair. However, the use of bNonLocalStreamActive has the
           advantage of being local to the host, so probably minimizes
           overhead. Curiously, for NVIDIA OpenCL with an empty-domain
           test case, overall simulation performance was higher with
           the API calls, but this has not been tested on AMD OpenCL,
           so could be worth considering in future. */
        nb->bNonLocalStreamActive = CL_FALSE;
        return;
    }

    getGpuAtomRange(adat, aloc, &adat_begin, &adat_len);

    /* beginning of timed D2H section */
    if (bDoTime)
    {
        t->xf[aloc].nb_d2h.openTimingRegion(deviceStream);
    }

    /* With DD the local D2H transfer can only start after the non-local
       has been launched. */
    if (iloc == InteractionLocality::Local && nb->bNonLocalStreamActive)
    {
        sync_ocl_event(deviceStream.stream(), &(nb->nonlocal_done));
    }

    /* DtoH f */
    GMX_ASSERT(sizeof(*nbatom->out[0].f.data()) == sizeof(float),
               "The host force buffer should be in single precision to match device data size.");
    copyFromDeviceBuffer(&nbatom->out[0].f[adat_begin * DIM], &adat->f, adat_begin * DIM,
                         adat_len * DIM, deviceStream, GpuApiCallBehavior::Async,
                         bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);

    /* kick off work */
    cl_error = clFlush(deviceStream.stream());
    GMX_ASSERT(cl_error == CL_SUCCESS, ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());

    /* After the non-local D2H is launched the nonlocal_done event can be
       recorded which signals that the local D2H can proceed. This event is not
       placed after the non-local kernel because we first need the non-local
       data back. */
    if (iloc == InteractionLocality::NonLocal)
    {
        cl_error = clEnqueueMarkerWithWaitList(deviceStream.stream(), 0, nullptr, &(nb->nonlocal_done));
        GMX_ASSERT(cl_error == CL_SUCCESS,
                   ("clEnqueueMarkerWithWaitList failed: " + ocl_get_error_string(cl_error)).c_str());
        nb->bNonLocalStreamActive = CL_TRUE;
    }

    /* only transfer energies in the local stream */
    if (iloc == InteractionLocality::Local)
    {
        /* DtoH fshift when virial is needed */
        if (stepWork.computeVirial)
        {
            GMX_ASSERT(sizeof(*nb->nbst.fshift) == DIM * sizeof(float),
                       "Sizes of host- and device-side shift vector elements should be the same.");
            copyFromDeviceBuffer(reinterpret_cast<float*>(nb->nbst.fshift), &adat->fshift, 0,
                                 SHIFTS * DIM, deviceStream, GpuApiCallBehavior::Async,
                                 bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
        }

        /* DtoH energies */
        if (stepWork.computeEnergy)
        {
            GMX_ASSERT(sizeof(*nb->nbst.e_lj) == sizeof(float),
                       "Sizes of host- and device-side LJ energy terms should be the same.");
            copyFromDeviceBuffer(nb->nbst.e_lj, &adat->e_lj, 0, 1, deviceStream,
                                 GpuApiCallBehavior::Async,
                                 bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
            GMX_ASSERT(sizeof(*nb->nbst.e_el) == sizeof(float),
                       "Sizes of host- and device-side electrostatic energy terms should be the "
                       "same.");
            copyFromDeviceBuffer(nb->nbst.e_el, &adat->e_el, 0, 1, deviceStream,
                                 GpuApiCallBehavior::Async,
                                 bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
        }
    }

    if (bDoTime)
    {
        t->xf[aloc].nb_d2h.closeTimingRegion(deviceStream);
    }
}
} // namespace Nbnxm