Report name of missing OpenCL kernels
src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
 * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *  \brief Define OpenCL implementation of nbnxm_gpu.h
 *
 *  \author Anca Hamuraru <anca@streamcomputing.eu>
 *  \author Teemu Virolainen <teemu@streamcomputing.eu>
 *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
 *  \author Szilárd Páll <pall.szilard@gmail.com>
 *  \ingroup module_nbnxm
 *
 *  TODO (psz):
 *  - Add a static const cl_uint c_pruneKernelWorkDim / c_nbnxnKernelWorkDim = 3;
 *  - Rework the copying of OCL data structures done before every invocation of both
 *    nb and prune kernels (using fillin_ocl_structures); also consider at the same
 *    time calling clSetKernelArg only on the updated parameters (if tracking changed
 *    parameters is feasible);
 *  - Consider using the event_wait_list argument to clEnqueueNDRangeKernel to mark
 *    dependencies on the kernels launched: e.g. the non-local nb kernel's dependency
 *    on the misc_ops_and_local_H2D_done event could be better expressed this way.
 *
 *  - Consider extracting common sections of the OpenCL and CUDA nbnxn logic, e.g.:
 *    - in nbnxn_gpu_launch_kernel_pruneonly() the pre- and post-kernel launch logic
 *      is identical in the two implementations, so a 3-way split might allow sharing
 *      code;
 *    -
 */
#include "gmxpre.h"

#include <assert.h>
#include <stdlib.h>
#if defined(_MSC_VER)
#    include <limits>
#endif
#include "thread_mpi/atomic.h"

#include "gromacs/gpu_utils/device_context.h"
#include "gromacs/gpu_utils/gputraits_ocl.h"
#include "gromacs/gpu_utils/oclutils.h"
#include "gromacs/hardware/device_information.h"
#include "gromacs/hardware/hw_info.h"
#include "gromacs/mdtypes/simulation_workload.h"
#include "gromacs/nbnxm/atomdata.h"
#include "gromacs/nbnxm/gpu_common.h"
#include "gromacs/nbnxm/gpu_common_utils.h"
#include "gromacs/nbnxm/gpu_data_mgmt.h"
#include "gromacs/nbnxm/nbnxm.h"
#include "gromacs/nbnxm/nbnxm_gpu.h"
#include "gromacs/nbnxm/pairlist.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"

#include "nbnxm_ocl_types.h"
namespace Nbnxm
{

/*! \brief Convenience constants */
//@{
static constexpr int c_clSize = c_nbnxnGpuClusterSize;
//@}
/*! \brief Validates the input global work size parameter.
 */
static inline void validate_global_work_size(const KernelLaunchConfig& config,
                                             int                       work_dim,
                                             const DeviceInformation*  dinfo)
{
    cl_uint device_size_t_size_bits;
    cl_uint host_size_t_size_bits;

    GMX_ASSERT(dinfo, "Need a valid device info object");

    size_t global_work_size[3];
    GMX_ASSERT(work_dim <= 3, "Not supporting hyper-grids just yet");
    for (int i = 0; i < work_dim; i++)
    {
        global_work_size[i] = config.blockSize[i] * config.gridSize[i];
    }

    /* Each component of a global_work_size must not exceed the range given by the
       sizeof(device size_t) for the device on which the kernel execution will
       be enqueued. See:
       https://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/clEnqueueNDRangeKernel.html
     */
    device_size_t_size_bits = dinfo->adress_bits;
    host_size_t_size_bits   = static_cast<cl_uint>(sizeof(size_t) * 8);

    /* If sizeof(host size_t) <= sizeof(device size_t)
       => global_work_size components will always be valid
       else
       => get device limit for global work size and
          compare it against each component of global_work_size.
     */
    if (host_size_t_size_bits > device_size_t_size_bits)
    {
        size_t device_limit;

        device_limit = (1ULL << device_size_t_size_bits) - 1;

        for (int i = 0; i < work_dim; i++)
        {
            if (global_work_size[i] > device_limit)
            {
                gmx_fatal(FARGS,
                          "Watch out, the input system is too large to simulate!\n"
                          "The number of nonbonded work units (=number of super-clusters) exceeds "
                          "the device capabilities. Global work size limit exceeded (%zu > %zu)!",
                          global_work_size[i], device_limit);
            }
        }
    }
}
/* Constant arrays listing non-bonded kernel function names. The arrays are
 * organized in 2-dim arrays by: electrostatics and VDW type.
 *
 * Note that the row- and column-order of function pointers has to match the
 * order of corresponding enumerated electrostatics and vdw types, resp.,
 * defined in nbnxm_ocl_types.h.
 */
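
/* Naming convention used in the tables below: each entry is composed as
 * nbnxn_kernel_<ElecType>_<VdwType>_<F|VF>[_prune]_opencl, where F denotes a
 * force-only kernel, VF a force+energy kernel, and the optional _prune infix
 * a kernel variant that also prunes the pair list while computing forces.
 */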
/*! \brief Force-only kernel function names. */
static const char* nb_kfunc_noener_noprune_ptr[eelTypeNR][evdwTypeNR] = {
    { "nbnxn_kernel_ElecCut_VdwLJ_F_opencl",
      "nbnxn_kernel_ElecCut_VdwLJCombGeom_F_opencl",
      "nbnxn_kernel_ElecCut_VdwLJCombLB_F_opencl",
      "nbnxn_kernel_ElecCut_VdwLJFsw_F_opencl",
      "nbnxn_kernel_ElecCut_VdwLJPsw_F_opencl",
      "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_opencl",
      "nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_opencl" },
    { "nbnxn_kernel_ElecRF_VdwLJ_F_opencl",
      "nbnxn_kernel_ElecRF_VdwLJCombGeom_F_opencl",
      "nbnxn_kernel_ElecRF_VdwLJCombLB_F_opencl",
      "nbnxn_kernel_ElecRF_VdwLJFsw_F_opencl",
      "nbnxn_kernel_ElecRF_VdwLJPsw_F_opencl",
      "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_opencl",
      "nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_opencl" },
    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_opencl",
      "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_F_opencl",
      "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_F_opencl",
      "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_opencl",
      "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_opencl",
      "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_opencl",
      "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_opencl" },
    { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_opencl",
      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_F_opencl",
      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_F_opencl",
      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_opencl",
      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_opencl",
      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_opencl",
      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_opencl" },
    { "nbnxn_kernel_ElecEw_VdwLJ_F_opencl",
      "nbnxn_kernel_ElecEw_VdwLJCombGeom_F_opencl",
      "nbnxn_kernel_ElecEw_VdwLJCombLB_F_opencl",
      "nbnxn_kernel_ElecEw_VdwLJFsw_F_opencl",
      "nbnxn_kernel_ElecEw_VdwLJPsw_F_opencl",
      "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_opencl",
      "nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_opencl" },
    { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_opencl",
      "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_F_opencl",
      "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_F_opencl",
      "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_opencl",
      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_opencl",
      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_opencl",
      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_opencl" }
};
/*! \brief Force + energy kernel function pointers. */
static const char* nb_kfunc_ener_noprune_ptr[eelTypeNR][evdwTypeNR] = {
    { "nbnxn_kernel_ElecCut_VdwLJ_VF_opencl",
      "nbnxn_kernel_ElecCut_VdwLJCombGeom_VF_opencl",
      "nbnxn_kernel_ElecCut_VdwLJCombLB_VF_opencl",
      "nbnxn_kernel_ElecCut_VdwLJFsw_VF_opencl",
      "nbnxn_kernel_ElecCut_VdwLJPsw_VF_opencl",
      "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_opencl",
      "nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_opencl" },
    { "nbnxn_kernel_ElecRF_VdwLJ_VF_opencl",
      "nbnxn_kernel_ElecRF_VdwLJCombGeom_VF_opencl",
      "nbnxn_kernel_ElecRF_VdwLJCombLB_VF_opencl",
      "nbnxn_kernel_ElecRF_VdwLJFsw_VF_opencl",
      "nbnxn_kernel_ElecRF_VdwLJPsw_VF_opencl",
      "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_opencl",
      "nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_opencl" },
    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_opencl",
      "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_VF_opencl",
      "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_VF_opencl",
      "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_opencl",
      "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_opencl",
      "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_opencl",
      "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_opencl" },
    { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_opencl",
      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_VF_opencl",
      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_VF_opencl",
      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_opencl",
      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_opencl",
      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_opencl",
      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_opencl" },
    { "nbnxn_kernel_ElecEw_VdwLJ_VF_opencl",
      "nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_opencl",
      "nbnxn_kernel_ElecEw_VdwLJCombLB_VF_opencl",
      "nbnxn_kernel_ElecEw_VdwLJFsw_VF_opencl",
      "nbnxn_kernel_ElecEw_VdwLJPsw_VF_opencl",
      "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_opencl",
      "nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_opencl" },
    { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_opencl",
      "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_VF_opencl",
      "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_VF_opencl",
      "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_opencl",
      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_opencl",
      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_opencl",
      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_opencl" }
};
/*! \brief Force + pruning kernel function pointers. */
static const char* nb_kfunc_noener_prune_ptr[eelTypeNR][evdwTypeNR] = {
    { "nbnxn_kernel_ElecCut_VdwLJ_F_prune_opencl",
      "nbnxn_kernel_ElecCut_VdwLJCombGeom_F_prune_opencl",
      "nbnxn_kernel_ElecCut_VdwLJCombLB_F_prune_opencl",
      "nbnxn_kernel_ElecCut_VdwLJFsw_F_prune_opencl",
      "nbnxn_kernel_ElecCut_VdwLJPsw_F_prune_opencl",
      "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_prune_opencl",
      "nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_prune_opencl" },
    { "nbnxn_kernel_ElecRF_VdwLJ_F_prune_opencl",
      "nbnxn_kernel_ElecRF_VdwLJCombGeom_F_prune_opencl",
      "nbnxn_kernel_ElecRF_VdwLJCombLB_F_prune_opencl",
      "nbnxn_kernel_ElecRF_VdwLJFsw_F_prune_opencl",
      "nbnxn_kernel_ElecRF_VdwLJPsw_F_prune_opencl",
      "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_prune_opencl",
      "nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_prune_opencl" },
    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_opencl",
      "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_F_prune_opencl",
      "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_F_prune_opencl",
      "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_prune_opencl",
      "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_prune_opencl",
      "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_prune_opencl",
      "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_prune_opencl" },
    { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_prune_opencl",
      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_F_prune_opencl",
      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_F_prune_opencl",
      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_prune_opencl",
      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_prune_opencl",
      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_prune_opencl",
      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_prune_opencl" },
    { "nbnxn_kernel_ElecEw_VdwLJ_F_prune_opencl",
      "nbnxn_kernel_ElecEw_VdwLJCombGeom_F_prune_opencl",
      "nbnxn_kernel_ElecEw_VdwLJCombLB_F_prune_opencl",
      "nbnxn_kernel_ElecEw_VdwLJFsw_F_prune_opencl",
      "nbnxn_kernel_ElecEw_VdwLJPsw_F_prune_opencl",
      "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_prune_opencl",
      "nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_prune_opencl" },
    { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_opencl",
      "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_F_prune_opencl",
      "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_F_prune_opencl",
      "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_prune_opencl",
      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_prune_opencl",
      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_prune_opencl",
      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_prune_opencl" }
};
/*! \brief Force + energy + pruning kernel function pointers. */
static const char* nb_kfunc_ener_prune_ptr[eelTypeNR][evdwTypeNR] = {
    { "nbnxn_kernel_ElecCut_VdwLJ_VF_prune_opencl",
      "nbnxn_kernel_ElecCut_VdwLJCombGeom_VF_prune_opencl",
      "nbnxn_kernel_ElecCut_VdwLJCombLB_VF_prune_opencl",
      "nbnxn_kernel_ElecCut_VdwLJFsw_VF_prune_opencl",
      "nbnxn_kernel_ElecCut_VdwLJPsw_VF_prune_opencl",
      "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_prune_opencl",
      "nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_prune_opencl" },
    { "nbnxn_kernel_ElecRF_VdwLJ_VF_prune_opencl",
      "nbnxn_kernel_ElecRF_VdwLJCombGeom_VF_prune_opencl",
      "nbnxn_kernel_ElecRF_VdwLJCombLB_VF_prune_opencl",
      "nbnxn_kernel_ElecRF_VdwLJFsw_VF_prune_opencl",
      "nbnxn_kernel_ElecRF_VdwLJPsw_VF_prune_opencl",
      "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_prune_opencl",
      "nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_prune_opencl" },
    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_opencl",
      "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_VF_prune_opencl",
      "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_VF_prune_opencl",
      "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_prune_opencl",
      "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_prune_opencl",
      "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_prune_opencl",
      "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_prune_opencl" },
    { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_prune_opencl",
      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_VF_prune_opencl",
      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_VF_prune_opencl",
      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_prune_opencl",
      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_prune_opencl",
      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_prune_opencl",
      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_prune_opencl" },
    { "nbnxn_kernel_ElecEw_VdwLJ_VF_prune_opencl",
      "nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_prune_opencl",
      "nbnxn_kernel_ElecEw_VdwLJCombLB_VF_prune_opencl",
      "nbnxn_kernel_ElecEw_VdwLJFsw_VF_prune_opencl",
      "nbnxn_kernel_ElecEw_VdwLJPsw_VF_prune_opencl",
      "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_prune_opencl",
      "nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_prune_opencl" },
    { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_prune_opencl",
      "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_VF_prune_opencl",
      "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_VF_prune_opencl",
      "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_prune_opencl",
      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_prune_opencl",
      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_prune_opencl",
      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_prune_opencl" }
};
/*! \brief Return a pointer to the prune kernel version to be executed at the current invocation.
 *
 * \param[in] kernel_pruneonly  array of prune kernel objects
 * \param[in] firstPrunePass    true if the first pruning pass is being executed
 */
static inline cl_kernel selectPruneKernel(cl_kernel kernel_pruneonly[], bool firstPrunePass)
{
    cl_kernel* kernelPtr;

    if (firstPrunePass)
    {
        kernelPtr = &(kernel_pruneonly[epruneFirst]);
    }
    else
    {
        kernelPtr = &(kernel_pruneonly[epruneRolling]);
    }
    // TODO: consider creating the prune kernel object here to avoid a
    // clCreateKernel for the rolling prune kernel if this is not needed.
    return *kernelPtr;
}
/*! \brief Return a pointer to the kernel version to be executed at the current step.
 *  OpenCL kernel objects are cached in nb. If the requested kernel is not
 *  found in the cache, it will be created and the cache will be updated.
 */
static inline cl_kernel select_nbnxn_kernel(NbnxmGpu* nb, int eeltype, int evdwtype, bool bDoEne, bool bDoPrune)
{
    const char* kernel_name_to_run;
    cl_kernel*  kernel_ptr;
    cl_int      cl_error;

    GMX_ASSERT(eeltype < eelTypeNR,
               "The electrostatics type requested is not implemented in the OpenCL kernels.");
    GMX_ASSERT(evdwtype < evdwTypeNR,
               "The VdW type requested is not implemented in the OpenCL kernels.");

    if (bDoEne)
    {
        if (bDoPrune)
        {
            kernel_name_to_run = nb_kfunc_ener_prune_ptr[eeltype][evdwtype];
            kernel_ptr         = &(nb->kernel_ener_prune_ptr[eeltype][evdwtype]);
        }
        else
        {
            kernel_name_to_run = nb_kfunc_ener_noprune_ptr[eeltype][evdwtype];
            kernel_ptr         = &(nb->kernel_ener_noprune_ptr[eeltype][evdwtype]);
        }
    }
    else
    {
        if (bDoPrune)
        {
            kernel_name_to_run = nb_kfunc_noener_prune_ptr[eeltype][evdwtype];
            kernel_ptr         = &(nb->kernel_noener_prune_ptr[eeltype][evdwtype]);
        }
        else
        {
            kernel_name_to_run = nb_kfunc_noener_noprune_ptr[eeltype][evdwtype];
            kernel_ptr         = &(nb->kernel_noener_noprune_ptr[eeltype][evdwtype]);
        }
    }

    if (nullptr == kernel_ptr[0])
    {
        *kernel_ptr = clCreateKernel(nb->dev_rundata->program, kernel_name_to_run, &cl_error);
        GMX_ASSERT(cl_error == CL_SUCCESS,
                   ("clCreateKernel failed: " + ocl_get_error_string(cl_error)
                    + " for kernel named " + kernel_name_to_run)
                           .c_str());
    }

    return *kernel_ptr;
}
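
/* Note on caching: only the first request for a given (eeltype, evdwtype, energy, prune)
 * combination pays the clCreateKernel() cost; subsequent calls return the cl_kernel
 * object cached in the corresponding nb->kernel_*_ptr entry.
 */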
/*! \brief Calculates the amount of shared memory required by the nonbonded kernel in use.
 */
static inline int calc_shmem_required_nonbonded(int vdwType, bool bPrefetchLjParam)
{
    int shmem;

    /* size of shmem (force-buffers/xq/atom type preloading) */
    /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
    /* i-atom x+q in shared memory */
    shmem = c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(float) * 4; /* xqib */
    /* cj in shared memory, for both warps separately
     * TODO: in the "nowarp" kernels we load cj only once, so the factor 2 is not needed.
     */
    shmem += 2 * c_nbnxnGpuJgroupSize * sizeof(int); /* cjs */
    if (bPrefetchLjParam)
    {
        if (useLjCombRule(vdwType))
        {
            /* i-atom LJ combination parameters in shared memory */
            shmem += c_nbnxnGpuNumClusterPerSupercluster * c_clSize * 2
                     * sizeof(float); /* atib abused for ljcp, float2 */
        }
        else
        {
            /* i-atom types in shared memory */
            shmem += c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(int); /* atib */
        }
    }
    /* force reduction buffers in shared memory */
    shmem += c_clSize * c_clSize * 3 * sizeof(float); /* f_buf */
    /* Warp vote. In fact it must be * number of warps in block.. */
    shmem += sizeof(cl_uint) * 2; /* warp_any */
    return shmem;
}
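
/* Rough illustration of the sizes involved, assuming the usual GPU cluster setup of
 * c_clSize = 8, c_nbnxnGpuNumClusterPerSupercluster = 8 and c_nbnxnGpuJgroupSize = 4
 * (the actual values come from the nbnxm pairlist constants):
 *   xqib:     8 * 8 * 4 * sizeof(float) = 1024 bytes
 *   cjs:      2 * 4 * sizeof(int)       =   32 bytes
 *   atib:     8 * 8 * sizeof(int)       =  256 bytes (512 bytes when used for float2 LJ parameters)
 *   f_buf:    8 * 8 * 3 * sizeof(float) =  768 bytes
 *   warp_any: 2 * sizeof(cl_uint)       =    8 bytes
 * i.e. roughly 2 KiB of local memory per work-group.
 */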
/*! \brief Initializes data structures that are going to be sent to the OpenCL device.
 *
 * The device can't use the same data structures as the host for two main reasons:
 * - OpenCL restrictions (pointers are not accepted inside data structures)
 * - some host side fields are not needed for the OpenCL kernels.
 *
 * This function is called before the launch of both nbnxn and prune kernels.
 */
static void fillin_ocl_structures(NBParamGpu* nbp, cl_nbparam_params_t* nbparams_params)
{
    nbparams_params->coulomb_tab_scale = nbp->coulomb_tab_scale;
    nbparams_params->c_rf              = nbp->c_rf;
    nbparams_params->dispersion_shift  = nbp->dispersion_shift;
    nbparams_params->eeltype           = nbp->eeltype;
    nbparams_params->epsfac            = nbp->epsfac;
    nbparams_params->ewaldcoeff_lj     = nbp->ewaldcoeff_lj;
    nbparams_params->ewald_beta        = nbp->ewald_beta;
    nbparams_params->rcoulomb_sq       = nbp->rcoulomb_sq;
    nbparams_params->repulsion_shift   = nbp->repulsion_shift;
    nbparams_params->rlistOuter_sq     = nbp->rlistOuter_sq;
    nbparams_params->rvdw_sq           = nbp->rvdw_sq;
    nbparams_params->rlistInner_sq     = nbp->rlistInner_sq;
    nbparams_params->rvdw_switch       = nbp->rvdw_switch;
    nbparams_params->sh_ewald          = nbp->sh_ewald;
    nbparams_params->sh_lj_ewald       = nbp->sh_lj_ewald;
    nbparams_params->two_k_rf          = nbp->two_k_rf;
    nbparams_params->vdwtype           = nbp->vdwtype;
    nbparams_params->vdw_switch        = nbp->vdw_switch;
}
/*! \brief Enqueues a wait for event completion.
 *
 * Then it releases the event and sets it to 0.
 * Don't use this function when more than one wait will be issued for the event.
 * Equivalent to a CUDA stream synchronization. */
static void sync_ocl_event(cl_command_queue stream, cl_event* ocl_event)
{
    cl_int gmx_unused cl_error;

    /* Enqueue wait */
    cl_error = clEnqueueBarrierWithWaitList(stream, 1, ocl_event, nullptr);
    GMX_RELEASE_ASSERT(CL_SUCCESS == cl_error, ocl_get_error_string(cl_error).c_str());

    /* Release the event and reset it to 0. It is OK to release it here because the
       enqueued barrier performs an implicit retain of the events in its wait list. */
    cl_error = clReleaseEvent(*ocl_event);
    GMX_ASSERT(cl_error == CL_SUCCESS,
               ("clReleaseEvent failed: " + ocl_get_error_string(cl_error)).c_str());
    *ocl_event = nullptr;
}
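
/* In this file sync_ocl_event() is used to make the non-local stream wait on the
 * misc_ops_and_local_H2D_done event (in gpu_copy_xq_to_gpu) and the local stream wait
 * on the nonlocal_done event (in gpu_launch_cpyback).
 */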
/*! \brief Launch asynchronously the xq buffer host to device copy. */
void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);

    /* local/nonlocal offset and length used for xq and f */
    int adat_begin, adat_len;

    cl_atomdata_t*      adat         = nb->atdat;
    gpu_plist*          plist        = nb->plist[iloc];
    cl_timers_t*        t            = nb->timers;
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];

    bool bDoTime = nb->bDoTime;

    /* Don't launch the non-local H2D copy if there is no dependent
       work to do: neither non-local nor other (e.g. bonded) work
       that has as input the nbnxn coordinates.
       Doing the same for the local kernel is more complicated, since the
       local part of the force array also depends on the non-local kernel.
       So to avoid complicating the code and to reduce the risk of bugs,
       we always call the local x+q copy (and the rest of the local
       work in nbnxn_gpu_launch_kernel()).
     */
    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
    {
        plist->haveFreshList = false;

        return;
    }

    /* calculate the atom data index range based on locality */
    if (atomLocality == AtomLocality::Local)
    {
        adat_begin = 0;
        adat_len   = adat->natoms_local;
    }
    else
    {
        adat_begin = adat->natoms_local;
        adat_len   = adat->natoms - adat->natoms_local;
    }

    /* beginning of timed HtoD section */
    if (bDoTime)
    {
        t->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
    }

    /* HtoD x, q */
    GMX_ASSERT(sizeof(float) == sizeof(*nbatom->x().data()),
               "The size of the xyzq buffer element should be equal to the size of float4.");
    copyToDeviceBuffer(&adat->xq, nbatom->x().data() + adat_begin * 4, adat_begin * 4, adat_len * 4,
                       deviceStream, GpuApiCallBehavior::Async,
                       bDoTime ? t->xf[atomLocality].nb_h2d.fetchNextEvent() : nullptr);
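    /* Note: the offset and length above are scaled by 4 because each atom occupies
       four floats (x, y, z, q) in the xq buffer. */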
    if (bDoTime)
    {
        t->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
    }

    /* When we get here all misc operations issued in the local stream as well as
       the local xq H2D are done, so we record that in the local stream and wait
       for it in the nonlocal one. */
    if (nb->bUseTwoStreams)
    {
        if (iloc == InteractionLocality::Local)
        {
            cl_int gmx_used_in_debug cl_error = clEnqueueMarkerWithWaitList(
                    deviceStream.stream(), 0, nullptr, &(nb->misc_ops_and_local_H2D_done));
            GMX_ASSERT(cl_error == CL_SUCCESS,
                       ("clEnqueueMarkerWithWaitList failed: " + ocl_get_error_string(cl_error)).c_str());

            /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
             * in the local stream in order to be able to sync with the above event
             * from the non-local stream.
             */
            cl_error = clFlush(deviceStream.stream());
            GMX_ASSERT(cl_error == CL_SUCCESS,
                       ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
        }
        else
        {
            sync_ocl_event(deviceStream.stream(), &(nb->misc_ops_and_local_H2D_done));
        }
    }
}
/*! \brief Launch GPU kernel

   As we execute nonbonded workload in separate queues, before launching
   the kernel we need to make sure that the following operations have completed:
   - atomdata allocation and related H2D transfers (every nstlist step);
   - pair list H2D transfer (every nstlist step);
   - shift vector H2D transfer (every nstlist step);
   - force (+shift force and energy) output clearing (every step).

   These operations are issued in the local queue at the beginning of the step
   and therefore always complete before the local kernel launch. The non-local
   kernel is launched after the local one on the same device/context, so it is
   inherently scheduled after the operations in the local stream (including the
   above "misc_ops").
   However, for the sake of having a future-proof implementation, we use the
   misc_ops_and_local_H2D_done event to record the point in time when the above
   operations are finished and synchronize with this event in the non-local stream.
 */
void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nbnxm::InteractionLocality iloc)
{
    cl_atomdata_t*      adat         = nb->atdat;
    NBParamGpu*         nbp          = nb->nbparam;
    gpu_plist*          plist        = nb->plist[iloc];
    cl_timers_t*        t            = nb->timers;
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];

    bool bDoTime = nb->bDoTime;

    cl_nbparam_params_t nbparams_params;

    /* Don't launch the non-local kernel if there is no work to do.
       Doing the same for the local kernel is more complicated, since the
       local part of the force array also depends on the non-local kernel.
       So to avoid complicating the code and to reduce the risk of bugs,
       we always call the local kernel and later (not in
       this function) the stream wait, local f copyback and the f buffer
       clearing. All these operations, except for the local interaction kernel,
       are needed for the non-local interactions. The skip of the local kernel
       call is taken care of later in this function. */
    if (canSkipNonbondedWork(*nb, iloc))
    {
        plist->haveFreshList = false;

        return;
    }

    if (nbp->useDynamicPruning && plist->haveFreshList)
    {
        /* Prunes for rlistOuter and rlistInner, sets plist->haveFreshList=false
           (that's the way the timing accounting can distinguish between
           separate prune kernel and combined force+prune).
         */
        Nbnxm::gpu_launch_kernel_pruneonly(nb, iloc, 1);
    }

    if (plist->nsci == 0)
    {
        /* Don't launch an empty local kernel (not allowed with OpenCL). */
        return;
    }
    /* beginning of timed nonbonded calculation section */
    if (bDoTime)
    {
        t->interaction[iloc].nb_k.openTimingRegion(deviceStream);
    }

    /* kernel launch config */

    KernelLaunchConfig config;
    config.sharedMemorySize = calc_shmem_required_nonbonded(nbp->vdwtype, nb->bPrefetchLjParam);
    config.blockSize[0]     = c_clSize;
    config.blockSize[1]     = c_clSize;
    config.gridSize[0]      = plist->nsci;

    validate_global_work_size(config, 3, &nb->deviceContext_->deviceInfo());

    if (debug)
    {
        fprintf(debug,
                "Non-bonded GPU launch configuration:\n\tLocal work size: %zux%zux%zu\n\t"
                "Global work size : %zux%zu\n\t#Super-clusters/clusters: %d/%d (%d)\n",
                config.blockSize[0], config.blockSize[1], config.blockSize[2],
                config.blockSize[0] * config.gridSize[0], config.blockSize[1] * config.gridSize[1],
                plist->nsci * c_nbnxnGpuNumClusterPerSupercluster,
                c_nbnxnGpuNumClusterPerSupercluster, plist->na_c);
    }

    fillin_ocl_structures(nbp, &nbparams_params);

    auto*          timingEvent  = bDoTime ? t->interaction[iloc].nb_k.fetchNextEvent() : nullptr;
    constexpr char kernelName[] = "k_calc_nb";
    const auto     kernel =
            select_nbnxn_kernel(nb, nbp->eeltype, nbp->vdwtype, stepWork.computeEnergy,
                                (plist->haveFreshList && !nb->timers->interaction[iloc].didPrune));

    // The OpenCL kernel takes int as second to last argument because bool is
    // not supported as a kernel argument type (sizeof(bool) is implementation defined).
    const int computeFshift = static_cast<int>(stepWork.computeVirial);
    if (useLjCombRule(nb->nbparam->vdwtype))
    {
        const auto kernelArgs = prepareGpuKernelArguments(
                kernel, config, &nbparams_params, &adat->xq, &adat->f, &adat->e_lj, &adat->e_el,
                &adat->fshift, &adat->lj_comb, &adat->shift_vec, &nbp->nbfp, &nbp->nbfp_comb,
                &nbp->coulomb_tab, &plist->sci, &plist->cj4, &plist->excl, &computeFshift);

        launchGpuKernel(kernel, config, deviceStream, timingEvent, kernelName, kernelArgs);
    }
    else
    {
        const auto kernelArgs = prepareGpuKernelArguments(
                kernel, config, &adat->ntypes, &nbparams_params, &adat->xq, &adat->f, &adat->e_lj,
                &adat->e_el, &adat->fshift, &adat->atom_types, &adat->shift_vec, &nbp->nbfp,
                &nbp->nbfp_comb, &nbp->coulomb_tab, &plist->sci, &plist->cj4, &plist->excl,
                &computeFshift);

        launchGpuKernel(kernel, config, deviceStream, timingEvent, kernelName, kernelArgs);
    }

    if (bDoTime)
    {
        t->interaction[iloc].nb_k.closeTimingRegion(deviceStream);
    }
}
/*! \brief Calculates the amount of shared memory required by the prune kernel.
 *
 *  Note that for the sake of simplicity we use the CUDA terminology "shared memory"
 *  for OpenCL local memory.
 *
 * \param[in] num_threads_z cj4 concurrency equal to the number of threads/work items in the 3rd dimension.
 * \returns   the amount of local memory in bytes required by the pruning kernel
 */
static inline int calc_shmem_required_prune(const int num_threads_z)
{
    int shmem;

    /* i-atom x in shared memory (for convenience we load all 4 components including q) */
    shmem = c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(float) * 4;
    /* cj in shared memory, for each warp separately
     * Note: only need to load once per wavefront, but to keep the code simple,
     * for now we load twice on AMD.
     */
    shmem += num_threads_z * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize * sizeof(int);
    /* Warp vote, requires one uint per warp/32 threads per block. */
    shmem += sizeof(cl_uint) * 2 * num_threads_z;

    return shmem;
}
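
/* Rough illustration, assuming c_clSize = 8, c_nbnxnGpuNumClusterPerSupercluster = 8,
 * c_nbnxnGpuJgroupSize = 4, c_nbnxnGpuClusterpairSplit = 2 and num_threads_z = 4
 * (the actual values come from the nbnxm pairlist constants and the launch config):
 *   xib:      8 * 8 * 4 * sizeof(float) = 1024 bytes
 *   cj:       4 * 2 * 4 * sizeof(int)   =  128 bytes
 *   warp_any: 2 * 4 * sizeof(cl_uint)   =   32 bytes
 * i.e. a bit over 1 KiB of local memory per work-group.
 */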
/*! \brief
 * Launch the pairlist prune only kernel for the given locality.
 * \p numParts tells in how many parts, i.e. calls, the list will be pruned.
 */
void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, const int numParts)
{
    cl_atomdata_t*      adat         = nb->atdat;
    NBParamGpu*         nbp          = nb->nbparam;
    gpu_plist*          plist        = nb->plist[iloc];
    cl_timers_t*        t            = nb->timers;
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
    bool                bDoTime      = nb->bDoTime;

    if (plist->haveFreshList)
    {
        GMX_ASSERT(numParts == 1, "With first pruning we expect 1 part");

        /* Set rollingPruningNumParts to signal that it is not set */
        plist->rollingPruningNumParts = 0;
        plist->rollingPruningPart     = 0;
    }
    else
    {
        if (plist->rollingPruningNumParts == 0)
        {
            plist->rollingPruningNumParts = numParts;
        }
        else
        {
            GMX_ASSERT(numParts == plist->rollingPruningNumParts,
                       "It is not allowed to change numParts in between list generation steps");
        }
    }
    /* Use a local variable for part and update in plist, so we can return here
     * without duplicating the part increment code.
     */
    int part = plist->rollingPruningPart;

    plist->rollingPruningPart++;
    if (plist->rollingPruningPart >= plist->rollingPruningNumParts)
    {
        plist->rollingPruningPart = 0;
    }

    /* Compute the number of list entries to prune in this pass */
    int numSciInPart = (plist->nsci - part) / numParts;
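    /* Example: with numParts = 2 the rolling pruning alternates parts 0, 1, 0, 1, ... on
       consecutive calls, each pass covering roughly nsci/2 of the list entries, so the
       whole list gets pruned over two invocations. */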
    /* Don't launch the kernel if there is no work to do. */
    if (numSciInPart <= 0)
    {
        plist->haveFreshList = false;

        return;
    }

    GpuRegionTimer* timer = nullptr;
    if (bDoTime)
    {
        timer = &(plist->haveFreshList ? t->interaction[iloc].prune_k : t->interaction[iloc].rollingPrune_k);
    }

    /* beginning of timed prune calculation section */
    if (bDoTime)
    {
        timer->openTimingRegion(deviceStream);
    }

    /* Kernel launch config:
     * - The thread block dimensions match the size of i-clusters, j-clusters,
     *   and j-cluster concurrency, in x, y, and z, respectively.
     * - The 1D block-grid contains as many blocks as super-clusters.
     */
    int num_threads_z = c_oclPruneKernelJ4ConcurrencyDEFAULT;

    /* kernel launch config */
    KernelLaunchConfig config;
    config.sharedMemorySize = calc_shmem_required_prune(num_threads_z);
    config.blockSize[0]     = c_clSize;
    config.blockSize[1]     = c_clSize;
    config.blockSize[2]     = num_threads_z;
    config.gridSize[0]      = numSciInPart;

    validate_global_work_size(config, 3, &nb->deviceContext_->deviceInfo());

    if (debug)
    {
        fprintf(debug,
                "Pruning GPU kernel launch configuration:\n\tLocal work size: %zux%zux%zu\n\t"
                "\tGlobal work size: %zux%zu\n\t#Super-clusters/clusters: %d/%d (%d)\n"
                "\tShMem: %zu\n",
                config.blockSize[0], config.blockSize[1], config.blockSize[2],
                config.blockSize[0] * config.gridSize[0], config.blockSize[1] * config.gridSize[1],
                plist->nsci * c_nbnxnGpuNumClusterPerSupercluster,
                c_nbnxnGpuNumClusterPerSupercluster, plist->na_c, config.sharedMemorySize);
    }

    cl_nbparam_params_t nbparams_params;
    fillin_ocl_structures(nbp, &nbparams_params);

    auto*          timingEvent  = bDoTime ? timer->fetchNextEvent() : nullptr;
    constexpr char kernelName[] = "k_pruneonly";
    const auto     pruneKernel  = selectPruneKernel(nb->kernel_pruneonly, plist->haveFreshList);
    const auto     kernelArgs   = prepareGpuKernelArguments(pruneKernel, config, &nbparams_params,
                                                        &adat->xq, &adat->shift_vec, &plist->sci,
                                                        &plist->cj4, &plist->imask, &numParts, &part);
    launchGpuKernel(pruneKernel, config, deviceStream, timingEvent, kernelName, kernelArgs);

    if (plist->haveFreshList)
    {
        plist->haveFreshList = false;
        /* Mark that pruning has been done */
        nb->timers->interaction[iloc].didPrune = true;
    }
    else
    {
        /* Mark that rolling pruning has been done */
        nb->timers->interaction[iloc].didRollingPrune = true;
    }

    if (bDoTime)
    {
        timer->closeTimingRegion(deviceStream);
    }
}
/*! \brief
 * Launch asynchronously the download of nonbonded forces from the GPU
 * (and energies/shift forces if required).
 */
void gpu_launch_cpyback(NbnxmGpu*                nb,
                        struct nbnxn_atomdata_t* nbatom,
                        const gmx::StepWorkload& stepWork,
                        const AtomLocality       aloc)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    cl_int gmx_unused cl_error;
    int               adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */

    /* determine interaction locality from atom locality */
    const InteractionLocality iloc = gpuAtomToInteractionLocality(aloc);

    cl_atomdata_t*      adat         = nb->atdat;
    cl_timers_t*        t            = nb->timers;
    bool                bDoTime      = nb->bDoTime;
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];

    /* don't launch non-local copy-back if there was no non-local work to do */
    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
    {
        /* TODO An alternative way to signal that non-local work is
           complete is to use a clEnqueueMarker+clEnqueueBarrier
           pair. However, the use of bNonLocalStreamActive has the
           advantage of being local to the host, so probably minimizes
           overhead. Curiously, for NVIDIA OpenCL with an empty-domain
           test case, overall simulation performance was higher with
           the API calls, but this has not been tested on AMD OpenCL,
           so could be worth considering in future. */
        nb->bNonLocalStreamActive = CL_FALSE;
        return;
    }

    getGpuAtomRange(adat, aloc, &adat_begin, &adat_len);

    /* beginning of timed D2H section */
    if (bDoTime)
    {
        t->xf[aloc].nb_d2h.openTimingRegion(deviceStream);
    }

    /* With DD the local D2H transfer can only start after the non-local
       has been launched. */
    if (iloc == InteractionLocality::Local && nb->bNonLocalStreamActive)
    {
        sync_ocl_event(deviceStream.stream(), &(nb->nonlocal_done));
    }
    /* DtoH f */
    GMX_ASSERT(sizeof(*nbatom->out[0].f.data()) == sizeof(float),
               "The host force buffer should be in single precision to match device data size.");
    copyFromDeviceBuffer(&nbatom->out[0].f.data()[adat_begin * DIM], &adat->f, adat_begin * DIM,
                         adat_len * DIM, deviceStream, GpuApiCallBehavior::Async,
                         bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);

    /* kick off work */
    cl_error = clFlush(deviceStream.stream());
    GMX_ASSERT(cl_error == CL_SUCCESS, ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());

    /* After the non-local D2H is launched the nonlocal_done event can be
       recorded which signals that the local D2H can proceed. This event is not
       placed after the non-local kernel because we need the non-local
       data back first. */
    if (iloc == InteractionLocality::NonLocal)
    {
        cl_error = clEnqueueMarkerWithWaitList(deviceStream.stream(), 0, nullptr, &(nb->nonlocal_done));
        GMX_ASSERT(cl_error == CL_SUCCESS,
                   ("clEnqueueMarkerWithWaitList failed: " + ocl_get_error_string(cl_error)).c_str());
        nb->bNonLocalStreamActive = CL_TRUE;
    }
    /* only transfer energies in the local stream */
    if (iloc == InteractionLocality::Local)
    {
        /* DtoH fshift when virial is needed */
        if (stepWork.computeVirial)
        {
            GMX_ASSERT(sizeof(*nb->nbst.fshift) == DIM * sizeof(float),
                       "Sizes of host- and device-side shift vector elements should be the same.");
            copyFromDeviceBuffer(reinterpret_cast<float*>(nb->nbst.fshift), &adat->fshift, 0,
                                 SHIFTS * DIM, deviceStream, GpuApiCallBehavior::Async,
                                 bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
        }

        /* DtoH energies */
        if (stepWork.computeEnergy)
        {
            GMX_ASSERT(sizeof(*nb->nbst.e_lj) == sizeof(float),
                       "Sizes of host- and device-side LJ energy terms should be the same.");
            copyFromDeviceBuffer(nb->nbst.e_lj, &adat->e_lj, 0, 1, deviceStream,
                                 GpuApiCallBehavior::Async,
                                 bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);

            GMX_ASSERT(sizeof(*nb->nbst.e_el) == sizeof(float),
                       "Sizes of host- and device-side electrostatic energy terms should be the same.");
            copyFromDeviceBuffer(nb->nbst.e_el, &adat->e_el, 0, 1, deviceStream,
                                 GpuApiCallBehavior::Async,
                                 bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
        }
    }

    if (bDoTime)
    {
        t->xf[aloc].nb_d2h.closeTimingRegion(deviceStream);
    }
}

} // namespace Nbnxm