/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *  \brief Define OpenCL implementation of nbnxn_gpu.h
 *
 *  \author Anca Hamuraru <anca@streamcomputing.eu>
 *  \author Teemu Virolainen <teemu@streamcomputing.eu>
 *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
 *  \author Szilárd Páll <pall.szilard@gmail.com>
 *  \ingroup module_mdlib
 *
 *  TODO (psz):
 *  - Add a static const cl_uint c_pruneKernelWorkDim / c_nbnxnKernelWorkDim = 3;
 *  - Rework the copying of OCL data structures done before every invocation of both
 *    nb and prune kernels (using fillin_ocl_structures); also consider at the same
 *    time calling clSetKernelArg only on the updated parameters (if tracking changed
 *    parameters is feasible);
 *  - Consider using the event_wait_list argument to clEnqueueNDRangeKernel to mark
 *    dependencies on the kernel launched: e.g. the non-local nb kernel's dependency
 *    on the misc_ops_and_local_H2D_done event could be better expressed this way.
 *
 *  - Consider extracting common sections of the OpenCL and CUDA nbnxn logic, e.g.:
 *    - in nbnxn_gpu_launch_kernel_pruneonly() the pre- and post-kernel launch logic
 *      is identical in the two implementations, so a 3-way split might allow sharing
 *      code;
 */

#include "gmxpre.h"

#include <assert.h>
#include <stdlib.h>

#if defined(_MSVC)
#include <limits>
#endif

#include "thread_mpi/atomic.h"

#include "gromacs/gpu_utils/oclutils.h"
#include "gromacs/hardware/hw_info.h"
#include "gromacs/mdlib/force_flags.h"
#include "gromacs/mdlib/nb_verlet.h"
#include "gromacs/mdlib/nbnxn_consts.h"
#include "gromacs/mdlib/nbnxn_gpu.h"
#include "gromacs/mdlib/nbnxn_gpu_common.h"
#include "gromacs/mdlib/nbnxn_gpu_common_utils.h"
#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
#include "gromacs/mdlib/nbnxn_pairlist.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"

#include "nbnxn_ocl_internal.h"
#include "nbnxn_ocl_types.h"

/*! \brief Convenience constants */
//@{
static const int c_numClPerSupercl = c_nbnxnGpuNumClusterPerSupercluster;
static const int c_clSize          = c_nbnxnGpuClusterSize;
//@}

/* Uncomment this define to enable kernel debugging */
//#define DEBUG_OCL

/*! \brief Specifies which kernel run to debug */
#define DEBUG_RUN_STEP 2

/*! \brief Validates the input global work size parameter.
 */
static inline void validate_global_work_size(size_t *global_work_size, int work_dim, const gmx_device_info_t *dinfo)
{
    cl_uint device_size_t_size_bits;
    cl_uint host_size_t_size_bits;

    assert(dinfo);

    /* Each component of a global_work_size must not exceed the range given by the
       sizeof(device size_t) for the device on which the kernel execution will
       be enqueued. See:
       https://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/clEnqueueNDRangeKernel.html
     */
    device_size_t_size_bits = dinfo->adress_bits;
    host_size_t_size_bits   = (cl_uint)(sizeof(size_t) * 8);

    /* If sizeof(host size_t) <= sizeof(device size_t)
            => global_work_size components will always be valid
       else
            => get device limit for global work size and
               compare it against each component of global_work_size.
     */
    if (host_size_t_size_bits > device_size_t_size_bits)
    {
        size_t device_limit;

        device_limit = (((size_t)1) << device_size_t_size_bits) - 1;

        for (int i = 0; i < work_dim; i++)
        {
            if (global_work_size[i] > device_limit)
            {
                gmx_fatal(FARGS, "Watch out, the input system is too large to simulate!\n"
                          "The number of nonbonded work units (=number of super-clusters) exceeds the "
                          "device capabilities. Global work size limit exceeded (%zu > %zu)!",
                          global_work_size[i], device_limit);
            }
        }
    }
}

/* Constant arrays listing non-bonded kernel function names. The arrays are
 * organized in 2-dim arrays by: electrostatics and VDW type.
 *
 * Note that the row- and column-order of function names has to match the
 * order of corresponding enumerated electrostatics and vdw types, resp.,
 * defined in nbnxn_ocl_types.h.
 */
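
/* All kernel names follow the pattern
 *   nbnxn_kernel_Elec<eeltype>_Vdw<vdwtype>_<F|VF>[_prune]_opencl
 * where the F flavors compute forces only, the VF flavors also accumulate
 * energies, and the _prune flavors additionally prune the pair list while
 * computing interactions. */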

/*! \brief Force-only kernel function names. */
static const char* nb_kfunc_noener_noprune_ptr[eelOclNR][evdwOclNR] =
{
    { "nbnxn_kernel_ElecCut_VdwLJ_F_opencl", "nbnxn_kernel_ElecCut_VdwLJCombGeom_F_opencl", "nbnxn_kernel_ElecCut_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecCut_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecCut_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_opencl" },
    { "nbnxn_kernel_ElecRF_VdwLJ_F_opencl", "nbnxn_kernel_ElecRF_VdwLJCombGeom_F_opencl", "nbnxn_kernel_ElecRF_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecRF_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecRF_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_opencl" },
    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_opencl" },
    { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_opencl" },
    { "nbnxn_kernel_ElecEw_VdwLJ_F_opencl", "nbnxn_kernel_ElecEw_VdwLJCombGeom_F_opencl", "nbnxn_kernel_ElecEw_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecEw_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecEw_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_opencl" },
    { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_opencl" }
};

/*! \brief Force + energy kernel function names. */
static const char* nb_kfunc_ener_noprune_ptr[eelOclNR][evdwOclNR] =
{
    { "nbnxn_kernel_ElecCut_VdwLJ_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJCombGeom_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_opencl" },
    { "nbnxn_kernel_ElecRF_VdwLJ_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJCombGeom_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_opencl" },
    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_opencl" },
    { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_opencl" },
    { "nbnxn_kernel_ElecEw_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_opencl" },
    { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_VF_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_opencl" }
};

/*! \brief Force + pruning kernel function names. */
static const char* nb_kfunc_noener_prune_ptr[eelOclNR][evdwOclNR] =
{
    { "nbnxn_kernel_ElecCut_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJCombGeom_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJCombLB_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_prune_opencl" },
    { "nbnxn_kernel_ElecRF_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJCombGeom_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJCombLB_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_prune_opencl" },
    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_F_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_prune_opencl" },
    { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_prune_opencl" },
    { "nbnxn_kernel_ElecEw_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJCombLB_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_prune_opencl" },
    { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_F_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_prune_opencl" }
};

/*! \brief Force + energy + pruning kernel function names. */
static const char* nb_kfunc_ener_prune_ptr[eelOclNR][evdwOclNR] =
{
    { "nbnxn_kernel_ElecCut_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJCombLB_VF_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_prune_opencl" },
    { "nbnxn_kernel_ElecRF_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJCombLB_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_prune_opencl" },
    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_prune_opencl" },
    { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_prune_opencl" },
    { "nbnxn_kernel_ElecEw_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJCombLB_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_prune_opencl" },
    { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_VF_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_prune_opencl" }
};

/*! \brief Return a pointer to the prune kernel version to be executed at the current invocation.
 *
 * \param[in] kernel_pruneonly  array of prune kernel objects
 * \param[in] firstPrunePass    true if the first pruning pass is being executed
 */
static inline cl_kernel selectPruneKernel(cl_kernel kernel_pruneonly[],
                                          bool      firstPrunePass)
{
    cl_kernel *kernelPtr;

    if (firstPrunePass)
    {
        kernelPtr = &(kernel_pruneonly[epruneFirst]);
    }
    else
    {
        kernelPtr = &(kernel_pruneonly[epruneRolling]);
    }
    // TODO: consider creating the prune kernel object here to avoid a
    // clCreateKernel for the rolling prune kernel if this is not needed.
    return *kernelPtr;
}

/*! \brief Return a pointer to the kernel version to be executed at the current step.
 *  OpenCL kernel objects are cached in nb. If the requested kernel is not
 *  found in the cache, it will be created and the cache will be updated.
 */
static inline cl_kernel select_nbnxn_kernel(gmx_nbnxn_ocl_t *nb,
                                            int              eeltype,
                                            int              evdwtype,
                                            bool             bDoEne,
                                            bool             bDoPrune)
{
    const char* kernel_name_to_run;
    cl_kernel  *kernel_ptr;
    cl_int      cl_error;

    assert(eeltype  < eelOclNR);
    assert(evdwtype < evdwOclNR);

    if (bDoEne)
    {
        if (bDoPrune)
        {
            kernel_name_to_run = nb_kfunc_ener_prune_ptr[eeltype][evdwtype];
            kernel_ptr         = &(nb->kernel_ener_prune_ptr[eeltype][evdwtype]);
        }
        else
        {
            kernel_name_to_run = nb_kfunc_ener_noprune_ptr[eeltype][evdwtype];
            kernel_ptr         = &(nb->kernel_ener_noprune_ptr[eeltype][evdwtype]);
        }
    }
    else
    {
        if (bDoPrune)
        {
            kernel_name_to_run = nb_kfunc_noener_prune_ptr[eeltype][evdwtype];
            kernel_ptr         = &(nb->kernel_noener_prune_ptr[eeltype][evdwtype]);
        }
        else
        {
            kernel_name_to_run = nb_kfunc_noener_noprune_ptr[eeltype][evdwtype];
            kernel_ptr         = &(nb->kernel_noener_noprune_ptr[eeltype][evdwtype]);
        }
    }

    if (NULL == kernel_ptr[0])
    {
        *kernel_ptr = clCreateKernel(nb->dev_rundata->program, kernel_name_to_run, &cl_error);
        assert(cl_error == CL_SUCCESS);
        // TODO: handle errors
    }

    return *kernel_ptr;
}

/*! \brief Calculates the amount of shared memory required by the nonbonded kernel in use.
 */
static inline int calc_shmem_required_nonbonded(int  vdwType,
                                                bool bPrefetchLjParam)
{
    int shmem;

    /* size of shmem (force-buffers/xq/atom type preloading) */
    /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
    /* i-atom x+q in shared memory */
    shmem  = c_numClPerSupercl * c_clSize * sizeof(float) * 4; /* xqib */
    /* cj in shared memory, for both warps separately */
    shmem += 2 * c_nbnxnGpuJgroupSize * sizeof(int);           /* cjs  */
    if (bPrefetchLjParam)
    {
        if (useLjCombRule(vdwType))
        {
            /* i-atom LJ combination parameters in shared memory */
            shmem += c_numClPerSupercl * c_clSize * 2*sizeof(float); /* atib abused for ljcp, float2 */
        }
        else
        {
            /* i-atom types in shared memory */
            shmem += c_numClPerSupercl * c_clSize * sizeof(int); /* atib */
        }
    }
    /* force reduction buffers in shared memory */
    shmem += c_clSize * c_clSize * 3 * sizeof(float); /* f_buf */
    /* Warp vote; strictly this should be sizeof(cl_uint) times the number of warps in the work-group. */
    shmem += sizeof(cl_uint) * 2;                     /* warp_any */

    return shmem;
}

/*! \brief Initializes data structures that are going to be sent to the OpenCL device.
 *
 *  The device can't use the same data structures as the host for two main reasons:
 *  - OpenCL restrictions (pointers are not accepted inside data structures)
 *  - some host side fields are not needed for the OpenCL kernels.
 *
 *  This function is called before the launch of both nbnxn and prune kernels.
 */
static void fillin_ocl_structures(cl_nbparam_t        *nbp,
                                  cl_nbparam_params_t *nbparams_params)
{
    nbparams_params->coulomb_tab_scale = nbp->coulomb_tab_scale;
    nbparams_params->c_rf              = nbp->c_rf;
    nbparams_params->dispersion_shift  = nbp->dispersion_shift;
    nbparams_params->eeltype           = nbp->eeltype;
    nbparams_params->epsfac            = nbp->epsfac;
    nbparams_params->ewaldcoeff_lj     = nbp->ewaldcoeff_lj;
    nbparams_params->ewald_beta        = nbp->ewald_beta;
    nbparams_params->rcoulomb_sq       = nbp->rcoulomb_sq;
    nbparams_params->repulsion_shift   = nbp->repulsion_shift;
    nbparams_params->rlistOuter_sq     = nbp->rlistOuter_sq;
    nbparams_params->rvdw_sq           = nbp->rvdw_sq;
    nbparams_params->rlistInner_sq     = nbp->rlistInner_sq;
    nbparams_params->rvdw_switch       = nbp->rvdw_switch;
    nbparams_params->sh_ewald          = nbp->sh_ewald;
    nbparams_params->sh_lj_ewald       = nbp->sh_lj_ewald;
    nbparams_params->two_k_rf          = nbp->two_k_rf;
    nbparams_params->vdwtype           = nbp->vdwtype;
    nbparams_params->vdw_switch        = nbp->vdw_switch;
}

/*! \brief Enqueues a wait for event completion.
 *
 *  Then it releases the event and sets it to 0.
 *  Don't use this function when more than one wait will be issued for the event.
 *  Equivalent to a CUDA stream synchronization. */
static void sync_ocl_event(cl_command_queue stream, cl_event *ocl_event)
{
    cl_int gmx_unused cl_error;

    /* Enqueue wait */
#ifdef CL_VERSION_1_2
    cl_error = clEnqueueBarrierWithWaitList(stream, 1, ocl_event, NULL);
#else
    cl_error = clEnqueueWaitForEvents(stream, 1, ocl_event);
#endif

    GMX_RELEASE_ASSERT(CL_SUCCESS == cl_error, ocl_get_error_string(cl_error).c_str());

    /* Release event and reset it to 0. It is ok to release it, as the enqueued wait performs an implicit retain on the event. */
    cl_error = clReleaseEvent(*ocl_event);
    assert(CL_SUCCESS == cl_error);
    *ocl_event = 0;
}

/*! \brief Launch GPU kernel

   As we execute nonbonded workload in separate queues, before launching
   the kernel we need to make sure that the following operations have completed:
   - atomdata allocation and related H2D transfers (every nstlist step);
   - pair list H2D transfer (every nstlist step);
   - shift vector H2D transfer (every nstlist step);
   - force (+shift force and energy) output clearing (every step).

   These operations are issued in the local queue at the beginning of the step
   and therefore always complete before the local kernel launch. The non-local
   kernel is launched after the local one on the same device/context, so it is
   inherently scheduled after the operations in the local stream (including the
   above "misc_ops").
   However, for the sake of having a future-proof implementation, we use the
   misc_ops_done event to record the point in time when the above operations
   are finished and synchronize with this event in the non-local stream.
 */
void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t               *nb,
                             const struct nbnxn_atomdata_t *nbatom,
                             int                            flags,
                             int                            iloc)
{
    cl_int               cl_error;
    int                  adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
    /* OpenCL kernel launch-related stuff */
    int                  shmem;
    size_t               local_work_size[3], global_work_size[3];
    cl_kernel            nb_kernel = NULL; /* fn pointer to the nonbonded kernel */

    cl_atomdata_t       *adat   = nb->atdat;
    cl_nbparam_t        *nbp    = nb->nbparam;
    cl_plist_t          *plist  = nb->plist[iloc];
    cl_timers_t         *t      = nb->timers;
    cl_command_queue     stream = nb->stream[iloc];

    bool                 bCalcEner   = flags & GMX_FORCE_ENERGY;
    int                  bCalcFshift = flags & GMX_FORCE_VIRIAL;
    bool                 bDoTime     = nb->bDoTime;
    cl_uint              arg_no;

    cl_nbparam_params_t  nbparams_params;
#ifdef DEBUG_OCL
    float               *debug_buffer_h;
    size_t               debug_buffer_size;
#endif

    /* Don't launch the non-local kernel if there is no work to do.
       Doing the same for the local kernel is more complicated, since the
       local part of the force array also depends on the non-local kernel.
       So to avoid complicating the code and to reduce the risk of bugs,
       we always call the local kernel, the local x+q copy and later (not in
       this function) the stream wait, local f copyback and the f buffer
       clearing. All these operations, except for the local interaction kernel,
       are needed for the non-local interactions. The skip of the local kernel
       call is taken care of later in this function. */
    if (canSkipWork(nb, iloc))
    {
        plist->haveFreshList = false;

        return;
    }

    /* calculate the atom data index range based on locality */
    if (LOCAL_I(iloc))
    {
        adat_begin = 0;
        adat_len   = adat->natoms_local;
    }
    else
    {
        adat_begin = adat->natoms_local;
        adat_len   = adat->natoms - adat->natoms_local;
    }

    /* beginning of timed HtoD section */
    if (bDoTime)
    {
        t->nb_h2d[iloc].openTimingRegion(stream);
    }

    /* HtoD x, q */
    ocl_copy_H2D_async(adat->xq, nbatom->x + adat_begin * 4, adat_begin*sizeof(float)*4,
                       adat_len * sizeof(float) * 4, stream, bDoTime ? t->nb_h2d[iloc].fetchNextEvent() : nullptr);

    if (bDoTime)
    {
        t->nb_h2d[iloc].closeTimingRegion(stream);
    }

    /* When we get here, all misc operations issued in the local stream as well as
       the local xq H2D are done, so we record that in the local stream and wait
       for it in the nonlocal one. */
    if (nb->bUseTwoStreams)
    {
        if (iloc == eintLocal)
        {
#ifdef CL_VERSION_1_2
            cl_error = clEnqueueMarkerWithWaitList(stream, 0, NULL, &(nb->misc_ops_and_local_H2D_done));
#else
            cl_error = clEnqueueMarker(stream, &(nb->misc_ops_and_local_H2D_done));
#endif
            assert(CL_SUCCESS == cl_error);

            /* Based on section 5.13 of the OpenCL v1.2 spec, a flush is needed
             * in the local stream in order to be able to sync with the above event
             * from the non-local stream.
             */
            cl_error = clFlush(stream);
            assert(CL_SUCCESS == cl_error);
        }
        else
        {
            sync_ocl_event(stream, &(nb->misc_ops_and_local_H2D_done));
        }
    }

    if (nbp->useDynamicPruning && plist->haveFreshList)
    {
        /* Prunes for rlistOuter and rlistInner, sets plist->haveFreshList=false
           (that's the way the timing accounting can distinguish between
           separate prune kernel and combined force+prune).
         */
        nbnxn_gpu_launch_kernel_pruneonly(nb, iloc, 1);
    }
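
    /* With dynamic pruning, the nonbonded kernel launched below will operate on
       the pruned (inner) pair list produced by the prune kernel just launched. */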

    if (plist->nsci == 0)
    {
        /* Don't launch an empty local kernel (this is not allowed with OpenCL).
         * TODO: Separate H2D and kernel launch into separate functions.
         */
        return;
    }

    /* beginning of timed nonbonded calculation section */
    if (bDoTime)
    {
        t->nb_k[iloc].openTimingRegion(stream);
    }

    /* get the pointer to the kernel flavor we need to use */
    nb_kernel = select_nbnxn_kernel(nb,
                                    nbp->eeltype,
                                    nbp->vdwtype,
                                    bCalcEner,
                                    (plist->haveFreshList && !nb->timers->didPrune[iloc]));

    /* kernel launch config */
    local_work_size[0] = c_clSize;
    local_work_size[1] = c_clSize;
    local_work_size[2] = 1;

    global_work_size[0] = plist->nsci * local_work_size[0];
    global_work_size[1] = 1 * local_work_size[1];
    global_work_size[2] = 1 * local_work_size[2];
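
    /* One c_clSize x c_clSize work-group is launched per super-cluster (pair-list
       entry), i.e. the number of work-groups along x equals plist->nsci. */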

    validate_global_work_size(global_work_size, 3, nb->dev_info);

    shmem = calc_shmem_required_nonbonded(nbp->vdwtype, nb->bPrefetchLjParam);

#ifdef DEBUG_OCL
    {
        static int run_step = 1;

        if (DEBUG_RUN_STEP == run_step)
        {
            debug_buffer_size = global_work_size[0] * global_work_size[1] * global_work_size[2] * sizeof(float);
            debug_buffer_h    = (float*)calloc(1, debug_buffer_size);
            assert(NULL != debug_buffer_h);

            if (NULL == nb->debug_buffer)
            {
                nb->debug_buffer = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                                  debug_buffer_size, debug_buffer_h, &cl_error);

                assert(CL_SUCCESS == cl_error);
            }
        }

        run_step++;
    }
#endif
    if (debug)
    {
        fprintf(debug, "Non-bonded GPU launch configuration:\n\tLocal work size: %dx%dx%d\n\t"
                "Global work size : %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n",
                (int)(local_work_size[0]), (int)(local_work_size[1]), (int)(local_work_size[2]),
                (int)(global_work_size[0]), (int)(global_work_size[1]), plist->nsci*c_numClPerSupercl,
                c_numClPerSupercl, plist->na_c);
    }

    fillin_ocl_structures(nbp, &nbparams_params);
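
    /* Set the kernel arguments; the argument order below has to match the
       parameter order of the selected nonbonded kernel. */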

    arg_no    = 0;
    cl_error  = CL_SUCCESS;
    if (!useLjCombRule(nb->nbparam->vdwtype))
    {
        cl_error  = clSetKernelArg(nb_kernel, arg_no++, sizeof(int), &(adat->ntypes));
    }
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(nbparams_params), &(nbparams_params));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->xq));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->f));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->e_lj));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->e_el));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->fshift));
    if (useLjCombRule(nb->nbparam->vdwtype))
    {
        cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->lj_comb));
    }
    else
    {
        cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->atom_types));
    }
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->shift_vec));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->nbfp_climg2d));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->nbfp_comb_climg2d));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->coulomb_tab_climg2d));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->sci));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->cj4));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->excl));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(int), &bCalcFshift);
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, shmem, NULL);
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nb->debug_buffer));

    assert(cl_error == CL_SUCCESS);

    if (cl_error)
    {
        printf("OpenCL error: %s\n", ocl_get_error_string(cl_error).c_str());
    }

    cl_error = clEnqueueNDRangeKernel(stream, nb_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, bDoTime ? t->nb_k[iloc].fetchNextEvent() : nullptr);
    assert(cl_error == CL_SUCCESS);

    if (bDoTime)
    {
        t->nb_k[iloc].closeTimingRegion(stream);
    }

#ifdef DEBUG_OCL
    {
        static int run_step = 1;

        if (DEBUG_RUN_STEP == run_step)
        {
            FILE *pf;
            char  file_name[256] = {0};

            ocl_copy_D2H_async(debug_buffer_h, nb->debug_buffer, 0,
                               debug_buffer_size, stream, NULL);

            // Make sure all data has been transferred back from the device
            clFinish(stream);

            printf("\nWriting debug_buffer to debug_buffer_ocl.txt...");

            sprintf(file_name, "debug_buffer_ocl_%d.txt", DEBUG_RUN_STEP);
            pf = fopen(file_name, "wt");
            assert(pf != NULL);

            fprintf(pf, "%20s", "");
            for (int j = 0; j < global_work_size[0]; j++)
            {
                char label[20];
                sprintf(label, "(wIdx=%2d thIdx=%2d)", j / local_work_size[0], j % local_work_size[0]);
                fprintf(pf, "%20s", label);
            }

            for (int i = 0; i < global_work_size[1]; i++)
            {
                char label[20];
                sprintf(label, "(wIdy=%2d thIdy=%2d)", i / local_work_size[1], i % local_work_size[1]);
                fprintf(pf, "\n%20s", label);

                for (int j = 0; j < global_work_size[0]; j++)
                {
                    fprintf(pf, "%20.5f", debug_buffer_h[i * global_work_size[0] + j]);
                }

                //fprintf(pf, "\n");
            }

            fclose(pf);

            printf(" done.\n");

            free(debug_buffer_h);
            debug_buffer_h = NULL;
        }

        run_step++;
    }
#endif
}

/*! \brief Calculates the amount of shared memory required by the prune kernel.
 *
 *  Note that for the sake of simplicity we use the CUDA terminology "shared memory"
 *  for OpenCL local memory.
 *
 * \param[in] num_threads_z cj4 concurrency equal to the number of threads/work items in the 3-rd dimension.
 * \returns   the amount of local memory in bytes required by the pruning kernel
 */
static inline int calc_shmem_required_prune(const int num_threads_z)
{
    int shmem;

    /* i-atom x in shared memory (for convenience we load all 4 components including q) */
    shmem  = c_numClPerSupercl * c_clSize * sizeof(float)*4;
    /* cj in shared memory, for each warp separately */
    shmem += num_threads_z * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize * sizeof(int);
    /* Warp vote, requires one uint per warp/32 threads per block. */
    shmem += sizeof(cl_uint) * 2*num_threads_z;

    return shmem;
}

void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t *nb,
                                       int              iloc,
                                       int              numParts)
{
    cl_int               cl_error;

    cl_atomdata_t       *adat    = nb->atdat;
    cl_nbparam_t        *nbp     = nb->nbparam;
    cl_plist_t          *plist   = nb->plist[iloc];
    cl_timers_t         *t       = nb->timers;
    cl_command_queue     stream  = nb->stream[iloc];
    bool                 bDoTime = nb->bDoTime;

    if (plist->haveFreshList)
    {
        GMX_ASSERT(numParts == 1, "With first pruning we expect 1 part");

        /* Set rollingPruningNumParts to signal that it is not set */
        plist->rollingPruningNumParts = 0;
        plist->rollingPruningPart     = 0;
    }
    else
    {
        if (plist->rollingPruningNumParts == 0)
        {
            plist->rollingPruningNumParts = numParts;
        }
        else
        {
            GMX_ASSERT(numParts == plist->rollingPruningNumParts, "It is not allowed to change numParts in between list generation steps");
        }
    }

    /* Use a local variable for part and update in plist, so we can return here
     * without duplicating the part increment code.
     */
    int part = plist->rollingPruningPart;

    plist->rollingPruningPart++;
    if (plist->rollingPruningPart >= plist->rollingPruningNumParts)
    {
        plist->rollingPruningPart = 0;
    }

    /* Compute the number of list entries to prune in this pass */
    int numSciInPart = (plist->nsci - part)/numParts;
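    /* The per-pass count above corresponds to the prune kernel processing every
       numParts-th list entry, starting at offset part. */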

    /* Don't launch the kernel if there is no work to do. */
    if (numSciInPart <= 0)
    {
        plist->haveFreshList = false;

        return;
    }

    GpuRegionTimer *timer = nullptr;
    if (bDoTime)
    {
        timer = &(plist->haveFreshList ? t->prune_k[iloc] : t->rollingPrune_k[iloc]);
    }

    /* beginning of timed prune calculation section */
    if (bDoTime)
    {
        timer->openTimingRegion(stream);
    }

    /* Kernel launch config:
     * - The thread block dimensions match the size of i-clusters, j-clusters,
     *   and j-cluster concurrency, in x, y, and z, respectively.
     * - The 1D block-grid contains as many blocks as super-clusters.
     */
    int       num_threads_z = getOclPruneKernelJ4Concurrency(nb->dev_info->vendor_e);
    cl_kernel pruneKernel   = selectPruneKernel(nb->kernel_pruneonly, plist->haveFreshList);

    /* kernel launch config */
    size_t local_work_size[3], global_work_size[3];
    local_work_size[0] = c_clSize;
    local_work_size[1] = c_clSize;
    local_work_size[2] = num_threads_z;

    global_work_size[0] = numSciInPart * local_work_size[0];
    global_work_size[1] = 1 * local_work_size[1];
    global_work_size[2] = 1 * local_work_size[2];

    validate_global_work_size(global_work_size, 3, nb->dev_info);

    int shmem = calc_shmem_required_prune(num_threads_z);

    if (debug)
    {
        fprintf(debug, "Pruning GPU kernel launch configuration:\n\tLocal work size: %dx%dx%d\n\t"
                "\tGlobal work size: %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n"
                "\tShMem: %d\n",
                (int)(local_work_size[0]), (int)(local_work_size[1]), (int)(local_work_size[2]),
                (int)(global_work_size[0]), (int)(global_work_size[1]), plist->nsci*c_numClPerSupercl,
                c_numClPerSupercl, plist->na_c, shmem);
    }

    cl_nbparam_params_t  nbparams_params;
    fillin_ocl_structures(nbp, &nbparams_params);
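
    /* As with the nonbonded kernel, the argument order below has to match the
       prune kernel's parameter list. */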

    cl_uint  arg_no = 0;
    cl_error = CL_SUCCESS;

    cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(nbparams_params), &(nbparams_params));
    cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(cl_mem), &(adat->xq));
    cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(cl_mem), &(adat->shift_vec));
    cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(cl_mem), &(plist->sci));
    cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(cl_mem), &(plist->cj4));
    cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(cl_mem), &(plist->imask));
    cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(int), &(numParts));
    cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(int), &(part));
    cl_error |= clSetKernelArg(pruneKernel, arg_no++, shmem, nullptr);
    assert(cl_error == CL_SUCCESS);

    cl_error = clEnqueueNDRangeKernel(stream, pruneKernel, 3,
                                      nullptr, global_work_size, local_work_size,
                                      0, nullptr, bDoTime ? timer->fetchNextEvent() : nullptr);
    GMX_RELEASE_ASSERT(CL_SUCCESS == cl_error, ocl_get_error_string(cl_error).c_str());

    if (plist->haveFreshList)
    {
        plist->haveFreshList = false;
        /* Mark that pruning has been done */
        nb->timers->didPrune[iloc] = true;
    }
    else
    {
        /* Mark that rolling pruning has been done */
        nb->timers->didRollingPrune[iloc] = true;
    }

    if (bDoTime)
    {
        timer->closeTimingRegion(stream);
    }
}

/*! \brief
 * Launch asynchronously the download of nonbonded forces from the GPU
 * (and energies/shift forces if required).
 */
void nbnxn_gpu_launch_cpyback(gmx_nbnxn_ocl_t               *nb,
                              const struct nbnxn_atomdata_t *nbatom,
                              int                            flags,
                              int                            aloc)
{
    cl_int gmx_unused cl_error;
    int               adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */

    /* determine interaction locality from atom locality */
    int               iloc = gpuAtomToInteractionLocality(aloc);

    cl_atomdata_t    *adat    = nb->atdat;
    cl_timers_t      *t       = nb->timers;
    bool              bDoTime = nb->bDoTime;
    cl_command_queue  stream  = nb->stream[iloc];

    bool              bCalcEner   = flags & GMX_FORCE_ENERGY;
    int               bCalcFshift = flags & GMX_FORCE_VIRIAL;


    /* don't launch non-local copy-back if there was no non-local work to do */
    if (canSkipWork(nb, iloc))
    {
        /* TODO An alternative way to signal that non-local work is
           complete is to use a clEnqueueMarker+clEnqueueBarrier
           pair. However, the use of bNonLocalStreamActive has the
           advantage of being local to the host, so probably minimizes
           overhead. Curiously, for NVIDIA OpenCL with an empty-domain
           test case, overall simulation performance was higher with
           the API calls, but this has not been tested on AMD OpenCL,
           so it could be worth considering in the future. */
        nb->bNonLocalStreamActive = false;
        return;
    }

    getGpuAtomRange(adat, aloc, adat_begin, adat_len);

    /* beginning of timed D2H section */
    if (bDoTime)
    {
        t->nb_d2h[iloc].openTimingRegion(stream);
    }

    /* With DD the local D2H transfer can only start after the non-local
       has been launched. */
    if (iloc == eintLocal && nb->bNonLocalStreamActive)
    {
        sync_ocl_event(stream, &(nb->nonlocal_done));
    }

    /* DtoH f */
    ocl_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f, adat_begin*3*sizeof(float),
                       (adat_len)* adat->f_elem_size, stream, bDoTime ? t->nb_d2h[iloc].fetchNextEvent() : nullptr);

    /* kick off work */
    cl_error = clFlush(stream);
    assert(CL_SUCCESS == cl_error);

    /* After the non-local D2H is launched the nonlocal_done event can be
       recorded which signals that the local D2H can proceed. This event is not
       placed after the non-local kernel because we first need the non-local
       data back. */
    if (iloc == eintNonlocal)
    {
#ifdef CL_VERSION_1_2
        cl_error = clEnqueueMarkerWithWaitList(stream, 0, NULL, &(nb->nonlocal_done));
#else
        cl_error = clEnqueueMarker(stream, &(nb->nonlocal_done));
#endif
        assert(CL_SUCCESS == cl_error);
        nb->bNonLocalStreamActive = true;
    }

    /* only transfer energies in the local stream */
    if (LOCAL_I(iloc))
    {
        /* DtoH fshift */
        if (bCalcFshift)
        {
            ocl_copy_D2H_async(nb->nbst.fshift, adat->fshift, 0,
                               SHIFTS * adat->fshift_elem_size, stream, bDoTime ? t->nb_d2h[iloc].fetchNextEvent() : nullptr);
        }

        /* DtoH energies */
        if (bCalcEner)
        {
            ocl_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, 0,
                               sizeof(float), stream, bDoTime ? t->nb_d2h[iloc].fetchNextEvent() : nullptr);

            ocl_copy_D2H_async(nb->nbst.e_el, adat->e_el, 0,
                               sizeof(float), stream, bDoTime ? t->nb_d2h[iloc].fetchNextEvent() : nullptr);
        }
    }

    if (bDoTime)
    {
        t->nb_d2h[iloc].closeTimingRegion(stream);
    }
}

/*! \brief Selects the Ewald kernel type, analytical or tabulated, single or twin cut-off. */
int nbnxn_gpu_pick_ewald_kernel_type(bool bTwinCut)
{
    bool bUseAnalyticalEwald, bForceAnalyticalEwald, bForceTabulatedEwald;
    int  kernel_type;

    /* Benchmarking/development environment variables to force the use of
       analytical or tabulated Ewald kernel. */
    bForceAnalyticalEwald = (getenv("GMX_OCL_NB_ANA_EWALD") != NULL);
    bForceTabulatedEwald  = (getenv("GMX_OCL_NB_TAB_EWALD") != NULL);

    if (bForceAnalyticalEwald && bForceTabulatedEwald)
    {
        gmx_incons("Both analytical and tabulated Ewald OpenCL non-bonded kernels "
                   "requested through environment variables.");
    }

    /* OpenCL: By default, use analytical Ewald
     * TODO: tabulated does not work, it needs fixing, see init_nbparam() in nbnxn_ocl_data_mgmt.cpp
     *
     * TODO: decide if dev_info parameter should be added to recognize NVIDIA CC>=3.0 devices.
     */
    //if ((dev_info->prop.major >= 3 || bForceAnalyticalEwald) && !bForceTabulatedEwald)
    if ((1 || bForceAnalyticalEwald) && !bForceTabulatedEwald)
    {
        bUseAnalyticalEwald = true;

        if (debug)
        {
            fprintf(debug, "Using analytical Ewald OpenCL kernels\n");
        }
    }
    else
    {
        bUseAnalyticalEwald = false;

        if (debug)
        {
            fprintf(debug, "Using tabulated Ewald OpenCL kernels\n");
        }
    }

    /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
       forces it (use it for debugging/benchmarking only). */
    if (!bTwinCut && (getenv("GMX_OCL_NB_EWALD_TWINCUT") == NULL))
    {
        kernel_type = bUseAnalyticalEwald ? eelOclEWALD_ANA : eelOclEWALD_TAB;
    }
    else
    {
        kernel_type = bUseAnalyticalEwald ? eelOclEWALD_ANA_TWIN : eelOclEWALD_TAB_TWIN;
    }

    return kernel_type;
}