Eliminate some OCL/CUDA code duplication
[gromacs.git] / src / gromacs / mdlib / nbnxn_ocl / nbnxn_ocl.cpp
blob 31f574863bd80b2481f09dfff826e0c9f7470c07
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
35 /*! \internal \file
36 * \brief Define OpenCL implementation of nbnxn_gpu.h
38 * \author Anca Hamuraru <anca@streamcomputing.eu>
39 * \author Teemu Virolainen <teemu@streamcomputing.eu>
40 * \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
41 * \author Szilárd Páll <pall.szilard@gmail.com>
42 * \ingroup module_mdlib
44 * TODO (psz):
45 * - Add a static const cl_uint c_pruneKernelWorkDim / c_nbnxnKernelWorkDim = 3;
46 * - Rework the copying of OCL data structures done before every invocation of both
47 * nb and prune kernels (using fillin_ocl_structures); also consider at the same
48 * time calling clSetKernelArg only on the updated parameters (if tracking changed
49 * parameters is feasible);
50 * - Consider using the event_wait_list argument to clEnqueueNDRangeKernel to mark
51 * dependencies on the kernel launched: e.g. the non-local nb kernel's dependency
52 * on the misc_ops_and_local_H2D_done event could be better expressed this way.
54 * - Consider extracting common sections of the OpenCL and CUDA nbnxn logic, e.g:
55 * - in nbnxn_gpu_launch_kernel_pruneonly() the pre- and post-kernel launch logic
56 * is identical in the two implementations, so a 3-way split might allow sharing
57 * code;
58 * -
61 #include "gmxpre.h"
63 #include <assert.h>
64 #include <stdlib.h>
66 #if defined(_MSVC)
67 #include <limits>
68 #endif
70 #include "thread_mpi/atomic.h"
72 #include "gromacs/gpu_utils/oclutils.h"
73 #include "gromacs/hardware/hw_info.h"
74 #include "gromacs/mdlib/force_flags.h"
75 #include "gromacs/mdlib/nb_verlet.h"
76 #include "gromacs/mdlib/nbnxn_consts.h"
77 #include "gromacs/mdlib/nbnxn_gpu.h"
78 #include "gromacs/mdlib/nbnxn_gpu_common.h"
79 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
80 #include "gromacs/mdlib/nbnxn_pairlist.h"
81 #include "gromacs/pbcutil/ishift.h"
82 #include "gromacs/timing/gpu_timing.h"
83 #include "gromacs/utility/cstringutil.h"
84 #include "gromacs/utility/fatalerror.h"
85 #include "gromacs/utility/gmxassert.h"
87 #include "nbnxn_ocl_internal.h"
88 #include "nbnxn_ocl_types.h"
90 #if defined TEXOBJ_SUPPORTED && __CUDA_ARCH__ >= 300
91 #define USE_TEXOBJ
92 #endif
94 /*! \brief Convenience constants */
95 //@{
96 static const int c_numClPerSupercl = c_nbnxnGpuNumClusterPerSupercluster;
97 static const int c_clSize = c_nbnxnGpuClusterSize;
98 //@}
100 /*! \brief Always/never run the energy/pruning kernels -- only for benchmarking purposes */
101 //@{
102 static bool always_ener = (getenv("GMX_GPU_ALWAYS_ENER") != NULL);
103 static bool never_ener = (getenv("GMX_GPU_NEVER_ENER") != NULL);
104 static bool always_prune = (getenv("GMX_GPU_ALWAYS_PRUNE") != NULL);
105 //@}
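/* Example (benchmarking only): running with GMX_GPU_ALWAYS_PRUNE set in the
   environment forces the combined force+prune kernel flavor on every step,
   while GMX_GPU_NEVER_ENER suppresses the energy kernel flavor even when
   energies are requested. */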
107 /* Uncomment this define to enable kernel debugging */
108 //#define DEBUG_OCL
110 /*! \brief Specifies which kernel run to debug */
111 #define DEBUG_RUN_STEP 2
113 /*! \brief Validates the input global work size parameter.
115 static inline void validate_global_work_size(size_t *global_work_size, int work_dim, const gmx_device_info_t *dinfo)
117 cl_uint device_size_t_size_bits;
118 cl_uint host_size_t_size_bits;
120 assert(dinfo);
122 /* Each component of a global_work_size must not exceed the range given by the
123 sizeof(device size_t) for the device on which the kernel execution will
124 be enqueued. See:
125 https://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/clEnqueueNDRangeKernel.html
127 device_size_t_size_bits = dinfo->adress_bits;
128 host_size_t_size_bits = (cl_uint)(sizeof(size_t) * 8);
130 /* If sizeof(host size_t) <= sizeof(device size_t)
131 => global_work_size components will always be valid
132 else
133 => get device limit for global work size and
134 compare it against each component of global_work_size.
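   Worked example (illustrative numbers): on a 64-bit host enqueueing to a device
   that reports adress_bits == 32, device_limit below becomes 2^32 - 1, so any
   global_work_size component larger than 4294967295 triggers the fatal error. */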
136 if (host_size_t_size_bits > device_size_t_size_bits)
138 size_t device_limit;
140 device_limit = (((size_t)1) << device_size_t_size_bits) - 1;
142 for (int i = 0; i < work_dim; i++)
144 if (global_work_size[i] > device_limit)
146 gmx_fatal(FARGS, "Watch out, the input system is too large to simulate!\n"
147 "The number of nonbonded work units (=number of super-clusters) exceeds the"
148 "device capabilities. Global work size limit exceeded (%d > %d)!",
149 global_work_size[i], device_limit);
155 /* Constant arrays listing non-bonded kernel function names. The arrays are
156 * organized in 2-dim arrays by: electrostatics and VDW type.
158 * Note that the row- and column-order of function pointers has to match the
159 * order of corresponding enumerated electrostatics and vdw types, resp.,
160 * defined in nbnxn_ocl_types.h.
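 *
 * Example lookup (assuming the enum values follow the row/column order used below):
 * nb_kfunc_ener_prune_ptr[eelOclRF][evdwOclCUT] resolves to
 * "nbnxn_kernel_ElecRF_VdwLJ_VF_prune_opencl".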
163 /*! \brief Force-only kernel function names. */
164 static const char* nb_kfunc_noener_noprune_ptr[eelOclNR][evdwOclNR] =
166 { "nbnxn_kernel_ElecCut_VdwLJ_F_opencl", "nbnxn_kernel_ElecCut_VdwLJCombGeom_F_opencl", "nbnxn_kernel_ElecCut_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecCut_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecCut_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_opencl" },
167 { "nbnxn_kernel_ElecRF_VdwLJ_F_opencl", "nbnxn_kernel_ElecRF_VdwLJCombGeom_F_opencl", "nbnxn_kernel_ElecRF_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecRF_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecRF_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_opencl" },
168 { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_opencl" },
169 { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_opencl" },
170 { "nbnxn_kernel_ElecEw_VdwLJ_F_opencl", "nbnxn_kernel_ElecEw_VdwLJCombGeom_F_opencl", "nbnxn_kernel_ElecEw_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecEw_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecEw_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_opencl" },
171 { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_opencl" }
174 /*! \brief Force + energy kernel function pointers. */
175 static const char* nb_kfunc_ener_noprune_ptr[eelOclNR][evdwOclNR] =
177 { "nbnxn_kernel_ElecCut_VdwLJ_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJCombGeom_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_opencl" },
178 { "nbnxn_kernel_ElecRF_VdwLJ_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJCombGeom_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_opencl" },
179 { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_opencl" },
180 { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_opencl" },
181 { "nbnxn_kernel_ElecEw_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_opencl" },
182 { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_VF_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_opencl" }
185 /*! \brief Force + pruning kernel function pointers. */
186 static const char* nb_kfunc_noener_prune_ptr[eelOclNR][evdwOclNR] =
188 { "nbnxn_kernel_ElecCut_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJCombGeom_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJCombLB_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_prune_opencl" },
189 { "nbnxn_kernel_ElecRF_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJCombGeom_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJCombLB_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_prune_opencl" },
190 { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_F_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_prune_opencl" },
191 { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_prune_opencl" },
192 { "nbnxn_kernel_ElecEw_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJCombLB_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_prune_opencl" },
193 { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_F_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_prune_opencl" }
196 /*! \brief Force + energy + pruning kernel function pointers. */
197 static const char* nb_kfunc_ener_prune_ptr[eelOclNR][evdwOclNR] =
199 { "nbnxn_kernel_ElecCut_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJCombLB_VF_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_prune_opencl" },
200 { "nbnxn_kernel_ElecRF_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJCombLB_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_prune_opencl" },
201 { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_prune_opencl" },
202 { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_prune_opencl" },
203 { "nbnxn_kernel_ElecEw_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJCombLB_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_prune_opencl" },
204 { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_VF_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_prune_opencl" }
207 /*! \brief Return a pointer to the prune kernel version to be executed at the current invocation.
209 * \param[in] kernel_pruneonly array of prune kernel objects
210 * \param[in] firstPrunePass true if the first pruning pass is being executed
212 static inline cl_kernel selectPruneKernel(cl_kernel kernel_pruneonly[],
213 bool firstPrunePass)
215 cl_kernel *kernelPtr;
217 if (firstPrunePass)
219 kernelPtr = &(kernel_pruneonly[epruneFirst]);
221 else
223 kernelPtr = &(kernel_pruneonly[epruneRolling]);
225 // TODO: consider creating the prune kernel object here to avoid a
226 // clCreateKernel for the rolling prune kernel if this is not needed.
227 return *kernelPtr;
230 /*! \brief Return a pointer to the kernel version to be executed at the current step.
231 * OpenCL kernel objects are cached in nb. If the requested kernel is not
232 * found in the cache, it will be created and the cache will be updated.
234 static inline cl_kernel select_nbnxn_kernel(gmx_nbnxn_ocl_t *nb,
235 int eeltype,
236 int evdwtype,
237 bool bDoEne,
238 bool bDoPrune)
240 const char* kernel_name_to_run;
241 cl_kernel *kernel_ptr;
242 cl_int cl_error;
244 assert(eeltype < eelOclNR);
245 assert(evdwtype < evdwOclNR);
247 if (bDoEne)
249 if (bDoPrune)
251 kernel_name_to_run = nb_kfunc_ener_prune_ptr[eeltype][evdwtype];
252 kernel_ptr = &(nb->kernel_ener_prune_ptr[eeltype][evdwtype]);
254 else
256 kernel_name_to_run = nb_kfunc_ener_noprune_ptr[eeltype][evdwtype];
257 kernel_ptr = &(nb->kernel_ener_noprune_ptr[eeltype][evdwtype]);
260 else
262 if (bDoPrune)
264 kernel_name_to_run = nb_kfunc_noener_prune_ptr[eeltype][evdwtype];
265 kernel_ptr = &(nb->kernel_noener_prune_ptr[eeltype][evdwtype]);
267 else
269 kernel_name_to_run = nb_kfunc_noener_noprune_ptr[eeltype][evdwtype];
270 kernel_ptr = &(nb->kernel_noener_noprune_ptr[eeltype][evdwtype]);
274 if (NULL == kernel_ptr[0])
276 *kernel_ptr = clCreateKernel(nb->dev_rundata->program, kernel_name_to_run, &cl_error);
277 assert(cl_error == CL_SUCCESS);
279 // TODO: handle errors
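        // One possible shape for that handling, reusing helpers already used elsewhere
        // in this file (sketch only, not the official fix):
        //   GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
        //                      ("clCreateKernel failed for " + std::string(kernel_name_to_run) +
        //                       ": " + ocl_get_error_string(cl_error)).c_str());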
281 return *kernel_ptr;
284 /*! \brief Calculates the amount of shared memory required by the nonbonded kernel in use.
286 static inline int calc_shmem_required_nonbonded(int vdwType,
287 bool bPrefetchLjParam)
289 int shmem;
291 /* size of shmem (force-buffers/xq/atom type preloading) */
292 /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
293 /* i-atom x+q in shared memory */
294 shmem = c_numClPerSupercl * c_clSize * sizeof(float) * 4; /* xqib */
295 /* cj in shared memory, for both warps separately */
296 shmem += 2 * c_nbnxnGpuJgroupSize * sizeof(int); /* cjs */
297 if (bPrefetchLjParam)
299 if (useLjCombRule(vdwType))
301 /* i-atom LJ combination parameters in shared memory */
302 shmem += c_numClPerSupercl * c_clSize * 2*sizeof(float); /* atib abused for ljcp, float2 */
304 else
306 /* i-atom types in shared memory */
307 shmem += c_numClPerSupercl * c_clSize * sizeof(int); /* atib */
310 /* force reduction buffers in shared memory */
311 shmem += c_clSize * c_clSize * 3 * sizeof(float); /* f_buf */
312     /* Warp vote. Strictly this should scale with the number of warps per work-group (2 here). */
313 shmem += sizeof(cl_uint) * 2; /* warp_any */
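    /* Ballpark figure (assuming c_clSize == 8, c_numClPerSupercl == 8 and
       c_nbnxnGpuJgroupSize == 4): xqib = 8*8*4*4 = 1024 B, cjs = 2*4*4 = 32 B,
       atib = 256 B (512 B with LJ-comb prefetch), f_buf = 8*8*3*4 = 768 B,
       warp_any = 8 B, i.e. roughly 2 kB of local memory per work-group. */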
314 return shmem;
317 /*! \brief Initializes data structures that are going to be sent to the OpenCL device.
319 * The device can't use the same data structures as the host for two main reasons:
320 * - OpenCL restrictions (pointers are not accepted inside data structures)
321 * - some host side fields are not needed for the OpenCL kernels.
323 * This function is called before the launch of both nbnxn and prune kernels.
325 static void fillin_ocl_structures(cl_nbparam_t *nbp,
326 cl_nbparam_params_t *nbparams_params)
328 nbparams_params->coulomb_tab_scale = nbp->coulomb_tab_scale;
329 nbparams_params->c_rf = nbp->c_rf;
330 nbparams_params->dispersion_shift = nbp->dispersion_shift;
331 nbparams_params->eeltype = nbp->eeltype;
332 nbparams_params->epsfac = nbp->epsfac;
333 nbparams_params->ewaldcoeff_lj = nbp->ewaldcoeff_lj;
334 nbparams_params->ewald_beta = nbp->ewald_beta;
335 nbparams_params->rcoulomb_sq = nbp->rcoulomb_sq;
336 nbparams_params->repulsion_shift = nbp->repulsion_shift;
337 nbparams_params->rlistOuter_sq = nbp->rlistOuter_sq;
338 nbparams_params->rvdw_sq = nbp->rvdw_sq;
339 nbparams_params->rlistInner_sq = nbp->rlistInner_sq;
340 nbparams_params->rvdw_switch = nbp->rvdw_switch;
341 nbparams_params->sh_ewald = nbp->sh_ewald;
342 nbparams_params->sh_lj_ewald = nbp->sh_lj_ewald;
343 nbparams_params->two_k_rf = nbp->two_k_rf;
344 nbparams_params->vdwtype = nbp->vdwtype;
345 nbparams_params->vdw_switch = nbp->vdw_switch;
348 /*! \brief Enqueues a wait for event completion.
350 * Then it releases the event and sets it to 0.
351 * Don't use this function when more than one wait will be issued for the event.
352 * Equivalent to a CUDA stream synchronization. */
353 static void sync_ocl_event(cl_command_queue stream, cl_event *ocl_event)
355 cl_int gmx_unused cl_error;
357 /* Enqueue wait */
358 #ifdef CL_VERSION_1_2
359 cl_error = clEnqueueBarrierWithWaitList(stream, 1, ocl_event, NULL);
360 #else
361 cl_error = clEnqueueWaitForEvents(stream, 1, ocl_event);
362 #endif
364 GMX_RELEASE_ASSERT(CL_SUCCESS == cl_error, ocl_get_error_string(cl_error).c_str());
366 /* Release event and reset it to 0. It is ok to release it as enqueuewaitforevents performs implicit retain for events. */
367 cl_error = clReleaseEvent(*ocl_event);
368 assert(CL_SUCCESS == cl_error);
369 *ocl_event = 0;
372 /*! \brief Launch GPU kernel
374    As we execute the nonbonded workload in separate queues, before launching
375    the kernel we need to make sure that the following operations have completed:
376 - atomdata allocation and related H2D transfers (every nstlist step);
377 - pair list H2D transfer (every nstlist step);
378 - shift vector H2D transfer (every nstlist step);
379 - force (+shift force and energy) output clearing (every step).
381 These operations are issued in the local queue at the beginning of the step
382 and therefore always complete before the local kernel launch. The non-local
383 kernel is launched after the local on the same device/context, so this is
384 inherently scheduled after the operations in the local stream (including the
385 above "misc_ops").
386 However, for the sake of having a future-proof implementation, we use the
387 misc_ops_done event to record the point in time when the above operations
388 are finished and synchronize with this event in the non-local stream.
390 void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t *nb,
391 const struct nbnxn_atomdata_t *nbatom,
392 int flags,
393 int iloc)
395 cl_int cl_error;
396 int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
397 /* OpenCL kernel launch-related stuff */
398 int shmem;
399 size_t local_work_size[3], global_work_size[3];
400     cl_kernel            nb_kernel = NULL; /* kernel object for the nonbonded kernel flavor in use */
402 cl_atomdata_t *adat = nb->atdat;
403 cl_nbparam_t *nbp = nb->nbparam;
404 cl_plist_t *plist = nb->plist[iloc];
405 cl_timers_t *t = nb->timers;
406 cl_command_queue stream = nb->stream[iloc];
408 bool bCalcEner = flags & GMX_FORCE_ENERGY;
409 int bCalcFshift = flags & GMX_FORCE_VIRIAL;
410 bool bDoTime = nb->bDoTime;
411 cl_uint arg_no;
413 cl_nbparam_params_t nbparams_params;
414 #ifdef DEBUG_OCL
415 float * debug_buffer_h;
416 size_t debug_buffer_size;
417 #endif
419 /* turn energy calculation always on/off (for debugging/testing only) */
420 bCalcEner = (bCalcEner || always_ener) && !never_ener;
422 /* Don't launch the non-local kernel if there is no work to do.
423 Doing the same for the local kernel is more complicated, since the
424 local part of the force array also depends on the non-local kernel.
425 So to avoid complicating the code and to reduce the risk of bugs,
426 we always call the local kernel, the local x+q copy and later (not in
427 this function) the stream wait, local f copyback and the f buffer
428 clearing. All these operations, except for the local interaction kernel,
429 are needed for the non-local interactions. The skip of the local kernel
430 call is taken care of later in this function. */
431 if (canSkipWork(nb, iloc))
433 plist->haveFreshList = false;
435 return;
438 /* calculate the atom data index range based on locality */
439 if (LOCAL_I(iloc))
441 adat_begin = 0;
442 adat_len = adat->natoms_local;
444 else
446 adat_begin = adat->natoms_local;
447 adat_len = adat->natoms - adat->natoms_local;
450 /* beginning of timed HtoD section */
451 if (bDoTime)
453 t->nb_h2d[iloc].openTimingRegion(stream);
456 /* HtoD x, q */
457 ocl_copy_H2D_async(adat->xq, nbatom->x + adat_begin * 4, adat_begin*sizeof(float)*4,
458 adat_len * sizeof(float) * 4, stream, bDoTime ? t->nb_h2d[iloc].fetchNextEvent() : nullptr);
460 if (bDoTime)
462 t->nb_h2d[iloc].closeTimingRegion(stream);
465     /* When we get here all misc operations issued in the local stream as well as
466 the local xq H2D are done,
467 so we record that in the local stream and wait for it in the nonlocal one. */
468 if (nb->bUseTwoStreams)
470 if (iloc == eintLocal)
472 #ifdef CL_VERSION_1_2
473 cl_error = clEnqueueMarkerWithWaitList(stream, 0, NULL, &(nb->misc_ops_and_local_H2D_done));
474 #else
475 cl_error = clEnqueueMarker(stream, &(nb->misc_ops_and_local_H2D_done));
476 #endif
477 assert(CL_SUCCESS == cl_error);
479             /* Based on section 5.13 of the OpenCL v1.2 spec, a flush is needed
480 * in the local stream in order to be able to sync with the above event
481 * from the non-local stream.
483 cl_error = clFlush(stream);
484 assert(CL_SUCCESS == cl_error);
486 else
488 sync_ocl_event(stream, &(nb->misc_ops_and_local_H2D_done));
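            /* Alternative noted in the file-level TODO (sketch only, event
               retain/release handling omitted): instead of the barrier above, the
               dependency could be expressed at enqueue time through the
               event_wait_list argument of the kernel enqueue further down, e.g.
                 clEnqueueNDRangeKernel(stream, nb_kernel, 3, NULL,
                                        global_work_size, local_work_size,
                                        1, &(nb->misc_ops_and_local_H2D_done), NULL);
            */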
492 if (nbp->useDynamicPruning && plist->haveFreshList)
494 /* Prunes for rlistOuter and rlistInner, sets plist->haveFreshList=false
495 (that's the way the timing accounting can distinguish between
496 separate prune kernel and combined force+prune).
498 nbnxn_gpu_launch_kernel_pruneonly(nb, iloc, 1);
501 if (plist->nsci == 0)
503         /* Don't launch an empty local kernel (not allowed with OpenCL).
504 * TODO: Separate H2D and kernel launch into separate functions.
506 return;
509 /* beginning of timed nonbonded calculation section */
510 if (bDoTime)
512 t->nb_k[iloc].openTimingRegion(stream);
515 /* get the pointer to the kernel flavor we need to use */
516 nb_kernel = select_nbnxn_kernel(nb,
517 nbp->eeltype,
518 nbp->vdwtype,
519 bCalcEner,
520 (plist->haveFreshList && !nb->timers->didPrune[iloc]) || always_prune);
522 /* kernel launch config */
523 local_work_size[0] = c_clSize;
524 local_work_size[1] = c_clSize;
525 local_work_size[2] = 1;
527 global_work_size[0] = plist->nsci * local_work_size[0];
528 global_work_size[1] = 1 * local_work_size[1];
529 global_work_size[2] = 1 * local_work_size[2];
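    /* That is, one c_clSize x c_clSize work-group per pair-list super-cluster entry,
       so the number of work-groups in dimension 0 equals plist->nsci. */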
531 validate_global_work_size(global_work_size, 3, nb->dev_info);
533 shmem = calc_shmem_required_nonbonded(nbp->vdwtype, nb->bPrefetchLjParam);
535 #ifdef DEBUG_OCL
537 static int run_step = 1;
539 if (DEBUG_RUN_STEP == run_step)
541 debug_buffer_size = global_work_size[0] * global_work_size[1] * global_work_size[2] * sizeof(float);
542 debug_buffer_h = (float*)calloc(1, debug_buffer_size);
543 assert(NULL != debug_buffer_h);
545 if (NULL == nb->debug_buffer)
547 nb->debug_buffer = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
548 debug_buffer_size, debug_buffer_h, &cl_error);
550 assert(CL_SUCCESS == cl_error);
554 run_step++;
556 #endif
557 if (debug)
559 fprintf(debug, "Non-bonded GPU launch configuration:\n\tLocal work size: %dx%dx%d\n\t"
560 "Global work size : %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n",
561 (int)(local_work_size[0]), (int)(local_work_size[1]), (int)(local_work_size[2]),
562 (int)(global_work_size[0]), (int)(global_work_size[1]), plist->nsci*c_numClPerSupercl,
563 c_numClPerSupercl, plist->na_c);
566 fillin_ocl_structures(nbp, &nbparams_params);
568 arg_no = 0;
569 cl_error = CL_SUCCESS;
570 if (!useLjCombRule(nb->nbparam->vdwtype))
572 cl_error = clSetKernelArg(nb_kernel, arg_no++, sizeof(int), &(adat->ntypes));
574 cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(nbparams_params), &(nbparams_params));
575 cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->xq));
576 cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->f));
577 cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->e_lj));
578 cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->e_el));
579 cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->fshift));
580 if (useLjCombRule(nb->nbparam->vdwtype))
582 cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->lj_comb));
584 else
586 cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->atom_types));
588 cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->shift_vec));
589 cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->nbfp_climg2d));
590 cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->nbfp_comb_climg2d));
591 cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->coulomb_tab_climg2d));
592 cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->sci));
593 cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->cj4));
594 cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->excl));
595 cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(int), &bCalcFshift);
596 cl_error |= clSetKernelArg(nb_kernel, arg_no++, shmem, NULL);
597 cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nb->debug_buffer));
599 assert(cl_error == CL_SUCCESS);
601 if (cl_error)
603 printf("OpenCL error: %s\n", ocl_get_error_string(cl_error).c_str());
605 cl_error = clEnqueueNDRangeKernel(stream, nb_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, bDoTime ? t->nb_k[iloc].fetchNextEvent() : nullptr);
606 assert(cl_error == CL_SUCCESS);
608 if (bDoTime)
610 t->nb_k[iloc].closeTimingRegion(stream);
613 #ifdef DEBUG_OCL
615 static int run_step = 1;
617 if (DEBUG_RUN_STEP == run_step)
619 FILE *pf;
620 char file_name[256] = {0};
622 ocl_copy_D2H_async(debug_buffer_h, nb->debug_buffer, 0,
623 debug_buffer_size, stream, NULL);
625         // Make sure all data has been transferred back from the device
626 clFinish(stream);
628 printf("\nWriting debug_buffer to debug_buffer_ocl.txt...");
630 sprintf(file_name, "debug_buffer_ocl_%d.txt", DEBUG_RUN_STEP);
631 pf = fopen(file_name, "wt");
632 assert(pf != NULL);
634 fprintf(pf, "%20s", "");
635 for (int j = 0; j < global_work_size[0]; j++)
637 char label[20];
638 sprintf(label, "(wIdx=%2d thIdx=%2d)", j / local_work_size[0], j % local_work_size[0]);
639 fprintf(pf, "%20s", label);
642 for (int i = 0; i < global_work_size[1]; i++)
644 char label[20];
645 sprintf(label, "(wIdy=%2d thIdy=%2d)", i / local_work_size[1], i % local_work_size[1]);
646 fprintf(pf, "\n%20s", label);
648 for (int j = 0; j < global_work_size[0]; j++)
650 fprintf(pf, "%20.5f", debug_buffer_h[i * global_work_size[0] + j]);
653 //fprintf(pf, "\n");
656 fclose(pf);
658 printf(" done.\n");
661 free(debug_buffer_h);
662 debug_buffer_h = NULL;
665 run_step++;
667 #endif
671 /*! \brief Calculates the amount of shared memory required by the prune kernel.
673 * Note that for the sake of simplicity we use the CUDA terminology "shared memory"
674 * for OpenCL local memory.
676 *  \param[in] num_threads_z cj4 concurrency equal to the number of threads/work items in the 3rd dimension.
677 * \returns the amount of local memory in bytes required by the pruning kernel
679 static inline int calc_shmem_required_prune(const int num_threads_z)
681 int shmem;
683 /* i-atom x in shared memory (for convenience we load all 4 components including q) */
684 shmem = c_numClPerSupercl * c_clSize * sizeof(float)*4;
685 /* cj in shared memory, for each warp separately */
686 shmem += num_threads_z * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize * sizeof(int);
687 /* Warp vote, requires one uint per warp/32 threads per block. */
688 shmem += sizeof(cl_uint) * 2*num_threads_z;
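    /* Ballpark figure (assuming c_clSize == 8, c_numClPerSupercl == 8,
       c_nbnxnGpuJgroupSize == 4, c_nbnxnGpuClusterpairSplit == 2 and
       num_threads_z == 4): 1024 + 4*2*4*4 + 4*2*4 = 1184 B of local memory
       per work-group. */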
690 return shmem;
693 void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t *nb,
694 int iloc,
695 int numParts)
697 cl_int cl_error;
699 cl_atomdata_t *adat = nb->atdat;
700 cl_nbparam_t *nbp = nb->nbparam;
701 cl_plist_t *plist = nb->plist[iloc];
702 cl_timers_t *t = nb->timers;
703 cl_command_queue stream = nb->stream[iloc];
704 bool bDoTime = nb->bDoTime;
706 if (plist->haveFreshList)
708 GMX_ASSERT(numParts == 1, "With first pruning we expect 1 part");
710 /* Set rollingPruningNumParts to signal that it is not set */
711 plist->rollingPruningNumParts = 0;
712 plist->rollingPruningPart = 0;
714 else
716 if (plist->rollingPruningNumParts == 0)
718 plist->rollingPruningNumParts = numParts;
720 else
722 GMX_ASSERT(numParts == plist->rollingPruningNumParts, "It is not allowed to change numParts in between list generation steps");
726 /* Use a local variable for part and update in plist, so we can return here
727 * without duplicating the part increment code.
729 int part = plist->rollingPruningPart;
731 plist->rollingPruningPart++;
732 if (plist->rollingPruningPart >= plist->rollingPruningNumParts)
734 plist->rollingPruningPart = 0;
737 /* Compute the number of list entries to prune in this pass */
738 int numSciInPart = (plist->nsci - part)/numParts;
740 /* Don't launch the kernel if there is no work to do. */
741 if (numSciInPart <= 0)
743 plist->haveFreshList = false;
745 return;
748 GpuRegionTimer *timer = nullptr;
749 if (bDoTime)
751 timer = &(plist->haveFreshList ? t->prune_k[iloc] : t->rollingPrune_k[iloc]);
754 /* beginning of timed prune calculation section */
755 if (bDoTime)
757 timer->openTimingRegion(stream);
760 /* Kernel launch config:
761 * - The thread block dimensions match the size of i-clusters, j-clusters,
762 * and j-cluster concurrency, in x, y, and z, respectively.
763 * - The 1D block-grid contains as many blocks as super-clusters.
765 int num_threads_z = getOclPruneKernelJ4Concurrency(nb->dev_info->vendor_e);
766 cl_kernel pruneKernel = selectPruneKernel(nb->kernel_pruneonly, plist->haveFreshList);
768 /* kernel launch config */
769 size_t local_work_size[3], global_work_size[3];
770 local_work_size[0] = c_clSize;
771 local_work_size[1] = c_clSize;
772 local_work_size[2] = num_threads_z;
774 global_work_size[0] = numSciInPart * local_work_size[0];
775 global_work_size[1] = 1 * local_work_size[1];
776 global_work_size[2] = 1 * local_work_size[2];
778 validate_global_work_size(global_work_size, 3, nb->dev_info);
780 int shmem = calc_shmem_required_prune(num_threads_z);
782 if (debug)
784 fprintf(debug, "Pruning GPU kernel launch configuration:\n\tLocal work size: %dx%dx%d\n\t"
785 "\tGlobal work size: %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n"
786 "\tShMem: %d\n",
787 (int)(local_work_size[0]), (int)(local_work_size[1]), (int)(local_work_size[2]),
788 (int)(global_work_size[0]), (int)(global_work_size[1]), plist->nsci*c_numClPerSupercl,
789 c_numClPerSupercl, plist->na_c, shmem);
792 cl_nbparam_params_t nbparams_params;
793 fillin_ocl_structures(nbp, &nbparams_params);
795 cl_uint arg_no = 0;
796 cl_error = CL_SUCCESS;
798 cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(nbparams_params), &(nbparams_params));
799 cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(cl_mem), &(adat->xq));
800 cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(cl_mem), &(adat->shift_vec));
801 cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(cl_mem), &(plist->sci));
802 cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(cl_mem), &(plist->cj4));
803 cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(cl_mem), &(plist->imask));
804 cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(int), &(numParts));
805 cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(int), &(part));
806 cl_error |= clSetKernelArg(pruneKernel, arg_no++, shmem, nullptr);
807 assert(cl_error == CL_SUCCESS);
809 cl_error = clEnqueueNDRangeKernel(stream, pruneKernel, 3,
810 nullptr, global_work_size, local_work_size,
811 0, nullptr, bDoTime ? timer->fetchNextEvent() : nullptr);
812 GMX_RELEASE_ASSERT(CL_SUCCESS == cl_error, ocl_get_error_string(cl_error).c_str());
814 if (plist->haveFreshList)
816 plist->haveFreshList = false;
817 /* Mark that pruning has been done */
818 nb->timers->didPrune[iloc] = true;
820 else
822 /* Mark that rolling pruning has been done */
823 nb->timers->didRollingPrune[iloc] = true;
826 if (bDoTime)
828 timer->closeTimingRegion(stream);
832 /*! \brief
833 * Launch asynchronously the download of nonbonded forces from the GPU
834 * (and energies/shift forces if required).
836 void nbnxn_gpu_launch_cpyback(gmx_nbnxn_ocl_t *nb,
837 const struct nbnxn_atomdata_t *nbatom,
838 int flags,
839 int aloc)
841 cl_int gmx_unused cl_error;
842 int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
844 /* determine interaction locality from atom locality */
845 int iloc = gpuAtomToInteractionLocality(aloc);
847 cl_atomdata_t *adat = nb->atdat;
848 cl_timers_t *t = nb->timers;
849 bool bDoTime = nb->bDoTime;
850 cl_command_queue stream = nb->stream[iloc];
852 bool bCalcEner = flags & GMX_FORCE_ENERGY;
853 int bCalcFshift = flags & GMX_FORCE_VIRIAL;
856 /* don't launch non-local copy-back if there was no non-local work to do */
857 if (canSkipWork(nb, iloc))
859 /* TODO An alternative way to signal that non-local work is
860 complete is to use a clEnqueueMarker+clEnqueueBarrier
861 pair. However, the use of bNonLocalStreamActive has the
862 advantage of being local to the host, so probably minimizes
863 overhead. Curiously, for NVIDIA OpenCL with an empty-domain
864 test case, overall simulation performance was higher with
865 the API calls, but this has not been tested on AMD OpenCL,
866 so could be worth considering in future. */
867 nb->bNonLocalStreamActive = false;
868 return;
871 getGpuAtomRange(adat, aloc, adat_begin, adat_len);
873 /* beginning of timed D2H section */
874 if (bDoTime)
876 t->nb_d2h[iloc].openTimingRegion(stream);
879 /* With DD the local D2H transfer can only start after the non-local
880 has been launched. */
881 if (iloc == eintLocal && nb->bNonLocalStreamActive)
883 sync_ocl_event(stream, &(nb->nonlocal_done));
886 /* DtoH f */
887 ocl_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f, adat_begin*3*sizeof(float),
888 (adat_len)* adat->f_elem_size, stream, bDoTime ? t->nb_d2h[iloc].fetchNextEvent() : nullptr);
890 /* kick off work */
891 cl_error = clFlush(stream);
892 assert(CL_SUCCESS == cl_error);
894     /* After the non-local D2H is launched the nonlocal_done event can be
895        recorded, which signals that the local D2H can proceed. This event is not
896        placed after the non-local kernel because we need the non-local
897        data back first. */
898 if (iloc == eintNonlocal)
900 #ifdef CL_VERSION_1_2
901 cl_error = clEnqueueMarkerWithWaitList(stream, 0, NULL, &(nb->nonlocal_done));
902 #else
903 cl_error = clEnqueueMarker(stream, &(nb->nonlocal_done));
904 #endif
905 assert(CL_SUCCESS == cl_error);
906 nb->bNonLocalStreamActive = true;
909 /* only transfer energies in the local stream */
910 if (LOCAL_I(iloc))
912 /* DtoH fshift */
913 if (bCalcFshift)
915 ocl_copy_D2H_async(nb->nbst.fshift, adat->fshift, 0,
916 SHIFTS * adat->fshift_elem_size, stream, bDoTime ? t->nb_d2h[iloc].fetchNextEvent() : nullptr);
919 /* DtoH energies */
920 if (bCalcEner)
922 ocl_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, 0,
923 sizeof(float), stream, bDoTime ? t->nb_d2h[iloc].fetchNextEvent() : nullptr);
925 ocl_copy_D2H_async(nb->nbst.e_el, adat->e_el, 0,
926 sizeof(float), stream, bDoTime ? t->nb_d2h[iloc].fetchNextEvent() : nullptr);
930 if (bDoTime)
932 t->nb_d2h[iloc].closeTimingRegion(stream);
936 /*! \brief Count pruning kernel time if either kernel has been triggered
938 * We do the accounting for either of the two pruning kernel flavors:
939 * - 1st pass prune: ran during the current step (prior to the force kernel);
940 * - rolling prune: ran at the end of the previous step (prior to the current step H2D xq);
942 * Note that the resetting of cl_timers_t::didPrune and cl_timers_t::didRollingPrune should happen
943 * after calling this function.
945 * \param[inout] timers structs with OCL timer objects
946 * \param[inout] timings GPU task timing data
947 * \param[in] iloc interaction locality
949 static void countPruneKernelTime(cl_timers_t *timers,
950 gmx_wallclock_gpu_nbnxn_t *timings,
951 const int iloc)
953     // We might not have done any pruning (e.g. if we skipped with empty domains).
954 if (!timers->didPrune[iloc] && !timers->didRollingPrune[iloc])
956 return;
959 if (timers->didPrune[iloc])
961 timings->pruneTime.c++;
962 timings->pruneTime.t += timers->prune_k[iloc].getLastRangeTime();
965 if (timers->didRollingPrune[iloc])
967 timings->dynamicPruneTime.c++;
968 timings->dynamicPruneTime.t += timers->rollingPrune_k[iloc].getLastRangeTime();
972 /*! \brief
973 * Wait for the asynchronously launched nonbonded calculations and data
974 * transfers to finish.
976 void nbnxn_gpu_wait_for_gpu(gmx_nbnxn_ocl_t *nb,
977 int flags, int aloc,
978 real *e_lj, real *e_el, rvec *fshift)
980 /* NOTE: only implemented for single-precision at this time */
981 cl_int gmx_unused cl_error;
983 /* determine interaction locality from atom locality */
984 int iloc = gpuAtomToInteractionLocality(aloc);
986 cl_plist_t *plist = nb->plist[iloc];
987 cl_timers_t *timers = nb->timers;
988 struct gmx_wallclock_gpu_nbnxn_t *timings = nb->timings;
989 cl_nb_staging nbst = nb->nbst;
991 bool bCalcEner = flags & GMX_FORCE_ENERGY;
992 int bCalcFshift = flags & GMX_FORCE_VIRIAL;
994 /* turn energy calculation always on/off (for debugging/testing only) */
995 bCalcEner = (bCalcEner || always_ener) && !never_ener;
997 /* Launch wait/update timers & counters and do reduction into staging buffers
998        but skip it if during the non-local phase there was actually no work to do.
999 This is consistent with nbnxn_gpu_launch_kernel.
1001 NOTE: if timing with multiple GPUs (streams) becomes possible, the
1002 counters could end up being inconsistent due to not being incremented
1003 on some of the nodes! */
1004 if (!canSkipWork(nb, iloc))
1006         /* Actual sync point. Waits for everything to be finished in the command queue. TODO: Find out if a more fine-grained solution is needed */
1007 cl_error = clFinish(nb->stream[iloc]);
1008 assert(CL_SUCCESS == cl_error);
1010 /* timing data accumulation */
1011 if (nb->bDoTime)
1013 /* only increase counter once (at local F wait) */
1014 if (LOCAL_I(iloc))
1016 timings->nb_c++;
1017 timings->ktime[plist->haveFreshList ? 1 : 0][bCalcEner ? 1 : 0].c += 1;
1020 /* kernel timings */
1021 timings->ktime[plist->haveFreshList ? 1 : 0][bCalcEner ? 1 : 0].t += timers->nb_k[iloc].getLastRangeTime();
1023 /* X/q H2D and F D2H timings */
1024 timings->nb_h2d_t += timers->nb_h2d[iloc].getLastRangeTime();
1025 timings->nb_d2h_t += timers->nb_d2h[iloc].getLastRangeTime();
1027            /* Count the pruning kernel times for both cases: 1st pass (at search step)
1028               and rolling pruning (if called at the previous step).
1029               We do the accounting here as this is the only sync point where we
1030               know (without checking or additional sync-ing) that prune tasks
1031               in the current stream have completed (having just blocking-waited
1032 for the force D2H). */
1033 countPruneKernelTime(timers, timings, iloc);
1035 /* only count atdat and pair-list H2D at pair-search step */
1036 if (timers->didPairlistH2D[iloc])
1038 /* atdat transfer timing (add only once, at local F wait) */
1039 if (LOCAL_A(aloc))
1041 timings->pl_h2d_c++;
1042 timings->pl_h2d_t += timers->atdat.getLastRangeTime();
1045 timings->pl_h2d_t += timers->pl_h2d[iloc].getLastRangeTime();
1047 /* Clear the timing flag for the next step */
1048 timers->didPairlistH2D[iloc] = false;
1052 /* add up energies and shift forces (only once at local F wait) */
1053 if (LOCAL_I(iloc))
1055 if (bCalcEner)
1057 *e_lj += *nbst.e_lj;
1058 *e_el += *nbst.e_el;
1061 if (bCalcFshift)
1063 for (int i = 0; i < SHIFTS; i++)
1065 fshift[i][0] += (nbst.fshift)[i][0];
1066 fshift[i][1] += (nbst.fshift)[i][1];
1067 fshift[i][2] += (nbst.fshift)[i][2];
1073 /* Always reset both pruning flags (doesn't hurt doing it even when timing is off). */
1074 timers->didPrune[iloc] = timers->didRollingPrune[iloc] = false;
1076 /* Turn off initial list pruning (doesn't hurt if this is not pair-search step). */
1077 plist->haveFreshList = false;
1080 /*! \brief Selects the Ewald kernel type, analytical or tabulated, single or twin cut-off. */
1081 int nbnxn_gpu_pick_ewald_kernel_type(bool bTwinCut)
1083 bool bUseAnalyticalEwald, bForceAnalyticalEwald, bForceTabulatedEwald;
1084 int kernel_type;
1086 /* Benchmarking/development environment variables to force the use of
1087 analytical or tabulated Ewald kernel. */
1088 bForceAnalyticalEwald = (getenv("GMX_OCL_NB_ANA_EWALD") != NULL);
1089 bForceTabulatedEwald = (getenv("GMX_OCL_NB_TAB_EWALD") != NULL);
1091 if (bForceAnalyticalEwald && bForceTabulatedEwald)
1093 gmx_incons("Both analytical and tabulated Ewald OpenCL non-bonded kernels "
1094 "requested through environment variables.");
1097 /* OpenCL: By default, use analytical Ewald
1098 * TODO: tabulated does not work, it needs fixing, see init_nbparam() in nbnxn_ocl_data_mgmt.cpp
1100 * TODO: decide if dev_info parameter should be added to recognize NVIDIA CC>=3.0 devices.
1103 //if ((dev_info->prop.major >= 3 || bForceAnalyticalEwald) && !bForceTabulatedEwald)
1104 if ((1 || bForceAnalyticalEwald) && !bForceTabulatedEwald)
1106 bUseAnalyticalEwald = true;
1108 if (debug)
1110 fprintf(debug, "Using analytical Ewald OpenCL kernels\n");
1113 else
1115 bUseAnalyticalEwald = false;
1117 if (debug)
1119 fprintf(debug, "Using tabulated Ewald OpenCL kernels\n");
1123 /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
1124 forces it (use it for debugging/benchmarking only). */
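    /* Example (debugging/benchmarking only): launching mdrun with the environment
       variable GMX_OCL_NB_EWALD_TWINCUT set forces the *_TwinCut kernel flavors
       even when a single cut-off would suffice. */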
1125 if (!bTwinCut && (getenv("GMX_OCL_NB_EWALD_TWINCUT") == NULL))
1127 kernel_type = bUseAnalyticalEwald ? eelOclEWALD_ANA : eelOclEWALD_TAB;
1129 else
1131 kernel_type = bUseAnalyticalEwald ? eelOclEWALD_ANA_TWIN : eelOclEWALD_TAB_TWIN;
1134 return kernel_type;