Refactor cj preload in the nonbonded OpenCL kernels
[gromacs.git] / src / gromacs / mdlib / nbnxn_ocl / nbnxn_ocl.cpp
blob 8f80c77526039500626b76119d13358363f1d86a
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
35 /*! \internal \file
36 * \brief Define OpenCL implementation of nbnxn_gpu.h
38 * \author Anca Hamuraru <anca@streamcomputing.eu>
39 * \author Teemu Virolainen <teemu@streamcomputing.eu>
40 * \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
41 * \author Szilárd Páll <pall.szilard@gmail.com>
42 * \ingroup module_mdlib
44 * TODO (psz):
45 * - Add a static const cl_uint c_pruneKernelWorkDim / c_nbnxnKernelWorkDim = 3;
46 * - Rework the copying of OCL data structures done before every invocation of both
47 * nb and prune kernels (using fillin_ocl_structures); also consider at the same
48 * time calling clSetKernelArg only on the updated parameters (if tracking changed
49 * parameters is feasible);
50 * - Consider using the event_wait_list argument to clEnqueueNDRangeKernel to mark
51 * dependencies on the kernel launched: e.g. the non-local nb kernel's dependency
52 * on the misc_ops_and_local_H2D_done event could be better expressed this way.
54 * - Consider extracting common sections of the OpenCL and CUDA nbnxn logic, e.g:
55 * - in nbnxn_gpu_launch_kernel_pruneonly() the pre- and post-kernel launch logic
56 * is identical in the two implementations, so a 3-way split might allow sharing
57 * code;
58 * -
61 #include "gmxpre.h"
63 #include <assert.h>
64 #include <stdlib.h>
66 #if defined(_MSVC)
67 #include <limits>
68 #endif
70 #include "thread_mpi/atomic.h"
72 #include "gromacs/gpu_utils/gputraits_ocl.h"
73 #include "gromacs/gpu_utils/oclutils.h"
74 #include "gromacs/hardware/hw_info.h"
75 #include "gromacs/mdlib/force_flags.h"
76 #include "gromacs/mdlib/nb_verlet.h"
77 #include "gromacs/mdlib/nbnxn_consts.h"
78 #include "gromacs/mdlib/nbnxn_gpu.h"
79 #include "gromacs/mdlib/nbnxn_gpu_common.h"
80 #include "gromacs/mdlib/nbnxn_gpu_common_utils.h"
81 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
82 #include "gromacs/mdlib/nbnxn_pairlist.h"
83 #include "gromacs/pbcutil/ishift.h"
84 #include "gromacs/timing/gpu_timing.h"
85 #include "gromacs/utility/cstringutil.h"
86 #include "gromacs/utility/fatalerror.h"
87 #include "gromacs/utility/gmxassert.h"
89 #include "nbnxn_ocl_internal.h"
90 #include "nbnxn_ocl_types.h"
93 /*! \brief Convenience constants */
94 //@{
95 static const int c_numClPerSupercl = c_nbnxnGpuNumClusterPerSupercluster;
96 static const int c_clSize = c_nbnxnGpuClusterSize;
97 //@}
100 /*! \brief Validates the input global work size parameter.
102 static inline void validate_global_work_size(size_t *global_work_size, int work_dim, const gmx_device_info_t *dinfo)
104 cl_uint device_size_t_size_bits;
105 cl_uint host_size_t_size_bits;
107 assert(dinfo);
109 /* Each component of a global_work_size must not exceed the range given by the
110 sizeof(device size_t) for the device on which the kernel execution will
111 be enqueued. See:
112 https://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/clEnqueueNDRangeKernel.html
114 device_size_t_size_bits = dinfo->adress_bits;
115 host_size_t_size_bits = (cl_uint)(sizeof(size_t) * 8);
117 /* If sizeof(host size_t) <= sizeof(device size_t)
118 => global_work_size components will always be valid
119 else
120 => get device limit for global work size and
121 compare it against each component of global_work_size.
123 if (host_size_t_size_bits > device_size_t_size_bits)
125 size_t device_limit;
127 device_limit = (((size_t)1) << device_size_t_size_bits) - 1;
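        /* Worked example (illustrative): a device reporting 32 address bits gives
         * device_limit = 2^32 - 1 = 4294967295, so any global_work_size component
         * larger than that triggers the fatal error below. */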
129 for (int i = 0; i < work_dim; i++)
131 if (global_work_size[i] > device_limit)
133 gmx_fatal(FARGS, "Watch out, the input system is too large to simulate!\n"
134 "The number of nonbonded work units (=number of super-clusters) exceeds the"
135 "device capabilities. Global work size limit exceeded (%d > %d)!",
136 global_work_size[i], device_limit);
142 /* Constant arrays listing non-bonded kernel function names. The arrays are
143 * organized in 2-dim arrays by: electrostatics and VDW type.
145  * Note that the row- and column-order of the kernel function names has to match the
146  * order of the corresponding enumerated electrostatics and vdw types, respectively,
147  * defined in nbnxn_ocl_types.h.
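 *
 * For example (illustrative, assuming eelOclRF and evdwOclCUT from nbnxn_ocl_types.h
 * index the second row and first column), nb_kfunc_ener_prune_ptr[eelOclRF][evdwOclCUT]
 * yields "nbnxn_kernel_ElecRF_VdwLJ_VF_prune_opencl".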
150 /*! \brief Force-only kernel function names. */
151 static const char* nb_kfunc_noener_noprune_ptr[eelOclNR][evdwOclNR] =
153 { "nbnxn_kernel_ElecCut_VdwLJ_F_opencl", "nbnxn_kernel_ElecCut_VdwLJCombGeom_F_opencl", "nbnxn_kernel_ElecCut_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecCut_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecCut_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_opencl" },
154 { "nbnxn_kernel_ElecRF_VdwLJ_F_opencl", "nbnxn_kernel_ElecRF_VdwLJCombGeom_F_opencl", "nbnxn_kernel_ElecRF_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecRF_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecRF_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_opencl" },
155 { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_opencl" },
156 { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_opencl" },
157 { "nbnxn_kernel_ElecEw_VdwLJ_F_opencl", "nbnxn_kernel_ElecEw_VdwLJCombGeom_F_opencl", "nbnxn_kernel_ElecEw_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecEw_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecEw_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_opencl" },
158 { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_opencl" }
161 /*! \brief Force + energy kernel function names. */
162 static const char* nb_kfunc_ener_noprune_ptr[eelOclNR][evdwOclNR] =
164 { "nbnxn_kernel_ElecCut_VdwLJ_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJCombGeom_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_opencl" },
165 { "nbnxn_kernel_ElecRF_VdwLJ_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJCombGeom_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_opencl" },
166 { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_opencl" },
167 { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_opencl" },
168 { "nbnxn_kernel_ElecEw_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_opencl" },
169 { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_VF_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_opencl" }
172 /*! \brief Force + pruning kernel function names. */
173 static const char* nb_kfunc_noener_prune_ptr[eelOclNR][evdwOclNR] =
175 { "nbnxn_kernel_ElecCut_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJCombGeom_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJCombLB_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_prune_opencl" },
176 { "nbnxn_kernel_ElecRF_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJCombGeom_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJCombLB_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_prune_opencl" },
177 { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_F_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_prune_opencl" },
178 { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_prune_opencl" },
179 { "nbnxn_kernel_ElecEw_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJCombLB_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_prune_opencl" },
180 { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_F_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_prune_opencl" }
183 /*! \brief Force + energy + pruning kernel function names. */
184 static const char* nb_kfunc_ener_prune_ptr[eelOclNR][evdwOclNR] =
186 { "nbnxn_kernel_ElecCut_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJCombLB_VF_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_prune_opencl" },
187 { "nbnxn_kernel_ElecRF_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJCombLB_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_prune_opencl" },
188 { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_prune_opencl" },
189 { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_prune_opencl" },
190 { "nbnxn_kernel_ElecEw_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJCombLB_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_prune_opencl" },
191 { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_VF_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_prune_opencl" }
194 /*! \brief Returns the prune kernel to be executed at the current invocation.
196 * \param[in] kernel_pruneonly array of prune kernel objects
197 * \param[in] firstPrunePass true if the first pruning pass is being executed
199 static inline cl_kernel selectPruneKernel(cl_kernel kernel_pruneonly[],
200 bool firstPrunePass)
202 cl_kernel *kernelPtr;
204 if (firstPrunePass)
206 kernelPtr = &(kernel_pruneonly[epruneFirst]);
208 else
210 kernelPtr = &(kernel_pruneonly[epruneRolling]);
212 // TODO: consider creating the prune kernel object here to avoid a
213 // clCreateKernel for the rolling prune kernel if this is not needed.
214 return *kernelPtr;
217 /*! \brief Returns the nonbonded kernel flavor to be executed at the current step.
218 * OpenCL kernel objects are cached in nb. If the requested kernel is not
219 * found in the cache, it will be created and the cache will be updated.
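 *
 * Typical use (illustrative, matching the call site in nbnxn_gpu_launch_kernel()):
 *   cl_kernel k = select_nbnxn_kernel(nb, nbp->eeltype, nbp->vdwtype, bCalcEner, bDoPrune);
 * The first call for a given flavor creates the kernel object with clCreateKernel;
 * later calls return the cached object.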
221 static inline cl_kernel select_nbnxn_kernel(gmx_nbnxn_ocl_t *nb,
222 int eeltype,
223 int evdwtype,
224 bool bDoEne,
225 bool bDoPrune)
227 const char* kernel_name_to_run;
228 cl_kernel *kernel_ptr;
229 cl_int cl_error;
231 assert(eeltype < eelOclNR);
232 assert(evdwtype < evdwOclNR);
234 if (bDoEne)
236 if (bDoPrune)
238 kernel_name_to_run = nb_kfunc_ener_prune_ptr[eeltype][evdwtype];
239 kernel_ptr = &(nb->kernel_ener_prune_ptr[eeltype][evdwtype]);
241 else
243 kernel_name_to_run = nb_kfunc_ener_noprune_ptr[eeltype][evdwtype];
244 kernel_ptr = &(nb->kernel_ener_noprune_ptr[eeltype][evdwtype]);
247 else
249 if (bDoPrune)
251 kernel_name_to_run = nb_kfunc_noener_prune_ptr[eeltype][evdwtype];
252 kernel_ptr = &(nb->kernel_noener_prune_ptr[eeltype][evdwtype]);
254 else
256 kernel_name_to_run = nb_kfunc_noener_noprune_ptr[eeltype][evdwtype];
257 kernel_ptr = &(nb->kernel_noener_noprune_ptr[eeltype][evdwtype]);
261 if (NULL == kernel_ptr[0])
263 *kernel_ptr = clCreateKernel(nb->dev_rundata->program, kernel_name_to_run, &cl_error);
264 assert(cl_error == CL_SUCCESS);
266 // TODO: handle errors
268 return *kernel_ptr;
271 /*! \brief Calculates the amount of shared memory required by the nonbonded kernel in use.
273 static inline int calc_shmem_required_nonbonded(int vdwType,
274 bool bPrefetchLjParam)
276 int shmem;
278 /* size of shmem (force-buffers/xq/atom type preloading) */
279 /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
280 /* i-atom x+q in shared memory */
281 shmem = c_numClPerSupercl * c_clSize * sizeof(float) * 4; /* xqib */
282 /* cj in shared memory, for both warps separately
283      * TODO: in the "nowarp" kernels we load cj only once, so the factor 2 is not needed.
285 shmem += 2 * c_nbnxnGpuJgroupSize * sizeof(int); /* cjs */
286 if (bPrefetchLjParam)
288 if (useLjCombRule(vdwType))
290 /* i-atom LJ combination parameters in shared memory */
291 shmem += c_numClPerSupercl * c_clSize * 2*sizeof(float); /* atib abused for ljcp, float2 */
293 else
295 /* i-atom types in shared memory */
296 shmem += c_numClPerSupercl * c_clSize * sizeof(int); /* atib */
299 /* force reduction buffers in shared memory */
300 shmem += c_clSize * c_clSize * 3 * sizeof(float); /* f_buf */
301     /* Warp vote; strictly this should be sizeof(cl_uint) * (number of warps per work-group). */
302 shmem += sizeof(cl_uint) * 2; /* warp_any */
303 return shmem;
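/* Rough numbers (illustrative), assuming the usual c_clSize = 8, c_numClPerSupercl = 8
 * and c_nbnxnGpuJgroupSize = 4:
 *   xqib = 8*8*4*4 B = 1024 B, cjs = 2*4*4 B = 32 B, f_buf = 8*8*3*4 B = 768 B,
 *   warp_any = 8 B, plus atib = 256 B or ljcp = 512 B when pre-fetching LJ parameters,
 * i.e. roughly 1.8-2.3 kB of local memory per work-group. */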
306 /*! \brief Initializes data structures that are going to be sent to the OpenCL device.
308 * The device can't use the same data structures as the host for two main reasons:
309 * - OpenCL restrictions (pointers are not accepted inside data structures)
310 * - some host side fields are not needed for the OpenCL kernels.
312 * This function is called before the launch of both nbnxn and prune kernels.
314 static void fillin_ocl_structures(cl_nbparam_t *nbp,
315 cl_nbparam_params_t *nbparams_params)
317 nbparams_params->coulomb_tab_scale = nbp->coulomb_tab_scale;
318 nbparams_params->c_rf = nbp->c_rf;
319 nbparams_params->dispersion_shift = nbp->dispersion_shift;
320 nbparams_params->eeltype = nbp->eeltype;
321 nbparams_params->epsfac = nbp->epsfac;
322 nbparams_params->ewaldcoeff_lj = nbp->ewaldcoeff_lj;
323 nbparams_params->ewald_beta = nbp->ewald_beta;
324 nbparams_params->rcoulomb_sq = nbp->rcoulomb_sq;
325 nbparams_params->repulsion_shift = nbp->repulsion_shift;
326 nbparams_params->rlistOuter_sq = nbp->rlistOuter_sq;
327 nbparams_params->rvdw_sq = nbp->rvdw_sq;
328 nbparams_params->rlistInner_sq = nbp->rlistInner_sq;
329 nbparams_params->rvdw_switch = nbp->rvdw_switch;
330 nbparams_params->sh_ewald = nbp->sh_ewald;
331 nbparams_params->sh_lj_ewald = nbp->sh_lj_ewald;
332 nbparams_params->two_k_rf = nbp->two_k_rf;
333 nbparams_params->vdwtype = nbp->vdwtype;
334 nbparams_params->vdw_switch = nbp->vdw_switch;
337 /*! \brief Enqueues a wait for event completion.
339 * Then it releases the event and sets it to 0.
340 * Don't use this function when more than one wait will be issued for the event.
341  * Equivalent to a CUDA stream sync. */
342 static void sync_ocl_event(cl_command_queue stream, cl_event *ocl_event)
344 cl_int gmx_unused cl_error;
346 /* Enqueue wait */
347 cl_error = clEnqueueBarrierWithWaitList(stream, 1, ocl_event, NULL);
348 GMX_RELEASE_ASSERT(CL_SUCCESS == cl_error, ocl_get_error_string(cl_error).c_str());
350     /* Release the event and reset it to 0. It is ok to release it here because the enqueued barrier (clEnqueueBarrierWithWaitList) implicitly retains the events in its wait list. */
351 cl_error = clReleaseEvent(*ocl_event);
352 assert(CL_SUCCESS == cl_error);
353 *ocl_event = 0;
356 /*! \brief Launch GPU kernel
358 As we execute nonbonded workload in separate queues, before launching
359     the kernel we need to make sure that the following operations have completed:
360 - atomdata allocation and related H2D transfers (every nstlist step);
361 - pair list H2D transfer (every nstlist step);
362 - shift vector H2D transfer (every nstlist step);
363 - force (+shift force and energy) output clearing (every step).
365 These operations are issued in the local queue at the beginning of the step
366 and therefore always complete before the local kernel launch. The non-local
367 kernel is launched after the local on the same device/context, so this is
368 inherently scheduled after the operations in the local stream (including the
369 above "misc_ops").
370 However, for the sake of having a future-proof implementation, we use the
371 misc_ops_done event to record the point in time when the above operations
372 are finished and synchronize with this event in the non-local stream.
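 *
 * In outline (illustrative of the code below), the cross-queue synchronization is:
 *   local queue:     clEnqueueMarkerWithWaitList(local, 0, NULL, &misc_ops_and_local_H2D_done);
 *                    clFlush(local);
 *   non-local queue: clEnqueueBarrierWithWaitList(nonlocal, 1, &misc_ops_and_local_H2D_done, NULL);
 *                    (via sync_ocl_event), followed by the non-local kernel launch.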
374 void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t *nb,
375 const struct nbnxn_atomdata_t *nbatom,
376 int flags,
377 int iloc)
379 cl_int cl_error;
380 int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
381 /* OpenCL kernel launch-related stuff */
382 int shmem;
383 size_t local_work_size[3], global_work_size[3];
384     cl_kernel            nb_kernel = NULL; /* kernel object for the nonbonded kernel flavor in use */
386 cl_atomdata_t *adat = nb->atdat;
387 cl_nbparam_t *nbp = nb->nbparam;
388 cl_plist_t *plist = nb->plist[iloc];
389 cl_timers_t *t = nb->timers;
390 cl_command_queue stream = nb->stream[iloc];
392 bool bCalcEner = flags & GMX_FORCE_ENERGY;
393 int bCalcFshift = flags & GMX_FORCE_VIRIAL;
394 bool bDoTime = nb->bDoTime;
395 cl_uint arg_no;
397 cl_nbparam_params_t nbparams_params;
399 /* Don't launch the non-local kernel if there is no work to do.
400 Doing the same for the local kernel is more complicated, since the
401 local part of the force array also depends on the non-local kernel.
402 So to avoid complicating the code and to reduce the risk of bugs,
403 we always call the local kernel, the local x+q copy and later (not in
404 this function) the stream wait, local f copyback and the f buffer
405 clearing. All these operations, except for the local interaction kernel,
406 are needed for the non-local interactions. The skip of the local kernel
407 call is taken care of later in this function. */
408 if (canSkipWork(nb, iloc))
410 plist->haveFreshList = false;
412 return;
415 /* calculate the atom data index range based on locality */
416 if (LOCAL_I(iloc))
418 adat_begin = 0;
419 adat_len = adat->natoms_local;
421 else
423 adat_begin = adat->natoms_local;
424 adat_len = adat->natoms - adat->natoms_local;
427 /* beginning of timed HtoD section */
428 if (bDoTime)
430 t->nb_h2d[iloc].openTimingRegion(stream);
433 /* HtoD x, q */
434 ocl_copy_H2D_async(adat->xq, nbatom->x + adat_begin * 4, adat_begin*sizeof(float)*4,
435 adat_len * sizeof(float) * 4, stream, bDoTime ? t->nb_h2d[iloc].fetchNextEvent() : nullptr);
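    /* Note: xq holds x, y, z and q as four consecutive floats per atom, hence the
     * factor 4 (and 4*sizeof(float)) in the source offset, byte offset and length above. */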
437 if (bDoTime)
439 t->nb_h2d[iloc].closeTimingRegion(stream);
442     /* When we get here all misc operations issued in the local stream as well as
443 the local xq H2D are done,
444 so we record that in the local stream and wait for it in the nonlocal one. */
445 if (nb->bUseTwoStreams)
447 if (iloc == eintLocal)
449 cl_error = clEnqueueMarkerWithWaitList(stream, 0, NULL, &(nb->misc_ops_and_local_H2D_done));
450 assert(CL_SUCCESS == cl_error);
452 /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
453 * in the local stream in order to be able to sync with the above event
454 * from the non-local stream.
456 cl_error = clFlush(stream);
457 assert(CL_SUCCESS == cl_error);
459 else
461 sync_ocl_event(stream, &(nb->misc_ops_and_local_H2D_done));
465 if (nbp->useDynamicPruning && plist->haveFreshList)
467 /* Prunes for rlistOuter and rlistInner, sets plist->haveFreshList=false
468 (that's the way the timing accounting can distinguish between
469 separate prune kernel and combined force+prune).
471 nbnxn_gpu_launch_kernel_pruneonly(nb, iloc, 1);
474 if (plist->nsci == 0)
476         /* Don't launch an empty local kernel (an empty launch is not allowed with OpenCL).
477 * TODO: Separate H2D and kernel launch into separate functions.
479 return;
482 /* beginning of timed nonbonded calculation section */
483 if (bDoTime)
485 t->nb_k[iloc].openTimingRegion(stream);
488 /* get the pointer to the kernel flavor we need to use */
489 nb_kernel = select_nbnxn_kernel(nb,
490 nbp->eeltype,
491 nbp->vdwtype,
492 bCalcEner,
493 (plist->haveFreshList && !nb->timers->didPrune[iloc]));
495 /* kernel launch config */
496 local_work_size[0] = c_clSize;
497 local_work_size[1] = c_clSize;
498 local_work_size[2] = 1;
500 global_work_size[0] = plist->nsci * local_work_size[0];
501 global_work_size[1] = 1 * local_work_size[1];
502 global_work_size[2] = 1 * local_work_size[2];
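    /* Example (illustrative): assuming c_clSize = 8 and, say, plist->nsci = 5000,
     * this gives 8x8x1 work-groups and a 40000x8x1 global size, i.e. one
     * 64-work-item group per super-cluster (sci) list entry. */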
504 validate_global_work_size(global_work_size, 3, nb->dev_info);
506 shmem = calc_shmem_required_nonbonded(nbp->vdwtype, nb->bPrefetchLjParam);
508 if (debug)
510 fprintf(debug, "Non-bonded GPU launch configuration:\n\tLocal work size: %dx%dx%d\n\t"
511 "Global work size : %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n",
512 (int)(local_work_size[0]), (int)(local_work_size[1]), (int)(local_work_size[2]),
513 (int)(global_work_size[0]), (int)(global_work_size[1]), plist->nsci*c_numClPerSupercl,
514 c_numClPerSupercl, plist->na_c);
517 fillin_ocl_structures(nbp, &nbparams_params);
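    /* The clSetKernelArg calls below must follow the parameter order of the selected
     * kernel's signature exactly; note that the leading ntypes argument is only set
     * when the LJ combination rule is not in use. */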
519 arg_no = 0;
520 cl_error = CL_SUCCESS;
521 if (!useLjCombRule(nb->nbparam->vdwtype))
523 cl_error = clSetKernelArg(nb_kernel, arg_no++, sizeof(int), &(adat->ntypes));
525 cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(nbparams_params), &(nbparams_params));
526 cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->xq));
527 cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->f));
528 cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->e_lj));
529 cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->e_el));
530 cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->fshift));
531 if (useLjCombRule(nb->nbparam->vdwtype))
533 cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->lj_comb));
535 else
537 cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->atom_types));
539 cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->shift_vec));
540 cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->nbfp_climg2d));
541 cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->nbfp_comb_climg2d));
542 cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->coulomb_tab_climg2d));
543 cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->sci));
544 cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->cj4));
545 cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->excl));
546 cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(int), &bCalcFshift);
547 cl_error |= clSetKernelArg(nb_kernel, arg_no++, shmem, NULL);
549 assert(cl_error == CL_SUCCESS);
551 if (cl_error)
553 printf("OpenCL error: %s\n", ocl_get_error_string(cl_error).c_str());
555 cl_error = clEnqueueNDRangeKernel(stream, nb_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, bDoTime ? t->nb_k[iloc].fetchNextEvent() : nullptr);
556 assert(cl_error == CL_SUCCESS);
558 if (bDoTime)
560 t->nb_k[iloc].closeTimingRegion(stream);
565 /*! \brief Calculates the amount of shared memory required by the prune kernel.
567 * Note that for the sake of simplicity we use the CUDA terminology "shared memory"
568 * for OpenCL local memory.
570 * \param[in] num_threads_z cj4 concurrency equal to the number of threads/work items in the 3-rd dimension.
571 * \returns the amount of local memory in bytes required by the pruning kernel
573 static inline int calc_shmem_required_prune(const int num_threads_z)
575 int shmem;
577 /* i-atom x in shared memory (for convenience we load all 4 components including q) */
578 shmem = c_numClPerSupercl * c_clSize * sizeof(float)*4;
579 /* cj in shared memory, for each warp separately
580 * Note: only need to load once per wavefront, but to keep the code simple,
581 * for now we load twice on AMD.
583 shmem += num_threads_z * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize * sizeof(int);
584 /* Warp vote, requires one uint per warp/32 threads per block. */
585 shmem += sizeof(cl_uint) * 2*num_threads_z;
587 return shmem;
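/* Rough numbers (illustrative), assuming c_clSize = 8, c_numClPerSupercl = 8,
 * c_nbnxnGpuClusterpairSplit = 2, c_nbnxnGpuJgroupSize = 4 and num_threads_z = 4:
 *   xq = 1024 B, cj = 4*2*4*4 B = 128 B, warp vote = 4*2*4 B = 32 B,
 * i.e. about 1.2 kB of local memory per work-group. */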
590 void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t *nb,
591 int iloc,
592 int numParts)
594 cl_int cl_error;
596 cl_atomdata_t *adat = nb->atdat;
597 cl_nbparam_t *nbp = nb->nbparam;
598 cl_plist_t *plist = nb->plist[iloc];
599 cl_timers_t *t = nb->timers;
600 cl_command_queue stream = nb->stream[iloc];
601 bool bDoTime = nb->bDoTime;
603 if (plist->haveFreshList)
605 GMX_ASSERT(numParts == 1, "With first pruning we expect 1 part");
607 /* Set rollingPruningNumParts to signal that it is not set */
608 plist->rollingPruningNumParts = 0;
609 plist->rollingPruningPart = 0;
611 else
613 if (plist->rollingPruningNumParts == 0)
615 plist->rollingPruningNumParts = numParts;
617 else
619 GMX_ASSERT(numParts == plist->rollingPruningNumParts, "It is not allowed to change numParts in between list generation steps");
623 /* Use a local variable for part and update in plist, so we can return here
624 * without duplicating the part increment code.
626 int part = plist->rollingPruningPart;
628 plist->rollingPruningPart++;
629 if (plist->rollingPruningPart >= plist->rollingPruningNumParts)
631 plist->rollingPruningPart = 0;
634 /* Compute the number of list entries to prune in this pass */
635 int numSciInPart = (plist->nsci - part)/numParts;
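    /* For example (illustrative): with numParts = 2, successive rolling-prune calls use
     * part = 0, 1, 0, 1, ... and each pass handles roughly nsci/2 of the sci entries. */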
637 /* Don't launch the kernel if there is no work to do. */
638 if (numSciInPart <= 0)
640 plist->haveFreshList = false;
642 return;
645 GpuRegionTimer *timer = nullptr;
646 if (bDoTime)
648 timer = &(plist->haveFreshList ? t->prune_k[iloc] : t->rollingPrune_k[iloc]);
651 /* beginning of timed prune calculation section */
652 if (bDoTime)
654 timer->openTimingRegion(stream);
657 /* Kernel launch config:
658 * - The thread block dimensions match the size of i-clusters, j-clusters,
659 * and j-cluster concurrency, in x, y, and z, respectively.
660 * - The 1D block-grid contains as many blocks as super-clusters.
662 int num_threads_z = getOclPruneKernelJ4Concurrency(nb->dev_info->vendor_e);
663 cl_kernel pruneKernel = selectPruneKernel(nb->kernel_pruneonly, plist->haveFreshList);
665 /* kernel launch config */
666 size_t local_work_size[3], global_work_size[3];
667 local_work_size[0] = c_clSize;
668 local_work_size[1] = c_clSize;
669 local_work_size[2] = num_threads_z;
671 global_work_size[0] = numSciInPart * local_work_size[0];
672 global_work_size[1] = 1 * local_work_size[1];
673 global_work_size[2] = 1 * local_work_size[2];
675 validate_global_work_size(global_work_size, 3, nb->dev_info);
677 int shmem = calc_shmem_required_prune(num_threads_z);
679 if (debug)
681 fprintf(debug, "Pruning GPU kernel launch configuration:\n\tLocal work size: %dx%dx%d\n\t"
682 "\tGlobal work size: %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n"
683 "\tShMem: %d\n",
684 (int)(local_work_size[0]), (int)(local_work_size[1]), (int)(local_work_size[2]),
685 (int)(global_work_size[0]), (int)(global_work_size[1]), plist->nsci*c_numClPerSupercl,
686 c_numClPerSupercl, plist->na_c, shmem);
689 cl_nbparam_params_t nbparams_params;
690 fillin_ocl_structures(nbp, &nbparams_params);
692 cl_uint arg_no = 0;
693 cl_error = CL_SUCCESS;
695 cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(nbparams_params), &(nbparams_params));
696 cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(cl_mem), &(adat->xq));
697 cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(cl_mem), &(adat->shift_vec));
698 cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(cl_mem), &(plist->sci));
699 cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(cl_mem), &(plist->cj4));
700 cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(cl_mem), &(plist->imask));
701 cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(int), &(numParts));
702 cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(int), &(part));
703 cl_error |= clSetKernelArg(pruneKernel, arg_no++, shmem, nullptr);
704 assert(cl_error == CL_SUCCESS);
706 cl_error = clEnqueueNDRangeKernel(stream, pruneKernel, 3,
707 nullptr, global_work_size, local_work_size,
708 0, nullptr, bDoTime ? timer->fetchNextEvent() : nullptr);
709 GMX_RELEASE_ASSERT(CL_SUCCESS == cl_error, ocl_get_error_string(cl_error).c_str());
711 if (plist->haveFreshList)
713 plist->haveFreshList = false;
714 /* Mark that pruning has been done */
715 nb->timers->didPrune[iloc] = true;
717 else
719 /* Mark that rolling pruning has been done */
720 nb->timers->didRollingPrune[iloc] = true;
723 if (bDoTime)
725 timer->closeTimingRegion(stream);
729 /*! \brief
730  * Asynchronously launches the download of nonbonded forces from the GPU
731 * (and energies/shift forces if required).
733 void nbnxn_gpu_launch_cpyback(gmx_nbnxn_ocl_t *nb,
734 const struct nbnxn_atomdata_t *nbatom,
735 int flags,
736 int aloc)
738 cl_int gmx_unused cl_error;
739 int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
741 /* determine interaction locality from atom locality */
742 int iloc = gpuAtomToInteractionLocality(aloc);
744 cl_atomdata_t *adat = nb->atdat;
745 cl_timers_t *t = nb->timers;
746 bool bDoTime = nb->bDoTime;
747 cl_command_queue stream = nb->stream[iloc];
749 bool bCalcEner = flags & GMX_FORCE_ENERGY;
750 int bCalcFshift = flags & GMX_FORCE_VIRIAL;
753 /* don't launch non-local copy-back if there was no non-local work to do */
754 if (canSkipWork(nb, iloc))
756 /* TODO An alternative way to signal that non-local work is
757 complete is to use a clEnqueueMarker+clEnqueueBarrier
758 pair. However, the use of bNonLocalStreamActive has the
759 advantage of being local to the host, so probably minimizes
760 overhead. Curiously, for NVIDIA OpenCL with an empty-domain
761 test case, overall simulation performance was higher with
762 the API calls, but this has not been tested on AMD OpenCL,
763 so could be worth considering in future. */
764 nb->bNonLocalStreamActive = false;
765 return;
768 getGpuAtomRange(adat, aloc, adat_begin, adat_len);
770 /* beginning of timed D2H section */
771 if (bDoTime)
773 t->nb_d2h[iloc].openTimingRegion(stream);
776 /* With DD the local D2H transfer can only start after the non-local
777 has been launched. */
778 if (iloc == eintLocal && nb->bNonLocalStreamActive)
780 sync_ocl_event(stream, &(nb->nonlocal_done));
783 /* DtoH f */
784 ocl_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f, adat_begin*3*sizeof(float),
785 (adat_len)* adat->f_elem_size, stream, bDoTime ? t->nb_d2h[iloc].fetchNextEvent() : nullptr);
787 /* kick off work */
788 cl_error = clFlush(stream);
789 assert(CL_SUCCESS == cl_error);
791 /* After the non-local D2H is launched the nonlocal_done event can be
792 recorded which signals that the local D2H can proceed. This event is not
793        placed after the non-local kernel because we need the non-local
794        data back first. */
795 if (iloc == eintNonlocal)
797 cl_error = clEnqueueMarkerWithWaitList(stream, 0, NULL, &(nb->nonlocal_done));
798 assert(CL_SUCCESS == cl_error);
799 nb->bNonLocalStreamActive = true;
802 /* only transfer energies in the local stream */
803 if (LOCAL_I(iloc))
805 /* DtoH fshift */
806 if (bCalcFshift)
808 ocl_copy_D2H_async(nb->nbst.fshift, adat->fshift, 0,
809 SHIFTS * adat->fshift_elem_size, stream, bDoTime ? t->nb_d2h[iloc].fetchNextEvent() : nullptr);
812 /* DtoH energies */
813 if (bCalcEner)
815 ocl_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, 0,
816 sizeof(float), stream, bDoTime ? t->nb_d2h[iloc].fetchNextEvent() : nullptr);
818 ocl_copy_D2H_async(nb->nbst.e_el, adat->e_el, 0,
819 sizeof(float), stream, bDoTime ? t->nb_d2h[iloc].fetchNextEvent() : nullptr);
823 if (bDoTime)
825 t->nb_d2h[iloc].closeTimingRegion(stream);
830 /*! \brief Selects the Ewald kernel type, analytical or tabulated, single or twin cut-off. */
831 int nbnxn_gpu_pick_ewald_kernel_type(bool bTwinCut)
833 bool bUseAnalyticalEwald, bForceAnalyticalEwald, bForceTabulatedEwald;
834 int kernel_type;
836 /* Benchmarking/development environment variables to force the use of
837 analytical or tabulated Ewald kernel. */
838 bForceAnalyticalEwald = (getenv("GMX_OCL_NB_ANA_EWALD") != NULL);
839 bForceTabulatedEwald = (getenv("GMX_OCL_NB_TAB_EWALD") != NULL);
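    /* Example (illustrative): running with GMX_OCL_NB_ANA_EWALD=1 in the environment
     * requests the analytical kernels, GMX_OCL_NB_TAB_EWALD=1 requests the tabulated
     * ones, and setting both triggers the inconsistency error below. */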
841 if (bForceAnalyticalEwald && bForceTabulatedEwald)
843 gmx_incons("Both analytical and tabulated Ewald OpenCL non-bonded kernels "
844 "requested through environment variables.");
847 /* OpenCL: By default, use analytical Ewald
848 * TODO: tabulated does not work, it needs fixing, see init_nbparam() in nbnxn_ocl_data_mgmt.cpp
850 * TODO: decide if dev_info parameter should be added to recognize NVIDIA CC>=3.0 devices.
853 //if ((dev_info->prop.major >= 3 || bForceAnalyticalEwald) && !bForceTabulatedEwald)
854 if ((1 || bForceAnalyticalEwald) && !bForceTabulatedEwald)
856 bUseAnalyticalEwald = true;
858 if (debug)
860 fprintf(debug, "Using analytical Ewald OpenCL kernels\n");
863 else
865 bUseAnalyticalEwald = false;
867 if (debug)
869 fprintf(debug, "Using tabulated Ewald OpenCL kernels\n");
873 /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
874 forces it (use it for debugging/benchmarking only). */
875 if (!bTwinCut && (getenv("GMX_OCL_NB_EWALD_TWINCUT") == NULL))
877 kernel_type = bUseAnalyticalEwald ? eelOclEWALD_ANA : eelOclEWALD_TAB;
879 else
881 kernel_type = bUseAnalyticalEwald ? eelOclEWALD_ANA_TWIN : eelOclEWALD_TAB_TWIN;
884 return kernel_type;