From 85d9a0aa677544c9a53d205ab715a662566169be Mon Sep 17 00:00:00 2001
From: Mark Abraham
Date: Fri, 17 Jan 2020 15:43:38 +0100
Subject: [PATCH] Update some nbnxm kernel constants to constexpr

The OpenCL kernels now use preprocessor constants of the same names
and values, which are set up on the JIT compilation command line. This
eliminates a header file that existed only for compatibility.

Also added support for the ocl_nbnxm_kernels testing target.

Change-Id: Ibf8f7864dd08b14aeecad095cdbe25b1fbacc765
---
 src/gromacs/nbnxm/constants.h                      | 74 ----------------------
 src/gromacs/nbnxm/cuda/nbnxm_cuda.cu               | 16 ++---
 src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh       | 41 ++++++------
 .../nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cuh     | 12 ++--
 src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh |  5 +-
 src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h          |  4 +-
 .../nbnxm/kernels_reference/kernel_gpu_ref.cpp     | 20 +++---
 .../nbnxm/kernels_reference/kernel_ref_inner.h     |  2 +-
 src/gromacs/nbnxm/kernels_simd_2xmm/kernel_outer.h |  2 +-
 src/gromacs/nbnxm/kernels_simd_4xm/kernel_outer.h  |  2 +-
 src/gromacs/nbnxm/opencl/CMakeLists.txt            |  9 ++-
 src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp             | 18 +++---
 src/gromacs/nbnxm/opencl/nbnxm_ocl_jit_support.cpp | 16 +++--
 src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel.clh      | 41 ++++++------
 .../nbnxm/opencl/nbnxm_ocl_kernel_pruneonly.clh    | 25 ++++----
 .../nbnxm/opencl/nbnxm_ocl_kernel_utils.clh        | 16 ++---
 src/gromacs/nbnxm/pairlist.h                       | 28 +++++++-
 17 files changed, 145 insertions(+), 186 deletions(-)
 delete mode 100644 src/gromacs/nbnxm/constants.h

diff --git a/src/gromacs/nbnxm/constants.h b/src/gromacs/nbnxm/constants.h
deleted file mode 100644
index 41a3e15f70..0000000000
--- a/src/gromacs/nbnxm/constants.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-
-/*! 
\internal \file - * - * \brief - * Declares constants for the module - * - * \author Berk Hess - * \ingroup module_nbnxm - */ - -#ifndef GMX_NBNXN_CONSTANTS_H -#define GMX_NBNXN_CONSTANTS_H - -/*! \brief Lower limit for square interaction distances in nonbonded kernels. - * - * For smaller values we will overflow when calculating r^-1 or r^-12, but - * to keep it simple we always apply the limit from the tougher r^-12 condition. - */ -#if GMX_DOUBLE -// Some double precision SIMD architectures use single precision in the first -// step, so although the double precision criterion would allow smaller rsq, -// we need to stay in single precision with some margin for the N-R iterations. -# define NBNXN_MIN_RSQ 1.0e-36 -#else -// The worst intermediate value we might evaluate is r^-12, which -// means we should ensure r^2 stays above pow(GMX_FLOAT_MAX,-1.0/6.0)*1.01 (some margin) -# define NBNXN_MIN_RSQ 3.82e-07f // r > 6.2e-4 -#endif - - -//! The number of clusters in a super-cluster, used for GPU -#define c_nbnxnGpuNumClusterPerSupercluster 8 - -/*! \brief With GPU kernels we group cluster pairs in 4 to optimize memory usage - * of integers containing 32 bits. - */ -#define c_nbnxnGpuJgroupSize (32 / c_nbnxnGpuNumClusterPerSupercluster) - -#endif diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu index 22da9946f1..359e6c5905 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu @@ -338,19 +338,19 @@ static inline int calc_shmem_required_nonbonded(const int num_thre /* size of shmem (force-buffers/xq/atom type preloading) */ /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */ /* i-atom x+q in shared memory */ - shmem = c_numClPerSupercl * c_clSize * sizeof(float4); + shmem = c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(float4); /* cj in shared memory, for each warp separately */ shmem += num_threads_z * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize * sizeof(int); if (nbp->vdwtype == evdwCuCUTCOMBGEOM || nbp->vdwtype == evdwCuCUTCOMBLB) { /* i-atom LJ combination parameters in shared memory */ - shmem += c_numClPerSupercl * c_clSize * sizeof(float2); + shmem += c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(float2); } else { /* i-atom types in shared memory */ - shmem += c_numClPerSupercl * c_clSize * sizeof(int); + shmem += c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(int); } return shmem; @@ -552,8 +552,8 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const In "\tGrid: %zux%zu\n\t#Super-clusters/clusters: %d/%d (%d)\n" "\tShMem: %zu\n", config.blockSize[0], config.blockSize[1], config.blockSize[2], config.gridSize[0], - config.gridSize[1], plist->nsci * c_numClPerSupercl, c_numClPerSupercl, plist->na_c, - config.sharedMemorySize); + config.gridSize[1], plist->nsci * c_nbnxnGpuNumClusterPerSupercluster, + c_nbnxnGpuNumClusterPerSupercluster, plist->na_c, config.sharedMemorySize); } auto* timingEvent = bDoTime ? 
t->interaction[iloc].nb_k.fetchNextEvent() : nullptr; @@ -582,7 +582,7 @@ static inline int calc_shmem_required_prune(const int num_threads_z) int shmem; /* i-atom x in shared memory */ - shmem = c_numClPerSupercl * c_clSize * sizeof(float4); + shmem = c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(float4); /* cj in shared memory, for each warp separately */ shmem += num_threads_z * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize * sizeof(int); @@ -676,8 +676,8 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c "\tGrid: %zux%zu\n\t#Super-clusters/clusters: %d/%d (%d)\n" "\tShMem: %zu\n", config.blockSize[0], config.blockSize[1], config.blockSize[2], config.gridSize[0], - config.gridSize[1], numSciInPart * c_numClPerSupercl, c_numClPerSupercl, - plist->na_c, config.sharedMemorySize); + config.gridSize[1], numSciInPart * c_nbnxnGpuNumClusterPerSupercluster, + c_nbnxnGpuNumClusterPerSupercluster, plist->na_c, config.sharedMemorySize); } auto* timingEvent = bDoTime ? timer->fetchNextEvent() : nullptr; diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh index 99b1f048cf..89f75da5f9 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh @@ -244,11 +244,11 @@ __launch_bounds__(THREADS_PER_BLOCK) unsigned int wexcl, imask, mask_ji; float4 xqbuf; float3 xi, xj, rv, f_ij, fcj_buf; - float3 fci_buf[c_numClPerSupercl]; /* i force buffer */ + float3 fci_buf[c_nbnxnGpuNumClusterPerSupercluster]; /* i force buffer */ nbnxn_sci_t nb_sci; - /*! i-cluster interaction mask for a super-cluster with all c_numClPerSupercl=8 bits set */ - const unsigned superClInteractionMask = ((1U << c_numClPerSupercl) - 1U); + /*! i-cluster interaction mask for a super-cluster with all c_nbnxnGpuNumClusterPerSupercluster=8 bits set */ + const unsigned superClInteractionMask = ((1U << c_nbnxnGpuNumClusterPerSupercluster) - 1U); /********************************************************************* * Set up shared memory pointers. 
@@ -262,7 +262,7 @@ __launch_bounds__(THREADS_PER_BLOCK) /* shmem buffer for i x+q pre-loading */ float4* xqib = (float4*)sm_nextSlotPtr; - sm_nextSlotPtr += (c_numClPerSupercl * c_clSize * sizeof(*xqib)); + sm_nextSlotPtr += (c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(*xqib)); /* shmem buffer for cj, for each warp separately */ int* cjs = (int*)(sm_nextSlotPtr); @@ -273,11 +273,11 @@ __launch_bounds__(THREADS_PER_BLOCK) # ifndef LJ_COMB /* shmem buffer for i atom-type pre-loading */ int* atib = (int*)sm_nextSlotPtr; - sm_nextSlotPtr += (c_numClPerSupercl * c_clSize * sizeof(*atib)); + sm_nextSlotPtr += (c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(*atib)); # else /* shmem buffer for i-atom LJ combination rule parameters */ float2* ljcpib = (float2*)sm_nextSlotPtr; - sm_nextSlotPtr += (c_numClPerSupercl * c_clSize * sizeof(*ljcpib)); + sm_nextSlotPtr += (c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(*ljcpib)); # endif /*********************************************************************/ @@ -289,7 +289,7 @@ __launch_bounds__(THREADS_PER_BLOCK) if (tidxz == 0) { /* Pre-load i-atom x and q into shared memory */ - ci = sci * c_numClPerSupercl + tidxj; + ci = sci * c_nbnxnGpuNumClusterPerSupercluster + tidxj; ai = ci * c_clSize + tidxi; float* shiftptr = (float*)&shift_vec[nb_sci.shift]; @@ -307,7 +307,7 @@ __launch_bounds__(THREADS_PER_BLOCK) } __syncthreads(); - for (i = 0; i < c_numClPerSupercl; i++) + for (i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++) { fci_buf[i] = make_float3(0.0f); } @@ -324,10 +324,10 @@ __launch_bounds__(THREADS_PER_BLOCK) E_el = 0.0f; # ifdef EXCLUSION_FORCES /* Ewald or RF */ - if (nb_sci.shift == CENTRAL && pl_cj4[cij4_start].cj[0] == sci * c_numClPerSupercl) + if (nb_sci.shift == CENTRAL && pl_cj4[cij4_start].cj[0] == sci * c_nbnxnGpuNumClusterPerSupercluster) { /* we have the diagonal: add the charge and LJ self interaction energy term */ - for (i = 0; i < c_numClPerSupercl; i++) + for (i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++) { # if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF qi = xqib[i * c_clSize + tidxi].w; @@ -336,12 +336,13 @@ __launch_bounds__(THREADS_PER_BLOCK) # ifdef LJ_EWALD # if DISABLE_CUDA_TEXTURES - E_lj += LDG( - &nbparam.nbfp[atom_types[(sci * c_numClPerSupercl + i) * c_clSize + tidxi] * (ntypes + 1) * 2]); + E_lj += LDG(&nbparam.nbfp[atom_types[(sci * c_nbnxnGpuNumClusterPerSupercluster + i) * c_clSize + tidxi] + * (ntypes + 1) * 2]); # else E_lj += tex1Dfetch( nbparam.nbfp_texobj, - atom_types[(sci * c_numClPerSupercl + i) * c_clSize + tidxi] * (ntypes + 1) * 2); + atom_types[(sci * c_nbnxnGpuNumClusterPerSupercluster + i) * c_clSize + tidxi] + * (ntypes + 1) * 2); # endif # endif } @@ -397,9 +398,9 @@ __launch_bounds__(THREADS_PER_BLOCK) Tested with up to nvcc 7.5 */ for (jm = 0; jm < c_nbnxnGpuJgroupSize; jm++) { - if (imask & (superClInteractionMask << (jm * c_numClPerSupercl))) + if (imask & (superClInteractionMask << (jm * c_nbnxnGpuNumClusterPerSupercluster))) { - mask_ji = (1U << (jm * c_numClPerSupercl)); + mask_ji = (1U << (jm * c_nbnxnGpuNumClusterPerSupercluster)); cj = cjs[jm + (tidxj & 4) * c_nbnxnGpuJgroupSize / c_splitClSize]; aj = cj * c_clSize + tidxj; @@ -419,11 +420,11 @@ __launch_bounds__(THREADS_PER_BLOCK) # if !defined PRUNE_NBL # pragma unroll 8 # endif - for (i = 0; i < c_numClPerSupercl; i++) + for (i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++) { if (imask & mask_ji) { - ci = sci * c_numClPerSupercl + i; /* i cluster index */ + ci = sci * 
c_nbnxnGpuNumClusterPerSupercluster + i; /* i cluster index */ /* all threads load an atom from i cluster ci into shmem! */ xqbuf = xqib[i * c_clSize + tidxi]; @@ -475,7 +476,7 @@ __launch_bounds__(THREADS_PER_BLOCK) # endif /* LJ_COMB */ // Ensure distance do not become so small that r^-12 overflows - r2 = max(r2, NBNXN_MIN_RSQ); + r2 = max(r2, c_nbnxnMinDistanceSquared); inv_r = rsqrt(r2); inv_r2 = inv_r * inv_r; @@ -629,9 +630,9 @@ __launch_bounds__(THREADS_PER_BLOCK) float fshift_buf = 0.0f; /* reduce i forces */ - for (i = 0; i < c_numClPerSupercl; i++) + for (i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++) { - ai = (sci * c_numClPerSupercl + i) * c_clSize + tidxi; + ai = (sci * c_nbnxnGpuNumClusterPerSupercluster + i) * c_clSize + tidxi; reduce_force_i_warp_shfl(fci_buf[i], f, &fshift_buf, bCalcFshift, tidxj, ai, c_fullWarpMask); } diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cuh b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cuh index 8570e55c0b..e9c5b51143 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cuh +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cuh @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2016,2017,2018,2019, by the GROMACS development team, led by + * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -152,7 +152,7 @@ nbnxn_kernel_prune_cuda(const cu_atomdata_t, const cu_nbparam_t, const cu /* shmem buffer for i x+q pre-loading */ float4* xib = (float4*)sm_nextSlotPtr; - sm_nextSlotPtr += (c_numClPerSupercl * c_clSize * sizeof(*xib)); + sm_nextSlotPtr += (c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(*xib)); /* shmem buffer for cj, for each warp separately */ int* cjs = (int*)(sm_nextSlotPtr); @@ -171,7 +171,7 @@ nbnxn_kernel_prune_cuda(const cu_atomdata_t, const cu_nbparam_t, const cu if (tidxz == 0) { /* Pre-load i-atom x and q into shared memory */ - int ci = sci * c_numClPerSupercl + tidxj; + int ci = sci * c_nbnxnGpuNumClusterPerSupercluster + tidxj; int ai = ci * c_clSize + tidxi; /* We don't need q, but using float4 in shmem avoids bank conflicts. 
@@ -220,9 +220,9 @@ nbnxn_kernel_prune_cuda(const cu_atomdata_t, const cu_nbparam_t, const cu # pragma unroll 4 for (int jm = 0; jm < c_nbnxnGpuJgroupSize; jm++) { - if (imaskCheck & (superClInteractionMask << (jm * c_numClPerSupercl))) + if (imaskCheck & (superClInteractionMask << (jm * c_nbnxnGpuNumClusterPerSupercluster))) { - unsigned int mask_ji = (1U << (jm * c_numClPerSupercl)); + unsigned int mask_ji = (1U << (jm * c_nbnxnGpuNumClusterPerSupercluster)); int cj = cjs[jm + (tidxj & 4) * c_nbnxnGpuJgroupSize / c_splitClSize]; int aj = cj * c_clSize + tidxj; @@ -232,7 +232,7 @@ nbnxn_kernel_prune_cuda(const cu_atomdata_t, const cu_nbparam_t, const cu float3 xj = make_float3(tmp.x, tmp.y, tmp.z); # pragma unroll 8 - for (int i = 0; i < c_numClPerSupercl; i++) + for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++) { if (imaskCheck & mask_ji) { diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh index 6fc2e4523d..cbca9452b9 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh @@ -68,8 +68,9 @@ static const int __device__ c_clSizeSq = c_clSize * c_clSize; static const int __device__ c_splitClSize = c_clSize / c_nbnxnGpuClusterpairSplit; /*! \brief Stride in the force accumualation buffer */ static const int __device__ c_fbufStride = c_clSizeSq; -/*! \brief i-cluster interaction mask for a super-cluster with all c_numClPerSupercl=8 bits set */ -static const unsigned __device__ superClInteractionMask = ((1U << c_numClPerSupercl) - 1U); +/*! \brief i-cluster interaction mask for a super-cluster with all c_nbnxnGpuNumClusterPerSupercluster=8 bits set */ +static const unsigned __device__ superClInteractionMask = + ((1U << c_nbnxnGpuNumClusterPerSupercluster) - 1U); static const float __device__ c_oneSixth = 0.16666667f; static const float __device__ c_oneTwelveth = 0.08333333f; diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h index be7d861639..863e6a1efc 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h @@ -72,10 +72,8 @@ const int c_cudaPruneKernelJ4Concurrency = GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY /* TODO: consider moving this to kernel_utils */ /* Convenience defines */ -/*! \brief number of clusters per supercluster. */ -static const int c_numClPerSupercl = c_nbnxnGpuNumClusterPerSupercluster; /*! \brief cluster size = number of atoms per cluster. */ -static const int c_clSize = c_nbnxnGpuClusterSize; +static constexpr int c_clSize = c_nbnxnGpuClusterSize; /*! \brief Electrostatic CUDA kernel flavors. 
* diff --git a/src/gromacs/nbnxm/kernels_reference/kernel_gpu_ref.cpp b/src/gromacs/nbnxm/kernels_reference/kernel_gpu_ref.cpp index ce7a72c2ff..00b7c647d5 100644 --- a/src/gromacs/nbnxm/kernels_reference/kernel_gpu_ref.cpp +++ b/src/gromacs/nbnxm/kernels_reference/kernel_gpu_ref.cpp @@ -52,8 +52,7 @@ #include "gromacs/pbcutil/ishift.h" #include "gromacs/utility/fatalerror.h" -static const int c_numClPerSupercl = c_nbnxnGpuNumClusterPerSupercluster; -static const int c_clSize = c_nbnxnGpuClusterSize; +static constexpr int c_clSize = c_nbnxnGpuClusterSize; void nbnxn_kernel_gpu_ref(const NbnxnPairlistGpu* nbl, const nbnxn_atomdata_t* nbat, @@ -151,14 +150,14 @@ void nbnxn_kernel_gpu_ref(const NbnxnPairlistGpu* nbl, vctot = 0; Vvdwtot = 0; - if (nbln.shift == CENTRAL && nbl->cj4[cj4_ind0].cj[0] == sci * c_numClPerSupercl) + if (nbln.shift == CENTRAL && nbl->cj4[cj4_ind0].cj[0] == sci * c_nbnxnGpuNumClusterPerSupercluster) { /* we have the diagonal: * add the charge self interaction energy term */ - for (im = 0; im < c_numClPerSupercl; im++) + for (im = 0; im < c_nbnxnGpuNumClusterPerSupercluster; im++) { - ci = sci * c_numClPerSupercl + im; + ci = sci * c_nbnxnGpuNumClusterPerSupercluster + im; for (ic = 0; ic < c_clSize; ic++) { ia = ci * c_clSize + ic; @@ -186,16 +185,17 @@ void nbnxn_kernel_gpu_ref(const NbnxnPairlistGpu* nbl, { cj = nbl->cj4[cj4_ind].cj[jm]; - for (im = 0; im < c_numClPerSupercl; im++) + for (im = 0; im < c_nbnxnGpuNumClusterPerSupercluster; im++) { /* We're only using the first imask, * but here imei[1].imask is identical. */ - if ((nbl->cj4[cj4_ind].imei[0].imask >> (jm * c_numClPerSupercl + im)) & 1) + if ((nbl->cj4[cj4_ind].imei[0].imask >> (jm * c_nbnxnGpuNumClusterPerSupercluster + im)) + & 1) { gmx_bool within_rlist; - ci = sci * c_numClPerSupercl + im; + ci = sci * c_nbnxnGpuNumClusterPerSupercluster + im; within_rlist = FALSE; npair = 0; @@ -228,7 +228,7 @@ void nbnxn_kernel_gpu_ref(const NbnxnPairlistGpu* nbl, c_nbnxnGpuClusterSize / c_nbnxnGpuClusterpairSplit; int_bit = static_cast( (excl[jc / clusterPerSplit]->pair[(jc & (clusterPerSplit - 1)) * c_clSize + ic] - >> (jm * c_numClPerSupercl + im)) + >> (jm * c_nbnxnGpuNumClusterPerSupercluster + im)) & 1); js = ja * nbat->xstride; @@ -255,7 +255,7 @@ void nbnxn_kernel_gpu_ref(const NbnxnPairlistGpu* nbl, } // Ensure distance do not become so small that r^-12 overflows - rsq = std::max(rsq, NBNXN_MIN_RSQ); + rsq = std::max(rsq, c_nbnxnMinDistanceSquared); rinv = gmx::invsqrt(rsq); rinvsq = rinv * rinv; diff --git a/src/gromacs/nbnxm/kernels_reference/kernel_ref_inner.h b/src/gromacs/nbnxm/kernels_reference/kernel_ref_inner.h index 8bc5aab82a..59d9147aa6 100644 --- a/src/gromacs/nbnxm/kernels_reference/kernel_ref_inner.h +++ b/src/gromacs/nbnxm/kernels_reference/kernel_ref_inner.h @@ -128,7 +128,7 @@ // Ensure the distances do not fall below the limit where r^-12 overflows. // This should never happen for normal interactions. 
-            rsq = std::max(rsq, NBNXN_MIN_RSQ);
+            rsq = std::max(rsq, c_nbnxnMinDistanceSquared);

 #ifdef COUNT_PAIRS
             npair++;
diff --git a/src/gromacs/nbnxm/kernels_simd_2xmm/kernel_outer.h b/src/gromacs/nbnxm/kernels_simd_2xmm/kernel_outer.h
index 7cf25805a5..17e8f894fa 100644
--- a/src/gromacs/nbnxm/kernels_simd_2xmm/kernel_outer.h
+++ b/src/gromacs/nbnxm/kernels_simd_2xmm/kernel_outer.h
@@ -311,7 +311,7 @@
     rcvdw2_S = SimdReal(ic->rvdw * ic->rvdw);
 #endif

-    minRsq_S = SimdReal(NBNXN_MIN_RSQ);
+    minRsq_S = SimdReal(c_nbnxnMinDistanceSquared);

     const real* gmx_restrict q     = nbatParams.q.data();
     const real               facel = ic->epsfac;
diff --git a/src/gromacs/nbnxm/kernels_simd_4xm/kernel_outer.h b/src/gromacs/nbnxm/kernels_simd_4xm/kernel_outer.h
index e115c9b8ce..3559f2bbf1 100644
--- a/src/gromacs/nbnxm/kernels_simd_4xm/kernel_outer.h
+++ b/src/gromacs/nbnxm/kernels_simd_4xm/kernel_outer.h
@@ -329,7 +329,7 @@
     rcvdw2_S = SimdReal(ic->rvdw * ic->rvdw);
 #endif

-    minRsq_S = SimdReal(NBNXN_MIN_RSQ);
+    minRsq_S = SimdReal(c_nbnxnMinDistanceSquared);

     const real* gmx_restrict q     = nbatParams.q.data();
     const real               facel = ic->epsfac;
diff --git a/src/gromacs/nbnxm/opencl/CMakeLists.txt b/src/gromacs/nbnxm/opencl/CMakeLists.txt
index 9b573495d7..0c36906457 100644
--- a/src/gromacs/nbnxm/opencl/CMakeLists.txt
+++ b/src/gromacs/nbnxm/opencl/CMakeLists.txt
@@ -71,11 +71,18 @@ foreach(ELEC_DEF IN LISTS ELEC_DEFS)
         string(REGEX REPLACE ".*=" "" ELEC_NAME "${ELEC_DEF}")
         string(REGEX REPLACE ".*=" "" VDW_NAME "${VDW_DEF}")
         set(OBJ_FILE nbnxm_ocl_kernel${ELEC_NAME}${VDW_NAME}_${VENDOR}.o)
+        # The constants below duplicate various others (e.g. from pairlist.h)
+        # but as the kernels compiled here are not used for production,
+        # it is OK if the values fall out of sync.
         add_custom_command(OUTPUT ${OBJ_FILE} COMMAND ${OCL_COMPILER}
            ${CMAKE_CURRENT_SOURCE_DIR}/nbnxm_ocl_kernels.cl ${CLANG_TIDY_ARGS}
            -Xclang -finclude-default-header  -D_${VENDOR}_SOURCE_
            -DGMX_OCL_FASTGEN ${ELEC_DEF} ${VDW_DEF}
-           -DNBNXN_GPU_CLUSTER_SIZE=${CLUSTER_SIZE} -DIATYPE_SHMEM
+           -Dc_nbnxnGpuClusterSize=${CLUSTER_SIZE}
+           -Dc_nbnxnMinDistanceSquared=3.82e-07F
+           -Dc_nbnxnGpuNumClusterPerSupercluster=8
+           -Dc_nbnxnGpuJgroupSize=4
+           -DIATYPE_SHMEM
            -c -I ${CMAKE_SOURCE_DIR}/src -std=cl1.2
            -Weverything  -Wno-conversion -Wno-missing-variable-declarations -Wno-used-but-marked-unused
            -Wno-cast-align -Wno-incompatible-pointer-types
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
index 9da920db76..d0e693f85f 100644
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
@@ -95,8 +95,7 @@ namespace Nbnxm

 /*! \brief Convenience constants */
 //@{
-static const int c_numClPerSupercl = c_nbnxnGpuNumClusterPerSupercluster;
-static const int c_clSize          = c_nbnxnGpuClusterSize;
+static constexpr int c_clSize = c_nbnxnGpuClusterSize;
 //@}


@@ -396,7 +395,7 @@ static inline int calc_shmem_required_nonbonded(int vdwType, bool bPrefetchLjPar
     /* size of shmem (force-buffers/xq/atom type preloading) */
     /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
     /* i-atom x+q in shared memory */
-    shmem = c_numClPerSupercl * c_clSize * sizeof(float) * 4; /* xqib */
+    shmem = c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(float) * 4; /* xqib */
     /* cj in shared memory, for both warps separately
      * TODO: in the "nowarp kernels we load cj only once  so the factor 2 is not needed. 
*/ @@ -406,12 +405,13 @@ static inline int calc_shmem_required_nonbonded(int vdwType, bool bPrefetchLjPar if (useLjCombRule(vdwType)) { /* i-atom LJ combination parameters in shared memory */ - shmem += c_numClPerSupercl * c_clSize * 2 * sizeof(float); /* atib abused for ljcp, float2 */ + shmem += c_nbnxnGpuNumClusterPerSupercluster * c_clSize * 2 + * sizeof(float); /* atib abused for ljcp, float2 */ } else { /* i-atom types in shared memory */ - shmem += c_numClPerSupercl * c_clSize * sizeof(int); /* atib */ + shmem += c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(int); /* atib */ } } /* force reduction buffers in shared memory */ @@ -643,7 +643,8 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nb "Global work size : %zux%zu\n\t#Super-clusters/clusters: %d/%d (%d)\n", config.blockSize[0], config.blockSize[1], config.blockSize[2], config.blockSize[0] * config.gridSize[0], config.blockSize[1] * config.gridSize[1], - plist->nsci * c_numClPerSupercl, c_numClPerSupercl, plist->na_c); + plist->nsci * c_nbnxnGpuNumClusterPerSupercluster, + c_nbnxnGpuNumClusterPerSupercluster, plist->na_c); } fillin_ocl_structures(nbp, &nbparams_params); @@ -697,7 +698,7 @@ static inline int calc_shmem_required_prune(const int num_threads_z) int shmem; /* i-atom x in shared memory (for convenience we load all 4 components including q) */ - shmem = c_numClPerSupercl * c_clSize * sizeof(float) * 4; + shmem = c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(float) * 4; /* cj in shared memory, for each warp separately * Note: only need to load once per wavefront, but to keep the code simple, * for now we load twice on AMD. @@ -803,7 +804,8 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c "\tShMem: %zu\n", config.blockSize[0], config.blockSize[1], config.blockSize[2], config.blockSize[0] * config.gridSize[0], config.blockSize[1] * config.gridSize[1], - plist->nsci * c_numClPerSupercl, c_numClPerSupercl, plist->na_c, config.sharedMemorySize); + plist->nsci * c_nbnxnGpuNumClusterPerSupercluster, + c_nbnxnGpuNumClusterPerSupercluster, plist->na_c, config.sharedMemorySize); } cl_nbparam_params_t nbparams_params; diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_jit_support.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl_jit_support.cpp index dba6415149..7374d433d3 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_jit_support.cpp +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_jit_support.cpp @@ -184,16 +184,18 @@ void nbnxn_gpu_compile_kernels(NbnxmGpu* nb) std::string extraDefines = makeDefinesForKernelTypes(bFastGen, nb->nbparam->eeltype, nb->nbparam->vdwtype); - /* Here we pass macros and static const int variables defined + /* Here we pass macros and static const/constexpr int variables defined * in include files outside the opencl as macros, to avoid - * including those files in the JIT compilation that happens - * at runtime. This is particularly a problem for headers that - * depend on config.h, such as pairlist.h. */ + * including those files in the plain-C JIT compilation that happens + * at runtime. */ extraDefines += gmx::formatString( - " -DNBNXN_GPU_CLUSTER_SIZE=%d " + " -Dc_nbnxnGpuClusterSize=%d" + " -Dc_nbnxnMinDistanceSquared=%g" + " -Dc_nbnxnGpuNumClusterPerSupercluster=%d" + " -Dc_nbnxnGpuJgroupSize=%d" "%s", - c_nbnxnGpuClusterSize, /* Defined in nbnxn_pairlist.h */ - (nb->bPrefetchLjParam) ? 
"-DIATYPE_SHMEM" : ""); + c_nbnxnGpuClusterSize, c_nbnxnMinDistanceSquared, c_nbnxnGpuNumClusterPerSupercluster, + c_nbnxnGpuJgroupSize, (nb->bPrefetchLjParam) ? " -DIATYPE_SHMEM" : ""); try { /* TODO when we have a proper MPI-aware logging module, diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel.clh b/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel.clh index cbdb45f01a..b238843a91 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel.clh +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel.clh @@ -2,7 +2,7 @@ * This file is part of the GROMACS molecular simulation package. * * Copyright (c) 2012-2018, The GROMACS development team. - * Copyright (c) 2019, by the GROMACS development team, led by + * Copyright (c) 2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -177,10 +177,10 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) const int bidx = get_group_id(0); const int widx = tidx / WARP_SIZE; /* warp index */ - /*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL=8 bits set */ - const unsigned superClInteractionMask = ((1U << NCL_PER_SUPERCL) - 1U); + /*! i-cluster interaction mask for a super-cluster with all c_nbnxnGpuNumClusterPerSupercluster=8 bits set */ + const unsigned superClInteractionMask = ((1U << c_nbnxnGpuNumClusterPerSupercluster) - 1U); -#define LOCAL_OFFSET (xqib + NCL_PER_SUPERCL * CL_SIZE) +#define LOCAL_OFFSET (xqib + c_nbnxnGpuNumClusterPerSupercluster * CL_SIZE) CjType cjs = 0; #if USE_CJ_PREFETCH /* shmem buffer for cj, for both warps separately */ @@ -194,11 +194,11 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) /* shmem buffer for i atom-type pre-loading */ __local int* atib = (__local int*)(LOCAL_OFFSET); //NOLINT(google-readability-casting) # undef LOCAL_OFFSET -# define LOCAL_OFFSET (atib + NCL_PER_SUPERCL * CL_SIZE) +# define LOCAL_OFFSET (atib + c_nbnxnGpuNumClusterPerSupercluster * CL_SIZE) # else __local float2* ljcpib = (__local float2*)(LOCAL_OFFSET); # undef LOCAL_OFFSET -# define LOCAL_OFFSET (ljcpib + NCL_PER_SUPERCL * CL_SIZE) +# define LOCAL_OFFSET (ljcpib + c_nbnxnGpuNumClusterPerSupercluster * CL_SIZE) # endif #endif @@ -225,10 +225,10 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) const int cij4_start = nb_sci.cj4_ind_start; /* first ...*/ const int cij4_end = nb_sci.cj4_ind_end; /* and last index of j clusters */ - for (int i = 0; i < NCL_PER_SUPERCL; i += CL_SIZE) + for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i += CL_SIZE) { /* Pre-load i-atom x and q into shared memory */ - const int ci = sci * NCL_PER_SUPERCL + tidxj + i; + const int ci = sci * c_nbnxnGpuNumClusterPerSupercluster + tidxj + i; const int ai = ci * CL_SIZE + tidxi; float4 xqbuf = xq[ai] @@ -254,8 +254,8 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) #endif barrier(CLK_LOCAL_MEM_FENCE); - float3 fci_buf[NCL_PER_SUPERCL]; /* i force buffer */ - for (int ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++) + float3 fci_buf[c_nbnxnGpuNumClusterPerSupercluster]; /* i force buffer */ + for (int ci_offset = 0; ci_offset < c_nbnxnGpuNumClusterPerSupercluster; ci_offset++) { fci_buf[ci_offset] = (float3)(0.0F); } @@ -272,17 +272,18 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) float E_el = 0.0F; # if defined EXCLUSION_FORCES /* Ewald or RF */ - if (nb_sci.shift == CENTRAL && 
pl_cj4[cij4_start].cj[0] == sci * NCL_PER_SUPERCL)
+        if (nb_sci.shift == CENTRAL && pl_cj4[cij4_start].cj[0] == sci * c_nbnxnGpuNumClusterPerSupercluster)
         {
             /* we have the diagonal: add the charge and LJ self interaction energy term */
-            for (int i = 0; i < NCL_PER_SUPERCL; i++)
+            for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
             {
 #    if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF
                 const float qi = xqib[i * CL_SIZE + tidxi].w;
                 E_el += qi * qi;
 #    endif
 #    if defined LJ_EWALD
-                E_lj += nbfp_climg2d[atom_types[(sci * NCL_PER_SUPERCL + i) * CL_SIZE + tidxi] * (ntypes + 1) * 2];
+                E_lj += nbfp_climg2d[atom_types[(sci * c_nbnxnGpuNumClusterPerSupercluster + i) * CL_SIZE + tidxi]
+                                     * (ntypes + 1) * 2];
 #    endif /* LJ_EWALD */
             }
@@ -335,9 +336,9 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
 #endif
         for (int jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
         {
-            if (imask & (superClInteractionMask << (jm * NCL_PER_SUPERCL)))
+            if (imask & (superClInteractionMask << (jm * c_nbnxnGpuNumClusterPerSupercluster)))
             {
-                unsigned int mask_ji = (1U << (jm * NCL_PER_SUPERCL));
+                unsigned int mask_ji = (1U << (jm * c_nbnxnGpuNumClusterPerSupercluster));

                 const int cj = loadCj(cjs, pl_cj4[j4].cj, jm, tidxi, tidxj);
                 const int aj = cj * CL_SIZE + tidxj;
@@ -357,11 +358,11 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
 #if !defined PRUNE_NBL
 #    pragma unroll 8
 #endif
-                for (int i = 0; i < NCL_PER_SUPERCL; i++)
+                for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
                 {
                     if (imask & mask_ji)
                     {
-                        const int gmx_unused ci = sci * NCL_PER_SUPERCL + i; /* i cluster index */
+                        const int gmx_unused ci = sci * c_nbnxnGpuNumClusterPerSupercluster + i; /* i cluster index */

                         /* all threads load an atom from i cluster ci into shmem! */
                         const float4 xiqbuf = xqib[i * CL_SIZE + tidxi];
@@ -423,8 +424,10 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
 #    endif /* LJ_COMB_GEOM */
 #endif     /* LJ_COMB */

-                        // Ensure distance do not become so small that r^-12 overflows
-                        r2 = max(r2, NBNXN_MIN_RSQ);
+                        // Ensure distances do not become so small that r^-12 overflows.
+                        // Cast to float to ensure the correct built-in max() function
+                        // is called.
+                        r2 = max(r2, (float)c_nbnxnMinDistanceSquared);

                         const float inv_r  = rsqrt(r2);
                         const float inv_r2 = inv_r * inv_r;
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel_pruneonly.clh b/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel_pruneonly.clh
index 475c1d68b0..16dd0c4962 100644
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel_pruneonly.clh
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel_pruneonly.clh
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2016,2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -101,24 +101,23 @@ nbnxn_kernel_prune_rolling_opencl
 #endif

     // TODO move these consts to utils and unify their use with the nonbonded kernels
-    const int c_numClPerSupercl = NCL_PER_SUPERCL;
-    const int c_clSize          = CL_SIZE;
+    const int c_clSize = CL_SIZE;

     // TODO pass this value at compile-time as a macro
     const int c_nbnxnGpuClusterpairSplit = 2;

-    /*! 
i-cluster interaction mask for a super-cluster with all c_numClPerSupercl=8 bits set */ - const unsigned superClInteractionMask = ((1U << c_numClPerSupercl) - 1U); + /*! i-cluster interaction mask for a super-cluster with all c_nbnxnGpuNumClusterPerSupercluster=8 bits set */ + const unsigned superClInteractionMask = ((1U << c_nbnxnGpuNumClusterPerSupercluster) - 1U); -#define LOCAL_OFFSET (xib + c_numClPerSupercl * c_clSize) +#define LOCAL_OFFSET (xib + c_nbnxnGpuNumClusterPerSupercluster * c_clSize) /* shmem buffer for i cj pre-loading */ CjType cjs = 0; #if USE_CJ_PREFETCH cjs = (((__local int*)(LOCAL_OFFSET)) + tidxz * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize); # undef LOCAL_OFFSET /* Offset calculated using xib because cjs depends on on tidxz! */ -# define LOCAL_OFFSET \ - (((__local int*)(xib + c_numClPerSupercl * c_clSize)) \ +# define LOCAL_OFFSET \ + (((__local int*)(xib + c_nbnxnGpuNumClusterPerSupercluster * c_clSize)) \ + (NTHREAD_Z * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize)) #endif #if !USE_SUBGROUP_ANY @@ -147,10 +146,10 @@ nbnxn_kernel_prune_rolling_opencl if (tidxz == 0) { - for (int i = 0; i < NCL_PER_SUPERCL; i += CL_SIZE) + for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i += CL_SIZE) { /* Pre-load i-atom x and q into shared memory */ - const int ci = sci * c_numClPerSupercl + tidxj + i; + const int ci = sci * c_nbnxnGpuNumClusterPerSupercluster + tidxj + i; const int ai = ci * c_clSize + tidxi; /* We don't need q, but using float4 in shmem avoids bank conflicts */ @@ -194,9 +193,9 @@ nbnxn_kernel_prune_rolling_opencl #pragma unroll 4 for (int jm = 0; jm < c_nbnxnGpuJgroupSize; jm++) { - if (imaskCheck & (superClInteractionMask << (jm * c_numClPerSupercl))) + if (imaskCheck & (superClInteractionMask << (jm * c_nbnxnGpuNumClusterPerSupercluster))) { - unsigned int mask_ji = (1U << (jm * c_numClPerSupercl)); + unsigned int mask_ji = (1U << (jm * c_nbnxnGpuNumClusterPerSupercluster)); const int cj = loadCj(cjs, pl_cj4[j4].cj, jm, tidxi, tidxj); const int aj = cj * c_clSize + tidxj; @@ -206,7 +205,7 @@ nbnxn_kernel_prune_rolling_opencl const float3 xj = (float3)(tmp.xyz); #pragma unroll 8 - for (int i = 0; i < c_numClPerSupercl; i++) + for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++) { if (imaskCheck & mask_ji) { diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel_utils.clh b/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel_utils.clh index 05f66de557..7d70a5908a 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel_utils.clh +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel_utils.clh @@ -47,13 +47,11 @@ #include "gromacs/gpu_utils/device_utils.clh" #include "gromacs/gpu_utils/vectype_ops.clh" -#include "gromacs/nbnxm/constants.h" #include "gromacs/pbcutil/ishift.h" #include "nbnxm_ocl_consts.h" -#define CL_SIZE (NBNXN_GPU_CLUSTER_SIZE) -#define NCL_PER_SUPERCL c_nbnxnGpuNumClusterPerSupercluster +#define CL_SIZE (c_nbnxnGpuClusterSize) #define WARP_SIZE (CL_SIZE * CL_SIZE / 2) // Currently only c_nbnxnGpuClusterpairSplit=2 supported @@ -233,8 +231,8 @@ typedef struct */ } nbnxn_excl_t; -/*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */ -__constant unsigned supercl_interaction_mask = ((1U << NCL_PER_SUPERCL) - 1U); +/*! 
i-cluster interaction mask for a super-cluster with all c_nbnxnGpuNumClusterPerSupercluster bits set */ +__constant unsigned supercl_interaction_mask = ((1U << c_nbnxnGpuNumClusterPerSupercluster) - 1U); gmx_opencl_inline void preloadCj4Generic(__local int* sm_cjPreload, const __global int* gm_cj, @@ -698,9 +696,9 @@ gmx_opencl_inline void reduce_force_i_and_shift_shfl(float3* fci_buf, /* Only does reduction over 4 elements in cluster (2 per warp). Needs to be changed * for CL_SIZE>4.*/ float2 fshift_buf = 0; - for (int ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++) + for (int ci_offset = 0; ci_offset < c_nbnxnGpuNumClusterPerSupercluster; ci_offset++) { - int aidx = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi; + int aidx = (sci * c_nbnxnGpuNumClusterPerSupercluster + ci_offset) * CL_SIZE + tidxi; float3 fin = fci_buf[ci_offset]; fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, CL_SIZE); fin.y += intel_sub_group_shuffle_up(fin.y, fin.y, CL_SIZE); @@ -754,9 +752,9 @@ gmx_opencl_inline void reduce_force_i_and_shift_pow2(volatile __local float* f_b __global float* fshift) { float fshift_buf = 0; - for (int ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++) + for (int ci_offset = 0; ci_offset < c_nbnxnGpuNumClusterPerSupercluster; ci_offset++) { - int aidx = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi; + int aidx = (sci * c_nbnxnGpuNumClusterPerSupercluster + ci_offset) * CL_SIZE + tidxi; int tidx = tidxi + tidxj * CL_SIZE; /* store i forces in shmem */ f_buf[tidx] = fci_buf[ci_offset].x; diff --git a/src/gromacs/nbnxm/pairlist.h b/src/gromacs/nbnxm/pairlist.h index 3f7c62705f..97805f7f1b 100644 --- a/src/gromacs/nbnxm/pairlist.h +++ b/src/gromacs/nbnxm/pairlist.h @@ -48,9 +48,6 @@ #include "gromacs/utility/enumerationhelpers.h" #include "gromacs/utility/real.h" -// This file with constants is separate from this file to be able -// to include it during OpenCL jitting without including config.h -#include "constants.h" #include "pairlistparams.h" struct NbnxnPairlistCpuWork; @@ -133,6 +130,31 @@ constexpr unsigned int NBNXN_INTERACTION_MASK_DIAG_J8_1 = 0x0080c0e0U; //! \} //! \} +/*! \brief Lower limit for square interaction distances in nonbonded kernels. + * + * For smaller values we will overflow when calculating r^-1 or r^-12, but + * to keep it simple we always apply the limit from the tougher r^-12 condition. + */ +#if GMX_DOUBLE +// Some double precision SIMD architectures use single precision in the first +// step, so although the double precision criterion would allow smaller rsq, +// we need to stay in single precision with some margin for the N-R iterations. +constexpr double c_nbnxnMinDistanceSquared = 1.0e-36; +#else +// The worst intermediate value we might evaluate is r^-12, which +// means we should ensure r^2 stays above pow(GMX_FLOAT_MAX,-1.0/6.0)*1.01 (some margin) +constexpr float c_nbnxnMinDistanceSquared = 3.82e-07F; // r > 6.2e-4 +#endif + + +//! The number of clusters in a super-cluster, used for GPU +constexpr int c_nbnxnGpuNumClusterPerSupercluster = 8; + +/*! \brief With GPU kernels we group cluster pairs in 4 to optimize memory usage + * of integers containing 32 bits. + */ +constexpr int c_nbnxnGpuJgroupSize = (32 / c_nbnxnGpuNumClusterPerSupercluster); + /*! \internal * \brief Simple pair-list i-unit */ -- 2.11.4.GIT
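
The single-precision value of c_nbnxnMinDistanceSquared that moved into pairlist.h
can be checked numerically against the bound quoted in its comment. The standalone
C++ sketch below is an illustration rather than GROMACS code; it assumes
GMX_FLOAT_MAX equals FLT_MAX, which holds on IEEE-754 platforms.

    #include <cfloat>
    #include <cmath>
    #include <cstdio>

    int main()
    {
        // The worst intermediate value evaluated by the kernels is r^-12 = (r^2)^-6,
        // so r^2 must stay above pow(FLT_MAX, -1.0/6.0) for (r^2)^-6 not to overflow.
        const double bound      = std::pow(static_cast<double>(FLT_MAX), -1.0 / 6.0);
        const double withMargin = bound * 1.01; // the 1% margin mentioned in the comment
        std::printf("bound = %.3e  with margin = %.3e\n", bound, withMargin);
        // Prints roughly 3.78e-07 and 3.82e-07, consistent with the constant
        // 3.82e-07F, which corresponds to r > ~6.2e-4.
        return 0;
    }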
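The definition c_nbnxnGpuJgroupSize = 32 / c_nbnxnGpuNumClusterPerSupercluster encodes
the fact that each j-group carries one 32-bit interaction mask: four j-clusters times
eight i-clusters gives exactly 32 pair bits. A minimal self-contained sketch of that
bit layout, mirroring the imask tests in the kernels above (pairBit is a hypothetical
helper introduced here for clarity, not a GROMACS function):

    #include <cassert>

    constexpr int      c_nbnxnGpuNumClusterPerSupercluster = 8;
    constexpr int      c_nbnxnGpuJgroupSize   = 32 / c_nbnxnGpuNumClusterPerSupercluster; // 4
    constexpr unsigned superClInteractionMask = (1U << c_nbnxnGpuNumClusterPerSupercluster) - 1U; // 0xFF

    // Bit of the pair (i-cluster i, j-cluster slot jm) in the packed mask
    constexpr unsigned pairBit(int jm, int i)
    {
        return 1U << (jm * c_nbnxnGpuNumClusterPerSupercluster + i);
    }

    int main()
    {
        // Four 8-bit groups exactly fill one 32-bit integer, hence the j-group size of 4.
        static_assert(c_nbnxnGpuJgroupSize * c_nbnxnGpuNumClusterPerSupercluster == 32,
                      "one imask per cj4 entry");

        unsigned imask = pairBit(2, 5); // j-cluster slot 2 interacts with i-cluster 5

        // The kernels first test all 8 bits of a j-slot at once, then the single pair bit.
        assert(imask & (superClInteractionMask << (2 * c_nbnxnGpuNumClusterPerSupercluster)));
        assert(imask & pairBit(2, 5));
        return 0;
    }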
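For the jit-support change, each host-side constexpr constant is forwarded to the
OpenCL JIT command line as a -D macro of the same name, so the .clh files can use
the names without including C++ headers. Below is a simplified sketch of that
mechanism: plain snprintf stands in for gmx::formatString, buildJitDefines is a
hypothetical name, and the values are the defaults used above.

    #include <cstdio>
    #include <string>

    constexpr int   c_nbnxnGpuClusterSize               = 8; // assumed default
    constexpr float c_nbnxnMinDistanceSquared           = 3.82e-07F;
    constexpr int   c_nbnxnGpuNumClusterPerSupercluster = 8;
    constexpr int   c_nbnxnGpuJgroupSize                = 32 / c_nbnxnGpuNumClusterPerSupercluster;

    // Build the extra -D flags handed to the OpenCL JIT compiler, so each kernel
    // source sees macros with the same names and values as the host constants.
    static std::string buildJitDefines(bool prefetchLjParam)
    {
        char buf[256];
        std::snprintf(buf, sizeof(buf),
                      " -Dc_nbnxnGpuClusterSize=%d"
                      " -Dc_nbnxnMinDistanceSquared=%g"
                      " -Dc_nbnxnGpuNumClusterPerSupercluster=%d"
                      " -Dc_nbnxnGpuJgroupSize=%d"
                      "%s",
                      c_nbnxnGpuClusterSize, c_nbnxnMinDistanceSquared,
                      c_nbnxnGpuNumClusterPerSupercluster, c_nbnxnGpuJgroupSize,
                      prefetchLjParam ? " -DIATYPE_SHMEM" : "");
        return std::string(buf);
    }

    int main()
    {
        std::printf("%s\n", buildJitDefines(true).c_str());
        return 0;
    }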
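The shared-memory sizing in calc_shmem_required_nonbonded() is plain arithmetic over
these constants, so the renames there are mechanical. A standalone sketch of the
branch that preloads i-atom types (no LJ combination rule), under assumed values
(c_clSize = 8 and c_nbnxnGpuClusterpairSplit = 2; calcShmemNoLjComb is a hypothetical
name):

    #include <cstdio>

    constexpr int c_nbnxnGpuNumClusterPerSupercluster = 8;
    constexpr int c_clSize                            = 8; // assumed GPU cluster size
    constexpr int c_nbnxnGpuClusterpairSplit          = 2;
    constexpr int c_nbnxnGpuJgroupSize                = 32 / c_nbnxnGpuNumClusterPerSupercluster;

    // Mirrors the shape of calc_shmem_required_nonbonded() above for the
    // path that preloads i-atom types into shared memory.
    static int calcShmemNoLjComb(int numThreadsZ)
    {
        size_t shmem = c_nbnxnGpuNumClusterPerSupercluster * c_clSize * 4 * sizeof(float);      // xqib
        shmem += numThreadsZ * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize * sizeof(int); // cjs
        shmem += c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(int);                  // atib
        return static_cast<int>(shmem);
    }

    int main()
    {
        // 1024 (xqib) + 32 (cjs) + 256 (atib) = 1312 bytes per block
        std::printf("shared memory per block (num_threads_z = 1): %d bytes\n", calcShmemNoLjComb(1));
        return 0;
    }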