From 6b8847e89534340ecf6acfc24d751844bf154fb1 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Szil=C3=A1rd=20P=C3=A1ll?=
Date: Thu, 1 Nov 2018 16:59:30 +0100
Subject: [PATCH] Eliminate macro that expands to variable in PME GPU

The PME_SPREADGATHER_THREADS_PER_ATOM macro expanded to an expression
that referenced variables which had to be declared at every use site.
The macro has been eliminated and the PME order is now used through a
constexpr constant. Added consistency checks to make sure that the PME
order stored in the data structures matches the constant in question.
Also converted a few more macros to constexpr.

Change-Id: I73efea25c2da7cbde40df961f8bfae7e364bd74d
---
 src/gromacs/ewald/pme-gather.clh               |  7 +--
 src/gromacs/ewald/pme-gather.cu                |  9 ++--
 src/gromacs/ewald/pme-gpu-constants.h          | 18 +++---
 src/gromacs/ewald/pme-gpu-internal.cpp         | 62 ++++++++++++--------------
 src/gromacs/ewald/pme-gpu-program-impl-ocl.cpp | 11 +++--
 src/gromacs/ewald/pme-gpu-types-host.h         |  4 +-
 src/gromacs/ewald/pme-gpu-utils.clh            | 12 ++---
 src/gromacs/ewald/pme-gpu-utils.h              |  2 +-
 src/gromacs/ewald/pme-program.cl               |  4 +-
 src/gromacs/ewald/pme-spread.clh               | 12 ++---
 src/gromacs/ewald/pme-spread.cu                |  6 +--
 11 files changed, 75 insertions(+), 72 deletions(-)

diff --git a/src/gromacs/ewald/pme-gather.clh b/src/gromacs/ewald/pme-gather.clh
index b8ed1cf11a..4f275fa4cb 100644
--- a/src/gromacs/ewald/pme-gather.clh
+++ b/src/gromacs/ewald/pme-gather.clh
@@ -106,7 +106,7 @@ inline void reduce_atom_forces(__local float * __restrict__ sm_forces,
 
 // TODO: implement AMD intrinsics reduction, like with shuffles in CUDA version. #2514
 /* Number of data components and threads for a single atom */
-#define atomDataSize PME_SPREADGATHER_THREADS_PER_ATOM
+#define atomDataSize threadsPerAtom
 // We use blockSize local memory elements to read fx, or fy, or fz, and then reduce them to fit into smemPerDim elements
 // All those guys are defines and not consts, because they go into the local memory array size.
 #define blockSize (atomsPerBlock * atomDataSize)
@@ -210,6 +210,7 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_gather_kernel)(const struct PmeOpenCLKe
     /* Some sizes which are defines and not consts because they go into the array size */
 #define blockSize (atomsPerBlock * atomDataSize)
+    assert(blockSize == (get_local_size(0) * get_local_size(1) * get_local_size(2)));
 #define smemPerDim warp_size
 #define smemReserved ((DIM - 1) * smemPerDim)
 #define totalSharedMemory (smemReserved + blockSize)
@@ -266,8 +267,8 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_gather_kernel)(const struct PmeOpenCLKe
     const int pny = kernelParams.grid.realGridSizePadded[YY];
     const int pnz = kernelParams.grid.realGridSizePadded[ZZ];
 
-    const int atomWarpIndex = atomIndexLocal % PME_SPREADGATHER_ATOMS_PER_WARP;
-    const int warpIndex     = atomIndexLocal / PME_SPREADGATHER_ATOMS_PER_WARP;
+    const int atomWarpIndex = atomIndexLocal % atomsPerWarp;
+    const int warpIndex     = atomIndexLocal / atomsPerWarp;
     const int splineIndexBase = getSplineParamIndexBase(warpIndex, atomWarpIndex);
     const int splineIndexY    = getSplineParamIndex(splineIndexBase, YY, ithy);
 
diff --git a/src/gromacs/ewald/pme-gather.cu b/src/gromacs/ewald/pme-gather.cu
index dfcce64f72..b8297c4d68 100644
--- a/src/gromacs/ewald/pme-gather.cu
+++ b/src/gromacs/ewald/pme-gather.cu
@@ -233,11 +233,12 @@ __global__ void pme_gather_kernel(const PmeGpuCudaKernelParams kernelParams)
     float * __restrict__ gm_forces = kernelParams.atoms.d_forces;
 
     /* Some sizes */
-    const int atomsPerBlock = (c_gatherMaxThreadsPerBlock / PME_SPREADGATHER_THREADS_PER_ATOM);
-    const int atomDataSize  = PME_SPREADGATHER_THREADS_PER_ATOM; /* Number of data components and threads for a single atom */
-    const int blockSize     = atomsPerBlock * atomDataSize;
-    const int atomsPerWarp  = PME_SPREADGATHER_ATOMS_PER_WARP;
+    const int atomsPerBlock = (c_gatherMaxThreadsPerBlock / c_pmeSpreadGatherThreadsPerAtom);
+    const int atomDataSize  = c_pmeSpreadGatherThreadsPerAtom; /* Number of data components and threads for a single atom */
+    const int atomsPerWarp  = c_pmeSpreadGatherAtomsPerWarp;
+    const int blockSize     = atomsPerBlock * atomDataSize;
+    assert(blockSize == blockDim.x * blockDim.y * blockDim.z);
 
     const int blockIndex = blockIdx.y * gridDim.x + blockIdx.x;
 
     /* These are the atom indices - for the shared and global memory */
diff --git a/src/gromacs/ewald/pme-gpu-constants.h b/src/gromacs/ewald/pme-gpu-constants.h
index 5f66f09d11..ca108915e2 100644
--- a/src/gromacs/ewald/pme-gpu-constants.h
+++ b/src/gromacs/ewald/pme-gpu-constants.h
@@ -64,7 +64,7 @@
  * (Except GPU spline data layout which is regardless intertwined for 2 atoms per warp).
  * The atom index checks in the spread/gather code potentially hinder the performance.
  * true: The atom data GPU buffers are padded with zeroes so that the possible number of atoms
- *       fitting in is divisible by PME_ATOM_DATA_ALIGNMENT.
+ *       fitting in is divisible by c_pmeAtomDataAlignment.
  *       The atom index checks are not performed. There should be a performance win, but how big is it, remains to be seen.
  *       Additional cudaMemsetAsync calls are done occasionally (only charges/coordinates; spline data is always recalculated now).
  * \todo Estimate performance differences
@@ -114,19 +114,25 @@ constexpr int c_virialAndEnergyCount = 7;
    The corresponding defines follow.
  */
 
+/*! \brief PME order parameter
+ *
+ * Note that the GPU code, unlike the CPU, only supports order 4.
+ */
+constexpr int c_pmeGpuOrder = 4;
+
 /*! \brief
  * The number of GPU threads used for computing spread/gather contributions of a single atom as function of the PME order.
  * The assumption is currently that any thread processes only a single atom's contributions.
  * TODO: this assumption leads to minimum execution width of 16. See Redmine #2516
  */
-#define PME_SPREADGATHER_THREADS_PER_ATOM (order * order)
+constexpr int c_pmeSpreadGatherThreadsPerAtom = (c_pmeGpuOrder * c_pmeGpuOrder);
 
 /*! \brief Minimum execution width of the PME spread and gather kernels.
  *
  * Due to the one thread per atom and order=4 implementation constraints, order^2 threads
- * should execute without synchronization needed. See PME_SPREADGATHER_THREADS_PER_ATOM
+ * should execute without synchronization needed. See c_pmeSpreadGatherThreadsPerAtom
  */
-constexpr int c_pmeSpreadGatherMinWarpSize = 16;
+constexpr int c_pmeSpreadGatherMinWarpSize = c_pmeSpreadGatherThreadsPerAtom;
 
 /*! \brief
  * Atom data alignment (in terms of number of atoms).
@@ -136,7 +142,7 @@ constexpr int c_pmeSpreadGatherMinWarpSize;
  * Then the numbers of atoms which would fit in the padded GPU buffers have to be divisible by this.
  * There are debug asserts for this divisibility in pme_gpu_spread() and pme_gpu_gather().
  */
-#define PME_ATOM_DATA_ALIGNMENT 32
+constexpr int c_pmeAtomDataAlignment = 32;
 
 /*
  * The execution widths for PME GPU kernels, used both on host and device for correct scheduling.
@@ -170,7 +176,7 @@ constexpr int c_gatherMaxWarpsPerBlock = 4;
  * This macro depends on the templated order parameter (2 atoms per warp for order 4 and warp_size of 32).
  * It is mostly used for spline data layout tweaked for coalesced access.
  */
-#define PME_SPREADGATHER_ATOMS_PER_WARP (warp_size / PME_SPREADGATHER_THREADS_PER_ATOM)
+constexpr int c_pmeSpreadGatherAtomsPerWarp = (warp_size / c_pmeSpreadGatherThreadsPerAtom);
 
 //! Spreading max block size in threads
 constexpr int c_spreadMaxThreadsPerBlock = c_spreadMaxWarpsPerBlock * warp_size;
 
diff --git a/src/gromacs/ewald/pme-gpu-internal.cpp b/src/gromacs/ewald/pme-gpu-internal.cpp
index d90ee204ad..ff9ea073c4 100644
--- a/src/gromacs/ewald/pme-gpu-internal.cpp
+++ b/src/gromacs/ewald/pme-gpu-internal.cpp
@@ -102,15 +102,13 @@ static PmeGpuKernelParamsBase *pme_gpu_get_kernel_params_base_ptr(const PmeGpu *
 
 int pme_gpu_get_atom_data_alignment(const PmeGpu * /*unused*/)
 {
-    //TODO: this can be simplified, as PME_ATOM_DATA_ALIGNMENT is now constant
-    return PME_ATOM_DATA_ALIGNMENT;
+    //TODO: this can be simplified, as c_pmeAtomDataAlignment is now constant
+    return c_pmeAtomDataAlignment;
 }
 
 int pme_gpu_get_atoms_per_warp(const PmeGpu *pmeGpu)
 {
-    const int order = pmeGpu->common->pme_order;
-    GMX_ASSERT(order > 0, "Invalid PME order");
-    return pmeGpu->programHandle_->impl_->warpSize / PME_SPREADGATHER_THREADS_PER_ATOM;
+    return pmeGpu->programHandle_->impl_->warpSize / c_pmeSpreadGatherThreadsPerAtom;
 }
 
 void pme_gpu_synchronize(const PmeGpu *pmeGpu)
@@ -591,11 +589,11 @@ void pme_gpu_destroy_3dfft(const PmeGpu *pmeGpu)
 
 int getSplineParamFullIndex(int order, int splineIndex, int dimIndex, int atomIndex, int atomsPerWarp)
 {
-    if (order != 4)
+    if (order != c_pmeGpuOrder)
     {
         throw order;
     }
-    constexpr int fixedOrder = 4;
+    constexpr int fixedOrder = c_pmeGpuOrder;
     GMX_UNUSED_VALUE(fixedOrder);
 
     const int atomWarpIndex = atomIndex % atomsPerWarp;
@@ -743,6 +741,10 @@ static void pme_gpu_copy_common_data_from(const gmx_pme_t *pme)
     pmeGpu->common->nk[YY] = pme->nky;
     pmeGpu->common->nk[ZZ] = pme->nkz;
     pmeGpu->common->pme_order = pme->pme_order;
+    if (pmeGpu->common->pme_order != c_pmeGpuOrder)
+    {
+        GMX_THROW(gmx::NotImplementedError("pme_order != 4 is not implemented!"));
+    }
     for (int i = 0; i < DIM; i++)
     {
         pmeGpu->common->bsp_mod[i].assign(pme->bsp_mod[i], pme->bsp_mod[i] + pmeGpu->common->nk[i]);
@@ -813,6 +815,7 @@ void pme_gpu_transform_spline_atom_data(const PmeGpu *pmeGpu, const pme_atomcomm
     const auto atomCount    = pme_gpu_get_kernel_params_base_ptr(pmeGpu)->atoms.nAtoms;
     const auto atomsPerWarp = pme_gpu_get_atoms_per_warp(pmeGpu);
     const auto pmeOrder     = pmeGpu->common->pme_order;
+    GMX_ASSERT(pmeOrder == c_pmeGpuOrder, "Only PME order 4 is implemented");
 
     real  *cpuSplineBuffer;
     float *h_splineBuffer;
@@ -990,14 +993,15 @@ void pme_gpu_spread(const PmeGpu *pmeGpu,
 
     const size_t blockSize = pmeGpu->programHandle_->impl_->spreadWorkGroupSize;
 
     const int order = pmeGpu->common->pme_order;
-    const int atomsPerBlock = blockSize / PME_SPREADGATHER_THREADS_PER_ATOM;
+    GMX_ASSERT(order == c_pmeGpuOrder, "Only PME order 4 is implemented");
+    const int atomsPerBlock = blockSize / c_pmeSpreadGatherThreadsPerAtom;
     // TODO: pick smaller block size in runtime if needed
    // (e.g. on 660 Ti where 50% occupancy is ~25% faster than 100% occupancy with RNAse (~17.8k atoms))
     // If doing so, change atomsPerBlock in the kernels as well.
     // TODO: test varying block sizes on modern arch-s as well
     // TODO: also consider using cudaFuncSetCacheConfig() for preferring shared memory on older architectures
     //(for spline data mostly, together with varying PME_GPU_PARALLEL_SPLINE define)
-    GMX_ASSERT(!c_usePadding || !(PME_ATOM_DATA_ALIGNMENT % atomsPerBlock), "inconsistent atom data padding vs. spreading block size");
+    GMX_ASSERT(!c_usePadding || !(c_pmeAtomDataAlignment % atomsPerBlock), "inconsistent atom data padding vs. spreading block size");
     const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock;
     auto      dimGrid    = pmeGpuCreateGrid(pmeGpu, blockCount);
@@ -1009,11 +1013,6 @@ void pme_gpu_spread(const PmeGpu *pmeGpu,
     config.gridSize[1] = dimGrid.second;
     config.stream      = pmeGpu->archSpecific->pmeStream;
 
-    if (order != 4)
-    {
-        GMX_THROW(gmx::NotImplementedError("The code for pme_order != 4 was not implemented!"));
-    }
-
     int timingId;
     PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
     if (computeSplines)
@@ -1173,9 +1172,6 @@ void pme_gpu_gather(PmeGpu *pmeGpu,
         pme_gpu_copy_input_forces(pmeGpu);
     }
 
-    const int order = pmeGpu->common->pme_order;
-    const auto *kernelParamsPtr = pmeGpu->kernelParams.get();
-
     if (!pme_gpu_performs_FFT(pmeGpu) || pme_gpu_is_testing(pmeGpu))
     {
         pme_gpu_copy_input_gather_grid(pmeGpu, const_cast<float *>(h_grid));
@@ -1187,12 +1183,16 @@ void pme_gpu_gather(PmeGpu *pmeGpu,
     }
 
     const size_t blockSize = pmeGpu->programHandle_->impl_->gatherWorkGroupSize;
-    const int atomsPerBlock = blockSize / PME_SPREADGATHER_THREADS_PER_ATOM;
-    GMX_ASSERT(!c_usePadding || !(PME_ATOM_DATA_ALIGNMENT % atomsPerBlock), "inconsistent atom data padding vs. gathering block size");
+    const int atomsPerBlock = blockSize / c_pmeSpreadGatherThreadsPerAtom;
+    GMX_ASSERT(!c_usePadding || !(c_pmeAtomDataAlignment % atomsPerBlock), "inconsistent atom data padding vs. gathering block size");
     const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock;
     auto      dimGrid    = pmeGpuCreateGrid(pmeGpu, blockCount);
+
+    const int order = pmeGpu->common->pme_order;
+    GMX_ASSERT(order == c_pmeGpuOrder, "Only PME order 4 is implemented");
+
     KernelLaunchConfig config;
     config.blockSize[0] = config.blockSize[1] = order;
     config.blockSize[2] = atomsPerBlock;
@@ -1200,11 +1200,6 @@ void pme_gpu_gather(PmeGpu *pmeGpu,
     config.gridSize[1] = dimGrid.second;
     config.stream      = pmeGpu->archSpecific->pmeStream;
 
-    if (order != 4)
-    {
-        GMX_THROW(gmx::NotImplementedError("The code for pme_order != 4 was not implemented!"));
-    }
-
     // TODO test different cache configs
 
     int timingId = gtPME_GATHER;
@@ -1214,17 +1209,18 @@ ...
                                              pmeGpu->programHandle_->impl_->gatherReduceWithInputKernel;
 
     pme_gpu_start_timing(pmeGpu, timingId);
-    auto       *timingEvent = pme_gpu_fetch_timing_event(pmeGpu, timingId);
+    auto       *timingEvent     = pme_gpu_fetch_timing_event(pmeGpu, timingId);
+    const auto *kernelParamsPtr = pmeGpu->kernelParams.get();
 #if c_canEmbedBuffers
-    const auto  kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
+    const auto  kernelArgs      = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
 #else
-    const auto  kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr,
-                                                       &kernelParamsPtr->atoms.d_coefficients,
-                                                       &kernelParamsPtr->grid.d_realGrid,
-                                                       &kernelParamsPtr->atoms.d_theta,
-                                                       &kernelParamsPtr->atoms.d_dtheta,
-                                                       &kernelParamsPtr->atoms.d_gridlineIndices,
-                                                       &kernelParamsPtr->atoms.d_forces);
+    const auto  kernelArgs      = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr,
+                                                            &kernelParamsPtr->atoms.d_coefficients,
+                                                            &kernelParamsPtr->grid.d_realGrid,
+                                                            &kernelParamsPtr->atoms.d_theta,
+                                                            &kernelParamsPtr->atoms.d_dtheta,
+                                                            &kernelParamsPtr->atoms.d_gridlineIndices,
+                                                            &kernelParamsPtr->atoms.d_forces);
 #endif
     launchGpuKernel(kernelPtr, config, timingEvent, "PME gather", kernelArgs);
     pme_gpu_stop_timing(pmeGpu, timingId);
diff --git a/src/gromacs/ewald/pme-gpu-program-impl-ocl.cpp b/src/gromacs/ewald/pme-gpu-program-impl-ocl.cpp
index f2c00f1516..43b533651d 100644
--- a/src/gromacs/ewald/pme-gpu-program-impl-ocl.cpp
+++ b/src/gromacs/ewald/pme-gpu-program-impl-ocl.cpp
@@ -140,12 +140,11 @@ void PmeGpuProgramImpl::compileKernels(const gmx_device_info_t *deviceInfo)
      * files outside as macros, to avoid including those files
      * in the JIT compilation that happens at runtime.
      */
-    constexpr int order = 4;
     const std::string commonDefines = gmx::formatString(
             "-Dwarp_size=%zd "
             "-Dorder=%d "
-            "-DPME_SPREADGATHER_ATOMS_PER_WARP=%zd "
-            "-DPME_SPREADGATHER_THREADS_PER_ATOM=%d "
+            "-DatomsPerWarp=%zd "
+            "-DthreadsPerAtom=%d "
             // forwarding from pme-grid.h, used for spline computation table sizes only
             "-Dc_pmeMaxUnitcellShift=%f "
             // forwarding PME behavior constants from pme-gpu-constants.h
@@ -161,9 +160,9 @@ void PmeGpuProgramImpl::compileKernels(const gmx_device_info_t *deviceInfo)
             // decomposition parameter placeholders
             "-DwrapX=true -DwrapY=true ",
             warpSize,
-            order,
-            warpSize / PME_SPREADGATHER_THREADS_PER_ATOM,
-            PME_SPREADGATHER_THREADS_PER_ATOM,
+            c_pmeGpuOrder,
+            warpSize / c_pmeSpreadGatherThreadsPerAtom,
+            c_pmeSpreadGatherThreadsPerAtom,
             static_cast<float>(c_pmeMaxUnitcellShift),
             c_usePadding,
             c_skipNeutralAtoms,
diff --git a/src/gromacs/ewald/pme-gpu-types-host.h b/src/gromacs/ewald/pme-gpu-types-host.h
index 65a2ef9797..8a2625cb75 100644
--- a/src/gromacs/ewald/pme-gpu-types-host.h
+++ b/src/gromacs/ewald/pme-gpu-types-host.h
@@ -184,14 +184,14 @@ struct PmeGpu
      */
     PmeGpuStaging staging;
 
-    /*! \brief Number of local atoms, padded to be divisible by PME_ATOM_DATA_ALIGNMENT.
+    /*! \brief Number of local atoms, padded to be divisible by c_pmeAtomDataAlignment.
      * Used for kernel scheduling.
      * kernelParams.atoms.nAtoms is the actual atom count to be used for data copying.
     * TODO: this and the next member represent a memory allocation/padding properties -
      * what a container type should do ideally.
      */
     int nAtomsPadded;
 
-    /*! \brief Number of local atoms, padded to be divisible by PME_ATOM_DATA_ALIGNMENT
+    /*! \brief Number of local atoms, padded to be divisible by c_pmeAtomDataAlignment
      * if c_usePadding is true.
      * Used only as a basic size for almost all the atom data allocations
      * (spline parameter data is also aligned by PME_SPREADGATHER_PARTICLES_PER_WARP).
diff --git a/src/gromacs/ewald/pme-gpu-utils.clh b/src/gromacs/ewald/pme-gpu-utils.clh
index fa666ab8af..fe0531dc49 100644
--- a/src/gromacs/ewald/pme-gpu-utils.clh
+++ b/src/gromacs/ewald/pme-gpu-utils.clh
@@ -40,7 +40,7 @@
 * This closely mirrors pme-gpu-utils.h (which is used in CUDA and unit tests), except with no templates.
 * Instead of templated parameters this file expects following defines during compilation:
 * - order - PME interpolation order;
- * - PME_SPREADGATHER_ATOMS_PER_WARP - number of atoms processed by a warp (fixed for spread and gather kernels to be the same);
+ * - atomsPerWarp - number of atoms processed by a warp (fixed for spread and gather kernels to be the same);
 * - c_usePadding and c_skipNeutralAtoms - same as in pme-gpu-constants.h.
 *
 * \author Aleksei Iupinov
@@ -53,20 +53,20 @@
 * Feed the result into getSplineParamIndex() to get a full index.
 * TODO: it's likely that both parameters can be just replaced with a single atom index, as they are derived from it.
 * Do that, verifying that the generated code is not bloated, and/or revise the spline indexing scheme.
- * Removing warp dependency would also be nice (and would probably coincide with removing PME_SPREADGATHER_ATOMS_PER_WARP).
+ * Removing warp dependency would also be nice (and would probably coincide with removing atomsPerWarp).
 *
 * \param[in] warpIndex        Warp index wrt the block.
- * \param[in] atomWarpIndex    Atom index wrt the warp (from 0 to PME_SPREADGATHER_ATOMS_PER_WARP - 1).
+ * \param[in] atomWarpIndex    Atom index wrt the warp (from 0 to atomsPerWarp - 1).
 *
 * \returns Index into theta or dtheta array using GPU layout.
 */
 inline int getSplineParamIndexBase(int warpIndex, int atomWarpIndex)
 {
-    assert((atomWarpIndex >= 0) && (atomWarpIndex < PME_SPREADGATHER_ATOMS_PER_WARP));
+    assert((atomWarpIndex >= 0) && (atomWarpIndex < atomsPerWarp));
     const int dimIndex    = 0;
     const int splineIndex = 0;
     // The zeroes are here to preserve the full index formula for reference
-    return (((splineIndex + order * warpIndex) * DIM + dimIndex) * PME_SPREADGATHER_ATOMS_PER_WARP + atomWarpIndex);
+    return (((splineIndex + order * warpIndex) * DIM + dimIndex) * atomsPerWarp + atomWarpIndex);
 }
 
 /*! \internal \brief
@@ -87,7 +87,7 @@ inline int getSplineParamIndex(int paramIndexBase, int dimIndex, int splineIndex
 {
     assert((dimIndex >= XX) && (dimIndex < DIM));
     assert((splineIndex >= 0) && (splineIndex < order));
-    return (paramIndexBase + (splineIndex * DIM + dimIndex) * PME_SPREADGATHER_ATOMS_PER_WARP);
+    return (paramIndexBase + (splineIndex * DIM + dimIndex) * atomsPerWarp);
 }
 
 /*! \brief
diff --git a/src/gromacs/ewald/pme-gpu-utils.h b/src/gromacs/ewald/pme-gpu-utils.h
index 3c7e5f0fe1..e576a56045 100644
--- a/src/gromacs/ewald/pme-gpu-utils.h
+++ b/src/gromacs/ewald/pme-gpu-utils.h
@@ -64,7 +64,7 @@
 * Feed the result into getSplineParamIndex() to get a full index.
 * TODO: it's likely that both parameters can be just replaced with a single atom index, as they are derived from it.
 * Do that, verifying that the generated code is not bloated, and/or revise the spline indexing scheme.
- * Removing warp dependency would also be nice (and would probably coincide with removing PME_SPREADGATHER_ATOMS_PER_WARP).
+ * Removing warp dependency would also be nice (and would probably coincide with removing c_pmeSpreadGatherAtomsPerWarp).
 *
 * \tparam order            PME order
 * \tparam atomsPerWarp     Number of atoms processed by a warp
diff --git a/src/gromacs/ewald/pme-program.cl b/src/gromacs/ewald/pme-program.cl
index eb0cdb1fe8..afbf17273d 100644
--- a/src/gromacs/ewald/pme-program.cl
+++ b/src/gromacs/ewald/pme-program.cl
@@ -57,7 +57,7 @@
 
 /* SPREAD/SPLINE */
 
-#define atomsPerBlock (c_spreadWorkGroupSize / PME_SPREADGATHER_THREADS_PER_ATOM)
+#define atomsPerBlock (c_spreadWorkGroupSize / threadsPerAtom)
 
 // spline/spread fused
 #define computeSplines 1
@@ -92,7 +92,7 @@
 
 /* GATHER */
 
-#define atomsPerBlock (c_gatherWorkGroupSize / PME_SPREADGATHER_THREADS_PER_ATOM)
+#define atomsPerBlock (c_gatherWorkGroupSize / threadsPerAtom)
 
 // gather
 #define overwriteForces 1
diff --git a/src/gromacs/ewald/pme-spread.clh b/src/gromacs/ewald/pme-spread.clh
index f4ef414230..5916e162c3 100644
--- a/src/gromacs/ewald/pme-spread.clh
+++ b/src/gromacs/ewald/pme-spread.clh
@@ -132,16 +132,16 @@ inline void calculate_splines(const struct PmeOpenCLKernelParams kernelParams,
     /* Thread index w.r.t. warp */
     const int threadWarpIndex = threadLocalIndex % warp_size;
     /* Atom index w.r.t. warp - alternating 0 1 0 1 .. */
-    const int atomWarpIndex = threadWarpIndex % PME_SPREADGATHER_ATOMS_PER_WARP;
+    const int atomWarpIndex = threadWarpIndex % atomsPerWarp;
     /* Atom index w.r.t. block/shared memory */
-    const int atomIndexLocal = warpIndex * PME_SPREADGATHER_ATOMS_PER_WARP + atomWarpIndex;
+    const int atomIndexLocal = warpIndex * atomsPerWarp + atomWarpIndex;
     /* Atom index w.r.t. global memory */
     const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
 
     /* Spline contribution index in one dimension */
-    const int orderIndex = threadWarpIndex / (PME_SPREADGATHER_ATOMS_PER_WARP * DIM);
+    const int orderIndex = threadWarpIndex / (atomsPerWarp * DIM);
     /* Dimension index */
-    const int dimIndex = (threadWarpIndex - orderIndex * (PME_SPREADGATHER_ATOMS_PER_WARP * DIM)) / PME_SPREADGATHER_ATOMS_PER_WARP;
+    const int dimIndex = (threadWarpIndex - orderIndex * (atomsPerWarp * DIM)) / atomsPerWarp;
 
     /* Multi-purpose index of rvec/ivec atom data */
     const int sharedMemoryIndex = atomIndexLocal * DIM + dimIndex;
@@ -343,9 +343,9 @@ inline void spread_charges(const struct PmeOpenCLKernelParams kernelParams,
     }
 
     /* Atom index w.r.t. warp - alternating 0 1 0 1 .. */
-    const int atomWarpIndex = atomIndexLocal % PME_SPREADGATHER_ATOMS_PER_WARP;
+    const int atomWarpIndex = atomIndexLocal % atomsPerWarp;
     /* Warp index w.r.t. block - could probably be obtained easier? */
-    const int warpIndex = atomIndexLocal / PME_SPREADGATHER_ATOMS_PER_WARP;
+    const int warpIndex = atomIndexLocal / atomsPerWarp;
 
     const int splineIndexBase = getSplineParamIndexBase(warpIndex, atomWarpIndex);
     const int splineIndexZ    = getSplineParamIndex(splineIndexBase, ZZ, ithz);
diff --git a/src/gromacs/ewald/pme-spread.cu b/src/gromacs/ewald/pme-spread.cu
index e45945551b..030367fd30 100644
--- a/src/gromacs/ewald/pme-spread.cu
+++ b/src/gromacs/ewald/pme-spread.cu
@@ -124,7 +124,7 @@ __device__ __forceinline__ void calculate_splines(const PmeGpuCudaKernelParams
     float * __restrict__ gm_dtheta          = kernelParams.atoms.d_dtheta;
     int * __restrict__   gm_gridlineIndices = kernelParams.atoms.d_gridlineIndices;
 
-    const int atomsPerWarp = PME_SPREADGATHER_ATOMS_PER_WARP;
+    const int atomsPerWarp = c_pmeSpreadGatherAtomsPerWarp;
 
     /* Fractional coordinates */
     __shared__ float sm_fractCoords[atomsPerBlock * DIM];
@@ -332,7 +332,7 @@ __device__ __forceinline__ void spread_charges(const PmeGpuCudaKernelParams
     float * __restrict__ gm_grid = kernelParams.grid.d_realGrid;
 
-    const int atomsPerWarp = PME_SPREADGATHER_ATOMS_PER_WARP;
+    const int atomsPerWarp = c_pmeSpreadGatherAtomsPerWarp;
 
     const int nx = kernelParams.grid.realGridSize[XX];
     const int ny = kernelParams.grid.realGridSize[YY];
@@ -417,7 +417,7 @@ template <
 __launch_bounds__(c_spreadMaxThreadsPerBlock)
 __global__ void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams kernelParams)
 {
-    const int atomsPerBlock = c_spreadMaxThreadsPerBlock / PME_SPREADGATHER_THREADS_PER_ATOM;
+    const int atomsPerBlock = c_spreadMaxThreadsPerBlock / c_pmeSpreadGatherThreadsPerAtom;
     // Gridline indices, ivec
     __shared__ int sm_gridlineIndices[atomsPerBlock * DIM];
     // Charges
-- 
2.11.4.GIT
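
Editor's note: for context on the macro-vs-constexpr change above, the following stand-alone sketch (illustration only; PmeCommonStandIn and atomsPerBlockFor are hypothetical names, not GROMACS symbols) contrasts the removed macro with the constexpr constants introduced by the patch, and shows the kind of order consistency check it adds in pme_gpu_copy_common_data_from() and the kernel launch paths.

/* Illustration only -- not GROMACS code. */
#include <cassert>

/* Old style: the macro only compiled where a variable named 'order' happened to be in
 * scope, so every caller had to declare one, and a mismatching value went undetected:
 *     #define PME_SPREADGATHER_THREADS_PER_ATOM (order * order)
 */

/* New style: self-contained constants, usable in constant expressions
 * (array sizes, __launch_bounds__) without any local variable. */
constexpr int c_pmeGpuOrder                   = 4;
constexpr int c_pmeSpreadGatherThreadsPerAtom = c_pmeGpuOrder * c_pmeGpuOrder;

struct PmeCommonStandIn // stand-in for the runtime structure that carries pme_order
{
    int pme_order;
};

int atomsPerBlockFor(int maxThreadsPerBlock, const PmeCommonStandIn &common)
{
    // Consistency check analogous to the ones the patch adds: the run-time order stored
    // in the data structures must match the compile-time constant the kernels assume.
    assert(common.pme_order == c_pmeGpuOrder);
    return maxThreadsPerBlock / c_pmeSpreadGatherThreadsPerAtom;
}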
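
Editor's note: the atomsPerWarp constant renamed throughout the patch feeds the spline-parameter indexing used by the spread and gather kernels. This stand-alone sketch reproduces the index formula from the patched getSplineParamIndexBase()/getSplineParamIndex() helpers; the warp size of 32 (giving 2 atoms per warp at order 4), the main() driver, and the printed values are illustrative assumptions only.

#include <cassert>
#include <cstdio>

constexpr int DIM          = 3;
constexpr int order        = 4;  // c_pmeGpuOrder
constexpr int warpSize     = 32; // assumption; the OpenCL build passes warp_size at JIT time
constexpr int atomsPerWarp = warpSize / (order * order);

int getSplineParamIndexBase(int warpIndex, int atomWarpIndex)
{
    assert(atomWarpIndex >= 0 && atomWarpIndex < atomsPerWarp);
    const int dimIndex    = 0;
    const int splineIndex = 0;
    // The zeroes are kept to show the full index formula, as in the original helper
    return ((splineIndex + order * warpIndex) * DIM + dimIndex) * atomsPerWarp + atomWarpIndex;
}

int getSplineParamIndex(int paramIndexBase, int dimIndex, int splineIndex)
{
    assert(dimIndex >= 0 && dimIndex < DIM);
    assert(splineIndex >= 0 && splineIndex < order);
    return paramIndexBase + (splineIndex * DIM + dimIndex) * atomsPerWarp;
}

int main()
{
    // Second atom of warp 1, Y dimension (index 1), third spline term:
    const int base  = getSplineParamIndexBase(1, 1);
    const int index = getSplineParamIndex(base, 1, 2);
    std::printf("base = %d, index = %d\n", base, index); // prints base = 25, index = 39
    return 0;
}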