From 6b8847e89534340ecf6acfc24d751844bf154fb1 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Szil=C3=A1rd=20P=C3=A1ll?=
Date: Thu, 1 Nov 2018 16:59:30 +0100
Subject: [PATCH] Eliminate macro that expands to variable in PME GPU

The PME_SPREADGATHER_THREADS_PER_ATOM macro expanded to an expression
that referenced variables which had to be declared at every use site.
The macro has been eliminated and the PME order is now used through a
constexpr constant. Added consistency checks to make sure that the PME
order stored in the data structures matches the constant in question.
Also converted a few more macros to constexpr.

Change-Id: I73efea25c2da7cbde40df961f8bfae7e364bd74d
---
 src/gromacs/ewald/pme-gather.clh               |  7 +--
 src/gromacs/ewald/pme-gather.cu                |  9 ++--
 src/gromacs/ewald/pme-gpu-constants.h          | 18 +++---
 src/gromacs/ewald/pme-gpu-internal.cpp         | 62 ++++++++++++--------------
 src/gromacs/ewald/pme-gpu-program-impl-ocl.cpp | 11 +++--
 src/gromacs/ewald/pme-gpu-types-host.h         |  4 +-
 src/gromacs/ewald/pme-gpu-utils.clh            | 12 ++---
 src/gromacs/ewald/pme-gpu-utils.h              |  2 +-
 src/gromacs/ewald/pme-program.cl               |  4 +-
 src/gromacs/ewald/pme-spread.clh               | 12 ++---
 src/gromacs/ewald/pme-spread.cu                |  6 +--
 11 files changed, 75 insertions(+), 72 deletions(-)

diff --git a/src/gromacs/ewald/pme-gather.clh b/src/gromacs/ewald/pme-gather.clh
index b8ed1cf11a..4f275fa4cb 100644
--- a/src/gromacs/ewald/pme-gather.clh
+++ b/src/gromacs/ewald/pme-gather.clh
@@ -106,7 +106,7 @@ inline void reduce_atom_forces(__local float * __restrict__ sm_forces,
 
 // TODO: implement AMD intrinsics reduction, like with shuffles in CUDA version. #2514
 /* Number of data components and threads for a single atom */
-#define atomDataSize PME_SPREADGATHER_THREADS_PER_ATOM
+#define atomDataSize threadsPerAtom
 // We use blockSize local memory elements to read fx, or fy, or fz, and then reduce them to fit into smemPerDim elements
 // All those guys are defines and not consts, because they go into the local memory array size.
 #define blockSize (atomsPerBlock * atomDataSize)
@@ -210,6 +210,7 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_gather_kernel)(const struct PmeOpenCLKe
     /* Some sizes which are defines and not consts because they go into the array size */
 #define blockSize (atomsPerBlock * atomDataSize)
+    assert(blockSize == (get_local_size(0) * get_local_size(1) * get_local_size(2)));
 #define smemPerDim warp_size
 #define smemReserved ((DIM - 1) * smemPerDim)
 #define totalSharedMemory (smemReserved + blockSize)
@@ -266,8 +267,8 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_gather_kernel)(const struct PmeOpenCLKe
     const int pny = kernelParams.grid.realGridSizePadded[YY];
     const int pnz = kernelParams.grid.realGridSizePadded[ZZ];
 
-    const int atomWarpIndex = atomIndexLocal % PME_SPREADGATHER_ATOMS_PER_WARP;
-    const int warpIndex     = atomIndexLocal / PME_SPREADGATHER_ATOMS_PER_WARP;
+    const int atomWarpIndex = atomIndexLocal % atomsPerWarp;
+    const int warpIndex     = atomIndexLocal / atomsPerWarp;
     const int splineIndexBase = getSplineParamIndexBase(warpIndex, atomWarpIndex);
     const int splineIndexY    = getSplineParamIndex(splineIndexBase, YY, ithy);
 
diff --git a/src/gromacs/ewald/pme-gather.cu b/src/gromacs/ewald/pme-gather.cu
index dfcce64f72..b8297c4d68 100644
--- a/src/gromacs/ewald/pme-gather.cu
+++ b/src/gromacs/ewald/pme-gather.cu
@@ -233,11 +233,12 @@ __global__ void pme_gather_kernel(const PmeGpuCudaKernelParams kernelParams)
     float * __restrict__ gm_forces = kernelParams.atoms.d_forces;
 
     /* Some sizes */
-    const int atomsPerBlock = (c_gatherMaxThreadsPerBlock / PME_SPREADGATHER_THREADS_PER_ATOM);
-    const int atomDataSize  = PME_SPREADGATHER_THREADS_PER_ATOM; /* Number of data components and threads for a single atom */
-    const int blockSize     = atomsPerBlock * atomDataSize;
-    const int atomsPerWarp  = PME_SPREADGATHER_ATOMS_PER_WARP;
+    const int atomsPerBlock = (c_gatherMaxThreadsPerBlock / c_pmeSpreadGatherThreadsPerAtom);
+    const int atomDataSize  = c_pmeSpreadGatherThreadsPerAtom; /* Number of data components and threads for a single atom */
+    const int atomsPerWarp  = c_pmeSpreadGatherAtomsPerWarp;
+    const int blockSize     = atomsPerBlock * atomDataSize;
+    assert(blockSize == blockDim.x * blockDim.y * blockDim.z);
 
     const int blockIndex = blockIdx.y * gridDim.x + blockIdx.x;
 
     /* These are the atom indices - for the shared and global memory */
diff --git a/src/gromacs/ewald/pme-gpu-constants.h b/src/gromacs/ewald/pme-gpu-constants.h
index 5f66f09d11..ca108915e2 100644
--- a/src/gromacs/ewald/pme-gpu-constants.h
+++ b/src/gromacs/ewald/pme-gpu-constants.h
@@ -64,7 +64,7 @@
  * (Except GPU spline data layout which is regardless intertwined for 2 atoms per warp).
  * The atom index checks in the spread/gather code potentially hinder the performance.
  * true: The atom data GPU buffers are padded with zeroes so that the possible number of atoms
- *       fitting in is divisible by PME_ATOM_DATA_ALIGNMENT.
+ *       fitting in is divisible by c_pmeAtomDataAlignment.
  *       The atom index checks are not performed. There should be a performance win, but how big is it, remains to be seen.
  *       Additional cudaMemsetAsync calls are done occasionally (only charges/coordinates; spline data is always recalculated now).
  * \todo Estimate performance differences
@@ -114,19 +114,25 @@ constexpr int c_virialAndEnergyCount = 7;
    The corresponding defines follow.
  */
 
+/*! \brief PME order parameter
+ *
+ * Note that the GPU code, unlike the CPU, only supports order 4.
+ */
+constexpr int c_pmeGpuOrder = 4;
+
 /*! \brief
  * The number of GPU threads used for computing spread/gather contributions of a single atom as function of the PME order.
  * The assumption is currently that any thread processes only a single atom's contributions.
  * TODO: this assumption leads to minimum execution width of 16. See Redmine #2516
  */
-#define PME_SPREADGATHER_THREADS_PER_ATOM (order * order)
+constexpr int c_pmeSpreadGatherThreadsPerAtom = (c_pmeGpuOrder * c_pmeGpuOrder);
 
 /*! \brief Minimum execution width of the PME spread and gather kernels.
  *
  * Due to the one thread per atom and order=4 implementation constraints, order^2 threads
- * should execute without synchronization needed. See PME_SPREADGATHER_THREADS_PER_ATOM
+ * should execute without synchronization needed. See c_pmeSpreadGatherThreadsPerAtom
  */
-constexpr int c_pmeSpreadGatherMinWarpSize = 16;
+constexpr int c_pmeSpreadGatherMinWarpSize = c_pmeSpreadGatherThreadsPerAtom;
 
 /*! \brief
  * Atom data alignment (in terms of number of atoms).
@@ -136,7 +142,7 @@ constexpr int c_pmeSpreadGatherMinWarpSize;
  * Then the numbers of atoms which would fit in the padded GPU buffers have to be divisible by this.
  * There are debug asserts for this divisibility in pme_gpu_spread() and pme_gpu_gather().
  */
-#define PME_ATOM_DATA_ALIGNMENT 32
+constexpr int c_pmeAtomDataAlignment = 32;
 
 /*
  * The execution widths for PME GPU kernels, used both on host and device for correct scheduling.
@@ -170,7 +176,7 @@ constexpr int c_gatherMaxWarpsPerBlock = 4;
  * This macro depends on the templated order parameter (2 atoms per warp for order 4 and warp_size of 32).
  * It is mostly used for spline data layout tweaked for coalesced access.
  */
-#define PME_SPREADGATHER_ATOMS_PER_WARP (warp_size / PME_SPREADGATHER_THREADS_PER_ATOM)
+constexpr int c_pmeSpreadGatherAtomsPerWarp = (warp_size / c_pmeSpreadGatherThreadsPerAtom);
 
 //! Spreading max block size in threads
 constexpr int c_spreadMaxThreadsPerBlock = c_spreadMaxWarpsPerBlock * warp_size;
 
diff --git a/src/gromacs/ewald/pme-gpu-internal.cpp b/src/gromacs/ewald/pme-gpu-internal.cpp
index d90ee204ad..ff9ea073c4 100644
--- a/src/gromacs/ewald/pme-gpu-internal.cpp
+++ b/src/gromacs/ewald/pme-gpu-internal.cpp
@@ -102,15 +102,13 @@ static PmeGpuKernelParamsBase *pme_gpu_get_kernel_params_base_ptr(const PmeGpu *
 
 int pme_gpu_get_atom_data_alignment(const PmeGpu * /*unused*/)
 {
-    //TODO: this can be simplified, as PME_ATOM_DATA_ALIGNMENT is now constant
-    return PME_ATOM_DATA_ALIGNMENT;
+    //TODO: this can be simplified, as c_pmeAtomDataAlignment is now constant
+    return c_pmeAtomDataAlignment;
 }
 
 int pme_gpu_get_atoms_per_warp(const PmeGpu *pmeGpu)
 {
-    const int order = pmeGpu->common->pme_order;
-    GMX_ASSERT(order > 0, "Invalid PME order");
-    return pmeGpu->programHandle_->impl_->warpSize / PME_SPREADGATHER_THREADS_PER_ATOM;
+    return pmeGpu->programHandle_->impl_->warpSize / c_pmeSpreadGatherThreadsPerAtom;
 }
 
 void pme_gpu_synchronize(const PmeGpu *pmeGpu)
@@ -591,11 +589,11 @@ void pme_gpu_destroy_3dfft(const PmeGpu *pmeGpu)
 
 int getSplineParamFullIndex(int order, int splineIndex, int dimIndex, int atomIndex, int atomsPerWarp)
 {
-    if (order != 4)
+    if (order != c_pmeGpuOrder)
     {
         throw order;
     }
-    constexpr int fixedOrder = 4;
+    constexpr int fixedOrder = c_pmeGpuOrder;
     GMX_UNUSED_VALUE(fixedOrder);
 
     const int atomWarpIndex = atomIndex % atomsPerWarp;
@@ -743,6 +741,10 @@ static void pme_gpu_copy_common_data_from(const gmx_pme_t *pme)
     pmeGpu->common->nk[YY] = pme->nky;
     pmeGpu->common->nk[ZZ] = pme->nkz;
     pmeGpu->common->pme_order = pme->pme_order;
+    if (pmeGpu->common->pme_order != c_pmeGpuOrder)
+    {
+        GMX_THROW(gmx::NotImplementedError("pme_order != 4 is not implemented!"));
+    }
     for (int i = 0; i < DIM; i++)
     {
         pmeGpu->common->bsp_mod[i].assign(pme->bsp_mod[i], pme->bsp_mod[i] + pmeGpu->common->nk[i]);
@@ -813,6 +815,7 @@ void pme_gpu_transform_spline_atom_data(const PmeGpu *pmeGpu, const pme_atomcomm
     const auto atomCount    = pme_gpu_get_kernel_params_base_ptr(pmeGpu)->atoms.nAtoms;
     const auto atomsPerWarp = pme_gpu_get_atoms_per_warp(pmeGpu);
     const auto pmeOrder     = pmeGpu->common->pme_order;
+    GMX_ASSERT(pmeOrder == c_pmeGpuOrder, "Only PME order 4 is implemented");
 
     real  *cpuSplineBuffer;
     float *h_splineBuffer;
@@ -990,14 +993,15 @@ void pme_gpu_spread(const PmeGpu *pmeGpu,
 
     const size_t blockSize = pmeGpu->programHandle_->impl_->spreadWorkGroupSize;
 
     const int order = pmeGpu->common->pme_order;
-    const int atomsPerBlock = blockSize / PME_SPREADGATHER_THREADS_PER_ATOM;
+    GMX_ASSERT(order == c_pmeGpuOrder, "Only PME order 4 is implemented");
+    const int atomsPerBlock = blockSize / c_pmeSpreadGatherThreadsPerAtom;
     // TODO: pick smaller block size in runtime if needed
    // (e.g. on 660 Ti where 50% occupancy is ~25% faster than 100% occupancy with RNAse (~17.8k atoms))
     // If doing so, change atomsPerBlock in the kernels as well.
     // TODO: test varying block sizes on modern arch-s as well
     // TODO: also consider using cudaFuncSetCacheConfig() for preferring shared memory on older architectures
     //(for spline data mostly, together with varying PME_GPU_PARALLEL_SPLINE define)
-    GMX_ASSERT(!c_usePadding || !(PME_ATOM_DATA_ALIGNMENT % atomsPerBlock), "inconsistent atom data padding vs. spreading block size");
+    GMX_ASSERT(!c_usePadding || !(c_pmeAtomDataAlignment % atomsPerBlock), "inconsistent atom data padding vs. spreading block size");
     const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock;
     auto      dimGrid    = pmeGpuCreateGrid(pmeGpu, blockCount);
@@ -1009,11 +1013,6 @@ void pme_gpu_spread(const PmeGpu *pmeGpu,
     config.gridSize[1] = dimGrid.second;
     config.stream      = pmeGpu->archSpecific->pmeStream;
 
-    if (order != 4)
-    {
-        GMX_THROW(gmx::NotImplementedError("The code for pme_order != 4 was not implemented!"));
-    }
-
     int timingId;
     PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
     if (computeSplines)
@@ -1173,9 +1172,6 @@ void pme_gpu_gather(PmeGpu *pmeGpu,
         pme_gpu_copy_input_forces(pmeGpu);
     }
 
-    const int order = pmeGpu->common->pme_order;
-    const auto *kernelParamsPtr = pmeGpu->kernelParams.get();
-
     if (!pme_gpu_performs_FFT(pmeGpu) || pme_gpu_is_testing(pmeGpu))
     {
         pme_gpu_copy_input_gather_grid(pmeGpu, const_cast<float *>(h_grid));
@@ -1187,12 +1183,16 @@ void pme_gpu_gather(PmeGpu *pmeGpu,
     }
 
     const size_t blockSize = pmeGpu->programHandle_->impl_->gatherWorkGroupSize;
-    const int atomsPerBlock = blockSize / PME_SPREADGATHER_THREADS_PER_ATOM;
-    GMX_ASSERT(!c_usePadding || !(PME_ATOM_DATA_ALIGNMENT % atomsPerBlock), "inconsistent atom data padding vs. gathering block size");
+    const int atomsPerBlock = blockSize / c_pmeSpreadGatherThreadsPerAtom;
+    GMX_ASSERT(!c_usePadding || !(c_pmeAtomDataAlignment % atomsPerBlock), "inconsistent atom data padding vs. gathering block size");
     const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock;
     auto      dimGrid    = pmeGpuCreateGrid(pmeGpu, blockCount);
+
+    const int order = pmeGpu->common->pme_order;
+    GMX_ASSERT(order == c_pmeGpuOrder, "Only PME order 4 is implemented");
+
     KernelLaunchConfig config;
     config.blockSize[0] = config.blockSize[1] = order;
     config.blockSize[2] = atomsPerBlock;
@@ -1200,11 +1200,6 @@ void pme_gpu_gather(PmeGpu *pmeGpu,
     config.gridSize[1] = dimGrid.second;
     config.stream      = pmeGpu->archSpecific->pmeStream;
 
-    if (order != 4)
-    {
-        GMX_THROW(gmx::NotImplementedError("The code for pme_order != 4 was not implemented!"));
-    }
-
     // TODO test different cache configs
 
     int timingId = gtPME_GATHER;
@@ -1214,17 +1209,18 @@ ...
                                              pmeGpu->programHandle_->impl_->gatherReduceWithInputKernel;
 
     pme_gpu_start_timing(pmeGpu, timingId);
-    auto       *timingEvent = pme_gpu_fetch_timing_event(pmeGpu, timingId);
+    auto       *timingEvent     = pme_gpu_fetch_timing_event(pmeGpu, timingId);
+    const auto *kernelParamsPtr = pmeGpu->kernelParams.get();
 #if c_canEmbedBuffers
-    const auto  kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
+    const auto  kernelArgs      = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
 #else
-    const auto  kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr,
-                                                       &kernelParamsPtr->atoms.d_coefficients,
-                                                       &kernelParamsPtr->grid.d_realGrid,
-                                                       &kernelParamsPtr->atoms.d_theta,
-                                                       &kernelParamsPtr->atoms.d_dtheta,
-                                                       &kernelParamsPtr->atoms.d_gridlineIndices,
-                                                       &kernelParamsPtr->atoms.d_forces);
+    const auto  kernelArgs      = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr,
+                                                            &kernelParamsPtr->atoms.d_coefficients,
+                                                            &kernelParamsPtr->grid.d_realGrid,
+                                                            &kernelParamsPtr->atoms.d_theta,
+                                                            &kernelParamsPtr->atoms.d_dtheta,
+                                                            &kernelParamsPtr->atoms.d_gridlineIndices,
+                                                            &kernelParamsPtr->atoms.d_forces);
 #endif
     launchGpuKernel(kernelPtr, config, timingEvent, "PME gather", kernelArgs);
     pme_gpu_stop_timing(pmeGpu, timingId);
diff --git a/src/gromacs/ewald/pme-gpu-program-impl-ocl.cpp b/src/gromacs/ewald/pme-gpu-program-impl-ocl.cpp
index f2c00f1516..43b533651d 100644
--- a/src/gromacs/ewald/pme-gpu-program-impl-ocl.cpp
+++ b/src/gromacs/ewald/pme-gpu-program-impl-ocl.cpp
@@ -140,12 +140,11 @@ void PmeGpuProgramImpl::compileKernels(const gmx_device_info_t *deviceInfo)
      * files outside as macros, to avoid including those files
      * in the JIT compilation that happens at runtime.
      */
-    constexpr int order = 4;
     const std::string commonDefines = gmx::formatString(
             "-Dwarp_size=%zd "
             "-Dorder=%d "
-            "-DPME_SPREADGATHER_ATOMS_PER_WARP=%zd "
-            "-DPME_SPREADGATHER_THREADS_PER_ATOM=%d "
+            "-DatomsPerWarp=%zd "
+            "-DthreadsPerAtom=%d "
             // forwarding from pme-grid.h, used for spline computation table sizes only
             "-Dc_pmeMaxUnitcellShift=%f "
             // forwarding PME behavior constants from pme-gpu-constants.h
@@ -161,9 +160,9 @@ void PmeGpuProgramImpl::compileKernels(const gmx_device_info_t *deviceInfo)
             // decomposition parameter placeholders
             "-DwrapX=true -DwrapY=true ",
             warpSize,
-            order,
-            warpSize / PME_SPREADGATHER_THREADS_PER_ATOM,
-            PME_SPREADGATHER_THREADS_PER_ATOM,
+            c_pmeGpuOrder,
+            warpSize / c_pmeSpreadGatherThreadsPerAtom,
+            c_pmeSpreadGatherThreadsPerAtom,
             static_cast<float>(c_pmeMaxUnitcellShift),
             c_usePadding,
             c_skipNeutralAtoms,
diff --git a/src/gromacs/ewald/pme-gpu-types-host.h b/src/gromacs/ewald/pme-gpu-types-host.h
index 65a2ef9797..8a2625cb75 100644
--- a/src/gromacs/ewald/pme-gpu-types-host.h
+++ b/src/gromacs/ewald/pme-gpu-types-host.h
@@ -184,14 +184,14 @@ struct PmeGpu
      */
     PmeGpuStaging staging;
 
-    /*! \brief Number of local atoms, padded to be divisible by PME_ATOM_DATA_ALIGNMENT.
+    /*! \brief Number of local atoms, padded to be divisible by c_pmeAtomDataAlignment.
      * Used for kernel scheduling.
      * kernelParams.atoms.nAtoms is the actual atom count to be used for data copying.
     * TODO: this and the next member represent a memory allocation/padding properties -
      * what a container type should do ideally.
      */
     int nAtomsPadded;
 
-    /*! \brief Number of local atoms, padded to be divisible by PME_ATOM_DATA_ALIGNMENT
+    /*! \brief Number of local atoms, padded to be divisible by c_pmeAtomDataAlignment
      * if c_usePadding is true.
      * Used only as a basic size for almost all the atom data allocations
      * (spline parameter data is also aligned by PME_SPREADGATHER_PARTICLES_PER_WARP).
diff --git a/src/gromacs/ewald/pme-gpu-utils.clh b/src/gromacs/ewald/pme-gpu-utils.clh
index fa666ab8af..fe0531dc49 100644
--- a/src/gromacs/ewald/pme-gpu-utils.clh
+++ b/src/gromacs/ewald/pme-gpu-utils.clh
@@ -40,7 +40,7 @@
 * This closely mirrors pme-gpu-utils.h (which is used in CUDA and unit tests), except with no templates.
 * Instead of templated parameters this file expects following defines during compilation:
 * - order - PME interpolation order;
- * - PME_SPREADGATHER_ATOMS_PER_WARP - number of atoms processed by a warp (fixed for spread and gather kernels to be the same);
+ * - atomsPerWarp - number of atoms processed by a warp (fixed for spread and gather kernels to be the same);
 * - c_usePadding and c_skipNeutralAtoms - same as in pme-gpu-constants.h.
 *
 * \author Aleksei Iupinov
@@ -53,20 +53,20 @@
 * Feed the result into getSplineParamIndex() to get a full index.
 * TODO: it's likely that both parameters can be just replaced with a single atom index, as they are derived from it.
 * Do that, verifying that the generated code is not bloated, and/or revise the spline indexing scheme.
- * Removing warp dependency would also be nice (and would probably coincide with removing PME_SPREADGATHER_ATOMS_PER_WARP).
+ * Removing warp dependency would also be nice (and would probably coincide with removing atomsPerWarp).
 *
 * \param[in] warpIndex        Warp index wrt the block.
- * \param[in] atomWarpIndex    Atom index wrt the warp (from 0 to PME_SPREADGATHER_ATOMS_PER_WARP - 1).
+ * \param[in] atomWarpIndex    Atom index wrt the warp (from 0 to atomsPerWarp - 1).
 *
 * \returns Index into theta or dtheta array using GPU layout.
 */
 inline int getSplineParamIndexBase(int warpIndex, int atomWarpIndex)
 {
-    assert((atomWarpIndex >= 0) && (atomWarpIndex < PME_SPREADGATHER_ATOMS_PER_WARP));
+    assert((atomWarpIndex >= 0) && (atomWarpIndex < atomsPerWarp));
     const int dimIndex    = 0;
     const int splineIndex = 0;
     // The zeroes are here to preserve the full index formula for reference
-    return (((splineIndex + order * warpIndex) * DIM + dimIndex) * PME_SPREADGATHER_ATOMS_PER_WARP + atomWarpIndex);
+    return (((splineIndex + order * warpIndex) * DIM + dimIndex) * atomsPerWarp + atomWarpIndex);
 }
 
 /*! \internal \brief
@@ -87,7 +87,7 @@ inline int getSplineParamIndex(int paramIndexBase, int dimIndex, int splineIndex
 {
     assert((dimIndex >= XX) && (dimIndex < DIM));
     assert((splineIndex >= 0) && (splineIndex < order));
-    return (paramIndexBase + (splineIndex * DIM + dimIndex) * PME_SPREADGATHER_ATOMS_PER_WARP);
+    return (paramIndexBase + (splineIndex * DIM + dimIndex) * atomsPerWarp);
 }
 
 /*! \brief
diff --git a/src/gromacs/ewald/pme-gpu-utils.h b/src/gromacs/ewald/pme-gpu-utils.h
index 3c7e5f0fe1..e576a56045 100644
--- a/src/gromacs/ewald/pme-gpu-utils.h
+++ b/src/gromacs/ewald/pme-gpu-utils.h
@@ -64,7 +64,7 @@
 * Feed the result into getSplineParamIndex() to get a full index.
 * TODO: it's likely that both parameters can be just replaced with a single atom index, as they are derived from it.
 * Do that, verifying that the generated code is not bloated, and/or revise the spline indexing scheme.
- * Removing warp dependency would also be nice (and would probably coincide with removing PME_SPREADGATHER_ATOMS_PER_WARP).
+ * Removing warp dependency would also be nice (and would probably coincide with removing c_pmeSpreadGatherAtomsPerWarp).
 *
 * \tparam order            PME order
 * \tparam atomsPerWarp     Number of atoms processed by a warp
diff --git a/src/gromacs/ewald/pme-program.cl b/src/gromacs/ewald/pme-program.cl
index eb0cdb1fe8..afbf17273d 100644
--- a/src/gromacs/ewald/pme-program.cl
+++ b/src/gromacs/ewald/pme-program.cl
@@ -57,7 +57,7 @@
 
 /* SPREAD/SPLINE */
 
-#define atomsPerBlock (c_spreadWorkGroupSize / PME_SPREADGATHER_THREADS_PER_ATOM)
+#define atomsPerBlock (c_spreadWorkGroupSize / threadsPerAtom)
 
 // spline/spread fused
 #define computeSplines 1
@@ -92,7 +92,7 @@
 
 /* GATHER */
 
-#define atomsPerBlock (c_gatherWorkGroupSize / PME_SPREADGATHER_THREADS_PER_ATOM)
+#define atomsPerBlock (c_gatherWorkGroupSize / threadsPerAtom)
 
 // gather
 #define overwriteForces 1
diff --git a/src/gromacs/ewald/pme-spread.clh b/src/gromacs/ewald/pme-spread.clh
index f4ef414230..5916e162c3 100644
--- a/src/gromacs/ewald/pme-spread.clh
+++ b/src/gromacs/ewald/pme-spread.clh
@@ -132,16 +132,16 @@ inline void calculate_splines(const struct PmeOpenCLKernelParams kernelParams,
     /* Thread index w.r.t. warp */
     const int threadWarpIndex = threadLocalIndex % warp_size;
     /* Atom index w.r.t. warp - alternating 0 1 0 1 .. */
-    const int atomWarpIndex = threadWarpIndex % PME_SPREADGATHER_ATOMS_PER_WARP;
+    const int atomWarpIndex = threadWarpIndex % atomsPerWarp;
     /* Atom index w.r.t. block/shared memory */
-    const int atomIndexLocal = warpIndex * PME_SPREADGATHER_ATOMS_PER_WARP + atomWarpIndex;
+    const int atomIndexLocal = warpIndex * atomsPerWarp + atomWarpIndex;
     /* Atom index w.r.t. global memory */
     const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
 
     /* Spline contribution index in one dimension */
-    const int orderIndex = threadWarpIndex / (PME_SPREADGATHER_ATOMS_PER_WARP * DIM);
+    const int orderIndex = threadWarpIndex / (atomsPerWarp * DIM);
     /* Dimension index */
-    const int dimIndex = (threadWarpIndex - orderIndex * (PME_SPREADGATHER_ATOMS_PER_WARP * DIM)) / PME_SPREADGATHER_ATOMS_PER_WARP;
+    const int dimIndex = (threadWarpIndex - orderIndex * (atomsPerWarp * DIM)) / atomsPerWarp;
 
     /* Multi-purpose index of rvec/ivec atom data */
     const int sharedMemoryIndex = atomIndexLocal * DIM + dimIndex;
@@ -343,9 +343,9 @@ inline void spread_charges(const struct PmeOpenCLKernelParams kernelParams,
     }
 
     /* Atom index w.r.t. warp - alternating 0 1 0 1 .. */
-    const int atomWarpIndex = atomIndexLocal % PME_SPREADGATHER_ATOMS_PER_WARP;
+    const int atomWarpIndex = atomIndexLocal % atomsPerWarp;
     /* Warp index w.r.t. block - could probably be obtained easier? */
-    const int warpIndex = atomIndexLocal / PME_SPREADGATHER_ATOMS_PER_WARP;
+    const int warpIndex = atomIndexLocal / atomsPerWarp;
 
     const int splineIndexBase = getSplineParamIndexBase(warpIndex, atomWarpIndex);
     const int splineIndexZ    = getSplineParamIndex(splineIndexBase, ZZ, ithz);
diff --git a/src/gromacs/ewald/pme-spread.cu b/src/gromacs/ewald/pme-spread.cu
index e45945551b..030367fd30 100644
--- a/src/gromacs/ewald/pme-spread.cu
+++ b/src/gromacs/ewald/pme-spread.cu
@@ -124,7 +124,7 @@ __device__ __forceinline__ void calculate_splines(const PmeGpuCudaKernelParams
     float * __restrict__ gm_dtheta          = kernelParams.atoms.d_dtheta;
     int * __restrict__   gm_gridlineIndices = kernelParams.atoms.d_gridlineIndices;
 
-    const int atomsPerWarp = PME_SPREADGATHER_ATOMS_PER_WARP;
+    const int atomsPerWarp = c_pmeSpreadGatherAtomsPerWarp;
 
     /* Fractional coordinates */
     __shared__ float sm_fractCoords[atomsPerBlock * DIM];
@@ -332,7 +332,7 @@ __device__ __forceinline__ void spread_charges(const PmeGpuCudaKernelParams
     float * __restrict__ gm_grid = kernelParams.grid.d_realGrid;
 
-    const int atomsPerWarp = PME_SPREADGATHER_ATOMS_PER_WARP;
+    const int atomsPerWarp = c_pmeSpreadGatherAtomsPerWarp;
 
     const int nx = kernelParams.grid.realGridSize[XX];
     const int ny = kernelParams.grid.realGridSize[YY];
@@ -417,7 +417,7 @@ template <
 __launch_bounds__(c_spreadMaxThreadsPerBlock)
 __global__ void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams kernelParams)
 {
-    const int atomsPerBlock = c_spreadMaxThreadsPerBlock / PME_SPREADGATHER_THREADS_PER_ATOM;
+    const int atomsPerBlock = c_spreadMaxThreadsPerBlock / c_pmeSpreadGatherThreadsPerAtom;
     // Gridline indices, ivec
     __shared__ int sm_gridlineIndices[atomsPerBlock * DIM];
     // Charges
-- 
2.11.4.GIT
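
Editor's note: for context on the macro-vs-constexpr change above, the following stand-alone sketch (illustration only; PmeCommonStandIn and atomsPerBlockFor are hypothetical names, not GROMACS symbols) contrasts the removed macro with the constexpr constants introduced by the patch, and shows the kind of order consistency check it adds in pme_gpu_copy_common_data_from() and the kernel launch paths.

/* Illustration only -- not GROMACS code. */
#include <cassert>

/* Old style: the macro only compiled where a variable named 'order' happened to be in
 * scope, so every caller had to declare one, and a mismatching value went undetected:
 *     #define PME_SPREADGATHER_THREADS_PER_ATOM (order * order)
 */

/* New style: self-contained constants, usable in constant expressions
 * (array sizes, __launch_bounds__) without any local variable. */
constexpr int c_pmeGpuOrder                   = 4;
constexpr int c_pmeSpreadGatherThreadsPerAtom = c_pmeGpuOrder * c_pmeGpuOrder;

struct PmeCommonStandIn // stand-in for the runtime structure that carries pme_order
{
    int pme_order;
};

int atomsPerBlockFor(int maxThreadsPerBlock, const PmeCommonStandIn &common)
{
    // Consistency check analogous to the ones the patch adds: the run-time order stored
    // in the data structures must match the compile-time constant the kernels assume.
    assert(common.pme_order == c_pmeGpuOrder);
    return maxThreadsPerBlock / c_pmeSpreadGatherThreadsPerAtom;
}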
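
Editor's note: the atomsPerWarp constant renamed throughout the patch feeds the spline-parameter indexing used by the spread and gather kernels. This stand-alone sketch reproduces the index formula from the patched getSplineParamIndexBase()/getSplineParamIndex() helpers; the warp size of 32 (giving 2 atoms per warp at order 4), the main() driver, and the printed values are illustrative assumptions only.

#include <cassert>
#include <cstdio>

constexpr int DIM          = 3;
constexpr int order        = 4;  // c_pmeGpuOrder
constexpr int warpSize     = 32; // assumption; the OpenCL build passes warp_size at JIT time
constexpr int atomsPerWarp = warpSize / (order * order);

int getSplineParamIndexBase(int warpIndex, int atomWarpIndex)
{
    assert(atomWarpIndex >= 0 && atomWarpIndex < atomsPerWarp);
    const int dimIndex    = 0;
    const int splineIndex = 0;
    // The zeroes are kept to show the full index formula, as in the original helper
    return ((splineIndex + order * warpIndex) * DIM + dimIndex) * atomsPerWarp + atomWarpIndex;
}

int getSplineParamIndex(int paramIndexBase, int dimIndex, int splineIndex)
{
    assert(dimIndex >= 0 && dimIndex < DIM);
    assert(splineIndex >= 0 && splineIndex < order);
    return paramIndexBase + (splineIndex * DIM + dimIndex) * atomsPerWarp;
}

int main()
{
    // Second atom of warp 1, Y dimension (index 1), third spline term:
    const int base  = getSplineParamIndexBase(1, 1);
    const int index = getSplineParamIndex(base, 1, 2);
    std::printf("base = %d, index = %d\n", base, index); // prints base = 25, index = 39
    return 0;
}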