/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2013-2016,2017,2018,2019, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 * \brief Implements PME GPU spline calculation and charge spreading in CUDA.
 * TODO: consider always pre-sorting particles (as in DD case).
 *
 * \author Aleksei Iupinov <a.yupinov@gmail.com>
 */
#include "gmxpre.h"

#include <cassert>

#include "gromacs/gpu_utils/cuda_kernel_utils.cuh"

#include "pme.cuh"
#include "pme_calculate_splines.cuh"
#include "pme_gpu_utils.h"

/*! \brief
 * Charge spreading onto the grid.
 * This corresponds to the CPU function spread_coefficients_bsplines_thread().
 * Optional second stage of the spline_and_spread_kernel.
 *
 * \tparam[in] order              PME interpolation order.
 * \tparam[in] wrapX              A boolean which tells if the grid overlap in dimension X should be wrapped.
 * \tparam[in] wrapY              A boolean which tells if the grid overlap in dimension Y should be wrapped.
 * \tparam[in] useOrderThreads    A boolean which tells if we should use order threads per atom (order*order used if false).
 * \param[in] kernelParams        Input PME CUDA data in constant memory.
 * \param[in] atomIndexOffset     Starting atom index for the execution block w.r.t. global memory.
 * \param[in] atomCharge          Atom charge/coefficient of the atom processed by this thread.
 * \param[in] sm_gridlineIndices  Atom gridline indices in shared memory.
 * \param[in] sm_theta            Atom spline values in shared memory.
 */
template<const int order, const bool wrapX, const bool wrapY, const bool useOrderThreads>
__device__ __forceinline__ void spread_charges(const PmeGpuCudaKernelParams kernelParams,
                                               int                          atomIndexOffset,
                                               const float*                 atomCharge,
                                               const int* __restrict__      sm_gridlineIndices,
                                               const float* __restrict__    sm_theta)
{
    /* Global memory pointer to the output grid */
    float* __restrict__ gm_grid = kernelParams.grid.d_realGrid;

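    /* A warp covers more atoms when each atom uses only order threads
     * than when it uses order*order threads. */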
    const int atomsPerWarp = useOrderThreads ? c_pmeSpreadGatherAtomsPerWarp4ThPerAtom
                                             : c_pmeSpreadGatherAtomsPerWarp;

    const int nx  = kernelParams.grid.realGridSize[XX];
    const int ny  = kernelParams.grid.realGridSize[YY];
    const int nz  = kernelParams.grid.realGridSize[ZZ];
    const int pny = kernelParams.grid.realGridSizePadded[YY];
    const int pnz = kernelParams.grid.realGridSizePadded[ZZ];

    const int offx = 0, offy = 0, offz = 0; // unused for now

    const int atomIndexLocal  = threadIdx.z;
    const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;

    const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms);
    const int chargeCheck = pme_gpu_check_atom_charge(*atomCharge);
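    /* Threads mapped to atoms with zero charge, or past the end of the atom data, do nothing */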
    if (chargeCheck & globalCheck)
    {
        // Spline Z coordinates
        const int ithz = threadIdx.x;

        const int ixBase = sm_gridlineIndices[atomIndexLocal * DIM + XX] - offx;
        const int iyBase = sm_gridlineIndices[atomIndexLocal * DIM + YY] - offy;
        int       iz     = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] - offz + ithz;
        /* The z-wrap is always applied, regardless of the wrapX/wrapY template parameters */
        if (iz >= nz)
        {
            iz -= nz;
        }
        /* Atom index w.r.t. warp - alternating 0 1 0 1 .. */
        const int atomWarpIndex = atomIndexLocal % atomsPerWarp;
        /* Warp index w.r.t. block - could probably be obtained easier? */
        const int warpIndex = atomIndexLocal / atomsPerWarp;

        const int   splineIndexBase = getSplineParamIndexBase<order, atomsPerWarp>(warpIndex, atomWarpIndex);
        const int   splineIndexZ    = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, ZZ, ithz);
        const float thetaZ          = sm_theta[splineIndexZ];
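        /* sm_theta stores the spline parameters in a per-warp layout:
         * getSplineParamIndexBase() locates this atom's base offset, and
         * getSplineParamIndex() then selects the value for a given dimension
         * (XX/YY/ZZ) and spline index within the order. */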
        /* loop not used if order*order threads per atom */
        const int ithyMin = useOrderThreads ? 0 : threadIdx.y;
        const int ithyMax = useOrderThreads ? order : threadIdx.y + 1;
        for (int ithy = ithyMin; ithy < ithyMax; ithy++)
        {
            int iy = iyBase + ithy;
            if (wrapY & (iy >= ny))
            {
                iy -= ny;
            }

            const int   splineIndexY = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, YY, ithy);
            const float thetaY       = sm_theta[splineIndexY];
            const float Val          = thetaZ * thetaY * (*atomCharge);
            assert(isfinite(Val));
            const int offset = iy * pnz + iz;
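            /* Loop over this atom's order contributions along X. The real grid is stored
             * row-major with X as the slowest dimension, so the full padded-grid index is
             * ix * pny * pnz + iy * pnz + iz. */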
#pragma unroll
            for (int ithx = 0; (ithx < order); ithx++)
            {
                int ix = ixBase + ithx;
                if (wrapX & (ix >= nx))
                {
                    ix -= nx;
                }
                const int   gridIndexGlobal = ix * pny * pnz + offset;
                const int   splineIndexX = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, XX, ithx);
                const float thetaX       = sm_theta[splineIndexX];
                assert(isfinite(thetaX));
                assert(isfinite(gm_grid[gridIndexGlobal]));
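                /* Spline support regions of neighboring atoms overlap, so several threads
                 * may update the same grid point concurrently; hence the atomic add. */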
                atomicAdd(gm_grid + gridIndexGlobal, thetaX * Val);
            }
        }
    }
}

/*! \brief
 * A spline computation and charge spreading kernel function.
 *
 * Two tuning parameters can be used for additional performance. For small systems, and for
 * debugging, writeGlobal should be used; it removes the need to recalculate the theta values
 * in the gather kernel. Similarly, for large systems, useOrderThreads (order threads per atom)
 * gives higher performance than order*order threads per atom.
 *
 * \tparam[in] order            PME interpolation order.
 * \tparam[in] computeSplines   A boolean which tells if the spline parameter and
 *                              gridline indices' computation should be performed.
 * \tparam[in] spreadCharges    A boolean which tells if the charge spreading should be performed.
 * \tparam[in] wrapX            A boolean which tells if the grid overlap in dimension X should be wrapped.
 * \tparam[in] wrapY            A boolean which tells if the grid overlap in dimension Y should be wrapped.
 * \tparam[in] writeGlobal      A boolean which tells if the theta values and gridlines should be written to global memory.
 * \tparam[in] useOrderThreads  A boolean which tells if we should use order threads per atom (order*order used if false).
 * \param[in] kernelParams      Input PME CUDA data in constant memory.
 */
template<const int  order,
         const bool computeSplines,
         const bool spreadCharges,
         const bool wrapX,
         const bool wrapY,
         const bool writeGlobal,
         const bool useOrderThreads>
__launch_bounds__(c_spreadMaxThreadsPerBlock)
CLANG_DISABLE_OPTIMIZATION_ATTRIBUTE
__global__ void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams kernelParams)
{
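    /* Block layout, as used below: threadIdx.x spans the order spline contributions along Z,
     * threadIdx.y spans the order contributions along Y (a single thread loops over Y instead
     * when useOrderThreads is true), and threadIdx.z enumerates the atoms of the block. */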
    const int atomsPerBlock = useOrderThreads ? c_spreadMaxThreadsPerBlock / c_pmeSpreadGatherThreadsPerAtom4ThPerAtom
                                              : c_spreadMaxThreadsPerBlock / c_pmeSpreadGatherThreadsPerAtom;
    // Gridline indices, ivec
    __shared__ int sm_gridlineIndices[atomsPerBlock * DIM];
    // Spline values
    __shared__ float sm_theta[atomsPerBlock * DIM * order];
    float            dtheta;

    const int atomsPerWarp = useOrderThreads ? c_pmeSpreadGatherAtomsPerWarp4ThPerAtom
                                             : c_pmeSpreadGatherAtomsPerWarp;

    float3 atomX;
    float  atomCharge;

    const int blockIndex      = blockIdx.y * gridDim.x + blockIdx.x;
    const int atomIndexOffset = blockIndex * atomsPerBlock;

    /* Thread index w.r.t. block */
    const int threadLocalId =
            (threadIdx.z * (blockDim.x * blockDim.y)) + (threadIdx.y * blockDim.x) + threadIdx.x;
    /* Warp index w.r.t. block - could probably be obtained easier? */
    const int warpIndex = threadLocalId / warp_size;

    /* Atom index w.r.t. warp */
    const int atomWarpIndex = threadIdx.z % atomsPerWarp;
    /* Atom index w.r.t. block/shared memory */
    const int atomIndexLocal = warpIndex * atomsPerWarp + atomWarpIndex;
    /* Atom index w.r.t. global memory */
    const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;

    /* Early return for fully empty blocks at the end
     * (should only happen for billions of input atoms)
     */
    if (atomIndexOffset >= kernelParams.atoms.nAtoms)
    {
        return;
    }

    /* Charges, required for both spline and spread */
    if (c_useAtomDataPrefetch)
    {
        __shared__ float sm_coefficients[atomsPerBlock];
        pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(kernelParams, sm_coefficients,
                                                         kernelParams.atoms.d_coefficients);
        /* Barrier: the staged charges must be visible to all threads of the block */
        __syncthreads();
        atomCharge = sm_coefficients[atomIndexLocal];
    }
    else
    {
        atomCharge = kernelParams.atoms.d_coefficients[atomIndexGlobal];
    }

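    /* With computeSplines the spline parameters are calculated here from the staged
     * coordinates; otherwise they are loaded from global memory, where a previous
     * spline-only kernel launch must have written them. */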
    if (computeSplines)
    {
        if (c_useAtomDataPrefetch)
        {
            // Coordinates
            __shared__ float sm_coordinates[DIM * atomsPerBlock];

            /* Staging coordinates */
            pme_gpu_stage_atom_data<float, atomsPerBlock, DIM>(kernelParams, sm_coordinates,
                                                               kernelParams.atoms.d_coordinates);
            __syncthreads();
            atomX.x = sm_coordinates[atomIndexLocal * DIM + XX];
            atomX.y = sm_coordinates[atomIndexLocal * DIM + YY];
            atomX.z = sm_coordinates[atomIndexLocal * DIM + ZZ];
        }
        else
        {
            atomX.x = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + XX];
            atomX.y = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + YY];
            atomX.z = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + ZZ];
        }
        calculate_splines<order, atomsPerBlock, atomsPerWarp, false, writeGlobal>(
                kernelParams, atomIndexOffset, atomX, atomCharge, sm_theta, &dtheta, sm_gridlineIndices);
        __syncwarp();
    }
    else
    {
        /* Staging the data for spread
         * (the data is assumed to be in GPU global memory with proper layout already,
         * as in after running the spline kernel)
         */
        /* Spline data - only thetas (dthetas will only be needed in gather) */
        pme_gpu_stage_atom_data<float, atomsPerBlock, DIM * order>(kernelParams, sm_theta,
                                                                   kernelParams.atoms.d_theta);
        /* Gridline indices */
        pme_gpu_stage_atom_data<int, atomsPerBlock, DIM>(kernelParams, sm_gridlineIndices,
                                                         kernelParams.atoms.d_gridlineIndices);

        __syncthreads();
    }

    /* Spreading */
    if (spreadCharges)
    {
        spread_charges<order, wrapX, wrapY, useOrderThreads>(kernelParams, atomIndexOffset,
                                                             &atomCharge, sm_gridlineIndices, sm_theta);
    }
}

//! Kernel instantiations
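// Template arguments: <order, computeSplines, spreadCharges, wrapX, wrapY, writeGlobal, useOrderThreads>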
template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, true, true>(const PmeGpuCudaKernelParams);
template __global__ void pme_spline_and_spread_kernel<4, true, false, true, true, true, true>(const PmeGpuCudaKernelParams);
template __global__ void pme_spline_and_spread_kernel<4, false, true, true, true, true, true>(const PmeGpuCudaKernelParams);

template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, false, true>(const PmeGpuCudaKernelParams);

template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, true, false>(const PmeGpuCudaKernelParams);
template __global__ void pme_spline_and_spread_kernel<4, true, false, true, true, true, false>(const PmeGpuCudaKernelParams);
template __global__ void pme_spline_and_spread_kernel<4, false, true, true, true, true, false>(const PmeGpuCudaKernelParams);

template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, false, false>(const PmeGpuCudaKernelParams);
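
/* Illustrative host-side launch sketch (an assumption for documentation purposes only:
 * the actual GROMACS dispatch lives in the PME GPU host layer, and the block geometry
 * below merely mirrors how the kernel indexes threads - x covers the order Z-contributions,
 * y covers 1 or order Y-contributions, z covers the atoms of the block):
 *
 *   dim3 blockSize(order, useOrderThreads ? 1 : order, atomsPerBlock);
 *   dim3 gridSize(blockCount, 1, 1);
 *   pme_spline_and_spread_kernel<4, true, true, true, true, false, true>
 *           <<<gridSize, blockSize, 0, stream>>>(kernelParams);
 */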