src/gromacs/ewald/pme_gather.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2016,2017,2018,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 /*! \internal \file
  37  *  \brief Implements PME force gathering in CUDA.
  38  *
  39  *  \author Aleksei Iupinov <a.yupinov@gmail.com>
  40  */
  41
  42 #include "gmxpre.h"
  43
  44 #include <cassert>
  45
  46 #include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
  47
  48 #include "pme.cuh"
  49 #include "pme_calculate_splines.cuh"
  50 #include "pme_gpu_utils.h"
  51 #include "pme_grid.h"
  52
  53 /*! \brief
  54  * An inline CUDA function: unroll the dynamic index accesses to the constant grid sizes to avoid local memory operations.
  55  */
  56 __device__ __forceinline__ float read_grid_size(const float *realGridSizeFP,
  57                                                 const int    dimIndex)
  58 {
  59     switch (dimIndex)
  60     {
  61         case XX: return realGridSizeFP[XX];
  62         case YY: return realGridSizeFP[YY];
  63         case ZZ: return realGridSizeFP[ZZ];
  64     }
  65     assert(false);
  66     return 0.0f;
  67 }
  68
  69 /*! \brief Reduce the partial force contributions.
  70  *
  71  * \tparam[in] order              The PME order (must be 4).
  72  * \tparam[in] atomDataSize       The number of partial force contributions for each atom (currently order^2 == 16)
  73  * \tparam[in] blockSize          The CUDA block size
  74  * \param[out] sm_forces          Shared memory array with the output forces (number of elements is number of atoms per block)
  75  * \param[in]  atomIndexLocal     Local atom index
  76  * \param[in]  splineIndex        Spline index
  77  * \param[in]  lineIndex          Line index (same as threadLocalId)
  78  * \param[in]  realGridSizeFP     Local grid size constant
  79  * \param[in]  fx                 Input force partial component X
  80  * \param[in]  fy                 Input force partial component Y
  81  * \param[in]  fz                 Input force partial component Z
  82  */
  83 template <
  84     const int order,
  85     const int atomDataSize,
  86     const int blockSize
  87     >
  88 __device__ __forceinline__ void reduce_atom_forces(float3 * __restrict__ sm_forces,
  89                                                    const int             atomIndexLocal,
  90                                                    const int             splineIndex,
  91                                                    const int             lineIndex,
  92                                                    const float          *realGridSizeFP,
  93                                                    float                &fx,
  94                                                    float                &fy,
  95                                                    float                &fz)
  96 {
  97     if (!(order & (order - 1))) // Only for orders of power of 2
  98     {
  99         const unsigned int activeMask = c_fullWarpMask;
 100
 101         // A tricky shuffle reduction inspired by reduce_force_j_warp_shfl
 102         // TODO: find out if this is the best in terms of transactions count
 103         static_assert(order == 4, "Only order of 4 is implemented");
 104         static_assert(atomDataSize <= warp_size, "TODO: rework for atomDataSize > warp_size (order 8 or larger)");
 105         const int width = atomDataSize;
 106
 107         fx += __shfl_down_sync(activeMask, fx, 1, width);
 108         fy += __shfl_up_sync  (activeMask, fy, 1, width);
 109         fz += __shfl_down_sync(activeMask, fz, 1, width);
 110
 111         if (splineIndex & 1)
 112         {
 113             fx = fy;
 114         }
 115
 116         fx += __shfl_down_sync(activeMask, fx, 2, width);
 117         fz += __shfl_up_sync  (activeMask, fz, 2, width);
 118
 119         if (splineIndex & 2)
 120         {
 121             fx = fz;
 122         }
 123
 124         // By now fx contains intermediate quad sums of all 3 components:
 125         // splineIndex    0            1            2 and 3      4            5            6 and 7      8...
 126         // sum of...      fx0 to fx3   fy0 to fy3   fz0 to fz3   fx4 to fx7   fy4 to fy7   fz4 to fz7   etc.
 127
 128         // We have to just further reduce those groups of 4
 129         for (int delta = 4; delta < atomDataSize; delta <<= 1)
 130         {
 131             fx += __shfl_down_sync(activeMask, fx, delta, width);
 132         }
 133
 134         const int dimIndex = splineIndex;
 135         if (dimIndex < DIM)
 136         {
 137             const float n = read_grid_size(realGridSizeFP, dimIndex);
 138             *((float *)(&sm_forces[atomIndexLocal]) + dimIndex) = fx * n;
 139         }
 140     }
 141     else
 142     {
 143         // We use blockSize shared memory elements to read fx, or fy, or fz, and then reduce them to fit into smemPerDim elements
 144         // which are stored separately (first 2 dimensions only)
 145         const int         smemPerDim   = warp_size;
 146         const int         smemReserved = (DIM) *smemPerDim;
 147         __shared__ float  sm_forceReduction[smemReserved + blockSize];
 148         __shared__ float *sm_forceTemp[DIM];
 149
 150         const int         numWarps  = blockSize / smemPerDim;
 151         const int         minStride = max(1, atomDataSize / numWarps); // order 4: 128 threads => 4, 256 threads => 2, etc
 152
 153 #pragma unroll
 154         for (int dimIndex = 0; dimIndex < DIM; dimIndex++)
 155         {
 156             int elementIndex = smemReserved + lineIndex;
 157             // Store input force contributions
 158             sm_forceReduction[elementIndex] = (dimIndex == XX) ? fx : (dimIndex == YY) ? fy : fz;
 159             // sync here because two warps write data that the first one consumes below
 160             __syncthreads();
 161             // Reduce to fit into smemPerDim (warp size)
 162 #pragma unroll
 163             for (int redStride = atomDataSize / 2; redStride > minStride; redStride >>= 1)
 164             {
 165                 if (splineIndex < redStride)
 166                 {
 167                     sm_forceReduction[elementIndex] += sm_forceReduction[elementIndex + redStride];
 168                 }
 169             }
 170             __syncthreads();
 171             // Last iteration - packing everything to be nearby, storing convenience pointer
 172             sm_forceTemp[dimIndex] = sm_forceReduction + dimIndex * smemPerDim;
 173             int redStride = minStride;
 174             if (splineIndex < redStride)
 175             {
 176                 const int packedIndex = atomIndexLocal * redStride + splineIndex;
 177                 sm_forceTemp[dimIndex][packedIndex] = sm_forceReduction[elementIndex] + sm_forceReduction[elementIndex + redStride];
 178             }
 179             __syncthreads();
 180         }
 181
 182         assert ((blockSize / warp_size) >= DIM);
 183         //assert (atomsPerBlock <= warp_size);
 184
 185         const int warpIndex = lineIndex / warp_size;
 186         const int dimIndex  = warpIndex;
 187
 188         // First 3 warps can now process 1 dimension each
 189         if (dimIndex < DIM)
 190         {
 191             int sourceIndex = lineIndex % warp_size;
 192 #pragma unroll
 193             for (int redStride = minStride / 2; redStride > 1; redStride >>= 1)
 194             {
 195                 if (!(splineIndex & redStride))
 196                 {
 197                     sm_forceTemp[dimIndex][sourceIndex] += sm_forceTemp[dimIndex][sourceIndex + redStride];
 198                 }
 199             }
 200
 201             __syncwarp();
 202
 203             const float n         = read_grid_size(realGridSizeFP, dimIndex);
 204             const int   atomIndex = sourceIndex / minStride;
 205
 206             if (sourceIndex == minStride * atomIndex)
 207             {
 208                 *((float *)(&sm_forces[atomIndex]) + dimIndex) = (sm_forceTemp[dimIndex][sourceIndex] + sm_forceTemp[dimIndex][sourceIndex + 1]) * n;
 209             }
 210         }
 211     }
 212 }
 213
 214 /*! \brief
 215  * A CUDA kernel which gathers the atom forces from the grid.
 216  * The grid is assumed to be wrapped in dimension Z.
 217  *
 218  * \tparam[in] order                The PME order (must be 4 currently).
 219  * \tparam[in] overwriteForces      True: the forces are written to the output buffer;
 220  *                                  False: the forces are added non-atomically to the output buffer (e.g. to the bonded forces).
 221  * \tparam[in] wrapX                Tells if the grid is wrapped in the X dimension.
 222  * \tparam[in] wrapY                Tells if the grid is wrapped in the Y dimension.
 223  * \tparam[in] readGlobal           Tells if we should read spline values from global memory
 224  * \tparam[in] useOrderThreads      Tells if we should use order threads per atom (order*order used if false)
 225  * \param[in]  kernelParams         All the PME GPU data.
 226  */
 227 template <
 228     const int order,
 229     const bool overwriteForces,
 230     const bool wrapX,
 231     const bool wrapY,
 232     const bool readGlobal,
 233     const bool useOrderThreads
 234     >
 235 __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP)
 236 __global__ void pme_gather_kernel(const PmeGpuCudaKernelParams    kernelParams)
 237 {
 238     /* Global memory pointers */
 239     const float * __restrict__  gm_coefficients     = kernelParams.atoms.d_coefficients;
 240     const float * __restrict__  gm_grid             = kernelParams.grid.d_realGrid;
 241     float * __restrict__        gm_forces           = kernelParams.atoms.d_forces;
 242
 243     /* Global memory pointers for readGlobal */
 244     const float * __restrict__  gm_theta            = kernelParams.atoms.d_theta;
 245     const float * __restrict__  gm_dtheta           = kernelParams.atoms.d_dtheta;
 246     const int * __restrict__    gm_gridlineIndices  = kernelParams.atoms.d_gridlineIndices;
 247
 248     float3 atomX;
 249     float  atomCharge;
 250
 251     /* Some sizes */
 252     const int    atomsPerBlock   = useOrderThreads ? (c_gatherMaxThreadsPerBlock / c_pmeSpreadGatherThreadsPerAtom4ThPerAtom) :
 253         (c_gatherMaxThreadsPerBlock / c_pmeSpreadGatherThreadsPerAtom);
 254     const int    blockIndex      = blockIdx.y * gridDim.x + blockIdx.x;
 255
 256     /* Number of data components and threads for a single atom */
 257     const int    atomDataSize   = useOrderThreads ? c_pmeSpreadGatherThreadsPerAtom4ThPerAtom : c_pmeSpreadGatherThreadsPerAtom;
 258     const int    atomsPerWarp   = useOrderThreads ? c_pmeSpreadGatherAtomsPerWarp4ThPerAtom : c_pmeSpreadGatherAtomsPerWarp;
 259
 260     const int    blockSize      = atomsPerBlock * atomDataSize;
 261     assert(blockSize == blockDim.x * blockDim.y * blockDim.z);
 262
 263     /* These are the atom indices - for the shared and global memory */
 264     const int         atomIndexLocal    = threadIdx.z;
 265     const int         atomIndexOffset   = blockIndex * atomsPerBlock;
 266     const int         atomIndexGlobal   = atomIndexOffset + atomIndexLocal;
 267
 268     /* Early return for fully empty blocks at the end
 269      * (should only happen for billions of input atoms)
 270      */
 271     if (atomIndexOffset >= kernelParams.atoms.nAtoms)
 272     {
 273         return;
 274     }
 275     // 4 warps per block, 8 atoms per warp *3 *4
 276     const int         splineParamsSize             = atomsPerBlock * DIM * order;
 277     const int         gridlineIndicesSize          = atomsPerBlock * DIM;
 278     __shared__ int    sm_gridlineIndices[gridlineIndicesSize];
 279     __shared__ float  sm_theta[splineParamsSize];
 280     __shared__ float  sm_dtheta[splineParamsSize];
 281
 282     /* Spline Z coordinates */
 283     const int ithz = threadIdx.x;
 284
 285     /* These are the spline contribution indices in shared memory */
 286     const int       splineIndex =  threadIdx.y * blockDim.x + threadIdx.x;
 287     const int       lineIndex   = (threadIdx.z * (blockDim.x * blockDim.y)) + splineIndex; /* And to all the block's particles */
 288
 289     const int       threadLocalId    = (threadIdx.z * (blockDim.x * blockDim.y)) + blockDim.x*threadIdx.y   + threadIdx.x;
 290     const int       threadLocalIdMax = blockDim.x * blockDim.y * blockDim.z;
 291
 292     if (readGlobal)
 293     {
 294         /* Read splines */
 295         const int localGridlineIndicesIndex  = threadLocalId;
 296         const int globalGridlineIndicesIndex = blockIndex * gridlineIndicesSize + localGridlineIndicesIndex;
 297         const int globalCheckIndices         = pme_gpu_check_atom_data_index(globalGridlineIndicesIndex, kernelParams.atoms.nAtoms * DIM);
 298         if ((localGridlineIndicesIndex < gridlineIndicesSize) & globalCheckIndices)
 299         {
 300             sm_gridlineIndices[localGridlineIndicesIndex] = gm_gridlineIndices[globalGridlineIndicesIndex];
 301             assert(sm_gridlineIndices[localGridlineIndicesIndex] >= 0);
 302         }
 303         /* The loop needed for order threads per atom to make sure we load all data values, as each thread must load multiple values
 304            with order*order threads per atom, it is only required for each thread to load one data value */
 305
 306         const int iMin = 0;
 307         const int iMax = useOrderThreads ? 3 : 1;
 308
 309         for (int i = iMin; i < iMax; i++)
 310         {
 311             int localSplineParamsIndex  = threadLocalId + i*threadLocalIdMax;   /* i will always be zero for order*order threads per atom */
 312             int globalSplineParamsIndex = blockIndex * splineParamsSize + localSplineParamsIndex;
 313             int globalCheckSplineParams = pme_gpu_check_atom_data_index(globalSplineParamsIndex, kernelParams.atoms.nAtoms * DIM * order);
 314             if ((localSplineParamsIndex < splineParamsSize) && globalCheckSplineParams)
 315             {
 316                 sm_theta[localSplineParamsIndex]  = gm_theta[globalSplineParamsIndex];
 317                 sm_dtheta[localSplineParamsIndex] = gm_dtheta[globalSplineParamsIndex];
 318                 assert(isfinite(sm_theta[localSplineParamsIndex]));
 319                 assert(isfinite(sm_dtheta[localSplineParamsIndex]));
 320             }
 321         }
 322         __syncthreads();
 323     }
 324     else
 325     {
 326         /* Recaclulate  Splines  */
 327         if (c_useAtomDataPrefetch)
 328         {
 329             //charges
 330             __shared__ float sm_coefficients[atomsPerBlock];
 331             // Coordinates
 332             __shared__ float sm_coordinates[DIM * atomsPerBlock];
 333             /* Staging coefficients/charges */
 334             pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(kernelParams, sm_coefficients, kernelParams.atoms.d_coefficients);
 335
 336             /* Staging coordinates */
 337             pme_gpu_stage_atom_data<float, atomsPerBlock, DIM>(kernelParams, sm_coordinates, kernelParams.atoms.d_coordinates);
 338             __syncthreads();
 339             atomX.x    = sm_coordinates[atomIndexLocal*DIM+XX];
 340             atomX.y    = sm_coordinates[atomIndexLocal*DIM+YY];
 341             atomX.z    = sm_coordinates[atomIndexLocal*DIM+ZZ];
 342             atomCharge = sm_coefficients[atomIndexLocal];
 343
 344         }
 345         else
 346         {
 347             atomCharge =  gm_coefficients[atomIndexGlobal];
 348             atomX.x    =  kernelParams.atoms.d_coordinates[ atomIndexGlobal*DIM + XX];
 349             atomX.y    =  kernelParams.atoms.d_coordinates[ atomIndexGlobal*DIM + YY];
 350             atomX.z    =  kernelParams.atoms.d_coordinates[ atomIndexGlobal*DIM + ZZ];
 351         }
 352         calculate_splines<order, atomsPerBlock, atomsPerWarp, true, false>(kernelParams, atomIndexOffset,
 353                                                                            atomX, atomCharge,
 354                                                                            sm_theta, sm_dtheta,
 355                                                                            sm_gridlineIndices);
 356         __syncwarp();
 357     }
 358     float           fx = 0.0f;
 359     float           fy = 0.0f;
 360     float           fz = 0.0f;
 361
 362     const int       globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms);
 363     const int       chargeCheck = pme_gpu_check_atom_charge(gm_coefficients[atomIndexGlobal]);
 364
 365     if (chargeCheck & globalCheck)
 366     {
 367         const int    nx        = kernelParams.grid.realGridSize[XX];
 368         const int    ny        = kernelParams.grid.realGridSize[YY];
 369         const int    nz        = kernelParams.grid.realGridSize[ZZ];
 370         const int    pny       = kernelParams.grid.realGridSizePadded[YY];
 371         const int    pnz       = kernelParams.grid.realGridSizePadded[ZZ];
 372
 373         const int    atomWarpIndex = atomIndexLocal % atomsPerWarp;
 374         const int    warpIndex     = atomIndexLocal / atomsPerWarp;
 375
 376         const int    splineIndexBase = getSplineParamIndexBase<order, atomsPerWarp>(warpIndex, atomWarpIndex);
 377         const int    splineIndexZ    = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, ZZ, ithz);
 378         const float2 tdz             =  make_float2(sm_theta[splineIndexZ], sm_dtheta[splineIndexZ] );
 379
 380         int          iz             = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] + ithz;
 381         const int    ixBase         = sm_gridlineIndices[atomIndexLocal * DIM + XX];
 382
 383         if (iz >= nz)
 384         {
 385             iz -= nz;
 386         }
 387         int       constOffset, iy;
 388
 389         const int ithyMin = useOrderThreads ? 0 : threadIdx.y;
 390         const int ithyMax = useOrderThreads ? order : threadIdx.y + 1;
 391         for (int ithy = ithyMin; ithy < ithyMax; ithy++)
 392         {
 393             const int    splineIndexY    = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, YY, ithy);
 394             const float2 tdy             =  make_float2(sm_theta[splineIndexY], sm_dtheta[splineIndexY] );
 395
 396             iy             = sm_gridlineIndices[atomIndexLocal * DIM + YY] + ithy;
 397             if (wrapY & (iy >= ny))
 398             {
 399                 iy -= ny;
 400             }
 401             constOffset    = iy * pnz + iz;
 402
 403 #pragma unroll
 404             for (int ithx = 0; (ithx < order); ithx++)
 405             {
 406                 int ix = ixBase + ithx;
 407                 if (wrapX & (ix >= nx))
 408                 {
 409                     ix -= nx;
 410                 }
 411                 const int     gridIndexGlobal  = ix * pny * pnz + constOffset;
 412                 assert(gridIndexGlobal >= 0);
 413                 const float   gridValue    = gm_grid[gridIndexGlobal];
 414                 assert(isfinite(gridValue));
 415                 const int     splineIndexX = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, XX, ithx);
 416                 const float2  tdx          = make_float2( sm_theta[splineIndexX], sm_dtheta[splineIndexX]);
 417                 const float   fxy1         = tdz.x * gridValue;
 418                 const float   fz1          = tdz.y * gridValue;
 419                 fx += tdx.y * tdy.x * fxy1;
 420                 fy += tdx.x * tdy.y * fxy1;
 421                 fz += tdx.x * tdy.x * fz1;
 422             }
 423         }
 424     }
 425
 426     // Reduction of partial force contributions
 427     __shared__ float3 sm_forces[atomsPerBlock];
 428     reduce_atom_forces<order, atomDataSize, blockSize>(sm_forces,
 429                                                        atomIndexLocal, splineIndex, lineIndex,
 430                                                        kernelParams.grid.realGridSizeFP,
 431                                                        fx, fy, fz);
 432     __syncthreads();
 433
 434     /* Calculating the final forces with no component branching, atomsPerBlock threads */
 435     const int forceIndexLocal  = threadLocalId;
 436     const int forceIndexGlobal = atomIndexOffset + forceIndexLocal;
 437     const int calcIndexCheck   = pme_gpu_check_atom_data_index(forceIndexGlobal, kernelParams.atoms.nAtoms);
 438     if ((forceIndexLocal < atomsPerBlock) & calcIndexCheck)
 439     {
 440         const float3  atomForces               = sm_forces[forceIndexLocal];
 441         const float   negCoefficient           = -gm_coefficients[forceIndexGlobal];
 442         float3        result;
 443         result.x                   = negCoefficient * kernelParams.current.recipBox[XX][XX] * atomForces.x;
 444         result.y                   = negCoefficient * (kernelParams.current.recipBox[XX][YY] * atomForces.x + kernelParams.current.recipBox[YY][YY] * atomForces.y);
 445         result.z                   = negCoefficient * (kernelParams.current.recipBox[XX][ZZ] * atomForces.x + kernelParams.current.recipBox[YY][ZZ] * atomForces.y + kernelParams.current.recipBox[ZZ][ZZ] * atomForces.z);
 446         sm_forces[forceIndexLocal] = result;
 447     }
 448
 449     __syncwarp();
 450     assert(atomsPerBlock <= warp_size);
 451
 452     /* Writing or adding the final forces component-wise, single warp */
 453     const int blockForcesSize = atomsPerBlock * DIM;
 454     const int numIter         = (blockForcesSize + warp_size - 1) / warp_size;
 455     const int iterThreads     = blockForcesSize / numIter;
 456     if (threadLocalId < iterThreads)
 457     {
 458 #pragma unroll
 459         for (int i = 0; i < numIter; i++)
 460         {
 461             int         outputIndexLocal  = i * iterThreads + threadLocalId;
 462             int         outputIndexGlobal = blockIndex * blockForcesSize + outputIndexLocal;
 463             const int   globalOutputCheck = pme_gpu_check_atom_data_index(outputIndexGlobal, kernelParams.atoms.nAtoms * DIM);
 464             if (globalOutputCheck)
 465             {
 466                 const float outputForceComponent = ((float *)sm_forces)[outputIndexLocal];
 467                 if (overwriteForces)
 468                 {
 469                     gm_forces[outputIndexGlobal] = outputForceComponent;
 470                 }
 471                 else
 472                 {
 473                     gm_forces[outputIndexGlobal] += outputForceComponent;
 474                 }
 475             }
 476         }
 477     }
 478 }
 479
 480 //! Kernel instantiations
 481 template __global__ void pme_gather_kernel<4, true, true, true, true, true>(const PmeGpuCudaKernelParams);
 482 template __global__ void pme_gather_kernel<4, true, true, true, true, false>(const PmeGpuCudaKernelParams);
 483 template __global__ void pme_gather_kernel<4, false, true, true, true, true>(const PmeGpuCudaKernelParams);
 484 template __global__ void pme_gather_kernel<4, false, true, true, true, false>(const PmeGpuCudaKernelParams);
 485 template __global__ void pme_gather_kernel<4, true, true, true, false, true>(const PmeGpuCudaKernelParams);
 486 template __global__ void pme_gather_kernel<4, true, true, true, false, false>(const PmeGpuCudaKernelParams);
 487 template __global__ void pme_gather_kernel<4, false, true, true, false, true>(const PmeGpuCudaKernelParams);
 488 template __global__ void pme_gather_kernel<4, false, true, true, false, false>(const PmeGpuCudaKernelParams);