From 525e24eee5b77c9c66a96be5749f1e3dbf613514 Mon Sep 17 00:00:00 2001 From: Aleksei Iupinov Date: Fri, 25 Aug 2017 19:16:13 +0200 Subject: [PATCH] CUDA 9/Volta support for PME Change-Id: Icd5cdf16f9118347179dfcbdd162f0cb39cbdd69 --- src/gromacs/ewald/pme-gather.cu | 16 +++++++++------- src/gromacs/ewald/pme-solve.cu | 33 +++++++++++++++++---------------- src/gromacs/ewald/pme-spread.cu | 1 + 3 files changed, 27 insertions(+), 23 deletions(-) diff --git a/src/gromacs/ewald/pme-gather.cu b/src/gromacs/ewald/pme-gather.cu index 17d0ae587e..92f72e3394 100644 --- a/src/gromacs/ewald/pme-gather.cu +++ b/src/gromacs/ewald/pme-gather.cu @@ -104,23 +104,25 @@ __device__ __forceinline__ void reduce_atom_forces(float3 * __restrict__ sm_forc #if (GMX_PTX_ARCH >= 300) if (!(order & (order - 1))) // Only for orders of power of 2 { + const unsigned int activeMask = c_fullWarpMask; + // A tricky shuffle reduction inspired by reduce_force_j_warp_shfl // TODO: find out if this is the best in terms of transactions count static_assert(order == 4, "Only order of 4 is implemented"); static_assert(atomDataSize <= warp_size, "TODO: rework for atomDataSize > warp_size (order 8 or larger)"); const int width = atomDataSize; - fx += __shfl_down(fx, 1, width); - fy += __shfl_up (fy, 1, width); - fz += __shfl_down(fz, 1, width); + fx += gmx_shfl_down_sync(activeMask, fx, 1, width); + fy += gmx_shfl_up_sync (activeMask, fy, 1, width); + fz += gmx_shfl_down_sync(activeMask, fz, 1, width); if (splineIndex & 1) { fx = fy; } - fx += __shfl_down(fx, 2, width); - fz += __shfl_up (fz, 2, width); + fx += gmx_shfl_down_sync(activeMask, fx, 2, width); + fz += gmx_shfl_up_sync (activeMask, fz, 2, width); if (splineIndex & 2) { @@ -134,7 +136,7 @@ __device__ __forceinline__ void reduce_atom_forces(float3 * __restrict__ sm_forc // We have to just further reduce those groups of 4 for (int delta = 4; delta < atomDataSize; delta <<= 1) { - fx += __shfl_down(fx, delta, width); + fx += gmx_shfl_down_sync(activeMask, fx, delta, width); } const int dimIndex = splineIndex; @@ -377,7 +379,7 @@ __global__ void pme_gather_kernel(const pme_gpu_cuda_kernel_params_t kernelPa sm_forces[forceIndexLocal] = result; } - // No sync here + gmx_syncwarp(); assert(atomsPerBlock <= warp_size); /* Writing or adding the final forces component-wise, single warp */ diff --git a/src/gromacs/ewald/pme-solve.cu b/src/gromacs/ewald/pme-solve.cu index 649b6851da..3eebe92351 100644 --- a/src/gromacs/ewald/pme-solve.cu +++ b/src/gromacs/ewald/pme-solve.cu @@ -267,16 +267,17 @@ __global__ void pme_solve_kernel(const struct pme_gpu_cuda_kernel_params_t kerne */ /* We can only reduce warp-wise */ - const int width = warp_size; + const int width = warp_size; + const unsigned int activeMask = c_fullWarpMask; /* Making pair sums */ - virxx += __shfl_down(virxx, 1, width); - viryy += __shfl_up (viryy, 1, width); - virzz += __shfl_down(virzz, 1, width); - virxy += __shfl_up (virxy, 1, width); - virxz += __shfl_down(virxz, 1, width); - viryz += __shfl_up (viryz, 1, width); - energy += __shfl_down(energy, 1, width); + virxx += gmx_shfl_down_sync(activeMask, virxx, 1, width); + viryy += gmx_shfl_up_sync (activeMask, viryy, 1, width); + virzz += gmx_shfl_down_sync(activeMask, virzz, 1, width); + virxy += gmx_shfl_up_sync (activeMask, virxy, 1, width); + virxz += gmx_shfl_down_sync(activeMask, virxz, 1, width); + viryz += gmx_shfl_up_sync (activeMask, viryz, 1, width); + energy += gmx_shfl_down_sync(activeMask, energy, 1, width); if (threadLocalId & 1) { virxx = viryy; // virxx now holds virxx and viryy pair sums @@ -285,10 +286,10 @@ __global__ void pme_solve_kernel(const struct pme_gpu_cuda_kernel_params_t kerne } /* Making quad sums */ - virxx += __shfl_down(virxx, 2, width); - virzz += __shfl_up (virzz, 2, width); - virxz += __shfl_down(virxz, 2, width); - energy += __shfl_up(energy, 2, width); + virxx += gmx_shfl_down_sync(activeMask, virxx, 2, width); + virzz += gmx_shfl_up_sync (activeMask, virzz, 2, width); + virxz += gmx_shfl_down_sync(activeMask, virxz, 2, width); + energy += gmx_shfl_up_sync (activeMask, energy, 2, width); if (threadLocalId & 2) { virxx = virzz; // virxx now holds quad sums of virxx, virxy, virzz and virxy @@ -296,8 +297,8 @@ __global__ void pme_solve_kernel(const struct pme_gpu_cuda_kernel_params_t kerne } /* Making octet sums */ - virxx += __shfl_down(virxx, 4, width); - virxz += __shfl_up(virxz, 4, width); + virxx += gmx_shfl_down_sync(activeMask, virxx, 4, width); + virxz += gmx_shfl_up_sync (activeMask, virxz, 4, width); if (threadLocalId & 4) { virxx = virxz; // virxx now holds all 7 components' octet sums + unused paddings @@ -307,7 +308,7 @@ __global__ void pme_solve_kernel(const struct pme_gpu_cuda_kernel_params_t kerne #pragma unroll for (int delta = 8; delta < width; delta <<= 1) { - virxx += __shfl_down(virxx, delta, width); + virxx += gmx_shfl_down_sync(activeMask, virxx, delta, width); } /* Now first 7 threads of each warp have the full output contributions in virxx */ @@ -347,7 +348,7 @@ __global__ void pme_solve_kernel(const struct pme_gpu_cuda_kernel_params_t kerne #pragma unroll for (int delta = stride; delta < warp_size; delta <<= 1) { - output += __shfl_down(output, delta, warp_size); + output += gmx_shfl_down_sync(activeMask, output, delta, warp_size); } /* Final output */ if (validComponentIndex) diff --git a/src/gromacs/ewald/pme-spread.cu b/src/gromacs/ewald/pme-spread.cu index ee9287459d..58e5890f7f 100644 --- a/src/gromacs/ewald/pme-spread.cu +++ b/src/gromacs/ewald/pme-spread.cu @@ -458,6 +458,7 @@ __global__ void pme_spline_and_spread_kernel(const pme_gpu_cuda_kernel_params_t __syncthreads(); calculate_splines(kernelParams, atomIndexOffset, (const float3 *)sm_coordinates, sm_coefficients, sm_theta, sm_gridlineIndices); + gmx_syncwarp(); } else { -- 2.11.4.GIT