From 525e24eee5b77c9c66a96be5749f1e3dbf613514 Mon Sep 17 00:00:00 2001
From: Aleksei Iupinov <a.yupinov@gmail.com>
Date: Fri, 25 Aug 2017 19:16:13 +0200
Subject: [PATCH] CUDA 9/Volta support for PME

Change-Id: Icd5cdf16f9118347179dfcbdd162f0cb39cbdd69
---
 src/gromacs/ewald/pme-gather.cu | 16 +++++++++-------
 src/gromacs/ewald/pme-solve.cu  | 33 +++++++++++++++++----------------
 src/gromacs/ewald/pme-spread.cu |  1 +
 3 files changed, 27 insertions(+), 23 deletions(-)

diff --git a/src/gromacs/ewald/pme-gather.cu b/src/gromacs/ewald/pme-gather.cu
index 17d0ae587e..92f72e3394 100644
--- a/src/gromacs/ewald/pme-gather.cu
+++ b/src/gromacs/ewald/pme-gather.cu
@@ -104,23 +104,25 @@ __device__ __forceinline__ void reduce_atom_forces(float3 * __restrict__ sm_forc
 #if (GMX_PTX_ARCH >= 300)
     if (!(order & (order - 1))) // Only for orders of power of 2
     {
+        const unsigned int activeMask = c_fullWarpMask;
+
         // A tricky shuffle reduction inspired by reduce_force_j_warp_shfl
         // TODO: find out if this is the best in terms of transactions count
         static_assert(order == 4, "Only order of 4 is implemented");
         static_assert(atomDataSize <= warp_size, "TODO: rework for atomDataSize > warp_size (order 8 or larger)");
         const int width = atomDataSize;
 
-        fx += __shfl_down(fx, 1, width);
-        fy += __shfl_up  (fy, 1, width);
-        fz += __shfl_down(fz, 1, width);
+        fx += gmx_shfl_down_sync(activeMask, fx, 1, width);
+        fy += gmx_shfl_up_sync  (activeMask, fy, 1, width);
+        fz += gmx_shfl_down_sync(activeMask, fz, 1, width);
 
         if (splineIndex & 1)
         {
             fx = fy;
         }
 
-        fx += __shfl_down(fx, 2, width);
-        fz += __shfl_up  (fz, 2, width);
+        fx += gmx_shfl_down_sync(activeMask, fx, 2, width);
+        fz += gmx_shfl_up_sync  (activeMask, fz, 2, width);
 
         if (splineIndex & 2)
         {
@@ -134,7 +136,7 @@ __device__ __forceinline__ void reduce_atom_forces(float3 * __restrict__ sm_forc
         // We have to just further reduce those groups of 4
         for (int delta = 4; delta < atomDataSize; delta <<= 1)
         {
-            fx += __shfl_down(fx, delta, width);
+            fx += gmx_shfl_down_sync(activeMask, fx, delta, width);
         }
 
         const int dimIndex = splineIndex;
@@ -377,7 +379,7 @@ __global__ void pme_gather_kernel(const pme_gpu_cuda_kernel_params_t    kernelPa
         sm_forces[forceIndexLocal] = result;
     }
 
-    // No sync here
+    gmx_syncwarp();
     assert(atomsPerBlock <= warp_size);
 
     /* Writing or adding the final forces component-wise, single warp */
diff --git a/src/gromacs/ewald/pme-solve.cu b/src/gromacs/ewald/pme-solve.cu
index 649b6851da..3eebe92351 100644
--- a/src/gromacs/ewald/pme-solve.cu
+++ b/src/gromacs/ewald/pme-solve.cu
@@ -267,16 +267,17 @@ __global__ void pme_solve_kernel(const struct pme_gpu_cuda_kernel_params_t kerne
          */
 
         /* We can only reduce warp-wise */
-        const int width = warp_size;
+        const int          width      = warp_size;
+        const unsigned int activeMask = c_fullWarpMask;
 
         /* Making pair sums */
-        virxx  += __shfl_down(virxx, 1, width);
-        viryy  += __shfl_up  (viryy, 1, width);
-        virzz  += __shfl_down(virzz, 1, width);
-        virxy  += __shfl_up  (virxy, 1, width);
-        virxz  += __shfl_down(virxz, 1, width);
-        viryz  += __shfl_up  (viryz, 1, width);
-        energy += __shfl_down(energy, 1, width);
+        virxx  += gmx_shfl_down_sync(activeMask, virxx, 1, width);
+        viryy  += gmx_shfl_up_sync  (activeMask, viryy, 1, width);
+        virzz  += gmx_shfl_down_sync(activeMask, virzz, 1, width);
+        virxy  += gmx_shfl_up_sync  (activeMask, virxy, 1, width);
+        virxz  += gmx_shfl_down_sync(activeMask, virxz, 1, width);
+        viryz  += gmx_shfl_up_sync  (activeMask, viryz, 1, width);
+        energy += gmx_shfl_down_sync(activeMask, energy, 1, width);
         if (threadLocalId & 1)
         {
             virxx = viryy; // virxx now holds virxx and viryy pair sums
@@ -285,10 +286,10 @@ __global__ void pme_solve_kernel(const struct pme_gpu_cuda_kernel_params_t kerne
         }
 
         /* Making quad sums */
-        virxx  += __shfl_down(virxx, 2, width);
-        virzz  += __shfl_up  (virzz, 2, width);
-        virxz  += __shfl_down(virxz, 2, width);
-        energy += __shfl_up(energy, 2, width);
+        virxx  += gmx_shfl_down_sync(activeMask, virxx, 2, width);
+        virzz  += gmx_shfl_up_sync  (activeMask, virzz, 2, width);
+        virxz  += gmx_shfl_down_sync(activeMask, virxz, 2, width);
+        energy += gmx_shfl_up_sync  (activeMask, energy, 2, width);
         if (threadLocalId & 2)
         {
             virxx = virzz;  // virxx now holds quad sums of virxx, virxy, virzz and virxy
@@ -296,8 +297,8 @@ __global__ void pme_solve_kernel(const struct pme_gpu_cuda_kernel_params_t kerne
         }
 
         /* Making octet sums */
-        virxx += __shfl_down(virxx, 4, width);
-        virxz += __shfl_up(virxz, 4, width);
+        virxx += gmx_shfl_down_sync(activeMask, virxx, 4, width);
+        virxz += gmx_shfl_up_sync  (activeMask, virxz, 4, width);
         if (threadLocalId & 4)
         {
             virxx = virxz; // virxx now holds all 7 components' octet sums + unused paddings
@@ -307,7 +308,7 @@ __global__ void pme_solve_kernel(const struct pme_gpu_cuda_kernel_params_t kerne
 #pragma unroll
         for (int delta = 8; delta < width; delta <<= 1)
         {
-            virxx += __shfl_down(virxx, delta, width);
+            virxx += gmx_shfl_down_sync(activeMask, virxx, delta, width);
         }
         /* Now first 7 threads of each warp have the full output contributions in virxx */
 
@@ -347,7 +348,7 @@ __global__ void pme_solve_kernel(const struct pme_gpu_cuda_kernel_params_t kerne
 #pragma unroll
             for (int delta = stride; delta < warp_size; delta <<= 1)
             {
-                output += __shfl_down(output, delta, warp_size);
+                output += gmx_shfl_down_sync(activeMask, output, delta, warp_size);
             }
             /* Final output */
             if (validComponentIndex)
diff --git a/src/gromacs/ewald/pme-spread.cu b/src/gromacs/ewald/pme-spread.cu
index ee9287459d..58e5890f7f 100644
--- a/src/gromacs/ewald/pme-spread.cu
+++ b/src/gromacs/ewald/pme-spread.cu
@@ -458,6 +458,7 @@ __global__ void pme_spline_and_spread_kernel(const pme_gpu_cuda_kernel_params_t
         __syncthreads();
         calculate_splines<order, atomsPerBlock>(kernelParams, atomIndexOffset, (const float3 *)sm_coordinates,
                                                 sm_coefficients, sm_theta, sm_gridlineIndices);
+        gmx_syncwarp();
     }
     else
     {
-- 
2.11.4.GIT