From 97f9f399c1a82e1bf63d96642855b434ce020dcd Mon Sep 17 00:00:00 2001
From: Jiri Kraus
Date: Wed, 19 Jul 2017 08:51:45 -0700
Subject: [PATCH] CUDA 9/Volta support for the nonbonded module

The Volta architecture introduces independent thread scheduling. This new
architectural feature breaks implicit warp-synchronous programming and
requires new intrinsics for warp-wide operations like shfl. This change
implements the necessary synchronization points for the Volta architecture
and replaces the deprecated warp intrinsics with their _sync versions.
Note that the current implementation is conservative and aims only for
Volta compatibility; it is likely not optimal (for details see
nbnxn_cuda_kernel.cuh).

Change-Id: I38dd572992cf14ce5a7158d0bbc3086b54f18676
---
 cmake/gmxManageNvccConfig.cmake                    | 14 +++-
 src/gromacs/gpu_utils/cuda_arch_utils.cuh          | 78 +++++++++++++++++++++-
 src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh | 38 ++++++++---
 .../mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh   | 50 +++++++-------
 4 files changed, 145 insertions(+), 35 deletions(-)

diff --git a/cmake/gmxManageNvccConfig.cmake b/cmake/gmxManageNvccConfig.cmake
index 706343c1f8..a32b19f17c 100644
--- a/cmake/gmxManageNvccConfig.cmake
+++ b/cmake/gmxManageNvccConfig.cmake
@@ -146,14 +146,17 @@ else()
     # => compile sm_20, sm_30, sm_35, sm_37, sm_50, & sm_52 SASS, and compute_52 PTX
     # - with CUDA >=8.0 CC 6.0-6.2 is supported (but we know nothing about CC 6.2, so we won't generate code or it)
     # => compile sm_20, sm_30, sm_35, sm_37, sm_50, sm_52, sm_60, sm_61 SASS, and compute_60 and compute_61 PTX
-    #
+    # - with CUDA >=9.0 CC 7.0 is supported and CC 2.0 is no longer supported
+    #   => compile sm_30, sm_35, sm_37, sm_50, sm_52, sm_60, sm_61, sm_70 SASS, and compute_70 PTX
     #
     # Note that CUDA 6.5.19 second patch release supports cc 5.2 too, but
     # CUDA_VERSION does not contain patch version and having PTX 5.0 JIT-ed is
     # equally fast as compiling with sm_5.2 anyway.

     # First add flags that trigger SASS (binary) code generation for physical arch
-    list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_20,code=sm_20")
+    if(CUDA_VERSION VERSION_LESS "9.00") # < 9.0
+        list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_20,code=sm_20")
+    endif()
     list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_30,code=sm_30")
     list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_35,code=sm_35")
@@ -168,6 +171,9 @@ else()
         list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_60,code=sm_60")
         list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_61,code=sm_61")
     endif()
+    if(NOT CUDA_VERSION VERSION_LESS "9.0") # >= 9.0
+        list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_70,code=sm_70")
+    endif()

     # Next add flags that trigger PTX code generation for the newest supported virtual arch
     # that's useful to JIT to future architectures
@@ -177,9 +183,11 @@ else()
         list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_50,code=compute_50")
     elseif(CUDA_VERSION VERSION_LESS "8.0")
         list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_52,code=compute_52")
-    else() # version >= 8.0
+    elseif(CUDA_VERSION VERSION_LESS "9.0")
         list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_60,code=compute_60")
         list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_61,code=compute_61")
+    else() # version >= 9.0
+        list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_70,code=compute_70")
     endif()
 endif()

diff --git a/src/gromacs/gpu_utils/cuda_arch_utils.cuh b/src/gromacs/gpu_utils/cuda_arch_utils.cuh
index 8be983365c..b6a2304726 100644
--- a/src/gromacs/gpu_utils/cuda_arch_utils.cuh
+++ b/src/gromacs/gpu_utils/cuda_arch_utils.cuh
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2014,2015,2016, by the GROMACS development team, led by
+ * Copyright (c) 2012,2014,2015,2016,2017, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -35,6 +35,8 @@
 #ifndef CUDA_ARCH_UTILS_CUH_
 #define CUDA_ARCH_UTILS_CUH_

+#include "config.h"
+
 /*! \file
  * \brief CUDA arch dependent definitions.
  *
@@ -57,5 +59,79 @@
  */
 static const int warp_size = 32;
 static const int warp_size_log2 = 5;
+/*! \brief Bitmask corresponding to all threads active in a warp.
+ *  NOTE that here too we assume 32-wide warps.
+ */
+static const unsigned int c_fullWarpMask = 0xffffffff;
+
+/* Below are backward-compatibility wrappers for CUDA 9 warp-wide intrinsics. */
+
+/*! \brief Compatibility wrapper around the CUDA __syncwarp() intrinsic. */
+static __forceinline__ __device__
+void gmx_syncwarp(const unsigned int activeMask = c_fullWarpMask)
+{
+#if GMX_CUDA_VERSION < 9000
+    /* no sync needed on pre-Volta. */
+    GMX_UNUSED_VALUE(activeMask);
+#else
+    __syncwarp(activeMask);
+#endif
+}
+
+/*! \brief Compatibility wrapper around the CUDA __ballot()/__ballot_sync() intrinsic. */
+static __forceinline__ __device__
+unsigned int gmx_ballot_sync(const unsigned int activeMask,
+                             const int pred)
+{
+#if GMX_CUDA_VERSION < 9000
+    GMX_UNUSED_VALUE(activeMask);
+    return __ballot(pred);
+#else
+    return __ballot_sync(activeMask, pred);
+#endif
+}
+
+/*! \brief Compatibility wrapper around the CUDA __any()/__any_sync() intrinsic. */
+static __forceinline__ __device__
+int gmx_any_sync(const unsigned int activeMask,
+                 const int pred)
+{
+#if GMX_CUDA_VERSION < 9000
+    GMX_UNUSED_VALUE(activeMask);
+    return __any(pred);
+#else
+    return __any_sync(activeMask, pred);
+#endif
+}
+
+/*! \brief Compatibility wrapper around the CUDA __shfl_up()/__shfl_up_sync() intrinsic. */
+template <typename T>
+static __forceinline__ __device__
+T gmx_shfl_up_sync(const unsigned int activeMask,
+                   const T var,
+                   unsigned int offset)
+{
+#if GMX_CUDA_VERSION < 9000
+    GMX_UNUSED_VALUE(activeMask);
+    return __shfl_up(var, offset);
+#else
+    return __shfl_up_sync(activeMask, var, offset);
+#endif
+}
+
+/*! \brief Compatibility wrapper around the CUDA __shfl_down()/__shfl_down_sync() intrinsic. */
+template <typename T>
+static __forceinline__ __device__
+T gmx_shfl_down_sync(const unsigned int activeMask,
+                     const T var,
+                     unsigned int offset)
+{
+#if GMX_CUDA_VERSION < 9000
+    GMX_UNUSED_VALUE(activeMask);
+    return __shfl_down(var, offset);
+#else
+    return __shfl_down_sync(activeMask, var, offset);
+#endif
+}

 #endif /* CUDA_ARCH_UTILS_CUH_ */
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
index 192b20d35e..a9411e42ec 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2013,2014,2015,2016, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
@@ -108,6 +108,15 @@
  *
  * Note that the current kernel implementation only supports NTHREAD_Z > 1 with
  * shuffle-based reduction, hence CC >= 3.0.
+ *
+ *
+ * NOTEs / TODO on Volta / CUDA 9 support extensions:
+ * - the current way of computing the active masks using ballot_sync() should be
+ *   reconsidered: we can compute all masks with bitwise ops instead of ballot,
+ *   and secondly, all conditionals are warp-uniform, so the sync is not needed;
+ * - reconsider the use of __syncwarp(): its only role is currently to prevent
+ *   a WAR hazard due to the cj preload; we should try to replace it with direct
+ *   loads (which may be faster given the improved L1 on Volta).
  */

 /* Kernel launch bounds for different compute capabilities. The value of NTHREAD_Z
@@ -342,14 +351,18 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
 #endif /* CALC_ENERGIES */

+    int          j4LoopStart      = cij4_start + tidxz;
+    unsigned int j4LoopThreadMask = gmx_ballot_sync(c_fullWarpMask, j4LoopStart < cij4_end);
     /* loop over the j clusters = seen by any of the atoms in the current super-cluster */
-    for (j4 = cij4_start + tidxz; j4 < cij4_end; j4 += NTHREAD_Z)
+    for (j4 = j4LoopStart; j4 < cij4_end; j4 += NTHREAD_Z)
     {
         wexcl_idx = pl_cj4[j4].imei[widx].excl_ind;
         imask     = pl_cj4[j4].imei[widx].imask;
         wexcl     = excl[wexcl_idx].pair[(tidx) & (warp_size - 1)];

+        unsigned int imaskSkipConditionThreadMask = j4LoopThreadMask;
 #ifndef PRUNE_NBL
+        imaskSkipConditionThreadMask = gmx_ballot_sync(j4LoopThreadMask, imask);
         if (imask)
 #endif
         {
@@ -358,6 +371,7 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
             {
                 cjs[tidxi + tidxj * c_nbnxnGpuJgroupSize/c_splitClSize] = pl_cj4[j4].cj[tidxi];
             }
+            gmx_syncwarp(imaskSkipConditionThreadMask);

             /* Unrolling this loop
                - with pruning leads to register spilling;
@@ -365,7 +379,9 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
                Tested with up to nvcc 7.5 */
             for (jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
             {
-                if (imask & (superClInteractionMask << (jm * c_numClPerSupercl)))
+                const unsigned int jmSkipCondition           = imask & (superClInteractionMask << (jm * c_numClPerSupercl));
+                const unsigned int jmSkipConditionThreadMask = gmx_ballot_sync(imaskSkipConditionThreadMask, jmSkipCondition);
+                if (jmSkipCondition)
                 {
                     mask_ji = (1U << (jm * c_numClPerSupercl));

@@ -389,7 +405,9 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
 #endif
                     for (i = 0; i < c_numClPerSupercl; i++)
                     {
-                        if (imask & mask_ji)
+                        const unsigned int iInnerSkipCondition           = imask & mask_ji;
+                        const unsigned int iInnerSkipConditionThreadMask = gmx_ballot_sync(jmSkipConditionThreadMask, iInnerSkipCondition);
+                        if (iInnerSkipCondition)
                         {
                             ci = sci * c_numClPerSupercl + i; /* i cluster index */

@@ -405,7 +423,7 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
                             /* If _none_ of the atoms pairs are in cutoff range,
                                the bit corresponding to the current
                                cluster-pair in imask gets set to 0. */
-                            if (!__any(r2 < rlist_sq))
+                            if (!gmx_any_sync(iInnerSkipConditionThreadMask, r2 < rlist_sq))
                             {
                                 imask &= ~mask_ji;
                             }
@@ -568,7 +586,7 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
                     }

                     /* reduce j forces */
-                    reduce_force_j_warp_shfl(fcj_buf, f, tidxi, aj);
+                    reduce_force_j_warp_shfl(fcj_buf, f, tidxi, aj, jmSkipConditionThreadMask);
                 }
             }
 #ifdef PRUNE_NBL
@@ -577,6 +595,10 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
             pl_cj4[j4].imei[widx].imask = imask;
 #endif
         }
+        // avoid shared memory WAR hazards between loop iterations
+        gmx_syncwarp(j4LoopThreadMask);
+        // update thread mask for next loop iteration
+        j4LoopThreadMask = gmx_ballot_sync(j4LoopThreadMask, (j4 + NTHREAD_Z) < cij4_end);
     }

     /* skip central shifts when summing shift forces */
@@ -593,7 +615,7 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
         ai = (sci * c_numClPerSupercl + i) * c_clSize + tidxi;
         reduce_force_i_warp_shfl(fci_buf[i], f,
                                  &fshift_buf, bCalcFshift,
-                                 tidxj, ai);
+                                 tidxj, ai, c_fullWarpMask);
     }

     /* add up local shift forces into global mem, tidxj indexes x,y,z */
@@ -604,7 +626,7 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
 #ifdef CALC_ENERGIES
     /* reduce the energies over warps and store into global memory */
-    reduce_energy_warp_shfl(E_lj, E_el, e_lj, e_el, tidx);
+    reduce_energy_warp_shfl(E_lj, E_el, e_lj, e_el, tidx, c_fullWarpMask);
 #endif
 }

 #endif /* FUNCTION_DECLARATION_ONLY */
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
index 07979c1248..82077bb96a 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2013,2014,2016, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2016,2017, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
@@ -50,6 +50,7 @@
 * with f (e.g. 0.5f), to stop the compiler producing intermediate
 * code that is in double precision.
 */

+#include "config.h"
 #include "gromacs/gpu_utils/cuda_arch_utils.cuh"
 #include "gromacs/gpu_utils/vectype_ops.cuh"

@@ -67,16 +68,16 @@

 /*! \brief Log of the i and j cluster size.
  * change this together with c_clSize !*/
-static const int c_clSizeLog2 = 3;
+static const int          c_clSizeLog2  = 3;
 /*! \brief Square of cluster size. */
-static const int c_clSizeSq = c_clSize*c_clSize;
+static const int          c_clSizeSq    = c_clSize*c_clSize;
 /*! \brief j-cluster size after split (4 in the current implementation). */
-static const int c_splitClSize = c_clSize/c_nbnxnGpuClusterpairSplit;
+static const int          c_splitClSize = c_clSize/c_nbnxnGpuClusterpairSplit;
 /*! \brief Stride in the force accumualation buffer */
-static const int c_fbufStride = c_clSizeSq;
+static const int          c_fbufStride  = c_clSizeSq;

-static const float c_oneSixth = 0.16666667f;
-static const float c_oneTwelveth = 0.08333333f;
+static const float        c_oneSixth    = 0.16666667f;
+static const float        c_oneTwelveth = 0.08333333f;

 /* With multiple compilation units this ensures that texture refs are available
    in the the kernels' compilation units. */
@@ -445,26 +446,27 @@ void reduce_force_j_generic(float *f_buf, float3 *fout,
 #if GMX_PTX_ARCH >= 300
 static __forceinline__ __device__
 void reduce_force_j_warp_shfl(float3 f, float3 *fout,
-                              int tidxi, int aidx)
+                              int tidxi, int aidx,
+                              const unsigned int activemask)
 {
-    f.x += __shfl_down(f.x, 1);
-    f.y += __shfl_up  (f.y, 1);
-    f.z += __shfl_down(f.z, 1);
+    f.x += gmx_shfl_down_sync(activemask, f.x, 1);
+    f.y += gmx_shfl_up_sync  (activemask, f.y, 1);
+    f.z += gmx_shfl_down_sync(activemask, f.z, 1);

     if (tidxi & 1)
     {
         f.x = f.y;
     }

-    f.x += __shfl_down(f.x, 2);
-    f.z += __shfl_up  (f.z, 2);
+    f.x += gmx_shfl_down_sync(activemask, f.x, 2);
+    f.z += gmx_shfl_up_sync  (activemask, f.z, 2);

     if (tidxi & 2)
     {
         f.x = f.z;
     }

-    f.x += __shfl_down(f.x, 4);
+    f.x += gmx_shfl_down_sync(activemask, f.x, 4);

     if (tidxi < 3)
     {
@@ -572,19 +574,20 @@ void reduce_force_i(float *f_buf, float3 *f,
 static __forceinline__ __device__
 void reduce_force_i_warp_shfl(float3 fin, float3 *fout,
                               float *fshift_buf, bool bCalcFshift,
-                              int tidxj, int aidx)
+                              int tidxj, int aidx,
+                              const unsigned int activemask)
 {
-    fin.x += __shfl_down(fin.x, c_clSize);
-    fin.y += __shfl_up  (fin.y, c_clSize);
-    fin.z += __shfl_down(fin.z, c_clSize);
+    fin.x += gmx_shfl_down_sync(activemask, fin.x, c_clSize);
+    fin.y += gmx_shfl_up_sync  (activemask, fin.y, c_clSize);
+    fin.z += gmx_shfl_down_sync(activemask, fin.z, c_clSize);

     if (tidxj & 1)
     {
         fin.x = fin.y;
     }

-    fin.x += __shfl_down(fin.x, 2*c_clSize);
-    fin.z += __shfl_up  (fin.z, 2*c_clSize);
+    fin.x += gmx_shfl_down_sync(activemask, fin.x, 2*c_clSize);
+    fin.z += gmx_shfl_up_sync  (activemask, fin.z, 2*c_clSize);

     if (tidxj & 2)
     {
@@ -649,7 +652,8 @@ void reduce_energy_pow2(volatile float *buf,
 static __forceinline__ __device__
 void reduce_energy_warp_shfl(float E_lj, float E_el,
                              float *e_lj, float *e_el,
-                             int tidx)
+                             int tidx,
+                             const unsigned int activemask)
 {
     int i, sh;

@@ -657,8 +661,8 @@ void reduce_energy_warp_shfl(float E_lj, float E_el,
 #pragma unroll 5
     for (i = 0; i < 5; i++)
     {
-        E_lj += __shfl_down(E_lj, sh);
-        E_el += __shfl_down(E_el, sh);
+        E_lj += gmx_shfl_down_sync(activemask, E_lj, sh);
+        E_el += gmx_shfl_down_sync(activemask, E_el, sh);
         sh += sh;
     }

-- 
2.11.4.GIT
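
For readers unfamiliar with the CUDA 9 *_sync intrinsics, the sketch below is a small,
self-contained illustration of the compatibility-wrapper pattern this patch introduces in
cuda_arch_utils.cuh. It is not part of the patch: the shflDownSync() helper, the
warpSumKernel() kernel and the host driver are invented for illustration, and the toolkit
version is detected with the standard CUDART_VERSION macro instead of the GMX_CUDA_VERSION
macro that the patch takes from config.h. On CUDA >= 9 an explicit active mask is passed to
__shfl_down_sync(); on older toolkits it falls back to the implicitly warp-synchronous
__shfl_down(), mirroring gmx_shfl_down_sync().

// warp_sum_sketch.cu - hypothetical example, not from the GROMACS tree
#include <cstdio>
#include <cuda_runtime.h>

// Mask with all 32 lanes of a warp active (same assumption as c_fullWarpMask).
static const unsigned int fullWarpMask = 0xffffffffu;

// Pre-CUDA 9 toolkits only provide the implicitly synchronous __shfl_down();
// CUDA >= 9 deprecates it in favour of __shfl_down_sync(), which takes the
// mask of threads expected to participate.
template <typename T>
static __forceinline__ __device__
T shflDownSync(unsigned int activeMask, T var, unsigned int offset)
{
#if CUDART_VERSION < 9000
    (void)activeMask;                 // no mask argument before CUDA 9
    return __shfl_down(var, offset);
#else
    return __shfl_down_sync(activeMask, var, offset);
#endif
}

__global__ void warpSumKernel(const float *in, float *out)
{
    float v = in[threadIdx.x];
    // Tree reduction across the 32 lanes of the warp.
    for (unsigned int offset = 16; offset > 0; offset >>= 1)
    {
        v += shflDownSync(fullWarpMask, v, offset);
    }
    if (threadIdx.x == 0)
    {
        *out = v;                     // lane 0 holds the warp-wide sum
    }
}

int main()
{
    float  hIn[32], hOut = 0.0f;
    for (int i = 0; i < 32; i++)
    {
        hIn[i] = 1.0f;
    }
    float *dIn, *dOut;
    // Error checking omitted for brevity.
    cudaMalloc(&dIn, sizeof(hIn));
    cudaMalloc(&dOut, sizeof(hOut));
    cudaMemcpy(dIn, hIn, sizeof(hIn), cudaMemcpyHostToDevice);
    warpSumKernel<<<1, 32>>>(dIn, dOut);   // one full warp
    cudaMemcpy(&hOut, dOut, sizeof(hOut), cudaMemcpyDeviceToHost);
    printf("warp sum = %f\n", hOut);       // expected: 32.0
    cudaFree(dIn);
    cudaFree(dOut);
    return 0;
}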