From 9afedd58b9f578e2390a373e12b8e211ef7539c0 Mon Sep 17 00:00:00 2001
From: Artem Zhmurov <zhmurov@gmail.com>
Date: Mon, 29 Apr 2019 17:42:48 +0200
Subject: [PATCH] Fix for the CUDA version of LINCS

This fixes a couple of Jenkins Post-submit failures:
1. Clang warnings in the CUDA portion of code.
2. New CUDA LINCS failure on the host with CUDA, but without
   capable GPU.

Also done is minor renaming and typo correction.

Change-Id: I4f1d89d4bf5a6f3f0083186cb67787cdf6536bb9
---
 src/gromacs/mdlib/lincs_cuda_impl.cu | 28 ++++++++++++++--------------
 src/gromacs/mdlib/lincs_cuda_impl.h  |  2 +-
 src/gromacs/mdlib/tests/constr.cpp   | 24 ++++++++++++++++++++----
 3 files changed, 35 insertions(+), 19 deletions(-)
diff --git a/src/gromacs/mdlib/lincs_cuda_impl.cu b/src/gromacs/mdlib/lincs_cuda_impl.cu
index f4327e0df7..06d99bea5a 100644
--- a/src/gromacs/mdlib/lincs_cuda_impl.cu
+++ b/src/gromacs/mdlib/lincs_cuda_impl.cu
@@ -78,7 +78,7 @@ namespace gmx
 
 //! Number of CUDA threads in a block
 constexpr static int c_threadsPerBlock = 256;
-//! Maximum number of threads in a block (for __launch_bonds__)
+//! Maximum number of threads in a block (for __launch_bounds__)
 constexpr static int c_maxThreadsPerBlock = c_threadsPerBlock;
 
 /*! \brief Main kernel for LINCS constraints.
@@ -119,7 +119,7 @@ __global__ void lincs_kernel(LincsCudaKernelParameters   kernelParams,
     const int2*   __restrict__  gm_constraints              = kernelParams.d_constraints;
     const float*  __restrict__  gm_constraintsTargetLengths = kernelParams.d_constraintsTargetLengths;
     const int*    __restrict__  gm_coupledConstraintsCounts = kernelParams.d_coupledConstraintsCounts;
-    const int*    __restrict__  gm_coupledConstraintsIdxes  = kernelParams.d_coupledConstraintsIdxes;
+    const int*    __restrict__  gm_coupledConstraintsIdxes  = kernelParams.d_coupledConstraintsIndices;
     const float*  __restrict__  gm_massFactors              = kernelParams.d_massFactors;
     float*  __restrict__        gm_matrixA                  = kernelParams.d_matrixA;
     const float*  __restrict__  gm_inverseMasses            = kernelParams.d_inverseMasses;
@@ -370,10 +370,10 @@ __global__ void lincs_kernel(LincsCudaKernelParameters   kernelParams,
         // half of it sums two values. This procedure is repeated until only one thread is left.
         // Only works if the threads per blocks is a power of two (hence static_assert
         // in the beginning of the kernel).
-        for (int divideBy = 2; divideBy <= blockDim.x; divideBy *= 2)
+        for (int divideBy = 2; divideBy <= static_cast<int>(blockDim.x); divideBy *= 2)
         {
             int dividedAt = blockDim.x/divideBy;
-            if (threadIdx.x < dividedAt)
+            if (static_cast<int>(threadIdx.x) < dividedAt)
             {
                 for (int d = 0; d < 6; d++)
                 {
@@ -532,8 +532,8 @@ LincsCuda::Impl::Impl(int numAtoms,
 
     static_assert(sizeof(real) == sizeof(float),
                   "Real numbers should be in single precision in GPU code.");
-    static_assert(c_threadsPerBlock > 0 && !(c_threadsPerBlock & (c_threadsPerBlock - 1) == 0),
-                  "Nmber of threads per block should be a power of two in order for reduction to work.");
+    static_assert(c_threadsPerBlock > 0 && ((c_threadsPerBlock & (c_threadsPerBlock - 1)) == 0),
+                  "Number of threads per block should be a power of two in order for reduction to work.");
 
     // This is temporary. LINCS should not manage coordinates.
     allocateDeviceBuffer(&kernelParams_.d_x,  numAtoms, nullptr);
@@ -621,7 +621,7 @@ void LincsCuda::Impl::set(const t_idef    &idef,
     // Number of constraints, coupled with the current one (CPU)
     std::vector<int>    coupledConstraintsCountsHost;
     // List of coupled with the current one (CPU)
-    std::vector<int>    coupledConstraintsIdxesHost;
+    std::vector<int>    coupledConstraintsIndicesHost;
     // Mass factors (CPU)
     std::vector<float>  massFactorsHost;
 
@@ -708,7 +708,7 @@ void LincsCuda::Impl::set(const t_idef    &idef,
     // The adjacency list of constraints (i.e. the list of coupled constraints for each constraint).
     // We map a single thread to a single constraint, hence each thread 'c' will be using one element from
     // coupledConstraintsCountsHost array, which is the number of constraints coupled to the constraint 'c'.
-    // The coupled constraints indexes are placed into the coupledConstraintsIdxesHost array. Latter is organized
+    // The coupled constraints indexes are placed into the coupledConstraintsIndicesHost array. Latter is organized
     // as a one-dimensional array to ensure good memory alignment. It is addressed as [c + i*numConstraintsThreads],
     // where 'i' goes from zero to the number of constraints coupled to 'c'. 'numConstraintsThreads' is the width of
     // the array --- a number, greater then total number of constraints, taking into account the splits in the
@@ -730,7 +730,7 @@ void LincsCuda::Impl::set(const t_idef    &idef,
     }
 
     coupledConstraintsCountsHost.resize(kernelParams_.numConstraintsThreads, 0);
-    coupledConstraintsIdxesHost.resize(maxCoupledConstraints*kernelParams_.numConstraintsThreads, -1);
+    coupledConstraintsIndicesHost.resize(maxCoupledConstraints*kernelParams_.numConstraintsThreads, -1);
     massFactorsHost.resize(maxCoupledConstraints*kernelParams_.numConstraintsThreads, -1);
 
     for (int c1 = 0; c1 < numConstraints; c1++)
@@ -755,7 +755,7 @@ void LincsCuda::Impl::set(const t_idef    &idef,
             {
                 int index = kernelParams_.numConstraintsThreads*coupledConstraintsCountsHost.at(splitMap.at(c1)) + splitMap.at(c1);
 
-                coupledConstraintsIdxesHost.at(index) = splitMap.at(c2);
+                coupledConstraintsIndicesHost.at(index) = splitMap.at(c2);
 
                 int   center = c1a1;
 
@@ -780,7 +780,7 @@ void LincsCuda::Impl::set(const t_idef    &idef,
             {
                 int index = kernelParams_.numConstraintsThreads*coupledConstraintsCountsHost.at(splitMap.at(c1)) + splitMap.at(c1);
 
-                coupledConstraintsIdxesHost.at(index) = splitMap.at(c2);
+                coupledConstraintsIndicesHost.at(index) = splitMap.at(c2);
 
                 int   center = c1a2;
 
@@ -807,7 +807,7 @@ void LincsCuda::Impl::set(const t_idef    &idef,
             freeDeviceBuffer(&kernelParams_.d_constraintsTargetLengths);
 
             freeDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts);
-            freeDeviceBuffer(&kernelParams_.d_coupledConstraintsIdxes);
+            freeDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices);
             freeDeviceBuffer(&kernelParams_.d_massFactors);
             freeDeviceBuffer(&kernelParams_.d_matrixA);
 
@@ -820,7 +820,7 @@ void LincsCuda::Impl::set(const t_idef    &idef,
         allocateDeviceBuffer(&kernelParams_.d_constraintsTargetLengths, kernelParams_.numConstraintsThreads, nullptr);
 
         allocateDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts, kernelParams_.numConstraintsThreads, nullptr);
-        allocateDeviceBuffer(&kernelParams_.d_coupledConstraintsIdxes, maxCoupledConstraints*kernelParams_.numConstraintsThreads, nullptr);
+        allocateDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices, maxCoupledConstraints*kernelParams_.numConstraintsThreads, nullptr);
         allocateDeviceBuffer(&kernelParams_.d_massFactors, maxCoupledConstraints*kernelParams_.numConstraintsThreads, nullptr);
         allocateDeviceBuffer(&kernelParams_.d_matrixA, maxCoupledConstraints*kernelParams_.numConstraintsThreads, nullptr);
 
@@ -836,7 +836,7 @@ void LincsCuda::Impl::set(const t_idef    &idef,
     copyToDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts, coupledConstraintsCountsHost.data(),
                        0, kernelParams_.numConstraintsThreads,
                        stream_, GpuApiCallBehavior::Sync, nullptr);
-    copyToDeviceBuffer(&kernelParams_.d_coupledConstraintsIdxes, coupledConstraintsIdxesHost.data(),
+    copyToDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices, coupledConstraintsIndicesHost.data(),
                        0, maxCoupledConstraints*kernelParams_.numConstraintsThreads,
                        stream_, GpuApiCallBehavior::Sync, nullptr);
     copyToDeviceBuffer(&kernelParams_.d_massFactors, massFactorsHost.data(),
diff --git a/src/gromacs/mdlib/lincs_cuda_impl.h b/src/gromacs/mdlib/lincs_cuda_impl.h
index 8876bb9d69..880ae7a045 100644
--- a/src/gromacs/mdlib/lincs_cuda_impl.h
+++ b/src/gromacs/mdlib/lincs_cuda_impl.h
@@ -94,7 +94,7 @@ struct LincsCudaKernelParameters
     //! Number of constraints, coupled with the current one (GPU)
     int                *d_coupledConstraintsCounts;
     //! List of coupled with the current one (GPU)
-    int                *d_coupledConstraintsIdxes;
+    int                *d_coupledConstraintsIndices;
     //! Elements of the coupling matrix.
     float              *d_matrixA;
     //! Mass factors (GPU)
diff --git a/src/gromacs/mdlib/tests/constr.cpp b/src/gromacs/mdlib/tests/constr.cpp
index a5f7a3b215..983958cb50 100644
--- a/src/gromacs/mdlib/tests/constr.cpp
+++ b/src/gromacs/mdlib/tests/constr.cpp
@@ -64,6 +64,7 @@
 #include "gromacs/fileio/gmxfio.h"
 #include "gromacs/gmxlib/nrnb.h"
 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
+#include "gromacs/gpu_utils/gpu_utils.h"
 #include "gromacs/math/paddedvector.h"
 #include "gromacs/math/vec.h"
 #include "gromacs/math/vectypes.h"
@@ -222,7 +223,7 @@ struct ConstraintsTestData
             }
 
             // Saving constraints to check if they are satisfied after algorithm was applied
-            constraints_   = constraints;   // Constraints indexes (in type-i-j format)
+            constraints_   = constraints;   // Constraints indices (in type-i-j format)
             constraintsR0_ = constraintsR0; // Equilibrium distances for each type of constraint
 
             invdt_  = 1.0/timestep;         // Inverse timestep
@@ -406,6 +407,14 @@ struct ConstraintsTestData
  */
 typedef std::tuple<std::string, std::string> ConstraintsTestParameters;
 
+/*! \brief Names of all availible algorithms
+ *
+ * Constructed from the algorithms_ field of the test class.
+ * Used as the list of values of second parameter in ConstraintsTestParameters.
+ */
+static std::vector<std::string> algorithmsNames;
+
+
 /*! \brief Test fixture for constraints.
  *
  * The fixture uses following test systems:
@@ -465,9 +474,16 @@ class ConstraintsTest : public ::testing::TestWithParam<ConstraintsTestParameter
             algorithms_["LINCS"] = applyLincs;
             // LINCS using CUDA (if CUDA is available)
 #if GMX_GPU == GMX_GPU_CUDA
-            algorithms_["LINCS_CUDA"] = applyLincsCuda;
+            std::string errorMessage;
+            if (canDetectGpus(&errorMessage))
+            {
+                algorithms_["LINCS_CUDA"] = applyLincsCuda;
+            }
 #endif
-
+            for (const auto &algorithm : algorithms_)
+            {
+                algorithmsNames.push_back(algorithm.first);
+            }
         }
 
         /*! \brief
@@ -1146,7 +1162,7 @@ TEST_P(ConstraintsTest, TriangleOfConstraints){
 #if GMX_GPU == GMX_GPU_CUDA
 INSTANTIATE_TEST_CASE_P(WithParameters, ConstraintsTest,
                             ::testing::Combine(::testing::Values("PBCNone", "PBCXYZ"),
-                                                   ::testing::Values("SHAKE", "LINCS", "LINCS_CUDA")));
+                                                   ::testing::ValuesIn(algorithmsNames)));
 #endif
 
 #if GMX_GPU != GMX_GPU_CUDA
-- 
2.11.4.GIT