From 0a1aae785a6f746adbaae672736462f2ff1cf948 Mon Sep 17 00:00:00 2001 From: Artem Zhmurov Date: Wed, 20 Feb 2019 15:30:29 +0100 Subject: [PATCH] CUDA version of LINCS constraints. Implementation of the LINCS constraints for NVIDIA GPUs. Currently works in isolation from the other parts of the code: coordinates and velocities are copied to and from the GPU on every integration timestep. Part of the GPU-only loop. Loosely based on change 9162 by Alan Gray. To enable, set the environment variable GMX_LINCS_GPU. Limitations: 1. Works only if the constraints can be split into short uncoupled groups (currently < 256, designed for H-bond constraints). 2. Does not change the matrix inversion order for constraint triangles. 3. Does not support free energy computations. 4. Assumes no communication between domains (i.e. assumes that there are no constraints connecting atoms from two different domains). 5. The number of threads per block should be a power of 2 for the reduction of the virial to work. TODOs: 1. Move more data from the global memory to local. 2. Change .at() to [] 3. Add sorting by the number of coupled constraints to decrease warp divergence. 4. numAtoms should be changeable (for the multi-GPU case). Refs #2816, #2885 Change-Id: I3c975cf898053b7467bcd30459e60ce2c8852be6 --- src/gromacs/mdlib/CMakeLists.txt | 5 + src/gromacs/mdlib/constr.cpp | 58 +- src/gromacs/mdlib/lincs_cuda.h | 175 ++++ src/gromacs/mdlib/lincs_cuda_impl.cpp | 128 +++++ src/gromacs/mdlib/lincs_cuda_impl.cu | 996 ++++++++++++++++++++++++++++++++++ src/gromacs/mdlib/lincs_cuda_impl.h | 257 +++++++++ src/gromacs/mdlib/tests/constr.cpp | 408 ++++++++------ 7 files changed, 1857 insertions(+), 170 deletions(-) create mode 100644 src/gromacs/mdlib/lincs_cuda.h create mode 100644 src/gromacs/mdlib/lincs_cuda_impl.cpp create mode 100644 src/gromacs/mdlib/lincs_cuda_impl.cu create mode 100644 src/gromacs/mdlib/lincs_cuda_impl.h diff --git a/src/gromacs/mdlib/CMakeLists.txt b/src/gromacs/mdlib/CMakeLists.txt index baa5803657..16637f9990 100644 --- a/src/gromacs/mdlib/CMakeLists.txt +++ b/src/gromacs/mdlib/CMakeLists.txt @@ -38,5 +38,10 @@ set(MDLIB_SOURCES ${MDLIB_SOURCES} PARENT_SCOPE) if (BUILD_TESTING) add_subdirectory(tests) endif() +if(GMX_USE_CUDA) + gmx_add_libgromacs_sources( + lincs_cuda_impl.cu + ) +endif() gmx_install_headers(simulationsignal.h) diff --git a/src/gromacs/mdlib/constr.cpp b/src/gromacs/mdlib/constr.cpp index fe61663bff..6371c2b80d 100644 --- a/src/gromacs/mdlib/constr.cpp +++ b/src/gromacs/mdlib/constr.cpp @@ -62,6 +62,7 @@ #include "gromacs/math/vec.h" #include "gromacs/mdlib/gmx_omp_nthreads.h" #include "gromacs/mdlib/lincs.h" +#include "gromacs/mdlib/lincs_cuda.h" #include "gromacs/mdlib/settle.h" #include "gromacs/mdlib/shake.h" #include "gromacs/mdtypes/commrec.h" @@ -87,6 +88,9 @@ namespace gmx { +//! Whether the GPU version of LINCS should be used. +static const bool c_useGpuLincs = (getenv("GMX_LINCS_GPU") != nullptr); + /* \brief Impl class for Constraints * * \todo Members like md, idef are valid only for the lifetime of a @@ -178,11 +182,13 @@ class Constraints::Impl /*!\brief Input options. * * \todo Replace with IMdpOptions */ - const t_inputrec &ir; //! Flop counting support. - t_nrnb *nrnb = nullptr; //! Tracks wallcycle usage. - gmx_wallcycle *wcycle; + const t_inputrec &ir; //! Flop counting support. + t_nrnb *nrnb = nullptr; //! Tracks wallcycle usage. + gmx_wallcycle *wcycle; + //! Valid LINCS CUDA object when that implementation is being used, nullptr otherwise.
+ std::unique_ptr<LincsCuda> lincsCuda; }; Constraints::~Constraints() = default; @@ -482,6 +488,25 @@ Constraints::Impl::apply(bool bLog, bDump = TRUE; } } + else + { + if (lincsCuda != nullptr && c_useGpuLincs) + { + GMX_RELEASE_ASSERT(ir.efep == efepNO || dvdlambda == nullptr, + "Free energy perturbation is not supported by the GPU version of LINCS.\n"); + lincsCuda->setPbc(pbc_null); + lincsCuda->copyCoordinatesToGpu(x, xprime); + lincsCuda->copyVelocitiesToGpu(v); + lincsCuda->apply(v != nullptr, + invdt, + vir != nullptr, vir_r_m_dr); + lincsCuda->copyCoordinatesFromGpu(xprime); + if (v != nullptr) + { + lincsCuda->copyVelocitiesFromGpu(v); + } + } + } if (shaked != nullptr) { @@ -917,7 +942,14 @@ Constraints::Impl::setConstraints(const gmx_localtop_t &top, */ if (ir.eConstrAlg == econtLINCS) { - set_lincs(top.idef, md, EI_DYNAMICS(ir.eI), cr, lincsd); + if (c_useGpuLincs) + { + lincsCuda->set(top.idef, md); + } + else + { + set_lincs(top.idef, md, EI_DYNAMICS(ir.eI), cr, lincsd); + } } if (ir.eConstrAlg == econtSHAKE) { @@ -1067,10 +1099,20 @@ Constraints::Impl::Impl(const gmx_mtop_t &mtop_p, if (ir.eConstrAlg == econtLINCS) { - lincsd = init_lincs(log, mtop, - nflexcon, at2con_mt, - DOMAINDECOMP(cr) && cr->dd->splitConstraints, - ir.nLincsIter, ir.nProjOrder); + if (c_useGpuLincs) + { + fprintf(log, "Initializing LINCS on a GPU for %d atoms\n", mtop.natoms); + GMX_RELEASE_ASSERT(nflexcon == 0, "Flexible constraints are not supported by the GPU-based implementation of LINCS.\n"); + GMX_RELEASE_ASSERT(!DOMAINDECOMP(cr), "CUDA version of LINCS is not supported with domain decomposition"); + lincsCuda = std::make_unique<LincsCuda>(mtop.natoms, ir.nLincsIter, ir.nProjOrder); + } + else + { + lincsd = init_lincs(log, mtop, + nflexcon, at2con_mt, + DOMAINDECOMP(cr) && cr->dd->splitConstraints, + ir.nLincsIter, ir.nProjOrder); + } } if (ir.eConstrAlg == econtSHAKE) diff --git a/src/gromacs/mdlib/lincs_cuda.h b/src/gromacs/mdlib/lincs_cuda.h new file mode 100644 index 0000000000..b3c8f0e165 --- /dev/null +++ b/src/gromacs/mdlib/lincs_cuda.h @@ -0,0 +1,175 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2019, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS.
Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \libinternal \file + * + * \brief Declaration of high-level functions of CUDA implementation of LINCS. + * + * \todo This should only list interfaces needed for libgromacs clients. + * + * \author Artem Zhmurov + * + * \ingroup module_mdlib + * \inlibraryapi + */ +#ifndef GMX_MDLIB_LINCS_CUDA_H +#define GMX_MDLIB_LINCS_CUDA_H + + +#include "gromacs/mdlib/constr.h" +#include "gromacs/mdtypes/mdatom.h" +#include "gromacs/topology/idef.h" +#include "gromacs/utility/classhelpers.h" + +namespace gmx +{ + +class LincsCuda +{ + + public: + /*! \brief Constructor. + * + * \param[in] numAtoms Number of atoms + * \param[in] numIterations Number of iteration for the correction of the projection. + * \param[in] expansionOrder Order of the matrix inversion algorithm. + */ + LincsCuda(int numAtoms, + int numIterations, + int expansionOrder); + ~LincsCuda(); + + /*! \brief Apply LINCS. + * + * Applies LINCS to coordinates and velocities, Method uses this class data structures + * which should be updated when needed using update method. + * + * \param[in] updateVelocities If the velocities should be constrained. + * \param[in] invdt Reciprocal timestep (to scale Lagrange + * multipliers when velocities are updated) + * \param[in] computeVirial If virial should be updated. + * \param[in,out] virialScaled Scaled virial tensor to be updated. + */ + void apply(bool updateVelocities, + real invdt, + bool computeVirial, + tensor virialScaled); + + /*! \brief + * Update data-structures (e.g. after NB search step). + * + * Updates the constraints data. Should be called if the particles were sorted, + * redistributed between domains, etc. + * + * Information about constraints should be taken from: + * idef.il[F_CONSTR].iatoms --- type (T) of constraint and two atom indexes (i1, i2) + * idef.iparams[T].constr.dA --- target length for constraint of type T + * From t_mdatom, the code should take: + * md.invmass --- array of inverse square root of masses for each atom in the system. + * + * \param[in] idef Local topology data to get information on constraints from. + * \param[in] md Atoms data to get atom masses from. + */ + void set(const t_idef &idef, + const t_mdatoms &md); + + /*! \brief + * Update PBC data. + * + * \param[in] pbc The PBC data in t_pbc format. + */ + void setPbc(const t_pbc *pbc); + + /*! \brief + * Copy coordinates from provided CPU location to GPU. + * + * Copies the coordinates before the integration step (x) and coordinates + * after the integration step (xp) from the provided CPU location to GPU. + * The data are assumed to be in float3/fvec format (single precision). + * + * \param[in] h_x CPU pointer where coordinates should be copied from. + * \param[in] h_xp CPU pointer where coordinates should be copied from. + */ + void copyCoordinatesToGpu(const rvec *h_x, const rvec *h_xp); + + /*! \brief + * Copy velocities from provided CPU location to GPU. + * + * Nothing is done if the argument provided is a nullptr. + * The data are assumed to be in float3/fvec format (single precision). + * + * \param[in] h_v CPU pointer where velocities should be copied from. + */ + void copyVelocitiesToGpu(const rvec *h_v); + + /*! \brief + * Copy coordinates from GPU to provided CPU location. 
+ * + * Copies the constrained coordinates to the provided location. The coordinates + * are assumed to be in float3/fvec format (single precision). + * + * \param[out] h_xp CPU pointer where coordinates should be copied to. + */ + void copyCoordinatesFromGpu(rvec *h_xp); + + /*! \brief + * Copy velocities from GPU to provided CPU location. + * + * The velocities are assumed to be in float3/fvec format (single precision). + * + * \param[in] h_v Pointer to velocities data. + */ + void copyVelocitiesFromGpu(rvec *h_v); + + /*! \brief + * Set the internal GPU-memory x, xprime and v pointers. + * + * Data is not copied. The data are assumed to be in float3/fvec format + * (float3 is used internally, but the data layout should be identical). + * + * \param[in] d_x Pointer to the coordinates before integrator update (on GPU) + * \param[in] d_xp Pointer to the coordinates after integrator update, before update (on GPU) + * \param[in] d_v Pointer to the velocities before integrator update (on GPU) + */ + void setXVPointers(rvec *d_x, rvec *d_xp, rvec *d_v); + + private: + class Impl; + gmx::PrivateImplPointer impl_; + +}; + +} // namespace gmx + +#endif diff --git a/src/gromacs/mdlib/lincs_cuda_impl.cpp b/src/gromacs/mdlib/lincs_cuda_impl.cpp new file mode 100644 index 0000000000..11959554c5 --- /dev/null +++ b/src/gromacs/mdlib/lincs_cuda_impl.cpp @@ -0,0 +1,128 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2019, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * + * \brief May be used to implement LINCS CUDA interfaces for non-GPU builds. + * + * Currently, reports and exits if any of the interfaces are called. + * May replace CPU LINCS module in future. Currently used to satisfy + * compiler on systems, where CUDA is not available. 
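 *
 * A minimal sketch of the intended runtime dispatch, mirroring what constr.cpp in this
 * change does with the GMX_LINCS_GPU environment variable (illustrative only; the names
 * below are taken from that file, not from this stub):
 * \code
 *   static const bool c_useGpuLincs = (getenv("GMX_LINCS_GPU") != nullptr);
 *   if (c_useGpuLincs)
 *   {
 *       lincsCuda = std::make_unique<LincsCuda>(mtop.natoms, ir.nLincsIter, ir.nProjOrder);
 *   }
 *   else
 *   {
 *       lincsd = init_lincs(log, mtop, nflexcon, at2con_mt,
 *                           DOMAINDECOMP(cr) && cr->dd->splitConstraints,
 *                           ir.nLincsIter, ir.nProjOrder);
 *   }
 * \endcode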
+ * + * \author Artem Zhmurov + * + * \ingroup module_mdlib + */ +#include "gmxpre.h" + +#include "config.h" + +#include "gromacs/mdlib/lincs_cuda.h" + +#if GMX_GPU != GMX_GPU_CUDA + +namespace gmx +{ + +/*!\brief Impl class stub.*/ +class LincsCuda::Impl +{ +}; + +/*!\brief Constructor stub.*/ +LincsCuda::LincsCuda(gmx_unused int numAtoms, + gmx_unused int numIterations, + gmx_unused int expansionOrder) + : impl_(nullptr) +{ + GMX_ASSERT(false, "A CPU stub for LINCS was called instead of the correct implementation."); +} + +LincsCuda::~LincsCuda() = default; + +/*!\brief Apply LINCS stub.*/ +void LincsCuda::apply(gmx_unused const bool updateVelocities, + gmx_unused const real invdt, + gmx_unused const bool computeVirial, + gmx_unused tensor virialScaled) +{ + GMX_ASSERT(false, "A CPU stub for LINCS was called instead of the correct implementation."); +} + +/*!\brief Set data structures stub.*/ +void LincsCuda::set(gmx_unused const t_idef &idef, + gmx_unused const t_mdatoms &md) +{ + GMX_ASSERT(false, "A CPU stub for LINCS was called instead of the correct implementation."); +} + +/*!\brief Set PBC stub.*/ +void LincsCuda::setPbc(gmx_unused const t_pbc *pbc) +{ + GMX_ASSERT(false, "A CPU stub for LINCS was called instead of the correct implementation."); +} + +/*! \brief Copy coordinates from provided CPU location to GPU stub.*/ +void LincsCuda::copyCoordinatesToGpu(gmx_unused const rvec *h_x, gmx_unused const rvec *h_xp) +{ + GMX_ASSERT(false, "A CPU stub for LINCS was called instead of the correct implementation."); +} + +/*! \brief Copy velocities from provided CPU location to GPU stub.*/ +void LincsCuda::copyVelocitiesToGpu(gmx_unused const rvec *h_v) +{ + GMX_ASSERT(false, "A CPU stub for LINCS was called instead of the correct implementation."); +} + +/*! \brief Copy coordinates from GPU to provided CPU location stub.*/ +void LincsCuda::copyCoordinatesFromGpu(gmx_unused rvec *h_xp) +{ + GMX_ASSERT(false, "A CPU stub for LINCS was called instead of the correct implementation."); +} + +/*! \brief Copy velocities from GPU to provided CPU location stub.*/ +void LincsCuda::copyVelocitiesFromGpu(gmx_unused rvec *h_v) +{ + GMX_ASSERT(false, "A CPU stub for LINCS was called instead of the correct implementation."); +} + +/*! \brief Set the internal GPU-memory d_x, d_xprime and d_v pointers stub.*/ +void LincsCuda::setXVPointers(gmx_unused rvec *d_x, gmx_unused rvec *d_xp, gmx_unused rvec *d_v) +{ + GMX_ASSERT(false, "A CPU stub for LINCS was called instead of the correct implementation."); +} + +} // namespace gmx + +#endif /* GMX_GPU != GMX_GPU_CUDA */ diff --git a/src/gromacs/mdlib/lincs_cuda_impl.cu b/src/gromacs/mdlib/lincs_cuda_impl.cu new file mode 100644 index 0000000000..f4327e0df7 --- /dev/null +++ b/src/gromacs/mdlib/lincs_cuda_impl.cu @@ -0,0 +1,996 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2019, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version.
+ * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * + * \brief Implements LINCS using CUDA + * + * This file contains the implementation of the LINCS constraints algorithm + * using CUDA, including class initialization, data-structure management + * and the GPU kernel. + * + * \note Management of coordinates, velocities, CUDA stream and periodic boundary exists here as a temporary + * scaffolding to allow this class to be used as a stand-alone unit. The scaffolding is intended to be + * removed once constraints are integrated with the update module. + * \todo Reconsider naming, i.e. the "cuda" suffix should be changed to "gpu". + * + * \author Artem Zhmurov + * \author Alan Gray + * + * \ingroup module_mdlib + */ +#include "gmxpre.h" + +#include "lincs_cuda_impl.h" + +#include <assert.h> +#include <stdio.h> + +#include <cmath> + +#include <algorithm> + +#include "gromacs/gpu_utils/cuda_arch_utils.cuh" +#include "gromacs/gpu_utils/cudautils.cuh" +#include "gromacs/gpu_utils/devicebuffer.cuh" +#include "gromacs/gpu_utils/gputraits.cuh" +#include "gromacs/gpu_utils/vectype_ops.cuh" +#include "gromacs/math/vec.h" +#include "gromacs/mdlib/constr.h" +#include "gromacs/mdlib/lincs_cuda.h" +#include "gromacs/pbcutil/pbc.h" +#include "gromacs/pbcutil/pbc_aiuc_cuda.cuh" +#include "gromacs/topology/ifunc.h" + +namespace gmx +{ + +//! Number of CUDA threads in a block +constexpr static int c_threadsPerBlock = 256; +//! Maximum number of threads in a block (for __launch_bounds__) +constexpr static int c_maxThreadsPerBlock = c_threadsPerBlock; + +/*! \brief Main kernel for LINCS constraints. + * + * See Hess et al., J. Comput. Chem. 18: 1463-1472 (1997) for the description of the algorithm. + * + * In the CUDA version, one thread is responsible for all computations for one constraint. The blocks are + * filled in a way that no constraint is coupled to a constraint from the next block. This is achieved + * by moving active threads to the next block if the corresponding group of coupled constraints is too big + * to fit into the current thread block. This may leave some 'dummy' threads at the end of the thread block, i.e. + * threads that are not required to do actual work. Since constraints from different blocks are not coupled, + * there is no need to synchronize across the device. However, extensive communication within a thread block + * is still needed. + * + * \todo Reduce synchronization overhead. Some ideas are: + * 1. Consider going to warp-level synchronization for the coupled constraints.
2. Move more data to local/shared memory and try to get rid of atomic operations (at least on + * the device level). + * 3. Use analytical solution for matrix A inversion. + * 4. Introduce mapping of thread id to both single constraint and single atom, thus designating + * Nth threads to deal with Nat <= Nth coupled atoms and Nc <= Nth coupled constraints. + * See Redmine issue #2885 for details (https://redmine.gromacs.org/issues/2885) + * + * \param[in,out] kernelParams All parameters and pointers for the kernel condensed into a single struct. + * \param[in] invdt Inverse timestep (needed to update velocities). + */ +template <bool updateVelocities, bool computeVirial> +__launch_bounds__(c_maxThreadsPerBlock) +__global__ void lincs_kernel(LincsCudaKernelParameters kernelParams, + const float invdt) +{ + const PbcAiuc pbcAiuc = kernelParams.pbcAiuc; + const int numConstraintsThreads = kernelParams.numConstraintsThreads; + const int numIterations = kernelParams.numIterations; + const int expansionOrder = kernelParams.expansionOrder; + const float3* __restrict__ gm_x = kernelParams.d_x; + float3* gm_xp = kernelParams.d_xp; + const int2* __restrict__ gm_constraints = kernelParams.d_constraints; + const float* __restrict__ gm_constraintsTargetLengths = kernelParams.d_constraintsTargetLengths; + const int* __restrict__ gm_coupledConstraintsCounts = kernelParams.d_coupledConstraintsCounts; + const int* __restrict__ gm_coupledConstraintsIdxes = kernelParams.d_coupledConstraintsIdxes; + const float* __restrict__ gm_massFactors = kernelParams.d_massFactors; + float* __restrict__ gm_matrixA = kernelParams.d_matrixA; + const float* __restrict__ gm_inverseMasses = kernelParams.d_inverseMasses; + float3* gm_v = kernelParams.d_v; + float* __restrict__ gm_virialScaled = kernelParams.d_virialScaled; + + int threadIndex = blockIdx.x*blockDim.x+threadIdx.x; + + // numConstraintsThreads should be an integer multiple of blockSize (numConstraintsThreads = numBlocks*blockSize). + // This is to ensure proper synchronizations and reduction. All arrays are padded to the required size. + assert(threadIndex < numConstraintsThreads); + + // Vectors connecting constrained atoms before the algorithm was applied. + // Needed to construct the constraint matrix A + extern __shared__ float3 sm_r[]; + + int2 pair = gm_constraints[threadIndex]; + int i = pair.x; + int j = pair.y; + + // Mass-scaled Lagrange multiplier + float lagrangeScaled = 0.0f; + + float targetLength; + float inverseMassi; + float inverseMassj; + float sqrtReducedMass; + + float3 xi; + float3 xj; + float3 rc; + + // i == -1 indicates dummy constraint at the end of the thread block.
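    // For example, with c_threadsPerBlock = 256, a group of coupled constraints that
    // would otherwise straddle a block boundary is moved wholly into the next block by
    // set(), and the vacated slots are padded with (-1, -1) index pairs:
    //
    //   block 0: | c0 c1 ... c252 | -1 -1 -1 |   <- three dummy threads pad the block
    //   block 1: | c253 c254 ...  |              <- the coupled group starts a new block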
+ bool isDummyThread = (i == -1); + + // Everything computed for these dummies will be equal to zero + if (isDummyThread) + { + targetLength = 0.0f; + inverseMassi = 0.0f; + inverseMassj = 0.0f; + sqrtReducedMass = 0.0f; + + xi = make_float3(0.0f, 0.0f, 0.0f); + xj = make_float3(0.0f, 0.0f, 0.0f); + rc = make_float3(0.0f, 0.0f, 0.0f); + } + else + { + // Collecting data + targetLength = gm_constraintsTargetLengths[threadIndex]; + inverseMassi = gm_inverseMasses[i]; + inverseMassj = gm_inverseMasses[j]; + sqrtReducedMass = rsqrt(inverseMassi + inverseMassj); + + xi = gm_x[i]; + xj = gm_x[j]; + + float3 dx = pbcDxAiuc(pbcAiuc, xi, xj); + + float rlen = rsqrtf(dx.x*dx.x + dx.y*dx.y + dx.z*dx.z); + rc = rlen*dx; + } + + sm_r[threadIdx.x] = rc; + // Make sure that all r's are saved into shared memory + // before they are accessed in the loop below + __syncthreads(); + + /* + * Constructing LINCS matrix (A) + */ + + // Only non-zero values are saved (for coupled constraints) + int coupledConstraintsCount = gm_coupledConstraintsCounts[threadIndex]; + for (int n = 0; n < coupledConstraintsCount; n++) + { + int index = n*numConstraintsThreads + threadIndex; + int c1 = gm_coupledConstraintsIdxes[index]; + + float3 rc1 = sm_r[c1 - blockIdx.x*blockDim.x]; + gm_matrixA[index] = gm_massFactors[index]*(rc.x*rc1.x + rc.y*rc1.y + rc.z*rc1.z); + } + + // Skipping in dummy threads + if (!isDummyThread) + { + xi = gm_xp[i]; + xj = gm_xp[j]; + } + + float3 dx = pbcDxAiuc(pbcAiuc, xi, xj); + + float sol = sqrtReducedMass*((rc.x*dx.x + rc.y*dx.y + rc.z*dx.z) - targetLength); + + /* + * Inverse matrix using a set of expansionOrder matrix multiplications + */ + + // This will use the same memory space as sm_r, which is no longer needed. + extern __shared__ float sm_rhs[]; + // Save current right-hand-side vector in the shared memory + sm_rhs[threadIdx.x] = sol; + + for (int rec = 0; rec < expansionOrder; rec++) + { + // Making sure that all sm_rhs are saved before they are accessed in a loop below + __syncthreads(); + float mvb = 0.0f; + + for (int n = 0; n < coupledConstraintsCount; n++) + { + int index = n*numConstraintsThreads + threadIndex; + int c1 = gm_coupledConstraintsIdxes[index]; + // Convolute current right-hand-side with A + // Different, non overlapping parts of sm_rhs[..] are read during odd and even iterations + mvb = mvb + gm_matrixA[index]*sm_rhs[c1 - blockIdx.x*blockDim.x + blockDim.x*(rec % 2)]; + } + // 'Switch' rhs vectors, save current result + // These values will be accessed in the loop above during the next iteration. + sm_rhs[threadIdx.x + blockDim.x*((rec + 1) % 2)] = mvb; + sol = sol + mvb; + } + + // Current mass-scaled Lagrange multipliers + lagrangeScaled = sqrtReducedMass*sol; + + // Save updated coordinates before correction for the rotational lengthening + float3 tmp = rc*lagrangeScaled; + + // Writing for all but dummy constraints + if (!isDummyThread) + { + atomicAdd(&gm_xp[i], -tmp*inverseMassi); + atomicAdd(&gm_xp[j], tmp*inverseMassj); + } + + /* + * Correction for centripetal effects + */ + for (int iter = 0; iter < numIterations; iter++) + { + // Make sure that all xp's are saved: atomic operation calls before are + // communicating current xp[..] values across thread block. 
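        // (Without this barrier, a thread could read a partially corrected xp of an
        // atom shared with a coupled constraint handled by another thread in this block.)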
+ __syncthreads(); + + if (!isDummyThread) + { + xi = gm_xp[i]; + xj = gm_xp[j]; + } + + float3 dx = pbcDxAiuc(pbcAiuc, xi, xj); + + float len2 = targetLength*targetLength; + float dlen2 = 2.0f*len2 - norm2(dx); + + // TODO A little bit more effective but slightly less readable version of the below would be: + // float proj = sqrtReducedMass*(targetLength - (dlen2 > 0.0f ? 1.0f : 0.0f)*dlen2*rsqrt(dlen2)); + float proj; + if (dlen2 > 0.0f) + { + proj = sqrtReducedMass*(targetLength - dlen2*rsqrt(dlen2)); + } + else + { + proj = sqrtReducedMass*targetLength; + } + + sm_rhs[threadIdx.x] = proj; + float sol = proj; + + /* + * Same matrix inversion as above is used for updated data + */ + for (int rec = 0; rec < expansionOrder; rec++) + { + // Make sure that all elements of rhs are saved into shared memory + __syncthreads(); + float mvb = 0; + + for (int n = 0; n < coupledConstraintsCount; n++) + { + int index = n*numConstraintsThreads + threadIndex; + int c1 = gm_coupledConstraintsIdxes[index]; + + mvb = mvb + gm_matrixA[index]*sm_rhs[c1 - blockIdx.x*blockDim.x + blockDim.x*(rec % 2)]; + + } + sm_rhs[threadIdx.x + blockDim.x*((rec + 1) % 2)] = mvb; + sol = sol + mvb; + } + + // Add corrections to Lagrange multipliers + float sqrtmu_sol = sqrtReducedMass*sol; + lagrangeScaled += sqrtmu_sol; + + // Save updated coordinates for the next iteration + // Dummy constraints are skipped + if (!isDummyThread) + { + float3 tmp = rc*sqrtmu_sol; + atomicAdd(&gm_xp[i], -tmp*inverseMassi); + atomicAdd(&gm_xp[j], tmp*inverseMassj); + } + } + + // Updating particle velocities for all but dummy threads + if (updateVelocities && !isDummyThread) + { + float3 tmp = rc*invdt*lagrangeScaled; + atomicAdd(&gm_v[i], -tmp*inverseMassi); + atomicAdd(&gm_v[j], tmp*inverseMassj); + } + + + if (computeVirial) + { + // Virial is computed from Lagrange multiplier (lagrangeScaled), target constrain length + // (targetLength) and the normalized vector connecting constrained atoms before + // the algorithm was applied (rc). The evaluation of virial in each thread is + // followed by basic reduction for the values inside single thread block. + // Then, the values are reduced across grid by atomicAdd(...). + // + // TODO Shuffle reduction. + // TODO Should be unified and/or done once when virial is actually needed. + // TODO Recursive version that removes atomicAdd(...)'s entirely is needed. Ideally, + // one that works for any datatype. + + // Save virial for each thread into the shared memory. Tensor is symmetrical, hence only + // 6 values are saved. Dummy threads will have zeroes in their virial: targetLength, + // lagrangeScaled and rc are all set to zero for them in the beginning of the kernel. + // The sm_threadVirial[..] will overlap with the sm_r[..] and sm_rhs[..], but the latter + // two are no longer in use. + extern __shared__ float sm_threadVirial[]; + float mult = targetLength*lagrangeScaled; + sm_threadVirial[0*blockDim.x + threadIdx.x] = mult*rc.x*rc.x; + sm_threadVirial[1*blockDim.x + threadIdx.x] = mult*rc.x*rc.y; + sm_threadVirial[2*blockDim.x + threadIdx.x] = mult*rc.x*rc.z; + sm_threadVirial[3*blockDim.x + threadIdx.x] = mult*rc.y*rc.y; + sm_threadVirial[4*blockDim.x + threadIdx.x] = mult*rc.y*rc.z; + sm_threadVirial[5*blockDim.x + threadIdx.x] = mult*rc.z*rc.z; + + __syncthreads(); + + // Reduce up to one virial per thread block. All blocks are divided by half, the first + // half of threads sums two virials. Then the first half is divided by two and the first + // half of it sums two values. 
This procedure is repeated until only one thread is left. + // Only works if the number of threads per block is a power of two (hence the static_assert + // in the class constructor). + for (int divideBy = 2; divideBy <= blockDim.x; divideBy *= 2) + { + int dividedAt = blockDim.x/divideBy; + if (threadIdx.x < dividedAt) + { + for (int d = 0; d < 6; d++) + { + sm_threadVirial[d*blockDim.x + threadIdx.x] += sm_threadVirial[d*blockDim.x + (threadIdx.x + dividedAt)]; + } + } + // Synchronize if not within one warp + if (dividedAt > warpSize/2) + { + __syncthreads(); + } + } + // First 6 threads in the block add the results of 6 tensor components to the global memory address. + if (threadIdx.x < 6) + { + atomicAdd(&(gm_virialScaled[threadIdx.x]), sm_threadVirial[threadIdx.x*blockDim.x]); + } + } + + return; +} + +/*! \brief Select templated kernel. + * + * Returns a pointer to a CUDA kernel based on the provided booleans. + * + * \param[in] updateVelocities If the velocities should be constrained. + * \param[in] computeVirial If virial should be updated. + * + * \return Pointer to CUDA kernel + */ +inline auto getLincsKernelPtr(const bool updateVelocities, + const bool computeVirial) +{ + + auto kernelPtr = lincs_kernel<true, true>; + if (updateVelocities && computeVirial) + { + kernelPtr = lincs_kernel<true, true>; + } + else if (updateVelocities && !computeVirial) + { + kernelPtr = lincs_kernel<true, false>; + } + else if (!updateVelocities && computeVirial) + { + kernelPtr = lincs_kernel<false, true>; + } + else if (!updateVelocities && !computeVirial) + { + kernelPtr = lincs_kernel<false, false>; + } + return kernelPtr; +} + +/*! \brief Apply LINCS. + * + * Applies LINCS to coordinates and velocities stored on the GPU. + * The data at the xPrime and v pointers (class fields) are changed in GPU + * memory. The results are not automatically copied back to CPU memory. + * The method uses the class data structures, which should be + * updated when needed using the set() method. + * + * \param[in] updateVelocities If the velocities should be constrained. + * \param[in] invdt Reciprocal timestep (to scale Lagrange + * multipliers when velocities are updated) + * \param[in] computeVirial If virial should be updated. + * \param[in,out] virialScaled Scaled virial tensor to be updated. + */ +void LincsCuda::Impl::apply(const bool updateVelocities, + const real invdt, + const bool computeVirial, + tensor virialScaled) +{ + ensureNoPendingCudaError("In CUDA version of LINCS"); + + if (computeVirial) + { + // Fill with zeros so the values can be reduced into it + // Only 6 values are needed because the virial is symmetrical + clearDeviceBufferAsync(&kernelParams_.d_virialScaled, 0, 6, stream_); + } + + auto kernelPtr = getLincsKernelPtr(updateVelocities, computeVirial); + + KernelLaunchConfig config; + config.blockSize[0] = c_threadsPerBlock; + config.blockSize[1] = 1; + config.blockSize[2] = 1; + config.gridSize[0] = (kernelParams_.numConstraintsThreads + c_threadsPerBlock - 1)/c_threadsPerBlock; + config.gridSize[1] = 1; + config.gridSize[2] = 1; + + // Shared memory is used to store: + // -- Current coordinates (3 floats per thread) + // -- Right-hand-sides for matrix inversion (2 floats per thread) + // -- Virial tensor components (6 floats per thread) + // Since none of these three are needed simultaneously, they can be saved at the same shared memory address + // (i.e. corresponding arrays are intentionally overlapped in address space). Consequently, only + // max{3, 2, 6} = 6 floats per thread are needed if the virial is computed, or max{3, 2} = 3 if not.
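    // For the default c_threadsPerBlock = 256 this amounts to 256*6*sizeof(float) = 6144 bytes
    // with the virial and 256*3*sizeof(float) = 3072 bytes without, well within the 48 KB of
    // static shared memory available per block on the supported CUDA devices.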
+ if (computeVirial) + { + config.sharedMemorySize = c_threadsPerBlock*6*sizeof(float); + } + else + { + config.sharedMemorySize = c_threadsPerBlock*3*sizeof(float); + } + config.stream = stream_; + + const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, + &kernelParams_, + &invdt); + + launchGpuKernel(kernelPtr, config, nullptr, + "lincs_kernel", kernelArgs); + + if (computeVirial) + { + // Copy LINCS virial data and add it to the common virial + copyFromDeviceBuffer(h_virialScaled_.data(), &kernelParams_.d_virialScaled, + 0, 6, + stream_, GpuApiCallBehavior::Sync, nullptr); + + // Mapping [XX, XY, XZ, YY, YZ, ZZ] internal format to a tensor object + virialScaled[XX][XX] += h_virialScaled_[0]; + virialScaled[XX][YY] += h_virialScaled_[1]; + virialScaled[XX][ZZ] += h_virialScaled_[2]; + + virialScaled[YY][XX] += h_virialScaled_[1]; + virialScaled[YY][YY] += h_virialScaled_[3]; + virialScaled[YY][ZZ] += h_virialScaled_[4]; + + virialScaled[ZZ][XX] += h_virialScaled_[2]; + virialScaled[ZZ][YY] += h_virialScaled_[4]; + virialScaled[ZZ][ZZ] += h_virialScaled_[5]; + } + + return; +} + +/*! \brief Create LINCS object + * + * \param[in] numAtoms Number of atoms that will be handles by LINCS. + * Used to compute the memory size in allocations and copy. + * \param[in] numIterations Number of iterations used to compute inverse matrix. + * \param[in] expansionOrder LINCS projection order for correcting the direction of constraint. + */ +LincsCuda::Impl::Impl(int numAtoms, + int numIterations, + int expansionOrder) + : numAtoms_(numAtoms) +{ + kernelParams_.numIterations = numIterations; + kernelParams_.expansionOrder = expansionOrder; + + static_assert(sizeof(real) == sizeof(float), + "Real numbers should be in single precision in GPU code."); + static_assert(c_threadsPerBlock > 0 && !(c_threadsPerBlock & (c_threadsPerBlock - 1) == 0), + "Nmber of threads per block should be a power of two in order for reduction to work."); + + // This is temporary. LINCS should not manage coordinates. + allocateDeviceBuffer(&kernelParams_.d_x, numAtoms, nullptr); + allocateDeviceBuffer(&kernelParams_.d_xp, numAtoms, nullptr); + allocateDeviceBuffer(&kernelParams_.d_v, numAtoms, nullptr); + + allocateDeviceBuffer(&kernelParams_.d_virialScaled, 6, nullptr); + h_virialScaled_.resize(6); + + // The data arrays should be expanded/reallocated on first call of set() function. + maxConstraintsNumberSoFar_ = 0; + // Use default stream. + // TODO The stream should/can be assigned by the GPU schedule when the code will be integrated. + stream_ = nullptr; + +} + +LincsCuda::Impl::~Impl() +{ +} + +/*! \brief Helper function to go through constraints recursively. + * + * For each constraint, counts the number of coupled constraints and stores the value in spaceNeeded array. + * This information is used to split the array of constraints between thread blocks on a GPU so there is no + * coupling between constraints from different thread blocks. After the 'spaceNeeded' array is filled, the + * value spaceNeeded[c] should be equal to the number of constraints that are coupled to 'c' and located + * after it in the constraints array. + * + * \param[in] a Atom index. + * \param[in,out] spaceNeeded Indicates if the constraint was already counted and stores + * the number of constraints (i) connected to it and (ii) located + * after it in memory. This array is filled by this recursive function. 
For a set of coupled constraints, only for the first one in this list + * the number of consecutive coupled constraints is needed: if there is + * not enough space for this set of constraints in the thread block, + * the group has to be moved to the next one. + * \param[in] atomsAdjacencyList Stores information about connections between atoms. + */ +inline int countCoupled(int a, std::vector<int> *spaceNeeded, + std::vector<std::vector<std::tuple<int, int, int> > > *atomsAdjacencyList) + +{ + int c2, a2, sign; + int counted = 0; + for (unsigned i = 0; i < atomsAdjacencyList->at(a).size(); i++) + { + std::tie(a2, c2, sign) = atomsAdjacencyList->at(a).at(i); + if (spaceNeeded->at(c2) == -1) + { + spaceNeeded->at(c2) = 0; // To indicate we've been here + counted += 1 + countCoupled(a2, spaceNeeded, atomsAdjacencyList); + } + } + return counted; +} + +/*! \brief + * Update data-structures (e.g. after NB search step). + * + * Updates the constraints data and copies it to the GPU. Should be + * called if the particles were sorted, redistributed between domains, etc. + * This version uses common data formats so it can be called from anywhere + * in the code. Does not recycle the data preparation routines from the CPU + * version. Works only with the simple case when all the constraints in idef + * are handled by a single GPU. Triangles are not handled as a special case. + * + * Information about constraints is taken from: + * idef.il[F_CONSTR].iatoms --- type (T) of constraint and two atom indexes (i1, i2) + * idef.iparams[T].constr.dA --- target length for constraint of type T + * From t_mdatom, the code takes: + * md.invmass --- array of inverse masses for each atom in the system. + * + * \param[in] idef Local topology data to get information on constraints from. + * \param[in] md Atoms data to get atom masses from. + */ +void LincsCuda::Impl::set(const t_idef &idef, + const t_mdatoms &md) +{ + + // List of constrained atoms (CPU memory) + std::vector<int2> constraintsHost; + // Equilibrium distances for the constraints (CPU) + std::vector<float> constraintsTargetLengthsHost; + // Number of constraints coupled with the current one (CPU) + std::vector<int> coupledConstraintsCountsHost; + // List of constraints coupled with the current one (CPU) + std::vector<int> coupledConstraintsIdxesHost; + // Mass factors (CPU) + std::vector<float> massFactorsHost; + + // List of constrained atoms in local topology + t_iatom *iatoms = idef.il[F_CONSTR].iatoms; + const int stride = NRAL(F_CONSTR) + 1; + const int numConstraints = idef.il[F_CONSTR].nr/stride; + // Constructing adjacency list --- useful intermediate structure + std::vector<std::vector<std::tuple<int, int, int> > > atomsAdjacencyList(numAtoms_); + for (int c = 0; c < numConstraints; c++) + { + int a1 = iatoms[stride*c + 1]; + int a2 = iatoms[stride*c + 2]; + + // Each constraint will be represented as a tuple, containing the index of the second constrained atom, + // the index of the constraint, and a sign that indicates the order of atoms in which they are listed. + // The sign is needed to compute the mass factors. + atomsAdjacencyList.at(a1).push_back(std::make_tuple(a2, c, +1)); + atomsAdjacencyList.at(a2).push_back(std::make_tuple(a1, c, -1)); + } + + // Compute how many coupled constraints are in front of each constraint. + // Needed to introduce splits in data so that all coupled constraints will be computed in a single GPU block. + // The position 'c' of the vector spaceNeeded should have the number of constraints that are coupled to a constraint + // 'c' and are after 'c' in the vector.
Only the first index of the connected group of the constraints is needed later in the + // code, hence the spaceNeeded vector is also used to keep track of whether the constraint was already counted. + std::vector<int> spaceNeeded; + spaceNeeded.resize(numConstraints, -1); + std::fill(spaceNeeded.begin(), spaceNeeded.end(), -1); + for (int c = 0; c < numConstraints; c++) + { + int a1 = iatoms[stride*c + 1]; + int a2 = iatoms[stride*c + 2]; + if (spaceNeeded.at(c) == -1) + { + spaceNeeded.at(c) = countCoupled(a1, &spaceNeeded, &atomsAdjacencyList) + + countCoupled(a2, &spaceNeeded, &atomsAdjacencyList); + } + } + + // Map of splits in the constraints data. For each 'old' constraint index it gives the 'new' one, which + // takes into account the empty spaces that might be needed at the end of each thread block. + std::vector<int> splitMap; + splitMap.resize(numConstraints, -1); + int currentMapIndex = 0; + for (int c = 0; c < numConstraints; c++) + { + // Check if coupled constraints all fit in one block + GMX_RELEASE_ASSERT(spaceNeeded.at(c) < c_threadsPerBlock, "Maximum number of coupled constraints exceeds the size of the CUDA thread block. " + "Most likely, you are trying to use the GPU version of LINCS with constraints on all bonds, " + "which is not supported. Try using H-bond constraints only."); + if (currentMapIndex / c_threadsPerBlock != (currentMapIndex + spaceNeeded.at(c)) / c_threadsPerBlock) + { + currentMapIndex = ((currentMapIndex/c_threadsPerBlock) + 1) * c_threadsPerBlock; + } + splitMap.at(c) = currentMapIndex; + currentMapIndex++; + } + kernelParams_.numConstraintsThreads = currentMapIndex + c_threadsPerBlock - currentMapIndex % c_threadsPerBlock; + GMX_RELEASE_ASSERT(kernelParams_.numConstraintsThreads % c_threadsPerBlock == 0, "Number of threads should be a multiple of the block size"); + + // Initialize constraints and their target lengths, taking into account the splits in the data arrays. + int2 pair; + pair.x = -1; + pair.y = -1; + constraintsHost.resize(kernelParams_.numConstraintsThreads, pair); + std::fill(constraintsHost.begin(), constraintsHost.end(), pair); + constraintsTargetLengthsHost.resize(kernelParams_.numConstraintsThreads, 0.0); + std::fill(constraintsTargetLengthsHost.begin(), constraintsTargetLengthsHost.end(), 0.0); + for (int c = 0; c < numConstraints; c++) + { + int a1 = iatoms[stride*c + 1]; + int a2 = iatoms[stride*c + 2]; + int type = iatoms[stride*c]; + + int2 pair; + pair.x = a1; + pair.y = a2; + constraintsHost.at(splitMap.at(c)) = pair; + constraintsTargetLengthsHost.at(splitMap.at(c)) = idef.iparams[type].constr.dA; + + } + + // The adjacency list of constraints (i.e. the list of coupled constraints for each constraint). + // We map a single thread to a single constraint, hence each thread 'c' will be using one element from + // coupledConstraintsCountsHost array, which is the number of constraints coupled to the constraint 'c'. + // The coupled constraints indexes are placed into the coupledConstraintsIdxesHost array. The latter is organized + // as a one-dimensional array to ensure good memory alignment. It is addressed as [c + i*numConstraintsThreads], + // where 'i' goes from zero to the number of constraints coupled to 'c'. 'numConstraintsThreads' is the width of + // the array --- a number greater than the total number of constraints, taking into account the splits in the + // constraints array due to the GPU block borders. This number can be adjusted to improve the memory access pattern. + // Mass factors are saved in a similar data structure.
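    // For example, with numConstraintsThreads = 512, the indexes of the constraints coupled
    // to the constraint handled by thread c = 5 are stored at positions 5, 517, 1029, ...
    // (i.e. at c + i*numConstraintsThreads for i = 0, 1, 2, ...), so that on each iteration
    // of the kernel loop consecutive threads read consecutive memory locations.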
int maxCoupledConstraints = 0; + for (int c = 0; c < numConstraints; c++) + { + int a1 = iatoms[stride*c + 1]; + int a2 = iatoms[stride*c + 2]; + + // Constraint 'c' is counted twice, but it should be excluded altogether. Hence '-2'. + int nCoupledConstraints = atomsAdjacencyList.at(a1).size() + atomsAdjacencyList.at(a2).size() - 2; + + if (nCoupledConstraints > maxCoupledConstraints) + { + maxCoupledConstraints = nCoupledConstraints; + } + } + + coupledConstraintsCountsHost.resize(kernelParams_.numConstraintsThreads, 0); + coupledConstraintsIdxesHost.resize(maxCoupledConstraints*kernelParams_.numConstraintsThreads, -1); + massFactorsHost.resize(maxCoupledConstraints*kernelParams_.numConstraintsThreads, -1); + + for (int c1 = 0; c1 < numConstraints; c1++) + { + coupledConstraintsCountsHost.at(splitMap.at(c1)) = 0; + int c1a1 = iatoms[stride*c1 + 1]; + int c1a2 = iatoms[stride*c1 + 2]; + int c2; + int c2a1; + int c2a2; + + int sign; + + // Constraints coupled through the first atom. + c2a1 = c1a1; + for (unsigned j = 0; j < atomsAdjacencyList.at(c1a1).size(); j++) + { + + std::tie(c2a2, c2, sign) = atomsAdjacencyList.at(c1a1).at(j); + + if (c1 != c2) + { + int index = kernelParams_.numConstraintsThreads*coupledConstraintsCountsHost.at(splitMap.at(c1)) + splitMap.at(c1); + + coupledConstraintsIdxesHost.at(index) = splitMap.at(c2); + + int center = c1a1; + + float sqrtmu1 = 1.0/sqrt(md.invmass[c1a1] + md.invmass[c1a2]); + float sqrtmu2 = 1.0/sqrt(md.invmass[c2a1] + md.invmass[c2a2]); + + massFactorsHost.at(index) = -sign*md.invmass[center]*sqrtmu1*sqrtmu2; + + coupledConstraintsCountsHost.at(splitMap.at(c1))++; + + } + } + + // Constraints coupled through the second atom. + c2a1 = c1a2; + for (unsigned j = 0; j < atomsAdjacencyList.at(c1a2).size(); j++) + { + + std::tie(c2a2, c2, sign) = atomsAdjacencyList.at(c1a2).at(j); + + if (c1 != c2) + { + int index = kernelParams_.numConstraintsThreads*coupledConstraintsCountsHost.at(splitMap.at(c1)) + splitMap.at(c1); + + coupledConstraintsIdxesHost.at(index) = splitMap.at(c2); + + int center = c1a2; + + float sqrtmu1 = 1.0/sqrt(md.invmass[c1a1] + md.invmass[c1a2]); + float sqrtmu2 = 1.0/sqrt(md.invmass[c2a1] + md.invmass[c2a2]); + + massFactorsHost.at(index) = sign*md.invmass[center]*sqrtmu1*sqrtmu2; + + coupledConstraintsCountsHost.at(splitMap.at(c1))++; + + } + } + } + + // (Re)allocate the memory if the number of constraints has increased. + if (numConstraints > maxConstraintsNumberSoFar_) + { + // Free memory if it was allocated before (i.e. if not the first time here).
if (maxConstraintsNumberSoFar_ > 0) + { + freeDeviceBuffer(&kernelParams_.d_inverseMasses); + + freeDeviceBuffer(&kernelParams_.d_constraints); + freeDeviceBuffer(&kernelParams_.d_constraintsTargetLengths); + + freeDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts); + freeDeviceBuffer(&kernelParams_.d_coupledConstraintsIdxes); + freeDeviceBuffer(&kernelParams_.d_massFactors); + freeDeviceBuffer(&kernelParams_.d_matrixA); + + } + maxConstraintsNumberSoFar_ = numConstraints; + + allocateDeviceBuffer(&kernelParams_.d_inverseMasses, numAtoms_, nullptr); + + allocateDeviceBuffer(&kernelParams_.d_constraints, kernelParams_.numConstraintsThreads, nullptr); + allocateDeviceBuffer(&kernelParams_.d_constraintsTargetLengths, kernelParams_.numConstraintsThreads, nullptr); + + allocateDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts, kernelParams_.numConstraintsThreads, nullptr); + allocateDeviceBuffer(&kernelParams_.d_coupledConstraintsIdxes, maxCoupledConstraints*kernelParams_.numConstraintsThreads, nullptr); + allocateDeviceBuffer(&kernelParams_.d_massFactors, maxCoupledConstraints*kernelParams_.numConstraintsThreads, nullptr); + allocateDeviceBuffer(&kernelParams_.d_matrixA, maxCoupledConstraints*kernelParams_.numConstraintsThreads, nullptr); + + } + + // Copy data to GPU. + copyToDeviceBuffer(&kernelParams_.d_constraints, constraintsHost.data(), + 0, kernelParams_.numConstraintsThreads, + stream_, GpuApiCallBehavior::Sync, nullptr); + copyToDeviceBuffer(&kernelParams_.d_constraintsTargetLengths, constraintsTargetLengthsHost.data(), + 0, kernelParams_.numConstraintsThreads, + stream_, GpuApiCallBehavior::Sync, nullptr); + copyToDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts, coupledConstraintsCountsHost.data(), + 0, kernelParams_.numConstraintsThreads, + stream_, GpuApiCallBehavior::Sync, nullptr); + copyToDeviceBuffer(&kernelParams_.d_coupledConstraintsIdxes, coupledConstraintsIdxesHost.data(), + 0, maxCoupledConstraints*kernelParams_.numConstraintsThreads, + stream_, GpuApiCallBehavior::Sync, nullptr); + copyToDeviceBuffer(&kernelParams_.d_massFactors, massFactorsHost.data(), + 0, maxCoupledConstraints*kernelParams_.numConstraintsThreads, + stream_, GpuApiCallBehavior::Sync, nullptr); + + GMX_RELEASE_ASSERT(md.invmass != nullptr, "Masses of atoms should be specified.\n"); + copyToDeviceBuffer(&kernelParams_.d_inverseMasses, md.invmass, + 0, numAtoms_, + stream_, GpuApiCallBehavior::Sync, nullptr); + +} + +/*! \brief + * Update PBC data. + * + * Converts pbc data from t_pbc into the PbcAiuc format and stores the latter. + * + * \param[in] pbc The PBC data in t_pbc format. + */ +void LincsCuda::Impl::setPbc(const t_pbc *pbc) +{ + setPbcAiuc(pbc->ndim_ePBC, pbc->box, &kernelParams_.pbcAiuc); +} + +/*! \brief + * Copy coordinates from provided CPU location to GPU. + * + * Copies the coordinates before the integration step (x) and coordinates + * after the integration step (xp) from the provided CPU location to GPU. + * The data are assumed to be in float3/fvec format (single precision). + * + * \param[in] h_x CPU pointer where coordinates should be copied from. + * \param[in] h_xp CPU pointer where coordinates should be copied from. + */ +void LincsCuda::Impl::copyCoordinatesToGpu(const rvec *h_x, const rvec *h_xp) +{ + copyToDeviceBuffer(&kernelParams_.d_x, (float3*)h_x, 0, numAtoms_, stream_, GpuApiCallBehavior::Sync, nullptr); + copyToDeviceBuffer(&kernelParams_.d_xp, (float3*)h_xp, 0, numAtoms_, stream_, GpuApiCallBehavior::Sync, nullptr); +} + +/*!
\brief + * Copy velocities from provided CPU location to GPU. + * + * Nothing is done if the argument provided is a nullptr. + * The data are assumed to be in float3/fvec format (single precision). + * + * \param[in] h_v CPU pointer where velocities should be copied from. + */ +void LincsCuda::Impl::copyVelocitiesToGpu(const rvec *h_v) +{ + if (h_v != nullptr) + { + copyToDeviceBuffer(&kernelParams_.d_v, (float3*)h_v, 0, numAtoms_, stream_, GpuApiCallBehavior::Sync, nullptr); + } +} + +/*! \brief + * Copy coordinates from GPU to provided CPU location. + * + * Copies the constrained coordinates to the provided location. The coordinates + * are assumed to be in float3/fvec format (single precision). + * + * \param[out] h_xp CPU pointer where coordinates should be copied to. + */ +void LincsCuda::Impl::copyCoordinatesFromGpu(rvec *h_xp) +{ + copyFromDeviceBuffer((float3*)h_xp, &kernelParams_.d_xp, 0, numAtoms_, stream_, GpuApiCallBehavior::Sync, nullptr); +} + +/*! \brief + * Copy velocities from GPU to provided CPU location. + * + * The velocities are assumed to be in float3/fvec format (single precision). + * + * \param[in] h_v Pointer to velocities data. + */ +void LincsCuda::Impl::copyVelocitiesFromGpu(rvec *h_v) +{ + copyFromDeviceBuffer((float3*)h_v, &kernelParams_.d_v, 0, numAtoms_, stream_, GpuApiCallBehavior::Sync, nullptr); +} + +/*! \brief + * Set the internal GPU-memory x, xprime and v pointers. + * + * Data is not copied. The data are assumed to be in float3/fvec format + * (float3 is used internally, but the data layout should be identical). + * + * \param[in] d_x Pointer to the coordinates before integrator update (on GPU) + * \param[in] d_xp Pointer to the coordinates after integrator update, before update (on GPU) + * \param[in] d_v Pointer to the velocities before integrator update (on GPU) + */ +void LincsCuda::Impl::setXVPointers(rvec *d_x, rvec *d_xp, rvec *d_v) +{ + kernelParams_.d_x = (float3*)d_x; + kernelParams_.d_xp = (float3*)d_xp; + kernelParams_.d_v = (float3*)d_v; +} + + +LincsCuda::LincsCuda(const int numAtoms, + const int numIterations, + const int expansionOrder) + : impl_(new Impl(numAtoms, numIterations, expansionOrder)) +{ +} + +LincsCuda::~LincsCuda() = default; + +void LincsCuda::apply(const bool updateVelocities, + const real invdt, + const bool computeVirial, + tensor virialScaled) +{ + impl_->apply(updateVelocities, + invdt, + computeVirial, + virialScaled); +} + +void LincsCuda::setPbc(const t_pbc *pbc) +{ + impl_->setPbc(pbc); +} + +void LincsCuda::set(const t_idef &idef, + const t_mdatoms &md) +{ + impl_->set(idef, md); +} + +void LincsCuda::copyCoordinatesToGpu(const rvec *h_x, const rvec *h_xp) +{ + impl_->copyCoordinatesToGpu(h_x, h_xp); +} + +void LincsCuda::copyVelocitiesToGpu(const rvec *h_v) +{ + impl_->copyVelocitiesToGpu(h_v); +} + +void LincsCuda::copyCoordinatesFromGpu(rvec *h_xp) +{ + impl_->copyCoordinatesFromGpu(h_xp); +} + +void LincsCuda::copyVelocitiesFromGpu(rvec *h_v) +{ + impl_->copyVelocitiesFromGpu(h_v); +} + +void LincsCuda::setXVPointers(rvec *d_x, rvec *d_xp, rvec *d_v) +{ + impl_->setXVPointers(d_x, d_xp, d_v); +} + +} // namespace gmx diff --git a/src/gromacs/mdlib/lincs_cuda_impl.h b/src/gromacs/mdlib/lincs_cuda_impl.h new file mode 100644 index 0000000000..8876bb9d69 --- /dev/null +++ b/src/gromacs/mdlib/lincs_cuda_impl.h @@ -0,0 +1,257 @@ +/* + * This file is part of the GROMACS molecular simulation package. 
+ * + * Copyright (c) 2019, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * + * \brief Declares CUDA implementation class for LINCS + * + * This header file is needed to include from both the device-side + * kernels file, and the host-side management code. + * + * \author Artem Zhmurov + * + * \ingroup module_mdlib + */ +#ifndef GMX_MDLIB_LINCS_CUDA_IMPL_H +#define GMX_MDLIB_LINCS_CUDA_IMPL_H + + +#include "gromacs/mdlib/constr.h" +#include "gromacs/mdlib/lincs_cuda.h" +#include "gromacs/mdtypes/mdatom.h" +#include "gromacs/pbcutil/pbc_aiuc_cuda.cuh" +#include "gromacs/topology/idef.h" + +namespace gmx +{ + +/* \brief LINCS parameters and GPU pointers + * + * This is used to accumulate all the parameters and pointers so they can be passed + * to the GPU as a single structure. + * + */ +struct LincsCudaKernelParameters +{ + //! Periodic boundary data + PbcAiuc pbcAiuc; + //! Order of expansion when inverting the matrix + int expansionOrder; + //! Number of iterations used to correct the projection + int numIterations; + //! Coordinates before the timestep (in the GPU memory) + float3 *d_x; + //! Coordinates after the timestep, before constraining (on GPU) + float3 *d_xp; + //! Velocities of atoms (on GPU) + float3 *d_v; + //! 1/mass for all atoms (GPU) + float *d_inverseMasses; + //! Scaled virial tensor (6 floats: [XX, XY, XZ, YY, YZ, ZZ], GPU) + float *d_virialScaled; + /*! \brief Total number of threads. + * + * This covers all constraints and gaps in the ends of the thread blocks + * that are necessary to avoid inter-block synchronizations. + * Should be a multiple of block size (the last block is filled with dummy to the end). + */ + int numConstraintsThreads; + //! List of constrained atoms (GPU memory) + int2 *d_constraints; + //! Equilibrium distances for the constraints (GPU) + float *d_constraintsTargetLengths; + //! 
+    //! Number of constraints coupled to the current one (GPU)
+    int        *d_coupledConstraintsCounts;
+    //! List of indices of constraints coupled to the current one (GPU)
+    int        *d_coupledConstraintsIdxes;
+    //! Elements of the coupling matrix
+    float      *d_matrixA;
+    //! Mass factors (GPU)
+    float      *d_massFactors;
+};
+
+/*! \internal \brief Class with interfaces and data for the CUDA version of LINCS. */
+class LincsCuda::Impl
+{
+
+    public:
+
+        /*! \brief Constructor.
+         *
+         * \param[in] numAtoms        Number of atoms.
+         * \param[in] numIterations   Number of iterations used to correct the projection.
+         * \param[in] expansionOrder  Order of the matrix inversion algorithm.
+         */
+        Impl(int numAtoms,
+             int numIterations,
+             int expansionOrder);
+        /*! \brief Destructor.*/
+        ~Impl();
+
+        /*! \brief Apply LINCS.
+         *
+         * Applies LINCS to coordinates and velocities stored on the GPU.
+         * The data at the xPrime and v pointers (class fields) are changed
+         * in GPU memory. The results are not automatically copied back to
+         * CPU memory. This method uses the data structures of this class,
+         * which should be updated when needed using the set() method.
+         *
+         * \param[in]     updateVelocities  If the velocities should be constrained.
+         * \param[in]     invdt             Reciprocal timestep (to scale Lagrange
+         *                                  multipliers when velocities are updated).
+         * \param[in]     computeVirial     If the virial should be updated.
+         * \param[in,out] virialScaled      Scaled virial tensor to be updated.
+         */
+        void apply(bool    updateVelocities,
+                   real    invdt,
+                   bool    computeVirial,
+                   tensor  virialScaled);
+
+        /*! \brief
+         * Update data-structures (e.g. after NB search step).
+         *
+         * Updates the constraints data and copies it to the GPU. Should be
+         * called if the particles were sorted, redistributed between domains, etc.
+         * This version uses common data formats so it can be called from anywhere
+         * in the code. Does not recycle the data preparation routines from the CPU
+         * version. Works only with the simple case when all the constraints in idef
+         * are handled by a single GPU. Triangles are not handled as a special case.
+         *
+         * Information about constraints is taken from:
+         *     idef.il[F_CONSTR].iatoms  --- type (T) of constraint and two atom indexes (i1, i2)
+         *     idef.iparams[T].constr.dA --- target length for constraint of type T
+         * From t_mdatoms, the code takes:
+         *     md.invmass  --- array of inverse masses for each atom in the system.
+         *
+         * \param[in] idef  Local topology data to get information on constraints from.
+         * \param[in] md    Atoms data to get atom masses from.
+         */
+        void set(const t_idef    &idef,
+                 const t_mdatoms &md);
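To make the layout documented for set() concrete, the loop below shows how the constraints stream can be traversed on the host. It is an illustrative sketch of the documented format only, not the implementation from lincs_cuda_impl.cu:

    // Each constraint occupies three consecutive ints in idef.il[F_CONSTR].iatoms.
    const t_iatom *iatoms         = idef.il[F_CONSTR].iatoms;
    const int      numConstraints = idef.il[F_CONSTR].nr/3;
    for (int c = 0; c < numConstraints; c++)
    {
        const int  type         = iatoms[3*c];     // constraint type T
        const int  i1           = iatoms[3*c + 1]; // first constrained atom
        const int  i2           = iatoms[3*c + 2]; // second constrained atom
        const real targetLength = idef.iparams[type].constr.dA;
        // (i1, i2) are packed into d_constraints and targetLength into
        // d_constraintsTargetLengths before the upload to the GPU.
    }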
+        /*! \brief
+         * Update PBC data.
+         *
+         * Converts PBC data from the t_pbc format into the PbcAiuc format and stores the latter.
+         *
+         * \param[in] pbc  The PBC data in t_pbc format.
+         */
+        void setPbc(const t_pbc *pbc);
+
+        /*! \brief
+         * Copy coordinates from provided CPU location to GPU.
+         *
+         * Copies the coordinates before the integration step (x) and the coordinates
+         * after the integration step (xp) from the provided CPU location to the GPU.
+         * The data are assumed to be in float3/fvec format (single precision).
+         *
+         * \param[in] h_x   CPU pointer to copy the coordinates before the integration step from.
+         * \param[in] h_xp  CPU pointer to copy the coordinates after the integration step from.
+         */
+        void copyCoordinatesToGpu(const rvec *h_x, const rvec *h_xp);
+
+        /*! \brief
+         * Copy velocities from provided CPU location to GPU.
+         *
+         * Nothing is done if the argument provided is a nullptr.
+         * The data are assumed to be in float3/fvec format (single precision).
+         *
+         * \param[in] h_v  CPU pointer where velocities should be copied from.
+         */
+        void copyVelocitiesToGpu(const rvec *h_v);
+
+        /*! \brief
+         * Copy coordinates from GPU to provided CPU location.
+         *
+         * Copies the constrained coordinates to the provided location. The coordinates
+         * are assumed to be in float3/fvec format (single precision).
+         *
+         * \param[out] h_xp  CPU pointer where coordinates should be copied to.
+         */
+        void copyCoordinatesFromGpu(rvec *h_xp);
+
+        /*! \brief
+         * Copy velocities from GPU to provided CPU location.
+         *
+         * The velocities are assumed to be in float3/fvec format (single precision).
+         *
+         * \param[out] h_v  CPU pointer where velocities should be copied to.
+         */
+        void copyVelocitiesFromGpu(rvec *h_v);
+
+        /*! \brief
+         * Set the internal GPU-memory x, xprime and v pointers.
+         *
+         * Data is not copied. The data are assumed to be in float3/fvec format
+         * (float3 is used internally, but the data layout should be identical).
+         *
+         * \param[in] d_x   Pointer to the coordinates before integrator update (on GPU)
+         * \param[in] d_xp  Pointer to the coordinates after integrator update, before constraining (on GPU)
+         * \param[in] d_v   Pointer to the velocities before integrator update (on GPU)
+         */
+        void setXVPointers(rvec *d_x, rvec *d_xp, rvec *d_v);
+
+    private:
+
+        /*! \brief CUDA stream
+         *
+         * \todo Currently the default stream (nullptr) is used. The streams should
+         *       be managed outside this module and passed here if needed.
+         */
+        cudaStream_t               stream_;
+
+        //! Parameters and pointers, passed to the CUDA kernel
+        LincsCudaKernelParameters  kernelParams_;
+
+        /*! \brief Number of atoms
+         *
+         * \todo Probably this should not be stored by this class, but passed as an
+         *       argument when needed. This question is coupled with the general
+         *       relocation of the coordinates and velocities.
+         */
+        int                        numAtoms_;
+
+        //! Scaled virial tensor (6 floats: [XX, XY, XZ, YY, YZ, ZZ])
+        std::vector<float>         h_virialScaled_;
+
+        /*! \brief Maximum total number of constraints assigned to this class so far.
+         *
+         * If the new number of constraints is larger than the previous maximum,
+         * the GPU data arrays are reallocated.
+         */
+        int                        maxConstraintsNumberSoFar_;
+};
+
+} // namespace gmx
+
+#endif
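Both h_virialScaled_ and d_virialScaled use the packed [XX, XY, XZ, YY, YZ, ZZ] layout, which maps onto a symmetric tensor. The helper below is a hypothetical illustration of that mapping (the actual accumulation happens in the implementation file):

    // Sketch: expanding the packed 6-float scaled-virial buffer into a
    // symmetric 3x3 tensor. Hypothetical helper, illustrating the layout only.
    static void expandScaledVirial(const std::vector<float> &v6, tensor virialScaled)
    {
        virialScaled[XX][XX] += v6[0];
        virialScaled[XX][YY] += v6[1];
        virialScaled[XX][ZZ] += v6[2];
        virialScaled[YY][XX] += v6[1]; // symmetric counterpart of XY
        virialScaled[YY][YY] += v6[3];
        virialScaled[YY][ZZ] += v6[4];
        virialScaled[ZZ][XX] += v6[2]; // symmetric counterpart of XZ
        virialScaled[ZZ][YY] += v6[4]; // symmetric counterpart of YZ
        virialScaled[ZZ][ZZ] += v6[5];
    }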
diff --git a/src/gromacs/mdlib/tests/constr.cpp b/src/gromacs/mdlib/tests/constr.cpp
index 6f05453dcd..a5f7a3b215 100644
--- a/src/gromacs/mdlib/tests/constr.cpp
+++ b/src/gromacs/mdlib/tests/constr.cpp
@@ -35,6 +35,12 @@
 /*! \internal \file
  * \brief SHAKE and LINCS tests.
  *
+ * \todo Better tests for the virial are needed.
+ * \todo Tests for bigger systems to test threads synchronization,
+ *       reduction, etc. on the GPU.
+ * \todo Tests for algorithms for derivatives.
+ * \todo Free-energy perturbation tests.
+ *
  * \author Artem Zhmurov
  * \ingroup module_mdlib
 */
@@ -43,6 +49,8 @@
 
 #include "gromacs/mdlib/constr.h"
 
+#include "config.h"
+
 #include
 
 #include
 
@@ -61,6 +69,7 @@
 #include "gromacs/math/vectypes.h"
 #include "gromacs/mdlib/gmx_omp_nthreads.h"
 #include "gromacs/mdlib/lincs.h"
+#include "gromacs/mdlib/lincs_cuda.h"
 #include "gromacs/mdlib/shake.h"
 #include "gromacs/mdtypes/commrec.h"
 #include "gromacs/mdtypes/inputrec.h"
@@ -91,38 +100,63 @@ namespace test
 struct ConstraintsTestData
 {
     public:
-        std::string           title_;             //!< Human-friendly name for a system
-        int                   nAtom_;             //!< Number of atoms
-        gmx_mtop_t            mtop_;              //!< Topology
-        std::vector<real>     masses_;            //!< Masses
-        std::vector<real>     invmass_;           //!< Inverse masses
-        t_commrec             cr_;                //!< Communication record
-        t_inputrec            ir_;                //!< Input record (info that usually in .mdp file)
-        t_idef                idef_;              //!< Local topology
-        t_mdatoms             md_;                //!< MD atoms
-        gmx_multisim_t        ms_;                //!< Multisim data
-        t_nrnb                nrnb_;              //!< Computational time array (normally used in benchmarks)
-
-        real                  invdt_;             //!< Inverse timestep
-        int                   nflexcon_ = 0;      //!< Number of flexible constraints
-        bool                  computeVirial_;     //!< Whether the virial should be computed
-        tensor                virialScaled_;      //!< Scaled virial
-        tensor                virialScaledRef_;   //!< Scaled virial (reference values)
-        bool                  compute_dHdLambda_; //!< If the free energy is computed
-        real                  dHdLambda_;         //!< For free energy computation
-        real                  dHdLambdaRef_;      //!< For free energy computation (reference value)
-
-        PaddedVector<RVec>    x_;                 //!< Coordinates before the timestep
-        PaddedVector<RVec>    xPrime_;            //!< Coordinates after timestep, output for the constraints
-        PaddedVector<RVec>    xPrime0_;           //!< Backup for coordinates (for reset)
-        PaddedVector<RVec>    xPrime2_;           //!< Intermediate set of coordinates used by LINCS and
-                                                  //!< SHAKE for different purposes
-        PaddedVector<RVec>    v_;                 //!< Velocities
-        PaddedVector<RVec>    v0_;                //!< Backup for velocities (for reset)
-
-        // Fields to store constraints data for testing
-        std::vector<int>      constraints_;       //!< Constraints data (type1-i1-j1-type2-i2-j2-...)
-        std::vector<real>     constraintsR0_;     //!< Target lengths for all constraint types
+        //! Human-friendly name for a system
+        std::string           title_;
+        //! Number of atoms
+        int                   numAtoms_;
+        //! Topology
+        gmx_mtop_t            mtop_;
+        //! Masses
+        std::vector<real>     masses_;
+        //! Inverse masses
+        std::vector<real>     invmass_;
+        //! Communication record
+        t_commrec             cr_;
+        //! Input record (information that is usually in the .mdp file)
+        t_inputrec            ir_;
+        //! Local topology
+        t_idef                idef_;
+        //! MD atoms
+        t_mdatoms             md_;
+        //! Multisim data
+        gmx_multisim_t        ms_;
+        //! Computational time array (normally used to benchmark performance)
+        t_nrnb                nrnb_;
+
+        //! Inverse timestep
+        real                  invdt_;
+        //! Number of flexible constraints
+        int                   nflexcon_ = 0;
+        //! Whether the virial should be computed
+        bool                  computeVirial_;
+        //! Scaled virial
+        tensor                virialScaled_;
+        //! Scaled virial (reference values)
+        tensor                virialScaledRef_;
+        //! If the free energy is computed
+        bool                  compute_dHdLambda_;
+        //! For free energy computation
+        real                  dHdLambda_;
+        //! For free energy computation (reference value)
+        real                  dHdLambdaRef_;
+
+        //! Coordinates before the timestep
+        PaddedVector<RVec>    x_;
+        //! Coordinates after timestep, output for the constraints
+        PaddedVector<RVec>    xPrime_;
+        //! Backup for coordinates (for reset)
+        PaddedVector<RVec>    xPrime0_;
+        //! Intermediate set of coordinates (normally used for projection correction)
+        PaddedVector<RVec>    xPrime2_;
+        //! Velocities
+        PaddedVector<RVec>    v_;
+        //! Backup for velocities (for reset)
+        PaddedVector<RVec>    v0_;
+
+        //! Constraints data (type1-i1-j1-type2-i2-j2-...)
+        std::vector<int>      constraints_;
+        //! Target lengths for all constraint types
+        std::vector<real>     constraintsR0_;
 
         /*! \brief
          * Constructor for the object with all parameters and variables needed by constraints algorithms.
@@ -132,57 +166,57 @@ struct ConstraintsTestData
          * are saved to allow for reset. The constraints data are stored for testing after constraints
          * were applied.
          *
-         * \param[in] title          Human-friendly name of the system.
-         * \param[in] nAtom          Number of atoms in the system.
-         * \param[in] masses         Atom masses. Size of this vector should be equal to nAtom.
-         * \param[in] constraints    List of constraints, organized in triples of integers.
-         *                           First integer is the index of type for a constraint, second
-         *                           and third are the indices of constrained atoms. The types
-         *                           of constraints should be sequential but not necessarily
-         *                           start from zero (which is the way they normally are in
-         *                           GROMACS).
-         * \param[in] constraintsR0  Target values for bond lengths for bonds of each type. The
-         *                           size of this vector should be equal to the total number of
-         *                           unique types in constraints vector.
-         * \param[in] computeVirial  Whether the virial should be computed.
-         * \param[in] virialScaledRef Reference values for scaled virial tensor.
-         * \param[in] compute_dHdLambda Whether free energy should be computed.
-         * \param[in] dHdLambdaRef   Reference value for dHdLambda.
-         * \param[in] initialTime    Initial time.
-         * \param[in] timestep       Timestep.
-         * \param[in] x              Coordinates before integration step.
-         * \param[in] xPrime         Coordinates after integration step, but before constraining.
-         * \param[in] v              Velocities before constraining.
-         * \param[in] shakeTolerance Target tolerance for SHAKE.
-         * \param[in] shakeUseSOR    Use successive over-relaxation method for SHAKE iterations.
-         *                           The general formula is:
-         *                           x_n+1 = (1-omega)*x_n + omega*f(x_n),
-         *                           where omega = 1 if SOR is off and may be < 1 if SOR is on.
-         * \param[in] nLincsIter     Number of iterations used to compute the inverse matrix.
-         * \param[in] nProjOrder     The order for algorithm that adjusts the direction of the
-         *                           bond after constraints are applied.
-         * \param[in] lincsWarnAngle The threshold value for the change in bond angle. When
-         *                           exceeded the program will issue a warning.
+         * \param[in] title               Human-friendly name of the system.
+         * \param[in] numAtoms            Number of atoms in the system.
+         * \param[in] masses              Atom masses. Size of this vector should be equal to numAtoms.
+         * \param[in] constraints         List of constraints, organized in triples of integers.
+         *                                First integer is the index of type for a constraint, second
+         *                                and third are the indices of constrained atoms. The types
+         *                                of constraints should be sequential but not necessarily
+         *                                start from zero (which is the way they normally are in
+         *                                GROMACS).
+         * \param[in] constraintsR0       Target values for bond lengths for bonds of each type. The
+         *                                size of this vector should be equal to the total number of
+         *                                unique types in constraints vector.
+         * \param[in] computeVirial       Whether the virial should be computed.
+         * \param[in] virialScaledRef     Reference values for scaled virial tensor.
+         * \param[in] compute_dHdLambda   Whether free energy should be computed.
+         * \param[in] dHdLambdaRef        Reference value for dHdLambda.
+         * \param[in] initialTime         Initial time.
+         * \param[in] timestep            Timestep.
+         * \param[in] x                   Coordinates before integration step.
+         * \param[in] xPrime              Coordinates after integration step, but before constraining.
+         * \param[in] v                   Velocities before constraining.
+         * \param[in] shakeTolerance      Target tolerance for SHAKE.
+         * \param[in] shakeUseSOR         Use the successive over-relaxation method for SHAKE iterations.
+         *                                The general formula is:
+         *                                x_n+1 = (1-omega)*x_n + omega*f(x_n),
+         *                                where omega = 1 if SOR is off and may be < 1 if SOR is on.
+         * \param[in] lincsNumIterations  Number of iterations used to correct the projection.
+         * \param[in] lincsExpansionOrder The order of the matrix expansion used when inverting the
+         *                                constraint coupling matrix.
+         * \param[in] lincsWarnAngle      The threshold value for the change in bond angle. When
+         *                                exceeded the program will issue a warning.
          *
          */
        ConstraintsTestData(const std::string &title,
-                            int nAtom, std::vector<real> masses,
+                            int numAtoms, std::vector<real> masses,
                            std::vector<int> constraints, std::vector<real> constraintsR0,
                            bool computeVirial, tensor virialScaledRef,
                            bool compute_dHdLambda, float dHdLambdaRef,
                            real initialTime, real timestep,
                            const std::vector<RVec> &x, const std::vector<RVec> &xPrime, const std::vector<RVec> &v,
                            real shakeTolerance, gmx_bool shakeUseSOR,
-                            int nLincsIter, int nProjOrder, real lincsWarnAngle)
+                            int lincsNumIterations, int lincsExpansionOrder, real lincsWarnAngle)
        {
-            title_ = title;   // Human-friendly name of the system
-            nAtom_ = nAtom;   // Number of atoms
+            title_    = title;    // Human-friendly name of the system
+            numAtoms_ = numAtoms; // Number of atoms
 
            // Masses of atoms
            masses_ = masses;
-            invmass_.resize(nAtom); // Vector of inverse masses
+            invmass_.resize(numAtoms); // Vector of inverse masses
 
-            for (int i = 0; i < nAtom; i++)
+            for (int i = 0; i < numAtoms; i++)
            {
                invmass_[i] = 1.0/masses.at(i);
            }
@@ -211,8 +245,8 @@ struct ConstraintsTestData
            md_.nMassPerturbed = 0;
            md_.lambda         = 0.0;
            md_.invmass        = invmass_.data();
-            md_.nr     = nAtom;
-            md_.homenr = nAtom;
+            md_.nr     = numAtoms;
+            md_.homenr = numAtoms;
 
            // Virial evaluation
            computeVirial_ = computeVirial;
@@ -281,7 +315,7 @@ struct ConstraintsTestData
            interactionListEmpty.iatoms.resize(0);
 
            gmx_moltype_t molType;
-            molType.atoms.nr = nAtom;
+            molType.atoms.nr = numAtoms;
            molType.ilist.at(F_CONSTR)   = interactionList;
            molType.ilist.at(F_CONSTRNC) = interactionListEmpty;
            mtop_.moltype.push_back(molType);
@@ -291,7 +325,7 @@ struct ConstraintsTestData
            molBlock.nmol  = 1;
            mtop_.molblock.push_back(molBlock);
 
-            mtop_.natoms = nAtom;
+            mtop_.natoms = numAtoms;
            mtop_.ffparams.iparams.resize(maxType + 1);
            for (int i = 0; i <= maxType; i++)
            {
@@ -300,13 +334,13 @@ struct ConstraintsTestData
            mtop_.bIntermolecularInteractions = false;
 
            // Coordinates and velocities
-            x_.resizeWithPadding(nAtom_);
-            xPrime_.resizeWithPadding(nAtom_);
-            xPrime0_.resizeWithPadding(nAtom_);
-            xPrime2_.resizeWithPadding(nAtom_);
+            x_.resizeWithPadding(numAtoms);
+            xPrime_.resizeWithPadding(numAtoms);
+            xPrime0_.resizeWithPadding(numAtoms);
+            xPrime2_.resizeWithPadding(numAtoms);
 
-            v_.resizeWithPadding(nAtom_);
-            v0_.resizeWithPadding(nAtom_);
+            v_.resizeWithPadding(numAtoms);
+            v0_.resizeWithPadding(numAtoms);
 
            std::copy(x.begin(), x.end(), x_.begin());
            std::copy(xPrime.begin(), xPrime.end(), xPrime_.begin());
@@ -317,13 +351,13 @@ struct ConstraintsTestData
            std::copy(v.begin(), v.end(), v0_.begin());
 
            // SHAKE-specific parameters
-            ir_.shake_tol      = shakeTolerance;
-            ir_.bShakeSOR      = shakeUseSOR;
+            ir_.shake_tol      = shakeTolerance;
+            ir_.bShakeSOR      = shakeUseSOR;
 
            // LINCS-specific parameters
-            ir_.nLincsIter     = nLincsIter;
-            ir_.nProjOrder     = nProjOrder;
-            ir_.LincsWarnAngle = lincsWarnAngle;
+            ir_.nLincsIter     = lincsNumIterations;
+            ir_.nProjOrder     = lincsExpansionOrder;
+            ir_.LincsWarnAngle = lincsWarnAngle;
        }
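To illustrate the constraints encoding documented above: a three-atom chain with one bond of type 0 between atoms 0 and 1 and one bond of type 1 between atoms 1 and 2 (the same shape as the ThreeSequentialConstraints case below) would be passed as:

    // Triples encoding {type, i, j} per constraint, plus per-type target lengths.
    std::vector<int>  constraints   = {0, 0, 1,    // type 0 constrains atoms 0 and 1
                                       1, 1, 2};   // type 1 constrains atoms 1 and 2
    std::vector<real> constraintsR0 = {0.1, 0.2};  // target lengths (nm) for types 0 and 1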
        /*! \brief
@@ -368,7 +402,7 @@ struct ConstraintsTestData
  * The test will run for all possible combinations of accessible
  * values of the:
  * 1. PBC setup ("PBCNONE" or "PBCXYZ")
- * 2. The algorithm ("SHAKE" or "LINCS").
+ * 2. The algorithm ("SHAKE", "LINCS" or "LINCS_CUDA").
  */
 typedef std::tuple<std::string, std::string> ConstraintsTestParameters;
 
@@ -429,6 +463,10 @@ class ConstraintsTest : public ::testing::TestWithParam<ConstraintsTestParameters>
+        /*! \brief
+         * Initialize and apply LINCS constraints on a CUDA-enabled GPU.
+         *
+         * \param[in] testData  Test data structure.
+         * \param[in] pbc       Periodic boundary data.
+         */
+        void applyLincsCuda(ConstraintsTestData *testData, t_pbc pbc)
+        {
+            std::unique_ptr<LincsCuda> lincsCuda;
+            lincsCuda = std::make_unique<LincsCuda>(testData->numAtoms_,
+                                                    testData->ir_.nLincsIter,
+                                                    testData->ir_.nProjOrder);
+            lincsCuda->set(testData->idef_, testData->md_);
+            lincsCuda->setPbc(&pbc);
+            lincsCuda->copyCoordinatesToGpu(as_rvec_array(testData->x_.data()),
+                                            as_rvec_array(testData->xPrime_.data()));
+            lincsCuda->copyVelocitiesToGpu(as_rvec_array(testData->v_.data()));
+            lincsCuda->apply(true, testData->invdt_, testData->computeVirial_, testData->virialScaled_);
+            lincsCuda->copyCoordinatesFromGpu(as_rvec_array(testData->xPrime_.data()));
+            lincsCuda->copyVelocitiesFromGpu(as_rvec_array(testData->v_.data()));
+        }
+
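The per-parameter dispatch that routes a test to this helper is not visible in these hunks; a sketch of how such a dispatch could look, where the CPU-side helper name is hypothetical:

    // Sketch of the parameterized dispatch (illustration only).
    std::string pbcName, algorithmName;
    std::tie(pbcName, algorithmName) = GetParam();
    if (algorithmName == "LINCS_CUDA")
    {
        applyLincsCuda(testData.get(), pbc);      // GPU path added by this patch
    }
    else
    {
        applyCpuConstraints(testData.get(), pbc,  // hypothetical CPU path covering
                            algorithmName);       // "SHAKE" and "LINCS"
    }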
        /*! \brief
         * The test on the final length of constrained bonds.
         *
         * Goes through all the constraints and checks if the final length of all the constraints is equal
@@ -613,14 +672,14 @@ class ConstraintsTest : public ::testing::TestWithParam<ConstraintsTestParameters>
                              std::vector<RVec>          xPrimeRef,
                              FloatingPointTolerance     tolerance,
                              const ConstraintsTestData &testData)
        {
-            for (int i = 0; i < testData.nAtom_; i++)
+            for (int i = 0; i < testData.numAtoms_; i++)
            {
                for (int d = 0; d < DIM; d++)
                {
@@ -721,13 +780,11 @@ class ConstraintsTest : public ::testing::TestWithParam<ConstraintsTestParameters>
 
 TEST_P(ConstraintsTest, SingleConstraint){
-    std::string title = "one constraint (e.g. OH)";
-    int nAtom = 2;
+    std::string title    = "one constraint (e.g. OH)";
+    int         numAtoms = 2;
 
     std::vector<real> masses = {1.0, 12.0};
     std::vector<int>  constraints = {0, 0, 1};
@@ -744,24 +801,24 @@ TEST_P(ConstraintsTest, SingleConstraint){
 
     tensor virialScaledRef = {{-5.58e-04,  5.58e-04,  0.00e+00 },
                               { 5.58e-04, -5.58e-04,  0.00e+00 },
-                              { 0.00e+00,  0.00e+00,  0.00e+00 } };
+                              { 0.00e+00,  0.00e+00,  0.00e+00 }};
 
     real     shakeTolerance         = 0.0001;
     gmx_bool shakeUseSOR            = false;
 
-    int      lincsNIter             = 1;
-    int      lincsNProjOrder        = 4;
-    real     lincsWarnAngle         = 30.0;
+    int      lincsNIter             = 1;
+    int      lincsExpansionOrder    = 4;
+    real     lincsWarnAngle         = 30.0;
 
     std::unique_ptr<ConstraintsTestData> testData = std::make_unique<ConstraintsTestData>
-            (title, nAtom, masses,
+            (title, numAtoms, masses,
             constraints, constraintsR0,
             true, virialScaledRef,
             false, 0,
             real(0.0), real(0.001),
             x, xPrime, v,
             shakeTolerance, shakeUseSOR,
-            lincsNIter, lincsNProjOrder, lincsWarnAngle);
+            lincsNIter, lincsExpansionOrder, lincsWarnAngle);
     std::string pbcName;
     std::string algorithmName;
     std::tie(pbcName, algorithmName) = GetParam();
@@ -781,11 +838,11 @@ TEST_P(ConstraintsTest, SingleConstraint){
 
 TEST_P(ConstraintsTest, TwoDisjointConstraints){
 
-    std::string title = "two disjoint constraints";
-    int nAtom = 4;
-    std::vector<real> masses = {0.5, 1.0/3.0, 0.25, 1.0};
-    std::vector<int> constraints = {0, 0, 1, 1, 2, 3};
-    std::vector<real> constraintsR0 = {2.0, 1.0};
+    std::string       title         = "two disjoint constraints";
+    int               numAtoms      = 4;
+    std::vector<real> masses        = {0.5, 1.0/3.0, 0.25, 1.0};
+    std::vector<int>  constraints   = {0, 0, 1, 1, 2, 3};
+    std::vector<real> constraintsR0 = {2.0, 1.0};
 
     std::vector<RVec> x = {{  2.50, -3.10, 15.70 },
@@ -803,24 +860,26 @@ TEST_P(ConstraintsTest, TwoDisjointConstraints){
                            { 0.0, 0.0, 1.0 },
                            { 0.0, 0.0, 0.0 }};
 
-    tensor virialScaledRef = {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}};
+    tensor virialScaledRef = {{ 3.3e-03, -1.7e-04,  5.6e-04 },
+                              {-1.7e-04,  8.9e-06, -2.8e-05 },
+                              { 5.6e-04, -2.8e-05,  8.9e-05 }};
 
     real     shakeTolerance         = 0.0001;
     gmx_bool shakeUseSOR            = false;
 
-    int      lincsNIter             = 1;
-    int      lincsNProjOrder        = 4;
-    real     lincsWarnAngle         = 30.0;
+    int      lincsNIter             = 1;
+    int      lincsExpansionOrder    = 4;
+    real     lincsWarnAngle         = 30.0;
 
     std::unique_ptr<ConstraintsTestData> testData = std::make_unique<ConstraintsTestData>
-            (title, nAtom, masses,
+            (title, numAtoms, masses,
             constraints, constraintsR0,
-            false, virialScaledRef,
+            true, virialScaledRef,
            false, 0,
            real(0.0), real(0.001),
            x, xPrime, v,
            shakeTolerance, shakeUseSOR,
-            lincsNIter, lincsNProjOrder, lincsWarnAngle);
+            lincsNIter, lincsExpansionOrder, lincsWarnAngle);
 
     std::string pbcName;
     std::string algorithmName;
@@ -835,15 +894,17 @@ TEST_P(ConstraintsTest, TwoDisjointConstraints){
 
     checkCOMCoordinates(absoluteTolerance(0.0001), *testData);
     checkCOMVelocity(absoluteTolerance(0.0001), *testData);
+    checkVirialTensor(absoluteTolerance(0.0001), *testData);
+
 }
 
 TEST_P(ConstraintsTest, ThreeSequentialConstraints){
 
-    std::string title = "three atoms, connected longitudinally (e.g. CH2)";
-    int nAtom = 3;
-    std::vector<real> masses = {1.0, 12.0, 16.0 };
-    std::vector<int> constraints = {0, 0, 1, 1, 1, 2};
-    std::vector<real> constraintsR0 = {0.1, 0.2};
+    std::string       title         = "three atoms, connected longitudinally (e.g. CH2)";
+    int               numAtoms      = 3;
+    std::vector<real> masses        = {1.0, 12.0, 16.0 };
+    std::vector<int>  constraints   = {0, 0, 1, 1, 1, 2};
+    std::vector<real> constraintsR0 = {0.1, 0.2};
 
     real oneTenthOverSqrtTwo    = 0.1_real / std::sqrt(2.0_real);
     real twoTenthsOverSqrtThree = 0.2_real / std::sqrt(3.0_real);
@@ -860,24 +921,26 @@ TEST_P(ConstraintsTest, ThreeSequentialConstraints){
                            { 0.0, 1.0, 0.0 },
                            { 0.0, 0.0, 1.0 }};
 
-    tensor virialScaledRef = {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}};
+    tensor virialScaledRef = {{ 4.14e-03, 4.14e-03, 3.31e-03},
+                              { 4.14e-03, 4.14e-03, 3.31e-03},
+                              { 3.31e-03, 3.31e-03, 3.31e-03}};
 
     real     shakeTolerance         = 0.0001;
     gmx_bool shakeUseSOR            = false;
 
-    int      lincsNIter             = 1;
-    int      lincsNProjOrder        = 4;
-    real     lincsWarnAngle         = 30.0;
+    int      lincsNIter             = 1;
+    int      lincsExpansionOrder    = 4;
+    real     lincsWarnAngle         = 30.0;
 
     std::unique_ptr<ConstraintsTestData> testData = std::make_unique<ConstraintsTestData>
-            (title, nAtom, masses,
+            (title, numAtoms, masses,
             constraints, constraintsR0,
-            false, virialScaledRef,
+            true, virialScaledRef,
            false, 0,
            real(0.0), real(0.001),
            x, xPrime, v,
            shakeTolerance, shakeUseSOR,
-            lincsNIter, lincsNProjOrder, lincsWarnAngle);
+            lincsNIter, lincsExpansionOrder, lincsWarnAngle);
 
     std::string pbcName;
     std::string algorithmName;
@@ -892,15 +955,17 @@ TEST_P(ConstraintsTest, ThreeSequentialConstraints){
 
     checkCOMCoordinates(absoluteTolerance(0.0001), *testData);
     checkCOMVelocity(absoluteTolerance(0.0001), *testData);
+    checkVirialTensor(absoluteTolerance(0.0001), *testData);
+
 }
 
 TEST_P(ConstraintsTest, ThreeConstraintsWithCentralAtom){
 
-    std::string title = "three atoms, connected to the central atom (e.g. CH3)";
-    int nAtom = 4;
-    std::vector<real> masses = {12.0, 1.0, 1.0, 1.0 };
-    std::vector<int> constraints = {0, 0, 1, 0, 0, 2, 0, 0, 3};
-    std::vector<real> constraintsR0 = {0.1};
+    std::string       title         = "three atoms, connected to the central atom (e.g. CH3)";
+    int               numAtoms      = 4;
+    std::vector<real> masses        = {12.0, 1.0, 1.0, 1.0 };
+    std::vector<int>  constraints   = {0, 0, 1, 0, 0, 2, 0, 0, 3};
+    std::vector<real> constraintsR0 = {0.1};
 
     std::vector<RVec> x = {{ 0.00, 0.00, 0.00 },
@@ -918,24 +983,26 @@ TEST_P(ConstraintsTest, ThreeConstraintsWithCentralAtom){
                            { 1.0, 0.0, 0.0 },
                            { 1.0, 0.0, 0.0 }};
 
-    tensor virialScaledRef = {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}};
+    tensor virialScaledRef = {{7.14e-04, 0.00e+00, 0.00e+00},
+                              {0.00e+00, 1.08e-03, 0.00e+00},
+                              {0.00e+00, 0.00e+00, 1.15e-03}};
 
     real     shakeTolerance         = 0.0001;
     gmx_bool shakeUseSOR            = false;
 
-    int      lincsNIter             = 1;
-    int      lincsNProjOrder        = 4;
-    real     lincsWarnAngle         = 30.0;
+    int      lincsNIter             = 1;
+    int      lincsExpansionOrder    = 4;
+    real     lincsWarnAngle         = 30.0;
 
     std::unique_ptr<ConstraintsTestData> testData = std::make_unique<ConstraintsTestData>
-            (title, nAtom, masses,
+            (title, numAtoms, masses,
             constraints, constraintsR0,
-            false, virialScaledRef,
+            true, virialScaledRef,
            false, 0,
            real(0.0), real(0.001),
            x, xPrime, v,
            shakeTolerance, shakeUseSOR,
-            lincsNIter, lincsNProjOrder, lincsWarnAngle);
+            lincsNIter, lincsExpansionOrder, lincsWarnAngle);
 
     std::string pbcName;
     std::string algorithmName;
@@ -950,15 +1017,16 @@ TEST_P(ConstraintsTest, ThreeConstraintsWithCentralAtom){
 
     checkCOMCoordinates(absoluteTolerance(0.0001), *testData);
     checkCOMVelocity(absoluteTolerance(0.0001), *testData);
+    checkVirialTensor(absoluteTolerance(0.0001), *testData);
 }
 
 TEST_P(ConstraintsTest, FourSequentialConstraints){
 
-    std::string title = "four atoms, connected longitudinally";
-    int nAtom = 4;
-    std::vector<real> masses = {0.5, 1.0/3.0, 0.25, 1.0};
-    std::vector<int> constraints = {0, 0, 1, 1, 1, 2, 2, 2, 3};
-    std::vector<real> constraintsR0 = {2.0, 1.0, 1.0};
+    std::string       title         = "four atoms, connected longitudinally";
+    int               numAtoms      = 4;
+    std::vector<real> masses        = {0.5, 1.0/3.0, 0.25, 1.0};
+    std::vector<int>  constraints   = {0, 0, 1, 1, 1, 2, 2, 2, 3};
+    std::vector<real> constraintsR0 = {2.0, 1.0, 1.0};
 
     std::vector<RVec> x = {{  2.50, -3.10, 15.70 },
@@ -976,24 +1044,26 @@ TEST_P(ConstraintsTest, FourSequentialConstraints){
                            {  0.0,  0.0, -4.0 },
                            {  0.0,  0.0, -1.0 }};
 
-    tensor virialScaledRef = {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}};
+    tensor virialScaledRef = {{ 1.15e-01, -4.20e-03,  2.12e-02},
+                              {-4.20e-03,  1.70e-04, -6.41e-04},
+                              { 2.12e-02, -6.41e-04,  5.45e-03}};
 
     real     shakeTolerance         = 0.0001;
     gmx_bool shakeUseSOR            = false;
 
-    int      lincsNIter             = 4;
-    int      lincsNProjOrder        = 8;
-    real     lincsWarnAngle         = 30.0;
+    int      lincsNIter             = 4;
+    int      lincsExpansionOrder    = 8;
+    real     lincsWarnAngle         = 30.0;
 
     std::unique_ptr<ConstraintsTestData> testData = std::make_unique<ConstraintsTestData>
-            (title, nAtom, masses,
+            (title, numAtoms, masses,
             constraints, constraintsR0,
-            false, virialScaledRef,
+            true, virialScaledRef,
            false, 0,
            real(0.0), real(0.001),
            x, xPrime, v,
            shakeTolerance, shakeUseSOR,
-            lincsNIter, lincsNProjOrder, lincsWarnAngle);
+            lincsNIter, lincsExpansionOrder, lincsWarnAngle);
 
     std::string pbcName;
     std::string algorithmName;
@@ -1008,15 +1078,17 @@ TEST_P(ConstraintsTest, FourSequentialConstraints){
 
     checkCOMCoordinates(absoluteTolerance(0.0001), *testData);
     checkCOMVelocity(absoluteTolerance(0.0001), *testData);
+    checkVirialTensor(absoluteTolerance(0.01), *testData);
+
 }
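For reference, the two PBC setups named in the test parameters can be built with the standard set_pbc() call; a minimal sketch follows, where the box values are hypothetical:

    // Sketch: constructing t_pbc for the "PBCNone" and "PBCXYZ" parameter values.
    t_pbc  pbc;
    matrix box = {{10.0, 0.0, 0.0}, {0.0, 10.0, 0.0}, {0.0, 0.0, 10.0}}; // hypothetical box
    if (pbcName == "PBCXYZ")
    {
        set_pbc(&pbc, epbcXYZ, box);
    }
    else // "PBCNone"
    {
        set_pbc(&pbc, epbcNONE, box); // box contents are irrelevant without periodicity
    }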
 
 TEST_P(ConstraintsTest, TriangleOfConstraints){
 
-    std::string title = "basic triangle (tree atoms, connected to each other)";
-    int nAtom = 3;
-    std::vector<real> masses = {1.0, 1.0, 1.0};
-    std::vector<int> constraints = {0, 0, 1, 2, 0, 2, 1, 1, 2};
-    std::vector<real> constraintsR0 = {0.1, 0.1, 0.1};
+    std::string       title         = "basic triangle (three atoms, connected to each other)";
+    int               numAtoms      = 3;
+    std::vector<real> masses        = {1.0, 1.0, 1.0};
+    std::vector<int>  constraints   = {0, 0, 1, 2, 0, 2, 1, 1, 2};
+    std::vector<real> constraintsR0 = {0.1, 0.1, 0.1};
 
     real oneTenthOverSqrtTwo = 0.1_real / std::sqrt(2.0_real);
@@ -1032,24 +1104,26 @@ TEST_P(ConstraintsTest, TriangleOfConstraints){
                            { -2.0, -2.0, -2.0 },
                            {  1.0,  1.0,  1.0 }};
 
-    tensor virialScaledRef = {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}};
+    tensor virialScaledRef = {{ 6.00e-04, -1.61e-03,  1.01e-03},
+                              {-1.61e-03,  2.53e-03, -9.25e-04},
+                              { 1.01e-03, -9.25e-04, -8.05e-05}};
 
     real     shakeTolerance         = 0.0001;
     gmx_bool shakeUseSOR            = false;
 
-    int      lincsNIter             = 1;
-    int      lincsNProjOrder        = 4;
-    real     lincsWarnAngle         = 30.0;
+    int      lincsNIter             = 1;
+    int      lincsExpansionOrder    = 4;
+    real     lincsWarnAngle         = 30.0;
 
     std::unique_ptr<ConstraintsTestData> testData = std::make_unique<ConstraintsTestData>
-            (title, nAtom, masses,
+            (title, numAtoms, masses,
             constraints, constraintsR0,
-            false, virialScaledRef,
+            true, virialScaledRef,
            false, 0,
            real(0.0), real(0.001),
            x, xPrime, v,
            shakeTolerance, shakeUseSOR,
-            lincsNIter, lincsNProjOrder, lincsWarnAngle);
+            lincsNIter, lincsExpansionOrder, lincsWarnAngle);
 
     std::string pbcName;
     std::string algorithmName;
@@ -1064,12 +1138,22 @@ TEST_P(ConstraintsTest, TriangleOfConstraints){
 
     checkCOMCoordinates(absoluteTolerance(0.0001), *testData);
     checkCOMVelocity(absoluteTolerance(0.0001), *testData);
+    checkVirialTensor(absoluteTolerance(0.00001), *testData);
+
 }
 
+#if GMX_GPU == GMX_GPU_CUDA
+INSTANTIATE_TEST_CASE_P(WithParameters, ConstraintsTest,
+                            ::testing::Combine(::testing::Values("PBCNone", "PBCXYZ"),
+                                               ::testing::Values("SHAKE", "LINCS", "LINCS_CUDA")));
+#endif
+
+#if GMX_GPU != GMX_GPU_CUDA
 INSTANTIATE_TEST_CASE_P(WithParameters, ConstraintsTest,
                             ::testing::Combine(::testing::Values("PBCNone", "PBCXYZ"),
                                                ::testing::Values("SHAKE", "LINCS")));
+#endif
 
 } // namespace test
 } // namespace gmx
-- 
2.11.4.GIT