From 1c8eb7c5398332b4d485f2cba086c3d8d80068c2 Mon Sep 17 00:00:00 2001
From: Artem Zhmurov <zhmurov@gmail.com>
Date: Mon, 18 Mar 2019 19:03:59 +0100
Subject: [PATCH] Combine CUDA Leap-Frog, LINCS and SETTLE. I.

This is the first step in combining constraints and integrator
into "UpdateAndConstraints" module. The initial merge does not
imply any performance optimisation or code clean-up. Hence, this
patch keeps all the temporary infrastructure that was built
around SETTLE, LINCS and Leap-Frog to allow them to function as
a separate units. In the following commits, this infrastructure
will be removed and these three implementations will be more closely
integrated. To enable, set GMX_UPDATE_CONSTRAIN_GPU environment
variable. Note, that environment variables GMX_LINCS_GPU,
GMX_SETTLE_GPU and GMX_INTEGRATE_GPU will no longer work.

Refs #2816, #2888

Change-Id: I8730aad0ecaa0230686fe89d1157b0da2f01f7bc
---
 src/gromacs/mdlib/CMakeLists.txt                 |   8 +-
 src/gromacs/mdlib/constr.cpp                     | 144 +++-------
 src/gromacs/mdlib/update_constrain_cuda.h        | 188 +++++++++++++
 src/gromacs/mdlib/update_constrain_cuda_impl.cpp | 139 ++++++++++
 src/gromacs/mdlib/update_constrain_cuda_impl.cu  | 335 +++++++++++++++++++++++
 src/gromacs/mdlib/update_constrain_cuda_impl.h   | 215 +++++++++++++++
 src/gromacs/mdrun/md.cpp                         |  73 +++--
 7 files changed, 954 insertions(+), 148 deletions(-)
 create mode 100644 src/gromacs/mdlib/update_constrain_cuda.h
 create mode 100644 src/gromacs/mdlib/update_constrain_cuda_impl.cpp
 create mode 100644 src/gromacs/mdlib/update_constrain_cuda_impl.cu
 create mode 100644 src/gromacs/mdlib/update_constrain_cuda_impl.h
diff --git a/src/gromacs/mdlib/CMakeLists.txt b/src/gromacs/mdlib/CMakeLists.txt
index e613fecab6..d543f04651 100644
--- a/src/gromacs/mdlib/CMakeLists.txt
+++ b/src/gromacs/mdlib/CMakeLists.txt
@@ -40,14 +40,10 @@ if (BUILD_TESTING)
 endif()
 if(GMX_USE_CUDA)
     gmx_add_libgromacs_sources(
+       leapfrog_cuda_impl.cu
        lincs_cuda_impl.cu
        settle_cuda_impl.cu
-       )
-endif()
-
-if(GMX_USE_CUDA)
-    gmx_add_libgromacs_sources(
-       leapfrog_cuda_impl.cu
+       update_constrain_cuda_impl.cu
        )
 endif()
 
diff --git a/src/gromacs/mdlib/constr.cpp b/src/gromacs/mdlib/constr.cpp
index 3e2673ce75..fe61663bff 100644
--- a/src/gromacs/mdlib/constr.cpp
+++ b/src/gromacs/mdlib/constr.cpp
@@ -62,9 +62,7 @@
 #include "gromacs/math/vec.h"
 #include "gromacs/mdlib/gmx_omp_nthreads.h"
 #include "gromacs/mdlib/lincs.h"
-#include "gromacs/mdlib/lincs_cuda.h"
 #include "gromacs/mdlib/settle.h"
-#include "gromacs/mdlib/settle_cuda.h"
 #include "gromacs/mdlib/shake.h"
 #include "gromacs/mdtypes/commrec.h"
 #include "gromacs/mdtypes/inputrec.h"
@@ -89,11 +87,6 @@
 namespace gmx
 {
 
-//! Whether the GPU version of LINCS should be used.
-static const bool c_useGpuLincs  = (getenv("GMX_LINCS_GPU")  != nullptr);
-//! Whether the GPU version of SETTLE should be used.
-static const bool c_useGpuSettle = (getenv("GMX_SETTLE_GPU") != nullptr);
-
 /* \brief Impl class for Constraints
  *
  * \todo Members like md, idef are valid only for the lifetime of a
@@ -185,15 +178,11 @@ class Constraints::Impl
         /*!\brief Input options.
          *
          * \todo Replace with IMdpOptions */
-        const t_inputrec            &ir;
+        const t_inputrec &ir;
         //! Flop counting support.
-        t_nrnb                      *nrnb = nullptr;
+        t_nrnb           *nrnb = nullptr;
         //! Tracks wallcycle usage.
-        gmx_wallcycle               *wcycle;
-        //! Valid LINCS CUDA object when that implementation is being used, nullptr otherwise.
-        std::unique_ptr<LincsCuda>   lincsCuda;
-        //! SETTLE CUDA object or a dummy if CUDA is not enabled for SETTLE.
-        std::unique_ptr<SettleCuda>  settleCuda;
+        gmx_wallcycle    *wcycle;
 };
 
 Constraints::~Constraints() = default;
@@ -493,25 +482,6 @@ Constraints::Impl::apply(bool                  bLog,
             bDump = TRUE;
         }
     }
-    else
-    {
-        if (lincsCuda != nullptr && c_useGpuLincs)
-        {
-            GMX_RELEASE_ASSERT(ir.efep == efepNO || dvdlambda == nullptr,
-                               "Free energy perturbation is not supported by the GPU version of LINCS.\n");
-            lincsCuda->setPbc(pbc_null);
-            lincsCuda->copyCoordinatesToGpu(x, xprime);
-            lincsCuda->copyVelocitiesToGpu(v);
-            lincsCuda->apply(v != nullptr,
-                             invdt,
-                             vir != nullptr, vir_r_m_dr);
-            lincsCuda->copyCoordinatesFromGpu(xprime);
-            if (v != nullptr)
-            {
-                lincsCuda->copyVelocitiesFromGpu(v);
-            }
-        }
-    }
 
     if (shaked != nullptr)
     {
@@ -540,61 +510,41 @@ Constraints::Impl::apply(bool                  bLog,
         switch (econq)
         {
             case ConstraintVariable::Positions:
-                // If settled was initialized, use CPU version of settle
-                if (settled != nullptr)
-                {
 #pragma omp parallel for num_threads(nth) schedule(static)
-                    for (th = 0; th < nth; th++)
+                for (th = 0; th < nth; th++)
+                {
+                    try
                     {
-                        try
+                        if (th > 0)
                         {
-                            if (th > 0)
-                            {
-                                clear_mat(vir_r_m_dr_th[th]);
-                            }
-
-                            csettle(settled,
-                                    nth, th,
-                                    pbc_null,
-                                    x[0], xprime[0],
-                                    invdt, v ? v[0] : nullptr,
-                                    vir != nullptr,
-                                    th == 0 ? vir_r_m_dr : vir_r_m_dr_th[th],
-                                    th == 0 ? &bSettleErrorHasOccurred0 : &bSettleErrorHasOccurred[th]);
+                            clear_mat(vir_r_m_dr_th[th]);
                         }
-                        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
-                    }
-                    inc_nrnb(nrnb, eNR_SETTLE, nsettle);
-                    if (v != nullptr)
-                    {
-                        inc_nrnb(nrnb, eNR_CONSTR_V, nsettle*3);
-                    }
-                    if (vir != nullptr)
-                    {
-                        inc_nrnb(nrnb, eNR_CONSTR_VIR, nsettle*3);
+
+                        csettle(settled,
+                                nth, th,
+                                pbc_null,
+                                x[0], xprime[0],
+                                invdt, v ? v[0] : nullptr,
+                                vir != nullptr,
+                                th == 0 ? vir_r_m_dr : vir_r_m_dr_th[th],
+                                th == 0 ? &bSettleErrorHasOccurred0 : &bSettleErrorHasOccurred[th]);
                     }
+                    GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
                 }
-                else
+                inc_nrnb(nrnb, eNR_SETTLE, nsettle);
+                if (v != nullptr)
                 {
-                    // If CPU version of SETTLE was not initialized, GPU version should have being.
-                    GMX_ASSERT(settleCuda != nullptr, "There are settles, but nither CPU nor CUDA version of SETTLE was initialized.");
-                    settleCuda->setPbc(pbc_null);
-                    settleCuda->copyCoordinatesToGpu(x, xprime);
-                    settleCuda->copyVelocitiesToGpu(v);
-                    settleCuda->apply(v   != nullptr, invdt,
-                                      vir != nullptr, vir_r_m_dr);
-                    settleCuda->copyCoordinatesFromGpu(xprime);
-                    if (v != nullptr)
-                    {
-                        settleCuda->copyVelocitiesFromGpu(v);
-                    }
+                    inc_nrnb(nrnb, eNR_CONSTR_V, nsettle*3);
+                }
+                if (vir != nullptr)
+                {
+                    inc_nrnb(nrnb, eNR_CONSTR_VIR, nsettle*3);
                 }
                 break;
             case ConstraintVariable::Velocities:
             case ConstraintVariable::Derivative:
             case ConstraintVariable::Force:
             case ConstraintVariable::ForceDispl:
-                GMX_RELEASE_ASSERT(settled != nullptr, "SETTLE projection correction is not implemented on GPU.");
 #pragma omp parallel for num_threads(nth) schedule(static)
                 for (th = 0; th < nth; th++)
                 {
@@ -967,14 +917,7 @@ Constraints::Impl::setConstraints(const gmx_localtop_t &top,
          */
         if (ir.eConstrAlg == econtLINCS)
         {
-            if (c_useGpuLincs)
-            {
-                lincsCuda->set(top.idef, md);
-            }
-            else
-            {
-                set_lincs(top.idef, md, EI_DYNAMICS(ir.eI), cr, lincsd);
-            }
+            set_lincs(top.idef, md, EI_DYNAMICS(ir.eI), cr, lincsd);
         }
         if (ir.eConstrAlg == econtSHAKE)
         {
@@ -996,10 +939,6 @@ Constraints::Impl::setConstraints(const gmx_localtop_t &top,
         settle_set_constraints(settled,
                                &idef->il[F_SETTLE], md);
     }
-    else if (settleCuda != nullptr && c_useGpuSettle)
-    {
-        settleCuda->set(top.idef, md);
-    }
 
     /* Make a selection of the local atoms for essential dynamics */
     if (ed && cr->dd)
@@ -1128,20 +1067,10 @@ Constraints::Impl::Impl(const gmx_mtop_t     &mtop_p,
 
         if (ir.eConstrAlg == econtLINCS)
         {
-            if (c_useGpuLincs)
-            {
-                fprintf(log, "Initializing LINCS on a GPU for %d atoms\n", mtop.natoms);
-                GMX_RELEASE_ASSERT(nflexcon == 0, "Flexible constraints are not supported by the GPU-based implementation of LINCS.\n");
-                GMX_RELEASE_ASSERT(!DOMAINDECOMP(cr), "CUDA version of LINCS is not supported with domain decomposition");
-                lincsCuda = std::make_unique<LincsCuda>(mtop.natoms, ir.nLincsIter, ir.nProjOrder);
-            }
-            else
-            {
-                lincsd = init_lincs(log, mtop,
-                                    nflexcon, at2con_mt,
-                                    DOMAINDECOMP(cr) && cr->dd->splitConstraints,
-                                    ir.nLincsIter, ir.nProjOrder);
-            }
+            lincsd = init_lincs(log, mtop,
+                                nflexcon, at2con_mt,
+                                DOMAINDECOMP(cr) && cr->dd->splitConstraints,
+                                ir.nLincsIter, ir.nProjOrder);
         }
 
         if (ir.eConstrAlg == econtSHAKE)
@@ -1170,17 +1099,8 @@ Constraints::Impl::Impl(const gmx_mtop_t     &mtop_p,
 
         bInterCGsettles = inter_charge_group_settles(mtop);
 
-        if (c_useGpuSettle)
-        {
-            fprintf(log, "Initializing SETTLE on a GPU for %d atoms\n", mtop.natoms);
-            GMX_RELEASE_ASSERT(!DOMAINDECOMP(cr), "CUDA version of SETTLE is not supported with domain decomposition");
-            gmx_omp_nthreads_set(emntSETTLE, 1);
-            settleCuda = std::make_unique<SettleCuda>(mtop.natoms, mtop);
-        }
-        else
-        {
-            settled         = settle_init(mtop);
-        }
+        settled         = settle_init(mtop);
+
         /* Make an atom to settle index for use in domain decomposition */
         for (size_t mt = 0; mt < mtop.moltype.size(); mt++)
         {
diff --git a/src/gromacs/mdlib/update_constrain_cuda.h b/src/gromacs/mdlib/update_constrain_cuda.h
new file mode 100644
index 0000000000..f80b7c91dd
--- /dev/null
+++ b/src/gromacs/mdlib/update_constrain_cuda.h
@@ -0,0 +1,188 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \libinternal \file
+ *
+ * \brief Declaration of high-level functions of CUDA implementation of update and constrain class.
+ *
+ * \todo This should only list interfaces needed for libgromacs clients (e.g.
+ *       management of coordinates, velocities and forces should not be here).
+ * \todo Change "cuda" suffix to "gpu"
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \ingroup module_mdlib
+ * \inlibraryapi
+ */
+#ifndef GMX_MDLIB_UPDATE_CONSTRAIN_CUDA_H
+#define GMX_MDLIB_UPDATE_CONSTRAIN_CUDA_H
+
+#include "gromacs/mdtypes/inputrec.h"
+#include "gromacs/mdtypes/mdatom.h"
+#include "gromacs/pbcutil/pbc.h"
+#include "gromacs/topology/idef.h"
+#include "gromacs/utility/classhelpers.h"
+
+namespace gmx
+{
+
+class UpdateConstrainCuda
+{
+
+    public:
+        /*! \brief Create Update-Constrain object.
+         *
+         * \param[in] numAtoms  Number of atoms.
+         *
+         * \param[in] ir        Input record data: LINCS takes number of iterations and order of
+         *                      projection from it.
+         * \param[in] mtop      Topology of the system: SETTLE gets the masses for O and H atoms
+         *                      and target O-H and H-H distances from this object.
+         */
+        UpdateConstrainCuda(int                numAtoms,
+                            const t_inputrec  &ir,
+                            const gmx_mtop_t  &mtop);
+
+        ~UpdateConstrainCuda();
+
+        /*! \brief Integrate
+         *
+         * Integrates the equation of motion using Leap-Frog algorithm and applies
+         * LINCS and SETTLE constraints.
+         * Updates d_xp_ and d_v_ fields of this object.
+         *
+         * \param[in] dt                Timestep
+         * \param[in] updateVelocities  If the velocities should be constrained.
+         * \param[in] computeVirial     If virial should be updated.
+         * \param[out] virial           Place to save virial tensor.
+         */
+        void integrate(real       dt,
+                       bool       updateVelocities,
+                       bool       computeVirial,
+                       tensor     virial);
+
+        /*! \brief
+         * Update data-structures (e.g. after NB search step).
+         *
+         * \param[in] idef    System topology
+         * \param[in] md      Atoms data. Can be used to update masses if needed (not used now).
+         */
+        void set(const t_idef     &idef,
+                 const t_mdatoms  &md);
+
+        /*! \brief
+         * Update PBC data.
+         *
+         * Converts PBC data from t_pbc into the PbcAiuc format and stores the latter.
+         *
+         * \param[in] pbc The PBC data in t_pbc format.
+         */
+        void setPbc(const t_pbc *pbc);
+
+        /*! \brief
+         * Copy coordinates from CPU to GPU.
+         *
+         * The data are assumed to be in float3/fvec format (single precision).
+         *
+         * \param[in] h_x  CPU pointer where coordinates should be copied from.
+         */
+        void copyCoordinatesToGpu(const rvec *h_x);
+
+        /*! \brief
+         * Copy velocities from CPU to GPU.
+         *
+         * The data are assumed to be in float3/fvec format (single precision).
+         *
+         * \param[in] h_v  CPU pointer where velocities should be copied from.
+         */
+        void copyVelocitiesToGpu(const rvec *h_v);
+
+        /*! \brief
+         * Copy forces from CPU to GPU.
+         *
+         * The data are assumed to be in float3/fvec format (single precision).
+         *
+         * \param[in] h_f  CPU pointer where forces should be copied from.
+         */
+        void copyForcesToGpu(const rvec *h_f);
+
+        /*! \brief
+         * Copy coordinates from GPU to CPU.
+         *
+         * The data are assumed to be in float3/fvec format (single precision).
+         *
+         * \param[out] h_xp CPU pointer where coordinates should be copied to.
+         */
+        void copyCoordinatesFromGpu(rvec *h_xp);
+
+        /*! \brief
+         * Copy velocities from GPU to CPU.
+         *
+         * The velocities are assumed to be in float3/fvec format (single precision).
+         *
+         * \param[in] h_v  Pointer to velocities data.
+         */
+        void copyVelocitiesFromGpu(rvec *h_v);
+
+        /*! \brief
+         * Copy forces from GPU to CPU.
+         *
+         * The forces are assumed to be in float3/fvec format (single precision).
+         *
+         * \param[in] h_f  Pointer to forces data.
+         */
+        void copyForcesFromGpu(rvec *h_f);
+
+        /*! \brief
+         * Set the internal GPU-memory x, xprime and v pointers.
+         *
+         * Data is not copied. The data are assumed to be in float3/fvec format
+         * (float3 is used internally, but the data layout should be identical).
+         *
+         * \param[in] d_x  Pointer to the coordinates for the input (on GPU)
+         * \param[in] d_xp Pointer to the coordinates for the output (on GPU)
+         * \param[in] d_v  Pointer to the velocities (on GPU)
+         * \param[in] d_f  Pointer to the forces (on GPU)
+         */
+        void setXVFPointers(rvec *d_x, rvec *d_xp, rvec *d_v, rvec *d_f);
+
+    private:
+        class Impl;
+        gmx::PrivateImplPointer<Impl> impl_;
+
+};
+
+} //namespace gmx
+
+#endif
diff --git a/src/gromacs/mdlib/update_constrain_cuda_impl.cpp b/src/gromacs/mdlib/update_constrain_cuda_impl.cpp
new file mode 100644
index 0000000000..fd12faacf0
--- /dev/null
+++ b/src/gromacs/mdlib/update_constrain_cuda_impl.cpp
@@ -0,0 +1,139 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Stub for update and constraints class CPU implementation.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \ingroup module_mdlib
+ */
+#include "gmxpre.h"
+
+#include "config.h"
+
+#include "gromacs/mdlib/update_constrain_cuda.h"
+
+#if GMX_GPU != GMX_GPU_CUDA
+
+namespace gmx
+{
+
+/*!\brief Impl class stub. */
+class UpdateConstrainCuda::Impl
+{
+};
+
+/*!\brief Constructor stub. */
+UpdateConstrainCuda::UpdateConstrainCuda(gmx_unused int                   numAtoms,
+                                         gmx_unused const t_inputrec     &ir,
+                                         gmx_unused const gmx_mtop_t     &mtop)
+    : impl_(nullptr)
+{
+    GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation.");
+}
+
+UpdateConstrainCuda::~UpdateConstrainCuda() = default;
+
+/*!\brief integrate stub. */
+void UpdateConstrainCuda::integrate(gmx_unused const real  dt,
+                                    gmx_unused const bool  updateVelocities,
+                                    gmx_unused const bool  computeVirial,
+                                    gmx_unused tensor      virialScaled)
+{
+    GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation.");
+}
+
+/*!\brief Set method stub. */
+void UpdateConstrainCuda::set(gmx_unused const t_idef     &idef,
+                              gmx_unused const t_mdatoms  &md)
+{
+    GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation.");
+}
+
+/*!\brief Set PBC stub. */
+void UpdateConstrainCuda::setPbc(gmx_unused const t_pbc *pbc)
+{
+    GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation.");
+}
+
+/*! \brief Copy coordinates from provided CPU location to GPU stub. */
+void UpdateConstrainCuda::copyCoordinatesToGpu(gmx_unused const rvec *h_x)
+{
+    GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation.");
+}
+
+/*! \brief Copy velocities from provided CPU location to GPU stub. */
+void UpdateConstrainCuda::copyVelocitiesToGpu(gmx_unused const rvec *h_v)
+{
+    GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation.");
+}
+
+/*! \brief Copy forces from CPU to GPU stub. */
+void UpdateConstrainCuda::copyForcesToGpu(gmx_unused const rvec *h_f)
+{
+    GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation.");
+}
+
+/*! \brief Copy coordinates from GPU to provided CPU location stub. */
+void UpdateConstrainCuda::copyCoordinatesFromGpu(gmx_unused rvec *h_xp)
+{
+    GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation.");
+}
+
+/*! \brief Copy velocities from GPU to provided CPU location stub. */
+void UpdateConstrainCuda::copyVelocitiesFromGpu(gmx_unused rvec *h_v)
+{
+    GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation.");
+}
+
+/*! \brief Copy forces from GPU to CPU stub. */
+void UpdateConstrainCuda::copyForcesFromGpu(gmx_unused rvec *h_f)
+{
+    GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation.");
+}
+
+/*! \brief Set the internal GPU-memory x, xprime and v pointers stub. */
+void UpdateConstrainCuda::setXVFPointers(gmx_unused rvec *d_x,
+                                         gmx_unused rvec *d_xp,
+                                         gmx_unused rvec *d_v,
+                                         gmx_unused rvec *d_f)
+{
+    GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation.");
+}
+
+}      // namespace gmx
+
+#endif /* GMX_GPU != GMX_GPU_CUDA */
diff --git a/src/gromacs/mdlib/update_constrain_cuda_impl.cu b/src/gromacs/mdlib/update_constrain_cuda_impl.cu
new file mode 100644
index 0000000000..957963f49c
--- /dev/null
+++ b/src/gromacs/mdlib/update_constrain_cuda_impl.cu
@@ -0,0 +1,335 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Implements update and constraints class using CUDA.
+ *
+ * The class combines Leap-Frog integrator with LINCS and SETTLE constraints.
+ *
+ * \todo This class should take over the management of coordinates, velocities
+ *       forces, virial, and PBC from its members (i.e. from Leap-Frog, LINCS
+ *       and SETTLE).
+ * \todo The computational procedures in members should be integrated to improve
+ *       computational performance.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \ingroup module_mdlib
+ */
+#include "gmxpre.h"
+
+#include "update_constrain_cuda_impl.h"
+
+#include <assert.h>
+#include <stdio.h>
+
+#include <cmath>
+
+#include <algorithm>
+
+#include "gromacs/gpu_utils/cudautils.cuh"
+#include "gromacs/gpu_utils/devicebuffer.cuh"
+#include "gromacs/gpu_utils/gputraits.cuh"
+#include "gromacs/gpu_utils/vectype_ops.cuh"
+#include "gromacs/math/vec.h"
+#include "gromacs/mdlib/leapfrog_cuda.h"
+#include "gromacs/mdlib/lincs_cuda.h"
+#include "gromacs/mdlib/settle_cuda.h"
+#include "gromacs/mdlib/update_constrain_cuda.h"
+#include "gromacs/pbcutil/pbc.h"
+#include "gromacs/pbcutil/pbc_aiuc_cuda.cuh"
+
+namespace gmx
+{
+
+/*! \brief Integrate
+ *
+ * Integrates the equation of motion using Leap-Frog algorithm and applies
+ * LINCS and SETTLE constraints.
+ * Updates d_xp_ and d_v_ fields of this object.
+ *
+ * \param[in] dt                Timestep
+ * \param[in] updateVelocities  If the velocities should be constrained.
+ * \param[in] computeVirial     If virial should be updated.
+ * \param[out] virial           Place to save virial tensor.
+ */
+void UpdateConstrainCuda::Impl::integrate(const real  dt,
+                                          const bool  updateVelocities,
+                                          const bool  computeVirial,
+                                          tensor      virial)
+{
+    // Clearing virial matrix
+    // TODO There is no point in having saparate virial matrix for constraints
+    clear_mat(virial);
+
+    integrator_->integrate(dt);
+    lincsCuda_->apply(updateVelocities, 1.0/dt, computeVirial, virial);
+    settleCuda_->apply(updateVelocities, 1.0/dt, computeVirial, virial);
+
+    // scaledVirial -> virial (methods above returns scaled values)
+    float scaleFactor = 0.5f/(dt*dt);
+    for (int i = 0; i < DIM; i++)
+    {
+        for (int j = 0; j < DIM; j++)
+        {
+            virial[i][j] = scaleFactor*virial[i][j];
+        }
+    }
+
+    return;
+}
+
+/*! \brief Create Update-Constrain object
+ *
+ * \param[in] numAtoms  Number of atoms.
+ * \param[in] ir        Input record data: LINCS takes number of iterations and order of
+ *                      projection from it.
+ * \param[in] mtop      Topology of the system: SETTLE gets the masses for O and H atoms
+ *                      and target O-H and H-H distances from this object.
+ */
+UpdateConstrainCuda::Impl::Impl(int                numAtoms,
+                                const t_inputrec  &ir,
+                                const gmx_mtop_t  &mtop)
+    : numAtoms_(numAtoms)
+{
+    allocateDeviceBuffer(&d_x_,              numAtoms, nullptr);
+    allocateDeviceBuffer(&d_xp_,             numAtoms, nullptr);
+    allocateDeviceBuffer(&d_v_,              numAtoms, nullptr);
+    allocateDeviceBuffer(&d_f_,              numAtoms, nullptr);
+    allocateDeviceBuffer(&d_inverseMasses_,  numAtoms, nullptr);
+
+    // TODO When the code will be integrated into the schedule, it will be assigned non-default stream.
+    stream_ = nullptr;
+
+    GMX_RELEASE_ASSERT(numAtoms == mtop.natoms, "State and topology number of atoms should be the same.");
+    integrator_ = std::make_unique<LeapFrogCuda>(numAtoms);
+    lincsCuda_  = std::make_unique<LincsCuda>(mtop.natoms, ir.nLincsIter, ir.nProjOrder);
+    settleCuda_ = std::make_unique<SettleCuda>(mtop.natoms, mtop);
+
+    integrator_->setXVFPointers((rvec*)d_x_, (rvec*)d_xp_, (rvec*)d_v_, (rvec*)d_f_);
+    lincsCuda_->setXVPointers((rvec*)d_x_, (rvec*)d_xp_, (rvec*)d_v_);
+    settleCuda_->setXVPointers((rvec*)d_x_, (rvec*)d_xp_,  (rvec*)d_v_);
+}
+
+UpdateConstrainCuda::Impl::~Impl()
+{
+}
+
+/*! \brief
+ * Update data-structures (e.g. after NB search step).
+ *
+ * \param[in] idef    System topology
+ * \param[in] md      Atoms data.
+ */
+void UpdateConstrainCuda::Impl::set(const t_idef      &idef,
+                                    const t_mdatoms   &md)
+{
+    // Integrator should also update something, but it does not even have a method yet
+    integrator_->set(md);
+    lincsCuda_->set(idef, md);
+    settleCuda_->set(idef, md);
+}
+
+/*! \brief
+ * Update PBC data.
+ *
+ * Converts pbc data from t_pbc into the PbcAiuc format and stores the latter.
+ *
+ * \param[in] pbc The PBC data in t_pbc format.
+ */
+void UpdateConstrainCuda::Impl::setPbc(const t_pbc *pbc)
+{
+    setPbcAiuc(pbc->ndim_ePBC, pbc->box, &pbcAiuc_);
+    integrator_->setPbc(pbc);
+    lincsCuda_->setPbc(pbc);
+    settleCuda_->setPbc(pbc);
+}
+
+/*! \brief
+ * Copy coordinates from CPU to GPU.
+ *
+ * The data are assumed to be in float3/fvec format (single precision).
+ *
+ * \param[in] h_x  CPU pointer where coordinates should be copied from.
+ */
+void UpdateConstrainCuda::Impl::copyCoordinatesToGpu(const rvec *h_x)
+{
+    copyToDeviceBuffer(&d_x_, (float3*)h_x, 0, numAtoms_, stream_, GpuApiCallBehavior::Sync, nullptr);
+}
+
+/*! \brief
+ * Copy velocities from CPU to GPU.
+ *
+ * The data are assumed to be in float3/fvec format (single precision).
+ *
+ * \param[in] h_v  CPU pointer where velocities should be copied from.
+ */
+void UpdateConstrainCuda::Impl::copyVelocitiesToGpu(const rvec *h_v)
+{
+    copyToDeviceBuffer(&d_v_, (float3*)h_v, 0, numAtoms_, stream_, GpuApiCallBehavior::Sync, nullptr);
+}
+
+/*! \brief
+ * Copy forces from CPU to GPU.
+ *
+ * The data are assumed to be in float3/fvec format (single precision).
+ *
+ * \param[in] h_f  CPU pointer where forces should be copied from.
+ */
+void UpdateConstrainCuda::Impl::copyForcesToGpu(const rvec *h_f)
+{
+    copyToDeviceBuffer(&d_f_, (float3*)h_f, 0, numAtoms_, stream_, GpuApiCallBehavior::Sync, nullptr);
+}
+
+/*! \brief
+ * Copy coordinates from GPU to CPU.
+ *
+ * The data are assumed to be in float3/fvec format (single precision).
+ *
+ * \param[out] h_xp CPU pointer where coordinates should be copied to.
+ */
+void UpdateConstrainCuda::Impl::copyCoordinatesFromGpu(rvec *h_xp)
+{
+    copyFromDeviceBuffer((float3*)h_xp, &d_xp_, 0, numAtoms_, stream_, GpuApiCallBehavior::Sync, nullptr);
+}
+
+/*! \brief
+ * Copy velocities from GPU to CPU.
+ *
+ * The velocities are assumed to be in float3/fvec format (single precision).
+ *
+ * \param[in] h_v  Pointer to velocities data.
+ */
+void UpdateConstrainCuda::Impl::copyVelocitiesFromGpu(rvec *h_v)
+{
+    copyFromDeviceBuffer((float3*)h_v, &d_v_, 0, numAtoms_, stream_, GpuApiCallBehavior::Sync, nullptr);
+}
+
+/*! \brief
+ * Copy forces from GPU to CPU.
+ *
+ * The forces are assumed to be in float3/fvec format (single precision).
+ *
+ * \param[in] h_f  Pointer to forces data.
+ */
+void UpdateConstrainCuda::Impl::copyForcesFromGpu(rvec *h_f)
+{
+    copyFromDeviceBuffer((float3*)h_f, &d_f_, 0, numAtoms_, stream_, GpuApiCallBehavior::Sync, nullptr);
+}
+
+/*! \brief
+ * Set the internal GPU-memory x, xprime and v pointers.
+ *
+ * Data is not copied. The data are assumed to be in float3/fvec format
+ * (float3 is used internally, but the data layout should be identical).
+ *
+ * \param[in] d_x  Pointer to the coordinates for the input (on GPU)
+ * \param[in] d_xp Pointer to the coordinates for the output (on GPU)
+ * \param[in] d_v  Pointer to the velocities (on GPU)
+ * \param[in] d_f  Pointer to the forces (on GPU)
+ */
+void UpdateConstrainCuda::Impl::setXVFPointers(rvec *d_x, rvec *d_xp, rvec *d_v, rvec *d_f)
+{
+    d_x_  = (float3*)d_x;
+    d_xp_ = (float3*)d_xp;
+    d_v_  = (float3*)d_v;
+    d_f_  = (float3*)d_f;
+}
+
+
+UpdateConstrainCuda::UpdateConstrainCuda(int                numAtoms,
+                                         const t_inputrec  &ir,
+                                         const gmx_mtop_t  &mtop)
+    : impl_(new Impl(numAtoms, ir, mtop))
+{
+}
+
+UpdateConstrainCuda::~UpdateConstrainCuda() = default;
+
+void UpdateConstrainCuda::integrate(const real  dt,
+                                    const bool  updateVelocities,
+                                    const bool  computeVirial,
+                                    tensor      virialScaled)
+{
+    impl_->integrate(dt, updateVelocities, computeVirial, virialScaled);
+}
+
+void UpdateConstrainCuda::set(const t_idef               &idef,
+                              const t_mdatoms gmx_unused &md)
+{
+    impl_->set(idef, md);
+}
+
+void UpdateConstrainCuda::setPbc(const t_pbc *pbc)
+{
+    impl_->setPbc(pbc);
+}
+
+void UpdateConstrainCuda::copyCoordinatesToGpu(const rvec *h_x)
+{
+    impl_->copyCoordinatesToGpu(h_x);
+}
+
+void UpdateConstrainCuda::copyVelocitiesToGpu(const rvec *h_v)
+{
+    impl_->copyVelocitiesToGpu(h_v);
+}
+
+void UpdateConstrainCuda::copyForcesToGpu(const rvec *h_f)
+{
+    impl_->copyForcesToGpu(h_f);
+}
+
+void UpdateConstrainCuda::copyCoordinatesFromGpu(rvec *h_xp)
+{
+    impl_->copyCoordinatesFromGpu(h_xp);
+}
+
+void UpdateConstrainCuda::copyVelocitiesFromGpu(rvec *h_v)
+{
+    impl_->copyVelocitiesFromGpu(h_v);
+}
+
+void UpdateConstrainCuda::copyForcesFromGpu(rvec *h_f)
+{
+    impl_->copyForcesFromGpu(h_f);
+}
+
+void UpdateConstrainCuda::setXVFPointers(rvec *d_x, rvec *d_xp, rvec *d_v, rvec *d_f)
+{
+    impl_->setXVFPointers(d_x, d_xp, d_v, d_f);
+}
+
+} //namespace gmx
diff --git a/src/gromacs/mdlib/update_constrain_cuda_impl.h b/src/gromacs/mdlib/update_constrain_cuda_impl.h
new file mode 100644
index 0000000000..ca526431bf
--- /dev/null
+++ b/src/gromacs/mdlib/update_constrain_cuda_impl.h
@@ -0,0 +1,215 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Declares CUDA implementation class for update and constraints.
+ *
+ * This header file is needed to include from both the device-side
+ * kernels file, and the host-side management code.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \ingroup module_mdlib
+ */
+#ifndef GMX_MDLIB_UPDATE_CONSTRAIN_CUDA_IMPL_H
+#define GMX_MDLIB_UPDATE_CONSTRAIN_CUDA_IMPL_H
+
+#include "gromacs/mdlib/leapfrog_cuda.h"
+#include "gromacs/mdlib/lincs_cuda.h"
+#include "gromacs/mdlib/settle_cuda.h"
+#include "gromacs/mdlib/update_constrain_cuda.h"
+#include "gromacs/mdtypes/inputrec.h"
+#include "gromacs/pbcutil/pbc.h"
+#include "gromacs/pbcutil/pbc_aiuc_cuda.cuh"
+#include "gromacs/topology/idef.h"
+
+namespace gmx
+{
+
+/*! \internal \brief Class with interfaces and data for CUDA version of Update-Constraint. */
+class UpdateConstrainCuda::Impl
+{
+
+    public:
+        /*! \brief Create Update-Constrain object
+         *
+         * \param[in] numAtoms  Number of atoms.
+         * \param[in] ir        Input record data: LINCS takes number of iterations and order of
+         *                      projection from it.
+         * \param[in] mtop      Topology of the system: SETTLE gets the masses for O and H atoms
+         *                      and target O-H and H-H distances from this object.
+         */
+        Impl(int                   numAtoms,
+             const t_inputrec     &ir,
+             const gmx_mtop_t     &mtop);
+
+        ~Impl();
+
+        /*! \brief Integrate
+         *
+         * Integrates the equation of motion using Leap-Frog algorithm and applies
+         * LINCS and SETTLE constraints.
+         * Updates d_xp_ and d_v_ fields of this object.
+         *
+         * \param[in] dt                Timestep
+         * \param[in] updateVelocities  If the velocities should be constrained.
+         * \param[in] computeVirial     If virial should be updated.
+         * \param[out] virial           Place to save virial tensor.
+         */
+        void integrate(const real  dt,
+                       const bool  updateVelocities,
+                       const bool  computeVirial,
+                       tensor      virial);
+
+        /*! \brief
+         * Update data-structures (e.g. after NB search step).
+         *
+         * \param[in] idef    System topology
+         * \param[in] md      Atoms data. Can be used to update masses if needed (not used now).
+         */
+        void set(const t_idef     &idef,
+                 const t_mdatoms  &md);
+
+        /*! \brief
+         * Update PBC data.
+         *
+         * Converts PBC data from t_pbc into the PbcAiuc format and stores the latter.
+         *
+         * \param[in] pbc The PBC data in t_pbc format.
+         */
+        void setPbc(const t_pbc *pbc);
+
+        /*! \brief
+         * Copy coordinates from CPU to GPU.
+         *
+         * The data are assumed to be in float3/fvec format (single precision).
+         *
+         * \param[in] h_x  CPU pointer where coordinates should be copied from.
+         */
+        void copyCoordinatesToGpu(const rvec *h_x);
+
+        /*! \brief
+         * Copy velocities from CPU to GPU.
+         *
+         * The data are assumed to be in float3/fvec format (single precision).
+         *
+         * \param[in] h_v  CPU pointer where velocities should be copied from.
+         */
+        void copyVelocitiesToGpu(const rvec *h_v);
+
+        /*! \brief
+         * Copy forces from CPU to GPU.
+         *
+         * The data are assumed to be in float3/fvec format (single precision).
+         *
+         * \param[in] h_f  CPU pointer where forces should be copied from.
+         */
+        void copyForcesToGpu(const rvec *h_f);
+
+        /*! \brief
+         * Copy coordinates from GPU to CPU.
+         *
+         * The data are assumed to be in float3/fvec format (single precision).
+         *
+         * \param[out] h_xp CPU pointer where coordinates should be copied to.
+         */
+        void copyCoordinatesFromGpu(rvec *h_xp);
+
+        /*! \brief
+         * Copy velocities from GPU to CPU.
+         *
+         * The velocities are assumed to be in float3/fvec format (single precision).
+         *
+         * \param[in] h_v  Pointer to velocities data.
+         */
+        void copyVelocitiesFromGpu(rvec *h_v);
+
+        /*! \brief
+         * Copy forces from GPU to CPU.
+         *
+         * The forces are assumed to be in float3/fvec format (single precision).
+         *
+         * \param[in] h_f  Pointer to forces data.
+         */
+        void copyForcesFromGpu(rvec *h_f);
+
+        /*! \brief
+         * Set the internal GPU-memory x, xprime and v pointers.
+         *
+         * Data is not copied. The data are assumed to be in float3/fvec format
+         * (float3 is used internally, but the data layout should be identical).
+         *
+         * \param[in] d_x   Pointer to the coordinates for the input (on GPU)
+         * \param[in] d_xp  Pointer to the coordinates for the output (on GPU)
+         * \param[in] d_v   Pointer to the velocities (on GPU)
+         * \param[in] d_f   Pointer to the forces (on GPU)
+         */
+        void setXVFPointers(rvec *d_x, rvec *d_xp, rvec *d_v, rvec *d_f);
+
+    private:
+
+        //! CUDA stream
+        cudaStream_t        stream_;
+
+        //! Periodic boundary data
+        PbcAiuc             pbcAiuc_;
+
+        //! Number of atoms
+        int                 numAtoms_;
+
+        //! Coordinates before the timestep (on GPU)
+        float3             *d_x_;
+        //! Coordinates after the timestep (on GPU).
+        float3             *d_xp_;
+        //! Velocities of atoms (on GPU)
+        float3             *d_v_;
+        //! Forces, exerted by atoms (on GPU)
+        float3             *d_f_;
+
+        //! 1/mass for all atoms (GPU)
+        real               *d_inverseMasses_;
+
+        //! Leap-Frog integrator
+        std::unique_ptr<LeapFrogCuda>  integrator_;
+        //! LINCS CUDA object to use for non-water constraints
+        std::unique_ptr<LincsCuda>     lincsCuda_;
+        //! SETTLE CUDA object for water constrains
+        std::unique_ptr<SettleCuda>    settleCuda_;
+
+};
+
+} // namespace gmx
+
+#endif
diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp
index ab53955c31..aba6ba9ca1 100644
--- a/src/gromacs/mdrun/md.cpp
+++ b/src/gromacs/mdrun/md.cpp
@@ -83,7 +83,6 @@
 #include "gromacs/mdlib/force.h"
 #include "gromacs/mdlib/force_flags.h"
 #include "gromacs/mdlib/forcerec.h"
-#include "gromacs/mdlib/leapfrog_cuda.h"
 #include "gromacs/mdlib/md_support.h"
 #include "gromacs/mdlib/mdatoms.h"
 #include "gromacs/mdlib/mdoutf.h"
@@ -96,6 +95,7 @@
 #include "gromacs/mdlib/tgroup.h"
 #include "gromacs/mdlib/trajectory_writing.h"
 #include "gromacs/mdlib/update.h"
+#include "gromacs/mdlib/update_constrain_cuda.h"
 #include "gromacs/mdlib/vcm.h"
 #include "gromacs/mdlib/vsite.h"
 #include "gromacs/mdrunutility/handlerestart.h"
@@ -147,8 +147,8 @@
 
 using gmx::SimulationSignaller;
 
-//! Whether the GPU version of Leap-Frog integrator should be used.
-static const bool c_useGpuLeapFrog = (getenv("GMX_INTEGRATE_GPU") != nullptr);
+//! Whether the GPU versions of Leap-Frog integrator and LINCS and SHAKE constraints
+static const bool c_useGpuUpdateConstrain = (getenv("GMX_UPDATE_CONSTRAIN_GPU") != nullptr);
 
 void gmx::Simulator::do_md()
 {
@@ -289,6 +289,11 @@ void gmx::Simulator::do_md()
     std::unique_ptr<t_state> stateInstance;
     t_state *                state;
 
+
+    auto mdatoms = mdAtoms->mdatoms();
+
+    std::unique_ptr<UpdateConstrainCuda> integrator;
+
     if (DOMAINDECOMP(cr))
     {
         dd_init_local_top(*top_global, &top);
@@ -323,9 +328,23 @@ void gmx::Simulator::do_md()
                                   &graph, mdAtoms, constr, vsite, shellfc);
 
         upd.setNumAtoms(state->natoms);
-    }
 
-    auto mdatoms = mdAtoms->mdatoms();
+        if (c_useGpuUpdateConstrain)
+        {
+            GMX_RELEASE_ASSERT(ir->eI == eiMD, "Only md integrator is supported on the GPU.");
+            GMX_RELEASE_ASSERT(ir->etc == etcNO, "Temperature coupling is not supported on the GPU.");
+            GMX_RELEASE_ASSERT(ir->epc == epcNO, "Pressure coupling is not supported on the GPU.");
+            GMX_RELEASE_ASSERT(!mdatoms->haveVsites, "Virtual sites are not supported on the GPU");
+            GMX_LOG(mdlog.info).asParagraph().
+                appendText("Using CUDA GPU-based update and constraints module.");
+            integrator = std::make_unique<UpdateConstrainCuda>(state->natoms, *ir, *top_global);
+            integrator->set(top.idef, *mdatoms);
+            t_pbc pbc;
+            set_pbc(&pbc, epbcXYZ, state->box);
+            integrator->setPbc(&pbc);
+        }
+
+    }
 
     // NOTE: The global state is no longer used at this point.
     // But state_global is still used as temporary storage space for writing
@@ -624,16 +643,6 @@ void gmx::Simulator::do_md()
      *             Loop over MD steps
      *
      ************************************************************/
-    std::unique_ptr<LeapFrogCuda> integrator;
-    if (c_useGpuLeapFrog)
-    {
-        GMX_RELEASE_ASSERT(ir->eI == eiMD, "Only md integrator is supported on the GPU.");
-        GMX_RELEASE_ASSERT(ir->etc == etcNO, "Temperature coupling is not supported on the GPU.");
-        GMX_RELEASE_ASSERT(ir->epc == epcNO, "Pressure coupling is not supported on the GPU.");
-        GMX_RELEASE_ASSERT(!mdatoms->haveVsites, "Virtual sites are not supported on the GPU");
-        integrator = std::make_unique<LeapFrogCuda>(state->natoms);
-        integrator->set(*mdatoms);
-    }
 
     bFirstStep       = TRUE;
     /* Skip the first Nose-Hoover integration when we get the state from tpx */
@@ -1183,32 +1192,36 @@ void gmx::Simulator::do_md()
             copy_rvecn(as_rvec_array(state->x.data()), cbuf, 0, state->natoms);
         }
 
-
-        if (c_useGpuLeapFrog)
+        if (c_useGpuUpdateConstrain)
         {
             integrator->copyCoordinatesToGpu(state->x.rvec_array());
             integrator->copyVelocitiesToGpu(state->v.rvec_array());
             integrator->copyForcesToGpu(as_rvec_array(f.data()));
-            integrator->integrate(ir->delta_t);
-            integrator->copyCoordinatesFromGpu(upd.xp()->rvec_array());
+
+            // This applies Leap-Frog, LINCS and SETTLE in a succession
+            integrator->integrate(ir->delta_t, true, bCalcVir, shake_vir);
+
+            integrator->copyCoordinatesFromGpu(state->x.rvec_array());
             integrator->copyVelocitiesFromGpu(state->v.rvec_array());
         }
         else
         {
             update_coords(step, ir, mdatoms, state, f.arrayRefWithPadding(), fcd,
                           ekind, M, &upd, etrtPOSITION, cr, constr);
+
+            wallcycle_stop(wcycle, ewcUPDATE);
+
+            constrain_coordinates(step, &dvdl_constr, state,
+                                  shake_vir,
+                                  &upd, constr,
+                                  bCalcVir, do_log, do_ene);
+
+            update_sd_second_half(step, &dvdl_constr, ir, mdatoms, state,
+                                  cr, nrnb, wcycle, &upd, constr, do_log, do_ene);
+            finish_update(ir, mdatoms,
+                          state, graph,
+                          nrnb, wcycle, &upd, constr);
         }
-        wallcycle_stop(wcycle, ewcUPDATE);
-
-        constrain_coordinates(step, &dvdl_constr, state,
-                              shake_vir,
-                              &upd, constr,
-                              bCalcVir, do_log, do_ene);
-        update_sd_second_half(step, &dvdl_constr, ir, mdatoms, state,
-                              cr, nrnb, wcycle, &upd, constr, do_log, do_ene);
-        finish_update(ir, mdatoms,
-                      state, graph,
-                      nrnb, wcycle, &upd, constr);
 
         if (ir->bPull && ir->pull->bSetPbcRefToPrevStepCOM)
         {
-- 
2.11.4.GIT