From 0a1aae785a6f746adbaae672736462f2ff1cf948 Mon Sep 17 00:00:00 2001 From: Artem Zhmurov Date: Wed, 20 Feb 2019 15:30:29 +0100 Subject: [PATCH] CUDA version of LINCS constraints. Implementation of the LINCS constraints for NVIDIA GPUs. Currently works in isolation from the other parts of the code: coordinates and velocities are copied to and from the GPU on every integration timestep. Part of the GPU-only loop. Loosely based on change 9162 by Alan Gray. To enable, set the environment variable GMX_LINCS_GPU. Limitations: 1. Works only if the constraints can be split into short uncoupled groups (currently < 256, designed for H-bond constraints). 2. Does not change the matrix inversion order for constraint triangles. 3. Does not support free energy computations. 4. Assumes no communication between domains (i.e. assumes that there are no constraints connecting atoms from two different domains). 5. The number of threads per block should be a power of 2 for the reduction of the virial to work. TODOs: 1. Move more data from the global memory to local. 2. Change .at() to [] 3. Add sorting by the number of coupled constraints to decrease warp divergence. 4. numAtoms should be changeable (for the multi-GPU case). Refs #2816, #2885 Change-Id: I3c975cf898053b7467bcd30459e60ce2c8852be6 --- src/gromacs/mdlib/CMakeLists.txt | 5 + src/gromacs/mdlib/constr.cpp | 58 +- src/gromacs/mdlib/lincs_cuda.h | 175 ++++ src/gromacs/mdlib/lincs_cuda_impl.cpp | 128 +++++ src/gromacs/mdlib/lincs_cuda_impl.cu | 996 ++++++++++++++++++++++++++++++++++ src/gromacs/mdlib/lincs_cuda_impl.h | 257 +++++++++ src/gromacs/mdlib/tests/constr.cpp | 408 ++++++++------ 7 files changed, 1857 insertions(+), 170 deletions(-) create mode 100644 src/gromacs/mdlib/lincs_cuda.h create mode 100644 src/gromacs/mdlib/lincs_cuda_impl.cpp create mode 100644 src/gromacs/mdlib/lincs_cuda_impl.cu create mode 100644 src/gromacs/mdlib/lincs_cuda_impl.h diff --git a/src/gromacs/mdlib/CMakeLists.txt b/src/gromacs/mdlib/CMakeLists.txt index baa5803657..16637f9990 100644 --- a/src/gromacs/mdlib/CMakeLists.txt +++ b/src/gromacs/mdlib/CMakeLists.txt @@ -38,5 +38,10 @@ set(MDLIB_SOURCES ${MDLIB_SOURCES} PARENT_SCOPE) if (BUILD_TESTING) add_subdirectory(tests) endif() +if(GMX_USE_CUDA) + gmx_add_libgromacs_sources( + lincs_cuda_impl.cu + ) +endif() gmx_install_headers(simulationsignal.h) diff --git a/src/gromacs/mdlib/constr.cpp b/src/gromacs/mdlib/constr.cpp index fe61663bff..6371c2b80d 100644 --- a/src/gromacs/mdlib/constr.cpp +++ b/src/gromacs/mdlib/constr.cpp @@ -62,6 +62,7 @@ #include "gromacs/math/vec.h" #include "gromacs/mdlib/gmx_omp_nthreads.h" #include "gromacs/mdlib/lincs.h" +#include "gromacs/mdlib/lincs_cuda.h" #include "gromacs/mdlib/settle.h" #include "gromacs/mdlib/shake.h" #include "gromacs/mdtypes/commrec.h" @@ -87,6 +88,9 @@ namespace gmx { +//! Whether the GPU version of LINCS should be used. +static const bool c_useGpuLincs = (getenv("GMX_LINCS_GPU") != nullptr); + /* \brief Impl class for Constraints * * \todo Members like md, idef are valid only for the lifetime of a @@ -178,11 +182,13 @@ class Constraints::Impl /*!\brief Input options. * * \todo Replace with IMdpOptions */ - const t_inputrec &ir; //! Flop counting support. - t_nrnb *nrnb = nullptr; //! Tracks wallcycle usage. - gmx_wallcycle *wcycle; + const t_inputrec &ir; //! Flop counting support. + t_nrnb *nrnb = nullptr; //! Tracks wallcycle usage. + gmx_wallcycle *wcycle; + //! Valid LINCS CUDA object when that implementation is being used, nullptr otherwise.
+ std::unique_ptr<LincsCuda> lincsCuda; }; Constraints::~Constraints() = default; @@ -482,6 +488,25 @@ Constraints::Impl::apply(bool bLog, bDump = TRUE; } } + else + { + if (lincsCuda != nullptr && c_useGpuLincs) + { + GMX_RELEASE_ASSERT(ir.efep == efepNO || dvdlambda == nullptr, + "Free energy perturbation is not supported by the GPU version of LINCS.\n"); + lincsCuda->setPbc(pbc_null); + lincsCuda->copyCoordinatesToGpu(x, xprime); + lincsCuda->copyVelocitiesToGpu(v); + lincsCuda->apply(v != nullptr, + invdt, + vir != nullptr, vir_r_m_dr); + lincsCuda->copyCoordinatesFromGpu(xprime); + if (v != nullptr) + { + lincsCuda->copyVelocitiesFromGpu(v); + } + } + } if (shaked != nullptr) { @@ -917,7 +942,14 @@ Constraints::Impl::setConstraints(const gmx_localtop_t &top, */ if (ir.eConstrAlg == econtLINCS) { - set_lincs(top.idef, md, EI_DYNAMICS(ir.eI), cr, lincsd); + if (c_useGpuLincs) + { + lincsCuda->set(top.idef, md); + } + else + { + set_lincs(top.idef, md, EI_DYNAMICS(ir.eI), cr, lincsd); + } } if (ir.eConstrAlg == econtSHAKE) { @@ -1067,10 +1099,20 @@ Constraints::Impl::Impl(const gmx_mtop_t &mtop_p, if (ir.eConstrAlg == econtLINCS) { - lincsd = init_lincs(log, mtop, - nflexcon, at2con_mt, - DOMAINDECOMP(cr) && cr->dd->splitConstraints, - ir.nLincsIter, ir.nProjOrder); + if (c_useGpuLincs) + { + fprintf(log, "Initializing LINCS on a GPU for %d atoms\n", mtop.natoms); + GMX_RELEASE_ASSERT(nflexcon == 0, "Flexible constraints are not supported by the GPU-based implementation of LINCS.\n"); + GMX_RELEASE_ASSERT(!DOMAINDECOMP(cr), "CUDA version of LINCS is not supported with domain decomposition"); + lincsCuda = std::make_unique<LincsCuda>(mtop.natoms, ir.nLincsIter, ir.nProjOrder); + } + else + { + lincsd = init_lincs(log, mtop, + nflexcon, at2con_mt, + DOMAINDECOMP(cr) && cr->dd->splitConstraints, + ir.nLincsIter, ir.nProjOrder); + } } if (ir.eConstrAlg == econtSHAKE) diff --git a/src/gromacs/mdlib/lincs_cuda.h b/src/gromacs/mdlib/lincs_cuda.h new file mode 100644 index 0000000000..b3c8f0e165 --- /dev/null +++ b/src/gromacs/mdlib/lincs_cuda.h @@ -0,0 +1,175 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2019, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS.
Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \libinternal \file + * + * \brief Declaration of high-level functions of CUDA implementation of LINCS. + * + * \todo This should only list interfaces needed for libgromacs clients. + * + * \author Artem Zhmurov + * + * \ingroup module_mdlib + * \inlibraryapi + */ +#ifndef GMX_MDLIB_LINCS_CUDA_H +#define GMX_MDLIB_LINCS_CUDA_H + + +#include "gromacs/mdlib/constr.h" +#include "gromacs/mdtypes/mdatom.h" +#include "gromacs/topology/idef.h" +#include "gromacs/utility/classhelpers.h" + +namespace gmx +{ + +class LincsCuda +{ + + public: + /*! \brief Constructor. + * + * \param[in] numAtoms Number of atoms + * \param[in] numIterations Number of iteration for the correction of the projection. + * \param[in] expansionOrder Order of the matrix inversion algorithm. + */ + LincsCuda(int numAtoms, + int numIterations, + int expansionOrder); + ~LincsCuda(); + + /*! \brief Apply LINCS. + * + * Applies LINCS to coordinates and velocities, Method uses this class data structures + * which should be updated when needed using update method. + * + * \param[in] updateVelocities If the velocities should be constrained. + * \param[in] invdt Reciprocal timestep (to scale Lagrange + * multipliers when velocities are updated) + * \param[in] computeVirial If virial should be updated. + * \param[in,out] virialScaled Scaled virial tensor to be updated. + */ + void apply(bool updateVelocities, + real invdt, + bool computeVirial, + tensor virialScaled); + + /*! \brief + * Update data-structures (e.g. after NB search step). + * + * Updates the constraints data. Should be called if the particles were sorted, + * redistributed between domains, etc. + * + * Information about constraints should be taken from: + * idef.il[F_CONSTR].iatoms --- type (T) of constraint and two atom indexes (i1, i2) + * idef.iparams[T].constr.dA --- target length for constraint of type T + * From t_mdatom, the code should take: + * md.invmass --- array of inverse square root of masses for each atom in the system. + * + * \param[in] idef Local topology data to get information on constraints from. + * \param[in] md Atoms data to get atom masses from. + */ + void set(const t_idef &idef, + const t_mdatoms &md); + + /*! \brief + * Update PBC data. + * + * \param[in] pbc The PBC data in t_pbc format. + */ + void setPbc(const t_pbc *pbc); + + /*! \brief + * Copy coordinates from provided CPU location to GPU. + * + * Copies the coordinates before the integration step (x) and coordinates + * after the integration step (xp) from the provided CPU location to GPU. + * The data are assumed to be in float3/fvec format (single precision). + * + * \param[in] h_x CPU pointer where coordinates should be copied from. + * \param[in] h_xp CPU pointer where coordinates should be copied from. + */ + void copyCoordinatesToGpu(const rvec *h_x, const rvec *h_xp); + + /*! \brief + * Copy velocities from provided CPU location to GPU. + * + * Nothing is done if the argument provided is a nullptr. + * The data are assumed to be in float3/fvec format (single precision). + * + * \param[in] h_v CPU pointer where velocities should be copied from. + */ + void copyVelocitiesToGpu(const rvec *h_v); + + /*! \brief + * Copy coordinates from GPU to provided CPU location. 
+ * + * Copies the constrained coordinates to the provided location. The coordinates + * are assumed to be in float3/fvec format (single precision). + * + * \param[out] h_xp CPU pointer where coordinates should be copied to. + */ + void copyCoordinatesFromGpu(rvec *h_xp); + + /*! \brief + * Copy velocities from GPU to provided CPU location. + * + * The velocities are assumed to be in float3/fvec format (single precision). + * + * \param[in] h_v Pointer to velocities data. + */ + void copyVelocitiesFromGpu(rvec *h_v); + + /*! \brief + * Set the internal GPU-memory x, xprime and v pointers. + * + * Data is not copied. The data are assumed to be in float3/fvec format + * (float3 is used internally, but the data layout should be identical). + * + * \param[in] d_x Pointer to the coordinates before integrator update (on GPU) + * \param[in] d_xp Pointer to the coordinates after integrator update, before update (on GPU) + * \param[in] d_v Pointer to the velocities before integrator update (on GPU) + */ + void setXVPointers(rvec *d_x, rvec *d_xp, rvec *d_v); + + private: + class Impl; + gmx::PrivateImplPointer impl_; + +}; + +} // namespace gmx + +#endif diff --git a/src/gromacs/mdlib/lincs_cuda_impl.cpp b/src/gromacs/mdlib/lincs_cuda_impl.cpp new file mode 100644 index 0000000000..11959554c5 --- /dev/null +++ b/src/gromacs/mdlib/lincs_cuda_impl.cpp @@ -0,0 +1,128 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2019, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * + * \brief May be used to implement LINCS CUDA interfaces for non-GPU builds. + * + * Currently, reports and exits if any of the interfaces are called. + * May replace CPU LINCS module in future. Currently used to satisfy + * compiler on systems, where CUDA is not available. 
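 *
 * A minimal sketch of the intended runtime dispatch, mirroring what constr.cpp in this
 * change does with the GMX_LINCS_GPU environment variable (illustrative only; the names
 * below are taken from that file, not from this stub):
 * \code
 *   static const bool c_useGpuLincs = (getenv("GMX_LINCS_GPU") != nullptr);
 *   if (c_useGpuLincs)
 *   {
 *       lincsCuda = std::make_unique<LincsCuda>(mtop.natoms, ir.nLincsIter, ir.nProjOrder);
 *   }
 *   else
 *   {
 *       lincsd = init_lincs(log, mtop, nflexcon, at2con_mt,
 *                           DOMAINDECOMP(cr) && cr->dd->splitConstraints,
 *                           ir.nLincsIter, ir.nProjOrder);
 *   }
 * \endcode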
+ * + * \author Artem Zhmurov + * + * \ingroup module_mdlib + */ +#include "gmxpre.h" + +#include "config.h" + +#include "gromacs/mdlib/lincs_cuda.h" + +#if GMX_GPU != GMX_GPU_CUDA + +namespace gmx +{ + +/*!\brief Impl class stub.*/ +class LincsCuda::Impl +{ +}; + +/*!\brief Constructor stub.*/ +LincsCuda::LincsCuda(gmx_unused int numAtoms, + gmx_unused int numIterations, + gmx_unused int expansionOrder) + : impl_(nullptr) +{ + GMX_ASSERT(false, "A CPU stub for LINCS was called instead of the correct implementation."); +} + +LincsCuda::~LincsCuda() = default; + +/*!\brief Apply LINCS stub.*/ +void LincsCuda::apply(gmx_unused const bool updateVelocities, + gmx_unused const real invdt, + gmx_unused const bool computeVirial, + gmx_unused tensor virialScaled) +{ + GMX_ASSERT(false, "A CPU stub for LINCS was called instead of the correct implementation."); +} + +/*!\brief Set data structures stub.*/ +void LincsCuda::set(gmx_unused const t_idef &idef, + gmx_unused const t_mdatoms &md) +{ + GMX_ASSERT(false, "A CPU stub for LINCS was called instead of the correct implementation."); +} + +/*!\brief Set PBC stub.*/ +void LincsCuda::setPbc(gmx_unused const t_pbc *pbc) +{ + GMX_ASSERT(false, "A CPU stub for LINCS was called instead of the correct implementation."); +} + +/*! \brief Copy coordinates from provided CPU location to GPU stub.*/ +void LincsCuda::copyCoordinatesToGpu(gmx_unused const rvec *h_x, gmx_unused const rvec *h_xp) +{ + GMX_ASSERT(false, "A CPU stub for LINCS was called instead of the correct implementation."); +} + +/*! \brief Copy velocities from provided CPU location to GPU stub.*/ +void LincsCuda::copyVelocitiesToGpu(gmx_unused const rvec *h_v) +{ + GMX_ASSERT(false, "A CPU stub for LINCS was called instead of the correct implementation."); +} + +/*! \brief Copy coordinates from GPU to provided CPU location stub.*/ +void LincsCuda::copyCoordinatesFromGpu(gmx_unused rvec *h_xp) +{ + GMX_ASSERT(false, "A CPU stub for LINCS was called instead of the correct implementation."); +} + +/*! \brief Copy velocities from GPU to provided CPU location stub.*/ +void LincsCuda::copyVelocitiesFromGpu(gmx_unused rvec *h_v) +{ + GMX_ASSERT(false, "A CPU stub for LINCS was called instead of the correct implementation."); +} + +/*! \brief Set the internal GPU-memory d_x, d_xprime and d_v pointers stub.*/ +void LincsCuda::setXVPointers(gmx_unused rvec *d_x, gmx_unused rvec *d_xp, gmx_unused rvec *d_v) +{ + GMX_ASSERT(false, "A CPU stub for LINCS was called instead of the correct implementation."); +} + +} // namespace gmx + +#endif /* GMX_GPU != GMX_GPU_CUDA */ diff --git a/src/gromacs/mdlib/lincs_cuda_impl.cu b/src/gromacs/mdlib/lincs_cuda_impl.cu new file mode 100644 index 0000000000..f4327e0df7 --- /dev/null +++ b/src/gromacs/mdlib/lincs_cuda_impl.cu @@ -0,0 +1,996 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2019, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version.
+ * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * + * \brief Implements LINCS using CUDA + * + * This file contains the implementation of the LINCS constraints algorithm + * using CUDA, including class initialization, data-structure management + * and the GPU kernel. + * + * \note Management of coordinates, velocities, CUDA stream and periodic boundary exists here as a temporary + * scaffolding to allow this class to be used as a stand-alone unit. The scaffolding is intended to be + * removed once constraints are integrated with the update module. + * \todo Reconsider naming, i.e. the "cuda" suffix should be changed to "gpu". + * + * \author Artem Zhmurov + * \author Alan Gray + * + * \ingroup module_mdlib + */ +#include "gmxpre.h" + +#include "lincs_cuda_impl.h" + +#include <assert.h> +#include <stdio.h> + +#include <cmath> + +#include <algorithm> + +#include "gromacs/gpu_utils/cuda_arch_utils.cuh" +#include "gromacs/gpu_utils/cudautils.cuh" +#include "gromacs/gpu_utils/devicebuffer.cuh" +#include "gromacs/gpu_utils/gputraits.cuh" +#include "gromacs/gpu_utils/vectype_ops.cuh" +#include "gromacs/math/vec.h" +#include "gromacs/mdlib/constr.h" +#include "gromacs/mdlib/lincs_cuda.h" +#include "gromacs/pbcutil/pbc.h" +#include "gromacs/pbcutil/pbc_aiuc_cuda.cuh" +#include "gromacs/topology/ifunc.h" + +namespace gmx +{ + +//! Number of CUDA threads in a block +constexpr static int c_threadsPerBlock = 256; +//! Maximum number of threads in a block (for __launch_bounds__) +constexpr static int c_maxThreadsPerBlock = c_threadsPerBlock; + +/*! \brief Main kernel for LINCS constraints. + * + * See Hess et al., J. Comput. Chem. 18: 1463-1472 (1997) for the description of the algorithm. + * + * In the CUDA version, one thread is responsible for all computations for one constraint. The blocks are + * filled in a way that no constraint is coupled to a constraint from the next block. This is achieved + * by moving active threads to the next block if the corresponding group of coupled constraints is too big + * to fit into the current thread block. This may leave some 'dummy' threads at the end of the thread block, i.e. + * threads that are not required to do actual work. Since constraints from different blocks are not coupled, + * there is no need to synchronize across the device. However, extensive communication within a thread block + * is still needed. + * + * \todo Reduce synchronization overhead. Some ideas are: + * 1. Consider going to warp-level synchronization for the coupled constraints.
2. Move more data to local/shared memory and try to get rid of atomic operations (at least on + * the device level). + * 3. Use analytical solution for matrix A inversion. + * 4. Introduce mapping of thread id to both single constraint and single atom, thus designating + * Nth threads to deal with Nat <= Nth coupled atoms and Nc <= Nth coupled constraints. + * See Redmine issue #2885 for details (https://redmine.gromacs.org/issues/2885) + * + * \param[in,out] kernelParams All parameters and pointers for the kernel condensed into a single struct. + * \param[in] invdt Inverse timestep (needed to update velocities). + */ +template <bool updateVelocities, bool computeVirial> +__launch_bounds__(c_maxThreadsPerBlock) +__global__ void lincs_kernel(LincsCudaKernelParameters kernelParams, + const float invdt) +{ + const PbcAiuc pbcAiuc = kernelParams.pbcAiuc; + const int numConstraintsThreads = kernelParams.numConstraintsThreads; + const int numIterations = kernelParams.numIterations; + const int expansionOrder = kernelParams.expansionOrder; + const float3* __restrict__ gm_x = kernelParams.d_x; + float3* gm_xp = kernelParams.d_xp; + const int2* __restrict__ gm_constraints = kernelParams.d_constraints; + const float* __restrict__ gm_constraintsTargetLengths = kernelParams.d_constraintsTargetLengths; + const int* __restrict__ gm_coupledConstraintsCounts = kernelParams.d_coupledConstraintsCounts; + const int* __restrict__ gm_coupledConstraintsIdxes = kernelParams.d_coupledConstraintsIdxes; + const float* __restrict__ gm_massFactors = kernelParams.d_massFactors; + float* __restrict__ gm_matrixA = kernelParams.d_matrixA; + const float* __restrict__ gm_inverseMasses = kernelParams.d_inverseMasses; + float3* gm_v = kernelParams.d_v; + float* __restrict__ gm_virialScaled = kernelParams.d_virialScaled; + + int threadIndex = blockIdx.x*blockDim.x+threadIdx.x; + + // numConstraintsThreads should be an integer multiple of blockSize (numConstraintsThreads = numBlocks*blockSize). + // This is to ensure proper synchronizations and reduction. All arrays are padded to the required size. + assert(threadIndex < numConstraintsThreads); + + // Vectors connecting constrained atoms before the algorithm was applied. + // Needed to construct the constraint matrix A + extern __shared__ float3 sm_r[]; + + int2 pair = gm_constraints[threadIndex]; + int i = pair.x; + int j = pair.y; + + // Mass-scaled Lagrange multiplier + float lagrangeScaled = 0.0f; + + float targetLength; + float inverseMassi; + float inverseMassj; + float sqrtReducedMass; + + float3 xi; + float3 xj; + float3 rc; + + // i == -1 indicates dummy constraint at the end of the thread block.
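    // For example, with c_threadsPerBlock = 256, a group of coupled constraints that
    // would otherwise straddle a block boundary is moved wholly into the next block by
    // set(), and the vacated slots are padded with (-1, -1) index pairs:
    //
    //   block 0: | c0 c1 ... c252 | -1 -1 -1 |   <- three dummy threads pad the block
    //   block 1: | c253 c254 ...  |              <- the coupled group starts a new block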
+ bool isDummyThread = (i == -1); + + // Everything computed for these dummies will be equal to zero + if (isDummyThread) + { + targetLength = 0.0f; + inverseMassi = 0.0f; + inverseMassj = 0.0f; + sqrtReducedMass = 0.0f; + + xi = make_float3(0.0f, 0.0f, 0.0f); + xj = make_float3(0.0f, 0.0f, 0.0f); + rc = make_float3(0.0f, 0.0f, 0.0f); + } + else + { + // Collecting data + targetLength = gm_constraintsTargetLengths[threadIndex]; + inverseMassi = gm_inverseMasses[i]; + inverseMassj = gm_inverseMasses[j]; + sqrtReducedMass = rsqrt(inverseMassi + inverseMassj); + + xi = gm_x[i]; + xj = gm_x[j]; + + float3 dx = pbcDxAiuc(pbcAiuc, xi, xj); + + float rlen = rsqrtf(dx.x*dx.x + dx.y*dx.y + dx.z*dx.z); + rc = rlen*dx; + } + + sm_r[threadIdx.x] = rc; + // Make sure that all r's are saved into shared memory + // before they are accessed in the loop below + __syncthreads(); + + /* + * Constructing LINCS matrix (A) + */ + + // Only non-zero values are saved (for coupled constraints) + int coupledConstraintsCount = gm_coupledConstraintsCounts[threadIndex]; + for (int n = 0; n < coupledConstraintsCount; n++) + { + int index = n*numConstraintsThreads + threadIndex; + int c1 = gm_coupledConstraintsIdxes[index]; + + float3 rc1 = sm_r[c1 - blockIdx.x*blockDim.x]; + gm_matrixA[index] = gm_massFactors[index]*(rc.x*rc1.x + rc.y*rc1.y + rc.z*rc1.z); + } + + // Skipping in dummy threads + if (!isDummyThread) + { + xi = gm_xp[i]; + xj = gm_xp[j]; + } + + float3 dx = pbcDxAiuc(pbcAiuc, xi, xj); + + float sol = sqrtReducedMass*((rc.x*dx.x + rc.y*dx.y + rc.z*dx.z) - targetLength); + + /* + * Inverse matrix using a set of expansionOrder matrix multiplications + */ + + // This will use the same memory space as sm_r, which is no longer needed. + extern __shared__ float sm_rhs[]; + // Save current right-hand-side vector in the shared memory + sm_rhs[threadIdx.x] = sol; + + for (int rec = 0; rec < expansionOrder; rec++) + { + // Making sure that all sm_rhs are saved before they are accessed in a loop below + __syncthreads(); + float mvb = 0.0f; + + for (int n = 0; n < coupledConstraintsCount; n++) + { + int index = n*numConstraintsThreads + threadIndex; + int c1 = gm_coupledConstraintsIdxes[index]; + // Convolute current right-hand-side with A + // Different, non overlapping parts of sm_rhs[..] are read during odd and even iterations + mvb = mvb + gm_matrixA[index]*sm_rhs[c1 - blockIdx.x*blockDim.x + blockDim.x*(rec % 2)]; + } + // 'Switch' rhs vectors, save current result + // These values will be accessed in the loop above during the next iteration. + sm_rhs[threadIdx.x + blockDim.x*((rec + 1) % 2)] = mvb; + sol = sol + mvb; + } + + // Current mass-scaled Lagrange multipliers + lagrangeScaled = sqrtReducedMass*sol; + + // Save updated coordinates before correction for the rotational lengthening + float3 tmp = rc*lagrangeScaled; + + // Writing for all but dummy constraints + if (!isDummyThread) + { + atomicAdd(&gm_xp[i], -tmp*inverseMassi); + atomicAdd(&gm_xp[j], tmp*inverseMassj); + } + + /* + * Correction for centripetal effects + */ + for (int iter = 0; iter < numIterations; iter++) + { + // Make sure that all xp's are saved: atomic operation calls before are + // communicating current xp[..] values across thread block. 
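        // (Without this barrier, a thread could read a partially corrected xp of an
        // atom shared with a coupled constraint handled by another thread in this block.)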
+ __syncthreads(); + + if (!isDummyThread) + { + xi = gm_xp[i]; + xj = gm_xp[j]; + } + + float3 dx = pbcDxAiuc(pbcAiuc, xi, xj); + + float len2 = targetLength*targetLength; + float dlen2 = 2.0f*len2 - norm2(dx); + + // TODO A little bit more effective but slightly less readable version of the below would be: + // float proj = sqrtReducedMass*(targetLength - (dlen2 > 0.0f ? 1.0f : 0.0f)*dlen2*rsqrt(dlen2)); + float proj; + if (dlen2 > 0.0f) + { + proj = sqrtReducedMass*(targetLength - dlen2*rsqrt(dlen2)); + } + else + { + proj = sqrtReducedMass*targetLength; + } + + sm_rhs[threadIdx.x] = proj; + float sol = proj; + + /* + * Same matrix inversion as above is used for updated data + */ + for (int rec = 0; rec < expansionOrder; rec++) + { + // Make sure that all elements of rhs are saved into shared memory + __syncthreads(); + float mvb = 0; + + for (int n = 0; n < coupledConstraintsCount; n++) + { + int index = n*numConstraintsThreads + threadIndex; + int c1 = gm_coupledConstraintsIdxes[index]; + + mvb = mvb + gm_matrixA[index]*sm_rhs[c1 - blockIdx.x*blockDim.x + blockDim.x*(rec % 2)]; + + } + sm_rhs[threadIdx.x + blockDim.x*((rec + 1) % 2)] = mvb; + sol = sol + mvb; + } + + // Add corrections to Lagrange multipliers + float sqrtmu_sol = sqrtReducedMass*sol; + lagrangeScaled += sqrtmu_sol; + + // Save updated coordinates for the next iteration + // Dummy constraints are skipped + if (!isDummyThread) + { + float3 tmp = rc*sqrtmu_sol; + atomicAdd(&gm_xp[i], -tmp*inverseMassi); + atomicAdd(&gm_xp[j], tmp*inverseMassj); + } + } + + // Updating particle velocities for all but dummy threads + if (updateVelocities && !isDummyThread) + { + float3 tmp = rc*invdt*lagrangeScaled; + atomicAdd(&gm_v[i], -tmp*inverseMassi); + atomicAdd(&gm_v[j], tmp*inverseMassj); + } + + + if (computeVirial) + { + // Virial is computed from Lagrange multiplier (lagrangeScaled), target constrain length + // (targetLength) and the normalized vector connecting constrained atoms before + // the algorithm was applied (rc). The evaluation of virial in each thread is + // followed by basic reduction for the values inside single thread block. + // Then, the values are reduced across grid by atomicAdd(...). + // + // TODO Shuffle reduction. + // TODO Should be unified and/or done once when virial is actually needed. + // TODO Recursive version that removes atomicAdd(...)'s entirely is needed. Ideally, + // one that works for any datatype. + + // Save virial for each thread into the shared memory. Tensor is symmetrical, hence only + // 6 values are saved. Dummy threads will have zeroes in their virial: targetLength, + // lagrangeScaled and rc are all set to zero for them in the beginning of the kernel. + // The sm_threadVirial[..] will overlap with the sm_r[..] and sm_rhs[..], but the latter + // two are no longer in use. + extern __shared__ float sm_threadVirial[]; + float mult = targetLength*lagrangeScaled; + sm_threadVirial[0*blockDim.x + threadIdx.x] = mult*rc.x*rc.x; + sm_threadVirial[1*blockDim.x + threadIdx.x] = mult*rc.x*rc.y; + sm_threadVirial[2*blockDim.x + threadIdx.x] = mult*rc.x*rc.z; + sm_threadVirial[3*blockDim.x + threadIdx.x] = mult*rc.y*rc.y; + sm_threadVirial[4*blockDim.x + threadIdx.x] = mult*rc.y*rc.z; + sm_threadVirial[5*blockDim.x + threadIdx.x] = mult*rc.z*rc.z; + + __syncthreads(); + + // Reduce up to one virial per thread block. All blocks are divided by half, the first + // half of threads sums two virials. Then the first half is divided by two and the first + // half of it sums two values. 
This procedure is repeated until only one thread is left. + // Only works if the number of threads per block is a power of two (hence the static_assert + // in the class constructor). + for (int divideBy = 2; divideBy <= blockDim.x; divideBy *= 2) + { + int dividedAt = blockDim.x/divideBy; + if (threadIdx.x < dividedAt) + { + for (int d = 0; d < 6; d++) + { + sm_threadVirial[d*blockDim.x + threadIdx.x] += sm_threadVirial[d*blockDim.x + (threadIdx.x + dividedAt)]; + } + } + // Synchronize if not within one warp + if (dividedAt > warpSize/2) + { + __syncthreads(); + } + } + // First 6 threads in the block add the results of 6 tensor components to the global memory address. + if (threadIdx.x < 6) + { + atomicAdd(&(gm_virialScaled[threadIdx.x]), sm_threadVirial[threadIdx.x*blockDim.x]); + } + } + + return; +} + +/*! \brief Select templated kernel. + * + * Returns a pointer to a CUDA kernel based on the provided booleans. + * + * \param[in] updateVelocities If the velocities should be constrained. + * \param[in] computeVirial If virial should be updated. + * + * \return Pointer to CUDA kernel + */ +inline auto getLincsKernelPtr(const bool updateVelocities, + const bool computeVirial) +{ + + auto kernelPtr = lincs_kernel<true, true>; + if (updateVelocities && computeVirial) + { + kernelPtr = lincs_kernel<true, true>; + } + else if (updateVelocities && !computeVirial) + { + kernelPtr = lincs_kernel<true, false>; + } + else if (!updateVelocities && computeVirial) + { + kernelPtr = lincs_kernel<false, true>; + } + else if (!updateVelocities && !computeVirial) + { + kernelPtr = lincs_kernel<false, false>; + } + return kernelPtr; +} + +/*! \brief Apply LINCS. + * + * Applies LINCS to coordinates and velocities stored on the GPU. + * The data at the xPrime and v pointers (class fields) are changed in GPU + * memory. The results are not automatically copied back to CPU memory. + * The method uses the class data structures, which should be + * updated when needed using the set() method. + * + * \param[in] updateVelocities If the velocities should be constrained. + * \param[in] invdt Reciprocal timestep (to scale Lagrange + * multipliers when velocities are updated) + * \param[in] computeVirial If virial should be updated. + * \param[in,out] virialScaled Scaled virial tensor to be updated. + */ +void LincsCuda::Impl::apply(const bool updateVelocities, + const real invdt, + const bool computeVirial, + tensor virialScaled) +{ + ensureNoPendingCudaError("In CUDA version of LINCS"); + + if (computeVirial) + { + // Fill with zeros so the values can be reduced into it + // Only 6 values are needed because the virial is symmetrical + clearDeviceBufferAsync(&kernelParams_.d_virialScaled, 0, 6, stream_); + } + + auto kernelPtr = getLincsKernelPtr(updateVelocities, computeVirial); + + KernelLaunchConfig config; + config.blockSize[0] = c_threadsPerBlock; + config.blockSize[1] = 1; + config.blockSize[2] = 1; + config.gridSize[0] = (kernelParams_.numConstraintsThreads + c_threadsPerBlock - 1)/c_threadsPerBlock; + config.gridSize[1] = 1; + config.gridSize[2] = 1; + + // Shared memory is used to store: + // -- Current coordinates (3 floats per thread) + // -- Right-hand-sides for matrix inversion (2 floats per thread) + // -- Virial tensor components (6 floats per thread) + // Since none of these three are needed simultaneously, they can be saved at the same shared memory address + // (i.e. corresponding arrays are intentionally overlapped in address space). Consequently, only + // max{3, 2, 6} = 6 floats per thread are needed if the virial is computed, or max{3, 2} = 3 if not.
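    // For the default c_threadsPerBlock = 256 this amounts to 256*6*sizeof(float) = 6144 bytes
    // with the virial and 256*3*sizeof(float) = 3072 bytes without, well within the 48 KB of
    // static shared memory available per block on the supported CUDA devices.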
+ if (computeVirial) + { + config.sharedMemorySize = c_threadsPerBlock*6*sizeof(float); + } + else + { + config.sharedMemorySize = c_threadsPerBlock*3*sizeof(float); + } + config.stream = stream_; + + const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, + &kernelParams_, + &invdt); + + launchGpuKernel(kernelPtr, config, nullptr, + "lincs_kernel", kernelArgs); + + if (computeVirial) + { + // Copy LINCS virial data and add it to the common virial + copyFromDeviceBuffer(h_virialScaled_.data(), &kernelParams_.d_virialScaled, + 0, 6, + stream_, GpuApiCallBehavior::Sync, nullptr); + + // Mapping [XX, XY, XZ, YY, YZ, ZZ] internal format to a tensor object + virialScaled[XX][XX] += h_virialScaled_[0]; + virialScaled[XX][YY] += h_virialScaled_[1]; + virialScaled[XX][ZZ] += h_virialScaled_[2]; + + virialScaled[YY][XX] += h_virialScaled_[1]; + virialScaled[YY][YY] += h_virialScaled_[3]; + virialScaled[YY][ZZ] += h_virialScaled_[4]; + + virialScaled[ZZ][XX] += h_virialScaled_[2]; + virialScaled[ZZ][YY] += h_virialScaled_[4]; + virialScaled[ZZ][ZZ] += h_virialScaled_[5]; + } + + return; +} + +/*! \brief Create LINCS object + * + * \param[in] numAtoms Number of atoms that will be handles by LINCS. + * Used to compute the memory size in allocations and copy. + * \param[in] numIterations Number of iterations used to compute inverse matrix. + * \param[in] expansionOrder LINCS projection order for correcting the direction of constraint. + */ +LincsCuda::Impl::Impl(int numAtoms, + int numIterations, + int expansionOrder) + : numAtoms_(numAtoms) +{ + kernelParams_.numIterations = numIterations; + kernelParams_.expansionOrder = expansionOrder; + + static_assert(sizeof(real) == sizeof(float), + "Real numbers should be in single precision in GPU code."); + static_assert(c_threadsPerBlock > 0 && !(c_threadsPerBlock & (c_threadsPerBlock - 1) == 0), + "Nmber of threads per block should be a power of two in order for reduction to work."); + + // This is temporary. LINCS should not manage coordinates. + allocateDeviceBuffer(&kernelParams_.d_x, numAtoms, nullptr); + allocateDeviceBuffer(&kernelParams_.d_xp, numAtoms, nullptr); + allocateDeviceBuffer(&kernelParams_.d_v, numAtoms, nullptr); + + allocateDeviceBuffer(&kernelParams_.d_virialScaled, 6, nullptr); + h_virialScaled_.resize(6); + + // The data arrays should be expanded/reallocated on first call of set() function. + maxConstraintsNumberSoFar_ = 0; + // Use default stream. + // TODO The stream should/can be assigned by the GPU schedule when the code will be integrated. + stream_ = nullptr; + +} + +LincsCuda::Impl::~Impl() +{ +} + +/*! \brief Helper function to go through constraints recursively. + * + * For each constraint, counts the number of coupled constraints and stores the value in spaceNeeded array. + * This information is used to split the array of constraints between thread blocks on a GPU so there is no + * coupling between constraints from different thread blocks. After the 'spaceNeeded' array is filled, the + * value spaceNeeded[c] should be equal to the number of constraints that are coupled to 'c' and located + * after it in the constraints array. + * + * \param[in] a Atom index. + * \param[in,out] spaceNeeded Indicates if the constraint was already counted and stores + * the number of constraints (i) connected to it and (ii) located + * after it in memory. This array is filled by this recursive function. 
For a set of coupled constraints, only for the first one in this list + * the number of consecutive coupled constraints is needed: if there is + * not enough space for this set of constraints in the thread block, + * the group has to be moved to the next one. + * \param[in] atomsAdjacencyList Stores information about connections between atoms. + */ +inline int countCoupled(int a, std::vector<int> *spaceNeeded, + std::vector<std::vector<std::tuple<int, int, int> > > *atomsAdjacencyList) + +{ + int c2, a2, sign; + int counted = 0; + for (unsigned i = 0; i < atomsAdjacencyList->at(a).size(); i++) + { + std::tie(a2, c2, sign) = atomsAdjacencyList->at(a).at(i); + if (spaceNeeded->at(c2) == -1) + { + spaceNeeded->at(c2) = 0; // To indicate we've been here + counted += 1 + countCoupled(a2, spaceNeeded, atomsAdjacencyList); + } + } + return counted; +} + +/*! \brief + * Update data-structures (e.g. after NB search step). + * + * Updates the constraints data and copies it to the GPU. Should be + * called if the particles were sorted, redistributed between domains, etc. + * This version uses common data formats so it can be called from anywhere + * in the code. Does not recycle the data preparation routines from the CPU + * version. Works only with the simple case when all the constraints in idef + * are handled by a single GPU. Triangles are not handled as a special case. + * + * Information about constraints is taken from: + * idef.il[F_CONSTR].iatoms --- type (T) of constraint and two atom indexes (i1, i2) + * idef.iparams[T].constr.dA --- target length for constraint of type T + * From t_mdatom, the code takes: + * md.invmass --- array of inverse masses for each atom in the system. + * + * \param[in] idef Local topology data to get information on constraints from. + * \param[in] md Atoms data to get atom masses from. + */ +void LincsCuda::Impl::set(const t_idef &idef, + const t_mdatoms &md) +{ + + // List of constrained atoms (CPU memory) + std::vector<int2> constraintsHost; + // Equilibrium distances for the constraints (CPU) + std::vector<float> constraintsTargetLengthsHost; + // Number of constraints coupled with the current one (CPU) + std::vector<int> coupledConstraintsCountsHost; + // List of constraints coupled with the current one (CPU) + std::vector<int> coupledConstraintsIdxesHost; + // Mass factors (CPU) + std::vector<float> massFactorsHost; + + // List of constrained atoms in local topology + t_iatom *iatoms = idef.il[F_CONSTR].iatoms; + const int stride = NRAL(F_CONSTR) + 1; + const int numConstraints = idef.il[F_CONSTR].nr/stride; + // Constructing adjacency list --- useful intermediate structure + std::vector<std::vector<std::tuple<int, int, int> > > atomsAdjacencyList(numAtoms_); + for (int c = 0; c < numConstraints; c++) + { + int a1 = iatoms[stride*c + 1]; + int a2 = iatoms[stride*c + 2]; + + // Each constraint will be represented as a tuple, containing the index of the second constrained atom, + // the index of the constraint, and a sign that indicates the order of atoms in which they are listed. + // The sign is needed to compute the mass factors. + atomsAdjacencyList.at(a1).push_back(std::make_tuple(a2, c, +1)); + atomsAdjacencyList.at(a2).push_back(std::make_tuple(a1, c, -1)); + } + + // Compute how many coupled constraints are in front of each constraint. + // Needed to introduce splits in data so that all coupled constraints will be computed in a single GPU block. + // The position 'c' of the vector spaceNeeded should have the number of constraints that are coupled to a constraint + // 'c' and are after 'c' in the vector.
Only the first index of the connected group of the constraints is needed later in the + // code, hence the spaceNeeded vector is also used to keep track of whether the constraint was already counted. + std::vector<int> spaceNeeded; + spaceNeeded.resize(numConstraints, -1); + std::fill(spaceNeeded.begin(), spaceNeeded.end(), -1); + for (int c = 0; c < numConstraints; c++) + { + int a1 = iatoms[stride*c + 1]; + int a2 = iatoms[stride*c + 2]; + if (spaceNeeded.at(c) == -1) + { + spaceNeeded.at(c) = countCoupled(a1, &spaceNeeded, &atomsAdjacencyList) + + countCoupled(a2, &spaceNeeded, &atomsAdjacencyList); + } + } + + // Map of splits in the constraints data. For each 'old' constraint index it gives the 'new' one, which + // takes into account the empty spaces that might be needed at the end of each thread block. + std::vector<int> splitMap; + splitMap.resize(numConstraints, -1); + int currentMapIndex = 0; + for (int c = 0; c < numConstraints; c++) + { + // Check if coupled constraints all fit in one block + GMX_RELEASE_ASSERT(spaceNeeded.at(c) < c_threadsPerBlock, "Maximum number of coupled constraints exceeds the size of the CUDA thread block. " + "Most likely, you are trying to use the GPU version of LINCS with constraints on all bonds, " + "which is not supported. Try using H-bond constraints only."); + if (currentMapIndex / c_threadsPerBlock != (currentMapIndex + spaceNeeded.at(c)) / c_threadsPerBlock) + { + currentMapIndex = ((currentMapIndex/c_threadsPerBlock) + 1) * c_threadsPerBlock; + } + splitMap.at(c) = currentMapIndex; + currentMapIndex++; + } + kernelParams_.numConstraintsThreads = currentMapIndex + c_threadsPerBlock - currentMapIndex % c_threadsPerBlock; + GMX_RELEASE_ASSERT(kernelParams_.numConstraintsThreads % c_threadsPerBlock == 0, "Number of threads should be a multiple of the block size"); + + // Initialize constraints and their target lengths, taking into account the splits in the data arrays. + int2 pair; + pair.x = -1; + pair.y = -1; + constraintsHost.resize(kernelParams_.numConstraintsThreads, pair); + std::fill(constraintsHost.begin(), constraintsHost.end(), pair); + constraintsTargetLengthsHost.resize(kernelParams_.numConstraintsThreads, 0.0); + std::fill(constraintsTargetLengthsHost.begin(), constraintsTargetLengthsHost.end(), 0.0); + for (int c = 0; c < numConstraints; c++) + { + int a1 = iatoms[stride*c + 1]; + int a2 = iatoms[stride*c + 2]; + int type = iatoms[stride*c]; + + int2 pair; + pair.x = a1; + pair.y = a2; + constraintsHost.at(splitMap.at(c)) = pair; + constraintsTargetLengthsHost.at(splitMap.at(c)) = idef.iparams[type].constr.dA; + + } + + // The adjacency list of constraints (i.e. the list of coupled constraints for each constraint). + // We map a single thread to a single constraint, hence each thread 'c' will be using one element from + // coupledConstraintsCountsHost array, which is the number of constraints coupled to the constraint 'c'. + // The coupled constraints indexes are placed into the coupledConstraintsIdxesHost array. The latter is organized + // as a one-dimensional array to ensure good memory alignment. It is addressed as [c + i*numConstraintsThreads], + // where 'i' goes from zero to the number of constraints coupled to 'c'. 'numConstraintsThreads' is the width of + // the array --- a number greater than the total number of constraints, taking into account the splits in the + // constraints array due to the GPU block borders. This number can be adjusted to improve the memory access pattern. + // Mass factors are saved in a similar data structure.
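    // For example, with numConstraintsThreads = 512, the indexes of the constraints coupled
    // to the constraint handled by thread c = 5 are stored at positions 5, 517, 1029, ...
    // (i.e. at c + i*numConstraintsThreads for i = 0, 1, 2, ...), so that on each iteration
    // of the kernel loop consecutive threads read consecutive memory locations.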
int maxCoupledConstraints = 0; + for (int c = 0; c < numConstraints; c++) + { + int a1 = iatoms[stride*c + 1]; + int a2 = iatoms[stride*c + 2]; + + // Constraint 'c' is counted twice, but it should be excluded altogether. Hence '-2'. + int nCoupledConstraints = atomsAdjacencyList.at(a1).size() + atomsAdjacencyList.at(a2).size() - 2; + + if (nCoupledConstraints > maxCoupledConstraints) + { + maxCoupledConstraints = nCoupledConstraints; + } + } + + coupledConstraintsCountsHost.resize(kernelParams_.numConstraintsThreads, 0); + coupledConstraintsIdxesHost.resize(maxCoupledConstraints*kernelParams_.numConstraintsThreads, -1); + massFactorsHost.resize(maxCoupledConstraints*kernelParams_.numConstraintsThreads, -1); + + for (int c1 = 0; c1 < numConstraints; c1++) + { + coupledConstraintsCountsHost.at(splitMap.at(c1)) = 0; + int c1a1 = iatoms[stride*c1 + 1]; + int c1a2 = iatoms[stride*c1 + 2]; + int c2; + int c2a1; + int c2a2; + + int sign; + + // Constraints coupled through the first atom. + c2a1 = c1a1; + for (unsigned j = 0; j < atomsAdjacencyList.at(c1a1).size(); j++) + { + + std::tie(c2a2, c2, sign) = atomsAdjacencyList.at(c1a1).at(j); + + if (c1 != c2) + { + int index = kernelParams_.numConstraintsThreads*coupledConstraintsCountsHost.at(splitMap.at(c1)) + splitMap.at(c1); + + coupledConstraintsIdxesHost.at(index) = splitMap.at(c2); + + int center = c1a1; + + float sqrtmu1 = 1.0/sqrt(md.invmass[c1a1] + md.invmass[c1a2]); + float sqrtmu2 = 1.0/sqrt(md.invmass[c2a1] + md.invmass[c2a2]); + + massFactorsHost.at(index) = -sign*md.invmass[center]*sqrtmu1*sqrtmu2; + + coupledConstraintsCountsHost.at(splitMap.at(c1))++; + + } + } + + // Constraints coupled through the second atom. + c2a1 = c1a2; + for (unsigned j = 0; j < atomsAdjacencyList.at(c1a2).size(); j++) + { + + std::tie(c2a2, c2, sign) = atomsAdjacencyList.at(c1a2).at(j); + + if (c1 != c2) + { + int index = kernelParams_.numConstraintsThreads*coupledConstraintsCountsHost.at(splitMap.at(c1)) + splitMap.at(c1); + + coupledConstraintsIdxesHost.at(index) = splitMap.at(c2); + + int center = c1a2; + + float sqrtmu1 = 1.0/sqrt(md.invmass[c1a1] + md.invmass[c1a2]); + float sqrtmu2 = 1.0/sqrt(md.invmass[c2a1] + md.invmass[c2a2]); + + massFactorsHost.at(index) = sign*md.invmass[center]*sqrtmu1*sqrtmu2; + + coupledConstraintsCountsHost.at(splitMap.at(c1))++; + + } + } + } + + // (Re)allocate the memory if the number of constraints has increased. + if (numConstraints > maxConstraintsNumberSoFar_) + { + // Free memory if it was allocated before (i.e. if not the first time here).
if (maxConstraintsNumberSoFar_ > 0) + { + freeDeviceBuffer(&kernelParams_.d_inverseMasses); + + freeDeviceBuffer(&kernelParams_.d_constraints); + freeDeviceBuffer(&kernelParams_.d_constraintsTargetLengths); + + freeDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts); + freeDeviceBuffer(&kernelParams_.d_coupledConstraintsIdxes); + freeDeviceBuffer(&kernelParams_.d_massFactors); + freeDeviceBuffer(&kernelParams_.d_matrixA); + + } + maxConstraintsNumberSoFar_ = numConstraints; + + allocateDeviceBuffer(&kernelParams_.d_inverseMasses, numAtoms_, nullptr); + + allocateDeviceBuffer(&kernelParams_.d_constraints, kernelParams_.numConstraintsThreads, nullptr); + allocateDeviceBuffer(&kernelParams_.d_constraintsTargetLengths, kernelParams_.numConstraintsThreads, nullptr); + + allocateDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts, kernelParams_.numConstraintsThreads, nullptr); + allocateDeviceBuffer(&kernelParams_.d_coupledConstraintsIdxes, maxCoupledConstraints*kernelParams_.numConstraintsThreads, nullptr); + allocateDeviceBuffer(&kernelParams_.d_massFactors, maxCoupledConstraints*kernelParams_.numConstraintsThreads, nullptr); + allocateDeviceBuffer(&kernelParams_.d_matrixA, maxCoupledConstraints*kernelParams_.numConstraintsThreads, nullptr); + + } + + // Copy data to GPU. + copyToDeviceBuffer(&kernelParams_.d_constraints, constraintsHost.data(), + 0, kernelParams_.numConstraintsThreads, + stream_, GpuApiCallBehavior::Sync, nullptr); + copyToDeviceBuffer(&kernelParams_.d_constraintsTargetLengths, constraintsTargetLengthsHost.data(), + 0, kernelParams_.numConstraintsThreads, + stream_, GpuApiCallBehavior::Sync, nullptr); + copyToDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts, coupledConstraintsCountsHost.data(), + 0, kernelParams_.numConstraintsThreads, + stream_, GpuApiCallBehavior::Sync, nullptr); + copyToDeviceBuffer(&kernelParams_.d_coupledConstraintsIdxes, coupledConstraintsIdxesHost.data(), + 0, maxCoupledConstraints*kernelParams_.numConstraintsThreads, + stream_, GpuApiCallBehavior::Sync, nullptr); + copyToDeviceBuffer(&kernelParams_.d_massFactors, massFactorsHost.data(), + 0, maxCoupledConstraints*kernelParams_.numConstraintsThreads, + stream_, GpuApiCallBehavior::Sync, nullptr); + + GMX_RELEASE_ASSERT(md.invmass != nullptr, "Masses of atoms should be specified.\n"); + copyToDeviceBuffer(&kernelParams_.d_inverseMasses, md.invmass, + 0, numAtoms_, + stream_, GpuApiCallBehavior::Sync, nullptr); + +} + +/*! \brief + * Update PBC data. + * + * Converts pbc data from t_pbc into the PbcAiuc format and stores the latter. + * + * \param[in] pbc The PBC data in t_pbc format. + */ +void LincsCuda::Impl::setPbc(const t_pbc *pbc) +{ + setPbcAiuc(pbc->ndim_ePBC, pbc->box, &kernelParams_.pbcAiuc); +} + +/*! \brief + * Copy coordinates from provided CPU location to GPU. + * + * Copies the coordinates before the integration step (x) and coordinates + * after the integration step (xp) from the provided CPU location to GPU. + * The data are assumed to be in float3/fvec format (single precision). + * + * \param[in] h_x CPU pointer where coordinates should be copied from. + * \param[in] h_xp CPU pointer where coordinates should be copied from. + */ +void LincsCuda::Impl::copyCoordinatesToGpu(const rvec *h_x, const rvec *h_xp) +{ + copyToDeviceBuffer(&kernelParams_.d_x, (float3*)h_x, 0, numAtoms_, stream_, GpuApiCallBehavior::Sync, nullptr); + copyToDeviceBuffer(&kernelParams_.d_xp, (float3*)h_xp, 0, numAtoms_, stream_, GpuApiCallBehavior::Sync, nullptr); +} + +/*!
\brief + * Copy velocities from provided CPU location to GPU. + * + * Nothing is done if the argument provided is a nullptr. + * The data are assumed to be in float3/fvec format (single precision). + * + * \param[in] h_v CPU pointer where velocities should be copied from. + */ +void LincsCuda::Impl::copyVelocitiesToGpu(const rvec *h_v) +{ + if (h_v != nullptr) + { + copyToDeviceBuffer(&kernelParams_.d_v, (float3*)h_v, 0, numAtoms_, stream_, GpuApiCallBehavior::Sync, nullptr); + } +} + +/*! \brief + * Copy coordinates from GPU to provided CPU location. + * + * Copies the constrained coordinates to the provided location. The coordinates + * are assumed to be in float3/fvec format (single precision). + * + * \param[out] h_xp CPU pointer where coordinates should be copied to. + */ +void LincsCuda::Impl::copyCoordinatesFromGpu(rvec *h_xp) +{ + copyFromDeviceBuffer((float3*)h_xp, &kernelParams_.d_xp, 0, numAtoms_, stream_, GpuApiCallBehavior::Sync, nullptr); +} + +/*! \brief + * Copy velocities from GPU to provided CPU location. + * + * The velocities are assumed to be in float3/fvec format (single precision). + * + * \param[in] h_v Pointer to velocities data. + */ +void LincsCuda::Impl::copyVelocitiesFromGpu(rvec *h_v) +{ + copyFromDeviceBuffer((float3*)h_v, &kernelParams_.d_v, 0, numAtoms_, stream_, GpuApiCallBehavior::Sync, nullptr); +} + +/*! \brief + * Set the internal GPU-memory x, xprime and v pointers. + * + * Data is not copied. The data are assumed to be in float3/fvec format + * (float3 is used internally, but the data layout should be identical). + * + * \param[in] d_x Pointer to the coordinates before integrator update (on GPU) + * \param[in] d_xp Pointer to the coordinates after integrator update, before update (on GPU) + * \param[in] d_v Pointer to the velocities before integrator update (on GPU) + */ +void LincsCuda::Impl::setXVPointers(rvec *d_x, rvec *d_xp, rvec *d_v) +{ + kernelParams_.d_x = (float3*)d_x; + kernelParams_.d_xp = (float3*)d_xp; + kernelParams_.d_v = (float3*)d_v; +} + + +LincsCuda::LincsCuda(const int numAtoms, + const int numIterations, + const int expansionOrder) + : impl_(new Impl(numAtoms, numIterations, expansionOrder)) +{ +} + +LincsCuda::~LincsCuda() = default; + +void LincsCuda::apply(const bool updateVelocities, + const real invdt, + const bool computeVirial, + tensor virialScaled) +{ + impl_->apply(updateVelocities, + invdt, + computeVirial, + virialScaled); +} + +void LincsCuda::setPbc(const t_pbc *pbc) +{ + impl_->setPbc(pbc); +} + +void LincsCuda::set(const t_idef &idef, + const t_mdatoms &md) +{ + impl_->set(idef, md); +} + +void LincsCuda::copyCoordinatesToGpu(const rvec *h_x, const rvec *h_xp) +{ + impl_->copyCoordinatesToGpu(h_x, h_xp); +} + +void LincsCuda::copyVelocitiesToGpu(const rvec *h_v) +{ + impl_->copyVelocitiesToGpu(h_v); +} + +void LincsCuda::copyCoordinatesFromGpu(rvec *h_xp) +{ + impl_->copyCoordinatesFromGpu(h_xp); +} + +void LincsCuda::copyVelocitiesFromGpu(rvec *h_v) +{ + impl_->copyVelocitiesFromGpu(h_v); +} + +void LincsCuda::setXVPointers(rvec *d_x, rvec *d_xp, rvec *d_v) +{ + impl_->setXVPointers(d_x, d_xp, d_v); +} + +} // namespace gmx diff --git a/src/gromacs/mdlib/lincs_cuda_impl.h b/src/gromacs/mdlib/lincs_cuda_impl.h new file mode 100644 index 0000000000..8876bb9d69 --- /dev/null +++ b/src/gromacs/mdlib/lincs_cuda_impl.h @@ -0,0 +1,257 @@ +/* + * This file is part of the GROMACS molecular simulation package. 
+ * + * Copyright (c) 2019, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * + * \brief Declares CUDA implementation class for LINCS + * + * This header file is needed to include from both the device-side + * kernels file, and the host-side management code. + * + * \author Artem Zhmurov + * + * \ingroup module_mdlib + */ +#ifndef GMX_MDLIB_LINCS_CUDA_IMPL_H +#define GMX_MDLIB_LINCS_CUDA_IMPL_H + + +#include "gromacs/mdlib/constr.h" +#include "gromacs/mdlib/lincs_cuda.h" +#include "gromacs/mdtypes/mdatom.h" +#include "gromacs/pbcutil/pbc_aiuc_cuda.cuh" +#include "gromacs/topology/idef.h" + +namespace gmx +{ + +/* \brief LINCS parameters and GPU pointers + * + * This is used to accumulate all the parameters and pointers so they can be passed + * to the GPU as a single structure. + * + */ +struct LincsCudaKernelParameters +{ + //! Periodic boundary data + PbcAiuc pbcAiuc; + //! Order of expansion when inverting the matrix + int expansionOrder; + //! Number of iterations used to correct the projection + int numIterations; + //! Coordinates before the timestep (in the GPU memory) + float3 *d_x; + //! Coordinates after the timestep, before constraining (on GPU) + float3 *d_xp; + //! Velocities of atoms (on GPU) + float3 *d_v; + //! 1/mass for all atoms (GPU) + float *d_inverseMasses; + //! Scaled virial tensor (6 floats: [XX, XY, XZ, YY, YZ, ZZ], GPU) + float *d_virialScaled; + /*! \brief Total number of threads. + * + * This covers all constraints and gaps in the ends of the thread blocks + * that are necessary to avoid inter-block synchronizations. + * Should be a multiple of block size (the last block is filled with dummy to the end). + */ + int numConstraintsThreads; + //! List of constrained atoms (GPU memory) + int2 *d_constraints; + //! Equilibrium distances for the constraints (GPU) + float *d_constraintsTargetLengths; + //! 
+    //! Number of constraints coupled to the current one (GPU)
+    int        *d_coupledConstraintsCounts;
+    //! List of indices of constraints coupled to the current one (GPU)
+    int        *d_coupledConstraintsIdxes;
+    //! Elements of the coupling matrix
+    float      *d_matrixA;
+    //! Mass factors (GPU)
+    float      *d_massFactors;
+};
+
+/*! \internal \brief Class with interfaces and data for the CUDA version of LINCS. */
+class LincsCuda::Impl
+{
+
+    public:
+
+        /*! \brief Constructor.
+         *
+         * \param[in] numAtoms        Number of atoms.
+         * \param[in] numIterations   Number of iterations used to correct the projection.
+         * \param[in] expansionOrder  Order of the matrix inversion algorithm.
+         */
+        Impl(int numAtoms,
+             int numIterations,
+             int expansionOrder);
+        /*! \brief Destructor.*/
+        ~Impl();
+
+        /*! \brief Apply LINCS.
+         *
+         * Applies LINCS to coordinates and velocities stored on the GPU.
+         * The data at the xPrime and v pointers (class fields) are changed
+         * in GPU memory. The results are not automatically copied back to
+         * CPU memory. This method uses the data structures of this class,
+         * which should be updated when needed using the set() method.
+         *
+         * \param[in]     updateVelocities  If the velocities should be constrained.
+         * \param[in]     invdt             Reciprocal timestep (to scale Lagrange
+         *                                  multipliers when velocities are updated).
+         * \param[in]     computeVirial     If the virial should be updated.
+         * \param[in,out] virialScaled      Scaled virial tensor to be updated.
+         */
+        void apply(bool    updateVelocities,
+                   real    invdt,
+                   bool    computeVirial,
+                   tensor  virialScaled);
+
+        /*! \brief
+         * Update data-structures (e.g. after NB search step).
+         *
+         * Updates the constraints data and copies it to the GPU. Should be
+         * called if the particles were sorted, redistributed between domains, etc.
+         * This version uses common data formats so it can be called from anywhere
+         * in the code. Does not recycle the data preparation routines from the CPU
+         * version. Works only with the simple case when all the constraints in idef
+         * are handled by a single GPU. Triangles are not handled as a special case.
+         *
+         * Information about constraints is taken from:
+         *     idef.il[F_CONSTR].iatoms  --- type (T) of constraint and two atom indexes (i1, i2)
+         *     idef.iparams[T].constr.dA --- target length for constraint of type T
+         * From t_mdatoms, the code takes:
+         *     md.invmass  --- array of inverse masses for each atom in the system.
+         *
+         * \param[in] idef  Local topology data to get information on constraints from.
+         * \param[in] md    Atoms data to get atom masses from.
+         */
+        void set(const t_idef    &idef,
+                 const t_mdatoms &md);
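To make the layout documented for set() concrete, the loop below shows how the constraints stream can be traversed on the host. It is an illustrative sketch of the documented format only, not the implementation from lincs_cuda_impl.cu:

    // Each constraint occupies three consecutive ints in idef.il[F_CONSTR].iatoms.
    const t_iatom *iatoms         = idef.il[F_CONSTR].iatoms;
    const int      numConstraints = idef.il[F_CONSTR].nr/3;
    for (int c = 0; c < numConstraints; c++)
    {
        const int  type         = iatoms[3*c];     // constraint type T
        const int  i1           = iatoms[3*c + 1]; // first constrained atom
        const int  i2           = iatoms[3*c + 2]; // second constrained atom
        const real targetLength = idef.iparams[type].constr.dA;
        // (i1, i2) are packed into d_constraints and targetLength into
        // d_constraintsTargetLengths before the upload to the GPU.
    }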
+        /*! \brief
+         * Update PBC data.
+         *
+         * Converts PBC data from the t_pbc format into the PbcAiuc format and stores the latter.
+         *
+         * \param[in] pbc  The PBC data in t_pbc format.
+         */
+        void setPbc(const t_pbc *pbc);
+
+        /*! \brief
+         * Copy coordinates from provided CPU location to GPU.
+         *
+         * Copies the coordinates before the integration step (x) and the coordinates
+         * after the integration step (xp) from the provided CPU location to the GPU.
+         * The data are assumed to be in float3/fvec format (single precision).
+         *
+         * \param[in] h_x   CPU pointer to copy the coordinates before the integration step from.
+         * \param[in] h_xp  CPU pointer to copy the coordinates after the integration step from.
+         */
+        void copyCoordinatesToGpu(const rvec *h_x, const rvec *h_xp);
+
+        /*! \brief
+         * Copy velocities from provided CPU location to GPU.
+         *
+         * Nothing is done if the argument provided is a nullptr.
+         * The data are assumed to be in float3/fvec format (single precision).
+         *
+         * \param[in] h_v  CPU pointer where velocities should be copied from.
+         */
+        void copyVelocitiesToGpu(const rvec *h_v);
+
+        /*! \brief
+         * Copy coordinates from GPU to provided CPU location.
+         *
+         * Copies the constrained coordinates to the provided location. The coordinates
+         * are assumed to be in float3/fvec format (single precision).
+         *
+         * \param[out] h_xp  CPU pointer where coordinates should be copied to.
+         */
+        void copyCoordinatesFromGpu(rvec *h_xp);
+
+        /*! \brief
+         * Copy velocities from GPU to provided CPU location.
+         *
+         * The velocities are assumed to be in float3/fvec format (single precision).
+         *
+         * \param[out] h_v  CPU pointer where velocities should be copied to.
+         */
+        void copyVelocitiesFromGpu(rvec *h_v);
+
+        /*! \brief
+         * Set the internal GPU-memory x, xprime and v pointers.
+         *
+         * Data is not copied. The data are assumed to be in float3/fvec format
+         * (float3 is used internally, but the data layout should be identical).
+         *
+         * \param[in] d_x   Pointer to the coordinates before integrator update (on GPU)
+         * \param[in] d_xp  Pointer to the coordinates after integrator update, before constraining (on GPU)
+         * \param[in] d_v   Pointer to the velocities before integrator update (on GPU)
+         */
+        void setXVPointers(rvec *d_x, rvec *d_xp, rvec *d_v);
+
+    private:
+
+        /*! \brief CUDA stream
+         *
+         * \todo Currently the default stream (nullptr) is used. The streams should
+         *       be managed outside this module and passed here if needed.
+         */
+        cudaStream_t               stream_;
+
+        //! Parameters and pointers, passed to the CUDA kernel
+        LincsCudaKernelParameters  kernelParams_;
+
+        /*! \brief Number of atoms
+         *
+         * \todo Probably this should not be stored by this class, but passed as an
+         *       argument when needed. This question is coupled with the general
+         *       relocation of the coordinates and velocities.
+         */
+        int                        numAtoms_;
+
+        //! Scaled virial tensor (6 floats: [XX, XY, XZ, YY, YZ, ZZ])
+        std::vector<float>         h_virialScaled_;
+
+        /*! \brief Maximum total number of constraints assigned to this class so far.
+         *
+         * If the new number of constraints is larger than the previous maximum,
+         * the GPU data arrays are reallocated.
+         */
+        int                        maxConstraintsNumberSoFar_;
+};
+
+} // namespace gmx
+
+#endif
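Both h_virialScaled_ and d_virialScaled use the packed [XX, XY, XZ, YY, YZ, ZZ] layout, which maps onto a symmetric tensor. The helper below is a hypothetical illustration of that mapping (the actual accumulation happens in the implementation file):

    // Sketch: expanding the packed 6-float scaled-virial buffer into a
    // symmetric 3x3 tensor. Hypothetical helper, illustrating the layout only.
    static void expandScaledVirial(const std::vector<float> &v6, tensor virialScaled)
    {
        virialScaled[XX][XX] += v6[0];
        virialScaled[XX][YY] += v6[1];
        virialScaled[XX][ZZ] += v6[2];
        virialScaled[YY][XX] += v6[1]; // symmetric counterpart of XY
        virialScaled[YY][YY] += v6[3];
        virialScaled[YY][ZZ] += v6[4];
        virialScaled[ZZ][XX] += v6[2]; // symmetric counterpart of XZ
        virialScaled[ZZ][YY] += v6[4]; // symmetric counterpart of YZ
        virialScaled[ZZ][ZZ] += v6[5];
    }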
diff --git a/src/gromacs/mdlib/tests/constr.cpp b/src/gromacs/mdlib/tests/constr.cpp
index 6f05453dcd..a5f7a3b215 100644
--- a/src/gromacs/mdlib/tests/constr.cpp
+++ b/src/gromacs/mdlib/tests/constr.cpp
@@ -35,6 +35,12 @@
 /*! \internal \file
  * \brief SHAKE and LINCS tests.
  *
+ * \todo Better tests for the virial are needed.
+ * \todo Tests for bigger systems to test threads synchronization,
+ *       reduction, etc. on the GPU.
+ * \todo Tests for algorithms for derivatives.
+ * \todo Free-energy perturbation tests.
+ *
  * \author Artem Zhmurov
  * \ingroup module_mdlib
 */
@@ -43,6 +49,8 @@
 
 #include "gromacs/mdlib/constr.h"
 
+#include "config.h"
+
 #include
 
 #include
 
@@ -61,6 +69,7 @@
 #include "gromacs/math/vectypes.h"
 #include "gromacs/mdlib/gmx_omp_nthreads.h"
 #include "gromacs/mdlib/lincs.h"
+#include "gromacs/mdlib/lincs_cuda.h"
 #include "gromacs/mdlib/shake.h"
 #include "gromacs/mdtypes/commrec.h"
 #include "gromacs/mdtypes/inputrec.h"
@@ -91,38 +100,63 @@ namespace test
 struct ConstraintsTestData
 {
     public:
-        std::string           title_;             //!< Human-friendly name for a system
-        int                   nAtom_;             //!< Number of atoms
-        gmx_mtop_t            mtop_;              //!< Topology
-        std::vector<real>     masses_;            //!< Masses
-        std::vector<real>     invmass_;           //!< Inverse masses
-        t_commrec             cr_;                //!< Communication record
-        t_inputrec            ir_;                //!< Input record (info that usually in .mdp file)
-        t_idef                idef_;              //!< Local topology
-        t_mdatoms             md_;                //!< MD atoms
-        gmx_multisim_t        ms_;                //!< Multisim data
-        t_nrnb                nrnb_;              //!< Computational time array (normally used in benchmarks)
-
-        real                  invdt_;             //!< Inverse timestep
-        int                   nflexcon_ = 0;      //!< Number of flexible constraints
-        bool                  computeVirial_;     //!< Whether the virial should be computed
-        tensor                virialScaled_;      //!< Scaled virial
-        tensor                virialScaledRef_;   //!< Scaled virial (reference values)
-        bool                  compute_dHdLambda_; //!< If the free energy is computed
-        real                  dHdLambda_;         //!< For free energy computation
-        real                  dHdLambdaRef_;      //!< For free energy computation (reference value)
-
-        PaddedVector<RVec>    x_;                 //!< Coordinates before the timestep
-        PaddedVector<RVec>    xPrime_;            //!< Coordinates after timestep, output for the constraints
-        PaddedVector<RVec>    xPrime0_;           //!< Backup for coordinates (for reset)
-        PaddedVector<RVec>    xPrime2_;           //!< Intermediate set of coordinates used by LINCS and
-                                                  //!< SHAKE for different purposes
-        PaddedVector<RVec>    v_;                 //!< Velocities
-        PaddedVector<RVec>    v0_;                //!< Backup for velocities (for reset)
-
-        // Fields to store constraints data for testing
-        std::vector<int>      constraints_;       //!< Constraints data (type1-i1-j1-type2-i2-j2-...)
-        std::vector<real>     constraintsR0_;     //!< Target lengths for all constraint types
+        //! Human-friendly name for a system
+        std::string           title_;
+        //! Number of atoms
+        int                   numAtoms_;
+        //! Topology
+        gmx_mtop_t            mtop_;
+        //! Masses
+        std::vector<real>     masses_;
+        //! Inverse masses
+        std::vector<real>     invmass_;
+        //! Communication record
+        t_commrec             cr_;
+        //! Input record (information that is usually in the .mdp file)
+        t_inputrec            ir_;
+        //! Local topology
+        t_idef                idef_;
+        //! MD atoms
+        t_mdatoms             md_;
+        //! Multisim data
+        gmx_multisim_t        ms_;
+        //! Computational time array (normally used to benchmark performance)
+        t_nrnb                nrnb_;
+
+        //! Inverse timestep
+        real                  invdt_;
+        //! Number of flexible constraints
+        int                   nflexcon_ = 0;
+        //! Whether the virial should be computed
+        bool                  computeVirial_;
+        //! Scaled virial
+        tensor                virialScaled_;
+        //! Scaled virial (reference values)
+        tensor                virialScaledRef_;
+        //! If the free energy is computed
+        bool                  compute_dHdLambda_;
+        //! For free energy computation
+        real                  dHdLambda_;
+        //! For free energy computation (reference value)
+        real                  dHdLambdaRef_;
+
+        //! Coordinates before the timestep
+        PaddedVector<RVec>    x_;
+        //! Coordinates after timestep, output for the constraints
+        PaddedVector<RVec>    xPrime_;
+        //! Backup for coordinates (for reset)
+        PaddedVector<RVec>    xPrime0_;
+        //! Intermediate set of coordinates (normally used for projection correction)
+        PaddedVector<RVec>    xPrime2_;
+        //! Velocities
+        PaddedVector<RVec>    v_;
+        //! Backup for velocities (for reset)
+        PaddedVector<RVec>    v0_;
+
+        //! Constraints data (type1-i1-j1-type2-i2-j2-...)
+        std::vector<int>      constraints_;
+        //! Target lengths for all constraint types
+        std::vector<real>     constraintsR0_;
 
         /*! \brief
          * Constructor for the object with all parameters and variables needed by constraints algorithms.
@@ -132,57 +166,57 @@ struct ConstraintsTestData
          * are saved to allow for reset. The constraints data are stored for testing after constraints
          * were applied.
          *
-         * \param[in] title          Human-friendly name of the system.
-         * \param[in] nAtom          Number of atoms in the system.
-         * \param[in] masses         Atom masses. Size of this vector should be equal to nAtom.
-         * \param[in] constraints    List of constraints, organized in triples of integers.
-         *                           First integer is the index of type for a constraint, second
-         *                           and third are the indices of constrained atoms. The types
-         *                           of constraints should be sequential but not necessarily
-         *                           start from zero (which is the way they normally are in
-         *                           GROMACS).
-         * \param[in] constraintsR0  Target values for bond lengths for bonds of each type. The
-         *                           size of this vector should be equal to the total number of
-         *                           unique types in constraints vector.
-         * \param[in] computeVirial  Whether the virial should be computed.
-         * \param[in] virialScaledRef Reference values for scaled virial tensor.
-         * \param[in] compute_dHdLambda Whether free energy should be computed.
-         * \param[in] dHdLambdaRef   Reference value for dHdLambda.
-         * \param[in] initialTime    Initial time.
-         * \param[in] timestep       Timestep.
-         * \param[in] x              Coordinates before integration step.
-         * \param[in] xPrime         Coordinates after integration step, but before constraining.
-         * \param[in] v              Velocities before constraining.
-         * \param[in] shakeTolerance Target tolerance for SHAKE.
-         * \param[in] shakeUseSOR    Use successive over-relaxation method for SHAKE iterations.
-         *                           The general formula is:
-         *                           x_n+1 = (1-omega)*x_n + omega*f(x_n),
-         *                           where omega = 1 if SOR is off and may be < 1 if SOR is on.
-         * \param[in] nLincsIter     Number of iterations used to compute the inverse matrix.
-         * \param[in] nProjOrder     The order for algorithm that adjusts the direction of the
-         *                           bond after constraints are applied.
-         * \param[in] lincsWarnAngle The threshold value for the change in bond angle. When
-         *                           exceeded the program will issue a warning.
+         * \param[in] title               Human-friendly name of the system.
+         * \param[in] numAtoms            Number of atoms in the system.
+         * \param[in] masses              Atom masses. Size of this vector should be equal to numAtoms.
+         * \param[in] constraints         List of constraints, organized in triples of integers.
+         *                                First integer is the index of type for a constraint, second
+         *                                and third are the indices of constrained atoms. The types
+         *                                of constraints should be sequential but not necessarily
+         *                                start from zero (which is the way they normally are in
+         *                                GROMACS).
+         * \param[in] constraintsR0       Target values for bond lengths for bonds of each type. The
+         *                                size of this vector should be equal to the total number of
+         *                                unique types in constraints vector.
+         * \param[in] computeVirial       Whether the virial should be computed.
+         * \param[in] virialScaledRef     Reference values for scaled virial tensor.
+         * \param[in] compute_dHdLambda   Whether free energy should be computed.
+         * \param[in] dHdLambdaRef        Reference value for dHdLambda.
+         * \param[in] initialTime         Initial time.
+         * \param[in] timestep            Timestep.
+         * \param[in] x                   Coordinates before integration step.
+         * \param[in] xPrime              Coordinates after integration step, but before constraining.
+         * \param[in] v                   Velocities before constraining.
+         * \param[in] shakeTolerance      Target tolerance for SHAKE.
+         * \param[in] shakeUseSOR         Use the successive over-relaxation method for SHAKE iterations.
+         *                                The general formula is:
+         *                                x_n+1 = (1-omega)*x_n + omega*f(x_n),
+         *                                where omega = 1 if SOR is off and may be < 1 if SOR is on.
+         * \param[in] lincsNumIterations  Number of iterations used to correct the projection.
+         * \param[in] lincsExpansionOrder The order of the matrix expansion used when inverting the
+         *                                constraint coupling matrix.
+         * \param[in] lincsWarnAngle      The threshold value for the change in bond angle. When
+         *                                exceeded the program will issue a warning.
          *
          */
        ConstraintsTestData(const std::string &title,
-                            int nAtom, std::vector<real> masses,
+                            int numAtoms, std::vector<real> masses,
                            std::vector<int> constraints, std::vector<real> constraintsR0,
                            bool computeVirial, tensor virialScaledRef,
                            bool compute_dHdLambda, float dHdLambdaRef,
                            real initialTime, real timestep,
                            const std::vector<RVec> &x, const std::vector<RVec> &xPrime, const std::vector<RVec> &v,
                            real shakeTolerance, gmx_bool shakeUseSOR,
-                            int nLincsIter, int nProjOrder, real lincsWarnAngle)
+                            int lincsNumIterations, int lincsExpansionOrder, real lincsWarnAngle)
        {
-            title_ = title;   // Human-friendly name of the system
-            nAtom_ = nAtom;   // Number of atoms
+            title_    = title;    // Human-friendly name of the system
+            numAtoms_ = numAtoms; // Number of atoms
 
            // Masses of atoms
            masses_ = masses;
-            invmass_.resize(nAtom); // Vector of inverse masses
+            invmass_.resize(numAtoms); // Vector of inverse masses
 
-            for (int i = 0; i < nAtom; i++)
+            for (int i = 0; i < numAtoms; i++)
            {
                invmass_[i] = 1.0/masses.at(i);
            }
@@ -211,8 +245,8 @@ struct ConstraintsTestData
            md_.nMassPerturbed = 0;
            md_.lambda         = 0.0;
            md_.invmass        = invmass_.data();
-            md_.nr     = nAtom;
-            md_.homenr = nAtom;
+            md_.nr     = numAtoms;
+            md_.homenr = numAtoms;
 
            // Virial evaluation
            computeVirial_ = computeVirial;
@@ -281,7 +315,7 @@ struct ConstraintsTestData
            interactionListEmpty.iatoms.resize(0);
 
            gmx_moltype_t molType;
-            molType.atoms.nr = nAtom;
+            molType.atoms.nr = numAtoms;
            molType.ilist.at(F_CONSTR)   = interactionList;
            molType.ilist.at(F_CONSTRNC) = interactionListEmpty;
            mtop_.moltype.push_back(molType);
@@ -291,7 +325,7 @@ struct ConstraintsTestData
            molBlock.nmol  = 1;
            mtop_.molblock.push_back(molBlock);
 
-            mtop_.natoms = nAtom;
+            mtop_.natoms = numAtoms;
            mtop_.ffparams.iparams.resize(maxType + 1);
            for (int i = 0; i <= maxType; i++)
            {
@@ -300,13 +334,13 @@ struct ConstraintsTestData
            mtop_.bIntermolecularInteractions = false;
 
            // Coordinates and velocities
-            x_.resizeWithPadding(nAtom_);
-            xPrime_.resizeWithPadding(nAtom_);
-            xPrime0_.resizeWithPadding(nAtom_);
-            xPrime2_.resizeWithPadding(nAtom_);
+            x_.resizeWithPadding(numAtoms);
+            xPrime_.resizeWithPadding(numAtoms);
+            xPrime0_.resizeWithPadding(numAtoms);
+            xPrime2_.resizeWithPadding(numAtoms);
 
-            v_.resizeWithPadding(nAtom_);
-            v0_.resizeWithPadding(nAtom_);
+            v_.resizeWithPadding(numAtoms);
+            v0_.resizeWithPadding(numAtoms);
 
            std::copy(x.begin(), x.end(), x_.begin());
            std::copy(xPrime.begin(), xPrime.end(), xPrime_.begin());
@@ -317,13 +351,13 @@ struct ConstraintsTestData
            std::copy(v.begin(), v.end(), v0_.begin());
 
            // SHAKE-specific parameters
-            ir_.shake_tol      = shakeTolerance;
-            ir_.bShakeSOR      = shakeUseSOR;
+            ir_.shake_tol      = shakeTolerance;
+            ir_.bShakeSOR      = shakeUseSOR;
 
            // LINCS-specific parameters
-            ir_.nLincsIter     = nLincsIter;
-            ir_.nProjOrder     = nProjOrder;
-            ir_.LincsWarnAngle = lincsWarnAngle;
+            ir_.nLincsIter     = lincsNumIterations;
+            ir_.nProjOrder     = lincsExpansionOrder;
+            ir_.LincsWarnAngle = lincsWarnAngle;
        }
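To illustrate the constraints encoding documented above: a three-atom chain with one bond of type 0 between atoms 0 and 1 and one bond of type 1 between atoms 1 and 2 (the same shape as the ThreeSequentialConstraints case below) would be passed as:

    // Triples encoding {type, i, j} per constraint, plus per-type target lengths.
    std::vector<int>  constraints   = {0, 0, 1,    // type 0 constrains atoms 0 and 1
                                       1, 1, 2};   // type 1 constrains atoms 1 and 2
    std::vector<real> constraintsR0 = {0.1, 0.2};  // target lengths (nm) for types 0 and 1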
        /*! \brief
@@ -368,7 +402,7 @@ struct ConstraintsTestData
  * The test will run for all possible combinations of accessible
  * values of the:
  * 1. PBC setup ("PBCNONE" or "PBCXYZ")
- * 2. The algorithm ("SHAKE" or "LINCS").
+ * 2. The algorithm ("SHAKE", "LINCS" or "LINCS_CUDA").
  */
 typedef std::tuple<std::string, std::string> ConstraintsTestParameters;
 
@@ -429,6 +463,10 @@ class ConstraintsTest : public ::testing::TestWithParam<ConstraintsTestParameters>
+        /*! \brief
+         * Initialize and apply LINCS constraints on a CUDA-enabled GPU.
+         *
+         * \param[in] testData  Test data structure.
+         * \param[in] pbc       Periodic boundary data.
+         */
+        void applyLincsCuda(ConstraintsTestData *testData, t_pbc pbc)
+        {
+            std::unique_ptr<LincsCuda> lincsCuda;
+            lincsCuda = std::make_unique<LincsCuda>(testData->numAtoms_,
+                                                    testData->ir_.nLincsIter,
+                                                    testData->ir_.nProjOrder);
+            lincsCuda->set(testData->idef_, testData->md_);
+            lincsCuda->setPbc(&pbc);
+            lincsCuda->copyCoordinatesToGpu(as_rvec_array(testData->x_.data()),
+                                            as_rvec_array(testData->xPrime_.data()));
+            lincsCuda->copyVelocitiesToGpu(as_rvec_array(testData->v_.data()));
+            lincsCuda->apply(true, testData->invdt_, testData->computeVirial_, testData->virialScaled_);
+            lincsCuda->copyCoordinatesFromGpu(as_rvec_array(testData->xPrime_.data()));
+            lincsCuda->copyVelocitiesFromGpu(as_rvec_array(testData->v_.data()));
+        }
+
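The per-parameter dispatch that routes a test to this helper is not visible in these hunks; a sketch of how such a dispatch could look, where the CPU-side helper name is hypothetical:

    // Sketch of the parameterized dispatch (illustration only).
    std::string pbcName, algorithmName;
    std::tie(pbcName, algorithmName) = GetParam();
    if (algorithmName == "LINCS_CUDA")
    {
        applyLincsCuda(testData.get(), pbc);      // GPU path added by this patch
    }
    else
    {
        applyCpuConstraints(testData.get(), pbc,  // hypothetical CPU path covering
                            algorithmName);       // "SHAKE" and "LINCS"
    }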
        /*! \brief
         * The test on the final length of constrained bonds.
         *
         * Goes through all the constraints and checks if the final length of all the constraints is equal
@@ -613,14 +672,14 @@ class ConstraintsTest : public ::testing::TestWithParam<ConstraintsTestParameters>
                              std::vector<RVec>          xPrimeRef,
                              FloatingPointTolerance     tolerance,
                              const ConstraintsTestData &testData)
        {
-            for (int i = 0; i < testData.nAtom_; i++)
+            for (int i = 0; i < testData.numAtoms_; i++)
            {
                for (int d = 0; d < DIM; d++)
                {
@@ -721,13 +780,11 @@ class ConstraintsTest : public ::testing::TestWithParam<ConstraintsTestParameters>
 
 TEST_P(ConstraintsTest, SingleConstraint){
-    std::string title = "one constraint (e.g. OH)";
-    int nAtom = 2;
+    std::string title    = "one constraint (e.g. OH)";
+    int         numAtoms = 2;
 
     std::vector<real> masses = {1.0, 12.0};
     std::vector<int>  constraints = {0, 0, 1};
@@ -744,24 +801,24 @@ TEST_P(ConstraintsTest, SingleConstraint){
 
     tensor virialScaledRef = {{-5.58e-04,  5.58e-04,  0.00e+00 },
                               { 5.58e-04, -5.58e-04,  0.00e+00 },
-                              { 0.00e+00,  0.00e+00,  0.00e+00 } };
+                              { 0.00e+00,  0.00e+00,  0.00e+00 }};
 
     real     shakeTolerance         = 0.0001;
     gmx_bool shakeUseSOR            = false;
 
-    int      lincsNIter             = 1;
-    int      lincsNProjOrder        = 4;
-    real     lincsWarnAngle         = 30.0;
+    int      lincsNIter             = 1;
+    int      lincsExpansionOrder    = 4;
+    real     lincsWarnAngle         = 30.0;
 
     std::unique_ptr<ConstraintsTestData> testData = std::make_unique<ConstraintsTestData>
-            (title, nAtom, masses,
+            (title, numAtoms, masses,
             constraints, constraintsR0,
             true, virialScaledRef,
             false, 0,
             real(0.0), real(0.001),
             x, xPrime, v,
             shakeTolerance, shakeUseSOR,
-            lincsNIter, lincsNProjOrder, lincsWarnAngle);
+            lincsNIter, lincsExpansionOrder, lincsWarnAngle);
     std::string pbcName;
     std::string algorithmName;
     std::tie(pbcName, algorithmName) = GetParam();
@@ -781,11 +838,11 @@ TEST_P(ConstraintsTest, SingleConstraint){
 
 TEST_P(ConstraintsTest, TwoDisjointConstraints){
 
-    std::string title = "two disjoint constraints";
-    int nAtom = 4;
-    std::vector<real> masses = {0.5, 1.0/3.0, 0.25, 1.0};
-    std::vector<int> constraints = {0, 0, 1, 1, 2, 3};
-    std::vector<real> constraintsR0 = {2.0, 1.0};
+    std::string       title         = "two disjoint constraints";
+    int               numAtoms      = 4;
+    std::vector<real> masses        = {0.5, 1.0/3.0, 0.25, 1.0};
+    std::vector<int>  constraints   = {0, 0, 1, 1, 2, 3};
+    std::vector<real> constraintsR0 = {2.0, 1.0};
 
     std::vector<RVec> x = {{  2.50, -3.10, 15.70 },
@@ -803,24 +860,26 @@ TEST_P(ConstraintsTest, TwoDisjointConstraints){
                            { 0.0, 0.0, 1.0 },
                            { 0.0, 0.0, 0.0 }};
 
-    tensor virialScaledRef = {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}};
+    tensor virialScaledRef = {{ 3.3e-03, -1.7e-04,  5.6e-04 },
+                              {-1.7e-04,  8.9e-06, -2.8e-05 },
+                              { 5.6e-04, -2.8e-05,  8.9e-05 }};
 
     real     shakeTolerance         = 0.0001;
     gmx_bool shakeUseSOR            = false;
 
-    int      lincsNIter             = 1;
-    int      lincsNProjOrder        = 4;
-    real     lincsWarnAngle         = 30.0;
+    int      lincsNIter             = 1;
+    int      lincsExpansionOrder    = 4;
+    real     lincsWarnAngle         = 30.0;
 
     std::unique_ptr<ConstraintsTestData> testData = std::make_unique<ConstraintsTestData>
-            (title, nAtom, masses,
+            (title, numAtoms, masses,
             constraints, constraintsR0,
-            false, virialScaledRef,
+            true, virialScaledRef,
            false, 0,
            real(0.0), real(0.001),
            x, xPrime, v,
            shakeTolerance, shakeUseSOR,
-            lincsNIter, lincsNProjOrder, lincsWarnAngle);
+            lincsNIter, lincsExpansionOrder, lincsWarnAngle);
 
     std::string pbcName;
     std::string algorithmName;
@@ -835,15 +894,17 @@ TEST_P(ConstraintsTest, TwoDisjointConstraints){
 
     checkCOMCoordinates(absoluteTolerance(0.0001), *testData);
     checkCOMVelocity(absoluteTolerance(0.0001), *testData);
+    checkVirialTensor(absoluteTolerance(0.0001), *testData);
+
 }
 
 TEST_P(ConstraintsTest, ThreeSequentialConstraints){
 
-    std::string title = "three atoms, connected longitudinally (e.g. CH2)";
-    int nAtom = 3;
-    std::vector<real> masses = {1.0, 12.0, 16.0 };
-    std::vector<int> constraints = {0, 0, 1, 1, 1, 2};
-    std::vector<real> constraintsR0 = {0.1, 0.2};
+    std::string       title         = "three atoms, connected longitudinally (e.g. CH2)";
+    int               numAtoms      = 3;
+    std::vector<real> masses        = {1.0, 12.0, 16.0 };
+    std::vector<int>  constraints   = {0, 0, 1, 1, 1, 2};
+    std::vector<real> constraintsR0 = {0.1, 0.2};
 
     real oneTenthOverSqrtTwo    = 0.1_real / std::sqrt(2.0_real);
     real twoTenthsOverSqrtThree = 0.2_real / std::sqrt(3.0_real);
@@ -860,24 +921,26 @@ TEST_P(ConstraintsTest, ThreeSequentialConstraints){
                            { 0.0, 1.0, 0.0 },
                            { 0.0, 0.0, 1.0 }};
 
-    tensor virialScaledRef = {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}};
+    tensor virialScaledRef = {{ 4.14e-03, 4.14e-03, 3.31e-03},
+                              { 4.14e-03, 4.14e-03, 3.31e-03},
+                              { 3.31e-03, 3.31e-03, 3.31e-03}};
 
     real     shakeTolerance         = 0.0001;
     gmx_bool shakeUseSOR            = false;
 
-    int      lincsNIter             = 1;
-    int      lincsNProjOrder        = 4;
-    real     lincsWarnAngle         = 30.0;
+    int      lincsNIter             = 1;
+    int      lincsExpansionOrder    = 4;
+    real     lincsWarnAngle         = 30.0;
 
     std::unique_ptr<ConstraintsTestData> testData = std::make_unique<ConstraintsTestData>
-            (title, nAtom, masses,
+            (title, numAtoms, masses,
             constraints, constraintsR0,
-            false, virialScaledRef,
+            true, virialScaledRef,
            false, 0,
            real(0.0), real(0.001),
            x, xPrime, v,
            shakeTolerance, shakeUseSOR,
-            lincsNIter, lincsNProjOrder, lincsWarnAngle);
+            lincsNIter, lincsExpansionOrder, lincsWarnAngle);
 
     std::string pbcName;
     std::string algorithmName;
@@ -892,15 +955,17 @@ TEST_P(ConstraintsTest, ThreeSequentialConstraints){
 
     checkCOMCoordinates(absoluteTolerance(0.0001), *testData);
     checkCOMVelocity(absoluteTolerance(0.0001), *testData);
+    checkVirialTensor(absoluteTolerance(0.0001), *testData);
+
 }
 
 TEST_P(ConstraintsTest, ThreeConstraintsWithCentralAtom){
 
-    std::string title = "three atoms, connected to the central atom (e.g. CH3)";
-    int nAtom = 4;
-    std::vector<real> masses = {12.0, 1.0, 1.0, 1.0 };
-    std::vector<int> constraints = {0, 0, 1, 0, 0, 2, 0, 0, 3};
-    std::vector<real> constraintsR0 = {0.1};
+    std::string       title         = "three atoms, connected to the central atom (e.g. CH3)";
+    int               numAtoms      = 4;
+    std::vector<real> masses        = {12.0, 1.0, 1.0, 1.0 };
+    std::vector<int>  constraints   = {0, 0, 1, 0, 0, 2, 0, 0, 3};
+    std::vector<real> constraintsR0 = {0.1};
 
     std::vector<RVec> x = {{ 0.00, 0.00, 0.00 },
@@ -918,24 +983,26 @@ TEST_P(ConstraintsTest, ThreeConstraintsWithCentralAtom){
                            { 1.0, 0.0, 0.0 },
                            { 1.0, 0.0, 0.0 }};
 
-    tensor virialScaledRef = {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}};
+    tensor virialScaledRef = {{7.14e-04, 0.00e+00, 0.00e+00},
+                              {0.00e+00, 1.08e-03, 0.00e+00},
+                              {0.00e+00, 0.00e+00, 1.15e-03}};
 
     real     shakeTolerance         = 0.0001;
     gmx_bool shakeUseSOR            = false;
 
-    int      lincsNIter             = 1;
-    int      lincsNProjOrder        = 4;
-    real     lincsWarnAngle         = 30.0;
+    int      lincsNIter             = 1;
+    int      lincsExpansionOrder    = 4;
+    real     lincsWarnAngle         = 30.0;
 
     std::unique_ptr<ConstraintsTestData> testData = std::make_unique<ConstraintsTestData>
-            (title, nAtom, masses,
+            (title, numAtoms, masses,
             constraints, constraintsR0,
-            false, virialScaledRef,
+            true, virialScaledRef,
            false, 0,
            real(0.0), real(0.001),
            x, xPrime, v,
            shakeTolerance, shakeUseSOR,
-            lincsNIter, lincsNProjOrder, lincsWarnAngle);
+            lincsNIter, lincsExpansionOrder, lincsWarnAngle);
 
     std::string pbcName;
     std::string algorithmName;
@@ -950,15 +1017,16 @@ TEST_P(ConstraintsTest, ThreeConstraintsWithCentralAtom){
 
     checkCOMCoordinates(absoluteTolerance(0.0001), *testData);
     checkCOMVelocity(absoluteTolerance(0.0001), *testData);
+    checkVirialTensor(absoluteTolerance(0.0001), *testData);
 }
 
 TEST_P(ConstraintsTest, FourSequentialConstraints){
 
-    std::string title = "four atoms, connected longitudinally";
-    int nAtom = 4;
-    std::vector<real> masses = {0.5, 1.0/3.0, 0.25, 1.0};
-    std::vector<int> constraints = {0, 0, 1, 1, 1, 2, 2, 2, 3};
-    std::vector<real> constraintsR0 = {2.0, 1.0, 1.0};
+    std::string       title         = "four atoms, connected longitudinally";
+    int               numAtoms      = 4;
+    std::vector<real> masses        = {0.5, 1.0/3.0, 0.25, 1.0};
+    std::vector<int>  constraints   = {0, 0, 1, 1, 1, 2, 2, 2, 3};
+    std::vector<real> constraintsR0 = {2.0, 1.0, 1.0};
 
     std::vector<RVec> x = {{  2.50, -3.10, 15.70 },
@@ -976,24 +1044,26 @@ TEST_P(ConstraintsTest, FourSequentialConstraints){
                            {  0.0,  0.0, -4.0 },
                            {  0.0,  0.0, -1.0 }};
 
-    tensor virialScaledRef = {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}};
+    tensor virialScaledRef = {{ 1.15e-01, -4.20e-03,  2.12e-02},
+                              {-4.20e-03,  1.70e-04, -6.41e-04},
+                              { 2.12e-02, -6.41e-04,  5.45e-03}};
 
     real     shakeTolerance         = 0.0001;
     gmx_bool shakeUseSOR            = false;
 
-    int      lincsNIter             = 4;
-    int      lincsNProjOrder        = 8;
-    real     lincsWarnAngle         = 30.0;
+    int      lincsNIter             = 4;
+    int      lincsExpansionOrder    = 8;
+    real     lincsWarnAngle         = 30.0;
 
     std::unique_ptr<ConstraintsTestData> testData = std::make_unique<ConstraintsTestData>
-            (title, nAtom, masses,
+            (title, numAtoms, masses,
             constraints, constraintsR0,
-            false, virialScaledRef,
+            true, virialScaledRef,
            false, 0,
            real(0.0), real(0.001),
            x, xPrime, v,
            shakeTolerance, shakeUseSOR,
-            lincsNIter, lincsNProjOrder, lincsWarnAngle);
+            lincsNIter, lincsExpansionOrder, lincsWarnAngle);
 
     std::string pbcName;
     std::string algorithmName;
@@ -1008,15 +1078,17 @@ TEST_P(ConstraintsTest, FourSequentialConstraints){
 
     checkCOMCoordinates(absoluteTolerance(0.0001), *testData);
     checkCOMVelocity(absoluteTolerance(0.0001), *testData);
+    checkVirialTensor(absoluteTolerance(0.01), *testData);
+
 }
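For reference, the two PBC setups named in the test parameters can be built with the standard set_pbc() call; a minimal sketch follows, where the box values are hypothetical:

    // Sketch: constructing t_pbc for the "PBCNone" and "PBCXYZ" parameter values.
    t_pbc  pbc;
    matrix box = {{10.0, 0.0, 0.0}, {0.0, 10.0, 0.0}, {0.0, 0.0, 10.0}}; // hypothetical box
    if (pbcName == "PBCXYZ")
    {
        set_pbc(&pbc, epbcXYZ, box);
    }
    else // "PBCNone"
    {
        set_pbc(&pbc, epbcNONE, box); // box contents are irrelevant without periodicity
    }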
 
 TEST_P(ConstraintsTest, TriangleOfConstraints){
 
-    std::string title = "basic triangle (tree atoms, connected to each other)";
-    int nAtom = 3;
-    std::vector<real> masses = {1.0, 1.0, 1.0};
-    std::vector<int> constraints = {0, 0, 1, 2, 0, 2, 1, 1, 2};
-    std::vector<real> constraintsR0 = {0.1, 0.1, 0.1};
+    std::string       title         = "basic triangle (three atoms, connected to each other)";
+    int               numAtoms      = 3;
+    std::vector<real> masses        = {1.0, 1.0, 1.0};
+    std::vector<int>  constraints   = {0, 0, 1, 2, 0, 2, 1, 1, 2};
+    std::vector<real> constraintsR0 = {0.1, 0.1, 0.1};
 
     real oneTenthOverSqrtTwo = 0.1_real / std::sqrt(2.0_real);
@@ -1032,24 +1104,26 @@ TEST_P(ConstraintsTest, TriangleOfConstraints){
                            { -2.0, -2.0, -2.0 },
                            {  1.0,  1.0,  1.0 }};
 
-    tensor virialScaledRef = {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}};
+    tensor virialScaledRef = {{ 6.00e-04, -1.61e-03,  1.01e-03},
+                              {-1.61e-03,  2.53e-03, -9.25e-04},
+                              { 1.01e-03, -9.25e-04, -8.05e-05}};
 
     real     shakeTolerance         = 0.0001;
     gmx_bool shakeUseSOR            = false;
 
-    int      lincsNIter             = 1;
-    int      lincsNProjOrder        = 4;
-    real     lincsWarnAngle         = 30.0;
+    int      lincsNIter             = 1;
+    int      lincsExpansionOrder    = 4;
+    real     lincsWarnAngle         = 30.0;
 
     std::unique_ptr<ConstraintsTestData> testData = std::make_unique<ConstraintsTestData>
-            (title, nAtom, masses,
+            (title, numAtoms, masses,
             constraints, constraintsR0,
-            false, virialScaledRef,
+            true, virialScaledRef,
            false, 0,
            real(0.0), real(0.001),
            x, xPrime, v,
            shakeTolerance, shakeUseSOR,
-            lincsNIter, lincsNProjOrder, lincsWarnAngle);
+            lincsNIter, lincsExpansionOrder, lincsWarnAngle);
 
     std::string pbcName;
     std::string algorithmName;
@@ -1064,12 +1138,22 @@ TEST_P(ConstraintsTest, TriangleOfConstraints){
 
     checkCOMCoordinates(absoluteTolerance(0.0001), *testData);
     checkCOMVelocity(absoluteTolerance(0.0001), *testData);
+    checkVirialTensor(absoluteTolerance(0.00001), *testData);
+
 }
 
+#if GMX_GPU == GMX_GPU_CUDA
+INSTANTIATE_TEST_CASE_P(WithParameters, ConstraintsTest,
+                            ::testing::Combine(::testing::Values("PBCNone", "PBCXYZ"),
+                                               ::testing::Values("SHAKE", "LINCS", "LINCS_CUDA")));
+#endif
+
+#if GMX_GPU != GMX_GPU_CUDA
 INSTANTIATE_TEST_CASE_P(WithParameters, ConstraintsTest,
                             ::testing::Combine(::testing::Values("PBCNone", "PBCXYZ"),
                                                ::testing::Values("SHAKE", "LINCS")));
+#endif
 
 } // namespace test
 } // namespace gmx
-- 
2.11.4.GIT