src/gromacs/mdlib/lincs_cuda_impl.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \internal \file
  36  *
  37  * \brief Implements LINCS using CUDA
  38  *
  39  * This file contains implementation of LINCS constraints algorithm
  40  * using CUDA, including class initialization, data-structures management
  41  * and GPU kernel.
  42  *
  43  * \note Management of coordinates, velocities, CUDA stream and periodic boundary exists here as a temporary
  44  *       scaffolding to allow this class to be used as a stand-alone unit. The scaffolding is intended to be
  45  *       removed once constraints are integrated with update module.
  46  * \todo Reconsider naming, i.e. "cuda" suffics should be changed to "gpu".
  47  *
  48  * \author Artem Zhmurov <zhmurov@gmail.com>
  49  * \author Alan Gray <alang@nvidia.com>
  50  *
  51  * \ingroup module_mdlib
  52  */
  53 #include "gmxpre.h"
  54
  55 #include "lincs_cuda_impl.h"
  56
  57 #include <assert.h>
  58 #include <stdio.h>
  59
  60 #include <cmath>
  61
  62 #include <algorithm>
  63
  64 #include "gromacs/gpu_utils/cuda_arch_utils.cuh"
  65 #include "gromacs/gpu_utils/cudautils.cuh"
  66 #include "gromacs/gpu_utils/devicebuffer.cuh"
  67 #include "gromacs/gpu_utils/gputraits.cuh"
  68 #include "gromacs/gpu_utils/vectype_ops.cuh"
  69 #include "gromacs/math/vec.h"
  70 #include "gromacs/mdlib/constr.h"
  71 #include "gromacs/mdlib/lincs_cuda.h"
  72 #include "gromacs/pbcutil/pbc.h"
  73 #include "gromacs/pbcutil/pbc_aiuc_cuda.cuh"
  74 #include "gromacs/topology/ifunc.h"
  75
  76 namespace gmx
  77 {
  78
  79 //! Number of CUDA threads in a block
  80 constexpr static int c_threadsPerBlock = 256;
  81 //! Maximum number of threads in a block (for __launch_bonds__)
  82 constexpr static int c_maxThreadsPerBlock = c_threadsPerBlock;
  83
  84 /*! \brief Main kernel for LINCS constraints.
  85  *
  86  * See Hess et al., J. Comput. Chem. 18: 1463-1472 (1997) for the description of the algorithm.
  87  *
  88  * In CUDA version, one thread is responsible for all computations for one constraint. The blocks are
  89  * filled in a way that no constraint is coupled to the constraint from the next block. This is achieved
  90  * by moving active threads to the next block, if the correspondent group of coupled constraints is to big
  91  * to fit the current thread block. This may leave some 'dummy' threads in the end of the thread block, i.e.
  92  * threads that are not required to do actual work. Since constraints from different blocks are not coupled,
  93  * there is no need to synchronize across the device. However, extensive communication in a thread block
  94  * are still needed.
  95  *
  96  * \todo Reduce synchronization overhead. Some ideas are:
  97  *        1. Consider going to warp-level synchronization for the coupled constraints.
  98  *        2. Move more data to local/shared memory and try to get rid of atomic operations (at least on
  99  *           the device level).
 100  *        3. Use analytical solution for matrix A inversion.
 101  *        4. Introduce mapping of thread id to both single constraint and single atom, thus designating
 102  *           Nth threads to deal with Nat <= Nth coupled atoms and Nc <= Nth coupled constraints.
 103  *       See Redmine issue #2885 for details (https://redmine.gromacs.org/issues/2885)
 104  *
 105  * \param[in,out] kernelParams  All parameters and pointers for the kernel condensed in single struct.
 106  * \param[in]     invdt         Inverse timestep (needed to update velocities).
 107  */
 108 template <bool updateVelocities, bool computeVirial>
 109 __launch_bounds__(c_maxThreadsPerBlock)
 110 __global__ void lincs_kernel(LincsCudaKernelParameters   kernelParams,
 111                              const float                 invdt)
 112 {
 113     const PbcAiuc               pbcAiuc                     = kernelParams.pbcAiuc;
 114     const int                   numConstraintsThreads       = kernelParams.numConstraintsThreads;
 115     const int                   numIterations               = kernelParams.numIterations;
 116     const int                   expansionOrder              = kernelParams.expansionOrder;
 117     const float3* __restrict__  gm_x                        = kernelParams.d_x;
 118     float3*                     gm_xp                       = kernelParams.d_xp;
 119     const int2*   __restrict__  gm_constraints              = kernelParams.d_constraints;
 120     const float*  __restrict__  gm_constraintsTargetLengths = kernelParams.d_constraintsTargetLengths;
 121     const int*    __restrict__  gm_coupledConstraintsCounts = kernelParams.d_coupledConstraintsCounts;
 122     const int*    __restrict__  gm_coupledConstraintsIdxes  = kernelParams.d_coupledConstraintsIdxes;
 123     const float*  __restrict__  gm_massFactors              = kernelParams.d_massFactors;
 124     float*  __restrict__        gm_matrixA                  = kernelParams.d_matrixA;
 125     const float*  __restrict__  gm_inverseMasses            = kernelParams.d_inverseMasses;
 126     float3*                     gm_v                        = kernelParams.d_v;
 127     float*  __restrict__        gm_virialScaled             = kernelParams.d_virialScaled;
 128
 129     int threadIndex                                         = blockIdx.x*blockDim.x+threadIdx.x;
 130
 131     // numConstraintsThreads should be a integer multiple of blockSize (numConstraintsThreads = numBlocks*blockSize).
 132     // This is to ensure proper synchronizations and reduction. All array are padded to the required size.
 133     assert(threadIndex < numConstraintsThreads);
 134
 135     // Vectors connecting constrained atoms before algorithm was applied.
 136     // Needed to construct constrain matrix A
 137     extern __shared__ float3 sm_r[];
 138
 139     int2                     pair = gm_constraints[threadIndex];
 140     int                      i    = pair.x;
 141     int                      j    = pair.y;
 142
 143     // Mass-scaled Lagrange multiplier
 144     float  lagrangeScaled = 0.0f;
 145
 146     float  targetLength;
 147     float  inverseMassi;
 148     float  inverseMassj;
 149     float  sqrtReducedMass;
 150
 151     float3 xi;
 152     float3 xj;
 153     float3 rc;
 154
 155     // i == -1 indicates dummy constraint at the end of the thread block.
 156     bool isDummyThread = (i == -1);
 157
 158     // Everything computed for these dummies will be equal to zero
 159     if (isDummyThread)
 160     {
 161         targetLength    = 0.0f;
 162         inverseMassi    = 0.0f;
 163         inverseMassj    = 0.0f;
 164         sqrtReducedMass = 0.0f;
 165
 166         xi = make_float3(0.0f, 0.0f, 0.0f);
 167         xj = make_float3(0.0f, 0.0f, 0.0f);
 168         rc = make_float3(0.0f, 0.0f, 0.0f);
 169     }
 170     else
 171     {
 172         // Collecting data
 173         targetLength    = gm_constraintsTargetLengths[threadIndex];
 174         inverseMassi    = gm_inverseMasses[i];
 175         inverseMassj    = gm_inverseMasses[j];
 176         sqrtReducedMass = rsqrt(inverseMassi + inverseMassj);
 177
 178         xi = gm_x[i];
 179         xj = gm_x[j];
 180
 181         float3 dx   = pbcDxAiuc(pbcAiuc, xi, xj);
 182
 183         float  rlen = rsqrtf(dx.x*dx.x + dx.y*dx.y + dx.z*dx.z);
 184         rc   = rlen*dx;
 185     }
 186
 187     sm_r[threadIdx.x] = rc;
 188     // Make sure that all r's are saved into shared memory
 189     // before they are accessed in the loop below
 190     __syncthreads();
 191
 192     /*
 193      * Constructing LINCS matrix (A)
 194      */
 195
 196     // Only non-zero values are saved (for coupled constraints)
 197     int coupledConstraintsCount = gm_coupledConstraintsCounts[threadIndex];
 198     for (int n = 0; n < coupledConstraintsCount; n++)
 199     {
 200         int    index = n*numConstraintsThreads + threadIndex;
 201         int    c1    = gm_coupledConstraintsIdxes[index];
 202
 203         float3 rc1 = sm_r[c1 - blockIdx.x*blockDim.x];
 204         gm_matrixA[index] = gm_massFactors[index]*(rc.x*rc1.x + rc.y*rc1.y + rc.z*rc1.z);
 205     }
 206
 207     // Skipping in dummy threads
 208     if (!isDummyThread)
 209     {
 210         xi = gm_xp[i];
 211         xj = gm_xp[j];
 212     }
 213
 214     float3 dx = pbcDxAiuc(pbcAiuc, xi, xj);
 215
 216     float  sol = sqrtReducedMass*((rc.x*dx.x + rc.y*dx.y + rc.z*dx.z) - targetLength);
 217
 218     /*
 219      *  Inverse matrix using a set of expansionOrder matrix multiplications
 220      */
 221
 222     // This will use the same memory space as sm_r, which is no longer needed.
 223     extern __shared__ float  sm_rhs[];
 224     // Save current right-hand-side vector in the shared memory
 225     sm_rhs[threadIdx.x] = sol;
 226
 227     for (int rec = 0; rec < expansionOrder; rec++)
 228     {
 229         // Making sure that all sm_rhs are saved before they are accessed in a loop below
 230         __syncthreads();
 231         float mvb = 0.0f;
 232
 233         for (int n = 0; n < coupledConstraintsCount; n++)
 234         {
 235             int index = n*numConstraintsThreads + threadIndex;
 236             int c1    = gm_coupledConstraintsIdxes[index];
 237             // Convolute current right-hand-side with A
 238             // Different, non overlapping parts of sm_rhs[..] are read during odd and even iterations
 239             mvb = mvb + gm_matrixA[index]*sm_rhs[c1 - blockIdx.x*blockDim.x + blockDim.x*(rec % 2)];
 240         }
 241         // 'Switch' rhs vectors, save current result
 242         // These values will be accessed in the loop above during the next iteration.
 243         sm_rhs[threadIdx.x + blockDim.x*((rec + 1) % 2)] = mvb;
 244         sol  = sol + mvb;
 245     }
 246
 247     // Current mass-scaled Lagrange multipliers
 248     lagrangeScaled = sqrtReducedMass*sol;
 249
 250     // Save updated coordinates before correction for the rotational lengthening
 251     float3 tmp     = rc*lagrangeScaled;
 252
 253     // Writing for all but dummy constraints
 254     if (!isDummyThread)
 255     {
 256         atomicAdd(&gm_xp[i], -tmp*inverseMassi);
 257         atomicAdd(&gm_xp[j],  tmp*inverseMassj);
 258     }
 259
 260     /*
 261      *  Correction for centripetal effects
 262      */
 263     for (int iter = 0; iter < numIterations; iter++)
 264     {
 265         // Make sure that all xp's are saved: atomic operation calls before are
 266         // communicating current xp[..] values across thread block.
 267         __syncthreads();
 268
 269         if (!isDummyThread)
 270         {
 271             xi = gm_xp[i];
 272             xj = gm_xp[j];
 273         }
 274
 275         float3 dx = pbcDxAiuc(pbcAiuc, xi, xj);
 276
 277         float  len2  = targetLength*targetLength;
 278         float  dlen2 = 2.0f*len2 - norm2(dx);
 279
 280         // TODO A little bit more effective but slightly less readable version of the below would be:
 281         //      float proj = sqrtReducedMass*(targetLength - (dlen2 > 0.0f ? 1.0f : 0.0f)*dlen2*rsqrt(dlen2));
 282         float  proj;
 283         if (dlen2 > 0.0f)
 284         {
 285             proj = sqrtReducedMass*(targetLength - dlen2*rsqrt(dlen2));
 286         }
 287         else
 288         {
 289             proj = sqrtReducedMass*targetLength;
 290         }
 291
 292         sm_rhs[threadIdx.x]   = proj;
 293         float sol = proj;
 294
 295         /*
 296          * Same matrix inversion as above is used for updated data
 297          */
 298         for (int rec = 0; rec < expansionOrder; rec++)
 299         {
 300             // Make sure that all elements of rhs are saved into shared memory
 301             __syncthreads();
 302             float mvb = 0;
 303
 304             for (int n = 0; n < coupledConstraintsCount; n++)
 305             {
 306                 int index = n*numConstraintsThreads + threadIndex;
 307                 int c1    = gm_coupledConstraintsIdxes[index];
 308
 309                 mvb = mvb + gm_matrixA[index]*sm_rhs[c1 - blockIdx.x*blockDim.x + blockDim.x*(rec % 2)];
 310
 311             }
 312             sm_rhs[threadIdx.x + blockDim.x*((rec + 1) % 2)] = mvb;
 313             sol  = sol + mvb;
 314         }
 315
 316         // Add corrections to Lagrange multipliers
 317         float sqrtmu_sol  = sqrtReducedMass*sol;
 318         lagrangeScaled += sqrtmu_sol;
 319
 320         // Save updated coordinates for the next iteration
 321         // Dummy constraints are skipped
 322         if (!isDummyThread)
 323         {
 324             float3 tmp = rc*sqrtmu_sol;
 325             atomicAdd(&gm_xp[i], -tmp*inverseMassi);
 326             atomicAdd(&gm_xp[j],  tmp*inverseMassj);
 327         }
 328     }
 329
 330     // Updating particle velocities for all but dummy threads
 331     if (updateVelocities && !isDummyThread)
 332     {
 333         float3 tmp     = rc*invdt*lagrangeScaled;
 334         atomicAdd(&gm_v[i], -tmp*inverseMassi);
 335         atomicAdd(&gm_v[j],  tmp*inverseMassj);
 336     }
 337
 338
 339     if (computeVirial)
 340     {
 341         // Virial is computed from Lagrange multiplier (lagrangeScaled), target constrain length
 342         // (targetLength) and the normalized vector connecting constrained atoms before
 343         // the algorithm was applied (rc). The evaluation of virial in each thread is
 344         // followed by basic reduction for the values inside single thread block.
 345         // Then, the values are reduced across grid by atomicAdd(...).
 346         //
 347         // TODO Shuffle reduction.
 348         // TODO Should be unified and/or done once when virial is actually needed.
 349         // TODO Recursive version that removes atomicAdd(...)'s entirely is needed. Ideally,
 350         //      one that works for any datatype.
 351
 352         // Save virial for each thread into the shared memory. Tensor is symmetrical, hence only
 353         // 6 values are saved. Dummy threads will have zeroes in their virial: targetLength,
 354         // lagrangeScaled and rc are all set to zero for them in the beginning of the kernel.
 355         // The sm_threadVirial[..] will overlap with the sm_r[..] and sm_rhs[..], but the latter
 356         // two are no longer in use.
 357         extern __shared__ float  sm_threadVirial[];
 358         float                    mult = targetLength*lagrangeScaled;
 359         sm_threadVirial[0*blockDim.x + threadIdx.x] = mult*rc.x*rc.x;
 360         sm_threadVirial[1*blockDim.x + threadIdx.x] = mult*rc.x*rc.y;
 361         sm_threadVirial[2*blockDim.x + threadIdx.x] = mult*rc.x*rc.z;
 362         sm_threadVirial[3*blockDim.x + threadIdx.x] = mult*rc.y*rc.y;
 363         sm_threadVirial[4*blockDim.x + threadIdx.x] = mult*rc.y*rc.z;
 364         sm_threadVirial[5*blockDim.x + threadIdx.x] = mult*rc.z*rc.z;
 365
 366         __syncthreads();
 367
 368         // Reduce up to one virial per thread block. All blocks are divided by half, the first
 369         // half of threads sums two virials. Then the first half is divided by two and the first
 370         // half of it sums two values. This procedure is repeated until only one thread is left.
 371         // Only works if the threads per blocks is a power of two (hence static_assert
 372         // in the beginning of the kernel).
 373         for (int divideBy = 2; divideBy <= blockDim.x; divideBy *= 2)
 374         {
 375             int dividedAt = blockDim.x/divideBy;
 376             if (threadIdx.x < dividedAt)
 377             {
 378                 for (int d = 0; d < 6; d++)
 379                 {
 380                     sm_threadVirial[d*blockDim.x + threadIdx.x] += sm_threadVirial[d*blockDim.x + (threadIdx.x + dividedAt)];
 381                 }
 382             }
 383             // Syncronize if not within one warp
 384             if (dividedAt > warpSize/2)
 385             {
 386                 __syncthreads();
 387             }
 388         }
 389         // First 6 threads in the block add the results of 6 tensor components to the global memory address.
 390         if (threadIdx.x < 6)
 391         {
 392             atomicAdd(&(gm_virialScaled[threadIdx.x]), sm_threadVirial[threadIdx.x*blockDim.x]);
 393         }
 394     }
 395
 396     return;
 397 }
 398
 399 /*! \brief Select templated kernel.
 400  *
 401  * Returns pointer to a CUDA kernel based on provided booleans.
 402  *
 403  * \param[in] updateVelocities  If the velocities should be constrained.
 404  * \param[in] computeVirial     If virial should be updated.
 405  *
 406  * \return                      Pointer to CUDA kernel
 407  */
 408 inline auto getLincsKernelPtr(const bool  updateVelocities,
 409                               const bool  computeVirial)
 410 {
 411
 412     auto kernelPtr = lincs_kernel<true, true>;
 413     if (updateVelocities && computeVirial)
 414     {
 415         kernelPtr = lincs_kernel<true, true>;
 416     }
 417     else if (updateVelocities && !computeVirial)
 418     {
 419         kernelPtr = lincs_kernel<true, false>;
 420     }
 421     else if (!updateVelocities && computeVirial)
 422     {
 423         kernelPtr = lincs_kernel<false, true>;
 424     }
 425     else if (!updateVelocities && !computeVirial)
 426     {
 427         kernelPtr = lincs_kernel<false, false>;
 428     }
 429     return kernelPtr;
 430 }
 431
 432 /*! \brief Apply LINCS.
 433  *
 434  * Applies LINCS to coordinates and velocities, stored on GPU.
 435  * Data at pointers xPrime and v (class fields) change in the GPU
 436  * memory. The results are not automatically copied back to the CPU
 437  * memory. Method uses this class data structures which should be
 438  * updated when needed using update method.
 439  *
 440  * \param[in] updateVelocities  If the velocities should be constrained.
 441  * \param[in] invdt             Reciprocal timestep (to scale Lagrange
 442  *                              multipliers when velocities are updated)
 443  * \param[in] computeVirial     If virial should be updated.
 444  * \param[in,out] virialScaled  Scaled virial tensor to be updated.
 445  */
 446 void LincsCuda::Impl::apply(const bool  updateVelocities,
 447                             const real  invdt,
 448                             const bool  computeVirial,
 449                             tensor      virialScaled)
 450 {
 451     ensureNoPendingCudaError("In CUDA version of LINCS");
 452
 453     if (computeVirial)
 454     {
 455         // Fill with zeros so the values can be reduced to it
 456         // Only 6 values are needed because virial is symmetrical
 457         clearDeviceBufferAsync(&kernelParams_.d_virialScaled, 0, 6, stream_);
 458     }
 459
 460     auto               kernelPtr = getLincsKernelPtr(updateVelocities, computeVirial);
 461
 462     KernelLaunchConfig config;
 463     config.blockSize[0]     = c_threadsPerBlock;
 464     config.blockSize[1]     = 1;
 465     config.blockSize[2]     = 1;
 466     config.gridSize[0]      = (kernelParams_.numConstraintsThreads + c_threadsPerBlock - 1)/c_threadsPerBlock;
 467     config.gridSize[1]      = 1;
 468     config.gridSize[2]      = 1;
 469
 470     // Shared memory is used to store:
 471     // -- Current coordinates (3 floats per thread)
 472     // -- Right-hand-sides for matrix inversion (2 floats per thread)
 473     // -- Virial tensor components (6 floats per thread)
 474     // Since none of these three are needed simultaneously, they can be saved at the same shared memory address
 475     // (i.e. correspondent arrays are intentionally overlapped in address space). Consequently, only
 476     // max{3, 2, 6} = 6 floats per thread are needed in case virial is computed, or max{3, 2} = 3 if not.
 477     if (computeVirial)
 478     {
 479         config.sharedMemorySize = c_threadsPerBlock*6*sizeof(float);
 480     }
 481     else
 482     {
 483         config.sharedMemorySize = c_threadsPerBlock*3*sizeof(float);
 484     }
 485     config.stream           = stream_;
 486
 487     const auto     kernelArgs = prepareGpuKernelArguments(kernelPtr, config,
 488                                                           &kernelParams_,
 489                                                           &invdt);
 490
 491     launchGpuKernel(kernelPtr, config, nullptr,
 492                     "lincs_kernel<updateVelocities, computeVirial>", kernelArgs);
 493
 494     if (computeVirial)
 495     {
 496         // Copy LINCS virial data and add it to the common virial
 497         copyFromDeviceBuffer(h_virialScaled_.data(), &kernelParams_.d_virialScaled,
 498                              0, 6,
 499                              stream_, GpuApiCallBehavior::Sync, nullptr);
 500
 501         // Mapping [XX, XY, XZ, YY, YZ, ZZ] internal format to a tensor object
 502         virialScaled[XX][XX] += h_virialScaled_[0];
 503         virialScaled[XX][YY] += h_virialScaled_[1];
 504         virialScaled[XX][ZZ] += h_virialScaled_[2];
 505
 506         virialScaled[YY][XX] += h_virialScaled_[1];
 507         virialScaled[YY][YY] += h_virialScaled_[3];
 508         virialScaled[YY][ZZ] += h_virialScaled_[4];
 509
 510         virialScaled[ZZ][XX] += h_virialScaled_[2];
 511         virialScaled[ZZ][YY] += h_virialScaled_[4];
 512         virialScaled[ZZ][ZZ] += h_virialScaled_[5];
 513     }
 514
 515     return;
 516 }
 517
 518 /*! \brief Create LINCS object
 519  *
 520  * \param[in] numAtoms        Number of atoms that will be handles by LINCS.
 521  *                            Used to compute the memory size in allocations and copy.
 522  * \param[in] numIterations   Number of iterations used to compute inverse matrix.
 523  * \param[in] expansionOrder  LINCS projection order for correcting the direction of constraint.
 524  */
 525 LincsCuda::Impl::Impl(int numAtoms,
 526                       int numIterations,
 527                       int expansionOrder)
 528     : numAtoms_(numAtoms)
 529 {
 530     kernelParams_.numIterations              = numIterations;
 531     kernelParams_.expansionOrder             = expansionOrder;
 532
 533     static_assert(sizeof(real) == sizeof(float),
 534                   "Real numbers should be in single precision in GPU code.");
 535     static_assert(c_threadsPerBlock > 0 && !(c_threadsPerBlock & (c_threadsPerBlock - 1) == 0),
 536                   "Nmber of threads per block should be a power of two in order for reduction to work.");
 537
 538     // This is temporary. LINCS should not manage coordinates.
 539     allocateDeviceBuffer(&kernelParams_.d_x,  numAtoms, nullptr);
 540     allocateDeviceBuffer(&kernelParams_.d_xp, numAtoms, nullptr);
 541     allocateDeviceBuffer(&kernelParams_.d_v,  numAtoms, nullptr);
 542
 543     allocateDeviceBuffer(&kernelParams_.d_virialScaled, 6, nullptr);
 544     h_virialScaled_.resize(6);
 545
 546     // The data arrays should be expanded/reallocated on first call of set() function.
 547     maxConstraintsNumberSoFar_ = 0;
 548     // Use default stream.
 549     // TODO The stream should/can be assigned by the GPU schedule when the code will be integrated.
 550     stream_ = nullptr;
 551
 552 }
 553
 554 LincsCuda::Impl::~Impl()
 555 {
 556 }
 557
 558 /*! \brief Helper function to go through constraints recursively.
 559  *
 560  *  For each constraint, counts the number of coupled constraints and stores the value in spaceNeeded array.
 561  *  This information is used to split the array of constraints between thread blocks on a GPU so there is no
 562  *  coupling between constraints from different thread blocks. After the 'spaceNeeded' array is filled, the
 563  *  value spaceNeeded[c] should be equal to the number of constraints that are coupled to 'c' and located
 564  *  after it in the constraints array.
 565  *
 566  * \param[in]     a                  Atom index.
 567  * \param[in,out] spaceNeeded        Indicates if the constraint was already counted and stores
 568  *                                   the number of constraints (i) connected to it and (ii) located
 569  *                                   after it in memory. This array is filled by this recursive function.
 570  *                                   For a set of coupled constraints, only for the first one in this list
 571  *                                   the number of consecutive coupled constraints is needed: if there is
 572  *                                   not enough space for this set of constraints in the thread block,
 573  *                                   the group has to be moved to the next one.
 574  * \param[in]     atomAdjacencyList  Stores information about connections between atoms.
 575  */
 576 inline int countCoupled(int a, std::vector<int> *spaceNeeded,
 577                         std::vector<std::vector<std::tuple<int, int, int> > > *atomsAdjacencyList)
 578
 579 {
 580     int c2, a2, sign;
 581     int counted = 0;
 582     for (unsigned i = 0; i < atomsAdjacencyList->at(a).size(); i++)
 583     {
 584         std::tie(a2, c2, sign) = atomsAdjacencyList->at(a).at(i);
 585         if (spaceNeeded->at(c2) == -1)
 586         {
 587             spaceNeeded->at(c2) = 0; // To indicate we've been here
 588             counted            += 1 + countCoupled(a2, spaceNeeded, atomsAdjacencyList);
 589         }
 590     }
 591     return counted;
 592 }
 593
 594 /*! \brief
 595  * Update data-structures (e.g. after NB search step).
 596  *
 597  * Updates the constraints data and copies it to the GPU. Should be
 598  * called if the particles were sorted, redistributed between domains, etc.
 599  * This version uses common data formats so it can be called from anywhere
 600  * in the code. Does not recycle the data preparation routines from the CPU
 601  * version. Works only with simple case when all the constraints in idef are
 602  * are handled by a single GPU. Triangles are not handled as special case.
 603  *
 604  * Information about constraints is taken from:
 605  *     idef.il[F_CONSTR].iatoms  --- type (T) of constraint and two atom indexes (i1, i2)
 606  *     idef.iparams[T].constr.dA --- target length for constraint of type T
 607  * From t_mdatom, the code takes:
 608  *     md.invmass  --- array of inverse square root of masses for each atom in the system.
 609  *
 610  * \param[in] idef  Local topology data to get information on constraints from.
 611  * \param[in] md    Atoms data to get atom masses from.
 612  */
 613 void LincsCuda::Impl::set(const t_idef    &idef,
 614                           const t_mdatoms &md)
 615 {
 616
 617     // List of constrained atoms (CPU memory)
 618     std::vector<int2>   constraintsHost;
 619     // Equilibrium distances for the constraints (CPU)
 620     std::vector<float>  constraintsTargetLengthsHost;
 621     // Number of constraints, coupled with the current one (CPU)
 622     std::vector<int>    coupledConstraintsCountsHost;
 623     // List of coupled with the current one (CPU)
 624     std::vector<int>    coupledConstraintsIdxesHost;
 625     // Mass factors (CPU)
 626     std::vector<float>  massFactorsHost;
 627
 628     // List of constrained atoms in local topology
 629     t_iatom  *iatoms         = idef.il[F_CONSTR].iatoms;
 630     const int stride         = NRAL(F_CONSTR) + 1;
 631     const int numConstraints = idef.il[F_CONSTR].nr/stride;
 632     // Constructing adjacency list --- usefull intermediate structure
 633     std::vector<std::vector<std::tuple<int, int, int> > > atomsAdjacencyList(numAtoms_);
 634     for (int c = 0; c < numConstraints; c++)
 635     {
 636         int a1     = iatoms[stride*c + 1];
 637         int a2     = iatoms[stride*c + 2];
 638
 639         // Each constraint will be represented as a tuple, containing index of the second constrained atom,
 640         // index of the constraint and a sign that indicates the order of atoms in which they are listed.
 641         // Sign is needed to compute the mass factors.
 642         atomsAdjacencyList.at(a1).push_back(std::make_tuple(a2, c, +1));
 643         atomsAdjacencyList.at(a2).push_back(std::make_tuple(a1, c, -1));
 644     }
 645
 646     // Compute, how many coupled constraints are in front of each constraint.
 647     // Needed to introduce splits in data so that all coupled constraints will be computed in a single GPU block.
 648     // The position 'c' of the vector spaceNeeded should have the number of constraints that are coupled to a constraint
 649     // 'c' and are after 'c' in the vector. Only first index of the connected group of the constraints is needed later in the
 650     // code, hence the spaceNeeded vector is also used to keep track if the constrain was already counted.
 651     std::vector<int> spaceNeeded;
 652     spaceNeeded.resize(numConstraints, -1);
 653     std::fill(spaceNeeded.begin(), spaceNeeded.end(), -1);
 654     for (int c = 0; c < numConstraints; c++)
 655     {
 656         int a1     = iatoms[stride*c + 1];
 657         int a2     = iatoms[stride*c + 2];
 658         if (spaceNeeded.at(c) == -1)
 659         {
 660             spaceNeeded.at(c) = countCoupled(a1, &spaceNeeded, &atomsAdjacencyList) +
 661                 countCoupled(a2, &spaceNeeded, &atomsAdjacencyList);
 662         }
 663     }
 664
 665     // Map of splits in the constraints data. For each 'old' constraint index gives 'new' which
 666     // takes into account the empty spaces which might be needed in the end of each thread block.
 667     std::vector<int> splitMap;
 668     splitMap.resize(numConstraints, -1);
 669     int              currentMapIndex = 0;
 670     for (int c = 0; c < numConstraints; c++)
 671     {
 672         // Check if coupled constraints all fit in one block
 673         GMX_RELEASE_ASSERT(spaceNeeded.at(c) < c_threadsPerBlock, "Maximum number of coupled constraints exceedes the size of the CUDA thread block. "
 674                            "Most likely, you are trying to use GPU version of LINCS with constraints on all-bonds, "
 675                            "which is not supported. Try using H-bonds constraints only.");
 676         if (currentMapIndex / c_threadsPerBlock != (currentMapIndex + spaceNeeded.at(c)) / c_threadsPerBlock)
 677         {
 678             currentMapIndex = ((currentMapIndex/c_threadsPerBlock) + 1) * c_threadsPerBlock;
 679         }
 680         splitMap.at(c) = currentMapIndex;
 681         currentMapIndex++;
 682     }
 683     kernelParams_.numConstraintsThreads = currentMapIndex + c_threadsPerBlock - currentMapIndex % c_threadsPerBlock;
 684     GMX_RELEASE_ASSERT(kernelParams_.numConstraintsThreads % c_threadsPerBlock == 0, "Number of threads should be a multiple of the block size");
 685
 686     // Initialize constraints and their target indexes taking into account the splits in the data arrays.
 687     int2 pair;
 688     pair.x = -1;
 689     pair.y = -1;
 690     constraintsHost.resize(kernelParams_.numConstraintsThreads, pair);
 691     std::fill(constraintsHost.begin(), constraintsHost.end(), pair);
 692     constraintsTargetLengthsHost.resize(kernelParams_.numConstraintsThreads, 0.0);
 693     std::fill(constraintsTargetLengthsHost.begin(), constraintsTargetLengthsHost.end(), 0.0);
 694     for (int c = 0; c < numConstraints; c++)
 695     {
 696         int  a1     = iatoms[stride*c + 1];
 697         int  a2     = iatoms[stride*c + 2];
 698         int  type   = iatoms[stride*c];
 699
 700         int2 pair;
 701         pair.x = a1;
 702         pair.y = a2;
 703         constraintsHost.at(splitMap.at(c))              = pair;
 704         constraintsTargetLengthsHost.at(splitMap.at(c)) = idef.iparams[type].constr.dA;
 705
 706     }
 707
 708     // The adjacency list of constraints (i.e. the list of coupled constraints for each constraint).
 709     // We map a single thread to a single constraint, hence each thread 'c' will be using one element from
 710     // coupledConstraintsCountsHost array, which is the number of constraints coupled to the constraint 'c'.
 711     // The coupled constraints indexes are placed into the coupledConstraintsIdxesHost array. Latter is organized
 712     // as a one-dimensional array to ensure good memory alignment. It is addressed as [c + i*numConstraintsThreads],
 713     // where 'i' goes from zero to the number of constraints coupled to 'c'. 'numConstraintsThreads' is the width of
 714     // the array --- a number, greater then total number of constraints, taking into account the splits in the
 715     // constraints array due to the GPU block borders. This number can be adjusted to improve memory access pattern.
 716     // Mass factors are saved in a similar data structure.
 717     int              maxCoupledConstraints = 0;
 718     for (int c = 0; c < numConstraints; c++)
 719     {
 720         int a1     = iatoms[stride*c + 1];
 721         int a2     = iatoms[stride*c + 2];
 722
 723         // Constraint 'c' is counted twice, but it should be excluded altogether. Hence '-2'.
 724         int nCoupedConstraints = atomsAdjacencyList.at(a1).size() + atomsAdjacencyList.at(a2).size() - 2;
 725
 726         if (nCoupedConstraints > maxCoupledConstraints)
 727         {
 728             maxCoupledConstraints = nCoupedConstraints;
 729         }
 730     }
 731
 732     coupledConstraintsCountsHost.resize(kernelParams_.numConstraintsThreads, 0);
 733     coupledConstraintsIdxesHost.resize(maxCoupledConstraints*kernelParams_.numConstraintsThreads, -1);
 734     massFactorsHost.resize(maxCoupledConstraints*kernelParams_.numConstraintsThreads, -1);
 735
 736     for (int c1 = 0; c1 < numConstraints; c1++)
 737     {
 738         coupledConstraintsCountsHost.at(splitMap.at(c1))  = 0;
 739         int c1a1     = iatoms[stride*c1 + 1];
 740         int c1a2     = iatoms[stride*c1 + 2];
 741         int c2;
 742         int c2a1;
 743         int c2a2;
 744
 745         int sign;
 746
 747         // Constraints, coupled trough the first atom.
 748         c2a1 = c1a1;
 749         for (unsigned j = 0; j < atomsAdjacencyList.at(c1a1).size(); j++)
 750         {
 751
 752             std::tie(c2a2, c2, sign) = atomsAdjacencyList.at(c1a1).at(j);
 753
 754             if (c1 != c2)
 755             {
 756                 int index = kernelParams_.numConstraintsThreads*coupledConstraintsCountsHost.at(splitMap.at(c1)) + splitMap.at(c1);
 757
 758                 coupledConstraintsIdxesHost.at(index) = splitMap.at(c2);
 759
 760                 int   center = c1a1;
 761
 762                 float sqrtmu1 = 1.0/sqrt(md.invmass[c1a1] + md.invmass[c1a2]);
 763                 float sqrtmu2 = 1.0/sqrt(md.invmass[c2a1] + md.invmass[c2a2]);
 764
 765                 massFactorsHost.at(index) = -sign*md.invmass[center]*sqrtmu1*sqrtmu2;
 766
 767                 coupledConstraintsCountsHost.at(splitMap.at(c1))++;
 768
 769             }
 770         }
 771
 772         // Constraints, coupled through the second atom.
 773         c2a1 = c1a2;
 774         for (unsigned j = 0; j < atomsAdjacencyList.at(c1a2).size(); j++)
 775         {
 776
 777             std::tie(c2a2, c2, sign) = atomsAdjacencyList.at(c1a2).at(j);
 778
 779             if (c1 != c2)
 780             {
 781                 int index = kernelParams_.numConstraintsThreads*coupledConstraintsCountsHost.at(splitMap.at(c1)) + splitMap.at(c1);
 782
 783                 coupledConstraintsIdxesHost.at(index) = splitMap.at(c2);
 784
 785                 int   center = c1a2;
 786
 787                 float sqrtmu1 = 1.0/sqrt(md.invmass[c1a1] + md.invmass[c1a2]);
 788                 float sqrtmu2 = 1.0/sqrt(md.invmass[c2a1] + md.invmass[c2a2]);
 789
 790                 massFactorsHost.at(index) = sign*md.invmass[center]*sqrtmu1*sqrtmu2;
 791
 792                 coupledConstraintsCountsHost.at(splitMap.at(c1))++;
 793
 794             }
 795         }
 796     }
 797
 798     // (Re)allocate the memory, if the number of constraints has increased.
 799     if (numConstraints > maxConstraintsNumberSoFar_)
 800     {
 801         // Free memory if it was allocated before (i.e. if not the first time here).
 802         if (maxConstraintsNumberSoFar_ > 0)
 803         {
 804             freeDeviceBuffer(&kernelParams_.d_inverseMasses);
 805
 806             freeDeviceBuffer(&kernelParams_.d_constraints);
 807             freeDeviceBuffer(&kernelParams_.d_constraintsTargetLengths);
 808
 809             freeDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts);
 810             freeDeviceBuffer(&kernelParams_.d_coupledConstraintsIdxes);
 811             freeDeviceBuffer(&kernelParams_.d_massFactors);
 812             freeDeviceBuffer(&kernelParams_.d_matrixA);
 813
 814         }
 815         maxConstraintsNumberSoFar_ = numConstraints;
 816
 817         allocateDeviceBuffer(&kernelParams_.d_inverseMasses, numAtoms_, nullptr);
 818
 819         allocateDeviceBuffer(&kernelParams_.d_constraints, kernelParams_.numConstraintsThreads, nullptr);
 820         allocateDeviceBuffer(&kernelParams_.d_constraintsTargetLengths, kernelParams_.numConstraintsThreads, nullptr);
 821
 822         allocateDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts, kernelParams_.numConstraintsThreads, nullptr);
 823         allocateDeviceBuffer(&kernelParams_.d_coupledConstraintsIdxes, maxCoupledConstraints*kernelParams_.numConstraintsThreads, nullptr);
 824         allocateDeviceBuffer(&kernelParams_.d_massFactors, maxCoupledConstraints*kernelParams_.numConstraintsThreads, nullptr);
 825         allocateDeviceBuffer(&kernelParams_.d_matrixA, maxCoupledConstraints*kernelParams_.numConstraintsThreads, nullptr);
 826
 827     }
 828
 829     // Copy data to GPU.
 830     copyToDeviceBuffer(&kernelParams_.d_constraints, constraintsHost.data(),
 831                        0, kernelParams_.numConstraintsThreads,
 832                        stream_, GpuApiCallBehavior::Sync, nullptr);
 833     copyToDeviceBuffer(&kernelParams_.d_constraintsTargetLengths, constraintsTargetLengthsHost.data(),
 834                        0, kernelParams_.numConstraintsThreads,
 835                        stream_, GpuApiCallBehavior::Sync, nullptr);
 836     copyToDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts, coupledConstraintsCountsHost.data(),
 837                        0, kernelParams_.numConstraintsThreads,
 838                        stream_, GpuApiCallBehavior::Sync, nullptr);
 839     copyToDeviceBuffer(&kernelParams_.d_coupledConstraintsIdxes, coupledConstraintsIdxesHost.data(),
 840                        0, maxCoupledConstraints*kernelParams_.numConstraintsThreads,
 841                        stream_, GpuApiCallBehavior::Sync, nullptr);
 842     copyToDeviceBuffer(&kernelParams_.d_massFactors, massFactorsHost.data(),
 843                        0, maxCoupledConstraints*kernelParams_.numConstraintsThreads,
 844                        stream_, GpuApiCallBehavior::Sync, nullptr);
 845
 846     GMX_RELEASE_ASSERT(md.invmass != nullptr, "Masses of attoms should be specified.\n");
 847     copyToDeviceBuffer(&kernelParams_.d_inverseMasses, md.invmass,
 848                        0, numAtoms_,
 849                        stream_, GpuApiCallBehavior::Sync, nullptr);
 850
 851 }
 852
 853 /*! \brief
 854  * Update PBC data.
 855  *
 856  * Converts pbc data from t_pbc into the PbcAiuc format and stores the latter.
 857  *
 858  * \param[in] pbc  The PBC data in t_pbc format.
 859  */
 860 void LincsCuda::Impl::setPbc(const t_pbc *pbc)
 861 {
 862     setPbcAiuc(pbc->ndim_ePBC, pbc->box, &kernelParams_.pbcAiuc);
 863 }
 864
 865 /*! \brief
 866  * Copy coordinates from provided CPU location to GPU.
 867  *
 868  * Copies the coordinates before the integration step (x) and coordinates
 869  * after the integration step (xp) from the provided CPU location to GPU.
 870  * The data are assumed to be in float3/fvec format (single precision).
 871  *
 872  * \param[in] h_x   CPU pointer where coordinates should be copied from.
 873  * \param[in] h_xp  CPU pointer where coordinates should be copied from.
 874  */
 875 void LincsCuda::Impl::copyCoordinatesToGpu(const rvec *h_x, const rvec *h_xp)
 876 {
 877     copyToDeviceBuffer(&kernelParams_.d_x, (float3*)h_x, 0, numAtoms_, stream_, GpuApiCallBehavior::Sync, nullptr);
 878     copyToDeviceBuffer(&kernelParams_.d_xp, (float3*)h_xp, 0, numAtoms_, stream_, GpuApiCallBehavior::Sync, nullptr);
 879 }
 880
 881 /*! \brief
 882  * Copy velocities from provided CPU location to GPU.
 883  *
 884  * Nothing is done if the argument provided is a nullptr.
 885  * The data are assumed to be in float3/fvec format (single precision).
 886  *
 887  * \param[in] h_v  CPU pointer where velocities should be copied from.
 888  */
 889 void LincsCuda::Impl::copyVelocitiesToGpu(const rvec *h_v)
 890 {
 891     if (h_v != nullptr)
 892     {
 893         copyToDeviceBuffer(&kernelParams_.d_v, (float3*)h_v, 0, numAtoms_, stream_, GpuApiCallBehavior::Sync, nullptr);
 894     }
 895 }
 896
 897 /*! \brief
 898  * Copy coordinates from GPU to provided CPU location.
 899  *
 900  * Copies the constrained coordinates to the provided location. The coordinates
 901  * are assumed to be in float3/fvec format (single precision).
 902  *
 903  * \param[out] h_xp  CPU pointer where coordinates should be copied to.
 904  */
 905 void LincsCuda::Impl::copyCoordinatesFromGpu(rvec *h_xp)
 906 {
 907     copyFromDeviceBuffer((float3*)h_xp, &kernelParams_.d_xp, 0, numAtoms_, stream_, GpuApiCallBehavior::Sync, nullptr);
 908 }
 909
 910 /*! \brief
 911  * Copy velocities from GPU to provided CPU location.
 912  *
 913  * The velocities are assumed to be in float3/fvec format (single precision).
 914  *
 915  * \param[in] h_v  Pointer to velocities data.
 916  */
 917 void LincsCuda::Impl::copyVelocitiesFromGpu(rvec *h_v)
 918 {
 919     copyFromDeviceBuffer((float3*)h_v, &kernelParams_.d_v, 0, numAtoms_, stream_, GpuApiCallBehavior::Sync, nullptr);
 920 }
 921
 922 /*! \brief
 923  * Set the internal GPU-memory x, xprime and v pointers.
 924  *
 925  * Data is not copied. The data are assumed to be in float3/fvec format
 926  * (float3 is used internally, but the data layout should be identical).
 927  *
 928  * \param[in] d_x   Pointer to the coordinates before integrator update (on GPU)
 929  * \param[in] d_xp  Pointer to the coordinates after integrator update, before update (on GPU)
 930  * \param[in] d_v   Pointer to the velocities before integrator update (on GPU)
 931  */
 932 void LincsCuda::Impl::setXVPointers(rvec *d_x, rvec *d_xp, rvec *d_v)
 933 {
 934     kernelParams_.d_x  = (float3*)d_x;
 935     kernelParams_.d_xp = (float3*)d_xp;
 936     kernelParams_.d_v  = (float3*)d_v;
 937 }
 938
 939
 940 LincsCuda::LincsCuda(const int numAtoms,
 941                      const int numIterations,
 942                      const int expansionOrder)
 943     : impl_(new Impl(numAtoms, numIterations, expansionOrder))
 944 {
 945 }
 946
 947 LincsCuda::~LincsCuda() = default;
 948
 949 void LincsCuda::apply(const bool  updateVelocities,
 950                       const real  invdt,
 951                       const bool  computeVirial,
 952                       tensor      virialScaled)
 953 {
 954     impl_->apply(updateVelocities,
 955                  invdt,
 956                  computeVirial,
 957                  virialScaled);
 958 }
 959
 960 void LincsCuda::setPbc(const t_pbc *pbc)
 961 {
 962     impl_->setPbc(pbc);
 963 }
 964
 965 void LincsCuda::set(const t_idef    &idef,
 966                     const t_mdatoms &md)
 967 {
 968     impl_->set(idef, md);
 969 }
 970
 971 void LincsCuda::copyCoordinatesToGpu(const rvec *h_x, const rvec *h_xp)
 972 {
 973     impl_->copyCoordinatesToGpu(h_x, h_xp);
 974 }
 975
 976 void LincsCuda::copyVelocitiesToGpu(const rvec *h_v)
 977 {
 978     impl_->copyVelocitiesToGpu(h_v);
 979 }
 980
 981 void LincsCuda::copyCoordinatesFromGpu(rvec *h_xp)
 982 {
 983     impl_->copyCoordinatesFromGpu(h_xp);
 984 }
 985
 986 void LincsCuda::copyVelocitiesFromGpu(rvec *h_v)
 987 {
 988     impl_->copyVelocitiesFromGpu(h_v);
 989 }
 990
 991 void LincsCuda::setXVPointers(rvec *d_x, rvec *d_xp, rvec *d_v)
 992 {
 993     impl_->setXVPointers(d_x, d_xp, d_v);
 994 }
 995
 996 } // namespace gmx