src/gromacs/mdlib/lincs_cuda_impl.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \internal \file
  36  *
  37  * \brief Implements LINCS using CUDA
  38  *
  39  * This file contains implementation of LINCS constraints algorithm
  40  * using CUDA, including class initialization, data-structures management
  41  * and GPU kernel.
  42  *
  43  * \note Management of CUDA stream and periodic boundary exists here as a temporary
  44  *       scaffolding to allow this class to be used as a stand-alone unit. The scaffolding
  45  *        is intended to be removed once constraints are fully integrated with update module.
  46  * \todo Reconsider naming, i.e. "cuda" suffics should be changed to "gpu".
  47  *
  48  * \author Artem Zhmurov <zhmurov@gmail.com>
  49  * \author Alan Gray <alang@nvidia.com>
  50  *
  51  * \ingroup module_mdlib
  52  */
  53 #include "gmxpre.h"
  54
  55 #include "lincs_cuda_impl.h"
  56
  57 #include <assert.h>
  58 #include <stdio.h>
  59
  60 #include <cmath>
  61
  62 #include <algorithm>
  63
  64 #include "gromacs/gpu_utils/cuda_arch_utils.cuh"
  65 #include "gromacs/gpu_utils/cudautils.cuh"
  66 #include "gromacs/gpu_utils/devicebuffer.cuh"
  67 #include "gromacs/gpu_utils/gputraits.cuh"
  68 #include "gromacs/gpu_utils/vectype_ops.cuh"
  69 #include "gromacs/math/vec.h"
  70 #include "gromacs/mdlib/constr.h"
  71 #include "gromacs/mdlib/lincs_cuda.h"
  72 #include "gromacs/pbcutil/pbc.h"
  73 #include "gromacs/pbcutil/pbc_aiuc_cuda.cuh"
  74 #include "gromacs/topology/ifunc.h"
  75
  76 namespace gmx
  77 {
  78
  79 //! Number of CUDA threads in a block
  80 constexpr static int c_threadsPerBlock = 256;
  81 //! Maximum number of threads in a block (for __launch_bounds__)
  82 constexpr static int c_maxThreadsPerBlock = c_threadsPerBlock;
  83
  84 /*! \brief Main kernel for LINCS constraints.
  85  *
  86  * See Hess et al., J. Comput. Chem. 18: 1463-1472 (1997) for the description of the algorithm.
  87  *
  88  * In CUDA version, one thread is responsible for all computations for one constraint. The blocks are
  89  * filled in a way that no constraint is coupled to the constraint from the next block. This is achieved
  90  * by moving active threads to the next block, if the correspondent group of coupled constraints is to big
  91  * to fit the current thread block. This may leave some 'dummy' threads in the end of the thread block, i.e.
  92  * threads that are not required to do actual work. Since constraints from different blocks are not coupled,
  93  * there is no need to synchronize across the device. However, extensive communication in a thread block
  94  * are still needed.
  95  *
  96  * \todo Reduce synchronization overhead. Some ideas are:
  97  *        1. Consider going to warp-level synchronization for the coupled constraints.
  98  *        2. Move more data to local/shared memory and try to get rid of atomic operations (at least on
  99  *           the device level).
 100  *        3. Use analytical solution for matrix A inversion.
 101  *        4. Introduce mapping of thread id to both single constraint and single atom, thus designating
 102  *           Nth threads to deal with Nat <= Nth coupled atoms and Nc <= Nth coupled constraints.
 103  *       See Redmine issue #2885 for details (https://redmine.gromacs.org/issues/2885)
 104  * \todo The use of __restrict__  for gm_xp and gm_v causes failure, probably because of the atomic
 105          operations. Investigate this issue further.
 106  *
 107  * \param[in,out] kernelParams  All parameters and pointers for the kernel condensed in single struct.
 108  * \param[in]     invdt         Inverse timestep (needed to update velocities).
 109  */
 110 template <bool updateVelocities, bool computeVirial>
 111 __launch_bounds__(c_maxThreadsPerBlock)
 112 __global__ void lincs_kernel(LincsCudaKernelParameters   kernelParams,
 113                              const float3* __restrict__  gm_x,
 114                              float3*                     gm_xp,
 115                              float3*                     gm_v,
 116                              const float                 invdt)
 117 {
 118     const PbcAiuc               pbcAiuc                     = kernelParams.pbcAiuc;
 119     const int                   numConstraintsThreads       = kernelParams.numConstraintsThreads;
 120     const int                   numIterations               = kernelParams.numIterations;
 121     const int                   expansionOrder              = kernelParams.expansionOrder;
 122     const int2*   __restrict__  gm_constraints              = kernelParams.d_constraints;
 123     const float*  __restrict__  gm_constraintsTargetLengths = kernelParams.d_constraintsTargetLengths;
 124     const int*    __restrict__  gm_coupledConstraintsCounts = kernelParams.d_coupledConstraintsCounts;
 125     const int*    __restrict__  gm_coupledConstraintsIdxes  = kernelParams.d_coupledConstraintsIndices;
 126     const float*  __restrict__  gm_massFactors              = kernelParams.d_massFactors;
 127     float*  __restrict__        gm_matrixA                  = kernelParams.d_matrixA;
 128     const float*  __restrict__  gm_inverseMasses            = kernelParams.d_inverseMasses;
 129     float*  __restrict__        gm_virialScaled             = kernelParams.d_virialScaled;
 130
 131     int threadIndex                                         = blockIdx.x*blockDim.x+threadIdx.x;
 132
 133     // numConstraintsThreads should be a integer multiple of blockSize (numConstraintsThreads = numBlocks*blockSize).
 134     // This is to ensure proper synchronizations and reduction. All array are padded to the required size.
 135     assert(threadIndex < numConstraintsThreads);
 136
 137     // Vectors connecting constrained atoms before algorithm was applied.
 138     // Needed to construct constrain matrix A
 139     extern __shared__ float3 sm_r[];
 140
 141     int2                     pair = gm_constraints[threadIndex];
 142     int                      i    = pair.x;
 143     int                      j    = pair.y;
 144
 145     // Mass-scaled Lagrange multiplier
 146     float  lagrangeScaled = 0.0f;
 147
 148     float  targetLength;
 149     float  inverseMassi;
 150     float  inverseMassj;
 151     float  sqrtReducedMass;
 152
 153     float3 xi;
 154     float3 xj;
 155     float3 rc;
 156
 157     // i == -1 indicates dummy constraint at the end of the thread block.
 158     bool isDummyThread = (i == -1);
 159
 160     // Everything computed for these dummies will be equal to zero
 161     if (isDummyThread)
 162     {
 163         targetLength    = 0.0f;
 164         inverseMassi    = 0.0f;
 165         inverseMassj    = 0.0f;
 166         sqrtReducedMass = 0.0f;
 167
 168         xi = make_float3(0.0f, 0.0f, 0.0f);
 169         xj = make_float3(0.0f, 0.0f, 0.0f);
 170         rc = make_float3(0.0f, 0.0f, 0.0f);
 171     }
 172     else
 173     {
 174         // Collecting data
 175         targetLength    = gm_constraintsTargetLengths[threadIndex];
 176         inverseMassi    = gm_inverseMasses[i];
 177         inverseMassj    = gm_inverseMasses[j];
 178         sqrtReducedMass = rsqrt(inverseMassi + inverseMassj);
 179
 180         xi = gm_x[i];
 181         xj = gm_x[j];
 182
 183         float3 dx   = pbcDxAiuc(pbcAiuc, xi, xj);
 184
 185         float  rlen = rsqrtf(dx.x*dx.x + dx.y*dx.y + dx.z*dx.z);
 186         rc   = rlen*dx;
 187     }
 188
 189     sm_r[threadIdx.x] = rc;
 190     // Make sure that all r's are saved into shared memory
 191     // before they are accessed in the loop below
 192     __syncthreads();
 193
 194     /*
 195      * Constructing LINCS matrix (A)
 196      */
 197
 198     // Only non-zero values are saved (for coupled constraints)
 199     int coupledConstraintsCount = gm_coupledConstraintsCounts[threadIndex];
 200     for (int n = 0; n < coupledConstraintsCount; n++)
 201     {
 202         int    index = n*numConstraintsThreads + threadIndex;
 203         int    c1    = gm_coupledConstraintsIdxes[index];
 204
 205         float3 rc1 = sm_r[c1 - blockIdx.x*blockDim.x];
 206         gm_matrixA[index] = gm_massFactors[index]*(rc.x*rc1.x + rc.y*rc1.y + rc.z*rc1.z);
 207     }
 208
 209     // Skipping in dummy threads
 210     if (!isDummyThread)
 211     {
 212         xi = gm_xp[i];
 213         xj = gm_xp[j];
 214     }
 215
 216     float3 dx = pbcDxAiuc(pbcAiuc, xi, xj);
 217
 218     float  sol = sqrtReducedMass*((rc.x*dx.x + rc.y*dx.y + rc.z*dx.z) - targetLength);
 219
 220     /*
 221      *  Inverse matrix using a set of expansionOrder matrix multiplications
 222      */
 223
 224     // This will use the same memory space as sm_r, which is no longer needed.
 225     extern __shared__ float  sm_rhs[];
 226     // Save current right-hand-side vector in the shared memory
 227     sm_rhs[threadIdx.x] = sol;
 228
 229     for (int rec = 0; rec < expansionOrder; rec++)
 230     {
 231         // Making sure that all sm_rhs are saved before they are accessed in a loop below
 232         __syncthreads();
 233         float mvb = 0.0f;
 234
 235         for (int n = 0; n < coupledConstraintsCount; n++)
 236         {
 237             int index = n*numConstraintsThreads + threadIndex;
 238             int c1    = gm_coupledConstraintsIdxes[index];
 239             // Convolute current right-hand-side with A
 240             // Different, non overlapping parts of sm_rhs[..] are read during odd and even iterations
 241             mvb = mvb + gm_matrixA[index]*sm_rhs[c1 - blockIdx.x*blockDim.x + blockDim.x*(rec % 2)];
 242         }
 243         // 'Switch' rhs vectors, save current result
 244         // These values will be accessed in the loop above during the next iteration.
 245         sm_rhs[threadIdx.x + blockDim.x*((rec + 1) % 2)] = mvb;
 246         sol  = sol + mvb;
 247     }
 248
 249     // Current mass-scaled Lagrange multipliers
 250     lagrangeScaled = sqrtReducedMass*sol;
 251
 252     // Save updated coordinates before correction for the rotational lengthening
 253     float3 tmp     = rc*lagrangeScaled;
 254
 255     // Writing for all but dummy constraints
 256     if (!isDummyThread)
 257     {
 258         atomicAdd(&gm_xp[i], -tmp*inverseMassi);
 259         atomicAdd(&gm_xp[j],  tmp*inverseMassj);
 260     }
 261
 262     /*
 263      *  Correction for centripetal effects
 264      */
 265     for (int iter = 0; iter < numIterations; iter++)
 266     {
 267         // Make sure that all xp's are saved: atomic operation calls before are
 268         // communicating current xp[..] values across thread block.
 269         __syncthreads();
 270
 271         if (!isDummyThread)
 272         {
 273             xi = gm_xp[i];
 274             xj = gm_xp[j];
 275         }
 276
 277         float3 dx = pbcDxAiuc(pbcAiuc, xi, xj);
 278
 279         float  len2  = targetLength*targetLength;
 280         float  dlen2 = 2.0f*len2 - norm2(dx);
 281
 282         // TODO A little bit more effective but slightly less readable version of the below would be:
 283         //      float proj = sqrtReducedMass*(targetLength - (dlen2 > 0.0f ? 1.0f : 0.0f)*dlen2*rsqrt(dlen2));
 284         float  proj;
 285         if (dlen2 > 0.0f)
 286         {
 287             proj = sqrtReducedMass*(targetLength - dlen2*rsqrt(dlen2));
 288         }
 289         else
 290         {
 291             proj = sqrtReducedMass*targetLength;
 292         }
 293
 294         sm_rhs[threadIdx.x]   = proj;
 295         float sol = proj;
 296
 297         /*
 298          * Same matrix inversion as above is used for updated data
 299          */
 300         for (int rec = 0; rec < expansionOrder; rec++)
 301         {
 302             // Make sure that all elements of rhs are saved into shared memory
 303             __syncthreads();
 304             float mvb = 0;
 305
 306             for (int n = 0; n < coupledConstraintsCount; n++)
 307             {
 308                 int index = n*numConstraintsThreads + threadIndex;
 309                 int c1    = gm_coupledConstraintsIdxes[index];
 310
 311                 mvb = mvb + gm_matrixA[index]*sm_rhs[c1 - blockIdx.x*blockDim.x + blockDim.x*(rec % 2)];
 312
 313             }
 314             sm_rhs[threadIdx.x + blockDim.x*((rec + 1) % 2)] = mvb;
 315             sol  = sol + mvb;
 316         }
 317
 318         // Add corrections to Lagrange multipliers
 319         float sqrtmu_sol  = sqrtReducedMass*sol;
 320         lagrangeScaled += sqrtmu_sol;
 321
 322         // Save updated coordinates for the next iteration
 323         // Dummy constraints are skipped
 324         if (!isDummyThread)
 325         {
 326             float3 tmp = rc*sqrtmu_sol;
 327             atomicAdd(&gm_xp[i], -tmp*inverseMassi);
 328             atomicAdd(&gm_xp[j],  tmp*inverseMassj);
 329         }
 330     }
 331
 332     // Updating particle velocities for all but dummy threads
 333     if (updateVelocities && !isDummyThread)
 334     {
 335         float3 tmp     = rc*invdt*lagrangeScaled;
 336         atomicAdd(&gm_v[i], -tmp*inverseMassi);
 337         atomicAdd(&gm_v[j],  tmp*inverseMassj);
 338     }
 339
 340
 341     if (computeVirial)
 342     {
 343         // Virial is computed from Lagrange multiplier (lagrangeScaled), target constrain length
 344         // (targetLength) and the normalized vector connecting constrained atoms before
 345         // the algorithm was applied (rc). The evaluation of virial in each thread is
 346         // followed by basic reduction for the values inside single thread block.
 347         // Then, the values are reduced across grid by atomicAdd(...).
 348         //
 349         // TODO Shuffle reduction.
 350         // TODO Should be unified and/or done once when virial is actually needed.
 351         // TODO Recursive version that removes atomicAdd(...)'s entirely is needed. Ideally,
 352         //      one that works for any datatype.
 353
 354         // Save virial for each thread into the shared memory. Tensor is symmetrical, hence only
 355         // 6 values are saved. Dummy threads will have zeroes in their virial: targetLength,
 356         // lagrangeScaled and rc are all set to zero for them in the beginning of the kernel.
 357         // The sm_threadVirial[..] will overlap with the sm_r[..] and sm_rhs[..], but the latter
 358         // two are no longer in use.
 359         extern __shared__ float  sm_threadVirial[];
 360         float                    mult = targetLength*lagrangeScaled;
 361         sm_threadVirial[0*blockDim.x + threadIdx.x] = mult*rc.x*rc.x;
 362         sm_threadVirial[1*blockDim.x + threadIdx.x] = mult*rc.x*rc.y;
 363         sm_threadVirial[2*blockDim.x + threadIdx.x] = mult*rc.x*rc.z;
 364         sm_threadVirial[3*blockDim.x + threadIdx.x] = mult*rc.y*rc.y;
 365         sm_threadVirial[4*blockDim.x + threadIdx.x] = mult*rc.y*rc.z;
 366         sm_threadVirial[5*blockDim.x + threadIdx.x] = mult*rc.z*rc.z;
 367
 368         __syncthreads();
 369
 370         // Reduce up to one virial per thread block. All blocks are divided by half, the first
 371         // half of threads sums two virials. Then the first half is divided by two and the first
 372         // half of it sums two values. This procedure is repeated until only one thread is left.
 373         // Only works if the threads per blocks is a power of two (hence static_assert
 374         // in the beginning of the kernel).
 375         for (int divideBy = 2; divideBy <= static_cast<int>(blockDim.x); divideBy *= 2)
 376         {
 377             int dividedAt = blockDim.x/divideBy;
 378             if (static_cast<int>(threadIdx.x) < dividedAt)
 379             {
 380                 for (int d = 0; d < 6; d++)
 381                 {
 382                     sm_threadVirial[d*blockDim.x + threadIdx.x] += sm_threadVirial[d*blockDim.x + (threadIdx.x + dividedAt)];
 383                 }
 384             }
 385             // Syncronize if not within one warp
 386             if (dividedAt > warpSize/2)
 387             {
 388                 __syncthreads();
 389             }
 390         }
 391         // First 6 threads in the block add the results of 6 tensor components to the global memory address.
 392         if (threadIdx.x < 6)
 393         {
 394             atomicAdd(&(gm_virialScaled[threadIdx.x]), sm_threadVirial[threadIdx.x*blockDim.x]);
 395         }
 396     }
 397
 398     return;
 399 }
 400
 401 /*! \brief Select templated kernel.
 402  *
 403  * Returns pointer to a CUDA kernel based on provided booleans.
 404  *
 405  * \param[in] updateVelocities  If the velocities should be constrained.
 406  * \param[in] computeVirial     If virial should be updated.
 407  *
 408  * \return                      Pointer to CUDA kernel
 409  */
 410 inline auto getLincsKernelPtr(const bool  updateVelocities,
 411                               const bool  computeVirial)
 412 {
 413
 414     auto kernelPtr = lincs_kernel<true, true>;
 415     if (updateVelocities && computeVirial)
 416     {
 417         kernelPtr = lincs_kernel<true, true>;
 418     }
 419     else if (updateVelocities && !computeVirial)
 420     {
 421         kernelPtr = lincs_kernel<true, false>;
 422     }
 423     else if (!updateVelocities && computeVirial)
 424     {
 425         kernelPtr = lincs_kernel<false, true>;
 426     }
 427     else if (!updateVelocities && !computeVirial)
 428     {
 429         kernelPtr = lincs_kernel<false, false>;
 430     }
 431     return kernelPtr;
 432 }
 433
 434 void LincsCuda::Impl::apply(const float3 *d_x,
 435                             float3       *d_xp,
 436                             const bool    updateVelocities,
 437                             float3       *d_v,
 438                             const real    invdt,
 439                             const bool    computeVirial,
 440                             tensor        virialScaled)
 441 {
 442     ensureNoPendingCudaError("In CUDA version of LINCS");
 443
 444     // Early exit if no constraints
 445     if (kernelParams_.numConstraintsThreads == 0)
 446     {
 447         return;
 448     }
 449
 450     if (computeVirial)
 451     {
 452         // Fill with zeros so the values can be reduced to it
 453         // Only 6 values are needed because virial is symmetrical
 454         clearDeviceBufferAsync(&kernelParams_.d_virialScaled, 0, 6, stream_);
 455     }
 456
 457     auto               kernelPtr = getLincsKernelPtr(updateVelocities, computeVirial);
 458
 459     KernelLaunchConfig config;
 460     config.blockSize[0]     = c_threadsPerBlock;
 461     config.blockSize[1]     = 1;
 462     config.blockSize[2]     = 1;
 463     config.gridSize[0]      = (kernelParams_.numConstraintsThreads + c_threadsPerBlock - 1)/c_threadsPerBlock;
 464     config.gridSize[1]      = 1;
 465     config.gridSize[2]      = 1;
 466
 467     // Shared memory is used to store:
 468     // -- Current coordinates (3 floats per thread)
 469     // -- Right-hand-sides for matrix inversion (2 floats per thread)
 470     // -- Virial tensor components (6 floats per thread)
 471     // Since none of these three are needed simultaneously, they can be saved at the same shared memory address
 472     // (i.e. correspondent arrays are intentionally overlapped in address space). Consequently, only
 473     // max{3, 2, 6} = 6 floats per thread are needed in case virial is computed, or max{3, 2} = 3 if not.
 474     if (computeVirial)
 475     {
 476         config.sharedMemorySize = c_threadsPerBlock*6*sizeof(float);
 477     }
 478     else
 479     {
 480         config.sharedMemorySize = c_threadsPerBlock*3*sizeof(float);
 481     }
 482     config.stream           = stream_;
 483
 484     // This is to satisfy prepareGpuKernelArguments(...)
 485     // It there a better way?
 486     const float3 * gm_x  = d_x;
 487     float3       * gm_xp = d_xp;
 488     float3       * gm_v  = d_v;
 489
 490     const auto     kernelArgs = prepareGpuKernelArguments(kernelPtr, config,
 491                                                           &kernelParams_,
 492                                                           &gm_x, &gm_xp,
 493                                                           &gm_v, &invdt);
 494
 495     launchGpuKernel(kernelPtr, config, nullptr,
 496                     "lincs_kernel<updateVelocities, computeVirial>", kernelArgs);
 497
 498     if (computeVirial)
 499     {
 500         // Copy LINCS virial data and add it to the common virial
 501         copyFromDeviceBuffer(h_virialScaled_.data(), &kernelParams_.d_virialScaled,
 502                              0, 6,
 503                              stream_, GpuApiCallBehavior::Sync, nullptr);
 504
 505         // Mapping [XX, XY, XZ, YY, YZ, ZZ] internal format to a tensor object
 506         virialScaled[XX][XX] += h_virialScaled_[0];
 507         virialScaled[XX][YY] += h_virialScaled_[1];
 508         virialScaled[XX][ZZ] += h_virialScaled_[2];
 509
 510         virialScaled[YY][XX] += h_virialScaled_[1];
 511         virialScaled[YY][YY] += h_virialScaled_[3];
 512         virialScaled[YY][ZZ] += h_virialScaled_[4];
 513
 514         virialScaled[ZZ][XX] += h_virialScaled_[2];
 515         virialScaled[ZZ][YY] += h_virialScaled_[4];
 516         virialScaled[ZZ][ZZ] += h_virialScaled_[5];
 517     }
 518
 519     return;
 520 }
 521
 522 void LincsCuda::Impl::copyApplyCopy(const int   numAtoms,
 523                                     const rvec *h_x,
 524                                     rvec       *h_xp,
 525                                     bool        updateVelocities,
 526                                     rvec       *h_v,
 527                                     real        invdt,
 528                                     bool        computeVirial,
 529                                     tensor      virialScaled)
 530 {
 531
 532     float3 *d_x, *d_xp, *d_v;
 533
 534     allocateDeviceBuffer(&d_x,  numAtoms, nullptr);
 535     allocateDeviceBuffer(&d_xp, numAtoms, nullptr);
 536     allocateDeviceBuffer(&d_v,  numAtoms, nullptr);
 537
 538     copyToDeviceBuffer(&d_x, (float3*)h_x, 0, numAtoms, stream_, GpuApiCallBehavior::Sync, nullptr);
 539     copyToDeviceBuffer(&d_xp, (float3*)h_xp, 0, numAtoms, stream_, GpuApiCallBehavior::Sync, nullptr);
 540     if (updateVelocities)
 541     {
 542         copyToDeviceBuffer(&d_v, (float3*)h_v, 0, numAtoms, stream_, GpuApiCallBehavior::Sync, nullptr);
 543     }
 544     apply(d_x, d_xp,
 545           updateVelocities, d_v, invdt,
 546           computeVirial, virialScaled);
 547
 548     copyFromDeviceBuffer((float3*)h_xp, &d_xp, 0, numAtoms, stream_, GpuApiCallBehavior::Sync, nullptr);
 549     if (updateVelocities)
 550     {
 551         copyFromDeviceBuffer((float3*)h_v, &d_v, 0, numAtoms, stream_, GpuApiCallBehavior::Sync, nullptr);
 552     }
 553
 554     freeDeviceBuffer(&d_x);
 555     freeDeviceBuffer(&d_xp);
 556     freeDeviceBuffer(&d_v);
 557 }
 558
 559 LincsCuda::Impl::Impl(int numIterations,
 560                       int expansionOrder)
 561 {
 562     kernelParams_.numIterations              = numIterations;
 563     kernelParams_.expansionOrder             = expansionOrder;
 564
 565     static_assert(sizeof(real) == sizeof(float),
 566                   "Real numbers should be in single precision in GPU code.");
 567     static_assert(c_threadsPerBlock > 0 && ((c_threadsPerBlock & (c_threadsPerBlock - 1)) == 0),
 568                   "Number of threads per block should be a power of two in order for reduction to work.");
 569
 570     allocateDeviceBuffer(&kernelParams_.d_virialScaled, 6, nullptr);
 571     h_virialScaled_.resize(6);
 572
 573     // The data arrays should be expanded/reallocated on first call of set() function.
 574     numConstraintsThreadsAlloc_ = 0;
 575     numAtomsAlloc_              = 0;
 576     // Use default stream.
 577     // TODO The stream should/can be assigned by the GPU schedule when the code will be integrated.
 578     stream_ = nullptr;
 579
 580 }
 581
 582 LincsCuda::Impl::~Impl()
 583 {
 584     freeDeviceBuffer(&kernelParams_.d_virialScaled);
 585
 586     if (numConstraintsThreadsAlloc_ > 0)
 587     {
 588         freeDeviceBuffer(&kernelParams_.d_constraints);
 589         freeDeviceBuffer(&kernelParams_.d_constraintsTargetLengths);
 590
 591         freeDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts);
 592         freeDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices);
 593         freeDeviceBuffer(&kernelParams_.d_massFactors);
 594         freeDeviceBuffer(&kernelParams_.d_matrixA);
 595     }
 596     if (numAtomsAlloc_ > 0)
 597     {
 598         freeDeviceBuffer(&kernelParams_.d_inverseMasses);
 599     }
 600 }
 601
 602 /*! \brief Helper function to go through constraints recursively.
 603  *
 604  *  For each constraint, counts the number of coupled constraints and stores the value in spaceNeeded array.
 605  *  This information is used to split the array of constraints between thread blocks on a GPU so there is no
 606  *  coupling between constraints from different thread blocks. After the 'spaceNeeded' array is filled, the
 607  *  value spaceNeeded[c] should be equal to the number of constraints that are coupled to 'c' and located
 608  *  after it in the constraints array.
 609  *
 610  * \param[in]     a                  Atom index.
 611  * \param[in,out] spaceNeeded        Indicates if the constraint was already counted and stores
 612  *                                   the number of constraints (i) connected to it and (ii) located
 613  *                                   after it in memory. This array is filled by this recursive function.
 614  *                                   For a set of coupled constraints, only for the first one in this list
 615  *                                   the number of consecutive coupled constraints is needed: if there is
 616  *                                   not enough space for this set of constraints in the thread block,
 617  *                                   the group has to be moved to the next one.
 618  * \param[in]     atomAdjacencyList  Stores information about connections between atoms.
 619  */
 620 inline int countCoupled(int a, std::vector<int> *spaceNeeded,
 621                         std::vector<std::vector<std::tuple<int, int, int> > > *atomsAdjacencyList)
 622
 623 {
 624     int c2, a2, sign;
 625     int counted = 0;
 626     for (unsigned i = 0; i < atomsAdjacencyList->at(a).size(); i++)
 627     {
 628         std::tie(a2, c2, sign) = atomsAdjacencyList->at(a).at(i);
 629         if (spaceNeeded->at(c2) == -1)
 630         {
 631             spaceNeeded->at(c2) = 0; // To indicate we've been here
 632             counted            += 1 + countCoupled(a2, spaceNeeded, atomsAdjacencyList);
 633         }
 634     }
 635     return counted;
 636 }
 637
 638 void LincsCuda::Impl::set(const t_idef    &idef,
 639                           const t_mdatoms &md)
 640 {
 641     int                 numAtoms = md.nr;
 642     // List of constrained atoms (CPU memory)
 643     std::vector<int2>   constraintsHost;
 644     // Equilibrium distances for the constraints (CPU)
 645     std::vector<float>  constraintsTargetLengthsHost;
 646     // Number of constraints, coupled with the current one (CPU)
 647     std::vector<int>    coupledConstraintsCountsHost;
 648     // List of coupled with the current one (CPU)
 649     std::vector<int>    coupledConstraintsIndicesHost;
 650     // Mass factors (CPU)
 651     std::vector<float>  massFactorsHost;
 652
 653     // List of constrained atoms in local topology
 654     t_iatom  *iatoms         = idef.il[F_CONSTR].iatoms;
 655     const int stride         = NRAL(F_CONSTR) + 1;
 656     const int numConstraints = idef.il[F_CONSTR].nr/stride;
 657
 658     // Early exit if no constraints
 659     if (numConstraints == 0)
 660     {
 661         kernelParams_.numConstraintsThreads = 0;
 662         return;
 663     }
 664
 665     // Constructing adjacency list --- usefull intermediate structure
 666     std::vector<std::vector<std::tuple<int, int, int> > > atomsAdjacencyList(numAtoms);
 667     for (int c = 0; c < numConstraints; c++)
 668     {
 669         int a1     = iatoms[stride*c + 1];
 670         int a2     = iatoms[stride*c + 2];
 671
 672         // Each constraint will be represented as a tuple, containing index of the second constrained atom,
 673         // index of the constraint and a sign that indicates the order of atoms in which they are listed.
 674         // Sign is needed to compute the mass factors.
 675         atomsAdjacencyList.at(a1).push_back(std::make_tuple(a2, c, +1));
 676         atomsAdjacencyList.at(a2).push_back(std::make_tuple(a1, c, -1));
 677     }
 678
 679     // Compute, how many coupled constraints are in front of each constraint.
 680     // Needed to introduce splits in data so that all coupled constraints will be computed in a single GPU block.
 681     // The position 'c' of the vector spaceNeeded should have the number of constraints that are coupled to a constraint
 682     // 'c' and are after 'c' in the vector. Only first index of the connected group of the constraints is needed later in the
 683     // code, hence the spaceNeeded vector is also used to keep track if the constrain was already counted.
 684     std::vector<int> spaceNeeded;
 685     spaceNeeded.resize(numConstraints, -1);
 686     std::fill(spaceNeeded.begin(), spaceNeeded.end(), -1);
 687     for (int c = 0; c < numConstraints; c++)
 688     {
 689         int a1     = iatoms[stride*c + 1];
 690         int a2     = iatoms[stride*c + 2];
 691         if (spaceNeeded.at(c) == -1)
 692         {
 693             spaceNeeded.at(c) = countCoupled(a1, &spaceNeeded, &atomsAdjacencyList) +
 694                 countCoupled(a2, &spaceNeeded, &atomsAdjacencyList);
 695         }
 696     }
 697
 698     // Map of splits in the constraints data. For each 'old' constraint index gives 'new' which
 699     // takes into account the empty spaces which might be needed in the end of each thread block.
 700     std::vector<int> splitMap;
 701     splitMap.resize(numConstraints, -1);
 702     int              currentMapIndex = 0;
 703     for (int c = 0; c < numConstraints; c++)
 704     {
 705         // Check if coupled constraints all fit in one block
 706         GMX_RELEASE_ASSERT(spaceNeeded.at(c) < c_threadsPerBlock, "Maximum number of coupled constraints exceedes the size of the CUDA thread block. "
 707                            "Most likely, you are trying to use GPU version of LINCS with constraints on all-bonds, "
 708                            "which is not supported. Try using H-bonds constraints only.");
 709         if (currentMapIndex / c_threadsPerBlock != (currentMapIndex + spaceNeeded.at(c)) / c_threadsPerBlock)
 710         {
 711             currentMapIndex = ((currentMapIndex/c_threadsPerBlock) + 1) * c_threadsPerBlock;
 712         }
 713         splitMap.at(c) = currentMapIndex;
 714         currentMapIndex++;
 715     }
 716     kernelParams_.numConstraintsThreads = currentMapIndex + c_threadsPerBlock - currentMapIndex % c_threadsPerBlock;
 717     GMX_RELEASE_ASSERT(kernelParams_.numConstraintsThreads % c_threadsPerBlock == 0, "Number of threads should be a multiple of the block size");
 718
 719     // Initialize constraints and their target indexes taking into account the splits in the data arrays.
 720     int2 pair;
 721     pair.x = -1;
 722     pair.y = -1;
 723     constraintsHost.resize(kernelParams_.numConstraintsThreads, pair);
 724     std::fill(constraintsHost.begin(), constraintsHost.end(), pair);
 725     constraintsTargetLengthsHost.resize(kernelParams_.numConstraintsThreads, 0.0);
 726     std::fill(constraintsTargetLengthsHost.begin(), constraintsTargetLengthsHost.end(), 0.0);
 727     for (int c = 0; c < numConstraints; c++)
 728     {
 729         int  a1     = iatoms[stride*c + 1];
 730         int  a2     = iatoms[stride*c + 2];
 731         int  type   = iatoms[stride*c];
 732
 733         int2 pair;
 734         pair.x = a1;
 735         pair.y = a2;
 736         constraintsHost.at(splitMap.at(c))              = pair;
 737         constraintsTargetLengthsHost.at(splitMap.at(c)) = idef.iparams[type].constr.dA;
 738
 739     }
 740
 741     // The adjacency list of constraints (i.e. the list of coupled constraints for each constraint).
 742     // We map a single thread to a single constraint, hence each thread 'c' will be using one element from
 743     // coupledConstraintsCountsHost array, which is the number of constraints coupled to the constraint 'c'.
 744     // The coupled constraints indexes are placed into the coupledConstraintsIndicesHost array. Latter is organized
 745     // as a one-dimensional array to ensure good memory alignment. It is addressed as [c + i*numConstraintsThreads],
 746     // where 'i' goes from zero to the number of constraints coupled to 'c'. 'numConstraintsThreads' is the width of
 747     // the array --- a number, greater then total number of constraints, taking into account the splits in the
 748     // constraints array due to the GPU block borders. This number can be adjusted to improve memory access pattern.
 749     // Mass factors are saved in a similar data structure.
 750     int              maxCoupledConstraints = 0;
 751     for (int c = 0; c < numConstraints; c++)
 752     {
 753         int a1     = iatoms[stride*c + 1];
 754         int a2     = iatoms[stride*c + 2];
 755
 756         // Constraint 'c' is counted twice, but it should be excluded altogether. Hence '-2'.
 757         int nCoupedConstraints = atomsAdjacencyList.at(a1).size() + atomsAdjacencyList.at(a2).size() - 2;
 758
 759         if (nCoupedConstraints > maxCoupledConstraints)
 760         {
 761             maxCoupledConstraints = nCoupedConstraints;
 762         }
 763     }
 764
 765     coupledConstraintsCountsHost.resize(kernelParams_.numConstraintsThreads, 0);
 766     coupledConstraintsIndicesHost.resize(maxCoupledConstraints*kernelParams_.numConstraintsThreads, -1);
 767     massFactorsHost.resize(maxCoupledConstraints*kernelParams_.numConstraintsThreads, -1);
 768
 769     for (int c1 = 0; c1 < numConstraints; c1++)
 770     {
 771         coupledConstraintsCountsHost.at(splitMap.at(c1))  = 0;
 772         int c1a1     = iatoms[stride*c1 + 1];
 773         int c1a2     = iatoms[stride*c1 + 2];
 774         int c2;
 775         int c2a1;
 776         int c2a2;
 777
 778         int sign;
 779
 780         // Constraints, coupled trough the first atom.
 781         c2a1 = c1a1;
 782         for (unsigned j = 0; j < atomsAdjacencyList.at(c1a1).size(); j++)
 783         {
 784
 785             std::tie(c2a2, c2, sign) = atomsAdjacencyList.at(c1a1).at(j);
 786
 787             if (c1 != c2)
 788             {
 789                 int index = kernelParams_.numConstraintsThreads*coupledConstraintsCountsHost.at(splitMap.at(c1)) + splitMap.at(c1);
 790
 791                 coupledConstraintsIndicesHost.at(index) = splitMap.at(c2);
 792
 793                 int   center = c1a1;
 794
 795                 float sqrtmu1 = 1.0/sqrt(md.invmass[c1a1] + md.invmass[c1a2]);
 796                 float sqrtmu2 = 1.0/sqrt(md.invmass[c2a1] + md.invmass[c2a2]);
 797
 798                 massFactorsHost.at(index) = -sign*md.invmass[center]*sqrtmu1*sqrtmu2;
 799
 800                 coupledConstraintsCountsHost.at(splitMap.at(c1))++;
 801
 802             }
 803         }
 804
 805         // Constraints, coupled through the second atom.
 806         c2a1 = c1a2;
 807         for (unsigned j = 0; j < atomsAdjacencyList.at(c1a2).size(); j++)
 808         {
 809
 810             std::tie(c2a2, c2, sign) = atomsAdjacencyList.at(c1a2).at(j);
 811
 812             if (c1 != c2)
 813             {
 814                 int index = kernelParams_.numConstraintsThreads*coupledConstraintsCountsHost.at(splitMap.at(c1)) + splitMap.at(c1);
 815
 816                 coupledConstraintsIndicesHost.at(index) = splitMap.at(c2);
 817
 818                 int   center = c1a2;
 819
 820                 float sqrtmu1 = 1.0/sqrt(md.invmass[c1a1] + md.invmass[c1a2]);
 821                 float sqrtmu2 = 1.0/sqrt(md.invmass[c2a1] + md.invmass[c2a2]);
 822
 823                 massFactorsHost.at(index) = sign*md.invmass[center]*sqrtmu1*sqrtmu2;
 824
 825                 coupledConstraintsCountsHost.at(splitMap.at(c1))++;
 826
 827             }
 828         }
 829     }
 830
 831     // (Re)allocate the memory, if the number of constraints has increased.
 832     if (kernelParams_.numConstraintsThreads > numConstraintsThreadsAlloc_)
 833     {
 834         // Free memory if it was allocated before (i.e. if not the first time here).
 835         if (numConstraintsThreadsAlloc_ > 0)
 836         {
 837             freeDeviceBuffer(&kernelParams_.d_constraints);
 838             freeDeviceBuffer(&kernelParams_.d_constraintsTargetLengths);
 839
 840             freeDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts);
 841             freeDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices);
 842             freeDeviceBuffer(&kernelParams_.d_massFactors);
 843             freeDeviceBuffer(&kernelParams_.d_matrixA);
 844
 845         }
 846
 847         numConstraintsThreadsAlloc_ = kernelParams_.numConstraintsThreads;
 848
 849         allocateDeviceBuffer(&kernelParams_.d_constraints, kernelParams_.numConstraintsThreads, nullptr);
 850         allocateDeviceBuffer(&kernelParams_.d_constraintsTargetLengths, kernelParams_.numConstraintsThreads, nullptr);
 851
 852         allocateDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts, kernelParams_.numConstraintsThreads, nullptr);
 853         allocateDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices, maxCoupledConstraints*kernelParams_.numConstraintsThreads, nullptr);
 854         allocateDeviceBuffer(&kernelParams_.d_massFactors, maxCoupledConstraints*kernelParams_.numConstraintsThreads, nullptr);
 855         allocateDeviceBuffer(&kernelParams_.d_matrixA, maxCoupledConstraints*kernelParams_.numConstraintsThreads, nullptr);
 856
 857     }
 858
 859     // (Re)allocate the memory, if the number of atoms has increased.
 860     if (numAtoms > numAtomsAlloc_)
 861     {
 862         if (numAtomsAlloc_ > 0)
 863         {
 864             freeDeviceBuffer(&kernelParams_.d_inverseMasses);
 865         }
 866         numAtomsAlloc_ = numAtoms;
 867         allocateDeviceBuffer(&kernelParams_.d_inverseMasses, numAtoms, nullptr);
 868     }
 869
 870     // Copy data to GPU.
 871     copyToDeviceBuffer(&kernelParams_.d_constraints, constraintsHost.data(),
 872                        0, kernelParams_.numConstraintsThreads,
 873                        stream_, GpuApiCallBehavior::Sync, nullptr);
 874     copyToDeviceBuffer(&kernelParams_.d_constraintsTargetLengths, constraintsTargetLengthsHost.data(),
 875                        0, kernelParams_.numConstraintsThreads,
 876                        stream_, GpuApiCallBehavior::Sync, nullptr);
 877     copyToDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts, coupledConstraintsCountsHost.data(),
 878                        0, kernelParams_.numConstraintsThreads,
 879                        stream_, GpuApiCallBehavior::Sync, nullptr);
 880     copyToDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices, coupledConstraintsIndicesHost.data(),
 881                        0, maxCoupledConstraints*kernelParams_.numConstraintsThreads,
 882                        stream_, GpuApiCallBehavior::Sync, nullptr);
 883     copyToDeviceBuffer(&kernelParams_.d_massFactors, massFactorsHost.data(),
 884                        0, maxCoupledConstraints*kernelParams_.numConstraintsThreads,
 885                        stream_, GpuApiCallBehavior::Sync, nullptr);
 886
 887     GMX_RELEASE_ASSERT(md.invmass != nullptr, "Masses of attoms should be specified.\n");
 888     copyToDeviceBuffer(&kernelParams_.d_inverseMasses, md.invmass,
 889                        0, numAtoms,
 890                        stream_, GpuApiCallBehavior::Sync, nullptr);
 891
 892 }
 893
 894 void LincsCuda::Impl::setPbc(const t_pbc *pbc)
 895 {
 896     setPbcAiuc(pbc->ndim_ePBC, pbc->box, &kernelParams_.pbcAiuc);
 897 }
 898
 899 LincsCuda::LincsCuda(const int numIterations,
 900                      const int expansionOrder)
 901     : impl_(new Impl(numIterations, expansionOrder))
 902 {
 903 }
 904
 905 LincsCuda::~LincsCuda() = default;
 906
 907 void LincsCuda::copyApplyCopy(const int   numAtoms,
 908                               const rvec *h_x,
 909                               rvec       *h_xp,
 910                               const bool  updateVelocities,
 911                               rvec       *h_v,
 912                               const real  invdt,
 913                               const bool  computeVirial,
 914                               tensor      virialScaled)
 915 {
 916     impl_->copyApplyCopy(numAtoms, h_x, h_xp,
 917                          updateVelocities, h_v, invdt,
 918                          computeVirial, virialScaled);
 919 }
 920
 921 void LincsCuda::setPbc(const t_pbc *pbc)
 922 {
 923     impl_->setPbc(pbc);
 924 }
 925
 926 void LincsCuda::set(const t_idef    &idef,
 927                     const t_mdatoms &md)
 928 {
 929     impl_->set(idef, md);
 930 }
 931
 932 } // namespace gmx