2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2019, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
37 * \brief Implements LINCS using CUDA
39 * This file contains implementation of LINCS constraints algorithm
40 * using CUDA, including class initialization, data-structures management
43 * \note Management of coordinates, velocities, CUDA stream and periodic boundary exists here as a temporary
44 * scaffolding to allow this class to be used as a stand-alone unit. The scaffolding is intended to be
45 * removed once constraints are integrated with update module.
46 * \todo Reconsider naming, i.e. "cuda" suffics should be changed to "gpu".
48 * \author Artem Zhmurov <zhmurov@gmail.com>
49 * \author Alan Gray <alang@nvidia.com>
51 * \ingroup module_mdlib
55 #include "lincs_cuda_impl.h"
64 #include "gromacs/gpu_utils/cuda_arch_utils.cuh"
65 #include "gromacs/gpu_utils/cudautils.cuh"
66 #include "gromacs/gpu_utils/devicebuffer.cuh"
67 #include "gromacs/gpu_utils/gputraits.cuh"
68 #include "gromacs/gpu_utils/vectype_ops.cuh"
69 #include "gromacs/math/vec.h"
70 #include "gromacs/mdlib/constr.h"
71 #include "gromacs/mdlib/lincs_cuda.h"
72 #include "gromacs/pbcutil/pbc.h"
73 #include "gromacs/pbcutil/pbc_aiuc_cuda.cuh"
74 #include "gromacs/topology/ifunc.h"
79 //! Number of CUDA threads in a block
80 constexpr static int c_threadsPerBlock = 256;
81 //! Maximum number of threads in a block (for __launch_bounds__)
82 constexpr static int c_maxThreadsPerBlock = c_threadsPerBlock;
84 /*! \brief Main kernel for LINCS constraints.
86 * See Hess et al., J. Comput. Chem. 18: 1463-1472 (1997) for the description of the algorithm.
88 * In CUDA version, one thread is responsible for all computations for one constraint. The blocks are
89 * filled in a way that no constraint is coupled to the constraint from the next block. This is achieved
90 * by moving active threads to the next block, if the correspondent group of coupled constraints is to big
91 * to fit the current thread block. This may leave some 'dummy' threads in the end of the thread block, i.e.
92 * threads that are not required to do actual work. Since constraints from different blocks are not coupled,
93 * there is no need to synchronize across the device. However, extensive communication in a thread block
96 * \todo Reduce synchronization overhead. Some ideas are:
97 * 1. Consider going to warp-level synchronization for the coupled constraints.
98 * 2. Move more data to local/shared memory and try to get rid of atomic operations (at least on
100 * 3. Use analytical solution for matrix A inversion.
101 * 4. Introduce mapping of thread id to both single constraint and single atom, thus designating
102 * Nth threads to deal with Nat <= Nth coupled atoms and Nc <= Nth coupled constraints.
103 * See Redmine issue #2885 for details (https://redmine.gromacs.org/issues/2885)
105 * \param[in,out] kernelParams All parameters and pointers for the kernel condensed in single struct.
106 * \param[in] invdt Inverse timestep (needed to update velocities).
108 template <bool updateVelocities, bool computeVirial>
109 __launch_bounds__(c_maxThreadsPerBlock)
110 __global__ void lincs_kernel(LincsCudaKernelParameters kernelParams,
113 const PbcAiuc pbcAiuc = kernelParams.pbcAiuc;
114 const int numConstraintsThreads = kernelParams.numConstraintsThreads;
115 const int numIterations = kernelParams.numIterations;
116 const int expansionOrder = kernelParams.expansionOrder;
117 const float3* __restrict__ gm_x = kernelParams.d_x;
118 float3* gm_xp = kernelParams.d_xp;
119 const int2* __restrict__ gm_constraints = kernelParams.d_constraints;
120 const float* __restrict__ gm_constraintsTargetLengths = kernelParams.d_constraintsTargetLengths;
121 const int* __restrict__ gm_coupledConstraintsCounts = kernelParams.d_coupledConstraintsCounts;
122 const int* __restrict__ gm_coupledConstraintsIdxes = kernelParams.d_coupledConstraintsIndices;
123 const float* __restrict__ gm_massFactors = kernelParams.d_massFactors;
124 float* __restrict__ gm_matrixA = kernelParams.d_matrixA;
125 const float* __restrict__ gm_inverseMasses = kernelParams.d_inverseMasses;
126 float3* gm_v = kernelParams.d_v;
127 float* __restrict__ gm_virialScaled = kernelParams.d_virialScaled;
129 int threadIndex = blockIdx.x*blockDim.x+threadIdx.x;
131 // numConstraintsThreads should be a integer multiple of blockSize (numConstraintsThreads = numBlocks*blockSize).
132 // This is to ensure proper synchronizations and reduction. All array are padded to the required size.
133 assert(threadIndex < numConstraintsThreads);
135 // Vectors connecting constrained atoms before algorithm was applied.
136 // Needed to construct constrain matrix A
137 extern __shared__ float3 sm_r[];
139 int2 pair = gm_constraints[threadIndex];
143 // Mass-scaled Lagrange multiplier
144 float lagrangeScaled = 0.0f;
149 float sqrtReducedMass;
155 // i == -1 indicates dummy constraint at the end of the thread block.
156 bool isDummyThread = (i == -1);
158 // Everything computed for these dummies will be equal to zero
164 sqrtReducedMass = 0.0f;
166 xi = make_float3(0.0f, 0.0f, 0.0f);
167 xj = make_float3(0.0f, 0.0f, 0.0f);
168 rc = make_float3(0.0f, 0.0f, 0.0f);
173 targetLength = gm_constraintsTargetLengths[threadIndex];
174 inverseMassi = gm_inverseMasses[i];
175 inverseMassj = gm_inverseMasses[j];
176 sqrtReducedMass = rsqrt(inverseMassi + inverseMassj);
181 float3 dx = pbcDxAiuc(pbcAiuc, xi, xj);
183 float rlen = rsqrtf(dx.x*dx.x + dx.y*dx.y + dx.z*dx.z);
187 sm_r[threadIdx.x] = rc;
188 // Make sure that all r's are saved into shared memory
189 // before they are accessed in the loop below
193 * Constructing LINCS matrix (A)
196 // Only non-zero values are saved (for coupled constraints)
197 int coupledConstraintsCount = gm_coupledConstraintsCounts[threadIndex];
198 for (int n = 0; n < coupledConstraintsCount; n++)
200 int index = n*numConstraintsThreads + threadIndex;
201 int c1 = gm_coupledConstraintsIdxes[index];
203 float3 rc1 = sm_r[c1 - blockIdx.x*blockDim.x];
204 gm_matrixA[index] = gm_massFactors[index]*(rc.x*rc1.x + rc.y*rc1.y + rc.z*rc1.z);
207 // Skipping in dummy threads
214 float3 dx = pbcDxAiuc(pbcAiuc, xi, xj);
216 float sol = sqrtReducedMass*((rc.x*dx.x + rc.y*dx.y + rc.z*dx.z) - targetLength);
219 * Inverse matrix using a set of expansionOrder matrix multiplications
222 // This will use the same memory space as sm_r, which is no longer needed.
223 extern __shared__ float sm_rhs[];
224 // Save current right-hand-side vector in the shared memory
225 sm_rhs[threadIdx.x] = sol;
227 for (int rec = 0; rec < expansionOrder; rec++)
229 // Making sure that all sm_rhs are saved before they are accessed in a loop below
233 for (int n = 0; n < coupledConstraintsCount; n++)
235 int index = n*numConstraintsThreads + threadIndex;
236 int c1 = gm_coupledConstraintsIdxes[index];
237 // Convolute current right-hand-side with A
238 // Different, non overlapping parts of sm_rhs[..] are read during odd and even iterations
239 mvb = mvb + gm_matrixA[index]*sm_rhs[c1 - blockIdx.x*blockDim.x + blockDim.x*(rec % 2)];
241 // 'Switch' rhs vectors, save current result
242 // These values will be accessed in the loop above during the next iteration.
243 sm_rhs[threadIdx.x + blockDim.x*((rec + 1) % 2)] = mvb;
247 // Current mass-scaled Lagrange multipliers
248 lagrangeScaled = sqrtReducedMass*sol;
250 // Save updated coordinates before correction for the rotational lengthening
251 float3 tmp = rc*lagrangeScaled;
253 // Writing for all but dummy constraints
256 atomicAdd(&gm_xp[i], -tmp*inverseMassi);
257 atomicAdd(&gm_xp[j], tmp*inverseMassj);
261 * Correction for centripetal effects
263 for (int iter = 0; iter < numIterations; iter++)
265 // Make sure that all xp's are saved: atomic operation calls before are
266 // communicating current xp[..] values across thread block.
275 float3 dx = pbcDxAiuc(pbcAiuc, xi, xj);
277 float len2 = targetLength*targetLength;
278 float dlen2 = 2.0f*len2 - norm2(dx);
280 // TODO A little bit more effective but slightly less readable version of the below would be:
281 // float proj = sqrtReducedMass*(targetLength - (dlen2 > 0.0f ? 1.0f : 0.0f)*dlen2*rsqrt(dlen2));
285 proj = sqrtReducedMass*(targetLength - dlen2*rsqrt(dlen2));
289 proj = sqrtReducedMass*targetLength;
292 sm_rhs[threadIdx.x] = proj;
296 * Same matrix inversion as above is used for updated data
298 for (int rec = 0; rec < expansionOrder; rec++)
300 // Make sure that all elements of rhs are saved into shared memory
304 for (int n = 0; n < coupledConstraintsCount; n++)
306 int index = n*numConstraintsThreads + threadIndex;
307 int c1 = gm_coupledConstraintsIdxes[index];
309 mvb = mvb + gm_matrixA[index]*sm_rhs[c1 - blockIdx.x*blockDim.x + blockDim.x*(rec % 2)];
312 sm_rhs[threadIdx.x + blockDim.x*((rec + 1) % 2)] = mvb;
316 // Add corrections to Lagrange multipliers
317 float sqrtmu_sol = sqrtReducedMass*sol;
318 lagrangeScaled += sqrtmu_sol;
320 // Save updated coordinates for the next iteration
321 // Dummy constraints are skipped
324 float3 tmp = rc*sqrtmu_sol;
325 atomicAdd(&gm_xp[i], -tmp*inverseMassi);
326 atomicAdd(&gm_xp[j], tmp*inverseMassj);
330 // Updating particle velocities for all but dummy threads
331 if (updateVelocities && !isDummyThread)
333 float3 tmp = rc*invdt*lagrangeScaled;
334 atomicAdd(&gm_v[i], -tmp*inverseMassi);
335 atomicAdd(&gm_v[j], tmp*inverseMassj);
341 // Virial is computed from Lagrange multiplier (lagrangeScaled), target constrain length
342 // (targetLength) and the normalized vector connecting constrained atoms before
343 // the algorithm was applied (rc). The evaluation of virial in each thread is
344 // followed by basic reduction for the values inside single thread block.
345 // Then, the values are reduced across grid by atomicAdd(...).
347 // TODO Shuffle reduction.
348 // TODO Should be unified and/or done once when virial is actually needed.
349 // TODO Recursive version that removes atomicAdd(...)'s entirely is needed. Ideally,
350 // one that works for any datatype.
352 // Save virial for each thread into the shared memory. Tensor is symmetrical, hence only
353 // 6 values are saved. Dummy threads will have zeroes in their virial: targetLength,
354 // lagrangeScaled and rc are all set to zero for them in the beginning of the kernel.
355 // The sm_threadVirial[..] will overlap with the sm_r[..] and sm_rhs[..], but the latter
356 // two are no longer in use.
357 extern __shared__ float sm_threadVirial[];
358 float mult = targetLength*lagrangeScaled;
359 sm_threadVirial[0*blockDim.x + threadIdx.x] = mult*rc.x*rc.x;
360 sm_threadVirial[1*blockDim.x + threadIdx.x] = mult*rc.x*rc.y;
361 sm_threadVirial[2*blockDim.x + threadIdx.x] = mult*rc.x*rc.z;
362 sm_threadVirial[3*blockDim.x + threadIdx.x] = mult*rc.y*rc.y;
363 sm_threadVirial[4*blockDim.x + threadIdx.x] = mult*rc.y*rc.z;
364 sm_threadVirial[5*blockDim.x + threadIdx.x] = mult*rc.z*rc.z;
368 // Reduce up to one virial per thread block. All blocks are divided by half, the first
369 // half of threads sums two virials. Then the first half is divided by two and the first
370 // half of it sums two values. This procedure is repeated until only one thread is left.
371 // Only works if the threads per blocks is a power of two (hence static_assert
372 // in the beginning of the kernel).
373 for (int divideBy = 2; divideBy <= static_cast<int>(blockDim.x); divideBy *= 2)
375 int dividedAt = blockDim.x/divideBy;
376 if (static_cast<int>(threadIdx.x) < dividedAt)
378 for (int d = 0; d < 6; d++)
380 sm_threadVirial[d*blockDim.x + threadIdx.x] += sm_threadVirial[d*blockDim.x + (threadIdx.x + dividedAt)];
383 // Syncronize if not within one warp
384 if (dividedAt > warpSize/2)
389 // First 6 threads in the block add the results of 6 tensor components to the global memory address.
392 atomicAdd(&(gm_virialScaled[threadIdx.x]), sm_threadVirial[threadIdx.x*blockDim.x]);
399 /*! \brief Select templated kernel.
401 * Returns pointer to a CUDA kernel based on provided booleans.
403 * \param[in] updateVelocities If the velocities should be constrained.
404 * \param[in] computeVirial If virial should be updated.
406 * \return Pointer to CUDA kernel
408 inline auto getLincsKernelPtr(const bool updateVelocities,
409 const bool computeVirial)
412 auto kernelPtr = lincs_kernel<true, true>;
413 if (updateVelocities && computeVirial)
415 kernelPtr = lincs_kernel<true, true>;
417 else if (updateVelocities && !computeVirial)
419 kernelPtr = lincs_kernel<true, false>;
421 else if (!updateVelocities && computeVirial)
423 kernelPtr = lincs_kernel<false, true>;
425 else if (!updateVelocities && !computeVirial)
427 kernelPtr = lincs_kernel<false, false>;
432 /*! \brief Apply LINCS.
434 * Applies LINCS to coordinates and velocities, stored on GPU.
435 * Data at pointers xPrime and v (class fields) change in the GPU
436 * memory. The results are not automatically copied back to the CPU
437 * memory. Method uses this class data structures which should be
438 * updated when needed using update method.
440 * \param[in] updateVelocities If the velocities should be constrained.
441 * \param[in] invdt Reciprocal timestep (to scale Lagrange
442 * multipliers when velocities are updated)
443 * \param[in] computeVirial If virial should be updated.
444 * \param[in,out] virialScaled Scaled virial tensor to be updated.
446 void LincsCuda::Impl::apply(const bool updateVelocities,
448 const bool computeVirial,
451 ensureNoPendingCudaError("In CUDA version of LINCS");
455 // Fill with zeros so the values can be reduced to it
456 // Only 6 values are needed because virial is symmetrical
457 clearDeviceBufferAsync(&kernelParams_.d_virialScaled, 0, 6, stream_);
460 auto kernelPtr = getLincsKernelPtr(updateVelocities, computeVirial);
462 KernelLaunchConfig config;
463 config.blockSize[0] = c_threadsPerBlock;
464 config.blockSize[1] = 1;
465 config.blockSize[2] = 1;
466 config.gridSize[0] = (kernelParams_.numConstraintsThreads + c_threadsPerBlock - 1)/c_threadsPerBlock;
467 config.gridSize[1] = 1;
468 config.gridSize[2] = 1;
470 // Shared memory is used to store:
471 // -- Current coordinates (3 floats per thread)
472 // -- Right-hand-sides for matrix inversion (2 floats per thread)
473 // -- Virial tensor components (6 floats per thread)
474 // Since none of these three are needed simultaneously, they can be saved at the same shared memory address
475 // (i.e. correspondent arrays are intentionally overlapped in address space). Consequently, only
476 // max{3, 2, 6} = 6 floats per thread are needed in case virial is computed, or max{3, 2} = 3 if not.
479 config.sharedMemorySize = c_threadsPerBlock*6*sizeof(float);
483 config.sharedMemorySize = c_threadsPerBlock*3*sizeof(float);
485 config.stream = stream_;
487 const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config,
491 launchGpuKernel(kernelPtr, config, nullptr,
492 "lincs_kernel<updateVelocities, computeVirial>", kernelArgs);
496 // Copy LINCS virial data and add it to the common virial
497 copyFromDeviceBuffer(h_virialScaled_.data(), &kernelParams_.d_virialScaled,
499 stream_, GpuApiCallBehavior::Sync, nullptr);
501 // Mapping [XX, XY, XZ, YY, YZ, ZZ] internal format to a tensor object
502 virialScaled[XX][XX] += h_virialScaled_[0];
503 virialScaled[XX][YY] += h_virialScaled_[1];
504 virialScaled[XX][ZZ] += h_virialScaled_[2];
506 virialScaled[YY][XX] += h_virialScaled_[1];
507 virialScaled[YY][YY] += h_virialScaled_[3];
508 virialScaled[YY][ZZ] += h_virialScaled_[4];
510 virialScaled[ZZ][XX] += h_virialScaled_[2];
511 virialScaled[ZZ][YY] += h_virialScaled_[4];
512 virialScaled[ZZ][ZZ] += h_virialScaled_[5];
518 /*! \brief Create LINCS object
520 * \param[in] numAtoms Number of atoms that will be handles by LINCS.
521 * Used to compute the memory size in allocations and copy.
522 * \param[in] numIterations Number of iterations used to compute inverse matrix.
523 * \param[in] expansionOrder LINCS projection order for correcting the direction of constraint.
525 LincsCuda::Impl::Impl(int numAtoms,
528 : numAtoms_(numAtoms)
530 kernelParams_.numIterations = numIterations;
531 kernelParams_.expansionOrder = expansionOrder;
533 static_assert(sizeof(real) == sizeof(float),
534 "Real numbers should be in single precision in GPU code.");
535 static_assert(c_threadsPerBlock > 0 && ((c_threadsPerBlock & (c_threadsPerBlock - 1)) == 0),
536 "Number of threads per block should be a power of two in order for reduction to work.");
538 // This is temporary. LINCS should not manage coordinates.
539 allocateDeviceBuffer(&kernelParams_.d_x, numAtoms, nullptr);
540 allocateDeviceBuffer(&kernelParams_.d_xp, numAtoms, nullptr);
541 allocateDeviceBuffer(&kernelParams_.d_v, numAtoms, nullptr);
543 allocateDeviceBuffer(&kernelParams_.d_virialScaled, 6, nullptr);
544 h_virialScaled_.resize(6);
546 // The data arrays should be expanded/reallocated on first call of set() function.
547 maxConstraintsNumberSoFar_ = 0;
548 // Use default stream.
549 // TODO The stream should/can be assigned by the GPU schedule when the code will be integrated.
554 LincsCuda::Impl::~Impl()
558 /*! \brief Helper function to go through constraints recursively.
560 * For each constraint, counts the number of coupled constraints and stores the value in spaceNeeded array.
561 * This information is used to split the array of constraints between thread blocks on a GPU so there is no
562 * coupling between constraints from different thread blocks. After the 'spaceNeeded' array is filled, the
563 * value spaceNeeded[c] should be equal to the number of constraints that are coupled to 'c' and located
564 * after it in the constraints array.
566 * \param[in] a Atom index.
567 * \param[in,out] spaceNeeded Indicates if the constraint was already counted and stores
568 * the number of constraints (i) connected to it and (ii) located
569 * after it in memory. This array is filled by this recursive function.
570 * For a set of coupled constraints, only for the first one in this list
571 * the number of consecutive coupled constraints is needed: if there is
572 * not enough space for this set of constraints in the thread block,
573 * the group has to be moved to the next one.
574 * \param[in] atomAdjacencyList Stores information about connections between atoms.
576 inline int countCoupled(int a, std::vector<int> *spaceNeeded,
577 std::vector<std::vector<std::tuple<int, int, int> > > *atomsAdjacencyList)
582 for (unsigned i = 0; i < atomsAdjacencyList->at(a).size(); i++)
584 std::tie(a2, c2, sign) = atomsAdjacencyList->at(a).at(i);
585 if (spaceNeeded->at(c2) == -1)
587 spaceNeeded->at(c2) = 0; // To indicate we've been here
588 counted += 1 + countCoupled(a2, spaceNeeded, atomsAdjacencyList);
595 * Update data-structures (e.g. after NB search step).
597 * Updates the constraints data and copies it to the GPU. Should be
598 * called if the particles were sorted, redistributed between domains, etc.
599 * This version uses common data formats so it can be called from anywhere
600 * in the code. Does not recycle the data preparation routines from the CPU
601 * version. Works only with simple case when all the constraints in idef are
602 * are handled by a single GPU. Triangles are not handled as special case.
604 * Information about constraints is taken from:
605 * idef.il[F_CONSTR].iatoms --- type (T) of constraint and two atom indexes (i1, i2)
606 * idef.iparams[T].constr.dA --- target length for constraint of type T
607 * From t_mdatom, the code takes:
608 * md.invmass --- array of inverse square root of masses for each atom in the system.
610 * \param[in] idef Local topology data to get information on constraints from.
611 * \param[in] md Atoms data to get atom masses from.
613 void LincsCuda::Impl::set(const t_idef &idef,
617 // List of constrained atoms (CPU memory)
618 std::vector<int2> constraintsHost;
619 // Equilibrium distances for the constraints (CPU)
620 std::vector<float> constraintsTargetLengthsHost;
621 // Number of constraints, coupled with the current one (CPU)
622 std::vector<int> coupledConstraintsCountsHost;
623 // List of coupled with the current one (CPU)
624 std::vector<int> coupledConstraintsIndicesHost;
625 // Mass factors (CPU)
626 std::vector<float> massFactorsHost;
628 // List of constrained atoms in local topology
629 t_iatom *iatoms = idef.il[F_CONSTR].iatoms;
630 const int stride = NRAL(F_CONSTR) + 1;
631 const int numConstraints = idef.il[F_CONSTR].nr/stride;
632 // Constructing adjacency list --- usefull intermediate structure
633 std::vector<std::vector<std::tuple<int, int, int> > > atomsAdjacencyList(numAtoms_);
634 for (int c = 0; c < numConstraints; c++)
636 int a1 = iatoms[stride*c + 1];
637 int a2 = iatoms[stride*c + 2];
639 // Each constraint will be represented as a tuple, containing index of the second constrained atom,
640 // index of the constraint and a sign that indicates the order of atoms in which they are listed.
641 // Sign is needed to compute the mass factors.
642 atomsAdjacencyList.at(a1).push_back(std::make_tuple(a2, c, +1));
643 atomsAdjacencyList.at(a2).push_back(std::make_tuple(a1, c, -1));
646 // Compute, how many coupled constraints are in front of each constraint.
647 // Needed to introduce splits in data so that all coupled constraints will be computed in a single GPU block.
648 // The position 'c' of the vector spaceNeeded should have the number of constraints that are coupled to a constraint
649 // 'c' and are after 'c' in the vector. Only first index of the connected group of the constraints is needed later in the
650 // code, hence the spaceNeeded vector is also used to keep track if the constrain was already counted.
651 std::vector<int> spaceNeeded;
652 spaceNeeded.resize(numConstraints, -1);
653 std::fill(spaceNeeded.begin(), spaceNeeded.end(), -1);
654 for (int c = 0; c < numConstraints; c++)
656 int a1 = iatoms[stride*c + 1];
657 int a2 = iatoms[stride*c + 2];
658 if (spaceNeeded.at(c) == -1)
660 spaceNeeded.at(c) = countCoupled(a1, &spaceNeeded, &atomsAdjacencyList) +
661 countCoupled(a2, &spaceNeeded, &atomsAdjacencyList);
665 // Map of splits in the constraints data. For each 'old' constraint index gives 'new' which
666 // takes into account the empty spaces which might be needed in the end of each thread block.
667 std::vector<int> splitMap;
668 splitMap.resize(numConstraints, -1);
669 int currentMapIndex = 0;
670 for (int c = 0; c < numConstraints; c++)
672 // Check if coupled constraints all fit in one block
673 GMX_RELEASE_ASSERT(spaceNeeded.at(c) < c_threadsPerBlock, "Maximum number of coupled constraints exceedes the size of the CUDA thread block. "
674 "Most likely, you are trying to use GPU version of LINCS with constraints on all-bonds, "
675 "which is not supported. Try using H-bonds constraints only.");
676 if (currentMapIndex / c_threadsPerBlock != (currentMapIndex + spaceNeeded.at(c)) / c_threadsPerBlock)
678 currentMapIndex = ((currentMapIndex/c_threadsPerBlock) + 1) * c_threadsPerBlock;
680 splitMap.at(c) = currentMapIndex;
683 kernelParams_.numConstraintsThreads = currentMapIndex + c_threadsPerBlock - currentMapIndex % c_threadsPerBlock;
684 GMX_RELEASE_ASSERT(kernelParams_.numConstraintsThreads % c_threadsPerBlock == 0, "Number of threads should be a multiple of the block size");
686 // Initialize constraints and their target indexes taking into account the splits in the data arrays.
690 constraintsHost.resize(kernelParams_.numConstraintsThreads, pair);
691 std::fill(constraintsHost.begin(), constraintsHost.end(), pair);
692 constraintsTargetLengthsHost.resize(kernelParams_.numConstraintsThreads, 0.0);
693 std::fill(constraintsTargetLengthsHost.begin(), constraintsTargetLengthsHost.end(), 0.0);
694 for (int c = 0; c < numConstraints; c++)
696 int a1 = iatoms[stride*c + 1];
697 int a2 = iatoms[stride*c + 2];
698 int type = iatoms[stride*c];
703 constraintsHost.at(splitMap.at(c)) = pair;
704 constraintsTargetLengthsHost.at(splitMap.at(c)) = idef.iparams[type].constr.dA;
708 // The adjacency list of constraints (i.e. the list of coupled constraints for each constraint).
709 // We map a single thread to a single constraint, hence each thread 'c' will be using one element from
710 // coupledConstraintsCountsHost array, which is the number of constraints coupled to the constraint 'c'.
711 // The coupled constraints indexes are placed into the coupledConstraintsIndicesHost array. Latter is organized
712 // as a one-dimensional array to ensure good memory alignment. It is addressed as [c + i*numConstraintsThreads],
713 // where 'i' goes from zero to the number of constraints coupled to 'c'. 'numConstraintsThreads' is the width of
714 // the array --- a number, greater then total number of constraints, taking into account the splits in the
715 // constraints array due to the GPU block borders. This number can be adjusted to improve memory access pattern.
716 // Mass factors are saved in a similar data structure.
717 int maxCoupledConstraints = 0;
718 for (int c = 0; c < numConstraints; c++)
720 int a1 = iatoms[stride*c + 1];
721 int a2 = iatoms[stride*c + 2];
723 // Constraint 'c' is counted twice, but it should be excluded altogether. Hence '-2'.
724 int nCoupedConstraints = atomsAdjacencyList.at(a1).size() + atomsAdjacencyList.at(a2).size() - 2;
726 if (nCoupedConstraints > maxCoupledConstraints)
728 maxCoupledConstraints = nCoupedConstraints;
732 coupledConstraintsCountsHost.resize(kernelParams_.numConstraintsThreads, 0);
733 coupledConstraintsIndicesHost.resize(maxCoupledConstraints*kernelParams_.numConstraintsThreads, -1);
734 massFactorsHost.resize(maxCoupledConstraints*kernelParams_.numConstraintsThreads, -1);
736 for (int c1 = 0; c1 < numConstraints; c1++)
738 coupledConstraintsCountsHost.at(splitMap.at(c1)) = 0;
739 int c1a1 = iatoms[stride*c1 + 1];
740 int c1a2 = iatoms[stride*c1 + 2];
747 // Constraints, coupled trough the first atom.
749 for (unsigned j = 0; j < atomsAdjacencyList.at(c1a1).size(); j++)
752 std::tie(c2a2, c2, sign) = atomsAdjacencyList.at(c1a1).at(j);
756 int index = kernelParams_.numConstraintsThreads*coupledConstraintsCountsHost.at(splitMap.at(c1)) + splitMap.at(c1);
758 coupledConstraintsIndicesHost.at(index) = splitMap.at(c2);
762 float sqrtmu1 = 1.0/sqrt(md.invmass[c1a1] + md.invmass[c1a2]);
763 float sqrtmu2 = 1.0/sqrt(md.invmass[c2a1] + md.invmass[c2a2]);
765 massFactorsHost.at(index) = -sign*md.invmass[center]*sqrtmu1*sqrtmu2;
767 coupledConstraintsCountsHost.at(splitMap.at(c1))++;
772 // Constraints, coupled through the second atom.
774 for (unsigned j = 0; j < atomsAdjacencyList.at(c1a2).size(); j++)
777 std::tie(c2a2, c2, sign) = atomsAdjacencyList.at(c1a2).at(j);
781 int index = kernelParams_.numConstraintsThreads*coupledConstraintsCountsHost.at(splitMap.at(c1)) + splitMap.at(c1);
783 coupledConstraintsIndicesHost.at(index) = splitMap.at(c2);
787 float sqrtmu1 = 1.0/sqrt(md.invmass[c1a1] + md.invmass[c1a2]);
788 float sqrtmu2 = 1.0/sqrt(md.invmass[c2a1] + md.invmass[c2a2]);
790 massFactorsHost.at(index) = sign*md.invmass[center]*sqrtmu1*sqrtmu2;
792 coupledConstraintsCountsHost.at(splitMap.at(c1))++;
798 // (Re)allocate the memory, if the number of constraints has increased.
799 if (numConstraints > maxConstraintsNumberSoFar_)
801 // Free memory if it was allocated before (i.e. if not the first time here).
802 if (maxConstraintsNumberSoFar_ > 0)
804 freeDeviceBuffer(&kernelParams_.d_inverseMasses);
806 freeDeviceBuffer(&kernelParams_.d_constraints);
807 freeDeviceBuffer(&kernelParams_.d_constraintsTargetLengths);
809 freeDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts);
810 freeDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices);
811 freeDeviceBuffer(&kernelParams_.d_massFactors);
812 freeDeviceBuffer(&kernelParams_.d_matrixA);
815 maxConstraintsNumberSoFar_ = numConstraints;
817 allocateDeviceBuffer(&kernelParams_.d_inverseMasses, numAtoms_, nullptr);
819 allocateDeviceBuffer(&kernelParams_.d_constraints, kernelParams_.numConstraintsThreads, nullptr);
820 allocateDeviceBuffer(&kernelParams_.d_constraintsTargetLengths, kernelParams_.numConstraintsThreads, nullptr);
822 allocateDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts, kernelParams_.numConstraintsThreads, nullptr);
823 allocateDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices, maxCoupledConstraints*kernelParams_.numConstraintsThreads, nullptr);
824 allocateDeviceBuffer(&kernelParams_.d_massFactors, maxCoupledConstraints*kernelParams_.numConstraintsThreads, nullptr);
825 allocateDeviceBuffer(&kernelParams_.d_matrixA, maxCoupledConstraints*kernelParams_.numConstraintsThreads, nullptr);
830 copyToDeviceBuffer(&kernelParams_.d_constraints, constraintsHost.data(),
831 0, kernelParams_.numConstraintsThreads,
832 stream_, GpuApiCallBehavior::Sync, nullptr);
833 copyToDeviceBuffer(&kernelParams_.d_constraintsTargetLengths, constraintsTargetLengthsHost.data(),
834 0, kernelParams_.numConstraintsThreads,
835 stream_, GpuApiCallBehavior::Sync, nullptr);
836 copyToDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts, coupledConstraintsCountsHost.data(),
837 0, kernelParams_.numConstraintsThreads,
838 stream_, GpuApiCallBehavior::Sync, nullptr);
839 copyToDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices, coupledConstraintsIndicesHost.data(),
840 0, maxCoupledConstraints*kernelParams_.numConstraintsThreads,
841 stream_, GpuApiCallBehavior::Sync, nullptr);
842 copyToDeviceBuffer(&kernelParams_.d_massFactors, massFactorsHost.data(),
843 0, maxCoupledConstraints*kernelParams_.numConstraintsThreads,
844 stream_, GpuApiCallBehavior::Sync, nullptr);
846 GMX_RELEASE_ASSERT(md.invmass != nullptr, "Masses of attoms should be specified.\n");
847 copyToDeviceBuffer(&kernelParams_.d_inverseMasses, md.invmass,
849 stream_, GpuApiCallBehavior::Sync, nullptr);
856 * Converts pbc data from t_pbc into the PbcAiuc format and stores the latter.
858 * \param[in] pbc The PBC data in t_pbc format.
860 void LincsCuda::Impl::setPbc(const t_pbc *pbc)
862 setPbcAiuc(pbc->ndim_ePBC, pbc->box, &kernelParams_.pbcAiuc);
866 * Copy coordinates from provided CPU location to GPU.
868 * Copies the coordinates before the integration step (x) and coordinates
869 * after the integration step (xp) from the provided CPU location to GPU.
870 * The data are assumed to be in float3/fvec format (single precision).
872 * \param[in] h_x CPU pointer where coordinates should be copied from.
873 * \param[in] h_xp CPU pointer where coordinates should be copied from.
875 void LincsCuda::Impl::copyCoordinatesToGpu(const rvec *h_x, const rvec *h_xp)
877 copyToDeviceBuffer(&kernelParams_.d_x, (float3*)h_x, 0, numAtoms_, stream_, GpuApiCallBehavior::Sync, nullptr);
878 copyToDeviceBuffer(&kernelParams_.d_xp, (float3*)h_xp, 0, numAtoms_, stream_, GpuApiCallBehavior::Sync, nullptr);
882 * Copy velocities from provided CPU location to GPU.
884 * Nothing is done if the argument provided is a nullptr.
885 * The data are assumed to be in float3/fvec format (single precision).
887 * \param[in] h_v CPU pointer where velocities should be copied from.
889 void LincsCuda::Impl::copyVelocitiesToGpu(const rvec *h_v)
893 copyToDeviceBuffer(&kernelParams_.d_v, (float3*)h_v, 0, numAtoms_, stream_, GpuApiCallBehavior::Sync, nullptr);
898 * Copy coordinates from GPU to provided CPU location.
900 * Copies the constrained coordinates to the provided location. The coordinates
901 * are assumed to be in float3/fvec format (single precision).
903 * \param[out] h_xp CPU pointer where coordinates should be copied to.
905 void LincsCuda::Impl::copyCoordinatesFromGpu(rvec *h_xp)
907 copyFromDeviceBuffer((float3*)h_xp, &kernelParams_.d_xp, 0, numAtoms_, stream_, GpuApiCallBehavior::Sync, nullptr);
911 * Copy velocities from GPU to provided CPU location.
913 * The velocities are assumed to be in float3/fvec format (single precision).
915 * \param[in] h_v Pointer to velocities data.
917 void LincsCuda::Impl::copyVelocitiesFromGpu(rvec *h_v)
919 copyFromDeviceBuffer((float3*)h_v, &kernelParams_.d_v, 0, numAtoms_, stream_, GpuApiCallBehavior::Sync, nullptr);
923 * Set the internal GPU-memory x, xprime and v pointers.
925 * Data is not copied. The data are assumed to be in float3/fvec format
926 * (float3 is used internally, but the data layout should be identical).
928 * \param[in] d_x Pointer to the coordinates before integrator update (on GPU)
929 * \param[in] d_xp Pointer to the coordinates after integrator update, before update (on GPU)
930 * \param[in] d_v Pointer to the velocities before integrator update (on GPU)
932 void LincsCuda::Impl::setXVPointers(rvec *d_x, rvec *d_xp, rvec *d_v)
934 kernelParams_.d_x = (float3*)d_x;
935 kernelParams_.d_xp = (float3*)d_xp;
936 kernelParams_.d_v = (float3*)d_v;
940 LincsCuda::LincsCuda(const int numAtoms,
941 const int numIterations,
942 const int expansionOrder)
943 : impl_(new Impl(numAtoms, numIterations, expansionOrder))
947 LincsCuda::~LincsCuda() = default;
949 void LincsCuda::apply(const bool updateVelocities,
951 const bool computeVirial,
954 impl_->apply(updateVelocities,
960 void LincsCuda::setPbc(const t_pbc *pbc)
965 void LincsCuda::set(const t_idef &idef,
968 impl_->set(idef, md);
971 void LincsCuda::copyCoordinatesToGpu(const rvec *h_x, const rvec *h_xp)
973 impl_->copyCoordinatesToGpu(h_x, h_xp);
976 void LincsCuda::copyVelocitiesToGpu(const rvec *h_v)
978 impl_->copyVelocitiesToGpu(h_v);
981 void LincsCuda::copyCoordinatesFromGpu(rvec *h_xp)
983 impl_->copyCoordinatesFromGpu(h_xp);
986 void LincsCuda::copyVelocitiesFromGpu(rvec *h_v)
988 impl_->copyVelocitiesFromGpu(h_v);
991 void LincsCuda::setXVPointers(rvec *d_x, rvec *d_xp, rvec *d_v)
993 impl_->setXVPointers(d_x, d_xp, d_v);