src/gromacs/mdlib/lincs.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 /*! \internal \file
  38  * \brief Defines LINCS code.
  39  *
  40  * \author Berk Hess <hess@kth.se>
  41  * \author Mark Abraham <mark.j.abraham@gmail.com>
  42  * \ingroup module_mdlib
  43  */
  44 #include "gmxpre.h"
  45
  46 #include "lincs.h"
  47
  48 #include "config.h"
  49
  50 #include <cassert>
  51 #include <cmath>
  52 #include <cstdlib>
  53
  54 #include <algorithm>
  55 #include <vector>
  56
  57 #include "gromacs/domdec/domdec.h"
  58 #include "gromacs/domdec/domdec_struct.h"
  59 #include "gromacs/gmxlib/nrnb.h"
  60 #include "gromacs/math/functions.h"
  61 #include "gromacs/math/paddedvector.h"
  62 #include "gromacs/math/units.h"
  63 #include "gromacs/math/vec.h"
  64 #include "gromacs/mdlib/constr.h"
  65 #include "gromacs/mdlib/gmx_omp_nthreads.h"
  66 #include "gromacs/mdrunutility/multisim.h"
  67 #include "gromacs/mdtypes/commrec.h"
  68 #include "gromacs/mdtypes/inputrec.h"
  69 #include "gromacs/mdtypes/md_enums.h"
  70 #include "gromacs/mdtypes/mdatom.h"
  71 #include "gromacs/pbcutil/pbc.h"
  72 #include "gromacs/pbcutil/pbc_simd.h"
  73 #include "gromacs/simd/simd.h"
  74 #include "gromacs/simd/simd_math.h"
  75 #include "gromacs/simd/vector_operations.h"
  76 #include "gromacs/topology/block.h"
  77 #include "gromacs/topology/mtop_util.h"
  78 #include "gromacs/utility/alignedallocator.h"
  79 #include "gromacs/utility/arrayref.h"
  80 #include "gromacs/utility/basedefinitions.h"
  81 #include "gromacs/utility/bitmask.h"
  82 #include "gromacs/utility/cstringutil.h"
  83 #include "gromacs/utility/exceptions.h"
  84 #include "gromacs/utility/fatalerror.h"
  85 #include "gromacs/utility/gmxomp.h"
  86 #include "gromacs/utility/pleasecite.h"
  87
  88 using namespace gmx; // TODO: Remove when this file is moved into gmx namespace
  89
  90 namespace gmx
  91 {
  92
  93 //! Indices of the two atoms involved in a single constraint
  94 struct AtomPair
  95 {
  96     //! \brief Constructor, does not initialize to catch bugs and faster construction
  97     AtomPair()
  98     {
  99     }
 100
 101     //! Index of atom 1
 102     int index1;
 103     //! Index of atom 2
 104     int index2;
 105 };
 106
 107 //! Unit of work within LINCS.
 108 struct Task
 109 {
 110     //! First constraint for this task.
 111     int              b0 = 0;
 112     //! b1-1 is the last constraint for this task.
 113     int              b1 = 0;
 114     //! The number of constraints in triangles.
 115     int              ntriangle = 0;
 116     //! The list of triangle constraints.
 117     std::vector<int> triangle;
 118     //! The bits tell if the matrix element should be used.
 119     std::vector<int> tri_bits;
 120     //! Constraint index for updating atom data.
 121     std::vector<int> ind;
 122     //! Constraint index for updating atom data.
 123     std::vector<int> ind_r;
 124     //! Temporary variable for virial calculation.
 125     tensor           vir_r_m_dr = {{0}};
 126     //! Temporary variable for lambda derivative.
 127     real             dhdlambda;
 128 };
 129
 130 /*! \brief Data for LINCS algorithm.
 131  */
 132 class Lincs
 133 {
 134     public:
 135         //! The global number of constraints.
 136         int             ncg = 0;
 137         //! The global number of flexible constraints.
 138         int             ncg_flex = 0;
 139         //! The global number of constraints in triangles.
 140         int             ncg_triangle = 0;
 141         //! The number of iterations.
 142         int             nIter = 0;
 143         //! The order of the matrix expansion.
 144         int             nOrder = 0;
 145         //! The maximum number of constraints connected to a single atom.
 146         int             max_connect = 0;
 147
 148         //! The number of real constraints.
 149         int              nc_real = 0;
 150         //! The number of constraints including padding for SIMD.
 151         int              nc = 0;
 152         //! The number of constraint connections.
 153         int              ncc = 0;
 154         //! The FE lambda value used for filling blc and blmf.
 155         real             matlam = 0;
 156         //! mapping from topology to LINCS constraints.
 157         std::vector<int> con_index;
 158         //! The reference distance in topology A.
 159         std::vector < real, AlignedAllocator < real>> bllen0;
 160         //! The reference distance in top B - the r.d. in top A.
 161         std::vector < real, AlignedAllocator < real>> ddist;
 162         //! The atom pairs involved in the constraints.
 163         std::vector<AtomPair> atoms;
 164         //! 1/sqrt(invmass1  invmass2).
 165         std::vector < real, AlignedAllocator < real>> blc;
 166         //! As blc, but with all masses 1.
 167         std::vector < real, AlignedAllocator < real>> blc1;
 168         //! Index into blbnb and blmf.
 169         std::vector<int>  blnr;
 170         //! List of constraint connections.
 171         std::vector<int>  blbnb;
 172         //! The local number of constraints in triangles.
 173         int               ntriangle = 0;
 174         //! The number of constraint connections in triangles.
 175         int               ncc_triangle = 0;
 176         //! Communicate before each LINCS interation.
 177         bool              bCommIter = false;
 178         //! Matrix of mass factors for constraint connections.
 179         std::vector<real> blmf;
 180         //! As blmf, but with all masses 1.
 181         std::vector<real> blmf1;
 182         //! The reference bond length.
 183         std::vector < real, AlignedAllocator < real>> bllen;
 184         //! The local atom count per constraint, can be NULL.
 185         std::vector<int>  nlocat;
 186
 187         /*! \brief The number of tasks used for LINCS work.
 188          *
 189          * \todo This is mostly used to loop over \c task, which would
 190          * be nicer to do with range-based for loops, but the thread
 191          * index is used for constructing bit masks and organizing the
 192          * virial output buffer, so other things need to change,
 193          * first. */
 194         int                        ntask = 0;
 195         /*! \brief LINCS thread division */
 196         std::vector<Task>          task;
 197         //! Atom flags for thread parallelization.
 198         std::vector<gmx_bitmask_t> atf;
 199         //! Are the LINCS tasks interdependent?
 200         bool                       bTaskDep = false;
 201         //! Are there triangle constraints that cross task borders?
 202         bool                       bTaskDepTri = false;
 203         //! Arrays for temporary storage in the LINCS algorithm.
 204         /*! @{ */
 205         PaddedVector<gmx::RVec>                   tmpv;
 206         std::vector<real>                         tmpncc;
 207         std::vector < real, AlignedAllocator < real>> tmp1;
 208         std::vector < real, AlignedAllocator < real>> tmp2;
 209         std::vector < real, AlignedAllocator < real>> tmp3;
 210         std::vector < real, AlignedAllocator < real>> tmp4;
 211         /*! @} */
 212         //! The Lagrange multipliers times -1.
 213         std::vector < real, AlignedAllocator < real>> mlambda;
 214         //! Storage for the constraint RMS relative deviation output.
 215         std::array<real, 2> rmsdData = {{0}};
 216 };
 217
 218 /*! \brief Define simd_width for memory allocation used for SIMD code */
 219 #if GMX_SIMD_HAVE_REAL
 220 static const int simd_width = GMX_SIMD_REAL_WIDTH;
 221 #else
 222 static const int simd_width = 1;
 223 #endif
 224
 225 ArrayRef<real> lincs_rmsdData(Lincs *lincsd)
 226 {
 227     return lincsd->rmsdData;
 228 }
 229
 230 real lincs_rmsd(const Lincs *lincsd)
 231 {
 232     if (lincsd->rmsdData[0] > 0)
 233     {
 234         return std::sqrt(lincsd->rmsdData[1]/lincsd->rmsdData[0]);
 235     }
 236     else
 237     {
 238         return 0;
 239     }
 240 }
 241
 242 /*! \brief Do a set of nrec LINCS matrix multiplications.
 243  *
 244  * This function will return with up to date thread-local
 245  * constraint data, without an OpenMP barrier.
 246  */
 247 static void lincs_matrix_expand(const Lincs               &lincsd,
 248                                 const Task                &li_task,
 249                                 gmx::ArrayRef<const real>  blcc,
 250                                 gmx::ArrayRef<real>        rhs1,
 251                                 gmx::ArrayRef<real>        rhs2,
 252                                 gmx::ArrayRef<real>        sol)
 253 {
 254     gmx::ArrayRef<const int> blnr  = lincsd.blnr;
 255     gmx::ArrayRef<const int> blbnb = lincsd.blbnb;
 256
 257     const int                b0   = li_task.b0;
 258     const int                b1   = li_task.b1;
 259     const int                nrec = lincsd.nOrder;
 260
 261     for (int rec = 0; rec < nrec; rec++)
 262     {
 263         if (lincsd.bTaskDep)
 264         {
 265 #pragma omp barrier
 266         }
 267         for (int b = b0; b < b1; b++)
 268         {
 269             real mvb;
 270             int  n;
 271
 272             mvb = 0;
 273             for (n = blnr[b]; n < blnr[b+1]; n++)
 274             {
 275                 mvb = mvb + blcc[n]*rhs1[blbnb[n]];
 276             }
 277             rhs2[b] = mvb;
 278             sol[b]  = sol[b] + mvb;
 279         }
 280
 281         std::swap(rhs1, rhs2);
 282     }   /* nrec*(ncons+2*nrtot) flops */
 283
 284     if (lincsd.ntriangle > 0)
 285     {
 286         /* Perform an extra nrec recursions for only the constraints
 287          * involved in rigid triangles.
 288          * In this way their accuracy should come close to those of the other
 289          * constraints, since traingles of constraints can produce eigenvalues
 290          * around 0.7, while the effective eigenvalue for bond constraints
 291          * is around 0.4 (and 0.7*0.7=0.5).
 292          */
 293
 294         if (lincsd.bTaskDep)
 295         {
 296             /* We need a barrier here, since other threads might still be
 297              * reading the contents of rhs1 and/o rhs2.
 298              * We could avoid this barrier by introducing two extra rhs
 299              * arrays for the triangle constraints only.
 300              */
 301 #pragma omp barrier
 302         }
 303
 304         /* Constraints involved in a triangle are ensured to be in the same
 305          * LINCS task. This means no barriers are required during the extra
 306          * iterations for the triangle constraints.
 307          */
 308         gmx::ArrayRef<const int> triangle = li_task.triangle;
 309         gmx::ArrayRef<const int> tri_bits = li_task.tri_bits;
 310
 311         for (int rec = 0; rec < nrec; rec++)
 312         {
 313             for (int tb = 0; tb < li_task.ntriangle; tb++)
 314             {
 315                 int  b, bits, nr0, nr1, n;
 316                 real mvb;
 317
 318                 b    = triangle[tb];
 319                 bits = tri_bits[tb];
 320                 mvb  = 0;
 321                 nr0  = blnr[b];
 322                 nr1  = blnr[b+1];
 323                 for (n = nr0; n < nr1; n++)
 324                 {
 325                     if (bits & (1 << (n - nr0)))
 326                     {
 327                         mvb = mvb + blcc[n]*rhs1[blbnb[n]];
 328                     }
 329                 }
 330                 rhs2[b] = mvb;
 331                 sol[b]  = sol[b] + mvb;
 332             }
 333
 334             std::swap(rhs1, rhs2);
 335         }   /* nrec*(ntriangle + ncc_triangle*2) flops */
 336
 337         if (lincsd.bTaskDepTri)
 338         {
 339             /* The constraints triangles are decoupled from each other,
 340              * but constraints in one triangle cross thread task borders.
 341              * We could probably avoid this with more advanced setup code.
 342              */
 343 #pragma omp barrier
 344         }
 345     }
 346 }
 347
 348 //! Update atomic coordinates when an index is not required.
 349 static void
 350 lincs_update_atoms_noind(int                             ncons,
 351                          gmx::ArrayRef<const AtomPair>   atoms,
 352                          real                            preFactor,
 353                          gmx::ArrayRef<const real>       fac,
 354                          gmx::ArrayRef<const gmx::RVec>  r,
 355                          const real                     *invmass,
 356                          rvec                           *x)
 357 {
 358     if (invmass != nullptr)
 359     {
 360         for (int b = 0; b < ncons; b++)
 361         {
 362             int  i     = atoms[b].index1;
 363             int  j     = atoms[b].index2;
 364             real mvb   = preFactor*fac[b];
 365             real im1   = invmass[i];
 366             real im2   = invmass[j];
 367             real tmp0  = r[b][0]*mvb;
 368             real tmp1  = r[b][1]*mvb;
 369             real tmp2  = r[b][2]*mvb;
 370             x[i][0]   -= tmp0*im1;
 371             x[i][1]   -= tmp1*im1;
 372             x[i][2]   -= tmp2*im1;
 373             x[j][0]   += tmp0*im2;
 374             x[j][1]   += tmp1*im2;
 375             x[j][2]   += tmp2*im2;
 376         }   /* 16 ncons flops */
 377     }
 378     else
 379     {
 380         for (int b = 0; b < ncons; b++)
 381         {
 382             int  i     = atoms[b].index1;
 383             int  j     = atoms[b].index2;
 384             real mvb   = preFactor*fac[b];
 385             real tmp0  = r[b][0]*mvb;
 386             real tmp1  = r[b][1]*mvb;
 387             real tmp2  = r[b][2]*mvb;
 388             x[i][0]   -= tmp0;
 389             x[i][1]   -= tmp1;
 390             x[i][2]   -= tmp2;
 391             x[j][0]   += tmp0;
 392             x[j][1]   += tmp1;
 393             x[j][2]   += tmp2;
 394         }
 395     }
 396 }
 397
 398 //! Update atomic coordinates when an index is required.
 399 static void
 400 lincs_update_atoms_ind(gmx::ArrayRef<const int>        ind,
 401                        gmx::ArrayRef<const AtomPair>   atoms,
 402                        real                            preFactor,
 403                        gmx::ArrayRef<const real>       fac,
 404                        gmx::ArrayRef<const gmx::RVec>  r,
 405                        const real                     *invmass,
 406                        rvec                           *x)
 407 {
 408     if (invmass != nullptr)
 409     {
 410         for (int b : ind)
 411         {
 412             int  i     = atoms[b].index1;
 413             int  j     = atoms[b].index2;
 414             real mvb   = preFactor*fac[b];
 415             real im1   = invmass[i];
 416             real im2   = invmass[j];
 417             real tmp0  = r[b][0]*mvb;
 418             real tmp1  = r[b][1]*mvb;
 419             real tmp2  = r[b][2]*mvb;
 420             x[i][0]   -= tmp0*im1;
 421             x[i][1]   -= tmp1*im1;
 422             x[i][2]   -= tmp2*im1;
 423             x[j][0]   += tmp0*im2;
 424             x[j][1]   += tmp1*im2;
 425             x[j][2]   += tmp2*im2;
 426         }   /* 16 ncons flops */
 427     }
 428     else
 429     {
 430         for (int b : ind)
 431         {
 432             int  i     = atoms[b].index1;
 433             int  j     = atoms[b].index2;
 434             real mvb   = preFactor*fac[b];
 435             real tmp0  = r[b][0]*mvb;
 436             real tmp1  = r[b][1]*mvb;
 437             real tmp2  = r[b][2]*mvb;
 438             x[i][0]   -= tmp0;
 439             x[i][1]   -= tmp1;
 440             x[i][2]   -= tmp2;
 441             x[j][0]   += tmp0;
 442             x[j][1]   += tmp1;
 443             x[j][2]   += tmp2;
 444         }   /* 16 ncons flops */
 445     }
 446 }
 447
 448 //! Update coordinates for atoms.
 449 static void
 450 lincs_update_atoms(Lincs                          *li,
 451                    int                             th,
 452                    real                            preFactor,
 453                    gmx::ArrayRef<const real>       fac,
 454                    gmx::ArrayRef<const gmx::RVec>  r,
 455                    const real                     *invmass,
 456                    rvec                           *x)
 457 {
 458     if (li->ntask == 1)
 459     {
 460         /* Single thread, we simply update for all constraints */
 461         lincs_update_atoms_noind(li->nc_real,
 462                                  li->atoms, preFactor, fac, r, invmass, x);
 463     }
 464     else
 465     {
 466         /* Update the atom vector components for our thread local
 467          * constraints that only access our local atom range.
 468          * This can be done without a barrier.
 469          */
 470         lincs_update_atoms_ind(li->task[th].ind,
 471                                li->atoms, preFactor, fac, r, invmass, x);
 472
 473         if (!li->task[li->ntask].ind.empty())
 474         {
 475             /* Update the constraints that operate on atoms
 476              * in multiple thread atom blocks on the master thread.
 477              */
 478 #pragma omp barrier
 479 #pragma omp master
 480             {
 481                 lincs_update_atoms_ind(li->task[li->ntask].ind,
 482                                        li->atoms, preFactor, fac, r, invmass, x);
 483             }
 484         }
 485     }
 486 }
 487
 488 #if GMX_SIMD_HAVE_REAL
 489 //! Helper function so that we can run TSAN with SIMD support (where implemented).
 490 template <int align>
 491 static inline void gmx_simdcall
 492 gatherLoadUTransposeTSANSafe(const real         *base,
 493                              const std::int32_t *offset,
 494                              SimdReal           *v0,
 495                              SimdReal           *v1,
 496                              SimdReal           *v2)
 497 {
 498 #if (CMAKE_BUILD_TYPE == CMAKE_BUILD_TYPE_TSAN) && GMX_SIMD_X86_AVX2_256
 499     // This function is only implemented in this case
 500     gatherLoadUTransposeSafe<align>(base, offset, v0, v1, v2);
 501 #else
 502     gatherLoadUTranspose<align>(base, offset, v0, v1, v2);
 503 #endif
 504 }
 505
 506 /*! \brief Calculate the constraint distance vectors r to project on from x.
 507  *
 508  * Determine the right-hand side of the matrix equation using quantity f.
 509  * This function only differs from calc_dr_x_xp_simd below in that
 510  * no constraint length is subtracted and no PBC is used for f. */
 511 static void gmx_simdcall
 512 calc_dr_x_f_simd(int                            b0,
 513                  int                            b1,
 514                  gmx::ArrayRef<const AtomPair>  atoms,
 515                  const rvec * gmx_restrict      x,
 516                  const rvec * gmx_restrict      f,
 517                  const real * gmx_restrict      blc,
 518                  const real *                   pbc_simd,
 519                  rvec * gmx_restrict            r,
 520                  real * gmx_restrict            rhs,
 521                  real * gmx_restrict            sol)
 522 {
 523     assert(b0 % GMX_SIMD_REAL_WIDTH == 0);
 524
 525     alignas(GMX_SIMD_ALIGNMENT) std::int32_t offset2[GMX_SIMD_REAL_WIDTH];
 526
 527     for (int i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
 528     {
 529         offset2[i] = i;
 530     }
 531
 532     for (int bs = b0; bs < b1; bs += GMX_SIMD_REAL_WIDTH)
 533     {
 534         SimdReal x0_S, y0_S, z0_S;
 535         SimdReal x1_S, y1_S, z1_S;
 536         SimdReal rx_S, ry_S, rz_S, n2_S, il_S;
 537         SimdReal fx_S, fy_S, fz_S, ip_S, rhs_S;
 538         alignas(GMX_SIMD_ALIGNMENT) std::int32_t      offset0[GMX_SIMD_REAL_WIDTH];
 539         alignas(GMX_SIMD_ALIGNMENT) std::int32_t      offset1[GMX_SIMD_REAL_WIDTH];
 540
 541         for (int i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
 542         {
 543             offset0[i] = atoms[bs + i].index1;
 544             offset1[i] = atoms[bs + i].index2;
 545         }
 546
 547         gatherLoadUTransposeTSANSafe<3>(reinterpret_cast<const real *>(x), offset0, &x0_S, &y0_S, &z0_S);
 548         gatherLoadUTransposeTSANSafe<3>(reinterpret_cast<const real *>(x), offset1, &x1_S, &y1_S, &z1_S);
 549         rx_S = x0_S - x1_S;
 550         ry_S = y0_S - y1_S;
 551         rz_S = z0_S - z1_S;
 552
 553         pbc_correct_dx_simd(&rx_S, &ry_S, &rz_S, pbc_simd);
 554
 555         n2_S  = norm2(rx_S, ry_S, rz_S);
 556         il_S  = invsqrt(n2_S);
 557
 558         rx_S  = rx_S * il_S;
 559         ry_S  = ry_S * il_S;
 560         rz_S  = rz_S * il_S;
 561
 562         transposeScatterStoreU<3>(reinterpret_cast<real *>(r + bs), offset2, rx_S, ry_S, rz_S);
 563
 564         gatherLoadUTransposeTSANSafe<3>(reinterpret_cast<const real *>(f), offset0, &x0_S, &y0_S, &z0_S);
 565         gatherLoadUTransposeTSANSafe<3>(reinterpret_cast<const real *>(f), offset1, &x1_S, &y1_S, &z1_S);
 566         fx_S = x0_S - x1_S;
 567         fy_S = y0_S - y1_S;
 568         fz_S = z0_S - z1_S;
 569
 570         ip_S  = iprod(rx_S, ry_S, rz_S, fx_S, fy_S, fz_S);
 571
 572         rhs_S = load<SimdReal>(blc + bs) * ip_S;
 573
 574         store(rhs + bs, rhs_S);
 575         store(sol + bs, rhs_S);
 576     }
 577 }
 578 #endif // GMX_SIMD_HAVE_REAL
 579
 580 /*! \brief LINCS projection, works on derivatives of the coordinates. */
 581 static void do_lincsp(const rvec *x, rvec *f, rvec *fp, t_pbc *pbc,
 582                       Lincs *lincsd, int th,
 583                       real *invmass,
 584                       ConstraintVariable econq, bool bCalcDHDL,
 585                       bool bCalcVir, tensor rmdf)
 586 {
 587     const int                     b0 = lincsd->task[th].b0;
 588     const int                     b1 = lincsd->task[th].b1;
 589
 590     gmx::ArrayRef<const AtomPair> atoms = lincsd->atoms;
 591     gmx::ArrayRef<gmx::RVec>      r     = lincsd->tmpv;
 592     gmx::ArrayRef<const int>      blnr  = lincsd->blnr;
 593     gmx::ArrayRef<const int>      blbnb = lincsd->blbnb;
 594
 595     gmx::ArrayRef<const real>     blc;
 596     gmx::ArrayRef<const real>     blmf;
 597     if (econq != ConstraintVariable::Force)
 598     {
 599         /* Use mass-weighted parameters */
 600         blc  = lincsd->blc;
 601         blmf = lincsd->blmf;
 602     }
 603     else
 604     {
 605         /* Use non mass-weighted parameters */
 606         blc  = lincsd->blc1;
 607         blmf = lincsd->blmf1;
 608     }
 609     gmx::ArrayRef<real> blcc = lincsd->tmpncc;
 610     gmx::ArrayRef<real> rhs1 = lincsd->tmp1;
 611     gmx::ArrayRef<real> rhs2 = lincsd->tmp2;
 612     gmx::ArrayRef<real> sol  = lincsd->tmp3;
 613
 614 #if GMX_SIMD_HAVE_REAL
 615     /* This SIMD code does the same as the plain-C code after the #else.
 616      * The only difference is that we always call pbc code, as with SIMD
 617      * the overhead of pbc computation (when not needed) is small.
 618      */
 619     alignas(GMX_SIMD_ALIGNMENT) real    pbc_simd[9*GMX_SIMD_REAL_WIDTH];
 620
 621     /* Convert the pbc struct for SIMD */
 622     set_pbc_simd(pbc, pbc_simd);
 623
 624     /* Compute normalized x i-j vectors, store in r.
 625      * Compute the inner product of r and xp i-j and store in rhs1.
 626      */
 627     calc_dr_x_f_simd(b0, b1, atoms, x, f, blc.data(),
 628                      pbc_simd,
 629                      as_rvec_array(r.data()), rhs1.data(), sol.data());
 630
 631 #else   // GMX_SIMD_HAVE_REAL
 632
 633     /* Compute normalized i-j vectors */
 634     if (pbc)
 635     {
 636         for (int b = b0; b < b1; b++)
 637         {
 638             rvec dx;
 639
 640             pbc_dx_aiuc(pbc, x[atoms[b].index1], x[atoms[b].index2], dx);
 641             unitv(dx, r[b]);
 642         }
 643     }
 644     else
 645     {
 646         for (int b = b0; b < b1; b++)
 647         {
 648             rvec dx;
 649
 650             rvec_sub(x[atoms[b].index1], x[atoms[b].index2], dx);
 651             unitv(dx, r[b]);
 652         }   /* 16 ncons flops */
 653     }
 654
 655     for (int b = b0; b < b1; b++)
 656     {
 657         int  i   = atoms[b].index1;
 658         int  j   = atoms[b].index2;
 659         real mvb = blc[b]*(r[b][0]*(f[i][0] - f[j][0]) +
 660                            r[b][1]*(f[i][1] - f[j][1]) +
 661                            r[b][2]*(f[i][2] - f[j][2]));
 662         rhs1[b] = mvb;
 663         sol[b]  = mvb;
 664         /* 7 flops */
 665     }
 666
 667 #endif  // GMX_SIMD_HAVE_REAL
 668
 669     if (lincsd->bTaskDep)
 670     {
 671         /* We need a barrier, since the matrix construction below
 672          * can access entries in r of other threads.
 673          */
 674 #pragma omp barrier
 675     }
 676
 677     /* Construct the (sparse) LINCS matrix */
 678     for (int b = b0; b < b1; b++)
 679     {
 680         for (int n = blnr[b]; n < blnr[b+1]; n++)
 681         {
 682             blcc[n] = blmf[n]*::iprod(r[b], r[blbnb[n]]);
 683         }   /* 6 nr flops */
 684     }
 685     /* Together: 23*ncons + 6*nrtot flops */
 686
 687     lincs_matrix_expand(*lincsd, lincsd->task[th], blcc, rhs1, rhs2, sol);
 688     /* nrec*(ncons+2*nrtot) flops */
 689
 690     if (econq == ConstraintVariable::Deriv_FlexCon)
 691     {
 692         /* We only want to constraint the flexible constraints,
 693          * so we mask out the normal ones by setting sol to 0.
 694          */
 695         for (int b = b0; b < b1; b++)
 696         {
 697             if (!(lincsd->bllen0[b] == 0 && lincsd->ddist[b] == 0))
 698             {
 699                 sol[b] = 0;
 700             }
 701         }
 702     }
 703
 704     /* We multiply sol by blc, so we can use lincs_update_atoms for OpenMP */
 705     for (int b = b0; b < b1; b++)
 706     {
 707         sol[b] *= blc[b];
 708     }
 709
 710     /* When constraining forces, we should not use mass weighting,
 711      * so we pass invmass=NULL, which results in the use of 1 for all atoms.
 712      */
 713     lincs_update_atoms(lincsd, th, 1.0, sol, r,
 714                        (econq != ConstraintVariable::Force) ? invmass : nullptr, fp);
 715
 716     if (bCalcDHDL)
 717     {
 718         real dhdlambda = 0;
 719         for (int b = b0; b < b1; b++)
 720         {
 721             dhdlambda -= sol[b]*lincsd->ddist[b];
 722         }
 723
 724         lincsd->task[th].dhdlambda = dhdlambda;
 725     }
 726
 727     if (bCalcVir)
 728     {
 729         /* Constraint virial,
 730          * determines sum r_bond x delta f,
 731          * where delta f is the constraint correction
 732          * of the quantity that is being constrained.
 733          */
 734         for (int b = b0; b < b1; b++)
 735         {
 736             const real mvb = lincsd->bllen[b]*sol[b];
 737             for (int i = 0; i < DIM; i++)
 738             {
 739                 const real tmp1 = mvb*r[b][i];
 740                 for (int j = 0; j < DIM; j++)
 741                 {
 742                     rmdf[i][j] += tmp1*r[b][j];
 743                 }
 744             }
 745         }   /* 23 ncons flops */
 746     }
 747 }
 748
 749 #if GMX_SIMD_HAVE_REAL
 750
 751 /*! \brief Calculate the constraint distance vectors r to project on from x.
 752  *
 753  * Determine the right-hand side of the matrix equation using coordinates xp. */
 754 static void gmx_simdcall
 755 calc_dr_x_xp_simd(int                            b0,
 756                   int                            b1,
 757                   gmx::ArrayRef<const AtomPair>  atoms,
 758                   const rvec * gmx_restrict      x,
 759                   const rvec * gmx_restrict      xp,
 760                   const real * gmx_restrict      bllen,
 761                   const real * gmx_restrict      blc,
 762                   const real *                   pbc_simd,
 763                   rvec * gmx_restrict            r,
 764                   real * gmx_restrict            rhs,
 765                   real * gmx_restrict            sol)
 766 {
 767     assert(b0 % GMX_SIMD_REAL_WIDTH == 0);
 768     alignas(GMX_SIMD_ALIGNMENT) std::int32_t offset2[GMX_SIMD_REAL_WIDTH];
 769
 770     for (int i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
 771     {
 772         offset2[i] = i;
 773     }
 774
 775     for (int bs = b0; bs < b1; bs += GMX_SIMD_REAL_WIDTH)
 776     {
 777         SimdReal x0_S, y0_S, z0_S;
 778         SimdReal x1_S, y1_S, z1_S;
 779         SimdReal rx_S, ry_S, rz_S, n2_S, il_S;
 780         SimdReal rxp_S, ryp_S, rzp_S, ip_S, rhs_S;
 781         alignas(GMX_SIMD_ALIGNMENT) std::int32_t      offset0[GMX_SIMD_REAL_WIDTH];
 782         alignas(GMX_SIMD_ALIGNMENT) std::int32_t      offset1[GMX_SIMD_REAL_WIDTH];
 783
 784         for (int i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
 785         {
 786             offset0[i] = atoms[bs + i].index1;
 787             offset1[i] = atoms[bs + i].index2;
 788         }
 789
 790         gatherLoadUTransposeTSANSafe<3>(reinterpret_cast<const real *>(x), offset0, &x0_S, &y0_S, &z0_S);
 791         gatherLoadUTransposeTSANSafe<3>(reinterpret_cast<const real *>(x), offset1, &x1_S, &y1_S, &z1_S);
 792         rx_S = x0_S - x1_S;
 793         ry_S = y0_S - y1_S;
 794         rz_S = z0_S - z1_S;
 795
 796         pbc_correct_dx_simd(&rx_S, &ry_S, &rz_S, pbc_simd);
 797
 798         n2_S  = norm2(rx_S, ry_S, rz_S);
 799         il_S  = invsqrt(n2_S);
 800
 801         rx_S  = rx_S * il_S;
 802         ry_S  = ry_S * il_S;
 803         rz_S  = rz_S * il_S;
 804
 805         transposeScatterStoreU<3>(reinterpret_cast<real *>(r + bs), offset2, rx_S, ry_S, rz_S);
 806
 807         gatherLoadUTransposeTSANSafe<3>(reinterpret_cast<const real *>(xp), offset0, &x0_S, &y0_S, &z0_S);
 808         gatherLoadUTransposeTSANSafe<3>(reinterpret_cast<const real *>(xp), offset1, &x1_S, &y1_S, &z1_S);
 809
 810         rxp_S = x0_S - x1_S;
 811         ryp_S = y0_S - y1_S;
 812         rzp_S = z0_S - z1_S;
 813
 814         pbc_correct_dx_simd(&rxp_S, &ryp_S, &rzp_S, pbc_simd);
 815
 816         ip_S  = iprod(rx_S, ry_S, rz_S, rxp_S, ryp_S, rzp_S);
 817
 818         rhs_S = load<SimdReal>(blc + bs) * (ip_S - load<SimdReal>(bllen + bs));
 819
 820         store(rhs + bs, rhs_S);
 821         store(sol + bs, rhs_S);
 822     }
 823 }
 824 #endif // GMX_SIMD_HAVE_REAL
 825
 826 /*! \brief Determine the distances and right-hand side for the next iteration. */
 827 gmx_unused static void calc_dist_iter(
 828         int                           b0,
 829         int                           b1,
 830         gmx::ArrayRef<const AtomPair> atoms,
 831         const rvec * gmx_restrict     xp,
 832         const real * gmx_restrict     bllen,
 833         const real * gmx_restrict     blc,
 834         const t_pbc *                 pbc,
 835         real                          wfac,
 836         real * gmx_restrict           rhs,
 837         real * gmx_restrict           sol,
 838         bool *                        bWarn)
 839 {
 840     for (int b = b0; b < b1; b++)
 841     {
 842         real len = bllen[b];
 843         rvec dx;
 844         if (pbc)
 845         {
 846             pbc_dx_aiuc(pbc, xp[atoms[b].index1], xp[atoms[b].index2], dx);
 847         }
 848         else
 849         {
 850             rvec_sub(xp[atoms[b].index1], xp[atoms[b].index2], dx);
 851         }
 852         real len2  = len*len;
 853         real dlen2 = 2*len2 - ::norm2(dx);
 854         if (dlen2 < wfac*len2)
 855         {
 856             /* not race free - see detailed comment in caller */
 857             *bWarn = TRUE;
 858         }
 859         real mvb;
 860         if (dlen2 > 0)
 861         {
 862             mvb = blc[b]*(len - dlen2*gmx::invsqrt(dlen2));
 863         }
 864         else
 865         {
 866             mvb = blc[b]*len;
 867         }
 868         rhs[b]  = mvb;
 869         sol[b]  = mvb;
 870     }   /* 20*ncons flops */
 871 }
 872
 873 #if GMX_SIMD_HAVE_REAL
 874 /*! \brief As calc_dist_iter(), but using SIMD intrinsics. */
 875 static void gmx_simdcall
 876 calc_dist_iter_simd(int                           b0,
 877                     int                           b1,
 878                     gmx::ArrayRef<const AtomPair> atoms,
 879                     const rvec * gmx_restrict     x,
 880                     const real * gmx_restrict     bllen,
 881                     const real * gmx_restrict     blc,
 882                     const real *                  pbc_simd,
 883                     real                          wfac,
 884                     real * gmx_restrict           rhs,
 885                     real * gmx_restrict           sol,
 886                     bool *                        bWarn)
 887 {
 888     SimdReal        min_S(GMX_REAL_MIN);
 889     SimdReal        two_S(2.0);
 890     SimdReal        wfac_S(wfac);
 891     SimdBool        warn_S;
 892
 893     assert(b0 % GMX_SIMD_REAL_WIDTH == 0);
 894
 895     /* Initialize all to FALSE */
 896     warn_S = (two_S < setZero());
 897
 898     for (int bs = b0; bs < b1; bs += GMX_SIMD_REAL_WIDTH)
 899     {
 900         SimdReal x0_S, y0_S, z0_S;
 901         SimdReal x1_S, y1_S, z1_S;
 902         SimdReal rx_S, ry_S, rz_S, n2_S;
 903         SimdReal len_S, len2_S, dlen2_S, lc_S, blc_S;
 904         alignas(GMX_SIMD_ALIGNMENT) std::int32_t      offset0[GMX_SIMD_REAL_WIDTH];
 905         alignas(GMX_SIMD_ALIGNMENT) std::int32_t      offset1[GMX_SIMD_REAL_WIDTH];
 906
 907         for (int i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
 908         {
 909             offset0[i] = atoms[bs + i].index1;
 910             offset1[i] = atoms[bs + i].index2;
 911         }
 912
 913         gatherLoadUTransposeTSANSafe<3>(reinterpret_cast<const real *>(x), offset0, &x0_S, &y0_S, &z0_S);
 914         gatherLoadUTransposeTSANSafe<3>(reinterpret_cast<const real *>(x), offset1, &x1_S, &y1_S, &z1_S);
 915
 916         rx_S = x0_S - x1_S;
 917         ry_S = y0_S - y1_S;
 918         rz_S = z0_S - z1_S;
 919
 920         pbc_correct_dx_simd(&rx_S, &ry_S, &rz_S, pbc_simd);
 921
 922         n2_S    = norm2(rx_S, ry_S, rz_S);
 923
 924         len_S   = load<SimdReal>(bllen + bs);
 925         len2_S  = len_S * len_S;
 926
 927         dlen2_S = fms(two_S, len2_S, n2_S);
 928
 929         warn_S  = warn_S || (dlen2_S < (wfac_S * len2_S));
 930
 931         /* Avoid 1/0 by taking the max with REAL_MIN.
 932          * Note: when dlen2 is close to zero (90 degree constraint rotation),
 933          * the accuracy of the algorithm is no longer relevant.
 934          */
 935         dlen2_S = max(dlen2_S, min_S);
 936
 937         lc_S    = fnma(dlen2_S, invsqrt(dlen2_S), len_S);
 938
 939         blc_S   = load<SimdReal>(blc + bs);
 940
 941         lc_S    = blc_S * lc_S;
 942
 943         store(rhs + bs, lc_S);
 944         store(sol + bs, lc_S);
 945     }
 946
 947     if (anyTrue(warn_S))
 948     {
 949         *bWarn = TRUE;
 950     }
 951 }
 952 #endif // GMX_SIMD_HAVE_REAL
 953
 954 //! Implements LINCS constraining.
 955 static void do_lincs(const rvec *x, rvec *xp, const matrix box, t_pbc *pbc,
 956                      Lincs *lincsd, int th,
 957                      const real *invmass,
 958                      const t_commrec *cr,
 959                      bool bCalcDHDL,
 960                      real wangle, bool *bWarn,
 961                      real invdt, rvec * gmx_restrict v,
 962                      bool bCalcVir, tensor vir_r_m_dr)
 963 {
 964     const int                     b0 = lincsd->task[th].b0;
 965     const int                     b1 = lincsd->task[th].b1;
 966
 967     gmx::ArrayRef<const AtomPair> atoms   = lincsd->atoms;
 968     gmx::ArrayRef<gmx::RVec>      r       = lincsd->tmpv;
 969     gmx::ArrayRef<const int>      blnr    = lincsd->blnr;
 970     gmx::ArrayRef<const int>      blbnb   = lincsd->blbnb;
 971     gmx::ArrayRef<const real>     blc     = lincsd->blc;
 972     gmx::ArrayRef<const real>     blmf    = lincsd->blmf;
 973     gmx::ArrayRef<const real>     bllen   = lincsd->bllen;
 974     gmx::ArrayRef<real>           blcc    = lincsd->tmpncc;
 975     gmx::ArrayRef<real>           rhs1    = lincsd->tmp1;
 976     gmx::ArrayRef<real>           rhs2    = lincsd->tmp2;
 977     gmx::ArrayRef<real>           sol     = lincsd->tmp3;
 978     gmx::ArrayRef<real>           blc_sol = lincsd->tmp4;
 979     gmx::ArrayRef<real>           mlambda = lincsd->mlambda;
 980     gmx::ArrayRef<const int>      nlocat  = lincsd->nlocat;
 981
 982 #if GMX_SIMD_HAVE_REAL
 983
 984     /* This SIMD code does the same as the plain-C code after the #else.
 985      * The only difference is that we always call pbc code, as with SIMD
 986      * the overhead of pbc computation (when not needed) is small.
 987      */
 988     alignas(GMX_SIMD_ALIGNMENT) real    pbc_simd[9*GMX_SIMD_REAL_WIDTH];
 989
 990     /* Convert the pbc struct for SIMD */
 991     set_pbc_simd(pbc, pbc_simd);
 992
 993     /* Compute normalized x i-j vectors, store in r.
 994      * Compute the inner product of r and xp i-j and store in rhs1.
 995      */
 996     calc_dr_x_xp_simd(b0, b1, atoms, x, xp, bllen.data(), blc.data(),
 997                       pbc_simd,
 998                       as_rvec_array(r.data()), rhs1.data(), sol.data());
 999
1000 #else   // GMX_SIMD_HAVE_REAL
1001
1002     if (pbc)
1003     {
1004         /* Compute normalized i-j vectors */
1005         for (int b = b0; b < b1; b++)
1006         {
1007             rvec dx;
1008             pbc_dx_aiuc(pbc, x[atoms[b].index1], x[atoms[b].index2], dx);
1009             unitv(dx, r[b]);
1010
1011             pbc_dx_aiuc(pbc, xp[atoms[b].index1], xp[atoms[b].index2], dx);
1012             real mvb  = blc[b]*(::iprod(r[b], dx) - bllen[b]);
1013             rhs1[b]   = mvb;
1014             sol[b]    = mvb;
1015         }
1016     }
1017     else
1018     {
1019         /* Compute normalized i-j vectors */
1020         for (int b = b0; b < b1; b++)
1021         {
1022             int  i    = atoms[b].index1;
1023             int  j    = atoms[b].index2;
1024             real tmp0 = x[i][0] - x[j][0];
1025             real tmp1 = x[i][1] - x[j][1];
1026             real tmp2 = x[i][2] - x[j][2];
1027             real rlen = gmx::invsqrt(tmp0*tmp0 + tmp1*tmp1 + tmp2*tmp2);
1028             r[b][0]   = rlen*tmp0;
1029             r[b][1]   = rlen*tmp1;
1030             r[b][2]   = rlen*tmp2;
1031             /* 16 ncons flops */
1032
1033             real mvb  = blc[b]*(r[b][0]*(xp[i][0] - xp[j][0]) +
1034                                 r[b][1]*(xp[i][1] - xp[j][1]) +
1035                                 r[b][2]*(xp[i][2] - xp[j][2]) - bllen[b]);
1036             rhs1[b]   = mvb;
1037             sol[b]    = mvb;
1038             /* 10 flops */
1039         }
1040         /* Together: 26*ncons + 6*nrtot flops */
1041     }
1042
1043 #endif  // GMX_SIMD_HAVE_REAL
1044
1045     if (lincsd->bTaskDep)
1046     {
1047         /* We need a barrier, since the matrix construction below
1048          * can access entries in r of other threads.
1049          */
1050 #pragma omp barrier
1051     }
1052
1053     /* Construct the (sparse) LINCS matrix */
1054     for (int b = b0; b < b1; b++)
1055     {
1056         for (int n = blnr[b]; n < blnr[b+1]; n++)
1057         {
1058             blcc[n] = blmf[n]*gmx::dot(r[b], r[blbnb[n]]);
1059         }
1060     }
1061     /* Together: 26*ncons + 6*nrtot flops */
1062
1063     lincs_matrix_expand(*lincsd, lincsd->task[th], blcc, rhs1, rhs2, sol);
1064     /* nrec*(ncons+2*nrtot) flops */
1065
1066 #if GMX_SIMD_HAVE_REAL
1067     for (int b = b0; b < b1; b += GMX_SIMD_REAL_WIDTH)
1068     {
1069         SimdReal t1 = load<SimdReal>(blc.data() + b);
1070         SimdReal t2 = load<SimdReal>(sol.data() + b);
1071         store(mlambda.data() + b, t1 * t2);
1072     }
1073 #else
1074     for (int b = b0; b < b1; b++)
1075     {
1076         mlambda[b] = blc[b]*sol[b];
1077     }
1078 #endif  // GMX_SIMD_HAVE_REAL
1079
1080     /* Update the coordinates */
1081     lincs_update_atoms(lincsd, th, 1.0, mlambda, r, invmass, xp);
1082
1083     /*
1084      ********  Correction for centripetal effects  ********
1085      */
1086
1087     real wfac;
1088
1089     wfac = std::cos(DEG2RAD*wangle);
1090     wfac = wfac*wfac;
1091
1092     for (int iter = 0; iter < lincsd->nIter; iter++)
1093     {
1094         if ((lincsd->bCommIter && DOMAINDECOMP(cr) && cr->dd->constraints))
1095         {
1096 #pragma omp barrier
1097 #pragma omp master
1098             {
1099                 /* Communicate the corrected non-local coordinates */
1100                 if (DOMAINDECOMP(cr))
1101                 {
1102                     dd_move_x_constraints(cr->dd, box, xp, nullptr, FALSE);
1103                 }
1104             }
1105 #pragma omp barrier
1106         }
1107         else if (lincsd->bTaskDep)
1108         {
1109 #pragma omp barrier
1110         }
1111
1112 #if GMX_SIMD_HAVE_REAL
1113         calc_dist_iter_simd(b0, b1, atoms,
1114                             xp, bllen.data(), blc.data(), pbc_simd, wfac,
1115                             rhs1.data(), sol.data(), bWarn);
1116 #else
1117         calc_dist_iter(b0, b1, atoms, xp,
1118                        bllen.data(), blc.data(), pbc, wfac,
1119                        rhs1.data(), sol.data(), bWarn);
1120         /* 20*ncons flops */
1121 #endif      // GMX_SIMD_HAVE_REAL
1122
1123         lincs_matrix_expand(*lincsd, lincsd->task[th], blcc, rhs1, rhs2, sol);
1124         /* nrec*(ncons+2*nrtot) flops */
1125
1126 #if GMX_SIMD_HAVE_REAL
1127         for (int b = b0; b < b1; b += GMX_SIMD_REAL_WIDTH)
1128         {
1129             SimdReal t1  = load<SimdReal>(blc.data() + b);
1130             SimdReal t2  = load<SimdReal>(sol.data() + b);
1131             SimdReal mvb = t1 * t2;
1132             store(blc_sol.data() + b, mvb);
1133             store(mlambda.data() + b, load<SimdReal>(mlambda.data() + b) + mvb);
1134         }
1135 #else
1136         for (int b = b0; b < b1; b++)
1137         {
1138             real mvb    = blc[b]*sol[b];
1139             blc_sol[b]  = mvb;
1140             mlambda[b] += mvb;
1141         }
1142 #endif      // GMX_SIMD_HAVE_REAL
1143
1144         /* Update the coordinates */
1145         lincs_update_atoms(lincsd, th, 1.0, blc_sol, r, invmass, xp);
1146     }
1147     /* nit*ncons*(37+9*nrec) flops */
1148
1149     if (v != nullptr)
1150     {
1151         /* Update the velocities */
1152         lincs_update_atoms(lincsd, th, invdt, mlambda, r, invmass, v);
1153         /* 16 ncons flops */
1154     }
1155
1156     if (!nlocat.empty() && (bCalcDHDL || bCalcVir))
1157     {
1158         if (lincsd->bTaskDep)
1159         {
1160             /* In lincs_update_atoms threads might cross-read mlambda */
1161 #pragma omp barrier
1162         }
1163
1164         /* Only account for local atoms */
1165         for (int b = b0; b < b1; b++)
1166         {
1167             mlambda[b] *= 0.5*nlocat[b];
1168         }
1169     }
1170
1171     if (bCalcDHDL)
1172     {
1173         real dhdl = 0;
1174         for (int b = b0; b < b1; b++)
1175         {
1176             /* Note that this this is dhdl*dt^2, the dt^2 factor is corrected
1177              * later after the contributions are reduced over the threads.
1178              */
1179             dhdl -= lincsd->mlambda[b]*lincsd->ddist[b];
1180         }
1181         lincsd->task[th].dhdlambda = dhdl;
1182     }
1183
1184     if (bCalcVir)
1185     {
1186         /* Constraint virial */
1187         for (int b = b0; b < b1; b++)
1188         {
1189             real tmp0 = -bllen[b]*mlambda[b];
1190             for (int i = 0; i < DIM; i++)
1191             {
1192                 real tmp1 = tmp0*r[b][i];
1193                 for (int j = 0; j < DIM; j++)
1194                 {
1195                     vir_r_m_dr[i][j] -= tmp1*r[b][j];
1196                 }
1197             }
1198         }   /* 22 ncons flops */
1199     }
1200
1201     /* Total:
1202      * 26*ncons + 6*nrtot + nrec*(ncons+2*nrtot)
1203      * + nit * (20*ncons + nrec*(ncons+2*nrtot) + 17 ncons)
1204      *
1205      * (26+nrec)*ncons + (6+2*nrec)*nrtot
1206      * + nit * ((37+nrec)*ncons + 2*nrec*nrtot)
1207      * if nit=1
1208      * (63+nrec)*ncons + (6+4*nrec)*nrtot
1209      */
1210 }
1211
1212 /*! \brief Sets the elements in the LINCS matrix for task task. */
1213 static void set_lincs_matrix_task(Lincs                *li,
1214                                   Task                 *li_task,
1215                                   const real           *invmass,
1216                                   int                  *ncc_triangle,
1217                                   int                  *nCrossTaskTriangles)
1218 {
1219     /* Construct the coupling coefficient matrix blmf */
1220     li_task->ntriangle   = 0;
1221     *ncc_triangle        = 0;
1222     *nCrossTaskTriangles = 0;
1223     for (int i = li_task->b0; i < li_task->b1; i++)
1224     {
1225         const int a1 = li->atoms[i].index1;
1226         const int a2 = li->atoms[i].index2;
1227         for (int n = li->blnr[i]; n < li->blnr[i + 1]; n++)
1228         {
1229             const int k = li->blbnb[n];
1230
1231             /* If we are using multiple, independent tasks for LINCS,
1232              * the calls to check_assign_connected should have
1233              * put all connected constraints in our task.
1234              */
1235             assert(li->bTaskDep || (k >= li_task->b0 && k < li_task->b1));
1236
1237             int sign;
1238             if (a1 == li->atoms[k].index1 || a2 == li->atoms[k].index2)
1239             {
1240                 sign = -1;
1241             }
1242             else
1243             {
1244                 sign = 1;
1245             }
1246             int center;
1247             int end;
1248             if (a1 == li->atoms[k].index1 || a1 == li->atoms[k].index2)
1249             {
1250                 center = a1;
1251                 end    = a2;
1252             }
1253             else
1254             {
1255                 center = a2;
1256                 end    = a1;
1257             }
1258             li->blmf[n]  = sign*invmass[center]*li->blc[i]*li->blc[k];
1259             li->blmf1[n] = sign*0.5;
1260             if (li->ncg_triangle > 0)
1261             {
1262                 /* Look for constraint triangles */
1263                 for (int nk = li->blnr[k]; nk < li->blnr[k + 1]; nk++)
1264                 {
1265                     const int kk = li->blbnb[nk];
1266                     if (kk != i && kk != k &&
1267                         (li->atoms[kk].index1 == end ||
1268                          li->atoms[kk].index2 == end))
1269                     {
1270                         /* Check if the constraints in this triangle actually
1271                          * belong to a different task. We still assign them
1272                          * here, since it's convenient for the triangle
1273                          * iterations, but we then need an extra barrier.
1274                          */
1275                         if (k  < li_task->b0 || k  >= li_task->b1 ||
1276                             kk < li_task->b0 || kk >= li_task->b1)
1277                         {
1278                             (*nCrossTaskTriangles)++;
1279                         }
1280
1281                         if (li_task->ntriangle == 0 ||
1282                             li_task->triangle[li_task->ntriangle - 1] < i)
1283                         {
1284                             /* Add this constraint to the triangle list */
1285                             li_task->triangle[li_task->ntriangle] = i;
1286                             li_task->tri_bits[li_task->ntriangle] = 0;
1287                             li_task->ntriangle++;
1288                             if (li->blnr[i+1] - li->blnr[i] > static_cast<int>(sizeof(li_task->tri_bits[0])*8 - 1))
1289                             {
1290                                 gmx_fatal(FARGS, "A constraint is connected to %d constraints, this is more than the %zu allowed for constraints participating in triangles",
1291                                           li->blnr[i+1] - li->blnr[i],
1292                                           sizeof(li_task->tri_bits[0])*8-1);
1293                             }
1294                         }
1295                         li_task->tri_bits[li_task->ntriangle-1] |= (1 << (n - li->blnr[i]));
1296                         (*ncc_triangle)++;
1297                     }
1298                 }
1299             }
1300         }
1301     }
1302 }
1303
1304 /*! \brief Sets the elements in the LINCS matrix. */
1305 static void set_lincs_matrix(Lincs *li, real *invmass, real lambda)
1306 {
1307     const real invsqrt2 = 0.7071067811865475244;
1308
1309     for (int i = 0; (i < li->nc); i++)
1310     {
1311         const int a1 = li->atoms[i].index1;
1312         const int a2 = li->atoms[i].index2;
1313         li->blc[i]   = gmx::invsqrt(invmass[a1] + invmass[a2]);
1314         li->blc1[i]  = invsqrt2;
1315     }
1316
1317     /* Construct the coupling coefficient matrix blmf */
1318     int ntriangle = 0, ncc_triangle = 0, nCrossTaskTriangles = 0;
1319 #pragma omp parallel for reduction(+: ntriangle, ncc_triangle, nCrossTaskTriangles) num_threads(li->ntask) schedule(static)
1320     for (int th = 0; th < li->ntask; th++)
1321     {
1322         try
1323         {
1324             set_lincs_matrix_task(li, &li->task[th], invmass,
1325                                   &ncc_triangle, &nCrossTaskTriangles);
1326             ntriangle += li->task[th].ntriangle;
1327         }
1328         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1329     }
1330     li->ntriangle    = ntriangle;
1331     li->ncc_triangle = ncc_triangle;
1332     li->bTaskDepTri  = (nCrossTaskTriangles > 0);
1333
1334     if (debug)
1335     {
1336         fprintf(debug, "The %d constraints participate in %d triangles\n",
1337                 li->nc, li->ntriangle);
1338         fprintf(debug, "There are %d constraint couplings, of which %d in triangles\n",
1339                 li->ncc, li->ncc_triangle);
1340         if (li->ntriangle > 0 && li->ntask > 1)
1341         {
1342             fprintf(debug, "%d constraint triangles contain constraints assigned to different tasks\n",
1343                     nCrossTaskTriangles);
1344         }
1345     }
1346
1347     /* Set matlam,
1348      * so we know with which lambda value the masses have been set.
1349      */
1350     li->matlam = lambda;
1351 }
1352
1353 //! Finds all triangles of atoms that share constraints to a central atom.
1354 static int count_triangle_constraints(const InteractionLists &ilist,
1355                                       const t_blocka         &at2con)
1356 {
1357     int      ncon1, ncon_tot;
1358     int      c0, n1, c1, ac1, n2, c2;
1359     int      ncon_triangle;
1360
1361     ncon1    = ilist[F_CONSTR].size()/3;
1362     ncon_tot = ncon1 + ilist[F_CONSTRNC].size()/3;
1363
1364     gmx::ArrayRef<const int> ia1 = ilist[F_CONSTR].iatoms;
1365     gmx::ArrayRef<const int> ia2 = ilist[F_CONSTRNC].iatoms;
1366
1367     ncon_triangle = 0;
1368     for (c0 = 0; c0 < ncon_tot; c0++)
1369     {
1370         bool       bTriangle = FALSE;
1371         const int *iap       = constr_iatomptr(ia1, ia2, c0);
1372         const int  a00       = iap[1];
1373         const int  a01       = iap[2];
1374         for (n1 = at2con.index[a01]; n1 < at2con.index[a01+1]; n1++)
1375         {
1376             c1 = at2con.a[n1];
1377             if (c1 != c0)
1378             {
1379                 const int *iap = constr_iatomptr(ia1, ia2, c1);
1380                 const int  a10 = iap[1];
1381                 const int  a11 = iap[2];
1382                 if (a10 == a01)
1383                 {
1384                     ac1 = a11;
1385                 }
1386                 else
1387                 {
1388                     ac1 = a10;
1389                 }
1390                 for (n2 = at2con.index[ac1]; n2 < at2con.index[ac1+1]; n2++)
1391                 {
1392                     c2 = at2con.a[n2];
1393                     if (c2 != c0 && c2 != c1)
1394                     {
1395                         const int *iap = constr_iatomptr(ia1, ia2, c2);
1396                         const int  a20 = iap[1];
1397                         const int  a21 = iap[2];
1398                         if (a20 == a00 || a21 == a00)
1399                         {
1400                             bTriangle = TRUE;
1401                         }
1402                     }
1403                 }
1404             }
1405         }
1406         if (bTriangle)
1407         {
1408             ncon_triangle++;
1409         }
1410     }
1411
1412     return ncon_triangle;
1413 }
1414
1415 //! Finds sequences of sequential constraints.
1416 static bool more_than_two_sequential_constraints(const InteractionLists &ilist,
1417                                                  const t_blocka         &at2con)
1418 {
1419     int       ncon1, ncon_tot, c;
1420     bool      bMoreThanTwoSequentialConstraints;
1421
1422     ncon1    = ilist[F_CONSTR].size()/3;
1423     ncon_tot = ncon1 + ilist[F_CONSTRNC].size()/3;
1424
1425     gmx::ArrayRef<const int> ia1 = ilist[F_CONSTR].iatoms;
1426     gmx::ArrayRef<const int> ia2 = ilist[F_CONSTRNC].iatoms;
1427
1428     bMoreThanTwoSequentialConstraints = FALSE;
1429     for (c = 0; c < ncon_tot && !bMoreThanTwoSequentialConstraints; c++)
1430     {
1431         const int *iap = constr_iatomptr(ia1, ia2, c);
1432         const int  a1  = iap[1];
1433         const int  a2  = iap[2];
1434         /* Check if this constraint has constraints connected at both atoms */
1435         if (at2con.index[a1+1] - at2con.index[a1] > 1 &&
1436             at2con.index[a2+1] - at2con.index[a2] > 1)
1437         {
1438             bMoreThanTwoSequentialConstraints = TRUE;
1439         }
1440     }
1441
1442     return bMoreThanTwoSequentialConstraints;
1443 }
1444
1445 Lincs *init_lincs(FILE *fplog, const gmx_mtop_t &mtop,
1446                   int nflexcon_global, ArrayRef<const t_blocka> at2con,
1447                   bool bPLINCS, int nIter, int nProjOrder)
1448 {
1449     // TODO this should become a unique_ptr
1450     Lincs                *li;
1451     bool                  bMoreThanTwoSeq;
1452
1453     if (fplog)
1454     {
1455         fprintf(fplog, "\nInitializing%s LINear Constraint Solver\n",
1456                 bPLINCS ? " Parallel" : "");
1457     }
1458
1459     li = new Lincs;
1460
1461     li->ncg      =
1462         gmx_mtop_ftype_count(mtop, F_CONSTR) +
1463         gmx_mtop_ftype_count(mtop, F_CONSTRNC);
1464     li->ncg_flex = nflexcon_global;
1465
1466     li->nIter  = nIter;
1467     li->nOrder = nProjOrder;
1468
1469     li->max_connect = 0;
1470     for (size_t mt = 0; mt < mtop.moltype.size(); mt++)
1471     {
1472         for (int a = 0; a < mtop.moltype[mt].atoms.nr; a++)
1473         {
1474             li->max_connect = std::max(li->max_connect,
1475                                        at2con[mt].index[a + 1] - at2con[mt].index[a]);
1476         }
1477     }
1478
1479     li->ncg_triangle = 0;
1480     bMoreThanTwoSeq  = FALSE;
1481     for (const gmx_molblock_t &molb : mtop.molblock)
1482     {
1483         const gmx_moltype_t &molt = mtop.moltype[molb.type];
1484
1485         li->ncg_triangle +=
1486             molb.nmol*
1487             count_triangle_constraints(molt.ilist, at2con[molb.type]);
1488
1489         if (!bMoreThanTwoSeq &&
1490             more_than_two_sequential_constraints(molt.ilist, at2con[molb.type]))
1491         {
1492             bMoreThanTwoSeq = TRUE;
1493         }
1494     }
1495
1496     /* Check if we need to communicate not only before LINCS,
1497      * but also before each iteration.
1498      * The check for only two sequential constraints is only
1499      * useful for the common case of H-bond only constraints.
1500      * With more effort we could also make it useful for small
1501      * molecules with nr. sequential constraints <= nOrder-1.
1502      */
1503     li->bCommIter = (bPLINCS && (li->nOrder < 1 || bMoreThanTwoSeq));
1504
1505     if (debug && bPLINCS)
1506     {
1507         fprintf(debug, "PLINCS communication before each iteration: %d\n",
1508                 static_cast<int>(li->bCommIter));
1509     }
1510
1511     /* LINCS can run on any number of threads.
1512      * Currently the number is fixed for the whole simulation,
1513      * but it could be set in set_lincs().
1514      * The current constraint to task assignment code can create independent
1515      * tasks only when not more than two constraints are connected sequentially.
1516      */
1517     li->ntask    = gmx_omp_nthreads_get(emntLINCS);
1518     li->bTaskDep = (li->ntask > 1 && bMoreThanTwoSeq);
1519     if (debug)
1520     {
1521         fprintf(debug, "LINCS: using %d threads, tasks are %sdependent\n",
1522                 li->ntask, li->bTaskDep ? "" : "in");
1523     }
1524     if (li->ntask == 1)
1525     {
1526         li->task.resize(1);
1527     }
1528     else
1529     {
1530         /* Allocate an extra elements for "task-overlap" constraints */
1531         li->task.resize(li->ntask + 1);
1532     }
1533
1534     if (bPLINCS || li->ncg_triangle > 0)
1535     {
1536         please_cite(fplog, "Hess2008a");
1537     }
1538     else
1539     {
1540         please_cite(fplog, "Hess97a");
1541     }
1542
1543     if (fplog)
1544     {
1545         fprintf(fplog, "The number of constraints is %d\n", li->ncg);
1546         if (bPLINCS)
1547         {
1548             fprintf(fplog, "There are constraints between atoms in different decomposition domains,\n"
1549                     "will communicate selected coordinates each lincs iteration\n");
1550         }
1551         if (li->ncg_triangle > 0)
1552         {
1553             fprintf(fplog,
1554                     "%d constraints are involved in constraint triangles,\n"
1555                     "will apply an additional matrix expansion of order %d for couplings\n"
1556                     "between constraints inside triangles\n",
1557                     li->ncg_triangle, li->nOrder);
1558         }
1559     }
1560
1561     return li;
1562 }
1563
1564 void done_lincs(Lincs *li)
1565 {
1566     delete li;
1567 }
1568
1569 /*! \brief Sets up the work division over the threads. */
1570 static void lincs_thread_setup(Lincs *li, int natoms)
1571 {
1572     li->atf.resize(natoms);
1573
1574     gmx::ArrayRef<gmx_bitmask_t> atf = li->atf;
1575
1576     /* Clear the atom flags */
1577     for (gmx_bitmask_t &mask : atf)
1578     {
1579         bitmask_clear(&mask);
1580     }
1581
1582     if (li->ntask > BITMASK_SIZE)
1583     {
1584         gmx_fatal(FARGS, "More than %d threads is not supported for LINCS.", BITMASK_SIZE);
1585     }
1586
1587     for (int th = 0; th < li->ntask; th++)
1588     {
1589         const Task *li_task = &li->task[th];
1590
1591         /* For each atom set a flag for constraints from each */
1592         for (int b = li_task->b0; b < li_task->b1; b++)
1593         {
1594             bitmask_set_bit(&atf[li->atoms[b].index1], th);
1595             bitmask_set_bit(&atf[li->atoms[b].index2], th);
1596         }
1597     }
1598
1599 #pragma omp parallel for num_threads(li->ntask) schedule(static)
1600     for (int th = 0; th < li->ntask; th++)
1601     {
1602         try
1603         {
1604             Task          *li_task;
1605             gmx_bitmask_t  mask;
1606             int            b;
1607
1608             li_task = &li->task[th];
1609
1610             bitmask_init_low_bits(&mask, th);
1611
1612             li_task->ind.clear();
1613             li_task->ind_r.clear();
1614             for (b = li_task->b0; b < li_task->b1; b++)
1615             {
1616                 /* We let the constraint with the lowest thread index
1617                  * operate on atoms with constraints from multiple threads.
1618                  */
1619                 if (bitmask_is_disjoint(atf[li->atoms[b].index1], mask) &&
1620                     bitmask_is_disjoint(atf[li->atoms[b].index2], mask))
1621                 {
1622                     /* Add the constraint to the local atom update index */
1623                     li_task->ind.push_back(b);
1624                 }
1625                 else
1626                 {
1627                     /* Add the constraint to the rest block */
1628                     li_task->ind_r.push_back(b);
1629                 }
1630             }
1631         }
1632         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1633     }
1634
1635     /* We need to copy all constraints which have not be assigned
1636      * to a thread to a separate list which will be handled by one thread.
1637      */
1638     Task *li_m = &li->task[li->ntask];
1639
1640     li_m->ind.clear();
1641     for (int th = 0; th < li->ntask; th++)
1642     {
1643         const Task &li_task = li->task[th];
1644
1645         for (int ind_r : li_task.ind_r)
1646         {
1647             li_m->ind.push_back(ind_r);
1648         }
1649
1650         if (debug)
1651         {
1652             fprintf(debug, "LINCS thread %d: %zu constraints\n",
1653                     th, li_task.ind.size());
1654         }
1655     }
1656
1657     if (debug)
1658     {
1659         fprintf(debug, "LINCS thread r: %zu constraints\n",
1660                 li_m->ind.size());
1661     }
1662 }
1663
1664 //! Assign a constraint.
1665 static void assign_constraint(Lincs *li,
1666                               int constraint_index,
1667                               int a1, int a2,
1668                               real lenA, real lenB,
1669                               const t_blocka *at2con)
1670 {
1671     int con;
1672
1673     con = li->nc;
1674
1675     /* Make an mapping of local topology constraint index to LINCS index */
1676     li->con_index[constraint_index] = con;
1677
1678     li->bllen0[con]       = lenA;
1679     li->ddist[con]        = lenB - lenA;
1680     /* Set the length to the topology A length */
1681     li->bllen[con]        = lenA;
1682     li->atoms[con].index1 = a1;
1683     li->atoms[con].index2 = a2;
1684
1685     /* Make space in the constraint connection matrix for constraints
1686      * connected to both end of the current constraint.
1687      */
1688     li->ncc +=
1689         at2con->index[a1 + 1] - at2con->index[a1] - 1 +
1690         at2con->index[a2 + 1] - at2con->index[a2] - 1;
1691
1692     li->blnr[con + 1] = li->ncc;
1693
1694     /* Increase the constraint count */
1695     li->nc++;
1696 }
1697
1698 /*! \brief Check if constraint with topology index constraint_index is connected
1699  * to other constraints, and if so add those connected constraints to our task. */
1700 static void check_assign_connected(Lincs *li,
1701                                    const t_iatom *iatom,
1702                                    const t_idef &idef,
1703                                    bool bDynamics,
1704                                    int a1, int a2,
1705                                    const t_blocka *at2con)
1706 {
1707     /* Currently this function only supports constraint groups
1708      * in which all constraints share at least one atom
1709      * (e.g. H-bond constraints).
1710      * Check both ends of the current constraint for
1711      * connected constraints. We need to assign those
1712      * to the same task.
1713      */
1714     int end;
1715
1716     for (end = 0; end < 2; end++)
1717     {
1718         int a, k;
1719
1720         a = (end == 0 ? a1 : a2);
1721
1722         for (k = at2con->index[a]; k < at2con->index[a + 1]; k++)
1723         {
1724             int cc;
1725
1726             cc = at2con->a[k];
1727             /* Check if constraint cc has not yet been assigned */
1728             if (li->con_index[cc] == -1)
1729             {
1730                 int  type;
1731                 real lenA, lenB;
1732
1733                 type = iatom[cc*3];
1734                 lenA = idef.iparams[type].constr.dA;
1735                 lenB = idef.iparams[type].constr.dB;
1736
1737                 if (bDynamics || lenA != 0 || lenB != 0)
1738                 {
1739                     assign_constraint(li, cc, iatom[3*cc + 1], iatom[3*cc + 2], lenA, lenB, at2con);
1740                 }
1741             }
1742         }
1743     }
1744 }
1745
1746 /*! \brief Check if constraint with topology index constraint_index is involved
1747  * in a constraint triangle, and if so add the other two constraints
1748  * in the triangle to our task. */
1749 static void check_assign_triangle(Lincs *li,
1750                                   const t_iatom *iatom,
1751                                   const t_idef &idef,
1752                                   bool bDynamics,
1753                                   int constraint_index,
1754                                   int a1, int a2,
1755                                   const t_blocka *at2con)
1756 {
1757     int nca, cc[32], ca[32], k;
1758     int c_triangle[2] = { -1, -1 };
1759
1760     nca = 0;
1761     for (k = at2con->index[a1]; k < at2con->index[a1 + 1]; k++)
1762     {
1763         int c;
1764
1765         c = at2con->a[k];
1766         if (c != constraint_index)
1767         {
1768             int aa1, aa2;
1769
1770             aa1 = iatom[c*3 + 1];
1771             aa2 = iatom[c*3 + 2];
1772             if (aa1 != a1)
1773             {
1774                 cc[nca] = c;
1775                 ca[nca] = aa1;
1776                 nca++;
1777             }
1778             if (aa2 != a1)
1779             {
1780                 cc[nca] = c;
1781                 ca[nca] = aa2;
1782                 nca++;
1783             }
1784         }
1785     }
1786
1787     for (k = at2con->index[a2]; k < at2con->index[a2 + 1]; k++)
1788     {
1789         int c;
1790
1791         c = at2con->a[k];
1792         if (c != constraint_index)
1793         {
1794             int aa1, aa2, i;
1795
1796             aa1 = iatom[c*3 + 1];
1797             aa2 = iatom[c*3 + 2];
1798             if (aa1 != a2)
1799             {
1800                 for (i = 0; i < nca; i++)
1801                 {
1802                     if (aa1 == ca[i])
1803                     {
1804                         c_triangle[0] = cc[i];
1805                         c_triangle[1] = c;
1806                     }
1807                 }
1808             }
1809             if (aa2 != a2)
1810             {
1811                 for (i = 0; i < nca; i++)
1812                 {
1813                     if (aa2 == ca[i])
1814                     {
1815                         c_triangle[0] = cc[i];
1816                         c_triangle[1] = c;
1817                     }
1818                 }
1819             }
1820         }
1821     }
1822
1823     if (c_triangle[0] >= 0)
1824     {
1825         int end;
1826
1827         for (end = 0; end < 2; end++)
1828         {
1829             /* Check if constraint c_triangle[end] has not yet been assigned */
1830             if (li->con_index[c_triangle[end]] == -1)
1831             {
1832                 int  i, type;
1833                 real lenA, lenB;
1834
1835                 i    = c_triangle[end]*3;
1836                 type = iatom[i];
1837                 lenA = idef.iparams[type].constr.dA;
1838                 lenB = idef.iparams[type].constr.dB;
1839
1840                 if (bDynamics || lenA != 0 || lenB != 0)
1841                 {
1842                     assign_constraint(li, c_triangle[end], iatom[i + 1], iatom[i + 2], lenA, lenB, at2con);
1843                 }
1844             }
1845         }
1846     }
1847 }
1848
1849 //! Sets matrix indices.
1850 static void set_matrix_indices(Lincs                *li,
1851                                const Task           &li_task,
1852                                const t_blocka       *at2con,
1853                                bool                  bSortMatrix)
1854 {
1855     for (int b = li_task.b0; b < li_task.b1; b++)
1856     {
1857         const int a1 = li->atoms[b].index1;
1858         const int a2 = li->atoms[b].index2;
1859
1860         int       i = li->blnr[b];
1861         for (int k = at2con->index[a1]; k < at2con->index[a1 + 1]; k++)
1862         {
1863             int concon = li->con_index[at2con->a[k]];
1864             if (concon != b)
1865             {
1866                 li->blbnb[i++] = concon;
1867             }
1868         }
1869         for (int k = at2con->index[a2]; k < at2con->index[a2 + 1]; k++)
1870         {
1871             int concon = li->con_index[at2con->a[k]];
1872             if (concon != b)
1873             {
1874                 li->blbnb[i++] = concon;
1875             }
1876         }
1877
1878         if (bSortMatrix)
1879         {
1880             /* Order the blbnb matrix to optimize memory access */
1881             std::sort(li->blbnb.begin()+li->blnr[b], li->blbnb.begin()+li->blnr[b+1]);
1882         }
1883     }
1884 }
1885
1886 void set_lincs(const t_idef         &idef,
1887                const t_mdatoms      &md,
1888                bool                  bDynamics,
1889                const t_commrec      *cr,
1890                Lincs                *li)
1891 {
1892     int          natoms;
1893     t_blocka     at2con;
1894     t_iatom     *iatom;
1895
1896     li->nc_real = 0;
1897     li->nc      = 0;
1898     li->ncc     = 0;
1899     /* Zero the thread index ranges.
1900      * Otherwise without local constraints we could return with old ranges.
1901      */
1902     for (int i = 0; i < li->ntask; i++)
1903     {
1904         li->task[i].b0   = 0;
1905         li->task[i].b1   = 0;
1906         li->task[i].ind.clear();
1907     }
1908     if (li->ntask > 1)
1909     {
1910         li->task[li->ntask].ind.clear();
1911     }
1912
1913     /* This is the local topology, so there are only F_CONSTR constraints */
1914     if (idef.il[F_CONSTR].nr == 0)
1915     {
1916         /* There are no constraints,
1917          * we do not need to fill any data structures.
1918          */
1919         return;
1920     }
1921
1922     if (debug)
1923     {
1924         fprintf(debug, "Building the LINCS connectivity\n");
1925     }
1926
1927     if (DOMAINDECOMP(cr))
1928     {
1929         if (cr->dd->constraints)
1930         {
1931             int start;
1932
1933             dd_get_constraint_range(cr->dd, &start, &natoms);
1934         }
1935         else
1936         {
1937             natoms = dd_numHomeAtoms(*cr->dd);
1938         }
1939     }
1940     else
1941     {
1942         natoms = md.homenr;
1943     }
1944
1945     at2con = make_at2con(natoms, idef.il, idef.iparams,
1946                          flexibleConstraintTreatment(bDynamics));
1947
1948     const int ncon_tot = idef.il[F_CONSTR].nr/3;
1949
1950     /* Ensure we have enough padding for aligned loads for each thread */
1951     const int numEntries = ncon_tot + li->ntask*simd_width;
1952     li->con_index.resize(numEntries);
1953     li->bllen0.resize(numEntries);
1954     li->ddist.resize(numEntries);
1955     li->atoms.resize(numEntries);
1956     li->blc.resize(numEntries);
1957     li->blc1.resize(numEntries);
1958     li->blnr.resize(numEntries + 1);
1959     li->bllen.resize(numEntries);
1960     li->tmpv.resizeWithPadding(numEntries);
1961     if (DOMAINDECOMP(cr))
1962     {
1963         li->nlocat.resize(numEntries);
1964     }
1965     li->tmp1.resize(numEntries);
1966     li->tmp2.resize(numEntries);
1967     li->tmp3.resize(numEntries);
1968     li->tmp4.resize(numEntries);
1969     li->mlambda.resize(numEntries);
1970
1971     iatom = idef.il[F_CONSTR].iatoms;
1972
1973     li->blnr[0]   = li->ncc;
1974
1975     /* Assign the constraints for li->ntask LINCS tasks.
1976      * We target a uniform distribution of constraints over the tasks.
1977      * Note that when flexible constraints are present, but are removed here
1978      * (e.g. because we are doing EM) we get imbalance, but since that doesn't
1979      * happen during normal MD, that's ok.
1980      */
1981
1982     /* Determine the number of constraints we need to assign here */
1983     int ncon_assign = ncon_tot;
1984     if (!bDynamics)
1985     {
1986         /* With energy minimization, flexible constraints are ignored
1987          * (and thus minimized, as they should be).
1988          */
1989         ncon_assign -= countFlexibleConstraints(idef.il, idef.iparams);
1990     }
1991
1992     /* Set the target constraint count per task to exactly uniform,
1993      * this might be overridden below.
1994      */
1995     int ncon_target = (ncon_assign + li->ntask - 1)/li->ntask;
1996
1997     /* Mark all constraints as unassigned by setting their index to -1 */
1998     for (int con = 0; con < ncon_tot; con++)
1999     {
2000         li->con_index[con] = -1;
2001     }
2002
2003     int con = 0;
2004     for (int th = 0; th < li->ntask; th++)
2005     {
2006         Task *li_task;
2007
2008         li_task = &li->task[th];
2009
2010 #if GMX_SIMD_HAVE_REAL
2011         /* With indepedent tasks we likely have H-bond constraints or constraint
2012          * pairs. The connected constraints will be pulled into the task, so the
2013          * constraints per task will often exceed ncon_target.
2014          * Triangle constraints can also increase the count, but there are
2015          * relatively few of those, so we usually expect to get ncon_target.
2016          */
2017         if (li->bTaskDep)
2018         {
2019             /* We round ncon_target to a multiple of GMX_SIMD_WIDTH,
2020              * since otherwise a lot of operations can be wasted.
2021              * There are several ways to round here, we choose the one
2022              * that alternates block sizes, which helps with Intel HT.
2023              */
2024             ncon_target = ((ncon_assign*(th + 1))/li->ntask - li->nc_real + GMX_SIMD_REAL_WIDTH - 1) & ~(GMX_SIMD_REAL_WIDTH - 1);
2025         }
2026 #endif      // GMX_SIMD==2 && GMX_SIMD_HAVE_REAL
2027
2028         /* Continue filling the arrays where we left off with the previous task,
2029          * including padding for SIMD.
2030          */
2031         li_task->b0 = li->nc;
2032
2033         while (con < ncon_tot && li->nc - li_task->b0 < ncon_target)
2034         {
2035             if (li->con_index[con] == -1)
2036             {
2037                 int  type, a1, a2;
2038                 real lenA, lenB;
2039
2040                 type   = iatom[3*con];
2041                 a1     = iatom[3*con + 1];
2042                 a2     = iatom[3*con + 2];
2043                 lenA   = idef.iparams[type].constr.dA;
2044                 lenB   = idef.iparams[type].constr.dB;
2045                 /* Skip the flexible constraints when not doing dynamics */
2046                 if (bDynamics || lenA != 0 || lenB != 0)
2047                 {
2048                     assign_constraint(li, con, a1, a2, lenA, lenB, &at2con);
2049
2050                     if (li->ntask > 1 && !li->bTaskDep)
2051                     {
2052                         /* We can generate independent tasks. Check if we
2053                          * need to assign connected constraints to our task.
2054                          */
2055                         check_assign_connected(li, iatom, idef, bDynamics,
2056                                                a1, a2, &at2con);
2057                     }
2058                     if (li->ntask > 1 && li->ncg_triangle > 0)
2059                     {
2060                         /* Ensure constraints in one triangle are assigned
2061                          * to the same task.
2062                          */
2063                         check_assign_triangle(li, iatom, idef, bDynamics,
2064                                               con, a1, a2, &at2con);
2065                     }
2066                 }
2067             }
2068
2069             con++;
2070         }
2071
2072         li_task->b1 = li->nc;
2073
2074         if (simd_width > 1)
2075         {
2076             /* Copy the last atom pair indices and lengths for constraints
2077              * up to a multiple of simd_width, such that we can do all
2078              * SIMD operations without having to worry about end effects.
2079              */
2080             int i, last;
2081
2082             li->nc = ((li_task->b1 + simd_width - 1)/simd_width)*simd_width;
2083             last   = li_task->b1 - 1;
2084             for (i = li_task->b1; i < li->nc; i++)
2085             {
2086                 li->atoms[i]     = li->atoms[last];
2087                 li->bllen0[i]    = li->bllen0[last];
2088                 li->ddist[i]     = li->ddist[last];
2089                 li->bllen[i]     = li->bllen[last];
2090                 li->blnr[i + 1]  = li->blnr[last + 1];
2091             }
2092         }
2093
2094         /* Keep track of how many constraints we assigned */
2095         li->nc_real += li_task->b1 - li_task->b0;
2096
2097         if (debug)
2098         {
2099             fprintf(debug, "LINCS task %d constraints %d - %d\n",
2100                     th, li_task->b0, li_task->b1);
2101         }
2102     }
2103
2104     assert(li->nc_real == ncon_assign);
2105
2106     bool bSortMatrix;
2107
2108     /* Without DD we order the blbnb matrix to optimize memory access.
2109      * With DD the overhead of sorting is more than the gain during access.
2110      */
2111     bSortMatrix = !DOMAINDECOMP(cr);
2112
2113     li->blbnb.resize(li->ncc);
2114
2115 #pragma omp parallel for num_threads(li->ntask) schedule(static)
2116     for (int th = 0; th < li->ntask; th++)
2117     {
2118         try
2119         {
2120             Task &li_task = li->task[th];
2121
2122             if (li->ncg_triangle > 0)
2123             {
2124                 /* This is allocating too much, but it is difficult to improve */
2125                 li_task.triangle.resize(li_task.b1 - li_task.b0);
2126                 li_task.tri_bits.resize(li_task.b1 - li_task.b0);
2127             }
2128
2129             set_matrix_indices(li, li_task, &at2con, bSortMatrix);
2130         }
2131         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2132     }
2133
2134     done_blocka(&at2con);
2135
2136     if (cr->dd == nullptr)
2137     {
2138         /* Since the matrix is static, we should free some memory */
2139         li->blbnb.resize(li->ncc);
2140     }
2141
2142     li->blmf.resize(li->ncc);
2143     li->blmf1.resize(li->ncc);
2144     li->tmpncc.resize(li->ncc);
2145
2146     gmx::ArrayRef<const int> nlocat_dd = dd_constraints_nlocalatoms(cr->dd);
2147     if (!nlocat_dd.empty())
2148     {
2149         /* Convert nlocat from local topology to LINCS constraint indexing */
2150         for (con = 0; con < ncon_tot; con++)
2151         {
2152             li->nlocat[li->con_index[con]] = nlocat_dd[con];
2153         }
2154     }
2155     else
2156     {
2157         li->nlocat.clear();
2158     }
2159
2160     if (debug)
2161     {
2162         fprintf(debug, "Number of constraints is %d, padded %d, couplings %d\n",
2163                 li->nc_real, li->nc, li->ncc);
2164     }
2165
2166     if (li->ntask > 1)
2167     {
2168         lincs_thread_setup(li, md.nr);
2169     }
2170
2171     set_lincs_matrix(li, md.invmass, md.lambda);
2172 }
2173
2174 //! Issues a warning when LINCS constraints cannot be satisfied.
2175 static void lincs_warning(gmx_domdec_t *dd, const rvec *x, rvec *xprime, t_pbc *pbc,
2176                           int ncons, gmx::ArrayRef<const AtomPair> atoms,
2177                           gmx::ArrayRef<const real> bllen, real wangle,
2178                           int maxwarn, int *warncount)
2179 {
2180     real wfac = std::cos(DEG2RAD*wangle);
2181
2182     fprintf(stderr,
2183             "bonds that rotated more than %g degrees:\n"
2184             " atom 1 atom 2  angle  previous, current, constraint length\n",
2185             wangle);
2186
2187     for (int b = 0; b < ncons; b++)
2188     {
2189         const int i = atoms[b].index1;
2190         const int j = atoms[b].index2;
2191         rvec      v0;
2192         rvec      v1;
2193         if (pbc)
2194         {
2195             pbc_dx_aiuc(pbc, x[i], x[j], v0);
2196             pbc_dx_aiuc(pbc, xprime[i], xprime[j], v1);
2197         }
2198         else
2199         {
2200             rvec_sub(x[i], x[j], v0);
2201             rvec_sub(xprime[i], xprime[j], v1);
2202         }
2203         real d0     = norm(v0);
2204         real d1     = norm(v1);
2205         real cosine = ::iprod(v0, v1)/(d0*d1);
2206         if (cosine < wfac)
2207         {
2208             fprintf(stderr,
2209                     " %6d %6d  %5.1f  %8.4f %8.4f    %8.4f\n",
2210                     ddglatnr(dd, i), ddglatnr(dd, j),
2211                     RAD2DEG*std::acos(cosine), d0, d1, bllen[b]);
2212             if (!std::isfinite(d1))
2213             {
2214                 gmx_fatal(FARGS, "Bond length not finite.");
2215             }
2216
2217             (*warncount)++;
2218         }
2219     }
2220     if (*warncount > maxwarn)
2221     {
2222         too_many_constraint_warnings(econtLINCS, *warncount);
2223     }
2224 }
2225
2226 //! Status information about how well LINCS satisified the constraints in this domain
2227 struct LincsDeviations
2228 {
2229     //! The maximum over all bonds in this domain of the relative deviation in bond lengths
2230     real maxDeviation = 0;
2231     //! Sum over all bonds in this domain of the squared relative deviation
2232     real sumSquaredDeviation = 0;
2233     //! Index of bond with max deviation
2234     int  indexOfMaxDeviation = -1;
2235     //! Number of bonds constrained in this domain
2236     int  numConstraints = 0;
2237 };
2238
2239 //! Determine how well the constraints have been satisfied.
2240 static LincsDeviations
2241 makeLincsDeviations(const Lincs &lincsd,
2242                     const rvec  *x,
2243                     const t_pbc *pbc)
2244 {
2245     LincsDeviations                    result;
2246     const ArrayRef<const AtomPair>     atoms  = lincsd.atoms;
2247     const ArrayRef<const real>         bllen  = lincsd.bllen;
2248     const ArrayRef<const int>          nlocat = lincsd.nlocat;
2249
2250     for (int task = 0; task < lincsd.ntask; task++)
2251     {
2252         for (int b = lincsd.task[task].b0; b < lincsd.task[task].b1; b++)
2253         {
2254             rvec dx;
2255             if (pbc)
2256             {
2257                 pbc_dx_aiuc(pbc, x[atoms[b].index1], x[atoms[b].index2], dx);
2258             }
2259             else
2260             {
2261                 rvec_sub(x[atoms[b].index1], x[atoms[b].index2], dx);
2262             }
2263             real r2  = ::norm2(dx);
2264             real len = r2*gmx::invsqrt(r2);
2265             real d   = std::abs(len/bllen[b] - 1.0_real);
2266             if (d > result.maxDeviation && (nlocat.empty() || nlocat[b]))
2267             {
2268                 result.maxDeviation        = d;
2269                 result.indexOfMaxDeviation = b;
2270             }
2271             if (nlocat.empty())
2272             {
2273                 result.sumSquaredDeviation += d*d;
2274                 result.numConstraints++;
2275             }
2276             else
2277             {
2278                 result.sumSquaredDeviation += nlocat[b]*d*d;
2279                 result.numConstraints      += nlocat[b];
2280             }
2281         }
2282     }
2283
2284     if (!nlocat.empty())
2285     {
2286         result.numConstraints      /= 2;
2287         result.sumSquaredDeviation *= 0.5;
2288     }
2289     return result;
2290 }
2291
2292 bool constrain_lincs(bool computeRmsd,
2293                      const t_inputrec &ir,
2294                      int64_t step,
2295                      Lincs *lincsd, const t_mdatoms &md,
2296                      const t_commrec *cr,
2297                      const gmx_multisim_t *ms,
2298                      const rvec *x, rvec *xprime, rvec *min_proj,
2299                      const matrix box, t_pbc *pbc,
2300                      real lambda, real *dvdlambda,
2301                      real invdt, rvec *v,
2302                      bool bCalcVir, tensor vir_r_m_dr,
2303                      ConstraintVariable econq,
2304                      t_nrnb *nrnb,
2305                      int maxwarn, int *warncount)
2306 {
2307     bool bOK = TRUE;
2308
2309     /* This boolean should be set by a flag passed to this routine.
2310      * We can also easily check if any constraint length is changed,
2311      * if not dH/dlambda=0 and we can also set the boolean to FALSE.
2312      */
2313     bool bCalcDHDL = (ir.efep != efepNO && dvdlambda != nullptr);
2314
2315     if (lincsd->nc == 0 && cr->dd == nullptr)
2316     {
2317         if (computeRmsd)
2318         {
2319             lincsd->rmsdData = {{0}};
2320         }
2321
2322         return bOK;
2323     }
2324
2325     if (econq == ConstraintVariable::Positions)
2326     {
2327         /* We can't use bCalcDHDL here, since NULL can be passed for dvdlambda
2328          * also with efep!=fepNO.
2329          */
2330         if (ir.efep != efepNO)
2331         {
2332             if (md.nMassPerturbed && lincsd->matlam != md.lambda)
2333             {
2334                 set_lincs_matrix(lincsd, md.invmass, md.lambda);
2335             }
2336
2337             for (int i = 0; i < lincsd->nc; i++)
2338             {
2339                 lincsd->bllen[i] = lincsd->bllen0[i] + lambda*lincsd->ddist[i];
2340             }
2341         }
2342
2343         if (lincsd->ncg_flex)
2344         {
2345             /* Set the flexible constraint lengths to the old lengths */
2346             if (pbc != nullptr)
2347             {
2348                 for (int i = 0; i < lincsd->nc; i++)
2349                 {
2350                     if (lincsd->bllen[i] == 0)
2351                     {
2352                         rvec dx;
2353                         pbc_dx_aiuc(pbc, x[lincsd->atoms[i].index1], x[lincsd->atoms[i].index2], dx);
2354                         lincsd->bllen[i] = norm(dx);
2355                     }
2356                 }
2357             }
2358             else
2359             {
2360                 for (int i = 0; i < lincsd->nc; i++)
2361                 {
2362                     if (lincsd->bllen[i] == 0)
2363                     {
2364                         lincsd->bllen[i] =
2365                             std::sqrt(distance2(x[lincsd->atoms[i].index1],
2366                                                 x[lincsd->atoms[i].index2]));
2367                     }
2368                 }
2369             }
2370         }
2371
2372         const bool printDebugOutput = ((debug != nullptr) && lincsd->nc > 0);
2373         if (printDebugOutput)
2374         {
2375             LincsDeviations deviations = makeLincsDeviations(*lincsd, xprime, pbc);
2376             fprintf(debug, "   Rel. Constraint Deviation:  RMS         MAX     between atoms\n");
2377             fprintf(debug, "       Before LINCS          %.6f    %.6f %6d %6d\n",
2378                     std::sqrt(deviations.sumSquaredDeviation/deviations.numConstraints),
2379                     deviations.maxDeviation,
2380                     ddglatnr(cr->dd, lincsd->atoms[deviations.indexOfMaxDeviation].index1),
2381                     ddglatnr(cr->dd, lincsd->atoms[deviations.indexOfMaxDeviation].index2));
2382         }
2383
2384         /* This bWarn var can be updated by multiple threads
2385          * at the same time. But as we only need to detect
2386          * if a warning occurred or not, this is not an issue.
2387          */
2388         bool bWarn = FALSE;
2389
2390         /* The OpenMP parallel region of constrain_lincs for coords */
2391 #pragma omp parallel num_threads(lincsd->ntask)
2392         {
2393             try
2394             {
2395                 int th = gmx_omp_get_thread_num();
2396
2397                 clear_mat(lincsd->task[th].vir_r_m_dr);
2398
2399                 do_lincs(x, xprime, box, pbc, lincsd, th,
2400                          md.invmass, cr,
2401                          bCalcDHDL,
2402                          ir.LincsWarnAngle, &bWarn,
2403                          invdt, v, bCalcVir,
2404                          th == 0 ? vir_r_m_dr : lincsd->task[th].vir_r_m_dr);
2405             }
2406             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2407         }
2408
2409         if (computeRmsd || printDebugOutput || bWarn)
2410         {
2411             LincsDeviations deviations = makeLincsDeviations(*lincsd, xprime, pbc);
2412
2413             if (computeRmsd)
2414             {
2415                 // This is reduced across domains in compute_globals and
2416                 // reported to the log file.
2417                 lincsd->rmsdData[0] = deviations.numConstraints;
2418                 lincsd->rmsdData[1] = deviations.sumSquaredDeviation;
2419             }
2420             else
2421             {
2422                 // This is never read
2423                 lincsd->rmsdData = {{0}};
2424             }
2425             if (printDebugOutput)
2426             {
2427                 fprintf(debug,
2428                         "        After LINCS          %.6f    %.6f %6d %6d\n\n",
2429                         std::sqrt(deviations.sumSquaredDeviation/deviations.numConstraints),
2430                         deviations.maxDeviation,
2431                         ddglatnr(cr->dd, lincsd->atoms[deviations.indexOfMaxDeviation].index1),
2432                         ddglatnr(cr->dd, lincsd->atoms[deviations.indexOfMaxDeviation].index2));
2433             }
2434
2435             if (bWarn)
2436             {
2437                 if (maxwarn < INT_MAX)
2438                 {
2439                     std::string simMesg;
2440                     if (isMultiSim(ms))
2441                     {
2442                         simMesg += gmx::formatString(" in simulation %d", ms->sim);
2443                     }
2444                     fprintf(stderr,
2445                             "\nStep %" PRId64 ", time %g (ps)  LINCS WARNING%s\n"
2446                             "relative constraint deviation after LINCS:\n"
2447                             "rms %.6f, max %.6f (between atoms %d and %d)\n",
2448                             step, ir.init_t+step*ir.delta_t,
2449                             simMesg.c_str(),
2450                             std::sqrt(deviations.sumSquaredDeviation/deviations.numConstraints),
2451                             deviations.maxDeviation,
2452                             ddglatnr(cr->dd, lincsd->atoms[deviations.indexOfMaxDeviation].index1),
2453                             ddglatnr(cr->dd, lincsd->atoms[deviations.indexOfMaxDeviation].index2));
2454
2455                     lincs_warning(cr->dd, x, xprime, pbc,
2456                                   lincsd->nc, lincsd->atoms, lincsd->bllen,
2457                                   ir.LincsWarnAngle, maxwarn, warncount);
2458                 }
2459                 bOK = (deviations.maxDeviation < 0.5);
2460             }
2461         }
2462
2463         if (lincsd->ncg_flex)
2464         {
2465             for (int i = 0; (i < lincsd->nc); i++)
2466             {
2467                 if (lincsd->bllen0[i] == 0 && lincsd->ddist[i] == 0)
2468                 {
2469                     lincsd->bllen[i] = 0;
2470                 }
2471             }
2472         }
2473     }
2474     else
2475     {
2476         /* The OpenMP parallel region of constrain_lincs for derivatives */
2477 #pragma omp parallel num_threads(lincsd->ntask)
2478         {
2479             try
2480             {
2481                 int th = gmx_omp_get_thread_num();
2482
2483                 do_lincsp(x, xprime, min_proj, pbc, lincsd, th,
2484                           md.invmass, econq, bCalcDHDL,
2485                           bCalcVir, th == 0 ? vir_r_m_dr : lincsd->task[th].vir_r_m_dr);
2486             }
2487             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2488         }
2489     }
2490
2491     if (bCalcDHDL)
2492     {
2493         /* Reduce the dH/dlambda contributions over the threads */
2494         real dhdlambda;
2495         int  th;
2496
2497         dhdlambda = 0;
2498         for (th = 0; th < lincsd->ntask; th++)
2499         {
2500             dhdlambda += lincsd->task[th].dhdlambda;
2501         }
2502         if (econq == ConstraintVariable::Positions)
2503         {
2504             /* dhdlambda contains dH/dlambda*dt^2, correct for this */
2505             /* TODO This should probably use invdt, so that sd integrator scaling works properly */
2506             dhdlambda /= ir.delta_t*ir.delta_t;
2507         }
2508         *dvdlambda += dhdlambda;
2509     }
2510
2511     if (bCalcVir && lincsd->ntask > 1)
2512     {
2513         for (int i = 1; i < lincsd->ntask; i++)
2514         {
2515             m_add(vir_r_m_dr, lincsd->task[i].vir_r_m_dr, vir_r_m_dr);
2516         }
2517     }
2518
2519     /* count assuming nit=1 */
2520     inc_nrnb(nrnb, eNR_LINCS, lincsd->nc_real);
2521     inc_nrnb(nrnb, eNR_LINCSMAT, (2+lincsd->nOrder)*lincsd->ncc);
2522     if (lincsd->ntriangle > 0)
2523     {
2524         inc_nrnb(nrnb, eNR_LINCSMAT, lincsd->nOrder*lincsd->ncc_triangle);
2525     }
2526     if (v)
2527     {
2528         inc_nrnb(nrnb, eNR_CONSTR_V, lincsd->nc_real*2);
2529     }
2530     if (bCalcVir)
2531     {
2532         inc_nrnb(nrnb, eNR_CONSTR_VIR, lincsd->nc_real);
2533     }
2534
2535     return bOK;
2536 }
2537
2538 }  // namespace gmx