src/gromacs/mdlib/vsite.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 #include "gmxpre.h"
  38
  39 #include "vsite.h"
  40
  41 #include <stdio.h>
  42
  43 #include <algorithm>
  44 #include <vector>
  45
  46 #include "gromacs/domdec/domdec.h"
  47 #include "gromacs/domdec/domdec_struct.h"
  48 #include "gromacs/gmxlib/network.h"
  49 #include "gromacs/gmxlib/nrnb.h"
  50 #include "gromacs/math/functions.h"
  51 #include "gromacs/math/vec.h"
  52 #include "gromacs/mdlib/gmx_omp_nthreads.h"
  53 #include "gromacs/mdtypes/commrec.h"
  54 #include "gromacs/pbcutil/ishift.h"
  55 #include "gromacs/pbcutil/mshift.h"
  56 #include "gromacs/pbcutil/pbc.h"
  57 #include "gromacs/topology/ifunc.h"
  58 #include "gromacs/topology/mtop_util.h"
  59 #include "gromacs/utility/exceptions.h"
  60 #include "gromacs/utility/fatalerror.h"
  61 #include "gromacs/utility/gmxassert.h"
  62 #include "gromacs/utility/gmxomp.h"
  63 #include "gromacs/utility/smalloc.h"
  64
  65
  66 /* The strategy used here for assigning virtual sites to (thread-)tasks
  67  * is as follows:
  68  *
  69  * We divide the atom range that vsites operate on (natoms_local with DD,
  70  * 0 - last atom involved in vsites without DD) equally over all threads.
  71  *
  72  * Vsites in the local range constructed from atoms in the local range
  73  * and/or other vsites that are fully local are assigned to a simple,
  74  * independent task.
  75  *
  76  * Vsites that are not assigned after using the above criterion get assigned
  77  * to a so called "interdependent" thread task when none of the constructing
  78  * atoms is a vsite. These tasks are called interdependent, because one task
  79  * accesses atoms assigned to a different task/thread.
  80  * Note that this option is turned off with large (local) atom counts
  81  * to avoid high memory usage.
  82  *
  83  * Any remaining vsites are assigned to a separate master thread task.
  84  */
  85
  86 using gmx::RVec;
  87
  88 static void init_ilist(t_ilist *ilist)
  89 {
  90     for (int i = 0; i < F_NRE; i++)
  91     {
  92         ilist[i].nr     = 0;
  93         ilist[i].nalloc = 0;
  94         ilist[i].iatoms = nullptr;
  95     }
  96 }
  97
  98 /*! \brief List of atom indices belonging to a task */
  99 struct AtomIndex {
 100     //! List of atom indices
 101     std::vector<int> atom;
 102 };
 103
 104 /*! \brief Data structure for thread tasks that use constructing atoms outside their own atom range */
 105 struct InterdependentTask
 106 {
 107     //! The interaction lists, only vsite entries are used
 108     t_ilist                ilist[F_NRE];
 109     //! Thread/task-local force buffer
 110     std::vector<RVec>      force;
 111     //! The atom indices of the vsites of our task
 112     std::vector<int>       vsite;
 113     //! Flags if elements in force are spread to or not
 114     std::vector<bool>      use;
 115     //! The number of entries set to true in use
 116     int                    nuse;
 117     //! Array of atoms indices, size nthreads, covering all nuse set elements in use
 118     std::vector<AtomIndex> atomIndex;
 119     //! List of tasks (force blocks) this task spread forces to
 120     std::vector<int>       spreadTask;
 121     //! List of tasks that write to this tasks force block range
 122     std::vector<int>       reduceTask;
 123
 124     InterdependentTask()
 125     {
 126         init_ilist(ilist);
 127         nuse = 0;
 128     }
 129 };
 130
 131 /*! \brief Vsite thread task data structure */
 132 struct VsiteThread {
 133     //! Start of atom range of this task
 134     int                rangeStart;
 135     //! End of atom range of this task
 136     int                rangeEnd;
 137     //! The interaction lists, only vsite entries are used
 138     t_ilist            ilist[F_NRE];
 139     //! Local fshift accumulation buffer
 140     rvec               fshift[SHIFTS];
 141     //! Local virial dx*df accumulation buffer
 142     matrix             dxdf;
 143     //! Tells if interdependent task idTask should be used (in addition to the rest of this task), this bool has the same value on all threads
 144     bool               useInterdependentTask;
 145     //! Data for vsites that involve constructing atoms in the atom range of other threads/tasks
 146     InterdependentTask idTask;
 147
 148     VsiteThread()
 149     {
 150         rangeStart            = 0;
 151         rangeEnd              = 0;
 152         init_ilist(ilist);
 153         clear_rvecs(SHIFTS, fshift);
 154         clear_mat(dxdf);
 155         useInterdependentTask = false;
 156     }
 157 };
 158
 159
 160 /* The start and end values of for the vsite indices in the ftype enum.
 161  * The validity of these values is checked in init_vsite.
 162  * This is used to avoid loops over all ftypes just to get the vsite entries.
 163  * (We should replace the fixed ilist array by only the used entries.)
 164  */
 165 static const int c_ftypeVsiteStart = F_VSITE2;
 166 static const int c_ftypeVsiteEnd   = F_VSITEN + 1;
 167
 168
 169 /* Returns the sum of the vsite ilist sizes over all vsite types */
 170 static int vsiteIlistNrCount(const t_ilist *ilist)
 171 {
 172     int nr = 0;
 173     for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
 174     {
 175         nr += ilist[ftype].nr;
 176     }
 177
 178     return nr;
 179 }
 180
 181 static int pbc_rvec_sub(const t_pbc *pbc, const rvec xi, const rvec xj, rvec dx)
 182 {
 183     if (pbc)
 184     {
 185         return pbc_dx_aiuc(pbc, xi, xj, dx);
 186     }
 187     else
 188     {
 189         rvec_sub(xi, xj, dx);
 190         return CENTRAL;
 191     }
 192 }
 193
 194 /* Vsite construction routines */
 195
 196 static void constr_vsite2(const rvec xi, const rvec xj, rvec x, real a, const t_pbc *pbc)
 197 {
 198     real b = 1 - a;
 199     /* 1 flop */
 200
 201     if (pbc)
 202     {
 203         rvec dx;
 204         pbc_dx_aiuc(pbc, xj, xi, dx);
 205         x[XX] = xi[XX] + a*dx[XX];
 206         x[YY] = xi[YY] + a*dx[YY];
 207         x[ZZ] = xi[ZZ] + a*dx[ZZ];
 208     }
 209     else
 210     {
 211         x[XX] = b*xi[XX] + a*xj[XX];
 212         x[YY] = b*xi[YY] + a*xj[YY];
 213         x[ZZ] = b*xi[ZZ] + a*xj[ZZ];
 214         /* 9 Flops */
 215     }
 216
 217     /* TOTAL: 10 flops */
 218 }
 219
 220 static void constr_vsite3(const rvec xi, const rvec xj, const rvec xk, rvec x, real a, real b,
 221                           const t_pbc *pbc)
 222 {
 223     real c = 1 - a - b;
 224     /* 2 flops */
 225
 226     if (pbc)
 227     {
 228         rvec dxj, dxk;
 229
 230         pbc_dx_aiuc(pbc, xj, xi, dxj);
 231         pbc_dx_aiuc(pbc, xk, xi, dxk);
 232         x[XX] = xi[XX] + a*dxj[XX] + b*dxk[XX];
 233         x[YY] = xi[YY] + a*dxj[YY] + b*dxk[YY];
 234         x[ZZ] = xi[ZZ] + a*dxj[ZZ] + b*dxk[ZZ];
 235     }
 236     else
 237     {
 238         x[XX] = c*xi[XX] + a*xj[XX] + b*xk[XX];
 239         x[YY] = c*xi[YY] + a*xj[YY] + b*xk[YY];
 240         x[ZZ] = c*xi[ZZ] + a*xj[ZZ] + b*xk[ZZ];
 241         /* 15 Flops */
 242     }
 243
 244     /* TOTAL: 17 flops */
 245 }
 246
 247 static void constr_vsite3FD(const rvec xi, const rvec xj, const rvec xk, rvec x, real a, real b,
 248                             const t_pbc *pbc)
 249 {
 250     rvec xij, xjk, temp;
 251     real c;
 252
 253     pbc_rvec_sub(pbc, xj, xi, xij);
 254     pbc_rvec_sub(pbc, xk, xj, xjk);
 255     /* 6 flops */
 256
 257     /* temp goes from i to a point on the line jk */
 258     temp[XX] = xij[XX] + a*xjk[XX];
 259     temp[YY] = xij[YY] + a*xjk[YY];
 260     temp[ZZ] = xij[ZZ] + a*xjk[ZZ];
 261     /* 6 flops */
 262
 263     c = b*gmx::invsqrt(iprod(temp, temp));
 264     /* 6 + 10 flops */
 265
 266     x[XX] = xi[XX] + c*temp[XX];
 267     x[YY] = xi[YY] + c*temp[YY];
 268     x[ZZ] = xi[ZZ] + c*temp[ZZ];
 269     /* 6 Flops */
 270
 271     /* TOTAL: 34 flops */
 272 }
 273
 274 static void constr_vsite3FAD(const rvec xi, const rvec xj, const rvec xk, rvec x, real a, real b, const t_pbc *pbc)
 275 {
 276     rvec xij, xjk, xp;
 277     real a1, b1, c1, invdij;
 278
 279     pbc_rvec_sub(pbc, xj, xi, xij);
 280     pbc_rvec_sub(pbc, xk, xj, xjk);
 281     /* 6 flops */
 282
 283     invdij = gmx::invsqrt(iprod(xij, xij));
 284     c1     = invdij * invdij * iprod(xij, xjk);
 285     xp[XX] = xjk[XX] - c1*xij[XX];
 286     xp[YY] = xjk[YY] - c1*xij[YY];
 287     xp[ZZ] = xjk[ZZ] - c1*xij[ZZ];
 288     a1     = a*invdij;
 289     b1     = b*gmx::invsqrt(iprod(xp, xp));
 290     /* 45 */
 291
 292     x[XX] = xi[XX] + a1*xij[XX] + b1*xp[XX];
 293     x[YY] = xi[YY] + a1*xij[YY] + b1*xp[YY];
 294     x[ZZ] = xi[ZZ] + a1*xij[ZZ] + b1*xp[ZZ];
 295     /* 12 Flops */
 296
 297     /* TOTAL: 63 flops */
 298 }
 299
 300 static void constr_vsite3OUT(const rvec xi, const rvec xj, const rvec xk, rvec x,
 301                              real a, real b, real c, const t_pbc *pbc)
 302 {
 303     rvec xij, xik, temp;
 304
 305     pbc_rvec_sub(pbc, xj, xi, xij);
 306     pbc_rvec_sub(pbc, xk, xi, xik);
 307     cprod(xij, xik, temp);
 308     /* 15 Flops */
 309
 310     x[XX] = xi[XX] + a*xij[XX] + b*xik[XX] + c*temp[XX];
 311     x[YY] = xi[YY] + a*xij[YY] + b*xik[YY] + c*temp[YY];
 312     x[ZZ] = xi[ZZ] + a*xij[ZZ] + b*xik[ZZ] + c*temp[ZZ];
 313     /* 18 Flops */
 314
 315     /* TOTAL: 33 flops */
 316 }
 317
 318 static void constr_vsite4FD(const rvec xi, const rvec xj, const rvec xk, const rvec xl, rvec x,
 319                             real a, real b, real c, const t_pbc *pbc)
 320 {
 321     rvec xij, xjk, xjl, temp;
 322     real d;
 323
 324     pbc_rvec_sub(pbc, xj, xi, xij);
 325     pbc_rvec_sub(pbc, xk, xj, xjk);
 326     pbc_rvec_sub(pbc, xl, xj, xjl);
 327     /* 9 flops */
 328
 329     /* temp goes from i to a point on the plane jkl */
 330     temp[XX] = xij[XX] + a*xjk[XX] + b*xjl[XX];
 331     temp[YY] = xij[YY] + a*xjk[YY] + b*xjl[YY];
 332     temp[ZZ] = xij[ZZ] + a*xjk[ZZ] + b*xjl[ZZ];
 333     /* 12 flops */
 334
 335     d = c*gmx::invsqrt(iprod(temp, temp));
 336     /* 6 + 10 flops */
 337
 338     x[XX] = xi[XX] + d*temp[XX];
 339     x[YY] = xi[YY] + d*temp[YY];
 340     x[ZZ] = xi[ZZ] + d*temp[ZZ];
 341     /* 6 Flops */
 342
 343     /* TOTAL: 43 flops */
 344 }
 345
 346 static void constr_vsite4FDN(const rvec xi, const rvec xj, const rvec xk, const rvec xl, rvec x,
 347                              real a, real b, real c, const t_pbc *pbc)
 348 {
 349     rvec xij, xik, xil, ra, rb, rja, rjb, rm;
 350     real d;
 351
 352     pbc_rvec_sub(pbc, xj, xi, xij);
 353     pbc_rvec_sub(pbc, xk, xi, xik);
 354     pbc_rvec_sub(pbc, xl, xi, xil);
 355     /* 9 flops */
 356
 357     ra[XX] = a*xik[XX];
 358     ra[YY] = a*xik[YY];
 359     ra[ZZ] = a*xik[ZZ];
 360
 361     rb[XX] = b*xil[XX];
 362     rb[YY] = b*xil[YY];
 363     rb[ZZ] = b*xil[ZZ];
 364
 365     /* 6 flops */
 366
 367     rvec_sub(ra, xij, rja);
 368     rvec_sub(rb, xij, rjb);
 369     /* 6 flops */
 370
 371     cprod(rja, rjb, rm);
 372     /* 9 flops */
 373
 374     d = c*gmx::invsqrt(norm2(rm));
 375     /* 5+5+1 flops */
 376
 377     x[XX] = xi[XX] + d*rm[XX];
 378     x[YY] = xi[YY] + d*rm[YY];
 379     x[ZZ] = xi[ZZ] + d*rm[ZZ];
 380     /* 6 Flops */
 381
 382     /* TOTAL: 47 flops */
 383 }
 384
 385
 386 static int constr_vsiten(const t_iatom *ia, const t_iparams ip[],
 387                          rvec *x, const t_pbc *pbc)
 388 {
 389     rvec x1, dx;
 390     dvec dsum;
 391     int  n3, av, ai;
 392     real a;
 393
 394     n3 = 3*ip[ia[0]].vsiten.n;
 395     av = ia[1];
 396     ai = ia[2];
 397     copy_rvec(x[ai], x1);
 398     clear_dvec(dsum);
 399     for (int i = 3; i < n3; i += 3)
 400     {
 401         ai = ia[i+2];
 402         a  = ip[ia[i]].vsiten.a;
 403         if (pbc)
 404         {
 405             pbc_dx_aiuc(pbc, x[ai], x1, dx);
 406         }
 407         else
 408         {
 409             rvec_sub(x[ai], x1, dx);
 410         }
 411         dsum[XX] += a*dx[XX];
 412         dsum[YY] += a*dx[YY];
 413         dsum[ZZ] += a*dx[ZZ];
 414         /* 9 Flops */
 415     }
 416
 417     x[av][XX] = x1[XX] + dsum[XX];
 418     x[av][YY] = x1[YY] + dsum[YY];
 419     x[av][ZZ] = x1[ZZ] + dsum[ZZ];
 420
 421     return n3;
 422 }
 423
 424
 425 static void construct_vsites_thread(const gmx_vsite_t *vsite,
 426                                     rvec x[],
 427                                     real dt, rvec *v,
 428                                     const t_iparams ip[], const t_ilist ilist[],
 429                                     const t_pbc *pbc_null)
 430 {
 431     gmx_bool     bPBCAll;
 432     real         inv_dt;
 433     const t_pbc *pbc_null2;
 434     int         *vsite_pbc;
 435
 436     if (v != nullptr)
 437     {
 438         inv_dt = 1.0/dt;
 439     }
 440     else
 441     {
 442         inv_dt = 1.0;
 443     }
 444
 445     bPBCAll = (pbc_null != nullptr && !vsite->bHaveChargeGroups);
 446
 447     pbc_null2 = nullptr;
 448     vsite_pbc = nullptr;
 449     for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
 450     {
 451         if (ilist[ftype].nr == 0)
 452         {
 453             continue;
 454         }
 455
 456         {   // TODO remove me
 457             int            nra = interaction_function[ftype].nratoms;
 458             int            inc = 1 + nra;
 459             int            nr  = ilist[ftype].nr;
 460
 461             const t_iatom *ia = ilist[ftype].iatoms;
 462
 463             if (bPBCAll)
 464             {
 465                 pbc_null2 = pbc_null;
 466             }
 467             else if (pbc_null != nullptr)
 468             {
 469                 vsite_pbc = vsite->vsite_pbc_loc[ftype - c_ftypeVsiteStart];
 470             }
 471
 472             for (int i = 0; i < nr; )
 473             {
 474                 int  tp     = ia[0];
 475                 /* The vsite and constructing atoms */
 476                 int  avsite = ia[1];
 477                 int  ai     = ia[2];
 478                 /* Constants for constructing vsites */
 479                 real a1     = ip[tp].vsite.a;
 480                 /* Check what kind of pbc we need to use */
 481                 int  pbc_atom;
 482                 rvec xpbc;
 483                 if (bPBCAll)
 484                 {
 485                     /* No charge groups, vsite follows its own pbc */
 486                     pbc_atom = avsite;
 487                     copy_rvec(x[avsite], xpbc);
 488                 }
 489                 else if (vsite_pbc != nullptr)
 490                 {
 491                     pbc_atom = vsite_pbc[i/(1 + nra)];
 492                     if (pbc_atom > -2)
 493                     {
 494                         if (pbc_atom >= 0)
 495                         {
 496                             /* We need to copy the coordinates here,
 497                              * single for single atom cg's pbc_atom
 498                              * is the vsite itself.
 499                              */
 500                             copy_rvec(x[pbc_atom], xpbc);
 501                         }
 502                         pbc_null2 = pbc_null;
 503                     }
 504                     else
 505                     {
 506                         pbc_null2 = nullptr;
 507                     }
 508                 }
 509                 else
 510                 {
 511                     pbc_atom = -2;
 512                 }
 513                 /* Copy the old position */
 514                 rvec xv;
 515                 copy_rvec(x[avsite], xv);
 516
 517                 /* Construct the vsite depending on type */
 518                 int  aj, ak, al;
 519                 real b1, c1;
 520                 switch (ftype)
 521                 {
 522                     case F_VSITE2:
 523                         aj = ia[3];
 524                         constr_vsite2(x[ai], x[aj], x[avsite], a1, pbc_null2);
 525                         break;
 526                     case F_VSITE3:
 527                         aj = ia[3];
 528                         ak = ia[4];
 529                         b1 = ip[tp].vsite.b;
 530                         constr_vsite3(x[ai], x[aj], x[ak], x[avsite], a1, b1, pbc_null2);
 531                         break;
 532                     case F_VSITE3FD:
 533                         aj = ia[3];
 534                         ak = ia[4];
 535                         b1 = ip[tp].vsite.b;
 536                         constr_vsite3FD(x[ai], x[aj], x[ak], x[avsite], a1, b1, pbc_null2);
 537                         break;
 538                     case F_VSITE3FAD:
 539                         aj = ia[3];
 540                         ak = ia[4];
 541                         b1 = ip[tp].vsite.b;
 542                         constr_vsite3FAD(x[ai], x[aj], x[ak], x[avsite], a1, b1, pbc_null2);
 543                         break;
 544                     case F_VSITE3OUT:
 545                         aj = ia[3];
 546                         ak = ia[4];
 547                         b1 = ip[tp].vsite.b;
 548                         c1 = ip[tp].vsite.c;
 549                         constr_vsite3OUT(x[ai], x[aj], x[ak], x[avsite], a1, b1, c1, pbc_null2);
 550                         break;
 551                     case F_VSITE4FD:
 552                         aj = ia[3];
 553                         ak = ia[4];
 554                         al = ia[5];
 555                         b1 = ip[tp].vsite.b;
 556                         c1 = ip[tp].vsite.c;
 557                         constr_vsite4FD(x[ai], x[aj], x[ak], x[al], x[avsite], a1, b1, c1,
 558                                         pbc_null2);
 559                         break;
 560                     case F_VSITE4FDN:
 561                         aj = ia[3];
 562                         ak = ia[4];
 563                         al = ia[5];
 564                         b1 = ip[tp].vsite.b;
 565                         c1 = ip[tp].vsite.c;
 566                         constr_vsite4FDN(x[ai], x[aj], x[ak], x[al], x[avsite], a1, b1, c1,
 567                                          pbc_null2);
 568                         break;
 569                     case F_VSITEN:
 570                         inc = constr_vsiten(ia, ip, x, pbc_null2);
 571                         break;
 572                     default:
 573                         gmx_fatal(FARGS, "No such vsite type %d in %s, line %d",
 574                                   ftype, __FILE__, __LINE__);
 575                 }
 576
 577                 if (pbc_atom >= 0)
 578                 {
 579                     /* Match the pbc of this vsite to the rest of its charge group */
 580                     rvec dx;
 581                     int  ishift = pbc_dx_aiuc(pbc_null, x[avsite], xpbc, dx);
 582                     if (ishift != CENTRAL)
 583                     {
 584                         rvec_add(xpbc, dx, x[avsite]);
 585                     }
 586                 }
 587                 if (v != nullptr)
 588                 {
 589                     /* Calculate velocity of vsite... */
 590                     rvec vv;
 591                     rvec_sub(x[avsite], xv, vv);
 592                     svmul(inv_dt, vv, v[avsite]);
 593                 }
 594
 595                 /* Increment loop variables */
 596                 i  += inc;
 597                 ia += inc;
 598             }
 599         }
 600     }
 601 }
 602
 603 void construct_vsites(const gmx_vsite_t *vsite,
 604                       rvec x[],
 605                       real dt, rvec *v,
 606                       const t_iparams ip[], const t_ilist ilist[],
 607                       int ePBC, gmx_bool bMolPBC,
 608                       t_commrec *cr, matrix box)
 609 {
 610     t_pbc     pbc, *pbc_null;
 611
 612     gmx_bool  bDomDec = cr && DOMAINDECOMP(cr);
 613
 614     /* We only need to do pbc when we have inter-cg vsites */
 615     if (ePBC != epbcNONE && (bDomDec || bMolPBC) && vsite->n_intercg_vsite)
 616     {
 617         /* This is wasting some CPU time as we now do this multiple times
 618          * per MD step.
 619          */
 620         ivec null_ivec;
 621         clear_ivec(null_ivec);
 622         pbc_null = set_pbc_dd(&pbc, ePBC,
 623                               DOMAINDECOMP(cr) ? cr->dd->nc : null_ivec,
 624                               FALSE, box);
 625     }
 626     else
 627     {
 628         pbc_null = nullptr;
 629     }
 630
 631     if (bDomDec)
 632     {
 633         dd_move_x_vsites(cr->dd, box, x);
 634     }
 635
 636     if (vsite->nthreads == 1)
 637     {
 638         construct_vsites_thread(vsite,
 639                                 x, dt, v,
 640                                 ip, ilist,
 641                                 pbc_null);
 642     }
 643     else
 644     {
 645 #pragma omp parallel num_threads(vsite->nthreads)
 646         {
 647             try
 648             {
 649                 int th = gmx_omp_get_thread_num();
 650                 construct_vsites_thread(vsite,
 651                                         x, dt, v,
 652                                         ip, vsite->tData[th]->ilist,
 653                                         pbc_null);
 654                 if (vsite->tData[th]->useInterdependentTask)
 655                 {
 656                     /* Here we don't need a barrier (unlike the spreading),
 657                      * since both tasks only construct vsites from particles,
 658                      * or local vsites, not from non-local vsites.
 659                      */
 660                     construct_vsites_thread(vsite,
 661                                             x, dt, v,
 662                                             ip, vsite->tData[th]->idTask.ilist,
 663                                             pbc_null);
 664                 }
 665             }
 666             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
 667         }
 668         /* Now we can construct the vsites that might depend on other vsites */
 669         construct_vsites_thread(vsite,
 670                                 x, dt, v,
 671                                 ip, vsite->tData[vsite->nthreads]->ilist,
 672                                 pbc_null);
 673     }
 674 }
 675
 676 static void spread_vsite2(const t_iatom ia[], real a,
 677                           const rvec x[],
 678                           rvec f[], rvec fshift[],
 679                           const t_pbc *pbc, const t_graph *g)
 680 {
 681     rvec    fi, fj, dx;
 682     t_iatom av, ai, aj;
 683     ivec    di;
 684     int     siv, sij;
 685
 686     av = ia[1];
 687     ai = ia[2];
 688     aj = ia[3];
 689
 690     svmul(1 - a, f[av], fi);
 691     svmul(    a, f[av], fj);
 692     /* 7 flop */
 693
 694     rvec_inc(f[ai], fi);
 695     rvec_inc(f[aj], fj);
 696     /* 6 Flops */
 697
 698     if (g)
 699     {
 700         ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, av), di);
 701         siv = IVEC2IS(di);
 702         ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), di);
 703         sij = IVEC2IS(di);
 704     }
 705     else if (pbc)
 706     {
 707         siv = pbc_dx_aiuc(pbc, x[ai], x[av], dx);
 708         sij = pbc_dx_aiuc(pbc, x[ai], x[aj], dx);
 709     }
 710     else
 711     {
 712         siv = CENTRAL;
 713         sij = CENTRAL;
 714     }
 715
 716     if (fshift && (siv != CENTRAL || sij != CENTRAL))
 717     {
 718         rvec_inc(fshift[siv], f[av]);
 719         rvec_dec(fshift[CENTRAL], fi);
 720         rvec_dec(fshift[sij], fj);
 721     }
 722
 723     /* TOTAL: 13 flops */
 724 }
 725
 726 void construct_vsites_mtop(gmx_vsite_t *vsite,
 727                            gmx_mtop_t *mtop, rvec x[])
 728 {
 729     int as = 0;
 730     for (int mb = 0; mb < mtop->nmolblock; mb++)
 731     {
 732         const gmx_molblock_t *molb = &mtop->molblock[mb];
 733         const gmx_moltype_t  *molt = &mtop->moltype[molb->type];
 734         for (int mol = 0; mol < molb->nmol; mol++)
 735         {
 736             construct_vsites(vsite, x+as, 0.0, nullptr,
 737                              mtop->ffparams.iparams, molt->ilist,
 738                              epbcNONE, TRUE, nullptr, nullptr);
 739             as += molt->atoms.nr;
 740         }
 741     }
 742 }
 743
 744 static void spread_vsite3(const t_iatom ia[], real a, real b,
 745                           const rvec x[],
 746                           rvec f[], rvec fshift[],
 747                           const t_pbc *pbc, const t_graph *g)
 748 {
 749     rvec    fi, fj, fk, dx;
 750     int     av, ai, aj, ak;
 751     ivec    di;
 752     int     siv, sij, sik;
 753
 754     av = ia[1];
 755     ai = ia[2];
 756     aj = ia[3];
 757     ak = ia[4];
 758
 759     svmul(1 - a - b, f[av], fi);
 760     svmul(        a, f[av], fj);
 761     svmul(        b, f[av], fk);
 762     /* 11 flops */
 763
 764     rvec_inc(f[ai], fi);
 765     rvec_inc(f[aj], fj);
 766     rvec_inc(f[ak], fk);
 767     /* 9 Flops */
 768
 769     if (g)
 770     {
 771         ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, ia[1]), di);
 772         siv = IVEC2IS(di);
 773         ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), di);
 774         sij = IVEC2IS(di);
 775         ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, ak), di);
 776         sik = IVEC2IS(di);
 777     }
 778     else if (pbc)
 779     {
 780         siv = pbc_dx_aiuc(pbc, x[ai], x[av], dx);
 781         sij = pbc_dx_aiuc(pbc, x[ai], x[aj], dx);
 782         sik = pbc_dx_aiuc(pbc, x[ai], x[ak], dx);
 783     }
 784     else
 785     {
 786         siv = CENTRAL;
 787         sij = CENTRAL;
 788         sik = CENTRAL;
 789     }
 790
 791     if (fshift && (siv != CENTRAL || sij != CENTRAL || sik != CENTRAL))
 792     {
 793         rvec_inc(fshift[siv], f[av]);
 794         rvec_dec(fshift[CENTRAL], fi);
 795         rvec_dec(fshift[sij], fj);
 796         rvec_dec(fshift[sik], fk);
 797     }
 798
 799     /* TOTAL: 20 flops */
 800 }
 801
 802 static void spread_vsite3FD(const t_iatom ia[], real a, real b,
 803                             const rvec x[],
 804                             rvec f[], rvec fshift[],
 805                             gmx_bool VirCorr, matrix dxdf,
 806                             const t_pbc *pbc, const t_graph *g)
 807 {
 808     real    c, invl, fproj, a1;
 809     rvec    xvi, xij, xjk, xix, fv, temp;
 810     t_iatom av, ai, aj, ak;
 811     int     svi, sji, skj;
 812     ivec    di;
 813
 814     av = ia[1];
 815     ai = ia[2];
 816     aj = ia[3];
 817     ak = ia[4];
 818     copy_rvec(f[av], fv);
 819
 820     sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
 821     skj = pbc_rvec_sub(pbc, x[ak], x[aj], xjk);
 822     /* 6 flops */
 823
 824     /* xix goes from i to point x on the line jk */
 825     xix[XX] = xij[XX]+a*xjk[XX];
 826     xix[YY] = xij[YY]+a*xjk[YY];
 827     xix[ZZ] = xij[ZZ]+a*xjk[ZZ];
 828     /* 6 flops */
 829
 830     invl = gmx::invsqrt(iprod(xix, xix));
 831     c    = b*invl;
 832     /* 4 + ?10? flops */
 833
 834     fproj = iprod(xix, fv)*invl*invl; /* = (xix . f)/(xix . xix) */
 835
 836     temp[XX] = c*(fv[XX]-fproj*xix[XX]);
 837     temp[YY] = c*(fv[YY]-fproj*xix[YY]);
 838     temp[ZZ] = c*(fv[ZZ]-fproj*xix[ZZ]);
 839     /* 16 */
 840
 841     /* c is already calculated in constr_vsite3FD
 842        storing c somewhere will save 26 flops!     */
 843
 844     a1         = 1 - a;
 845     f[ai][XX] += fv[XX] - temp[XX];
 846     f[ai][YY] += fv[YY] - temp[YY];
 847     f[ai][ZZ] += fv[ZZ] - temp[ZZ];
 848     f[aj][XX] += a1*temp[XX];
 849     f[aj][YY] += a1*temp[YY];
 850     f[aj][ZZ] += a1*temp[ZZ];
 851     f[ak][XX] += a*temp[XX];
 852     f[ak][YY] += a*temp[YY];
 853     f[ak][ZZ] += a*temp[ZZ];
 854     /* 19 Flops */
 855
 856     if (g)
 857     {
 858         ivec_sub(SHIFT_IVEC(g, ia[1]), SHIFT_IVEC(g, ai), di);
 859         svi = IVEC2IS(di);
 860         ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
 861         sji = IVEC2IS(di);
 862         ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, aj), di);
 863         skj = IVEC2IS(di);
 864     }
 865     else if (pbc)
 866     {
 867         svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
 868     }
 869     else
 870     {
 871         svi = CENTRAL;
 872     }
 873
 874     if (fshift && (svi != CENTRAL || sji != CENTRAL || skj != CENTRAL))
 875     {
 876         rvec_dec(fshift[svi], fv);
 877         fshift[CENTRAL][XX] += fv[XX] - (1 + a)*temp[XX];
 878         fshift[CENTRAL][YY] += fv[YY] - (1 + a)*temp[YY];
 879         fshift[CENTRAL][ZZ] += fv[ZZ] - (1 + a)*temp[ZZ];
 880         fshift[    sji][XX] += temp[XX];
 881         fshift[    sji][YY] += temp[YY];
 882         fshift[    sji][ZZ] += temp[ZZ];
 883         fshift[    skj][XX] += a*temp[XX];
 884         fshift[    skj][YY] += a*temp[YY];
 885         fshift[    skj][ZZ] += a*temp[ZZ];
 886     }
 887
 888     if (VirCorr)
 889     {
 890         /* When VirCorr=TRUE, the virial for the current forces is not
 891          * calculated from the redistributed forces. This means that
 892          * the effect of non-linear virtual site constructions on the virial
 893          * needs to be added separately. This contribution can be calculated
 894          * in many ways, but the simplest and cheapest way is to use
 895          * the first constructing atom ai as a reference position in space:
 896          * subtract (xv-xi)*fv and add (xj-xi)*fj + (xk-xi)*fk.
 897          */
 898         rvec xiv;
 899
 900         pbc_rvec_sub(pbc, x[av], x[ai], xiv);
 901
 902         for (int i = 0; i < DIM; i++)
 903         {
 904             for (int j = 0; j < DIM; j++)
 905             {
 906                 /* As xix is a linear combination of j and k, use that here */
 907                 dxdf[i][j] += -xiv[i]*fv[j] + xix[i]*temp[j];
 908             }
 909         }
 910     }
 911
 912     /* TOTAL: 61 flops */
 913 }
 914
 915 static void spread_vsite3FAD(const t_iatom ia[], real a, real b,
 916                              const rvec x[],
 917                              rvec f[], rvec fshift[],
 918                              gmx_bool VirCorr, matrix dxdf,
 919                              const t_pbc *pbc, const t_graph *g)
 920 {
 921     rvec    xvi, xij, xjk, xperp, Fpij, Fppp, fv, f1, f2, f3;
 922     real    a1, b1, c1, c2, invdij, invdij2, invdp, fproj;
 923     t_iatom av, ai, aj, ak;
 924     int     svi, sji, skj, d;
 925     ivec    di;
 926
 927     av = ia[1];
 928     ai = ia[2];
 929     aj = ia[3];
 930     ak = ia[4];
 931     copy_rvec(f[ia[1]], fv);
 932
 933     sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
 934     skj = pbc_rvec_sub(pbc, x[ak], x[aj], xjk);
 935     /* 6 flops */
 936
 937     invdij    = gmx::invsqrt(iprod(xij, xij));
 938     invdij2   = invdij * invdij;
 939     c1        = iprod(xij, xjk) * invdij2;
 940     xperp[XX] = xjk[XX] - c1*xij[XX];
 941     xperp[YY] = xjk[YY] - c1*xij[YY];
 942     xperp[ZZ] = xjk[ZZ] - c1*xij[ZZ];
 943     /* xperp in plane ijk, perp. to ij */
 944     invdp = gmx::invsqrt(iprod(xperp, xperp));
 945     a1    = a*invdij;
 946     b1    = b*invdp;
 947     /* 45 flops */
 948
 949     /* a1, b1 and c1 are already calculated in constr_vsite3FAD
 950        storing them somewhere will save 45 flops!     */
 951
 952     fproj = iprod(xij, fv)*invdij2;
 953     svmul(fproj,                      xij,  Fpij);    /* proj. f on xij */
 954     svmul(iprod(xperp, fv)*invdp*invdp, xperp, Fppp); /* proj. f on xperp */
 955     svmul(b1*fproj,                   xperp, f3);
 956     /* 23 flops */
 957
 958     rvec_sub(fv, Fpij, f1); /* f1 = f - Fpij */
 959     rvec_sub(f1, Fppp, f2); /* f2 = f - Fpij - Fppp */
 960     for (d = 0; (d < DIM); d++)
 961     {
 962         f1[d] *= a1;
 963         f2[d] *= b1;
 964     }
 965     /* 12 flops */
 966
 967     c2         = 1 + c1;
 968     f[ai][XX] += fv[XX] - f1[XX] + c1*f2[XX] + f3[XX];
 969     f[ai][YY] += fv[YY] - f1[YY] + c1*f2[YY] + f3[YY];
 970     f[ai][ZZ] += fv[ZZ] - f1[ZZ] + c1*f2[ZZ] + f3[ZZ];
 971     f[aj][XX] +=          f1[XX] - c2*f2[XX] - f3[XX];
 972     f[aj][YY] +=          f1[YY] - c2*f2[YY] - f3[YY];
 973     f[aj][ZZ] +=          f1[ZZ] - c2*f2[ZZ] - f3[ZZ];
 974     f[ak][XX] +=                      f2[XX];
 975     f[ak][YY] +=                      f2[YY];
 976     f[ak][ZZ] +=                      f2[ZZ];
 977     /* 30 Flops */
 978
 979     if (g)
 980     {
 981         ivec_sub(SHIFT_IVEC(g, ia[1]), SHIFT_IVEC(g, ai), di);
 982         svi = IVEC2IS(di);
 983         ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
 984         sji = IVEC2IS(di);
 985         ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, aj), di);
 986         skj = IVEC2IS(di);
 987     }
 988     else if (pbc)
 989     {
 990         svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
 991     }
 992     else
 993     {
 994         svi = CENTRAL;
 995     }
 996
 997     if (fshift && (svi != CENTRAL || sji != CENTRAL || skj != CENTRAL))
 998     {
 999         rvec_dec(fshift[svi], fv);
1000         fshift[CENTRAL][XX] += fv[XX] - f1[XX] - (1-c1)*f2[XX] + f3[XX];
1001         fshift[CENTRAL][YY] += fv[YY] - f1[YY] - (1-c1)*f2[YY] + f3[YY];
1002         fshift[CENTRAL][ZZ] += fv[ZZ] - f1[ZZ] - (1-c1)*f2[ZZ] + f3[ZZ];
1003         fshift[    sji][XX] +=          f1[XX] -    c1 *f2[XX] - f3[XX];
1004         fshift[    sji][YY] +=          f1[YY] -    c1 *f2[YY] - f3[YY];
1005         fshift[    sji][ZZ] +=          f1[ZZ] -    c1 *f2[ZZ] - f3[ZZ];
1006         fshift[    skj][XX] +=                          f2[XX];
1007         fshift[    skj][YY] +=                          f2[YY];
1008         fshift[    skj][ZZ] +=                          f2[ZZ];
1009     }
1010
1011     if (VirCorr)
1012     {
1013         rvec xiv;
1014         int  i, j;
1015
1016         pbc_rvec_sub(pbc, x[av], x[ai], xiv);
1017
1018         for (i = 0; i < DIM; i++)
1019         {
1020             for (j = 0; j < DIM; j++)
1021             {
1022                 /* Note that xik=xij+xjk, so we have to add xij*f2 */
1023                 dxdf[i][j] +=
1024                     -xiv[i]*fv[j]
1025                     + xij[i]*(f1[j] + (1 - c2)*f2[j] - f3[j])
1026                     + xjk[i]*f2[j];
1027             }
1028         }
1029     }
1030
1031     /* TOTAL: 113 flops */
1032 }
1033
1034 static void spread_vsite3OUT(const t_iatom ia[], real a, real b, real c,
1035                              const rvec x[],
1036                              rvec f[], rvec fshift[],
1037                              gmx_bool VirCorr, matrix dxdf,
1038                              const t_pbc *pbc, const t_graph *g)
1039 {
1040     rvec    xvi, xij, xik, fv, fj, fk;
1041     real    cfx, cfy, cfz;
1042     int     av, ai, aj, ak;
1043     ivec    di;
1044     int     svi, sji, ski;
1045
1046     av = ia[1];
1047     ai = ia[2];
1048     aj = ia[3];
1049     ak = ia[4];
1050
1051     sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
1052     ski = pbc_rvec_sub(pbc, x[ak], x[ai], xik);
1053     /* 6 Flops */
1054
1055     copy_rvec(f[av], fv);
1056
1057     cfx = c*fv[XX];
1058     cfy = c*fv[YY];
1059     cfz = c*fv[ZZ];
1060     /* 3 Flops */
1061
1062     fj[XX] = a*fv[XX]     -  xik[ZZ]*cfy +  xik[YY]*cfz;
1063     fj[YY] =  xik[ZZ]*cfx + a*fv[YY]     -  xik[XX]*cfz;
1064     fj[ZZ] = -xik[YY]*cfx +  xik[XX]*cfy + a*fv[ZZ];
1065
1066     fk[XX] = b*fv[XX]     +  xij[ZZ]*cfy -  xij[YY]*cfz;
1067     fk[YY] = -xij[ZZ]*cfx + b*fv[YY]     +  xij[XX]*cfz;
1068     fk[ZZ] =  xij[YY]*cfx -  xij[XX]*cfy + b*fv[ZZ];
1069     /* 30 Flops */
1070
1071     f[ai][XX] += fv[XX] - fj[XX] - fk[XX];
1072     f[ai][YY] += fv[YY] - fj[YY] - fk[YY];
1073     f[ai][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ];
1074     rvec_inc(f[aj], fj);
1075     rvec_inc(f[ak], fk);
1076     /* 15 Flops */
1077
1078     if (g)
1079     {
1080         ivec_sub(SHIFT_IVEC(g, ia[1]), SHIFT_IVEC(g, ai), di);
1081         svi = IVEC2IS(di);
1082         ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
1083         sji = IVEC2IS(di);
1084         ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, ai), di);
1085         ski = IVEC2IS(di);
1086     }
1087     else if (pbc)
1088     {
1089         svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
1090     }
1091     else
1092     {
1093         svi = CENTRAL;
1094     }
1095
1096     if (fshift && (svi != CENTRAL || sji != CENTRAL || ski != CENTRAL))
1097     {
1098         rvec_dec(fshift[svi], fv);
1099         fshift[CENTRAL][XX] += fv[XX] - fj[XX] - fk[XX];
1100         fshift[CENTRAL][YY] += fv[YY] - fj[YY] - fk[YY];
1101         fshift[CENTRAL][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ];
1102         rvec_inc(fshift[sji], fj);
1103         rvec_inc(fshift[ski], fk);
1104     }
1105
1106     if (VirCorr)
1107     {
1108         rvec xiv;
1109
1110         pbc_rvec_sub(pbc, x[av], x[ai], xiv);
1111
1112         for (int i = 0; i < DIM; i++)
1113         {
1114             for (int j = 0; j < DIM; j++)
1115             {
1116                 dxdf[i][j] += -xiv[i]*fv[j] + xij[i]*fj[j] + xik[i]*fk[j];
1117             }
1118         }
1119     }
1120
1121     /* TOTAL: 54 flops */
1122 }
1123
1124 static void spread_vsite4FD(const t_iatom ia[], real a, real b, real c,
1125                             const rvec x[],
1126                             rvec f[], rvec fshift[],
1127                             gmx_bool VirCorr, matrix dxdf,
1128                             const t_pbc *pbc, const t_graph *g)
1129 {
1130     real    d, invl, fproj, a1;
1131     rvec    xvi, xij, xjk, xjl, xix, fv, temp;
1132     int     av, ai, aj, ak, al;
1133     ivec    di;
1134     int     svi, sji, skj, slj, m;
1135
1136     av = ia[1];
1137     ai = ia[2];
1138     aj = ia[3];
1139     ak = ia[4];
1140     al = ia[5];
1141
1142     sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
1143     skj = pbc_rvec_sub(pbc, x[ak], x[aj], xjk);
1144     slj = pbc_rvec_sub(pbc, x[al], x[aj], xjl);
1145     /* 9 flops */
1146
1147     /* xix goes from i to point x on the plane jkl */
1148     for (m = 0; m < DIM; m++)
1149     {
1150         xix[m] = xij[m] + a*xjk[m] + b*xjl[m];
1151     }
1152     /* 12 flops */
1153
1154     invl = gmx::invsqrt(iprod(xix, xix));
1155     d    = c*invl;
1156     /* 4 + ?10? flops */
1157
1158     copy_rvec(f[av], fv);
1159
1160     fproj = iprod(xix, fv)*invl*invl; /* = (xix . f)/(xix . xix) */
1161
1162     for (m = 0; m < DIM; m++)
1163     {
1164         temp[m] = d*(fv[m] - fproj*xix[m]);
1165     }
1166     /* 16 */
1167
1168     /* c is already calculated in constr_vsite3FD
1169        storing c somewhere will save 35 flops!     */
1170
1171     a1 = 1 - a - b;
1172     for (m = 0; m < DIM; m++)
1173     {
1174         f[ai][m] += fv[m] - temp[m];
1175         f[aj][m] += a1*temp[m];
1176         f[ak][m] += a*temp[m];
1177         f[al][m] += b*temp[m];
1178     }
1179     /* 26 Flops */
1180
1181     if (g)
1182     {
1183         ivec_sub(SHIFT_IVEC(g, ia[1]), SHIFT_IVEC(g, ai), di);
1184         svi = IVEC2IS(di);
1185         ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
1186         sji = IVEC2IS(di);
1187         ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, aj), di);
1188         skj = IVEC2IS(di);
1189         ivec_sub(SHIFT_IVEC(g, al), SHIFT_IVEC(g, aj), di);
1190         slj = IVEC2IS(di);
1191     }
1192     else if (pbc)
1193     {
1194         svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
1195     }
1196     else
1197     {
1198         svi = CENTRAL;
1199     }
1200
1201     if (fshift &&
1202         (svi != CENTRAL || sji != CENTRAL || skj != CENTRAL || slj != CENTRAL))
1203     {
1204         rvec_dec(fshift[svi], fv);
1205         for (m = 0; m < DIM; m++)
1206         {
1207             fshift[CENTRAL][m] += fv[m] - (1 + a + b)*temp[m];
1208             fshift[    sji][m] += temp[m];
1209             fshift[    skj][m] += a*temp[m];
1210             fshift[    slj][m] += b*temp[m];
1211         }
1212     }
1213
1214     if (VirCorr)
1215     {
1216         rvec xiv;
1217         int  i, j;
1218
1219         pbc_rvec_sub(pbc, x[av], x[ai], xiv);
1220
1221         for (i = 0; i < DIM; i++)
1222         {
1223             for (j = 0; j < DIM; j++)
1224             {
1225                 dxdf[i][j] += -xiv[i]*fv[j] + xix[i]*temp[j];
1226             }
1227         }
1228     }
1229
1230     /* TOTAL: 77 flops */
1231 }
1232
1233
1234 static void spread_vsite4FDN(const t_iatom ia[], real a, real b, real c,
1235                              const rvec x[],
1236                              rvec f[], rvec fshift[],
1237                              gmx_bool VirCorr, matrix dxdf,
1238                              const t_pbc *pbc, const t_graph *g)
1239 {
1240     rvec xvi, xij, xik, xil, ra, rb, rja, rjb, rab, rm, rt;
1241     rvec fv, fj, fk, fl;
1242     real invrm, denom;
1243     real cfx, cfy, cfz;
1244     ivec di;
1245     int  av, ai, aj, ak, al;
1246     int  svi, sij, sik, sil;
1247
1248     /* DEBUG: check atom indices */
1249     av = ia[1];
1250     ai = ia[2];
1251     aj = ia[3];
1252     ak = ia[4];
1253     al = ia[5];
1254
1255     copy_rvec(f[av], fv);
1256
1257     sij = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
1258     sik = pbc_rvec_sub(pbc, x[ak], x[ai], xik);
1259     sil = pbc_rvec_sub(pbc, x[al], x[ai], xil);
1260     /* 9 flops */
1261
1262     ra[XX] = a*xik[XX];
1263     ra[YY] = a*xik[YY];
1264     ra[ZZ] = a*xik[ZZ];
1265
1266     rb[XX] = b*xil[XX];
1267     rb[YY] = b*xil[YY];
1268     rb[ZZ] = b*xil[ZZ];
1269
1270     /* 6 flops */
1271
1272     rvec_sub(ra, xij, rja);
1273     rvec_sub(rb, xij, rjb);
1274     rvec_sub(rb, ra, rab);
1275     /* 9 flops */
1276
1277     cprod(rja, rjb, rm);
1278     /* 9 flops */
1279
1280     invrm = gmx::invsqrt(norm2(rm));
1281     denom = invrm*invrm;
1282     /* 5+5+2 flops */
1283
1284     cfx = c*invrm*fv[XX];
1285     cfy = c*invrm*fv[YY];
1286     cfz = c*invrm*fv[ZZ];
1287     /* 6 Flops */
1288
1289     cprod(rm, rab, rt);
1290     /* 9 flops */
1291
1292     rt[XX] *= denom;
1293     rt[YY] *= denom;
1294     rt[ZZ] *= denom;
1295     /* 3flops */
1296
1297     fj[XX] = (        -rm[XX]*rt[XX]) * cfx + ( rab[ZZ]-rm[YY]*rt[XX]) * cfy + (-rab[YY]-rm[ZZ]*rt[XX]) * cfz;
1298     fj[YY] = (-rab[ZZ]-rm[XX]*rt[YY]) * cfx + (        -rm[YY]*rt[YY]) * cfy + ( rab[XX]-rm[ZZ]*rt[YY]) * cfz;
1299     fj[ZZ] = ( rab[YY]-rm[XX]*rt[ZZ]) * cfx + (-rab[XX]-rm[YY]*rt[ZZ]) * cfy + (        -rm[ZZ]*rt[ZZ]) * cfz;
1300     /* 30 flops */
1301
1302     cprod(rjb, rm, rt);
1303     /* 9 flops */
1304
1305     rt[XX] *= denom*a;
1306     rt[YY] *= denom*a;
1307     rt[ZZ] *= denom*a;
1308     /* 3flops */
1309
1310     fk[XX] = (          -rm[XX]*rt[XX]) * cfx + (-a*rjb[ZZ]-rm[YY]*rt[XX]) * cfy + ( a*rjb[YY]-rm[ZZ]*rt[XX]) * cfz;
1311     fk[YY] = ( a*rjb[ZZ]-rm[XX]*rt[YY]) * cfx + (          -rm[YY]*rt[YY]) * cfy + (-a*rjb[XX]-rm[ZZ]*rt[YY]) * cfz;
1312     fk[ZZ] = (-a*rjb[YY]-rm[XX]*rt[ZZ]) * cfx + ( a*rjb[XX]-rm[YY]*rt[ZZ]) * cfy + (          -rm[ZZ]*rt[ZZ]) * cfz;
1313     /* 36 flops */
1314
1315     cprod(rm, rja, rt);
1316     /* 9 flops */
1317
1318     rt[XX] *= denom*b;
1319     rt[YY] *= denom*b;
1320     rt[ZZ] *= denom*b;
1321     /* 3flops */
1322
1323     fl[XX] = (          -rm[XX]*rt[XX]) * cfx + ( b*rja[ZZ]-rm[YY]*rt[XX]) * cfy + (-b*rja[YY]-rm[ZZ]*rt[XX]) * cfz;
1324     fl[YY] = (-b*rja[ZZ]-rm[XX]*rt[YY]) * cfx + (          -rm[YY]*rt[YY]) * cfy + ( b*rja[XX]-rm[ZZ]*rt[YY]) * cfz;
1325     fl[ZZ] = ( b*rja[YY]-rm[XX]*rt[ZZ]) * cfx + (-b*rja[XX]-rm[YY]*rt[ZZ]) * cfy + (          -rm[ZZ]*rt[ZZ]) * cfz;
1326     /* 36 flops */
1327
1328     f[ai][XX] += fv[XX] - fj[XX] - fk[XX] - fl[XX];
1329     f[ai][YY] += fv[YY] - fj[YY] - fk[YY] - fl[YY];
1330     f[ai][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ] - fl[ZZ];
1331     rvec_inc(f[aj], fj);
1332     rvec_inc(f[ak], fk);
1333     rvec_inc(f[al], fl);
1334     /* 21 flops */
1335
1336     if (g)
1337     {
1338         ivec_sub(SHIFT_IVEC(g, av), SHIFT_IVEC(g, ai), di);
1339         svi = IVEC2IS(di);
1340         ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
1341         sij = IVEC2IS(di);
1342         ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, ai), di);
1343         sik = IVEC2IS(di);
1344         ivec_sub(SHIFT_IVEC(g, al), SHIFT_IVEC(g, ai), di);
1345         sil = IVEC2IS(di);
1346     }
1347     else if (pbc)
1348     {
1349         svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
1350     }
1351     else
1352     {
1353         svi = CENTRAL;
1354     }
1355
1356     if (fshift && (svi != CENTRAL || sij != CENTRAL || sik != CENTRAL || sil != CENTRAL))
1357     {
1358         rvec_dec(fshift[svi], fv);
1359         fshift[CENTRAL][XX] += fv[XX] - fj[XX] - fk[XX] - fl[XX];
1360         fshift[CENTRAL][YY] += fv[YY] - fj[YY] - fk[YY] - fl[YY];
1361         fshift[CENTRAL][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ] - fl[ZZ];
1362         rvec_inc(fshift[sij], fj);
1363         rvec_inc(fshift[sik], fk);
1364         rvec_inc(fshift[sil], fl);
1365     }
1366
1367     if (VirCorr)
1368     {
1369         rvec xiv;
1370         int  i, j;
1371
1372         pbc_rvec_sub(pbc, x[av], x[ai], xiv);
1373
1374         for (i = 0; i < DIM; i++)
1375         {
1376             for (j = 0; j < DIM; j++)
1377             {
1378                 dxdf[i][j] += -xiv[i]*fv[j] + xij[i]*fj[j] + xik[i]*fk[j] + xil[i]*fl[j];
1379             }
1380         }
1381     }
1382
1383     /* Total: 207 flops (Yuck!) */
1384 }
1385
1386
1387 static int spread_vsiten(const t_iatom ia[], const t_iparams ip[],
1388                          const rvec x[],
1389                          rvec f[], rvec fshift[],
1390                          const t_pbc *pbc, const t_graph *g)
1391 {
1392     rvec xv, dx, fi;
1393     int  n3, av, i, ai;
1394     real a;
1395     ivec di;
1396     int  siv;
1397
1398     n3 = 3*ip[ia[0]].vsiten.n;
1399     av = ia[1];
1400     copy_rvec(x[av], xv);
1401
1402     for (i = 0; i < n3; i += 3)
1403     {
1404         ai = ia[i+2];
1405         if (g)
1406         {
1407             ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, av), di);
1408             siv = IVEC2IS(di);
1409         }
1410         else if (pbc)
1411         {
1412             siv = pbc_dx_aiuc(pbc, x[ai], xv, dx);
1413         }
1414         else
1415         {
1416             siv = CENTRAL;
1417         }
1418         a = ip[ia[i]].vsiten.a;
1419         svmul(a, f[av], fi);
1420         rvec_inc(f[ai], fi);
1421         if (fshift && siv != CENTRAL)
1422         {
1423             rvec_inc(fshift[siv], fi);
1424             rvec_dec(fshift[CENTRAL], fi);
1425         }
1426         /* 6 Flops */
1427     }
1428
1429     return n3;
1430 }
1431
1432
1433 static int vsite_count(const t_ilist *ilist, int ftype)
1434 {
1435     if (ftype == F_VSITEN)
1436     {
1437         return ilist[ftype].nr/3;
1438     }
1439     else
1440     {
1441         return ilist[ftype].nr/(1 + interaction_function[ftype].nratoms);
1442     }
1443 }
1444
1445 static void spread_vsite_f_thread(const gmx_vsite_t *vsite,
1446                                   const rvec x[],
1447                                   rvec f[], rvec *fshift,
1448                                   gmx_bool VirCorr, matrix dxdf,
1449                                   t_iparams ip[], const t_ilist ilist[],
1450                                   const t_graph *g, const t_pbc *pbc_null)
1451 {
1452     gmx_bool     bPBCAll;
1453     const t_pbc *pbc_null2;
1454     const int   *vsite_pbc;
1455
1456     bPBCAll = (pbc_null != nullptr && !vsite->bHaveChargeGroups);
1457
1458     /* this loop goes backwards to be able to build *
1459      * higher type vsites from lower types         */
1460     pbc_null2 = nullptr;
1461     vsite_pbc = nullptr;
1462     for (int ftype = c_ftypeVsiteEnd - 1; ftype >= c_ftypeVsiteStart; ftype--)
1463     {
1464         if (ilist[ftype].nr == 0)
1465         {
1466             continue;
1467         }
1468
1469         {   // TODO remove me
1470             int            nra = interaction_function[ftype].nratoms;
1471             int            inc = 1 + nra;
1472             int            nr  = ilist[ftype].nr;
1473
1474             const t_iatom *ia = ilist[ftype].iatoms;
1475
1476             if (bPBCAll)
1477             {
1478                 pbc_null2 = pbc_null;
1479             }
1480             else if (pbc_null != nullptr)
1481             {
1482                 vsite_pbc = vsite->vsite_pbc_loc[ftype - c_ftypeVsiteStart];
1483             }
1484
1485             for (int i = 0; i < nr; )
1486             {
1487                 if (vsite_pbc != nullptr)
1488                 {
1489                     if (vsite_pbc[i/(1 + nra)] > -2)
1490                     {
1491                         pbc_null2 = pbc_null;
1492                     }
1493                     else
1494                     {
1495                         pbc_null2 = nullptr;
1496                     }
1497                 }
1498
1499                 int tp = ia[0];
1500
1501                 /* Constants for constructing */
1502                 real a1, b1, c1;
1503                 a1 = ip[tp].vsite.a;
1504                 /* Construct the vsite depending on type */
1505                 switch (ftype)
1506                 {
1507                     case F_VSITE2:
1508                         spread_vsite2(ia, a1, x, f, fshift, pbc_null2, g);
1509                         break;
1510                     case F_VSITE3:
1511                         b1 = ip[tp].vsite.b;
1512                         spread_vsite3(ia, a1, b1, x, f, fshift, pbc_null2, g);
1513                         break;
1514                     case F_VSITE3FD:
1515                         b1 = ip[tp].vsite.b;
1516                         spread_vsite3FD(ia, a1, b1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
1517                         break;
1518                     case F_VSITE3FAD:
1519                         b1 = ip[tp].vsite.b;
1520                         spread_vsite3FAD(ia, a1, b1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
1521                         break;
1522                     case F_VSITE3OUT:
1523                         b1 = ip[tp].vsite.b;
1524                         c1 = ip[tp].vsite.c;
1525                         spread_vsite3OUT(ia, a1, b1, c1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
1526                         break;
1527                     case F_VSITE4FD:
1528                         b1 = ip[tp].vsite.b;
1529                         c1 = ip[tp].vsite.c;
1530                         spread_vsite4FD(ia, a1, b1, c1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
1531                         break;
1532                     case F_VSITE4FDN:
1533                         b1 = ip[tp].vsite.b;
1534                         c1 = ip[tp].vsite.c;
1535                         spread_vsite4FDN(ia, a1, b1, c1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
1536                         break;
1537                     case F_VSITEN:
1538                         inc = spread_vsiten(ia, ip, x, f, fshift, pbc_null2, g);
1539                         break;
1540                     default:
1541                         gmx_fatal(FARGS, "No such vsite type %d in %s, line %d",
1542                                   ftype, __FILE__, __LINE__);
1543                 }
1544                 clear_rvec(f[ia[1]]);
1545
1546                 /* Increment loop variables */
1547                 i  += inc;
1548                 ia += inc;
1549             }
1550         }
1551     }
1552 }
1553
1554 /*! \brief Clears the task force buffer elements that are written by task idTask */
1555 static void clearTaskForceBufferUsedElements(InterdependentTask *idTask)
1556 {
1557     int ntask = idTask->spreadTask.size();
1558     for (int ti = 0; ti < ntask; ti++)
1559     {
1560         const AtomIndex *atomList = &idTask->atomIndex[idTask->spreadTask[ti]];
1561         int              natom    = atomList->atom.size();
1562         RVec            *force    = idTask->force.data();
1563         for (int i = 0; i < natom; i++)
1564         {
1565             clear_rvec(force[atomList->atom[i]]);
1566         }
1567     }
1568 }
1569
1570 void spread_vsite_f(const gmx_vsite_t *vsite,
1571                     const rvec * gmx_restrict x,
1572                     rvec * gmx_restrict f, rvec * gmx_restrict fshift,
1573                     gmx_bool VirCorr, matrix vir,
1574                     t_nrnb *nrnb, const t_idef *idef,
1575                     int ePBC, gmx_bool bMolPBC, const t_graph *g, const matrix box,
1576                     t_commrec *cr)
1577 {
1578     t_pbc pbc, *pbc_null;
1579
1580     /* We only need to do pbc when we have inter-cg vsites */
1581     if ((DOMAINDECOMP(cr) || bMolPBC) && vsite->n_intercg_vsite)
1582     {
1583         /* This is wasting some CPU time as we now do this multiple times
1584          * per MD step.
1585          */
1586         pbc_null = set_pbc_dd(&pbc, ePBC, cr->dd ? cr->dd->nc : nullptr, FALSE, box);
1587     }
1588     else
1589     {
1590         pbc_null = nullptr;
1591     }
1592
1593     if (DOMAINDECOMP(cr))
1594     {
1595         dd_clear_f_vsites(cr->dd, f);
1596     }
1597
1598     if (vsite->nthreads == 1)
1599     {
1600         if (VirCorr)
1601         {
1602             clear_mat(vsite->tData[0]->dxdf);
1603         }
1604         spread_vsite_f_thread(vsite,
1605                               x, f, fshift,
1606                               VirCorr, vsite->tData[0]->dxdf,
1607                               idef->iparams, idef->il,
1608                               g, pbc_null);
1609     }
1610     else
1611     {
1612         /* First spread the vsites that might depend on non-local vsites */
1613         if (VirCorr)
1614         {
1615             clear_mat(vsite->tData[vsite->nthreads]->dxdf);
1616         }
1617         spread_vsite_f_thread(vsite,
1618                               x, f, fshift,
1619                               VirCorr, vsite->tData[vsite->nthreads]->dxdf,
1620                               idef->iparams,
1621                               vsite->tData[vsite->nthreads]->ilist,
1622                               g, pbc_null);
1623
1624 #pragma omp parallel num_threads(vsite->nthreads)
1625         {
1626             try
1627             {
1628                 int          thread = gmx_omp_get_thread_num();
1629                 VsiteThread *tData  = vsite->tData[thread];
1630
1631                 rvec        *fshift_t;
1632                 if (thread == 0 || fshift == nullptr)
1633                 {
1634                     fshift_t = fshift;
1635                 }
1636                 else
1637                 {
1638                     fshift_t = tData->fshift;
1639
1640                     for (int i = 0; i < SHIFTS; i++)
1641                     {
1642                         clear_rvec(fshift_t[i]);
1643                     }
1644                 }
1645                 if (VirCorr)
1646                 {
1647                     clear_mat(tData->dxdf);
1648                 }
1649
1650                 if (tData->useInterdependentTask)
1651                 {
1652                     /* Spread the vsites that spread outside our local range.
1653                      * This is done using a thread-local force buffer force.
1654                      * First we need to copy the input vsite forces to force.
1655                      */
1656                     InterdependentTask *idTask = &tData->idTask;
1657
1658                     /* Clear the buffer elements set by our task during
1659                      * the last call to spread_vsite_f.
1660                      */
1661                     clearTaskForceBufferUsedElements(idTask);
1662
1663                     int nvsite = idTask->vsite.size();
1664                     for (int i = 0; i < nvsite; i++)
1665                     {
1666                         copy_rvec(f[idTask->vsite[i]],
1667                                   idTask->force[idTask->vsite[i]]);
1668                     }
1669                     spread_vsite_f_thread(vsite,
1670                                           x, as_rvec_array(idTask->force.data()), fshift_t,
1671                                           VirCorr, tData->dxdf,
1672                                           idef->iparams,
1673                                           tData->idTask.ilist,
1674                                           g, pbc_null);
1675
1676                     /* We need a barrier before reducing forces below
1677                      * that have been produced by a different thread above.
1678                      */
1679 #pragma omp barrier
1680
1681                     /* Loop over all thread task and reduce forces they
1682                      * produced on atoms that fall in our range.
1683                      * Note that atomic reduction would be a simpler solution,
1684                      * but that might not have good support on all platforms.
1685                      */
1686                     int ntask = idTask->reduceTask.size();
1687                     for (int ti = 0; ti < ntask; ti++)
1688                     {
1689                         const InterdependentTask *idt_foreign = &vsite->tData[idTask->reduceTask[ti]]->idTask;
1690                         const AtomIndex          *atomList    = &idt_foreign->atomIndex[thread];
1691                         const RVec               *f_foreign   = idt_foreign->force.data();
1692
1693                         int natom = atomList->atom.size();
1694                         for (int i = 0; i < natom; i++)
1695                         {
1696                             int ind = atomList->atom[i];
1697                             rvec_inc(f[ind], f_foreign[ind]);
1698                             /* Clearing of f_foreign is done at the next step */
1699                         }
1700                     }
1701                     /* Clear the vsite forces, both in f and force */
1702                     for (int i = 0; i < nvsite; i++)
1703                     {
1704                         int ind = tData->idTask.vsite[i];
1705                         clear_rvec(f[ind]);
1706                         clear_rvec(tData->idTask.force[ind]);
1707                     }
1708                 }
1709
1710                 /* Spread the vsites that spread locally only */
1711                 spread_vsite_f_thread(vsite,
1712                                       x, f, fshift_t,
1713                                       VirCorr, tData->dxdf,
1714                                       idef->iparams,
1715                                       tData->ilist,
1716                                       g, pbc_null);
1717             }
1718             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1719         }
1720
1721         if (fshift != nullptr)
1722         {
1723             for (int th = 1; th < vsite->nthreads; th++)
1724             {
1725                 for (int i = 0; i < SHIFTS; i++)
1726                 {
1727                     rvec_inc(fshift[i], vsite->tData[th]->fshift[i]);
1728                 }
1729             }
1730         }
1731     }
1732
1733     if (VirCorr)
1734     {
1735         for (int th = 0; th < (vsite->nthreads == 1 ? 1 : vsite->nthreads + 1); th++)
1736         {
1737             for (int i = 0; i < DIM; i++)
1738             {
1739                 for (int j = 0; j < DIM; j++)
1740                 {
1741                     vir[i][j] += -0.5*vsite->tData[th]->dxdf[i][j];
1742                 }
1743             }
1744         }
1745     }
1746
1747     if (DOMAINDECOMP(cr))
1748     {
1749         dd_move_f_vsites(cr->dd, f, fshift);
1750     }
1751
1752     inc_nrnb(nrnb, eNR_VSITE2,   vsite_count(idef->il, F_VSITE2));
1753     inc_nrnb(nrnb, eNR_VSITE3,   vsite_count(idef->il, F_VSITE3));
1754     inc_nrnb(nrnb, eNR_VSITE3FD, vsite_count(idef->il, F_VSITE3FD));
1755     inc_nrnb(nrnb, eNR_VSITE3FAD, vsite_count(idef->il, F_VSITE3FAD));
1756     inc_nrnb(nrnb, eNR_VSITE3OUT, vsite_count(idef->il, F_VSITE3OUT));
1757     inc_nrnb(nrnb, eNR_VSITE4FD, vsite_count(idef->il, F_VSITE4FD));
1758     inc_nrnb(nrnb, eNR_VSITE4FDN, vsite_count(idef->il, F_VSITE4FDN));
1759     inc_nrnb(nrnb, eNR_VSITEN,   vsite_count(idef->il, F_VSITEN));
1760 }
1761
1762 static int *atom2cg(t_block *cgs)
1763 {
1764     int *a2cg;
1765
1766     snew(a2cg, cgs->index[cgs->nr]);
1767     for (int cg = 0; cg < cgs->nr; cg++)
1768     {
1769         for (int i = cgs->index[cg]; i < cgs->index[cg+1]; i++)
1770         {
1771             a2cg[i] = cg;
1772         }
1773     }
1774
1775     return a2cg;
1776 }
1777
1778 int count_intercg_vsites(const gmx_mtop_t *mtop)
1779 {
1780     gmx_molblock_t *molb;
1781     gmx_moltype_t  *molt;
1782     int            *a2cg;
1783     int             n_intercg_vsite;
1784
1785     n_intercg_vsite = 0;
1786     for (int mb = 0; mb < mtop->nmolblock; mb++)
1787     {
1788         molb = &mtop->molblock[mb];
1789         molt = &mtop->moltype[molb->type];
1790
1791         a2cg = atom2cg(&molt->cgs);
1792         for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
1793         {
1794             int            nral = NRAL(ftype);
1795             t_ilist       *il   = &molt->ilist[ftype];
1796             const t_iatom *ia   = il->iatoms;
1797             for (int i = 0; i < il->nr; i += 1 + nral)
1798             {
1799                 int cg = a2cg[ia[1+i]];
1800                 for (int a = 1; a < nral; a++)
1801                 {
1802                     if (a2cg[ia[1+a]] != cg)
1803                     {
1804                         n_intercg_vsite += molb->nmol;
1805                         break;
1806                     }
1807                 }
1808             }
1809         }
1810         sfree(a2cg);
1811     }
1812
1813     return n_intercg_vsite;
1814 }
1815
1816 static int **get_vsite_pbc(const t_iparams *iparams, const t_ilist *ilist,
1817                            const t_atom *atom, const t_mdatoms *md,
1818                            const t_block *cgs, const int *a2cg)
1819 {
1820     char    *pbc_set;
1821
1822     /* Make an array that tells if the pbc of an atom is set */
1823     snew(pbc_set, cgs->index[cgs->nr]);
1824     /* PBC is set for all non vsites */
1825     for (int a = 0; a < cgs->index[cgs->nr]; a++)
1826     {
1827         if ((atom && atom[a].ptype != eptVSite) ||
1828             (md   && md->ptype[a]  != eptVSite))
1829         {
1830             pbc_set[a] = 1;
1831         }
1832     }
1833
1834     int **vsite_pbc;
1835     snew(vsite_pbc, F_VSITEN-F_VSITE2+1);
1836
1837     for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
1838     {
1839         {   // TODO remove me
1840             int            nral = NRAL(ftype);
1841             const t_ilist *il   = &ilist[ftype];
1842             const t_iatom *ia   = il->iatoms;
1843             int           *vsite_pbc_f;
1844
1845             snew(vsite_pbc[ftype-F_VSITE2], il->nr/(1 + nral));
1846             vsite_pbc_f = vsite_pbc[ftype-F_VSITE2];
1847
1848             int i = 0;
1849             while (i < il->nr)
1850             {
1851                 int     vsi   = i/(1 + nral);
1852                 t_iatom vsite = ia[i+1];
1853                 int     cg_v  = a2cg[vsite];
1854                 /* A value of -2 signals that this vsite and its contructing
1855                  * atoms are all within the same cg, so no pbc is required.
1856                  */
1857                 vsite_pbc_f[vsi] = -2;
1858                 /* Check if constructing atoms are outside the vsite's cg */
1859                 int nc3 = 0;
1860                 if (ftype == F_VSITEN)
1861                 {
1862                     nc3 = 3*iparams[ia[i]].vsiten.n;
1863                     for (int j = 0; j < nc3; j += 3)
1864                     {
1865                         if (a2cg[ia[i+j+2]] != cg_v)
1866                         {
1867                             vsite_pbc_f[vsi] = -1;
1868                         }
1869                     }
1870                 }
1871                 else
1872                 {
1873                     for (int a = 1; a < nral; a++)
1874                     {
1875                         if (a2cg[ia[i+1+a]] != cg_v)
1876                         {
1877                             vsite_pbc_f[vsi] = -1;
1878                         }
1879                     }
1880                 }
1881                 if (vsite_pbc_f[vsi] == -1)
1882                 {
1883                     /* Check if this is the first processed atom of a vsite only cg */
1884                     gmx_bool bViteOnlyCG_and_FirstAtom = TRUE;
1885                     for (int a = cgs->index[cg_v]; a < cgs->index[cg_v+1]; a++)
1886                     {
1887                         /* Non-vsites already have pbc set, so simply check for pbc_set */
1888                         if (pbc_set[a])
1889                         {
1890                             bViteOnlyCG_and_FirstAtom = FALSE;
1891                             break;
1892                         }
1893                     }
1894                     if (bViteOnlyCG_and_FirstAtom)
1895                     {
1896                         /* First processed atom of a vsite only charge group.
1897                          * The pbc of the input coordinates to construct_vsites
1898                          * should be preserved.
1899                          */
1900                         vsite_pbc_f[vsi] = vsite;
1901                     }
1902                     else if (cg_v != a2cg[ia[1+i+1]])
1903                     {
1904                         /* This vsite has a different charge group index
1905                          * than it's first constructing atom
1906                          * and the charge group has more than one atom,
1907                          * search for the first normal particle
1908                          * or vsite that already had its pbc defined.
1909                          * If nothing is found, use full pbc for this vsite.
1910                          */
1911                         for (int a = cgs->index[cg_v]; a < cgs->index[cg_v+1]; a++)
1912                         {
1913                             if (a != vsite && pbc_set[a])
1914                             {
1915                                 vsite_pbc_f[vsi] = a;
1916                                 if (gmx_debug_at)
1917                                 {
1918                                     fprintf(debug, "vsite %d match pbc with atom %d\n",
1919                                             vsite+1, a+1);
1920                                 }
1921                                 break;
1922                             }
1923                         }
1924                         if (gmx_debug_at)
1925                         {
1926                             fprintf(debug, "vsite atom %d  cg %d - %d pbc atom %d\n",
1927                                     vsite+1, cgs->index[cg_v]+1, cgs->index[cg_v+1],
1928                                     vsite_pbc_f[vsi]+1);
1929                         }
1930                     }
1931                 }
1932                 if (ftype == F_VSITEN)
1933                 {
1934                     /* The other entries in vsite_pbc_f are not used for center vsites */
1935                     i += nc3;
1936                 }
1937                 else
1938                 {
1939                     i += 1 + nral;
1940                 }
1941
1942                 /* This vsite now has its pbc defined */
1943                 pbc_set[vsite] = 1;
1944             }
1945         }
1946     }
1947
1948     sfree(pbc_set);
1949
1950     return vsite_pbc;
1951 }
1952
1953
1954 gmx_vsite_t *init_vsite(const gmx_mtop_t *mtop, t_commrec *cr,
1955                         gmx_bool bSerial_NoPBC)
1956 {
1957     gmx_vsite_t   *vsite;
1958     gmx_moltype_t *molt;
1959
1960     /* check if there are vsites */
1961     int nvsite = 0;
1962     for (int ftype = 0; ftype < F_NRE; ftype++)
1963     {
1964         if (interaction_function[ftype].flags & IF_VSITE)
1965         {
1966             GMX_ASSERT(ftype >= c_ftypeVsiteStart && ftype < c_ftypeVsiteEnd, "c_ftypeVsiteStart and/or c_ftypeVsiteEnd do not have correct values");
1967
1968             nvsite += gmx_mtop_ftype_count(mtop, ftype);
1969         }
1970         else
1971         {
1972             GMX_ASSERT(ftype < c_ftypeVsiteStart || ftype >= c_ftypeVsiteEnd, "c_ftypeVsiteStart and/or c_ftypeVsiteEnd do not have correct values");
1973         }
1974     }
1975
1976     if (nvsite == 0)
1977     {
1978         return nullptr;
1979     }
1980
1981     snew(vsite, 1);
1982
1983     vsite->n_intercg_vsite   = count_intercg_vsites(mtop);
1984
1985     vsite->bHaveChargeGroups = (ncg_mtop(mtop) < mtop->natoms);
1986
1987     /* If we don't have charge groups, the vsite follows its own pbc */
1988     if (!bSerial_NoPBC &&
1989         vsite->bHaveChargeGroups &&
1990         vsite->n_intercg_vsite > 0 && DOMAINDECOMP(cr))
1991     {
1992         vsite->nvsite_pbc_molt = mtop->nmoltype;
1993         snew(vsite->vsite_pbc_molt, vsite->nvsite_pbc_molt);
1994         for (int mt = 0; mt < mtop->nmoltype; mt++)
1995         {
1996             molt = &mtop->moltype[mt];
1997             /* Make an atom to charge group index */
1998             int *a2cg = atom2cg(&molt->cgs);
1999             vsite->vsite_pbc_molt[mt] = get_vsite_pbc(mtop->ffparams.iparams,
2000                                                       molt->ilist,
2001                                                       molt->atoms.atom, nullptr,
2002                                                       &molt->cgs, a2cg);
2003             sfree(a2cg);
2004         }
2005
2006         snew(vsite->vsite_pbc_loc_nalloc, c_ftypeVsiteEnd - c_ftypeVsiteStart);
2007         snew(vsite->vsite_pbc_loc,        c_ftypeVsiteEnd - c_ftypeVsiteStart);
2008     }
2009
2010     if (bSerial_NoPBC)
2011     {
2012         vsite->nthreads = 1;
2013     }
2014     else
2015     {
2016         vsite->nthreads = gmx_omp_nthreads_get(emntVSITE);
2017     }
2018     if (!bSerial_NoPBC)
2019     {
2020         /* We need one extra thread data structure for the overlap vsites */
2021         snew(vsite->tData, vsite->nthreads + 1);
2022 #pragma omp parallel for num_threads(vsite->nthreads) schedule(static)
2023         for (int thread = 0; thread < vsite->nthreads; thread++)
2024         {
2025             try
2026             {
2027                 vsite->tData[thread] = new VsiteThread;
2028
2029                 InterdependentTask *idTask = &vsite->tData[thread]->idTask;
2030                 idTask->nuse               = 0;
2031                 idTask->atomIndex.resize(vsite->nthreads);
2032             }
2033             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2034         }
2035         if (vsite->nthreads > 1)
2036         {
2037             vsite->tData[vsite->nthreads] = new VsiteThread;
2038         }
2039     }
2040
2041     vsite->taskIndex       = nullptr;
2042     vsite->taskIndexNalloc = 0;
2043
2044     return vsite;
2045 }
2046
2047 static gmx_inline void flagAtom(InterdependentTask *idTask, int atom,
2048                                 int thread, int nthread, int natperthread)
2049 {
2050     if (!idTask->use[atom])
2051     {
2052         idTask->use[atom] = true;
2053         thread            = atom/natperthread;
2054         /* Assign all non-local atom force writes to thread 0 */
2055         if (thread >= nthread)
2056         {
2057             thread        = 0;
2058         }
2059         idTask->atomIndex[thread].atom.push_back(atom);
2060     }
2061 }
2062
2063 /*\brief Here we try to assign all vsites that are in our local range.
2064  *
2065  * Our task local atom range is tData->rangeStart - tData->rangeEnd.
2066  * Vsites that depend only on local atoms, as indicated by taskIndex[]==thread,
2067  * are assigned to task tData->ilist. Vsites that depend on non-local atoms
2068  * but not on other vsites are assigned to task tData->id_task.ilist.
2069  * taskIndex[] is set for all vsites in our range, either to our local tasks
2070  * or to the single last task as taskIndex[]=2*nthreads.
2071  */
2072 static void assignVsitesToThread(VsiteThread           *tData,
2073                                  int                    thread,
2074                                  int                    nthread,
2075                                  int                    natperthread,
2076                                  int                   *taskIndex,
2077                                  const t_ilist         *ilist,
2078                                  const t_iparams       *ip,
2079                                  const unsigned short  *ptype)
2080 {
2081     for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
2082     {
2083         tData->ilist[ftype].nr        = 0;
2084         tData->idTask.ilist[ftype].nr = 0;
2085
2086         int      nral1 = 1 + NRAL(ftype);
2087         int      inc   = nral1;
2088         t_iatom *iat   = ilist[ftype].iatoms;
2089         for (int i = 0; i < ilist[ftype].nr; )
2090         {
2091             if (ftype == F_VSITEN)
2092             {
2093                 /* The 3 below is from 1+NRAL(ftype)=3 */
2094                 inc = ip[iat[i]].vsiten.n*3;
2095             }
2096
2097             if (iat[1 + i] <  tData->rangeStart ||
2098                 iat[1 + i] >= tData->rangeEnd)
2099             {
2100                 /* This vsite belongs to a different thread */
2101                 i += inc;
2102                 continue;
2103             }
2104
2105             /* We would like to assign this vsite to task thread,
2106              * but it might depend on atoms outside the atom range of thread
2107              * or on another vsite not assigned to task thread.
2108              */
2109             int task = thread;
2110             if (ftype != F_VSITEN)
2111             {
2112                 for (int j = i + 2; j < i + nral1; j++)
2113                 {
2114                     /* Do a range check to avoid a harmless race on taskIndex */
2115                     if (iat[j] <  tData->rangeStart ||
2116                         iat[j] >= tData->rangeEnd ||
2117                         taskIndex[iat[j]] != thread)
2118                     {
2119                         if (!tData->useInterdependentTask ||
2120                             ptype[iat[j]] == eptVSite)
2121                         {
2122                             /* At least one constructing atom is a vsite
2123                              * that is not assigned to the same thread.
2124                              * Put this vsite into a separate task.
2125                              */
2126                             task = 2*nthread;
2127                             break;
2128                         }
2129
2130                         /* There are constructing atoms outside our range,
2131                          * put this vsite into a second task to be executed
2132                          * on the same thread. During construction no barrier
2133                          * is needed between the two tasks on the same thread.
2134                          * During spreading we need to run this task with
2135                          * an additional thread-local intermediate force buffer
2136                          * (or atomic reduction) and a barrier between the two
2137                          * tasks.
2138                          */
2139                         task = nthread + thread;
2140                     }
2141                 }
2142             }
2143             else
2144             {
2145                 for (int j = i + 2; j < i + inc; j += 3)
2146                 {
2147                     /* Do a range check to avoid a harmless race on taskIndex */
2148                     if (iat[j] <  tData->rangeStart ||
2149                         iat[j] >= tData->rangeEnd ||
2150                         taskIndex[iat[j]] != thread)
2151                     {
2152                         GMX_ASSERT(ptype[iat[j]] != eptVSite, "A vsite to be assigned in assignVsitesToThread has a vsite as a constructing atom that does not belong to our task, such vsites should be assigned to the single 'master' task");
2153
2154                         task = nthread + thread;
2155                     }
2156                 }
2157             }
2158
2159             /* Update this vsite's thread index entry */
2160             taskIndex[iat[1+i]] = task;
2161
2162             if (task == thread || task == nthread + thread)
2163             {
2164                 /* Copy this vsite to the thread data struct of thread */
2165                 t_ilist *il_task;
2166                 if (task == thread)
2167                 {
2168                     il_task = &tData->ilist[ftype];
2169                 }
2170                 else
2171                 {
2172                     il_task = &tData->idTask.ilist[ftype];
2173                 }
2174                 /* Ensure we have sufficient memory allocated */
2175                 if (il_task->nr + inc > il_task->nalloc)
2176                 {
2177                     il_task->nalloc = over_alloc_large(il_task->nr + inc);
2178                     srenew(il_task->iatoms, il_task->nalloc);
2179                 }
2180                 /* Copy the vsite data to the thread-task local array */
2181                 for (int j = i; j < i + inc; j++)
2182                 {
2183                     il_task->iatoms[il_task->nr++] = iat[j];
2184                 }
2185                 if (task == nthread + thread)
2186                 {
2187                     /* This vsite write outside our own task force block.
2188                      * Put it into the interdependent task list and flag
2189                      * the atoms involved for reduction.
2190                      */
2191                     tData->idTask.vsite.push_back(iat[i + 1]);
2192                     if (ftype != F_VSITEN)
2193                     {
2194                         for (int j = i + 2; j < i + nral1; j++)
2195                         {
2196                             flagAtom(&tData->idTask, iat[j],
2197                                      thread, nthread, natperthread);
2198                         }
2199                     }
2200                     else
2201                     {
2202                         for (int j = i + 2; j < i + inc; j += 3)
2203                         {
2204                             flagAtom(&tData->idTask, iat[j],
2205                                      thread, nthread, natperthread);
2206                         }
2207                     }
2208                 }
2209             }
2210
2211             i += inc;
2212         }
2213     }
2214 }
2215
2216 /*! \brief Assign all vsites with taskIndex[]==task to task tData */
2217 static void assignVsitesToSingleTask(VsiteThread     *tData,
2218                                      int              task,
2219                                      const int       *taskIndex,
2220                                      const t_ilist   *ilist,
2221                                      const t_iparams *ip)
2222 {
2223     for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
2224     {
2225         tData->ilist[ftype].nr        = 0;
2226         tData->idTask.ilist[ftype].nr = 0;
2227
2228         int      nral1   = 1 + NRAL(ftype);
2229         int      inc     = nral1;
2230         t_iatom *iat     = ilist[ftype].iatoms;
2231         t_ilist *il_task = &tData->ilist[ftype];
2232
2233         for (int i = 0; i < ilist[ftype].nr; )
2234         {
2235             if (ftype == F_VSITEN)
2236             {
2237                 /* The 3 below is from 1+NRAL(ftype)=3 */
2238                 inc = ip[iat[i]].vsiten.n*3;
2239             }
2240             /* Check if the vsite is assigned to our task */
2241             if (taskIndex[iat[1 + i]] == task)
2242             {
2243                 /* Ensure we have sufficient memory allocated */
2244                 if (il_task->nr + inc > il_task->nalloc)
2245                 {
2246                     il_task->nalloc = over_alloc_large(il_task->nr + inc);
2247                     srenew(il_task->iatoms, il_task->nalloc);
2248                 }
2249                 /* Copy the vsite data to the thread-task local array */
2250                 for (int j = i; j < i + inc; j++)
2251                 {
2252                     il_task->iatoms[il_task->nr++] = iat[j];
2253                 }
2254             }
2255
2256             i += inc;
2257         }
2258     }
2259 }
2260
2261 void split_vsites_over_threads(const t_ilist   *ilist,
2262                                const t_iparams *ip,
2263                                const t_mdatoms *mdatoms,
2264                                gmx_bool         bLimitRange,
2265                                gmx_vsite_t     *vsite)
2266 {
2267     int      vsite_atom_range, natperthread;
2268
2269     if (vsite->nthreads == 1)
2270     {
2271         /* Nothing to do */
2272         return;
2273     }
2274
2275     /* The current way of distributing the vsites over threads in primitive.
2276      * We divide the atom range 0 - natoms_in_vsite uniformly over threads,
2277      * without taking into account how the vsites are distributed.
2278      * Without domain decomposition we bLimitRange=TRUE and we at least
2279      * tighten the upper bound of the range (useful for common systems
2280      * such as a vsite-protein in 3-site water).
2281      * With domain decomposition, as long as the vsites are distributed
2282      * uniformly in each domain along the major dimension, usually x,
2283      * it will also perform well.
2284      */
2285     if (bLimitRange)
2286     {
2287         vsite_atom_range = -1;
2288         for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
2289         {
2290             {   // TODO remove me
2291                 if (ftype != F_VSITEN)
2292                 {
2293                     int            nral1 = 1 + NRAL(ftype);
2294                     const t_iatom *iat   = ilist[ftype].iatoms;
2295                     for (int i = 0; i < ilist[ftype].nr; i += nral1)
2296                     {
2297                         for (int j = i + 1; j < i + nral1; j++)
2298                         {
2299                             vsite_atom_range = std::max(vsite_atom_range, iat[j]);
2300                         }
2301                     }
2302                 }
2303                 else
2304                 {
2305                     int            vs_ind_end;
2306
2307                     const t_iatom *iat = ilist[ftype].iatoms;
2308
2309                     int            i = 0;
2310                     while (i < ilist[ftype].nr)
2311                     {
2312                         /* The 3 below is from 1+NRAL(ftype)=3 */
2313                         vs_ind_end = i + ip[iat[i]].vsiten.n*3;
2314
2315                         vsite_atom_range = std::max(vsite_atom_range, iat[i+1]);
2316                         while (i < vs_ind_end)
2317                         {
2318                             vsite_atom_range = std::max(vsite_atom_range, iat[i+2]);
2319                             i               += 3;
2320                         }
2321                     }
2322                 }
2323             }
2324         }
2325         vsite_atom_range++;
2326         natperthread     = (vsite_atom_range + vsite->nthreads - 1)/vsite->nthreads;
2327     }
2328     else
2329     {
2330         /* Any local or not local atom could be involved in virtual sites.
2331          * But since we usually have very few non-local virtual sites
2332          * (only non-local vsites that depend on local vsites),
2333          * we distribute the local atom range equally over the threads.
2334          * When assigning vsites to threads, we should take care that the last
2335          * threads also covers the non-local range.
2336          */
2337         vsite_atom_range = mdatoms->nr;
2338         natperthread     = (mdatoms->homenr + vsite->nthreads - 1)/vsite->nthreads;
2339     }
2340
2341     if (debug)
2342     {
2343         fprintf(debug, "virtual site thread dist: natoms %d, range %d, natperthread %d\n", mdatoms->nr, vsite_atom_range, natperthread);
2344     }
2345
2346     /* To simplify the vsite assignment, we make an index which tells us
2347      * to which task particles, both non-vsites and vsites, are assigned.
2348      */
2349     if (mdatoms->nr > vsite->taskIndexNalloc)
2350     {
2351         vsite->taskIndexNalloc = over_alloc_large(mdatoms->nr);
2352         srenew(vsite->taskIndex, vsite->taskIndexNalloc);
2353     }
2354
2355     /* Initialize the task index array. Here we assign the non-vsite
2356      * particles to task=thread, so we easily figure out if vsites
2357      * depend on local and/or non-local particles in assignVsitesToThread.
2358      */
2359     int *taskIndex = vsite->taskIndex;
2360     {
2361         int  thread = 0;
2362         for (int i = 0; i < mdatoms->nr; i++)
2363         {
2364             if (mdatoms->ptype[i] == eptVSite)
2365             {
2366                 /* vsites are not assigned to a task yet */
2367                 taskIndex[i] = -1;
2368             }
2369             else
2370             {
2371                 /* assign non-vsite particles to task thread */
2372                 taskIndex[i] = thread;
2373             }
2374             if (i == (thread + 1)*natperthread && thread < vsite->nthreads)
2375             {
2376                 thread++;
2377             }
2378         }
2379     }
2380
2381 #pragma omp parallel num_threads(vsite->nthreads)
2382     {
2383         try
2384         {
2385             int          thread = gmx_omp_get_thread_num();
2386             VsiteThread *tData  = vsite->tData[thread];
2387
2388             /* Clear the buffer use flags that were set before */
2389             if (tData->useInterdependentTask)
2390             {
2391                 InterdependentTask *idTask = &tData->idTask;
2392
2393                 /* To avoid an extra OpenMP barrier in spread_vsite_f,
2394                  * we clear the force buffer at the next step,
2395                  * so we need to do it here as well.
2396                  */
2397                 clearTaskForceBufferUsedElements(idTask);
2398
2399                 idTask->vsite.resize(0);
2400                 for (int t = 0; t < vsite->nthreads; t++)
2401                 {
2402                     AtomIndex *atomIndex = &idTask->atomIndex[t];
2403                     int        natom     = atomIndex->atom.size();
2404                     for (int i = 0; i < natom; i++)
2405                     {
2406                         idTask->use[atomIndex->atom[i]] = false;
2407                     }
2408                     atomIndex->atom.resize(0);
2409                 }
2410                 idTask->nuse = 0;
2411             }
2412
2413             /* To avoid large f_buf allocations of #threads*vsite_atom_range
2414              * we don't use task2 with more than 200000 atoms. This doesn't
2415              * affect performance, since with such a large range relatively few
2416              * vsites will end up in the separate task.
2417              * Note that useTask2 should be the same for all threads.
2418              */
2419             tData->useInterdependentTask = (vsite_atom_range <= 200000);
2420             if (tData->useInterdependentTask)
2421             {
2422                 size_t              natoms_use_in_vsites = vsite_atom_range;
2423                 InterdependentTask *idTask               = &tData->idTask;
2424                 /* To avoid resizing and re-clearing every nstlist steps,
2425                  * we never down size the force buffer.
2426                  */
2427                 if (natoms_use_in_vsites > idTask->force.size() ||
2428                     natoms_use_in_vsites > idTask->use.size())
2429                 {
2430                     idTask->force.resize(natoms_use_in_vsites, { 0, 0, 0 });
2431                     idTask->use.resize(natoms_use_in_vsites, false);
2432                 }
2433             }
2434
2435             /* Assign all vsites that can execute independently on threads */
2436             tData->rangeStart     =  thread     *natperthread;
2437             if (thread < vsite->nthreads - 1)
2438             {
2439                 tData->rangeEnd   = (thread + 1)*natperthread;
2440             }
2441             else
2442             {
2443                 /* The last thread should cover up to the end of the range */
2444                 tData->rangeEnd   = mdatoms->nr;
2445             }
2446             assignVsitesToThread(tData,
2447                                  thread, vsite->nthreads,
2448                                  natperthread,
2449                                  taskIndex,
2450                                  ilist, ip, mdatoms->ptype);
2451
2452             if (tData->useInterdependentTask)
2453             {
2454                 /* In the worst case, all tasks write to force ranges of
2455                  * all other tasks, leading to #tasks^2 scaling (this is only
2456                  * the overhead, the actual flops remain constant).
2457                  * But in most cases there is far less coupling. To improve
2458                  * scaling at high thread counts we therefore construct
2459                  * an index to only loop over the actually affected tasks.
2460                  */
2461                 InterdependentTask *idTask = &tData->idTask;
2462
2463                 /* Ensure assignVsitesToThread finished on other threads */
2464 #pragma omp barrier
2465
2466                 idTask->spreadTask.resize(0);
2467                 idTask->reduceTask.resize(0);
2468                 for (int t = 0; t < vsite->nthreads; t++)
2469                 {
2470                     /* Do we write to the force buffer of task t? */
2471                     if (idTask->atomIndex[t].atom.size() > 0)
2472                     {
2473                         idTask->spreadTask.push_back(t);
2474                     }
2475                     /* Does task t write to our force buffer? */
2476                     if (vsite->tData[t]->idTask.atomIndex[thread].atom.size() > 0)
2477                     {
2478                         idTask->reduceTask.push_back(t);
2479                     }
2480                 }
2481             }
2482         }
2483         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2484     }
2485     /* Assign all remaining vsites, that will have taskIndex[]=2*vsite->nthreads,
2486      * to a single task that will not run in parallel with other tasks.
2487      */
2488     assignVsitesToSingleTask(vsite->tData[vsite->nthreads],
2489                              2*vsite->nthreads,
2490                              taskIndex,
2491                              ilist, ip);
2492
2493     if (debug && vsite->nthreads > 1)
2494     {
2495         fprintf(debug, "virtual site useInterdependentTask %d, nuse:\n",
2496                 vsite->tData[0]->useInterdependentTask);
2497         for (int th = 0; th < vsite->nthreads + 1; th++)
2498         {
2499             fprintf(debug, " %4d", vsite->tData[th]->idTask.nuse);
2500         }
2501         fprintf(debug, "\n");
2502
2503         for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
2504         {
2505             if (ilist[ftype].nr > 0)
2506             {
2507                 fprintf(debug, "%-20s thread dist:",
2508                         interaction_function[ftype].longname);
2509                 for (int th = 0; th < vsite->nthreads + 1; th++)
2510                 {
2511                     fprintf(debug, " %4d %4d ",
2512                             vsite->tData[th]->ilist[ftype].nr,
2513                             vsite->tData[th]->idTask.ilist[ftype].nr);
2514                 }
2515                 fprintf(debug, "\n");
2516             }
2517         }
2518     }
2519
2520 #ifndef NDEBUG
2521     int nrOrig     = vsiteIlistNrCount(ilist);
2522     int nrThreaded = 0;
2523     for (int th = 0; th < vsite->nthreads + 1; th++)
2524     {
2525         nrThreaded +=
2526             vsiteIlistNrCount(vsite->tData[th]->ilist) +
2527             vsiteIlistNrCount(vsite->tData[th]->idTask.ilist);
2528     }
2529     GMX_ASSERT(nrThreaded == nrOrig, "The number of virtual sites assigned to all thread task has to match the total number of virtual sites");
2530 #endif
2531 }
2532
2533 void set_vsite_top(gmx_vsite_t *vsite, gmx_localtop_t *top, t_mdatoms *md,
2534                    t_commrec *cr)
2535 {
2536     if (vsite->n_intercg_vsite > 0)
2537     {
2538         if (vsite->bHaveChargeGroups)
2539         {
2540             /* Make an atom to charge group index */
2541             int *a2cg = atom2cg(&top->cgs);
2542             vsite->vsite_pbc_loc = get_vsite_pbc(top->idef.iparams,
2543                                                  top->idef.il, nullptr, md,
2544                                                  &top->cgs, a2cg);
2545             sfree(a2cg);
2546         }
2547
2548     }
2549
2550     if (vsite->nthreads > 1)
2551     {
2552         if (vsite->bHaveChargeGroups)
2553         {
2554             gmx_fatal(FARGS, "The combination of threading, virtual sites and charge groups is not implemented");
2555         }
2556
2557         split_vsites_over_threads(top->idef.il, top->idef.iparams,
2558                                   md, !DOMAINDECOMP(cr), vsite);
2559     }
2560 }