2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
7 * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
8 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
9 * and including many others, as listed in the AUTHORS file in the
10 * top-level source directory and at http://www.gromacs.org.
12 * GROMACS is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public License
14 * as published by the Free Software Foundation; either version 2.1
15 * of the License, or (at your option) any later version.
17 * GROMACS is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Lesser General Public License for more details.
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with GROMACS; if not, see
24 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
25 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
27 * If you want to redistribute modifications to GROMACS, please
28 * consider that scientific software is very special. Version
29 * control is crucial - bugs must be traceable. We will be happy to
30 * consider code for inclusion in the official distribution, but
31 * derived work must not be called official GROMACS. Details are found
32 * in the README & COPYING files - if they are missing, get the
33 * official version at http://www.gromacs.org.
35 * To help us fund GROMACS development, we humbly ask that you cite
36 * the research papers on the package. Check out http://www.gromacs.org.
48 #include "gromacs/domdec/domdec.h"
49 #include "gromacs/domdec/domdec_struct.h"
50 #include "gromacs/gmxlib/network.h"
51 #include "gromacs/gmxlib/nrnb.h"
52 #include "gromacs/math/functions.h"
53 #include "gromacs/math/vec.h"
54 #include "gromacs/mdlib/gmx_omp_nthreads.h"
55 #include "gromacs/mdtypes/commrec.h"
56 #include "gromacs/mdtypes/mdatom.h"
57 #include "gromacs/pbcutil/ishift.h"
58 #include "gromacs/pbcutil/mshift.h"
59 #include "gromacs/pbcutil/pbc.h"
60 #include "gromacs/timing/wallcycle.h"
61 #include "gromacs/topology/ifunc.h"
62 #include "gromacs/topology/mtop_util.h"
63 #include "gromacs/topology/topology.h"
64 #include "gromacs/utility/exceptions.h"
65 #include "gromacs/utility/fatalerror.h"
66 #include "gromacs/utility/gmxassert.h"
67 #include "gromacs/utility/gmxomp.h"
69 /* The strategy used here for assigning virtual sites to (thread-)tasks
72 * We divide the atom range that vsites operate on (natoms_local with DD,
73 * 0 - last atom involved in vsites without DD) equally over all threads.
75 * Vsites in the local range constructed from atoms in the local range
76 * and/or other vsites that are fully local are assigned to a simple,
79 * Vsites that are not assigned after using the above criterion get assigned
80 * to a so called "interdependent" thread task when none of the constructing
81 * atoms is a vsite. These tasks are called interdependent, because one task
82 * accesses atoms assigned to a different task/thread.
83 * Note that this option is turned off with large (local) atom counts
84 * to avoid high memory usage.
86 * Any remaining vsites are assigned to a separate master thread task.
91 static void init_ilist(t_ilist
* ilist
)
93 for (int i
= 0; i
< F_NRE
; i
++)
97 ilist
[i
].iatoms
= nullptr;
101 /*! \brief List of atom indices belonging to a task */
104 //! List of atom indices
105 std::vector
<int> atom
;
108 /*! \brief Data structure for thread tasks that use constructing atoms outside their own atom range */
109 struct InterdependentTask
111 //! The interaction lists, only vsite entries are used
112 t_ilist ilist
[F_NRE
];
113 //! Thread/task-local force buffer
114 std::vector
<RVec
> force
;
115 //! The atom indices of the vsites of our task
116 std::vector
<int> vsite
;
117 //! Flags if elements in force are spread to or not
118 std::vector
<bool> use
;
119 //! The number of entries set to true in use
121 //! Array of atoms indices, size nthreads, covering all nuse set elements in use
122 std::vector
<AtomIndex
> atomIndex
;
123 //! List of tasks (force blocks) this task spread forces to
124 std::vector
<int> spreadTask
;
125 //! List of tasks that write to this tasks force block range
126 std::vector
<int> reduceTask
;
135 /*! \brief Vsite thread task data structure */
138 //! Start of atom range of this task
140 //! End of atom range of this task
142 //! The interaction lists, only vsite entries are used
143 t_ilist ilist
[F_NRE
];
144 //! Local fshift accumulation buffer
146 //! Local virial dx*df accumulation buffer
148 //! Tells if interdependent task idTask should be used (in addition to the rest of this task), this bool has the same value on all threads
149 bool useInterdependentTask
;
150 //! Data for vsites that involve constructing atoms in the atom range of other threads/tasks
151 InterdependentTask idTask
;
153 /*! \brief Constructor */
159 clear_rvecs(SHIFTS
, fshift
);
161 useInterdependentTask
= false;
165 /*! \brief Returns the sum of the vsite ilist sizes over all vsite types
167 * \param[in] ilist The interaction list
170 static int vsiteIlistNrCount(const T
* ilist
)
173 for (int ftype
= c_ftypeVsiteStart
; ftype
< c_ftypeVsiteEnd
; ftype
++)
175 nr
+= ilist
[ftype
].size();
181 static int pbc_rvec_sub(const t_pbc
* pbc
, const rvec xi
, const rvec xj
, rvec dx
)
185 return pbc_dx_aiuc(pbc
, xi
, xj
, dx
);
189 rvec_sub(xi
, xj
, dx
);
194 static inline real
inverseNorm(const rvec x
)
196 return gmx::invsqrt(iprod(x
, x
));
199 /* Vsite construction routines */
201 static void constr_vsite2(const rvec xi
, const rvec xj
, rvec x
, real a
, const t_pbc
* pbc
)
209 pbc_dx_aiuc(pbc
, xj
, xi
, dx
);
210 x
[XX
] = xi
[XX
] + a
* dx
[XX
];
211 x
[YY
] = xi
[YY
] + a
* dx
[YY
];
212 x
[ZZ
] = xi
[ZZ
] + a
* dx
[ZZ
];
216 x
[XX
] = b
* xi
[XX
] + a
* xj
[XX
];
217 x
[YY
] = b
* xi
[YY
] + a
* xj
[YY
];
218 x
[ZZ
] = b
* xi
[ZZ
] + a
* xj
[ZZ
];
222 /* TOTAL: 10 flops */
225 static void constr_vsite2FD(const rvec xi
, const rvec xj
, rvec x
, real a
, const t_pbc
* pbc
)
228 pbc_rvec_sub(pbc
, xj
, xi
, xij
);
231 const real b
= a
* inverseNorm(xij
);
234 x
[XX
] = xi
[XX
] + b
* xij
[XX
];
235 x
[YY
] = xi
[YY
] + b
* xij
[YY
];
236 x
[ZZ
] = xi
[ZZ
] + b
* xij
[ZZ
];
239 /* TOTAL: 25 flops */
242 static void constr_vsite3(const rvec xi
, const rvec xj
, const rvec xk
, rvec x
, real a
, real b
, const t_pbc
* pbc
)
251 pbc_dx_aiuc(pbc
, xj
, xi
, dxj
);
252 pbc_dx_aiuc(pbc
, xk
, xi
, dxk
);
253 x
[XX
] = xi
[XX
] + a
* dxj
[XX
] + b
* dxk
[XX
];
254 x
[YY
] = xi
[YY
] + a
* dxj
[YY
] + b
* dxk
[YY
];
255 x
[ZZ
] = xi
[ZZ
] + a
* dxj
[ZZ
] + b
* dxk
[ZZ
];
259 x
[XX
] = c
* xi
[XX
] + a
* xj
[XX
] + b
* xk
[XX
];
260 x
[YY
] = c
* xi
[YY
] + a
* xj
[YY
] + b
* xk
[YY
];
261 x
[ZZ
] = c
* xi
[ZZ
] + a
* xj
[ZZ
] + b
* xk
[ZZ
];
265 /* TOTAL: 17 flops */
268 static void constr_vsite3FD(const rvec xi
, const rvec xj
, const rvec xk
, rvec x
, real a
, real b
, const t_pbc
* pbc
)
273 pbc_rvec_sub(pbc
, xj
, xi
, xij
);
274 pbc_rvec_sub(pbc
, xk
, xj
, xjk
);
277 /* temp goes from i to a point on the line jk */
278 temp
[XX
] = xij
[XX
] + a
* xjk
[XX
];
279 temp
[YY
] = xij
[YY
] + a
* xjk
[YY
];
280 temp
[ZZ
] = xij
[ZZ
] + a
* xjk
[ZZ
];
283 c
= b
* inverseNorm(temp
);
286 x
[XX
] = xi
[XX
] + c
* temp
[XX
];
287 x
[YY
] = xi
[YY
] + c
* temp
[YY
];
288 x
[ZZ
] = xi
[ZZ
] + c
* temp
[ZZ
];
291 /* TOTAL: 34 flops */
294 static void constr_vsite3FAD(const rvec xi
, const rvec xj
, const rvec xk
, rvec x
, real a
, real b
, const t_pbc
* pbc
)
297 real a1
, b1
, c1
, invdij
;
299 pbc_rvec_sub(pbc
, xj
, xi
, xij
);
300 pbc_rvec_sub(pbc
, xk
, xj
, xjk
);
303 invdij
= inverseNorm(xij
);
304 c1
= invdij
* invdij
* iprod(xij
, xjk
);
305 xp
[XX
] = xjk
[XX
] - c1
* xij
[XX
];
306 xp
[YY
] = xjk
[YY
] - c1
* xij
[YY
];
307 xp
[ZZ
] = xjk
[ZZ
] - c1
* xij
[ZZ
];
309 b1
= b
* inverseNorm(xp
);
312 x
[XX
] = xi
[XX
] + a1
* xij
[XX
] + b1
* xp
[XX
];
313 x
[YY
] = xi
[YY
] + a1
* xij
[YY
] + b1
* xp
[YY
];
314 x
[ZZ
] = xi
[ZZ
] + a1
* xij
[ZZ
] + b1
* xp
[ZZ
];
317 /* TOTAL: 63 flops */
321 constr_vsite3OUT(const rvec xi
, const rvec xj
, const rvec xk
, rvec x
, real a
, real b
, real c
, const t_pbc
* pbc
)
325 pbc_rvec_sub(pbc
, xj
, xi
, xij
);
326 pbc_rvec_sub(pbc
, xk
, xi
, xik
);
327 cprod(xij
, xik
, temp
);
330 x
[XX
] = xi
[XX
] + a
* xij
[XX
] + b
* xik
[XX
] + c
* temp
[XX
];
331 x
[YY
] = xi
[YY
] + a
* xij
[YY
] + b
* xik
[YY
] + c
* temp
[YY
];
332 x
[ZZ
] = xi
[ZZ
] + a
* xij
[ZZ
] + b
* xik
[ZZ
] + c
* temp
[ZZ
];
335 /* TOTAL: 33 flops */
338 static void constr_vsite4FD(const rvec xi
,
348 rvec xij
, xjk
, xjl
, temp
;
351 pbc_rvec_sub(pbc
, xj
, xi
, xij
);
352 pbc_rvec_sub(pbc
, xk
, xj
, xjk
);
353 pbc_rvec_sub(pbc
, xl
, xj
, xjl
);
356 /* temp goes from i to a point on the plane jkl */
357 temp
[XX
] = xij
[XX
] + a
* xjk
[XX
] + b
* xjl
[XX
];
358 temp
[YY
] = xij
[YY
] + a
* xjk
[YY
] + b
* xjl
[YY
];
359 temp
[ZZ
] = xij
[ZZ
] + a
* xjk
[ZZ
] + b
* xjl
[ZZ
];
362 d
= c
* inverseNorm(temp
);
365 x
[XX
] = xi
[XX
] + d
* temp
[XX
];
366 x
[YY
] = xi
[YY
] + d
* temp
[YY
];
367 x
[ZZ
] = xi
[ZZ
] + d
* temp
[ZZ
];
370 /* TOTAL: 43 flops */
373 static void constr_vsite4FDN(const rvec xi
,
383 rvec xij
, xik
, xil
, ra
, rb
, rja
, rjb
, rm
;
386 pbc_rvec_sub(pbc
, xj
, xi
, xij
);
387 pbc_rvec_sub(pbc
, xk
, xi
, xik
);
388 pbc_rvec_sub(pbc
, xl
, xi
, xil
);
391 ra
[XX
] = a
* xik
[XX
];
392 ra
[YY
] = a
* xik
[YY
];
393 ra
[ZZ
] = a
* xik
[ZZ
];
395 rb
[XX
] = b
* xil
[XX
];
396 rb
[YY
] = b
* xil
[YY
];
397 rb
[ZZ
] = b
* xil
[ZZ
];
401 rvec_sub(ra
, xij
, rja
);
402 rvec_sub(rb
, xij
, rjb
);
408 d
= c
* inverseNorm(rm
);
411 x
[XX
] = xi
[XX
] + d
* rm
[XX
];
412 x
[YY
] = xi
[YY
] + d
* rm
[YY
];
413 x
[ZZ
] = xi
[ZZ
] + d
* rm
[ZZ
];
416 /* TOTAL: 47 flops */
420 static int constr_vsiten(const t_iatom
* ia
, const t_iparams ip
[], rvec
* x
, const t_pbc
* pbc
)
427 n3
= 3 * ip
[ia
[0]].vsiten
.n
;
430 copy_rvec(x
[ai
], x1
);
432 for (int i
= 3; i
< n3
; i
+= 3)
435 a
= ip
[ia
[i
]].vsiten
.a
;
438 pbc_dx_aiuc(pbc
, x
[ai
], x1
, dx
);
442 rvec_sub(x
[ai
], x1
, dx
);
444 dsum
[XX
] += a
* dx
[XX
];
445 dsum
[YY
] += a
* dx
[YY
];
446 dsum
[ZZ
] += a
* dx
[ZZ
];
450 x
[av
][XX
] = x1
[XX
] + dsum
[XX
];
451 x
[av
][YY
] = x1
[YY
] + dsum
[YY
];
452 x
[av
][ZZ
] = x1
[ZZ
] + dsum
[ZZ
];
457 /*! \brief PBC modes for vsite construction and spreading */
460 all
, // Apply normal, simple PBC for all vsites
461 none
// No PBC treatment needed
464 /*! \brief Returns the PBC mode based on the system PBC and vsite properties
466 * \param[in] pbcPtr A pointer to a PBC struct or nullptr when no PBC treatment is required
468 static PbcMode
getPbcMode(const t_pbc
* pbcPtr
)
470 if (pbcPtr
== nullptr)
472 return PbcMode::none
;
480 static void construct_vsites_thread(rvec x
[],
483 const t_iparams ip
[],
484 const t_ilist ilist
[],
485 const t_pbc
* pbc_null
)
497 const PbcMode pbcMode
= getPbcMode(pbc_null
);
498 /* We need another pbc pointer, as with charge groups we switch per vsite */
499 const t_pbc
* pbc_null2
= pbc_null
;
501 for (int ftype
= c_ftypeVsiteStart
; ftype
< c_ftypeVsiteEnd
; ftype
++)
503 if (ilist
[ftype
].nr
== 0)
509 int nra
= interaction_function
[ftype
].nratoms
;
511 int nr
= ilist
[ftype
].nr
;
513 const t_iatom
* ia
= ilist
[ftype
].iatoms
;
515 for (int i
= 0; i
< nr
;)
518 /* The vsite and constructing atoms */
521 /* Constants for constructing vsites */
522 real a1
= ip
[tp
].vsite
.a
;
523 /* Copy the old position */
525 copy_rvec(x
[avsite
], xv
);
527 /* Construct the vsite depending on type */
534 constr_vsite2(x
[ai
], x
[aj
], x
[avsite
], a1
, pbc_null2
);
538 constr_vsite2FD(x
[ai
], x
[aj
], x
[avsite
], a1
, pbc_null2
);
544 constr_vsite3(x
[ai
], x
[aj
], x
[ak
], x
[avsite
], a1
, b1
, pbc_null2
);
550 constr_vsite3FD(x
[ai
], x
[aj
], x
[ak
], x
[avsite
], a1
, b1
, pbc_null2
);
556 constr_vsite3FAD(x
[ai
], x
[aj
], x
[ak
], x
[avsite
], a1
, b1
, pbc_null2
);
563 constr_vsite3OUT(x
[ai
], x
[aj
], x
[ak
], x
[avsite
], a1
, b1
, c1
, pbc_null2
);
571 constr_vsite4FD(x
[ai
], x
[aj
], x
[ak
], x
[al
], x
[avsite
], a1
, b1
, c1
, pbc_null2
);
579 constr_vsite4FDN(x
[ai
], x
[aj
], x
[ak
], x
[al
], x
[avsite
], a1
, b1
, c1
, pbc_null2
);
581 case F_VSITEN
: inc
= constr_vsiten(ia
, ip
, x
, pbc_null2
); break;
583 gmx_fatal(FARGS
, "No such vsite type %d in %s, line %d", ftype
, __FILE__
, __LINE__
);
586 if (pbcMode
== PbcMode::all
)
588 /* Keep the vsite in the same periodic image as before */
590 int ishift
= pbc_dx_aiuc(pbc_null
, x
[avsite
], xv
, dx
);
591 if (ishift
!= CENTRAL
)
593 rvec_add(xv
, dx
, x
[avsite
]);
598 /* Calculate velocity of vsite... */
600 rvec_sub(x
[avsite
], xv
, vv
);
601 svmul(inv_dt
, vv
, v
[avsite
]);
604 /* Increment loop variables */
612 void construct_vsites(const gmx_vsite_t
* vsite
,
616 const t_iparams ip
[],
617 const t_ilist ilist
[],
623 const bool useDomdec
= (vsite
!= nullptr && vsite
->useDomdec
);
624 GMX_ASSERT(!useDomdec
|| (cr
!= nullptr && DOMAINDECOMP(cr
)),
625 "When vsites are set up with domain decomposition, we need a valid commrec");
626 // TODO: Remove this assertion when we remove charge groups
627 GMX_ASSERT(vsite
!= nullptr || pbcType
== PbcType::No
,
628 "Without a vsite struct we can not do PBC (in case we have charge groups)");
630 t_pbc pbc
, *pbc_null
;
632 /* We only need to do pbc when we have inter-cg vsites.
633 * Note that with domain decomposition we do not need to apply PBC here
634 * when we have at least 3 domains along each dimension. Currently we
635 * do not optimize this case.
637 if (pbcType
!= PbcType::No
&& (useDomdec
|| bMolPBC
)
638 && !(vsite
!= nullptr && vsite
->numInterUpdategroupVsites
== 0))
640 /* This is wasting some CPU time as we now do this multiple times
644 clear_ivec(null_ivec
);
645 pbc_null
= set_pbc_dd(&pbc
, pbcType
, useDomdec
? cr
->dd
->numCells
: null_ivec
, FALSE
, box
);
654 dd_move_x_vsites(cr
->dd
, box
, x
);
657 if (vsite
== nullptr || vsite
->nthreads
== 1)
659 construct_vsites_thread(x
, dt
, v
, ip
, ilist
, pbc_null
);
663 #pragma omp parallel num_threads(vsite->nthreads)
667 const int th
= gmx_omp_get_thread_num();
668 const VsiteThread
& tData
= *vsite
->tData
[th
];
669 GMX_ASSERT(tData
.rangeStart
>= 0,
670 "The thread data should be initialized before calling construct_vsites");
672 construct_vsites_thread(x
, dt
, v
, ip
, tData
.ilist
, pbc_null
);
673 if (tData
.useInterdependentTask
)
675 /* Here we don't need a barrier (unlike the spreading),
676 * since both tasks only construct vsites from particles,
677 * or local vsites, not from non-local vsites.
679 construct_vsites_thread(x
, dt
, v
, ip
, tData
.idTask
.ilist
, pbc_null
);
682 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
684 /* Now we can construct the vsites that might depend on other vsites */
685 construct_vsites_thread(x
, dt
, v
, ip
, vsite
->tData
[vsite
->nthreads
]->ilist
, pbc_null
);
689 static void spread_vsite2(const t_iatom ia
[],
706 svmul(1 - a
, f
[av
], fi
);
716 ivec_sub(SHIFT_IVEC(g
, ai
), SHIFT_IVEC(g
, av
), di
);
718 ivec_sub(SHIFT_IVEC(g
, ai
), SHIFT_IVEC(g
, aj
), di
);
723 siv
= pbc_dx_aiuc(pbc
, x
[ai
], x
[av
], dx
);
724 sij
= pbc_dx_aiuc(pbc
, x
[ai
], x
[aj
], dx
);
732 if (fshift
&& (siv
!= CENTRAL
|| sij
!= CENTRAL
))
734 rvec_inc(fshift
[siv
], f
[av
]);
735 rvec_dec(fshift
[CENTRAL
], fi
);
736 rvec_dec(fshift
[sij
], fj
);
739 /* TOTAL: 13 flops */
742 void constructVsitesGlobal(const gmx_mtop_t
& mtop
, gmx::ArrayRef
<gmx::RVec
> x
)
744 GMX_ASSERT(x
.ssize() >= mtop
.natoms
, "x should contain the whole system");
745 GMX_ASSERT(!mtop
.moleculeBlockIndices
.empty(),
746 "molblock indices are needed in constructVsitesGlobal");
748 for (size_t mb
= 0; mb
< mtop
.molblock
.size(); mb
++)
750 const gmx_molblock_t
& molb
= mtop
.molblock
[mb
];
751 const gmx_moltype_t
& molt
= mtop
.moltype
[molb
.type
];
752 if (vsiteIlistNrCount(molt
.ilist
.data()) > 0)
754 int atomOffset
= mtop
.moleculeBlockIndices
[mb
].globalAtomStart
;
755 for (int mol
= 0; mol
< molb
.nmol
; mol
++)
757 t_ilist ilist
[F_NRE
];
758 for (int ftype
= c_ftypeVsiteStart
; ftype
< c_ftypeVsiteEnd
; ftype
++)
760 ilist
[ftype
].nr
= molt
.ilist
[ftype
].size();
761 ilist
[ftype
].iatoms
= const_cast<t_iatom
*>(molt
.ilist
[ftype
].iatoms
.data());
764 construct_vsites(nullptr, as_rvec_array(x
.data()) + atomOffset
, 0.0, nullptr,
765 mtop
.ffparams
.iparams
.data(), ilist
, PbcType::No
, TRUE
, nullptr, nullptr);
766 atomOffset
+= molt
.atoms
.nr
;
772 static void spread_vsite2FD(const t_iatom ia
[],
782 const int av
= ia
[1];
783 const int ai
= ia
[2];
784 const int aj
= ia
[3];
786 copy_rvec(f
[av
], fv
);
789 int sji
= pbc_rvec_sub(pbc
, x
[aj
], x
[ai
], xij
);
792 const real invDistance
= inverseNorm(xij
);
793 const real b
= a
* invDistance
;
796 const real fproj
= iprod(xij
, fv
) * invDistance
* invDistance
;
799 fj
[XX
] = b
* (fv
[XX
] - fproj
* xij
[XX
]);
800 fj
[YY
] = b
* (fv
[YY
] - fproj
* xij
[YY
]);
801 fj
[ZZ
] = b
* (fv
[ZZ
] - fproj
* xij
[ZZ
]);
804 /* b is already calculated in constr_vsite2FD
805 storing b somewhere will save flops. */
807 f
[ai
][XX
] += fv
[XX
] - fj
[XX
];
808 f
[ai
][YY
] += fv
[YY
] - fj
[YY
];
809 f
[ai
][ZZ
] += fv
[ZZ
] - fj
[ZZ
];
821 ivec_sub(SHIFT_IVEC(g
, ia
[1]), SHIFT_IVEC(g
, ai
), di
);
823 ivec_sub(SHIFT_IVEC(g
, aj
), SHIFT_IVEC(g
, ai
), di
);
829 svi
= pbc_rvec_sub(pbc
, x
[av
], x
[ai
], xvi
);
836 if (svi
!= CENTRAL
|| sji
!= CENTRAL
)
838 rvec_dec(fshift
[svi
], fv
);
839 fshift
[CENTRAL
][XX
] += fv
[XX
] - fj
[XX
];
840 fshift
[CENTRAL
][YY
] += fv
[YY
] - fj
[YY
];
841 fshift
[CENTRAL
][ZZ
] += fv
[ZZ
] - fj
[ZZ
];
842 fshift
[sji
][XX
] += fj
[XX
];
843 fshift
[sji
][YY
] += fj
[YY
];
844 fshift
[sji
][ZZ
] += fj
[ZZ
];
850 /* When VirCorr=TRUE, the virial for the current forces is not
851 * calculated from the redistributed forces. This means that
852 * the effect of non-linear virtual site constructions on the virial
853 * needs to be added separately. This contribution can be calculated
854 * in many ways, but the simplest and cheapest way is to use
855 * the first constructing atom ai as a reference position in space:
856 * subtract (xv-xi)*fv and add (xj-xi)*fj.
860 pbc_rvec_sub(pbc
, x
[av
], x
[ai
], xiv
);
862 for (int i
= 0; i
< DIM
; i
++)
864 for (int j
= 0; j
< DIM
; j
++)
866 /* As xix is a linear combination of j and k, use that here */
867 dxdf
[i
][j
] += -xiv
[i
] * fv
[j
] + xij
[i
] * fj
[j
];
872 /* TOTAL: 38 flops */
875 static void spread_vsite3(const t_iatom ia
[],
894 svmul(1 - a
- b
, f
[av
], fi
);
906 ivec_sub(SHIFT_IVEC(g
, ai
), SHIFT_IVEC(g
, ia
[1]), di
);
908 ivec_sub(SHIFT_IVEC(g
, ai
), SHIFT_IVEC(g
, aj
), di
);
910 ivec_sub(SHIFT_IVEC(g
, ai
), SHIFT_IVEC(g
, ak
), di
);
915 siv
= pbc_dx_aiuc(pbc
, x
[ai
], x
[av
], dx
);
916 sij
= pbc_dx_aiuc(pbc
, x
[ai
], x
[aj
], dx
);
917 sik
= pbc_dx_aiuc(pbc
, x
[ai
], x
[ak
], dx
);
926 if (fshift
&& (siv
!= CENTRAL
|| sij
!= CENTRAL
|| sik
!= CENTRAL
))
928 rvec_inc(fshift
[siv
], f
[av
]);
929 rvec_dec(fshift
[CENTRAL
], fi
);
930 rvec_dec(fshift
[sij
], fj
);
931 rvec_dec(fshift
[sik
], fk
);
934 /* TOTAL: 20 flops */
937 static void spread_vsite3FD(const t_iatom ia
[],
949 rvec xvi
, xij
, xjk
, xix
, fv
, temp
;
950 t_iatom av
, ai
, aj
, ak
;
958 copy_rvec(f
[av
], fv
);
960 sji
= pbc_rvec_sub(pbc
, x
[aj
], x
[ai
], xij
);
961 skj
= pbc_rvec_sub(pbc
, x
[ak
], x
[aj
], xjk
);
964 /* xix goes from i to point x on the line jk */
965 xix
[XX
] = xij
[XX
] + a
* xjk
[XX
];
966 xix
[YY
] = xij
[YY
] + a
* xjk
[YY
];
967 xix
[ZZ
] = xij
[ZZ
] + a
* xjk
[ZZ
];
970 const real invDistance
= inverseNorm(xix
);
971 const real c
= b
* invDistance
;
974 fproj
= iprod(xix
, fv
) * invDistance
* invDistance
; /* = (xix . f)/(xix . xix) */
976 temp
[XX
] = c
* (fv
[XX
] - fproj
* xix
[XX
]);
977 temp
[YY
] = c
* (fv
[YY
] - fproj
* xix
[YY
]);
978 temp
[ZZ
] = c
* (fv
[ZZ
] - fproj
* xix
[ZZ
]);
981 /* c is already calculated in constr_vsite3FD
982 storing c somewhere will save 26 flops! */
985 f
[ai
][XX
] += fv
[XX
] - temp
[XX
];
986 f
[ai
][YY
] += fv
[YY
] - temp
[YY
];
987 f
[ai
][ZZ
] += fv
[ZZ
] - temp
[ZZ
];
988 f
[aj
][XX
] += a1
* temp
[XX
];
989 f
[aj
][YY
] += a1
* temp
[YY
];
990 f
[aj
][ZZ
] += a1
* temp
[ZZ
];
991 f
[ak
][XX
] += a
* temp
[XX
];
992 f
[ak
][YY
] += a
* temp
[YY
];
993 f
[ak
][ZZ
] += a
* temp
[ZZ
];
998 ivec_sub(SHIFT_IVEC(g
, ia
[1]), SHIFT_IVEC(g
, ai
), di
);
1000 ivec_sub(SHIFT_IVEC(g
, aj
), SHIFT_IVEC(g
, ai
), di
);
1002 ivec_sub(SHIFT_IVEC(g
, ak
), SHIFT_IVEC(g
, aj
), di
);
1007 svi
= pbc_rvec_sub(pbc
, x
[av
], x
[ai
], xvi
);
1014 if (fshift
&& (svi
!= CENTRAL
|| sji
!= CENTRAL
|| skj
!= CENTRAL
))
1016 rvec_dec(fshift
[svi
], fv
);
1017 fshift
[CENTRAL
][XX
] += fv
[XX
] - (1 + a
) * temp
[XX
];
1018 fshift
[CENTRAL
][YY
] += fv
[YY
] - (1 + a
) * temp
[YY
];
1019 fshift
[CENTRAL
][ZZ
] += fv
[ZZ
] - (1 + a
) * temp
[ZZ
];
1020 fshift
[sji
][XX
] += temp
[XX
];
1021 fshift
[sji
][YY
] += temp
[YY
];
1022 fshift
[sji
][ZZ
] += temp
[ZZ
];
1023 fshift
[skj
][XX
] += a
* temp
[XX
];
1024 fshift
[skj
][YY
] += a
* temp
[YY
];
1025 fshift
[skj
][ZZ
] += a
* temp
[ZZ
];
1030 /* When VirCorr=TRUE, the virial for the current forces is not
1031 * calculated from the redistributed forces. This means that
1032 * the effect of non-linear virtual site constructions on the virial
1033 * needs to be added separately. This contribution can be calculated
1034 * in many ways, but the simplest and cheapest way is to use
1035 * the first constructing atom ai as a reference position in space:
1036 * subtract (xv-xi)*fv and add (xj-xi)*fj + (xk-xi)*fk.
1040 pbc_rvec_sub(pbc
, x
[av
], x
[ai
], xiv
);
1042 for (int i
= 0; i
< DIM
; i
++)
1044 for (int j
= 0; j
< DIM
; j
++)
1046 /* As xix is a linear combination of j and k, use that here */
1047 dxdf
[i
][j
] += -xiv
[i
] * fv
[j
] + xix
[i
] * temp
[j
];
1052 /* TOTAL: 61 flops */
1055 static void spread_vsite3FAD(const t_iatom ia
[],
1066 rvec xvi
, xij
, xjk
, xperp
, Fpij
, Fppp
, fv
, f1
, f2
, f3
;
1067 real a1
, b1
, c1
, c2
, invdij
, invdij2
, invdp
, fproj
;
1068 t_iatom av
, ai
, aj
, ak
;
1069 int svi
, sji
, skj
, d
;
1076 copy_rvec(f
[ia
[1]], fv
);
1078 sji
= pbc_rvec_sub(pbc
, x
[aj
], x
[ai
], xij
);
1079 skj
= pbc_rvec_sub(pbc
, x
[ak
], x
[aj
], xjk
);
1082 invdij
= inverseNorm(xij
);
1083 invdij2
= invdij
* invdij
;
1084 c1
= iprod(xij
, xjk
) * invdij2
;
1085 xperp
[XX
] = xjk
[XX
] - c1
* xij
[XX
];
1086 xperp
[YY
] = xjk
[YY
] - c1
* xij
[YY
];
1087 xperp
[ZZ
] = xjk
[ZZ
] - c1
* xij
[ZZ
];
1088 /* xperp in plane ijk, perp. to ij */
1089 invdp
= inverseNorm(xperp
);
1094 /* a1, b1 and c1 are already calculated in constr_vsite3FAD
1095 storing them somewhere will save 45 flops! */
1097 fproj
= iprod(xij
, fv
) * invdij2
;
1098 svmul(fproj
, xij
, Fpij
); /* proj. f on xij */
1099 svmul(iprod(xperp
, fv
) * invdp
* invdp
, xperp
, Fppp
); /* proj. f on xperp */
1100 svmul(b1
* fproj
, xperp
, f3
);
1103 rvec_sub(fv
, Fpij
, f1
); /* f1 = f - Fpij */
1104 rvec_sub(f1
, Fppp
, f2
); /* f2 = f - Fpij - Fppp */
1105 for (d
= 0; (d
< DIM
); d
++)
1113 f
[ai
][XX
] += fv
[XX
] - f1
[XX
] + c1
* f2
[XX
] + f3
[XX
];
1114 f
[ai
][YY
] += fv
[YY
] - f1
[YY
] + c1
* f2
[YY
] + f3
[YY
];
1115 f
[ai
][ZZ
] += fv
[ZZ
] - f1
[ZZ
] + c1
* f2
[ZZ
] + f3
[ZZ
];
1116 f
[aj
][XX
] += f1
[XX
] - c2
* f2
[XX
] - f3
[XX
];
1117 f
[aj
][YY
] += f1
[YY
] - c2
* f2
[YY
] - f3
[YY
];
1118 f
[aj
][ZZ
] += f1
[ZZ
] - c2
* f2
[ZZ
] - f3
[ZZ
];
1119 f
[ak
][XX
] += f2
[XX
];
1120 f
[ak
][YY
] += f2
[YY
];
1121 f
[ak
][ZZ
] += f2
[ZZ
];
1126 ivec_sub(SHIFT_IVEC(g
, ia
[1]), SHIFT_IVEC(g
, ai
), di
);
1128 ivec_sub(SHIFT_IVEC(g
, aj
), SHIFT_IVEC(g
, ai
), di
);
1130 ivec_sub(SHIFT_IVEC(g
, ak
), SHIFT_IVEC(g
, aj
), di
);
1135 svi
= pbc_rvec_sub(pbc
, x
[av
], x
[ai
], xvi
);
1142 if (fshift
&& (svi
!= CENTRAL
|| sji
!= CENTRAL
|| skj
!= CENTRAL
))
1144 rvec_dec(fshift
[svi
], fv
);
1145 fshift
[CENTRAL
][XX
] += fv
[XX
] - f1
[XX
] - (1 - c1
) * f2
[XX
] + f3
[XX
];
1146 fshift
[CENTRAL
][YY
] += fv
[YY
] - f1
[YY
] - (1 - c1
) * f2
[YY
] + f3
[YY
];
1147 fshift
[CENTRAL
][ZZ
] += fv
[ZZ
] - f1
[ZZ
] - (1 - c1
) * f2
[ZZ
] + f3
[ZZ
];
1148 fshift
[sji
][XX
] += f1
[XX
] - c1
* f2
[XX
] - f3
[XX
];
1149 fshift
[sji
][YY
] += f1
[YY
] - c1
* f2
[YY
] - f3
[YY
];
1150 fshift
[sji
][ZZ
] += f1
[ZZ
] - c1
* f2
[ZZ
] - f3
[ZZ
];
1151 fshift
[skj
][XX
] += f2
[XX
];
1152 fshift
[skj
][YY
] += f2
[YY
];
1153 fshift
[skj
][ZZ
] += f2
[ZZ
];
1161 pbc_rvec_sub(pbc
, x
[av
], x
[ai
], xiv
);
1163 for (i
= 0; i
< DIM
; i
++)
1165 for (j
= 0; j
< DIM
; j
++)
1167 /* Note that xik=xij+xjk, so we have to add xij*f2 */
1168 dxdf
[i
][j
] += -xiv
[i
] * fv
[j
] + xij
[i
] * (f1
[j
] + (1 - c2
) * f2
[j
] - f3
[j
])
1174 /* TOTAL: 113 flops */
1177 static void spread_vsite3OUT(const t_iatom ia
[],
1189 rvec xvi
, xij
, xik
, fv
, fj
, fk
;
1200 sji
= pbc_rvec_sub(pbc
, x
[aj
], x
[ai
], xij
);
1201 ski
= pbc_rvec_sub(pbc
, x
[ak
], x
[ai
], xik
);
1204 copy_rvec(f
[av
], fv
);
1211 fj
[XX
] = a
* fv
[XX
] - xik
[ZZ
] * cfy
+ xik
[YY
] * cfz
;
1212 fj
[YY
] = xik
[ZZ
] * cfx
+ a
* fv
[YY
] - xik
[XX
] * cfz
;
1213 fj
[ZZ
] = -xik
[YY
] * cfx
+ xik
[XX
] * cfy
+ a
* fv
[ZZ
];
1215 fk
[XX
] = b
* fv
[XX
] + xij
[ZZ
] * cfy
- xij
[YY
] * cfz
;
1216 fk
[YY
] = -xij
[ZZ
] * cfx
+ b
* fv
[YY
] + xij
[XX
] * cfz
;
1217 fk
[ZZ
] = xij
[YY
] * cfx
- xij
[XX
] * cfy
+ b
* fv
[ZZ
];
1220 f
[ai
][XX
] += fv
[XX
] - fj
[XX
] - fk
[XX
];
1221 f
[ai
][YY
] += fv
[YY
] - fj
[YY
] - fk
[YY
];
1222 f
[ai
][ZZ
] += fv
[ZZ
] - fj
[ZZ
] - fk
[ZZ
];
1223 rvec_inc(f
[aj
], fj
);
1224 rvec_inc(f
[ak
], fk
);
1229 ivec_sub(SHIFT_IVEC(g
, ia
[1]), SHIFT_IVEC(g
, ai
), di
);
1231 ivec_sub(SHIFT_IVEC(g
, aj
), SHIFT_IVEC(g
, ai
), di
);
1233 ivec_sub(SHIFT_IVEC(g
, ak
), SHIFT_IVEC(g
, ai
), di
);
1238 svi
= pbc_rvec_sub(pbc
, x
[av
], x
[ai
], xvi
);
1245 if (fshift
&& (svi
!= CENTRAL
|| sji
!= CENTRAL
|| ski
!= CENTRAL
))
1247 rvec_dec(fshift
[svi
], fv
);
1248 fshift
[CENTRAL
][XX
] += fv
[XX
] - fj
[XX
] - fk
[XX
];
1249 fshift
[CENTRAL
][YY
] += fv
[YY
] - fj
[YY
] - fk
[YY
];
1250 fshift
[CENTRAL
][ZZ
] += fv
[ZZ
] - fj
[ZZ
] - fk
[ZZ
];
1251 rvec_inc(fshift
[sji
], fj
);
1252 rvec_inc(fshift
[ski
], fk
);
1259 pbc_rvec_sub(pbc
, x
[av
], x
[ai
], xiv
);
1261 for (int i
= 0; i
< DIM
; i
++)
1263 for (int j
= 0; j
< DIM
; j
++)
1265 dxdf
[i
][j
] += -xiv
[i
] * fv
[j
] + xij
[i
] * fj
[j
] + xik
[i
] * fk
[j
];
1270 /* TOTAL: 54 flops */
1273 static void spread_vsite4FD(const t_iatom ia
[],
1286 rvec xvi
, xij
, xjk
, xjl
, xix
, fv
, temp
;
1287 int av
, ai
, aj
, ak
, al
;
1289 int svi
, sji
, skj
, slj
, m
;
1297 sji
= pbc_rvec_sub(pbc
, x
[aj
], x
[ai
], xij
);
1298 skj
= pbc_rvec_sub(pbc
, x
[ak
], x
[aj
], xjk
);
1299 slj
= pbc_rvec_sub(pbc
, x
[al
], x
[aj
], xjl
);
1302 /* xix goes from i to point x on the plane jkl */
1303 for (m
= 0; m
< DIM
; m
++)
1305 xix
[m
] = xij
[m
] + a
* xjk
[m
] + b
* xjl
[m
];
1309 const real invDistance
= inverseNorm(xix
);
1310 const real d
= c
* invDistance
;
1311 /* 4 + ?10? flops */
1313 copy_rvec(f
[av
], fv
);
1315 fproj
= iprod(xix
, fv
) * invDistance
* invDistance
; /* = (xix . f)/(xix . xix) */
1317 for (m
= 0; m
< DIM
; m
++)
1319 temp
[m
] = d
* (fv
[m
] - fproj
* xix
[m
]);
1323 /* c is already calculated in constr_vsite3FD
1324 storing c somewhere will save 35 flops! */
1327 for (m
= 0; m
< DIM
; m
++)
1329 f
[ai
][m
] += fv
[m
] - temp
[m
];
1330 f
[aj
][m
] += a1
* temp
[m
];
1331 f
[ak
][m
] += a
* temp
[m
];
1332 f
[al
][m
] += b
* temp
[m
];
1338 ivec_sub(SHIFT_IVEC(g
, ia
[1]), SHIFT_IVEC(g
, ai
), di
);
1340 ivec_sub(SHIFT_IVEC(g
, aj
), SHIFT_IVEC(g
, ai
), di
);
1342 ivec_sub(SHIFT_IVEC(g
, ak
), SHIFT_IVEC(g
, aj
), di
);
1344 ivec_sub(SHIFT_IVEC(g
, al
), SHIFT_IVEC(g
, aj
), di
);
1349 svi
= pbc_rvec_sub(pbc
, x
[av
], x
[ai
], xvi
);
1356 if (fshift
&& (svi
!= CENTRAL
|| sji
!= CENTRAL
|| skj
!= CENTRAL
|| slj
!= CENTRAL
))
1358 rvec_dec(fshift
[svi
], fv
);
1359 for (m
= 0; m
< DIM
; m
++)
1361 fshift
[CENTRAL
][m
] += fv
[m
] - (1 + a
+ b
) * temp
[m
];
1362 fshift
[sji
][m
] += temp
[m
];
1363 fshift
[skj
][m
] += a
* temp
[m
];
1364 fshift
[slj
][m
] += b
* temp
[m
];
1373 pbc_rvec_sub(pbc
, x
[av
], x
[ai
], xiv
);
1375 for (i
= 0; i
< DIM
; i
++)
1377 for (j
= 0; j
< DIM
; j
++)
1379 dxdf
[i
][j
] += -xiv
[i
] * fv
[j
] + xix
[i
] * temp
[j
];
1384 /* TOTAL: 77 flops */
1388 static void spread_vsite4FDN(const t_iatom ia
[],
1400 rvec xvi
, xij
, xik
, xil
, ra
, rb
, rja
, rjb
, rab
, rm
, rt
;
1401 rvec fv
, fj
, fk
, fl
;
1405 int av
, ai
, aj
, ak
, al
;
1406 int svi
, sij
, sik
, sil
;
1408 /* DEBUG: check atom indices */
1415 copy_rvec(f
[av
], fv
);
1417 sij
= pbc_rvec_sub(pbc
, x
[aj
], x
[ai
], xij
);
1418 sik
= pbc_rvec_sub(pbc
, x
[ak
], x
[ai
], xik
);
1419 sil
= pbc_rvec_sub(pbc
, x
[al
], x
[ai
], xil
);
1422 ra
[XX
] = a
* xik
[XX
];
1423 ra
[YY
] = a
* xik
[YY
];
1424 ra
[ZZ
] = a
* xik
[ZZ
];
1426 rb
[XX
] = b
* xil
[XX
];
1427 rb
[YY
] = b
* xil
[YY
];
1428 rb
[ZZ
] = b
* xil
[ZZ
];
1432 rvec_sub(ra
, xij
, rja
);
1433 rvec_sub(rb
, xij
, rjb
);
1434 rvec_sub(rb
, ra
, rab
);
1437 cprod(rja
, rjb
, rm
);
1440 invrm
= inverseNorm(rm
);
1441 denom
= invrm
* invrm
;
1444 cfx
= c
* invrm
* fv
[XX
];
1445 cfy
= c
* invrm
* fv
[YY
];
1446 cfz
= c
* invrm
* fv
[ZZ
];
1457 fj
[XX
] = (-rm
[XX
] * rt
[XX
]) * cfx
+ (rab
[ZZ
] - rm
[YY
] * rt
[XX
]) * cfy
1458 + (-rab
[YY
] - rm
[ZZ
] * rt
[XX
]) * cfz
;
1459 fj
[YY
] = (-rab
[ZZ
] - rm
[XX
] * rt
[YY
]) * cfx
+ (-rm
[YY
] * rt
[YY
]) * cfy
1460 + (rab
[XX
] - rm
[ZZ
] * rt
[YY
]) * cfz
;
1461 fj
[ZZ
] = (rab
[YY
] - rm
[XX
] * rt
[ZZ
]) * cfx
+ (-rab
[XX
] - rm
[YY
] * rt
[ZZ
]) * cfy
1462 + (-rm
[ZZ
] * rt
[ZZ
]) * cfz
;
1468 rt
[XX
] *= denom
* a
;
1469 rt
[YY
] *= denom
* a
;
1470 rt
[ZZ
] *= denom
* a
;
1473 fk
[XX
] = (-rm
[XX
] * rt
[XX
]) * cfx
+ (-a
* rjb
[ZZ
] - rm
[YY
] * rt
[XX
]) * cfy
1474 + (a
* rjb
[YY
] - rm
[ZZ
] * rt
[XX
]) * cfz
;
1475 fk
[YY
] = (a
* rjb
[ZZ
] - rm
[XX
] * rt
[YY
]) * cfx
+ (-rm
[YY
] * rt
[YY
]) * cfy
1476 + (-a
* rjb
[XX
] - rm
[ZZ
] * rt
[YY
]) * cfz
;
1477 fk
[ZZ
] = (-a
* rjb
[YY
] - rm
[XX
] * rt
[ZZ
]) * cfx
+ (a
* rjb
[XX
] - rm
[YY
] * rt
[ZZ
]) * cfy
1478 + (-rm
[ZZ
] * rt
[ZZ
]) * cfz
;
1484 rt
[XX
] *= denom
* b
;
1485 rt
[YY
] *= denom
* b
;
1486 rt
[ZZ
] *= denom
* b
;
1489 fl
[XX
] = (-rm
[XX
] * rt
[XX
]) * cfx
+ (b
* rja
[ZZ
] - rm
[YY
] * rt
[XX
]) * cfy
1490 + (-b
* rja
[YY
] - rm
[ZZ
] * rt
[XX
]) * cfz
;
1491 fl
[YY
] = (-b
* rja
[ZZ
] - rm
[XX
] * rt
[YY
]) * cfx
+ (-rm
[YY
] * rt
[YY
]) * cfy
1492 + (b
* rja
[XX
] - rm
[ZZ
] * rt
[YY
]) * cfz
;
1493 fl
[ZZ
] = (b
* rja
[YY
] - rm
[XX
] * rt
[ZZ
]) * cfx
+ (-b
* rja
[XX
] - rm
[YY
] * rt
[ZZ
]) * cfy
1494 + (-rm
[ZZ
] * rt
[ZZ
]) * cfz
;
1497 f
[ai
][XX
] += fv
[XX
] - fj
[XX
] - fk
[XX
] - fl
[XX
];
1498 f
[ai
][YY
] += fv
[YY
] - fj
[YY
] - fk
[YY
] - fl
[YY
];
1499 f
[ai
][ZZ
] += fv
[ZZ
] - fj
[ZZ
] - fk
[ZZ
] - fl
[ZZ
];
1500 rvec_inc(f
[aj
], fj
);
1501 rvec_inc(f
[ak
], fk
);
1502 rvec_inc(f
[al
], fl
);
1507 ivec_sub(SHIFT_IVEC(g
, av
), SHIFT_IVEC(g
, ai
), di
);
1509 ivec_sub(SHIFT_IVEC(g
, aj
), SHIFT_IVEC(g
, ai
), di
);
1511 ivec_sub(SHIFT_IVEC(g
, ak
), SHIFT_IVEC(g
, ai
), di
);
1513 ivec_sub(SHIFT_IVEC(g
, al
), SHIFT_IVEC(g
, ai
), di
);
1518 svi
= pbc_rvec_sub(pbc
, x
[av
], x
[ai
], xvi
);
1525 if (fshift
&& (svi
!= CENTRAL
|| sij
!= CENTRAL
|| sik
!= CENTRAL
|| sil
!= CENTRAL
))
1527 rvec_dec(fshift
[svi
], fv
);
1528 fshift
[CENTRAL
][XX
] += fv
[XX
] - fj
[XX
] - fk
[XX
] - fl
[XX
];
1529 fshift
[CENTRAL
][YY
] += fv
[YY
] - fj
[YY
] - fk
[YY
] - fl
[YY
];
1530 fshift
[CENTRAL
][ZZ
] += fv
[ZZ
] - fj
[ZZ
] - fk
[ZZ
] - fl
[ZZ
];
1531 rvec_inc(fshift
[sij
], fj
);
1532 rvec_inc(fshift
[sik
], fk
);
1533 rvec_inc(fshift
[sil
], fl
);
1541 pbc_rvec_sub(pbc
, x
[av
], x
[ai
], xiv
);
1543 for (i
= 0; i
< DIM
; i
++)
1545 for (j
= 0; j
< DIM
; j
++)
1547 dxdf
[i
][j
] += -xiv
[i
] * fv
[j
] + xij
[i
] * fj
[j
] + xik
[i
] * fk
[j
] + xil
[i
] * fl
[j
];
1552 /* Total: 207 flops (Yuck!) */
1556 static int spread_vsiten(const t_iatom ia
[],
1557 const t_iparams ip
[],
1570 n3
= 3 * ip
[ia
[0]].vsiten
.n
;
1572 copy_rvec(x
[av
], xv
);
1574 for (i
= 0; i
< n3
; i
+= 3)
1579 ivec_sub(SHIFT_IVEC(g
, ai
), SHIFT_IVEC(g
, av
), di
);
1584 siv
= pbc_dx_aiuc(pbc
, x
[ai
], xv
, dx
);
1590 a
= ip
[ia
[i
]].vsiten
.a
;
1591 svmul(a
, f
[av
], fi
);
1592 rvec_inc(f
[ai
], fi
);
1593 if (fshift
&& siv
!= CENTRAL
)
1595 rvec_inc(fshift
[siv
], fi
);
1596 rvec_dec(fshift
[CENTRAL
], fi
);
1605 static int vsite_count(const t_ilist
* ilist
, int ftype
)
1607 if (ftype
== F_VSITEN
)
1609 return ilist
[ftype
].nr
/ 3;
1613 return ilist
[ftype
].nr
/ (1 + interaction_function
[ftype
].nratoms
);
1617 static void spread_vsite_f_thread(const rvec x
[],
1623 const t_ilist ilist
[],
1625 const t_pbc
* pbc_null
)
1627 const PbcMode pbcMode
= getPbcMode(pbc_null
);
1628 /* We need another pbc pointer, as with charge groups we switch per vsite */
1629 const t_pbc
* pbc_null2
= pbc_null
;
1630 gmx::ArrayRef
<const int> vsite_pbc
;
1632 /* this loop goes backwards to be able to build *
1633 * higher type vsites from lower types */
1634 for (int ftype
= c_ftypeVsiteEnd
- 1; ftype
>= c_ftypeVsiteStart
; ftype
--)
1636 if (ilist
[ftype
].nr
== 0)
1642 int nra
= interaction_function
[ftype
].nratoms
;
1644 int nr
= ilist
[ftype
].nr
;
1646 const t_iatom
* ia
= ilist
[ftype
].iatoms
;
1648 if (pbcMode
== PbcMode::all
)
1650 pbc_null2
= pbc_null
;
1653 for (int i
= 0; i
< nr
;)
1657 /* Constants for constructing */
1659 a1
= ip
[tp
].vsite
.a
;
1660 /* Construct the vsite depending on type */
1663 case F_VSITE2
: spread_vsite2(ia
, a1
, x
, f
, fshift
, pbc_null2
, g
); break;
1665 spread_vsite2FD(ia
, a1
, x
, f
, fshift
, VirCorr
, dxdf
, pbc_null2
, g
);
1668 b1
= ip
[tp
].vsite
.b
;
1669 spread_vsite3(ia
, a1
, b1
, x
, f
, fshift
, pbc_null2
, g
);
1672 b1
= ip
[tp
].vsite
.b
;
1673 spread_vsite3FD(ia
, a1
, b1
, x
, f
, fshift
, VirCorr
, dxdf
, pbc_null2
, g
);
1676 b1
= ip
[tp
].vsite
.b
;
1677 spread_vsite3FAD(ia
, a1
, b1
, x
, f
, fshift
, VirCorr
, dxdf
, pbc_null2
, g
);
1680 b1
= ip
[tp
].vsite
.b
;
1681 c1
= ip
[tp
].vsite
.c
;
1682 spread_vsite3OUT(ia
, a1
, b1
, c1
, x
, f
, fshift
, VirCorr
, dxdf
, pbc_null2
, g
);
1685 b1
= ip
[tp
].vsite
.b
;
1686 c1
= ip
[tp
].vsite
.c
;
1687 spread_vsite4FD(ia
, a1
, b1
, c1
, x
, f
, fshift
, VirCorr
, dxdf
, pbc_null2
, g
);
1690 b1
= ip
[tp
].vsite
.b
;
1691 c1
= ip
[tp
].vsite
.c
;
1692 spread_vsite4FDN(ia
, a1
, b1
, c1
, x
, f
, fshift
, VirCorr
, dxdf
, pbc_null2
, g
);
1694 case F_VSITEN
: inc
= spread_vsiten(ia
, ip
, x
, f
, fshift
, pbc_null2
, g
); break;
1696 gmx_fatal(FARGS
, "No such vsite type %d in %s, line %d", ftype
, __FILE__
, __LINE__
);
1698 clear_rvec(f
[ia
[1]]);
1700 /* Increment loop variables */
1708 /*! \brief Clears the task force buffer elements that are written by task idTask */
1709 static void clearTaskForceBufferUsedElements(InterdependentTask
* idTask
)
1711 int ntask
= idTask
->spreadTask
.size();
1712 for (int ti
= 0; ti
< ntask
; ti
++)
1714 const AtomIndex
* atomList
= &idTask
->atomIndex
[idTask
->spreadTask
[ti
]];
1715 int natom
= atomList
->atom
.size();
1716 RVec
* force
= idTask
->force
.data();
1717 for (int i
= 0; i
< natom
; i
++)
1719 clear_rvec(force
[atomList
->atom
[i
]]);
1724 void spread_vsite_f(const gmx_vsite_t
* vsite
,
1725 const rvec
* gmx_restrict x
,
1726 rvec
* gmx_restrict f
,
1727 rvec
* gmx_restrict fshift
,
1736 const t_commrec
* cr
,
1737 gmx_wallcycle
* wcycle
)
1739 wallcycle_start(wcycle
, ewcVSITESPREAD
);
1740 const bool useDomdec
= vsite
->useDomdec
;
1741 GMX_ASSERT(!useDomdec
|| (cr
!= nullptr && DOMAINDECOMP(cr
)),
1742 "When vsites are set up with domain decomposition, we need a valid commrec");
1744 t_pbc pbc
, *pbc_null
;
1746 /* We only need to do pbc when we have inter-cg vsites */
1747 if ((useDomdec
|| bMolPBC
) && vsite
->numInterUpdategroupVsites
)
1749 /* This is wasting some CPU time as we now do this multiple times
1752 pbc_null
= set_pbc_dd(&pbc
, pbcType
, useDomdec
? cr
->dd
->numCells
: nullptr, FALSE
, box
);
1761 dd_clear_f_vsites(cr
->dd
, f
);
1764 if (vsite
->nthreads
== 1)
1771 spread_vsite_f_thread(x
, f
, fshift
, VirCorr
, dxdf
, idef
->iparams
, idef
->il
, g
, pbc_null
);
1775 for (int i
= 0; i
< DIM
; i
++)
1777 for (int j
= 0; j
< DIM
; j
++)
1779 vir
[i
][j
] += -0.5 * dxdf
[i
][j
];
1786 /* First spread the vsites that might depend on non-local vsites */
1789 clear_mat(vsite
->tData
[vsite
->nthreads
]->dxdf
);
1791 spread_vsite_f_thread(x
, f
, fshift
, VirCorr
, vsite
->tData
[vsite
->nthreads
]->dxdf
,
1792 idef
->iparams
, vsite
->tData
[vsite
->nthreads
]->ilist
, g
, pbc_null
);
1794 #pragma omp parallel num_threads(vsite->nthreads)
1798 int thread
= gmx_omp_get_thread_num();
1799 VsiteThread
& tData
= *vsite
->tData
[thread
];
1802 if (thread
== 0 || fshift
== nullptr)
1808 fshift_t
= tData
.fshift
;
1810 for (int i
= 0; i
< SHIFTS
; i
++)
1812 clear_rvec(fshift_t
[i
]);
1817 clear_mat(tData
.dxdf
);
1820 if (tData
.useInterdependentTask
)
1822 /* Spread the vsites that spread outside our local range.
1823 * This is done using a thread-local force buffer force.
1824 * First we need to copy the input vsite forces to force.
1826 InterdependentTask
* idTask
= &tData
.idTask
;
1828 /* Clear the buffer elements set by our task during
1829 * the last call to spread_vsite_f.
1831 clearTaskForceBufferUsedElements(idTask
);
1833 int nvsite
= idTask
->vsite
.size();
1834 for (int i
= 0; i
< nvsite
; i
++)
1836 copy_rvec(f
[idTask
->vsite
[i
]], idTask
->force
[idTask
->vsite
[i
]]);
1838 spread_vsite_f_thread(x
, as_rvec_array(idTask
->force
.data()), fshift_t
, VirCorr
,
1839 tData
.dxdf
, idef
->iparams
, tData
.idTask
.ilist
, g
, pbc_null
);
1841 /* We need a barrier before reducing forces below
1842 * that have been produced by a different thread above.
1846 /* Loop over all thread task and reduce forces they
1847 * produced on atoms that fall in our range.
1848 * Note that atomic reduction would be a simpler solution,
1849 * but that might not have good support on all platforms.
1851 int ntask
= idTask
->reduceTask
.size();
1852 for (int ti
= 0; ti
< ntask
; ti
++)
1854 const InterdependentTask
* idt_foreign
=
1855 &vsite
->tData
[idTask
->reduceTask
[ti
]]->idTask
;
1856 const AtomIndex
* atomList
= &idt_foreign
->atomIndex
[thread
];
1857 const RVec
* f_foreign
= idt_foreign
->force
.data();
1859 int natom
= atomList
->atom
.size();
1860 for (int i
= 0; i
< natom
; i
++)
1862 int ind
= atomList
->atom
[i
];
1863 rvec_inc(f
[ind
], f_foreign
[ind
]);
1864 /* Clearing of f_foreign is done at the next step */
1867 /* Clear the vsite forces, both in f and force */
1868 for (int i
= 0; i
< nvsite
; i
++)
1870 int ind
= tData
.idTask
.vsite
[i
];
1872 clear_rvec(tData
.idTask
.force
[ind
]);
1876 /* Spread the vsites that spread locally only */
1877 spread_vsite_f_thread(x
, f
, fshift_t
, VirCorr
, tData
.dxdf
, idef
->iparams
,
1878 tData
.ilist
, g
, pbc_null
);
1880 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
1883 if (fshift
!= nullptr)
1885 for (int th
= 1; th
< vsite
->nthreads
; th
++)
1887 for (int i
= 0; i
< SHIFTS
; i
++)
1889 rvec_inc(fshift
[i
], vsite
->tData
[th
]->fshift
[i
]);
1896 for (int th
= 0; th
< vsite
->nthreads
+ 1; th
++)
1898 /* MSVC doesn't like matrix references, so we use a pointer */
1899 const matrix
* dxdf
= &vsite
->tData
[th
]->dxdf
;
1901 for (int i
= 0; i
< DIM
; i
++)
1903 for (int j
= 0; j
< DIM
; j
++)
1905 vir
[i
][j
] += -0.5 * (*dxdf
)[i
][j
];
1914 dd_move_f_vsites(cr
->dd
, f
, fshift
);
1917 inc_nrnb(nrnb
, eNR_VSITE2
, vsite_count(idef
->il
, F_VSITE2
));
1918 inc_nrnb(nrnb
, eNR_VSITE2FD
, vsite_count(idef
->il
, F_VSITE2FD
));
1919 inc_nrnb(nrnb
, eNR_VSITE3
, vsite_count(idef
->il
, F_VSITE3
));
1920 inc_nrnb(nrnb
, eNR_VSITE3FD
, vsite_count(idef
->il
, F_VSITE3FD
));
1921 inc_nrnb(nrnb
, eNR_VSITE3FAD
, vsite_count(idef
->il
, F_VSITE3FAD
));
1922 inc_nrnb(nrnb
, eNR_VSITE3OUT
, vsite_count(idef
->il
, F_VSITE3OUT
));
1923 inc_nrnb(nrnb
, eNR_VSITE4FD
, vsite_count(idef
->il
, F_VSITE4FD
));
1924 inc_nrnb(nrnb
, eNR_VSITE4FDN
, vsite_count(idef
->il
, F_VSITE4FDN
));
1925 inc_nrnb(nrnb
, eNR_VSITEN
, vsite_count(idef
->il
, F_VSITEN
));
1927 wallcycle_stop(wcycle
, ewcVSITESPREAD
);
1930 /*! \brief Returns the an array with group indices for each atom
1932 * \param[in] grouping The paritioning of the atom range into atom groups
1934 static std::vector
<int> makeAtomToGroupMapping(const gmx::RangePartitioning
& grouping
)
1936 std::vector
<int> atomToGroup(grouping
.fullRange().end(), 0);
1938 for (int group
= 0; group
< grouping
.numBlocks(); group
++)
1940 auto block
= grouping
.block(group
);
1941 std::fill(atomToGroup
.begin() + block
.begin(), atomToGroup
.begin() + block
.end(), group
);
1947 int countNonlinearVsites(const gmx_mtop_t
& mtop
)
1949 int numNonlinearVsites
= 0;
1950 for (const gmx_molblock_t
& molb
: mtop
.molblock
)
1952 const gmx_moltype_t
& molt
= mtop
.moltype
[molb
.type
];
1954 for (const auto& ilist
: extractILists(molt
.ilist
, IF_VSITE
))
1956 if (ilist
.functionType
!= F_VSITE2
&& ilist
.functionType
!= F_VSITE3
1957 && ilist
.functionType
!= F_VSITEN
)
1959 numNonlinearVsites
+= molb
.nmol
* ilist
.iatoms
.size() / (1 + NRAL(ilist
.functionType
));
1964 return numNonlinearVsites
;
1967 int countInterUpdategroupVsites(const gmx_mtop_t
& mtop
,
1968 gmx::ArrayRef
<const gmx::RangePartitioning
> updateGroupingPerMoleculetype
)
1970 int n_intercg_vsite
= 0;
1971 for (const gmx_molblock_t
& molb
: mtop
.molblock
)
1973 const gmx_moltype_t
& molt
= mtop
.moltype
[molb
.type
];
1975 std::vector
<int> atomToGroup
;
1976 if (!updateGroupingPerMoleculetype
.empty())
1978 atomToGroup
= makeAtomToGroupMapping(updateGroupingPerMoleculetype
[molb
.type
]);
1980 for (int ftype
= c_ftypeVsiteStart
; ftype
< c_ftypeVsiteEnd
; ftype
++)
1982 const int nral
= NRAL(ftype
);
1983 const InteractionList
& il
= molt
.ilist
[ftype
];
1984 for (int i
= 0; i
< il
.size(); i
+= 1 + nral
)
1986 bool isInterGroup
= atomToGroup
.empty();
1989 const int group
= atomToGroup
[il
.iatoms
[1 + i
]];
1990 for (int a
= 1; a
< nral
; a
++)
1992 if (atomToGroup
[il
.iatoms
[1 + a
]] != group
)
1994 isInterGroup
= true;
2001 n_intercg_vsite
+= molb
.nmol
;
2007 return n_intercg_vsite
;
2010 std::unique_ptr
<gmx_vsite_t
> initVsite(const gmx_mtop_t
& mtop
, const t_commrec
* cr
)
2012 GMX_RELEASE_ASSERT(cr
!= nullptr, "We need a valid commrec");
2014 std::unique_ptr
<gmx_vsite_t
> vsite
;
2016 /* check if there are vsites */
2018 for (int ftype
= 0; ftype
< F_NRE
; ftype
++)
2020 if (interaction_function
[ftype
].flags
& IF_VSITE
)
2022 GMX_ASSERT(ftype
>= c_ftypeVsiteStart
&& ftype
< c_ftypeVsiteEnd
,
2023 "c_ftypeVsiteStart and/or c_ftypeVsiteEnd do not have correct values");
2025 nvsite
+= gmx_mtop_ftype_count(&mtop
, ftype
);
2029 GMX_ASSERT(ftype
< c_ftypeVsiteStart
|| ftype
>= c_ftypeVsiteEnd
,
2030 "c_ftypeVsiteStart and/or c_ftypeVsiteEnd do not have correct values");
2039 vsite
= std::make_unique
<gmx_vsite_t
>();
2041 gmx::ArrayRef
<const gmx::RangePartitioning
> updateGroupingPerMoleculetype
;
2042 if (DOMAINDECOMP(cr
))
2044 updateGroupingPerMoleculetype
= getUpdateGroupingPerMoleculetype(*cr
->dd
);
2046 vsite
->numInterUpdategroupVsites
= countInterUpdategroupVsites(mtop
, updateGroupingPerMoleculetype
);
2048 vsite
->useDomdec
= (DOMAINDECOMP(cr
) && cr
->dd
->nnodes
> 1);
2050 vsite
->nthreads
= gmx_omp_nthreads_get(emntVSITE
);
2052 if (vsite
->nthreads
> 1)
2054 /* We need one extra thread data structure for the overlap vsites */
2055 vsite
->tData
.resize(vsite
->nthreads
+ 1);
2056 #pragma omp parallel for num_threads(vsite->nthreads) schedule(static)
2057 for (int thread
= 0; thread
< vsite
->nthreads
; thread
++)
2061 vsite
->tData
[thread
] = std::make_unique
<VsiteThread
>();
2063 InterdependentTask
& idTask
= vsite
->tData
[thread
]->idTask
;
2065 idTask
.atomIndex
.resize(vsite
->nthreads
);
2067 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
2069 if (vsite
->nthreads
> 1)
2071 vsite
->tData
[vsite
->nthreads
] = std::make_unique
<VsiteThread
>();
2078 gmx_vsite_t::gmx_vsite_t() {}
2080 gmx_vsite_t::~gmx_vsite_t() {}
2082 static inline void flagAtom(InterdependentTask
* idTask
, int atom
, int thread
, int nthread
, int natperthread
)
2084 if (!idTask
->use
[atom
])
2086 idTask
->use
[atom
] = true;
2087 thread
= atom
/ natperthread
;
2088 /* Assign all non-local atom force writes to thread 0 */
2089 if (thread
>= nthread
)
2093 idTask
->atomIndex
[thread
].atom
.push_back(atom
);
2097 /*\brief Here we try to assign all vsites that are in our local range.
2099 * Our task local atom range is tData->rangeStart - tData->rangeEnd.
2100 * Vsites that depend only on local atoms, as indicated by taskIndex[]==thread,
2101 * are assigned to task tData->ilist. Vsites that depend on non-local atoms
2102 * but not on other vsites are assigned to task tData->id_task.ilist.
2103 * taskIndex[] is set for all vsites in our range, either to our local tasks
2104 * or to the single last task as taskIndex[]=2*nthreads.
2106 static void assignVsitesToThread(VsiteThread
* tData
,
2110 gmx::ArrayRef
<int> taskIndex
,
2111 const t_ilist
* ilist
,
2112 const t_iparams
* ip
,
2113 const unsigned short* ptype
)
2115 for (int ftype
= c_ftypeVsiteStart
; ftype
< c_ftypeVsiteEnd
; ftype
++)
2117 tData
->ilist
[ftype
].nr
= 0;
2118 tData
->idTask
.ilist
[ftype
].nr
= 0;
2120 int nral1
= 1 + NRAL(ftype
);
2122 t_iatom
* iat
= ilist
[ftype
].iatoms
;
2123 for (int i
= 0; i
< ilist
[ftype
].nr
;)
2125 if (ftype
== F_VSITEN
)
2127 /* The 3 below is from 1+NRAL(ftype)=3 */
2128 inc
= ip
[iat
[i
]].vsiten
.n
* 3;
2131 if (iat
[1 + i
] < tData
->rangeStart
|| iat
[1 + i
] >= tData
->rangeEnd
)
2133 /* This vsite belongs to a different thread */
2138 /* We would like to assign this vsite to task thread,
2139 * but it might depend on atoms outside the atom range of thread
2140 * or on another vsite not assigned to task thread.
2143 if (ftype
!= F_VSITEN
)
2145 for (int j
= i
+ 2; j
< i
+ nral1
; j
++)
2147 /* Do a range check to avoid a harmless race on taskIndex */
2148 if (iat
[j
] < tData
->rangeStart
|| iat
[j
] >= tData
->rangeEnd
|| taskIndex
[iat
[j
]] != thread
)
2150 if (!tData
->useInterdependentTask
|| ptype
[iat
[j
]] == eptVSite
)
2152 /* At least one constructing atom is a vsite
2153 * that is not assigned to the same thread.
2154 * Put this vsite into a separate task.
2160 /* There are constructing atoms outside our range,
2161 * put this vsite into a second task to be executed
2162 * on the same thread. During construction no barrier
2163 * is needed between the two tasks on the same thread.
2164 * During spreading we need to run this task with
2165 * an additional thread-local intermediate force buffer
2166 * (or atomic reduction) and a barrier between the two
2169 task
= nthread
+ thread
;
2175 for (int j
= i
+ 2; j
< i
+ inc
; j
+= 3)
2177 /* Do a range check to avoid a harmless race on taskIndex */
2178 if (iat
[j
] < tData
->rangeStart
|| iat
[j
] >= tData
->rangeEnd
|| taskIndex
[iat
[j
]] != thread
)
2180 GMX_ASSERT(ptype
[iat
[j
]] != eptVSite
,
2181 "A vsite to be assigned in assignVsitesToThread has a vsite as "
2182 "a constructing atom that does not belong to our task, such "
2183 "vsites should be assigned to the single 'master' task");
2185 task
= nthread
+ thread
;
2190 /* Update this vsite's thread index entry */
2191 taskIndex
[iat
[1 + i
]] = task
;
2193 if (task
== thread
|| task
== nthread
+ thread
)
2195 /* Copy this vsite to the thread data struct of thread */
2199 il_task
= &tData
->ilist
[ftype
];
2203 il_task
= &tData
->idTask
.ilist
[ftype
];
2205 /* Ensure we have sufficient memory allocated */
2206 if (il_task
->nr
+ inc
> il_task
->nalloc
)
2208 il_task
->nalloc
= over_alloc_large(il_task
->nr
+ inc
);
2209 srenew(il_task
->iatoms
, il_task
->nalloc
);
2211 /* Copy the vsite data to the thread-task local array */
2212 for (int j
= i
; j
< i
+ inc
; j
++)
2214 il_task
->iatoms
[il_task
->nr
++] = iat
[j
];
2216 if (task
== nthread
+ thread
)
2218 /* This vsite write outside our own task force block.
2219 * Put it into the interdependent task list and flag
2220 * the atoms involved for reduction.
2222 tData
->idTask
.vsite
.push_back(iat
[i
+ 1]);
2223 if (ftype
!= F_VSITEN
)
2225 for (int j
= i
+ 2; j
< i
+ nral1
; j
++)
2227 flagAtom(&tData
->idTask
, iat
[j
], thread
, nthread
, natperthread
);
2232 for (int j
= i
+ 2; j
< i
+ inc
; j
+= 3)
2234 flagAtom(&tData
->idTask
, iat
[j
], thread
, nthread
, natperthread
);
2245 /*! \brief Assign all vsites with taskIndex[]==task to task tData */
2246 static void assignVsitesToSingleTask(VsiteThread
* tData
,
2248 gmx::ArrayRef
<const int> taskIndex
,
2249 const t_ilist
* ilist
,
2250 const t_iparams
* ip
)
2252 for (int ftype
= c_ftypeVsiteStart
; ftype
< c_ftypeVsiteEnd
; ftype
++)
2254 tData
->ilist
[ftype
].nr
= 0;
2255 tData
->idTask
.ilist
[ftype
].nr
= 0;
2257 int nral1
= 1 + NRAL(ftype
);
2259 t_iatom
* iat
= ilist
[ftype
].iatoms
;
2260 t_ilist
* il_task
= &tData
->ilist
[ftype
];
2262 for (int i
= 0; i
< ilist
[ftype
].nr
;)
2264 if (ftype
== F_VSITEN
)
2266 /* The 3 below is from 1+NRAL(ftype)=3 */
2267 inc
= ip
[iat
[i
]].vsiten
.n
* 3;
2269 /* Check if the vsite is assigned to our task */
2270 if (taskIndex
[iat
[1 + i
]] == task
)
2272 /* Ensure we have sufficient memory allocated */
2273 if (il_task
->nr
+ inc
> il_task
->nalloc
)
2275 il_task
->nalloc
= over_alloc_large(il_task
->nr
+ inc
);
2276 srenew(il_task
->iatoms
, il_task
->nalloc
);
2278 /* Copy the vsite data to the thread-task local array */
2279 for (int j
= i
; j
< i
+ inc
; j
++)
2281 il_task
->iatoms
[il_task
->nr
++] = iat
[j
];
2290 void split_vsites_over_threads(const t_ilist
* ilist
, const t_iparams
* ip
, const t_mdatoms
* mdatoms
, gmx_vsite_t
* vsite
)
2292 int vsite_atom_range
, natperthread
;
2294 if (vsite
->nthreads
== 1)
2300 /* The current way of distributing the vsites over threads in primitive.
2301 * We divide the atom range 0 - natoms_in_vsite uniformly over threads,
2302 * without taking into account how the vsites are distributed.
2303 * Without domain decomposition we at least tighten the upper bound
2304 * of the range (useful for common systems such as a vsite-protein
2306 * With domain decomposition, as long as the vsites are distributed
2307 * uniformly in each domain along the major dimension, usually x,
2308 * it will also perform well.
2310 if (!vsite
->useDomdec
)
2312 vsite_atom_range
= -1;
2313 for (int ftype
= c_ftypeVsiteStart
; ftype
< c_ftypeVsiteEnd
; ftype
++)
2316 if (ftype
!= F_VSITEN
)
2318 int nral1
= 1 + NRAL(ftype
);
2319 const t_iatom
* iat
= ilist
[ftype
].iatoms
;
2320 for (int i
= 0; i
< ilist
[ftype
].nr
; i
+= nral1
)
2322 for (int j
= i
+ 1; j
< i
+ nral1
; j
++)
2324 vsite_atom_range
= std::max(vsite_atom_range
, iat
[j
]);
2332 const t_iatom
* iat
= ilist
[ftype
].iatoms
;
2335 while (i
< ilist
[ftype
].nr
)
2337 /* The 3 below is from 1+NRAL(ftype)=3 */
2338 vs_ind_end
= i
+ ip
[iat
[i
]].vsiten
.n
* 3;
2340 vsite_atom_range
= std::max(vsite_atom_range
, iat
[i
+ 1]);
2341 while (i
< vs_ind_end
)
2343 vsite_atom_range
= std::max(vsite_atom_range
, iat
[i
+ 2]);
2351 natperthread
= (vsite_atom_range
+ vsite
->nthreads
- 1) / vsite
->nthreads
;
2355 /* Any local or not local atom could be involved in virtual sites.
2356 * But since we usually have very few non-local virtual sites
2357 * (only non-local vsites that depend on local vsites),
2358 * we distribute the local atom range equally over the threads.
2359 * When assigning vsites to threads, we should take care that the last
2360 * threads also covers the non-local range.
2362 vsite_atom_range
= mdatoms
->nr
;
2363 natperthread
= (mdatoms
->homenr
+ vsite
->nthreads
- 1) / vsite
->nthreads
;
2368 fprintf(debug
, "virtual site thread dist: natoms %d, range %d, natperthread %d\n",
2369 mdatoms
->nr
, vsite_atom_range
, natperthread
);
2372 /* To simplify the vsite assignment, we make an index which tells us
2373 * to which task particles, both non-vsites and vsites, are assigned.
2375 vsite
->taskIndex
.resize(mdatoms
->nr
);
2377 /* Initialize the task index array. Here we assign the non-vsite
2378 * particles to task=thread, so we easily figure out if vsites
2379 * depend on local and/or non-local particles in assignVsitesToThread.
2381 gmx::ArrayRef
<int> taskIndex
= vsite
->taskIndex
;
2384 for (int i
= 0; i
< mdatoms
->nr
; i
++)
2386 if (mdatoms
->ptype
[i
] == eptVSite
)
2388 /* vsites are not assigned to a task yet */
2393 /* assign non-vsite particles to task thread */
2394 taskIndex
[i
] = thread
;
2396 if (i
== (thread
+ 1) * natperthread
&& thread
< vsite
->nthreads
)
2403 #pragma omp parallel num_threads(vsite->nthreads)
2407 int thread
= gmx_omp_get_thread_num();
2408 VsiteThread
& tData
= *vsite
->tData
[thread
];
2410 /* Clear the buffer use flags that were set before */
2411 if (tData
.useInterdependentTask
)
2413 InterdependentTask
& idTask
= tData
.idTask
;
2415 /* To avoid an extra OpenMP barrier in spread_vsite_f,
2416 * we clear the force buffer at the next step,
2417 * so we need to do it here as well.
2419 clearTaskForceBufferUsedElements(&idTask
);
2421 idTask
.vsite
.resize(0);
2422 for (int t
= 0; t
< vsite
->nthreads
; t
++)
2424 AtomIndex
& atomIndex
= idTask
.atomIndex
[t
];
2425 int natom
= atomIndex
.atom
.size();
2426 for (int i
= 0; i
< natom
; i
++)
2428 idTask
.use
[atomIndex
.atom
[i
]] = false;
2430 atomIndex
.atom
.resize(0);
2435 /* To avoid large f_buf allocations of #threads*vsite_atom_range
2436 * we don't use task2 with more than 200000 atoms. This doesn't
2437 * affect performance, since with such a large range relatively few
2438 * vsites will end up in the separate task.
2439 * Note that useTask2 should be the same for all threads.
2441 tData
.useInterdependentTask
= (vsite_atom_range
<= 200000);
2442 if (tData
.useInterdependentTask
)
2444 size_t natoms_use_in_vsites
= vsite_atom_range
;
2445 InterdependentTask
& idTask
= tData
.idTask
;
2446 /* To avoid resizing and re-clearing every nstlist steps,
2447 * we never down size the force buffer.
2449 if (natoms_use_in_vsites
> idTask
.force
.size() || natoms_use_in_vsites
> idTask
.use
.size())
2451 idTask
.force
.resize(natoms_use_in_vsites
, { 0, 0, 0 });
2452 idTask
.use
.resize(natoms_use_in_vsites
, false);
2456 /* Assign all vsites that can execute independently on threads */
2457 tData
.rangeStart
= thread
* natperthread
;
2458 if (thread
< vsite
->nthreads
- 1)
2460 tData
.rangeEnd
= (thread
+ 1) * natperthread
;
2464 /* The last thread should cover up to the end of the range */
2465 tData
.rangeEnd
= mdatoms
->nr
;
2467 assignVsitesToThread(&tData
, thread
, vsite
->nthreads
, natperthread
, taskIndex
, ilist
,
2468 ip
, mdatoms
->ptype
);
2470 if (tData
.useInterdependentTask
)
2472 /* In the worst case, all tasks write to force ranges of
2473 * all other tasks, leading to #tasks^2 scaling (this is only
2474 * the overhead, the actual flops remain constant).
2475 * But in most cases there is far less coupling. To improve
2476 * scaling at high thread counts we therefore construct
2477 * an index to only loop over the actually affected tasks.
2479 InterdependentTask
& idTask
= tData
.idTask
;
2481 /* Ensure assignVsitesToThread finished on other threads */
2484 idTask
.spreadTask
.resize(0);
2485 idTask
.reduceTask
.resize(0);
2486 for (int t
= 0; t
< vsite
->nthreads
; t
++)
2488 /* Do we write to the force buffer of task t? */
2489 if (!idTask
.atomIndex
[t
].atom
.empty())
2491 idTask
.spreadTask
.push_back(t
);
2493 /* Does task t write to our force buffer? */
2494 if (!vsite
->tData
[t
]->idTask
.atomIndex
[thread
].atom
.empty())
2496 idTask
.reduceTask
.push_back(t
);
2501 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
2503 /* Assign all remaining vsites, that will have taskIndex[]=2*vsite->nthreads,
2504 * to a single task that will not run in parallel with other tasks.
2506 assignVsitesToSingleTask(vsite
->tData
[vsite
->nthreads
].get(), 2 * vsite
->nthreads
, taskIndex
,
2509 if (debug
&& vsite
->nthreads
> 1)
2511 fprintf(debug
, "virtual site useInterdependentTask %d, nuse:\n",
2512 static_cast<int>(vsite
->tData
[0]->useInterdependentTask
));
2513 for (int th
= 0; th
< vsite
->nthreads
+ 1; th
++)
2515 fprintf(debug
, " %4d", vsite
->tData
[th
]->idTask
.nuse
);
2517 fprintf(debug
, "\n");
2519 for (int ftype
= c_ftypeVsiteStart
; ftype
< c_ftypeVsiteEnd
; ftype
++)
2521 if (ilist
[ftype
].nr
> 0)
2523 fprintf(debug
, "%-20s thread dist:", interaction_function
[ftype
].longname
);
2524 for (int th
= 0; th
< vsite
->nthreads
+ 1; th
++)
2526 fprintf(debug
, " %4d %4d ", vsite
->tData
[th
]->ilist
[ftype
].nr
,
2527 vsite
->tData
[th
]->idTask
.ilist
[ftype
].nr
);
2529 fprintf(debug
, "\n");
2535 int nrOrig
= vsiteIlistNrCount(ilist
);
2537 for (int th
= 0; th
< vsite
->nthreads
+ 1; th
++)
2539 nrThreaded
+= vsiteIlistNrCount(vsite
->tData
[th
]->ilist
)
2540 + vsiteIlistNrCount(vsite
->tData
[th
]->idTask
.ilist
);
2542 GMX_ASSERT(nrThreaded
== nrOrig
,
2543 "The number of virtual sites assigned to all thread task has to match the total "
2544 "number of virtual sites");
2548 void set_vsite_top(gmx_vsite_t
* vsite
, const gmx_localtop_t
* top
, const t_mdatoms
* md
)
2550 if (vsite
->nthreads
> 1)
2552 split_vsites_over_threads(top
->idef
.il
, top
->idef
.iparams
, md
, vsite
);