2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016,2017, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
46 #include "gromacs/domdec/domdec.h"
47 #include "gromacs/domdec/domdec_struct.h"
48 #include "gromacs/gmxlib/network.h"
49 #include "gromacs/gmxlib/nrnb.h"
50 #include "gromacs/math/functions.h"
51 #include "gromacs/math/vec.h"
52 #include "gromacs/mdlib/gmx_omp_nthreads.h"
53 #include "gromacs/mdtypes/commrec.h"
54 #include "gromacs/pbcutil/ishift.h"
55 #include "gromacs/pbcutil/mshift.h"
56 #include "gromacs/pbcutil/pbc.h"
57 #include "gromacs/topology/ifunc.h"
58 #include "gromacs/topology/mtop_util.h"
59 #include "gromacs/utility/exceptions.h"
60 #include "gromacs/utility/fatalerror.h"
61 #include "gromacs/utility/gmxassert.h"
62 #include "gromacs/utility/gmxomp.h"
63 #include "gromacs/utility/smalloc.h"
66 /* The strategy used here for assigning virtual sites to (thread-)tasks
69 * We divide the atom range that vsites operate on (natoms_local with DD,
70 * 0 - last atom involved in vsites without DD) equally over all threads.
72 * Vsites in the local range constructed from atoms in the local range
73 * and/or other vsites that are fully local are assigned to a simple,
76 * Vsites that are not assigned after using the above criterion get assigned
77 * to a so called "interdependent" thread task when none of the constructing
78 * atoms is a vsite. These tasks are called interdependent, because one task
79 * accesses atoms assigned to a different task/thread.
80 * Note that this option is turned off with large (local) atom counts
81 * to avoid high memory usage.
83 * Any remaining vsites are assigned to a separate master thread task.
88 static void init_ilist(t_ilist
*ilist
)
90 for (int i
= 0; i
< F_NRE
; i
++)
94 ilist
[i
].iatoms
= nullptr;
98 /*! \brief List of atom indices belonging to a task */
100 //! List of atom indices
101 std::vector
<int> atom
;
104 /*! \brief Data structure for thread tasks that use constructing atoms outside their own atom range */
105 struct InterdependentTask
107 //! The interaction lists, only vsite entries are used
108 t_ilist ilist
[F_NRE
];
109 //! Thread/task-local force buffer
110 std::vector
<RVec
> force
;
111 //! The atom indices of the vsites of our task
112 std::vector
<int> vsite
;
113 //! Flags if elements in force are spread to or not
114 std::vector
<bool> use
;
115 //! The number of entries set to true in use
117 //! Array of atoms indices, size nthreads, covering all nuse set elements in use
118 std::vector
<AtomIndex
> atomIndex
;
119 //! List of tasks (force blocks) this task spread forces to
120 std::vector
<int> spreadTask
;
121 //! List of tasks that write to this tasks force block range
122 std::vector
<int> reduceTask
;
131 /*! \brief Vsite thread task data structure */
133 //! Start of atom range of this task
135 //! End of atom range of this task
137 //! The interaction lists, only vsite entries are used
138 t_ilist ilist
[F_NRE
];
139 //! Local fshift accumulation buffer
141 //! Local virial dx*df accumulation buffer
143 //! Tells if interdependent task idTask should be used (in addition to the rest of this task), this bool has the same value on all threads
144 bool useInterdependentTask
;
145 //! Data for vsites that involve constructing atoms in the atom range of other threads/tasks
146 InterdependentTask idTask
;
148 /*! \brief Constructor */
154 clear_rvecs(SHIFTS
, fshift
);
156 useInterdependentTask
= false;
161 /* The start and end values of for the vsite indices in the ftype enum.
162 * The validity of these values is checked in init_vsite.
163 * This is used to avoid loops over all ftypes just to get the vsite entries.
164 * (We should replace the fixed ilist array by only the used entries.)
166 static const int c_ftypeVsiteStart
= F_VSITE2
;
167 static const int c_ftypeVsiteEnd
= F_VSITEN
+ 1;
170 /*! \brief Returns the sum of the vsite ilist sizes over all vsite types
172 * \param[in] ilist The interaction list
174 static int vsiteIlistNrCount(const t_ilist
*ilist
)
177 for (int ftype
= c_ftypeVsiteStart
; ftype
< c_ftypeVsiteEnd
; ftype
++)
179 nr
+= ilist
[ftype
].nr
;
185 static int pbc_rvec_sub(const t_pbc
*pbc
, const rvec xi
, const rvec xj
, rvec dx
)
189 return pbc_dx_aiuc(pbc
, xi
, xj
, dx
);
193 rvec_sub(xi
, xj
, dx
);
198 /* Vsite construction routines */
200 static void constr_vsite2(const rvec xi
, const rvec xj
, rvec x
, real a
, const t_pbc
*pbc
)
208 pbc_dx_aiuc(pbc
, xj
, xi
, dx
);
209 x
[XX
] = xi
[XX
] + a
*dx
[XX
];
210 x
[YY
] = xi
[YY
] + a
*dx
[YY
];
211 x
[ZZ
] = xi
[ZZ
] + a
*dx
[ZZ
];
215 x
[XX
] = b
*xi
[XX
] + a
*xj
[XX
];
216 x
[YY
] = b
*xi
[YY
] + a
*xj
[YY
];
217 x
[ZZ
] = b
*xi
[ZZ
] + a
*xj
[ZZ
];
221 /* TOTAL: 10 flops */
224 static void constr_vsite3(const rvec xi
, const rvec xj
, const rvec xk
, rvec x
, real a
, real b
,
234 pbc_dx_aiuc(pbc
, xj
, xi
, dxj
);
235 pbc_dx_aiuc(pbc
, xk
, xi
, dxk
);
236 x
[XX
] = xi
[XX
] + a
*dxj
[XX
] + b
*dxk
[XX
];
237 x
[YY
] = xi
[YY
] + a
*dxj
[YY
] + b
*dxk
[YY
];
238 x
[ZZ
] = xi
[ZZ
] + a
*dxj
[ZZ
] + b
*dxk
[ZZ
];
242 x
[XX
] = c
*xi
[XX
] + a
*xj
[XX
] + b
*xk
[XX
];
243 x
[YY
] = c
*xi
[YY
] + a
*xj
[YY
] + b
*xk
[YY
];
244 x
[ZZ
] = c
*xi
[ZZ
] + a
*xj
[ZZ
] + b
*xk
[ZZ
];
248 /* TOTAL: 17 flops */
251 static void constr_vsite3FD(const rvec xi
, const rvec xj
, const rvec xk
, rvec x
, real a
, real b
,
257 pbc_rvec_sub(pbc
, xj
, xi
, xij
);
258 pbc_rvec_sub(pbc
, xk
, xj
, xjk
);
261 /* temp goes from i to a point on the line jk */
262 temp
[XX
] = xij
[XX
] + a
*xjk
[XX
];
263 temp
[YY
] = xij
[YY
] + a
*xjk
[YY
];
264 temp
[ZZ
] = xij
[ZZ
] + a
*xjk
[ZZ
];
267 c
= b
*gmx::invsqrt(iprod(temp
, temp
));
270 x
[XX
] = xi
[XX
] + c
*temp
[XX
];
271 x
[YY
] = xi
[YY
] + c
*temp
[YY
];
272 x
[ZZ
] = xi
[ZZ
] + c
*temp
[ZZ
];
275 /* TOTAL: 34 flops */
278 static void constr_vsite3FAD(const rvec xi
, const rvec xj
, const rvec xk
, rvec x
, real a
, real b
, const t_pbc
*pbc
)
281 real a1
, b1
, c1
, invdij
;
283 pbc_rvec_sub(pbc
, xj
, xi
, xij
);
284 pbc_rvec_sub(pbc
, xk
, xj
, xjk
);
287 invdij
= gmx::invsqrt(iprod(xij
, xij
));
288 c1
= invdij
* invdij
* iprod(xij
, xjk
);
289 xp
[XX
] = xjk
[XX
] - c1
*xij
[XX
];
290 xp
[YY
] = xjk
[YY
] - c1
*xij
[YY
];
291 xp
[ZZ
] = xjk
[ZZ
] - c1
*xij
[ZZ
];
293 b1
= b
*gmx::invsqrt(iprod(xp
, xp
));
296 x
[XX
] = xi
[XX
] + a1
*xij
[XX
] + b1
*xp
[XX
];
297 x
[YY
] = xi
[YY
] + a1
*xij
[YY
] + b1
*xp
[YY
];
298 x
[ZZ
] = xi
[ZZ
] + a1
*xij
[ZZ
] + b1
*xp
[ZZ
];
301 /* TOTAL: 63 flops */
304 static void constr_vsite3OUT(const rvec xi
, const rvec xj
, const rvec xk
, rvec x
,
305 real a
, real b
, real c
, const t_pbc
*pbc
)
309 pbc_rvec_sub(pbc
, xj
, xi
, xij
);
310 pbc_rvec_sub(pbc
, xk
, xi
, xik
);
311 cprod(xij
, xik
, temp
);
314 x
[XX
] = xi
[XX
] + a
*xij
[XX
] + b
*xik
[XX
] + c
*temp
[XX
];
315 x
[YY
] = xi
[YY
] + a
*xij
[YY
] + b
*xik
[YY
] + c
*temp
[YY
];
316 x
[ZZ
] = xi
[ZZ
] + a
*xij
[ZZ
] + b
*xik
[ZZ
] + c
*temp
[ZZ
];
319 /* TOTAL: 33 flops */
322 static void constr_vsite4FD(const rvec xi
, const rvec xj
, const rvec xk
, const rvec xl
, rvec x
,
323 real a
, real b
, real c
, const t_pbc
*pbc
)
325 rvec xij
, xjk
, xjl
, temp
;
328 pbc_rvec_sub(pbc
, xj
, xi
, xij
);
329 pbc_rvec_sub(pbc
, xk
, xj
, xjk
);
330 pbc_rvec_sub(pbc
, xl
, xj
, xjl
);
333 /* temp goes from i to a point on the plane jkl */
334 temp
[XX
] = xij
[XX
] + a
*xjk
[XX
] + b
*xjl
[XX
];
335 temp
[YY
] = xij
[YY
] + a
*xjk
[YY
] + b
*xjl
[YY
];
336 temp
[ZZ
] = xij
[ZZ
] + a
*xjk
[ZZ
] + b
*xjl
[ZZ
];
339 d
= c
*gmx::invsqrt(iprod(temp
, temp
));
342 x
[XX
] = xi
[XX
] + d
*temp
[XX
];
343 x
[YY
] = xi
[YY
] + d
*temp
[YY
];
344 x
[ZZ
] = xi
[ZZ
] + d
*temp
[ZZ
];
347 /* TOTAL: 43 flops */
350 static void constr_vsite4FDN(const rvec xi
, const rvec xj
, const rvec xk
, const rvec xl
, rvec x
,
351 real a
, real b
, real c
, const t_pbc
*pbc
)
353 rvec xij
, xik
, xil
, ra
, rb
, rja
, rjb
, rm
;
356 pbc_rvec_sub(pbc
, xj
, xi
, xij
);
357 pbc_rvec_sub(pbc
, xk
, xi
, xik
);
358 pbc_rvec_sub(pbc
, xl
, xi
, xil
);
371 rvec_sub(ra
, xij
, rja
);
372 rvec_sub(rb
, xij
, rjb
);
378 d
= c
*gmx::invsqrt(norm2(rm
));
381 x
[XX
] = xi
[XX
] + d
*rm
[XX
];
382 x
[YY
] = xi
[YY
] + d
*rm
[YY
];
383 x
[ZZ
] = xi
[ZZ
] + d
*rm
[ZZ
];
386 /* TOTAL: 47 flops */
390 static int constr_vsiten(const t_iatom
*ia
, const t_iparams ip
[],
391 rvec
*x
, const t_pbc
*pbc
)
398 n3
= 3*ip
[ia
[0]].vsiten
.n
;
401 copy_rvec(x
[ai
], x1
);
403 for (int i
= 3; i
< n3
; i
+= 3)
406 a
= ip
[ia
[i
]].vsiten
.a
;
409 pbc_dx_aiuc(pbc
, x
[ai
], x1
, dx
);
413 rvec_sub(x
[ai
], x1
, dx
);
415 dsum
[XX
] += a
*dx
[XX
];
416 dsum
[YY
] += a
*dx
[YY
];
417 dsum
[ZZ
] += a
*dx
[ZZ
];
421 x
[av
][XX
] = x1
[XX
] + dsum
[XX
];
422 x
[av
][YY
] = x1
[YY
] + dsum
[YY
];
423 x
[av
][ZZ
] = x1
[ZZ
] + dsum
[ZZ
];
428 /*! \brief PBC modes for vsite construction and spreading */
431 all
, // Apply normal, simple PBC for all vsites
432 chargeGroup
, // Keep vsite in the same periodic image as the rest of it's charge group
433 none
// No PBC treatment needed
436 /*! \brief Returns the PBC mode based on the system PBC and vsite properties
438 * \param[in] pbcPtr A pointer to a PBC struct or nullptr when no PBC treatment is required
439 * \param[in] vsite A pointer to the vsite struct, can be nullptr
441 static PbcMode
getPbcMode(const t_pbc
*pbcPtr
,
442 const gmx_vsite_t
*vsite
)
444 if (pbcPtr
== nullptr)
446 return PbcMode::none
;
448 else if (vsite
!= nullptr && vsite
->bHaveChargeGroups
)
450 return PbcMode::chargeGroup
;
458 static void construct_vsites_thread(const gmx_vsite_t
*vsite
,
461 const t_iparams ip
[], const t_ilist ilist
[],
462 const t_pbc
*pbc_null
)
474 const PbcMode pbcMode
= getPbcMode(pbc_null
, vsite
);
475 /* We need another pbc pointer, as with charge groups we switch per vsite */
476 const t_pbc
*pbc_null2
= pbc_null
;
477 const int *vsite_pbc
= nullptr;
479 for (int ftype
= c_ftypeVsiteStart
; ftype
< c_ftypeVsiteEnd
; ftype
++)
481 if (ilist
[ftype
].nr
== 0)
487 int nra
= interaction_function
[ftype
].nratoms
;
489 int nr
= ilist
[ftype
].nr
;
491 const t_iatom
*ia
= ilist
[ftype
].iatoms
;
493 if (pbcMode
== PbcMode::chargeGroup
)
495 vsite_pbc
= vsite
->vsite_pbc_loc
[ftype
- c_ftypeVsiteStart
];
498 for (int i
= 0; i
< nr
; )
501 /* The vsite and constructing atoms */
504 /* Constants for constructing vsites */
505 real a1
= ip
[tp
].vsite
.a
;
506 /* Check what kind of pbc we need to use */
509 if (pbcMode
== PbcMode::all
)
511 /* No charge groups, vsite follows its own pbc */
513 copy_rvec(x
[avsite
], xpbc
);
515 else if (pbcMode
== PbcMode::chargeGroup
)
517 pbc_atom
= vsite_pbc
[i
/(1 + nra
)];
522 /* We need to copy the coordinates here,
523 * single for single atom cg's pbc_atom
524 * is the vsite itself.
526 copy_rvec(x
[pbc_atom
], xpbc
);
528 pbc_null2
= pbc_null
;
539 /* Copy the old position */
541 copy_rvec(x
[avsite
], xv
);
543 /* Construct the vsite depending on type */
550 constr_vsite2(x
[ai
], x
[aj
], x
[avsite
], a1
, pbc_null2
);
556 constr_vsite3(x
[ai
], x
[aj
], x
[ak
], x
[avsite
], a1
, b1
, pbc_null2
);
562 constr_vsite3FD(x
[ai
], x
[aj
], x
[ak
], x
[avsite
], a1
, b1
, pbc_null2
);
568 constr_vsite3FAD(x
[ai
], x
[aj
], x
[ak
], x
[avsite
], a1
, b1
, pbc_null2
);
575 constr_vsite3OUT(x
[ai
], x
[aj
], x
[ak
], x
[avsite
], a1
, b1
, c1
, pbc_null2
);
583 constr_vsite4FD(x
[ai
], x
[aj
], x
[ak
], x
[al
], x
[avsite
], a1
, b1
, c1
,
592 constr_vsite4FDN(x
[ai
], x
[aj
], x
[ak
], x
[al
], x
[avsite
], a1
, b1
, c1
,
596 inc
= constr_vsiten(ia
, ip
, x
, pbc_null2
);
599 gmx_fatal(FARGS
, "No such vsite type %d in %s, line %d",
600 ftype
, __FILE__
, __LINE__
);
605 /* Match the pbc of this vsite to the rest of its charge group */
607 int ishift
= pbc_dx_aiuc(pbc_null
, x
[avsite
], xpbc
, dx
);
608 if (ishift
!= CENTRAL
)
610 rvec_add(xpbc
, dx
, x
[avsite
]);
615 /* Calculate velocity of vsite... */
617 rvec_sub(x
[avsite
], xv
, vv
);
618 svmul(inv_dt
, vv
, v
[avsite
]);
621 /* Increment loop variables */
629 void construct_vsites(const gmx_vsite_t
*vsite
,
632 const t_iparams ip
[], const t_ilist ilist
[],
633 int ePBC
, gmx_bool bMolPBC
,
637 const bool useDomdec
= (vsite
!= nullptr && vsite
->useDomdec
);
638 GMX_ASSERT(!useDomdec
|| (cr
!= nullptr && DOMAINDECOMP(cr
)), "When vsites are set up with domain decomposition, we need a valid commrec");
639 // TODO: Remove this assertion when we remove charge groups
640 GMX_ASSERT(vsite
!= nullptr || ePBC
== epbcNONE
, "Without a vsite struct we can not do PBC (in case we have charge groups)");
642 t_pbc pbc
, *pbc_null
;
644 /* We only need to do pbc when we have inter-cg vsites.
645 * Note that with domain decomposition we do not need to apply PBC here
646 * when we have at least 3 domains along each dimension. Currently we
647 * do not optimize this case.
649 if (ePBC
!= epbcNONE
&& (useDomdec
|| bMolPBC
) &&
650 !(vsite
!= nullptr && vsite
->n_intercg_vsite
== 0))
652 /* This is wasting some CPU time as we now do this multiple times
656 clear_ivec(null_ivec
);
657 pbc_null
= set_pbc_dd(&pbc
, ePBC
,
658 useDomdec
? cr
->dd
->nc
: null_ivec
,
668 dd_move_x_vsites(cr
->dd
, box
, x
);
671 // cppcheck-suppress nullPointerRedundantCheck
672 if (vsite
== nullptr || vsite
->nthreads
== 1)
674 construct_vsites_thread(vsite
,
681 #pragma omp parallel num_threads(vsite->nthreads)
685 const int th
= gmx_omp_get_thread_num();
686 const VsiteThread
&tData
= *vsite
->tData
[th
];
687 GMX_ASSERT(tData
.rangeStart
>= 0, "The thread data should be initialized before calling construct_vsites");
689 construct_vsites_thread(vsite
,
693 if (tData
.useInterdependentTask
)
695 /* Here we don't need a barrier (unlike the spreading),
696 * since both tasks only construct vsites from particles,
697 * or local vsites, not from non-local vsites.
699 construct_vsites_thread(vsite
,
701 ip
, tData
.idTask
.ilist
,
705 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
;
707 /* Now we can construct the vsites that might depend on other vsites */
708 construct_vsites_thread(vsite
,
710 ip
, vsite
->tData
[vsite
->nthreads
]->ilist
,
715 static void spread_vsite2(const t_iatom ia
[], real a
,
717 rvec f
[], rvec fshift
[],
718 const t_pbc
*pbc
, const t_graph
*g
)
729 svmul(1 - a
, f
[av
], fi
);
730 svmul( a
, f
[av
], fj
);
739 ivec_sub(SHIFT_IVEC(g
, ai
), SHIFT_IVEC(g
, av
), di
);
741 ivec_sub(SHIFT_IVEC(g
, ai
), SHIFT_IVEC(g
, aj
), di
);
746 siv
= pbc_dx_aiuc(pbc
, x
[ai
], x
[av
], dx
);
747 sij
= pbc_dx_aiuc(pbc
, x
[ai
], x
[aj
], dx
);
755 if (fshift
&& (siv
!= CENTRAL
|| sij
!= CENTRAL
))
757 rvec_inc(fshift
[siv
], f
[av
]);
758 rvec_dec(fshift
[CENTRAL
], fi
);
759 rvec_dec(fshift
[sij
], fj
);
762 /* TOTAL: 13 flops */
765 void constructVsitesGlobal(const gmx_mtop_t
&mtop
,
766 gmx::ArrayRef
<gmx::RVec
> x
)
768 GMX_ASSERT(x
.size() >= static_cast<size_t>(mtop
.natoms
), "x should contain the whole system");
770 for (int mb
= 0; mb
< mtop
.nmolblock
; mb
++)
772 const gmx_molblock_t
&molb
= mtop
.molblock
[mb
];
773 const gmx_moltype_t
&molt
= mtop
.moltype
[molb
.type
];
774 if (vsiteIlistNrCount(molt
.ilist
) > 0)
776 int atomOffset
= molb
.globalAtomStart
;
777 for (int mol
= 0; mol
< molb
.nmol
; mol
++)
779 construct_vsites(nullptr, as_rvec_array(x
.data()) + atomOffset
,
781 mtop
.ffparams
.iparams
, molt
.ilist
,
782 epbcNONE
, TRUE
, nullptr, nullptr);
783 atomOffset
+= molt
.atoms
.nr
;
789 static void spread_vsite3(const t_iatom ia
[], real a
, real b
,
791 rvec f
[], rvec fshift
[],
792 const t_pbc
*pbc
, const t_graph
*g
)
804 svmul(1 - a
- b
, f
[av
], fi
);
805 svmul( a
, f
[av
], fj
);
806 svmul( b
, f
[av
], fk
);
816 ivec_sub(SHIFT_IVEC(g
, ai
), SHIFT_IVEC(g
, ia
[1]), di
);
818 ivec_sub(SHIFT_IVEC(g
, ai
), SHIFT_IVEC(g
, aj
), di
);
820 ivec_sub(SHIFT_IVEC(g
, ai
), SHIFT_IVEC(g
, ak
), di
);
825 siv
= pbc_dx_aiuc(pbc
, x
[ai
], x
[av
], dx
);
826 sij
= pbc_dx_aiuc(pbc
, x
[ai
], x
[aj
], dx
);
827 sik
= pbc_dx_aiuc(pbc
, x
[ai
], x
[ak
], dx
);
836 if (fshift
&& (siv
!= CENTRAL
|| sij
!= CENTRAL
|| sik
!= CENTRAL
))
838 rvec_inc(fshift
[siv
], f
[av
]);
839 rvec_dec(fshift
[CENTRAL
], fi
);
840 rvec_dec(fshift
[sij
], fj
);
841 rvec_dec(fshift
[sik
], fk
);
844 /* TOTAL: 20 flops */
847 static void spread_vsite3FD(const t_iatom ia
[], real a
, real b
,
849 rvec f
[], rvec fshift
[],
850 gmx_bool VirCorr
, matrix dxdf
,
851 const t_pbc
*pbc
, const t_graph
*g
)
853 real c
, invl
, fproj
, a1
;
854 rvec xvi
, xij
, xjk
, xix
, fv
, temp
;
855 t_iatom av
, ai
, aj
, ak
;
863 copy_rvec(f
[av
], fv
);
865 sji
= pbc_rvec_sub(pbc
, x
[aj
], x
[ai
], xij
);
866 skj
= pbc_rvec_sub(pbc
, x
[ak
], x
[aj
], xjk
);
869 /* xix goes from i to point x on the line jk */
870 xix
[XX
] = xij
[XX
]+a
*xjk
[XX
];
871 xix
[YY
] = xij
[YY
]+a
*xjk
[YY
];
872 xix
[ZZ
] = xij
[ZZ
]+a
*xjk
[ZZ
];
875 invl
= gmx::invsqrt(iprod(xix
, xix
));
879 fproj
= iprod(xix
, fv
)*invl
*invl
; /* = (xix . f)/(xix . xix) */
881 temp
[XX
] = c
*(fv
[XX
]-fproj
*xix
[XX
]);
882 temp
[YY
] = c
*(fv
[YY
]-fproj
*xix
[YY
]);
883 temp
[ZZ
] = c
*(fv
[ZZ
]-fproj
*xix
[ZZ
]);
886 /* c is already calculated in constr_vsite3FD
887 storing c somewhere will save 26 flops! */
890 f
[ai
][XX
] += fv
[XX
] - temp
[XX
];
891 f
[ai
][YY
] += fv
[YY
] - temp
[YY
];
892 f
[ai
][ZZ
] += fv
[ZZ
] - temp
[ZZ
];
893 f
[aj
][XX
] += a1
*temp
[XX
];
894 f
[aj
][YY
] += a1
*temp
[YY
];
895 f
[aj
][ZZ
] += a1
*temp
[ZZ
];
896 f
[ak
][XX
] += a
*temp
[XX
];
897 f
[ak
][YY
] += a
*temp
[YY
];
898 f
[ak
][ZZ
] += a
*temp
[ZZ
];
903 ivec_sub(SHIFT_IVEC(g
, ia
[1]), SHIFT_IVEC(g
, ai
), di
);
905 ivec_sub(SHIFT_IVEC(g
, aj
), SHIFT_IVEC(g
, ai
), di
);
907 ivec_sub(SHIFT_IVEC(g
, ak
), SHIFT_IVEC(g
, aj
), di
);
912 svi
= pbc_rvec_sub(pbc
, x
[av
], x
[ai
], xvi
);
919 if (fshift
&& (svi
!= CENTRAL
|| sji
!= CENTRAL
|| skj
!= CENTRAL
))
921 rvec_dec(fshift
[svi
], fv
);
922 fshift
[CENTRAL
][XX
] += fv
[XX
] - (1 + a
)*temp
[XX
];
923 fshift
[CENTRAL
][YY
] += fv
[YY
] - (1 + a
)*temp
[YY
];
924 fshift
[CENTRAL
][ZZ
] += fv
[ZZ
] - (1 + a
)*temp
[ZZ
];
925 fshift
[ sji
][XX
] += temp
[XX
];
926 fshift
[ sji
][YY
] += temp
[YY
];
927 fshift
[ sji
][ZZ
] += temp
[ZZ
];
928 fshift
[ skj
][XX
] += a
*temp
[XX
];
929 fshift
[ skj
][YY
] += a
*temp
[YY
];
930 fshift
[ skj
][ZZ
] += a
*temp
[ZZ
];
935 /* When VirCorr=TRUE, the virial for the current forces is not
936 * calculated from the redistributed forces. This means that
937 * the effect of non-linear virtual site constructions on the virial
938 * needs to be added separately. This contribution can be calculated
939 * in many ways, but the simplest and cheapest way is to use
940 * the first constructing atom ai as a reference position in space:
941 * subtract (xv-xi)*fv and add (xj-xi)*fj + (xk-xi)*fk.
945 pbc_rvec_sub(pbc
, x
[av
], x
[ai
], xiv
);
947 for (int i
= 0; i
< DIM
; i
++)
949 for (int j
= 0; j
< DIM
; j
++)
951 /* As xix is a linear combination of j and k, use that here */
952 dxdf
[i
][j
] += -xiv
[i
]*fv
[j
] + xix
[i
]*temp
[j
];
957 /* TOTAL: 61 flops */
960 static void spread_vsite3FAD(const t_iatom ia
[], real a
, real b
,
962 rvec f
[], rvec fshift
[],
963 gmx_bool VirCorr
, matrix dxdf
,
964 const t_pbc
*pbc
, const t_graph
*g
)
966 rvec xvi
, xij
, xjk
, xperp
, Fpij
, Fppp
, fv
, f1
, f2
, f3
;
967 real a1
, b1
, c1
, c2
, invdij
, invdij2
, invdp
, fproj
;
968 t_iatom av
, ai
, aj
, ak
;
969 int svi
, sji
, skj
, d
;
976 copy_rvec(f
[ia
[1]], fv
);
978 sji
= pbc_rvec_sub(pbc
, x
[aj
], x
[ai
], xij
);
979 skj
= pbc_rvec_sub(pbc
, x
[ak
], x
[aj
], xjk
);
982 invdij
= gmx::invsqrt(iprod(xij
, xij
));
983 invdij2
= invdij
* invdij
;
984 c1
= iprod(xij
, xjk
) * invdij2
;
985 xperp
[XX
] = xjk
[XX
] - c1
*xij
[XX
];
986 xperp
[YY
] = xjk
[YY
] - c1
*xij
[YY
];
987 xperp
[ZZ
] = xjk
[ZZ
] - c1
*xij
[ZZ
];
988 /* xperp in plane ijk, perp. to ij */
989 invdp
= gmx::invsqrt(iprod(xperp
, xperp
));
994 /* a1, b1 and c1 are already calculated in constr_vsite3FAD
995 storing them somewhere will save 45 flops! */
997 fproj
= iprod(xij
, fv
)*invdij2
;
998 svmul(fproj
, xij
, Fpij
); /* proj. f on xij */
999 svmul(iprod(xperp
, fv
)*invdp
*invdp
, xperp
, Fppp
); /* proj. f on xperp */
1000 svmul(b1
*fproj
, xperp
, f3
);
1003 rvec_sub(fv
, Fpij
, f1
); /* f1 = f - Fpij */
1004 rvec_sub(f1
, Fppp
, f2
); /* f2 = f - Fpij - Fppp */
1005 for (d
= 0; (d
< DIM
); d
++)
1013 f
[ai
][XX
] += fv
[XX
] - f1
[XX
] + c1
*f2
[XX
] + f3
[XX
];
1014 f
[ai
][YY
] += fv
[YY
] - f1
[YY
] + c1
*f2
[YY
] + f3
[YY
];
1015 f
[ai
][ZZ
] += fv
[ZZ
] - f1
[ZZ
] + c1
*f2
[ZZ
] + f3
[ZZ
];
1016 f
[aj
][XX
] += f1
[XX
] - c2
*f2
[XX
] - f3
[XX
];
1017 f
[aj
][YY
] += f1
[YY
] - c2
*f2
[YY
] - f3
[YY
];
1018 f
[aj
][ZZ
] += f1
[ZZ
] - c2
*f2
[ZZ
] - f3
[ZZ
];
1019 f
[ak
][XX
] += f2
[XX
];
1020 f
[ak
][YY
] += f2
[YY
];
1021 f
[ak
][ZZ
] += f2
[ZZ
];
1026 ivec_sub(SHIFT_IVEC(g
, ia
[1]), SHIFT_IVEC(g
, ai
), di
);
1028 ivec_sub(SHIFT_IVEC(g
, aj
), SHIFT_IVEC(g
, ai
), di
);
1030 ivec_sub(SHIFT_IVEC(g
, ak
), SHIFT_IVEC(g
, aj
), di
);
1035 svi
= pbc_rvec_sub(pbc
, x
[av
], x
[ai
], xvi
);
1042 if (fshift
&& (svi
!= CENTRAL
|| sji
!= CENTRAL
|| skj
!= CENTRAL
))
1044 rvec_dec(fshift
[svi
], fv
);
1045 fshift
[CENTRAL
][XX
] += fv
[XX
] - f1
[XX
] - (1-c1
)*f2
[XX
] + f3
[XX
];
1046 fshift
[CENTRAL
][YY
] += fv
[YY
] - f1
[YY
] - (1-c1
)*f2
[YY
] + f3
[YY
];
1047 fshift
[CENTRAL
][ZZ
] += fv
[ZZ
] - f1
[ZZ
] - (1-c1
)*f2
[ZZ
] + f3
[ZZ
];
1048 fshift
[ sji
][XX
] += f1
[XX
] - c1
*f2
[XX
] - f3
[XX
];
1049 fshift
[ sji
][YY
] += f1
[YY
] - c1
*f2
[YY
] - f3
[YY
];
1050 fshift
[ sji
][ZZ
] += f1
[ZZ
] - c1
*f2
[ZZ
] - f3
[ZZ
];
1051 fshift
[ skj
][XX
] += f2
[XX
];
1052 fshift
[ skj
][YY
] += f2
[YY
];
1053 fshift
[ skj
][ZZ
] += f2
[ZZ
];
1061 pbc_rvec_sub(pbc
, x
[av
], x
[ai
], xiv
);
1063 for (i
= 0; i
< DIM
; i
++)
1065 for (j
= 0; j
< DIM
; j
++)
1067 /* Note that xik=xij+xjk, so we have to add xij*f2 */
1070 + xij
[i
]*(f1
[j
] + (1 - c2
)*f2
[j
] - f3
[j
])
1076 /* TOTAL: 113 flops */
1079 static void spread_vsite3OUT(const t_iatom ia
[], real a
, real b
, real c
,
1081 rvec f
[], rvec fshift
[],
1082 gmx_bool VirCorr
, matrix dxdf
,
1083 const t_pbc
*pbc
, const t_graph
*g
)
1085 rvec xvi
, xij
, xik
, fv
, fj
, fk
;
1096 sji
= pbc_rvec_sub(pbc
, x
[aj
], x
[ai
], xij
);
1097 ski
= pbc_rvec_sub(pbc
, x
[ak
], x
[ai
], xik
);
1100 copy_rvec(f
[av
], fv
);
1107 fj
[XX
] = a
*fv
[XX
] - xik
[ZZ
]*cfy
+ xik
[YY
]*cfz
;
1108 fj
[YY
] = xik
[ZZ
]*cfx
+ a
*fv
[YY
] - xik
[XX
]*cfz
;
1109 fj
[ZZ
] = -xik
[YY
]*cfx
+ xik
[XX
]*cfy
+ a
*fv
[ZZ
];
1111 fk
[XX
] = b
*fv
[XX
] + xij
[ZZ
]*cfy
- xij
[YY
]*cfz
;
1112 fk
[YY
] = -xij
[ZZ
]*cfx
+ b
*fv
[YY
] + xij
[XX
]*cfz
;
1113 fk
[ZZ
] = xij
[YY
]*cfx
- xij
[XX
]*cfy
+ b
*fv
[ZZ
];
1116 f
[ai
][XX
] += fv
[XX
] - fj
[XX
] - fk
[XX
];
1117 f
[ai
][YY
] += fv
[YY
] - fj
[YY
] - fk
[YY
];
1118 f
[ai
][ZZ
] += fv
[ZZ
] - fj
[ZZ
] - fk
[ZZ
];
1119 rvec_inc(f
[aj
], fj
);
1120 rvec_inc(f
[ak
], fk
);
1125 ivec_sub(SHIFT_IVEC(g
, ia
[1]), SHIFT_IVEC(g
, ai
), di
);
1127 ivec_sub(SHIFT_IVEC(g
, aj
), SHIFT_IVEC(g
, ai
), di
);
1129 ivec_sub(SHIFT_IVEC(g
, ak
), SHIFT_IVEC(g
, ai
), di
);
1134 svi
= pbc_rvec_sub(pbc
, x
[av
], x
[ai
], xvi
);
1141 if (fshift
&& (svi
!= CENTRAL
|| sji
!= CENTRAL
|| ski
!= CENTRAL
))
1143 rvec_dec(fshift
[svi
], fv
);
1144 fshift
[CENTRAL
][XX
] += fv
[XX
] - fj
[XX
] - fk
[XX
];
1145 fshift
[CENTRAL
][YY
] += fv
[YY
] - fj
[YY
] - fk
[YY
];
1146 fshift
[CENTRAL
][ZZ
] += fv
[ZZ
] - fj
[ZZ
] - fk
[ZZ
];
1147 rvec_inc(fshift
[sji
], fj
);
1148 rvec_inc(fshift
[ski
], fk
);
1155 pbc_rvec_sub(pbc
, x
[av
], x
[ai
], xiv
);
1157 for (int i
= 0; i
< DIM
; i
++)
1159 for (int j
= 0; j
< DIM
; j
++)
1161 dxdf
[i
][j
] += -xiv
[i
]*fv
[j
] + xij
[i
]*fj
[j
] + xik
[i
]*fk
[j
];
1166 /* TOTAL: 54 flops */
1169 static void spread_vsite4FD(const t_iatom ia
[], real a
, real b
, real c
,
1171 rvec f
[], rvec fshift
[],
1172 gmx_bool VirCorr
, matrix dxdf
,
1173 const t_pbc
*pbc
, const t_graph
*g
)
1175 real d
, invl
, fproj
, a1
;
1176 rvec xvi
, xij
, xjk
, xjl
, xix
, fv
, temp
;
1177 int av
, ai
, aj
, ak
, al
;
1179 int svi
, sji
, skj
, slj
, m
;
1187 sji
= pbc_rvec_sub(pbc
, x
[aj
], x
[ai
], xij
);
1188 skj
= pbc_rvec_sub(pbc
, x
[ak
], x
[aj
], xjk
);
1189 slj
= pbc_rvec_sub(pbc
, x
[al
], x
[aj
], xjl
);
1192 /* xix goes from i to point x on the plane jkl */
1193 for (m
= 0; m
< DIM
; m
++)
1195 xix
[m
] = xij
[m
] + a
*xjk
[m
] + b
*xjl
[m
];
1199 invl
= gmx::invsqrt(iprod(xix
, xix
));
1201 /* 4 + ?10? flops */
1203 copy_rvec(f
[av
], fv
);
1205 fproj
= iprod(xix
, fv
)*invl
*invl
; /* = (xix . f)/(xix . xix) */
1207 for (m
= 0; m
< DIM
; m
++)
1209 temp
[m
] = d
*(fv
[m
] - fproj
*xix
[m
]);
1213 /* c is already calculated in constr_vsite3FD
1214 storing c somewhere will save 35 flops! */
1217 for (m
= 0; m
< DIM
; m
++)
1219 f
[ai
][m
] += fv
[m
] - temp
[m
];
1220 f
[aj
][m
] += a1
*temp
[m
];
1221 f
[ak
][m
] += a
*temp
[m
];
1222 f
[al
][m
] += b
*temp
[m
];
1228 ivec_sub(SHIFT_IVEC(g
, ia
[1]), SHIFT_IVEC(g
, ai
), di
);
1230 ivec_sub(SHIFT_IVEC(g
, aj
), SHIFT_IVEC(g
, ai
), di
);
1232 ivec_sub(SHIFT_IVEC(g
, ak
), SHIFT_IVEC(g
, aj
), di
);
1234 ivec_sub(SHIFT_IVEC(g
, al
), SHIFT_IVEC(g
, aj
), di
);
1239 svi
= pbc_rvec_sub(pbc
, x
[av
], x
[ai
], xvi
);
1247 (svi
!= CENTRAL
|| sji
!= CENTRAL
|| skj
!= CENTRAL
|| slj
!= CENTRAL
))
1249 rvec_dec(fshift
[svi
], fv
);
1250 for (m
= 0; m
< DIM
; m
++)
1252 fshift
[CENTRAL
][m
] += fv
[m
] - (1 + a
+ b
)*temp
[m
];
1253 fshift
[ sji
][m
] += temp
[m
];
1254 fshift
[ skj
][m
] += a
*temp
[m
];
1255 fshift
[ slj
][m
] += b
*temp
[m
];
1264 pbc_rvec_sub(pbc
, x
[av
], x
[ai
], xiv
);
1266 for (i
= 0; i
< DIM
; i
++)
1268 for (j
= 0; j
< DIM
; j
++)
1270 dxdf
[i
][j
] += -xiv
[i
]*fv
[j
] + xix
[i
]*temp
[j
];
1275 /* TOTAL: 77 flops */
1279 static void spread_vsite4FDN(const t_iatom ia
[], real a
, real b
, real c
,
1281 rvec f
[], rvec fshift
[],
1282 gmx_bool VirCorr
, matrix dxdf
,
1283 const t_pbc
*pbc
, const t_graph
*g
)
1285 rvec xvi
, xij
, xik
, xil
, ra
, rb
, rja
, rjb
, rab
, rm
, rt
;
1286 rvec fv
, fj
, fk
, fl
;
1290 int av
, ai
, aj
, ak
, al
;
1291 int svi
, sij
, sik
, sil
;
1293 /* DEBUG: check atom indices */
1300 copy_rvec(f
[av
], fv
);
1302 sij
= pbc_rvec_sub(pbc
, x
[aj
], x
[ai
], xij
);
1303 sik
= pbc_rvec_sub(pbc
, x
[ak
], x
[ai
], xik
);
1304 sil
= pbc_rvec_sub(pbc
, x
[al
], x
[ai
], xil
);
1317 rvec_sub(ra
, xij
, rja
);
1318 rvec_sub(rb
, xij
, rjb
);
1319 rvec_sub(rb
, ra
, rab
);
1322 cprod(rja
, rjb
, rm
);
1325 invrm
= gmx::invsqrt(norm2(rm
));
1326 denom
= invrm
*invrm
;
1329 cfx
= c
*invrm
*fv
[XX
];
1330 cfy
= c
*invrm
*fv
[YY
];
1331 cfz
= c
*invrm
*fv
[ZZ
];
1342 fj
[XX
] = ( -rm
[XX
]*rt
[XX
]) * cfx
+ ( rab
[ZZ
]-rm
[YY
]*rt
[XX
]) * cfy
+ (-rab
[YY
]-rm
[ZZ
]*rt
[XX
]) * cfz
;
1343 fj
[YY
] = (-rab
[ZZ
]-rm
[XX
]*rt
[YY
]) * cfx
+ ( -rm
[YY
]*rt
[YY
]) * cfy
+ ( rab
[XX
]-rm
[ZZ
]*rt
[YY
]) * cfz
;
1344 fj
[ZZ
] = ( rab
[YY
]-rm
[XX
]*rt
[ZZ
]) * cfx
+ (-rab
[XX
]-rm
[YY
]*rt
[ZZ
]) * cfy
+ ( -rm
[ZZ
]*rt
[ZZ
]) * cfz
;
1355 fk
[XX
] = ( -rm
[XX
]*rt
[XX
]) * cfx
+ (-a
*rjb
[ZZ
]-rm
[YY
]*rt
[XX
]) * cfy
+ ( a
*rjb
[YY
]-rm
[ZZ
]*rt
[XX
]) * cfz
;
1356 fk
[YY
] = ( a
*rjb
[ZZ
]-rm
[XX
]*rt
[YY
]) * cfx
+ ( -rm
[YY
]*rt
[YY
]) * cfy
+ (-a
*rjb
[XX
]-rm
[ZZ
]*rt
[YY
]) * cfz
;
1357 fk
[ZZ
] = (-a
*rjb
[YY
]-rm
[XX
]*rt
[ZZ
]) * cfx
+ ( a
*rjb
[XX
]-rm
[YY
]*rt
[ZZ
]) * cfy
+ ( -rm
[ZZ
]*rt
[ZZ
]) * cfz
;
1368 fl
[XX
] = ( -rm
[XX
]*rt
[XX
]) * cfx
+ ( b
*rja
[ZZ
]-rm
[YY
]*rt
[XX
]) * cfy
+ (-b
*rja
[YY
]-rm
[ZZ
]*rt
[XX
]) * cfz
;
1369 fl
[YY
] = (-b
*rja
[ZZ
]-rm
[XX
]*rt
[YY
]) * cfx
+ ( -rm
[YY
]*rt
[YY
]) * cfy
+ ( b
*rja
[XX
]-rm
[ZZ
]*rt
[YY
]) * cfz
;
1370 fl
[ZZ
] = ( b
*rja
[YY
]-rm
[XX
]*rt
[ZZ
]) * cfx
+ (-b
*rja
[XX
]-rm
[YY
]*rt
[ZZ
]) * cfy
+ ( -rm
[ZZ
]*rt
[ZZ
]) * cfz
;
1373 f
[ai
][XX
] += fv
[XX
] - fj
[XX
] - fk
[XX
] - fl
[XX
];
1374 f
[ai
][YY
] += fv
[YY
] - fj
[YY
] - fk
[YY
] - fl
[YY
];
1375 f
[ai
][ZZ
] += fv
[ZZ
] - fj
[ZZ
] - fk
[ZZ
] - fl
[ZZ
];
1376 rvec_inc(f
[aj
], fj
);
1377 rvec_inc(f
[ak
], fk
);
1378 rvec_inc(f
[al
], fl
);
1383 ivec_sub(SHIFT_IVEC(g
, av
), SHIFT_IVEC(g
, ai
), di
);
1385 ivec_sub(SHIFT_IVEC(g
, aj
), SHIFT_IVEC(g
, ai
), di
);
1387 ivec_sub(SHIFT_IVEC(g
, ak
), SHIFT_IVEC(g
, ai
), di
);
1389 ivec_sub(SHIFT_IVEC(g
, al
), SHIFT_IVEC(g
, ai
), di
);
1394 svi
= pbc_rvec_sub(pbc
, x
[av
], x
[ai
], xvi
);
1401 if (fshift
&& (svi
!= CENTRAL
|| sij
!= CENTRAL
|| sik
!= CENTRAL
|| sil
!= CENTRAL
))
1403 rvec_dec(fshift
[svi
], fv
);
1404 fshift
[CENTRAL
][XX
] += fv
[XX
] - fj
[XX
] - fk
[XX
] - fl
[XX
];
1405 fshift
[CENTRAL
][YY
] += fv
[YY
] - fj
[YY
] - fk
[YY
] - fl
[YY
];
1406 fshift
[CENTRAL
][ZZ
] += fv
[ZZ
] - fj
[ZZ
] - fk
[ZZ
] - fl
[ZZ
];
1407 rvec_inc(fshift
[sij
], fj
);
1408 rvec_inc(fshift
[sik
], fk
);
1409 rvec_inc(fshift
[sil
], fl
);
1417 pbc_rvec_sub(pbc
, x
[av
], x
[ai
], xiv
);
1419 for (i
= 0; i
< DIM
; i
++)
1421 for (j
= 0; j
< DIM
; j
++)
1423 dxdf
[i
][j
] += -xiv
[i
]*fv
[j
] + xij
[i
]*fj
[j
] + xik
[i
]*fk
[j
] + xil
[i
]*fl
[j
];
1428 /* Total: 207 flops (Yuck!) */
1432 static int spread_vsiten(const t_iatom ia
[], const t_iparams ip
[],
1434 rvec f
[], rvec fshift
[],
1435 const t_pbc
*pbc
, const t_graph
*g
)
1443 n3
= 3*ip
[ia
[0]].vsiten
.n
;
1445 copy_rvec(x
[av
], xv
);
1447 for (i
= 0; i
< n3
; i
+= 3)
1452 ivec_sub(SHIFT_IVEC(g
, ai
), SHIFT_IVEC(g
, av
), di
);
1457 siv
= pbc_dx_aiuc(pbc
, x
[ai
], xv
, dx
);
1463 a
= ip
[ia
[i
]].vsiten
.a
;
1464 svmul(a
, f
[av
], fi
);
1465 rvec_inc(f
[ai
], fi
);
1466 if (fshift
&& siv
!= CENTRAL
)
1468 rvec_inc(fshift
[siv
], fi
);
1469 rvec_dec(fshift
[CENTRAL
], fi
);
1478 static int vsite_count(const t_ilist
*ilist
, int ftype
)
1480 if (ftype
== F_VSITEN
)
1482 return ilist
[ftype
].nr
/3;
1486 return ilist
[ftype
].nr
/(1 + interaction_function
[ftype
].nratoms
);
1490 static void spread_vsite_f_thread(const gmx_vsite_t
*vsite
,
1492 rvec f
[], rvec
*fshift
,
1493 gmx_bool VirCorr
, matrix dxdf
,
1494 t_iparams ip
[], const t_ilist ilist
[],
1495 const t_graph
*g
, const t_pbc
*pbc_null
)
1497 const PbcMode pbcMode
= getPbcMode(pbc_null
, vsite
);
1498 /* We need another pbc pointer, as with charge groups we switch per vsite */
1499 const t_pbc
*pbc_null2
= pbc_null
;
1500 const int *vsite_pbc
= nullptr;
1502 /* this loop goes backwards to be able to build *
1503 * higher type vsites from lower types */
1504 for (int ftype
= c_ftypeVsiteEnd
- 1; ftype
>= c_ftypeVsiteStart
; ftype
--)
1506 if (ilist
[ftype
].nr
== 0)
1512 int nra
= interaction_function
[ftype
].nratoms
;
1514 int nr
= ilist
[ftype
].nr
;
1516 const t_iatom
*ia
= ilist
[ftype
].iatoms
;
1518 if (pbcMode
== PbcMode::all
)
1520 pbc_null2
= pbc_null
;
1522 else if (pbcMode
== PbcMode::chargeGroup
)
1524 vsite_pbc
= vsite
->vsite_pbc_loc
[ftype
- c_ftypeVsiteStart
];
1527 for (int i
= 0; i
< nr
; )
1529 if (vsite_pbc
!= nullptr)
1531 if (vsite_pbc
[i
/(1 + nra
)] > -2)
1533 pbc_null2
= pbc_null
;
1537 pbc_null2
= nullptr;
1543 /* Constants for constructing */
1545 a1
= ip
[tp
].vsite
.a
;
1546 /* Construct the vsite depending on type */
1550 spread_vsite2(ia
, a1
, x
, f
, fshift
, pbc_null2
, g
);
1553 b1
= ip
[tp
].vsite
.b
;
1554 spread_vsite3(ia
, a1
, b1
, x
, f
, fshift
, pbc_null2
, g
);
1557 b1
= ip
[tp
].vsite
.b
;
1558 spread_vsite3FD(ia
, a1
, b1
, x
, f
, fshift
, VirCorr
, dxdf
, pbc_null2
, g
);
1561 b1
= ip
[tp
].vsite
.b
;
1562 spread_vsite3FAD(ia
, a1
, b1
, x
, f
, fshift
, VirCorr
, dxdf
, pbc_null2
, g
);
1565 b1
= ip
[tp
].vsite
.b
;
1566 c1
= ip
[tp
].vsite
.c
;
1567 spread_vsite3OUT(ia
, a1
, b1
, c1
, x
, f
, fshift
, VirCorr
, dxdf
, pbc_null2
, g
);
1570 b1
= ip
[tp
].vsite
.b
;
1571 c1
= ip
[tp
].vsite
.c
;
1572 spread_vsite4FD(ia
, a1
, b1
, c1
, x
, f
, fshift
, VirCorr
, dxdf
, pbc_null2
, g
);
1575 b1
= ip
[tp
].vsite
.b
;
1576 c1
= ip
[tp
].vsite
.c
;
1577 spread_vsite4FDN(ia
, a1
, b1
, c1
, x
, f
, fshift
, VirCorr
, dxdf
, pbc_null2
, g
);
1580 inc
= spread_vsiten(ia
, ip
, x
, f
, fshift
, pbc_null2
, g
);
1583 gmx_fatal(FARGS
, "No such vsite type %d in %s, line %d",
1584 ftype
, __FILE__
, __LINE__
);
1586 clear_rvec(f
[ia
[1]]);
1588 /* Increment loop variables */
1596 /*! \brief Clears the task force buffer elements that are written by task idTask */
1597 static void clearTaskForceBufferUsedElements(InterdependentTask
*idTask
)
1599 int ntask
= idTask
->spreadTask
.size();
1600 for (int ti
= 0; ti
< ntask
; ti
++)
1602 const AtomIndex
*atomList
= &idTask
->atomIndex
[idTask
->spreadTask
[ti
]];
1603 int natom
= atomList
->atom
.size();
1604 RVec
*force
= idTask
->force
.data();
1605 for (int i
= 0; i
< natom
; i
++)
1607 clear_rvec(force
[atomList
->atom
[i
]]);
1612 void spread_vsite_f(const gmx_vsite_t
*vsite
,
1613 const rvec
* gmx_restrict x
,
1614 rvec
* gmx_restrict f
, rvec
* gmx_restrict fshift
,
1615 gmx_bool VirCorr
, matrix vir
,
1616 t_nrnb
*nrnb
, const t_idef
*idef
,
1617 int ePBC
, gmx_bool bMolPBC
, const t_graph
*g
, const matrix box
,
1620 const bool useDomdec
= vsite
->useDomdec
;
1621 GMX_ASSERT(!useDomdec
|| (cr
!= nullptr && DOMAINDECOMP(cr
)), "When vsites are set up with domain decomposition, we need a valid commrec");
1623 t_pbc pbc
, *pbc_null
;
1625 /* We only need to do pbc when we have inter-cg vsites */
1626 if ((useDomdec
|| bMolPBC
) && vsite
->n_intercg_vsite
)
1628 /* This is wasting some CPU time as we now do this multiple times
1631 pbc_null
= set_pbc_dd(&pbc
, ePBC
, useDomdec
? cr
->dd
->nc
: nullptr, FALSE
, box
);
1640 dd_clear_f_vsites(cr
->dd
, f
);
1643 if (vsite
->nthreads
== 1)
1650 spread_vsite_f_thread(vsite
,
1653 idef
->iparams
, idef
->il
,
1658 for (int i
= 0; i
< DIM
; i
++)
1660 for (int j
= 0; j
< DIM
; j
++)
1662 vir
[i
][j
] += -0.5*dxdf
[i
][j
];
1669 /* First spread the vsites that might depend on non-local vsites */
1672 clear_mat(vsite
->tData
[vsite
->nthreads
]->dxdf
);
1674 spread_vsite_f_thread(vsite
,
1676 VirCorr
, vsite
->tData
[vsite
->nthreads
]->dxdf
,
1678 vsite
->tData
[vsite
->nthreads
]->ilist
,
1681 #pragma omp parallel num_threads(vsite->nthreads)
1685 int thread
= gmx_omp_get_thread_num();
1686 VsiteThread
*tData
= vsite
->tData
[thread
];
1689 if (thread
== 0 || fshift
== nullptr)
1695 fshift_t
= tData
->fshift
;
1697 for (int i
= 0; i
< SHIFTS
; i
++)
1699 clear_rvec(fshift_t
[i
]);
1704 clear_mat(tData
->dxdf
);
1707 if (tData
->useInterdependentTask
)
1709 /* Spread the vsites that spread outside our local range.
1710 * This is done using a thread-local force buffer force.
1711 * First we need to copy the input vsite forces to force.
1713 InterdependentTask
*idTask
= &tData
->idTask
;
1715 /* Clear the buffer elements set by our task during
1716 * the last call to spread_vsite_f.
1718 clearTaskForceBufferUsedElements(idTask
);
1720 int nvsite
= idTask
->vsite
.size();
1721 for (int i
= 0; i
< nvsite
; i
++)
1723 copy_rvec(f
[idTask
->vsite
[i
]],
1724 idTask
->force
[idTask
->vsite
[i
]]);
1726 spread_vsite_f_thread(vsite
,
1727 x
, as_rvec_array(idTask
->force
.data()), fshift_t
,
1728 VirCorr
, tData
->dxdf
,
1730 tData
->idTask
.ilist
,
1733 /* We need a barrier before reducing forces below
1734 * that have been produced by a different thread above.
1738 /* Loop over all thread task and reduce forces they
1739 * produced on atoms that fall in our range.
1740 * Note that atomic reduction would be a simpler solution,
1741 * but that might not have good support on all platforms.
1743 int ntask
= idTask
->reduceTask
.size();
1744 for (int ti
= 0; ti
< ntask
; ti
++)
1746 const InterdependentTask
*idt_foreign
= &vsite
->tData
[idTask
->reduceTask
[ti
]]->idTask
;
1747 const AtomIndex
*atomList
= &idt_foreign
->atomIndex
[thread
];
1748 const RVec
*f_foreign
= idt_foreign
->force
.data();
1750 int natom
= atomList
->atom
.size();
1751 for (int i
= 0; i
< natom
; i
++)
1753 int ind
= atomList
->atom
[i
];
1754 rvec_inc(f
[ind
], f_foreign
[ind
]);
1755 /* Clearing of f_foreign is done at the next step */
1758 /* Clear the vsite forces, both in f and force */
1759 for (int i
= 0; i
< nvsite
; i
++)
1761 int ind
= tData
->idTask
.vsite
[i
];
1763 clear_rvec(tData
->idTask
.force
[ind
]);
1767 /* Spread the vsites that spread locally only */
1768 spread_vsite_f_thread(vsite
,
1770 VirCorr
, tData
->dxdf
,
1775 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
;
1778 if (fshift
!= nullptr)
1780 for (int th
= 1; th
< vsite
->nthreads
; th
++)
1782 for (int i
= 0; i
< SHIFTS
; i
++)
1784 rvec_inc(fshift
[i
], vsite
->tData
[th
]->fshift
[i
]);
1791 for (int th
= 0; th
< vsite
->nthreads
+ 1; th
++)
1793 /* MSVC doesn't like matrix references, so we use a pointer */
1794 const matrix
*dxdf
= &vsite
->tData
[th
]->dxdf
;
1796 for (int i
= 0; i
< DIM
; i
++)
1798 for (int j
= 0; j
< DIM
; j
++)
1800 vir
[i
][j
] += -0.5*(*dxdf
)[i
][j
];
1809 dd_move_f_vsites(cr
->dd
, f
, fshift
);
1812 inc_nrnb(nrnb
, eNR_VSITE2
, vsite_count(idef
->il
, F_VSITE2
));
1813 inc_nrnb(nrnb
, eNR_VSITE3
, vsite_count(idef
->il
, F_VSITE3
));
1814 inc_nrnb(nrnb
, eNR_VSITE3FD
, vsite_count(idef
->il
, F_VSITE3FD
));
1815 inc_nrnb(nrnb
, eNR_VSITE3FAD
, vsite_count(idef
->il
, F_VSITE3FAD
));
1816 inc_nrnb(nrnb
, eNR_VSITE3OUT
, vsite_count(idef
->il
, F_VSITE3OUT
));
1817 inc_nrnb(nrnb
, eNR_VSITE4FD
, vsite_count(idef
->il
, F_VSITE4FD
));
1818 inc_nrnb(nrnb
, eNR_VSITE4FDN
, vsite_count(idef
->il
, F_VSITE4FDN
));
1819 inc_nrnb(nrnb
, eNR_VSITEN
, vsite_count(idef
->il
, F_VSITEN
));
1822 /*! \brief Returns the an array with charge-group indices for each atom
1824 * \param[in] chargeGroups The charge group block struct
1826 static std::vector
<int> atom2cg(const t_block
&chargeGroups
)
1828 std::vector
<int> a2cg(chargeGroups
.index
[chargeGroups
.nr
], 0);
1830 for (int chargeGroup
= 0; chargeGroup
< chargeGroups
.nr
; chargeGroup
++)
1832 std::fill(a2cg
.begin() + chargeGroups
.index
[chargeGroup
],
1833 a2cg
.begin() + chargeGroups
.index
[chargeGroup
+ 1],
1840 int count_intercg_vsites(const gmx_mtop_t
*mtop
)
1842 gmx_molblock_t
*molb
;
1843 gmx_moltype_t
*molt
;
1844 int n_intercg_vsite
;
1846 n_intercg_vsite
= 0;
1847 for (int mb
= 0; mb
< mtop
->nmolblock
; mb
++)
1849 molb
= &mtop
->molblock
[mb
];
1850 molt
= &mtop
->moltype
[molb
->type
];
1852 std::vector
<int> a2cg
= atom2cg(molt
->cgs
);
1853 for (int ftype
= c_ftypeVsiteStart
; ftype
< c_ftypeVsiteEnd
; ftype
++)
1855 int nral
= NRAL(ftype
);
1856 t_ilist
*il
= &molt
->ilist
[ftype
];
1857 const t_iatom
*ia
= il
->iatoms
;
1858 for (int i
= 0; i
< il
->nr
; i
+= 1 + nral
)
1860 int cg
= a2cg
[ia
[1+i
]];
1861 for (int a
= 1; a
< nral
; a
++)
1863 if (a2cg
[ia
[1+a
]] != cg
)
1865 n_intercg_vsite
+= molb
->nmol
;
1873 return n_intercg_vsite
;
1876 static int **get_vsite_pbc(const t_iparams
*iparams
, const t_ilist
*ilist
,
1877 const t_atom
*atom
, const t_mdatoms
*md
,
1880 /* Make an atom to charge group index */
1881 std::vector
<int> a2cg
= atom2cg(cgs
);
1883 /* Make an array that tells if the pbc of an atom is set */
1884 std::vector
<bool> pbc_set(cgs
.index
[cgs
.nr
], false);
1885 /* PBC is set for all non vsites */
1886 for (int a
= 0; a
< cgs
.index
[cgs
.nr
]; a
++)
1888 if ((atom
&& atom
[a
].ptype
!= eptVSite
) ||
1889 (md
&& md
->ptype
[a
] != eptVSite
))
1896 snew(vsite_pbc
, F_VSITEN
-F_VSITE2
+1);
1898 for (int ftype
= c_ftypeVsiteStart
; ftype
< c_ftypeVsiteEnd
; ftype
++)
1901 int nral
= NRAL(ftype
);
1902 const t_ilist
*il
= &ilist
[ftype
];
1903 const t_iatom
*ia
= il
->iatoms
;
1906 snew(vsite_pbc
[ftype
-F_VSITE2
], il
->nr
/(1 + nral
));
1907 vsite_pbc_f
= vsite_pbc
[ftype
-F_VSITE2
];
1912 int vsi
= i
/(1 + nral
);
1913 t_iatom vsite
= ia
[i
+1];
1914 int cg_v
= a2cg
[vsite
];
1915 /* A value of -2 signals that this vsite and its contructing
1916 * atoms are all within the same cg, so no pbc is required.
1918 vsite_pbc_f
[vsi
] = -2;
1919 /* Check if constructing atoms are outside the vsite's cg */
1921 if (ftype
== F_VSITEN
)
1923 nc3
= 3*iparams
[ia
[i
]].vsiten
.n
;
1924 for (int j
= 0; j
< nc3
; j
+= 3)
1926 if (a2cg
[ia
[i
+j
+2]] != cg_v
)
1928 vsite_pbc_f
[vsi
] = -1;
1934 for (int a
= 1; a
< nral
; a
++)
1936 if (a2cg
[ia
[i
+1+a
]] != cg_v
)
1938 vsite_pbc_f
[vsi
] = -1;
1942 if (vsite_pbc_f
[vsi
] == -1)
1944 /* Check if this is the first processed atom of a vsite only cg */
1945 gmx_bool bViteOnlyCG_and_FirstAtom
= TRUE
;
1946 for (int a
= cgs
.index
[cg_v
]; a
< cgs
.index
[cg_v
+ 1]; a
++)
1948 /* Non-vsites already have pbc set, so simply check for pbc_set */
1951 bViteOnlyCG_and_FirstAtom
= FALSE
;
1955 if (bViteOnlyCG_and_FirstAtom
)
1957 /* First processed atom of a vsite only charge group.
1958 * The pbc of the input coordinates to construct_vsites
1959 * should be preserved.
1961 vsite_pbc_f
[vsi
] = vsite
;
1963 else if (cg_v
!= a2cg
[ia
[1+i
+1]])
1965 /* This vsite has a different charge group index
1966 * than it's first constructing atom
1967 * and the charge group has more than one atom,
1968 * search for the first normal particle
1969 * or vsite that already had its pbc defined.
1970 * If nothing is found, use full pbc for this vsite.
1972 for (int a
= cgs
.index
[cg_v
]; a
< cgs
.index
[cg_v
+ 1]; a
++)
1974 if (a
!= vsite
&& pbc_set
[a
])
1976 vsite_pbc_f
[vsi
] = a
;
1979 fprintf(debug
, "vsite %d match pbc with atom %d\n",
1987 fprintf(debug
, "vsite atom %d cg %d - %d pbc atom %d\n",
1988 vsite
+1, cgs
.index
[cg_v
] + 1, cgs
.index
[cg_v
+ 1],
1989 vsite_pbc_f
[vsi
] + 1);
1993 if (ftype
== F_VSITEN
)
1995 /* The other entries in vsite_pbc_f are not used for center vsites */
2003 /* This vsite now has its pbc defined */
2004 pbc_set
[vsite
] = true;
2013 gmx_vsite_t
*initVsite(const gmx_mtop_t
&mtop
,
2016 GMX_RELEASE_ASSERT(cr
!= nullptr, "We need a valid commrec");
2018 /* check if there are vsites */
2020 for (int ftype
= 0; ftype
< F_NRE
; ftype
++)
2022 if (interaction_function
[ftype
].flags
& IF_VSITE
)
2024 GMX_ASSERT(ftype
>= c_ftypeVsiteStart
&& ftype
< c_ftypeVsiteEnd
, "c_ftypeVsiteStart and/or c_ftypeVsiteEnd do not have correct values");
2026 nvsite
+= gmx_mtop_ftype_count(&mtop
, ftype
);
2030 GMX_ASSERT(ftype
< c_ftypeVsiteStart
|| ftype
>= c_ftypeVsiteEnd
, "c_ftypeVsiteStart and/or c_ftypeVsiteEnd do not have correct values");
2039 gmx_vsite_t
*vsite
= new(gmx_vsite_t
);
2041 vsite
->n_intercg_vsite
= count_intercg_vsites(&mtop
);
2043 vsite
->bHaveChargeGroups
= (ncg_mtop(&mtop
) < mtop
.natoms
);
2045 vsite
->useDomdec
= (DOMAINDECOMP(cr
) && cr
->dd
->nnodes
> 1);
2047 /* If we don't have charge groups, the vsite follows its own pbc.
2049 * With charge groups, each vsite needs to follow the pbc of the charge
2050 * group. Thus we need to keep track of PBC. Here we assume that without
2051 * domain decomposition all molecules are whole (which will not be
2052 * the case with periodic molecules).
2054 if (vsite
->bHaveChargeGroups
&&
2055 vsite
->n_intercg_vsite
> 0 &&
2058 vsite
->nvsite_pbc_molt
= mtop
.nmoltype
;
2059 snew(vsite
->vsite_pbc_molt
, vsite
->nvsite_pbc_molt
);
2060 for (int mt
= 0; mt
< mtop
.nmoltype
; mt
++)
2062 const gmx_moltype_t
&molt
= mtop
.moltype
[mt
];
2063 vsite
->vsite_pbc_molt
[mt
] = get_vsite_pbc(mtop
.ffparams
.iparams
,
2065 molt
.atoms
.atom
, nullptr,
2069 snew(vsite
->vsite_pbc_loc_nalloc
, c_ftypeVsiteEnd
- c_ftypeVsiteStart
);
2070 snew(vsite
->vsite_pbc_loc
, c_ftypeVsiteEnd
- c_ftypeVsiteStart
);
2074 vsite
->vsite_pbc_molt
= nullptr;
2075 vsite
->vsite_pbc_loc
= nullptr;
2078 vsite
->nthreads
= gmx_omp_nthreads_get(emntVSITE
);
2080 if (vsite
->nthreads
> 1)
2082 /* We need one extra thread data structure for the overlap vsites */
2083 snew(vsite
->tData
, vsite
->nthreads
+ 1);
2084 #pragma omp parallel for num_threads(vsite->nthreads) schedule(static)
2085 for (int thread
= 0; thread
< vsite
->nthreads
; thread
++)
2089 vsite
->tData
[thread
] = new VsiteThread
;
2091 InterdependentTask
*idTask
= &vsite
->tData
[thread
]->idTask
;
2093 idTask
->atomIndex
.resize(vsite
->nthreads
);
2095 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
;
2097 if (vsite
->nthreads
> 1)
2099 vsite
->tData
[vsite
->nthreads
] = new VsiteThread
;
2103 vsite
->taskIndex
= nullptr;
2104 vsite
->taskIndexNalloc
= 0;
2109 static gmx_inline
void flagAtom(InterdependentTask
*idTask
, int atom
,
2110 int thread
, int nthread
, int natperthread
)
2112 if (!idTask
->use
[atom
])
2114 idTask
->use
[atom
] = true;
2115 thread
= atom
/natperthread
;
2116 /* Assign all non-local atom force writes to thread 0 */
2117 if (thread
>= nthread
)
2121 idTask
->atomIndex
[thread
].atom
.push_back(atom
);
2125 /*\brief Here we try to assign all vsites that are in our local range.
2127 * Our task local atom range is tData->rangeStart - tData->rangeEnd.
2128 * Vsites that depend only on local atoms, as indicated by taskIndex[]==thread,
2129 * are assigned to task tData->ilist. Vsites that depend on non-local atoms
2130 * but not on other vsites are assigned to task tData->id_task.ilist.
2131 * taskIndex[] is set for all vsites in our range, either to our local tasks
2132 * or to the single last task as taskIndex[]=2*nthreads.
2134 static void assignVsitesToThread(VsiteThread
*tData
,
2139 const t_ilist
*ilist
,
2140 const t_iparams
*ip
,
2141 const unsigned short *ptype
)
2143 for (int ftype
= c_ftypeVsiteStart
; ftype
< c_ftypeVsiteEnd
; ftype
++)
2145 tData
->ilist
[ftype
].nr
= 0;
2146 tData
->idTask
.ilist
[ftype
].nr
= 0;
2148 int nral1
= 1 + NRAL(ftype
);
2150 t_iatom
*iat
= ilist
[ftype
].iatoms
;
2151 for (int i
= 0; i
< ilist
[ftype
].nr
; )
2153 if (ftype
== F_VSITEN
)
2155 /* The 3 below is from 1+NRAL(ftype)=3 */
2156 inc
= ip
[iat
[i
]].vsiten
.n
*3;
2159 if (iat
[1 + i
] < tData
->rangeStart
||
2160 iat
[1 + i
] >= tData
->rangeEnd
)
2162 /* This vsite belongs to a different thread */
2167 /* We would like to assign this vsite to task thread,
2168 * but it might depend on atoms outside the atom range of thread
2169 * or on another vsite not assigned to task thread.
2172 if (ftype
!= F_VSITEN
)
2174 for (int j
= i
+ 2; j
< i
+ nral1
; j
++)
2176 /* Do a range check to avoid a harmless race on taskIndex */
2177 if (iat
[j
] < tData
->rangeStart
||
2178 iat
[j
] >= tData
->rangeEnd
||
2179 taskIndex
[iat
[j
]] != thread
)
2181 if (!tData
->useInterdependentTask
||
2182 ptype
[iat
[j
]] == eptVSite
)
2184 /* At least one constructing atom is a vsite
2185 * that is not assigned to the same thread.
2186 * Put this vsite into a separate task.
2192 /* There are constructing atoms outside our range,
2193 * put this vsite into a second task to be executed
2194 * on the same thread. During construction no barrier
2195 * is needed between the two tasks on the same thread.
2196 * During spreading we need to run this task with
2197 * an additional thread-local intermediate force buffer
2198 * (or atomic reduction) and a barrier between the two
2201 task
= nthread
+ thread
;
2207 for (int j
= i
+ 2; j
< i
+ inc
; j
+= 3)
2209 /* Do a range check to avoid a harmless race on taskIndex */
2210 if (iat
[j
] < tData
->rangeStart
||
2211 iat
[j
] >= tData
->rangeEnd
||
2212 taskIndex
[iat
[j
]] != thread
)
2214 GMX_ASSERT(ptype
[iat
[j
]] != eptVSite
, "A vsite to be assigned in assignVsitesToThread has a vsite as a constructing atom that does not belong to our task, such vsites should be assigned to the single 'master' task");
2216 task
= nthread
+ thread
;
2221 /* Update this vsite's thread index entry */
2222 taskIndex
[iat
[1+i
]] = task
;
2224 if (task
== thread
|| task
== nthread
+ thread
)
2226 /* Copy this vsite to the thread data struct of thread */
2230 il_task
= &tData
->ilist
[ftype
];
2234 il_task
= &tData
->idTask
.ilist
[ftype
];
2236 /* Ensure we have sufficient memory allocated */
2237 if (il_task
->nr
+ inc
> il_task
->nalloc
)
2239 il_task
->nalloc
= over_alloc_large(il_task
->nr
+ inc
);
2240 srenew(il_task
->iatoms
, il_task
->nalloc
);
2242 /* Copy the vsite data to the thread-task local array */
2243 for (int j
= i
; j
< i
+ inc
; j
++)
2245 il_task
->iatoms
[il_task
->nr
++] = iat
[j
];
2247 if (task
== nthread
+ thread
)
2249 /* This vsite write outside our own task force block.
2250 * Put it into the interdependent task list and flag
2251 * the atoms involved for reduction.
2253 tData
->idTask
.vsite
.push_back(iat
[i
+ 1]);
2254 if (ftype
!= F_VSITEN
)
2256 for (int j
= i
+ 2; j
< i
+ nral1
; j
++)
2258 flagAtom(&tData
->idTask
, iat
[j
],
2259 thread
, nthread
, natperthread
);
2264 for (int j
= i
+ 2; j
< i
+ inc
; j
+= 3)
2266 flagAtom(&tData
->idTask
, iat
[j
],
2267 thread
, nthread
, natperthread
);
2278 /*! \brief Assign all vsites with taskIndex[]==task to task tData */
2279 static void assignVsitesToSingleTask(VsiteThread
*tData
,
2281 const int *taskIndex
,
2282 const t_ilist
*ilist
,
2283 const t_iparams
*ip
)
2285 for (int ftype
= c_ftypeVsiteStart
; ftype
< c_ftypeVsiteEnd
; ftype
++)
2287 tData
->ilist
[ftype
].nr
= 0;
2288 tData
->idTask
.ilist
[ftype
].nr
= 0;
2290 int nral1
= 1 + NRAL(ftype
);
2292 t_iatom
*iat
= ilist
[ftype
].iatoms
;
2293 t_ilist
*il_task
= &tData
->ilist
[ftype
];
2295 for (int i
= 0; i
< ilist
[ftype
].nr
; )
2297 if (ftype
== F_VSITEN
)
2299 /* The 3 below is from 1+NRAL(ftype)=3 */
2300 inc
= ip
[iat
[i
]].vsiten
.n
*3;
2302 /* Check if the vsite is assigned to our task */
2303 if (taskIndex
[iat
[1 + i
]] == task
)
2305 /* Ensure we have sufficient memory allocated */
2306 if (il_task
->nr
+ inc
> il_task
->nalloc
)
2308 il_task
->nalloc
= over_alloc_large(il_task
->nr
+ inc
);
2309 srenew(il_task
->iatoms
, il_task
->nalloc
);
2311 /* Copy the vsite data to the thread-task local array */
2312 for (int j
= i
; j
< i
+ inc
; j
++)
2314 il_task
->iatoms
[il_task
->nr
++] = iat
[j
];
2323 void split_vsites_over_threads(const t_ilist
*ilist
,
2324 const t_iparams
*ip
,
2325 const t_mdatoms
*mdatoms
,
2328 int vsite_atom_range
, natperthread
;
2330 if (vsite
->nthreads
== 1)
2336 /* The current way of distributing the vsites over threads in primitive.
2337 * We divide the atom range 0 - natoms_in_vsite uniformly over threads,
2338 * without taking into account how the vsites are distributed.
2339 * Without domain decomposition we at least tighten the upper bound
2340 * of the range (useful for common systems such as a vsite-protein
2342 * With domain decomposition, as long as the vsites are distributed
2343 * uniformly in each domain along the major dimension, usually x,
2344 * it will also perform well.
2346 if (!vsite
->useDomdec
)
2348 vsite_atom_range
= -1;
2349 for (int ftype
= c_ftypeVsiteStart
; ftype
< c_ftypeVsiteEnd
; ftype
++)
2352 if (ftype
!= F_VSITEN
)
2354 int nral1
= 1 + NRAL(ftype
);
2355 const t_iatom
*iat
= ilist
[ftype
].iatoms
;
2356 for (int i
= 0; i
< ilist
[ftype
].nr
; i
+= nral1
)
2358 for (int j
= i
+ 1; j
< i
+ nral1
; j
++)
2360 vsite_atom_range
= std::max(vsite_atom_range
, iat
[j
]);
2368 const t_iatom
*iat
= ilist
[ftype
].iatoms
;
2371 while (i
< ilist
[ftype
].nr
)
2373 /* The 3 below is from 1+NRAL(ftype)=3 */
2374 vs_ind_end
= i
+ ip
[iat
[i
]].vsiten
.n
*3;
2376 vsite_atom_range
= std::max(vsite_atom_range
, iat
[i
+1]);
2377 while (i
< vs_ind_end
)
2379 vsite_atom_range
= std::max(vsite_atom_range
, iat
[i
+2]);
2387 natperthread
= (vsite_atom_range
+ vsite
->nthreads
- 1)/vsite
->nthreads
;
2391 /* Any local or not local atom could be involved in virtual sites.
2392 * But since we usually have very few non-local virtual sites
2393 * (only non-local vsites that depend on local vsites),
2394 * we distribute the local atom range equally over the threads.
2395 * When assigning vsites to threads, we should take care that the last
2396 * threads also covers the non-local range.
2398 vsite_atom_range
= mdatoms
->nr
;
2399 natperthread
= (mdatoms
->homenr
+ vsite
->nthreads
- 1)/vsite
->nthreads
;
2404 fprintf(debug
, "virtual site thread dist: natoms %d, range %d, natperthread %d\n", mdatoms
->nr
, vsite_atom_range
, natperthread
);
2407 /* To simplify the vsite assignment, we make an index which tells us
2408 * to which task particles, both non-vsites and vsites, are assigned.
2410 if (mdatoms
->nr
> vsite
->taskIndexNalloc
)
2412 vsite
->taskIndexNalloc
= over_alloc_large(mdatoms
->nr
);
2413 srenew(vsite
->taskIndex
, vsite
->taskIndexNalloc
);
2416 /* Initialize the task index array. Here we assign the non-vsite
2417 * particles to task=thread, so we easily figure out if vsites
2418 * depend on local and/or non-local particles in assignVsitesToThread.
2420 int *taskIndex
= vsite
->taskIndex
;
2423 for (int i
= 0; i
< mdatoms
->nr
; i
++)
2425 if (mdatoms
->ptype
[i
] == eptVSite
)
2427 /* vsites are not assigned to a task yet */
2432 /* assign non-vsite particles to task thread */
2433 taskIndex
[i
] = thread
;
2435 if (i
== (thread
+ 1)*natperthread
&& thread
< vsite
->nthreads
)
2442 #pragma omp parallel num_threads(vsite->nthreads)
2446 int thread
= gmx_omp_get_thread_num();
2447 VsiteThread
*tData
= vsite
->tData
[thread
];
2449 /* Clear the buffer use flags that were set before */
2450 if (tData
->useInterdependentTask
)
2452 InterdependentTask
*idTask
= &tData
->idTask
;
2454 /* To avoid an extra OpenMP barrier in spread_vsite_f,
2455 * we clear the force buffer at the next step,
2456 * so we need to do it here as well.
2458 clearTaskForceBufferUsedElements(idTask
);
2460 idTask
->vsite
.resize(0);
2461 for (int t
= 0; t
< vsite
->nthreads
; t
++)
2463 AtomIndex
*atomIndex
= &idTask
->atomIndex
[t
];
2464 int natom
= atomIndex
->atom
.size();
2465 for (int i
= 0; i
< natom
; i
++)
2467 idTask
->use
[atomIndex
->atom
[i
]] = false;
2469 atomIndex
->atom
.resize(0);
2474 /* To avoid large f_buf allocations of #threads*vsite_atom_range
2475 * we don't use task2 with more than 200000 atoms. This doesn't
2476 * affect performance, since with such a large range relatively few
2477 * vsites will end up in the separate task.
2478 * Note that useTask2 should be the same for all threads.
2480 tData
->useInterdependentTask
= (vsite_atom_range
<= 200000);
2481 if (tData
->useInterdependentTask
)
2483 size_t natoms_use_in_vsites
= vsite_atom_range
;
2484 InterdependentTask
*idTask
= &tData
->idTask
;
2485 /* To avoid resizing and re-clearing every nstlist steps,
2486 * we never down size the force buffer.
2488 if (natoms_use_in_vsites
> idTask
->force
.size() ||
2489 natoms_use_in_vsites
> idTask
->use
.size())
2491 idTask
->force
.resize(natoms_use_in_vsites
, { 0, 0, 0 });
2492 idTask
->use
.resize(natoms_use_in_vsites
, false);
2496 /* Assign all vsites that can execute independently on threads */
2497 tData
->rangeStart
= thread
*natperthread
;
2498 if (thread
< vsite
->nthreads
- 1)
2500 tData
->rangeEnd
= (thread
+ 1)*natperthread
;
2504 /* The last thread should cover up to the end of the range */
2505 tData
->rangeEnd
= mdatoms
->nr
;
2507 assignVsitesToThread(tData
,
2508 thread
, vsite
->nthreads
,
2511 ilist
, ip
, mdatoms
->ptype
);
2513 if (tData
->useInterdependentTask
)
2515 /* In the worst case, all tasks write to force ranges of
2516 * all other tasks, leading to #tasks^2 scaling (this is only
2517 * the overhead, the actual flops remain constant).
2518 * But in most cases there is far less coupling. To improve
2519 * scaling at high thread counts we therefore construct
2520 * an index to only loop over the actually affected tasks.
2522 InterdependentTask
*idTask
= &tData
->idTask
;
2524 /* Ensure assignVsitesToThread finished on other threads */
2527 idTask
->spreadTask
.resize(0);
2528 idTask
->reduceTask
.resize(0);
2529 for (int t
= 0; t
< vsite
->nthreads
; t
++)
2531 /* Do we write to the force buffer of task t? */
2532 if (idTask
->atomIndex
[t
].atom
.size() > 0)
2534 idTask
->spreadTask
.push_back(t
);
2536 /* Does task t write to our force buffer? */
2537 if (vsite
->tData
[t
]->idTask
.atomIndex
[thread
].atom
.size() > 0)
2539 idTask
->reduceTask
.push_back(t
);
2544 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
;
2546 /* Assign all remaining vsites, that will have taskIndex[]=2*vsite->nthreads,
2547 * to a single task that will not run in parallel with other tasks.
2549 assignVsitesToSingleTask(vsite
->tData
[vsite
->nthreads
],
2554 if (debug
&& vsite
->nthreads
> 1)
2556 fprintf(debug
, "virtual site useInterdependentTask %d, nuse:\n",
2557 vsite
->tData
[0]->useInterdependentTask
);
2558 for (int th
= 0; th
< vsite
->nthreads
+ 1; th
++)
2560 fprintf(debug
, " %4d", vsite
->tData
[th
]->idTask
.nuse
);
2562 fprintf(debug
, "\n");
2564 for (int ftype
= c_ftypeVsiteStart
; ftype
< c_ftypeVsiteEnd
; ftype
++)
2566 if (ilist
[ftype
].nr
> 0)
2568 fprintf(debug
, "%-20s thread dist:",
2569 interaction_function
[ftype
].longname
);
2570 for (int th
= 0; th
< vsite
->nthreads
+ 1; th
++)
2572 fprintf(debug
, " %4d %4d ",
2573 vsite
->tData
[th
]->ilist
[ftype
].nr
,
2574 vsite
->tData
[th
]->idTask
.ilist
[ftype
].nr
);
2576 fprintf(debug
, "\n");
2582 int nrOrig
= vsiteIlistNrCount(ilist
);
2584 for (int th
= 0; th
< vsite
->nthreads
+ 1; th
++)
2587 vsiteIlistNrCount(vsite
->tData
[th
]->ilist
) +
2588 vsiteIlistNrCount(vsite
->tData
[th
]->idTask
.ilist
);
2590 GMX_ASSERT(nrThreaded
== nrOrig
, "The number of virtual sites assigned to all thread task has to match the total number of virtual sites");
2594 void set_vsite_top(gmx_vsite_t
*vsite
,
2595 const gmx_localtop_t
*top
,
2596 const t_mdatoms
*md
)
2598 if (vsite
->n_intercg_vsite
> 0 && vsite
->bHaveChargeGroups
)
2600 vsite
->vsite_pbc_loc
= get_vsite_pbc(top
->idef
.iparams
,
2601 top
->idef
.il
, nullptr, md
,
2605 if (vsite
->nthreads
> 1)
2607 if (vsite
->bHaveChargeGroups
)
2609 gmx_fatal(FARGS
, "The combination of threading, virtual sites and charge groups is not implemented");
2612 split_vsites_over_threads(top
->idef
.il
, top
->idef
.iparams
,