2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
37 /* This file is completely threadsafe - keep it that way! */
49 #include "gromacs/domdec/domdec.h"
50 #include "gromacs/fileio/copyrite.h"
51 #include "gromacs/gmxlib/gmx_omp_nthreads.h"
52 #include "gromacs/legacyheaders/nrnb.h"
53 #include "gromacs/legacyheaders/types/commrec.h"
54 #include "gromacs/math/units.h"
55 #include "gromacs/math/vec.h"
56 #include "gromacs/mdlib/constr.h"
57 #include "gromacs/mdlib/mdrun.h"
58 #include "gromacs/pbcutil/pbc.h"
59 #include "gromacs/pbcutil/pbc-simd.h"
60 #include "gromacs/simd/simd.h"
61 #include "gromacs/simd/simd_math.h"
62 #include "gromacs/simd/vector_operations.h"
63 #include "gromacs/topology/block.h"
64 #include "gromacs/topology/mtop_util.h"
65 #include "gromacs/utility/bitmask.h"
66 #include "gromacs/utility/cstringutil.h"
67 #include "gromacs/utility/exceptions.h"
68 #include "gromacs/utility/fatalerror.h"
69 #include "gromacs/utility/gmxomp.h"
70 #include "gromacs/utility/smalloc.h"
72 /* MSVC 2010 produces buggy SIMD PBC code, disable SIMD for MSVC <= 2010 */
73 #if GMX_SIMD_HAVE_REAL && !(defined _MSC_VER && _MSC_VER < 1700) && !defined(__ICL)
78 #if GMX_SIMD_X86_AVX_256 || GMX_SIMD_X86_AVX2_256
80 // This was originally work-in-progress for augmenting the SIMD module with
81 // masked load/store operations. Instead, that turned into and extended SIMD
82 // interface that supports gather/scatter in all platforms, which will be
83 // part of a future Gromacs version. However, since the code for bonded
84 // interactions and LINCS was already written it would be a pity not to get
85 // the performance gains in Gromacs-5.1. For this reason we have added it as
86 // a bit of a hack in the two files that use it. It will be replaced with the
87 // new generic functionality after version 5.1
90 static gmx_inline
void gmx_simdcall
91 gmx_hack_simd_transpose4_r(gmx_simd_double_t
*row0
,
92 gmx_simd_double_t
*row1
,
93 gmx_simd_double_t
*row2
,
94 gmx_simd_double_t
*row3
)
96 __m256d tmp0
, tmp1
, tmp2
, tmp3
;
98 tmp0
= _mm256_unpacklo_pd(*row0
, *row1
);
99 tmp2
= _mm256_unpacklo_pd(*row2
, *row3
);
100 tmp1
= _mm256_unpackhi_pd(*row0
, *row1
);
101 tmp3
= _mm256_unpackhi_pd(*row2
, *row3
);
102 *row0
= _mm256_permute2f128_pd(tmp0
, tmp2
, 0x20);
103 *row1
= _mm256_permute2f128_pd(tmp1
, tmp3
, 0x20);
104 *row2
= _mm256_permute2f128_pd(tmp0
, tmp2
, 0x31);
105 *row3
= _mm256_permute2f128_pd(tmp1
, tmp3
, 0x31);
108 static gmx_inline
void gmx_simdcall
109 gmx_hack_simd4_transpose_to_simd_r(const gmx_simd4_double_t
*a
,
110 gmx_simd_double_t
*row0
,
111 gmx_simd_double_t
*row1
,
112 gmx_simd_double_t
*row2
,
113 gmx_simd_double_t
*row3
)
120 gmx_hack_simd_transpose4_r(row0
, row1
, row2
, row3
);
123 static gmx_inline
void gmx_simdcall
124 gmx_hack_simd_transpose_to_simd4_r(gmx_simd_double_t row0
,
125 gmx_simd_double_t row1
,
126 gmx_simd_double_t row2
,
127 gmx_simd_double_t row3
,
128 gmx_simd4_double_t
*a
)
135 gmx_hack_simd_transpose4_r(&a
[0], &a
[1], &a
[2], &a
[3]);
139 # if GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG
140 # define gmx_hack_simd4_load3_r(mem) _mm256_maskload_pd((mem), _mm_castsi128_ps(_mm256_set_epi32(0, 0, -1, -1, -1, -1, -1, -1)))
141 # define gmx_hack_simd4_store3_r(mem, x) _mm256_maskstore_pd((mem), _mm_castsi128_ps(_mm256_set_epi32(0, 0, -1, -1, -1, -1, -1, -1)), (x))
143 # define gmx_hack_simd4_load3_r(mem) _mm256_maskload_pd((mem), _mm256_set_epi32(0, 0, -1, -1, -1, -1, -1, -1))
144 # define gmx_hack_simd4_store3_r(mem, x) _mm256_maskstore_pd((mem), _mm256_set_epi32(0, 0, -1, -1, -1, -1, -1, -1), (x))
147 # else /* single instead of double */
148 static gmx_inline
void gmx_simdcall
149 gmx_hack_simd_transpose4_r(gmx_simd_float_t
*row0
,
150 gmx_simd_float_t
*row1
,
151 gmx_simd_float_t
*row2
,
152 gmx_simd_float_t
*row3
)
154 __m256 tmp0
, tmp1
, tmp2
, tmp3
;
156 tmp0
= _mm256_unpacklo_ps(*row0
, *row1
);
157 tmp2
= _mm256_unpacklo_ps(*row2
, *row3
);
158 tmp1
= _mm256_unpackhi_ps(*row0
, *row1
);
159 tmp3
= _mm256_unpackhi_ps(*row2
, *row3
);
160 *row0
= _mm256_shuffle_ps(tmp0
, tmp2
, 0x44);
161 *row1
= _mm256_shuffle_ps(tmp0
, tmp2
, 0xEE);
162 *row2
= _mm256_shuffle_ps(tmp1
, tmp3
, 0x44);
163 *row3
= _mm256_shuffle_ps(tmp1
, tmp3
, 0xEE);
166 static gmx_inline
void gmx_simdcall
167 gmx_hack_simd4_transpose_to_simd_r(const gmx_simd4_float_t
*a
,
168 gmx_simd_float_t
*row0
,
169 gmx_simd_float_t
*row1
,
170 gmx_simd_float_t
*row2
,
171 gmx_simd_float_t
*row3
)
173 *row0
= _mm256_insertf128_ps(_mm256_castps128_ps256(a
[0]), a
[4], 1);
174 *row1
= _mm256_insertf128_ps(_mm256_castps128_ps256(a
[1]), a
[5], 1);
175 *row2
= _mm256_insertf128_ps(_mm256_castps128_ps256(a
[2]), a
[6], 1);
176 *row3
= _mm256_insertf128_ps(_mm256_castps128_ps256(a
[3]), a
[7], 1);
178 gmx_hack_simd_transpose4_r(row0
, row1
, row2
, row3
);
181 static gmx_inline
void gmx_simdcall
182 gmx_hack_simd_transpose_to_simd4_r(gmx_simd_float_t row0
,
183 gmx_simd_float_t row1
,
184 gmx_simd_float_t row2
,
185 gmx_simd_float_t row3
,
186 gmx_simd4_float_t
*a
)
188 gmx_hack_simd_transpose4_r(&row0
, &row1
, &row2
, &row3
);
190 a
[0] = _mm256_extractf128_ps(row0
, 0);
191 a
[1] = _mm256_extractf128_ps(row1
, 0);
192 a
[2] = _mm256_extractf128_ps(row2
, 0);
193 a
[3] = _mm256_extractf128_ps(row3
, 0);
194 a
[4] = _mm256_extractf128_ps(row0
, 1);
195 a
[5] = _mm256_extractf128_ps(row1
, 1);
196 a
[6] = _mm256_extractf128_ps(row2
, 1);
197 a
[7] = _mm256_extractf128_ps(row3
, 1);
199 #if GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG
200 # define gmx_hack_simd4_load3_r(mem) _mm_maskload_ps((mem), _mm_castsi256_pd(_mm_set_epi32(0, -1, -1, -1)))
201 # define gmx_hack_simd4_store3_r(mem, x) _mm_maskstore_ps((mem), _mm_castsi256_pd(_mm_set_epi32(0, -1, -1, -1)), (x))
203 # define gmx_hack_simd4_load3_r(mem) _mm_maskload_ps((mem), _mm_set_epi32(0, -1, -1, -1))
204 # define gmx_hack_simd4_store3_r(mem, x) _mm_maskstore_ps((mem), _mm_set_epi32(0, -1, -1, -1), (x))
211 #if GMX_SIMD_HAVE_REAL
212 /*! \brief Store differences between indexed rvecs in SIMD registers.
214 * Returns SIMD register with the difference vectors:
215 * v[pair_index[i*2]] - v[pair_index[i*2 + 1]]
217 * \param[in] v Array of rvecs
218 * \param[in] pair_index Index pairs for GMX_SIMD_REAL_WIDTH vector pairs
219 * \param[in,out] buf Aligned tmp buffer of size 3*GMX_SIMD_REAL_WIDTH
220 * \param[out] dx SIMD register with x difference
221 * \param[out] dy SIMD register with y difference
222 * \param[out] dz SIMD register with z difference
224 static gmx_inline
void gmx_simdcall
225 gmx_hack_simd_gather_rvec_dist_pair_index(const rvec
*v
,
226 const int *pair_index
,
227 real gmx_unused
*buf
,
232 #if GMX_SIMD_X86_AVX_256 || GMX_SIMD_X86_AVX2_256
234 gmx_simd4_real_t d
[GMX_SIMD_REAL_WIDTH
];
237 for (i
= 0; i
< GMX_SIMD_REAL_WIDTH
; i
++)
239 d
[i
] = gmx_simd4_sub_r(gmx_hack_simd4_load3_r(&(v
[pair_index
[i
*2 + 0]][0])),
240 gmx_hack_simd4_load3_r(&(v
[pair_index
[i
*2 + 1]][0])));
243 gmx_hack_simd4_transpose_to_simd_r(d
, dx
, dy
, dz
, &tmp
);
246 GMX_ALIGNED(real
, GMX_SIMD_REAL_WIDTH
) buf_aligned
[3*GMX_SIMD_REAL_WIDTH
];
248 real
* buf_aligned
= buf
;
253 for (i
= 0; i
< GMX_SIMD_REAL_WIDTH
; i
++)
255 /* Store the distances packed and aligned */
256 for (m
= 0; m
< DIM
; m
++)
258 buf_aligned
[m
*GMX_SIMD_REAL_WIDTH
+ i
] =
259 v
[pair_index
[i
*2]][m
] - v
[pair_index
[i
*2 + 1]][m
];
262 *dx
= gmx_simd_load_r(buf_aligned
+ 0*GMX_SIMD_REAL_WIDTH
);
263 *dy
= gmx_simd_load_r(buf_aligned
+ 1*GMX_SIMD_REAL_WIDTH
);
264 *dz
= gmx_simd_load_r(buf_aligned
+ 2*GMX_SIMD_REAL_WIDTH
);
269 /*! \brief Stores SIMD vector into multiple rvecs.
271 * \param[in] x SIMD register with x-components of the vectors
272 * \param[in] y SIMD register with y-components of the vectors
273 * \param[in] z SIMD register with z-components of the vectors
274 * \param[in,out] buf Aligned tmp buffer of size 3*GMX_SIMD_REAL_WIDTH
275 * \param[out] v Array of GMX_SIMD_REAL_WIDTH rvecs
277 static gmx_inline
void gmx_simdcall
278 gmx_simd_store_vec_to_rvec(gmx_simd_real_t x
,
281 real gmx_unused
*buf
,
284 #if GMX_SIMD_X86_AVX_256 || GMX_SIMD_X86_AVX2_256
286 gmx_simd4_real_t s4
[GMX_SIMD_REAL_WIDTH
];
287 gmx_simd_real_t zero
= gmx_simd_setzero_r();
289 gmx_hack_simd_transpose_to_simd4_r(x
, y
, z
, zero
, s4
);
291 for (i
= 0; i
< GMX_SIMD_REAL_WIDTH
; i
++)
293 gmx_hack_simd4_store3_r(v
[i
], s4
[i
]);
297 GMX_ALIGNED(real
, GMX_SIMD_REAL_WIDTH
) buf_aligned
[3*GMX_SIMD_REAL_WIDTH
];
299 real
* buf_aligned
= buf
;
304 gmx_simd_store_r(buf_aligned
+ 0*GMX_SIMD_REAL_WIDTH
, x
);
305 gmx_simd_store_r(buf_aligned
+ 1*GMX_SIMD_REAL_WIDTH
, y
);
306 gmx_simd_store_r(buf_aligned
+ 2*GMX_SIMD_REAL_WIDTH
, z
);
308 for (i
= 0; i
< GMX_SIMD_REAL_WIDTH
; i
++)
310 for (m
= 0; m
< DIM
; m
++)
312 v
[i
][m
] = buf_aligned
[m
*GMX_SIMD_REAL_WIDTH
+ i
];
317 #endif /* GMX_SIMD_HAVE_REAL */
321 int b0
; /* first constraint for this task */
322 int b1
; /* b1-1 is the last constraint for this task */
323 int ntriangle
; /* the number of constraints in triangles */
324 int *triangle
; /* the list of triangle constraints */
325 int *tri_bits
; /* the bits tell if the matrix element should be used */
326 int tri_alloc
; /* allocation size of triangle and tri_bits */
327 int nind
; /* number of indices */
328 int *ind
; /* constraint index for updating atom data */
329 int nind_r
; /* number of indices */
330 int *ind_r
; /* constraint index for updating atom data */
331 int ind_nalloc
; /* allocation size of ind and ind_r */
332 tensor vir_r_m_dr
; /* temporary variable for virial calculation */
333 real dhdlambda
; /* temporary variable for lambda derivative */
334 real
*simd_buf
; /* Aligned work array for SIMD */
337 typedef struct gmx_lincsdata
{
338 int ncg
; /* the global number of constraints */
339 int ncg_flex
; /* the global number of flexible constraints */
340 int ncg_triangle
; /* the global number of constraints in triangles */
341 int nIter
; /* the number of iterations */
342 int nOrder
; /* the order of the matrix expansion */
343 int max_connect
; /* the maximum number of constrains connected to a single atom */
345 int nc_real
; /* the number of real constraints */
346 int nc
; /* the number of constraints including padding for SIMD */
347 int nc_alloc
; /* the number we allocated memory for */
348 int ncc
; /* the number of constraint connections */
349 int ncc_alloc
; /* the number we allocated memory for */
350 real matlam
; /* the FE lambda value used for filling blc and blmf */
351 int *con_index
; /* mapping from topology to LINCS constraints */
352 real
*bllen0
; /* the reference distance in topology A */
353 real
*ddist
; /* the reference distance in top B - the r.d. in top A */
354 int *bla
; /* the atom pairs involved in the constraints */
355 real
*blc
; /* 1/sqrt(invmass1 + invmass2) */
356 real
*blc1
; /* as blc, but with all masses 1 */
357 int *blnr
; /* index into blbnb and blmf */
358 int *blbnb
; /* list of constraint connections */
359 int ntriangle
; /* the local number of constraints in triangles */
360 int ncc_triangle
; /* the number of constraint connections in triangles */
361 gmx_bool bCommIter
; /* communicate before each LINCS interation */
362 real
*blmf
; /* matrix of mass factors for constraint connections */
363 real
*blmf1
; /* as blmf, but with all masses 1 */
364 real
*bllen
; /* the reference bond length */
365 int *nlocat
; /* the local atom count per constraint, can be NULL */
367 int ntask
; /* The number of tasks = #threads for LINCS */
368 lincs_task_t
*task
; /* LINCS thread division */
369 gmx_bitmask_t
*atf
; /* atom flags for thread parallelization */
370 int atf_nalloc
; /* allocation size of atf */
371 gmx_bool bTaskDep
; /* are the LINCS tasks interdependent? */
372 /* arrays for temporary storage in the LINCS algorithm */
379 real
*mlambda
; /* the Lagrange multipliers * -1 */
380 /* storage for the constraint RMS relative deviation output */
384 /* Define simd_width for memory allocation used for SIMD code */
386 static const int simd_width
= GMX_SIMD_REAL_WIDTH
;
388 static const int simd_width
= 1;
390 /* Align to 128 bytes, consistent with the current implementation of
391 AlignedAllocator, which currently forces 128 byte alignment. */
392 static const int align_bytes
= 128;
394 real
*lincs_rmsd_data(struct gmx_lincsdata
*lincsd
)
396 return lincsd
->rmsd_data
;
399 real
lincs_rmsd(struct gmx_lincsdata
*lincsd
, gmx_bool bSD2
)
401 if (lincsd
->rmsd_data
[0] > 0)
403 return sqrt(lincsd
->rmsd_data
[bSD2
? 2 : 1]/lincsd
->rmsd_data
[0]);
411 /* Do a set of nrec LINCS matrix multiplications.
412 * This function will return with up to date thread-local
413 * constraint data, without an OpenMP barrier.
415 static void lincs_matrix_expand(const struct gmx_lincsdata
*lincsd
,
416 const lincs_task_t
*li_task
,
418 real
*rhs1
, real
*rhs2
, real
*sol
)
420 int b0
, b1
, nrec
, rec
;
421 const int *blnr
= lincsd
->blnr
;
422 const int *blbnb
= lincsd
->blbnb
;
426 nrec
= lincsd
->nOrder
;
428 for (rec
= 0; rec
< nrec
; rec
++)
432 if (lincsd
->bTaskDep
)
436 for (b
= b0
; b
< b1
; b
++)
442 for (n
= blnr
[b
]; n
< blnr
[b
+1]; n
++)
444 mvb
= mvb
+ blcc
[n
]*rhs1
[blbnb
[n
]];
447 sol
[b
] = sol
[b
] + mvb
;
455 } /* nrec*(ncons+2*nrtot) flops */
457 if (lincsd
->ntriangle
> 0)
459 /* Perform an extra nrec recursions for only the constraints
460 * involved in rigid triangles.
461 * In this way their accuracy should come close to those of the other
462 * constraints, since traingles of constraints can produce eigenvalues
463 * around 0.7, while the effective eigenvalue for bond constraints
464 * is around 0.4 (and 0.7*0.7=0.5).
467 if (lincsd
->bTaskDep
)
469 /* We need a barrier here, since other threads might still be
470 * reading the contents of rhs1 and/o rhs2.
471 * We could avoid this barrier by introducing two extra rhs
472 * arrays for the triangle constraints only.
477 /* Constraints involved in a triangle are ensured to be in the same
478 * LINCS task. This means no barriers are required during the extra
479 * iterations for the triangle constraints.
481 const int *triangle
= li_task
->triangle
;
482 const int *tri_bits
= li_task
->tri_bits
;
484 for (rec
= 0; rec
< nrec
; rec
++)
488 for (tb
= 0; tb
< li_task
->ntriangle
; tb
++)
490 int b
, bits
, nr0
, nr1
, n
;
498 for (n
= nr0
; n
< nr1
; n
++)
500 if (bits
& (1 << (n
- nr0
)))
502 mvb
= mvb
+ blcc
[n
]*rhs1
[blbnb
[n
]];
506 sol
[b
] = sol
[b
] + mvb
;
514 } /* nrec*(ntriangle + ncc_triangle*2) flops */
518 static void lincs_update_atoms_noind(int ncons
, const int *bla
,
520 const real
*fac
, rvec
*r
,
525 real mvb
, im1
, im2
, tmp0
, tmp1
, tmp2
;
529 for (b
= 0; b
< ncons
; b
++)
545 } /* 16 ncons flops */
549 for (b
= 0; b
< ncons
; b
++)
567 static void lincs_update_atoms_ind(int ncons
, const int *ind
, const int *bla
,
569 const real
*fac
, rvec
*r
,
574 real mvb
, im1
, im2
, tmp0
, tmp1
, tmp2
;
578 for (bi
= 0; bi
< ncons
; bi
++)
595 } /* 16 ncons flops */
599 for (bi
= 0; bi
< ncons
; bi
++)
614 } /* 16 ncons flops */
618 static void lincs_update_atoms(struct gmx_lincsdata
*li
, int th
,
620 const real
*fac
, rvec
*r
,
626 /* Single thread, we simply update for all constraints */
627 lincs_update_atoms_noind(li
->nc_real
,
628 li
->bla
, prefac
, fac
, r
, invmass
, x
);
632 /* Update the atom vector components for our thread local
633 * constraints that only access our local atom range.
634 * This can be done without a barrier.
636 lincs_update_atoms_ind(li
->task
[th
].nind
, li
->task
[th
].ind
,
637 li
->bla
, prefac
, fac
, r
, invmass
, x
);
639 if (li
->task
[li
->ntask
].nind
> 0)
641 /* Update the constraints that operate on atoms
642 * in multiple thread atom blocks on the master thread.
647 lincs_update_atoms_ind(li
->task
[li
->ntask
].nind
,
648 li
->task
[li
->ntask
].ind
,
649 li
->bla
, prefac
, fac
, r
, invmass
, x
);
656 /* Calculate the constraint distance vectors r to project on from x.
657 * Determine the right-hand side of the matrix equation using quantity f.
658 * This function only differs from calc_dr_x_xp_simd below in that
659 * no constraint length is subtracted and no PBC is used for f.
661 static void gmx_simdcall
662 calc_dr_x_f_simd(int b0
,
665 const rvec
* gmx_restrict x
,
666 const rvec
* gmx_restrict f
,
667 const real
* gmx_restrict blc
,
668 const pbc_simd_t
* pbc_simd
,
669 real
* gmx_restrict vbuf1
,
670 real
* gmx_restrict vbuf2
,
671 rvec
* gmx_restrict r
,
672 real
* gmx_restrict rhs
,
673 real
* gmx_restrict sol
)
677 assert(b0
% GMX_SIMD_REAL_WIDTH
== 0);
679 for (bs
= b0
; bs
< b1
; bs
+= GMX_SIMD_REAL_WIDTH
)
681 gmx_simd_real_t rx_S
, ry_S
, rz_S
, n2_S
, il_S
;
682 gmx_simd_real_t fx_S
, fy_S
, fz_S
, ip_S
, rhs_S
;
684 gmx_hack_simd_gather_rvec_dist_pair_index(x
, bla
+ bs
*2, vbuf1
,
685 &rx_S
, &ry_S
, &rz_S
);
687 pbc_correct_dx_simd(&rx_S
, &ry_S
, &rz_S
, pbc_simd
);
689 n2_S
= gmx_simd_norm2_r(rx_S
, ry_S
, rz_S
);
690 il_S
= gmx_simd_invsqrt_r(n2_S
);
692 rx_S
= gmx_simd_mul_r(rx_S
, il_S
);
693 ry_S
= gmx_simd_mul_r(ry_S
, il_S
);
694 rz_S
= gmx_simd_mul_r(rz_S
, il_S
);
696 gmx_simd_store_vec_to_rvec(rx_S
, ry_S
, rz_S
, vbuf1
, r
+ bs
);
698 gmx_hack_simd_gather_rvec_dist_pair_index(f
, bla
+ bs
*2, vbuf2
,
699 &fx_S
, &fy_S
, &fz_S
);
701 ip_S
= gmx_simd_iprod_r(rx_S
, ry_S
, rz_S
,
704 rhs_S
= gmx_simd_mul_r(gmx_simd_load_r(blc
+ bs
), ip_S
);
706 gmx_simd_store_r(rhs
+ bs
, rhs_S
);
707 gmx_simd_store_r(sol
+ bs
, rhs_S
);
710 #endif /* LINCS_SIMD */
712 /* LINCS projection, works on derivatives of the coordinates */
713 static void do_lincsp(rvec
*x
, rvec
*f
, rvec
*fp
, t_pbc
*pbc
,
714 struct gmx_lincsdata
*lincsd
, int th
,
716 int econq
, gmx_bool bCalcDHDL
,
717 gmx_bool bCalcVir
, tensor rmdf
)
720 int *bla
, *blnr
, *blbnb
;
722 real
*blc
, *blmf
, *blcc
, *rhs1
, *rhs2
, *sol
;
724 b0
= lincsd
->task
[th
].b0
;
725 b1
= lincsd
->task
[th
].b1
;
730 blbnb
= lincsd
->blbnb
;
731 if (econq
!= econqForce
)
733 /* Use mass-weighted parameters */
739 /* Use non mass-weighted parameters */
741 blmf
= lincsd
->blmf1
;
743 blcc
= lincsd
->tmpncc
;
750 /* This SIMD code does the same as the plain-C code after the #else.
751 * The only difference is that we always call pbc code, as with SIMD
752 * the overhead of pbc computation (when not needed) is small.
756 /* Convert the pbc struct for SIMD */
757 set_pbc_simd(pbc
, &pbc_simd
);
759 /* Compute normalized x i-j vectors, store in r.
760 * Compute the inner product of r and xp i-j and store in rhs1.
762 calc_dr_x_f_simd(b0
, b1
, bla
, x
, f
, blc
,
764 lincsd
->task
[th
].simd_buf
,
765 lincsd
->task
[th
].simd_buf
+ GMX_SIMD_REAL_WIDTH
*DIM
,
768 #else /* LINCS_SIMD */
770 /* Compute normalized i-j vectors */
773 for (b
= b0
; b
< b1
; b
++)
777 pbc_dx_aiuc(pbc
, x
[bla
[2*b
]], x
[bla
[2*b
+1]], dx
);
783 for (b
= b0
; b
< b1
; b
++)
787 rvec_sub(x
[bla
[2*b
]], x
[bla
[2*b
+1]], dx
);
789 } /* 16 ncons flops */
792 for (b
= b0
; b
< b1
; b
++)
799 mvb
= blc
[b
]*(r
[b
][0]*(f
[i
][0] - f
[j
][0]) +
800 r
[b
][1]*(f
[i
][1] - f
[j
][1]) +
801 r
[b
][2]*(f
[i
][2] - f
[j
][2]));
807 #endif /* LINCS_SIMD */
809 if (lincsd
->bTaskDep
)
811 /* We need a barrier, since the matrix construction below
812 * can access entries in r of other threads.
817 /* Construct the (sparse) LINCS matrix */
818 for (b
= b0
; b
< b1
; b
++)
822 for (n
= blnr
[b
]; n
< blnr
[b
+1]; n
++)
824 blcc
[n
] = blmf
[n
]*iprod(r
[b
], r
[blbnb
[n
]]);
827 /* Together: 23*ncons + 6*nrtot flops */
829 lincs_matrix_expand(lincsd
, &lincsd
->task
[th
], blcc
, rhs1
, rhs2
, sol
);
830 /* nrec*(ncons+2*nrtot) flops */
832 if (econq
== econqDeriv_FlexCon
)
834 /* We only want to constraint the flexible constraints,
835 * so we mask out the normal ones by setting sol to 0.
837 for (b
= b0
; b
< b1
; b
++)
839 if (!(lincsd
->bllen0
[b
] == 0 && lincsd
->ddist
[b
] == 0))
846 /* We multiply sol by blc, so we can use lincs_update_atoms for OpenMP */
847 for (b
= b0
; b
< b1
; b
++)
852 /* When constraining forces, we should not use mass weighting,
853 * so we pass invmass=NULL, which results in the use of 1 for all atoms.
855 lincs_update_atoms(lincsd
, th
, 1.0, sol
, r
,
856 (econq
!= econqForce
) ? invmass
: NULL
, fp
);
863 for (b
= b0
; b
< b1
; b
++)
865 dhdlambda
-= sol
[b
]*lincsd
->ddist
[b
];
868 lincsd
->task
[th
].dhdlambda
= dhdlambda
;
873 /* Constraint virial,
874 * determines sum r_bond x delta f,
875 * where delta f is the constraint correction
876 * of the quantity that is being constrained.
878 for (b
= b0
; b
< b1
; b
++)
883 mvb
= lincsd
->bllen
[b
]*sol
[b
];
884 for (i
= 0; i
< DIM
; i
++)
887 for (j
= 0; j
< DIM
; j
++)
889 rmdf
[i
][j
] += tmp1
*r
[b
][j
];
892 } /* 23 ncons flops */
897 /* Calculate the constraint distance vectors r to project on from x.
898 * Determine the right-hand side of the matrix equation using coordinates xp.
900 static void gmx_simdcall
901 calc_dr_x_xp_simd(int b0
,
904 const rvec
* gmx_restrict x
,
905 const rvec
* gmx_restrict xp
,
906 const real
* gmx_restrict bllen
,
907 const real
* gmx_restrict blc
,
908 const pbc_simd_t
* pbc_simd
,
909 real
* gmx_restrict vbuf1
,
910 real
* gmx_restrict vbuf2
,
911 rvec
* gmx_restrict r
,
912 real
* gmx_restrict rhs
,
913 real
* gmx_restrict sol
)
917 assert(b0
% GMX_SIMD_REAL_WIDTH
== 0);
919 for (bs
= b0
; bs
< b1
; bs
+= GMX_SIMD_REAL_WIDTH
)
921 gmx_simd_real_t rx_S
, ry_S
, rz_S
, n2_S
, il_S
;
922 gmx_simd_real_t rxp_S
, ryp_S
, rzp_S
, ip_S
, rhs_S
;
924 gmx_hack_simd_gather_rvec_dist_pair_index(x
, bla
+ bs
*2, vbuf1
,
925 &rx_S
, &ry_S
, &rz_S
);
927 pbc_correct_dx_simd(&rx_S
, &ry_S
, &rz_S
, pbc_simd
);
929 n2_S
= gmx_simd_norm2_r(rx_S
, ry_S
, rz_S
);
930 il_S
= gmx_simd_invsqrt_r(n2_S
);
932 rx_S
= gmx_simd_mul_r(rx_S
, il_S
);
933 ry_S
= gmx_simd_mul_r(ry_S
, il_S
);
934 rz_S
= gmx_simd_mul_r(rz_S
, il_S
);
936 gmx_simd_store_vec_to_rvec(rx_S
, ry_S
, rz_S
, vbuf1
, r
+ bs
);
938 gmx_hack_simd_gather_rvec_dist_pair_index(xp
, bla
+ bs
*2, vbuf2
,
939 &rxp_S
, &ryp_S
, &rzp_S
);
941 pbc_correct_dx_simd(&rxp_S
, &ryp_S
, &rzp_S
, pbc_simd
);
943 ip_S
= gmx_simd_iprod_r(rx_S
, ry_S
, rz_S
,
944 rxp_S
, ryp_S
, rzp_S
);
946 rhs_S
= gmx_simd_mul_r(gmx_simd_load_r(blc
+ bs
),
947 gmx_simd_sub_r(ip_S
, gmx_simd_load_r(bllen
+ bs
)));
949 gmx_simd_store_r(rhs
+ bs
, rhs_S
);
950 gmx_simd_store_r(sol
+ bs
, rhs_S
);
953 #endif /* LINCS_SIMD */
955 /* Determine the distances and right-hand side for the next iteration */
956 static void calc_dist_iter(int b0
,
959 const rvec
* gmx_restrict xp
,
960 const real
* gmx_restrict bllen
,
961 const real
* gmx_restrict blc
,
964 real
* gmx_restrict rhs
,
965 real
* gmx_restrict sol
,
970 for (b
= b0
; b
< b1
; b
++)
972 real len
, len2
, dlen2
, mvb
;
978 pbc_dx_aiuc(pbc
, xp
[bla
[2*b
]], xp
[bla
[2*b
+1]], dx
);
982 rvec_sub(xp
[bla
[2*b
]], xp
[bla
[2*b
+1]], dx
);
985 dlen2
= 2*len2
- norm2(dx
);
986 if (dlen2
< wfac
*len2
)
988 /* not race free - see detailed comment in caller */
993 mvb
= blc
[b
]*(len
- dlen2
*gmx_invsqrt(dlen2
));
1001 } /* 20*ncons flops */
1005 /* As the function above, but using SIMD intrinsics */
1006 static void gmx_simdcall
1007 calc_dist_iter_simd(int b0
,
1010 const rvec
* gmx_restrict x
,
1011 const real
* gmx_restrict bllen
,
1012 const real
* gmx_restrict blc
,
1013 const pbc_simd_t
* pbc_simd
,
1015 real
* gmx_restrict vbuf
,
1016 real
* gmx_restrict rhs
,
1017 real
* gmx_restrict sol
,
1020 gmx_simd_real_t min_S
= gmx_simd_set1_r(GMX_REAL_MIN
);
1021 gmx_simd_real_t two_S
= gmx_simd_set1_r(2.0);
1022 gmx_simd_real_t wfac_S
= gmx_simd_set1_r(wfac
);
1023 gmx_simd_bool_t warn_S
;
1027 assert(b0
% GMX_SIMD_REAL_WIDTH
== 0);
1029 /* Initialize all to FALSE */
1030 warn_S
= gmx_simd_cmplt_r(two_S
, gmx_simd_setzero_r());
1032 for (bs
= b0
; bs
< b1
; bs
+= GMX_SIMD_REAL_WIDTH
)
1034 gmx_simd_real_t rx_S
, ry_S
, rz_S
, n2_S
;
1035 gmx_simd_real_t len_S
, len2_S
, dlen2_S
, lc_S
, blc_S
;
1037 gmx_hack_simd_gather_rvec_dist_pair_index(x
, bla
+ bs
*2, vbuf
,
1038 &rx_S
, &ry_S
, &rz_S
);
1040 pbc_correct_dx_simd(&rx_S
, &ry_S
, &rz_S
, pbc_simd
);
1042 n2_S
= gmx_simd_norm2_r(rx_S
, ry_S
, rz_S
);
1044 len_S
= gmx_simd_load_r(bllen
+ bs
);
1045 len2_S
= gmx_simd_mul_r(len_S
, len_S
);
1047 dlen2_S
= gmx_simd_fmsub_r(two_S
, len2_S
, n2_S
);
1049 warn_S
= gmx_simd_or_b(warn_S
,
1050 gmx_simd_cmplt_r(dlen2_S
,
1051 gmx_simd_mul_r(wfac_S
, len2_S
)));
1053 /* Avoid 1/0 by taking the max with REAL_MIN.
1054 * Note: when dlen2 is close to zero (90 degree constraint rotation),
1055 * the accuracy of the algorithm is no longer relevant.
1057 dlen2_S
= gmx_simd_max_r(dlen2_S
, min_S
);
1059 lc_S
= gmx_simd_fnmadd_r(dlen2_S
, gmx_simd_invsqrt_r(dlen2_S
), len_S
);
1061 blc_S
= gmx_simd_load_r(blc
+ bs
);
1063 lc_S
= gmx_simd_mul_r(blc_S
, lc_S
);
1065 gmx_simd_store_r(rhs
+ bs
, lc_S
);
1066 gmx_simd_store_r(sol
+ bs
, lc_S
);
1069 if (gmx_simd_anytrue_b(warn_S
))
1074 #endif /* LINCS_SIMD */
1076 static void do_lincs(rvec
*x
, rvec
*xp
, matrix box
, t_pbc
*pbc
,
1077 struct gmx_lincsdata
*lincsd
, int th
,
1078 const real
*invmass
,
1081 real wangle
, gmx_bool
*bWarn
,
1082 real invdt
, rvec
* gmx_restrict v
,
1083 gmx_bool bCalcVir
, tensor vir_r_m_dr
)
1085 int b0
, b1
, b
, i
, j
, n
, iter
;
1086 int *bla
, *blnr
, *blbnb
;
1088 real
*blc
, *blmf
, *bllen
, *blcc
, *rhs1
, *rhs2
, *sol
, *blc_sol
, *mlambda
;
1091 b0
= lincsd
->task
[th
].b0
;
1092 b1
= lincsd
->task
[th
].b1
;
1096 blnr
= lincsd
->blnr
;
1097 blbnb
= lincsd
->blbnb
;
1099 blmf
= lincsd
->blmf
;
1100 bllen
= lincsd
->bllen
;
1101 blcc
= lincsd
->tmpncc
;
1102 rhs1
= lincsd
->tmp1
;
1103 rhs2
= lincsd
->tmp2
;
1105 blc_sol
= lincsd
->tmp4
;
1106 mlambda
= lincsd
->mlambda
;
1107 nlocat
= lincsd
->nlocat
;
1111 /* This SIMD code does the same as the plain-C code after the #else.
1112 * The only difference is that we always call pbc code, as with SIMD
1113 * the overhead of pbc computation (when not needed) is small.
1115 pbc_simd_t pbc_simd
;
1117 /* Convert the pbc struct for SIMD */
1118 set_pbc_simd(pbc
, &pbc_simd
);
1120 /* Compute normalized x i-j vectors, store in r.
1121 * Compute the inner product of r and xp i-j and store in rhs1.
1123 calc_dr_x_xp_simd(b0
, b1
, bla
, x
, xp
, bllen
, blc
,
1125 lincsd
->task
[th
].simd_buf
,
1126 lincsd
->task
[th
].simd_buf
+ GMX_SIMD_REAL_WIDTH
*DIM
,
1129 #else /* LINCS_SIMD */
1133 /* Compute normalized i-j vectors */
1134 for (b
= b0
; b
< b1
; b
++)
1139 pbc_dx_aiuc(pbc
, x
[bla
[2*b
]], x
[bla
[2*b
+1]], dx
);
1142 pbc_dx_aiuc(pbc
, xp
[bla
[2*b
]], xp
[bla
[2*b
+1]], dx
);
1143 mvb
= blc
[b
]*(iprod(r
[b
], dx
) - bllen
[b
]);
1150 /* Compute normalized i-j vectors */
1151 for (b
= b0
; b
< b1
; b
++)
1153 real tmp0
, tmp1
, tmp2
, rlen
, mvb
;
1157 tmp0
= x
[i
][0] - x
[j
][0];
1158 tmp1
= x
[i
][1] - x
[j
][1];
1159 tmp2
= x
[i
][2] - x
[j
][2];
1160 rlen
= gmx_invsqrt(tmp0
*tmp0
+ tmp1
*tmp1
+ tmp2
*tmp2
);
1161 r
[b
][0] = rlen
*tmp0
;
1162 r
[b
][1] = rlen
*tmp1
;
1163 r
[b
][2] = rlen
*tmp2
;
1164 /* 16 ncons flops */
1168 mvb
= blc
[b
]*(r
[b
][0]*(xp
[i
][0] - xp
[j
][0]) +
1169 r
[b
][1]*(xp
[i
][1] - xp
[j
][1]) +
1170 r
[b
][2]*(xp
[i
][2] - xp
[j
][2]) - bllen
[b
]);
1175 /* Together: 26*ncons + 6*nrtot flops */
1178 #endif /* LINCS_SIMD */
1180 if (lincsd
->bTaskDep
)
1182 /* We need a barrier, since the matrix construction below
1183 * can access entries in r of other threads.
1188 /* Construct the (sparse) LINCS matrix */
1189 for (b
= b0
; b
< b1
; b
++)
1191 for (n
= blnr
[b
]; n
< blnr
[b
+1]; n
++)
1193 blcc
[n
] = blmf
[n
]*iprod(r
[b
], r
[blbnb
[n
]]);
1196 /* Together: 26*ncons + 6*nrtot flops */
1198 lincs_matrix_expand(lincsd
, &lincsd
->task
[th
], blcc
, rhs1
, rhs2
, sol
);
1199 /* nrec*(ncons+2*nrtot) flops */
1202 for (b
= b0
; b
< b1
; b
++)
1204 mlambda
[b
] = blc
[b
]*sol
[b
];
1207 for (b
= b0
; b
< b1
; b
+= GMX_SIMD_REAL_WIDTH
)
1209 gmx_simd_store_r(mlambda
+ b
,
1210 gmx_simd_mul_r(gmx_simd_load_r(blc
+ b
),
1211 gmx_simd_load_r(sol
+ b
)));
1215 /* Update the coordinates */
1216 lincs_update_atoms(lincsd
, th
, 1.0, mlambda
, r
, invmass
, xp
);
1219 ******** Correction for centripetal effects ********
1224 wfac
= cos(DEG2RAD
*wangle
);
1227 for (iter
= 0; iter
< lincsd
->nIter
; iter
++)
1229 if ((lincsd
->bCommIter
&& DOMAINDECOMP(cr
) && cr
->dd
->constraints
))
1234 /* Communicate the corrected non-local coordinates */
1235 if (DOMAINDECOMP(cr
))
1237 dd_move_x_constraints(cr
->dd
, box
, xp
, NULL
, FALSE
);
1242 else if (lincsd
->bTaskDep
)
1248 calc_dist_iter_simd(b0
, b1
, bla
, xp
, bllen
, blc
, &pbc_simd
, wfac
,
1249 lincsd
->task
[th
].simd_buf
, rhs1
, sol
, bWarn
);
1251 calc_dist_iter(b0
, b1
, bla
, xp
, bllen
, blc
, pbc
, wfac
,
1253 /* 20*ncons flops */
1256 lincs_matrix_expand(lincsd
, &lincsd
->task
[th
], blcc
, rhs1
, rhs2
, sol
);
1257 /* nrec*(ncons+2*nrtot) flops */
1260 for (b
= b0
; b
< b1
; b
++)
1264 mvb
= blc
[b
]*sol
[b
];
1269 for (b
= b0
; b
< b1
; b
+= GMX_SIMD_REAL_WIDTH
)
1271 gmx_simd_real_t mvb
;
1273 mvb
= gmx_simd_mul_r(gmx_simd_load_r(blc
+ b
),
1274 gmx_simd_load_r(sol
+ b
));
1275 gmx_simd_store_r(blc_sol
+ b
, mvb
);
1276 gmx_simd_store_r(mlambda
+ b
,
1277 gmx_simd_add_r(gmx_simd_load_r(mlambda
+ b
), mvb
));
1281 /* Update the coordinates */
1282 lincs_update_atoms(lincsd
, th
, 1.0, blc_sol
, r
, invmass
, xp
);
1284 /* nit*ncons*(37+9*nrec) flops */
1288 /* Update the velocities */
1289 lincs_update_atoms(lincsd
, th
, invdt
, mlambda
, r
, invmass
, v
);
1290 /* 16 ncons flops */
1293 if (nlocat
!= NULL
&& (bCalcDHDL
|| bCalcVir
))
1295 if (lincsd
->bTaskDep
)
1297 /* In lincs_update_atoms threads might cross-read mlambda */
1301 /* Only account for local atoms */
1302 for (b
= b0
; b
< b1
; b
++)
1304 mlambda
[b
] *= 0.5*nlocat
[b
];
1313 for (b
= b0
; b
< b1
; b
++)
1315 /* Note that this this is dhdl*dt^2, the dt^2 factor is corrected
1316 * later after the contributions are reduced over the threads.
1318 dhdl
-= lincsd
->mlambda
[b
]*lincsd
->ddist
[b
];
1320 lincsd
->task
[th
].dhdlambda
= dhdl
;
1325 /* Constraint virial */
1326 for (b
= b0
; b
< b1
; b
++)
1330 tmp0
= -bllen
[b
]*mlambda
[b
];
1331 for (i
= 0; i
< DIM
; i
++)
1333 tmp1
= tmp0
*r
[b
][i
];
1334 for (j
= 0; j
< DIM
; j
++)
1336 vir_r_m_dr
[i
][j
] -= tmp1
*r
[b
][j
];
1339 } /* 22 ncons flops */
1343 * 26*ncons + 6*nrtot + nrec*(ncons+2*nrtot)
1344 * + nit * (20*ncons + nrec*(ncons+2*nrtot) + 17 ncons)
1346 * (26+nrec)*ncons + (6+2*nrec)*nrtot
1347 * + nit * ((37+nrec)*ncons + 2*nrec*nrtot)
1349 * (63+nrec)*ncons + (6+4*nrec)*nrtot
1353 /* Sets the elements in the LINCS matrix for task li_task */
1354 static void set_lincs_matrix_task(struct gmx_lincsdata
*li
,
1355 lincs_task_t
*li_task
,
1356 const real
*invmass
,
1361 /* Construct the coupling coefficient matrix blmf */
1362 li_task
->ntriangle
= 0;
1364 for (i
= li_task
->b0
; i
< li_task
->b1
; i
++)
1369 a2
= li
->bla
[2*i
+1];
1370 for (n
= li
->blnr
[i
]; (n
< li
->blnr
[i
+1]); n
++)
1372 int k
, sign
, center
, end
;
1376 /* If we are using multiple, independent tasks for LINCS,
1377 * the calls to check_assign_connected should have
1378 * put all connected constraints in our task.
1380 assert(li
->bTaskDep
|| (k
>= li_task
->b0
&& k
< li_task
->b1
));
1382 if (a1
== li
->bla
[2*k
] || a2
== li
->bla
[2*k
+1])
1390 if (a1
== li
->bla
[2*k
] || a1
== li
->bla
[2*k
+1])
1400 li
->blmf
[n
] = sign
*invmass
[center
]*li
->blc
[i
]*li
->blc
[k
];
1401 li
->blmf1
[n
] = sign
*0.5;
1402 if (li
->ncg_triangle
> 0)
1406 /* Look for constraint triangles */
1407 for (nk
= li
->blnr
[k
]; (nk
< li
->blnr
[k
+1]); nk
++)
1410 if (kk
!= i
&& kk
!= k
&&
1411 (li
->bla
[2*kk
] == end
|| li
->bla
[2*kk
+1] == end
))
1413 /* If we are using multiple tasks for LINCS,
1414 * the calls to check_assign_triangle should have
1415 * put all constraints in the triangle in our task.
1417 assert(k
>= li_task
->b0
&& k
< li_task
->b1
);
1418 assert(kk
>= li_task
->b0
&& kk
< li_task
->b1
);
1420 if (li_task
->ntriangle
== 0 ||
1421 li_task
->triangle
[li_task
->ntriangle
- 1] < i
)
1423 /* Add this constraint to the triangle list */
1424 li_task
->triangle
[li_task
->ntriangle
] = i
;
1425 li_task
->tri_bits
[li_task
->ntriangle
] = 0;
1426 li_task
->ntriangle
++;
1427 if (li
->blnr
[i
+1] - li
->blnr
[i
] > static_cast<int>(sizeof(li_task
->tri_bits
[0])*8 - 1))
1429 gmx_fatal(FARGS
, "A constraint is connected to %d constraints, this is more than the %d allowed for constraints participating in triangles",
1430 li
->blnr
[i
+1] - li
->blnr
[i
],
1431 sizeof(li_task
->tri_bits
[0])*8-1);
1434 li_task
->tri_bits
[li_task
->ntriangle
-1] |= (1 << (n
- li
->blnr
[i
]));
1443 /* Sets the elements in the LINCS matrix */
1444 void set_lincs_matrix(struct gmx_lincsdata
*li
, real
*invmass
, real lambda
)
1447 const real invsqrt2
= 0.7071067811865475244;
1449 for (i
= 0; (i
< li
->nc
); i
++)
1454 a2
= li
->bla
[2*i
+1];
1455 li
->blc
[i
] = gmx_invsqrt(invmass
[a1
] + invmass
[a2
]);
1456 li
->blc1
[i
] = invsqrt2
;
1459 /* Construct the coupling coefficient matrix blmf */
1460 int th
, ntriangle
= 0, ncc_triangle
= 0;
1461 #pragma omp parallel for reduction(+: ntriangle, ncc_triangle) num_threads(li->ntask) schedule(static)
1462 for (th
= 0; th
< li
->ntask
; th
++)
1466 set_lincs_matrix_task(li
, &li
->task
[th
], invmass
, &ncc_triangle
);
1467 ntriangle
= li
->task
[th
].ntriangle
;
1469 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
;
1471 li
->ntriangle
= ntriangle
;
1472 li
->ncc_triangle
= ncc_triangle
;
1476 fprintf(debug
, "Of the %d constraints %d participate in triangles\n",
1477 li
->nc
, li
->ntriangle
);
1478 fprintf(debug
, "There are %d couplings of which %d in triangles\n",
1479 li
->ncc
, li
->ncc_triangle
);
1483 * so we know with which lambda value the masses have been set.
1485 li
->matlam
= lambda
;
1488 static int count_triangle_constraints(const t_ilist
*ilist
,
1489 const t_blocka
*at2con
)
1491 int ncon1
, ncon_tot
;
1492 int c0
, a00
, a01
, n1
, c1
, a10
, a11
, ac1
, n2
, c2
, a20
, a21
;
1495 t_iatom
*ia1
, *ia2
, *iap
;
1497 ncon1
= ilist
[F_CONSTR
].nr
/3;
1498 ncon_tot
= ncon1
+ ilist
[F_CONSTRNC
].nr
/3;
1500 ia1
= ilist
[F_CONSTR
].iatoms
;
1501 ia2
= ilist
[F_CONSTRNC
].iatoms
;
1504 for (c0
= 0; c0
< ncon_tot
; c0
++)
1507 iap
= constr_iatomptr(ncon1
, ia1
, ia2
, c0
);
1510 for (n1
= at2con
->index
[a01
]; n1
< at2con
->index
[a01
+1]; n1
++)
1515 iap
= constr_iatomptr(ncon1
, ia1
, ia2
, c1
);
1526 for (n2
= at2con
->index
[ac1
]; n2
< at2con
->index
[ac1
+1]; n2
++)
1529 if (c2
!= c0
&& c2
!= c1
)
1531 iap
= constr_iatomptr(ncon1
, ia1
, ia2
, c2
);
1534 if (a20
== a00
|| a21
== a00
)
1548 return ncon_triangle
;
1551 static gmx_bool
more_than_two_sequential_constraints(const t_ilist
*ilist
,
1552 const t_blocka
*at2con
)
1554 t_iatom
*ia1
, *ia2
, *iap
;
1555 int ncon1
, ncon_tot
, c
;
1557 gmx_bool bMoreThanTwoSequentialConstraints
;
1559 ncon1
= ilist
[F_CONSTR
].nr
/3;
1560 ncon_tot
= ncon1
+ ilist
[F_CONSTRNC
].nr
/3;
1562 ia1
= ilist
[F_CONSTR
].iatoms
;
1563 ia2
= ilist
[F_CONSTRNC
].iatoms
;
1565 bMoreThanTwoSequentialConstraints
= FALSE
;
1566 for (c
= 0; c
< ncon_tot
&& !bMoreThanTwoSequentialConstraints
; c
++)
1568 iap
= constr_iatomptr(ncon1
, ia1
, ia2
, c
);
1571 /* Check if this constraint has constraints connected at both atoms */
1572 if (at2con
->index
[a1
+1] - at2con
->index
[a1
] > 1 &&
1573 at2con
->index
[a2
+1] - at2con
->index
[a2
] > 1)
1575 bMoreThanTwoSequentialConstraints
= TRUE
;
1579 return bMoreThanTwoSequentialConstraints
;
1582 static int int_comp(const void *a
, const void *b
)
1584 return (*(int *)a
) - (*(int *)b
);
1587 gmx_lincsdata_t
init_lincs(FILE *fplog
, const gmx_mtop_t
*mtop
,
1588 int nflexcon_global
, const t_blocka
*at2con
,
1589 gmx_bool bPLINCS
, int nIter
, int nProjOrder
)
1591 struct gmx_lincsdata
*li
;
1593 gmx_moltype_t
*molt
;
1594 gmx_bool bMoreThanTwoSeq
;
1598 fprintf(fplog
, "\nInitializing%s LINear Constraint Solver\n",
1599 bPLINCS
? " Parallel" : "");
1605 gmx_mtop_ftype_count(mtop
, F_CONSTR
) +
1606 gmx_mtop_ftype_count(mtop
, F_CONSTRNC
);
1607 li
->ncg_flex
= nflexcon_global
;
1610 li
->nOrder
= nProjOrder
;
1612 li
->max_connect
= 0;
1613 for (mt
= 0; mt
< mtop
->nmoltype
; mt
++)
1617 molt
= &mtop
->moltype
[mt
];
1618 for (a
= 0; a
< molt
->atoms
.nr
; a
++)
1620 li
->max_connect
= std::max(li
->max_connect
,
1621 at2con
[mt
].index
[a
+ 1] - at2con
[mt
].index
[a
]);
1625 li
->ncg_triangle
= 0;
1626 bMoreThanTwoSeq
= FALSE
;
1627 for (mb
= 0; mb
< mtop
->nmolblock
; mb
++)
1629 molt
= &mtop
->moltype
[mtop
->molblock
[mb
].type
];
1632 mtop
->molblock
[mb
].nmol
*
1633 count_triangle_constraints(molt
->ilist
,
1634 &at2con
[mtop
->molblock
[mb
].type
]);
1636 if (!bMoreThanTwoSeq
&&
1637 more_than_two_sequential_constraints(molt
->ilist
, &at2con
[mtop
->molblock
[mb
].type
]))
1639 bMoreThanTwoSeq
= TRUE
;
1643 /* Check if we need to communicate not only before LINCS,
1644 * but also before each iteration.
1645 * The check for only two sequential constraints is only
1646 * useful for the common case of H-bond only constraints.
1647 * With more effort we could also make it useful for small
1648 * molecules with nr. sequential constraints <= nOrder-1.
1650 li
->bCommIter
= (bPLINCS
&& (li
->nOrder
< 1 || bMoreThanTwoSeq
));
1652 if (debug
&& bPLINCS
)
1654 fprintf(debug
, "PLINCS communication before each iteration: %d\n",
1658 /* LINCS can run on any number of threads.
1659 * Currently the number is fixed for the whole simulation,
1660 * but it could be set in set_lincs().
1661 * The current constraint to task assignment code can create independent
1662 * tasks only when not more than two constraints are connected sequentially.
1664 li
->ntask
= gmx_omp_nthreads_get(emntLINCS
);
1665 li
->bTaskDep
= (li
->ntask
> 1 && bMoreThanTwoSeq
);
1668 fprintf(debug
, "LINCS: using %d threads, tasks are %sdependent\n",
1669 li
->ntask
, li
->bTaskDep
? "" : "in");
1677 /* Allocate an extra elements for "task-overlap" constraints */
1678 snew(li
->task
, li
->ntask
+ 1);
1681 #pragma omp parallel for num_threads(li->ntask)
1682 for (th
= 0; th
< li
->ntask
; th
++)
1686 /* Per thread SIMD load buffer for loading 2 simd_width rvecs */
1687 snew_aligned(li
->task
[th
].simd_buf
, 2*simd_width
*DIM
,
1690 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
;
1693 if (bPLINCS
|| li
->ncg_triangle
> 0)
1695 please_cite(fplog
, "Hess2008a");
1699 please_cite(fplog
, "Hess97a");
1704 fprintf(fplog
, "The number of constraints is %d\n", li
->ncg
);
1707 fprintf(fplog
, "There are inter charge-group constraints,\n"
1708 "will communicate selected coordinates each lincs iteration\n");
1710 if (li
->ncg_triangle
> 0)
1713 "%d constraints are involved in constraint triangles,\n"
1714 "will apply an additional matrix expansion of order %d for couplings\n"
1715 "between constraints inside triangles\n",
1716 li
->ncg_triangle
, li
->nOrder
);
1723 /* Sets up the work division over the threads */
1724 static void lincs_thread_setup(struct gmx_lincsdata
*li
, int natoms
)
1731 if (natoms
> li
->atf_nalloc
)
1733 li
->atf_nalloc
= over_alloc_large(natoms
);
1734 srenew(li
->atf
, li
->atf_nalloc
);
1738 /* Clear the atom flags */
1739 for (a
= 0; a
< natoms
; a
++)
1741 bitmask_clear(&atf
[a
]);
1744 if (li
->ntask
> BITMASK_SIZE
)
1746 gmx_fatal(FARGS
, "More than %d threads is not supported for LINCS.", BITMASK_SIZE
);
1749 for (th
= 0; th
< li
->ntask
; th
++)
1751 lincs_task_t
*li_task
;
1754 li_task
= &li
->task
[th
];
1756 /* For each atom set a flag for constraints from each */
1757 for (b
= li_task
->b0
; b
< li_task
->b1
; b
++)
1759 bitmask_set_bit(&atf
[li
->bla
[b
*2 ]], th
);
1760 bitmask_set_bit(&atf
[li
->bla
[b
*2 + 1]], th
);
1764 #pragma omp parallel for num_threads(li->ntask) schedule(static)
1765 for (th
= 0; th
< li
->ntask
; th
++)
1769 lincs_task_t
*li_task
;
1773 li_task
= &li
->task
[th
];
1775 if (li_task
->b1
- li_task
->b0
> li_task
->ind_nalloc
)
1777 li_task
->ind_nalloc
= over_alloc_large(li_task
->b1
-li_task
->b0
);
1778 srenew(li_task
->ind
, li_task
->ind_nalloc
);
1779 srenew(li_task
->ind_r
, li_task
->ind_nalloc
);
1782 bitmask_init_low_bits(&mask
, th
);
1785 li_task
->nind_r
= 0;
1786 for (b
= li_task
->b0
; b
< li_task
->b1
; b
++)
1788 /* We let the constraint with the lowest thread index
1789 * operate on atoms with constraints from multiple threads.
1791 if (bitmask_is_disjoint(atf
[li
->bla
[b
*2]], mask
) &&
1792 bitmask_is_disjoint(atf
[li
->bla
[b
*2+1]], mask
))
1794 /* Add the constraint to the local atom update index */
1795 li_task
->ind
[li_task
->nind
++] = b
;
1799 /* Add the constraint to the rest block */
1800 li_task
->ind_r
[li_task
->nind_r
++] = b
;
1804 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
;
1807 /* We need to copy all constraints which have not be assigned
1808 * to a thread to a separate list which will be handled by one thread.
1810 li_m
= &li
->task
[li
->ntask
];
1813 for (th
= 0; th
< li
->ntask
; th
++)
1815 lincs_task_t
*li_task
;
1818 li_task
= &li
->task
[th
];
1820 if (li_m
->nind
+ li_task
->nind_r
> li_m
->ind_nalloc
)
1822 li_m
->ind_nalloc
= over_alloc_large(li_m
->nind
+li_task
->nind_r
);
1823 srenew(li_m
->ind
, li_m
->ind_nalloc
);
1826 for (b
= 0; b
< li_task
->nind_r
; b
++)
1828 li_m
->ind
[li_m
->nind
++] = li_task
->ind_r
[b
];
1833 fprintf(debug
, "LINCS thread %d: %d constraints\n",
1840 fprintf(debug
, "LINCS thread r: %d constraints\n",
1845 /* There is no realloc with alignment, so here we make one for reals.
1846 * Note that this function does not preserve the contents of the memory.
1848 static void resize_real_aligned(real
**ptr
, int nelem
)
1850 sfree_aligned(*ptr
);
1851 snew_aligned(*ptr
, nelem
, align_bytes
);
1854 static void assign_constraint(struct gmx_lincsdata
*li
,
1855 int constraint_index
,
1857 real lenA
, real lenB
,
1858 const t_blocka
*at2con
)
1864 /* Make an mapping of local topology constraint index to LINCS index */
1865 li
->con_index
[constraint_index
] = con
;
1867 li
->bllen0
[con
] = lenA
;
1868 li
->ddist
[con
] = lenB
- lenA
;
1869 /* Set the length to the topology A length */
1870 li
->bllen
[con
] = lenA
;
1871 li
->bla
[2*con
] = a1
;
1872 li
->bla
[2*con
+1] = a2
;
1874 /* Make space in the constraint connection matrix for constraints
1875 * connected to both end of the current constraint.
1878 at2con
->index
[a1
+ 1] - at2con
->index
[a1
] - 1 +
1879 at2con
->index
[a2
+ 1] - at2con
->index
[a2
] - 1;
1881 li
->blnr
[con
+ 1] = li
->ncc
;
1883 /* Increase the constraint count */
1887 /* Check if constraint with topology index constraint_index is connected
1888 * to other constraints, and if so add those connected constraints to our task.
1890 static void check_assign_connected(struct gmx_lincsdata
*li
,
1891 const t_iatom
*iatom
,
1895 const t_blocka
*at2con
)
1897 /* Currently this function only supports constraint groups
1898 * in which all constraints share at least one atom
1899 * (e.g. H-bond constraints).
1900 * Check both ends of the current constraint for
1901 * connected constraints. We need to assign those
1906 for (end
= 0; end
< 2; end
++)
1910 a
= (end
== 0 ? a1
: a2
);
1912 for (k
= at2con
->index
[a
]; k
< at2con
->index
[a
+ 1]; k
++)
1917 /* Check if constraint cc has not yet been assigned */
1918 if (li
->con_index
[cc
] == -1)
1924 lenA
= idef
->iparams
[type
].constr
.dA
;
1925 lenB
= idef
->iparams
[type
].constr
.dB
;
1927 if (bDynamics
|| lenA
!= 0 || lenB
!= 0)
1929 assign_constraint(li
, cc
, iatom
[3*cc
+ 1], iatom
[3*cc
+ 2], lenA
, lenB
, at2con
);
1936 /* Check if constraint with topology index constraint_index is involved
1937 * in a constraint triangle, and if so add the other two constraints
1938 * in the triangle to our task.
1940 static void check_assign_triangle(struct gmx_lincsdata
*li
,
1941 const t_iatom
*iatom
,
1944 int constraint_index
,
1946 const t_blocka
*at2con
)
1948 int nca
, cc
[32], ca
[32], k
;
1949 int c_triangle
[2] = { -1, -1 };
1952 for (k
= at2con
->index
[a1
]; k
< at2con
->index
[a1
+ 1]; k
++)
1957 if (c
!= constraint_index
)
1961 aa1
= iatom
[c
*3 + 1];
1962 aa2
= iatom
[c
*3 + 2];
1978 for (k
= at2con
->index
[a2
]; k
< at2con
->index
[a2
+ 1]; k
++)
1983 if (c
!= constraint_index
)
1987 aa1
= iatom
[c
*3 + 1];
1988 aa2
= iatom
[c
*3 + 2];
1991 for (i
= 0; i
< nca
; i
++)
1995 c_triangle
[0] = cc
[i
];
2002 for (i
= 0; i
< nca
; i
++)
2006 c_triangle
[0] = cc
[i
];
2014 if (c_triangle
[0] >= 0)
2018 for (end
= 0; end
< 2; end
++)
2020 /* Check if constraint c_triangle[end] has not yet been assigned */
2021 if (li
->con_index
[c_triangle
[end
]] == -1)
2026 i
= c_triangle
[end
]*3;
2028 lenA
= idef
->iparams
[type
].constr
.dA
;
2029 lenB
= idef
->iparams
[type
].constr
.dB
;
2031 if (bDynamics
|| lenA
!= 0 || lenB
!= 0)
2033 assign_constraint(li
, c_triangle
[end
], iatom
[i
+ 1], iatom
[i
+ 2], lenA
, lenB
, at2con
);
2040 static void set_matrix_indices(struct gmx_lincsdata
*li
,
2041 const lincs_task_t
*li_task
,
2042 const t_blocka
*at2con
,
2043 gmx_bool bSortMatrix
)
2047 for (b
= li_task
->b0
; b
< li_task
->b1
; b
++)
2052 a2
= li
->bla
[b
*2 + 1];
2055 for (k
= at2con
->index
[a1
]; k
< at2con
->index
[a1
+ 1]; k
++)
2059 concon
= li
->con_index
[at2con
->a
[k
]];
2062 li
->blbnb
[i
++] = concon
;
2065 for (k
= at2con
->index
[a2
]; k
< at2con
->index
[a2
+ 1]; k
++)
2069 concon
= li
->con_index
[at2con
->a
[k
]];
2072 li
->blbnb
[i
++] = concon
;
2078 /* Order the blbnb matrix to optimize memory access */
2079 qsort(&(li
->blbnb
[li
->blnr
[b
]]), li
->blnr
[b
+ 1] - li
->blnr
[b
],
2080 sizeof(li
->blbnb
[0]), int_comp
);
2085 void set_lincs(const t_idef
*idef
,
2086 const t_mdatoms
*md
,
2089 struct gmx_lincsdata
*li
)
2091 int natoms
, nflexcon
;
2094 int i
, ncc_alloc_old
, ncon_tot
;
2099 /* Zero the thread index ranges.
2100 * Otherwise without local constraints we could return with old ranges.
2102 for (i
= 0; i
< li
->ntask
; i
++)
2106 li
->task
[i
].nind
= 0;
2110 li
->task
[li
->ntask
].nind
= 0;
2113 /* This is the local topology, so there are only F_CONSTR constraints */
2114 if (idef
->il
[F_CONSTR
].nr
== 0)
2116 /* There are no constraints,
2117 * we do not need to fill any data structures.
2124 fprintf(debug
, "Building the LINCS connectivity\n");
2127 if (DOMAINDECOMP(cr
))
2129 if (cr
->dd
->constraints
)
2133 dd_get_constraint_range(cr
->dd
, &start
, &natoms
);
2137 natoms
= cr
->dd
->nat_home
;
2142 natoms
= md
->homenr
;
2144 at2con
= make_at2con(0, natoms
, idef
->il
, idef
->iparams
, bDynamics
,
2147 ncon_tot
= idef
->il
[F_CONSTR
].nr
/3;
2149 /* Ensure we have enough padding for aligned loads for each thread */
2150 if (ncon_tot
+ li
->ntask
*simd_width
> li
->nc_alloc
|| li
->nc_alloc
== 0)
2152 li
->nc_alloc
= over_alloc_dd(ncon_tot
+ li
->ntask
*simd_width
);
2153 srenew(li
->con_index
, li
->nc_alloc
);
2154 resize_real_aligned(&li
->bllen0
, li
->nc_alloc
);
2155 resize_real_aligned(&li
->ddist
, li
->nc_alloc
);
2156 srenew(li
->bla
, 2*li
->nc_alloc
);
2157 resize_real_aligned(&li
->blc
, li
->nc_alloc
);
2158 resize_real_aligned(&li
->blc1
, li
->nc_alloc
);
2159 srenew(li
->blnr
, li
->nc_alloc
+ 1);
2160 resize_real_aligned(&li
->bllen
, li
->nc_alloc
);
2161 srenew(li
->tmpv
, li
->nc_alloc
);
2162 if (DOMAINDECOMP(cr
))
2164 srenew(li
->nlocat
, li
->nc_alloc
);
2166 resize_real_aligned(&li
->tmp1
, li
->nc_alloc
);
2167 resize_real_aligned(&li
->tmp2
, li
->nc_alloc
);
2168 resize_real_aligned(&li
->tmp3
, li
->nc_alloc
);
2169 resize_real_aligned(&li
->tmp4
, li
->nc_alloc
);
2170 resize_real_aligned(&li
->mlambda
, li
->nc_alloc
);
2173 iatom
= idef
->il
[F_CONSTR
].iatoms
;
2175 ncc_alloc_old
= li
->ncc_alloc
;
2176 li
->blnr
[0] = li
->ncc
;
2178 /* Assign the constraints for li->ntask LINCS tasks.
2179 * We target a uniform distribution of constraints over the tasks.
2180 * Note that when flexible constraints are present, but are removed here
2181 * (e.g. because we are doing EM) we get imbalance, but since that doesn't
2182 * happen during normal MD, that's ok.
2184 int ncon_assign
, ncon_target
, con
, th
;
2186 /* Determine the number of constraints we need to assign here */
2187 ncon_assign
= ncon_tot
;
2190 /* With energy minimization, flexible constraints are ignored
2191 * (and thus minimized, as they should be).
2193 ncon_assign
-= nflexcon
;
2196 /* Set the target constraint count per task to exactly uniform,
2197 * this might be overridden below.
2199 ncon_target
= (ncon_assign
+ li
->ntask
- 1)/li
->ntask
;
2201 /* Mark all constraints as unassigned by setting their index to -1 */
2202 for (con
= 0; con
< ncon_tot
; con
++)
2204 li
->con_index
[con
] = -1;
2208 for (th
= 0; th
< li
->ntask
; th
++)
2210 lincs_task_t
*li_task
;
2212 li_task
= &li
->task
[th
];
2215 /* With indepedent tasks we likely have H-bond constraints or constraint
2216 * pairs. The connected constraints will be pulled into the task, so the
2217 * constraints per task will often exceed ncon_target.
2218 * Triangle constraints can also increase the count, but there are
2219 * relatively few of those, so we usually expect to get ncon_target.
2223 /* We round ncon_target to a multiple of GMX_SIMD_WIDTH,
2224 * since otherwise a lot of operations can be wasted.
2225 * There are several ways to round here, we choose the one
2226 * that alternates block sizes, which helps with Intel HT.
2228 ncon_target
= ((ncon_assign
*(th
+ 1))/li
->ntask
- li
->nc_real
+ GMX_SIMD_REAL_WIDTH
- 1) & ~(GMX_SIMD_REAL_WIDTH
- 1);
2232 /* Continue filling the arrays where we left off with the previous task,
2233 * including padding for SIMD.
2235 li_task
->b0
= li
->nc
;
2237 while (con
< ncon_tot
&& li
->nc
- li_task
->b0
< ncon_target
)
2239 if (li
->con_index
[con
] == -1)
2244 type
= iatom
[3*con
];
2245 a1
= iatom
[3*con
+ 1];
2246 a2
= iatom
[3*con
+ 2];
2247 lenA
= idef
->iparams
[type
].constr
.dA
;
2248 lenB
= idef
->iparams
[type
].constr
.dB
;
2249 /* Skip the flexible constraints when not doing dynamics */
2250 if (bDynamics
|| lenA
!= 0 || lenB
!= 0)
2252 assign_constraint(li
, con
, a1
, a2
, lenA
, lenB
, &at2con
);
2254 if (li
->ntask
> 1 && !li
->bTaskDep
)
2256 /* We can generate independent tasks. Check if we
2257 * need to assign connected constraints to our task.
2259 check_assign_connected(li
, iatom
, idef
, bDynamics
,
2262 if (li
->ntask
> 1 && li
->ncg_triangle
> 0)
2264 /* Ensure constraints in one triangle are assigned
2267 check_assign_triangle(li
, iatom
, idef
, bDynamics
,
2268 con
, a1
, a2
, &at2con
);
2276 li_task
->b1
= li
->nc
;
2280 /* Copy the last atom pair indices and lengths for constraints
2281 * up to a multiple of simd_width, such that we can do all
2282 * SIMD operations without having to worry about end effects.
2286 li
->nc
= ((li_task
->b1
+ simd_width
- 1)/simd_width
)*simd_width
;
2287 last
= li_task
->b1
- 1;
2288 for (i
= li_task
->b1
; i
< li
->nc
; i
++)
2290 li
->bla
[i
*2 ] = li
->bla
[last
*2 ];
2291 li
->bla
[i
*2 + 1] = li
->bla
[last
*2 + 1];
2292 li
->bllen0
[i
] = li
->bllen0
[last
];
2293 li
->ddist
[i
] = li
->ddist
[last
];
2294 li
->bllen
[i
] = li
->bllen
[last
];
2295 li
->blnr
[i
+ 1] = li
->blnr
[last
+ 1];
2299 /* Keep track of how many constraints we assigned */
2300 li
->nc_real
+= li_task
->b1
- li_task
->b0
;
2304 fprintf(debug
, "LINCS task %d constraints %d - %d\n",
2305 th
, li_task
->b0
, li_task
->b1
);
2309 assert(li
->nc_real
== ncon_assign
);
2311 gmx_bool bSortMatrix
;
2313 /* Without DD we order the blbnb matrix to optimize memory access.
2314 * With DD the overhead of sorting is more than the gain during access.
2316 bSortMatrix
= !DOMAINDECOMP(cr
);
2318 if (li
->ncc
> li
->ncc_alloc
)
2320 li
->ncc_alloc
= over_alloc_small(li
->ncc
);
2321 srenew(li
->blbnb
, li
->ncc_alloc
);
2324 #pragma omp parallel for num_threads(li->ntask) schedule(static)
2325 for (th
= 0; th
< li
->ntask
; th
++)
2329 lincs_task_t
*li_task
;
2331 li_task
= &li
->task
[th
];
2333 if (li
->ncg_triangle
> 0 &&
2334 li_task
->b1
- li_task
->b0
> li_task
->tri_alloc
)
2336 /* This is allocating too much, but it is difficult to improve */
2337 li_task
->tri_alloc
= over_alloc_dd(li_task
->b1
- li_task
->b0
);
2338 srenew(li_task
->triangle
, li_task
->tri_alloc
);
2339 srenew(li_task
->tri_bits
, li_task
->tri_alloc
);
2342 set_matrix_indices(li
, li_task
, &at2con
, bSortMatrix
);
2344 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
;
2347 done_blocka(&at2con
);
2351 /* Since the matrix is static, we should free some memory */
2352 li
->ncc_alloc
= li
->ncc
;
2353 srenew(li
->blbnb
, li
->ncc_alloc
);
2356 if (li
->ncc_alloc
> ncc_alloc_old
)
2358 srenew(li
->blmf
, li
->ncc_alloc
);
2359 srenew(li
->blmf1
, li
->ncc_alloc
);
2360 srenew(li
->tmpncc
, li
->ncc_alloc
);
2363 if (DOMAINDECOMP(cr
) && dd_constraints_nlocalatoms(cr
->dd
) != NULL
)
2367 nlocat_dd
= dd_constraints_nlocalatoms(cr
->dd
);
2369 /* Convert nlocat from local topology to LINCS constraint indexing */
2370 for (con
= 0; con
< ncon_tot
; con
++)
2372 li
->nlocat
[li
->con_index
[con
]] = nlocat_dd
[con
];
2382 fprintf(debug
, "Number of constraints is %d, padded %d, couplings %d\n",
2383 li
->nc_real
, li
->nc
, li
->ncc
);
2388 lincs_thread_setup(li
, md
->nr
);
2391 set_lincs_matrix(li
, md
->invmass
, md
->lambda
);
2394 static void lincs_warning(FILE *fplog
,
2395 gmx_domdec_t
*dd
, rvec
*x
, rvec
*xprime
, t_pbc
*pbc
,
2396 int ncons
, int *bla
, real
*bllen
, real wangle
,
2397 int maxwarn
, int *warncount
)
2401 real wfac
, d0
, d1
, cosine
;
2404 wfac
= cos(DEG2RAD
*wangle
);
2406 sprintf(buf
, "bonds that rotated more than %g degrees:\n"
2407 " atom 1 atom 2 angle previous, current, constraint length\n",
2409 fprintf(stderr
, "%s", buf
);
2412 fprintf(fplog
, "%s", buf
);
2415 for (b
= 0; b
< ncons
; b
++)
2421 pbc_dx_aiuc(pbc
, x
[i
], x
[j
], v0
);
2422 pbc_dx_aiuc(pbc
, xprime
[i
], xprime
[j
], v1
);
2426 rvec_sub(x
[i
], x
[j
], v0
);
2427 rvec_sub(xprime
[i
], xprime
[j
], v1
);
2431 cosine
= iprod(v0
, v1
)/(d0
*d1
);
2434 sprintf(buf
, " %6d %6d %5.1f %8.4f %8.4f %8.4f\n",
2435 ddglatnr(dd
, i
), ddglatnr(dd
, j
),
2436 RAD2DEG
*acos(cosine
), d0
, d1
, bllen
[b
]);
2437 fprintf(stderr
, "%s", buf
);
2440 fprintf(fplog
, "%s", buf
);
2442 if (!std::isfinite(d1
))
2444 gmx_fatal(FARGS
, "Bond length not finite.");
2450 if (*warncount
> maxwarn
)
2452 too_many_constraint_warnings(econtLINCS
, *warncount
);
2456 static void cconerr(const struct gmx_lincsdata
*lincsd
,
2457 rvec
*x
, t_pbc
*pbc
,
2458 real
*ncons_loc
, real
*ssd
, real
*max
, int *imax
)
2460 const int *bla
, *nlocat
;
2463 int count
, im
, task
;
2466 bllen
= lincsd
->bllen
;
2467 nlocat
= lincsd
->nlocat
;
2473 for (task
= 0; task
< lincsd
->ntask
; task
++)
2477 for (b
= lincsd
->task
[task
].b0
; b
< lincsd
->task
[task
].b1
; b
++)
2484 pbc_dx_aiuc(pbc
, x
[bla
[2*b
]], x
[bla
[2*b
+1]], dx
);
2488 rvec_sub(x
[bla
[2*b
]], x
[bla
[2*b
+1]], dx
);
2491 len
= r2
*gmx_invsqrt(r2
);
2492 d
= fabs(len
/bllen
[b
]-1);
2493 if (d
> ma
&& (nlocat
== NULL
|| nlocat
[b
]))
2505 ssd2
+= nlocat
[b
]*d
*d
;
2511 *ncons_loc
= (nlocat
? 0.5 : 1)*count
;
2512 *ssd
= (nlocat
? 0.5 : 1)*ssd2
;
2517 gmx_bool
constrain_lincs(FILE *fplog
, gmx_bool bLog
, gmx_bool bEner
,
2520 struct gmx_lincsdata
*lincsd
, t_mdatoms
*md
,
2522 rvec
*x
, rvec
*xprime
, rvec
*min_proj
,
2523 matrix box
, t_pbc
*pbc
,
2524 real lambda
, real
*dvdlambda
,
2525 real invdt
, rvec
*v
,
2526 gmx_bool bCalcVir
, tensor vir_r_m_dr
,
2529 int maxwarn
, int *warncount
)
2532 char buf
[STRLEN
], buf2
[22], buf3
[STRLEN
];
2534 real ncons_loc
, p_ssd
, p_max
= 0;
2536 gmx_bool bOK
, bWarn
;
2540 /* This boolean should be set by a flag passed to this routine.
2541 * We can also easily check if any constraint length is changed,
2542 * if not dH/dlambda=0 and we can also set the boolean to FALSE.
2544 bCalcDHDL
= (ir
->efep
!= efepNO
&& dvdlambda
!= NULL
);
2546 if (lincsd
->nc
== 0 && cr
->dd
== NULL
)
2550 lincsd
->rmsd_data
[0] = 0;
2551 if (ir
->eI
== eiSD2
&& v
== NULL
)
2559 lincsd
->rmsd_data
[i
] = 0;
2565 if (econq
== econqCoord
)
2567 /* We can't use bCalcDHDL here, since NULL can be passed for dvdlambda
2568 * also with efep!=fepNO.
2570 if (ir
->efep
!= efepNO
)
2572 if (md
->nMassPerturbed
&& lincsd
->matlam
!= md
->lambda
)
2574 set_lincs_matrix(lincsd
, md
->invmass
, md
->lambda
);
2577 for (i
= 0; i
< lincsd
->nc
; i
++)
2579 lincsd
->bllen
[i
] = lincsd
->bllen0
[i
] + lambda
*lincsd
->ddist
[i
];
2583 if (lincsd
->ncg_flex
)
2585 /* Set the flexible constraint lengths to the old lengths */
2588 for (i
= 0; i
< lincsd
->nc
; i
++)
2590 if (lincsd
->bllen
[i
] == 0)
2592 pbc_dx_aiuc(pbc
, x
[lincsd
->bla
[2*i
]], x
[lincsd
->bla
[2*i
+1]], dx
);
2593 lincsd
->bllen
[i
] = norm(dx
);
2599 for (i
= 0; i
< lincsd
->nc
; i
++)
2601 if (lincsd
->bllen
[i
] == 0)
2604 sqrt(distance2(x
[lincsd
->bla
[2*i
]],
2605 x
[lincsd
->bla
[2*i
+1]]));
2613 cconerr(lincsd
, xprime
, pbc
,
2614 &ncons_loc
, &p_ssd
, &p_max
, &p_imax
);
2617 /* This bWarn var can be updated by multiple threads
2618 * at the same time. But as we only need to detect
2619 * if a warning occured or not, this is not an issue.
2623 /* The OpenMP parallel region of constrain_lincs for coords */
2624 #pragma omp parallel num_threads(lincsd->ntask)
2628 int th
= gmx_omp_get_thread_num();
2630 clear_mat(lincsd
->task
[th
].vir_r_m_dr
);
2632 do_lincs(x
, xprime
, box
, pbc
, lincsd
, th
,
2635 ir
->LincsWarnAngle
, &bWarn
,
2637 th
== 0 ? vir_r_m_dr
: lincsd
->task
[th
].vir_r_m_dr
);
2639 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
;
2642 if (bLog
&& fplog
&& lincsd
->nc
> 0)
2644 fprintf(fplog
, " Rel. Constraint Deviation: RMS MAX between atoms\n");
2645 fprintf(fplog
, " Before LINCS %.6f %.6f %6d %6d\n",
2646 sqrt(p_ssd
/ncons_loc
), p_max
,
2647 ddglatnr(cr
->dd
, lincsd
->bla
[2*p_imax
]),
2648 ddglatnr(cr
->dd
, lincsd
->bla
[2*p_imax
+1]));
2652 cconerr(lincsd
, xprime
, pbc
,
2653 &ncons_loc
, &p_ssd
, &p_max
, &p_imax
);
2654 /* Check if we are doing the second part of SD */
2655 if (ir
->eI
== eiSD2
&& v
== NULL
)
2663 lincsd
->rmsd_data
[0] = ncons_loc
;
2664 lincsd
->rmsd_data
[i
] = p_ssd
;
2668 lincsd
->rmsd_data
[0] = 0;
2669 lincsd
->rmsd_data
[1] = 0;
2670 lincsd
->rmsd_data
[2] = 0;
2672 if (bLog
&& fplog
&& lincsd
->nc
> 0)
2675 " After LINCS %.6f %.6f %6d %6d\n\n",
2676 sqrt(p_ssd
/ncons_loc
), p_max
,
2677 ddglatnr(cr
->dd
, lincsd
->bla
[2*p_imax
]),
2678 ddglatnr(cr
->dd
, lincsd
->bla
[2*p_imax
+1]));
2685 cconerr(lincsd
, xprime
, pbc
,
2686 &ncons_loc
, &p_ssd
, &p_max
, &p_imax
);
2689 sprintf(buf3
, " in simulation %d", cr
->ms
->sim
);
2695 sprintf(buf
, "\nStep %s, time %g (ps) LINCS WARNING%s\n"
2696 "relative constraint deviation after LINCS:\n"
2697 "rms %.6f, max %.6f (between atoms %d and %d)\n",
2698 gmx_step_str(step
, buf2
), ir
->init_t
+step
*ir
->delta_t
,
2700 sqrt(p_ssd
/ncons_loc
), p_max
,
2701 ddglatnr(cr
->dd
, lincsd
->bla
[2*p_imax
]),
2702 ddglatnr(cr
->dd
, lincsd
->bla
[2*p_imax
+1]));
2705 fprintf(fplog
, "%s", buf
);
2707 fprintf(stderr
, "%s", buf
);
2708 lincs_warning(fplog
, cr
->dd
, x
, xprime
, pbc
,
2709 lincsd
->nc
, lincsd
->bla
, lincsd
->bllen
,
2710 ir
->LincsWarnAngle
, maxwarn
, warncount
);
2712 bOK
= (p_max
< 0.5);
2715 if (lincsd
->ncg_flex
)
2717 for (i
= 0; (i
< lincsd
->nc
); i
++)
2719 if (lincsd
->bllen0
[i
] == 0 && lincsd
->ddist
[i
] == 0)
2721 lincsd
->bllen
[i
] = 0;
2728 /* The OpenMP parallel region of constrain_lincs for derivatives */
2729 #pragma omp parallel num_threads(lincsd->ntask)
2733 int th
= gmx_omp_get_thread_num();
2735 do_lincsp(x
, xprime
, min_proj
, pbc
, lincsd
, th
,
2736 md
->invmass
, econq
, bCalcDHDL
,
2737 bCalcVir
, th
== 0 ? vir_r_m_dr
: lincsd
->task
[th
].vir_r_m_dr
);
2739 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
;
2745 /* Reduce the dH/dlambda contributions over the threads */
2750 for (th
= 0; th
< lincsd
->ntask
; th
++)
2752 dhdlambda
+= lincsd
->task
[th
].dhdlambda
;
2754 if (econq
== econqCoord
)
2756 /* dhdlambda contains dH/dlambda*dt^2, correct for this */
2757 /* TODO This should probably use invdt, so that sd integrator scaling works properly */
2758 dhdlambda
/= ir
->delta_t
*ir
->delta_t
;
2760 *dvdlambda
+= dhdlambda
;
2763 if (bCalcVir
&& lincsd
->ntask
> 1)
2765 for (i
= 1; i
< lincsd
->ntask
; i
++)
2767 m_add(vir_r_m_dr
, lincsd
->task
[i
].vir_r_m_dr
, vir_r_m_dr
);
2771 /* count assuming nit=1 */
2772 inc_nrnb(nrnb
, eNR_LINCS
, lincsd
->nc_real
);
2773 inc_nrnb(nrnb
, eNR_LINCSMAT
, (2+lincsd
->nOrder
)*lincsd
->ncc
);
2774 if (lincsd
->ntriangle
> 0)
2776 inc_nrnb(nrnb
, eNR_LINCSMAT
, lincsd
->nOrder
*lincsd
->ncc_triangle
);
2780 inc_nrnb(nrnb
, eNR_CONSTR_V
, lincsd
->nc_real
*2);
2784 inc_nrnb(nrnb
, eNR_CONSTR_VIR
, lincsd
->nc_real
);