2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
37 /* This file is completely threadsafe - keep it that way! */
48 #include "gromacs/domdec/domdec.h"
49 #include "gromacs/fileio/gmxfio.h"
50 #include "gromacs/legacyheaders/constr.h"
51 #include "gromacs/legacyheaders/copyrite.h"
52 #include "gromacs/legacyheaders/gmx_omp_nthreads.h"
53 #include "gromacs/legacyheaders/mdrun.h"
54 #include "gromacs/legacyheaders/nrnb.h"
55 #include "gromacs/legacyheaders/types/commrec.h"
56 #include "gromacs/math/units.h"
57 #include "gromacs/math/vec.h"
58 #include "gromacs/pbcutil/pbc-simd.h"
59 #include "gromacs/pbcutil/pbc.h"
60 #include "gromacs/simd/simd.h"
61 #include "gromacs/simd/simd_math.h"
62 #include "gromacs/simd/vector_operations.h"
63 #include "gromacs/topology/block.h"
64 #include "gromacs/topology/mtop_util.h"
65 #include "gromacs/utility/bitmask.h"
66 #include "gromacs/utility/fatalerror.h"
67 #include "gromacs/utility/gmxomp.h"
68 #include "gromacs/utility/smalloc.h"
70 /* MSVC 2010 produces buggy SIMD PBC code, disable SIMD for MSVC <= 2010 */
71 #if defined GMX_SIMD_HAVE_REAL && !(defined _MSC_VER && _MSC_VER < 1700)
76 #if defined(GMX_SIMD_X86_AVX_256) || defined(GMX_SIMD_X86_AVX2_256)
78 // This was originally work-in-progress for augmenting the SIMD module with
79 // masked load/store operations. Instead, that turned into and extended SIMD
80 // interface that supports gather/scatter in all platforms, which will be
81 // part of a future Gromacs version. However, since the code for bonded
82 // interactions and LINCS was already written it would be a pity not to get
83 // the performance gains in Gromacs-5.1. For this reason we have added it as
84 // a bit of a hack in the two files that use it. It will be replaced with the
85 // new generic functionality after version 5.1
88 static gmx_inline
void gmx_simdcall
89 gmx_hack_simd_transpose4_r(gmx_simd_double_t
*row0
,
90 gmx_simd_double_t
*row1
,
91 gmx_simd_double_t
*row2
,
92 gmx_simd_double_t
*row3
)
94 __m256d tmp0
, tmp1
, tmp2
, tmp3
;
96 tmp0
= _mm256_unpacklo_pd(*row0
, *row1
);
97 tmp2
= _mm256_unpacklo_pd(*row2
, *row3
);
98 tmp1
= _mm256_unpackhi_pd(*row0
, *row1
);
99 tmp3
= _mm256_unpackhi_pd(*row2
, *row3
);
100 *row0
= _mm256_permute2f128_pd(tmp0
, tmp2
, 0b00100000);
101 *row1
= _mm256_permute2f128_pd(tmp1
, tmp3
, 0b00100000);
102 *row2
= _mm256_permute2f128_pd(tmp0
, tmp2
, 0b00110001);
103 *row3
= _mm256_permute2f128_pd(tmp1
, tmp3
, 0b00110001);
106 static gmx_inline
void gmx_simdcall
107 gmx_hack_simd4_transpose_to_simd_r(const gmx_simd4_double_t
*a
,
108 gmx_simd_double_t
*row0
,
109 gmx_simd_double_t
*row1
,
110 gmx_simd_double_t
*row2
,
111 gmx_simd_double_t
*row3
)
118 gmx_hack_simd_transpose4_r(row0
, row1
, row2
, row3
);
121 static gmx_inline
void gmx_simdcall
122 gmx_hack_simd_transpose_to_simd4_r(gmx_simd_double_t row0
,
123 gmx_simd_double_t row1
,
124 gmx_simd_double_t row2
,
125 gmx_simd_double_t row3
,
126 gmx_simd4_double_t
*a
)
133 gmx_hack_simd_transpose4_r(&a
[0], &a
[1], &a
[2], &a
[3]);
137 # ifdef GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG
138 # define gmx_hack_simd4_load3_r(mem) _mm256_maskload_pd((mem), _mm_castsi128_ps(_mm256_set_epi32(0, 0, -1, -1, -1, -1, -1, -1)))
139 # define gmx_hack_simd4_store3_r(mem, x) _mm256_maskstore_pd((mem), _mm_castsi128_ps(_mm256_set_epi32(0, 0, -1, -1, -1, -1, -1, -1)), (x))
141 # define gmx_hack_simd4_load3_r(mem) _mm256_maskload_pd((mem), _mm256_set_epi32(0, 0, -1, -1, -1, -1, -1, -1))
142 # define gmx_hack_simd4_store3_r(mem, x) _mm256_maskstore_pd((mem), _mm256_set_epi32(0, 0, -1, -1, -1, -1, -1, -1), (x))
145 # else /* single instead of double */
146 static gmx_inline
void gmx_simdcall
147 gmx_hack_simd_transpose4_r(gmx_simd_float_t
*row0
,
148 gmx_simd_float_t
*row1
,
149 gmx_simd_float_t
*row2
,
150 gmx_simd_float_t
*row3
)
152 __m256 tmp0
, tmp1
, tmp2
, tmp3
;
154 tmp0
= _mm256_unpacklo_ps(*row0
, *row1
);
155 tmp2
= _mm256_unpacklo_ps(*row2
, *row3
);
156 tmp1
= _mm256_unpackhi_ps(*row0
, *row1
);
157 tmp3
= _mm256_unpackhi_ps(*row2
, *row3
);
158 *row0
= _mm256_shuffle_ps(tmp0
, tmp2
, 0b0100010001000100);
159 *row1
= _mm256_shuffle_ps(tmp0
, tmp2
, 0b1110111011101110);
160 *row2
= _mm256_shuffle_ps(tmp1
, tmp3
, 0b0100010001000100);
161 *row3
= _mm256_shuffle_ps(tmp1
, tmp3
, 0b1110111011101110);
164 static gmx_inline
void gmx_simdcall
165 gmx_hack_simd4_transpose_to_simd_r(const gmx_simd4_float_t
*a
,
166 gmx_simd_float_t
*row0
,
167 gmx_simd_float_t
*row1
,
168 gmx_simd_float_t
*row2
,
169 gmx_simd_float_t
*row3
)
171 *row0
= _mm256_insertf128_ps(_mm256_castps128_ps256(a
[0]), a
[4], 1);
172 *row1
= _mm256_insertf128_ps(_mm256_castps128_ps256(a
[1]), a
[5], 1);
173 *row2
= _mm256_insertf128_ps(_mm256_castps128_ps256(a
[2]), a
[6], 1);
174 *row3
= _mm256_insertf128_ps(_mm256_castps128_ps256(a
[3]), a
[7], 1);
176 gmx_hack_simd_transpose4_r(row0
, row1
, row2
, row3
);
179 static gmx_inline
void gmx_simdcall
180 gmx_hack_simd_transpose_to_simd4_r(gmx_simd_float_t row0
,
181 gmx_simd_float_t row1
,
182 gmx_simd_float_t row2
,
183 gmx_simd_float_t row3
,
184 gmx_simd4_float_t
*a
)
186 gmx_hack_simd_transpose4_r(&row0
, &row1
, &row2
, &row3
);
188 a
[0] = _mm256_extractf128_ps(row0
, 0);
189 a
[1] = _mm256_extractf128_ps(row1
, 0);
190 a
[2] = _mm256_extractf128_ps(row2
, 0);
191 a
[3] = _mm256_extractf128_ps(row3
, 0);
192 a
[4] = _mm256_extractf128_ps(row0
, 1);
193 a
[5] = _mm256_extractf128_ps(row1
, 1);
194 a
[6] = _mm256_extractf128_ps(row2
, 1);
195 a
[7] = _mm256_extractf128_ps(row3
, 1);
197 #ifdef GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG
198 # define gmx_hack_simd4_load3_r(mem) _mm_maskload_ps((mem), _mm_castsi256_pd(_mm_set_epi32(0, -1, -1, -1)))
199 # define gmx_hack_simd4_store3_r(mem, x) _mm_maskstore_ps((mem), _mm_castsi256_pd(_mm_set_epi32(0, -1, -1, -1)), (x))
201 # define gmx_hack_simd4_load3_r(mem) _mm_maskload_ps((mem), _mm_set_epi32(0, -1, -1, -1))
202 # define gmx_hack_simd4_store3_r(mem, x) _mm_maskstore_ps((mem), _mm_set_epi32(0, -1, -1, -1), (x))
209 #ifdef GMX_SIMD_HAVE_REAL
210 /*! \brief Store differences between indexed rvecs in SIMD registers.
212 * Returns SIMD register with the difference vectors:
213 * v[pair_index[i*2]] - v[pair_index[i*2 + 1]]
215 * \param[in] v Array of rvecs
216 * \param[in] pair_index Index pairs for GMX_SIMD_REAL_WIDTH vector pairs
217 * \param[in,out] buf Aligned tmp buffer of size 3*GMX_SIMD_REAL_WIDTH
218 * \param[out] dx SIMD register with x difference
219 * \param[out] dy SIMD register with y difference
220 * \param[out] dz SIMD register with z difference
222 static gmx_inline
void gmx_simdcall
223 gmx_hack_simd_gather_rvec_dist_pair_index(const rvec
*v
,
224 const int *pair_index
,
225 real gmx_unused
*buf
,
230 #if defined(GMX_SIMD_X86_AVX_256) || defined(GMX_SIMD_X86_AVX2_256)
232 gmx_simd4_real_t d
[GMX_SIMD_REAL_WIDTH
];
235 for (i
= 0; i
< GMX_SIMD_REAL_WIDTH
; i
++)
237 d
[i
] = gmx_simd4_sub_r(gmx_hack_simd4_load3_r(&(v
[pair_index
[i
*2 + 0]][0])),
238 gmx_hack_simd4_load3_r(&(v
[pair_index
[i
*2 + 1]][0])));
241 gmx_hack_simd4_transpose_to_simd_r(d
, dx
, dy
, dz
, &tmp
);
244 GMX_ALIGNED(real
, GMX_SIMD_REAL_WIDTH
) buf_aligned
[3*GMX_SIMD_REAL_WIDTH
];
246 real
* buf_aligned
= buf
;
251 for (i
= 0; i
< GMX_SIMD_REAL_WIDTH
; i
++)
253 /* Store the distances packed and aligned */
254 for (m
= 0; m
< DIM
; m
++)
256 buf_aligned
[m
*GMX_SIMD_REAL_WIDTH
+ i
] =
257 v
[pair_index
[i
*2]][m
] - v
[pair_index
[i
*2 + 1]][m
];
260 *dx
= gmx_simd_load_r(buf_aligned
+ 0*GMX_SIMD_REAL_WIDTH
);
261 *dy
= gmx_simd_load_r(buf_aligned
+ 1*GMX_SIMD_REAL_WIDTH
);
262 *dz
= gmx_simd_load_r(buf_aligned
+ 2*GMX_SIMD_REAL_WIDTH
);
267 /*! \brief Stores SIMD vector into multiple rvecs.
269 * \param[in] x SIMD register with x-components of the vectors
270 * \param[in] y SIMD register with y-components of the vectors
271 * \param[in] z SIMD register with z-components of the vectors
272 * \param[in,out] buf Aligned tmp buffer of size 3*GMX_SIMD_REAL_WIDTH
273 * \param[out] v Array of GMX_SIMD_REAL_WIDTH rvecs
275 static gmx_inline
void gmx_simdcall
276 gmx_simd_store_vec_to_rvec(gmx_simd_real_t x
,
279 real gmx_unused
*buf
,
282 #if defined(GMX_SIMD_X86_AVX_256) || defined(GMX_SIMD_X86_AVX2_256)
284 gmx_simd4_real_t s4
[GMX_SIMD_REAL_WIDTH
];
285 gmx_simd_real_t zero
= gmx_simd_setzero_r();
287 gmx_hack_simd_transpose_to_simd4_r(x
, y
, z
, zero
, s4
);
289 for (i
= 0; i
< GMX_SIMD_REAL_WIDTH
; i
++)
291 gmx_hack_simd4_store3_r(v
[i
], s4
[i
]);
295 GMX_ALIGNED(real
, GMX_SIMD_REAL_WIDTH
) buf_aligned
[3*GMX_SIMD_REAL_WIDTH
];
297 real
* buf_aligned
= buf
;
302 gmx_simd_store_r(buf_aligned
+ 0*GMX_SIMD_REAL_WIDTH
, x
);
303 gmx_simd_store_r(buf_aligned
+ 1*GMX_SIMD_REAL_WIDTH
, y
);
304 gmx_simd_store_r(buf_aligned
+ 2*GMX_SIMD_REAL_WIDTH
, z
);
306 for (i
= 0; i
< GMX_SIMD_REAL_WIDTH
; i
++)
308 for (m
= 0; m
< DIM
; m
++)
310 v
[i
][m
] = buf_aligned
[m
*GMX_SIMD_REAL_WIDTH
+ i
];
315 #endif /* GMX_SIMD_HAVE_REAL */
319 int b0
; /* first constraint for this task */
320 int b1
; /* b1-1 is the last constraint for this task */
321 int ntriangle
; /* the number of constraints in triangles */
322 int *triangle
; /* the list of triangle constraints */
323 int *tri_bits
; /* the bits tell if the matrix element should be used */
324 int tri_alloc
; /* allocation size of triangle and tri_bits */
325 int nind
; /* number of indices */
326 int *ind
; /* constraint index for updating atom data */
327 int nind_r
; /* number of indices */
328 int *ind_r
; /* constraint index for updating atom data */
329 int ind_nalloc
; /* allocation size of ind and ind_r */
330 tensor vir_r_m_dr
; /* temporary variable for virial calculation */
331 real dhdlambda
; /* temporary variable for lambda derivative */
332 real
*simd_buf
; /* Aligned work array for SIMD */
335 typedef struct gmx_lincsdata
{
336 int ncg
; /* the global number of constraints */
337 int ncg_flex
; /* the global number of flexible constraints */
338 int ncg_triangle
; /* the global number of constraints in triangles */
339 int nIter
; /* the number of iterations */
340 int nOrder
; /* the order of the matrix expansion */
341 int max_connect
; /* the maximum number of constrains connected to a single atom */
343 int nc_real
; /* the number of real constraints */
344 int nc
; /* the number of constraints including padding for SIMD */
345 int nc_alloc
; /* the number we allocated memory for */
346 int ncc
; /* the number of constraint connections */
347 int ncc_alloc
; /* the number we allocated memory for */
348 real matlam
; /* the FE lambda value used for filling blc and blmf */
349 int *con_index
; /* mapping from topology to LINCS constraints */
350 real
*bllen0
; /* the reference distance in topology A */
351 real
*ddist
; /* the reference distance in top B - the r.d. in top A */
352 int *bla
; /* the atom pairs involved in the constraints */
353 real
*blc
; /* 1/sqrt(invmass1 + invmass2) */
354 real
*blc1
; /* as blc, but with all masses 1 */
355 int *blnr
; /* index into blbnb and blmf */
356 int *blbnb
; /* list of constraint connections */
357 int ntriangle
; /* the local number of constraints in triangles */
358 int ncc_triangle
; /* the number of constraint connections in triangles */
359 gmx_bool bCommIter
; /* communicate before each LINCS interation */
360 real
*blmf
; /* matrix of mass factors for constraint connections */
361 real
*blmf1
; /* as blmf, but with all masses 1 */
362 real
*bllen
; /* the reference bond length */
363 int *nlocat
; /* the local atom count per constraint, can be NULL */
365 int ntask
; /* The number of tasks = #threads for LINCS */
366 lincs_task_t
*task
; /* LINCS thread division */
367 gmx_bitmask_t
*atf
; /* atom flags for thread parallelization */
368 int atf_nalloc
; /* allocation size of atf */
369 gmx_bool bTaskDep
; /* are the LINCS tasks interdependent? */
370 /* arrays for temporary storage in the LINCS algorithm */
377 real
*mlambda
; /* the Lagrange multipliers * -1 */
378 /* storage for the constraint RMS relative deviation output */
382 /* Define simd_width for memory allocation used for SIMD code */
384 static const int simd_width
= GMX_SIMD_REAL_WIDTH
;
386 static const int simd_width
= 1;
388 /* We can't use small memory alignments on many systems, so use min 64 bytes*/
389 static const int align_bytes
= std::max
<int>(64, simd_width
*sizeof(real
));
391 real
*lincs_rmsd_data(struct gmx_lincsdata
*lincsd
)
393 return lincsd
->rmsd_data
;
396 real
lincs_rmsd(struct gmx_lincsdata
*lincsd
, gmx_bool bSD2
)
398 if (lincsd
->rmsd_data
[0] > 0)
400 return sqrt(lincsd
->rmsd_data
[bSD2
? 2 : 1]/lincsd
->rmsd_data
[0]);
408 /* Do a set of nrec LINCS matrix multiplications.
409 * This function will return with up to date thread-local
410 * constraint data, without an OpenMP barrier.
412 static void lincs_matrix_expand(const struct gmx_lincsdata
*lincsd
,
413 const lincs_task_t
*li_task
,
415 real
*rhs1
, real
*rhs2
, real
*sol
)
417 int b0
, b1
, nrec
, rec
;
418 const int *blnr
= lincsd
->blnr
;
419 const int *blbnb
= lincsd
->blbnb
;
423 nrec
= lincsd
->nOrder
;
425 for (rec
= 0; rec
< nrec
; rec
++)
429 if (lincsd
->bTaskDep
)
433 for (b
= b0
; b
< b1
; b
++)
439 for (n
= blnr
[b
]; n
< blnr
[b
+1]; n
++)
441 mvb
= mvb
+ blcc
[n
]*rhs1
[blbnb
[n
]];
444 sol
[b
] = sol
[b
] + mvb
;
452 } /* nrec*(ncons+2*nrtot) flops */
454 if (lincsd
->ntriangle
> 0)
456 /* Perform an extra nrec recursions for only the constraints
457 * involved in rigid triangles.
458 * In this way their accuracy should come close to those of the other
459 * constraints, since traingles of constraints can produce eigenvalues
460 * around 0.7, while the effective eigenvalue for bond constraints
461 * is around 0.4 (and 0.7*0.7=0.5).
464 if (lincsd
->bTaskDep
)
466 /* We need a barrier here, since other threads might still be
467 * reading the contents of rhs1 and/o rhs2.
468 * We could avoid this barrier by introducing two extra rhs
469 * arrays for the triangle constraints only.
474 /* Constraints involved in a triangle are ensured to be in the same
475 * LINCS task. This means no barriers are required during the extra
476 * iterations for the triangle constraints.
478 const int *triangle
= li_task
->triangle
;
479 const int *tri_bits
= li_task
->tri_bits
;
481 for (rec
= 0; rec
< nrec
; rec
++)
485 for (tb
= 0; tb
< li_task
->ntriangle
; tb
++)
487 int b
, bits
, nr0
, nr1
, n
;
495 for (n
= nr0
; n
< nr1
; n
++)
497 if (bits
& (1 << (n
- nr0
)))
499 mvb
= mvb
+ blcc
[n
]*rhs1
[blbnb
[n
]];
503 sol
[b
] = sol
[b
] + mvb
;
511 } /* nrec*(ntriangle + ncc_triangle*2) flops */
515 static void lincs_update_atoms_noind(int ncons
, const int *bla
,
517 const real
*fac
, rvec
*r
,
522 real mvb
, im1
, im2
, tmp0
, tmp1
, tmp2
;
526 for (b
= 0; b
< ncons
; b
++)
542 } /* 16 ncons flops */
546 for (b
= 0; b
< ncons
; b
++)
564 static void lincs_update_atoms_ind(int ncons
, const int *ind
, const int *bla
,
566 const real
*fac
, rvec
*r
,
571 real mvb
, im1
, im2
, tmp0
, tmp1
, tmp2
;
575 for (bi
= 0; bi
< ncons
; bi
++)
592 } /* 16 ncons flops */
596 for (bi
= 0; bi
< ncons
; bi
++)
611 } /* 16 ncons flops */
615 static void lincs_update_atoms(struct gmx_lincsdata
*li
, int th
,
617 const real
*fac
, rvec
*r
,
623 /* Single thread, we simply update for all constraints */
624 lincs_update_atoms_noind(li
->nc_real
,
625 li
->bla
, prefac
, fac
, r
, invmass
, x
);
629 /* Update the atom vector components for our thread local
630 * constraints that only access our local atom range.
631 * This can be done without a barrier.
633 lincs_update_atoms_ind(li
->task
[th
].nind
, li
->task
[th
].ind
,
634 li
->bla
, prefac
, fac
, r
, invmass
, x
);
636 if (li
->task
[li
->ntask
].nind
> 0)
638 /* Update the constraints that operate on atoms
639 * in multiple thread atom blocks on the master thread.
644 lincs_update_atoms_ind(li
->task
[li
->ntask
].nind
,
645 li
->task
[li
->ntask
].ind
,
646 li
->bla
, prefac
, fac
, r
, invmass
, x
);
653 /* Calculate the constraint distance vectors r to project on from x.
654 * Determine the right-hand side of the matrix equation using quantity f.
655 * This function only differs from calc_dr_x_xp_simd below in that
656 * no constraint length is subtracted and no PBC is used for f.
658 static void gmx_simdcall
659 calc_dr_x_f_simd(int b0
,
662 const rvec
* gmx_restrict x
,
663 const rvec
* gmx_restrict f
,
664 const real
* gmx_restrict blc
,
665 const pbc_simd_t
* pbc_simd
,
666 real
* gmx_restrict vbuf1
,
667 real
* gmx_restrict vbuf2
,
668 rvec
* gmx_restrict r
,
669 real
* gmx_restrict rhs
,
670 real
* gmx_restrict sol
)
674 assert(b0
% GMX_SIMD_REAL_WIDTH
== 0);
676 for (bs
= b0
; bs
< b1
; bs
+= GMX_SIMD_REAL_WIDTH
)
678 gmx_simd_real_t rx_S
, ry_S
, rz_S
, n2_S
, il_S
;
679 gmx_simd_real_t fx_S
, fy_S
, fz_S
, ip_S
, rhs_S
;
681 gmx_hack_simd_gather_rvec_dist_pair_index(x
, bla
+ bs
*2, vbuf1
,
682 &rx_S
, &ry_S
, &rz_S
);
684 pbc_correct_dx_simd(&rx_S
, &ry_S
, &rz_S
, pbc_simd
);
686 n2_S
= gmx_simd_norm2_r(rx_S
, ry_S
, rz_S
);
687 il_S
= gmx_simd_invsqrt_r(n2_S
);
689 rx_S
= gmx_simd_mul_r(rx_S
, il_S
);
690 ry_S
= gmx_simd_mul_r(ry_S
, il_S
);
691 rz_S
= gmx_simd_mul_r(rz_S
, il_S
);
693 gmx_simd_store_vec_to_rvec(rx_S
, ry_S
, rz_S
, vbuf1
, r
+ bs
);
695 gmx_hack_simd_gather_rvec_dist_pair_index(f
, bla
+ bs
*2, vbuf2
,
696 &fx_S
, &fy_S
, &fz_S
);
698 ip_S
= gmx_simd_iprod_r(rx_S
, ry_S
, rz_S
,
701 rhs_S
= gmx_simd_mul_r(gmx_simd_load_r(blc
+ bs
), ip_S
);
703 gmx_simd_store_r(rhs
+ bs
, rhs_S
);
704 gmx_simd_store_r(sol
+ bs
, rhs_S
);
707 #endif /* LINCS_SIMD */
709 /* LINCS projection, works on derivatives of the coordinates */
710 static void do_lincsp(rvec
*x
, rvec
*f
, rvec
*fp
, t_pbc
*pbc
,
711 struct gmx_lincsdata
*lincsd
, int th
,
713 int econq
, gmx_bool bCalcDHDL
,
714 gmx_bool bCalcVir
, tensor rmdf
)
717 int *bla
, *blnr
, *blbnb
;
719 real
*blc
, *blmf
, *blcc
, *rhs1
, *rhs2
, *sol
;
721 b0
= lincsd
->task
[th
].b0
;
722 b1
= lincsd
->task
[th
].b1
;
727 blbnb
= lincsd
->blbnb
;
728 if (econq
!= econqForce
)
730 /* Use mass-weighted parameters */
736 /* Use non mass-weighted parameters */
738 blmf
= lincsd
->blmf1
;
740 blcc
= lincsd
->tmpncc
;
747 /* This SIMD code does the same as the plain-C code after the #else.
748 * The only difference is that we always call pbc code, as with SIMD
749 * the overhead of pbc computation (when not needed) is small.
753 /* Convert the pbc struct for SIMD */
754 set_pbc_simd(pbc
, &pbc_simd
);
756 /* Compute normalized x i-j vectors, store in r.
757 * Compute the inner product of r and xp i-j and store in rhs1.
759 calc_dr_x_f_simd(b0
, b1
, bla
, x
, f
, blc
,
761 lincsd
->task
[th
].simd_buf
,
762 lincsd
->task
[th
].simd_buf
+ GMX_SIMD_REAL_WIDTH
*DIM
,
765 #else /* LINCS_SIMD */
767 /* Compute normalized i-j vectors */
770 for (b
= b0
; b
< b1
; b
++)
774 pbc_dx_aiuc(pbc
, x
[bla
[2*b
]], x
[bla
[2*b
+1]], dx
);
780 for (b
= b0
; b
< b1
; b
++)
784 rvec_sub(x
[bla
[2*b
]], x
[bla
[2*b
+1]], dx
);
786 } /* 16 ncons flops */
789 for (b
= b0
; b
< b1
; b
++)
796 mvb
= blc
[b
]*(r
[b
][0]*(f
[i
][0] - f
[j
][0]) +
797 r
[b
][1]*(f
[i
][1] - f
[j
][1]) +
798 r
[b
][2]*(f
[i
][2] - f
[j
][2]));
804 #endif /* LINCS_SIMD */
806 if (lincsd
->bTaskDep
)
808 /* We need a barrier, since the matrix construction below
809 * can access entries in r of other threads.
814 /* Construct the (sparse) LINCS matrix */
815 for (b
= b0
; b
< b1
; b
++)
819 for (n
= blnr
[b
]; n
< blnr
[b
+1]; n
++)
821 blcc
[n
] = blmf
[n
]*iprod(r
[b
], r
[blbnb
[n
]]);
824 /* Together: 23*ncons + 6*nrtot flops */
826 lincs_matrix_expand(lincsd
, &lincsd
->task
[th
], blcc
, rhs1
, rhs2
, sol
);
827 /* nrec*(ncons+2*nrtot) flops */
829 if (econq
== econqDeriv_FlexCon
)
831 /* We only want to constraint the flexible constraints,
832 * so we mask out the normal ones by setting sol to 0.
834 for (b
= b0
; b
< b1
; b
++)
836 if (!(lincsd
->bllen0
[b
] == 0 && lincsd
->ddist
[b
] == 0))
843 /* We multiply sol by blc, so we can use lincs_update_atoms for OpenMP */
844 for (b
= b0
; b
< b1
; b
++)
849 /* When constraining forces, we should not use mass weighting,
850 * so we pass invmass=NULL, which results in the use of 1 for all atoms.
852 lincs_update_atoms(lincsd
, th
, 1.0, sol
, r
,
853 (econq
!= econqForce
) ? invmass
: NULL
, fp
);
860 for (b
= b0
; b
< b1
; b
++)
862 dhdlambda
-= sol
[b
]*lincsd
->ddist
[b
];
865 lincsd
->task
[th
].dhdlambda
= dhdlambda
;
870 /* Constraint virial,
871 * determines sum r_bond x delta f,
872 * where delta f is the constraint correction
873 * of the quantity that is being constrained.
875 for (b
= b0
; b
< b1
; b
++)
880 mvb
= lincsd
->bllen
[b
]*sol
[b
];
881 for (i
= 0; i
< DIM
; i
++)
884 for (j
= 0; j
< DIM
; j
++)
886 rmdf
[i
][j
] += tmp1
*r
[b
][j
];
889 } /* 23 ncons flops */
894 /* Calculate the constraint distance vectors r to project on from x.
895 * Determine the right-hand side of the matrix equation using coordinates xp.
897 static void gmx_simdcall
898 calc_dr_x_xp_simd(int b0
,
901 const rvec
* gmx_restrict x
,
902 const rvec
* gmx_restrict xp
,
903 const real
* gmx_restrict bllen
,
904 const real
* gmx_restrict blc
,
905 const pbc_simd_t
* pbc_simd
,
906 real
* gmx_restrict vbuf1
,
907 real
* gmx_restrict vbuf2
,
908 rvec
* gmx_restrict r
,
909 real
* gmx_restrict rhs
,
910 real
* gmx_restrict sol
)
914 assert(b0
% GMX_SIMD_REAL_WIDTH
== 0);
916 for (bs
= b0
; bs
< b1
; bs
+= GMX_SIMD_REAL_WIDTH
)
918 gmx_simd_real_t rx_S
, ry_S
, rz_S
, n2_S
, il_S
;
919 gmx_simd_real_t rxp_S
, ryp_S
, rzp_S
, ip_S
, rhs_S
;
921 gmx_hack_simd_gather_rvec_dist_pair_index(x
, bla
+ bs
*2, vbuf1
,
922 &rx_S
, &ry_S
, &rz_S
);
924 pbc_correct_dx_simd(&rx_S
, &ry_S
, &rz_S
, pbc_simd
);
926 n2_S
= gmx_simd_norm2_r(rx_S
, ry_S
, rz_S
);
927 il_S
= gmx_simd_invsqrt_r(n2_S
);
929 rx_S
= gmx_simd_mul_r(rx_S
, il_S
);
930 ry_S
= gmx_simd_mul_r(ry_S
, il_S
);
931 rz_S
= gmx_simd_mul_r(rz_S
, il_S
);
933 gmx_simd_store_vec_to_rvec(rx_S
, ry_S
, rz_S
, vbuf1
, r
+ bs
);
935 gmx_hack_simd_gather_rvec_dist_pair_index(xp
, bla
+ bs
*2, vbuf2
,
936 &rxp_S
, &ryp_S
, &rzp_S
);
938 pbc_correct_dx_simd(&rxp_S
, &ryp_S
, &rzp_S
, pbc_simd
);
940 ip_S
= gmx_simd_iprod_r(rx_S
, ry_S
, rz_S
,
941 rxp_S
, ryp_S
, rzp_S
);
943 rhs_S
= gmx_simd_mul_r(gmx_simd_load_r(blc
+ bs
),
944 gmx_simd_sub_r(ip_S
, gmx_simd_load_r(bllen
+ bs
)));
946 gmx_simd_store_r(rhs
+ bs
, rhs_S
);
947 gmx_simd_store_r(sol
+ bs
, rhs_S
);
950 #endif /* LINCS_SIMD */
952 /* Determine the distances and right-hand side for the next iteration */
953 static void calc_dist_iter(int b0
,
956 const rvec
* gmx_restrict xp
,
957 const real
* gmx_restrict bllen
,
958 const real
* gmx_restrict blc
,
961 real
* gmx_restrict rhs
,
962 real
* gmx_restrict sol
,
967 for (b
= b0
; b
< b1
; b
++)
969 real len
, len2
, dlen2
, mvb
;
975 pbc_dx_aiuc(pbc
, xp
[bla
[2*b
]], xp
[bla
[2*b
+1]], dx
);
979 rvec_sub(xp
[bla
[2*b
]], xp
[bla
[2*b
+1]], dx
);
982 dlen2
= 2*len2
- norm2(dx
);
983 if (dlen2
< wfac
*len2
)
985 /* not race free - see detailed comment in caller */
990 mvb
= blc
[b
]*(len
- dlen2
*gmx_invsqrt(dlen2
));
998 } /* 20*ncons flops */
1002 /* As the function above, but using SIMD intrinsics */
1003 static void gmx_simdcall
1004 calc_dist_iter_simd(int b0
,
1007 const rvec
* gmx_restrict x
,
1008 const real
* gmx_restrict bllen
,
1009 const real
* gmx_restrict blc
,
1010 const pbc_simd_t
* pbc_simd
,
1012 real
* gmx_restrict vbuf
,
1013 real
* gmx_restrict rhs
,
1014 real
* gmx_restrict sol
,
1017 gmx_simd_real_t min_S
= gmx_simd_set1_r(GMX_REAL_MIN
);
1018 gmx_simd_real_t two_S
= gmx_simd_set1_r(2.0);
1019 gmx_simd_real_t wfac_S
= gmx_simd_set1_r(wfac
);
1020 gmx_simd_bool_t warn_S
;
1024 assert(b0
% GMX_SIMD_REAL_WIDTH
== 0);
1026 /* Initialize all to FALSE */
1027 warn_S
= gmx_simd_cmplt_r(two_S
, gmx_simd_setzero_r());
1029 for (bs
= b0
; bs
< b1
; bs
+= GMX_SIMD_REAL_WIDTH
)
1031 gmx_simd_real_t rx_S
, ry_S
, rz_S
, n2_S
;
1032 gmx_simd_real_t len_S
, len2_S
, dlen2_S
, lc_S
, blc_S
;
1034 gmx_hack_simd_gather_rvec_dist_pair_index(x
, bla
+ bs
*2, vbuf
,
1035 &rx_S
, &ry_S
, &rz_S
);
1037 pbc_correct_dx_simd(&rx_S
, &ry_S
, &rz_S
, pbc_simd
);
1039 n2_S
= gmx_simd_norm2_r(rx_S
, ry_S
, rz_S
);
1041 len_S
= gmx_simd_load_r(bllen
+ bs
);
1042 len2_S
= gmx_simd_mul_r(len_S
, len_S
);
1044 dlen2_S
= gmx_simd_fmsub_r(two_S
, len2_S
, n2_S
);
1046 warn_S
= gmx_simd_or_b(warn_S
,
1047 gmx_simd_cmplt_r(dlen2_S
,
1048 gmx_simd_mul_r(wfac_S
, len2_S
)));
1050 /* Avoid 1/0 by taking the max with REAL_MIN.
1051 * Note: when dlen2 is close to zero (90 degree constraint rotation),
1052 * the accuracy of the algorithm is no longer relevant.
1054 dlen2_S
= gmx_simd_max_r(dlen2_S
, min_S
);
1056 lc_S
= gmx_simd_fnmadd_r(dlen2_S
, gmx_simd_invsqrt_r(dlen2_S
), len_S
);
1058 blc_S
= gmx_simd_load_r(blc
+ bs
);
1060 lc_S
= gmx_simd_mul_r(blc_S
, lc_S
);
1062 gmx_simd_store_r(rhs
+ bs
, lc_S
);
1063 gmx_simd_store_r(sol
+ bs
, lc_S
);
1066 if (gmx_simd_anytrue_b(warn_S
))
1071 #endif /* LINCS_SIMD */
1073 static void do_lincs(rvec
*x
, rvec
*xp
, matrix box
, t_pbc
*pbc
,
1074 struct gmx_lincsdata
*lincsd
, int th
,
1075 const real
*invmass
,
1078 real wangle
, gmx_bool
*bWarn
,
1079 real invdt
, rvec
* gmx_restrict v
,
1080 gmx_bool bCalcVir
, tensor vir_r_m_dr
)
1082 int b0
, b1
, b
, i
, j
, n
, iter
;
1083 int *bla
, *blnr
, *blbnb
;
1085 real
*blc
, *blmf
, *bllen
, *blcc
, *rhs1
, *rhs2
, *sol
, *blc_sol
, *mlambda
;
1088 b0
= lincsd
->task
[th
].b0
;
1089 b1
= lincsd
->task
[th
].b1
;
1093 blnr
= lincsd
->blnr
;
1094 blbnb
= lincsd
->blbnb
;
1096 blmf
= lincsd
->blmf
;
1097 bllen
= lincsd
->bllen
;
1098 blcc
= lincsd
->tmpncc
;
1099 rhs1
= lincsd
->tmp1
;
1100 rhs2
= lincsd
->tmp2
;
1102 blc_sol
= lincsd
->tmp4
;
1103 mlambda
= lincsd
->mlambda
;
1104 nlocat
= lincsd
->nlocat
;
1108 /* This SIMD code does the same as the plain-C code after the #else.
1109 * The only difference is that we always call pbc code, as with SIMD
1110 * the overhead of pbc computation (when not needed) is small.
1112 pbc_simd_t pbc_simd
;
1114 /* Convert the pbc struct for SIMD */
1115 set_pbc_simd(pbc
, &pbc_simd
);
1117 /* Compute normalized x i-j vectors, store in r.
1118 * Compute the inner product of r and xp i-j and store in rhs1.
1120 calc_dr_x_xp_simd(b0
, b1
, bla
, x
, xp
, bllen
, blc
,
1122 lincsd
->task
[th
].simd_buf
,
1123 lincsd
->task
[th
].simd_buf
+ GMX_SIMD_REAL_WIDTH
*DIM
,
1126 #else /* LINCS_SIMD */
1130 /* Compute normalized i-j vectors */
1131 for (b
= b0
; b
< b1
; b
++)
1136 pbc_dx_aiuc(pbc
, x
[bla
[2*b
]], x
[bla
[2*b
+1]], dx
);
1139 pbc_dx_aiuc(pbc
, xp
[bla
[2*b
]], xp
[bla
[2*b
+1]], dx
);
1140 mvb
= blc
[b
]*(iprod(r
[b
], dx
) - bllen
[b
]);
1147 /* Compute normalized i-j vectors */
1148 for (b
= b0
; b
< b1
; b
++)
1150 real tmp0
, tmp1
, tmp2
, rlen
, mvb
;
1154 tmp0
= x
[i
][0] - x
[j
][0];
1155 tmp1
= x
[i
][1] - x
[j
][1];
1156 tmp2
= x
[i
][2] - x
[j
][2];
1157 rlen
= gmx_invsqrt(tmp0
*tmp0
+ tmp1
*tmp1
+ tmp2
*tmp2
);
1158 r
[b
][0] = rlen
*tmp0
;
1159 r
[b
][1] = rlen
*tmp1
;
1160 r
[b
][2] = rlen
*tmp2
;
1161 /* 16 ncons flops */
1165 mvb
= blc
[b
]*(r
[b
][0]*(xp
[i
][0] - xp
[j
][0]) +
1166 r
[b
][1]*(xp
[i
][1] - xp
[j
][1]) +
1167 r
[b
][2]*(xp
[i
][2] - xp
[j
][2]) - bllen
[b
]);
1172 /* Together: 26*ncons + 6*nrtot flops */
1175 #endif /* LINCS_SIMD */
1177 if (lincsd
->bTaskDep
)
1179 /* We need a barrier, since the matrix construction below
1180 * can access entries in r of other threads.
1185 /* Construct the (sparse) LINCS matrix */
1186 for (b
= b0
; b
< b1
; b
++)
1188 for (n
= blnr
[b
]; n
< blnr
[b
+1]; n
++)
1190 blcc
[n
] = blmf
[n
]*iprod(r
[b
], r
[blbnb
[n
]]);
1193 /* Together: 26*ncons + 6*nrtot flops */
1195 lincs_matrix_expand(lincsd
, &lincsd
->task
[th
], blcc
, rhs1
, rhs2
, sol
);
1196 /* nrec*(ncons+2*nrtot) flops */
1199 for (b
= b0
; b
< b1
; b
++)
1201 mlambda
[b
] = blc
[b
]*sol
[b
];
1204 for (b
= b0
; b
< b1
; b
+= GMX_SIMD_REAL_WIDTH
)
1206 gmx_simd_store_r(mlambda
+ b
,
1207 gmx_simd_mul_r(gmx_simd_load_r(blc
+ b
),
1208 gmx_simd_load_r(sol
+ b
)));
1212 /* Update the coordinates */
1213 lincs_update_atoms(lincsd
, th
, 1.0, mlambda
, r
, invmass
, xp
);
1216 ******** Correction for centripetal effects ********
1221 wfac
= cos(DEG2RAD
*wangle
);
1224 for (iter
= 0; iter
< lincsd
->nIter
; iter
++)
1226 if ((lincsd
->bCommIter
&& DOMAINDECOMP(cr
) && cr
->dd
->constraints
))
1231 /* Communicate the corrected non-local coordinates */
1232 if (DOMAINDECOMP(cr
))
1234 dd_move_x_constraints(cr
->dd
, box
, xp
, NULL
, FALSE
);
1239 else if (lincsd
->bTaskDep
)
1245 calc_dist_iter_simd(b0
, b1
, bla
, xp
, bllen
, blc
, &pbc_simd
, wfac
,
1246 lincsd
->task
[th
].simd_buf
, rhs1
, sol
, bWarn
);
1248 calc_dist_iter(b0
, b1
, bla
, xp
, bllen
, blc
, pbc
, wfac
,
1250 /* 20*ncons flops */
1253 lincs_matrix_expand(lincsd
, &lincsd
->task
[th
], blcc
, rhs1
, rhs2
, sol
);
1254 /* nrec*(ncons+2*nrtot) flops */
1257 for (b
= b0
; b
< b1
; b
++)
1261 mvb
= blc
[b
]*sol
[b
];
1266 for (b
= b0
; b
< b1
; b
+= GMX_SIMD_REAL_WIDTH
)
1268 gmx_simd_real_t mvb
;
1270 mvb
= gmx_simd_mul_r(gmx_simd_load_r(blc
+ b
),
1271 gmx_simd_load_r(sol
+ b
));
1272 gmx_simd_store_r(blc_sol
+ b
, mvb
);
1273 gmx_simd_store_r(mlambda
+ b
,
1274 gmx_simd_add_r(gmx_simd_load_r(mlambda
+ b
), mvb
));
1278 /* Update the coordinates */
1279 lincs_update_atoms(lincsd
, th
, 1.0, blc_sol
, r
, invmass
, xp
);
1281 /* nit*ncons*(37+9*nrec) flops */
1285 /* Update the velocities */
1286 lincs_update_atoms(lincsd
, th
, invdt
, mlambda
, r
, invmass
, v
);
1287 /* 16 ncons flops */
1290 if (nlocat
!= NULL
&& (bCalcDHDL
|| bCalcVir
))
1292 if (lincsd
->bTaskDep
)
1294 /* In lincs_update_atoms threads might cross-read mlambda */
1298 /* Only account for local atoms */
1299 for (b
= b0
; b
< b1
; b
++)
1301 mlambda
[b
] *= 0.5*nlocat
[b
];
1310 for (b
= b0
; b
< b1
; b
++)
1312 /* Note that this this is dhdl*dt^2, the dt^2 factor is corrected
1313 * later after the contributions are reduced over the threads.
1315 dhdl
-= lincsd
->mlambda
[b
]*lincsd
->ddist
[b
];
1317 lincsd
->task
[th
].dhdlambda
= dhdl
;
1322 /* Constraint virial */
1323 for (b
= b0
; b
< b1
; b
++)
1327 tmp0
= -bllen
[b
]*mlambda
[b
];
1328 for (i
= 0; i
< DIM
; i
++)
1330 tmp1
= tmp0
*r
[b
][i
];
1331 for (j
= 0; j
< DIM
; j
++)
1333 vir_r_m_dr
[i
][j
] -= tmp1
*r
[b
][j
];
1336 } /* 22 ncons flops */
1340 * 26*ncons + 6*nrtot + nrec*(ncons+2*nrtot)
1341 * + nit * (20*ncons + nrec*(ncons+2*nrtot) + 17 ncons)
1343 * (26+nrec)*ncons + (6+2*nrec)*nrtot
1344 * + nit * ((37+nrec)*ncons + 2*nrec*nrtot)
1346 * (63+nrec)*ncons + (6+4*nrec)*nrtot
1350 /* Sets the elements in the LINCS matrix for task li_task */
1351 static void set_lincs_matrix_task(struct gmx_lincsdata
*li
,
1352 lincs_task_t
*li_task
,
1353 const real
*invmass
,
1358 /* Construct the coupling coefficient matrix blmf */
1359 li_task
->ntriangle
= 0;
1361 for (i
= li_task
->b0
; i
< li_task
->b1
; i
++)
1366 a2
= li
->bla
[2*i
+1];
1367 for (n
= li
->blnr
[i
]; (n
< li
->blnr
[i
+1]); n
++)
1369 int k
, sign
, center
, end
;
1373 /* If we are using multiple, independent tasks for LINCS,
1374 * the calls to check_assign_connected should have
1375 * put all connected constraints in our task.
1377 assert(li
->bTaskDep
|| (k
>= li_task
->b0
&& k
< li_task
->b1
));
1379 if (a1
== li
->bla
[2*k
] || a2
== li
->bla
[2*k
+1])
1387 if (a1
== li
->bla
[2*k
] || a1
== li
->bla
[2*k
+1])
1397 li
->blmf
[n
] = sign
*invmass
[center
]*li
->blc
[i
]*li
->blc
[k
];
1398 li
->blmf1
[n
] = sign
*0.5;
1399 if (li
->ncg_triangle
> 0)
1403 /* Look for constraint triangles */
1404 for (nk
= li
->blnr
[k
]; (nk
< li
->blnr
[k
+1]); nk
++)
1407 if (kk
!= i
&& kk
!= k
&&
1408 (li
->bla
[2*kk
] == end
|| li
->bla
[2*kk
+1] == end
))
1410 /* If we are using multiple tasks for LINCS,
1411 * the calls to check_assign_triangle should have
1412 * put all constraints in the triangle in our task.
1414 assert(k
>= li_task
->b0
&& k
< li_task
->b1
);
1415 assert(kk
>= li_task
->b0
&& kk
< li_task
->b1
);
1417 if (li_task
->ntriangle
== 0 ||
1418 li_task
->triangle
[li_task
->ntriangle
- 1] < i
)
1420 /* Add this constraint to the triangle list */
1421 li_task
->triangle
[li_task
->ntriangle
] = i
;
1422 li_task
->tri_bits
[li_task
->ntriangle
] = 0;
1423 li_task
->ntriangle
++;
1424 if (li
->blnr
[i
+1] - li
->blnr
[i
] > static_cast<int>(sizeof(li_task
->tri_bits
[0])*8 - 1))
1426 gmx_fatal(FARGS
, "A constraint is connected to %d constraints, this is more than the %d allowed for constraints participating in triangles",
1427 li
->blnr
[i
+1] - li
->blnr
[i
],
1428 sizeof(li_task
->tri_bits
[0])*8-1);
1431 li_task
->tri_bits
[li_task
->ntriangle
-1] |= (1 << (n
- li
->blnr
[i
]));
1440 /* Sets the elements in the LINCS matrix */
1441 void set_lincs_matrix(struct gmx_lincsdata
*li
, real
*invmass
, real lambda
)
1444 const real invsqrt2
= 0.7071067811865475244;
1446 for (i
= 0; (i
< li
->nc
); i
++)
1451 a2
= li
->bla
[2*i
+1];
1452 li
->blc
[i
] = gmx_invsqrt(invmass
[a1
] + invmass
[a2
]);
1453 li
->blc1
[i
] = invsqrt2
;
1456 /* Construct the coupling coefficient matrix blmf */
1457 int th
, ntriangle
= 0, ncc_triangle
= 0;
1458 #pragma omp parallel for reduction(+: ntriangle, ncc_triangle) num_threads(li->ntask) schedule(static)
1459 for (th
= 0; th
< li
->ntask
; th
++)
1461 set_lincs_matrix_task(li
, &li
->task
[th
], invmass
, &ncc_triangle
);
1462 ntriangle
= li
->task
[th
].ntriangle
;
1464 li
->ntriangle
= ntriangle
;
1465 li
->ncc_triangle
= ncc_triangle
;
1469 fprintf(debug
, "Of the %d constraints %d participate in triangles\n",
1470 li
->nc
, li
->ntriangle
);
1471 fprintf(debug
, "There are %d couplings of which %d in triangles\n",
1472 li
->ncc
, li
->ncc_triangle
);
1476 * so we know with which lambda value the masses have been set.
1478 li
->matlam
= lambda
;
1481 static int count_triangle_constraints(t_ilist
*ilist
, t_blocka
*at2con
)
1483 int ncon1
, ncon_tot
;
1484 int c0
, a00
, a01
, n1
, c1
, a10
, a11
, ac1
, n2
, c2
, a20
, a21
;
1487 t_iatom
*ia1
, *ia2
, *iap
;
1489 ncon1
= ilist
[F_CONSTR
].nr
/3;
1490 ncon_tot
= ncon1
+ ilist
[F_CONSTRNC
].nr
/3;
1492 ia1
= ilist
[F_CONSTR
].iatoms
;
1493 ia2
= ilist
[F_CONSTRNC
].iatoms
;
1496 for (c0
= 0; c0
< ncon_tot
; c0
++)
1499 iap
= constr_iatomptr(ncon1
, ia1
, ia2
, c0
);
1502 for (n1
= at2con
->index
[a01
]; n1
< at2con
->index
[a01
+1]; n1
++)
1507 iap
= constr_iatomptr(ncon1
, ia1
, ia2
, c1
);
1518 for (n2
= at2con
->index
[ac1
]; n2
< at2con
->index
[ac1
+1]; n2
++)
1521 if (c2
!= c0
&& c2
!= c1
)
1523 iap
= constr_iatomptr(ncon1
, ia1
, ia2
, c2
);
1526 if (a20
== a00
|| a21
== a00
)
1540 return ncon_triangle
;
1543 static gmx_bool
more_than_two_sequential_constraints(const t_ilist
*ilist
,
1544 const t_blocka
*at2con
)
1546 t_iatom
*ia1
, *ia2
, *iap
;
1547 int ncon1
, ncon_tot
, c
;
1549 gmx_bool bMoreThanTwoSequentialConstraints
;
1551 ncon1
= ilist
[F_CONSTR
].nr
/3;
1552 ncon_tot
= ncon1
+ ilist
[F_CONSTRNC
].nr
/3;
1554 ia1
= ilist
[F_CONSTR
].iatoms
;
1555 ia2
= ilist
[F_CONSTRNC
].iatoms
;
1557 bMoreThanTwoSequentialConstraints
= FALSE
;
1558 for (c
= 0; c
< ncon_tot
&& !bMoreThanTwoSequentialConstraints
; c
++)
1560 iap
= constr_iatomptr(ncon1
, ia1
, ia2
, c
);
1563 /* Check if this constraint has constraints connected at both atoms */
1564 if (at2con
->index
[a1
+1] - at2con
->index
[a1
] > 1 &&
1565 at2con
->index
[a2
+1] - at2con
->index
[a2
] > 1)
1567 bMoreThanTwoSequentialConstraints
= TRUE
;
1571 return bMoreThanTwoSequentialConstraints
;
1574 static int int_comp(const void *a
, const void *b
)
1576 return (*(int *)a
) - (*(int *)b
);
1579 gmx_lincsdata_t
init_lincs(FILE *fplog
, gmx_mtop_t
*mtop
,
1580 int nflexcon_global
, t_blocka
*at2con
,
1581 gmx_bool bPLINCS
, int nIter
, int nProjOrder
)
1583 struct gmx_lincsdata
*li
;
1585 gmx_moltype_t
*molt
;
1586 gmx_bool bMoreThanTwoSeq
;
1590 fprintf(fplog
, "\nInitializing%s LINear Constraint Solver\n",
1591 bPLINCS
? " Parallel" : "");
1597 gmx_mtop_ftype_count(mtop
, F_CONSTR
) +
1598 gmx_mtop_ftype_count(mtop
, F_CONSTRNC
);
1599 li
->ncg_flex
= nflexcon_global
;
1602 li
->nOrder
= nProjOrder
;
1604 li
->max_connect
= 0;
1605 for (mt
= 0; mt
< mtop
->nmoltype
; mt
++)
1609 molt
= &mtop
->moltype
[mt
];
1610 for (a
= 0; a
< molt
->atoms
.nr
; a
++)
1612 li
->max_connect
= std::max(li
->max_connect
,
1613 at2con
[mt
].index
[a
+ 1] - at2con
[mt
].index
[a
]);
1617 li
->ncg_triangle
= 0;
1618 bMoreThanTwoSeq
= FALSE
;
1619 for (mb
= 0; mb
< mtop
->nmolblock
; mb
++)
1621 molt
= &mtop
->moltype
[mtop
->molblock
[mb
].type
];
1624 mtop
->molblock
[mb
].nmol
*
1625 count_triangle_constraints(molt
->ilist
,
1626 &at2con
[mtop
->molblock
[mb
].type
]);
1628 if (!bMoreThanTwoSeq
&&
1629 more_than_two_sequential_constraints(molt
->ilist
, &at2con
[mtop
->molblock
[mb
].type
]))
1631 bMoreThanTwoSeq
= TRUE
;
1635 /* Check if we need to communicate not only before LINCS,
1636 * but also before each iteration.
1637 * The check for only two sequential constraints is only
1638 * useful for the common case of H-bond only constraints.
1639 * With more effort we could also make it useful for small
1640 * molecules with nr. sequential constraints <= nOrder-1.
1642 li
->bCommIter
= (bPLINCS
&& (li
->nOrder
< 1 || bMoreThanTwoSeq
));
1644 if (debug
&& bPLINCS
)
1646 fprintf(debug
, "PLINCS communication before each iteration: %d\n",
1650 /* LINCS can run on any number of threads.
1651 * Currently the number is fixed for the whole simulation,
1652 * but it could be set in set_lincs().
1653 * The current constraint to task assignment code can create independent
1654 * tasks only when not more than two constraints are connected sequentially.
1656 li
->ntask
= gmx_omp_nthreads_get(emntLINCS
);
1657 li
->bTaskDep
= (li
->ntask
> 1 && bMoreThanTwoSeq
);
1660 fprintf(debug
, "LINCS: using %d threads, tasks are %sdependent\n",
1661 li
->ntask
, li
->bTaskDep
? "" : "in");
1669 /* Allocate an extra elements for "task-overlap" constraints */
1670 snew(li
->task
, li
->ntask
+ 1);
1673 #pragma omp parallel for num_threads(li->ntask)
1674 for (th
= 0; th
< li
->ntask
; th
++)
1676 /* Per thread SIMD load buffer for loading 2 simd_width rvecs */
1677 snew_aligned(li
->task
[th
].simd_buf
, 2*simd_width
*DIM
,
1681 if (bPLINCS
|| li
->ncg_triangle
> 0)
1683 please_cite(fplog
, "Hess2008a");
1687 please_cite(fplog
, "Hess97a");
1692 fprintf(fplog
, "The number of constraints is %d\n", li
->ncg
);
1695 fprintf(fplog
, "There are inter charge-group constraints,\n"
1696 "will communicate selected coordinates each lincs iteration\n");
1698 if (li
->ncg_triangle
> 0)
1701 "%d constraints are involved in constraint triangles,\n"
1702 "will apply an additional matrix expansion of order %d for couplings\n"
1703 "between constraints inside triangles\n",
1704 li
->ncg_triangle
, li
->nOrder
);
1711 /* Sets up the work division over the threads */
1712 static void lincs_thread_setup(struct gmx_lincsdata
*li
, int natoms
)
1719 if (natoms
> li
->atf_nalloc
)
1721 li
->atf_nalloc
= over_alloc_large(natoms
);
1722 srenew(li
->atf
, li
->atf_nalloc
);
1726 /* Clear the atom flags */
1727 for (a
= 0; a
< natoms
; a
++)
1729 bitmask_clear(&atf
[a
]);
1732 if (li
->ntask
> BITMASK_SIZE
)
1734 gmx_fatal(FARGS
, "More than %d threads is not supported for LINCS.", BITMASK_SIZE
);
1737 for (th
= 0; th
< li
->ntask
; th
++)
1739 lincs_task_t
*li_task
;
1742 li_task
= &li
->task
[th
];
1744 /* For each atom set a flag for constraints from each */
1745 for (b
= li_task
->b0
; b
< li_task
->b1
; b
++)
1747 bitmask_set_bit(&atf
[li
->bla
[b
*2 ]], th
);
1748 bitmask_set_bit(&atf
[li
->bla
[b
*2 + 1]], th
);
1752 #pragma omp parallel for num_threads(li->ntask) schedule(static)
1753 for (th
= 0; th
< li
->ntask
; th
++)
1755 lincs_task_t
*li_task
;
1759 li_task
= &li
->task
[th
];
1761 if (li_task
->b1
- li_task
->b0
> li_task
->ind_nalloc
)
1763 li_task
->ind_nalloc
= over_alloc_large(li_task
->b1
-li_task
->b0
);
1764 srenew(li_task
->ind
, li_task
->ind_nalloc
);
1765 srenew(li_task
->ind_r
, li_task
->ind_nalloc
);
1768 bitmask_init_low_bits(&mask
, th
);
1771 li_task
->nind_r
= 0;
1772 for (b
= li_task
->b0
; b
< li_task
->b1
; b
++)
1774 /* We let the constraint with the lowest thread index
1775 * operate on atoms with constraints from multiple threads.
1777 if (bitmask_is_disjoint(atf
[li
->bla
[b
*2]], mask
) &&
1778 bitmask_is_disjoint(atf
[li
->bla
[b
*2+1]], mask
))
1780 /* Add the constraint to the local atom update index */
1781 li_task
->ind
[li_task
->nind
++] = b
;
1785 /* Add the constraint to the rest block */
1786 li_task
->ind_r
[li_task
->nind_r
++] = b
;
1791 /* We need to copy all constraints which have not be assigned
1792 * to a thread to a separate list which will be handled by one thread.
1794 li_m
= &li
->task
[li
->ntask
];
1797 for (th
= 0; th
< li
->ntask
; th
++)
1799 lincs_task_t
*li_task
;
1802 li_task
= &li
->task
[th
];
1804 if (li_m
->nind
+ li_task
->nind_r
> li_m
->ind_nalloc
)
1806 li_m
->ind_nalloc
= over_alloc_large(li_m
->nind
+li_task
->nind_r
);
1807 srenew(li_m
->ind
, li_m
->ind_nalloc
);
1810 for (b
= 0; b
< li_task
->nind_r
; b
++)
1812 li_m
->ind
[li_m
->nind
++] = li_task
->ind_r
[b
];
1817 fprintf(debug
, "LINCS thread %d: %d constraints\n",
1824 fprintf(debug
, "LINCS thread r: %d constraints\n",
1829 /* There is no realloc with alignment, so here we make one for reals.
1830 * Note that this function does not preserve the contents of the memory.
1832 static void resize_real_aligned(real
**ptr
, int nelem
)
1834 sfree_aligned(*ptr
);
1835 snew_aligned(*ptr
, nelem
, align_bytes
);
1838 static void assign_constraint(struct gmx_lincsdata
*li
,
1839 int constraint_index
,
1841 real lenA
, real lenB
,
1842 const t_blocka
*at2con
)
1848 /* Make an mapping of local topology constraint index to LINCS index */
1849 li
->con_index
[constraint_index
] = con
;
1851 li
->bllen0
[con
] = lenA
;
1852 li
->ddist
[con
] = lenB
- lenA
;
1853 /* Set the length to the topology A length */
1854 li
->bllen
[con
] = lenA
;
1855 li
->bla
[2*con
] = a1
;
1856 li
->bla
[2*con
+1] = a2
;
1858 /* Make space in the constraint connection matrix for constraints
1859 * connected to both end of the current constraint.
1862 at2con
->index
[a1
+ 1] - at2con
->index
[a1
] - 1 +
1863 at2con
->index
[a2
+ 1] - at2con
->index
[a2
] - 1;
1865 li
->blnr
[con
+ 1] = li
->ncc
;
1867 /* Increase the constraint count */
1871 /* Check if constraint with topology index constraint_index is connected
1872 * to other constraints, and if so add those connected constraints to our task.
1874 static void check_assign_connected(struct gmx_lincsdata
*li
,
1875 const t_iatom
*iatom
,
1879 const t_blocka
*at2con
)
1881 /* Currently this function only supports constraint groups
1882 * in which all constraints share at least one atom
1883 * (e.g. H-bond constraints).
1884 * Check both ends of the current constraint for
1885 * connected constraints. We need to assign those
1890 for (end
= 0; end
< 2; end
++)
1894 a
= (end
== 0 ? a1
: a2
);
1896 for (k
= at2con
->index
[a
]; k
< at2con
->index
[a
+ 1]; k
++)
1901 /* Check if constraint cc has not yet been assigned */
1902 if (li
->con_index
[cc
] == -1)
1908 lenA
= idef
->iparams
[type
].constr
.dA
;
1909 lenB
= idef
->iparams
[type
].constr
.dB
;
1911 if (bDynamics
|| lenA
!= 0 || lenB
!= 0)
1913 assign_constraint(li
, cc
, iatom
[3*cc
+ 1], iatom
[3*cc
+ 2], lenA
, lenB
, at2con
);
1920 /* Check if constraint with topology index constraint_index is involved
1921 * in a constraint triangle, and if so add the other two constraints
1922 * in the triangle to our task.
1924 static void check_assign_triangle(struct gmx_lincsdata
*li
,
1925 const t_iatom
*iatom
,
1928 int constraint_index
,
1930 const t_blocka
*at2con
)
1932 int nca
, cc
[32], ca
[32], k
;
1933 int c_triangle
[2] = { -1, -1 };
1936 for (k
= at2con
->index
[a1
]; k
< at2con
->index
[a1
+ 1]; k
++)
1941 if (c
!= constraint_index
)
1945 aa1
= iatom
[c
*3 + 1];
1946 aa2
= iatom
[c
*3 + 2];
1962 for (k
= at2con
->index
[a2
]; k
< at2con
->index
[a2
+ 1]; k
++)
1967 if (c
!= constraint_index
)
1971 aa1
= iatom
[c
*3 + 1];
1972 aa2
= iatom
[c
*3 + 2];
1975 for (i
= 0; i
< nca
; i
++)
1979 c_triangle
[0] = cc
[i
];
1986 for (i
= 0; i
< nca
; i
++)
1990 c_triangle
[0] = cc
[i
];
1998 if (c_triangle
[0] >= 0)
2002 for (end
= 0; end
< 2; end
++)
2004 /* Check if constraint c_triangle[end] has not yet been assigned */
2005 if (li
->con_index
[c_triangle
[end
]] == -1)
2010 i
= c_triangle
[end
]*3;
2012 lenA
= idef
->iparams
[type
].constr
.dA
;
2013 lenB
= idef
->iparams
[type
].constr
.dB
;
2015 if (bDynamics
|| lenA
!= 0 || lenB
!= 0)
2017 assign_constraint(li
, c_triangle
[end
], iatom
[i
+ 1], iatom
[i
+ 2], lenA
, lenB
, at2con
);
2024 static void set_matrix_indices(struct gmx_lincsdata
*li
,
2025 const lincs_task_t
*li_task
,
2026 const t_blocka
*at2con
,
2027 gmx_bool bSortMatrix
)
2031 for (b
= li_task
->b0
; b
< li_task
->b1
; b
++)
2036 a2
= li
->bla
[b
*2 + 1];
2039 for (k
= at2con
->index
[a1
]; k
< at2con
->index
[a1
+ 1]; k
++)
2043 concon
= li
->con_index
[at2con
->a
[k
]];
2046 li
->blbnb
[i
++] = concon
;
2049 for (k
= at2con
->index
[a2
]; k
< at2con
->index
[a2
+ 1]; k
++)
2053 concon
= li
->con_index
[at2con
->a
[k
]];
2056 li
->blbnb
[i
++] = concon
;
2062 /* Order the blbnb matrix to optimize memory access */
2063 qsort(&(li
->blbnb
[li
->blnr
[b
]]), li
->blnr
[b
+ 1] - li
->blnr
[b
],
2064 sizeof(li
->blbnb
[0]), int_comp
);
2069 void set_lincs(t_idef
*idef
, t_mdatoms
*md
,
2070 gmx_bool bDynamics
, t_commrec
*cr
,
2071 struct gmx_lincsdata
*li
)
2073 int natoms
, nflexcon
;
2076 int i
, ncc_alloc_old
, ncon_tot
;
2081 /* Zero the thread index ranges.
2082 * Otherwise without local constraints we could return with old ranges.
2084 for (i
= 0; i
< li
->ntask
; i
++)
2088 li
->task
[i
].nind
= 0;
2092 li
->task
[li
->ntask
].nind
= 0;
2095 /* This is the local topology, so there are only F_CONSTR constraints */
2096 if (idef
->il
[F_CONSTR
].nr
== 0)
2098 /* There are no constraints,
2099 * we do not need to fill any data structures.
2106 fprintf(debug
, "Building the LINCS connectivity\n");
2109 if (DOMAINDECOMP(cr
))
2111 if (cr
->dd
->constraints
)
2115 dd_get_constraint_range(cr
->dd
, &start
, &natoms
);
2119 natoms
= cr
->dd
->nat_home
;
2124 natoms
= md
->homenr
;
2126 at2con
= make_at2con(0, natoms
, idef
->il
, idef
->iparams
, bDynamics
,
2129 ncon_tot
= idef
->il
[F_CONSTR
].nr
/3;
2131 /* Ensure we have enough padding for aligned loads for each thread */
2132 if (ncon_tot
+ li
->ntask
*simd_width
> li
->nc_alloc
|| li
->nc_alloc
== 0)
2134 li
->nc_alloc
= over_alloc_dd(ncon_tot
+ li
->ntask
*simd_width
);
2135 srenew(li
->con_index
, li
->nc_alloc
);
2136 resize_real_aligned(&li
->bllen0
, li
->nc_alloc
);
2137 resize_real_aligned(&li
->ddist
, li
->nc_alloc
);
2138 srenew(li
->bla
, 2*li
->nc_alloc
);
2139 resize_real_aligned(&li
->blc
, li
->nc_alloc
);
2140 resize_real_aligned(&li
->blc1
, li
->nc_alloc
);
2141 srenew(li
->blnr
, li
->nc_alloc
+ 1);
2142 resize_real_aligned(&li
->bllen
, li
->nc_alloc
);
2143 srenew(li
->tmpv
, li
->nc_alloc
);
2144 if (DOMAINDECOMP(cr
))
2146 srenew(li
->nlocat
, li
->nc_alloc
);
2148 resize_real_aligned(&li
->tmp1
, li
->nc_alloc
);
2149 resize_real_aligned(&li
->tmp2
, li
->nc_alloc
);
2150 resize_real_aligned(&li
->tmp3
, li
->nc_alloc
);
2151 resize_real_aligned(&li
->tmp4
, li
->nc_alloc
);
2152 resize_real_aligned(&li
->mlambda
, li
->nc_alloc
);
2155 iatom
= idef
->il
[F_CONSTR
].iatoms
;
2157 ncc_alloc_old
= li
->ncc_alloc
;
2158 li
->blnr
[0] = li
->ncc
;
2160 /* Assign the constraints for li->ntask LINCS tasks.
2161 * We target a uniform distribution of constraints over the tasks.
2162 * Note that when flexible constraints are present, but are removed here
2163 * (e.g. because we are doing EM) we get imbalance, but since that doesn't
2164 * happen during normal MD, that's ok.
2166 int ncon_assign
, ncon_target
, con
, th
;
2168 /* Determine the number of constraints we need to assign here */
2169 ncon_assign
= ncon_tot
;
2172 /* With energy minimization, flexible constraints are ignored
2173 * (and thus minimized, as they should be).
2175 ncon_assign
-= nflexcon
;
2178 /* Set the target constraint count per task to exactly uniform,
2179 * this might be overridden below.
2181 ncon_target
= (ncon_assign
+ li
->ntask
- 1)/li
->ntask
;
2183 /* Mark all constraints as unassigned by setting their index to -1 */
2184 for (con
= 0; con
< ncon_tot
; con
++)
2186 li
->con_index
[con
] = -1;
2190 for (th
= 0; th
< li
->ntask
; th
++)
2192 lincs_task_t
*li_task
;
2194 li_task
= &li
->task
[th
];
2197 /* With indepedent tasks we likely have H-bond constraints or constraint
2198 * pairs. The connected constraints will be pulled into the task, so the
2199 * constraints per task will often exceed ncon_target.
2200 * Triangle constraints can also increase the count, but there are
2201 * relatively few of those, so we usually expect to get ncon_target.
2205 /* We round ncon_target to a multiple of GMX_SIMD_WIDTH,
2206 * since otherwise a lot of operations can be wasted.
2207 * There are several ways to round here, we choose the one
2208 * that alternates block sizes, which helps with Intel HT.
2210 ncon_target
= ((ncon_assign
*(th
+ 1))/li
->ntask
- li
->nc_real
+ GMX_SIMD_REAL_WIDTH
- 1) & ~(GMX_SIMD_REAL_WIDTH
- 1);
2214 /* Continue filling the arrays where we left off with the previous task,
2215 * including padding for SIMD.
2217 li_task
->b0
= li
->nc
;
2219 while (con
< ncon_tot
&& li
->nc
- li_task
->b0
< ncon_target
)
2221 if (li
->con_index
[con
] == -1)
2226 type
= iatom
[3*con
];
2227 a1
= iatom
[3*con
+ 1];
2228 a2
= iatom
[3*con
+ 2];
2229 lenA
= idef
->iparams
[type
].constr
.dA
;
2230 lenB
= idef
->iparams
[type
].constr
.dB
;
2231 /* Skip the flexible constraints when not doing dynamics */
2232 if (bDynamics
|| lenA
!= 0 || lenB
!= 0)
2234 assign_constraint(li
, con
, a1
, a2
, lenA
, lenB
, &at2con
);
2236 if (li
->ntask
> 1 && !li
->bTaskDep
)
2238 /* We can generate independent tasks. Check if we
2239 * need to assign connected constraints to our task.
2241 check_assign_connected(li
, iatom
, idef
, bDynamics
,
2244 if (li
->ntask
> 1 && li
->ncg_triangle
> 0)
2246 /* Ensure constraints in one triangle are assigned
2249 check_assign_triangle(li
, iatom
, idef
, bDynamics
,
2250 con
, a1
, a2
, &at2con
);
2258 li_task
->b1
= li
->nc
;
2262 /* Copy the last atom pair indices and lengths for constraints
2263 * up to a multiple of simd_width, such that we can do all
2264 * SIMD operations without having to worry about end effects.
2268 li
->nc
= ((li_task
->b1
+ simd_width
- 1)/simd_width
)*simd_width
;
2269 last
= li_task
->b1
- 1;
2270 for (i
= li_task
->b1
; i
< li
->nc
; i
++)
2272 li
->bla
[i
*2 ] = li
->bla
[last
*2 ];
2273 li
->bla
[i
*2 + 1] = li
->bla
[last
*2 + 1];
2274 li
->bllen0
[i
] = li
->bllen0
[last
];
2275 li
->ddist
[i
] = li
->ddist
[last
];
2276 li
->bllen
[i
] = li
->bllen
[last
];
2277 li
->blnr
[i
+ 1] = li
->blnr
[last
+ 1];
2281 /* Keep track of how many constraints we assigned */
2282 li
->nc_real
+= li_task
->b1
- li_task
->b0
;
2286 fprintf(debug
, "LINCS task %d constraints %d - %d\n",
2287 th
, li_task
->b0
, li_task
->b1
);
2291 assert(li
->nc_real
== ncon_assign
);
2293 gmx_bool bSortMatrix
;
2295 /* Without DD we order the blbnb matrix to optimize memory access.
2296 * With DD the overhead of sorting is more than the gain during access.
2298 bSortMatrix
= !DOMAINDECOMP(cr
);
2300 if (li
->ncc
> li
->ncc_alloc
)
2302 li
->ncc_alloc
= over_alloc_small(li
->ncc
);
2303 srenew(li
->blbnb
, li
->ncc_alloc
);
2306 #pragma omp parallel for num_threads(li->ntask) schedule(static)
2307 for (th
= 0; th
< li
->ntask
; th
++)
2309 lincs_task_t
*li_task
;
2311 li_task
= &li
->task
[th
];
2313 if (li
->ncg_triangle
> 0 &&
2314 li_task
->b1
- li_task
->b0
> li_task
->tri_alloc
)
2316 /* This is allocating too much, but it is difficult to improve */
2317 li_task
->tri_alloc
= over_alloc_dd(li_task
->b1
- li_task
->b0
);
2318 srenew(li_task
->triangle
, li_task
->tri_alloc
);
2319 srenew(li_task
->tri_bits
, li_task
->tri_alloc
);
2322 set_matrix_indices(li
, li_task
, &at2con
, bSortMatrix
);
2325 done_blocka(&at2con
);
2329 /* Since the matrix is static, we should free some memory */
2330 li
->ncc_alloc
= li
->ncc
;
2331 srenew(li
->blbnb
, li
->ncc_alloc
);
2334 if (li
->ncc_alloc
> ncc_alloc_old
)
2336 srenew(li
->blmf
, li
->ncc_alloc
);
2337 srenew(li
->blmf1
, li
->ncc_alloc
);
2338 srenew(li
->tmpncc
, li
->ncc_alloc
);
2341 if (DOMAINDECOMP(cr
) && dd_constraints_nlocalatoms(cr
->dd
) != NULL
)
2345 nlocat_dd
= dd_constraints_nlocalatoms(cr
->dd
);
2347 /* Convert nlocat from local topology to LINCS constraint indexing */
2348 for (con
= 0; con
< ncon_tot
; con
++)
2350 li
->nlocat
[li
->con_index
[con
]] = nlocat_dd
[con
];
2360 fprintf(debug
, "Number of constraints is %d, padded %d, couplings %d\n",
2361 li
->nc_real
, li
->nc
, li
->ncc
);
2366 lincs_thread_setup(li
, md
->nr
);
2369 set_lincs_matrix(li
, md
->invmass
, md
->lambda
);
2372 static void lincs_warning(FILE *fplog
,
2373 gmx_domdec_t
*dd
, rvec
*x
, rvec
*xprime
, t_pbc
*pbc
,
2374 int ncons
, int *bla
, real
*bllen
, real wangle
,
2375 int maxwarn
, int *warncount
)
2379 real wfac
, d0
, d1
, cosine
;
2382 wfac
= cos(DEG2RAD
*wangle
);
2384 sprintf(buf
, "bonds that rotated more than %g degrees:\n"
2385 " atom 1 atom 2 angle previous, current, constraint length\n",
2387 fprintf(stderr
, "%s", buf
);
2390 fprintf(fplog
, "%s", buf
);
2393 for (b
= 0; b
< ncons
; b
++)
2399 pbc_dx_aiuc(pbc
, x
[i
], x
[j
], v0
);
2400 pbc_dx_aiuc(pbc
, xprime
[i
], xprime
[j
], v1
);
2404 rvec_sub(x
[i
], x
[j
], v0
);
2405 rvec_sub(xprime
[i
], xprime
[j
], v1
);
2409 cosine
= iprod(v0
, v1
)/(d0
*d1
);
2412 sprintf(buf
, " %6d %6d %5.1f %8.4f %8.4f %8.4f\n",
2413 ddglatnr(dd
, i
), ddglatnr(dd
, j
),
2414 RAD2DEG
*acos(cosine
), d0
, d1
, bllen
[b
]);
2415 fprintf(stderr
, "%s", buf
);
2418 fprintf(fplog
, "%s", buf
);
2420 if (!gmx_isfinite(d1
))
2422 gmx_fatal(FARGS
, "Bond length not finite.");
2428 if (*warncount
> maxwarn
)
2430 too_many_constraint_warnings(econtLINCS
, *warncount
);
2434 static void cconerr(const struct gmx_lincsdata
*lincsd
,
2435 rvec
*x
, t_pbc
*pbc
,
2436 real
*ncons_loc
, real
*ssd
, real
*max
, int *imax
)
2438 const int *bla
, *nlocat
;
2441 int count
, im
, task
;
2444 bllen
= lincsd
->bllen
;
2445 nlocat
= lincsd
->nlocat
;
2451 for (task
= 0; task
< lincsd
->ntask
; task
++)
2455 for (b
= lincsd
->task
[task
].b0
; b
< lincsd
->task
[task
].b1
; b
++)
2462 pbc_dx_aiuc(pbc
, x
[bla
[2*b
]], x
[bla
[2*b
+1]], dx
);
2466 rvec_sub(x
[bla
[2*b
]], x
[bla
[2*b
+1]], dx
);
2469 len
= r2
*gmx_invsqrt(r2
);
2470 d
= fabs(len
/bllen
[b
]-1);
2471 if (d
> ma
&& (nlocat
== NULL
|| nlocat
[b
]))
2483 ssd2
+= nlocat
[b
]*d
*d
;
2489 *ncons_loc
= (nlocat
? 0.5 : 1)*count
;
2490 *ssd
= (nlocat
? 0.5 : 1)*ssd2
;
2495 gmx_bool
constrain_lincs(FILE *fplog
, gmx_bool bLog
, gmx_bool bEner
,
2498 struct gmx_lincsdata
*lincsd
, t_mdatoms
*md
,
2500 rvec
*x
, rvec
*xprime
, rvec
*min_proj
,
2501 matrix box
, t_pbc
*pbc
,
2502 real lambda
, real
*dvdlambda
,
2503 real invdt
, rvec
*v
,
2504 gmx_bool bCalcVir
, tensor vir_r_m_dr
,
2507 int maxwarn
, int *warncount
)
2510 char buf
[STRLEN
], buf2
[22], buf3
[STRLEN
];
2512 real ncons_loc
, p_ssd
, p_max
= 0;
2514 gmx_bool bOK
, bWarn
;
2518 /* This boolean should be set by a flag passed to this routine.
2519 * We can also easily check if any constraint length is changed,
2520 * if not dH/dlambda=0 and we can also set the boolean to FALSE.
2522 bCalcDHDL
= (ir
->efep
!= efepNO
&& dvdlambda
!= NULL
);
2524 if (lincsd
->nc
== 0 && cr
->dd
== NULL
)
2528 lincsd
->rmsd_data
[0] = 0;
2529 if (ir
->eI
== eiSD2
&& v
== NULL
)
2537 lincsd
->rmsd_data
[i
] = 0;
2543 if (econq
== econqCoord
)
2545 /* We can't use bCalcDHDL here, since NULL can be passed for dvdlambda
2546 * also with efep!=fepNO.
2548 if (ir
->efep
!= efepNO
)
2550 if (md
->nMassPerturbed
&& lincsd
->matlam
!= md
->lambda
)
2552 set_lincs_matrix(lincsd
, md
->invmass
, md
->lambda
);
2555 for (i
= 0; i
< lincsd
->nc
; i
++)
2557 lincsd
->bllen
[i
] = lincsd
->bllen0
[i
] + lambda
*lincsd
->ddist
[i
];
2561 if (lincsd
->ncg_flex
)
2563 /* Set the flexible constraint lengths to the old lengths */
2566 for (i
= 0; i
< lincsd
->nc
; i
++)
2568 if (lincsd
->bllen
[i
] == 0)
2570 pbc_dx_aiuc(pbc
, x
[lincsd
->bla
[2*i
]], x
[lincsd
->bla
[2*i
+1]], dx
);
2571 lincsd
->bllen
[i
] = norm(dx
);
2577 for (i
= 0; i
< lincsd
->nc
; i
++)
2579 if (lincsd
->bllen
[i
] == 0)
2582 sqrt(distance2(x
[lincsd
->bla
[2*i
]],
2583 x
[lincsd
->bla
[2*i
+1]]));
2591 cconerr(lincsd
, xprime
, pbc
,
2592 &ncons_loc
, &p_ssd
, &p_max
, &p_imax
);
2595 /* This bWarn var can be updated by multiple threads
2596 * at the same time. But as we only need to detect
2597 * if a warning occured or not, this is not an issue.
2601 /* The OpenMP parallel region of constrain_lincs for coords */
2602 #pragma omp parallel num_threads(lincsd->ntask)
2604 int th
= gmx_omp_get_thread_num();
2606 clear_mat(lincsd
->task
[th
].vir_r_m_dr
);
2608 do_lincs(x
, xprime
, box
, pbc
, lincsd
, th
,
2611 ir
->LincsWarnAngle
, &bWarn
,
2613 th
== 0 ? vir_r_m_dr
: lincsd
->task
[th
].vir_r_m_dr
);
2616 if (bLog
&& fplog
&& lincsd
->nc
> 0)
2618 fprintf(fplog
, " Rel. Constraint Deviation: RMS MAX between atoms\n");
2619 fprintf(fplog
, " Before LINCS %.6f %.6f %6d %6d\n",
2620 sqrt(p_ssd
/ncons_loc
), p_max
,
2621 ddglatnr(cr
->dd
, lincsd
->bla
[2*p_imax
]),
2622 ddglatnr(cr
->dd
, lincsd
->bla
[2*p_imax
+1]));
2626 cconerr(lincsd
, xprime
, pbc
,
2627 &ncons_loc
, &p_ssd
, &p_max
, &p_imax
);
2628 /* Check if we are doing the second part of SD */
2629 if (ir
->eI
== eiSD2
&& v
== NULL
)
2637 lincsd
->rmsd_data
[0] = ncons_loc
;
2638 lincsd
->rmsd_data
[i
] = p_ssd
;
2642 lincsd
->rmsd_data
[0] = 0;
2643 lincsd
->rmsd_data
[1] = 0;
2644 lincsd
->rmsd_data
[2] = 0;
2646 if (bLog
&& fplog
&& lincsd
->nc
> 0)
2649 " After LINCS %.6f %.6f %6d %6d\n\n",
2650 sqrt(p_ssd
/ncons_loc
), p_max
,
2651 ddglatnr(cr
->dd
, lincsd
->bla
[2*p_imax
]),
2652 ddglatnr(cr
->dd
, lincsd
->bla
[2*p_imax
+1]));
2659 cconerr(lincsd
, xprime
, pbc
,
2660 &ncons_loc
, &p_ssd
, &p_max
, &p_imax
);
2663 sprintf(buf3
, " in simulation %d", cr
->ms
->sim
);
2669 sprintf(buf
, "\nStep %s, time %g (ps) LINCS WARNING%s\n"
2670 "relative constraint deviation after LINCS:\n"
2671 "rms %.6f, max %.6f (between atoms %d and %d)\n",
2672 gmx_step_str(step
, buf2
), ir
->init_t
+step
*ir
->delta_t
,
2674 sqrt(p_ssd
/ncons_loc
), p_max
,
2675 ddglatnr(cr
->dd
, lincsd
->bla
[2*p_imax
]),
2676 ddglatnr(cr
->dd
, lincsd
->bla
[2*p_imax
+1]));
2679 fprintf(fplog
, "%s", buf
);
2681 fprintf(stderr
, "%s", buf
);
2682 lincs_warning(fplog
, cr
->dd
, x
, xprime
, pbc
,
2683 lincsd
->nc
, lincsd
->bla
, lincsd
->bllen
,
2684 ir
->LincsWarnAngle
, maxwarn
, warncount
);
2686 bOK
= (p_max
< 0.5);
2689 if (lincsd
->ncg_flex
)
2691 for (i
= 0; (i
< lincsd
->nc
); i
++)
2693 if (lincsd
->bllen0
[i
] == 0 && lincsd
->ddist
[i
] == 0)
2695 lincsd
->bllen
[i
] = 0;
2702 /* The OpenMP parallel region of constrain_lincs for derivatives */
2703 #pragma omp parallel num_threads(lincsd->ntask)
2705 int th
= gmx_omp_get_thread_num();
2707 do_lincsp(x
, xprime
, min_proj
, pbc
, lincsd
, th
,
2708 md
->invmass
, econq
, bCalcDHDL
,
2709 bCalcVir
, th
== 0 ? vir_r_m_dr
: lincsd
->task
[th
].vir_r_m_dr
);
2715 /* Reduce the dH/dlambda contributions over the threads */
2720 for (th
= 0; th
< lincsd
->ntask
; th
++)
2722 dhdlambda
+= lincsd
->task
[th
].dhdlambda
;
2726 /* dhdlambda contains dH/dlambda*dt^2, correct for this */
2727 /* TODO This should probably use invdt, so that sd integrator scaling works properly */
2728 dhdlambda
/= ir
->delta_t
*ir
->delta_t
;
2730 *dvdlambda
+= dhdlambda
;
2733 if (bCalcVir
&& lincsd
->ntask
> 1)
2735 for (i
= 1; i
< lincsd
->ntask
; i
++)
2737 m_add(vir_r_m_dr
, lincsd
->task
[i
].vir_r_m_dr
, vir_r_m_dr
);
2741 /* count assuming nit=1 */
2742 inc_nrnb(nrnb
, eNR_LINCS
, lincsd
->nc_real
);
2743 inc_nrnb(nrnb
, eNR_LINCSMAT
, (2+lincsd
->nOrder
)*lincsd
->ncc
);
2744 if (lincsd
->ntriangle
> 0)
2746 inc_nrnb(nrnb
, eNR_LINCSMAT
, lincsd
->nOrder
*lincsd
->ncc_triangle
);
2750 inc_nrnb(nrnb
, eNR_CONSTR_V
, lincsd
->nc_real
*2);
2754 inc_nrnb(nrnb
, eNR_CONSTR_VIR
, lincsd
->nc_real
);