Merge branch release-5-1
[gromacs.git] / src / gromacs / mdlib / clincs.cpp
blob5a83907139614c665b90c54c27206b6805fb63ba
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
37 /* This file is completely threadsafe - keep it that way! */
38 #include "gmxpre.h"
40 #include "config.h"
42 #include <assert.h>
43 #include <stdlib.h>
45 #include <cmath>
47 #include <algorithm>
49 #include "gromacs/domdec/domdec.h"
50 #include "gromacs/fileio/copyrite.h"
51 #include "gromacs/gmxlib/gmx_omp_nthreads.h"
52 #include "gromacs/legacyheaders/nrnb.h"
53 #include "gromacs/legacyheaders/types/commrec.h"
54 #include "gromacs/math/units.h"
55 #include "gromacs/math/vec.h"
56 #include "gromacs/mdlib/constr.h"
57 #include "gromacs/mdlib/mdrun.h"
58 #include "gromacs/pbcutil/pbc.h"
59 #include "gromacs/pbcutil/pbc-simd.h"
60 #include "gromacs/simd/simd.h"
61 #include "gromacs/simd/simd_math.h"
62 #include "gromacs/simd/vector_operations.h"
63 #include "gromacs/topology/block.h"
64 #include "gromacs/topology/mtop_util.h"
65 #include "gromacs/utility/bitmask.h"
66 #include "gromacs/utility/cstringutil.h"
67 #include "gromacs/utility/exceptions.h"
68 #include "gromacs/utility/fatalerror.h"
69 #include "gromacs/utility/gmxomp.h"
70 #include "gromacs/utility/smalloc.h"
72 /* MSVC 2010 produces buggy SIMD PBC code, disable SIMD for MSVC <= 2010 */
73 #if GMX_SIMD_HAVE_REAL && !(defined _MSC_VER && _MSC_VER < 1700) && !defined(__ICL)
74 # define LINCS_SIMD
75 #endif
78 #if GMX_SIMD_X86_AVX_256 || GMX_SIMD_X86_AVX2_256
80 // This was originally work-in-progress for augmenting the SIMD module with
81 // masked load/store operations. Instead, that turned into and extended SIMD
82 // interface that supports gather/scatter in all platforms, which will be
83 // part of a future Gromacs version. However, since the code for bonded
84 // interactions and LINCS was already written it would be a pity not to get
85 // the performance gains in Gromacs-5.1. For this reason we have added it as
86 // a bit of a hack in the two files that use it. It will be replaced with the
87 // new generic functionality after version 5.1
89 # ifdef GMX_DOUBLE
90 static gmx_inline void gmx_simdcall
91 gmx_hack_simd_transpose4_r(gmx_simd_double_t *row0,
92 gmx_simd_double_t *row1,
93 gmx_simd_double_t *row2,
94 gmx_simd_double_t *row3)
96 __m256d tmp0, tmp1, tmp2, tmp3;
98 tmp0 = _mm256_unpacklo_pd(*row0, *row1);
99 tmp2 = _mm256_unpacklo_pd(*row2, *row3);
100 tmp1 = _mm256_unpackhi_pd(*row0, *row1);
101 tmp3 = _mm256_unpackhi_pd(*row2, *row3);
102 *row0 = _mm256_permute2f128_pd(tmp0, tmp2, 0x20);
103 *row1 = _mm256_permute2f128_pd(tmp1, tmp3, 0x20);
104 *row2 = _mm256_permute2f128_pd(tmp0, tmp2, 0x31);
105 *row3 = _mm256_permute2f128_pd(tmp1, tmp3, 0x31);
108 static gmx_inline void gmx_simdcall
109 gmx_hack_simd4_transpose_to_simd_r(const gmx_simd4_double_t *a,
110 gmx_simd_double_t *row0,
111 gmx_simd_double_t *row1,
112 gmx_simd_double_t *row2,
113 gmx_simd_double_t *row3)
115 *row0 = a[0];
116 *row1 = a[1];
117 *row2 = a[2];
118 *row3 = a[3];
120 gmx_hack_simd_transpose4_r(row0, row1, row2, row3);
123 static gmx_inline void gmx_simdcall
124 gmx_hack_simd_transpose_to_simd4_r(gmx_simd_double_t row0,
125 gmx_simd_double_t row1,
126 gmx_simd_double_t row2,
127 gmx_simd_double_t row3,
128 gmx_simd4_double_t *a)
130 a[0] = row0;
131 a[1] = row1;
132 a[2] = row2;
133 a[3] = row3;
135 gmx_hack_simd_transpose4_r(&a[0], &a[1], &a[2], &a[3]);
139 # if GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG
140 # define gmx_hack_simd4_load3_r(mem) _mm256_maskload_pd((mem), _mm_castsi128_ps(_mm256_set_epi32(0, 0, -1, -1, -1, -1, -1, -1)))
141 # define gmx_hack_simd4_store3_r(mem, x) _mm256_maskstore_pd((mem), _mm_castsi128_ps(_mm256_set_epi32(0, 0, -1, -1, -1, -1, -1, -1)), (x))
142 # else
143 # define gmx_hack_simd4_load3_r(mem) _mm256_maskload_pd((mem), _mm256_set_epi32(0, 0, -1, -1, -1, -1, -1, -1))
144 # define gmx_hack_simd4_store3_r(mem, x) _mm256_maskstore_pd((mem), _mm256_set_epi32(0, 0, -1, -1, -1, -1, -1, -1), (x))
145 # endif
147 # else /* single instead of double */
148 static gmx_inline void gmx_simdcall
149 gmx_hack_simd_transpose4_r(gmx_simd_float_t *row0,
150 gmx_simd_float_t *row1,
151 gmx_simd_float_t *row2,
152 gmx_simd_float_t *row3)
154 __m256 tmp0, tmp1, tmp2, tmp3;
156 tmp0 = _mm256_unpacklo_ps(*row0, *row1);
157 tmp2 = _mm256_unpacklo_ps(*row2, *row3);
158 tmp1 = _mm256_unpackhi_ps(*row0, *row1);
159 tmp3 = _mm256_unpackhi_ps(*row2, *row3);
160 *row0 = _mm256_shuffle_ps(tmp0, tmp2, 0x44);
161 *row1 = _mm256_shuffle_ps(tmp0, tmp2, 0xEE);
162 *row2 = _mm256_shuffle_ps(tmp1, tmp3, 0x44);
163 *row3 = _mm256_shuffle_ps(tmp1, tmp3, 0xEE);
166 static gmx_inline void gmx_simdcall
167 gmx_hack_simd4_transpose_to_simd_r(const gmx_simd4_float_t *a,
168 gmx_simd_float_t *row0,
169 gmx_simd_float_t *row1,
170 gmx_simd_float_t *row2,
171 gmx_simd_float_t *row3)
173 *row0 = _mm256_insertf128_ps(_mm256_castps128_ps256(a[0]), a[4], 1);
174 *row1 = _mm256_insertf128_ps(_mm256_castps128_ps256(a[1]), a[5], 1);
175 *row2 = _mm256_insertf128_ps(_mm256_castps128_ps256(a[2]), a[6], 1);
176 *row3 = _mm256_insertf128_ps(_mm256_castps128_ps256(a[3]), a[7], 1);
178 gmx_hack_simd_transpose4_r(row0, row1, row2, row3);
181 static gmx_inline void gmx_simdcall
182 gmx_hack_simd_transpose_to_simd4_r(gmx_simd_float_t row0,
183 gmx_simd_float_t row1,
184 gmx_simd_float_t row2,
185 gmx_simd_float_t row3,
186 gmx_simd4_float_t *a)
188 gmx_hack_simd_transpose4_r(&row0, &row1, &row2, &row3);
190 a[0] = _mm256_extractf128_ps(row0, 0);
191 a[1] = _mm256_extractf128_ps(row1, 0);
192 a[2] = _mm256_extractf128_ps(row2, 0);
193 a[3] = _mm256_extractf128_ps(row3, 0);
194 a[4] = _mm256_extractf128_ps(row0, 1);
195 a[5] = _mm256_extractf128_ps(row1, 1);
196 a[6] = _mm256_extractf128_ps(row2, 1);
197 a[7] = _mm256_extractf128_ps(row3, 1);
199 #if GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG
200 # define gmx_hack_simd4_load3_r(mem) _mm_maskload_ps((mem), _mm_castsi256_pd(_mm_set_epi32(0, -1, -1, -1)))
201 # define gmx_hack_simd4_store3_r(mem, x) _mm_maskstore_ps((mem), _mm_castsi256_pd(_mm_set_epi32(0, -1, -1, -1)), (x))
202 #else
203 # define gmx_hack_simd4_load3_r(mem) _mm_maskload_ps((mem), _mm_set_epi32(0, -1, -1, -1))
204 # define gmx_hack_simd4_store3_r(mem, x) _mm_maskstore_ps((mem), _mm_set_epi32(0, -1, -1, -1), (x))
205 #endif
207 #endif /* double */
209 #endif /* AVX */
211 #if GMX_SIMD_HAVE_REAL
212 /*! \brief Store differences between indexed rvecs in SIMD registers.
214 * Returns SIMD register with the difference vectors:
215 * v[pair_index[i*2]] - v[pair_index[i*2 + 1]]
217 * \param[in] v Array of rvecs
218 * \param[in] pair_index Index pairs for GMX_SIMD_REAL_WIDTH vector pairs
219 * \param[in,out] buf Aligned tmp buffer of size 3*GMX_SIMD_REAL_WIDTH
220 * \param[out] dx SIMD register with x difference
221 * \param[out] dy SIMD register with y difference
222 * \param[out] dz SIMD register with z difference
224 static gmx_inline void gmx_simdcall
225 gmx_hack_simd_gather_rvec_dist_pair_index(const rvec *v,
226 const int *pair_index,
227 real gmx_unused *buf,
228 gmx_simd_real_t *dx,
229 gmx_simd_real_t *dy,
230 gmx_simd_real_t *dz)
232 #if GMX_SIMD_X86_AVX_256 || GMX_SIMD_X86_AVX2_256
233 int i;
234 gmx_simd4_real_t d[GMX_SIMD_REAL_WIDTH];
235 gmx_simd_real_t tmp;
237 for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
239 d[i] = gmx_simd4_sub_r(gmx_hack_simd4_load3_r(&(v[pair_index[i*2 + 0]][0])),
240 gmx_hack_simd4_load3_r(&(v[pair_index[i*2 + 1]][0])));
243 gmx_hack_simd4_transpose_to_simd_r(d, dx, dy, dz, &tmp);
244 #else
245 #if GMX_ALIGNMENT
246 GMX_ALIGNED(real, GMX_SIMD_REAL_WIDTH) buf_aligned[3*GMX_SIMD_REAL_WIDTH];
247 #else
248 real* buf_aligned = buf;
249 #endif
251 int i, m;
253 for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
255 /* Store the distances packed and aligned */
256 for (m = 0; m < DIM; m++)
258 buf_aligned[m*GMX_SIMD_REAL_WIDTH + i] =
259 v[pair_index[i*2]][m] - v[pair_index[i*2 + 1]][m];
262 *dx = gmx_simd_load_r(buf_aligned + 0*GMX_SIMD_REAL_WIDTH);
263 *dy = gmx_simd_load_r(buf_aligned + 1*GMX_SIMD_REAL_WIDTH);
264 *dz = gmx_simd_load_r(buf_aligned + 2*GMX_SIMD_REAL_WIDTH);
265 #endif
269 /*! \brief Stores SIMD vector into multiple rvecs.
271 * \param[in] x SIMD register with x-components of the vectors
272 * \param[in] y SIMD register with y-components of the vectors
273 * \param[in] z SIMD register with z-components of the vectors
274 * \param[in,out] buf Aligned tmp buffer of size 3*GMX_SIMD_REAL_WIDTH
275 * \param[out] v Array of GMX_SIMD_REAL_WIDTH rvecs
277 static gmx_inline void gmx_simdcall
278 gmx_simd_store_vec_to_rvec(gmx_simd_real_t x,
279 gmx_simd_real_t y,
280 gmx_simd_real_t z,
281 real gmx_unused *buf,
282 rvec *v)
284 #if GMX_SIMD_X86_AVX_256 || GMX_SIMD_X86_AVX2_256
285 int i;
286 gmx_simd4_real_t s4[GMX_SIMD_REAL_WIDTH];
287 gmx_simd_real_t zero = gmx_simd_setzero_r();
289 gmx_hack_simd_transpose_to_simd4_r(x, y, z, zero, s4);
291 for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
293 gmx_hack_simd4_store3_r(v[i], s4[i]);
295 #else
296 #if GMX_ALIGNMENT
297 GMX_ALIGNED(real, GMX_SIMD_REAL_WIDTH) buf_aligned[3*GMX_SIMD_REAL_WIDTH];
298 #else
299 real* buf_aligned = buf;
300 #endif
302 int i, m;
304 gmx_simd_store_r(buf_aligned + 0*GMX_SIMD_REAL_WIDTH, x);
305 gmx_simd_store_r(buf_aligned + 1*GMX_SIMD_REAL_WIDTH, y);
306 gmx_simd_store_r(buf_aligned + 2*GMX_SIMD_REAL_WIDTH, z);
308 for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
310 for (m = 0; m < DIM; m++)
312 v[i][m] = buf_aligned[m*GMX_SIMD_REAL_WIDTH + i];
315 #endif
317 #endif /* GMX_SIMD_HAVE_REAL */
320 typedef struct {
321 int b0; /* first constraint for this task */
322 int b1; /* b1-1 is the last constraint for this task */
323 int ntriangle; /* the number of constraints in triangles */
324 int *triangle; /* the list of triangle constraints */
325 int *tri_bits; /* the bits tell if the matrix element should be used */
326 int tri_alloc; /* allocation size of triangle and tri_bits */
327 int nind; /* number of indices */
328 int *ind; /* constraint index for updating atom data */
329 int nind_r; /* number of indices */
330 int *ind_r; /* constraint index for updating atom data */
331 int ind_nalloc; /* allocation size of ind and ind_r */
332 tensor vir_r_m_dr; /* temporary variable for virial calculation */
333 real dhdlambda; /* temporary variable for lambda derivative */
334 real *simd_buf; /* Aligned work array for SIMD */
335 } lincs_task_t;
337 typedef struct gmx_lincsdata {
338 int ncg; /* the global number of constraints */
339 int ncg_flex; /* the global number of flexible constraints */
340 int ncg_triangle; /* the global number of constraints in triangles */
341 int nIter; /* the number of iterations */
342 int nOrder; /* the order of the matrix expansion */
343 int max_connect; /* the maximum number of constrains connected to a single atom */
345 int nc_real; /* the number of real constraints */
346 int nc; /* the number of constraints including padding for SIMD */
347 int nc_alloc; /* the number we allocated memory for */
348 int ncc; /* the number of constraint connections */
349 int ncc_alloc; /* the number we allocated memory for */
350 real matlam; /* the FE lambda value used for filling blc and blmf */
351 int *con_index; /* mapping from topology to LINCS constraints */
352 real *bllen0; /* the reference distance in topology A */
353 real *ddist; /* the reference distance in top B - the r.d. in top A */
354 int *bla; /* the atom pairs involved in the constraints */
355 real *blc; /* 1/sqrt(invmass1 + invmass2) */
356 real *blc1; /* as blc, but with all masses 1 */
357 int *blnr; /* index into blbnb and blmf */
358 int *blbnb; /* list of constraint connections */
359 int ntriangle; /* the local number of constraints in triangles */
360 int ncc_triangle; /* the number of constraint connections in triangles */
361 gmx_bool bCommIter; /* communicate before each LINCS interation */
362 real *blmf; /* matrix of mass factors for constraint connections */
363 real *blmf1; /* as blmf, but with all masses 1 */
364 real *bllen; /* the reference bond length */
365 int *nlocat; /* the local atom count per constraint, can be NULL */
367 int ntask; /* The number of tasks = #threads for LINCS */
368 lincs_task_t *task; /* LINCS thread division */
369 gmx_bitmask_t *atf; /* atom flags for thread parallelization */
370 int atf_nalloc; /* allocation size of atf */
371 gmx_bool bTaskDep; /* are the LINCS tasks interdependent? */
372 /* arrays for temporary storage in the LINCS algorithm */
373 rvec *tmpv;
374 real *tmpncc;
375 real *tmp1;
376 real *tmp2;
377 real *tmp3;
378 real *tmp4;
379 real *mlambda; /* the Lagrange multipliers * -1 */
380 /* storage for the constraint RMS relative deviation output */
381 real rmsd_data[3];
382 } t_gmx_lincsdata;
384 /* Define simd_width for memory allocation used for SIMD code */
385 #ifdef LINCS_SIMD
386 static const int simd_width = GMX_SIMD_REAL_WIDTH;
387 #else
388 static const int simd_width = 1;
389 #endif
390 /* Align to 128 bytes, consistent with the current implementation of
391 AlignedAllocator, which currently forces 128 byte alignment. */
392 static const int align_bytes = 128;
394 real *lincs_rmsd_data(struct gmx_lincsdata *lincsd)
396 return lincsd->rmsd_data;
399 real lincs_rmsd(struct gmx_lincsdata *lincsd, gmx_bool bSD2)
401 if (lincsd->rmsd_data[0] > 0)
403 return sqrt(lincsd->rmsd_data[bSD2 ? 2 : 1]/lincsd->rmsd_data[0]);
405 else
407 return 0;
411 /* Do a set of nrec LINCS matrix multiplications.
412 * This function will return with up to date thread-local
413 * constraint data, without an OpenMP barrier.
415 static void lincs_matrix_expand(const struct gmx_lincsdata *lincsd,
416 const lincs_task_t *li_task,
417 const real *blcc,
418 real *rhs1, real *rhs2, real *sol)
420 int b0, b1, nrec, rec;
421 const int *blnr = lincsd->blnr;
422 const int *blbnb = lincsd->blbnb;
424 b0 = li_task->b0;
425 b1 = li_task->b1;
426 nrec = lincsd->nOrder;
428 for (rec = 0; rec < nrec; rec++)
430 int b;
432 if (lincsd->bTaskDep)
434 #pragma omp barrier
436 for (b = b0; b < b1; b++)
438 real mvb;
439 int n;
441 mvb = 0;
442 for (n = blnr[b]; n < blnr[b+1]; n++)
444 mvb = mvb + blcc[n]*rhs1[blbnb[n]];
446 rhs2[b] = mvb;
447 sol[b] = sol[b] + mvb;
450 real *swap;
452 swap = rhs1;
453 rhs1 = rhs2;
454 rhs2 = swap;
455 } /* nrec*(ncons+2*nrtot) flops */
457 if (lincsd->ntriangle > 0)
459 /* Perform an extra nrec recursions for only the constraints
460 * involved in rigid triangles.
461 * In this way their accuracy should come close to those of the other
462 * constraints, since traingles of constraints can produce eigenvalues
463 * around 0.7, while the effective eigenvalue for bond constraints
464 * is around 0.4 (and 0.7*0.7=0.5).
467 if (lincsd->bTaskDep)
469 /* We need a barrier here, since other threads might still be
470 * reading the contents of rhs1 and/o rhs2.
471 * We could avoid this barrier by introducing two extra rhs
472 * arrays for the triangle constraints only.
474 #pragma omp barrier
477 /* Constraints involved in a triangle are ensured to be in the same
478 * LINCS task. This means no barriers are required during the extra
479 * iterations for the triangle constraints.
481 const int *triangle = li_task->triangle;
482 const int *tri_bits = li_task->tri_bits;
484 for (rec = 0; rec < nrec; rec++)
486 int tb;
488 for (tb = 0; tb < li_task->ntriangle; tb++)
490 int b, bits, nr0, nr1, n;
491 real mvb;
493 b = triangle[tb];
494 bits = tri_bits[tb];
495 mvb = 0;
496 nr0 = blnr[b];
497 nr1 = blnr[b+1];
498 for (n = nr0; n < nr1; n++)
500 if (bits & (1 << (n - nr0)))
502 mvb = mvb + blcc[n]*rhs1[blbnb[n]];
505 rhs2[b] = mvb;
506 sol[b] = sol[b] + mvb;
509 real *swap;
511 swap = rhs1;
512 rhs1 = rhs2;
513 rhs2 = swap;
514 } /* nrec*(ntriangle + ncc_triangle*2) flops */
518 static void lincs_update_atoms_noind(int ncons, const int *bla,
519 real prefac,
520 const real *fac, rvec *r,
521 const real *invmass,
522 rvec *x)
524 int b, i, j;
525 real mvb, im1, im2, tmp0, tmp1, tmp2;
527 if (invmass != NULL)
529 for (b = 0; b < ncons; b++)
531 i = bla[2*b];
532 j = bla[2*b+1];
533 mvb = prefac*fac[b];
534 im1 = invmass[i];
535 im2 = invmass[j];
536 tmp0 = r[b][0]*mvb;
537 tmp1 = r[b][1]*mvb;
538 tmp2 = r[b][2]*mvb;
539 x[i][0] -= tmp0*im1;
540 x[i][1] -= tmp1*im1;
541 x[i][2] -= tmp2*im1;
542 x[j][0] += tmp0*im2;
543 x[j][1] += tmp1*im2;
544 x[j][2] += tmp2*im2;
545 } /* 16 ncons flops */
547 else
549 for (b = 0; b < ncons; b++)
551 i = bla[2*b];
552 j = bla[2*b+1];
553 mvb = prefac*fac[b];
554 tmp0 = r[b][0]*mvb;
555 tmp1 = r[b][1]*mvb;
556 tmp2 = r[b][2]*mvb;
557 x[i][0] -= tmp0;
558 x[i][1] -= tmp1;
559 x[i][2] -= tmp2;
560 x[j][0] += tmp0;
561 x[j][1] += tmp1;
562 x[j][2] += tmp2;
567 static void lincs_update_atoms_ind(int ncons, const int *ind, const int *bla,
568 real prefac,
569 const real *fac, rvec *r,
570 const real *invmass,
571 rvec *x)
573 int bi, b, i, j;
574 real mvb, im1, im2, tmp0, tmp1, tmp2;
576 if (invmass != NULL)
578 for (bi = 0; bi < ncons; bi++)
580 b = ind[bi];
581 i = bla[2*b];
582 j = bla[2*b+1];
583 mvb = prefac*fac[b];
584 im1 = invmass[i];
585 im2 = invmass[j];
586 tmp0 = r[b][0]*mvb;
587 tmp1 = r[b][1]*mvb;
588 tmp2 = r[b][2]*mvb;
589 x[i][0] -= tmp0*im1;
590 x[i][1] -= tmp1*im1;
591 x[i][2] -= tmp2*im1;
592 x[j][0] += tmp0*im2;
593 x[j][1] += tmp1*im2;
594 x[j][2] += tmp2*im2;
595 } /* 16 ncons flops */
597 else
599 for (bi = 0; bi < ncons; bi++)
601 b = ind[bi];
602 i = bla[2*b];
603 j = bla[2*b+1];
604 mvb = prefac*fac[b];
605 tmp0 = r[b][0]*mvb;
606 tmp1 = r[b][1]*mvb;
607 tmp2 = r[b][2]*mvb;
608 x[i][0] -= tmp0;
609 x[i][1] -= tmp1;
610 x[i][2] -= tmp2;
611 x[j][0] += tmp0;
612 x[j][1] += tmp1;
613 x[j][2] += tmp2;
614 } /* 16 ncons flops */
618 static void lincs_update_atoms(struct gmx_lincsdata *li, int th,
619 real prefac,
620 const real *fac, rvec *r,
621 const real *invmass,
622 rvec *x)
624 if (li->ntask == 1)
626 /* Single thread, we simply update for all constraints */
627 lincs_update_atoms_noind(li->nc_real,
628 li->bla, prefac, fac, r, invmass, x);
630 else
632 /* Update the atom vector components for our thread local
633 * constraints that only access our local atom range.
634 * This can be done without a barrier.
636 lincs_update_atoms_ind(li->task[th].nind, li->task[th].ind,
637 li->bla, prefac, fac, r, invmass, x);
639 if (li->task[li->ntask].nind > 0)
641 /* Update the constraints that operate on atoms
642 * in multiple thread atom blocks on the master thread.
644 #pragma omp barrier
645 #pragma omp master
647 lincs_update_atoms_ind(li->task[li->ntask].nind,
648 li->task[li->ntask].ind,
649 li->bla, prefac, fac, r, invmass, x);
655 #ifdef LINCS_SIMD
656 /* Calculate the constraint distance vectors r to project on from x.
657 * Determine the right-hand side of the matrix equation using quantity f.
658 * This function only differs from calc_dr_x_xp_simd below in that
659 * no constraint length is subtracted and no PBC is used for f.
661 static void gmx_simdcall
662 calc_dr_x_f_simd(int b0,
663 int b1,
664 const int * bla,
665 const rvec * gmx_restrict x,
666 const rvec * gmx_restrict f,
667 const real * gmx_restrict blc,
668 const pbc_simd_t * pbc_simd,
669 real * gmx_restrict vbuf1,
670 real * gmx_restrict vbuf2,
671 rvec * gmx_restrict r,
672 real * gmx_restrict rhs,
673 real * gmx_restrict sol)
675 int bs;
677 assert(b0 % GMX_SIMD_REAL_WIDTH == 0);
679 for (bs = b0; bs < b1; bs += GMX_SIMD_REAL_WIDTH)
681 gmx_simd_real_t rx_S, ry_S, rz_S, n2_S, il_S;
682 gmx_simd_real_t fx_S, fy_S, fz_S, ip_S, rhs_S;
684 gmx_hack_simd_gather_rvec_dist_pair_index(x, bla + bs*2, vbuf1,
685 &rx_S, &ry_S, &rz_S);
687 pbc_correct_dx_simd(&rx_S, &ry_S, &rz_S, pbc_simd);
689 n2_S = gmx_simd_norm2_r(rx_S, ry_S, rz_S);
690 il_S = gmx_simd_invsqrt_r(n2_S);
692 rx_S = gmx_simd_mul_r(rx_S, il_S);
693 ry_S = gmx_simd_mul_r(ry_S, il_S);
694 rz_S = gmx_simd_mul_r(rz_S, il_S);
696 gmx_simd_store_vec_to_rvec(rx_S, ry_S, rz_S, vbuf1, r + bs);
698 gmx_hack_simd_gather_rvec_dist_pair_index(f, bla + bs*2, vbuf2,
699 &fx_S, &fy_S, &fz_S);
701 ip_S = gmx_simd_iprod_r(rx_S, ry_S, rz_S,
702 fx_S, fy_S, fz_S);
704 rhs_S = gmx_simd_mul_r(gmx_simd_load_r(blc + bs), ip_S);
706 gmx_simd_store_r(rhs + bs, rhs_S);
707 gmx_simd_store_r(sol + bs, rhs_S);
710 #endif /* LINCS_SIMD */
712 /* LINCS projection, works on derivatives of the coordinates */
713 static void do_lincsp(rvec *x, rvec *f, rvec *fp, t_pbc *pbc,
714 struct gmx_lincsdata *lincsd, int th,
715 real *invmass,
716 int econq, gmx_bool bCalcDHDL,
717 gmx_bool bCalcVir, tensor rmdf)
719 int b0, b1, b;
720 int *bla, *blnr, *blbnb;
721 rvec *r;
722 real *blc, *blmf, *blcc, *rhs1, *rhs2, *sol;
724 b0 = lincsd->task[th].b0;
725 b1 = lincsd->task[th].b1;
727 bla = lincsd->bla;
728 r = lincsd->tmpv;
729 blnr = lincsd->blnr;
730 blbnb = lincsd->blbnb;
731 if (econq != econqForce)
733 /* Use mass-weighted parameters */
734 blc = lincsd->blc;
735 blmf = lincsd->blmf;
737 else
739 /* Use non mass-weighted parameters */
740 blc = lincsd->blc1;
741 blmf = lincsd->blmf1;
743 blcc = lincsd->tmpncc;
744 rhs1 = lincsd->tmp1;
745 rhs2 = lincsd->tmp2;
746 sol = lincsd->tmp3;
748 #ifdef LINCS_SIMD
750 /* This SIMD code does the same as the plain-C code after the #else.
751 * The only difference is that we always call pbc code, as with SIMD
752 * the overhead of pbc computation (when not needed) is small.
754 pbc_simd_t pbc_simd;
756 /* Convert the pbc struct for SIMD */
757 set_pbc_simd(pbc, &pbc_simd);
759 /* Compute normalized x i-j vectors, store in r.
760 * Compute the inner product of r and xp i-j and store in rhs1.
762 calc_dr_x_f_simd(b0, b1, bla, x, f, blc,
763 &pbc_simd,
764 lincsd->task[th].simd_buf,
765 lincsd->task[th].simd_buf + GMX_SIMD_REAL_WIDTH*DIM,
766 r, rhs1, sol);
768 #else /* LINCS_SIMD */
770 /* Compute normalized i-j vectors */
771 if (pbc)
773 for (b = b0; b < b1; b++)
775 rvec dx;
777 pbc_dx_aiuc(pbc, x[bla[2*b]], x[bla[2*b+1]], dx);
778 unitv(dx, r[b]);
781 else
783 for (b = b0; b < b1; b++)
785 rvec dx;
787 rvec_sub(x[bla[2*b]], x[bla[2*b+1]], dx);
788 unitv(dx, r[b]);
789 } /* 16 ncons flops */
792 for (b = b0; b < b1; b++)
794 int i, j;
795 real mvb;
797 i = bla[2*b];
798 j = bla[2*b+1];
799 mvb = blc[b]*(r[b][0]*(f[i][0] - f[j][0]) +
800 r[b][1]*(f[i][1] - f[j][1]) +
801 r[b][2]*(f[i][2] - f[j][2]));
802 rhs1[b] = mvb;
803 sol[b] = mvb;
804 /* 7 flops */
807 #endif /* LINCS_SIMD */
809 if (lincsd->bTaskDep)
811 /* We need a barrier, since the matrix construction below
812 * can access entries in r of other threads.
814 #pragma omp barrier
817 /* Construct the (sparse) LINCS matrix */
818 for (b = b0; b < b1; b++)
820 int n;
822 for (n = blnr[b]; n < blnr[b+1]; n++)
824 blcc[n] = blmf[n]*iprod(r[b], r[blbnb[n]]);
825 } /* 6 nr flops */
827 /* Together: 23*ncons + 6*nrtot flops */
829 lincs_matrix_expand(lincsd, &lincsd->task[th], blcc, rhs1, rhs2, sol);
830 /* nrec*(ncons+2*nrtot) flops */
832 if (econq == econqDeriv_FlexCon)
834 /* We only want to constraint the flexible constraints,
835 * so we mask out the normal ones by setting sol to 0.
837 for (b = b0; b < b1; b++)
839 if (!(lincsd->bllen0[b] == 0 && lincsd->ddist[b] == 0))
841 sol[b] = 0;
846 /* We multiply sol by blc, so we can use lincs_update_atoms for OpenMP */
847 for (b = b0; b < b1; b++)
849 sol[b] *= blc[b];
852 /* When constraining forces, we should not use mass weighting,
853 * so we pass invmass=NULL, which results in the use of 1 for all atoms.
855 lincs_update_atoms(lincsd, th, 1.0, sol, r,
856 (econq != econqForce) ? invmass : NULL, fp);
858 if (bCalcDHDL)
860 real dhdlambda;
862 dhdlambda = 0;
863 for (b = b0; b < b1; b++)
865 dhdlambda -= sol[b]*lincsd->ddist[b];
868 lincsd->task[th].dhdlambda = dhdlambda;
871 if (bCalcVir)
873 /* Constraint virial,
874 * determines sum r_bond x delta f,
875 * where delta f is the constraint correction
876 * of the quantity that is being constrained.
878 for (b = b0; b < b1; b++)
880 real mvb, tmp1;
881 int i, j;
883 mvb = lincsd->bllen[b]*sol[b];
884 for (i = 0; i < DIM; i++)
886 tmp1 = mvb*r[b][i];
887 for (j = 0; j < DIM; j++)
889 rmdf[i][j] += tmp1*r[b][j];
892 } /* 23 ncons flops */
896 #ifdef LINCS_SIMD
897 /* Calculate the constraint distance vectors r to project on from x.
898 * Determine the right-hand side of the matrix equation using coordinates xp.
900 static void gmx_simdcall
901 calc_dr_x_xp_simd(int b0,
902 int b1,
903 const int * bla,
904 const rvec * gmx_restrict x,
905 const rvec * gmx_restrict xp,
906 const real * gmx_restrict bllen,
907 const real * gmx_restrict blc,
908 const pbc_simd_t * pbc_simd,
909 real * gmx_restrict vbuf1,
910 real * gmx_restrict vbuf2,
911 rvec * gmx_restrict r,
912 real * gmx_restrict rhs,
913 real * gmx_restrict sol)
915 int bs;
917 assert(b0 % GMX_SIMD_REAL_WIDTH == 0);
919 for (bs = b0; bs < b1; bs += GMX_SIMD_REAL_WIDTH)
921 gmx_simd_real_t rx_S, ry_S, rz_S, n2_S, il_S;
922 gmx_simd_real_t rxp_S, ryp_S, rzp_S, ip_S, rhs_S;
924 gmx_hack_simd_gather_rvec_dist_pair_index(x, bla + bs*2, vbuf1,
925 &rx_S, &ry_S, &rz_S);
927 pbc_correct_dx_simd(&rx_S, &ry_S, &rz_S, pbc_simd);
929 n2_S = gmx_simd_norm2_r(rx_S, ry_S, rz_S);
930 il_S = gmx_simd_invsqrt_r(n2_S);
932 rx_S = gmx_simd_mul_r(rx_S, il_S);
933 ry_S = gmx_simd_mul_r(ry_S, il_S);
934 rz_S = gmx_simd_mul_r(rz_S, il_S);
936 gmx_simd_store_vec_to_rvec(rx_S, ry_S, rz_S, vbuf1, r + bs);
938 gmx_hack_simd_gather_rvec_dist_pair_index(xp, bla + bs*2, vbuf2,
939 &rxp_S, &ryp_S, &rzp_S);
941 pbc_correct_dx_simd(&rxp_S, &ryp_S, &rzp_S, pbc_simd);
943 ip_S = gmx_simd_iprod_r(rx_S, ry_S, rz_S,
944 rxp_S, ryp_S, rzp_S);
946 rhs_S = gmx_simd_mul_r(gmx_simd_load_r(blc + bs),
947 gmx_simd_sub_r(ip_S, gmx_simd_load_r(bllen + bs)));
949 gmx_simd_store_r(rhs + bs, rhs_S);
950 gmx_simd_store_r(sol + bs, rhs_S);
953 #endif /* LINCS_SIMD */
955 /* Determine the distances and right-hand side for the next iteration */
956 static void calc_dist_iter(int b0,
957 int b1,
958 const int * bla,
959 const rvec * gmx_restrict xp,
960 const real * gmx_restrict bllen,
961 const real * gmx_restrict blc,
962 const t_pbc * pbc,
963 real wfac,
964 real * gmx_restrict rhs,
965 real * gmx_restrict sol,
966 gmx_bool * bWarn)
968 int b;
970 for (b = b0; b < b1; b++)
972 real len, len2, dlen2, mvb;
973 rvec dx;
975 len = bllen[b];
976 if (pbc)
978 pbc_dx_aiuc(pbc, xp[bla[2*b]], xp[bla[2*b+1]], dx);
980 else
982 rvec_sub(xp[bla[2*b]], xp[bla[2*b+1]], dx);
984 len2 = len*len;
985 dlen2 = 2*len2 - norm2(dx);
986 if (dlen2 < wfac*len2)
988 /* not race free - see detailed comment in caller */
989 *bWarn = TRUE;
991 if (dlen2 > 0)
993 mvb = blc[b]*(len - dlen2*gmx_invsqrt(dlen2));
995 else
997 mvb = blc[b]*len;
999 rhs[b] = mvb;
1000 sol[b] = mvb;
1001 } /* 20*ncons flops */
1004 #ifdef LINCS_SIMD
1005 /* As the function above, but using SIMD intrinsics */
1006 static void gmx_simdcall
1007 calc_dist_iter_simd(int b0,
1008 int b1,
1009 const int * bla,
1010 const rvec * gmx_restrict x,
1011 const real * gmx_restrict bllen,
1012 const real * gmx_restrict blc,
1013 const pbc_simd_t * pbc_simd,
1014 real wfac,
1015 real * gmx_restrict vbuf,
1016 real * gmx_restrict rhs,
1017 real * gmx_restrict sol,
1018 gmx_bool * bWarn)
1020 gmx_simd_real_t min_S = gmx_simd_set1_r(GMX_REAL_MIN);
1021 gmx_simd_real_t two_S = gmx_simd_set1_r(2.0);
1022 gmx_simd_real_t wfac_S = gmx_simd_set1_r(wfac);
1023 gmx_simd_bool_t warn_S;
1025 int bs;
1027 assert(b0 % GMX_SIMD_REAL_WIDTH == 0);
1029 /* Initialize all to FALSE */
1030 warn_S = gmx_simd_cmplt_r(two_S, gmx_simd_setzero_r());
1032 for (bs = b0; bs < b1; bs += GMX_SIMD_REAL_WIDTH)
1034 gmx_simd_real_t rx_S, ry_S, rz_S, n2_S;
1035 gmx_simd_real_t len_S, len2_S, dlen2_S, lc_S, blc_S;
1037 gmx_hack_simd_gather_rvec_dist_pair_index(x, bla + bs*2, vbuf,
1038 &rx_S, &ry_S, &rz_S);
1040 pbc_correct_dx_simd(&rx_S, &ry_S, &rz_S, pbc_simd);
1042 n2_S = gmx_simd_norm2_r(rx_S, ry_S, rz_S);
1044 len_S = gmx_simd_load_r(bllen + bs);
1045 len2_S = gmx_simd_mul_r(len_S, len_S);
1047 dlen2_S = gmx_simd_fmsub_r(two_S, len2_S, n2_S);
1049 warn_S = gmx_simd_or_b(warn_S,
1050 gmx_simd_cmplt_r(dlen2_S,
1051 gmx_simd_mul_r(wfac_S, len2_S)));
1053 /* Avoid 1/0 by taking the max with REAL_MIN.
1054 * Note: when dlen2 is close to zero (90 degree constraint rotation),
1055 * the accuracy of the algorithm is no longer relevant.
1057 dlen2_S = gmx_simd_max_r(dlen2_S, min_S);
1059 lc_S = gmx_simd_fnmadd_r(dlen2_S, gmx_simd_invsqrt_r(dlen2_S), len_S);
1061 blc_S = gmx_simd_load_r(blc + bs);
1063 lc_S = gmx_simd_mul_r(blc_S, lc_S);
1065 gmx_simd_store_r(rhs + bs, lc_S);
1066 gmx_simd_store_r(sol + bs, lc_S);
1069 if (gmx_simd_anytrue_b(warn_S))
1071 *bWarn = TRUE;
1074 #endif /* LINCS_SIMD */
1076 static void do_lincs(rvec *x, rvec *xp, matrix box, t_pbc *pbc,
1077 struct gmx_lincsdata *lincsd, int th,
1078 const real *invmass,
1079 t_commrec *cr,
1080 gmx_bool bCalcDHDL,
1081 real wangle, gmx_bool *bWarn,
1082 real invdt, rvec * gmx_restrict v,
1083 gmx_bool bCalcVir, tensor vir_r_m_dr)
1085 int b0, b1, b, i, j, n, iter;
1086 int *bla, *blnr, *blbnb;
1087 rvec *r;
1088 real *blc, *blmf, *bllen, *blcc, *rhs1, *rhs2, *sol, *blc_sol, *mlambda;
1089 int *nlocat;
1091 b0 = lincsd->task[th].b0;
1092 b1 = lincsd->task[th].b1;
1094 bla = lincsd->bla;
1095 r = lincsd->tmpv;
1096 blnr = lincsd->blnr;
1097 blbnb = lincsd->blbnb;
1098 blc = lincsd->blc;
1099 blmf = lincsd->blmf;
1100 bllen = lincsd->bllen;
1101 blcc = lincsd->tmpncc;
1102 rhs1 = lincsd->tmp1;
1103 rhs2 = lincsd->tmp2;
1104 sol = lincsd->tmp3;
1105 blc_sol = lincsd->tmp4;
1106 mlambda = lincsd->mlambda;
1107 nlocat = lincsd->nlocat;
1109 #ifdef LINCS_SIMD
1111 /* This SIMD code does the same as the plain-C code after the #else.
1112 * The only difference is that we always call pbc code, as with SIMD
1113 * the overhead of pbc computation (when not needed) is small.
1115 pbc_simd_t pbc_simd;
1117 /* Convert the pbc struct for SIMD */
1118 set_pbc_simd(pbc, &pbc_simd);
1120 /* Compute normalized x i-j vectors, store in r.
1121 * Compute the inner product of r and xp i-j and store in rhs1.
1123 calc_dr_x_xp_simd(b0, b1, bla, x, xp, bllen, blc,
1124 &pbc_simd,
1125 lincsd->task[th].simd_buf,
1126 lincsd->task[th].simd_buf + GMX_SIMD_REAL_WIDTH*DIM,
1127 r, rhs1, sol);
1129 #else /* LINCS_SIMD */
1131 if (pbc)
1133 /* Compute normalized i-j vectors */
1134 for (b = b0; b < b1; b++)
1136 rvec dx;
1137 real mvb;
1139 pbc_dx_aiuc(pbc, x[bla[2*b]], x[bla[2*b+1]], dx);
1140 unitv(dx, r[b]);
1142 pbc_dx_aiuc(pbc, xp[bla[2*b]], xp[bla[2*b+1]], dx);
1143 mvb = blc[b]*(iprod(r[b], dx) - bllen[b]);
1144 rhs1[b] = mvb;
1145 sol[b] = mvb;
1148 else
1150 /* Compute normalized i-j vectors */
1151 for (b = b0; b < b1; b++)
1153 real tmp0, tmp1, tmp2, rlen, mvb;
1155 i = bla[2*b];
1156 j = bla[2*b+1];
1157 tmp0 = x[i][0] - x[j][0];
1158 tmp1 = x[i][1] - x[j][1];
1159 tmp2 = x[i][2] - x[j][2];
1160 rlen = gmx_invsqrt(tmp0*tmp0 + tmp1*tmp1 + tmp2*tmp2);
1161 r[b][0] = rlen*tmp0;
1162 r[b][1] = rlen*tmp1;
1163 r[b][2] = rlen*tmp2;
1164 /* 16 ncons flops */
1166 i = bla[2*b];
1167 j = bla[2*b+1];
1168 mvb = blc[b]*(r[b][0]*(xp[i][0] - xp[j][0]) +
1169 r[b][1]*(xp[i][1] - xp[j][1]) +
1170 r[b][2]*(xp[i][2] - xp[j][2]) - bllen[b]);
1171 rhs1[b] = mvb;
1172 sol[b] = mvb;
1173 /* 10 flops */
1175 /* Together: 26*ncons + 6*nrtot flops */
1178 #endif /* LINCS_SIMD */
1180 if (lincsd->bTaskDep)
1182 /* We need a barrier, since the matrix construction below
1183 * can access entries in r of other threads.
1185 #pragma omp barrier
1188 /* Construct the (sparse) LINCS matrix */
1189 for (b = b0; b < b1; b++)
1191 for (n = blnr[b]; n < blnr[b+1]; n++)
1193 blcc[n] = blmf[n]*iprod(r[b], r[blbnb[n]]);
1196 /* Together: 26*ncons + 6*nrtot flops */
1198 lincs_matrix_expand(lincsd, &lincsd->task[th], blcc, rhs1, rhs2, sol);
1199 /* nrec*(ncons+2*nrtot) flops */
1201 #ifndef LINCS_SIMD
1202 for (b = b0; b < b1; b++)
1204 mlambda[b] = blc[b]*sol[b];
1206 #else
1207 for (b = b0; b < b1; b += GMX_SIMD_REAL_WIDTH)
1209 gmx_simd_store_r(mlambda + b,
1210 gmx_simd_mul_r(gmx_simd_load_r(blc + b),
1211 gmx_simd_load_r(sol + b)));
1213 #endif
1215 /* Update the coordinates */
1216 lincs_update_atoms(lincsd, th, 1.0, mlambda, r, invmass, xp);
1219 ******** Correction for centripetal effects ********
1222 real wfac;
1224 wfac = cos(DEG2RAD*wangle);
1225 wfac = wfac*wfac;
1227 for (iter = 0; iter < lincsd->nIter; iter++)
1229 if ((lincsd->bCommIter && DOMAINDECOMP(cr) && cr->dd->constraints))
1231 #pragma omp barrier
1232 #pragma omp master
1234 /* Communicate the corrected non-local coordinates */
1235 if (DOMAINDECOMP(cr))
1237 dd_move_x_constraints(cr->dd, box, xp, NULL, FALSE);
1240 #pragma omp barrier
1242 else if (lincsd->bTaskDep)
1244 #pragma omp barrier
1247 #ifdef LINCS_SIMD
1248 calc_dist_iter_simd(b0, b1, bla, xp, bllen, blc, &pbc_simd, wfac,
1249 lincsd->task[th].simd_buf, rhs1, sol, bWarn);
1250 #else
1251 calc_dist_iter(b0, b1, bla, xp, bllen, blc, pbc, wfac,
1252 rhs1, sol, bWarn);
1253 /* 20*ncons flops */
1254 #endif
1256 lincs_matrix_expand(lincsd, &lincsd->task[th], blcc, rhs1, rhs2, sol);
1257 /* nrec*(ncons+2*nrtot) flops */
1259 #ifndef LINCS_SIMD
1260 for (b = b0; b < b1; b++)
1262 real mvb;
1264 mvb = blc[b]*sol[b];
1265 blc_sol[b] = mvb;
1266 mlambda[b] += mvb;
1268 #else
1269 for (b = b0; b < b1; b += GMX_SIMD_REAL_WIDTH)
1271 gmx_simd_real_t mvb;
1273 mvb = gmx_simd_mul_r(gmx_simd_load_r(blc + b),
1274 gmx_simd_load_r(sol + b));
1275 gmx_simd_store_r(blc_sol + b, mvb);
1276 gmx_simd_store_r(mlambda + b,
1277 gmx_simd_add_r(gmx_simd_load_r(mlambda + b), mvb));
1279 #endif
1281 /* Update the coordinates */
1282 lincs_update_atoms(lincsd, th, 1.0, blc_sol, r, invmass, xp);
1284 /* nit*ncons*(37+9*nrec) flops */
1286 if (v != NULL)
1288 /* Update the velocities */
1289 lincs_update_atoms(lincsd, th, invdt, mlambda, r, invmass, v);
1290 /* 16 ncons flops */
1293 if (nlocat != NULL && (bCalcDHDL || bCalcVir))
1295 if (lincsd->bTaskDep)
1297 /* In lincs_update_atoms threads might cross-read mlambda */
1298 #pragma omp barrier
1301 /* Only account for local atoms */
1302 for (b = b0; b < b1; b++)
1304 mlambda[b] *= 0.5*nlocat[b];
1308 if (bCalcDHDL)
1310 real dhdl;
1312 dhdl = 0;
1313 for (b = b0; b < b1; b++)
1315 /* Note that this this is dhdl*dt^2, the dt^2 factor is corrected
1316 * later after the contributions are reduced over the threads.
1318 dhdl -= lincsd->mlambda[b]*lincsd->ddist[b];
1320 lincsd->task[th].dhdlambda = dhdl;
1323 if (bCalcVir)
1325 /* Constraint virial */
1326 for (b = b0; b < b1; b++)
1328 real tmp0, tmp1;
1330 tmp0 = -bllen[b]*mlambda[b];
1331 for (i = 0; i < DIM; i++)
1333 tmp1 = tmp0*r[b][i];
1334 for (j = 0; j < DIM; j++)
1336 vir_r_m_dr[i][j] -= tmp1*r[b][j];
1339 } /* 22 ncons flops */
1342 /* Total:
1343 * 26*ncons + 6*nrtot + nrec*(ncons+2*nrtot)
1344 * + nit * (20*ncons + nrec*(ncons+2*nrtot) + 17 ncons)
1346 * (26+nrec)*ncons + (6+2*nrec)*nrtot
1347 * + nit * ((37+nrec)*ncons + 2*nrec*nrtot)
1348 * if nit=1
1349 * (63+nrec)*ncons + (6+4*nrec)*nrtot
1353 /* Sets the elements in the LINCS matrix for task li_task */
1354 static void set_lincs_matrix_task(struct gmx_lincsdata *li,
1355 lincs_task_t *li_task,
1356 const real *invmass,
1357 int *ncc_triangle)
1359 int i;
1361 /* Construct the coupling coefficient matrix blmf */
1362 li_task->ntriangle = 0;
1363 *ncc_triangle = 0;
1364 for (i = li_task->b0; i < li_task->b1; i++)
1366 int a1, a2, n;
1368 a1 = li->bla[2*i];
1369 a2 = li->bla[2*i+1];
1370 for (n = li->blnr[i]; (n < li->blnr[i+1]); n++)
1372 int k, sign, center, end;
1374 k = li->blbnb[n];
1376 /* If we are using multiple, independent tasks for LINCS,
1377 * the calls to check_assign_connected should have
1378 * put all connected constraints in our task.
1380 assert(li->bTaskDep || (k >= li_task->b0 && k < li_task->b1));
1382 if (a1 == li->bla[2*k] || a2 == li->bla[2*k+1])
1384 sign = -1;
1386 else
1388 sign = 1;
1390 if (a1 == li->bla[2*k] || a1 == li->bla[2*k+1])
1392 center = a1;
1393 end = a2;
1395 else
1397 center = a2;
1398 end = a1;
1400 li->blmf[n] = sign*invmass[center]*li->blc[i]*li->blc[k];
1401 li->blmf1[n] = sign*0.5;
1402 if (li->ncg_triangle > 0)
1404 int nk, kk;
1406 /* Look for constraint triangles */
1407 for (nk = li->blnr[k]; (nk < li->blnr[k+1]); nk++)
1409 kk = li->blbnb[nk];
1410 if (kk != i && kk != k &&
1411 (li->bla[2*kk] == end || li->bla[2*kk+1] == end))
1413 /* If we are using multiple tasks for LINCS,
1414 * the calls to check_assign_triangle should have
1415 * put all constraints in the triangle in our task.
1417 assert(k >= li_task->b0 && k < li_task->b1);
1418 assert(kk >= li_task->b0 && kk < li_task->b1);
1420 if (li_task->ntriangle == 0 ||
1421 li_task->triangle[li_task->ntriangle - 1] < i)
1423 /* Add this constraint to the triangle list */
1424 li_task->triangle[li_task->ntriangle] = i;
1425 li_task->tri_bits[li_task->ntriangle] = 0;
1426 li_task->ntriangle++;
1427 if (li->blnr[i+1] - li->blnr[i] > static_cast<int>(sizeof(li_task->tri_bits[0])*8 - 1))
1429 gmx_fatal(FARGS, "A constraint is connected to %d constraints, this is more than the %d allowed for constraints participating in triangles",
1430 li->blnr[i+1] - li->blnr[i],
1431 sizeof(li_task->tri_bits[0])*8-1);
1434 li_task->tri_bits[li_task->ntriangle-1] |= (1 << (n - li->blnr[i]));
1435 (*ncc_triangle)++;
1443 /* Sets the elements in the LINCS matrix */
1444 void set_lincs_matrix(struct gmx_lincsdata *li, real *invmass, real lambda)
1446 int i;
1447 const real invsqrt2 = 0.7071067811865475244;
1449 for (i = 0; (i < li->nc); i++)
1451 int a1, a2;
1453 a1 = li->bla[2*i];
1454 a2 = li->bla[2*i+1];
1455 li->blc[i] = gmx_invsqrt(invmass[a1] + invmass[a2]);
1456 li->blc1[i] = invsqrt2;
1459 /* Construct the coupling coefficient matrix blmf */
1460 int th, ntriangle = 0, ncc_triangle = 0;
1461 #pragma omp parallel for reduction(+: ntriangle, ncc_triangle) num_threads(li->ntask) schedule(static)
1462 for (th = 0; th < li->ntask; th++)
1466 set_lincs_matrix_task(li, &li->task[th], invmass, &ncc_triangle);
1467 ntriangle = li->task[th].ntriangle;
1469 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1471 li->ntriangle = ntriangle;
1472 li->ncc_triangle = ncc_triangle;
1474 if (debug)
1476 fprintf(debug, "Of the %d constraints %d participate in triangles\n",
1477 li->nc, li->ntriangle);
1478 fprintf(debug, "There are %d couplings of which %d in triangles\n",
1479 li->ncc, li->ncc_triangle);
1482 /* Set matlam,
1483 * so we know with which lambda value the masses have been set.
1485 li->matlam = lambda;
1488 static int count_triangle_constraints(const t_ilist *ilist,
1489 const t_blocka *at2con)
1491 int ncon1, ncon_tot;
1492 int c0, a00, a01, n1, c1, a10, a11, ac1, n2, c2, a20, a21;
1493 int ncon_triangle;
1494 gmx_bool bTriangle;
1495 t_iatom *ia1, *ia2, *iap;
1497 ncon1 = ilist[F_CONSTR].nr/3;
1498 ncon_tot = ncon1 + ilist[F_CONSTRNC].nr/3;
1500 ia1 = ilist[F_CONSTR].iatoms;
1501 ia2 = ilist[F_CONSTRNC].iatoms;
1503 ncon_triangle = 0;
1504 for (c0 = 0; c0 < ncon_tot; c0++)
1506 bTriangle = FALSE;
1507 iap = constr_iatomptr(ncon1, ia1, ia2, c0);
1508 a00 = iap[1];
1509 a01 = iap[2];
1510 for (n1 = at2con->index[a01]; n1 < at2con->index[a01+1]; n1++)
1512 c1 = at2con->a[n1];
1513 if (c1 != c0)
1515 iap = constr_iatomptr(ncon1, ia1, ia2, c1);
1516 a10 = iap[1];
1517 a11 = iap[2];
1518 if (a10 == a01)
1520 ac1 = a11;
1522 else
1524 ac1 = a10;
1526 for (n2 = at2con->index[ac1]; n2 < at2con->index[ac1+1]; n2++)
1528 c2 = at2con->a[n2];
1529 if (c2 != c0 && c2 != c1)
1531 iap = constr_iatomptr(ncon1, ia1, ia2, c2);
1532 a20 = iap[1];
1533 a21 = iap[2];
1534 if (a20 == a00 || a21 == a00)
1536 bTriangle = TRUE;
1542 if (bTriangle)
1544 ncon_triangle++;
1548 return ncon_triangle;
1551 static gmx_bool more_than_two_sequential_constraints(const t_ilist *ilist,
1552 const t_blocka *at2con)
1554 t_iatom *ia1, *ia2, *iap;
1555 int ncon1, ncon_tot, c;
1556 int a1, a2;
1557 gmx_bool bMoreThanTwoSequentialConstraints;
1559 ncon1 = ilist[F_CONSTR].nr/3;
1560 ncon_tot = ncon1 + ilist[F_CONSTRNC].nr/3;
1562 ia1 = ilist[F_CONSTR].iatoms;
1563 ia2 = ilist[F_CONSTRNC].iatoms;
1565 bMoreThanTwoSequentialConstraints = FALSE;
1566 for (c = 0; c < ncon_tot && !bMoreThanTwoSequentialConstraints; c++)
1568 iap = constr_iatomptr(ncon1, ia1, ia2, c);
1569 a1 = iap[1];
1570 a2 = iap[2];
1571 /* Check if this constraint has constraints connected at both atoms */
1572 if (at2con->index[a1+1] - at2con->index[a1] > 1 &&
1573 at2con->index[a2+1] - at2con->index[a2] > 1)
1575 bMoreThanTwoSequentialConstraints = TRUE;
1579 return bMoreThanTwoSequentialConstraints;
1582 static int int_comp(const void *a, const void *b)
1584 return (*(int *)a) - (*(int *)b);
1587 gmx_lincsdata_t init_lincs(FILE *fplog, const gmx_mtop_t *mtop,
1588 int nflexcon_global, const t_blocka *at2con,
1589 gmx_bool bPLINCS, int nIter, int nProjOrder)
1591 struct gmx_lincsdata *li;
1592 int mt, mb;
1593 gmx_moltype_t *molt;
1594 gmx_bool bMoreThanTwoSeq;
1596 if (fplog)
1598 fprintf(fplog, "\nInitializing%s LINear Constraint Solver\n",
1599 bPLINCS ? " Parallel" : "");
1602 snew(li, 1);
1604 li->ncg =
1605 gmx_mtop_ftype_count(mtop, F_CONSTR) +
1606 gmx_mtop_ftype_count(mtop, F_CONSTRNC);
1607 li->ncg_flex = nflexcon_global;
1609 li->nIter = nIter;
1610 li->nOrder = nProjOrder;
1612 li->max_connect = 0;
1613 for (mt = 0; mt < mtop->nmoltype; mt++)
1615 int a;
1617 molt = &mtop->moltype[mt];
1618 for (a = 0; a < molt->atoms.nr; a++)
1620 li->max_connect = std::max(li->max_connect,
1621 at2con[mt].index[a + 1] - at2con[mt].index[a]);
1625 li->ncg_triangle = 0;
1626 bMoreThanTwoSeq = FALSE;
1627 for (mb = 0; mb < mtop->nmolblock; mb++)
1629 molt = &mtop->moltype[mtop->molblock[mb].type];
1631 li->ncg_triangle +=
1632 mtop->molblock[mb].nmol*
1633 count_triangle_constraints(molt->ilist,
1634 &at2con[mtop->molblock[mb].type]);
1636 if (!bMoreThanTwoSeq &&
1637 more_than_two_sequential_constraints(molt->ilist, &at2con[mtop->molblock[mb].type]))
1639 bMoreThanTwoSeq = TRUE;
1643 /* Check if we need to communicate not only before LINCS,
1644 * but also before each iteration.
1645 * The check for only two sequential constraints is only
1646 * useful for the common case of H-bond only constraints.
1647 * With more effort we could also make it useful for small
1648 * molecules with nr. sequential constraints <= nOrder-1.
1650 li->bCommIter = (bPLINCS && (li->nOrder < 1 || bMoreThanTwoSeq));
1652 if (debug && bPLINCS)
1654 fprintf(debug, "PLINCS communication before each iteration: %d\n",
1655 li->bCommIter);
1658 /* LINCS can run on any number of threads.
1659 * Currently the number is fixed for the whole simulation,
1660 * but it could be set in set_lincs().
1661 * The current constraint to task assignment code can create independent
1662 * tasks only when not more than two constraints are connected sequentially.
1664 li->ntask = gmx_omp_nthreads_get(emntLINCS);
1665 li->bTaskDep = (li->ntask > 1 && bMoreThanTwoSeq);
1666 if (debug)
1668 fprintf(debug, "LINCS: using %d threads, tasks are %sdependent\n",
1669 li->ntask, li->bTaskDep ? "" : "in");
1671 if (li->ntask == 1)
1673 snew(li->task, 1);
1675 else
1677 /* Allocate an extra elements for "task-overlap" constraints */
1678 snew(li->task, li->ntask + 1);
1680 int th;
1681 #pragma omp parallel for num_threads(li->ntask)
1682 for (th = 0; th < li->ntask; th++)
1686 /* Per thread SIMD load buffer for loading 2 simd_width rvecs */
1687 snew_aligned(li->task[th].simd_buf, 2*simd_width*DIM,
1688 align_bytes);
1690 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1693 if (bPLINCS || li->ncg_triangle > 0)
1695 please_cite(fplog, "Hess2008a");
1697 else
1699 please_cite(fplog, "Hess97a");
1702 if (fplog)
1704 fprintf(fplog, "The number of constraints is %d\n", li->ncg);
1705 if (bPLINCS)
1707 fprintf(fplog, "There are inter charge-group constraints,\n"
1708 "will communicate selected coordinates each lincs iteration\n");
1710 if (li->ncg_triangle > 0)
1712 fprintf(fplog,
1713 "%d constraints are involved in constraint triangles,\n"
1714 "will apply an additional matrix expansion of order %d for couplings\n"
1715 "between constraints inside triangles\n",
1716 li->ncg_triangle, li->nOrder);
1720 return li;
1723 /* Sets up the work division over the threads */
1724 static void lincs_thread_setup(struct gmx_lincsdata *li, int natoms)
1726 lincs_task_t *li_m;
1727 int th;
1728 gmx_bitmask_t *atf;
1729 int a;
1731 if (natoms > li->atf_nalloc)
1733 li->atf_nalloc = over_alloc_large(natoms);
1734 srenew(li->atf, li->atf_nalloc);
1737 atf = li->atf;
1738 /* Clear the atom flags */
1739 for (a = 0; a < natoms; a++)
1741 bitmask_clear(&atf[a]);
1744 if (li->ntask > BITMASK_SIZE)
1746 gmx_fatal(FARGS, "More than %d threads is not supported for LINCS.", BITMASK_SIZE);
1749 for (th = 0; th < li->ntask; th++)
1751 lincs_task_t *li_task;
1752 int b;
1754 li_task = &li->task[th];
1756 /* For each atom set a flag for constraints from each */
1757 for (b = li_task->b0; b < li_task->b1; b++)
1759 bitmask_set_bit(&atf[li->bla[b*2 ]], th);
1760 bitmask_set_bit(&atf[li->bla[b*2 + 1]], th);
1764 #pragma omp parallel for num_threads(li->ntask) schedule(static)
1765 for (th = 0; th < li->ntask; th++)
1769 lincs_task_t *li_task;
1770 gmx_bitmask_t mask;
1771 int b;
1773 li_task = &li->task[th];
1775 if (li_task->b1 - li_task->b0 > li_task->ind_nalloc)
1777 li_task->ind_nalloc = over_alloc_large(li_task->b1-li_task->b0);
1778 srenew(li_task->ind, li_task->ind_nalloc);
1779 srenew(li_task->ind_r, li_task->ind_nalloc);
1782 bitmask_init_low_bits(&mask, th);
1784 li_task->nind = 0;
1785 li_task->nind_r = 0;
1786 for (b = li_task->b0; b < li_task->b1; b++)
1788 /* We let the constraint with the lowest thread index
1789 * operate on atoms with constraints from multiple threads.
1791 if (bitmask_is_disjoint(atf[li->bla[b*2]], mask) &&
1792 bitmask_is_disjoint(atf[li->bla[b*2+1]], mask))
1794 /* Add the constraint to the local atom update index */
1795 li_task->ind[li_task->nind++] = b;
1797 else
1799 /* Add the constraint to the rest block */
1800 li_task->ind_r[li_task->nind_r++] = b;
1804 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1807 /* We need to copy all constraints which have not be assigned
1808 * to a thread to a separate list which will be handled by one thread.
1810 li_m = &li->task[li->ntask];
1812 li_m->nind = 0;
1813 for (th = 0; th < li->ntask; th++)
1815 lincs_task_t *li_task;
1816 int b;
1818 li_task = &li->task[th];
1820 if (li_m->nind + li_task->nind_r > li_m->ind_nalloc)
1822 li_m->ind_nalloc = over_alloc_large(li_m->nind+li_task->nind_r);
1823 srenew(li_m->ind, li_m->ind_nalloc);
1826 for (b = 0; b < li_task->nind_r; b++)
1828 li_m->ind[li_m->nind++] = li_task->ind_r[b];
1831 if (debug)
1833 fprintf(debug, "LINCS thread %d: %d constraints\n",
1834 th, li_task->nind);
1838 if (debug)
1840 fprintf(debug, "LINCS thread r: %d constraints\n",
1841 li_m->nind);
1845 /* There is no realloc with alignment, so here we make one for reals.
1846 * Note that this function does not preserve the contents of the memory.
1848 static void resize_real_aligned(real **ptr, int nelem)
1850 sfree_aligned(*ptr);
1851 snew_aligned(*ptr, nelem, align_bytes);
1854 static void assign_constraint(struct gmx_lincsdata *li,
1855 int constraint_index,
1856 int a1, int a2,
1857 real lenA, real lenB,
1858 const t_blocka *at2con)
1860 int con;
1862 con = li->nc;
1864 /* Make an mapping of local topology constraint index to LINCS index */
1865 li->con_index[constraint_index] = con;
1867 li->bllen0[con] = lenA;
1868 li->ddist[con] = lenB - lenA;
1869 /* Set the length to the topology A length */
1870 li->bllen[con] = lenA;
1871 li->bla[2*con] = a1;
1872 li->bla[2*con+1] = a2;
1874 /* Make space in the constraint connection matrix for constraints
1875 * connected to both end of the current constraint.
1877 li->ncc +=
1878 at2con->index[a1 + 1] - at2con->index[a1] - 1 +
1879 at2con->index[a2 + 1] - at2con->index[a2] - 1;
1881 li->blnr[con + 1] = li->ncc;
1883 /* Increase the constraint count */
1884 li->nc++;
1887 /* Check if constraint with topology index constraint_index is connected
1888 * to other constraints, and if so add those connected constraints to our task.
1890 static void check_assign_connected(struct gmx_lincsdata *li,
1891 const t_iatom *iatom,
1892 const t_idef *idef,
1893 int bDynamics,
1894 int a1, int a2,
1895 const t_blocka *at2con)
1897 /* Currently this function only supports constraint groups
1898 * in which all constraints share at least one atom
1899 * (e.g. H-bond constraints).
1900 * Check both ends of the current constraint for
1901 * connected constraints. We need to assign those
1902 * to the same task.
1904 int end;
1906 for (end = 0; end < 2; end++)
1908 int a, k;
1910 a = (end == 0 ? a1 : a2);
1912 for (k = at2con->index[a]; k < at2con->index[a + 1]; k++)
1914 int cc;
1916 cc = at2con->a[k];
1917 /* Check if constraint cc has not yet been assigned */
1918 if (li->con_index[cc] == -1)
1920 int type;
1921 real lenA, lenB;
1923 type = iatom[cc*3];
1924 lenA = idef->iparams[type].constr.dA;
1925 lenB = idef->iparams[type].constr.dB;
1927 if (bDynamics || lenA != 0 || lenB != 0)
1929 assign_constraint(li, cc, iatom[3*cc + 1], iatom[3*cc + 2], lenA, lenB, at2con);
1936 /* Check if constraint with topology index constraint_index is involved
1937 * in a constraint triangle, and if so add the other two constraints
1938 * in the triangle to our task.
1940 static void check_assign_triangle(struct gmx_lincsdata *li,
1941 const t_iatom *iatom,
1942 const t_idef *idef,
1943 int bDynamics,
1944 int constraint_index,
1945 int a1, int a2,
1946 const t_blocka *at2con)
1948 int nca, cc[32], ca[32], k;
1949 int c_triangle[2] = { -1, -1 };
1951 nca = 0;
1952 for (k = at2con->index[a1]; k < at2con->index[a1 + 1]; k++)
1954 int c;
1956 c = at2con->a[k];
1957 if (c != constraint_index)
1959 int aa1, aa2;
1961 aa1 = iatom[c*3 + 1];
1962 aa2 = iatom[c*3 + 2];
1963 if (aa1 != a1)
1965 cc[nca] = c;
1966 ca[nca] = aa1;
1967 nca++;
1969 if (aa2 != a1)
1971 cc[nca] = c;
1972 ca[nca] = aa2;
1973 nca++;
1978 for (k = at2con->index[a2]; k < at2con->index[a2 + 1]; k++)
1980 int c;
1982 c = at2con->a[k];
1983 if (c != constraint_index)
1985 int aa1, aa2, i;
1987 aa1 = iatom[c*3 + 1];
1988 aa2 = iatom[c*3 + 2];
1989 if (aa1 != a2)
1991 for (i = 0; i < nca; i++)
1993 if (aa1 == ca[i])
1995 c_triangle[0] = cc[i];
1996 c_triangle[1] = c;
2000 if (aa2 != a2)
2002 for (i = 0; i < nca; i++)
2004 if (aa2 == ca[i])
2006 c_triangle[0] = cc[i];
2007 c_triangle[1] = c;
2014 if (c_triangle[0] >= 0)
2016 int end;
2018 for (end = 0; end < 2; end++)
2020 /* Check if constraint c_triangle[end] has not yet been assigned */
2021 if (li->con_index[c_triangle[end]] == -1)
2023 int i, type;
2024 real lenA, lenB;
2026 i = c_triangle[end]*3;
2027 type = iatom[i];
2028 lenA = idef->iparams[type].constr.dA;
2029 lenB = idef->iparams[type].constr.dB;
2031 if (bDynamics || lenA != 0 || lenB != 0)
2033 assign_constraint(li, c_triangle[end], iatom[i + 1], iatom[i + 2], lenA, lenB, at2con);
2040 static void set_matrix_indices(struct gmx_lincsdata *li,
2041 const lincs_task_t *li_task,
2042 const t_blocka *at2con,
2043 gmx_bool bSortMatrix)
2045 int b;
2047 for (b = li_task->b0; b < li_task->b1; b++)
2049 int a1, a2, i, k;
2051 a1 = li->bla[b*2];
2052 a2 = li->bla[b*2 + 1];
2054 i = li->blnr[b];
2055 for (k = at2con->index[a1]; k < at2con->index[a1 + 1]; k++)
2057 int concon;
2059 concon = li->con_index[at2con->a[k]];
2060 if (concon != b)
2062 li->blbnb[i++] = concon;
2065 for (k = at2con->index[a2]; k < at2con->index[a2 + 1]; k++)
2067 int concon;
2069 concon = li->con_index[at2con->a[k]];
2070 if (concon != b)
2072 li->blbnb[i++] = concon;
2076 if (bSortMatrix)
2078 /* Order the blbnb matrix to optimize memory access */
2079 qsort(&(li->blbnb[li->blnr[b]]), li->blnr[b + 1] - li->blnr[b],
2080 sizeof(li->blbnb[0]), int_comp);
2085 void set_lincs(const t_idef *idef,
2086 const t_mdatoms *md,
2087 gmx_bool bDynamics,
2088 t_commrec *cr,
2089 struct gmx_lincsdata *li)
2091 int natoms, nflexcon;
2092 t_blocka at2con;
2093 t_iatom *iatom;
2094 int i, ncc_alloc_old, ncon_tot;
2096 li->nc_real = 0;
2097 li->nc = 0;
2098 li->ncc = 0;
2099 /* Zero the thread index ranges.
2100 * Otherwise without local constraints we could return with old ranges.
2102 for (i = 0; i < li->ntask; i++)
2104 li->task[i].b0 = 0;
2105 li->task[i].b1 = 0;
2106 li->task[i].nind = 0;
2108 if (li->ntask > 1)
2110 li->task[li->ntask].nind = 0;
2113 /* This is the local topology, so there are only F_CONSTR constraints */
2114 if (idef->il[F_CONSTR].nr == 0)
2116 /* There are no constraints,
2117 * we do not need to fill any data structures.
2119 return;
2122 if (debug)
2124 fprintf(debug, "Building the LINCS connectivity\n");
2127 if (DOMAINDECOMP(cr))
2129 if (cr->dd->constraints)
2131 int start;
2133 dd_get_constraint_range(cr->dd, &start, &natoms);
2135 else
2137 natoms = cr->dd->nat_home;
2140 else
2142 natoms = md->homenr;
2144 at2con = make_at2con(0, natoms, idef->il, idef->iparams, bDynamics,
2145 &nflexcon);
2147 ncon_tot = idef->il[F_CONSTR].nr/3;
2149 /* Ensure we have enough padding for aligned loads for each thread */
2150 if (ncon_tot + li->ntask*simd_width > li->nc_alloc || li->nc_alloc == 0)
2152 li->nc_alloc = over_alloc_dd(ncon_tot + li->ntask*simd_width);
2153 srenew(li->con_index, li->nc_alloc);
2154 resize_real_aligned(&li->bllen0, li->nc_alloc);
2155 resize_real_aligned(&li->ddist, li->nc_alloc);
2156 srenew(li->bla, 2*li->nc_alloc);
2157 resize_real_aligned(&li->blc, li->nc_alloc);
2158 resize_real_aligned(&li->blc1, li->nc_alloc);
2159 srenew(li->blnr, li->nc_alloc + 1);
2160 resize_real_aligned(&li->bllen, li->nc_alloc);
2161 srenew(li->tmpv, li->nc_alloc);
2162 if (DOMAINDECOMP(cr))
2164 srenew(li->nlocat, li->nc_alloc);
2166 resize_real_aligned(&li->tmp1, li->nc_alloc);
2167 resize_real_aligned(&li->tmp2, li->nc_alloc);
2168 resize_real_aligned(&li->tmp3, li->nc_alloc);
2169 resize_real_aligned(&li->tmp4, li->nc_alloc);
2170 resize_real_aligned(&li->mlambda, li->nc_alloc);
2173 iatom = idef->il[F_CONSTR].iatoms;
2175 ncc_alloc_old = li->ncc_alloc;
2176 li->blnr[0] = li->ncc;
2178 /* Assign the constraints for li->ntask LINCS tasks.
2179 * We target a uniform distribution of constraints over the tasks.
2180 * Note that when flexible constraints are present, but are removed here
2181 * (e.g. because we are doing EM) we get imbalance, but since that doesn't
2182 * happen during normal MD, that's ok.
2184 int ncon_assign, ncon_target, con, th;
2186 /* Determine the number of constraints we need to assign here */
2187 ncon_assign = ncon_tot;
2188 if (!bDynamics)
2190 /* With energy minimization, flexible constraints are ignored
2191 * (and thus minimized, as they should be).
2193 ncon_assign -= nflexcon;
2196 /* Set the target constraint count per task to exactly uniform,
2197 * this might be overridden below.
2199 ncon_target = (ncon_assign + li->ntask - 1)/li->ntask;
2201 /* Mark all constraints as unassigned by setting their index to -1 */
2202 for (con = 0; con < ncon_tot; con++)
2204 li->con_index[con] = -1;
2207 con = 0;
2208 for (th = 0; th < li->ntask; th++)
2210 lincs_task_t *li_task;
2212 li_task = &li->task[th];
2214 #ifdef LINCS_SIMD
2215 /* With indepedent tasks we likely have H-bond constraints or constraint
2216 * pairs. The connected constraints will be pulled into the task, so the
2217 * constraints per task will often exceed ncon_target.
2218 * Triangle constraints can also increase the count, but there are
2219 * relatively few of those, so we usually expect to get ncon_target.
2221 if (li->bTaskDep)
2223 /* We round ncon_target to a multiple of GMX_SIMD_WIDTH,
2224 * since otherwise a lot of operations can be wasted.
2225 * There are several ways to round here, we choose the one
2226 * that alternates block sizes, which helps with Intel HT.
2228 ncon_target = ((ncon_assign*(th + 1))/li->ntask - li->nc_real + GMX_SIMD_REAL_WIDTH - 1) & ~(GMX_SIMD_REAL_WIDTH - 1);
2230 #endif
2232 /* Continue filling the arrays where we left off with the previous task,
2233 * including padding for SIMD.
2235 li_task->b0 = li->nc;
2237 while (con < ncon_tot && li->nc - li_task->b0 < ncon_target)
2239 if (li->con_index[con] == -1)
2241 int type, a1, a2;
2242 real lenA, lenB;
2244 type = iatom[3*con];
2245 a1 = iatom[3*con + 1];
2246 a2 = iatom[3*con + 2];
2247 lenA = idef->iparams[type].constr.dA;
2248 lenB = idef->iparams[type].constr.dB;
2249 /* Skip the flexible constraints when not doing dynamics */
2250 if (bDynamics || lenA != 0 || lenB != 0)
2252 assign_constraint(li, con, a1, a2, lenA, lenB, &at2con);
2254 if (li->ntask > 1 && !li->bTaskDep)
2256 /* We can generate independent tasks. Check if we
2257 * need to assign connected constraints to our task.
2259 check_assign_connected(li, iatom, idef, bDynamics,
2260 a1, a2, &at2con);
2262 if (li->ntask > 1 && li->ncg_triangle > 0)
2264 /* Ensure constraints in one triangle are assigned
2265 * to the same task.
2267 check_assign_triangle(li, iatom, idef, bDynamics,
2268 con, a1, a2, &at2con);
2273 con++;
2276 li_task->b1 = li->nc;
2278 if (simd_width > 1)
2280 /* Copy the last atom pair indices and lengths for constraints
2281 * up to a multiple of simd_width, such that we can do all
2282 * SIMD operations without having to worry about end effects.
2284 int i, last;
2286 li->nc = ((li_task->b1 + simd_width - 1)/simd_width)*simd_width;
2287 last = li_task->b1 - 1;
2288 for (i = li_task->b1; i < li->nc; i++)
2290 li->bla[i*2 ] = li->bla[last*2 ];
2291 li->bla[i*2 + 1] = li->bla[last*2 + 1];
2292 li->bllen0[i] = li->bllen0[last];
2293 li->ddist[i] = li->ddist[last];
2294 li->bllen[i] = li->bllen[last];
2295 li->blnr[i + 1] = li->blnr[last + 1];
2299 /* Keep track of how many constraints we assigned */
2300 li->nc_real += li_task->b1 - li_task->b0;
2302 if (debug)
2304 fprintf(debug, "LINCS task %d constraints %d - %d\n",
2305 th, li_task->b0, li_task->b1);
2309 assert(li->nc_real == ncon_assign);
2311 gmx_bool bSortMatrix;
2313 /* Without DD we order the blbnb matrix to optimize memory access.
2314 * With DD the overhead of sorting is more than the gain during access.
2316 bSortMatrix = !DOMAINDECOMP(cr);
2318 if (li->ncc > li->ncc_alloc)
2320 li->ncc_alloc = over_alloc_small(li->ncc);
2321 srenew(li->blbnb, li->ncc_alloc);
2324 #pragma omp parallel for num_threads(li->ntask) schedule(static)
2325 for (th = 0; th < li->ntask; th++)
2329 lincs_task_t *li_task;
2331 li_task = &li->task[th];
2333 if (li->ncg_triangle > 0 &&
2334 li_task->b1 - li_task->b0 > li_task->tri_alloc)
2336 /* This is allocating too much, but it is difficult to improve */
2337 li_task->tri_alloc = over_alloc_dd(li_task->b1 - li_task->b0);
2338 srenew(li_task->triangle, li_task->tri_alloc);
2339 srenew(li_task->tri_bits, li_task->tri_alloc);
2342 set_matrix_indices(li, li_task, &at2con, bSortMatrix);
2344 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2347 done_blocka(&at2con);
2349 if (cr->dd == NULL)
2351 /* Since the matrix is static, we should free some memory */
2352 li->ncc_alloc = li->ncc;
2353 srenew(li->blbnb, li->ncc_alloc);
2356 if (li->ncc_alloc > ncc_alloc_old)
2358 srenew(li->blmf, li->ncc_alloc);
2359 srenew(li->blmf1, li->ncc_alloc);
2360 srenew(li->tmpncc, li->ncc_alloc);
2363 if (DOMAINDECOMP(cr) && dd_constraints_nlocalatoms(cr->dd) != NULL)
2365 int *nlocat_dd;
2367 nlocat_dd = dd_constraints_nlocalatoms(cr->dd);
2369 /* Convert nlocat from local topology to LINCS constraint indexing */
2370 for (con = 0; con < ncon_tot; con++)
2372 li->nlocat[li->con_index[con]] = nlocat_dd[con];
2375 else
2377 li->nlocat = NULL;
2380 if (debug)
2382 fprintf(debug, "Number of constraints is %d, padded %d, couplings %d\n",
2383 li->nc_real, li->nc, li->ncc);
2386 if (li->ntask > 1)
2388 lincs_thread_setup(li, md->nr);
2391 set_lincs_matrix(li, md->invmass, md->lambda);
2394 static void lincs_warning(FILE *fplog,
2395 gmx_domdec_t *dd, rvec *x, rvec *xprime, t_pbc *pbc,
2396 int ncons, int *bla, real *bllen, real wangle,
2397 int maxwarn, int *warncount)
2399 int b, i, j;
2400 rvec v0, v1;
2401 real wfac, d0, d1, cosine;
2402 char buf[STRLEN];
2404 wfac = cos(DEG2RAD*wangle);
2406 sprintf(buf, "bonds that rotated more than %g degrees:\n"
2407 " atom 1 atom 2 angle previous, current, constraint length\n",
2408 wangle);
2409 fprintf(stderr, "%s", buf);
2410 if (fplog)
2412 fprintf(fplog, "%s", buf);
2415 for (b = 0; b < ncons; b++)
2417 i = bla[2*b];
2418 j = bla[2*b+1];
2419 if (pbc)
2421 pbc_dx_aiuc(pbc, x[i], x[j], v0);
2422 pbc_dx_aiuc(pbc, xprime[i], xprime[j], v1);
2424 else
2426 rvec_sub(x[i], x[j], v0);
2427 rvec_sub(xprime[i], xprime[j], v1);
2429 d0 = norm(v0);
2430 d1 = norm(v1);
2431 cosine = iprod(v0, v1)/(d0*d1);
2432 if (cosine < wfac)
2434 sprintf(buf, " %6d %6d %5.1f %8.4f %8.4f %8.4f\n",
2435 ddglatnr(dd, i), ddglatnr(dd, j),
2436 RAD2DEG*acos(cosine), d0, d1, bllen[b]);
2437 fprintf(stderr, "%s", buf);
2438 if (fplog)
2440 fprintf(fplog, "%s", buf);
2442 if (!std::isfinite(d1))
2444 gmx_fatal(FARGS, "Bond length not finite.");
2447 (*warncount)++;
2450 if (*warncount > maxwarn)
2452 too_many_constraint_warnings(econtLINCS, *warncount);
2456 static void cconerr(const struct gmx_lincsdata *lincsd,
2457 rvec *x, t_pbc *pbc,
2458 real *ncons_loc, real *ssd, real *max, int *imax)
2460 const int *bla, *nlocat;
2461 const real *bllen;
2462 real ma, ssd2;
2463 int count, im, task;
2465 bla = lincsd->bla;
2466 bllen = lincsd->bllen;
2467 nlocat = lincsd->nlocat;
2469 ma = 0;
2470 ssd2 = 0;
2471 im = 0;
2472 count = 0;
2473 for (task = 0; task < lincsd->ntask; task++)
2475 int b;
2477 for (b = lincsd->task[task].b0; b < lincsd->task[task].b1; b++)
2479 real len, d, r2;
2480 rvec dx;
2482 if (pbc)
2484 pbc_dx_aiuc(pbc, x[bla[2*b]], x[bla[2*b+1]], dx);
2486 else
2488 rvec_sub(x[bla[2*b]], x[bla[2*b+1]], dx);
2490 r2 = norm2(dx);
2491 len = r2*gmx_invsqrt(r2);
2492 d = fabs(len/bllen[b]-1);
2493 if (d > ma && (nlocat == NULL || nlocat[b]))
2495 ma = d;
2496 im = b;
2498 if (nlocat == NULL)
2500 ssd2 += d*d;
2501 count++;
2503 else
2505 ssd2 += nlocat[b]*d*d;
2506 count += nlocat[b];
2511 *ncons_loc = (nlocat ? 0.5 : 1)*count;
2512 *ssd = (nlocat ? 0.5 : 1)*ssd2;
2513 *max = ma;
2514 *imax = im;
2517 gmx_bool constrain_lincs(FILE *fplog, gmx_bool bLog, gmx_bool bEner,
2518 t_inputrec *ir,
2519 gmx_int64_t step,
2520 struct gmx_lincsdata *lincsd, t_mdatoms *md,
2521 t_commrec *cr,
2522 rvec *x, rvec *xprime, rvec *min_proj,
2523 matrix box, t_pbc *pbc,
2524 real lambda, real *dvdlambda,
2525 real invdt, rvec *v,
2526 gmx_bool bCalcVir, tensor vir_r_m_dr,
2527 int econq,
2528 t_nrnb *nrnb,
2529 int maxwarn, int *warncount)
2531 gmx_bool bCalcDHDL;
2532 char buf[STRLEN], buf2[22], buf3[STRLEN];
2533 int i, p_imax;
2534 real ncons_loc, p_ssd, p_max = 0;
2535 rvec dx;
2536 gmx_bool bOK, bWarn;
2538 bOK = TRUE;
2540 /* This boolean should be set by a flag passed to this routine.
2541 * We can also easily check if any constraint length is changed,
2542 * if not dH/dlambda=0 and we can also set the boolean to FALSE.
2544 bCalcDHDL = (ir->efep != efepNO && dvdlambda != NULL);
2546 if (lincsd->nc == 0 && cr->dd == NULL)
2548 if (bLog || bEner)
2550 lincsd->rmsd_data[0] = 0;
2551 if (ir->eI == eiSD2 && v == NULL)
2553 i = 2;
2555 else
2557 i = 1;
2559 lincsd->rmsd_data[i] = 0;
2562 return bOK;
2565 if (econq == econqCoord)
2567 /* We can't use bCalcDHDL here, since NULL can be passed for dvdlambda
2568 * also with efep!=fepNO.
2570 if (ir->efep != efepNO)
2572 if (md->nMassPerturbed && lincsd->matlam != md->lambda)
2574 set_lincs_matrix(lincsd, md->invmass, md->lambda);
2577 for (i = 0; i < lincsd->nc; i++)
2579 lincsd->bllen[i] = lincsd->bllen0[i] + lambda*lincsd->ddist[i];
2583 if (lincsd->ncg_flex)
2585 /* Set the flexible constraint lengths to the old lengths */
2586 if (pbc != NULL)
2588 for (i = 0; i < lincsd->nc; i++)
2590 if (lincsd->bllen[i] == 0)
2592 pbc_dx_aiuc(pbc, x[lincsd->bla[2*i]], x[lincsd->bla[2*i+1]], dx);
2593 lincsd->bllen[i] = norm(dx);
2597 else
2599 for (i = 0; i < lincsd->nc; i++)
2601 if (lincsd->bllen[i] == 0)
2603 lincsd->bllen[i] =
2604 sqrt(distance2(x[lincsd->bla[2*i]],
2605 x[lincsd->bla[2*i+1]]));
2611 if (bLog && fplog)
2613 cconerr(lincsd, xprime, pbc,
2614 &ncons_loc, &p_ssd, &p_max, &p_imax);
2617 /* This bWarn var can be updated by multiple threads
2618 * at the same time. But as we only need to detect
2619 * if a warning occured or not, this is not an issue.
2621 bWarn = FALSE;
2623 /* The OpenMP parallel region of constrain_lincs for coords */
2624 #pragma omp parallel num_threads(lincsd->ntask)
2628 int th = gmx_omp_get_thread_num();
2630 clear_mat(lincsd->task[th].vir_r_m_dr);
2632 do_lincs(x, xprime, box, pbc, lincsd, th,
2633 md->invmass, cr,
2634 bCalcDHDL,
2635 ir->LincsWarnAngle, &bWarn,
2636 invdt, v, bCalcVir,
2637 th == 0 ? vir_r_m_dr : lincsd->task[th].vir_r_m_dr);
2639 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2642 if (bLog && fplog && lincsd->nc > 0)
2644 fprintf(fplog, " Rel. Constraint Deviation: RMS MAX between atoms\n");
2645 fprintf(fplog, " Before LINCS %.6f %.6f %6d %6d\n",
2646 sqrt(p_ssd/ncons_loc), p_max,
2647 ddglatnr(cr->dd, lincsd->bla[2*p_imax]),
2648 ddglatnr(cr->dd, lincsd->bla[2*p_imax+1]));
2650 if (bLog || bEner)
2652 cconerr(lincsd, xprime, pbc,
2653 &ncons_loc, &p_ssd, &p_max, &p_imax);
2654 /* Check if we are doing the second part of SD */
2655 if (ir->eI == eiSD2 && v == NULL)
2657 i = 2;
2659 else
2661 i = 1;
2663 lincsd->rmsd_data[0] = ncons_loc;
2664 lincsd->rmsd_data[i] = p_ssd;
2666 else
2668 lincsd->rmsd_data[0] = 0;
2669 lincsd->rmsd_data[1] = 0;
2670 lincsd->rmsd_data[2] = 0;
2672 if (bLog && fplog && lincsd->nc > 0)
2674 fprintf(fplog,
2675 " After LINCS %.6f %.6f %6d %6d\n\n",
2676 sqrt(p_ssd/ncons_loc), p_max,
2677 ddglatnr(cr->dd, lincsd->bla[2*p_imax]),
2678 ddglatnr(cr->dd, lincsd->bla[2*p_imax+1]));
2681 if (bWarn)
2683 if (maxwarn >= 0)
2685 cconerr(lincsd, xprime, pbc,
2686 &ncons_loc, &p_ssd, &p_max, &p_imax);
2687 if (MULTISIM(cr))
2689 sprintf(buf3, " in simulation %d", cr->ms->sim);
2691 else
2693 buf3[0] = 0;
2695 sprintf(buf, "\nStep %s, time %g (ps) LINCS WARNING%s\n"
2696 "relative constraint deviation after LINCS:\n"
2697 "rms %.6f, max %.6f (between atoms %d and %d)\n",
2698 gmx_step_str(step, buf2), ir->init_t+step*ir->delta_t,
2699 buf3,
2700 sqrt(p_ssd/ncons_loc), p_max,
2701 ddglatnr(cr->dd, lincsd->bla[2*p_imax]),
2702 ddglatnr(cr->dd, lincsd->bla[2*p_imax+1]));
2703 if (fplog)
2705 fprintf(fplog, "%s", buf);
2707 fprintf(stderr, "%s", buf);
2708 lincs_warning(fplog, cr->dd, x, xprime, pbc,
2709 lincsd->nc, lincsd->bla, lincsd->bllen,
2710 ir->LincsWarnAngle, maxwarn, warncount);
2712 bOK = (p_max < 0.5);
2715 if (lincsd->ncg_flex)
2717 for (i = 0; (i < lincsd->nc); i++)
2719 if (lincsd->bllen0[i] == 0 && lincsd->ddist[i] == 0)
2721 lincsd->bllen[i] = 0;
2726 else
2728 /* The OpenMP parallel region of constrain_lincs for derivatives */
2729 #pragma omp parallel num_threads(lincsd->ntask)
2733 int th = gmx_omp_get_thread_num();
2735 do_lincsp(x, xprime, min_proj, pbc, lincsd, th,
2736 md->invmass, econq, bCalcDHDL,
2737 bCalcVir, th == 0 ? vir_r_m_dr : lincsd->task[th].vir_r_m_dr);
2739 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2743 if (bCalcDHDL)
2745 /* Reduce the dH/dlambda contributions over the threads */
2746 real dhdlambda;
2747 int th;
2749 dhdlambda = 0;
2750 for (th = 0; th < lincsd->ntask; th++)
2752 dhdlambda += lincsd->task[th].dhdlambda;
2754 if (econq == econqCoord)
2756 /* dhdlambda contains dH/dlambda*dt^2, correct for this */
2757 /* TODO This should probably use invdt, so that sd integrator scaling works properly */
2758 dhdlambda /= ir->delta_t*ir->delta_t;
2760 *dvdlambda += dhdlambda;
2763 if (bCalcVir && lincsd->ntask > 1)
2765 for (i = 1; i < lincsd->ntask; i++)
2767 m_add(vir_r_m_dr, lincsd->task[i].vir_r_m_dr, vir_r_m_dr);
2771 /* count assuming nit=1 */
2772 inc_nrnb(nrnb, eNR_LINCS, lincsd->nc_real);
2773 inc_nrnb(nrnb, eNR_LINCSMAT, (2+lincsd->nOrder)*lincsd->ncc);
2774 if (lincsd->ntriangle > 0)
2776 inc_nrnb(nrnb, eNR_LINCSMAT, lincsd->nOrder*lincsd->ncc_triangle);
2778 if (v)
2780 inc_nrnb(nrnb, eNR_CONSTR_V, lincsd->nc_real*2);
2782 if (bCalcVir)
2784 inc_nrnb(nrnb, eNR_CONSTR_VIR, lincsd->nc_real);
2787 return bOK;